fuzzy search in two datasets issue

22 Views Asked by At
import pandas as pd
from fuzzywuzzy import fuzz
from recordlinkage.preprocessing import clean
from concurrent.futures import ThreadPoolExecutor


column1_name = 'Sold To Customer Name'
column2_name = 'customer_name'

try:
    # Read datasets directly using read_csv
    dataset1 = pd.read_csv(r"C:\Users\JE\Downloads\edw_query_extract_distinct.csv",low_memory=False,on_bad_lines='skip', index_col=False, dtype='unicode')

    dataset2 = pd.read_csv(r"C:\Users\JE\Downloads\ISO_Code_joined.csv",low_memory=False,on_bad_lines='skip', index_col=False, dtype='unicode')


    # Select columns to compare
    column1 = dataset1[column1_name].astype(str)
    column2 = dataset2[column2_name].astype(str)

    # Matching threshold
    threshold = 80

    # Drop rows with NaN or empty values
    column1 = column1.dropna().drop_duplicates().dropna()
    column2 = column2.dropna().drop_duplicates().dropna()

    matches = []

    # Create index for column2
    column2_index = column2.reset_index().values.tolist()

    with ThreadPoolExecutor() as executor:
        for i, value1 in column1.iteritems():
            best_match = None
            best_score = threshold

            for j, value2 in column2_index:
           `your text`     s1 = clean(value1)  # Preprocess strings for fuzzy matching
                s2 = clean(value2)
                score = fuzz.token_set_ratio(s1, s2)  # Calculate fuzzy similarity

                if score > best_score:
                    best_match = j
                    best_score = score

            if best_match is not None:
                matches.append((i, best_match, best_score))

    matched_data = pd.DataFrame(matches, columns=['Index_1', 'Index_2', 'Similarity_Score'])
    matched_data['Value_1'] = column1.loc[matched_data['Index_1']].values
    matched_data['Value_2'] = column2.loc[matched_data['Index_2']].values

    print(matched_data)

except FileNotFoundError:
    print("Error: One or both dataset files not found.")
except KeyError:
    print("Error: One or both column names are not present in the datasets.")
except Exception as e:
    print(f"An error occurred: {str(e)}")

this code is returning An error occurred: 'str' object has no attribute 'shape'

my main goal is to compare between two different columns from two different datasets, they can be written with mistakes or abbreviations and all of the above, and the resulting dataset needs to have the matching values`

1

There are 1 best solutions below

0
Aravind Pillai On

The error means that there is an issue when calling the astype(str) method on column1 and column2 variables. This error to my knowledge occurs when the input to astype() is not a Pandas DataFrame or a Series object.

In your code, it seems that dataset1 and dataset2 are already DataFrames, so there's no need to call astype(str) on the cols. You can probably modify the code to remove the astype(str) calls on column1 and column2.

import pandas as pd
from fuzzywuzzy import fuzz
from recordlinkage.preprocessing import clean
from concurrent.futures import ThreadPoolExecutor

column1_name = 'Sold To Customer Name'
column2_name = 'customer_name'

try:
    # Read datasets directly using read_csv
    dataset1 = pd.read_csv(r"C:\Users\JE\Downloads\edw_query_extract_distinct.csv", low_memory=False, on_bad_lines='skip', index_col=False, dtype='unicode')
    dataset2 = pd.read_csv(r"C:\Users\JE\Downloads\ISO_Code_joined.csv", low_memory=False, on_bad_lines='skip', index_col=False, dtype='unicode')

    # Select columns to compare
    column1 = dataset1[column1_name]
    column2 = dataset2[column2_name]

    # Matching threshold
    threshold = 80

    # Drop rows with NaN or empty values
    column1 = column1.dropna().drop_duplicates().dropna()
    column2 = column2.dropna().drop_duplicates().dropna()

    matches = []

    # Create index for column2
    column2_index = column2.reset_index().values.tolist()

    with ThreadPoolExecutor() as executor:
        for i, value1 in column1.iteritems():
            best_match = None
            best_score = threshold

            for j, value2 in column2_index:
                s1 = clean(value1)  # Preprocess strings for fuzzy matching
                s2 = clean(value2)
                score = fuzz.token_set_ratio(s1, s2)  # Calculate fuzzy similarity

                if score > best_score:
                    best_match = j
                    best_score = score

            if best_match is not None:
                matches.append((i, best_match, best_score))

    matched_data = pd.DataFrame(matches, columns=['Index_1', 'Index_2', 'Similarity_Score'])
    matched_data['Value_1'] = column1.loc[matched_data['Index_1']].values
    matched_data['Value_2'] = column2.loc[matched_data['Index_2']].values

    print(matched_data)

except Exception as e:
    print(f"An error occurred: {str(e)}")