import pandas as pd
from fuzzywuzzy import fuzz
from recordlinkage.preprocessing import clean
from concurrent.futures import ThreadPoolExecutor
column1_name = 'Sold To Customer Name'
column2_name = 'customer_name'
try:
# Read datasets directly using read_csv
dataset1 = pd.read_csv(r"C:\Users\JE\Downloads\edw_query_extract_distinct.csv",low_memory=False,on_bad_lines='skip', index_col=False, dtype='unicode')
dataset2 = pd.read_csv(r"C:\Users\JE\Downloads\ISO_Code_joined.csv",low_memory=False,on_bad_lines='skip', index_col=False, dtype='unicode')
# Select columns to compare
column1 = dataset1[column1_name].astype(str)
column2 = dataset2[column2_name].astype(str)
# Matching threshold
threshold = 80
# Drop rows with NaN or empty values
column1 = column1.dropna().drop_duplicates().dropna()
column2 = column2.dropna().drop_duplicates().dropna()
matches = []
# Create index for column2
column2_index = column2.reset_index().values.tolist()
with ThreadPoolExecutor() as executor:
for i, value1 in column1.iteritems():
best_match = None
best_score = threshold
for j, value2 in column2_index:
`your text` s1 = clean(value1) # Preprocess strings for fuzzy matching
s2 = clean(value2)
score = fuzz.token_set_ratio(s1, s2) # Calculate fuzzy similarity
if score > best_score:
best_match = j
best_score = score
if best_match is not None:
matches.append((i, best_match, best_score))
matched_data = pd.DataFrame(matches, columns=['Index_1', 'Index_2', 'Similarity_Score'])
matched_data['Value_1'] = column1.loc[matched_data['Index_1']].values
matched_data['Value_2'] = column2.loc[matched_data['Index_2']].values
print(matched_data)
except FileNotFoundError:
print("Error: One or both dataset files not found.")
except KeyError:
print("Error: One or both column names are not present in the datasets.")
except Exception as e:
print(f"An error occurred: {str(e)}")
this code is returning An error occurred: 'str' object has no attribute 'shape'
my main goal is to compare between two different columns from two different datasets, they can be written with mistakes or abbreviations and all of the above, and the resulting dataset needs to have the matching values`
The error means that there is an issue when calling the astype(str) method on column1 and column2 variables. This error to my knowledge occurs when the input to astype() is not a Pandas DataFrame or a Series object.
In your code, it seems that dataset1 and dataset2 are already DataFrames, so there's no need to call astype(str) on the cols. You can probably modify the code to remove the astype(str) calls on column1 and column2.