I have created an SVM model using Sickit-learn:
import pandas as pd
df = pd.read_csv(r"C:\Users\aaa\Documents\bbb\svm_.csv", encoding='latin1', sep=';')
from imblearn.over_sampling import RandomOverSampler
oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(df.drop(columns=['alvo']), df['alvo'])
df_resampled = pd.concat([X_resampled, pd.DataFrame({'alvo': y_resampled})], axis=1)
print(df_resampled)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder
X = df_resampled.drop(columns=['alvo'])
y = df_resampled['alvo']
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=2)
svm_classifier = SVC(kernel='poly', random_state=42)
svm_classifier.fit(X_train, y_train)
y_pred = svm_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_report_result = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_report_result)
import pickle
model_file_path = "C:\\Users\\aaa\\Documents\\bbb\\svm_modelo_sent_simnao2.pkl"
with open(model_file_path, 'wb') as f:
pickle.dump(svm_classifier, f)
print("Model saved successfully!")
That simple implemention of Support Vector Machines to predict an categorical variable was correct, worked fine.
But, when loading the model and running the following code:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import joblib
model_file_path = "C:\\Users\\AAA\\Documents\\BBB\\svm_modelo_sent_simnao.pkl"
svm_classifier = joblib.load(model_file_path)
new_df = pd.read_csv(r"C:\Users\AAA\Documents\BBB\svm_simnao_rodar.csv", encoding='latin1', sep=';')
X_new = new_df.drop(columns=['alvo', 'ID_ASSUNTO'], axis=1) # Drop 'alvo' and 'ID_ASSUNTO' columns
categorical_columns = ['UF', 'TIPO_ACAO', 'AREA_JURIDICA', 'VARA_CAMARA', 'CLIENTE_NOME'] # List of categorical columns
encoder = OneHotEncoder(categories='auto', sparse=False)
X_new_encoded = encoder.fit_transform(X_new[categorical_columns])
X_new_processed = pd.concat([pd.DataFrame(X_new_encoded), X_new.drop(columns=categorical_columns)], axis=1)
y_pred_new = svm_classifier.predict(X_new_processed)
new_df['alvo'] = y_pred_new
predictions = new_df[['ID_ASSUNTO', 'alvo']]
print(predictions)
What results in the following error:
ValueError Traceback (most recent call last)
<ipython-input-3-522717c33138> in <module>
22
23 # Make predictions
---> 24 y_pred_new = svm_classifier.predict(X_new_processed)
25
26 # Add predictions ('alvo') to the new data
~\AppData\Roaming\Python\Python39\site-packages\sklearn\svm\_base.py in predict(self, X)
818 y = np.argmax(self.decision_function(X), axis=1)
819 else:
--> 820 y = super().predict(X)
821 return self.classes_.take(np.asarray(y, dtype=np.intp))
822
~\AppData\Roaming\Python\Python39\site-packages\sklearn\svm\_base.py in predict(self, X)
431 The predicted values.
432 """
--> 433 X = self._validate_for_predict(X)
434 predict = self._sparse_predict if self._sparse else self._dense_predict
435 return predict(X)
~\AppData\Roaming\Python\Python39\site-packages\sklearn\svm\_base.py in _validate_for_predict(self, X)
611
...
--> 389 raise ValueError(
390 f"X has {n_features} features, but {self.__class__.__name__} "
391 f"is expecting {self.n_features_in_} features as input."
ValueError: X has 2943 features, but SVC is expecting 330320 features as input.
The data i'm using to predict alvo in the deployment code have the exact same structure as the data which i trained the model and I use OneHotEncoding in both training and deployment code... so I'm kinda lost in this one.
Any idea of how can i resolve this problem?
Thanks in advance.