I'm asked to write a code using Titanic dataset and do following task:
- Data description
- Data visualization
- Create a test Set with stratified sampling 4. Data Cleaning
- Handling Text and Categorical Attributes
- Transformation Pipelines
- Select and Train the Model 8. Predictions and Evaluations include confusion matrix
- Choosing the best K Value
I wrote the following code but the code doesn't work. I think I made mistakes during writing the code. Confusion matrix is not accurate at all. would you please help me resolve the problems?
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
df=pd.read_csv("titanic_train")
df.describe()
(df['Survived'].value_counts()) / len(df) * 100
sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap = 'viridis')
sns.countplot(x='Survived',data=df,hue = 'Sex')
sns.countplot(x='Survived',data=df,hue = 'Pclass')
sns.heatmap(df.corr(),cmap='coolwarm')
df['Age'].dropna().plot.hist(bins=30)
df.groupby('Survived', group_keys=False).apply(lambda x: x.sample(frac=0.3)) #Create stratified sampling test set
df["Survived"].value_counts(normalize=True)*100
df.isnull().sum()
missing_values=df.isna().any()
print("Columns with missing values: \n{0}".format(missing_values[missing_values==True].index.tolist()))
categoricals=df.nunique().sort_values(ascending=True)
print("Categorical Variables in df data: \n{0}".format(categoricals))
def clean_data(df):
df.drop(['Cabin'], axis=1, inplace=True)
df.drop(['Embarked', 'Fare', 'Ticket', 'Name'], axis=1, inplace=True)
return df
def impute_age(cols):
Age=cols[0]
Pclass=cols[1]
if(pd.isnull(Age)):
if(Pclass==1):
return 37
elif(Pclass==2):
return 29
else:
return 24
else:
return Age
df["Age"]=df[["Age","Pclass"]].apply(impute_age,axis=1)
X=df[['Age','Sex','Pclass']]
from sklearn.model_selection import train_test_split
df_train, df_test, df_train, df_test = train_test_split(df, df,stratify=y,test_size=0.3)
X_train, X_test, y_train, y_test=train_test_split(X,y, test_size=0.35, random_state=1, stratify=y)
X=df[['Pclass','Sex','Age']]
y=df['Survived']
from sklearn.model_selection import train_test_split
y_train.value_counts(normalize=True)*100
Y = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.35, stratify=y, random_state = 12)
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier(n_neighbors =10)
KNN.fit(X_train,y_train)
y_prediction = KNN.predict(X_test)
from sklearn.metrics import confusion_matrix, classification_report
[[177 15]
[ 65 55]]
print(classification_report(y_test,y_prediction))
precision recall f1-score support
0 0.73 0.92 0.82 192
1 0.79 0.46 0.58 120
accuracy 0.74 312
macro avg 0.76 0.69 0.70 312 weighted avg 0.75 0.74 0.72 312
The Error rate significantly increases when k-values increases.
change the following
try it and let us know