Stratified Sampling using Titanic dataset

70 Views Asked by At

I'm asked to write a code using Titanic dataset and do following task:

  1. Data description
  2. Data visualization
  3. Create a test set with stratified sampling
  4. Data cleaning
  5. Handling text and categorical attributes
  6. Transformation pipelines
  7. Select and train the model
  8. Predictions and evaluations, including a confusion matrix
  9. Choosing the best K value

I wrote the following code, but it doesn't work. I think I made mistakes while writing it — the confusion matrix is not accurate at all. Would you please help me resolve the problems?

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

df=pd.read_csv("titanic_train")

df.describe()

(df['Survived'].value_counts()) / len(df) * 100

sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap = 'viridis')

sns.countplot(x='Survived',data=df,hue = 'Sex')

sns.countplot(x='Survived',data=df,hue = 'Pclass')

sns.heatmap(df.corr(),cmap='coolwarm')

df['Age'].dropna().plot.hist(bins=30)

df.groupby('Survived', group_keys=False).apply(lambda x: x.sample(frac=0.3)) #Create stratified sampling test set

df["Survived"].value_counts(normalize=True)*100

df.isnull().sum()

missing_values=df.isna().any()
print("Columns with missing values: \n{0}".format(missing_values[missing_values==True].index.tolist()))

categoricals=df.nunique().sort_values(ascending=True)
print("Categorical Variables in df data: \n{0}".format(categoricals))


def clean_data(df):
    """Drop columns that are not used as model features.

    Mutates *df* in place (inplace=True) and also returns it.
    NOTE(review): this function is defined but never called in the
    script below — the drops never actually happen.
    """
    df.drop(['Cabin'], axis=1, inplace=True)
    df.drop(['Embarked', 'Fare', 'Ticket', 'Name'], axis=1, inplace=True)
    return df
def impute_age(cols):
    """Fill a missing Age using a per-Pclass default (37 / 29 / 24).

    Intended to be applied row-wise to df[['Age', 'Pclass']] with
    axis=1, so *cols* is a labelled Series.
    NOTE(review): positional indexing (cols[0], cols[1]) on a labelled
    Series is deprecated in pandas and fails in pandas >= 3 — prefer
    cols['Age'] / cols['Pclass'].
    """
    Age=cols[0]
    Pclass=cols[1]
    
    if(pd.isnull(Age)):
        if(Pclass==1):
            return 37
        elif(Pclass==2):
            return 29
        else:
            return 24
    else:
            return Age

df["Age"]=df[["Age","Pclass"]].apply(impute_age,axis=1)

X=df[['Age','Sex','Pclass']]

from sklearn.model_selection import train_test_split
df_train, df_test, df_train, df_test = train_test_split(df, df,stratify=y,test_size=0.3)

X_train, X_test, y_train, y_test=train_test_split(X,y, test_size=0.35, random_state=1, stratify=y)

X=df[['Pclass','Sex','Age']]

y=df['Survived']

from sklearn.model_selection import train_test_split

y_train.value_counts(normalize=True)*100

Y = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.35, stratify=y, random_state = 12)

from sklearn.neighbors import KNeighborsClassifier

KNN = KNeighborsClassifier(n_neighbors =10)

KNN.fit(X_train,y_train)

y_prediction = KNN.predict(X_test)

from sklearn.metrics import confusion_matrix, classification_report


[[177  15]
 [ 65  55]]

print(classification_report(y_test,y_prediction))

precision recall f1-score support

       0       0.73      0.92      0.82       192
       1       0.79      0.46      0.58       120

accuracy                           0.74       312

macro avg 0.76 0.69 0.70 312 weighted avg 0.75 0.74 0.72 312

The Error rate significantly increases when k-values increases.

1

There is 1 best solution below

1
Rainy sidewalks On

change the following

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

df=pd.read_csv("titanic_train") #<-- change this to include the file extension, e.g. df=pd.read_csv("titanic_train.csv"), assuming the dataset is in CSV format.

df.describe()

(df['Survived'].value_counts()) / len(df) * 100

sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap = 'viridis')

sns.countplot(x='Survived',data=df,hue = 'Sex')

sns.countplot(x='Survived',data=df,hue = 'Pclass')

sns.heatmap(df.corr(),cmap='coolwarm')

df['Age'].dropna().plot.hist(bins=30)

df.groupby('Survived', group_keys=False).apply(lambda x: x.sample(frac=0.3)) #Create stratified sampling test set

df["Survived"].value_counts(normalize=True)*100

df.isnull().sum()

missing_values=df.isna().any()
print("Columns with missing values: \n{0}".format(missing_values[missing_values==True].index.tolist()))

categoricals=df.nunique().sort_values(ascending=True)
print("Categorical Variables in df data: \n{0}".format(categoricals))


def clean_data(df):
    """Remove columns the model does not use; *df* is modified in place
    and returned for convenience."""
    for unused in (['Cabin'], ['Embarked', 'Fare', 'Ticket', 'Name']):
        df.drop(unused, axis=1, inplace=True)
    return df
def impute_age(cols):
    """Fill a missing Age with a per-class default age.

    Apply row-wise to df[['Age', 'Pclass']] with axis=1, so *cols* is a
    labelled Series with 'Age' and 'Pclass' entries.

    Returns the original Age when present; otherwise 37 / 29 / 24 for
    1st / 2nd / 3rd class (approximate per-class median ages in the
    Titanic training data).
    """
    # Label-based access: the original positional cols[0]/cols[1] on a
    # labelled Series is deprecated in pandas and breaks in pandas >= 3.
    age = cols['Age']
    if pd.isnull(age):
        return {1: 37, 2: 29}.get(cols['Pclass'], 24)
    return age

df["Age"]=df[["Age","Pclass"]].apply(impute_age,axis=1)

# <-- The original section defined X before y existed, called
# train_test_split three times (once with df_train/df_test assigned
# twice and stratify=y before y was defined), and never encoded 'Sex'.
# KNN computes Euclidean distances, so every feature must be numeric:
# the raw 'male'/'female' strings are the main reason the model (and
# hence the confusion matrix) was so poor.
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

X = df[['Age', 'Sex', 'Pclass']]
y = df['Survived']

from sklearn.model_selection import train_test_split

# One stratified split only: the test set keeps the same Survived
# class ratio as the full data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.35, stratify=y, random_state=12)

y_train.value_counts(normalize=True) * 100

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report

# Fit a K-nearest-neighbours classifier on the stratified training set.
KNN = KNeighborsClassifier(n_neighbors=10)
KNN.fit(X_train, y_train)

y_prediction = KNN.predict(X_test)

# <-- confusion_matrix was imported but never called in the original;
# the matrix shown was pasted output sitting mid-code (a syntax error).
# Compute and print it instead.
print(confusion_matrix(y_test, y_prediction))

print(classification_report(y_test, y_prediction))

try it and let us know