Machine Learning algorithms to predict survival on the Titanic
Published on November 17, 2021 by Vimal Octavius PJ
Tags: Machine Learning, KNN, SVM, Decision Tree, Logistic Regression
Machine Learning algorithms to predict who survived the Titanic, based on independent variables (Xs) such as passenger class, sex, and the number of siblings/spouses and parents/children aboard. A lot more could be done; I'm keeping it brief for this post. Data from https://www.kaggle.com/c/titanic
# Not all libraries may be used; this is a standard template I copy for my own use, not quite efficient for sure ;-)
# Pandas is a Python library for data manipulation and analysis.
import pandas as pd
# NumPy adds support for large, multi-dimensional arrays and matrices, along with high-level mathematical functions to operate on them.
import numpy as np
# Matplotlib is a plotting library for Python; pyplot gives us a MATLAB-like plotting interface.
import matplotlib.pyplot as plt
# Seaborn is a data visualization library based on matplotlib; it provides a high-level interface for attractive, informative statistical graphics.
import seaborn as sns
# preprocessing allows us to standardize our data.
from sklearn import preprocessing
# Split our data into training and testing sets.
from sklearn.model_selection import train_test_split
# Search over the parameters of a classification algorithm to find the best combination.
from sklearn.model_selection import GridSearchCV
# Classification algorithms.
from sklearn.linear_model import LogisticRegression   # Logistic Regression
from sklearn.svm import SVC                           # Support Vector Machine
from sklearn import svm                               # used below as svm.SVC
from sklearn.tree import DecisionTreeClassifier       # Decision Tree
from sklearn.neighbors import KNeighborsClassifier    # K Nearest Neighbors
# Evaluation metrics.
from sklearn import metrics
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score, jaccard_score, log_loss)
# Assorted extras from the template (not all used below).
import itertools
from matplotlib.ticker import NullFormatter
import matplotlib.ticker as ticker
import pylab as pl
import scipy.optimize as opt
%matplotlib inline
train_data = pd.read_csv('/content/drive/MyDrive/Books/Kaggle/train.csv')
train_data.head()
|   | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
test_data = pd.read_csv('/content/drive/MyDrive/Books/Kaggle/test.csv')
test_data.head()
|   | PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 892 | 3 | Kelly, Mr. James | male | 34.5 | 0 | 0 | 330911 | 7.8292 | NaN | Q |
| 1 | 893 | 3 | Wilkes, Mrs. James (Ellen Needs) | female | 47.0 | 1 | 0 | 363272 | 7.0000 | NaN | S |
| 2 | 894 | 2 | Myles, Mr. Thomas Francis | male | 62.0 | 0 | 0 | 240276 | 9.6875 | NaN | Q |
| 3 | 895 | 3 | Wirz, Mr. Albert | male | 27.0 | 0 | 0 | 315154 | 8.6625 | NaN | S |
| 4 | 896 | 3 | Hirvonen, Mrs. Alexander (Helga E Lindqvist) | female | 22.0 | 1 | 1 | 3101298 | 12.2875 | NaN | S |
features = ["Pclass","Sex","SibSp","Parch"]
X = pd.get_dummies(train_data[features])
X
|   | Pclass | SibSp | Parch | Sex_female | Sex_male |
|---|---|---|---|---|---|
| 0 | 3 | 1 | 0 | 0 | 1 |
| 1 | 1 | 1 | 0 | 1 | 0 |
| 2 | 3 | 0 | 0 | 1 | 0 |
| 3 | 1 | 1 | 0 | 1 | 0 |
| 4 | 3 | 0 | 0 | 0 | 1 |
| ... | ... | ... | ... | ... | ... |
| 886 | 2 | 0 | 0 | 0 | 1 |
| 887 | 1 | 0 | 0 | 1 | 0 |
| 888 | 3 | 1 | 2 | 1 | 0 |
| 889 | 1 | 0 | 0 | 0 | 1 |
| 890 | 3 | 0 | 0 | 0 | 1 |
891 rows × 5 columns
y = train_data["Survived"]
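# Note: no random_state is set below, so the exact split (and every score reported later) will vary a little from run to run.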
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
print("Train set:", X_train.shape, y_train.shape)
print("Test set:", X_test.shape, y_test.shape)
Train set: (712, 5) (712,)
Test set: (179, 5) (179,)
X_train
|   | Pclass | SibSp | Parch | Sex_female | Sex_male |
|---|---|---|---|---|---|
| 790 | 3 | 0 | 0 | 0 | 1 |
| 467 | 1 | 0 | 0 | 0 | 1 |
| 431 | 3 | 1 | 0 | 1 | 0 |
| 710 | 1 | 0 | 0 | 1 | 0 |
| 608 | 2 | 1 | 2 | 1 | 0 |
| ... | ... | ... | ... | ... | ... |
| 536 | 1 | 0 | 0 | 0 | 1 |
| 198 | 3 | 0 | 0 | 1 | 0 |
| 876 | 3 | 0 | 0 | 0 | 1 |
| 7 | 3 | 3 | 1 | 0 | 1 |
| 558 | 1 | 1 | 1 | 1 | 0 |
712 rows × 5 columns
y_train
790    0
467    0
431    1
710    1
608    1
      ..
536    0
198    1
876    0
7      0
558    1
Name: Survived, Length: 712, dtype: int64
X_test
|   | Pclass | SibSp | Parch | Sex_female | Sex_male |
|---|---|---|---|---|---|
| 177 | 1 | 0 | 0 | 1 | 0 |
| 846 | 3 | 8 | 2 | 0 | 1 |
| 69 | 3 | 2 | 0 | 0 | 1 |
| 682 | 3 | 0 | 0 | 0 | 1 |
| 569 | 3 | 0 | 0 | 0 | 1 |
| ... | ... | ... | ... | ... | ... |
| 797 | 3 | 0 | 0 | 1 | 0 |
| 91 | 3 | 0 | 0 | 0 | 1 |
| 312 | 2 | 1 | 1 | 1 | 0 |
| 540 | 1 | 0 | 2 | 1 | 0 |
| 135 | 2 | 0 | 0 | 0 | 1 |
179 rows × 5 columns
y_test
177    0
846    0
69     0
682    0
569    1
      ..
797    1
91     0
312    0
540    1
135    0
Name: Survived, Length: 179, dtype: int64
# Try k = 1..9 and report the test accuracy for each
j = 10
for i in range(1, j):
    KNN_model = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    yhat_Knn = KNN_model.predict(X_test)
    acc_sc = metrics.accuracy_score(y_test, yhat_Knn)
    print('Neighbour {} with accuracy {}'.format(i, acc_sc))
Neighbour 1 with accuracy 0.8268156424581006
Neighbour 2 with accuracy 0.8212290502793296
Neighbour 3 with accuracy 0.8491620111731844
Neighbour 4 with accuracy 0.7988826815642458
Neighbour 5 with accuracy 0.8156424581005587
Neighbour 6 with accuracy 0.7988826815642458
Neighbour 7 with accuracy 0.7988826815642458
Neighbour 8 with accuracy 0.770949720670391
Neighbour 9 with accuracy 0.776536312849162
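GridSearchCV was imported above but never used; the same sweep over k could also be written as a cross-validated search. A minimal sketch using only what is already imported (the cv=5 and scoring choices are my own assumptions, not from the original run):
# Hypothetical alternative to the manual loop: 5-fold cross-validated search over k
param_grid = {'n_neighbors': list(range(1, 10))}
grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)
print('Best k:', grid.best_params_['n_neighbors'])
print('Best cross-validated accuracy:', grid.best_score_)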
k = 3
neigh = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
yhat_knn2 = neigh.predict(X_test)
jc_knn = jaccard_score(y_test, yhat_knn2, pos_label=1)
print(classification_report(y_test, yhat_knn2))
              precision    recall  f1-score   support

           0       0.86      0.91      0.89       114
           1       0.83      0.74      0.78        65

    accuracy                           0.85       179
   macro avg       0.84      0.83      0.83       179
weighted avg       0.85      0.85      0.85       179
yhat_knn2
array([1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0])
dtree = DecisionTreeClassifier(criterion="entropy", max_depth=4)
dtree.fit(X_train, y_train)
predtree = dtree.predict(X_test)
print("Decision tree accuracy:", metrics.accuracy_score(y_test, predtree))
Decision tree accuracy: 0.7988826815642458
jc_dtree = jaccard_score(y_test, predtree, pos_label=1)
jc_dtree
0.5135135135135135
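Since matplotlib is already imported, the fitted tree can also be inspected visually with sklearn.tree.plot_tree. This was not part of the original run; the figure size and class labels below are my own choices:
# Sketch: render the fitted decision tree (assumes the dtree fitted above)
from sklearn.tree import plot_tree
plt.figure(figsize=(16, 8))
plot_tree(dtree, feature_names=list(X.columns), class_names=['Died', 'Survived'], filled=True)
plt.show()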
clf = svm.SVC(kernel="rbf")
clf.fit(X_train, y_train)
SVC()
yhat_svm = clf.predict(X_test)
jc_svm = jaccard_score(y_test, yhat_svm, pos_label=1)
jc_svm
0.56
print(classification_report(y_test,yhat_svm))
              precision    recall  f1-score   support

           0       0.82      0.91      0.86       114
           1       0.81      0.65      0.72        65

    accuracy                           0.82       179
   macro avg       0.81      0.78      0.79       179
weighted avg       0.81      0.82      0.81       179
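confusion_matrix was imported at the top but never called; a quick look at where the SVM goes wrong on the held-out split (my own addition, using only variables defined above):
# Rows are the true classes (0 = died, 1 = survived), columns are the predicted classes
print(confusion_matrix(y_test, yhat_svm))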
LR = LogisticRegression(C=0.01, solver="liblinear").fit(X_train, y_train)
yhat_LR = LR.predict(X_test)
jc_logreg = jaccard_score(y_test, yhat_LR, pos_label=1)
jc_logreg
0.4444444444444444
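log_loss was also imported but never used; it scores predicted probabilities rather than hard labels, and LogisticRegression can produce those via predict_proba. A small sketch (my own addition):
# Probability of each class for every test passenger, then the log-loss of those probabilities
yhat_LR_prob = LR.predict_proba(X_test)
print('LogLoss: %.4f' % log_loss(y_test, yhat_LR_prob))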
print(classification_report(y_test,yhat_LR))
              precision    recall  f1-score   support

           0       0.76      0.94      0.84       114
           1       0.82      0.49      0.62        65

    accuracy                           0.78       179
   macro avg       0.79      0.72      0.73       179
weighted avg       0.78      0.78      0.76       179
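All four Jaccard scores (pos_label=1, i.e. the "survived" class) were computed above; collecting them in one frame makes the comparison explicit. The summary table itself is my own addition:
# Compare the Jaccard scores computed for each classifier above
summary = pd.DataFrame(
    {'Algorithm': ['KNN (k=3)', 'Decision Tree', 'SVM (rbf)', 'Logistic Regression'],
     'Jaccard': [jc_knn, jc_dtree, jc_svm, jc_logreg]})
print(summary.sort_values('Jaccard', ascending=False))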
X2 = pd.get_dummies(test_data[features])
X2
|   | Pclass | SibSp | Parch | Sex_female | Sex_male |
|---|---|---|---|---|---|
| 0 | 3 | 0 | 0 | 0 | 1 |
| 1 | 3 | 1 | 0 | 1 | 0 |
| 2 | 2 | 0 | 0 | 0 | 1 |
| 3 | 3 | 0 | 0 | 0 | 1 |
| 4 | 3 | 1 | 1 | 1 | 0 |
| ... | ... | ... | ... | ... | ... |
| 413 | 3 | 0 | 0 | 0 | 1 |
| 414 | 1 | 0 | 0 | 1 | 0 |
| 415 | 3 | 0 | 0 | 0 | 1 |
| 416 | 3 | 0 | 0 | 0 | 1 |
| 417 | 3 | 1 | 1 | 0 | 1 |
418 rows × 5 columns
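One caveat with calling pd.get_dummies separately on train and test: if a category appeared in one set but not the other, the column sets would differ and predict would fail or silently misalign. It happens to be fine here, but a defensive construction (my own suggestion, not in the original run) is to reindex the test frame against the training columns:
# Hypothetical safeguard: force X2 to have exactly the training columns, filling any missing dummy with 0
X2 = pd.get_dummies(test_data[features]).reindex(columns=X.columns, fill_value=0)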
yhat_Knn3 = neigh.predict(X2)
yhat_Knn3
array([0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0])
# The k = 3 KNN model scored best above (0.8491 accuracy on the held-out split), so its predictions on the test set (yhat_Knn3) go into the submission.
# Kaggle expects the columns to be named exactly 'PassengerId' and 'Survived'.
output = pd.DataFrame(
    {'PassengerId': test_data.PassengerId, 'Survived': yhat_Knn3})
output
|   | PassengerId | Survived |
|---|---|---|
| 0 | 892 | 0 |
| 1 | 893 | 1 |
| 2 | 894 | 0 |
| 3 | 895 | 0 |
| 4 | 896 | 0 |
| ... | ... | ... |
| 413 | 1305 | 0 |
| 414 | 1306 | 1 |
| 415 | 1307 | 0 |
| 416 | 1308 | 0 |
| 417 | 1309 | 0 |
418 rows × 2 columns
output.to_csv('/content/drive/MyDrive/Books/Kaggle/prediction.csv',index=False)