Machine Learning algorithms to predict survival on the Titanic
Published on November 17, 2021 by Vimal Octavius PJ
Tags: Machine Learning, KNN, SVM, Decision Tree, Logistic Regression
Machine Learning algorithms to predict who survived the Titanic, based on independent variables (Xs) such as passenger class, sex, and the number of siblings/spouses and parents/children aboard. A lot more could be done; I'm keeping it brief for this post. Data from https://www.kaggle.com/c/titanic
# Not all libraries may be used; this is a standard template I copy for my own use, not quite efficient for sure ;-)
# Pandas is a Python library for data manipulation and analysis.
import pandas as pd
# NumPy adds support for large, multi-dimensional arrays and matrices, along with high-level mathematical functions to operate on them.
import numpy as np
# Matplotlib is a plotting library for Python; pyplot gives us a MATLAB-like plotting interface.
import matplotlib.pyplot as plt
# Seaborn is a data visualization library based on matplotlib; it provides a high-level interface for attractive, informative statistical graphics.
import seaborn as sns
# preprocessing allows us to standardize our data.
from sklearn import preprocessing
# Split our data into training and testing sets.
from sklearn.model_selection import train_test_split
# Search over the parameters of a classification algorithm to find the best combination.
from sklearn.model_selection import GridSearchCV
# Classification algorithms.
from sklearn.linear_model import LogisticRegression   # Logistic Regression
from sklearn.svm import SVC                           # Support Vector Machine
from sklearn import svm                               # used below as svm.SVC
from sklearn.tree import DecisionTreeClassifier       # Decision Tree
from sklearn.neighbors import KNeighborsClassifier    # K Nearest Neighbors
# Evaluation metrics.
from sklearn import metrics
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score, jaccard_score, log_loss)
# Assorted extras from the template (not all used below).
import itertools
from matplotlib.ticker import NullFormatter
import matplotlib.ticker as ticker
import pylab as pl
import scipy.optimize as opt
%matplotlib inline
train_data = pd.read_csv('/content/drive/MyDrive/Books/Kaggle/train.csv')
train_data.head()
|   | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
test_data = pd.read_csv('/content/drive/MyDrive/Books/Kaggle/test.csv')
test_data.head()
|   | PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 892 | 3 | Kelly, Mr. James | male | 34.5 | 0 | 0 | 330911 | 7.8292 | NaN | Q |
| 1 | 893 | 3 | Wilkes, Mrs. James (Ellen Needs) | female | 47.0 | 1 | 0 | 363272 | 7.0000 | NaN | S |
| 2 | 894 | 2 | Myles, Mr. Thomas Francis | male | 62.0 | 0 | 0 | 240276 | 9.6875 | NaN | Q |
| 3 | 895 | 3 | Wirz, Mr. Albert | male | 27.0 | 0 | 0 | 315154 | 8.6625 | NaN | S |
| 4 | 896 | 3 | Hirvonen, Mrs. Alexander (Helga E Lindqvist) | female | 22.0 | 1 | 1 | 3101298 | 12.2875 | NaN | S |
features = ["Pclass","Sex","SibSp","Parch"]
X = pd.get_dummies(train_data[features])
X
|   | Pclass | SibSp | Parch | Sex_female | Sex_male |
|---|---|---|---|---|---|
| 0 | 3 | 1 | 0 | 0 | 1 |
| 1 | 1 | 1 | 0 | 1 | 0 |
| 2 | 3 | 0 | 0 | 1 | 0 |
| 3 | 1 | 1 | 0 | 1 | 0 |
| 4 | 3 | 0 | 0 | 0 | 1 |
| ... | ... | ... | ... | ... | ... |
| 886 | 2 | 0 | 0 | 0 | 1 |
| 887 | 1 | 0 | 0 | 1 | 0 |
| 888 | 3 | 1 | 2 | 1 | 0 |
| 889 | 1 | 0 | 0 | 0 | 1 |
| 890 | 3 | 0 | 0 | 0 | 1 |
891 rows × 5 columns
y = train_data["Survived"]
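# Note: no random_state is set below, so the exact split (and every score reported later) will vary a little from run to run.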
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
print("Train set:", X_train.shape, y_train.shape)
print("Test set:", X_test.shape, y_test.shape)
Train set: (712, 5) (712,)
Test set: (179, 5) (179,)
X_train
|   | Pclass | SibSp | Parch | Sex_female | Sex_male |
|---|---|---|---|---|---|
| 790 | 3 | 0 | 0 | 0 | 1 |
| 467 | 1 | 0 | 0 | 0 | 1 |
| 431 | 3 | 1 | 0 | 1 | 0 |
| 710 | 1 | 0 | 0 | 1 | 0 |
| 608 | 2 | 1 | 2 | 1 | 0 |
| ... | ... | ... | ... | ... | ... |
| 536 | 1 | 0 | 0 | 0 | 1 |
| 198 | 3 | 0 | 0 | 1 | 0 |
| 876 | 3 | 0 | 0 | 0 | 1 |
| 7 | 3 | 3 | 1 | 0 | 1 |
| 558 | 1 | 1 | 1 | 1 | 0 |
712 rows × 5 columns
y_train
790    0
467    0
431    1
710    1
608    1
      ..
536    0
198    1
876    0
7      0
558    1
Name: Survived, Length: 712, dtype: int64
X_test
|   | Pclass | SibSp | Parch | Sex_female | Sex_male |
|---|---|---|---|---|---|
| 177 | 1 | 0 | 0 | 1 | 0 |
| 846 | 3 | 8 | 2 | 0 | 1 |
| 69 | 3 | 2 | 0 | 0 | 1 |
| 682 | 3 | 0 | 0 | 0 | 1 |
| 569 | 3 | 0 | 0 | 0 | 1 |
| ... | ... | ... | ... | ... | ... |
| 797 | 3 | 0 | 0 | 1 | 0 |
| 91 | 3 | 0 | 0 | 0 | 1 |
| 312 | 2 | 1 | 1 | 1 | 0 |
| 540 | 1 | 0 | 2 | 1 | 0 |
| 135 | 2 | 0 | 0 | 0 | 1 |
179 rows × 5 columns
y_test
177    0
846    0
69     0
682    0
569    1
      ..
797    1
91     0
312    0
540    1
135    0
Name: Survived, Length: 179, dtype: int64
# Try k = 1..9 and report the test accuracy for each
j = 10
for i in range(1, j):
    KNN_model = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    yhat_Knn = KNN_model.predict(X_test)
    acc_sc = metrics.accuracy_score(y_test, yhat_Knn)
    print('Neighbour {} with accuracy {}'.format(i, acc_sc))
Neighbour 1 with accuracy 0.8268156424581006
Neighbour 2 with accuracy 0.8212290502793296
Neighbour 3 with accuracy 0.8491620111731844
Neighbour 4 with accuracy 0.7988826815642458
Neighbour 5 with accuracy 0.8156424581005587
Neighbour 6 with accuracy 0.7988826815642458
Neighbour 7 with accuracy 0.7988826815642458
Neighbour 8 with accuracy 0.770949720670391
Neighbour 9 with accuracy 0.776536312849162
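GridSearchCV was imported above but never used; the same sweep over k could also be written as a cross-validated search. A minimal sketch using only what is already imported (the cv=5 and scoring choices are my own assumptions, not from the original run):
# Hypothetical alternative to the manual loop: 5-fold cross-validated search over k
param_grid = {'n_neighbors': list(range(1, 10))}
grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)
print('Best k:', grid.best_params_['n_neighbors'])
print('Best cross-validated accuracy:', grid.best_score_)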
k = 3
neigh = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
yhat_knn2 = neigh.predict(X_test)
jc_knn = jaccard_score(y_test, yhat_knn2, pos_label=1)
print(classification_report(y_test, yhat_knn2))
              precision    recall  f1-score   support

           0       0.86      0.91      0.89       114
           1       0.83      0.74      0.78        65

    accuracy                           0.85       179
   macro avg       0.84      0.83      0.83       179
weighted avg       0.85      0.85      0.85       179
yhat_knn2
array([1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0])
dtree = DecisionTreeClassifier(criterion="entropy", max_depth=4)
dtree.fit(X_train, y_train)
predtree = dtree.predict(X_test)
print("Decision tree accuracy:", metrics.accuracy_score(y_test, predtree))
Decision tree accuracy: 0.7988826815642458
jc_dtree = jaccard_score(y_test, predtree, pos_label=1)
jc_dtree
0.5135135135135135
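Since matplotlib is already imported, the fitted tree can also be inspected visually with sklearn.tree.plot_tree. This was not part of the original run; the figure size and class labels below are my own choices:
# Sketch: render the fitted decision tree (assumes the dtree fitted above)
from sklearn.tree import plot_tree
plt.figure(figsize=(16, 8))
plot_tree(dtree, feature_names=list(X.columns), class_names=['Died', 'Survived'], filled=True)
plt.show()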
clf = svm.SVC(kernel="rbf")
clf.fit(X_train, y_train)
SVC()
yhat_svm = clf.predict(X_test)
jc_svm = jaccard_score(y_test, yhat_svm, pos_label=1)
jc_svm
0.56
print(classification_report(y_test,yhat_svm))
              precision    recall  f1-score   support

           0       0.82      0.91      0.86       114
           1       0.81      0.65      0.72        65

    accuracy                           0.82       179
   macro avg       0.81      0.78      0.79       179
weighted avg       0.81      0.82      0.81       179
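confusion_matrix was imported at the top but never called; a quick look at where the SVM goes wrong on the held-out split (my own addition, using only variables defined above):
# Rows are the true classes (0 = died, 1 = survived), columns are the predicted classes
print(confusion_matrix(y_test, yhat_svm))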
LR = LogisticRegression(C=0.01, solver="liblinear").fit(X_train, y_train)
yhat_LR = LR.predict(X_test)
jc_logreg = jaccard_score(y_test, yhat_LR, pos_label=1)
jc_logreg
0.4444444444444444
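log_loss was also imported but never used; it scores predicted probabilities rather than hard labels, and LogisticRegression can produce those via predict_proba. A small sketch (my own addition):
# Probability of each class for every test passenger, then the log-loss of those probabilities
yhat_LR_prob = LR.predict_proba(X_test)
print('LogLoss: %.4f' % log_loss(y_test, yhat_LR_prob))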
print(classification_report(y_test,yhat_LR))
              precision    recall  f1-score   support

           0       0.76      0.94      0.84       114
           1       0.82      0.49      0.62        65

    accuracy                           0.78       179
   macro avg       0.79      0.72      0.73       179
weighted avg       0.78      0.78      0.76       179
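All four Jaccard scores (pos_label=1, i.e. the "survived" class) were computed above; collecting them in one frame makes the comparison explicit. The summary table itself is my own addition:
# Compare the Jaccard scores computed for each classifier above
summary = pd.DataFrame(
    {'Algorithm': ['KNN (k=3)', 'Decision Tree', 'SVM (rbf)', 'Logistic Regression'],
     'Jaccard': [jc_knn, jc_dtree, jc_svm, jc_logreg]})
print(summary.sort_values('Jaccard', ascending=False))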
X2 = pd.get_dummies(test_data[features])
X2
|   | Pclass | SibSp | Parch | Sex_female | Sex_male |
|---|---|---|---|---|---|
| 0 | 3 | 0 | 0 | 0 | 1 |
| 1 | 3 | 1 | 0 | 1 | 0 |
| 2 | 2 | 0 | 0 | 0 | 1 |
| 3 | 3 | 0 | 0 | 0 | 1 |
| 4 | 3 | 1 | 1 | 1 | 0 |
| ... | ... | ... | ... | ... | ... |
| 413 | 3 | 0 | 0 | 0 | 1 |
| 414 | 1 | 0 | 0 | 1 | 0 |
| 415 | 3 | 0 | 0 | 0 | 1 |
| 416 | 3 | 0 | 0 | 0 | 1 |
| 417 | 3 | 1 | 1 | 0 | 1 |
418 rows × 5 columns
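One caveat with calling pd.get_dummies separately on train and test: if a category appeared in one set but not the other, the column sets would differ and predict would fail or silently misalign. It happens to be fine here, but a defensive construction (my own suggestion, not in the original run) is to reindex the test frame against the training columns:
# Hypothetical safeguard: force X2 to have exactly the training columns, filling any missing dummy with 0
X2 = pd.get_dummies(test_data[features]).reindex(columns=X.columns, fill_value=0)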
yhat_Knn3 = neigh.predict(X2)
yhat_Knn3
array([0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0])
# The k = 3 KNN model scored best above (0.8491 accuracy on the held-out split), so its predictions on the test set (yhat_Knn3) go into the submission.
# Kaggle expects the columns to be named exactly 'PassengerId' and 'Survived'.
output = pd.DataFrame(
    {'PassengerId': test_data.PassengerId, 'Survived': yhat_Knn3})
output
|   | PassengerId | Survived |
|---|---|---|
| 0 | 892 | 0 |
| 1 | 893 | 1 |
| 2 | 894 | 0 |
| 3 | 895 | 0 |
| 4 | 896 | 0 |
| ... | ... | ... |
| 413 | 1305 | 0 |
| 414 | 1306 | 1 |
| 415 | 1307 | 0 |
| 416 | 1308 | 0 |
| 417 | 1309 | 0 |
418 rows × 2 columns
output.to_csv('/content/drive/MyDrive/Books/Kaggle/prediction.csv',index=False)