Goal: use the given data to predict whether each passenger Survived.
Reference: https://www.kaggle.com/startupsci/titanic-data-science-solutions
Submission scores by model:

LogisticRegression: 0.76555
SVC: 0.77990
LinearSVC: 0.76555
KNeighborsClassifier: 0.77033
GaussianNB: 0.74163
Perceptron: 0.75598
SGDClassifier: 0.79426
DecisionTreeClassifier: 0.78469
RandomForestClassifier: 0.77990
GradientBoostingClassifier: 0.79426

import pandas as pd
import numpy as np
import random as rnd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
train_df=pd.read_csv('train.csv')
test_df=pd.read_csv('test.csv')
combine=[train_df,test_df]
len(combine)
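Note that combine holds references to the two DataFrames, not copies: in-place edits show up through it, while rebinding train_df or test_df does not, which is why combine is rebuilt after every non-inplace drop below. A minimal sketch of this behavior with a toy DataFrame (not the Titanic data):
df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
views = [df]
df.drop('a', axis=1, inplace=True)  # mutates the shared object
views[0].columns.tolist()           # ['b'] -- the change is visible through the list
df = df.drop('b', axis=1)           # rebinds df; views[0] still holds the old object
views[0].columns.tolist()           # still ['b']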
# List the available features
print(train_df.columns.values)
# Preview the data: which features are numeric, which are strings, which are mixed, and which contain NaN
train_df.head()
train_df.tail()
# Check the dtype of each feature
train_df.info()
print('-'*40)
test_df.info()
train_df.describe()
train_df.describe(include=['O'])
train_df[['Pclass','Survived']].groupby(['Pclass'],as_index=False).mean()
train_df[['Sex','Survived']].groupby(['Sex'],as_index=False).mean()
train_df[['SibSp','Survived']].groupby(['SibSp'],as_index=False).mean()
train_df[['Parch','Survived']].groupby(['Parch'],as_index=False).mean()
g=sns.FacetGrid(train_df,col='Survived')
g.map(plt.hist,'Age',bins=20)
grid=sns.FacetGrid(train_df,col='Survived',row='Pclass',height=4,aspect=2)
grid.map(plt.hist,'Age',alpha=.5,bins=20) # alpha sets the transparency
Observations from the plots above:
grid=sns.FacetGrid(train_df,row='Embarked',height=4,aspect=2)
grid.map(sns.pointplot,'Pclass','Survived','Sex',palette='deep') # palette='deep' picks the color palette
grid.add_legend() # the legend maps each color to its Sex value
Observations from the plots above:
grid=sns.FacetGrid(train_df,row='Embarked',col='Survived',height=2.2,aspect=1.6)
grid.map(sns.barplot,'Sex','Fare',alpha=.8)
grid.add_legend()
Observations from the plots above:
# Drop the Ticket and Cabin columns
print("Before", train_df.shape, test_df.shape, combine[0].shape, combine[1].shape)
train_df.drop(['Ticket','Cabin'],axis=1,inplace=True) # inplace=True is required here; otherwise drop returns a new DataFrame and train_df is unchanged
test_df.drop(['Ticket','Cabin'],axis=1,inplace=True)
combine=[train_df,test_df]
print "After",train_df.shape,test_df.shape,combine[0].shape,combine[1].shape
# Extract Title from Name and inspect its relationship with Sex
for dataset in combine:
    dataset['Title']=dataset.Name.str.extract(r' ([A-Za-z]+)\.',expand=False)
pd.crosstab(train_df['Title'],train_df['Sex'])
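As a quick illustration of what the extraction regex does, here is a made-up name in the standard "Last, Title. First" format (not a row from the data); the pattern captures the first word that is preceded by a space and followed by a period:
pd.Series(['Braund, Mr. Owen Harris']).str.extract(r' ([A-Za-z]+)\.', expand=False)
# 0    Mr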
# Replace uncommon Titles with common ones, or group them into a 'Rare' class
for dataset in combine:
    dataset['Title']=dataset['Title'].replace(['Capt', 'Col', 'Countess', 'Don', 'Dr', 'Jonkheer', 'Lady', 'Major', 'Rev', 'Sir'],'Rare')
    dataset['Title']=dataset['Title'].replace('Mlle','Miss')
    dataset['Title']=dataset['Title'].replace('Ms','Miss')
    dataset['Title']=dataset['Title'].replace('Mme','Mrs')
train_df[['Title','Survived']].groupby(['Title'],as_index=False).mean()
# Sanity check
print(len(train_df['Title']), len(test_df['Title']))
# Convert the Title categories to numeric values
title_mapping={'Mr':1,'Miss':2,'Mrs':3,'Master':4,'Rare':5}
for dataset in combine:
    dataset['Title']=dataset['Title'].map(title_mapping)
    dataset['Title']=dataset['Title'].fillna(0)
train_df.head()
# With the Title feature done, Name is no longer needed; PassengerId can also be dropped from the training set
train_df.drop(['Name','PassengerId'],axis=1,inplace=True)
test_df.drop(['Name'],axis=1,inplace=True)
combine=[train_df,test_df]
train_df.shape,test_df.shape
train_df.head()
for dataset in combine:
    dataset['Sex']=dataset['Sex'].map({'female':1,'male':0})
train_df.head()
Missing ages could be filled with the mean, or generated at random from the mean and standard deviation, but ages produced that way deviate too much from the true distribution. Instead, we analyze how Age relates to Pclass and Sex.
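For reference, a minimal sketch of the rejected random-fill approach, using only columns already defined in this notebook; the assignment is left commented out since we do not apply it:
# Rejected alternative: draw missing ages uniformly from [mean - std, mean + std]
mean, std = train_df['Age'].mean(), train_df['Age'].std()
n_missing = train_df['Age'].isnull().sum()
random_ages = np.random.randint(mean - std, mean + std, size=n_missing)
# train_df.loc[train_df['Age'].isnull(), 'Age'] = random_ages  # not applied here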
grid=sns.FacetGrid(train_df,row='Pclass',col='Sex',height=4,aspect=2)
grid.map(plt.hist,'Age',alpha=.5,bins=20)
guess_ages=np.zeros((2,3))
guess_ages
for dataset in combine:
    # Compute the median age for each (Sex, Pclass) cell:
    # Sex=0, Pclass=1 goes to guess_ages[0][0]; Sex=0, Pclass=2 to guess_ages[0][1]; and so on
    for i in range(2):
        for j in range(3):
            guess_df=dataset[(dataset['Sex']==i) & (dataset['Pclass']==j+1)]['Age'].dropna()
            age_guess=guess_df.median()
            # Round the float age to the nearest 0.5
            guess_ages[i,j]=int(age_guess/0.5+0.5)*0.5
    for i in range(2):
        for j in range(3):
            dataset.loc[(dataset.Age.isnull()) & (dataset.Sex==i) & (dataset.Pclass==j+1),'Age']=guess_ages[i,j]
    dataset['Age']=dataset['Age'].astype(int)
train_df.head()
guess_ages
age_guess # after the loops this holds only the last value computed: the median for Sex=1, Pclass=3
# Bin Age into groups and examine the relationship with Survived
train_df['AgeBand']=pd.cut(train_df['Age'],5) # split the continuous values into 5 equal-width bins
train_df[['AgeBand','Survived']].groupby(['AgeBand'],as_index=False).mean()
for dataset in combine:
    dataset.loc[dataset['Age']<=16,'Age']=0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age']=4
train_df.head()
# Remove AgeBand
train_df = train_df.drop(['AgeBand'], axis=1)
combine = [train_df, test_df]
test_df.head()
for dataset in combine:
    dataset['FamilySize']=dataset['SibSp']+dataset['Parch']+1
train_df[['FamilySize','Survived']].groupby(['FamilySize'],as_index=False).mean()
for dataset in combine:
    dataset['IsAlone']=0
    dataset.loc[dataset['FamilySize']==1,'IsAlone']=1
train_df[['IsAlone','Survived']].groupby(['IsAlone'],as_index=False).mean()
# Drop 'Parch', 'SibSp', and 'FamilySize'
train_df.drop(['Parch','SibSp','FamilySize'],axis=1,inplace=True)
test_df.drop(['Parch','SibSp','FamilySize'],axis=1,inplace=True)
combine=[train_df,test_df]
train_df.head()
Age*Class
for dataset in combine:
    dataset['Age*Class']=dataset.Age*dataset.Pclass
train_df.loc[:,['Age','Pclass','Age*Class']].head(10)
freq_port=train_df.Embarked.dropna().mode()[0]
freq_port
for dataset in combine:
    dataset['Embarked']=dataset['Embarked'].fillna(freq_port)
train_df[['Embarked','Survived']].groupby(['Embarked'],as_index=False).mean()
# Convert to numeric
for dataset in combine:
    dataset['Embarked']=dataset['Embarked'].map({'S':0,'C':1,'Q':2})
train_df.head()
test_df['Fare']=test_df['Fare'].fillna(test_df['Fare'].dropna().median())
test_df.head()
train_df['FareBand']=pd.qcut(train_df['Fare'],4)
train_df[['FareBand','Survived']].groupby(['FareBand'],as_index=False).mean()
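Unlike pd.cut above, which splits Age into equal-width intervals, pd.qcut splits Fare by quantiles so each band holds roughly the same number of passengers. A toy illustration of the difference (made-up values, not the Titanic data):
s = pd.Series([1, 2, 3, 4, 100])
pd.cut(s, 2).value_counts()   # equal-width: the wide low bin gets 4 values, the other gets 1
pd.qcut(s, 2).value_counts()  # equal-count: roughly half the values land in each bin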
for dataset in combine:
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare'] = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare']=dataset['Fare'].astype(int)
train_df.drop(['FareBand'],axis=1,inplace=True)
combine=[train_df,test_df]
train_df.head(10)
test_df.head(10)
X_train=train_df.drop('Survived',axis=1)
Y_train=train_df['Survived']
X_test=test_df.drop('PassengerId',axis=1).copy()
X_train.shape,Y_train.shape,X_test.shape
# Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred1 = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
acc_log
# Validate our assumptions with the logistic regression coefficients
coeff_df = pd.DataFrame(train_df.columns.delete(0))
coeff_df.columns = ['Feature']
coeff_df["Correlation"] = pd.Series(logreg.coef_[0])
coeff_df.sort_values(by='Correlation', ascending=False)
# Support Vector Machines
svc = SVC()
svc.fit(X_train, Y_train)
Y_pred2 = svc.predict(X_test)
acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
acc_svc
# Linear SVC
linear_svc = LinearSVC()
linear_svc.fit(X_train, Y_train)
Y_pred3 = linear_svc.predict(X_test)
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)
acc_linear_svc
# KNN
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, Y_train)
Y_pred4 = knn.predict(X_test)
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
acc_knn
# Gaussian Naive Bayes
gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
Y_pred5 = gaussian.predict(X_test)
acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2)
acc_gaussian
# Perceptron
perceptron = Perceptron()
perceptron.fit(X_train, Y_train)
Y_pred6 = perceptron.predict(X_test)
acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)
acc_perceptron
# Stochastic Gradient Descent
sgd = SGDClassifier()
sgd.fit(X_train, Y_train)
Y_pred7 = sgd.predict(X_test)
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
acc_sgd
# Decision Tree
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred8 = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree
# Random Forest
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_pred9 = random_forest.predict(X_test)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest
# GradientBoostingClassifier
gbdt = GradientBoostingClassifier()
gbdt.fit(X_train, Y_train)
Y_pred10 = gbdt.predict(X_test)
acc_gbdt = round(gbdt.score(X_train, Y_train) * 100, 2)
acc_gbdt
models = pd.DataFrame({
'Model': ['Logistic Regression', 'Support Vector Machines',
'Linear SVC', 'KNN', 'Naive Bayes', 'Perceptron',
'Stochastic Gradient Decent', 'Decision Tree',
'Random Forest', 'Gradient Boosting Classifier'],
'Score': [acc_log, acc_svc, acc_linear_svc, acc_knn,
acc_gaussian, acc_perceptron, acc_sgd,
acc_decision_tree, acc_random_forest, acc_gbdt]})
models.sort_values(by='Score', ascending=False)
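These scores are accuracy on the training set, which is why the high-capacity tree models look stronger here than in the submission scores listed at the top. A minimal sketch of a fairer comparison using sklearn's cross_val_score (not part of the original notebook), shown for two of the models as an example:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validated accuracy is a less optimistic estimate than training accuracy
for name, model in [('LogisticRegression', LogisticRegression()),
                    ('RandomForest', RandomForestClassifier(n_estimators=100))]:
    scores = cross_val_score(model, X_train, Y_train, cv=5, scoring='accuracy')
    print(name, round(scores.mean(), 4))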
# Write one submission file per model
for i in range(1,11):
    pred=eval('Y_pred{}'.format(i))  # look up Y_pred1 .. Y_pred10 by name
    submission = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": pred
    })
    filename='submission4_{}.csv'.format(i)
    submission.to_csv(filename,index=False)