目标:利用给定数据,预测是否 Survived
参考:https://www.kaggle.com/omarelgabry/a-journey-through-titanic
主要思路是:对类别特征做 one-hot 编码(get_dummies),再删除其中 Survived 概率低的那一列。
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
# Load the training and test sets from CSV files in the working directory.
titanic_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
titanic_df.head()
# Print basic information about both frames.
titanic_df.info()
# The call form of print behaves the same under Python 2 and Python 3.
print('-' * 40)
test_df.info()
这里不打算使用 'PassengerId'、'Name'、'Ticket' 这三列进行预测,所以提前 drop 掉(test_df 中的 'PassengerId' 暂时保留,后面生成提交文件时还要用到)。
# Columns that are not used as predictive features in the training set.
unused_train_columns = ['PassengerId', 'Name', 'Ticket']
titanic_df = titanic_df.drop(unused_train_columns, axis=1)
titanic_df.head()
test_df.head()
# Only Name and Ticket are removed from test_df here; PassengerId stays.
unused_test_columns = ['Name', 'Ticket']
test_df = test_df.drop(unused_test_columns, axis=1)
test_df.head()
首先查看哪些列有缺失数据,然后逐一处理。
# Find out how much data is missing.
def isnan(d):
    """Return the number of null entries in ``d``.

    ``d`` is a pandas object exposing ``isnull()`` (used below on Series,
    both directly and via ``DataFrame.apply``). The count uses the
    vectorised ``.sum()`` instead of iterating with the builtin ``sum``.
    """
    return d.isnull().sum()
# Per-column missing-value counts: training set first, then the test set.
# The call form of print behaves the same under Python 2 and Python 3.
print(titanic_df.apply(isnan))
print('-' * 40)
test_df.apply(isnan)
填充后 get_dummies,删除 Survived 概率低的列。
# Only titanic_df has missing Embarked values; fill them with 'S'.
titanic_df['Embarked'] = titanic_df['Embarked'].fillna('S')
sns.factorplot(x='Embarked', y='Survived', data=titanic_df, size=4, aspect=3)
# Three panels: passenger counts per port, counts split by Survived,
# and the mean of Survived per port.
fig, (axis1, axis2, axis3) = plt.subplots(1, 3, figsize=(15, 5))
sns.countplot(x='Embarked', data=titanic_df, ax=axis1)
sns.countplot(x='Survived', hue='Embarked', data=titanic_df, ax=axis2)
embarked_survival = titanic_df[['Embarked', 'Survived']]
embark_perc = embarked_survival.groupby(['Embarked'], as_index=False).mean()
sns.barplot(
    x='Embarked',
    y='Survived',
    data=embark_perc,
    order=['S', 'C', 'Q'],
    ax=axis3,
)
# One-hot encode Embarked for the training set, then discard the 'S' column.
embark_dummies_titanic = pd.get_dummies(titanic_df['Embarked'])
embark_dummies_titanic.head()
embark_dummies_titanic = embark_dummies_titanic.drop('S', axis=1)
embark_dummies_titanic.head()
# Same encoding for the test set.
embark_dummies_test = pd.get_dummies(test_df['Embarked']).drop('S', axis=1)
embark_dummies_test.head()
# Attach the remaining dummy columns to both frames.
titanic_df = titanic_df.join(embark_dummies_titanic)
test_df = test_df.join(embark_dummies_test)
titanic_df.head()
# The original categorical column is no longer needed.
titanic_df = titanic_df.drop('Embarked', axis=1)
titanic_df.head()
test_df = test_df.drop('Embarked', axis=1)
test_df.head()
使用中位数填充后转为整数
# Only test_df needs its Fare filled; the median fare is used as the filler.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the column selection, which can end up modifying
# a temporary copy and leave test_df unchanged.
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())
isnan(test_df['Fare'])
# Convert Fare to integers.
titanic_df['Fare'] = titanic_df['Fare'].astype(int)
test_df['Fare'] = test_df['Fare'].astype(int)
titanic_df.head()
# Fare split by Survived value, with the mean and standard deviation of each
# group (row 0: Survived == 0, row 1: Survived == 1).
fare_not_survived = titanic_df.loc[titanic_df['Survived'] == 0, 'Fare']
fare_survived = titanic_df.loc[titanic_df['Survived'] == 1, 'Fare']
fare_means = [fare_not_survived.mean(), fare_survived.mean()]
fare_stds = [fare_not_survived.std(), fare_survived.std()]
average_fare = pd.DataFrame(fare_means)
std_fare = pd.DataFrame(fare_stds)
average_fare
titanic_df['Fare'].plot(kind='hist', figsize=(15, 3), bins=100, xlim=(0, 50))
# Name the index of both frames before drawing the bar chart with error bars.
average_fare.index.names = ['Survived']
std_fare.index.names = ['Survived']
average_fare.plot(yerr=std_fare, kind='bar')
# Two side-by-side axes: Age before and after the missing values are filled.
fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(15, 4))
axis1.set_title('Original Age values - Titanic')
axis2.set_title('New Age values - Titanic')

# Mean, standard deviation and missing count of Age in the training set.
average_age_titanic = titanic_df['Age'].mean()
std_age_titanic = titanic_df['Age'].std()
count_nan_age_titanoc = isnan(titanic_df['Age'])

# The same three statistics for the test set.
average_age_test = test_df['Age'].mean()
std_age_test = test_df['Age'].std()
count_nan_age_test = isnan(test_df['Age'])

# Random integers for the missing Age values, drawn with mean - std and
# mean + std as the randint bounds; one value per missing entry.
age_low_titanic = average_age_titanic - std_age_titanic
age_high_titanic = average_age_titanic + std_age_titanic
rand_1 = np.random.randint(age_low_titanic, age_high_titanic, size=count_nan_age_titanoc)
age_low_test = average_age_test - std_age_test
age_high_test = average_age_test + std_age_test
rand_2 = np.random.randint(age_low_test, age_high_test, size=count_nan_age_test)

# Plot the original (non-null) ages before they are overwritten.
titanic_df['Age'].dropna().astype(int).hist(bins=70, ax=axis1)

missing_age_titanic = titanic_df['Age'].isnull()
titanic_df.loc[missing_age_titanic, 'Age'] = rand_1
missing_age_test = test_df['Age'].isnull()
test_df.loc[missing_age_test, 'Age'] = rand_2

# Convert Age to integers.
titanic_df['Age'] = titanic_df['Age'].astype(int)
test_df['Age'] = test_df['Age'].astype(int)

# Plot the ages again now that the gaps are filled.
titanic_df['Age'].hist(bins=70, ax=axis2)
# Density of Age, one curve per Survived value.
facet = sns.FacetGrid(titanic_df, hue='Survived', aspect=4)
facet.map(sns.kdeplot, 'Age', shade=True)
oldest_age = titanic_df['Age'].max()
facet.set(xlim=(0, oldest_age))
facet.add_legend()
plt.show()

# Mean of Survived for each Age value.
age_survival = titanic_df[['Age', 'Survived']]
average_age = age_survival.groupby(['Age'], as_index=False).mean()
fig, axis1 = plt.subplots(1, 1, figsize=(18, 4))
sns.barplot(x='Age', y='Survived', data=average_age)
plt.show()
Cabin 列缺失值太多,直接删除该列。
# Remove the Cabin column from both data sets.
titanic_df = titanic_df.drop('Cabin', axis=1)
test_df = test_df.drop('Cabin', axis=1)
titanic_df.head()
# Family is a binary flag: 1 when Parch + SibSp is positive, otherwise 0.
# A single vectorised assignment avoids chained indexing
# (df['Family'].loc[mask] = value), which can write to a temporary copy
# and leave the frame unchanged.
# NOTE(review): assumes Parch and SibSp contain no missing values - confirm.
titanic_df['Family'] = ((titanic_df['Parch'] + titanic_df['SibSp']) > 0).astype('int64')
test_df['Family'] = ((test_df['Parch'] + test_df['SibSp']) > 0).astype('int64')
# Drop Parch and SibSp now that Family replaces them.
titanic_df.drop(['Parch', 'SibSp'], axis=1, inplace=True)
test_df.drop(['Parch', 'SibSp'], axis=1, inplace=True)
titanic_df.head()
# Left: passenger counts per Family flag. Right: mean of Survived per flag.
fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(10, 5))
sns.countplot(x='Family', data=titanic_df, order=[1, 0], ax=axis1)
axis1.set_xticklabels(['With Family', 'Alone'], rotation=0)
family_survival = titanic_df[['Family', 'Survived']]
family_perc = family_survival.groupby(['Family'], as_index=False).mean()
sns.barplot(x='Family', y='Survived', data=family_perc, order=[1, 0], ax=axis2)
axis2.set_xticklabels(['With Family', 'Alone'], rotation=0)
def get_person(passenger):
    """Classify a passenger as ``'child'`` or by sex.

    ``passenger`` is a two-item sequence ``(age, sex)``; below it is called
    with rows of the ``['Age', 'Sex']`` columns via ``DataFrame.apply``.

    Returns ``'child'`` when ``age`` is below 16, otherwise ``sex`` unchanged.
    """
    age, sex = passenger
    return 'child' if age < 16 else sex
# Derive Person ('child', or the passenger's Sex value) for both data sets.
person_inputs = ['Age', 'Sex']
titanic_df['Person'] = titanic_df[person_inputs].apply(get_person, axis=1)
test_df['Person'] = test_df[person_inputs].apply(get_person, axis=1)
titanic_df.head()
# Drop Sex.
titanic_df = titanic_df.drop('Sex', axis=1)
test_df = test_df.drop('Sex', axis=1)
titanic_df.head()
# Left: passenger counts per Person. Right: mean of Survived per Person.
fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(10, 5))
sns.countplot(x='Person', data=titanic_df, ax=axis1)
person_survival = titanic_df[['Person', 'Survived']]
person_perc = person_survival.groupby(['Person'], as_index=False).mean()
sns.barplot(
    x='Person',
    y='Survived',
    data=person_perc,
    order=['male', 'female', 'child'],
    ax=axis2,
)
plt.show()
# One-hot encode Person for the training set.
person_dummies_titanic = pd.get_dummies(titanic_df['Person'])
# 'male' is dropped because the survival rate among males is very low.
person_dummies_titanic = person_dummies_titanic.drop('male', axis=1)
person_dummies_titanic.head()
# Same encoding for the test set.
person_dummies_test = pd.get_dummies(test_df['Person']).drop('male', axis=1)
person_dummies_test.head()
# Attach the remaining dummy columns to both frames.
titanic_df = titanic_df.join(person_dummies_titanic)
test_df = test_df.join(person_dummies_test)
titanic_df.head()
# Drop Person.
titanic_df = titanic_df.drop('Person', axis=1)
test_df = test_df.drop('Person', axis=1)
titanic_df.head()
将 Pclass get_dummies,再删除 Survived 概率小的列。
sns.factorplot(x='Pclass', y='Survived', order=[1, 2, 3], data=titanic_df, size=4, aspect=3)
plt.show()
# One-hot encode Pclass for the training set, rename the columns, and
# discard Class_3.
pclass_dummies_titanic = pd.get_dummies(titanic_df['Pclass'])
pclass_dummies_titanic.head()
pclass_dummies_titanic.columns = ['Class_1', 'Class_2', 'Class_3']  # rename
pclass_dummies_titanic = pclass_dummies_titanic.drop('Class_3', axis=1)
pclass_dummies_titanic.head()
# Same encoding for the test set.
pclass_dummies_test = pd.get_dummies(test_df['Pclass'])
pclass_dummies_test.columns = ['Class_1', 'Class_2', 'Class_3']
pclass_dummies_test = pclass_dummies_test.drop('Class_3', axis=1)
pclass_dummies_test.head()
# Replace the original Pclass column with the dummy columns.
titanic_df = titanic_df.drop('Pclass', axis=1)
test_df = test_df.drop('Pclass', axis=1)
titanic_df = titanic_df.join(pclass_dummies_titanic)
test_df = test_df.join(pclass_dummies_test)
titanic_df.head()
# Training target and feature matrix.
y_train = titanic_df['Survived']
X_train = titanic_df.drop('Survived', axis=1)
# Test features: every test_df column except PassengerId.
X_test = test_df.drop(['PassengerId'], axis=1)
test_df.head()
# Each model is fitted on the training data and used to predict X_test.
# Every score() call below is evaluated on the training data itself.

# Logistic regression
logreg = LogisticRegression().fit(X_train, y_train)
y_pred1 = logreg.predict(X_test)
logreg.score(X_train, y_train)

# Support vector classifier
svc = SVC().fit(X_train, y_train)
y_pred2 = svc.predict(X_test)
svc.score(X_train, y_train)

# Random forest with 100 trees
random_forest = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
y_pred3 = random_forest.predict(X_test)
random_forest.score(X_train, y_train)

# k-nearest neighbours with k = 3
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
y_pred4 = knn.predict(X_test)
knn.score(X_train, y_train)

# Gaussian naive Bayes
gaussian = GaussianNB().fit(X_train, y_train)
y_pred5 = gaussian.predict(X_test)
gaussian.score(X_train, y_train)

# Gradient boosting
gbdt = GradientBoostingClassifier().fit(X_train, y_train)
y_pred6 = gbdt.predict(X_test)
gbdt.score(X_train, y_train)
titanic_df.columns
# Feature names are taken from X_train so they stay aligned with the columns
# logreg was fitted on, instead of deleting position 0 of titanic_df.columns
# and relying on Survived being the first column.
X_train.columns
coeff_df = pd.DataFrame({'Features': X_train.columns})
coeff_df
# Pair each feature with its logistic-regression coefficient.
coeff_df['Coefficient Estimate'] = pd.Series(logreg.coef_[0])
coeff_df
# Write one submission CSV per model: PassengerId from test_df alongside that
# model's predictions. The pairs are listed in the order the files are written.
submission_outputs = [
    ('submission_3_1_logreg.csv', y_pred1),
    ('submission_3_2_svc.csv', y_pred2),
    ('submission_3_3_rdforest.csv', y_pred3),
    ('submission_3_4_knn.csv', y_pred4),
    ('submission_3_5_gaussian.csv', y_pred5),
    ('submission_3_6_gbdt.csv', y_pred6),
]
for submission_path, predictions in submission_outputs:
    submission = pd.DataFrame({
        'PassengerId': test_df['PassengerId'],
        'Survived': predictions
    })
    submission.to_csv(submission_path, index=False)