Titanic: Machine Learning from Disaster

目标:利用给定数据,预测是否 Survived

加载数据

In [2]:
import pandas as pd
import matplotlib.pyplot as plt

datas=pd.read_csv('train.csv')
testdatas=pd.read_csv('test.csv')

datas.head()
Out[2]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

查看数据缺失情况

In [3]:
def isnan(d):
    return sum(d.isnull())
print 'datas:\n'
print datas.apply(isnan)
print '='*30
print 'testdatas:\n'
print testdatas.apply(isnan)
datas:

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
==============================
testdatas:

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

填充缺失数据

In [4]:
# 为了便于计算,将字符转为数值
def formatData(data):
    data['Age']=data['Age'].fillna(data['Age'].mean())
    data.loc[data['Sex']=='male','Sex']=0
    data.loc[data['Sex']=='female','Sex']=1
    data['Embarked']=data['Embarked'].fillna('S')
    data['Fare']=data['Fare'].fillna(data['Fare'].mean())
    data.loc[data['Embarked']=='S','Embarked']=0
    data.loc[data['Embarked']=='C','Embarked']=1
    data.loc[data['Embarked']=='Q','Embarked']=2
    return data 
In [6]:
train=formatData(datas)
test=formatData(testdatas)

训练和调参

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV

predictors=['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']
X=train[predictors]
y=train['Survived']

param_test1={'n_estimators':range(10,70,10)}
gsearch1=GridSearchCV(estimator=RandomForestClassifier(random_state=10),param_grid=param_test1,scoring='roc_auc',cv=5)
gsearch1.fit(X,y)
gsearch1.best_params_,gsearch1.best_score_
Out[8]:
({'n_estimators': 40}, 0.8564240020271153)
In [9]:
param_test1={'n_estimators':range(30,50,1)}
gsearch1=GridSearchCV(estimator=RandomForestClassifier(random_state=10),param_grid=param_test1,scoring='roc_auc',cv=5)
gsearch1.fit(X,y)
gsearch1.best_params_,gsearch1.best_score_
Out[9]:
({'n_estimators': 35}, 0.8567040454920344)
In [10]:
rf2=RandomForestClassifier(n_estimators=35,oob_score=True,random_state=10)
rf2.fit(X,y)
rf2.oob_score_
Out[10]:
0.80695847362514028
In [11]:
param_test2={'min_samples_leaf':range(1,50,10)}
gsearch2=GridSearchCV(estimator=RandomForestClassifier(n_estimators=35,random_state=10),param_grid=param_test2,scoring='roc_auc',cv=5)
gsearch2.fit(X,y)
gsearch2.best_params_,gsearch2.best_score_
Out[11]:
({'min_samples_leaf': 11}, 0.8634286494439564)
In [12]:
param_test2={'min_samples_leaf':range(1,20,1)}
gsearch2=GridSearchCV(estimator=RandomForestClassifier(n_estimators=35,random_state=10),param_grid=param_test2,scoring='roc_auc',cv=5)
gsearch2.fit(X,y)
gsearch2.best_params_,gsearch2.best_score_
Out[12]:
({'min_samples_leaf': 3}, 0.8721042012921127)
In [13]:
rf3=RandomForestClassifier(n_estimators=35,min_samples_leaf=3,oob_score=True,random_state=10)
rf3.fit(X,y)
rf3.oob_score_
Out[13]:
0.81481481481481477
In [14]:
param_test3={'min_samples_split':range(2,22,1)}
gsearch3=GridSearchCV(estimator=RandomForestClassifier(n_estimators=35,min_samples_leaf=3,random_state=10),param_grid=param_test3,scoring='roc_auc',cv=5)
gsearch3.fit(X,y)
gsearch3.best_params_,gsearch3.best_score_
Out[14]:
({'min_samples_split': 14}, 0.8743671086859832)
In [15]:
rf4=RandomForestClassifier(n_estimators=35,min_samples_leaf=3,min_samples_split=14,oob_score=True,random_state=10)
rf4.fit(X,y)
rf4.oob_score_
Out[15]:
0.81930415263748602
In [16]:
param_test4={'max_depth':range(1,21,1)}
gsearch4=GridSearchCV(estimator=RandomForestClassifier(n_estimators=35,min_samples_leaf=3,min_samples_split=14,random_state=10),param_grid=param_test4,scoring='roc_auc',cv=5)
gsearch4.fit(X,y)
gsearch4.best_params_,gsearch3.best_score_
Out[16]:
({'max_depth': 11}, 0.8743671086859832)
In [17]:
param_test5={'max_features':range(1,8)}
gsearch5=GridSearchCV(estimator=RandomForestClassifier(n_estimators=35,min_samples_leaf=3,min_samples_split=14,random_state=10),param_grid=param_test5,scoring='roc_auc',cv=5)
gsearch5.fit(X,y)
gsearch5.best_params_,gsearch3.best_score_
Out[17]:
({'max_features': 2}, 0.8743671086859832)
In [18]:
rf5=RandomForestClassifier(n_estimators=35,min_samples_leaf=3,min_samples_split=14,max_depth=11,oob_score=True,random_state=10)
rf5.fit(X,y)
rf5.oob_score_
Out[18]:
0.82154882154882158
In [19]:
rf6=RandomForestClassifier(n_estimators=35,min_samples_leaf=3,min_samples_split=14,max_depth=11,max_features=7,oob_score=True,random_state=10)
rf6.fit(X,y)
rf6.oob_score_
Out[19]:
0.83164983164983164

预测

In [20]:
alg=RandomForestClassifier(n_estimators=35,min_samples_leaf=3,min_samples_split=14,max_depth=11,max_features=7,oob_score=True,random_state=10)
alg.fit(X,y)
predict=alg.predict(test[predictors])

保存预测结果

In [21]:
results=pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': predict})

results.to_csv('submission_1.csv',index=False)