import pandas as pd
import matplotlib.pyplot as plt
# Load the training and test sets from CSV files in the working directory
# (train.csv first, then test.csv).
datas, testdatas = (pd.read_csv(name) for name in ('train.csv', 'test.csv'))
# Peek at the first rows of the training data; as a bare expression this only
# shows something in an interactive session.
datas.head()
def isnan(d):
    """Count the missing (null) entries in ``d``.

    Args:
        d: A pandas Series, e.g. one column handed over by
            ``DataFrame.apply``. A DataFrame is accepted as well, in which
            case the counts are returned per column.

    Returns:
        The number of null entries (a Series of counts for a DataFrame).
    """
    # Let pandas do the summation instead of iterating in Python.
    return d.isnull().sum()
# Report the number of missing values per column for both data sets.
# Single-argument print(...) produces the same output on Python 2 and 3.
print('datas:\n')
print(datas.apply(isnan))
print('=' * 30)
print('testdatas:\n')
print(testdatas.apply(isnan))
# To make computation easier, convert the string columns to numeric codes.
def formatData(data):
    """Fill missing values and encode the categorical columns of ``data``.

    The frame is modified in place and also returned, so callers may use
    either the argument or the return value.

    * ``Age`` and ``Fare``: missing values are replaced by the column mean
      of this same frame.
    * ``Sex``: ``'male'`` -> 0, ``'female'`` -> 1.
    * ``Embarked``: missing values are treated as ``'S'``; then
      ``'S'`` -> 0, ``'C'`` -> 1, ``'Q'`` -> 2.

    Values that have no code are left unchanged.

    Args:
        data: DataFrame with ``Age``, ``Sex``, ``Embarked`` and ``Fare``
            columns.

    Returns:
        The same DataFrame object, after modification.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    def encode(column, codes):
        # Build a new column instead of assigning integers into a string
        # column through boolean masks; when every value has a code the
        # result gets an integer dtype rather than staying ``object``.
        return column.map(lambda value: codes.get(value, value))

    data['Age'] = data['Age'].fillna(data['Age'].mean())
    data['Fare'] = data['Fare'].fillna(data['Fare'].mean())
    data['Sex'] = encode(data['Sex'], {'male': 0, 'female': 1})
    data['Embarked'] = encode(
        data['Embarked'].fillna('S'),
        {'S': 0, 'C': 1, 'Q': 2},
    )
    return data
# Clean both data sets. formatData modifies the frame it is given and returns
# that same frame, so train/test refer to the same objects as datas/testdatas.
train = formatData(datas)
test = formatData(testdatas)

from sklearn.ensemble import RandomForestClassifier
try:
    # GridSearchCV lives in sklearn.model_selection since scikit-learn 0.18;
    # the old sklearn.grid_search module was removed in 0.20.
    from sklearn.model_selection import GridSearchCV
except ImportError:
    # Fall back for releases that predate sklearn.model_selection.
    from sklearn.grid_search import GridSearchCV

# Feature columns used for fitting/prediction, and the target column.
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X = train[predictors]
y = train['Survived']
# Stage 1: tune n_estimators, scored by 5-fold cross-validated ROC AUC.
# Coarse sweep first: 10, 20, ..., 60 trees.
param_test1 = dict(n_estimators=range(10, 70, 10))
gsearch1 = GridSearchCV(
    estimator=RandomForestClassifier(random_state=10),
    param_grid=param_test1,
    scoring='roc_auc',
    cv=5,
)
gsearch1.fit(X, y)
# Bare expression: shows the best setting and its score in an interactive
# session; it has no visible effect when run as a plain script.
gsearch1.best_params_, gsearch1.best_score_

# Fine sweep: every value from 30 to 49.
param_test1 = dict(n_estimators=range(30, 50, 1))
gsearch1 = GridSearchCV(
    estimator=RandomForestClassifier(random_state=10),
    param_grid=param_test1,
    scoring='roc_auc',
    cv=5,
)
gsearch1.fit(X, y)
gsearch1.best_params_, gsearch1.best_score_

# Fit a forest with n_estimators=35 (presumably the value chosen from the
# search above) and look at its out-of-bag score.
rf2 = RandomForestClassifier(
    n_estimators=35,
    oob_score=True,
    random_state=10,
)
rf2.fit(X, y)
rf2.oob_score_
# Stage 2: tune min_samples_leaf with n_estimators fixed at 35, scored by
# 5-fold cross-validated ROC AUC.
# Coarse sweep first: 1, 11, 21, 31, 41.
param_test2 = dict(min_samples_leaf=range(1, 50, 10))
gsearch2 = GridSearchCV(
    estimator=RandomForestClassifier(n_estimators=35, random_state=10),
    param_grid=param_test2,
    scoring='roc_auc',
    cv=5,
)
gsearch2.fit(X, y)
# Bare expression: only displayed in an interactive session.
gsearch2.best_params_, gsearch2.best_score_

# Fine sweep: every value from 1 to 19.
param_test2 = dict(min_samples_leaf=range(1, 20, 1))
gsearch2 = GridSearchCV(
    estimator=RandomForestClassifier(n_estimators=35, random_state=10),
    param_grid=param_test2,
    scoring='roc_auc',
    cv=5,
)
gsearch2.fit(X, y)
gsearch2.best_params_, gsearch2.best_score_

# Fit a forest with min_samples_leaf=3 (presumably the value chosen from the
# search above) and look at its out-of-bag score.
rf3 = RandomForestClassifier(
    n_estimators=35,
    min_samples_leaf=3,
    oob_score=True,
    random_state=10,
)
rf3.fit(X, y)
rf3.oob_score_
# Stage 3: tune min_samples_split over 2..21 with n_estimators=35 and
# min_samples_leaf=3 held fixed, scored by 5-fold cross-validated ROC AUC.
param_test3 = dict(min_samples_split=range(2, 22, 1))
gsearch3 = GridSearchCV(
    estimator=RandomForestClassifier(
        n_estimators=35,
        min_samples_leaf=3,
        random_state=10,
    ),
    param_grid=param_test3,
    scoring='roc_auc',
    cv=5,
)
gsearch3.fit(X, y)
# Bare expression: only displayed in an interactive session.
gsearch3.best_params_, gsearch3.best_score_

# Fit a forest with min_samples_split=14 (presumably the value chosen from
# the search above) and look at its out-of-bag score.
rf4 = RandomForestClassifier(
    n_estimators=35,
    min_samples_leaf=3,
    min_samples_split=14,
    oob_score=True,
    random_state=10,
)
rf4.fit(X, y)
rf4.oob_score_
# Stage 4: tune max_depth over 1..20 with n_estimators=35, min_samples_leaf=3
# and min_samples_split=14 held fixed, scored by 5-fold cross-validated
# ROC AUC.
param_test4 = {'max_depth': range(1, 21, 1)}
gsearch4 = GridSearchCV(
    estimator=RandomForestClassifier(
        n_estimators=35,
        min_samples_leaf=3,
        min_samples_split=14,
        random_state=10,
    ),
    param_grid=param_test4,
    scoring='roc_auc',
    cv=5,
)
gsearch4.fit(X, y)
# Pair the best depth with the score of this same search (gsearch4), not the
# score of the earlier min_samples_split search.
# Bare expression: only displayed in an interactive session.
gsearch4.best_params_, gsearch4.best_score_
# Stage 5: tune max_features over 1..7 with n_estimators=35,
# min_samples_leaf=3 and min_samples_split=14 held fixed, scored by 5-fold
# cross-validated ROC AUC.
param_test5 = {'max_features': range(1, 8)}
gsearch5 = GridSearchCV(
    estimator=RandomForestClassifier(
        n_estimators=35,
        min_samples_leaf=3,
        min_samples_split=14,
        random_state=10,
    ),
    param_grid=param_test5,
    scoring='roc_auc',
    cv=5,
)
gsearch5.fit(X, y)
# Pair the best max_features with the score of this same search (gsearch5),
# not the score of the earlier min_samples_split search.
# Bare expression: only displayed in an interactive session.
gsearch5.best_params_, gsearch5.best_score_
# Out-of-bag check with max_depth=11 added to the settings used so far.
rf5 = RandomForestClassifier(
    n_estimators=35,
    min_samples_leaf=3,
    min_samples_split=14,
    max_depth=11,
    oob_score=True,
    random_state=10,
)
rf5.fit(X, y)
# Bare expression: only displayed in an interactive session.
rf5.oob_score_

# Same forest with max_features=7 as well.
rf6 = RandomForestClassifier(
    n_estimators=35,
    min_samples_leaf=3,
    min_samples_split=14,
    max_depth=11,
    max_features=7,
    oob_score=True,
    random_state=10,
)
rf6.fit(X, y)
rf6.oob_score_
# Final model: fit on the full training set, then predict the test set.
alg = RandomForestClassifier(
    n_estimators=35,
    min_samples_leaf=3,
    min_samples_split=14,
    max_depth=11,
    max_features=7,
    oob_score=True,
    random_state=10,
)
alg.fit(X, y)
predict = alg.predict(test[predictors])

# Write one row per test passenger (id plus predicted label), without the
# DataFrame index.
results = pd.DataFrame(
    {
        'PassengerId': test['PassengerId'],
        'Survived': predict,
    }
)
results.to_csv('submission_1.csv', index=False)