泰坦尼克号生还预测

导入模块

import pandas as pd # 数据处理
import matplotlib.pyplot as plt # 画图
from sklearn.tree import DecisionTreeClassifier # 决策树模型
from sklearn.model_selection import train_test_split #划分训练集和测试集
from sklearn.model_selection import GridSearchCV # 网格搜索(内含交叉验证)
from sklearn.model_selection import cross_val_score # 交叉验证

读取数据表

# Load the passenger table from CSV and print its column summary
csv_path = "./data/data.csv"
data = pd.read_csv(csv_path)
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB

数据预处理

# Drop the columns that are not used as features
data.drop(columns=["Cabin", "Name", "Ticket"], inplace=True)
# Impute missing ages with the mean of the observed ages
mean_age = data["Age"].mean()
data["Age"] = data["Age"].fillna(mean_age)
# Remove every row that still contains a missing value
data.dropna(axis="index", how="any", inplace=True)
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Sex          889 non-null    object 
 4   Age          889 non-null    float64
 5   SibSp        889 non-null    int64  
 6   Parch        889 non-null    int64  
 7   Fare         889 non-null    float64
 8   Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(2)
memory usage: 69.5+ KB
# Encode sex as an integer flag: 1 for "male", 0 for any other value
data["Sex"] = data["Sex"].eq("male").astype("int")
# Replace each Embarked value with its position in the list of distinct values
labels = data["Embarked"].unique().tolist()
embarked_codes = {port: code for code, port in enumerate(labels)}
data["Embarked"] = data["Embarked"].map(embarked_codes)
# Feature matrix: every column except the target.
# NOTE(review): PassengerId is still a column at this point, so it is used as
# a feature - confirm that this is intended.
x = data.loc[:, data.columns != "Survived"]
# Target, kept as a one-column DataFrame
y = data.loc[:, ["Survived"]]

划分测试集和训练集

# Hold out 25% of the rows for testing. The fixed seed makes the split, and
# therefore every score computed from it, reproducible from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=30
)
# Renumber each subset from 0 so that row labels match row positions again
x_train, x_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in (x_train, x_test, y_train, y_test)
)

训练决策树模型

# Build a decision tree with a fixed seed and train it on the training split
clf = DecisionTreeClassifier(random_state=30).fit(x_train, y_train)
# Score the fitted tree on the held-out test split
score_ = clf.score(X=x_test, y=y_test)
score_
0.7713004484304933

交叉验证

# Score the tree with 10-fold cross-validation over the full data set (x, y)
corss = cross_val_score(estimator=clf, X=x, y=y, cv=10)
# Average the per-fold scores into a single figure
score = corss.mean()
score

0.7514683350357507

展示树最大深度的拟合曲线

# Fit one tree per max_depth value and record three score curves.
depths = range(1, 11)  # tree depths 1-10
# Score on the training split, one entry per depth
train = []
# Mean 10-fold cross-validation score on the full data (x, y), one per depth
corss = []
# Score on the held-out test split, one entry per depth
test = []
for depth in depths:
    clf = DecisionTreeClassifier(
        criterion="gini",
        random_state=30,
        max_depth=depth,
    )
    clf.fit(x_train, y_train)
    train.append(clf.score(x_train, y_train))
    test.append(clf.score(x_test, y_test))
    corss.append(cross_val_score(clf, x, y, cv=10).mean())
print("训练集中最好得分:{}".format(max(train)))
print("交叉验证中最好得分:{}".format(max(corss)))
print("测试集中最好得分:{}".format(max(test)))
plt.figure()
plt.plot(depths, train, color="blue", label="score_train")
plt.plot(depths, corss, color="red", label="score_corss")
plt.plot(depths, test, color="black", label="score_test")
# xticks places a tick at every depth. The previous xlabel(range(1, 11)) call
# only set the axis caption to the text "range(1, 11)".
plt.xticks(depths)
plt.xlabel("max_depth")
plt.legend()
plt.show()
训练集中最好得分:0.9504504504504504
交叉验证中最好得分:0.8143896833503576
测试集中最好得分:0.8295964125560538

网格搜索

import numpy as np

# 20 evenly spaced values from 0 to 0.5.
# NOTE(review): not referenced by the parameter grid below.
gini_thresholds = np.linspace(start=0, stop=0.5, num=20)
# Candidate values for each decision-tree hyper-parameter
parameters = {
    "splitter": ("best", "random"),
    "criterion": ("gini", "entropy"),
    "max_depth": list(range(1, 10)),
    "min_samples_leaf": list(range(1, 50, 5)),
}
# Base estimator with a fixed seed
clf = DecisionTreeClassifier(random_state=25)
# Grid search scored with 10-fold cross-validation, run on the training split
search = GridSearchCV(estimator=clf, param_grid=parameters, cv=10)
search.fit(x_train, y_train)

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(random_state=25),
             param_grid={'criterion': ('gini', 'entropy'),
                         'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9],
                         'min_samples_leaf': [1, 6, 11, 16, 21, 26, 31, 36, 41,
                                              46],
                         'splitter': ('best', 'random')})
# Estimator with the best parameter combination found by the grid search
search.best_estimator_
DecisionTreeClassifier(max_depth=7, min_samples_leaf=6, random_state=25,
                       splitter='random')
# Best score achieved during the grid search
search.best_score_
0.8227951153324288