随机森林在乳腺癌数据上的调参

导入需要的库

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

导入数据集,探索数据

# Load the breast cancer dataset bundled with scikit-learn.
cancer = load_breast_cancer()
# Show the dimensions of the feature matrix.
cancer.data.shape
(569, 30)

进行一次简单的建模,看看模型本身在数据集上的效果

# Baseline: a 100-tree forest with every other hyperparameter left at its
# default, evaluated with 10-fold cross-validation.
rfc = RandomForestClassifier(random_state=90, n_estimators=100)
baseline_fold_scores = cross_val_score(rfc, cancer.data, cancer.target, cv=10)
# score_pre is the reference value the tuned model is compared against later.
score_pre = baseline_fold_scores.mean()
score_pre
0.9648809523809524

随机森林调整的第一步:无论如何先来调 n_estimators

# Coarse sweep of n_estimators over 1, 11, ..., 191. score_ collects the mean
# 10-fold cross-validation score of each forest size, in sweep order.
score_ = []
for n_trees in range(1, 201, 10):
    rfc = RandomForestClassifier(n_estimators=n_trees, random_state=90)
    fold_scores = cross_val_score(rfc, cancer.data, cancer.target, cv=10)
    score_.append(fold_scores.mean())

查看 n_estimators(1 到 191,步长 10)的最高得分及其对应的取值

# Report the best mean CV score of the coarse sweep together with the
# n_estimators value that produced it (sweep position k maps to 10 * k + 1).
best_coarse_score = max(score_)
best_coarse_position = score_.index(best_coarse_score)
print(best_coarse_score, best_coarse_position * 10 + 1)
0.9631265664160402 71

n_estimators 学习曲线

# Learning curve: mean CV score plotted against the swept n_estimators values.
swept_n_estimators = range(1, 201, 10)
plt.figure(dpi=80, figsize=(20, 8))
plt.plot(swept_n_estimators, score_)
plt.show()

细化 n_estimators 取值

# Fine-grained sweep around the coarse optimum: evaluate every n_estimators
# value from 65 to 74 with 10-fold cross-validation.
est_range = list(range(65, 75))
est_refinement = [
    cross_val_score(
        RandomForestClassifier(n_estimators=n_trees, random_state=90),
        cancer.data,
        cancer.target,
        cv=10,
    ).mean()
    for n_trees in est_range
]
# Print the best mean score and the n_estimators value that achieved it.
best_refined_score = max(est_refinement)
print(best_refined_score, est_range[est_refinement.index(best_refined_score)])
0.9666353383458647 73

为网格搜索做准备,书写网格搜索的参数

"""
param_grid = {'n_estimators':np.arange(0, 200, 10)}
 
param_grid = {'max_depth':np.arange(1, 20, 1)}
    
param_grid = {'max_leaf_nodes':np.arange(25,50,1)}
    对于大型数据集,可以尝试从1000来构建,先输入1000,每100个叶子一个区间,再逐渐缩小范围
 
有一些参数是可以找到一个范围的,或者说我们知道他们的取值和随着他们的取值,模型的整体准确率会如何变化,这
样的参数我们就可以直接跑网格搜索
param_grid = {'criterion':['gini', 'entropy']}
 
param_grid = {'min_samples_split':np.arange(2, 2+20, 1)}
 
param_grid = {'min_samples_leaf':np.arange(1, 1+10, 1)}
    
param_grid = {'max_features':np.arange(5,30,1)} 
 
"""
"\nparam_grid = {'n_estimators':np.arange(0, 200, 10)}\n \nparam_grid = {'max_depth':np.arange(1, 20, 1)}\n    \nparam_grid = {'max_leaf_nodes':np.arange(25,50,1)}\n    对于大型数据集,可以尝试从1000来构建,先输入1000,每100个叶子一个区间,再逐渐缩小范围\n \n有一些参数是可以找到一个范围的,或者说我们知道他们的取值和随着他们的取值,模型的整体准确率会如何变化,这\n样的参数我们就可以直接跑网格搜索\nparam_grid = {'criterion':['gini', 'entropy']}\n \nparam_grid = {'min_samples_split':np.arange(2, 2+20, 1)}\n \nparam_grid = {'min_samples_leaf':np.arange(1, 1+10, 1)}\n    \nparam_grid = {'max_features':np.arange(5,30,1)} \n \n"

开始按照参数对模型整体准确率的影响程度进行调参,首先调整 max_depth

# Grid-search max_depth over 1..19 with n_estimators fixed at 73, scoring each
# candidate by 10-fold cross-validation.
rfc = RandomForestClassifier(random_state=90, n_estimators=73)
param_grid = {'max_depth': np.arange(1, 20)}
grid = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=10)
grid.fit(cancer.data, cancer.target)
GridSearchCV(cv=10,
         estimator=RandomForestClassifier(n_estimators=73, random_state=90),
         param_grid={'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
   18, 19])})
# Show the parameter setting that achieved the best cross-validated score
grid.best_params_
{'max_depth': 8}
# Show the mean cross-validated score of that best setting
grid.best_score_
0.9666353383458647

调整 max_features

# Grid-search max_features over 5..29 with n_estimators fixed at 73, scoring
# each candidate by 10-fold cross-validation.
rfc = RandomForestClassifier(random_state=90, n_estimators=73)
param_grid = {'max_features': np.arange(5, 30)}
grid = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=10)
grid.fit(cancer.data, cancer.target)
GridSearchCV(cv=10,
         estimator=RandomForestClassifier(n_estimators=73, random_state=90),
         param_grid={'max_features': array([ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
   22, 23, 24, 25, 26, 27, 28, 29])})
# Show the parameter setting that achieved the best cross-validated score
grid.best_params_
{'max_features': 24}
# Show the mean cross-validated score of that best setting
grid.best_score_
0.9666666666666668

调整 min_samples_leaf

# Grid-search min_samples_leaf over 1..10 with n_estimators fixed at 73,
# scoring each candidate by 10-fold cross-validation.
rfc = RandomForestClassifier(random_state=90, n_estimators=73)
param_grid = {'min_samples_leaf': np.arange(1, 11)}
grid = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=10)
grid.fit(cancer.data, cancer.target)
GridSearchCV(cv=10,
         estimator=RandomForestClassifier(n_estimators=73, random_state=90),
         param_grid={'min_samples_leaf': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])})
# Show the parameter setting that achieved the best cross-validated score
grid.best_params_
{'min_samples_leaf': 1}
# Show the mean cross-validated score of that best setting
grid.best_score_
0.9666353383458647

不懈努力,继续尝试 min_samples_split

# Grid-search min_samples_split over 2..21 with n_estimators fixed at 73,
# scoring each candidate by 10-fold cross-validation.
rfc = RandomForestClassifier(random_state=90, n_estimators=73)
param_grid = {'min_samples_split': np.arange(2, 22)}
grid = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=10)
grid.fit(cancer.data, cancer.target)
GridSearchCV(cv=10,
         estimator=RandomForestClassifier(n_estimators=73, random_state=90),
         param_grid={'min_samples_split': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
   19, 20, 21])})
# Show the parameter setting that achieved the best cross-validated score
grid.best_params_
{'min_samples_split': 2}
# Show the mean cross-validated score of that best setting
grid.best_score_
0.9666353383458647

最后尝试一下 criterion

# Compare the two split criteria with n_estimators fixed at 73, scoring each
# by 10-fold cross-validation.
rfc = RandomForestClassifier(random_state=90, n_estimators=73)
param_grid = {'criterion': ['gini', 'entropy']}
grid = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=10)
grid.fit(cancer.data, cancer.target)
GridSearchCV(cv=10,
         estimator=RandomForestClassifier(n_estimators=73, random_state=90),
         param_grid={'criterion': ['gini', 'entropy']})
# Show the parameter setting that achieved the best cross-validated score
grid.best_params_
{'criterion': 'gini'}
# Show the mean cross-validated score of that best setting
grid.best_score_
0.9666353383458647

最好模型参数

# Final model: n_estimators=73 from the fine sweep plus max_features=24, the
# only grid-searched parameter whose best score exceeded the 73-tree result.
tuned_params = {'n_estimators': 73, 'random_state': 90, 'max_features': 24}
rfc = RandomForestClassifier(**tuned_params)
tuned_fold_scores = cross_val_score(rfc, cancer.data, cancer.target, cv=10)
score = tuned_fold_scores.mean()
score
0.9666666666666668

最终提高准确率

# Gain of the tuned model's mean CV score over the untuned baseline (score_pre).
score - score_pre
0.0017857142857143904