Scorecard Case Study

Data Preprocessing

%matplotlib inline
# Import libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression as LR
# Load the data
data = pd.read_csv("./rankingcard.csv", index_col=0)
data.head()
SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines age NumberOfTime30-59DaysPastDueNotWorse DebtRatio MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate NumberRealEstateLoansOrLines NumberOfTime60-89DaysPastDueNotWorse NumberOfDependents
1 1 0.766127 45 2 0.802982 9120.0 13 0 6 0 2.0
2 0 0.957151 40 0 0.121876 2600.0 4 0 0 0 1.0
3 0 0.658180 38 1 0.085113 3042.0 2 1 0 0 0.0
4 0 0.233810 30 0 0.036050 3300.0 5 0 0 0 0.0
5 0 0.907239 49 1 0.024926 63588.0 7 0 1 0 0.0
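
(The schema matches the Kaggle "Give Me Some Credit" dataset: SeriousDlqin2yrs is the binary label marking a serious delinquency within two years, and the remaining ten columns are the features.)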
# Drop duplicate rows
data.drop_duplicates(inplace=True)
# Check the fraction of missing values in each column
data.isnull().sum() / data.shape[0]
SeriousDlqin2yrs                        0.000000
RevolvingUtilizationOfUnsecuredLines    0.000000
age                                     0.000000
NumberOfTime30-59DaysPastDueNotWorse    0.000000
DebtRatio                               0.000000
MonthlyIncome                           0.195601
NumberOfOpenCreditLinesAndLoans         0.000000
NumberOfTimes90DaysLate                 0.000000
NumberRealEstateLoansOrLines            0.000000
NumberOfTime60-89DaysPastDueNotWorse    0.000000
NumberOfDependents                      0.025624
dtype: float64
# Drop the rows where NumberOfDependents is missing
data.drop(index=data[data["NumberOfDependents"].isna()].index, inplace=True)
# Rebuild the index
data.index = range(data.shape[0])
data
type(data)
pandas.core.frame.DataFrame
# Wrap random-forest imputation in a function
def fill_miss_rfr(x, y, to_fill: str, n_estimators: int) -> np.ndarray:
    """Impute one column using a random forest regressor.

    Args:
        x (DataFrame): feature matrix containing the column to impute
        y (Series): complete label column with no missing values
        to_fill (str): name of the column to impute
        n_estimators (int): number of trees in the random forest

    Returns:
        np.ndarray: predicted values for the rows where to_fill is missing
    """

    # Build a new feature matrix: all other features plus the label
    df = x.copy()
    fill = df.loc[:, to_fill]
    df = pd.concat([df.loc[:, df.columns != to_fill], pd.DataFrame(y)], axis=1)

    # "Training" rows have the value; "test" rows are the ones to impute
    y_train = fill[fill.notna()]
    y_test = fill[fill.isna()]
    x_train = df.iloc[y_train.index, :]
    x_test = df.iloc[y_test.index, :]

    # Fit a random forest regressor and predict the missing values
    from sklearn.ensemble import RandomForestRegressor as RFR

    rfr = RFR(n_estimators=n_estimators)
    rfr.fit(x_train, y_train)
    y_predict = rfr.predict(x_test)

    return y_predict
# Prepare the arguments (the features are columns 1 onward, the label is column 0)
x = data.iloc[:, 1:]
y = data.iloc[:, 0]
# Predict the missing MonthlyIncome values
y_predict = fill_miss_rfr(x, y, "MonthlyIncome", 100)
# Fill in the missing MonthlyIncome values
data.loc[data.loc[:, "MonthlyIncome"].isna(), "MonthlyIncome"] = y_predict
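
As a quick sanity check (a small addition, not part of the original notebook), confirm that no missing values remain after the row drops and the imputation:

# The drops and the imputation should leave no NaNs anywhere
assert data["MonthlyIncome"].isna().sum() == 0
assert data.isnull().sum().sum() == 0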

Handling Outliers with Descriptive Statistics

data.describe([0.25, 0.5, 0.75, 0.90]).T
count mean std min 25% 50% 75% 90% max
SeriousDlqin2yrs 145563.0 0.067538 0.250952 0.0 0.000000 0.000000 0.000000 0.000000 1.0
RevolvingUtilizationOfUnsecuredLines 145563.0 5.941378 250.510835 0.0 0.031218 0.158817 0.561085 0.976557 50708.0
age 145563.0 52.110701 14.567652 0.0 41.000000 52.000000 62.000000 72.000000 107.0
NumberOfTime30-59DaysPastDueNotWorse 145563.0 0.389185 3.756944 0.0 0.000000 0.000000 0.000000 1.000000 98.0
DebtRatio 145563.0 334.548251 1947.228209 0.0 0.173934 0.359090 0.770641 1150.000000 329664.0
MonthlyIncome 145563.0 5552.682762 13384.682061 0.0 2000.000000 4500.000000 7500.000000 10841.800000 3008750.0
NumberOfOpenCreditLinesAndLoans 145563.0 8.553788 5.141132 0.0 5.000000 8.000000 11.000000 15.000000 58.0
NumberOfTimes90DaysLate 145563.0 0.231309 3.728803 0.0 0.000000 0.000000 0.000000 0.000000 98.0
NumberRealEstateLoansOrLines 145563.0 1.033346 1.133115 0.0 0.000000 1.000000 2.000000 2.000000 54.0
NumberOfTime60-89DaysPastDueNotWorse 145563.0 0.205622 3.712455 0.0 0.000000 0.000000 0.000000 0.000000 98.0
NumberOfDependents 145563.0 0.759863 1.116141 0.0 0.000000 0.000000 1.000000 2.000000 20.0
# One record has age 0, which must be an entry error; remove it
(data["age"] == 0).sum()
data = data[data["age"] != 0]
# The describe() table shows delinquency counts jumping to 96/98, which look
# like sentinel codes rather than real counts; keep only rows below 90
data = data[data.loc[:, "NumberOfTimes90DaysLate"] < 90]
# Reset the index
data.index = range(data.shape[0])
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145354 entries, 0 to 145353
Data columns (total 11 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   SeriousDlqin2yrs                      145354 non-null  int64  
 1   RevolvingUtilizationOfUnsecuredLines  145354 non-null  float64
 2   age                                   145354 non-null  int64  
 3   NumberOfTime30-59DaysPastDueNotWorse  145354 non-null  int64  
 4   DebtRatio                             145354 non-null  float64
 5   MonthlyIncome                         145354 non-null  float64
 6   NumberOfOpenCreditLinesAndLoans       145354 non-null  int64  
 7   NumberOfTimes90DaysLate               145354 non-null  int64  
 8   NumberRealEstateLoansOrLines          145354 non-null  int64  
 9   NumberOfTime60-89DaysPastDueNotWorse  145354 non-null  int64  
 10  NumberOfDependents                    145354 non-null  float64
dtypes: float64(4), int64(7)
memory usage: 12.2 MB

Checking for Class Imbalance

x = data.iloc[:, 1:]
y = data.iloc[:, 0]
sample_n = y.value_counts().sum()
sample_0 = y.value_counts()[0]
sample_1 = y.value_counts()[1]
print(
    "样本总数: {}, 样本0占比:{:.2%}, 样本1占比:{:.2%}".format(
        sample_n, sample_0 / sample_n, sample_1 / sample_n
    )
)
Total samples: 145354, class 0 share: 93.32%, class 1 share: 6.68%
# Raise the share of class 1 with SMOTE, which synthesizes new minority
# samples by interpolating between a minority point and its nearest neighbors
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)
x, y = sm.fit_resample(x, y)

sample_n = y.value_counts().sum()
sample_0 = y.value_counts()[0]
sample_1 = y.value_counts()[1]
print(
    "样本总数: {}, 样本0占比:{:.2%}, 样本1占比:{:.2%}".format(
        sample_n, sample_0 / sample_n, sample_1 / sample_n
    )
)
Total samples: 271296, class 0 share: 50.00%, class 1 share: 50.00%
data
SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines age NumberOfTime30-59DaysPastDueNotWorse DebtRatio MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate NumberRealEstateLoansOrLines NumberOfTime60-89DaysPastDueNotWorse NumberOfDependents
0 1 0.766127 45 2 0.802982 9120.0 13 0 6 0 2.0
1 0 0.957151 40 0 0.121876 2600.0 4 0 0 0 1.0
2 0 0.658180 38 1 0.085113 3042.0 2 1 0 0 0.0
3 0 0.233810 30 0 0.036050 3300.0 5 0 0 0 0.0
4 0 0.907239 49 1 0.024926 63588.0 7 0 1 0 0.0
... ... ... ... ... ... ... ... ... ... ... ...
145349 0 0.040674 74 0 0.225131 2100.0 4 0 1 0 0.0
145350 0 0.299745 44 0 0.716562 5584.0 4 0 1 0 2.0
145351 0 0.246044 58 0 3870.000000 0.0 18 0 1 0 0.0
145352 0 0.000000 30 0 0.000000 5716.0 4 0 0 0 0.0
145353 0 0.850283 64 0 0.249908 8158.0 8 0 2 0 0.0

145354 rows × 11 columns

Splitting into Training and Validation Sets, and Preparing for Binning

from sklearn.model_selection import train_test_split as tts

x_train, x_vali, y_train, y_vali = tts(x, y, test_size=0.3, random_state=420)
model_data = pd.concat([y_train, x_train], axis=1)
model_data.index = range(model_data.shape[0])
model_data.columns = data.columns

model_vali = pd.concat([y_vali, x_vali], axis=1)
model_vali.index = range(model_vali.shape[0])
model_vali.columns = data.columns
model_data.to_csv("./model_data.csv")
model_vali.to_csv("./model_vali.csv")

Binning

Equal-Frequency Binning

# Equal-frequency binning of age into 20 initial bins
model_data["qcut"], updown = pd.qcut(model_data["age"], retbins=True, q=20)
count_0 = (
    model_data[model_data["SeriousDlqin2yrs"] == 0]
    .groupby(by=model_data["qcut"])
    .count()["SeriousDlqin2yrs"]
)

count_1 = (
    model_data[model_data["SeriousDlqin2yrs"] == 1]
    .groupby(by=model_data["qcut"])
    .count()["SeriousDlqin2yrs"]
)
# Each entry: (lower edge, upper edge, class-0 count, class-1 count)
num_bins = [*zip(updown, updown[1:], count_0, count_1)]
num_bins
[(21.0, 28.0, 4121, 7178),
 (28.0, 31.0, 3476, 5783),
 (31.0, 34.0, 3933, 6662),
 (34.0, 36.0, 2896, 4652),
 (36.0, 39.0, 5108, 7417),
 (39.0, 41.0, 3963, 5843),
 (41.0, 43.0, 3957, 5558),
 (43.0, 45.0, 4317, 5691),
 (45.0, 46.0, 2342, 3240),
 (46.0, 48.0, 4864, 6150),
 (48.0, 50.0, 4861, 6038),
 (50.0, 52.0, 4621, 5523),
 (52.0, 54.0, 4560, 4623),
 (54.0, 56.0, 4498, 4010),
 (56.0, 58.0, 4468, 3359),
 (58.0, 61.0, 6451, 4746),
 (61.0, 63.0, 4712, 2228),
 (63.0, 67.0, 6974, 2578),
 (67.0, 73.0, 6975, 2041),
 (73.0, 107.0, 7998, 1492)]

Defining the WOE and IV Functions
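
For bin $i$, writing $g_i$ and $b_i$ for its class-0 (good) and class-1 (bad) counts, the quantities computed below are

$$\mathrm{WOE}_i = \ln\!\left(\frac{g_i/\sum_j g_j}{b_i/\sum_j b_j}\right), \qquad \mathrm{IV} = \sum_i \left(\frac{g_i}{\sum_j g_j} - \frac{b_i}{\sum_j b_j}\right)\mathrm{WOE}_i,$$

so a feature's IV is large when its bins separate good borrowers from bad ones cleanly.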

# Compute WOE per bin
columns = ["min", "max", "count_0", "count_1"]
df = pd.DataFrame(num_bins, columns=columns)
df["total"] = df.count_0 + df.count_1
df["percentage"] = df.total / df.total.sum()
df["bad_rate"] = df.count_1 / df.total
df["good%"] = df.count_0 / df.count_0.sum()
df["bad%"] = df.count_1 / df.count_1.sum()
df["woe"] = np.log(df["good%"] / df["bad%"])
# Compute IV
rate = df["good%"] - df["bad%"]
iv = np.sum(rate * df["woe"])
# Wrap the WOE and IV computations in functions
def get_woe(num_bins: list) -> pd.DataFrame:
    """Compute per-bin WOE from a num_bins list.

    Args:
        num_bins (list): list of (min, max, count_0, count_1) tuples

    Returns:
        pd.DataFrame: one row per bin with counts, rates and WOE
    """
    columns = ["min", "max", "count_0", "count_1"]
    df = pd.DataFrame(num_bins, columns=columns)
    df["total"] = df.count_0 + df.count_1
    df["percentage"] = df.total / df.total.sum()
    df["bad_rate"] = df.count_1 / df.total
    df["good%"] = df.count_0 / df.count_0.sum()
    df["bad%"] = df.count_1 / df.count_1.sum()
    df["woe"] = np.log(df["good%"] / df["bad%"])
    return df


def get_iv(bins_df: pd.DataFrame) -> float:
    """Compute the IV of a feature from its WOE table.

    Args:
        bins_df (pd.DataFrame): output of get_woe

    Returns:
        float: information value
    """
    rate = bins_df["good%"] - bins_df["bad%"]
    iv = np.sum(rate * bins_df["woe"])
    return iv
get_iv(get_woe(num_bins))
0.3451906890667462
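By the commonly cited rule of thumb (IV below 0.02: useless; 0.02–0.1: weak; 0.1–0.3: medium; above 0.3: strong), an IV of roughly 0.35 marks age as a strongly predictive feature under this 20-bin split.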
num_bins_ = num_bins.copy()
[*zip(range(len(num_bins_)), num_bins_)]
[(0, (21.0, 28.0, 4121, 7178)),
 (1, (28.0, 31.0, 3476, 5783)),
 (2, (31.0, 34.0, 3933, 6662)),
 (3, (34.0, 36.0, 2896, 4652)),
 (4, (36.0, 39.0, 5108, 7417)),
 (5, (39.0, 41.0, 3963, 5843)),
 (6, (41.0, 43.0, 3957, 5558)),
 (7, (43.0, 45.0, 4317, 5691)),
 (8, (45.0, 46.0, 2342, 3240)),
 (9, (46.0, 48.0, 4864, 6150)),
 (10, (48.0, 50.0, 4861, 6038)),
 (11, (50.0, 52.0, 4621, 5523)),
 (12, (52.0, 54.0, 4560, 4623)),
 (13, (54.0, 56.0, 4498, 4010)),
 (14, (56.0, 58.0, 4468, 3359)),
 (15, (58.0, 61.0, 6451, 4746)),
 (16, (61.0, 63.0, 4712, 2228)),
 (17, (63.0, 67.0, 6974, 2578)),
 (18, (67.0, 73.0, 6975, 2041)),
 (19, (73.0, 107.0, 7998, 1492))]

Chi-Square Test, Bin Merging, and the IV Curve

# Chi-square test between every pair of adjacent bins
import scipy.stats as ss

pvs_list = []
for i in range(len(num_bins_) - 1):
    x1 = num_bins_[i][2:]
    x2 = num_bins_[i + 1][2:]
    pvs = ss.chi2_contingency([x1, x2])[1]
    pvs_list.append(pvs)
pvs_list
[0.11728724900429034,
 0.5508163222314948,
 0.09053121716062464,
 0.0007530919909110494,
 0.5874212014912026,
 0.10044784335748964,
 0.029701216027069655,
 0.15850408664347226,
 0.007103486255692831,
 0.5224541867703053,
 0.16902174569691014,
 1.2731738769229555e-08,
 2.1014725165904846e-05,
 6.866926043454617e-08,
 0.476802189877255,
 1.944497287524474e-43,
 1.0901328872969334e-12,
 8.002706776208699e-12,
 6.800451443744261e-33]
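Each entry above is the p-value of a chi-square independence test on the 2×2 table formed by the (count_0, count_1) pairs of two adjacent bins: a large p-value means the two bins have a similar good/bad mix and are candidates for merging. A minimal sketch of a single test, using the first two bins of num_bins:

# Rows are adjacent bins, columns are (count_0, count_1)
table = [[4121, 7178],
         [3476, 5783]]
p = ss.chi2_contingency(table)[1]
p  # about 0.117, matching the first entry of pvs_list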
# Merge the pair of adjacent bins with the largest p-value (most similar)
i = pvs_list.index(max(pvs_list))
num_bins_[i : i + 2] = [
    (
        num_bins_[i][0],
        num_bins_[i + 1][1],
        num_bins_[i][2] + num_bins_[i + 1][2],
        num_bins_[i][3] + num_bins_[i + 1][3],
    )
]
import matplotlib.pyplot as plt

IV = []
axisx = []
while len(num_bins_) > 2:
    pvs = []
    # p-value of the chi-square test for each pair of adjacent bins
    for i in range(len(num_bins_) - 1):
        x1 = num_bins_[i][2:]
        x2 = num_bins_[i + 1][2:]
        # index 0 of the result is the chi2 statistic, index 1 the p-value
        pv = ss.chi2_contingency([x1, x2])[1]
        pvs.append(pv)
    # Merge the two bins with the largest p-value
    i = pvs.index(max(pvs))
    num_bins_[i : i + 2] = [
        (
            num_bins_[i][0],
            num_bins_[i + 1][1],
            num_bins_[i][2] + num_bins_[i + 1][2],
            num_bins_[i][3] + num_bins_[i + 1][3],
        )
    ]
    bins_df = get_woe(num_bins_)
    axisx.append(len(num_bins_))
    IV.append(get_iv(bins_df))
plt.figure()
plt.plot(axisx, IV)
plt.xticks(axisx)
plt.xlabel("number of bins")
plt.ylabel("IV")
plt.show()
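
Read the curve like an elbow plot: IV inevitably falls as bins are merged, so keep the smallest bin count reached before the curve drops off sharply; for age this suggests about five bins, which is the value used below.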


# Wrap the chi-square merging in a binning function
def get_bin(num_bins_: list, n: int) -> list:
    """Merge adjacent bins via the chi-square test until n bins remain.

    Args:
        num_bins_ (list): list of (min, max, count_0, count_1) tuples,
            modified in place
        n (int): target number of bins

    Returns:
        list: the merged num_bins_ list of length n
    """
    while len(num_bins_) > n:
        pvs = []
        # p-value of the chi-square test for each pair of adjacent bins
        for i in range(len(num_bins_) - 1):
            x1 = num_bins_[i][2:]
            x2 = num_bins_[i + 1][2:]
            pv = ss.chi2_contingency([x1, x2])[1]
            pvs.append(pv)
        # Merge the two bins with the largest p-value
        i = pvs.index(max(pvs))
        num_bins_[i : i + 2] = [
            (
                num_bins_[i][0],
                num_bins_[i + 1][1],
                num_bins_[i][2] + num_bins_[i + 1][2],
                num_bins_[i][3] + num_bins_[i + 1][3],
            )
        ]
    return num_bins_
get_woe(get_bin(num_bins, 5))
min max count_0 count_1 total percentage bad_rate good% bad% woe
0 21.0 36.0 14426 24275 38701 0.203789 0.627245 0.151701 0.256033 -0.523395
1 36.0 52.0 34033 45460 79493 0.418589 0.571874 0.357884 0.479475 -0.292483
2 52.0 61.0 19977 16738 36715 0.193331 0.455890 0.210074 0.176539 0.173920
3 61.0 73.0 18661 6847 25508 0.134318 0.268426 0.196235 0.072217 0.999645
4 73.0 107.0 7998 1492 9490 0.049972 0.157218 0.084105 0.015736 1.676094
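Note that the list-slice assignment inside get_bin merges in place, so after this call num_bins itself holds the five merged bins as well.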
import scipy


def graphforbestbin(DF, X, Y, n=5, q=20, graph=True):
    """
    Automatic optimal binning, based on the chi-square test.
    Parameters:
    DF: input data
    X: name of the column to bin
    Y: name of the label column
    n: number of bins to keep
    q: number of initial bins
    graph: whether to plot the IV curve
    Intervals are open on the left and closed on the right: (].
    """
    DF = DF[[X, Y]].copy()
    DF["qcut"], bins = pd.qcut(DF[X], retbins=True, q=q, duplicates="drop")
    count_y0 = DF.loc[DF[Y] == 0].groupby(by="qcut").count()[Y]
    count_y1 = DF.loc[DF[Y] == 1].groupby(by="qcut").count()[Y]
    num_bins = [*zip(bins, bins[1:], count_y0, count_y1)]
    # Merge away any bin containing only one class, since its WOE is undefined
    for i in range(q):
        if 0 in num_bins[0][2:]:
            num_bins[0:2] = [
                (
                    num_bins[0][0],
                    num_bins[1][1],
                    num_bins[0][2] + num_bins[1][2],
                    num_bins[0][3] + num_bins[1][3],
                )
            ]
            continue
        for i in range(len(num_bins)):
            if 0 in num_bins[i][2:]:
                num_bins[i - 1 : i + 1] = [
                    (
                        num_bins[i - 1][0],
                        num_bins[i][1],
                        num_bins[i - 1][2] + num_bins[i][2],
                        num_bins[i - 1][3] + num_bins[i][3],
                    )
                ]
                break
        else:
            break

    def get_woe(num_bins):
        columns = ["min", "max", "count_0", "count_1"]
        df = pd.DataFrame(num_bins, columns=columns)
        df["total"] = df.count_0 + df.count_1
        df["percentage"] = df.total / df.total.sum()
        df["bad_rate"] = df.count_1 / df.total
        df["good%"] = df.count_0 / df.count_0.sum()
        df["bad%"] = df.count_1 / df.count_1.sum()
        df["woe"] = np.log(df["good%"] / df["bad%"])
        return df

    def get_iv(df):
        rate = df["good%"] - df["bad%"]
        iv = np.sum(rate * df.woe)
        return iv

    IV = []
    axisx = []
    # Initialize bins_df in case num_bins already has no more than n bins
    bins_df = pd.DataFrame(get_woe(num_bins))
    while len(num_bins) > n:
        pvs = []
        for i in range(len(num_bins) - 1):
            x1 = num_bins[i][2:]
            x2 = num_bins[i + 1][2:]
            pv = scipy.stats.chi2_contingency([x1, x2])[1]
            pvs.append(pv)
        i = pvs.index(max(pvs))
        num_bins[i : i + 2] = [
            (
                num_bins[i][0],
                num_bins[i + 1][1],
                num_bins[i][2] + num_bins[i + 1][2],
                num_bins[i][3] + num_bins[i + 1][3],
            )
        ]
        bins_df = pd.DataFrame(get_woe(num_bins))
        axisx.append(len(num_bins))
        IV.append(get_iv(bins_df))
    if graph:
        plt.figure()
        plt.plot(axisx, IV)
        plt.xticks(axisx)
        plt.xlabel("number of box")
        plt.ylabel("IV")
        plt.show()
    return bins_df
dfs = []
for i in model_data.columns[1:-1]:
    print(i)
    a = graphforbestbin(DF=model_data, X=i, Y="SeriousDlqin2yrs", graph=False)
    dfs.append(a)
RevolvingUtilizationOfUnsecuredLines
age
NumberOfTime30-59DaysPastDueNotWorse
DebtRatio
MonthlyIncome
NumberOfOpenCreditLinesAndLoans
NumberOfTimes90DaysLate
NumberRealEstateLoansOrLines
NumberOfTime60-89DaysPastDueNotWorse
NumberOfDependents
for i in dfs:
    print(i)
        min           max  count_0  count_1  total  percentage  bad_rate  \
0  0.000000      0.102453    42746     4731  47477    0.250001  0.099648   
1  0.102453      0.224921    13207     5784  18991    0.100002  0.304565   
2  0.224921      0.553524    17987    19994  37981    0.199998  0.526421   
3  0.553524      1.000000    19729    53108  72837    0.383540  0.729135   
4  1.000000  29110.000000     1426    11195  12621    0.066459  0.887014   

      good%      bad%       woe  
0  0.449508  0.049899  2.198159  
1  0.138882  0.061005  0.822671  
2  0.189148  0.210880 -0.108763  
3  0.207466  0.560140 -0.993218  
4  0.014996  0.118076 -2.063574  
    min    max  count_0  count_1  total  percentage  bad_rate     good%  \
0  21.0   36.0    14426    24275  38701    0.203789  0.627245  0.151701   
1  36.0   52.0    34033    45460  79493    0.418589  0.571874  0.357884   
2  52.0   61.0    19977    16738  36715    0.193331  0.455890  0.210074   
3  61.0   73.0    18661     6847  25508    0.134318  0.268426  0.196235   
4  73.0  107.0     7998     1492   9490    0.049972  0.157218  0.084105   

       bad%       woe  
0  0.256033 -0.523395  
1  0.479475 -0.292483  
2  0.176539  0.173920  
3  0.072217  0.999645  
4  0.015736  1.676094  
          min            max  count_0  count_1   total  percentage  bad_rate  \
0    0.000000       0.017546     7816     1680    9496    0.050003  0.176917   
1    0.017546       0.490044    52595    51853  104448    0.549996  0.496448   
2    0.490044       1.120358    14828    23153   37981    0.199998  0.609594   
3    1.120358     837.000000     8988    10006   18994    0.100017  0.526798   
4  837.000000  329664.000000    10868     8120   18988    0.099986  0.427639   

      good%      bad%       woe  
0  0.082191  0.017719  1.534399  
1  0.553079  0.546903  0.011228  
2  0.155928  0.244199 -0.448587  
3  0.094516  0.105535 -0.110275  
4  0.114286  0.085643  0.288512  
           min           max  count_0  count_1  total  percentage  bad_rate  \
0     0.000000  9.000000e-02     6585     3001   9586    0.050477  0.313061   
1     0.090000  1.419000e+03    13754    14643  28397    0.149531  0.515653   
2  1419.000000  4.667000e+03    27842    38638  66480    0.350066  0.581197   
3  4667.000000  6.238526e+03    13996    14476  28472    0.149926  0.508429   
4  6238.526439  3.008750e+06    32918    24054  56972    0.299999  0.422207   

      good%      bad%       woe  
0  0.069247  0.031652  0.782868  
1  0.144634  0.154442 -0.065613  
2  0.292781  0.407522 -0.330671  
3  0.147179  0.152681 -0.036701  
4  0.346159  0.253702  0.310738  
    min   max  count_0  count_1   total  percentage  bad_rate     good%  \
0   0.0   1.0     3104     7758   10862    0.057196  0.714233  0.032641   
1   1.0   3.0     9412    13478   22890    0.120533  0.588816  0.098975   
2   3.0   5.0    15485    16514   31999    0.168498  0.516079  0.162837   
3   5.0  17.0    61589    53679  115268    0.606971  0.465689  0.647658   
4  17.0  58.0     5505     3383    8888    0.046802  0.380626  0.057889   

       bad%       woe  
0  0.081825 -0.919013  
1  0.142155 -0.362054  
2  0.174176 -0.067317  
3  0.566163  0.134481  
4  0.035681  0.483913  
auto_col_bins = {
    "RevolvingUtilizationOfUnsecuredLines": 6,
    "age": 5,
    "DebtRatio": 4,
    "MonthlyIncome": 3,
    "NumberOfOpenCreditLinesAndLoans": 5,
}
# Features that cannot be binned automatically are binned by hand
hand_bins = {
    "NumberOfTime30-59DaysPastDueNotWorse": [0, 1, 2, 13],
    "NumberOfTimes90DaysLate": [0, 1, 2, 17],
    "NumberRealEstateLoansOrLines": [0, 1, 2, 4, 54],
    "NumberOfTime60-89DaysPastDueNotWorse": [0, 1, 2, 8],
    "NumberOfDependents": [0, 1, 2, 3],
}
# To guarantee full coverage, prepend -np.inf and replace the largest edge with np.inf
hand_bins = {k: [-np.inf, *v[:-1], np.inf] for k, v in hand_bins.items()}
bins_of_col = {}
# Generate the automatic binning edges (and, implicitly, the post-binning IV)
for col in auto_col_bins:
    bins_df = graphforbestbin(
        model_data,
        col,
        "SeriousDlqin2yrs",
        # auto_col_bins maps each feature to its number of bins
        n=auto_col_bins[col],
        q=20,
        graph=False,
    )
    bins_list = sorted(set(bins_df["min"]).union(bins_df["max"]))
    # Guarantee coverage: replace the extreme edges with -np.inf and np.inf
    bins_list[0], bins_list[-1] = -np.inf, np.inf
    bins_of_col[col] = bins_list
# Merge in the hand-binned features
bins_of_col.update(hand_bins)
bins_of_col
{'RevolvingUtilizationOfUnsecuredLines': [-inf,
  0.1024533425,
  0.2249209732,
  0.5535236390041033,
  0.9829105421203392,
  0.9999999,
  inf],
 'age': [-inf, 36.0, 52.0, 61.0, 73.0, inf],
 'DebtRatio': [-inf, 0.017546052905140292, 0.4900438860450557, 837.0, inf],
 'MonthlyIncome': [-inf, 0.09, 6238.526438865782, inf],
 'NumberOfOpenCreditLinesAndLoans': [-inf, 1.0, 3.0, 5.0, 17.0, inf],
 'NumberOfTime30-59DaysPastDueNotWorse': [-inf, 0, 1, 2, inf],
 'NumberOfTimes90DaysLate': [-inf, 0, 1, 2, inf],
 'NumberRealEstateLoansOrLines': [-inf, 0, 1, 2, 4, inf],
 'NumberOfTime60-89DaysPastDueNotWorse': [-inf, 0, 1, 2, inf],
 'NumberOfDependents': [-inf, 0, 1, 2, inf]}
data = model_data.copy()
data["cut"] = pd.cut(data["age"], bins_of_col["age"])
bins_df = data.groupby("cut")["SeriousDlqin2yrs"].value_counts().unstack()
bins_df
SeriousDlqin2yrs 0 1
cut
(-inf, 36.0] 14426 24275
(36.0, 52.0] 34033 45460
(52.0, 61.0] 19977 16738
(61.0, 73.0] 18661 6847
(73.0, inf] 7998 1492
bins_df[0]
cut
(-inf, 36.0]    14426
(36.0, 52.0]    34033
(52.0, 61.0]    19977
(61.0, 73.0]    18661
(73.0, inf]      7998
Name: 0, dtype: int64
bins_df["woe"] = np.log(
    (bins_df[0] / bins_df[0].sum()) / (bins_df[1] / bins_df[1].sum())
)
bins_df
SeriousDlqin2yrs 0 1 woe
cut
(-inf, 36.0] 14426 24275 -0.523395
(36.0, 52.0] 34033 45460 -0.292483
(52.0, 61.0] 19977 16738 0.173920
(61.0, 73.0] 18661 6847 0.999645
(73.0, inf] 7998 1492 1.676094
# Redefine get_woe to work from a raw column plus its bin edges
def get_woe(df, col, y, bins):
    df = df[[col, y]].copy()
    df["cut"] = pd.cut(df[col], bins)
    bins_df = df.groupby("cut")[y].value_counts().unstack()
    woe = bins_df["woe"] = np.log(
        (bins_df[0] / bins_df[0].sum()) / (bins_df[1] / bins_df[1].sum())
    )
    return woe


# Store the WOE of every feature in a dictionary
woeall = {}
for col in bins_of_col:
    woeall[col] = get_woe(model_data, col, "SeriousDlqin2yrs", bins_of_col[col])
woeall
{'RevolvingUtilizationOfUnsecuredLines': cut
 (-inf, 0.102]     2.198159
 (0.102, 0.225]    0.822671
 (0.225, 0.554]   -0.108763
 (0.554, 0.983]   -1.156168
 (0.983, 1.0]     -0.483896
 (1.0, inf]       -2.063574
 dtype: float64,
 'age': cut
 (-inf, 36.0]   -0.523395
 (36.0, 52.0]   -0.292483
 (52.0, 61.0]    0.173920
 (61.0, 73.0]    0.999645
 (73.0, inf]     1.676094
 dtype: float64,
 'DebtRatio': cut
 (-inf, 0.0175]    1.534399
 (0.0175, 0.49]    0.011228
 (0.49, 837.0]    -0.333937
 (837.0, inf]      0.288512
 dtype: float64,
 'MonthlyIncome': cut
 (-inf, 0.09]        0.782868
 (0.09, 6238.526]   -0.200869
 (6238.526, inf]     0.310738
 dtype: float64,
 'NumberOfOpenCreditLinesAndLoans': cut
 (-inf, 1.0]   -0.919013
 (1.0, 3.0]    -0.362054
 (3.0, 5.0]    -0.067317
 (5.0, 17.0]    0.134481
 (17.0, inf]    0.483913
 dtype: float64,
 'NumberOfTime30-59DaysPastDueNotWorse': cut
 (-inf, 0.0]    0.351043
 (0.0, 1.0]    -0.868353
 (1.0, 2.0]    -1.363561
 (2.0, inf]    -1.482353
 dtype: float64,
 'NumberOfTimes90DaysLate': cut
 (-inf, 0.0]    0.235054
 (0.0, 1.0]    -1.737703
 (1.0, 2.0]    -2.281241
 (2.0, inf]    -2.377136
 dtype: float64,
 'NumberRealEstateLoansOrLines': cut
 (-inf, 0.0]   -0.408144
 (0.0, 1.0]     0.195483
 (1.0, 2.0]     0.650412
 (2.0, 4.0]     0.372983
 (4.0, inf]    -0.297756
 dtype: float64,
 'NumberOfTime60-89DaysPastDueNotWorse': cut
 (-inf, 0.0]    0.124264
 (0.0, 1.0]    -1.375012
 (1.0, 2.0]    -1.807985
 (2.0, inf]    -1.792286
 dtype: float64,
 'NumberOfDependents': cut
 (-inf, 0.0]    0.636441
 (0.0, 1.0]    -0.586082
 (1.0, 2.0]    -0.510146
 (2.0, inf]    -0.451625
 dtype: float64}

Modeling and Model Validation

# To avoid overwriting the original data, create a new DataFrame with exactly the same index as model_data
model_woe = pd.DataFrame(index=model_data.index)
# Bin the original column, then map each bin to its WOE value
model_woe["age"] = pd.cut(model_data["age"], bins_of_col["age"]).map(woeall["age"])
model_woe
age
0 0.173920
1 0.999645
2 -0.292483
3 0.173920
4 -0.292483
... ...
189902 0.173920
189903 -0.292483
189904 -0.523395
189905 0.999645
189906 -0.523395

189907 rows × 1 columns

# The same operation for every feature:
for col in bins_of_col:
    model_woe[col] = pd.cut(model_data[col], bins_of_col[col]).map(woeall[col])
# Append the label to the data
model_woe["SeriousDlqin2yrs"] = model_data["SeriousDlqin2yrs"]
# Build the validation set the same way, but from model_vali, not model_data
vali_woe = pd.DataFrame(index=model_vali.index)
for col in bins_of_col:
    vali_woe[col] = pd.cut(model_vali[col], bins_of_col[col]).map(woeall[col])
vali_woe["SeriousDlqin2yrs"] = model_vali["SeriousDlqin2yrs"]
vali_X = vali_woe.iloc[:, :-1]
vali_y = vali_woe.iloc[:, -1]
x = model_woe.iloc[:, :-1]
y = model_woe.iloc[:, -1]
x.columns
Index(['age', 'RevolvingUtilizationOfUnsecuredLines', 'DebtRatio',
       'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans',
       'NumberOfTime30-59DaysPastDueNotWorse', 'NumberOfTimes90DaysLate',
       'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
       'NumberOfDependents'],
      dtype='object')
age = vali_X.pop("age")
age
0         0.173920
1         0.999645
2        -0.292483
3         0.173920
4        -0.292483
            ...   
189902    0.173920
189903   -0.292483
189904   -0.523395
189905    0.999645
189906   -0.523395
Name: age, Length: 189907, dtype: category
Categories (5, float64): [-0.523395 < -0.292483 < 0.173920 < 0.999645 < 1.676094]
vali_X.insert(0, column="age", value=age)
vali_X.columns
Index(['age', 'RevolvingUtilizationOfUnsecuredLines', 'DebtRatio',
       'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans',
       'NumberOfTime30-59DaysPastDueNotWorse', 'NumberOfTimes90DaysLate',
       'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
       'NumberOfDependents'],
      dtype='object')
from sklearn.linear_model import LogisticRegression as LR

lr = LR().fit(x, y)
lr.score(vali_X, vali_y)
0.7838626274966167
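lr.score reports plain accuracy on the validation set. The sweep below searches the regularization strength C for a better cross-validated score: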
from sklearn.model_selection import cross_val_score

c_1 = np.linspace(0.01, 1, 20)
c_2 = np.linspace(0.01, 0.2, 20)
score_list_1 = []
score_list_2 = []
for i in range(20):
    lr1 = LR(penalty="l2", solver="liblinear", C=c_1[i])
    lr1.fit(x, y)
    score_list_1.append(cross_val_score(lr1, vali_X, vali_y, cv=10).mean())
    lr2 = LR(penalty="l2", solver="liblinear", C=c_2[i])
    lr2.fit(x, y)
    score_list_2.append(cross_val_score(lr2, vali_X, vali_y, cv=10).mean())
plt.figure(figsize=[20, 5])
plt.plot(range(20), score_list_1, label="c_1")
plt.plot(range(20), score_list_2, label="c_2")
plt.legend()
plt.show()


import scikitplot as skplt

vali_proba_df = pd.DataFrame(lr.predict_proba(vali_X))
skplt.metrics.plot_roc(
    vali_y, vali_proba_df, plot_micro=False, figsize=(6, 6), plot_macro=False
)
<AxesSubplot: title={'center': 'ROC Curves'}, xlabel='False Positive Rate', ylabel='True Positive Rate'>


Building the Scorecard

B = 20 / np.log(2)
A = 600 + B * np.log(1 / 60)
B, A
(28.85390081777927, 481.8621880878296)
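
These constants follow from the standard score scaling $\text{Score} = A - B\ln(\text{odds})$, anchored here (as the code assumes) at a base score of 600 for odds of 1:60, with 20 points added whenever the odds halve (PDO = 20):

$$600 = A - B\ln\tfrac{1}{60}, \qquad 620 = A - B\ln\tfrac{1}{120} \;\;\Rightarrow\;\; B = \frac{20}{\ln 2}, \quad A = 600 + B\ln\tfrac{1}{60}.$$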
base_score = A - B * lr.intercept_[0]  # intercept_ is a length-1 array; take the scalar
base_score
score_age = woeall["age"] * (-B * lr.coef_[0][0])
score_age
cut
(-inf, 36.0]    -4.180160
(36.0, 52.0]    -2.335948
(52.0, 61.0]     1.389030
(61.0, 73.0]     7.983784
(73.0, inf]     13.386325
dtype: float64
file = "./ScoreData.csv"

# Write the base score first; then loop over the features, generating a
# score_age-like block of bins and scores for each one and appending it to the file
with open(file, "w") as fdata:
    fdata.write("base_score,{}\n".format(base_score))
for i, col in enumerate(x.columns):
    score = woeall[col] * (-B * lr.coef_[0][i])
    score.name = "Score"
    score.index.name = col
    score.to_csv(file, header=True, mode="a")
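
To score applicants end to end, each raw feature is binned, mapped to its WOE, and multiplied by its coefficient, exactly as score_age was built above. A minimal sketch (an addition; it assumes model_vali still holds the raw, pre-WOE features):

# Total score = base score + sum of per-feature partial scores
scores = pd.Series(base_score, index=model_vali.index, dtype=float)
for i, col in enumerate(x.columns):
    woe = pd.cut(model_vali[col], bins_of_col[col]).map(woeall[col])
    scores += woe.astype(float) * (-B * lr.coef_[0][i])
scores.head()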