二值化与分段

二值化与分段

sklearn.preprocessing.Binarizer

from sklearn.preprocessing import Binarizer
import pandas as pd

# Load the dataset from the local file "./data_full"; index_col=0 makes the
# first CSV column the row index instead of a regular feature column.
data = pd.read_csv("./data_full", index_col=0)
# Bare expression: shows the DataFrame when run as a notebook cell.
data

	Age	Survived	Sex_female	Sex_male	Embarked_C	Embarked_Q	Embarked_S
0	22.000000	0.0	0.0	1.0	0.0	0.0	1.0
1	38.000000	2.0	1.0	0.0	1.0	0.0	0.0
2	26.000000	2.0	1.0	0.0	0.0	0.0	1.0
3	35.000000	2.0	1.0	0.0	0.0	0.0	1.0
4	35.000000	0.0	0.0	1.0	0.0	0.0	1.0
...	...	...	...	...	...	...	...
884	25.000000	0.0	0.0	1.0	0.0	0.0	1.0
885	39.000000	0.0	1.0	0.0	0.0	0.0	1.0
886	27.000000	0.0	1.0	0.0	0.0	0.0	1.0
887	19.000000	2.0	0.0	1.0	1.0	0.0	0.0
888	29.699118	0.0	0.0	1.0	0.0	1.0	0.0

887 rows × 7 columns

# Binarize the first column (Age in the table shown above) on a copy, so the
# original `data` frame stays available for the later examples.
data1 = data.copy()
# scikit-learn transformers expect a 2-D (n_samples, n_features) array, so the
# single column is reshaped into a column vector.
age = data1.iloc[:, 0].values.reshape(-1, 1)
# Binarizer maps values strictly greater than the threshold to 1 and all
# other values (<= threshold) to 0.
result = Binarizer(threshold=30).fit_transform(age)
# Flatten the (n, 1) result before writing it back: the target is a single
# 1-D column, and assigning a 2-D array there relies on pandas implicitly
# squeezing the extra axis.
data1.iloc[:, 0] = result.ravel()

data1

	Age	Survived	Sex_female	Sex_male	Embarked_C	Embarked_Q	Embarked_S
0	0.0	0.0	0.0	1.0	0.0	0.0	1.0
1	1.0	2.0	1.0	0.0	1.0	0.0	0.0
2	0.0	2.0	1.0	0.0	0.0	0.0	1.0
3	1.0	2.0	1.0	0.0	0.0	0.0	1.0
4	1.0	0.0	0.0	1.0	0.0	0.0	1.0
...	...	...	...	...	...	...	...
884	0.0	0.0	0.0	1.0	0.0	0.0	1.0
885	1.0	0.0	1.0	0.0	0.0	0.0	1.0
886	0.0	0.0	1.0	0.0	0.0	0.0	1.0
887	0.0	2.0	0.0	1.0	1.0	0.0	0.0
888	0.0	0.0	0.0	1.0	0.0	1.0	0.0

887 rows × 7 columns

sklearn.preprocessing.KBinsDiscretizer

from sklearn.preprocessing import KBinsDiscretizer
# Discretize the first column into 5 equal-width bins ("uniform" strategy),
# encoding each sample as the ordinal index of its bin.
data2 = data.copy()
# Selecting with a list keeps the column 2-D, giving the (n_samples, 1)
# array that scikit-learn transformers expect.
age = data2.iloc[:, [0]].values
uniform_binner = KBinsDiscretizer(
    n_bins=5,
    encode="ordinal",
    strategy="uniform",
)
result2 = uniform_binner.fit_transform(age)

# Distinct values in the output, i.e. the bin indices that actually occur.
set(result2.ravel())

{0.0, 1.0, 2.0, 3.0, 4.0}

# Same feature, now with 5 quantile-based bins and one-hot encoded output
# (one indicator column per bin).
quantile_binner = KBinsDiscretizer(
    n_bins=5,
    encode="onehot",
    strategy="quantile",
)
result3 = quantile_binner.fit_transform(age)
# encode="onehot" produces a sparse matrix; convert it to a dense array for display.
result3.toarray()

array([[0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])