Pandas字符串离散化处理

字符串离散化处理

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# Load the IMDB movie dataset from a CSV file in the working directory.
file_path = "./IMDB-Movie-Data.csv"
df = pd.read_csv(file_path)

# Split each comma-separated "Genre" string into a list of genre names,
# e.g. "Action,Adventure,Sci-Fi" -> ["Action", "Adventure", "Sci-Fi"].
# Kept so that later code can still use the per-movie lists.
temp_list = df["Genre"].str.split(",").tolist()  # [[...], [...], ...]

# One-hot encode the genres in a single vectorized call: one row per movie,
# one column per distinct genre, 1.0 where the movie has that genre and 0.0
# elsewhere. This replaces building an all-zero frame and filling it row by
# row with .loc. The columns come out sorted by genre name, so the layout no
# longer depends on the arbitrary iteration order of a set, and a missing
# Genre value produces an all-zero row instead of raising while flattening.
# The cast keeps the float dtype that the np.zeros-based frame had.
zero_df = df["Genre"].str.get_dummies(sep=",").astype(float)

# Distinct genre names, in the same order as the columns of zero_df.
genre_list = zero_df.columns.tolist()

# Number of movies per genre: the column totals of the one-hot table,
# ordered from the least to the most frequent genre.
genre_count = zero_df.sum(axis=0).sort_values()

# Genre names label the bars; the totals give the bar heights.
_x, _y = genre_count.index, genre_count.values

# Bar chart with one bar per genre at consecutive integer positions.
plt.figure(figsize=(20, 8), dpi=80)
_positions = range(len(_x))
plt.bar(_positions, _y, color="red", width=0.6)
plt.xticks(_positions, _x)
plt.ylabel("counts")
plt.xlabel("Movie Type")
plt.show()

结果：