一、定义缺失值检测函数
def missing_values_table(df):
    """Summarize the columns of ``df`` that contain missing values.

    Prints a short overview (total number of columns and how many of them
    have missing values) and returns the per-column summary.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        A DataFrame indexed by column name with two columns,
        ``'Missing Values'`` (null count) and ``'% of Total Values'`` (null
        share in percent, rounded to one decimal). Only columns with at least
        one missing value are listed, sorted by share in descending order.
    """
    # Count the nulls once and reuse the result for the percentage.
    missing_count = df.isnull().sum()
    missing_percent = 100 * missing_count / len(df)

    summary = pd.DataFrame(
        {"Missing Values": missing_count, "% of Total Values": missing_percent}
    )

    # Filter on the raw count rather than the percentage: for a frame with
    # zero rows the percentage is NaN, which would compare as "!= 0".
    summary = (
        summary[summary["Missing Values"] != 0]
        .sort_values("% of Total Values", ascending=False)
        .round(1)
    )

    print(
        f"Your selected dataframe has {df.shape[1]} columns.\n"
        f"There are {summary.shape[0]} columns that have missing values."
    )

    # The caller assigns the result, so the table has to be returned.
    return summary
二、绘制记录数和违约率的柱状图,以函数的形式呈现,方便后面使用(主要用来分析非数值型字段)
def plot_stats(df_data, target, feature, label_rotation=False, horizontal_layout=True):
    """Plot record counts and the mean target rate for each value of a feature.

    Draws two bar charts: the number of records per category of ``feature``,
    and the mean of ``target`` per category expressed in percent, ordered
    from the highest rate to the lowest.

    Args:
        df_data: source DataFrame.
        target: name of the target column (its mean per category is plotted).
        feature: name of the column to analyse.
        label_rotation: rotate the x tick labels by 45 degrees when True.
        horizontal_layout: place the two charts side by side when True,
            otherwise stack them vertically.
    """
    counts = df_data[feature].value_counts()
    count_df = pd.DataFrame(
        {feature: counts.index, 'Number of contracts': counts.values}
    )

    rate_df = (
        df_data[[feature, target]]
        .groupby([feature], as_index=False)
        .mean()
        .sort_values(by=target, ascending=False)
    )
    # The y axis is labelled in percent, so scale the 0-1 mean accordingly.
    rate_df[target] = 100 * rate_df[target]

    if horizontal_layout:
        fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 6))
    else:
        fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(12, 14))

    sns.set_color_codes("pastel")
    sns.barplot(ax=ax1, x=feature, y="Number of contracts", data=count_df)
    sns.barplot(ax=ax2, x=feature, y=target, order=rate_df[feature], data=rate_df)

    if label_rotation:
        # tick_params rotates the labels without re-assigning label text.
        for ax in (ax1, ax2):
            ax.tick_params(axis='x', labelrotation=45)

    # Address the rate chart explicitly instead of relying on pyplot's
    # notion of the "current" axes.
    ax2.set_ylabel('Percent of target with value 1 [%]', fontsize=10)
    ax2.tick_params(axis='both', which='major', labelsize=10)
    plt.show()
三、绘制目标变量不同取值下各字段的核密度分布图,以函数的形式呈现,方便后面使用(主要用来分析数值型字段)
def plot_distribution(df_data, var, target='left'):
    """Plot per-feature density curves split by the value of the target.

    For every feature in ``var`` one subplot is drawn with two kernel density
    estimates: rows where ``target`` is non-zero and rows where it is zero.

    Args:
        df_data: source DataFrame.
        var: iterable of numeric column names to plot.
        target: name of the target column; defaults to ``'left'``.
    """
    features = list(var)
    if not features:
        # Nothing to plot; a grid with zero rows cannot be created.
        return

    nonzero_rows = df_data.loc[df_data[target] != 0]
    zero_rows = df_data.loc[df_data[target] == 0]

    sns.set_style('whitegrid')
    # One row per feature, so the grid always matches the number of plots
    # (squeeze=False keeps ``axes`` two-dimensional for a single feature).
    fig, axes = plt.subplots(len(features), 1, figsize=(12, 12), squeeze=False)

    for ax, feature in zip(axes[:, 0], features):
        # NOTE(review): ``bw`` is deprecated in recent seaborn releases in
        # favour of ``bw_method``; kept for compatibility with the seaborn
        # version this file was written against - confirm before upgrading.
        sns.kdeplot(nonzero_rows[feature], bw=0.5, label=f"{target} = 1", ax=ax)
        sns.kdeplot(zero_rows[feature], bw=0.5, label=f"{target} = 0", ax=ax)
        ax.set_ylabel('Density plot', fontsize=12)
        ax.set_xlabel(feature, fontsize=12)
        ax.tick_params(axis='both', which='major', labelsize=12)
        # Draw the legend explicitly so the two curves stay identifiable.
        ax.legend()

    plt.show()
四、条形图画法1-离散型变量:反映在某个自变量的取值范围下,目标变量发生的概率
def plot_explore2_char(data_df, X_col, Y_col):
    """Bar charts relating a categorical predictor to a binary target.

    Draws three panels in one figure:
      1. the number of records per category of ``X_col``;
      2. the number of records per category among rows where ``Y_col`` is 1;
      3. the mean of ``Y_col`` per category, in percent.

    Args:
        data_df: source DataFrame.
        X_col: name of the categorical predictor column.
        Y_col: name of the target column (compared against 1 and averaged).
    """
    plt.figure(figsize=(14, 14), dpi=100)

    # Panel 1: overall distribution of the predictor.
    plt.subplot(2, 2, 1)
    data_df[X_col].value_counts().plot(kind='bar')
    plt.xticks(rotation=75)
    plt.xlabel(X_col + ' name ')
    plt.ylabel('Amount of employee number')
    plt.title('emp Group')

    # Panel 2: distribution of the predictor where the target equals 1.
    plt.subplot(2, 2, 2)
    positive_counts = data_df.loc[data_df[Y_col] == 1, X_col].value_counts()
    if not positive_counts.empty:
        # pandas cannot bar-plot an empty Series, so skip when no row matches.
        positive_counts.plot(kind='bar')
    plt.xticks(rotation=75)
    plt.xlabel(X_col + ' name')
    plt.ylabel(Y_col + ' number')
    plt.title(Y_col + ' Group')

    # Panel 3: target rate per category. Only the target column is averaged
    # so that non-numeric columns in the frame do not break the aggregation.
    target_rate = data_df.groupby(X_col)[Y_col].mean()
    plt.subplot(2, 2, 3)
    plt.bar(target_rate.index.astype(str), 100 * target_rate)
    plt.xticks(rotation=45)
    plt.xlabel(X_col)
    plt.ylabel(Y_col + ' probability(%)')
    plt.title(Y_col + ' probability to ' + X_col)

    plt.show()
五、条形图画法2-连续型变量:反映在某个自变量的取值范围下,目标变量发生的概率
def plot_explore2_num(data_df, X_col, Y_col, bins=10):
    """Bar chart of the target rate across value ranges of a numeric predictor.

    The values of ``X_col`` are cut into intervals and the mean of ``Y_col``
    within each interval is plotted in percent.

    Args:
        data_df: source DataFrame; it is not modified.
        X_col: name of the numeric predictor column.
        Y_col: name of the target column.
        bins: passed to ``pandas.cut`` - either the number of equal-width
            bins spanning the observed range of ``X_col`` (default 10) or an
            explicit sequence of bin edges.
    """
    import matplotlib.pyplot as plt
    import pandas as pd
    import seaborn as sns

    # Bin on the fly instead of deep-copying the frame to add a helper
    # column; integer ``bins`` derive the edges from the data, so the
    # function works for any numeric column.
    intervals = pd.cut(data_df[X_col], bins=bins)

    # Average only the target column (other columns may be non-numeric) and
    # keep empty intervals so the x axis shows the full range.
    target_rate = data_df[Y_col].groupby(intervals, observed=False).mean()

    sns.set_style('whitegrid')
    plt.figure()
    plt.bar(target_rate.index.astype(str), 100 * target_rate)
    plt.xticks(rotation=45)
    plt.xlabel(X_col + '_abandon')
    plt.ylabel(Y_col + '_probability(%)')
    plt.title(Y_col + 'probability to ' + X_col + 'abandon')
    plt.show()
六、主函数调用
# Imported at module level (not inside the __main__ guard) so the plotting
# helpers in this file find pd / plt / sns / np when they are called.
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

if __name__ == "__main__":
    # Raw string: the Windows path contains backslashes that must not be
    # interpreted as escape sequences.
    df = pd.read_csv(r'D:\PycharmProjects\lessonOnLine\data\HR2.csv')

    missing_values = missing_values_table(df)
    # missing_values.head(20)

    # df.columns.tolist()
    # df['EMPID'] = df.index.tolist()
    # bureau_agg = df.groupby('EMPID', as_index = False).agg(['count', 'mean', 'max', 'min', 'sum']).reset_index()
    # bureau_agg_department = df.groupby('department', as_index = False).agg(['count', 'mean', 'max', 'min', 'sum']).reset_index()

    # ## Bar chart style 1: categorical variables
    # ## Raw distribution of department
    # plt.figure(figsize=(14,14),dpi=100)
    # plt.subplot(2,2,1)
    # df['department'].value_counts().plot(kind='bar')
    # plt.xticks(rotation = 75); plt.xlabel('department name '); plt.ylabel('Amount of employee number')
    # plt.title('emp Group')
    # ## Distribution of department where the target equals 1
    # plt.subplot(2,2,2)
    # df[df['left'] == 1]['department'].value_counts().plot(kind='bar')
    # plt.xticks(rotation = 75); plt.xlabel('department name'); plt.ylabel('left number')
    # plt.title('left Group')
    # ## Attrition probability per department
    # department_groups = df.groupby('department').mean()
    # plt.subplot(2,2,3)
    # plt.bar(department_groups.index.astype(str), 100 * department_groups['left'])
    # plt.xticks(rotation = 45); plt.xlabel('department'); plt.ylabel('left probability(%)')
    # plt.title('left probability to salary');
    #
    # ## Raw distribution of salary
    # plt.figure(figsize=(14,14),dpi=100)
    # plt.subplot(2,2,1)
    # df['salary'].value_counts().plot(kind='bar')
    # plt.xticks(rotation = 75); plt.xlabel('salary degree '); plt.ylabel('Amount of employee number')
    # plt.title('emp Group')
    # ## Distribution of salary where the target equals 1
    # plt.subplot(2,2,2)
    # df[df['left'] == 1]['salary'].value_counts().plot(kind='bar')
    # plt.xticks(rotation = 75); plt.xlabel('salary degree'); plt.ylabel('left number')
    # plt.title('left Group')
    # ## Attrition probability for each salary level
    # salary_groups = df.groupby('salary').mean()
    # plt.subplot(2,2,3)
    # plt.bar(salary_groups.index.astype(str), 100 * salary_groups['left'])
    # plt.xticks(rotation = 45); plt.xlabel('salary'); plt.ylabel('left probability(%)')
    # plt.title('left probability to salary');
    #
    # ## Bar chart style 2: continuous variables
    # ## Shows the probability of the target within each value range of a predictor
    # df['average_monthly_hours2'] = pd.cut(df['average_monthly_hours'], bins = np.linspace(96, 310, num = 11))
    # age_groups = df.groupby('average_monthly_hours2').mean()
    # #plt.figure(figsize = (8, 8))
    #
    # # Draw the bar chart
    # plt.bar(age_groups.index.astype(str), 100 * age_groups['left'])
    # plt.xticks(rotation = 45); plt.xlabel('average_monthly_hours abandon'); plt.ylabel('left probability(%)')
    # plt.title('left probability to average_monthly_hours abandon');
    # ## df[df['department'] == 'sale']['left'].value_counts()

    # ## Function calls
    # plot_stats(df, 'left', 'department', label_rotation=True, horizontal_layout=True)
    plot_distribution(df, ['number_project', 'average_monthly_hours', 'time_spend_company'])
    plot_explore2_char(df, 'salary', 'left')
    plot_explore2_num(df, 'average_monthly_hours', 'left')

    # Display any figure that has not been shown yet; this returns
    # immediately when no figure is pending.
    plt.show()