监督算法建模前数据质量检查

一、定义缺失值检测函数

def missing_values_table(df):
    """Summarize the missing values of every column in a DataFrame.

    Prints how many columns the frame has and how many of them contain
    missing values, then returns the per-column summary.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        DataFrame indexed by column name with the columns
        'Missing Values' (number of nulls) and '% of Total Values'
        (nulls as a percentage of the row count, rounded to one decimal).
        Only columns with at least one missing value are included, sorted
        by descending percentage.
    """
    # Local import: the file has no module-level import block, so a global
    # `pd` only exists when the script entry point has already run.
    import pandas as pd

    # Total number of missing values per column.
    missing_count = df.isnull().sum()
    # Share of missing values per column, in percent.
    missing_percent = 100 * missing_count / len(df)

    # Combine both measures into one table with descriptive column names.
    summary = pd.DataFrame({
        'Missing Values': missing_count,
        '% of Total Values': missing_percent,
    })

    # Keep only columns that really have missing values. Filtering on the
    # count (not the percentage) stays correct for a zero-row frame, where
    # the percentage is NaN.
    summary = (
        summary[summary['Missing Values'] != 0]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    print(
        f"Your selected dataframe has {df.shape[1]} columns.\n"
        f"There are {summary.shape[0]} columns that have missing values."
    )

    # The original built the table but never handed it back to the caller.
    return summary

二、绘制记录数和违约率的柱状图，以函数的形式呈现，方便后面使用（主要用来分析非数值型字段）

def plot_stats(df_data, target, feature, label_rotation=False, horizontal_layout=True):
    """Plot record counts and the mean target value per category of a field.

    Two bar charts are drawn: the number of records for each value of
    `feature`, and the mean of `target` for each value of `feature`
    (sorted descending, expressed in percent).

    Args:
        df_data: source DataFrame.
        target: name of the target column. The y-axis label assumes a 0/1
            indicator, so that its mean is the share of records equal to 1.
        feature: name of the column to analyse (mainly non-numeric fields).
        label_rotation: rotate the x tick labels by 45 degrees when true.
        horizontal_layout: place the two charts side by side when true,
            otherwise stack them vertically.
    """
    # Local imports: nothing in the file binds these names at module scope
    # before this function is called.
    import matplotlib.pyplot as plt
    import pandas as pd
    import seaborn as sns

    counts = df_data[feature].value_counts()
    count_table = pd.DataFrame({
        feature: counts.index,
        'Number of contracts': counts.values,
    })

    # Mean target per category, highest first.
    cat_perc = df_data[[feature, target]].groupby([feature], as_index=False).mean()
    cat_perc = cat_perc.sort_values(by=target, ascending=False)
    # The axis is labelled in percent, so scale the 0-1 mean accordingly.
    cat_perc[target] = 100 * cat_perc[target]

    if horizontal_layout:
        _, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 6))
    else:
        _, (ax1, ax2) = plt.subplots(nrows=2, figsize=(12, 14))

    sns.set_color_codes("pastel")

    count_ax = sns.barplot(ax=ax1, x=feature, y="Number of contracts", data=count_table)
    if label_rotation:
        count_ax.set_xticklabels(count_ax.get_xticklabels(), rotation=45)

    rate_ax = sns.barplot(ax=ax2, x=feature, y=target, order=cat_perc[feature], data=cat_perc)
    if label_rotation:
        rate_ax.set_xticklabels(rate_ax.get_xticklabels(), rotation=45)

    ax2.set_ylabel('Percent of target with value 1 [%]', fontsize=10)
    ax2.tick_params(axis='both', which='major', labelsize=10)

    plt.show()

三、绘制按目标变量取值分组的核密度分布图，以函数的形式呈现，方便后面使用（主要用来分析数值型字段）

def plot_distribution(df_data, var, target='left'):
    """Plot kernel-density curves of numeric fields, split by the target.

    For every field in `var` one panel is drawn with two density curves:
    rows where `target` is non-zero (labelled "<target> = 1") and rows where
    `target` equals zero (labelled "<target> = 0").

    Args:
        df_data: source DataFrame.
        var: iterable of numeric column names (a single name is accepted too).
        target: name of the target column; defaults to 'left', the column the
            original implementation hard-coded.
    """
    # Local imports: nothing in the file binds these names at module scope
    # before this function is called.
    import matplotlib.pyplot as plt
    import seaborn as sns

    features = [var] if isinstance(var, str) else list(var)
    if not features:
        # Nothing to draw; a grid with zero rows cannot be created.
        return

    t1 = df_data.loc[df_data[target] != 0]
    t0 = df_data.loc[df_data[target] == 0]

    sns.set_style('whitegrid')

    # One row per feature. The original opened an empty extra figure and a
    # fixed 2x2 grid, then drew into a conflicting (len(var), 1) layout.
    _, axes = plt.subplots(len(features), 1, figsize=(12, 12), squeeze=False)

    for ax, feature in zip(axes.ravel(), features):
        # NOTE(review): newer seaborn releases deprecate `bw` in favour of
        # `bw_method` / `bw_adjust` - confirm against the installed version.
        sns.kdeplot(t1[feature], bw=0.5, label=f"{target} = 1", ax=ax)
        sns.kdeplot(t0[feature], bw=0.5, label=f"{target} = 0", ax=ax)
        ax.set_ylabel('Density plot', fontsize=12)
        ax.set_xlabel(feature, fontsize=12)
        ax.tick_params(axis='both', which='major', labelsize=12)
        # Make the curve labels visible regardless of seaborn's own legend handling.
        ax.legend()

    plt.show()

四、条形图画法1-离散型变量：反映在某个自变量的取值范围下，目标变量发生的概率

def plot_explore2_char(data_df, X_col, Y_col):
    """Bar charts for a discrete variable and its relation to the target.

    Draws three panels in one figure:
      1. number of records for each value of `X_col`;
      2. number of records for each value of `X_col` among rows where
         `Y_col` equals 1;
      3. mean of `Y_col` (in percent) for each value of `X_col`.

    Example: plot_explore2_char(df, 'salary', 'left')

    Args:
        data_df: source DataFrame.
        X_col: name of the independent (categorical) column.
        Y_col: name of the target column; panel 2 assumes positives are
            encoded as 1.
    """
    # Local import: nothing in the file binds `plt` at module scope before
    # this function is called.
    import matplotlib.pyplot as plt

    plt.figure(figsize=(14, 14), dpi=100)

    # Panel 1: overall distribution of X_col.
    plt.subplot(2, 2, 1)
    data_df[X_col].value_counts().plot(kind='bar')
    plt.xticks(rotation=75)
    plt.xlabel(X_col + ' name ')
    plt.ylabel('Amount of  employee number')
    plt.title('emp Group')

    # Panel 2: distribution of X_col among rows whose target equals 1.
    # The original accepted Y_col but never used it.
    positives = data_df.loc[data_df[Y_col] == 1, X_col].value_counts()
    plt.subplot(2, 2, 2)
    if not positives.empty:
        # An empty Series cannot be bar-plotted; leave the panel blank then.
        positives.plot(kind='bar')
    plt.xticks(rotation=75)
    plt.xlabel(X_col + ' name')
    plt.ylabel(Y_col + ' number')
    plt.title(Y_col + ' Group')

    # Panel 3: mean target per category. Only Y_col is aggregated so that
    # non-numeric columns elsewhere in the frame cannot break the mean.
    rate = data_df.groupby(X_col)[Y_col].mean()
    plt.subplot(2, 2, 3)
    plt.bar(rate.index.astype(str), 100 * rate)
    plt.xticks(rotation=45)
    plt.xlabel(X_col)
    plt.ylabel(Y_col + ' probability(%)')
    plt.title(Y_col + ' probability to ' + X_col)

    plt.show()

五、条形图画法2-连续型变量：反映在某个自变量的取值范围下，目标变量发生的概率

def plot_explore2_num(data_df, X_col, Y_col, bins=10):
    """Bar chart of the mean target value per bin of a continuous variable.

    `X_col` is cut into intervals and the mean of `Y_col` inside each
    interval is plotted in percent.

    Example: plot_explore2_num(df, 'average_monthly_hours', 'left')

    Args:
        data_df: source DataFrame; it is not modified.
        X_col: name of the independent (continuous) column.
        Y_col: name of the target column.
        bins: forwarded to `pandas.cut` - either the number of equal-width
            bins over the observed range of `X_col` (default 10) or an
            explicit sequence of bin edges. The original hard-coded the edges
            `np.linspace(96, 310, num=11)`, which fits only one column.
    """
    # Local imports, as in the original implementation.
    import matplotlib.pyplot as plt
    import pandas as pd
    import seaborn as sns

    # Bin the variable without copying the frame. include_lowest keeps rows
    # that sit exactly on the first edge when explicit edges are given.
    binned = pd.cut(data_df[X_col], bins=bins, include_lowest=True)

    # Aggregate only the target so non-numeric columns cannot break the mean.
    # observed=False keeps empty intervals on the axis.
    rate = data_df[Y_col].groupby(binned, observed=False).mean()

    # Draw the bar chart.
    sns.set_style('whitegrid')
    plt.figure()
    plt.bar(rate.index.astype(str), 100 * rate)
    plt.xticks(rotation=45)
    plt.xlabel(X_col + '_abandon')
    plt.ylabel(Y_col + '_probability(%)')
    # The original title ran the words together (missing spaces).
    plt.title(Y_col + ' probability to ' + X_col + ' abandon')

    plt.show()

六、主函数调用

if __name__ == "__main__":
    # Script entry point: load the HR data set, report missing values and
    # draw the exploratory plots.
    #
    # `plt` and `sns` are bound here at module scope so that plotting helpers
    # which look them up as globals can resolve them.
    import matplotlib.pyplot as plt
    import pandas as pd
    import seaborn as sns

    # Raw string: the Windows path contains backslashes that are not valid
    # escape sequences in a normal string literal.
    df = pd.read_csv(r'D:\PycharmProjects\lessonOnLine\data\HR2.csv')

    missing_values = missing_values_table(df)

    # The ad-hoc plotting code that used to be commented out here (department,
    # salary and average_monthly_hours bar charts) duplicated the helper
    # functions called below and has been removed.

    # Function calls.
    # plot_stats(df, 'left', 'department', label_rotation=True, horizontal_layout=True)  # left disabled
    plot_distribution(df, ['number_project', 'average_monthly_hours', 'time_spend_company'])
    plot_explore2_char(df, 'salary', 'left')
    plot_explore2_num(df, 'average_monthly_hours', 'left')

监督算法建模前数据质量检查

相关文章

Matlab三维空间任意位置绘制二维强度图

BOOT和UBOOT区别与联系

一个开源跨平台嵌入式USB设备协议：TinyUSB

【unity】【C#】游戏音乐播放和发布

R 格式（蓝桥杯）

Python 字符串 Base64

瑞芯微RK3328（ROC-RK3328-PC）buildroot 开发QT的hello world

【机器学习】机器学习学习笔记 - 监督学习 - 逻辑回归分类朴素贝叶斯分类支持向量机 SVM (可分类、可回归) - 04