1. 函数库导入
import pandas as pd
import numpy as np
from sklearn. metrics import mean_squared_error
import lightgbm as lgb
import xgboost as xgb
from sklearn. model_selection import train_test_split
from sklearn. preprocessing import OneHotEncoder
from sklearn. model_selection import KFold, RepeatedKFold
from scipy import sparse
# Remove pandas' column and row display caps (None = no limit) so that
# frames are rendered without truncation.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)
from datetime import datetime
2. 导入数据
def _load_happiness_csv(csv_path):
    """Read one of the happiness CSV files, decoding it as ISO-8859-1."""
    return pd.read_csv(csv_path, encoding='ISO-8859-1')


# Training frames: the "abbr" file and the "complete" file.
train_abbr = _load_happiness_csv("./data/happiness_train_abbr.csv")
train = _load_happiness_csv("./data/happiness_train_complete.csv")

# Test frames, same two variants.
test_abbr = _load_happiness_csv("./data/happiness_test_abbr.csv")
test = _load_happiness_csv("./data/happiness_test_complete.csv")

# Contents of happiness_submit.csv (presumably the submission template -- confirm).
test_sub = _load_happiness_csv("./data/happiness_submit.csv")
3. 查看数据
# Dimensions of the complete test frame as (rows, columns).
test. shape
(2968, 139)
# Dimensions of the frame read from happiness_submit.csv as (rows, columns).
test_sub. shape
(2968, 2)
# Dimensions of the complete training frame as (rows, columns).
train. shape
(8000, 140)
# Preview the leading rows of the complete training frame.
train. head( )
id happiness survey_type province city county survey_time gender birth nationality religion religion_freq edu edu_other edu_status edu_yr income political join_party floor_area property_0 property_1 property_2 property_3 property_4 property_5 property_6 property_7 property_8 property_other height_cm weight_jin health health_problem depression hukou hukou_loc media_1 media_2 media_3 media_4 media_5 media_6 leisure_1 leisure_2 leisure_3 leisure_4 leisure_5 leisure_6 leisure_7 leisure_8 leisure_9 leisure_10 leisure_11 leisure_12 socialize relax learn social_neighbor social_friend socia_outing equity class class_10_before class_10_after class_14 work_exper work_status work_yr work_type work_manage insur_1 insur_2 insur_3 insur_4 family_income family_m family_status house car invest_0 invest_1 invest_2 invest_3 invest_4 invest_5 invest_6 invest_7 invest_8 invest_other son daughter minor_child marital marital_1st s_birth marital_now s_edu s_political s_hukou s_income s_work_exper s_work_status s_work_type f_birth f_edu f_political f_work_14 m_birth m_edu m_political m_work_14 status_peer status_3_before view inc_ability inc_exp trust_1 trust_2 trust_3 trust_4 trust_5 trust_6 trust_7 trust_8 trust_9 trust_10 trust_11 trust_12 trust_13 neighbor_familiarity public_service_1 public_service_2 public_service_3 public_service_4 public_service_5 public_service_6 public_service_7 public_service_8 public_service_9 0 1 4 1 12 32 59 2015/8/4 14:18 1 1959 1 1 1 11 NaN 4.0 -2.0 20000 1 NaN 45.0 0 1 0 0 0 0 0 0 0 NaN 176 155 3 2 5 5 2.0 4 2 5 5 4 3 1 4 3 1 2 3 4 1 4 5 4 1 2 4 3 3.0 3.0 2 3 3 3 3 1 1 3.0 30.0 1.0 2.0 1 1 1 2 60000.0 2 2 1 2 0 1 0 0 0 0 0 0 0 NaN 1 0 0.0 3 1984.0 1958.0 1984.0 6.0 1.0 5.0 40000.0 5.0 NaN NaN -2 4 4 1 -2 4 1 1 3 2 4 3 50000.0 4 2 -8 -8 5 3 2 3 4 3 -8 4 1 4 50 60 50 50 30.0 30 50 50 50 1 2 4 2 18 52 85 2015/7/21 15:04 1 1992 1 1 1 12 NaN 4.0 2013.0 20000 1 NaN 110.0 0 0 0 0 1 0 0 0 0 NaN 170 110 5 4 3 1 1.0 2 2 1 3 5 1 2 3 4 3 5 4 3 2 3 4 5 1 2 4 3 6.0 2.0 
1 3 6 4 8 5 1 3.0 2.0 1.0 3.0 1 1 1 1 40000.0 3 4 1 2 0 1 0 0 0 0 0 0 0 NaN 0 0 NaN 1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1972 3 1 2 1973 3 1 2 1 1 4 2 50000.0 5 4 4 3 5 3 3 3 2 3 3 3 2 3 90 70 70 80 85.0 70 90 60 60 2 3 4 2 29 83 126 2015/7/21 13:24 2 1967 1 0 3 4 NaN 4.0 -2.0 2000 1 NaN 120.0 0 1 1 0 0 0 0 0 0 NaN 160 122 4 4 5 1 1.0 2 2 2 5 1 3 1 4 4 3 5 4 4 2 3 5 5 5 3 4 2 2.0 5.0 2 4 5 4 6 3 2 NaN NaN NaN NaN 1 1 2 2 8000.0 3 3 1 2 0 1 0 0 0 0 0 0 0 NaN 0 2 1.0 3 1990.0 1968.0 1990.0 3.0 1.0 1.0 6000.0 3.0 NaN NaN -2 1 1 2 -2 1 1 2 2 1 4 2 80000.0 3 3 3 3 4 3 3 3 3 3 -8 3 1 4 90 80 75 79 80.0 90 90 90 75 3 4 5 2 10 28 51 2015/7/25 17:33 2 1943 1 1 1 3 NaN 4.0 1959.0 6420 1 NaN 78.0 0 0 0 1 0 0 0 0 0 NaN 163 170 4 4 4 1 2.0 2 1 1 5 1 1 1 5 2 4 5 4 5 1 1 5 5 5 2 4 4 1.0 6.0 1 4 5 5 7 2 4 NaN NaN NaN NaN 2 2 2 2 12000.0 3 3 1 1 0 1 0 0 0 0 0 0 0 NaN 1 4 0.0 7 1960.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN -2 14 1 2 -2 1 1 2 2 1 3 2 10000.0 3 3 4 3 5 3 3 5 4 3 3 3 2 3 100 90 70 80 80.0 90 90 80 80 4 5 4 1 7 18 36 2015/8/10 9:50 2 1994 1 1 1 12 NaN 1.0 2014.0 -1 2 NaN 70.0 0 0 0 0 1 0 0 0 0 NaN 165 110 5 5 3 2 3.0 1 3 4 2 5 5 3 3 3 2 4 4 3 5 2 5 5 1 4 3 4 7.0 5.0 3 2 1 1 1 4 6 NaN NaN NaN NaN 1 2 2 2 -2.0 4 3 1 1 0 1 0 0 0 0 0 0 0 NaN 0 0 NaN 1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1970 6 1 10 1972 4 1 15 3 2 3 -8 200000.0 4 3 3 3 5 5 3 4 3 3 3 3 2 2 50 50 50 50 50.0 50 50 50 50
# Print a per-column summary of the training frame: verbose=True lists every
# column, show_counts=True includes each column's non-null count.
train. info( verbose= True , show_counts= True )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 140 columns):
 # Column Non-Null Count Dtype
--- ------ -------------- ----- 0 id 8000 non-null int64 1 happiness 8000 non-null int64 2 survey_type 8000 non-null int64 3 province 8000 non-null int64 4 city 8000 non-null int64 5 county 8000 non-null int64 6 survey_time 8000 non-null object 7 gender 8000 non-null int64 8 birth 8000 non-null int64 9 nationality 8000 non-null int64 10 religion 8000 non-null int64 11 religion_freq 8000 non-null int64 12 edu 8000 non-null int64 13 edu_other 3 non-null object 14 edu_status 6880 non-null float64 15 edu_yr 6028 non-null float64 16 income 8000 non-null int64 17 political 8000 non-null int64 18 join_party 824 non-null float64 19 floor_area 8000 non-null float64 20 property_0 8000 non-null int64 21 property_1 8000 non-null int64 22 property_2 8000 non-null int64 23 property_3 8000 non-null int64 24 property_4 8000 non-null int64 25 property_5 8000 non-null int64 26 property_6 8000 non-null int64 27 property_7 8000 non-null int64 28 property_8 8000 non-null int64 29 property_other 66 non-null object 30 height_cm 8000 non-null int64 31 weight_jin 8000 non-null int64 32 health 8000 non-null int64 33 health_problem 8000 non-null int64 34 depression 8000 non-null int64 35 hukou 8000 non-null int64 36 hukou_loc 7996 non-null float64 37 media_1 8000 non-null int64 38 media_2 8000 non-null int64 39 media_3 8000 non-null int64 40 media_4 8000 non-null int64 41 media_5 8000 non-null int64 42 media_6 8000 non-null int64 43 leisure_1 8000 non-null int64 44 leisure_2 8000 non-null int64 45 leisure_3 8000 non-null int64 46 leisure_4 8000 non-null int64 47 leisure_5 8000 non-null int64 48 leisure_6 8000 non-null int64 49 leisure_7 8000 non-null int64 50 leisure_8 8000 non-null int64 51 leisure_9 8000 non-null int64 52 leisure_10 8000 non-null int64 53 leisure_11 8000 non-null int64 54 leisure_12 8000 non-null int64 55 socialize 8000 non-null int64 56 relax 8000 non-null int64 57 learn 8000 non-null int64 58 social_neighbor 7204 non-null float64 59 social_friend 7204 non-null float64 60 
socia_outing 8000 non-null int64 61 equity 8000 non-null int64 62 class 8000 non-null int64 63 class_10_before 8000 non-null int64 64 class_10_after 8000 non-null int64 65 class_14 8000 non-null int64 66 work_exper 8000 non-null int64 67 work_status 2951 non-null float64 68 work_yr 2951 non-null float64 69 work_type 2951 non-null float64 70 work_manage 2951 non-null float64 71 insur_1 8000 non-null int64 72 insur_2 8000 non-null int64 73 insur_3 8000 non-null int64 74 insur_4 8000 non-null int64 75 family_income 7999 non-null float64 76 family_m 8000 non-null int64 77 family_status 8000 non-null int64 78 house 8000 non-null int64 79 car 8000 non-null int64 80 invest_0 8000 non-null int64 81 invest_1 8000 non-null int64 82 invest_2 8000 non-null int64 83 invest_3 8000 non-null int64 84 invest_4 8000 non-null int64 85 invest_5 8000 non-null int64 86 invest_6 8000 non-null int64 87 invest_7 8000 non-null int64 88 invest_8 8000 non-null int64 89 invest_other 29 non-null object 90 son 8000 non-null int64 91 daughter 8000 non-null int64 92 minor_child 6934 non-null float64 93 marital 8000 non-null int64 94 marital_1st 7172 non-null float64 95 s_birth 6282 non-null float64 96 marital_now 6230 non-null float64 97 s_edu 6282 non-null float64 98 s_political 6282 non-null float64 99 s_hukou 6282 non-null float64 100 s_income 6282 non-null float64 101 s_work_exper 6282 non-null float64 102 s_work_status 2565 non-null float64 103 s_work_type 2565 non-null float64 104 f_birth 8000 non-null int64 105 f_edu 8000 non-null int64 106 f_political 8000 non-null int64 107 f_work_14 8000 non-null int64 108 m_birth 8000 non-null int64 109 m_edu 8000 non-null int64 110 m_political 8000 non-null int64 111 m_work_14 8000 non-null int64 112 status_peer 8000 non-null int64 113 status_3_before 8000 non-null int64 114 view 8000 non-null int64 115 inc_ability 8000 non-null int64 116 inc_exp 8000 non-null float64 117 trust_1 8000 non-null int64 118 trust_2 8000 non-null int64 119 trust_3 8000 non-null int64 120 
trust_4 8000 non-null int64 121 trust_5 8000 non-null int64 122 trust_6 8000 non-null int64 123 trust_7 8000 non-null int64 124 trust_8 8000 non-null int64 125 trust_9 8000 non-null int64 126 trust_10 8000 non-null int64 127 trust_11 8000 non-null int64 128 trust_12 8000 non-null int64 129 trust_13 8