有人说parquet是列存储,textfile 是行存储(类似关系型数据库oracle表),那么我半信半疑,那么我们试试看。我有个毛病,听别人说,没有底气,凡事都想自己尝试,拿事实说话。
1.先试试parquet表
create TABLE prestat.stat_nps_user_week_test (timevalue TIMESTAMP,imsi STRING,msisdn STRING,kpi_nps_value DOUBLE,kpi_user_cnt DOUBLE,kpi_recommender_cnt DOUBLE,kpi_neutrals_cnt DOUBLE,kpi_detractors_cnt DOUBLE,kpi_recommender_rate DOUBLE,kpi_neutrals_rate DOUBLE,kpi_detractors_rate DOUBLE,kpi_mention_rate DOUBLE)PARTITIONED BY (day INT,minute CHAR(4))STORED AS PARQUET
2.我们先挑20171016号,547M,来做实验
3. 2017-10-16 复制到 20180401
insert overwrite table prestat.stat_nps_user_week_test partition(day=20180401,minute='0000')select timevalue ,imsi ,msisdn ,kpi_nps_value ,kpi_user_cnt ,kpi_recommender_cnt ,kpi_neutrals_cnt ,kpi_detractors_cnt ,kpi_recommender_rate ,kpi_neutrals_rate ,kpi_detractors_rate ,kpi_mention_rate from prestat.stat_nps_user_week where day=20171016;
4. 20180402 把全部值赋为123,避免有些指标为null ,影响测试效果。
insert overwrite table prestat.stat_nps_user_week_test partition(day=20180402,minute='0000')select timevalue ,imsi ,msisdn ,123 ,123 ,123 ,123 ,123 ,123 ,123 ,123 ,123 from prestat.stat_nps_user_week where day=20171016
5. 把中间字段全部填null ,如果是行表,这里会浪费存储。
insert overwrite table prestat.stat_nps_user_week_test partition(day=20180403,minute='0000')select timevalue ,imsi ,msisdn ,null ,null ,null ,null ,null ,null ,null ,null ,123 from prestat.stat_nps_user_week where day=20171016
6. 这里把123放在前面,后面填null ,刚好与步骤5形成对比。
insert overwrite table prestat.stat_nps_user_week_test partition(day=20180404,minute='0000')select timevalue ,imsi ,msisdn ,123 ,null ,null ,null ,null ,null ,null ,null ,null from prestat.stat_nps_user_week where day=20171016
7.我们来看看结果
[root@SLAVE09 ~]# hadoop fs -du -h /user/hive/warehouse/prestat.db/stat_nps_user_week_test
533.6 M 1.0 G /user/hive/warehouse/prestat.db/stat_nps_user_week_test/day=20180401
519.2 M 1.0 G /user/hive/warehouse/prestat.db/stat_nps_user_week_test/day=20180402
514.3 M 1.0 G /user/hive/warehouse/prestat.db/stat_nps_user_week_test/day=20180403
514.3 M 1.0 G /user/hive/warehouse/prestat.db/stat_nps_user_week_test/day=20180404
20180403 与 20180404 是一样的存储, 说明这是行式存储特性。 不会因为中间字段没有填写,而增加存储与io scan。
在设计表时。 列的多少并不影响ioscan。 注意千万不能用 select * , 这样查询了所有列。增加了多余io。
=============================================================
下面测试text表。 行表的特性。(浪费存储,增加读的io)
Query: create TABLE prestat.stat_nps_user_week_test_text (timevalue TIMESTAMP,imsi STRING,msisdn STRING,kpi_nps_value DOUBLE,kpi_user_cnt DOUBLE,kpi_recommender_cnt DOUBLE,kpi_neutrals_cnt DOUBLE,kpi_detractors_cnt DOUBLE,kpi_recommender_rate DOUBLE,kpi_neutrals_rate DOUBLE,kpi_detractors_rate DOUBLE,kpi_mention_rate DOUBLE)PARTITIONED BY (day INT,minute CHAR(4))Fetched 0 row(s) in 0.16s
[SLAVE08:21000] > show create table prestat.stat_nps_user_week_test_text ;
Query: show create table prestat.stat_nps_user_week_test_text
+------------------------------------------------------------------------------------+
| result |
+------------------------------------------------------------------------------------+
| CREATE TABLE prestat.stat_nps_user_week_test_text ( |
| timevalue TIMESTAMP, |
| imsi STRING, |
| msisdn STRING, |
| kpi_nps_value DOUBLE, |
| kpi_user_cnt DOUBLE, |
| kpi_recommender_cnt DOUBLE, |
| kpi_neutrals_cnt DOUBLE, |
| kpi_detractors_cnt DOUBLE, |
| kpi_recommender_rate DOUBLE, |
| kpi_neutrals_rate DOUBLE, |
| kpi_detractors_rate DOUBLE, |
| kpi_mention_rate DOUBLE |
| ) |
| PARTITIONED BY ( |
| day INT, |
| minute CHAR(4) |
| ) |
| STORED AS TEXTFILE |
| LOCATION 'hdfs://myha/user/hive/warehouse/prestat.db/stat_nps_user_week_test_text' |
| TBLPROPERTIES ('transient_lastDdlTime'='1523429152') |
+------------------------------------------------------------------------------------+
insert overwrite table prestat.stat_nps_user_week_test_text partition(day=20180401,minute='0000')select timevalue ,imsi ,msisdn ,kpi_nps_value ,kpi_user_cnt ,kpi_recommender_cnt ,kpi_neutrals_cnt ,kpi_detractors_cnt ,kpi_recommender_rate ,kpi_neutrals_rate ,kpi_detractors_rate ,kpi_mention_rate from prestat.stat_nps_user_week where day=20171016;insert overwrite table prestat.stat_nps_user_week_test_text partition(day=20180402,minute='0000')select timevalue ,imsi ,msisdn ,123 ,123 ,123 ,123 ,123 ,123 ,123 ,123 ,123 from prestat.stat_nps_user_week where day=20171016insert overwrite table prestat.stat_nps_user_week_test_text partition(day=20180405,minute='0000')select timevalue ,imsi ,msisdn ,null ,null ,null ,null ,null ,null ,null ,null ,123 from prestat.stat_nps_user_week where day=20171016insert overwrite table prestat.stat_nps_user_week_test_text partition(day=20180404,minute='0000')select timevalue ,imsi ,msisdn ,123 ,null ,null ,null ,null ,null ,null ,null ,null from prestat.stat_nps_user_week where day=20171016insert overwrite table prestat.stat_nps_user_week_test_text partition(day=20180405,minute='0000')select timevalue ,imsi ,msisdn ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,'' ,123 from prestat.stat_nps_user_week where day=20171016
结果如下,但是文本表,null值不是很明显, 我找一个200列,再测试下。
====================================================
文本表200列,测试。
create TABLE prestat.stat_nps_user_week_test_text200 (timevalue TIMESTAMP,imsi STRING,msisdn STRING,kpi_nps_value DOUBLE,kpi_user_cnt DOUBLE,kpi_recommender_cnt DOUBLE,kpi_neutrals_cnt DOUBLE,kpi_detractors_cnt DOUBLE,kpi_recommender_rate DOUBLE,kpi_neutrals_rate DOUBLE,kpi_detractors_rate DOUBLE,kpi_mention_rate DOUBLE
, k1 bigint
,k2 bigint
,k3 bigint
,k4 bigint
,k5 bigint
,k6 bigint
,k7 bigint
,k8 bigint
,k9 bigint
,k10 bigint
,k11 bigint
,k12 bigint
,k13 bigint
,k14 bigint
,k15 bigint
,k16 bigint
,k17 bigint
,k18 bigint
,k19 bigint
,k20 bigint
,k21 bigint
,k22 bigint
,k23 bigint
,k24 bigint
,k25 bigint
,k26 bigint
,k27 bigint
,k28 bigint
,k29 bigint
,k30 bigint
,k31 bigint
,k32 bigint
,k33 bigint
,k34 bigint
,k35 bigint
,k36 bigint
,k37 bigint
,k38 bigint
,k39 bigint
,k40 bigint
,k41 bigint
,k42 bigint
,k43 bigint
,k44 bigint
,k45 bigint
,k46 bigint
,k47 bigint
,k48 bigint
,k49 bigint
,k50 bigint
,k51 bigint
,k52 bigint
,k53 bigint
,k54 bigint
,k55 bigint
,k56 bigint
,k57 bigint
,k58 bigint
,k59 bigint
,k60 bigint
,k61 bigint
,k62 bigint
,k63 bigint
,k64 bigint
,k65 bigint
,k66 bigint
,k67 bigint
,k68 bigint
,k69 bigint
,k70 bigint
,k71 bigint
,k72 bigint
,k73 bigint
,k74 bigint
,k75 bigint
,k76 bigint
,k77 bigint
,k78 bigint
,k79 bigint
,k80 bigint
,k81 bigint
,k82 bigint
,k83 bigint
,k84 bigint
,k85 bigint
,k86 bigint
,k87 bigint
,k88 bigint
,k89 bigint
,k90 bigint
,k91 bigint
,k92 bigint
,k93 bigint
,k94 bigint
,k95 bigint
,k96 bigint
,k97 bigint
,k98 bigint
,k99 bigint
,k100 bigint
,k101 bigint
,k102 bigint
,k103 bigint
,k104 bigint
,k105 bigint
,k106 bigint
,k107 bigint
,k108 bigint
,k109 bigint
,k110 bigint
,k111 bigint
,k112 bigint
,k113 bigint
,k114 bigint
,k115 bigint
,k116 bigint
,k117 bigint
,k118 bigint
,k119 bigint
,k120 bigint
,k121 bigint
,k122 bigint
,k123 bigint
,k124 bigint
,k125 bigint
,k126 bigint
,k127 bigint
,k128 bigint
,k129 bigint
,k130 bigint
,k131 bigint
,k132 bigint
,k133 bigint
,k134 bigint
,k135 bigint
,k136 bigint
,k137 bigint
,k138 bigint
,k139 bigint
,k140 bigint
,k141 bigint
,k142 bigint
,k143 bigint
,k144 bigint
,k145 bigint
,k146 bigint
,k147 bigint
,k148 bigint
,k149 bigint
,k150 bigint
,k151 bigint
,k152 bigint
,k153 bigint
,k154 bigint
,k155 bigint
,k156 bigint
,k157 bigint
,k158 bigint
,k159 bigint
,k160 bigint
,k161 bigint
,k162 bigint
,k163 bigint
,k164 bigint
,k165 bigint
,k166 bigint
,k167 bigint
,k168 bigint
,k169 bigint
,k170 bigint
,k171 bigint
,k172 bigint
,k173 bigint
,k174 bigint
,k175 bigint
,k176 bigint
,k177 bigint
,k178 bigint
,k179 bigint
,k180 bigint
,k181 bigint
,k182 bigint
,k183 bigint
,k184 bigint
,k185 bigint
,k186 bigint
,k187 bigint
,k188 bigint
,k189 bigint
,k190 bigint
,k191 bigint
,k192 bigint
,k193 bigint
,k194 bigint
,k195 bigint
,k196 bigint
,k197 bigint
,k198 bigint
,k199 bigint
,k200 bigint)PARTITIONED BY (day INT,minute CHAR(4))
insert overwrite table prestat.stat_nps_user_week_test_text200 partition(day=20180403,minute='0000')select timevalue ,imsi ,msisdn ,null ,null ,null ,null ,null ,null ,null ,null ,123
,null as k1
,null as k2
,null as k3
,null as k4
,null as k5
,null as k6
,null as k7
,null as k8
,null as k9
,null as k10
,null as k11
,null as k12
,null as k13
,null as k14
,null as k15
,null as k16
,null as k17
,null as k18
,null as k19
,null as k20
,null as k21
,null as k22
,null as k23
,null as k24
,null as k25
,null as k26
,null as k27
,null as k28
,null as k29
,null as k30
,null as k31
,null as k32
,null as k33
,null as k34
,null as k35
,null as k36
,null as k37
,null as k38
,null as k39
,null as k40
,null as k41
,null as k42
,null as k43
,null as k44
,null as k45
,null as k46
,null as k47
,null as k48
,null as k49
,null as k50
,null as k51
,null as k52
,null as k53
,null as k54
,null as k55
,null as k56
,null as k57
,null as k58
,null as k59
,null as k60
,null as k61
,null as k62
,null as k63
,null as k64
,null as k65
,null as k66
,null as k67
,null as k68
,null as k69
,null as k70
,null as k71
,null as k72
,null as k73
,null as k74
,null as k75
,null as k76
,null as k77
,null as k78
,null as k79
,null as k80
,null as k81
,null as k82
,null as k83
,null as k84
,null as k85
,null as k86
,null as k87
,null as k88
,null as k89
,null as k90
,null as k91
,null as k92
,null as k93
,null as k94
,null as k95
,null as k96
,null as k97
,null as k98
,null as k99
,null as k100
,null as k101
,null as k102
,null as k103
,null as k104
,null as k105
,null as k106
,null as k107
,null as k108
,null as k109
,null as k110
,null as k111
,null as k112
,null as k113
,null as k114
,null as k115
,null as k116
,null as k117
,null as k118
,null as k119
,null as k120
,null as k121
,null as k122
,null as k123
,null as k124
,null as k125
,null as k126
,null as k127
,null as k128
,null as k129
,null as k130
,null as k131
,null as k132
,null as k133
,null as k134
,null as k135
,null as k136
,null as k137
,null as k138
,null as k139
,null as k140
,null as k141
,null as k142
,null as k143
,null as k144
,null as k145
,null as k146
,null as k147
,null as k148
,null as k149
,null as k150
,null as k151
,null as k152
,null as k153
,null as k154
,null as k155
,null as k156
,null as k157
,null as k158
,null as k159
,null as k160
,null as k161
,null as k162
,null as k163
,null as k164
,null as k165
,null as k166
,null as k167
,null as k168
,null as k169
,null as k170
,null as k171
,null as k172
,null as k173
,null as k174
,null as k175
,null as k176
,null as k177
,null as k178
,null as k179
,null as k180
,null as k181
,null as k182
,null as k183
,null as k184
,null as k185
,null as k186
,null as k187
,null as k188
,null as k189
,null as k190
,null as k191
,null as k192
,null as k193
,null as k194
,null as k195
,null as k196
,null as k197
,null as k198
,null as k199
,123 as k200from prestat.stat_nps_user_week where day=20171016insert overwrite table prestat.stat_nps_user_week_test_text200 partition(day=20180404,minute='0000')select timevalue ,imsi ,msisdn ,123 ,null ,null ,null ,null ,null ,null ,null ,null ,233 as k1
,null as k2
,null as k3
,null as k4
,null as k5
,null as k6
,null as k7
,null as k8
,null as k9
,null as k10
,null as k11
,null as k12
,null as k13
,null as k14
,null as k15
,null as k16
,null as k17
,null as k18
,null as k19
,null as k20
,null as k21
,null as k22
,null as k23
,null as k24
,null as k25
,null as k26
,null as k27
,null as k28
,null as k29
,null as k30
,null as k31
,null as k32
,null as k33
,null as k34
,null as k35
,null as k36
,null as k37
,null as k38
,null as k39
,null as k40
,null as k41
,null as k42
,null as k43
,null as k44
,null as k45
,null as k46
,null as k47
,null as k48
,null as k49
,null as k50
,null as k51
,null as k52
,null as k53
,null as k54
,null as k55
,null as k56
,null as k57
,null as k58
,null as k59
,null as k60
,null as k61
,null as k62
,null as k63
,null as k64
,null as k65
,null as k66
,null as k67
,null as k68
,null as k69
,null as k70
,null as k71
,null as k72
,null as k73
,null as k74
,null as k75
,null as k76
,null as k77
,null as k78
,null as k79
,null as k80
,null as k81
,null as k82
,null as k83
,null as k84
,null as k85
,null as k86
,null as k87
,null as k88
,null as k89
,null as k90
,null as k91
,null as k92
,null as k93
,null as k94
,null as k95
,null as k96
,null as k97
,null as k98
,null as k99
,null as k100
,null as k101
,null as k102
,null as k103
,null as k104
,null as k105
,null as k106
,null as k107
,null as k108
,null as k109
,null as k110
,null as k111
,null as k112
,null as k113
,null as k114
,null as k115
,null as k116
,null as k117
,null as k118
,null as k119
,null as k120
,null as k121
,null as k122
,null as k123
,null as k124
,null as k125
,null as k126
,null as k127
,null as k128
,null as k129
,null as k130
,null as k131
,null as k132
,null as k133
,null as k134
,null as k135
,null as k136
,null as k137
,null as k138
,null as k139
,null as k140
,null as k141
,null as k142
,null as k143
,null as k144
,null as k145
,null as k146
,null as k147
,null as k148
,null as k149
,null as k150
,null as k151
,null as k152
,null as k153
,null as k154
,null as k155
,null as k156
,null as k157
,null as k158
,null as k159
,null as k160
,null as k161
,null as k162
,null as k163
,null as k164
,null as k165
,null as k166
,null as k167
,null as k168
,null as k169
,null as k170
,null as k171
,null as k172
,null as k173
,null as k174
,null as k175
,null as k176
,null as k177
,null as k178
,null as k179
,null as k180
,null as k181
,null as k182
,null as k183
,null as k184
,null as k185
,null as k186
,null as k187
,null as k188
,null as k189
,null as k190
,null as k191
,null as k192
,null as k193
,null as k194
,null as k195
,null as k196
,null as k197
,null as k198
,null as k199
,null as k200from prestat.stat_nps_user_week where day=20171016
总结:
经过测试, 文本表和parquet表,都一样,是列式存储。textfile表并不像关系型数据库(oracle为代表), 在textfiel表中,中间字段即使是null,并最后一列有值。也不会像行表那样占用存储。 在做预统时,用null或者 '' 占位都是可以的。