I'm using Hive 3.1.2 and created a bucketed table with 'bucketing_version'='2'.
After populating the table, I inspected the bucket files with hdfs dfs -cat and saw that the rows were assigned to buckets differently depending on the execution engine.
Are the hash algorithms used by Tez and MR different? Shouldn't the bucket assignment be the same when 'bucketing_version'='2'?
Here are my test steps and their results.
1. Create the bucketed table and the source data table
-- Bucketed, text-format target table for the repro.
-- Rows are clustered on (id, name, age) into 2 buckets and sorted by phone
-- within each bucket; the table is pinned to bucketing version 2.
-- The 'orc.compress' table property was dropped: it is an ORC setting, and this
-- table is stored through TextInputFormat / HiveIgnoreKeyTextOutputFormat.
CREATE EXTERNAL TABLE `bucket_test` (
    `id`    int    COMMENT ' ',
    `name`  string COMMENT ' ',
    `age`   int    COMMENT ' ',
    `phone` string COMMENT ' '
)
CLUSTERED BY (id, name, age) SORTED BY (phone) INTO 2 BUCKETS
ROW FORMAT SERDE
    'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
STORED AS INPUTFORMAT
    'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
    'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
TBLPROPERTIES (
    'bucketing_version' = '2'
);
-- Comma-delimited staging table that holds the source rows for the bucketing test.
CREATE TABLE data_table (
    id    INT,
    name  STRING,
    age   INT,
    phone STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';
2. Insert data into DATA_TABLE
-- Load 20 fixed test rows into data_table.
-- stack(20, ...) turns the 80 literals below into 20 rows of 4 columns,
-- matched by position to (id, name, age, phone).
INSERT INTO TABLE data_table
SELECT stack(
    20,
    1,  'a', 11, '111',
    1,  'a', 11, '222',
    3,  'b', 14, '333',
    3,  'b', 13, '444',
    5,  'c', 18, '555',
    5,  'c', 18, '666',
    5,  'c', 21, '777',
    8,  'd', 23, '888',
    9,  'd', 24, '999',
    10, 'd', 26, '1110',
    11, 'd', 27, '1112',
    12, 'e', 28, '1113',
    13, 'f', 28, '1114',
    14, 'g', 30, '1115',
    15, 'q', 31, '1116',
    16, 'w', 32, '1117',
    17, 'e', 33, '1118',
    18, 'r', 34, '1119',
    19, 't', 36, '1120',
    20, 'y', 36, '1130'
);  -- terminator was missing in the original statement
3. Populate the bucketed table with MR
-- Rewrite bucket_test from data_table using the MapReduce execution engine.
-- NOTE(review): hive.enforce.bucketing was removed in Hive 2.0, where bucketing is
-- always enforced; presumably a no-op on Hive 3.1.2 -- confirm before relying on it.
SET hive.enforce.bucketing = true;
SET hive.execution.engine = mr;
SET mapreduce.job.queuename=root.test;

-- Explicit column list instead of SELECT * so the load does not silently
-- depend on the column order of data_table.
INSERT OVERWRITE TABLE bucket_test
SELECT
    id,
    name,
    age,
    phone
FROM data_table;
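4. Check the bucket contents (fields appear run together because the table's default field delimiter, Ctrl-A / \001, is non-printing)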
# bucket0 : 6 rows
[root@test~]# hdfs dfs -cat /user/hive/warehouse/bucket_test/000000_0
10d261110
11d271112
18r341119
3b13444
5c18555
5c18666
# bucket1 : 14 rows
[root@test~]# hdfs dfs -cat /user/hive/warehouse/bucket_test/000001_0
1a11111
12e281113
13f281114
14g301115
15q311116
16w321117
17e331118
19t361120
20y361130
1a11222
3b14333
5c21777
8d23888
9d24999
5. Populate the bucketed table with Tez
-- Rewrite bucket_test from data_table using the Tez execution engine.
-- NOTE(review): hive.enforce.bucketing was removed in Hive 2.0, where bucketing is
-- always enforced; presumably a no-op on Hive 3.1.2 -- confirm before relying on it.
SET hive.enforce.bucketing = true;
SET hive.execution.engine = tez;
SET tez.queue.name=root.test;

-- Explicit column list instead of SELECT * so the load does not silently
-- depend on the column order of data_table.
INSERT OVERWRITE TABLE bucket_test
SELECT
    id,
    name,
    age,
    phone
FROM data_table;
6. Check Bucket contents
# bucket0 : 11 rows
[root@test~]# hdfs dfs -cat /user/hive/warehouse/bucket_test/000000_0
1a11111
10d261110
11d271112
13f281114
16w321117
17e331118
18r341119
20y361130
1a11222
5c18555
5c18666
# bucket1 : 9 rows
[root@test~]# hdfs dfs -cat /user/hive/warehouse/bucket_test/000001_0
12e281113
14g301115
15q311116
19t361120
3b14333
3b13444
5c21777
8d23888
9d24999