Are the bucket hash algorithms of the Tez and MR execution engines different?

Question

I'm using Hive 3.1.2 and created a bucketed table with the table property bucketing_version=2.

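After loading the same data into the bucketed table once with MR and once with Tez, I checked the bucket files using hdfs dfs -cat and saw that the rows were distributed across the buckets differently, i.e. the hashing result was different.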

Are the hash algorithms of Tez and MR different? Shouldn't the result be the same when bucketing_version=2?

Here's the test method and its results.

1. Create Bucket table & Data table

-- Bucketed target table for the engine comparison: rows are clustered on
-- (id, name, age) into 2 buckets, sorted by phone, and stored as plain text.
-- NOTE(review): 'orc.compress' is an ORC setting and presumably has no effect
-- on this text-format table; it is kept so the table properties are unchanged.
CREATE EXTERNAL TABLE bucket_test (
    id    INT    COMMENT ' ',
    name  STRING COMMENT ' ',
    age   INT    COMMENT ' ',
    phone STRING COMMENT ' '
)
CLUSTERED BY (id, name, age)
SORTED BY (phone)
INTO 2 BUCKETS
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
STORED AS
    INPUTFORMAT  'org.apache.hadoop.mapred.TextInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
TBLPROPERTIES (
    'bucketing_version' = '2',
    'orc.compress'      = 'ZLIB'
);

-- Comma-delimited staging table that holds the unbucketed source rows.
CREATE TABLE data_table (
    id    INT,
    name  STRING,
    age   INT,
    phone STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',';

2. Insert data into DATA_TABLE

  -- Load 20 sample rows. stack(20, ...) splits the 80 values that follow into
  -- 20 rows of 4 columns, which are matched positionally to
  -- (id, name, age, phone). The statement is now terminated with ';' so it
  -- does not run into the next statement when executed as a script.
  INSERT INTO TABLE data_table
  SELECT stack(
      20,
      1,  'a', 11, '111',
      1,  'a', 11, '222',
      3,  'b', 14, '333',
      3,  'b', 13, '444',
      5,  'c', 18, '555',
      5,  'c', 18, '666',
      5,  'c', 21, '777',
      8,  'd', 23, '888',
      9,  'd', 24, '999',
      10, 'd', 26, '1110',
      11, 'd', 27, '1112',
      12, 'e', 28, '1113',
      13, 'f', 28, '1114',
      14, 'g', 30, '1115',
      15, 'q', 31, '1116',
      16, 'w', 32, '1117',
      17, 'e', 33, '1118',
      18, 'r', 34, '1119',
      19, 't', 36, '1120',
      20, 'y', 36, '1130'
  );

3. Create Bucket with MR

-- Populate bucket_test using the MapReduce execution engine.
-- hive.enforce.bucketing is no longer set: the property was removed in
-- Hive 2.0 (bucketing is always enforced), so it does nothing on Hive 3.1.2.
SET hive.execution.engine = mr;
SET mapreduce.job.queuename=root.test;

-- Columns are listed explicitly so the positional mapping into bucket_test
-- does not depend on the column order of data_table.
INSERT OVERWRITE TABLE bucket_test
SELECT
    id,
    name,
    age,
    phone
FROM data_table;

4. Check Bucket contents

# bucket0 : 6 rows
[root@test~]# hdfs dfs -cat /user/hive/warehouse/bucket_test/000000_0
10d261110
11d271112
18r341119
3b13444
5c18555
5c18666
 
# bucket1 : 14 rows
[root@test~]# hdfs dfs -cat /user/hive/warehouse/bucket_test/000001_0
1a11111
12e281113
13f281114
14g301115
15q311116
16w321117
17e331118
19t361120
20y361130
1a11222
3b14333
5c21777
8d23888
9d24999

5. Create Bucket with Tez

-- Populate bucket_test using the Tez execution engine.
-- hive.enforce.bucketing is no longer set: the property was removed in
-- Hive 2.0 (bucketing is always enforced), so it does nothing on Hive 3.1.2.
SET hive.execution.engine = tez;
SET tez.queue.name=root.test;

-- Columns are listed explicitly so the positional mapping into bucket_test
-- does not depend on the column order of data_table.
INSERT OVERWRITE TABLE bucket_test
SELECT
    id,
    name,
    age,
    phone
FROM data_table;

6. Check Bucket contents

# bucket0 : 11 rows
[root@test~]# hdfs dfs -cat /user/hive/warehouse/bucket_test/000000_0
1a11111
10d261110
11d271112
13f281114
16w321117
17e331118
18r341119
20y361130
1a11222
5c18555
5c18666
# bucket1 : 9 rows
[root@test~]# hdfs dfs -cat /user/hive/warehouse/bucket_test/000001_0
12e281113
14g301115
15q311116
19t361120
3b14333
3b13444
5c21777
8d23888
9d24999

Are the bucket hash algorithms of Tez and MR different?

0 Answers