处理Hive中的数据倾斜的方法
目录
- 1 groupby(大表分组-局部聚合+全局聚合)
 - 2 join(大中表Join - 加salt + 小表膨胀)
 - 3 双大表Join - 抽样取倾斜key+BroadJoin
 - 4 小结
 
1 groupby(大表分组-局部聚合+全局聚合)
示例1:
select label,sum(cnt) as all from 
(
    select rd,label,sum(1) as cnt from 
    (
        select id,label,round(rand(),2) as rd,value from tmp1
    ) as tmp
    group by rd,label
) as tmp
group by label;示例2:
select split(new_source,'\\_')[0] as source ,sum(cnt) as cnt from (select concat(source,'_', rand()*100) as new_source ,count(1) as cnt from test_table where day ='2022-01-01' group by concat(source,'_', rand()*100) )tt group by split(new_source,'\\_')[0]
2 join(大中表Join - 加salt + 小表膨胀)
示例1:
select label,sum(value) as all from 
(
    select rd,label,sum(value) as cnt from
    (
        select tmp1.rd as rd,tmp1.label as label,tmp1.value*tmp2.value as value 
        from 
        (
            select id,round(rand(),1) as rd,label,value from tmp1
        ) as tmp1
        join
        (
            select id,rd,label,value from tmp2
            lateral view explode(split('0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9',',')) mytable as rd
        ) as tmp2
        on tmp1.rd = tmp2.rd and tmp1.label = tmp2.label
    ) as tmp1
    group by rd,label
) as tmp1
group by label;示例2:
select 
	source
	,source_name
	,sum(cnt) as cnt 
from  
(select 
	t1.source 
	,new_source
	,nvl(source_name,'未知') as source_name 
	,count(imei) as cnt 
from  
(select  
	imei
	,source 
	,concat(cast(rand()*10 as int ),'_',source ) as new_source
from  test_table_1
where day ='2022-01-01'
) t1 
inner join 
(
select 
	source_name 
	,concat(preflix,'_',source) as new_source
from  test_table_1
where day ='2022-01-01'
lateral view explode(split('0,1,2,3,4,5,6,7,8,9,10',','))b as preflix 
) t2 
on t1.new_source =t2.new_source
group by 
t1.source 
,new_source
,nvl(source_name,'未知')
) tta  
group by 
	source
	,source_name3 双大表Join - 抽样取倾斜key+BroadJoin
##优化前: create table test.tmp_table_test_all as select imei ,lable_id ,nvl(label_name,'未知') from tmp_table_1 t1 left join (select lable_id ,label_name from tmp_table_2 where day ='2024-01-01') t2 on t1.lable_id =t2.lable_id where t1.day ='2024-01-01' ; ## 优化后 : create table test.tmp_table_test_all_new as with tmp_table_test_1 as (select lable_id ,count(1) as cnt from tmp_table_1 t1 tablesample(5 percent) --抽样取5%的数据,减少table scan的量 group by lable_id order by cnt desc limit 100 ) select imei ,lable_id ,nvl(label_name,'未知') as label_name from tmp_table_1 t1 left join tmp_table_test_1 t2 on t1.lable_id =t2.lable_id left join (select lable_id ,label_name from tmp_table_2 where day ='2024-01-01') t3 on t1.lable_id =t3.lable_id where t1.day ='2024-01-01' and t2.lable_id is null union all select imei ,lable_id ,nvl(label_name,'未知') as label_name from tmp_table_1 t1 inner join (select lable_id from tmp_table_test_1 t1 left join tmp_table_2 t2 on t1.lable_id =t2.lable_id where t2.day ='2024-01-01') t3 on t1.lable_id =t3.lable_id where t1.day ='2024-01-01' ;
4 小结
到此这篇关于处理Hive中的数据倾斜的方法的文章就介绍到这了,更多相关处理Hive数据倾斜内容请搜索电脑手机教程网以前的文章或继续浏览下面的相关文章希望大家以后多多支持电脑手机教程网!