-- Random row sampling: tag every row with a uniform random number from rand()
-- and keep only the rows whose number falls between 0 and 0.2.
-- This extracts roughly one fifth of the data.
select
    t.varx,
    t.a
from (
    select
        varx,
        rand() as a  -- per-row random value used as the sampling key
    from tablename
) t
where t.a between 0 and 0.2;
-- test2 (labelled "block sampling" in the original notes).
-- NOTE(review): the statement below is not TABLESAMPLE block sampling; it shuffles rows
-- with a seeded rand() and keeps 100 of them via LIMIT -- confirm the heading belongs here.
-- Alternatively, draw 100 random rows by combining ORDER BY rand() with LIMIT.
-- TABLE is a reserved word in HiveQL, so the placeholder table name is back-quoted.
select distinct a.*
from `table` a
order by rand(222)  -- 222 is the seed passed to rand()
limit 100;
-- test3 (labelled "systematic sampling" in the original notes): TABLESAMPLE examples.

-- Sample by input size: read about 30 MB of table1.
select * from table1 tablesample (30M);

-- Sample by percentage of the input size.
select * from table1 tablesample (15 percent);

-- Sample by row count. Per the Hive sampling documentation the row count is applied
-- to each input split, so the total returned can differ from 200 depending on the
-- number of splits.
select count(1) from (select * from lxw1 tablesample (200 rows)) x;

-- Bucket sampling: hash rows into 20 buckets on rand() and read bucket 1.
-- NOTE(review): the original comment said "take bucket 2" while the code reads bucket 1;
-- with rand() as the bucketing expression either bucket is an equivalent random sample.
select count(1) from table1 tablesample (bucket 1 out of 20 on rand());
-- test4: stratified sampling using mod and rand().
-- Split users into 5 groups by user_id modulo 5, give every row a random rank,
-- and keep the top 20 rows of each group by that random rank.
-- NOTE(review): the original note text mentions 100 users per group while the query
-- keeps 20 -- confirm the intended per-group sample size.
select
    ranked.refund_id,
    ranked.user_id,
    ranked.mod,
    ranked.rank_num
from (
    select
        keyed.refund_id,
        keyed.user_id,
        keyed.mod,
        keyed.rank_num,
        -- number the rows inside each mod group, highest random rank first
        row_number() over (
            partition by keyed.mod
            order by keyed.rank_num desc
        ) as rn
    from (
        select
            refund_id,
            user_id,
            cast(10 + rand() * 100 as double) as rank_num,  -- random ordering key
            user_id % 5 as mod                              -- group key derived from user_id
        from table1
    ) keyed
) ranked
where ranked.rn <= 20;  -- keep 20 rows from each mod group
-- Hive 隨機抽樣 (Hive random sampling)
