-- Generate a dash-free random UUID together with two random numbers.
-- The statement was collapsed onto one line, so the first inline "--" comment
-- swallowed the rand(100) column and the terminating semicolon; each column
-- now sits on its own line and the former comment labels are real aliases.
select
    regexp_replace(reflect("java.util.UUID", "randomUUID"), "-", "") as uuid  -- UUID text with the "-" separators removed
    ,rand() as rand_num           -- unseeded random number
    ,rand(100) as rand_num_seed   -- random number generated with seed 100
;
-- Template: draw a random sample of rows from a table.
-- Replace <Table_Name> and <N rows to sample> before running.
-- Rows are distributed by one random value and sorted by another, and the
-- result is then capped at the requested number of rows.
SELECT
    *
FROM
    <Table_Name>
DISTRIBUTE BY
    RAND()
SORT BY
    RAND()
LIMIT <N rows to sample>;
-- TABLESAMPLE examples against source table lxw1.
-- Each statement is on its own line(s): when they shared one line, the first
-- "--" comment turned every following statement into comment text.

-- Take 50% of the data in the source table.
CREATE TABLE lxw1234 AS
SELECT *
FROM lxw1 TABLESAMPLE (50 PERCENT);

-- Take 30M worth of data from the source table.
CREATE TABLE lxw1234_2 AS
SELECT *
FROM lxw1 TABLESAMPLE (30M);

-- Take 200 rows from each map of the source table.
SELECT COUNT(1)
FROM (
    SELECT *
    FROM lxw1 TABLESAMPLE (200 ROWS)
) x;

-- Randomly split the table into 10 buckets and sample the first bucket.
SELECT COUNT(1)
FROM lxw1 TABLESAMPLE (BUCKET 1 OUT OF 10 ON rand());
-- Bucketed-table sampling examples.
-- Each statement is on its own line(s): when they shared one line, the first
-- "--" comment turned every following statement into comment text.

-- Create a bucketed table.
CREATE TABLE lxw1_bucketed (pcid STRING)
CLUSTERED BY (pcid) INTO 10 BUCKETS;

-- Load the data.
INSERT OVERWRITE TABLE lxw1_bucketed
SELECT pcid
FROM lxw1;

-- Sample the first of the 10 buckets.
SELECT COUNT(1)
FROM lxw1_bucketed TABLESAMPLE (BUCKET 1 OUT OF 10 ON pcid);

-- Sample half of the data in the first bucket.
-- (The terminating semicolon was missing here; without it this query would
-- run into the next statement.)
SELECT COUNT(1)
FROM lxw1_bucketed TABLESAMPLE (BUCKET 1 OUT OF 20 ON pcid);

-- Bucket-sample the source table directly; this achieves the same effect.
SELECT COUNT(1)
FROM lxw1 TABLESAMPLE (BUCKET 1 OUT OF 20 ON pcid);
-- Random sampling in Hive to obtain a data sample without repeated rows.
-- Both queries used to sit on the same line as this heading comment, which
-- commented them out; they are restored as statements below.

-- Variant 1: order the whole table by a random value and keep the first 100 rows.
select *
from table_a
order by rand()
limit 100;

-- Variant 2: attach a random integer key (rand() * 100000 cast to int) to each
-- row, then order by that key and keep the first 100 rows.
select *
from (
    select
        e.*
        ,cast(rand() * 100000 as int) as vidx
    from e
) vt
order by vt.vidx
limit 100;
-- Pick at most two randomly chosen rows for every distinct value of "rank" in
-- table a: rows are numbered in random order within each rank partition and
-- only numbers 1 and 2 are kept.
select
    id,
    name,
    age,
    rank
from (
    select
        id,
        name,
        age,
        rank,
        row_number() over (partition by rank order by rand()) as rn
    from a
) ranked
where ranked.rn <= 2
;