常用SQL
创建表
1
2
3
4
5
6
7
|
-- Example MergeTree table.
-- Uses the explicit PARTITION BY / ORDER BY clauses instead of the deprecated
-- MergeTree(eventDate, (impid, eventDate), 8192) form; the legacy form
-- partitioned by month of the date column, hence toYYYYMM(eventDate).
CREATE TABLE b6logs
(
    eventDate Date,
    impid UInt64,
    uid String,
    idfa String,
    imei String
)
ENGINE = MergeTree
PARTITION BY toYYYYMM(eventDate)
ORDER BY (impid, eventDate)
SETTINGS index_granularity = 8192;
|
一般情况下, 都建议使用 MergeTree 引擎. 旧版建表语法 MergeTree(日期列, 主键, 索引粒度) 要求必须有一个 Date 类型的列来作为日期索引 (用于按月分区), 即上面的 eventDate.
导入CSV数据
1
|
# Feed the CSV file to the client on stdin; the INSERT reads its rows from there.
clickhouse-client --query="INSERT INTO b6logs FORMAT CSV" < xxx.csv
|
指定分隔符
1
|
# Same import as a plain CSV load, but with "|" as the field delimiter.
clickhouse-client \
    --format_csv_delimiter="|" \
    --query="INSERT INTO b6logs FORMAT CSV" \
    < xxx.csv
|
导入数据时忽略错误
1
|
# Start the client with both input-error tolerance settings supplied.
clickhouse-client \
    --input_format_allow_errors_num=100000 \
    --input_format_allow_errors_ratio=0.2
|
--input_format_allow_errors_num: 允许的错误行数上限.
--input_format_allow_errors_ratio: 允许的错误行比例上限, 取值范围是 [0, 1].
导出 CSV 数据
1
|
# Run the join query non-interactively and redirect the client's output into a file.
# The trailing "format CSV" inside the query selects CSV as the output format.
clickhouse-client --query="select uid, idfa, imei from (select impid, uid from b2logs where impid >= 15289903030261609347 and impid <= 15289904230261609347) any inner join (select impid, idfa, imei from b6logs where impid >= 15289903030261609347 and impid <= 15289904230261609347) using(impid) format CSV" > 9c9dc608-269b-4f02-b122-ef5dffb2669d.log
|
即语法为 select xxxx format CSV
重命名表
1
|
-- Rename tbl1 to tbl2 (the original target name "btl2" was a typo).
RENAME TABLE tbl1 TO tbl2;
|
删除表
1
|
-- Remove the table named tbl.
DROP TABLE tbl;
|
添加列
1
|
-- Add a UInt32 column named cost whose default value is 0.
ALTER TABLE dsp_statis
    ADD COLUMN cost UInt32 DEFAULT 0;
|
查看表结构
1
|
-- Show the column definitions of tbl (DESC is the short form of DESCRIBE TABLE).
DESCRIBE TABLE tbl;
|
更多语法, 参考官方文档: https://clickhouse.com/docs/en/sql-reference
MergeTree 引擎中删除分区
注意, 默认情况下 (旧版建表语法) MergeTree 引擎是按月分区的, 删除分区时分区名的格式为 201808. 如果想修改为按日分区, 则在建表时加上:
1
2
3
|
-- Engine clause fragment: partition directly on eventDate, sort by imp_id.
ENGINE = MergeTree
PARTITION BY eventDate
ORDER BY imp_id
SETTINGS index_granularity = 8192;
然后就可以:
-- Drop the partition whose value is '2018-08-08' from table xxx.
ALTER TABLE xxx
    DROP PARTITION '2018-08-08';
|
默认情况下, ClickHouse 不允许删除大小超过 50GB 的表或分区. 可以通过修改 server 的配置文件来永久调整这个限制, 也可以临时设置一下, 无需重启服务即可删除.
永久配置
1
2
3
4
5
6
7
8
|
# Open the ClickHouse server configuration file in vim via sudo.
sudo vim /etc/clickhouse-server/config.xml
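然后取消下面两行的注释 (默认配置里这两行是被注释掉的):
<!-- <max_table_size_to_drop>0</max_table_size_to_drop> -->
<!-- <max_partition_size_to_drop>0</max_partition_size_to_drop> -->
取消注释后为:
<max_table_size_to_drop>0</max_table_size_to_drop>
<max_partition_size_to_drop>0</max_partition_size_to_drop>
0 表示不限制. 你也可以设置为想要限制的最大大小 (单位为字节).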
|
临时设置
创建个标志文件:
1
|
# Create the force_drop_table flag file, then set its mode to 666 (read/write for everyone).
# NOTE(review): the flags directory presumably sits under the server's data path — adjust the path to your installation.
sudo touch '/home/username/clickhouse/flags/force_drop_table' && sudo chmod 666 '/home/username/clickhouse/flags/force_drop_table'
|
创建好之后, 就可以执行上面的删除分区或表的命令了.
查看表大小
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
|
-- Per-table storage summary built from the active parts in system.parts.
-- The inner query aggregates raw numbers per (database, table); the outer
-- query only formats the byte counts into human-readable sizes.
SELECT
    database,
    table,
    formatReadableSize(size) AS size,
    formatReadableSize(bytes_on_disk) AS bytes_on_disk,
    formatReadableSize(data_uncompressed_bytes) AS data_uncompressed_bytes,
    formatReadableSize(data_compressed_bytes) AS data_compressed_bytes,
    compress_rate,
    rows,
    days,
    formatReadableSize(avgDaySize) AS avgDaySize
FROM
(
    SELECT
        database,
        table,
        sum(bytes) AS size,
        sum(rows) AS rows,
        min(min_date) AS min_date,
        max(max_date) AS max_date,
        sum(bytes_on_disk) AS bytes_on_disk,
        sum(data_uncompressed_bytes) AS data_uncompressed_bytes,
        sum(data_compressed_bytes) AS data_compressed_bytes,
        -- Compressed size as a percentage of the uncompressed size.
        (data_compressed_bytes / data_uncompressed_bytes) * 100 AS compress_rate,
        -- Difference in days between the newest and the oldest date.
        max_date - min_date AS days,
        -- Average over the inclusive number of days (days + 1), so a table
        -- whose parts all fall on one day no longer divides by zero.
        size / (days + 1) AS avgDaySize
    FROM system.parts
    WHERE active
    GROUP BY
        database,
        table
    ORDER BY
        database ASC,
        size DESC
);
|
执行 SQL 文件
1
|
# Execute every statement in the SQL file against the given database.
clickhouse-client --database=数据库 --multiquery < /tmp/your.sql.file
|
查看分区信息
1
|
-- List every part of the 'visits' table with its partition and active flag.
SELECT
    partition,
    name,
    active
FROM system.parts
WHERE table = 'visits';
|
性能相关收集
join 表性能
切记, 要用大表 join 小表, 即把小表放在 join 的右边. 原因是 ClickHouse 执行 join 时会把右表整个加载到内存中构建哈希表, 再用左表的数据去匹配, 所以右表越小越好. 从经验上看, 用大表作为驱动表 (左表) 的速度远远快于用小表作为驱动表. (MySQL 里的话, 则是小表驱动大表).
优化 distinct count
之前
1
|
-- Baseline: per-yob row count plus an exact distinct count over the
-- (uid, idfa, imei) combination.
SELECT
    yob,
    count(),
    count(DISTINCT uid, idfa, imei)
FROM nginx_bid_log
WHERE eventDate = '2018-9-1'
GROUP BY yob;
|
之后
1
|
-- Optimized variant: count distinct 64-bit hashes instead of string tuples.
-- The three columns are passed to sipHash64 as separate arguments rather than
-- concatenated first: concat() without a separator maps different tuples such
-- as ('ab', 'c', '') and ('a', 'bc', '') to the same string and undercounts.
-- NOTE(review): the multi-argument form of sipHash64 needs a reasonably
-- recent server — confirm against the deployed version.
SELECT
    yob,
    count(),
    count(DISTINCT sipHash64(uid, idfa, imei))
FROM nginx_bid_log
WHERE eventDate = '2018-9-1'
GROUP BY yob;
|
查看数据分布
1
|
-- Histogram of upstream_resp_time for a single day, emitted as CSV.
SELECT histogram(100)(upstream_resp_time)
FROM
(
    SELECT upstream_resp_time
    FROM nginx_bid_log
    WHERE eventDate = '2018-12-13'
)
FORMAT CSV;
|
histogram(100) 表示最多分成 100 个区间 (bin) 来统计分布, 后面的 upstream_resp_time 是你的列名, 即按这个列的数据来进行统计.
bar
1
|
-- Template: bar(x, min, max, width) draws a text bar for x scaled between
-- min and max; the fourth argument is the bar width, not a step size.
-- Replace the placeholders with a column name and constants.
SELECT upstream_resp_time, bar(列名, 最小值, 最大值, 宽度) FROM tableXX;
|
显示简单的图形.
hex 十六进制 转换为 十进制
1
|
-- unhex() decodes the hex text into bytes, reverse() flips their order,
-- and reinterpretAsInt64() reads the resulting bytes as an Int64.
SELECT
    reinterpretAsInt64(
        reverse(
            unhex('123')
        )
    );
|
md5 分区
1
2
3
4
5
|
-- 1) Turn the first hex character of md5_field into an integer.
SELECT
    reinterpretAsInt64(reverse(unhex(substring(md5_field, 1, 1))));

-- 2) md5 => hex => decimal => modulo: reduce that integer modulo 5.
SELECT
    reinterpretAsInt64(reverse(unhex(substring(md5_field, 1, 1)))) % 5;
|