# ClickHouse DDL

[TOC]

Fixes for connecting IntelliJ IDEA to ClickHouse:

- Driver fails to download automatically: add the driver jar manually from a local file.
- Error that the LZ4 compression algorithm is not supported: in
  Data Source & Drivers => Advanced => compress_algorithm, change the value from lz4 to gzip.

## create database

```sql
-- Atomic is the default database engine in recent versions; the engine clause is optional
create database ebs [engine = Atomic];
```
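
To confirm which engine the database actually got (Atomic has been the default in recent releases), a quick check:

```sql
-- prints the full DDL, including the engine
show create database ebs;
```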

## create table

### MergeTree

The base engine of the MergeTree family; its sparse primary index uses the default index_granularity of 8192 rows.

#### Simple-typed table

```sql
drop table if exists ebs.mt_simple_type_table;
create table if not exists ebs.mt_simple_type_table(
id UInt32 comment 'primary key',
salary Decimal(16,2),
isMember UInt8,
name String,
birthTime DateTime64(3,'Asia/Shanghai'),
birthPlace String,
gender Enum8('男'=0,'女'=1)
) engine = MergeTree()
order by (id,name);

insert into ebs.mt_simple_type_table values
(1,12458.00,0,'张丰','1995-10-30 22:10:32.888','{"province":"江苏","city":"南京"}','男'),
(2,32178.00,0,'鹰王','1986-05-24 09:10:25.123','{"province":"江苏","city":"镇江"}','男'),
(3,20288.00,0,'赵敏','1945-11-13 22:25:14.204','{"province":"安徽","city":"合肥"}','女');
```

#### Partitioned table

```sql
create table if not exists ebs.mt_simple_type_table_par(
id UInt32 comment 'primary key',
salary Decimal(16,2),
isMember UInt8,
name String,
birthTime DateTime64(3,'Asia/Shanghai') comment 'partition column',
birthPlace String,
gender Enum8('男'=0,'女'=1)
) engine = MergeTree()
order by (id,name)
partition by (toYear(birthTime))
primary key (id);

insert into ebs.mt_simple_type_table_par values
(1,12458.00,0,'张丰','1995-10-30 22:10:32.888','{"province":"江苏","city":"南京"}','男'),
(2,32178.00,0,'鹰王','1995-05-24 09:10:25.123','{"province":"江苏","city":"镇江"}','男'),
(3,20288.00,0,'赵敏','1945-11-13 22:25:14.204','{"province":"安徽","city":"合肥"}','女');
-- query the data to see the logical partitions;
-- inspect /var/lib/clickhouse/... on the server to see the physical parts
select * from ebs.mt_simple_type_table_par;

-- insert one more row on its own
insert into ebs.mt_simple_type_table_par values
(1,12458.00,0,'张山','1945-10-30 22:10:32.888','{"province":"江苏","city":"南京"}','男');
-- querying again shows two parts for 1945: the new row sits in its own
-- temporary part until the background merge (compaction) runs
select * from ebs.mt_simple_type_table_par;
-- manually trigger a merge of partition 1945
optimize table ebs.mt_simple_type_table_par partition 1945 final;
-- query again: the two 1945 parts have been merged into one
```

(figure: clickhouse_mt_partition)

Notes:

1. The primary key is optional and NOT unique; it is used to build the sparse primary index.
2. partition by is optional:
   - data is split into separate directories, avoiding full-table scans
   - partitions determine the degree of parallelism
   - batch inserts land in new temporary parts that are merged automatically (roughly every 15 minutes);
     `optimize table TABLE_NAME final` forces a whole-table merge, and
     `optimize table TABLE_NAME partition 'PARTITION_VALUE' final` forces a single-partition merge
     (to watch this happen, see the system.parts query after this list)
3. order by is required:
   - a table without partition by uses a single global partition
   - order by determines how rows are arranged within each partition
   - it is the basis for deduplication
   - if the table declares a primary key, order by must start with the primary key columns as its prefix
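
A minimal way to watch parts being created and merged is the standard system.parts table (database and table names here match the example above):

```sql
-- one row per on-disk part; active = 0 marks parts superseded by a merge
select partition, name, rows, active
from system.parts
where database = 'ebs' and table = 'mt_simple_type_table_par'
order by partition, name;
```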

(figure: clickhouse_mt_partition_detail)

#### Complex-typed table
29
```sql
-- on this version the Map type is still experimental and must be enabled first
set allow_experimental_map_type = 1;
-- LowCardinality supports String, FixedString, Date, DateTime and numeric types;
-- it applies dictionary encoding, changing how data is stored and processed to speed up queries
create table if not exists ebs.mt_complex_type_table(
id UInt32 comment 'primary key',
name LowCardinality(String),
hobbies Array(Nullable(String)),
father Tuple(id UInt32,name String,phone String),
scores Map(String,UInt8)
) engine = MergeTree()
order by (id);

insert into ebs.mt_complex_type_table values
(1,'HenryChen',array('money',null,'tech'),(2,'HuanChen','18236985474'),{'java':69, 'mysql':78}),
(2,'李博拉',array('rubber','oldmovie'),(3,'ZhangrenLi','13951806855'),{'english':99, 'math':66});

-- array length: hobbies.size0 (size0 for the outermost dimension, up to sizeN-1 for nested ones)
-- array subscript: hobbies[1]; indexes start at 1
-- Map access: scores['java']; a missing key returns the default value 0
select
id,name,
hobbies.size0,
hobbies[2],
father.id,
father.name,
father.phone,
scores['java']
from ebs.mt_complex_type_table;
```
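
Arrays and Maps can also be unfolded row-wise; a small sketch against the same table using the built-in arrayJoin, mapKeys and mapValues functions:

```sql
-- one output row per array element (NULL elements included)
select id, name, arrayJoin(hobbies) as hobby
from ebs.mt_complex_type_table;

-- list the keys and values stored in the Map column
select id, mapKeys(scores) as subjects, mapValues(scores) as marks
from ebs.mt_complex_type_table;
```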

(figure: clickhouse_mt_complex_type)

### AggregatingMergeTree

A table with an aggregated column on the aggregating engine: rows sharing the same order by key are combined when parts merge.

```sql
-- single-column aggregation
create table if not exists ebs.amt_agg_func_table(
id UInt32,
salary SimpleAggregateFunction(sum,Decimal(38,2))
) engine = AggregatingMergeTree()  -- the engine takes no parameters; the aggregation is declared on the column
order by (id);

insert into ebs.amt_agg_func_table values
(1,200),
(2,123),
(1,111),
(2,234),
(1,222);

-- rows with the same order by key are summed when parts merge
select * from ebs.amt_agg_func_table;
```
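
SimpleAggregateFunction stores plain values; for aggregates whose intermediate state is not a plain value (avg, uniq, ...) the full AggregateFunction type is used together with the -State/-Merge combinators. A minimal sketch (the table ebs.amt_state_table is made up for illustration):

```sql
create table if not exists ebs.amt_state_table(
    id UInt32,
    salary_state AggregateFunction(sum, Decimal(38,2))
) engine = AggregatingMergeTree()
order by (id);

-- AggregateFunction columns can only be written via insert-select with the -State combinator
insert into ebs.amt_state_table
select id, sumState(salary)
from (
    select 1 as id, toDecimal128(200, 2) as salary
    union all
    select 1 as id, toDecimal128(111, 2) as salary
)
group by id;

-- and read back with the matching -Merge combinator
select id, sumMerge(salary_state) as salary
from ebs.amt_state_table
group by id;
```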

(figure: clickhouse_amt)

### CollapsingMergeTree

An engine that supports row-level update and delete by "deleting through inserting": every row carries a sign column, and matching +1/-1 pairs cancel out at merge time.

```sql
create table if not exists ebs.collapse_mt_table(
id UInt32,
salary Decimal(38,2),
sign Int8
) engine = CollapsingMergeTree(sign)
order by (id);

select * from ebs.collapse_mt_table;

-- insert with sign=1: a row to keep
insert into ebs.collapse_mt_table values(1,200,1);
-- insert the same key with sign=-1: marks the row for deletion
insert into ebs.collapse_mt_table values(1,200,-1);
-- after the merge the pair cancels out: 1 + (-1) = 0 rows remain
optimize table ebs.collapse_mt_table final;
-- insert the same key again with sign=1: effectively an update
insert into ebs.collapse_mt_table values(1,320,1);
```
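
Merges run at their own pace, so queries should not rely on the collapse having already happened. The pattern recommended in the ClickHouse docs is to fold the sign into the aggregation:

```sql
-- correct results even while +1/-1 pairs are still unmerged
select id, sum(salary * sign) as salary
from ebs.collapse_mt_table
group by id
having sum(sign) > 0;
```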

### Index (secondary / data-skipping indexes)

```sql
-- GRANULARITY n: one skip-index entry covers n primary-index granules
-- (with the default index_granularity = 8192, GRANULARITY 1000 means one entry per 1000 * 8192 rows);
-- a larger granularity lowers index-maintenance cost on writes, a smaller one skips data more precisely
create table mt_second_index_table(
productId UInt32,
sku String,
price Decimal(16,2),
create_time DateTime64(3,'Asia/Shanghai'),
INDEX ix_sku sku TYPE minmax GRANULARITY 1000
)
engine=MergeTree()
partition by toYYYYMMDD(create_time)
primary key(productId)
order by(productId,sku);
```
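
Whether the skip index actually prunes granules can be checked with EXPLAIN; a sketch, assuming a server recent enough to support the indexes setting (the sku literal is made up):

```sql
-- shows, per index, how many parts/granules were filtered out
explain indexes = 1
select * from mt_second_index_table where sku = 'SKU-001';
```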

### TTL

- TTL can be set at table level or at column level
- the TTL expression must use a Date or DateTime column (not DateTime64) that is NOT a primary key column
- expired data is removed only when parts merge: wait for the automatic background merge or trigger one manually

```sql
-- column-level TTL
create table ebs.mt_ttl_column(
create_time DateTime,
id UInt32,
name String TTL create_time + INTERVAL 1 MINUTE,
test_name String
)
engine=MergeTree()
order by(id);

-- change a column-level TTL afterwards
alter table ebs.mt_ttl_column modify column test_name String TTL create_time + INTERVAL 1 MINUTE;

-- insert a row
insert into ebs.mt_ttl_column values(now(),1,'henry','chen');
-- after a minute has passed, trigger a merge manually (a small delay is possible)
optimize table ebs.mt_ttl_column final;
-- verify: the expired columns are reset to their defaults
select * from ebs.mt_ttl_column;

-- table-level TTL declared at create time
create table ebs.mt_ttl_table(
create_time DateTime,
id UInt32,
name String TTL create_time + INTERVAL 1 MINUTE,
test_name String
)
engine=MergeTree()
order by(id)
TTL create_time + INTERVAL 1 MINUTE;  -- table-level TTL: the whole row expires

-- change the table-level TTL afterwards
alter table ebs.mt_ttl_table modify TTL create_time + INTERVAL 1 MINUTE;

-- insert a row
insert into ebs.mt_ttl_table values(now(),1,'henry','chen');
-- after a minute has passed, trigger a merge manually (a small delay is possible)
optimize table ebs.mt_ttl_table final;
-- verify: the expired row is gone
select * from ebs.mt_ttl_table;
```
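
Table-level TTL defaults to DELETE; the same clause can also move aged data between storage tiers, and an existing table's TTL can be re-applied or removed. A sketch of the variants (the 'cold' volume name is hypothetical and must exist in the table's storage policy):

```sql
-- DELETE is the implicit default action:
--   TTL create_time + INTERVAL 1 MONTH DELETE
-- move aged parts to another volume instead of deleting ('cold' is a made-up volume name):
--   TTL create_time + INTERVAL 1 MONTH TO VOLUME 'cold'

-- re-evaluate TTL for data inserted before the TTL was changed
alter table ebs.mt_ttl_table materialize TTL;
-- drop the table-level TTL entirely
alter table ebs.mt_ttl_table remove TTL;
```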

(figure: clickhouse_ttl_after_compact)

### ReplacingMergeTree

- core behaviour: deduplication
- timing: duplicates are only removed when parts merge
- scope: a partitioned table deduplicates within each partition only, never across partitions

#### Example

`ReplacingMergeTree(create_time)`

- create_time acts as the version column: among duplicates the row with the largest version is kept; if versions are equal, the later-inserted row wins
- if the version column is omitted, the last row in insertion order is kept
- rows count as duplicates when they match on the order by column list within the same partition
- note: recent ClickHouse versions also deduplicate once within a partition at insert time

```sql
create table ebs.rep_mt_table(
id UInt32,
name String,
age UInt8,
salary Decimal(16,2),
create_time DateTime
)
engine=ReplacingMergeTree(create_time)  -- version column
partition by toYYYYMMDD(create_time)
primary key(id)
order by(id,name);

insert into ebs.rep_mt_table values
(1003,'henry',18,12345.67,'2024-08-07 08:12:13'),
(1003,'henry',22,23321.00,'2024-08-07 10:12:18'),
(1004,'ariel',16,18000.00,'2024-08-08 12:28:02'),
(1004,'jack',22,14568.25,'2024-08-08 15:08:21'),
(1004,'ariel',19,8800.20,'2024-08-07 18:38:15');

-- verify
select * from ebs.rep_mt_table;

insert into ebs.rep_mt_table values
(1004,'ariel',19,18800.20,'2024-08-07 18:38:15');

-- trigger a merge manually (a small delay is possible)
optimize table ebs.rep_mt_table final;
-- verify
select * from ebs.rep_mt_table;
```
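
Instead of forcing merges with optimize, a query can ask for merge-time semantics on the fly with the FINAL modifier (slower, but always deduplicated):

```sql
-- applies the ReplacingMergeTree collapse logic at query time
select * from ebs.rep_mt_table final;
```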

### SummingMergeTree

Pre-aggregated sums:

- within a partition, rows are pre-aggregated when parts merge
- rows are grouped by the order by column list, then summed
- the columns to sum can be specified explicitly and must be numeric
- if no column is specified, every numeric column outside the order by list is summed

#### Example

```sql
create table ebs.sum_mt_table(
id UInt32,
name String,
age UInt8,
salary Decimal(16,2),
create_time DateTime
)
engine=SummingMergeTree(salary)  -- column to sum
partition by toYYYYMMDD(create_time)
primary key(id)
order by(id);

insert into ebs.sum_mt_table values
(1003,'henry',18,12345.67,'2024-08-07 08:12:13'),
(1003,'henry',22,23321.00,'2024-08-07 10:12:18'),
(1004,'ariel',16,18000.00,'2024-08-08 12:28:02'),
(1004,'jack',22,14568.25,'2024-08-08 15:08:21'),
(1004,'ariel',19,8800.20,'2024-08-07 18:38:15');

-- trigger a merge manually (a small delay is possible)
optimize table ebs.sum_mt_table final;
select * from ebs.sum_mt_table;
```
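
As with the other pre-aggregating engines, merges are eventual, so reports should still aggregate explicitly rather than trust `select *`:

```sql
-- correct totals regardless of merge state
select id, sum(salary) as salary
from ebs.sum_mt_table
group by id;
```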

### Kafka

#### Example: CSV

```sql
-- ingestion table: don't query ods_tran_kafka directly; a materialized view consumes it
create table tran_kafka.ods_tran_kafka(
transaction_id UInt32,
store_id UInt16,
customer_id UInt16,
product_id UInt16,
price Decimal(16,2),
create_time DateTime
)
engine = Kafka()
settings
kafka_broker_list = 'master01:9092,master02:9092,worker01:9092',
kafka_topic_list = 'tran_csv',
kafka_group_name = 'kfk_ck_tran_csv_sync02',
kafka_client_id='kfk_ck_tran_csv_sync02_01',
kafka_format = 'CSV',
kafka_max_block_size = 20000,
kafka_skip_broken_messages = 100,
kafka_row_delimiter = '\n',
format_csv_delimiter = '|',
kafka_handle_error_mode= 'stream';

-- aggregation table
drop table if exists tran_kafka.smt_store_hour_sum_price;
create table tran_kafka.smt_store_hour_sum_price(
store_id UInt16,
tran_year UInt16,  -- toYear() returns UInt16; UInt8 cannot hold e.g. 2024
tran_month UInt8,
tran_day UInt8,
tran_hour UInt8,
hour_sum_amount SimpleAggregateFunction(sum,Decimal(38,2))
)
engine = SummingMergeTree(hour_sum_amount)
order by(store_id,tran_year,tran_month,tran_day,tran_hour)
partition by (tran_year,tran_month,tran_day);

-- materialized view: consumes the Kafka table and feeds the aggregation table
create materialized view tran_kafka.v_ods_tran_to_sum_store_minute
to tran_kafka.smt_store_hour_sum_price
as
with tran_time_dim as (
select
store_id,
toYear(create_time) as tran_year,
toMonth(create_time) as tran_month,
toDayOfMonth(create_time) as tran_day,
toHour(create_time) as tran_hour,
toMinute(create_time) as tran_minute,
price
from tran_kafka.ods_tran_kafka
)
select
store_id,tran_year,tran_month,tran_day,tran_hour,
sum(price) as hour_sum_amount
from tran_time_dim
group by
-- grouping down to the minute is fine: SummingMergeTree re-aggregates to its hour-level order by key on merge
store_id,tran_year,tran_month,tran_day,tran_hour,tran_minute;

-- inspect the data
select * from tran_kafka.smt_store_hour_sum_price;
-- to stop consuming the topic or to change the view logic, detach the materialized view:
detach table tran_kafka.v_ods_tran_to_sum_store_minute;
-- attach it again to resume consuming
attach table tran_kafka.v_ods_tran_to_sum_store_minute;
```
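
With kafka_handle_error_mode = 'stream' (as set above), messages that fail to parse are not dropped but exposed through the _raw_message and _error virtual columns, which a second materialized view can capture. A sketch (the error table and view names are made up):

```sql
-- sink for broken messages
create table tran_kafka.ods_tran_errors(
    topic String,
    raw String,
    error String
) engine = MergeTree()
order by tuple();

-- siphon off only the rows where parsing failed
create materialized view tran_kafka.v_ods_tran_errors
to tran_kafka.ods_tran_errors
as
select _topic as topic, _raw_message as raw, _error as error
from tran_kafka.ods_tran_kafka
where length(_error) > 0;
```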

#### kafka_format

Supported values (input/output support for each format is listed in the ClickHouse Formats documentation):

TabSeparated, TabSeparatedRaw, TabSeparatedWithNames, TabSeparatedWithNamesAndTypes,
Template, TemplateIgnoreSpaces, CSV, CSVWithNames, CustomSeparated, Values, Vertical,
JSON, JSONAsString, JSONStrings, JSONCompact, JSONCompactStrings, JSONEachRow,
JSONEachRowWithProgress, JSONStringsEachRow, JSONStringsEachRowWithProgress,
JSONCompactEachRow, JSONCompactEachRowWithNamesAndTypes, JSONCompactStringsEachRow,
JSONCompactStringsEachRowWithNamesAndTypes, TSKV, Pretty, PrettyCompact,
PrettyCompactMonoBlock, PrettyNoEscapes, PrettySpace, Protobuf, ProtobufSingle,
Avro, AvroConfluent, Parquet, Arrow, ArrowStream, ORC, RowBinary,
RowBinaryWithNamesAndTypes, Native, Null, XML, CapnProto, LineAsString, Regexp, RawBLOB
