# ClickHouse DDL

[TOC]

Fixes for connecting IntelliJ IDEA to ClickHouse:

- Driver fails to download automatically: add the driver jar manually from a local file.
- Error that the LZ4 compression algorithm is not supported: in
  Data Source & Drivers => Advanced => compress_algorithm, change the value from lz4 to gzip.

## create database

```sql
-- Atomic is the default database engine in recent versions; the engine clause is optional
create database ebs [engine = Atomic];
```
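
To confirm which engine the database actually got (Atomic has been the default in recent releases), a quick check:

```sql
-- prints the full DDL, including the engine
show create database ebs;
```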

## create table

### MergeTree

The base engine of the MergeTree family; its sparse primary index uses the default index_granularity of 8192 rows.

#### Simple-typed table

```sql
drop table if exists ebs.mt_simple_type_table;
create table if not exists ebs.mt_simple_type_table(
id UInt32 comment 'primary key',
salary Decimal(16,2),
isMember UInt8,
name String,
birthTime DateTime64(3,'Asia/Shanghai'),
birthPlace String,
gender Enum8('男'=0,'女'=1)
) engine = MergeTree()
order by (id,name);

insert into ebs.mt_simple_type_table values
(1,12458.00,0,'张丰','1995-10-30 22:10:32.888','{"province":"江苏","city":"南京"}','男'),
(2,32178.00,0,'鹰王','1986-05-24 09:10:25.123','{"province":"江苏","city":"镇江"}','男'),
(3,20288.00,0,'赵敏','1945-11-13 22:25:14.204','{"province":"安徽","city":"合肥"}','女');
```

#### Partitioned table

```sql
create table if not exists ebs.mt_simple_type_table_par(
id UInt32 comment 'primary key',
salary Decimal(16,2),
isMember UInt8,
name String,
birthTime DateTime64(3,'Asia/Shanghai') comment 'partition column',
birthPlace String,
gender Enum8('男'=0,'女'=1)
) engine = MergeTree()
order by (id,name)
partition by (toYear(birthTime))
primary key (id);

insert into ebs.mt_simple_type_table_par values
(1,12458.00,0,'张丰','1995-10-30 22:10:32.888','{"province":"江苏","city":"南京"}','男'),
(2,32178.00,0,'鹰王','1995-05-24 09:10:25.123','{"province":"江苏","city":"镇江"}','男'),
(3,20288.00,0,'赵敏','1945-11-13 22:25:14.204','{"province":"安徽","city":"合肥"}','女');
-- query the data to see the logical partitions;
-- inspect /var/lib/clickhouse/... on the server to see the physical parts
select * from ebs.mt_simple_type_table_par;

-- insert one more row on its own
insert into ebs.mt_simple_type_table_par values
(1,12458.00,0,'张山','1945-10-30 22:10:32.888','{"province":"江苏","city":"南京"}','男');
-- querying again shows two parts for 1945: the new row sits in its own
-- temporary part until the background merge (compaction) runs
select * from ebs.mt_simple_type_table_par;
-- manually trigger a merge of partition 1945
optimize table ebs.mt_simple_type_table_par partition 1945 final;
-- query again: the two 1945 parts have been merged into one
```

(figure: clickhouse_mt_partition)

Notes:

1. The primary key is optional and NOT unique; it is used to build the sparse primary index.
2. partition by is optional:
   - data is split into separate directories, avoiding full-table scans
   - partitions determine the degree of parallelism
   - batch inserts land in new temporary parts that are merged automatically (roughly every 15 minutes);
     `optimize table TABLE_NAME final` forces a whole-table merge, and
     `optimize table TABLE_NAME partition 'PARTITION_VALUE' final` forces a single-partition merge
     (to watch this happen, see the system.parts query after this list)
3. order by is required:
   - a table without partition by uses a single global partition
   - order by determines how rows are arranged within each partition
   - it is the basis for deduplication
   - if the table declares a primary key, order by must start with the primary key columns as its prefix
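
A minimal way to watch parts being created and merged is the standard system.parts table (database and table names here match the example above):

```sql
-- one row per on-disk part; active = 0 marks parts superseded by a merge
select partition, name, rows, active
from system.parts
where database = 'ebs' and table = 'mt_simple_type_table_par'
order by partition, name;
```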

(figure: clickhouse_mt_partition_detail)

#### Complex-typed table
29
```sql
-- on this version the Map type is still experimental and must be enabled first
set allow_experimental_map_type = 1;
-- LowCardinality supports String, FixedString, Date, DateTime and numeric types;
-- it applies dictionary encoding, changing how data is stored and processed to speed up queries
create table if not exists ebs.mt_complex_type_table(
id UInt32 comment 'primary key',
name LowCardinality(String),
hobbies Array(Nullable(String)),
father Tuple(id UInt32,name String,phone String),
scores Map(String,UInt8)
) engine = MergeTree()
order by (id);

insert into ebs.mt_complex_type_table values
(1,'HenryChen',array('money',null,'tech'),(2,'HuanChen','18236985474'),{'java':69, 'mysql':78}),
(2,'李博拉',array('rubber','oldmovie'),(3,'ZhangrenLi','13951806855'),{'english':99, 'math':66});

-- array length: hobbies.size0 (size0 for the outermost dimension, up to sizeN-1 for nested ones)
-- array subscript: hobbies[1]; indexes start at 1
-- Map access: scores['java']; a missing key returns the default value 0
select
id,name,
hobbies.size0,
hobbies[2],
father.id,
father.name,
father.phone,
scores['java']
from ebs.mt_complex_type_table;
```
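
Arrays and Maps can also be unfolded row-wise; a small sketch against the same table using the built-in arrayJoin, mapKeys and mapValues functions:

```sql
-- one output row per array element (NULL elements included)
select id, name, arrayJoin(hobbies) as hobby
from ebs.mt_complex_type_table;

-- list the keys and values stored in the Map column
select id, mapKeys(scores) as subjects, mapValues(scores) as marks
from ebs.mt_complex_type_table;
```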

(figure: clickhouse_mt_complex_type)

### AggregatingMergeTree

A table with an aggregated column on the aggregating engine: rows sharing the same order by key are combined when parts merge.

```sql
-- single-column aggregation
create table if not exists ebs.amt_agg_func_table(
id UInt32,
salary SimpleAggregateFunction(sum,Decimal(38,2))
) engine = AggregatingMergeTree()  -- the engine takes no parameters; the aggregation is declared on the column
order by (id);

insert into ebs.amt_agg_func_table values
(1,200),
(2,123),
(1,111),
(2,234),
(1,222);

-- rows with the same order by key are summed when parts merge
select * from ebs.amt_agg_func_table;
```
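
SimpleAggregateFunction stores plain values; for aggregates whose intermediate state is not a plain value (avg, uniq, ...) the full AggregateFunction type is used together with the -State/-Merge combinators. A minimal sketch (the table ebs.amt_state_table is made up for illustration):

```sql
create table if not exists ebs.amt_state_table(
    id UInt32,
    salary_state AggregateFunction(sum, Decimal(38,2))
) engine = AggregatingMergeTree()
order by (id);

-- AggregateFunction columns can only be written via insert-select with the -State combinator
insert into ebs.amt_state_table
select id, sumState(salary)
from (
    select 1 as id, toDecimal128(200, 2) as salary
    union all
    select 1 as id, toDecimal128(111, 2) as salary
)
group by id;

-- and read back with the matching -Merge combinator
select id, sumMerge(salary_state) as salary
from ebs.amt_state_table
group by id;
```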

(figure: clickhouse_amt)

### CollapsingMergeTree

An engine that supports row-level update and delete by "deleting through inserting": every row carries a sign column, and matching +1/-1 pairs cancel out at merge time.

```sql
create table if not exists ebs.collapse_mt_table(
id UInt32,
salary Decimal(38,2),
sign Int8
) engine = CollapsingMergeTree(sign)
order by (id);

select * from ebs.collapse_mt_table;

-- insert with sign=1: a row to keep
insert into ebs.collapse_mt_table values(1,200,1);
-- insert the same key with sign=-1: marks the row for deletion
insert into ebs.collapse_mt_table values(1,200,-1);
-- after the merge the pair cancels out: 1 + (-1) = 0 rows remain
optimize table ebs.collapse_mt_table final;
-- insert the same key again with sign=1: effectively an update
insert into ebs.collapse_mt_table values(1,320,1);
```
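
Merges run at their own pace, so queries should not rely on the collapse having already happened. The pattern recommended in the ClickHouse docs is to fold the sign into the aggregation:

```sql
-- correct results even while +1/-1 pairs are still unmerged
select id, sum(salary * sign) as salary
from ebs.collapse_mt_table
group by id
having sum(sign) > 0;
```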

### Index (secondary / data-skipping indexes)

```sql
-- GRANULARITY n: one skip-index entry covers n primary-index granules
-- (with the default index_granularity = 8192, GRANULARITY 1000 means one entry per 1000 * 8192 rows);
-- a larger granularity lowers index-maintenance cost on writes, a smaller one skips data more precisely
create table mt_second_index_table(
productId UInt32,
sku String,
price Decimal(16,2),
create_time DateTime64(3,'Asia/Shanghai'),
INDEX ix_sku sku TYPE minmax GRANULARITY 1000
)
engine=MergeTree()
partition by toYYYYMMDD(create_time)
primary key(productId)
order by(productId,sku);
```
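
Whether the skip index actually prunes granules can be checked with EXPLAIN; a sketch, assuming a server recent enough to support the indexes setting (the sku literal is made up):

```sql
-- shows, per index, how many parts/granules were filtered out
explain indexes = 1
select * from mt_second_index_table where sku = 'SKU-001';
```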

### TTL

- TTL can be set at table level or at column level
- the TTL expression must use a Date or DateTime column (not DateTime64) that is NOT a primary key column
- expired data is removed only when parts merge: wait for the automatic background merge or trigger one manually

```sql
-- column-level TTL
create table ebs.mt_ttl_column(
create_time DateTime,
id UInt32,
name String TTL create_time + INTERVAL 1 MINUTE,
test_name String
)
engine=MergeTree()
order by(id);

-- change a column-level TTL afterwards
alter table ebs.mt_ttl_column modify column test_name String TTL create_time + INTERVAL 1 MINUTE;

-- insert a row
insert into ebs.mt_ttl_column values(now(),1,'henry','chen');
-- after a minute has passed, trigger a merge manually (a small delay is possible)
optimize table ebs.mt_ttl_column final;
-- verify: the expired columns are reset to their defaults
select * from ebs.mt_ttl_column;

-- table-level TTL declared at create time
create table ebs.mt_ttl_table(
create_time DateTime,
id UInt32,
name String TTL create_time + INTERVAL 1 MINUTE,
test_name String
)
engine=MergeTree()
order by(id)
TTL create_time + INTERVAL 1 MINUTE;  -- table-level TTL: the whole row expires

-- change the table-level TTL afterwards
alter table ebs.mt_ttl_table modify TTL create_time + INTERVAL 1 MINUTE;

-- insert a row
insert into ebs.mt_ttl_table values(now(),1,'henry','chen');
-- after a minute has passed, trigger a merge manually (a small delay is possible)
optimize table ebs.mt_ttl_table final;
-- verify: the expired row is gone
select * from ebs.mt_ttl_table;
```
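
Table-level TTL defaults to DELETE; the same clause can also move aged data between storage tiers, and an existing table's TTL can be re-applied or removed. A sketch of the variants (the 'cold' volume name is hypothetical and must exist in the table's storage policy):

```sql
-- DELETE is the implicit default action:
--   TTL create_time + INTERVAL 1 MONTH DELETE
-- move aged parts to another volume instead of deleting ('cold' is a made-up volume name):
--   TTL create_time + INTERVAL 1 MONTH TO VOLUME 'cold'

-- re-evaluate TTL for data inserted before the TTL was changed
alter table ebs.mt_ttl_table materialize TTL;
-- drop the table-level TTL entirely
alter table ebs.mt_ttl_table remove TTL;
```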

(figure: clickhouse_ttl_after_compact)

### ReplacingMergeTree

- core behaviour: deduplication
- timing: duplicates are only removed when parts merge
- scope: a partitioned table deduplicates within each partition only, never across partitions

#### Example

`ReplacingMergeTree(create_time)`

- create_time acts as the version column: among duplicates the row with the largest version is kept; if versions are equal, the later-inserted row wins
- if the version column is omitted, the last row in insertion order is kept
- rows count as duplicates when they match on the order by column list within the same partition
- note: recent ClickHouse versions also deduplicate once within a partition at insert time

```sql
create table ebs.rep_mt_table(
id UInt32,
name String,
age UInt8,
salary Decimal(16,2),
create_time DateTime
)
engine=ReplacingMergeTree(create_time)  -- version column
partition by toYYYYMMDD(create_time)
primary key(id)
order by(id,name);

insert into ebs.rep_mt_table values
(1003,'henry',18,12345.67,'2024-08-07 08:12:13'),
(1003,'henry',22,23321.00,'2024-08-07 10:12:18'),
(1004,'ariel',16,18000.00,'2024-08-08 12:28:02'),
(1004,'jack',22,14568.25,'2024-08-08 15:08:21'),
(1004,'ariel',19,8800.20,'2024-08-07 18:38:15');

-- verify
select * from ebs.rep_mt_table;

insert into ebs.rep_mt_table values
(1004,'ariel',19,18800.20,'2024-08-07 18:38:15');

-- trigger a merge manually (a small delay is possible)
optimize table ebs.rep_mt_table final;
-- verify
select * from ebs.rep_mt_table;
```
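
Instead of forcing merges with optimize, a query can ask for merge-time semantics on the fly with the FINAL modifier (slower, but always deduplicated):

```sql
-- applies the ReplacingMergeTree collapse logic at query time
select * from ebs.rep_mt_table final;
```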

### SummingMergeTree

Pre-aggregated sums:

- within a partition, rows are pre-aggregated when parts merge
- rows are grouped by the order by column list, then summed
- the columns to sum can be specified explicitly and must be numeric
- if no column is specified, every numeric column outside the order by list is summed

#### Example

```sql
create table ebs.sum_mt_table(
id UInt32,
name String,
age UInt8,
salary Decimal(16,2),
create_time DateTime
)
engine=SummingMergeTree(salary)  -- column to sum
partition by toYYYYMMDD(create_time)
primary key(id)
order by(id);

insert into ebs.sum_mt_table values
(1003,'henry',18,12345.67,'2024-08-07 08:12:13'),
(1003,'henry',22,23321.00,'2024-08-07 10:12:18'),
(1004,'ariel',16,18000.00,'2024-08-08 12:28:02'),
(1004,'jack',22,14568.25,'2024-08-08 15:08:21'),
(1004,'ariel',19,8800.20,'2024-08-07 18:38:15');

-- trigger a merge manually (a small delay is possible)
optimize table ebs.sum_mt_table final;
select * from ebs.sum_mt_table;
```
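
As with the other pre-aggregating engines, merges are eventual, so reports should still aggregate explicitly rather than trust `select *`:

```sql
-- correct totals regardless of merge state
select id, sum(salary) as salary
from ebs.sum_mt_table
group by id;
```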

### Kafka

#### Example: CSV

```sql
-- ingestion table: don't query ods_tran_kafka directly; a materialized view consumes it
create table tran_kafka.ods_tran_kafka(
transaction_id UInt32,
store_id UInt16,
customer_id UInt16,
product_id UInt16,
price Decimal(16,2),
create_time DateTime
)
engine = Kafka()
settings
kafka_broker_list = 'master01:9092,master02:9092,worker01:9092',
kafka_topic_list = 'tran_csv',
kafka_group_name = 'kfk_ck_tran_csv_sync02',
kafka_client_id='kfk_ck_tran_csv_sync02_01',
kafka_format = 'CSV',
kafka_max_block_size = 20000,
kafka_skip_broken_messages = 100,
kafka_row_delimiter = '\n',
format_csv_delimiter = '|',
kafka_handle_error_mode= 'stream';

-- aggregation table
drop table if exists tran_kafka.smt_store_hour_sum_price;
create table tran_kafka.smt_store_hour_sum_price(
store_id UInt16,
tran_year UInt16,  -- toYear() returns UInt16; UInt8 cannot hold e.g. 2024
tran_month UInt8,
tran_day UInt8,
tran_hour UInt8,
hour_sum_amount SimpleAggregateFunction(sum,Decimal(38,2))
)
engine = SummingMergeTree(hour_sum_amount)
order by(store_id,tran_year,tran_month,tran_day,tran_hour)
partition by (tran_year,tran_month,tran_day);

-- materialized view: consumes the Kafka table and feeds the aggregation table
create materialized view tran_kafka.v_ods_tran_to_sum_store_minute
to tran_kafka.smt_store_hour_sum_price
as
with tran_time_dim as (
select
store_id,
toYear(create_time) as tran_year,
toMonth(create_time) as tran_month,
toDayOfMonth(create_time) as tran_day,
toHour(create_time) as tran_hour,
toMinute(create_time) as tran_minute,
price
from tran_kafka.ods_tran_kafka
)
select
store_id,tran_year,tran_month,tran_day,tran_hour,
sum(price) as hour_sum_amount
from tran_time_dim
group by
-- grouping down to the minute is fine: SummingMergeTree re-aggregates to its hour-level order by key on merge
store_id,tran_year,tran_month,tran_day,tran_hour,tran_minute;

-- inspect the data
select * from tran_kafka.smt_store_hour_sum_price;
-- to stop consuming the topic or to change the view logic, detach the materialized view:
detach table tran_kafka.v_ods_tran_to_sum_store_minute;
-- attach it again to resume consuming
attach table tran_kafka.v_ods_tran_to_sum_store_minute;
```
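
With kafka_handle_error_mode = 'stream' (as set above), messages that fail to parse are not dropped but exposed through the _raw_message and _error virtual columns, which a second materialized view can capture. A sketch (the error table and view names are made up):

```sql
-- sink for broken messages
create table tran_kafka.ods_tran_errors(
    topic String,
    raw String,
    error String
) engine = MergeTree()
order by tuple();

-- siphon off only the rows where parsing failed
create materialized view tran_kafka.v_ods_tran_errors
to tran_kafka.ods_tran_errors
as
select _topic as topic, _raw_message as raw, _error as error
from tran_kafka.ods_tran_kafka
where length(_error) > 0;
```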

#### kafka_format

Supported values (input/output support for each format is listed in the ClickHouse Formats documentation):

TabSeparated, TabSeparatedRaw, TabSeparatedWithNames, TabSeparatedWithNamesAndTypes,
Template, TemplateIgnoreSpaces, CSV, CSVWithNames, CustomSeparated, Values, Vertical,
JSON, JSONAsString, JSONStrings, JSONCompact, JSONCompactStrings, JSONEachRow,
JSONEachRowWithProgress, JSONStringsEachRow, JSONStringsEachRowWithProgress,
JSONCompactEachRow, JSONCompactEachRowWithNamesAndTypes, JSONCompactStringsEachRow,
JSONCompactStringsEachRowWithNamesAndTypes, TSKV, Pretty, PrettyCompact,
PrettyCompactMonoBlock, PrettyNoEscapes, PrettySpace, Protobuf, ProtobufSingle,
Avro, AvroConfluent, Parquet, Arrow, ArrowStream, ORC, RowBinary,
RowBinaryWithNamesAndTypes, Native, Null, XML, CapnProto, LineAsString, Regexp, RawBLOB
