1. 表操作
1.1 建表
CREATE TABLE [DATABASE].[TABLE_NAME] (
`id` BIGINT COMMENT '学号',
name STRING COMMENT '姓名',
ttl_score DOUBLE COMMENT '总分'
)
COMMENT '枝江中学五年级学生成绩明细表'
PARTITIONED BY (day STRING COMMENT 'day', hour STRING COMMENT 'hour')
ROW FORMAT [row_format]
STORED AS [file_format];
Note: 了解更多,请参考 CREATE TABLE with Hive format
1.2 删表
DROP TABLE [DATABASE].[TABLE_NAME];
DROP TABLE IF EXISTS [DATABASE].[TABLE_NAME];
1.3 在原表上新增列
ALTER TABLE [DATABASE].[TABLE_NAME] ADD COLUMNS (
sub_score ARRAY<DOUBLE> COMMENT '分科成绩',
sub_score_map MAP<STRING,DOUBLE> COMMENT '分科成绩map',
`rank` BIGINT COMMENT '年段排名',
update_time STRING COMMENT '更新时间'
);
1.4 查表结构
DESC [DATABASE].[TABLE_NAME];
1.5 建临时表
CREATE TEMPORARY TABLE std.std_temp_high_score AS
SELECT
name,
ttl_score
FROM [DATABASE].[TABLE_NAME]
WHERE ttl_score >= 10
AND LENGTH(name) > 1;
2. 构建测试用例
2.1 利用union all建多行测试用例
SELECT
`id`,
name,
ttl_score
FROM
(
SELECT 1 AS `id`, '小岁' AS name, 90.5 AS ttl_score
UNION ALL
SELECT 2 AS `id`, '大深' AS name, 91.5 AS ttl_score
UNION ALL
SELECT 3 AS `id`, '阿弥' AS name, 93 AS ttl_score
) t
2.2 后文用到的示例
为了方便后文说明,建两个可复用的测试用例_(┐「ε:)_
1)学生分数表:
CREATE TABLE std.std_student_score AS
SELECT
`id`,
name,
ttl_score
FROM
(
SELECT 1 AS `id`, '小岁' AS name, 89.5 AS ttl_score
UNION ALL
SELECT 2 AS `id`, '大深' AS name, 91.5 AS ttl_score
UNION ALL
SELECT 3 AS `id`, '阿弥' AS name, 93 AS ttl_score
) t
2)学生基础信息表:
CREATE TABLE std.std_student_info AS
SELECT
`id`,
gender,
age
FROM
(
SELECT 1 AS `id`, 'P' AS gender, 13 AS age
UNION ALL
SELECT 2 AS `id`, 'P' AS gender, 15 AS age
UNION ALL
SELECT 3 AS `id`, 'T' AS gender, 18 AS age
) t
3. 数据类型
3.1 常用的数据类型
数据类型 | 示例 | 说明 |
---|---|---|
BIGINT | 3 |
整型 |
DOUBLE | 3.5 |
双精度浮点型 |
STRING | Alpha |
字符串 |
ARRAY<BIGINT> | [1,2,3] |
列表,列表元素可以是BIGINT, DOUBLE, STRING等 |
MAP<BIGINT,STRING> | {1:'A+',2:'B'} |
字典,字典的key和value可以是BIGINT, DOUBLE, STRING等 |
3.2 数据类型转换
1)把分数从double转成string
SELECT
`id`,
CAST(ttl_score AS STRING) AS ttl_score
FROM std.std_student_score
2)分数除10,然后转为bigint,有取整效果
SELECT
`id`,
CAST(ttl_score/10 AS BIGINT) AS ttl_score
FROM std.std_student_score
3.3 string型数据的常用函数
1)替换字符串
SELECT
REGEXP_REPLACE("abcdeffecmn", "ef|cm", "%")
output: abcd%fe%n
2)替换符号:需使用 \\
转义
SELECT
REGEXP_REPLACE("abcd:effe(eee)cmn", "\\:|\\(|\\)", " ")
output: abcd effe eee cmn
3)取字符串的子串
SELECT
SUBSTR('abcedf', 3) AS sub_str
output: cedf
SELECT
SUBSTR('abcedf', 3, 2) AS sub_str
output: ce
SELECT
SUBSTR('abcedf', -3, 2) AS sub_str
output: ed
SELECT
SUBSTR('abcedf', -3, 5) AS sub_str
output: edf
3.4 double型数据的常用函数
1)保留小数点后n位有效数字: round函数
SELECT
ROUND(ratio, 3)
FROM
(
SELECT 1.232812 AS ratio
) t
output: 1.233
4. 行处理函数
Hive SQL中的函数大抵可以分为两种。一种是做行处理的,输入是一行,输出也是一行;另一种是做聚合的,输入是多行,但输出只有一行。
本节列举一些常用的行处理函数。
4.1 if函数
SELECT
`id`,
if(ttl_score > 90, 1, 0) AS is_high
FROM std.std_student_score
4.2 case函数
SELECT
`id`,
CASE WHEN ttl_score >= 90 THEN 'A+'
WHEN ttl_score >= 80 THEN 'A'
WHEN ttl_score >= 70 THEN 'B'
WHEN ttl_score >= 60 THEN 'C'
ELSE 'C-'
END AS `level`
FROM std.std_student_score
4.3 concat函数
SELECT
CONCAT('a', '-', 'b')
output: a-b
4.4 concat_ws函数
以下两种写法都可以:
SELECT
CONCAT_WS(',',
ARRAY('1','2','3')
)
SELECT
CONCAT_WS(',', '1', '2', '3')
output: 1,2,3
4.5 split函数
SELECT
SPLIT('1,2,3', ',')
output: ["1","2","3"]
5. 聚合
1)聚合语句group by,单独使用有去重的效果
SELECT gender
FROM std.std_student_info
GROUP BY gender
2)计算每个gender值对应的记录条数
SELECT
gender,
COUNT(*) AS cnt
FROM std.std_student_info
GROUP BY gender
6. 聚合函数
6.1 计算数据行数:count函数
SELECT COUNT(*) AS row_num
FROM std.std_student_info
6.2 对字段去重:distinct
SELECT COUNT(DISTINCT gender) AS gender_cnt
FROM std.std_student_info
6.3 求和、最大值、最小值、均值、方差
SELECT
gender,
SUM(age) AS sum_age, -- 求和
MAX(age) AS max_age, -- 最大值
MIN(age) AS min_age, -- 最小值
AVG(age) AS avg_age, -- 均值
VARIANCE(age) AS var_age -- 方差
FROM std.std_student_info
GROUP BY gender
7. join 方法
常用的 join 方法有:
方法 | 说明 |
---|---|
left join |
保留左表所有记录,保留右表与左表连接键相同的记录,连接键有多值时做笛卡尔积 |
inner join |
保留左表与右表连接键存在且相同的记录,连接键有多值时做笛卡尔积 |
full outer join |
保留左表与右表所有记录,连接键值相等时连接,连接键有多值时做笛卡尔积 |
left join示例:
SELECT
a.id,
ttl_score,
age
FROM
(
SELECT
id,
ttl_score
FROM std.std_student_score
) a
LEFT JOIN
(
SELECT
id,
age
FROM std.std_student_info
) b ON a.id = b.id;
8. array 相关函数
8.1 将来自多行的某字段值聚合成array: collect_list函数
SELECT
gender,
COLLECT_LIST(age) AS age_list
FROM std.std_student_info
GROUP BY gender
8.2 将来自多行的某字段值聚合成array并去重: collect_set函数
SELECT
gender,
COLLECT_SET(age) AS age_set
FROM std.std_student_info
GROUP BY gender
8.3 用分隔符拼接array并生成string
SELECT
gender,
CONCAT_WS(',',
COLLECT_SET(CAST(age AS STRING))
) AS age_set
FROM std.std_student_info
GROUP BY gender
8.4 判断array中是否包含某元素: array_contains函数
SELECT
id,
age_list
FROM
(
SELECT 1 AS id, array(1,2,3) AS age_list
UNION ALL
SELECT 2 AS id, array(6,7,8) AS age_list
) t
WHERE ARRAY_CONTAINS(age_list, 3)
8.5 元素排序: sort_array函数
SELECT
SORT_ARRAY(lst) AS sorted_lst
FROM
(
SELECT ARRAY(3,2,4,1) AS lst
) t
9. map 相关函数
9.1 将固定格式的string转成map: str_to_map函数
可以写成
SELECT
STR_TO_MAP(
CONCAT_WS(',',
CONCAT('key_1', ':', 'value_1'),
CONCAT('key_2', ':', 'value_2')
), ',', ':'
) AS my_map
或者
SELECT
STR_TO_MAP(
CONCAT_WS(',', ARRAY(
CONCAT('key_1', ':', 'value_1'),
CONCAT('key_2', ':', 'value_2')
)
), ',', ':'
) AS my_map
output: {“key_1”:“value_1”,“key_2”:“value_2”}
9.2 通过map的key读取value
SELECT
my_map['A']
FROM
(
SELECT
STR_TO_MAP(
CONCAT_WS(',', ARRAY(
CONCAT('A', ':', 'A1'),
CONCAT('B', ':', 'B1')
)
), ',', ':'
) AS my_map
) t
output: A1
9.3 提取map中的key,生成由key组成的array
📖 为了后文演示方便,这里创建一个通用示例:
CREATE TABLE std.std_level_map AS
SELECT
`id`,
sub_score
FROM
(
SELECT
1 AS `id`,
STR_TO_MAP(
CONCAT_WS(',',
CONCAT('语文:', '80'),
CONCAT('数学:', '85'),
CONCAT('英语:', '95'),
CONCAT('政治:', '90')
), ',', ':'
) AS sub_score
UNION ALL
SELECT
2 AS `id`,
STR_TO_MAP(
CONCAT_WS(',',
CONCAT('法律:', '90'),
CONCAT('英语:', '75'),
CONCAT('政治:', '92')
), ',', ':'
) AS sub_score
) t
使用map_keys函数将字典的key转为key的array
SELECT
`id`,
MAP_KEYS(sub_score) AS sub_list
FROM std.std_level_map
9.4 把map中的value转为value的array
与map_keys类似,用map_values可以把map的value转为value的array
SELECT
`id`,
MAP_VALUES(sub_score) AS score_list
FROM std.std_level_map
9.5 构造JSON
1)构造简单的JSON
SELECT
CONCAT('{"',
CONCAT_WS('","',
CONCAT('语文:', '80'),
CONCAT('数学:', '90')
), '"}'
) AS my_map
output: {"语文:80","数学:90"}
2)将map转换成JSON格式的string
SELECT
`id`,
CONCAT('{"',
CONCAT_WS('","',
COLLECT_SET(
CONCAT(k, '":"', v)
)
), '"}'
) AS my_map
FROM std.std_level_map
LATERAL VIEW OUTER EXPLODE(sub_score) e AS k, v
GROUP BY `id`
output:
id | my_map |
---|---|
1 | {"语文":"80","数学":"85","英语":"95","政治":"90"} |
2 | {"法律":"90","英语":"75","政治":"92"} |
3)构造形如ARRAY<MAP<STRING,STRING>>
的string
SELECT
CONCAT(
'[',
CONCAT_WS(',',
CONCAT('{"',
CONCAT_WS('","',
CONCAT('语文:', '80'),
CONCAT('数学:', '90')
), '"}'
)
), ']'
) AS my_map
output: [{"语文:80","数学:90"}]
4)基于map,构造形如ARRAY<MAP<STRING,STRING>>
的string
SELECT
IF(SUM(IF(ARRAY_CONTAINS(MAP_KEYS(sub_score), '英语') AND ARRAY_CONTAINS(MAP_KEYS(sub_score), '政治'), 1, 0)) = 0, '',
CONCAT('[',
CONCAT_WS(',',
COLLECT_SET(
CONCAT('{"',
CONCAT_WS('","',
CONCAT('英语', '":"', CAST(sub_score['英语'] AS STRING)),
CONCAT('政治', '":"', CAST(sub_score['政治'] AS STRING))
), '"}'
)
)
), ']'
)
) AS my_map
FROM std.std_level_map
output: [{"英语":"95","政治":"90"},{"英语":"75","政治":"92"}]
Note: 如果原MAP中不包含任何选定数据,应该返回空字符串,为了实现这个效果,需要在外层套一个IF函数来判断。常用的判断条件有:
ARRAY_CONTAINS(MAP_KEYS(sub_score), '[YOUR_KEY_HERE]')
SIZE(MAP_KEYS(sub_score)) > 0
5)利用跨行数据构造JSON
Note: 凡是有
CONCAT_WS
的地方,里面都可以套一层COLLECT_SET
或者COLLECT_LIST
做跨行处理。可以参考
2)
和4)
,这两条代码实际都有跨行处理能力
10. lateral view outer explode()
LATERAL VIEW OUTER EXPLODE()
用于把array或者map中的元素展成多行。该语句一般放在 FROM
之后,WHERE
之前。示例如下👇
1)将array展成多行
SELECT
`id`,
gid
FROM
(
SELECT
1 AS `id`,
ARRAY(1,2,3,5,5) AS group_id
UNION ALL
SELECT
2 AS `id`,
ARRAY(1,2) AS group_id
) t
LATERAL VIEW OUTER EXPLODE(group_id) g AS gid
WHERE id IN (1,3)
output:
id | gid |
---|---|
1 | 1 |
1 | 2 |
1 | 3 |
1 | 5 |
1 | 5 |
2)将array展成多行:一个更复杂的例子
SELECT
gid,
CONCAT_WS(',', COLLECT_SET(CAST(id AS STRING))) AS id_list
FROM
(
SELECT
1 AS `id`,
ARRAY(1,2,3,4,5,5) AS group_id
UNION ALL
SELECT
2 AS `id`,
ARRAY(2,4,4,3) AS group_id
) t
LATERAL VIEW OUTER EXPLODE(group_id) g AS gid
WHERE gid > 1
GROUP BY gid
output:
gid | id_list |
---|---|
2 | 1,2 |
3 | 1,2 |
4 | 1,2 |
5 | 1 |
3)展开多个array:id
与group_id
的各元素间做笛卡尔积
SELECT
`id`,
gid,
lvl
FROM
(
SELECT
1 AS `id`,
ARRAY(1,2) AS group_id,
ARRAY('A','B') AS level
) t
LATERAL VIEW OUTER EXPLODE(group_id) g AS gid
LATERAL VIEW OUTER EXPLODE(level) l AS lvl
output:
id | gid | lvl |
---|---|---|
1 | 1 | A |
1 | 1 | B |
1 | 2 | A |
1 | 2 | B |
4)展开map
Note: 请看 9. map 相关函数 中
9.5
的示例2)
。多个MAP也能像多个ARRAY一样展开,这里就不多演示了。
11. 窗口函数
窗口函数的处理效果,是输入n行,输出n行。但这不意味窗口函数是行处理的,它可以综合其他行的信息对当前行做运算。
1)找同班同学
- 输入是
id
(学号)和class
(班级) - 输出是
id
(学号)和class
(班级)和id_list
(同班同学学号列表)
SELECT
id,
class,
COLLECT_SET(id) OVER(PARTITION BY class) AS id_list
FROM
(
SELECT 1 AS id, 'A' AS class
UNION ALL
SELECT 2 AS id, 'A' AS class
UNION ALL
SELECT 3 AS id, 'B' AS class
) t
output:
id | class | id_list |
---|---|---|
1 | A | [1,2] |
2 | A | [1,2] |
3 | B | [3] |
Note: 常用的聚合函数,都可以替换本例中的
COLLECT_SET(id)
,比如:
SUM()
COUNT()
MAX()
等等……
2)为每个数据行生成序号: row_number函数
SELECT
id,
class,
score,
ROW_NUMBER() OVER(PARTITION BY class ORDER BY score DESC) AS class_no
FROM
(
SELECT 1 AS id, 'A' AS class, 90 AS score
UNION ALL
SELECT 2 AS id, 'A' AS class, 91 AS score
UNION ALL
SELECT 3 AS id, 'A' AS class, 91 AS score
UNION ALL
SELECT 4 AS id, 'B' AS class, 100 AS score
) t
output:
id | class | score | class_no |
---|---|---|---|
1 | A | 90 | 3 |
2 | A | 91 | 1 |
3 | A | 91 | 2 |
4 | B | 100 | 1 |
Note: 注意到虽然学生2和学生3分数都是91分,但他们的班级序号(class_no)不重叠。2号学生班级序号是1,3号学生班级序号是2
2)排名: rank函数
SELECT
id,
class,
score,
RANK() OVER(PARTITION BY class ORDER BY score DESC) AS class_rk
FROM
(
SELECT 1 AS id, 'A' AS class, 90 AS score
UNION ALL
SELECT 2 AS id, 'A' AS class, 91 AS score
UNION ALL
SELECT 3 AS id, 'A' AS class, 91 AS score
UNION ALL
SELECT 4 AS id, 'B' AS class, 100 AS score
) t
output:
id | class | score | class_rk |
---|---|---|---|
1 | A | 90 | 3 |
2 | A | 91 | 1 |
3 | A | 91 | 1 |
4 | B | 100 | 1 |
Note: 注意看,虽然学生2和学生3分数都是91分,但他们的班级排名(class_rk)是相同的。2号同学与3号同学并列第一,1号学生由于有2位同学排在他前面,直接变成第3名。
3)不计人头排名: dense_rank函数
DENSE_RANK()
是不计人头,纯计分数的排名方式。
上例中的1号同学,在考试中获得了90分,虽然按人头数计,有两位同学排在他前面,因此成为了第三名。但如果纯以分数计,他的90分是仅次于91分的班级第二好名次。如果我们希望不考虑人头数,仅考虑分数在排序中的位置,那么可以使用DENSE_RANK函数。
SELECT
id,
class,
score,
DENSE_RANK() OVER(PARTITION BY class ORDER BY score DESC) AS class_rk
FROM
(
SELECT 1 AS id, 'A' AS class, 90 AS score
UNION ALL
SELECT 2 AS id, 'A' AS class, 91 AS score
UNION ALL
SELECT 3 AS id, 'A' AS class, 91 AS score
UNION ALL
SELECT 4 AS id, 'B' AS class, 100 AS score
) t
output:
id | class | score | class_rk |
---|---|---|---|
1 | A | 90 | 2 |
2 | A | 91 | 1 |
3 | A | 91 | 1 |
4 | B | 100 | 1 |
12. 尾缀方法
1)对结果排序: order by
寻找分数TOP2的两位同学:
SELECT
id,
score
FROM
(
SELECT 1 AS id, 70 AS score
UNION ALL
SELECT 2 AS id, 60 AS score
UNION ALL
SELECT 3 AS id, 90 AS score
) t
ORDER BY score DESC
LIMIT 2
ORDER BY也常用于随机抽样:
ORDER BY RAND()
2)组筛选: having
HAVING语句能对GROUP BY的结果做筛选。以下SQL选出班级人数大于1的班级:
SELECT
class,
COUNT(DISTINCT id) AS id_cnt
FROM
(
SELECT 1 AS id, 'A' AS class
UNION ALL
SELECT 2 AS id, 'A' AS class
UNION ALL
SELECT 3 AS id, 'B' AS class
) t
GROUP BY class
HAVING id_cnt > 1
3)CLUSTER BY
, DISTRIBUTE BY
和 SORT BY
这三个语句是控制reducer的。DISTRIBUTE BY [YOUR_COLUMN_HERE]
用于控制数据如何进入reducer,比如:
SELECT
class,
COUNT(DISTINCT id) AS id_cnt
FROM
(
SELECT 1 AS id, 'A' AS class
UNION ALL
SELECT 2 AS id, 'A' AS class
UNION ALL
SELECT 3 AS id, 'B' AS class
) t
DISTRIBUTE BY class
如上,则拥有相同class值的记录,会进入同一个reducer。
SORT BY [YOUR_COLUMN_HERE]
用于控制reducer中数据的排序,比如:
SELECT
class,
COUNT(DISTINCT id) AS id_cnt
FROM
(
SELECT 1 AS id, 'A' AS class
UNION ALL
SELECT 2 AS id, 'A' AS class
UNION ALL
SELECT 3 AS id, 'B' AS class
) t
DISTRIBUTE BY class
SORT BY id
如上例,在每个reducer中,会用id字段做reducer内的排序。
对某字段使用 CLUSTER BY
,等于对该字段既使用DISTRIBUTE BY,又使用SORT BY,一种语法糖了属于是。
Note: 这三个语句日常不太常见,什么场景能用上呢?
举个例子,用户 (uid) 的购物车订单 (oid) 有时候会被拆成多个包裹,这时每个包裹会对应一个子单 (sub_oid)。这种情况下,一个订单就会对应多个子单。假设我们只知道
uid - oid
的关系与uid - sub_oid
的关系。如果已知 sub_oid 是 oid 后加下划线加子单单号,我们希望获取oid - sub_oid
的关系。这种情况就可以使用DISTRIBUTE BY uid
,这可以大幅节省算力。
13. transform
SQL部分:
SELECT TRANSFORM
-- PYTHON文件的输入
(
COL_1,
COL_2
)
USING '[PYTHON_ENV_PATH_HERE] [YOUR_PYTHON_FILE_HERE]' AS
-- PYTHON文件的输出,输出的行数和内容取决于PYTHON怎么写
(
OUTPUT_1,
OUTPUT_2,
OUTPUT_3
)
FROM
-- 数据源,会传给PYTHON文件的输入
(
SELECT
COL_1,
COL_2
FROM [DB_NAME].[SOURCE_TABLE]
WHERE [CONDITIONS]
DISTRIBUTE BY [YOUR_KEY]
) t
PYTHON部分:
import sys
class Processor:
@staticmethod
def add(lst):
nlst = list()
for col_1, col_2 in lst:
nlst.append((col_1, col_2, col_1+'-'+col_2))
return nlst
def main(self):
lst = list()
for line in sys.stdin:
col_1, col_2 = line.strip().split('\t')[:2]
lst.append((col_1, col_2))
for col_1, col_2, col_3 in self.add(lst):
print('{}\t{}\t{}'.format(col_1, col_2, col_3))
if __name__ == '__main__':
p = Processor()
p.main()
14. 空值处理
1)coalesce函数:接受多个参数。从左到右遍历参数,返回第一个遇到的非NULL
参数
SELECT
COALESCE(col_1, col_2, 0) AS res
FROM
(
SELECT NULL AS col_1, 10 AS col_2
UNION ALL
SELECT NULL AS col_1, NULL AS col_2
) t
output:
res |
---|
10 |
0 |
2)nvl函数:接受两个参数。如果第一个参数为NULL
,返回第二个参数,否则返回第一个参数
SELECT
NVL(col_1, col_2) AS res
FROM
(
SELECT NULL AS col_1, 10 AS col_2
UNION ALL
SELECT NULL AS col_1, NULL AS col_2
UNION ALL
SELECT 100 AS col_1, 200 AS col_2
) t
output:
res |
---|
10 |
NULL |
100 |
15. UDF
todo