SQL 还需要注释?SQL 会自己注释自己的,对吧
1. 表操作
1.1 建表
CREATE TABLE [DATABASE].[TABLE_NAME] (
`id` BIGINT COMMENT '学号',
name STRING COMMENT '姓名',
ttl_score DOUBLE COMMENT '总分'
)
COMMENT '枝江中学五年级学生成绩明细表'
PARTITIONED BY (day STRING COMMENT 'day', hour STRING COMMENT 'hour')
ROW FORMAT [row_format]
STORED AS [file_format];
Note: 了解更多,请参考 CREATE TABLE with Hive format
1.2 删表
DROP TABLE [DATABASE].[TABLE_NAME];
DROP TABLE IF EXISTS [DATABASE].[TABLE_NAME];
1.3 在原表上新增列
ALTER TABLE [DATABASE].[TABLE_NAME] ADD COLUMNS (
sub_score ARRAY<DOUBLE> COMMENT '分科成绩',
sub_score_map MAP<STRING,DOUBLE> COMMENT '分科成绩map',
`rank` BIGINT COMMENT '年段排名',
update_time STRING COMMENT '更新时间'
);
1.4 查表结构
DESC [DATABASE].[TABLE_NAME];
1.5 查询分区
CHECK PARTITIONS [DATABASE].[TABLE_NAME];
1.6 建临时表
CREATE TEMPORARY TABLE std.std_temp_high_score AS
SELECT
name,
ttl_score
FROM [DATABASE].[TABLE_NAME]
WHERE ttl_score >= 10
AND LENGTH(name) > 1;
1.7 本地 CSV 文件上传 Hive
LOAD DATA LOCAL INPATH '/home/user/data.csv' INTO TABLE my_table
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n';
2. 构建测试用例
2.1 利用union all建多行测试用例
SELECT
`id`,
name,
ttl_score
FROM
(
SELECT 1 AS `id`, '小岁' AS name, 90.5 AS ttl_score
UNION ALL
SELECT 2 AS `id`, '大深' AS name, 91.5 AS ttl_score
UNION ALL
SELECT 3 AS `id`, '阿弥' AS name, 93 AS ttl_score
) t
2.2 后文用到的示例
为了方便后文说明,建两个可复用的测试用例_(┐「ε:)_
1)学生分数表:
CREATE TABLE std.std_student_score AS
SELECT
`id`,
name,
ttl_score
FROM
(
SELECT 1 AS `id`, '小岁' AS name, 89.5 AS ttl_score
UNION ALL
SELECT 2 AS `id`, '大深' AS name, 91.5 AS ttl_score
UNION ALL
SELECT 3 AS `id`, '阿弥' AS name, 93 AS ttl_score
) t
2)学生基础信息表:
CREATE TABLE std.std_student_info AS
SELECT
`id`,
gender,
age
FROM
(
SELECT 1 AS `id`, 'P' AS gender, 13 AS age
UNION ALL
SELECT 2 AS `id`, 'P' AS gender, 15 AS age
UNION ALL
SELECT 3 AS `id`, 'T' AS gender, 18 AS age
) t
3. 数据类型
3.1 常用的数据类型
数据类型 | 示例 | 说明 |
---|---|---|
BIGINT | 3 |
整型 |
DOUBLE | 3.5 |
双精度浮点型 |
STRING | Alpha |
字符串 |
ARRAY<BIGINT> | [1,2,3] |
列表,列表元素可以是BIGINT, DOUBLE, STRING等 |
MAP<BIGINT,STRING> | {1:'A+',2:'B'} |
字典,字典的key和value可以是BIGINT, DOUBLE, STRING等 |
3.2 数据类型转换
1)把分数从double转成string
SELECT
`id`,
CAST(ttl_score AS STRING) AS ttl_score
FROM std.std_student_score
2)分数除10,然后转为bigint,有取整效果
SELECT
`id`,
CAST(ttl_score/10 AS BIGINT) AS ttl_score
FROM std.std_student_score
3.3 string型数据的常用函数
1)替换字符串
SELECT
REGEXP_REPLACE("abcdeffecmn", "ef|cm", "%")
output: abcd%fe%n
2)替换符号:需使用 \\
转义
SELECT
REGEXP_REPLACE("abcd:effe(eee)cmn", "\\:|\\(|\\)", " ")
output: abcd effe eee cmn
3)取字符串的子串
SELECT
SUBSTR('abcedf', 3) AS sub_str
output: cedf
SELECT
SUBSTR('abcedf', 3, 2) AS sub_str
output: ce
SELECT
SUBSTR('abcedf', -3, 2) AS sub_str
output: ed
SELECT
SUBSTR('abcedf', -3, 5) AS sub_str
output: edf
4)找到特定字符在用英文逗号分隔的字符串列表中的位置
SELECT
FIND_IN_SET('a', 'c,b,a,d')
output: 3
SELECT
FIND_IN_SET('m', 'c,b,a,d')
output: 0
找不到会返回 0
所以它也可以通过是否为零来判断字符在列表中是否存在
3.4 double型数据的常用函数
1)保留小数点后n位有效数字: round函数
SELECT
ROUND(ratio, 3)
FROM
(
SELECT 1.232812 AS ratio
) t
output: 1.233
4. 行处理函数
HiveQL 中函数大抵可以分为两种。一种是做行处理的,输入是一行,输出也是一行;另一种是做聚合的,输入是多行,但输出只有一行。
本节列举一些常用的行处理函数。
4.1 if函数
SELECT
`id`,
if(ttl_score > 90, 1, 0) AS is_high
FROM std.std_student_score
4.2 case函数
SELECT
`id`,
CASE WHEN ttl_score >= 90 THEN 'A+'
WHEN ttl_score >= 80 THEN 'A'
WHEN ttl_score >= 70 THEN 'B'
WHEN ttl_score >= 60 THEN 'C'
ELSE 'C-'
END AS `level`
FROM std.std_student_score
4.3 concat函数
SELECT
CONCAT('a', '-', 'b')
output: a-b
4.4 concat_ws函数
以下两种写法都可以:
SELECT
CONCAT_WS(',',
ARRAY('1','2','3')
)
SELECT
CONCAT_WS(',', '1', '2', '3')
output: 1,2,3
4.5 split函数
SELECT
SPLIT('1,2,3', ',')
output: ["1","2","3"]
5. 聚合
1)聚合语句group by,单独使用有去重的效果
SELECT gender
FROM std.std_student_info
GROUP BY gender
2)计算每个gender值对应的记录条数
SELECT
gender,
COUNT(*) AS cnt
FROM std.std_student_info
GROUP BY gender
6. 聚合函数
6.1 计算数据行数:count函数
SELECT COUNT(*) AS row_num
FROM std.std_student_info
6.2 对字段去重:distinct
SELECT COUNT(DISTINCT gender) AS gender_cnt
FROM std.std_student_info
6.3 求和、最大值、最小值、均值、方差
SELECT
gender,
SUM(age) AS sum_age, -- 求和
MAX(age) AS max_age, -- 最大值
MIN(age) AS min_age, -- 最小值
AVG(age) AS avg_age, -- 均值
VARIANCE(age) AS var_age -- 方差
FROM std.std_student_info
GROUP BY gender
7. join 方法
常用的 join 方法有:
方法 | 说明 |
---|---|
left join |
保留左表所有记录,保留右表与左表连接键相同的记录,连接键有多值时做笛卡尔积 |
inner join |
保留左表与右表连接键存在且相同的记录,连接键有多值时做笛卡尔积 |
full outer join |
保留左表与右表所有记录,连接键值相等时连接,连接键有多值时做笛卡尔积 |
left join示例:
SELECT
a.id,
ttl_score,
age
FROM
(
SELECT
id,
ttl_score
FROM std.std_student_score
) a
LEFT JOIN
(
SELECT
id,
age
FROM std.std_student_info
) b ON a.id = b.id;
8. array 相关函数
8.1 将来自多行的某字段值聚合成array: collect_list函数
SELECT
gender,
COLLECT_LIST(age) AS age_list
FROM std.std_student_info
GROUP BY gender
8.2 将来自多行的某字段值聚合成array并去重: collect_set函数
SELECT
gender,
COLLECT_SET(age) AS age_set
FROM std.std_student_info
GROUP BY gender
8.3 用分隔符拼接array并生成string
SELECT
gender,
CONCAT_WS(',',
COLLECT_SET(CAST(age AS STRING))
) AS age_set
FROM std.std_student_info
GROUP BY gender
8.4 判断array中是否包含某元素: array_contains函数
SELECT
id,
age_list
FROM
(
SELECT 1 AS id, array(1,2,3) AS age_list
UNION ALL
SELECT 2 AS id, array(6,7,8) AS age_list
) t
WHERE ARRAY_CONTAINS(age_list, 3)
8.5 元素排序: sort_array函数
SELECT
SORT_ARRAY(lst) AS sorted_lst
FROM
(
SELECT ARRAY(3,2,4,1) AS lst
) t
9. map 相关函数
9.1 将固定格式的string转成map: str_to_map函数
可以写成
SELECT
STR_TO_MAP(
CONCAT_WS(',',
CONCAT('key_1', ':', 'value_1'),
CONCAT('key_2', ':', 'value_2')
), ',', ':'
) AS my_map
或者
SELECT
STR_TO_MAP(
CONCAT_WS(',', ARRAY(
CONCAT('key_1', ':', 'value_1'),
CONCAT('key_2', ':', 'value_2')
)
), ',', ':'
) AS my_map
output: {“key_1”:“value_1”,“key_2”:“value_2”}
9.2 通过map的key读取value
SELECT
my_map['A']
FROM
(
SELECT
STR_TO_MAP(
CONCAT_WS(',', ARRAY(
CONCAT('A', ':', 'A1'),
CONCAT('B', ':', 'B1')
)
), ',', ':'
) AS my_map
) t
output: A1
9.3 提取map中的key,生成由key组成的array
📖 为了后文演示方便,这里创建一个通用示例:
CREATE TABLE std.std_level_map AS
SELECT
`id`,
sub_score
FROM
(
SELECT
1 AS `id`,
STR_TO_MAP(
CONCAT_WS(',',
CONCAT('语文:', '80'),
CONCAT('数学:', '85'),
CONCAT('英语:', '95'),
CONCAT('政治:', '90')
), ',', ':'
) AS sub_score
UNION ALL
SELECT
2 AS `id`,
STR_TO_MAP(
CONCAT_WS(',',
CONCAT('法律:', '90'),
CONCAT('英语:', '75'),
CONCAT('政治:', '92')
), ',', ':'
) AS sub_score
) t
使用map_keys函数将字典的key转为key的array
SELECT
`id`,
MAP_KEYS(sub_score) AS sub_list
FROM std.std_level_map
9.4 把map中的value转为value的array
与map_keys类似,用map_values可以把map的value转为value的array
SELECT
`id`,
MAP_VALUES(sub_score) AS score_list
FROM std.std_level_map
9.5 构造JSON
1)构造简单的JSON
SELECT
CONCAT('{"',
CONCAT_WS('","',
CONCAT('语文:', '80'),
CONCAT('数学:', '90')
), '"}'
) AS my_map
output: {"语文:80","数学:90"}
2)将map转换成JSON格式的string
SELECT
`id`,
CONCAT('{"',
CONCAT_WS('","',
COLLECT_SET(
CONCAT(k, '":"', v)
)
), '"}'
) AS my_map
FROM std.std_level_map
LATERAL VIEW OUTER EXPLODE(sub_score) e AS k, v
GROUP BY `id`
output:
id | my_map |
---|---|
1 | {"语文":"80","数学":"85","英语":"95","政治":"90"} |
2 | {"法律":"90","英语":"75","政治":"92"} |
3)构造形如ARRAY<MAP<STRING,STRING>>
的string
SELECT
CONCAT(
'[',
CONCAT_WS(',',
CONCAT('{"',
CONCAT_WS('","',
CONCAT('语文:', '80'),
CONCAT('数学:', '90')
), '"}'
)
), ']'
) AS my_map
output: [{"语文:80","数学:90"}]
4)基于map,构造形如ARRAY<MAP<STRING,STRING>>
的string
SELECT
IF(SUM(IF(ARRAY_CONTAINS(MAP_KEYS(sub_score), '英语') AND ARRAY_CONTAINS(MAP_KEYS(sub_score), '政治'), 1, 0)) = 0, '',
CONCAT('[',
CONCAT_WS(',',
COLLECT_SET(
CONCAT('{"',
CONCAT_WS('","',
CONCAT('英语', '":"', CAST(sub_score['英语'] AS STRING)),
CONCAT('政治', '":"', CAST(sub_score['政治'] AS STRING))
), '"}'
)
)
), ']'
)
) AS my_map
FROM std.std_level_map
output: [{"英语":"95","政治":"90"},{"英语":"75","政治":"92"}]
Note: 如果原MAP中不包含任何选定数据,应该返回空字符串,为了实现这个效果,需要在外层套一个IF函数来判断。常用的判断条件有:
ARRAY_CONTAINS(MAP_KEYS(sub_score), '[YOUR_KEY_HERE]')
SIZE(MAP_KEYS(sub_score)) > 0
5)利用跨行数据构造JSON
Note: 凡是有
CONCAT_WS
的地方,里面都可以套一层COLLECT_SET
或者COLLECT_LIST
做跨行处理。可以参考
2)
和4)
,这两条代码实际都有跨行处理能力
10. lateral view outer explode()
LATERAL VIEW OUTER EXPLODE()
用于把array或者map中的元素展成多行。该语句一般放在 FROM
之后,WHERE
之前。示例如下👇
1)将array展成多行
SELECT
`id`,
gid
FROM
(
SELECT
1 AS `id`,
ARRAY(1,2,3,5,5) AS group_id
UNION ALL
SELECT
2 AS `id`,
ARRAY(1,2) AS group_id
) t
LATERAL VIEW OUTER EXPLODE(group_id) g AS gid
WHERE id IN (1,3)
output:
id | gid |
---|---|
1 | 1 |
1 | 2 |
1 | 3 |
1 | 5 |
1 | 5 |
2)将array展成多行:一个更复杂的例子
SELECT
gid,
CONCAT_WS(',', COLLECT_SET(CAST(id AS STRING))) AS id_list
FROM
(
SELECT
1 AS `id`,
ARRAY(1,2,3,4,5,5) AS group_id
UNION ALL
SELECT
2 AS `id`,
ARRAY(2,4,4,3) AS group_id
) t
LATERAL VIEW OUTER EXPLODE(group_id) g AS gid
WHERE gid > 1
GROUP BY gid
output:
gid | id_list |
---|---|
2 | 1,2 |
3 | 1,2 |
4 | 1,2 |
5 | 1 |
3)展开多个array:id
与group_id
的各元素间做笛卡尔积
SELECT
`id`,
gid,
lvl
FROM
(
SELECT
1 AS `id`,
ARRAY(1,2) AS group_id,
ARRAY('A','B') AS level
) t
LATERAL VIEW OUTER EXPLODE(group_id) g AS gid
LATERAL VIEW OUTER EXPLODE(level) l AS lvl
output:
id | gid | lvl |
---|---|---|
1 | 1 | A |
1 | 1 | B |
1 | 2 | A |
1 | 2 | B |
4)展开map
Note: 请看 9. map 相关函数 中
9.5
的示例2)
。多个MAP也能像多个ARRAY一样展开,这里就不多演示了。
11. 窗口函数
窗口函数的处理效果,是输入n行,输出n行。但这不意味窗口函数是行处理的,它可以综合其他行的信息对当前行做运算。
1)找同班同学
- 输入是
id
(学号)和class
(班级) - 输出是
id
(学号)和class
(班级)和id_list
(同班同学学号列表)
SELECT
id,
class,
COLLECT_SET(id) OVER(PARTITION BY class) AS id_list
FROM
(
SELECT 1 AS id, 'A' AS class
UNION ALL
SELECT 2 AS id, 'A' AS class
UNION ALL
SELECT 3 AS id, 'B' AS class
) t
output:
id | class | id_list |
---|---|---|
1 | A | [1,2] |
2 | A | [1,2] |
3 | B | [3] |
Note: 常用的聚合函数,都可以替换本例中的
COLLECT_SET(id)
,比如:
SUM()
COUNT()
MAX()
等等……
2)为每个数据行生成序号: row_number函数
SELECT
id,
class,
score,
ROW_NUMBER() OVER(PARTITION BY class ORDER BY score DESC) AS class_no
FROM
(
SELECT 1 AS id, 'A' AS class, 90 AS score
UNION ALL
SELECT 2 AS id, 'A' AS class, 91 AS score
UNION ALL
SELECT 3 AS id, 'A' AS class, 91 AS score
UNION ALL
SELECT 4 AS id, 'B' AS class, 100 AS score
) t
output:
id | class | score | class_no |
---|---|---|---|
1 | A | 90 | 3 |
2 | A | 91 | 1 |
3 | A | 91 | 2 |
4 | B | 100 | 1 |
Note: 注意到虽然学生2和学生3分数都是91分,但他们的班级序号(class_no)不重叠。2号学生班级序号是1,3号学生班级序号是2
2)排名: rank函数
SELECT
id,
class,
score,
RANK() OVER(PARTITION BY class ORDER BY score DESC) AS class_rk
FROM
(
SELECT 1 AS id, 'A' AS class, 90 AS score
UNION ALL
SELECT 2 AS id, 'A' AS class, 91 AS score
UNION ALL
SELECT 3 AS id, 'A' AS class, 91 AS score
UNION ALL
SELECT 4 AS id, 'B' AS class, 100 AS score
) t
output:
id | class | score | class_rk |
---|---|---|---|
1 | A | 90 | 3 |
2 | A | 91 | 1 |
3 | A | 91 | 1 |
4 | B | 100 | 1 |
Note: 注意看,虽然学生2和学生3分数都是91分,但他们的班级排名(class_rk)是相同的。2号同学与3号同学并列第一,1号学生由于有2位同学排在他前面,直接变成第3名。
3)不计人头排名: dense_rank函数
DENSE_RANK()
是不计人头,纯计分数的排名方式。
上例中的1号同学,在考试中获得了90分,虽然按人头数计,有两位同学排在他前面,因此成为了第三名。但如果纯以分数计,他的90分是仅次于91分的班级第二好名次。如果我们希望不考虑人头数,仅考虑分数在排序中的位置,那么可以使用DENSE_RANK函数。
SELECT
id,
class,
score,
DENSE_RANK() OVER(PARTITION BY class ORDER BY score DESC) AS class_rk
FROM
(
SELECT 1 AS id, 'A' AS class, 90 AS score
UNION ALL
SELECT 2 AS id, 'A' AS class, 91 AS score
UNION ALL
SELECT 3 AS id, 'A' AS class, 91 AS score
UNION ALL
SELECT 4 AS id, 'B' AS class, 100 AS score
) t
output:
id | class | score | class_rk |
---|---|---|---|
1 | A | 90 | 2 |
2 | A | 91 | 1 |
3 | A | 91 | 1 |
4 | B | 100 | 1 |
12. 尾缀方法
1)对结果排序: order by
寻找分数TOP2的两位同学:
SELECT
id,
score
FROM
(
SELECT 1 AS id, 70 AS score
UNION ALL
SELECT 2 AS id, 60 AS score
UNION ALL
SELECT 3 AS id, 90 AS score
) t
ORDER BY score DESC
LIMIT 2
ORDER BY也常用于随机抽样:
ORDER BY RAND()
2)组筛选: having
HAVING语句能对GROUP BY的结果做筛选。以下SQL选出班级人数大于1的班级:
SELECT
class,
COUNT(DISTINCT id) AS id_cnt
FROM
(
SELECT 1 AS id, 'A' AS class
UNION ALL
SELECT 2 AS id, 'A' AS class
UNION ALL
SELECT 3 AS id, 'B' AS class
) t
GROUP BY class
HAVING id_cnt > 1
3)CLUSTER BY
, DISTRIBUTE BY
和 SORT BY
这三个语句是控制reducer的。DISTRIBUTE BY [YOUR_COLUMN_HERE]
用于控制数据如何进入reducer,比如:
SELECT
class,
id
FROM
(
SELECT 1 AS id, 'A' AS class
UNION ALL
SELECT 2 AS id, 'A' AS class
UNION ALL
SELECT 3 AS id, 'B' AS class
) t
DISTRIBUTE BY class
如上,则拥有相同class值的记录,会进入同一个reducer。
SORT BY [YOUR_COLUMN_HERE]
用于控制reducer中数据的排序,比如:
SELECT
class,
id
FROM
(
SELECT 1 AS id, 'A' AS class
UNION ALL
SELECT 2 AS id, 'A' AS class
UNION ALL
SELECT 3 AS id, 'B' AS class
) t
DISTRIBUTE BY class
SORT BY id
如上例,在每个reducer中,会用id字段做reducer内的排序。
对某字段使用 CLUSTER BY
,等于对该字段既使用DISTRIBUTE BY,又使用SORT BY,一种语法糖了属于是。
Note: 这三个语句日常不太常见,什么场景能用上呢?
举个例子,用户 (uid) 的购物车订单 (oid) 有时候会被拆成多个包裹,这时每个包裹会对应一个子单 (sub_oid)。这种情况下,一个订单就会对应多个子单。假设我们只知道
uid - oid
的关系与uid - sub_oid
的关系。如果已知 sub_oid 是 oid 后加下划线加子单单号,我们希望获取oid - sub_oid
的关系。这种情况就可以使用DISTRIBUTE BY uid
,这可以大幅节省算力。
13. transform
SQL部分:
SELECT TRANSFORM
-- PYTHON文件的输入
(
COL_1,
COL_2
)
USING '[PYTHON_ENV_PATH_HERE] [YOUR_PYTHON_FILE_HERE]' AS
-- PYTHON文件的输出,输出的行数和内容取决于PYTHON怎么写
(
OUTPUT_1,
OUTPUT_2,
OUTPUT_3
)
FROM
-- 数据源,会传给PYTHON文件的输入
(
SELECT
COL_1,
COL_2
FROM [DB_NAME].[SOURCE_TABLE]
WHERE [CONDITIONS]
DISTRIBUTE BY [YOUR_KEY]
) t
PYTHON部分:
import sys
class Processor:
@staticmethod
def add(lst):
nlst = list()
for col_1, col_2 in lst:
nlst.append((col_1, col_2, col_1+'-'+col_2))
return nlst
def main(self):
lst = list()
for line in sys.stdin:
col_1, col_2 = line.strip().split('\t')[:2]
lst.append((col_1, col_2))
for col_1, col_2, col_3 in self.add(lst):
print('{}\t{}\t{}'.format(col_1, col_2, col_3))
if __name__ == '__main__':
p = Processor()
p.main()
14. 空值处理
1)coalesce函数:接受多个参数。从左到右遍历参数,返回第一个遇到的非NULL
参数
SELECT
COALESCE(col_1, col_2, 0) AS res
FROM
(
SELECT NULL AS col_1, 10 AS col_2
UNION ALL
SELECT NULL AS col_1, NULL AS col_2
) t
output:
res |
---|
10 |
0 |
2)nvl函数:接受两个参数。如果第一个参数为NULL
,返回第二个参数,否则返回第一个参数
SELECT
NVL(col_1, col_2) AS res
FROM
(
SELECT NULL AS col_1, 10 AS col_2
UNION ALL
SELECT NULL AS col_1, NULL AS col_2
UNION ALL
SELECT 100 AS col_1, 200 AS col_2
) t
output:
res |
---|
10 |
NULL |
100 |
15.【场景1】同数据行跨字段计算最大/最小值
原始用法:
SELECT
GREATEST(3,1,2) AS greatest_num
LEAST(6,5,7) AS least_num
output:
greatest_num | least_num |
---|---|
3 | 5 |
场景:计算一天中最贵/最便宜的一餐
SELECT
LEAST(breakfast, lunch, dinner) AS cheapest_meal,
GREATEST(breakfast, lunch, dinner) AS expensive_meal
FROM
(
SELECT 3 AS breakfast, 25 AS lunch, 15 AS dinner
UNION ALL
SELECT 8 AS breakfast, 0 AS lunch, 20 AS dinner
UNION ALL
SELECT 0 AS breakfast, 35 AS lunch, 22 AS dinner
) t
output:
cheapest_meal | expensive_meal |
---|---|
3 | 25 |
0 | 20 |
0 | 35 |
16.【场景2】计算分位数
场景:计算学生分数的95分位数
SELECT
ttl_score,
quantile,
distance,
ROUND(quantile, 3) AS quantile_round, -- 精确到小数点后三位便于展示
ROUND(distance, 3) AS distance_round
FROM
(
SELECT
ttl_score,
id_cnt_accumulation / all_cnt AS quantile, -- 实际的分位数值
ABS((id_cnt_accumulation / all_cnt) - 0.05) AS distance -- 与标准95分位数的距离
FROM
(
SELECT
ttl_score,
SUM(id_cnt) OVER(ORDER BY ttl_score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS id_cnt_accumulation,
SUM(id_cnt) OVER() AS all_cnt
FROM
(
SELECT
ttl_score,
COUNT(DISTINCT `id`) AS id_cnt
FROM std.std_student_score
GROUP BY ttl_score
) t1
) t2
) t3
ORDER BY distance
LIMIT 1
场景:计算多组学生分数的分位数:25/50/75分位数
SELECT
ttl_score,
quantile,
ROUND(quantile, 3) AS quantile_round, -- 精确到小数点后三位便于展示
CONCAT_WS(',',
IF(rk_25 = 1, '25', null),
IF(rk_50 = 1, '50', null),
IF(rk_75 = 1, '75', null)
) AS prompt, -- 提示是哪一种分位数,可能存在同一个ttl_score值为多个分位数的情况
distance_25,
distance_50,
distance_75
FROM
(
SELECT
ttl_score,
quantile,
ABS(quantile - 0.25) AS distance_25,
ABS(quantile - 0.5) AS distance_50,
ABS(quantile - 0.75) AS distance_75,
row_number() over(order by ABS(quantile - 0.25)) AS rk_25,
row_number() over(order by ABS(quantile - 0.5)) AS rk_50,
row_number() over(order by ABS(quantile - 0.75)) AS rk_75
FROM
(
SELECT
ttl_score,
id_cnt_accumulation / all_cnt AS quantile -- 实际的分位数值
FROM
(
SELECT
ttl_score,
SUM(id_cnt) OVER(ORDER BY ttl_score ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS id_cnt_accumulation,
SUM(id_cnt) OVER() AS all_cnt
FROM
(
SELECT
ttl_score,
COUNT(DISTINCT `id`) AS id_cnt
FROM std.std_student_score
GROUP BY ttl_score
) t1
) t2
) t3
) t4
WHERE rk_25 = 1
OR rk_50 = 1
OR rk_75 = 1;
17. UDF
todo
18. 命令行
hive -e "[SQL]"
: 在命令行执行 SQL 语句hive -f [PATH/TO/SQL/FILE]
: 先写好 XXX.SQL
文件,再从命令行执行hive -f [PATH/TO/SQL/FILE] > [LOCAL_FILE].csv
下载数据到本地hive -e "set hive.metastore.uris"
查看hive服务的ip和端口
附录
- Hive CLI 工具:Beeline
- 在线练习工具: hue
- Hive UDF 库: brickhouse