Hive SQL 笔记

什么?SQL 还需要注释?SQL 会自己注释自己的,对吧

1. 表操作

1.1 建表

CREATE TABLE [DATABASE].[TABLE_NAME] (
    `id`    BIGINT    COMMENT '学号',
    name    STRING    COMMENT '姓名',
    ttl_score    DOUBLE    COMMENT '总分'
)
COMMENT '枝江中学五年级学生成绩明细表'
PARTITIONED BY (day STRING COMMENT 'day', hour STRING COMMENT 'hour')
ROW FORMAT [row_format]
STORED AS [file_format];

Note: 了解更多,请参考 CREATE TABLE with Hive format

1.2 删表

DROP TABLE [DATABASE].[TABLE_NAME];
DROP TABLE IF EXISTS [DATABASE].[TABLE_NAME];

1.3 在原表上新增列

ALTER TABLE [DATABASE].[TABLE_NAME] ADD COLUMNS (
    sub_score    ARRAY<DOUBLE>    COMMENT '分科成绩',
    sub_score_map    MAP<STRING,DOUBLE>    COMMENT '分科成绩map',
    `rank`    BIGINT    COMMENT '年段排名',
    update_time    STRING    COMMENT '更新时间'
);

1.4 查表结构

DESC [DATABASE].[TABLE_NAME];

1.5 查询分区

CHECK PARTITIONS [DATABASE].[TABLE_NAME];

1.6 建临时表

CREATE TEMPORARY TABLE std.std_temp_high_score AS
SELECT
    name,
    ttl_score
FROM [DATABASE].[TABLE_NAME]
WHERE ttl_score >= 10
    AND LENGTH(name) > 1;

1.7 本地 CSV 文件上传 Hive

LOAD DATA LOCAL INPATH '/home/user/data.csv' INTO TABLE my_table
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n';

2. 构建测试用例

2.1 利用union all建多行测试用例

SELECT
    `id`,
    name,
    ttl_score
FROM
(
    SELECT 1 AS `id`, '小岁' AS name, 90.5 AS ttl_score
    UNION ALL
    SELECT 2 AS `id`, '大深' AS name, 91.5 AS ttl_score
    UNION ALL
    SELECT 3 AS `id`, '阿弥' AS name, 93 AS ttl_score
) t

2.2 后文用到的示例

为了方便后文说明,建两个可复用的测试用例_(┐「ε:)_

1)学生分数表:

CREATE TABLE std.std_student_score AS
SELECT
    `id`,
    name,
    ttl_score
FROM
(
    SELECT 1 AS `id`, '小岁' AS name, 89.5 AS ttl_score
    UNION ALL
    SELECT 2 AS `id`, '大深' AS name, 91.5 AS ttl_score
    UNION ALL
    SELECT 3 AS `id`, '阿弥' AS name, 93 AS ttl_score
) t

2)学生基础信息表:

CREATE TABLE std.std_student_info AS
SELECT
    `id`,
    gender,
    age
FROM
(
    SELECT 1 AS `id`, 'P' AS gender, 13 AS age
    UNION ALL
    SELECT 2 AS `id`, 'P' AS gender, 15 AS age
    UNION ALL
    SELECT 3 AS `id`, 'T' AS gender, 18 AS age
) t

3. 数据类型

3.1 常用的数据类型

数据类型 示例 说明
BIGINT 3 整型
DOUBLE 3.5 双精度浮点型
STRING Alpha 字符串
ARRAY<BIGINT> [1,2,3] 列表,列表元素可以是BIGINT, DOUBLE, STRING等
MAP<BIGINT,STRING> {1:'A+',2:'B'} 字典,字典的key和value可以是BIGINT, DOUBLE, STRING等

3.2 数据类型转换

1)把分数从double转成string

SELECT
    `id`,
    CAST(ttl_score AS STRING) AS ttl_score
FROM std.std_student_score

2)分数除10,然后转为bigint,有取整效果

SELECT
    `id`,
    CAST(ttl_score/10 AS BIGINT) AS ttl_score
FROM std.std_student_score

3.3 string型数据的常用函数

1)替换字符串

SELECT
    REGEXP_REPLACE("abcdeffecmn", "ef|cm", "%")

output: abcd%fe%n

2)替换符号:需使用 \\ 转义

SELECT
    REGEXP_REPLACE("abcd:effe(eee)cmn", "\\:|\\(|\\)", " ")

output: abcd effe eee cmn

3)取字符串的子串

SELECT
    SUBSTR('abcedf', 3) AS sub_str

output: cedf

SELECT
    SUBSTR('abcedf', 3, 2) AS sub_str

output: ce

SELECT
    SUBSTR('abcedf', -3, 2) AS sub_str

output: ed

SELECT
    SUBSTR('abcedf', -3, 5) AS sub_str

output: edf

4)找到特定字符在用英文逗号分隔的字符串列表中的位置

SELECT
    FIND_IN_SET('a', 'c,b,a,d')

output: 3

SELECT
    FIND_IN_SET('m', 'c,b,a,d')

output: 0

找不到会返回 0 所以它也可以通过是否为零来判断字符在列表中是否存在

3.4 double型数据的常用函数

1)保留小数点后n位有效数字: round函数

SELECT
    ROUND(ratio, 3)
FROM
(
    SELECT 1.232812 AS ratio
) t

output: 1.233

4. 行处理函数

HiveQL 中函数大抵可以分为两种。一种是做行处理的,输入是一行,输出也是一行;另一种是做聚合的,输入是多行,但输出只有一行。

本节列举一些常用的行处理函数。

4.1 if函数

SELECT
    `id`,
    if(ttl_score > 90, 1, 0) AS is_high
FROM std.std_student_score

4.2 case函数

SELECT
    `id`,
    CASE WHEN ttl_score >= 90 THEN 'A+'
        WHEN ttl_score >= 80 THEN 'A'
        WHEN ttl_score >= 70 THEN 'B'
        WHEN ttl_score >= 60 THEN 'C'
        ELSE 'C-'
    END AS `level`
FROM std.std_student_score

4.3 concat函数

SELECT
    CONCAT('a', '-', 'b')

output: a-b

4.4 concat_ws函数

以下两种写法都可以:

SELECT
    CONCAT_WS(',',
        ARRAY('1','2','3')
    )
SELECT
    CONCAT_WS(',', '1', '2', '3')

output: 1,2,3

4.5 split函数

SELECT
    SPLIT('1,2,3', ',')

output: ["1","2","3"]

5. 聚合

1)聚合语句group by,单独使用有去重的效果

SELECT gender
FROM std.std_student_info
GROUP BY gender

2)计算每个gender值对应的记录条数

SELECT
    gender,
    COUNT(*) AS cnt
FROM std.std_student_info
GROUP BY gender

6. 聚合函数

6.1 计算数据行数:count函数

SELECT COUNT(*) AS row_num
FROM std.std_student_info

6.2 对字段去重:distinct

SELECT COUNT(DISTINCT gender) AS gender_cnt
FROM std.std_student_info

6.3 求和、最大值、最小值、均值、方差

SELECT
    gender,
    SUM(age) AS sum_age,    -- 求和
    MAX(age) AS max_age,    -- 最大值
    MIN(age) AS min_age,    -- 最小值
    AVG(age) AS avg_age,    -- 均值
    VARIANCE(age) AS var_age    -- 方差
FROM std.std_student_info
GROUP BY gender

7. join 方法

常用的 join 方法有:

方法 说明
left join 保留左表所有记录,保留右表与左表连接键相同的记录,连接键有多值时做笛卡尔积
inner join 保留左表与右表连接键存在且相同的记录,连接键有多值时做笛卡尔积
full outer join 保留左表与右表所有记录,连接键值相等时连接,连接键有多值时做笛卡尔积

left join示例:

SELECT
    a.id,
    ttl_score,
    age
FROM
(
    SELECT
        id,
        ttl_score
    FROM std.std_student_score
) a
LEFT JOIN
(
    SELECT
        id,
        age
    FROM std.std_student_info
) b ON a.id = b.id;

8. array 相关函数

8.1 将来自多行的某字段值聚合成array: collect_list函数

SELECT
    gender,
    COLLECT_LIST(age) AS age_list
FROM std.std_student_info
GROUP BY gender

8.2 将来自多行的某字段值聚合成array并去重: collect_set函数

SELECT
    gender,
    COLLECT_SET(age) AS age_set
FROM std.std_student_info
GROUP BY gender

8.3 用分隔符拼接array并生成string

SELECT
    gender,
    CONCAT_WS(',',
        COLLECT_SET(CAST(age AS STRING))
    ) AS age_set
FROM std.std_student_info
GROUP BY gender

8.4 判断array中是否包含某元素: array_contains函数

SELECT
    id,
    age_list
FROM
(
    SELECT 1 AS id, array(1,2,3) AS age_list
    UNION ALL
    SELECT 2 AS id, array(6,7,8) AS age_list
) t
WHERE ARRAY_CONTAINS(age_list, 3) 

8.5 元素排序: sort_array函数

SELECT
    SORT_ARRAY(lst) AS sorted_lst
FROM
(
    SELECT ARRAY(3,2,4,1) AS lst
) t

9. map 相关函数

9.1 将固定格式的string转成map: str_to_map函数

可以写成

SELECT
    STR_TO_MAP(
        CONCAT_WS(',',
            CONCAT('key_1', ':', 'value_1'),
            CONCAT('key_2', ':', 'value_2')
        ), ',', ':'
    ) AS my_map

或者

SELECT
    STR_TO_MAP(
        CONCAT_WS(',', ARRAY(
            CONCAT('key_1', ':', 'value_1'),
            CONCAT('key_2', ':', 'value_2')
            )
        ), ',', ':'
    ) AS my_map

output: {“key_1”:“value_1”,“key_2”:“value_2”}

9.2 通过map的key读取value

SELECT
    my_map['A']
FROM
(
    SELECT
        STR_TO_MAP(
            CONCAT_WS(',', ARRAY(
                CONCAT('A', ':', 'A1'),
                CONCAT('B', ':', 'B1')
                )
            ), ',', ':'
        ) AS my_map
) t

output: A1

9.3 提取map中的key,生成由key组成的array

📖 为了后文演示方便,这里创建一个通用示例:

CREATE TABLE std.std_level_map AS
SELECT
    `id`,
    sub_score
FROM
(
    SELECT
        1 AS `id`,
        STR_TO_MAP(
            CONCAT_WS(',',
                CONCAT('语文:', '80'),
                CONCAT('数学:', '85'),
                CONCAT('英语:', '95'),
                CONCAT('政治:', '90')
            ), ',', ':'
        ) AS sub_score
    UNION ALL
    SELECT
        2 AS `id`,
        STR_TO_MAP(
            CONCAT_WS(',',
                CONCAT('法律:', '90'),
                CONCAT('英语:', '75'),
                CONCAT('政治:', '92')
            ), ',', ':'
        ) AS sub_score
) t

使用map_keys函数将字典的key转为key的array

SELECT
    `id`,
    MAP_KEYS(sub_score) AS sub_list
FROM std.std_level_map

9.4 把map中的value转为value的array

与map_keys类似,用map_values可以把map的value转为value的array

SELECT
    `id`,
    MAP_VALUES(sub_score) AS score_list
FROM std.std_level_map

9.5 构造JSON

1)构造简单的JSON

SELECT
    CONCAT('{"',
        CONCAT_WS('","',
            CONCAT('语文:', '80'),
            CONCAT('数学:', '90')
        ), '"}'
    ) AS my_map

output: {"语文:80","数学:90"}

2)将map转换成JSON格式的string

SELECT
    `id`,
    CONCAT('{"',
        CONCAT_WS('","',
            COLLECT_SET(
                CONCAT(k, '":"', v)
            )
        ), '"}'
    ) AS my_map
FROM std.std_level_map
LATERAL VIEW OUTER EXPLODE(sub_score) e AS k, v
GROUP BY `id`

output:

id my_map
1 {"语文":"80","数学":"85","英语":"95","政治":"90"}
2 {"法律":"90","英语":"75","政治":"92"}

3)构造形如ARRAY<MAP<STRING,STRING>>的string

SELECT
    CONCAT(
        '[',
        CONCAT_WS(',',
            CONCAT('{"',
                CONCAT_WS('","',
                    CONCAT('语文:', '80'),
                    CONCAT('数学:', '90')
                ), '"}'
            )
        ), ']'
    ) AS my_map

output: [{"语文:80","数学:90"}]

4)基于map,构造形如ARRAY<MAP<STRING,STRING>>的string

SELECT
    IF(SUM(IF(ARRAY_CONTAINS(MAP_KEYS(sub_score), '英语') AND ARRAY_CONTAINS(MAP_KEYS(sub_score), '政治'), 1, 0)) = 0, '',
        CONCAT('[',
            CONCAT_WS(',',
                COLLECT_SET(
                    CONCAT('{"',
                        CONCAT_WS('","',
                            CONCAT('英语', '":"', CAST(sub_score['英语'] AS STRING)),
                            CONCAT('政治', '":"', CAST(sub_score['政治'] AS STRING))
                        ), '"}'
                    )
                )
            ), ']'
        )
    ) AS my_map
FROM std.std_level_map

output: [{"英语":"95","政治":"90"},{"英语":"75","政治":"92"}]

Note: 如果原MAP中不包含任何选定数据,应该返回空字符串,为了实现这个效果,需要在外层套一个IF函数来判断。常用的判断条件有:

  • ARRAY_CONTAINS(MAP_KEYS(sub_score), '[YOUR_KEY_HERE]')
  • SIZE(MAP_KEYS(sub_score)) > 0

5)利用跨行数据构造JSON

Note: 凡是有CONCAT_WS的地方,里面都可以套一层COLLECT_SET或者COLLECT_LIST做跨行处理。

可以参考 2)4),这两条代码实际都有跨行处理能力

10. lateral view outer explode()

LATERAL VIEW OUTER EXPLODE() 用于把array或者map中的元素展成多行。该语句一般放在 FROM 之后,WHERE 之前。示例如下👇

1)将array展成多行

SELECT
    `id`,
    gid
FROM
(
    SELECT
        1 AS `id`,
        ARRAY(1,2,3,5,5) AS group_id
    UNION ALL
    SELECT
        2 AS `id`,
        ARRAY(1,2) AS group_id
) t
LATERAL VIEW OUTER EXPLODE(group_id) g AS gid
WHERE id IN (1,3)

output:

id gid
1 1
1 2
1 3
1 5
1 5

2)将array展成多行:一个更复杂的例子

SELECT
    gid,
    CONCAT_WS(',', COLLECT_SET(CAST(id AS STRING))) AS id_list
FROM
(
    SELECT
        1 AS `id`,
        ARRAY(1,2,3,4,5,5) AS group_id
    UNION ALL
    SELECT
        2 AS `id`,
        ARRAY(2,4,4,3) AS group_id
) t
LATERAL VIEW OUTER EXPLODE(group_id) g AS gid
WHERE gid > 1
GROUP BY gid

output:

gid id_list
2 1,2
3 1,2
4 1,2
5 1

3)展开多个array:idgroup_id的各元素间做笛卡尔积

SELECT
    `id`,
    gid,
    lvl
FROM
(
    SELECT
        1 AS `id`,
        ARRAY(1,2) AS group_id,
        ARRAY('A','B') AS level
) t
LATERAL VIEW OUTER EXPLODE(group_id) g AS gid
LATERAL VIEW OUTER EXPLODE(level) l AS lvl

output:

id gid lvl
1 1 A
1 1 B
1 2 A
1 2 B

4)展开map

Note: 请看 9. map 相关函数9.5 的示例 2)。多个MAP也能像多个ARRAY一样展开,这里就不多演示了。

11. 窗口函数

窗口函数的处理效果,是输入n行,输出n行。但这不意味窗口函数是行处理的,它可以综合其他行的信息对当前行做运算。

1)找同班同学

  • 输入是 id(学号)和 class(班级)
  • 输出是 id(学号)和 class(班级)和 id_list(同班同学学号列表)
SELECT
    id,
    class,
    COLLECT_SET(id) OVER(PARTITION BY class) AS id_list
FROM
(
    SELECT 1 AS id, 'A' AS class
    UNION ALL
    SELECT 2 AS id, 'A' AS class
    UNION ALL
    SELECT 3 AS id, 'B' AS class
) t

output:

id class id_list
1 A [1,2]
2 A [1,2]
3 B [3]

Note: 常用的聚合函数,都可以替换本例中的 COLLECT_SET(id),比如:

  • SUM()
  • COUNT()
  • MAX()

等等……

2)为每个数据行生成序号: row_number函数

SELECT
    id,
    class,
    score,
    ROW_NUMBER() OVER(PARTITION BY class ORDER BY score DESC) AS class_no
FROM
(
    SELECT 1 AS id, 'A' AS class, 90 AS score
    UNION ALL
    SELECT 2 AS id, 'A' AS class, 91 AS score
    UNION ALL
    SELECT 3 AS id, 'A' AS class, 91 AS score
    UNION ALL
    SELECT 4 AS id, 'B' AS class, 100 AS score
) t

output:

id class score class_no
1 A 90 3
2 A 91 1
3 A 91 2
4 B 100 1

Note: 注意到虽然学生2和学生3分数都是91分,但他们的班级序号(class_no)不重叠。2号学生班级序号是1,3号学生班级序号是2

2)排名: rank函数

SELECT
    id,
    class,
    score,
    RANK() OVER(PARTITION BY class ORDER BY score DESC) AS class_rk
FROM
(
    SELECT 1 AS id, 'A' AS class, 90 AS score
    UNION ALL
    SELECT 2 AS id, 'A' AS class, 91 AS score
    UNION ALL
    SELECT 3 AS id, 'A' AS class, 91 AS score
    UNION ALL
    SELECT 4 AS id, 'B' AS class, 100 AS score
) t

output:

id class score class_rk
1 A 90 3
2 A 91 1
3 A 91 1
4 B 100 1

Note: 注意看,虽然学生2和学生3分数都是91分,但他们的班级排名(class_rk)是相同的。2号同学与3号同学并列第一,1号学生由于有2位同学排在他前面,直接变成第3名。

3)不计人头排名: dense_rank函数

DENSE_RANK() 是不计人头,纯计分数的排名方式。

上例中的1号同学,在考试中获得了90分,虽然按人头数计,有两位同学排在他前面,因此成为了第三名。但如果纯以分数计,他的90分是仅次于91分的班级第二好名次。如果我们希望不考虑人头数,仅考虑分数在排序中的位置,那么可以使用DENSE_RANK函数。

SELECT
    id,
    class,
    score,
    DENSE_RANK() OVER(PARTITION BY class ORDER BY score DESC) AS class_rk
FROM
(
    SELECT 1 AS id, 'A' AS class, 90 AS score
    UNION ALL
    SELECT 2 AS id, 'A' AS class, 91 AS score
    UNION ALL
    SELECT 3 AS id, 'A' AS class, 91 AS score
    UNION ALL
    SELECT 4 AS id, 'B' AS class, 100 AS score
) t

output:

id class score class_rk
1 A 90 2
2 A 91 1
3 A 91 1
4 B 100 1

12. 尾缀方法

1)对结果排序: order by

寻找分数TOP2的两位同学:

SELECT
    id,
    score
FROM
(
    SELECT 1 AS id, 70 AS score
    UNION ALL 
    SELECT 2 AS id, 60 AS score
    UNION ALL 
    SELECT 3 AS id, 90 AS score
) t
ORDER BY score DESC
LIMIT 2

ORDER BY也常用于随机抽样:

ORDER BY RAND()

2)组筛选: having

HAVING语句能对GROUP BY的结果做筛选。以下SQL选出班级人数大于1的班级:

SELECT
    class,
    COUNT(DISTINCT id) AS id_cnt
FROM
(
    SELECT 1 AS id, 'A' AS class
    UNION ALL
    SELECT 2 AS id, 'A' AS class
    UNION ALL
    SELECT 3 AS id, 'B' AS class
) t
GROUP BY class
HAVING id_cnt > 1

3)CLUSTER BY, DISTRIBUTE BYSORT BY

这三个语句是控制reducer的。DISTRIBUTE BY [YOUR_COLUMN_HERE] 用于控制数据如何进入reducer,比如:

SELECT
    class,
    id
FROM
(
    SELECT 1 AS id, 'A' AS class
    UNION ALL
    SELECT 2 AS id, 'A' AS class
    UNION ALL
    SELECT 3 AS id, 'B' AS class
) t
DISTRIBUTE BY class

如上,则拥有相同class值的记录,会进入同一个reducer。

SORT BY [YOUR_COLUMN_HERE] 用于控制reducer中数据的排序,比如:

SELECT
    class,
    id
FROM
(
    SELECT 1 AS id, 'A' AS class
    UNION ALL
    SELECT 2 AS id, 'A' AS class
    UNION ALL
    SELECT 3 AS id, 'B' AS class
) t
DISTRIBUTE BY class
SORT BY id

如上例,在每个reducer中,会用id字段做reducer内的排序。

对某字段使用 CLUSTER BY,等于对该字段既使用DISTRIBUTE BY,又使用SORT BY,一种语法糖了属于是。

Note: 这三个语句日常不太常见,什么场景能用上呢?

举个例子,用户 (uid) 的购物车订单 (oid) 有时候会被拆成多个包裹,这时每个包裹会对应一个子单 (sub_oid)。这种情况下,一个订单就会对应多个子单。假设我们只知道 uid - oid 的关系与 uid - sub_oid 的关系。如果已知 sub_oid 是 oid 后加下划线加子单单号,我们希望获取 oid - sub_oid 的关系。这种情况就可以使用 DISTRIBUTE BY uid,这可以大幅节省算力。

13. transform

SQL部分:

SELECT TRANSFORM
-- PYTHON文件的输入
(
    COL_1,
    COL_2
)
USING '[PYTHON_ENV_PATH_HERE] [YOUR_PYTHON_FILE_HERE]' AS
-- PYTHON文件的输出,输出的行数和内容取决于PYTHON怎么写
(
    OUTPUT_1,
    OUTPUT_2,
    OUTPUT_3
)
FROM
-- 数据源,会传给PYTHON文件的输入
(
    SELECT
        COL_1,
        COL_2
    FROM [DB_NAME].[SOURCE_TABLE]
    WHERE [CONDITIONS]
    DISTRIBUTE BY [YOUR_KEY]
) t

PYTHON部分:

import sys


class Processor:

    @staticmethod
    def add(lst):
        nlst = list()
        for col_1, col_2 in lst:
            nlst.append((col_1, col_2, col_1+'-'+col_2))

        return nlst

    def main(self):
        lst = list()
        for line in sys.stdin:
            col_1, col_2 = line.strip().split('\t')[:2]
            lst.append((col_1, col_2))

        for col_1, col_2, col_3 in self.add(lst):
            print('{}\t{}\t{}'.format(col_1, col_2, col_3))


if __name__ == '__main__':
    p = Processor()
    p.main()

14. 空值处理

1)coalesce函数:接受多个参数。从左到右遍历参数,返回第一个遇到的非NULL参数

SELECT
    COALESCE(col_1, col_2, 0) AS res
FROM
(
    SELECT NULL AS col_1, 10 AS col_2
    UNION ALL
    SELECT NULL AS col_1, NULL AS col_2
) t

output:

res
10
0

2)nvl函数:接受两个参数。如果第一个参数为NULL,返回第二个参数,否则返回第一个参数

SELECT
    NVL(col_1, col_2) AS res
FROM
(
    SELECT NULL AS col_1, 10 AS col_2
    UNION ALL
    SELECT NULL AS col_1, NULL AS col_2
    UNION ALL
    SELECT 100 AS col_1, 200 AS col_2
) t

output:

res
10
NULL
100

15.【场景1】同数据行跨字段计算最大/最小值

原始用法:

SELECT
    GREATEST(3,1,2) AS greatest_num
    LEAST(6,5,7) AS least_num

output:

greatest_num least_num
3 5

场景:计算一天中最贵/最便宜的一餐

SELECT
    LEAST(breakfast, lunch, dinner) AS cheapest_meal,
    GREATEST(breakfast, lunch, dinner) AS expensive_meal
FROM
(
    SELECT 3 AS breakfast, 25 AS lunch, 15 AS dinner
    UNION ALL
    SELECT 8 AS breakfast, 0 AS lunch, 20 AS dinner
    UNION ALL
    SELECT 0 AS breakfast, 35 AS lunch, 22 AS dinner
) t

output:

cheapest_meal expensive_meal
3 25
0 20
0 35

16.【场景2】计算分位数

场景:计算学生分数的95分位数

SELECT
    ttl_score,
    quantile,
    distance,
    ROUND(quantile, 3) AS quantile_round, -- 精确到小数点后三位便于展示
    ROUND(distance, 3) AS distance_round
FROM
(
    SELECT
        ttl_score,
        id_cnt_accumulation / all_cnt AS quantile,    -- 实际的分位数值
        ABS((id_cnt_accumulation / all_cnt) - 0.05) AS distance    -- 与标准95分位数的距离
    FROM
    (
        SELECT
            ttl_score,
            SUM(id_cnt) OVER(ORDER BY ttl_score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS id_cnt_accumulation,
            SUM(id_cnt) OVER() AS all_cnt
        FROM
        (
            SELECT
                ttl_score,
                COUNT(DISTINCT `id`) AS id_cnt
            FROM std.std_student_score
            GROUP BY ttl_score
        ) t1
    ) t2
) t3
ORDER BY distance
LIMIT 1

场景:计算多组学生分数的分位数:25/50/75分位数

SELECT
    ttl_score,
    quantile,
    ROUND(quantile, 3) AS quantile_round, -- 精确到小数点后三位便于展示
    CONCAT_WS(',',
        IF(rk_25 = 1, '25', null),
        IF(rk_50 = 1, '50', null),
        IF(rk_75 = 1, '75', null)
    ) AS prompt, -- 提示是哪一种分位数,可能存在同一个ttl_score值为多个分位数的情况
    distance_25,
    distance_50,
    distance_75
FROM
(
    SELECT
        ttl_score,
        quantile,
        ABS(quantile - 0.25) AS distance_25,
        ABS(quantile - 0.5) AS distance_50,
        ABS(quantile - 0.75) AS distance_75,
        row_number() over(order by ABS(quantile - 0.25)) AS rk_25,
        row_number() over(order by ABS(quantile - 0.5)) AS rk_50,
        row_number() over(order by ABS(quantile - 0.75)) AS rk_75
    FROM
    (
        SELECT
            ttl_score,
            id_cnt_accumulation / all_cnt AS quantile    -- 实际的分位数值
        FROM
        (
            SELECT
                ttl_score,
                SUM(id_cnt) OVER(ORDER BY ttl_score ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS id_cnt_accumulation,
                SUM(id_cnt) OVER() AS all_cnt
            FROM
            (
                SELECT
                    ttl_score,
                    COUNT(DISTINCT `id`) AS id_cnt
                FROM std.std_student_score
                GROUP BY ttl_score
            ) t1
        ) t2
    ) t3
) t4
WHERE rk_25 = 1
    OR rk_50 = 1
    OR rk_75 = 1;

17. UDF

todo

18. 命令行

  • hive -e "[SQL]": 在命令行执行 SQL 语句
  • hive -f [PATH/TO/SQL/FILE]: 先写好 XXX.SQL 文件,再从命令行执行
  • hive -f [PATH/TO/SQL/FILE] > [LOCAL_FILE].csv 下载数据到本地
  • hive -e "set hive.metastore.uris" 查看hive服务的ip和端口

附录