Spark SQL 用法手册
适用场景:日常写 Spark SQL 多、PySpark 少的数据开发岗位。纯 SQL 语法、带测试数据、每类方法说明用法并展示运行效果。
环境:Spark 3.x,使用 spark.sql() 执行。以下所有示例在 Spark Thrift Server 或 spark-sql CLI 中也可直接运行。
优秀资料分享
Spark SQL 函数分类导航
spark函数
测试数据
-- 员工表
CREATE OR REPLACE TEMPORARY VIEW employees AS
VALUES
(1, '张三', '技术部', 15000, 28, '2021-03-15', '北京'),
(2, '李四', '市场部', 12000, 35, '2019-07-01', '上海'),
(3, '王五', '技术部', 18000, 32, '2020-01-10', '北京'),
(4, '赵六', '财务部', 10000, 42, '2018-05-20', '广州'),
(5, '孙七', '技术部', 22000, 27, '2022-11-01', '深圳'),
(6, '周八', '市场部', 9000, 26, '2023-06-15', '上海'),
(7, '吴九', '财务部', 13000, 38, '2017-09-01', '北京'),
(8, '郑十', '技术部', 16000, 30, '2021-12-01', '广州')
AS t(emp_id, name, dept, salary, age, join_date, city);
-- 部门表
CREATE OR REPLACE TEMPORARY VIEW departments AS
VALUES
('技术部', '王五', 500000),
('市场部', '李四', 300000),
('财务部', '赵六', 200000),
('人事部', NULL, 150000)
AS t(dept_name, manager, budget);
-- 订单表
CREATE OR REPLACE TEMPORARY VIEW orders AS
VALUES
(1001, 1, '笔记本电脑', 8000, 1, '2026-01-15'),
(1002, 3, '显示器', 2500, 2, '2026-01-16'),
(1003, 1, '键盘', 500, 3, '2026-02-01'),
(1004, 5, '服务器', 50000, 1, '2026-02-15'),
(1005, 3, '显示器', 2500, 1, '2026-03-01'),
(1006, 2, '打印机', 1500, 2, '2026-03-10'),
(1007, 5, '笔记本电脑', 8000, 2, '2026-03-20'),
(1008, 8, '鼠标', 100, 10, '2026-04-01')
AS t(order_id, emp_id, product, amount, quantity, order_date);
-- 成绩表(含 NULL)
CREATE OR REPLACE TEMPORARY VIEW scores AS
VALUES
('张三', '数学', 90),
('张三', '语文', 85),
('李四', '数学', 72),
('李四', '语文', 88),
('王五', '数学', 95),
('王五', '语文', 92),
('赵六', '数学', 65),
('赵六', '语文', 78),
('孙七', '数学', NULL),
('孙七', '语文', 80)
AS t(student, subject, score);
-- 查看数据
SELECT * FROM employees;
| emp_id |
name |
dept |
salary |
age |
join_date |
city |
| 1 |
张三 |
技术部 |
15000 |
28 |
2021-03-15 |
北京 |
| 2 |
李四 |
市场部 |
12000 |
35 |
2019-07-01 |
上海 |
| 3 |
王五 |
技术部 |
18000 |
32 |
2020-01-10 |
北京 |
| 4 |
赵六 |
财务部 |
10000 |
42 |
2018-05-20 |
广州 |
| 5 |
孙七 |
技术部 |
22000 |
27 |
2022-11-01 |
深圳 |
| 6 |
周八 |
市场部 |
9000 |
26 |
2023-06-15 |
上海 |
| 7 |
吴九 |
财务部 |
13000 |
38 |
2017-09-01 |
北京 |
| 8 |
郑十 |
技术部 |
16000 |
30 |
2021-12-01 |
广州 |
一、列操作
1.1 选择列 / 别名 / 表达式
-- 选指定列
SELECT name, dept, salary FROM employees;
-- 表达式 + 别名
SELECT name, salary * 12 AS annual_salary FROM employees;
-- 去重
SELECT DISTINCT dept FROM employees;
| name |
dept |
salary |
| 张三 |
技术部 |
15000 |
| 李四 |
市场部 |
12000 |
| ... |
... |
... |
| name |
annual_salary |
| 张三 |
180000 |
| 李四 |
144000 |
| ... |
... |
1.2 条件列(CASE WHEN)
SELECT name, salary,
CASE
WHEN salary >= 18000 THEN '高'
WHEN salary >= 13000 THEN '中'
ELSE '低'
END AS salary_level
FROM employees;
| name |
salary |
salary_level |
| 张三 |
15000 |
中 |
| 李四 |
12000 |
低 |
| 王五 |
18000 |
高 |
| 赵六 |
10000 |
低 |
| 孙七 |
22000 |
高 |
| 周八 |
9000 |
低 |
| 吴九 |
13000 |
中 |
| 郑十 |
16000 |
中 |
1.3 列重命名 / 类型转换
SELECT name,
CAST(salary AS DOUBLE) / 10000 AS salary_wan,
CAST(join_date AS DATE) AS join_dt,
CAST(age AS STRING) AS age_str
FROM employees;
| name |
salary_wan |
join_dt |
age_str |
| 张三 |
1.5 |
2021-03-15 |
28 |
| 姚四 |
1.2 |
2019-07-01 |
35 |
二、过滤与排序
2.1 WHERE 条件
-- 比较运算
SELECT * FROM employees WHERE salary > 13000;
-- 多条件
SELECT * FROM employees WHERE dept = '技术部' AND salary >= 15000;
-- IN 集合
SELECT * FROM employees WHERE city IN ('北京', '上海');
-- LIKE 模糊
SELECT * FROM employees WHERE name LIKE '张%';
-- BETWEEN
SELECT * FROM employees WHERE salary BETWEEN 10000 AND 18000;
-- IS NULL(注意:NULL 不能用 = 判断)
SELECT * FROM scores WHERE score IS NULL;
| emp_id |
name |
dept |
salary |
age |
join_date |
city |
| 1 |
张三 |
技术部 |
15000 |
28 |
2021-03-15 |
北京 |
| 3 |
王五 |
技术部 |
18000 |
32 |
2020-01-10 |
北京 |
| 5 |
孙七 |
技术部 |
22000 |
27 |
2022-11-01 |
深圳 |
| 8 |
郑十 |
技术部 |
16000 |
30 |
2021-12-01 |
广州 |
| student |
subject |
score |
| 孙七 |
数学 |
NULL |
2.2 排序
-- 单列降序
SELECT name, salary FROM employees ORDER BY salary DESC;
-- 多列
SELECT dept, salary, name FROM employees ORDER BY dept, salary DESC;
-- NULL 排到最后
SELECT * FROM scores ORDER BY score DESC NULLS LAST;
| name |
salary |
| 孙七 |
22000 |
| 王五 |
18000 |
| 郑十 |
16000 |
| dept |
salary |
name |
| 财务部 |
13000 |
吴九 |
| 财务部 |
10000 |
赵六 |
| 市场部 |
12000 |
李四 |
| 市场部 |
9000 |
周八 |
2.3 分页 / Top N
-- 前 3 条
SELECT name, salary FROM employees ORDER BY salary DESC LIMIT 3;
-- 指定偏移(通过子查询 + row_number,Spark 不直接支持 OFFSET)
SELECT name, salary FROM employees ORDER BY salary DESC LIMIT 3;
| name |
salary |
| 孙七 |
22000 |
| 王五 |
18000 |
| 郑十 |
16000 |
三、聚合操作
3.1 GROUP BY 基础聚合
SELECT dept,
COUNT(*) AS 人数,
ROUND(AVG(salary), 0) AS 平均薪资,
SUM(salary) AS 总薪资,
MAX(salary) AS 最高,
MIN(salary) AS 最低
FROM employees
GROUP BY dept;
| dept |
人数 |
平均薪资 |
总薪资 |
最高 |
最低 |
| 财务部 |
2 |
11500.0 |
23000 |
13000 |
10000 |
| 市场部 |
2 |
10500.0 |
21000 |
12000 |
9000 |
| 技术部 |
4 |
17750.0 |
71000 |
22000 |
15000 |
3.2 HAVING — 聚合后过滤
SELECT dept,
COUNT(*) AS cnt,
ROUND(AVG(salary)) AS avg_sal
FROM employees
GROUP BY dept
HAVING COUNT(*) >= 3 -- 注意:HAVING 中只能用聚合表达式,不能用别名
AND AVG(salary) > 10000;
| dept |
cnt |
avg_sal |
| 技术部 |
4 |
17750.0 |
3.3 多维度聚合:GROUPING SETS / CUBE / ROLLUP
-- CUBE:所有维度的所有组合
SELECT dept, city,
ROUND(AVG(salary)) AS avg_salary,
COUNT(*) AS cnt
FROM employees
GROUP BY CUBE(dept, city)
ORDER BY dept NULLS FIRST, city NULLS FIRST;
| dept |
city |
avg_salary |
cnt |
说明 |
| NULL |
NULL |
14375 |
8 |
总计 |
| NULL |
上海 |
10500 |
2 |
按城市汇总 |
| NULL |
北京 |
15333 |
3 |
|
| NULL |
广州 |
13000 |
2 |
|
| NULL |
深圳 |
22000 |
1 |
|
| 财务部 |
NULL |
11500 |
2 |
按部门汇总 |
| 财务部 |
北京 |
13000 |
1 |
|
| 财务部 |
广州 |
10000 |
1 |
|
| 市场部 |
NULL |
10500 |
2 |
|
| 市场部 |
上海 |
10500 |
2 |
|
| 技术部 |
NULL |
17750 |
4 |
|
| 技术部 |
北京 |
16500 |
2 |
|
| 技术部 |
广州 |
16000 |
1 |
|
| 技术部 |
深圳 |
22000 |
1 |
|
-- ROLLUP:层级汇总(dept → city)
SELECT dept, city,
ROUND(AVG(salary)) AS avg_salary,
COUNT(*) AS cnt
FROM employees
GROUP BY ROLLUP(dept, city)
ORDER BY dept NULLS FIRST, city NULLS FIRST;
CUBE vs ROLLUP:CUBE = 全部组合 (a,b),(a),(b),();ROLLUP = 层级组合 (a,b),(a),() — 数据仓库中常用于年→月→日汇总。
3.4 PIVOT — 行转列
SELECT *
FROM scores
PIVOT (
FIRST(score) -- 如果每个(student, subject)只有一行,用 ANY_VALUE 也行
FOR subject IN ('数学', '语文')
);
| student |
数学 |
语文 |
| 王五 |
95 |
92 |
| 李四 |
72 |
88 |
| 赵六 |
65 |
78 |
| 孙七 |
NULL |
80 |
| 张三 |
90 |
85 |
四、窗口函数
4.1 排名函数
SELECT name, dept, salary,
RANK() OVER (PARTITION BY dept ORDER BY salary DESC) AS rk,
DENSE_RANK() OVER (PARTITION BY dept ORDER BY salary DESC) AS drk,
ROW_NUMBER() OVER (PARTITION BY dept ORDER BY salary DESC) AS rn
FROM employees;
| name |
dept |
salary |
rk |
drk |
rn |
| 吴九 |
财务部 |
13000 |
1 |
1 |
1 |
| 赵六 |
财务部 |
10000 |
2 |
2 |
2 |
| 李四 |
市场部 |
12000 |
1 |
1 |
1 |
| 周八 |
市场部 |
9000 |
2 |
2 |
2 |
| 孙七 |
技术部 |
22000 |
1 |
1 |
1 |
| 王五 |
技术部 |
18000 |
2 |
2 |
2 |
| 郑十 |
技术部 |
16000 |
3 |
3 |
3 |
| 张三 |
技术部 |
15000 |
4 |
4 |
4 |
如果薪资并列:RANK() 会跳号(1,1,3),DENSE_RANK() 连续(1,1,2),ROW_NUMBER() 强制排定(1,2,3)。
4.2 偏移函数:LAG / LEAD
SELECT name, dept, salary,
LAG(salary, 1) OVER (PARTITION BY dept ORDER BY salary) AS prev_salary,
LEAD(salary, 1) OVER (PARTITION BY dept ORDER BY salary) AS next_salary,
salary - LAG(salary, 1) OVER (PARTITION BY dept ORDER BY salary) AS diff
FROM employees;
| name |
dept |
salary |
prev_salary |
next_salary |
diff |
| 吴九 |
财务部 |
13000 |
NULL |
10000 |
NULL |
| 赵六 |
财务部 |
10000 |
13000 |
NULL |
-3000 |
| 周八 |
市场部 |
9000 |
NULL |
12000 |
NULL |
| 李四 |
市场部 |
12000 |
9000 |
NULL |
3000 |
| 张三 |
技术部 |
15000 |
NULL |
16000 |
NULL |
| 郑十 |
技术部 |
16000 |
15000 |
18000 |
1000 |
| 王五 |
技术部 |
18000 |
16000 |
22000 |
2000 |
| 孙七 |
技术部 |
22000 |
18000 |
NULL |
4000 |
4.3 窗口聚合 / 占比
SELECT name, dept, salary,
SUM(salary) OVER (PARTITION BY dept) AS dept_total,
ROUND(AVG(salary) OVER (PARTITION BY dept), 0) AS dept_avg,
ROUND(salary / SUM(salary) OVER (PARTITION BY dept) * 100, 1) AS pct
FROM employees;
| name |
dept |
salary |
dept_total |
dept_avg |
pct |
| 赵六 |
财务部 |
10000 |
23000 |
11500.0 |
43.5 |
| 吴九 |
财务部 |
13000 |
23000 |
11500.0 |
56.5 |
| 周八 |
市场部 |
9000 |
21000 |
10500.0 |
42.9 |
| 李四 |
市场部 |
12000 |
21000 |
10500.0 |
57.1 |
| 张三 |
技术部 |
15000 |
71000 |
17750.0 |
21.1 |
| 郑十 |
技术部 |
16000 |
71000 |
17750.0 |
22.5 |
| 王五 |
技术部 |
18000 |
71000 |
17750.0 |
25.4 |
| 孙七 |
技术部 |
22000 |
71000 |
17750.0 |
31.0 |
4.4 累计值(窗口帧)
SELECT name, join_date, salary,
SUM(salary) OVER (
ORDER BY join_date
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
) AS cum_salary
FROM employees;
| name |
join_date |
salary |
cum_salary |
| 吴九 |
2017-09-01 |
13000 |
13000 |
| 赵六 |
2018-05-20 |
10000 |
23000 |
| 李四 |
2019-07-01 |
12000 |
35000 |
| 王五 |
2020-01-10 |
18000 |
53000 |
| 张三 |
2021-03-15 |
15000 |
68000 |
| 郑十 |
2021-12-01 |
16000 |
84000 |
| 孙七 |
2022-11-01 |
22000 |
106000 |
| 周八 |
2023-06-15 |
9000 |
115000 |
4.5 分组 Top N
WITH ranked AS (
SELECT name, dept, salary,
ROW_NUMBER() OVER (PARTITION BY dept ORDER BY salary DESC) AS rn
FROM employees
)
SELECT name, dept, salary FROM ranked WHERE rn <= 2;
| name |
dept |
salary |
| 吴九 |
财务部 |
13000 |
| 赵六 |
财务部 |
10000 |
| 李四 |
市场部 |
12000 |
| 周八 |
市场部 |
9000 |
| 孙七 |
技术部 |
22000 |
| 王五 |
技术部 |
18000 |
五、表关联——JOIN
5.1 五种 JOIN 类型
-- INNER JOIN(默认)
SELECT e.name, e.dept, d.budget
FROM employees e
JOIN departments d ON e.dept = d.dept_name;
-- LEFT JOIN
SELECT e.name, e.dept, d.budget
FROM employees e
LEFT JOIN departments d ON e.dept = d.dept_name;
-- RIGHT JOIN
SELECT e.name, d.dept_name, d.budget
FROM employees e
RIGHT JOIN departments d ON e.dept = d.dept_name;
-- FULL OUTER JOIN
SELECT e.name, e.dept, d.budget
FROM employees e
FULL OUTER JOIN departments d ON e.dept = d.dept_name;
-- CROSS JOIN (笛卡尔积,谨慎使用)
SELECT e.name, o.product
FROM employees e
CROSS JOIN (SELECT DISTINCT product FROM orders) o;
INNER JOIN 结果:
| name |
dept |
budget |
| 张三 |
技术部 |
500000 |
| 李四 |
市场部 |
300000 |
| 王五 |
技术部 |
500000 |
| 赵六 |
财务部 |
200000 |
| 孙七 |
技术部 |
500000 |
| 周八 |
市场部 |
300000 |
| 吴九 |
财务部 |
200000 |
| 郑十 |
技术部 |
500000 |
LEFT JOIN 结果(和 INNER 一样,因为员工都在 departments 中有对应部门):
RIGHT JOIN 结果(人事部没有员工,保留了空行):
| name |
dept_name |
budget |
| ... |
... |
... |
| NULL |
人事部 |
150000 |
5.2 非等值 JOIN
-- 找出比自己部门平均薪资高的员工
SELECT e.name, e.dept, e.salary, t.dept_avg
FROM employees e
JOIN (
SELECT dept, AVG(salary) AS dept_avg
FROM employees
GROUP BY dept
) t ON e.dept = t.dept
WHERE e.salary > t.dept_avg;
| name |
dept |
salary |
dept_avg |
| 吴九 |
财务部 |
13000 |
11500.0 |
| 孙七 |
技术部 |
22000 |
17750.0 |
5.3 Broadcast Hint 优化
-- 小表 JOIN 大表时加 hint,避免 Shuffle
SELECT /*+ BROADCAST(d) */ e.name, e.dept, d.budget
FROM employees e
JOIN departments d ON e.dept = d.dept_name;
-- 或
SELECT /*+ MAPJOIN(d) */ e.name, e.dept, d.budget
FROM employees e
JOIN departments d ON e.dept = d.dept_name;
小表默认阈值 spark.sql.autoBroadcastJoinThreshold = 10MB,可调大。
Spark 自动判断,MAPJOIN / BROADCAST hint 只是强制提示。
六、集合操作
6.1 UNION / INTERSECT / EXCEPT
-- 并集(去重)
SELECT dept FROM employees
UNION
SELECT dept_name FROM departments;
-- 交集(两边都有)
SELECT dept FROM employees
INTERSECT
SELECT dept_name FROM departments;
-- 差集(左边有、右边没有)
SELECT dept_name FROM departments
EXCEPT
SELECT dept FROM employees;
EXCEPT 结果:
UNION ALL 不去重(更快),UNION 自动去重。
MINUS 是 EXCEPT 的别名,效果一样。
七、字符串函数
SELECT name, city,
CONCAT(name, '-', city) AS name_city,
CONCAT_WS(' | ', name, dept, city) AS info,
SUBSTR(name, 1, 1) AS surname,
CHAR_LENGTH(name) AS name_len,
INSTR(product, '脑') AS pos_nao, -- 返回子串位置,0 = 没找到
SPLIT(product, '电') AS splitted
FROM employees
CROSS JOIN (SELECT DISTINCT product FROM orders WHERE product = '笔记本电脑') t;
| name |
city |
name_city |
info |
surname |
name_len |
| 张三 |
北京 |
张三-北京 |
张三 | 技术部 | 北京 |
张 |
2 |
| 李四 |
上海 |
李四-上海 |
李四 | 市场部 | 上海 |
李 |
2 |
常用字符串函数速查
| 函数 |
作用 |
示例 |
CONCAT(a, b, c) |
拼接 |
CONCAT(name, '(', age, ')' ) |
CONCAT_WS(sep, a, b) |
以分隔符拼接 |
CONCAT_WS(',', name, city) |
SUBSTR(s, pos, len) |
截取 |
SUBSTR(name, 1, 1) → 姓 |
CHAR_LENGTH(s) |
字符长度 |
CHAR_LENGTH(name) |
INSTR(s, substr) |
子串位置(1-based) |
INSTR(name, '张') |
SPLIT(s, pattern) |
分割成数组 |
SPLIT(product, '电') |
REPLACE(s, old, new) |
替换 |
REPLACE(city, '北京', 'BJ') |
UPPER / LOWER |
大小写转换 |
UPPER(name) |
TRIM(s) |
去两端空格 |
TRIM(name) |
LPAD / RPAD |
填充 |
LPAD(CAST(age AS STRING), 3, '0') |
REGEXP_EXTRACT |
正则提取 |
REGEXP_EXTRACT(name, '(张|李)(.*)', 1) |
FORMAT_STRING |
格式化 |
FORMAT_STRING('%s(%d岁)', name, age) |
八、日期函数
SELECT name,
CAST(join_date AS DATE) AS join_dt,
YEAR(join_date) AS join_year,
MONTH(join_date) AS join_month,
DATE_FORMAT(join_date, 'yyyy年MM月') AS join_ym,
DATEDIFF(CURRENT_DATE, join_date) AS days_since_join,
ROUND(MONTHS_BETWEEN(CURRENT_DATE, join_date), 1) AS months_since_join,
ADD_MONTHS(join_date, 3) AS probation_end,
LAST_DAY(join_date) AS month_end,
DATE_ADD(join_date, 7) AS week_after,
TRUNC(join_date, 'MM') AS month_begin -- 当月第一天
FROM employees;
| name |
join_dt |
join_year |
join_month |
join_ym |
days_since_join |
months_since_join |
probation_end |
month_end |
| 张三 |
2021-03-15 |
2021 |
3 |
2021年03月 |
1934 |
63.5 |
2021-06-15 |
2021-03-31 |
| 李四 |
2019-07-01 |
2019 |
7 |
2019年07月 |
2556 |
84.0 |
2019-10-01 |
2019-07-31 |
按月聚合示例
SELECT DATE_FORMAT(order_date, 'yyyy-MM') AS month,
SUM(amount) AS total_amount,
COUNT(*) AS order_count
FROM orders
GROUP BY month
ORDER BY month;
| month |
total_amount |
order_count |
| 2026-01 |
10500 |
2 |
| 2026-02 |
50500 |
2 |
| 2026-03 |
12000 |
3 |
| 2026-04 |
1000 |
1 |
常用日期函数速查
| 函数 |
作用 |
示例结果 |
CURRENT_DATE |
当前日期 |
2026-07-01 |
CURRENT_TIMESTAMP |
当前时间戳 |
2026-07-01 14:30:00 |
YEAR(d) / MONTH(d) / DAY(d) |
提取年月日 |
2021 / 3 / 15 |
DATE_FORMAT(d, fmt) |
格式化 |
'yyyy-MM' → '2021-03' |
DATEDIFF(end, start) |
天数差 |
10 |
MONTHS_BETWEEN(end, start) |
月数差 |
63.5 |
ADD_MONTHS(d, n) |
加减月份 |
2021-06-15 |
DATE_ADD(d, n) / DATE_SUB(d, n) |
加减天数 |
2021-03-22 |
LAST_DAY(d) |
当月最后一天 |
2021-03-31 |
TRUNC(d, unit) |
截断到指定粒度 |
'MM' 当月1号,'YYYY' 当年1月1日 |
NEXT_DAY(d, dow) |
下个周几 |
NEXT_DAY('2026-07-01', 'Wed') |
WEEKOFYEAR(d) |
年内第几周 |
27 |
DAYOFWEEK(d) |
周几(1=周日) |
4 |
TO_DATE / DATE |
转日期类型 |
TO_DATE('2026-07-01') |
TO_TIMESTAMP |
转时间戳 |
TO_TIMESTAMP('2026-07-01 14:30:00') |
九、空值处理
SELECT student, subject, score,
COALESCE(score, 0) AS score_filled, -- 第一个非 NULL 值
IF(score IS NULL, '缺考', CAST(score AS STRING)) AS status
FROM scores;
-- 或
SELECT student, subject,
CASE WHEN score IS NULL THEN '缺考'
ELSE CAST(score AS STRING)
END AS status
FROM scores;
| student |
subject |
score |
score_filled |
status |
| 张三 |
数学 |
90 |
90 |
90 |
| 孙七 |
数学 |
NULL |
0 |
缺考 |
| 孙七 |
语文 |
80 |
80 |
80 |
十、数学函数
SELECT name, salary,
ROUND(salary / 12 / 22, 2) AS daily_salary,
CEIL(salary / 10000) AS ceil_wan,
FLOOR(salary / 5000) AS floor_5k,
ABS(-salary) AS abs_val,
RAND(42) AS random_val -- 固定种子,可重放
FROM employees;
常用数学函数速查
| 函数 |
作用 |
示例 |
ROUND(x, d) |
四舍五入到 d 位小数 |
ROUND(3.14159, 2) → 3.14 |
CEIL(x) |
向上取整 |
CEIL(3.1) → 4 |
FLOOR(x) |
向下取整 |
FLOOR(3.9) → 3 |
ABS(x) |
绝对值 |
ABS(-5) → 5 |
POW(x, y) |
x 的 y 次方 |
POW(2, 10) → 1024 |
SQRT(x) |
平方根 |
SQRT(100) → 10 |
RAND(seed) |
0~1 随机数 |
RAND(42) |
GREATEST(a,b) |
最大值 |
GREATEST(1,2,3) → 3 |
LEAST(a,b) |
最小值 |
LEAST(1,2,3) → 1 |
十一、复杂类型
11.1 Array 与 Explode
-- 创建数组
SELECT name, ARRAY(salary, age * 100) AS nums
FROM employees;
-- explode:数组展开为多行
SELECT name, t.num
FROM employees
LATERAL VIEW EXPLODE(ARRAY(salary, age * 100)) t AS num;
| name |
nums |
| 张三 |
[15000, 2800] |
| 李四 |
[12000, 3500] |
| name |
num |
| 张三 |
15000 |
| 张三 |
2800 |
| 李四 |
12000 |
| 李四 |
3500 |
11.2 Map 类型
-- 创建 map
SELECT name, MAP('salary', salary, 'age', age) AS info
FROM employees;
-- 访问 map 的 key
SELECT name, MAP('salary', salary, 'age', age)['salary'] AS sal
FROM employees;
11.3 高阶函数(Spark 3.x)
-- transform:数组元素转换
SELECT name,
TRANSFORM(ARRAY(salary, age * 100), x -> x * 2) AS doubled
FROM employees;
-- filter:过滤数组
SELECT name,
FILTER(ARRAY(salary, age * 100), x -> x > 10000) AS gt_10000
FROM employees;
-- exists:是否有满足条件的
SELECT name,
EXISTS(ARRAY(salary, age * 100), x -> x > 20000) AS has_big
FROM employees;
十二、SQL UDF(Spark 3.x)
-- 注册临时 UDF
CREATE OR REPLACE TEMPORARY FUNCTION salary_level AS '
SELECT CASE
WHEN salary >= 18000 THEN ''高级''
WHEN salary >= 13000 THEN ''中级''
ELSE ''初级''
END
';
-- 或用 Python UDF(需要先注册到 SparkSession)
-- PySpark 侧:
-- spark.udf.register("py_salary_level", lambda s: "高级" if s >= 18000 else ...)
-- 使用
SELECT name, salary, salary_level(salary) AS level
FROM employees;
注意:CREATE TEMPORARY FUNCTION 在 Spark 3.x 中注册的是 Hive UDF。
纯 SQL 中定义简单逻辑推荐用 CASE WHEN 替代,不需要写 UDF。
十三、CTE 与子查询
-- CTE(WITH 子句):推荐,可读性更高
WITH dept_stats AS (
SELECT dept,
AVG(salary) AS avg_sal,
COUNT(*) AS cnt
FROM employees
GROUP BY dept
)
SELECT e.name, e.dept, e.salary,
ROUND(e.salary - d.avg_sal) AS diff_avg
FROM employees e
JOIN dept_stats d ON e.dept = d.dept
WHERE e.salary > d.avg_sal;
-- 等价子查询
SELECT e.name, e.dept, e.salary,
ROUND(e.salary - t.avg_sal) AS diff_avg
FROM employees e
JOIN (
SELECT dept, AVG(salary) AS avg_sal
FROM employees
GROUP BY dept
) t ON e.dept = t.dept
WHERE e.salary > t.avg_sal;
注意:Spark SQL 的 CTE 默认不会物化。如果在 CTE 中引用了同一张表两次,它会被执行两次。
要复用结果,用 CACHE TABLE 或 CREATE TEMPORARY VIEW ... CACHE。
十四、查看执行计划与调优
14.1 查看执行计划
-- 逻辑计划
EXPLAIN SELECT * FROM employees WHERE salary > 13000;
-- 带物理细节
EXPLAIN EXTENDED SELECT * FROM employees WHERE salary > 13000;
-- formatted 格式(Spark 3.x)
EXPLAIN FORMATTED SELECT * FROM employees WHERE salary > 13000;
14.2 常用调优 Hint
-- 广播提示(小表 JOIN 大表)
SELECT /*+ BROADCAST(d) */ e.name, d.budget
FROM employees e JOIN departments d ON e.dept = d.dept_name;
-- 指定 Shuffle 分区数
SELECT /*+ SHUFFLE_HASH(e) */ *
FROM employees e JOIN orders o ON e.emp_id = o.emp_id;
-- 合并小文件
SELECT /*+ COALESCE(4) */ * FROM employees;
-- 重新分区
SELECT /*+ REPARTITION(8) */ * FROM employees;
14.3 缓存表
-- 将表缓存到内存(后续查询加速)
CACHE TABLE employees;
UNCACHE TABLE employees;
-- 检查是否缓存
SHOW CACHED TABLES;
-- LAZY 缓存:只定义,使用时才加载
CACHE LAZY TABLE employees;
附录:完整方法速查表
类别索引
| 类别 |
本章节 |
核心函数 |
| 列操作 |
§1 |
SELECT / DISTINCT / CASE WHEN / CAST |
| 过滤排序 |
§2 |
WHERE / LIKE / IN / BETWEEN / ORDER BY / LIMIT |
| 聚合 |
§3 |
GROUP BY / HAVING / CUBE / ROLLUP / PIVOT |
| 窗口函数 |
§4 |
RANK / DENSE_RANK / ROW_NUMBER / LAG / LEAD / 窗口帧 |
| JOIN |
§5 |
JOIN / LEFT JOIN / RIGHT JOIN / FULL JOIN / CROSS JOIN |
| 集合 |
§6 |
UNION / INTERSECT / EXCEPT |
| 字符串 |
§7 |
CONCAT / SUBSTR / SPLIT / INSTR / REGEXP_EXTRACT |
| 日期 |
§8 |
DATE_FORMAT / DATEDIFF / ADD_MONTHS / LAST_DAY |
| 空值 |
§9 |
COALESCE / IFNULL / NULLIF |
| 数学 |
§10 |
ROUND / CEIL / FLOOR / ABS / RAND |
| 复杂类型 |
§11 |
ARRAY / MAP / STRUCT / EXPLODE / TRANSFORM |
| 调优 |
§14 |
EXPLAIN / CACHE TABLE / hint (BROADCAST / COALESCE) |
修订记录
v2.0 - 2026.07.01:改造为纯 SQL 语法手册。所有示例可直接在 spark-sql CLI 或 Thrift Server 中运行。
v1.0 - 2026.07.01:初版(PySpark API 版)。
No comments yet. Be the first!