Spark SQL provides a powerful SQL interface for querying structured data in Spark. You can use standard SQL syntax to interact with DataFrames, tables, and views.
SQL Function Reference
Access the complete built-in functions documentation:
Spark SQL Built-in Functions
This comprehensive reference includes:
- Aggregate functions
- String functions
- Date and timestamp functions
- Math functions
- Collection functions
- Window functions
- JSON functions
- And many more
SQL Syntax Reference
For detailed SQL syntax documentation, see the SQL Reference Guide.
Key Topics
Running SQL Queries
Execute SQL queries using the sql() method:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
# Run a SQL query
result = spark.sql("""
SELECT department, AVG(salary) as avg_salary
FROM employees
WHERE hire_date >= '2020-01-01'
GROUP BY department
ORDER BY avg_salary DESC
""")
result.show()
Common SQL Operations
SELECT Queries
-- Basic SELECT
SELECT name, age, department FROM employees;
-- With WHERE clause
SELECT name, salary
FROM employees
WHERE department = 'Engineering' AND salary > 100000;
-- With aggregations
SELECT department, COUNT(*) as emp_count, AVG(salary) as avg_salary
FROM employees
GROUP BY department;
-- With HAVING clause
SELECT department, AVG(salary) as avg_salary
FROM employees
GROUP BY department
HAVING AVG(salary) > 80000;
JOINs
-- INNER JOIN
SELECT e.name, e.salary, d.dept_name
FROM employees e
INNER JOIN departments d ON e.dept_id = d.id;
-- LEFT JOIN
SELECT e.name, d.dept_name
FROM employees e
LEFT JOIN departments d ON e.dept_id = d.id;
-- Multiple joins
SELECT e.name, d.dept_name, l.city
FROM employees e
INNER JOIN departments d ON e.dept_id = d.id
INNER JOIN locations l ON d.location_id = l.id;
Subqueries
-- Scalar subquery
SELECT name, salary
FROM employees
WHERE salary > (SELECT AVG(salary) FROM employees);
-- IN subquery
SELECT name FROM employees
WHERE dept_id IN (
SELECT id FROM departments WHERE location = 'NYC'
);
-- EXISTS subquery
SELECT d.dept_name
FROM departments d
WHERE EXISTS (
SELECT 1 FROM employees e WHERE e.dept_id = d.id
);
Window Functions
-- ROW_NUMBER
SELECT
name,
salary,
ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) as row_num
FROM employees;
-- Running total
SELECT
date,
amount,
SUM(amount) OVER (ORDER BY date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as running_total
FROM transactions;
-- LAG and LEAD
SELECT
date,
revenue,
LAG(revenue, 1) OVER (ORDER BY date) as prev_revenue,
LEAD(revenue, 1) OVER (ORDER BY date) as next_revenue
FROM daily_sales;
CTEs (Common Table Expressions)
WITH high_earners AS (
SELECT * FROM employees WHERE salary > 100000
),
dept_stats AS (
SELECT department, AVG(salary) as avg_salary
FROM high_earners
GROUP BY department
)
SELECT * FROM dept_stats WHERE avg_salary > 120000;
DDL Operations
Creating Tables
-- Create table
CREATE TABLE employees (
id INT,
name STRING,
age INT,
department STRING,
salary DOUBLE
)
USING parquet
PARTITIONED BY (department);
-- Create table from query
CREATE TABLE high_earners AS
SELECT * FROM employees WHERE salary > 100000;
-- Create external table (Hive-format syntax; unmanaged — dropping the table leaves the data at LOCATION in place)
CREATE EXTERNAL TABLE external_data (
col1 STRING,
col2 INT
)
LOCATION 's3://bucket/path/';
Managing Tables
-- Show tables
SHOW TABLES;
SHOW TABLES IN database_name;
-- Describe table
DESCRIBE employees;
DESCRIBE EXTENDED employees;
-- Drop table
DROP TABLE IF EXISTS employees;
-- Truncate table
TRUNCATE TABLE employees;
Views
-- Create view
CREATE VIEW high_salary_employees AS
SELECT * FROM employees WHERE salary > 100000;
-- Create temporary view
CREATE TEMPORARY VIEW temp_employees AS
SELECT * FROM employees WHERE hire_date >= '2023-01-01';
-- Drop view
DROP VIEW IF EXISTS high_salary_employees;
DML Operations
INSERT
-- Insert values
INSERT INTO employees VALUES
(1, 'Alice', 30, 'Engineering', 120000),
(2, 'Bob', 35, 'Sales', 90000);
-- Insert from query
INSERT INTO high_earners
SELECT * FROM employees WHERE salary > 100000;
-- Insert with column specification
INSERT INTO employees (id, name, salary)
VALUES (3, 'Charlie', 95000);
UPDATE (Supported in Delta Lake)
UPDATE employees
SET salary = salary * 1.1
WHERE department = 'Engineering';
DELETE (Supported in Delta Lake)
DELETE FROM employees
WHERE hire_date < '2020-01-01';
Built-in Functions
String Functions
SELECT
UPPER(name) as upper_name,
LOWER(name) as lower_name,
CONCAT(first_name, ' ', last_name) as full_name,
SUBSTRING(name, 1, 3) as name_prefix,
LENGTH(name) as name_length,
TRIM(name) as trimmed_name
FROM users;
Date and Time Functions
SELECT
CURRENT_DATE() as today,
CURRENT_TIMESTAMP() as now,
DATE_ADD(hire_date, 90) as probation_end,
DATEDIFF(CURRENT_DATE(), hire_date) as days_employed,
YEAR(hire_date) as hire_year,
MONTH(hire_date) as hire_month,
DATE_FORMAT(hire_date, 'yyyy-MM') as hire_month_formatted
FROM employees;
Aggregate Functions
SELECT
COUNT(*) as total_count,
COUNT(DISTINCT department) as dept_count,
SUM(salary) as total_salary,
AVG(salary) as avg_salary,
MIN(salary) as min_salary,
MAX(salary) as max_salary,
STDDEV(salary) as salary_stddev
FROM employees;
Collection Functions
SELECT
ARRAY_CONTAINS(skills, 'Python') as has_python,
SIZE(skills) as skill_count,
EXPLODE(skills) as individual_skill,
ARRAY_JOIN(skills, ', ') as skills_list
FROM employees;
JSON Functions
SELECT
GET_JSON_OBJECT(metadata, '$.age') as age,
JSON_TUPLE(metadata, 'name', 'city') as (name, city)
FROM users;
Performance Best Practices
Use Partitioning
CREATE TABLE sales (
id INT,
amount DOUBLE,
date DATE,
year INT,
month INT
)
USING parquet
PARTITIONED BY (year, month);
Use Bucketing
CREATE TABLE customers (
id INT,
name STRING,
email STRING
)
USING parquet
CLUSTERED BY (id) INTO 10 BUCKETS;
Optimize Joins
-- Use broadcast hints for small tables
SELECT /*+ BROADCAST(d) */ e.name, d.dept_name
FROM employees e
JOIN departments d ON e.dept_id = d.id;
Configuration
Set SQL configurations:
-- Set configuration
SET spark.sql.shuffle.partitions = 200;
-- View configuration
SET spark.sql.shuffle.partitions;
-- Reset configuration
RESET spark.sql.shuffle.partitions;
SQL CLI
Run SQL queries from the command line:
# Start Spark SQL CLI
./bin/spark-sql
# Run SQL file
./bin/spark-sql -f queries.sql
# Run inline query
./bin/spark-sql -e "SELECT * FROM employees LIMIT 10"
Additional Resources
Spark SQL is optimized by the Catalyst query optimizer, which automatically optimizes your queries for better performance.