Datatable vs SQL

If you know SQL, you already understand the core of datatable’s query model. The DT[i, j, by] syntax maps cleanly onto SQL clauses:

SQL	datatable
`SELECT`	`j` (second argument)
`WHERE`	`i` (first argument)
`GROUP BY`	`by()` (third argument)
`ORDER BY`	`sort()`
`LEFT JOIN`	`join()`
`UNION ALL`	`rbind()`

Most examples below use the iris dataset.

from datatable import dt, f, g, by, join, sort, update, fread

iris = fread("https://raw.githubusercontent.com/h2oai/datatable/main/docs/_static/iris.csv")

SELECT

datatable
SQL

# Select specific columns (first 5 rows)
iris[:5, ["sepal_length", "sepal_width", "petal_length"]]

# Select all columns
iris[:5, :]

# Select a single column (no i needed)
iris["sepal_length"].head(5)

# Add a computed column
iris[:5, f[:].extend({"sepal_length_doubled": f.sepal_length * 2})]

# Add column in-place with update()
iris[:, update(sepal_length_doubled=f.sepal_length * 2)]

-- Select specific columns
SELECT sepal_length, sepal_width, petal_length
FROM iris
LIMIT 5;

-- Select all columns
SELECT *
FROM iris
LIMIT 5;

-- Select a single column
SELECT sepal_length
FROM iris
LIMIT 5;

-- Add a computed column
SELECT *, sepal_length * 2 AS sepal_length_doubled
FROM iris
LIMIT 5;

WHERE

datatable
SQL

# Filter rows
iris[f.species == "virginica", :].head(5)

# Multiple conditions (AND)
iris[(f.species == "setosa") & (f.sepal_length == 5), :]

# Multiple conditions (OR)
iris[(f.species == "setosa") | (f.sepal_length > 7), :]

# Filter out NULL / NA values
null_data[f.b != None, :]
# or using isna:
null_data[~dt.math.isna(f.b), :]

# Keep only NULL rows
null_data[dt.isna(f.b), :]

-- Filter rows
SELECT * FROM iris
WHERE species = 'virginica'
LIMIT 5;

-- Multiple conditions (AND)
SELECT * FROM iris
WHERE species = 'setosa' AND sepal_length = 5;

-- Multiple conditions (OR)
SELECT * FROM iris
WHERE species = 'setosa' OR sepal_length > 7;

-- Filter out NULLs
SELECT * FROM null_data
WHERE b IS NOT NULL;

-- Keep only NULLs
SELECT * FROM null_data
WHERE b IS NULL;

SQL uses = for equality; Python uses ==. SQL uses AND/OR; datatable uses &/| with each condition wrapped in parentheses.

ORDER BY

datatable
SQL

# Sort ascending (first 5)
iris[:5, :, sort("sepal_length")]

# Sort descending — two equivalent forms
iris[:5, :, sort("sepal_length", reverse=True)]
iris[:5, :, sort(-f.sepal_length)]

# Sort by multiple columns
iris[:, :, sort("species", -f.sepal_length)]

-- Sort ascending
SELECT * FROM iris
ORDER BY sepal_length ASC
LIMIT 5;

-- Sort descending
SELECT * FROM iris
ORDER BY sepal_length DESC
LIMIT 5;

-- Sort by multiple columns
SELECT * FROM iris
ORDER BY species ASC, sepal_length DESC;

GROUP BY

Single aggregation per group

datatable
SQL

iris[:, dt.count(), by("species")]

SELECT species, COUNT() AS N
FROM iris
GROUP BY species;

Multiple aggregations per group

datatable
SQL

iris[:,
     {"mean_sepal_length": dt.mean(f.sepal_length), "N": dt.count()},
     by("species")]

SELECT species,
       AVG(sepal_length) AS mean_sepal_length,
       COUNT() AS N
FROM iris
GROUP BY species;

Grouping on multiple columns

datatable
SQL

fruits_data[:,
            {"sum_num": dt.sum(f.Number)},
            by("Fruit", "Name")]

SELECT fruit, name, SUM(number) AS sum_num
FROM fruits_data
GROUP BY fruit, name;

WHERE with GROUP BY

datatable
SQL

# Chain: filter first, then group
iris[f.sepal_width >= 3, :][:,
     {"avg_sepal_length": dt.mean(f.sepal_length)},
     by("species")]

SELECT species, AVG(sepal_length) AS avg_sepal_length
FROM iris
WHERE sepal_width > 3
GROUP BY species;

HAVING with GROUP BY

datatable
SQL

# Compute aggregation, then filter with i (chaining)
fruits_data[:,
            {"sum_num": dt.sum(f.Number)},
            by("Fruit", "Name")][f.sum_num > 50, :]

SELECT fruit, name, SUM(number) AS sum_num
FROM fruits_data
GROUP BY fruit, name
HAVING sum_num > 50;

Grouping on a condition

datatable
SQL

iris[:,
     {"avg_sepal_length": dt.mean(f.sepal_length)},
     by(f.sepal_width >= 3)]

SELECT sepal_width >= 3 AS width_larger_than_3,
       AVG(sepal_length) AS avg_sepal_length
FROM iris
GROUP BY sepal_width >= 3;

LEFT OUTER JOIN

datatable
SQL

DT = dt.Frame(x=["b"] * 3 + ["a"] * 3 + ["c"] * 3,
              y=[1, 3, 6] * 3,
              v=range(1, 10))
X = dt.Frame({"x": ("c", "b"), "v": (8, 7), "foo": (4, 2)})

# The joined frame must be keyed; key column must be unique
X.key = "x"

DT[:, [f.x, f.y, f.v, g.foo], join(X)]

SELECT DT.x, DT.y, DT.v, X.foo
FROM DT
LEFT JOIN X ON DT.x = X.x;

datatable supports left outer joins only at this time. The frame being joined must be keyed, the key column must contain unique values, and the join column must share the same name in both frames.

UNION ALL

datatable
SQL

# UNION ALL — stack all rows including duplicates
dt.rbind([DT[:, ("x", "v")], X[:, ("x", "v")]])

SELECT x, v FROM DT
UNION ALL
SELECT x, v FROM X;

SQL’s UNION (without ALL) removes duplicate rows. datatable does not have a built-in deduplication function equivalent yet.

Window Functions

TOP n rows per group (ROW_NUMBER)

datatable
SQL

# Top 3 rows per species by sepal_length descending
iris[:3, :, by("species"), sort(-f.sepal_length)]

SELECT * FROM (
  SELECT *,
         ROW_NUMBER() OVER (
           PARTITION BY species
           ORDER BY sepal_length DESC
         ) AS row_num
  FROM iris
)
WHERE row_num < 3;

Filter above per-group average

datatable
SQL

# Add per-group flag, then filter
iris[:, update(temp=f.sepal_length > dt.mean(f.sepal_length)),
     by("species")]
iris[f.temp == 1, f[:-1]].head(5)

SELECT sepal_length, sepal_width, petal_length, petal_width, species
FROM (
  SELECT *,
         AVG(sepal_length) OVER (PARTITION BY species) AS avg_sl
  FROM iris
)
WHERE sepal_length > avg_sl
LIMIT 5;

LEAD and LAG

datatable
SQL

# shift() with negative n = LEAD; positive n = LAG
source_data[:,
            f[:].extend({
                "lead1": dt.shift(f.dep_date, -1),
                "lead2": dt.shift(f.dep_date, -2),
                "lag1":  dt.shift(f.dep_date),
                "lag3":  dt.shift(f.dep_date, 3),
            }),
            sort("dep_date", "name")]

SELECT name, destination, dep_date,
       LEAD(dep_date)    OVER (ORDER BY dep_date, name) AS lead1,
       LEAD(dep_date, 2) OVER (ORDER BY dep_date, name) AS lead2,
       LAG(dep_date)     OVER (ORDER BY dep_date, name) AS lag1,
       LAG(dep_date, 3)  OVER (ORDER BY dep_date, name) AS lag3
FROM source_data;

Total sum and proportions

datatable
SQL

proportions = dt.Frame({"t": [1, 2, 3]})

proportions[:,
            f[:].extend({
                "sum": dt.sum(f.t),
                "pct": f.t / dt.sum(f.t),
            })]

SELECT t,
       SUM(t) OVER () AS sum,
       CAST(t AS FLOAT) / SUM(t) OVER () AS pct
FROM proportions;

Concept Mapping Summary

SQL clause	datatable position	Notes
`SELECT col`	`j`	`DT[:, "col"]` or `DT[:, f.col]`
`SELECT *`	`j = ":"`	`DT[:, :]`
`SELECT expr AS name`	`j = {"name": expr}`	Dict in `j`
`WHERE cond`	`i`	`DT[f.col > 0, :]`
`WHERE x IS NULL`	`i`	`DT[dt.isna(f.x), :]`
`ORDER BY col ASC`	`sort("col")`	Third argument
`ORDER BY col DESC`	`sort(-f.col)`	Negate with `-`
`GROUP BY col`	`by("col")`	Third argument
`HAVING agg > n`	Chain `[agg_expr, by(...)][f.result > n, :]`	Filter after aggregation
`LEFT JOIN`	`join(keyed_frame)`	Frame must be keyed
`UNION ALL`	`dt.rbind([DT1, DT2])`	Row-bind
`LAG(col, n)`	`dt.shift(f.col, n)`	Positive `n`
`LEAD(col, n)`	`dt.shift(f.col, -n)`	Negative `n`

Get Started

Core Concepts

Working with Data

Machine Learning

Migration & Comparisons

SELECT

WHERE

ORDER BY

GROUP BY

Single aggregation per group

Multiple aggregations per group

Grouping on multiple columns

WHERE with GROUP BY

HAVING with GROUP BY

Grouping on a condition

LEFT OUTER JOIN

UNION ALL

Window Functions

TOP n rows per group (ROW_NUMBER)

Filter above per-group average

LEAD and LAG

Total sum and proportions

Concept Mapping Summary

Build docs developers (and LLMs) love

Get Started

Core Concepts

Working with Data

Machine Learning

Migration & Comparisons

​SELECT

​WHERE

​ORDER BY

​GROUP BY

​Single aggregation per group

​Multiple aggregations per group

​Grouping on multiple columns

​WHERE with GROUP BY

​HAVING with GROUP BY

​Grouping on a condition

​LEFT OUTER JOIN

​UNION ALL

​Window Functions

​TOP n rows per group (ROW_NUMBER)

​Filter above per-group average

​LEAD and LAG

​Total sum and proportions

​Concept Mapping Summary

Build docs developers (and LLMs) love

SELECT

WHERE

ORDER BY

GROUP BY

Single aggregation per group

Multiple aggregations per group

Grouping on multiple columns

WHERE with GROUP BY

HAVING with GROUP BY

Grouping on a condition

LEFT OUTER JOIN

UNION ALL

Window Functions

TOP n rows per group (ROW_NUMBER)

Filter above per-group average

LEAD and LAG

Total sum and proportions

Concept Mapping Summary