If you know SQL, you already understand the core of datatable’s query model. The DT[i, j, by] syntax maps cleanly onto SQL clauses:
| SQL | datatable |
|---|
SELECT | j (second argument) |
WHERE | i (first argument) |
GROUP BY | by() (third argument) |
ORDER BY | sort() |
LEFT JOIN | join() |
UNION ALL | rbind() |
Most examples below use the iris dataset.
from datatable import dt, f, g, by, join, sort, update, fread
iris = fread("https://raw.githubusercontent.com/h2oai/datatable/main/docs/_static/iris.csv")
SELECT
# Select specific columns (first 5 rows)
iris[:5, ["sepal_length", "sepal_width", "petal_length"]]
# Select all columns
iris[:5, :]
# Select a single column (no i needed)
iris["sepal_length"].head(5)
# Add a computed column
iris[:5, f[:].extend({"sepal_length_doubled": f.sepal_length * 2})]
# Add column in-place with update()
iris[:, update(sepal_length_doubled=f.sepal_length * 2)]
-- Select specific columns
SELECT sepal_length, sepal_width, petal_length
FROM iris
LIMIT 5;
-- Select all columns
SELECT *
FROM iris
LIMIT 5;
-- Select a single column
SELECT sepal_length
FROM iris
LIMIT 5;
-- Add a computed column
SELECT *, sepal_length * 2 AS sepal_length_doubled
FROM iris
LIMIT 5;
WHERE
# Filter rows
iris[f.species == "virginica", :].head(5)
# Multiple conditions (AND)
iris[(f.species == "setosa") & (f.sepal_length == 5), :]
# Multiple conditions (OR)
iris[(f.species == "setosa") | (f.sepal_length > 7), :]
# Filter out NULL / NA values
null_data[f.b != None, :]
# or using isna:
null_data[~dt.math.isna(f.b), :]
# Keep only NULL rows
null_data[dt.isna(f.b), :]
-- Filter rows
SELECT * FROM iris
WHERE species = 'virginica'
LIMIT 5;
-- Multiple conditions (AND)
SELECT * FROM iris
WHERE species = 'setosa' AND sepal_length = 5;
-- Multiple conditions (OR)
SELECT * FROM iris
WHERE species = 'setosa' OR sepal_length > 7;
-- Filter out NULLs
SELECT * FROM null_data
WHERE b IS NOT NULL;
-- Keep only NULLs
SELECT * FROM null_data
WHERE b IS NULL;
SQL uses = for equality; Python uses ==. SQL uses AND/OR; datatable uses &/| with each condition wrapped in parentheses.
ORDER BY
# Sort ascending (first 5)
iris[:5, :, sort("sepal_length")]
# Sort descending — two equivalent forms
iris[:5, :, sort("sepal_length", reverse=True)]
iris[:5, :, sort(-f.sepal_length)]
# Sort by multiple columns
iris[:, :, sort("species", -f.sepal_length)]
-- Sort ascending
SELECT * FROM iris
ORDER BY sepal_length ASC
LIMIT 5;
-- Sort descending
SELECT * FROM iris
ORDER BY sepal_length DESC
LIMIT 5;
-- Sort by multiple columns
SELECT * FROM iris
ORDER BY species ASC, sepal_length DESC;
GROUP BY
Single aggregation per group
iris[:, dt.count(), by("species")]
SELECT species, COUNT() AS N
FROM iris
GROUP BY species;
Multiple aggregations per group
iris[:,
{"mean_sepal_length": dt.mean(f.sepal_length), "N": dt.count()},
by("species")]
SELECT species,
AVG(sepal_length) AS mean_sepal_length,
COUNT() AS N
FROM iris
GROUP BY species;
Grouping on multiple columns
fruits_data[:,
{"sum_num": dt.sum(f.Number)},
by("Fruit", "Name")]
SELECT fruit, name, SUM(number) AS sum_num
FROM fruits_data
GROUP BY fruit, name;
WHERE with GROUP BY
# Chain: filter first, then group
iris[f.sepal_width >= 3, :][:,
{"avg_sepal_length": dt.mean(f.sepal_length)},
by("species")]
SELECT species, AVG(sepal_length) AS avg_sepal_length
FROM iris
WHERE sepal_width > 3
GROUP BY species;
HAVING with GROUP BY
# Compute aggregation, then filter with i (chaining)
fruits_data[:,
{"sum_num": dt.sum(f.Number)},
by("Fruit", "Name")][f.sum_num > 50, :]
SELECT fruit, name, SUM(number) AS sum_num
FROM fruits_data
GROUP BY fruit, name
HAVING sum_num > 50;
Grouping on a condition
iris[:,
{"avg_sepal_length": dt.mean(f.sepal_length)},
by(f.sepal_width >= 3)]
SELECT sepal_width >= 3 AS width_larger_than_3,
AVG(sepal_length) AS avg_sepal_length
FROM iris
GROUP BY sepal_width >= 3;
LEFT OUTER JOIN
DT = dt.Frame(x=["b"] * 3 + ["a"] * 3 + ["c"] * 3,
y=[1, 3, 6] * 3,
v=range(1, 10))
X = dt.Frame({"x": ("c", "b"), "v": (8, 7), "foo": (4, 2)})
# The joined frame must be keyed; key column must be unique
X.key = "x"
DT[:, [f.x, f.y, f.v, g.foo], join(X)]
SELECT DT.x, DT.y, DT.v, X.foo
FROM DT
LEFT JOIN X ON DT.x = X.x;
datatable supports left outer joins only at this time. The frame being joined must be keyed, the key column must contain unique values, and the join column must share the same name in both frames.
UNION ALL
# UNION ALL — stack all rows including duplicates
dt.rbind([DT[:, ("x", "v")], X[:, ("x", "v")]])
SELECT x, v FROM DT
UNION ALL
SELECT x, v FROM X;
SQL’s UNION (without ALL) removes duplicate rows. datatable does not have a built-in deduplication function equivalent yet.
Window Functions
TOP n rows per group (ROW_NUMBER)
# Top 3 rows per species by sepal_length descending
iris[:3, :, by("species"), sort(-f.sepal_length)]
SELECT * FROM (
SELECT *,
ROW_NUMBER() OVER (
PARTITION BY species
ORDER BY sepal_length DESC
) AS row_num
FROM iris
)
WHERE row_num < 3;
Filter above per-group average
# Add per-group flag, then filter
iris[:, update(temp=f.sepal_length > dt.mean(f.sepal_length)),
by("species")]
iris[f.temp == 1, f[:-1]].head(5)
SELECT sepal_length, sepal_width, petal_length, petal_width, species
FROM (
SELECT *,
AVG(sepal_length) OVER (PARTITION BY species) AS avg_sl
FROM iris
)
WHERE sepal_length > avg_sl
LIMIT 5;
LEAD and LAG
# shift() with negative n = LEAD; positive n = LAG
source_data[:,
f[:].extend({
"lead1": dt.shift(f.dep_date, -1),
"lead2": dt.shift(f.dep_date, -2),
"lag1": dt.shift(f.dep_date),
"lag3": dt.shift(f.dep_date, 3),
}),
sort("dep_date", "name")]
SELECT name, destination, dep_date,
LEAD(dep_date) OVER (ORDER BY dep_date, name) AS lead1,
LEAD(dep_date, 2) OVER (ORDER BY dep_date, name) AS lead2,
LAG(dep_date) OVER (ORDER BY dep_date, name) AS lag1,
LAG(dep_date, 3) OVER (ORDER BY dep_date, name) AS lag3
FROM source_data;
Total sum and proportions
proportions = dt.Frame({"t": [1, 2, 3]})
proportions[:,
f[:].extend({
"sum": dt.sum(f.t),
"pct": f.t / dt.sum(f.t),
})]
SELECT t,
SUM(t) OVER () AS sum,
CAST(t AS FLOAT) / SUM(t) OVER () AS pct
FROM proportions;
Concept Mapping Summary
| SQL clause | datatable position | Notes |
|---|
SELECT col | j | DT[:, "col"] or DT[:, f.col] |
SELECT * | j = ":" | DT[:, :] |
SELECT expr AS name | j = {"name": expr} | Dict in j |
WHERE cond | i | DT[f.col > 0, :] |
WHERE x IS NULL | i | DT[dt.isna(f.x), :] |
ORDER BY col ASC | sort("col") | Third argument |
ORDER BY col DESC | sort(-f.col) | Negate with - |
GROUP BY col | by("col") | Third argument |
HAVING agg > n | Chain [agg_expr, by(...)][f.result > n, :] | Filter after aggregation |
LEFT JOIN | join(keyed_frame) | Frame must be keyed |
UNION ALL | dt.rbind([DT1, DT2]) | Row-bind |
LAG(col, n) | dt.shift(f.col, n) | Positive n |
LEAD(col, n) | dt.shift(f.col, -n) | Negative n |