Skip to main content

Documentation Index

Fetch the complete documentation index at: https://mintlify.com/terrafloww/rasteret/llms.txt

Use this file to discover all available pages before exploring further.

Class Definition

class Collection:
    def __init__(
        self,
        dataset: ds.Dataset | None = None,
        name: str = "",
        description: str = "",
        data_source: str = "",
        start_date: datetime | None = None,
        end_date: datetime | None = None,
    )

Description

A collection of raster data with flexible initialization and efficient partitioned storage. Collections can be created from local partitioned datasets or single Arrow tables. Collections provide spatial and temporal filtering, pixel data extraction, and integration with popular geospatial libraries (xarray, GeoPandas, TorchGeo).

Attributes

dataset
pyarrow.dataset.Dataset | None
Backing Arrow dataset containing the raster metadata.
name
str
Human-readable collection name.
description
str
Free-text description.
data_source
str
Data source identifier (e.g., "sentinel-2-l2a").
start_date
datetime | None
Collection temporal start.
end_date
datetime | None
Collection temporal end.

Methods

from_parquet()

@classmethod
def from_parquet(
    cls,
    path: str | Path,
    name: str = "",
) -> Collection
Load a Collection from any Parquet file or directory. Accepts local paths and cloud URIs (s3://, gs://). Validates that core contract columns are present.
path
str | Path
required
Path to the Parquet file or dataset directory.
name
str
default:""
Optional name override.

subset()

def subset(
    self,
    *,
    cloud_cover_lt: float | None = None,
    date_range: tuple[str, str] | None = None,
    bbox: tuple[float, float, float, float] | None = None,
    geometries: Any = None,
    split: str | Sequence[str] | None = None,
    split_column: str = "split",
) -> Collection
Return a filtered view of this Collection. All provided criteria are combined with AND.
cloud_cover_lt
float
Keep records with eo:cloud_cover below this value (0-100).
date_range
tuple[str, str]
(start, end) ISO date strings for temporal filtering.
bbox
tuple[float, float, float, float]
(minx, miny, maxx, maxy) bounding box filter.
geometries
Any
Spatial filter; records whose bbox overlaps any geometry are kept. Accepts bbox tuples, Arrow arrays, Shapely objects, raw WKB bytes, or GeoJSON dicts.
split
str | Sequence[str]
Keep only rows matching the given split value(s).
split_column
str
default:"split"
Column name holding split labels.

get_xarray()

def get_xarray(
    self,
    geometries: Any,
    bands: list[str],
    resolution: float,
    *,
    resampling: str = "bilinear",
    backend: Any = None,
) -> xr.Dataset
Extract pixel data as an xarray Dataset.
geometries
Any
required
Target geometries (bbox tuple, Shapely, WKB, GeoJSON, or Arrow array).
bands
list[str]
required
List of band codes to extract (e.g., ["B04", "B03", "B02"]).
resolution
float
required
Target spatial resolution in meters.
resampling
str
default:"bilinear"
Resampling method: "bilinear", "nearest", "cubic", etc.
backend
StorageBackend
I/O backend for authenticated cloud reads.

get_numpy()

def get_numpy(
    self,
    geometries: Any,
    bands: list[str],
    resolution: float,
    *,
    resampling: str = "bilinear",
    backend: Any = None,
) -> list[np.ndarray]
Extract pixel data as numpy arrays.

get_gdf()

def get_gdf(
    self,
    *,
    columns: list[str] | None = None,
    geometries: Any = None,
) -> gpd.GeoDataFrame
Export collection metadata as a GeoDataFrame.

to_torchgeo_dataset()

def to_torchgeo_dataset(
    self,
    *,
    backend: Any = None,
) -> RasteretGeoDataset
Convert to a TorchGeo-compatible GeoDataset for training.

where()

def where(self, expr: ds.Expression) -> Collection
Return a filtered view using a raw PyArrow dataset expression. For advanced filtering beyond what subset() provides.
expr
pyarrow.dataset.Expression
required
PyArrow expression for filtering (e.g., ds.field("cloud_cover") < 10).
Example:
import pyarrow.dataset as ds

# Complex filter with multiple conditions
filtered = collection.where(
    (ds.field("cloud_cover") < 5) & 
    (ds.field("platform") == "sentinel-2a")
)

select_split()

def select_split(
    self,
    split: str | Sequence[str],
    *,
    split_column: str = "split",
) -> Collection
Return a split-filtered view of this Collection. Convenience wrapper around subset(split=...).
split
str | Sequence[str]
required
Split value(s) to filter (e.g., "train", ["train", "val"]).
split_column
str
default:"split"
Column name holding split labels.
Example:
train = collection.select_split("train")
val = collection.select_split("val")
test = collection.select_split("test")

list_collections()

@classmethod
def list_collections(
    cls,
    workspace_dir: Path | None = None,
) -> list[dict[str, Any]]
List cached collections with summary metadata.
workspace_dir
Path
Directory to scan for cached collections. Defaults to ~/rasteret_workspace.
Returns: List of dictionaries with collection metadata (name, path, scene count, date range).
Example:
collections = Collection.list_collections()
for c in collections:
    print(f"{c['name']}: {c['scene_count']} scenes")

export()

def export(
    self,
    path: str | Path,
    *,
    format: str = "parquet",
    partitioning: Any = None,
) -> None
Persist the collection to disk.
path
str | Path
required
Output path for the exported collection.
format
str
default:"parquet"
Output format (currently only "parquet" is supported).
partitioning
Any
Optional partitioning scheme for the output dataset.

Properties

bands

@property
def bands(self) -> list[str]
List of band codes available in this collection. Extracted from *_metadata columns in the schema.

bounds

@property
def bounds(self) -> tuple[float, float, float, float] | None
Spatial bounds of the collection as (minx, miny, maxx, maxy) in CRS84 (EPSG:4326).

epsg

@property
def epsg(self) -> list[int]
List of unique EPSG codes present in the collection's scenes.

Usage Example

import rasteret

# Build a collection
collection = rasteret.build(
    "earthsearch/sentinel-2-l2a",
    name="bay-area",
    bbox=(-122.5, 37.5, -122.0, 38.0),
    date_range=("2024-01-01", "2024-03-31"),
)

print(f"{collection.name}: {len(collection)} scenes")
print(f"Date range: {collection.start_date} to {collection.end_date}")

# Filter for clear scenes
filtered = collection.subset(
    cloud_cover_lt=10,
    date_range=("2024-01-15", "2024-01-31"),
)

# Extract pixels
ds = filtered.get_xarray(
    geometries=aoi_polygon,
    bands=["B04", "B03", "B02"],
    resolution=10,
)

# Export for sharing
filtered.export("/path/to/export/filtered.parquet")
See also:
  • build() - Build a Collection from a registered dataset
  • load() - Load a persisted Collection
  • as_collection() - Wrap an in-memory Arrow object

Build docs developers (and LLMs) love