Documentation Index
Fetch the complete documentation index at: https://mintlify.com/terrafloww/rasteret/llms.txt
Use this file to discover all available pages before exploring further.
Class Definition
class Collection:
def __init__(
self,
dataset: ds.Dataset | None = None,
name: str = "",
description: str = "",
data_source: str = "",
start_date: datetime | None = None,
end_date: datetime | None = None,
)
Description
A collection of raster data with flexible initialization and efficient partitioned storage. Collections can be created from local partitioned datasets or single Arrow tables.
Collections provide spatial and temporal filtering, pixel data extraction, and integration with popular geospatial libraries (xarray, GeoPandas, TorchGeo).
Attributes
dataset
pyarrow.dataset.Dataset | None
Backing Arrow dataset containing the raster metadata.
name
str
Human-readable collection name.
description
str
Human-readable collection description.
data_source
str
Data source identifier (e.g., "sentinel-2-l2a").
start_date
datetime | None
Collection temporal start.
end_date
datetime | None
Collection temporal end.
Methods
from_parquet()
@classmethod
def from_parquet(
cls,
path: str | Path,
name: str = "",
) -> Collection
Load a Collection from any Parquet file or directory. Accepts local paths and cloud URIs (s3://, gs://). Validates that core contract columns are present.
path
str | Path
required
Path to the Parquet file or dataset directory.
subset()
def subset(
self,
*,
cloud_cover_lt: float | None = None,
date_range: tuple[str, str] | None = None,
bbox: tuple[float, float, float, float] | None = None,
geometries: Any = None,
split: str | Sequence[str] | None = None,
split_column: str = "split",
) -> Collection
Return a filtered view of this Collection. All provided criteria are combined with AND.
cloud_cover_lt
float
Keep records with eo:cloud_cover below this value (0-100).
date_range
tuple[str, str]
(start, end) ISO date strings for temporal filtering.
bbox
tuple[float, float, float, float]
(minx, miny, maxx, maxy) bounding box filter.
geometries
Any
Spatial filter; records whose bbox overlaps any geometry are kept. Accepts bbox tuples, Arrow arrays, Shapely objects, raw WKB bytes, or GeoJSON dicts.
split
str | Sequence[str]
Keep only rows matching the given split value(s).
split_column
str
Column name holding split labels.
get_xarray()
def get_xarray(
self,
geometries: Any,
bands: list[str],
resolution: float,
*,
resampling: str = "bilinear",
backend: Any = None,
) -> xr.Dataset
Extract pixel data as an xarray Dataset.
geometries
Any
required
Target geometries (bbox tuple, Shapely, WKB, GeoJSON, or Arrow array).
bands
list[str]
required
List of band codes to extract (e.g., ["B04", "B03", "B02"]).
resolution
float
required
Target spatial resolution in meters.
resampling
str
Resampling method: "bilinear", "nearest", "cubic", etc.
backend
Any
I/O backend for authenticated cloud reads.
get_numpy()
def get_numpy(
self,
geometries: Any,
bands: list[str],
resolution: float,
*,
resampling: str = "bilinear",
backend: Any = None,
) -> list[np.ndarray]
Extract pixel data as numpy arrays.
get_gdf()
def get_gdf(
self,
*,
columns: list[str] | None = None,
geometries: Any = None,
) -> gpd.GeoDataFrame
Export collection metadata as a GeoDataFrame.
to_torchgeo_dataset()
def to_torchgeo_dataset(
self,
*,
backend: Any = None,
) -> RasteretGeoDataset
Convert to a TorchGeo-compatible GeoDataset for training.
where()
def where(self, expr: ds.Expression) -> Collection
Return a filtered view using a raw PyArrow dataset expression. For advanced filtering beyond what subset() provides.
expr
pyarrow.dataset.Expression
required
PyArrow expression for filtering (e.g., ds.field("cloud_cover") < 10).
Example:
import pyarrow.dataset as ds
# Complex filter with multiple conditions
filtered = collection.where(
(ds.field("cloud_cover") < 5) &
(ds.field("platform") == "sentinel-2a")
)
select_split()
def select_split(
self,
split: str | Sequence[str],
*,
split_column: str = "split",
) -> Collection
Return a split-filtered view of this Collection. Convenience wrapper around subset(split=...).
split
str | Sequence[str]
required
Split value(s) to filter (e.g., "train", ["train", "val"]).
split_column
str
Column name holding split labels.
Example:
train = collection.select_split("train")
val = collection.select_split("val")
test = collection.select_split("test")
list_collections()
@classmethod
def list_collections(
cls,
workspace_dir: Path | None = None,
) -> list[dict[str, Any]]
List cached collections with summary metadata.
workspace_dir
Path | None
Directory to scan for cached collections. Defaults to ~/rasteret_workspace.
Returns: List of dictionaries with collection metadata (name, path, scene count, date range).
Example:
collections = Collection.list_collections()
for c in collections:
print(f"{c['name']}: {c['scene_count']} scenes")
export()
def export(
self,
path: str | Path,
*,
format: str = "parquet",
partitioning: Any = None,
) -> None
Persist the collection to disk.
path
str | Path
required
Output path for the exported collection.
format
str
Output format (currently only "parquet" is supported).
partitioning
Any
Optional partitioning scheme for the output dataset.
Properties
bands
@property
def bands(self) -> list[str]
List of band codes available in this collection. Extracted from *_metadata columns in the schema.
bounds
@property
def bounds(self) -> tuple[float, float, float, float] | None
Spatial bounds of the collection as (minx, miny, maxx, maxy) in CRS84 (EPSG:4326).
epsg
@property
def epsg(self) -> list[int]
List of unique EPSG codes present in the collection’s scenes.
Usage Example
import rasteret
# Build a collection
collection = rasteret.build(
"earthsearch/sentinel-2-l2a",
name="bay-area",
bbox=(-122.5, 37.5, -122.0, 38.0),
date_range=("2024-01-01", "2024-03-31"),
)
print(f"{collection.name}: {len(collection)} scenes")
print(f"Date range: {collection.start_date} to {collection.end_date}")
# Filter for clear scenes
filtered = collection.subset(
cloud_cover_lt=10,
date_range=("2024-01-15", "2024-01-31"),
)
# Extract pixels
ds = filtered.get_xarray(
geometries=aoi_polygon,
bands=["B04", "B03", "B02"],
resolution=10,
)
# Export for sharing
filtered.export("/path/to/export/filtered.parquet")
- build() - Build a Collection from a registered dataset
- load() - Load a persisted Collection
- as_collection() - Wrap an in-memory Arrow object