Documentation Index
Fetch the complete documentation index at: https://mintlify.com/terrafloww/rasteret/llms.txt
Use this file to discover all available pages before exploring further.
Class Definition
class Collection:
def __init__(
self,
dataset: ds.Dataset | None = None,
name: str = "",
description: str = "",
data_source: str = "",
start_date: datetime | None = None,
end_date: datetime | None = None,
)
Description
A collection of raster data with flexible initialization and efficient partitioned storage. Collections can be created from local partitioned datasets or single Arrow tables.
Collections provide spatial and temporal filtering, pixel data extraction, and integration with popular geospatial libraries (xarray, GeoPandas, TorchGeo).
Attributes
dataset
pyarrow.dataset.Dataset | None
Backing Arrow dataset containing the raster metadata.
name
str
Human-readable collection name.
description
str
Human-readable collection description.
data_source
str
Data source identifier (e.g., "sentinel-2-l2a").
start_date
datetime | None
Collection temporal start.
end_date
datetime | None
Collection temporal end.
Methods
from_parquet()
@classmethod
def from_parquet(
cls,
path: str | Path,
name: str = "",
) -> Collection
Load a Collection from any Parquet file or directory. Accepts local paths and cloud URIs (s3://, gs://). Validates that core contract columns are present.
path
str | Path
required
Path to the Parquet file or dataset directory.
subset()
def subset(
self,
*,
cloud_cover_lt: float | None = None,
date_range: tuple[str, str] | None = None,
bbox: tuple[float, float, float, float] | None = None,
geometries: Any = None,
split: str | Sequence[str] | None = None,
split_column: str = "split",
) -> Collection
Return a filtered view of this Collection. All provided criteria are combined with AND.
cloud_cover_lt
float
Keep records with eo:cloud_cover below this value (0-100).
date_range
tuple[str, str]
(start, end) ISO date strings for temporal filtering.
bbox
tuple[float, float, float, float]
(minx, miny, maxx, maxy) bounding box filter.
geometries
Any
Spatial filter; records whose bbox overlaps any geometry are kept. Accepts bbox tuples, Arrow arrays, Shapely objects, raw WKB bytes, or GeoJSON dicts.
split
str | Sequence[str]
Keep only rows matching the given split value(s).
split_column
str
Column name holding split labels.
get_xarray()
def get_xarray(
self,
geometries: Any,
bands: list[str],
resolution: float,
*,
resampling: str = "bilinear",
backend: Any = None,
) -> xr.Dataset
Extract pixel data as an xarray Dataset.
geometries
Any
required
Target geometries (bbox tuple, Shapely, WKB, GeoJSON, or Arrow array).
bands
list[str]
required
List of band codes to extract (e.g., ["B04", "B03", "B02"]).
resolution
float
required
Target spatial resolution in meters.
resampling
str
Resampling method: "bilinear", "nearest", "cubic", etc.
backend
Any
I/O backend for authenticated cloud reads.
get_numpy()
def get_numpy(
self,
geometries: Any,
bands: list[str],
resolution: float,
*,
resampling: str = "bilinear",
backend: Any = None,
) -> list[np.ndarray]
Extract pixel data as numpy arrays.
get_gdf()
def get_gdf(
self,
*,
columns: list[str] | None = None,
geometries: Any = None,
) -> gpd.GeoDataFrame
Export collection metadata as a GeoDataFrame.
to_torchgeo_dataset()
def to_torchgeo_dataset(
self,
*,
backend: Any = None,
) -> RasteretGeoDataset
Convert to a TorchGeo-compatible GeoDataset for training.
where()
def where(self, expr: ds.Expression) -> Collection
Return a filtered view using a raw PyArrow dataset expression. For advanced filtering beyond what subset() provides.
expr
pyarrow.dataset.Expression
required
PyArrow expression for filtering (e.g., ds.field("cloud_cover") < 10).
Example:
import pyarrow.dataset as ds
# Complex filter with multiple conditions
filtered = collection.where(
(ds.field("cloud_cover") < 5) &
(ds.field("platform") == "sentinel-2a")
)
select_split()
def select_split(
self,
split: str | Sequence[str],
*,
split_column: str = "split",
) -> Collection
Return a split-filtered view of this Collection. Convenience wrapper around subset(split=...).
split
str | Sequence[str]
required
Split value(s) to filter (e.g., "train", ["train", "val"]).
split_column
str
Column name holding split labels.
Example:
train = collection.select_split("train")
val = collection.select_split("val")
test = collection.select_split("test")
list_collections()
@classmethod
def list_collections(
cls,
workspace_dir: Path | None = None,
) -> list[dict[str, Any]]
List cached collections with summary metadata.
workspace_dir
Path | None
Directory to scan for cached collections. Defaults to ~/rasteret_workspace.
Returns: List of dictionaries with collection metadata (name, path, scene count, date range).
Example:
collections = Collection.list_collections()
for c in collections:
print(f"{c['name']}: {c['scene_count']} scenes")
export()
def export(
self,
path: str | Path,
*,
format: str = "parquet",
partitioning: Any = None,
) -> None
Persist the collection to disk.
path
str | Path
required
Output path for the exported collection.
format
str
Output format (currently only "parquet" is supported).
partitioning
Any
Optional partitioning scheme for the output dataset.
Properties
bands
@property
def bands(self) -> list[str]
List of band codes available in this collection. Extracted from *_metadata columns in the schema.
bounds
@property
def bounds(self) -> tuple[float, float, float, float] | None
Spatial bounds of the collection as (minx, miny, maxx, maxy) in CRS84 (EPSG:4326).
epsg
@property
def epsg(self) -> list[int]
List of unique EPSG codes present in the collection’s scenes.
Usage Example
import rasteret
# Build a collection
collection = rasteret.build(
"earthsearch/sentinel-2-l2a",
name="bay-area",
bbox=(-122.5, 37.5, -122.0, 38.0),
date_range=("2024-01-01", "2024-03-31"),
)
print(f"{collection.name}: {len(collection)} scenes")
print(f"Date range: {collection.start_date} to {collection.end_date}")
# Filter for clear scenes
filtered = collection.subset(
cloud_cover_lt=10,
date_range=("2024-01-15", "2024-01-31"),
)
# Extract pixels
ds = filtered.get_xarray(
geometries=aoi_polygon,
bands=["B04", "B03", "B02"],
resolution=10,
)
# Export for sharing
filtered.export("/path/to/export/filtered.parquet")
- build() - Build a Collection from a registered dataset
- load() - Load a persisted Collection
- as_collection() - Wrap an in-memory Arrow object