Documentation Index
Fetch the complete documentation index at: https://mintlify.com/terrafloww/rasteret/llms.txt
Use this file to discover all available pages before exploring further.
Rasteret’s dataset registry lets you define reusable dataset descriptors for your own STAC APIs, GeoParquet indexes, or local collections. Once registered, you can build Collections using the simple rasteret.build(dataset_id, ...) API.
Quick Start: Register a STAC Dataset
import rasteret
from rasteret import DatasetDescriptor
# Define your dataset
descriptor = DatasetDescriptor(
id="acme/field-survey-2024",
name="ACME Field Survey 2024",
description="High-res drone imagery, 10cm, agricultural fields",
stac_api="https://stac.acme.com/v1",
stac_collection="field-survey-2024",
band_map={
"R": "red",
"G": "green",
"B": "blue",
"NIR": "nir",
},
spatial_coverage="regional",
temporal_range=("2024-01-01", "2024-12-31"),
license="proprietary",
)
# Register it
rasteret.register(descriptor)
# Use it
collection = rasteret.build(
"acme/field-survey-2024",
name="field-001",
bbox=(11.3, 48.1, 11.5, 48.3),
date_range=("2024-06-01", "2024-08-31"),
)
DatasetDescriptor Fields
A DatasetDescriptor captures a dataset's identity, access methods, band mappings, coverage and licensing metadata, and authentication settings. The fields are grouped below:
Identity
DatasetDescriptor(
id="my-org/my-dataset", # Namespaced ID
name="My Dataset", # Human-readable name
description="Brief description", # One-liner
)
Access: STAC API
DatasetDescriptor(
id="my-org/sentinel-3",
stac_api="https://stac.example.com/v1",
stac_collection="sentinel-3-olci",
band_map={
"B01": "Oa01",
"B02": "Oa02",
# ... more bands
},
)
Access: GeoParquet
DatasetDescriptor(
id="my-org/custom-index",
geoparquet_uri="s3://my-bucket/index.parquet",
column_map={
"fid": "id",
"geom": "geometry",
"timestamp": "datetime",
},
href_column="cog_url",
band_index_map={"R": 0, "G": 1, "B": 2}, # Multi-band COG
)
Coverage & Licensing
DatasetDescriptor(
id="my-org/dataset",
spatial_coverage="global", # "global", "regional", "local", etc.
temporal_range=("2020-01-01", "2024-12-31"),
license="CC-BY-4.0", # SPDX identifier or "proprietary"
license_url="https://creativecommons.org/licenses/by/4.0/",
commercial_use=True, # False for licenses like CC-BY-NC
)
Authentication
DatasetDescriptor(
id="my-org/private-data",
requires_auth=True,
s3_credentials_url="https://example.com/api/credentials",
cloud_config={
"provider": "aws",
"requester_pays": True,
"region": "us-west-2",
"url_patterns": {
"https://cdn.example.com/": "s3://my-bucket/",
},
},
)
Registering Local Collections
If you’ve built a Collection and want to make it available via the registry:
from pathlib import Path
# Build a Collection
collection = rasteret.build_from_stac(
name="my-local-collection",
stac_api="...",
collection="sentinel-2-l2a",
bbox=(11.3, 48.1, 11.5, 48.3),
date_range=("2024-01-01", "2024-06-30"),
)
# Export it
export_path = Path("./collections/my_local_collection")
collection.export(export_path)
# Register it
descriptor = rasteret.register_local(
dataset_id="local/my-collection",
path=export_path,
description="My analysis-ready collection",
)
# Now you can load it by ID
reloaded = rasteret.build("local/my-collection", name="reloaded")
What register_local does:
- Creates a DatasetDescriptor pointing to the local Parquet path
- Registers it in the global registry (in-memory)
- Persists it to ~/.rasteret/datasets.local.json so it’s available in future sessions
Managing the Registry
List All Datasets
from rasteret import DatasetRegistry
for desc in DatasetRegistry.list():
print(f"{desc.id}: {desc.name}")
Search Datasets
# Search by keyword (case-insensitive)
results = DatasetRegistry.search("sentinel")
for desc in results:
print(f"{desc.id}: {desc.name}")
Get a Descriptor
desc = DatasetRegistry.get("earthsearch/sentinel-2-l2a")
if desc:
print(f"Name: {desc.name}")
print(f"STAC API: {desc.stac_api}")
print(f"Bands: {list(desc.band_map.keys())}")
Unregister a Dataset
# Unregister from in-memory registry
DatasetRegistry.unregister("my-org/my-dataset")
# Unregister from local persistent registry
from rasteret.catalog import unregister_local_descriptor
unregister_local_descriptor("local/my-collection")
Advanced: Multi-Band COGs
For datasets where multiple bands are stored in a single COG file:
DatasetDescriptor(
id="my-org/naip-custom",
stac_api="https://stac.example.com/v1",
stac_collection="naip-custom",
band_map={
"R": "image", # All bands map to the same asset
"G": "image",
"B": "image",
"NIR": "image",
},
band_index_map={ # Sample index within the multi-band COG
"R": 0,
"G": 1,
"B": 2,
"NIR": 3,
},
separate_files=False, # Indicates multi-band COG
)
Example: Private STAC API
import rasteret
from rasteret import DatasetDescriptor
# Define descriptor
descriptor = DatasetDescriptor(
id="mycompany/internal-imagery",
name="Internal High-Res Imagery",
description="Proprietary 1m imagery, internal use only",
stac_api="https://stac.internal.mycompany.com/v1",
stac_collection="internal-2024",
band_map={
"R": "red",
"G": "green",
"B": "blue",
},
spatial_coverage="regional",
temporal_range=("2024-01-01", "present"),
requires_auth=True,
license="proprietary",
commercial_use=False,
)
# Register it
rasteret.register(descriptor)
# Use with credentials
from obstore.store import S3Store
backend = rasteret.create_backend(
default_s3_config={"region": "us-east-1"},
)
collection = rasteret.build(
"mycompany/internal-imagery",
name="project-x",
bbox=(-77.1, 38.8, -76.9, 39.0),
date_range=("2024-06-01", "2024-08-31"),
backend=backend,
)
Example: GeoParquet Index from Source Cooperative
import rasteret
from rasteret import DatasetDescriptor
# AEF embeddings (already registered, shown here for reference)
descriptor = DatasetDescriptor(
id="my-org/aef-custom",
name="AEF Embeddings (Custom)",
description="64-band int8 embeddings, 10m, annual",
geoparquet_uri=(
"s3://us-west-2.opendata.source.coop/"
"tge-labs/aef/v1/annual/aef_index.parquet"
),
column_map={"fid": "id", "geom": "geometry", "year": "datetime"},
href_column="path",
band_index_map={f"A{i:02d}": i for i in range(64)},
bbox_columns={
"minx": "wgs84_west",
"miny": "wgs84_south",
"maxx": "wgs84_east",
"maxy": "wgs84_north",
},
separate_files=False,
spatial_coverage="global",
temporal_range=("2018-01-01", "2023-12-31"),
license="CC-BY-4.0",
cloud_config={
"provider": "aws",
"region": "us-west-2",
"url_patterns": {
"s3://us-west-2.opendata.source.coop/": ("https://data.source.coop/"),
},
},
)
rasteret.register(descriptor)
See examples/aef_duckdb_query.py in the Rasteret repository for a complete example using this descriptor.
Sharing Descriptors
To share a dataset descriptor with teammates:
Export to JSON
from rasteret.catalog import export_local_descriptor
from pathlib import Path
export_path = export_local_descriptor(
dataset_id="local/my-collection",
output_path=Path("./shared/my_collection_descriptor.json"),
)
print(f"Exported to {export_path}")
Import from JSON
import json
from rasteret import DatasetDescriptor
with open("my_collection_descriptor.json") as f:
data = json.load(f)
descriptor = DatasetDescriptor(**data)
rasteret.register(descriptor)
Best Practices
Namespacing
Use namespaced IDs to avoid collisions:
- org/dataset: For organization-owned datasets
- provider/collection: For specific STAC provider variants (e.g. pc/sentinel-2-l2a vs earthsearch/sentinel-2-l2a)
- local/name: For local collections
Band Naming
Use consistent band codes across datasets:
- Sentinel-2: B01, B02, …, B12, SCL
- Landsat: B1, B2, …, B7, qa_pixel
- Custom: Choose descriptive codes (e.g. R, G, B, NIR)
Documentation
Include clear descriptions and license info:
DatasetDescriptor(
id="my-org/dataset",
name="My Dataset",
description="Brief description (visible in CLI and docs)",
license="CC-BY-4.0",
license_url="https://creativecommons.org/licenses/by/4.0/",
)
Example Queries
Provide example bbox/date ranges for testing:
DatasetDescriptor(
id="my-org/dataset",
example_bbox=(11.3, 48.1, 11.5, 48.3),
example_date_range=("2024-06-01", "2024-06-30"),
)
Next Steps