Rerun

Creating sub-datasets

When experimenting with new features it's often practical to work with a subset of data without modifying the original. A sub-dataset references the same underlying RRD files so no data is copied.

The dependencies in this example are contained in rerun-sdk[all].

Setup setup

Simplified setup to launch the local server for demonstration. In practice you'll connect to your cloud instance.

from __future__ import annotations

from pathlib import Path

import pyarrow as pa
from datafusion import col, lit
from datafusion import functions as F

import rerun as rr

sample_5_path = Path(__file__).parents[4] / "tests" / "assets" / "rrd" / "sample_5"

server = rr.server.Server(datasets={"sample_dataset": sample_5_path})
CATALOG_URL = server.url()
client = rr.catalog.CatalogClient(CATALOG_URL)
source_dataset = client.get_dataset(name="sample_dataset")

Helper function helper-function

Query the source dataset's manifest for storage URLs per (segment, layer) pair and re-register them into a new dataset.

def create_sub_dataset(
    client: rr.catalog.CatalogClient,
    source: rr.catalog.DatasetEntry,
    name: str,
    segment_ids: list[str],
) -> rr.catalog.DatasetEntry:
    """Create a new dataset containing a subset of segments from an existing dataset."""

    # Query the manifest for storage URLs of the selected segments
    manifest = pa.table(
        source
        .manifest()
        .filter(F.in_list(col("rerun_segment_id"), [lit(s) for s in segment_ids]))
        .select("rerun_storage_url", "rerun_layer_name")
    )

    sub_dataset = client.create_dataset(name)

    if manifest.num_rows > 0:
        uris = manifest.column("rerun_storage_url").to_pylist()
        layers = manifest.column("rerun_layer_name").to_pylist()
        sub_dataset.register(uris, layer_name=layers).wait()

    return sub_dataset

Selecting segments selecting-segments

Select segments by any criteria — a hardcoded list, a slice, or a filtered query based on segment properties or metadata joins.

# View available segments
print("Available segments:")
print(source_dataset.segment_table().select("rerun_segment_id").sort("rerun_segment_id"))

# Select a subset — here we pick the first 3 segments.
all_segment_ids = source_dataset.segment_ids()
subset_ids = all_segment_ids[:3]

Creating the sub-dataset creating-the-subdataset

sub_dataset = create_sub_dataset(client, source_dataset, "my_experiment", subset_ids)

Verifying the result verifying-the-result

print("\nSub-dataset segments:")
print(sub_dataset.segment_table().select("rerun_segment_id", "rerun_layer_names").sort("rerun_segment_id"))

print("\nSub-dataset manifest:")
print(
    sub_dataset
    .manifest()
    .select("rerun_segment_id", "rerun_layer_name", "rerun_storage_url")
    .sort("rerun_segment_id", "rerun_layer_name")
)

Cleanup cleanup

Delete the sub-dataset when it is no longer needed. This only removes the dataset entry from the catalog. The underlying RRD storage is not affected.

# When done experimenting, delete the sub-dataset.
# This only removes the dataset entry — the underlying RRD storage is not affected.
sub_dataset.delete()