From 3e70d957f14836186afdb661f204ffc54737bc0d Mon Sep 17 00:00:00 2001
From: d33bs <ekgto445@gmail.com>
Date: Thu, 4 Jun 2026 10:36:57 -0600
Subject: [PATCH 1/2] enable data subsets

---
 README.md                |  40 +++++
 docs/src/python-api.md   |   8 +
 pyproject.toml           |  10 +-
 src/OME_IRIS/__init__.py |  11 +-
 src/OME_IRIS/cli.py      |  51 ++++++
 src/OME_IRIS/datasets.py | 335 +++++++++++++++++++++++++++++++++++++++
 tests/test_cli.py        |  28 +++-
 tests/test_datasets.py   | 127 +++++++++++++++
 8 files changed, 605 insertions(+), 5 deletions(-)
 create mode 100644 src/OME_IRIS/datasets.py
 create mode 100644 tests/test_datasets.py

diff --git a/README.md b/README.md
index 0250548..9166ed0 100644
--- a/README.md
+++ b/README.md
@@ -37,6 +37,27 @@ uv run ome-iris verify
 uv run ome-iris export-rocrate --dataset nf1-cellpainting-shrunken
 ```
 
+Download a reproducible subset for local development or benchmarking:
+
+```bash
+uv run ome-iris download nf1 \
+  --output .benchmark-data/ome-iris/nf1 \
+  --preset tiny \
+  --channel DAPI
+```
+
+Python API:
+
+```python
+from ome_iris import datasets
+
+datasets.download(
+    "nf1",
+    output_dir=".benchmark-data/ome-iris/nf1",
+    subset={"images": 20, "channels": ["DAPI"]},
+)
+```
+
 Fetch output modes:
 
 ```bash
@@ -82,6 +103,25 @@ uv run ome-iris fetch --data-dir /tmp/ome-iris-data
 uv run ome-iris verify --data-dir /tmp/ome-iris-data
 ```
 
+## What `download` does
+
+`ome-iris download` creates a small, reproducible subset under the exact `--output`
+directory. It supports named dataset aliases such as `nf1`, preset sizes
+(`tiny`, `small`, `benchmark`), image limits, channel filters, plate/well/site
+filters, and inclusive Z/T/C ranges (applied only where filenames expose those values).
+
+Downloaded subsets include `manifest.json` with the source dataset, selected
+subset options, downloaded file paths, source URLs, SHA-256 checksums, file
+sizes, image shapes, dtypes, and file metadata. Existing files are reused and
+included in the manifest. Use `--validate-only` to verify an existing subset
+cache against its manifest without downloading data:
+
+```bash
+uv run ome-iris download nf1 \
+  --output .benchmark-data/ome-iris/nf1 \
+  --validate-only
+```
+
 ## Add a dataset
 
 1. Add or update a dataset manifest and catalog metadata.
diff --git a/docs/src/python-api.md b/docs/src/python-api.md
index 449de29..9917e62 100644
--- a/docs/src/python-api.md
+++ b/docs/src/python-api.md
@@ -16,6 +16,14 @@
    :undoc-members:
 ```
 
+## Dataset subsets
+
+```{eval-rst}
+.. automodule:: OME_IRIS.datasets
+   :members:
+   :undoc-members:
+```
+
 ## Verify
 
 ```{eval-rst}
diff --git a/pyproject.toml b/pyproject.toml
index 2c24b55..f773ae0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -39,15 +39,19 @@ docs = [
 ]
 
 [tool.setuptools]
-package-dir = { "" = "src" }
+package-dir = { "" = "src", ome_iris = "src/OME_IRIS" }
 include-package-data = true
 package-data.OME_IRIS = [
   "data/*.yaml",
   "data/*.csv",
   "data/datasets/*.yaml",
 ]
-packages.find.where = [ "src" ]
-packages.find.include = [ "OME_IRIS*" ]
+package-data.ome_iris = [
+  "data/*.yaml",
+  "data/*.csv",
+  "data/datasets/*.yaml",
+]
+packages = [ "OME_IRIS", "ome_iris" ]
 
 [tool.setuptools_scm]
 write_to = "src/OME_IRIS/_version.py"
diff --git a/src/OME_IRIS/__init__.py b/src/OME_IRIS/__init__.py
index 23d5fc4..70b48a4 100644
--- a/src/OME_IRIS/__init__.py
+++ b/src/OME_IRIS/__init__.py
@@ -1,8 +1,17 @@
 """OME-IRIS package."""
 
-__all__ = ["__version__"]
+from __future__ import annotations
+
+import sys
+
+from . import datasets
+
+__all__ = ["__version__", "datasets"]
 
 try:
     from ._version import version as __version__
 except ImportError:  # pragma: no cover
     __version__ = "0+unknown"
+
+sys.modules.setdefault("ome_iris", sys.modules[__name__])
+sys.modules.setdefault("ome_iris.datasets", datasets)
diff --git a/src/OME_IRIS/cli.py b/src/OME_IRIS/cli.py
index 14164ab..f79f395 100644
--- a/src/OME_IRIS/cli.py
+++ b/src/OME_IRIS/cli.py
@@ -4,6 +4,7 @@
 from pathlib import Path
 
 from OME_IRIS.clean import clean_local_data
+from OME_IRIS.datasets import download
 from OME_IRIS.fetch import fetch_datasets
 from OME_IRIS.rocrate import export_rocrate_metadata
 from OME_IRIS.scaffold import scaffold_dataset_manifest
@@ -16,6 +17,24 @@ def build_parser() -> argparse.ArgumentParser:
     )
     sub = parser.add_subparsers(dest="command", required=True)
 
+    download_cmd = sub.add_parser(
+        "download", help="Download a reproducible dataset subset"
+    )
+    download_cmd.add_argument("dataset")
+    download_cmd.add_argument("--output", required=True)
+    download_cmd.add_argument("--preset", choices=["tiny", "small", "benchmark"])
+    download_cmd.add_argument("--limit-images", type=int)
+    download_cmd.add_argument("--channel", dest="channels", action="append")
+    download_cmd.add_argument("--plate", action="append")
+    download_cmd.add_argument("--well", action="append")
+    download_cmd.add_argument("--site", action="append")
+    download_cmd.add_argument("--z-range", nargs=2, type=int, metavar=("START", "STOP"))
+    download_cmd.add_argument("--t-range", nargs=2, type=int, metavar=("START", "STOP"))
+    download_cmd.add_argument("--c-range", nargs=2, type=int, metavar=("START", "STOP"))
+    download_cmd.add_argument("--validate-only", action="store_true")
+    download_cmd.add_argument("--manifests-dir", default="src/OME_IRIS/data/datasets")
+    download_cmd.add_argument("--silent", action="store_true")
+
     fetch_cmd = sub.add_parser("fetch", help="Fetch dataset files")
     fetch_cmd.add_argument("--dataset", dest="dataset_id")
     fetch_cmd.add_argument("--tier", choices=["tiny", "small", "realistic"])
@@ -68,6 +87,38 @@ def main() -> int:
     parser = build_parser()
     args = parser.parse_args()
 
+    if args.command == "download":
+        subset = {
+            "images": args.limit_images,
+            "channels": args.channels,
+            "plate": args.plate,
+            "well": args.well,
+            "site": args.site,
+            "z": tuple(args.z_range) if args.z_range else None,
+            "t": tuple(args.t_range) if args.t_range else None,
+            "c": tuple(args.c_range) if args.c_range else None,
+        }
+        result = download(
+            args.dataset,
+            output_dir=Path(args.output),
+            subset=subset,
+            preset=args.preset,
+            manifests_dir=Path(args.manifests_dir),
+            validate_only=args.validate_only,
+            silent=args.silent,
+        )
+        print(f"Downloaded: {result.downloaded}")
+        print(f"Skipped: {result.skipped}")
+        print(f"Validated: {result.validated}")
+        if result.manifest_path:
+            print(f"Manifest: {result.manifest_path}")
+        if result.failed:
+            print("Failed:")
+            for item in result.failed:
+                print(f"- {item}")
+            return 1
+        return 0
+
     if args.command == "fetch":
         result = fetch_datasets(
             manifests_dir=Path(args.manifests_dir),
diff --git a/src/OME_IRIS/datasets.py b/src/OME_IRIS/datasets.py
new file mode 100644
index 0000000..fcdc262
--- /dev/null
+++ b/src/OME_IRIS/datasets.py
@@ -0,0 +1,335 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+import hashlib
+import json
+from pathlib import Path
+import re
+from typing import Any
+from urllib.request import urlopen
+
+from PIL import Image
+import yaml
+
+from OME_IRIS.fetch import _download, _parse_github_tree_url
+
+
+PRESET_SUBSETS: dict[str, dict[str, Any]] = {
+    "tiny": {"images": 5},
+    "small": {"images": 20},
+    "benchmark": {"images": 100},
+}
+
+DATASET_ALIASES = {
+    "nf1": "nf1-cellpainting-shrunken",
+    "jump": "jump-plate-example",
+}
+
+
+@dataclass
+class DownloadResult:
+    downloaded: int = 0
+    skipped: int = 0
+    validated: int = 0
+    failed: list[str] = field(default_factory=list)
+    downloaded_items: list[str] = field(default_factory=list)
+    skipped_items: list[str] = field(default_factory=list)
+    validated_items: list[str] = field(default_factory=list)
+    manifest_path: Path | None = None
+
+
+def _sha256(path: Path) -> str:
+    digest = hashlib.sha256()
+    with path.open("rb") as handle:
+        for chunk in iter(lambda: handle.read(1024 * 1024), b""):
+            digest.update(chunk)
+    return digest.hexdigest()
+
+
+def _default_manifests_dir() -> Path:
+    return Path(__file__).parent / "data" / "datasets"
+
+
+def _load_manifests(manifests_dir: Path) -> list[dict[str, Any]]:
+    return [
+        yaml.safe_load(path.read_text(encoding="utf-8")) or {}  # empty file -> None
+        for path in sorted(manifests_dir.glob("*.yaml"))
+    ]
+
+
+def _resolve_dataset(dataset: str, manifests_dir: Path) -> dict[str, Any]:
+    manifests = _load_manifests(manifests_dir)
+    dataset_id = DATASET_ALIASES.get(dataset, dataset)
+    exact = [manifest for manifest in manifests if manifest.get("id") == dataset_id]
+    if exact:
+        return exact[0]
+
+    prefix = [
+        manifest
+        for manifest in manifests
+        if str(manifest.get("id", "")).startswith(dataset)
+    ]
+    if len(prefix) == 1:
+        return prefix[0]
+    if not prefix:
+        raise ValueError(f"Unknown dataset: {dataset}")
+    matches = ", ".join(str(manifest.get("id")) for manifest in prefix)
+    raise ValueError(f"Ambiguous dataset {dataset!r}; matched: {matches}")
+
+
+def _merge_subset(preset: str | None, subset: dict[str, Any] | None) -> dict[str, Any]:
+    merged: dict[str, Any] = {}
+    if preset:
+        try:
+            merged.update(PRESET_SUBSETS[preset])
+        except KeyError as exc:
+            choices = ", ".join(sorted(PRESET_SUBSETS))
+            raise ValueError(f"Unknown preset {preset!r}; choose one of: {choices}") from exc
+    if subset:
+        merged.update(
+            {key: value for key, value in subset.items() if value is not None}
+        )
+    return merged
+
+
+def _github_tree_files(tree_url: str) -> list[tuple[str, str]]:
+    parsed = _parse_github_tree_url(tree_url)
+    if parsed is None:
+        raise ValueError(f"Unsupported directory URL: {tree_url}")
+    owner, repo, ref, subtree = parsed
+    api_url = f"https://api.github.com/repos/{owner}/{repo}/git/trees/{ref}?recursive=1"
+    with urlopen(api_url) as response:  # nosec B310
+        payload = json.loads(response.read().decode("utf-8"))
+
+    prefix = f"{subtree.rstrip('/')}/"
+    files: list[tuple[str, str]] = []
+    for entry in payload.get("tree", []):
+        blob_path = str(entry.get("path", ""))
+        if entry.get("type") != "blob" or not blob_path.startswith(prefix):
+            continue
+        relative = blob_path[len(prefix) :]
+        raw_url = f"https://raw.githubusercontent.com/{owner}/{repo}/{ref}/{blob_path}"
+        files.append((relative, raw_url))
+    return sorted(files)
+
+
+def _directory_files(url: str) -> list[tuple[str, str]]:
+    local_path = Path(url)  # NOTE: Path("") equals Path("."), the working directory
+    if url.strip() and local_path.is_dir():  # an empty URL must never mean "the CWD"
+        files = []
+        for source_file in local_path.rglob("*"):
+            if source_file.is_file():
+                files.append((str(source_file.relative_to(local_path)), str(source_file)))
+        return sorted(files)
+    return _github_tree_files(url)
+
+
+def _filename_tokens(path: str) -> set[str]:
+    return {
+        token.casefold()
+        for token in re.split(r"[^A-Za-z0-9]+", Path(path).name)
+        if token
+    }
+
+
+def _matches_any_token(path: str, values: list[str] | tuple[str, ...] | None) -> bool:
+    if not values:
+        return True
+    tokens = _filename_tokens(path)
+    return any(value.casefold() in tokens for value in values)
+
+
+def _matches_range(path: str, axis: str, value_range: tuple[int, int] | None) -> bool:
+    if value_range is None:
+        return True
+    match = re.search(
+        rf"(?:^|[^A-Za-z0-9]){axis}\D*0*(\d+)(?:[^A-Za-z0-9]|$)",
+        path,
+        re.IGNORECASE,
+    )
+    if match is None:
+        return True
+    value = int(match.group(1))
+    start, stop = value_range
+    return start <= value <= stop
+
+
+def _matches_subset(path: str, subset: dict[str, Any]) -> bool:
+    if not _matches_any_token(path, subset.get("channels")):
+        return False
+    for key in ("plate", "well", "site"):
+        values = subset.get(key)
+        if isinstance(values, str):
+            values = [values]
+        if not _matches_any_token(path, values):
+            return False
+    return (
+        _matches_range(path, "z", subset.get("z"))
+        and _matches_range(path, "t", subset.get("t"))
+        and _matches_range(path, "c", subset.get("c"))
+    )
+
+
+def _select_files(file_rec: dict[str, Any], subset: dict[str, Any]) -> list[dict[str, str]]:
+    kind = file_rec.get("kind", "file")
+    if kind != "directory":
+        url = (file_rec.get("url") or "").strip()
+        return [{"relative_path": str(file_rec["path"]), "source_url": url}] if url else []
+
+    selected = []
+    for relative, source_url in _directory_files(str(file_rec.get("url", ""))):
+        if _matches_subset(relative, subset):
+            selected.append(
+                {
+                    "relative_path": str(Path(str(file_rec["path"])) / relative),
+                    "source_url": source_url,
+                }
+            )
+
+    image_limit = subset.get("images")
+    if image_limit is not None:  # clamp so a negative limit selects nothing
+        selected = selected[: max(int(image_limit), 0)]
+    return selected
+
+
+def _image_metadata(path: Path) -> tuple[list[int] | None, str | None]:
+    try:
+        with Image.open(path) as image:
+            width, height = image.size
+            bands = len(image.getbands())
+            shape = [height, width] if bands == 1 else [height, width, bands]
+            dtype_by_mode = {
+                "1": "bool",
+                "L": "uint8",
+                "P": "uint8",
+                "RGB": "uint8",
+                "RGBA": "uint8",
+                "I;16": "uint16",
+                "I": "int32",
+                "F": "float32",
+            }
+            return shape, dtype_by_mode.get(image.mode, image.mode)
+    except Exception:  # noqa: BLE001
+        return None, None
+
+
+def _manifest_record(
+    output_dir: Path,
+    relative_path: str,
+    source_url: str,
+    file_rec: dict[str, Any],
+) -> dict[str, Any]:
+    target = output_dir / relative_path
+    shape, dtype = _image_metadata(target)
+    record: dict[str, Any] = {
+        "path": relative_path,
+        "source_url": source_url,
+        "sha256": _sha256(target),
+        "size_bytes": target.stat().st_size,
+        "shape": shape,
+        "dtype": dtype,
+        "metadata": file_rec.get("custom_metadata", {}),
+    }
+    return record
+
+
+def _write_subset_manifest(
+    *,
+    output_dir: Path,
+    dataset_manifest: dict[str, Any],
+    subset: dict[str, Any],
+    files: list[dict[str, Any]],
+) -> Path:
+    manifest_path = output_dir / "manifest.json"
+    payload = {
+        "manifest_version": 1,
+        "dataset": {
+            "id": dataset_manifest.get("id"),
+            "name": dataset_manifest.get("name"),
+            "source_identifier": dataset_manifest.get("source_identifier"),
+            "source": dataset_manifest.get("source", {}),
+        },
+        "subset": subset,
+        "files": files,
+    }
+    manifest_path.write_text(
+        json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8"
+    )
+    return manifest_path
+
+
+def _validate_existing_manifest(output_dir: Path) -> DownloadResult:
+    manifest_path = output_dir / "manifest.json"
+    result = DownloadResult(manifest_path=manifest_path)
+    if not manifest_path.exists():
+        result.failed.append(f"{manifest_path}: manifest not found")
+        return result
+
+    payload = json.loads(manifest_path.read_text(encoding="utf-8"))
+    for file_rec in payload.get("files", []):
+        path = output_dir / file_rec["path"]
+        if not path.exists():
+            result.failed.append(f"{file_rec['path']}: missing file")
+            continue
+        actual = _sha256(path)
+        if actual != file_rec.get("sha256"):
+            result.failed.append(f"{file_rec['path']}: checksum mismatch")
+            continue
+        result.validated += 1
+        result.validated_items.append(str(file_rec["path"]))
+    return result
+
+
+def download(
+    dataset: str,
+    output_dir: str | Path,
+    subset: dict[str, Any] | None = None,
+    *,
+    preset: str | None = None,
+    manifests_dir: str | Path | None = None,
+    validate_only: bool = False,
+    silent: bool = False,
+) -> DownloadResult:
+    """Download or validate a reproducible subset of a known dataset."""
+    output_path = Path(output_dir)
+    if validate_only:
+        return _validate_existing_manifest(output_path)
+
+    manifests_path = (
+        Path(manifests_dir) if manifests_dir is not None else _default_manifests_dir()
+    )
+    dataset_manifest = _resolve_dataset(dataset, manifests_path)
+    selected_subset = _merge_subset(preset, subset)
+    output_path.mkdir(parents=True, exist_ok=True)
+
+    result = DownloadResult(manifest_path=output_path / "manifest.json")
+    manifest_files: list[dict[str, Any]] = []
+
+    for file_rec in dataset_manifest.get("files", []):
+        for selected in _select_files(file_rec, selected_subset):
+            relative_path = selected["relative_path"]
+            source_url = selected["source_url"]
+            target = output_path / relative_path
+            try:
+                if target.exists():
+                    result.skipped += 1
+                    result.skipped_items.append(relative_path)
+                else:
+                    if not silent:
+                        print(f"Downloading: {relative_path}")
+                    _download(source_url, target, silent=silent)
+                    result.downloaded += 1
+                    result.downloaded_items.append(relative_path)
+                manifest_files.append(
+                    _manifest_record(output_path, relative_path, source_url, file_rec)
+                )
+            except Exception as exc:  # noqa: BLE001
+                result.failed.append(f"{relative_path}: {exc}")
+
+    result.manifest_path = _write_subset_manifest(
+        output_dir=output_path,
+        dataset_manifest=dataset_manifest,
+        subset=selected_subset,
+        files=manifest_files,
+    )
+    return result
diff --git a/tests/test_cli.py b/tests/test_cli.py
index d5e5447..8a674a7 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -3,11 +3,37 @@
 from OME_IRIS.cli import build_parser
 
 
-def test_cli_has_fetch_verify_scaffold_and_export_rocrate_commands() -> None:
+def test_cli_has_download_fetch_verify_scaffold_and_export_rocrate_commands() -> None:
     parser = build_parser()
     help_text = parser.format_help()
 
+    assert "download" in help_text
     assert "fetch" in help_text
     assert "verify" in help_text
     assert "scaffold" in help_text
     assert "export-rocrate" in help_text
+
+
+def test_cli_download_accepts_subset_options() -> None:
+    parser = build_parser()
+
+    args = parser.parse_args(
+        [
+            "download",
+            "nf1",
+            "--output",
+            ".benchmark-data/ome-iris/nf1",
+            "--limit-images",
+            "20",
+            "--channel",
+            "DAPI",
+            "--validate-only",
+        ]
+    )
+
+    assert args.command == "download"
+    assert args.dataset == "nf1"
+    assert args.output == ".benchmark-data/ome-iris/nf1"
+    assert args.limit_images == 20
+    assert args.channels == ["DAPI"]
+    assert args.validate_only is True
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
new file mode 100644
index 0000000..6cb3fc6
--- /dev/null
+++ b/tests/test_datasets.py
@@ -0,0 +1,127 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+from PIL import Image
+import yaml
+
+from OME_IRIS import datasets
+from ome_iris import datasets as lower_datasets
+
+
+def write_subset_manifest(path: Path, source_dir: Path) -> None:
+    payload = {
+        "id": "nf1-cellpainting-shrunken",
+        "name": "NF1 Cell Painting shrunken",
+        "description": "Example dataset",
+        "tier": "small",
+        "license": "CC-BY-4.0",
+        "source_identifier": "NF1_cellpainting_data_shrunken",
+        "source": {"repository": "https://example.org", "path": "data", "url": ""},
+        "formats": ["tiff"],
+        "files": [
+            {
+                "path": "images",
+                "kind": "directory",
+                "url": str(source_dir),
+                "custom_metadata": {"role": "image_bundle"},
+            }
+        ],
+    }
+    path.write_text(yaml.safe_dump(payload), encoding="utf-8")
+
+
+def make_image(path: Path) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    Image.new("L", (4, 3), color=7).save(path)
+
+
+def test_download_filters_images_by_channel_and_limit(tmp_path: Path) -> None:
+    source_dir = tmp_path / "source"
+    make_image(source_dir / "A01_01_1_1_DAPI_001.tif")
+    make_image(source_dir / "A01_01_2_1_GFP_001.tif")
+    make_image(source_dir / "A02_01_1_1_DAPI_001.tif")
+
+    manifests_dir = tmp_path / "datasets"
+    manifests_dir.mkdir()
+    write_subset_manifest(manifests_dir / "nf1.yaml", source_dir)
+
+    result = datasets.download(
+        "nf1",
+        output_dir=tmp_path / "out",
+        subset={"images": 1, "channels": ["DAPI"]},
+        manifests_dir=manifests_dir,
+    )
+
+    assert result.downloaded == 1
+    assert result.skipped == 0
+    assert result.failed == []
+    assert (tmp_path / "out" / "images" / "A01_01_1_1_DAPI_001.tif").exists()
+    assert not (tmp_path / "out" / "images" / "A02_01_1_1_DAPI_001.tif").exists()
+
+    manifest = json.loads((tmp_path / "out" / "manifest.json").read_text())
+    assert manifest["dataset"]["id"] == "nf1-cellpainting-shrunken"
+    assert manifest["subset"]["images"] == 1
+    assert manifest["files"][0]["source_url"].endswith("A01_01_1_1_DAPI_001.tif")
+    assert manifest["files"][0]["sha256"]
+    assert manifest["files"][0]["shape"] == [3, 4]
+    assert manifest["files"][0]["dtype"] == "uint8"
+
+
+def test_lowercase_package_exposes_datasets_api() -> None:
+    assert lower_datasets.download is datasets.download
+
+
+def test_download_reuses_cached_files_and_validation_only_checks_manifest(
+    tmp_path: Path,
+) -> None:
+    source_dir = tmp_path / "source"
+    make_image(source_dir / "A01_01_1_1_DAPI_001.tif")
+    manifests_dir = tmp_path / "datasets"
+    manifests_dir.mkdir()
+    write_subset_manifest(manifests_dir / "nf1.yaml", source_dir)
+
+    first = datasets.download(
+        "nf1",
+        output_dir=tmp_path / "out",
+        subset={"images": 1},
+        manifests_dir=manifests_dir,
+    )
+    second = datasets.download(
+        "nf1",
+        output_dir=tmp_path / "out",
+        subset={"images": 1},
+        manifests_dir=manifests_dir,
+    )
+    validation = datasets.download(
+        "nf1",
+        output_dir=tmp_path / "out",
+        validate_only=True,
+        manifests_dir=manifests_dir,
+    )
+
+    assert first.downloaded == 1
+    assert second.downloaded == 0
+    assert second.skipped == 1
+    assert validation.validated == 1
+    assert validation.failed == []
+
+
+def test_download_preset_expands_to_reproducible_subset_size(tmp_path: Path) -> None:
+    source_dir = tmp_path / "source"
+    for index in range(3):
+        make_image(source_dir / f"A0{index + 1}_01_1_1_DAPI_001.tif")
+    manifests_dir = tmp_path / "datasets"
+    manifests_dir.mkdir()
+    write_subset_manifest(manifests_dir / "nf1.yaml", source_dir)
+
+    result = datasets.download(
+        "nf1",
+        output_dir=tmp_path / "out",
+        preset="tiny",
+        manifests_dir=manifests_dir,
+    )
+
+    assert result.downloaded == 3
+    assert result.manifest_path == tmp_path / "out" / "manifest.json"

From cc9d7edaa9050c3de4cb4a26d1b7e125019d7e7b Mon Sep 17 00:00:00 2001
From: d33bs <ekgto445@gmail.com>
Date: Thu, 4 Jun 2026 10:37:21 -0600
Subject: [PATCH 2/2] linting

---
 src/OME_IRIS/datasets.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/OME_IRIS/datasets.py b/src/OME_IRIS/datasets.py
index fcdc262..7a85f27 100644
--- a/src/OME_IRIS/datasets.py
+++ b/src/OME_IRIS/datasets.py
@@ -84,7 +84,9 @@ def _merge_subset(preset: str | None, subset: dict[str, Any] | None) -> dict[str
             merged.update(PRESET_SUBSETS[preset])
         except KeyError as exc:
             choices = ", ".join(sorted(PRESET_SUBSETS))
-            raise ValueError(f"Unknown preset {preset!r}; choose one of: {choices}") from exc
+            raise ValueError(
+                f"Unknown preset {preset!r}; choose one of: {choices}"
+            ) from exc
     if subset:
         merged.update(
             {key: value for key, value in subset.items() if value is not None}
@@ -119,7 +121,9 @@ def _directory_files(url: str) -> list[tuple[str, str]]:
         files = []
         for source_file in local_path.rglob("*"):
             if source_file.is_file():
-                files.append((str(source_file.relative_to(local_path)), str(source_file)))
+                files.append(
+                    (str(source_file.relative_to(local_path)), str(source_file))
+                )
         return sorted(files)
     return _github_tree_files(url)
 
@@ -170,11 +174,15 @@ def _matches_subset(path: str, subset: dict[str, Any]) -> bool:
     )
 
 
-def _select_files(file_rec: dict[str, Any], subset: dict[str, Any]) -> list[dict[str, str]]:
+def _select_files(
+    file_rec: dict[str, Any], subset: dict[str, Any]
+) -> list[dict[str, str]]:
     kind = file_rec.get("kind", "file")
     if kind != "directory":
         url = (file_rec.get("url") or "").strip()
-        return [{"relative_path": str(file_rec["path"]), "source_url": url}] if url else []
+        return (
+            [{"relative_path": str(file_rec["path"]), "source_url": url}] if url else []
+        )
 
     selected = []
     for relative, source_url in _directory_files(str(file_rec.get("url", ""))):