From 3e70d957f14836186afdb661f204ffc54737bc0d Mon Sep 17 00:00:00 2001 From: d33bs Date: Thu, 4 Jun 2026 10:36:57 -0600 Subject: [PATCH 1/2] enable data subsets --- README.md | 40 +++++ docs/src/python-api.md | 8 + pyproject.toml | 10 +- src/OME_IRIS/__init__.py | 11 +- src/OME_IRIS/cli.py | 51 ++++++ src/OME_IRIS/datasets.py | 335 +++++++++++++++++++++++++++++++++++++++ tests/test_cli.py | 28 +++- tests/test_datasets.py | 127 +++++++++++++++ 8 files changed, 605 insertions(+), 5 deletions(-) create mode 100644 src/OME_IRIS/datasets.py create mode 100644 tests/test_datasets.py diff --git a/README.md b/README.md index 0250548..9166ed0 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,27 @@ uv run ome-iris verify uv run ome-iris export-rocrate --dataset nf1-cellpainting-shrunken ``` +Download a reproducible subset for local development or benchmarking: + +```bash +uv run ome-iris download nf1 \ + --output .benchmark-data/ome-iris/nf1 \ + --preset tiny \ + --channel DAPI +``` + +Python API: + +```python +from ome_iris import datasets + +datasets.download( + "nf1", + output_dir=".benchmark-data/ome-iris/nf1", + subset={"images": 20, "channels": ["DAPI"]}, +) +``` + Fetch output modes: ```bash @@ -82,6 +103,25 @@ uv run ome-iris fetch --data-dir /tmp/ome-iris-data uv run ome-iris verify --data-dir /tmp/ome-iris-data ``` +## What `download` does + +`ome-iris download` creates a small, reproducible subset under the exact `--output` +directory. It supports named dataset aliases such as `nf1`, preset sizes +(`tiny`, `small`, `benchmark`), image limits, channel filters, plate/well/site +filters, and Z/T/C ranges where filenames expose those values. + +Downloaded subsets include `manifest.json` with the source dataset, selected +subset options, downloaded file paths, source URLs, SHA-256 checksums, file +sizes, image shapes, dtypes, and file metadata. Existing files are reused and +included in the manifest. 
Use `--validate-only` to verify an existing subset +cache against its manifest without downloading data: + +```bash +uv run ome-iris download nf1 \ + --output .benchmark-data/ome-iris/nf1 \ + --validate-only +``` + ## Add a dataset 1. Add or update a dataset manifest and catalog metadata. diff --git a/docs/src/python-api.md b/docs/src/python-api.md index 449de29..9917e62 100644 --- a/docs/src/python-api.md +++ b/docs/src/python-api.md @@ -16,6 +16,14 @@ :undoc-members: ``` +## Dataset subsets + +```{eval-rst} +.. automodule:: OME_IRIS.datasets + :members: + :undoc-members: +``` + ## Verify ```{eval-rst} diff --git a/pyproject.toml b/pyproject.toml index 2c24b55..f773ae0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,15 +39,19 @@ docs = [ ] [tool.setuptools] -package-dir = { "" = "src" } +package-dir = { "" = "src", ome_iris = "src/OME_IRIS" } include-package-data = true package-data.OME_IRIS = [ "data/*.yaml", "data/*.csv", "data/datasets/*.yaml", ] -packages.find.where = [ "src" ] -packages.find.include = [ "OME_IRIS*" ] +package-data.ome_iris = [ + "data/*.yaml", + "data/*.csv", + "data/datasets/*.yaml", +] +packages = [ "OME_IRIS", "ome_iris" ] [tool.setuptools_scm] write_to = "src/OME_IRIS/_version.py" diff --git a/src/OME_IRIS/__init__.py b/src/OME_IRIS/__init__.py index 23d5fc4..70b48a4 100644 --- a/src/OME_IRIS/__init__.py +++ b/src/OME_IRIS/__init__.py @@ -1,8 +1,17 @@ """OME-IRIS package.""" -__all__ = ["__version__"] +from __future__ import annotations + +import sys + +from . 
import datasets + +__all__ = ["__version__", "datasets"] try: from ._version import version as __version__ except ImportError: # pragma: no cover __version__ = "0+unknown" + +sys.modules.setdefault("ome_iris", sys.modules[__name__]) +sys.modules.setdefault("ome_iris.datasets", datasets) diff --git a/src/OME_IRIS/cli.py b/src/OME_IRIS/cli.py index 14164ab..f79f395 100644 --- a/src/OME_IRIS/cli.py +++ b/src/OME_IRIS/cli.py @@ -4,6 +4,7 @@ from pathlib import Path from OME_IRIS.clean import clean_local_data +from OME_IRIS.datasets import download from OME_IRIS.fetch import fetch_datasets from OME_IRIS.rocrate import export_rocrate_metadata from OME_IRIS.scaffold import scaffold_dataset_manifest @@ -16,6 +17,24 @@ def build_parser() -> argparse.ArgumentParser: ) sub = parser.add_subparsers(dest="command", required=True) + download_cmd = sub.add_parser( + "download", help="Download a reproducible dataset subset" + ) + download_cmd.add_argument("dataset") + download_cmd.add_argument("--output", required=True) + download_cmd.add_argument("--preset", choices=["tiny", "small", "benchmark"]) + download_cmd.add_argument("--limit-images", type=int) + download_cmd.add_argument("--channel", dest="channels", action="append") + download_cmd.add_argument("--plate", action="append") + download_cmd.add_argument("--well", action="append") + download_cmd.add_argument("--site", action="append") + download_cmd.add_argument("--z-range", nargs=2, type=int, metavar=("START", "STOP")) + download_cmd.add_argument("--t-range", nargs=2, type=int, metavar=("START", "STOP")) + download_cmd.add_argument("--c-range", nargs=2, type=int, metavar=("START", "STOP")) + download_cmd.add_argument("--validate-only", action="store_true") + download_cmd.add_argument("--manifests-dir", default="src/OME_IRIS/data/datasets") + download_cmd.add_argument("--silent", action="store_true") + fetch_cmd = sub.add_parser("fetch", help="Fetch dataset files") fetch_cmd.add_argument("--dataset", dest="dataset_id") 
fetch_cmd.add_argument("--tier", choices=["tiny", "small", "realistic"]) @@ -68,6 +87,38 @@ def main() -> int: parser = build_parser() args = parser.parse_args() + if args.command == "download": + subset = { + "images": args.limit_images, + "channels": args.channels, + "plate": args.plate, + "well": args.well, + "site": args.site, + "z": tuple(args.z_range) if args.z_range else None, + "t": tuple(args.t_range) if args.t_range else None, + "c": tuple(args.c_range) if args.c_range else None, + } + result = download( + args.dataset, + output_dir=Path(args.output), + subset=subset, + preset=args.preset, + manifests_dir=Path(args.manifests_dir), + validate_only=args.validate_only, + silent=args.silent, + ) + print(f"Downloaded: {result.downloaded}") + print(f"Skipped: {result.skipped}") + print(f"Validated: {result.validated}") + if result.manifest_path: + print(f"Manifest: {result.manifest_path}") + if result.failed: + print("Failed:") + for item in result.failed: + print(f"- {item}") + return 1 + return 0 + if args.command == "fetch": result = fetch_datasets( manifests_dir=Path(args.manifests_dir), diff --git a/src/OME_IRIS/datasets.py b/src/OME_IRIS/datasets.py new file mode 100644 index 0000000..fcdc262 --- /dev/null +++ b/src/OME_IRIS/datasets.py @@ -0,0 +1,335 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +import hashlib +import json +from pathlib import Path +import re +from typing import Any +from urllib.request import urlopen + +from PIL import Image +import yaml + +from OME_IRIS.fetch import _download, _parse_github_tree_url + + +PRESET_SUBSETS: dict[str, dict[str, Any]] = { + "tiny": {"images": 5}, + "small": {"images": 20}, + "benchmark": {"images": 100}, +} + +DATASET_ALIASES = { + "nf1": "nf1-cellpainting-shrunken", + "jump": "jump-plate-example", +} + + +@dataclass +class DownloadResult: + downloaded: int = 0 + skipped: int = 0 + validated: int = 0 + failed: list[str] = field(default_factory=list) + downloaded_items: 
list[str] = field(default_factory=list) + skipped_items: list[str] = field(default_factory=list) + validated_items: list[str] = field(default_factory=list) + manifest_path: Path | None = None + + +def _sha256(path: Path) -> str: + digest = hashlib.sha256() + with path.open("rb") as handle: + for chunk in iter(lambda: handle.read(1024 * 1024), b""): + digest.update(chunk) + return digest.hexdigest() + + +def _default_manifests_dir() -> Path: + return Path(__file__).parent / "data" / "datasets" + + +def _load_manifests(manifests_dir: Path) -> list[dict[str, Any]]: + return [ + yaml.safe_load(path.read_text(encoding="utf-8")) + for path in sorted(manifests_dir.glob("*.yaml")) + ] + + +def _resolve_dataset(dataset: str, manifests_dir: Path) -> dict[str, Any]: + manifests = _load_manifests(manifests_dir) + dataset_id = DATASET_ALIASES.get(dataset, dataset) + exact = [manifest for manifest in manifests if manifest.get("id") == dataset_id] + if exact: + return exact[0] + + prefix = [ + manifest + for manifest in manifests + if str(manifest.get("id", "")).startswith(dataset) + ] + if len(prefix) == 1: + return prefix[0] + if not prefix: + raise ValueError(f"Unknown dataset: {dataset}") + matches = ", ".join(str(manifest.get("id")) for manifest in prefix) + raise ValueError(f"Ambiguous dataset {dataset!r}; matched: {matches}") + + +def _merge_subset(preset: str | None, subset: dict[str, Any] | None) -> dict[str, Any]: + merged: dict[str, Any] = {} + if preset: + try: + merged.update(PRESET_SUBSETS[preset]) + except KeyError as exc: + choices = ", ".join(sorted(PRESET_SUBSETS)) + raise ValueError(f"Unknown preset {preset!r}; choose one of: {choices}") from exc + if subset: + merged.update( + {key: value for key, value in subset.items() if value is not None} + ) + return merged + + +def _github_tree_files(tree_url: str) -> list[tuple[str, str]]: + parsed = _parse_github_tree_url(tree_url) + if parsed is None: + raise ValueError(f"Unsupported directory URL: {tree_url}") + 
owner, repo, ref, subtree = parsed + api_url = f"https://api.github.com/repos/{owner}/{repo}/git/trees/{ref}?recursive=1" + with urlopen(api_url) as response: # nosec B310 + payload = json.loads(response.read().decode("utf-8")) + + prefix = f"{subtree.rstrip('/')}/" + files: list[tuple[str, str]] = [] + for entry in payload.get("tree", []): + blob_path = str(entry.get("path", "")) + if entry.get("type") != "blob" or not blob_path.startswith(prefix): + continue + relative = blob_path[len(prefix) :] + raw_url = f"https://raw.githubusercontent.com/{owner}/{repo}/{ref}/{blob_path}" + files.append((relative, raw_url)) + return sorted(files) + + +def _directory_files(url: str) -> list[tuple[str, str]]: + local_path = Path(url) + if local_path.exists() and local_path.is_dir(): + files = [] + for source_file in local_path.rglob("*"): + if source_file.is_file(): + files.append((str(source_file.relative_to(local_path)), str(source_file))) + return sorted(files) + return _github_tree_files(url) + + +def _filename_tokens(path: str) -> set[str]: + return { + token.casefold() + for token in re.split(r"[^A-Za-z0-9]+", Path(path).name) + if token + } + + +def _matches_any_token(path: str, values: list[str] | tuple[str, ...] 
| None) -> bool: + if not values: + return True + tokens = _filename_tokens(path) + return any(value.casefold() in tokens for value in values) + + +def _matches_range(path: str, axis: str, value_range: tuple[int, int] | None) -> bool: + if value_range is None: + return True + match = re.search( + rf"(?:^|[^A-Za-z0-9]){axis}\D*0*(\d+)(?:[^A-Za-z0-9]|$)", + path, + re.IGNORECASE, + ) + if match is None: + return True + value = int(match.group(1)) + start, stop = value_range + return start <= value <= stop + + +def _matches_subset(path: str, subset: dict[str, Any]) -> bool: + if not _matches_any_token(path, subset.get("channels")): + return False + for key in ("plate", "well", "site"): + values = subset.get(key) + if isinstance(values, str): + values = [values] + if not _matches_any_token(path, values): + return False + return ( + _matches_range(path, "z", subset.get("z")) + and _matches_range(path, "t", subset.get("t")) + and _matches_range(path, "c", subset.get("c")) + ) + + +def _select_files(file_rec: dict[str, Any], subset: dict[str, Any]) -> list[dict[str, str]]: + kind = file_rec.get("kind", "file") + if kind != "directory": + url = (file_rec.get("url") or "").strip() + return [{"relative_path": str(file_rec["path"]), "source_url": url}] if url else [] + + selected = [] + for relative, source_url in _directory_files(str(file_rec.get("url", ""))): + if _matches_subset(relative, subset): + selected.append( + { + "relative_path": str(Path(str(file_rec["path"])) / relative), + "source_url": source_url, + } + ) + + image_limit = subset.get("images") + if image_limit is not None: + selected = selected[: int(image_limit)] + return selected + + +def _image_metadata(path: Path) -> tuple[list[int] | None, str | None]: + try: + with Image.open(path) as image: + width, height = image.size + bands = len(image.getbands()) + shape = [height, width] if bands == 1 else [height, width, bands] + dtype_by_mode = { + "1": "bool", + "L": "uint8", + "P": "uint8", + "RGB": "uint8", + 
"RGBA": "uint8", + "I;16": "uint16", + "I": "int32", + "F": "float32", + } + return shape, dtype_by_mode.get(image.mode, image.mode) + except Exception: # noqa: BLE001 + return None, None + + +def _manifest_record( + output_dir: Path, + relative_path: str, + source_url: str, + file_rec: dict[str, Any], +) -> dict[str, Any]: + target = output_dir / relative_path + shape, dtype = _image_metadata(target) + record: dict[str, Any] = { + "path": relative_path, + "source_url": source_url, + "sha256": _sha256(target), + "size_bytes": target.stat().st_size, + "shape": shape, + "dtype": dtype, + "metadata": file_rec.get("custom_metadata", {}), + } + return record + + +def _write_subset_manifest( + *, + output_dir: Path, + dataset_manifest: dict[str, Any], + subset: dict[str, Any], + files: list[dict[str, Any]], +) -> Path: + manifest_path = output_dir / "manifest.json" + payload = { + "manifest_version": 1, + "dataset": { + "id": dataset_manifest.get("id"), + "name": dataset_manifest.get("name"), + "source_identifier": dataset_manifest.get("source_identifier"), + "source": dataset_manifest.get("source", {}), + }, + "subset": subset, + "files": files, + } + manifest_path.write_text( + json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8" + ) + return manifest_path + + +def _validate_existing_manifest(output_dir: Path) -> DownloadResult: + manifest_path = output_dir / "manifest.json" + result = DownloadResult(manifest_path=manifest_path) + if not manifest_path.exists(): + result.failed.append(f"{manifest_path}: manifest not found") + return result + + payload = json.loads(manifest_path.read_text(encoding="utf-8")) + for file_rec in payload.get("files", []): + path = output_dir / file_rec["path"] + if not path.exists(): + result.failed.append(f"{file_rec['path']}: missing file") + continue + actual = _sha256(path) + if actual != file_rec.get("sha256"): + result.failed.append(f"{file_rec['path']}: checksum mismatch") + continue + result.validated += 1 + 
def download(
    dataset: str,
    output_dir: str | Path,
    subset: dict[str, Any] | None = None,
    *,
    preset: str | None = None,
    manifests_dir: str | Path | None = None,
    validate_only: bool = False,
    silent: bool = False,
) -> DownloadResult:
    """Download or validate a reproducible subset of a known dataset.

    Args:
        dataset: Dataset id, alias or prefix, resolved by ``_resolve_dataset``.
        output_dir: Directory that receives the files and ``manifest.json``.
        subset: Explicit subset options; merged over ``preset``.
        preset: Optional preset name supplying base subset options.
        manifests_dir: Directory of dataset manifests; defaults to the
            location returned by ``_default_manifests_dir``.
        validate_only: When true, only check ``output_dir`` against its
            existing ``manifest.json``; all other options are ignored and
            nothing is downloaded.
        silent: Suppress progress output.

    Returns:
        A ``DownloadResult`` with per-file counts, failure messages and the
        path of the written (or validated) manifest.

    Raises:
        ValueError: Propagated from dataset/preset resolution or from file
            selection for an unsupported directory URL.
    """
    output_path = Path(output_dir)
    if validate_only:
        return _validate_existing_manifest(output_path)

    manifests_path = (
        Path(manifests_dir) if manifests_dir is not None else _default_manifests_dir()
    )
    dataset_manifest = _resolve_dataset(dataset, manifests_path)
    selected_subset = _merge_subset(preset, subset)
    output_path.mkdir(parents=True, exist_ok=True)

    result = DownloadResult(manifest_path=output_path / "manifest.json")
    manifest_files: list[dict[str, Any]] = []

    for file_rec in dataset_manifest.get("files", []):
        for selected in _select_files(file_rec, selected_subset):
            relative_path = selected["relative_path"]
            source_url = selected["source_url"]
            target = output_path / relative_path

            if target.exists():
                # Existing files are reused as-is and still recorded below.
                result.skipped += 1
                result.skipped_items.append(relative_path)
            else:
                if not silent:
                    print(f"Downloading: {relative_path}")
                try:
                    _download(source_url, target, silent=silent)
                except Exception as exc:  # noqa: BLE001
                    # ``_download`` is defined elsewhere; in case it can leave
                    # a partially written file behind, remove it so the next
                    # run does not mistake it for a cached copy and checksum
                    # truncated data into the manifest.
                    try:
                        target.unlink()
                    except OSError:
                        pass
                    result.failed.append(f"{relative_path}: {exc}")
                    continue
                result.downloaded += 1
                result.downloaded_items.append(relative_path)

            # Kept separate from the download step so that a metadata or
            # hashing error never triggers removal of a valid file.
            try:
                manifest_files.append(
                    _manifest_record(output_path, relative_path, source_url, file_rec)
                )
            except Exception as exc:  # noqa: BLE001
                result.failed.append(f"{relative_path}: {exc}")

    result.manifest_path = _write_subset_manifest(
        output_dir=output_path,
        dataset_manifest=dataset_manifest,
        subset=selected_subset,
        files=manifest_files,
    )
    return result
test_cli_has_fetch_verify_scaffold_and_export_rocrate_commands() -> None: +def test_cli_has_download_fetch_verify_scaffold_and_export_rocrate_commands() -> None: parser = build_parser() help_text = parser.format_help() + assert "download" in help_text assert "fetch" in help_text assert "verify" in help_text assert "scaffold" in help_text assert "export-rocrate" in help_text + + +def test_cli_download_accepts_subset_options() -> None: + parser = build_parser() + + args = parser.parse_args( + [ + "download", + "nf1", + "--output", + ".benchmark-data/ome-iris/nf1", + "--limit-images", + "20", + "--channel", + "DAPI", + "--validate-only", + ] + ) + + assert args.command == "download" + assert args.dataset == "nf1" + assert args.output == ".benchmark-data/ome-iris/nf1" + assert args.limit_images == 20 + assert args.channels == ["DAPI"] + assert args.validate_only is True diff --git a/tests/test_datasets.py b/tests/test_datasets.py new file mode 100644 index 0000000..6cb3fc6 --- /dev/null +++ b/tests/test_datasets.py @@ -0,0 +1,127 @@ +from __future__ import annotations + +import json +from pathlib import Path + +from PIL import Image +import yaml + +from OME_IRIS import datasets +from ome_iris import datasets as lower_datasets + + +def write_subset_manifest(path: Path, source_dir: Path) -> None: + payload = { + "id": "nf1-cellpainting-shrunken", + "name": "NF1 Cell Painting shrunken", + "description": "Example dataset", + "tier": "small", + "license": "CC-BY-4.0", + "source_identifier": "NF1_cellpainting_data_shrunken", + "source": {"repository": "https://example.org", "path": "data", "url": ""}, + "formats": ["tiff"], + "files": [ + { + "path": "images", + "kind": "directory", + "url": str(source_dir), + "custom_metadata": {"role": "image_bundle"}, + } + ], + } + path.write_text(yaml.safe_dump(payload), encoding="utf-8") + + +def make_image(path: Path) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + Image.new("L", (4, 3), color=7).save(path) + + +def 
test_download_filters_images_by_channel_and_limit(tmp_path: Path) -> None: + source_dir = tmp_path / "source" + make_image(source_dir / "A01_01_1_1_DAPI_001.tif") + make_image(source_dir / "A01_01_2_1_GFP_001.tif") + make_image(source_dir / "A02_01_1_1_DAPI_001.tif") + + manifests_dir = tmp_path / "datasets" + manifests_dir.mkdir() + write_subset_manifest(manifests_dir / "nf1.yaml", source_dir) + + result = datasets.download( + "nf1", + output_dir=tmp_path / "out", + subset={"images": 1, "channels": ["DAPI"]}, + manifests_dir=manifests_dir, + ) + + assert result.downloaded == 1 + assert result.skipped == 0 + assert result.failed == [] + assert (tmp_path / "out" / "images" / "A01_01_1_1_DAPI_001.tif").exists() + assert not (tmp_path / "out" / "images" / "A02_01_1_1_DAPI_001.tif").exists() + + manifest = json.loads((tmp_path / "out" / "manifest.json").read_text()) + assert manifest["dataset"]["id"] == "nf1-cellpainting-shrunken" + assert manifest["subset"]["images"] == 1 + assert manifest["files"][0]["source_url"].endswith("A01_01_1_1_DAPI_001.tif") + assert manifest["files"][0]["sha256"] + assert manifest["files"][0]["shape"] == [3, 4] + assert manifest["files"][0]["dtype"] == "uint8" + + +def test_lowercase_package_exposes_datasets_api() -> None: + assert lower_datasets.download is datasets.download + + +def test_download_reuses_cached_files_and_validation_only_checks_manifest( + tmp_path: Path, +) -> None: + source_dir = tmp_path / "source" + make_image(source_dir / "A01_01_1_1_DAPI_001.tif") + manifests_dir = tmp_path / "datasets" + manifests_dir.mkdir() + write_subset_manifest(manifests_dir / "nf1.yaml", source_dir) + + first = datasets.download( + "nf1", + output_dir=tmp_path / "out", + subset={"images": 1}, + manifests_dir=manifests_dir, + ) + second = datasets.download( + "nf1", + output_dir=tmp_path / "out", + subset={"images": 1}, + manifests_dir=manifests_dir, + ) + validation = datasets.download( + "nf1", + output_dir=tmp_path / "out", + 
validate_only=True, + manifests_dir=manifests_dir, + ) + + assert first.downloaded == 1 + assert second.downloaded == 0 + assert second.skipped == 1 + assert validation.validated == 1 + assert validation.failed == [] + + +def test_download_preset_expands_to_reproducible_subset_size(tmp_path: Path) -> None: + source_dir = tmp_path / "source" + for index in range(3): + make_image(source_dir / f"A0{index + 1}_01_1_1_DAPI_001.tif") + manifests_dir = tmp_path / "datasets" + manifests_dir.mkdir() + write_subset_manifest(manifests_dir / "nf1.yaml", source_dir) + + result = datasets.download( + "nf1", + output_dir=tmp_path / "out", + preset="tiny", + manifests_dir=manifests_dir, + ) + + assert result.downloaded == 3 + assert result.manifest_path == tmp_path / "out" / "manifest.json" From cc9d7edaa9050c3de4cb4a26d1b7e125019d7e7b Mon Sep 17 00:00:00 2001 From: d33bs Date: Thu, 4 Jun 2026 10:37:21 -0600 Subject: [PATCH 2/2] linting --- src/OME_IRIS/datasets.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/OME_IRIS/datasets.py b/src/OME_IRIS/datasets.py index fcdc262..7a85f27 100644 --- a/src/OME_IRIS/datasets.py +++ b/src/OME_IRIS/datasets.py @@ -84,7 +84,9 @@ def _merge_subset(preset: str | None, subset: dict[str, Any] | None) -> dict[str merged.update(PRESET_SUBSETS[preset]) except KeyError as exc: choices = ", ".join(sorted(PRESET_SUBSETS)) - raise ValueError(f"Unknown preset {preset!r}; choose one of: {choices}") from exc + raise ValueError( + f"Unknown preset {preset!r}; choose one of: {choices}" + ) from exc if subset: merged.update( {key: value for key, value in subset.items() if value is not None} @@ -119,7 +121,9 @@ def _directory_files(url: str) -> list[tuple[str, str]]: files = [] for source_file in local_path.rglob("*"): if source_file.is_file(): - files.append((str(source_file.relative_to(local_path)), str(source_file))) + files.append( + (str(source_file.relative_to(local_path)), str(source_file)) + ) return 
sorted(files) return _github_tree_files(url) @@ -170,11 +174,15 @@ def _matches_subset(path: str, subset: dict[str, Any]) -> bool: ) -def _select_files(file_rec: dict[str, Any], subset: dict[str, Any]) -> list[dict[str, str]]: +def _select_files( + file_rec: dict[str, Any], subset: dict[str, Any] +) -> list[dict[str, str]]: kind = file_rec.get("kind", "file") if kind != "directory": url = (file_rec.get("url") or "").strip() - return [{"relative_path": str(file_rec["path"]), "source_url": url}] if url else [] + return ( + [{"relative_path": str(file_rec["path"]), "source_url": url}] if url else [] + ) selected = [] for relative, source_url in _directory_files(str(file_rec.get("url", ""))):