diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1644186..9824430 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -39,7 +39,6 @@ jobs: data = yaml.safe_load(f) sources = list(data.keys()) print(f'OK — {len(sources)} fonti caricate:', sources) - # Verify required fields per entry for sid, cfg in data.items(): assert 'protocol' in cfg, f'{sid}: missing protocol' assert 'observation_mode' in cfg, f'{sid}: missing observation_mode' @@ -47,6 +46,8 @@ jobs: print('Schema validation OK') " + + - name: Run tests run: pytest tests/ -v diff --git a/.github/workflows/radar.yml b/.github/workflows/radar.yml index 8e7af99..e11fdfb 100644 --- a/.github/workflows/radar.yml +++ b/.github/workflows/radar.yml @@ -38,6 +38,9 @@ jobs: run: | python scripts/radar_check.py + - name: Sync datasets_in_use from DI catalog + run: python scripts/sync_datasets_in_use.py + - name: Upload radar artifacts uses: actions/upload-artifact@v7 with: diff --git a/requirements.txt b/requirements.txt index efcfe71..a36671c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ requests>=2.33.1,<3.0 PyYAML>=6.0.3,<7.0 +ruamel.yaml>=0.18.10,<1.0 duckdb>=1.5.2,<2.0 pandas>=3.0.2,<4.0 pyarrow>=24.0.0,<25.0 diff --git a/scripts/sync_datasets_in_use.py b/scripts/sync_datasets_in_use.py new file mode 100644 index 0000000..6936426 --- /dev/null +++ b/scripts/sync_datasets_in_use.py @@ -0,0 +1,93 @@ +"""Sync datasets_in_use in sources_registry.yaml from DI's clean_catalog.json. + +Reads dataset-incubator/registry/clean_catalog.json, groups datasets by source_id, +and updates sources_registry.yaml so that each source lists its DI candidate slugs. 
"""Sync datasets_in_use in sources_registry.yaml from DI's clean_catalog.json.

Reads dataset-incubator/registry/clean_catalog.json, groups datasets by source_id,
and updates sources_registry.yaml so that each source lists its DI candidate slugs.

The registry file is rewritten only when at least one source actually changed,
so a no-op run never touches the file.
"""

import json
import sys
from pathlib import Path
from urllib.request import urlopen

from ruamel.yaml import YAML

DI_CATALOG_URL = (
    "https://raw.githubusercontent.com/dataciviclab/dataset-incubator/"
    "main/registry/clean_catalog.json"
)
ROOT = Path(__file__).resolve().parents[1]
REGISTRY_PATH = ROOT / "data" / "radar" / "sources_registry.yaml"

# Upper bound for the catalog download; without it a stalled connection
# would block the workflow until the job-level timeout fires.
FETCH_TIMEOUT_SECONDS = 30


def fetch_di_catalog(timeout: float = FETCH_TIMEOUT_SECONDS) -> list[dict]:
    """Download the DI catalog and return its ``datasets`` list.

    Args:
        timeout: Socket timeout in seconds passed to ``urlopen``.

    Returns:
        The list stored under the ``"datasets"`` key of the catalog JSON.

    Raises:
        OSError: If the download fails (``URLError`` is an ``OSError``).
        ValueError: If the payload is not valid JSON or has no ``datasets`` list.
    """
    print(f"Fetching {DI_CATALOG_URL}...")
    with urlopen(DI_CATALOG_URL, timeout=timeout) as resp:
        catalog = json.loads(resp.read().decode("utf-8"))

    datasets = catalog.get("datasets") if isinstance(catalog, dict) else None
    if not isinstance(datasets, list):
        raise ValueError("DI catalog has no 'datasets' list")
    return datasets


def group_by_source_id(datasets: list[dict]) -> dict[str, list[str]]:
    """Group DI dataset slugs by source_id.

    Entries without a ``source_id`` or without a ``slug`` are ignored, and a
    slug that appears more than once for the same source is listed once.

    Returns {source_id: [slug1, slug2, ...]} with each list sorted by slug.
    """
    slugs_by_source: dict[str, set[str]] = {}
    for ds in datasets:
        sid = ds.get("source_id")
        slug = ds.get("slug")
        if not sid or not slug:
            continue
        slugs_by_source.setdefault(sid, set()).add(slug)
    return {sid: sorted(slugs) for sid, slugs in slugs_by_source.items()}


def update_registry(registry_path: Path, source_groups: dict[str, list[str]]) -> int:
    """Write the DI slugs into ``datasets_in_use`` of each matching source.

    Sources present in ``source_groups`` but absent from the registry are
    skipped with a message, as are registry entries that are not mappings.
    Registry sources that do not appear in ``source_groups`` are left as is.

    Args:
        registry_path: Path to ``sources_registry.yaml``.
        source_groups: Mapping of source_id to the slugs that should be listed.

    Returns:
        The number of sources whose ``datasets_in_use`` was changed.
    """
    yaml_rt = YAML(typ="rt")  # round-trip: preserves key order and comments
    yaml_rt.indent(mapping=2, sequence=4, offset=2)

    with open(registry_path, encoding="utf-8") as f:
        registry = yaml_rt.load(f) or {}

    updated = 0
    for source_id, di_slugs in sorted(source_groups.items()):
        if source_id not in registry:
            print(f"  SKIP {source_id}: not in sources_registry.yaml")
            continue

        entry = registry[source_id]
        if not isinstance(entry, dict):
            # A key with an empty/null value would otherwise crash on .get().
            print(f"  SKIP {source_id}: registry entry is not a mapping")
            continue

        # Normalise to a plain list so a null value compares like "empty".
        old_list = list(entry.get("datasets_in_use") or [])
        new_list = sorted(di_slugs)

        if old_list != new_list:
            entry["datasets_in_use"] = new_list
            print(f"  UPDATE {source_id}: {old_list} -> {new_list}")
            updated += 1
        else:
            print(f"  OK {source_id}: {new_list}")

    # Only rewrite the file when something changed: a round-trip dump can
    # still normalise formatting, which would produce diff noise on no-op runs.
    if updated:
        with open(registry_path, "w", encoding="utf-8") as f:
            yaml_rt.dump(registry, f)

    return updated


def main() -> int:
    """Run the sync and return a process exit code (0 on success, 1 on fetch failure)."""
    try:
        datasets = fetch_di_catalog()
    except (OSError, ValueError) as exc:
        print(f"ERROR: could not load DI catalog: {exc}", file=sys.stderr)
        return 1
    print(f"DI catalog: {len(datasets)} datasets")

    groups = group_by_source_id(datasets)
    print(f"Source groups: {len(groups)}")

    updated = update_registry(REGISTRY_PATH, groups)

    if updated:
        print(f"\nUpdated {updated} source(s) in {REGISTRY_PATH.name}")
    else:
        print("\nNo changes needed")
    return 0


if __name__ == "__main__":
    sys.exit(main())