Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,15 @@ jobs:
data = yaml.safe_load(f)
sources = list(data.keys())
print(f'OK — {len(sources)} fonti caricate:', sources)
# Verify required fields per entry
for sid, cfg in data.items():
assert 'protocol' in cfg, f'{sid}: missing protocol'
assert 'observation_mode' in cfg, f'{sid}: missing observation_mode'
assert 'verdict' in cfg, f'{sid}: missing verdict'
print('Schema validation OK')
"



- name: Run tests
run: pytest tests/ -v

Expand Down
3 changes: 3 additions & 0 deletions .github/workflows/radar.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ jobs:
run: |
python scripts/radar_check.py

- name: Sync datasets_in_use from DI catalog
run: python scripts/sync_datasets_in_use.py

- name: Upload radar artifacts
uses: actions/upload-artifact@v7
with:
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
requests>=2.33.1,<3.0
PyYAML>=6.0.3,<7.0
ruamel.yaml>=0.18.10,<1.0
duckdb>=1.5.2,<2.0
pandas>=3.0.2,<4.0
pyarrow>=24.0.0,<25.0
Expand Down
93 changes: 93 additions & 0 deletions scripts/sync_datasets_in_use.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
"""Sync datasets_in_use in sources_registry.yaml from DI's clean_catalog.json.

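Fetches registry/clean_catalog.json from the dataset-incubator (DI) repository over HTTP,
groups its datasets by source_id, and updates sources_registry.yaml so that each source
lists its DI candidate slugs. Source ids missing from sources_registry.yaml are skipped.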
"""

import json
import sys
from pathlib import Path
from urllib.request import urlopen

from ruamel.yaml import YAML

# Raw-content URL of registry/clean_catalog.json on the main branch of the
# dataciviclab/dataset-incubator repository.
DI_CATALOG_URL = (
"https://raw.githubusercontent.com/dataciviclab/dataset-incubator/"
"main/registry/clean_catalog.json"
)
# Parent of the directory that contains this file (presumably the repository
# root, given the script lives under scripts/ -- confirm if the file moves).
ROOT = Path(__file__).resolve().parents[1]
# YAML registry that main() passes to update_registry().
REGISTRY_PATH = ROOT / "data" / "radar" / "sources_registry.yaml"


def fetch_di_catalog(timeout: float = 30.0) -> list[dict]:
    """Download the DI clean catalog and return its dataset entries.

    Args:
        timeout: Seconds to wait on the HTTP connection/read before giving
            up. Without a timeout a stalled connection would block the
            calling job indefinitely.

    Returns:
        The value stored under the top-level ``"datasets"`` key of the
        catalog JSON document.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        TimeoutError: If the server stops responding mid-transfer.
        json.JSONDecodeError: If the response body is not valid JSON.
        KeyError: If the document has no ``"datasets"`` key.
    """
    print(f"Fetching {DI_CATALOG_URL}...")
    with urlopen(DI_CATALOG_URL, timeout=timeout) as resp:
        # JSON served from raw.githubusercontent.com is UTF-8; decode
        # explicitly rather than relying on the default argument.
        catalog = json.loads(resp.read().decode("utf-8"))
    return catalog["datasets"]


def group_by_source_id(datasets: list[dict]) -> dict[str, list[str]]:
    """Group DI dataset slugs by source_id.

    Entries whose ``source_id`` or ``slug`` is missing or empty are ignored
    instead of aborting the whole sync with a ``KeyError``, and a slug that
    appears more than once for the same source is listed only once.

    Args:
        datasets: Catalog entries; each is expected to be a mapping that may
            carry ``"source_id"`` and ``"slug"`` keys.

    Returns:
        ``{source_id: [slug1, slug2, ...]}`` with every slug list sorted.
        Keys appear in the order their source_id was first encountered.
    """
    slugs_by_source: dict[str, set[str]] = {}
    for ds in datasets:
        sid = ds.get("source_id")
        slug = ds.get("slug")
        # An entry that cannot be attributed to a source, or has no slug to
        # record, contributes nothing to the registry.
        if not sid or not slug:
            continue
        slugs_by_source.setdefault(sid, set()).add(slug)
    return {sid: sorted(slugs) for sid, slugs in slugs_by_source.items()}


def update_registry(registry_path: Path, source_groups: dict[str, list[str]]) -> int:
    """Set ``datasets_in_use`` for each known source in the registry file.

    The registry is loaded with ruamel.yaml in round-trip mode. For every
    source_id in ``source_groups`` that is also a top-level key of the
    registry, ``datasets_in_use`` is replaced by the sorted slug list when it
    differs from the current value. Source ids absent from the registry are
    reported and skipped. Registry entries whose source_id is not in
    ``source_groups`` are left untouched.

    The file is written back only when at least one entry changed, so a
    no-op run never reformats or otherwise touches the registry.

    Args:
        registry_path: Path to the YAML registry to read and, if needed,
            rewrite in place.
        source_groups: Mapping of source_id to the slugs that source should
            list.

    Returns:
        The number of registry entries whose ``datasets_in_use`` changed.
    """
    yaml = YAML(typ="rt")  # round-trip mode: keeps key order on dump
    yaml.indent(mapping=2, sequence=4, offset=2)

    with open(registry_path, encoding="utf-8") as f:
        # An empty file loads as None; treat it as an empty registry.
        registry = yaml.load(f) or {}

    updated = 0
    for source_id, di_slugs in sorted(source_groups.items()):
        if source_id not in registry:
            print(f"  SKIP {source_id}: not in sources_registry.yaml")
            continue

        old_list = registry[source_id].get("datasets_in_use", [])
        new_list = sorted(di_slugs)

        if old_list != new_list:
            registry[source_id]["datasets_in_use"] = new_list
            print(f"  UPDATE {source_id}: {old_list} -> {new_list}")
            updated += 1
        else:
            print(f"  OK {source_id}: {new_list}")

    # Skip the dump when nothing changed: re-serialising can alter the file's
    # formatting even though its data is identical.
    if updated:
        with open(registry_path, "w", encoding="utf-8") as f:
            yaml.dump(registry, f)

    return updated


def main() -> int:
    """Fetch the DI catalog, sync the registry, and report the outcome.

    Returns:
        The process exit status, which is always 0 when no exception is
        raised.
    """
    catalog_entries = fetch_di_catalog()
    print(f"DI catalog: {len(catalog_entries)} datasets")

    slugs_by_source = group_by_source_id(catalog_entries)
    print(f"Source groups: {len(slugs_by_source)}")

    changed = update_registry(REGISTRY_PATH, slugs_by_source)

    summary = (
        f"\nUpdated {changed} source(s) in {REGISTRY_PATH.name}"
        if changed
        else "\nNo changes needed"
    )
    print(summary)
    return 0


if __name__ == "__main__":
    sys.exit(main())