atlas-2025feb-odeo/README.md

# Scripts for the Feb 2025 upload of ATLAS Open Data for Education and Outreach

The scripts and files in the repository are as follows:

* `rucio_reformat.sh` transfers files from Eirik's eos space into the central
rucio open data space, defining open-ended rules to keep the datasets there. Two
datasets are created per skim: one for MC and one for data.

* `unskimmed_data.txt` is the list of unskimmed data and MC files available on
rucio. Some of the files are quite large, so these had to be handled separately.

* `updated_bjet_skims.txt` is a special list of files for updating the b-jet
skims, which needed a second round of updates following further validation.

* `qcd_skims.txt` contains a list of QCD jet datasets that are provided for
validation purposes in the beta release and will be made publicly available
with the full release.

* `rucio_jetskim_specialhandling.sh` is a script for special handling of the
b-jet skim updates and QCD jet skim files.

* `rucio_reformat_unskimmed.sh` moves the unskimmed data into the appropriate
places and renames the datasets as needed.

* `rucio_check.sh` checks for any inconsistencies between the files in rucio and
those on eos, and makes sure that everything has been transferred to the main
open data endpoint (i.e. the files aren't still only on scratch space).

* `dataset_list.txt` is a list of the datasets created in rucio for this release.

* `create_metadata.py` creates a metadata json file containing all the datasets
to be released (those in `dataset_list.txt`); for each dataset it includes a
dictionary of files. For each file, it records the following metadata (see the
sketch after this list):
  * adler32 checksum
  * size in bytes
  * number of events
  * type (root)
  * uri (file location on the rucio endpoint)
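
  For illustration, a minimal sketch of the `file_locations` block that ends up
  in the output json (the dataset name is one from `dataset_list.txt`; the file
  name and all values are invented placeholders):

  ```python
  # Illustrative only: the shape of the per-file metadata written by create_metadata.py
  file_locations = {
      'opendata:opendata.ODEO_FEB2025_4lep_MC_v0': {
          'opendata:mc_345060.placeholder.4lep.root': {         # invented file name
              'checksum': 'adler32:1a2b3c4d',  # adler32 checksum
              'size': 123456789,               # size in bytes
              'events': 10000,                 # number of events
              'type': 'root',                  # file type
              'uri': 'root://eospublic.cern.ch//eos/opendata/atlas/...',  # location on the open data endpoint
          },
      },
  }
  ```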

* `odeo_file_mapping_ODEO_v0_FEB2025_2025-06-30.json` is the output from the
most recent run of `create_metadata.py`.

* `make_odeo_json.py` creates all of the json files for the CERN open data
portal records. One record is created per rucio dataset.

* `build_full_metadata_json.py` builds json-formatted metadata files for the
data and MC once all of the above scripts have been run, for injection of the
metadata into the atlasopenmagic database. It requires the csv file available
from `https://opendata.atlas.cern/docs/data/for_education/13TeV25_metadata` and
produces files named `data_database_metadata_2025e.json` and
`mc_database_metadata_2025e.json`.
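
For reference, a minimal sketch of how the produced database metadata files
might be read back (the structure follows `build_full_metadata_json.py`; the
loop bodies are just illustrative):

```python
import json

# Read back the MC metadata produced by build_full_metadata_json.py
with open('mc_database_metadata_2025e.json') as f:
    mc_md = json.load(f)

# Top-level keys are dataset numbers (DSIDs); each entry carries the csv
# metadata plus an unskimmed file list and one file list per skim
for dsid, md in mc_md.items():
    print(dsid, len(md['file_list']), 'unskimmed files')
    for skim in md['skims']:
        print(' ', skim['skim_type'], len(skim['file_list']), 'files')
```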

atlas-2025feb-odeo/build_full_metadata_json.py

#!/usr/bin/env python3

# Simple script to convert the csv metadata file into the full json metadata files

# For pretty printing and writing json
import json

# For csv reading
import csv

# File with the mapping of file names for each dataset
with open('odeo_file_mapping_ODEO_v0_FEB2025_2025-06-30.json','r') as json_metadata_file:
    json_file_locations = json.load(json_metadata_file)['file_locations']

# Open our metadata file to go through all the EVNT samples we've gathered metadata for
input_md = {}
with open('metadata.csv','r') as metadata_csv_file:
    # Open the metadata as a formatted dictionary
    md_reader = csv.DictReader(metadata_csv_file, delimiter=',', quotechar='"', lineterminator='\n')
    # Loop through all the rows in the file (header is consumed automatically by DictReader)
    for row in md_reader:
        my_dsid = row['dataset_number']
        input_md[my_dsid] = { x:row[x] for x in row if x!='dataset_number' }
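
# At this point input_md maps each dataset number (DSID) to a dict of the
# remaining csv columns, e.g. (illustrative only - the real column names come
# from the csv header):
#   input_md['345060'] -> { 'column_a': '...', 'column_b': '...', ... }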

# Final MD with file listings
full_md = {}
# For each dataset, we're going to extend and then copy over the MD
for aset in input_md:
    # Start by listing out all the skims
    full_md[aset] = {'skims':[]}
    # The unskimmed files go into the regular file list field
    full_md[aset]['file_list'] = [ json_file_locations['opendata:opendata.ODEO_FEB2025_noskim_MC_v0'][afile]['uri']
                                   for afile in json_file_locations['opendata:opendata.ODEO_FEB2025_noskim_MC_v0'] if f'mc_{aset}' in afile ]
    # Now go through each skim, add the files and publish the metadata
    for askim in ['2J2LMET30', '1LMET30', '3J1LMET30', 'exactly4lep', '2muons',
                  '2to4lep', '4lep', 'exactly3lep', 'GamGam', '3lep']:
        full_md[aset]['skims'] += [ {'skim_type':askim,
                                     'file_list':[ json_file_locations[f'opendata:opendata.ODEO_FEB2025_{askim}_MC_v0'][afile]['uri']
                                                   for afile in json_file_locations[f'opendata:opendata.ODEO_FEB2025_{askim}_MC_v0'] if f'mc_{aset}' in afile ]} ]
    # Special handling for the b-jet skims, which use a different (lower-case, v1) dataset naming
    for askim in ['2bjets']:
        full_md[aset]['skims'] += [ {'skim_type':askim,
                                     'file_list':[ json_file_locations[f'opendata:opendata.ODEO_FEB2025_{askim}_mc_v1'][afile]['uri']
                                                   for afile in json_file_locations[f'opendata:opendata.ODEO_FEB2025_{askim}_mc_v1'] if f'mc_{aset}' in afile ]} ]

    # Copy the csv metadata for this dataset on top of the file listings
    full_md[aset].update(input_md[aset])
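
# At this point each full_md entry looks like (illustrative):
#   { 'file_list': ['root://...', ...],
#     'skims': [ {'skim_type': '2J2LMET30', 'file_list': [...]}, ... ],
#     plus one key per remaining csv column }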
# Write out the file
with open('mc_database_metadata_2025e.json','w') as metadata_json_file:
    json.dump(
        full_md,
        metadata_json_file,
        indent=2,
        sort_keys=True,
        ensure_ascii=False,
        separators=(",", ": "),
    )

# Reformat and then write the data metadata to a file
data_metadata = {'data':{'skims':[],'file_list':[]}}
data_metadata['data']['file_list'] = [ json_file_locations['opendata:opendata.ODEO_FEB2025_noskim_Data_v0'][x]['uri']
                                       for x in json_file_locations['opendata:opendata.ODEO_FEB2025_noskim_Data_v0'] ]
for askim in ['2J2LMET30', '1LMET30', '3J1LMET30', 'exactly4lep', '2muons',
              '2to4lep', '4lep', 'exactly3lep', 'GamGam', '3lep']:
    data_metadata['data']['skims'] += [ {'skim_type':askim,
                                         'file_list':[ json_file_locations[f'opendata:opendata.ODEO_FEB2025_{askim}_Data_v0'][x]['uri']
                                                       for x in json_file_locations[f'opendata:opendata.ODEO_FEB2025_{askim}_Data_v0'] ]} ]
# Special handling for the b-jet skim, which uses a different (lower-case, v1) dataset naming
for askim in ['2bjets']:
    data_metadata['data']['skims'] += [ {'skim_type':askim,
                                         'file_list':[ json_file_locations[f'opendata:opendata.ODEO_FEB2025_{askim}_data_v1'][x]['uri']
                                                       for x in json_file_locations[f'opendata:opendata.ODEO_FEB2025_{askim}_data_v1'] ]} ]
with open('data_database_metadata_2025e.json','w') as data_metadata_json_file:
    json.dump(
        data_metadata,
        data_metadata_json_file,
        indent=2,
        sort_keys=True,
        ensure_ascii=False,
        separators=(",", ": "),
    )

atlas-2025feb-odeo/create_metadata.py

#!/usr/bin/env python3
import datetime

# Grab the list of datasets that we want to run over
dataset_input = 'dataset_list.txt'

# Set a postfix for the output file name, so that we can nicely version things
static_did_post = '_ODEO_v0_FEB2025_'+datetime.date.today().isoformat()

# Dictionary mapping datasets to file names
datasets = {}
# Dictionary of Datasets --> dictionary of file names
# file names --> dictionary of properties (checksum, events, uri, type, size)
file_locations = {}

# Let's go over the list of files...
with open(dataset_input,'r') as dataset_list_file:
    for bline in dataset_list_file:
        # Make sure we ignore comments - in case folks are commenting out datasets
        aline = bline.split('#')[0].strip()
        # Skip blank lines
        if len(aline)<2:
            continue
        # Initialize our dataset lists and file location lists
        datasets[ aline ] = []
        file_locations[ aline ] = {}
print(f'Read in {len(datasets.keys())} datasets')

# Get our rucio client ready
from rucio.client.client import Client
rc = Client()

# Get ROOT out as well - this is a bit inconvenient, but rucio doesn't know the metadata of these ntuples
# because it has no provenance information, so we have to figure out for ourselves what they contain.
import ROOT

# Loop over all the datasets
for dataset_number,dataset in enumerate(datasets):
    # Let the people know how we're doing
    print(f'Working on dataset {dataset_number+1} of {len(datasets)}: {dataset}')

    # Get the scope
    my_scope = dataset.split(':')[0]

    # Note that the files and the dataset might have different scopes, so we need a little special handling
    # Grab the list of files from rucio - for education and outreach, we are always going to take _all_ the events
    fl = rc.list_files(scope=my_scope, name=dataset.split(':')[1])
    # Note that we're stashing the full file list so we can check later that we got all the files
    for a in fl:
        # Update the map of datasets : files
        datasets[dataset] += [ a['name'] ]
        # Get the first part of the per-file metadata
        file_locations[dataset][ a['scope']+':'+a['name'] ] = { 'checksum':'adler32:'+a['adler32'], 'size':a['bytes'], 'events':a['events'], 'type':'root' }

    # Second rucio query, needed to get the file location on eos
    replicalist = rc.list_replicas([{'scope':my_scope,'name':dataset.split(':')[1]}])
    # Go through all the results (all the files in the dataset again)
    for areplica in replicalist:
        # Make sure we found this file before - just error checking, this should never be printed
        if areplica['scope']+':'+areplica['name'] not in file_locations[dataset]:
            print(f'Warning: did not find {areplica["scope"]} {areplica["name"]} in file_locations for {dataset}')
            continue
        # Go through the physical locations and get the one at the open data endpoint
        for a_pfn in areplica['pfns']:
            if 'opendata/atlas' in a_pfn:
                file_locations[dataset][ areplica['scope']+':'+areplica['name'] ]['uri'] = a_pfn
                # Take the opportunity to fix the number of events
                eos_file = ROOT.TFile.Open(a_pfn)
                # Protect against empty files
                if 'analysis' in eos_file.GetListOfKeys():
                    eos_tree = eos_file.Get('analysis')
                    file_locations[dataset][ areplica['scope']+':'+areplica['name'] ]['events'] = eos_tree.GetEntries()
                else:
                    file_locations[dataset][ areplica['scope']+':'+areplica['name'] ]['events'] = 0
                break
        else:
            # for-else: the break above never fired, so we didn't find a copy on the open data endpoint
            print(f'Did not find {dataset} file {my_scope+":"+areplica["name"]} on eos in pfns {areplica["pfns"]}')

# Record the file mapping that we established
import json
with open( 'odeo_file_mapping'+static_did_post+'.json' , 'w' ) as file_backup:
    json.dump( obj={'file_dictionary':datasets, 'file_locations':file_locations} , fp=file_backup )

# All done!

atlas-2025feb-odeo/dataset_list.txt

opendata:opendata.ODEO_FEB2025_2J2LMET30_Data_v0
opendata:opendata.ODEO_FEB2025_noskim_Data_v0
opendata:opendata.ODEO_FEB2025_4lep_MC_v0
opendata:opendata.ODEO_FEB2025_1LMET30_MC_v0
opendata:opendata.ODEO_FEB2025_2muons_MC_v0
opendata:opendata.ODEO_FEB2025_1LMET30_Data_v0
opendata:opendata.ODEO_FEB2025_exactly4lep_MC_v0
#opendata:opendata.ODEO_FEB2025_2bjets_Data_v0
opendata:opendata.ODEO_FEB2025_2bjets_data_v1
opendata:opendata.ODEO_FEB2025_3J1LMET30_Data_v0
opendata:opendata.ODEO_FEB2025_3lep_MC_v0
opendata:opendata.ODEO_FEB2025_exactly4lep_Data_v0
opendata:opendata.ODEO_FEB2025_2to4lep_MC_v0
opendata:opendata.ODEO_FEB2025_GamGam_MC_v0
opendata:opendata.ODEO_FEB2025_2muons_Data_v0
#opendata:opendata.ODEO_FEB2025_2bjets_MC_v0
opendata:opendata.ODEO_FEB2025_2bjets_mc_v1
opendata:opendata.ODEO_FEB2025_2to4lep_Data_v0
opendata:opendata.ODEO_FEB2025_4lep_Data_v0
opendata:opendata.ODEO_FEB2025_exactly3lep_Data_v0
opendata:opendata.ODEO_FEB2025_3J1LMET30_MC_v0
opendata:opendata.ODEO_FEB2025_GamGam_Data_v0
opendata:opendata.ODEO_FEB2025_exactly3lep_MC_v0
opendata:opendata.ODEO_FEB2025_2J2LMET30_MC_v0
opendata:opendata.ODEO_FEB2025_3lep_Data_v0
opendata:opendata.ODEO_FEB2025_noskim_MC_v0