atlas-2025feb-odeo/README.md

# Scripts for the Feb 2025 upload of ATLAS Open Data for Education and Outreach

The scripts and files in the repository are as follows:

* `rucio_reformat.sh` transfers files from Eirik's eos space into the central
rucio open data space, defining open-ended rules to keep the datasets there. Two
datasets are created per skim: one for MC and one for data.

* `unskimmed_data.txt` is the list of unskimmed data and MC files available on
rucio. Some of the files are quite large, so these had to be handled separately.

* `updated_bjet_skims.txt` is a special list of files for updating the b-jet
skims, which needed a second round of updates following further validation.

* `qcd_skims.txt` contains a list of QCD jet datasets that are provided for
validation purposes in the beta release and will be made publicly available
with the full release.

* `rucio_jetskim_specialhandling.sh` is a script for special handling of the
b-jet skim updates and QCD jet skim files.

* `rucio_reformat_unskimmed.sh` moves the unskimmed data into the appropriate
places and renames the datasets as needed.

* `rucio_check.sh` checks for any inconsistencies between the files in rucio and
those on eos, and makes sure that everything has been transferred to the main
open data endpoint (i.e. the files aren't still only on scratch space).

* `dataset_list.txt` is a list of the datasets created in rucio for this release.

* `create_metadata.py` creates a metadata json file containing all the datasets
to be released (those in `dataset_list.txt`); for each dataset it includes a
dictionary of files. For each file, it records the following metadata (see the
sketch after this list):
  * adler32 checksum
  * size in bytes
  * number of events
  * type (root)
  * uri (file location on the rucio endpoint)
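
  For illustration, a minimal sketch of the `file_locations` block that ends up
  in the output json (the dataset name is one from `dataset_list.txt`; the file
  name and all values are invented placeholders):

  ```python
  # Illustrative only: the shape of the per-file metadata written by create_metadata.py
  file_locations = {
      'opendata:opendata.ODEO_FEB2025_4lep_MC_v0': {
          'opendata:mc_345060.placeholder.4lep.root': {         # invented file name
              'checksum': 'adler32:1a2b3c4d',  # adler32 checksum
              'size': 123456789,               # size in bytes
              'events': 10000,                 # number of events
              'type': 'root',                  # file type
              'uri': 'root://eospublic.cern.ch//eos/opendata/atlas/...',  # location on the open data endpoint
          },
      },
  }
  ```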

* `odeo_file_mapping_ODEO_v0_FEB2025_2025-06-30.json` is the output from the
most recent run of `create_metadata.py`.

* `make_odeo_json.py` creates all of the json files for the CERN open data
portal records. One record is created per rucio dataset.

* `build_full_metadata_json.py` builds json-formatted metadata files for the
data and MC once all of the above scripts have been run, for injection of the
metadata into the atlasopenmagic database. It requires the csv file available
from `https://opendata.atlas.cern/docs/data/for_education/13TeV25_metadata` and
produces files named `data_database_metadata_2025e.json` and
`mc_database_metadata_2025e.json`.
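
For reference, a minimal sketch of how the produced database metadata files
might be read back (the structure follows `build_full_metadata_json.py`; the
loop bodies are just illustrative):

```python
import json

# Read back the MC metadata produced by build_full_metadata_json.py
with open('mc_database_metadata_2025e.json') as f:
    mc_md = json.load(f)

# Top-level keys are dataset numbers (DSIDs); each entry carries the csv
# metadata plus an unskimmed file list and one file list per skim
for dsid, md in mc_md.items():
    print(dsid, len(md['file_list']), 'unskimmed files')
    for skim in md['skims']:
        print(' ', skim['skim_type'], len(skim['file_list']), 'files')
```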

atlas-2025feb-odeo/build_full_metadata_json.py

#!/usr/bin/env python3

# Simple script to convert the csv metadata file into the full json metadata files

# For pretty printing and writing json
import json

# For csv reading
import csv

# File with the mapping of file names for each dataset
with open('odeo_file_mapping_ODEO_v0_FEB2025_2025-06-30.json','r') as json_metadata_file:
    json_file_locations = json.load(json_metadata_file)['file_locations']

# Open our metadata file to go through all the EVNT samples we've gathered metadata for
input_md = {}
with open('metadata.csv','r') as metadata_csv_file:
    # Open the metadata as a formatted dictionary
    md_reader = csv.DictReader(metadata_csv_file, delimiter=',', quotechar='"', lineterminator='\n')
    # Loop through all the rows in the file (header is consumed automatically by DictReader)
    for row in md_reader:
        my_dsid = row['dataset_number']
        input_md[my_dsid] = { x:row[x] for x in row if x!='dataset_number' }
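
# At this point input_md maps each dataset number (DSID) to a dict of the
# remaining csv columns, e.g. (illustrative only - the real column names come
# from the csv header):
#   input_md['345060'] -> { 'column_a': '...', 'column_b': '...', ... }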

# Final MD with file listings
full_md = {}
# For each dataset, we're going to extend and then copy over the MD
for aset in input_md:
    # Start by listing out all the skims
    full_md[aset] = {'skims':[]}
    # The unskimmed files go into the regular file list field
    full_md[aset]['file_list'] = [ json_file_locations['opendata:opendata.ODEO_FEB2025_noskim_MC_v0'][afile]['uri']
                                   for afile in json_file_locations['opendata:opendata.ODEO_FEB2025_noskim_MC_v0'] if f'mc_{aset}' in afile ]
    # Now go through each skim, add the files and publish the metadata
    for askim in ['2J2LMET30', '1LMET30', '3J1LMET30', 'exactly4lep', '2muons',
                  '2to4lep', '4lep', 'exactly3lep', 'GamGam', '3lep']:
        full_md[aset]['skims'] += [ {'skim_type':askim,
                                     'file_list':[ json_file_locations[f'opendata:opendata.ODEO_FEB2025_{askim}_MC_v0'][afile]['uri']
                                                   for afile in json_file_locations[f'opendata:opendata.ODEO_FEB2025_{askim}_MC_v0'] if f'mc_{aset}' in afile ]} ]
    # Special handling for the b-jet skims, which use a different (lower-case, v1) dataset naming
    for askim in ['2bjets']:
        full_md[aset]['skims'] += [ {'skim_type':askim,
                                     'file_list':[ json_file_locations[f'opendata:opendata.ODEO_FEB2025_{askim}_mc_v1'][afile]['uri']
                                                   for afile in json_file_locations[f'opendata:opendata.ODEO_FEB2025_{askim}_mc_v1'] if f'mc_{aset}' in afile ]} ]

    # Copy the csv metadata for this dataset on top of the file listings
    full_md[aset].update(input_md[aset])
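
# At this point each full_md entry looks like (illustrative):
#   { 'file_list': ['root://...', ...],
#     'skims': [ {'skim_type': '2J2LMET30', 'file_list': [...]}, ... ],
#     plus one key per remaining csv column }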
# Write out the file
with open('mc_database_metadata_2025e.json','w') as metadata_json_file:
    json.dump(
        full_md,
        metadata_json_file,
        indent=2,
        sort_keys=True,
        ensure_ascii=False,
        separators=(",", ": "),
    )

# Reformat and then write the data metadata to a file
data_metadata = {'data':{'skims':[],'file_list':[]}}
data_metadata['data']['file_list'] = [ json_file_locations['opendata:opendata.ODEO_FEB2025_noskim_Data_v0'][x]['uri']
                                       for x in json_file_locations['opendata:opendata.ODEO_FEB2025_noskim_Data_v0'] ]
for askim in ['2J2LMET30', '1LMET30', '3J1LMET30', 'exactly4lep', '2muons',
              '2to4lep', '4lep', 'exactly3lep', 'GamGam', '3lep']:
    data_metadata['data']['skims'] += [ {'skim_type':askim,
                                         'file_list':[ json_file_locations[f'opendata:opendata.ODEO_FEB2025_{askim}_Data_v0'][x]['uri']
                                                       for x in json_file_locations[f'opendata:opendata.ODEO_FEB2025_{askim}_Data_v0'] ]} ]
# Special handling for the b-jet skim, which uses a different (lower-case, v1) dataset naming
for askim in ['2bjets']:
    data_metadata['data']['skims'] += [ {'skim_type':askim,
                                         'file_list':[ json_file_locations[f'opendata:opendata.ODEO_FEB2025_{askim}_data_v1'][x]['uri']
                                                       for x in json_file_locations[f'opendata:opendata.ODEO_FEB2025_{askim}_data_v1'] ]} ]
with open('data_database_metadata_2025e.json','w') as data_metadata_json_file:
    json.dump(
        data_metadata,
        data_metadata_json_file,
        indent=2,
        sort_keys=True,
        ensure_ascii=False,
        separators=(",", ": "),
    )

atlas-2025feb-odeo/create_metadata.py

#!/usr/bin/env python3
import datetime

# Grab the list of datasets that we want to run over
dataset_input = 'dataset_list.txt'

# Set a postfix for the output file name, so that we can nicely version things
static_did_post = '_ODEO_v0_FEB2025_'+datetime.date.today().isoformat()

# Dictionary mapping datasets to file names
datasets = {}
# Dictionary of Datasets --> dictionary of file names
# file names --> dictionary of properties (checksum, events, uri, type, size)
file_locations = {}

# Let's go over the list of files...
with open(dataset_input,'r') as dataset_list_file:
    for bline in dataset_list_file:
        # Make sure we ignore comments - in case folks are commenting out datasets
        aline = bline.split('#')[0].strip()
        # Skip blank lines
        if len(aline)<2:
            continue
        # Initialize our dataset lists and file location lists
        datasets[ aline ] = []
        file_locations[ aline ] = {}
print(f'Read in {len(datasets.keys())} datasets')

# Get our rucio client ready
from rucio.client.client import Client
rc = Client()

# Get ROOT out as well - this is a bit inconvenient, but rucio doesn't know the metadata of these ntuples
# because it has no provenance information, so we have to figure out for ourselves what they contain.
import ROOT

# Loop over all the datasets
for dataset_number,dataset in enumerate(datasets):
    # Let the people know how we're doing
    print(f'Working on dataset {dataset_number+1} of {len(datasets)}: {dataset}')

    # Get the scope
    my_scope = dataset.split(':')[0]

    # Note that the files and the dataset might have different scopes, so we need a little special handling
    # Grab the list of files from rucio - for education and outreach, we are always going to take _all_ the events
    fl = rc.list_files(scope=my_scope, name=dataset.split(':')[1])
    # Note that we're stashing the full file list so we can check later that we got all the files
    for a in fl:
        # Update the map of datasets : files
        datasets[dataset] += [ a['name'] ]
        # Get the first part of the per-file metadata
        file_locations[dataset][ a['scope']+':'+a['name'] ] = { 'checksum':'adler32:'+a['adler32'], 'size':a['bytes'], 'events':a['events'], 'type':'root' }

    # Second rucio query, needed to get the file location on eos
    replicalist = rc.list_replicas([{'scope':my_scope,'name':dataset.split(':')[1]}])
    # Go through all the results (all the files in the dataset again)
    for areplica in replicalist:
        # Make sure we found this file before - just error checking, this should never be printed
        if areplica['scope']+':'+areplica['name'] not in file_locations[dataset]:
            print(f'Warning: did not find {areplica["scope"]} {areplica["name"]} in file_locations for {dataset}')
            continue
        # Go through the physical locations and get the one at the open data endpoint
        for a_pfn in areplica['pfns']:
            if 'opendata/atlas' in a_pfn:
                file_locations[dataset][ areplica['scope']+':'+areplica['name'] ]['uri'] = a_pfn
                # Take the opportunity to fix the number of events
                eos_file = ROOT.TFile.Open(a_pfn)
                # Protect against empty files
                if 'analysis' in eos_file.GetListOfKeys():
                    eos_tree = eos_file.Get('analysis')
                    file_locations[dataset][ areplica['scope']+':'+areplica['name'] ]['events'] = eos_tree.GetEntries()
                else:
                    file_locations[dataset][ areplica['scope']+':'+areplica['name'] ]['events'] = 0
                break
        else:
            # for-else: the break above never fired, so we didn't find a copy on the open data endpoint
            print(f'Did not find {dataset} file {my_scope+":"+areplica["name"]} on eos in pfns {areplica["pfns"]}')

# Record the file mapping that we established
import json
with open( 'odeo_file_mapping'+static_did_post+'.json' , 'w' ) as file_backup:
    json.dump( obj={'file_dictionary':datasets, 'file_locations':file_locations} , fp=file_backup )

# All done!

atlas-2025feb-odeo/dataset_list.txt

opendata:opendata.ODEO_FEB2025_2J2LMET30_Data_v0
opendata:opendata.ODEO_FEB2025_noskim_Data_v0
opendata:opendata.ODEO_FEB2025_4lep_MC_v0
opendata:opendata.ODEO_FEB2025_1LMET30_MC_v0
opendata:opendata.ODEO_FEB2025_2muons_MC_v0
opendata:opendata.ODEO_FEB2025_1LMET30_Data_v0
opendata:opendata.ODEO_FEB2025_exactly4lep_MC_v0
#opendata:opendata.ODEO_FEB2025_2bjets_Data_v0
opendata:opendata.ODEO_FEB2025_2bjets_data_v1
opendata:opendata.ODEO_FEB2025_3J1LMET30_Data_v0
opendata:opendata.ODEO_FEB2025_3lep_MC_v0
opendata:opendata.ODEO_FEB2025_exactly4lep_Data_v0
opendata:opendata.ODEO_FEB2025_2to4lep_MC_v0
opendata:opendata.ODEO_FEB2025_GamGam_MC_v0
opendata:opendata.ODEO_FEB2025_2muons_Data_v0
#opendata:opendata.ODEO_FEB2025_2bjets_MC_v0
opendata:opendata.ODEO_FEB2025_2bjets_mc_v1
opendata:opendata.ODEO_FEB2025_2to4lep_Data_v0
opendata:opendata.ODEO_FEB2025_4lep_Data_v0
opendata:opendata.ODEO_FEB2025_exactly3lep_Data_v0
opendata:opendata.ODEO_FEB2025_3J1LMET30_MC_v0
opendata:opendata.ODEO_FEB2025_GamGam_Data_v0
opendata:opendata.ODEO_FEB2025_exactly3lep_MC_v0
opendata:opendata.ODEO_FEB2025_2J2LMET30_MC_v0
opendata:opendata.ODEO_FEB2025_3lep_Data_v0
opendata:opendata.ODEO_FEB2025_noskim_MC_v0