Skip to content

Commit

Permalink
updated scripts to use command line arguments
Browse files Browse the repository at this point in the history
  • Loading branch information
mas400 committed Jun 28, 2017
1 parent 9940453 commit 9219a44
Show file tree
Hide file tree
Showing 3 changed files with 223 additions and 206 deletions.
59 changes: 33 additions & 26 deletions src/scripts/dats_to_doi/create_spew_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,40 +4,47 @@
import json
import csv
import re
import sys


def extract_row(json_data, interval):
    """Build one spew_mapping.csv row from a parsed DATS JSON document.

    Returns [apollo location code, SPEW version, landing page,
    zero-padded anonymous identifier, title].
    """
    title = json_data['title']
    landing_page = json_data['distributions'][0]['access']['landingPage']
    # The Apollo location code is the first run of digits in the
    # spatial-coverage identifier URL.
    ls_url = json_data['spatialCoverage'][0]['identifier']['identifier']
    location_code = int(re.search(r'\d+', ls_url).group())
    # NOTE(review): assumes types[2] is always the platform entry in the
    # DATS document — confirm against the DATS generator.
    version = json_data['types'][2]['platform']['value']
    return [location_code, version, landing_page, str(interval).zfill(7), title]


def main():
    """Read SPEW DATS metadata files and create/extend spew_mapping.csv.

    Usage: python create_spew_mapping.py <path_to_dats_directory>
    """
    if len(sys.argv) < 2:
        print('DESCRIPTION:\n\tReads SPEW metadata and creates a CSV mapping of the location code, SPEW version, and landing page URL to an anonymous identifier\n\tA directory path is required\n')
        print('USAGE:\n\tpython create_spew_mapping.py <path_to_dats_directory>\n')
        return

    dats_folder = sys.argv[1]

    if os.path.isfile('spew_mapping.csv'):
        # Continue the anonymous-id sequence after the rows already present
        # (minus the header row).  Fix: the original opened the file a second
        # time inside sum() and never closed that handle.
        with open('spew_mapping.csv') as existing:
            interval = sum(1 for _ in existing) - 1
        mode = 'a+'
        write_header = False
    else:
        interval = 0
        mode = 'w'
        write_header = True

    # Fix: use a context manager instead of a manually closed handle bound
    # to the builtin-shadowing name `file`.
    with open('spew_mapping.csv', mode) as out_file:
        wr = csv.writer(out_file, quoting=csv.QUOTE_ALL)
        if write_header:
            wr.writerow(['Apollo Location Code', 'SPEW Version', 'Landing Page', 'Anonymous Identifier', 'Title'])

        for filename in os.listdir(dats_folder):
            if not filename.endswith(".json"):
                continue
            interval += 1
            # Read each DATS metadata file as JSON.
            with open(os.path.join(dats_folder, filename)) as json_file:
                json_data = json.load(json_file)
            wr.writerow(extract_row(json_data, interval))


if __name__ == '__main__':
    main()
109 changes: 57 additions & 52 deletions src/scripts/dats_to_doi/update_dats_with_doi.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,68 +2,73 @@
import csv
import json
import collections
import sys


def load_mapping(csv_path):
    """Read a spew_mapping CSV into a dict keyed by the 'Title' column.

    Each value is a dict mapping the header names to the (stripped)
    cell values of one row.
    """
    csv_dict = {}
    # Fix: the original opened the CSV without ever closing it.
    with open(csv_path) as mapping_file:
        data = csv.reader(mapping_file)
        # First line holds the column names.
        fields = next(data)
        for row in data:
            item = {name: value.strip() for name, value in zip(fields, row)}
            csv_dict[item['Title']] = item
    return csv_dict


def dats_file_name(landing_page):
    """Extract the DATS JSON file name from a SPEW landing-page URL."""
    parts = landing_page.split('/')
    # NOTE(review): longer URLs appear to carry the dataset name one path
    # segment deeper (index 8 vs 7) — confirm against actual SPEW URLs.
    if len(parts) > 10:
        return parts[8] + ".json"
    return parts[7] + ".json"


def main():
    """Update DATS metadata files with the Zenodo DOI and PURL.

    Usage: python update_dats_with_doi.py <access_token> <path_to_dats_directory>
    """
    if len(sys.argv) < 3:
        print('DESCRIPTION:\n\tUpdate the DATS metadata with the DOI and PURL\n\tA Zenodo access token and directory path are required\n')
        print('USAGE:\n\tpython update_dats_with_doi.py <access_token> <path_to_dats_directory>\n')
        return

    ACCESS_TOKEN = sys.argv[1]
    dats_folder = sys.argv[2]

    csv_dict = load_mapping('spew_mapping.csv')

    response = requests.get('https://zenodo.org//api/deposit/depositions',
                            params={'access_token': ACCESS_TOKEN, 'size': 200, 'status': 'published'})
    # Fix: iterate the depositions directly instead of range(len(...)),
    # and avoid shadowing the builtin `id`.
    for deposition in response.json():
        deposition_id = deposition['id']

        r = requests.get("https://zenodo.org/api/deposit/depositions/" + str(deposition_id),
                         params={'access_token': ACCESS_TOKEN})
        deposition_json = r.json()

        # The direct download link doubles as the access URL.
        access_url = "https://zenodo.org/record/" + str(deposition_json['record_id']) + "/files/" + deposition_json['files'][0]['filename']

        # Cross-reference the deposition title with spew_mapping.csv.
        # NOTE(review): RABIES/H1N1 depositions are deliberately skipped —
        # presumably they are not SPEW synthetic populations; confirm.
        title = deposition_json['title']
        if "RABIES" in title.upper() or "H1N1" in title:
            continue
        try:
            landing_url = "http://w3id.org/spew/" + csv_dict[title]['Anonymous Identifier']
        except KeyError:
            # Deposition has no entry in the mapping; nothing to update.
            continue

        # The old landing page encodes the name of the local DATS file.
        file_name = dats_file_name(csv_dict[title]['Landing Page'])

        # Update the DATS file with the identifier info and the access and
        # landing URLs; OrderedDict preserves the original key order.
        try:
            with open(dats_folder + file_name) as json_file:
                old_meta_data = json.load(json_file, object_pairs_hook=collections.OrderedDict)
        except FileNotFoundError:
            continue

        old_meta_data['identifier']['identifier'] = deposition_json['doi_url']
        old_meta_data['identifier']['identifierSource'] = "zenodo"
        old_meta_data['distributions'][0]['access']['accessURL'] = access_url
        old_meta_data['distributions'][0]['access']['landingPage'] = landing_url

        with open(dats_folder + file_name, 'w') as outfile:
            json.dump(old_meta_data, outfile, indent=4)
            print("created " + file_name)


if __name__ == '__main__':
    main()
Loading

0 comments on commit 9219a44

Please sign in to comment.