Skip to content

Commit

Permalink
updated scripts to use command line arguments
Browse files Browse the repository at this point in the history
  • Loading branch information
mas400 committed Jun 28, 2017
1 parent 9940453 commit 9219a44
Show file tree
Hide file tree
Showing 3 changed files with 223 additions and 206 deletions.
59 changes: 33 additions & 26 deletions src/scripts/dats_to_doi/create_spew_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,40 +4,47 @@
import json
import csv
import re
import sys


def extract_row(json_data, interval):
    """Build one spew_mapping.csv row from a parsed DATS JSON document.

    Returns [apollo location code, SPEW version, landing page,
    zero-padded anonymous identifier, title].
    """
    title = json_data['title']
    landing_page = json_data['distributions'][0]['access']['landingPage']
    # The Apollo location code is the first run of digits in the
    # spatial-coverage identifier URL.
    ls_url = json_data['spatialCoverage'][0]['identifier']['identifier']
    location_code = int(re.search(r'\d+', ls_url).group())
    # NOTE(review): assumes types[2] is always the platform entry in the
    # DATS document — confirm against the DATS generator.
    version = json_data['types'][2]['platform']['value']
    return [location_code, version, landing_page, str(interval).zfill(7), title]


def main():
    """Read SPEW DATS metadata files and create/extend spew_mapping.csv.

    Usage: python create_spew_mapping.py <path_to_dats_directory>
    """
    if len(sys.argv) < 2:
        print('DESCRIPTION:\n\tReads SPEW metadata and creates a CSV mapping of the location code, SPEW version, and landing page URL to an anonymous identifier\n\tA directory path is required\n')
        print('USAGE:\n\tpython create_spew_mapping.py <path_to_dats_directory>\n')
        return

    dats_folder = sys.argv[1]

    if os.path.isfile('spew_mapping.csv'):
        # Continue the anonymous-id sequence after the rows already present
        # (minus the header row).  Fix: the original opened the file a second
        # time inside sum() and never closed that handle.
        with open('spew_mapping.csv') as existing:
            interval = sum(1 for _ in existing) - 1
        mode = 'a+'
        write_header = False
    else:
        interval = 0
        mode = 'w'
        write_header = True

    # Fix: use a context manager instead of a manually closed handle bound
    # to the builtin-shadowing name `file`.
    with open('spew_mapping.csv', mode) as out_file:
        wr = csv.writer(out_file, quoting=csv.QUOTE_ALL)
        if write_header:
            wr.writerow(['Apollo Location Code', 'SPEW Version', 'Landing Page', 'Anonymous Identifier', 'Title'])

        for filename in os.listdir(dats_folder):
            if not filename.endswith(".json"):
                continue
            interval += 1
            # Read each DATS metadata file as JSON.
            with open(os.path.join(dats_folder, filename)) as json_file:
                json_data = json.load(json_file)
            wr.writerow(extract_row(json_data, interval))


if __name__ == '__main__':
    main()
109 changes: 57 additions & 52 deletions src/scripts/dats_to_doi/update_dats_with_doi.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,68 +2,73 @@
import csv
import json
import collections
import sys


def load_mapping(csv_path):
    """Read a spew_mapping CSV into a dict keyed by the 'Title' column.

    Each value is a dict mapping the header names to the (stripped)
    cell values of one row.
    """
    csv_dict = {}
    # Fix: the original opened the CSV without ever closing it.
    with open(csv_path) as mapping_file:
        data = csv.reader(mapping_file)
        # First line holds the column names.
        fields = next(data)
        for row in data:
            item = {name: value.strip() for name, value in zip(fields, row)}
            csv_dict[item['Title']] = item
    return csv_dict


def dats_file_name(landing_page):
    """Extract the DATS JSON file name from a SPEW landing-page URL."""
    parts = landing_page.split('/')
    # NOTE(review): longer URLs appear to carry the dataset name one path
    # segment deeper (index 8 vs 7) — confirm against actual SPEW URLs.
    if len(parts) > 10:
        return parts[8] + ".json"
    return parts[7] + ".json"


def main():
    """Update DATS metadata files with the Zenodo DOI and PURL.

    Usage: python update_dats_with_doi.py <access_token> <path_to_dats_directory>
    """
    if len(sys.argv) < 3:
        print('DESCRIPTION:\n\tUpdate the DATS metadata with the DOI and PURL\n\tA Zenodo access token and directory path are required\n')
        print('USAGE:\n\tpython update_dats_with_doi.py <access_token> <path_to_dats_directory>\n')
        return

    ACCESS_TOKEN = sys.argv[1]
    dats_folder = sys.argv[2]

    csv_dict = load_mapping('spew_mapping.csv')

    response = requests.get('https://zenodo.org//api/deposit/depositions',
                            params={'access_token': ACCESS_TOKEN, 'size': 200, 'status': 'published'})
    # Fix: iterate the depositions directly instead of range(len(...)),
    # and avoid shadowing the builtin `id`.
    for deposition in response.json():
        deposition_id = deposition['id']

        r = requests.get("https://zenodo.org/api/deposit/depositions/" + str(deposition_id),
                         params={'access_token': ACCESS_TOKEN})
        deposition_json = r.json()

        # The direct download link doubles as the access URL.
        access_url = "https://zenodo.org/record/" + str(deposition_json['record_id']) + "/files/" + deposition_json['files'][0]['filename']

        # Cross-reference the deposition title with spew_mapping.csv.
        # NOTE(review): RABIES/H1N1 depositions are deliberately skipped —
        # presumably they are not SPEW synthetic populations; confirm.
        title = deposition_json['title']
        if "RABIES" in title.upper() or "H1N1" in title:
            continue
        try:
            landing_url = "http://w3id.org/spew/" + csv_dict[title]['Anonymous Identifier']
        except KeyError:
            # Deposition has no entry in the mapping; nothing to update.
            continue

        # The old landing page encodes the name of the local DATS file.
        file_name = dats_file_name(csv_dict[title]['Landing Page'])

        # Update the DATS file with the identifier info and the access and
        # landing URLs; OrderedDict preserves the original key order.
        try:
            with open(dats_folder + file_name) as json_file:
                old_meta_data = json.load(json_file, object_pairs_hook=collections.OrderedDict)
        except FileNotFoundError:
            continue

        old_meta_data['identifier']['identifier'] = deposition_json['doi_url']
        old_meta_data['identifier']['identifierSource'] = "zenodo"
        old_meta_data['distributions'][0]['access']['accessURL'] = access_url
        old_meta_data['distributions'][0]['access']['landingPage'] = landing_url

        with open(dats_folder + file_name, 'w') as outfile:
            json.dump(old_meta_data, outfile, indent=4)
            print("created " + file_name)


if __name__ == '__main__':
    main()
Loading

0 comments on commit 9219a44

Please sign in to comment.