From a2906e29057fab7e985631bcf7c31c842303d9c8 Mon Sep 17 00:00:00 2001
From: hancush
Date: Tue, 31 Aug 2021 11:31:44 -0500
Subject: [PATCH 1/8] Update existing sources/access points on import,
 bulletproof publication date logic

---
 .../management/commands/import_google_doc.py | 95 ++++++++++---------
 1 file changed, 52 insertions(+), 43 deletions(-)

diff --git a/sfm_pc/management/commands/import_google_doc.py b/sfm_pc/management/commands/import_google_doc.py
index e0476d02..51c06838 100644
--- a/sfm_pc/management/commands/import_google_doc.py
+++ b/sfm_pc/management/commands/import_google_doc.py
@@ -1394,55 +1394,64 @@ def create_sources(self, source_sheet):
 
         for idx, source_data in enumerate(source_sheet['values']):
             access_point_uuid = source_data['source:access_point_id:admin'].strip()
+
             try:
-                AccessPoint.objects.get(uuid=access_point_uuid)
-            except ValidationError:
+                access_point, _ = AccessPoint.objects.get_or_create(uuid=access_point_uuid)
+
+            except (ValidationError, ValueError):
                 self.log_error(
                     'Invalid source UUID: "{}"'.format(access_point_uuid),
                     sheet='sources',
                     current_row=idx + 2  # Handle 0-index and header row
                 )
-            except AccessPoint.DoesNotExist:
-                source_info = {
-                    'title': source_data[Source.get_spreadsheet_field_name('title')],
-                    'type': source_data[Source.get_spreadsheet_field_name('type')],
-                    'author': source_data[Source.get_spreadsheet_field_name('author')],
-                    'publication': source_data[Source.get_spreadsheet_field_name('publication')],
-                    'publication_country': source_data[Source.get_spreadsheet_field_name('publication_country')],
-                    'source_url': source_data[Source.get_spreadsheet_field_name('source_url')],
-                    'user': self.user
-                }
-                # Figure out if created/uploaded/published dates are timestamps
-                for prefix in ('published', 'created', 'uploaded'):
-                    date_val = source_data[Source.get_spreadsheet_field_name('{}_date'.format(prefix))]
-                    try:
-                        # Try to parse the value as a timestamp (remove timezone
-                        # marker for Python <3.7)
-                        parsed_date = datetime.strptime(date_val.replace('Z', ''), '%Y-%m-%dT%H:%M:%S')
-                    except ValueError:
-                        # Value is a date, or empty
-                        parsed_date = self.parse_date(date_val)
-                        source_info['{}_date'.format(prefix)] = parsed_date
-                    else:
-                        source_info['{}_timestamp'.format(prefix)] = parsed_date
-
-                    if not parsed_date and prefix == 'published':
-                        message = 'Invalid published_date "{1}" at {2}'.format(prefix, date_val, access_point_uuid)
-                        self.log_error(message, sheet='sources', current_row=idx + 2)
-
-                new_source, created = Source.objects.get_or_create(**source_info)
-
-                AccessPoint.objects.create(
-                    uuid=access_point_uuid,
-                    type=source_data[AccessPoint.get_spreadsheet_field_name('type')],
-                    trigger=source_data[AccessPoint.get_spreadsheet_field_name('trigger')],
-                    accessed_on=self.parse_date(source_data[AccessPoint.get_spreadsheet_field_name('accessed_on')]),
-                    archive_url=source_data[AccessPoint.get_spreadsheet_field_name('archive_url')],
-                    source=new_source,
-                    user=self.user
-                )
-            except ValueError:
-                self.log_error("Invalid access point at: " + access_point_uuid)
+                continue
+
+            source_info = {
+                'title': source_data[Source.get_spreadsheet_field_name('title')],
+                'type': source_data[Source.get_spreadsheet_field_name('type')],
+                'author': source_data[Source.get_spreadsheet_field_name('author')],
+                'publication': source_data[Source.get_spreadsheet_field_name('publication')],
+                'publication_country': source_data[Source.get_spreadsheet_field_name('publication_country')],
+                'source_url': source_data[Source.get_spreadsheet_field_name('source_url')],
+                'user': self.user,
+            }
+
+            for prefix in ('published', 'created', 'uploaded'):
+                date_val = source_data[Source.get_spreadsheet_field_name('{}_date'.format(prefix))]
+                parsed_date = None
+
+                try:
+                    # Try to parse the value as a timestamp (remove timezone
+                    # marker for Python <3.7)
+                    parsed_date = datetime.strptime(date_val.replace('Z', ''), '%Y-%m-%dT%H:%M:%S')
+
+                except ValueError:
+                    # Value is a date, or empty
+                    parsed_date = self.parse_date(date_val)
+                    source_info['{}_date'.format(prefix)] = parsed_date
+
+                else:
+                    source_info['{}_timestamp'.format(prefix)] = parsed_date
+
+                if not parsed_date and prefix == 'published':
+                    message = 'Invalid published_date "{1}" at {2}'.format(prefix, date_val, access_point_uuid)
+                    self.log_error(message, sheet='sources', current_row=idx + 2)
+
+            source, _ = Source.objects.get_or_create(**source_info)
+
+            access_point_info = {
+                'type': source_data[AccessPoint.get_spreadsheet_field_name('type')],
+                'trigger': source_data[AccessPoint.get_spreadsheet_field_name('trigger')],
+                'accessed_on': self.parse_date(source_data[AccessPoint.get_spreadsheet_field_name('accessed_on')]),
+                'archive_url': source_data[AccessPoint.get_spreadsheet_field_name('archive_url')],
+                'source': source,
+                'user': self.user,
+            }
+
+            for attr, val in access_point_info.items():
+                setattr(access_point, 'attr', val)
+
+            access_point.save()
 
     def get_sources(self, source_id_string):
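Note: the core of this patch is the timestamp-or-date dispatch — try the spreadsheet cell as an
ISO timestamp first, and let the ValueError fall through to plain date parsing. A minimal
runnable sketch of that dispatch, with a hypothetical classify_source_date helper standing in
for the inline loop above:

    from datetime import datetime

    def classify_source_date(value):
        """Return ('timestamp', datetime) for ISO timestamps, ('date', raw value) otherwise."""
        try:
            # Strip the trailing 'Z' so strptime works on Python <3.7, which
            # cannot parse that timezone marker with %Y-%m-%dT%H:%M:%S
            return 'timestamp', datetime.strptime(value.replace('Z', ''), '%Y-%m-%dT%H:%M:%S')
        except ValueError:
            # Plain dates ('2021-08-31') and empty cells land here
            return 'date', value

    assert classify_source_date('2021-08-31T11:31:44Z')[0] == 'timestamp'
    assert classify_source_date('2021-08-31')[0] == 'date'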
From 627a881ab8941610d41ade439a14ab2fe04dcf63 Mon Sep 17 00:00:00 2001
From: hancush
Date: Tue, 7 Sep 2021 15:14:23 -0500
Subject: [PATCH 2/8] Stash sourced date test

---
 tests/test_importer.py | 43 +++++++++++++++++++++++++++---------------
 1 file changed, 28 insertions(+), 15 deletions(-)

diff --git a/tests/test_importer.py b/tests/test_importer.py
index 59dc21e4..b757fade 100644
--- a/tests/test_importer.py
+++ b/tests/test_importer.py
@@ -112,11 +112,12 @@ def test_sources(data_import, data_folder):
 
 
 @pytest.mark.django_db
-def test_source_dates_and_timestamps(data_import):
+def test_source_dates_and_timestamps(data_import, data_folder):
     """Make sure Source date fields properly parse dates and timestamps."""
     timestamp_src = Source.objects.get(title='Source Timestamps')
     date_src = Source.objects.get(title='Source Dates')
     date_and_timestamp_prefixes = ('created', 'published', 'uploaded')
+
     for prefix in date_and_timestamp_prefixes:
         date_field = '{}_date'.format(prefix)
         timestamp_field = '{}_timestamp'.format(prefix)
@@ -132,27 +133,39 @@ def test_source_dates_and_timestamps(data_import, data_folder):
         'sources-errors.csv'
     )
 
-    undated_sources = Source.objects.filter(published_date='', published_timestamp__isnull=True)\
-                                    .values_list('accesspoint__uuid', flat=True)
+    # Test that source errors are reported whether it's the first or 101st time
+    # we're seeing them.
+    for run_number in range(1, 3):
+        if run_number == 2:
+            # Remove the error file from the first run
+            os.remove(error_file)
+
+            # Re-run the import
+            data_import = io.StringIO()
+            call_command('import_google_doc', folder=data_folder, stdout=output)
+
+        undated_sources = Source.objects.filter(published_date='', published_timestamp__isnull=True)\
+                                        .values_list('accesspoint__uuid', flat=True)
+
+        undated_source_set = set(str(uuid) for uuid in undated_sources)
 
-    undated_source_set = set(str(uuid) for uuid in undated_sources)
+        error_source_set = set()
 
-    error_source_set = set()
+        with open(error_file, 'r') as f:
+            reader = csv.reader(f)
 
-    with open(error_file, 'r') as f:
-        reader = csv.reader(f)
+            next(reader)
 
-        next(reader)  # discard header
+            for record in reader:
+                _, message = record
+                assert message.startswith('Invalid published_date')
 
-        for record in reader:
-            _, message = record
-            assert message.startswith('Invalid published_date')
+                source_id = message.split()[-1]
+                assert source_id in undated_source_set
+                error_source_set.add(source_id)
 
-            source_id = message.split()[-1]
-            assert source_id in undated_source_set
-            error_source_set.add(source_id)
+        assert undated_source_set == error_source_set
 
-    assert undated_source_set == error_source_set
 
 @pytest.mark.django_db
 def test_incidents(data_import):
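Note: the test leans on the layout of the error CSV — a header row, then two-column records
whose message ends with the offending access point UUID. A hypothetical helper showing the
same extraction the loop above performs:

    import csv

    def error_source_ids(error_file):
        """Collect access point UUIDs from a sources error CSV (a sketch,
        assuming the two-column, header-first layout the test relies on)."""
        ids = set()
        with open(error_file) as f:
            reader = csv.reader(f)
            next(reader)  # discard header
            for _, message in reader:
                # Messages look like: Invalid published_date "..." at <uuid>
                if message.startswith('Invalid published_date'):
                    ids.add(message.split()[-1])
        return ids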
From d628aa07a6f910af55b4836ed77a0303a321ffff Mon Sep 17 00:00:00 2001
From: hancush
Date: Wed, 8 Sep 2021 09:54:33 -0500
Subject: [PATCH 3/8] Patch test, import bugs

---
 .github/workflows/main.yml                   |  1 +
 .../management/commands/import_google_doc.py | 22 ++++++++++++++-----
 source/models.py                             |  2 +-
 tests/test_importer.py                       |  2 +-
 4 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index b0d9891e..eba46368 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -10,6 +10,7 @@ on:
   pull_request:
     branches:
       - master
+      - hcg/translation
 
 jobs:
   test:
diff --git a/sfm_pc/management/commands/import_google_doc.py b/sfm_pc/management/commands/import_google_doc.py
index 51c06838..e8e5e640 100644
--- a/sfm_pc/management/commands/import_google_doc.py
+++ b/sfm_pc/management/commands/import_google_doc.py
@@ -1396,7 +1396,10 @@ def create_sources(self, source_sheet):
             access_point_uuid = source_data['source:access_point_id:admin'].strip()
 
             try:
-                access_point, _ = AccessPoint.objects.get_or_create(uuid=access_point_uuid)
+                access_point, _ = AccessPoint.objects.get_or_create(
+                    uuid=access_point_uuid,
+                    user=self.user,
+                )
 
             except (ValidationError, ValueError):
                 self.log_error(
@@ -1426,8 +1429,11 @@ def create_sources(self, source_sheet):
                     parsed_date = datetime.strptime(date_val.replace('Z', ''), '%Y-%m-%dT%H:%M:%S')
 
                 except ValueError:
-                    # Value is a date, or empty
-                    parsed_date = self.parse_date(date_val)
+                    # Fall back to an empty string because we want to use
+                    # source_info to retrieve and update existing Sources, and
+                    # date fields default to an empty string if no data is
+                    # provided
+                    parsed_date = self.parse_date(date_val) or ''
                     source_info['{}_date'.format(prefix)] = parsed_date
 
                 else:
@@ -1437,7 +1443,13 @@ def create_sources(self, source_sheet):
                     message = 'Invalid published_date "{1}" at {2}'.format(prefix, date_val, access_point_uuid)
                     self.log_error(message, sheet='sources', current_row=idx + 2)
 
-            source, _ = Source.objects.get_or_create(**source_info)
+            source, created = Source.objects.get_or_create(**source_info)
+
+            self.stdout.write(
+                '{0} Source "{1}" from row {2}'.format('Created' if created else 'Updated',
+                                                       source,
+                                                       idx + 2)
+            )
 
             access_point_info = {
                 'type': source_data[AccessPoint.get_spreadsheet_field_name('type')],
@@ -1449,7 +1461,7 @@ def create_sources(self, source_sheet):
             }
 
             for attr, val in access_point_info.items():
-                setattr(access_point, 'attr', val)
+                setattr(access_point, attr, val)
 
             access_point.save()
 
diff --git a/source/models.py b/source/models.py
index 20f81727..8d89e92b 100644
--- a/source/models.py
+++ b/source/models.py
@@ -199,7 +199,7 @@ def __str__(self):
     @property
     def archive_timestamp(self):
        """Given an access point archive_url, parse the timestamp."""
-        match = re.search(r"web\.archive\.org/web/(\d{14})/", self.archive_url)
+        match = re.search(r"web\.archive\.org/web/(\d{14})/", self.archive_url or '')
         if match:
             return match.group(1)
         else:
diff --git a/tests/test_importer.py b/tests/test_importer.py
index b757fade..f5ae360a 100644
--- a/tests/test_importer.py
+++ b/tests/test_importer.py
@@ -142,7 +142,7 @@ def test_source_dates_and_timestamps(data_import, data_folder):
 
             # Re-run the import
             data_import = io.StringIO()
-            call_command('import_google_doc', folder=data_folder, stdout=output)
+            call_command('import_google_doc', folder=data_folder, stdout=data_import)
 
         undated_sources = Source.objects.filter(published_date='', published_timestamp__isnull=True)\
                                         .values_list('accesspoint__uuid', flat=True)
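Note: the one-line source/models.py change is easy to skim past — re.search raises TypeError
when handed None, so coalescing a nullable archive_url with `or ''` turns a crash into a clean
no-match. The same guard in a runnable, model-free sketch:

    import re

    def archive_timestamp(archive_url):
        # `or ''` mirrors the fix above: a missing URL yields no match
        # instead of a TypeError inside re.search
        match = re.search(r"web\.archive\.org/web/(\d{14})/", archive_url or '')
        return match.group(1) if match else None

    assert archive_timestamp('https://web.archive.org/web/20210908095433/https://example.com/') == '20210908095433'
    assert archive_timestamp(None) is None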
From acc7679f72e139c935b405b257f69ec1451abc44 Mon Sep 17 00:00:00 2001
From: hancush
Date: Wed, 8 Sep 2021 11:21:48 -0500
Subject: [PATCH 4/8] Handle case where duplicate sources exist, add cleanup
 flag

---
 .../management/commands/import_google_doc.py | 27 ++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/sfm_pc/management/commands/import_google_doc.py b/sfm_pc/management/commands/import_google_doc.py
index e8e5e640..f7a2bb22 100644
--- a/sfm_pc/management/commands/import_google_doc.py
+++ b/sfm_pc/management/commands/import_google_doc.py
@@ -95,6 +95,11 @@ def add_arguments(self, parser):
             dest='folder',
             help='Path to a folder containing data (for testing)'
         )
+        parser.add_argument(
+            '--cleanup',
+            action='store_true',
+            default=False
+        )
 
     def sourcesList(self, obj, attribute):
         sources = [s for s in getattr(obj, attribute).get_sources()]
@@ -153,6 +158,8 @@ def handle(self, *args, **options):
         except IntegrityError:
             self.user = User.objects.get(username=importer_user['username'])
 
+        self.cleanup = options['cleanup']
+
         # Disconnect post save signals
         self.disconnectSignals()
 
@@ -1443,7 +1450,25 @@ def create_sources(self, source_sheet):
                     message = 'Invalid published_date "{1}" at {2}'.format(prefix, date_val, access_point_uuid)
                     self.log_error(message, sheet='sources', current_row=idx + 2)
 
-            source, created = Source.objects.get_or_create(**source_info)
+            try:
+                source, created = Source.objects.get_or_create(**source_info)
+
+            except Source.MultipleObjectsReturned:
+                sources = Source.objects.filter(**source_info)
+                source = sources.order_by('date_added').first()
+
+                if self.cleanup:
+                    cleanup_summary = sources.exclude(uuid=source.uuid).delete()
+                    self.stdout.write('Cleaned up duplicate sources: {}'.format(cleanup_summary))
+
+                else:
+                    self.log_error(
+                        'Found multiple instances of Source "{}". Recommend cleanup.'.format(source),
+                        sheet='sources',
+                        current_row=idx + 2
+                    )
+
+                created = False
 
             self.stdout.write(
                 '{0} Source "{1}" from row {2}'.format('Created' if created else 'Updated',
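Note: get_or_create treats every keyword as a lookup, so once duplicate rows share a full
signature the get step raises Source.MultipleObjectsReturned. A hedged sketch of the
recover-and-optionally-delete pattern this patch adds, with a hypothetical get_or_dedupe
helper and assuming any Django model with a date_added field:

    def get_or_dedupe(model, cleanup=False, **signature):
        """Hypothetical stand-in for the importer's try/except above."""
        try:
            return model.objects.get_or_create(**signature)
        except model.MultipleObjectsReturned:
            duplicates = model.objects.filter(**signature)
            keeper = duplicates.order_by('date_added').first()  # oldest wins
            if cleanup:
                # .delete() returns a summary tuple, e.g. (2, {'source.Source': 2})
                duplicates.exclude(pk=keeper.pk).delete()
            return keeper, False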
From 82eb1e9e79eb54dc4f21246e060fb71b9556d086 Mon Sep 17 00:00:00 2001
From: hancush
Date: Wed, 8 Sep 2021 14:42:21 -0500
Subject: [PATCH 5/8] Remove auto-cleanup

---
 .../management/commands/import_google_doc.py | 29 +++++++------------
 1 file changed, 11 insertions(+), 18 deletions(-)

diff --git a/sfm_pc/management/commands/import_google_doc.py b/sfm_pc/management/commands/import_google_doc.py
index f7a2bb22..fdeb8017 100644
--- a/sfm_pc/management/commands/import_google_doc.py
+++ b/sfm_pc/management/commands/import_google_doc.py
@@ -95,11 +95,6 @@ def add_arguments(self, parser):
             dest='folder',
             help='Path to a folder containing data (for testing)'
         )
-        parser.add_argument(
-            '--cleanup',
-            action='store_true',
-            default=False
-        )
 
     def sourcesList(self, obj, attribute):
         sources = [s for s in getattr(obj, attribute).get_sources()]
@@ -158,8 +153,6 @@ def handle(self, *args, **options):
         except IntegrityError:
             self.user = User.objects.get(username=importer_user['username'])
 
-        self.cleanup = options['cleanup']
-
         # Disconnect post save signals
         self.disconnectSignals()
 
@@ -1455,21 +1448,21 @@ def create_sources(self, source_sheet):
 
             except Source.MultipleObjectsReturned:
                 sources = Source.objects.filter(**source_info)
-                source = sources.order_by('date_added').first()
-
-                if self.cleanup:
-                    cleanup_summary = sources.exclude(uuid=source.uuid).delete()
-                    self.stdout.write('Cleaned up duplicate sources: {}'.format(cleanup_summary))
 
-                else:
-                    self.log_error(
-                        'Found multiple instances of Source "{}". Recommend cleanup.'.format(source),
-                        sheet='sources',
-                        current_row=idx + 2
-                    )
+                # Get the most recently created source matching the given
+                # signature and having associated access points.
+                source = sources.order_by('-date_added')\
+                                .filter(accesspoint__isnull=False)\
+                                .first()
 
                 created = False
 
+                self.log_error(
+                    'Found multiple instances of Source with signature "{}"'.format(source_info),
+                    sheet='sources',
+                    current_row=idx + 2
+                )
+
             self.stdout.write(
                 '{0} Source "{1}" from row {2}'.format('Created' if created else 'Updated',
                                                        source,
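Note: the replacement lookup filters on the reverse relation — accesspoint__isnull=False keeps
only sources that still have at least one access point — before taking the newest by
date_added. A sketch with a hypothetical helper, assuming AccessPoint carries a ForeignKey to
Source; this kind of reverse join can also emit duplicate rows, which is what the .distinct()
added in the next patch deals with:

    def newest_source_with_access_points(source_queryset):
        """Hypothetical helper mirroring the chained queryset above."""
        return (
            source_queryset
            .filter(accesspoint__isnull=False)  # keep sources with access points
            .order_by('-date_added')            # newest first
            .first()
        )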
From cf9fe1c59459caaa6044de5c857b8a94e140ebf3 Mon Sep 17 00:00:00 2001
From: hancush
Date: Thu, 9 Sep 2021 11:37:01 -0500
Subject: [PATCH 6/8] Log duplicates as info, not error

---
 .../management/commands/import_google_doc.py | 25 ++++++++++---------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/sfm_pc/management/commands/import_google_doc.py b/sfm_pc/management/commands/import_google_doc.py
index fdeb8017..c4109cf8 100644
--- a/sfm_pc/management/commands/import_google_doc.py
+++ b/sfm_pc/management/commands/import_google_doc.py
@@ -1447,21 +1447,22 @@ def create_sources(self, source_sheet):
                 source, created = Source.objects.get_or_create(**source_info)
 
             except Source.MultipleObjectsReturned:
-                sources = Source.objects.filter(**source_info)
+                # Find sources matching the given signature and having
+                # associated access points.
+                sources = Source.objects.filter(**source_info)\
+                    .filter(accesspoint__isnull=False)\
+                    .order_by('-date_added')\
+                    .distinct()
 
-                # Get the most recently created source matching the given
-                # signature and having associated access points.
-                source = sources.order_by('-date_added')\
-                                .filter(accesspoint__isnull=False)\
-                                .first()
+                if sources.count() > 1:
+                    self.stdout.write(self.style.WARNING(
+                        'L{0}: Found multiple instances of Source with signature "{1}"'.format(idx + 2, source_info)
+                    ))
 
-                created = False
+                # Get the most recently created source.
+                source = sources.first()
 
-                self.log_error(
-                    'Found multiple instances of Source with signature "{}"'.format(source_info),
-                    sheet='sources',
-                    current_row=idx + 2
-                )
+                created = False
 
             self.stdout.write(
                 '{0} Source "{1}" from row {2}'.format('Created' if created else 'Updated',
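Note: self.style.WARNING comes from the styling palette every management command gets
(alongside SUCCESS, ERROR, and NOTICE); writing the styled message to self.stdout keeps the
duplicate report visible without routing it into the error CSV the way log_error does. A
minimal, self-contained command showing that reporting style — a sketch, not the importer
itself:

    from django.core.management.base import BaseCommand

    class Command(BaseCommand):
        help = 'Sketch of warning-level reporting in a management command'

        def handle(self, *args, **options):
            # Colorized on capable terminals, plain text when piped to a file
            self.stdout.write(self.style.WARNING('Found multiple instances of Source'))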
From 7c2216f9c963b11d0630ea169da2af3fc2b4566b Mon Sep 17 00:00:00 2001
From: hancush
Date: Tue, 21 Sep 2021 10:45:38 -0500
Subject: [PATCH 7/8] Remove handling for impossible condition

---
 .../management/commands/import_google_doc.py | 27 +++----------------
 1 file changed, 4 insertions(+), 23 deletions(-)

diff --git a/sfm_pc/management/commands/import_google_doc.py b/sfm_pc/management/commands/import_google_doc.py
index c4109cf8..dc2ed22b 100644
--- a/sfm_pc/management/commands/import_google_doc.py
+++ b/sfm_pc/management/commands/import_google_doc.py
@@ -1443,31 +1443,12 @@ def create_sources(self, source_sheet):
                     message = 'Invalid published_date "{1}" at {2}'.format(prefix, date_val, access_point_uuid)
                     self.log_error(message, sheet='sources', current_row=idx + 2)
 
-            try:
-                source, created = Source.objects.get_or_create(**source_info)
-
-            except Source.MultipleObjectsReturned:
-                # Find sources matching the given signature and having
-                # associated access points.
-                sources = Source.objects.filter(**source_info)\
-                    .filter(accesspoint__isnull=False)\
-                    .order_by('-date_added')\
-                    .distinct()
-
-                if sources.count() > 1:
-                    self.stdout.write(self.style.WARNING(
-                        'L{0}: Found multiple instances of Source with signature "{1}"'.format(idx + 2, source_info)
-                    ))
-
-                # Get the most recently created source.
-                source = sources.first()
-
-                created = False
+            source, created = Source.objects.get_or_create(**source_info)
 
             self.stdout.write(
-                '{0} Source "{1}" from row {2}'.format('Created' if created else 'Updated',
-                                                       source,
-                                                       idx + 2)
+                '{0} Source "{1}" from row {2}'.format(
+                    'Created' if created else 'Updated', source, idx + 2
+                )
             )
 
             access_point_info = {

From 7f6cb4812f27ba657b24bf3dc5939878d922b7ff Mon Sep 17 00:00:00 2001
From: hancush
Date: Wed, 22 Sep 2021 09:45:52 -0500
Subject: [PATCH 8/8] Break source date parsing into standalone method

---
 .../management/commands/import_google_doc.py | 40 +++++++++++--------
 1 file changed, 23 insertions(+), 17 deletions(-)

diff --git a/sfm_pc/management/commands/import_google_doc.py b/sfm_pc/management/commands/import_google_doc.py
index dc2ed22b..1e8f0503 100644
--- a/sfm_pc/management/commands/import_google_doc.py
+++ b/sfm_pc/management/commands/import_google_doc.py
@@ -1420,27 +1420,16 @@ def create_sources(self, source_sheet):
             }
 
             for prefix in ('published', 'created', 'uploaded'):
-                date_val = source_data[Source.get_spreadsheet_field_name('{}_date'.format(prefix))]
-                parsed_date = None
+                date_value = source_data[Source.get_spreadsheet_field_name('{}_date'.format(prefix))]
+                parsed_date = self.get_source_date(date_value)
 
-                try:
-                    # Try to parse the value as a timestamp (remove timezone
-                    # marker for Python <3.7)
-                    parsed_date = datetime.strptime(date_val.replace('Z', ''), '%Y-%m-%dT%H:%M:%S')
-
-                except ValueError:
-                    # Fall back to an empty string because we want to use
-                    # source_info to retrieve and update existing Sources, and
-                    # date fields default to an empty string if no data is
-                    # provided
-                    parsed_date = self.parse_date(date_val) or ''
-                    source_info['{}_date'.format(prefix)] = parsed_date
-
-                else:
+                if isinstance(parsed_date, datetime):
                     source_info['{}_timestamp'.format(prefix)] = parsed_date
+                else:
+                    source_info['{}_date'.format(prefix)] = parsed_date
 
                 if not parsed_date and prefix == 'published':
-                    message = 'Invalid published_date "{1}" at {2}'.format(prefix, date_val, access_point_uuid)
+                    message = 'Invalid published_date "{1}" at {2}'.format(prefix, date_value, access_point_uuid)
                     self.log_error(message, sheet='sources', current_row=idx + 2)
 
             source, created = Source.objects.get_or_create(**source_info)
@@ -1465,6 +1454,23 @@ def create_sources(self, source_sheet):
 
             access_point.save()
 
+    def get_source_date(self, date_value):
+        '''
+        Source dates can come to us as full timestamps or dates. Given a string
+        representing one of these values, return a parsed datetime or date
+        object, or an empty string, if neither can be parsed.
+        '''
+        try:
+            # Try to parse the value as a timestamp (remove timezone marker for
+            # Python <3.7)
+            return datetime.strptime(date_value.replace('Z', ''), '%Y-%m-%dT%H:%M:%S')
+
+        except ValueError:
+            # Fall back to an empty string because we want to use this value to
+            # retrieve and update existing Sources, and date fields default to
+            # an empty string if no data is provided
+            return self.parse_date(date_value) or ''
+
     def get_sources(self, source_id_string):
 
         sources = []
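Note: get_source_date can hand back three shapes — a datetime for timestamps, a date from
parse_date, or an empty string — and the caller routes on type, since only a true datetime
belongs in a *_timestamp field. A sketch of that routing with a hypothetical
route_source_date helper, assuming parse_date returns a datetime.date or None:

    from datetime import datetime

    def route_source_date(source_info, prefix, parsed_date):
        # isinstance is False for a plain datetime.date, so dates and
        # empty strings both fall through to the *_date field
        if isinstance(parsed_date, datetime):
            source_info['{}_timestamp'.format(prefix)] = parsed_date
        else:
            source_info['{}_date'.format(prefix)] = parsed_date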