diff --git a/sfm_pc/management/commands/import_google_doc.py b/sfm_pc/management/commands/import_google_doc.py
index e0476d02..1e8f0503 100644
--- a/sfm_pc/management/commands/import_google_doc.py
+++ b/sfm_pc/management/commands/import_google_doc.py
@@ -1394,55 +1394,82 @@ def create_sources(self, source_sheet):
 
         for idx, source_data in enumerate(source_sheet['values']):
             access_point_uuid = source_data['source:access_point_id:admin'].strip()
+
             try:
-                AccessPoint.objects.get(uuid=access_point_uuid)
-            except ValidationError:
+                access_point, _ = AccessPoint.objects.get_or_create(
+                    uuid=access_point_uuid,
+                    user=self.user,
+                )
+
+            except (ValidationError, ValueError):
                 self.log_error(
                     'Invalid source UUID: "{}"'.format(access_point_uuid),
                     sheet='sources',
                     current_row=idx + 2  # Handle 0-index and header row
                 )
-            except AccessPoint.DoesNotExist:
-                source_info = {
-                    'title': source_data[Source.get_spreadsheet_field_name('title')],
-                    'type': source_data[Source.get_spreadsheet_field_name('type')],
-                    'author': source_data[Source.get_spreadsheet_field_name('author')],
-                    'publication': source_data[Source.get_spreadsheet_field_name('publication')],
-                    'publication_country': source_data[Source.get_spreadsheet_field_name('publication_country')],
-                    'source_url': source_data[Source.get_spreadsheet_field_name('source_url')],
-                    'user': self.user
-                }
-                # Figure out if created/uploaded/published dates are timestamps
-                for prefix in ('published', 'created', 'uploaded'):
-                    date_val = source_data[Source.get_spreadsheet_field_name('{}_date'.format(prefix))]
-                    try:
-                        # Try to parse the value as a timestamp (remove timezone
-                        # marker for Pyton <3.7)
-                        parsed_date = datetime.strptime(date_val.replace('Z', ''), '%Y-%m-%dT%H:%M:%S')
-                    except ValueError:
-                        # Value is a date, or empty
-                        parsed_date = self.parse_date(date_val)
-                        source_info['{}_date'.format(prefix)] = parsed_date
-                    else:
-                        source_info['{}_timestamp'.format(prefix)] = parsed_date
+                continue
 
-                    if not parsed_date and prefix == 'published':
-                        message = 'Invalid published_date "{1}" at {2}'.format(prefix, date_val, access_point_uuid)
-                        self.log_error(message, sheet='sources', current_row=idx + 2)
+            source_info = {
+                'title': source_data[Source.get_spreadsheet_field_name('title')],
+                'type': source_data[Source.get_spreadsheet_field_name('type')],
+                'author': source_data[Source.get_spreadsheet_field_name('author')],
+                'publication': source_data[Source.get_spreadsheet_field_name('publication')],
+                'publication_country': source_data[Source.get_spreadsheet_field_name('publication_country')],
+                'source_url': source_data[Source.get_spreadsheet_field_name('source_url')],
+                'user': self.user,
+            }
 
-                new_source, created = Source.objects.get_or_create(**source_info)
+            for prefix in ('published', 'created', 'uploaded'):
+                date_value = source_data[Source.get_spreadsheet_field_name('{}_date'.format(prefix))]
+                parsed_date = self.get_source_date(date_value)
 
-                AccessPoint.objects.create(
-                    uuid=access_point_uuid,
-                    type=source_data[AccessPoint.get_spreadsheet_field_name('type')],
-                    trigger=source_data[AccessPoint.get_spreadsheet_field_name('trigger')],
-                    accessed_on=self.parse_date(source_data[AccessPoint.get_spreadsheet_field_name('accessed_on')]),
-                    archive_url=source_data[AccessPoint.get_spreadsheet_field_name('archive_url')],
-                    source=new_source,
-                    user=self.user
+                if isinstance(parsed_date, datetime):
+                    source_info['{}_timestamp'.format(prefix)] = parsed_date
+                else:
+                    source_info['{}_date'.format(prefix)] = parsed_date
+
+                if not parsed_date and prefix == 'published':
+                    message = 'Invalid published_date "{0}" at {1}'.format(date_value, access_point_uuid)
+                    self.log_error(message, sheet='sources', current_row=idx + 2)
+
+            source, created = Source.objects.get_or_create(**source_info)
+
+            self.stdout.write(
+                '{0} Source "{1}" from row {2}'.format(
+                    'Created' if created else 'Updated', source, idx + 2
                 )
-            except ValueError:
-                self.log_error("Invalid access point at: " + access_point_uuid)
+            )
+
+            access_point_info = {
+                'type': source_data[AccessPoint.get_spreadsheet_field_name('type')],
+                'trigger': source_data[AccessPoint.get_spreadsheet_field_name('trigger')],
+                'accessed_on': self.parse_date(source_data[AccessPoint.get_spreadsheet_field_name('accessed_on')]),
+                'archive_url': source_data[AccessPoint.get_spreadsheet_field_name('archive_url')],
+                'source': source,
+                'user': self.user,
+            }
+
+            for attr, val in access_point_info.items():
+                setattr(access_point, attr, val)
+
+            access_point.save()
+
+    def get_source_date(self, date_value):
+        '''
+        Source dates can come to us as full timestamps or dates. Given a string
+        representing one of these values, return a parsed datetime or date
+        object, or an empty string if neither can be parsed.
+        '''
+        try:
+            # Try to parse the value as a timestamp (remove timezone marker for
+            # Python <3.7)
+            return datetime.strptime(date_value.replace('Z', ''), '%Y-%m-%dT%H:%M:%S')
+
+        except ValueError:
+            # Fall back to an empty string because we want to use this value to
+            # retrieve and update existing Sources, and date fields default to
+            # an empty string if no data is provided
+            return self.parse_date(date_value) or ''
 
     def get_sources(self, source_id_string):
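The new `get_source_date` helper is what keeps `get_or_create` idempotent: full ISO timestamps become `datetime` objects, bare dates go through `parse_date`, and anything unparseable collapses to `''` rather than `None`. A standalone sketch of that contract, with `parse_date` stubbed out since the command's own helper isn't shown in this hunk:

```python
from datetime import date, datetime

def get_source_date(date_value, parse_date=lambda value: None):
    # Stub of the method above; parse_date stands in for the command's own
    # helper, assumed to return a date or None
    try:
        return datetime.strptime(date_value.replace('Z', ''), '%Y-%m-%dT%H:%M:%S')
    except ValueError:
        return parse_date(date_value) or ''

# datetime subclasses date, so the isinstance(parsed_date, datetime) check in
# create_sources routes timestamps to *_timestamp and plain dates to *_date
assert isinstance(get_source_date('2019-05-01T12:30:00Z'), datetime)
assert not isinstance(date(2019, 5, 1), datetime)
assert get_source_date('not a date') == ''  # '' matches the model's blank default
```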
diff --git a/source/models.py b/source/models.py
index 20f81727..8d89e92b 100644
--- a/source/models.py
+++ b/source/models.py
@@ -199,7 +199,7 @@ def __str__(self):
     @property
    def archive_timestamp(self):
         """Given an access point archive_url, parse the timestamp."""
-        match = re.search(r"web\.archive\.org/web/(\d{14})/", self.archive_url)
+        match = re.search(r"web\.archive\.org/web/(\d{14})/", self.archive_url or '')
         if match:
             return match.group(1)
         else:
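The `or ''` guard matters because `re.search` raises `TypeError` when given `None`, and now that access points are created before their fields are populated, `archive_url` can legitimately be empty. A minimal standalone check of the property's regex; the sample URL and the `None` fallthrough are illustrative assumptions, since the property's else-branch is elided from this hunk:

```python
import re

def archive_timestamp(archive_url):
    # Standalone copy of the property above; assumes the elided else-branch
    # returns None
    match = re.search(r"web\.archive\.org/web/(\d{14})/", archive_url or '')
    return match.group(1) if match else None

assert archive_timestamp('https://web.archive.org/web/20190501123000/https://example.com') == '20190501123000'
assert archive_timestamp(None) is None  # previously a TypeError
```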
diff --git a/tests/test_importer.py b/tests/test_importer.py
index 59dc21e4..f5ae360a 100644
--- a/tests/test_importer.py
+++ b/tests/test_importer.py
@@ -112,11 +112,12 @@ def test_sources(data_import, data_folder):
 
 
 @pytest.mark.django_db
-def test_source_dates_and_timestamps(data_import):
+def test_source_dates_and_timestamps(data_import, data_folder):
     """Make sure Source date fields properly parse dates and timestamps."""
     timestamp_src = Source.objects.get(title='Source Timestamps')
     date_src = Source.objects.get(title='Source Dates')
     date_and_timestamp_prefixes = ('created', 'published', 'uploaded')
+
     for prefix in date_and_timestamp_prefixes:
         date_field = '{}_date'.format(prefix)
         timestamp_field = '{}_timestamp'.format(prefix)
@@ -132,27 +133,39 @@ def test_source_dates_and_timestamps(data_import):
         'sources-errors.csv'
     )
 
-    undated_sources = Source.objects.filter(published_date='', published_timestamp__isnull=True)\
-        .values_list('accesspoint__uuid', flat=True)
+    # Test that source errors are reported whether it's the first or 101st time
+    # we're seeing them.
+    for run_number in range(1, 3):
+        if run_number == 2:
+            # Remove the error file from the first run
+            os.remove(error_file)
+
+            # Re-run the import
+            data_import = io.StringIO()
+            call_command('import_google_doc', folder=data_folder, stdout=data_import)
+
+        undated_sources = Source.objects.filter(published_date='', published_timestamp__isnull=True)\
+            .values_list('accesspoint__uuid', flat=True)
+
+        undated_source_set = set(str(uuid) for uuid in undated_sources)
 
-    undated_source_set = set(str(uuid) for uuid in undated_sources)
+        error_source_set = set()
 
-    error_source_set = set()
+        with open(error_file, 'r') as f:
+            reader = csv.reader(f)
 
-    with open(error_file, 'r') as f:
-        reader = csv.reader(f)
+            next(reader)  # discard header
 
-        next(reader)  # discard header
+            for record in reader:
+                _, message = record
+                assert message.startswith('Invalid published_date')
 
-        for record in reader:
-            _, message = record
-            assert message.startswith('Invalid published_date')
+                source_id = message.split()[-1]
+                assert source_id in undated_source_set
+                error_source_set.add(source_id)
 
-            source_id = message.split()[-1]
-            assert source_id in undated_source_set
-            error_source_set.add(source_id)
+        assert undated_source_set == error_source_set
 
-    assert undated_source_set == error_source_set
 
 @pytest.mark.django_db
 def test_incidents(data_import):
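Taken together, the changes make the import safe to re-run: sources are matched with `get_or_create(**source_info)` (hence the `''` fallback for unparseable dates), and access points are fetched by UUID, overwritten field by field, and saved. A pure-Python sketch of that fetch-then-overwrite pattern, with a toy registry standing in for the ORM; names here are illustrative, not the project's API:

```python
class Record:
    """Stand-in for a model instance; save() is a no-op here."""
    def __init__(self, uuid):
        self.uuid = uuid

    def save(self):
        pass

def upsert(registry, uuid, **fields):
    # Fetch-or-create by natural key, then overwrite the mutable fields:
    # the same shape as the access point handling in create_sources
    record = registry.setdefault(uuid, Record(uuid))
    for attr, val in fields.items():
        setattr(record, attr, val)
    record.save()
    return record

registry = {}
first = upsert(registry, 'abc-123', archive_url='https://example.com')
second = upsert(registry, 'abc-123', archive_url='https://example.org')
assert first is second  # re-running updates in place rather than duplicating
assert second.archive_url == 'https://example.org'
```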