Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update existing sources/access points on import, bulletproof publication date validation #776

Merged
merged 9 commits into from
Sep 22, 2021
105 changes: 66 additions & 39 deletions sfm_pc/management/commands/import_google_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -1394,55 +1394,82 @@ def create_sources(self, source_sheet):

for idx, source_data in enumerate(source_sheet['values']):
access_point_uuid = source_data['source:access_point_id:admin'].strip()

try:
AccessPoint.objects.get(uuid=access_point_uuid)
except ValidationError:
access_point, _ = AccessPoint.objects.get_or_create(
uuid=access_point_uuid,
user=self.user,
)

except (ValidationError, ValueError):
self.log_error(
'Invalid source UUID: "{}"'.format(access_point_uuid),
sheet='sources',
current_row=idx + 2 # Handle 0-index and header row
)
except AccessPoint.DoesNotExist:
source_info = {
'title': source_data[Source.get_spreadsheet_field_name('title')],
'type': source_data[Source.get_spreadsheet_field_name('type')],
'author': source_data[Source.get_spreadsheet_field_name('author')],
'publication': source_data[Source.get_spreadsheet_field_name('publication')],
'publication_country': source_data[Source.get_spreadsheet_field_name('publication_country')],
'source_url': source_data[Source.get_spreadsheet_field_name('source_url')],
'user': self.user
}
# Figure out if created/uploaded/published dates are timestamps
for prefix in ('published', 'created', 'uploaded'):
date_val = source_data[Source.get_spreadsheet_field_name('{}_date'.format(prefix))]
try:
# Try to parse the value as a timestamp (remove timezone
# marker for Pyton <3.7)
parsed_date = datetime.strptime(date_val.replace('Z', ''), '%Y-%m-%dT%H:%M:%S')
except ValueError:
# Value is a date, or empty
parsed_date = self.parse_date(date_val)
source_info['{}_date'.format(prefix)] = parsed_date
else:
source_info['{}_timestamp'.format(prefix)] = parsed_date
continue

if not parsed_date and prefix == 'published':
message = 'Invalid published_date "{1}" at {2}'.format(prefix, date_val, access_point_uuid)
self.log_error(message, sheet='sources', current_row=idx + 2)
source_info = {
'title': source_data[Source.get_spreadsheet_field_name('title')],
'type': source_data[Source.get_spreadsheet_field_name('type')],
'author': source_data[Source.get_spreadsheet_field_name('author')],
'publication': source_data[Source.get_spreadsheet_field_name('publication')],
'publication_country': source_data[Source.get_spreadsheet_field_name('publication_country')],
'source_url': source_data[Source.get_spreadsheet_field_name('source_url')],
'user': self.user,
}

new_source, created = Source.objects.get_or_create(**source_info)
for prefix in ('published', 'created', 'uploaded'):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this timestampe handling would be good to factor out to a separate function/method.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good thought, @fgregg. Revised in 7f6cb48. Separating that concern makes the loop more coherent, IMO.

date_value = source_data[Source.get_spreadsheet_field_name('{}_date'.format(prefix))]
parsed_date = self.get_source_date(date_value)

AccessPoint.objects.create(
uuid=access_point_uuid,
type=source_data[AccessPoint.get_spreadsheet_field_name('type')],
trigger=source_data[AccessPoint.get_spreadsheet_field_name('trigger')],
accessed_on=self.parse_date(source_data[AccessPoint.get_spreadsheet_field_name('accessed_on')]),
archive_url=source_data[AccessPoint.get_spreadsheet_field_name('archive_url')],
source=new_source,
user=self.user
if isinstance(parsed_date, datetime):
source_info['{}_timestamp'.format(prefix)] = parsed_date
else:
source_info['{}_date'.format(prefix)] = parsed_date

if not parsed_date and prefix == 'published':
message = 'Invalid published_date "{1}" at {2}'.format(prefix, date_value, access_point_uuid)
self.log_error(message, sheet='sources', current_row=idx + 2)

source, created = Source.objects.get_or_create(**source_info)

self.stdout.write(
'{0} Source "{1}" from row {2}'.format(
'Created' if created else 'Updated', source, idx + 2
)
except ValueError:
self.log_error("Invalid access point at: " + access_point_uuid)
)

access_point_info = {
'type': source_data[AccessPoint.get_spreadsheet_field_name('type')],
'trigger': source_data[AccessPoint.get_spreadsheet_field_name('trigger')],
'accessed_on': self.parse_date(source_data[AccessPoint.get_spreadsheet_field_name('accessed_on')]),
'archive_url': source_data[AccessPoint.get_spreadsheet_field_name('archive_url')],
'source': source,
'user': self.user,
}

for attr, val in access_point_info.items():
setattr(access_point, attr, val)

access_point.save()

def get_source_date(self, date_value):
'''
Source dates can come to us as full timestamps or dates. Given a string
representing one of these values, return a parsed datetime or date
object, or an empty string, if neither can be parsed.
'''
try:
# Try to parse the value as a timestamp (remove timezone marker for
# Python <3.7)
return datetime.strptime(date_value.replace('Z', ''), '%Y-%m-%dT%H:%M:%S')

except ValueError:
# Fall back to an empty string because we want to use this value to
# retrieve and update existing Sources, and date fields default to
# an empty string if no data is provided
return self.parse_date(date_value) or ''

def get_sources(self, source_id_string):

Expand Down
2 changes: 1 addition & 1 deletion source/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ def __str__(self):
@property
def archive_timestamp(self):
"""Given an access point archive_url, parse the timestamp."""
match = re.search(r"web\.archive\.org/web/(\d{14})/", self.archive_url)
match = re.search(r"web\.archive\.org/web/(\d{14})/", self.archive_url or '')
if match:
return match.group(1)
else:
Expand Down
43 changes: 28 additions & 15 deletions tests/test_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,11 +112,12 @@ def test_sources(data_import, data_folder):


@pytest.mark.django_db
def test_source_dates_and_timestamps(data_import):
def test_source_dates_and_timestamps(data_import, data_folder):
"""Make sure Source date fields properly parse dates and timestamps."""
timestamp_src = Source.objects.get(title='Source Timestamps')
date_src = Source.objects.get(title='Source Dates')
date_and_timestamp_prefixes = ('created', 'published', 'uploaded')

for prefix in date_and_timestamp_prefixes:
date_field = '{}_date'.format(prefix)
timestamp_field = '{}_timestamp'.format(prefix)
Expand All @@ -132,27 +133,39 @@ def test_source_dates_and_timestamps(data_import):
'sources-errors.csv'
)

undated_sources = Source.objects.filter(published_date='', published_timestamp__isnull=True)\
.values_list('accesspoint__uuid', flat=True)
# Test that source errors are reported whether it's the first or 101st time
# we're seeing them.
for run_number in range(1, 3):
if run_number == 2:
# Remove the error file from the first run
os.remove(error_file)

# Re-run the import
data_import = io.StringIO()
call_command('import_google_doc', folder=data_folder, stdout=data_import)

undated_sources = Source.objects.filter(published_date='', published_timestamp__isnull=True)\
.values_list('accesspoint__uuid', flat=True)

undated_source_set = set(str(uuid) for uuid in undated_sources)

undated_source_set = set(str(uuid) for uuid in undated_sources)
error_source_set = set()

error_source_set = set()
with open(error_file, 'r') as f:
reader = csv.reader(f)

with open(error_file, 'r') as f:
reader = csv.reader(f)
next(reader)

next(reader) # discard header
for record in reader:
_, message = record
assert message.startswith('Invalid published_date')

for record in reader:
_, message = record
assert message.startswith('Invalid published_date')
source_id = message.split()[-1]
assert source_id in undated_source_set
error_source_set.add(source_id)

source_id = message.split()[-1]
assert source_id in undated_source_set
error_source_set.add(source_id)
assert undated_source_set == error_source_set

assert undated_source_set == error_source_set

@pytest.mark.django_db
def test_incidents(data_import):
Expand Down