
Merge pull request #776 from security-force-monitor/hcg/publication-date
Update existing sources/access points on import, bulletproof publication date validation
hancush authored Sep 22, 2021
2 parents d0f838c + 7f6cb48 commit 071617d
Showing 3 changed files with 95 additions and 55 deletions.
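
In short, the importer now upserts: it fetches or creates each access point and source, then overwrites the record's fields from the spreadsheet row, so re-importing a corrected spreadsheet fixes previously loaded data. A minimal sketch of that pattern, using an in-memory dict in place of the Django ORM (the store, helper, and field names here are illustrative, not the project's API):

```python
records = {}  # uuid -> dict of fields; stands in for the database table

def get_or_create(uuid):
    """Return (record, created), mimicking Django's get_or_create."""
    if uuid in records:
        return records[uuid], False
    records[uuid] = {'uuid': uuid}
    return records[uuid], True

def import_row(row):
    access_point, created = get_or_create(row['uuid'])
    # Overwrite every mutable field on new AND existing records, so
    # re-importing a corrected spreadsheet fixes previously loaded data
    for field in ('type', 'trigger', 'archive_url'):
        access_point[field] = row.get(field, '')
    return created

created = import_row({'uuid': 'abc-123', 'type': 'Book'})
print('Created' if created else 'Updated')         # Created
import_row({'uuid': 'abc-123', 'type': 'Article'})
print(records['abc-123']['type'])                  # Article: updated in place
```

The same idea appears in the diff below as AccessPoint.objects.get_or_create(...) followed by setattr(...) and save().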
105 changes: 66 additions & 39 deletions sfm_pc/management/commands/import_google_doc.py
@@ -1394,55 +1394,82 @@ def create_sources(self, source_sheet):

for idx, source_data in enumerate(source_sheet['values']):
access_point_uuid = source_data['source:access_point_id:admin'].strip()

try:
-                AccessPoint.objects.get(uuid=access_point_uuid)
-            except ValidationError:
+                access_point, _ = AccessPoint.objects.get_or_create(
+                    uuid=access_point_uuid,
+                    user=self.user,
+                )
+
+            except (ValidationError, ValueError):
self.log_error(
'Invalid source UUID: "{}"'.format(access_point_uuid),
sheet='sources',
current_row=idx + 2 # Handle 0-index and header row
)
-            except AccessPoint.DoesNotExist:
-                source_info = {
-                    'title': source_data[Source.get_spreadsheet_field_name('title')],
-                    'type': source_data[Source.get_spreadsheet_field_name('type')],
-                    'author': source_data[Source.get_spreadsheet_field_name('author')],
-                    'publication': source_data[Source.get_spreadsheet_field_name('publication')],
-                    'publication_country': source_data[Source.get_spreadsheet_field_name('publication_country')],
-                    'source_url': source_data[Source.get_spreadsheet_field_name('source_url')],
-                    'user': self.user
-                }
-                # Figure out if created/uploaded/published dates are timestamps
-                for prefix in ('published', 'created', 'uploaded'):
-                    date_val = source_data[Source.get_spreadsheet_field_name('{}_date'.format(prefix))]
-                    try:
-                        # Try to parse the value as a timestamp (remove timezone
-                        # marker for Python <3.7)
-                        parsed_date = datetime.strptime(date_val.replace('Z', ''), '%Y-%m-%dT%H:%M:%S')
-                    except ValueError:
-                        # Value is a date, or empty
-                        parsed_date = self.parse_date(date_val)
-                        source_info['{}_date'.format(prefix)] = parsed_date
-                    else:
-                        source_info['{}_timestamp'.format(prefix)] = parsed_date
-                        continue
-
-                    if not parsed_date and prefix == 'published':
-                        message = 'Invalid published_date "{1}" at {2}'.format(prefix, date_val, access_point_uuid)
-                        self.log_error(message, sheet='sources', current_row=idx + 2)
-
-                new_source, created = Source.objects.get_or_create(**source_info)
-
-                AccessPoint.objects.create(
-                    uuid=access_point_uuid,
-                    type=source_data[AccessPoint.get_spreadsheet_field_name('type')],
-                    trigger=source_data[AccessPoint.get_spreadsheet_field_name('trigger')],
-                    accessed_on=self.parse_date(source_data[AccessPoint.get_spreadsheet_field_name('accessed_on')]),
-                    archive_url=source_data[AccessPoint.get_spreadsheet_field_name('archive_url')],
-                    source=new_source,
-                    user=self.user
-                )
-            except ValueError:
-                self.log_error("Invalid access point at: " + access_point_uuid)
+
+            source_info = {
+                'title': source_data[Source.get_spreadsheet_field_name('title')],
+                'type': source_data[Source.get_spreadsheet_field_name('type')],
+                'author': source_data[Source.get_spreadsheet_field_name('author')],
+                'publication': source_data[Source.get_spreadsheet_field_name('publication')],
+                'publication_country': source_data[Source.get_spreadsheet_field_name('publication_country')],
+                'source_url': source_data[Source.get_spreadsheet_field_name('source_url')],
+                'user': self.user,
+            }
+
+            for prefix in ('published', 'created', 'uploaded'):
+                date_value = source_data[Source.get_spreadsheet_field_name('{}_date'.format(prefix))]
+                parsed_date = self.get_source_date(date_value)
+
+                if isinstance(parsed_date, datetime):
+                    source_info['{}_timestamp'.format(prefix)] = parsed_date
+                else:
+                    source_info['{}_date'.format(prefix)] = parsed_date
+
+                if not parsed_date and prefix == 'published':
+                    message = 'Invalid published_date "{1}" at {2}'.format(prefix, date_value, access_point_uuid)
+                    self.log_error(message, sheet='sources', current_row=idx + 2)
+
+            source, created = Source.objects.get_or_create(**source_info)
+
+            self.stdout.write(
+                '{0} Source "{1}" from row {2}'.format(
+                    'Created' if created else 'Updated', source, idx + 2
+                )
+            )

+            access_point_info = {
+                'type': source_data[AccessPoint.get_spreadsheet_field_name('type')],
+                'trigger': source_data[AccessPoint.get_spreadsheet_field_name('trigger')],
+                'accessed_on': self.parse_date(source_data[AccessPoint.get_spreadsheet_field_name('accessed_on')]),
+                'archive_url': source_data[AccessPoint.get_spreadsheet_field_name('archive_url')],
+                'source': source,
+                'user': self.user,
+            }
+
+            for attr, val in access_point_info.items():
+                setattr(access_point, attr, val)
+
+            access_point.save()
+
+    def get_source_date(self, date_value):
+        '''
+        Source dates can come to us as full timestamps or dates. Given a string
+        representing one of these values, return a parsed datetime or date
+        object, or an empty string if neither can be parsed.
+        '''
+        try:
+            # Try to parse the value as a timestamp (remove timezone marker for
+            # Python <3.7)
+            return datetime.strptime(date_value.replace('Z', ''), '%Y-%m-%dT%H:%M:%S')
+
+        except ValueError:
+            # Fall back to an empty string because we want to use this value to
+            # retrieve and update existing Sources, and date fields default to
+            # an empty string if no data is provided
+            return self.parse_date(date_value) or ''

def get_sources(self, source_id_string):

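The new get_source_date helper can be exercised in isolation. A standalone sketch of its timestamp-then-date fallback, where parse_date is a simplified stand-in for the importer's own date helper (an assumption for illustration, not the project's implementation):

```python
from datetime import datetime

def parse_date(value):
    """Simplified stand-in for the importer's date helper."""
    try:
        return datetime.strptime(value, '%Y-%m-%d').date()
    except ValueError:
        return None

def get_source_date(date_value):
    try:
        # Full ISO timestamp; strip the 'Z' because %z can't parse it
        # on Python < 3.7
        return datetime.strptime(date_value.replace('Z', ''), '%Y-%m-%dT%H:%M:%S')
    except ValueError:
        # Fall back to a plain date, or '' so empty values still match
        # date fields that default to an empty string
        return parse_date(date_value) or ''

print(get_source_date('2021-09-22T14:30:00Z'))  # 2021-09-22 14:30:00 (a datetime)
print(get_source_date('2021-09-22'))            # 2021-09-22 (a date)
print(repr(get_source_date('not a date')))      # ''
```

The caller then routes the result by type: datetime values land in the {prefix}_timestamp field, while date objects and the empty-string fallback land in {prefix}_date, which keeps the get_or_create lookup stable for sources with no usable date.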
2 changes: 1 addition & 1 deletion source/models.py
@@ -199,7 +199,7 @@ def __str__(self):
@property
def archive_timestamp(self):
"""Given an access point archive_url, parse the timestamp."""
-        match = re.search(r"web\.archive\.org/web/(\d{14})/", self.archive_url)
+        match = re.search(r"web\.archive\.org/web/(\d{14})/", self.archive_url or '')
if match:
return match.group(1)
else:
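The one-line models.py change is a null guard: re.search raises TypeError when its subject is None, so access points without an archive_url would crash the archive_timestamp property, whereas `self.archive_url or ''` simply fails to match. A standalone illustration (returning None on no match is an assumption here, since the tail of the property is truncated above):

```python
import re

def archive_timestamp(archive_url):
    """Given an access point archive_url, parse the Wayback Machine timestamp."""
    match = re.search(r"web\.archive\.org/web/(\d{14})/", archive_url or '')
    if match:
        return match.group(1)
    return None

print(archive_timestamp('https://web.archive.org/web/20210922143000/https://example.com'))
# 20210922143000
print(archive_timestamp(None))  # None instead of a TypeError
```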
43 changes: 28 additions & 15 deletions tests/test_importer.py
@@ -112,11 +112,12 @@ def test_sources(data_import, data_folder):


@pytest.mark.django_db
-def test_source_dates_and_timestamps(data_import):
+def test_source_dates_and_timestamps(data_import, data_folder):
"""Make sure Source date fields properly parse dates and timestamps."""
timestamp_src = Source.objects.get(title='Source Timestamps')
date_src = Source.objects.get(title='Source Dates')
date_and_timestamp_prefixes = ('created', 'published', 'uploaded')

for prefix in date_and_timestamp_prefixes:
date_field = '{}_date'.format(prefix)
timestamp_field = '{}_timestamp'.format(prefix)
@@ -132,27 +133,39 @@ def test_source_dates_and_timestamps(data_import):
'sources-errors.csv'
)

-    undated_sources = Source.objects.filter(published_date='', published_timestamp__isnull=True)\
-                            .values_list('accesspoint__uuid', flat=True)
+    # Test that source errors are reported whether it's the first or 101st time
+    # we're seeing them.
+    for run_number in range(1, 3):
+        if run_number == 2:
+            # Remove the error file from the first run
+            os.remove(error_file)
+
+            # Re-run the import
+            data_import = io.StringIO()
+            call_command('import_google_doc', folder=data_folder, stdout=data_import)
+
+        undated_sources = Source.objects.filter(published_date='', published_timestamp__isnull=True)\
+                                .values_list('accesspoint__uuid', flat=True)

-    undated_source_set = set(str(uuid) for uuid in undated_sources)
+        undated_source_set = set(str(uuid) for uuid in undated_sources)

-    error_source_set = set()
+        error_source_set = set()

-    with open(error_file, 'r') as f:
-        reader = csv.reader(f)
+        with open(error_file, 'r') as f:
+            reader = csv.reader(f)

-        next(reader)
+            next(reader)  # discard header

-        for record in reader:
-            _, message = record
-            assert message.startswith('Invalid published_date')
+            for record in reader:
+                _, message = record
+                assert message.startswith('Invalid published_date')

-            source_id = message.split()[-1]
-            assert source_id in undated_source_set
-            error_source_set.add(source_id)
+                source_id = message.split()[-1]
+                assert source_id in undated_source_set
+                error_source_set.add(source_id)

-    assert undated_source_set == error_source_set
+        assert undated_source_set == error_source_set

@pytest.mark.django_db
def test_incidents(data_import):
