Update existing sources/access points on import, bulletproof publication date validation #776

Merged
merged 9 commits on Sep 22, 2021
99 changes: 60 additions & 39 deletions sfm_pc/management/commands/import_google_doc.py
@@ -1394,55 +1394,76 @@ def create_sources(self, source_sheet):

         for idx, source_data in enumerate(source_sheet['values']):
             access_point_uuid = source_data['source:access_point_id:admin'].strip()
 
             try:
-                AccessPoint.objects.get(uuid=access_point_uuid)
+                access_point, _ = AccessPoint.objects.get_or_create(
+                    uuid=access_point_uuid,
+                    user=self.user,
+                )
 
-            except ValidationError:
+            except (ValidationError, ValueError):
                 self.log_error(
                     'Invalid source UUID: "{}"'.format(access_point_uuid),
                     sheet='sources',
                     current_row=idx + 2  # Handle 0-index and header row
                 )
-            except AccessPoint.DoesNotExist:
-                source_info = {
-                    'title': source_data[Source.get_spreadsheet_field_name('title')],
-                    'type': source_data[Source.get_spreadsheet_field_name('type')],
-                    'author': source_data[Source.get_spreadsheet_field_name('author')],
-                    'publication': source_data[Source.get_spreadsheet_field_name('publication')],
-                    'publication_country': source_data[Source.get_spreadsheet_field_name('publication_country')],
-                    'source_url': source_data[Source.get_spreadsheet_field_name('source_url')],
-                    'user': self.user
-                }
-                # Figure out if created/uploaded/published dates are timestamps
-                for prefix in ('published', 'created', 'uploaded'):
-                    date_val = source_data[Source.get_spreadsheet_field_name('{}_date'.format(prefix))]
-                    try:
-                        # Try to parse the value as a timestamp (remove timezone
-                        # marker for Python <3.7)
-                        parsed_date = datetime.strptime(date_val.replace('Z', ''), '%Y-%m-%dT%H:%M:%S')
-                    except ValueError:
-                        # Value is a date, or empty
-                        parsed_date = self.parse_date(date_val)
-                        source_info['{}_date'.format(prefix)] = parsed_date
-                    else:
-                        source_info['{}_timestamp'.format(prefix)] = parsed_date
-                        continue
-
-                    if not parsed_date and prefix == 'published':
-                        message = 'Invalid published_date "{0}" at {1}'.format(date_val, access_point_uuid)
-                        self.log_error(message, sheet='sources', current_row=idx + 2)
-                new_source, created = Source.objects.get_or_create(**source_info)
-                AccessPoint.objects.create(
-                    uuid=access_point_uuid,
-                    type=source_data[AccessPoint.get_spreadsheet_field_name('type')],
-                    trigger=source_data[AccessPoint.get_spreadsheet_field_name('trigger')],
-                    accessed_on=self.parse_date(source_data[AccessPoint.get_spreadsheet_field_name('accessed_on')]),
-                    archive_url=source_data[AccessPoint.get_spreadsheet_field_name('archive_url')],
-                    source=new_source,
-                    user=self.user
-                )
-            except ValueError:
-                self.log_error("Invalid access point at: " + access_point_uuid)
+            source_info = {
+                'title': source_data[Source.get_spreadsheet_field_name('title')],
+                'type': source_data[Source.get_spreadsheet_field_name('type')],
+                'author': source_data[Source.get_spreadsheet_field_name('author')],
+                'publication': source_data[Source.get_spreadsheet_field_name('publication')],
+                'publication_country': source_data[Source.get_spreadsheet_field_name('publication_country')],
+                'source_url': source_data[Source.get_spreadsheet_field_name('source_url')],
+                'user': self.user,
+            }
+
+            for prefix in ('published', 'created', 'uploaded'):
Collaborator (@fgregg):

this timestamp handling would be good to factor out to a separate function/method.

Author:

Good thought, @fgregg. Revised in 7f6cb48. Separating that concern makes the loop more coherent, IMO.
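
For readers following along, the factored-out helper might take roughly this shape (a hypothetical sketch assuming the command's existing self.parse_date method and datetime import; see 7f6cb48 for the actual revision):

def parse_date_or_timestamp(self, date_val):
    """Parse date_val as a timestamp, falling back to a date or ''.

    Returns a (field_suffix, value) pair so the caller knows whether to
    populate the _timestamp or the _date field. Hypothetical sketch only.
    """
    try:
        # Timestamp values look like '2021-09-22T13:45:00Z'; strip the
        # timezone marker so Python <3.7 can parse them
        return 'timestamp', datetime.strptime(date_val.replace('Z', ''), '%Y-%m-%dT%H:%M:%S')
    except ValueError:
        # Fall back to a plain date, or '' so existing Sources can still
        # be matched on their empty date fields
        return 'date', self.parse_date(date_val) or ''

The loop body would then reduce to suffix, parsed = self.parse_date_or_timestamp(date_val) plus a single source_info['{}_{}'.format(prefix, suffix)] = parsed assignment, with the published-date error check left in place.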

+                date_val = source_data[Source.get_spreadsheet_field_name('{}_date'.format(prefix))]
+                parsed_date = None
+
+                try:
+                    # Try to parse the value as a timestamp (remove timezone
+                    # marker for Python <3.7)
+                    parsed_date = datetime.strptime(date_val.replace('Z', ''), '%Y-%m-%dT%H:%M:%S')
+
+                except ValueError:
+                    # Fall back to an empty string because we want to use
+                    # source_info to retrieve and update existing Sources, and
+                    # date fields default to an empty string if no data is
+                    # provided
+                    parsed_date = self.parse_date(date_val) or ''
+                    source_info['{}_date'.format(prefix)] = parsed_date
+
+                else:
+                    source_info['{}_timestamp'.format(prefix)] = parsed_date
+
+                if not parsed_date and prefix == 'published':
+                    message = 'Invalid published_date "{0}" at {1}'.format(date_val, access_point_uuid)
+                    self.log_error(message, sheet='sources', current_row=idx + 2)
+
+            source, created = Source.objects.get_or_create(**source_info)
+
+            self.stdout.write(
+                '{0} Source "{1}" from row {2}'.format(
+                    'Created' if created else 'Updated', source, idx + 2
+                )
+            )
+
+            access_point_info = {
+                'type': source_data[AccessPoint.get_spreadsheet_field_name('type')],
+                'trigger': source_data[AccessPoint.get_spreadsheet_field_name('trigger')],
+                'accessed_on': self.parse_date(source_data[AccessPoint.get_spreadsheet_field_name('accessed_on')]),
+                'archive_url': source_data[AccessPoint.get_spreadsheet_field_name('archive_url')],
+                'source': source,
+                'user': self.user,
+            }
+
+            for attr, val in access_point_info.items():
+                setattr(access_point, attr, val)
+
+            access_point.save()
 
     def get_sources(self, source_id_string):
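
A note on the update semantics above: Source.objects.get_or_create(**source_info) uses every key in source_info as a lookup parameter, so an existing Source is reused only when all of those fields, dates included, match the spreadsheet exactly. That is why the date parsing falls back to '' rather than None: as the inline comment says, date fields default to an empty string when no data is provided, and a None would prevent re-matching those records.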
2 changes: 1 addition & 1 deletion source/models.py
@@ -199,7 +199,7 @@ def __str__(self):
     @property
     def archive_timestamp(self):
         """Given an access point archive_url, parse the timestamp."""
-        match = re.search(r"web\.archive\.org/web/(\d{14})/", self.archive_url)
+        match = re.search(r"web\.archive\.org/web/(\d{14})/", self.archive_url or '')
         if match:
             return match.group(1)
         else:
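(The "or ''" fallback matters because an access point may have no archive_url; re.search() raises a TypeError if it is given None instead of a string.)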
43 changes: 28 additions & 15 deletions tests/test_importer.py
@@ -112,11 +112,12 @@ def test_sources(data_import, data_folder):


 @pytest.mark.django_db
-def test_source_dates_and_timestamps(data_import):
+def test_source_dates_and_timestamps(data_import, data_folder):
     """Make sure Source date fields properly parse dates and timestamps."""
     timestamp_src = Source.objects.get(title='Source Timestamps')
     date_src = Source.objects.get(title='Source Dates')
     date_and_timestamp_prefixes = ('created', 'published', 'uploaded')
+
     for prefix in date_and_timestamp_prefixes:
         date_field = '{}_date'.format(prefix)
         timestamp_field = '{}_timestamp'.format(prefix)
@@ -132,27 +133,39 @@ def test_source_dates_and_timestamps(data_import):
         'sources-errors.csv'
     )
 
+    # Test that source errors are reported whether it's the first or 101st time
+    # we're seeing them.
+    for run_number in range(1, 3):
+        if run_number == 2:
+            # Remove the error file from the first run
+            os.remove(error_file)
+
+            # Re-run the import
+            data_import = io.StringIO()
+            call_command('import_google_doc', folder=data_folder, stdout=data_import)
+
-    undated_sources = Source.objects.filter(published_date='', published_timestamp__isnull=True)\
-                            .values_list('accesspoint__uuid', flat=True)
+        undated_sources = Source.objects.filter(published_date='', published_timestamp__isnull=True)\
+                                .values_list('accesspoint__uuid', flat=True)
 
-    undated_source_set = set(str(uuid) for uuid in undated_sources)
+        undated_source_set = set(str(uuid) for uuid in undated_sources)
 
-    error_source_set = set()
+        error_source_set = set()
 
-    with open(error_file, 'r') as f:
-        reader = csv.reader(f)
+        with open(error_file, 'r') as f:
+            reader = csv.reader(f)
 
-        next(reader)
+            next(reader)  # discard header
 
-        for record in reader:
-            _, message = record
-            assert message.startswith('Invalid published_date')
+            for record in reader:
+                _, message = record
+                assert message.startswith('Invalid published_date')
 
-            source_id = message.split()[-1]
-            assert source_id in undated_source_set
-            error_source_set.add(source_id)
+                source_id = message.split()[-1]
+                assert source_id in undated_source_set
+                error_source_set.add(source_id)
 
-    assert undated_source_set == error_source_set
+        assert undated_source_set == error_source_set
 
 
 @pytest.mark.django_db
 def test_incidents(data_import):
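Because the import runs a second time over the same sheet, this test should also exercise the PR's update path: the second pass retrieves and updates the Sources and access points created on the first pass rather than creating duplicates, while still reporting the same published-date errors.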