From aaeb517e50c328db009ab254d5fc1ada0fc02d87 Mon Sep 17 00:00:00 2001
From: Samuel Veiga Rangel
 <82840278+samuelveigarangel@users.noreply.github.com>
Date: Fri, 13 Dec 2024 11:10:03 -0300
Subject: [PATCH] Cria processamento para normalizar e-mail em
 ResearcherIdentifier (#905)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Adiciona index id em article

* Realiza otimização no filtro de coleção em article

* Adiciona função para extrair email normalizado

* Modifica query Article em ArticleSummaryItem

* Cria task para normalizar email em ResearcherIdentifier

* Cria teste NormalizeEmailResearcherIdentifierTest

* migration

* fix indentation

* Muda nome para extracts_normalized_email

* Muda função para receber parâmetros mais genéricos

* Altera atribuição de parâmetro para extracts_normalized_email

* Utiliza extracts_normalized_email em normalize_stored_email

* Adiciona docs

* Move para o core

* Insere nova importação de extracts_normalized_email

* Adiciona mais um exemplo em docs
---
 .../0015_article_article_art_id_49c380_idx.py | 26 ++++++++++
 article/models.py                             |  5 ++
 article/sources/xmlsps.py                     |  4 +-
 article/tasks.py                              | 19 +++++++
 article/tests.py                              | 49 ++++++++++++++++++-
 article/wagtail_hooks.py                      |  5 +-
 core/utils/extracts_normalized_email.py       | 33 +++++++++++++
 core/wagtail_hooks.py                         |  2 +-
 8 files changed, 138 insertions(+), 5 deletions(-)
 create mode 100644 article/migrations/0015_article_article_art_id_49c380_idx.py
 create mode 100644 core/utils/extracts_normalized_email.py

diff --git a/article/migrations/0015_article_article_art_id_49c380_idx.py b/article/migrations/0015_article_article_art_id_49c380_idx.py
new file mode 100644
index 00000000..264f9a4e
--- /dev/null
+++ b/article/migrations/0015_article_article_art_id_49c380_idx.py
@@ -0,0 +1,26 @@
+# Generated by Django 5.0.8 on 2024-12-09 18:07
+
+from django.conf import settings
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    """Add an index named ``article_art_id_49c380_idx`` on the ``id`` field of the ``article`` model."""
+    dependencies = [
+        ("article", "0014_merge_20241008_0012"),
+        ("core", "0004_language_core_langua_code2_4f7261_idx"),
+        ("doi", "0001_initial"),
+        ("institution", "0005_institution_institution_type_scielo_and_more"),
+        ("issue", "0003_alter_tocsection_unique_together"),
+        ("journal", "0028_journaltocsection_tocitem"),
+        ("researcher", "0004_alter_institutionalauthor_unique_together"),
+        ("vocabulary", "0003_keyword_html_text"),
+        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
+    ]
+    # NOTE(review): "id" is presumably the auto-created primary key, which is already indexed - confirm this extra index is needed.
+    operations = [
+        migrations.AddIndex(
+            model_name="article",
+            index=models.Index(fields=["id"], name="article_art_id_49c380_idx"),
+        ),
+    ]
diff --git a/article/models.py b/article/models.py
index 69adbceb..1b804c87 100755
--- a/article/models.py
+++ b/article/models.py
@@ -149,6 +149,11 @@ class Article(ExportModelOperationsMixin('article'), CommonControlField, Cluster
     class Meta:
         ordering = ["-updated", "-created", "sps_pkg_name"]
         indexes = [
+            models.Index(
+                fields=[
+                    "id",
+                ]
+            ),
             models.Index(
                 fields=[
                     "pid_v2",
diff --git a/article/sources/xmlsps.py b/article/sources/xmlsps.py
index 57c3895d..35e8c472 100755
--- a/article/sources/xmlsps.py
+++ b/article/sources/xmlsps.py
@@ -20,6 +20,7 @@
 from packtools.sps.pid_provider.xml_sps_lib import XMLWithPre
 
 from article.models import Article, ArticleFunding, DocumentAbstract, DocumentTitle
+from core.utils.extracts_normalized_email import extracts_normalized_email
 from core.models import Language
 from doi.models import DOI
 from institution.models import Sponsor, Publisher
@@ -379,7 +380,8 @@ def create_or_update_researchers(xmltree, user, item):
                 data.append(obj)
             else:
                 for aff in affs:
-                    email = author.get("email") or aff.get("email")
+                    raw_email = author.get("email") or aff.get("email")
+                    email = extracts_normalized_email(raw_email=raw_email)  # the helper's only parameter is named raw_email
                     aff_data = {
                         **researcher_data,
                         "aff_name": aff.get("orgname"),
diff --git a/article/tasks.py b/article/tasks.py
index 1fc6eaa0..099d5952 100644
--- a/article/tasks.py
+++ b/article/tasks.py
@@ -1,3 +1,4 @@
+import re
 import logging
 import sys
 from datetime import datetime
@@ -8,8 +9,10 @@
 
 from article.models import Article, ArticleFormat
 from article.sources import xmlsps
+from core.utils.extracts_normalized_email import extracts_normalized_email
 from article.sources.preprint import harvest_preprints
 from config import celery_app
+from researcher.models import ResearcherIdentifier
 from pid_provider.models import PidProviderXML
 from pid_provider.provider import PidProvider
 from tracker.models import UnexpectedEvent
@@ -295,3 +298,19 @@ def remove_duplicate_articles(pid_v3=None):
 def remove_duplicate_articles_task(self, user_id=None, username=None, pid_v3=None):
     remove_duplicate_articles(pid_v3)
 
+
+def get_researcher_identifier_unnormalized():
+    """Return the EMAIL ResearcherIdentifier rows whose identifier is not already a bare e-mail address."""
+    return ResearcherIdentifier.objects.filter(source_name="EMAIL").exclude(identifier__regex=r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')
+
+
+@celery_app.task(bind=True)
+def normalize_stored_email(self, batch_size=1000):
+    """Replace each malformed EMAIL identifier with the address extracted from it, saving in batches of ``batch_size``."""
+    updated_list = []
+    for re_identifier in get_researcher_identifier_unnormalized():
+        email = extracts_normalized_email(raw_email=re_identifier.identifier)
+        if email:  # rows with no recognizable address are left as they are
+            re_identifier.identifier = email
+            updated_list.append(re_identifier)
+    ResearcherIdentifier.objects.bulk_update(updated_list, ['identifier'], batch_size=batch_size)
\ No newline at end of file
diff --git a/article/tests.py b/article/tests.py
index f97d8943..d41f9ebd 100755
--- a/article/tests.py
+++ b/article/tests.py
@@ -5,7 +5,8 @@
 from django.utils.timezone import make_aware
 
 from article.models import Article
-from article.tasks import remove_duplicate_articles
+from article.tasks import remove_duplicate_articles, normalize_stored_email, get_researcher_identifier_unnormalized
+from researcher.models import ResearcherIdentifier
 
 
 class TestArticleMigration(TestCase):
@@ -58,3 +59,49 @@ def test_remove_duplicates_for_multiple_pids(self):
         self.assertEqual(Article.objects.filter(pid_v3="pid3").count(), 1)
         self.assertEqual(Article.objects.get(pid_v3="pid2").created, make_aware(datetime(2022, 6, 3)))
         self.assertEqual(Article.objects.get(pid_v3="pid3").created, make_aware(datetime(2022, 6, 14)))
+
+
+class NormalizeEmailResearcherIdentifierTest(TestCase):
+    """Checks that normalize_stored_email rewrites the malformed EMAIL identifiers created in setUp."""
+
+    def setUp(self):
+        self.emails = [
+            '<a href="mailto:jgarrido@ucv.cl">jgarrido@ucv.cl</a>',
+            '<a href="mailto:gagopa39@hotmail.com">gagopa39@hotmail.com</a>',
+            ' herbet@ufs.br',
+            'pilosaperez@gmail.com.',
+            'cortes- camarillo@hotmail.com',
+            'ulrikekeyser@upn162-zamora.edu.mx',
+            'cortescamarillo@hotmail.com',
+            'candelariasgro@yahoo.com',
+            'mailto:user@hotmail.com">gagopa39@hotmail.com</a>',
+        ]
+        self.orcids = [
+            '0000-0002-9147-0547',
+            '0000-0003-3622-3428',
+            '0000-0002-4842-3331',
+            '0000-0003-1314-4073',
+        ]
+        ResearcherIdentifier.objects.bulk_create([ResearcherIdentifier(identifier=email, source_name="EMAIL") for email in self.emails])
+        ResearcherIdentifier.objects.bulk_create([ResearcherIdentifier(identifier=orcid, source_name="ORCID") for orcid in self.orcids])
+
+    def test_normalize_stored_email(self):
+        self.assertEqual(6, get_researcher_identifier_unnormalized().count())  # six of the nine e-mails above are malformed
+
+        normalize_stored_email()
+
+        normalized_emails = [
+            'jgarrido@ucv.cl',
+            'gagopa39@hotmail.com',
+            'herbet@ufs.br',
+            'pilosaperez@gmail.com',
+            'cortes-camarillo@hotmail.com',
+            'user@hotmail.com',
+        ]
+        for email in normalized_emails:
+            with self.subTest(email=email):
+                self.assertTrue(
+                    ResearcherIdentifier.objects.filter(identifier=email).exists(),
+                    f"E-mail '{email}' unnormalized"
+                )
+        self.assertEqual(0, get_researcher_identifier_unnormalized().count())  # nothing may be left to normalize
diff --git a/article/wagtail_hooks.py b/article/wagtail_hooks.py
index 2d82ee6a..5182e42a 100644
--- a/article/wagtail_hooks.py
+++ b/article/wagtail_hooks.py
@@ -19,6 +19,7 @@
     ArticleFormat,
     ArticleFunding,
 )
+from collection.models import Collection
 from config.menu import get_menu_order
 
 
@@ -27,8 +28,8 @@ class CollectionFilter(SimpleListFilter):
     parameter_name = "collection"
 
     def lookups(self, request, model_admin):
-        articles = Article.objects.all()
-        return [(collection.id, collection.main_name) for article in articles for collection in article.collections if collection.is_active]
+        # One (id, main_name) choice per active collection.
+        return [(item.id, item.main_name) for item in Collection.objects.filter(is_active=True)]
     
     def queryset(self, request, queryset):
         if self.value():
diff --git a/core/utils/extracts_normalized_email.py b/core/utils/extracts_normalized_email.py
new file mode 100644
index 00000000..e4e3e387
--- /dev/null
+++ b/core/utils/extracts_normalized_email.py
@@ -0,0 +1,33 @@
+import re
+
+def extracts_normalized_email(raw_email):
+    """
+    Pull the first e-mail address out of a raw string.
+
+    Every space character is deleted from ``raw_email`` first, so an address
+    broken by stray spaces is joined back together; the first substring
+    shaped like ``local@domain.tld`` is then returned, leaving out any
+    surrounding markup such as an ``<a href="mailto:...">`` wrapper.
+
+    Args:
+        raw_email (str): Text expected to contain an e-mail address. Falsy
+            values such as ``None`` or ``""`` are accepted.
+
+    Returns:
+        str or None: The extracted address, or None when ``raw_email`` is
+        falsy or contains no address.
+
+    Example:
+        >>> extracts_normalized_email('   user@example.com ')
+        'user@example.com'
+        >>> extracts_normalized_email('<a href="mailto:user@example.com">user@example.com</a>')
+        'user@example.com'
+        >>> extracts_normalized_email('cortes- camarillo@hotmail.com')
+        'cortes-camarillo@hotmail.com'
+        >>> extracts_normalized_email('invalid-email.com') is None
+        True
+    """
+    if not raw_email:
+        return None
+    found = re.search(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", raw_email.replace(" ", ""))
+    return found.group() if found else None
\ No newline at end of file
diff --git a/core/wagtail_hooks.py b/core/wagtail_hooks.py
index 6c2bbba0..8f0f2f26 100755
--- a/core/wagtail_hooks.py
+++ b/core/wagtail_hooks.py
@@ -90,7 +90,7 @@ class ArticleSummaryItem(SummaryItem):
 
     def get_context_data(self, parent_context):
         site_details = get_site_for_user(self.request.user)
-        total_article = Article.objects.all().count()
+        total_article = Article.objects.count()
         return {
             "total_article": total_article,
             "site_name": site_details["site_name"],