From aaeb517e50c328db009ab254d5fc1ada0fc02d87 Mon Sep 17 00:00:00 2001
From: Samuel Veiga Rangel
<82840278+samuelveigarangel@users.noreply.github.com>
Date: Fri, 13 Dec 2024 11:10:03 -0300
Subject: [PATCH] Cria processamento para normalizar e-mail em
ResearcherIdentifier (#905)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* Adiciona index id em article
* Realiza otimizacao no filtro de colecao em article
* Adiciona funcao para extrair email normalizado
* Modifica query Article em ArticleSummaryItem
* Cria tasks para normalizar email em ResearcherIdentifier
* Cria teste NormalizeEmailResearcherIdentifierTest
* migration
* fix indentation
* Muda nome para extracts_normalized_email
* Muda funcao para receber parametros mais genericos
* Altera atribuição de parâmetro para extracts_normalized_email
* Utiliza extracts_normalized_email em normalize_stored_email
* Adiciona docs
* Move para o core
* Insere nova importacao de extracts_normalized_email
* Adiciona mais um exemplo em docs
---
.../0015_article_article_art_id_49c380_idx.py | 26 ++++++++++
article/models.py | 5 ++
article/sources/xmlsps.py | 4 +-
article/tasks.py | 19 +++++++
article/tests.py | 49 ++++++++++++++++++-
article/wagtail_hooks.py | 5 +-
core/utils/extracts_normalized_email.py | 33 +++++++++++++
core/wagtail_hooks.py | 2 +-
8 files changed, 138 insertions(+), 5 deletions(-)
create mode 100644 article/migrations/0015_article_article_art_id_49c380_idx.py
create mode 100644 core/utils/extracts_normalized_email.py
diff --git a/article/migrations/0015_article_article_art_id_49c380_idx.py b/article/migrations/0015_article_article_art_id_49c380_idx.py
new file mode 100644
index 00000000..264f9a4e
--- /dev/null
+++ b/article/migrations/0015_article_article_art_id_49c380_idx.py
@@ -0,0 +1,26 @@
+# Generated by Django 5.0.8 on 2024-12-09 18:07
+
+from django.conf import settings
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+ # Schema migration for the "article" app; its single operation adds one index.
+ dependencies = [
+ ("article", "0014_merge_20241008_0012"),
+ ("core", "0004_language_core_langua_code2_4f7261_idx"),
+ ("doi", "0001_initial"),
+ ("institution", "0005_institution_institution_type_scielo_and_more"),
+ ("issue", "0003_alter_tocsection_unique_together"),
+ ("journal", "0028_journaltocsection_tocitem"),
+ ("researcher", "0004_alter_institutionalauthor_unique_together"),
+ ("vocabulary", "0003_keyword_html_text"),
+ migrations.swappable_dependency(settings.AUTH_USER_MODEL),
+ ]
+ # NOTE(review): "id" is presumably the primary key, which the database already indexes - confirm this extra index is needed.
+ operations = [
+ migrations.AddIndex(
+ model_name="article",
+ index=models.Index(fields=["id"], name="article_art_id_49c380_idx"),
+ ),
+ ]
diff --git a/article/models.py b/article/models.py
index 69adbceb..1b804c87 100755
--- a/article/models.py
+++ b/article/models.py
@@ -149,6 +149,11 @@ class Article(ExportModelOperationsMixin('article'), CommonControlField, Cluster
class Meta:
ordering = ["-updated", "-created", "sps_pkg_name"]
indexes = [
+ models.Index(
+ fields=[
+ "id",
+ ]
+ ),
models.Index(
fields=[
"pid_v2",
diff --git a/article/sources/xmlsps.py b/article/sources/xmlsps.py
index 57c3895d..35e8c472 100755
--- a/article/sources/xmlsps.py
+++ b/article/sources/xmlsps.py
@@ -20,6 +20,7 @@
from packtools.sps.pid_provider.xml_sps_lib import XMLWithPre
from article.models import Article, ArticleFunding, DocumentAbstract, DocumentTitle
+from core.utils.extracts_normalized_email import extracts_normalized_email
from core.models import Language
from doi.models import DOI
from institution.models import Sponsor, Publisher
@@ -379,7 +380,8 @@ def create_or_update_researchers(xmltree, user, item):
data.append(obj)
else:
for aff in affs:
- email = author.get("email") or aff.get("email")
+ raw_email = author.get("email") or aff.get("email")
+ email = extracts_normalized_email(raw_email=raw_email)  # keyword must match the helper's only parameter, raw_email
aff_data = {
**researcher_data,
"aff_name": aff.get("orgname"),
diff --git a/article/tasks.py b/article/tasks.py
index 1fc6eaa0..099d5952 100644
--- a/article/tasks.py
+++ b/article/tasks.py
@@ -1,3 +1,4 @@
+import re
import logging
import sys
from datetime import datetime
@@ -8,8 +9,10 @@
from article.models import Article, ArticleFormat
from article.sources import xmlsps
+from core.utils.extracts_normalized_email import extracts_normalized_email
from article.sources.preprint import harvest_preprints
from config import celery_app
+from researcher.models import ResearcherIdentifier
from pid_provider.models import PidProviderXML
from pid_provider.provider import PidProvider
from tracker.models import UnexpectedEvent
@@ -295,3 +298,19 @@ def remove_duplicate_articles(pid_v3=None):
def remove_duplicate_articles_task(self, user_id=None, username=None, pid_v3=None):
remove_duplicate_articles(pid_v3)
+
+def get_researcher_identifier_unnormalized():  # EMAIL-sourced identifiers whose whole value is not a single bare address
+ return ResearcherIdentifier.objects.filter(source_name="EMAIL").exclude(identifier__regex=r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')
+
+@celery_app.task(bind=True)
+def normalize_stored_email(self,):  # Celery task: rewrite stored EMAIL identifiers with the address extracted from them
+ updated_list = []
+ re_identifiers = get_researcher_identifier_unnormalized()
+ # NOTE(review): presumably only rows with a truthy extraction result are modified and queued - confirm nesting.
+ for re_identifier in re_identifiers:
+ email = extracts_normalized_email(raw_email=re_identifier.identifier)
+ if email:
+ re_identifier.identifier = email
+ updated_list.append(re_identifier)
+ # Write back only the "identifier" column of the queued rows in one bulk_update call.
+ ResearcherIdentifier.objects.bulk_update(updated_list, ['identifier'])
\ No newline at end of file
diff --git a/article/tests.py b/article/tests.py
index f97d8943..d41f9ebd 100755
--- a/article/tests.py
+++ b/article/tests.py
@@ -5,7 +5,8 @@
from django.utils.timezone import make_aware
from article.models import Article
-from article.tasks import remove_duplicate_articles
+from article.tasks import remove_duplicate_articles, normalize_stored_email, get_researcher_identifier_unnormalized
+from researcher.models import ResearcherIdentifier
class TestArticleMigration(TestCase):
@@ -58,3 +59,49 @@ def test_remove_duplicates_for_multiple_pids(self):
self.assertEqual(Article.objects.filter(pid_v3="pid3").count(), 1)
self.assertEqual(Article.objects.get(pid_v3="pid2").created, make_aware(datetime(2022, 6, 3)))
self.assertEqual(Article.objects.get(pid_v3="pid3").created, make_aware(datetime(2022, 6, 14)))
+
+
+class NormalizeEmailResearcherIdentifierTest(TestCase):  # covers normalize_stored_email and get_researcher_identifier_unnormalized
+ def setUp(self):
+ self.emails = [
+ 'jgarrido@ucv.cl',
+ 'gagopa39@hotmail.com',
+ ' herbet@ufs.br',
+ 'pilosaperez@gmail.com.',
+ 'cortes- camarillo@hotmail.com',
+ 'ulrikekeyser@upn162-zamora.edu.mx',
+ 'cortescamarillo@hotmail.com',
+ 'candelariasgro@yahoo.com',
+ 'mailto:user@hotmail.com">gagopa39@hotmail.com',
+ ]
+ # ORCID values are stored with source_name="ORCID", so the EMAIL filter should not select them.
+ self.orcids = [
+ '0000-0002-9147-0547',
+ '0000-0003-3622-3428',
+ '0000-0002-4842-3331',
+ '0000-0003-1314-4073',
+ ]
+ ResearcherIdentifier.objects.bulk_create([ResearcherIdentifier(identifier=email, source_name="EMAIL") for email in self.emails])
+ ResearcherIdentifier.objects.bulk_create([ResearcherIdentifier(identifier=orcid, source_name="ORCID") for orcid in self.orcids])
+
+ def test_normalize_stored_email(self):
+ unnormalized_identifiers = get_researcher_identifier_unnormalized()
+ self.assertEqual(6, unnormalized_identifiers.count())
+ # NOTE(review): only 4 e-mail fixtures visible in setUp fail the anchored pattern, yet 6 is asserted - confirm the fixture list is complete.
+ normalize_stored_email()
+ # Addresses that must be present as identifiers once the task has run.
+ normalized_emails = [
+ 'jgarrido@ucv.cl',
+ 'gagopa39@hotmail.com',
+ 'herbet@ufs.br',
+ 'pilosaperez@gmail.com',
+ 'cortes-camarillo@hotmail.com',
+ 'user@hotmail.com',
+ ]
+
+ for email in normalized_emails:
+ with self.subTest(email=email):
+ self.assertTrue(
+ ResearcherIdentifier.objects.filter(identifier=email).exists(),
+ f"E-mail '{email}' unnormalized"
+ )
diff --git a/article/wagtail_hooks.py b/article/wagtail_hooks.py
index 2d82ee6a..5182e42a 100644
--- a/article/wagtail_hooks.py
+++ b/article/wagtail_hooks.py
@@ -19,6 +19,7 @@
ArticleFormat,
ArticleFunding,
)
+from collection.models import Collection
from config.menu import get_menu_order
@@ -27,8 +28,8 @@ class CollectionFilter(SimpleListFilter):
parameter_name = "collection"
 def lookups(self, request, model_admin):
- articles = Article.objects.all()
- return [(collection.id, collection.main_name) for article in articles for collection in article.collections if collection.is_active]
+ collections = Collection.objects.filter(is_active=True)  # NOTE(review): now lists every active collection, not only those reached through an article - confirm intended
+ return [(collection.id, collection.main_name) for collection in collections]
def queryset(self, request, queryset):
if self.value():
diff --git a/core/utils/extracts_normalized_email.py b/core/utils/extracts_normalized_email.py
new file mode 100644
index 00000000..e4e3e387
--- /dev/null
+++ b/core/utils/extracts_normalized_email.py
@@ -0,0 +1,33 @@
+import re
+
+def extracts_normalized_email(raw_email):
+ """
+ Extract the first email-like substring from a raw string.
+
+ Every space character (" ") is removed from the input first, then a
+ regular expression is searched for and the first match is returned.
+ A falsy input (None, "") or an input with no match yields None.
+ No other cleanup (e.g. lower-casing) is applied to the match.
+
+ Args:
+ raw_email (str): A string containing the raw email data. This may
+ include extra characters, spaces, or invalid formatting.
+
+ Returns:
+ str or None: The matched email address if found, otherwise None.
+
+ Example:
+ >>> extracts_normalized_email(' user@example.com ')
+ 'user@example.com'
+ >>> extracts_normalized_email('user@example.com')
+ 'user@example.com'
+ >>> extracts_normalized_email('invalid-email.com') is None
+ True
+ >>> extracts_normalized_email('mailto:user@example.com">user@example.com')
+ 'user@example.com'
+ """
+ if raw_email:
+ email_match = re.search(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", raw_email.replace(" ", ""))
+ if email_match:
+ return email_match.group()
+ return None
\ No newline at end of file
diff --git a/core/wagtail_hooks.py b/core/wagtail_hooks.py
index 6c2bbba0..8f0f2f26 100755
--- a/core/wagtail_hooks.py
+++ b/core/wagtail_hooks.py
@@ -90,7 +90,7 @@ class ArticleSummaryItem(SummaryItem):
def get_context_data(self, parent_context):
site_details = get_site_for_user(self.request.user)
- total_article = Article.objects.all().count()
+ total_article = Article.objects.count()
return {
"total_article": total_article,
"site_name": site_details["site_name"],