From aaeb517e50c328db009ab254d5fc1ada0fc02d87 Mon Sep 17 00:00:00 2001 From: Samuel Veiga Rangel <82840278+samuelveigarangel@users.noreply.github.com> Date: Fri, 13 Dec 2024 11:10:03 -0300 Subject: [PATCH] Cria processamento para normalizar e-mail em ResearcherIdentifier (#905) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Adiciona index id em article * Realiza otimizacao no filtro de colecao em article * Adiciona funcao para extrair email normalizado * Modifica query Article em ArticleSummaryItem * Cria tasks para normalizar email em ResearcherIdentifier * Cria teste NormalizeEmailResearcherIdentifierTest * migration * fix indentation * Muda nome para extracts_normalized_email * Muda funcao para receber parametros mais genericos * Altera atribuição de parâmetro para extracts_normalized_email * Utiliza extracts_normalized_email em normalize_stored_email * Adiciona docs * Move para o core * Insere nova importacao de extracts_normalized_email * Adiciona mais um exemplo em docs --- .../0015_article_article_art_id_49c380_idx.py | 26 ++++++++++ article/models.py | 5 ++ article/sources/xmlsps.py | 4 +- article/tasks.py | 19 +++++++ article/tests.py | 49 ++++++++++++++++++- article/wagtail_hooks.py | 5 +- core/utils/extracts_normalized_email.py | 33 +++++++++++++ core/wagtail_hooks.py | 2 +- 8 files changed, 138 insertions(+), 5 deletions(-) create mode 100644 article/migrations/0015_article_article_art_id_49c380_idx.py create mode 100644 core/utils/extracts_normalized_email.py diff --git a/article/migrations/0015_article_article_art_id_49c380_idx.py b/article/migrations/0015_article_article_art_id_49c380_idx.py new file mode 100644 index 00000000..264f9a4e --- /dev/null +++ b/article/migrations/0015_article_article_art_id_49c380_idx.py @@ -0,0 +1,26 @@ +# Generated by Django 5.0.8 on 2024-12-09 18:07 + +from django.conf import settings +from django.db import migrations, models + + +class Migration(migrations.Migration): + + 
dependencies = [ + ("article", "0014_merge_20241008_0012"), + ("core", "0004_language_core_langua_code2_4f7261_idx"), + ("doi", "0001_initial"), + ("institution", "0005_institution_institution_type_scielo_and_more"), + ("issue", "0003_alter_tocsection_unique_together"), + ("journal", "0028_journaltocsection_tocitem"), + ("researcher", "0004_alter_institutionalauthor_unique_together"), + ("vocabulary", "0003_keyword_html_text"), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.AddIndex( + model_name="article", + index=models.Index(fields=["id"], name="article_art_id_49c380_idx"), + ), + ] diff --git a/article/models.py b/article/models.py index 69adbceb..1b804c87 100755 --- a/article/models.py +++ b/article/models.py @@ -149,6 +149,11 @@ class Article(ExportModelOperationsMixin('article'), CommonControlField, Cluster class Meta: ordering = ["-updated", "-created", "sps_pkg_name"] indexes = [ + models.Index( + fields=[ + "id", + ] + ), models.Index( fields=[ "pid_v2", diff --git a/article/sources/xmlsps.py b/article/sources/xmlsps.py index 57c3895d..35e8c472 100755 --- a/article/sources/xmlsps.py +++ b/article/sources/xmlsps.py @@ -20,6 +20,7 @@ from packtools.sps.pid_provider.xml_sps_lib import XMLWithPre from article.models import Article, ArticleFunding, DocumentAbstract, DocumentTitle +from core.utils.extracts_normalized_email import extracts_normalized_email from core.models import Language from doi.models import DOI from institution.models import Sponsor, Publisher @@ -379,7 +380,8 @@ def create_or_update_researchers(xmltree, user, item): data.append(obj) else: for aff in affs: - email = author.get("email") or aff.get("email") + raw_email = author.get("email") or aff.get("email") + email = extracts_normalized_email(raw_email=raw_email) aff_data = { **researcher_data, "aff_name": aff.get("orgname"), diff --git a/article/tasks.py b/article/tasks.py index 1fc6eaa0..099d5952 100644 --- a/article/tasks.py +++ 
b/article/tasks.py @@ -1,3 +1,4 @@ +import re import logging import sys from datetime import datetime @@ -8,8 +9,10 @@ from article.models import Article, ArticleFormat from article.sources import xmlsps +from core.utils.extracts_normalized_email import extracts_normalized_email from article.sources.preprint import harvest_preprints from config import celery_app +from researcher.models import ResearcherIdentifier from pid_provider.models import PidProviderXML from pid_provider.provider import PidProvider from tracker.models import UnexpectedEvent @@ -295,3 +298,19 @@ def remove_duplicate_articles(pid_v3=None): def remove_duplicate_articles_task(self, user_id=None, username=None, pid_v3=None): remove_duplicate_articles(pid_v3) + +def get_researcher_identifier_unnormalized(): + return ResearcherIdentifier.objects.filter(source_name="EMAIL").exclude(identifier__regex=r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$') + +@celery_app.task(bind=True) +def normalize_stored_email(self,): + updated_list = [] + re_identifiers = get_researcher_identifier_unnormalized() + + for re_identifier in re_identifiers: + email = extracts_normalized_email(raw_email=re_identifier.identifier) + if email: + re_identifier.identifier = email + updated_list.append(re_identifier) + + ResearcherIdentifier.objects.bulk_update(updated_list, ['identifier']) \ No newline at end of file diff --git a/article/tests.py b/article/tests.py index f97d8943..d41f9ebd 100755 --- a/article/tests.py +++ b/article/tests.py @@ -5,7 +5,8 @@ from django.utils.timezone import make_aware from article.models import Article -from article.tasks import remove_duplicate_articles +from article.tasks import remove_duplicate_articles, normalize_stored_email, get_researcher_identifier_unnormalized +from researcher.models import ResearcherIdentifier class TestArticleMigration(TestCase): @@ -58,3 +59,49 @@ def test_remove_duplicates_for_multiple_pids(self): self.assertEqual(Article.objects.filter(pid_v3="pid3").count(), 1) 
self.assertEqual(Article.objects.get(pid_v3="pid2").created, make_aware(datetime(2022, 6, 3))) self.assertEqual(Article.objects.get(pid_v3="pid3").created, make_aware(datetime(2022, 6, 14))) + + +class NormalizeEmailResearcherIdentifierTest(TestCase): + def setUp(self): + self.emails = [ + 'jgarrido@ucv.cl', + 'gagopa39@hotmail.com', + ' herbet@ufs.br', + 'pilosaperez@gmail.com.', + 'cortes- camarillo@hotmail.com', + 'ulrikekeyser@upn162-zamora.edu.mx', + 'cortescamarillo@hotmail.com', + 'candelariasgro@yahoo.com', + 'mailto:user@hotmail.com">gagopa39@hotmail.com', + ] + + self.orcids = [ + '0000-0002-9147-0547', + '0000-0003-3622-3428', + '0000-0002-4842-3331', + '0000-0003-1314-4073', + ] + ResearcherIdentifier.objects.bulk_create([ResearcherIdentifier(identifier=email, source_name="EMAIL") for email in self.emails]) + ResearcherIdentifier.objects.bulk_create([ResearcherIdentifier(identifier=orcid, source_name="ORCID") for orcid in self.orcids]) + + def test_normalize_stored_email(self): + unnormalized_identifiers = get_researcher_identifier_unnormalized() + self.assertEqual(6, unnormalized_identifiers.count()) + + normalize_stored_email() + + normalized_emails = [ + 'jgarrido@ucv.cl', + 'gagopa39@hotmail.com', + 'herbet@ufs.br', + 'pilosaperez@gmail.com', + 'cortes-camarillo@hotmail.com', + 'user@hotmail.com', + ] + + for email in normalized_emails: + with self.subTest(email=email): + self.assertTrue( + ResearcherIdentifier.objects.filter(identifier=email).exists(), + f"E-mail '{email}' unnormalized" + ) diff --git a/article/wagtail_hooks.py b/article/wagtail_hooks.py index 2d82ee6a..5182e42a 100644 --- a/article/wagtail_hooks.py +++ b/article/wagtail_hooks.py @@ -19,6 +19,7 @@ ArticleFormat, ArticleFunding, ) +from collection.models import Collection from config.menu import get_menu_order @@ -27,8 +28,8 @@ class CollectionFilter(SimpleListFilter): parameter_name = "collection" def lookups(self, request, model_admin): - articles = Article.objects.all() - return 
[(collection.id, collection.main_name) for article in articles for collection in article.collections if collection.is_active] + collections = Collection.objects.filter(is_active=True) + return [(collection.id, collection.main_name) for collection in collections] def queryset(self, request, queryset): if self.value(): diff --git a/core/utils/extracts_normalized_email.py b/core/utils/extracts_normalized_email.py new file mode 100644 index 00000000..e4e3e387 --- /dev/null +++ b/core/utils/extracts_normalized_email.py @@ -0,0 +1,33 @@ +import re + +def extracts_normalized_email(raw_email): + """ + Extracts and normalizes an email address from a given raw string. + + This function uses a regular expression to identify and extract a valid + email address from the provided input string. It removes any spaces + from the raw string before processing. If no valid email is found, + the function returns None. + + Args: + raw_email (str): A string containing the raw email data. This may + include extra characters, spaces, or invalid formatting. + + Returns: + str or None: The normalized email address if found, otherwise None. 
+ + Example: + >>> extracts_normalized_email(' user@example.com ') + 'user@example.com' + >>> extracts_normalized_email('user@example.com') + 'user@example.com' + >>> extracts_normalized_email('invalid-email.com') + None + >>> extracts_normalized_email('lto:user@example.com">user@example.com') + 'user@example.com' + """ + if raw_email: + email_match = re.search(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", raw_email.replace(" ", "")) + if email_match: + return email_match.group() + return None \ No newline at end of file diff --git a/core/wagtail_hooks.py b/core/wagtail_hooks.py index 6c2bbba0..8f0f2f26 100755 --- a/core/wagtail_hooks.py +++ b/core/wagtail_hooks.py @@ -90,7 +90,7 @@ class ArticleSummaryItem(SummaryItem): def get_context_data(self, parent_context): site_details = get_site_for_user(self.request.user) - total_article = Article.objects.all().count() + total_article = Article.objects.count() return { "total_article": total_article, "site_name": site_details["site_name"],