Skip to content

Commit

Permalink
Cria processamento para normalizar e-mail em ResearcherIdentifier (#905)
Browse files Browse the repository at this point in the history
* Adiciona index id em article

* Realiza otimizacao no filtro de colecao em  article

* Adiciona funcao para extrair email normalizado

* Modifica query Article em ArticleSummaryItem

* Cria tasks para normalizar email em ResearcherIdentifier

* Cria teste NormalizeEmailResearcherIdentifierTest

* migration

* fix indentation

* Muda nome para extracts_normalized_email

* Muda função para receber parâmetros mais genéricos

* Altera atribuição de parâmetro para extracts_normalized_email

* Utiliza extracts_normalized_email em normalize_stored_email

* Adiciona docs

* Move para o core

* Insere nova importacao de extracts_normalized_email

* Adiciona mais um exemplo em docs
  • Loading branch information
samuelveigarangel authored Dec 13, 2024
1 parent adc5dea commit aaeb517
Show file tree
Hide file tree
Showing 8 changed files with 138 additions and 5 deletions.
26 changes: 26 additions & 0 deletions article/migrations/0015_article_article_art_id_49c380_idx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Generated by Django 5.0.8 on 2024-12-09 18:07

from django.conf import settings
from django.db import migrations, models


class Migration(migrations.Migration):
    """Add an index over the ``id`` field of the ``article`` model."""

    # Migrations that have to be applied before this one can run.
    dependencies = [
        ("article", "0014_merge_20241008_0012"),
        ("core", "0004_language_core_langua_code2_4f7261_idx"),
        ("doi", "0001_initial"),
        ("institution", "0005_institution_institution_type_scielo_and_more"),
        ("issue", "0003_alter_tocsection_unique_together"),
        ("journal", "0028_journaltocsection_tocitem"),
        ("researcher", "0004_alter_institutionalauthor_unique_together"),
        ("vocabulary", "0003_keyword_html_text"),
        # Dependency on whichever app provides the configured user model.
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    # Single schema operation: create the index named
    # "article_art_id_49c380_idx" on article.id.
    operations = [
        migrations.AddIndex(
            model_name="article",
            index=models.Index(fields=["id"], name="article_art_id_49c380_idx"),
        ),
    ]
5 changes: 5 additions & 0 deletions article/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,11 @@ class Article(ExportModelOperationsMixin('article'), CommonControlField, Cluster
class Meta:
ordering = ["-updated", "-created", "sps_pkg_name"]
indexes = [
models.Index(
fields=[
"id",
]
),
models.Index(
fields=[
"pid_v2",
Expand Down
4 changes: 3 additions & 1 deletion article/sources/xmlsps.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from packtools.sps.pid_provider.xml_sps_lib import XMLWithPre

from article.models import Article, ArticleFunding, DocumentAbstract, DocumentTitle
from core.utils.extracts_normalized_email import extracts_normalized_email
from core.models import Language
from doi.models import DOI
from institution.models import Sponsor, Publisher
Expand Down Expand Up @@ -379,7 +380,8 @@ def create_or_update_researchers(xmltree, user, item):
data.append(obj)
else:
for aff in affs:
email = author.get("email") or aff.get("email")
# Prefer the author's own e-mail; fall back to the affiliation's.
raw_email = author.get("email") or aff.get("email")
# extracts_normalized_email() only accepts `raw_email`; the previous
# `data=` keyword raised TypeError, so the value is passed positionally.
email = extracts_normalized_email(raw_email)
aff_data = {
**researcher_data,
"aff_name": aff.get("orgname"),
Expand Down
19 changes: 19 additions & 0 deletions article/tasks.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
import logging
import sys
from datetime import datetime
Expand All @@ -8,8 +9,10 @@

from article.models import Article, ArticleFormat
from article.sources import xmlsps
from core.utils.extracts_normalized_email import extracts_normalized_email
from article.sources.preprint import harvest_preprints
from config import celery_app
from researcher.models import ResearcherIdentifier
from pid_provider.models import PidProviderXML
from pid_provider.provider import PidProvider
from tracker.models import UnexpectedEvent
Expand Down Expand Up @@ -295,3 +298,19 @@ def remove_duplicate_articles(pid_v3=None):
def remove_duplicate_articles_task(self, user_id=None, username=None, pid_v3=None):
remove_duplicate_articles(pid_v3)


def get_researcher_identifier_unnormalized():
    """Return the EMAIL identifiers that still need normalization.

    Returns:
        A ResearcherIdentifier queryset restricted to ``source_name="EMAIL"``
        from which every row whose ``identifier`` already matches the bare
        e-mail pattern (anchored at both ends) has been excluded.
    """
    email_identifiers = ResearcherIdentifier.objects.filter(source_name="EMAIL")
    # Rows matching this anchored pattern are considered already normalized.
    bare_email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
    return email_identifiers.exclude(identifier__regex=bare_email_pattern)

@celery_app.task(bind=True)
def normalize_stored_email(self):
    """Rewrite stored EMAIL identifiers to their normalized e-mail address.

    Every identifier returned by ``get_researcher_identifier_unnormalized``
    is passed through ``extracts_normalized_email``. Identifiers from which
    no address can be extracted are left untouched; the others are saved
    with a single ``bulk_update`` on the ``identifier`` field.
    """
    pending = []

    for record in get_researcher_identifier_unnormalized():
        normalized = extracts_normalized_email(raw_email=record.identifier)
        if not normalized:
            # Nothing usable was found; keep the stored value as it is.
            continue
        record.identifier = normalized
        pending.append(record)

    ResearcherIdentifier.objects.bulk_update(pending, ['identifier'])
49 changes: 48 additions & 1 deletion article/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
from django.utils.timezone import make_aware

from article.models import Article
from article.tasks import remove_duplicate_articles
from article.tasks import remove_duplicate_articles, normalize_stored_email, get_researcher_identifier_unnormalized
from researcher.models import ResearcherIdentifier


class TestArticleMigration(TestCase):
Expand Down Expand Up @@ -58,3 +59,49 @@ def test_remove_duplicates_for_multiple_pids(self):
self.assertEqual(Article.objects.filter(pid_v3="pid3").count(), 1)
self.assertEqual(Article.objects.get(pid_v3="pid2").created, make_aware(datetime(2022, 6, 3)))
self.assertEqual(Article.objects.get(pid_v3="pid3").created, make_aware(datetime(2022, 6, 14)))


class NormalizeEmailResearcherIdentifierTest(TestCase):
    """Checks that normalize_stored_email cleans up stored EMAIL identifiers."""

    def setUp(self):
        # Raw values stored with source_name="EMAIL": some wrapped in HTML
        # anchors, some with stray spaces or a trailing dot, some plain.
        # NOTE(review): the addresses read "[email protected]" here, which looks
        # like redaction of the original fixtures — confirm against the repo.
        self.emails = [
            '<a href="mailto:[email protected]">[email protected]</a>',
            '<a href="mailto:[email protected]">[email protected]</a>',
            ' [email protected]',
            '[email protected].',
            'cortes- [email protected]',
            '[email protected]',
            '[email protected]',
            '[email protected]',
            'mailto:[email protected]">[email protected]</a>',
        ]

        # Identifiers stored with source_name="ORCID"; the query under test
        # filters on source_name="EMAIL", so these must not be selected.
        self.orcids = [
            '0000-0002-9147-0547',
            '0000-0003-3622-3428',
            '0000-0002-4842-3331',
            '0000-0003-1314-4073',
        ]
        ResearcherIdentifier.objects.bulk_create([ResearcherIdentifier(identifier=email, source_name="EMAIL") for email in self.emails])
        ResearcherIdentifier.objects.bulk_create([ResearcherIdentifier(identifier=orcid, source_name="ORCID") for orcid in self.orcids])

    def test_normalize_stored_email(self):
        # Before the task runs, six of the EMAIL rows are expected to be
        # reported as unnormalized.
        unnormalized_identifiers = get_researcher_identifier_unnormalized()
        self.assertEqual(6, unnormalized_identifiers.count())

        # Called directly (synchronously) rather than queued.
        normalize_stored_email()

        # Addresses that must exist as identifiers once the task has run.
        normalized_emails = [
            '[email protected]',
            '[email protected]',
            '[email protected]',
            '[email protected]',
            '[email protected]',
            '[email protected]',
        ]

        for email in normalized_emails:
            # subTest reports each missing address individually.
            with self.subTest(email=email):
                self.assertTrue(
                    ResearcherIdentifier.objects.filter(identifier=email).exists(),
                    f"E-mail '{email}' unnormalized"
                )
5 changes: 3 additions & 2 deletions article/wagtail_hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
ArticleFormat,
ArticleFunding,
)
from collection.models import Collection
from config.menu import get_menu_order


Expand All @@ -27,8 +28,8 @@ class CollectionFilter(SimpleListFilter):
parameter_name = "collection"

def lookups(self, request, model_admin):
articles = Article.objects.all()
return [(collection.id, collection.main_name) for article in articles for collection in article.collections if collection.is_active]
collections = Collection.objects.filter(is_active=True)
return [(collection.id, collection.main_name) for collection in collections]

def queryset(self, request, queryset):
if self.value():
Expand Down
33 changes: 33 additions & 0 deletions core/utils/extracts_normalized_email.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import re

def extracts_normalized_email(raw_email):
    """Pull the first e-mail address out of a raw string.

    All space characters are removed from the input first, so an address
    broken by stray spaces is joined back together. The first substring
    that looks like ``local@domain.tld`` is then returned; surrounding
    markup or punctuation is dropped.

    Args:
        raw_email: Text that may contain an e-mail address, possibly with
            extra characters, spaces or markup. Falsy values (``None``,
            ``""``) are accepted.

    Returns:
        The extracted address, or ``None`` when the input is falsy or no
        address is found.

    Example:
        >>> extracts_normalized_email(' user@example.com ')
        'user@example.com'
        >>> extracts_normalized_email('<a href="mailto:user@example.com">user@example.com</a>')
        'user@example.com'
        >>> extracts_normalized_email('cortes- silva@example.org')
        'cortes-silva@example.org'
        >>> extracts_normalized_email('invalid-email.com') is None
        True
    """
    if not raw_email:
        return None

    # Drop spaces before matching so a split address is matched as one token.
    compact = raw_email.replace(" ", "")
    found = re.search(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", compact)
    return found.group() if found else None
2 changes: 1 addition & 1 deletion core/wagtail_hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ class ArticleSummaryItem(SummaryItem):

def get_context_data(self, parent_context):
site_details = get_site_for_user(self.request.user)
total_article = Article.objects.all().count()
total_article = Article.objects.count()
return {
"total_article": total_article,
"site_name": site_details["site_name"],
Expand Down

0 comments on commit aaeb517

Please sign in to comment.