-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Cria processamento para normalizar e-mail em ResearcherIdentifier (#905)
* Adiciona index id em article * Realiza otimização no filtro de coleção em article * Adiciona função para extrair email normalizado * Modifica query Article em ArticleSummaryItem * Cria tasks para normalizar email em ResearcherIdentifier * Cria teste NormalizeEmailResearcherIdentifierTest * migration * fix indentation * Muda nome para extracts_normalized_email * Muda função para receber parâmetros mais genéricos * Altera atribuição de parâmetro para extracts_normalized_email * Utiliza extracts_normalized_email em normalize_stored_email * Adiciona docs * Move para o core * Insere nova importação de extracts_normalized_email * Adiciona mais um exemplo em docs
- Loading branch information
1 parent
adc5dea
commit aaeb517
Showing
8 changed files
with
138 additions
and
5 deletions.
There are no files selected for viewing
26 changes: 26 additions & 0 deletions
26
article/migrations/0015_article_article_art_id_49c380_idx.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
# Generated by Django 5.0.8 on 2024-12-09 18:07 | ||
|
||
from django.conf import settings | ||
from django.db import migrations, models | ||
|
||
|
||
class Migration(migrations.Migration):
    """Add a database index on the ``id`` field of the ``article`` model."""

    # Migrations that must already be applied before this one runs. The last
    # entry resolves to whichever app provides the configured user model.
    dependencies = [
        ("article", "0014_merge_20241008_0012"),
        ("core", "0004_language_core_langua_code2_4f7261_idx"),
        ("doi", "0001_initial"),
        ("institution", "0005_institution_institution_type_scielo_and_more"),
        ("issue", "0003_alter_tocsection_unique_together"),
        ("journal", "0028_journaltocsection_tocitem"),
        ("researcher", "0004_alter_institutionalauthor_unique_together"),
        ("vocabulary", "0003_keyword_html_text"),
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    # Single schema change: create the index named "article_art_id_49c380_idx".
    # NOTE(review): if ``id`` is the model's primary key the database probably
    # indexes it already - confirm that this extra index is really needed.
    operations = [
        migrations.AddIndex(
            model_name="article",
            index=models.Index(fields=["id"], name="article_art_id_49c380_idx"),
        ),
    ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,7 +5,8 @@ | |
from django.utils.timezone import make_aware | ||
|
||
from article.models import Article | ||
from article.tasks import remove_duplicate_articles | ||
from article.tasks import remove_duplicate_articles, normalize_stored_email, get_researcher_identifier_unnormalized | ||
from researcher.models import ResearcherIdentifier | ||
|
||
|
||
class TestArticleMigration(TestCase): | ||
|
@@ -58,3 +59,49 @@ def test_remove_duplicates_for_multiple_pids(self): | |
self.assertEqual(Article.objects.filter(pid_v3="pid3").count(), 1) | ||
self.assertEqual(Article.objects.get(pid_v3="pid2").created, make_aware(datetime(2022, 6, 3))) | ||
self.assertEqual(Article.objects.get(pid_v3="pid3").created, make_aware(datetime(2022, 6, 14))) | ||
|
||
|
||
class NormalizeEmailResearcherIdentifierTest(TestCase):
    """Exercise ``normalize_stored_email`` against stored ResearcherIdentifier rows.

    The fixture stores identifiers with ``source_name="EMAIL"`` (several of
    them wrapped in markup, padded with spaces or followed by a dot) plus a
    few ``source_name="ORCID"`` rows.
    """

    def setUp(self):
        # NOTE(review): the address literals in this class show up as an
        # e-mail-protection placeholder in this view; confirm the real
        # fixture values against the repository before relying on them.
        self.emails = [
            '<a href="mailto:[email protected]">[email protected]</a>',
            '<a href="mailto:[email protected]">[email protected]</a>',
            ' [email protected]',
            '[email protected].',
            'cortes- [email protected]',
            '[email protected]',
            '[email protected]',
            '[email protected]',
            'mailto:[email protected]">[email protected]</a>',
        ]

        self.orcids = [
            '0000-0002-9147-0547',
            '0000-0003-3622-3428',
            '0000-0002-4842-3331',
            '0000-0003-1314-4073',
        ]

        email_rows = [
            ResearcherIdentifier(identifier=value, source_name="EMAIL")
            for value in self.emails
        ]
        orcid_rows = [
            ResearcherIdentifier(identifier=value, source_name="ORCID")
            for value in self.orcids
        ]
        # Two separate bulk inserts, one per identifier source.
        ResearcherIdentifier.objects.bulk_create(email_rows)
        ResearcherIdentifier.objects.bulk_create(orcid_rows)

    def test_normalize_stored_email(self):
        """Six rows are reported as unnormalized, then every expected address exists."""
        pending = get_researcher_identifier_unnormalized()
        self.assertEqual(6, pending.count())

        normalize_stored_email()

        expected_emails = [
            '[email protected]',
            '[email protected]',
            '[email protected]',
            '[email protected]',
            '[email protected]',
            '[email protected]',
        ]

        for email in expected_emails:
            with self.subTest(email=email):
                is_stored = ResearcherIdentifier.objects.filter(
                    identifier=email
                ).exists()
                self.assertTrue(is_stored, f"E-mail '{email}' unnormalized")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
import re | ||
|
||
def extracts_normalized_email(raw_email):
    """
    Return the first e-mail address found in ``raw_email``, or ``None``.

    Every space character (``" "``) is removed from the input first, so an
    address that was split by a stray space is joined back together. The
    cleaned text is then searched for the first substring shaped like
    ``local@domain.tld``; surrounding markup or trailing punctuation is
    left out of the result.

    Args:
        raw_email (str): Text that may contain an e-mail address together
            with extra characters, spaces or markup. Falsy values (empty
            string, ``None``) are accepted.

    Returns:
        str or None: The matched address, or ``None`` when the input is
        falsy or contains no address.

    Example:
        >>> extracts_normalized_email(' user@example.com ')
        'user@example.com'
        >>> extracts_normalized_email('<a href="mailto:user@example.com">user@example.com</a>')
        'user@example.com'
        >>> extracts_normalized_email('invalid-email.com') is None
        True
        >>> extracts_normalized_email('lto:user@example.com">user@example.com</a>')
        'user@example.com'
    """
    if not raw_email:
        return None

    # Only plain spaces are dropped; other whitespace is left untouched.
    compact = raw_email.replace(" ", "")
    found = re.search(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", compact)
    return found.group() if found else None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters