Skip to content

Commit

Permalink
Merge pull request #254 from gitnnolabs/optimize_raw_data
Browse files Browse the repository at this point in the history
Melhora a geração dos indicadores otimizando a criação do arquivo com os dados brutos.
  • Loading branch information
gitnnolabs authored Nov 27, 2023
2 parents 878bb1c + c8ae261 commit 04e5324
Show file tree
Hide file tree
Showing 14 changed files with 393 additions and 166 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Generated by Django 4.1.6 on 2023-11-24 15:09

from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone


class Migration(migrations.Migration):
    """Add the control fields ``created``, ``creator``, ``updated`` and ``updated_by`` to ``article``."""

    # Both new foreign keys point at settings.AUTH_USER_MODEL, so the user
    # model's app must be migrated first; this also builds on article 0026.
    dependencies = [
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
        ("article", "0026_article_concepts_article_programs"),
    ]

    operations = [
        # Creation timestamp. preserve_default=False marks the timezone.now
        # default as temporary: it only back-fills rows that already exist.
        migrations.AddField(
            model_name="article",
            name="created",
            field=models.DateTimeField(
                auto_now_add=True,
                default=django.utils.timezone.now,
                verbose_name="Data de criação",
            ),
            preserve_default=False,
        ),
        # User who created the row. "%(class)s" in related_name is Django's
        # placeholder for the lower-cased model class name.
        # NOTE(review): the column is non-nullable (no null=True) while the
        # one-off default is None, so this step will likely fail with a
        # NOT NULL violation if the article table already has rows — confirm
        # the table is empty or back-fill a user before applying.
        migrations.AddField(
            model_name="article",
            name="creator",
            field=models.ForeignKey(
                default=None,
                editable=False,
                on_delete=django.db.models.deletion.CASCADE,
                related_name="%(class)s_creator",
                to=settings.AUTH_USER_MODEL,
                verbose_name="Criador",
            ),
            preserve_default=False,
        ),
        # Last-modification timestamp (auto_now=True).
        migrations.AddField(
            model_name="article",
            name="updated",
            field=models.DateTimeField(
                auto_now=True, verbose_name="Data da última atualização"
            ),
        ),
        # User who last modified the row; optional (null=True, blank=True).
        # on_delete=CASCADE: deleting that user also deletes the article.
        migrations.AddField(
            model_name="article",
            name="updated_by",
            field=models.ForeignKey(
                blank=True,
                editable=False,
                null=True,
                on_delete=django.db.models.deletion.CASCADE,
                related_name="%(class)s_last_mod_user",
                to=settings.AUTH_USER_MODEL,
                verbose_name="Atualizador",
            ),
        ),
    ]
20 changes: 11 additions & 9 deletions article/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from . import choices
from .forms import ContributorForm

from core.models import Source
from core.models import Source, CommonControlField


class Concepts(models.Model):
Expand Down Expand Up @@ -80,6 +80,7 @@ class Concepts(models.Model):

def autocomplete_label(self):
    """Return the label for this concept in the form ``"<name> (<level>)"``."""
    # The formatted text always contains " ()" at minimum, so it is never
    # empty and needs no empty-string fallback.
    return "{} ({})".format(self.name, self.level)

class Meta:
verbose_name = _("Concept")
verbose_name_plural = _("Concepts")
Expand Down Expand Up @@ -331,7 +332,6 @@ def group(
d[k] = v
yield d


@classmethod
def parameters_for_values(
cls,
Expand Down Expand Up @@ -1124,7 +1124,7 @@ def create_or_update(cls, **kwargs):
return article, created


class Article(models.Model):
class Article(CommonControlField):
title = models.CharField(_("Title"), max_length=510, null=True, blank=True)
doi = models.CharField(_("DOI"), max_length=100, null=True, blank=True)
volume = models.CharField(_("Volume"), max_length=20, null=True, blank=True)
Expand Down Expand Up @@ -1326,10 +1326,11 @@ def create_or_update(cls, pk="doi", **kwargs):
"number": "999",
"volume": "9",
"year": 2002,
"journal": instance of <journal>
"contributors": list of <contributors> [<contributor>, <contributor>, <contributor>]
"license": instance of license
"sources": list of <sources> [<source>, <source>]
"journal": instance of <journal>,
"contributors": list of <contributors> [<contributor>, <contributor>, <contributor>],
"license": instance of license,
"sources": list of <sources> [<source>, <source>],
"user": instance of <user>
}
return article(object), 0|1
Expand All @@ -1341,9 +1342,10 @@ def create_or_update(cls, pk="doi", **kwargs):

try:
article = cls.get(**kwargs)
article.updated_by = kwargs.get("user")
created = 0
except Article.DoesNotExist:
article = cls.objects.create()
article = cls.objects.create(creator=kwargs.get("user"))
created = 1
except SourceArticle.MultipleObjectsReturned as e:
print(_("The article table have duplicity...."))
Expand Down Expand Up @@ -1460,4 +1462,4 @@ def group(
k = k.replace("thematic_areas__", "thematic_area__")

d[k] = v
yield d
yield d
20 changes: 19 additions & 1 deletion article/search_indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from article import models


class SourceArticleIndex(indexes.SearchIndex, indexes.Indexable):
class ArticleIndex(indexes.SearchIndex, indexes.Indexable):
"""
Fields:
doi
Expand Down Expand Up @@ -53,6 +53,24 @@ class SourceArticleIndex(indexes.SearchIndex, indexes.Indexable):

text = indexes.CharField(document=True, use_template=True)

# control fields
created = indexes.CharField(null=False)
updated = indexes.CharField(null=False)
creator = indexes.CharField(null=False)
updated_by = indexes.CharField(null=False)

def prepare_created(self, obj):
    """Return ``obj.created`` serialized with ``isoformat()`` for the index."""
    created_at = obj.created
    return created_at.isoformat()

def prepare_updated(self, obj):
    """Return ``obj.updated`` serialized with ``isoformat()`` for the index."""
    updated_at = obj.updated
    return updated_at.isoformat()

def prepare_creator(self, obj):
    """Return the value of ``obj.creator`` to be stored in the ``creator`` index field."""
    # NOTE(review): the index field is declared as a CharField, but the value
    # is returned as-is (presumably a user instance) — confirm the search
    # backend stringifies it as intended.
    creator = obj.creator
    return creator

def prepare_updated_by(self, obj):
    """Return the value of ``obj.updated_by`` to be stored in the ``updated_by`` index field."""
    # NOTE(review): the index field is declared with null=False, yet this
    # returns the attribute unchanged — confirm it can never be None here.
    last_editor = obj.updated_by
    return last_editor

def prepare_record_type(self, obj):
    """Return the constant record type for every indexed object: ``"article"``."""
    record_type = "article"
    return record_type

Expand Down
149 changes: 83 additions & 66 deletions article/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,12 @@ def load_openalex(user_id, date=2012, length=None, country="BR"):
Retrieves article data from the OpenAlex API for a specific year and populates article.models.Article
Sync or Async
Param: date is a integer representing the date range in the format 'YYYY'.
Args:
date: An integer representing the publication year in the format 'YYYY'.
length: An integer that determines the number of items to fetch from OpenAlex.
country: A string with the country code used to filter the OpenAlex results.
The endpoint OpenAlex: https://api.openalex.org/works/?filter=institutions.country_code:{country},publication_year:{date}&per-page=200&cursor=*"
Expand All @@ -35,7 +40,6 @@ def load_openalex(user_id, date=2012, length=None, country="BR"):
tasks.load_openalex(date=2012)
Running using a script:
python manage.py runscript load_openalex --script-args 1 2012
Expand Down Expand Up @@ -211,24 +215,24 @@ def article_source_to_article(
source_name="OPENALEX",
size=None,
loop_size=1000,
intitution_id=None,
institution_id=None,
year=None,
):
"""
This task loads the source articles into articles.
Args:
size: An integer indicating the number of articles to process.
loop_size: An integer that determines the size of each slice passed to the sub-task ``load_openalex_article``.
intitution_id: A string with the institution to process.
institution_id: A string with the institution to process.
year: A string with the year to process.
"""
count = 0
filters = {}
filters["source__name"] = source_name

if intitution_id:
filters["raw__authorships__0__institutions__icontains"] = intitution_id
if institution_id:
filters["raw__authorships__0__institutions__icontains"] = institution_id

if year:
filters["year"] = year
Expand Down Expand Up @@ -280,7 +284,7 @@ def load_openalex_article(user_id, article_ids, update=False):
if not update:
if doi:
if models.Article.objects.filter(doi=doi).exists():
print("ja existe")
logger.info("Article with id: %s, already exists" % id)
continue

if title:
Expand All @@ -295,27 +299,30 @@ def load_openalex_article(user_id, article_ids, update=False):
year = core_utils.nestget(article.raw, "publication_year")

# Get the journal data
if article.raw.get("primary_location"):
journal_data = core_utils.nestget(
article.raw, "primary_location", "source"
)
if journal_data:
j_issn_l = journal_data.get("issn_l")
if journal_data.get("issn"):
j_issns = ",".join(journal_data.get("issn"))
j_name = journal_data.get("display_name")
j_is_in_doaj = journal_data.get("is_in_doaj")

journal, _ = models.Journal.create_or_update(
**{
"journal_issn_l": j_issn_l,
"journal_issns": j_issns,
"journal_name": j_name,
"journal_is_in_doaj": j_is_in_doaj,
},
)
else:
journal = None
# Reset on every pass so a failure (or a record without a primary location)
# can never leave ``journal`` unbound or carrying a value from a previous
# article.
journal = None
try:
    journal_data = None
    if article.raw.get("primary_location"):
        journal_data = core_utils.nestget(
            article.raw, "primary_location", "source"
        )
    if journal_data:
        # "issn" may be missing or empty; only join when there is something
        # to join instead of referencing an unassigned name.
        issn_list = journal_data.get("issn")
        j_issns = ",".join(issn_list) if issn_list else None

        journal, _ = models.Journal.create_or_update(
            **{
                "journal_issn_l": journal_data.get("issn_l"),
                "journal_issns": j_issns,
                "journal_name": journal_data.get("display_name"),
                "journal_is_in_doaj": journal_data.get("is_in_doaj"),
            },
        )
except Exception as e:
    # Best effort: the article is still processed, just without a journal.
    logger.error("Erro get/create journal: %s" % e)

# APC
is_apc = (
Expand All @@ -325,18 +332,21 @@ def load_openalex_article(user_id, article_ids, update=False):
# Open Access Status
oa_status = core_utils.nestget(article.raw, "open_access", "oa_status")

# license
if article.raw.get("primary_location"):
if core_utils.nestget(article.raw, "primary_location", "license"):
license, _ = models.License.create_or_update(
**{
"name": core_utils.nestget(
article.raw, "primary_location", "license"
)
}
)
else:
license = None
# license
# Reset on every pass so a failure (or a record without a primary location)
# can never leave ``license`` unbound or carrying a value from a previous
# article.
license = None
try:
    license_name = None
    if article.raw.get("primary_location"):
        license_name = core_utils.nestget(
            article.raw, "primary_location", "license"
        )
    if license_name:
        license, _ = models.License.create_or_update(**{"name": license_name})
except Exception as e:
    # Best effort: the article is still processed, just without a license.
    logger.error("Erro get/create license: %s" % e)

# contributors
contributors = []
Expand Down Expand Up @@ -366,36 +376,42 @@ def load_openalex_article(user_id, article_ids, update=False):
# Here we are adding the affiliation to the contributor
if au.get("raw_affiliation_strings"):
affs = []
aff_obj, _ = models.Affiliation.create_or_update(
**{"name": "|".join(au.get("raw_affiliation_strings"))}
)
affs.append(aff_obj)
try:
aff_obj, _ = models.Affiliation.create_or_update(
**{"name": "|".join(au.get("raw_affiliation_strings"))}
)
affs.append(aff_obj)

author_dict.update(
{
"affiliations": affs,
}
)
author_dict.update(
{
"affiliations": affs,
}
)
except Exception as ex:
logger.error("Erro get/create affiliation: %s" % e)

# Add the institutions
if au.get("institutions"):
insts = []
source = Source.objects.get(name="OPENALEX")
for inst in au.get("institutions"):
inst_obj = models.SourceInstitution.get(
**{"specific_id": inst.get("id"), "source": source}
try:
for inst in au.get("institutions"):
inst_obj = models.SourceInstitution.get(
**{"specific_id": inst.get("id"), "source": source}
)
insts.append(inst_obj)

author_dict.update(
{
"institutions": insts,
}
)
insts.append(inst_obj)

author_dict.update(
{
"institutions": insts,
}
)
contributor, _ = models.Contributor.create_or_update(
**author_dict
)

contributor, _ = models.Contributor.create_or_update(
**author_dict
)
except Exception as ex:
logger.error("Erro get/create institution: %s" % e)

contributors.append(contributor)

# add the concepts
Expand All @@ -418,12 +434,13 @@ def load_openalex_article(user_id, article_ids, update=False):
"year": year,
"is_oa": core_utils.nestget(article.raw, "open_access", "is_oa"),
"sources": [Source.objects.get(name="OPENALEX")],
"journal": journal,
"journal": journal or None,
"apc": is_apc,
"open_access_status": oa_status,
"contributors": contributors,
"license": license,
"license": license or None,
"concepts": concepts,
"user": user,
}

article, created = models.Article.create_or_update(**article_dict)
Expand Down
Loading

0 comments on commit 04e5324

Please sign in to comment.