Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENG-6248] Bulk resync for CrossRef and DataCite #10988

Open
wants to merge 5 commits into
base: feature/b-and-i-25-01
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions admin/management/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,5 @@
re_path(r'^daily_reporters_go', views.DailyReportersGo.as_view(), name='daily_reporters_go'),
re_path(r'^monthly_reporters_go', views.MonthlyReportersGo.as_view(), name='monthly_reporters_go'),
re_path(r'^ingest_cedar_metadata_templates', views.IngestCedarMetadataTemplates.as_view(), name='ingest_cedar_metadata_templates'),
re_path(r'^bulk_resync', views.BulkResync.as_view(), name='bulk-resync')
]
15 changes: 15 additions & 0 deletions admin/management/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@
from django.views.generic import TemplateView, View
from django.contrib import messages
from django.http import HttpResponse
from django.utils import timezone
from django.contrib.auth.mixins import PermissionRequiredMixin

from osf.management.commands.manage_switch_flags import manage_waffle
from osf.management.commands.update_registration_schemas import update_registration_schemas
from osf.management.commands.daily_reporters_go import daily_reporters_go
from osf.management.commands.monthly_reporters_go import monthly_reporters_go
from osf.management.commands.fetch_cedar_metadata_templates import ingest_cedar_metadata_templates
from osf.management.commands.sync_doi_metadata import sync_doi_metadata
from scripts.find_spammy_content import manage_spammy_content
from django.urls import reverse
from django.shortcuts import redirect
Expand Down Expand Up @@ -137,8 +139,21 @@ def post(self, request, *args, **kwargs):
messages.success(request, 'Monthly reporters successfully went.')
return redirect(reverse('management:commands'))


class IngestCedarMetadataTemplates(ManagementCommandPermissionView):
    """Admin view that imports Cedar metadata templates from Cedar Workbench."""

    def post(self, request):
        """Run the ingest command, flash a success message, and return to the commands page."""
        ingest_cedar_metadata_templates()
        success_message = 'Cedar templates have been successfully imported from Cedar Workbench.'
        messages.success(request, success_message)
        return redirect(reverse('management:commands'))


class BulkResync(ManagementCommandPermissionView):
    """Admin view that kicks off a full CrossRef/DataCite metadata resync."""

    def post(self, request):
        """Queue a resync for every DOI identifier modified up to now.

        Runs with no batch cap (``batch_size=None``) and for real
        (``dry_run=False``); the actual work is enqueued as celery tasks.
        """
        sync_doi_metadata(modified_date=timezone.now(), batch_size=None, dry_run=False)
        messages.success(request, 'Resyncing with CrossRef and DataCite! It will take some time.')
        return redirect(reverse('management:commands'))
13 changes: 13 additions & 0 deletions admin/templates/management/commands.html
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,19 @@ <h4><u>Ingest Cedar Metadata templates</u></h4>
</nav>
</form>
</section>
<section>
<h4><u>Resync with CrossRef and DataCite</u></h4>
<p>
Use this management command to resync all preprints with CrossRef and public nodes/registrations with DataCite.
</p>
<form method="post"
action="{% url 'management:bulk-resync'%}">
{% csrf_token %}
<nav>
<input class="btn btn-success" type="submit" value="Run" />
</nav>
</form>
</section>
</div>
</section>
{% endblock %}
72 changes: 58 additions & 14 deletions osf/management/commands/sync_doi_metadata.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,77 @@
#!/usr/bin/env python3
import time
import datetime
import logging

from django.core.management.base import BaseCommand
from datacite.errors import DataCiteServerError
from website.identifiers.clients.exceptions import CrossRefRateLimitError
from osf.models import Identifier

from framework.celery_tasks import app

logger = logging.getLogger(__name__)


@app.task(name='osf.management.commands.sync_doi_metadata', max_retries=5, default_retry_delay=60)
def sync_identifier_doi(identifier_id):
RATE_LIMIT_RETRY_DELAY = 60 * 5


@app.task(name='osf.management.commands.sync_doi_metadata', max_retries=5, default_retry_delay=RATE_LIMIT_RETRY_DELAY)
def sync_identifier_doi(identifier_id, delay=0):
    """Resync one DOI identifier's metadata with its registrar (CrossRef or DataCite).

    :param identifier_id: pk of the ``osf.models.Identifier`` to resync.
    :param delay: seconds to sleep before making the request; the bulk resync
        uses this to stagger tasks and stay under registrar rate limits.
    :raises: re-raises any registrar error so celery records the task failure.

    NOTE(review): celery does not automatically retry on a raised exception
    unless ``autoretry_for`` is set or ``self.retry()`` is called, so
    ``max_retries``/``default_retry_delay`` appear to have no effect as
    written — confirm the intended retry behavior.
    """
    # Staggered start: blocks the worker for `delay` seconds before the request.
    time.sleep(delay)

    identifier = Identifier.objects.get(id=identifier_id)
    try:
        identifier.referent.request_identifier_update('doi')
        identifier.save()
        logger.info(f' doi update for {identifier.value} complete')
    except CrossRefRateLimitError as err:
        logger.warning(f'Doi update for {identifier.value} failed because of rate limit: {err}')
        raise
    except DataCiteServerError as err:
        # the first param is status code, the second one is text
        # see create_identifier.metadata_post call in datacite.py
        if len(err.args) == 2:
            status_code, text = err.args
        else:
            # Defensive: don't mask the registrar failure with a ValueError
            # if the error wasn't raised with the expected (status, text) pair.
            status_code, text = None, err
        if status_code == 429:
            logger.warning(f'Doi update for {identifier.value} failed because of rate limit: {text}')
        else:
            logger.warning(f'Doi update for {identifier.value} failed. Error: {text}')

        raise
    except Exception as err:
        logger.warning(f'Doi update for {identifier.value} failed because of an unexpected error: {err}')
        raise

def sync_doi_metadata(modified_date, batch_size=100, dry_run=True, sync_private=False):

def sync_doi_metadata(modified_date, batch_size=100, dry_run=True, sync_private=False, rate_limit=100):
    """Queue DOI metadata resync tasks for identifiers modified on/before `modified_date`.

    :param modified_date: only identifiers with ``modified <= modified_date`` are resynced.
    :param batch_size: cap on how many identifiers to process; ``None``/0 means no cap.
    :param dry_run: when True, only log what would be minted; enqueue nothing.
    :param sync_private: when True, also resync referents that are not public.
    :param rate_limit: number of tasks enqueued per delay window.
    """
    identifiers = Identifier.objects.filter(
        category='doi',
        deleted__isnull=True,
        modified__lte=modified_date,
        object_id__isnull=False,
    )
    if batch_size:
        identifiers = identifiers[:batch_size]
        # NOTE(review): raising rate_limit up to batch_size means a batch larger
        # than the rate limit gets no intra-batch throttling at all — confirm
        # this is the intended behavior.
        rate_limit = max(batch_size, rate_limit)

    logger.info(f'{"[DRY RUN]: " if dry_run else ""}'
                f'{identifiers.count()} identifiers to mint')

    delay = 0
    for record_number, identifier in enumerate(identifiers, 1):
        if dry_run:
            logger.info(f'[DRY RUN]:  doi minting for {identifier.value} started')
            continue

        logger.info(f' doi minting for {identifier.value} started')
        # in order to not reach rate limits that CrossRef and DataCite have,
        # every `rate_limit` records the start delay grows by RATE_LIMIT_RETRY_DELAY,
        # staggering the enqueued tasks into 5-minute windows.
        if not record_number % rate_limit:
            delay += RATE_LIMIT_RETRY_DELAY

        if (identifier.referent.is_public and not identifier.referent.deleted and not identifier.referent.is_retracted) or sync_private:
            sync_identifier_doi.apply_async(kwargs={'identifier_id': identifier.id, 'delay': delay})


class Command(BaseCommand):
Expand Down Expand Up @@ -66,10 +102,18 @@ def add_arguments(self, parser):
help='include all dois updated before this date.',
required=True
)
parser.add_argument(
'--rate_limit',
'-r',
type=int,
default=100,
help='number of dois to update at the same time.',
)

def handle(self, *args, **options):
    """Entry point: pull the parsed CLI options and dispatch to sync_doi_metadata."""
    sync_doi_metadata(
        options.get('modified_date'),
        options.get('batch_size'),
        dry_run=options.get('dry_run'),
        sync_private=options.get('sync_private'),
        rate_limit=options.get('rate_limit'),
    )
10 changes: 6 additions & 4 deletions website/identifiers/clients/crossref.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import requests
from django.db.models import QuerySet

from .exceptions import CrossRefRateLimitError
from framework.auth.utils import impute_names
from website.identifiers.utils import remove_control_characters
from website.identifiers.clients.base import AbstractIdentifierClient
Expand Down Expand Up @@ -247,7 +248,7 @@ def create_identifier(self, preprint, category, include_relation=True):
logger.info(f'Sending metadata for DOI {doi}:\n{metadata}')

# Crossref sends an email to CROSSREF_DEPOSITOR_EMAIL to confirm
requests.post(
response = requests.post(
self._build_url(
operation='doMDUpload',
login_id=username,
Expand All @@ -256,11 +257,12 @@ def create_identifier(self, preprint, category, include_relation=True):
),
files={'file': (f'{preprint._id}.xml', metadata)},
)
if response.status_code == 429:
raise CrossRefRateLimitError(response.text)

# Don't wait for response to confirm doi because it arrives via email.
return {'doi': doi}
else:
raise NotImplementedError()

raise NotImplementedError()

def update_identifier(self, preprint, category):
return self.create_identifier(preprint, category)
Expand Down
10 changes: 10 additions & 0 deletions website/identifiers/clients/exceptions.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,18 @@
class IdentifierAlreadyExists(Exception):
    """Raised when attempting to create an identifier that already exists."""


class ClientResponseError(Exception):
    """Raised for an unexpected HTTP response from an identifier client.

    The full response object is kept on ``self.response`` so callers can
    inspect status, headers, and body.
    """

    def __init__(self, response):
        self.response = response
        message = f'Error response from client: {response.status_code}'
        super().__init__(message)


class CrossRefRateLimitError(Exception):
    """Raised when CrossRef responds with HTTP 429 (rate limit exceeded).

    :param error: the response body text returned by CrossRef.
    """

    def __init__(self, error):
        self.error = error
        # Also pass the payload to Exception.__init__ so ``err.args`` is
        # populated and the exception pickles/round-trips correctly — the
        # original skipped this, leaving args empty, which breaks standard
        # exception serialization (e.g. celery failure reporting).
        super().__init__(error)

    def __str__(self):
        # str() keeps this safe if a non-string payload is ever passed.
        return str(self.error)
Loading