Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Migrate delete_expired tasks to Django. #643

Merged
merged 15 commits into from
Apr 19, 2019
6 changes: 6 additions & 0 deletions app/app.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,12 @@ handlers:
script: wsgi.application
- url: /.*/sitemap.*
script: wsgi.application
- url: /.*/tasks/process_expirations.*
script: wsgi.application
- url: /.*/tasks/cleanup_stray_notes.*
script: wsgi.application
- url: /.*/tasks/cleanup_stray_subscriptions.*
script: wsgi.application
- url: /.*/tasks/sitemap_ping.*
script: wsgi.application
- url: .*
Expand Down
15 changes: 9 additions & 6 deletions app/cron.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,15 @@ cron:
url: /global/tasks/count/update_dead_status
schedule: every 5 minutes

- description: delete expired
url: /global/tasks/delete_expired
schedule: every 60 minutes
- description: delete old
url: /global/tasks/delete_old
schedule: every 60 minutes
- description: process expirations
url: /global/tasks/process_expirations
schedule: every 4 hours
- description: clean up unassociated notes
url: /global/tasks/cleanup_stray_notes
schedule: every 4 hours
- description: clean up unassociated subscriptions
url: /global/tasks/cleanup_stray_subscriptions
schedule: every 4 hours
- description: clean up in test mode
url: /global/tasks/clean_up_in_test_mode
schedule: every 60 minutes
Expand Down
5 changes: 4 additions & 1 deletion app/delete.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,10 @@ def get_restore_url(handler, person, ttl=3*24*3600):

def delete_person(handler, person, send_notices=True):
"""Delete a person record and associated data. If it's an original
record, deletion can be undone within EXPIRED_TTL_DAYS days."""
record, deletion can be undone within EXPIRED_TTL_DAYS days.

The handler argument is not needed if notices aren't being sent.
"""
if person.is_original():
if send_notices:
# For an original record, send notifiations
Expand Down
21 changes: 17 additions & 4 deletions app/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -460,8 +460,7 @@ def get_effective_expiry_date(self):
# in theory, we should always have original_creation_date, but since
# it was only added recently, we might have legacy
# records without it.
start_date = (self.source_date or self.original_creation_date or
utils.get_utcnow())
start_date = self.original_creation_date or utils.get_utcnow()
return start_date + timedelta(expiration_days)

def put_expiry_flags(self):
Expand Down Expand Up @@ -509,12 +508,26 @@ def wipe_contents(self):
# Permanently delete all related Photos and Notes, but not self.
self.delete_related_entities()

was_changed = False
# TODO(nworden): consider adding a is_tombstone property or something
# like that, so we could just check that instead of checking each
# property individually every time.
for name, property in self.properties().items():
# Leave the repo, is_expired flag, and timestamps untouched.
if name not in ['repo', 'is_expired', 'original_creation_date',
'source_date', 'entry_date', 'expiry_date']:
setattr(self, name, property.default)
self.put() # Store the empty placeholder record.
if name == 'photo':
# If we attempt to access this directly, Datastore will try
# to fetch the actual photo, which won't go well, because we
# just deleted the photo.
cur_value = Person.photo.get_value_for_datastore(self)
else:
cur_value = getattr(self, name)
if cur_value != property.default:
setattr(self, name, property.default)
was_changed = True
if was_changed:
self.put() # Store the empty placeholder record.

def delete_related_entities(self, delete_self=False):
"""Permanently delete all related Photos and Notes, and also self if
Expand Down
83 changes: 0 additions & 83 deletions app/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
import cloud_storage
import config
import const
import delete
import model
import photo
import pfif
Expand All @@ -38,92 +37,10 @@


CPU_MEGACYCLES_PER_REQUEST = 1000
EXPIRED_TTL = datetime.timedelta(delete.EXPIRED_TTL_DAYS, 0, 0)
FETCH_LIMIT = 100
PFIF = pfif.PFIF_VERSIONS[pfif.PFIF_DEFAULT_VERSION]


class ScanForExpired(utils.BaseHandler):
"""Common logic for scanning the Person table looking for things to delete.

The common logic handles iterating through the query, updating the expiry
date and wiping/deleting as needed. The is_expired flag on all records whose
expiry_date has passed. Records that expired more than EXPIRED_TTL in the
past will also have their data fields, notes, and photos permanently
deleted.

Subclasses set the query and task_name."""
repo_required = False

# App Engine issues HTTP requests to tasks.
https_required = False

def task_name(self):
"""Subclasses should implement this."""
pass

def query(self):
"""Subclasses should implement this."""
pass

def schedule_next_task(self, cursor):
"""Schedule the next task for to carry on with this query.
"""
self.add_task_for_repo(self.repo, self.task_name(), self.ACTION,
cursor=cursor, queue_name='expiry')

def get(self):
if self.repo:
query = self.query()
if self.params.cursor:
query.with_cursor(self.params.cursor)
cursor = self.params.cursor
try:
for person in query:
# query.cursor() returns a cursor which returns the entity
# next to this "person" as the first result.
next_cursor = query.cursor()
was_expired = person.is_expired
person.put_expiry_flags()
if (utils.get_utcnow() - person.get_effective_expiry_date()
> EXPIRED_TTL):
person.wipe_contents()
else:
# treat this as a regular deletion.
if person.is_expired and not was_expired:
delete.delete_person(self, person)
cursor = next_cursor
except runtime.DeadlineExceededError:
self.schedule_next_task(cursor)
except datastore_errors.Timeout:
# This exception is sometimes raised, maybe when the query
# object live too long?
self.schedule_next_task(cursor)
else:
for repo in model.Repo.list():
self.add_task_for_repo(repo, self.task_name(), self.ACTION)

class DeleteExpired(ScanForExpired):
"""Scan for person records with expiry date thats past."""
ACTION = 'tasks/delete_expired'

def task_name(self):
return 'delete-expired'

def query(self):
return model.Person.past_due_records(self.repo)

class DeleteOld(ScanForExpired):
"""Scan for person records with old source dates for expiration."""
ACTION = 'tasks/delete_old'

def task_name(self):
return 'delete-old'

def query(self):
return model.Person.potentially_expired_records(self.repo)


class CleanUpInTestMode(utils.BaseHandler):
"""If the repository is in "test mode", this task deletes all entries older
than DELETION_AGE_SECONDS (defined below), regardless of their actual
Expand Down
33 changes: 33 additions & 0 deletions app/tasksmodule/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

import django.http

import model
import utils
import views.base

Expand All @@ -40,3 +41,35 @@ def dispatch(self, request, *args, **kwargs):
logging.warn('Non-taskqueue access of: %s' % self.request.path)
return self.error(403)
return super(TasksBaseView, self).dispatch(request, args, kwargs)


class PerRepoTaskBaseView(TasksBaseView):
"""Base class for tasks that should be split up by repo.

It's fairly common for our cron jobs to be split up by repo (it's the only
good way we've thought of to split them up).

GET requests are used for kicking the jobs off. Subclasses should implement
their operations in the POST handler, and should implement schedule_task to
handle their task names, parameters, etc.
"""

def schedule_task(self, repo, **kwargs):
"""Schedules a new or continuation task."""
del self, repo, kwargs # unusued
raise NotImplementedError()

def get(self, request, *args, **kwargs):
"""Schedules tasks."""
del request, args, kwargs # unused
if self.env.repo == 'global':
for repo in model.Repo.list():
self.schedule_task(repo)
else:
self.schedule_task(self.env.repo)
return django.http.HttpResponse('')

def post(self, request, *args, **kwargs):
"""Carries out task operations."""
del request, args, kwargs # unused
raise NotImplementedError()
Loading