Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(statistics): use subquery instead of join to avoid cartesian product #558

Closed
Closed
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ backend-test: ## Test the backend

.PHONY: backend-dbshell
backend-dbshell: ## Start a psql shell
@$(ORCHESTRATOR) compose run -it --rm db psql -Utimed timed
@$(ORCHESTRATOR) compose exec -it db psql -Utimed timed

.PHONY: shellplus
shellplus: ## Run shell_plus
Expand Down
135 changes: 69 additions & 66 deletions backend/poetry.lock

Large diffs are not rendered by default.

203 changes: 52 additions & 151 deletions backend/timed/reports/filters.py
Original file line number Diff line number Diff line change
@@ -1,175 +1,76 @@
from __future__ import annotations

from typing import TYPE_CHECKING
from datetime import timedelta

from django.db.models import DurationField, F, Q, Sum, Value
from django.db.models import Case, Exists, F, OuterRef, Q, QuerySet, Sum, Value, When
from django.db.models.functions import Coalesce
from django_filters import utils
from django_filters.rest_framework import (
BaseInFilter,
CharFilter,
DateFilter,
DjangoFilterBackend,
FilterSet,
NumberFilter,
)

from timed.projects.models import CustomerAssignee, ProjectAssignee, TaskAssignee
from timed.projects.filters import TaskFilterSet
from timed.tracking.filters import CostCenterFilter, ReportFilterSet

if TYPE_CHECKING:
from django.db.models import QuerySet

from timed.tracking.models import Report
class NOOPFilter(CharFilter):
def filter(self, qs, value):
return qs


class StatisticFiltersetBase:
def filter_has_reviewer(
self, queryset: QuerySet[Report], _name: str, value: int
) -> QuerySet[Report]:
if not value: # pragma: no cover
return queryset
class ReportStatisticFilterset(FilterSet):
user = NumberFilter(field_name="user")
customer = NumberFilter(field_name="customer_id")
from_date = DateFilter(field_name="date", lookup_expr="gte")
to_date = DateFilter(field_name="date", lookup_expr="lte")
cost_center = CostCenterFilter(task_prefix="task")
reviewer = NumberFilter(method="filter_reviewer")

task_prefix = self._refs["task_prefix"]
project_prefix = self._refs["project_prefix"]
customer_prefix = self._refs["customer_prefix"]
def filter_reviewer(self, qs, name, value):
return ReportFilterSet.filter_has_reviewer(self, qs, name, value)

customer_assignees = CustomerAssignee.objects.filter(
is_reviewer=True, user_id=value
).values("customer_id")

project_assignees = ProjectAssignee.objects.filter(
is_reviewer=True, user_id=value
).values("project_id")
task_assignees = TaskAssignee.objects.filter(
is_reviewer=True, user_id=value
).values("task_id")
class OrgReportFilterset(FilterSet):
"""Base class for organisational filtersets (customer, project, task).

the_filter = (
Q(**{f"{customer_prefix}pk__in": customer_assignees})
| Q(**{f"{project_prefix}pk__in": project_assignees})
| Q(**{f"{task_prefix}id__in": task_assignees})
)
return queryset.filter_aggregate(the_filter).filter_base(the_filter)

def filter_cost_center(
self, queryset: QuerySet[Report], _name: str, value: int
) -> QuerySet[Report]:
"""Filter report by cost center.

The filter behaves slightly different depending on what the
statistic summarizes:
* When viewing the statistic over customers, the work durations are
filtered (either project or task)
* When viewing the statistic over project, only the projects
are filtered
* When viewing the statistic over tasks, only the tasks
are filtered
"""
# TODO: Discuss Is this the desired behaviour by our users?

if not value: # pragma: no cover
return queryset

is_customer = not self._refs["customer_prefix"]

task_prefix = self._refs["task_prefix"]
project_prefix = self._refs["project_prefix"]

filter_q = Q(**{f"{task_prefix}cost_center": value}) | Q(
**{
f"{project_prefix}cost_center": value,
f"{task_prefix}cost_center__isnull": True,
}
)
These have in common that none of the report-specific things can be
filtered, so instead of copy-pasting noop filters everywhere, we collect
them here
"""

if is_customer:
# Customer mode: We only need to filter aggregates,
# as the customer has no cost center
return queryset.filter_aggregate(filter_q)
# Project or task: Filter both to get the correct result
return queryset.filter_base(filter_q).filter_aggregate(filter_q)
user = NOOPFilter()
from_date = NOOPFilter()
to_date = NOOPFilter()
cost_center = NOOPFilter()
reviewer = NOOPFilter()

def filter_queryset(self, queryset: QuerySet[Report]) -> QuerySet[Report]:
qs = super().filter_queryset(queryset)

duration_ref = self._refs["reports_ref"] + "__duration"
class CustomerReportStatisticFilterSet(OrgReportFilterset):
customer = NumberFilter(field_name="pk")

full_qs = qs._base.annotate( # noqa: SLF001
duration=Coalesce(
Sum(duration_ref, filter=qs._agg_filters), # noqa: SLF001
Value("00:00:00", DurationField(null=False)),
),
pk=F("id"),
)
return full_qs.values()


def statistic_filterset_builder(
name: str, reports_ref: str, project_ref: str, customer_ref: str, task_ref: str
) -> type[StatisticFiltersetBase, FilterSet]:
reports_prefix = f"{reports_ref}__" if reports_ref else ""
project_prefix = f"{project_ref}__" if project_ref else ""
customer_prefix = f"{customer_ref}__" if customer_ref else ""
task_prefix = f"{task_ref}__" if task_ref else ""

return type(
name,
(StatisticFiltersetBase, FilterSet),
{
"_refs": {
"reports_prefix": reports_prefix,
"project_prefix": project_prefix,
"customer_prefix": customer_prefix,
"task_prefix": task_prefix,
"reports_ref": reports_ref,
"project_ref": project_ref,
"customer_ref": customer_ref,
"task_ref": task_ref,
},
"from_date": DateFilter(
field_name=f"{reports_prefix}date", lookup_expr="gte"
),
"to_date": DateFilter(
field_name=f"{reports_prefix}date", lookup_expr="lte"
),
"project": NumberFilter(field_name=f"{project_prefix}pk"),
"customer": NumberFilter(field_name=f"{customer_prefix}pk"),
"review": NumberFilter(field_name=f"{reports_prefix}review"),
"not_billable": NumberFilter(field_name=f"{reports_prefix}not_billable"),
"billed": NumberFilter(field_name=f"{reports_prefix}billed"),
"verified": NumberFilter(
field_name=f"{reports_prefix}verified_by_id",
lookup_expr="isnull",
exclude=True,
),
"verifier": NumberFilter(field_name=f"{reports_prefix}verified_by"),
"billing_type": NumberFilter(field_name=f"{project_prefix}billing_type"),
"user": NumberFilter(field_name=f"{reports_prefix}user_id"),
"rejected": NumberFilter(field_name=f"{reports_prefix}rejected"),
"id": BaseInFilter(),
"cost_center": NumberFilter(method="filter_cost_center"),
"reviewer": NumberFilter(method="filter_has_reviewer"),
},
)


CustomerStatisticFilterSet = statistic_filterset_builder(
"CustomerStatisticFilterSet",
reports_ref="projects__tasks__reports",
project_ref="projects",
task_ref="projects__tasks",
customer_ref="",
)

ProjectStatisticFilterSet = statistic_filterset_builder(
"ProjectStatisticFilterSet",
reports_ref="tasks__reports",
project_ref="",
task_ref="tasks",
customer_ref="customer",
)
class StatisticSecondaryFilterBackend(DjangoFilterBackend):
# Special statistic filter backend. Turn

TaskStatisticFilterSet = statistic_filterset_builder(
"TaskStatisticFilterSet",
reports_ref="reports",
project_ref="project",
task_ref="",
customer_ref="project__customer",
)
def get_filterset_class(self, view, queryset=None):
return view.secondary_filterset_class

def filter_queryset(self, request, queryset, view):
secondary_filterset = self.get_filterset(request, view.secondary_queryset, view)

if not secondary_filterset.is_valid() and self.raise_exception:
raise utils.translate_validation(secondary_filterset.errors)

secondary_qs = secondary_filterset.qs

queryset = (
queryset.filter(**{view.secondary_link_field: OuterRef("pk")})
.values(view.secondary_link_field)
.annotate(duration=Sum("duration"))
)

return secondary_qs.annotate(duration=Sum(queryset.values("duration")))
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
from django.db import migrations, models

CREATE_VIEW_SQL = """
CREATE VIEW "reports_reportstatistic" AS (
SELECT
projects_task.id as id, -- avoid jumping through hoops for Django
projects_task.id as task_id,
user_id,

projects_project.id as project_id,
projects_project.customer_id as customer_id,

sum(coalesce(duration, '00:00:00')) as duration,

date,
extract('year' from date) as year,
extract('month' from date) as month,
(extract('year' from date) * 100 + extract('month' from date))
as full_month

FROM projects_task
LEFT JOIN tracking_report ON projects_task.id = tracking_report.task_id
JOIN projects_project ON projects_project.id = projects_task.project_id

GROUP BY
projects_task.id,
user_id,
date,
projects_task.id,
projects_project.id,
projects_project.customer_id
)
"""

DROP_VIEW_SQL = """
DROP VIEW "reports_reportstatistic"
"""


class Migration(migrations.Migration):

dependencies = [
('tracking', '0001_initial')
]

operations = [
migrations.RunSQL(CREATE_VIEW_SQL, DROP_VIEW_SQL),
migrations.CreateModel(
name='reportstatistic',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('duration', models.DurationField()),
('month', models.IntegerField()),
('full_month', models.CharField(max_length=7)),
('year', models.CharField(max_length=4)),
],
options={
'managed': False,
},
),
]
Empty file.
34 changes: 34 additions & 0 deletions backend/timed/reports/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from django.db import models

from timed.employment.models import User
from timed.projects.models import Customer, Project, Task


class ReportStatistic(models.Model):
# Implement a basic STAR SCHEMA view: Basic aggregate is done in the VIEW,
# everything else of interest can be JOINed directly

task = models.OneToOneField(
Task, on_delete=models.DO_NOTHING, null=True, related_name="+"
)
user = models.ForeignKey(
User, on_delete=models.DO_NOTHING, null=True, related_name="+"
)
project = models.ForeignKey(
Project, on_delete=models.DO_NOTHING, null=False, related_name="+"
)
customer = models.ForeignKey(
Customer, on_delete=models.DO_NOTHING, null=False, related_name="+"
)
duration = models.DurationField()

month = models.IntegerField()
full_month = models.CharField(max_length=7)
year = models.IntegerField()
date = models.DateField()

class Meta:
managed = False

def __str__(self):
return f"ReportStat({self.task} @ {self.date}, {self.user})"
28 changes: 23 additions & 5 deletions backend/timed/reports/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from __future__ import annotations

from datetime import timedelta
from typing import TYPE_CHECKING

from django.contrib.auth import get_user_model
Expand All @@ -21,16 +22,33 @@
from typing import ClassVar


class ReportDurationField(DurationField):
"""Custom duration field, to turn None into 00:00:00.

We don't wanna complicate the query any more than neccessary, and
cast the NULL durations into something else on DB level. Let's just
do it at deserialisation time instead.
"""

# TODO: this shouldn't be needed, for total_time calc we still need
# the *numbers* in the QS

def to_representation(self, value):
if value is None:
value = timedelta(seconds=0)
return super().to_representation(value)


class YearStatisticSerializer(TotalTimeRootMetaMixin, Serializer):
duration = DurationField()
duration = DurationField(read_only=True)
year = IntegerField()

class Meta:
resource_name = "year-statistics"


class MonthStatisticSerializer(TotalTimeRootMetaMixin, Serializer):
duration = DurationField()
duration = DurationField(read_only=True)
year = IntegerField()
month = IntegerField()

Expand All @@ -39,16 +57,16 @@ class Meta:


class CustomerStatisticSerializer(TotalTimeRootMetaMixin, Serializer):
duration = DurationField()
duration = DurationField(read_only=True)
name = CharField(read_only=True)

class Meta:
resource_name = "customer-statistics"


class ProjectStatisticSerializer(TotalTimeRootMetaMixin, Serializer):
duration = DurationField()
name = CharField()
duration = DurationField(read_only=True)
name = CharField(source="project__name")
amount_offered = DecimalField(max_digits=None, decimal_places=2)
amount_offered_currency = CharField()
amount_invoiced = DecimalField(max_digits=None, decimal_places=2)
Expand Down
Loading
Loading