Skip to content

Commit

Permalink
Merge pull request #252 from datasciencebr/cuducos-full-text-search
Browse files Browse the repository at this point in the history
Use full text search to query reimbursements
  • Loading branch information
anaschwendler authored Sep 28, 2017
2 parents dcfa4ab + 7020281 commit 8e4dae7
Show file tree
Hide file tree
Showing 11 changed files with 126 additions and 21 deletions.
3 changes: 2 additions & 1 deletion jarbas/api/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,5 +36,6 @@ def get_sample_reimbursement_api_response(obj):
receipt_text=obj.receipt_text,
last_update=obj.last_update.strftime('%Y-%m-%dT%H:%M:%SZ'),
available_in_latest_dataset=obj.available_in_latest_dataset,
receipt=dict(fetched=obj.receipt_fetched, url=obj.receipt_url)
receipt=dict(fetched=obj.receipt_fetched, url=obj.receipt_url),
search_vector=None
)
4 changes: 2 additions & 2 deletions jarbas/api/tests/test_reimbursement_view.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ def get_reimbursement(**kwargs):
kwargs['reimbursement_values'] = '200.00,500.00'
kwargs['reimbursement_numbers'] = '2,3'
if quantity == 1:
return mixer.blend(Reimbursement, **kwargs)
return mixer.cycle(quantity).blend(Reimbursement, **kwargs)
return mixer.blend(Reimbursement, search_vector=None, **kwargs)
return mixer.cycle(quantity).blend(Reimbursement, search_vector=None, **kwargs)


class TestListApi(TestCase):
Expand Down
26 changes: 26 additions & 0 deletions jarbas/core/management/commands/searchvector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from django.core.management.base import BaseCommand
from django.contrib.postgres.search import SearchVector

from jarbas.core.models import Reimbursement


class Command(BaseCommand):
    """Management command that fills in `Reimbursement.search_vector`.

    The vector is the concatenation of one `SearchVector` per text column,
    each built with the `portuguese` configuration and its own weight label.
    """

    def handle(self, *args, **options):
        """Print a short notice, then update every reimbursement at once.

        Positional and keyword arguments are accepted but not used.
        """
        row_count = Reimbursement.objects.count()
        notice = 'Creating search vector for {} reimbursements…'
        print(notice.format(row_count))
        print('This takes several minutes/hours.')

        # (column, weight label) pairs, in the order they are concatenated.
        weighted_columns = (
            ('congressperson_name', 'A'),
            ('supplier', 'A'),
            ('cnpj_cpf', 'A'),
            ('party', 'A'),
            ('state', 'B'),
            ('receipt_text', 'B'),
            ('passenger', 'C'),
            ('leg_of_the_trip', 'C'),
            ('subquota_description', 'D'),
            ('subquota_group_description', 'D'),
        )
        vectors = [
            SearchVector(column, config='portuguese', weight=label)
            for column, label in weighted_columns
        ]

        # Fold left to right so the combined expression nests exactly the
        # way a chained `a + b + c + ...` would.
        combined = vectors[0]
        for vector in vectors[1:]:
            combined = combined + vector

        # A single queryset-level update() call covers all rows.
        Reimbursement.objects.update(search_vector=combined)
26 changes: 26 additions & 0 deletions jarbas/core/migrations/0039_add_search_vector_to_reimbursement.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.11.4 on 2017-09-27 20:19
from __future__ import unicode_literals

import django.contrib.postgres.search
from django.db import migrations


# Schema migration: adds a nullable `search_vector` column
# (PostgreSQL `SearchVectorField`) to two models of the `core` app.
class Migration(migrations.Migration):

# Must run after migration 0038 of the same app.
dependencies = [
('core', '0038_auto_20170728_1748'),
]

operations = [
# Same field on the `historicalreimbursement` model...
migrations.AddField(
model_name='historicalreimbursement',
name='search_vector',
field=django.contrib.postgres.search.SearchVectorField(null=True),
),
# ...and on the `reimbursement` model itself. null=True lets existing
# rows keep an empty value when the column is added.
migrations.AddField(
model_name='reimbursement',
name='search_vector',
field=django.contrib.postgres.search.SearchVectorField(null=True),
),
]
20 changes: 20 additions & 0 deletions jarbas/core/migrations/0040_create_gin_index_with_search_vector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.11.4 on 2017-09-28 02:37
from __future__ import unicode_literals

import django.contrib.postgres.indexes
from django.db import migrations


# Schema migration: creates a GIN index over `reimbursement.search_vector`.
class Migration(migrations.Migration):

# Requires the column added by migration 0039.
dependencies = [
('core', '0039_add_search_vector_to_reimbursement'),
]

operations = [
# The index name is an explicit literal; keep it unchanged.
# NOTE(review): presumably this is the name Django generated for the
# model's Meta.indexes entry -- confirm before renaming anything.
migrations.AddIndex(
model_name='reimbursement',
index=django.contrib.postgres.indexes.GinIndex(fields=['search_vector'], name='core_reimbu_search__ba9b2f_gin'),
),
]
5 changes: 5 additions & 0 deletions jarbas/core/models.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from django.contrib.postgres.fields import JSONField
from django.contrib.postgres.indexes import GinIndex
from django.contrib.postgres.search import SearchVectorField
from django.db import models
from requests import head
from simple_history.models import HistoricalRecords
Expand Down Expand Up @@ -79,6 +81,8 @@ class Reimbursement(models.Model):
receipt_url = models.CharField('URL do Documento Fiscal', max_length=140, blank=True, null=True)
receipt_text = models.TextField('Texto do Recibo', blank=True, null=True)

search_vector = SearchVectorField(null=True)

history = HistoricalRecords()

objects = models.Manager.from_queryset(ReimbursementQuerySet)()
Expand All @@ -88,6 +92,7 @@ class Meta:
verbose_name = 'reembolso'
verbose_name_plural = 'reembolsos'
index_together = [['year', 'issue_date', 'id']]
indexes = [GinIndex(fields=['search_vector'])]

def get_receipt_url(self, force=False, bulk=False):
if self.receipt_url:
Expand Down
17 changes: 17 additions & 0 deletions jarbas/core/tests/test_searchvector_command.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from unittest.mock import patch

from django.test import TestCase

from jarbas.core.management.commands.searchvector import Command
from jarbas.core.models import Reimbursement


class TestCommandHandler(TestCase):
    """Checks the observable calls made by the `searchvector` command."""

    def test_handler(self):
        """`handle()` prints twice and issues exactly one `update()`."""
        print_target = 'jarbas.core.management.commands.searchvector.print'

        # Both patches are active only while the command runs.
        with patch.object(Reimbursement.objects, 'update') as update, \
                patch(print_target) as print_:
            Command().handle()

        self.assertEqual(2, print_.call_count)
        self.assertEqual(1, update.call_count)
4 changes: 2 additions & 2 deletions jarbas/core/tests/test_tweet_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@
class TestTweet(TestCase):

def setUp(self):
self.tweet = mixer.blend(Tweet, status=42)
self.tweet = mixer.blend(Tweet, reimbursement__search_vector=None, status=42)

def test_ordering(self):
mixer.blend(Tweet, status=1)
mixer.blend(Tweet, reimbursement__search_vector=None, status=1)
self.assertEqual(42, Tweet.objects.first().status)

def test_get_url(self):
Expand Down
14 changes: 9 additions & 5 deletions jarbas/core/tests/test_tweets_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def test_non_existent_reimbursement(self, save_tweet, document_ids):
@patch.object(Command, 'document_ids', new_callable=PropertyMock)
@patch.object(Command, 'save_tweet')
def test_existing_tweet(self, save_tweet, document_ids):
reimbursement = mixer.blend(Reimbursement, document_id=123456)
reimbursement = mixer.blend(Reimbursement, search_vector=None, document_id=123456)
mixer.blend(Tweet, status=42, reimbursement=reimbursement)

document_ids.return_value = ((42, 123456),)
Expand All @@ -82,7 +82,7 @@ def test_existing_tweet(self, save_tweet, document_ids):
@patch.object(Command, 'document_ids', new_callable=PropertyMock)
@patch.object(Command, 'save_tweet')
def test_new_tweet(self, save_tweet, document_ids):
obj = mixer.blend(Reimbursement, document_id=123456)
obj = mixer.blend(Reimbursement, search_vector=None, document_id=123456)
document_ids.return_value = ((42, 123456),)
with self.settings(**self.credentials):
Command().handle()
Expand All @@ -109,7 +109,7 @@ def test_get_document_id(self):

def test_save_tweet(self):
status = 9999999999999999999999999
reimbursement = mixer.blend(Reimbursement)
reimbursement = mixer.blend(Reimbursement, search_vector=None)
command = Command()
command.log = MagicMock()
command.save_tweet(reimbursement, status)
Expand All @@ -119,7 +119,7 @@ def test_save_tweet(self):

def test_save_duplicated_tweet(self):
status = 9999999999999999999999999
reimbursement = mixer.blend(Reimbursement)
reimbursement = mixer.blend(Reimbursement, search_vector=None)
tweet = mixer.blend(Tweet, status=status, reimbursement=reimbursement)
command = Command()
command.log = MagicMock()
Expand Down Expand Up @@ -150,7 +150,11 @@ def test_tweets_with_clean_database(self, api):

@patch('jarbas.core.management.commands.tweets.twitter.Api')
def test_tweets_with_database(self, api):
tweet = mixer.blend(Tweet, status=random_tweet_status())
tweet = mixer.blend(
Tweet,
reimbursement__search_vector=None,
status=random_tweet_status()
)
api.return_value.GetUserTimeline.return_value = range(3)
with self.settings(**self.credentials):
command = Command()
Expand Down
26 changes: 16 additions & 10 deletions jarbas/dashboard/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from brazilnum.cnpj import format_cnpj
from brazilnum.cpf import format_cpf
from django.contrib.admin import SimpleListFilter
from django.contrib.postgres.search import SearchQuery, SearchRank
from django.db.models import F
from django.forms.widgets import Widget
from simple_history.admin import SimpleHistoryAdmin

Expand Down Expand Up @@ -247,16 +249,7 @@ class ReimbursementModelAdmin(SimpleHistoryAdmin):
# 'still_available',
)

search_fields = (
'applicant_id',
'cnpj_cpf',
'congressperson_name',
'document_id',
'party',
'state',
'supplier',
'subquota_description',
)
search_fields = ('search_vector',)

list_filter = (
SuspiciousListFilter,
Expand Down Expand Up @@ -376,5 +369,18 @@ def formfield_for_dbfield(self, db_field, **kwargs):
kwargs['widget'] = widgets.get(db_field.name)
return super().formfield_for_dbfield(db_field, **kwargs)

def get_search_results(self, request, queryset, search_term):
    """Search reimbursements with PostgreSQL full text search.

    Args:
        request: the current admin request, passed through to the parent
            implementation when there is nothing to search for.
        queryset: the queryset handed in by the admin; the search is
            applied on top of it.
        search_term: the text typed in the admin search box.

    Returns:
        A `(queryset, use_distinct)` tuple. With an empty `search_term`
        this is whatever the parent class returns; otherwise it is the
        matching rows ordered by descending rank, and `False`.
    """
    if not search_term:
        return super().get_search_results(request, queryset, search_term)

    query = SearchQuery(search_term, config='portuguese')
    rank = SearchRank(F('search_vector'), query)

    # Narrow the queryset we were given rather than starting over from
    # `Reimbursement.objects`: starting over would silently drop whatever
    # restrictions (e.g. list filters) the caller had already applied.
    queryset = queryset.annotate(rank=rank) \
        .filter(search_vector=query) \
        .order_by('-rank')

    return queryset, False


dashboard.register(Reimbursement, ReimbursementModelAdmin)
2 changes: 1 addition & 1 deletion jarbas/dashboard/tests/test_dashboard_view.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
class TestDashboard(TestCase):

def setUp(self):
obj = mixer.blend(Reimbursement)
obj = mixer.blend(Reimbursement, search_vector=None)
self.urls = (
resolve_url('dashboard:index'),
resolve_url('dashboard:core_reimbursement_changelist'),
Expand Down

0 comments on commit 8e4dae7

Please sign in to comment.