-
Notifications
You must be signed in to change notification settings - Fork 2
/
indexing.py
46 lines (37 loc) · 1.46 KB
/
indexing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import logging
import os
import pandas as pd
from tqdm import tqdm
from whoosh import fields, index, qparser, scoring
# Whoosh schema shared by index creation and query parsing in this module.
schema = fields.Schema(
    # Identifier fields: indexed as keywords and stored with each document.
    movie_id=fields.KEYWORD(scorable=True, stored=True),
    critic_id=fields.KEYWORD(scorable=True, stored=True),
    # Numeric review score, stored with each document.
    score=fields.NUMERIC(stored=True),
    # Review body: analysed text, stored with each document.
    review=fields.TEXT(stored=True),
    # Declared as a scorable keyword but not stored.
    freshness=fields.KEYWORD(scorable=True),
)
def create_index(df, index_name, schema):
    """Build a Whoosh index from a reviews DataFrame and return a searcher.

    Args:
        df: DataFrame whose rows supply the ``movie_id``, ``critic_id``,
            ``review`` and ``score`` values for each indexed document.
        index_name: Directory that holds the index; created if missing.
        schema: Whoosh schema used to create the index.

    Returns:
        A searcher with BM25F weighting, opened on the committed index.

    Only the four fields listed above are written for each document.
    NOTE(review): the module-level schema also declares ``freshness``,
    which is never populated here -- confirm whether that is intended.
    """
    os.makedirs(index_name, exist_ok=True)
    ix = index.create_in(index_name, schema)
    logging.info('Indexing reviews')
    # Use the writer as a context manager: it commits when the block exits
    # normally and cancels on an exception, so a failure part-way through
    # the rows does not leave the index write lock held.
    with ix.writer() as writer:
        rows = tqdm(df.iterrows(), total=df.shape[0],
                    desc='adding reviews into index')
        for _, review in rows:
            writer.add_document(
                movie_id=review['movie_id'],
                critic_id=review['critic_id'],
                review=review['review'],
                score=review['score'],
            )
        # The commit itself happens when the ``with`` block exits.
        logging.info('Committing reviews. This will take a while')
    return index.open_dir(index_name).searcher(weighting=scoring.BM25F)
def load_index(path, data):
    """Return a query parser and a searcher for the index under ``path``.

    The index lives in the ``index`` sub-directory of ``path``. If opening
    it raises ``EmptyIndexError``, a new index is built from ``data`` via
    ``create_index`` and the searcher it returns is used instead.

    Args:
        path: Parent directory of the ``index`` directory.
        data: Reviews passed to ``create_index`` when no index can be opened.

    Returns:
        A ``(parser, searcher)`` tuple; the parser searches the ``movie_id``
        and ``review`` fields of the module-level schema.
    """
    index_dir = os.path.join(path, 'index')
    searched_fields = ['movie_id', 'review']
    whoosh_parser = qparser.MultifieldParser(searched_fields, schema=schema)
    try:
        existing = index.open_dir(index_dir)
        searcher = existing.searcher(weighting=scoring.BM25F)
    except index.EmptyIndexError:
        # Nothing usable on disk yet: build the index from the given data.
        searcher = create_index(data, index_dir, schema)
    return whoosh_parser, searcher
if __name__ == '__main__':
    # Script entry point: read the tab-separated reviews file and build the
    # index under data/index using the module-level schema.
    reviews = pd.read_table('reviews.tsv')
    create_index(reviews, 'data/index', schema)