Skip to content
This repository was archived by the owner on Jun 10, 2022. It is now read-only.

Commit 11ccd4b

Browse files
committed Jun 29, 2021
chore: hub to requirements
1 parent 0c1ac24 commit 11ccd4b

File tree

3 files changed

+79
-66
lines changed

3 files changed

+79
-66
lines changed
 

‎backend/app.py

+26-65
Original file line number | Diff line number | Diff line change
@@ -2,109 +2,70 @@
22
__license__ = "Apache-2.0"
33

44
import os
5-
import itertools
6-
import csv
75
import shutil
8-
import click
96
import sys
7+
import click
108
from backend_config import (
11-
text_length,
129
max_docs,
13-
backend_datafile,
14-
backend_port,
15-
backend_workdir,
16-
backend_model,
10+
datafile,
11+
port,
12+
workdir,
13+
model
1714
)
1815

1916
from executors.disk_indexer import DiskIndexer
20-
from executors.rankers import ReviewRanker
21-
from executors.encoders import MyTransformer
22-
import random
23-
24-
from jina import Flow, Document
17+
from helper import prep_docs
18+
from jina import Flow
2519

2620
try:
2721
__import__("pretty_errors")
2822
except ImportError:
2923
pass
3024

3125

32-
def trim_string(
33-
input_string: str, word_count: int = text_length, sep: str = " "
34-
) -> str:
35-
"""
36-
Trim a string to a certain number of words.
37-
:param input_string: string to trim
38-
:param word_count: how many words to trim to
39-
:param sep: separator between words
40-
:return: trimmed string
41-
"""
42-
sanitized_string = input_string.replace("\\n", sep)
43-
words = sanitized_string.split(sep)[:word_count]
44-
trimmed_string = " ".join(words)
45-
46-
return trimmed_string
47-
48-
49-
def prep_docs(input_file: str, num_docs:int=max_docs):
26+
def index(num_docs: int = max_docs):
5027
"""
51-
Create generator for every row in csv as a Document
52-
:param input_file: Input csv filename
53-
:return: Generator
28+
Build an index for your search
29+
:param num_docs: maximum number of Documents to index
5430
"""
55-
56-
with open(input_file, "r") as csv_file:
57-
csv_reader = csv.DictReader(csv_file)
58-
input_field = "Description"
59-
for row in itertools.islice(csv_reader, num_docs):
60-
# Fix invalid ratings and counts
61-
if row["Average User Rating"] == "":
62-
row["Average User Rating"] = random.uniform(0.0, 5.0)
63-
if row["User Rating Count"] == "":
64-
row["User Rating Count"] = random.randint(10, 10_000)
65-
# Set field to encode and index
66-
input_data = trim_string(f"{row['Name']} - {trim_string(row[input_field])}")
67-
# Put all of that into a doc
68-
doc = Document(text=input_data)
69-
doc.tags = row
70-
yield doc
71-
72-
73-
def index(num_docs=max_docs):
7431
flow = (
7532
Flow()
76-
# .add(uses='jinahub+docker://TransformerTorchEncoder', pretrained_model_name_or_path="sentence-transformers/msmarco-distilbert-base-v3", name="encoder", max_length=50)
7733
.add(
78-
uses=MyTransformer,
79-
pretrained_model_name_or_path=backend_model,
34+
uses="jinahub+docker://TransformerTorchEncoder",
35+
pretrained_model_name_or_path=model,
8036
name="encoder",
81-
).add(uses=DiskIndexer, workspace=backend_workdir, name="indexer")
37+
max_length=50,
38+
)
39+
.add(uses=DiskIndexer, workspace=workdir)
8240
)
8341

8442
with flow:
8543
flow.post(
8644
on="/index",
87-
inputs=prep_docs(input_file=backend_datafile, num_docs=num_docs),
45+
inputs=prep_docs(input_file=datafile, num_docs=num_docs),
8846
request_size=64,
8947
read_mode="r",
9048
)
9149

9250

9351
def query_restful():
52+
"""
53+
Query your index
54+
"""
9455
flow = (
9556
Flow()
96-
# .add(uses='jinahub+docker://TransformerTorchEncoder', pretrained_model_name_or_path="sentence-transformers/msmarco-distilbert-base-v3", name="encoder", max_length=50)
9757
.add(
98-
uses=MyTransformer,
99-
pretrained_model_name_or_path=backend_model,
58+
uses="jinahub+docker://TransformerTorchEncoder",
59+
pretrained_model_name_or_path="sentence-transformers/msmarco-distilbert-base-v3",
10060
name="encoder",
101-
).add(uses=DiskIndexer, workspace=backend_workdir, name="indexer")
102-
# .add(uses=ReviewRanker, name="ranker")
61+
max_length=50,
62+
)
63+
.add(uses=DiskIndexer, workspace=workdir)
10364
)
10465

10566
with flow:
10667
flow.protocol = "http"
107-
flow.port_expose = backend_port
68+
flow.port_expose = port
10869
flow.block()
10970

11071

@@ -117,7 +78,7 @@ def query_restful():
11778
@click.option("--num_docs", "-n", default=max_docs)
11879
@click.option("--force", "-f", is_flag=True)
11980
def main(task: str, num_docs: int, force: bool):
120-
workspace = backend_workdir
81+
workspace = workdir
12182
if task == "index":
12283
if os.path.exists(workspace):
12384
if force:

‎backend/helper.py

+51
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,51 @@
1+
import random
2+
import itertools
3+
import csv
4+
from jina import Document
5+
from typing import Generator
6+
7+
from backend_config import (
8+
text_length,
9+
max_docs,
10+
)
11+
12+
13+
def trim_string(
14+
input_string: str, word_count: int = text_length, sep: str = " "
15+
) -> str:
16+
"""
17+
Trim a string to a certain number of words.
18+
:param input_string: string to trim
19+
:param word_count: how many words to trim to
20+
:param sep: separator between words
21+
:return: trimmed string
22+
"""
23+
sanitized_string = input_string.replace("\\n", sep)
24+
words = sanitized_string.split(sep)[:word_count]
25+
trimmed_string = " ".join(words)
26+
27+
return trimmed_string
28+
29+
30+
def prep_docs(input_file: str, num_docs: int = max_docs) -> Generator:
31+
"""
32+
Create generator for every row in csv as a Document
33+
:param input_file: Input csv filename
34+
:return: Generator
35+
"""
36+
37+
with open(input_file, "r") as csv_file:
38+
csv_reader = csv.DictReader(csv_file)
39+
input_field = "Description"
40+
for row in itertools.islice(csv_reader, num_docs):
41+
# Fix invalid ratings and counts
42+
if row["Average User Rating"] == "":
43+
row["Average User Rating"] = random.uniform(0.0, 5.0)
44+
if row["User Rating Count"] == "":
45+
row["User Rating Count"] = random.randint(10, 10_000)
46+
# Set field to encode and index
47+
input_data = trim_string(f"{row['Name']} - {trim_string(row[input_field])}")
48+
# Put all of that into a doc
49+
doc = Document(text=input_data)
50+
doc.tags = row
51+
yield doc

‎requirements.txt

+2-1
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1-
jina[http,transformers,torch]==2.0.0rc9.dev22
1+
jina[http,transformers,torch,hub]==2.0.0rc9.dev22
22
pretty-errors==1.2.21
33
streamlit==0.82.0
4+
docker==5.0

0 commit comments

Comments
 (0)