chore: hub to requirements

alexcg1 · alexcg1 · commit 11ccd4b70c05 · 2021-06-29T10:55:55.000+02:00
diff --git a/backend/app.py b/backend/app.py
@@ -2,109 +2,70 @@
 __license__ = "Apache-2.0"
 
 import os
-import itertools
-import csv
 import shutil
-import click
 import sys
+import click
 from backend_config import (
-    text_length,
     max_docs,
-    backend_datafile,
-    backend_port,
-    backend_workdir,
-    backend_model,
+    datafile,
+    port,
+    workdir,
+    model
 )
 
 from executors.disk_indexer import DiskIndexer
-from executors.rankers import ReviewRanker
-from executors.encoders import MyTransformer
-import random
-
-from jina import Flow, Document
+from helper import prep_docs
+from jina import Flow
 
 try:
     __import__("pretty_errors")
 except ImportError:
     pass
 
 
-def trim_string(
-    input_string: str, word_count: int = text_length, sep: str = " "
-) -> str:
-    """
-    Trim a string to a certain number of words.
-    :param input_string: string to trim
-    :param word_count: how many words to trim to
-    :param sep: separator between words
-    :return: trimmmed string
-    """
-    sanitized_string = input_string.replace("\\n", sep)
-    words = sanitized_string.split(sep)[:word_count]
-    trimmed_string = " ".join(words)
-
-    return trimmed_string
-
-
-def prep_docs(input_file: str, num_docs:int=max_docs):
+def index(num_docs: int = max_docs):
     """
-    Create generator for every row in csv as a Document
-    :param input_file: Input csv filename
-    :return: Generator
+    Post Documents from the configured datafile to /index through the encoder and DiskIndexer
+    :param num_docs: maximum number of Documents to index
     """
-
-    with open(input_file, "r") as csv_file:
-        csv_reader = csv.DictReader(csv_file)
-        input_field = "Description"
-        for row in itertools.islice(csv_reader, num_docs):
-            # Fix invalid ratings and counts
-            if row["Average User Rating"] == "":
-                row["Average User Rating"] = random.uniform(0.0, 5.0)
-            if row["User Rating Count"] == "":
-                row["User Rating Count"] = random.randint(10, 10_000)
-            # Set field to encode and index
-            input_data = trim_string(f"{row['Name']} - {trim_string(row[input_field])}")
-            # Put all of that into a doc
-            doc = Document(text=input_data)
-            doc.tags = row
-            yield doc
-
-
-def index(num_docs=max_docs):
     flow = (
         Flow()
-        # .add(uses='jinahub+docker://TransformerTorchEncoder', pretrained_model_name_or_path="sentence-transformers/msmarco-distilbert-base-v3", name="encoder", max_length=50)
         .add(
-            uses=MyTransformer,
-            pretrained_model_name_or_path=backend_model,
+            uses="jinahub+docker://TransformerTorchEncoder",
+            pretrained_model_name_or_path=model,
             name="encoder",
-        ).add(uses=DiskIndexer, workspace=backend_workdir, name="indexer")
+            max_length=50,
+        )
+        .add(uses=DiskIndexer, workspace=workdir)
     )
 
     with flow:
         flow.post(
             on="/index",
-            inputs=prep_docs(input_file=backend_datafile, num_docs=num_docs),
+            inputs=prep_docs(input_file=datafile, num_docs=num_docs),
             request_size=64,
             read_mode="r",
         )
 
 
 def query_restful():
+    """
+    Serve search over HTTP on the configured port, encoding queries with the same model as index()
+    """
     flow = (
         Flow()
-        # .add(uses='jinahub+docker://TransformerTorchEncoder', pretrained_model_name_or_path="sentence-transformers/msmarco-distilbert-base-v3", name="encoder", max_length=50)
         .add(
-            uses=MyTransformer,
-            pretrained_model_name_or_path=backend_model,
+            uses="jinahub+docker://TransformerTorchEncoder",
+            pretrained_model_name_or_path=model,
             name="encoder",
-        ).add(uses=DiskIndexer, workspace=backend_workdir, name="indexer")
-        # .add(uses=ReviewRanker, name="ranker")
+            max_length=50,
+        )
+        .add(uses=DiskIndexer, workspace=workdir)
     )
 
     with flow:
         flow.protocol = "http"
-        flow.port_expose = backend_port
+        flow.port_expose = port
         flow.block()
 
 
@@ -117,7 +78,7 @@ def query_restful():
 @click.option("--num_docs", "-n", default=max_docs)
 @click.option("--force", "-f", is_flag=True)
 def main(task: str, num_docs: int, force: bool):
-    workspace = backend_workdir
+    workspace = workdir
     if task == "index":
         if os.path.exists(workspace):
             if force:
diff --git a/backend/helper.py b/backend/helper.py
@@ -0,0 +1,51 @@
+import random
+import itertools
+import csv
+from jina import Document
+from typing import Generator
+
+from backend_config import (
+    text_length,
+    max_docs,
+)
+
+
+def trim_string(
+    input_string: str, word_count: int = text_length, sep: str = " "
+) -> str:
+    """
+    Keep only the leading words of a string.
+    :param input_string: text to shorten
+    :param word_count: maximum number of words to keep
+    :param sep: delimiter used to split the text into words
+    :return: the kept words joined by single spaces
+    """
+    # Literal backslash-n sequences are turned into separators first
+    flattened = input_string.replace("\\n", sep)
+    leading_words = flattened.split(sep)[:word_count]
+
+    return " ".join(leading_words)
+
+
+def prep_docs(input_file: str, num_docs: int = max_docs) -> Generator:
+    """
+    Create generator for every row in csv as a Document
+    :param input_file: Input csv filename
+    :param num_docs: maximum number of csv rows to turn into Documents
+    :return: Generator
+    """
+    # newline="" lets the csv module handle line breaks inside quoted fields
+    with open(input_file, "r", newline="") as csv_file:
+        csv_reader = csv.DictReader(csv_file)
+        for row in itertools.islice(csv_reader, num_docs):
+            # Fix invalid ratings and counts
+            if row["Average User Rating"] == "":
+                row["Average User Rating"] = random.uniform(0.0, 5.0)
+            if row["User Rating Count"] == "":
+                row["User Rating Count"] = random.randint(10, 10_000)
+            # Set field to encode and index
+            input_data = trim_string(f"{row['Name']} - {trim_string(row['Description'])}")
+            # Put all of that into a doc
+            doc = Document(text=input_data)
+            doc.tags = row
+            yield doc
diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,4 @@
-jina[http,transformers,torch]==2.0.0rc9.dev22
+jina[http,transformers,torch,hub]==2.0.0rc9.dev22
 pretty-errors==1.2.21
 streamlit==0.82.0
+docker==5.0