Improve YAML configuration for "demo-question-answering" (#7230)

lucametehau · Manul from Pathway · commit 3d6423bbb6a7 · 2024-09-03T10:40:32.000Z
GitOrigin-RevId: 78ef14280c258d04dde139a51d7ff52e26058cf3
diff --git a/examples/pipelines/demo-question-answering/app.py b/examples/pipelines/demo-question-answering/app.py
@@ -1,14 +1,12 @@
 import logging
-import sys
 
-import click
 import pathway as pw
-import yaml
 from dotenv import load_dotenv
-from pathway.udfs import DiskCache, ExponentialBackoffRetryStrategy
-from pathway.xpacks.llm import embedders, llms, parsers, splitters
+from pathway.xpacks import llm
 from pathway.xpacks.llm.question_answering import BaseRAGQuestionAnswerer
 from pathway.xpacks.llm.vector_store import VectorStoreServer
+from pydantic import BaseModel, ConfigDict, InstanceOf
+from typing_extensions import TypedDict
 
 # To use advanced features with Pathway Scale, get your free license key from
 # https://pathway.com/features and paste it below.
@@ -23,77 +21,41 @@
 
 load_dotenv()
 
+host_config = TypedDict("host_config", {"host": str, "port": int})
 
-def data_sources(source_configs) -> list[pw.Table]:
-    sources = []
-    for source_config in source_configs:
-        if source_config["kind"] == "local":
-            source = pw.io.fs.read(
-                **source_config["config"],
-                format="binary",
-                with_metadata=True,
-            )
-            sources.append(source)
-        elif source_config["kind"] == "gdrive":
-            source = pw.io.gdrive.read(
-                **source_config["config"],
-                with_metadata=True,
-            )
-            sources.append(source)
-        elif source_config["kind"] == "sharepoint":
-            try:
-                import pathway.xpacks.connectors.sharepoint as io_sp
-
-                source = io_sp.read(**source_config["config"], with_metadata=True)
-                sources.append(source)
-            except ImportError:
-                print(
-                    "The Pathway Sharepoint connector is part of the commercial offering, "
-                    "please contact us for a commercial license."
-                )
-                sys.exit(1)
-
-    return sources
-
-
-@click.command()
-@click.option("--config_file", default="config.yaml", help="Config file to be used.")
-def run(
-    config_file: str = "config.yaml",
-):
-    with open(config_file) as config_f:
-        configuration = yaml.safe_load(config_f)
-
-    GPT_MODEL = configuration["llm_config"]["model"]
-
-    embedder = embedders.OpenAIEmbedder(
-        model="text-embedding-ada-002",
-        cache_strategy=DiskCache(),
-    )
-
-    chat = llms.OpenAIChat(
-        model=GPT_MODEL,
-        retry_strategy=ExponentialBackoffRetryStrategy(max_retries=6),
-        cache_strategy=DiskCache(),
-        temperature=0.05,
-    )
-
-    host_config = configuration["host_config"]
-    host, port = host_config["host"], host_config["port"]
-
-    doc_store = VectorStoreServer(
-        *data_sources(configuration["sources"]),
-        embedder=embedder,
-        splitter=splitters.TokenCountSplitter(max_tokens=400),
-        parser=parsers.ParseUnstructured(),
-    )
-
-    rag_app = BaseRAGQuestionAnswerer(llm=chat, indexer=doc_store)
-
-    rag_app.build_server(host=host, port=port)
-
-    rag_app.run_server(with_cache=True, terminate_on_error=False)
+
+class App(BaseModel):
+    llm: InstanceOf[pw.UDF]
+    embedder: InstanceOf[llm.embedders.BaseEmbedder]
+    splitter: InstanceOf[pw.UDF]
+    parser: InstanceOf[pw.UDF]
+
+    sources: list[InstanceOf[pw.Table]]
+
+    host_config: host_config
+
+    def run(self, config_file: str = "config.yaml") -> None:
+        # Server bind address from host_config; NOTE: config_file is not used here.
+        host, port = self.host_config["host"], self.host_config["port"]
+
+        doc_store = VectorStoreServer(
+            *self.sources,
+            embedder=self.embedder,
+            splitter=self.splitter,
+            parser=self.parser,
+        )
+
+        rag_app = BaseRAGQuestionAnswerer(llm=self.llm, indexer=doc_store)
+
+        rag_app.build_server(host=host, port=port)
+
+        rag_app.run_server(with_cache=True, terminate_on_error=False)
+
+    model_config = ConfigDict(extra="forbid")
 
 
 if __name__ == "__main__":
-    run()
+    with open("config.yaml") as f:
+        config = pw.load_yaml(f)
+    app = App(**config)
+    app.run()
diff --git a/examples/pipelines/demo-question-answering/config.yaml b/examples/pipelines/demo-question-answering/config.yaml
@@ -1,39 +1,46 @@
-llm_config:
+llm: !pw.xpacks.llm.llms.OpenAIChat
   model: "gpt-3.5-turbo"
+  retry_strategy: !pw.udfs.ExponentialBackoffRetryStrategy
+    max_retries: 6
+  cache_strategy: !pw.udfs.DiskCache
+  temperature: 0.05
+  capacity: 8
+
+embedder: !pw.xpacks.llm.embedders.OpenAIEmbedder
+  model: "text-embedding-ada-002"
+  cache_strategy: !pw.udfs.DiskCache
+
 host_config:
   host: "0.0.0.0"
-  port: 8000
-cache_options:
-  with_cache: True
-  cache_folder: "./Cache"
+  port: 16003
+
+splitter: !pw.xpacks.llm.splitters.TokenCountSplitter
+  max_tokens: 400
+
+parser: !pw.xpacks.llm.parsers.ParseUnstructured
+
 sources:
-  - local_files:
-    kind: local
-    config:
-      # Please refer to
-      # https://pathway.com/developers/api-docs/pathway-io/fs#pathway.io.fs.read
-      # for options definition
-      path: "data/"
-  # - google_drive_folder:
-  #   kind: gdrive
-  #   config:
-  #     # Please refer to
-  #     # https://pathway.com/developers/api-docs/pathway-io/gdrive#pathway.io.gdrive.read
-  #     # for options definition
-  #     # Please follow https://pathway.com/developers/user-guide/connectors/gdrive-connector/#setting-up-google-drive
-  #     # for instructions on getting credentials
-  #     object_id: "1cULDv2OaViJBmOfG5WB0oWcgayNrGtVs" # folder used in the managed demo
-  #     service_user_credentials_file: SERVICE_CREDENTIALS
-  #     refresh_interval: 5
-  # - sharepoint_folder:
-  #   kind: sharepoint
-  #   config:
-  #     # The sharepoint is part of our commercial offering, please contact us to use it
-  #     # Please contact here: `contact@pathway.com`
-  #     root_path: ROOT_PATH
-  #     url: SHAREPOINT_URL
-  #     tenant: SHAREPOINT_TENANT
-  #     client_id: SHAREPOINT_CLIENT_ID
-  #     cert_path: SHAREPOINT.pem
-  #     thumbprint: SHAREPOINT_THUMBPRINT
-  #     refresh_interval: 5
+  - !pw.io.fs.read
+    path: data
+    format: binary
+    with_metadata: true
+
+  # - !pw.xpacks.connectors.sharepoint.read
+  #   url: $SHAREPOINT_URL
+  #   tenant: $SHAREPOINT_TENANT
+  #   client_id: $SHAREPOINT_CLIENT_ID
+  #   cert_path: sharepointcert.pem
+  #   thumbprint: $SHAREPOINT_THUMBPRINT
+  #   root_path: $SHAREPOINT_ROOT
+  #   with_metadata: true
+  #   refresh_interval: 30
+
+  # - !pw.io.gdrive.read
+  #   object_id: $DRIVE_ID
+  #   service_user_credentials_file: gdrive_indexer.json
+  #   name_pattern:
+  #     - "*.pdf"
+  #     - "*.pptx"
+  #   object_size_limit: null
+  #   with_metadata: true
+  #   refresh_interval: 30