WIP, pass one at populate-json recipe

Gregory-Pereira · Gregory-Pereira · commit dbffd1f14bc1 · 2024-04-21T13:40:54.000-07:00
Signed-off-by: greg pereira &lt;grpereir@redhat.com&gt;
diff --git a/recipes/natural_language_processing/text-to-json/Makefile b/recipes/natural_language_processing/text-to-json/Makefile
@@ -0,0 +1,17 @@
+MODEL_URL ?=
+MODEL_NAME ?=
+MODEL_DIR ?= models
+
+.PHONY: download-model
+# Fetch $(MODEL_URL) into $(MODEL_DIR)/$(MODEL_NAME). The download is written
+# to a .tmp file and only moved into place when curl succeeds; if curl or the
+# move fails, both the partial and the final file are removed.
+download-model:
+	mkdir -p $(MODEL_DIR)
+	curl -H "Cache-Control: no-cache" -s -S -L -f $(MODEL_URL) -z $(MODEL_DIR)/$(MODEL_NAME) -o $(MODEL_DIR)/$(MODEL_NAME).tmp && \
+	mv -f $(MODEL_DIR)/$(MODEL_NAME).tmp $(MODEL_DIR)/$(MODEL_NAME) 2>/dev/null || \
+	rm -f $(MODEL_DIR)/$(MODEL_NAME).tmp $(MODEL_DIR)/$(MODEL_NAME)
+
+.PHONY: download-model-mistral # default model
+# Re-invoke this Makefile's download-model target with the Mistral 7B Instruct
+# GGUF file selected. $(MAKE) is used so the sub-make inherits the parent's
+# flags (-n, -j, ...) instead of starting an unrelated `make` process.
+download-model-mistral:
+	MODEL_NAME=mistral-7b-instruct-v0.1.Q4_K_M.gguf MODEL_URL=https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q4_K_M.gguf $(MAKE) -f Makefile download-model
+
+.PHONY: download-default-json-grammar
+# Download llama.cpp's reference JSON grammar to grammars/json.gbnf.
+# -f makes curl fail on HTTP errors instead of saving the error page as the
+# grammar; --create-dirs creates grammars/ when it does not exist yet.
+download-default-json-grammar:
+	curl -sSfL --create-dirs -o grammars/json.gbnf https://raw.githubusercontent.com/ggerganov/llama.cpp/master/grammars/json.gbnf
diff --git a/recipes/natural_language_processing/text-to-json/README.md b/recipes/natural_language_processing/text-to-json/README.md
@@ -0,0 +1,7 @@
+Steps:
+
+1. begin local dev
+2. Work on datasources 
+    - connect with SRE teams to figure out ways we could get SRE tickets normalized into a training dataset easily ingested by the model
+    - scrape stackoverflow, stackexchange, and medium for training data
+3. deploy with langserve
diff --git a/recipes/natural_language_processing/text-to-json/requirements.txt b/recipes/natural_language_processing/text-to-json/requirements.txt
@@ -0,0 +1,42 @@
+aiohttp==3.9.5
+aiosignal==1.3.1
+annotated-types==0.6.0
+anyio==4.3.0
+attrs==23.2.0
+certifi==2024.2.2
+charset-normalizer==3.3.2
+dataclasses-json==0.6.4
+fastapi==0.110.2
+frozenlist==1.4.1
+h11==0.14.0
+httpcore==1.0.5
+httpx==0.27.0
+idna==3.7
+jsonpatch==1.33
+jsonpointer==2.4
+langchain==0.1.16
+langchain-community==0.0.34
+langchain-core==0.1.45
+langchain-text-splitters==0.0.1
+langserve==0.1.0
+langsmith==0.1.49
+marshmallow==3.21.1
+multidict==6.0.5
+mypy-extensions==1.0.0
+numpy==1.26.4
+orjson==3.10.1
+packaging==23.2
+pathlib==1.0.1
+pydantic==2.7.0
+pydantic_core==2.18.1
+PyYAML==6.0.1
+requests==2.31.0
+sniffio==1.3.1
+SQLAlchemy==2.0.29
+starlette==0.37.2
+tenacity==8.2.3
+typing-inspect==0.9.0
+typing_extensions==4.11.0
+urllib3==2.2.1
+validators==0.28.1
+yarl==1.9.4
diff --git a/recipes/natural_language_processing/text-to-json/source/main.py b/recipes/natural_language_processing/text-to-json/source/main.py
@@ -0,0 +1,48 @@
+import json
+import os
+from pathlib import Path
+from pprint import pprint
+
+from langchain.schema.runnable import Runnable
+from langchain_community.llms import LlamaCpp
+from langchain_core.callbacks import CallbackManager, StreamingStdOutCallbackHandler
+from langchain_core.prompts.prompt import PromptTemplate
+# from langchain_text_splitters import RecursiveJsonSplitter
+
+model_path="/Users/gregpereirapereira/Documents/tech/work/red-hat/code/containers/ai-lab-recipes/models/mistral-7b-instruct-v0.1.Q4_K_M.gguf"
+n_gpu_layers = -1  # This has been compiled with METAL framework all GPU for mac ARM64
+n_batch = 512  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU. Using default
+
+callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
+
+def no_download_json_chain(file_name: str, input: str) -> Runnable:
+    """Return a runnable."""
+    json_schema = json.loads(Path(f"schemas/{file_name}").read_text())
+    json_schema_string = json.dumps(json_schema)
+    print("schema tokens: ", len(json_schema_string))
+    # dropping chunk splitting --> moving to a model with bigger token input
+    # splitter = RecursiveJsonSplitter(max_chunk_size=300) 
+    # json_chunks = splitter.split_json(json_data=json_data)
+    template = """
+        The user a JSON schema, and some text. Return to me a JSON object based on schema and by selecting the appropriate selections of the user text.
+        %JSON schema
+        {json_schema}
+        %User input:
+        {input}
+    """
+    template = template.format(json_schema=json_schema, input=input)
+    print("Token usage: ", len(template))
+    model = LlamaCpp(
+            model_path=model_path,
+            n_gpu_layers=n_gpu_layers,
+            n_batch=n_batch,
+            echo=True,
+            callback_manager=callback_manager,
+            verbose=True,
+            max_tokens=4000,
+            temperature=0.05,
+
+        )
+    return model | template
+    
+# Examples for when running as a non langserve route / non Runnable
+# no_download_json_chain("fruit.json", "A red banana.")
+# no_download_json_chain("employee.json", "My name is Gregory Pereira. I work in the Emerging Technologies department and the Platform and Services team. I like apples.")
diff --git a/recipes/natural_language_processing/text-to-json/source/schemas/car.json b/recipes/natural_language_processing/text-to-json/source/schemas/car.json
@@ -0,0 +1,36 @@
+{
+    "$schema": "http://json-schema.org/draft-07/schema#",
+    "title": "Car",
+    "type": "object",
+    "properties": {
+      "make": {
+        "type": "string",
+        "description": "The make or manufacturer of the car"
+      },
+      "model": {
+        "type": "string",
+        "description": "The model of the car"
+      },
+      "year": {
+        "type": "integer",
+        "minimum": 1900,
+        "maximum": 2024,
+        "description": "The manufacturing year of the car"
+      },
+      "color": {
+        "type": "string",
+        "description": "The color of the car"
+      },
+      "mileage": {
+        "type": "number",
+        "minimum": 0,
+        "description": "The mileage of the car in kilometers"
+      },
+      "price": {
+        "type": "number",
+        "minimum": 0,
+        "description": "The price of the car in USD"
+      }
+    },
+    "required": ["make", "model", "year", "color", "mileage", "price"]
+}
diff --git a/recipes/natural_language_processing/text-to-json/source/schemas/computer.json b/recipes/natural_language_processing/text-to-json/source/schemas/computer.json
@@ -0,0 +1,78 @@
+{
+    "$schema": "http://json-schema.org/draft-07/schema#",
+    "title": "Computer",
+    "type": "object",
+    "properties": {
+      "brand": {
+        "type": "string",
+        "description": "The brand or manufacturer of the computer"
+      },
+      "model": {
+        "type": "string",
+        "description": "The model of the computer"
+      },
+      "processor": {
+        "type": "object",
+        "description": "Details about the processor of the computer",
+        "properties": {
+          "manufacturer": {
+            "type": "string",
+            "description": "The manufacturer of the processor"
+          },
+          "model": {
+            "type": "string",
+            "description": "The model of the processor"
+          },
+          "cores": {
+            "type": "integer",
+            "minimum": 1,
+            "description": "The number of processor cores"
+          },
+          "clock_speed": {
+            "type": "number",
+            "minimum": 0,
+            "description": "The clock speed of the processor in GHz"
+          }
+        },
+        "required": ["manufacturer", "model", "cores", "clock_speed"]
+      },
+      "ram": {
+        "type": "object",
+        "description": "Details about the RAM of the computer",
+        "properties": {
+          "size_gb": {
+            "type": "number",
+            "minimum": 0,
+            "description": "The size of RAM in gigabytes"
+          },
+          "type": {
+            "type": "string",
+            "description": "The type of RAM (e.g., DDR4)"
+          }
+        },
+        "required": ["size_gb", "type"]
+      },
+      "storage": {
+        "type": "object",
+        "description": "Details about the storage of the computer",
+        "properties": {
+          "type": {
+            "type": "string",
+            "description": "The type of storage (e.g., SSD, HDD)"
+          },
+          "capacity_gb": {
+            "type": "number",
+            "minimum": 0,
+            "description": "The capacity of storage in gigabytes"
+          }
+        },
+        "required": ["type", "capacity_gb"]
+      },
+      "price": {
+        "type": "number",
+        "minimum": 0,
+        "description": "The price of the computer in USD"
+      }
+    },
+    "required": ["brand", "model", "processor", "ram", "storage", "price"]
+  }
diff --git a/recipes/natural_language_processing/text-to-json/source/schemas/employee.json b/recipes/natural_language_processing/text-to-json/source/schemas/employee.json
@@ -0,0 +1,59 @@
+{
+    "$schema": "http://json-schema.org/draft-07/schema#",
+    "$id": "https://github.com/containers/ai-lab-recipes/recipes/json-to-text/employee.schema.json",
+    "title": "employee",
+    "description": "Acme's Employee Information",
+    "type": "object",
+    "properties": {
+        "name": {
+            "description": "The employee's full name",
+            "type": "string"
+        },
+        "employeeId": {
+            "description": "The unique identifier for an employee",
+            "type": "integer"
+      },
+        "title": {
+            "description": "An Identifier for what position the employee holds within the company",
+            "type": "string"
+      },
+        "manager": {
+            "description": "Who sits above the employee in the Org chart and is responsible for managing them.",
+            "type": "object",
+            "items": { 
+              "$ref": "#" 
+            }
+      },
+        "teams": {
+            "description": "Which products, services or other initiatives is this employee responsible for contributing to.",
+            "type": "array",
+            "items": {
+                "type": "object",
+                "properties": {
+                    "teamName": {
+                        "description": "A name used to refer to and distinguish between teams.",
+                        "type": "string"
+                    },
+                    "teamId": {
+                        "description": "A unique integer used to identify a team.",
+                        "type": "integer"
+                    },
+                    "leader": {
+                        "description": "The one who is responsible for guiding the team.",
+                        "items": { 
+                          "$ref": "#" 
+                        }
+                    },
+                    "description": {
+                        "description": "A short blurb giving information on the team.",
+                        "type": "string"
+                    }
+                },
+                "required": ["teamName", "teamId"]
+            },
+        "minItems": 1,
+        "uniqueItems": true
+        }
+    },
+    "required": ["employeeId", "title", "teams"]
+}
diff --git a/recipes/natural_language_processing/text-to-json/source/schemas/fruit.json b/recipes/natural_language_processing/text-to-json/source/schemas/fruit.json
@@ -0,0 +1,16 @@
+{
+    "$schema": "http://json-schema.org/draft-07/schema#",
+    "title": "Fruit",
+    "type": "object",
+    "properties": {
+      "name": {
+        "type": "string",
+        "description": "The name of the fruit"
+      },
+      "color": {
+        "type": "string",
+        "description": "The color of the fruit"
+      }
+    },
+    "required": ["name", "color"]
+}
diff --git a/recipes/natural_language_processing/text-to-json/source/server.py b/recipes/natural_language_processing/text-to-json/source/server.py
@@ -0,0 +1,18 @@
+from fastapi import FastAPI
+from langserve import add_routes
+
+from __main__ import no_download_json_chain
+
+
+app=FastAPI(
+    title="Langchain Server",
+    version="1.0",
+    description="A simple API Server"
+)
+
+add_routes(app, no_download_json_chain())
+
+if __name__ == "__main__":
+    import uvicorn
+
+    uvicorn.run(app, host="0.0.0.0", port=8001)
diff --git a/recipes/natural_language_processing/text-to-json/source/utils.py b/recipes/natural_language_processing/text-to-json/source/utils.py
@@ -0,0 +1,14 @@
+import validators.url
+from urllib.error import URLError, HTTPError
+import urllib.request
+
+def download_json_file(url: str, file_name: str):
+    if validators.url(url):
+        try:
+            urllib.request.urlretrieve(url, file_name)
+        except HTTPError as e:
+            print(f"HTTP Error: {e.code}, {e.reason}")
+        except URLError as e:
+            print(f"URL Error: {e.reason}")
+        except Exception as e:
+            print(f"An unexpected error occurred: {e}")
diff --git a/recipes/natural_language_processing/text-to-json/utils/json_schema_to_grammar.py b/recipes/natural_language_processing/text-to-json/utils/json_schema_to_grammar.py