[WIP] [MIN-85] [Experiment] Intelligent Chunking #65

Open · wants to merge 8 commits into base: develop
18 changes: 10 additions & 8 deletions app/app.py
@@ -27,7 +27,7 @@


# Setup for chroma vector store
CHROMA_SERVER_HOST_NAME = "chroma-service.default"
CHROMA_SERVER_HOST_NAME = "localhost"
CHROMA_SERVER_PORT = 8000
DEFAULT_EMBED_MODEL = "base" # ["base", "large", "xl"]
N_CLOSEST_MATCHES = 3
@@ -99,7 +99,7 @@ def _get_prediction_endpoint() -> Optional[str]:
Returns:
Optional[str]: the url endpoint if it exists and is valid, None otherwise.
"""
return f"http://{SELDON_SERVICE_NAME}.{SELDON_NAMESPACE}:{SELDON_PORT}/v2/models/transformer/infer"
return f"http://localhost:9000/v2/models/transformer/infer"


@st.cache_data(show_spinner=False)
@@ -307,7 +307,7 @@ def show_disclaimer() -> None:
def show_settings() -> None:
"""Show inference settings on the sidebar."""
st.title("Settings")
-st.session_state.temperature = st.slider("Temperature", min_value=0.0, max_value=2.0, value=0.8)
+st.session_state.temperature = st.slider("Temperature", min_value=0.0, max_value=2.0, value=0.0)
st.session_state.max_length = st.slider("Max response length", min_value=50, max_value=500, value=300, step=1)
st.session_state.prompt_template = st.select_slider("Prompt template", options=["simple", "complex", "advanced"], value="simple")

@@ -383,6 +383,8 @@ def main() -> None:
embedding_function=embed_function,
)

+print(context)

# Create a dict of prompt and context
message = {"prompt_query": prompt, "context": context}

@@ -396,11 +398,11 @@

full_response += f"{source}: {assistant_response} \n"

-if metric_service_endpoint:
-    result = post_response_to_metric_service(
-        metric_service_endpoint, assistant_response
-    )
-    logging.info(result.text)
+# if metric_service_endpoint:
+#     result = post_response_to_metric_service(
+#         metric_service_endpoint, assistant_response
+#     )
+#     logging.info(result.text)

message_placeholder.markdown(full_response)

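For context, the constants changed above point the Streamlit app at a locally running Chroma server. A minimal, hypothetical sketch of querying such a server with the chromadb client (the collection name and query text are illustrative, and the real app supplies its own embedding function):

import chromadb

# Values mirroring the constants in app/app.py after this change.
CHROMA_SERVER_HOST_NAME = "localhost"
CHROMA_SERVER_PORT = 8000
N_CLOSEST_MATCHES = 3

# Connect to the local Chroma server and fetch the closest chunks for a query.
client = chromadb.HttpClient(host=CHROMA_SERVER_HOST_NAME, port=CHROMA_SERVER_PORT)
collection = client.get_or_create_collection("validated_docs")  # hypothetical collection name
results = collection.query(
    query_texts=["How can I manage panic attacks?"],
    n_results=N_CLOSEST_MATCHES,
)
print(results["documents"])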
4 changes: 2 additions & 2 deletions data/mind_data_validated.csv.dvc
@@ -1,5 +1,5 @@
outs:
-- md5: 027332cb6e506fd47d3962471c1246d2
-  size: 1859742
+- md5: c76568f0dd939286c633b2ed3a557630
+  size: 2336306
hash: md5
path: mind_data_validated.csv
4 changes: 2 additions & 2 deletions data/nhs_data_validated.csv.dvc
@@ -1,5 +1,5 @@
outs:
-- md5: 1534192fdce6f9c7bde650cf0374629c
-  size: 797289
+- md5: c4b3c09b6499a4faeec99fd7dbc27d4f
+  size: 811904
hash: md5
path: nhs_data_validated.csv
@@ -5,9 +5,13 @@ steps:
data_version: "data/second_version"
data_postfix: "raw"
reference_data_version: "data/second_version"
-clean_data:
+clean_data_mind:
enable_cache: False
-validate_data:
+clean_data_nhs:
+enable_cache: False
+validate_data_mind:
+enable_cache: False
+validate_data_nhs:
enable_cache: False
version_data:
enable_cache: False
16 changes: 9 additions & 7 deletions pipelines/data_preparation_pipeline/data_preparation_pipeline.py
@@ -1,5 +1,6 @@
"""Data preparation pipeline."""
from steps.data_preparation_steps import clean_data, validate_data
+from steps.data_preparation_steps.split_nhs_pages_step.split_nhs_pages_step import split_pages
from steps.data_versioning_steps import version_data
from steps.generic_steps import load_data
from zenml import pipeline
@@ -20,12 +21,13 @@ def data_preparation_pipeline() -> None:
"""
_, _, mind_df, nhs_df = load_data()

-mind_df = clean_data(mind_df)
-nhs_df = clean_data(nhs_df)
+mind_df = split_pages(mind_df, "mind", after=["load_data"], id="split_pages_mind")
+nhs_df = split_pages(nhs_df, "nhs", after=["load_data"], id="split_pages_nhs")

mind_df = validate_data(mind_df, "mind")
nhs_df = validate_data(nhs_df, "nhs")
mind_df = clean_data(mind_df, after=["split_pages_mind"], id="clean_data_mind")
nhs_df = clean_data(nhs_df, after=["split_pages_nhs"], id="clean_data_nhs")

-version_data(
-    after=["load_data", "clean_data", "validate_data"], data_postfix="validated"
-)
+mind_df = validate_data(mind_df, "mind", after=["clean_data_mind"], id="validate_data_mind")
+nhs_df = validate_data(nhs_df, "nhs", after=["clean_data_nhs"], id="validate_data_nhs")

+version_data(after=["validate_data_mind", "validate_data_nhs"], data_postfix="validated")
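For readers unfamiliar with the ordering syntax: the pipeline above relies on ZenML's invocation-level id and after arguments to fan the mind and nhs branches out and join them before versioning. A minimal toy sketch of the same pattern, assuming those arguments behave as used in the diff (step names here are illustrative, not from this repo):

from zenml import pipeline, step


@step
def prepare(source: str) -> str:
    """Toy stand-in for the split/clean/validate steps."""
    return f"prepared {source}"


@step
def finalize() -> None:
    """Toy stand-in for version_data; runs only after both branches finish."""
    print("all branches finished")


@pipeline
def toy_fanout_pipeline() -> None:
    # Each invocation gets an explicit id so later steps (and per-step config
    # keys such as clean_data_mind) can refer to it by name.
    prepare("mind", id="prepare_mind")
    prepare("nhs", id="prepare_nhs")
    finalize(after=["prepare_mind", "prepare_nhs"])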
@@ -149,11 +149,11 @@ def compute_embedding_drift(
payload = build_embedding_drift_payload(
reference_data_version, current_data_version, distance
)
-response = requests.post(
-    f"http://{MONITORING_METRICS_HOST_NAME}:{MONITORING_METRICS_PORT}/embedding_drift",
-    json=payload,
-)
+# response = requests.post(
+#     f"http://{MONITORING_METRICS_HOST_NAME}:{MONITORING_METRICS_PORT}/embedding_drift",
+#     json=payload,
+# )

-logger.info(response.text)
+# logger.info(response.text)

return float(distance)
@@ -267,7 +267,7 @@ def clean_data(data: pd.DataFrame) -> pd.DataFrame:
data["text_scraped"] = data["text_scraped"].map(lambda text: remove_pattern(r"Or see our page[^\.]+[\.]", text))
data["text_scraped"] = data["text_scraped"].map(lambda text: remove_pattern(r"Read more about[^\.]+[\.]", text))

-data = data.drop(data[data.text_scraped == ""].index)
+data = data[data.text_scraped != ""]
data = data.drop_duplicates()
data = data.drop(columns=["html_scraped"])

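The change above swaps an index-based drop for a boolean mask when removing rows whose scraped text is empty. A tiny sketch on toy data (not from the PR) of what that filter does:

import pandas as pd

# Toy frame standing in for the scraped data.
data = pd.DataFrame({
    "text_scraped": ["Anxiety can affect sleep.", "", "Try a breathing exercise."],
    "html_scraped": ["<p>a</p>", "", "<p>b</p>"],
})

data = data[data.text_scraped != ""]        # keep only rows with scraped text
data = data.drop_duplicates()
data = data.drop(columns=["html_scraped"])
print(data)  # two rows remain, text_scraped column only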
110 changes: 110 additions & 0 deletions steps/data_preparation_steps/split_nhs_pages_step/split_nhs_pages_step.py
@@ -0,0 +1,110 @@
"""Split NHS pages step."""
from typing import List, Dict

import pandas as pd
from bs4 import BeautifulSoup
from zenml import step


SOURCE_MAPPING = {
"nhs": {
"tag": "section",
"kwargs": {}
},
"mind": {
"tag": "div",
"kwargs": {
"class_": "column"
}
}
}


def split_html(html: str, tag: str, kwargs: Dict[str, str]) -> List[str]:
"""Split html using the tag and some other optional Beautiful Soup arguments.

Args:
html (str): HTML text to split
tag (str): tag to split by
kwargs (Dict[str, str]): Beautiful Soup keyword arguments

Returns:
List[str]: list of HTML strings
"""
soup = BeautifulSoup(html, "lxml")
sections = soup.find_all(tag, **kwargs)
return [str(section) for section in sections]


def split_page(data: pd.Series, source: str) -> pd.DataFrame:
"""Split a page.

Preserve other metadata.
* url is appended with an anchor suffix of the form '#section-{n}'
* uuid is appended with an anchor suffix of the form '-{n}'
* timestamp is kept as is

Args:
data (pd.Series): A single scraped page (NHS or Mind).
Index:
Name: uuid, dtype: object
Name: html_scraped, dtype: object
Name: timestamp, dtype: datetime64[ns]
Name: url, dtype: object
source (str): source of the page; determines how the page is split

Returns:
pd.DataFrame: The split page.
Index:
RangeIndex
Columns:
Name: uuid, dtype: object
Name: html_scraped, dtype: object
Name: timestamp, dtype: datetime64[ns]
Name: url, dtype: object
"""
params = SOURCE_MAPPING[source]
if data.html_scraped:
sections = split_html(data.html_scraped, params["tag"], params["kwargs"])
else:
sections = []
return pd.DataFrame(
[{
"uuid": f"{data.uuid}-{i}",
"html_scraped": section,
"timestamp": data.timestamp,
"url": f"{data.url}#section-{i}"
} for i, section in enumerate(sections)],
columns=["uuid", "html_scraped", "timestamp", "url"]
)


@step
def split_pages(data: pd.DataFrame, source: str) -> pd.DataFrame:
"""Split the NHS pages by the <section> tag.

Preserve other metadata.

Args:
data (pd.DataFrame): The scraped pages (NHS or Mind).
Index:
RangeIndex
Columns:
Name: uuid, dtype: object
Name: html_scraped, dtype: object
Name: timestamp, dtype: datetime64[ns]
Name: url, dtype: object
source (str): source of the page; determines how the page is split

Returns:
pd.DataFrame: The split data in the format described above.
Index:
RangeIndex
Columns:
Name: uuid, dtype: object
Name: html_scraped, dtype: object
Name: timestamp, dtype: datetime64[ns]
Name: url, dtype: object
"""
frames = data.apply(split_page, args=(source,), axis=1)
return pd.concat(frames.tolist())
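To make the chunking behaviour concrete, a minimal sketch (toy HTML, not from the scraped data) of what split_html returns for an NHS-style page, assuming the module path used in the pipeline import above:

from steps.data_preparation_steps.split_nhs_pages_step.split_nhs_pages_step import (
    SOURCE_MAPPING,
    split_html,
)

html = """
<html><body>
  <section><h2>Symptoms</h2><p>Feeling restless or on edge.</p></section>
  <section><h2>Self-help</h2><p>Try a breathing exercise.</p></section>
</body></html>
"""

params = SOURCE_MAPPING["nhs"]
sections = split_html(html, params["tag"], params["kwargs"])
assert len(sections) == 2   # one chunk per <section>
print(sections[0])          # '<section><h2>Symptoms</h2>...</section>'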