[WIP] [MIN-85] [Experiment] Intelligent Chunking #65

Open · wants to merge 8 commits into base: develop
18 changes: 10 additions & 8 deletions app/app.py
@@ -27,7 +27,7 @@


# Setup for chroma vector store
CHROMA_SERVER_HOST_NAME = "chroma-service.default"
CHROMA_SERVER_HOST_NAME = "localhost"
CHROMA_SERVER_PORT = 8000
DEFAULT_EMBED_MODEL = "base" # ["base", "large", "xl"]
N_CLOSEST_MATCHES = 3
@@ -99,7 +99,7 @@ def _get_prediction_endpoint() -> Optional[str]:
Returns:
Optional[str]: the url endpoint if it exists and is valid, None otherwise.
"""
return f"http://{SELDON_SERVICE_NAME}.{SELDON_NAMESPACE}:{SELDON_PORT}/v2/models/transformer/infer"
return f"http://localhost:9000/v2/models/transformer/infer"


@st.cache_data(show_spinner=False)
@@ -307,7 +307,7 @@ def show_disclaimer() -> None:
def show_settings() -> None:
"""Show inference settings on the sidebar."""
st.title("Settings")
-st.session_state.temperature = st.slider("Temperature", min_value=0.0, max_value=2.0, value=0.8)
+st.session_state.temperature = st.slider("Temperature", min_value=0.0, max_value=2.0, value=0.0)
st.session_state.max_length = st.slider("Max response length", min_value=50, max_value=500, value=300, step=1)
st.session_state.prompt_template = st.select_slider("Prompt template", options=["simple", "complex", "advanced"], value="simple")

@@ -383,6 +383,8 @@ def main() -> None:
embedding_function=embed_function,
)

+print(context)

# Create a dict of prompt and context
message = {"prompt_query": prompt, "context": context}

@@ -396,11 +398,11 @@

full_response += f"{source}: {assistant_response} \n"

-if metric_service_endpoint:
-    result = post_response_to_metric_service(
-        metric_service_endpoint, assistant_response
-    )
-    logging.info(result.text)
+# if metric_service_endpoint:
+#     result = post_response_to_metric_service(
+#         metric_service_endpoint, assistant_response
+#     )
+#     logging.info(result.text)

message_placeholder.markdown(full_response)

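For context, the constants changed above point the Streamlit app at a locally running Chroma server. A minimal, hypothetical sketch of querying such a server with the chromadb client (the collection name and query text are illustrative, and the real app supplies its own embedding function):

import chromadb

# Values mirroring the constants in app/app.py after this change.
CHROMA_SERVER_HOST_NAME = "localhost"
CHROMA_SERVER_PORT = 8000
N_CLOSEST_MATCHES = 3

# Connect to the local Chroma server and fetch the closest chunks for a query.
client = chromadb.HttpClient(host=CHROMA_SERVER_HOST_NAME, port=CHROMA_SERVER_PORT)
collection = client.get_or_create_collection("validated_docs")  # hypothetical collection name
results = collection.query(
    query_texts=["How can I manage panic attacks?"],
    n_results=N_CLOSEST_MATCHES,
)
print(results["documents"])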
4 changes: 2 additions & 2 deletions data/mind_data_validated.csv.dvc
@@ -1,5 +1,5 @@
outs:
-- md5: 027332cb6e506fd47d3962471c1246d2
-  size: 1859742
+- md5: c76568f0dd939286c633b2ed3a557630
+  size: 2336306
hash: md5
path: mind_data_validated.csv
4 changes: 2 additions & 2 deletions data/nhs_data_validated.csv.dvc
@@ -1,5 +1,5 @@
outs:
-- md5: 1534192fdce6f9c7bde650cf0374629c
-  size: 797289
+- md5: c4b3c09b6499a4faeec99fd7dbc27d4f
+  size: 811904
hash: md5
path: nhs_data_validated.csv
@@ -5,9 +5,13 @@ steps:
data_version: "data/second_version"
data_postfix: "raw"
reference_data_version: "data/second_version"
-clean_data:
+clean_data_mind:
enable_cache: False
-validate_data:
+clean_data_nhs:
+enable_cache: False
+validate_data_mind:
+enable_cache: False
+validate_data_nhs:
enable_cache: False
version_data:
enable_cache: False
16 changes: 9 additions & 7 deletions pipelines/data_preparation_pipeline/data_preparation_pipeline.py
@@ -1,5 +1,6 @@
"""Data preparation pipeline."""
from steps.data_preparation_steps import clean_data, validate_data
+from steps.data_preparation_steps.split_nhs_pages_step.split_nhs_pages_step import split_pages
from steps.data_versioning_steps import version_data
from steps.generic_steps import load_data
from zenml import pipeline
@@ -20,12 +21,13 @@ def data_preparation_pipeline() -> None:
"""
_, _, mind_df, nhs_df = load_data()

-mind_df = clean_data(mind_df)
-nhs_df = clean_data(nhs_df)
+mind_df = split_pages(mind_df, "mind", after=["load_data"], id="split_pages_mind")
+nhs_df = split_pages(nhs_df, "nhs", after=["load_data"], id="split_pages_nhs")

mind_df = validate_data(mind_df, "mind")
nhs_df = validate_data(nhs_df, "nhs")
mind_df = clean_data(mind_df, after=["split_pages_mind"], id="clean_data_mind")
nhs_df = clean_data(nhs_df, after=["split_pages_nhs"], id="clean_data_nhs")

-version_data(
-    after=["load_data", "clean_data", "validate_data"], data_postfix="validated"
-)
+mind_df = validate_data(mind_df, "mind", after=["clean_data_mind"], id="validate_data_mind")
+nhs_df = validate_data(nhs_df, "nhs", after=["clean_data_nhs"], id="validate_data_nhs")

+version_data(after=["validate_data_mind", "validate_data_nhs"], data_postfix="validated")
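For readers unfamiliar with the ordering syntax: the pipeline above relies on ZenML's invocation-level id and after arguments to fan the mind and nhs branches out and join them before versioning. A minimal toy sketch of the same pattern, assuming those arguments behave as used in the diff (step names here are illustrative, not from this repo):

from zenml import pipeline, step


@step
def prepare(source: str) -> str:
    """Toy stand-in for the split/clean/validate steps."""
    return f"prepared {source}"


@step
def finalize() -> None:
    """Toy stand-in for version_data; runs only after both branches finish."""
    print("all branches finished")


@pipeline
def toy_fanout_pipeline() -> None:
    # Each invocation gets an explicit id so later steps (and per-step config
    # keys such as clean_data_mind) can refer to it by name.
    prepare("mind", id="prepare_mind")
    prepare("nhs", id="prepare_nhs")
    finalize(after=["prepare_mind", "prepare_nhs"])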
@@ -149,11 +149,11 @@ def compute_embedding_drift(
payload = build_embedding_drift_payload(
reference_data_version, current_data_version, distance
)
-response = requests.post(
-    f"http://{MONITORING_METRICS_HOST_NAME}:{MONITORING_METRICS_PORT}/embedding_drift",
-    json=payload,
-)
+# response = requests.post(
+#     f"http://{MONITORING_METRICS_HOST_NAME}:{MONITORING_METRICS_PORT}/embedding_drift",
+#     json=payload,
+# )

-logger.info(response.text)
+# logger.info(response.text)

return float(distance)
@@ -267,7 +267,7 @@ def clean_data(data: pd.DataFrame) -> pd.DataFrame:
data["text_scraped"] = data["text_scraped"].map(lambda text: remove_pattern(r"Or see our page[^\.]+[\.]", text))
data["text_scraped"] = data["text_scraped"].map(lambda text: remove_pattern(r"Read more about[^\.]+[\.]", text))

-data = data.drop(data[data.text_scraped == ""].index)
+data = data[data.text_scraped != ""]
data = data.drop_duplicates()
data = data.drop(columns=["html_scraped"])

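The change above swaps an index-based drop for a boolean mask when removing rows whose scraped text is empty. A tiny sketch on toy data (not from the PR) of what that filter does:

import pandas as pd

# Toy frame standing in for the scraped data.
data = pd.DataFrame({
    "text_scraped": ["Anxiety can affect sleep.", "", "Try a breathing exercise."],
    "html_scraped": ["<p>a</p>", "", "<p>b</p>"],
})

data = data[data.text_scraped != ""]        # keep only rows with scraped text
data = data.drop_duplicates()
data = data.drop(columns=["html_scraped"])
print(data)  # two rows remain, text_scraped column only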
110 changes: 110 additions & 0 deletions steps/data_preparation_steps/split_nhs_pages_step/split_nhs_pages_step.py
@@ -0,0 +1,110 @@
"""Split NHS pages step."""
from typing import List, Dict

import pandas as pd
from bs4 import BeautifulSoup
from zenml import step


SOURCE_MAPPING = {
"nhs": {
"tag": "section",
"kwargs": {}
},
"mind": {
"tag": "div",
"kwargs": {
"class_": "column"
}
}
}


def split_html(html: str, tag: str, kwargs: Dict[str, str]) -> List[str]:
"""Split html using the tag and some other optional Beautiful Soup arguments.

Args:
html (str): HTML text to split
tag (str): tag to split by
kwargs (Dict[str, str]): Beautiful Soup keyword arguments

Returns:
List[str]: list of HTML strings
"""
soup = BeautifulSoup(html, "lxml")
sections = soup.find_all(tag, **kwargs)
return [str(section) for section in sections]


def split_page(data: pd.Series, source: str) -> pd.DataFrame:
"""Split a page.

Preserve other metadata.
* url is appended with an anchor suffix of the form '#section-{n}'
* uuid is appended with an anchor suffix of the form '-{n}'
* timestamp is kept as is

Args:
data (pd.Series): A single scraped page (NHS or Mind).
Index:
Name: uuid, dtype: object
Name: html_scraped, dtype: object
Name: timestamp, dtype: datetime64[ns]
Name: url, dtype: object
source (str): source of the page; determines how the page is split

Returns:
pd.DataFrame: The split page.
Index:
RangeIndex
Columns:
Name: uuid, dtype: object
Name: html_scraped, dtype: object
Name: timestamp, dtype: datetime64[ns]
Name: url, dtype: object
"""
params = SOURCE_MAPPING[source]
if data.html_scraped:
sections = split_html(data.html_scraped, params["tag"], params["kwargs"])
else:
sections = []
return pd.DataFrame(
[{
"uuid": f"{data.uuid}-{i}",
"html_scraped": section,
"timestamp": data.timestamp,
"url": f"{data.url}#section-{i}"
} for i, section in enumerate(sections)],
columns=["uuid", "html_scraped", "timestamp", "url"]
)


@step
def split_pages(data: pd.DataFrame, source: str) -> pd.DataFrame:
"""Split the NHS pages by the <section> tag.

Preserve other metadata.

Args:
data (pd.DataFrame): The scraped pages (NHS or Mind).
Index:
RangeIndex
Columns:
Name: uuid, dtype: object
Name: html_scraped, dtype: object
Name: timestamp, dtype: datetime64[ns]
Name: url, dtype: object
source (str): source of the page; determines how the page is split

Returns:
pd.DataFrame: The split data in the format described above.
Index:
RangeIndex
Columns:
Name: uuid, dtype: object
Name: html_scraped, dtype: object
Name: timestamp, dtype: datetime64[ns]
Name: url, dtype: object
"""
frames = data.apply(split_page, args=(source,), axis=1)
return pd.concat(frames.tolist())
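To make the chunking behaviour concrete, a minimal sketch (toy HTML, not from the scraped data) of what split_html returns for an NHS-style page, assuming the module path used in the pipeline import above:

from steps.data_preparation_steps.split_nhs_pages_step.split_nhs_pages_step import (
    SOURCE_MAPPING,
    split_html,
)

html = """
<html><body>
  <section><h2>Symptoms</h2><p>Feeling restless or on edge.</p></section>
  <section><h2>Self-help</h2><p>Try a breathing exercise.</p></section>
</body></html>
"""

params = SOURCE_MAPPING["nhs"]
sections = split_html(html, params["tag"], params["kwargs"])
assert len(sections) == 2   # one chunk per <section>
print(sections[0])          # '<section><h2>Symptoms</h2>...</section>'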