From 02314db5174a3405cad75afb717e532b2fc8cfbf Mon Sep 17 00:00:00 2001
From: "D. Lowl"
Date: Wed, 16 Aug 2023 18:00:29 +0400
Subject: [PATCH 1/8] Update dvc files

---
 data/mind_data_raw.csv.dvc | 4 ++--
 data/nhs_data_raw.csv.dvc  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/data/mind_data_raw.csv.dvc b/data/mind_data_raw.csv.dvc
index 8f79853..3756535 100644
--- a/data/mind_data_raw.csv.dvc
+++ b/data/mind_data_raw.csv.dvc
@@ -1,5 +1,5 @@
 outs:
-- md5: dad43927d0712f97a8c261d2ab8dba81
-  size: 10548104
+- md5: 3bdc23a1738ddf268a78cbe8f6a026e7
+  size: 10562463
   hash: md5
   path: mind_data_raw.csv
diff --git a/data/nhs_data_raw.csv.dvc b/data/nhs_data_raw.csv.dvc
index 480b135..c8f978c 100644
--- a/data/nhs_data_raw.csv.dvc
+++ b/data/nhs_data_raw.csv.dvc
@@ -1,5 +1,5 @@
 outs:
-- md5: 7e645f219661c3dd173669d284383bbe
-  size: 2688031
+- md5: 046f4080f9d8c2e2abcfa563c41d95b6
+  size: 2687654
   hash: md5
   path: nhs_data_raw.csv

From 75d622a96d846a3aba016ccd63851f4774abe7f6 Mon Sep 17 00:00:00 2001
From: "D. Lowl"
Date: Wed, 16 Aug 2023 18:00:44 +0400
Subject: [PATCH 2/8] Update dvc files

---
 data/mind_data_validated.csv.dvc | 2 +-
 data/nhs_data_validated.csv.dvc  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/data/mind_data_validated.csv.dvc b/data/mind_data_validated.csv.dvc
index 1d047a1..ed731bd 100644
--- a/data/mind_data_validated.csv.dvc
+++ b/data/mind_data_validated.csv.dvc
@@ -1,5 +1,5 @@
 outs:
-- md5: 027332cb6e506fd47d3962471c1246d2
+- md5: de6fed15842b8661c864c01a253d04a2
   size: 1859742
   hash: md5
   path: mind_data_validated.csv
diff --git a/data/nhs_data_validated.csv.dvc b/data/nhs_data_validated.csv.dvc
index ad9a96e..af7623e 100644
--- a/data/nhs_data_validated.csv.dvc
+++ b/data/nhs_data_validated.csv.dvc
@@ -1,5 +1,5 @@
 outs:
-- md5: 1534192fdce6f9c7bde650cf0374629c
+- md5: 1d5ebb8903a5470374ae9271ddbd9f59
   size: 797289
   hash: md5
   path: nhs_data_validated.csv

From 40cc786a9752f8928b86073f512a04ca0b5fe5c0 Mon Sep 17 00:00:00 2001
From: "D. Lowl"
Date: Thu, 17 Aug 2023 17:52:53 +0400
Subject: [PATCH 3/8] Update dvc files

---
 data/mind_data_validated.csv.dvc | 4 ++--
 data/nhs_data_validated.csv.dvc  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/data/mind_data_validated.csv.dvc b/data/mind_data_validated.csv.dvc
index ed731bd..94f6330 100644
--- a/data/mind_data_validated.csv.dvc
+++ b/data/mind_data_validated.csv.dvc
@@ -1,5 +1,5 @@
 outs:
-- md5: de6fed15842b8661c864c01a253d04a2
-  size: 1859742
+- md5: c76568f0dd939286c633b2ed3a557630
+  size: 2336306
   hash: md5
   path: mind_data_validated.csv
diff --git a/data/nhs_data_validated.csv.dvc b/data/nhs_data_validated.csv.dvc
index af7623e..5a52378 100644
--- a/data/nhs_data_validated.csv.dvc
+++ b/data/nhs_data_validated.csv.dvc
@@ -1,5 +1,5 @@
 outs:
-- md5: 1d5ebb8903a5470374ae9271ddbd9f59
-  size: 797289
+- md5: c4b3c09b6499a4faeec99fd7dbc27d4f
+  size: 811904
   hash: md5
   path: nhs_data_validated.csv
Lowl" Date: Thu, 17 Aug 2023 17:55:08 +0400 Subject: [PATCH 4/8] Update dvc files --- data/mind_data_validated.csv.dvc | 4 ++-- data/nhs_data_validated.csv.dvc | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/data/mind_data_validated.csv.dvc b/data/mind_data_validated.csv.dvc index 94f6330..c51ea00 100644 --- a/data/mind_data_validated.csv.dvc +++ b/data/mind_data_validated.csv.dvc @@ -1,5 +1,5 @@ outs: -- md5: c76568f0dd939286c633b2ed3a557630 - size: 2336306 +- md5: 45a2629d51c001b2556f2586966ef80b + size: 6778 hash: md5 path: mind_data_validated.csv diff --git a/data/nhs_data_validated.csv.dvc b/data/nhs_data_validated.csv.dvc index 5a52378..ebf7bff 100644 --- a/data/nhs_data_validated.csv.dvc +++ b/data/nhs_data_validated.csv.dvc @@ -1,5 +1,5 @@ outs: -- md5: c4b3c09b6499a4faeec99fd7dbc27d4f - size: 811904 +- md5: ed6e0add96a44532f0a0ac73448d1382 + size: 148389 hash: md5 path: nhs_data_validated.csv From 2e2626a44e699071a6910e6f6e9729276e6760eb Mon Sep 17 00:00:00 2001 From: "D. Lowl" Date: Thu, 17 Aug 2023 18:08:03 +0400 Subject: [PATCH 5/8] Update dvc files --- data/mind_data_validated.csv.dvc | 4 ++-- data/nhs_data_validated.csv.dvc | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/data/mind_data_validated.csv.dvc b/data/mind_data_validated.csv.dvc index c51ea00..94f6330 100644 --- a/data/mind_data_validated.csv.dvc +++ b/data/mind_data_validated.csv.dvc @@ -1,5 +1,5 @@ outs: -- md5: 45a2629d51c001b2556f2586966ef80b - size: 6778 +- md5: c76568f0dd939286c633b2ed3a557630 + size: 2336306 hash: md5 path: mind_data_validated.csv diff --git a/data/nhs_data_validated.csv.dvc b/data/nhs_data_validated.csv.dvc index ebf7bff..5a52378 100644 --- a/data/nhs_data_validated.csv.dvc +++ b/data/nhs_data_validated.csv.dvc @@ -1,5 +1,5 @@ outs: -- md5: ed6e0add96a44532f0a0ac73448d1382 - size: 148389 +- md5: c4b3c09b6499a4faeec99fd7dbc27d4f + size: 811904 hash: md5 path: nhs_data_validated.csv From 198ffc0a5348912d6bcca84b1e81a3da3891997f Mon Sep 17 00:00:00 2001 From: "D. Lowl" Date: Thu, 17 Aug 2023 18:38:28 +0400 Subject: [PATCH 6/8] Update dvc files --- data/mind_data_validated.csv.dvc | 4 ++-- data/nhs_data_validated.csv.dvc | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/data/mind_data_validated.csv.dvc b/data/mind_data_validated.csv.dvc index 94f6330..f7cfda0 100644 --- a/data/mind_data_validated.csv.dvc +++ b/data/mind_data_validated.csv.dvc @@ -1,5 +1,5 @@ outs: -- md5: c76568f0dd939286c633b2ed3a557630 - size: 2336306 +- md5: 3a7b9eb1525ae9021318306d874adfbe + size: 2541363 hash: md5 path: mind_data_validated.csv diff --git a/data/nhs_data_validated.csv.dvc b/data/nhs_data_validated.csv.dvc index 5a52378..ae53e4f 100644 --- a/data/nhs_data_validated.csv.dvc +++ b/data/nhs_data_validated.csv.dvc @@ -1,5 +1,5 @@ outs: -- md5: c4b3c09b6499a4faeec99fd7dbc27d4f - size: 811904 +- md5: 89494c5ba3ee949b09244f50f121fda1 + size: 933121 hash: md5 path: nhs_data_validated.csv From 9592b0966abda00b265319db45a0984495d70813 Mon Sep 17 00:00:00 2001 From: "D. 
Lowl" Date: Thu, 17 Aug 2023 18:40:00 +0400 Subject: [PATCH 7/8] Update dvc files --- data/mind_data_validated.csv.dvc | 4 ++-- data/nhs_data_validated.csv.dvc | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/data/mind_data_validated.csv.dvc b/data/mind_data_validated.csv.dvc index f7cfda0..94f6330 100644 --- a/data/mind_data_validated.csv.dvc +++ b/data/mind_data_validated.csv.dvc @@ -1,5 +1,5 @@ outs: -- md5: 3a7b9eb1525ae9021318306d874adfbe - size: 2541363 +- md5: c76568f0dd939286c633b2ed3a557630 + size: 2336306 hash: md5 path: mind_data_validated.csv diff --git a/data/nhs_data_validated.csv.dvc b/data/nhs_data_validated.csv.dvc index ae53e4f..5a52378 100644 --- a/data/nhs_data_validated.csv.dvc +++ b/data/nhs_data_validated.csv.dvc @@ -1,5 +1,5 @@ outs: -- md5: 89494c5ba3ee949b09244f50f121fda1 - size: 933121 +- md5: c4b3c09b6499a4faeec99fd7dbc27d4f + size: 811904 hash: md5 path: nhs_data_validated.csv From f68a52d1aa34f284735e1840c996e710cc642e10 Mon Sep 17 00:00:00 2001 From: "D. Lowl" Date: Thu, 17 Aug 2023 20:15:48 +0400 Subject: [PATCH 8/8] Add intelligent chunking experiment --- app/app.py | 18 +- data/mind_data_raw.csv.dvc | 4 +- data/nhs_data_raw.csv.dvc | 4 +- .../config_data_preparation_pipeline.yaml | 8 +- .../data_preparation_pipeline.py | 16 +- .../compute_embedding_drift_step.py | 10 +- .../clean_data_step/clean_data_step.py | 2 +- .../split_nhs_pages_step.py | 110 ++++++++++++ .../test_split_nhs_pages_step.py | 161 ++++++++++++++++++ 9 files changed, 306 insertions(+), 27 deletions(-) create mode 100644 steps/data_preparation_steps/split_nhs_pages_step/split_nhs_pages_step.py create mode 100644 tests/test_steps/test_data_preparation_steps/test_split_nhs_pages_step.py diff --git a/app/app.py b/app/app.py index 5afb025..d7e7375 100644 --- a/app/app.py +++ b/app/app.py @@ -27,7 +27,7 @@ # Setup for chroma vector store -CHROMA_SERVER_HOST_NAME = "chroma-service.default" +CHROMA_SERVER_HOST_NAME = "localhost" CHROMA_SERVER_PORT = 8000 DEFAULT_EMBED_MODEL = "base" # ["base", "large", "xl"] N_CLOSEST_MATCHES = 3 @@ -99,7 +99,7 @@ def _get_prediction_endpoint() -> Optional[str]: Returns: Optional[str]: the url endpoint if it exists and is valid, None otherwise. 
""" - return f"http://{SELDON_SERVICE_NAME}.{SELDON_NAMESPACE}:{SELDON_PORT}/v2/models/transformer/infer" + return f"http://localhost:9000/v2/models/transformer/infer" @st.cache_data(show_spinner=False) @@ -307,7 +307,7 @@ def show_disclaimer() -> None: def show_settings() -> None: """Show inference settings on the sidebar.""" st.title("Settings") - st.session_state.temperature = st.slider("Temperature", min_value=0.0, max_value=2.0, value=0.8) + st.session_state.temperature = st.slider("Temperature", min_value=0.0, max_value=2.0, value=0.0) st.session_state.max_length = st.slider("Max response length", min_value=50, max_value=500, value=300, step=1) st.session_state.prompt_template = st.select_slider("Prompt template", options=["simple", "complex", "advanced"], value="simple") @@ -383,6 +383,8 @@ def main() -> None: embedding_function=embed_function, ) + print(context) + # Create a dict of prompt and context message = {"prompt_query": prompt, "context": context} @@ -396,11 +398,11 @@ def main() -> None: full_response += f"{source}: {assistant_response} \n" - if metric_service_endpoint: - result = post_response_to_metric_service( - metric_service_endpoint, assistant_response - ) - logging.info(result.text) + # if metric_service_endpoint: + # result = post_response_to_metric_service( + # metric_service_endpoint, assistant_response + # ) + # logging.info(result.text) message_placeholder.markdown(full_response) diff --git a/data/mind_data_raw.csv.dvc b/data/mind_data_raw.csv.dvc index 3756535..8f79853 100644 --- a/data/mind_data_raw.csv.dvc +++ b/data/mind_data_raw.csv.dvc @@ -1,5 +1,5 @@ outs: -- md5: 3bdc23a1738ddf268a78cbe8f6a026e7 - size: 10562463 +- md5: dad43927d0712f97a8c261d2ab8dba81 + size: 10548104 hash: md5 path: mind_data_raw.csv diff --git a/data/nhs_data_raw.csv.dvc b/data/nhs_data_raw.csv.dvc index c8f978c..480b135 100644 --- a/data/nhs_data_raw.csv.dvc +++ b/data/nhs_data_raw.csv.dvc @@ -1,5 +1,5 @@ outs: -- md5: 046f4080f9d8c2e2abcfa563c41d95b6 - size: 2687654 +- md5: 7e645f219661c3dd173669d284383bbe + size: 2688031 hash: md5 path: nhs_data_raw.csv diff --git a/pipelines/data_preparation_pipeline/config_data_preparation_pipeline.yaml b/pipelines/data_preparation_pipeline/config_data_preparation_pipeline.yaml index ecf18b1..056ff89 100644 --- a/pipelines/data_preparation_pipeline/config_data_preparation_pipeline.yaml +++ b/pipelines/data_preparation_pipeline/config_data_preparation_pipeline.yaml @@ -5,9 +5,13 @@ steps: data_version: "data/second_version" data_postfix: "raw" reference_data_version: "data/second_version" - clean_data: + clean_data_mind: enable_cache: False - validate_data: + clean_data_nhs: + enable_cache: False + validate_data_mind: + enable_cache: False + validate_data_nhs: enable_cache: False version_data: enable_cache: False diff --git a/pipelines/data_preparation_pipeline/data_preparation_pipeline.py b/pipelines/data_preparation_pipeline/data_preparation_pipeline.py index acd0a1d..942a9a7 100644 --- a/pipelines/data_preparation_pipeline/data_preparation_pipeline.py +++ b/pipelines/data_preparation_pipeline/data_preparation_pipeline.py @@ -1,5 +1,6 @@ """Data preparation pipeline.""" from steps.data_preparation_steps import clean_data, validate_data +from steps.data_preparation_steps.split_nhs_pages_step.split_nhs_pages_step import split_pages from steps.data_versioning_steps import version_data from steps.generic_steps import load_data from zenml import pipeline @@ -20,12 +21,13 @@ def data_preparation_pipeline() -> None: """ _, _, mind_df, nhs_df = 
diff --git a/data/mind_data_raw.csv.dvc b/data/mind_data_raw.csv.dvc
index 3756535..8f79853 100644
--- a/data/mind_data_raw.csv.dvc
+++ b/data/mind_data_raw.csv.dvc
@@ -1,5 +1,5 @@
 outs:
-- md5: 3bdc23a1738ddf268a78cbe8f6a026e7
-  size: 10562463
+- md5: dad43927d0712f97a8c261d2ab8dba81
+  size: 10548104
   hash: md5
   path: mind_data_raw.csv
diff --git a/data/nhs_data_raw.csv.dvc b/data/nhs_data_raw.csv.dvc
index c8f978c..480b135 100644
--- a/data/nhs_data_raw.csv.dvc
+++ b/data/nhs_data_raw.csv.dvc
@@ -1,5 +1,5 @@
 outs:
-- md5: 046f4080f9d8c2e2abcfa563c41d95b6
-  size: 2687654
+- md5: 7e645f219661c3dd173669d284383bbe
+  size: 2688031
   hash: md5
   path: nhs_data_raw.csv
diff --git a/pipelines/data_preparation_pipeline/config_data_preparation_pipeline.yaml b/pipelines/data_preparation_pipeline/config_data_preparation_pipeline.yaml
index ecf18b1..056ff89 100644
--- a/pipelines/data_preparation_pipeline/config_data_preparation_pipeline.yaml
+++ b/pipelines/data_preparation_pipeline/config_data_preparation_pipeline.yaml
@@ -5,9 +5,13 @@ steps:
     data_version: "data/second_version"
     data_postfix: "raw"
     reference_data_version: "data/second_version"
-  clean_data:
+  clean_data_mind:
     enable_cache: False
-  validate_data:
+  clean_data_nhs:
+    enable_cache: False
+  validate_data_mind:
+    enable_cache: False
+  validate_data_nhs:
     enable_cache: False
   version_data:
     enable_cache: False
diff --git a/pipelines/data_preparation_pipeline/data_preparation_pipeline.py b/pipelines/data_preparation_pipeline/data_preparation_pipeline.py
index acd0a1d..942a9a7 100644
--- a/pipelines/data_preparation_pipeline/data_preparation_pipeline.py
+++ b/pipelines/data_preparation_pipeline/data_preparation_pipeline.py
@@ -1,5 +1,6 @@
 """Data preparation pipeline."""
 from steps.data_preparation_steps import clean_data, validate_data
+from steps.data_preparation_steps.split_nhs_pages_step.split_nhs_pages_step import split_pages
 from steps.data_versioning_steps import version_data
 from steps.generic_steps import load_data
 from zenml import pipeline
@@ -20,12 +21,13 @@ def data_preparation_pipeline() -> None:
     """
     _, _, mind_df, nhs_df = load_data()
 
-    mind_df = clean_data(mind_df)
-    nhs_df = clean_data(nhs_df)
+    mind_df = split_pages(mind_df, "mind", after=["load_data"], id="split_pages_mind")
+    nhs_df = split_pages(nhs_df, "nhs", after=["load_data"], id="split_pages_nhs")
 
-    mind_df = validate_data(mind_df, "mind")
-    nhs_df = validate_data(nhs_df, "nhs")
+    mind_df = clean_data(mind_df, after=["split_pages_mind"], id="clean_data_mind")
+    nhs_df = clean_data(nhs_df, after=["split_pages_nhs"], id="clean_data_nhs")
 
-    version_data(
-        after=["load_data", "clean_data", "validate_data"], data_postfix="validated"
-    )
+    mind_df = validate_data(mind_df, "mind", after=["clean_data_mind"], id="validate_data_mind")
+    nhs_df = validate_data(nhs_df, "nhs", after=["clean_data_nhs"], id="validate_data_nhs")
+
+    version_data(after=["validate_data_mind", "validate_data_nhs"], data_postfix="validated")
diff --git a/steps/data_embedding_steps/compute_embedding_drift_step/compute_embedding_drift_step.py b/steps/data_embedding_steps/compute_embedding_drift_step/compute_embedding_drift_step.py
index 059f028..6ea93c9 100644
--- a/steps/data_embedding_steps/compute_embedding_drift_step/compute_embedding_drift_step.py
+++ b/steps/data_embedding_steps/compute_embedding_drift_step/compute_embedding_drift_step.py
@@ -149,11 +149,11 @@ def compute_embedding_drift(
     payload = build_embedding_drift_payload(
         reference_data_version, current_data_version, distance
     )
-    response = requests.post(
-        f"http://{MONITORING_METRICS_HOST_NAME}:{MONITORING_METRICS_PORT}/embedding_drift",
-        json=payload,
-    )
+    # response = requests.post(
+    #     f"http://{MONITORING_METRICS_HOST_NAME}:{MONITORING_METRICS_PORT}/embedding_drift",
+    #     json=payload,
+    # )
 
-    logger.info(response.text)
+    # logger.info(response.text)
 
     return float(distance)
diff --git a/steps/data_preparation_steps/clean_data_step/clean_data_step.py b/steps/data_preparation_steps/clean_data_step/clean_data_step.py
index 4b23e7b..8bcd790 100644
--- a/steps/data_preparation_steps/clean_data_step/clean_data_step.py
+++ b/steps/data_preparation_steps/clean_data_step/clean_data_step.py
@@ -267,7 +267,7 @@ def clean_data(data: pd.DataFrame) -> pd.DataFrame:
     data["text_scraped"] = data["text_scraped"].map(lambda text: remove_pattern(r"Or see our page[^\.]+[\.]", text))
     data["text_scraped"] = data["text_scraped"].map(lambda text: remove_pattern(r"Read more about[^\.]+[\.]", text))
 
-    data = data.drop(data[data.text_scraped == ""].index)
+    data = data[data.text_scraped != ""]
 
     data = data.drop_duplicates()
     data = data.drop(columns=["html_scraped"])
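The next diff adds the chunking step itself. Its SOURCE_MAPPING entries are passed straight through to BeautifulSoup's find_all, which is easiest to see in isolation; a minimal sketch with illustrative HTML (not taken from either dataset):

    from bs4 import BeautifulSoup

    # Mind pages split on <div class="column">: the kwargs dict from
    # SOURCE_MAPPING["mind"] becomes keyword arguments to find_all.
    html = '<div class="column">A</div><div class="row">skip</div><div class="column">B</div>'
    soup = BeautifulSoup(html, "lxml")
    print([str(div) for div in soup.find_all("div", class_="column")])
    # ['<div class="column">A</div>', '<div class="column">B</div>']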
diff --git a/steps/data_preparation_steps/split_nhs_pages_step/split_nhs_pages_step.py b/steps/data_preparation_steps/split_nhs_pages_step/split_nhs_pages_step.py
new file mode 100644
index 0000000..c1c05c7
--- /dev/null
+++ b/steps/data_preparation_steps/split_nhs_pages_step/split_nhs_pages_step.py
@@ -0,0 +1,110 @@
+"""Split NHS pages step."""
+from typing import List, Dict
+
+import pandas as pd
+from bs4 import BeautifulSoup
+from zenml import step
+
+
+SOURCE_MAPPING = {
+    "nhs": {
+        "tag": "section",
+        "kwargs": {}
+    },
+    "mind": {
+        "tag": "div",
+        "kwargs": {
+            "class_": "column"
+        }
+    }
+}
+
+
+def split_html(html: str, tag: str, kwargs: Dict[str, str]) -> List[str]:
+    """Split html using the tag and some other optional Beautiful Soup arguments.
+
+    Args:
+        html (str): HTML text to split
+        tag (str): tag to split by
+        kwargs (Dict[str, str]): Beautiful Soup keyword arguments
+
+    Returns:
+        List[str]: list of HTML strings
+    """
+    soup = BeautifulSoup(html, "lxml")
+    sections = soup.find_all(tag, **kwargs)
+    return [str(section) for section in sections]
+
+
+def split_page(data: pd.Series, source: str) -> pd.DataFrame:
+    """Split a page.
+
+    Preserve other metadata.
+    * url is appended with an anchor suffix of the form '#section-{n}'
+    * uuid is appended with an anchor suffix of the form '-{n}'
+    * timestamp is kept as is
+
+    Args:
+        data (pd.Series): The scraped NHS data.
+            Index:
+                Name: uuid, dtype: object
+                Name: html_scraped, dtype: object
+                Name: timestamp, dtype: datetime64[ns]
+                Name: url, dtype: object
+        source (str): source of the page; determines how the page is split
+
+    Returns:
+        pd.DataFrame: The split page.
+            Index:
+                RangeIndex
+            Columns:
+                Name: uuid, dtype: object
+                Name: html_scraped, dtype: object
+                Name: timestamp, dtype: datetime64[ns]
+                Name: url, dtype: object
+    """
+    params = SOURCE_MAPPING[source]
+    if data.html_scraped:
+        sections = split_html(data.html_scraped, params["tag"], params["kwargs"])
+    else:
+        sections = []
+    return pd.DataFrame(
+        [{
+            "uuid": f"{data.uuid}-{i}",
+            "html_scraped": section,
+            "timestamp": data.timestamp,
+            "url": f"{data.url}#section-{i}"
+        } for i, section in enumerate(sections)],
+        columns=["uuid", "html_scraped", "timestamp", "url"]
+    )
+
+
+@step
+def split_pages(data: pd.DataFrame, source: str) -> pd.DataFrame:
+    """Split the NHS pages by the <section> tag.
+
+    Preserve other metadata.
+
+    Args:
+        data (pd.DataFrame): The scraped NHS data.
+            Index:
+                RangeIndex
+            Columns:
+                Name: uuid, dtype: object
+                Name: html_scraped, dtype: object
+                Name: timestamp, dtype: datetime64[ns]
+                Name: url, dtype: object
+        source (str): source of the page; determines how the page is split
+
+    Returns:
+        pd.DataFrame: The split data in the format described above.
+            Index:
+                RangeIndex
+            Columns:
+                Name: uuid, dtype: object
+                Name: html_scraped, dtype: object
+                Name: timestamp, dtype: datetime64[ns]
+                Name: url, dtype: object
+    """
+    frames = data.apply(split_page, args=(source,), axis=1)
+    return pd.concat(frames.tolist())
diff --git a/tests/test_steps/test_data_preparation_steps/test_split_nhs_pages_step.py b/tests/test_steps/test_data_preparation_steps/test_split_nhs_pages_step.py
new file mode 100644
index 0000000..45f4dbd
--- /dev/null
+++ b/tests/test_steps/test_data_preparation_steps/test_split_nhs_pages_step.py
@@ -0,0 +1,161 @@
+"""Test split NHS pages step."""
+from typing import List
+
+import pandas as pd
+import pytest
+from pandas._testing import assert_frame_equal
+
+from steps.data_preparation_steps.split_nhs_pages_step.split_nhs_pages_step import split_pages, split_page
+
+
+@pytest.fixture
+def nhs_pages_split() -> List[pd.DataFrame]:
+    """A fixture with NHS pages after splitting.
+
+    Returns:
+        List[pd.DataFrame]: The list of mock scraped NHS data.
+            Index:
+                RangeIndex
+            Columns:
+                Name: uuid, dtype: object
+                Name: html_scraped, dtype: object
+                Name: timestamp, dtype: datetime64[ns]
+                Name: url, dtype: object
+    """
+    return [
+        pd.DataFrame([], columns=["uuid", "html_scraped", "timestamp", "url"]),
+        pd.DataFrame([
+            {
+                "uuid": "1-0",
+                "html_scraped": "<section>Section1</section>",
+                "timestamp": pd.to_datetime(0),
+                "url": "https://example.com/1#section-0",
+            }
+        ]),
+        pd.DataFrame([
+            {
+                "uuid": "2-0",
+                "html_scraped": "<section>Section1</section>",
+                "timestamp": pd.to_datetime(0),
+                "url": "https://example.com/2#section-0",
+            },
+            {
+                "uuid": "2-1",
+                "html_scraped": "<section>Section2</section>",
+                "timestamp": pd.to_datetime(0),
+                "url": "https://example.com/2#section-1",
+            }
+        ]),
+        pd.DataFrame([
+            {
+                "uuid": "3-0",
+                "html_scraped": "<section><h2>Heading1</h2>Section1</section>",
+                "timestamp": pd.to_datetime(0),
+                "url": "https://example.com/3#section-0",
+            },
+            {
+                "uuid": "3-1",
+                "html_scraped": "<section>Section2</section>",
+                "timestamp": pd.to_datetime(0),
+                "url": "https://example.com/3#section-1",
+            }
+        ]),
+        pd.DataFrame([
+            {
+                "uuid": "4-0",
+                "html_scraped": "<section><h2>Heading1</h2>Section1</section>",
+                "timestamp": pd.to_datetime(0),
+                "url": "https://example.com/4#section-0",
+            },
+            {
+                "uuid": "4-1",
+                "html_scraped": "<section>Section2</section>",
+                "timestamp": pd.to_datetime(0),
+                "url": "https://example.com/4#section-1",
+            }
+        ])
+    ]
+
+
+@pytest.fixture
+def nhs_pages_raw() -> pd.DataFrame:
+    """A fixture with NHS pages before splitting.
+
+    Returns:
+        pd.DataFrame: The mock scraped NHS data.
+            Index:
+                RangeIndex
+            Columns:
+                Name: uuid, dtype: object
+                Name: html_scraped, dtype: object
+                Name: timestamp, dtype: datetime64[ns]
+                Name: url, dtype: object
+    """
+    html_scraped = [
+        "<p>No section</p>",
+        "<section>Section1</section>",
+        "<section>Section1</section><section>Section2</section>",
+        "<section><h2>Heading1</h2>Section1</section><section>Section2</section>",
+        "<p>Outside</p><section><h2>Heading1</h2>Section1</section><section>Section2</section>",
+    ]
+    cases = [{
+        "uuid": i,
+        "url": f"https://example.com/{i}",
+        "html_scraped": text,
+        "timestamp": pd.to_datetime(0)
+    } for i, text in enumerate(html_scraped)]
+
+    return pd.DataFrame(cases)
+
+
+def test_split_page(nhs_pages_raw: pd.DataFrame, nhs_pages_split: List[pd.DataFrame]):
+    """Test splitting individual pages.
+
+    Args:
+        nhs_pages_raw (pd.DataFrame): The mock raw scraped NHS data.
+            Index:
+                RangeIndex
+            Columns:
+                Name: uuid, dtype: object
+                Name: html_scraped, dtype: object
+                Name: timestamp, dtype: datetime64[ns]
+                Name: url, dtype: object
+        nhs_pages_split (List[pd.DataFrame]): The list of mock split scraped NHS data.
+            Index:
+                RangeIndex
+            Columns:
+                Name: uuid, dtype: object
+                Name: html_scraped, dtype: object
+                Name: timestamp, dtype: datetime64[ns]
+                Name: url, dtype: object
+    """
+    nhs_pages_raw_rows = [row for _, row in nhs_pages_raw.iterrows()]
+    for expected, nhs_page in zip(nhs_pages_split, nhs_pages_raw_rows):
+        got = split_page(nhs_page, "nhs")
+        assert_frame_equal(expected, got)
+
+
+def test_split_nhs_pages(nhs_pages_raw: pd.DataFrame, nhs_pages_split: List[pd.DataFrame]):
+    """Test split NHS pages step.
+
+    Args:
+        nhs_pages_raw (pd.DataFrame): The mock raw scraped NHS data.
+            Index:
+                RangeIndex
+            Columns:
+                Name: uuid, dtype: object
+                Name: html_scraped, dtype: object
+                Name: timestamp, dtype: datetime64[ns]
+                Name: url, dtype: object
+        nhs_pages_split (List[pd.DataFrame]): The list of mock split scraped NHS data.
+            Index:
+                RangeIndex
+            Columns:
+                Name: uuid, dtype: object
+                Name: html_scraped, dtype: object
+                Name: timestamp, dtype: datetime64[ns]
+                Name: url, dtype: object
+    """
+    expected = pd.concat(nhs_pages_split)
+    got = split_pages.entrypoint(nhs_pages_raw, "nhs")
+    assert_frame_equal(expected, got)
\ No newline at end of file