Updating the merged api response object to not contain confusing data…

… that isn't actually merged. (#26) ### Correcting Merged Api Result --- Previously, the merged API result still contained data such as `content` and `pages` from the first API response in the array to merge. This was somewhat confusing, as it is not actually merged data. This change therefore removes that data and updates the tests accordingly. --------- Co-authored-by: Mark <[email protected]>
climatepolicyradar · Oct 18, 2023 · 9561485 · 9561485
1 parent 9fd44ae
commit 9561485
Show file tree

Hide file tree

Showing 2 changed files with 16 additions and 8 deletions.
diff --git a/src/azure_pdf_parser/utils.py b/src/azure_pdf_parser/utils.py
@@ -104,8 +104,9 @@ def merge_responses(batches: Sequence[PDFPagesBatchExtracted]) -> AnalyzeResult:
             all_tables.extend(batch.extracted_content.tables)
         all_pages.extend(batch.extracted_content.pages)
 
-    # Copy the first result to a variable and add the content for all the pages.
-    merged_analyse_result: AnalyzeResult = batches.pop(0).extracted_content
+    merged_analyse_result = AnalyzeResult()
+    merged_analyse_result.api_version = batches[0].extracted_content.api_version
+    merged_analyse_result.model_id = batches[0].extracted_content.model_id
     merged_analyse_result.paragraphs = all_paragraphs
     merged_analyse_result.tables = all_tables
     merged_analyse_result.pages = all_pages

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -81,6 +81,13 @@ def test_merge_responses_one_page_results(
     one_page_analyse_result: AnalyzeResult,
 ) -> None:
     """Test that the responses are merged correctly."""
+    # The one_page_analyse_result test data has empty values for these fields, and thus
+    # we need to set values for them to assert that they don't persist in the merged
+    # result.
+    one_page_analyse_result.documents = ["test_document"]
+    one_page_analyse_result.languages = ["test_language"]
+    one_page_analyse_result.styles = ["test_style"]
+
     api_responses = [
         PDFPagesBatchExtracted(
             page_range=(1, 1),
@@ -115,9 +122,9 @@ def test_merge_responses_one_page_results(
     assert isinstance(merged_api_response, AnalyzeResult)
     assert merged_api_response.api_version == one_page_analyse_result.api_version
     assert merged_api_response.model_id == one_page_analyse_result.model_id
-    assert merged_api_response.languages == one_page_analyse_result.languages
-    assert merged_api_response.styles == one_page_analyse_result.styles
-    assert merged_api_response.documents == one_page_analyse_result.documents
+    assert merged_api_response.languages != one_page_analyse_result.languages
+    assert merged_api_response.styles != one_page_analyse_result.styles
+    assert merged_api_response.documents != one_page_analyse_result.documents
 
     # Check that the number of paragraphs and tables is correct
     assert merged_api_response.paragraphs is not None
@@ -164,9 +171,9 @@ def test_merge_api_responses_sixteen_page_results(
     assert isinstance(merged_api_response, AnalyzeResult)
     assert merged_api_response.api_version == sixteen_page_analyse_result.api_version
     assert merged_api_response.model_id == sixteen_page_analyse_result.model_id
-    assert merged_api_response.languages == sixteen_page_analyse_result.languages
-    assert merged_api_response.styles == sixteen_page_analyse_result.styles
-    assert merged_api_response.documents == sixteen_page_analyse_result.documents
+    assert merged_api_response.languages != sixteen_page_analyse_result.languages
+    assert merged_api_response.styles != sixteen_page_analyse_result.styles
+    assert merged_api_response.documents != sixteen_page_analyse_result.documents
 
     # Check that the number of paragraphs and tables is correct
     assert merged_api_response.paragraphs is not None