Skip to content

Commit

Permalink
TLRD-182 eml reader bug fix (#406)
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander1999-hub authored Mar 4, 2024
1 parent df6985c commit 4a418d4
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 14 deletions.
23 changes: 11 additions & 12 deletions dedoc/readers/html_reader/html_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
document_postprocess = self.postprocessor.postprocess(document)
return document_postprocess

def __handle_block(self, tag: Union[Tag], uid: str, handle_invisible_table: bool) -> List[LineWithMeta]:
def __handle_block(self, tag: Union[Tag], uid: str, handle_invisible_table: bool, table: Optional[bool] = False) -> List[LineWithMeta]:
tag_uid = hashlib.md5((uid + str(tag.name)).encode()).hexdigest()
assert isinstance(tag, (Tag, str))
if not self.__is_content_tag(tag, handle_invisible_table=handle_invisible_table):
Expand All @@ -79,14 +79,14 @@ def __handle_block(self, tag: Union[Tag], uid: str, handle_invisible_table: bool
elif tag.name in HtmlTags.list_tags:
block_lines = self.__read_list(lst=tag, uid=tag_uid, path_hash=uid, handle_invisible_table=handle_invisible_table)
else:
block_lines = self.__handle_single_tag(tag, uid)
block_lines = self.__handle_single_tag(tag, uid, table)
for line in block_lines:
if not getattr(line.metadata, "html_tag", None):
line.metadata.extend_other_fields({"html_tag": tag.name})
return block_lines

def __handle_single_tag(self, tag: Tag, uid: str) -> List[LineWithMeta]:
text = self.__get_text(tag)
def __handle_single_tag(self, tag: Tag, uid: str, table: Optional[bool] = False) -> List[LineWithMeta]:
text = self.__get_text(tag, table)

if not text or text.isspace():
return []
Expand All @@ -99,7 +99,7 @@ def __handle_single_tag(self, tag: Tag, uid: str) -> List[LineWithMeta]:
line.metadata.extend_other_fields({"html_tag": tag.name})
return [line]

def __read_blocks(self, block: Tag, path_hash: str = "", handle_invisible_table: bool = False) -> List[LineWithMeta]:
def __read_blocks(self, block: Tag, path_hash: str = "", handle_invisible_table: bool = False, table: Optional[bool] = False) -> List[LineWithMeta]:
uid = hashlib.md5((path_hash + str(block.name)).encode()).hexdigest()
if not self.__is_content_tag(block, handle_invisible_table=handle_invisible_table):
return []
Expand All @@ -108,7 +108,7 @@ def __read_blocks(self, block: Tag, path_hash: str = "", handle_invisible_table:

for tag in block:
assert isinstance(tag, (Tag, str))
block_lines = self.__handle_block(tag=tag, uid=uid, handle_invisible_table=handle_invisible_table)
block_lines = self.__handle_block(tag=tag, uid=uid, handle_invisible_table=handle_invisible_table, table=table)
lines.extend(block_lines)
return lines

Expand Down Expand Up @@ -182,8 +182,10 @@ def __handle_list_item(self, item: Tag, item_index: int, list_type: str, path_ha
return lines

# extracts the plain text of a tag; called from __handle_single_tag
def __get_text(self, tag: Tag) -> [str, int, int]:
text = tag.getText() + "\n" if tag.name == "p" else tag.getText()
def __get_text(self, tag: Tag, table: Optional[bool] = False) -> str:
    """
    Return the text content of a tag, with <br> elements turned into line breaks.

    NOTE: every <br> descendant is replaced in place by a newline string, so the
    passed `tag` is modified by this call.

    :param tag: tag whose text should be extracted
    :param table: truthy when the tag is read as part of a table cell; in that case
        no trailing line break is appended after a <p> tag
    :return: the extracted text, or an empty string if the tag yields no text
    """
    # replace the <br> nodes first so that getText() sees them as line breaks
    for br in tag.find_all("br"):
        br.replace_with("\n")

    text = tag.getText()
    # guard before concatenating: the original checked for None only after `+ "\n"`
    if text is None:
        return ""

    # paragraphs end with a line break, except inside table cells
    if tag.name == "p" and not table:
        text += "\n"
    return text

Expand Down Expand Up @@ -218,11 +220,8 @@ def _read_table(self, table: Tag, path_hash: str) -> Table:
for row in table.find_all(HtmlTags.table_rows):
row_lines = []
for cell in row.find_all(HtmlTags.table_cells):
uid = hashlib.md5(cell.name.encode()).hexdigest()
tag_uid = hashlib.md5((uid + cell.getText()).encode()).hexdigest()

cell_with_meta = CellWithMeta(
lines=[self.__make_line(line=cell.getText(), line_type=HierarchyLevel.unknown, uid=tag_uid, path_hash=path_hash)],
lines=self.__read_blocks(block=cell, path_hash=path_hash, handle_invisible_table=False, table=True), # read each cell as block with styles
colspan=cell.colspan if cell.colspan else 1,
rowspan=cell.rowspan if cell.rowspan else 1,
invisible=cell.invisible if cell.invisible else True
Expand Down
4 changes: 2 additions & 2 deletions dedoc/readers/html_reader/html_tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ class HtmlTags:
service_tags = ["script", "style"]

list_items = ["li", "dd", "dt"]
block_tags = ["aside", "article", "body", "div", "footer", "header", "html", "main", "nav", "section", "form", *list_items]
block_tags = ["aside", "article", "body", "div", "blockquote", "footer", "header", "html", "main", "nav", "section", "form", *list_items]
unordered_list = ["ul", "dl", "dir"]
ordered_list = ["ol"]
list_tags = unordered_list + ordered_list
Expand All @@ -20,7 +20,7 @@ class HtmlTags:

styled_tag = bold_tags + italic_tags + underlined_tags + strike_tags + superscript_tags + subscript_tags
simple_text_tags = [
"a", "abbr", "acronym", "applet", "area", "article", "aside", "bdi", "bdo", "big", "blockquote", "canvas", "caption", "center", "cite", "code", "data",
"a", "abbr", "acronym", "applet", "area", "article", "aside", "bdi", "bdo", "big", "canvas", "caption", "center", "cite", "code", "data",
"font", "kbd", "mark", "output", "p", "pre", "q", "samp", "small", "span", "tt", "wbr"
]
text_tags = simple_text_tags + styled_tag
Expand Down
9 changes: 9 additions & 0 deletions tests/api_tests/test_api_format_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,15 @@ def test_html_with_styles_as_attribute(self) -> None:
self.assertIn({"name": "bold", "value": "True", "start": 33, "end": 47}, annotations)
self.assertIn({"name": "bold", "value": "True", "start": 0, "end": 15}, annotations)

def test_html_table_with_styles(self) -> None:
    """Check that style annotations are returned for the lines of table cells."""
    response = self._send_request("table_with_styles.html")
    cells = response["content"]["tables"][0]["cells"]

    # (row index, column index, annotation expected on the first line of that cell)
    expected_annotations = (
        (1, 0, {"start": 0, "end": 6, "name": "bold", "value": "True"}),
        (1, 1, {"start": 0, "end": 10, "name": "italic", "value": "True"}),
        (2, 0, {"start": 0, "end": 10, "name": "linked_text", "value": "some_text"}),
        (2, 1, {"start": 0, "end": 16, "name": "strike", "value": "True"}),
    )
    for row, column, annotation in expected_annotations:
        self.assertIn(annotation, cells[row][column]["lines"][0]["annotations"])

def test_html_font_style_attribute(self) -> None:
file_name = "210.html"
self._send_request(file_name)
Expand Down
18 changes: 18 additions & 0 deletions tests/data/htmls/table_with_styles.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Название тестового документа</title>
</head>
<body>
<table border="1">
<caption>Пример таблицы со стилями</caption>
<tr>
<th>Первый столбец</th>
<th>Второй столбец</th>
</tr>
<tr><td><p><strong>Что-то</strong></p></td><td><div><i>Что-то ещё</i></div></td></tr>
<tr><td><a href="some_text">Ещё что-то</a></td><td><del>Последняя ячейка</del></td></tr>
</table>
</body>
</html>

0 comments on commit 4a418d4

Please sign in to comment.