Skip to content

Commit 8a0f5b5

Browse files
authored
Merge pull request #18 from opendatalab/dev
fix: Fix missing short text in complex code blocks
2 parents 1c44bd5 + 068cd0c commit 8a0f5b5

4 files changed

Lines changed: 19 additions & 1 deletion

File tree

magic_html/extractors/base_extractor.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -760,6 +760,9 @@ def delete_by_link_density(
760760
elemtext = trim(elem.text_content())
761761
result, templist = link_density_test(elem, elemtext, favor_precision)
762762
if result is True and img_div_check(elem):
763+
# 保留table中的链接
764+
if tagname in ['ul', 'li', 'div', 'p'] and ancestor_node_check(elem, ['td']):
765+
continue
763766
deletions.append(elem)
764767
elif backtracking is True and len(templist) > 0: # if?
765768
myelems[elemtext].append(elem)

magic_html/readability_plus.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -448,6 +448,14 @@ def sanitize(self, node, candidates):
448448
reason = "less than 3x <p>s than <input>s"
449449
to_remove = True
450450
elif content_length < MIN_LEN and counts["img"] == 0:
451+
# 代码块内容过短,导致删除
452+
if el.tag in ['code', 'pre']:
453+
continue
454+
if ancestor_node_check(el, ['code', 'pre']):
455+
continue
456+
# 保留table中的链接
457+
if el.tag in ['ul', 'li', 'div', 'p'] and ancestor_node_check(el, ['td']):
458+
continue
451459
reason = (
452460
"too short content length %s without a single image"
453461
% content_length

magic_html/utils.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,13 @@ def fromstring_bytes(htmlobject):
173173
return tree
174174

175175

176+
def ancestor_node_check(node: HtmlElement, tags: list):
    """Report whether ``node`` has an ancestor with one of the given tag names.

    Args:
        node: Element whose ancestor chain is queried via XPath.
        tags: Tag names to look for; they are tried in the order given.

    Returns:
        True as soon as one of the tags matches an ancestor of ``node``,
        otherwise False (which is also the result for an empty ``tags``).
    """
    # ``ancestor::<tag>[1]`` selects the nearest matching ancestor, so a
    # non-empty XPath result means at least one such ancestor exists.
    return any(node.xpath(f'ancestor::{tag}[1]') for tag in tags)
181+
182+
176183
def load_html(htmlobject):
177184
if isinstance(htmlobject, HtmlElement):
178185
return htmlobject

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
Brotli
2-
cchardet
2+
cchardet==2.2.0a2
33
charset_normalizer
44
lxml<5.2.0
55
numpy

0 commit comments

Comments
 (0)