diff --git a/.gitignore b/.gitignore index c062fea..c34fe44 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ build/ dist/ .cache +.venv/ \ No newline at end of file diff --git a/html_similarity/structural_similarity.py b/html_similarity/structural_similarity.py index 1be5c88..3b80c29 100644 --- a/html_similarity/structural_similarity.py +++ b/html_similarity/structural_similarity.py @@ -1,5 +1,5 @@ import difflib -from io import StringIO +from io import BytesIO, StringIO import lxml.html @@ -32,8 +32,8 @@ def structural_similarity(document_1, document_2): :return: int """ try: - document_1 = lxml.html.parse(StringIO(document_1)) - document_2 = lxml.html.parse(StringIO(document_2)) + document_1 = lxml.html.parse(StringIO(document_1) if isinstance(document_1, str) else BytesIO(document_1)) + document_2 = lxml.html.parse(StringIO(document_2) if isinstance(document_2, str) else BytesIO(document_2)) except Exception as e: print(e) return 0 diff --git a/tests/test_similarity.py b/tests/test_similarity.py index e642c80..3ca31c9 100644 --- a/tests/test_similarity.py +++ b/tests/test_similarity.py @@ -1,9 +1,9 @@ from html_similarity import style_similarity from html_similarity.style_similarity import jaccard_similarity +from html_similarity import structural_similarity from .utils import almost_equal - html1 = '''' @@ -44,3 +44,36 @@ def test_jaccard_similarity(): assert almost_equal(0.6666, jaccard_similarity(['a', 'b'], ['a', 'b', 'c'])) assert 0 == jaccard_similarity(['d', 'e'], ['a', 'b', 'c']) assert almost_equal(jaccard_similarity(list(range(1, 1000000)), list(range(1000000 - 10, 2 * 1000000))), 0) + +xhtml_1 = ''' + + + + +

This a title

+