Skip to content

Commit c1af73b

Browse files
authored
Extract rich text from blocks (#8)
* Extract rich text from blocks, wip
* Add TextWithBackref
* Checkpoint
* Checkpoint
* Extract blocks util
* Checkpoint
* get_intersecting_backrefs
* Handle unnecessary whitespace in html conversion
* Fix colspan issue
* Add TypeID generation for block and page ids, refactor HtmlToJsonDocConverter options to use Pydantic
1 parent 24a2c75 commit c1af73b

File tree

9 files changed

+425
-66
lines changed

9 files changed

+425
-66
lines changed

.gitignore

+2-1
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,5 @@
55
__pycache__
66
build/
77
*.docx
8-
*.pptx
8+
*.pptx
9+
scratch/

jsondoc/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+

jsondoc/convert/html.py

+79-48
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import re
22
from types import NoneType
3-
from typing import List, Union
3+
from typing import Callable, List, Union
44

55
from bs4 import BeautifulSoup, Comment, Doctype, NavigableString
66
from pydantic import BaseModel
@@ -45,7 +45,7 @@
4545
from jsondoc.models.page import Page
4646
from jsondoc.models.shared_definitions import Annotations
4747
from jsondoc.rules import is_block_child_allowed
48-
from jsondoc.utils import generate_id, get_current_time
48+
from jsondoc.utils import generate_block_id, get_current_time
4949

5050
line_beginning_re = re.compile(r"^", re.MULTILINE)
5151
whitespace_re = re.compile(r"[\t ]+")
@@ -307,7 +307,9 @@ def reconcile_to_rich_text(
307307

308308

309309
def reconcile_to_block(
310-
block: BlockBase, children: List[CHILDREN_TYPE]
310+
block: BlockBase,
311+
children: List[CHILDREN_TYPE],
312+
typeid: bool = False,
311313
) -> List[CHILDREN_TYPE]:
312314
"""
313315
Given a block and a list of children,
@@ -350,7 +352,7 @@ def reconcile_to_block(
350352
# Get corresponding field from the block
351353
block_field = getattr(block, block_type)
352354
init_kwargs = {
353-
"id": generate_id(),
355+
"id": generate_block_id(typeid=typeid),
354356
"created_time": child.created_time,
355357
block_type: type(block_field)(),
356358
}
@@ -383,26 +385,20 @@ def reconcile_to_block(
383385

384386

385387
class HtmlToJsonDocConverter(object):
386-
class DefaultOptions:
387-
autolinks = True
388-
code_language = ""
389-
code_language_callback = None
390-
convert = None
391-
default_title = False
392-
keep_inline_images_in = []
393-
strip = None
394-
force_page = False
395-
396-
class Options(DefaultOptions):
397-
pass
388+
class Options(BaseModel):
389+
autolinks: bool = True
390+
code_language: str = ""
391+
code_language_callback: Callable | None = None
392+
convert: Callable | None = None
393+
default_title: bool = False
394+
keep_inline_images_in: list[str] = []
395+
strip: str | None = None
396+
force_page: bool = False
397+
typeid: bool = False
398398

399399
def __init__(self, **options):
400-
# Create an options dictionary. Use DefaultOptions as a base so that
401-
# it doesn't have to be extended.
402-
self.options = _todict(self.DefaultOptions)
403-
self.options.update(_todict(self.Options))
404-
self.options.update(options)
405-
if self.options["strip"] is not None and self.options["convert"] is not None:
400+
self.options = self.Options(**options)
401+
if self.options.strip is not None and self.options.convert is not None:
406402
raise ValueError(
407403
"You may specify either tags to strip or tags to convert, but not both."
408404
)
@@ -417,7 +413,7 @@ def convert_soup(self, soup: BeautifulSoup) -> Page | BlockBase | List[BlockBase
417413
is_page = self._is_soup_page(soup)
418414

419415
ret = None
420-
if is_page or self.options["force_page"]:
416+
if is_page or self.options.force_page:
421417
title = self._get_html_title(soup)
422418
# Ensure that children is a list
423419
if not isinstance(children, list):
@@ -427,6 +423,7 @@ def convert_soup(self, soup: BeautifulSoup) -> Page | BlockBase | List[BlockBase
427423
ret = create_page(
428424
title=title,
429425
children=children,
426+
typeid=self.options.typeid,
430427
)
431428
else:
432429
ret = children
@@ -526,7 +523,11 @@ def is_nested_node(el):
526523
if current_level_object is None:
527524
objects = children_objects
528525
elif isinstance(current_level_object, BlockBase):
529-
objects = reconcile_to_block(current_level_object, children_objects)
526+
objects = reconcile_to_block(
527+
current_level_object,
528+
children_objects,
529+
typeid=self.options.typeid,
530+
)
530531
elif isinstance(current_level_object, RichTextBase):
531532
objects = reconcile_to_rich_text(current_level_object, children_objects)
532533
else:
@@ -615,8 +616,8 @@ def process_text(self, el):
615616

616617
def should_convert_tag(self, tag):
617618
tag = tag.lower()
618-
strip = self.options["strip"]
619-
convert = self.options["convert"]
619+
strip = self.options.strip
620+
convert = self.options.convert
620621
if strip is not None:
621622
return tag not in strip
622623
elif convert is not None:
@@ -629,7 +630,7 @@ def convert_a(self, el, convert_as_inline):
629630
return ConvertOutput(main_object=create_rich_text(url=href))
630631

631632
convert_b = abstract_inline_conversion(
632-
lambda self: Annotations(bold=True) # 2 * self.options["strong_em_symbol"]
633+
lambda self: Annotations(bold=True) # 2 * self.options.strong_em_symbol
633634
)
634635

635636
def convert_blockquote(self, el, convert_as_inline):
@@ -646,7 +647,11 @@ def convert_blockquote(self, el, convert_as_inline):
646647
return ConvertOutput(main_object=create_rich_text())
647648

648649
# TODO: If text has newlines, split them and add 2, 3, ... lines as children
649-
return ConvertOutput(main_object=create_quote_block())
650+
return ConvertOutput(
651+
main_object=create_quote_block(
652+
typeid=self.options.typeid,
653+
)
654+
)
650655

651656
def convert_br(self, el, convert_as_inline):
652657
if convert_as_inline:
@@ -683,40 +688,48 @@ def convert_h1(self, el, convert_as_inline):
683688
if convert_as_inline:
684689
return ConvertOutput(main_object=create_rich_text())
685690

686-
return ConvertOutput(main_object=create_h1_block())
691+
return ConvertOutput(main_object=create_h1_block(typeid=self.options.typeid))
687692

688693
def convert_h2(self, el, convert_as_inline):
689694
if convert_as_inline:
690695
return ConvertOutput(main_object=create_rich_text())
691696

692-
return ConvertOutput(main_object=create_h2_block())
697+
return ConvertOutput(main_object=create_h2_block(typeid=self.options.typeid))
693698

694699
def convert_h3(self, el, convert_as_inline):
695700
if convert_as_inline:
696701
return ConvertOutput(main_object=create_rich_text())
697702

698-
return ConvertOutput(main_object=create_h3_block())
703+
return ConvertOutput(main_object=create_h3_block(typeid=self.options.typeid))
699704

700705
def convert_h4(self, el, convert_as_inline):
701706
if convert_as_inline:
702707
return ConvertOutput(main_object=create_rich_text())
703708

704-
return ConvertOutput(main_object=create_paragraph_block())
709+
return ConvertOutput(
710+
main_object=create_paragraph_block(typeid=self.options.typeid)
711+
)
705712

706713
def convert_h5(self, el, convert_as_inline):
707714
if convert_as_inline:
708715
return ConvertOutput(main_object=create_rich_text())
709716

710-
return ConvertOutput(main_object=create_paragraph_block())
717+
return ConvertOutput(
718+
main_object=create_paragraph_block(typeid=self.options.typeid)
719+
)
711720

712721
def convert_h6(self, el, convert_as_inline):
713722
if convert_as_inline:
714723
return ConvertOutput(main_object=create_rich_text())
715724

716-
return ConvertOutput(main_object=create_paragraph_block())
725+
return ConvertOutput(
726+
main_object=create_paragraph_block(typeid=self.options.typeid)
727+
)
717728

718729
def convert_hr(self, el, convert_as_inline):
719-
return ConvertOutput(main_object=create_divider_block())
730+
return ConvertOutput(
731+
main_object=create_divider_block(typeid=self.options.typeid)
732+
)
720733

721734
convert_i = convert_em
722735

@@ -730,13 +743,14 @@ def convert_img(self, el, convert_as_inline):
730743
# title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
731744
if (
732745
convert_as_inline
733-
and el.parent.name not in self.options["keep_inline_images_in"]
746+
and el.parent.name not in self.options.keep_inline_images_in
734747
):
735748
return alt
736749

737750
return ConvertOutput(
738751
main_object=create_image_block(
739752
url=src,
753+
typeid=self.options.typeid,
740754
# alt is not supported in JSON-DOC yet
741755
# caption=alt,
742756
)
@@ -755,28 +769,38 @@ def convert_list(self, el, convert_as_inline):
755769
def convert_li(self, el, convert_as_inline):
756770
parent = el.parent
757771
if parent is not None and parent.name == "ol":
758-
return ConvertOutput(main_object=create_numbered_list_item_block())
772+
return ConvertOutput(
773+
main_object=create_numbered_list_item_block(typeid=self.options.typeid)
774+
)
759775
else:
760-
return ConvertOutput(main_object=create_bullet_list_item_block())
776+
return ConvertOutput(
777+
main_object=create_bullet_list_item_block(typeid=self.options.typeid)
778+
)
761779

762780
def convert_p(self, el, convert_as_inline):
763781
if convert_as_inline:
764782
return ConvertOutput(main_object=create_rich_text())
765783

766-
return ConvertOutput(main_object=create_paragraph_block())
784+
return ConvertOutput(
785+
main_object=create_paragraph_block(typeid=self.options.typeid)
786+
)
767787

768788
def convert_pre(self, el, convert_as_inline):
769789
text = el.get_text()
770790

771791
if not text:
772792
return None
773793

774-
code_language = self.options["code_language"]
794+
code_language = self.options.code_language
775795

776-
if self.options["code_language_callback"]:
777-
code_language = self.options["code_language_callback"](el) or code_language
796+
if self.options.code_language_callback:
797+
code_language = self.options.code_language_callback(el) or code_language
778798

779-
return ConvertOutput(main_object=create_code_block(language=code_language))
799+
return ConvertOutput(
800+
main_object=create_code_block(
801+
language=code_language, typeid=self.options.typeid
802+
)
803+
)
780804

781805
def convert_script(self, el, convert_as_inline):
782806
return None
@@ -793,19 +817,19 @@ def convert_style(self, el, convert_as_inline):
793817
# Notion does not have an alternative for sub and sup tags
794818
convert_sub = abstract_inline_conversion(
795819
lambda self: Annotations()
796-
# self.options["sub_symbol"],
820+
# self.options.sub_symbol,
797821
)
798822

799823
convert_sup = abstract_inline_conversion(
800824
lambda self: Annotations()
801-
# self.options["sup_symbol"],
825+
# self.options.sup_symbol,
802826
)
803827

804828
def convert_table(self, el, convert_as_inline):
805829
has_column_header = html_table_has_header_row(el)
806830
return ConvertOutput(
807831
main_object=create_table_block(
808-
has_column_header=has_column_header,
832+
has_column_header=has_column_header, typeid=self.options.typeid
809833
)
810834
)
811835

@@ -841,10 +865,15 @@ def convert_td(self, el, convert_as_inline):
841865
paragraph_block.rich_text will be extracted to form table_row.cells.
842866
"""
843867
# Get colspan
844-
colspan = el.get("colspan", 1)
868+
colspan = el.get("colspan", "1")
845869
# Get rowspan
846870
# rowspan = el.get("rowspan", 1)
847871
# We need to come up with a much different way to handle rowspan
872+
if not isinstance(colspan, int):
873+
try:
874+
colspan = int(colspan)
875+
except ValueError:
876+
colspan = 1
848877

849878
next_objects = []
850879
if colspan > 1:
@@ -863,7 +892,9 @@ def convert_tr(self, el, convert_as_inline):
863892
"""
864893
Table row
865894
"""
866-
return ConvertOutput(main_object=create_table_row_block())
895+
return ConvertOutput(
896+
main_object=create_table_row_block(typeid=self.options.typeid)
897+
)
867898

868899

869900
def html_to_jsondoc(html: str | bytes, **options) -> Page | BlockBase | List[BlockBase]:

0 commit comments

Comments
 (0)