1
1
import re
2
2
from types import NoneType
3
- from typing import List , Union
3
+ from typing import Callable , List , Union
4
4
5
5
from bs4 import BeautifulSoup , Comment , Doctype , NavigableString
6
6
from pydantic import BaseModel
45
45
from jsondoc .models .page import Page
46
46
from jsondoc .models .shared_definitions import Annotations
47
47
from jsondoc .rules import is_block_child_allowed
48
- from jsondoc .utils import generate_id , get_current_time
48
+ from jsondoc .utils import generate_block_id , get_current_time
49
49
50
50
line_beginning_re = re .compile (r"^" , re .MULTILINE )
51
51
whitespace_re = re .compile (r"[\t ]+" )
@@ -307,7 +307,9 @@ def reconcile_to_rich_text(
307
307
308
308
309
309
def reconcile_to_block (
310
- block : BlockBase , children : List [CHILDREN_TYPE ]
310
+ block : BlockBase ,
311
+ children : List [CHILDREN_TYPE ],
312
+ typeid : bool = False ,
311
313
) -> List [CHILDREN_TYPE ]:
312
314
"""
313
315
Given a block and a list of children,
@@ -350,7 +352,7 @@ def reconcile_to_block(
350
352
# Get corresponding field from the block
351
353
block_field = getattr (block , block_type )
352
354
init_kwargs = {
353
- "id" : generate_id ( ),
355
+ "id" : generate_block_id ( typeid = typeid ),
354
356
"created_time" : child .created_time ,
355
357
block_type : type (block_field )(),
356
358
}
@@ -383,26 +385,20 @@ def reconcile_to_block(
383
385
384
386
385
387
class HtmlToJsonDocConverter (object ):
386
- class DefaultOptions :
387
- autolinks = True
388
- code_language = ""
389
- code_language_callback = None
390
- convert = None
391
- default_title = False
392
- keep_inline_images_in = []
393
- strip = None
394
- force_page = False
395
-
396
- class Options (DefaultOptions ):
397
- pass
388
+ class Options (BaseModel ):
389
+ autolinks : bool = True
390
+ code_language : str = ""
391
+ code_language_callback : Callable | None = None
392
+ convert : Callable | None = None
393
+ default_title : bool = False
394
+ keep_inline_images_in : list [str ] = []
395
+ strip : str | None = None
396
+ force_page : bool = False
397
+ typeid : bool = False
398
398
399
399
def __init__ (self , ** options ):
400
- # Create an options dictionary. Use DefaultOptions as a base so that
401
- # it doesn't have to be extended.
402
- self .options = _todict (self .DefaultOptions )
403
- self .options .update (_todict (self .Options ))
404
- self .options .update (options )
405
- if self .options ["strip" ] is not None and self .options ["convert" ] is not None :
400
+ self .options = self .Options (** options )
401
+ if self .options .strip is not None and self .options .convert is not None :
406
402
raise ValueError (
407
403
"You may specify either tags to strip or tags to convert, but not both."
408
404
)
@@ -417,7 +413,7 @@ def convert_soup(self, soup: BeautifulSoup) -> Page | BlockBase | List[BlockBase
417
413
is_page = self ._is_soup_page (soup )
418
414
419
415
ret = None
420
- if is_page or self .options [ " force_page" ] :
416
+ if is_page or self .options . force_page :
421
417
title = self ._get_html_title (soup )
422
418
# Ensure that children is a list
423
419
if not isinstance (children , list ):
@@ -427,6 +423,7 @@ def convert_soup(self, soup: BeautifulSoup) -> Page | BlockBase | List[BlockBase
427
423
ret = create_page (
428
424
title = title ,
429
425
children = children ,
426
+ typeid = self .options .typeid ,
430
427
)
431
428
else :
432
429
ret = children
@@ -526,7 +523,11 @@ def is_nested_node(el):
526
523
if current_level_object is None :
527
524
objects = children_objects
528
525
elif isinstance (current_level_object , BlockBase ):
529
- objects = reconcile_to_block (current_level_object , children_objects )
526
+ objects = reconcile_to_block (
527
+ current_level_object ,
528
+ children_objects ,
529
+ typeid = self .options .typeid ,
530
+ )
530
531
elif isinstance (current_level_object , RichTextBase ):
531
532
objects = reconcile_to_rich_text (current_level_object , children_objects )
532
533
else :
@@ -615,8 +616,8 @@ def process_text(self, el):
615
616
616
617
def should_convert_tag (self , tag ):
617
618
tag = tag .lower ()
618
- strip = self .options [ " strip" ]
619
- convert = self .options [ " convert" ]
619
+ strip = self .options . strip
620
+ convert = self .options . convert
620
621
if strip is not None :
621
622
return tag not in strip
622
623
elif convert is not None :
@@ -629,7 +630,7 @@ def convert_a(self, el, convert_as_inline):
629
630
return ConvertOutput (main_object = create_rich_text (url = href ))
630
631
631
632
convert_b = abstract_inline_conversion (
632
- lambda self : Annotations (bold = True ) # 2 * self.options[" strong_em_symbol"]
633
+ lambda self : Annotations (bold = True ) # 2 * self.options. strong_em_symbol
633
634
)
634
635
635
636
def convert_blockquote (self , el , convert_as_inline ):
@@ -646,7 +647,11 @@ def convert_blockquote(self, el, convert_as_inline):
646
647
return ConvertOutput (main_object = create_rich_text ())
647
648
648
649
# TODO: If text has newlines, split them and add 2, 3, ... lines as children
649
- return ConvertOutput (main_object = create_quote_block ())
650
+ return ConvertOutput (
651
+ main_object = create_quote_block (
652
+ typeid = self .options .typeid ,
653
+ )
654
+ )
650
655
651
656
def convert_br (self , el , convert_as_inline ):
652
657
if convert_as_inline :
@@ -683,40 +688,48 @@ def convert_h1(self, el, convert_as_inline):
683
688
if convert_as_inline :
684
689
return ConvertOutput (main_object = create_rich_text ())
685
690
686
- return ConvertOutput (main_object = create_h1_block ())
691
+ return ConvertOutput (main_object = create_h1_block (typeid = self . options . typeid ))
687
692
688
693
def convert_h2 (self , el , convert_as_inline ):
689
694
if convert_as_inline :
690
695
return ConvertOutput (main_object = create_rich_text ())
691
696
692
- return ConvertOutput (main_object = create_h2_block ())
697
+ return ConvertOutput (main_object = create_h2_block (typeid = self . options . typeid ))
693
698
694
699
def convert_h3 (self , el , convert_as_inline ):
695
700
if convert_as_inline :
696
701
return ConvertOutput (main_object = create_rich_text ())
697
702
698
- return ConvertOutput (main_object = create_h3_block ())
703
+ return ConvertOutput (main_object = create_h3_block (typeid = self . options . typeid ))
699
704
700
705
def convert_h4 (self , el , convert_as_inline ):
701
706
if convert_as_inline :
702
707
return ConvertOutput (main_object = create_rich_text ())
703
708
704
- return ConvertOutput (main_object = create_paragraph_block ())
709
+ return ConvertOutput (
710
+ main_object = create_paragraph_block (typeid = self .options .typeid )
711
+ )
705
712
706
713
def convert_h5 (self , el , convert_as_inline ):
707
714
if convert_as_inline :
708
715
return ConvertOutput (main_object = create_rich_text ())
709
716
710
- return ConvertOutput (main_object = create_paragraph_block ())
717
+ return ConvertOutput (
718
+ main_object = create_paragraph_block (typeid = self .options .typeid )
719
+ )
711
720
712
721
def convert_h6 (self , el , convert_as_inline ):
713
722
if convert_as_inline :
714
723
return ConvertOutput (main_object = create_rich_text ())
715
724
716
- return ConvertOutput (main_object = create_paragraph_block ())
725
+ return ConvertOutput (
726
+ main_object = create_paragraph_block (typeid = self .options .typeid )
727
+ )
717
728
718
729
def convert_hr (self , el , convert_as_inline ):
719
- return ConvertOutput (main_object = create_divider_block ())
730
+ return ConvertOutput (
731
+ main_object = create_divider_block (typeid = self .options .typeid )
732
+ )
720
733
721
734
convert_i = convert_em
722
735
@@ -730,13 +743,14 @@ def convert_img(self, el, convert_as_inline):
730
743
# title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
731
744
if (
732
745
convert_as_inline
733
- and el .parent .name not in self .options [ " keep_inline_images_in" ]
746
+ and el .parent .name not in self .options . keep_inline_images_in
734
747
):
735
748
return alt
736
749
737
750
return ConvertOutput (
738
751
main_object = create_image_block (
739
752
url = src ,
753
+ typeid = self .options .typeid ,
740
754
# alt is not supported in JSON-DOC yet
741
755
# caption=alt,
742
756
)
@@ -755,28 +769,38 @@ def convert_list(self, el, convert_as_inline):
755
769
def convert_li (self , el , convert_as_inline ):
756
770
parent = el .parent
757
771
if parent is not None and parent .name == "ol" :
758
- return ConvertOutput (main_object = create_numbered_list_item_block ())
772
+ return ConvertOutput (
773
+ main_object = create_numbered_list_item_block (typeid = self .options .typeid )
774
+ )
759
775
else :
760
- return ConvertOutput (main_object = create_bullet_list_item_block ())
776
+ return ConvertOutput (
777
+ main_object = create_bullet_list_item_block (typeid = self .options .typeid )
778
+ )
761
779
762
780
def convert_p (self , el , convert_as_inline ):
763
781
if convert_as_inline :
764
782
return ConvertOutput (main_object = create_rich_text ())
765
783
766
- return ConvertOutput (main_object = create_paragraph_block ())
784
+ return ConvertOutput (
785
+ main_object = create_paragraph_block (typeid = self .options .typeid )
786
+ )
767
787
768
788
def convert_pre (self , el , convert_as_inline ):
769
789
text = el .get_text ()
770
790
771
791
if not text :
772
792
return None
773
793
774
- code_language = self .options [ " code_language" ]
794
+ code_language = self .options . code_language
775
795
776
- if self .options [ " code_language_callback" ] :
777
- code_language = self .options [ " code_language_callback" ] (el ) or code_language
796
+ if self .options . code_language_callback :
797
+ code_language = self .options . code_language_callback (el ) or code_language
778
798
779
- return ConvertOutput (main_object = create_code_block (language = code_language ))
799
+ return ConvertOutput (
800
+ main_object = create_code_block (
801
+ language = code_language , typeid = self .options .typeid
802
+ )
803
+ )
780
804
781
805
def convert_script (self , el , convert_as_inline ):
782
806
return None
@@ -793,19 +817,19 @@ def convert_style(self, el, convert_as_inline):
793
817
# Notion does not have an alternative for sub and sup tags
794
818
convert_sub = abstract_inline_conversion (
795
819
lambda self : Annotations ()
796
- # self.options[" sub_symbol"] ,
820
+ # self.options. sub_symbol,
797
821
)
798
822
799
823
convert_sup = abstract_inline_conversion (
800
824
lambda self : Annotations ()
801
- # self.options[" sup_symbol"] ,
825
+ # self.options. sup_symbol,
802
826
)
803
827
804
828
def convert_table (self , el , convert_as_inline ):
805
829
has_column_header = html_table_has_header_row (el )
806
830
return ConvertOutput (
807
831
main_object = create_table_block (
808
- has_column_header = has_column_header ,
832
+ has_column_header = has_column_header , typeid = self . options . typeid
809
833
)
810
834
)
811
835
@@ -841,10 +865,15 @@ def convert_td(self, el, convert_as_inline):
841
865
paragraph_block.rich_text will be extracted to form table_row.cells.
842
866
"""
843
867
# Get colspan
844
- colspan = el .get ("colspan" , 1 )
868
+ colspan = el .get ("colspan" , "1" )
845
869
# Get rowspan
846
870
# rowspan = el.get("rowspan", 1)
847
871
# We need to come up with a much different way to handle rowspan
872
+ if not isinstance (colspan , int ):
873
+ try :
874
+ colspan = int (colspan )
875
+ except ValueError :
876
+ colspan = 1
848
877
849
878
next_objects = []
850
879
if colspan > 1 :
@@ -863,7 +892,9 @@ def convert_tr(self, el, convert_as_inline):
863
892
"""
864
893
Table row
865
894
"""
866
- return ConvertOutput (main_object = create_table_row_block ())
895
+ return ConvertOutput (
896
+ main_object = create_table_row_block (typeid = self .options .typeid )
897
+ )
867
898
868
899
869
900
def html_to_jsondoc (html : str | bytes , ** options ) -> Page | BlockBase | List [BlockBase ]:
0 commit comments