11from enum import Enum , auto
2- from typing import Iterable , List , Optional , Sequence , Tuple , Union
2+ from typing import Callable , Dict , Iterable , List , Optional , Sequence , Tuple , Union
33
4+ from ..scripture .verse_ref import IgnoreSegmentsVerseRef , VerseRef , Versification
45from .scripture_ref import ScriptureRef
56from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler , ScriptureTextType
67from .usfm_parser_state import UsfmParserState
1011from .usfm_tokenizer import UsfmTokenizer
1112from .usfm_update_block import UsfmUpdateBlock
1213from .usfm_update_block_element import UsfmUpdateBlockElement , UsfmUpdateBlockElementType
13- from .usfm_update_block_handler import UsfmUpdateBlockHandler
14+ from .usfm_update_block_handler import UsfmUpdateBlockHandler , UsfmUpdateBlockHandlerError
1415
1516
1617class UpdateUsfmTextBehavior (Enum ):
@@ -24,6 +25,12 @@ class UpdateUsfmMarkerBehavior(Enum):
2425 STRIP = auto ()
2526
2627
class _RowInfo:
    """Bookkeeping record for one update row.

    Holds the index the row was created with and an ``is_consumed`` flag that
    starts out as ``False``.
    """

    def __init__(self, row_index: int):
        # A freshly created record has not been consumed yet.
        self.is_consumed = False
        self.row_index = row_index
32+
33+
2734class UpdateUsfmRow :
2835 def __init__ (self , refs : Sequence [ScriptureRef ], text : str , metadata : Optional [dict [str , object ]] = None ):
2936 self .refs = refs
@@ -43,9 +50,19 @@ def __init__(
4350 preserve_paragraph_styles : Optional [Union [Iterable [str ], str ]] = None ,
4451 update_block_handlers : Optional [Iterable [UsfmUpdateBlockHandler ]] = None ,
4552 remarks : Optional [Iterable [str ]] = None ,
53+ error_handler : Optional [Callable [[UsfmUpdateBlockHandlerError ], bool ]] = None ,
54+ compare_segments : bool = False ,
4655 ) -> None :
4756 super ().__init__ ()
4857 self ._rows = rows or []
58+ self ._verse_rows : List [int ] = []
59+ self ._verse_row_index = 0
60+ self ._verse_rows_map : Dict [VerseRef , List [_RowInfo ]] = {}
61+ self ._verse_rows_ref = VerseRef ()
62+ if len (self ._rows ) > 0 :
63+ self ._update_rows_versification : Versification = self ._rows [0 ].refs [0 ].versification
64+ else :
65+ self ._update_rows_versification = Versification .get_builtin ("English" )
4966 self ._tokens : List [UsfmToken ] = []
5067 self ._updated_text : List [UsfmToken ] = []
5168 self ._update_block_stack : list [UsfmUpdateBlock ] = []
@@ -65,6 +82,11 @@ def __init__(
6582 self ._remarks = []
6683 else :
6784 self ._remarks = list (remarks )
85+ if error_handler is None :
86+ self ._error_handler = lambda _ : False
87+ else :
88+ self ._error_handler = error_handler
89+ self ._compare_segments = compare_segments
6890 self ._text_behavior = text_behavior
6991 self ._paragraph_behavior = paragraph_behavior
7092 self ._embed_behavior = embed_behavior
@@ -82,6 +104,10 @@ def end_usfm(self, state: UsfmParserState) -> None:
82104 super ().end_usfm (state )
83105
84106 def start_book (self , state : UsfmParserState , marker : str , code : str ) -> None :
107+ self ._verse_rows_ref = state .verse_ref .copy ()
108+ self ._update_verse_rows_map ()
109+ self ._update_verse_rows ()
110+
85111 self ._collect_readonly_tokens (state )
86112 self ._update_block_stack .append (UsfmUpdateBlock ())
87113 start_book_tokens : List [UsfmToken ] = []
@@ -108,7 +134,7 @@ def start_para(
108134 ) -> None :
109135 if state .is_verse_text :
110136 # Only strip paragraph markers in a verse
111- if self ._paragraph_behavior == UpdateUsfmMarkerBehavior .PRESERVE :
137+ if self ._paragraph_behavior == UpdateUsfmMarkerBehavior .PRESERVE and not self . _duplicate_verse :
112138 self ._collect_updatable_tokens (state )
113139 else :
114140 self ._skip_updatable_tokens (state )
@@ -148,6 +174,11 @@ def chapter(
148174 ) -> None :
149175 self ._use_updated_text ()
150176
177+ if self ._verse_rows_ref != state .verse_ref :
178+ self ._verse_rows_ref = state .verse_ref .copy ()
179+ self ._update_verse_rows_map ()
180+ self ._update_verse_rows ()
181+
151182 super ().chapter (state , number , marker , alt_number , pub_number )
152183
153184 self ._collect_readonly_tokens (state )
@@ -179,14 +210,23 @@ def verse(
179210 if last_paragraph is not None :
180211 last_paragraph .marked_for_removal = False
181212
182- super ().verse (state , number , marker , alt_number , pub_number )
213+ if self ._verse_rows_ref != state .verse_ref :
214+ self ._verse_rows_ref = state .verse_ref .copy ()
215+ self ._update_verse_rows ()
183216
184- self ._collect_readonly_tokens (state )
217+ super ().verse (state , number , marker , alt_number , pub_number )
218+ if self ._duplicate_verse :
219+ self ._skip_updatable_tokens (state )
220+ else :
221+ self ._collect_readonly_tokens (state )
185222
def start_note(self, state: UsfmParserState, marker: str, caller: str, category: str) -> None:
    """Handle the start of a note.

    The base handler is invoked first. Afterwards the pending tokens are
    skipped when ``self._duplicate_verse`` is set, and collected as updatable
    tokens otherwise.
    """
    super().start_note(state, marker, caller, category)

    if self._duplicate_verse:
        self._skip_updatable_tokens(state)
        return
    self._collect_updatable_tokens(state)
190230
191231 def end_note (self , state : UsfmParserState , marker : str , closed : bool ) -> None :
192232 if closed :
@@ -219,15 +259,14 @@ def end_char(
219259 attributes : Sequence [UsfmAttribute ],
220260 closed : bool ,
221261 ) -> None :
222- if closed :
223- if self ._current_text_type == ScriptureTextType .EMBED :
224- self ._collect_updatable_tokens (state )
262+ if self ._current_text_type == ScriptureTextType .EMBED :
263+ self ._collect_updatable_tokens (state )
264+ else :
265+ self ._replace_with_new_tokens (state )
266+ if self ._style_behavior == UpdateUsfmMarkerBehavior .STRIP :
267+ self ._skip_updatable_tokens (state )
225268 else :
226- self ._replace_with_new_tokens (state )
227- if self ._style_behavior == UpdateUsfmMarkerBehavior .STRIP :
228- self ._skip_updatable_tokens (state )
229- else :
230- self ._collect_updatable_tokens (state )
269+ self ._collect_updatable_tokens (state )
231270
232271 super ().end_char (state , marker , attributes , closed )
233272
@@ -242,7 +281,9 @@ def ref(self, state: UsfmParserState, marker: str, display: str, target: str) ->
242281 def text (self , state : UsfmParserState , text : str ) -> None :
243282 super ().text (state , text )
244283
245- if self ._replace_with_new_tokens (state ):
284+ if self ._replace_with_new_tokens (state ) or (
285+ self ._duplicate_verse and self ._current_text_type == ScriptureTextType .VERSE
286+ ):
246287 self ._skip_updatable_tokens (state )
247288 else :
248289 self ._collect_updatable_tokens (state )
@@ -292,11 +333,10 @@ def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str:
292333 for remark in self ._remarks :
293334 remark_tokens .append (UsfmToken (UsfmTokenType .PARAGRAPH , "rem" ))
294335 remark_tokens .append (UsfmToken (UsfmTokenType .TEXT , text = remark ))
295- if len (tokens ) > 0 and tokens [0 ].marker == "id" :
296- index = 1
297- if len (tokens ) > 1 and tokens [1 ].type == UsfmTokenType .TEXT :
298- index = 2
299- while tokens [index ].marker == "rem" :
336+ if len (tokens ) > 0 :
337+ index = 0
338+ markers_to_skip = {"id" , "ide" , "rem" }
339+ while tokens [index ].marker in markers_to_skip :
300340 index += 1
301341 if len (tokens ) > index and tokens [index ].type == UsfmTokenType .TEXT :
302342 index += 1
@@ -308,13 +348,15 @@ def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str]
308348 row_texts : List [str ] = []
309349 row_metadata = None
310350 source_index : int = 0
311- while self ._row_index < len (self ._rows ) and source_index < len (seg_scr_refs ):
351+ while self ._verse_row_index < len (self ._verse_rows ) and source_index < len (seg_scr_refs ):
312352 compare : int = 0
313- row = self ._rows [self ._row_index ]
353+ row = self ._rows [self ._verse_rows [ self . _verse_row_index ] ]
314354 row_scr_refs , text , metadata = row .refs , row .text , row .metadata
315355 for row_scr_ref in row_scr_refs :
316356 while source_index < len (seg_scr_refs ):
317- compare = row_scr_ref .compare_to (seg_scr_refs [source_index ], compare_segments = False )
357+ compare = row_scr_ref .compare_to (
358+ seg_scr_refs [source_index ], compare_segments = self ._compare_segments
359+ )
318360 if compare > 0 :
319361 # row is ahead of source, increment source
320362 source_index += 1
@@ -328,7 +370,7 @@ def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str]
328370 break
329371 if compare <= 0 :
330372 # source is ahead of row, increment row
331- self ._row_index += 1
373+ self ._verse_row_index += 1
332374 return row_texts , row_metadata
333375
334376 def _collect_updatable_tokens (self , state : UsfmParserState ) -> None :
@@ -418,7 +460,13 @@ def _end_update_block(self, state: UsfmParserState, scripture_refs: Sequence[Scr
418460 para_elems .append (update_block .pop ())
419461
420462 for handler in self ._update_block_handlers :
421- update_block = handler .process_block (update_block )
463+ try :
464+ update_block = handler .process_block (update_block )
465+ except UsfmUpdateBlockHandlerError as e :
466+ should_continue = self ._error_handler (e )
467+ if not should_continue :
468+ raise
469+
422470 tokens = update_block .get_tokens ()
423471 for elem in reversed (para_elems ):
424472 tokens .extend (elem .get_tokens ())
@@ -449,6 +497,41 @@ def _pop_new_tokens(self) -> None:
def _is_in_preserved_paragraph(self, state: UsfmParserState) -> bool:
    """Tell whether the current paragraph marker is one of the preserved paragraph styles.

    Returns ``False`` when ``state.para_tag`` is ``None``.
    """
    para_tag = state.para_tag
    if para_tag is None:
        return False
    return para_tag.marker in self._preserve_paragraph_styles
451499
def _update_verse_rows_map(self) -> None:
    """Rebuild the verse-to-rows lookup for the chapter of ``self._verse_rows_ref``.

    The map is emptied, then rows are read starting at ``self._row_index`` for
    as long as the first reference of the row has the same chapter number as
    ``self._verse_rows_ref``. Every reference of such a row becomes a key that
    points at one shared ``_RowInfo`` for that row. ``self._row_index`` is left
    on the first row that was not read.
    """
    rows = self._rows
    verse_rows_map = self._verse_rows_map
    verse_rows_map.clear()

    while self._row_index < len(rows):
        current = rows[self._row_index]
        if current.refs[0].chapter_num != self._verse_rows_ref.chapter_num:
            break

        # One record per row, shared by every verse key the row is filed under.
        info = _RowInfo(self._row_index)
        for scr_ref in current.refs:
            key = scr_ref.verse_ref
            if not self._compare_segments:
                key = IgnoreSegmentsVerseRef(key)
            verse_rows_map.setdefault(key, []).append(info)

        self._row_index += 1
515+
def _update_verse_rows(self) -> None:
    """Recompute ``self._verse_rows`` for the verse held in ``self._verse_rows_ref``.

    ``self._verse_rows`` is cleared in place and ``self._verse_row_index`` is
    reset to 0. Each verse produced by ``all_verses()`` is then looked up in
    ``self._verse_rows_map``; the index of every row found that is not yet
    consumed is appended, and that row is flagged as consumed.
    """
    # The lookup is a dictionary and so matches keys by equality. The source
    # verse ref therefore has to be converted to the versification of the rows
    # first. A SortedList would make the conversion unnecessary, but it would
    # be less efficient.
    source_ref = self._verse_rows_ref.copy()
    source_ref.change_versification(self._update_rows_versification)

    self._verse_row_index = 0
    self._verse_rows.clear()

    for verse in source_ref.all_verses():
        key = verse if self._compare_segments else IgnoreSegmentsVerseRef(verse)
        # A missing key and an empty list are treated alike: nothing to add.
        for info in self._verse_rows_map.get(key) or ():
            if info.is_consumed:
                continue
            self._verse_rows.append(info.row_index)
            info.is_consumed = True
534+
452535
453536def _is_nonverse_paragraph (state : UsfmParserState , element : UsfmUpdateBlockElement ) -> bool :
454537 if element .type != UsfmUpdateBlockElementType .PARAGRAPH :
0 commit comments