From 60e9adddedea476f5c63dcc7d1d6a94b3028cec5 Mon Sep 17 00:00:00 2001 From: Nicolas Berthier Date: Tue, 19 Sep 2023 08:52:45 +0200 Subject: [PATCH 1/4] Improve representation and hash of virtual limits in source overlay module --- src/lsp/cobol_preproc/src_overlay.ml | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/lsp/cobol_preproc/src_overlay.ml b/src/lsp/cobol_preproc/src_overlay.ml index e073ff002..7b40a317a 100644 --- a/src/lsp/cobol_preproc/src_overlay.ml +++ b/src/lsp/cobol_preproc/src_overlay.ml @@ -33,19 +33,20 @@ module Limit = struct type t = limit let make_virtual: unit -> t = - let id = ref 0 in + let id = ref (-1) in (* Actually start at -2 (-1 is used in Lexing.dummy) *) fun () -> decr id; - Lexing.{ dummy_pos with pos_lnum = !id } + Lexing.{ dummy_pos with pos_cnum = !id } - let is_virtual: t -> bool = - fun l -> l.Lexing.pos_lnum < 0 + let is_virtual (l: t) : bool = + l.pos_cnum < (-1) + + let equal (l1: t) (l2: t) = + l1.pos_cnum == l2.pos_cnum && + l1.pos_fname = l2.pos_fname + + let hash (l: limit) = l.pos_cnum - (* Structural equality is required below, to deal with cases where we - construct the limits of a token several times, for instance at the - beginning of recovery. *) - let equal = (=) - let hash = Hashtbl.hash end (** Weak hashtable where keys are overlay limits (internal) *) From 4e9b3b9a63e1b8bd048fcd9f0f1cd3535992c315 Mon Sep 17 00:00:00 2001 From: Nicolas Berthier Date: Tue, 19 Sep 2023 09:00:23 +0200 Subject: [PATCH 2/4] Cache composed locations in source overlay module --- src/lsp/cobol_common/srcloc.ml | 18 +++++++++-- src/lsp/cobol_preproc/src_overlay.ml | 46 ++++++++++++++++++++++------ 2 files changed, 52 insertions(+), 12 deletions(-) diff --git a/src/lsp/cobol_common/srcloc.ml b/src/lsp/cobol_common/srcloc.ml index 91c1c3946..9cf9dd487 100644 --- a/src/lsp/cobol_common/srcloc.ml +++ b/src/lsp/cobol_common/srcloc.ml @@ -374,7 +374,7 @@ let pp_file_loc ppf loc = let raw ?(in_area_a = false) ((s, e): lexloc) : srcloc = assert Lexing.(s.pos_cnum <= e.pos_cnum); (* ensure proper use *) let loc = Raw (s, e, in_area_a) in - if Lexing.(s.pos_fname != e.pos_fname) then + if Lexing.(s.pos_fname <> e.pos_fname) then Pretty.error "%a@\n>> Internal warning in `%s.raw`: file names mismatch (`%s` != `%s`)\ " pp_srcloc loc __MODULE__ s.pos_fname e.pos_fname; @@ -402,13 +402,27 @@ let replacement ~old ~new_ ~in_area_a ~replloc : srcloc = (* end *) (* | Cpy {copyloc = {filename; _}; _} -> Some filename *) +(** [may_join_as_single_raw a b] checks whether a lexloc {i l{_ a}} with a a + left-hand lexing position [a] and a lexloc {i l{_ b}} with a right-hand + position [b], may be joined to form a single raw source location + (internal). *) +let may_join_as_single_raw (a: Lexing.position) (b: Lexing.position) = + a.pos_fname = b.pos_fname && + a.pos_lnum == b.pos_lnum && (* ensure we are stay on a single line *) + a.pos_cnum >= b.pos_cnum - 1 + (** [concat l1 l2] concatenates two adjacent source locations [l1] and [l2]. *) let rec concat: srcloc -> srcloc -> srcloc = fun l1 l2 -> match l1, l2 with | Raw (s1, e1, in_area_a), Raw (s2, e2, _) - when e1.pos_fname = s2.pos_fname && e1.pos_cnum = s2.pos_cnum - 1 -> + when may_join_as_single_raw e1 s2 -> Raw (s1, e2, in_area_a) + | Cat { left; right = Raw (s1, e1, in_area_a) }, + Raw (s2, e2, _) + when may_join_as_single_raw e1 s2 -> + Cat { left; right = Raw (s1, e2, in_area_a) } + | Cpy { copied = l1; copyloc = c1 }, Cpy { copied = l2; copyloc = c2 } when same_copyloc c1 c2 -> diff --git a/src/lsp/cobol_preproc/src_overlay.ml b/src/lsp/cobol_preproc/src_overlay.ml index 7b40a317a..b756c3e50 100644 --- a/src/lsp/cobol_preproc/src_overlay.ml +++ b/src/lsp/cobol_preproc/src_overlay.ml @@ -47,6 +47,12 @@ module Limit = struct let hash (l: limit) = l.pos_cnum + (* l1 = l2, or l1 was emitted before l2 *) + let surely_predates (l1: limit) (l2: limit) = (* or equals *) + if l1.pos_cnum < (-1) + then l2.pos_cnum <= l1.pos_cnum + else l1.pos_cnum > 0 && l1.pos_cnum <= l2.pos_cnum && + l1.pos_fname = l2.pos_fname end (** Weak hashtable where keys are overlay limits (internal) *) @@ -60,6 +66,7 @@ type manager = corresponding right limit. *) over_right_gap: limit Links.t; (** associates the right limit of a token to the left limit of the next *) + cache: (srcloc * limit) Links.t; id: string; (** manager identifier (for logging/debugging) *) } @@ -71,11 +78,13 @@ let new_manager: string -> manager = { right_of = Links.create 42; over_right_gap = Links.create 42; + cache = Links.create 42; id = Pretty.to_string "%s-%u" manager_name !id; } (** Returns left and right (potentially fresh) limits for the given source - location *) + location; for any given file, must be called with the leftmost location + first. *) let limits: manager -> srcloc -> limit * limit = fun ctx loc -> let s, e = match Cobol_common.Srcloc.as_unique_lexloc loc with | Some lexloc -> lexloc @@ -114,18 +123,35 @@ let join_limits: manager -> limit * limit -> srcloc = fun ctx (s, e) -> Cobol_common.Srcloc.raw (pos, pos) in let try_limits (s, e) = - let rec jump_right loc e' = - let s' = Links.find ctx.over_right_gap e' in - let loc', e' = Links.find ctx.right_of s' in - check (Cobol_common.Srcloc.concat loc loc') e' - and check loc e' = + + let rec proceed_from ?loc s = (* start search from left limit [s] *) + check ?loc @@ Links.find ctx.right_of s + + and check ?loc (loc', e') = + (* continue search with ([loc] concatenated with) [loc'] if [e'] is not + the sought after right limit; raises {!Not_found} when reaching an + unknown gap or limit *) + let loc = match loc with + | None -> loc' + | Some loc -> Cobol_common.Srcloc.concat loc loc' + in if e == e' (* physical comparison *) - then loc - else jump_right loc e' + then (Links.replace ctx.cache s (loc, e); loc) + else try_cache_from ~loc @@ Links.find ctx.over_right_gap e' + + and try_cache_from ?loc s = + (* attempt with cache first; proceed via small-step upon miss or + failure *) + match Links.find_opt ctx.cache s with + | Some ((_, e') as hit) when Limit.surely_predates e' e -> + (try check ?loc hit with Not_found -> proceed_from ?loc s) + | Some _ | None -> + proceed_from ?loc s in + if s == e then pointwise s - else let loc, e' = Links.find ctx.right_of s in check loc e' + else try_cache_from s in let join_failure (s, e) = let loc = Cobol_common.Srcloc.raw (s, e) in @@ -138,7 +164,7 @@ let join_limits: manager -> limit * limit -> srcloc = fun ctx (s, e) -> (* first attempt assumes proper token limits: `s` is a left and `e` is a right of tokens *) try try_limits (s, e) with Not_found -> - (* try assuming `s` is an end of token *) + (* otherwise try assuming `s` is an end of token *) try try_limits (Links.find ctx.over_right_gap s, e) with Not_found -> if s.pos_cnum = 0 (* potential special case with left-position forged by the parser: retry with leftmost limit if it differs from From e187bb87a27e4f73644591f4150c5e4fed76b939 Mon Sep 17 00:00:00 2001 From: Nicolas Berthier Date: Tue, 19 Sep 2023 09:37:37 +0200 Subject: [PATCH 3/4] Avoid use of deprecated `fold` function on weak hash tables --- src/lsp/cobol_preproc/src_overlay.ml | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/lsp/cobol_preproc/src_overlay.ml b/src/lsp/cobol_preproc/src_overlay.ml index b756c3e50..211b10cfc 100644 --- a/src/lsp/cobol_preproc/src_overlay.ml +++ b/src/lsp/cobol_preproc/src_overlay.ml @@ -67,6 +67,7 @@ type manager = over_right_gap: limit Links.t; (** associates the right limit of a token to the left limit of the next *) cache: (srcloc * limit) Links.t; + leftmost_in_file: (string, limit) Hashtbl.t; id: string; (** manager identifier (for logging/debugging) *) } @@ -79,18 +80,24 @@ let new_manager: string -> manager = right_of = Links.create 42; over_right_gap = Links.create 42; cache = Links.create 42; + leftmost_in_file = Hashtbl.create 3; id = Pretty.to_string "%s-%u" manager_name !id; } (** Returns left and right (potentially fresh) limits for the given source location; for any given file, must be called with the leftmost location first. *) +(* TODO: try to see whether registering the leftmost location in each file could + be done more efficiently wihtout a membership test on each new location (but + the pre-processor does not provide change-of-file info to the parser). *) let limits: manager -> srcloc -> limit * limit = fun ctx loc -> let s, e = match Cobol_common.Srcloc.as_unique_lexloc loc with | Some lexloc -> lexloc | _ -> Limit.make_virtual (), Limit.make_virtual () in Links.replace ctx.right_of s (loc, e); (* replace to deal with duplicates *) + if not (Hashtbl.mem ctx.leftmost_in_file s.pos_fname) + then Hashtbl.add ctx.leftmost_in_file s.pos_fname s; s, e (** Links token limits *) @@ -102,12 +109,7 @@ let link_limits ctx left right = in [filename] that is registered in [ctx] (internal). Use with moderation as this is quite inefficient. *) let leftmost_limit_in ~filename ctx = - Links.fold begin fun l _ -> function - | None when l.Lexing.pos_fname = filename -> Some l - | Some l' when l.Lexing.pos_cnum < l'.pos_cnum && - l.Lexing.pos_fname = filename -> Some l - | res -> res - end ctx.right_of None + Hashtbl.find_opt ctx.leftmost_in_file filename (** Returns a source location that spans between two given limits; returns a valid pointwise location if the two given limits are physically equal. *) From 10628f222f47588f87eed6a98ba3ac15812a6934 Mon Sep 17 00:00:00 2001 From: Nicolas Berthier Date: Wed, 20 Sep 2023 17:31:01 +0200 Subject: [PATCH 4/4] Run the pre-precessor once before building the first parser checkpoint This completely removes the need to handle cases of dummy left-most positions in the source overlay module. --- src/lsp/cobol_parser/parser_engine.ml | 19 +++++++++----- src/lsp/cobol_preproc/src_overlay.ml | 38 ++++++++------------------- 2 files changed, 23 insertions(+), 34 deletions(-) diff --git a/src/lsp/cobol_parser/parser_engine.ml b/src/lsp/cobol_parser/parser_engine.ml index cdb520734..9a3a5b9ac 100644 --- a/src/lsp/cobol_parser/parser_engine.ml +++ b/src/lsp/cobol_parser/parser_engine.ml @@ -220,7 +220,7 @@ module Make (Config: Cobol_config.T) = struct (* --- *) - let do_parse: type m. m state -> _ -> _ * m state = + let do_parse: type m. m state -> _ -> _ -> _ * m state = let rec next_tokens ({ preproc = { tokzr; _ }; _ } as ps) tokens = match Tokzr.next_token tokzr tokens with @@ -424,16 +424,22 @@ module Make (Config: Cobol_config.T) = struct Some v, ps in - fun ps c -> normal ps [] c + fun ps tokens c -> normal ps tokens c let parse ?verbose ?show ~recovery - (type m) ~(memory: m memory) pp checkpoint + (type m) ~(memory: m memory) pp make_checkpoint : ('a option, m) output * _ = let ps = init_parser ?verbose ?show ~recovery ~tokenizer_memory:memory pp in let res, ps = (* TODO: catch in a deeper context to grab parsed tokens *) - try do_parse ps checkpoint with e -> None, add_diag (DIAGS.of_exn e) ps + let ps, tokens = produce_tokens ps in + let first_pos = match tokens with + | [] -> Cobol_preproc.position ps.preproc.pp + | t :: _ -> Cobol_common.Srcloc.start_pos ~@t + in + try do_parse ps tokens (make_checkpoint first_pos) + with e -> None, add_diag (DIAGS.of_exn e) ps in match memory with | Amnesic -> @@ -480,9 +486,8 @@ let parse init_source_format = source_format} in let module P = Make (val config) in - P.parse ?verbose ?show ~memory ~recovery pp @@ - Grammar.Incremental.compilation_group @@ - Cobol_preproc.position pp + P.parse ?verbose ?show ~memory ~recovery pp + Grammar.Incremental.compilation_group end in { diff --git a/src/lsp/cobol_preproc/src_overlay.ml b/src/lsp/cobol_preproc/src_overlay.ml index 211b10cfc..5106a8668 100644 --- a/src/lsp/cobol_preproc/src_overlay.ml +++ b/src/lsp/cobol_preproc/src_overlay.ml @@ -67,7 +67,6 @@ type manager = over_right_gap: limit Links.t; (** associates the right limit of a token to the left limit of the next *) cache: (srcloc * limit) Links.t; - leftmost_in_file: (string, limit) Hashtbl.t; id: string; (** manager identifier (for logging/debugging) *) } @@ -80,24 +79,18 @@ let new_manager: string -> manager = right_of = Links.create 42; over_right_gap = Links.create 42; cache = Links.create 42; - leftmost_in_file = Hashtbl.create 3; id = Pretty.to_string "%s-%u" manager_name !id; } (** Returns left and right (potentially fresh) limits for the given source location; for any given file, must be called with the leftmost location first. *) -(* TODO: try to see whether registering the leftmost location in each file could - be done more efficiently wihtout a membership test on each new location (but - the pre-processor does not provide change-of-file info to the parser). *) let limits: manager -> srcloc -> limit * limit = fun ctx loc -> let s, e = match Cobol_common.Srcloc.as_unique_lexloc loc with | Some lexloc -> lexloc | _ -> Limit.make_virtual (), Limit.make_virtual () in Links.replace ctx.right_of s (loc, e); (* replace to deal with duplicates *) - if not (Hashtbl.mem ctx.leftmost_in_file s.pos_fname) - then Hashtbl.add ctx.leftmost_in_file s.pos_fname s; s, e (** Links token limits *) @@ -105,12 +98,6 @@ let link_limits ctx left right = (* Replace to deal with overriding of limits during recovery. *) Links.replace ctx.over_right_gap left right -(** [leftmost_limit_in ~filename ctx] finds the leftmost limit from a location - in [filename] that is registered in [ctx] (internal). Use with moderation - as this is quite inefficient. *) -let leftmost_limit_in ~filename ctx = - Hashtbl.find_opt ctx.leftmost_in_file filename - (** Returns a source location that spans between two given limits; returns a valid pointwise location if the two given limits are physically equal. *) let join_limits: manager -> limit * limit -> srcloc = fun ctx (s, e) -> @@ -124,7 +111,7 @@ let join_limits: manager -> limit * limit -> srcloc = fun ctx (s, e) -> in Cobol_common.Srcloc.raw (pos, pos) in - let try_limits (s, e) = + let try_limits (s, e: limit * limit) = let rec proceed_from ?loc s = (* start search from left limit [s] *) check ?loc @@ Links.find ctx.right_of s @@ -137,7 +124,8 @@ let join_limits: manager -> limit * limit -> srcloc = fun ctx (s, e) -> | None -> loc' | Some loc -> Cobol_common.Srcloc.concat loc loc' in - if e == e' (* physical comparison *) + if e.pos_cnum = e'.pos_cnum && (* compare only the fields that matter *) + e.pos_fname = e'.pos_fname then (Links.replace ctx.cache s (loc, e); loc) else try_cache_from ~loc @@ Links.find ctx.over_right_gap e' @@ -163,18 +151,14 @@ let join_limits: manager -> limit * limit -> srcloc = fun ctx (s, e) -> (* Printexc.(print_raw_backtrace Stdlib.stderr @@ get_callstack 10); *) loc in - (* first attempt assumes proper token limits: `s` is a left and `e` is a right - of tokens *) - try try_limits (s, e) with Not_found -> - (* otherwise try assuming `s` is an end of token *) - try try_limits (Links.find ctx.over_right_gap s, e) with Not_found -> - if s.pos_cnum = 0 (* potential special case with left-position forged by the - parser: retry with leftmost limit if it differs from - s *) - then match leftmost_limit_in ~filename:s.pos_fname ctx with - | Some l when l != s -> try_limits (l, e) (* physical equality is enough *) - | Some _ | None -> join_failure (s, e) - else join_failure (s, e) + try (* first attempt assumes proper token limits: `s` is a left and `e` is a + right of tokens *) + try_limits (s, e) + with Not_found -> + try (* otherwise try assuming `s` is an end of token *) + try_limits (Links.find ctx.over_right_gap s, e) + with Not_found -> + join_failure (s, e) module New_manager (Id: sig val name: string end) : MANAGER = struct let ctx = new_manager Id.name