Skip to content

Commit

Permalink
feat(lexer): introduce native patterns for custom lexing logic
Browse files Browse the repository at this point in the history
  • Loading branch information
gvozdvmozgu authored and benfdking committed Dec 2, 2024
1 parent 7b5a1b5 commit c2a1ea3
Show file tree
Hide file tree
Showing 2 changed files with 154 additions and 25 deletions.
65 changes: 62 additions & 3 deletions crates/lib-core/src/parser/lexer.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use std::borrow::Cow;
use std::fmt::Debug;
use std::ops::Range;
use std::str::Chars;

use super::markers::PositionMarker;
use super::segments::base::{ErasedSegment, SegmentBuilder, Tables};
Expand Down Expand Up @@ -103,6 +104,10 @@ impl Matcher {
Self::new(Pattern::regex(name, pattern, syntax_kind))
}

/// Creates a matcher backed by a hand-written lexing function.
///
/// `f` is handed a mutable [`Cursor`] and reports through its `bool` result
/// whether it matched. `name` and `syntax_kind` are forwarded unchanged to
/// [`Pattern::native`], and the resulting pattern is wrapped with `Self::new`.
pub fn native(name: &'static str, f: fn(&mut Cursor) -> bool, syntax_kind: SyntaxKind) -> Self {
    let pattern = Pattern::native(name, f, syntax_kind);
    Self::new(pattern)
}

#[track_caller]
pub fn legacy(
name: &'static str,
Expand All @@ -114,13 +119,19 @@ impl Matcher {
}

/// Stores `subdivider` in this matcher's `subdivider` slot and returns the
/// matcher so further builder calls can be chained.
///
/// # Panics
///
/// Panics when this matcher's own pattern is neither `Legacy` nor `Native`.
pub fn subdivider(mut self, subdivider: Pattern) -> Self {
    // A single check covering both supported kinds. A stricter `Legacy`-only
    // assertion here would panic for native matchers, which are expected to
    // carry a subdivider as well.
    assert!(matches!(
        self.pattern.kind,
        SearchPatternKind::Legacy(_, _) | SearchPatternKind::Native(_)
    ));
    self.subdivider = Some(subdivider);
    self
}

/// Stores `trim_post_subdivide` in this matcher's `trim_post_subdivide` slot
/// and returns the matcher so further builder calls can be chained.
///
/// # Panics
///
/// Panics when this matcher's own pattern is neither `Legacy` nor `Native`.
pub fn post_subdivide(mut self, trim_post_subdivide: Pattern) -> Self {
    // A single check covering both supported kinds. A stricter `Legacy`-only
    // assertion here would panic for native matchers, which are expected to
    // carry a post-subdivide pattern as well.
    assert!(matches!(
        self.pattern.kind,
        SearchPatternKind::Legacy(_, _) | SearchPatternKind::Native(_)
    ));
    self.trim_post_subdivide = Some(trim_post_subdivide);
    self
}
Expand Down Expand Up @@ -251,6 +262,7 @@ pub struct Pattern {
/// The matching strategy stored in a `Pattern`'s `kind` field.
pub enum SearchPatternKind {
/// Pattern given as a static string; presumably matched literally — confirm
/// against the lexer's handling of this variant.
String(&'static str),
/// Pattern given as static regular-expression source text.
Regex(&'static str),
/// Hand-written lexing function. It is run against a fresh `Cursor` over the
/// remaining input; when it returns `true`, the text the cursor consumed is
/// the match.
Native(fn(&mut Cursor) -> bool),
/// A `fn(&str) -> bool` predicate (named `starts_with` where it is
/// constructed) paired with a compiled `fancy_regex::Regex`.
// NOTE(review): the predicate looks like a cheap pre-check run before the
// regex — confirm against the matching code.
Legacy(fn(&str) -> bool, fancy_regex::Regex),
}

Expand Down Expand Up @@ -281,6 +293,14 @@ impl Pattern {
}
}

/// Builds a pattern whose matching is delegated to the native function `f`.
///
/// `name` and `syntax_kind` are stored as given; `f` is wrapped in
/// `SearchPatternKind::Native`.
pub fn native(name: &'static str, f: fn(&mut Cursor) -> bool, syntax_kind: SyntaxKind) -> Self {
    let kind = SearchPatternKind::Native(f);
    Self {
        name,
        syntax_kind,
        kind,
    }
}

pub fn legacy(
name: &'static str,
starts_with: fn(&str) -> bool,
Expand Down Expand Up @@ -313,6 +333,10 @@ impl Pattern {
}
}
}
SearchPatternKind::Native(f) => {
let mut cursor = Cursor::new(forward_string);
return f(&mut cursor).then(|| cursor.lexed());
}
_ => unreachable!(),
};

Expand All @@ -335,6 +359,41 @@ impl Pattern {
}
}

/// A forward-only character cursor over the text given to a native lexing
/// function.
///
/// The cursor remembers the full input and how much of it has been consumed,
/// so the consumed prefix can be sliced out afterwards.
#[derive(Debug, Clone)]
pub struct Cursor<'text> {
    /// The complete input the cursor was created over.
    text: &'text str,
    /// Iterator positioned at the first character not yet consumed.
    chars: Chars<'text>,
}

impl<'text> Cursor<'text> {
    /// Sentinel returned by [`peek`](Self::peek) and [`shift`](Self::shift)
    /// once the input is exhausted.
    ///
    /// A literal NUL in the input yields the same value, so callers of those
    /// two methods cannot tell the cases apart.
    const EOF: char = '\0';

    /// Creates a cursor positioned at the start of `text`.
    fn new(text: &'text str) -> Self {
        Self {
            text,
            chars: text.chars(),
        }
    }

    /// Returns the next character without consuming it, or `'\0'` when no
    /// input remains.
    pub fn peek(&self) -> char {
        // Cloning `Chars` is cheap and leaves the real position untouched.
        self.chars.clone().next().unwrap_or(Self::EOF)
    }

    /// Consumes and returns the next character, or returns `'\0'` (consuming
    /// nothing) when no input remains.
    pub fn shift(&mut self) -> char {
        self.chars.next().unwrap_or(Self::EOF)
    }

    /// Consumes characters for as long as `f` returns `true` for the next one.
    ///
    /// Stops at the first rejected character or at the real end of the input.
    /// End of input is detected from the iterator itself rather than from the
    /// `'\0'` sentinel, so a literal NUL in the text is offered to `f` like
    /// any other character.
    pub fn shift_while(&mut self, mut f: impl FnMut(char) -> bool) {
        while let Some(next) = self.chars.clone().next() {
            if !f(next) {
                break;
            }
            self.chars.next();
        }
    }

    /// Returns the prefix of the original text consumed so far.
    fn lexed(&self) -> &'text str {
        // `Chars::as_str` is the unconsumed tail; the byte-length difference
        // is the consumed prefix and always falls on a char boundary.
        let consumed = self.text.len() - self.chars.as_str().len();
        &self.text[..consumed]
    }
}

/// The Lexer class actually does the lexing step.
#[derive(Debug, Clone)]
pub struct Lexer {
Expand Down Expand Up @@ -374,7 +433,7 @@ impl Lexer {
patterns.push(pattern);
syntax_map.push((matcher.pattern.name, matcher.pattern.syntax_kind));
}
SearchPatternKind::Legacy(_, _) => {
SearchPatternKind::Legacy(_, _) | SearchPatternKind::Native(_) => {
matchers.push(matcher.clone());
}
}
Expand Down
114 changes: 92 additions & 22 deletions crates/lib-dialects/src/ansi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use sqruff_lib_core::parser::grammar::base::{Anything, Nothing, Ref};
use sqruff_lib_core::parser::grammar::conditional::Conditional;
use sqruff_lib_core::parser::grammar::delimited::Delimited;
use sqruff_lib_core::parser::grammar::sequence::{Bracketed, Sequence};
use sqruff_lib_core::parser::lexer::{Matcher, Pattern};
use sqruff_lib_core::parser::lexer::{Cursor, Matcher, Pattern};
use sqruff_lib_core::parser::matchable::{Matchable, MatchableTrait};
use sqruff_lib_core::parser::node_matcher::NodeMatcher;
use sqruff_lib_core::parser::parsers::{MultiStringParser, RegexParser, StringParser, TypedParser};
Expand Down Expand Up @@ -4909,24 +4909,19 @@ fn lexer_matchers() -> Vec<Matcher> {
vec![
Matcher::regex("whitespace", r"[^\S\r\n]+", SyntaxKind::Whitespace),
Matcher::regex("inline_comment", r"(--|#)[^\n]*", SyntaxKind::InlineComment),
Matcher::legacy(
"block_comment",
|s| s.starts_with("/*"),
r"\/\*([^\*]|\*(?!\/))*\*\/",
SyntaxKind::BlockComment,
)
.subdivider(Pattern::legacy(
"newline",
|_| true,
r"\r\n|\n",
SyntaxKind::Newline,
))
.post_subdivide(Pattern::legacy(
"whitespace",
|_| true,
r"[^\S\r\n]+",
SyntaxKind::Whitespace,
)),
Matcher::native("block_comment", block_comment, SyntaxKind::BlockComment)
.subdivider(Pattern::legacy(
"newline",
|_| true,
r"\r\n|\n",
SyntaxKind::Newline,
))
.post_subdivide(Pattern::legacy(
"whitespace",
|_| true,
r"[^\S\r\n]+",
SyntaxKind::Whitespace,
)),
Matcher::regex(
"single_quote",
r"'([^'\\]|\\.|'')*'",
Expand All @@ -4944,10 +4939,9 @@ fn lexer_matchers() -> Vec<Matcher> {
r"\$(\w*)\$[\s\S]*?\$\1\$",
SyntaxKind::DollarQuote,
),
Matcher::legacy(
Matcher::native(
"numeric_literal",
|s| s.starts_with(|ch: char| ch == '.' || ch == '-' || ch.is_ascii_alphanumeric()),
r"(?>\d+\.\d+|\d+\.(?![\.\w])|\.\d+|\d+)(\.?[eE][+-]?\d+)?((?<=\.)|(?=\b))",
numeric_literal,
SyntaxKind::NumericLiteral,
),
Matcher::regex("like_operator", r"!?~~?\*?", SyntaxKind::LikeOperator),
Expand Down Expand Up @@ -5117,3 +5111,79 @@ pub fn statement_segment() -> Matchable {
pub fn wildcard_expression_segment() -> Matchable {
Sequence::new(vec![Ref::new("WildcardIdentifierSegment").to_matchable()]).to_matchable()
}

/// Native lexing function for numeric literals.
///
/// Accepts, starting at the cursor:
/// * a mantissa that is either `.` followed by at least one ASCII digit, or
///   one or more ASCII digits optionally followed by `.` and further digits
///   (the fractional digits may be absent);
/// * an optional exponent: `e`/`E`, an optional `+`/`-`, then at least one
///   ASCII digit.
///
/// The match is rejected when the character right after the literal is `.`,
/// `_`, or an ASCII letter or digit. Returns `true` on a match, leaving the
/// cursor just past the literal.
fn numeric_literal(cursor: &mut Cursor) -> bool {
    let is_digit = |c: char| c.is_ascii_digit();

    // Mantissa.
    match cursor.shift() {
        '.' => {
            // A leading dot must be followed by at least one digit.
            if !cursor.peek().is_ascii_digit() {
                return false;
            }
            cursor.shift_while(is_digit);
        }
        '0'..='9' => {
            cursor.shift_while(is_digit);
            if cursor.peek() == '.' {
                cursor.shift();
                cursor.shift_while(is_digit);
            }
        }
        _ => return false,
    }

    // Optional exponent; once the marker is seen, digits are mandatory.
    if matches!(cursor.peek(), 'e' | 'E') {
        cursor.shift();
        if matches!(cursor.peek(), '+' | '-') {
            cursor.shift();
        }
        if !cursor.peek().is_ascii_digit() {
            return false;
        }
        cursor.shift_while(is_digit);
    }

    // The literal must not run straight into another dot or a word character.
    let following = cursor.peek();
    !(following == '.' || following == '_' || following.is_ascii_alphanumeric())
}

/// Native lexing function for `/* ... */` block comments, with nesting.
///
/// The input must begin with `/*`. Every further `/*` opens a nested comment
/// and every `*/` closes one; the function returns `true` once the outermost
/// comment is closed, leaving the cursor just past its `*/`. It returns
/// `false` when the opener is missing or when the cursor yields `'\0'` before
/// the comment is closed.
fn block_comment(cursor: &mut Cursor) -> bool {
    // `||` short-circuits, so the second character is only consumed after a
    // leading '/'.
    if cursor.shift() != '/' || cursor.shift() != '*' {
        return false;
    }

    // Number of comments currently open; the opener above accounts for one.
    let mut open_comments = 1usize;

    while open_comments > 0 {
        let current = cursor.shift();
        let upcoming = cursor.peek();

        match (current, upcoming) {
            ('\0', _) => return false,
            ('/', '*') => {
                cursor.shift();
                open_comments += 1;
            }
            ('*', '/') => {
                cursor.shift();
                open_comments -= 1;
            }
            _ => {}
        }
    }

    true
}

0 comments on commit c2a1ea3

Please sign in to comment.