From 33ce0875609cdc378b9b7aa41a2dec468c2a590a Mon Sep 17 00:00:00 2001
From: Yaroslav Bolyukin
Date: Thu, 15 Jun 2023 19:38:12 +0200
Subject: [PATCH] feat: add std regex builtins

Upstream issue: https://github.com/google/jsonnet/pull/1039
---
 Cargo.lock                                    |  44 ++++++
 .../src/typed/conversions.rs                  |  16 +++
 crates/jrsonnet-stdlib/Cargo.toml             |   3 +
 crates/jrsonnet-stdlib/src/lib.rs             |  35 +++++
 crates/jrsonnet-stdlib/src/regex.rs           | 134 ++++++++++++++++++
 5 files changed, 232 insertions(+)
 create mode 100644 crates/jrsonnet-stdlib/src/regex.rs

diff --git a/Cargo.lock b/Cargo.lock
index f905aee4..328578fd 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -13,6 +13,15 @@ dependencies = [
  "version_check",
 ]
 
+[[package]]
+name = "aho-corasick"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "43f6cb1bf222025340178f382c426f13757b2960e89779dfcb319c32542a5a41"
+dependencies = [
+ "memchr",
+]
+
 [[package]]
 name = "annotate-snippets"
 version = "0.9.1"
@@ -370,8 +379,11 @@ dependencies = [
  "jrsonnet-gcmodule",
  "jrsonnet-macros",
  "jrsonnet-parser",
+ "lru",
  "md5",
  "num-bigint",
+ "regex",
+ "rustc-hash",
  "serde",
  "serde_json",
  "serde_yaml_with_quirks",
@@ -425,12 +437,27 @@ dependencies = [
  "scopeguard",
 ]
 
+[[package]]
+name = "lru"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "03f1160296536f10c833a82dca22267d5486734230d47bf00bf435885814ba1e"
+dependencies = [
+ "hashbrown 0.13.2",
+]
+
 [[package]]
 name = "md5"
 version = "0.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
 
+[[package]]
+name = "memchr"
+version = "2.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
+
 [[package]]
 name = "mimalloc-sys"
 version = "0.1.6"
@@ -600,6 +627,23 @@ dependencies = [
  "bitflags",
 ]
 
+[[package]]
+name = "regex"
+version = "1.8.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d0ab3ca65655bb1e41f2a8c8cd662eb4fb035e67c3f78da1d61dffe89d07300f"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-syntax"
+version = "0.7.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78"
+
 [[package]]
 name = "rustc-hash"
 version = "1.1.0"
diff --git a/crates/jrsonnet-evaluator/src/typed/conversions.rs b/crates/jrsonnet-evaluator/src/typed/conversions.rs
index 9d9fb0df..8205346e 100644
--- a/crates/jrsonnet-evaluator/src/typed/conversions.rs
+++ b/crates/jrsonnet-evaluator/src/typed/conversions.rs
@@ -223,6 +223,22 @@ impl Typed for String {
 	}
 }
 
+impl Typed for StrValue {
+	const TYPE: &'static ComplexValType = &ComplexValType::Simple(ValType::Str);
+
+	fn into_untyped(value: Self) -> Result<Val> {
+		Ok(Val::Str(value))
+	}
+
+	fn from_untyped(value: Val) -> Result<Self> {
+		<Self as Typed>::TYPE.check(&value)?;
+		match value {
+			Val::Str(s) => Ok(s),
+			_ => unreachable!(),
+		}
+	}
+}
+
 impl Typed for char {
 	const TYPE: &'static ComplexValType = &ComplexValType::Char;
 
diff --git a/crates/jrsonnet-stdlib/Cargo.toml b/crates/jrsonnet-stdlib/Cargo.toml
index e4592f8f..6e8f2b49 100644
--- a/crates/jrsonnet-stdlib/Cargo.toml
+++ b/crates/jrsonnet-stdlib/Cargo.toml
@@ -42,6 +42,9 @@ serde_json = "1.0"
 serde_yaml_with_quirks = "0.8.24"
 
 num-bigint = { version = "0.4.3", optional = true }
+regex = "1.8.4"
+lru = "0.10.0"
+rustc-hash = "1.1.0"
 
 [build-dependencies]
 jrsonnet-parser.workspace = true
diff --git a/crates/jrsonnet-stdlib/src/lib.rs b/crates/jrsonnet-stdlib/src/lib.rs
index f28e6d5c..b1a8cc86 100644
--- a/crates/jrsonnet-stdlib/src/lib.rs
+++ b/crates/jrsonnet-stdlib/src/lib.rs
@@ -44,6 +44,8 @@ mod sets;
 pub use sets::*;
 mod compat;
 pub use compat::*;
+mod regex;
+pub use crate::regex::*;
 
 pub fn stdlib_uncached(settings: Rc<RefCell<Settings>>) -> ObjValue {
 	let mut builder = ObjValueBuilder::new();
@@ -154,6 +156,8 @@ pub fn stdlib_uncached(settings: Rc<RefCell<Settings>>) -> ObjValue {
 		// Sets
 		("setMember", builtin_set_member::INST),
 		("setInter", builtin_set_inter::INST),
+		// Regex
+		("regexQuoteMeta", builtin_regex_quote_meta::INST),
 		// Compat
 		("__compare", builtin___compare::INST),
 	]
@@ -187,6 +191,37 @@ pub fn stdlib_uncached(settings: Rc<RefCell<Settings>>) -> ObjValue {
 		.value(Val::Func(FuncVal::builtin(builtin_trace { settings })))
 		.expect("no conflict");
 
+	// Regex
+	let regex_cache = RegexCache::default();
+	builder
+		.member("regexFullMatch".into())
+		.hide()
+		.value(Val::Func(FuncVal::builtin(builtin_regex_full_match {
+			cache: regex_cache.clone(),
+		})))
+		.expect("no conflict");
+	builder
+		.member("regexPartialMatch".into())
+		.hide()
+		.value(Val::Func(FuncVal::builtin(builtin_regex_partial_match {
+			cache: regex_cache.clone(),
+		})))
+		.expect("no conflict");
+	builder
+		.member("regexReplace".into())
+		.hide()
+		.value(Val::Func(FuncVal::builtin(builtin_regex_replace {
+			cache: regex_cache.clone(),
+		})))
+		.expect("no conflict");
+	builder
+		.member("regexGlobalReplace".into())
+		.hide()
+		.value(Val::Func(FuncVal::builtin(builtin_regex_global_replace {
+			cache: regex_cache.clone(),
+		})))
+		.expect("no conflict");
+
 	builder
 		.member("id".into())
 		.hide()
diff --git a/crates/jrsonnet-stdlib/src/regex.rs b/crates/jrsonnet-stdlib/src/regex.rs
new file mode 100644
index 00000000..27482580
--- /dev/null
+++ b/crates/jrsonnet-stdlib/src/regex.rs
@@ -0,0 +1,134 @@
+use std::{cell::RefCell, hash::BuildHasherDefault, num::NonZeroUsize, rc::Rc};
+
+use ::regex::Regex;
+use jrsonnet_evaluator::{
+	error::{ErrorKind::*, Result},
+	val::StrValue,
+	IStr, ObjValueBuilder, Val,
+};
+use jrsonnet_macros::builtin;
+use lru::LruCache;
+use rustc_hash::FxHasher;
+
+pub struct RegexCacheInner {
+	cache: RefCell<LruCache<IStr, Rc<Regex>, BuildHasherDefault<FxHasher>>>,
+}
+impl Default for RegexCacheInner {
+	fn default() -> Self {
+		Self {
+			cache: RefCell::new(LruCache::with_hasher(
+				NonZeroUsize::new(20).unwrap(),
+				BuildHasherDefault::default(),
+			)),
+		}
+	}
+}
+pub type RegexCache = Rc<RegexCacheInner>;
+impl RegexCacheInner {
+	fn parse(&self, pattern: IStr) -> Result<Rc<Regex>> {
+		let mut cache = self.cache.borrow_mut();
+		if let Some(found) = cache.get(&pattern) {
+			return Ok(found.clone());
+		}
+		let regex = Regex::new(&pattern)
+			.map_err(|e| RuntimeError(format!("regex parse failed: {e}").into()))?;
+		let regex = Rc::new(regex);
+		cache.push(pattern, regex.clone());
+		Ok(regex)
+	}
+}
+
+pub fn regex_match_inner(regex: &Regex, str: String) -> Result<Val> {
+	let mut out = ObjValueBuilder::with_capacity(3);
+
+	let mut captures = Vec::with_capacity(regex.captures_len());
+	let mut named_captures = ObjValueBuilder::with_capacity(regex.capture_names().len());
+
+	let Some(captured) = regex.captures(&str) else {
+		return Ok(Val::Null)
+	};
+
+	for ele in captured.iter().skip(1) {
+		if let Some(ele) = ele {
+			captures.push(Val::Str(StrValue::Flat(ele.as_str().into())))
+		} else {
+			captures.push(Val::Str(StrValue::Flat(IStr::empty())))
+		}
+	}
+	for (i, name) in regex
+		.capture_names()
+		.skip(1)
+		.enumerate()
+		.flat_map(|(i, v)| Some((i, v?)))
+	{
+		let capture = captures[i].clone();
+		named_captures.member(name.into()).value(capture)?;
+	}
+
+	out.member("string".into())
+		.value_unchecked(Val::Str(captured.get(0).unwrap().as_str().into()));
+	out.member("captures".into())
+		.value_unchecked(Val::Arr(captures.into()));
+	out.member("namedCaptures".into())
+		.value_unchecked(Val::Obj(named_captures.build()));
+
+	Ok(Val::Obj(out.build()))
+}
+
+#[builtin(fields(
+	cache: RegexCache,
+))]
+pub fn builtin_regex_partial_match(
+	this: &builtin_regex_partial_match,
+	pattern: IStr,
+	str: String,
+) -> Result<Val> {
+	let regex = this.cache.parse(pattern)?;
+	regex_match_inner(&regex, str)
+}
+
+#[builtin(fields(
+	cache: RegexCache,
+))]
+pub fn builtin_regex_full_match(
+	this: &builtin_regex_full_match,
+	pattern: StrValue,
+	str: String,
+) -> Result<Val> {
+	let pattern = format!("^{pattern}$").into();
+	let regex = this.cache.parse(pattern)?;
+	regex_match_inner(&regex, str)
+}
+
+#[builtin]
+pub fn builtin_regex_quote_meta(pattern: String) -> String {
+	regex::escape(&pattern)
+}
+
+#[builtin(fields(
+	cache: RegexCache,
+))]
+pub fn builtin_regex_replace(
+	this: &builtin_regex_replace,
+	str: String,
+	pattern: IStr,
+	to: String,
+) -> Result<String> {
+	let regex = this.cache.parse(pattern)?;
+	let replaced = regex.replace(&str, to);
+	Ok(replaced.to_string())
+}
+
+#[builtin(fields(
+	cache: RegexCache,
+))]
+pub fn builtin_regex_global_replace(
+	this: &builtin_regex_global_replace,
+	str: String,
+	pattern: IStr,
+	to: String,
+) -> Result<String> {
+	let regex = this.cache.parse(pattern)?;
+	let replaced = regex.replace_all(&str, to);
+	Ok(replaced.to_string())
+}