-
Notifications
You must be signed in to change notification settings - Fork 445
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add RegEx support using RE2 (rebased #665) #1039
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# CMake script run at generation time. This must be separate from the main | ||
# CMakeLists.txt file to allow downloading and building re2 at generation | ||
# time. | ||
cmake_minimum_required(VERSION 2.8.2) | ||
|
||
# NONE: this helper project enables no compiler languages. | ||
project(re2-download NONE) | ||
|
||
include(ExternalProject) | ||
# Fetch the re2 sources at the pinned tag below. The source and build | ||
# directories are placed under GLOBAL_OUTPUT_PATH, which must be provided by | ||
# whoever invokes this script. The configure/build/install/test commands are | ||
# all set to the empty string, so this external project runs no commands of | ||
# its own for those steps. | ||
ExternalProject_Add(re2 | ||
GIT_REPOSITORY https://github.com/google/re2.git | ||
GIT_TAG 2022-06-01 | ||
SOURCE_DIR "${GLOBAL_OUTPUT_PATH}/re2-src" | ||
BINARY_DIR "${GLOBAL_OUTPUT_PATH}/re2-build" | ||
CONFIGURE_COMMAND "" | ||
BUILD_COMMAND "" | ||
INSTALL_COMMAND "" | ||
TEST_COMMAND "" | ||
) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,6 +28,7 @@ limitations under the License. | |
#include "parser.h" | ||
#include "ryml_std.hpp" // include this before any other ryml header | ||
#include "ryml.hpp" | ||
#include "re2/re2.h" | ||
#include "state.h" | ||
#include "static_analysis.h" | ||
#include "string_utils.h" | ||
|
@@ -49,6 +50,10 @@ using json = nlohmann::json; | |
|
||
namespace { | ||
|
||
static const Fodder EF; // Empty fodder. | ||
|
||
static const LocationRange E; // Empty. | ||
|
||
/** Turn a path e.g. "/a/b/c" into a dir, e.g. "/a/b/". If there is no path returns "". | ||
*/ | ||
std::string dir_name(const std::string &path) | ||
|
@@ -938,6 +943,11 @@ class Interpreter { | |
builtins["parseYaml"] = &Interpreter::builtinParseYaml; | ||
builtins["encodeUTF8"] = &Interpreter::builtinEncodeUTF8; | ||
builtins["decodeUTF8"] = &Interpreter::builtinDecodeUTF8; | ||
builtins["regexFullMatch"] = &Interpreter::builtinRegexFullMatch; | ||
builtins["regexPartialMatch"] = &Interpreter::builtinRegexPartialMatch; | ||
builtins["regexQuoteMeta"] = &Interpreter::builtinRegexQuoteMeta; | ||
builtins["regexReplace"] = &Interpreter::builtinRegexReplace; | ||
builtins["regexGlobalReplace"] = &Interpreter::builtinRegexGlobalReplace; | ||
|
||
DesugaredObject *stdlib = makeStdlibAST(alloc, "__internal__"); | ||
jsonnet_static_analysis(stdlib); | ||
|
@@ -1440,6 +1450,129 @@ class Interpreter { | |
return decodeUTF8(); | ||
} | ||
|
||
/** Match a UTF-8 string against an RE2 pattern and leave the result in scratch. | ||
 * | ||
 * \param pattern The regular expression, UTF-8 encoded. | ||
 * \param string The text to match, UTF-8 encoded. | ||
 * \param full When true RE2::FullMatchN is used, otherwise RE2::PartialMatchN. | ||
 * \returns Always nullptr; the result is stored in scratch: an object with | ||
 *          fields "string", "captures" and "namedCaptures" on a match, | ||
 *          or null when there is no match. | ||
 * \throws The error built by makeError() when the pattern does not compile. | ||
 */ | ||
const AST *regexMatch(const std::string &pattern, const std::string &string, bool full) | ||
{ | ||
// Quiet: compile errors are reported through re.ok()/re.error() below. | ||
RE2 re(pattern, RE2::CannedOptions::Quiet); | ||
if (!re.ok()) { | ||
std::stringstream ss; | ||
ss << "Invalid regex '" << re.pattern() << "': " << re.error(); | ||
throw makeError(stack.top().location, ss.str()); | ||
} | ||
|
||
int num_groups = re.NumberOfCapturingGroups(); | ||
|
||
// One output string per capturing group; each starts out empty. | ||
// NOTE(review): per the review thread below, a group that did not take part | ||
// in the match is reported as "" and so looks the same as an empty match. | ||
std::vector<std::string> rcaptures(num_groups); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In case of optional group, There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If the underlying library returns an empty string then it's clearly "OK" to do that. Would we choose to differ? I'm not sure there's much point, because you have to do There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You could also make the argument that the only reason the RE2 C++ API returns the empty string is because it is much harder to return a sentinel in that language. I wonder what the Go library does? @rohitjangid There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Well, empty match is not the same as the optional group is not matched. |
||
// The *MatchN calls take an array of pointers to RE2::Arg; rargv owns the | ||
// Arg objects (each wrapping one rcaptures slot), rargs holds the pointers. | ||
std::vector<RE2::Arg> rargv(num_groups); | ||
std::vector<const RE2::Arg*> rargs(num_groups); | ||
for (int i = 0; i < num_groups; ++i) { | ||
rargs[i] = &rargv[i]; | ||
rargv[i] = &rcaptures[i]; | ||
} | ||
|
||
if (full ? RE2::FullMatchN(string, re, rargs.data(), num_groups) | ||
: RE2::PartialMatchN(string, re, rargs.data(), num_groups)) { | ||
std::map<const Identifier *, HeapSimpleObject::Field> fields; | ||
|
||
// "string": the input string that was passed in, not the matched portion. | ||
const Identifier *fid = alloc->makeIdentifier(U"string"); | ||
fields[fid].hide = ObjectField::VISIBLE; | ||
fields[fid].body = alloc->make<LiteralString>(E, EF, decode_utf8(string), LiteralString::DOUBLE, "", ""); | ||
|
||
// "captures": array with the text of every capturing group, in group order. | ||
fid = alloc->makeIdentifier(U"captures"); | ||
fields[fid].hide = ObjectField::VISIBLE; | ||
std::vector<Array::Element> captures; | ||
for (int i = 0; i < num_groups; ++i) { | ||
captures.push_back(Array::Element( | ||
alloc->make<LiteralString>(E, EF, decode_utf8(rcaptures[i]), LiteralString::DOUBLE, "", ""), | ||
EF)); | ||
} | ||
fields[fid].body = alloc->make<Array>(E, EF, captures, false, EF); | ||
|
||
// "namedCaptures": object mapping each group name to its captured text. | ||
fid = alloc->makeIdentifier(U"namedCaptures"); | ||
fields[fid].hide = ObjectField::VISIBLE; | ||
DesugaredObject::Fields named_captures; | ||
const std::map<std::string, int> &named_groups = re.NamedCapturingGroups(); | ||
for (auto it = named_groups.cbegin(); it != named_groups.cend(); ++it) { | ||
// The group index from RE2 is shifted down by one to index rcaptures | ||
// (presumably because RE2 numbers groups from 1 -- confirm in re2.h). | ||
named_captures.push_back(DesugaredObject::Field( | ||
ObjectField::VISIBLE, | ||
alloc->make<LiteralString>(E, EF, decode_utf8(it->first), LiteralString::DOUBLE, "", ""), | ||
alloc->make<LiteralString>(E, EF, decode_utf8(rcaptures[it->second-1]), LiteralString::DOUBLE, "", ""))); | ||
} | ||
fields[fid].body = alloc->make<DesugaredObject>(E, ASTs{}, named_captures); | ||
|
||
scratch = makeObject<HeapSimpleObject>(BindingFrame{}, fields, ASTs{}); | ||
} else { | ||
// No match: the result is null. | ||
scratch = makeNull(); | ||
} | ||
return nullptr; | ||
} | ||
|
||
/** Builtin registered as "regexFullMatch".
 *
 * Expects two string arguments: args[0] is the pattern and args[1] is the
 * text to match.  Both are converted to UTF-8 and handed to regexMatch()
 * with full matching enabled.
 *
 * \param loc Location passed to validateBuiltinArgs() for argument checking.
 * \param args The builtin's arguments: pattern, then subject text.
 * \returns Whatever regexMatch() returns.
 */
const AST *builtinRegexFullMatch(const LocationRange &loc, const std::vector<Value> &args)
{
    validateBuiltinArgs(loc, "regexFullMatch", args, {Value::STRING, Value::STRING});

    const HeapString *pattern_arg = static_cast<HeapString *>(args[0].v.h);
    const HeapString *subject_arg = static_cast<HeapString *>(args[1].v.h);

    const std::string utf8_pattern = encode_utf8(pattern_arg->value);
    const std::string utf8_subject = encode_utf8(subject_arg->value);

    return regexMatch(utf8_pattern, utf8_subject, /*full=*/true);
}
|
||
/** Builtin registered as "regexPartialMatch".
 *
 * Expects two string arguments: args[0] is the pattern and args[1] is the
 * text to match.  Both are converted to UTF-8 and handed to regexMatch()
 * with full matching disabled.
 *
 * \param loc Location passed to validateBuiltinArgs() for argument checking.
 * \param args The builtin's arguments: pattern, then subject text.
 * \returns Whatever regexMatch() returns.
 */
const AST *builtinRegexPartialMatch(const LocationRange &loc, const std::vector<Value> &args)
{
    validateBuiltinArgs(loc, "regexPartialMatch", args, {Value::STRING, Value::STRING});

    const HeapString *pattern_arg = static_cast<HeapString *>(args[0].v.h);
    const HeapString *subject_arg = static_cast<HeapString *>(args[1].v.h);

    const std::string utf8_pattern = encode_utf8(pattern_arg->value);
    const std::string utf8_subject = encode_utf8(subject_arg->value);

    return regexMatch(utf8_pattern, utf8_subject, /*full=*/false);
}
|
||
/** Builtin registered as "regexQuoteMeta".
 *
 * Expects one string argument.  The string is converted to UTF-8, passed
 * through RE2::QuoteMeta(), converted back and stored in scratch.
 *
 * \param loc Location passed to validateBuiltinArgs() for argument checking.
 * \param args The builtin's single argument: the text to quote.
 * \returns Always nullptr; the quoted string is left in scratch.
 */
const AST *builtinRegexQuoteMeta(const LocationRange &loc, const std::vector<Value> &args)
{
    validateBuiltinArgs(loc, "regexQuoteMeta", args, {Value::STRING});

    const HeapString *input = static_cast<HeapString *>(args[0].v.h);
    const std::string utf8_input = encode_utf8(input->value);
    const std::string quoted = RE2::QuoteMeta(utf8_input);

    scratch = makeString(decode_utf8(quoted));
    return nullptr;
}
|
||
/** Builtin registered as "regexReplace".
 *
 * Expects three string arguments: args[0] is the subject text, args[1] the
 * pattern and args[2] the replacement (RE2 rewrite string).  The result of
 * RE2::Replace() on the subject is stored in scratch; when the pattern does
 * not match, the subject is stored unchanged.
 *
 * \param loc Location passed to validateBuiltinArgs() for argument checking.
 * \param args The builtin's arguments: subject, pattern, replacement.
 * \returns Always nullptr; the resulting string is left in scratch.
 * \throws The error built by makeError() when the pattern does not compile
 *         or the replacement string is not valid for the pattern.
 */
const AST *builtinRegexReplace(const LocationRange &loc, const std::vector<Value> &args)
{
    validateBuiltinArgs(loc, "regexReplace", args, {Value::STRING, Value::STRING, Value::STRING});

    std::string string = encode_utf8(static_cast<HeapString *>(args[0].v.h)->value);
    const std::string pattern = encode_utf8(static_cast<HeapString *>(args[1].v.h)->value);
    const std::string replace = encode_utf8(static_cast<HeapString *>(args[2].v.h)->value);

    // Quiet: compile errors are reported through re.ok()/re.error() below.
    RE2 re(pattern, RE2::CannedOptions::Quiet);
    if (!re.ok()) {
        std::stringstream ss;
        ss << "Invalid regex '" << re.pattern() << "': " << re.error();
        throw makeError(stack.top().location, ss.str());
    }

    // RE2::Replace() only returns false for a bad rewrite string (and logs
    // nothing in Quiet mode), which would silently hand back the input
    // unchanged.  Validate up front so the mistake is reported.
    std::string rewrite_error;
    if (!re.CheckRewriteString(replace, &rewrite_error)) {
        std::stringstream ss;
        ss << "Invalid regex replacement '" << replace << "': " << rewrite_error;
        throw makeError(stack.top().location, ss.str());
    }

    // Return value ignored on purpose: false now only means "no match".
    RE2::Replace(&string, re, replace);
    scratch = makeString(decode_utf8(string));
    return nullptr;
}
|
||
/** Builtin registered as "regexGlobalReplace".
 *
 * Expects three string arguments: args[0] is the subject text, args[1] the
 * pattern and args[2] the replacement (RE2 rewrite string).  The result of
 * RE2::GlobalReplace() on the subject is stored in scratch; when the pattern
 * does not match, the subject is stored unchanged.
 *
 * \param loc Location passed to validateBuiltinArgs() for argument checking.
 * \param args The builtin's arguments: subject, pattern, replacement.
 * \returns Always nullptr; the resulting string is left in scratch.
 * \throws The error built by makeError() when the pattern does not compile
 *         or the replacement string is not valid for the pattern.
 */
const AST *builtinRegexGlobalReplace(const LocationRange &loc, const std::vector<Value> &args)
{
    validateBuiltinArgs(loc, "regexGlobalReplace", args, {Value::STRING, Value::STRING, Value::STRING});

    std::string string = encode_utf8(static_cast<HeapString *>(args[0].v.h)->value);
    const std::string pattern = encode_utf8(static_cast<HeapString *>(args[1].v.h)->value);
    const std::string replace = encode_utf8(static_cast<HeapString *>(args[2].v.h)->value);

    // Quiet: compile errors are reported through re.ok()/re.error() below.
    RE2 re(pattern, RE2::CannedOptions::Quiet);
    if (!re.ok()) {
        std::stringstream ss;
        ss << "Invalid regex '" << re.pattern() << "': " << re.error();
        throw makeError(stack.top().location, ss.str());
    }

    // A bad rewrite string would otherwise make GlobalReplace() do nothing
    // (and log nothing in Quiet mode), silently handing back the input
    // unchanged.  Validate up front so the mistake is reported.
    std::string rewrite_error;
    if (!re.CheckRewriteString(replace, &rewrite_error)) {
        std::stringstream ss;
        ss << "Invalid regex replacement '" << replace << "': " << rewrite_error;
        throw makeError(stack.top().location, ss.str());
    }

    // Replacement count ignored on purpose: zero now only means "no match".
    RE2::GlobalReplace(&string, re, replace);
    scratch = makeString(decode_utf8(string));
    return nullptr;
}
|
||
const AST *builtinTrace(const LocationRange &loc, const std::vector<Value> &args) | ||
{ | ||
if(args[0].t != Value::STRING) { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is the most controversial part. Introducing a dependency here may break existing users. The fix is easy for them, so maybe not a very big deal. Going from 0 to some dependencies might still be problematic in some contexts, but it's probably worth it for regexps.
We definitely need to add a README section for dependencies, though. It would also be good to check that the Python package works as expected and maybe mention the dependencies in the description.
@sparkprime What do you think?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'll look into that when I do the release. I presume it will break existing Windows build processes, but whoever uses that ought to be able to figure it out.