diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index b1aaf38eb..b5df4aca5 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -5,8 +5,14 @@ file(GLOB_RECURSE TEST_SOURCES "lib/**/*.cpp" ) +# Add source files that are being tested +set(REGEX_UTILS_SRC + ${CMAKE_SOURCE_DIR}/vicinae/src/services/files-service/file-indexer/regex-utils.cpp +) + add_executable(all_tests ${TEST_SOURCES} + ${REGEX_UTILS_SRC} ) set(FIXTURE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/fixtures) @@ -17,6 +23,11 @@ target_compile_definitions(all_tests PRIVATE XDGPP_FIXTURE_DIR="${FIXTURE_DIR}/xdgpp" ) +target_include_directories(all_tests PRIVATE + ${CMAKE_SOURCE_DIR}/vicinae/src + ${CMAKE_SOURCE_DIR}/vicinae/include +) + target_link_libraries(all_tests PRIVATE xdgpp diff --git a/tests/lib/file-indexer/regex-utils.cpp b/tests/lib/file-indexer/regex-utils.cpp new file mode 100644 index 000000000..c9fafa8c9 --- /dev/null +++ b/tests/lib/file-indexer/regex-utils.cpp @@ -0,0 +1,155 @@ +#include "services/files-service/file-indexer/regex-utils.hpp" +#include + +TEST_CASE("extractStaticCharsFromRegex - basic patterns", "[file-indexer]") { + SECTION(".*config.*db^") { REQUIRE(extractStaticCharsFromRegex(".*config.*db^") == "config db"); } + SECTION("^[ab]cd") { REQUIRE(extractStaticCharsFromRegex("^[ab]cd") == "cd"); } + SECTION("^ab[cd](test)k") { REQUIRE(extractStaticCharsFromRegex("^ab[cd](test)k") == "^ab test k"); } +} + +TEST_CASE("extractStaticCharsFromRegex - special regex characters", "[file-indexer]") { + SECTION("handles dot wildcard") { REQUIRE(extractStaticCharsFromRegex("a.b") == "a b"); } + SECTION("handles asterisk") { REQUIRE(extractStaticCharsFromRegex("a*b") == "a b"); } + SECTION("handles plus") { REQUIRE(extractStaticCharsFromRegex("a+b") == "a b"); } + SECTION("handles question mark") { REQUIRE(extractStaticCharsFromRegex("a?b") == "b"); } + SECTION("handles pipe") { REQUIRE(extractStaticCharsFromRegex("a|b") == ""); } + SECTION("handles dollar sign") { 
REQUIRE(extractStaticCharsFromRegex("abc$") == "abc"); } +} + +TEST_CASE("extractStaticCharsFromRegex - keeps only alphanumercial, space, tab, newline", "[file-indexer]") { + SECTION("strips dots") { REQUIRE(extractStaticCharsFromRegex("abc\\.") == "abc"); } + SECTION("strips dots") { REQUIRE(extractStaticCharsFromRegex("\\\\-$@#abc\\.") == "abc"); } + SECTION("strips dots") { REQUIRE(extractStaticCharsFromRegex("abc-def") == "abc def"); } +} + +TEST_CASE("extractStaticCharsFromRegex - brackets", "[file-indexer]") { + SECTION("ignores content in square brackets") { REQUIRE(extractStaticCharsFromRegex("[abc]def") == "def"); } + + SECTION("keeps content in parentheses (groups)") { + REQUIRE(extractStaticCharsFromRegex("(abc)def") == "abcdef"); + } +} + +TEST_CASE("extractStaticCharsFromRegex - caret handling", "[file-indexer]") { + SECTION("keeps caret followed by regular character") { + REQUIRE(extractStaticCharsFromRegex("^abc") == "^abc"); + } + + SECTION("removes caret not followed by character") { REQUIRE(extractStaticCharsFromRegex("^[abc]") == ""); } + + SECTION("removes caret followed by special char") { + REQUIRE(extractStaticCharsFromRegex("^.abc") == "abc"); + } + + SECTION("caret in middle without following char") { REQUIRE(extractStaticCharsFromRegex("abc^") == "abc"); } +} + +TEST_CASE("extractStaticCharsFromRegex - escaping", "[file-indexer]") { + SECTION("escaped special characters become static") { + REQUIRE(extractStaticCharsFromRegex("a\\.b") == "a b"); + } + + SECTION("escaped bracket") { REQUIRE(extractStaticCharsFromRegex("a\\[b") == "a b"); } + SECTION("escaped backslash") { REQUIRE(extractStaticCharsFromRegex("a\\\\b") == "a b"); } + SECTION("caret with escaped character") { REQUIRE(extractStaticCharsFromRegex("^\\[test") == "test"); } +} + +TEST_CASE("extractStaticCharsFromRegex - empty and edge cases", "[file-indexer]") { + SECTION("empty string") { REQUIRE(extractStaticCharsFromRegex("") == ""); } + SECTION("only special characters") { 
REQUIRE(extractStaticCharsFromRegex(".*+?|$") == ""); } + SECTION("only brackets") { REQUIRE(extractStaticCharsFromRegex("[abc](def)") == "def"); } + SECTION("single character") { REQUIRE(extractStaticCharsFromRegex("a") == "a"); } +} + +TEST_CASE("extractStaticCharsFromRegex - quantifiers", "[file-indexer]") { + SECTION("exactly n - {3}") { REQUIRE(extractStaticCharsFromRegex("a{3}b") == "a b"); } + SECTION("n or more - {3,}") { REQUIRE(extractStaticCharsFromRegex("a{3,}b") == "a b"); } + SECTION("between m and n - {3,5}") { REQUIRE(extractStaticCharsFromRegex("a{3,5}b") == "a b"); } + + SECTION("lazy quantifiers - ?? *? +?") { + REQUIRE(extractStaticCharsFromRegex("a??b") == "b"); + REQUIRE(extractStaticCharsFromRegex("a*?b") == "a b"); + REQUIRE(extractStaticCharsFromRegex("a+?b") == "a b"); + } + + SECTION("multiple wildcards") { REQUIRE(extractStaticCharsFromRegex("a.*b.*c") == "a b c"); } + SECTION("combined quantifiers") { REQUIRE(extractStaticCharsFromRegex("a+b*c?d") == "a b d"); } +} + +TEST_CASE("extractStaticCharsFromRegex - word boundaries", "[file-indexer]") { + SECTION("word boundary \\b") { REQUIRE(extractStaticCharsFromRegex("\\btest\\b") == "test"); } + SECTION("not word boundary \\B") { REQUIRE(extractStaticCharsFromRegex("\\Btest\\B") == "test"); } + SECTION("start of word \\<") { REQUIRE(extractStaticCharsFromRegex("\\") { REQUIRE(extractStaticCharsFromRegex("test\\>") == "test"); } +} + +TEST_CASE("extractStaticCharsFromRegex - character classes", "[file-indexer]") { + SECTION("digit \\d") { REQUIRE(extractStaticCharsFromRegex("\\d") == ""); } + SECTION("not digit \\D") { REQUIRE(extractStaticCharsFromRegex("\\D") == ""); } + SECTION("whitespace \\s") { REQUIRE(extractStaticCharsFromRegex("\\s") == ""); } + SECTION("not whitespace \\S") { REQUIRE(extractStaticCharsFromRegex("\\S") == ""); } + SECTION("word character \\w") { REQUIRE(extractStaticCharsFromRegex("\\w") == ""); } + SECTION("not word character \\W") { 
REQUIRE(extractStaticCharsFromRegex("\\W") == ""); } + SECTION("hex digit \\x") { REQUIRE(extractStaticCharsFromRegex("\\x41") == "A"); } +} + +TEST_CASE("extractStaticCharsFromRegex - complex patterns", "[file-indexer]") { + SECTION("multiple groups") { REQUIRE(extractStaticCharsFromRegex("(abc)(def)(ghi)") == "abcdefghi"); } + SECTION("nested groups") { REQUIRE(extractStaticCharsFromRegex("((ab)c)def") == "abcdef"); } + SECTION("alternation in group") { REQUIRE(extractStaticCharsFromRegex("(a|b)cd") == "cd"); } + + SECTION("mixed brackets and groups") { + REQUIRE(extractStaticCharsFromRegex("[abc](def)[ghi]jkl") == "def jkl"); + } + + SECTION("quantifiers with groups") { REQUIRE(extractStaticCharsFromRegex("(abc)+def") == "abc def"); } +} + +TEST_CASE("extractStaticCharsFromRegex - anchors", "[file-indexer]") { + SECTION("\\A start of string") { REQUIRE(extractStaticCharsFromRegex("\\Atest") == "test"); } + SECTION("\\Z end of string") { REQUIRE(extractStaticCharsFromRegex("test\\Z") == "test"); } + SECTION("combined anchors") { REQUIRE(extractStaticCharsFromRegex("^test$") == "^test"); } +} + +TEST_CASE("extractStaticCharsFromRegex - special escapes", "[file-indexer]") { + SECTION("tab \\t") { REQUIRE(extractStaticCharsFromRegex("a\\tb") == "a\tb"); } + SECTION("newline \\n") { REQUIRE(extractStaticCharsFromRegex("a\\nb") == "a\nb"); } + SECTION("carriage return \\r") { REQUIRE(extractStaticCharsFromRegex("a\\rb") == "a\rb"); } + SECTION("vertical tab \\v") { REQUIRE(extractStaticCharsFromRegex("a\\vb") == "a\vb"); } + SECTION("form feed \\f") { REQUIRE(extractStaticCharsFromRegex("a\\fb") == "a\fb"); } +} + +TEST_CASE("extractStaticCharsFromRegex - minWordLength", "[file-indexer]") { + SECTION("filters single character words") { + REQUIRE(extractStaticCharsFromRegex("a b c", 2) == ""); + REQUIRE(extractStaticCharsFromRegex("ab cd ef", 2) == "ab cd ef"); + REQUIRE(extractStaticCharsFromRegex("a bc d", 2) == "bc"); + } + + SECTION("filters with minimum length 3") 
{ + REQUIRE(extractStaticCharsFromRegex("a bc def", 3) == "def"); + REQUIRE(extractStaticCharsFromRegex("foo bar ab", 3) == "foo bar"); + REQUIRE(extractStaticCharsFromRegex("test a b", 3) == "test"); + } + + SECTION("keeps all words when min is 0") { + REQUIRE(extractStaticCharsFromRegex("a b c", 0) == "a b c"); + REQUIRE(extractStaticCharsFromRegex("test a bc", 0) == "test a bc"); + } + + SECTION("regex patterns with minWordLength") { + REQUIRE(extractStaticCharsFromRegex(".*config.*db^", 3) == "config"); + REQUIRE(extractStaticCharsFromRegex("^ab[cd](test)k", 3) == "test"); + REQUIRE(extractStaticCharsFromRegex("a.*b.*c", 2) == ""); + } + + SECTION("empty result when all words too short") { + REQUIRE(extractStaticCharsFromRegex("a b c d e", 3) == ""); + REQUIRE(extractStaticCharsFromRegex("ab cd", 3) == ""); + } + + SECTION("preserves word boundaries") { + REQUIRE(extractStaticCharsFromRegex("test a b config", 4) == "test config"); + REQUIRE(extractStaticCharsFromRegex("a very long test", 4) == "very long test"); + } +} diff --git a/vicinae/CMakeLists.txt b/vicinae/CMakeLists.txt index f8b601400..88769265e 100644 --- a/vicinae/CMakeLists.txt +++ b/vicinae/CMakeLists.txt @@ -7,7 +7,7 @@ find_package(Qt6 REQUIRED COMPONENTS Core Widgets Sql Network Svg DBus Concurren find_package(OpenSSL REQUIRED) find_package(LibXml2 REQUIRED) -list(APPEND LIBS Qt6::Widgets Qt6::Sql Qt6::Network Qt6::Svg Qt6::DBus Qt6::Concurrent ${CMARK_LIBRARY} ${CMARK_EXT_LIBRARY} protobuf::libprotobuf minizip OpenSSL::Crypto wayland-client xdgpp qt6keychain LibXml2::LibXml2) +list(APPEND LIBS Qt6::Widgets Qt6::Sql Qt6::Network Qt6::Svg Qt6::DBus Qt6::Concurrent ${CMARK_LIBRARY} ${CMARK_EXT_LIBRARY} protobuf::libprotobuf minizip OpenSSL::Crypto wayland-client xdgpp qt6keychain sqlite3 LibXml2::LibXml2) set(WLR_CLIP_BIN ${CMAKE_BINARY_DIR}/wlr-clip/wlr-clip${CMAKE_EXECUTABLE_SUFFIX}) set(ASSET_PATH ${CMAKE_CURRENT_SOURCE_DIR}/assets) @@ -504,6 +504,8 @@ set(SRCS 
-- Trigram full-text index over indexed_file.name, used to pre-filter candidates for regex search.
-- tri_idx is an external-content FTS5 table: it stores only the index, so every change to
-- indexed_file must be mirrored here by a trigger.
-- NOTE(review): assumes indexed_file.id is the table's INTEGER PRIMARY KEY (rowid alias), since no
-- content_rowid option is given - confirm against 001_init.sql.

CREATE VIRTUAL TABLE IF NOT EXISTS tri_idx USING fts5(name, content='indexed_file',
 tokenize='trigram');

-- Populate the index from the rows that already exist in indexed_file.
INSERT INTO tri_idx(tri_idx) VALUES('rebuild');

-- Triggers to keep the FTS index up to date.
-- Each trigger body is kept on one line together with its END, matching the existing layout of this file.

CREATE TRIGGER IF NOT EXISTS tri_idx_ai AFTER INSERT ON indexed_file BEGIN
 INSERT INTO tri_idx(rowid, name) VALUES (new.id, new.name);END;

CREATE TRIGGER IF NOT EXISTS tri_idx_ad AFTER DELETE ON indexed_file BEGIN
 INSERT INTO tri_idx(tri_idx, rowid, name) VALUES('delete', old.id, old.name);END;

-- Without this trigger a changed name (or id) would leave the old trigrams in the index and never
-- add the new ones. The WHEN clause skips updates that do not touch the indexed values.
CREATE TRIGGER IF NOT EXISTS tri_idx_au AFTER UPDATE ON indexed_file
 WHEN old.name IS NOT new.name OR old.id IS NOT new.id BEGIN
 INSERT INTO tri_idx(tri_idx, rowid, name) VALUES('delete', old.id, old.name); INSERT INTO tri_idx(rowid, name) VALUES (new.id, new.name);END;
+151,7 @@ CommandArgument ExtensionManifest::parseArgumentFromObject(const QJsonObject &ob for (const auto &child : data) { auto obj = child.toObject(); - options.push_back({.title = obj["title"].toString(), .value = obj["value"].toString()}); + options.push_back(CommandArgument::DropdownData{obj["title"].toString(), obj["value"].toString()}); } arg.data = options; diff --git a/vicinae/src/services/files-service/abstract-file-indexer.hpp b/vicinae/src/services/files-service/abstract-file-indexer.hpp index 3b51d37b1..06e51513f 100644 --- a/vicinae/src/services/files-service/abstract-file-indexer.hpp +++ b/vicinae/src/services/files-service/abstract-file-indexer.hpp @@ -38,11 +38,14 @@ struct Pagination { int limit = 50; }; +struct FileIndexerQueryParams { + Pagination pagination; + bool useRegex = false; +}; + class AbstractFileIndexer : public QObject { public: - struct QueryParams { - Pagination pagination; - }; + using QueryParams = FileIndexerQueryParams; public: virtual void start() = 0; diff --git a/vicinae/src/services/files-service/file-indexer/file-indexer-db.cpp b/vicinae/src/services/files-service/file-indexer/file-indexer-db.cpp index 034aa711f..0f1fdac1c 100644 --- a/vicinae/src/services/files-service/file-indexer/file-indexer-db.cpp +++ b/vicinae/src/services/files-service/file-indexer/file-indexer-db.cpp @@ -4,6 +4,7 @@ #include "services/files-service/file-indexer/relevancy-scorer.hpp" #include "utils/migration-manager/migration-manager.hpp" #include "utils/utils.hpp" +#include "regex-utils.hpp" #include #include #include @@ -12,6 +13,10 @@ #include #include #include +#include +#include +#include +#include // clang-format off static const std::vector SQLITE_PRAGMAS = { @@ -247,7 +252,32 @@ QSqlDatabase *FileIndexerDatabase::database() { return &m_db; } std::vector FileIndexerDatabase::search(std::string_view searchQuery, const AbstractFileIndexer::QueryParams ¶ms) { - auto queryString = QString(R"( + QString queryString; + std::string_view 
regexString; + std::string staticChars; + bool hasPattern = false; + + if (params.useRegex) { + regexString = searchQuery; + // trigram search requires at least 3 characters + staticChars = extractStaticCharsFromRegex(searchQuery, 3); + searchQuery = staticChars; + bool hasSearchString = !searchQuery.empty(); + hasPattern = !regexString.empty(); + + queryString = + QString(R"( + SELECT f.path, tri_idx.rank FROM indexed_file f + JOIN tri_idx ON tri_idx.rowid = f.id + WHERE %2%3 + ORDER BY f.relevancy_score, tri_idx.rank + LIMIT :limit + OFFSET :offset + )") + .arg(hasSearchString ? "tri_idx MATCH '" + qStringFromStdView(searchQuery) + "'" : "1=1") + .arg(hasPattern ? " AND f.name REGEXP :pattern" : ""); + } else { + queryString = QString(R"( SELECT path, rank FROM indexed_file f JOIN unicode_idx ON unicode_idx.rowid = f.id WHERE @@ -256,11 +286,13 @@ std::vector FileIndexerDatabase::search(std::string_view searchQuery, LIMIT :limit OFFSET :offset )") - .arg(qStringFromStdView(searchQuery)); + .arg(qStringFromStdView(searchQuery)); + } QSqlQuery query(m_db); - query.prepare(queryString); + + if (params.useRegex && hasPattern) { query.bindValue(":pattern", qStringFromStdView(regexString)); } query.bindValue(":limit", params.pagination.limit); query.bindValue(":offset", params.pagination.offset); @@ -424,6 +456,31 @@ void FileIndexerDatabase::indexFiles(const std::vector &p if (!m_db.commit()) { qCritical() << "Failed to commit batchIndex" << m_db.lastError(); } } +static void sqliteRegexpCallback(sqlite3_context *context, int argc, sqlite3_value **argv) { + if (argc != 2) { + sqlite3_result_error(context, "REGEXP requires 2 arguments", -1); + return; + } + + const char *pattern = reinterpret_cast(sqlite3_value_text(argv[0])); + const char *text = reinterpret_cast(sqlite3_value_text(argv[1])); + + if (!pattern || !text) { + sqlite3_result_null(context); + return; + } + + QRegularExpression regex(QString::fromUtf8(pattern)); + QString textStr = QString::fromUtf8(text); 
+ + if (!regex.isValid()) { + sqlite3_result_error(context, "Invalid regular expression", -1); + return; + } + + sqlite3_result_int(context, regex.match(textStr).hasMatch() ? 1 : 0); +} + FileIndexerDatabase::FileIndexerDatabase() : m_connectionId(createRandomConnectionId()) { m_db = QSqlDatabase::addDatabase("QSQLITE", m_connectionId); m_db.setDatabaseName(getDatabasePath().c_str()); @@ -433,6 +490,15 @@ FileIndexerDatabase::FileIndexerDatabase() : m_connectionId(createRandomConnecti return; } + QVariant v = m_db.driver()->handle(); + if (v.isValid() && qstrcmp(v.typeName(), "sqlite3*") == 0) { + sqlite3 *db_handle = *static_cast(v.data()); + if (db_handle) { + sqlite3_create_function(db_handle, "REGEXP", 2, SQLITE_UTF8, nullptr, sqliteRegexpCallback, nullptr, + nullptr); + } + } + QSqlQuery query(m_db); for (const auto &pragma : SQLITE_PRAGMAS) { diff --git a/vicinae/src/services/files-service/file-indexer/file-indexer-query-engine.hpp b/vicinae/src/services/files-service/file-indexer/file-indexer-query-engine.hpp index abff32f43..9e78926cc 100644 --- a/vicinae/src/services/files-service/file-indexer/file-indexer-query-engine.hpp +++ b/vicinae/src/services/files-service/file-indexer/file-indexer-query-engine.hpp @@ -21,7 +21,7 @@ class FileIndexerQueryEngineWorker : public QObject { public slots: void handleQuery(const QString &query, const AbstractFileIndexer::QueryParams ¶ms) { std::vector paths; - QString finalQuery = preparePrefixSearchQuery(query); + QString finalQuery = params.useRegex ? 
query : preparePrefixSearchQuery(query); paths = db->search(finalQuery.toStdString(), params); std::vector results; diff --git a/vicinae/src/services/files-service/file-indexer/file-indexer.cpp b/vicinae/src/services/files-service/file-indexer/file-indexer.cpp index 2a191e808..7925a6671 100644 --- a/vicinae/src/services/files-service/file-indexer/file-indexer.cpp +++ b/vicinae/src/services/files-service/file-indexer/file-indexer.cpp @@ -132,13 +132,17 @@ void FileIndexer::preferenceValuesChanged(const QJsonObject &preferences) { preferences.value("watcherPaths").toString().split(';', Qt::SkipEmptyParts) | std::views::transform([](const QStringView &v) { return fs::path(v.toString().toStdString()); })); + m_useRegex = preferences.value("useRegex").toBool(); + std::string databaseFilename = FileIndexerDatabase::getDatabasePath().filename().string(); m_excludedFilenames = {databaseFilename, databaseFilename + "-wal"}; } QFuture> FileIndexer::queryAsync(std::string_view view, const QueryParams ¶ms) { - return m_queryEngine.query(view, params); + QueryParams paramsWithRegex = params; + paramsWithRegex.useRegex = m_useRegex; + return m_queryEngine.query(view, paramsWithRegex); } FileIndexer::FileIndexer() : m_writer(std::make_shared()), m_dispatcher(m_writer) { diff --git a/vicinae/src/services/files-service/file-indexer/file-indexer.hpp b/vicinae/src/services/files-service/file-indexer/file-indexer.hpp index 1c6067dab..2de6c605d 100644 --- a/vicinae/src/services/files-service/file-indexer/file-indexer.hpp +++ b/vicinae/src/services/files-service/file-indexer/file-indexer.hpp @@ -36,10 +36,13 @@ class FileIndexer : public AbstractFileIndexer { std::unique_ptr m_homeWatcher; - // move that somewhere else later + bool m_useRegex = false; + QString preparePrefixSearchQuery(std::string_view query) const; public: + bool useRegex() const { return m_useRegex; } + void startFullScan(); void startSingleScan(std::filesystem::path entrypoint, ScanType type, std::vector 
// ai-generated based on the test cases, so please keep cases updated if making changes to this file.
//
// The extracted words are used as a pre-filter in front of the real regex match, so the one rule
// every branch below must respect is: never emit text that a matching string might NOT contain.
// Dropping text or splitting a word is always safe; gluing two pieces together is not.

/// True when `c`, following a backslash, names a character class (`\d \D \s \S \w \W`).
static constexpr bool isCharClass(char c) {
  return c == 'd' || c == 'D' || c == 's' || c == 'S' || c == 'w' || c == 'W';
}

/// True when `c`, following a backslash, names a zero-width assertion (`\b \B \< \> \A \Z`).
static constexpr bool isAnchor(char c) {
  return c == 'b' || c == 'B' || c == '<' || c == '>' || c == 'A' || c == 'Z';
}

/// True for unescaped characters that open or modify a regex construct.
static constexpr bool isSpecialChar(char c) {
  return c == '[' || c == '(' || c == '.' || c == '*' || c == '+' || c == '?' || c == '|' || c == '$' ||
         c == '^' || c == '{';
}

/// True when `c` is a hexadecimal digit.
static bool isHexDigit(char c) { return std::isxdigit(static_cast<unsigned char>(c)) != 0; }

/// Ends the current word: appends a single space unless `result` is empty, already ends a word,
/// or ends with a caret that is still waiting for its word.
static void addSpaceIfNeeded(std::string &result) {
  if (!result.empty() && result.back() != ' ' && result.back() != '^') { result += ' '; }
}

/// Emits the `^` remembered from the pattern right before the literal it anchors.
static void flushCaretIfPending(std::string &result, bool &caretPending) {
  if (caretPending) {
    result += '^';
    caretPending = false;
  }
}

/// Appends `word` to the space-separated `filtered` list when it has at least `minWordLength`
/// characters; a leading caret marker does not count towards the length.
static void addFilteredWord(std::string &filtered, const std::string &word, bool hasCaret,
                            size_t minWordLength) {
  size_t wordLen = word.length() - (hasCaret && !word.empty() ? 1 : 0);
  if (!word.empty() && wordLen >= minWordLength) {
    if (!filtered.empty()) { filtered += ' '; }
    filtered += word;
  }
}

/// Returns the index of the `)` closing the group opened at `open`, honouring nesting, escapes and
/// bracket classes. Returns the last index of `regex` when the group is never closed.
static size_t findGroupEnd(std::string_view regex, size_t open) {
  int depth = 0;
  bool inClass = false;

  for (size_t j = open; j < regex.length(); ++j) {
    const char ch = regex[j];

    if (ch == '\\') {
      ++j; // skip the escaped character
    } else if (inClass) {
      inClass = (ch != ']');
    } else if (ch == '[') {
      inClass = true;
    } else if (ch == '(') {
      ++depth;
    } else if (ch == ')' && --depth == 0) {
      return j;
    }
  }

  return regex.length() - 1;
}

/// Reduces a regular expression to the literal words every match must contain.
///
/// @param regex          Pattern to analyse.
/// @param minWordLength  Words shorter than this are dropped from the result; 0 keeps everything.
/// @return Space-separated literal words in pattern order. A word may start with `^` when the
///         pattern anchors it to the start. Empty when nothing static can be extracted, which is
///         always the case for a top-level alternation.
///
/// Only alphanumerics and the whitespace escapes (`\t \n \r \v \f`) survive inside words; other
/// literal characters act as word breaks.
///
/// NOTE(review): `*` and `{m,n}` keep the character they quantify (`a*b` yields "a b"), as pinned
/// by the test-suite, although such a character may legally be absent from a match.
std::string extractStaticCharsFromRegex(std::string_view regex, size_t minWordLength) {
  // What the previous pattern token did, so that `?` knows what it applies to.
  enum class Token { Other, Literal, GroupClose };

  std::string result;
  bool inBracket = false;
  bool escaped = false;
  bool caretPending = false;
  bool needSpaceAfterGroup = false;
  bool parenAfterBracket = false;
  int parenDepth = 0;
  int groupWithAlternationDepth = -1;
  size_t outerGroupStart = 0; // result length when the current/last outermost group was opened
  Token lastToken = Token::Other;

  const auto insideAlternationGroup = [&]() {
    return groupWithAlternationDepth > 0 && parenDepth >= groupWithAlternationDepth;
  };
  // Forget everything collected since the outermost group was opened. Coarser than strictly
  // necessary for nested groups, but it can only remove text, which is always safe.
  const auto truncateToOuterGroup = [&]() {
    if (result.size() > outerGroupStart) { result.resize(outerGroupStart); }
  };

  for (size_t i = 0; i < regex.length(); ++i) {
    const char c = regex[i];
    const Token prevToken = lastToken;
    lastToken = Token::Other;

    if (escaped) {
      escaped = false;

      // Escapes inside a bracket class or an alternation group are not static text.
      if (inBracket || insideAlternationGroup()) { continue; }

      if (needSpaceAfterGroup && parenDepth == 0) {
        addSpaceIfNeeded(result);
        needSpaceAfterGroup = false;
      }

      char literal = c;
      bool isLiteral = true;

      switch (c) {
      case 't':
        literal = '\t';
        break;
      case 'n':
        literal = '\n';
        break;
      case 'r':
        literal = '\r';
        break;
      case 'v':
        literal = '\v';
        break;
      case 'f':
        literal = '\f';
        break;
      case 'x':
        if (i + 2 < regex.length() && isHexDigit(regex[i + 1]) && isHexDigit(regex[i + 2])) {
          const char hex[3] = {regex[i + 1], regex[i + 2], '\0'};
          literal = static_cast<char>(std::strtol(hex, nullptr, 16));
          i += 2;
        } else {
          isLiteral = false; // not a two-digit hex escape: nothing we can name statically
        }
        break;
      default:
        // Only escaped punctuation is a literal. Escaped letters/digits are classes, assertions,
        // back-references, ... and `^` is reserved in the output as the anchor marker.
        isLiteral = c != '^' && !isAnchor(c) && !std::isalnum(static_cast<unsigned char>(c));
        break;
      }

      if (isLiteral) {
        result += literal;
        lastToken = Token::Literal;
      } else if (!isAnchor(c)) {
        // Something unknown sits here (e.g. `\d`), so the words around it must not be joined.
        // Zero-width assertions consume nothing and therefore do not break the word.
        addSpaceIfNeeded(result);
      }
      continue;
    }

    if (c == '\\') {
      escaped = true;
      caretPending = false;
      continue;
    }
    if (c == '[') {
      inBracket = true;
      caretPending = false;
      continue;
    }
    if (c == ']') {
      inBracket = false;
      needSpaceAfterGroup = true;
      continue;
    }
    if (inBracket) { continue; }

    switch (c) {
    case '{': {
      const size_t closePos = regex.find('}', i);
      if (closePos == std::string_view::npos) { break; } // unmatched brace: treat as a literal

      caretPending = false;
      needSpaceAfterGroup = false;
      addSpaceIfNeeded(result);
      i = closePos;
      continue;
    }
    case '|':
      // A top-level alternation leaves nothing that every match is guaranteed to contain.
      if (parenDepth == 0) { return {}; }

      if (groupWithAlternationDepth == -1) {
        groupWithAlternationDepth = parenDepth;
        truncateToOuterGroup();
      }
      continue;
    case '(':
      if (i + 1 < regex.length() && regex[i + 1] == '?') {
        if (i + 2 < regex.length() && regex[i + 2] == ':') {
          i += 2; // plain non-capturing group: skip the "?:" and handle it like "("
        } else {
          // Inline flags, lookarounds, named groups, ...: their syntax is not literal text and
          // their content is not necessarily part of the match, so skip the whole group.
          if (parenDepth == 0) { outerGroupStart = result.size(); }
          caretPending = false;
          needSpaceAfterGroup = false;
          addSpaceIfNeeded(result);
          i = findGroupEnd(regex, i);
          lastToken = Token::GroupClose;
          continue;
        }
      }

      if (++parenDepth == 1) { outerGroupStart = result.size(); }
      if (needSpaceAfterGroup) {
        addSpaceIfNeeded(result);
        parenAfterBracket = true;
      }
      needSpaceAfterGroup = false;
      continue;
    case ')':
      if (parenDepth > 0) { --parenDepth; }
      if (parenDepth == 0 && parenAfterBracket) {
        needSpaceAfterGroup = true;
        parenAfterBracket = false;
      }
      if (groupWithAlternationDepth == parenDepth + 1) {
        groupWithAlternationDepth = -1;
        // The alternation matched *something*: text before and after it is not adjacent.
        addSpaceIfNeeded(result);
      }
      lastToken = Token::GroupClose;
      continue;
    case '^':
      if (i + 1 < regex.length() && !isSpecialChar(regex[i + 1]) && regex[i + 1] != '\\') {
        caretPending = true;
      }
      continue;
    case '?':
      if (prevToken == Token::GroupClose) {
        truncateToOuterGroup(); // the whole group is optional
      } else if (prevToken == Token::Literal && !result.empty()) {
        result.pop_back(); // the literal just emitted is optional
        // An anchor must not slide onto whatever follows the optional character.
        if (!result.empty() && result.back() == '^') { result.pop_back(); }
      }
      // After `*`, `+`, `?`, `}` this is a lazy modifier and removes nothing. In every case the
      // neighbours of the optional item are not guaranteed to be adjacent.
      addSpaceIfNeeded(result);
      caretPending = false;
      needSpaceAfterGroup = false;
      continue;
    case '.':
    case '*':
    case '+':
      caretPending = false;
      needSpaceAfterGroup = false;
      addSpaceIfNeeded(result);
      continue;
    case '$':
      caretPending = false;
      needSpaceAfterGroup = false;
      continue;
    default:
      break;
    }

    // Plain literal character (or an unmatched '{').
    if (insideAlternationGroup()) { continue; }

    if (parenDepth == 0) {
      if (needSpaceAfterGroup) { addSpaceIfNeeded(result); }
      needSpaceAfterGroup = false;
    }

    flushCaretIfPending(result, caretPending);
    result += c;
    lastToken = Token::Literal;
  }

  // Keep alphanumerics, whitespace escapes and the caret marker; anything else separates words.
  std::string cleanResult;
  for (char c : result) {
    const auto uc = static_cast<unsigned char>(c);
    const bool endsWord = cleanResult.empty() || cleanResult.back() == ' ';

    if (c == ' ') {
      if (!endsWord) { cleanResult += ' '; } // no leading or doubled separators
    } else if (std::isalnum(uc) || c == '\t' || c == '\n' || c == '\r' || c == '\v' || c == '\f' ||
               c == '^') {
      cleanResult += c;
    } else if (!std::isspace(uc) && !endsWord) {
      cleanResult += ' ';
    }
  }

  while (!cleanResult.empty() && cleanResult.back() == ' ') {
    cleanResult.pop_back();
  }

  if (minWordLength == 0) { return cleanResult; }

  std::string filtered, currentWord;
  bool hasCaret = false;

  for (char c : cleanResult) {
    if (c == ' ') {
      addFilteredWord(filtered, currentWord, hasCaret, minWordLength);
      currentWord.clear();
      hasCaret = false;
    } else {
      if (c == '^' && currentWord.empty()) { hasCaret = true; }
      currentWord += c;
    }
  }

  addFilteredWord(filtered, currentWord, hasCaret, minWordLength);
  return filtered;
}