Skip to content

Commit

Permalink
use a custom utf8-iterator to better handle regex captures with utf8 …
Browse files Browse the repository at this point in the history
…text

closes #404
  • Loading branch information
stefankueng committed Jan 15, 2025
1 parent b571d1a commit a52e9ff
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 20 deletions.
2 changes: 1 addition & 1 deletion ext/sktoolslib
1 change: 1 addition & 0 deletions src/BowPad.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,7 @@
<ClInclude Include="..\ext\sktoolslib\SysInfo.h" />
<ClInclude Include="..\ext\sktoolslib\TempFile.h" />
<ClInclude Include="..\ext\sktoolslib\UnicodeUtils.h" />
<ClInclude Include="..\ext\sktoolslib\Utf8ToWideIterator.h" />
<ClInclude Include="..\ext\sktoolslib\Windows10Colors.h" />
<ClInclude Include="..\ext\tinyexpr\tinyexpr.h" />
<ClInclude Include="AboutDlg.h" />
Expand Down
3 changes: 3 additions & 0 deletions src/BowPad.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -414,6 +414,9 @@
<ClInclude Include="Commands\CmdTail.h">
<Filter>Commands</Filter>
</ClInclude>
<ClInclude Include="..\ext\sktoolslib\Utf8ToWideIterator.h">
<Filter>sktoolslib</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="stdafx.cpp">
Expand Down
43 changes: 24 additions & 19 deletions src/Commands/CmdRegexCapture.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// This file is part of BowPad.
//
// Copyright (C) 2013-2022 - Stefan Kueng
// Copyright (C) 2013-2022, 2025 - Stefan Kueng
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
Expand Down Expand Up @@ -30,6 +30,7 @@
#include <sstream>

#include <dwmapi.h>
#include <Utf8ToWideIterator.h>
#pragma comment(lib, "dwmapi.lib")

constexpr auto DEFAULT_MAX_SEARCH_STRINGS = 20;
Expand Down Expand Up @@ -227,7 +228,7 @@ void CRegexCaptureDlg::DoCapture()
std::wstring sCaptureW = GetDlgItemText(IDC_CAPTURECOMBO).get();
UpdateCombo(IDC_CAPTURECOMBO, sCaptureW, m_maxCaptureStrings);

auto sRegex = CUnicodeUtils::StdGetUTF8(sRegexW);
auto sRegex = sRegexW; // CUnicodeUtils::StdGetUTF8(sRegexW);
auto sCapture = UnEscape(CUnicodeUtils::StdGetUTF8(sCaptureW));
if (sCapture.empty())
sCapture = "$&";
Expand All @@ -249,19 +250,20 @@ void CRegexCaptureDlg::DoCapture()
APPVERIFY(false); // Shouldn't happen.
}
}
sCaptureW = CUnicodeUtils::StdGetUnicode(sCapture);
try
{
auto findText = GetDlgItemText(IDC_SEARCHCOMBO);
std::regex::flag_type rxFlags = std::regex_constants::ECMAScript;
if (IsDlgButtonChecked(*this, IDC_ICASE))
rxFlags |= std::regex_constants::icase;
// replace all "\n" chars with "(?:\n|\r\n|\n\r)"
if ((sRegex.size() > 1) && (sRegex.find("\\r") == std::wstring::npos))
if ((sRegex.size() > 1) && (sRegex.find(L"\\r") == std::wstring::npos))
{
SearchReplace(sRegex, "\\n", "(!:\\n|\\r\\n|\\n\\r)");
SearchReplace(sRegex, L"\\n", L"(!:\\n|\\r\\n|\\n\\r)");
}

const std::regex rx(sRegex, rxFlags);
const std::wregex rx(sRegex, rxFlags);

m_captureWnd.Scintilla().ClearAll();

Expand All @@ -272,34 +274,34 @@ void CRegexCaptureDlg::DoCapture()
Scintilla().IndicatorClearRange(0, lengthDoc);
}

const char* pText = static_cast<const char*>(Scintilla().CharacterPointer());
std::string_view searchText(pText, lengthDoc);
std::match_results<std::string_view::const_iterator> whatC;
std::regex_constants::match_flag_type flags = std::regex_constants::match_flag_type::match_default | std::regex_constants::match_flag_type::match_not_null;
auto* pText = static_cast<const unsigned char*>(Scintilla().CharacterPointer());
std::match_results<Utf8ToWideIterator> whatC;
std::regex_constants::match_flag_type flags = std::regex_constants::match_flag_type::match_default | std::regex_constants::match_flag_type::match_not_null;
if (IsDlgButtonChecked(*this, IDC_DOTNEWLINE))
flags |= std::regex_constants::match_flag_type::match_not_eol;
auto start = searchText.cbegin();
auto end = searchText.cend();
Utf8ToWideIterator start(pText, 0);
Utf8ToWideIterator end(pText, lengthDoc);
std::vector<std::tuple<int, size_t, size_t>> capturePositions;
std::ostringstream outStream;
std::wostringstream outStream;

while (std::regex_search(start, end, whatC, rx, flags))
{
if (whatC[0].matched)
{
auto out = whatC.format(sCapture, flags);
auto out = whatC.format(sCaptureW, flags);
outStream << out;
if (outStream.tellp() > static_cast<long long>(5 * 1024 * 1024))
{
const auto& sOut = outStream.str();
const auto& sOut = CUnicodeUtils::StdGetUTF8(outStream.str());
m_captureWnd.Scintilla().AppendText(sOut.size(), sOut.c_str());
outStream.str("");
outStream.str(L"");
outStream.clear();
}

int captureCount = 0;
for (const auto& w : whatC)
{
capturePositions.push_back(std::make_tuple(captureCount, w.first - searchText.cbegin(), w.length()));
capturePositions.push_back(std::make_tuple(captureCount, w.first.CurrentPos(), CUnicodeUtils::StdGetUTF8(w.str()).length()));
++captureCount;
}
}
Expand All @@ -315,14 +317,17 @@ void CRegexCaptureDlg::DoCapture()
// update flags for continuation
flags |= std::regex_constants::match_flag_type::match_prev_avail;
}
const auto& resultString = outStream.str();
const auto& resultString = CUnicodeUtils::StdGetUTF8(outStream.str());
m_captureWnd.Scintilla().AppendText(resultString.size(), resultString.c_str());
m_captureWnd.UpdateLineNumberWidth();

for (const auto& [num, begin, length] : capturePositions)
{
Scintilla().SetIndicatorCurrent(INDIC_REGEXCAPTURE + num);
Scintilla().IndicatorFillRange(begin, length);
if (num < (INDIC_REGEXCAPTURE_END - INDIC_REGEXCAPTURE))
{
Scintilla().SetIndicatorCurrent(INDIC_REGEXCAPTURE + num);
Scintilla().IndicatorFillRange(begin, length);
}
}

std::vector<std::wstring> regexStrings;
Expand Down

0 comments on commit a52e9ff

Please sign in to comment.