Skip to content

Commit

Permalink
regexp: make combining chars in collections work
Browse files Browse the repository at this point in the history
fixes vim#10286

Also, while at it, make debug mode work again.
  • Loading branch information
chrisbra committed Aug 30, 2023
1 parent 1bed993 commit 0c86e35
Show file tree
Hide file tree
Showing 5 changed files with 164 additions and 11 deletions.
10 changes: 8 additions & 2 deletions src/regexp.c
Original file line number Diff line number Diff line change
Expand Up @@ -2709,7 +2709,10 @@ static regengine_T bt_regengine =
bt_regcomp,
bt_regfree,
bt_regexec_nl,
bt_regexec_multi,
bt_regexec_multi
#ifdef DEBUG
,(char_u *)""
#endif
};

#include "regexp_nfa.c"
Expand All @@ -2719,7 +2722,10 @@ static regengine_T nfa_regengine =
nfa_regcomp,
nfa_regfree,
nfa_regexec_nl,
nfa_regexec_multi,
nfa_regexec_multi
#ifdef DEBUG
,(char_u *)""
#endif
};

// Which regexp engine to use? Needed for vim_regcomp().
Expand Down
4 changes: 3 additions & 1 deletion src/regexp.h
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,9 @@ struct regengine
int (*regexec_nl)(regmatch_T *, char_u *, colnr_T, int);
// bt_regexec_multi or nfa_regexec_multi
long (*regexec_multi)(regmmatch_T *, win_T *, buf_T *, linenr_T, colnr_T, int *);
//char_u *expr;
#ifdef DEBUG
char_u *expr;
#endif
};

// Flags used by vim_regsub() and vim_regsub_both()
Expand Down
39 changes: 32 additions & 7 deletions src/regexp_bt.c
Original file line number Diff line number Diff line change
Expand Up @@ -3743,13 +3743,38 @@ regmatch(

case ANYOF:
case ANYBUT:
if (c == NUL)
status = RA_NOMATCH;
else if ((cstrchr(OPERAND(scan), c) == NULL) == (op == ANYOF))
status = RA_NOMATCH;
else
ADVANCE_REGINPUT();
break;
{
char_u *q = OPERAND(scan);

if (c == NUL)
status = RA_NOMATCH;
else if ((cstrchr(q, c) == NULL) == (op == ANYOF))
status = RA_NOMATCH;
else
{
// Check following combining characters
int len = 0;
int i;

if (enc_utf8)
len = utfc_ptr2len(q) - utf_ptr2len(q);

MB_CPTR_ADV(rex.input);
MB_CPTR_ADV(q);

if (!enc_utf8 || len == 0)
break;

for (i = 0; i < len; ++i)
if (q[i] != rex.input[i])
{
status = RA_NOMATCH;
break;
}
rex.input += len;
}
break;
}

case MULTIBYTECODE:
if (has_mbyte)
Expand Down
111 changes: 110 additions & 1 deletion src/regexp_nfa.c
Original file line number Diff line number Diff line change
Expand Up @@ -1764,6 +1764,7 @@ nfa_regatom(void)
endp = skip_anyof(p);
if (*endp == ']')
{
int plen;
/*
* Try to reverse engineer character classes. For example,
* recognize that [0-9] stands for \d and [A-Za-z_] for \h,
Expand Down Expand Up @@ -2033,13 +2034,43 @@ nfa_regatom(void)
else
{
if (got_coll_char == TRUE && startc == 0)
{
EMIT(0x0a);
EMIT(NFA_CONCAT);
}
else
{
EMIT(startc);
EMIT(NFA_CONCAT);
if (!(enc_utf8 && (utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse)))))
{
EMIT(NFA_CONCAT);
}
}
}
}

if (enc_utf8 && (utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse))))
{
int i = utf_ptr2len(regparse);

c = utf_ptr2char(regparse + i);

// Add composing characters
for (;;)
{
if (c == 0)
// \x00 is translated to \x0a, start at \x01.
EMIT(1);
else
EMIT(c);
EMIT(NFA_CONCAT);
if ((i += utf_char2len(c)) >= plen)
break;
c = utf_ptr2char(regparse + i);
}
EMIT(NFA_COMPOSING);
EMIT(NFA_CONCAT);
}
MB_PTR_ADV(regparse);
} // while (p < endp)

Expand Down Expand Up @@ -6418,6 +6449,84 @@ nfa_regmatch(
result_if_matched = (t->state->c == NFA_START_COLL);
for (;;)
{
if (state->c == NFA_COMPOSING)
{
int mc = curc;
int len = 0;
nfa_state_T *end;
nfa_state_T *sta;
int cchars[MAX_MCO];
int ccount = 0;
int j;

sta = t->state->out->out;
len = 0;
if (utf_iscomposing(sta->c))
{
// Only match composing character(s), ignore base
// character. Used for ".{composing}" and "{composing}"
// (no preceding character).
len += mb_char2len(mc);
}
if (rex.reg_icombine && len == 0)
{
// If \Z was present, then ignore composing characters.
// When ignoring the base character this always matches.
if (sta->c != curc)
result = FAIL;
else
result = OK;
while (sta->c != NFA_END_COMPOSING)
sta = sta->out;
}
// Check base character matches first, unless ignored.
else if (len > 0 || mc == sta->c)
// if (len > 0 || mc == sta->c)
{
if (len == 0)
{
len += mb_char2len(mc);
sta = sta->out;
}

// We don't care about the order of composing characters.
// Get them into cchars[] first.
while (len < clen)
{
mc = mb_ptr2char(rex.input + len);
cchars[ccount++] = mc;
len += mb_char2len(mc);
if (ccount == MAX_MCO)
break;
}

// Check that each composing char in the pattern matches a
// composing char in the text. We do not check if all
// composing chars are matched.
result = OK;
while (sta->c != NFA_END_COMPOSING)
{
for (j = 0; j < ccount; ++j)
if (cchars[j] == sta->c)
break;
if (j == ccount)
{
result = FAIL;
break;
}
sta = sta->out;
}
}
else
result = FAIL;

if (t->state->out->out1->c == NFA_END_COMPOSING)
{
end = t->state->out->out1;
ADD_STATE_IF_MATCH(end);
}
break;
}
if (state->c == NFA_END_COLL)
{
result = !result_if_matched;
Expand Down
11 changes: 11 additions & 0 deletions src/testdir/test_regexp_utf8.vim
Original file line number Diff line number Diff line change
Expand Up @@ -575,5 +575,16 @@ func Test_match_too_complicated()
set regexpengine=0
endfunc

" Check that a [] collection in a substitute pattern gives the same result
" for every value of the 're' option in 0..2.
" NOTE(review): the non-ASCII literals in this function look mis-encoded in
" this copy; presumably they are a base character followed by a combining
" character -- verify against the upstream test file before relying on them.
func Test_combining_chars_in_collection()
" Work in a scratch window so no other buffer is touched.
new
for i in range(0,2)
" Select the regexp engine for this iteration.
exe "set re=".i
" Append the sample lines to the buffer.
put =['��', '', '� a�', 'abcd']
" Remove the first match of the collection on every line.
:%s/[��]//
" The expected list has one more entry than the lines put above; presumably
" the extra leading '' is the initial empty line of the new buffer.
call assert_equal(['', '', '', '� a�', 'abcd'], getline(1,'$'))
" Clear the buffer before the next engine is tried.
%d
endfor
" Wipe out the scratch buffer, discarding its changes.
bw!
endfunc

" vim: shiftwidth=2 sts=2 expandtab

0 comments on commit 0c86e35

Please sign in to comment.