Match composing characters inside a regexp collection ([...]).

Summary of the hunks below:
  * src/regexp.h, src/regexp.c: the commented-out `expr` member of
    struct regengine becomes a real member compiled only under DEBUG, and
    both engine initializers (bt_regengine, nfa_regengine) gain a matching
    DEBUG-only "" entry.
  * src/regexp_bt.c: for ANYOF/ANYBUT, once the current character is accepted
    by the collection, and only when enc_utf8 is set, the
    len = utfc_ptr2len(q) - utf_ptr2len(q) bytes that follow the first
    character of the operand are compared with the bytes that follow the
    current input character; a mismatch yields RA_NOMATCH.
  * src/regexp_nfa.c: nfa_regatom() emits the composing characters that
    follow a collection item, followed by NFA_COMPOSING, and nfa_regmatch()
    handles NFA_COMPOSING inside its collection loop.
  * src/testdir/test_regexp_utf8.vim: adds Test_combining_chars_in_collection,
    which runs the substitution with 'regexpengine' set to 0, 1 and 2.

NOTE(review): the line structure of this patch was restored from a copy in
which all newlines had been collapsed.  Blank context lines were placed so
that every hunk matches its "@@" line counts; leading indentation is
reconstructed and may not match the target files byte for byte, so
`git apply --ignore-whitespace` may be needed.
NOTE(review): the nfa_regmatch() hunk adds a commented-out duplicate of the
`else if (len > 0 || mc == sta->c)` condition and re-assigns `len = 0` right
after declaring it with that value; both look like leftovers that could be
dropped in a follow-up.

diff --git a/src/regexp.c b/src/regexp.c
index 9c576c6893cb21..a32717afa37a9d 100644
--- a/src/regexp.c
+++ b/src/regexp.c
@@ -2709,7 +2709,10 @@ static regengine_T bt_regengine =
     bt_regcomp,
     bt_regfree,
     bt_regexec_nl,
-    bt_regexec_multi,
+    bt_regexec_multi
+#ifdef DEBUG
+    ,(char_u *)""
+#endif
 };
 
 #include "regexp_nfa.c"
@@ -2719,7 +2722,10 @@ static regengine_T nfa_regengine =
     nfa_regcomp,
     nfa_regfree,
     nfa_regexec_nl,
-    nfa_regexec_multi,
+    nfa_regexec_multi
+#ifdef DEBUG
+    ,(char_u *)""
+#endif
 };
 
 // Which regexp engine to use? Needed for vim_regcomp().
diff --git a/src/regexp.h b/src/regexp.h
index d6c8f48c7b9339..1ff2e1b6efae6f 100644
--- a/src/regexp.h
+++ b/src/regexp.h
@@ -178,7 +178,9 @@ struct regengine
     int (*regexec_nl)(regmatch_T *, char_u *, colnr_T, int);
     // bt_regexec_mult or nfa_regexec_mult
     long (*regexec_multi)(regmmatch_T *, win_T *, buf_T *, linenr_T, colnr_T, int *);
-    //char_u *expr;
+#ifdef DEBUG
+    char_u *expr;
+#endif
 };
 
 // Flags used by vim_regsub() and vim_regsub_both()
diff --git a/src/regexp_bt.c b/src/regexp_bt.c
index 522cf37e2dfb1f..198946e0dcb63b 100644
--- a/src/regexp_bt.c
+++ b/src/regexp_bt.c
@@ -3743,13 +3743,38 @@ regmatch(
 
           case ANYOF:
           case ANYBUT:
-            if (c == NUL)
-                status = RA_NOMATCH;
-            else if ((cstrchr(OPERAND(scan), c) == NULL) == (op == ANYOF))
-                status = RA_NOMATCH;
-            else
-                ADVANCE_REGINPUT();
-            break;
+            {
+                char_u *q = OPERAND(scan);
+
+                if (c == NUL)
+                    status = RA_NOMATCH;
+                else if ((cstrchr(q, c) == NULL) == (op == ANYOF))
+                    status = RA_NOMATCH;
+                else
+                {
+                    // Check following combining characters
+                    int len = 0;
+                    int i;
+
+                    if (enc_utf8)
+                        len = utfc_ptr2len(q) - utf_ptr2len(q);
+
+                    MB_CPTR_ADV(rex.input);
+                    MB_CPTR_ADV(q);
+
+                    if (!enc_utf8 || len == 0)
+                        break;
+
+                    for (i = 0; i < len; ++i)
+                        if (q[i] != rex.input[i])
+                        {
+                            status = RA_NOMATCH;
+                            break;
+                        }
+                    rex.input += len;
+                }
+                break;
+            }
 
           case MULTIBYTECODE:
             if (has_mbyte)
diff --git a/src/regexp_nfa.c b/src/regexp_nfa.c
index d724d527b6d23b..ff54348905e748 100644
--- a/src/regexp_nfa.c
+++ b/src/regexp_nfa.c
@@ -1764,6 +1764,7 @@ nfa_regatom(void)
             endp = skip_anyof(p);
             if (*endp == ']')
             {
+                int plen;
                 /*
                  * Try to reverse engineer character classes. For example,
                  * recognize that [0-9] stands for \d and [A-Za-z_] for \h,
@@ -2033,13 +2034,43 @@ nfa_regatom(void)
                         else
                         {
                             if (got_coll_char == TRUE && startc == 0)
+                            {
                                 EMIT(0x0a);
+                                EMIT(NFA_CONCAT);
+                            }
                             else
+                            {
                                 EMIT(startc);
-                            EMIT(NFA_CONCAT);
+                                if (!(enc_utf8 && (utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse)))))
+                                {
+                                    EMIT(NFA_CONCAT);
+                                }
+                            }
                         }
                     }
 
+                    if (enc_utf8 && (utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse))))
+                    {
+                        int i = utf_ptr2len(regparse);
+
+                        c = utf_ptr2char(regparse + i);
+
+                        // Add composing characters
+                        for (;;)
+                        {
+                            if (c == 0)
+                                // \x00 is translated to \x0a, start at \x01.
+                                EMIT(1);
+                            else
+                                EMIT(c);
+                            EMIT(NFA_CONCAT);
+                            if ((i += utf_char2len(c)) >= plen)
+                                break;
+                            c = utf_ptr2char(regparse + i);
+                        }
+                        EMIT(NFA_COMPOSING);
+                        EMIT(NFA_CONCAT);
+                    }
                     MB_PTR_ADV(regparse);
                 } // while (p < endp)
 
@@ -6418,6 +6449,84 @@ nfa_regmatch(
                 result_if_matched = (t->state->c == NFA_START_COLL);
                 for (;;)
                 {
+                    if (state->c == NFA_COMPOSING)
+                    {
+                        int mc = curc;
+                        int len = 0;
+                        nfa_state_T *end;
+                        nfa_state_T *sta;
+                        int cchars[MAX_MCO];
+                        int ccount = 0;
+                        int j;
+
+                        sta = t->state->out->out;
+                        len = 0;
+                        if (utf_iscomposing(sta->c))
+                        {
+                            // Only match composing character(s), ignore base
+                            // character. Used for ".{composing}" and "{composing}"
+                            // (no preceding character).
+                            len += mb_char2len(mc);
+                        }
+                        if (rex.reg_icombine && len == 0)
+                        {
+                            // If \Z was present, then ignore composing characters.
+                            // When ignoring the base character this always matches.
+                            if (sta->c != curc)
+                                result = FAIL;
+                            else
+                                result = OK;
+                            while (sta->c != NFA_END_COMPOSING)
+                                sta = sta->out;
+                        }
+                        // Check base character matches first, unless ignored.
+                        else if (len > 0 || mc == sta->c)
+//                      if (len > 0 || mc == sta->c)
+                        {
+                            if (len == 0)
+                            {
+                                len += mb_char2len(mc);
+                                sta = sta->out;
+                            }
+
+                            // We don't care about the order of composing characters.
+                            // Get them into cchars[] first.
+                            while (len < clen)
+                            {
+                                mc = mb_ptr2char(rex.input + len);
+                                cchars[ccount++] = mc;
+                                len += mb_char2len(mc);
+                                if (ccount == MAX_MCO)
+                                    break;
+                            }
+
+                            // Check that each composing char in the pattern matches a
+                            // composing char in the text. We do not check if all
+                            // composing chars are matched.
+                            result = OK;
+                            while (sta->c != NFA_END_COMPOSING)
+                            {
+                                for (j = 0; j < ccount; ++j)
+                                    if (cchars[j] == sta->c)
+                                        break;
+                                if (j == ccount)
+                                {
+                                    result = FAIL;
+                                    break;
+                                }
+                                sta = sta->out;
+                            }
+                        }
+                        else
+                            result = FAIL;
+
+                        if (t->state->out->out1->c == NFA_END_COMPOSING)
+                        {
+                            end = t->state->out->out1;
+                            ADD_STATE_IF_MATCH(end);
+                        }
+                        break;
+                    }
                     if (state->c == NFA_END_COLL)
                     {
                         result = !result_if_matched;
diff --git a/src/testdir/test_regexp_utf8.vim b/src/testdir/test_regexp_utf8.vim
index b591aedbb7c026..6669dee57e4cc7 100644
--- a/src/testdir/test_regexp_utf8.vim
+++ b/src/testdir/test_regexp_utf8.vim
@@ -575,5 +575,16 @@ func Test_match_too_complicated()
   set regexpengine=0
 endfunc
 
+func Test_combining_chars_in_collection()
+  new
+  for i in range(0,2)
+    exe "set re=".i
+    put =['ɔ̃', 'ɔ', '̃ ã', 'abcd']
+    :%s/[ɔ̃]//
+    call assert_equal(['', '', 'ɔ', '̃ ã', 'abcd'], getline(1,'$'))
+    %d
+  endfor
+  bw!
+endfunc
 
 " vim: shiftwidth=2 sts=2 expandtab