From 957d8f22310ebfbac08f6bb22ecddad23be026b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9rgio=20Queiroz?= Date: Mon, 5 Aug 2019 16:11:07 -0300 Subject: [PATCH] Updating LPegLabel to the codebase of LPeg(1.1.0?) (https://github.com/roberto-ieru/LPeg/tree/c2680687d148820847607e13ed7100e60d94c79e) --- lpcode.c | 79 ++++++++++++++++++++++++-------------- lpprint.c | 54 ++++++++++++++++---------- lptree.c | 104 +++++++++++++++++++++++++++++++++++++------------- lptree.h | 11 ++++-- lptypes.h | 8 ++-- lpvm.c | 54 +++++++++++++++++++++++--- lpvm.h | 15 +++++--- test.lua | 98 ++++++++++++++++++++++++++++++++++++++++++++--- testlabel.lua | 4 +- 9 files changed, 328 insertions(+), 99 deletions(-) diff --git a/lpcode.c b/lpcode.c index ecc6fa3..a2d08f7 100644 --- a/lpcode.c +++ b/lpcode.c @@ -196,7 +196,7 @@ int hascaptures (TTree *tree) { int checkaux (TTree *tree, int pred) { tailcall: switch (tree->tag) { - case TChar: case TSet: case TAny: + case TChar: case TSet: case TAny: case TUTFR: case TFalse: case TOpenCall: case TThrow: /* labeled failure */ return 0; /* not nullable */ case TRep: case TTrue: @@ -220,7 +220,7 @@ int checkaux (TTree *tree, int pred) { if (checkaux(sib2(tree), pred)) return 1; /* else return checkaux(sib1(tree), pred); */ tree = sib1(tree); goto tailcall; - case TCapture: case TGrammar: case TRule: + case TCapture: case TGrammar: case TRule: case TXInfo: /* return checkaux(sib1(tree), pred); */ tree = sib1(tree); goto tailcall; case TCall: /* return checkaux(sib2(tree), pred); */ @@ -239,12 +239,14 @@ int fixedlen (TTree *tree) { switch (tree->tag) { case TChar: case TSet: case TAny: return len + 1; + case TUTFR: + return (tree->cap == sib1(tree)->cap) ? len + tree->cap : -1; case TFalse: case TTrue: case TNot: case TAnd: case TBehind: return len; case TRep: case TRunTime: case TOpenCall: case TThrow: /* labeled failure */ return -1; - case TCapture: case TRule: case TGrammar: + case TCapture: case TRule: case TGrammar: case TXInfo: /* return fixedlen(sib1(tree)); */ tree = sib1(tree); goto tailcall; case TCall: { @@ -254,14 +256,14 @@ int fixedlen (TTree *tree) { else return len + n1; } - case TSeq: { + case TSeq: { int n1 = fixedlen(sib1(tree)); if (n1 < 0) return -1; /* else return fixedlen(sib2(tree)) + len; */ len += n1; tree = sib2(tree); goto tailcall; } - case TChoice: { + case TChoice: { int n1 = fixedlen(sib1(tree)); int n2 = fixedlen(sib2(tree)); if (n1 != n2 || n1 < 0) @@ -299,6 +301,13 @@ static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { tocharset(tree, firstset); return 0; } + case TUTFR: { + int c; + loopset(i, firstset->cs[i] = 0); /* erase all chars */ + for (c = tree->key; c <= sib1(tree)->key; c++) + setchar(firstset->cs, c); + return 0; + } case TTrue: { loopset(i, firstset->cs[i] = follow->cs[i]); return 1; /* accepts the empty string */ @@ -311,7 +320,7 @@ static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { loopset(i, firstset->cs[i] = fullset->cs[i]); return 1; } - case TChoice: { + case TChoice: { Charset csaux; int e1 = getfirst(sib1(tree), follow, firstset); int e2 = getfirst(sib2(tree), follow, &csaux); @@ -339,7 +348,7 @@ static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { loopset(i, firstset->cs[i] |= follow->cs[i]); return 1; /* accept the empty string */ } - case TCapture: case TGrammar: case TRule: { + case TCapture: case TGrammar: case TRule: case TXInfo: { /* return getfirst(sib1(tree), follow, firstset); */ tree = sib1(tree); goto tailcall; } @@ -385,10 +394,10 @@ static int headfail (TTree *tree) { case TChar: case TSet: case TAny: case TFalse: return 1; case TTrue: case TRep: case TRunTime: case TNot: - case TBehind: + case TBehind: case TUTFR: case TThrow: /* labeled failure: must always throw the label */ return 0; - case TCapture: case TGrammar: case TRule: case TAnd: + case TCapture: case TGrammar: case TRule: case TXInfo: case TAnd: tree = sib1(tree); goto tailcall; /* return headfail(sib1(tree)); */ case TCall: tree = sib2(tree); goto tailcall; /* return headfail(sib2(tree)); */ @@ -396,7 +405,7 @@ static int headfail (TTree *tree) { if (!nofail(sib2(tree))) return 0; /* else return headfail(sib1(tree)); */ tree = sib1(tree); goto tailcall; - case TChoice: + case TChoice: if (!headfail(sib1(tree))) return 0; /* else return headfail(sib2(tree)); */ tree = sib2(tree); goto tailcall; @@ -413,7 +422,7 @@ static int headfail (TTree *tree) { static int needfollow (TTree *tree) { tailcall: switch (tree->tag) { - case TChar: case TSet: case TAny: + case TChar: case TSet: case TAny: case TUTFR: case TFalse: case TTrue: case TAnd: case TNot: case TRunTime: case TGrammar: case TCall: case TBehind: case TThrow: /* (?)labeled failure */ @@ -425,7 +434,7 @@ static int needfollow (TTree *tree) { case TSeq: tree = sib2(tree); goto tailcall; default: assert(0); return 0; - } + } } /* }====================================================== */ @@ -448,12 +457,12 @@ int sizei (const Instruction *i) { case ITestSet: return CHARSETINSTSIZE + 1; case ITestChar: case ITestAny: case IChoice: case IJmp: case ICall: case IOpenCall: case ICommit: case IPartialCommit: case IBackCommit: + case IUTFR: return 2; case IThrow: case IPredChoice: /* labeled failure */ return 2; case IThrowRec: /* labeled failure */ return 3; - default: return 1; } } @@ -517,8 +526,9 @@ static int addinstruction (CompileState *compst, Opcode op, int aux) { static int addoffsetinst (CompileState *compst, Opcode op) { int i = addinstruction(compst, op, 0); /* instruction */ addinstruction(compst, (Opcode)0, 0); /* open space for offset */ - assert(op == ITestSet || sizei(&getinstr(compst, i)) == 2 || + assert(op == ITestSet || sizei(&getinstr(compst, i)) == 2 || op == IThrowRec); /* labeled failure */ + return i; } @@ -528,13 +538,13 @@ static void codethrow (CompileState *compst, TTree *throw) { int recov, aux; if (throw->u.ps != 0) { recov = addoffsetinst(compst, IThrowRec); - assert(sib2(throw)->tag == TRule); + assert(sib1(sib2(throw))->tag == TXInfo); } else { recov = addinstruction(compst, IThrow, 0); } aux = nextinstruction(compst); getinstr(compst, aux).i.key = throw->key; /* next instruction keeps only rule name */ - getinstr(compst, recov).i.key = sib2(throw)->cap; /* rule number */ + getinstr(compst, recov).i.key = sib1(sib2(throw))->u.n; /* rule number */ } /* labeled failure */ @@ -547,6 +557,16 @@ static void setoffset (CompileState *compst, int instruction, int offset) { } +static void codeutfr (CompileState *compst, TTree *tree) { + int i = addoffsetinst(compst, IUTFR); + int to = sib1(tree)->u.n; + assert(sib1(tree)->tag == TXInfo); + getinstr(compst, i + 1).offset = tree->u.n; + getinstr(compst, i).i.aux = to & 0xff; + getinstr(compst, i).i.key = to >> 8; +} + + /* ** Add a capture instruction: ** 'op' is the capture instruction; 'cap' the capture kind; @@ -694,11 +714,11 @@ static void codebehind (CompileState *compst, TTree *tree) { /* ** Choice; optimizations: -** - when p1 is headfail or -** when first(p1) and first(p2) are disjoint, than -** a character not in first(p1) cannot go to p1, and a character -** in first(p1) cannot go to p2 (at it is not in first(p2)). -** (The optimization is not valid if p1 accepts the empty string, +** - when p1 is headfail or when first(p1) and first(p2) are disjoint, +** than a character not in first(p1) cannot go to p1 and a character +** in first(p1) cannot go to p2, either because p1 will accept +** (headfail) or because it is not in first(p2) (disjoint). +** (The second case is not valid if p1 accepts the empty string, ** as then there is no character at all...) ** - when p2 is empty and opt is true; a IPartialCommit can reuse ** the Choice already active in the stack. @@ -715,7 +735,7 @@ static void codechoice (CompileState *compst, TTree *p1, TTree *p2, int opt, int jmp = NOINST; codegen(compst, p1, 0, test, fl); if (!emptyp2) - jmp = addoffsetinst(compst, IJmp); + jmp = addoffsetinst(compst, IJmp); jumptohere(compst, test); codegen(compst, p2, opt, NOINST, fl); jumptohere(compst, jmp); @@ -726,7 +746,7 @@ static void codechoice (CompileState *compst, TTree *p1, TTree *p2, int opt, codegen(compst, p1, 1, NOINST, fullset); } else { - /* == + /* == test(first(p1)) -> L1; choice L1; ; commit L2; L1: ; L2: */ int pcommit; int test = codetestset(compst, &cs1, e1); @@ -887,7 +907,7 @@ static void correctcalls (CompileState *compst, int *positions, else code[i].i.code = ICall; jumptothere(compst, i, rule); /* call jumps to respective rule */ - } else if (code[i].i.code == IThrowRec) { + } else if (code[i].i.code == IThrowRec) { /* labeled failure */ int n = code[i].i.key; /* rule number */ int rule = positions[n]; /* rule position */ assert(rule == from || code[rule - 1].i.code == IRet); @@ -912,8 +932,10 @@ static void codegrammar (CompileState *compst, TTree *grammar) { int start = gethere(compst); /* here starts the initial rule */ jumptohere(compst, firstcall); for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) { + TTree *r = sib1(rule); + assert(r->tag == TXInfo); positions[rulenumber++] = gethere(compst); /* save rule position */ - codegen(compst, sib1(rule), 0, NOINST, fullset); /* code rule */ + codegen(compst, sib1(r), 0, NOINST, fullset); /* code rule */ addinstruction(compst, IRet, 0); } assert(rule->tag == TTrue); @@ -924,8 +946,8 @@ static void codegrammar (CompileState *compst, TTree *grammar) { static void codecall (CompileState *compst, TTree *call) { int c = addoffsetinst(compst, IOpenCall); /* to be corrected later */ - getinstr(compst, c).i.key = sib2(call)->cap; /* rule number */ - assert(sib2(call)->tag == TRule); + assert(sib1(sib2(call))->tag == TXInfo); + getinstr(compst, c).i.key = sib1(sib2(call))->u.n; /* rule number */ } @@ -963,6 +985,7 @@ static void codegen (CompileState *compst, TTree *tree, int opt, int tt, case TSet: codecharset(compst, treebuffer(tree), tt); break; case TTrue: break; case TFalse: addinstruction(compst, IFail, 0); break; + case TUTFR: codeutfr(compst, tree); break; case TChoice: codechoice(compst, sib1(tree), sib2(tree), opt, fl); break; case TRep: coderep(compst, sib1(tree), opt, fl); break; case TBehind: codebehind(compst, tree); break; @@ -1013,7 +1036,7 @@ static void peephole (CompileState *compst) { case IRet: case IFail: case IFailTwice: case IEnd: { /* instructions with unconditional implicit jumps */ code[i] = code[ft]; /* jump becomes that instruction */ - code[i + 1].i.code = IAny; /* 'no-op' for target position */ + code[i + 1].i.code = IEmpty; /* 'no-op' for target position */ break; } case ICommit: case IPartialCommit: diff --git a/lpprint.c b/lpprint.c index 76a7007..af03edc 100644 --- a/lpprint.c +++ b/lpprint.c @@ -56,21 +56,26 @@ void printinst (const Instruction *op, const Instruction *p) { const char *const names[] = { "any", "char", "set", "testany", "testchar", "testset", - "span", "behind", + "span", "utf-range", "behind", "ret", "end", "choice", "pred_choice", "jmp", "call", "open_call", /* labeled failure */ "commit", "partial_commit", "back_commit", "failtwice", "fail", "giveup", "fullcapture", "opencapture", "closecapture", "closeruntime", "throw", "throw_rec", /* labeled failure */ + "--" }; printf("%02ld: %s ", (long)(p - op), names[p->i.code]); switch ((Opcode)p->i.code) { case IChar: { - printf("'%c'", p->i.aux); + printf("'%c' (%02x)", p->i.aux, p->i.aux); break; } case ITestChar: { - printf("'%c'", p->i.aux); printjmp(op, p); + printf("'%c' (%02x)", p->i.aux, p->i.aux); printjmp(op, p); + break; + } + case IUTFR: { + printf("%d - %d", p[1].offset, utf_to(p)); break; } case IFullCapture: { @@ -157,11 +162,11 @@ void printcaplist (Capture *cap, Capture *limit) { static const char *tagnames[] = { "char", "set", "any", - "true", "false", + "true", "false", "utf8.range", "rep", "seq", "choice", "not", "and", - "call", "opencall", "rule", "grammar", + "call", "opencall", "rule", "xinfo", "grammar", "behind", "capture", "run-time", "throw" /* labeled failure */ @@ -170,6 +175,7 @@ static const char *tagnames[] = { void printtree (TTree *tree, int ident) { int i; + int sibs = numsiblings[tree->tag]; for (i = 0; i < ident; i++) printf(" "); printf("%s", tagnames[tree->tag]); switch (tree->tag) { @@ -186,25 +192,34 @@ void printtree (TTree *tree, int ident) { printf("\n"); break; } + case TUTFR: { + assert(sib1(tree)->tag == TXInfo); + printf(" %d (%02x %d) - %d (%02x %d) \n", + tree->u.n, tree->key, tree->cap, + sib1(tree)->u.n, sib1(tree)->key, sib1(tree)->cap); + break; + } case TOpenCall: case TCall: { - assert(sib2(tree)->tag == TRule); - printf(" key: %d (rule: %d)\n", tree->key, sib2(tree)->cap); + assert(sib1(sib2(tree))->tag == TXInfo); + printf(" key: %d (rule: %d)\n", tree->key, sib1(sib2(tree))->u.n); break; } case TBehind: { printf(" %d\n", tree->u.n); - printtree(sib1(tree), ident + 2); break; } case TCapture: { printf(" kind: '%s' key: %d\n", capkind(tree->cap), tree->key); - printtree(sib1(tree), ident + 2); break; } case TRule: { - printf(" n: %d key: %d\n", tree->cap, tree->key); - printtree(sib1(tree), ident + 2); - break; /* do not print next rule as a sibling */ + printf(" key: %d\n", tree->key); + sibs = 1; /* do not print 'sib2' (next rule) as a sibling */ + break; + } + case TXInfo: { + printf(" n: %d\n", tree->u.n); + break; } case TGrammar: { TTree *rule = sib1(tree); @@ -214,6 +229,7 @@ void printtree (TTree *tree, int ident) { rule = sib2(rule); } assert(rule->tag == TTrue); /* sentinel */ + sibs = 0; /* siblings already handled */ break; } case TThrow: { /* labeled failure */ @@ -222,16 +238,14 @@ void printtree (TTree *tree, int ident) { printf(" key: %d (rule: %d)\n", tree->key, sib2(tree)->cap); break; } - default: { - int sibs = numsiblings[tree->tag]; + default: printf("\n"); - if (sibs >= 1) { - printtree(sib1(tree), ident + 2); - if (sibs >= 2) - printtree(sib2(tree), ident + 2); - } break; - } + } + if (sibs >= 1) { + printtree(sib1(tree), ident + 2); + if (sibs >= 2) + printtree(sib2(tree), ident + 2); } } diff --git a/lptree.c b/lptree.c index b1a32c4..4afbae7 100644 --- a/lptree.c +++ b/lptree.c @@ -21,11 +21,11 @@ /* number of siblings for each tree */ const byte numsiblings[] = { 0, 0, 0, /* char, set, any */ - 0, 0, /* true, false */ + 0, 0, 0, /* true, false, utf-range */ 1, /* rep */ 2, 2, /* seq, choice */ 1, 1, /* not, and */ - 0, 0, 2, 1, /* call, opencall, rule, grammar */ + 0, 0, 2, 1, 1, /* call, opencall, rule, prerule, grammar */ 1, /* behind */ 1, 1, /* capture, runtime capture */ 0 /* labeled failure throw */ @@ -58,7 +58,7 @@ static void fixonecall (lua_State *L, int postable, TTree *g, TTree *t, byte tag lua_gettable(L, postable); /* query name in position table */ n = lua_tonumber(L, -1); /* get (absolute) position */ lua_pop(L, 1); /* remove position */ - if (tag == TOpenCall) { + if (tag == TOpenCall) { /* labeled failure */ if (n == 0) { /* no position? */ lua_rawgeti(L, -1, t->key); /* get rule's name again */ luaL_error(L, "rule '%s' undefined in given grammar", val2str(L, -1)); @@ -109,7 +109,7 @@ static void finalfix (lua_State *L, int postable, TTree *g, TTree *t) { return; case TOpenCall: { if (g != NULL) /* inside a grammar? */ - fixonecall(L, postable, g, t, TOpenCall); + fixonecall(L, postable, g, t, TOpenCall); /* labeled failure */ else { /* open call outside grammar */ lua_rawgeti(L, -1, t->key); luaL_error(L, "rule '%s' used outside a grammar", val2str(L, -1)); @@ -694,6 +694,56 @@ static int lp_range (lua_State *L) { } +/* +** Fills a tree node with basic information about the UTF-8 code point +** 'cpu': its value in 'n', its length in 'cap', and its first byte in +** 'key' +*/ +static void codeutftree (lua_State *L, TTree *t, lua_Unsigned cpu, int arg) { + int len, fb, cp; + cp = (int)cpu; + if (cp <= 0x7f) { /* one byte? */ + len = 1; + fb = cp; + } else if (cp <= 0x7ff) { + len = 2; + fb = 0xC0 | (cp >> 6); + } else if (cp <= 0xffff) { + len = 3; + fb = 0xE0 | (cp >> 12); + } + else { + luaL_argcheck(L, cpu <= 0x10ffffu, arg, "invalid code point"); + len = 4; + fb = 0xF0 | (cp >> 18); + } + t->u.n = cp; + t->cap = len; + t->key = fb; +} + + +static int lp_utfr (lua_State *L) { + lua_Unsigned from = (lua_Unsigned)luaL_checkinteger(L, 1); + lua_Unsigned to = (lua_Unsigned)luaL_checkinteger(L, 2); + luaL_argcheck(L, from <= to, 2, "empty range"); + if (to <= 0x7f) { /* ascii range? */ + TTree *tree = newcharset(L); /* code it as a regular charset */ + unsigned int f; + for (f = (int)from; f <= to; f++) + setchar(treebuffer(tree), f); + } + else { /* multi-byte utf-8 range */ + TTree *tree = newtree(L, 2); + tree->tag = TUTFR; + codeutftree(L, tree, from, 1); + sib1(tree)->tag = TXInfo; + codeutftree(L, sib1(tree), to, 2); + } + return 1; +} + + /* ** Look-behind predicate */ @@ -940,7 +990,7 @@ static int collectrules (lua_State *L, int arg, int *totalsize) { int size; /* accumulator for total size */ lua_newtable(L); /* create position table */ getfirstrule(L, arg, postab); - size = 2 + getsize(L, postab + 2); /* TGrammar + TRule + rule */ + size = 3 + getsize(L, postab + 2); /* TGrammar + TRule + TXInfo + rule */ lua_pushnil(L); /* prepare to traverse grammar table */ while (lua_next(L, arg) != 0) { if (lua_tonumber(L, -2) == 1 || @@ -954,11 +1004,11 @@ static int collectrules (lua_State *L, int arg, int *totalsize) { lua_pushvalue(L, -2); /* push key (to insert into position table) */ lua_pushinteger(L, size); lua_settable(L, postab); - size += 1 + getsize(L, -1); /* update size */ + size += 2 + getsize(L, -1); /* add 'TRule + TXInfo + rule' to size */ lua_pushvalue(L, -2); /* push key (for next lua_next) */ n++; } - *totalsize = size + 1; /* TTrue to finish list of rules */ + *totalsize = size + 1; /* space for 'TTrue' finishing list of rules */ return n; } @@ -970,11 +1020,13 @@ static void buildgrammar (lua_State *L, TTree *grammar, int frule, int n) { int ridx = frule + 2*i + 1; /* index of i-th rule */ int rulesize; TTree *rn = gettree(L, ridx, &rulesize); + TTree *pr = sib1(nd); /* points to rule's prerule */ nd->tag = TRule; nd->key = 0; /* will be fixed when rule is used */ - nd->cap = i; /* rule number */ - nd->u.ps = rulesize + 1; /* point to next rule */ - memcpy(sib1(nd), rn, rulesize * sizeof(TTree)); /* copy rule */ + pr->tag = TXInfo; + pr->u.n = i; /* rule number */ + nd->u.ps = rulesize + 2; /* point to next rule */ + memcpy(sib1(pr), rn, rulesize * sizeof(TTree)); /* copy rule */ mergektable(L, ridx, sib1(nd)); /* merge its ktable into new one */ nd = sib2(nd); /* move to next rule */ } @@ -1010,7 +1062,7 @@ static int checkloops (TTree *tree) { ** twice in 'passed', there is path from it back to itself without ** advancing the subject. */ -static int verifyerror (lua_State *L, int *passed, int npassed) { +static int verifyerror (lua_State *L, unsigned short *passed, int npassed) { int i, j; for (i = npassed - 1; i >= 0; i--) { /* search for a repetition */ for (j = i - 1; j >= 0; j--) { @@ -1035,12 +1087,13 @@ static int verifyerror (lua_State *L, int *passed, int npassed) { ** counts the elements in 'passed'. ** Assume ktable at the top of the stack. */ -static int verifyrule (lua_State *L, TTree *tree, int *passed, int npassed, - int nb) { +static int verifyrule (lua_State *L, TTree *tree, unsigned short *passed, + int npassed, int nb) { tailcall: switch (tree->tag) { case TChar: case TSet: case TAny: - case TFalse: case TThrow: /* labeled failure */ + case TFalse: case TUTFR: + case TThrow: /* labeled failure */ return nb; /* cannot pass from here */ case TTrue: case TBehind: /* look-behind cannot have calls */ @@ -1048,7 +1101,7 @@ static int verifyrule (lua_State *L, TTree *tree, int *passed, int npassed, case TNot: case TAnd: case TRep: /* return verifyrule(L, sib1(tree), passed, npassed, 1); */ tree = sib1(tree); nb = 1; goto tailcall; - case TCapture: case TRunTime: + case TCapture: case TRunTime: case TXInfo: /* return verifyrule(L, sib1(tree), passed, npassed, nb); */ tree = sib1(tree); goto tailcall; case TCall: @@ -1059,15 +1112,15 @@ static int verifyrule (lua_State *L, TTree *tree, int *passed, int npassed, return nb; /* else return verifyrule(L, sib2(tree), passed, npassed, nb); */ tree = sib2(tree); goto tailcall; - case TChoice: /* must check both children */ + case TChoice: /* must check both children */ nb = verifyrule(L, sib1(tree), passed, npassed, nb); /* return verifyrule(L, sib2(tree), passed, npassed, nb); */ tree = sib2(tree); goto tailcall; case TRule: - if (npassed >= MAXRULES) - return verifyerror(L, passed, npassed); + if (npassed >= MAXRULES) /* too many steps? */ + return verifyerror(L, passed, npassed); /* error */ else { - passed[npassed++] = tree->key; + passed[npassed++] = tree->key; /* add rule to path */ /* return verifyrule(L, sib1(tree), passed, npassed); */ tree = sib1(tree); goto tailcall; } @@ -1079,7 +1132,7 @@ static int verifyrule (lua_State *L, TTree *tree, int *passed, int npassed, static void verifygrammar (lua_State *L, TTree *grammar) { - int passed[MAXRULES]; + unsigned short passed[MAXRULES]; TTree *rule; /* check left-recursive rules */ for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) { @@ -1243,12 +1296,6 @@ static int lp_setmax (lua_State *L) { } -static int lp_version (lua_State *L) { - lua_pushstring(L, VERSION); - return 1; -} - - static int lp_type (lua_State *L) { if (testpattern(L, 1)) lua_pushliteral(L, "pattern"); @@ -1317,8 +1364,9 @@ static struct luaL_Reg pattreg[] = { {"P", lp_P}, {"S", lp_set}, {"R", lp_range}, + {"utfR", lp_utfr}, {"locale", lp_locale}, - {"version", lp_version}, + {"version", NULL}, {"setmaxstack", lp_setmax}, {"type", lp_type}, {"T", lp_throw}, /* labeled failure throw */ @@ -1347,6 +1395,8 @@ int luaopen_lpeglabel (lua_State *L) { /* labeled failure */ luaL_newlib(L, pattreg); lua_pushvalue(L, -1); lua_setfield(L, -3, "__index"); + lua_pushliteral(L, "LPegLabel " VERSION); /* labeled failure */ + lua_setfield(L, -2, "version"); return 1; } diff --git a/lptree.h b/lptree.h index 0cf160a..05e0680 100644 --- a/lptree.h +++ b/lptree.h @@ -18,6 +18,9 @@ typedef enum TTag { TAny, TTrue, TFalse, + TUTFR, /* range of UTF-8 codepoints; 'n' has initial codepoint; + 'cap' has length; 'key' has first byte; + extra info is similar for end codepoint */ TRep, /* 'sib1'* */ TSeq, /* 'sib1' 'sib2' */ TChoice, /* 'sib1' / 'sib2' */ @@ -26,8 +29,9 @@ typedef enum TTag { TCall, /* ktable[key] is rule's key; 'sib2' is rule being called */ TOpenCall, /* ktable[key] is rule's key */ TRule, /* ktable[key] is rule's key (but key == 0 for unused rules); - 'sib1' is rule's pattern; - 'sib2' is next rule; 'cap' is rule's sequential number */ + 'sib1' is rule's pattern pre-rule; 'sib2' is next rule; + extra info 'n' is rule's sequential number */ + TXInfo, /* extra info */ TGrammar, /* 'sib1' is initial (and first) rule */ TBehind, /* 'sib1' is pattern, 'n' is how much to go back */ TCapture, /* captures: 'cap' is kind of capture (enum 'CapKind'); @@ -36,6 +40,7 @@ typedef enum TTag { TRunTime, /* run-time capture: 'key' is Lua function; 'sib1' is capture body */ TThrow, /* labeled failure: ktable[key] is label's name */ + } TTag; @@ -50,8 +55,8 @@ typedef struct TTree { byte cap; /* kind of capture (if it is a capture) */ unsigned short key; /* key in ktable for Lua data (0 if no key) */ union { - int n; /* occasional counter */ int ps; /* occasional second child */ + int n; /* occasional counter */ } u; } TTree; diff --git a/lptypes.h b/lptypes.h index 3261428..bf9aed1 100644 --- a/lptypes.h +++ b/lptypes.h @@ -15,7 +15,7 @@ #include "lua.h" -#define VERSION "1.5.2" +#define VERSION "1.6.0" #define PATTERN_T "lpeg-pattern" @@ -37,6 +37,8 @@ #define luaL_setfuncs(L,f,n) luaL_register(L,NULL,f) #define luaL_newlib(L,f) luaL_register(L,"lpeg",f) +typedef size_t lua_Unsigned; + #endif @@ -51,9 +53,9 @@ #endif -/* maximum number of rules in a grammar (limited by 'unsigned char') */ +/* maximum number of rules in a grammar (limited by 'unsigned short') */ #if !defined(MAXRULES) -#define MAXRULES UCHAR_MAX +#define MAXRULES 1000 #endif diff --git a/lpvm.c b/lpvm.c index a791c44..b7ae631 100644 --- a/lpvm.c +++ b/lpvm.c @@ -18,15 +18,44 @@ /* initial size for call/backtrack stack */ #if !defined(INITBACK) -#define INITBACK MAXBACK +#define INITBACK MAXBACK #endif -#define getoffset(p) (((p) + 1)->offset) +#define getoffset(p) (((p) + 1)->offset) static const Instruction giveup = {{IGiveup, 0, 0}}; +/* +** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid. +*/ +static const char *utf8_decode (const char *o, int *val) { + static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFFu}; + const unsigned char *s = (const unsigned char *)o; + unsigned int c = s[0]; /* first byte */ + unsigned int res = 0; /* final result */ + if (c < 0x80) /* ascii? */ + res = c; + else { + int count = 0; /* to count number of continuation bytes */ + while (c & 0x40) { /* still have continuation bytes? */ + int cc = s[++count]; /* read next byte */ + if ((cc & 0xC0) != 0x80) /* not a continuation byte? */ + return NULL; /* invalid byte sequence */ + res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */ + c <<= 1; /* to test next bit */ + } + res |= (c & 0x7F) << (count * 5); /* add first byte */ + if (count > 3 || res > 0x10FFFFu || res <= limits[count]) + return NULL; /* invalid byte sequence */ + s += count; /* skip continuation bytes read */ + } + *val = res; + return (const char *)s + 1; /* +1 to include first byte */ +} + + /* ** {====================================================== ** Virtual Machine @@ -43,7 +72,7 @@ typedef struct Stack { } Stack; -#define getstackbase(L, ptop) ((Stack *)lua_touserdata(L, stackidx(ptop))) +#define getstackbase(L, ptop) ((Stack *)lua_touserdata(L, stackidx(ptop))) /* @@ -207,6 +236,20 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e, } continue; } + case IUTFR: { + int codepoint; + if (s >= e) + goto fail; + s = utf8_decode (s, &codepoint); + if (s && p[1].offset <= codepoint && codepoint <= utf_to(p)) + p += 2; + else { + *labelf = LFAIL; /* labeled failure */ + updatefarthest(*sfail, s); /*labeled failure */ + goto fail; + } + continue; + } case ITestAny: { if (s < e) p += 2; else p += getoffset(p); @@ -301,8 +344,7 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e, continue; } case ICommit: { - assert(stack > getstackbase(L, ptop)); - assert((stack - 1)->s != NULL); + assert(stack > getstackbase(L, ptop) && (stack - 1)->s != NULL); stack--; p += getoffset(p); continue; @@ -318,6 +360,8 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e, assert(stack > getstackbase(L, ptop) && (stack - 1)->s != NULL); s = (--stack)->s; insidepred = stack->labenv; /* labeled failure */ + if (ndyncap > 0) /* are there matchtime captures? */ + ndyncap -= removedyncap(L, capture, stack->caplevel, captop); captop = stack->caplevel; p += getoffset(p); continue; diff --git a/lpvm.h b/lpvm.h index 6633c4b..19f4108 100644 --- a/lpvm.h +++ b/lpvm.h @@ -17,17 +17,18 @@ typedef enum Opcode { ITestChar, /* if char != aux, jump to 'offset' */ ITestSet, /* if char not in buff, jump to 'offset' */ ISpan, /* read a span of chars in buff */ + IUTFR, /* if codepoint not in range [offset, utf_to], fail */ IBehind, /* walk back 'aux' characters (fail if not possible) */ IRet, /* return from a rule */ IEnd, /* end of pattern */ IChoice, /* stack a choice; next fail will jump to 'offset' */ - IPredChoice, /* labeld failure: stack a choice; changes label env next fail will jump to 'offset' */ + IPredChoice, /* labeld failure: stack a choice; changes label env next fail will jump to 'offset' */ /*labeled failure */ IJmp, /* jump to 'offset' */ ICall, /* call rule at 'offset' */ IOpenCall, /* call rule number 'key' (must be closed to a ICall) */ ICommit, /* pop choice and jump to 'offset' */ IPartialCommit, /* update top choice to current position and jump */ - IBackCommit, /* "fails" but jump to its own 'offset' */ + IBackCommit, /* backtrack like "fail" but jump to its own 'offset' */ IFailTwice, /* pop one choice and then fail */ IFail, /* go back to saved state on choice and jump to saved offset */ IGiveup, /* internal use */ @@ -35,8 +36,9 @@ typedef enum Opcode { IOpenCapture, /* start a capture */ ICloseCapture, ICloseRunTime, - IThrow, /* fails with a given label */ - IThrowRec, /* fails with a given label and call rule at 'offset' */ + IThrow, /* fails with a given label */ /*labeled failure */ + IThrowRec, /* fails with a given label and call rule at 'offset' */ /*labeled failure */ + IEmpty /* to fill empty slots left by optimizations */ } Opcode; @@ -52,10 +54,13 @@ typedef union Instruction { } Instruction; +/* extract 24-bit value from an instruction */ +#define utf_to(inst) (((inst)->i.key << 8) | (inst)->i.aux) + + void printpatt (Instruction *p, int n); const char *match (lua_State *L, const char *o, const char *s, const char *e, Instruction *op, Capture *capture, int ptop, short *labelf, const char **sfail); /* labeled failure */ - #endif diff --git a/test.lua b/test.lua index 2c05dd0..3989f30 100755 --- a/test.lua +++ b/test.lua @@ -48,8 +48,8 @@ end print"General tests for LPeg library" -assert(type(m.version()) == "string") -print("version " .. m.version()) +assert(type(m.version) == "string") +print(m.version) assert(m.type("alo") ~= "pattern") assert(m.type(io.input) ~= "pattern") assert(m.type(m.P"alo") == "pattern") @@ -70,7 +70,6 @@ assert(m.match(#m.P(true) * "a", "a") == 2) assert(m.match("a" * #m.P(false), "a") == nil) assert(m.match("a" * #m.P(true), "a") == 2) - -- tests for locale do assert(m.locale(m) == m) @@ -406,7 +405,7 @@ assert(p:match('abcx') == 5 and p:match('ayzx') == 5 and not p:match'abc') do - -- large dynamic Cc + print "testing large dynamic Cc" local lim = 2^16 - 1 local c = 0 local function seq (n) @@ -985,10 +984,10 @@ for i = 1, 10 do assert(p:match("aaaaaaaaaaa") == 11 - i + 1) end -print"+" --- tests for back references +print "testing back references" + checkerr("back reference 'x' not found", m.match, m.Cb('x'), '') checkerr("back reference 'b' not found", m.match, m.Cg(1, 'a') * m.Cb('b'), 'a') @@ -1032,6 +1031,17 @@ local function id (s, i, ...) return true, ... end +do -- run-time capture in an end predicate (should discard its value) + local x = 0 + function foo (s, i) + x = x + 1 + return true, x + end + + local p = #(m.Cmt("", foo) * "xx") * m.Cmt("", foo) + assert(p:match("xx") == 2) +end + assert(m.Cmt(m.Cs((m.Cmt(m.S'abc' / { a = 'x', c = 'y' }, id) + m.R'09'^1 / string.char + m.P(1))^0), id):match"acb98+68c" == "xyb\98+\68y") @@ -1171,9 +1181,85 @@ t = {p:match('abacc')} checkeq(t, {'a', 'aa', 20, 'a', 'aaa', 'aaa'}) +do print"testing large grammars" + local lim = 1000 -- number of rules + local t = {} + + for i = 3, lim do + t[i] = m.V(i - 1) -- each rule calls previous one + end + t[1] = m.V(lim) -- start on last rule + t[2] = m.C("alo") -- final rule + + local P = m.P(t) -- build grammar + assert(P:match("alo") == "alo") + + t[#t + 1] = m.P("x") -- one more rule... + checkerr("too many rules", m.P, t) +end + + +print "testing UTF-8 ranges" + +do -- a few typical UTF-8 ranges + local p = m.utfR(0x410, 0x44f)^1 / "cyr: %0" + + m.utfR(0x4e00, 0x9fff)^1 / "cjk: %0" + + m.utfR(0x1F600, 0x1F64F)^1 / "emot: %0" + + m.utfR(0, 0x7f)^1 / "ascii: %0" + + m.utfR(0, 0x10ffff) / "other: %0" + + p = m.Ct(p^0) * -m.P(1) + + local cyr = "ждюя" + local emot = "\240\159\152\128\240\159\153\128" -- 😀🙀 + local cjk = "专举乸" + local ascii = "alo" + local last = "\244\143\191\191" -- U+10FFFF + + local s = cyr .. "—" .. emot .. "—" .. cjk .. "—" .. ascii .. last + t = (p:match(s)) + + assert(t[1] == "cyr: " .. cyr and t[2] == "other: —" and + t[3] == "emot: " .. emot and t[4] == "other: —" and + t[5] == "cjk: " .. cjk and t[6] == "other: —" and + t[7] == "ascii: " .. ascii and t[8] == "other: " .. last and + t[9] == nil) +end + + +do -- valid and invalid code points + local p = m.utfR(0, 0x10ffff)^0 + assert(p:match("汉字\128") == #"汉字" + 1) + assert(p:match("\244\159\191") == 1) + assert(p:match("\244\159\191\191") == 1) + assert(p:match("\255") == 1) + + -- basic errors + checkerr("empty range", m.utfR, 1, 0) + checkerr("invalid code point", m.utfR, 1, 0x10ffff + 1) +end + + +do -- back references (fixed width) + -- match a byte after a CJK point + local p = m.B(m.utfR(0x4e00, 0x9fff)) * m.C(1) + p = m.P{ p + m.P(1) * m.V(1) } -- search for 'p' + assert(p:match("ab д 专X x") == "X") + + -- match a byte after a hebrew point + local p = m.B(m.utfR(0x5d0, 0x5ea)) * m.C(1) + p = m.P(#"ש") * p + assert(p:match("שX") == "X") + + checkerr("fixed length", m.B, m.utfR(0, 0x10ffff)) +end + + + ------------------------------------------------------------------- -- Tests for 're' module ------------------------------------------------------------------- +print"testing 're' module" local re = require "relabel" diff --git a/testlabel.lua b/testlabel.lua index d60bb54..c2f760d 100644 --- a/testlabel.lua +++ b/testlabel.lua @@ -115,7 +115,7 @@ p = m.P{ "S", S = m.T("bola"), bolada = m.P"a" -} +} r, l, poserr = p:match("abc") assert(r == nil and l == 'bola' and poserr == 1) @@ -134,7 +134,7 @@ p = m.P{ "S", S = m.T("bola"), bola = m.P"a" -} +} r, l, poserr = p:match("abc") assert(r == 2)