Skip to content

Commit

Permalink
Add support for unbounded ranges (#91)
Browse files Browse the repository at this point in the history
* Add support for unbounded ranges

This commit adds support for open ranges, i.e. inequality operators in
front of a term. In tree form, the < is named To, and > is named From.

Additionally, this commit also adds a TreeTransformer to convert these
open ranges to more traditional Range objects.

To properly support escaping, this commit also adjusts how escaping
sequences work. After careful evaluation of how Apache Lucene handles
escape sequences, it appears that arbitrary characters can be escaped, even
if they result in unknown escape sequences: the escaped character is
always yielded. This makes support for operations such as `<\=foo` a lot
less complicated.

This commit does NOT add support in the ElasticsearchQueryBuilder.

Authored-by: JSCU-CNI
Co-authored-by: Alex Garel <[email protected]>
  • Loading branch information
JSCU-CNI and alexgarel authored Mar 24, 2023
1 parent ae3f3a3 commit eb6ff98
Show file tree
Hide file tree
Showing 6 changed files with 440 additions and 37 deletions.
34 changes: 30 additions & 4 deletions luqum/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
from .tree import (
AndOperation, Boost, Fuzzy, Group, Not,
OrOperation, Phrase, Plus, Prohibit, Proximity,
Range, Regex, SearchField, UnknownOperation, Word,
create_operation, group_to_fieldgroup,
Range, To, From, Regex, SearchField, UnknownOperation,
Word, create_operation, group_to_fieldgroup,
)


Expand All @@ -40,7 +40,9 @@
'LPAREN',
'RPAREN',
'LBRACKET',
'RBRACKET'] +
'RBRACKET',
'LESSTHAN',
'GREATERTHAN'] +
# we sort to have a deterministic order, so that the grammar signature does not change
sorted(list(reserved.values())))

Expand Down Expand Up @@ -79,7 +81,7 @@
TERM_RE = r'''
(?P<term> # group term
(?:
[^\s:^~(){{}}[\]/"'+\-\\] # first char is not a space neither some char which have meanings
[^\s:^~(){{}}[\]/"'+\-\\<>] # first char is not a space neither some char which have meanings
# note: escape of "-" and "]"
# and doubling of "{{}}" (because we use format)
| # but
Expand Down Expand Up @@ -191,6 +193,16 @@ def t_RBRACKET(t):
return simple_token(t)


# Lexer rule for the GREATERTHAN token.
# NOTE: the raw string below is not documentation -- PLY reads a token
# function's docstring as the token's regular expression, so it must stay the
# first statement and must not be reworded. It matches ">" optionally
# followed by "=" (i.e. ">" or ">=").
def t_GREATERTHAN(t):
    r'>=?'
    # Delegate to the shared helper used by the other single-symbol tokens.
    return simple_token(t)


# Lexer rule for the LESSTHAN token.
# NOTE: the raw string below is not documentation -- PLY reads a token
# function's docstring as the token's regular expression, so it must stay the
# first statement and must not be reworded. It matches "<" optionally
# followed by "=" (i.e. "<" or "<=").
def t_LESSTHAN(t):
    r'<=?'
    # Delegate to the shared helper used by the other single-symbol tokens.
    return simple_token(t)


@lex.TOKEN(PHRASE_RE)
def t_PHRASE(t):
orig_value = t.value
Expand Down Expand Up @@ -291,6 +303,20 @@ def p_range(p):
head_tail.range(p)


# Grammar action for an open range with only an upper bound ("<x" / "<=x").
# The docstring is the PLY grammar rule and must stay the first statement.
def p_lessthan(p):
    '''unary_expression : LESSTHAN phrase_or_term'''
    operator = p[1]
    # The LESSTHAN token is "<" or "<="; a "=" means the bound is included.
    inclusive = '=' in operator.value
    bound = p[2]
    p[0] = To(bound, inclusive)
    head_tail.unary(p)


# Grammar action for an open range with only a lower bound (">x" / ">=x").
# The docstring is the PLY grammar rule and must stay the first statement.
def p_greaterthan(p):
    '''unary_expression : GREATERTHAN phrase_or_term'''
    operator = p[1]
    # The GREATERTHAN token is ">" or ">="; a "=" means the bound is included.
    inclusive = '=' in operator.value
    bound = p[2]
    p[0] = From(bound, inclusive)
    head_tail.unary(p)


def p_field_search(p):
'''unary_expression : TERM COLUMN unary_expression'''
if isinstance(p[3], Group):
Expand Down
46 changes: 24 additions & 22 deletions luqum/parsetab.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@

_lr_method = 'LALR'

_lr_signature = 'leftIMPLICIT_OPleftOR_OPleftAND_OPnonassocPLUSMINUSnonassocBOOSTnonassocTOAND_OP APPROX BOOST COLUMN LBRACKET LPAREN MINUS NOT OR_OP PHRASE PLUS RBRACKET REGEX RPAREN TERM TOexpression : expression OR_OP expressionexpression : expression AND_OP expressionexpression : expression expression %prec IMPLICIT_OPunary_expression : PLUS unary_expressionunary_expression : MINUS unary_expressionunary_expression : NOT unary_expressionexpression : unary_expressionunary_expression : LPAREN expression RPARENunary_expression : LBRACKET phrase_or_term TO phrase_or_term RBRACKETunary_expression : TERM COLUMN unary_expressionunary_expression : PHRASEunary_expression : PHRASE APPROXunary_expression : unary_expression BOOSTunary_expression : TERMunary_expression : TERM APPROXunary_expression : REGEXunary_expression : TOphrase_or_term : TERM\n | PHRASE'
_lr_signature = 'leftIMPLICIT_OPleftOR_OPleftAND_OPnonassocPLUSMINUSnonassocBOOSTnonassocTOAND_OP APPROX BOOST COLUMN GREATERTHAN LBRACKET LESSTHAN LPAREN MINUS NOT OR_OP PHRASE PLUS RBRACKET REGEX RPAREN TERM TOexpression : expression OR_OP expressionexpression : expression AND_OP expressionexpression : expression expression %prec IMPLICIT_OPunary_expression : PLUS unary_expressionunary_expression : MINUS unary_expressionunary_expression : NOT unary_expressionexpression : unary_expressionunary_expression : LPAREN expression RPARENunary_expression : LBRACKET phrase_or_term TO phrase_or_term RBRACKETunary_expression : LESSTHAN phrase_or_termunary_expression : GREATERTHAN phrase_or_termunary_expression : TERM COLUMN unary_expressionunary_expression : PHRASEunary_expression : PHRASE APPROXunary_expression : unary_expression BOOSTunary_expression : TERMunary_expression : TERM APPROXunary_expression : REGEXunary_expression : TOphrase_or_term : TERM\n | PHRASE'

_lr_action_items = {'PLUS':([0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,16,17,18,19,23,24,25,26,27,28,30,32,],[3,3,-7,3,3,3,3,-17,-14,-11,-16,3,3,3,-13,-4,-5,-6,3,3,-15,-12,3,3,-8,-10,-9,]),'MINUS':([0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,16,17,18,19,23,24,25,26,27,28,30,32,],[4,4,-7,4,4,4,4,-17,-14,-11,-16,4,4,4,-13,-4,-5,-6,4,4,-15,-12,4,4,-8,-10,-9,]),'NOT':([0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,16,17,18,19,23,24,25,26,27,28,30,32,],[5,5,-7,5,5,5,5,-17,-14,-11,-16,-3,5,5,-13,-4,-5,-6,5,5,-15,-12,-1,-2,-8,-10,-9,]),'LPAREN':([0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,16,17,18,19,23,24,25,26,27,28,30,32,],[6,6,-7,6,6,6,6,-17,-14,-11,-16,-3,6,6,-13,-4,-5,-6,6,6,-15,-12,-1,-2,-8,-10,-9,]),'LBRACKET':([0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,16,17,18,19,23,24,25,26,27,28,30,32,],[7,7,-7,7,7,7,7,-17,-14,-11,-16,-3,7,7,-13,-4,-5,-6,7,7,-15,-12,-1,-2,-8,-10,-9,]),'TERM':([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,23,24,25,26,27,28,29,30,32,],[9,9,-7,9,9,9,9,21,-17,-14,-11,-16,-3,9,9,-13,-4,-5,-6,9,9,-15,-12,-1,-2,-8,21,-10,-9,]),'PHRASE':([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,23,24,25,26,27,28,29,30,32,],[10,10,-7,10,10,10,10,22,-17,-14,-11,-16,-3,10,10,-13,-4,-5,-6,10,10,-15,-12,-1,-2,-8,22,-10,-9,]),'REGEX':([0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,16,17,18,19,23,24,25,26,27,28,30,32,],[11,11,-7,11,11,11,11,-17,-14,-11,-16,-3,11,11,-13,-4,-5,-6,11,11,-15,-12,-1,-2,-8,-10,-9,]),'TO':([0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,30,32,],[8,8,-7,8,8,8,8,-17,-14,-11,-16,8,8,8,-13,-4,-5,-6,8,29,-18,-19,8,-15,-12,8,8,-8,-10,-9,]),'$end':([1,2,8,9,10,11,12,15,16,17,18,24,25,26,27,28,30,32,],[0,-7,-17,-14,-11,-16,-3,-13,-4,-5,-6,-15,-12,-1,-2,-8,-10,-9,]),'OR_OP':([1,2,8,9,10,11,12,15,16,17,18,19,24,25,26,27,28,30,32,],[13,-7,-17,-14,-11,-16,13,-13,-4,-5,-6,13,-15,-12,-1,-2,-8,-10,-9,]),'AND_OP':([1,2,8,9,10,11,12,15,16,17,18,19,24,25,26,27,28,30,32,],[14,-7,-17,-14,-11,-16,14,-13,-4,-5,-6,14,-15,-12,14,-2,-8,-10,-9,]),'RPAREN':([2,8,9,10,11,12,15,16
,17,18,19,24,25,26,27,28,30,32,],[-7,-17,-14,-11,-16,-3,-13,-4,-5,-6,28,-15,-12,-1,-2,-8,-10,-9,]),'BOOST':([2,8,9,10,11,15,16,17,18,24,25,28,30,32,],[15,-17,-14,-11,-16,-13,15,15,15,-15,-12,-8,15,-9,]),'COLUMN':([9,],[23,]),'APPROX':([9,10,],[24,25,]),'RBRACKET':([21,22,31,],[-18,-19,32,]),}
_lr_action_items = {'PLUS':([0,1,2,3,4,5,6,8,11,12,13,14,15,16,17,18,19,20,21,23,24,25,26,27,28,29,30,31,32,34,36,],[3,3,-7,3,3,3,3,-19,-16,-13,-18,3,3,3,-15,-4,-5,-6,3,-20,-21,-10,-11,3,-17,-14,3,3,-8,-12,-9,]),'MINUS':([0,1,2,3,4,5,6,8,11,12,13,14,15,16,17,18,19,20,21,23,24,25,26,27,28,29,30,31,32,34,36,],[4,4,-7,4,4,4,4,-19,-16,-13,-18,4,4,4,-15,-4,-5,-6,4,-20,-21,-10,-11,4,-17,-14,4,4,-8,-12,-9,]),'NOT':([0,1,2,3,4,5,6,8,11,12,13,14,15,16,17,18,19,20,21,23,24,25,26,27,28,29,30,31,32,34,36,],[5,5,-7,5,5,5,5,-19,-16,-13,-18,-3,5,5,-15,-4,-5,-6,5,-20,-21,-10,-11,5,-17,-14,-1,-2,-8,-12,-9,]),'LPAREN':([0,1,2,3,4,5,6,8,11,12,13,14,15,16,17,18,19,20,21,23,24,25,26,27,28,29,30,31,32,34,36,],[6,6,-7,6,6,6,6,-19,-16,-13,-18,-3,6,6,-15,-4,-5,-6,6,-20,-21,-10,-11,6,-17,-14,-1,-2,-8,-12,-9,]),'LBRACKET':([0,1,2,3,4,5,6,8,11,12,13,14,15,16,17,18,19,20,21,23,24,25,26,27,28,29,30,31,32,34,36,],[7,7,-7,7,7,7,7,-19,-16,-13,-18,-3,7,7,-15,-4,-5,-6,7,-20,-21,-10,-11,7,-17,-14,-1,-2,-8,-12,-9,]),'LESSTHAN':([0,1,2,3,4,5,6,8,11,12,13,14,15,16,17,18,19,20,21,23,24,25,26,27,28,29,30,31,32,34,36,],[9,9,-7,9,9,9,9,-19,-16,-13,-18,-3,9,9,-15,-4,-5,-6,9,-20,-21,-10,-11,9,-17,-14,-1,-2,-8,-12,-9,]),'GREATERTHAN':([0,1,2,3,4,5,6,8,11,12,13,14,15,16,17,18,19,20,21,23,24,25,26,27,28,29,30,31,32,34,36,],[10,10,-7,10,10,10,10,-19,-16,-13,-18,-3,10,10,-15,-4,-5,-6,10,-20,-21,-10,-11,10,-17,-14,-1,-2,-8,-12,-9,]),'TERM':([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,23,24,25,26,27,28,29,30,31,32,33,34,36,],[11,11,-7,11,11,11,11,23,-19,23,23,-16,-13,-18,-3,11,11,-15,-4,-5,-6,11,-20,-21,-10,-11,11,-17,-14,-1,-2,-8,23,-12,-9,]),'PHRASE':([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,23,24,25,26,27,28,29,30,31,32,33,34,36,],[12,12,-7,12,12,12,12,24,-19,24,24,-16,-13,-18,-3,12,12,-15,-4,-5,-6,12,-20,-21,-10,-11,12,-17,-14,-1,-2,-8,24,-12,-9,]),'REGEX':([0,1,2,3,4,5,6,8,11,12,13,14,15,16,17,18,19,20,21,23,24,25,26,27,28,29,30,31,32,34,36,],[13,13,-7,13,13,13,13,-19,-16,-13,-18,-3,
13,13,-15,-4,-5,-6,13,-20,-21,-10,-11,13,-17,-14,-1,-2,-8,-12,-9,]),'TO':([0,1,2,3,4,5,6,8,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,34,36,],[8,8,-7,8,8,8,8,-19,-16,-13,-18,8,8,8,-15,-4,-5,-6,8,33,-20,-21,-10,-11,8,-17,-14,8,8,-8,-12,-9,]),'$end':([1,2,8,11,12,13,14,17,18,19,20,23,24,25,26,28,29,30,31,32,34,36,],[0,-7,-19,-16,-13,-18,-3,-15,-4,-5,-6,-20,-21,-10,-11,-17,-14,-1,-2,-8,-12,-9,]),'OR_OP':([1,2,8,11,12,13,14,17,18,19,20,21,23,24,25,26,28,29,30,31,32,34,36,],[15,-7,-19,-16,-13,-18,15,-15,-4,-5,-6,15,-20,-21,-10,-11,-17,-14,-1,-2,-8,-12,-9,]),'AND_OP':([1,2,8,11,12,13,14,17,18,19,20,21,23,24,25,26,28,29,30,31,32,34,36,],[16,-7,-19,-16,-13,-18,16,-15,-4,-5,-6,16,-20,-21,-10,-11,-17,-14,16,-2,-8,-12,-9,]),'RPAREN':([2,8,11,12,13,14,17,18,19,20,21,23,24,25,26,28,29,30,31,32,34,36,],[-7,-19,-16,-13,-18,-3,-15,-4,-5,-6,32,-20,-21,-10,-11,-17,-14,-1,-2,-8,-12,-9,]),'BOOST':([2,8,11,12,13,17,18,19,20,23,24,25,26,28,29,32,34,36,],[17,-19,-16,-13,-18,-15,17,17,17,-20,-21,-10,-11,-17,-14,-8,17,-9,]),'COLUMN':([11,],[27,]),'APPROX':([11,12,],[28,29,]),'RBRACKET':([23,24,35,],[-20,-21,36,]),}

_lr_action = {}
for _k, _v in _lr_action_items.items():
Expand All @@ -17,7 +17,7 @@
_lr_action[_x][_k] = _y
del _lr_action_items

_lr_goto_items = {'expression':([0,1,6,12,13,14,19,26,27,],[1,12,19,12,26,27,12,12,12,]),'unary_expression':([0,1,3,4,5,6,12,13,14,19,23,26,27,],[2,2,16,17,18,2,2,2,2,2,30,2,2,]),'phrase_or_term':([7,29,],[20,31,]),}
_lr_goto_items = {'expression':([0,1,6,14,15,16,21,30,31,],[1,14,21,14,30,31,14,14,14,]),'unary_expression':([0,1,3,4,5,6,14,15,16,21,27,30,31,],[2,2,18,19,20,2,2,2,2,2,34,2,2,]),'phrase_or_term':([7,9,10,33,],[22,25,26,35,]),}

_lr_goto = {}
for _k, _v in _lr_goto_items.items():
Expand All @@ -27,23 +27,25 @@
del _lr_goto_items
_lr_productions = [
("S' -> expression","S'",1,None,None,None),
('expression -> expression OR_OP expression','expression',3,'p_expression_or','parser.py',240),
('expression -> expression AND_OP expression','expression',3,'p_expression_and','parser.py',246),
('expression -> expression expression','expression',2,'p_expression_implicit','parser.py',252),
('unary_expression -> PLUS unary_expression','unary_expression',2,'p_expression_plus','parser.py',258),
('unary_expression -> MINUS unary_expression','unary_expression',2,'p_expression_minus','parser.py',264),
('unary_expression -> NOT unary_expression','unary_expression',2,'p_expression_not','parser.py',270),
('expression -> unary_expression','expression',1,'p_expression_unary','parser.py',276),
('unary_expression -> LPAREN expression RPAREN','unary_expression',3,'p_grouping','parser.py',281),
('unary_expression -> LBRACKET phrase_or_term TO phrase_or_term RBRACKET','unary_expression',5,'p_range','parser.py',287),
('unary_expression -> TERM COLUMN unary_expression','unary_expression',3,'p_field_search','parser.py',295),
('unary_expression -> PHRASE','unary_expression',1,'p_quoting','parser.py',304),
('unary_expression -> PHRASE APPROX','unary_expression',2,'p_proximity','parser.py',309),
('unary_expression -> unary_expression BOOST','unary_expression',2,'p_boosting','parser.py',315),
('unary_expression -> TERM','unary_expression',1,'p_terms','parser.py',321),
('unary_expression -> TERM APPROX','unary_expression',2,'p_fuzzy','parser.py',326),
('unary_expression -> REGEX','unary_expression',1,'p_regex','parser.py',332),
('unary_expression -> TO','unary_expression',1,'p_to_as_term','parser.py',338),
('phrase_or_term -> TERM','phrase_or_term',1,'p_phrase_or_term','parser.py',344),
('phrase_or_term -> PHRASE','phrase_or_term',1,'p_phrase_or_term','parser.py',345),
('expression -> expression OR_OP expression','expression',3,'p_expression_or','parser.py',252),
('expression -> expression AND_OP expression','expression',3,'p_expression_and','parser.py',258),
('expression -> expression expression','expression',2,'p_expression_implicit','parser.py',264),
('unary_expression -> PLUS unary_expression','unary_expression',2,'p_expression_plus','parser.py',270),
('unary_expression -> MINUS unary_expression','unary_expression',2,'p_expression_minus','parser.py',276),
('unary_expression -> NOT unary_expression','unary_expression',2,'p_expression_not','parser.py',282),
('expression -> unary_expression','expression',1,'p_expression_unary','parser.py',288),
('unary_expression -> LPAREN expression RPAREN','unary_expression',3,'p_grouping','parser.py',293),
('unary_expression -> LBRACKET phrase_or_term TO phrase_or_term RBRACKET','unary_expression',5,'p_range','parser.py',299),
('unary_expression -> LESSTHAN phrase_or_term','unary_expression',2,'p_lessthan','parser.py',307),
('unary_expression -> GREATERTHAN phrase_or_term','unary_expression',2,'p_greaterthan','parser.py',314),
('unary_expression -> TERM COLUMN unary_expression','unary_expression',3,'p_field_search','parser.py',321),
('unary_expression -> PHRASE','unary_expression',1,'p_quoting','parser.py',330),
('unary_expression -> PHRASE APPROX','unary_expression',2,'p_proximity','parser.py',335),
('unary_expression -> unary_expression BOOST','unary_expression',2,'p_boosting','parser.py',341),
('unary_expression -> TERM','unary_expression',1,'p_terms','parser.py',347),
('unary_expression -> TERM APPROX','unary_expression',2,'p_fuzzy','parser.py',352),
('unary_expression -> REGEX','unary_expression',1,'p_regex','parser.py',358),
('unary_expression -> TO','unary_expression',1,'p_to_as_term','parser.py',364),
('phrase_or_term -> TERM','phrase_or_term',1,'p_phrase_or_term','parser.py',370),
('phrase_or_term -> PHRASE','phrase_or_term',1,'p_phrase_or_term','parser.py',371),
]
43 changes: 38 additions & 5 deletions luqum/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,9 +237,10 @@ class Term(Item):
:param str value: the value
"""
WILDCARDS_PATTERN = re.compile(r"((?<=[^\\])[?*]|\\\\[?*]|^[?*])") # non escaped * and ?
# see
# Although the following URL lists [+\-&|!(){}[\]^"~*?:\\] as escaped characters, in
# practice, in Lucene, all escaped letters are interpreted as a literal, i.e. '\a' == 'a'
# https://lucene.apache.org/core/3_6_0/queryparsersyntax.html#Escaping%20Special%20Characters
WORD_ESCAPED_CHARS = re.compile(r'\\([+\-&|!(){}[\]^"~*?:\\])')
WORD_ESCAPED_CHARS = re.compile(r'\\(.)')

_equality_attrs = ['value']

Expand Down Expand Up @@ -490,17 +491,49 @@ def __str__(self, head_tail=False):
return self._head_tail(value, head_tail)


class Plus(Unary):
class UnaryOperator(Unary):
    """Common base class for the unary operators.

    It adds no behaviour of its own; it exists so that operator nodes can be
    told apart from other :class:`Unary` subclasses.
    """


class Plus(UnaryOperator):
    """Plus, unary operation.

    Its operator symbol (``op``) is ``+``.
    """
    op = "+"


class Not(Unary):
class Not(UnaryOperator):
    """Unary operator whose symbol (``op``) is the keyword ``NOT``."""
    op = 'NOT'


class Prohibit(Unary):
class Prohibit(UnaryOperator):
    """The negation.

    Its operator symbol (``op``) is ``-``.
    """
    op = "-"


class OpenRange(Unary):
    """A range bounded on one side only.

    Subclasses provide the comparison symbol through their ``op`` attribute.

    :param a: the provided bound value
    :param bool include: whether ``a`` itself is part of the range
    """

    # suffix appended to ``op`` depending on whether the bound is included
    _char = {True: '=', False: ''}
    _equality_attrs = ['include']

    def __init__(self, a, include=True, **kwargs):
        # ``include`` is stored before delegating, as in the original ordering
        self.include = include
        super().__init__(a, **kwargs)

    def __str__(self, head_tail=False):
        """Render the operator, the optional ``=`` and then the bound."""
        symbol = self.op
        equal_sign = self._char[self.include]
        bound = self.a.__str__(head_tail=True)
        rendered = "%s%s%s" % (symbol, equal_sign, bound)
        return self._head_tail(rendered, head_tail)


class From(OpenRange):
    """Open range with only a lower bound, written ``>a`` (or ``>=a`` when
    the bound is included).
    """
    op = ">"


class To(OpenRange):
    """Open range with only an upper bound, written ``<a`` (or ``<=a`` when
    the bound is included).
    """
    op = "<"
Loading

0 comments on commit eb6ff98

Please sign in to comment.