Skip to content

Commit 67bfa0e

Browse files
committed
Support names with multiple parts in loose parser (#10)
This allows the loose parser to include parts of the name that appear before and after other elements such as a mark, an amount or children. For example, in "cheese (MILK) with 2.3% fat" the whole ingredient name is now included. In addition, the amount is now recognized in inputs like "foo* 50%".
1 parent 2f4eb9d commit 67bfa0e

File tree

5 files changed

+67
-36
lines changed

5 files changed

+67
-36
lines changed

lib/food_ingredient_parser/loose/node.rb

+11-4
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ module FoodIngredientParser::Loose
55
class Node
66
include ToHtml
77

8-
attr_accessor :name, :mark, :amount, :contains, :notes
8+
attr_accessor :name_parts, :mark, :amount, :contains, :notes
99
attr_reader :input, :interval, :auto_close
1010

1111
def initialize(input, interval, auto_close: false)
@@ -14,7 +14,8 @@ def initialize(input, interval, auto_close: false)
1414
@auto_close = auto_close
1515
@contains = []
1616
@notes = []
17-
@name = @mark = @amount = nil
17+
@name_parts = []
18+
@mark = @amount = nil
1819
end
1920

2021
def ends(index)
@@ -31,14 +32,20 @@ def text_value
3132

3233
def to_h
3334
r = {}
34-
r[:name] = name.text_value.strip if name && name.text_value.strip != ''
35+
_name = name
36+
r[:name] = _name if _name
3537
r[:marks] = [mark.text_value.strip] if mark
3638
r[:amount] = amount.text_value.strip if amount
3739
r[:contains] = contains.map(&:to_h).reject {|c| c == {} } if contains.any?
3840
r[:notes] = notes.map{|n| n.text_value.strip }.reject {|c| c == '' } if notes.any?
3941
r
4042
end
4143

44+
def name
45+
strings = name_parts.map {|n| n.text_value.strip }.reject {|n| n == nil || n == '' }
46+
return strings.any? ? strings.join(" ") : nil
47+
end
48+
4249
def inspect(indent="", variant="")
4350
inspect_self(indent, variant) +
4451
inspect_children(indent)
@@ -47,7 +54,7 @@ def inspect(indent="", variant="")
4754
def inspect_self(indent="", variant="")
4855
[
4956
indent + "Node#{variant} interval=#{@interval}",
50-
name ? "name=#{name.text_value.strip.inspect}" : nil,
57+
name ? "name=#{name.inspect}" : nil,
5158
mark ? "mark=#{mark.text_value.strip.inspect}" : nil,
5259
amount ? "amount=#{amount.text_value.strip.inspect}" : nil,
5360
auto_close ? "auto_close" : nil

lib/food_ingredient_parser/loose/scanner.rb

+20-11
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,9 @@ class Scanner
3333

3434
def initialize(s, index: 0)
3535
@s = s # input string
36-
@i = index # current index in string
36+
@i = index # current index in string, the iterator looks at this character
3737
@cur = nil # current node we're populating
38+
@curifree = nil # last index in string for current node that we haven't added to a child node yet
3839
@ancestors = [Node.new(@s, @i)] # nesting hierarchy
3940
@iterator = :beginning # scan_iteration_<iterator> to use for parsing
4041
@dest = :contains # append current node to this attribute on parent
@@ -79,6 +80,7 @@ def scan_iteration_standard
7980
# after bracket check for 'and' to not lose text
8081
if is_and_sep?(@i+1)
8182
@i += and_sep_len(@i+1)
83+
@curifree = @i # don't include 'and' in cur name
8284
add_child
8385
end
8486
elsif is_notes_start? # usually a dot marks the start of notes
@@ -147,7 +149,11 @@ def parent
147149
end
148150

149151
def cur
150-
@cur ||= Node.new(@s, @i)
152+
if !@cur
153+
@cur ||= Node.new(@s, @i)
154+
@curifree = @i
155+
end
156+
@cur
151157
end
152158

153159
def is_sep?(chars: SEP_CHARS)
@@ -201,16 +207,19 @@ def add_child
201207
cur.ends(@i-1)
202208
parent.send(@dest) << cur
203209
@cur = nil
210+
@curifree = nil
204211
end
205212

206213
def open_parent(**options)
207214
name_until_here
208215
@ancestors << cur
209216
@cur = Node.new(@s, @i + 1, **options)
217+
@curifree = @i + 1
210218
end
211219

212220
def close_parent
213221
return unless @ancestors.count > 1
222+
@curifree = @i + 1
214223
@cur = @ancestors.pop
215224
while @cur.auto_close
216225
add_child
@@ -227,15 +236,15 @@ def close_all_ancestors
227236
end
228237

229238
def name_until_here
230-
cur.name ||= begin
231-
i, j = cur.interval.first, @i - 1
232-
i += mark_len(i) # skip any mark in front
233-
# Set name if there is any. There is one corner-case that needs to be avoided when
234-
# a nesting was opened without a name, which would set the name to the nesting text.
235-
# In this case, the name starts with an open-nesting symbol, which should never happen.
236-
if j >= i && !"([:".include?(@s[i])
237-
Node.new(@s, i .. j)
238-
end
239+
return unless @curifree # no cur started yet
240+
i, j = @curifree, @i - 1
241+
i += mark_len(i) # skip any mark in front
242+
# Set name if there is any. There is one corner-case that needs to be avoided when
243+
# a nesting was opened without a name, which would set the name to the nesting text.
244+
# In this case, the name starts with an open-nesting symbol, which should never happen.
245+
if j >= i && !"([:".include?(@s[i])
246+
cur.name_parts << Node.new(@s, i .. j)
247+
@curifree = @i
239248
end
240249
end
241250

lib/food_ingredient_parser/loose/transform/amount.rb

+17-9
Original file line numberDiff line numberDiff line change
@@ -29,18 +29,26 @@ def transform!
2929

3030
# Extract amount from name, if any.
3131
def transform_name(node = @node)
32-
if !node.amount && parsed = parse_amount(node.name&.text_value)
33-
offset = node.name.interval.first
32+
if !node.amount
33+
node.name_parts.each_with_index do |name, i|
34+
parsed = parse_amount(name.text_value)
35+
next unless parsed
36+
offset = name.interval.first
3437

35-
amount = parsed.amount.amount
36-
node.amount = Node.new(node.input, offset + amount.interval.first .. offset + amount.interval.last - 1)
38+
amount = parsed.amount.amount
39+
node.amount = Node.new(node.input, offset + amount.interval.first .. offset + amount.interval.last - 1)
3740

38-
name = parsed.respond_to?(:name) && parsed.name
39-
if name && name.interval.count > 0
40-
node.name = Node.new(node.input, offset + name.interval.first .. offset + name.interval.last - 1)
41-
else
42-
node.name = nil
41+
name = parsed.respond_to?(:name) && parsed.name
42+
node.name_parts[i] = if name && name.interval.count > 0
43+
Node.new(node.input, offset + name.interval.first .. offset + name.interval.last - 1)
44+
else
45+
nil
46+
end
47+
# found an amount, stop looking in other parts
48+
break
4349
end
50+
# remove cleared name parts
51+
node.name_parts.reject!(&:nil?)
4452
end
4553

4654
# recursively transform contained nodes

lib/food_ingredient_parser/loose/transform/handle_missing_name.rb

+2-1
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,8 @@ def transform_children!(node)
4242
# Apply recursively. Do it before processing to handle multiple depth levels of missing names.
4343
transform_children!(child) if child.contains.any?
4444

45-
if child.name.nil? || child.name.text_value.strip == ''
45+
name = child.name
46+
if name.nil? || name == ''
4647
# Name is empty, we need to do something.
4748
if prev
4849
# there is a previous ingredient: move children to new parent

lib/food_ingredient_parser/loose/transform/split_e_numbers.rb

+17-11
Original file line numberDiff line numberDiff line change
@@ -29,21 +29,27 @@ def transform!
2929
def transform_node!(node)
3030
if node.contains.any?
3131
node.contains.each {|n| transform_node!(n) }
32-
elsif node.name && m = MATCH_RE.match(node.name.text_value)
33-
i = 0
34-
while m = node.name.text_value.match(SPLIT_RE, i)
35-
node.contains << new_node(node, i, m.begin(0)-1)
36-
i = m.end(0)
32+
else
33+
node.name_parts.each_with_index do |name, name_index|
34+
if m = MATCH_RE.match(name.text_value)
35+
i = 0
36+
while m = name.text_value.match(SPLIT_RE, i)
37+
node.contains << new_node(name, i, m.begin(0)-1)
38+
i = m.end(0)
39+
end
40+
node.contains << new_node(name, i, name.interval.last) if i <= name.interval.last
41+
node.name_parts[name_index] = nil
42+
end
3743
end
38-
node.contains << new_node(node, i, node.name.interval.last) if i <= node.name.interval.last
39-
node.name = nil
44+
# remove cleared name parts
45+
node.name_parts.reject!(&:nil?)
4046
end
4147
end
4248

43-
def new_node(node, begins, ends)
44-
offset = node.name.interval.first
45-
new_node = Node.new(node.input, offset + begins .. offset + ends)
46-
new_node.name = Node.new(node.input, new_node.interval)
49+
def new_node(name, begins, ends)
50+
offset = name.interval.first
51+
new_node = Node.new(name.input, offset + begins .. offset + ends)
52+
new_node.name_parts = [Node.new(name.input, new_node.interval)]
4753
new_node
4854
end
4955
end

0 commit comments

Comments
 (0)