Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[POC] Add a new Strict parser #164

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions lib/monetize.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,14 @@
require 'monetize/errors'
require 'monetize/version'
require 'monetize/parser'
require 'monetize/strict_parser'
require 'monetize/collection'

module Monetize
# Class methods
class << self
attr_writer :parser_class

# @attr_accessor [true, false] assume_from_symbol Use this to enable the
# ability to assume the currency from a passed symbol
attr_accessor :assume_from_symbol
Expand All @@ -20,12 +23,15 @@ class << self
# to true to enforce the delimiters set in the currency all the time.
attr_accessor :enforce_currency_delimiters


# When this is set to true, the behavior for parsing thousands separators is changed to
# expect that e.g. €10.000 is EUR 10 000 and not EUR 10.000 - it's incredibly rare when parsing
# human text that we're dealing with fractions of cents.
attr_accessor :expect_whole_subunits

# Returns the class used to parse string input: the class assigned through
# the parser_class writer when one is set, otherwise Monetize::Parser.
def parser_class
return @parser_class if @parser_class

Monetize::Parser
end

def parse(input, currency = Money.default_currency, options = {})
parse! input, currency, options
rescue Error
Expand All @@ -36,7 +42,7 @@ def parse!(input, currency = Money.default_currency, options = {})
return input if input.is_a?(Money)
return from_numeric(input, currency) if input.is_a?(Numeric)

parser = Monetize::Parser.new(input, currency, options)
parser = parser_class.new(input, currency, options)
amount, currency = parser.parse

Money.from_amount(amount, currency)
Expand Down Expand Up @@ -79,3 +85,5 @@ def extract_cents(input, currency = Money.default_currency)
end
end
end

# Installs StrictParser as the parser class as soon as this file is loaded,
# so parser_class no longer falls back to Monetize::Parser unless it is
# reassigned afterwards.
Monetize.parser_class = Monetize::StrictParser
118 changes: 118 additions & 0 deletions lib/monetize/strict_parser.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
require 'bigdecimal'

require 'monetize/parser'
require 'monetize/tokenizer'

module Monetize
  # Parser that only accepts input whose sequence of token types (as produced
  # by Tokenizer#process) is listed in ALLOWED_FORMATS. Anything else raises
  # ParseError instead of being guessed at.
  class StrictParser
    # TODO: perform exhaustive match
    # TODO: error subclasses with detailed explanation
    # TODO: check if decimal mark is a thousands separator (1,000 USD)
    # TODO: switch to using allowed format as strings for added flexibility

    # Every token-type sequence the parser accepts, in input order.
    ALLOWED_FORMATS = [
      %i[amount],                           # 9.99
      %i[sign amount],                      # -9.99
      %i[symbol amount],                    # £9.99
      %i[sign symbol amount],               # -£9.99
      %i[symbol sign amount],               # £-9.99
      %i[symbol amount sign],               # £9.99-
      %i[amount symbol],                    # 9.99£
      %i[sign amount symbol],               # -9.99£
      %i[currency_iso amount],              # GBP 9.99
      %i[currency_iso sign amount],         # GBP -9.99
      %i[amount currency_iso],              # 9.99 GBP
      %i[sign amount currency_iso],         # -9.99 GBP
      %i[symbol amount currency_iso],       # £9.99 GBP
      %i[sign symbol amount currency_iso]   # -£9.99 GBP
    ].freeze

    # @param input [#to_s] text to parse
    # @param fallback_currency currency returned when the input names none
    #   (or names one only by symbol while assume_from_symbol is off)
    # @param options [Hash] forwarded to Tokenizer; :assume_from_symbol is
    #   read here and overrides Monetize.assume_from_symbol
    def initialize(input, fallback_currency = Money.default_currency, options = {})
      @input = input.to_s
      @options = options
      @fallback_currency = fallback_currency
    end

    # Tokenizes the input and converts it into an amount and a currency.
    #
    # @return [Array] two elements: the amount as a BigDecimal and the currency
    # @raise [ParseError] when the token sequence is not in ALLOWED_FORMATS or
    #   the delimiters inside the amount are contradictory
    def parse
      tokens = Tokenizer.new(input, options).process
      format = tokens.map(&:type)

      raise ParseError, "invalid input - #{format}" unless ALLOWED_FORMATS.include?(format)

      amount = tokens.find { |token| token.type == :amount }
      sign = tokens.find { |token| token.type == :sign }
      symbol = tokens.find { |token| token.type == :symbol }
      currency_iso = tokens.find { |token| token.type == :currency_iso }

      # An explicit ISO code wins over a symbol; a symbol is only trusted
      # when assume_from_symbol is enabled.
      currency =
        if currency_iso
          parse_currency_iso(currency_iso.match.to_s)
        elsif symbol && assume_from_symbol?
          parse_symbol(symbol.match.to_s)
        else
          fallback_currency
        end

      [parse_amount(currency, amount.match, sign&.match), currency]
    end

    private

    attr_reader :input, :fallback_currency, :options

    # Builds the signed, multiplied amount.
    #
    # @param currency currently unused; kept so the signature stays stable
    # @param amount [#[]] match exposing the :amount and :multiplier groups
    # @param sign [#to_s, nil] matched sign, nil when the input had none
    # @return [BigDecimal] exact decimal value (no binary float rounding)
    def parse_amount(currency, amount, sign)
      multiplier = amount[:multiplier]
      digits = amount[:amount].gsub(' ', '') # spaces can only be grouping

      num = BigDecimal(normalize_delimiters(digits))
      num = apply_multiplier(num, multiplier)
      apply_sign(num, sign.to_s)
    end

    # Rewrites a digit string so that it contains no grouping separators and
    # uses '.' as its decimal mark.
    def normalize_delimiters(digits)
      delimiters = digits.scan(/\D/)

      case delimiters.uniq.length
      when 0
        digits
      when 1
        normalize_single_delimiter(digits, delimiters)
      when 2
        # The decimal mark is the right-most delimiter and may occur only
        # once; whatever precedes it is the thousands separator.
        decimal_mark = delimiters.last
        raise ParseError, 'ambiguous delimiters used in amount' unless delimiters.count(decimal_mark) == 1

        digits.gsub(delimiters.first, '').gsub(decimal_mark, '.')
      else
        raise ParseError, 'invalid amount of delimiters used'
      end
    end

    # Handles amounts that use one kind of delimiter. A single occurrence is
    # read as the decimal mark; repeated occurrences can only be grouping
    # separators, so every group after the first must have three digits.
    def normalize_single_delimiter(digits, delimiters)
      mark = delimiters.first
      return digits.gsub(mark, '.') if delimiters.length == 1

      groups = digits.split(mark)
      unless groups.drop(1).all? { |group| group.length == 3 }
        raise ParseError, 'ambiguous delimiters used in amount'
      end

      groups.join
    end

    # Looks the symbol up in Monetize::Parser::CURRENCY_SYMBOLS.
    def parse_symbol(symbol)
      Money::Currency.wrap(Monetize::Parser::CURRENCY_SYMBOLS[symbol])
    end

    def parse_currency_iso(currency_iso)
      Money::Currency.wrap(currency_iso)
    end

    # Per-call option wins; otherwise the global Monetize setting applies.
    def assume_from_symbol?
      options.fetch(:assume_from_symbol) { Monetize.assume_from_symbol }
    end

    # Scales num by the power of ten that
    # Monetize::Parser::MULTIPLIER_SUFFIXES maps the suffix to.
    def apply_multiplier(num, multiplier)
      return num unless multiplier

      exponent = Monetize::Parser::MULTIPLIER_SUFFIXES[multiplier.to_s.upcase]
      num * 10**exponent
    end

    def apply_sign(num, sign)
      sign == '-' ? num * -1 : num
    end
  end
end
71 changes: 71 additions & 0 deletions lib/monetize/tokenizer.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
require 'monetize/parser'

module Monetize
  # Splits a money string into typed tokens (:currency_iso, :symbol, :sign,
  # :amount), each carrying the MatchData that produced it.
  class Tokenizer
    SYMBOLS = Monetize::Parser::CURRENCY_SYMBOLS.keys.map { |symbol| Regexp.escape(symbol) }.freeze
    THOUSAND_SEPARATORS = /[\.\ ,]/.freeze
    DECIMAL_MARKS = /[\.,]/.freeze
    MULTIPLIERS = Monetize::Parser::MULTIPLIER_SUFFIXES.keys.join('|').freeze

    # Alternation tries its branches left to right, so longer symbols are
    # listed first; otherwise a short symbol that is a prefix of a longer one
    # would win whenever it happened to come earlier in CURRENCY_SYMBOLS.
    SYMBOL_REGEXP = Regexp.new(SYMBOLS.sort_by { |symbol| -symbol.length }.join('|')).freeze
    CURRENCY_ISO_REGEXP = /(?<![A-Z])[A-Z]{3}(?![A-Z])/i.freeze
    SIGN_REGEXP = /[\-\+]/.freeze
    AMOUNT_REGEXP = %r{
      (?<amount>                          # amount group
        \d+                               # starts with at least one digit
        (?:#{THOUSAND_SEPARATORS}\d{3})*  # separated into groups of 3 digits by a thousands separator
        (?!\d)                            # not followed by a digit
        (?:#{DECIMAL_MARKS}\d+)?          # has decimal mark followed by decimal part
      )
      (?<multiplier>#{MULTIPLIERS})?      # optional multiplier
    }ix.freeze

    # type is one of the token kinds above, match is the MatchData.
    Token = Struct.new(:type, :match)

    # @param input [String] text to tokenize; it is never mutated
    # @param options [Hash] stored but not read by the tokenizer itself
    def initialize(input, options = {})
      @original_input = input
      @options = options
    end

    # Extracts every token from the input.
    #
    # Patterns run in a fixed order (ISO code, symbol, sign, amount) against a
    # working copy in which earlier matches are masked, so no character ends
    # up in two tokens.
    #
    # @return [Array<Token>] tokens ordered by their position in the input
    def process
      # matches are removed from the input string to avoid overlapping matches
      input = original_input.dup

      tokens = []
      tokens.concat(match(input, :currency_iso, CURRENCY_ISO_REGEXP))
      tokens.concat(match(input, :symbol, SYMBOL_REGEXP))
      tokens.concat(match(input, :sign, SIGN_REGEXP))
      tokens.concat(match(input, :amount, AMOUNT_REGEXP))

      tokens.sort_by { |token| token.match.offset(0).first }
    end

    private

    attr_reader :original_input, :options

    # Collects all matches of regexp as tokens of the given type and masks
    # them in input (mutating it).
    def match(input, type, regexp)
      tokens = []
      input.scan(regexp) { tokens << Token.new(type, Regexp.last_match) }

      # Replace the matches from the input with § to avoid over-matching.
      # The mask has the same length as the match so later offsets stay
      # valid; the exclusive range keeps an empty match from touching input.
      tokens.each do |token|
        first, last = token.match.offset(0)
        input[first...last] = '§' * (last - first)
      end

      tokens
    end

    # Debugging aid: prints the input to stdout with each token replaced by
    # a <type> placeholder. Tokens are processed right to left so that
    # earlier offsets are not shifted by the replacements.
    def preview(result)
      preview_input = original_input.dup
      result.reverse_each do |token|
        offset = token.match.offset(0)
        preview_input.slice!(offset.first, token.match.to_s.length)
        preview_input.insert(offset.first, "<#{token.type}>")
      end

      puts preview_input
    end
  end
end