Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 26 additions & 5 deletions lib/unit/system.rb
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@ def load(input)

@unit.each {|name, unit| validate_unit(unit[:def]) }

# Newly registered symbols may introduce glyph characters the lexer must
# recognize; rebuild it lazily on next parse.
@tokenizer = nil

true
end

Expand All @@ -62,7 +66,7 @@ def validate_unit(units)

def parse_unit(expr)
stack, result, implicit_mul = [], [], false
expr.to_s.scan(TOKENIZER).each do |tok|
expr.to_s.scan(tokenizer).each do |tok|
if tok == '('
stack << '('
implicit_mul = false
Expand All @@ -79,7 +83,7 @@ def parse_unit(expr)
val = case tok
when REAL then [[:one, tok.to_f, 1]]
when DEC then [[:one, tok.to_i, 1]]
when SYMBOL then symbol_to_unit(tok)
else symbol_to_unit(tok)
end
stack << '*' if implicit_mul
implicit_mul = true
Expand All @@ -94,11 +98,28 @@ def parse_unit(expr)

# Whole-token pattern for a real literal: optional leading minus, then either
# digits containing a decimal point (with an optional exponent) or plain
# digits followed by a mandatory exponent.
REAL = /^-?(?:(?:\d*\.\d+|\d+\.\d*)(?:[eE][-+]?\d+)?|\d+[eE][-+]?\d+)$/
# Whole-token pattern for an integer literal with an optional leading minus.
DEC = /^-?\d+$/
# Whole-token pattern for a symbol: a letter, underscore, °, ' or " followed
# by any number of word characters or those same three glyphs.
SYMBOL = /^[a-zA-Z_°'"][\w°'"]*$/
# Maps every accepted operator spelling to a [canonical operator, integer]
# pair; '·' is an alias of '*' and '**' an alias of '^'.
# NOTE(review): the integer is presumably a precedence level — confirm
# against the code that consumes this table.
OPERATOR = { '/' => ['/', 1], '*' => ['*', 1], '·' => ['*', 1], '^' => ['^', 2], '**' => ['^', 2] }
# Regexp-quoted operator spellings, longest first so that '**' is tried
# before '*' when they are joined into an alternation.
OPERATOR_TOKENS = OPERATOR.keys.sort_by {|x| -x.size }. map {|x| Regexp.quote(x) }
# Sources of the value patterns with the leading '^' and trailing '$' removed
# so they can be embedded in a larger alternation.
VALUE_TOKENS = [REAL.source[1..-2], DEC.source[1..-2], SYMBOL.source[1..-2]]
# Alternation of operator tokens, value tokens and both parentheses.
TOKENIZER = Regexp.new((OPERATOR_TOKENS + VALUE_TOKENS + ['\\(', '\\)']).join('|'))
# Distinct characters that occur in any operator spelling, plus '(' and ')'.
OPERATOR_CHARS = (OPERATOR.keys.join + '()').chars.uniq.freeze

# Returns the regexp that splits a unit expression into tokens.
#
# The alternation is tried in this order: operator spellings (longest first),
# real literals, integer literals, symbols, then the two parentheses. The
# symbol alternative comes from #symbol_pattern, which is built from the
# symbols registered in this system, so glyph symbols such as Ω, ℃, µ or %
# are lexed as symbols rather than being skipped by the scan.
#
# The regexp is cached in @tokenizer. #load resets that variable to nil, so
# the next call here rebuilds the regexp with any newly registered symbols.
def tokenizer
  return @tokenizer if @tokenizer

  alternatives = OPERATOR_TOKENS + [
    REAL.source[1..-2], # drop the leading '^' and trailing '$' anchors
    DEC.source[1..-2],
    symbol_pattern,
    '\\(',
    '\\)'
  ]
  @tokenizer = Regexp.new(alternatives.join('|'))
end

# Builds the regexp source (a String) for the symbol alternative of the
# tokenizer: one or more characters drawn from ASCII letters, digits,
# underscore, and every additional character that occurs in a registered
# unit or factor symbol.
#
# Characters already covered by the ASCII baseline, whitespace, and operator
# or parenthesis characters are never added to the class.
def symbol_pattern
  baseline = 'A-Za-z0-9_'
  seen_chars = (unit_symbol.keys + factor_symbol.keys).join.chars.uniq

  extra_glyphs = seen_chars.select do |glyph|
    glyph !~ /[A-Za-z0-9_]/ && glyph !~ /\s/ && !OPERATOR_CHARS.include?(glyph)
  end

  # Escape each glyph so characters such as '-' or ']' stay literal inside
  # the character class.
  escaped = extra_glyphs.map { |glyph| Regexp.escape(glyph) }.join
  "[#{baseline}#{escaped}]+"
end

def lookup_symbol(symbol)
if unit_symbol[symbol]
Expand Down
32 changes: 32 additions & 0 deletions spec/system_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -95,4 +95,36 @@
end
end
end

describe "lexing every symbol in the default system" do
  # Regression guard for the lexer turning a registered glyph into a
  # dimensionless value (the historical Ω/% failure): each symbol the system
  # registers has to parse back to the unit it names. A dedicated system with
  # the documented default load-out keeps the generated examples
  # deterministic and independent of optional systems that other specs may
  # load onto Unit.default_system.
  fresh_system = Unit::System.new("default") do |sys|
    [:si, :binary, :degree, :time].each { |unit_set| sys.load(unit_set) }
  end

  fresh_system.unit_symbol.each do |symbol, unit_name|
    it "parses #{symbol.inspect} as #{unit_name}" do
      parsed_from_symbol = Unit(1, symbol, fresh_system).unit
      parsed_from_name = Unit(1, unit_name.to_s, fresh_system).unit

      expect(parsed_from_symbol).to eq(parsed_from_name)
    end
  end
end

describe "lexing a glyph symbol loaded at runtime" do
  # Because the lexer is built from the registered symbols, a unit added
  # after the system was constructed (for instance an application loading a
  # domain-specific unit) must be lexable with no change to the gem.
  it "parses a non-word glyph symbol once its unit is loaded" do
    percent_definition = { 'units' => { 'percent' => { 'sym' => '%', 'def' => '1 / 100' } } }

    system.load(:si)
    system.load(percent_definition)

    by_glyph = Unit(1, '%', system).unit
    by_name = Unit(1, 'percent', system).unit
    expect(by_glyph).to eq(by_name)

    half = Unit(Rational(1, 2), system)
    expect(Unit(50, '%', system)).to eq(half)
  end
end
end
Loading