diff --git a/lib/unit/system.rb b/lib/unit/system.rb index 545ef84..c3fbfbf 100644 --- a/lib/unit/system.rb +++ b/lib/unit/system.rb @@ -47,6 +47,10 @@ def load(input) @unit.each {|name, unit| validate_unit(unit[:def]) } + # Newly registered symbols may introduce glyph characters the lexer must + # recognize; rebuild it lazily on next parse. + @tokenizer = nil + true end @@ -62,7 +66,7 @@ def validate_unit(units) def parse_unit(expr) stack, result, implicit_mul = [], [], false - expr.to_s.scan(TOKENIZER).each do |tok| + expr.to_s.scan(tokenizer).each do |tok| if tok == '(' stack << '(' implicit_mul = false @@ -79,7 +83,7 @@ def parse_unit(expr) val = case tok when REAL then [[:one, tok.to_f, 1]] when DEC then [[:one, tok.to_i, 1]] - when SYMBOL then symbol_to_unit(tok) + else symbol_to_unit(tok) end stack << '*' if implicit_mul implicit_mul = true @@ -94,11 +98,28 @@ def parse_unit(expr) REAL = /^-?(?:(?:\d*\.\d+|\d+\.\d*)(?:[eE][-+]?\d+)?|\d+[eE][-+]?\d+)$/ DEC = /^-?\d+$/ - SYMBOL = /^[a-zA-Z_°'"][\w°'"]*$/ OPERATOR = { '/' => ['/', 1], '*' => ['*', 1], '·' => ['*', 1], '^' => ['^', 2], '**' => ['^', 2] } OPERATOR_TOKENS = OPERATOR.keys.sort_by {|x| -x.size }. map {|x| Regexp.quote(x) } - VALUE_TOKENS = [REAL.source[1..-2], DEC.source[1..-2], SYMBOL.source[1..-2]] - TOKENIZER = Regexp.new((OPERATOR_TOKENS + VALUE_TOKENS + ['\\(', '\\)']).join('|')) + OPERATOR_CHARS = (OPERATOR.keys.join + '()').chars.uniq.freeze + + # The tokenizer and symbol matcher are derived from the symbols actually + # registered in this system (plus an ASCII baseline), then memoized. + # #load clears the memo, so any unit defined later — including ones whose + # symbol uses a non-word glyph (Ω, ℃, µ, %) — becomes lexable without + # touching the lexer. This replaces a frozen glyph char-class that silently + # dropped unrecognized glyphs to a dimensionless value. 
+ def tokenizer + @tokenizer ||= Regexp.new( + (OPERATOR_TOKENS + [REAL.source[1..-2], DEC.source[1..-2], symbol_pattern, '\\(', '\\)']).join('|') + ) + end + + def symbol_pattern + glyphs = (unit_symbol.keys + factor_symbol.keys).join.chars.uniq.reject do |char| + char =~ /[A-Za-z0-9_]/ || char =~ /\s/ || OPERATOR_CHARS.include?(char) + end + "[A-Za-z0-9_#{glyphs.map { |char| Regexp.escape(char) }.join}]+" + end def lookup_symbol(symbol) if unit_symbol[symbol] diff --git a/spec/system_spec.rb b/spec/system_spec.rb index 8b87463..4b24eea 100644 --- a/spec/system_spec.rb +++ b/spec/system_spec.rb @@ -95,4 +95,36 @@ end end end + + describe "lexing every symbol in the default system" do + # Guards against the lexer silently dropping a registered glyph to a + # dimensionless value (the historical Ω/% failure): every unit symbol the + # system knows must parse back to that same unit. Built on a fresh system + # with the documented default load-out so the set is deterministic and not + # polluted by other specs loading optional systems onto Unit.default_system. + default = Unit::System.new("default") do |sys| + sys.load(:si) + sys.load(:binary) + sys.load(:degree) + sys.load(:time) + end + default.unit_symbol.each do |sym, name| + it "parses #{sym.inspect} as #{name}" do + expect(Unit(1, sym, default).unit).to eq(Unit(1, name.to_s, default).unit) + end + end + end + + describe "lexing a glyph symbol loaded at runtime" do + # The lexer is derived from the registered symbols, so a unit defined after + # the system was built (e.g. an app loading a domain unit) becomes lexable + # without any change to the gem. + it "parses a non-word glyph symbol once its unit is loaded" do + system.load(:si) + system.load('units' => { 'percent' => { 'sym' => '%', 'def' => '1 / 100' } }) + + expect(Unit(1, '%', system).unit).to eq(Unit(1, 'percent', system).unit) + expect(Unit(50, '%', system)).to eq(Unit(Rational(1, 2), system)) + end + end end