diff --git a/lib/unit/system.rb b/lib/unit/system.rb index c3fbfbf..ccd57e7 100644 --- a/lib/unit/system.rb +++ b/lib/unit/system.rb @@ -47,10 +47,6 @@ def load(input) @unit.each {|name, unit| validate_unit(unit[:def]) } - # Newly registered symbols may introduce glyph characters the lexer must - # recognize; rebuild it lazily on next parse. - @tokenizer = nil - true end @@ -66,7 +62,7 @@ def validate_unit(units) def parse_unit(expr) stack, result, implicit_mul = [], [], false - expr.to_s.scan(tokenizer).each do |tok| + expr.to_s.scan(TOKENIZER).each do |tok| if tok == '(' stack << '(' implicit_mul = false @@ -102,24 +98,15 @@ def parse_unit(expr) OPERATOR_TOKENS = OPERATOR.keys.sort_by {|x| -x.size }. map {|x| Regexp.quote(x) } OPERATOR_CHARS = (OPERATOR.keys.join + '()').chars.uniq.freeze - # The tokenizer and symbol matcher are derived from the symbols actually - # registered in this system (plus an ASCII baseline), then memoized. - # #load clears the memo, so any unit defined later — including ones whose - # symbol uses a non-ASCII glyph (Ω, ℃, µ, %) — becomes lexable without - # touching the lexer. This replaces a frozen glyph char-class that silently - # dropped unrecognized glyphs to a dimensionless value. - def tokenizer - @tokenizer ||= Regexp.new( - (OPERATOR_TOKENS + [REAL.source[1..-2], DEC.source[1..-2], symbol_pattern, '\\(', '\\)']).join('|') - ) - end - - def symbol_pattern - glyphs = (unit_symbol.keys + factor_symbol.keys).join.chars.uniq.reject do |char| - char =~ /[A-Za-z0-9_]/ || char =~ /\s/ || OPERATOR_CHARS.include?(char) - end - "[A-Za-z0-9_#{glyphs.map { |char| Regexp.escape(char) }.join}]+" - end + # A symbol is any run of characters that are not whitespace, parentheses, or + # operator characters. Crucially this also matches *unrecognized* glyphs + # (e.g. a stray Greek letter): they survive lexing and then fail loudly as + # an "Undefined unit" during validation, exactly like an unknown ASCII + # symbol does, instead of being silently dropped to a dimensionless value. + SYMBOL = Regexp.new("[^\\s#{OPERATOR_CHARS.map { |char| Regexp.escape(char) }.join}]+") + TOKENIZER = Regexp.new( + (OPERATOR_TOKENS + [REAL.source[1..-2], DEC.source[1..-2], SYMBOL.source, '\\(', '\\)']).join('|') + ) def lookup_symbol(symbol) if unit_symbol[symbol] diff --git a/spec/system_spec.rb b/spec/system_spec.rb index 4b24eea..c5bcd2b 100644 --- a/spec/system_spec.rb +++ b/spec/system_spec.rb @@ -127,4 +127,49 @@ expect(Unit(50, '%', system)).to eq(Unit(Rational(1, 2), system)) end end + + describe "rejecting symbols it cannot resolve" do + # The lexer matches any run of non-operator, non-whitespace characters as a + # symbol — including glyphs that no system has registered — so an unknown + # symbol reaches validation and fails loudly as "Undefined unit" instead of + # being silently dropped to a dimensionless value. This keeps unknown + # glyphs consistent with unknown ASCII symbols rather than special-casing + # them into silence. + before { system.load(:si) } + + it "raises for an unknown ASCII symbol" do + expect { Unit(1, "foo", system) }.to raise_error(TypeError, "Undefined unit foo") + end + + it "raises for an unregistered glyph instead of silently dropping it" do + expect { Unit(1, "λ", system) }.to raise_error(TypeError, "Undefined unit λ") + expect { Unit(5, "αβγ", system) }.to raise_error(TypeError, "Undefined unit αβγ") + end + + it "raises when an unregistered glyph is implicitly multiplied with a known unit" do + expect { Unit(1, "m λ", system) }.to raise_error(TypeError, "Undefined unit λ") + expect { Unit(1, "λ m", system) }.to raise_error(TypeError, "Undefined unit λ") + end + + it "names the unregistered glyph rather than a neighboring operator" do + # The glyph used to be dropped, leaving a dangling operator that raised a + # misleading "Unexpected token /"; now the real culprit is reported. + expect { Unit(1, "m/λ", system) }.to raise_error(TypeError, "Undefined unit λ") + expect { Unit(1, "λ^2", system) }.to raise_error(TypeError, "Undefined unit λ") + end + end + + describe "parsing a unit whose system has not been loaded" do + # Directly answers "what happens with unit strings that aren't loaded": the + # symbol is lexed but unresolved, so it fails loudly until the defining + # system is loaded, after which it resolves normally. + it "raises until the defining system is loaded, then resolves" do + system.load(:si) + # megaelectronvolt is defined in :scientific, which is not loaded yet. + expect { Unit(1, "MeV", system) }.to raise_error(TypeError, "Undefined unit MeV") + + system.load(:scientific) + expect(Unit(1, "MeV", system).unit).to eq(Unit(1, "megaelectronvolt", system).unit) + end + end end