Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 10 additions & 23 deletions lib/unit/system.rb
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,6 @@ def load(input)

@unit.each {|name, unit| validate_unit(unit[:def]) }

# Newly registered symbols may introduce glyph characters the lexer must
# recognize; rebuild it lazily on next parse.
@tokenizer = nil

true
end

Expand All @@ -66,7 +62,7 @@ def validate_unit(units)

def parse_unit(expr)
stack, result, implicit_mul = [], [], false
expr.to_s.scan(tokenizer).each do |tok|
expr.to_s.scan(TOKENIZER).each do |tok|
if tok == '('
stack << '('
implicit_mul = false
Expand Down Expand Up @@ -102,24 +98,15 @@ def parse_unit(expr)
# Regexp-escaped spellings of every operator key, ordered longest first.
OPERATOR_TOKENS = OPERATOR.keys.sort_by { |op| -op.size }.map { |op| Regexp.quote(op) }
# Each distinct character that occurs in any operator key, followed by the
# two parenthesis characters; frozen because it is shared lookup data.
OPERATOR_CHARS = [*OPERATOR.keys.join.chars, '(', ')'].uniq.freeze

# Lazily builds, then caches in @tokenizer, the lexer regexp for this system.
#
# The alternatives are tried in this order: the escaped operator tokens, the
# REAL and DEC number patterns (each regexp source with its first and last
# character stripped), the symbol character class from #symbol_pattern, and
# the two literal parentheses.
#
# Because the symbol class is derived from the symbols registered in this
# system, a unit whose symbol uses a non-ASCII glyph (Ω, ℃, µ, %) becomes
# lexable once the memo is rebuilt; #load resets @tokenizer to nil for
# exactly that purpose.
#
# Returns a Regexp.
def tokenizer
  @tokenizer ||= begin
    alternatives = OPERATOR_TOKENS + [
      REAL.source[1..-2],
      DEC.source[1..-2],
      symbol_pattern,
      '\\(',
      '\\)'
    ]
    Regexp.new(alternatives.join('|'))
  end
end

# Builds the regexp source (a String) for one symbol token: a run of ASCII
# letters, digits and underscores, extended with every other character that
# appears in a registered unit or factor symbol. Whitespace and operator
# characters are never added, even if a registered symbol contains them.
def symbol_pattern
  registered_chars = (unit_symbol.keys + factor_symbol.keys).join.chars.uniq
  extra_glyphs = registered_chars.reject do |ch|
    # Already covered by the ASCII baseline, or must stay a separator.
    ch =~ /[A-Za-z0-9_\s]/ || OPERATOR_CHARS.include?(ch)
  end
  escaped_glyphs = extra_glyphs.map { |ch| Regexp.escape(ch) }.join
  "[A-Za-z0-9_#{escaped_glyphs}]+"
end
# SYMBOL matches one or more consecutive characters that are neither
# whitespace nor one of OPERATOR_CHARS (operator characters and parentheses).
# That deliberately includes *unrecognized* glyphs (e.g. a stray Greek
# letter): they survive lexing and are then rejected as an "Undefined unit"
# during validation, the same way an unknown ASCII symbol is, rather than
# being silently dropped to a dimensionless value.
SYMBOL = Regexp.new("[^\\s#{Regexp.escape(OPERATOR_CHARS.join)}]+")
# TOKENIZER tries, in order: operator tokens, the REAL and DEC number
# patterns (each source minus its first and last character), SYMBOL, and
# the two literal parentheses.
TOKENIZER = Regexp.new(
  [*OPERATOR_TOKENS, REAL.source[1..-2], DEC.source[1..-2], SYMBOL.source, '\\(', '\\)'].join('|')
)

def lookup_symbol(symbol)
if unit_symbol[symbol]
Expand Down
45 changes: 45 additions & 0 deletions spec/system_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -127,4 +127,49 @@
expect(Unit(50, '%', system)).to eq(Unit(Rational(1, 2), system))
end
end

describe "rejecting symbols it cannot resolve" do
# The lexer matches any run of non-operator, non-whitespace characters as a
# symbol — including glyphs that no system has registered — so an unknown
# symbol reaches validation and fails loudly as "Undefined unit" instead of
# being silently dropped to a dimensionless value. This keeps unknown
# glyphs consistent with unknown ASCII symbols rather than special-casing
# them into silence.
#
# Every example below runs after loading only the :si definitions.
before { system.load(:si) }

# Baseline: an unknown plain-ASCII symbol is reported by name.
it "raises for an unknown ASCII symbol" do
expect { Unit(1, "foo", system) }.to raise_error(TypeError, "Undefined unit foo")
end

# A single glyph and a multi-glyph run must each be reported as one whole
# symbol, with the same error type and message format as the ASCII case.
it "raises for an unregistered glyph instead of silently dropping it" do
expect { Unit(1, "λ", system) }.to raise_error(TypeError, "Undefined unit λ")
expect { Unit(5, "αβγ", system) }.to raise_error(TypeError, "Undefined unit αβγ")
end

# Whitespace-separated symbols: the glyph is checked on either side of a
# known unit ("m").
it "raises when an unregistered glyph is implicitly multiplied with a known unit" do
expect { Unit(1, "m λ", system) }.to raise_error(TypeError, "Undefined unit λ")
expect { Unit(1, "λ m", system) }.to raise_error(TypeError, "Undefined unit λ")
end

it "names the unregistered glyph rather than a neighboring operator" do
# The glyph used to be dropped, leaving a dangling operator that raised a
# misleading "Unexpected token /"; now the real culprit is reported.
expect { Unit(1, "m/λ", system) }.to raise_error(TypeError, "Undefined unit λ")
expect { Unit(1, "λ^2", system) }.to raise_error(TypeError, "Undefined unit λ")
end
end

describe "parsing a unit whose system has not been loaded" do
# Covers unit strings whose defining system has not been loaded yet: the
# symbol is lexed but cannot be resolved, so it raises "Undefined unit"
# until the defining system is loaded, after which it resolves normally.
it "raises until the defining system is loaded, then resolves" do
system.load(:si)
# megaelectronvolt is defined in :scientific, which is not loaded yet.
expect { Unit(1, "MeV", system) }.to raise_error(TypeError, "Undefined unit MeV")

# Loading the defining system afterwards makes the same string parse, and
# the symbol form must agree with the spelled-out unit name.
system.load(:scientific)
expect(Unit(1, "MeV", system).unit).to eq(Unit(1, "megaelectronvolt", system).unit)
end
end
end
Loading