diff --git a/src/Data/Ruby/Marshal/Get.hs b/src/Data/Ruby/Marshal/Get.hs index fa1e9a2..332726e 100644 --- a/src/Data/Ruby/Marshal/Get.hs +++ b/src/Data/Ruby/Marshal/Get.hs @@ -54,25 +54,38 @@ getRubyObject = getMarshalVersion >> go where go :: Marshal RubyObject go = liftMarshal getWord8 >>= \case - NilChar -> return RNil - TrueChar -> return $ RBool True - FalseChar -> return $ RBool False - FixnumChar -> RFixnum <$> getFixnum - FloatChar -> RFloat <$> getFloat - StringChar -> RString <$> getString - SymbolChar -> RSymbol <$> getSymbol - ObjectLinkChar -> getObjectLink - SymlinkChar -> RSymbol <$> getSymlink - ArrayChar -> do + NilChar -> return RNil + TrueChar -> return $ RBool True + FalseChar -> return $ RBool False + FixnumChar -> RFixnum <$> getFixnum + FloatChar -> RFloat <$> getFloat + StringChar -> RString <$> getString + SymbolChar -> RSymbol <$> getSymbol + ObjectLinkChar -> getObjectLink + SymlinkChar -> RSymbol <$> getSymlink + ArrayChar -> do result <- RArray <$> getArray go writeCache result pure result - HashChar -> do + HashChar -> do result <- RHash <$> getHash go go writeCache result pure result - IVarChar -> RIVar <$> getIVar go - _ -> return Unsupported + HashDefChar -> getHashWithDefault go + IVarChar -> getIVar go + BignumChar -> getBignum + RegexpChar -> getRegexp + ObjectChar -> getObjectOrStruct RObject "Object" go + StructChar -> getObjectOrStruct RStruct "Struct" go + ClassChar -> getNamedRef RClass "Class" + ModuleChar -> getNamedRef RModule "Module" + OldModuleChar -> getNamedRef RModule "OldModule" + UserDefChar -> getUserDef go + UserMarshalChar -> getUserMarshalOrData RUserMarshal "UserMarshal" go + DataChar -> getUserMarshalOrData RData "Data" go + ExtendedChar -> getWrapper "Extended" go + UClassChar -> getWrapper "UClass" go + c -> fail $ "unknown marshal tag: " <> show c -------------------------------------------------------------------- -- Ancillary functions. @@ -122,29 +135,138 @@ getHash k v = marshalLabel "Hash" $ do V.replicateM n (liftM2 (,) k v) -- | Parses . -getIVar :: Marshal RubyObject -> Marshal (RubyObject, RubyStringEncoding) +-- +-- IVar wraps an arbitrary object together with a list of @(symbol, value)@ +-- instance-variable pairs. When the wrapped object is a string and the IVs +-- carry encoding info (@:E@ or @:encoding@), the result is an 'RIVar'. For +-- any other shape we still consume every byte but surface just the inner +-- object — the IV metadata is dropped, but the surrounding stream keeps +-- parsing correctly. +getIVar :: Marshal RubyObject -> Marshal RubyObject getIVar g = marshalLabel "IVar" $ do - str <- g + inner <- g len <- getFixnum - if | len /= 1 -> fail "expected single character" - | otherwise -> do - symbol <- g - denote <- g - case symbol of - RSymbol "E" -> - case denote of - RBool True -> return' (str, UTF_8) - RBool False -> return' (str, US_ASCII) - _ -> fail "expected bool" - RSymbol "encoding" -> - case denote of - RString enc -> return' (str, toEnc enc) - _ -> fail "expected string" - _ -> fail "invalid ivar" - where - return' result = do - writeCache $ RIVar result + ivars <- V.replicateM len (liftM2 (,) g g) + let maybeEnc = V.foldl' (\acc pair -> acc <|> extractEncoding pair) Nothing ivars + case (inner, maybeEnc) of + (RString _, Just enc) -> do + let result = RIVar (inner, enc) + writeCache result return result + _ -> return inner + where + extractEncoding :: (RubyObject, RubyObject) -> Maybe RubyStringEncoding + extractEncoding (RSymbol "E", RBool True) = Just UTF_8 + extractEncoding (RSymbol "E", RBool False) = Just US_ASCII + extractEncoding (RSymbol "encoding", RString enc) = Just (toEnc enc) + extractEncoding _ = Nothing + +-- | Parses . +-- +-- Wire format: one sign byte (@\'+\'@ or @\'-\'@), then a packed-int count of +-- 16-bit little-endian digits, then that many digits. +getBignum :: Marshal RubyObject +getBignum = marshalLabel "Bignum" $ do + sign <- liftMarshal getWord8 + n <- getFixnum + bytes <- liftMarshal $ getBytes (n * 2) + let magnitude = BS.foldr (\b acc -> acc * 256 + fromIntegral b) 0 bytes + value = if sign == 0x2D then negate magnitude else magnitude + result = RBignum value + writeCache result + return result + +-- | Parses . +-- +-- Wire format: a raw byte sequence for the pattern, then a single byte of +-- options flags. The pattern's source encoding is typically carried by a +-- surrounding 'RIVar' wrapper, which is parsed independently. +getRegexp :: Marshal RubyObject +getRegexp = marshalLabel "Regexp" $ do + pat <- getString + opts <- liftMarshal getWord8 + let result = RRegexp pat opts + writeCache result + return result + +-- | Parses Hash with a default value (@}@). Wire format matches a regular +-- Hash followed by one additional object (the default). +getHashWithDefault :: Marshal RubyObject -> Marshal RubyObject +getHashWithDefault g = marshalLabel "HashWithDefault" $ do + n <- getFixnum + pairs <- V.replicateM n (liftM2 (,) g g) + def <- g + let result = RHashWithDefault pairs def + writeCache result + return result + +-- | Parses Object (@o@) and Struct (@S@). Both share the wire shape: +-- class symbol, count of pairs, then count many @(symbol, value)@ pairs. +getObjectOrStruct + :: (BS.ByteString -> V.Vector (RubyObject, RubyObject) -> RubyObject) + -> String + -> Marshal RubyObject + -> Marshal RubyObject +getObjectOrStruct con name g = marshalLabel name $ do + classSym <- g + n <- getFixnum + pairs <- V.replicateM n (liftM2 (,) g g) + let result = case classSym of + RSymbol cls -> con cls pairs + _ -> Unsupported + writeCache result + return result + +-- | Parses a Class/Module name reference (@c@, @m@, @M@). Wire format is a +-- bare byte sequence — note that this is not a Symbol; the bytes are the +-- fully-qualified class or module name. +getNamedRef + :: (BS.ByteString -> RubyObject) + -> String + -> Marshal RubyObject +getNamedRef con name = marshalLabel name $ do + s <- getString + let result = con s + writeCache result + return result + +-- | Parses an object dumped via @_dump@ (@u@). Wire format: class symbol, +-- then a raw byte sequence carrying the user-defined payload. +getUserDef :: Marshal RubyObject -> Marshal RubyObject +getUserDef g = marshalLabel "UserDef" $ do + classSym <- g + payload <- getString + let result = case classSym of + RSymbol cls -> RUserDef cls payload + _ -> Unsupported + writeCache result + return result + +-- | Parses an object dumped via @marshal_dump@ (@U@) or @_dump_data@ (@d@). +-- Both share the wire shape: class symbol then one arbitrary Marshal object. +getUserMarshalOrData + :: (BS.ByteString -> RubyObject -> RubyObject) + -> String + -> Marshal RubyObject + -> Marshal RubyObject +getUserMarshalOrData con name g = marshalLabel name $ do + classSym <- g + payload <- g + let result = case classSym of + RSymbol cls -> con cls payload + _ -> Unsupported + writeCache result + return result + +-- | Parses a wrapper tag — @e@ (object extended with a module) or @C@ +-- (object whose class is a user subclass of a builtin). Both read a symbol +-- and then an object, and Ruby does not give the wrapper its own slot in +-- the object table — the inner object owns it. We discard the modifier +-- symbol and pass the inner through unchanged. +getWrapper :: String -> Marshal RubyObject -> Marshal RubyObject +getWrapper name g = marshalLabel name $ do + _ <- g -- modifier symbol (module name or subclass name) + g -- | Pulls an Instance Variable out of the object cache. getObjectLink :: Marshal RubyObject diff --git a/src/Data/Ruby/Marshal/Monad.hs b/src/Data/Ruby/Marshal/Monad.hs index 31ce2ee..00ab31d 100644 --- a/src/Data/Ruby/Marshal/Monad.hs +++ b/src/Data/Ruby/Marshal/Monad.hs @@ -73,16 +73,31 @@ readSymbol :: Int -> Marshal (Maybe RubyObject) readSymbol index = readCache index symbols -- | Write an object to the appropriate cache. +-- +-- Symbols feed the symbol table that 'Symlink' draws from; everything else +-- that can be the target of an 'Object link' (@\@@) feeds the object table. +-- 'RFixnum', 'RNil', 'RBool', and 'RString' are intentionally not cached: +-- the first three are immediate values and bare 'RString' values never appear +-- on the wire outside of an 'RIVar' wrapper, which is itself cached. writeCache :: RubyObject -> Marshal () writeCache object = do cache <- get + let putObj = put $ cache { objects = V.snoc (objects cache) object } + putSym = put $ cache { symbols = V.snoc (symbols cache) object } case object of - RSymbol _ -> do - put $ cache { symbols = V.snoc (symbols cache) object } - RIVar _ -> do - put $ cache { objects = V.snoc (objects cache) object } - RArray _ -> do - put $ cache { objects = V.snoc (objects cache) object } - RHash _ -> do - put $ cache { objects = V.snoc (objects cache) object } - _ -> return () + RSymbol _ -> putSym + RIVar _ -> putObj + RArray _ -> putObj + RHash _ -> putObj + RHashWithDefault _ _ -> putObj + RBignum _ -> putObj + RRegexp _ _ -> putObj + RObject _ _ -> putObj + RStruct _ _ -> putObj + RClass _ -> putObj + RModule _ -> putObj + RUserDef _ _ -> putObj + RUserMarshal _ _ -> putObj + RData _ _ -> putObj + Unsupported -> putObj + _ -> return () diff --git a/src/Data/Ruby/Marshal/RubyObject.hs b/src/Data/Ruby/Marshal/RubyObject.hs index 4815072..6fd7f22 100644 --- a/src/Data/Ruby/Marshal/RubyObject.hs +++ b/src/Data/Ruby/Marshal/RubyObject.hs @@ -24,6 +24,7 @@ import qualified Data.ByteString as BS import qualified Data.Map.Strict as DM import Data.Ruby.Marshal.Encoding (RubyStringEncoding (..)) import qualified Data.Vector as V +import Data.Word (Word8) import Prelude -- | Representation of a Ruby object. @@ -34,10 +35,14 @@ data RubyObject -- ^ represents @true@ or @false@ | RFixnum {-# UNPACK #-} !Int -- ^ represents a @Fixnum@ + | RBignum !Integer + -- ^ represents a @Bignum@ | RArray !(V.Vector RubyObject) -- ^ represents an @Array@ | RHash !(V.Vector (RubyObject, RubyObject)) -- ^ represents an @Hash@ + | RHashWithDefault !(V.Vector (RubyObject, RubyObject)) !RubyObject + -- ^ represents a @Hash@ with a default value (pairs, then default) | RIVar !(RubyObject, RubyStringEncoding) -- ^ represents an @IVar@ | RString !BS.ByteString @@ -46,8 +51,25 @@ data RubyObject -- ^ represents a @Float@ | RSymbol !BS.ByteString -- ^ represents a @Symbol@ + | RRegexp !BS.ByteString !Word8 + -- ^ represents a @Regexp@ (pattern bytes, options flags) + | RObject !BS.ByteString !(V.Vector (RubyObject, RubyObject)) + -- ^ represents a generic @Object@ (class name, instance-variable pairs) + | RStruct !BS.ByteString !(V.Vector (RubyObject, RubyObject)) + -- ^ represents a @Struct@ (class name, member pairs) + | RClass !BS.ByteString + -- ^ represents a @Class@ reference (class name) + | RModule !BS.ByteString + -- ^ represents a @Module@ reference (module name) + | RUserDef !BS.ByteString !BS.ByteString + -- ^ represents an object dumped via @_dump@ (class name, opaque payload) + | RUserMarshal !BS.ByteString !RubyObject + -- ^ represents an object dumped via @marshal_dump@ (class name, payload object) + | RData !BS.ByteString !RubyObject + -- ^ represents an object dumped via @_dump_data@ (class name, state object) | Unsupported - -- ^ represents an invalid object + -- ^ represents an object whose bytes were consumed but whose Ruby semantics + -- this library does not (yet) model deriving (Eq, Ord, Show) -- | Transform plain Haskell values to RubyObjects and back. diff --git a/src/Data/Ruby/Marshal/Types.hs b/src/Data/Ruby/Marshal/Types.hs index 5834ce5..cf3bec3 100644 --- a/src/Data/Ruby/Marshal/Types.hs +++ b/src/Data/Ruby/Marshal/Types.hs @@ -36,6 +36,19 @@ module Data.Ruby.Marshal.Types ( , pattern StringChar , pattern SymbolChar , pattern SymlinkChar + , pattern BignumChar + , pattern HashDefChar + , pattern RegexpChar + , pattern ObjectChar + , pattern StructChar + , pattern ClassChar + , pattern ModuleChar + , pattern OldModuleChar + , pattern ExtendedChar + , pattern UserDefChar + , pattern UserMarshalChar + , pattern UClassChar + , pattern DataChar ) where import Data.Ruby.Marshal.Encoding @@ -66,3 +79,29 @@ pattern StringChar = 34 pattern SymbolChar = 58 -- | Character that represents Symlink. pattern SymlinkChar = 59 +-- | Character that represents Bignum. +pattern BignumChar = 108 +-- | Character that represents Hash with default value. +pattern HashDefChar = 125 +-- | Character that represents Regexp. +pattern RegexpChar = 47 +-- | Character that represents Object. +pattern ObjectChar = 111 +-- | Character that represents Struct. +pattern StructChar = 83 +-- | Character that represents Class reference. +pattern ClassChar = 99 +-- | Character that represents Module reference. +pattern ModuleChar = 109 +-- | Character that represents the legacy Module/Class reference. +pattern OldModuleChar = 77 +-- | Character that represents an object extended with a module. +pattern ExtendedChar = 101 +-- | Character that represents a user-defined dump (_dump). +pattern UserDefChar = 117 +-- | Character that represents a user-defined marshal (marshal_dump). +pattern UserMarshalChar = 85 +-- | Character that represents an object whose class is a user subclass of a builtin. +pattern UClassChar = 67 +-- | Character that represents a Data object (_dump_data). +pattern DataChar = 100 diff --git a/test/MarshalSpec.hs b/test/MarshalSpec.hs index 32bee61..b7665fc 100644 --- a/test/MarshalSpec.hs +++ b/test/MarshalSpec.hs @@ -121,3 +121,90 @@ spec = describe "load" $ do object `shouldBe` Right (RArray $ V.fromList [ RHash mempty, RArray mempty, RIVar (RString "hello", UTF_8), RIVar (RString "haskell", UTF_8) , RHash mempty, RArray mempty, RIVar (RString "hello", UTF_8), RIVar (RString "haskell", UTF_8)]) + + context "when we have a positive Bignum (2**40)" $ + it "should parse" $ do + object <- loadBin "test/bin/bignum.bin" + object `shouldBe` Just (RBignum 1099511627776) + + context "when we have a negative Bignum (-(2**40))" $ + it "should parse" $ do + object <- loadBin "test/bin/negativeBignum.bin" + object `shouldBe` Just (RBignum (-1099511627776)) + + context "when we have a Regexp /fo+/i" $ + it "should parse the inner Regexp, discarding the IVar encoding wrapper" $ do + object <- loadBin "test/bin/regexp.bin" + -- options=1 (IGNORECASE); the enclosing IVar's encoding info is dropped + -- because the inner is not a String. + object `shouldBe` Just (RRegexp "fo+" 1) + + context "when we have a Hash with a default value" $ + it "should parse" $ do + object <- loadBin "test/bin/hashWithDefault.bin" + object `shouldBe` Just + (RHashWithDefault + (V.fromList [(RFixnum 1, RFixnum 10), (RFixnum 2, RFixnum 20)]) + (RFixnum 0)) + + context "when we have a Class reference" $ + it "should parse" $ do + object <- loadBin "test/bin/classRef.bin" + object `shouldBe` Just (RClass "Array") + + context "when we have a Module reference" $ + it "should parse" $ do + object <- loadBin "test/bin/moduleRef.bin" + object `shouldBe` Just (RModule "Comparable") + + context "when we have a plain Object (Point.new(1, 2))" $ + it "should parse the class name and instance variables" $ do + object <- loadBin "test/bin/object.bin" + object `shouldBe` Just + (RObject "Point" + (V.fromList [(RSymbol "@x", RFixnum 1), (RSymbol "@y", RFixnum 2)])) + + context "when we have a Struct (PointStruct.new(3, 4))" $ + it "should parse" $ do + object <- loadBin "test/bin/struct.bin" + object `shouldBe` Just + (RStruct "PointStruct" + (V.fromList [(RSymbol "x", RFixnum 3), (RSymbol "y", RFixnum 4)])) + + context "when we have a UserDef object (Packed via _dump)" $ + it "should expose the class name and opaque payload bytes" $ do + object <- loadBin "test/bin/userDef.bin" + -- [42].pack("L") = "*\\0\\0\\0" on a little-endian host. + object `shouldBe` Just (RUserDef "Packed" "\x2a\x00\x00\x00") + + context "when we have a UserMarshal object (Boxed via marshal_dump)" $ + it "should expose the class name and dumped payload" $ do + object <- loadBin "test/bin/userMarshal.bin" + object `shouldBe` Just (RUserMarshal "Boxed" (RFixnum 42)) + + context "when we have a String with an extra non-encoding instance variable" $ + it "should still surface the String+encoding and not corrupt the stream" $ do + -- Previously the IVar parser failed when len /= 1; now it consumes all + -- IV bytes and just picks the encoding it understands. + object <- loadBin "test/bin/stringWithExtraIVar.bin" + object `shouldBe` Just (RIVar (RString "hello", UTF_8)) + + context "when we have a String extended with a module (tag 'e')" $ + it "should pass the inner String through, dropping the module info" $ do + object <- loadBin "test/bin/extendedString.bin" + object `shouldBe` Just (RIVar (RString "extended", UTF_8)) + + context "when we have a subclassed Array (tag 'C')" $ + it "should pass the inner Array through, dropping the subclass info" $ do + object <- loadBin "test/bin/subclassedArray.bin" + object `shouldBe` Just (RArray (V.fromList [RFixnum 1, RFixnum 2])) + + context "when an object-link points back at a previously-parsed Object" $ + it "should resolve the link to the cached Object" $ do + -- Exercises that the new RObject constructor participates in the + -- object cache so that @N references still work in mixed-type arrays. + let point = RObject "Point" + (V.fromList [(RSymbol "@x", RFixnum 1), (RSymbol "@y", RFixnum 2)]) + object <- loadBin "test/bin/objectLinkArray.bin" + object `shouldBe` Just + (RArray (V.fromList [point, RIVar (RString "marker", UTF_8), point])) diff --git a/test/bin/bignum.bin b/test/bin/bignum.bin new file mode 100644 index 0000000..c9bc7ed Binary files /dev/null and b/test/bin/bignum.bin differ diff --git a/test/bin/classRef.bin b/test/bin/classRef.bin new file mode 100644 index 0000000..ef5fac5 --- /dev/null +++ b/test/bin/classRef.bin @@ -0,0 +1,2 @@ +c +Array \ No newline at end of file diff --git a/test/bin/extendedString.bin b/test/bin/extendedString.bin new file mode 100644 index 0000000..97049b9 --- /dev/null +++ b/test/bin/extendedString.bin @@ -0,0 +1 @@ +Ie:Tag" extended:ET \ No newline at end of file diff --git a/test/bin/hashWithDefault.bin b/test/bin/hashWithDefault.bin new file mode 100644 index 0000000..95863da Binary files /dev/null and b/test/bin/hashWithDefault.bin differ diff --git a/test/bin/moduleRef.bin b/test/bin/moduleRef.bin new file mode 100644 index 0000000..23bbe5a --- /dev/null +++ b/test/bin/moduleRef.bin @@ -0,0 +1 @@ +mComparable \ No newline at end of file diff --git a/test/bin/negativeBignum.bin b/test/bin/negativeBignum.bin new file mode 100644 index 0000000..8baa57d Binary files /dev/null and b/test/bin/negativeBignum.bin differ diff --git a/test/bin/object.bin b/test/bin/object.bin new file mode 100644 index 0000000..ac90a6a --- /dev/null +++ b/test/bin/object.bin @@ -0,0 +1,2 @@ +o: +Point:@xi:@yi \ No newline at end of file diff --git a/test/bin/objectLinkArray.bin b/test/bin/objectLinkArray.bin new file mode 100644 index 0000000..6076987 --- /dev/null +++ b/test/bin/objectLinkArray.bin @@ -0,0 +1,2 @@ +[o: +Point:@xi:@yiI" marker:ET@ \ No newline at end of file diff --git a/test/bin/regexp.bin b/test/bin/regexp.bin new file mode 100644 index 0000000..e9af92b --- /dev/null +++ b/test/bin/regexp.bin @@ -0,0 +1 @@ +I/fo+:EF \ No newline at end of file diff --git a/test/bin/stringWithExtraIVar.bin b/test/bin/stringWithExtraIVar.bin new file mode 100644 index 0000000..6caddf4 Binary files /dev/null and b/test/bin/stringWithExtraIVar.bin differ diff --git a/test/bin/struct.bin b/test/bin/struct.bin new file mode 100644 index 0000000..74f575f --- /dev/null +++ b/test/bin/struct.bin @@ -0,0 +1 @@ +S:PointStruct:xi:yi \ No newline at end of file diff --git a/test/bin/subclassedArray.bin b/test/bin/subclassedArray.bin new file mode 100644 index 0000000..22ed0ed --- /dev/null +++ b/test/bin/subclassedArray.bin @@ -0,0 +1 @@ +C: SubArray[ii \ No newline at end of file diff --git a/test/bin/userDef.bin b/test/bin/userDef.bin new file mode 100644 index 0000000..38b234e Binary files /dev/null and b/test/bin/userDef.bin differ diff --git a/test/bin/userMarshal.bin b/test/bin/userMarshal.bin new file mode 100644 index 0000000..d53eb29 --- /dev/null +++ b/test/bin/userMarshal.bin @@ -0,0 +1,2 @@ +U: +Boxedi/ \ No newline at end of file diff --git a/test/dump b/test/dump index 8b50ec9..e9d0948 100755 --- a/test/dump +++ b/test/dump @@ -9,6 +9,38 @@ haskell = "haskell" someHash = {} someArray = [] +# Fixtures for the extended type coverage (Bignum, Regexp, Object, Struct, +# Class, Module, UserDef, UserMarshal, Extended, UClass, multi-IV IVar). + +class Point + def initialize(x, y) + @x = x + @y = y + end +end + +PointStruct = Struct.new(:x, :y) + +class Boxed + def initialize(v); @v = v; end + def marshal_dump; @v; end + def marshal_load(v); @v = v; end +end + +class Packed + def initialize(n); @n = n; end + def _dump(_level); [@n].pack("L"); end + def self._load(s); new(s.unpack("L").first); end +end + +module Tag; end +class SubArray < Array; end + +bigNumber = 2 ** 40 +extendedString = "extended".extend(Tag) +stringWithIvar = "hello".tap { |s| s.instance_variable_set(:@meta, "world") } +sharedPoint = Point.new(1, 2) + TYPES = { "0" => 0, "2048" => 2048, @@ -34,6 +66,21 @@ TYPES = { "symbolArray" => [:hello, :haskell, :hello, :haskell], "true" => true, "objectsAndStringReferences" => [someHash, someArray, hello, haskell, someHash, someArray, hello, haskell], + + "bignum" => bigNumber, + "negativeBignum" => -bigNumber, + "regexp" => /fo+/i, + "hashWithDefault" => Hash.new(0).tap { |h| h[1] = 10; h[2] = 20 }, + "classRef" => Array, + "moduleRef" => Comparable, + "object" => Point.new(1, 2), + "struct" => PointStruct.new(3, 4), + "userDef" => Packed.new(42), + "userMarshal" => Boxed.new(42), + "stringWithExtraIVar" => stringWithIvar, + "extendedString" => extendedString, + "subclassedArray" => SubArray.new([1, 2]), + "objectLinkArray" => [sharedPoint, "marker", sharedPoint], } def dump(name, x)