Skip to content

Types coerced to string #34

@GPHemsley-RELX

Description

@GPHemsley-RELX

utils.parse_type() coerces certain types (for example GUID and datetime values) to strings, so callers lose the ability to manipulate or reformat them as native Python types.

def parse_type(data_type, buffer, length=None, version=3, props=None):
    """Decode one raw column value from ``buffer`` according to ``data_type``.

    Args:
        data_type: One of the module's ``TYPE_*`` constants selecting the
            decoder to use.
        buffer: Bytes-like object holding the raw value; decoding always
            starts at offset 0.
        length: Number of bytes to keep; only consulted for ``TYPE_BINARY``.
        version: Database format version; only consulted for ``TYPE_TEXT``,
            where values above 3 take the BOM / UTF-16 decoding path.
        props: Optional column properties; only the ``"Format"`` entry is
            read, and only for ``TYPE_MONEY``.

    Returns:
        The decoded value. Numeric types come back as ``int``/``float``,
        binary-like types as slices of ``buffer``, and GUID, datetime and
        text as ``str``. An unsupported ``data_type`` is logged at debug
        level and yields the empty string.
    """
    parsed = ""
    # Bool or int8
    if data_type == TYPE_INT8:
        parsed = struct.unpack_from("b", buffer)[0]
    elif data_type == TYPE_INT16:
        parsed = struct.unpack_from("h", buffer)[0]
    elif data_type == TYPE_INT32 or data_type == TYPE_COMPLEX:
        parsed = struct.unpack_from("i", buffer)[0]
    elif data_type == TYPE_MONEY:
        parsed = struct.unpack_from("q", buffer)[0]
        if props and "Format" in props:
            prop_format = props['Format']
            if parsed == 0:
                # A zero amount is rendered from the first default whose
                # format prefix matches the column's format.
                for format_prefix, default_value in FORMAT_TO_DEFAULT_VALUE.items():
                    if prop_format.startswith(format_prefix):
                        parsed = default_value
                        break
                else:
                    # No default is known for this format: keep the raw 0
                    # instead of leaking an empty list to the caller.
                    LOGGER.warning(f"parse_type got unknown format while parsing money field {prop_format}")
            else:
                parsed = parse_money_type(parsed, prop_format)
    elif data_type == TYPE_FLOAT32:
        parsed = struct.unpack_from("f", buffer)[0]
    elif data_type == TYPE_FLOAT64:
        parsed = struct.unpack_from("d", buffer)[0]
    elif data_type == TYPE_DATETIME:
        # Read the 8 bytes as an *unsigned* integer: mdb_date_to_readable()
        # re-packs the value with "Q", which rejects the negative integers a
        # signed read produces whenever the stored double has its sign bit set.
        double_datetime = struct.unpack_from("Q", buffer)[0]
        parsed = mdb_date_to_readable(double_datetime)
    elif data_type == TYPE_BINARY:
        # With length=None the slice keeps the whole buffer.
        parsed = buffer[:length]
    elif data_type == TYPE_OLE:
        parsed = buffer
    elif data_type == TYPE_GUID:
        guid = uuid.UUID(buffer[:16].hex())
        parsed = str(guid)
    elif data_type == TYPE_96_bit_17_BYTES:
        parsed = buffer[:17]
    elif data_type == TYPE_TEXT:
        if version > 3:
            # Looks like if BOM is present text is already decoded
            if buffer.startswith(b"\xfe\xff") or buffer.startswith(b"\xff\xfe"):
                buff = buffer[2:]
                parsed = get_decoded_text(buff)
            else:
                parsed = buffer.decode("utf-16", errors='ignore')
        else:
            parsed = get_decoded_text(buffer)

        # Strip embedded NULs so callers always receive clean text.
        if "\x00" in parsed:
            LOGGER.debug(f"Parsed string contains NUL (0x00) characters: {parsed}")
            parsed = parsed.replace("\x00", "")
    else:
        LOGGER.debug(f"parse_type - unsupported data type: {data_type}")
    return parsed

This especially affects datetime, which is further processed by utils.mdb_date_to_readable():

# https://stackoverflow.com/questions/45560782
def mdb_date_to_readable(double_time):
    """Convert a raw 64-bit Access date value into a printable string.

    Args:
        double_time: Integer whose 8 bytes are the bit pattern of a double
            counting days relative to ``ACCESS_EPOCH``. Both the unsigned
            and the signed reading of those 8 bytes are accepted.

    Returns:
        ``str()`` of the resulting datetime, ``"(Empty Date)"`` when the
        value lands exactly on ``ACCESS_EPOCH``, or ``"(Invalid Date)"``
        when the value cannot be packed into 8 bytes or does not map to a
        representable datetime (out of range, infinity or NaN).
    """
    try:
        try:
            dtime_bytes = struct.pack("Q", double_time)
        except struct.error:
            # A signed 64-bit read of a double with its sign bit set (a date
            # before the epoch) is a negative integer, which "Q" rejects.
            dtime_bytes = struct.pack("q", double_time)
        dtime_double = struct.unpack('<d', dtime_bytes)[0]
        dtime_frac, dtime_whole = math.modf(dtime_double)
        # In the OLE Automation date encoding the fractional part is the time
        # of day regardless of sign (-1.25 is 06:00 on the day before the
        # epoch), so its magnitude is always added.
        dtime = (ACCESS_EPOCH + timedelta(days=dtime_whole) + timedelta(days=abs(dtime_frac)))
    except (OverflowError, ValueError, struct.error):
        # OverflowError: infinity or a date outside datetime's range.
        # ValueError: timedelta() cannot convert NaN.
        # struct.error: value is not an integer that fits in 8 bytes.
        return "(Invalid Date)"
    if dtime == ACCESS_EPOCH:
        return "(Empty Date)"
    return str(dtime)

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions