From ae1a727cc05daed10e4062df9e38590e0418cde5 Mon Sep 17 00:00:00 2001 From: ChenZeiShuai Date: Thu, 30 Apr 2026 11:44:04 +0800 Subject: [PATCH] fix: write minimum byte width in chunked dimension encoded length (#53) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ChunkedStoragePropertyDescription4.Encode() always wrote (byte)8 as the "Dimension Size Encoded Length" field, regardless of the actual dimension magnitudes. The HDF5 spec requires this field to hold the *minimum* number of bytes needed to encode the largest chunk dimension, and libhdf5's H5D__chunk_set_sizes() in src/H5Dchunk.c strictly enforces this with a direct `!=` check that aborts with: "stored chunk dimension encoding length does not match value calculated from chunk dimensions" As a result every chunked file written by PureHDF (with or without filters) was rejected by libhdf5-based readers — h5py, HDFView, MATLAB, Imaris, Bio-Formats — even though PureHDF could read them back itself. This is the same symptom users reported in #53 (pandas via h5py) and is likely related to #88 (h5dump hang). Fix: - Compute the encoded length as `1 + floor(log2(max_dim) / 8)` for max_dim >= 1, and as 1 when every dimension is 0 (min 1 byte, capped at 8 by HDF5 spec), mirroring libhdf5's byte-counting loop. - Replace `driver.Write((byte)8)` with the computed value. - Replace the fixed-width `for (i = 0; i < Rank-1) Write(ulong)` + trailing `Write((ulong)4)` (which also hardcoded the element-size word to 4, a value that was only correct for 4-byte element types such as int32 and wrong for every other dataset) with a single `Rank`-iteration loop using `WriteUtils.WriteUlongArbitrary` at the computed width — element size now uses the real `DimensionSizes[Rank-1]`. - `GetEncodeSize()` updated to reflect the variable byte width. Test: - New `[Theory] ChunkedFile_IsReadableBy_libhdf5` writes a chunked file with PureHDF and opens both the file and the dataset through HDF.PInvoke (same libhdf5 h5py uses). Covers 1-byte, 2-byte, 3-byte encoded length cases plus a 6D microscopy-style chunk shape. 
Verified independently with h5py 3.16 / numpy 2.4 / hdf5 lib 2.0 on a 6D SPAD-counts data export pipeline (chunked + Deflate-1 → 47x compression, chunked + Deflate-9 → 112x compression, all readable round-trip). --- .../DataLayout/StoragePropertyDescriptions.cs | 61 ++++++++++---- .../Writing/DatasetTests@layout_chunked.cs | 81 +++++++++++++++++++ 2 files changed, 128 insertions(+), 14 deletions(-) diff --git a/src/PureHDF/VOL/Native/FileFormat/Level2/ObjectHeaderMessages/DataLayout/StoragePropertyDescriptions.cs b/src/PureHDF/VOL/Native/FileFormat/Level2/ObjectHeaderMessages/DataLayout/StoragePropertyDescriptions.cs index 8e9b6c1d..58c13753 100644 --- a/src/PureHDF/VOL/Native/FileFormat/Level2/ObjectHeaderMessages/DataLayout/StoragePropertyDescriptions.cs +++ b/src/PureHDF/VOL/Native/FileFormat/Level2/ObjectHeaderMessages/DataLayout/StoragePropertyDescriptions.cs @@ -196,14 +196,16 @@ public static ChunkedStoragePropertyDescription4 Decode(NativeReadContext contex public override ushort GetEncodeSize() { + var encLen = ComputeEncodedLength(DimensionSizes); + var encodeSize = - sizeof(byte) + - sizeof(byte) + - sizeof(byte) + - sizeof(ulong) * Rank + - sizeof(byte) + + sizeof(byte) + // flags + sizeof(byte) + // dimensionality (rank) + sizeof(byte) + // dimension size encoded length + encLen * Rank + // dimension sizes (variable byte width) + sizeof(byte) + // chunk indexing type IndexingInformation.GetEncodeSize(Flags) + - sizeof(ulong); + sizeof(ulong); // address return (ushort)encodeSize; } @@ -218,17 +220,21 @@ public override void Encode(H5DriverBase driver) // dimensionality driver.Write(Rank); - // dimension size encoded length - driver.Write((byte)8); - - // dimension sizes - for (int i = 0; i < Rank - 1; i++) + // dimension size encoded length: minimum number of bytes needed to encode + // the largest chunk dimension. 
libhdf5's H5D__chunk_set_sizes() in + // src/H5Dchunk.c strictly enforces (`!=` check) that this value matches its + // own calculation; hardcoding a different value (e.g. 8) produces files h5py / + // HDFView / MATLAB / Imaris reject with "stored chunk dimension encoding + // length does not match value calculated from chunk dimensions". + var encLen = ComputeEncodedLength(DimensionSizes); + driver.Write(encLen); + + // dimension sizes (variable byte width per encLen, last entry is element size) + for (int i = 0; i < Rank; i++) { - driver.Write(DimensionSizes[i]); + WriteUtils.WriteUlongArbitrary(driver, DimensionSizes[i], encLen); } - driver.Write((ulong)4); - // chunk indexing type var indexingType = IndexingInformation switch { @@ -250,6 +256,33 @@ public override void Encode(H5DriverBase driver) IsDirty = false; } + + // Mirrors libhdf5 H5D__chunk_set_sizes() byte-counting logic: counts how many + // 8-bit-shifted iterations bring the largest dimension value to zero. Returns 1 + // even when all dims are zero (encoded length must be at least 1 per HDF5 spec). 
+ private static byte ComputeEncodedLength(ulong[] dimensionSizes) + { + var maxValue = 0UL; + + for (int i = 0; i < dimensionSizes.Length; i++) + { + if (dimensionSizes[i] > maxValue) + maxValue = dimensionSizes[i]; + } + + if (maxValue == 0) + return 1; + + byte length = 0; + + while (maxValue != 0) + { + length++; + maxValue >>= 8; + } + + return length; + } } internal record class VirtualStoragePropertyDescription( diff --git a/tests/PureHDF.Tests/Writing/DatasetTests@layout_chunked.cs b/tests/PureHDF.Tests/Writing/DatasetTests@layout_chunked.cs index 8191696e..113db2bb 100644 --- a/tests/PureHDF.Tests/Writing/DatasetTests@layout_chunked.cs +++ b/tests/PureHDF.Tests/Writing/DatasetTests@layout_chunked.cs @@ -1,5 +1,6 @@ using Xunit; using System.Reflection; +using HDF.PInvoke; using PureHDF.Filters; namespace PureHDF.Tests.Writing; @@ -386,4 +387,84 @@ public void ThrowsForInvalidChunkDimensions() File.Delete(filePath); } } + + // Cross-library compatibility test for chunk dimension encoded length. + // Pre-fix: chunked layouts always wrote (byte)8 as the encoded length, which + // libhdf5's H5D__chunk_set_sizes() rejects with + // "stored chunk dimension encoding length does not match value calculated from chunk dimensions" + // because libhdf5 expects the *minimum* number of bytes needed to hold the + // largest chunk dimension. This test writes a chunked file through PureHDF + // and opens it through libhdf5 (via HDF.PInvoke); regression of the + // encoded-length bug surfaces as H5F.open returning a negative handle. 
+ [Theory] + [InlineData(new uint[] { 10U })] // 1D, max 10 → 1 byte + [InlineData(new uint[] { 256U })] // 1D, max 256 → 2 bytes + [InlineData(new uint[] { 65536U })] // 1D, max 65536 → 3 bytes + [InlineData(new uint[] { 4U, 4U, 32U, 32U, 16U, 1U })] // 6D real-world (microscopy) + public void ChunkedFile_IsReadableBy_libhdf5(uint[] chunkDims) + { + // Arrange — build N-D mock data matching the chunk shape (one chunk per dim) + var totalElements = 1; + foreach (var d in chunkDims) + totalElements *= (int)d; + var rawData = new int[totalElements]; + for (var i = 0; i < totalElements; i++) + rawData[i] = i; + + Array data; + if (chunkDims.Length == 1) + { + data = rawData; + } + else + { + var shape = new int[chunkDims.Length]; + for (var i = 0; i < chunkDims.Length; i++) + shape[i] = (int)chunkDims[i]; + var nd = Array.CreateInstance(typeof(int), shape); + Buffer.BlockCopy(rawData, 0, nd, 0, rawData.Length * sizeof(int)); + data = nd; + } + + var file = new H5File + { + ["chunked"] = new H5Dataset(data, chunks: chunkDims) + }; + + var filePath = Path.GetTempFileName(); + + // Act + file.Write(filePath); + + // Assert — libhdf5 must accept the file (negative handle = error) + try + { + var fileId = H5F.open(filePath, H5F.ACC_RDONLY); + try + { + Assert.True(fileId >= 0, $"H5F.open rejected PureHDF chunked file (handle={fileId})"); + + var datasetId = H5D.open(fileId, "chunked"); + try + { + Assert.True(datasetId >= 0, $"H5D.open rejected chunked dataset (handle={datasetId})"); + } + finally + { + if (datasetId >= 0) + _ = H5D.close(datasetId); + } + } + finally + { + if (fileId >= 0) + _ = H5F.close(fileId); + } + } + finally + { + if (File.Exists(filePath)) + File.Delete(filePath); + } + } } \ No newline at end of file