-
Notifications
You must be signed in to change notification settings - Fork 212
feat: let WriteManifest set and return v3 first_row_id #1321
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1158,6 +1158,110 @@ func (m *ManifestTestSuite) TestV3DataManifestFirstRowIDInheritanceSkipsDeletedE | |
| m.EqualValues(1000+liveCount, *read[2].DataFile().FirstRowID()) | ||
| } | ||
|
|
||
| func (m *ManifestTestSuite) TestWriteManifestWithFirstRowIDOption() { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nit: consider adding (a) a v1/v2 + …
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Added both in b6f3bbb: a v1 manifest + `WithManifestFileFirstRowID` case confirming it's a no-op (`FirstRowID()` is nil) |
||
| partitionSpec := NewPartitionSpecID(1, | ||
| PartitionField{FieldID: 1000, SourceIDs: []int{1}, Name: "x", Transform: IdentityTransform{}}) | ||
| count := int64(10) | ||
| entries := []ManifestEntry{ | ||
| &manifestEntry{ | ||
| EntryStatus: EntryStatusADDED, | ||
| Snapshot: &entrySnapshotID, | ||
| Data: &dataFile{ | ||
| Content: EntryContentData, | ||
| Path: "/data/file1.parquet", | ||
| Format: ParquetFile, | ||
| PartitionData: map[string]any{"x": int(1)}, | ||
| RecordCount: count, | ||
| FileSize: 1000, | ||
| BlockSizeInBytes: 64 * 1024, | ||
| FirstRowIDField: nil, | ||
| }, | ||
| }, | ||
| &manifestEntry{ | ||
| EntryStatus: EntryStatusADDED, | ||
| Snapshot: &entrySnapshotID, | ||
| Data: &dataFile{ | ||
| Content: EntryContentData, | ||
| Path: "/data/file2.parquet", | ||
| Format: ParquetFile, | ||
| PartitionData: map[string]any{"x": int(2)}, | ||
| RecordCount: count, | ||
| FileSize: 2000, | ||
| BlockSizeInBytes: 64 * 1024, | ||
| FirstRowIDField: nil, | ||
| }, | ||
| }, | ||
| } | ||
|
|
||
| // Test 1: WriteManifest with WithManifestFileFirstRowID sets the field. | ||
| var bufWithID bytes.Buffer | ||
| firstRowID := int64(500) | ||
| mf, err := WriteManifest("/manifest.avro", &bufWithID, 3, partitionSpec, testSchema, entrySnapshotID, entries, | ||
| WithManifestFileFirstRowID(firstRowID)) | ||
| m.Require().NoError(err) | ||
| m.Require().NotNil(mf.FirstRowID()) | ||
| m.Equal(firstRowID, *mf.FirstRowID()) | ||
|
|
||
| // Reading back, entries inherit first_row_id from the manifest file. | ||
| read, err := ReadManifest(mf, bytes.NewReader(bufWithID.Bytes()), false) | ||
| m.Require().NoError(err) | ||
| m.Require().Len(read, 2) | ||
| m.Require().NotNil(read[0].DataFile().FirstRowID()) | ||
| m.EqualValues(firstRowID, *read[0].DataFile().FirstRowID()) | ||
| m.Require().NotNil(read[1].DataFile().FirstRowID()) | ||
| m.EqualValues(firstRowID+count, *read[1].DataFile().FirstRowID()) | ||
|
|
||
| // Test 2: WriteManifest without option leaves FirstRowID nil (backward compat). | ||
| var bufNoID bytes.Buffer | ||
| mf2, err := WriteManifest("/manifest.avro", &bufNoID, 3, partitionSpec, testSchema, entrySnapshotID, entries) | ||
| m.Require().NoError(err) | ||
| m.Nil(mf2.FirstRowID()) | ||
|
|
||
| // Test 3: v1 + WithManifestFileFirstRowID is a no-op (version < 3). | ||
| var bufV1 bytes.Buffer | ||
| mf3, err := WriteManifest("/manifest.avro", &bufV1, 1, partitionSpec, testSchema, entrySnapshotID, entries, | ||
| WithManifestFileFirstRowID(999)) | ||
| m.Require().NoError(err) | ||
| m.Nil(mf3.FirstRowID()) | ||
|
|
||
| // Test 4: v3 delete-manifest + WithManifestFileFirstRowID sets the field | ||
| // (version >= 3), but the reader does not inherit first_row_id into entries | ||
| // for delete manifests. | ||
| deleteEntry := &manifestEntry{ | ||
| EntryStatus: EntryStatusADDED, | ||
| Snapshot: &entrySnapshotID, | ||
| Data: &dataFile{ | ||
| Content: EntryContentPosDeletes, | ||
| Path: "/data/deletes.avro", | ||
| Format: AvroFile, | ||
| PartitionData: map[string]any{"x": int(1)}, | ||
| RecordCount: count, | ||
| FileSize: 1000, | ||
| BlockSizeInBytes: 64 * 1024, | ||
| FirstRowIDField: nil, | ||
| }, | ||
| } | ||
| var bufDelete bytes.Buffer | ||
| cnt := &internal.CountingWriter{W: &bufDelete} | ||
| w, err := NewManifestWriter(3, cnt, partitionSpec, testSchema, entrySnapshotID, | ||
| WithManifestWriterContent(ManifestContentDeletes)) | ||
| m.Require().NoError(err) | ||
| m.Require().NoError(w.Add(deleteEntry)) | ||
| m.Require().NoError(w.Close()) | ||
| mf4, err := w.ToManifestFile("/manifest.avro", cnt.Count, | ||
| WithManifestFileContent(ManifestContentDeletes), | ||
| WithManifestFileFirstRowID(777)) | ||
| m.Require().NoError(err) | ||
| m.Require().NotNil(mf4.FirstRowID()) | ||
| m.EqualValues(777, *mf4.FirstRowID()) | ||
|
|
||
| // Reading back a delete manifest — entries should not inherit first_row_id. | ||
| readDelete, err := ReadManifest(mf4, bytes.NewReader(bufDelete.Bytes()), false) | ||
| m.Require().NoError(err) | ||
| m.Require().Len(readDelete, 1) | ||
| m.Nil(readDelete[0].DataFile().FirstRowID()) | ||
| } | ||
|
|
||
| func (m *ManifestTestSuite) TestReadManifestListIncompleteSchema() { | ||
| // Verify that reading a manifest list whose embedded schema references | ||
| // an undefined named type ("field_summary" without its definition) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit (pre-existing):
`WriteManifest` is exported but has no doc comment. Since you're already touching the signature, a one-liner noting that `opts` set v3-specific descriptor fields (e.g. `WithManifestFileFirstRowID`) would be a welcome add.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added a doc comment on WriteManifest in b6f3bbb, calling out that opts can include v3-specific options like WithManifestFileFirstRowID.