diff --git a/.seqbench/baseline.env b/.seqbench/baseline.env index ee12dfcb6..eea4bb9c0 100644 --- a/.seqbench/baseline.env +++ b/.seqbench/baseline.env @@ -1,8 +1,20 @@ GOGC=100 -SEQDB_STORAGE_FRAC_SIZE=16MiB +SEQDB_STORAGE_FRAC_SIZE=1MiB SEQDB_STORAGE_TOTAL_SIZE=10GiB +SEQDB_COMPACTION_ENABLED=true +SEQDB_COMPACTION_WORKERS=4 +SEQDB_COMPACTION_TIME_WINDOW=1h +SEQDB_COMPACTION_TICK_INTERVAL=1s + +SEQDB_COMPACTION_STCS_MERGE_TRIGGER=4 +SEQDB_COMPACTION_STCS_MERGE_FAN_IN=8 +SEQDB_COMPACTION_STCS_MERGE_FAN_OUT_SIZE=256MiB + +SEQDB_COMPACTION_STCS_BUCKET_LOWERBOUND=0.5 +SEQDB_COMPACTION_STCS_BUCKET_UPPERBOUND=1.5 + SEQDB_LIMITS_QUERY_RATE=1024 SEQDB_LIMITS_SEARCH_REQUESTS=1024 SEQDB_LIMITS_BULK_REQUESTS=128 diff --git a/.seqbench/comparison.env b/.seqbench/comparison.env index ee12dfcb6..eea4bb9c0 100644 --- a/.seqbench/comparison.env +++ b/.seqbench/comparison.env @@ -1,8 +1,20 @@ GOGC=100 -SEQDB_STORAGE_FRAC_SIZE=16MiB +SEQDB_STORAGE_FRAC_SIZE=1MiB SEQDB_STORAGE_TOTAL_SIZE=10GiB +SEQDB_COMPACTION_ENABLED=true +SEQDB_COMPACTION_WORKERS=4 +SEQDB_COMPACTION_TIME_WINDOW=1h +SEQDB_COMPACTION_TICK_INTERVAL=1s + +SEQDB_COMPACTION_STCS_MERGE_TRIGGER=4 +SEQDB_COMPACTION_STCS_MERGE_FAN_IN=8 +SEQDB_COMPACTION_STCS_MERGE_FAN_OUT_SIZE=256MiB + +SEQDB_COMPACTION_STCS_BUCKET_LOWERBOUND=0.5 +SEQDB_COMPACTION_STCS_BUCKET_UPPERBOUND=1.5 + SEQDB_LIMITS_QUERY_RATE=1024 SEQDB_LIMITS_SEARCH_REQUESTS=1024 SEQDB_LIMITS_BULK_REQUESTS=128 diff --git a/.seqbench/continuous.env b/.seqbench/continuous.env index 96a870acf..9fb9c7fcc 100644 --- a/.seqbench/continuous.env +++ b/.seqbench/continuous.env @@ -2,9 +2,21 @@ GOGC=100 SEQDB_RESOURCES_SKIP_FSYNC=true -SEQDB_STORAGE_FRAC_SIZE=16MiB +SEQDB_STORAGE_FRAC_SIZE=1MiB SEQDB_STORAGE_TOTAL_SIZE=10GiB +SEQDB_COMPACTION_ENABLED=true +SEQDB_COMPACTION_WORKERS=4 +SEQDB_COMPACTION_TIME_WINDOW=1h +SEQDB_COMPACTION_TICK_INTERVAL=1s + +SEQDB_COMPACTION_STCS_MERGE_TRIGGER=4 +SEQDB_COMPACTION_STCS_MERGE_FAN_IN=8 
+SEQDB_COMPACTION_STCS_MERGE_FAN_OUT_SIZE=256MiB + +SEQDB_COMPACTION_STCS_BUCKET_LOWERBOUND=0.5 +SEQDB_COMPACTION_STCS_BUCKET_UPPERBOUND=1.5 + SEQDB_LIMITS_QUERY_RATE=1024 SEQDB_LIMITS_SEARCH_REQUESTS=1024 SEQDB_LIMITS_BULK_REQUESTS=128 diff --git a/cmd/index_analyzer/main.go b/cmd/index_analyzer/main.go index b7422da41..da158d0cd 100644 --- a/cmd/index_analyzer/main.go +++ b/cmd/index_analyzer/main.go @@ -200,10 +200,19 @@ func analyzeIndex( logger.Fatal("error unpacking lids block", zap.Error(err)) } - last := len(block.Offsets) - 2 - for i := 0; i <= last; i++ { - tokenLIDs = append(tokenLIDs, block.LIDs[block.Offsets[i]:block.Offsets[i+1]]...) - if i < last || block.IsLastLID { // the end of token lids + listsCount := block.GetCount() + for i := 0; i < listsCount; i++ { + lidsBatch := block.GetLIDs(i) + iter := lidsBatch.Iter() + for { + lid, ok := iter.Next() + if !ok { + break + } + tokenLIDs = append(tokenLIDs, lid) + } + + if i < listsCount-1 || block.IsLastLID() { // the end of token lids: every list but the block's last, or the last one when IsLastLID lidsTotal += len(tokenLIDs) lidsLens[tid] = len(tokenLIDs) lidsUniq[getLIDsHash(tokenLIDs)] = len(tokenLIDs) diff --git a/cmd/seq-db/seq-db.go b/cmd/seq-db/seq-db.go index de29ac4ca..e373509a9 100644 --- a/cmd/seq-db/seq-db.go +++ b/cmd/seq-db/seq-db.go @@ -19,6 +19,7 @@ import ( "github.com/ozontech/seq-db/asyncsearcher" "github.com/ozontech/seq-db/buildinfo" + "github.com/ozontech/seq-db/compaction" "github.com/ozontech/seq-db/config" "github.com/ozontech/seq-db/consts" "github.com/ozontech/seq-db/frac" @@ -272,6 +273,7 @@ func startStore( TokenTableZstdLevel: cfg.Compression.SealedZstdCompressionLevel, DocBlocksZstdLevel: cfg.Compression.DocBlockZstdCompressionLevel, DocBlockSize: int(cfg.DocsSorting.DocBlockSize), + LidsBitmapThreshold: cfg.Sealing.Lids.BitmapThreshold, }, Fraction: frac.Config{ Search: frac.SearchConfig{ @@ -289,6 +291,7 @@ func startStore( OffloadingRetention: cfg.Offloading.Retention, OffloadingRetryDelay: cfg.Offloading.RetryDelay,
OffloadingQueueSize: uint64(float64(cfg.Storage.TotalSize) * cfg.Offloading.QueueSizePercent / 100), + CompactionEnabled: cfg.Compaction.Enabled, }, API: storeapi.APIConfig{ StoreMode: configMode, @@ -324,6 +327,20 @@ func startStore( Workers: cfg.SkipMaskManager.Workers, CacheSizeLimit: uint64(cfg.SkipMaskManager.CacheSize), }, + Compaction: compaction.Config{ + Enabled: cfg.Compaction.Enabled, + + MergeTrigger: cfg.Compaction.STCS.MergeTrigger, + MergeFanIn: cfg.Compaction.STCS.MergeFanIn, + MergeFanOutSize: uint64(cfg.Compaction.STCS.MergeFanOutSize), + + BucketLowerbound: cfg.Compaction.STCS.BucketLowerbound, + BucketUpperbound: cfg.Compaction.STCS.BucketUpperbound, + + Workers: cfg.Compaction.Workers, + TimeWindow: cfg.Compaction.TimeWindow, + TickInterval: cfg.Compaction.TickInterval, + }, } s3cli := initS3Client(cfg) diff --git a/compaction/executor.go b/compaction/executor.go new file mode 100644 index 000000000..b0cc6db93 --- /dev/null +++ b/compaction/executor.go @@ -0,0 +1,76 @@ +package compaction + +import ( + "sync" + "time" + + "go.uber.org/zap" + + "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac/sealed" + "github.com/ozontech/seq-db/logger" +) + +type Executor struct { + params common.SealParams + + workers int + wg sync.WaitGroup + + p *planner +} + +func NewExecutor(workers int, params common.SealParams, p *planner) *Executor { + e := Executor{params: params, workers: workers, p: p} + e.init() + return &e +} + +func (e *Executor) Stop() { + e.p.stop() + e.wg.Wait() +} + +func (e *Executor) init() { + for range e.workers { + e.wg.Go(func() { + for t := range e.p.tasks { + start := time.Now() + + result, err := e.compact(t) + compactionDurationSeconds. + WithLabelValues(t.bucketSize). 
+ Observe(time.Since(start).Seconds()) + + t.onComplete(result, err) + } + }) + } +} + +func (e *Executor) compact(t task) (*sealed.PreloadedData, error) { + var ( + names []string + srcs []Source + ) + + for _, f := range t.snapshot.Fractions() { + names = append(names, f.Info().Name()) + srcs = append(srcs, frac.NewSealedSource(f)) + + compactionBytesTotal. + WithLabelValues(t.bucketSize). + Add(float64(f.Info().IndexOnDisk)) + } + + logger.Info( + "compacting fractions", + zap.Time("bin", t.bin), + zap.Strings("names", names), + zap.String("bucket_size", t.bucketSize), + ) + + preloaded, err := Merge(t.filename, e.params, srcs...) + return preloaded, err +} diff --git a/compaction/merge.go b/compaction/merge.go new file mode 100644 index 000000000..30aa3ab8a --- /dev/null +++ b/compaction/merge.go @@ -0,0 +1,162 @@ +package compaction + +import ( + "errors" + "os" + + "github.com/ozontech/seq-db/consts" + "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac/sealed" + "github.com/ozontech/seq-db/indexwriter" +) + +func Merge(filename string, params common.SealParams, srcs ...Source) (*sealed.PreloadedData, error) { + w := indexwriter.New(params) + src := NewMergeSource(filename, srcs) + + if err := createAndWrite( + filename+consts.OffsetsTmpFileSuffix, + filename+consts.OffsetsFileSuffix, + func(f *os.File) error { return w.WriteOffsetsFile(f, src) }, + ); err != nil { + return nil, err + } + + if err := createAndWrite( + filename+consts.IDTmpFileSuffix, + filename+consts.IDFileSuffix, + func(f *os.File) error { return w.WriteIDFile(f, src) }, + ); err != nil { + return nil, err + } + + if err := createAndWriteBoth( + filename+consts.TokenTmpFileSuffix, + filename+consts.TokenFileSuffix, + filename+consts.LIDTmpFileSuffix, + filename+consts.LIDFileSuffix, + func(tf, lf *os.File) error { return w.WriteTokenTriplet(tf, lf, src) }, + ); err != nil { + return nil, err + } + + if err := createAndWrite( + filename+consts.InfoTmpFileSuffix, + 
filename+consts.InfoFileSuffix, + func(f *os.File) error { return w.WriteInfoFile(f, src) }, + ); err != nil { + return nil, err + } + + if err := mergeDocs(filename, srcs...); err != nil { + return nil, err + } + + info := src.Info() + info.IndexOnDisk = 0 + + for _, suffix := range []string{ + consts.InfoFileSuffix, + consts.TokenFileSuffix, + consts.OffsetsFileSuffix, + consts.IDFileSuffix, + consts.LIDFileSuffix, + } { + st, err := os.Stat(info.Path + suffix) + if err != nil { + return nil, err + } + info.IndexOnDisk += uint64(st.Size()) + } + + lidsTable := w.LIDsTable() + preloaded := &sealed.PreloadedData{ + Info: info, + TokenTable: w.TokenTable(), + BlocksData: sealed.BlocksData{ + LIDsTable: &lidsTable, + IDsTable: w.IDsTable(), + BlocksOffsets: src.BlockOffsets(), + }, + } + + return preloaded, nil +} + +func mergeDocs(filename string, srcs ...Source) error { + return createAndWrite( + filename+consts.DocsTmpFileSuffix, + filename+consts.DocsFileSuffix, + func(f *os.File) error { + var docsSize uint64 + for _, src := range srcs { + for loc, err := range src.DocBlock() { + if err != nil { + return err + } + + payload, offset := loc.First, loc.Second + if _, err := f.WriteAt(payload, int64(offset+docsSize)); err != nil { + return err + } + } + + docsSize += src.Info().DocsOnDisk + } + + return nil + }, + ) +} + +func syncAndClose(f *os.File) error { + if err := f.Sync(); err != nil { + f.Close() + return err + } + return f.Close() +} + +func createAndWrite( + tmp, final string, + write func(*os.File) error, +) error { + f, err := os.Create(tmp) + if err != nil { + return err + } + + if err := errors.Join(write(f), syncAndClose(f)); err != nil { + return err + } + + return os.Rename(tmp, final) +} + +func createAndWriteBoth( + atmp, afinal, + btmp, bfinal string, + write func(*os.File, *os.File) error, +) error { + a, err := os.Create(atmp) + if err != nil { + return err + } + + b, err := os.Create(btmp) + if err != nil { + a.Close() + return err + } + + 
writeErr := write(a, b) + if err := errors.Join(writeErr, syncAndClose(a), syncAndClose(b)); err != nil { + return err + } + + if err := os.Rename(atmp, afinal); err != nil { + return err + } + + return os.Rename(btmp, bfinal) +} diff --git a/compaction/merge_source.go b/compaction/merge_source.go new file mode 100644 index 000000000..f2e49da75 --- /dev/null +++ b/compaction/merge_source.go @@ -0,0 +1,445 @@ +package compaction + +import ( + "bytes" + "iter" + "slices" + "sync" + + "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/indexwriter" + "github.com/ozontech/seq-db/seq" + "github.com/ozontech/seq-db/util" +) + +type ( + Document = util.Pair[seq.ID, []byte] + DocBlockLocation = util.Pair[[]byte, uint64] + TokenPosting = util.Pair[[]byte, []uint32] + DocLocation = util.Pair[seq.ID, seq.DocPos] + IndexedDocBlock = util.Pair[[]byte, []seq.DocPos] +) + +type Source interface { + indexwriter.Source + DocBlock() iter.Seq2[DocBlockLocation, error] +} + +type MergeSource struct { + filename string + + // sources is a slice of [Source] + // which provide view into underlying fractions. + sources []Source + + info *common.Info + infoOnce sync.Once + + offsets []uint64 + offsetsOnce sync.Once + + // docBlockCount is populated during [MergeSource.BlockOffsets] call. + // This slice is used for changing block indexes in [seq.DocPos]. + docBlockCount []int + + // lidMapping describes the transformation of lids + // after k-merge of several fractions. + // + // i-th index of [lidMapping] corresponds to i-th fraction. + // j-th index of i-th [lidMapping] corresponds to rename of j-th lid. + lidMapping [][]uint32 +} + +func NewMergeSource(filename string, sources []Source) *MergeSource { + lidMapping := make([][]uint32, len(sources)) + + for i, src := range sources { + lidMapping[i] = make( + []uint32, + // Increment for [seq.SystemID].
+ src.Info().DocsTotal+1, + ) + } + + s := &MergeSource{ + filename: filename, + sources: sources, + lidMapping: lidMapping, + } + + s.info = s.prepareInfo() + return s +} + +func (s *MergeSource) prepareInfo() *common.Info { + info := common.NewInfo(s.filename, 0, 0) + + var ( + from seq.MID = seq.MaxID.MID + to seq.MID = seq.MinID.MID + ) + + for _, src := range s.sources { + from = min(from, src.Info().From) + to = max(to, src.Info().To) + } + + info.From, info.To = from, to + info.SealingTime = info.CreationTime + + info.InitEmptyDistribution() + return info +} + +func (s *MergeSource) Info() *common.Info { + s.infoOnce.Do(func() { + for i := range s.sources { + sinfo := s.sources[i].Info() + + s.info.DocsRaw += sinfo.DocsRaw + s.info.DocsTotal += sinfo.DocsTotal + s.info.DocsOnDisk += sinfo.DocsOnDisk + + // NOTE(dkharms): [IndexOnDisk] is calculated later. + } + }) + + return s.info +} + +func (s *MergeSource) BlockOffsets() []uint64 { + s.offsetsOnce.Do(func() { + var ( + docsSize uint64 + offsets []uint64 + ) + + s.docBlockCount = append(s.docBlockCount, 0) + for i := 0; i < len(s.sources); i++ { + for _, offset := range s.sources[i].BlockOffsets() { + offsets = append(offsets, uint64(offset)+docsSize) + } + docsSize += s.sources[i].Info().DocsOnDisk + s.docBlockCount = append(s.docBlockCount, len(offsets)) + } + + s.offsets = offsets + }) + + return s.offsets +} + +func (s *MergeSource) ID() iter.Seq2[DocLocation, error] { + // TODO(dkharms): For now, I will use stupid-simple linear scan for k-way merge. + // + // Its time complexity O(k*n) so it's not efficient enough if we compare it + // against time complexity of min-heap (which is O(n*log(k))) + // or another great data structure -- tournament tree -- which is O(n*log(k)) as well. + // + // However, tournament tree performs less comparisons than min-heap + // and it is around log(k) vs 2*log(k). 
+ + type cursor struct { + next func() (DocLocation, error, bool) + stop func() + + loc DocLocation + lidOld uint32 + + ok bool + } + + return func(yield func(DocLocation, error) bool) { + var cursors []cursor + + defer func() { + for _, c := range cursors { + c.stop() + } + }() + + for i := range s.sources { + src := s.sources[i] + next, stop := iter.Pull2(src.ID()) + + // Skip [seq.SystemID] and [seq.SystemDocPos]. + _, _, _ = next() + + loc, err, ok := next() + cursors = append(cursors, cursor{ + next: next, stop: stop, + loc: loc, lidOld: 1, + ok: ok && err == nil, + }) + + if err != nil { + yield(DocLocation{}, err) + return + } + } + + lid := uint32(1) + // We've previously dropped [seq.SystemID] from + // iterators however we do have to emit one such id. + if !yield(DocLocation{First: seq.SystemID, Second: seq.SystemDocPos}, nil) { + return + } + + for { + var ( + id seq.ID = seq.MinID + idx int = -1 + ) + + for i, c := range cursors { + // We exhausted i-th cursor so there is nothing to pull. + if !c.ok { + continue + } + + if idx == -1 || seq.Less(id, c.loc.First) { // idx == -1: always accept the first live cursor, even when its id equals seq.MinID + id = c.loc.First + idx = i + } + } + + // All pull-iterators are exhausted. + // Close all iterators and return. + if idx == -1 { + break + } + + c := cursors[idx] + + minID, lidOld := c.loc.First, c.lidOld + s.info.AddMID(uint64(minID.MID)) + + blockIdx, offset := c.loc.Second.Unpack() + minDocPos := seq.PackDocPos(uint32(s.docBlockCount[idx]+int(blockIdx)), offset) + + if !yield(DocLocation{First: minID, Second: minDocPos}, nil) { + return + } + + // Rename lid from picked cursor to the new value. + s.lidMapping[idx][lidOld] = lid + + var err error + c.loc, err, c.ok = c.next() + c.lidOld += 1 + + if err != nil { + cursors[idx] = c + yield(DocLocation{}, err) + return + } + + lid += 1 + cursors[idx] = c + } + } +} + +func (s *MergeSource) TokenTriplet() iter.Seq2[string, iter.Seq2[TokenPosting, error]] { + // TODO(dkharms): For now, I will use stupid-simple linear scan for k-way merge.
+ // + // Its time complexity O(k*n) so it's not efficient enough if we compare it + // against time complexity of min-heap (which is O(n*log(k))) + // or another great data structure -- tournament tree -- which is O(n*log(k)) as well. + // + // However, tournament tree performs less comparisons than min-heap + // and it is around log(k) vs 2*log(k). + + type cursor struct { + next func() (string, iter.Seq2[TokenPosting, error], bool) + stop func() + + field string + tokIt iter.Seq2[TokenPosting, error] + + ok bool + } + + minimal := func(cursors []cursor) (string, bool) { + var ( + set bool + field string + ) + + for _, c := range cursors { + if !c.ok { + continue + } + + if !set { + field = c.field + set = true + continue + } + + field = min(field, c.field) + } + + return field, set + } + + return func(yield func(string, iter.Seq2[TokenPosting, error]) bool) { + var cursors []cursor + + for i := range s.sources { + src := s.sources[i] + + next, stop := iter.Pull2(src.TokenTriplet()) + field, tokIt, has := next() + + cursors = append(cursors, cursor{ + next: next, stop: stop, + field: field, tokIt: tokIt, + ok: has, + }) + } + + defer func() { + for _, c := range cursors { + c.stop() + } + }() + + for { + field, ok := minimal(cursors) + if !ok { + break + } + + var ( + idxs []int + iters []iter.Seq2[TokenPosting, error] + ) + + for i, c := range cursors { + if !c.ok || c.field != field { + continue + } + + idxs = append(idxs, i) + iters = append(iters, c.tokIt) + } + + if !yield(field, s.postingsForField(idxs, iters)) { + return + } + + // Advance all cursors that were on this field. 
+ for _, idx := range idxs { + c := cursors[idx] + c.field, c.tokIt, c.ok = c.next() + cursors[idx] = c + } + } + } +} + +func (s *MergeSource) postingsForField( + idxs []int, iters []iter.Seq2[TokenPosting, error], +) iter.Seq2[TokenPosting, error] { + type cursor struct { + next func() (TokenPosting, error, bool) + stop func() + + idx int + posting TokenPosting + + ok bool + } + + minimal := func(cursors []cursor) ([]byte, bool) { + var ( + set bool + token []byte + ) + + for _, c := range cursors { + if !c.ok { + continue + } + + if !set { + token = c.posting.First + set = true + continue + } + + if bytes.Compare(c.posting.First, token) < 0 { + token = c.posting.First + } + } + + return token, set + } + + // NB: This buffer will be reused across + // all calls within current field. + var lidRenamed []uint32 + + return func(yield func(TokenPosting, error) bool) { + var cursors []cursor + + defer func() { + for _, c := range cursors { + c.stop() + } + }() + + for i := range iters { + next, stop := iter.Pull2(iters[i]) + posting, err, ok := next() + + cursors = append(cursors, cursor{ + next: next, stop: stop, + idx: idxs[i], posting: posting, + ok: ok && err == nil, + }) + + if err != nil { + yield(TokenPosting{}, err) + return + } + } + + for { + token, ok := minimal(cursors) + if !ok { + break + } + + // Collect and remap lids from all cursors at this token, then advance them. 
+ for i, c := range cursors { + if !c.ok || !bytes.Equal(c.posting.First, token) { + continue + } + + for _, lid := range c.posting.Second { + lidRenamed = append(lidRenamed, s.lidMapping[c.idx][lid]) + } + + var err error + c.posting, err, c.ok = c.next() + + if err != nil { + cursors[i] = c + yield(TokenPosting{}, err) + return + } + + cursors[i] = c + } + + slices.Sort(lidRenamed) + if !yield(TokenPosting{First: token, Second: lidRenamed}, nil) { + return + } + + lidRenamed = lidRenamed[:0] + } + } +} diff --git a/compaction/merge_source_test.go b/compaction/merge_source_test.go new file mode 100644 index 000000000..bb5fb3b15 --- /dev/null +++ b/compaction/merge_source_test.go @@ -0,0 +1,352 @@ +package compaction + +import ( + "cmp" + "fmt" + "iter" + "math/rand" + "slices" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/seq" +) + +type mockSealingSource struct { + ids []seq.ID + pos []seq.DocPos + blocks []uint64 + docsOnDisk uint64 + fields map[string]map[string][]uint32 +} + +func (m *mockSealingSource) Info() *common.Info { + return &common.Info{ + DocsRaw: m.docsOnDisk, + DocsTotal: uint32(len(m.ids)), + DocsOnDisk: m.docsOnDisk, + + From: slices.MinFunc(m.ids, func(x, y seq.ID) int { + return cmp.Compare(x.MID, y.MID) + }).MID, + + To: slices.MaxFunc(m.ids, func(x, y seq.ID) int { + return cmp.Compare(x.MID, y.MID) + }).MID, + } +} + +func (m *mockSealingSource) BlockOffsets() []uint64 { + return m.blocks +} + +func (m *mockSealingSource) ID() iter.Seq2[DocLocation, error] { + return func(yield func(DocLocation, error) bool) { + docloc := DocLocation{First: seq.SystemID, Second: seq.SystemDocPos} + if !yield(docloc, nil) { + return + } + + for i, id := range m.ids { + docloc = DocLocation{First: id, Second: m.pos[i]} + if !yield(docloc, nil) { + return + } + } + } +} + +func (m *mockSealingSource) TokenTriplet() iter.Seq2[string, iter.Seq2[TokenPosting, error]] { + fields := 
make([]string, 0, len(m.fields)) + for f := range m.fields { + fields = append(fields, f) + } + + slices.Sort(fields) + return func(yield func(string, iter.Seq2[TokenPosting, error]) bool) { + for _, field := range fields { + if !yield(field, m.postingsForField(field)) { + return + } + } + } +} + +func (m *mockSealingSource) postingsForField(field string) iter.Seq2[TokenPosting, error] { + return func(yield func(TokenPosting, error) bool) { + tokens := make([]string, 0, len(m.fields[field])) + for t := range m.fields[field] { + tokens = append(tokens, t) + } + + slices.Sort(tokens) + for _, tok := range tokens { + posting := TokenPosting{ + First: []byte(tok), + Second: m.fields[field][tok], + } + + if !yield(posting, nil) { + return + } + } + } +} + +func (m *mockSealingSource) DocBlock() iter.Seq2[DocBlockLocation, error] { + return func(yield func(DocBlockLocation, error) bool) { + if !yield(DocBlockLocation{}, nil) { + return + } + } +} + +func (m *mockSealingSource) LastError() error { + return nil +} + +func TestMergeSource(t *testing.T) { + first := &mockSealingSource{ + ids: []seq.ID{ + {MID: 3}, + {MID: 2}, + {MID: 1}, + }, + + pos: []seq.DocPos{ + seq.PackDocPos(0, 0), + seq.PackDocPos(0, 1024), + seq.PackDocPos(0, 2048), + }, + + fields: map[string]map[string][]uint32{ + "level": { + "error": {1, 3}, + "info": {2, 3}, + }, + }, + + blocks: []uint64{0}, + docsOnDisk: 1024, + } + + second := &mockSealingSource{ + ids: []seq.ID{ + {MID: 6}, + {MID: 5}, + }, + + pos: []seq.DocPos{ + seq.PackDocPos(0, 0), + seq.PackDocPos(0, 2048), + }, + + fields: map[string]map[string][]uint32{ + "level": { + "debug": {1}, + "info": {2}, + }, + }, + + blocks: []uint64{0}, + docsOnDisk: 2048, + } + + source := NewMergeSource("inmemory", []Source{first, second}) + + t.Run("offsets", func(t *testing.T) { + // Validate correctness of [storage.DocBlock] calculation. 
+ offsets := source.BlockOffsets() + require.Equal(t, []uint64{0, 1024}, offsets) + }) + + t.Run("ids", func(t *testing.T) { + var ( + ids []seq.ID + docpos []seq.DocPos + ) + + for loc, err := range source.ID() { + require.NoError(t, err) + ids = append(ids, loc.First) + docpos = append(docpos, loc.Second) + } + + require.Equal(t, + []seq.ID{ + seq.SystemID, + // [seq.ID] from the second source. + {MID: 6}, + {MID: 5}, + // [seq.ID] from the first source. + {MID: 3}, + {MID: 2}, + {MID: 1}, + }, + ids, + ) + + require.Equal(t, + []seq.DocPos{ + seq.SystemDocPos, + // [seq.DocPos] from the second source. + seq.PackDocPos(1, 0), seq.PackDocPos(1, 2048), + // [seq.DocPos] from the first source. + seq.PackDocPos(0, 0), seq.PackDocPos(0, 1024), seq.PackDocPos(0, 2048), + }, + docpos, + ) + }) + + t.Run("tokens-lids", func(t *testing.T) { + var ( + fields []string + tokens [][]byte + lids [][]uint32 + ) + + for field, fieldIt := range source.TokenTriplet() { + fields = append(fields, field) + + for posting, err := range fieldIt { + require.NoError(t, err) + tokens = append(tokens, posting.First) + lids = append(lids, slices.Clone(posting.Second)) + } + } + + // Both sources have the same and the only field. + require.Equal(t, []string{"level"}, fields) + + // Ensure tokens are sorted in ascending order. + require.Equal(t, + [][]byte{[]byte("debug"), []byte("error"), []byte("info")}, + tokens, + ) + + // Ensure correctness of lids remapping: + // ------------------------- + // seq.MID 6 5 | 3 2 1 + // seq.LID (old) 1 2 | 1 2 3 + // seq.LID (new) 1 2 | 3 4 5 + // ------------------------- + require.Equal(t, + [][]uint32{ + // Sequence of [seq.LID] for token `debug`. + {1}, + // Sequence of [seq.LID] for token `error`. + {3, 5}, + // Sequence of [seq.LID] for token `info`. + {2, 4, 5}, + }, + lids, + ) + }) + + t.Run("info", func(t *testing.T) { + merged := source.Info() + finfo, sinfo := first.Info(), second.Info() + + // Validate correctness of fraction time-range. 
+ require.Equal(t, merged.From, min(finfo.From, sinfo.From)) + require.Equal(t, merged.To, max(finfo.To, sinfo.To)) + + // Validate correctness of total documents of merged fractions. + require.Equal(t, merged.DocsTotal, finfo.DocsTotal+sinfo.DocsTotal) + require.Equal(t, merged.DocsOnDisk, finfo.DocsOnDisk+sinfo.DocsOnDisk) + require.Equal(t, merged.DocsRaw, finfo.DocsRaw+sinfo.DocsRaw) + + // Validate correctness of distribution. + require.NotNil(t, merged.Distribution) + require.True(t, merged.IsIntersecting(finfo.From, finfo.To)) + require.True(t, merged.IsIntersecting(sinfo.From, sinfo.To)) + require.True(t, merged.IsIntersecting(min(finfo.From, sinfo.From), max(finfo.To, sinfo.To))) + }) +} + +func BenchmarkMergeSource(b *testing.B) { + const ( + numSources = 4 + docsPerSource = 512_000 + + // Total count of pairs of (field, token) will be + // [numFields] * [numTokens]. + numFields = 512 + numTokens = 16384 + ) + + rng := rand.New(rand.NewSource(42)) + + fieldNames := make([]string, numFields) + for i := range fieldNames { + fieldNames[i] = fmt.Sprintf("field-%d", i) + } + + tokenNames := make([]string, numTokens) + for i := range tokenNames { + tokenNames[i] = fmt.Sprintf("token-%d", i) + } + + makeSource := func(midOffset seq.MID) Source { + ids := make([]seq.ID, docsPerSource) + pos := make([]seq.DocPos, docsPerSource) + + for j := range ids { + // IDs must be in descending MID order within each source. + ids[j] = seq.ID{MID: midOffset + seq.MID(docsPerSource-j)} + pos[j] = seq.PackDocPos(0, uint64(j*64)) + } + + // Assign each lid to a random (field, token) pair from the vocabulary + // so that total lids per source equals [docsPerSource]. 
+ fields := make(map[string]map[string][]uint32) + for lid := uint32(1); lid <= uint32(docsPerSource); lid++ { + field := fieldNames[rng.Intn(numFields)] + token := tokenNames[rng.Intn(numTokens)] + + if fields[field] == nil { + fields[field] = make(map[string][]uint32) + } + + fields[field][token] = append(fields[field][token], lid) + } + + for _, tokens := range fields { + for tok, lids := range tokens { + slices.Sort(lids) + tokens[tok] = lids + } + } + + return &mockSealingSource{ + ids: ids, + pos: pos, + blocks: []uint64{0}, + docsOnDisk: docsPerSource * 64, + fields: fields, + } + } + + sources := make([]Source, numSources) + for i := range sources { + sources[i] = makeSource(seq.MID(i * docsPerSource)) + } + + b.ResetTimer() + b.ReportAllocs() + + for b.Loop() { + ms := NewMergeSource("bench", sources) + + ms.BlockOffsets() + for range ms.ID() { + } + + for _, tokIt := range ms.TokenTriplet() { + for range tokIt { + } + } + } +} diff --git a/compaction/metrics.go b/compaction/metrics.go new file mode 100644 index 000000000..a6fe88fd7 --- /dev/null +++ b/compaction/metrics.go @@ -0,0 +1,46 @@ +package compaction + +import ( + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" + + "github.com/ozontech/seq-db/metric" +) + +var ( + compactionSkipped = promauto.NewCounter(prometheus.CounterOpts{ + Namespace: "seq_db_store", + Subsystem: "compaction", + Name: "skipped_total", + Help: "Tick-triggered tasks dropped because all workers were busy or no candidates were found", + }) + + compactionBins = promauto.NewGauge(prometheus.GaugeOpts{ + Namespace: "seq_db_store", + Subsystem: "compaction", + Name: "bins", + Help: "Number of active time-bins considered for compaction", + }) + + compactionDurationSeconds = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "seq_db_store", + Subsystem: "compaction", + Name: "duration_seconds", + Help: "Time spent executing a single compaction", + Buckets: 
metric.SecondsBuckets, + }, []string{"bucket"}) + + compactionBytesTotal = promauto.NewCounterVec(prometheus.CounterOpts{ + Namespace: "seq_db_store", + Subsystem: "compaction", + Name: "bytes_total", + Help: "Total index bytes merged across all compactions", + }, []string{"bucket"}) + + compactionResultTotal = promauto.NewCounterVec(prometheus.CounterOpts{ + Namespace: "seq_db_store", + Subsystem: "compaction", + Name: "result_total", + Help: "Compaction outcomes by result (success, empty, error)", + }, []string{"bucket", "result"}) +) diff --git a/compaction/planner.go b/compaction/planner.go new file mode 100644 index 000000000..da0c00089 --- /dev/null +++ b/compaction/planner.go @@ -0,0 +1,239 @@ +package compaction + +import ( + "cmp" + "context" + "maps" + "math/bits" + "slices" + "sync" + "time" + + "go.uber.org/zap" + + "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac/sealed" + "github.com/ozontech/seq-db/fracmanager" + "github.com/ozontech/seq-db/logger" + "github.com/ozontech/seq-db/util" +) + +type Config struct { + Enabled bool + + MergeTrigger int + MergeFanIn int + MergeFanOutSize uint64 + + BucketLowerbound float64 + BucketUpperbound float64 + + Workers int + TimeWindow time.Duration + TickInterval time.Duration +} + +type fraction interface { + Info() *common.Info +} + +type task struct { + bin time.Time + bucketSize string + + filename string + snapshot *fracmanager.CompactionSnapshot + + onComplete func(*sealed.PreloadedData, error) +} + +type planner struct { + cfg Config + ctx context.Context + + wg sync.WaitGroup + done chan struct{} + + fm *fracmanager.FracManager + tasks chan task +} + +func NewPlanner(ctx context.Context, fm *fracmanager.FracManager, cfg Config) *planner { + p := planner{ + cfg: cfg, + ctx: ctx, + + done: make(chan struct{}), + fm: fm, + + tasks: make(chan task), + } + + if cfg.Enabled { + p.init() + return &p + } + + return &p +} + +func (p *planner) init() { + p.wg.Go(func() { + t := 
time.NewTicker(p.cfg.TickInterval) + + for { + select { + case <-p.ctx.Done(): + close(p.tasks) + return + + case <-p.done: + close(p.tasks) + return + + case <-t.C: + task, ok := p.pick() + if !ok { + compactionSkipped.Inc() + continue + } + + select { + case p.tasks <- task: + case <-time.NewTimer(time.Second).C: + // If all executor workers are busy for some long period of time, + // we want to drop the task because it might contain stale decision. + compactionSkipped.Inc() + } + } + } + }) +} + +func (p *planner) stop() { + close(p.done) + if !p.cfg.Enabled { + close(p.tasks) + } +} + +func (p *planner) pick() (task, bool) { + fractions := p.fm.SealedFractionsSnapshot() + snapshot := make([]fraction, len(fractions)) + + for i := range fractions { + snapshot[i] = fractions[i] + } + + bins := p.distribute(p.cfg.TimeWindow, snapshot) + compactionBins.Set(float64(len(bins))) + times := p.prioritize(bins) + + for _, t := range times { + picked := strategySTCS{ + mergeTrigger: p.cfg.MergeTrigger, + mergeFanIn: p.cfg.MergeFanIn, + mergeFanOutSize: p.cfg.MergeFanOutSize, + bucketLowerbound: p.cfg.BucketLowerbound, + bucketUpperbound: p.cfg.BucketUpperbound, + }.Pick(bins[t]) + + if len(picked.fracs) == 0 { + // No candidates were found. 
+ continue + } + + bucketSize := util.SizeStr(powerOfTwo(picked.sizeAvg)) + csnapshot, err := p.fm.ClaimForCompaction(names(picked.fracs)) + if err != nil { + continue + } + + return task{ + bin: t, + bucketSize: bucketSize, + + filename: p.fm.FractionName(), + snapshot: csnapshot, + + onComplete: func(s *sealed.PreloadedData, err error) { + if err != nil { + compactionResultTotal.WithLabelValues(bucketSize, "error").Inc() + + logger.Error( + "failed to compact fractions", + zap.Error(err), + zap.Any("snapshot", names(csnapshot.Fractions())), + ) + + return + } + + if s == nil { + logger.Info( + "compaction did not produce fraction", + zap.Any("snapshot", names(csnapshot.Fractions())), + ) + return + } + + compactionResultTotal.WithLabelValues(bucketSize, "success").Inc() + // TODO(dkharms): Is it fine to substitute and delete? + // We need somehow substitute and delete atomically. + p.fm.SubstituteWithSealed(s, csnapshot) + csnapshot.Destroy() + }, + }, true + } + + return task{}, false +} + +func (p *planner) distribute(window time.Duration, fracs []fraction) map[time.Time][]fraction { + bins := make(map[time.Time][]fraction) + + for _, f := range fracs { + // TODO(dkharms): Group by time-range fraction cover. + // + // Once we implement timestamp-binning, we need to group fractions into bins + // not by creation time, but by time-range they cover. + creation := time.UnixMilli(int64(f.Info().CreationTime)) + + bin := creation.Truncate(window) + bins[bin] = append(bins[bin], f) + } + + return bins +} + +func (p *planner) prioritize(bins map[time.Time][]fraction) []time.Time { + ordered := slices.Collect(maps.Keys(bins)) + + // Prioritize bins with the most fractions above target since they hurt search the most. + // Older bins are preferred on ties since they have been sitting above target longer. 
+ slices.SortFunc(ordered, func(x, y time.Time) int { + xcount := len(bins[x]) + ycount := len(bins[y]) + if xcount == ycount { + return -x.Compare(y) + } + return -cmp.Compare(xcount, ycount) + }) + + return ordered +} + +func names[T interface{ Info() *common.Info }, S ~[]T](fracs S) []string { + fnames := make([]string, len(fracs)) + for i := range fracs { + fnames[i] = fracs[i].Info().Name() + } + return fnames +} + +func powerOfTwo(v uint64) uint64 { + if v == 0 { + return 1 + } + return 1 << bits.Len64(v-1) +} diff --git a/compaction/stcs.go b/compaction/stcs.go new file mode 100644 index 000000000..3baa9d6b3 --- /dev/null +++ b/compaction/stcs.go @@ -0,0 +1,112 @@ +package compaction + +import ( + "cmp" + "slices" +) + +type strategySTCS struct { + // To trigger compaction of bucket there must be + // at least [mergeTrigger] fractions. + mergeTrigger int + + // At most this many fractions are compacted from a single bucket + // per compaction iteration. + mergeFanIn int + mergeFanOutSize uint64 + + // Fraction size must be within [bucketLowerbound, bucketUpperbound] * avg(bucket) + // to be considered part of the bucket. + bucketLowerbound float64 + bucketUpperbound float64 +} + +type bucket struct { + sizeAvg uint64 + fracs []fraction +} + +func (s strategySTCS) Pick(candidates []fraction) bucket { + if len(candidates) < s.mergeTrigger { + return bucket{} + } + + sorted := slices.Clone(candidates) + slices.SortFunc(sorted, func(a, b fraction) int { + return cmp.Compare(a.Info().IndexOnDisk, b.Info().IndexOnDisk) + }) + + buckets := s.group(sorted) + // We are interested in buckets with the most amount of fractions. + // Usually, these are the lowest tiers where all freshly sealed fractions end up. 
+ slices.SortFunc(buckets, func(x, y bucket) int { + return -cmp.Compare(len(x.fracs), len(y.fracs)) + }) + + for _, b := range buckets { + if len(b.fracs) < s.mergeTrigger { + continue + } + + b.fracs = b.fracs[:min(len(b.fracs), s.mergeFanIn)] + if picked := s.takeUntilSize(b); len(picked.fracs) >= s.mergeTrigger { + return picked + } + } + + return bucket{} +} + +func (s strategySTCS) group(sorted []fraction) []bucket { + var ( + sum uint64 + current []fraction + buckets []bucket + ) + + for _, f := range sorted { + size := f.Info().IndexOnDisk + + if len(current) == 0 { + current = append(current, f) + sum = size + continue + } + + avg := float64(sum) / float64(len(current)) + fsize := float64(size) + + lower := avg * s.bucketLowerbound + upper := avg * s.bucketUpperbound + + if lower <= fsize && fsize <= upper { + current = append(current, f) + sum += size + continue + } + + buckets = append(buckets, bucket{uint64(avg), current}) + current = []fraction{f} + sum = size + } + + if len(current) > 0 { + avg := float64(sum) / float64(len(current)) + buckets = append(buckets, bucket{uint64(avg), current}) + } + + return buckets +} + +func (s strategySTCS) takeUntilSize(b bucket) bucket { + var picked uint64 + + for i := range b.fracs { + picked += b.fracs[i].Info().IndexOnDisk + if picked >= s.mergeFanOutSize { + return bucket{b.sizeAvg, b.fracs[:i]} + } + } + + return b +} diff --git a/compaction/stcs_test.go b/compaction/stcs_test.go new file mode 100644 index 000000000..00a1fb6a9 --- /dev/null +++ b/compaction/stcs_test.go @@ -0,0 +1,74 @@ +package compaction + +import ( + "math" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/ozontech/seq-db/frac/common" +) + +type mockFraction struct { + indexOnDisk uint64 +} + +func (m *mockFraction) Info() *common.Info { + return &common.Info{IndexOnDisk: m.indexOnDisk} +} + +func makeFracs(sizes ...uint64) []fraction { + out := make([]fraction, len(sizes)) + for i, s := range sizes { + out[i] = 
&mockFraction{indexOnDisk: s} + } + return out +} + +func TestSTCS_Pick(t *testing.T) { + s := strategySTCS{ + mergeTrigger: 4, + mergeFanIn: 32, + mergeFanOutSize: math.MaxUint64, + bucketLowerbound: 0.5, + bucketUpperbound: 1.5, + } + + t.Run("not-enough-candidates", func(t *testing.T) { + for n := range s.mergeTrigger { + require.Len(t, s.Pick(makeFracs(make([]uint64, n)...)).fracs, 0) + } + }) + + t.Run("requirement-not-met", func(t *testing.T) { + // Each Fraction size is 10x the previous. + // They land in different buckets and no bucket with [mergeTrigger] fractions exists. + require.Len(t, s.Pick(makeFracs(100, 1000, 10000, 100000)).fracs, 0) + }) + + t.Run("one-bucket", func(t *testing.T) { + require.Len(t, s.Pick(makeFracs(1000, 1000, 1000, 1000)).fracs, 4) + }) + + t.Run("largest-bucket", func(t *testing.T) { + b := s.Pick(makeFracs( + 1000, 1000, + 100000, 100000, 100000, 100000, 100000, // Will take this bucket. + )) + + require.Len(t, b.fracs, 5) + for _, f := range b.fracs { + require.Equal(t, uint64(100000), f.Info().IndexOnDisk) + } + }) + + t.Run("cap-at-fan-in", func(t *testing.T) { + sizes := make([]uint64, s.mergeFanIn+10) + + for i := range sizes { + sizes[i] = 5000 + } + + require.Len(t, s.Pick(makeFracs(sizes...)).fracs, s.mergeFanIn) + }) +} diff --git a/config.example.yaml b/config.example.yaml index c5bcb24b8..e4cf0a9af 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -10,13 +10,25 @@ storage: frac_size: 16MiB total_size: 1GiB +compaction: + enabled: true + workers: 4 + time_window: 24h + tick_interval: 1s + stcs: + merge_trigger: 4 + merge_fan_in: 32 + merge_fan_out_size: 256MiB + bucket_lowerbound: 0.5 + bucket_upperbound: 1.5 + # For testing or developments purposes you can run MinIO S3 compatible object storage locally. 
# # docker run -p 9000:9000 -p 9001:9001 \ # quay.io/minio/minio server /data --console-address ":9001" offloading: - enabled: true + enabled: false retention: 5m endpoint: http://localhost:9000/ bucket: remote-storage diff --git a/config/config.go b/config/config.go index 0d929a7f5..d6639dff7 100644 --- a/config/config.go +++ b/config/config.go @@ -35,6 +35,7 @@ func Parse(path string) (Config, error) { } /* Set computed defaults if user did not override them */ + c.Compaction.Workers = cmp.Or(c.Compaction.Workers, NumCPU) c.Resources.ReaderWorkers = cmp.Or(c.Resources.ReaderWorkers, NumCPU) c.Resources.SearchWorkers = cmp.Or(c.Resources.SearchWorkers, NumCPU) @@ -59,7 +60,7 @@ type Config struct { // DataDir is a path to a directory where fractions will be stored. DataDir string `config:"data_dir"` // FracSize specifies the maximum size of an active fraction before it gets sealed. - FracSize Bytes `config:"frac_size" default:"128MiB"` + FracSize Bytes `config:"frac_size" default:"16MiB"` // TotalSize specifies upper bound of how much disk space can be occupied // by sealed fractions before they get deleted (or offloaded). TotalSize Bytes `config:"total_size" default:"1GiB"` @@ -74,6 +75,10 @@ type Config struct { Lids struct { // BlockSize sets max lids (postings) saved per LIDs block. BlockSize int `config:"block_size" default:"65536"` + // BitmapThreshold specifies the minimum number of LIDs a list must contain + // to be serialized as a bitmap. Lists with at least this many elements use bitmap encoding, + // while smaller lists use delta encoding.
+ BitmapThreshold int `config:"bitmap_threshold" default:"65536"` } `config:"lids"` } `config:"sealing"` @@ -209,6 +214,20 @@ type Config struct { DocBlockZstdCompressionLevel int `config:"doc_block_zstd_compression_level" default:"3"` } `config:"compression"` + Compaction struct { + STCS struct { + MergeTrigger int `config:"merge_trigger" default:"4"` + MergeFanIn int `config:"merge_fan_in" default:"32"` + MergeFanOutSize Bytes `config:"merge_fan_out_size" default:"512MiB"` + BucketLowerbound float64 `config:"bucket_lowerbound" default:"0.5"` + BucketUpperbound float64 `config:"bucket_upperbound" default:"1.5"` + } `config:"stcs"` + Enabled bool `config:"enabled"` + Workers int `config:"workers"` + TimeWindow time.Duration `config:"time_window" default:"1h"` + TickInterval time.Duration `config:"tick_interval" default:"1s"` + } `config:"compaction"` + Indexing struct { MaxTokenSize int `config:"max_token_size" default:"72"` CaseSensitive bool `config:"case_sensitive"` diff --git a/config/frac_version.go b/config/frac_version.go index 73c3261a4..5ef2b35da 100644 --- a/config/frac_version.go +++ b/config/frac_version.go @@ -21,6 +21,12 @@ const ( // BinaryDataV4 - delta bitpack encoded MIDs and LIDs BinaryDataV4 + + // BinaryDataV5 - token frequencies stored in token blocks for large tokens + BinaryDataV5 + + // BinaryDataV6 - bitmap for sufficiently large LID lists, mixed LIDs block format + BinaryDataV6 ) -const CurrentFracVersion = BinaryDataV4 +const CurrentFracVersion = BinaryDataV6 diff --git a/config/validation.go b/config/validation.go index c305a7066..a09cb23e7 100644 --- a/config/validation.go +++ b/config/validation.go @@ -73,6 +73,17 @@ func (c *Config) storeValidations() []validateFn { inRange("offloading.queue_size_percent", 0, 100, c.Offloading.QueueSizePercent), greaterThan("experimental.max_regex_tokens_check", -1, c.Experimental.MaxRegexTokensCheck), + + greaterThan("compaction.stcs.merge_trigger", 0, c.Compaction.STCS.MergeTrigger), + 
greaterThan("compaction.stcs.merge_fan_out_size", 0, c.Compaction.STCS.MergeFanOutSize), + greaterOrEqThan("compaction.stcs.merge_fan_in", c.Compaction.STCS.MergeTrigger, c.Compaction.STCS.MergeFanIn), + + greaterThan("compaction.stcs.bucket_lowerbound", 0, c.Compaction.STCS.BucketLowerbound), + greaterOrEqThan("compaction.stcs.bucket_upperbound", c.Compaction.STCS.BucketLowerbound, c.Compaction.STCS.BucketUpperbound), + + greaterOrEqThan("compaction.workers", 0, c.Compaction.Workers), + greaterThan("compaction.time_window", 0, c.Compaction.TimeWindow), + greaterThan("compaction.tick_interval", 0, c.Compaction.TickInterval), } if c.Offloading.Enabled { @@ -112,7 +123,19 @@ func lessOrEqThan[T cmp.Ordered](field string, base, v T) validateFn { return func() error { if v > base { return fmt.Errorf( - "field %q must be greater than %v", + "field %q must be less or equal than %v", + field, base, + ) + } + return nil + } +} + +func greaterOrEqThan[T cmp.Ordered](field string, base, v T) validateFn { + return func() error { + if v < base { + return fmt.Errorf( + "field %q must be greater or equal than %v", field, base, ) } diff --git a/consts/consts.go b/consts/consts.go index ccaba4e2c..3341aecd9 100644 --- a/consts/consts.go +++ b/consts/consts.go @@ -56,6 +56,7 @@ const ( WalFileSuffix = ".wal" DocsFileSuffix = ".docs" + DocsTmpFileSuffix = "._docs" DocsDelFileSuffix = ".docs.del" SdocsFileSuffix = ".sdocs" diff --git a/frac/active_indexer_test.go b/frac/active_indexer_test.go index a1200a7c3..812b27632 100644 --- a/frac/active_indexer_test.go +++ b/frac/active_indexer_test.go @@ -1,4 +1,4 @@ -package frac +package frac_test import ( "bytes" @@ -12,6 +12,7 @@ import ( "go.uber.org/zap/zapcore" "github.com/ozontech/seq-db/cache" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/indexer" "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/metric/stopwatch" @@ -76,20 +77,20 @@ func getTestProcessor() *indexer.Processor { func BenchmarkIndexer(b 
*testing.B) { logger.SetLevel(zapcore.FatalLevel) - idx, stop := NewActiveIndexer(8, 8) + idx, stop := frac.NewActiveIndexer(8, 8) defer stop() allLogs, err := readFileAllAtOnce(filepath.Join(common.TestDataDir, "k8s.logs")) readers := splitLogsToBulks(allLogs, 1000) assert.NoError(b, err) - active := NewActive( + active := frac.NewActive( filepath.Join(b.TempDir(), "test"), idx, storage.NewReadLimiter(1, nil), cache.NewCache[[]byte](nil, nil), cache.NewCache[[]byte](nil, nil), - &Config{}, + &frac.Config{}, testSkipMaskProvider{}, ) diff --git a/frac/common/info.go b/frac/common/info.go index b82f6b996..2a3805aa8 100644 --- a/frac/common/info.go +++ b/frac/common/info.go @@ -82,6 +82,13 @@ func (s *Info) BuildDistribution(mids []uint64) { } } +func (s *Info) AddMID(mid uint64) { + if s.Distribution == nil { + return + } + s.Distribution.Add(seq.MID(mid)) +} + func (s *Info) InitEmptyDistribution() bool { from := s.From.Time() creationTime := time.UnixMilli(int64(s.CreationTime)) diff --git a/frac/common/seal_params.go b/frac/common/seal_params.go index 05f89696f..ca2ba200f 100644 --- a/frac/common/seal_params.go +++ b/frac/common/seal_params.go @@ -7,7 +7,8 @@ type SealParams struct { DocsPositionsZstdLevel int TokenTableZstdLevel int - DocBlocksZstdLevel int // DocBlocksZstdLevel is the zstd compress level of each document block. - LIDBlockSize int - DocBlockSize int // DocBlockSize is decompressed payload size of document block. + DocBlocksZstdLevel int // DocBlocksZstdLevel is the zstd compress level of each document block. + LIDBlockSize int + LidsBitmapThreshold int // LidsBitmapThreshold is the minimum number of LIDs in the lid list to serialize as bitmap. + DocBlockSize int // DocBlockSize is decompressed payload size of document block. 
} diff --git a/frac/fraction_concurrency_test.go b/frac/fraction_concurrency_test.go index 560760cdd..dd85ae91a 100644 --- a/frac/fraction_concurrency_test.go +++ b/frac/fraction_concurrency_test.go @@ -1,4 +1,4 @@ -package frac +package frac_test import ( "fmt" @@ -14,11 +14,12 @@ import ( "golang.org/x/sync/errgroup" "github.com/ozontech/seq-db/cache" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/frac/processor" - "github.com/ozontech/seq-db/frac/sealed/sealing" "github.com/ozontech/seq-db/indexer" "github.com/ozontech/seq-db/parser" + "github.com/ozontech/seq-db/sealing" "github.com/ozontech/seq-db/seq" "github.com/ozontech/seq-db/storage" testcommon "github.com/ozontech/seq-db/tests/common" @@ -39,16 +40,16 @@ func TestConcurrentAppendAndQuery(t *testing.T) { fracPath := filepath.Join(tmpDir, "test_fraction") defer testcommon.RemoveDir(fracPath) - activeIndexer, stop := NewActiveIndexer(numIndexWorkers, 1000) + activeIndexer, stop := frac.NewActiveIndexer(numIndexWorkers, 1000) defer stop() - active := NewActive( + active := frac.NewActive( fracPath, activeIndexer, storage.NewReadLimiter(numReaders/2, nil), cache.NewCache[[]byte](nil, nil), cache.NewCache[[]byte](nil, nil), - &Config{}, + &frac.Config{}, testSkipMaskProvider{}, ) @@ -154,7 +155,7 @@ const ( kafka = "kafka" ) -func readTest(t *testing.T, fraction Fraction, numReaders, numQueries int, docs []*testDoc, fromTime, toTime time.Time, mapping seq.Mapping) { +func readTest(t *testing.T, fraction frac.Fraction, numReaders, numQueries int, docs []*testDoc, fromTime, toTime time.Time, mapping seq.Mapping) { readersGroup, ctx := errgroup.WithContext(t.Context()) type queryFilter func(doc *testDoc) bool @@ -332,7 +333,7 @@ func generatesMessages(numMessages, bulkSize int) ([]*testDoc, [][]string, time. 
return docs, bulks, fromTime, toTime } -func seal(active *Active) (*Sealed, error) { +func seal(active *frac.Active) (*frac.Sealed, error) { sealParams := common.SealParams{ IDsZstdLevel: 1, LIDsZstdLevel: 1, @@ -343,7 +344,7 @@ func seal(active *Active) (*Sealed, error) { DocBlockSize: 128 * int(units.KiB), LIDBlockSize: 512, } - activeSealingSource, err := NewActiveSealingSource(active, sealParams) + activeSealingSource, err := frac.NewActiveSealingSource(active, sealParams) if err != nil { return nil, err } @@ -352,13 +353,13 @@ func seal(active *Active) (*Sealed, error) { return nil, err } - sealed := NewSealedPreloaded( + sealed := frac.NewSealedPreloaded( active.BaseFileName, preloaded, storage.NewReadLimiter(1, nil), - newIndexCache(), + frac.NewIndexCache(), cache.NewCache[[]byte](nil, nil), - &Config{}, + &frac.Config{}, testSkipMaskProvider{}, ) diff --git a/frac/fraction_test.go b/frac/fraction_test.go index 244aeb99f..fdaf894a0 100644 --- a/frac/fraction_test.go +++ b/frac/fraction_test.go @@ -1,7 +1,9 @@ -package frac +package frac_test import ( "context" + cryptorand "crypto/rand" + "encoding/hex" "fmt" "math" "math/rand/v2" @@ -21,12 +23,14 @@ import ( "github.com/stretchr/testify/suite" "github.com/ozontech/seq-db/cache" + "github.com/ozontech/seq-db/compaction" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/frac/processor" - "github.com/ozontech/seq-db/frac/sealed/sealing" "github.com/ozontech/seq-db/indexer" "github.com/ozontech/seq-db/node" "github.com/ozontech/seq-db/parser" + "github.com/ozontech/seq-db/sealing" "github.com/ozontech/seq-db/seq" "github.com/ozontech/seq-db/storage" "github.com/ozontech/seq-db/storage/s3" @@ -46,20 +50,20 @@ func (testSkipMaskProvider) RemoveFrac(_ string) {} type FractionTestSuite struct { suite.Suite tmpDir string - config *Config + config *frac.Config mapping seq.Mapping tokenizers map[seq.TokenizerType]tokenizer.Tokenizer - activeIndexer *ActiveIndexer 
+ activeIndexer *frac.ActiveIndexer stopIndexer func() sealParams common.SealParams - fraction Fraction + fraction frac.Fraction insertDocuments func(docs ...[]string) } func (s *FractionTestSuite) SetupSuiteCommon() { - s.activeIndexer, s.stopIndexer = NewActiveIndexer(4, 10) + s.activeIndexer, s.stopIndexer = frac.NewActiveIndexer(4, 10) } func (s *FractionTestSuite) TearDownSuiteCommon() { @@ -67,7 +71,7 @@ func (s *FractionTestSuite) TearDownSuiteCommon() { } func (s *FractionTestSuite) SetupTestCommon() { - s.config = &Config{} + s.config = &frac.Config{} s.tokenizers = map[seq.TokenizerType]tokenizer.Tokenizer{ seq.TokenizerTypeKeyword: tokenizer.NewKeywordTokenizer(20, false, true), seq.TokenizerTypeText: tokenizer.NewTextTokenizer(20, false, true, 100), @@ -100,6 +104,7 @@ func (s *FractionTestSuite) SetupTestCommon() { DocBlocksZstdLevel: 1, LIDBlockSize: 512, DocBlockSize: 128 * int(units.KiB), + LidsBitmapThreshold: 25, } var err error @@ -115,6 +120,12 @@ func (s *FractionTestSuite) TearDownTestCommon() { s.NoError(err, "Failed to remove tmp dir") } +func randomHex(n int) string { + b := make([]byte, (n+1)/2) + cryptorand.Read(b) + return hex.EncodeToString(b)[:n] +} + func (s *FractionTestSuite) TestSearchKeyword() { docs := []string{ /*0*/ `{"timestamp":"2000-01-01T13:00:25Z","service":"service_a","message":"first message some text","trace_id":"abcdef","source":"prod01","level":"1"}`, @@ -1375,6 +1386,18 @@ func (s *FractionTestSuite) TestSearchLargeFrac() { fromTime: fromTime, toTime: toTime, }, + { + name: "complex AND+OR", + query: "(service:gateway OR service:proxy OR service:scheduler) AND " + + "(message:request OR message:failed) AND (level:1 OR level:2 OR level:3)", + filter: func(doc *testDoc) bool { + return (doc.service == gateway || doc.service == proxy || doc.service == "scheduler") && + (strings.Contains(doc.message, "request") || strings.Contains(doc.message, "failed")) && + (doc.level >= 1 && doc.level <= 3) + }, + fromTime: fromTime, 
+ toTime: toTime, + }, { name: "service:gateway AND NOT (message:request OR message:timed OR level:[0 to 3])", query: "service:gateway AND NOT (message:request OR message:timed OR level:[0 to 3])", @@ -1856,7 +1879,7 @@ func (s *FractionTestSuite) TestMIDDistribution() { s.insertDocuments(docs) - _, ok := s.fraction.(*Active) + _, ok := s.fraction.(*frac.Active) if ok { s.Require().Nil(s.fraction.Info().Distribution, "active fraction has MID distribution") return @@ -1895,15 +1918,15 @@ func (s *FractionTestSuite) TestFractionInfo() { s.Require().Equal(seq.MID(946731654000000000), info.To, "to doesn't match") switch s.fraction.(type) { - case *Active: + case *frac.Active: s.Require().True(info.MetaOnDisk >= uint64(250) && info.MetaOnDisk <= uint64(400), "meta on disk doesn't match. actual value: %d", info.MetaOnDisk) s.Require().Equal(uint64(0), info.IndexOnDisk, "index on disk doesn't match") - case *Sealed: + case *frac.Sealed: s.Require().Equal(uint64(0), info.MetaOnDisk, "meta on disk doesn't match. actual value") s.Require().True(info.IndexOnDisk > uint64(1300) && info.IndexOnDisk < uint64(1450), "index on disk doesn't match. actual value: %d", info.IndexOnDisk) - case *Remote: + case *frac.Remote: s.Require().Equal(uint64(0), info.MetaOnDisk, "meta on disk doesn't match. actual value") s.Require().True(info.IndexOnDisk > uint64(1300) && info.IndexOnDisk < uint64(1450), "index on disk doesn't match. 
actual value: %d", info.IndexOnDisk) @@ -2102,9 +2125,10 @@ func (s *FractionTestSuite) AssertHist( } } -func (s *FractionTestSuite) newActive(bulks ...[]string) *Active { - baseName := filepath.Join(s.tmpDir, "test_fraction") - active := NewActive( +func (s *FractionTestSuite) newActive(bulks ...[]string) *frac.Active { + baseName := filepath.Join(s.tmpDir, randomHex(12)) + + active := frac.NewActive( baseName, s.activeIndexer, storage.NewReadLimiter(1, nil), @@ -2148,20 +2172,20 @@ func (s *FractionTestSuite) newActive(bulks ...[]string) *Active { return active } -func (s *FractionTestSuite) newSealed(bulks ...[]string) *Sealed { +func (s *FractionTestSuite) newSealed(bulks ...[]string) *frac.Sealed { active := s.newActive(bulks...) - activeSealingSource, err := NewActiveSealingSource(active, s.sealParams) + activeSealingSource, err := frac.NewActiveSealingSource(active, s.sealParams) s.Require().NoError(err, "Sealing source creation failed") preloaded, err := sealing.Seal(activeSealingSource, s.sealParams) s.Require().NoError(err, "Sealing failed") - sealed := NewSealedPreloaded( + sealed := frac.NewSealedPreloaded( active.BaseFileName, preloaded, storage.NewReadLimiter(1, nil), - newIndexCache(), + frac.NewIndexCache(), cache.NewCache[[]byte](nil, nil), s.config, testSkipMaskProvider{}, @@ -2194,7 +2218,7 @@ func (s *ActiveFractionTestSuite) SetupTest() { } func (s *ActiveFractionTestSuite) TearDownTest() { - if active, ok := s.fraction.(*Active); ok { + if active, ok := s.fraction.(*frac.Active); ok { active.Release() } else { s.Require().Nil(s.fraction, "fraction is not of Active type") @@ -2212,7 +2236,7 @@ ActiveReplayedFractionTestSuite run tests for active fraction which was replayed */ type ActiveReplayedFractionTestSuite struct { FractionTestSuite - originalFrac *Active + originalFrac *frac.Active } func (s *ActiveReplayedFractionTestSuite) SetupSuite() { @@ -2233,26 +2257,29 @@ func (s *ActiveReplayedFractionTestSuite) SetupTest() { } } -func (s 
*ActiveReplayedFractionTestSuite) Replay(frac *Active) Fraction { - fracFileName := frac.BaseFileName - s.originalFrac = frac - replayedFrac := NewActive( +func (s *ActiveReplayedFractionTestSuite) Replay(f *frac.Active) frac.Fraction { + s.originalFrac = f + fracFileName := f.BaseFileName + + replayedFrac := frac.NewActive( fracFileName, s.activeIndexer, storage.NewReadLimiter(1, nil), cache.NewCache[[]byte](nil, nil), cache.NewCache[[]byte](nil, nil), - &Config{}, + &frac.Config{}, testSkipMaskProvider{}, ) + err := replayedFrac.Replay(context.Background()) s.Require().NoError(err, "replay failed") + return replayedFrac } func (s *ActiveReplayedFractionTestSuite) TearDownTest() { s.originalFrac.Release() - if active, ok := s.fraction.(*Active); ok { + if active, ok := s.fraction.(*frac.Active); ok { active.Release() } else { s.Require().Nil(s.fraction, "fraction is not of Active type") @@ -2287,7 +2314,7 @@ func (s *SealedFractionTestSuite) SetupTest() { } func (s *SealedFractionTestSuite) TearDownTest() { - if sealed, ok := s.fraction.(*Sealed); ok { + if sealed, ok := s.fraction.(*frac.Sealed); ok { sealed.Release() } else { s.Require().Nil(s.fraction, "fraction is not of Sealed type") @@ -2323,7 +2350,7 @@ func (s *SealedLoadedFractionTestSuite) SetupTest() { } func (s *SealedLoadedFractionTestSuite) TearDownTest() { - if sealed, ok := s.fraction.(*Sealed); ok { + if sealed, ok := s.fraction.(*frac.Sealed); ok { sealed.Release() } else { s.Require().Nil(s.fraction, "fraction is not of Sealed type") @@ -2335,14 +2362,14 @@ func (s *SealedLoadedFractionTestSuite) TearDownSuite() { s.TearDownSuiteCommon() } -func (s *SealedLoadedFractionTestSuite) newSealedLoaded(bulks ...[]string) *Sealed { +func (s *SealedLoadedFractionTestSuite) newSealedLoaded(bulks ...[]string) *frac.Sealed { sealed := s.newSealed(bulks...) 
sealed.Release() - sealed = NewSealed( + sealed = frac.NewSealed( sealed.BaseFileName, storage.NewReadLimiter(1, nil), - newIndexCache(), + frac.NewIndexCache(), cache.NewCache[[]byte](nil, nil), nil, s.config, @@ -2399,13 +2426,13 @@ func (s *RemoteFractionTestSuite) SetupTest() { s.Require().NoError(err, "offload failed") s.Require().True(offloaded, "didn't offload frac") - remoteFrac := NewRemote( + remoteFrac := frac.NewRemote( context.Background(), sealed.BaseFileName, storage.NewReadLimiter(1, nil), - newIndexCache(), + frac.NewIndexCache(), cache.NewCache[[]byte](nil, nil), - sealed.info, + sealed.Info(), s.config, s3cli, testSkipMaskProvider{}, @@ -2417,7 +2444,7 @@ func (s *RemoteFractionTestSuite) SetupTest() { } func (s *RemoteFractionTestSuite) TearDownTest() { - if remote, ok := s.fraction.(*Remote); ok { + if remote, ok := s.fraction.(*frac.Remote); ok { remote.Suicide() } else { s.Require().Nil(s.fraction, "fraction is not of Remote type") @@ -2431,6 +2458,113 @@ func (s *RemoteFractionTestSuite) TearDownSuite() { s.s3server.Close() } +type CompactedFractionTestSuite struct { + FractionTestSuite +} + +func (s *CompactedFractionTestSuite) SetupSuite() { + s.SetupSuiteCommon() +} + +func (s *CompactedFractionTestSuite) SetupTest() { + s.SetupTestCommon() + + s.insertDocuments = func(bulks ...[]string) { + if s.fraction != nil { + s.Require().Fail("can insert docs only once") + } + s.fraction = s.newCompacted(bulks...) + } +} + +func (s *CompactedFractionTestSuite) TearDownTest() { + if sealed, ok := s.fraction.(*frac.Sealed); ok { + sealed.Release() + } else { + s.Require().Nil(s.fraction, "fraction is not of Sealed type") + } + s.TearDownTestCommon() +} + +func (s *CompactedFractionTestSuite) TearDownSuite() { + s.TearDownSuiteCommon() +} + +// newCompacted flattens all bulks into one doc list, re-splits it into bulks of max(len(docs)/32, 1) documents, +// seals each bulk as a separate fraction, and folds them into a single fraction by repeatedly calling compaction.Merge.
+func (s *CompactedFractionTestSuite) newCompacted(bulks ...[]string) *frac.Sealed { + // Flatten all documents because we are going to reorganize it. + var docs []string + for _, b := range bulks { + docs = append(docs, b...) + } + + var ( + reorganized [][]string + bulkSize = max(len(docs)/32, 1) + ) + + for i := 0; i < len(docs); i += bulkSize { + reorganized = append( + reorganized, + docs[i:min(i+bulkSize, len(docs))], + ) + } + + merged := s.newSealed(reorganized[0]) + for i, bulk := range reorganized[1:] { + current := s.newSealed(bulk) + + mergedBase := filepath.Join( + s.tmpDir, + fmt.Sprintf("merged-%d", i), + ) + + preloaded, err := compaction.Merge( + mergedBase, s.sealParams, + frac.NewSealedSource(merged), + frac.NewSealedSource(current), + ) + + s.Require().NoError(err) + merged = frac.NewSealedPreloaded( + mergedBase, + preloaded, + storage.NewReadLimiter(1, nil), + frac.NewIndexCache(), + cache.NewCache[[]byte](nil, nil), + s.config, + testSkipMaskProvider{}, + ) + } + + return merged +} + +// TestFractionInfo overrides the base test because DocsOnDisk is larger in a +// merged fraction (sum of two source docs files) and MIDsDistribution is not +// populated by compaction.Merge. 
+func (s *CompactedFractionTestSuite) TestFractionInfo() { + docs := []string{ + `{"timestamp":"2000-01-01T13:00:25Z","service":"service_a","message":"first message some text", "container":"gateway"}`, + `{"timestamp":"2000-01-01T13:00:32Z","service":"service_b","message":"second message other text", "container":"kube-proxy"}`, + `{"timestamp":"2000-01-01T13:00:43Z","service":"service_c","message":"third message other text", "container":"gateway"}`, + `{"timestamp":"2000-01-01T13:00:53Z","service":"service_a","message":"fourth message some text", "container":"kube-proxy"}`, + `{"timestamp":"2000-01-01T13:00:54Z","service":"service_c","message":"apple","container":"kube-scheduler"}`, + } + + s.insertDocuments(docs) + + info := s.fraction.Info() + + s.Require().Equal(uint32(5), info.DocsTotal, "doc total doesn't match") + s.Require().Equal(uint64(583), info.DocsRaw, "doc raw doesn't match") + s.Require().Equal(seq.MID(946731625000000000), info.From, "from doesn't match") + s.Require().Equal(seq.MID(946731654000000000), info.To, "to doesn't match") + s.Require().Equal(uint64(0), info.MetaOnDisk, "meta on disk doesn't match") + s.Require().True(info.IndexOnDisk > 0, "index on disk should be non-zero") +} + func TestActiveFractionTestSuite(t *testing.T) { suite.Run(t, new(ActiveFractionTestSuite)) } @@ -2450,3 +2584,7 @@ func TestSealedLoadedFractionTestSuite(t *testing.T) { func TestRemoteFractionTestSuite(t *testing.T) { suite.Run(t, new(RemoteFractionTestSuite)) } + +func TestCompactedFractionTestSuite(t *testing.T) { + suite.Run(t, new(CompactedFractionTestSuite)) +} diff --git a/frac/index_cache.go b/frac/index_cache.go index 043e8c5ce..f270f209b 100644 --- a/frac/index_cache.go +++ b/frac/index_cache.go @@ -7,7 +7,7 @@ import ( "github.com/ozontech/seq-db/frac/sealed/token" ) -func newIndexCache() *IndexCache { +func NewIndexCache() *IndexCache { return &IndexCache{ LegacyRegistry: cache.NewCache[[]byte](nil, nil), diff --git a/frac/processor/search.go 
b/frac/processor/search.go index 30ed4f45e..b56184c62 100644 --- a/frac/processor/search.go +++ b/frac/processor/search.go @@ -174,10 +174,10 @@ func IndexSearch( return qpr, nil } -func batcher(evalTree node.Node, buf []node.LID) func(need int) []node.LID { +func batcher(evalTree node.Node, buf []node.LID, desc bool) func(need int) []node.LID { if batchNode, ok := tryConvertToBatchedTree(evalTree); ok { return func(need int) []node.LID { - buf = batchNode.NextBatch().LIDs(buf[:0]) + buf = batchNode.NextBatch(need).CopyLIDs(desc, buf[:0]) if len(buf) > need { buf = buf[:need] } @@ -225,7 +225,7 @@ func iterateEvalTree( mids := buffers.mids rids := buffers.rids - batchedEvalTree := batcher(evalTree, buffers.lids) + batchedEvalTree := batcher(evalTree, buffers.lids, params.Order.IsDesc()) timerEval := sw.Timer("eval_tree_next") timerMID := sw.Timer("get_mid") @@ -326,9 +326,9 @@ func iterateEvalTree( func tryConvertToBatchedTree(evalTree node.Node) (node.BatchedNode, bool) { switch it := evalTree.(type) { case *lids.IteratorDesc: - return it, true + return lids.NewBatchedIteratorDesc(it), true case *lids.IteratorAsc: - return it, true + return lids.NewBatchedIteratorAsc(it), true default: return nil, false } diff --git a/frac/sealed.go b/frac/sealed.go index 4bde6d5b9..dcf91f3ff 100644 --- a/frac/sealed.go +++ b/frac/sealed.go @@ -392,6 +392,7 @@ func (f *Sealed) Release() { func (f *Sealed) Suicide() { f.Release() + // Rename docs atomically first — this commits the intent to delete. 
oldPath := f.BaseFileName + consts.DocsFileSuffix newPath := f.BaseFileName + consts.DocsDelFileSuffix diff --git a/frac/sealed/block_offsets.go b/frac/sealed/block_offsets.go index 2be599421..d644a0f77 100644 --- a/frac/sealed/block_offsets.go +++ b/frac/sealed/block_offsets.go @@ -6,13 +6,17 @@ import ( ) type BlockOffsets struct { - IDsTotal uint32 // todo: the best place for this field is Info block - Offsets []uint64 + Offsets []uint64 } func (b *BlockOffsets) Pack(buf []byte) []byte { buf = binary.LittleEndian.AppendUint32(buf, uint32(len(b.Offsets))) - buf = binary.LittleEndian.AppendUint32(buf, b.IDsTotal) + + // NOTE(dkharms): Previously we stored here amount of documents ids. + // + // I've created a task which will require fraction binary version bumping + // to get rid of this: https://github.com/ozontech/seq-db/issues/409 + buf = binary.LittleEndian.AppendUint32(buf, 0) var prev uint64 for _, pos := range b.Offsets { @@ -26,13 +30,16 @@ func (b *BlockOffsets) Unpack(data []byte) error { if len(data) < 4 { return errors.New("blocks offset decoding error: truncated header (missing offsets count)") } + idsBlocksCount := binary.LittleEndian.Uint32(data) data = data[4:] if len(data) < 4 { return errors.New("blocks offset decoding error: truncated header (missing IDsTotal)") } - b.IDsTotal = binary.LittleEndian.Uint32(data) + + // NOTE(dkharms): Previously we stored here amount of documents ids. 
+ _ = binary.LittleEndian.Uint32(data) data = data[4:] offset := uint64(0) @@ -42,15 +49,20 @@ func (b *BlockOffsets) Unpack(data []byte) error { if n == 0 { return errors.New("blocks offset decoding error: varint returned 0") } + if n < 0 { return errors.New("blocks offset decoding error: varint overflow") } + data = data[n:] offset += uint64(delta) + b.Offsets = append(b.Offsets, offset) } + if uint32(len(b.Offsets)) != idsBlocksCount { return errors.New("blocks offset decoding error: offset count mismatch") } + return nil } diff --git a/frac/sealed/lids/block.go b/frac/sealed/lids/block.go index 6d4627ec9..d92603e87 100644 --- a/frac/sealed/lids/block.go +++ b/frac/sealed/lids/block.go @@ -1,53 +1,289 @@ package lids import ( + "bytes" + "encoding/binary" + "fmt" "math" + "slices" "unsafe" + "github.com/RoaringBitmap/roaring/v2" "github.com/ozontech/seq-db/config" + "github.com/ozontech/seq-db/consts" + "github.com/ozontech/seq-db/node" "github.com/ozontech/seq-db/packer" ) -type Block struct { - LIDs []uint32 - Offsets []uint32 - // todo remove this legacy field +const ( + defaultLidsBitmapThreshold = math.MaxInt // bitmaps disabled by default +) + +type BlockPacker struct { + buf []uint32 // bitpack buffer (reusable across packing) + bmIndexes []uint32 // bitmap indexes (reusable across packing) + bitpackLIDs []uint32 + bitpackOffsets []uint32 + bm *roaring.Bitmap // reusable across packing + lidsBitmapThreshold int +} + +func NewBlockPacker() *BlockPacker { + return &BlockPacker{ + bm: roaring.NewBitmap(), + buf: make([]uint32, 0, consts.DefaultLIDBlockCap), + bitpackLIDs: make([]uint32, 0, consts.DefaultLIDBlockCap), + bitpackOffsets: make([]uint32, 0, consts.DefaultLIDBlockCap), + bmIndexes: make([]uint32, 0, consts.DefaultLIDBlockCap/10), + lidsBitmapThreshold: defaultLidsBitmapThreshold, + } +} + +func NewBlockPackerWithThreshold(lidsBitmapThreshold int) *BlockPacker { + p := NewBlockPacker() + if lidsBitmapThreshold != 0 { + p.lidsBitmapThreshold = 
lidsBitmapThreshold + } + return p +} + +// UnpackedBlock contains accumulated LIDs ready to pack. It's only used on sealing/compaction (index writing) time. +type UnpackedBlock struct { + LIDs []uint32 + Offsets []uint32 IsLastLID bool } -func (b *Block) getCount() int { - return len(b.Offsets) - 1 +// Block contains LIDs in variable format. It's used during search/queries processing. +// Field types is used to distinguish format for every LID list. If it's positive or zero, then it's a slot index in offsets +// (delta-encoding is used). If it's negative, then it's a bitmap slot index (index starts from -1, so -1 stands for bitmaps[0]). +// If types is nil, then the entire block is delta-encoded. +// +// On-disk format: +// +// [listsCount: uint32] — number of LID lists in the block +// [bitmapsCount: uint32] — number of lists stored as roaring bitmaps +// [bitmaps: bitmapsCount × roaring bitmap, serial format] +// [bitmapIndexes: delta-bitpack []uint32] — sorted list indices encoded as bitmaps +// [offsets: delta-bitpack []uint32] — slice boundaries in the delta-encoded LIDs array +// [lids: delta-bitpack []uint32] — concatenated delta-encoded LID values +// +// Each list i in [0, listsCount) is either a roaring bitmap (when i appears in bitmapIndexes) +// or a delta-encoded slice lids[offsets[k]:offsets[k+1]], where k is its delta-encoded slot index. +// Lists with length >= lidsBitmapThreshold are stored as bitmaps; shorter lists use delta-encoding. +type Block struct { + types []int32 // determines LID list type: delta-encoded (non-negative value) or bitmap (negative value). 
nil for delta-encoded blocks + lids []uint32 // all LIDs which are delta-encoded as a flat array + offsets []uint32 // offsets for delta-encoded LIDs + bitmaps []*roaring.Bitmap // all LIDs lists which are stored as bitmaps + lastLID bool // legacy field, will be removed soon } -func (b *Block) getLIDs(i int) []uint32 { - return b.LIDs[b.Offsets[i]:b.Offsets[i+1]] +func (b *Block) GetCount() int { + if b.types != nil { + return len(b.types) + } + + return len(b.offsets) - 1 } -func (b *Block) Pack(dst []byte, buf []uint32) []byte { - dst = packer.CompressDeltaBitpackUint32(dst, b.Offsets, buf) - dst = packer.CompressDeltaBitpackUint32(dst, b.LIDs, buf) - return dst +func (b *Block) IsLastLID() bool { + return b.lastLID +} + +func (b *Block) GetLIDs(i int) node.LIDBatch { + if b.types == nil { + return node.NewSliceBatch(b.lids[b.offsets[i]:b.offsets[i+1]]) + } + t := b.types[i] + if t >= 0 { + return node.NewSliceBatch(b.lids[b.offsets[t]:b.offsets[t+1]]) + } + return node.NewBitmapBatch(b.bitmaps[-t-1]) +} + +func (b *Block) CopyLIDs(idx int, dst []uint32) []uint32 { + if b.types == nil { + dst = append(dst, b.lids[b.offsets[idx]:b.offsets[idx+1]]...) + return dst + } + t := b.types[idx] + if t >= 0 { + dst = append(dst, b.lids[b.offsets[t]:b.offsets[t+1]]...) 
+ return dst + } + return b.copyLIDsFromBitmap(t, dst) +} + +func (b *Block) copyLIDsFromBitmap(ref int32, buf []uint32) []uint32 { + bitmap := b.bitmaps[-ref-1] + n := int(bitmap.GetCardinality()) + oldLen := len(buf) + + buf = slices.Grow(buf, n)[:oldLen+n] + dest := buf[oldLen:] + bitmap.ToExistingArray(&dest) + return buf } func (b *Block) GetSizeBytes() int { - const ( - uint32Size = int(unsafe.Sizeof(uint32(0))) - blockSize = int(unsafe.Sizeof(*b)) + const uint32Size = int(unsafe.Sizeof(uint32(0))) + size := int(unsafe.Sizeof(*b)) + uint32Size*cap(b.types) + uint32Size*cap(b.lids) + uint32Size*cap(b.offsets) + for _, bm := range b.bitmaps { + if bm != nil { + size += int(bm.GetSizeInBytes()) + } + } + return size +} + +func (p *BlockPacker) Pack(b *UnpackedBlock, dst []byte) []byte { + p.buf = p.buf[:0] + totalLists := len(b.Offsets) - 1 + bmCount := 0 // count of lid lists that will be stored as bitmaps + bmIndexes := p.bmIndexes[:0] + for i := 0; i < totalLists; i++ { + if int(b.Offsets[i+1]-b.Offsets[i]) >= p.lidsBitmapThreshold { + bmCount++ + bmIndexes = append(bmIndexes, uint32(i)) + } + } + + // write total number of LID lists and bitmap indexes + var numBuf [4]byte + binary.LittleEndian.PutUint32(numBuf[:], uint32(totalLists)) + dst = append(dst, numBuf[:]...) + binary.LittleEndian.PutUint32(numBuf[:], uint32(bmCount)) + dst = append(dst, numBuf[:]...) + + var ( + bitpackLIDs []uint32 + bitpackOffsets []uint32 ) - return blockSize + uint32Size*cap(b.LIDs) + uint32Size*cap(b.Offsets) + + if bmCount > 0 { + bitpackLIDs = p.bitpackLIDs[:0] + bitpackOffsets = p.bitpackOffsets[:0] + if bmCount < totalLists { + bitpackOffsets = append(bitpackOffsets, 0) + } + + for i := 0; i < totalLists; i++ { + lids := b.LIDs[b.Offsets[i]:b.Offsets[i+1]] + if len(lids) >= p.lidsBitmapThreshold { + dst = p.writeAsBitmap(dst, lids) + } else { + bitpackLIDs = append(bitpackLIDs, lids...) 
+ bitpackOffsets = append(bitpackOffsets, uint32(len(bitpackLIDs))) + } + } + } else { + bitpackLIDs = b.LIDs + bitpackOffsets = b.Offsets + } + + dst = packer.CompressDeltaBitpackUint32(dst, bmIndexes, p.buf) + dst = packer.CompressDeltaBitpackUint32(dst, bitpackOffsets, p.buf) + dst = packer.CompressDeltaBitpackUint32(dst, bitpackLIDs, p.buf) + return dst +} + +func (p *BlockPacker) writeAsBitmap(dst []byte, lids []uint32) []byte { + p.bm.Clear() + p.bm.AddMany(lids) + p.bm.RunOptimize() + + wrt := bytes.NewBuffer(dst) + _, err := p.bm.WriteTo(wrt) + if err != nil { + panic(fmt.Errorf("bitmap write failed: %w", err)) + } + return wrt.Bytes() } func (b *Block) Unpack(data []byte, fracVer config.BinaryDataVersion, buf *UnpackBuffer) error { buf.Reset(fracVer) - if fracVer >= config.BinaryDataV4 { - return b.unpackBitpack(data, buf) + if fracVer < config.BinaryDataV4 { + return b.unpackVarintsV1(data, buf) + } + if fracVer < config.BinaryDataV6 { + return b.unpackBitpackV4(data, buf) + } + + return b.unpackBlockV6(data, buf) +} + +// unpackBlockV6 unpacks the mixed bitmap / delta-bitpack format (BinaryDataV6+). 
+func (b *Block) unpackBlockV6(data []byte, buf *UnpackBuffer) error { + listsCount := int(binary.LittleEndian.Uint32(data[:4])) + data = data[4:] + bitmapsCount := int(binary.LittleEndian.Uint32(data[:4])) + data = data[4:] + + bitmaps := make([]*roaring.Bitmap, bitmapsCount) + for i := 0; i < bitmapsCount; i++ { + rb := roaring.NewBitmap() + n, err := rb.ReadFrom(bytes.NewReader(data)) + if err != nil { + return fmt.Errorf("read bitmap %d: %w", i, err) + } + data = data[n:] + bitmaps[i] = rb + } + + var ( + err error + bitmapIndexes []uint32 + ) + data, bitmapIndexes, err = packer.DecompressDeltaBitpackUint32(data, buf.decompressed, buf.compressed) + if err != nil { + return err + } + b.types = deriveTypes(listsCount, bitmapIndexes) + + var values []uint32 + data, values, err = packer.DecompressDeltaBitpackUint32(data, buf.decompressed, buf.compressed) + if err != nil { + return err + } + offsets := append([]uint32{}, values...) + + _, values, err = packer.DecompressDeltaBitpackUint32(data, buf.decompressed, buf.compressed) + if err != nil { + return err } + lids := append([]uint32{}, values...) + + b.lids = lids + b.offsets = offsets + b.bitmaps = bitmaps + return nil +} - return b.unpackVarint(data, buf) +// deriveTypes derives types array (only for LID blocks which have at least one bitmap). 
+func deriveTypes(totalLists int, bitmapIndexes []uint32) []int32 { + if len(bitmapIndexes) == 0 { + return nil + } + listTypes := make([]int32, totalLists) + bmIdx := 0 + deltaIdx := 0 + bmSlotIdx := 0 + for i := 0; i < totalLists; i++ { + if bmSlotIdx < len(bitmapIndexes) && bitmapIndexes[bmSlotIdx] == uint32(i) { + listTypes[i] = -int32(bmIdx + 1) + bmIdx++ + bmSlotIdx++ + } else { + listTypes[i] = int32(deltaIdx) + deltaIdx++ + } + } + return listTypes } -func (b *Block) unpackBitpack(data []byte, buf *UnpackBuffer) error { +func (b *Block) unpackBitpackV4(data []byte, buf *UnpackBuffer) error { var err error var values []uint32 @@ -55,22 +291,26 @@ func (b *Block) unpackBitpack(data []byte, buf *UnpackBuffer) error { if err != nil { return err } - b.Offsets = append([]uint32{}, values...) + offsets := append([]uint32{}, values...) _, values, err = packer.DecompressDeltaBitpackUint32(data, buf.decompressed, buf.compressed) if err != nil { return err } - b.LIDs = append([]uint32{}, values...) + lids := append([]uint32{}, values...) 
+ + b.types = nil + b.lids = lids + b.offsets = offsets + b.bitmaps = nil + b.lastLID = false return nil } -func (b *Block) unpackVarint(data []byte, buf *UnpackBuffer) error { +func (b *Block) unpackVarintsV1(data []byte, buf *UnpackBuffer) error { var lid, offset uint32 - b.IsLastLID = true - - buf.offsets = append(buf.offsets, 0) // first offset is always zero + buf.offsets = append(buf.offsets, 0) unpacker := packer.NewBytesUnpacker(data) for unpacker.Len() > 0 { @@ -80,7 +320,7 @@ func (b *Block) unpackVarint(data []byte, buf *UnpackBuffer) error { } lid += uint32(delta) - if lid == math.MaxUint32 { // end of LIDs of current TID, see `Block.Pack()` method + if lid == math.MaxUint32 { offset = uint32(len(buf.lids)) buf.offsets = append(buf.offsets, offset) lid -= uint32(delta) @@ -90,14 +330,21 @@ func (b *Block) unpackVarint(data []byte, buf *UnpackBuffer) error { buf.lids = append(buf.lids, lid) } + lastLID := true if int(offset) < len(buf.lids) { - b.IsLastLID = false + lastLID = false buf.offsets = append(buf.offsets, uint32(len(buf.lids))) } - // copy from buffer - b.LIDs = append([]uint32{}, buf.lids...) - b.Offsets = append([]uint32{}, buf.offsets...) - + b.types = nil + b.lids = append([]uint32{}, buf.lids...) + b.offsets = append([]uint32{}, buf.offsets...) 
+ b.bitmaps = nil + b.lastLID = lastLID return nil } + +func (b *UnpackedBlock) GetSizeBytes() int { + const uint32Size = 4 + return int(uint32Size*len(b.LIDs) + uint32Size*len(b.Offsets)) +} diff --git a/frac/sealed/lids/block_test.go b/frac/sealed/lids/block_test.go index fde6d26f0..e2e218627 100644 --- a/frac/sealed/lids/block_test.go +++ b/frac/sealed/lids/block_test.go @@ -9,6 +9,7 @@ import ( "github.com/stretchr/testify/require" "github.com/ozontech/seq-db/config" + "github.com/ozontech/seq-db/node" ) func TestBlockPack(t *testing.T) { @@ -44,12 +45,12 @@ func TestBlockPack(t *testing.T) { offsets: []uint32{0, 3, 6, 8}, }, { - name: "medium_many_tokens", + name: "large_delta_encoded", generator: func() ([]uint32, []uint32) { lids := make([]uint32, 0) offsets := []uint32{0} startLID := uint32(100) - for i := 0; i < 10; i++ { + for i := 0; i < 100; i++ { for j := 0; j < 3; j++ { lids = append(lids, startLID+uint32(i*10+j)) } @@ -90,17 +91,32 @@ func TestBlockPack(t *testing.T) { offsets: []uint32{0, 129}, }, { - name: "medium_4k_lids", + name: "medium_4k_bitmap_and_small_list", + lids: generate(4096), + offsets: []uint32{0, 4093, 4096}, + }, + { + name: "medium_4k_small_list_and_bitmap", + lids: generate(4096), + offsets: []uint32{0, 3, 4096}, + }, + { + name: "medium_4k_hybrid", + lids: generate(4096), + offsets: []uint32{0, 1000, 1005, 1010, 2000, 2100, 2103, 2106, 2107, 3000, 3500, 3505, 4096}, + }, + { + name: "medium_4k", lids: generate(4096), offsets: []uint32{0, 4096}, }, { - name: "medium_4k_minus_one_lids", + name: "medium_4095", lids: generate(4095), offsets: []uint32{0, 10, 50, 100, 150, 190, 1000, 1500, 4095}, }, { - name: "medium_4k_plus_one_lids", + name: "medium_4097", lids: generate(4097), offsets: []uint32{0, 10, 50, 100, 150, 190, 1000, 1500, 4097}, }, @@ -123,104 +139,137 @@ func TestBlockPack(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { - var lids []uint32 + var lidsIn []uint32 var offsets []uint32 if 
tc.generator != nil { - lids, offsets = tc.generator() + lidsIn, offsets = tc.generator() } else { - lids = tc.lids + lidsIn = tc.lids offsets = tc.offsets } - block := &Block{ - LIDs: lids, + block := &UnpackedBlock{ + LIDs: lidsIn, Offsets: offsets, } - packed := block.Pack(nil, nil) + packer := NewBlockPacker() + packer.lidsBitmapThreshold = 25 + + packed := packer.Pack(block, nil) require.NotEmpty(t, packed) - unpacked := &Block{} buf := &UnpackBuffer{} - err := unpacked.Unpack(packed, config.CurrentFracVersion, buf) + var unpacked Block + require.NoError(t, unpacked.Unpack(packed, config.CurrentFracVersion, buf)) - require.NoError(t, err) - assert.EqualExportedValues(t, block, unpacked) + assertListsEqual(t, block, &unpacked) }) } } -func generate(n int) []uint32 { - v := make([]uint32, n) - last := uint32(100) - for i := range v { - v[i] = last - last += uint32(1 + rand.Intn(5)) +func TestBlockPack_VariableMixed(t *testing.T) { + small := generate(10) + large := generate(30) + block := &UnpackedBlock{ + LIDs: append(append([]uint32{}, small...), large...), + Offsets: []uint32{0, uint32(len(small)), uint32(len(small) + len(large))}, } - return v + + packed := NewBlockPacker().Pack(block, nil) + + buf := &UnpackBuffer{} + var ub Block + require.NoError(t, ub.Unpack(packed, config.CurrentFracVersion, buf)) + assert.Equal(t, 2, ub.GetCount()) + assert.Equal(t, small, ToArray(ub.GetLIDs(0))) + assert.Equal(t, large, ToArray(ub.GetLIDs(1))) +} + +func ToArray(b node.LIDBatch) []uint32 { + if b.IsEmpty() { + return nil + } + out := make([]uint32, 0, b.Len()) + for _, lid := range b.CopyLIDs(true, nil) { + out = append(out, lid.Unpack()) + } + return out } func TestBlockPack_ReuseBuffer(t *testing.T) { - // Test that UnpackBuffer can be reused - block1 := &Block{ + block1 := &UnpackedBlock{ LIDs: generate(64 * 1024), Offsets: []uint32{0, 3}, } - block2 := &Block{ + block2 := &UnpackedBlock{ LIDs: generate(64 * 1024), Offsets: []uint32{0, 4}, } - buf1 := 
make([]uint32, 0, 64*1024) - packed1 := block1.Pack(nil, buf1) - - buf1 = buf1[:0] - packed2 := block2.Pack(nil, buf1) + packer := NewBlockPacker() + packed1 := packer.Pack(block1, nil) + packed2 := packer.Pack(block2, nil) buf2 := &UnpackBuffer{} - unpacked1 := &Block{} - err := unpacked1.Unpack(packed1, config.CurrentFracVersion, buf2) - require.NoError(t, err) - assert.Equal(t, block1.LIDs, unpacked1.LIDs) + var unpacked1, unpacked2 Block + require.NoError(t, unpacked1.Unpack(packed1, config.CurrentFracVersion, buf2)) + require.NoError(t, unpacked2.Unpack(packed2, config.CurrentFracVersion, buf2)) + + assertListsEqual(t, block1, &unpacked1) + assertListsEqual(t, block2, &unpacked2) +} - unpacked2 := &Block{} - err = unpacked2.Unpack(packed2, config.CurrentFracVersion, buf2) - require.NoError(t, err) - assert.Equal(t, block2.LIDs, unpacked2.LIDs) +func assertListsEqual(t *testing.T, src *UnpackedBlock, blk *Block) { + t.Helper() + require.Equal(t, len(src.Offsets)-1, blk.GetCount()) + for i := 0; i < blk.GetCount(); i++ { + want := src.LIDs[src.Offsets[i]:src.Offsets[i+1]] + assert.Equal(t, want, ToArray(blk.GetLIDs(i))) + } +} + +func generate(n int) []uint32 { + v := make([]uint32, n) + last := uint32(100) + for i := range v { + v[i] = last + last += uint32(1 + rand.Intn(5)) + } + return v } func BenchmarkBlock_Pack(b *testing.B) { - lids := generate(64 * 1024) + lidsIn := generate(64 * 1024) - block := &Block{ - LIDs: lids, + block := &UnpackedBlock{ + LIDs: lidsIn, Offsets: []uint32{0, 64 * 1024}, } - tmp := make([]uint32, 0, 64*1024/4) + packer := NewBlockPacker() for b.Loop() { - block.Pack(nil, tmp) + packer.Pack(block, nil) } } func BenchmarkBlock_Unpack(b *testing.B) { - lids := generate(64 * 1024) + lidsIn := generate(64 * 1024) - block := &Block{ - LIDs: lids, + block := &UnpackedBlock{ + LIDs: lidsIn, Offsets: []uint32{0, 64 * 1024}, } - packed := block.Pack(nil, nil) + packed := NewBlockPacker().Pack(block, nil) buf := &UnpackBuffer{} - unpacked := 
&Block{} b.ResetTimer() for b.Loop() { - err := unpacked.Unpack(packed, config.CurrentFracVersion, buf) - assert.NoError(b, err) + var ub Block + assert.NoError(b, ub.Unpack(packed, config.CurrentFracVersion, buf)) } } diff --git a/frac/sealed/lids/cursor.go b/frac/sealed/lids/cursor.go index 4dc0d73c0..d3446ca75 100644 --- a/frac/sealed/lids/cursor.go +++ b/frac/sealed/lids/cursor.go @@ -1,5 +1,7 @@ package lids +import "github.com/ozontech/seq-db/node" + type Counter interface { AddLIDsCount(int) } @@ -15,7 +17,7 @@ type Cursor struct { blockIndex uint32 tryNextBlock bool - lids []uint32 + batch node.LIDBatch counter Counter } @@ -38,6 +40,7 @@ func NewLIDsCursor( tid: tid, blockIndex: startIndex, tryNextBlock: true, + batch: node.EmptyBatch(), counter: counter, } diff --git a/frac/sealed/lids/iterator_asc.go b/frac/sealed/lids/iterator_asc.go index 0230ebd20..67cd834f0 100644 --- a/frac/sealed/lids/iterator_asc.go +++ b/frac/sealed/lids/iterator_asc.go @@ -1,44 +1,61 @@ package lids import ( - "sort" - "go.uber.org/zap" "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/node" ) -type IteratorAsc Cursor +type IteratorAsc struct { + Cursor + it node.Iter +} + +func NewIteratorAsc( + table *Table, + loader *Loader, + startIndex uint32, + tid uint32, + counter Counter, + minLID, maxLID uint32, +) *IteratorAsc { + it := &IteratorAsc{ + Cursor: *NewLIDsCursor(table, loader, startIndex, tid, counter, minLID, maxLID), + } + it.it = it.batch.ReverseIter() + return it +} func (*IteratorAsc) String() string { return "LIDS_ASC" } -// narrowLIDsRange cuts LIDs between from and to. Returns new lids and tryNextBlock flag -func (it *IteratorAsc) narrowLIDsRange(lids []uint32, tryNextBlock bool) ([]uint32, bool) { - first := lids[0] - if it.maxLID < first { // fast path: out-of-bounds 1; allowed to continue reading blocks - return nil, tryNextBlock +// narrowLIDsRange cuts LIDs between minLID and maxLID. Returns updated tryNextBlock flag. 
+func (it *IteratorAsc) narrowLIDsRange(tryNextBlock bool) bool { + if it.batch.IsEmpty() { + return tryNextBlock } - last := lids[len(lids)-1] - if it.minLID > last { // fast path: out-of-bounds 2 - return nil, false // stop reading blocks + first := it.batch.Min() + if it.maxLID < first { + it.batch = node.EmptyBatch() + return tryNextBlock } - if it.minLID > first { - left := sort.Search(len(lids), func(i int) bool { return lids[i] >= it.minLID }) - lids = lids[left:] - tryNextBlock = false + last := it.batch.Max() + if it.minLID > last { + it.batch = node.EmptyBatch() + return false } - if it.maxLID <= last { - right := sort.Search(len(lids), func(i int) bool { return lids[i] > it.maxLID }) - lids = lids[:right] + lastBlock := it.minLID > first + it.batch = it.batch.Narrow(it.minLID, it.maxLID) + if lastBlock { + tryNextBlock = false } - return lids, tryNextBlock + return tryNextBlock } func (it *IteratorAsc) loadNextLIDsBlock() { @@ -47,92 +64,46 @@ func (it *IteratorAsc) loadNextLIDsBlock() { logger.Panic("error loading LIDs block", zap.Error(err)) } - if block.getCount() != int(it.table.GetChunksCount(it.blockIndex)) { + if block.GetCount() != int(it.table.GetChunksCount(it.blockIndex)) { logger.Panic("unexpected LIDs count") } - it.lids = block.getLIDs(it.table.GetChunkIndex(it.blockIndex, it.tid)) - it.tryNextBlock = it.table.HasTIDInPrevBlock(it.blockIndex, it.tid) + it.batch = block.GetLIDs(it.table.GetChunkIndex(it.blockIndex, it.tid)) + tryNextBlock := it.table.HasTIDInPrevBlock(it.blockIndex, it.tid) + it.tryNextBlock = it.narrowLIDsRange(tryNextBlock) + it.it = it.batch.ReverseIter() + it.counter.AddLIDsCount(it.batch.Len()) it.blockIndex-- } +func (it *IteratorAsc) discardBlock() { + it.batch = node.EmptyBatch() + it.it = it.batch.Iter() +} + func (it *IteratorAsc) Next() node.LID { - for len(it.lids) == 0 { + for { + lid, ok := it.it.Next() + if ok { + return node.NewAscLID(lid) + } if !it.tryNextBlock { return node.NullLID() } - - 
it.loadNextLIDsBlock() // last chunk in block but not last for tid; need load next block - it.lids, it.tryNextBlock = it.narrowLIDsRange(it.lids, it.tryNextBlock) - it.counter.AddLIDsCount(len(it.lids)) // inc loaded LIDs count + it.loadNextLIDsBlock() } - - i := len(it.lids) - 1 - lid := it.lids[i] - it.lids = it.lids[:i] - return node.NewAscLID(lid) } // NextGeq returns the next (in reverse iteration order) LID that is <= maxLID. func (it *IteratorAsc) NextGeq(nextID node.LID) node.LID { for { - for len(it.lids) == 0 { - if !it.tryNextBlock { - return node.NullLID() - } - - it.loadNextLIDsBlock() - it.lids, it.tryNextBlock = it.narrowLIDsRange(it.lids, it.tryNextBlock) - it.counter.AddLIDsCount(len(it.lids)) - } - - // fast path: smallest remaining > nextID => skip entire block - // TODO(cheb0): We could also pass LID into narrowLIDsRange to perform block skipping once we add something like MinLID to LID block header - if it.lids[0] > nextID.Unpack() { - it.lids = it.lids[:0] - continue - } - - idx := sort.Search(len(it.lids), func(i int) bool { return it.lids[i] > nextID.Unpack() }) - 1 - if idx >= 0 { - lid := it.lids[idx] - it.lids = it.lids[:idx] + lid, ok := it.it.NextGeq(nextID.Unpack()) + if ok { return node.NewAscLID(lid) } - - it.lids = it.lids[:0] - } -} - -func (it *IteratorAsc) NextBatch() node.LIDBatch { - return it.NextBatchGeq(node.NewAscZeroLID()) -} - -func (it *IteratorAsc) NextBatchGeq(nextID node.LID) node.LIDBatch { - for { - for len(it.lids) == 0 { - if !it.tryNextBlock { - return node.NewAscBatch(nil) - } - it.loadNextLIDsBlock() - it.lids, it.tryNextBlock = it.narrowLIDsRange(it.lids, it.tryNextBlock) - it.counter.AddLIDsCount(len(it.lids)) - } - - // fast path: smallest remaining > nextID => skip entire block - // TODO(cheb0): We could also pass LID into narrowLIDsRange to perform block skipping once we add something like MinLID to LID block header - if it.lids[0] > nextID.Unpack() { - it.lids = it.lids[:0] - continue - } - - idx := 
sort.Search(len(it.lids), func(i int) bool { return it.lids[i] > nextID.Unpack() }) - 1 - if idx >= 0 { - batch := it.lids[:idx+1] - it.lids = it.lids[:0] - return node.NewAscBatch(batch) + if !it.tryNextBlock { + return node.NullLID() } - - it.lids = it.lids[:0] + it.loadNextLIDsBlock() } } diff --git a/frac/sealed/lids/iterator_batched_asc.go b/frac/sealed/lids/iterator_batched_asc.go new file mode 100644 index 000000000..c8f390c2d --- /dev/null +++ b/frac/sealed/lids/iterator_batched_asc.go @@ -0,0 +1,95 @@ +package lids + +import ( + "go.uber.org/zap" + + "github.com/ozontech/seq-db/logger" + "github.com/ozontech/seq-db/node" +) + +type BatchedIteratorAsc struct { + Cursor +} + +func NewBatchedIteratorAsc(it *IteratorAsc) *BatchedIteratorAsc { + return &BatchedIteratorAsc{ + Cursor: it.Cursor, + } +} + +func (*BatchedIteratorAsc) String() string { + return "LIDS_ASC_BATCHED" +} + +// narrowLIDsRange cuts LIDs between minLID and maxLID. Returns updated tryNextBlock flag. +func (it *BatchedIteratorAsc) narrowLIDsRange(tryNextBlock bool) bool { + if it.batch.IsEmpty() { + return tryNextBlock + } + + first := it.batch.Min() + if it.maxLID < first { + it.batch = node.EmptyBatch() + return tryNextBlock + } + + batchMax := it.batch.Max() + if it.minLID > batchMax { + it.batch = node.EmptyBatch() + return false + } + + lastBlock := it.minLID > first + it.batch = it.batch.Narrow(it.minLID, it.maxLID) + if lastBlock { + tryNextBlock = false + } + + return tryNextBlock +} + +func (it *BatchedIteratorAsc) loadNextLIDsBlock() { + block, err := it.loader.GetLIDsBlock(it.table.StartBlockIndex + it.blockIndex) + if err != nil { + logger.Panic("error loading LIDs block", zap.Error(err)) + } + + if block.GetCount() != int(it.table.GetChunksCount(it.blockIndex)) { + logger.Panic("unexpected LIDs count") + } + + it.batch = block.GetLIDs(it.table.GetChunkIndex(it.blockIndex, it.tid)) + it.tryNextBlock = it.table.HasTIDInPrevBlock(it.blockIndex, it.tid) + it.blockIndex-- +} + +func 
(it *BatchedIteratorAsc) NextBatch(need int) node.LIDBatch { + return it.NextBatchGeq(need, node.NewAscZeroLID()) +} + +func (it *BatchedIteratorAsc) NextBatchGeq(_ int, nextID node.LID) node.LIDBatch { + for { + if it.batch.IsEmpty() { + if !it.tryNextBlock { + return node.EmptyBatch() + } + it.loadNextLIDsBlock() + it.tryNextBlock = it.narrowLIDsRange(it.tryNextBlock) + it.counter.AddLIDsCount(it.batch.Len()) + } + + if it.batch.IsEmpty() { + continue + } + + // NextGeq scrolls past the block => skip entire block + if nextID.Unpack() < it.batch.Min() { + it.batch = node.EmptyBatch() + continue + } + + out := it.batch + it.batch = node.EmptyBatch() + return out + } +} diff --git a/frac/sealed/lids/iterator_batched_desc.go b/frac/sealed/lids/iterator_batched_desc.go new file mode 100644 index 000000000..cee8c674c --- /dev/null +++ b/frac/sealed/lids/iterator_batched_desc.go @@ -0,0 +1,94 @@ +package lids + +import ( + "github.com/ozontech/seq-db/logger" + "github.com/ozontech/seq-db/node" + "go.uber.org/zap" +) + +type BatchedIteratorDesc struct { + Cursor +} + +func NewBatchedIteratorDesc(it *IteratorDesc) *BatchedIteratorDesc { + return &BatchedIteratorDesc{ + Cursor: it.Cursor, + } +} + +func (*BatchedIteratorDesc) String() string { + return "LIDS_DESC_BATCHED" +} + +// narrowLIDsRange cuts LIDs between minLID and maxLID. Returns updated tryNextBlock flag. 
+func (it *BatchedIteratorDesc) narrowLIDsRange(tryNextBlock bool) bool { + if it.batch.IsEmpty() { + return tryNextBlock + } + + first := it.batch.Min() + if it.maxLID < first { + it.batch = node.EmptyBatch() + return false + } + + batchMax := it.batch.Max() + if it.minLID > batchMax { + it.batch = node.EmptyBatch() + return tryNextBlock + } + + lastBlock := it.maxLID < batchMax + it.batch = it.batch.Narrow(it.minLID, it.maxLID) + if lastBlock { + tryNextBlock = false + } + + return tryNextBlock +} + +func (it *BatchedIteratorDesc) loadNextLIDsBlock() { + block, err := it.loader.GetLIDsBlock(it.table.StartBlockIndex + it.blockIndex) + if err != nil { + logger.Panic("error loading LIDs block", zap.Error(err)) + } + + if block.GetCount() != int(it.table.GetChunksCount(it.blockIndex)) { + logger.Panic("unexpected LIDs count") + } + + it.batch = block.GetLIDs(it.table.GetChunkIndex(it.blockIndex, it.tid)) + it.tryNextBlock = it.table.HasTIDInNextBlock(it.blockIndex, it.tid) + it.blockIndex++ +} + +func (it *BatchedIteratorDesc) NextBatch(need int) node.LIDBatch { + return it.NextBatchGeq(need, node.NewDescZeroLID()) +} + +func (it *BatchedIteratorDesc) NextBatchGeq(_ int, nextID node.LID) node.LIDBatch { + for { + if it.batch.IsEmpty() { + if !it.tryNextBlock { + return node.EmptyBatch() + } + it.loadNextLIDsBlock() + it.tryNextBlock = it.narrowLIDsRange(it.tryNextBlock) + it.counter.AddLIDsCount(it.batch.Len()) + } + + if it.batch.IsEmpty() { + continue + } + + // NextGeq scrolls past the block => skip entire block + if nextID.Unpack() > it.batch.Max() { + it.batch = node.EmptyBatch() + continue + } + + out := it.batch + it.batch = node.EmptyBatch() + return out + } +} diff --git a/frac/sealed/lids/iterator_desc.go b/frac/sealed/lids/iterator_desc.go index a3da61762..d4137e41f 100644 --- a/frac/sealed/lids/iterator_desc.go +++ b/frac/sealed/lids/iterator_desc.go @@ -1,44 +1,61 @@ package lids import ( - "sort" - "go.uber.org/zap" "github.com/ozontech/seq-db/logger" 
"github.com/ozontech/seq-db/node" ) -type IteratorDesc Cursor +type IteratorDesc struct { + Cursor + it node.Iter +} + +func NewIteratorDesc( + table *Table, + loader *Loader, + startIndex uint32, + tid uint32, + counter Counter, + minLID, maxLID uint32, +) *IteratorDesc { + it := &IteratorDesc{ + Cursor: *NewLIDsCursor(table, loader, startIndex, tid, counter, minLID, maxLID), + } + it.it = it.batch.Iter() + return it +} func (*IteratorDesc) String() string { return "LIDS_DESC" } -// narrowLIDsRange cuts LIDs between from and to. Returns new lids and tryNextBlock flag -func (it *IteratorDesc) narrowLIDsRange(lids []uint32, tryNextBlock bool) ([]uint32, bool) { - first := lids[0] - if it.maxLID < first { // fast path: out-of-bounds 1 - return nil, false // stop reading blocks +// narrowLIDsRange cuts LIDs between minLID and maxLID. Returns updated tryNextBlock flag. +func (it *IteratorDesc) narrowLIDsRange(tryNextBlock bool) bool { + if it.batch.IsEmpty() { + return tryNextBlock } - last := lids[len(lids)-1] - if it.minLID > last { // fast path: out-of-bounds 2; allowed to continue reading blocks - return nil, tryNextBlock + first := it.batch.Min() + if it.maxLID < first { + it.batch = node.EmptyBatch() + return false } - if it.minLID > first { - left := sort.Search(len(lids), func(i int) bool { return lids[i] >= it.minLID }) - lids = lids[left:] + last := it.batch.Max() + if it.minLID > last { + it.batch = node.EmptyBatch() + return tryNextBlock } - if it.maxLID <= last { - right := sort.Search(len(lids), func(i int) bool { return lids[i] > it.maxLID }) - lids = lids[:right] + lastBlock := it.maxLID < last + it.batch = it.batch.Narrow(it.minLID, it.maxLID) + if lastBlock { tryNextBlock = false } - return lids, tryNextBlock + return tryNextBlock } func (it *IteratorDesc) loadNextLIDsBlock() { @@ -47,96 +64,46 @@ func (it *IteratorDesc) loadNextLIDsBlock() { logger.Panic("error loading LIDs block", zap.Error(err)) } - if block.getCount() != 
int(it.table.GetChunksCount(it.blockIndex)) { + if block.GetCount() != int(it.table.GetChunksCount(it.blockIndex)) { logger.Panic("unexpected LIDs count") } - it.lids = block.getLIDs(it.table.GetChunkIndex(it.blockIndex, it.tid)) - it.tryNextBlock = it.table.HasTIDInNextBlock(it.blockIndex, it.tid) + it.batch = block.GetLIDs(it.table.GetChunkIndex(it.blockIndex, it.tid)) + it.counter.AddLIDsCount(it.batch.Len()) + tryNextBlock := it.table.HasTIDInNextBlock(it.blockIndex, it.tid) + it.tryNextBlock = it.narrowLIDsRange(tryNextBlock) + it.it = it.batch.Iter() it.blockIndex++ } +func (it *IteratorDesc) discardBlock() { + it.batch = node.EmptyBatch() + it.it = it.batch.Iter() +} + func (it *IteratorDesc) Next() node.LID { - for len(it.lids) == 0 { + for { + v, ok := it.it.Next() + if ok { + return node.NewDescLID(v) + } if !it.tryNextBlock { return node.NullLID() } - - it.loadNextLIDsBlock() // last chunk in block but not last for tid; need load next block - it.lids, it.tryNextBlock = it.narrowLIDsRange(it.lids, it.tryNextBlock) - it.counter.AddLIDsCount(len(it.lids)) // inc loaded LIDs count + it.loadNextLIDsBlock() } - - lid := it.lids[0] - it.lids = it.lids[1:] - return node.NewDescLID(lid) } // NextGeq finds next greater or equal func (it *IteratorDesc) NextGeq(nextID node.LID) node.LID { for { - for len(it.lids) == 0 { - if !it.tryNextBlock { - return node.NullLID() - } - - it.loadNextLIDsBlock() // last chunk in block but not last for tid; need load next block - it.lids, it.tryNextBlock = it.narrowLIDsRange(it.lids, it.tryNextBlock) - it.counter.AddLIDsCount(len(it.lids)) // inc loaded LIDs count - } - - // fast path: last LID < nextID => skip the entire block - // TODO(cheb0): We could also pass LID into narrowLIDsRange to perform block skipping once we add something like MinLID to LID block header - if nextID.Unpack() > it.lids[len(it.lids)-1] { - it.lids = it.lids[:0] - continue - } - - idx := sort.Search(len(it.lids), func(i int) bool { return it.lids[i] >= 
nextID.Unpack() }) - if idx < len(it.lids) { - it.lids = it.lids[idx:] - lid := it.lids[0] - it.lids = it.lids[1:] - return node.NewDescLID(lid) - } - - it.lids = it.lids[:0] - } -} - -func (it *IteratorDesc) NextBatch() node.LIDBatch { - return it.NextBatchGeq(node.NewDescZeroLID()) -} - -func (it *IteratorDesc) NextBatchGeq(nextID node.LID) node.LIDBatch { - for { - for len(it.lids) == 0 { - if !it.tryNextBlock { - return node.NewDescBatch(nil) - } - it.loadNextLIDsBlock() - it.lids, it.tryNextBlock = it.narrowLIDsRange(it.lids, it.tryNextBlock) - it.counter.AddLIDsCount(len(it.lids)) + v, ok := it.it.NextGeq(nextID.Unpack()) + if ok { + return node.NewDescLID(v) } - last := it.lids[len(it.lids)-1] - if nextID.Unpack() > last { - it.lids = it.lids[:0] - continue - } - - // fast path: last LID < nextLID => skip the entire block - if nextID.Unpack() > it.lids[len(it.lids)-1] { - it.lids = it.lids[:0] - continue - } - - idx := sort.Search(len(it.lids), func(i int) bool { return it.lids[i] >= nextID.Unpack() }) - if idx < len(it.lids) { - batch := it.lids[idx:len(it.lids)] - it.lids = it.lids[:0] - return node.NewDescBatch(batch) + if !it.tryNextBlock { + return node.NullLID() } - - it.lids = it.lids[:0] + it.loadNextLIDsBlock() } } diff --git a/frac/sealed/lids/loader.go b/frac/sealed/lids/loader.go index cf987a979..37e50c454 100644 --- a/frac/sealed/lids/loader.go +++ b/frac/sealed/lids/loader.go @@ -80,10 +80,8 @@ func (l *Loader) readLIDsBlock(blockIndex uint32) (*Block, error) { } block := &Block{} - err = block.Unpack(l.blockBuf, l.fracVer, l.unpackBuf) - if err != nil { + if err := block.Unpack(l.blockBuf, l.fracVer, l.unpackBuf); err != nil { return nil, err } - - return block, err + return block, nil } diff --git a/frac/sealed/seqids/loader.go b/frac/sealed/seqids/loader.go index defa865f2..334e07a8f 100644 --- a/frac/sealed/seqids/loader.go +++ b/frac/sealed/seqids/loader.go @@ -13,7 +13,6 @@ import ( type Table struct { MinBlockIDs []seq.ID // from max to 
min - IDBlocksTotal uint32 IDsTotal uint32 StartBlockIndex uint32 } diff --git a/frac/sealed_index.go b/frac/sealed_index.go index 84d563c38..cbc4fae3b 100644 --- a/frac/sealed_index.go +++ b/frac/sealed_index.go @@ -264,12 +264,12 @@ func (ti *sealedTokenIndex) GetLIDsFromTIDs(tids []uint32, stats lids.Counter, m if order.IsReverse() { getBlockIndex = func(tid uint32) uint32 { return ti.lidsTable.GetLastBlockIndexForTID(tid) } getLIDsIterator = func(startIndex uint32, tid uint32) node.Node { - return (*lids.IteratorAsc)(lids.NewLIDsCursor(ti.lidsTable, ti.lidsLoader, startIndex, tid, stats, minLID, maxLID)) + return lids.NewIteratorAsc(ti.lidsTable, ti.lidsLoader, startIndex, tid, stats, minLID, maxLID) } } else { getBlockIndex = func(tid uint32) uint32 { return ti.lidsTable.GetFirstBlockIndexForTID(tid) } getLIDsIterator = func(startIndex uint32, tid uint32) node.Node { - return (*lids.IteratorDesc)(lids.NewLIDsCursor(ti.lidsTable, ti.lidsLoader, startIndex, tid, stats, minLID, maxLID)) + return lids.NewIteratorDesc(ti.lidsTable, ti.lidsLoader, startIndex, tid, stats, minLID, maxLID) } } diff --git a/frac/sealed_loader.go b/frac/sealed_loader.go index 893b75a42..10d95a2d3 100644 --- a/frac/sealed_loader.go +++ b/frac/sealed_loader.go @@ -36,7 +36,7 @@ func (l *LegacyLoader) Load(blocksData *sealed.BlocksData, info *common.Info, re l.skipSection() // skip token table blocks var err error - blocksData.IDsTable, blocksData.BlocksOffsets, err = l.loadIDs(info.BinaryDataVer) + blocksData.IDsTable, blocksData.BlocksOffsets, err = l.loadIDs(info) if err != nil { logger.Fatal("legacy load ids error", zap.Error(err)) } @@ -77,7 +77,7 @@ func (l *LegacyLoader) skipSection() { } // loadIDs reads the BlockOffsets block and then scans MID/RID/Pos triplets. 
-func (l *LegacyLoader) loadIDs(fracVersion config.BinaryDataVersion) (seqids.Table, []uint64, error) { +func (l *LegacyLoader) loadIDs(info *common.Info) (seqids.Table, []uint64, error) { var buf []byte data, _, err := l.reader.ReadIndexBlock(l.blockIndex, buf) @@ -94,9 +94,8 @@ func (l *LegacyLoader) loadIDs(fracVersion config.BinaryDataVersion) (seqids.Tab l.blockIndex++ table := seqids.Table{ - StartBlockIndex: l.blockIndex, // absolute index of first MID block in .index - IDsTotal: offsets.IDsTotal, - IDBlocksTotal: uint32(len(offsets.Offsets)), + StartBlockIndex: l.blockIndex, // absolute index of first MID block in .index + IDsTotal: info.DocsTotal + 1, // Increment by one for [seq.SystemID] } for { @@ -111,7 +110,7 @@ func (l *LegacyLoader) loadIDs(fracVersion config.BinaryDataVersion) (seqids.Tab } mid := seq.MID(h.GetExt1()) - if fracVersion < config.BinaryDataV2 { + if info.BinaryDataVer < config.BinaryDataV2 { mid = seq.MillisToMID(h.GetExt1()) } @@ -184,10 +183,9 @@ func (l *Loader) Load(blocksData *sealed.BlocksData, info *common.Info, readers if err != nil { logger.Fatal("load offsets error", zap.Error(err)) } - blocksData.BlocksOffsets = blockOffsets.Offsets - blocksData.IDsTable = l.loadIDsTable(readers.ID, blockOffsets.IDsTotal, info.BinaryDataVer) + blocksData.IDsTable = l.loadIDsTable(readers.ID, info) blocksData.LIDsTable, err = l.loadLIDsTable(readers.LID) if err != nil { logger.Fatal("load lids error", zap.Error(err)) @@ -227,10 +225,10 @@ func (l *Loader) loadBlocksOffsets(r storage.IndexReader) (sealed.BlockOffsets, // loadIDsTable scans block headers in the .id file to build seqids.Table. // Blocks are stored as (MIDs, RIDs, Pos) triplets; we only need MIDs headers. 
-func (l *Loader) loadIDsTable(r storage.IndexReader, idsTotal uint32, fracVersion config.BinaryDataVersion) seqids.Table { +func (l *Loader) loadIDsTable(r storage.IndexReader, info *common.Info) seqids.Table { table := seqids.Table{ StartBlockIndex: 0, - IDsTotal: idsTotal, + IDsTotal: info.DocsTotal + 1, // Increment by one for [seq.SystemID] } blocksCount, err := r.BlocksCount() @@ -248,7 +246,7 @@ func (l *Loader) loadIDsTable(r storage.IndexReader, idsTotal uint32, fracVersio } var mid seq.MID - if fracVersion < config.BinaryDataV2 { + if info.BinaryDataVer < config.BinaryDataV2 { mid = seq.MillisToMID(header.GetExt1()) } else { mid = seq.MID(header.GetExt1()) @@ -258,8 +256,6 @@ func (l *Loader) loadIDsTable(r storage.IndexReader, idsTotal uint32, fracVersio MID: mid, RID: seq.RID(header.GetExt2()), }) - - table.IDBlocksTotal++ } return table diff --git a/frac/sealed_source.go b/frac/sealed_source.go new file mode 100644 index 000000000..4dca01d9d --- /dev/null +++ b/frac/sealed_source.go @@ -0,0 +1,160 @@ +package frac + +import ( + "iter" + "slices" + + "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac/sealed/lids" + "github.com/ozontech/seq-db/frac/sealed/seqids" + "github.com/ozontech/seq-db/frac/sealed/token" + "github.com/ozontech/seq-db/indexwriter" + "github.com/ozontech/seq-db/seq" + "github.com/ozontech/seq-db/storage" + "github.com/ozontech/seq-db/util" +) + +type DocBlockLocation = util.Pair[[]byte, uint64] + +// SealedSource implements [indexwriter.Source] for a sealed fraction. +// Used as input to [compaction.MergeSource] when compacting multiple fractions. 
+type SealedSource struct { + f *Sealed + + idsProvider *seqids.Provider + lidsLoader *lids.Loader + + tokenBlockLoader *token.BlockLoader + tokenTableLoader *token.TableLoader +} + +func NewSealedSource(f *Sealed) *SealedSource { + f.init(true) + return &SealedSource{ + f: f, + idsProvider: seqids.NewProvider( + &f.idReader, + f.indexCache.MIDs, + f.indexCache.RIDs, + f.indexCache.Params, + &f.blocksData.IDsTable, + f.info.BinaryDataVer, + ), + lidsLoader: lids.NewLoader(f.Info().BinaryDataVer, &f.lidReader, f.indexCache.LIDs), + tokenBlockLoader: token.NewBlockLoader(f.BaseFileName, &f.tokenReader, f.indexCache.Tokens), + tokenTableLoader: token.NewTableLoader(f.BaseFileName, f.IsLegacy, &f.tokenReader, f.indexCache.TokenTable), + } +} + +func (s *SealedSource) Info() *common.Info { + return s.f.info +} + +func (s *SealedSource) BlockOffsets() []uint64 { + return s.f.blocksData.BlocksOffsets +} + +func (s *SealedSource) ID() iter.Seq2[indexwriter.DocLocation, error] { + return func(yield func(indexwriter.DocLocation, error) bool) { + for lid := uint32(0); lid < s.f.blocksData.IDsTable.IDsTotal; lid++ { + mid, err := s.idsProvider.MID(seq.LID(lid)) + if err != nil { + yield(indexwriter.DocLocation{}, err) + return + } + + rid, err := s.idsProvider.RID(seq.LID(lid)) + if err != nil { + yield(indexwriter.DocLocation{}, err) + return + } + + pos, err := s.idsProvider.DocPos(seq.LID(lid)) + if err != nil { + yield(indexwriter.DocLocation{}, err) + return + } + + if !yield(indexwriter.DocLocation{First: seq.ID{MID: mid, RID: rid}, Second: pos}, nil) { + return + } + } + } +} + +func (s *SealedSource) TokenTriplet() iter.Seq2[string, iter.Seq2[indexwriter.TokenPosting, error]] { + tokenTable := s.tokenTableLoader.Load() + + fields := make([]string, 0, len(tokenTable)) + for field := range tokenTable { + fields = append(fields, field) + } + + slices.Sort(fields) + return func(yield func(string, iter.Seq2[indexwriter.TokenPosting, error]) bool) { + for _, field := range 
fields { + if !yield(field, s.postingsForField(field)) { + return + } + } + } +} + +func (s *SealedSource) postingsForField(field string) iter.Seq2[indexwriter.TokenPosting, error] { + lidsTable := s.f.blocksData.LIDsTable + tokenTable := s.tokenTableLoader.Load() + + var lidsBuf []uint32 + + return func(yield func(indexwriter.TokenPosting, error) bool) { + for _, entry := range tokenTable[field].Entries { + block := s.tokenBlockLoader.Load(entry.BlockIndex) + + for tid := entry.StartTID; tid < entry.StartTID+entry.ValCount; tid++ { + lidsBuf = lidsBuf[:0] + + tokenVal := block.GetToken(entry.GetIndexInTokensBlock(tid)) + firstBlock := lidsTable.GetFirstBlockIndexForTID(tid) + lastBlock := lidsTable.GetLastBlockIndexForTID(tid) + + for bi := firstBlock; bi <= lastBlock; bi++ { + lidBlock, err := s.lidsLoader.GetLIDsBlock(bi) + if err != nil { + yield(indexwriter.TokenPosting{}, err) + return + } + + chunkIdx := lidsTable.GetChunkIndex(bi, tid) + lidsBuf = lidBlock.CopyLIDs(chunkIdx, lidsBuf) + } + + if !yield(indexwriter.TokenPosting{First: tokenVal, Second: lidsBuf}, nil) { + return + } + } + } + } +} + +func (s *SealedSource) DocBlock() iter.Seq2[DocBlockLocation, error] { + return func(yield func(DocBlockLocation, error) bool) { + // We do not want to cache payload of DocBlock because + // it will just pollute cache and cause unnecessary evictions. + r := storage.NewDocBlocksReader(s.f.readLimiter, s.f.docsFile) + + for _, offset := range s.f.blocksData.BlocksOffsets { + // Read DocBlock payload (including its header) but do not decompress it. + // Caller of [SealedSource.DocBlock] will decide whether it requires decompressed data. 
+ payload, _, err := r.ReadDocBlock(int64(offset)) + if err != nil { + yield(DocBlockLocation{}, err) + return + } + + loc := DocBlockLocation{First: payload, Second: offset} + if !yield(loc, nil) { + return + } + } + } +} diff --git a/fracmanager/config.go b/fracmanager/config.go index e295aadab..1f31957eb 100644 --- a/fracmanager/config.go +++ b/fracmanager/config.go @@ -35,6 +35,8 @@ type Config struct { OffloadingQueueSize uint64 OffloadingRetention time.Duration OffloadingRetryDelay time.Duration + + CompactionEnabled bool } func FillConfigWithDefault(config *Config) *Config { @@ -69,6 +71,9 @@ func FillConfigWithDefault(config *Config) *Config { if config.SealParams.TokenTableZstdLevel == 0 { config.SealParams.TokenTableZstdLevel = zstdDefaultLevel } + if config.SealParams.LidsBitmapThreshold == 0 { + config.SealParams.LidsBitmapThreshold = consts.DefaultLIDBlockCap + } if config.ReplayWorkers == 0 { config.ReplayWorkers = consts.DefaultReplayWorkers } diff --git a/fracmanager/fracmanager.go b/fracmanager/fracmanager.go index 569c5cec0..bdd411a1a 100644 --- a/fracmanager/fracmanager.go +++ b/fracmanager/fracmanager.go @@ -13,6 +13,7 @@ import ( "github.com/ozontech/seq-db/config" "github.com/ozontech/seq-db/consts" "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/storage" "github.com/ozontech/seq-db/storage/s3" @@ -79,11 +80,11 @@ func New(ctx context.Context, cfg *Config, s3cli *s3.Client, skipMaskProvider sk wg.Wait() // finalize appender to prevent new writes - appender := lc.registry.Appender() - if err := appender.Finalize(); err != nil { + appender := lc.registry.appender() + if err := appender.finalize(); err != nil { logger.Fatal("shutdown fraction freezing error", zap.Error(err)) } - appender.WaitWriteIdle() + appender.waitWriteIdle() stopIdx() @@ -98,16 +99,59 @@ func New(ctx context.Context, cfg *Config, s3cli *s3.Client, skipMaskProvider sk return &fm, 
stop, nil } +type CompactionSnapshot struct { + claimed []*refCountedSealed +} + +func (cs *CompactionSnapshot) Fractions() []*frac.Sealed { + result := make([]*frac.Sealed, len(cs.claimed)) + for i, f := range cs.claimed { + result[i] = f.Sealed + } + return result +} + +func (cs *CompactionSnapshot) Destroy() { + for _, f := range cs.claimed { + f.Destroy() + } +} + +func (fm *FracManager) FractionName() string { + filePath := fileBasePattern + fm.lc.provider.nextFractionID() + baseFilePath := filepath.Join(fm.lc.provider.config.DataDir, filePath) + return baseFilePath +} + +func (fm *FracManager) SealedFractionsSnapshot() []*frac.Sealed { + return fm.lc.registry.sealedSnapshot() +} + +func (fm *FracManager) ClaimForCompaction(names []string) (*CompactionSnapshot, error) { + claimed, err := fm.lc.registry.claimForCompaction(names) + if err != nil { + return nil, err + } + return &CompactionSnapshot{claimed: claimed}, nil +} + +func (fm *FracManager) SubstituteWithSealed(produced *sealed.PreloadedData, snapshot *CompactionSnapshot) { + fm.lc.registry.substituteWithSealed( + fm.lc.provider.NewSealedPreloaded(produced.Info.Path, produced), + snapshot.claimed..., + ) +} + func (fm *FracManager) AcquireFraction(name string) (frac.Fraction, func(), bool) { - return fm.lc.registry.AcquireOneFraction(name) + return fm.lc.registry.acquireOneFraction(name) } func (fm *FracManager) AcquireFractions() (List, func()) { - return fm.lc.registry.AcquireAllFractions() + return fm.lc.registry.acquireAllFractions() } func (fm *FracManager) Oldest() uint64 { - return fm.lc.registry.OldestTotal() + return fm.lc.registry.oldestTotal() } func (fm *FracManager) Flags() *StateManager { @@ -123,7 +167,7 @@ func (fm *FracManager) Append(ctx context.Context, docs storage.DocBlock, metas return ctx.Err() default: // Try to append data to the currently active fraction - err := fm.lc.registry.Appender().Append(docs, metas) + err := fm.lc.registry.appender().append(docs, metas) if err != nil { 
logger.Info("append fail", zap.Error(err)) if err == ErrFractionNotWritable { @@ -169,7 +213,7 @@ func startStatsWorker(ctx context.Context, cfg *Config, reg *fractionRegistry, w logger.Info("stats loop is started") // Run stats collection every 10 seconds util.RunEvery(ctx.Done(), time.Second*10, func() { - stats := reg.Stats() + stats := reg.statistics() stats.Log() // Log statistics stats.SetMetrics() // Update Prometheus metrics diff --git a/fracmanager/fracmanager_for_tests.go b/fracmanager/fracmanager_for_tests.go index c4ec1cad3..39349289b 100644 --- a/fracmanager/fracmanager_for_tests.go +++ b/fracmanager/fracmanager_for_tests.go @@ -3,7 +3,7 @@ package fracmanager import "sync" func (fm *FracManager) WaitIdleForTests() { - fm.lc.registry.Appender().WaitWriteIdle() + fm.lc.registry.appender().waitWriteIdle() } func (fm *FracManager) SealForcedForTests() { diff --git a/fracmanager/fracmanager_test.go b/fracmanager/fracmanager_test.go index 4c372754e..4a25deae7 100644 --- a/fracmanager/fracmanager_test.go +++ b/fracmanager/fracmanager_test.go @@ -66,7 +66,7 @@ func TestSealingOnShutdown(t *testing.T) { cfg, fm, stop := setupFracManager(t, cfg) appendDocsToFracManager(t, fm, 10) - activeName := fm.lc.registry.all.fractions[0].Info().Name() + activeName := fm.lc.registry.snapshot.fractions[0].Info().Name() stop() @@ -74,7 +74,7 @@ func TestSealingOnShutdown(t *testing.T) { cfg.MinSealFracSize = 1 // to ensure that the frac will be sealed on shutdown cfg, fm, stop = setupFracManager(t, cfg) - allFractions := fm.lc.registry.all.fractions + allFractions := fm.lc.registry.snapshot.fractions assert.Equal(t, 1, len(allFractions), "should have one fraction") assert.Equal(t, activeName, allFractions[0].Info().Name(), "fraction should have the same name") _, ok := allFractions[0].(*syncAppender) @@ -84,7 +84,7 @@ func TestSealingOnShutdown(t *testing.T) { // third start _, fm, stop = setupFracManager(t, cfg) - allFractions = fm.lc.registry.all.fractions + allFractions = 
fm.lc.registry.snapshot.fractions assert.Equal(t, 2, len(allFractions), "should have 2 fraction: new active and old sealed") _, ok = allFractions[0].(*refCountedSealed) assert.True(t, ok, "first fraction should be sealed") diff --git a/fracmanager/fracs_stats.go b/fracmanager/fracs_stats.go index c70bbd374..ee255543d 100644 --- a/fracmanager/fracs_stats.go +++ b/fracmanager/fracs_stats.go @@ -76,6 +76,7 @@ type registryStats struct { active fracsStats // Statistics for active fraction sealing fracsStats // Statistics for fractions in the sealing process sealed fracsStats // Statistics for fractions on sealed disk + compacting fracsStats // Statistics for fractions participating in compaction offloading fracsStats // Statistics for fractions in the offloading process remotes fracsStats // Statistics for fractions in remote storage } @@ -84,6 +85,7 @@ func (s *registryStats) Log() { s.active.Log("active") s.sealing.Log("sealing") s.sealed.Log("sealed") + s.compacting.Log("compacting") s.offloading.Log("offloading") s.remotes.Log("remotes") } @@ -92,10 +94,11 @@ func (s *registryStats) SetMetrics() { s.active.SetMetrics(dataSizeTotal, "active") s.sealing.SetMetrics(dataSizeTotal, "sealing") s.sealed.SetMetrics(dataSizeTotal, "sealed") + s.compacting.SetMetrics(dataSizeTotal, "compacting") s.offloading.SetMetrics(dataSizeTotal, "offloading") s.remotes.SetMetrics(dataSizeTotal, "remotes") } func (s registryStats) TotalSizeOnDiskLocal() uint64 { - return s.sealing.totalSizeOnDisk + s.sealed.totalSizeOnDisk + return s.sealing.totalSizeOnDisk + s.sealed.totalSizeOnDisk + s.compacting.totalSizeOnDisk } diff --git a/fracmanager/fraction_provider.go b/fracmanager/fraction_provider.go index 54b109795..143616d35 100644 --- a/fracmanager/fraction_provider.go +++ b/fracmanager/fraction_provider.go @@ -5,6 +5,7 @@ import ( "io" "math/rand" "path/filepath" + "sync" "time" "github.com/RoaringBitmap/roaring/v2" @@ -14,9 +15,9 @@ import ( "github.com/ozontech/seq-db/frac" 
"github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/frac/sealed" - "github.com/ozontech/seq-db/frac/sealed/sealing" "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/node" + "github.com/ozontech/seq-db/sealing" "github.com/ozontech/seq-db/storage" "github.com/ozontech/seq-db/storage/s3" "github.com/ozontech/seq-db/util" @@ -39,8 +40,10 @@ type fractionProvider struct { cacheProvider *CacheMaintainer // Cache provider for data access optimization activeIndexer *frac.ActiveIndexer // Indexer for active fractions readLimiter *storage.ReadLimiter // Read rate limiter - ulidEntropy io.Reader // Entropy source for ULID generation skipMaskProvider skipMaskProvider + + mu sync.Mutex + ulidEntropy io.Reader // Entropy source for ULID generation } func newFractionProvider( @@ -115,6 +118,8 @@ func (fp *fractionProvider) NewRemote(ctx context.Context, name string, cachedIn // IMPORTANT: This method is not thread-safe. When used in concurrent environments, // external synchronization must be provided to avoid ID collisions func (fp *fractionProvider) nextFractionID() string { + fp.mu.Lock() + defer fp.mu.Unlock() return ulid.MustNew(ulid.Timestamp(time.Now()), fp.ulidEntropy).String() } @@ -136,7 +141,21 @@ func (fp *fractionProvider) Seal(a *frac.Active) (*frac.Sealed, error) { if err != nil { return nil, err } - preloaded, err := sealing.Seal(src, fp.config.SealParams) + + params := fp.config.SealParams + // NOTE(dkharms): If compaction is enabled we do not want to waste CPU on compression. + // + // Sealed fractions will be picked up by compaction workers almost instantly, + // and that will trigger compression again. 
+ if fp.config.CompactionEnabled { + params = common.SealParams{ + DocBlocksZstdLevel: params.DocBlocksZstdLevel, + LIDBlockSize: params.LIDBlockSize, + DocBlockSize: params.DocBlockSize, + } + } + + preloaded, err := sealing.Seal(src, params) if err != nil { return nil, err } diff --git a/fracmanager/fraction_registry.go b/fracmanager/fraction_registry.go index b0667c04b..c01123834 100644 --- a/fracmanager/fraction_registry.go +++ b/fracmanager/fraction_registry.go @@ -21,16 +21,17 @@ type fractionRegistry struct { sealing map[string]*syncAppender // fractions being sealed (0-5 typical) sealed PartitionedCollection[*refCountedSealed] // local sealed fractions (can be thousands) + compacting map[string]*refCountedSealed // fractions participating in compaction offloading PartitionedCollection[*refCountedSealed] // fractions being offloaded (0-5 typical) remotes PartitionedCollection[*refCountedRemote] // offloaded fractions (can be thousands) stats registryStats // size statistics for monitoring muAppender sync.RWMutex - appender *syncAppender // currently active writable fraction + sappender *syncAppender // currently active writable fraction - muAll sync.RWMutex - all fractionsSnapshot // all fractions + muSnapshot sync.RWMutex + snapshot fractionsSnapshot // all fractions } // NewFractionRegistry creates and initializes a new fraction registry instance. 
@@ -51,10 +52,11 @@ func NewFractionRegistry(active *frac.Active, sealed []*frac.Sealed, remotes []* } reg := fractionRegistry{ - appender: &syncAppender{refCountedActive: refCountedActive{Active: active}}, + sappender: &syncAppender{refCountedActive: refCountedActive{Active: active}}, sealing: map[string]*syncAppender{}, sealed: NewPartitionedCollection(func(rcs *refCountedSealed) uint64 { return creationTime(rcs) }), + compacting: map[string]*refCountedSealed{}, offloading: NewPartitionedCollection(func(rcs *refCountedSealed) uint64 { return lastDocTime(rcs) }), remotes: NewPartitionedCollection(func(rcr *refCountedRemote) uint64 { return lastDocTime(rcr) }), } @@ -76,51 +78,51 @@ func NewFractionRegistry(active *frac.Active, sealed []*frac.Sealed, remotes []* return &reg, nil } -// Appender returns the currently active writable fraction. -func (r *fractionRegistry) Appender() *syncAppender { +// appender returns the currently active writable fraction. +func (r *fractionRegistry) appender() *syncAppender { r.muAppender.RLock() defer r.muAppender.RUnlock() - return r.appender + return r.sappender } -func (r *fractionRegistry) AcquireOneFraction(name string) (frac.Fraction, func(), bool) { - r.muAll.RLock() - defer r.muAll.RUnlock() +func (r *fractionRegistry) acquireOneFraction(name string) (frac.Fraction, func(), bool) { + r.muSnapshot.RLock() + defer r.muSnapshot.RUnlock() - return r.all.AcquireOne(name) + return r.snapshot.AcquireOne(name) } -// AcquireAllFractions returns a read-only view of all fractions -func (r *fractionRegistry) AcquireAllFractions() ([]frac.Fraction, func()) { - r.muAll.RLock() - defer r.muAll.RUnlock() +// acquireAllFractions returns a read-only view of all fractions +func (r *fractionRegistry) acquireAllFractions() ([]frac.Fraction, func()) { + r.muSnapshot.RLock() + defer r.muSnapshot.RUnlock() - return r.all.AcquireAll() + return r.snapshot.AcquireAll() } -// Stats returns current size statistics of the registry.
-func (r *fractionRegistry) Stats() registryStats { +// statistics returns current size statistics of the registry. +func (r *fractionRegistry) statistics() registryStats { r.mu.RLock() s := r.stats - i := r.appender.Info() + i := r.sappender.Info() r.mu.RUnlock() s.active.Set(i) return s } -// OldestTotal returns the creation time of the oldest fraction in the registry. -func (r *fractionRegistry) OldestTotal() uint64 { - r.muAll.RLock() - defer r.muAll.RUnlock() - return r.all.oldestTotal +// oldestTotal returns the creation time of the oldest fraction in the registry. +func (r *fractionRegistry) oldestTotal() uint64 { + r.muSnapshot.RLock() + defer r.muSnapshot.RUnlock() + return r.snapshot.oldestTotal } -// OldestLocal returns the creation time of the oldest local fraction in the registry. -func (r *fractionRegistry) OldestLocal() uint64 { - r.muAll.RLock() - defer r.muAll.RUnlock() - return r.all.oldestLocal +// oldestLocal returns the creation time of the oldest local fraction in the registry. +func (r *fractionRegistry) oldestLocal() uint64 { + r.muSnapshot.RLock() + defer r.muSnapshot.RUnlock() + return r.snapshot.oldestLocal } type activeProvider interface { @@ -131,39 +133,39 @@ func (r *fractionRegistry) setAppender(appender *syncAppender) { r.muAppender.Lock() defer r.muAppender.Unlock() - r.appender = appender + r.sappender = appender - r.muAll.Lock() - defer r.muAll.Unlock() + r.muSnapshot.Lock() + defer r.muSnapshot.Unlock() - r.all.AddActive(appender) + r.snapshot.AddActive(appender) } -// RotateIfFull completes the current active fraction and starts a new one. +// rotateIfFull completes the current active fraction and starts a new one. // Moves previous active fraction to sealing queue. 
// Should be called when the current active fraction reaches size limit and needs to be rotated -func (r *fractionRegistry) RotateIfFull(maxSize uint64, ap activeProvider) (*refCountedActive, func(), error) { +func (r *fractionRegistry) rotateIfFull(maxSize uint64, ap activeProvider) (*refCountedActive, func(), error) { r.mu.Lock() defer r.mu.Unlock() - if r.appender.Info().DocsOnDisk <= maxSize { + if r.sappender.Info().DocsOnDisk <= maxSize { return nil, nil, nil } - old := r.appender + old := r.sappender r.sealing[old.Info().Name()] = old r.setAppender(&syncAppender{refCountedActive: refCountedActive{Active: ap.CreateActive()}}) - if err := old.Finalize(); err != nil { + if err := old.finalize(); err != nil { return nil, nil, err } curInfo := old.Info() r.stats.sealing.Add(curInfo) - r.appender.Suspend(old.Suspended()) + r.sappender.suspend(old.isSuspended()) wg := sync.WaitGroup{} wg.Add(1) @@ -172,7 +174,7 @@ func (r *fractionRegistry) RotateIfFull(maxSize uint64, ap activeProvider) (*ref go func() { defer wg.Done() - old.WaitWriteIdle() // can be long enough + old.waitWriteIdle() // can be long enough finalInfo := old.Info() r.mu.Lock() @@ -187,11 +189,11 @@ func (r *fractionRegistry) RotateIfFull(maxSize uint64, ap activeProvider) (*ref return &old.refCountedActive, wg.Wait, nil } -func (r *fractionRegistry) SuspendIfOverCapacity(maxQueue, maxSize uint64) { +func (r *fractionRegistry) suspendIfOverCapacity(maxQueue, maxSize uint64) { r.mu.Lock() defer r.mu.Unlock() - suspended := r.appender.Suspended() + suspended := r.sappender.isSuspended() if maxQueue > 0 && r.stats.sealing.count >= int(maxQueue) { if !suspended { @@ -199,7 +201,7 @@ func (r *fractionRegistry) SuspendIfOverCapacity(maxQueue, maxSize uint64) { zap.String("reason", "sealing queue size exceeded"), zap.Uint64("limit", maxQueue), zap.Int("queue_size", r.stats.sealing.count)) - r.appender.Suspend(true) + r.sappender.suspend(true) } return } @@ -212,7 +214,7 @@ func (r *fractionRegistry) 
SuspendIfOverCapacity(maxQueue, maxSize uint64) { zap.String("reason", "occupied space limit exceeded"), zap.Float64("queue_size_limit_gb", util.Float64ToPrec(util.SizeToUnit(maxSize, "gb"), 2)), zap.Float64("occupied_space_gb", util.Float64ToPrec(util.SizeToUnit(du, "gb"), 2))) - r.appender.Suspend(true) + r.sappender.suspend(true) } return } @@ -223,20 +225,21 @@ func (r *fractionRegistry) SuspendIfOverCapacity(maxQueue, maxSize uint64) { zap.Float64("occupied_space_gb", util.Float64ToPrec(util.SizeToUnit(du, "gb"), 2)), zap.Uint64("sealing_queue_size_limit", maxQueue), zap.Int("queue_size", r.stats.sealing.count)) - r.appender.Suspend(false) + r.sappender.suspend(false) } } func (r *fractionRegistry) diskUsage() uint64 { - return r.appender.Info().FullSize() + + return r.sappender.Info().FullSize() + r.stats.sealed.totalSizeOnDisk + r.stats.sealing.totalSizeOnDisk + + r.stats.compacting.totalSizeOnDisk + r.stats.offloading.totalSizeOnDisk } -// EvictLocalForDelete removes oldest local fractions to free disk space. +// evictLocalForDelete removes oldest local fractions to free disk space. // Returns evicted fractions or error if insufficient space is released. -func (r *fractionRegistry) EvictLocalForDelete(sizeLimit uint64) (evicted []*refCountedSealed, err error) { +func (r *fractionRegistry) evictLocalForDelete(sizeLimit uint64) (evicted []*refCountedSealed, err error) { r.mu.Lock() defer r.mu.Unlock() @@ -249,9 +252,9 @@ func (r *fractionRegistry) EvictLocalForDelete(sizeLimit uint64) (evicted []*ref return evicted, nil } -// EvictLocalForOffload removes oldest local fractions to moves it to offloading queue. +// evictLocalForOffload removes oldest local fractions and moves them to the offloading queue. // Returns evicted fractions or error if insufficient space is released.
-func (r *fractionRegistry) EvictLocalForOffload(sizeLimit uint64) ([]*refCountedSealed, error) { +func (r *fractionRegistry) evictLocalForOffload(sizeLimit uint64) ([]*refCountedSealed, error) { r.mu.Lock() defer r.mu.Unlock() @@ -272,16 +275,17 @@ func (r *fractionRegistry) evictLocal(sizeLimit uint64) ([]*refCountedSealed, er var releasingSize uint64 // calculate total used disk space - totalUsedSize := r.stats.TotalSizeOnDiskLocal() + r.appender.Info().FullSize() - - evicted := []*refCountedSealed{} + totalUsedSize := r.stats.TotalSizeOnDiskLocal() + r.sappender.Info().FullSize() + var evicted []*refCountedSealed for r.sealed.Len() > 0 && totalUsedSize-releasingSize > sizeLimit { for _, s := range r.sealed.GetByPartition(r.sealed.MinPartition()) { info := s.Info() releasingSize += info.FullSize() + r.stats.sealed.Sub(info) r.sealed.Del(info.Name()) + evicted = append(evicted, s) } } @@ -296,10 +300,10 @@ func (r *fractionRegistry) evictLocal(sizeLimit uint64) ([]*refCountedSealed, er return evicted, nil } -// EvictRemote removes oldest remote fractions based on retention policy. +// evictRemote removes oldest remote fractions based on retention policy. // Fractions older than retention period are permanently deleted. // Returns removed fractions or empty slice if nothing to remove. -func (r *fractionRegistry) EvictRemote(retention time.Duration) []*refCountedRemote { +func (r *fractionRegistry) evictRemote(retention time.Duration) []*refCountedRemote { if retention == 0 { return nil } @@ -322,9 +326,9 @@ func (r *fractionRegistry) EvictRemote(retention time.Duration) []*refCountedRem return evicted } -// EvictOverflowed removes oldest fractions from offloading queue when it exceeds size limit. +// evictOverflowed removes oldest fractions from offloading queue when it exceeds size limit. // Used when offloading queue grows too large due to slow remote storage performance. 
-func (r *fractionRegistry) EvictOverflowed(sizeLimit uint64) (evicted []*refCountedSealed) { +func (r *fractionRegistry) evictOverflowed(sizeLimit uint64) (evicted []*refCountedSealed) { if sizeLimit == 0 { return nil } @@ -355,23 +359,43 @@ loop: return evicted } -// PromoteToSealed moves fractions from sealing to local queue when sealing completes. -func (r *fractionRegistry) PromoteToSealed(active *refCountedActive, sealed *frac.Sealed) { +// promoteToSealed moves fractions from sealing to local queue when sealing completes. +func (r *fractionRegistry) promoteToSealed(active *refCountedActive, sealed ...*frac.Sealed) { r.mu.Lock() defer r.mu.Unlock() - r.sealed.Add(sealed.Info().Name(), &refCountedSealed{Sealed: sealed}) - r.stats.sealed.Add(sealed.Info()) - r.stats.sealing.Sub(active.Info()) + for _, f := range sealed { + info := f.Info() + r.sealed.Add(info.Name(), &refCountedSealed{Sealed: f}) + r.stats.sealed.Add(info) + } + r.stats.sealing.Sub(active.Info()) delete(r.sealing, active.Info().Name()) r.rebuildSnapshot() } -// PromoteToRemote moves fractions from offloading to remote queue when offloading completes. +func (r *fractionRegistry) substituteWithSealed(produced *frac.Sealed, consumed ...*refCountedSealed) { + r.mu.Lock() + defer r.mu.Unlock() + + for _, f := range consumed { + info := f.Info() + r.stats.compacting.Sub(info) + delete(r.compacting, info.Name()) + } + + info := produced.Info() + r.stats.sealed.Add(info) + r.sealed.Add(info.Name(), &refCountedSealed{Sealed: produced}) + + r.rebuildSnapshot() +} + +// promoteToRemote moves fractions from offloading to remote queue when offloading completes. // Special case: handles fractions that don't require offloading (remote == nil). 
-func (r *fractionRegistry) PromoteToRemote(sealed *refCountedSealed, remote *frac.Remote) { +func (r *fractionRegistry) promoteToRemote(sealed *refCountedSealed, remote *frac.Remote) { r.mu.Lock() defer r.mu.Unlock() @@ -380,14 +404,60 @@ func (r *fractionRegistry) PromoteToRemote(sealed *refCountedSealed, remote *fra r.stats.remotes.Add(remote.Info()) } - r.stats.offloading.Sub(sealed.Info()) r.offloading.Del(sealed.Info().Name()) + r.stats.offloading.Sub(sealed.Info()) + + r.rebuildSnapshot() +} + +func (r *fractionRegistry) sealedSnapshot() []*frac.Sealed { + r.mu.RLock() + defer r.mu.RUnlock() + + result := make([]*frac.Sealed, 0, r.sealed.Len()) + for s := range r.sealed.All() { + result = append(result, s.Sealed) + } + + return result +} + +func (r *fractionRegistry) claimForCompaction(names []string) ([]*refCountedSealed, error) { + r.mu.Lock() + defer r.mu.Unlock() + + for _, name := range names { + // NOTE(dkharms): If offloading pressure is high on the oldest fractions, + // compaction may repeatedly fail to claim them and get into livelock. 
+ if _, ok := r.sealed.Get(name); !ok { + return nil, fmt.Errorf( + "fraction %q is not available for compaction", + name, + ) + } + } + + claimed := make([]*refCountedSealed, 0, len(names)) + for _, name := range names { + s, _ := r.sealed.Get(name) + + r.sealed.Del(name) + r.stats.sealed.Sub(s.Info()) + + r.compacting[name] = s + r.stats.compacting.Add(s.Info()) + + claimed = append(claimed, s) + } + r.rebuildSnapshot() + return claimed, nil } // rebuildSnapshot reconstructs the all fractions list func (r *fractionRegistry) rebuildSnapshot() { - capacity := r.remotes.Len() + r.offloading.Len() + r.sealed.Len() + len(r.sealing) + 1 + capacity := r.remotes.Len() + r.offloading.Len() + + r.sealed.Len() + len(r.compacting) + len(r.sealing) + 1 // allocate extra capacity to accommodate appender rotation that may occur during snapshot lifetime all := newFractionsSnapshot(capacity + 1) @@ -404,13 +474,18 @@ func (r *fractionRegistry) rebuildSnapshot() { all.AddSealed(s) } + for _, c := range r.compacting { + all.AddSealed(c) + } + for _, a := range r.sealing { all.AddActive(a) } - all.AddActive(r.appender) + all.AddActive(r.sappender) + + r.muSnapshot.Lock() + defer r.muSnapshot.Unlock() - r.muAll.Lock() - defer r.muAll.Unlock() - r.all = all + r.snapshot = all } diff --git a/fracmanager/lifecycle_manager.go b/fracmanager/lifecycle_manager.go index 24025c23d..e98c58712 100644 --- a/fracmanager/lifecycle_manager.go +++ b/fracmanager/lifecycle_manager.go @@ -42,7 +42,7 @@ func newLifecycleManager( // Maintain performs periodic lifecycle management tasks. // It coordinates rotation, offloading, cleanup based on configuration. 
func (lc *lifecycleManager) Maintain(ctx context.Context, cfg *Config, wg *sync.WaitGroup) { - lc.registry.SuspendIfOverCapacity(cfg.SealingQueueLen, cfg.SuspendThreshold()) + lc.registry.suspendIfOverCapacity(cfg.SealingQueueLen, cfg.SuspendThreshold()) lc.rotate(cfg.FracSize, wg) if cfg.OffloadingEnabled { @@ -68,7 +68,7 @@ func (lc *lifecycleManager) SyncInfoCache() { // rotate checks if active fraction needs rotation based on size limit. // Creates new active fraction and starts sealing the previous one. func (lc *lifecycleManager) rotate(maxSize uint64, wg *sync.WaitGroup) { - active, waitBeforeSealing, err := lc.registry.RotateIfFull(maxSize, lc.provider) + active, waitBeforeSealing, err := lc.registry.rotateIfFull(maxSize, lc.provider) if err != nil { logger.Fatal("active fraction rotation error", zap.Error(err)) } @@ -89,7 +89,7 @@ func (lc *lifecycleManager) rotate(maxSize uint64, wg *sync.WaitGroup) { } lc.infoCache.Add(sealed.Info()) - lc.registry.PromoteToSealed(active, sealed) + lc.registry.promoteToSealed(active, sealed) active.Destroy() }() } @@ -97,7 +97,7 @@ func (lc *lifecycleManager) rotate(maxSize uint64, wg *sync.WaitGroup) { // offloadLocal starts offloading of local fractions to remote storage. // Selects fractions based on disk space usage and retention policy. 
func (lc *lifecycleManager) offloadLocal(ctx context.Context, sizeLimit uint64, retryDelay time.Duration, wg *sync.WaitGroup) { - toOffload, err := lc.registry.EvictLocalForOffload(sizeLimit) + toOffload, err := lc.registry.evictLocalForOffload(sizeLimit) if err != nil { logger.Fatal("error releasing old fractions:", zap.Error(err)) } @@ -108,7 +108,7 @@ func (lc *lifecycleManager) offloadLocal(ctx context.Context, sizeLimit uint64, remote := lc.offloadWithRetry(ctx, frac.Sealed, retryDelay) - lc.registry.PromoteToRemote(frac, remote) + lc.registry.promoteToRemote(frac, remote) if remote == nil { lc.infoCache.Remove(frac.Info().Name()) @@ -181,7 +181,7 @@ func (lc *lifecycleManager) tryOffload(ctx context.Context, sealed *frac.Sealed) // cleanRemote deletes outdated remote fractions based on retention policy. func (lc *lifecycleManager) cleanRemote(retention time.Duration, wg *sync.WaitGroup) { - toDelete := lc.registry.EvictRemote(retention) + toDelete := lc.registry.evictRemote(retention) wg.Add(len(toDelete)) for _, remote := range toDelete { go func() { @@ -194,10 +194,11 @@ func (lc *lifecycleManager) cleanRemote(retention time.Duration, wg *sync.WaitGr // cleanLocal deletes outdated local fractions when offloading is disabled. func (lc *lifecycleManager) cleanLocal(sizeLimit uint64, wg *sync.WaitGroup) { - toDelete, err := lc.registry.EvictLocalForDelete(sizeLimit) + toDelete, err := lc.registry.evictLocalForDelete(sizeLimit) if err != nil { logger.Fatal("error releasing old fractions:", zap.Error(err)) } + if len(toDelete) > 0 && !lc.flags.IsCapacityExceeded() { if err := lc.flags.setCapacityExceeded(true); err != nil { logger.Fatal("can't set capacity_exceeded flag", zap.Error(err)) @@ -217,14 +218,14 @@ func (lc *lifecycleManager) cleanLocal(sizeLimit uint64, wg *sync.WaitGroup) { // updateOldestMetric updates the prometheus metric with oldest fraction timestamp. 
func (lc *lifecycleManager) updateOldestMetric() { - oldestFracTime.WithLabelValues("remote").Set((time.Duration(lc.registry.OldestTotal()) * time.Millisecond).Seconds()) - oldestFracTime.WithLabelValues("local").Set((time.Duration(lc.registry.OldestLocal()) * time.Millisecond).Seconds()) + oldestFracTime.WithLabelValues("remote").Set((time.Duration(lc.registry.oldestTotal()) * time.Millisecond).Seconds()) + oldestFracTime.WithLabelValues("local").Set((time.Duration(lc.registry.oldestLocal()) * time.Millisecond).Seconds()) } // removeOverflowed removes fractions from offloading queue that exceed size limit // Stops ongoing offloading tasks and cleans up both local and remote resources. func (lc *lifecycleManager) removeOverflowed(sizeLimit uint64, wg *sync.WaitGroup) { - evicted := lc.registry.EvictOverflowed(sizeLimit) + evicted := lc.registry.evictOverflowed(sizeLimit) for _, sealed := range evicted { wg.Add(1) go func() { diff --git a/fracmanager/lifecycle_manager_test.go b/fracmanager/lifecycle_manager_test.go index cb9ab1e0a..bebc2c1f8 100644 --- a/fracmanager/lifecycle_manager_test.go +++ b/fracmanager/lifecycle_manager_test.go @@ -38,7 +38,7 @@ func TestFracInfoCache(t *testing.T) { defer tearDown() fillRotateAndCheck := func(names map[string]struct{}) { - appender := lc.registry.Appender() + appender := lc.registry.appender() appendDocsToActive(t, appender.Active, 10+rand.Intn(10)) wg := sync.WaitGroup{} @@ -56,13 +56,13 @@ func TestFracInfoCache(t *testing.T) { for range 10 { fillRotateAndCheck(first) } - halfSize := lc.registry.Stats().TotalSizeOnDiskLocal() + halfSize := lc.registry.statistics().TotalSizeOnDiskLocal() second := map[string]struct{}{} for range 10 { fillRotateAndCheck(second) } - total := lc.registry.Stats().TotalSizeOnDiskLocal() + total := lc.registry.statistics().TotalSizeOnDiskLocal() wg := sync.WaitGroup{} lc.cleanLocal(total-halfSize, &wg) @@ -86,7 +86,7 @@ func TestCapacityExceeded(t *testing.T) { const fracsCount = 10 fillAndRotate 
:= func() { - appender := lc.registry.Appender() + appender := lc.registry.appender() appendDocsToActive(t, appender.Active, 10+rand.Intn(10)) wg := sync.WaitGroup{} @@ -102,19 +102,19 @@ func TestCapacityExceeded(t *testing.T) { } assert.False(t, lc.flags.IsCapacityExceeded(), "there should be no deletions and the flag is false") - total := lc.registry.Stats().TotalSizeOnDiskLocal() + total := lc.registry.statistics().TotalSizeOnDiskLocal() wg := sync.WaitGroup{} lc.cleanLocal(total, &wg) wg.Wait() - assert.Equal(t, fracsCount, lc.registry.Stats().sealed.count, "as much as was added, so much should be") + assert.Equal(t, fracsCount, lc.registry.statistics().sealed.count, "as much as was added, so much should be") assert.False(t, lc.flags.IsCapacityExceeded(), "there should still be no deletions, and the flag is false") lc.cleanLocal(total-1, &wg) wg.Wait() - assert.Equal(t, fracsCount-1, lc.registry.Stats().sealed.count, "expect one less") + assert.Equal(t, fracsCount-1, lc.registry.statistics().sealed.count, "expect one less") assert.True(t, lc.flags.IsCapacityExceeded(), "the flag must be true now") } @@ -124,30 +124,30 @@ func TestOldestMetrics(t *testing.T) { const fracsCount = 10 fillAndRotate := func() { - appender := lc.registry.Appender() + appender := lc.registry.appender() appendDocsToActive(t, appender.Active, 10+rand.Intn(10)) wg := sync.WaitGroup{} lc.rotate(0, &wg) wg.Wait() } - firstFracTime := lc.registry.Appender().Info().CreationTime + firstFracTime := lc.registry.appender().Info().CreationTime for range fracsCount { fillAndRotate() } // Check state after initial rotations - assert.Equal(t, firstFracTime, lc.registry.OldestTotal(), "should point to the very first fraction when all data is local") - assert.Equal(t, firstFracTime, lc.registry.OldestLocal(), "should point to the first fraction when nothing is offloaded") + assert.Equal(t, firstFracTime, lc.registry.oldestTotal(), "should point to the very first fraction when all data is local") + 
assert.Equal(t, firstFracTime, lc.registry.oldestLocal(), "should point to the first fraction when nothing is offloaded") - halfSize := lc.registry.Stats().TotalSizeOnDiskLocal() + halfSize := lc.registry.statistics().TotalSizeOnDiskLocal() - halfwayFracTime := lc.registry.Appender().Info().CreationTime + halfwayFracTime := lc.registry.appender().Info().CreationTime for range fracsCount { fillAndRotate() } - total := lc.registry.Stats().TotalSizeOnDiskLocal() + total := lc.registry.statistics().TotalSizeOnDiskLocal() wg := sync.WaitGroup{} lc.offloadLocal(t.Context(), total-halfSize, 0, &wg) @@ -155,8 +155,8 @@ func TestOldestMetrics(t *testing.T) { // Check state after offloading assert.NotEqual(t, firstFracTime, halfwayFracTime, "expect different creation times") - assert.Equal(t, firstFracTime, lc.registry.OldestTotal(), "should still reference the first fraction after offload") - assert.Equal(t, halfwayFracTime, lc.registry.OldestLocal(), "should point to the oldest remaining local fraction after offload") + assert.Equal(t, firstFracTime, lc.registry.oldestTotal(), "should still reference the first fraction after offload") + assert.Equal(t, halfwayFracTime, lc.registry.oldestLocal(), "should point to the oldest remaining local fraction after offload") } func TestPendingDestroy(t *testing.T) { @@ -170,19 +170,19 @@ func TestPendingDestroy(t *testing.T) { // appending docs to `fracsCount` fractions where the last is active and the rest are sealed wg := sync.WaitGroup{} for range fracsCount - 1 { - appendDocsToActive(t, lc.registry.Appender().Active, docsPerFrac) + appendDocsToActive(t, lc.registry.appender().Active, docsPerFrac) lc.rotate(0, &wg) } - appendDocsToActive(t, lc.registry.Appender().Active, docsPerFrac) + appendDocsToActive(t, lc.registry.appender().Active, docsPerFrac) // wait sealing complete wg.Wait() // take all fracs to search - fractions1, release1 := lc.registry.AcquireAllFractions() + fractions1, release1 := lc.registry.acquireAllFractions() 
// delete all sealing fracs - lc.cleanLocal(lc.registry.Appender().Info().FullSize(), &wg) + lc.cleanLocal(lc.registry.appender().Info().FullSize(), &wg) var ( beforeRelease time.Time @@ -220,7 +220,7 @@ func TestPendingDestroy(t *testing.T) { cleanup.Wait() assert.Less(t, beforeRelease, afterCleanup, "we expect cleanup to happen after release") - fractions2, release2 := lc.registry.AcquireAllFractions() + fractions2, release2 := lc.registry.acquireAllFractions() assert.Len(t, fractions2, 1, "only one active fraction should remain") singleName := fractions2[0].Info().Name() diff --git a/fracmanager/sealer_test.go b/fracmanager/sealer_test.go index f85c3f8f0..51c16b6be 100644 --- a/fracmanager/sealer_test.go +++ b/fracmanager/sealer_test.go @@ -19,8 +19,8 @@ import ( "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/frac/sealed" - "github.com/ozontech/seq-db/frac/sealed/sealing" "github.com/ozontech/seq-db/indexer" + "github.com/ozontech/seq-db/sealing" "github.com/ozontech/seq-db/seq" testscommon "github.com/ozontech/seq-db/tests/common" ) diff --git a/fracmanager/sync_appender.go b/fracmanager/sync_appender.go index 76cf4ee03..1acb15a3a 100644 --- a/fracmanager/sync_appender.go +++ b/fracmanager/sync_appender.go @@ -26,8 +26,8 @@ type syncAppender struct { suspended bool // Temporarily suspended for writes } -// Append adds documents to the active fraction -func (a *syncAppender) Append(docs, meta []byte) error { +// append adds documents to the active fraction +func (a *syncAppender) append(docs, meta []byte) error { a.mu.RLock() if a.finalized { a.mu.RUnlock() @@ -43,22 +43,22 @@ func (a *syncAppender) Append(docs, meta []byte) error { return a.refCountedActive.Append(docs, meta, &a.wg) } -func (a *syncAppender) Suspended() bool { +func (a *syncAppender) isSuspended() bool { a.mu.Lock() defer a.mu.Unlock() return a.suspended } -func (a *syncAppender) Suspend(value bool) { +func (a *syncAppender) suspend(value 
bool) { a.mu.Lock() a.suspended = value a.mu.Unlock() } -// WaitWriteIdle waits for all pending write operations to complete +// waitWriteIdle waits for all pending write operations to complete // Used before sealing to ensure data consistency. -func (a *syncAppender) WaitWriteIdle() { +func (a *syncAppender) waitWriteIdle() { start := time.Now() logger.Info("waiting fraction to stop write...", zap.String("name", a.BaseFileName)) a.wg.Wait() @@ -70,8 +70,8 @@ func (a *syncAppender) WaitWriteIdle() { ) } -// Finalize marks the fraction as read-only and prevents new writes from starting after finalize. -func (a *syncAppender) Finalize() error { +// finalize marks the fraction as read-only and prevents new writes from starting after finalize. +func (a *syncAppender) finalize() error { a.mu.Lock() if a.finalized { a.mu.Unlock() diff --git a/frac/sealed/sealing/blocks_builder.go b/indexwriter/blocks.go similarity index 52% rename from frac/sealed/sealing/blocks_builder.go rename to indexwriter/blocks.go index 3c6ce1b08..253d0abd5 100644 --- a/frac/sealed/sealing/blocks_builder.go +++ b/indexwriter/blocks.go @@ -1,64 +1,57 @@ -package sealing +package indexwriter import ( "encoding/binary" "iter" "unsafe" - "github.com/ozontech/seq-db/consts" "github.com/ozontech/seq-db/frac/sealed/lids" "github.com/ozontech/seq-db/frac/sealed/seqids" "github.com/ozontech/seq-db/frac/sealed/token" "github.com/ozontech/seq-db/util" ) -type ( - TokenBlock = util.Pair[tokensSealBlock, []token.FieldTable] -) +type tokenFieldBlock = util.Pair[unpackedTokenBlock, []token.FieldTable] -// tokensExt represents the token ID range contained in a block. -type tokensExt struct { +// tokenExt represents the token ID range contained in a block. +type tokenExt struct { minTID uint32 // First token ID in the block maxTID uint32 // Last token ID in the block } -// tokensSealBlock represents a sealed block containing token data with metadata. 
-type tokensSealBlock struct { - ext tokensExt // Tokens block metadata for registry marking +// unpackedTokenBlock represents a sealed block containing token data with metadata. +type unpackedTokenBlock struct { + ext tokenExt // Tokens block metadata for registry marking payload token.Block // Actual token data payload } -// lidsExt represents the range and continuation status of LID blocks. -type lidsExt struct { +// lidExt represents the range and continuation status of LID blocks. +type lidExt struct { minTID uint32 // First token ID in the LID block maxTID uint32 // Last token ID in the LID block isContinued bool // Whether LID sequence continues in next block } -// lidsSealBlock represents a sealed block containing LID (Local ID) data. -type lidsSealBlock struct { - ext lidsExt // LIDs block metadata for registry marking - payload lids.Block // LID data payload +// unpackedLIDBlock represents a sealed block containing LID (Local ID) data. +type unpackedLIDBlock struct { + ext lidExt // LIDs block metadata for registry marking + payload lids.UnpackedBlock // LID data payload } -// idsSealBlock represents a sealed block containing various identifier types. -type idsSealBlock struct { +// unpackedIDBlock represents a sealed block containing various identifier types. +type unpackedIDBlock struct { mids seqids.BlockMIDs rids seqids.BlockRIDs params seqids.BlockParams } -// blocksBuilder constructs sealed blocks from various data sources. -// Provides error tracking and consistency validation during block construction. 
-type blocksBuilder struct{} - -func (bb *blocksBuilder) BuildTokenBlocks( +func tokenBlock( it iter.Seq2[string, iter.Seq2[TokenPosting, error]], accumulate func([]uint32) error, blockCapacity int, -) iter.Seq2[TokenBlock, error] { - return func(yield func(TokenBlock, error) bool) { +) iter.Seq2[tokenFieldBlock, error] { + return func(yield func(tokenFieldBlock, error) bool) { var ( - block tokensSealBlock + block unpackedTokenBlock blockIdx uint32 blockSize int ) @@ -87,7 +80,7 @@ func (bb *blocksBuilder) BuildTokenBlocks( emitFieldEntry() block.ext.maxTID = currentTID - pair := TokenBlock{First: block, Second: pendingTable} + pair := tokenFieldBlock{First: block, Second: pendingTable} if !yield(pair, nil) { return false } @@ -106,15 +99,15 @@ func (bb *blocksBuilder) BuildTokenBlocks( } block.ext.minTID = 1 - for field, tokenIterator := range it { + for field, tokIt := range it { emitFieldEntry() fieldName = field fieldEntryStartTID = currentTID + 1 - for pair, err := range tokenIterator { + for pair, err := range tokIt { if err != nil { - yield(TokenBlock{}, err) + yield(tokenFieldBlock{}, err) return } @@ -132,7 +125,7 @@ func (bb *blocksBuilder) BuildTokenBlocks( block.payload.Payload = append(block.payload.Payload, tok...) if err := accumulate(tlids); err != nil { - yield(TokenBlock{}, err) + yield(tokenFieldBlock{}, err) return } @@ -149,7 +142,7 @@ func (bb *blocksBuilder) BuildTokenBlocks( func newTokenTableEntry( entryStartTID, entryEndTID uint32, - blockIndex uint32, block tokensSealBlock, + blockIndex uint32, block unpackedTokenBlock, ) *token.TableEntry { // Convert global TIDs to block-local indices firstIndex := entryStartTID - block.ext.minTID @@ -169,15 +162,15 @@ func newTokenTableEntry( } } -// seqBlockID accumulates scalar (ID, position) pairs into sealed ID blocks. +// idBlock accumulates scalar (ID, position) pairs into sealed ID blocks. // A new block is yielded every `blockCapacity` IDs. 
-func seqBlockID(ids iter.Seq2[DocLocation, error], blockCapacity int) iter.Seq2[idsSealBlock, error] { - return func(yield func(idsSealBlock, error) bool) { - var block idsSealBlock +func idBlock(ids iter.Seq2[DocLocation, error], blockCapacity int) iter.Seq2[unpackedIDBlock, error] { + return func(yield func(unpackedIDBlock, error) bool) { + var block unpackedIDBlock for pair, err := range ids { if err != nil { - yield(idsSealBlock{}, err) + yield(unpackedIDBlock{}, err) return } @@ -203,87 +196,24 @@ func seqBlockID(ids iter.Seq2[DocLocation, error], blockCapacity int) iter.Seq2[ } } -type lidAccumulator struct { - blockCapacity int - onBlock func(lidsSealBlock) error - - currentTID uint32 - currentBlock lidsSealBlock - - isEndOfToken bool - isContinued bool -} - -func newLIDAccumulator( - blockCapacity int, - onBlock func(lidsSealBlock) error, -) *lidAccumulator { - if blockCapacity == 0 { - blockCapacity = consts.DefaultLIDBlockCap +// collapseOrderedFieldsTables merges FieldTables with the same field name. +// Assumes input is sorted by Field. +func collapseOrderedFieldsTables(src []token.FieldTable) []token.FieldTable { + if len(src) == 0 { + return nil } - a := &lidAccumulator{ - blockCapacity: blockCapacity, - onBlock: onBlock, - } - - a.currentBlock.ext.minTID = 1 - a.currentBlock.payload = lids.Block{ - LIDs: make([]uint32, 0, blockCapacity), - Offsets: []uint32{0}, - } - - return a -} -// Add processes LIDs of one token (must be called in TID order). -// -// For each block that fills up, `onBlock` is called immediately -// before the backing arrays are reset, so `onBlock` may read the -// block data but must not retain references to it. 
-func (a *lidAccumulator) Add(lidsbuf []uint32) error { - a.currentTID++ - - for _, lid := range lidsbuf { - if len(a.currentBlock.payload.LIDs) == a.blockCapacity { - if err := a.onBlock(a.finalizeBlock()); err != nil { - return err - } - - a.currentBlock.ext.minTID = a.currentTID - a.currentBlock.payload.LIDs = a.currentBlock.payload.LIDs[:0] - a.currentBlock.payload.Offsets = a.currentBlock.payload.Offsets[:1] + current := src[0] + var dst []token.FieldTable + for _, ft := range src[1:] { + if current.Field == ft.Field { + current.Entries = append(current.Entries, ft.Entries...) + continue } - a.isEndOfToken = false - a.currentBlock.ext.maxTID = a.currentTID - a.currentBlock.payload.LIDs = append(a.currentBlock.payload.LIDs, lid) + dst = append(dst, current) + current = ft } - a.isEndOfToken = true - a.currentBlock.payload.Offsets = append( - a.currentBlock.payload.Offsets, - uint32(len(a.currentBlock.payload.LIDs)), - ) - - return nil -} - -func (a *lidAccumulator) Finalize() error { - return a.onBlock(a.finalizeBlock()) -} - -func (a *lidAccumulator) finalizeBlock() lidsSealBlock { - if !a.isEndOfToken { - a.currentBlock.payload.Offsets = append( - a.currentBlock.payload.Offsets, - uint32(len(a.currentBlock.payload.LIDs)), - ) - } - - result := a.currentBlock - result.payload.IsLastLID = a.isEndOfToken - result.ext.isContinued = a.isContinued - - a.isContinued = !a.isEndOfToken - return result + return append(dst, current) } diff --git a/frac/sealed/sealing/blocks_builder_test.go b/indexwriter/blocks_test.go similarity index 77% rename from frac/sealed/sealing/blocks_builder_test.go rename to indexwriter/blocks_test.go index d6bca1442..70616fe24 100644 --- a/frac/sealed/sealing/blocks_builder_test.go +++ b/indexwriter/blocks_test.go @@ -1,4 +1,4 @@ -package sealing +package indexwriter import ( "iter" @@ -7,27 +7,20 @@ import ( "github.com/stretchr/testify/assert" - "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/frac/sealed/lids" 
"github.com/ozontech/seq-db/frac/sealed/token" "github.com/ozontech/seq-db/seq" ) -var _ Source = (*mockSource)(nil) - type mockSource struct { - info common.Info - tokens [][]byte - fields []string - fieldMaxTIDs []uint32 - ids []seq.ID - pos []seq.DocPos - tokenLIDs [][]uint32 - blocksOffsets []uint64 + tokens [][]byte + fields []string + fieldMaxTIDs []uint32 + ids []seq.ID + pos []seq.DocPos + tokenLIDs [][]uint32 } -func (m *mockSource) Info() *common.Info { return &m.info } - func (m *mockSource) TokenTriplet() iter.Seq2[string, iter.Seq2[TokenPosting, error]] { return func(yield func(string, iter.Seq2[TokenPosting, error]) bool) { start := 0 @@ -48,8 +41,7 @@ func (m *mockSource) tokensForField(start, end int) iter.Seq2[TokenPosting, erro if j < len(m.tokenLIDs) { lidsbuf = m.tokenLIDs[j] } - pair := TokenPosting{First: m.tokens[j], Second: lidsbuf} - if !yield(pair, nil) { + if !yield(TokenPosting{First: m.tokens[j], Second: lidsbuf}, nil) { return } } @@ -66,8 +58,6 @@ func (m *mockSource) ID() iter.Seq2[DocLocation, error] { } } -func (m *mockSource) BlockOffsets() []uint64 { return m.blocksOffsets } - func TestBlocksBuilder_BuildTokenBlocks(t *testing.T) { src := mockSource{ tokens: [][]byte{ @@ -114,10 +104,10 @@ func TestBlocksBuilder_BuildTokenBlocks(t *testing.T) { const blockSize = 24 const lidBlockCap = 3 - var lidBlocks []lidsSealBlock + var lidBlocks []unpackedLIDBlock lidAccumulator := newLIDAccumulator( lidBlockCap, - func(block lidsSealBlock) error { + func(block unpackedLIDBlock) error { block.payload.LIDs = slices.Clone(block.payload.LIDs) block.payload.Offsets = slices.Clone(block.payload.Offsets) lidBlocks = append(lidBlocks, block) @@ -125,12 +115,9 @@ func TestBlocksBuilder_BuildTokenBlocks(t *testing.T) { }, ) - var bb blocksBuilder - tokenBlocks := bb.BuildTokenBlocks( + tokenBlocksIter := tokenBlock( src.TokenTriplet(), - func(lids []uint32) error { - return lidAccumulator.Add(lids) - }, + lidAccumulator.add, blockSize, ) @@ -142,7 
+129,7 @@ func TestBlocksBuilder_BuildTokenBlocks(t *testing.T) { blockIndex := 0 allFieldsTables := []token.FieldTable{} - for pair, err := range tokenBlocks { + for pair, err := range tokenBlocksIter { assert.NoError(t, err) block, fieldsTables := pair.First, pair.Second assert.Equal(t, expectedSizes[blockIndex], block.payload.Len()) @@ -249,32 +236,32 @@ func TestBlocksBuilder_BuildTokenBlocks(t *testing.T) { }, } assert.Equal(t, actualTokenTable.FieldsTables, expectedTokenTable.FieldsTables) - assert.NoError(t, lidAccumulator.Finalize()) + assert.NoError(t, lidAccumulator.finalize()) - expectedLIDBlocks := []lidsSealBlock{ + expectedLIDBlocks := []unpackedLIDBlock{ { - ext: lidsExt{minTID: 1, maxTID: 1, isContinued: false}, - payload: lids.Block{LIDs: []uint32{10, 20, 30}, Offsets: []uint32{0, 3}, IsLastLID: false}, + ext: lidExt{minTID: 1, maxTID: 1, isContinued: false}, + payload: lids.UnpackedBlock{LIDs: []uint32{10, 20, 30}, Offsets: []uint32{0, 3}, IsLastLID: false}, }, { - ext: lidsExt{minTID: 1, maxTID: 3, isContinued: true}, - payload: lids.Block{LIDs: []uint32{40, 2, 3}, Offsets: []uint32{0, 1, 2, 3}, IsLastLID: true}, + ext: lidExt{minTID: 1, maxTID: 3, isContinued: true}, + payload: lids.UnpackedBlock{LIDs: []uint32{40, 2, 3}, Offsets: []uint32{0, 1, 2, 3}, IsLastLID: true}, }, { - ext: lidsExt{minTID: 4, maxTID: 6, isContinued: false}, - payload: lids.Block{LIDs: []uint32{4, 5, 6}, Offsets: []uint32{0, 1, 2, 3}, IsLastLID: true}, + ext: lidExt{minTID: 4, maxTID: 6, isContinued: false}, + payload: lids.UnpackedBlock{LIDs: []uint32{4, 5, 6}, Offsets: []uint32{0, 1, 2, 3}, IsLastLID: true}, }, { - ext: lidsExt{minTID: 7, maxTID: 9, isContinued: false}, - payload: lids.Block{LIDs: []uint32{7, 8, 9}, Offsets: []uint32{0, 1, 2, 3}, IsLastLID: true}, + ext: lidExt{minTID: 7, maxTID: 9, isContinued: false}, + payload: lids.UnpackedBlock{LIDs: []uint32{7, 8, 9}, Offsets: []uint32{0, 1, 2, 3}, IsLastLID: true}, }, { - ext: lidsExt{minTID: 10, maxTID: 12, 
isContinued: false}, - payload: lids.Block{LIDs: []uint32{10, 11, 12}, Offsets: []uint32{0, 1, 2, 3}, IsLastLID: true}, + ext: lidExt{minTID: 10, maxTID: 12, isContinued: false}, + payload: lids.UnpackedBlock{LIDs: []uint32{10, 11, 12}, Offsets: []uint32{0, 1, 2, 3}, IsLastLID: true}, }, { - ext: lidsExt{minTID: 13, maxTID: 14, isContinued: false}, - payload: lids.Block{LIDs: []uint32{13, 14}, Offsets: []uint32{0, 1, 2}, IsLastLID: true}, + ext: lidExt{minTID: 13, maxTID: 14, isContinued: false}, + payload: lids.UnpackedBlock{LIDs: []uint32{13, 14}, Offsets: []uint32{0, 1, 2}, IsLastLID: true}, }, } assert.Equal(t, expectedLIDBlocks, lidBlocks) @@ -313,7 +300,7 @@ func TestBlocksBuilder_IDsBlocks(t *testing.T) { i := 0 ids := []seq.ID{} pos := []seq.DocPos{} - for block, err := range seqBlockID(src.ID(), 3) { + for block, err := range idBlock(src.ID(), 3) { assert.NoError(t, err) assert.Equal(t, expectedSizes[i], len(block.mids.Values)) diff --git a/frac/sealed/sealing/index.go b/indexwriter/index.go similarity index 65% rename from frac/sealed/sealing/index.go rename to indexwriter/index.go index e14b36b9e..7e62fd1f8 100644 --- a/frac/sealed/sealing/index.go +++ b/indexwriter/index.go @@ -1,7 +1,8 @@ -package sealing +package indexwriter import ( "io" + "iter" "github.com/ozontech/seq-db/consts" "github.com/ozontech/seq-db/frac/common" @@ -11,9 +12,34 @@ import ( "github.com/ozontech/seq-db/frac/sealed/token" "github.com/ozontech/seq-db/seq" "github.com/ozontech/seq-db/storage" + "github.com/ozontech/seq-db/util" "github.com/ozontech/seq-db/zstd" ) +type ( + DocLocation = util.Pair[seq.ID, seq.DocPos] + TokenPosting = util.Pair[[]byte, []uint32] +) + +// Source defines the data required to write all index files for a fraction. +type Source interface { + // Info returns metadata describing this source. + Info() *common.Info + + // ID returns an iterator over stored document identifiers paired with + // their positions, in descending [seq.ID] order. 
+ ID() iter.Seq2[DocLocation, error] + + // BlockOffsets returns byte offsets to each document block + // within this source's `.docs` file. + BlockOffsets() []uint64 + + // TokenTriplet iterates over fields in lexicographic order. + // For each field, it yields tokens (lexicographically sorted) + // paired with the local document ID list for that token. + TokenTriplet() iter.Seq2[string, iter.Seq2[TokenPosting, error]] +} + // indexBlock is one compressed (or not) block with its registry metadata. type indexBlock struct { codec storage.Codec @@ -23,58 +49,55 @@ type indexBlock struct { ext2 uint64 } -func (i indexBlock) Bin(pos int64) (storage.IndexBlockHeader, []byte) { +func (i indexBlock) bin(pos int64) (storage.IndexBlockHeader, []byte) { return storage.NewIndexBlockHeader(pos, i.ext1, i.ext2, uint32(len(i.payload)), i.rawLen, i.codec), i.payload } -type IndexSealer struct { +type IndexWriter struct { params common.SealParams - buf1 []byte - buf2 []byte - buf32 []uint32 - buf64 []uint64 + buf1 []byte + buf2 []byte + buf64 []uint64 + lidsPacker *lids.BlockPacker idsTable seqids.Table lidsTable lids.Table tokenTable token.Table } -func NewIndexSealer(params common.SealParams) *IndexSealer { - return &IndexSealer{ - params: params, - buf1: make([]byte, 0, consts.RegularBlockSize), - buf2: make([]byte, 0, consts.RegularBlockSize), - buf32: make([]uint32, 0, consts.DefaultLIDBlockCap), - buf64: make([]uint64, 0, consts.RegularBlockSize), +func New(params common.SealParams) *IndexWriter { + lidsPacker := lids.NewBlockPackerWithThreshold(params.LidsBitmapThreshold) + return &IndexWriter{ + params: params, + buf1: make([]byte, 0, consts.RegularBlockSize), + buf2: make([]byte, 0, consts.RegularBlockSize), + buf64: make([]uint64, 0, consts.RegularBlockSize), + lidsPacker: lidsPacker, } } -func (s *IndexSealer) LIDsTable() lids.Table { +func (s *IndexWriter) LIDsTable() lids.Table { return s.lidsTable } -func (s *IndexSealer) TokenTable() token.Table { +func (s 
*IndexWriter) TokenTable() token.Table { return s.tokenTable } -func (s *IndexSealer) IDsTable() seqids.Table { +func (s *IndexWriter) IDsTable() seqids.Table { return s.idsTable } // WriteOffsetsFile writes the .offsets file containing a single BlockOffsets block. -func (s *IndexSealer) WriteOffsetsFile(ws io.WriteSeeker, src Source) error { +func (s *IndexWriter) WriteOffsetsFile(ws io.WriteSeeker, src Source) error { w, err := newWriter(ws) if err != nil { return err } defer w.release() - offsets := sealed.BlockOffsets{ - IDsTotal: src.Info().DocsTotal + 1, - Offsets: src.BlockOffsets(), - } - + offsets := sealed.BlockOffsets{Offsets: src.BlockOffsets()} if err := w.writeBlock(blockTypeOffset, s.packBlocksOffsetsBlock(offsets)); err != nil { return err } @@ -82,14 +105,14 @@ func (s *IndexSealer) WriteOffsetsFile(ws io.WriteSeeker, src Source) error { return w.finalize() } -func (s *IndexSealer) WriteIDFile(ws io.WriteSeeker, src Source) error { +func (s *IndexWriter) WriteIDFile(ws io.WriteSeeker, src Source) error { w, err := newWriter(ws) if err != nil { return err } defer w.release() - for block, err := range seqBlockID(src.ID(), consts.IDsPerBlock) { + for block, err := range idBlock(src.ID(), consts.IDsPerBlock) { if err != nil { return err } @@ -110,7 +133,7 @@ func (s *IndexSealer) WriteIDFile(ws io.WriteSeeker, src Source) error { return w.finalize() } -func (s *IndexSealer) WriteTokenTriplet(tws, lws io.WriteSeeker, src Source) error { +func (s *IndexWriter) WriteTokenTriplet(tws, lws io.WriteSeeker, src Source) error { tw, err := newWriter(tws) if err != nil { return err @@ -123,19 +146,15 @@ func (s *IndexSealer) WriteTokenTriplet(tws, lws io.WriteSeeker, src Source) err } defer lw.release() - var ( - bb blocksBuilder - allFieldsTables []token.FieldTable - ) - lidAccumulator := newLIDAccumulator( s.params.LIDBlockSize, - func(block lidsSealBlock) error { + func(block unpackedLIDBlock) error { return lw.writeBlock(blockTypeLID, s.packLIDsBlock(block)) 
}, ) - for pair, err := range bb.BuildTokenBlocks(src.TokenTriplet(), lidAccumulator.Add, consts.RegularBlockSize) { + var allFieldsTables []token.FieldTable + for pair, err := range tokenBlock(src.TokenTriplet(), lidAccumulator.add, consts.RegularBlockSize) { if err != nil { return err } @@ -154,15 +173,15 @@ func (s *IndexSealer) WriteTokenTriplet(tws, lws io.WriteSeeker, src Source) err return s.finalizeTokenFile(tw, allFieldsTables) } -func (s *IndexSealer) finalizeLIDFile(w *writer, lidAccumulator *lidAccumulator) error { - if err := lidAccumulator.Finalize(); err != nil { +func (s *IndexWriter) finalizeLIDFile(w *writer, lidAccumulator *lidAccumulator) error { + if err := lidAccumulator.finalize(); err != nil { return err } return w.finalize() } -func (s *IndexSealer) finalizeTokenFile(w *writer, allFieldsTables []token.FieldTable) error { +func (s *IndexWriter) finalizeTokenFile(w *writer, allFieldsTables []token.FieldTable) error { // Emit section separator. if err := w.writeEmptyBlock(); err != nil { return err @@ -176,54 +195,38 @@ func (s *IndexSealer) finalizeTokenFile(w *writer, allFieldsTables []token.Field return w.finalize() } -func (s *IndexSealer) WriteInfoFile(ws io.Writer, src Source) error { +func (s *IndexWriter) WriteInfoFile(ws io.Writer, src Source) error { block := sealed.BlockInfo{Info: src.Info()} _, err := ws.Write(s.packInfoBlock(block).payload) return err } -// collapseOrderedFieldsTables merges FieldTables with the same field name. -// Assumes input is sorted by Field. -func collapseOrderedFieldsTables(src []token.FieldTable) []token.FieldTable { - if len(src) == 0 { - return nil - } - - current := src[0] - var dst []token.FieldTable - for _, ft := range src[1:] { - if current.Field == ft.Field { - current.Entries = append(current.Entries, ft.Entries...) 
- continue - } - - dst = append(dst, current) - current = ft - } - - return append(dst, current) -} - func newIndexBlock(raw []byte) indexBlock { return indexBlock{codec: storage.CodecNo, rawLen: uint32(len(raw)), payload: raw} } -func (s *IndexSealer) newIndexBlockZSTD(raw []byte, level int) indexBlock { +func (s *IndexWriter) newIndexBlockZSTD(raw []byte, level int) indexBlock { + if level <= 0 { + return newIndexBlock(raw) + } + s.buf2 = zstd.CompressLevel(raw, s.buf2[:0], level) if len(s.buf2) < len(raw) { return indexBlock{codec: storage.CodecZSTD, rawLen: uint32(len(raw)), payload: s.buf2} } + return newIndexBlock(raw) } // packInfoBlock packs fraction information into an index block. -func (s *IndexSealer) packInfoBlock(block sealed.BlockInfo) indexBlock { +func (s *IndexWriter) packInfoBlock(block sealed.BlockInfo) indexBlock { + s.idsTable.IDsTotal = block.Info.DocsTotal + 1 // Increment by one for [seq.SystemID] s.buf1 = block.Pack(s.buf1[:0]) return newIndexBlock(s.buf1) // Info block is typically small, no compression } // packTokenBlock packs token data into a compressed index block. -func (s *IndexSealer) packTokenBlock(block tokensSealBlock) indexBlock { +func (s *IndexWriter) packTokenBlock(block unpackedTokenBlock) indexBlock { s.buf1 = block.payload.Pack(s.buf1[:0]) // Pack token data b := s.newIndexBlockZSTD(s.buf1, s.params.TokenListZstdLevel) // Store TID range in extended metadata @@ -232,7 +235,7 @@ func (s *IndexSealer) packTokenBlock(block tokensSealBlock) indexBlock { } // packTokenTableBlock packs the token table into a compressed index block. 
-func (s *IndexSealer) packTokenTableBlock(tokenTableBlock token.TableBlock) indexBlock { +func (s *IndexWriter) packTokenTableBlock(tokenTableBlock token.TableBlock) indexBlock { s.tokenTable = token.TableFromBlocks([]token.TableBlock{tokenTableBlock}) // Store for PreloadedData // Packing block @@ -241,11 +244,7 @@ func (s *IndexSealer) packTokenTableBlock(tokenTableBlock token.TableBlock) inde } // packBlocksOffsetsBlock packs document block offsets into a compressed index block. -func (s *IndexSealer) packBlocksOffsetsBlock(block sealed.BlockOffsets) indexBlock { - // Update IDs table for PreloadedData - s.idsTable.IDsTotal = block.IDsTotal // Total number of IDs - s.idsTable.IDBlocksTotal = uint32(len(block.Offsets)) // Number of ID blocks - +func (s *IndexWriter) packBlocksOffsetsBlock(block sealed.BlockOffsets) indexBlock { // Packing block s.buf1 = block.Pack(s.buf1[:0]) b := s.newIndexBlockZSTD(s.buf1, s.params.DocsPositionsZstdLevel) @@ -253,7 +252,7 @@ func (s *IndexSealer) packBlocksOffsetsBlock(block sealed.BlockOffsets) indexBlo } // packMIDsBlock packs MIDs into a compressed index block. -func (s *IndexSealer) packMIDsBlock(block idsSealBlock) indexBlock { +func (s *IndexWriter) packMIDsBlock(block unpackedIDBlock) indexBlock { // Get the last ID in the block (smallest due to descending order) last := len(block.mids.Values) - 1 @@ -276,14 +275,14 @@ func (s *IndexSealer) packMIDsBlock(block idsSealBlock) indexBlock { } // packRIDsBlock packs RIDs into a compressed index block. -func (s *IndexSealer) packRIDsBlock(block idsSealBlock) indexBlock { +func (s *IndexWriter) packRIDsBlock(block unpackedIDBlock) indexBlock { s.buf1 = block.rids.Pack(s.buf1[:0]) b := s.newIndexBlockZSTD(s.buf1, s.params.IDsZstdLevel) return b } // packPosBlock packs document positions into a compressed index block. 
-func (s *IndexSealer) packPosBlock(block idsSealBlock) indexBlock { +func (s *IndexWriter) packPosBlock(block unpackedIDBlock) indexBlock { s.buf1 = block.params.Pack(s.buf1[:0]) b := s.newIndexBlockZSTD(s.buf1, s.params.IDsZstdLevel) return b @@ -291,7 +290,7 @@ func (s *IndexSealer) packPosBlock(block idsSealBlock) indexBlock { // packLIDsBlock packs Local IDs (LIDs) into a compressed index block. // Also updates LIDs table for preloaded data access. -func (s *IndexSealer) packLIDsBlock(block lidsSealBlock) indexBlock { +func (s *IndexWriter) packLIDsBlock(block unpackedLIDBlock) indexBlock { var ext1 uint64 if block.ext.isContinued { // todo: Legacy continuation flag ext1 = 1 @@ -304,7 +303,7 @@ func (s *IndexSealer) packLIDsBlock(block lidsSealBlock) indexBlock { s.lidsTable.IsContinued = append(s.lidsTable.IsContinued, block.ext.isContinued) // Packing block - s.buf1 = block.payload.Pack(s.buf1[:0], s.buf32[:0]) + s.buf1 = s.lidsPacker.Pack(&block.payload, s.buf1[:0]) b := s.newIndexBlockZSTD(s.buf1, s.params.LIDsZstdLevel) b.ext1 = ext1 // Legacy continuation flag b.ext2 = uint64(block.ext.maxTID)<<32 | uint64(block.ext.minTID) // TID range diff --git a/indexwriter/lid_accumulator.go b/indexwriter/lid_accumulator.go new file mode 100644 index 000000000..7b05d500d --- /dev/null +++ b/indexwriter/lid_accumulator.go @@ -0,0 +1,89 @@ +package indexwriter + +import ( + "github.com/ozontech/seq-db/frac/sealed/lids" +) + +// lidAccumulator accumulates LIDs into blocks of fixed capacity. +// It implements the add function that receives []uint32 directly. 
+type lidAccumulator struct { + blockCapacity int + onBlock func(unpackedLIDBlock) error + + currentTID uint32 + currentBlock unpackedLIDBlock + + isEndOfToken bool + isContinued bool +} + +func newLIDAccumulator( + blockCapacity int, + onBlock func(unpackedLIDBlock) error, +) *lidAccumulator { + a := &lidAccumulator{ + blockCapacity: blockCapacity, + onBlock: onBlock, + } + + a.currentBlock.ext.minTID = 1 + a.currentBlock.payload = lids.UnpackedBlock{ + LIDs: make([]uint32, 0, blockCapacity), + Offsets: []uint32{0}, + } + + return a +} + +// add processes LIDs of one token (must be called in TID order). +// +// For each block that fills up, `onBlock` is called immediately +// before the backing arrays are reset, so `onBlock` may read the +// block data but must not retain references to it. +func (a *lidAccumulator) add(lidsbuf []uint32) error { + a.currentTID++ + + for _, lid := range lidsbuf { + if len(a.currentBlock.payload.LIDs) == a.blockCapacity { + if err := a.onBlock(a.finalizeBlock()); err != nil { + return err + } + + a.currentBlock.ext.minTID = a.currentTID + a.currentBlock.payload.LIDs = a.currentBlock.payload.LIDs[:0] + a.currentBlock.payload.Offsets = a.currentBlock.payload.Offsets[:1] + } + + a.isEndOfToken = false + a.currentBlock.ext.maxTID = a.currentTID + a.currentBlock.payload.LIDs = append(a.currentBlock.payload.LIDs, lid) + } + + a.isEndOfToken = true + a.currentBlock.payload.Offsets = append( + a.currentBlock.payload.Offsets, + uint32(len(a.currentBlock.payload.LIDs)), + ) + + return nil +} + +func (a *lidAccumulator) finalize() error { + return a.onBlock(a.finalizeBlock()) +} + +func (a *lidAccumulator) finalizeBlock() unpackedLIDBlock { + if !a.isEndOfToken { + a.currentBlock.payload.Offsets = append( + a.currentBlock.payload.Offsets, + uint32(len(a.currentBlock.payload.LIDs)), + ) + } + + result := a.currentBlock + result.payload.IsLastLID = a.isEndOfToken + result.ext.isContinued = a.isContinued + + a.isContinued = !a.isEndOfToken + 
return result +} diff --git a/frac/sealed/sealing/writer.go b/indexwriter/writer.go similarity index 96% rename from frac/sealed/sealing/writer.go rename to indexwriter/writer.go index 1a147e4ef..7746c1db1 100644 --- a/frac/sealed/sealing/writer.go +++ b/indexwriter/writer.go @@ -1,4 +1,4 @@ -package sealing +package indexwriter import ( "bytes" @@ -73,7 +73,7 @@ func newWriter(ws io.WriteSeeker) (*writer, error) { } func (w *writer) writeBlock(btype string, block indexBlock) error { - header, payload := block.Bin(int64(w.pos)) + header, payload := block.bin(int64(w.pos)) if _, err := w.wpayload.Write(payload); err != nil { return err } @@ -92,7 +92,7 @@ func (w *writer) writeBlock(btype string, block indexBlock) error { } func (w *writer) writeEmptyBlock() error { - header, _ := indexBlock{}.Bin(int64(w.pos)) + header, _ := indexBlock{}.bin(int64(w.pos)) w.wheader.Write(header) return nil } diff --git a/node/batch.go b/node/batch.go index 92c8aa629..6da8ef8f5 100644 --- a/node/batch.go +++ b/node/batch.go @@ -1,54 +1,326 @@ package node import ( - "slices" + "math" + "sort" + + "github.com/RoaringBitmap/roaring/v2" ) -// LIDBatch represents a batch of LIDs. lids are stored as uint32 slice and sorted in ascending order regardless of doc order. -// This allows to avoid copying and use reference to LID blocks data. -// Such batches are also logically immutable - we can't append or delete from them, only union or intersect. But we can zero out (reset) them. -type LIDBatch struct { - lids []uint32 - desc bool // if doc order is DESC (default order) +// LIDBatch is batch of lids. It's immutable and can not be modified. Lids are always +// sorted in ascending way for every underlying implementation. +type LIDBatch interface { + Len() int + IsEmpty() bool + // Min returns minimum (first) value. Panics if batch is empty. + Min() uint32 + // Max returns max (last) value. Panics if batch is empty. 
+ Max() uint32 + CopyLIDs(desc bool, dst []LID) []LID + // Iter iterates lids in ascending way. + Iter() Iter + // ReverseIter iterates lids in descending way. + ReverseIter() Iter + // Narrow returns a batch containing only LIDs from minLID to maxLID (inclusive both). + Narrow(minLID, maxLID uint32) LIDBatch +} + +type Iter interface { + Next() (uint32, bool) + NextGeq(geq uint32) (uint32, bool) } -// NewDescBatch creates a batch of lids for DESC docs order -func NewDescBatch(lids []uint32) LIDBatch { - return LIDBatch{ - lids: lids, - desc: true, +func NewBitmapBatch(b *roaring.Bitmap) LIDBatch { + if b == nil || b.IsEmpty() { + return EmptyBatch() + } + return &bitmapBatch{ + bm: b, + min: b.Minimum(), + max: b.Maximum(), } } -// NewAscBatch creates a batch of lids for ASC docs order -func NewAscBatch(lids []uint32) LIDBatch { - return LIDBatch{ - lids: lids, - desc: false, +func NewBitmapBatchFromLids(lids []uint32) LIDBatch { + if len(lids) == 0 { + return EmptyBatch() } + b := roaring.NewBitmap() + b.AddMany(lids) + b.RunOptimize() + return NewBitmapBatch(b) } -func (b LIDBatch) Len() int { +func NewSliceBatch(lids []uint32) LIDBatch { + if len(lids) == 0 { + return EmptyBatch() + } + return &sliceBatch{lids: lids} +} + +// sliceBatch a batch of LIDs based on slice. LIDs are always sorted in ascending way. +// It's never empty. 
+type sliceBatch struct { + lids []uint32 +} + +func (b *sliceBatch) Len() int { return len(b.lids) } -func (b LIDBatch) LIDs(out []LID) []LID { - if b.desc { +func (b *sliceBatch) IsEmpty() bool { + return false +} + +func (b *sliceBatch) Min() uint32 { + return b.lids[0] +} + +func (b *sliceBatch) Max() uint32 { + return b.lids[len(b.lids)-1] +} + +func (b *sliceBatch) Narrow(minLID, maxLID uint32) LIDBatch { + batchMin := b.lids[0] + batchMax := b.lids[len(b.lids)-1] + if minLID <= batchMin && batchMax <= maxLID { + return b + } + if maxLID < batchMin || minLID > batchMax { + return EmptyBatch() + } + lo := 0 + if minLID > batchMin { + lo = sort.Search(len(b.lids), func(i int) bool { return b.lids[i] >= minLID }) + } + hi := len(b.lids) + if maxLID < batchMax { + hi = sort.Search(len(b.lids), func(i int) bool { return b.lids[i] > maxLID }) + } + if lo >= hi { + return EmptyBatch() + } + return &sliceBatch{lids: b.lids[lo:hi]} +} + +func (b *sliceBatch) Iter() Iter { + return &sliceIter{lids: b.lids, max: b.Max()} +} + +func (b *sliceBatch) ReverseIter() Iter { + return &sliceReverseIter{lids: b.lids, idx: len(b.lids) - 1} +} + +func (b *sliceBatch) CopyLIDs(desc bool, dst []LID) []LID { + if desc { for _, lid := range b.lids { - out = append(out, NewDescLID(lid)) + dst = append(dst, NewDescLID(lid)) + } + } else { + for i := len(b.lids) - 1; i >= 0; i-- { + dst = append(dst, NewAscLID(b.lids[i])) + } + } + return dst +} + +type sliceIter struct { + lids []uint32 + idx int + max uint32 +} + +func (it *sliceIter) Next() (uint32, bool) { + if it.idx >= len(it.lids) { + return 0, false + } + v := it.lids[it.idx] + it.idx++ + return v, true +} + +func (it *sliceIter) NextGeq(geq uint32) (uint32, bool) { + if it.idx >= len(it.lids) || (geq > it.max) { + it.idx = len(it.lids) + return 0, false + } + rest := it.lids[it.idx:] + off := sort.Search(len(rest), func(i int) bool { return rest[i] >= geq }) + it.idx += off + return it.Next() +} + +type sliceReverseIter struct { 
+ lids []uint32 + idx int +} + +func (it *sliceReverseIter) Next() (uint32, bool) { + if it.idx < 0 { + return 0, false + } + v := it.lids[it.idx] + it.idx-- + return v, true +} + +func (it *sliceReverseIter) NextGeq(leq uint32) (uint32, bool) { + if it.idx < 0 { + return 0, false + } + right := it.idx + 1 + idx := sort.Search(right, func(i int) bool { return it.lids[i] > leq }) - 1 + if idx < 0 { + it.idx = -1 + return 0, false + } + it.idx = idx + return it.Next() +} + +// bitmapBatch a LIDs batch based on roaring bitmap. Never empty. +type bitmapBatch struct { + bm *roaring.Bitmap + min uint32 + max uint32 +} + +func (b *bitmapBatch) Len() int { + return int(b.bm.GetCardinality()) +} + +func (b *bitmapBatch) IsEmpty() bool { + return false +} + +func (b *bitmapBatch) Min() uint32 { + return b.min +} + +func (b *bitmapBatch) Max() uint32 { + return b.max +} + +func (b *bitmapBatch) Narrow(minLID, maxLID uint32) LIDBatch { + if minLID <= b.min && b.max <= maxLID { + return b + } + if maxLID < b.min || minLID > b.max { + return EmptyBatch() + } + // TODO(cheb0) use copy-on-write for bitmap? 
+ out := b.bm.Clone() + if minLID > b.min { + out.RemoveRange(0, uint64(minLID)) + } + if maxLID < b.max { + out.RemoveRange(uint64(maxLID)+1, uint64(0x100000000)) + } + return NewBitmapBatch(out) +} + +func (b *bitmapBatch) Iter() Iter { + return newBitmapIter(b.bm) +} + +func (b *bitmapBatch) ReverseIter() Iter { + return newBitmapReverseIter(b.bm) +} + +func (b *bitmapBatch) CopyLIDs(desc bool, dst []LID) []LID { + if desc { + it := b.bm.Iterator() + for it.HasNext() { + dst = append(dst, NewDescLID(it.Next())) } } else { - for _, lid := range slices.Backward(b.lids) { - out = append(out, NewAscLID(lid)) + it := b.bm.ReverseIterator() + for it.HasNext() { + dst = append(dst, NewAscLID(it.Next())) } } + return dst +} - return out +type emptyBatch struct{} + +var emptyBatchInstance = emptyBatch{} + +func EmptyBatch() LIDBatch { + return emptyBatchInstance +} + +func (emptyBatch) Len() int { return 0 } +func (emptyBatch) IsEmpty() bool { return true } + +func (emptyBatch) Min() uint32 { + panic("min called on empty batch") +} + +func (emptyBatch) Max() uint32 { + panic("Maximum called on empty batch") +} + +func (emptyBatch) Narrow(uint32, uint32) LIDBatch { return emptyBatchInstance } +func (emptyBatch) CopyLIDs(_ bool, dst []LID) []LID { return dst } +func (emptyBatch) Iter() Iter { return emptyIterInstance } +func (emptyBatch) ReverseIter() Iter { return emptyIterInstance } + +type emptyIter struct{} + +var emptyIterInstance = emptyIter{} + +func (emptyIter) Next() (uint32, bool) { return 0, false } +func (emptyIter) NextGeq(uint32) (uint32, bool) { return 0, false } + +type bitmapIter struct { + it roaring.IntIterator +} + +func newBitmapIter(b *roaring.Bitmap) *bitmapIter { + var it roaring.IntIterator + it.Initialize(b) + return &bitmapIter{it: it} +} + +func (f *bitmapIter) Next() (uint32, bool) { + if !f.it.HasNext() { + return 0, false + } + return f.it.Next(), true +} + +func (f *bitmapIter) NextGeq(geq uint32) (uint32, bool) { + f.it.AdvanceIfNeeded(geq) 
+ if !f.it.HasNext() { + return 0, false + } + return f.it.Next(), true +} + +type bitmapReverseIter struct { + bm *roaring.Bitmap + pos uint32 +} + +func newBitmapReverseIter(bm *roaring.Bitmap) *bitmapReverseIter { + return &bitmapReverseIter{bm: bm, pos: math.MaxUint32} +} + +func (it *bitmapReverseIter) Next() (uint32, bool) { + prev := it.bm.PreviousValue(it.pos - 1) + if prev == -1 { + return 0, false + } + it.pos = uint32(prev) + return uint32(prev), true } -func (b LIDBatch) Reset() LIDBatch { - return LIDBatch{ - lids: b.lids[:0], - desc: b.desc, +func (it *bitmapReverseIter) NextGeq(leq uint32) (uint32, bool) { + prev := it.bm.PreviousValue(min(it.pos-1, leq)) + if prev == -1 { + return 0, false } + it.pos = uint32(prev) + return uint32(prev), true } diff --git a/node/batch_test.go b/node/batch_test.go new file mode 100644 index 000000000..1454b5389 --- /dev/null +++ b/node/batch_test.go @@ -0,0 +1,242 @@ +package node + +import ( + "math" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +type batchFactory func([]uint32) LIDBatch + +var batchFactories = []struct { + name string + build batchFactory +}{ + {name: "bitmap", build: NewBitmapBatchFromLids}, + {name: "slice", build: NewSliceBatch}, +} + +func TestLIDBatchNarrow(t *testing.T) { + testCases := []struct { + name string + input []uint32 + minLID uint32 + maxLID uint32 + expected []uint32 + }{ + { + name: "empty batch", + input: nil, + minLID: 0, + maxLID: 10, + expected: nil}, + { + name: "full range no-op", + input: []uint32{1, 5, 10}, + minLID: 0, + maxLID: math.MaxUint32, + expected: []uint32{1, 5, 10}}, + { + name: "trim below", + input: []uint32{1, 5, 10, 15}, + minLID: 6, + maxLID: 20, + expected: []uint32{10, 15}}, + { + name: "exact min boundary", + input: []uint32{1, 5, 10, 15}, + minLID: 10, + maxLID: 20, + expected: []uint32{10, 15}}, + { + name: "trim above", + input: []uint32{1, 5, 10, 15}, + minLID: 0, + maxLID: 10, + expected: 
[]uint32{1, 5, 10}}, + { + name: "trim both sides", + input: []uint32{1, 5, 10, 15, 20}, + minLID: 5, + maxLID: 15, + expected: []uint32{5, 10, 15}}, + { + name: "no overlap below", + input: []uint32{1, 5, 10}, + minLID: 11, + maxLID: 20, + expected: nil}, + { + name: "no overlap above", + input: []uint32{5, 10, 20}, + minLID: 0, + maxLID: 4, + expected: nil}, + { + name: "overlap at min", + input: []uint32{7, 10, 20, 25}, + minLID: 0, + maxLID: 7, + expected: []uint32{7}}, + { + name: "overlap at max", + input: []uint32{7, 10, 20, 25}, + minLID: 25, + maxLID: 40, + expected: []uint32{25}}, + } + + for _, impl := range batchFactories { + t.Run(impl.name, func(t *testing.T) { + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + b := impl.build(tc.input) + got := b.Narrow(tc.minLID, tc.maxLID) + assert.Equal(t, tc.expected, toSlice(got)) + }) + } + }) + } +} + +func TestBitmapBatchNarrow_NoCloneWhenUnchanged(t *testing.T) { + src := NewBitmapBatchFromLids([]uint32{1, 5, 10, 15, 20}) + got := src.Narrow(1, 20) + assert.Same(t, src, got) +} + +func toSlice(b LIDBatch) []uint32 { + if b == nil || b.IsEmpty() { + return nil + } + it := b.Iter() + out := make([]uint32, 0, b.Len()) + for { + v, ok := it.Next() + if !ok { + break + } + out = append(out, v) + } + return out +} + +func TestNextGeq(t *testing.T) { + for _, impl := range batchFactories { + t.Run(impl.name, func(t *testing.T) { + b := impl.build([]uint32{1, 5, 10, 15, 20, 21, 22, 26, 30}) + it := b.Iter() + + v, ok := it.NextGeq(1) + require.True(t, ok) + assert.Equal(t, uint32(1), v) + + // calling NextGeq with already seen value returns next value + v, ok = it.NextGeq(1) + require.True(t, ok) + assert.Equal(t, uint32(5), v) + + v, ok = it.NextGeq(13) + require.True(t, ok) + assert.Equal(t, uint32(15), v) + + v, ok = it.NextGeq(22) + require.True(t, ok) + assert.Equal(t, uint32(22), v) + + _, ok = it.NextGeq(50) + assert.False(t, ok) + }) + } +} + +func TestReverseNextGeq(t *testing.T) { + 
for _, impl := range batchFactories { + t.Run(impl.name, func(t *testing.T) { + b := impl.build([]uint32{3, 5, 10, 15, 20, 21, 22, 26, 30}) + it := b.ReverseIter() + + v, ok := it.NextGeq(1000) + require.True(t, ok) + assert.Equal(t, uint32(30), v) + + // calling NextGeq with already seen value returns next value + v, ok = it.NextGeq(30) + require.True(t, ok) + assert.Equal(t, uint32(26), v) + + v, ok = it.NextGeq(20) + require.True(t, ok) + assert.Equal(t, uint32(20), v) + + v, ok = it.NextGeq(9) + require.True(t, ok) + assert.Equal(t, uint32(5), v) + + _, ok = it.NextGeq(2) + assert.False(t, ok) + }) + } +} + +func TestBatchIter(t *testing.T) { + for _, impl := range batchFactories { + t.Run(impl.name, func(t *testing.T) { + b := impl.build([]uint32{1, 5, 10, 15, 20}) + it := b.Iter() + + var got []uint32 + for { + v, ok := it.Next() + if !ok { + break + } + got = append(got, v) + } + assert.Equal(t, []uint32{1, 5, 10, 15, 20}, got) + + b = impl.build([]uint32{1, 5, 10, 15, 20}) + it = b.Iter() + v, ok := it.NextGeq(11) + require.True(t, ok) + assert.Equal(t, uint32(15), v) + + b = impl.build([]uint32{1, 5, 10}) + it = b.Iter() + _, ok = it.NextGeq(100) + assert.False(t, ok) + }) + } +} + +func TestBatchReverseIter(t *testing.T) { + for _, impl := range batchFactories { + t.Run(impl.name, func(t *testing.T) { + b := impl.build([]uint32{1, 5, 10, 15, 20}) + it := b.ReverseIter() + + var got []uint32 + for { + v, ok := it.Next() + if !ok { + break + } + got = append(got, v) + } + assert.Equal(t, []uint32{20, 15, 10, 5, 1}, got) + + b = impl.build([]uint32{1, 5, 10, 15, 20}) + it = b.ReverseIter() + v, ok := it.NextGeq(11) + require.True(t, ok) + assert.Equal(t, uint32(10), v) + + b = impl.build([]uint32{1, 5, 10}) + it = b.ReverseIter() + _, ok = it.NextGeq(0) + assert.False(t, ok) + }) + } +} diff --git a/node/node.go b/node/node.go index 5334f2184..98cf21e3c 100644 --- a/node/node.go +++ b/node/node.go @@ -14,9 +14,9 @@ type Node interface { type BatchedNode 
interface { fmt.Stringer // NextBatch returns next batch. Returns nil when exhausted. - NextBatch() LIDBatch + NextBatch(need int) LIDBatch // NextBatchGeq returns next batch (LIDs >= minLID). Returns nil when exhausted. - NextBatchGeq(nextLID LID) LIDBatch + NextBatchGeq(need int, nextLID LID) LIDBatch } type Sourced interface { diff --git a/frac/sealed/sealing/sealer.go b/sealing/sealer.go similarity index 61% rename from frac/sealed/sealing/sealer.go rename to sealing/sealer.go index 57863d825..0c21ffc44 100644 --- a/frac/sealed/sealing/sealer.go +++ b/sealing/sealer.go @@ -2,41 +2,19 @@ package sealing import ( "errors" - "iter" "os" "path/filepath" "github.com/ozontech/seq-db/consts" "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/frac/sealed" - "github.com/ozontech/seq-db/seq" + "github.com/ozontech/seq-db/indexwriter" "github.com/ozontech/seq-db/util" ) -type ( - DocLocation = util.Pair[seq.ID, seq.DocPos] - TokenPosting = util.Pair[[]byte, []uint32] -) - -// Source interface defines the contract for data sources that can be sealed. -// Provides access to all necessary data components for index creation -type Source interface { - // Info returns metadata describing this source. - Info() *common.Info - - // ID returns an iterator over stored document identifiers paired with - // their positions, in descending [seq.ID] order. - ID() iter.Seq2[DocLocation, error] - - // BlockOffsets returns byte offsets to each document block - // within this source's `.docs` file. - BlockOffsets() []uint64 - - // TokenTriplet iterates over fields in lexicographic order. - // For each field, it yields tokens (lexicographically sorted) - // paired with the local document ID list for that token. - TokenTriplet() iter.Seq2[string, iter.Seq2[TokenPosting, error]] -} +// Source defines the contract for data sources that can be sealed. +// Provides access to all necessary data components for index creation. 
+type Source = indexwriter.Source // Seal writes five index files (.info, .token, .offsets, .id, .lid) for the fraction // and returns PreloadedData for fast initialization of the sealed fraction. @@ -47,12 +25,11 @@ func Seal(src Source, params common.SealParams) (*sealed.PreloadedData, error) { return nil, errors.New("sealing of an empty active fraction is not supported") } - sealer := NewIndexSealer(params) - + w := indexwriter.New(params) if err := createAndWrite( info.Path+consts.OffsetsTmpFileSuffix, info.Path+consts.OffsetsFileSuffix, - func(f *os.File) error { return sealer.WriteOffsetsFile(f, src) }, + func(f *os.File) error { return w.WriteOffsetsFile(f, src) }, ); err != nil { return nil, err } @@ -60,7 +37,7 @@ func Seal(src Source, params common.SealParams) (*sealed.PreloadedData, error) { if err := createAndWrite( info.Path+consts.IDTmpFileSuffix, info.Path+consts.IDFileSuffix, - func(f *os.File) error { return sealer.WriteIDFile(f, src) }, + func(f *os.File) error { return w.WriteIDFile(f, src) }, ); err != nil { return nil, err } @@ -68,7 +45,7 @@ func Seal(src Source, params common.SealParams) (*sealed.PreloadedData, error) { if err := createAndWriteBoth( info.Path+consts.TokenTmpFileSuffix, info.Path+consts.TokenFileSuffix, info.Path+consts.LIDTmpFileSuffix, info.Path+consts.LIDFileSuffix, - func(tokenF, lidF *os.File) error { return sealer.WriteTokenTriplet(tokenF, lidF, src) }, + func(tokenF, lidF *os.File) error { return w.WriteTokenTriplet(tokenF, lidF, src) }, ); err != nil { return nil, err } @@ -76,7 +53,7 @@ func Seal(src Source, params common.SealParams) (*sealed.PreloadedData, error) { if err := createAndWrite( info.Path+consts.InfoTmpFileSuffix, info.Path+consts.InfoFileSuffix, - func(f *os.File) error { return sealer.WriteInfoFile(f, src) }, + func(f *os.File) error { return w.WriteInfoFile(f, src) }, ); err != nil { return nil, err } @@ -100,13 +77,13 @@ func Seal(src Source, params common.SealParams) (*sealed.PreloadedData, error) { 
} info.IndexOnDisk = totalSize - lidsTable := sealer.LIDsTable() + lidsTable := w.LIDsTable() preloaded := &sealed.PreloadedData{ Info: info, - TokenTable: sealer.TokenTable(), + TokenTable: w.TokenTable(), BlocksData: sealed.BlocksData{ - IDsTable: sealer.IDsTable(), + IDsTable: w.IDsTable(), LIDsTable: &lidsTable, BlocksOffsets: src.BlockOffsets(), }, @@ -123,10 +100,7 @@ func syncAndClose(f *os.File) error { return f.Close() } -func createAndWrite( - tmp, final string, - write func(*os.File) error, -) error { +func createAndWrite(tmp, final string, write func(*os.File) error) error { f, err := os.Create(tmp) if err != nil { return err @@ -140,16 +114,16 @@ func createAndWrite( } func createAndWriteBoth( - tmpa, finala, - tmpb, finalb string, + atmp, afinal, + btmp, bfinal string, write func(*os.File, *os.File) error, ) error { - a, err := os.Create(tmpa) + a, err := os.Create(atmp) if err != nil { return err } - b, err := os.Create(tmpb) + b, err := os.Create(btmp) if err != nil { a.Close() return err @@ -160,9 +134,9 @@ func createAndWriteBoth( return err } - if err := os.Rename(tmpa, finala); err != nil { + if err := os.Rename(atmp, afinal); err != nil { return err } - return os.Rename(tmpb, finalb) + return os.Rename(btmp, bfinal) } diff --git a/seq/seq.go b/seq/seq.go index adae42656..d3557a16e 100644 --- a/seq/seq.go +++ b/seq/seq.go @@ -11,9 +11,13 @@ import ( ) var ( - SystemMID MID = math.MaxUint64 - SystemRID RID = math.MaxUint64 - SystemID ID = ID{SystemMID, SystemRID} + SystemMID MID = math.MaxUint64 + SystemRID RID = math.MaxUint64 + + SystemID ID = ID{SystemMID, SystemRID} + MinID ID = ID{0, 0} + MaxID ID = SystemID + SystemDocPos DocPos = DocPos(0) ) diff --git a/storeapi/store.go b/storeapi/store.go index dd53079e8..8cdd76b0f 100644 --- a/storeapi/store.go +++ b/storeapi/store.go @@ -8,6 +8,7 @@ import ( "go.uber.org/atomic" + "github.com/ozontech/seq-db/compaction" "github.com/ozontech/seq-db/consts" "github.com/ozontech/seq-db/fracmanager" 
"github.com/ozontech/seq-db/logger" @@ -30,7 +31,8 @@ type Store struct { FracManager *fracmanager.FracManager fracManagerStop func() - SkipMaskManager *skipmaskmanager.SkipMaskManager + SkipMaskManager *skipmaskmanager.SkipMaskManager + CompactionExecutor *compaction.Executor isStopped atomic.Bool } @@ -39,6 +41,7 @@ type StoreConfig struct { API APIConfig FracManager fracmanager.Config SkipMaskManagerConfig skipmaskmanager.Config + Compaction compaction.Config } func (c *StoreConfig) setDefaults() error { @@ -66,23 +69,26 @@ func NewStore( } skipMaskManager := skipmaskmanager.New(ctx, c.SkipMaskManagerConfig, skipMaskParams, mappingProvider) - fracManager, stop, err := fracmanager.New(ctx, &c.FracManager, s3cli, skipMaskManager) if err != nil { return nil, fmt.Errorf("loading fractions error: %w", err) } + planner := compaction.NewPlanner(ctx, fracManager, c.Compaction) + executor := compaction.NewExecutor(c.Compaction.Workers, c.FracManager.SealParams, planner) + skipMaskManager.Start(fracManager) return &Store{ Config: c, // We will set grpcAddr later in Start() - grpcAddr: "", - grpcServer: newGRPCServer(c.API, fracManager, mappingProvider), - FracManager: fracManager, - fracManagerStop: stop, - SkipMaskManager: skipMaskManager, - isStopped: atomic.Bool{}, + grpcAddr: "", + grpcServer: newGRPCServer(c.API, fracManager, mappingProvider), + FracManager: fracManager, + fracManagerStop: stop, + SkipMaskManager: skipMaskManager, + CompactionExecutor: executor, + isStopped: atomic.Bool{}, }, nil } @@ -107,6 +113,7 @@ func (s *Store) Stop() { s.grpcServer.Stop(ctx) s.fracManagerStop() s.SkipMaskManager.Stop() + s.CompactionExecutor.Stop() logger.Info("store stopped") }