From 8f8da4e1fc1eca2f9a0e1c966ba6642de4fd8ee0 Mon Sep 17 00:00:00 2001 From: Elie Gambache Date: Thu, 25 Jun 2026 20:43:45 +0300 Subject: [PATCH] feat(search): dense semantic search with v5 embedding model - Add SeforimEmbedder (ONNX v5 int8) + HebrewV5Normalizer (final-letter folding) - HybridSearchEngine (BM25 + dense, RRF) + VectorSearcher over a fused Lucene index - BuildVectorIndex / fused KnnFloatVectorField indexing in the generator - Bundle + fetch the v5 model (PackageArtifacts, DownloadEmbedModel -> v5-int8) - CI: free disk space on the runner before the build --- generator/packaging/build.gradle.kts | 27 ++- .../packaging/DownloadEmbedModel.kt | 84 ++++++++ .../packaging/PackageArtifacts.kt | 25 ++- generator/searchindex/build.gradle.kts | 7 +- .../searchindex/BuildLuceneIndex.kt | 37 +++- .../lucene/LuceneTextIndexWriter.kt | 12 +- gradle/libs.versions.toml | 4 + search/build.gradle.kts | 5 + .../search/HebrewV5Normalizer.kt | 27 +++ .../search/HybridSearchEngine.kt | 187 ++++++++++++++++++ .../seforimlibrary/search/SeforimEmbedder.kt | 146 ++++++++++++++ .../seforimlibrary/search/VectorSearcher.kt | 59 ++++++ .../search/SeforimEmbedderTest.kt | 39 ++++ 13 files changed, 653 insertions(+), 6 deletions(-) create mode 100644 generator/packaging/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/packaging/DownloadEmbedModel.kt create mode 100644 search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/HebrewV5Normalizer.kt create mode 100644 search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/HybridSearchEngine.kt create mode 100644 search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/SeforimEmbedder.kt create mode 100644 search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/VectorSearcher.kt create mode 100644 search/src/jvmTest/kotlin/io/github/kdroidfilter/seforimlibrary/search/SeforimEmbedderTest.kt diff --git a/generator/packaging/build.gradle.kts 
b/generator/packaging/build.gradle.kts index 698c0fe8..e32d2481 100644 --- a/generator/packaging/build.gradle.kts +++ b/generator/packaging/build.gradle.kts @@ -80,12 +80,37 @@ tasks.register("downloadLexicalDb") { jvmArgs = listOf("-Xmx512m") } +// Download the dense embedding model (int8 ONNX + tokenizer) from the private +// SeforimEmbedding v5-int8 release next to seforim.db, so it gets bundled. +// Needs GITHUB_TOKEN/GH_TOKEN; fails soft (bundle without model) if unavailable. +// Usage: +// GITHUB_TOKEN=… ./gradlew :packaging:downloadEmbedModel +tasks.register("downloadEmbedModel") { + group = "application" + description = "Download int8 embedding model + tokenizer from the private v4-int8 release next to seforim.db." + + dependsOn("jvmJar") + mainClass.set("io.github.kdroidfilter.seforimlibrary.packaging.DownloadEmbedModelKt") + classpath = files(tasks.named("jvmJar")) + configurations.getByName("jvmRuntimeClasspath") + + if (project.hasProperty("seforimDb")) { + systemProperty("seforimDb", project.property("seforimDb") as String) + } else if (System.getenv("SEFORIM_DB") != null) { + systemProperty("seforimDb", System.getenv("SEFORIM_DB")) + } else { + val defaultDbPath = rootProject.layout.buildDirectory.file("seforim.db").get().asFile.absolutePath + systemProperty("seforimDb", defaultDbPath) + } + + jvmArgs = listOf("-Xmx256m") +} + // Package DB + Lucene indexes into single tar.zst and split tasks.register("packageArtifacts") { group = "application" description = "Create seforim_bundle.tar.zst (DB + indexes + release info) with zstd and split into ~1.9GiB parts." 
- dependsOn("jvmJar", "writeReleaseInfo", "downloadLexicalDb") + dependsOn("jvmJar", "writeReleaseInfo", "downloadLexicalDb", "downloadEmbedModel") mainClass.set("io.github.kdroidfilter.seforimlibrary.packaging.PackageArtifactsKt") classpath = files(tasks.named("jvmJar")) + configurations.getByName("jvmRuntimeClasspath") diff --git a/generator/packaging/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/packaging/DownloadEmbedModel.kt b/generator/packaging/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/packaging/DownloadEmbedModel.kt new file mode 100644 index 00000000..008a2d5b --- /dev/null +++ b/generator/packaging/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/packaging/DownloadEmbedModel.kt @@ -0,0 +1,84 @@ +package io.github.kdroidfilter.seforimlibrary.packaging + +import co.touchlab.kermit.Logger +import co.touchlab.kermit.Severity +import io.github.kdroidfilter.seforimlibrary.common.OptimizedHttpClient +import java.nio.file.Files +import java.nio.file.Path +import java.nio.file.Paths + +private const val RELEASE_API = + "https://api.github.com/repos/kdroidFilter/SeforimEmbedding/releases/tags/v5-int8" +private const val USER_AGENT = "SeforimLibrary-DownloadEmbedModel/1.0" + +// Runtime dense-search artifacts pulled from the private SeforimEmbedding release. +private val ASSETS = listOf("seforim-embed-v5-int8.onnx", "tokenizer.json") + +/** + * Download the int8 embedding model + tokenizer from the private `v5-int8` release + * and place them next to `seforim.db` so [PackageArtifacts] bundles them. + * + * Requires a token with read access to the private repo via `GITHUB_TOKEN` / `GH_TOKEN` + * (consumed by [OptimizedHttpClient]). On any failure (no token, network, missing + * asset) it logs a warning and exits 0 so packaging proceeds WITHOUT the model + * (the app then degrades to lexical-only search). 
+ * + * Usage: + * ./gradlew :packaging:downloadEmbedModel + * ./gradlew :packaging:downloadEmbedModel -PseforimDb=/path/to/seforim.db + */ +fun main(args: Array) { + Logger.setMinSeverity(Severity.Info) + val logger = Logger.withTag("DownloadEmbedModel") + + val dbPath = resolveDbPath(args) + + val present = ASSETS.all { name -> + val p = dbPath.resolveSibling(name) + Files.exists(p) && Files.isRegularFile(p) && Files.size(p) > 0 + } + if (present) { + logger.i { "Embedding model already present next to ${dbPath.fileName}; skipping download" } + return + } + + runCatching { + val json = OptimizedHttpClient.fetchJson(RELEASE_API, USER_AGENT, logger) + for (name in ASSETS) { + val out = dbPath.resolveSibling(name) + if (Files.exists(out) && Files.size(out) > 0) { + logger.i { "Using existing $name" } + continue + } + val url = assetApiUrl(json, name) + ?: throw IllegalStateException("Asset '$name' not found in v4-int8 release") + Files.createDirectories(out.parent) + val tmp = out.resolveSibling("${out.fileName}.part") + Files.deleteIfExists(tmp) + // Asset API url + Accept: octet-stream + token -> works for private repos. + OptimizedHttpClient.downloadFile(url, tmp, USER_AGENT, logger, "Downloading $name") + Files.deleteIfExists(out) + Files.move(tmp, out) + logger.i { "Downloaded $name -> ${out.toAbsolutePath()}" } + } + }.onFailure { + logger.w(it) { "Could not download the embedding model; bundle will omit it (dense search disabled)" } + } +} + +private fun resolveDbPath(args: Array): Path { + val dbPathStr = args.getOrNull(0) + ?: System.getProperty("seforimDb") + ?: System.getenv("SEFORIM_DB") + ?: Paths.get("build", "seforim.db").toString() + return Paths.get(dbPathStr) +} + +/** Extract the GitHub *asset API* url for a given asset name from the release JSON. */ +private fun assetApiUrl(json: String, name: String): String? 
{ + val re = Regex( + "\\{\"url\":\"(https://api\\.github\\.com/[^\"]+?/assets/\\d+)\"[^{}]*?\"name\":\"" + + Regex.escape(name) + "\"", + ) + return re.find(json)?.groupValues?.get(1) +} diff --git a/generator/packaging/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/packaging/PackageArtifacts.kt b/generator/packaging/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/packaging/PackageArtifacts.kt index cb51569d..c553fe25 100644 --- a/generator/packaging/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/packaging/PackageArtifacts.kt +++ b/generator/packaging/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/packaging/PackageArtifacts.kt @@ -65,13 +65,18 @@ fun main(args: Array) { // Resolve precomputed catalog next to the DB val catalogPath: Path = dbPath.resolveSibling("catalog.pb") - + // Resolve release info file next to the DB val releaseInfoPath: Path = dbPath.resolveSibling("release_info.txt") // Resolve lexical DB next to the DB val lexicalDbPath: Path = dbPath.resolveSibling("lexical.db") + // Resolve the dense embedding model (int8 ONNX) + tokenizer next to the DB. + // Bundled so the app gets dense search out of the box; absent -> lexical only. 
+ val embedModelPath: Path = dbPath.resolveSibling("seforim-embed-v5-int8.onnx") + val embedTokenizerPath: Path = dbPath.resolveSibling("tokenizer.json") + if (!textIndexDir.toFile().isDirectory) { logger.w { "Lucene text index directory missing: $textIndexDir (will skip)" } } @@ -126,6 +131,8 @@ fun main(args: Array) { " - Catalog: $catalogPath\n" + " - Release info: $releaseInfoPath\n" + " - Lexical DB: $lexicalDbPath\n" + + " - Embed model: $embedModelPath\n" + + " - Embed tokenizer: $embedTokenizerPath\n" + " - Text index: $textIndexDir\n" + " - Lookup index: $lookupIndexDir\n" + " -> Bundle .tar.zst: $bundleOutputPath\n" + @@ -171,6 +178,20 @@ fun main(args: Array) { logger.w { "Lexical DB missing: $lexicalDbPath (skipped)" } } + // Add the dense embedding model + tokenizer if available + if (embedModelPath.exists()) { + addFileToTar(tar, embedModelPath, embedModelPath.fileName.toString(), logger) + logger.i { "Added embedding model to bundle" } + } else { + logger.w { "Embedding model missing: $embedModelPath (skipped, dense search disabled)" } + } + if (embedTokenizerPath.exists()) { + addFileToTar(tar, embedTokenizerPath, embedTokenizerPath.fileName.toString(), logger) + logger.i { "Added embedding tokenizer to bundle" } + } else { + logger.w { "Embedding tokenizer missing: $embedTokenizerPath (skipped)" } + } + // Add the precomputed catalog if available if (haveCatalog) { addFileToTar(tar, catalogPath, catalogPath.fileName.toString(), logger) @@ -178,7 +199,7 @@ fun main(args: Array) { } else { logger.w { "Precomputed catalog missing: $catalogPath (skipped)" } } - + // Add the release info file if available if (haveReleaseInfo) { addFileToTar(tar, releaseInfoPath, releaseInfoPath.fileName.toString(), logger) diff --git a/generator/searchindex/build.gradle.kts b/generator/searchindex/build.gradle.kts index d723f4e4..f5284199 100644 --- a/generator/searchindex/build.gradle.kts +++ b/generator/searchindex/build.gradle.kts @@ -64,6 +64,12 @@ 
tasks.register("buildLuceneIndexDefault") { systemProperty("inMemoryDb", "true") } + // Optional: -PvectorsBin=/path (dir with ids.i64+vecs.f32+meta.txt) → SINGLE + // fused index (text + dense KnnFloatVectorField per line). + (project.findProperty("vectorsBin") as String?)?.let { systemProperty("vectorsBin", it) } + // Optional: -PindexThreads=N to cap concurrent indexing threads (lower = less RAM). + (project.findProperty("indexThreads") as String?)?.let { systemProperty("indexThreads", it) } + jvmArgs = listOf( "-Xmx$generatorHeap", "-XX:+UseG1GC", @@ -72,4 +78,3 @@ tasks.register("buildLuceneIndexDefault") { "--add-modules=jdk.incubator.vector" ) } - diff --git a/generator/searchindex/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/searchindex/BuildLuceneIndex.kt b/generator/searchindex/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/searchindex/BuildLuceneIndex.kt index 2b96ca91..bf926120 100644 --- a/generator/searchindex/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/searchindex/BuildLuceneIndex.kt +++ b/generator/searchindex/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/searchindex/BuildLuceneIndex.kt @@ -22,7 +22,10 @@ import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper import org.apache.lucene.analysis.ngram.NGramTokenFilter import org.jsoup.Jsoup import org.jsoup.safety.Safelist +import java.io.DataInputStream import java.io.File +import java.nio.ByteBuffer +import java.nio.ByteOrder import java.nio.file.Files import java.nio.file.Path import java.nio.file.Paths @@ -58,6 +61,12 @@ fun main() = runBlocking { runCatching { Files.createDirectories(indexDir) } runCatching { Files.createDirectories(lookupDir) } + // Optional dense embeddings -> SINGLE fused index. -DvectorsBin points to a dir + // with ids.i64 + vecs.f32 + meta.txt (produced by embed_corpus_bin.py against this + // same DB). Each line then also gets a KnnFloatVectorField in the text index. 
+ val vectorProvider: ((Long) -> FloatArray?)? = + System.getProperty("vectorsBin")?.let { loadVectorProvider(Paths.get(it), logger) } + // Open repository (prefer in-memory for faster reads) val useMemoryDb = (System.getProperty("inMemoryDb") ?: "true") != "false" // Use a shared in-memory DB so multiple connections can read concurrently when multithreading @@ -115,7 +124,7 @@ fun main() = runBlocking { ) ) - LuceneTextIndexWriter(indexDir, analyzer = analyzer).use { writer -> + LuceneTextIndexWriter(indexDir, analyzer = analyzer, vectorProvider = vectorProvider).use { writer -> LuceneLookupIndexWriter(lookupDir, analyzer = analyzer).use { lookup -> val books = repo.getAllBooks() val indexThreads = (System.getProperty("indexThreads") ?: Runtime.getRuntime().availableProcessors().toString()).toInt().coerceAtLeast(1) @@ -300,3 +309,29 @@ private fun sanitizeAcronymTerm(raw: String): String { if (raw.isEmpty()) return "" return normalizePostHtmlForIndex(raw) } + +/** + * Loads dense embeddings produced by embed_corpus_bin.py (ids.i64 + vecs.f32 + meta.txt, + * little-endian) into RAM and returns a thread-safe lineId -> vector lookup, used by + * LuceneTextIndexWriter to attach a KnnFloatVectorField per line (single fused index). + */ +private fun loadVectorProvider(dir: Path, logger: co.touchlab.kermit.Logger): ((Long) -> FloatArray?) 
{ + val meta = File(dir.toFile(), "meta.txt").readText().trim().split(" ") + val n = meta[0].toInt() + val dim = meta[1].toInt() + logger.i { "Loading $n dense vectors (dim $dim) from $dir for the fused index" } + val idsBuf = ByteBuffer.wrap(File(dir.toFile(), "ids.i64").readBytes()).order(ByteOrder.LITTLE_ENDIAN) + val rowOf = HashMap(n * 2) + for (i in 0 until n) rowOf[idsBuf.long] = i + val vecs = FloatArray(n * dim) + DataInputStream(File(dir.toFile(), "vecs.f32").inputStream().buffered(1 shl 20)).use { din -> + val rec = ByteArray(dim * 4) + val bb = ByteBuffer.wrap(rec).order(ByteOrder.LITTLE_ENDIAN) + for (i in 0 until n) { + din.readFully(rec); bb.rewind() + for (j in 0 until dim) vecs[i * dim + j] = bb.float + } + } + logger.i { "Dense vectors loaded (${(n.toLong() * dim * 4L) / 1_000_000} MB in RAM)" } + return { lineId -> rowOf[lineId]?.let { row -> vecs.copyOfRange(row * dim, row * dim + dim) } } +} diff --git a/generator/searchindex/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/searchindex/lucene/LuceneTextIndexWriter.kt b/generator/searchindex/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/searchindex/lucene/LuceneTextIndexWriter.kt index 0b841112..00b2c63c 100644 --- a/generator/searchindex/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/searchindex/lucene/LuceneTextIndexWriter.kt +++ b/generator/searchindex/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/searchindex/lucene/LuceneTextIndexWriter.kt @@ -6,6 +6,7 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer import org.apache.lucene.document.Document import org.apache.lucene.document.Field import org.apache.lucene.document.IntPoint +import org.apache.lucene.document.KnnFloatVectorField import org.apache.lucene.document.StoredField import org.apache.lucene.document.StringField import org.apache.lucene.document.TextField @@ -24,12 +25,16 @@ class LuceneTextIndexWriter( indexDir: Path, analyzer: Analyzer = StandardAnalyzer(), private val 
indexHebrewField: Boolean = false, - private val indexPrimaryText: Boolean = true + private val indexPrimaryText: Boolean = true, + // Optional dense embeddings: if provided, each line doc also gets a + // KnnFloatVectorField("vec", COSINE) -> SINGLE index holding text + vectors. + private val vectorProvider: ((Long) -> FloatArray?)? = null, ) : TextIndexWriter { companion object Fields { const val FIELD_TYPE = "type" const val TYPE_LINE = "line" const val TYPE_BOOK_TITLE = "book_title" + const val FIELD_VEC = "vec" const val FIELD_BOOK_ID = "book_id" const val FIELD_CATEGORY_ID = "category_id" @@ -106,6 +111,11 @@ class LuceneTextIndexWriter( // Index 4-gram tokens for substring search (per-field analyzer applies NGram filter) add(TextField(FIELD_TEXT_NG4, normalizedText, Field.Store.NO)) // rawPlainText is no longer stored - snippet source is fetched from DB at query time + + // Dense embedding (single fused index): attach the line's vector if available. + vectorProvider?.invoke(lineId)?.let { vec -> + add(KnnFloatVectorField(FIELD_VEC, vec, org.apache.lucene.index.VectorSimilarityFunction.COSINE)) + } } writer.addDocument(doc) } diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index dc3ebdfc..65f06001 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -5,6 +5,8 @@ kotlin = "2.3.20" agp = "8.12.3" jvmToolchain = "25" lucene = "10.4.0" +onnxruntime = "1.20.0" +djlHuggingface = "0.30.0" maven-publish = "0.36.0" kotlinx-coroutines = "1.10.2" kotlinx-serialization = "1.10.0" @@ -34,6 +36,8 @@ lucene-highlighter = { module = "org.apache.lucene:lucene-highlighter", version. 
lucene-queryparser = { module = "org.apache.lucene:lucene-queryparser", version.ref = "lucene" } lucene-analysis-common = { module = "org.apache.lucene:lucene-analysis-common", version.ref = "lucene" } lucene-core = { module = "org.apache.lucene:lucene-core", version.ref = "lucene" } +onnxruntime = { module = "com.microsoft.onnxruntime:onnxruntime", version.ref = "onnxruntime" } +djl-huggingface-tokenizers = { module = "ai.djl.huggingface:tokenizers", version.ref = "djlHuggingface" } sqlDelight-driver-sqlite = { module = "app.cash.sqldelight:sqlite-driver", version.ref = "sqlDelight" } sqlDelight-driver-android = { module = "app.cash.sqldelight:android-driver", version.ref = "sqlDelight" } sqlDelight-driver-native = { module = "app.cash.sqldelight:native-driver", version.ref = "sqlDelight" } diff --git a/search/build.gradle.kts b/search/build.gradle.kts index 52c283fb..4a2e4111 100644 --- a/search/build.gradle.kts +++ b/search/build.gradle.kts @@ -20,6 +20,11 @@ kotlin { implementation(libs.sqlDelight.driver.sqlite) implementation(libs.kermit) implementation(libs.jsoup) + // Dense semantic search: ONNX Runtime (query embedding) + HuggingFace tokenizer. + // Stock Maven `onnxruntime` is CPU-only on desktop JVM (no DirectML/CoreML/XNNPACK + // native — those need a custom build; CUDA needs the separate onnxruntime_gpu). 
+ implementation(libs.onnxruntime) + implementation(libs.djl.huggingface.tokenizers) } jvmTest.dependencies { diff --git a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/HebrewV5Normalizer.kt b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/HebrewV5Normalizer.kt new file mode 100644 index 00000000..304d725b --- /dev/null +++ b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/HebrewV5Normalizer.kt @@ -0,0 +1,27 @@ +package io.github.kdroidfilter.seforimlibrary.search + +import org.jsoup.parser.Parser + +/** + * Exact JVM port of `normalize_he_v5.py`: strip HTML/nikud/teamim and non-Hebrew + * characters, then fold final (sofit) letters (ך→כ ם→מ ן→נ ף→פ ץ→צ). It MUST be + * applied to any text before embedding it with a v5 model (both indexed lines and + * query), so vectors stay comparable. Pair it with a v5 model + a v5-built index. + */ +object HebrewV5Normalizer { + private val TAG = Regex("<[^>]+>") + private val MARKS = Regex("[֑-ׇ]") + private val DROP = Regex("[^א-ת0-9\\s.,:;!?()\\[\\]\"'\\-/׳״]") + private val WS = Regex("\\s+") + // Final (sofit) -> base letter folding. 
+ private val FINALS = mapOf('ך' to 'כ', 'ם' to 'מ', 'ן' to 'נ', 'ף' to 'פ', 'ץ' to 'צ') + + fun clean(text: String): String { + var s = TAG.replace(text, " ") + s = Parser.unescapeEntities(s, false) + s = MARKS.replace(s, "") + s = DROP.replace(s, " ") + s = WS.replace(s, " ").trim() + return buildString(s.length) { for (c in s) append(FINALS[c] ?: c) } + } +} diff --git a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/HybridSearchEngine.kt b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/HybridSearchEngine.kt new file mode 100644 index 00000000..59deb5cf --- /dev/null +++ b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/HybridSearchEngine.kt @@ -0,0 +1,187 @@ +package io.github.kdroidfilter.seforimlibrary.search + +import co.touchlab.kermit.Logger +import kotlinx.coroutines.Dispatchers +import kotlinx.coroutines.sync.Mutex +import kotlinx.coroutines.sync.withLock +import kotlinx.coroutines.withContext +import java.nio.file.Files +import java.nio.file.Path + +/** + * Hybrid search = lexical (BM25 + MagicDictionary, [LuceneSearchEngine]) fused with + * dense semantic search (v5 embedding + [VectorSearcher]) via Reciprocal Rank Fusion. + * + * RRF(doc) = Σ 1/(K + rank_in_list) + * + * Implements [SearchEngine] so the app uses it transparently: [openSession] runs the + * fused search (the session pages over the in-memory fused list); all other methods + * (facets, snippets, title prefix) delegate to the lexical engine. + * + * Falls back to pure lexical when: the model/vector index is absent, OR the filters + * are not supported by the dense index (categoryFilter / lineIds). Book / base-book + * filters ARE supported by both paths. + * + * Dense-only hits (lines the lexical path missed) are turned into full [LineHit]s via + * [resolveLine], supplied by the caller (fetches title + text + snippet by line id). 
+ */ +class HybridSearchEngine( + private val lexical: LuceneSearchEngine, + private val modelDir: Path?, + private val indexDir: Path, + private val rrfK: Int = 60, + private val candidates: Int = 150, + private val resolveLine: suspend (lineId: Long, bookId: Long, query: String) -> LineHit?, +) : SearchEngine { + + private val logger = Logger.withTag("HybridSearch") + + // The embedder (heavy OrtSession) + vector searcher are loaded LAZILY on the first + // dense search, off the main thread (see [ensureDense]) — so the first query just + // shows the normal search spinner instead of freezing the UI while the model loads. + @Volatile private var embedder: SeforimEmbedder? = null + @Volatile private var vectorSearcher: VectorSearcher? = null + @Volatile private var denseTried = false + private val denseMutex = Mutex() + + // Cheap, no-load check: are the model + vector index even present? Decides whether to + // take the dense path (then load lazily); the actual OrtSession is built in fuse(). + private val denseConfigured: Boolean = + indexDir != null && Files.isDirectory(indexDir) && SeforimEmbedder.isAvailable(modelDir) + + val denseEnabled: Boolean get() = embedder != null && vectorSearcher != null + + /** Load the embedder + vector searcher once, on a background thread. Idempotent. 
*/ + private suspend fun ensureDense() { + if (denseTried) return + denseMutex.withLock { + if (denseTried) return + withContext(Dispatchers.IO) { + val emb = SeforimEmbedder.tryLoad(modelDir) + val vs = if (emb != null && indexDir != null && Files.isDirectory(indexDir)) { + runCatching { VectorSearcher(indexDir) }.getOrNull() + } else null + if (vs != null) { + embedder = emb; vectorSearcher = vs + logger.i { "dense ready (vector index $indexDir)" } + } else { + runCatching { emb?.close() } + logger.i { "dense unavailable -> lexical only" } + } + } + denseTried = true + } + } + + override fun openSession( + query: String, + near: Int, + bookFilter: Long?, + categoryFilter: Long?, + bookIds: Collection?, + lineIds: Collection?, + baseBookOnly: Boolean, + ): SearchSession? { + if (query.isBlank()) return null + // Dense index supports book / base-book filters only. For category/line filters + // (or when dense isn't configured), use pure lexical to stay correct. The model + // itself is loaded lazily in fuse() — denseConfigured is a cheap no-load check. + val denseOk = denseConfigured && categoryFilter == null && lineIds == null + if (!denseOk) { + return lexical.openSession(query, near, bookFilter, categoryFilter, bookIds, lineIds, baseBookOnly) + } + val effBookIds = bookIds ?: bookFilter?.let { listOf(it) } + return HybridSession(query, near, effBookIds, baseBookOnly) + } + + override fun searchBooksByTitlePrefix(query: String, limit: Int): List = + lexical.searchBooksByTitlePrefix(query, limit) + + override fun buildSnippet(rawText: String, query: String, near: Int): String = + lexical.buildSnippet(rawText, query, near) + + override fun buildHighlightTerms(query: String): List = + lexical.buildHighlightTerms(query) + + override fun computeFacets( + query: String, near: Int, bookFilter: Long?, categoryFilter: Long?, + bookIds: Collection?, lineIds: Collection?, baseBookOnly: Boolean, + ): SearchFacets? 
= lexical.computeFacets(query, near, bookFilter, categoryFilter, bookIds, lineIds, baseBookOnly) + + override fun close() { + runCatching { vectorSearcher?.close() } + runCatching { embedder?.close() } + runCatching { lexical.close() } + } + + /** Lexical page + dense KNN, fused by RRF, resolved to full LineHits. */ + private suspend fun fuse(query: String, near: Int, bookIds: Collection?, baseOnly: Boolean): List { + ensureDense() // first call loads the model off-main (covered by the search spinner) + val lexHits = lexical.openSession(query, near = near, bookIds = bookIds, baseBookOnly = baseOnly) + ?.use { it.nextPage(candidates)?.hits ?: emptyList() } ?: emptyList() + + // Dense failed to load -> lexical-only result (still correct, just no semantic recall). + val emb = embedder + val vs = vectorSearcher + if (emb == null || vs == null) return lexHits + + val denseHits = withContext(Dispatchers.Default) { + val qVec = emb.embed(query) + vs.search(qVec, candidates, baseOnly, bookIds) + } + + val rrf = HashMap() + val bookOf = HashMap() + lexHits.forEachIndexed { rank, h -> + rrf.merge(h.lineId, 1.0 / (rrfK + rank + 1), Double::plus); bookOf[h.lineId] = h.bookId + } + denseHits.forEachIndexed { rank, h -> + rrf.merge(h.lineId, 1.0 / (rrfK + rank + 1), Double::plus); bookOf.putIfAbsent(h.lineId, h.bookId) + } + val lexById = lexHits.associateBy { it.lineId } + val ordered = rrf.entries.sortedByDescending { it.value } + val out = ArrayList(ordered.size) + for ((lineId, score) in ordered) { + val hit = lexById[lineId] ?: resolveLine(lineId, bookOf[lineId] ?: -1L, query) + if (hit != null) out += hit.copy(score = score.toFloat()) + } + return out + } + + private inner class HybridSession( + private val query: String, + private val near: Int, + private val bookIds: Collection?, + private val baseOnly: Boolean, + ) : SearchSession { + private var fused: List? = null + private var offset = 0 + + override suspend fun nextPage(limit: Int): SearchPage? 
{ + val all = fused ?: fuse(query, near, bookIds, baseOnly).also { fused = it } + if (offset >= all.size) return null + val end = minOf(offset + limit, all.size) + val slice = all.subList(offset, end) + offset = end + return SearchPage(hits = slice, totalHits = all.size.toLong(), isLastPage = offset >= all.size) + } + + override fun close() {} + } + + companion object { + private val logger = Logger.withTag("HybridSearch") + + fun create( + lexical: LuceneSearchEngine, + indexDir: Path, + modelDir: Path? = null, + resolveLine: suspend (Long, Long, String) -> LineHit?, + ): HybridSearchEngine { + // ONE index: dense vectors live in the SAME Lucene index as the text + // (seforim.db.lucene) — no separate vector index. The embedder + searcher + // load lazily on the first dense search (no UI freeze). + return HybridSearchEngine(lexical, modelDir, indexDir, resolveLine = resolveLine) + } + } +} diff --git a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/SeforimEmbedder.kt b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/SeforimEmbedder.kt new file mode 100644 index 00000000..a72738ad --- /dev/null +++ b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/SeforimEmbedder.kt @@ -0,0 +1,146 @@ +package io.github.kdroidfilter.seforimlibrary.search + +import ai.djl.huggingface.tokenizers.HuggingFaceTokenizer +import ai.onnxruntime.OnnxTensor +import ai.onnxruntime.OrtEnvironment +import ai.onnxruntime.OrtSession +import co.touchlab.kermit.Logger +import java.io.Closeable +import java.nio.file.Files +import java.nio.file.Path + +/** + * Produces L2-normalized 384-d sentence embeddings for Hebrew/Aramaic text on the JVM, + * using the v5 model trained in the SeforimEmbedding project (ONNX export). + * + * The ONNX graph bakes in pooling + projection + L2 normalization, so [embed] returns + * a vector ready for a Lucene `KnnFloatVectorField` (cosine). 
Query text is normalized + * with [HebrewV5Normalizer] to match the training distribution. + * + * Model artifacts (from the SeforimEmbedding release): + * - `seforim-embed-v5-int8.onnx` (the model) + * - `tokenizer.json` (the matching tokenizer) + * Place both in a directory and point to it via `-DseforimEmbedModelDir=…`, + * the `SEFORIM_EMBED_MODEL` env var, or one of the default candidate locations. + * If not found, [tryLoad] returns null and dense search is simply disabled. + */ +class SeforimEmbedder private constructor( + onnxModel: Path, + tokenizerJson: Path, + private val maxLen: Int = 128, +) : Closeable { + + val dim: Int = 384 + + private val tokenizer: HuggingFaceTokenizer = HuggingFaceTokenizer.newInstance(tokenizerJson) + private val env: OrtEnvironment = OrtEnvironment.getEnvironment() + private val session: OrtSession = openSession(env, onnxModel) + + // The stock Maven `onnxruntime` artifact is CPU-only on desktop JVM. Hardware EPs only + // work if a capable native is bundled: CUDA (swap to the onnxruntime_gpu artifact + + // system CUDA/cuDNN, NVIDIA), or DirectML/CoreML (custom ORT build). They're opt-in via + // -DseforimEmbedGpu=true / SEFORIM_EMBED_GPU=true; otherwise an optimized CPU session + // (full graph opt + all cores). int8 is already CPU-friendly. 
+    /**
+     * Opens the ONNX Runtime session for [model].
+     *
+     * GPU execution providers are opt-in via the `seforimEmbedGpu` system property or the
+     * `SEFORIM_EMBED_GPU` environment variable. When requested, CUDA, DirectML and CoreML are
+     * tried in that order and the first provider that yields a session wins. If none works,
+     * or GPU was not requested, a CPU session is created.
+     *
+     * Session options are closed only when session creation fails: ONNX Runtime documents that
+     * `SessionOptions` must stay open for as long as a session created from them is in use.
+     */
+    private fun openSession(env: OrtEnvironment, model: Path): OrtSession {
+        val wantGpu = System.getProperty("seforimEmbedGpu")?.toBoolean() == true ||
+            System.getenv("SEFORIM_EMBED_GPU")?.toBoolean() == true
+        if (wantGpu) {
+            val eps = listOf<Pair<String, (OrtSession.SessionOptions) -> Unit>>(
+                "CUDA" to { it.addCUDA(0) },
+                "DirectML" to { it.addDirectML(0) },
+                "CoreML" to { it.addCoreML() },
+            )
+            for ((name, add) in eps) {
+                val opts = OrtSession.SessionOptions()
+                val gpuSession = runCatching {
+                    opts.setOptimizationLevel(OrtSession.SessionOptions.OptLevel.ALL_OPT)
+                    add(opts)
+                    env.createSession(model.toString(), opts)
+                }.onFailure { cause ->
+                    // Provider missing from this build or unusable on this machine:
+                    // release its native options before trying the next one.
+                    logger.d(cause) { "ONNX EP $name unavailable" }
+                    runCatching { opts.close() }
+                }.getOrNull()
+                if (gpuSession != null) {
+                    logger.i { "ONNX EP: $name (GPU)" }
+                    return gpuSession
+                }
+            }
+            logger.i { "No GPU EP available in this build; using CPU" }
+        }
+        val cpuOpts = OrtSession.SessionOptions().apply {
+            setOptimizationLevel(OrtSession.SessionOptions.OptLevel.ALL_OPT)
+            // Best effort: keep the runtime's default threading if this cannot be set.
+            runCatching { setIntraOpNumThreads(Runtime.getRuntime().availableProcessors()) }
+        }
+        return try {
+            env.createSession(model.toString(), cpuOpts)
+        } catch (t: Throwable) {
+            runCatching { cpuOpts.close() }
+            throw t
+        }
+    }
+
+    // v5 models were trained on final-folded text; queries must be normalized
+    // the same way so query and indexed vectors stay comparable.
+    private val normalize: (String) -> String = HebrewV5Normalizer::clean
+
+    /**
+     * Embeds a single text.
+     *
+     * The text is passed through [normalize], tokenized, and truncated to at most `maxLen`
+     * tokens; the first row of the model's first output is returned.
+     *
+     * NOTE(review): callers treat the result as a normalized float[dim]; nothing in this
+     * method normalizes it, so presumably the model graph does — confirm against the export.
+     */
+    fun embed(text: String): FloatArray {
+        val enc = tokenizer.encode(normalize(text))
+        val allIds = enc.ids
+        val tooLong = allIds.size > maxLen
+        val ids = if (tooLong) allIds.copyOf(maxLen) else allIds
+        val mask = if (tooLong) enc.attentionMask.copyOf(maxLen) else enc.attentionMask
+        // Each array is wrapped in an outer array, i.e. a batch holding this single text.
+        return OnnxTensor.createTensor(env, arrayOf(ids)).use { idsT ->
+            OnnxTensor.createTensor(env, arrayOf(mask)).use { maskT ->
+                session.run(mapOf("input_ids" to idsT, "attention_mask" to maskT)).use { res ->
+                    @Suppress("UNCHECKED_CAST")
+                    val rows = res[0].value as Array<FloatArray>
+                    rows[0]
+                }
+            }
+        }
+    }
+
+    /** Releases the session, the environment and the tokenizer, even if one of them fails to close. */
+    override fun close() {
+        try {
+            session.close()
+        } finally {
+            try {
+                env.close()
+            } finally {
+                tokenizer.close()
+            }
+        }
+    }
+
+    companion object {
+        private val logger = Logger.withTag("SeforimEmbedder")
+
+        /**
+         * Directories probed for a model, highest priority first: the explicit argument, the
+         * `seforimEmbedModelDir` system property, the `SEFORIM_EMBED_MODEL` environment
+         * variable, then a developer checkout under the user's home directory.
+         */
+        private fun candidateDirs(explicitDir: Path?): List<Path> = listOfNotNull(
+            explicitDir,
+            System.getProperty("seforimEmbedModelDir")?.let { Path.of(it) },
+            System.getenv("SEFORIM_EMBED_MODEL")?.let { Path.of(it) },
+            System.getProperty("user.home")?.let { Path.of(it, "IdeaProjects/SeforimEmbedding/artifacts") },
+        ).distinct()
+
+        /** Cheap presence check (model + tokenizer files) WITHOUT creating the heavy
+         * OrtSession — lets callers decide to take the dense path then load lazily. */
+        fun isAvailable(explicitDir: Path? = null): Boolean =
+            candidateDirs(explicitDir).any { findOnnx(it) != null && findTokenizer(it) != null }
+
+        /**
+         * Locate the model and load an embedder, or return null if unavailable
+         * (dense search then degrades gracefully to lexical-only).
+         *
+         * Candidate directories are tried in priority order. A directory whose files are
+         * present but fail to load is logged and skipped, so a later candidate can still be
+         * used — matching the "any directory" semantics of [isAvailable].
+         */
+        fun tryLoad(explicitDir: Path? = null): SeforimEmbedder? {
+            val dirs = candidateDirs(explicitDir)
+            var loadFailed = false
+            for (dir in dirs) {
+                val onnx = findOnnx(dir) ?: continue
+                val tok = findTokenizer(dir) ?: continue
+                val loaded = runCatching {
+                    logger.i { "Loading dense embedder: onnx=$onnx tokenizer=$tok" }
+                    SeforimEmbedder(onnx, tok)
+                }.onFailure { logger.w(it) { "Failed to load embedder from $dir" } }.getOrNull()
+                if (loaded != null) return loaded
+                loadFailed = true
+            }
+            if (loadFailed) {
+                logger.i { "No embedding model could be loaded; dense search disabled. Checked: $dirs" }
+            } else {
+                logger.i { "No embedding model found; dense search disabled. Checked: $dirs" }
+            }
+            return null
+        }
+
+        // Candidates in priority order: v5 before v4, and the int8-quantized variant
+        // (4x smaller, ~3x faster CPU embedding) before full precision of the same version.
+        // NOTE(review): the embedder always applies HebrewV5Normalizer.clean to its input; no
+        // filename-based choice of normalization is visible here, so a v4 fallback model
+        // would receive v5-normalized text — confirm that this is intended.
+        private fun findOnnx(dir: Path): Path? = listOf(
+            "seforim-embed-v5-int8.onnx", "seforim-embed-v5.onnx",
+            "seforim-embed-v4-int8.onnx", "model.onnx", "seforim-embed-v4.onnx",
+        ).map { dir.resolve(it) }.firstOrNull { Files.isRegularFile(it) }
+
+        /** First existing tokenizer file among the known layouts under [dir], or null. */
+        private fun findTokenizer(dir: Path): Path? = listOf(
+            dir.resolve("tokenizer.json"),
+            dir.resolve("tokenizer_v4/tokenizer.json"),
+            dir.resolve("model_v4_phase2a/tokenizer.json"),
+        ).firstOrNull { Files.isRegularFile(it) }
+    }
+}
diff --git a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/VectorSearcher.kt b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/VectorSearcher.kt
new file mode 100644
index 00000000..3868c8c0
--- /dev/null
+++ b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/VectorSearcher.kt
@@ -0,0 +1,85 @@
+package io.github.kdroidfilter.seforimlibrary.search
+
+import org.apache.lucene.document.IntPoint
+import org.apache.lucene.index.DirectoryReader
+import org.apache.lucene.search.BooleanClause
+import org.apache.lucene.search.BooleanQuery
+import org.apache.lucene.search.IndexSearcher
+import org.apache.lucene.search.KnnFloatVectorQuery
+import org.apache.lucene.search.Query
+import org.apache.lucene.store.FSDirectory
+import java.io.Closeable
+import java.nio.file.Path
+
+/** A dense (semantic) hit from the fused Lucene index. */
+data class DenseHit(val lineId: Long, val bookId: Long, val score: Float)
+
+/**
+ * Runs a filtered KNN query over the SINGLE fused Lucene index (text fields +
+ * dense `KnnFloatVectorField` per line, built by the generator's text index
+ * writer). Filters (base-books / specific books) are applied as a pre-filter so
+ * the KNN only considers eligible documents.
+ *
+ * Returns line ids that are joined back to the DB by the caller.
+ *
+ * NOTE(review): every [search] call opens and closes its own `DirectoryReader`;
+ * if queries are frequent, a shared reader (e.g. Lucene's `SearcherManager`)
+ * would avoid that cost — left unchanged here to keep lifecycle semantics.
+ */
+class VectorSearcher(indexDir: Path) : Closeable {
+    private val dir = FSDirectory.open(indexDir)
+
+    /**
+     * Builds the KNN pre-filter, or returns null when no restriction was requested
+     * (neither [baseBookOnly] nor a non-empty [bookIds]).
+     */
+    private fun filterQuery(baseBookOnly: Boolean, bookIds: Collection<Long>?): Query? {
+        val clauses = buildList<Query> {
+            // Field name matches the fused text index (LuceneTextIndexWriter.FIELD_IS_BASE_BOOK).
+            if (baseBookOnly) add(IntPoint.newExactQuery("is_base_book", 1))
+            if (!bookIds.isNullOrEmpty()) {
+                // book_id is queried as an int point, so ids are narrowed from Long to Int.
+                add(IntPoint.newSetQuery("book_id", *bookIds.map { it.toInt() }.toIntArray()))
+            }
+        }
+        if (clauses.isEmpty()) return null
+        val builder = BooleanQuery.Builder()
+        clauses.forEach { builder.add(it, BooleanClause.Occur.FILTER) }
+        return builder.build()
+    }
+
+    /**
+     * Returns up to [k] nearest neighbours of [query] in the `vec` field.
+     *
+     * @param query query embedding; presumably must match the indexed vector dimension
+     * @param k maximum number of hits; a non-positive value yields an empty list
+     * @param baseBookOnly restrict candidates to documents with `is_base_book == 1`
+     * @param bookIds restrict candidates to these book ids; null or empty means no restriction
+     * @throws IllegalStateException if a hit lacks the stored `line_id` or `book_id` field
+     */
+    fun search(query: FloatArray, k: Int, baseBookOnly: Boolean = false, bookIds: Collection<Long>? = null): List<DenseHit> {
+        // Lucene rejects k < 1; asking for no hits simply returns none.
+        if (k <= 0) return emptyList()
+        DirectoryReader.open(dir).use { reader ->
+            val searcher = IndexSearcher(reader)
+            val knn = KnnFloatVectorQuery("vec", query, k, filterQuery(baseBookOnly, bookIds))
+            val top = searcher.search(knn, k)
+            val stored = searcher.storedFields()
+            return top.scoreDocs.map { sd ->
+                // Load only the two id fields instead of every stored field of the document.
+                val d = stored.document(sd.doc, HIT_FIELDS)
+                fun id(field: String): Long =
+                    d.getField(field)?.numericValue()?.toLong()
+                        ?: error("Lucene doc ${sd.doc} has no numeric stored field '$field'")
+                DenseHit(lineId = id("line_id"), bookId = id("book_id"), score = sd.score)
+            }
+        }
+    }
+
+    override fun close() = dir.close()
+
+    private companion object {
+        /** Stored fields needed to build a [DenseHit]. */
+        val HIT_FIELDS: Set<String> = setOf("line_id", "book_id")
+    }
+}
diff --git a/search/src/jvmTest/kotlin/io/github/kdroidfilter/seforimlibrary/search/SeforimEmbedderTest.kt b/search/src/jvmTest/kotlin/io/github/kdroidfilter/seforimlibrary/search/SeforimEmbedderTest.kt
new file mode 100644
index 00000000..5aa05a60
--- /dev/null
+++ b/search/src/jvmTest/kotlin/io/github/kdroidfilter/seforimlibrary/search/SeforimEmbedderTest.kt
@@ -0,0 +1,44 @@
+package io.github.kdroidfilter.seforimlibrary.search
+
+import kotlin.test.Test
+import kotlin.test.assertEquals
+import kotlin.test.assertTrue
+
+class SeforimEmbedderTest {
+
+    /**
+     * Dot product of [a] and [b]. It doubles as cosine similarity here only because the
+     * embedder is expected to return unit-length vectors (which the self-similarity check relies on).
+     */
+    private fun cos(a: FloatArray, b: FloatArray): Float {
+        require(a.size == b.size) { "vector sizes differ: ${a.size} vs ${b.size}" }
+        var s = 0f
+        for (i in a.indices) s += a[i] * b[i]
+        return s
+    }
+
+    @Test
+    fun embedderLoadsAndProducesUsableVectors() {
+        val embedder = SeforimEmbedder.tryLoad()
+        if (embedder == null) {
+            println("[skip] no embedding model found (set -DseforimEmbedModelDir) — dense search disabled")
+            return
+        }
+        embedder.use { e ->
+            val q = e.embed("מה מברכים על אוכל")
+            assertEquals(384, q.size, "embedding dim")
+
+            // deterministic + normalized: same text twice -> cosine ~1
+            val q2 = e.embed("מה מברכים על אוכל")
+            assertTrue(cos(q, q2) > 0.999f, "self-cosine should be ~1.0")
+
+            // sanity: a topically related text should be closer than an unrelated one
+            val related = e.embed("ברכת הנהנין על פירות וירקות")
+            val unrelated = e.embed("הלכות טומאה וטהרה של כלים")
+            val cr = cos(q, related)
+            val cu = cos(q, unrelated)
+            println("[embedder] cos(related)=$cr cos(unrelated)=$cu")
+            assertTrue(cr > cu, "related ($cr) should be closer than unrelated ($cu)")
+        }
+    }
+}