kdroidFilter · kdroidFilter · Jun 26, 2026 · Jun 25, 2026
diff --git a/generator/packaging/build.gradle.kts b/generator/packaging/build.gradle.kts
@@ -80,12 +80,37 @@ tasks.register<JavaExec>("downloadLexicalDb") {
     jvmArgs = listOf("-Xmx512m")
 }
 
+// Download the dense embedding model (int8 ONNX + tokenizer) from the private
+// SeforimEmbedding v5-int8 release next to seforim.db, so it gets bundled.
+// Needs GITHUB_TOKEN/GH_TOKEN; fails soft (bundle without model) if unavailable.
+// DB location precedence: -PseforimDb, then SEFORIM_DB, then <root build dir>/seforim.db.
+// Usage:
+//   GITHUB_TOKEN=… ./gradlew :packaging:downloadEmbedModel
+tasks.register<JavaExec>("downloadEmbedModel") {
+    group = "application"
+    description = "Download int8 embedding model + tokenizer from the private v5-int8 release next to seforim.db."
+
+    // The downloader lives in this module's JVM jar, so build it first and run it from there.
+    dependsOn("jvmJar")
+    mainClass.set("io.github.kdroidfilter.seforimlibrary.packaging.DownloadEmbedModelKt")
+    classpath = files(tasks.named("jvmJar")) + configurations.getByName("jvmRuntimeClasspath")
+
+    // Forward the DB location to the forked JVM as -DseforimDb.
+    if (project.hasProperty("seforimDb")) {
+        systemProperty("seforimDb", project.property("seforimDb") as String)
+    } else if (System.getenv("SEFORIM_DB") != null) {
+        systemProperty("seforimDb", System.getenv("SEFORIM_DB"))
+    } else {
+        val defaultDbPath = rootProject.layout.buildDirectory.file("seforim.db").get().asFile.absolutePath
+        systemProperty("seforimDb", defaultDbPath)
+    }
+
+    jvmArgs = listOf("-Xmx256m")
+}
+
 // Package DB + Lucene indexes into single tar.zst and split
 tasks.register<JavaExec>("packageArtifacts") {
     group = "application"
     description = "Create seforim_bundle.tar.zst (DB + indexes + release info) with zstd and split into ~1.9GiB parts."
 
-    dependsOn("jvmJar", "writeReleaseInfo", "downloadLexicalDb")
+    dependsOn("jvmJar", "writeReleaseInfo", "downloadLexicalDb", "downloadEmbedModel")
     mainClass.set("io.github.kdroidfilter.seforimlibrary.packaging.PackageArtifactsKt")
     classpath = files(tasks.named("jvmJar")) + configurations.getByName("jvmRuntimeClasspath")
 

diff --git a/.../src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/packaging/DownloadEmbedModel.kt b/.../src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/packaging/DownloadEmbedModel.kt
@@ -0,0 +1,84 @@
+package io.github.kdroidfilter.seforimlibrary.packaging
+
+import co.touchlab.kermit.Logger
+import co.touchlab.kermit.Severity
+import io.github.kdroidfilter.seforimlibrary.common.OptimizedHttpClient
+import java.nio.file.Files
+import java.nio.file.Path
+import java.nio.file.Paths
+
+private const val RELEASE_API = // GitHub REST "release by tag" endpoint for tag v5-int8
+    "https://api.github.com/repos/kdroidFilter/SeforimEmbedding/releases/tags/v5-int8"
+private const val USER_AGENT = "SeforimLibrary-DownloadEmbedModel/1.0" // passed to every OptimizedHttpClient call below
+
+// Runtime dense-search artifacts pulled from the private SeforimEmbedding release.
+private val ASSETS = listOf("seforim-embed-v5-int8.onnx", "tokenizer.json") // stored under these names next to seforim.db
+
+/**
+ * Download the int8 embedding model + tokenizer from the private `v5-int8` release
+ * and place them next to `seforim.db` so [PackageArtifacts] bundles them.
+ *
+ * Requires a token with read access to the private repo via `GITHUB_TOKEN` / `GH_TOKEN`
+ * (consumed by [OptimizedHttpClient]). On any failure (no token, network, missing
+ * asset) it logs a warning and returns normally so packaging proceeds WITHOUT the
+ * model (the app then degrades to lexical-only search).
+ *
+ * Assets that already exist as non-empty regular files are kept; each missing asset
+ * is downloaded to a `.part` file first and only then moved into place, so an
+ * interrupted download never leaves a truncated file under the final name.
+ *
+ * Usage:
+ *   ./gradlew :packaging:downloadEmbedModel
+ *   ./gradlew :packaging:downloadEmbedModel -PseforimDb=/path/to/seforim.db
+ *
+ * @param args optional; `args[0]` overrides the DB path (see [resolveDbPath]).
+ */
+fun main(args: Array<String>) {
+    Logger.setMinSeverity(Severity.Info)
+    val logger = Logger.withTag("DownloadEmbedModel")
+
+    val dbPath = resolveDbPath(args)
+    // Derive the tag from the endpoint so messages cannot drift from the URL actually queried.
+    val releaseTag = RELEASE_API.substringAfterLast('/')
+
+    // Single definition of "already downloaded", shared by the fast path and the per-asset loop.
+    fun isUsable(file: Path): Boolean = Files.isRegularFile(file) && Files.size(file) > 0
+
+    if (ASSETS.all { isUsable(dbPath.resolveSibling(it)) }) {
+        logger.i { "Embedding model already present next to ${dbPath.fileName}; skipping download" }
+        return
+    }
+
+    runCatching {
+        val json = OptimizedHttpClient.fetchJson(RELEASE_API, USER_AGENT, logger)
+        for (name in ASSETS) {
+            val out = dbPath.resolveSibling(name)
+            if (isUsable(out)) {
+                logger.i { "Using existing $name" }
+                continue
+            }
+            val url = assetApiUrl(json, name)
+                ?: error("Asset '$name' not found in $releaseTag release")
+            // A bare relative DB path (e.g. "seforim.db") has no parent; nothing to create then.
+            out.parent?.let { Files.createDirectories(it) }
+            val tmp = out.resolveSibling("${out.fileName}.part")
+            Files.deleteIfExists(tmp)
+            // Asset API url + Accept: octet-stream + token -> works for private repos.
+            OptimizedHttpClient.downloadFile(url, tmp, USER_AGENT, logger, "Downloading $name")
+            // Replace in one step instead of delete-then-move, so the target is never missing in between.
+            Files.move(tmp, out, java.nio.file.StandardCopyOption.REPLACE_EXISTING)
+            logger.i { "Downloaded $name -> ${out.toAbsolutePath()}" }
+        }
+    }.onFailure {
+        // Deliberate best-effort: packaging must still succeed without the model.
+        logger.w(it) { "Could not download the embedding model; bundle will omit it (dense search disabled)" }
+    }
+}
+
+/**
+ * Picks the `seforim.db` location. Precedence: first CLI argument, then the
+ * `seforimDb` system property, then the `SEFORIM_DB` environment variable,
+ * and finally the relative default `build/seforim.db`.
+ */
+private fun resolveDbPath(args: Array<String>): Path {
+    val fromCli = args.firstOrNull()
+    if (fromCli != null) return Paths.get(fromCli)
+
+    val configured = System.getProperty("seforimDb") ?: System.getenv("SEFORIM_DB")
+    return if (configured != null) Paths.get(configured) else Paths.get("build", "seforim.db")
+}
+
+/**
+ * Extract the GitHub *asset API* url for a given asset name from the release JSON.
+ *
+ * The match is a textual one: it looks for an object that starts with a `"url"`
+ * pointing at `https://api.github.com/…/assets/<id>` and, without leaving that
+ * object (no `{` or `}` in between), contains `"name":"<name>"`. Whitespace around
+ * `{`, `:` and the keys is tolerated so both compact and pretty-printed JSON work.
+ *
+ * @param json raw release JSON text.
+ * @param name exact asset file name; it is regex-escaped before matching.
+ * @return the asset API url, or `null` when no asset with that name is found.
+ */
+private fun assetApiUrl(json: String, name: String): String? {
+    val pattern = Regex(
+        "\\{\\s*\"url\"\\s*:\\s*\"(https://api\\.github\\.com/[^\"]+?/assets/\\d+)\"" +
+            "[^{}]*?\"name\"\\s*:\\s*\"" + Regex.escape(name) + "\"",
+    )
+    return pattern.find(json)?.groupValues?.get(1)
+}
diff --git a/...ng/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/packaging/PackageArtifacts.kt b/...ng/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/packaging/PackageArtifacts.kt
@@ -65,13 +65,18 @@ fun main(args: Array<String>) {
 
     // Resolve precomputed catalog next to the DB
     val catalogPath: Path = dbPath.resolveSibling("catalog.pb")
-    
+
     // Resolve release info file next to the DB
     val releaseInfoPath: Path = dbPath.resolveSibling("release_info.txt")
 
     // Resolve lexical DB next to the DB
     val lexicalDbPath: Path = dbPath.resolveSibling("lexical.db")
 
+    // Resolve the dense embedding model (int8 ONNX) + tokenizer next to the DB.
+    // Bundled so the app gets dense search out of the box; absent -> lexical only.
+    val embedModelPath: Path = dbPath.resolveSibling("seforim-embed-v5-int8.onnx")
+    val embedTokenizerPath: Path = dbPath.resolveSibling("tokenizer.json")
+
     if (!textIndexDir.toFile().isDirectory) {
         logger.w { "Lucene text index directory missing: $textIndexDir (will skip)" }
     }
@@ -126,6 +131,8 @@ fun main(args: Array<String>) {
             " - Catalog: $catalogPath\n" +
             " - Release info: $releaseInfoPath\n" +
             " - Lexical DB: $lexicalDbPath\n" +
+            " - Embed model: $embedModelPath\n" +
+            " - Embed tokenizer: $embedTokenizerPath\n" +
             " - Text index: $textIndexDir\n" +
             " - Lookup index: $lookupIndexDir\n" +
             " -> Bundle .tar.zst: $bundleOutputPath\n" +
@@ -171,14 +178,28 @@ fun main(args: Array<String>) {
                             logger.w { "Lexical DB missing: $lexicalDbPath (skipped)" }
                         }
 
+                        // Add the dense embedding model + tokenizer if available
+                        if (embedModelPath.exists()) {
+                            addFileToTar(tar, embedModelPath, embedModelPath.fileName.toString(), logger)
+                            logger.i { "Added embedding model to bundle" }
+                        } else {
+                            logger.w { "Embedding model missing: $embedModelPath (skipped, dense search disabled)" }
+                        }
+                        if (embedTokenizerPath.exists()) {
+                            addFileToTar(tar, embedTokenizerPath, embedTokenizerPath.fileName.toString(), logger)
+                            logger.i { "Added embedding tokenizer to bundle" }
+                        } else {
+                            logger.w { "Embedding tokenizer missing: $embedTokenizerPath (skipped)" }
+                        }
+
                         // Add the precomputed catalog if available
                         if (haveCatalog) {
                             addFileToTar(tar, catalogPath, catalogPath.fileName.toString(), logger)
                             logger.i { "Added precomputed catalog to bundle" }
                         } else {
                             logger.w { "Precomputed catalog missing: $catalogPath (skipped)" }
                         }
-                        
+
                         // Add the release info file if available
                         if (haveReleaseInfo) {
                             addFileToTar(tar, releaseInfoPath, releaseInfoPath.fileName.toString(), logger)

diff --git a/generator/searchindex/build.gradle.kts b/generator/searchindex/build.gradle.kts
@@ -64,6 +64,12 @@ tasks.register<JavaExec>("buildLuceneIndexDefault") {
         systemProperty("inMemoryDb", "true")
     }
 
+    // Optional: -PvectorsBin=/path (dir with ids.i64+vecs.f32+meta.txt) → SINGLE
+    // fused index (text + dense KnnFloatVectorField per line).
+    (project.findProperty("vectorsBin") as String?)?.let { systemProperty("vectorsBin", it) }
+    // Optional: -PindexThreads=N to cap concurrent indexing threads (lower = less RAM).
+    (project.findProperty("indexThreads") as String?)?.let { systemProperty("indexThreads", it) }
+
     jvmArgs = listOf(
         "-Xmx$generatorHeap",
         "-XX:+UseG1GC",
@@ -72,4 +78,3 @@ tasks.register<JavaExec>("buildLuceneIndexDefault") {
         "--add-modules=jdk.incubator.vector"
     )
 }
-
diff --git a/.../src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/searchindex/BuildLuceneIndex.kt b/.../src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/searchindex/BuildLuceneIndex.kt
@@ -22,7 +22,10 @@ import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper
 import org.apache.lucene.analysis.ngram.NGramTokenFilter
 import org.jsoup.Jsoup
 import org.jsoup.safety.Safelist
+import java.io.DataInputStream
 import java.io.File
+import java.nio.ByteBuffer
+import java.nio.ByteOrder
 import java.nio.file.Files
 import java.nio.file.Path
 import java.nio.file.Paths
@@ -58,6 +61,12 @@ fun main() = runBlocking {
     runCatching { Files.createDirectories(indexDir) }
     runCatching { Files.createDirectories(lookupDir) }
 
+    // Optional dense embeddings -> SINGLE fused index. -DvectorsBin points to a dir
+    // with ids.i64 + vecs.f32 + meta.txt (produced by embed_corpus_bin.py against this
+    // same DB). Each line then also gets a KnnFloatVectorField in the text index.
+    val vectorProvider: ((Long) -> FloatArray?)? =
+        System.getProperty("vectorsBin")?.let { loadVectorProvider(Paths.get(it), logger) }
+
     // Open repository (prefer in-memory for faster reads)
     val useMemoryDb = (System.getProperty("inMemoryDb") ?: "true") != "false"
     // Use a shared in-memory DB so multiple connections can read concurrently when multithreading
@@ -115,7 +124,7 @@ fun main() = runBlocking {
         )
     )
 
-    LuceneTextIndexWriter(indexDir, analyzer = analyzer).use { writer ->
+    LuceneTextIndexWriter(indexDir, analyzer = analyzer, vectorProvider = vectorProvider).use { writer ->
         LuceneLookupIndexWriter(lookupDir, analyzer = analyzer).use { lookup ->
             val books = repo.getAllBooks()
             val indexThreads = (System.getProperty("indexThreads") ?: Runtime.getRuntime().availableProcessors().toString()).toInt().coerceAtLeast(1)
@@ -300,3 +309,29 @@ private fun sanitizeAcronymTerm(raw: String): String {
     if (raw.isEmpty()) return ""
     return normalizePostHtmlForIndex(raw)
 }
+
+/**
+ * Loads dense embeddings produced by embed_corpus_bin.py (ids.i64 + vecs.f32 + meta.txt,
+ * little-endian) into RAM and returns a lineId -> vector lookup, used by
+ * LuceneTextIndexWriter to attach a KnnFloatVectorField per line (single fused index).
+ *
+ * Files read from [dir]:
+ *  - `meta.txt`: whitespace-separated `<count> <dim>` (further tokens are ignored)
+ *  - `ids.i64`:  `count` little-endian 64-bit line ids
+ *  - `vecs.f32`: `count * dim` little-endian 32-bit floats; row i belongs to the i-th id
+ *
+ * Vectors are kept as one FloatArray per row rather than a single flat array, so
+ * `count * dim` may exceed Int.MAX_VALUE without overflowing an array size or offset.
+ * The returned lookup only reads data that is fully built before it is returned and
+ * hands out a fresh copy of the vector on every call.
+ *
+ * @param dir directory containing the three files above.
+ * @param logger receives progress messages.
+ * @return lookup yielding the vector for a line id, or `null` when the id has no vector.
+ * @throws IllegalArgumentException if meta.txt is malformed or ids.i64 is too short.
+ * @throws java.io.EOFException if vecs.f32 holds fewer than `count * dim` floats.
+ */
+private fun loadVectorProvider(dir: Path, logger: co.touchlab.kermit.Logger): ((Long) -> FloatArray?) {
+    // Split on any whitespace so a tab, double space or newline between the numbers is accepted.
+    val meta = File(dir.toFile(), "meta.txt").readText().trim().split(Regex("\\s+"))
+    require(meta.size >= 2) { "meta.txt in $dir must contain '<count> <dim>'" }
+    val n = meta[0].toInt()
+    val dim = meta[1].toInt()
+    require(n >= 0 && dim > 0) { "meta.txt in $dir has invalid count/dim: $n/$dim" }
+    logger.i { "Loading $n dense vectors (dim $dim) from $dir for the fused index" }
+
+    val idsBytes = File(dir.toFile(), "ids.i64").readBytes()
+    require(idsBytes.size.toLong() >= n.toLong() * Long.SIZE_BYTES) {
+        "ids.i64 in $dir holds ${idsBytes.size / Long.SIZE_BYTES} ids but meta.txt declares $n"
+    }
+    val idsBuf = ByteBuffer.wrap(idsBytes).order(ByteOrder.LITTLE_ENDIAN)
+    // Capacity computed in Long and clamped: n * 2 would overflow Int for very large n.
+    val rowOf = HashMap<Long, Int>(minOf(n.toLong() * 2, 1L shl 30).toInt())
+    for (i in 0 until n) rowOf[idsBuf.long] = i
+
+    val rows: Array<FloatArray> =
+        DataInputStream(File(dir.toFile(), "vecs.f32").inputStream().buffered(1 shl 20)).use { din ->
+            val rec = ByteArray(dim * 4)
+            val bb = ByteBuffer.wrap(rec).order(ByteOrder.LITTLE_ENDIAN)
+            Array(n) {
+                // One record = one vector; decode it from the reused byte buffer.
+                din.readFully(rec)
+                bb.rewind()
+                FloatArray(dim) { bb.float }
+            }
+        }
+    logger.i { "Dense vectors loaded (${(n.toLong() * dim * 4L) / 1_000_000} MB in RAM)" }
+    return { lineId -> rowOf[lineId]?.let { row -> rows[row].copyOf() } }
+}
diff --git a/.../kotlin/io/github/kdroidfilter/seforimlibrary/searchindex/lucene/LuceneTextIndexWriter.kt b/.../kotlin/io/github/kdroidfilter/seforimlibrary/searchindex/lucene/LuceneTextIndexWriter.kt
@@ -6,6 +6,7 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer
 import org.apache.lucene.document.Document
 import org.apache.lucene.document.Field
 import org.apache.lucene.document.IntPoint
+import org.apache.lucene.document.KnnFloatVectorField
 import org.apache.lucene.document.StoredField
 import org.apache.lucene.document.StringField
 import org.apache.lucene.document.TextField
@@ -24,12 +25,16 @@ class LuceneTextIndexWriter(
     indexDir: Path,
     analyzer: Analyzer = StandardAnalyzer(),
     private val indexHebrewField: Boolean = false,
-    private val indexPrimaryText: Boolean = true
+    private val indexPrimaryText: Boolean = true,
+    // Optional dense embeddings: if provided, each line doc also gets a
+    // KnnFloatVectorField("vec", COSINE) -> SINGLE index holding text + vectors.
+    private val vectorProvider: ((Long) -> FloatArray?)? = null,
 ) : TextIndexWriter {
     companion object Fields {
         const val FIELD_TYPE = "type"
         const val TYPE_LINE = "line"
         const val TYPE_BOOK_TITLE = "book_title"
+        const val FIELD_VEC = "vec"
 
         const val FIELD_BOOK_ID = "book_id"
         const val FIELD_CATEGORY_ID = "category_id"
@@ -106,6 +111,11 @@ class LuceneTextIndexWriter(
             // Index 4-gram tokens for substring search (per-field analyzer applies NGram filter)
             add(TextField(FIELD_TEXT_NG4, normalizedText, Field.Store.NO))
             // rawPlainText is no longer stored - snippet source is fetched from DB at query time
+
+            // Dense embedding (single fused index): attach the line's vector if available.
+            vectorProvider?.invoke(lineId)?.let { vec ->
+                add(KnnFloatVectorField(FIELD_VEC, vec, org.apache.lucene.index.VectorSimilarityFunction.COSINE))
+            }
         }
         writer.addDocument(doc)
     }

diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml
@@ -5,6 +5,8 @@ kotlin = "2.3.20"
 agp = "8.12.3"
 jvmToolchain = "25"
 lucene = "10.4.0"
+onnxruntime = "1.20.0"
+djlHuggingface = "0.30.0"
 maven-publish = "0.36.0"
 kotlinx-coroutines = "1.10.2"
 kotlinx-serialization = "1.10.0"
@@ -34,6 +36,8 @@ lucene-highlighter = { module = "org.apache.lucene:lucene-highlighter", version.
 lucene-queryparser = { module = "org.apache.lucene:lucene-queryparser", version.ref = "lucene" }
 lucene-analysis-common = { module = "org.apache.lucene:lucene-analysis-common", version.ref = "lucene" }
 lucene-core = { module = "org.apache.lucene:lucene-core", version.ref = "lucene" }
+onnxruntime = { module = "com.microsoft.onnxruntime:onnxruntime", version.ref = "onnxruntime" }
+djl-huggingface-tokenizers = { module = "ai.djl.huggingface:tokenizers", version.ref = "djlHuggingface" }
 sqlDelight-driver-sqlite = { module = "app.cash.sqldelight:sqlite-driver", version.ref = "sqlDelight" }
 sqlDelight-driver-android = { module = "app.cash.sqldelight:android-driver", version.ref = "sqlDelight" }
 sqlDelight-driver-native = { module = "app.cash.sqldelight:native-driver", version.ref = "sqlDelight" }

diff --git a/search/build.gradle.kts b/search/build.gradle.kts
@@ -20,6 +20,11 @@ kotlin {
             implementation(libs.sqlDelight.driver.sqlite)
             implementation(libs.kermit)
             implementation(libs.jsoup)
+            // Dense semantic search: ONNX Runtime (query embedding) + HuggingFace tokenizer.
+            // Stock Maven `onnxruntime` is CPU-only on desktop JVM (no DirectML/CoreML/XNNPACK
+            // native — those need a custom build; CUDA needs the separate onnxruntime_gpu).
+            implementation(libs.onnxruntime)
+            implementation(libs.djl.huggingface.tokenizers)
         }
 
         jvmTest.dependencies {

diff --git a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/HebrewV5Normalizer.kt b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/HebrewV5Normalizer.kt
@@ -0,0 +1,27 @@
+package io.github.kdroidfilter.seforimlibrary.search
+
+import org.jsoup.parser.Parser
+
+/**
+ * JVM counterpart of `normalize_he_v5.py`: strips HTML tags, nikud/teamim and
+ * non-Hebrew characters, then folds final (sofit) letters (ך→כ ם→מ ן→נ ף→פ ץ→צ).
+ * It MUST be applied to any text before embedding it with a v5 model (both indexed
+ * lines and query), so vectors stay comparable. Pair it with a v5 model + a v5-built index.
+ */
+object HebrewV5Normalizer {
+    // Anything that looks like an HTML tag.
+    private val TAG = Regex("<[^>]+>")
+    // Hebrew cantillation and vowel marks.
+    private val MARKS = Regex("[֑-ׇ]")
+    // Everything except Hebrew letters, digits, whitespace and the listed punctuation.
+    private val DROP = Regex("[^א-ת0-9\\s.,:;!?()\\[\\]\"'\\-/׳״]")
+    // Runs of whitespace.
+    private val WS = Regex("\\s+")
+
+    /** Maps a final (sofit) letter to its base letter; any other character is returned as is. */
+    private fun foldFinal(c: Char): Char = when (c) {
+        'ך' -> 'כ'
+        'ם' -> 'מ'
+        'ן' -> 'נ'
+        'ף' -> 'פ'
+        'ץ' -> 'צ'
+        else -> c
+    }
+
+    /**
+     * Normalizes [text] in a fixed order: tags become spaces, HTML entities are
+     * unescaped, marks are removed, disallowed characters become spaces, whitespace
+     * is collapsed and trimmed, and finally sofit letters are folded.
+     */
+    fun clean(text: String): String {
+        val withoutTags = TAG.replace(text, " ")
+        val unescaped = Parser.unescapeEntities(withoutTags, false)
+        val withoutMarks = MARKS.replace(unescaped, "")
+        val allowedOnly = DROP.replace(withoutMarks, " ")
+        val collapsed = WS.replace(allowedOnly, " ").trim()
+        return String(CharArray(collapsed.length) { i -> foldFinal(collapsed[i]) })
+    }
+}