Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 26 additions & 1 deletion generator/packaging/build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -80,12 +80,37 @@ tasks.register<JavaExec>("downloadLexicalDb") {
jvmArgs = listOf("-Xmx512m")
}

// Download the dense embedding model (int8 ONNX + tokenizer) from the private
// SeforimEmbedding v5-int8 release next to seforim.db, so it gets bundled.
// Needs GITHUB_TOKEN/GH_TOKEN; fails soft (bundle without model) if unavailable.
// DB location precedence: -PseforimDb, then the SEFORIM_DB environment variable,
// then <root build dir>/seforim.db.
// Usage:
//   GITHUB_TOKEN=… ./gradlew :packaging:downloadEmbedModel
tasks.register<JavaExec>("downloadEmbedModel") {
    group = "application"
    description = "Download int8 embedding model + tokenizer from the private v5-int8 release next to seforim.db."

    dependsOn("jvmJar")
    mainClass.set("io.github.kdroidfilter.seforimlibrary.packaging.DownloadEmbedModelKt")
    classpath = files(tasks.named("jvmJar")) + configurations.getByName("jvmRuntimeClasspath")

    // First non-null source wins; the result is handed to the JVM as -DseforimDb.
    val seforimDbPath = (project.findProperty("seforimDb") as String?)
        ?: System.getenv("SEFORIM_DB")
        ?: rootProject.layout.buildDirectory.file("seforim.db").get().asFile.absolutePath
    systemProperty("seforimDb", seforimDbPath)

    jvmArgs = listOf("-Xmx256m")
}

// Package DB + Lucene indexes into single tar.zst and split
tasks.register<JavaExec>("packageArtifacts") {
group = "application"
description = "Create seforim_bundle.tar.zst (DB + indexes + release info) with zstd and split into ~1.9GiB parts."

dependsOn("jvmJar", "writeReleaseInfo", "downloadLexicalDb")
dependsOn("jvmJar", "writeReleaseInfo", "downloadLexicalDb", "downloadEmbedModel")
mainClass.set("io.github.kdroidfilter.seforimlibrary.packaging.PackageArtifactsKt")
classpath = files(tasks.named("jvmJar")) + configurations.getByName("jvmRuntimeClasspath")

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
package io.github.kdroidfilter.seforimlibrary.packaging

import co.touchlab.kermit.Logger
import co.touchlab.kermit.Severity
import io.github.kdroidfilter.seforimlibrary.common.OptimizedHttpClient
import java.nio.file.Files
import java.nio.file.Path
import java.nio.file.Paths

// GitHub REST endpoint describing the `v5-int8` release of kdroidFilter/SeforimEmbedding;
// its JSON response is searched for the asset download URLs.
private const val RELEASE_API =
"https://api.github.com/repos/kdroidFilter/SeforimEmbedding/releases/tags/v5-int8"
// User-Agent string passed to every OptimizedHttpClient call made by this tool.
private const val USER_AGENT = "SeforimLibrary-DownloadEmbedModel/1.0"

// Runtime dense-search artifacts pulled from the private SeforimEmbedding release.
// Each name is both the asset name looked up in the release and the file name written next to the DB.
private val ASSETS = listOf("seforim-embed-v5-int8.onnx", "tokenizer.json")

/**
 * Download the int8 embedding model + tokenizer from the private `v5-int8` release
 * and place them next to `seforim.db` so [PackageArtifacts] bundles them.
 *
 * Requires a token with read access to the private repo via `GITHUB_TOKEN` / `GH_TOKEN`
 * (consumed by [OptimizedHttpClient]). On any failure (no token, network, missing
 * asset, empty download) it logs a warning and returns normally so packaging proceeds
 * WITHOUT the model (the app then degrades to lexical-only search).
 *
 * Each asset is downloaded to a `<name>.part` file first and only moved into place
 * once it is complete and non-empty; the `.part` file is removed on failure.
 *
 * Usage:
 *   ./gradlew :packaging:downloadEmbedModel
 *   ./gradlew :packaging:downloadEmbedModel -PseforimDb=/path/to/seforim.db
 *
 * @param args optional; `args[0]` overrides the DB path (see `resolveDbPath`).
 */
fun main(args: Array<String>) {
    Logger.setMinSeverity(Severity.Info)
    val logger = Logger.withTag("DownloadEmbedModel")

    val dbPath = resolveDbPath(args)
    // Derive the tag from the API URL so messages cannot drift from the release actually queried.
    val releaseTag = RELEASE_API.substringAfterLast('/')

    // An asset only counts as present when it is a non-empty regular file.
    fun isUsable(path: Path): Boolean = Files.isRegularFile(path) && Files.size(path) > 0

    if (ASSETS.all { isUsable(dbPath.resolveSibling(it)) }) {
        logger.i { "Embedding model already present next to ${dbPath.fileName}; skipping download" }
        return
    }

    runCatching {
        val json = OptimizedHttpClient.fetchJson(RELEASE_API, USER_AGENT, logger)
        for (name in ASSETS) {
            val out = dbPath.resolveSibling(name)
            if (isUsable(out)) {
                logger.i { "Using existing $name" }
                continue
            }
            val url = assetApiUrl(json, name)
                ?: throw IllegalStateException("Asset '$name' not found in $releaseTag release")
            // `parent` is null when the DB path is a bare relative file name; nothing to create then.
            out.parent?.let { Files.createDirectories(it) }
            val tmp = out.resolveSibling("${out.fileName}.part")
            Files.deleteIfExists(tmp)
            try {
                // Asset API url + Accept: octet-stream + token -> works for private repos.
                OptimizedHttpClient.downloadFile(url, tmp, USER_AGENT, logger, "Downloading $name")
                check(isUsable(tmp)) { "Downloaded asset '$name' is empty" }
                Files.move(tmp, out, java.nio.file.StandardCopyOption.REPLACE_EXISTING)
            } finally {
                // No-op after a successful move; removes the partial file otherwise.
                Files.deleteIfExists(tmp)
            }
            logger.i { "Downloaded $name -> ${out.toAbsolutePath()}" }
        }
    }.onFailure {
        logger.w(it) { "Could not download the embedding model; bundle will omit it (dense search disabled)" }
    }
}

/**
 * Picks the location of `seforim.db`.
 *
 * Precedence: first program argument, then the `seforimDb` system property, then the
 * `SEFORIM_DB` environment variable; if none is set, the relative path `build/seforim.db`.
 */
private fun resolveDbPath(args: Array<String>): Path {
    val configured = args.firstOrNull()
        ?: System.getProperty("seforimDb")
        ?: System.getenv("SEFORIM_DB")
    return if (configured != null) Paths.get(configured) else Paths.get("build", "seforim.db")
}

/**
 * Extract the GitHub *asset API* url for a given asset name from the release JSON.
 *
 * The match is a regex over the raw text, not a JSON parse: it looks for an object whose
 * first key is `"url"` (pointing at `https://api.github.com/…/assets/<id>`) and whose
 * `"name"` key appears before any nested object. Whitespace around `{`, `:` and between
 * members is tolerated, so both compact and pretty-printed responses work.
 *
 * @param json raw release JSON text.
 * @param name exact asset file name (matched literally, not as a pattern).
 * @return the asset API url, or `null` when no asset with that name is found.
 */
private fun assetApiUrl(json: String, name: String): String? {
    val re = Regex(
        "\\{\\s*\"url\"\\s*:\\s*\"(https://api\\.github\\.com/[^\"]+?/assets/\\d+)\"[^{}]*?\"name\"\\s*:\\s*\"" +
            Regex.escape(name) + "\"",
    )
    return re.find(json)?.groupValues?.get(1)
}
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,18 @@ fun main(args: Array<String>) {

// Resolve precomputed catalog next to the DB
val catalogPath: Path = dbPath.resolveSibling("catalog.pb")

// Resolve release info file next to the DB
val releaseInfoPath: Path = dbPath.resolveSibling("release_info.txt")

// Resolve lexical DB next to the DB
val lexicalDbPath: Path = dbPath.resolveSibling("lexical.db")

// Resolve the dense embedding model (int8 ONNX) + tokenizer next to the DB.
// Bundled so the app gets dense search out of the box; absent -> lexical only.
val embedModelPath: Path = dbPath.resolveSibling("seforim-embed-v5-int8.onnx")
val embedTokenizerPath: Path = dbPath.resolveSibling("tokenizer.json")

if (!textIndexDir.toFile().isDirectory) {
logger.w { "Lucene text index directory missing: $textIndexDir (will skip)" }
}
Expand Down Expand Up @@ -126,6 +131,8 @@ fun main(args: Array<String>) {
" - Catalog: $catalogPath\n" +
" - Release info: $releaseInfoPath\n" +
" - Lexical DB: $lexicalDbPath\n" +
" - Embed model: $embedModelPath\n" +
" - Embed tokenizer: $embedTokenizerPath\n" +
" - Text index: $textIndexDir\n" +
" - Lookup index: $lookupIndexDir\n" +
" -> Bundle .tar.zst: $bundleOutputPath\n" +
Expand Down Expand Up @@ -171,14 +178,28 @@ fun main(args: Array<String>) {
logger.w { "Lexical DB missing: $lexicalDbPath (skipped)" }
}

// Add the dense embedding model + tokenizer if available
if (embedModelPath.exists()) {
addFileToTar(tar, embedModelPath, embedModelPath.fileName.toString(), logger)
logger.i { "Added embedding model to bundle" }
} else {
logger.w { "Embedding model missing: $embedModelPath (skipped, dense search disabled)" }
}
if (embedTokenizerPath.exists()) {
addFileToTar(tar, embedTokenizerPath, embedTokenizerPath.fileName.toString(), logger)
logger.i { "Added embedding tokenizer to bundle" }
} else {
logger.w { "Embedding tokenizer missing: $embedTokenizerPath (skipped)" }
}

// Add the precomputed catalog if available
if (haveCatalog) {
addFileToTar(tar, catalogPath, catalogPath.fileName.toString(), logger)
logger.i { "Added precomputed catalog to bundle" }
} else {
logger.w { "Precomputed catalog missing: $catalogPath (skipped)" }
}

// Add the release info file if available
if (haveReleaseInfo) {
addFileToTar(tar, releaseInfoPath, releaseInfoPath.fileName.toString(), logger)
Expand Down
7 changes: 6 additions & 1 deletion generator/searchindex/build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,12 @@ tasks.register<JavaExec>("buildLuceneIndexDefault") {
systemProperty("inMemoryDb", "true")
}

// Optional: -PvectorsBin=/path (dir with ids.i64+vecs.f32+meta.txt) → SINGLE
// fused index (text + dense KnnFloatVectorField per line).
(project.findProperty("vectorsBin") as String?)?.let { systemProperty("vectorsBin", it) }
// Optional: -PindexThreads=N to cap concurrent indexing threads (lower = less RAM).
(project.findProperty("indexThreads") as String?)?.let { systemProperty("indexThreads", it) }

jvmArgs = listOf(
"-Xmx$generatorHeap",
"-XX:+UseG1GC",
Expand All @@ -72,4 +78,3 @@ tasks.register<JavaExec>("buildLuceneIndexDefault") {
"--add-modules=jdk.incubator.vector"
)
}

Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@ import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper
import org.apache.lucene.analysis.ngram.NGramTokenFilter
import org.jsoup.Jsoup
import org.jsoup.safety.Safelist
import java.io.DataInputStream
import java.io.File
import java.nio.ByteBuffer
import java.nio.ByteOrder
import java.nio.file.Files
import java.nio.file.Path
import java.nio.file.Paths
Expand Down Expand Up @@ -58,6 +61,12 @@ fun main() = runBlocking {
runCatching { Files.createDirectories(indexDir) }
runCatching { Files.createDirectories(lookupDir) }

// Optional dense embeddings -> SINGLE fused index. -DvectorsBin points to a dir
// with ids.i64 + vecs.f32 + meta.txt (produced by embed_corpus_bin.py against this
// same DB). Each line then also gets a KnnFloatVectorField in the text index.
val vectorProvider: ((Long) -> FloatArray?)? =
System.getProperty("vectorsBin")?.let { loadVectorProvider(Paths.get(it), logger) }

// Open repository (prefer in-memory for faster reads)
val useMemoryDb = (System.getProperty("inMemoryDb") ?: "true") != "false"
// Use a shared in-memory DB so multiple connections can read concurrently when multithreading
Expand Down Expand Up @@ -115,7 +124,7 @@ fun main() = runBlocking {
)
)

LuceneTextIndexWriter(indexDir, analyzer = analyzer).use { writer ->
LuceneTextIndexWriter(indexDir, analyzer = analyzer, vectorProvider = vectorProvider).use { writer ->
LuceneLookupIndexWriter(lookupDir, analyzer = analyzer).use { lookup ->
val books = repo.getAllBooks()
val indexThreads = (System.getProperty("indexThreads") ?: Runtime.getRuntime().availableProcessors().toString()).toInt().coerceAtLeast(1)
Expand Down Expand Up @@ -300,3 +309,29 @@ private fun sanitizeAcronymTerm(raw: String): String {
if (raw.isEmpty()) return ""
return normalizePostHtmlForIndex(raw)
}

/**
 * Loads dense embeddings produced by embed_corpus_bin.py (ids.i64 + vecs.f32 + meta.txt,
 * little-endian) into RAM and returns a lineId -> vector lookup, used by
 * LuceneTextIndexWriter to attach a KnnFloatVectorField per line (single fused index).
 *
 * `meta.txt` must start with two whitespace-separated integers: the row count and the
 * vector dimension. All vectors are held in one flat [FloatArray], so `count * dim` must
 * fit in an Int; larger inputs are rejected up front instead of overflowing.
 *
 * The returned lookup only reads the loaded data and hands out a fresh copy of the row on
 * every call, so callers may mutate the result freely.
 *
 * @param dir directory containing `meta.txt`, `ids.i64` and `vecs.f32`.
 * @param logger receives progress messages.
 * @return function mapping a line id to its vector, or `null` for unknown ids.
 * @throws IllegalArgumentException when the metadata is malformed, the sizes are out of
 *   range, or `ids.i64` is shorter than the declared row count.
 */
private fun loadVectorProvider(dir: Path, logger: co.touchlab.kermit.Logger): ((Long) -> FloatArray?) {
    val baseDir = dir.toFile()
    val meta = File(baseDir, "meta.txt").readText().trim().split(Regex("\\s+"))
    require(meta.size >= 2) { "meta.txt in $dir must contain '<count> <dim>'" }
    val n = meta[0].toInt()
    val dim = meta[1].toInt()
    require(n >= 0 && dim > 0) { "Invalid vector metadata in $dir: count=$n dim=$dim" }
    // Compute in Long: n * dim as Int silently overflows for large corpora.
    val totalFloats = n.toLong() * dim
    require(totalFloats <= Int.MAX_VALUE) {
        "Too many vector components to hold in one array: $n x $dim = $totalFloats"
    }
    logger.i { "Loading $n dense vectors (dim $dim) from $dir for the fused index" }

    val idsFile = File(baseDir, "ids.i64")
    require(idsFile.length() >= n.toLong() * java.lang.Long.BYTES) {
        "ids.i64 in $dir holds fewer than $n ids (${idsFile.length()} bytes)"
    }
    val idsBuf = ByteBuffer.wrap(idsFile.readBytes()).order(ByteOrder.LITTLE_ENDIAN)
    // Initial capacity hint only; capped so the doubling cannot overflow Int.
    val rowOf = HashMap<Long, Int>(minOf(n.toLong() * 2, 1L shl 30).toInt())
    for (i in 0 until n) rowOf[idsBuf.long] = i

    val vecs = FloatArray(totalFloats.toInt())
    DataInputStream(File(baseDir, "vecs.f32").inputStream().buffered(1 shl 20)).use { din ->
        val rec = ByteArray(dim * 4)
        // Float view over the record buffer: one bulk copy per row instead of a per-float loop.
        val floats = ByteBuffer.wrap(rec).order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer()
        for (i in 0 until n) {
            din.readFully(rec)
            floats.rewind()
            floats.get(vecs, i * dim, dim)
        }
    }
    logger.i { "Dense vectors loaded (${(n.toLong() * dim * 4L) / 1_000_000} MB in RAM)" }
    return { lineId -> rowOf[lineId]?.let { row -> vecs.copyOfRange(row * dim, row * dim + dim) } }
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer
import org.apache.lucene.document.Document
import org.apache.lucene.document.Field
import org.apache.lucene.document.IntPoint
import org.apache.lucene.document.KnnFloatVectorField
import org.apache.lucene.document.StoredField
import org.apache.lucene.document.StringField
import org.apache.lucene.document.TextField
Expand All @@ -24,12 +25,16 @@ class LuceneTextIndexWriter(
indexDir: Path,
analyzer: Analyzer = StandardAnalyzer(),
private val indexHebrewField: Boolean = false,
private val indexPrimaryText: Boolean = true
private val indexPrimaryText: Boolean = true,
// Optional dense embeddings: if provided, each line doc also gets a
// KnnFloatVectorField("vec", COSINE) -> SINGLE index holding text + vectors.
private val vectorProvider: ((Long) -> FloatArray?)? = null,
) : TextIndexWriter {
companion object Fields {
const val FIELD_TYPE = "type"
const val TYPE_LINE = "line"
const val TYPE_BOOK_TITLE = "book_title"
const val FIELD_VEC = "vec"

const val FIELD_BOOK_ID = "book_id"
const val FIELD_CATEGORY_ID = "category_id"
Expand Down Expand Up @@ -106,6 +111,11 @@ class LuceneTextIndexWriter(
// Index 4-gram tokens for substring search (per-field analyzer applies NGram filter)
add(TextField(FIELD_TEXT_NG4, normalizedText, Field.Store.NO))
// rawPlainText is no longer stored - snippet source is fetched from DB at query time

// Dense embedding (single fused index): attach the line's vector if available.
vectorProvider?.invoke(lineId)?.let { vec ->
add(KnnFloatVectorField(FIELD_VEC, vec, org.apache.lucene.index.VectorSimilarityFunction.COSINE))
}
}
writer.addDocument(doc)
}
Expand Down
4 changes: 4 additions & 0 deletions gradle/libs.versions.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ kotlin = "2.3.20"
agp = "8.12.3"
jvmToolchain = "25"
lucene = "10.4.0"
onnxruntime = "1.20.0"
djlHuggingface = "0.30.0"
maven-publish = "0.36.0"
kotlinx-coroutines = "1.10.2"
kotlinx-serialization = "1.10.0"
Expand Down Expand Up @@ -34,6 +36,8 @@ lucene-highlighter = { module = "org.apache.lucene:lucene-highlighter", version.
lucene-queryparser = { module = "org.apache.lucene:lucene-queryparser", version.ref = "lucene" }
lucene-analysis-common = { module = "org.apache.lucene:lucene-analysis-common", version.ref = "lucene" }
lucene-core = { module = "org.apache.lucene:lucene-core", version.ref = "lucene" }
onnxruntime = { module = "com.microsoft.onnxruntime:onnxruntime", version.ref = "onnxruntime" }
djl-huggingface-tokenizers = { module = "ai.djl.huggingface:tokenizers", version.ref = "djlHuggingface" }
sqlDelight-driver-sqlite = { module = "app.cash.sqldelight:sqlite-driver", version.ref = "sqlDelight" }
sqlDelight-driver-android = { module = "app.cash.sqldelight:android-driver", version.ref = "sqlDelight" }
sqlDelight-driver-native = { module = "app.cash.sqldelight:native-driver", version.ref = "sqlDelight" }
Expand Down
5 changes: 5 additions & 0 deletions search/build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ kotlin {
implementation(libs.sqlDelight.driver.sqlite)
implementation(libs.kermit)
implementation(libs.jsoup)
// Dense semantic search: ONNX Runtime (query embedding) + HuggingFace tokenizer.
// Stock Maven `onnxruntime` is CPU-only on desktop JVM (no DirectML/CoreML/XNNPACK
// native — those need a custom build; CUDA needs the separate onnxruntime_gpu).
implementation(libs.onnxruntime)
implementation(libs.djl.huggingface.tokenizers)
}

jvmTest.dependencies {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package io.github.kdroidfilter.seforimlibrary.search

import org.jsoup.parser.Parser

/**
 * JVM port of `normalize_he_v5.py`, described by its author as exact: removes HTML tags,
 * nikud/teamim and characters outside the allowed Hebrew/digit/punctuation set, collapses
 * whitespace, then folds final (sofit) letters to their base form (ך→כ ם→מ ן→נ ף→פ ץ→צ).
 *
 * It MUST be applied to any text before embedding it with a v5 model (both indexed lines
 * and query), so vectors stay comparable. Pair it with a v5 model + a v5-built index.
 */
object HebrewV5Normalizer {
    // Any HTML/XML-looking tag.
    private val TAG = Regex("<[^>]+>")
    // Hebrew cantillation and vowel marks.
    private val MARKS = Regex("[֑-ׇ]")
    // Everything except Hebrew letters, digits, whitespace and the listed punctuation.
    private val DROP = Regex("[^א-ת0-9\\s.,:;!?()\\[\\]\"'\\-/׳״]")
    // Runs of whitespace.
    private val WS = Regex("\\s+")
    // Final (sofit) -> base letter folding.
    private val FINALS = mapOf('ך' to 'כ', 'ם' to 'מ', 'ן' to 'נ', 'ף' to 'פ', 'ץ' to 'צ')

    /** Returns [text] normalized for v5 embedding; the steps run in the order listed on the object. */
    fun clean(text: String): String {
        val withoutTags = TAG.replace(text, " ")
        val unescaped = Parser.unescapeEntities(withoutTags, false)
        val withoutMarks = MARKS.replace(unescaped, "")
        val allowedOnly = DROP.replace(withoutMarks, " ")
        val collapsed = WS.replace(allowedOnly, " ").trim()
        return CharArray(collapsed.length) { i ->
            val ch = collapsed[i]
            FINALS[ch] ?: ch
        }.concatToString()
    }
}
Loading
Loading