From 285cc8f3a65c270fe2eb19fa3de4abd286712202 Mon Sep 17 00:00:00 2001 From: Elie Gambache Date: Fri, 26 Jun 2026 11:51:50 +0300 Subject: [PATCH] feat(search): embedding-only semantic highlight + native-image dense fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace dictionary-based smart highlight with embedding-based semantic passages: semanticSpan() picks the passage closest in meaning to the query; semanticFind() runs a book-scoped dense KNN for smart find-in-page - Remove buildHighlightTerms (dictionary expansion) from the engine - VectorSearcher: open the index with NIOFSDirectory under GraalVM native image (MMapDirectory's MemorySegmentIndexInputProvider can't load there), matching LuceneSearchEngine — fixes dense search + highlight being silently disabled - fuse(): degrade to lexical-only (and log) instead of throwing if dense fails - denseReady() to warm/probe the dense backend --- .../search/HybridSearchEngine.kt | 167 ++++++++++++++++-- .../search/LuceneSearchEngine.kt | 47 ----- .../seforimlibrary/search/SearchEngine.kt | 32 +++- .../seforimlibrary/search/VectorSearcher.kt | 10 +- 4 files changed, 185 insertions(+), 71 deletions(-) diff --git a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/HybridSearchEngine.kt b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/HybridSearchEngine.kt index 59deb5cf..e1b61805 100644 --- a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/HybridSearchEngine.kt +++ b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/HybridSearchEngine.kt @@ -5,6 +5,8 @@ import kotlinx.coroutines.Dispatchers import kotlinx.coroutines.sync.Mutex import kotlinx.coroutines.sync.withLock import kotlinx.coroutines.withContext +import org.jsoup.Jsoup +import org.jsoup.safety.Safelist import java.nio.file.Files import java.nio.file.Path @@ -47,7 +49,7 @@ class HybridSearchEngine( // Cheap, no-load check: are 
the model + vector index even present? Decides whether to // take the dense path (then load lazily); the actual OrtSession is built in fuse(). private val denseConfigured: Boolean = - indexDir != null && Files.isDirectory(indexDir) && SeforimEmbedder.isAvailable(modelDir) + Files.isDirectory(indexDir) && SeforimEmbedder.isAvailable(modelDir) val denseEnabled: Boolean get() = embedder != null && vectorSearcher != null @@ -58,7 +60,7 @@ class HybridSearchEngine( if (denseTried) return withContext(Dispatchers.IO) { val emb = SeforimEmbedder.tryLoad(modelDir) - val vs = if (emb != null && indexDir != null && Files.isDirectory(indexDir)) { + val vs = if (emb != null && Files.isDirectory(indexDir)) { runCatching { VectorSearcher(indexDir) }.getOrNull() } else null if (vs != null) { @@ -100,8 +102,46 @@ class HybridSearchEngine( override fun buildSnippet(rawText: String, query: String, near: Int): String = lexical.buildSnippet(rawText, query, near) - override fun buildHighlightTerms(query: String): List = - lexical.buildHighlightTerms(query) + /** + * Picks the passage of [text] closest in meaning to [query], using the SAME dense + * encoder as semantic search — so the highlight reflects meaning instead of scattering + * dictionary word matches. Returns the winning passage verbatim, or null when dense is + * unavailable or the text isn't worth localizing (a single short clause). + */ + override suspend fun semanticSpan(query: String, text: String): String? { + if (query.isBlank() || text.isBlank()) return null + ensureDense() + val emb = embedder ?: return null + val clauses = splitClauses(text) + if (clauses.size < 2) return null + return withContext(Dispatchers.Default) { + val qVec = emb.embed(query) + clauses + .map { it to cosine(qVec, emb.embed(it)) } + .maxByOrNull { it.second } + ?.first + } + } + + override suspend fun denseReady(): Boolean { + ensureDense() + return denseEnabled + } + + /** + * Embedding-based find-in-page in one book: dense KNN over the index scoped to [bookId]. 
+ * Reuses the line vectors already in the index, so this is just the query embedding + a + * filtered KNN — the per-line passage is computed by the caller on the displayed text. + */ + override suspend fun semanticFind(query: String, bookId: Long, limit: Int): List { + if (query.isBlank()) return emptyList() + ensureDense() + val emb = embedder ?: return emptyList() + val vs = vectorSearcher ?: return emptyList() + return withContext(Dispatchers.Default) { + vs.search(emb.embed(query), limit, baseBookOnly = false, bookIds = listOf(bookId)).map { it.lineId } + } + } override fun computeFacets( query: String, near: Int, bookFilter: Long?, categoryFilter: Long?, @@ -114,8 +154,12 @@ class HybridSearchEngine( runCatching { lexical.close() } } + /** The fused, RRF-ordered hits plus the ids that the lexical path matched (so the dense-only + * ones can get a meaning-based snippet lazily — their lexical snippet is meaningless). */ + private class FusedResult(val hits: List, val lexicalIds: Set) + /** Lexical page + dense KNN, fused by RRF, resolved to full LineHits. */ - private suspend fun fuse(query: String, near: Int, bookIds: Collection?, baseOnly: Boolean): List { + private suspend fun fuse(query: String, near: Int, bookIds: Collection?, baseOnly: Boolean): FusedResult { ensureDense() // first call loads the model off-main (covered by the search spinner) val lexHits = lexical.openSession(query, near = near, bookIds = bookIds, baseBookOnly = baseOnly) ?.use { it.nextPage(candidates)?.hits ?: emptyList() } ?: emptyList() @@ -123,11 +167,18 @@ class HybridSearchEngine( // Dense failed to load -> lexical-only result (still correct, just no semantic recall). 
val emb = embedder val vs = vectorSearcher - if (emb == null || vs == null) return lexHits + if (emb == null || vs == null) return FusedResult(lexHits, lexHits.map { it.lineId }.toSet()) - val denseHits = withContext(Dispatchers.Default) { - val qVec = emb.embed(query) - vs.search(qVec, candidates, baseOnly, bookIds) + // Don't let a dense failure sink the whole search: degrade to lexical and log. This + // also catches native-image gaps (e.g. a missing Lucene KNN vectors-format SPI entry). + val denseHits = try { + withContext(Dispatchers.Default) { + val qVec = emb.embed(query) + vs.search(qVec, candidates, baseOnly, bookIds) + } + } catch (e: Exception) { + logger.w(e) { "dense KNN failed; falling back to lexical-only" } + return FusedResult(lexHits, lexHits.map { it.lineId }.toSet()) } val rrf = HashMap() @@ -145,7 +196,31 @@ class HybridSearchEngine( val hit = lexById[lineId] ?: resolveLine(lineId, bookOf[lineId] ?: -1L, query) if (hit != null) out += hit.copy(score = score.toFloat()) } - return out + return FusedResult(out, lexById.keys) + } + + /** + * Snippet for a dense-only hit: the lexical builder can't anchor (the query words aren't in + * the text) and, fed the passage, it tokenizes and bolds each word separately — so short + * function words (ואם, או…) get highlighted on their own. Instead, locate the passage closest + * in meaning and bold it as ONE contiguous span inside a context window. Null -> keep lexical. + */ + private suspend fun semanticSnippet(query: String, rawText: String, near: Int): String? { + // Jsoup.clean returns HTML-escaped text; substrings stay escaped, so we splice in + // directly (same convention as the lexical snippet builder). 
+ val clean = Jsoup.clean(rawText, Safelist.none()) + val passage = semanticSpan(query, clean) ?: return null + val idx = clean.indexOf(passage) + if (idx < 0) return lexical.buildSnippet(rawText, passage, near) + val from = (idx - SNIPPET_CONTEXT).coerceAtLeast(0) + val to = (idx + passage.length + SNIPPET_CONTEXT).coerceAtMost(clean.length) + return buildString { + if (from > 0) append("…") + append(clean, from, idx) + append("").append(passage).append("") + append(clean, idx + passage.length, to) + if (to < clean.length) append("…") + } } private inner class HybridSession( @@ -154,24 +229,86 @@ class HybridSearchEngine( private val bookIds: Collection?, private val baseOnly: Boolean, ) : SearchSession { - private var fused: List? = null + private var fused: FusedResult? = null private var offset = 0 override suspend fun nextPage(limit: Int): SearchPage? { val all = fused ?: fuse(query, near, bookIds, baseOnly).also { fused = it } - if (offset >= all.size) return null - val end = minOf(offset + limit, all.size) - val slice = all.subList(offset, end) + if (offset >= all.hits.size) return null + val end = minOf(offset + limit, all.hits.size) + // Re-snippet only the dense-only hits on THIS page (bounded work) so semantic + // results show the passage that actually matched, not the line's opening words. + val slice = all.hits.subList(offset, end).map { hit -> + if (hit.lineId in all.lexicalIds) { + hit + } else { + semanticSnippet(query, hit.rawText, near)?.let { hit.copy(snippet = it) } ?: hit + } + } offset = end - return SearchPage(hits = slice, totalHits = all.size.toLong(), isLastPage = offset >= all.size) + return SearchPage(hits = slice, totalHits = all.hits.size.toLong(), isLastPage = offset >= all.hits.size) } override fun close() {} } + /** + * Splits [text] into passage candidates: first on strong clause delimiters (incl. the + * Hebrew sof-pasuk ׃), then sliding word windows over any segment too long to localize. 
+ * Substrings are verbatim slices of [text] (original spacing preserved) so the caller + * can match them back diacritic-insensitively. Tiny fragments are dropped to keep a + * highlight that is a *passage*, not a stray word. + */ + private fun splitClauses(text: String): List { + val delimiters = charArrayOf('.', '!', '?', ':', ';', '׃', '\n') + val segments = ArrayList() + var start = 0 + for (i in text.indices) { + if (text[i] in delimiters) { + if (i > start) segments += start..i + start = i + 1 + } + } + if (start < text.length) segments += start until text.length + + val out = ArrayList() + for (seg in segments) { + val sub = text.substring(seg.first, seg.last + 1) + val wordRanges = WORD_RE.findAll(sub).map { it.range }.toList() + if (wordRanges.size < MIN_PASSAGE_WORDS) continue + if (wordRanges.size <= MAX_PASSAGE_WORDS) { + out += sub.substring(wordRanges.first().first, wordRanges.last().last + 1) + } else { + // Segment too long to localize: slide word windows over the original slice. 
+ var w = 0 + while (w < wordRanges.size) { + val from = wordRanges[w].first + val toIdx = minOf(w + MAX_PASSAGE_WORDS - 1, wordRanges.size - 1) + out += sub.substring(from, wordRanges[toIdx].last + 1) + if (toIdx == wordRanges.size - 1) break + w += WINDOW_STRIDE_WORDS + } + } + } + return if (out.isEmpty()) listOf(text.trim()) else out + } + + private fun cosine(a: FloatArray, b: FloatArray): Float { + var s = 0f + val n = minOf(a.size, b.size) + for (i in 0 until n) s += a[i] * b[i] + return s + } + companion object { private val logger = Logger.withTag("HybridSearch") + private val WORD_RE = Regex("\\S+") + private const val MIN_PASSAGE_WORDS = 3 + private const val MAX_PASSAGE_WORDS = 14 + private const val WINDOW_STRIDE_WORDS = 8 + private const val SNIPPET_CONTEXT = 90 // chars of context kept on each side of the passage + fun create( lexical: LuceneSearchEngine, indexDir: Path, diff --git a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/LuceneSearchEngine.kt b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/LuceneSearchEngine.kt index 5bb3bbb9..b29acc66 100644 --- a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/LuceneSearchEngine.kt +++ b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/LuceneSearchEngine.kt @@ -218,53 +218,6 @@ class LuceneSearchEngine( return buildSnippetInternal(rawClean, anchorTerms, highlightTerms) } - override fun buildHighlightTerms(query: String): List { - val parsed = SearchQueryParser.parse(query) - val norm = HebrewTextUtils.normalizeHebrew(parsed.freeText) - // Verbatim tokens from the quoted phrases, highlighted without dictionary expansion. 
- val exactPhraseTokens = parsed.exactPhrases - .flatMap { analyzeToTerms(stdAnalyzer, HebrewTextUtils.normalizeHebrew(it)) ?: emptyList() } - - if (norm.isBlank()) return filterTermsForHighlight(exactPhraseTokens) - - val analyzedRaw = analyzeToTerms(stdAnalyzer, norm) ?: emptyList() - val hasHashem = query.contains("ה׳") || query.contains("ה'") - - // Filter single letters and stop words (same logic as buildSearchContext) - val analyzedStd = analyzedRaw.filter { token -> - if (token == "ה" && hasHashem) return@filter true - if (token.any { it.isDigit() }) return@filter true - token.length >= 2 && token !in setOf( - "א", "ב", "ג", "ד", "ה", "ו", "ז", "ח", "ט", "י", "כ", "ל", "מ", - "נ", "ס", "ע", "פ", "צ", "ק", "ר", "ש", "ת", - ) - } - - // Get dictionary expansions - val tokenExpansions: Map> = - analyzedStd.associateWith { token -> - val expansion = magicDict?.expansionFor(token) ?: return@associateWith emptyList() - listOf(expansion) - } - - // Filter hallucinations for highlighting - val tokenExpansionsForHighlight = tokenExpansions.mapValues { (token, exps) -> - exps.filter { exp -> !isHallucinatedExpansion(token, exp) } - } - - // Build expanded terms (filter 2-letter from expansions only) - val allExpansionsForHighlight = tokenExpansionsForHighlight.values.flatten() - val expandedTerms = allExpansionsForHighlight - .flatMap { it.surface + it.variants + it.base } - .filter { it.length > 2 } - .distinct() - - val ngramTerms = buildNgramTerms(analyzedStd, gram = 4) - val hashemTerms = if (hasHashem) loadHashemHighlightTerms() else emptyList() - - return filterTermsForHighlight(analyzedStd + expandedTerms + ngramTerms + hashemTerms + exactPhraseTokens) - } - override fun close() { // Directory is closed automatically when readers are closed } diff --git a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/SearchEngine.kt b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/SearchEngine.kt index 827b639b..f2d6cef3 100644 
--- a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/SearchEngine.kt +++ b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/SearchEngine.kt @@ -86,17 +86,33 @@ interface SearchEngine : Closeable { fun buildSnippet(rawText: String, query: String, near: Int): String /** - * Builds a list of terms to highlight for a given query, using dictionary expansion. + * Returns the contiguous passage within [text] whose meaning is closest to [query], + * for semantic highlighting (using the same dense encoder as semantic search). * - * This is useful for intelligent find-in-page that matches the same words - * as the global search (including synonyms and morphological variants). - * The terms are filtered to exclude hallucinated mappings and short words - * that only came from dictionary expansion. + * The result is a verbatim substring of [text] so callers can locate it with the + * usual diacritic-insensitive matching and highlight that single span — instead of + * scattering dictionary-expanded word matches that don't reflect meaning. * - * @param query The search query in Hebrew - * @return List of normalized terms to highlight (includes original tokens + expansions) + * @return the best-matching passage, or null when dense search is unavailable or no + * passage stands out (e.g. the text is a single short clause). */ - fun buildHighlightTerms(query: String): List + suspend fun semanticSpan(query: String, text: String): String? = null + + /** + * Embedding-based find-in-page within a single book: returns the ids of the lines + * semantically closest to [query] (dense KNN over the index, scoped to [bookId]), + * ordered by relevance. Used by the "smart" find mode — the simple mode matches literal + * words instead. The per-line passage to highlight is computed by the caller via + * [semanticSpan] on the displayed text. Empty when dense search is unavailable. 
+ */ + suspend fun semanticFind(query: String, bookId: Long, limit: Int): List = emptyList() + + /** + * Ensures the dense backend (embedding model + vector index) is loaded and reports whether + * it is actually available. Useful for diagnostics and to decide whether semantic features + * can run. Returns false for engines without a dense path. + */ + suspend fun denseReady(): Boolean = false /** * Computes aggregate facet counts without loading full results. diff --git a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/VectorSearcher.kt b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/VectorSearcher.kt index 3868c8c0..c62f4bd3 100644 --- a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/VectorSearcher.kt +++ b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/VectorSearcher.kt @@ -8,6 +8,7 @@ import org.apache.lucene.search.IndexSearcher import org.apache.lucene.search.KnnFloatVectorQuery import org.apache.lucene.search.Query import org.apache.lucene.store.FSDirectory +import org.apache.lucene.store.NIOFSDirectory import java.io.Closeable import java.nio.file.Path @@ -23,7 +24,14 @@ data class DenseHit(val lineId: Long, val bookId: Long, val score: Float) * Returns line ids that are joined back to the DB by the caller. */ class VectorSearcher(indexDir: Path) : Closeable { - private val dir = FSDirectory.open(indexDir) + // GraalVM native image can't instantiate MMapDirectory's MemorySegmentIndexInputProvider + // (Panama foreign downcalls) — use NIOFSDirectory there, like LuceneSearchEngine does. + private val dir = + if (System.getProperty("org.graalvm.nativeimage.imagecode") != null) { + NIOFSDirectory(indexDir) + } else { + FSDirectory.open(indexDir) + } private fun filterQuery(baseBookOnly: Boolean, bookIds: Collection?): Query? { val b = BooleanQuery.Builder()