From 285cc8f3a65c270fe2eb19fa3de4abd286712202 Mon Sep 17 00:00:00 2001
From: Elie Gambache <elyahou.hadass@gmail.com>
Date: Fri, 26 Jun 2026 11:51:50 +0300
Subject: [PATCH] feat(search): embedding-only semantic highlight +
 native-image dense fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Replace dictionary-based smart highlight with embedding-based semantic
  passages: semanticSpan() picks the passage closest in meaning to the query;
  semanticFind() runs a book-scoped dense KNN for smart find-in-page
- Remove buildHighlightTerms (dictionary expansion) from the engine
- VectorSearcher: open the index with NIOFSDirectory under GraalVM native image
  (MMapDirectory's MemorySegmentIndexInputProvider can't load there), matching
  LuceneSearchEngine — fixes dense search + highlight being silently disabled
- fuse(): degrade to lexical-only (and log) instead of throwing if dense fails
- Add denseReady() to warm/probe the dense backend
---
 .../search/HybridSearchEngine.kt              | 167 ++++++++++++++++--
 .../search/LuceneSearchEngine.kt              |  47 -----
 .../seforimlibrary/search/SearchEngine.kt     |  32 +++-
 .../seforimlibrary/search/VectorSearcher.kt   |  10 +-
 4 files changed, 185 insertions(+), 71 deletions(-)
diff --git a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/HybridSearchEngine.kt b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/HybridSearchEngine.kt
index 59deb5cf..e1b61805 100644
--- a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/HybridSearchEngine.kt
+++ b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/HybridSearchEngine.kt
@@ -5,6 +5,8 @@ import kotlinx.coroutines.Dispatchers
 import kotlinx.coroutines.sync.Mutex
 import kotlinx.coroutines.sync.withLock
 import kotlinx.coroutines.withContext
+import org.jsoup.Jsoup
+import org.jsoup.safety.Safelist
 import java.nio.file.Files
 import java.nio.file.Path
 
@@ -47,7 +49,7 @@ class HybridSearchEngine(
     // Cheap, no-load check: are the model + vector index even present? Decides whether to
     // take the dense path (then load lazily); the actual OrtSession is built in fuse().
     private val denseConfigured: Boolean =
-        indexDir != null && Files.isDirectory(indexDir) && SeforimEmbedder.isAvailable(modelDir)
+        Files.isDirectory(indexDir) && SeforimEmbedder.isAvailable(modelDir)
 
     val denseEnabled: Boolean get() = embedder != null && vectorSearcher != null
 
@@ -58,7 +60,7 @@ class HybridSearchEngine(
             if (denseTried) return
             withContext(Dispatchers.IO) {
                 val emb = SeforimEmbedder.tryLoad(modelDir)
-                val vs = if (emb != null && indexDir != null && Files.isDirectory(indexDir)) {
+                val vs = if (emb != null && Files.isDirectory(indexDir)) {
                     runCatching { VectorSearcher(indexDir) }.getOrNull()
                 } else null
                 if (vs != null) {
@@ -100,8 +102,46 @@ class HybridSearchEngine(
     override fun buildSnippet(rawText: String, query: String, near: Int): String =
         lexical.buildSnippet(rawText, query, near)
 
-    override fun buildHighlightTerms(query: String): List<String> =
-        lexical.buildHighlightTerms(query)
+    /**
+     * Picks the passage of [text] closest in meaning to [query], scored with the same
+     * [embedder] that fuse() uses for dense search, so the highlight reflects meaning rather
+     * than scattered word matches. Returns the winning passage verbatim, or null when either
+     * input is blank, no embedder is loaded, or [text] yields fewer than two candidates.
+     */
+    override suspend fun semanticSpan(query: String, text: String): String? {
+        if (query.isBlank() || text.isBlank()) return null
+        ensureDense() // lazy load of the dense backend
+        val emb = embedder ?: return null
+        val clauses = splitClauses(text)
+        if (clauses.size < 2) return null // a single candidate leaves nothing to localize
+        return withContext(Dispatchers.Default) {
+            val qVec = emb.embed(query) // NOTE(review): embed() errors are not caught here
+            clauses
+                .map { it to cosine(qVec, emb.embed(it)) } // one embed() call per candidate
+                .maxByOrNull { it.second }
+                ?.first
+        }
+    }
+
+    /** Triggers the lazy dense load via ensureDense(), then reports [denseEnabled]
+     *  (true only when both the embedder and the vector searcher are non-null). */
+    override suspend fun denseReady(): Boolean {
+        ensureDense()
+        return denseEnabled
+    }
+
+    /** Embedding-based find-in-page in one book: dense KNN over the index scoped to [bookId].
+     *  Reuses the line vectors already in the index, so this is just the query embedding + a
+     *  filtered KNN — the per-line passage is computed by the caller on the displayed text. */
+    override suspend fun semanticFind(query: String, bookId: Long, limit: Int): List<Long> {
+        if (query.isBlank()) return emptyList()
+        ensureDense()
+        val emb = embedder ?: return emptyList()
+        val vs = vectorSearcher ?: return emptyList()
+        return withContext(Dispatchers.Default) {
+            vs.search(emb.embed(query), limit, baseBookOnly = false, bookIds = listOf(bookId)).map { it.lineId }
+        }
+    }
 
     override fun computeFacets(
         query: String, near: Int, bookFilter: Long?, categoryFilter: Long?,
@@ -114,8 +154,12 @@ class HybridSearchEngine(
         runCatching { lexical.close() }
     }
 
+    /** The fused, RRF-ordered hits plus the ids that the lexical path matched (so the dense-only
+     *  ones can get a meaning-based snippet lazily — their lexical snippet is meaningless). */
+    private class FusedResult(val hits: List<LineHit>, val lexicalIds: Set<Long>)
+
     /** Lexical page + dense KNN, fused by RRF, resolved to full LineHits. */
-    private suspend fun fuse(query: String, near: Int, bookIds: Collection<Long>?, baseOnly: Boolean): List<LineHit> {
+    private suspend fun fuse(query: String, near: Int, bookIds: Collection<Long>?, baseOnly: Boolean): FusedResult {
         ensureDense()   // first call loads the model off-main (covered by the search spinner)
         val lexHits = lexical.openSession(query, near = near, bookIds = bookIds, baseBookOnly = baseOnly)
             ?.use { it.nextPage(candidates)?.hits ?: emptyList() } ?: emptyList()
@@ -123,11 +167,18 @@ class HybridSearchEngine(
         // Dense failed to load -> lexical-only result (still correct, just no semantic recall).
         val emb = embedder
         val vs = vectorSearcher
-        if (emb == null || vs == null) return lexHits
+        if (emb == null || vs == null) return FusedResult(lexHits, lexHits.map { it.lineId }.toSet())
 
-        val denseHits = withContext(Dispatchers.Default) {
-            val qVec = emb.embed(query)
-            vs.search(qVec, candidates, baseOnly, bookIds)
+        // Don't let a dense failure sink the whole search: degrade to lexical and log. This
+        // also catches native-image gaps (e.g. a missing Lucene KNN vectors-format SPI entry).
+        val denseHits = try {
+            withContext(Dispatchers.Default) {
+                val qVec = emb.embed(query)
+                vs.search(qVec, candidates, baseOnly, bookIds)
+            }
+        } catch (e: Exception) {
+            logger.w(e) { "dense KNN failed; falling back to lexical-only" }
+            return FusedResult(lexHits, lexHits.map { it.lineId }.toSet())
         }
 
         val rrf = HashMap<Long, Double>()
@@ -145,7 +196,31 @@ class HybridSearchEngine(
             val hit = lexById[lineId] ?: resolveLine(lineId, bookOf[lineId] ?: -1L, query)
             if (hit != null) out += hit.copy(score = score.toFloat())
         }
-        return out
+        return FusedResult(out, lexById.keys)
+    }
+
+    /**
+     * Snippet for a dense-only hit: the lexical builder can't anchor (the query words aren't in
+     * the text) and, fed the passage, it tokenizes and bolds each word separately — so short
+     * function words (ואם, או…) get highlighted on their own. Instead, locate the passage closest
+     * in meaning and bold it as ONE contiguous span inside a context window. Null -> keep lexical.
+     */
+    private suspend fun semanticSnippet(query: String, rawText: String, near: Int): String? {
+        // Jsoup.clean returns HTML-escaped text; substrings stay escaped, so we splice <b> in
+        // directly (same convention as the lexical snippet builder).
+        val clean = Jsoup.clean(rawText, Safelist.none())
+        val passage = semanticSpan(query, clean) ?: return null
+        val idx = clean.indexOf(passage)
+        if (idx < 0) return lexical.buildSnippet(rawText, passage, near) // defensive: passage is sliced from clean, so a miss is not expected
+        val from = (idx - SNIPPET_CONTEXT).coerceAtLeast(0) // NOTE(review): a window edge may split an HTML entity in the escaped text — confirm
+        val to = (idx + passage.length + SNIPPET_CONTEXT).coerceAtMost(clean.length)
+        return buildString {
+            if (from > 0) append("…")
+            append(clean, from, idx)
+            append("<b>").append(passage).append("</b>")
+            append(clean, idx + passage.length, to)
+            if (to < clean.length) append("…")
+        }
     }
 
     private inner class HybridSession(
@@ -154,24 +229,86 @@ class HybridSearchEngine(
         private val bookIds: Collection<Long>?,
         private val baseOnly: Boolean,
     ) : SearchSession {
-        private var fused: List<LineHit>? = null
+        private var fused: FusedResult? = null
         private var offset = 0
 
         override suspend fun nextPage(limit: Int): SearchPage? {
             val all = fused ?: fuse(query, near, bookIds, baseOnly).also { fused = it }
-            if (offset >= all.size) return null
-            val end = minOf(offset + limit, all.size)
-            val slice = all.subList(offset, end)
+            if (offset >= all.hits.size) return null
+            val end = minOf(offset + limit, all.hits.size)
+            // Re-snippet only the dense-only hits on THIS page (bounded work) so semantic
+            // results show the passage that actually matched, not the line's opening words.
+            val slice = all.hits.subList(offset, end).map { hit ->
+                if (hit.lineId in all.lexicalIds) {
+                    hit
+                } else {
+                    semanticSnippet(query, hit.rawText, near)?.let { hit.copy(snippet = it) } ?: hit
+                }
+            }
             offset = end
-            return SearchPage(hits = slice, totalHits = all.size.toLong(), isLastPage = offset >= all.size)
+            return SearchPage(hits = slice, totalHits = all.hits.size.toLong(), isLastPage = offset >= all.hits.size)
         }
 
         override fun close() {}
     }
 
+    /**
+     * Splits [text] into passage candidates: first on strong clause delimiters (incl. the
+     * Hebrew sof-pasuk ׃), then sliding word windows over any segment too long to localize.
+     * Substrings are verbatim slices of [text] (original spacing preserved) so the caller
+     * can match them back diacritic-insensitively. Tiny fragments are dropped to keep a
+     * highlight that is a *passage*, not a stray word.
+     */
+    private fun splitClauses(text: String): List<String> {
+        val delimiters = charArrayOf('.', '!', '?', ':', ';', '׃', '\n') // NOTE(review): ';' also ends HTML entities when text is Jsoup-escaped (semanticSnippet) — confirm
+        val segments = ArrayList<IntRange>()
+        var start = 0
+        for (i in text.indices) {
+            if (text[i] in delimiters) {
+                if (i > start) segments += start..i // inclusive range: the delimiter stays with its segment
+                start = i + 1
+            }
+        }
+        if (start < text.length) segments += start until text.length // trailing text with no closing delimiter
+
+        val out = ArrayList<String>()
+        for (seg in segments) {
+            val sub = text.substring(seg.first, seg.last + 1)
+            val wordRanges = WORD_RE.findAll(sub).map { it.range }.toList()
+            if (wordRanges.size < MIN_PASSAGE_WORDS) continue // too few words: segment dropped
+            if (wordRanges.size <= MAX_PASSAGE_WORDS) {
+                out += sub.substring(wordRanges.first().first, wordRanges.last().last + 1) // first word start to last word end
+            } else {
+                // Segment too long to localize: slide word windows over the original slice.
+                var w = 0
+                while (w < wordRanges.size) {
+                    val from = wordRanges[w].first
+                    val toIdx = minOf(w + MAX_PASSAGE_WORDS - 1, wordRanges.size - 1)
+                    out += sub.substring(from, wordRanges[toIdx].last + 1)
+                    if (toIdx == wordRanges.size - 1) break // this window already reached the last word
+                    w += WINDOW_STRIDE_WORDS // stride (8) < window (14), so consecutive windows overlap
+                }
+            }
+        }
+        return if (out.isEmpty()) listOf(text.trim()) else out // fallback: the whole trimmed text as the only candidate
+    }
+
+    private fun cosine(a: FloatArray, b: FloatArray): Float { // dot product over the shared prefix; no division by the norms
+        var s = 0f
+        val n = minOf(a.size, b.size) // extra elements of the longer array are ignored
+        for (i in 0 until n) s += a[i] * b[i]
+        return s // NOTE(review): equals cosine similarity only for unit-length vectors — confirm embed() normalizes
+    }
+
     companion object {
         private val logger = Logger.withTag("HybridSearch")
 
+        private val WORD_RE = Regex("\\S+")
+        private const val MIN_PASSAGE_WORDS = 3
+        private const val MAX_PASSAGE_WORDS = 14
+        private const val WINDOW_STRIDE_WORDS = 8
+        private const val SNIPPET_CONTEXT = 90 // chars of context kept on each side of the passage
+
         fun create(
             lexical: LuceneSearchEngine,
             indexDir: Path,
diff --git a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/LuceneSearchEngine.kt b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/LuceneSearchEngine.kt
index 5bb3bbb9..b29acc66 100644
--- a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/LuceneSearchEngine.kt
+++ b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/LuceneSearchEngine.kt
@@ -218,53 +218,6 @@ class LuceneSearchEngine(
         return buildSnippetInternal(rawClean, anchorTerms, highlightTerms)
     }
 
-    override fun buildHighlightTerms(query: String): List<String> {
-        val parsed = SearchQueryParser.parse(query)
-        val norm = HebrewTextUtils.normalizeHebrew(parsed.freeText)
-        // Verbatim tokens from the quoted phrases, highlighted without dictionary expansion.
-        val exactPhraseTokens = parsed.exactPhrases
-            .flatMap { analyzeToTerms(stdAnalyzer, HebrewTextUtils.normalizeHebrew(it)) ?: emptyList() }
-
-        if (norm.isBlank()) return filterTermsForHighlight(exactPhraseTokens)
-
-        val analyzedRaw = analyzeToTerms(stdAnalyzer, norm) ?: emptyList()
-        val hasHashem = query.contains("ה׳") || query.contains("ה'")
-
-        // Filter single letters and stop words (same logic as buildSearchContext)
-        val analyzedStd = analyzedRaw.filter { token ->
-            if (token == "ה" && hasHashem) return@filter true
-            if (token.any { it.isDigit() }) return@filter true
-            token.length >= 2 && token !in setOf(
-                "א", "ב", "ג", "ד", "ה", "ו", "ז", "ח", "ט", "י", "כ", "ל", "מ",
-                "נ", "ס", "ע", "פ", "צ", "ק", "ר", "ש", "ת",
-            )
-        }
-
-        // Get dictionary expansions
-        val tokenExpansions: Map<String, List<MagicDictionaryIndex.Expansion>> =
-            analyzedStd.associateWith { token ->
-                val expansion = magicDict?.expansionFor(token) ?: return@associateWith emptyList()
-                listOf(expansion)
-            }
-
-        // Filter hallucinations for highlighting
-        val tokenExpansionsForHighlight = tokenExpansions.mapValues { (token, exps) ->
-            exps.filter { exp -> !isHallucinatedExpansion(token, exp) }
-        }
-
-        // Build expanded terms (filter 2-letter from expansions only)
-        val allExpansionsForHighlight = tokenExpansionsForHighlight.values.flatten()
-        val expandedTerms = allExpansionsForHighlight
-            .flatMap { it.surface + it.variants + it.base }
-            .filter { it.length > 2 }
-            .distinct()
-
-        val ngramTerms = buildNgramTerms(analyzedStd, gram = 4)
-        val hashemTerms = if (hasHashem) loadHashemHighlightTerms() else emptyList()
-
-        return filterTermsForHighlight(analyzedStd + expandedTerms + ngramTerms + hashemTerms + exactPhraseTokens)
-    }
-
     override fun close() {
         // Directory is closed automatically when readers are closed
     }
diff --git a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/SearchEngine.kt b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/SearchEngine.kt
index 827b639b..f2d6cef3 100644
--- a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/SearchEngine.kt
+++ b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/SearchEngine.kt
@@ -86,17 +86,33 @@ interface SearchEngine : Closeable {
     fun buildSnippet(rawText: String, query: String, near: Int): String
 
     /**
-     * Builds a list of terms to highlight for a given query, using dictionary expansion.
+     * Returns the contiguous passage within [text] whose meaning is closest to [query],
+     * for semantic highlighting (using the same dense encoder as semantic search).
      *
-     * This is useful for intelligent find-in-page that matches the same words
-     * as the global search (including synonyms and morphological variants).
-     * The terms are filtered to exclude hallucinated mappings and short words
-     * that only came from dictionary expansion.
+     * The result is a verbatim substring of [text] so callers can locate it with the
+     * usual diacritic-insensitive matching and highlight that single span — instead of
+     * scattering dictionary-expanded word matches that don't reflect meaning.
      *
-     * @param query The search query in Hebrew
-     * @return List of normalized terms to highlight (includes original tokens + expansions)
+     * @return the best-matching passage, or null when dense search is unavailable or no
+     *         passage stands out (e.g. the text is a single short clause).
      */
-    fun buildHighlightTerms(query: String): List<String>
+    suspend fun semanticSpan(query: String, text: String): String? = null
+
+    /**
+     * Embedding-based find-in-page within a single book: returns the ids of the lines
+     * semantically closest to [query] (dense KNN over the index, scoped to [bookId]),
+     * ordered by relevance. Used by the "smart" find mode — the simple mode matches literal
+     * words instead. The per-line passage to highlight is computed by the caller via
+     * [semanticSpan] on the displayed text. Empty when dense search is unavailable.
+     */
+    suspend fun semanticFind(query: String, bookId: Long, limit: Int): List<Long> = emptyList()
+
+    /**
+     * Ensures the dense backend (embedding model + vector index) is loaded and reports whether
+     * it is actually available. Useful for diagnostics and to decide whether semantic features
+     * can run. Returns false for engines without a dense path.
+     */
+    suspend fun denseReady(): Boolean = false
 
     /**
      * Computes aggregate facet counts without loading full results.
diff --git a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/VectorSearcher.kt b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/VectorSearcher.kt
index 3868c8c0..c62f4bd3 100644
--- a/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/VectorSearcher.kt
+++ b/search/src/jvmMain/kotlin/io/github/kdroidfilter/seforimlibrary/search/VectorSearcher.kt
@@ -8,6 +8,7 @@ import org.apache.lucene.search.IndexSearcher
 import org.apache.lucene.search.KnnFloatVectorQuery
 import org.apache.lucene.search.Query
 import org.apache.lucene.store.FSDirectory
+import org.apache.lucene.store.NIOFSDirectory
 import java.io.Closeable
 import java.nio.file.Path
 
@@ -23,7 +24,14 @@ data class DenseHit(val lineId: Long, val bookId: Long, val score: Float)
  * Returns line ids that are joined back to the DB by the caller.
  */
 class VectorSearcher(indexDir: Path) : Closeable {
-    private val dir = FSDirectory.open(indexDir)
+    // GraalVM native image can't instantiate MMapDirectory's MemorySegmentIndexInputProvider
+    // (Panama foreign downcalls) — use NIOFSDirectory there, like LuceneSearchEngine does.
+    private val dir =
+        if (System.getProperty("org.graalvm.nativeimage.imagecode") != null) {
+            NIOFSDirectory(indexDir)
+        } else {
+            FSDirectory.open(indexDir)
+        }
 
     private fun filterQuery(baseBookOnly: Boolean, bookIds: Collection<Long>?): Query? {
         val b = BooleanQuery.Builder()