Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.sync.Mutex
import kotlinx.coroutines.sync.withLock
import kotlinx.coroutines.withContext
import org.jsoup.Jsoup
import org.jsoup.safety.Safelist
import java.nio.file.Files
import java.nio.file.Path

Expand Down Expand Up @@ -47,7 +49,7 @@ class HybridSearchEngine(
// Cheap, no-load check: are the model + vector index even present? Decides whether to
// take the dense path (then load lazily); the actual OrtSession is built in fuse().
private val denseConfigured: Boolean =
indexDir != null && Files.isDirectory(indexDir) && SeforimEmbedder.isAvailable(modelDir)
Files.isDirectory(indexDir) && SeforimEmbedder.isAvailable(modelDir)

val denseEnabled: Boolean get() = embedder != null && vectorSearcher != null

Expand All @@ -58,7 +60,7 @@ class HybridSearchEngine(
if (denseTried) return
withContext(Dispatchers.IO) {
val emb = SeforimEmbedder.tryLoad(modelDir)
val vs = if (emb != null && indexDir != null && Files.isDirectory(indexDir)) {
val vs = if (emb != null && Files.isDirectory(indexDir)) {
runCatching { VectorSearcher(indexDir) }.getOrNull()
} else null
if (vs != null) {
Expand Down Expand Up @@ -100,8 +102,46 @@ class HybridSearchEngine(
override fun buildSnippet(rawText: String, query: String, near: Int): String =
lexical.buildSnippet(rawText, query, near)

override fun buildHighlightTerms(query: String): List<String> =
lexical.buildHighlightTerms(query)
/**
* Picks the passage of [text] closest in meaning to [query], using the SAME dense
* encoder as semantic search — so the highlight reflects meaning instead of scattering
* dictionary word matches. Returns the winning passage verbatim, or null when dense is
* unavailable or the text isn't worth localizing (a single short clause).
*/
override suspend fun semanticSpan(query: String, text: String): String? {
if (query.isBlank() || text.isBlank()) return null
ensureDense()
val emb = embedder ?: return null
val clauses = splitClauses(text)
if (clauses.size < 2) return null
return withContext(Dispatchers.Default) {
val qVec = emb.embed(query)
clauses
.map { it to cosine(qVec, emb.embed(it)) }
.maxByOrNull { it.second }
?.first
}
}

/**
* Embedding-based find-in-page in one book: dense KNN over the index scoped to [bookId].
* Reuses the line vectors already in the index, so this is just the query embedding + a
* filtered KNN — the per-line passage is computed by the caller on the displayed text.
*/
override suspend fun denseReady(): Boolean {
ensureDense()
return denseEnabled
}

override suspend fun semanticFind(query: String, bookId: Long, limit: Int): List<Long> {
if (query.isBlank()) return emptyList()
ensureDense()
val emb = embedder ?: return emptyList()
val vs = vectorSearcher ?: return emptyList()
return withContext(Dispatchers.Default) {
vs.search(emb.embed(query), limit, baseBookOnly = false, bookIds = listOf(bookId)).map { it.lineId }
}
}

override fun computeFacets(
query: String, near: Int, bookFilter: Long?, categoryFilter: Long?,
Expand All @@ -114,20 +154,31 @@ class HybridSearchEngine(
runCatching { lexical.close() }
}

/** The fused, RRF-ordered hits plus the ids that the lexical path matched (so the dense-only
* ones can get a meaning-based snippet lazily — their lexical snippet is meaningless). */
private class FusedResult(val hits: List<LineHit>, val lexicalIds: Set<Long>)

/** Lexical page + dense KNN, fused by RRF, resolved to full LineHits. */
private suspend fun fuse(query: String, near: Int, bookIds: Collection<Long>?, baseOnly: Boolean): List<LineHit> {
private suspend fun fuse(query: String, near: Int, bookIds: Collection<Long>?, baseOnly: Boolean): FusedResult {
ensureDense() // first call loads the model off-main (covered by the search spinner)
val lexHits = lexical.openSession(query, near = near, bookIds = bookIds, baseBookOnly = baseOnly)
?.use { it.nextPage(candidates)?.hits ?: emptyList() } ?: emptyList()

// Dense failed to load -> lexical-only result (still correct, just no semantic recall).
val emb = embedder
val vs = vectorSearcher
if (emb == null || vs == null) return lexHits
if (emb == null || vs == null) return FusedResult(lexHits, lexHits.map { it.lineId }.toSet())

val denseHits = withContext(Dispatchers.Default) {
val qVec = emb.embed(query)
vs.search(qVec, candidates, baseOnly, bookIds)
// Don't let a dense failure sink the whole search: degrade to lexical and log. This
// also catches native-image gaps (e.g. a missing Lucene KNN vectors-format SPI entry).
val denseHits = try {
withContext(Dispatchers.Default) {
val qVec = emb.embed(query)
vs.search(qVec, candidates, baseOnly, bookIds)
}
} catch (e: Exception) {
logger.w(e) { "dense KNN failed; falling back to lexical-only" }
return FusedResult(lexHits, lexHits.map { it.lineId }.toSet())
}

val rrf = HashMap<Long, Double>()
Expand All @@ -145,7 +196,31 @@ class HybridSearchEngine(
val hit = lexById[lineId] ?: resolveLine(lineId, bookOf[lineId] ?: -1L, query)
if (hit != null) out += hit.copy(score = score.toFloat())
}
return out
return FusedResult(out, lexById.keys)
}

/**
* Snippet for a dense-only hit: the lexical builder can't anchor (the query words aren't in
* the text) and, fed the passage, it tokenizes and bolds each word separately — so short
* function words (ואם, או…) get highlighted on their own. Instead, locate the passage closest
* in meaning and bold it as ONE contiguous span inside a context window. Null -> keep lexical.
*/
private suspend fun semanticSnippet(query: String, rawText: String, near: Int): String? {
// Jsoup.clean returns HTML-escaped text; substrings stay escaped, so we splice <b> in
// directly (same convention as the lexical snippet builder).
val clean = Jsoup.clean(rawText, Safelist.none())
val passage = semanticSpan(query, clean) ?: return null
val idx = clean.indexOf(passage)
if (idx < 0) return lexical.buildSnippet(rawText, passage, near)
val from = (idx - SNIPPET_CONTEXT).coerceAtLeast(0)
val to = (idx + passage.length + SNIPPET_CONTEXT).coerceAtMost(clean.length)
return buildString {
if (from > 0) append("…")
append(clean, from, idx)
append("<b>").append(passage).append("</b>")
append(clean, idx + passage.length, to)
if (to < clean.length) append("…")
}
}

private inner class HybridSession(
Expand All @@ -154,24 +229,86 @@ class HybridSearchEngine(
private val bookIds: Collection<Long>?,
private val baseOnly: Boolean,
) : SearchSession {
private var fused: List<LineHit>? = null
private var fused: FusedResult? = null
private var offset = 0

override suspend fun nextPage(limit: Int): SearchPage? {
val all = fused ?: fuse(query, near, bookIds, baseOnly).also { fused = it }
if (offset >= all.size) return null
val end = minOf(offset + limit, all.size)
val slice = all.subList(offset, end)
if (offset >= all.hits.size) return null
val end = minOf(offset + limit, all.hits.size)
// Re-snippet only the dense-only hits on THIS page (bounded work) so semantic
// results show the passage that actually matched, not the line's opening words.
val slice = all.hits.subList(offset, end).map { hit ->
if (hit.lineId in all.lexicalIds) {
hit
} else {
semanticSnippet(query, hit.rawText, near)?.let { hit.copy(snippet = it) } ?: hit
}
}
offset = end
return SearchPage(hits = slice, totalHits = all.size.toLong(), isLastPage = offset >= all.size)
return SearchPage(hits = slice, totalHits = all.hits.size.toLong(), isLastPage = offset >= all.hits.size)
}

override fun close() {}
}

/**
* Splits [text] into passage candidates: first on strong clause delimiters (incl. the
* Hebrew sof-pasuk ׃), then sliding word windows over any segment too long to localize.
* Substrings are verbatim slices of [text] (original spacing preserved) so the caller
* can match them back diacritic-insensitively. Tiny fragments are dropped to keep a
* highlight that is a *passage*, not a stray word.
*/
private fun splitClauses(text: String): List<String> {
val delimiters = charArrayOf('.', '!', '?', ':', ';', '׃', '\n')
val segments = ArrayList<IntRange>()
var start = 0
for (i in text.indices) {
if (text[i] in delimiters) {
if (i > start) segments += start..i
start = i + 1
}
}
if (start < text.length) segments += start until text.length

val out = ArrayList<String>()
for (seg in segments) {
val sub = text.substring(seg.first, seg.last + 1)
val wordRanges = WORD_RE.findAll(sub).map { it.range }.toList()
if (wordRanges.size < MIN_PASSAGE_WORDS) continue
if (wordRanges.size <= MAX_PASSAGE_WORDS) {
out += sub.substring(wordRanges.first().first, wordRanges.last().last + 1)
} else {
// Segment too long to localize: slide word windows over the original slice.
var w = 0
while (w < wordRanges.size) {
val from = wordRanges[w].first
val toIdx = minOf(w + MAX_PASSAGE_WORDS - 1, wordRanges.size - 1)
out += sub.substring(from, wordRanges[toIdx].last + 1)
if (toIdx == wordRanges.size - 1) break
w += WINDOW_STRIDE_WORDS
}
}
}
return if (out.isEmpty()) listOf(text.trim()) else out
}

private fun cosine(a: FloatArray, b: FloatArray): Float {
var s = 0f
val n = minOf(a.size, b.size)
for (i in 0 until n) s += a[i] * b[i]
return s
}

companion object {
private val logger = Logger.withTag("HybridSearch")

private val WORD_RE = Regex("\\S+")
private const val MIN_PASSAGE_WORDS = 3
private const val MAX_PASSAGE_WORDS = 14
private const val WINDOW_STRIDE_WORDS = 8
private const val SNIPPET_CONTEXT = 90 // chars of context kept on each side of the passage

fun create(
lexical: LuceneSearchEngine,
indexDir: Path,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -218,53 +218,6 @@ class LuceneSearchEngine(
return buildSnippetInternal(rawClean, anchorTerms, highlightTerms)
}

override fun buildHighlightTerms(query: String): List<String> {
val parsed = SearchQueryParser.parse(query)
val norm = HebrewTextUtils.normalizeHebrew(parsed.freeText)
// Verbatim tokens from the quoted phrases, highlighted without dictionary expansion.
val exactPhraseTokens = parsed.exactPhrases
.flatMap { analyzeToTerms(stdAnalyzer, HebrewTextUtils.normalizeHebrew(it)) ?: emptyList() }

if (norm.isBlank()) return filterTermsForHighlight(exactPhraseTokens)

val analyzedRaw = analyzeToTerms(stdAnalyzer, norm) ?: emptyList()
val hasHashem = query.contains("ה׳") || query.contains("ה'")

// Filter single letters and stop words (same logic as buildSearchContext)
val analyzedStd = analyzedRaw.filter { token ->
if (token == "ה" && hasHashem) return@filter true
if (token.any { it.isDigit() }) return@filter true
token.length >= 2 && token !in setOf(
"א", "ב", "ג", "ד", "ה", "ו", "ז", "ח", "ט", "י", "כ", "ל", "מ",
"נ", "ס", "ע", "פ", "צ", "ק", "ר", "ש", "ת",
)
}

// Get dictionary expansions
val tokenExpansions: Map<String, List<MagicDictionaryIndex.Expansion>> =
analyzedStd.associateWith { token ->
val expansion = magicDict?.expansionFor(token) ?: return@associateWith emptyList()
listOf(expansion)
}

// Filter hallucinations for highlighting
val tokenExpansionsForHighlight = tokenExpansions.mapValues { (token, exps) ->
exps.filter { exp -> !isHallucinatedExpansion(token, exp) }
}

// Build expanded terms (filter 2-letter from expansions only)
val allExpansionsForHighlight = tokenExpansionsForHighlight.values.flatten()
val expandedTerms = allExpansionsForHighlight
.flatMap { it.surface + it.variants + it.base }
.filter { it.length > 2 }
.distinct()

val ngramTerms = buildNgramTerms(analyzedStd, gram = 4)
val hashemTerms = if (hasHashem) loadHashemHighlightTerms() else emptyList()

return filterTermsForHighlight(analyzedStd + expandedTerms + ngramTerms + hashemTerms + exactPhraseTokens)
}

override fun close() {
// Directory is closed automatically when readers are closed
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,17 +86,33 @@ interface SearchEngine : Closeable {
fun buildSnippet(rawText: String, query: String, near: Int): String

/**
* Builds a list of terms to highlight for a given query, using dictionary expansion.
* Returns the contiguous passage within [text] whose meaning is closest to [query],
* for semantic highlighting (using the same dense encoder as semantic search).
*
* This is useful for intelligent find-in-page that matches the same words
* as the global search (including synonyms and morphological variants).
* The terms are filtered to exclude hallucinated mappings and short words
* that only came from dictionary expansion.
* The result is a verbatim substring of [text] so callers can locate it with the
* usual diacritic-insensitive matching and highlight that single span — instead of
* scattering dictionary-expanded word matches that don't reflect meaning.
*
* @param query The search query in Hebrew
* @return List of normalized terms to highlight (includes original tokens + expansions)
* @return the best-matching passage, or null when dense search is unavailable or no
* passage stands out (e.g. the text is a single short clause).
*/
fun buildHighlightTerms(query: String): List<String>
suspend fun semanticSpan(query: String, text: String): String? = null

/**
* Embedding-based find-in-page within a single book: returns the ids of the lines
* semantically closest to [query] (dense KNN over the index, scoped to [bookId]),
* ordered by relevance. Used by the "smart" find mode — the simple mode matches literal
* words instead. The per-line passage to highlight is computed by the caller via
* [semanticSpan] on the displayed text. Empty when dense search is unavailable.
*/
suspend fun semanticFind(query: String, bookId: Long, limit: Int): List<Long> = emptyList()

/**
* Ensures the dense backend (embedding model + vector index) is loaded and reports whether
* it is actually available. Useful for diagnostics and to decide whether semantic features
* can run. Returns false for engines without a dense path.
*/
suspend fun denseReady(): Boolean = false

/**
* Computes aggregate facet counts without loading full results.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import org.apache.lucene.search.IndexSearcher
import org.apache.lucene.search.KnnFloatVectorQuery
import org.apache.lucene.search.Query
import org.apache.lucene.store.FSDirectory
import org.apache.lucene.store.NIOFSDirectory
import java.io.Closeable
import java.nio.file.Path

Expand All @@ -23,7 +24,14 @@ data class DenseHit(val lineId: Long, val bookId: Long, val score: Float)
* Returns line ids that are joined back to the DB by the caller.
*/
class VectorSearcher(indexDir: Path) : Closeable {
private val dir = FSDirectory.open(indexDir)
// GraalVM native image can't instantiate MMapDirectory's MemorySegmentIndexInputProvider
// (Panama foreign downcalls) — use NIOFSDirectory there, like LuceneSearchEngine does.
private val dir =
if (System.getProperty("org.graalvm.nativeimage.imagecode") != null) {
NIOFSDirectory(indexDir)
} else {
FSDirectory.open(indexDir)
}

private fun filterQuery(baseBookOnly: Boolean, bookIds: Collection<Long>?): Query? {
val b = BooleanQuery.Builder()
Expand Down
Loading