!src/backend/search_engine/index_builder/test_data/*.tsv
!src/backend/search_engine/index_builder/test_data/*.gz

# Ignore the *contents* of data/ rather than the directory itself.
# Git cannot re-include a file whose parent directory is excluded, so a plain
# ".../data/" rule would silently defeat the data.md exception below.
/src/backend/search_engine/index_builder/data/*
!src/backend/search_engine/index_builder/data/data.md
/src/backend/search_engine/index/bin/
#!/bin/bash
# Deploy Seekr: make sure the dataset and the index binaries exist, then start
# the backend and frontend containers with docker compose.
set -euo pipefail

# All paths below are relative to the repository root (where this script lives),
# so pin the working directory instead of relying on the caller's cwd.
cd "$(dirname "${BASH_SOURCE[0]}")"

DATASET="src/backend/search_engine/index_builder/data/msmarco-docs.tsv"
INDEX_DIR="src/backend/search_engine/index/bin/"

if [ ! -f "$DATASET" ]; then
    echo "msmarco-docs.tsv file not found. Starting download..."
    # Run in a subshell: the download must start from src/, but a plain `cd src`
    # would leave the script there and break the root-relative index check below.
    (cd src && uv run --project backend python -m backend.search_engine.scripts.download_dataset)
fi

# An empty or missing bin directory means the index has not been built yet.
if [ -z "$(ls -A "$INDEX_DIR" 2>/dev/null)" ]; then
    echo "Index binaries not found. Starting build process..."
    just build-index
fi

echo "Spinning up containers..."
docker compose up -d
# only format-check instead of linting to avoid dependency-related failures + clang-format --dry-run --Werror \ + src/backend/bindings/utils.cpp \ + src/backend/search_engine/index_builder/index_builder.cpp \ + src/backend/search_engine/index_builder/merge_partial_indices.cpp format: - cd src/backend && uv run ruff format api/ search_engine/ ../../tests/ + @echo "Formatting Python code..." + cd src/backend && uv run ruff format api/ search_engine/ tests/ + @echo "Formatting C++ code..." + clang-format -i \ + src/backend/bindings/utils.cpp \ + src/backend/search_engine/index_builder/index_builder.cpp \ + src/backend/search_engine/index_builder/merge_partial_indices.cpp mypy: @echo "Type checking with MyPy..." @@ -50,4 +65,4 @@ mypy: cd src/backend && uv run mypy search_engine/ test: - just -f tests/justfile test \ No newline at end of file + just -f src/backend/tests/justfile test \ No newline at end of file diff --git a/local.sh b/local.sh index a410d7a..58e8bf4 100755 --- a/local.sh +++ b/local.sh @@ -6,7 +6,7 @@ export LOG_LEVEL=DEBUG PIDS=() cd src -uv run --project backend uvicorn backend.api.v1.app:app --host 127.0.0.1 --port 8000 "$@" & +uv run --project backend --refresh uvicorn backend.api.v1.app:app --host 127.0.0.1 --port 8000 "$@" & PIDS+=($!) 
echo "Uvicorn server started with PID ${PIDS[0]}" diff --git a/src/backend/.clang-format b/src/backend/.clang-format new file mode 100644 index 0000000..4ca6bcb --- /dev/null +++ b/src/backend/.clang-format @@ -0,0 +1,3 @@ +BasedOnStyle: Google +IndentWidth: 4 +ColumnLimit: 100 diff --git a/src/backend/.dockerignore b/src/backend/.dockerignore new file mode 100644 index 0000000..1499f19 --- /dev/null +++ b/src/backend/.dockerignore @@ -0,0 +1,65 @@ +# Python bytecode +__pycache__/ +*.py[cod] +*$py.class + +# Virtual environments +.venv/ +venv/ +env/ +ENV/ +env.bak/ +venv.bak/ + +# Poetry / uv +.python-version +.uv/ +.python-version +.poetry/ +pdm.lock +.pdm-build/ + +# Test / coverage +.coverage +.coverage.* +.pytest_cache/ +htmlcov/ +nosetests.xml +coverage.xml +*.cover +*.py,cover + +# Distribution / packaging +build/ +dist/ +*.egg-info/ +*.egg +*.whl +*.tar.gz + +# IDE / editor +.vscode/ +.idea/ +*.suo +*.ntvs* +*.njsproj +*.sln +*.sw? + +# OS files +.DS_Store +Thumbs.db + +node_modules/ +dist/ + +**/*.tsv +**/*.gz +# allow test data files as they are small and necessary for tests +!search_engine/index_builder/test_data/*.tsv +!search_engine/index_builder/test_data/*.gz + +search_engine/index_builder/build/ +search_engine/index_builder/data/ +search_engine/index/bin/ +search_engine/models/neuspell-scrnn-probwordnoise/ \ No newline at end of file diff --git a/src/backend/Dockerfile b/src/backend/Dockerfile new file mode 100644 index 0000000..b93b94f --- /dev/null +++ b/src/backend/Dockerfile @@ -0,0 +1,32 @@ +FROM python:3.13-slim + +# system dependencies +# build-essential for packages using c extensions +# git for packages installed from git +# libstemmer for CMake build of index_builder +RUN apt-get update && apt-get install -y \ + build-essential \ + git \ + curl \ + cmake \ + libstemmer-dev \ + && rm -rf /var/lib/apt/lists/* + +RUN curl -LsSf https://astral.sh/uv/install.sh | sh +ENV PATH="/root/.local/bin:$PATH" + +# set workdir to where pyproject.toml is 
// Returns true when `s` is well-formed UTF-8.
//
// Besides checking lead/continuation byte patterns, this rejects the sequences a
// strict decoder also rejects: overlong encodings (e.g. 0xC0 0x80), UTF-16
// surrogates (U+D800..U+DFFF) and code points above U+10FFFF. Accepting those
// would make ensure_utf8() pass through bytes that still fail to decode.
bool is_valid_utf8(const std::string& s) {
    const unsigned char* b = reinterpret_cast<const unsigned char*>(s.data());
    const size_t n = s.size();

    size_t i = 0;
    while (i < n) {
        const unsigned char lead = b[i];
        if (lead <= 0x7F) {  // plain ASCII
            ++i;
            continue;
        }

        size_t len = 0;       // number of continuation bytes that must follow
        uint32_t cp = 0;      // decoded code point
        uint32_t min_cp = 0;  // smallest code point that legitimately needs this length
        if ((lead & 0xE0) == 0xC0) {
            len = 1;
            cp = lead & 0x1F;
            min_cp = 0x80;
        } else if ((lead & 0xF0) == 0xE0) {
            len = 2;
            cp = lead & 0x0F;
            min_cp = 0x800;
        } else if ((lead & 0xF8) == 0xF0) {
            len = 3;
            cp = lead & 0x07;
            min_cp = 0x10000;
        } else {
            return false;  // stray continuation byte or invalid lead byte
        }

        if (len >= n - i) return false;  // sequence truncated at end of input

        for (size_t j = 1; j <= len; ++j) {
            const unsigned char c = b[i + j];
            if ((c & 0xC0) != 0x80) return false;
            cp = (cp << 6) | (c & 0x3F);
        }

        if (cp < min_cp) return false;                   // overlong encoding
        if (cp > 0x10FFFF) return false;                 // beyond the Unicode range
        if (cp >= 0xD800 && cp <= 0xDFFF) return false;  // UTF-16 surrogate

        i += len + 1;
    }
    return true;
}

// Re-encodes `s` as UTF-8, interpreting every input byte as one Latin-1
// (ISO-8859-1) character. Bytes below 0x80 are copied, all others become a
// two-byte sequence.
std::string latin1_to_utf8(const std::string& s) {
    std::string out;
    out.reserve(s.size() * 2);  // worst case: every byte expands to two

    for (unsigned char c : s) {
        if (c < 0x80) {
            out.push_back(static_cast<char>(c));
        } else {
            out.push_back(static_cast<char>(0xC0 | (c >> 6)));
            out.push_back(static_cast<char>(0x80 | (c & 0x3F)));
        }
    }
    return out;
}

// Returns `s` unchanged when it already is valid UTF-8; otherwise treats it as
// Latin-1 and converts it, so the result is always valid UTF-8.
std::string ensure_utf8(const std::string& s) {
    if (is_valid_utf8(s)) {
        return s;
    }
    return latin1_to_utf8(s);
}
text) { for (char c : text) { if (std::isalnum(static_cast(c))) { token += std::tolower(static_cast(c)); - token_original += c; // keep original case + token_original += c; // keep original case continue; } @@ -67,7 +118,7 @@ std::vector normalize_search_query(const std::string& text) { std::string special(1, c); if (KEEP_TOKENS.find(special) != KEEP_TOKENS.end()) { - tokens.push_back(special); // operator/punctuation + tokens.push_back(special); // operator/punctuation } } @@ -83,8 +134,10 @@ struct Metadata { std::unordered_map doc_lengths; void load(const std::string& path) { + std::cout << "Metadata: " << path << std::endl; std::ifstream in(path, std::ios::binary); if (!in.is_open()) throw std::runtime_error("Cannot open metadata file"); + if (in.peek() == EOF) throw std::runtime_error("Metadata file is empty"); in.read(reinterpret_cast(&num_docs), sizeof(num_docs)); in.read(reinterpret_cast(&avg_doc_length), sizeof(avg_doc_length)); @@ -113,11 +166,9 @@ struct PostingList { PostingList() = default; - PostingList( - const std::vector& p, - const std::unordered_map& tf, - const std::unordered_map>& pos - ) : postings(p), term_frequencies(tf), positions(pos) {} + PostingList(const std::vector& p, const std::unordered_map& tf, + const std::unordered_map>& pos) + : postings(p), term_frequencies(tf), positions(pos) {} void build_skip_pointers() { size_t skip_interval = static_cast(std::sqrt(postings.size())); @@ -129,119 +180,140 @@ struct PostingList { } }; -PostingList read_posting_list(std::ifstream& in, uint64_t offset, uint32_t docFreq) { +PostingList read_posting_list(std::ifstream& in, uint64_t offset, uint32_t doc_freq) { PostingList pl; - pl.doc_frequency = docFreq; + pl.doc_frequency = doc_freq; in.seekg(offset); - - pl.postings.resize(docFreq); - - for (uint32_t i = 0; i < docFreq; i++) { + + pl.postings.resize(doc_freq); + + for (uint32_t i = 0; i < doc_freq; i++) { uint32_t doc_id, pos_count; in.read(reinterpret_cast(&doc_id), sizeof(doc_id)); 
// Optimized helper to only read positions for one document for snippetting.
//
// Walks the posting list stored at `offset` in `in` (at most `doc_freq` entries,
// each laid out as doc_id, pos_count, pos_count x uint32 positions) and returns
// the positions recorded for `target_doc_id`. Returns an empty vector when the
// document is not in the list or the data cannot be read.
//
// The stream is shared between lookups, so its error state is reset on entry
// and on exit: a short read here must not make every later seek/read fail.
std::vector<uint32_t> scan_posting_list_for_doc(std::ifstream& in, uint64_t offset,
                                                uint32_t doc_freq, uint32_t target_doc_id) {
    std::vector<uint32_t> positions;

    in.clear();  // drop eof/fail bits left behind by an earlier read
    in.seekg(static_cast<std::streamoff>(offset));

    for (uint32_t i = 0; i < doc_freq; i++) {
        uint32_t doc_id = 0;
        uint32_t pos_count = 0;
        if (!in.read(reinterpret_cast<char*>(&doc_id), sizeof(doc_id))) break;
        if (!in.read(reinterpret_cast<char*>(&pos_count), sizeof(pos_count))) break;

        if (doc_id == target_doc_id) {
            positions.resize(pos_count);
            const auto byte_count = static_cast<std::streamsize>(pos_count) *
                                    static_cast<std::streamsize>(sizeof(uint32_t));
            if (!in.read(reinterpret_cast<char*>(positions.data()), byte_count)) {
                positions.clear();  // truncated entry: report "not found" instead of garbage
            }
            break;
        }

        // if we passed the doc_id (list is sorted), it's not there
        if (doc_id > target_doc_id) break;

        // skip positions for this doc
        in.seekg(static_cast<std::streamoff>(pos_count) *
                     static_cast<std::streamoff>(sizeof(uint32_t)),
                 std::ios::cur);
    }

    in.clear();  // leave the shared stream usable for the next caller
    return positions;
}
sizeof(off64))) break; + DocInfo() = default; - offsets[id] = off64; - } - } + DocInfo(const std::string& u, const std::string& t, const std::string& s) + : url(u), title(t), snippet(s) {} +}; - std::optional get(uint32_t doc_id) { - auto it = offsets.find(doc_id); - if (it == offsets.end()) return std::nullopt; +class InvertedIndex; // forward declaration - uint64_t offset = it->second; - data_in.seekg(offset); +class DocStore { + private: + InvertedIndex* parent; + struct DocOffset { + uint64_t docstore_offset; // docstore data offset + uint64_t tsv_offset; // offset into the msmarco tsv for body retrieval + }; + std::unordered_map offsets; + std::ifstream data_in; + std::ifstream tsv_in; + uint32_t total_docs = 0; - uint32_t url_len; - data_in.read(reinterpret_cast(&url_len), sizeof(url_len)); + struct Hit { + uint32_t pos; + std::string term; + }; - std::string url(url_len, '\0'); - data_in.read(url.data(), url_len); + struct SubsnippetResult { + uint32_t start; + uint32_t end; + std::vector remaining_hits; + }; - uint32_t title_len; - data_in.read(reinterpret_cast(&title_len), sizeof(title_len)); + SubsnippetResult find_subsnippet(const std::vector& hits, int max_window_size, + size_t required_term_count, + std::vector>& term_positions); - std::string title(title_len, '\0'); - data_in.read(title.data(), title_len); + public: + std::vector query_terms; - return DocInfo{url, title}; - } + DocStore(InvertedIndex* p) : parent(p) {} + void open(const std::string& dir_name); + std::string load_snippet(uint32_t doc_id, + std::vector>& snippet_window_borders, + std::vector>& term_positions, + uint64_t tsv_offset); + std::string get_snippet(uint32_t doc_id, uint64_t tsv_offset); + std::optional get(uint32_t doc_id); + std::optional get_tsv_offset(uint32_t doc_id); uint32_t size() const { return total_docs; } }; -class InvertedIndex; // forward - class IndexAccessor { -private: + private: InvertedIndex* parent; -public: + + public: IndexAccessor(InvertedIndex* p) : 
parent(p) {} std::optional get(const std::string& term); }; class InvertedIndex { -private: + private: std::unordered_map term_to_offset; std::unordered_map term_to_docfreq; std::ifstream postings_file; -public: + public: Metadata metadata; DocStore doc_store; IndexAccessor index; - InvertedIndex(const std::string& base_path) - : index(this) - { + // Query cache for performance (especially snippeting of rare + common term combos) + std::unordered_map> cache; + void clear_cache() { cache.clear(); } + + InvertedIndex(const std::string& base_path) : doc_store(this), index(this) { std::ifstream index_file(base_path + "/index.bin", std::ios::binary); while (true) { uint32_t term_len; @@ -253,11 +325,11 @@ class InvertedIndex { uint64_t offset; if (!index_file.read(reinterpret_cast(&offset), sizeof(offset))) break; - uint32_t docFreq; - index_file.read(reinterpret_cast(&docFreq), sizeof(docFreq)); - + uint32_t doc_freq; + index_file.read(reinterpret_cast(&doc_freq), sizeof(doc_freq)); + term_to_offset[term] = offset; - term_to_docfreq[term] = docFreq; + term_to_docfreq[term] = doc_freq; } postings_file.open(base_path + "/postinglists.bin", std::ios::binary); @@ -267,22 +339,389 @@ class InvertedIndex { doc_store.open(base_path); } + friend class DocStore; friend class IndexAccessor; }; +// --- Docstore --- +void DocStore::open(const std::string& dir_name) { + data_in.open(dir_name + "/docstore.bin", std::ios::binary); + std::string data_dir = "data"; + const char* test_env = std::getenv( + "ENV"); // for integration tests, test with controlled and small dataset in test_data + if (test_env && std::string(test_env) == "TEST_ENV") { + data_dir = "test_data"; + } + tsv_in.open(dir_name + "/../../index_builder/" + data_dir + "/msmarco-docs.tsv", + std::ios::binary); + std::ifstream off(dir_name + "/docstore_offsets.bin", std::ios::binary); + + if (!data_in || !tsv_in || !off) throw std::runtime_error("Could not open docstore"); + + // docCount at the beginning + 
data_in.read(reinterpret_cast(&total_docs), sizeof(total_docs)); + + while (true) { + uint32_t id; + uint64_t off64; + uint64_t tsvOff; + + if (!off.read(reinterpret_cast(&id), sizeof(id))) break; + if (!off.read(reinterpret_cast(&off64), sizeof(off64))) break; + if (!off.read(reinterpret_cast(&tsvOff), sizeof(tsvOff))) break; + + offsets[id] = {off64, tsvOff}; + } +} + +std::string DocStore::load_snippet( + uint32_t doc_id, std::vector>& snippet_window_borders, + std::vector>& term_positions, uint64_t tsv_offset) { + if (snippet_window_borders.empty()) return ""; + + std::sort(snippet_window_borders.begin(), snippet_window_borders.end()); + tsv_in.clear(); + tsv_in.seekg(tsv_offset); + std::string line; + if (!std::getline(tsv_in, line)) { + return ""; + } + + // parse line: [DocID] \t [URL] \t [Title] \t [Content] + // we need to find the 3rd tab to get to content + size_t pos = 0; + int tab_count = 0; + while (tab_count < 3) { + pos = line.find('\t', pos); + if (pos == std::string::npos) return ""; // invalid format + pos++; // skip the tab + tab_count++; + } + + size_t content_start = pos; + size_t len = line.size(); + std::string snippet; + snippet.reserve(200); + uint32_t current_word_pos = 0; + size_t i = content_start; + size_t window_idx = 0; + + if (snippet_window_borders[0].first > 0) { + snippet += "... "; + } + + // helper lambda to check if character is sentence-ending punctuation + auto is_sentence_end = [](char c) { return c == '.' || c == '!' 
|| c == '?'; }; + + // Calculate threshold for last window (last 10%) + uint32_t last_window_start = snippet_window_borders.back().first; + uint32_t last_window_end = snippet_window_borders.back().second; + uint32_t last_window_size = last_window_end - last_window_start + 1; + uint32_t last_window_threshold = last_window_end - (last_window_size / 10); + + bool stopped_at_sentence_end = false; + + while (i < len && window_idx < snippet_window_borders.size()) { + std::set highlight_positions; + if (window_idx < term_positions.size()) { + highlight_positions = std::set(term_positions[window_idx].begin(), + term_positions[window_idx].end()); + } + // --- determine word --- + size_t word_start = i; + while (word_start < len && !std::isalnum(static_cast(line[word_start]))) { + word_start++; + } + std::string separator = line.substr(i, word_start - i); + if (word_start >= len) { + // No more words + break; + } + size_t word_end = word_start; + while (word_end < len && std::isalnum(static_cast(line[word_end]))) { + word_end++; + } + std::string word = line.substr(word_start, word_end - word_start); + // --------------------- + + // check if current word is in relevant window + // skip windows that are already passed + while (window_idx < snippet_window_borders.size() && + current_word_pos > snippet_window_borders[window_idx].second) { + window_idx++; + if (window_idx < snippet_window_borders.size()) { + snippet += " ... 
"; + } + } + + if (window_idx < snippet_window_borders.size()) { + uint32_t w_start = snippet_window_borders[window_idx].first; + uint32_t w_end = snippet_window_borders[window_idx].second; + + if (current_word_pos >= w_start && current_word_pos <= w_end) { + // highlight a found term + if (highlight_positions.count(current_word_pos)) { + word = "" + word + ""; + } + + if (current_word_pos == w_start) { + snippet += word; + } else { + snippet += separator + word; + } + + // check if we're in the last window and in its last 10% + bool is_last_window = (window_idx == snippet_window_borders.size() - 1); + if (is_last_window && current_word_pos >= last_window_threshold && + current_word_pos < w_end) { + // look for sentence-ending punctuation after this word + size_t check_pos = word_end; + while (check_pos < len && check_pos < word_end + 3) { + if (is_sentence_end(line[check_pos])) { + // add it and stop early + snippet += line[check_pos]; + stopped_at_sentence_end = true; + break; + } + check_pos++; + } + if (stopped_at_sentence_end) { + break; + } + } + } + } + + // advance + i = word_end; + current_word_pos++; + } + + // check if there is more text after the snippets (only if we didn't stop at sentence end) + if (!stopped_at_sentence_end && window_idx >= snippet_window_borders.size()) { + size_t check = i; + while (check < len && !std::isalnum(static_cast(line[check]))) check++; + if (check < len) { + snippet += " ..."; + } + } + + return ensure_utf8(snippet); +} + +DocStore::SubsnippetResult DocStore::find_subsnippet( + const std::vector& hits, int max_window_size, size_t required_term_count, + std::vector>& term_positions) { + SubsnippetResult result{}; + result.start = 0; + result.end = 0; + + if (hits.empty()) return result; + + std::unordered_map + window_term_count; // count term occurance in the window + + uint32_t left = 0; + uint32_t best_start = hits[0].pos; + uint32_t best_end = hits[0].pos; + uint32_t best_score = 0; + + // mark the best window indices 
+ uint32_t best_left_idx = 0; + uint32_t best_right_idx = 0; + + for (uint32_t right = 0; right < hits.size(); ++right) { + window_term_count[hits[right].term]++; + + // shrink window if too large, adjust term counts + while (hits[right].pos - hits[left].pos > max_window_size) { + auto& c = window_term_count[hits[left].term]; + if (--c == 0) window_term_count.erase(hits[left].term); + left++; + } + + // score is number of unique terms in this window + uint32_t score = window_term_count.size(); + + // update score or choose smaller snippet -> terms more together + if (score > best_score || + (score == best_score && (hits[right].pos - hits[left].pos) < (best_end - best_start))) { + best_score = score; + best_start = hits[left].pos; + best_end = hits[right].pos; + best_left_idx = left; + best_right_idx = right; + + if (best_score == required_term_count) break; // perfect snippet found + } + } + + result.start = best_start; + result.end = best_end; + + // collect left hits (only relevant if called by first window for second window) + result.remaining_hits.reserve(hits.size()); + for (uint32_t i = 0; i < hits.size(); ++i) { + if (i > best_right_idx) result.remaining_hits.push_back(hits[i]); + } + + // for highlighting positions bold + std::vector window_positions; + + for (const auto& hit : hits) { + if (hit.pos >= best_start && hit.pos <= best_end) { + window_positions.push_back(hit.pos); + } + } + + term_positions.push_back(window_positions); + + return result; +} + +// total snippet length: max. MAX_WINDOW_SIZE x 2 + 1 or 2x "..." +std::string DocStore::get_snippet(uint32_t doc_id, uint64_t tsv_offset) { + int MAX_WINDOW_SIZE = 15; // max. num of words PER subsnippet + + if (query_terms.empty()) { + throw std::runtime_error( + "Set query_terms (not empty): InvertedIndex().doc_store.query_terms = ..."); + } + std::set unique_terms(query_terms.begin(), query_terms.end()); + + // e.g. 
+ // hits = [ {pos: 3, term: "foo"}, {pos: 10, term: "bar"}, {pos: 15, term: "foo"}, {pos: 18, + // term: "bar"} ] + std::vector hits; + for (const auto& term : unique_terms) { + auto cache_it = parent->cache.find(term); + if (cache_it != parent->cache.end()) { + const auto& pl = *cache_it->second; + auto posIt = pl.positions.find(doc_id); + if (posIt != pl.positions.end()) { + for (uint32_t pos : posIt->second) hits.push_back(Hit{pos, term}); + } + continue; + } + + // --- term not found in cache --- + auto termIt = parent->term_to_offset.find(term); + if (termIt == parent->term_to_offset.end()) continue; + auto docIt = parent->term_to_docfreq.find(term); + + std::vector positions = + scan_posting_list_for_doc(parent->postings_file, termIt->second, docIt->second, doc_id); + if (positions.empty()) continue; + + for (uint32_t pos : positions) hits.push_back(Hit{pos, term}); + // --------------------------------- + } + std::sort(hits.begin(), hits.end(), [](const Hit& a, const Hit& b) { return a.pos < b.pos; }); + + std::vector> term_positions; // positions of the search terms in window i + // example: [[1, 3, 5], [2, 5]], in window 0 (index 0): term A and B at pos. 1, 3, 5, etc. 
+ // create first optimal snippet + SubsnippetResult first_snippet = + find_subsnippet(hits, MAX_WINDOW_SIZE, unique_terms.size(), term_positions); + + // can be one if all terms fit into MAX_WINDOW_SIZE, or at most 2 for remaining terms + // not more than 2 for readability + std::vector> snippet_window_borders; + snippet_window_borders.push_back({first_snippet.start, first_snippet.end}); + + if (!first_snippet.remaining_hits.empty()) { + SubsnippetResult second_snippet = find_subsnippet( + first_snippet.remaining_hits, MAX_WINDOW_SIZE, unique_terms.size(), term_positions); + + if (second_snippet.end > 0) { + snippet_window_borders.push_back({second_snippet.start, second_snippet.end}); + } + } + + // enhance context if windows are too small + int total_budget = MAX_WINDOW_SIZE * 2; + + if (snippet_window_borders.size() == 1) { + // only one window --> can consume 2x the size + auto& [start, end] = snippet_window_borders[0]; + int window_len = end - start; + int remaining = total_budget - window_len; + int left_context = remaining / 2; + int right_context = remaining - left_context; + + start = (start >= left_context) ? start - left_context : 0; + end = end + right_context; + + } else { + // 2 windows --> half each + int budget_per_window = total_budget / snippet_window_borders.size(); + + for (auto& [start, end] : snippet_window_borders) { + int window_len = end - start; + int remaining = budget_per_window - window_len; + int left_context = remaining / 2; + int right_context = remaining - left_context; + + start = (start >= left_context) ? 
start - left_context : 0; + end = end + right_context; + } + } + + return load_snippet(doc_id, snippet_window_borders, term_positions, tsv_offset); +} + +std::optional DocStore::get_tsv_offset(uint32_t doc_id) { + auto it = offsets.find(doc_id); + if (it == offsets.end()) return std::nullopt; + return it->second.tsv_offset; +} + +std::optional DocStore::get( + uint32_t doc_id) { // only load snippet when required as resource-intensive + auto it = offsets.find(doc_id); + if (it == offsets.end()) return std::nullopt; + + uint64_t docstore_offset = it->second.docstore_offset; + data_in.seekg(docstore_offset); + uint64_t tsv_offset = it->second.tsv_offset; + + uint32_t url_len; + data_in.read(reinterpret_cast(&url_len), sizeof(url_len)); + + std::string url(url_len, '\0'); + data_in.read(url.data(), url_len); + + uint32_t title_len; + data_in.read(reinterpret_cast(&title_len), sizeof(title_len)); + + std::string title(title_len, '\0'); + data_in.read(title.data(), title_len); + + std::string snippet = get_snippet(doc_id, tsv_offset); + return DocInfo{url, title, snippet}; +} +// -------------------- + std::optional IndexAccessor::get(const std::string& term) { + // Check cache + auto cache_it = parent->cache.find(term); + if (cache_it != parent->cache.end()) { + return *cache_it->second; + } + auto it = parent->term_to_offset.find(term); if (it == parent->term_to_offset.end()) return std::nullopt; - uint32_t docFreq = parent->term_to_docfreq.at(term); - PostingList pl = read_posting_list(parent->postings_file, it->second, docFreq); + uint32_t doc_freq = parent->term_to_docfreq.at(term); + PostingList pl = read_posting_list(parent->postings_file, it->second, doc_freq); + + // Add to cache + parent->cache[term] = std::make_shared(pl); + return pl; } -PostingList positional_intersect( - const PostingList& pl1, - const PostingList& pl2, - uint32_t distance -) { +PostingList positional_intersect(const PostingList& pl1, const PostingList& pl2, + uint32_t distance) { 
PostingList result; const auto& p1 = pl1.postings; @@ -296,7 +735,7 @@ PostingList positional_intersect( const size_t n1 = p1.size(); const size_t n2 = p2.size(); - result.postings.reserve(std::min(n1, n2)); // conservative + result.postings.reserve(std::min(n1, n2)); // conservative while (i < n1 && j < n2) { uint32_t doc1 = p1[i]; @@ -354,7 +793,7 @@ PostingList positional_intersect( } } - else { // doc2 < doc1 + else { // doc2 < doc1 // skip pointer support for pl2 auto it_s2 = skip2.find(j); if (it_s2 != skip2.end() && it_s2->second < n2 && p2[it_s2->second] <= doc1) { @@ -370,12 +809,7 @@ PostingList positional_intersect( return result; } -// faster if left posting list is smaller -PostingList find_docs( - const PostingList& pl1, - const PostingList& pl2, - const std::string& mode -) { +PostingList find_docs(const PostingList& pl1, const PostingList& pl2, const std::string& mode) { const auto& p1 = pl1.postings; const auto& p2 = pl2.postings; @@ -390,12 +824,11 @@ PostingList find_docs( const size_t n2 = p2.size(); std::vector result_postings; - result_postings.reserve(std::min(n1, n2)); // most likely + result_postings.reserve(std::min(n1, n2)); // most likely std::unordered_map result_tf; if (mode == "AND") { - while (i < n1 && j < n2) { uint32_t d1 = p1[i]; uint32_t d2 = p2[j]; @@ -406,16 +839,14 @@ PostingList find_docs( i++; j++; - } - else if (d1 < d2) { + } else if (d1 < d2) { auto it = skip1.find(i); if (it != skip1.end() && p1[it->second] <= d2) { i = it->second; } else { i++; } - } - else { // d2 < d1 + } else { // d2 < d1 auto it = skip2.find(j); if (it != skip2.end() && p2[it->second] <= d1) { j = it->second; @@ -443,14 +874,13 @@ PostingList find_docs( if (x == y) { merged.push_back(x); result_tf[x] = tf1.at(x) + tf2.at(y); - a++; b++; - } - else if (x < y) { + a++; + b++; + } else if (x < y) { merged.push_back(x); result_tf[x] = tf1.at(x); a++; - } - else { + } else { merged.push_back(y); result_tf[y] = tf2.at(y); b++; @@ -505,41 +935,30 @@ 
PostingList find_docs( PYBIND11_MODULE(_core, m) { m.doc() = "CPP utils for search engine"; - m.def("normalize_search_query", &normalize_search_query, - py::arg("text"), - "Normalize and stem search query into tokens, but keep logical operators and parentheses as is"); + m.def("normalize_search_query", &normalize_search_query, py::arg("text"), + "Normalize and stem search query into tokens, but keep logical operators and parentheses " + "as is"); - m.def("positional_intersect", &positional_intersect, - py::arg("pl1"), py::arg("pl2"), py::arg("distance") = 1, - "Positional intersection of two posting lists with given distance"); + m.def("positional_intersect", &positional_intersect, py::arg("pl1"), py::arg("pl2"), + py::arg("distance") = 1, + "Positional intersection of two posting lists with given distance"); - m.def( - "find_docs", - &find_docs, - py::arg("pl1"), - py::arg("pl2"), - py::arg("mode"), - "Find documents that are in both posting lists" - ); + m.def("find_docs", &find_docs, py::arg("pl1"), py::arg("pl2"), py::arg("mode"), + "Find documents that are in both posting lists"); py::class_(m, "DocInfo") .def(py::init<>()) - .def(py::init(), - py::arg("url"), py::arg("title")) + .def(py::init(), py::arg("url"), + py::arg("title"), py::arg("snippet")) .def_readonly("url", &DocInfo::url) - .def_readonly("title", &DocInfo::title); + .def_readonly("title", &DocInfo::title) + .def_readonly("snippet", &DocInfo::snippet); py::class_(m, "PostingList") .def(py::init<>()) - .def(py::init< - const std::vector&, - const std::unordered_map&, - const std::unordered_map>& - >(), - py::arg("postings"), - py::arg("term_frequencies"), - py::arg("positions") - ) + .def(py::init&, const std::unordered_map&, + const std::unordered_map>&>(), + py::arg("postings"), py::arg("term_frequencies"), py::arg("positions")) .def_readonly("postings", &PostingList::postings) .def_readonly("term_frequencies", &PostingList::term_frequencies) .def_readonly("positions", &PostingList::positions) 
@@ -553,14 +972,16 @@ PYBIND11_MODULE(_core, m) { .def("get_doc_length", &Metadata::get_doc_length, py::arg("doc_id")); py::class_(m, "DocStore") - .def("get", &DocStore::get, py::arg("doc_id")); + .def("get", &DocStore::get, py::arg("doc_id")) + .def("get_tsv_offset", &DocStore::get_tsv_offset, py::arg("doc_id")) + .def_readwrite("query_terms", &DocStore::query_terms); - py::class_(m, "IndexAccessor") - .def("get", &IndexAccessor::get, py::arg("term")); + py::class_(m, "IndexAccessor").def("get", &IndexAccessor::get, py::arg("term")); py::class_(m, "InvertedIndex") .def(py::init()) .def_readonly("index", &InvertedIndex::index) .def_readonly("metadata", &InvertedIndex::metadata) - .def_readonly("doc_store", &InvertedIndex::doc_store); -} \ No newline at end of file + .def_readonly("doc_store", &InvertedIndex::doc_store) + .def("clear_cache", &InvertedIndex::clear_cache); +} diff --git a/src/backend/search_engine/index_builder/index_builder.cpp b/src/backend/search_engine/index_builder/index_builder.cpp index f15f567..0c57d5e 100644 --- a/src/backend/search_engine/index_builder/index_builder.cpp +++ b/src/backend/search_engine/index_builder/index_builder.cpp @@ -1,45 +1,52 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include #include + +#include #include -#include +#include +#include +#include +#include +#include +#include +#include +#include + #include "include/robin_hood.h" // not encoded as neglectably small class DocStoreWriter { -private: - std::ofstream outStream; + private: + std::ofstream outStream; std::ofstream offsetStream; - uint64_t currentByteOffset; // offset where next doc will be written/read + uint64_t currentByteOffset; // offset where next doc will be written/read uint32_t docCount; -public: + public: void init(const std::string& filename_base) { - outStream.open(filename_base + "/docstore.bin", std::ios::binary | std::ios::out | std::ios::trunc); - offsetStream.open(filename_base + 
"/docstore_offsets.bin", std::ios::binary | std::ios::out | std::ios::trunc); - - currentByteOffset = 0; + outStream.open(filename_base + "/docstore.bin", + std::ios::binary | std::ios::out | std::ios::trunc); + offsetStream.open(filename_base + "/docstore_offsets.bin", + std::ios::binary | std::ios::out | std::ios::trunc); + + currentByteOffset = 0; docCount = 0; - + outStream.write(reinterpret_cast(&docCount), sizeof(docCount)); currentByteOffset += sizeof(docCount); } - void addDocument(uint32_t docId, const std::string& url, const std::string& title) { + void addDocument(uint32_t docId, const std::string& url, const std::string& title, + uint64_t tsvOffset) { /* offsetStream: [0-3] docId = 42 - [4-11] offset = 0 (start of this doc in outStream) + [4-11] docStoreOffset = 0 (start of this doc in outStream) + [12-19] tsvOffset = ... (start of the line of this doc in original tsv) - [12-15] docId = 105 - [16-23] offset = 18 (start of this doc in outStream) + [20-23] docId = 105 + [24-31] docStoreOffset = 18 (start of this doc in outStream) + [32-39] tsvOffset = ... ... 
outStream: @@ -54,8 +61,10 @@ class DocStoreWriter { [35-36] 'H' 'i' */ offsetStream.write(reinterpret_cast(&docId), sizeof(docId)); - offsetStream.write(reinterpret_cast(¤tByteOffset), sizeof(currentByteOffset)); - + offsetStream.write(reinterpret_cast(¤tByteOffset), + sizeof(currentByteOffset)); + offsetStream.write(reinterpret_cast(&tsvOffset), sizeof(tsvOffset)); + uint32_t urlLen = url.size(); outStream.write(reinterpret_cast(&urlLen), sizeof(urlLen)); outStream.write(url.data(), urlLen); @@ -66,7 +75,6 @@ class DocStoreWriter { // faster than tellp() currentByteOffset += sizeof(uint32_t) + urlLen + sizeof(uint32_t) + titleLen; - docCount++; } @@ -87,17 +95,16 @@ struct Posting { }; static const robin_hood::unordered_flat_set STOP_WORDS = { - "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", - "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", - "their", "then", "there", "these", "they", "this", "to", "was", "will", "with" -}; + "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", + "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", + "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"}; class Tokenizer { -private: + private: struct sb_stemmer* stemmer; std::string tokenBuffer; - -public: + + public: Tokenizer() { stemmer = sb_stemmer_new("english", "UTF_8"); if (!stemmer) { @@ -105,64 +112,58 @@ class Tokenizer { } tokenBuffer.reserve(64); } - + ~Tokenizer() { if (stemmer) sb_stemmer_delete(stemmer); } - + // non-copyable Tokenizer(const Tokenizer&) = delete; Tokenizer& operator=(const Tokenizer&) = delete; - - template + + template void tokenize(const char* text, size_t len, Callback&& callback) { int position = 0; size_t i = 0; - + while (i < len) { - while (i < len && !std::isalpha(static_cast(text[i]))) { + while (i < len && !std::isalnum(static_cast(text[i]))) { i++; } if (i >= len) break; - + tokenBuffer.clear(); - while (i < len && 
std::isalpha(static_cast(text[i]))) { + while (i < len && std::isalnum(static_cast(text[i]))) { tokenBuffer.push_back(std::tolower(static_cast(text[i]))); i++; } - + if (tokenBuffer.empty()) continue; - + if (STOP_WORDS.count(tokenBuffer)) { position++; continue; } - - const sb_symbol* stemmed = sb_stemmer_stem( - stemmer, - reinterpret_cast(tokenBuffer.data()), - tokenBuffer.size() - ); + + const sb_symbol* stemmed = + sb_stemmer_stem(stemmer, reinterpret_cast(tokenBuffer.data()), + tokenBuffer.size()); int stemLen = sb_stemmer_length(stemmer); - + std::string term(reinterpret_cast(stemmed), stemLen); - + callback(std::move(term), position); position++; } } }; -void spillToDisk( - robin_hood::unordered_flat_map>& termPostings, - const robin_hood::unordered_flat_map& termDictionary, - const std::string& postingsFile, - const std::string& dictFile) -{ +void spillToDisk(robin_hood::unordered_flat_map>& termPostings, + const robin_hood::unordered_flat_map& termDictionary, + const std::string& postingsFile, const std::string& dictFile) { std::vector> sortedTerms; sortedTerms.reserve(termDictionary.size()); - for (const auto& kv : termDictionary) - sortedTerms.emplace_back(kv.first, kv.second); + for (const auto& kv : termDictionary) sortedTerms.emplace_back(kv.first, kv.second); // sort for more efficient merging of the spilled files later std::sort(sortedTerms.begin(), sortedTerms.end(), @@ -170,10 +171,9 @@ void spillToDisk( std::ofstream postOut(postingsFile, std::ios::binary); std::ofstream dictOut(dictFile, std::ios::binary); - if (!postOut || !dictOut) - throw std::runtime_error("Failed to open output files"); + if (!postOut || !dictOut) throw std::runtime_error("Failed to open output files"); - static char postBuffer[8 * 1024 * 1024]; // 8MB + static char postBuffer[8 * 1024 * 1024]; // 8MB static char dictBuffer[8 * 1024 * 1024]; postOut.rdbuf()->pubsetbuf(postBuffer, sizeof(postBuffer)); dictOut.rdbuf()->pubsetbuf(dictBuffer, sizeof(dictBuffer)); @@ -182,8 
+182,7 @@ void spillToDisk( for (const auto& [term, termId] : sortedTerms) { auto it = termPostings.find(termId); - if (it == termPostings.end()) - continue; + if (it == termPostings.end()) continue; // sort for search and union of posting lists std::vector& postings = it->second; @@ -191,7 +190,7 @@ void spillToDisk( [](const Posting& a, const Posting& b) { return a.docId < b.docId; }); uint64_t startOffset = offset; - + uint32_t docFreq = postings.size(); for (const auto& posting : postings) { @@ -254,32 +253,35 @@ int main(int argc, char* argv[]) { std::filesystem::path projectRoot = exePath.parent_path().parent_path(); std::string dataDir = "/data"; - - const char* test_env = std::getenv("ENV"); // for integration tests, test with controlled and small dataset in test_data + + const char* test_env = std::getenv( + "ENV"); // for integration tests, test with controlled and small dataset in test_data if (test_env && std::string(test_env) == "TEST_ENV") { std::cout << "TEST ENVIRONMENT, building index with test data." 
<< std::endl; dataDir = "/test_data"; - } + } std::string projectDir = projectRoot.string(); std::string partialIndexPostingsDir = projectDir + dataDir + "/partial_indices/postings"; std::string partialIndexDictDir = projectDir + dataDir + "/partial_indices/dictionaries"; - std::string metadataDir = projectDir + dataDir + "/index"; + std::string outputDir = + (projectRoot.parent_path() / "index" / "bin") + .string(); // put in parallel directory index/ where python code expects it + std::string docstoreBase = outputDir; std::filesystem::create_directories(partialIndexPostingsDir); std::filesystem::create_directories(partialIndexDictDir); - std::filesystem::create_directories(metadataDir); + std::filesystem::create_directories(outputDir); Tokenizer tokenizer; std::ifstream infile(projectDir + dataDir + "/msmarco-docs.tsv"); - + if (!infile.is_open()) { std::cerr << "Failed to open input file\n"; return 1; } DocStoreWriter docStore; - std::string docstoreBase = projectDir + dataDir + "/docstore"; std::filesystem::create_directories(docstoreBase); docStore.init(docstoreBase); @@ -299,14 +301,16 @@ int main(int argc, char* argv[]) { uint32_t partialIndexesCount = 0; size_t memoryBytes = 0; + size_t currentLineOffset = + infile.tellg(); // store tsv file offset to restore original doc content for snippets while (std::getline(infile, line)) { lineNumber++; if (memoryBytes > MEMORYLIMIT) { std::string postingsFile = partialIndexPostingsDir + "/postings_" + std::to_string(partialIndexesCount) + ".bin"; - std::string dictFile = partialIndexDictDir + "/dictionary_" + - std::to_string(partialIndexesCount) + ".bin"; + std::string dictFile = + partialIndexDictDir + "/dictionary_" + std::to_string(partialIndexesCount) + ".bin"; try { spillToDisk(termPostings, termDictionary, postingsFile, dictFile); } catch (const std::exception& e) { @@ -319,15 +323,16 @@ int main(int argc, char* argv[]) { memoryBytes = 0; auto elapsed = duration(high_resolution_clock::now() - start).count(); 
std::cout << "[Partial Index #" << partialIndexesCount << "] " - << "Lines processed: " << lineNumber - << " Time: " << elapsed << "s\n"; + << "Lines processed: " << lineNumber << " Time: " << elapsed << "s\n"; } size_t pos1 = line.find('\t'); size_t pos2 = line.find('\t', pos1 + 1); size_t pos3 = line.find('\t', pos2 + 1); - if (pos3 == std::string::npos) + if (pos3 == std::string::npos) { + currentLineOffset = infile.tellg(); continue; + } // parse docId int docId = -1; @@ -343,12 +348,15 @@ int main(int argc, char* argv[]) { } } } - if (docId < 0) continue; + if (docId < 0) { + currentLineOffset = infile.tellg(); + continue; + } std::string url = line.substr(pos1 + 1, pos2 - pos1 - 1); std::string title = line.substr(pos2 + 1, pos3 - pos2 - 1); - docStore.addDocument(docId, url, title); + docStore.addDocument(docId, url, title, currentLineOffset); // ensure docLengths vector is large enough if (static_cast(docId) >= docLengths.size()) { @@ -362,37 +370,15 @@ int main(int argc, char* argv[]) { size_t titleLen = pos3 - pos2 - 1; const char* contentStart = line.data() + pos3 + 1; size_t contentLen = line.size() - pos3 - 1; - - // process title - tokenizer.tokenize(titleStart, titleLen, [&](std::string&& term, int position) { - docTermCount++; - - uint32_t termId; - auto it = termDictionary.find(term); - if (it == termDictionary.end()) { - termId = termDictionary.size(); - memoryBytes += sizeof(uint32_t) + term.size(); - termDictionary.emplace(std::move(term), termId); - } else { - termId = it->second; - } - auto& postings = termPostings[termId]; - if (postings.empty() || postings.back().docId != docId) { - postings.push_back({docId, {}}); - postings.back().positions.reserve(8); - postings.back().positions.push_back(position); - memoryBytes += sizeof(Posting) + sizeof(int); - } else { - postings.back().positions.push_back(position); - memoryBytes += sizeof(int); - } - }); - + // do not store title positions as it would mix with body positions + 
tokenizer.tokenize(titleStart, titleLen, + [&](std::string&& term, int position) { docTermCount++; }); + // process content (positions continue from title) tokenizer.tokenize(contentStart, contentLen, [&](std::string&& term, int position) { docTermCount++; - + uint32_t termId; auto it = termDictionary.find(term); if (it == termDictionary.end()) { @@ -414,10 +400,12 @@ int main(int argc, char* argv[]) { memoryBytes += sizeof(int); } }); - + docLengths[docId] = docTermCount; - + if (maxDocs != -1 && lineNumber >= maxDocs) break; + + currentLineOffset = infile.tellg(); } // final flush if remaining data @@ -438,17 +426,17 @@ int main(int argc, char* argv[]) { } double avgDocLength = numDocs > 0 ? static_cast(totalTerms) / numDocs : 0.0; - std::string metadataFile = metadataDir + "/metadata.bin"; + std::string metadataFile = outputDir + "/metadata.bin"; std::ofstream metaOut(metadataFile, std::ios::binary); if (!metaOut) { std::cerr << "Failed to open metadata file for writing\n"; return 1; } - + // write header: numDocs, avgDocLength metaOut.write(reinterpret_cast(&numDocs), sizeof(numDocs)); metaOut.write(reinterpret_cast(&avgDocLength), sizeof(avgDocLength)); - + // write document lengths array (only non-zero entries with their docIds) for (size_t docId = 0; docId < docLengths.size(); docId++) { if (docLengths[docId] > 0) { @@ -460,7 +448,8 @@ int main(int argc, char* argv[]) { } metaOut.close(); - std::cout << "Metadata written: " << numDocs << " documents, avg length: " << avgDocLength << std::endl; + std::cout << "Metadata written: " << numDocs << " documents, avg length: " << avgDocLength + << std::endl; double totalTime = duration(high_resolution_clock::now() - start).count(); std::cout << "Indexing completed in " << totalTime << " seconds.\n"; diff --git a/src/backend/search_engine/index_builder/merge_partial_indices.cpp b/src/backend/search_engine/index_builder/merge_partial_indices.cpp index 919f993..c3e7279 100644 --- 
a/src/backend/search_engine/index_builder/merge_partial_indices.cpp +++ b/src/backend/search_engine/index_builder/merge_partial_indices.cpp @@ -1,16 +1,16 @@ -#include -#include -#include -#include -#include -#include #include + #include -#include #include +#include +#include #include +#include +#include #include - +#include +#include +#include struct DictEntry { std::string term; @@ -24,45 +24,44 @@ struct PostingEntry { std::vector positions; }; -std::vector readDictionary(const std::string& dictFile, const std::string& postingsFile) { +std::vector readDictionary(const std::string& dictFile, + const std::string& postingsFile) { std::vector entries; - + std::ifstream dictIn(dictFile, std::ios::binary); if (!dictIn) { std::cerr << "Failed to open dictionary file: " << dictFile << std::endl; return entries; } - // get the size of the postings file to determine the last posting's size std::ifstream postIn(postingsFile, std::ios::binary | std::ios::ate); uint64_t postingsFileSize = postIn.tellg(); postIn.close(); - + // read all dictionary entries while (dictIn.peek() != EOF) { DictEntry entry; - + // read term length uint32_t termLen; dictIn.read(reinterpret_cast(&termLen), sizeof(termLen)); if (!dictIn) break; - // read term entry.term.resize(termLen); dictIn.read(&entry.term[0], termLen); if (!dictIn) break; - + // read offset dictIn.read(reinterpret_cast(&entry.offset), sizeof(entry.offset)); if (!dictIn) break; - + // read docFreq dictIn.read(reinterpret_cast(&entry.docFreq), sizeof(entry.docFreq)); if (!dictIn) break; - + entries.push_back(entry); } - + // calculate posting sizes using next entry's offset for (size_t i = 0; i < entries.size(); i++) { if (i + 1 < entries.size()) { @@ -71,42 +70,42 @@ std::vector readDictionary(const std::string& dictFile, const std::st entries[i].postingSize = postingsFileSize - entries[i].offset; } } - + return entries; } -std::vector readAndParsePosting(const std::string& postingsFile, uint64_t offset, uint64_t size) { 
+std::vector readAndParsePosting(const std::string& postingsFile, uint64_t offset, + uint64_t size) { std::vector entries; - + std::ifstream postIn(postingsFile, std::ios::binary); if (!postIn) { std::cerr << "Failed to open postings file: " << postingsFile << std::endl; return entries; } - + postIn.seekg(offset); - + uint64_t bytesRead = 0; while (bytesRead < size) { PostingEntry entry; - + // read docId postIn.read(reinterpret_cast(&entry.docId), sizeof(entry.docId)); bytesRead += sizeof(entry.docId); - + // read posCount uint32_t posCount; postIn.read(reinterpret_cast(&posCount), sizeof(posCount)); bytesRead += sizeof(posCount); - // read positions entry.positions.resize(posCount); postIn.read(reinterpret_cast(entry.positions.data()), posCount * sizeof(uint32_t)); bytesRead += posCount * sizeof(uint32_t); - + entries.push_back(entry); } - + return entries; } @@ -114,13 +113,13 @@ void writePosting(std::ofstream& out, const std::vector& entries) for (const auto& entry : entries) { // write docId out.write(reinterpret_cast(&entry.docId), sizeof(entry.docId)); - + // write posCount uint32_t posCount = entry.positions.size(); out.write(reinterpret_cast(&posCount), sizeof(posCount)); - + // write positions - out.write(reinterpret_cast(entry.positions.data()), + out.write(reinterpret_cast(entry.positions.data()), posCount * sizeof(uint32_t)); } } @@ -128,28 +127,26 @@ void writePosting(std::ofstream& out, const std::vector& entries) std::vector mergePostings(const std::vector>& allPostings) { // use a map to merge postings by docId (automatically sorted) std::map> mergedMap; - for (const auto& postings : allPostings) { for (const auto& entry : postings) { auto& positions = mergedMap[entry.docId]; positions.insert(positions.end(), entry.positions.begin(), entry.positions.end()); } } - + // convert map to vector and sort positions within each document std::vector result; result.reserve(mergedMap.size()); - + for (auto& [docId, positions] : mergedMap) { // sort positions 
for this document std::sort(positions.begin(), positions.end()); - + PostingEntry entry; entry.docId = docId; entry.positions = std::move(positions); result.push_back(std::move(entry)); } - return result; } @@ -177,35 +174,29 @@ int main(int argc, char* argv[]) { std::string projectDir = projectRoot.string(); std::string dataDir = "/data"; - - const char* test_env = std::getenv("ENV"); // for integration tests, test with controlled and small dataset in test_data + + const char* test_env = std::getenv( + "ENV"); // for integration tests, test with controlled and small dataset in test_data if (test_env && std::string(test_env) == "TEST_ENV") { std::cout << "TEST ENVIRONMENT, merging index with test data." << std::endl; dataDir = "/test_data"; - } + } std::string partialIndexPostingsDir = projectDir + dataDir + "/partial_indices/postings"; std::string partialIndexDictDir = projectDir + dataDir + "/partial_indices/dictionaries"; std::string metadataDir = projectDir + dataDir + "/index"; - std::string outputDir = (projectRoot.parent_path() / "index" / "bin").string(); // put in parallel directory index/ where python code expects it - std::filesystem::create_directories(outputDir); - - // copy from building dir to output dir - if (std::filesystem::exists(metadataDir + "/metadata.bin")) std::filesystem::remove(outputDir + "/metadata.bin"); - std::filesystem::copy(metadataDir + "/metadata.bin", outputDir + "/metadata.bin"); - - if (std::filesystem::exists(outputDir + "/docstore.bin")) std::filesystem::remove(outputDir + "/docstore.bin"); - if (std::filesystem::exists(outputDir + "/docstore_offsets.bin")) std::filesystem::remove(outputDir + "/docstore_offsets.bin"); - std::filesystem::copy(projectDir + dataDir + "/docstore/docstore.bin", outputDir + "/docstore.bin"); - std::filesystem::copy(projectDir + dataDir + "/docstore/docstore_offsets.bin", outputDir + "/docstore_offsets.bin"); + std::string outputDir = + (projectRoot.parent_path() / "index" / "bin") + .string(); // 
put in parallel directory index/ where python code expects it // get number of partial indices size_t partialIndexCount = 0; while (true) { - std::string dictFile = partialIndexDictDir + "/dictionary_" + std::to_string(partialIndexCount) + ".bin"; + std::string dictFile = + partialIndexDictDir + "/dictionary_" + std::to_string(partialIndexCount) + ".bin"; std::ifstream testIn(dictFile); if (!testIn.is_open()) break; testIn.close(); - partialIndexCount++; + partialIndexCount++; } partialIndexCount++; std::cout << "Found " << partialIndexCount << " partial indices to merge.\n"; @@ -213,21 +204,19 @@ int main(int argc, char* argv[]) { auto allDicts = std::vector>(partialIndexCount); for (size_t i = 0; i < partialIndexCount - 1; i++) { std::string dictFile = partialIndexDictDir + "/dictionary_" + std::to_string(i) + ".bin"; - std::string postingsFile = partialIndexPostingsDir + "/postings_" + std::to_string(i) + ".bin"; + std::string postingsFile = + partialIndexPostingsDir + "/postings_" + std::to_string(i) + ".bin"; allDicts[i] = readDictionary(dictFile, postingsFile); } - allDicts[partialIndexCount - 1] = readDictionary( - partialIndexDictDir + "/dictionary_final.bin", - partialIndexPostingsDir + "/postings_final.bin" - ); + allDicts[partialIndexCount - 1] = + readDictionary(partialIndexDictDir + "/dictionary_final.bin", + partialIndexPostingsDir + "/postings_final.bin"); struct HeapEntry { std::string term; size_t dictIndex; size_t entryIndex; - bool operator>(const HeapEntry& other) const { - return term > other.term; - } + bool operator>(const HeapEntry& other) const { return term > other.term; } }; std::priority_queue, std::greater> minHeap; // initialize heap with first entry from each dictionary @@ -248,77 +237,79 @@ int main(int argc, char* argv[]) { while (!minHeap.empty()) { auto current = minHeap.top(); minHeap.pop(); - + const std::string& term = current.term; - + // collect all postings for this term from all dictionaries std::vector> postingsToMerge; 
std::vector docFreqs; - + // add the current entry's posting { size_t dictIndex = current.dictIndex; size_t entryIndex = current.entryIndex; const DictEntry& entry = allDicts[dictIndex][entryIndex]; docFreqs.push_back(entry.docFreq); - - std::string postingsFile = (dictIndex == partialIndexCount - 1) - ? partialIndexPostingsDir + "/postings_final.bin" - : partialIndexPostingsDir + "/postings_" + std::to_string(dictIndex) + ".bin"; - - postingsToMerge.push_back(readAndParsePosting(postingsFile, entry.offset, entry.postingSize)); - + + std::string postingsFile = + (dictIndex == partialIndexCount - 1) + ? partialIndexPostingsDir + "/postings_final.bin" + : partialIndexPostingsDir + "/postings_" + std::to_string(dictIndex) + ".bin"; + + postingsToMerge.push_back( + readAndParsePosting(postingsFile, entry.offset, entry.postingSize)); + if (entryIndex + 1 < allDicts[dictIndex].size()) { const DictEntry& nextEntry = allDicts[dictIndex][entryIndex + 1]; minHeap.push({nextEntry.term, dictIndex, entryIndex + 1}); } } - // check if the next entries in the heap have the same term while (!minHeap.empty() && minHeap.top().term == term) { auto same = minHeap.top(); minHeap.pop(); - size_t dictIndex = same.dictIndex; size_t entryIndex = same.entryIndex; const DictEntry& entry = allDicts[dictIndex][entryIndex]; docFreqs.push_back(entry.docFreq); - - std::string postingsFile = (dictIndex == partialIndexCount - 1) - ? partialIndexPostingsDir + "/postings_final.bin" - : partialIndexPostingsDir + "/postings_" + std::to_string(dictIndex) + ".bin"; - - postingsToMerge.push_back(readAndParsePosting(postingsFile, entry.offset, entry.postingSize)); - + + std::string postingsFile = + (dictIndex == partialIndexCount - 1) + ? 
partialIndexPostingsDir + "/postings_final.bin" + : partialIndexPostingsDir + "/postings_" + std::to_string(dictIndex) + ".bin"; + + postingsToMerge.push_back( + readAndParsePosting(postingsFile, entry.offset, entry.postingSize)); + // push next entry from this dictionary into the heap if (entryIndex + 1 < allDicts[dictIndex].size()) { const DictEntry& nextEntry = allDicts[dictIndex][entryIndex + 1]; minHeap.push({nextEntry.term, dictIndex, entryIndex + 1}); - } + } } - + // merge postings properly (sorted by docId, with positions merged and sorted) std::vector mergedPostings = mergePostings(postingsToMerge); - + // calculate actual docFreq (number of unique documents) uint32_t totalDocFreq = mergedPostings.size(); - + // write posting data to final postings file writePosting(finalPostOut, mergedPostings); - + // calculate size of merged posting uint64_t postingSize = 0; for (const auto& entry : mergedPostings) { - postingSize += sizeof(uint32_t) + sizeof(uint32_t) + entry.positions.size() * sizeof(uint32_t); + postingSize += + sizeof(uint32_t) + sizeof(uint32_t) + entry.positions.size() * sizeof(uint32_t); } - + // write dictionary entry to final dictionary file uint32_t termLen = term.size(); finalDictOut.write(reinterpret_cast(&termLen), sizeof(termLen)); finalDictOut.write(term.data(), termLen); finalDictOut.write(reinterpret_cast(&finalOffset), sizeof(finalOffset)); finalDictOut.write(reinterpret_cast(&totalDocFreq), sizeof(totalDocFreq)); - finalOffset += postingSize; termsProcessed++; @@ -326,14 +317,14 @@ int main(int argc, char* argv[]) { if (termsProcessed % 100000 == 0) { auto now = high_resolution_clock::now(); auto elapsed = std::chrono::duration_cast(now - start).count(); - std::cout << "Processed " << termsProcessed << " terms, elapsed time: " - << elapsed << "s" << std::endl; + std::cout << "Processed " << termsProcessed << " terms, elapsed time: " << elapsed + << "s" << std::endl; } } std::cout << "Merging completed successfully.\n"; - std::cout << 
"Time taken: " - << duration_cast(high_resolution_clock::now() - start).count() + std::cout << "Time taken: " + << duration_cast(high_resolution_clock::now() - start).count() << " seconds.\n"; finalPostOut.close(); finalDictOut.close(); diff --git a/src/backend/search_engine/index_builder/test_data/msmarco-docs.tsv b/src/backend/search_engine/index_builder/test_data/msmarco-docs.tsv index 69243b4..0224892 100644 --- a/src/backend/search_engine/index_builder/test_data/msmarco-docs.tsv +++ b/src/backend/search_engine/index_builder/test_data/msmarco-docs.tsv @@ -7,4 +7,5 @@ D5 http://example.com/5 Title Six beta D6 http://example.com/6 Title Seven gamma delta D7 http://example.com/7 Title Eight alpha delta delta D8 http://example.com/8 Title Nine beta gamma -D9 http://example.com/9 Title Ten alpha beta gamma delta \ No newline at end of file +D9 http://example.com/9 Title Ten alpha beta gamma delta +D10 http://example.com/10 Title Eleven Hello - what is up? These are the first 15 words of the body. Now, we have a big gap until the next window. This gap should be very long to see the effect of 2 potential different windows. Here begins the second snippet. 
\ No newline at end of file diff --git a/src/backend/search_engine/models/index.py b/src/backend/search_engine/models/index.py index 69fc457..3728ead 100644 --- a/src/backend/search_engine/models/index.py +++ b/src/backend/search_engine/models/index.py @@ -5,6 +5,7 @@ class SearchResult(BaseModel): document_id: int url: HttpUrl title: str + snippet: str class SearchResults(BaseModel): diff --git a/src/backend/search_engine/query/query_engine.py b/src/backend/search_engine/query/query_engine.py index f3bb81c..10e7212 100755 --- a/src/backend/search_engine/query/query_engine.py +++ b/src/backend/search_engine/query/query_engine.py @@ -107,7 +107,7 @@ def _bool_search(self, node: Node | None) -> PostingList: end = time.perf_counter() logger.debug( - f"Node={node.value}, Result docs={len(result.postings)}, " + f"Node={node.value!r}, Result docs={len(result.postings)}, " f"Execution time: {end - start:.6f} seconds" ) return result @@ -137,15 +137,16 @@ def search_results(self, limit: int = 10) -> SearchResults: correction = repl(self.corrector, raw_query) if not qt._has_operators(normalized_tokens): + self.inverted_index.doc_store.query_terms = list( + set(normalized_tokens) + ) # needed for snippets if (raw_query.startswith('"') and raw_query.endswith('"')) or ( raw_query.startswith("'") and raw_query.endswith("'") ): # positional phrase search logger.debug("Executing positional phrase query search...") normalized_tokens_no_quots = normalize_search_query(raw_query[1:-1]) - posting_lists = self._positional_phrase_search( - normalized_tokens_no_quots - ) + result = self._positional_phrase_search(normalized_tokens_no_quots) else: # any order -> create AND query logger.debug("Executing phrase query search...") @@ -153,22 +154,31 @@ def search_results(self, limit: int = 10) -> SearchResults: logger.debug(f"Converted to AND query: {and_query}") qt.parse_query(and_query) logger.debug(f"Query tree: {qt.root}") - posting_lists = self._bool_search(qt.root) + result = 
self._bool_search(qt.root) else: logger.debug("Executing bool query search...") try: qt.parse_query(normalized_tokens) + self.inverted_index.doc_store.query_terms = ( + qt.unique_terms + ) # needed for snippets logger.debug(f"Query tree: {qt.root}") - posting_lists = self._bool_search(qt.root) + result = self._bool_search(qt.root) except InvalidOperatorError as e: logger.error(f"Invalid query syntax: {e}") raise - if posting_lists is None or len(posting_lists.postings) == 0: + if result is None or len(result.postings) == 0: return SearchResults(search_results=[], correction=correction) + logger.debug( + f"Found {len(result.postings)} results in {time.perf_counter() - start:.6f} seconds" + ) + + top_n_results = result # TODO will be done by BM25 ranking later + search_results = [] - for doc_id in posting_lists.postings[:limit]: + for doc_id in top_n_results.postings[:limit]: doc_data = self.inverted_index.doc_store.get(doc_id) if doc_data is None: continue @@ -177,12 +187,14 @@ def search_results(self, limit: int = 10) -> SearchResults: if url is None: continue title = doc_data.title or "Untitled" + snippet = doc_data.snippet try: search_result = SearchResult( document_id=doc_id, url=url, # type: ignore[arg-type] title=title, + snippet=snippet, ) search_results.append(search_result) except Exception as e: @@ -194,4 +206,7 @@ def search_results(self, limit: int = 10) -> SearchResults: f"Returned {len(search_results)} results. 
" f"Total execution time: {end - start:.6f} seconds" ) + # clear cache to free memory + self.inverted_index.clear_cache() + return SearchResults(search_results=search_results, correction=correction) diff --git a/src/backend/search_engine/query/query_preprocessing.py b/src/backend/search_engine/query/query_preprocessing.py index 03750f4..4c3b863 100644 --- a/src/backend/search_engine/query/query_preprocessing.py +++ b/src/backend/search_engine/query/query_preprocessing.py @@ -38,6 +38,9 @@ class QueryTree: def __init__(self) -> None: self._root: Node | None = None self._warnings_stack: list[ParenthesesWarning] = [] + self.unique_terms: set[str] = ( + set() + ) # only positive (non-negated) terms for snippeting @property def root(self) -> Node | None: @@ -103,7 +106,7 @@ def _parse_query(self, tokens: list[str]) -> Node: return node # handle NOT and parentheses - def _parse_term(self, tokens: list[str]) -> Node: + def _parse_term(self, tokens: list[str], negated: bool = False) -> Node: if not tokens: raise ValueError("Unexpected end of tokens while parsing term") @@ -111,7 +114,7 @@ def _parse_term(self, tokens: list[str]) -> Node: if token in NOT: tokens.pop(0) - word_node = self._parse_term(tokens) + word_node = self._parse_term(tokens, negated=True) return Node(token, left=None, right=word_node) if token == "(": @@ -136,6 +139,8 @@ def _parse_term(self, tokens: list[str]) -> Node: # leaf node/actual word if token not in (AND | OR | NOT): + if not negated: + self.unique_terms.add(token) tokens.pop(0) return Node(token) diff --git a/tests/Dockerfile b/src/backend/tests/Dockerfile similarity index 62% rename from tests/Dockerfile rename to src/backend/tests/Dockerfile index 4eadbb3..43badbd 100644 --- a/tests/Dockerfile +++ b/src/backend/tests/Dockerfile @@ -3,7 +3,7 @@ FROM python:3.13-slim # system dependencies # build-essential for packages using c extensions # git for packages installed from git -# zlib and libstemmer for CMake build of index_builder +# 
libstemmer for CMake build of index_builder RUN apt-get update && apt-get install -y \ build-essential \ git \ @@ -20,18 +20,14 @@ ENV PATH="/root/.local/bin:$PATH" WORKDIR /app/src/backend # copy first to cache dependencies -COPY src/backend/pyproject.toml . -COPY src/backend/uv.lock . -COPY src/backend/bindings/ ./bindings/ +COPY pyproject.toml . +COPY uv.lock . +COPY bindings/ ./bindings/ RUN uv sync -COPY src/backend/ . +COPY . . -COPY tests/ /app/tests/ +RUN chmod +x /app/src/backend/tests/entrypoint.sh -COPY justfile /app/justfile - -RUN chmod +x /app/tests/entrypoint.sh - -ENTRYPOINT ["/app/tests/entrypoint.sh"] +ENTRYPOINT ["/app/src/backend/tests/entrypoint.sh"] \ No newline at end of file diff --git a/src/backend/tests/conftest.py b/src/backend/tests/conftest.py new file mode 100644 index 0000000..5c1cbce --- /dev/null +++ b/src/backend/tests/conftest.py @@ -0,0 +1,4 @@ +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent.parent)) diff --git a/tests/docker-compose.yml b/src/backend/tests/docker-compose.yml similarity index 80% rename from tests/docker-compose.yml rename to src/backend/tests/docker-compose.yml index 01b8ba5..c306e1f 100644 --- a/tests/docker-compose.yml +++ b/src/backend/tests/docker-compose.yml @@ -5,4 +5,4 @@ services: dockerfile: tests/Dockerfile image: seekr-pytest-ci-env environment: - ENV: "TEST_ENV" \ No newline at end of file + ENV: "TEST_ENV" \ No newline at end of file diff --git a/src/backend/tests/entrypoint.sh b/src/backend/tests/entrypoint.sh new file mode 100644 index 0000000..b71f084 --- /dev/null +++ b/src/backend/tests/entrypoint.sh @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +cd search_engine/scripts/ && \ + chmod +x build-index.sh && \ + ./build-index.sh 1024 -1 + +uv run pytest /app/src/backend/tests \ No newline at end of file diff --git a/tests/justfile b/src/backend/tests/justfile similarity index 100% rename from tests/justfile rename to src/backend/tests/justfile diff --git 
a/src/backend/tests/test_index.py b/src/backend/tests/test_index.py new file mode 100644 index 0000000..bff358c --- /dev/null +++ b/src/backend/tests/test_index.py @@ -0,0 +1,175 @@ +import pytest + +from cpp_utils import DocInfo, normalize_search_query +from backend.search_engine.index.index_loader import get_index + +EXPECTED = { + "alpha": { + "postings": [0, 1, 4, 7, 9], + "tf": { + 0: 2, + 1: 3, + 4: 1, + 7: 1, + 9: 1, + }, + "pos": { + 0: [0, 2], + 1: [0, 1, 2], + 4: [1], + 7: [0], + 9: [0], + }, + }, + "beta": { + "postings": [0, 2, 4, 5, 8, 9], + "tf": { + 0: 2, + 2: 2, + 4: 1, + 5: 1, + 8: 1, + 9: 1, + }, + "pos": { + 0: [1, 3], + 2: [0, 1], + 4: [2], + 5: [0], + 8: [0], + 9: [1], + }, + }, + "gamma": { + "postings": [2, 3, 6, 8, 9], + "tf": { + 2: 2, + 3: 3, + 6: 1, + 8: 1, + 9: 1, + }, + "pos": { + 2: [2, 3], + 3: [0, 1, 2], + 6: [0], + 8: [1], + 9: [2], + }, + }, + "delta": { + "postings": [3, 4, 6, 7, 9], + "tf": { + 3: 1, + 4: 1, + 6: 1, + 7: 2, + 9: 1, + }, + "pos": { + 3: [3], + 4: [0], + 6: [1], + 7: [1, 2], + 9: [3], + }, + }, +} + + +def test_index_metadata(): + inverted_index = get_index() + metadata = inverted_index.metadata + + expected_num_docs = 11 + # counting title + body, excluding stop words + expected_doc_lengths = { + 0: 6, + 1: 5, + 2: 6, + 3: 6, + 4: 5, + 5: 3, + 6: 4, + 7: 5, + 8: 4, + 9: 6, + 10: 31, + } + expected_avg_length = sum(expected_doc_lengths.values()) / expected_num_docs + assert metadata.num_docs == expected_num_docs + assert metadata.avg_doc_length == expected_avg_length + assert metadata.doc_lengths == expected_doc_lengths + + for doc_id, length in expected_doc_lengths.items(): + assert metadata.get_doc_length(doc_id) == length + + +@pytest.mark.parametrize("term", ["alpha", "beta", "gamma", "delta"]) +def test_real_index_postings(term): + inverted_index = get_index() + result = inverted_index.index.get(term) + + assert result.postings == EXPECTED[term]["postings"] + assert result.term_frequencies == 
EXPECTED[term]["tf"] + assert result.positions == EXPECTED[term]["pos"] + + assert result.postings == sorted(result.postings) + for doc_id, pos in result.positions.items(): + assert len(pos) == result.term_frequencies[doc_id] + + +class TestDocStore: + def test_docstore_metadata(self): + inverted_index = get_index() + doc_store = inverted_index.doc_store + + expected_docs = { + 0: DocInfo("http://example.com/0", "Title One", snippet=""), + 1: DocInfo("http://example.com/1", "Title Two", snippet=""), + 2: DocInfo("http://example.com/2", "Title Three", snippet=""), + 3: DocInfo("http://example.com/3", "Title Four", snippet=""), + 4: DocInfo("http://example.com/4", "Title Five", snippet=""), + 5: DocInfo("http://example.com/5", "Title Six", snippet=""), + 6: DocInfo("http://example.com/6", "Title Seven", snippet=""), + 7: DocInfo("http://example.com/7", "Title Eight", snippet=""), + 8: DocInfo("http://example.com/8", "Title Nine", snippet=""), + 9: DocInfo("http://example.com/9", "Title Ten", snippet=""), + 10: DocInfo("http://example.com/10", "Title Eleven", snippet=""), + } + + doc_store.query_terms = ["-1"] # not needed here, but must be set + for doc_id, doc_info in expected_docs.items(): + assert doc_store.get(doc_id).url == doc_info.url + assert doc_store.get(doc_id).title == doc_info.title + + @pytest.mark.parametrize( + "query, expected_snippet", + [ + ( + "hello", + "Hello - what is up? These are the first 15 words of the body. Now, we have ...", + ), + ( + "snippet", + "... very long to see the effect of 2 potential different windows. Here begins the second snippet", + ), + ( + "hello body", + "Hello - what is up? These are the first 15 words of the body. Now, we have a big gap until the next ...", + ), + ( + "hello snippet", + "Hello - what is up? These are the first 15 ... potential different windows. 
Here begins the second snippet", + ), + ], + ) + def test_docstore_snippets(self, query: str, expected_snippet: str): + inverted_index = get_index() + doc_store = inverted_index.doc_store + + DOC_ID = 10 + doc_store.query_terms = normalize_search_query(query) + snippet = doc_store.get(DOC_ID).snippet + + assert snippet == expected_snippet diff --git a/tests/test_query/test_query_engine.py b/src/backend/tests/test_query/test_query_engine.py similarity index 98% rename from tests/test_query/test_query_engine.py rename to src/backend/tests/test_query/test_query_engine.py index 510f3be..f9bfe9c 100644 --- a/tests/test_query/test_query_engine.py +++ b/src/backend/tests/test_query/test_query_engine.py @@ -638,9 +638,9 @@ def test_search_results_basic( positions={1: [0], 2: [1], 3: [2]}, ) mock_inverted_index.doc_store.get.side_effect = [ - DocInfo(url="http://example.com/1", title="Doc 1"), - DocInfo(url="http://example.com/2", title="Doc 2"), - DocInfo(url="http://example.com/3", title="Doc 3"), + DocInfo(url="http://example.com/1", title="Doc 1", snippet=""), + DocInfo(url="http://example.com/2", title="Doc 2", snippet=""), + DocInfo(url="http://example.com/3", title="Doc 3", snippet=""), ] mock_spell_corrector.correct.return_value = "test" @@ -664,7 +664,7 @@ def test_search_results_with_limit( positions={i: [0] for i in range(1, 6)}, ) mock_inverted_index.doc_store.get.side_effect = [ - DocInfo(url=f"http://example.com/{i}", title=f"Doc {i}") + DocInfo(url=f"http://example.com/{i}", title=f"Doc {i}", snippet="") for i in range(1, 6) ] @@ -750,7 +750,8 @@ def test_full_workflow( ] mock_inverted_index.doc_store.get.side_effect = [ - DocInfo(url=f"http://example.com/{i}", title=f"Doc {i}") for i in [2, 3, 5] + DocInfo(url=f"http://example.com/{i}", title=f"Doc {i}", snippet="") + for i in [2, 3, 5] ] term1 = Node(value="term1") diff --git a/tests/test_query/test_query_preprocessing.py b/src/backend/tests/test_query/test_query_preprocessing.py similarity index 100% 
rename from tests/test_query/test_query_preprocessing.py rename to src/backend/tests/test_query/test_query_preprocessing.py diff --git a/src/frontend/Dockerfile b/src/frontend/Dockerfile new file mode 100644 index 0000000..862ee6d --- /dev/null +++ b/src/frontend/Dockerfile @@ -0,0 +1,11 @@ +FROM node:20-slim + +WORKDIR /app + +COPY package.json package-lock.json . + +RUN npm ci + +COPY . . + +CMD ["npm", "run", "dev"] \ No newline at end of file diff --git a/src/frontend/src/components/SearchResults.tsx b/src/frontend/src/components/SearchResults.tsx index 1319a34..ac1d4b1 100644 --- a/src/frontend/src/components/SearchResults.tsx +++ b/src/frontend/src/components/SearchResults.tsx @@ -4,7 +4,7 @@ import { Card } from "@/components/ui/card"; interface SearchResult { title: string; url: string; - description?: string; + snippet?: string; } interface SearchResultsProps { @@ -37,9 +37,9 @@ export const SearchResults = ({ results }: SearchResultsProps) => {

{result.url}

- {result.description && ( + {(result.snippet) && (

- {result.description} +

)} diff --git a/src/frontend/src/pages/Index.tsx b/src/frontend/src/pages/Index.tsx index dc2d9b2..7b975da 100644 --- a/src/frontend/src/pages/Index.tsx +++ b/src/frontend/src/pages/Index.tsx @@ -10,7 +10,7 @@ import seekrLogo from "@/assets/seekr-logo.png"; interface SearchResult { title: string; url: string; - description?: string; + snippet?: string; } const Index = () => { @@ -128,7 +128,7 @@ const Index = () => { try { const response = await fetch( - `http://127.0.0.1:8000/search?q=${encodeURIComponent(query)}&limit=${customLimit}` + `/search?q=${encodeURIComponent(query)}&limit=${customLimit}` ); if (!response.ok) { @@ -136,7 +136,7 @@ const Index = () => { try { const data = await response.json(); if (data.detail) errorMsg = data.detail; - } catch {} + } catch { } throw new Error(errorMsg); } @@ -282,11 +282,10 @@ const Index = () => { key={rpp} onClick={() => handleResultsPerPageChange(rpp)} disabled={isLoading} - className={`px-3 py-2 text-sm rounded-lg border transition-colors ${ - resultsPerPage === rpp - ? "bg-primary text-primary-foreground border-primary" - : "border-input hover:bg-accent" - } disabled:opacity-50 disabled:cursor-not-allowed`} + className={`px-3 py-2 text-sm rounded-lg border transition-colors ${resultsPerPage === rpp + ? "bg-primary text-primary-foreground border-primary" + : "border-input hover:bg-accent" + } disabled:opacity-50 disabled:cursor-not-allowed`} > {rpp} @@ -371,22 +370,43 @@ const Index = () => { {/* Pagination */} {totalPages > 1 && ( -
- {Array.from({ length: totalPages }, (_, index) => index + 1).map( - (pageNum) => ( - + + {/* Page Numbers */} + {getPageNumbers().map((pageNum, index) => ( + - ) - )} + > + {pageNum} + + ))} + + {/* Next Button */} +
)} diff --git a/src/frontend/vite.config.ts b/src/frontend/vite.config.ts index da25c6d..af653ec 100644 --- a/src/frontend/vite.config.ts +++ b/src/frontend/vite.config.ts @@ -4,15 +4,25 @@ import path from "path"; import { componentTagger } from "lovable-tagger"; // https://vitejs.dev/config/ -export default defineConfig(({ mode }) => ({ - server: { - host: "::", - port: 8080, - }, - plugins: [react(), mode === "development" && componentTagger()].filter(Boolean), - resolve: { - alias: { - "@": path.resolve(__dirname, "./src"), +export default defineConfig(({ mode }) => { + const isDocker = process.env.ENV === "DOCKER"; + + return { + server: { + host: "::", + port: 8080, + proxy: { + "/search": { + target: isDocker ? "http://backend:8000" : "http://127.0.0.1:8000", + changeOrigin: true, + }, + }, + }, + plugins: [react(), mode === "development" && componentTagger()].filter(Boolean), + resolve: { + alias: { + "@": path.resolve(__dirname, "./src"), + }, }, - }, -})); + }; +}); diff --git a/tests/conftest.py b/tests/conftest.py deleted file mode 100644 index 8d9f898..0000000 --- a/tests/conftest.py +++ /dev/null @@ -1,4 +0,0 @@ -import sys -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) diff --git a/tests/entrypoint.sh b/tests/entrypoint.sh deleted file mode 100644 index 71af52a..0000000 --- a/tests/entrypoint.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -set -e - -just -f /app/justfile build-index - -uv run pytest /app/tests \ No newline at end of file diff --git a/tests/test_index_builder/test_index_builder.py b/tests/test_index_builder/test_index_builder.py deleted file mode 100644 index 69b852a..0000000 --- a/tests/test_index_builder/test_index_builder.py +++ /dev/null @@ -1,141 +0,0 @@ -import pytest - -from cpp_utils import DocInfo -from backend.search_engine.index.index_loader import get_index - -EXPECTED = { - "alpha": { - "postings": [0, 1, 4, 7, 9], - "tf": { - 0: 2, - 1: 3, - 4: 1, - 7: 1, - 9: 1, - }, - "pos": { 
- 0: [0, 2], - 1: [0, 1, 2], - 4: [1], - 7: [0], - 9: [0], - }, - }, - "beta": { - "postings": [0, 2, 4, 5, 8, 9], - "tf": { - 0: 2, - 2: 2, - 4: 1, - 5: 1, - 8: 1, - 9: 1, - }, - "pos": { - 0: [1, 3], - 2: [0, 1], - 4: [2], - 5: [0], - 8: [0], - 9: [1], - }, - }, - "gamma": { - "postings": [2, 3, 6, 8, 9], - "tf": { - 2: 2, - 3: 3, - 6: 1, - 8: 1, - 9: 1, - }, - "pos": { - 2: [2, 3], - 3: [0, 1, 2], - 6: [0], - 8: [1], - 9: [2], - }, - }, - "delta": { - "postings": [3, 4, 6, 7, 9], - "tf": { - 3: 1, - 4: 1, - 6: 1, - 7: 2, - 9: 1, - }, - "pos": { - 3: [3], - 4: [0], - 6: [1], - 7: [1, 2], - 9: [3], - }, - }, -} - - -def test_real_index_metadata(): - inverted_index = get_index() - metadata = inverted_index.metadata - - expected_num_docs = 10 - # counting title + body - expected_doc_lengths = { - 0: 6, - 1: 5, - 2: 6, - 3: 6, - 4: 5, - 5: 3, - 6: 4, - 7: 5, - 8: 4, - 9: 6, - } - expected_avg_length = sum(expected_doc_lengths.values()) / expected_num_docs - - assert metadata.num_docs == expected_num_docs - assert metadata.avg_doc_length == expected_avg_length - assert metadata.doc_lengths == expected_doc_lengths - - for doc_id, length in expected_doc_lengths.items(): - assert metadata.get_doc_length(doc_id) == length - - -@pytest.mark.parametrize("term", ["alpha", "beta", "gamma", "delta"]) -def test_real_index_postings(term): - inverted_index = get_index() - result = inverted_index.index.get(term) - - assert result.postings == EXPECTED[term]["postings"] - assert result.term_frequencies == EXPECTED[term]["tf"] - assert result.positions == EXPECTED[term]["pos"] - - assert result.postings == sorted(result.postings) - for doc_id, pos in result.positions.items(): - assert len(pos) == result.term_frequencies[doc_id] - - -def test_real_index_docstore(): - inverted_index = get_index() - doc_store = inverted_index.doc_store - - expected_docs = { - 0: DocInfo("http://example.com/0", "Title One"), - 1: DocInfo("http://example.com/1", "Title Two"), - 2: 
DocInfo("http://example.com/2", "Title Three"), - 3: DocInfo("http://example.com/3", "Title Four"), - 4: DocInfo("http://example.com/4", "Title Five"), - 5: DocInfo("http://example.com/5", "Title Six"), - 6: DocInfo("http://example.com/6", "Title Seven"), - 7: DocInfo("http://example.com/7", "Title Eight"), - 8: DocInfo("http://example.com/8", "Title Nine"), - 9: DocInfo("http://example.com/9", "Title Ten"), - } - - for doc_id, doc_info in expected_docs.items(): - assert doc_store.get(doc_id).url == doc_info.url - assert doc_store.get(doc_id).title == doc_info.title