Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 5 additions & 8 deletions .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -50,17 +50,14 @@ dist/
.DS_Store
Thumbs.db

# Node / frontend (falls frontend im Projekt ist)
node_modules/
dist/

**/*.tsv
**/*.gz
# allow test data files as they are small and necessary for tests
!src/backend/search_engine/index_builder/test_data/*.tsv
!src/backend/search_engine/index_builder/test_data/*.gz

/src/backend/search_engine/index_builder/build/
/src/backend/search_engine/index_builder/data/
/src/backend/search_engine/index/bin/
/src/backend/search_engine/models/neuspell-scrnn-probwordnoise/
src/backend/search_engine/tests/
src/backend/search_engine/index_builder/build/
src/backend/search_engine/index_builder/data/
src/backend/search_engine/index/bin/
src/backend/search_engine/models/neuspell-scrnn-probwordnoise/
2 changes: 1 addition & 1 deletion .github/actions/setup/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@ runs:
shell: bash
run: |
sudo apt-get update -y
sudo apt-get install -y libstemmer-dev build-essential git
sudo apt-get install -y libstemmer-dev build-essential git clang-format
7 changes: 3 additions & 4 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,6 @@ dist-ssr
!src/backend/search_engine/index_builder/test_data/*.tsv
!src/backend/search_engine/index_builder/test_data/*.gz

/src/backend/search_engine/index_builder/data/docstore
/src/backend/search_engine/index_builder/data/index
/src/backend/search_engine/index_builder/data/partial_indices
/src/backend/search_engine/index/bin/
/src/backend/search_engine/index_builder/data/
!src/backend/search_engine/index_builder/data/data.md
/src/backend/search_engine/index/bin/
35 changes: 32 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,44 @@ Seekr consists of several core subsystems working together:
* uv
* Node.js
* npm
* Docker (for containerized integration/unit tests)
* Docker (for containerized integration/unit tests & deployment)
* Git LFS (for downloading the ML models from GitHub)
* CMake (building and compiling the CPP components)
* Just (command runner)

## Entrypoints
### Docker

### Build the Index
Before running the system, build the index with a memory limit:
```bash
just deploy
```
This runs the automated build process: it downloads the dataset and builds the index
if they do not exist yet. This preprocessing can take up to 2 hours.

Afterwards, it spins up a frontend and a backend container.

- Access search engine frontend via `http://localhost:8080`.
- API-only: `http://localhost:8000`.

**Search Endpoint**

**GET** `/search`

Query parameters:

| Parameter | Type | Description |
| --------- | ------ | ---------------------------------------------- |
| `q` | string | Search query (1–50 characters) |
| `limit` | int | Maximum number of results (1–500, default: 10) |


### Manual usage
Download the dataset:
```bash
cd src && uv run --project backend python -m backend.search_engine.scripts.download_dataset
```

Build the index with a memory limit:
```bash
just build-index <memory-limit>
```
Expand Down
15 changes: 15 additions & 0 deletions deploy.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash
# Deployment entry point: makes sure the dataset and the index binaries exist
# (downloading / building them when missing) and then starts the containers.
set -euo pipefail

# All paths below are relative to the directory containing this script, so
# anchor the working directory there regardless of where it was invoked from.
cd "$(dirname "${BASH_SOURCE[0]}")"

DATASET_FILE="src/backend/search_engine/index_builder/data/msmarco-docs.tsv"
INDEX_BIN_DIR="src/backend/search_engine/index/bin/"

if [ ! -f "$DATASET_FILE" ]; then
    echo "msmarco-docs.tsv file not found. Starting download..."
    # Run in a subshell: a plain `cd src` would persist for the rest of the
    # script and break the relative paths used by the following steps.
    (cd src && uv run --project backend python -m backend.search_engine.scripts.download_dataset)
fi

# `ls -A` prints nothing for an empty or missing directory (errors are
# silenced), so an empty result means the index still has to be built.
if [ -z "$(ls -A "$INDEX_BIN_DIR" 2>/dev/null)" ]; then
    echo "Index binaries not found. Starting build process..."
    just build-index
fi

echo "Spinning up containers..."
docker compose up -d
21 changes: 21 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Compose stack: a backend API container and a frontend container.
# The obsolete top-level `version` key is intentionally omitted; Compose v2
# ignores it and prints a warning on every invocation.
name: "seekr-search-engine"
services:
  backend:
    build: src/backend
    container_name: seekr-backend
    ports:
      - "8000:8000" # expose backend for FE-less API access
    volumes: # bind mount large data files
      - ./src/backend/search_engine/index/bin:/app/src/backend/search_engine/index/bin
      - ./src/backend/search_engine/index_builder/data/msmarco-docs.tsv:/app/src/backend/search_engine/index_builder/data/msmarco-docs.tsv
      - ./src/backend/search_engine/models/neuspell-scrnn-probwordnoise:/app/src/backend/search_engine/models/neuspell-scrnn-probwordnoise
  frontend:
    build: src/frontend
    container_name: seekr-frontend
    # Start order only: the backend container is started before the frontend.
    depends_on:
      - backend
    ports:
      - "8080:8080"
    environment:
      ENV: "DOCKER"
25 changes: 20 additions & 5 deletions justfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ local *uvicorn-args:
chmod +x local.sh && \
./local.sh {{uvicorn-args}}

deploy:
chmod +x deploy.sh && \
./deploy.sh

build-index memory-limit="1024" max-docs="-1":
cd src/backend/search_engine/scripts/ && \
chmod +x build-index.sh && \
Expand All @@ -37,17 +41,28 @@ generate-stubs:
./generate-stubs.sh

lint:
@echo "Linting with Ruff..."
cd src/backend && uv run ruff check api/ search_engine/ ../../tests/
cd src/backend && uv run ruff format --check --diff api/ search_engine/ ../../tests/
@echo "Linting Python code..."
cd src/backend && uv run ruff check api/ search_engine/ tests/
cd src/backend && uv run ruff format --check --diff api/ search_engine/ tests/
@echo "Linting C++ code..." # only format-check instead of linting to avoid dependency-related failures
clang-format --dry-run --Werror \
src/backend/bindings/utils.cpp \
src/backend/search_engine/index_builder/index_builder.cpp \
src/backend/search_engine/index_builder/merge_partial_indices.cpp

format:
cd src/backend && uv run ruff format api/ search_engine/ ../../tests/
@echo "Formatting Python code..."
cd src/backend && uv run ruff format api/ search_engine/ tests/
@echo "Formatting C++ code..."
clang-format -i \
src/backend/bindings/utils.cpp \
src/backend/search_engine/index_builder/index_builder.cpp \
src/backend/search_engine/index_builder/merge_partial_indices.cpp

mypy:
@echo "Type checking with MyPy..."
cd src/backend && uv run mypy api/
cd src/backend && uv run mypy search_engine/

test:
just -f tests/justfile test
just -f src/backend/tests/justfile test
2 changes: 1 addition & 1 deletion local.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ export LOG_LEVEL=DEBUG
PIDS=()

cd src
uv run --project backend uvicorn backend.api.v1.app:app --host 127.0.0.1 --port 8000 "$@" &
uv run --project backend --refresh uvicorn backend.api.v1.app:app --host 127.0.0.1 --port 8000 "$@" &
PIDS+=($!)
echo "Uvicorn server started with PID ${PIDS[0]}"

Expand Down
3 changes: 3 additions & 0 deletions src/backend/.clang-format
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# clang-format configuration for the sources under this directory.
# Start from the predefined Google style; the keys below are set explicitly.
BasedOnStyle: Google
# Number of columns used for one indentation level.
IndentWidth: 4
# Maximum line length in columns.
ColumnLimit: 100
65 changes: 65 additions & 0 deletions src/backend/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# NOTE: unlike .gitignore, .dockerignore patterns are anchored at the root of
# the build context. A "**/" prefix is required to match at any depth.

# Python bytecode (generated in every package directory)
**/__pycache__/
**/*.py[cod]
**/*$py.class

# Virtual environments
.venv/
venv/
env/
ENV/
env.bak/
venv.bak/

# Poetry / uv / PDM
.python-version
.uv/
.poetry/
pdm.lock
.pdm-build/

# Test / coverage
.coverage
.coverage.*
**/.pytest_cache/
htmlcov/
nosetests.xml
coverage.xml
*.cover
*.py,cover

# Distribution / packaging
build/
dist/
**/*.egg-info/
*.egg
*.whl
*.tar.gz

# IDE / editor
.vscode/
.idea/
*.suo
*.ntvs*
*.njsproj
*.sln
**/*.sw?

# OS files
**/.DS_Store
**/Thumbs.db

# Node
**/node_modules/

# Large data files
**/*.tsv
**/*.gz
# allow test data files as they are small and necessary for tests
# (must come after the exclusions above: the last matching pattern wins)
!search_engine/index_builder/test_data/*.tsv
!search_engine/index_builder/test_data/*.gz

# Build output, datasets, index binaries and ML models
search_engine/index_builder/build/
search_engine/index_builder/data/
search_engine/index/bin/
search_engine/models/neuspell-scrnn-probwordnoise/
32 changes: 32 additions & 0 deletions src/backend/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Backend image: installs the system toolchain and uv, syncs the Python
# project, and serves the API with uvicorn on port 8000.
FROM python:3.13-slim

# system dependencies
# build-essential for packages using c extensions
# git for packages installed from git
# curl for downloading the uv installer
# cmake and libstemmer for CMake build of index_builder
RUN apt-get update && apt-get install -y \
    build-essential \
    git \
    curl \
    cmake \
    libstemmer-dev \
    && rm -rf /var/lib/apt/lists/*

# install uv and make it available on PATH
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
ENV PATH="/root/.local/bin:$PATH"

# set workdir to where pyproject.toml is located for uv
WORKDIR /app/src/backend

# copy first to cache dependencies
COPY pyproject.toml .
COPY uv.lock .
COPY bindings/ ./bindings/

RUN uv sync

COPY . .

# PYTHONPATH is not defined earlier in this file, so do not append
# "$PYTHONPATH": expanding an unset variable leaves a trailing ":" (an empty
# sys.path entry meaning the current directory) and triggers Docker's
# UndefinedVar build check.
ENV PYTHONPATH="/app/src"

CMD ["uv", "run", "uvicorn", "api.v1.app:app", "--host", "0.0.0.0", "--port", "8000"]
3 changes: 3 additions & 0 deletions src/backend/bindings/cpp_utils/_core.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,15 @@ class Metadata:
class DocStore:
def get(self, doc_id: int) -> DocInfo | None:
...
def get_tsv_offset(self, doc_id: int) -> int | None:
...
class IndexAccessor:
def get(self, term: str) -> PostingList | None:
...
class InvertedIndex:
def __init__(self, arg0: str) -> None:
...
def clear_cache(self) -> None: ...
@property
def doc_store(self) -> DocStore:
...
Expand Down
Loading
Loading