diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..e8eb5b3 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,20 @@ +.env +.env.* +!.env.example +.venv +venv +env +__pycache__ +*.py[cod] +.pytest_cache +.mypy_cache +.ruff_cache +.git +.gitignore +.DS_Store +data +docs/assets/*.gif +docs/assets/*.png +docs/assets/*.jpg +docs/assets/*.jpeg +.langgraph_api diff --git a/.env.example b/.env.example index c4e5c7f..03af09e 100644 --- a/.env.example +++ b/.env.example @@ -1,7 +1,15 @@ -OPENAI_API_KEY=your_key_here -OPENAI_MODEL=gpt-5.2 -OPENAI_FALLBACK_MODEL=5 +# Set DEMO_MODE=true to run the Streamlit UI without real OpenAI calls. +DEMO_MODE=false + +# Required unless DEMO_MODE=true. +OPENAI_API_KEY= + +# GPT-5.4 is the default portfolio model. If your account does not have access, +# set OPENAI_MODEL to another model you can use and keep the fallback valid. +OPENAI_MODEL=gpt-5.4 +OPENAI_FALLBACK_MODEL=gpt-5.4-mini OPENAI_REASONING_EFFORT=high + OPENAI_EMBEDDING_MODEL=text-embedding-3-small DOCS_DIR=docs DATA_DIR=data diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..c2b2511 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,28 @@ +name: CI + +on: + push: + branches: ["main", "chore/**"] + pull_request: + branches: ["main"] + +jobs: + test: + runs-on: ubuntu-latest + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + cache: "pip" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install -r requirements.txt + + - name: Run pytest + run: pytest diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..e46335b --- /dev/null +++ b/Dockerfile @@ -0,0 +1,27 @@ +FROM python:3.11-slim + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PYTHONPATH=/app/src \ + STREAMLIT_SERVER_ADDRESS=0.0.0.0 \ + STREAMLIT_SERVER_PORT=8501 + 
+WORKDIR /app + +RUN apt-get update \ + && apt-get install -y --no-install-recommends build-essential curl \ + && rm -rf /var/lib/apt/lists/* + +COPY requirements.txt . +RUN python -m pip install --upgrade pip \ + && python -m pip install --no-cache-dir -r requirements.txt + +COPY app.py langgraph.json ./ +COPY src ./src +COPY scripts ./scripts +COPY docs ./docs +COPY .env.example ./.env.example + +EXPOSE 8501 + +CMD ["streamlit", "run", "app.py", "--server.address=0.0.0.0", "--server.port=8501"] diff --git a/README.md b/README.md index 5f17000..ed29add 100644 --- a/README.md +++ b/README.md @@ -1,63 +1,258 @@ -# Senior Product Manager AI Agent (MVP) +# Senior Product Manager AI Agent -Локальный облегченный MVP доменно-экспертного PM-ассистента для D2C/B2C продуктов с подпиской в нише снижения веса и telehealth. +Кратко по-русски: это local-first AI ассистент для анализа D2C/B2C subscription, weight-loss, wellness/nutra и telehealth продуктов. Проект показывает практическую LLM-инженерию: LangGraph workflow, RAG на FAISS, OpenAI embeddings, structured subagent outputs, memory summary, Streamlit UI и debug sidebar. + +## Overview + +Senior Product Manager AI Agent is a portfolio-grade MVP of a domain-expert assistant for product, growth, retention, and compliance analysis in subscription health businesses. + +It is intentionally lightweight: a Python app with a Streamlit chat UI, a deterministic LangGraph workflow, prompt-routed specialist subagents, a local FAISS knowledge base, and optional demo mode that runs without real OpenAI calls. + +## Problem + +D2C and telehealth subscription teams often need to reason across several connected systems at once: + +- acquisition and funnel conversion +- onboarding and activation +- churn, refunds, pricing, LTV, and payback +- ad policy and health-claim risk +- product strategy tradeoffs under incomplete data + +Generic chatbots tend to give shallow startup advice. 
This project makes those tradeoffs explicit and forces the assistant to think like a senior PM/operator: metrics first, practical experiments, compliance-aware growth, and clear assumptions. + +## Key Features + +- LangGraph workflow with explicit `retrieve -> route -> subagents -> synthesize` steps. +- Prompt-routed subagents: `GrowthAgent`, `SubscriptionAgent`, `ComplianceAgent`, and `PMStrategistAgent`. +- FAISS-based local RAG over Markdown knowledge files. +- OpenAI text embeddings via `text-embedding-3-small`. +- OpenAI GPT-5.4 as the default LLM, with configurable fallback. +- Compressed structured JSON outputs from subagents. +- Conversation memory summary to keep context compact. +- Streamlit chat UI with streaming-style response rendering. +- Debug sidebar showing active subagents, retrieved chunks, token usage, memory summary, and raw subagent outputs. +- Demo mode for screenshots, interviews, and local review without API calls. +- Mocked pytest coverage and CI. +- Docker and Docker Compose support. + +## Architecture + +Full architecture notes live in [docs/architecture.md](docs/architecture.md). + +```mermaid +flowchart LR + User[User] --> UI[Streamlit Chat UI] + UI --> Orchestrator[PMOrchestrator] + Orchestrator --> Retrieve[retrieve] + Retrieve --> Route[route] + Route --> Growth[GrowthAgent] + Growth --> Subscription[SubscriptionAgent] + Subscription --> Compliance[ComplianceAgent] + Compliance --> Synthesize[synthesize] + Synthesize --> Final[Final PM Answer] + Final --> UI +``` + +## Agent Workflow + +The orchestrator is a simple linear LangGraph graph. The router determines which subagents are active, but execution stays predictable and cheap. + +1. `retrieve`: search the local FAISS index for relevant knowledge chunks. +2. `route`: activate subagents based on query keywords. +3. `GrowthAgent`: diagnose funnel, CRO, acquisition, activation, and onboarding. +4. `SubscriptionAgent`: analyze retention, pricing, CAC/LTV, churn, refunds, and payback. 
+5. `ComplianceAgent`: flag risky health, telehealth, and ad-policy claims when relevant. +6. `PMStrategistAgent`: synthesize evidence and subagent outputs into the final PM answer. + +Growth and subscription agents are included by default because most subscription product questions require both conversion and economics reasoning. + +## RAG Pipeline + +```mermaid +flowchart LR + Docs[docs/] --> Load[Load .md and .txt files] + Load --> Chunk[Chunking with overlap] + Chunk --> Embed[OpenAI text embeddings] + Embed --> Index[FAISS local index] + Index --> Retrieve[Retrieve top-k chunks] + Retrieve --> Evidence[Evidence block] + Evidence --> Workflow[LangGraph LLM workflow] +``` + +Knowledge folders: + +- `docs/growth`: funnel optimization, activation, pricing, paywalls, experimentation. +- `docs/retention`: subscription economics, churn, lifecycle, save flows. +- `docs/compliance`: ad policy, telehealth, health-claim guardrails. +- `docs/competitors`: digital health market and competitor context. + +## Tech Stack -## Стек - Python - LangGraph -- OpenAI API (`gpt-5.2` только для LLM-вызовов) -- OpenAI text embeddings (`text-embedding-3-small`) +- OpenAI API +- OpenAI text embeddings - FAISS local vector store -- Streamlit UI - -## Структура проекта -- `app.py` - чат-приложение Streamlit + debug-панель в сайдбаре. -- `src/pm_agent/config.py` - настройки окружения. -- `src/pm_agent/llm.py` - обертки для OpenAI chat и embeddings. -- `src/pm_agent/prompts.py` - системные промпты и ключевые слова роутинга. -- `src/pm_agent/rag.py` - загрузка документов, чанкинг, сборка/поиск в FAISS. -- `src/pm_agent/orchestrator.py` - последовательный оркестратор LangGraph + сабагенты. -- `src/pm_agent/memory.py` - summary-память диалога. -- `scripts/ingest_docs.py` - сборка FAISS-индекса из `/docs`. -- `docs/*` - стартовая база знаний. - -## Quickstart -1. Создайте и активируйте виртуальное окружение. -2. 
Установите зависимости: - ```bash - python -m pip install -r requirements.txt - ``` -3. Настройте окружение: - ```bash - cp .env.example .env - # укажите OPENAI_API_KEY - ``` - Опционально: fallback-модель, если `gpt-5.2` недоступна для вашего ключа: - ```bash - # in .env - OPENAI_MODEL=gpt-5.2 - OPENAI_FALLBACK_MODEL=gpt-5 - OPENAI_REASONING_EFFORT=high - ``` -4. Соберите локальный индекс: - ```bash - python scripts/ingest_docs.py - ``` -5. Запустите UI: - ```bash - streamlit run app.py - ``` - -## Импорт внешних статей в базу знаний -Используйте это, чтобы импортировать выбранные URL в `docs/growth` и `docs/competitors`: +- Streamlit +- pytest +- Docker / Docker Compose + +## Demo + +Demo questions are available in [docs/demo_questions.md](docs/demo_questions.md). + +Run the app without OpenAI calls: + +```bash +DEMO_MODE=true streamlit run app.py +``` + +Screenshots: + +![Demo flow](docs/assets/demo_flow.gif) + +![Main chat UI](docs/assets/main_chat_ui.png) + +![Debug sidebar with active subagents](docs/assets/debug_sidebar_subagents.png) + +![Retrieved chunks](docs/assets/retrieved_chunks.png) + +![Final answer example](docs/assets/final_answer_example.png) + +## Setup + +Create and activate a virtual environment: + +```bash +python -m venv .venv +source .venv/bin/activate +``` + +Install dependencies: + +```bash +python -m pip install --upgrade pip +python -m pip install -r requirements.txt +``` + +Create local environment config: + +```bash +cp .env.example .env +``` + +Set your OpenAI key in `.env`: + +```bash +OPENAI_API_KEY=your_key_here +OPENAI_MODEL=gpt-5.4 +OPENAI_FALLBACK_MODEL=gpt-5.4-mini +OPENAI_REASONING_EFFORT=high +``` + +If your account does not have access to the default model, replace `OPENAI_MODEL` and `OPENAI_FALLBACK_MODEL` with models available to your OpenAI project. 
+ +Start Streamlit: + +```bash +streamlit run app.py +``` + +## Docker Setup + +Run in demo mode: + +```bash +DEMO_MODE=true docker compose up --build +``` + +Run with OpenAI: + +```bash +cp .env.example .env +# edit .env and set OPENAI_API_KEY +docker compose up --build +``` + +The Compose setup mounts local `docs/` and `data/` folders: + +- `docs/` lets you edit the knowledge base locally. +- `data/` stores the generated FAISS index and chunk store. + +Open the UI at: + +```text +http://localhost:8501 +``` + +## How to Build FAISS Index + +After adding or editing documents under `docs/`, rebuild the local index: + +```bash +python scripts/ingest_docs.py +``` + +Optional external article import: ```bash python scripts/import_external_articles.py python scripts/ingest_docs.py ``` -## Примечания -- Сабагенты роутятся промптом и вызываются оркестратором последовательно. -- Передача данных между сабагентами использует сжатый общий JSON-контракт (`agent`, `summary`, `key_findings`, `recommendations`, `assumptions`, `compliance_flags`, опционально `experiments`). -- Нет автономных циклов, внешней БД и enterprise-инфраструктуры. -- В debug-сайдбаре отображаются активные сабагенты, извлеченные чанки и расход токенов. +Generated index files are written to `data/` and intentionally excluded from git. + +## Debug Sidebar + +The Streamlit sidebar shows: + +- active subagents selected by the router +- retrieved FAISS chunks with similarity scores +- token usage estimates or API usage +- conversation memory summary +- raw structured subagent outputs +- active LLM and fallback model + +This is useful for debugging RAG grounding, routing behavior, and answer quality during portfolio demos. 
+ +## Tests + +Run the mocked test suite: + +```bash +pytest +``` + +Run lightweight eval and stress scripts: + +```bash +python scripts/run_evals.py +python scripts/run_stress_tests.py +``` + +Run everything: + +```bash +bash scripts/run_all_quality_checks.sh +``` + +The automated checks avoid real OpenAI calls and cover chunking, document loading, FAISS/RAG build with fake embeddings, router logic, structured output normalization, memory summary behavior, and settings loading. + +## Roadmap + +- Add richer retrieval metadata and source summaries. +- Add a small regression eval dataset with expected answer traits. +- Add optional LangGraph Studio trace instructions for deeper graph inspection. +- Add better response formatting controls for long tables. +- Add a saved demo conversation mode. +- Add a small CLI for ingestion and eval commands. + +## Known Limitations + +- This is an MVP, not a production medical, legal, or compliance system. +- RAG is simple top-k retrieval over local Markdown files. +- FAISS index files are local and must be rebuilt after knowledge-base changes. +- Subagents are prompt-based and sequential, not autonomous planners. +- The debug token panel is approximate when using mocked/demo mode. +- Compliance output is a product-policy aid, not legal or regulatory advice. +- Model availability depends on the user's OpenAI project access. 
diff --git a/app.py b/app.py index f45b57a..de0509e 100644 --- a/app.py +++ b/app.py @@ -13,6 +13,7 @@ sys.path.insert(0, str(ROOT / "src")) from pm_agent.config import load_settings +from pm_agent.demo import DemoLLMClient, DemoRAGStore from pm_agent.llm import OpenAIClient from pm_agent.memory import ConversationMemory from pm_agent.orchestrator import PMOrchestrator @@ -62,14 +63,18 @@ def render_assistant_plain(text: str, container) -> None: @st.cache_resource def build_runtime() -> tuple: settings = load_settings() - llm_client = OpenAIClient( - api_key=settings.openai_api_key, - model=settings.openai_model, - embedding_model=settings.embedding_model, - fallback_model=settings.fallback_openai_model, - reasoning_effort=settings.openai_reasoning_effort, - ) - rag_store = RAGStore(settings, llm_client) + if settings.demo_mode: + llm_client = DemoLLMClient() + rag_store = DemoRAGStore() + else: + llm_client = OpenAIClient( + api_key=settings.openai_api_key, + model=settings.openai_model, + embedding_model=settings.embedding_model, + fallback_model=settings.fallback_openai_model, + reasoning_effort=settings.openai_reasoning_effort, + ) + rag_store = RAGStore(settings, llm_client) orchestrator = PMOrchestrator(settings, llm_client, rag_store) memory = ConversationMemory(llm_client=llm_client, refresh_after_messages=8) return settings, rag_store, orchestrator, memory @@ -117,6 +122,22 @@ def render_debug_panel(debug_data: dict, memory_summary: str, has_index: bool) - st.write(subagent_outputs.get("compliance", "")) +def build_debug_data(result, token_usage: dict[str, int] | None = None) -> dict: + return { + "active_subagents": result.active_subagents, + "retrieved_chunks": [ + { + "source": c.source, + "score": c.score, + "text": c.text, + } + for c in result.retrieved_chunks + ], + "token_usage": token_usage if token_usage is not None else result.token_usage,  # explicit None check: keep a caller-supplied empty dict + "subagent_outputs": result.subagent_outputs, + } + + def stream_text(text: str, delay: float = 0.01) -> None: placeholder = st.empty() 
rendered = "" @@ -182,7 +203,7 @@ def main() -> None: unsafe_allow_html=True, ) st.title("Senior Product Manager AI Agent (Weight Loss / Telehealth)") - st.caption("MVP stack: LangGraph + GPT-5.2 + OpenAI Small Embeddings + FAISS + Streamlit") + st.caption("MVP stack: LangGraph + OpenAI GPT-5.4 + OpenAI embeddings + FAISS + Streamlit") settings, rag_store, orchestrator, memory = build_runtime() @@ -193,8 +214,26 @@ def main() -> None: if "debug_data" not in st.session_state: st.session_state.debug_data = {} + if settings.demo_mode and st.query_params.get("demo") == "1" and not st.session_state.messages: + demo_query = ( + "We have a D2C weight-loss subscription funnel with paid social traffic, " + "quiz onboarding, a 7-day trial, and high month-1 churn. What should we fix first?" + ) + result = orchestrator.run( + user_query=demo_query, + chat_history=[{"role": "user", "content": demo_query}], + memory_summary="", + ) + st.session_state.messages = [ + {"role": "user", "content": demo_query}, + {"role": "assistant", "content": result.final_answer}, + ] + st.session_state.debug_data = build_debug_data(result) + with st.sidebar: st.header("Knowledge Base") + if settings.demo_mode: + st.info("Demo mode is enabled. 
The app uses deterministic local responses and no OpenAI calls.") st.write(f"Primary LLM: `{settings.openai_model}`") st.write( f"Fallback LLM: `{settings.fallback_openai_model or '(disabled)'}`" @@ -263,19 +302,7 @@ def main() -> None: token_usage["completion_tokens"] = token_usage.get("completion_tokens", 0) + summary_usage.completion_tokens token_usage["total_tokens"] = token_usage.get("total_tokens", 0) + summary_usage.total_tokens - st.session_state.debug_data = { - "active_subagents": result.active_subagents, - "retrieved_chunks": [ - { - "source": c.source, - "score": c.score, - "text": c.text, - } - for c in result.retrieved_chunks - ], - "token_usage": token_usage, - "subagent_outputs": result.subagent_outputs, - } + st.session_state.debug_data = build_debug_data(result, token_usage=token_usage) if __name__ == "__main__": diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..7c91d69 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,20 @@ +services: + product-manager-agent: + build: . + ports: + - "8501:8501" + environment: + DEMO_MODE: ${DEMO_MODE:-false} + OPENAI_API_KEY: ${OPENAI_API_KEY:-} + OPENAI_MODEL: ${OPENAI_MODEL:-gpt-5.4} + OPENAI_FALLBACK_MODEL: ${OPENAI_FALLBACK_MODEL:-gpt-5.4-mini} + OPENAI_REASONING_EFFORT: ${OPENAI_REASONING_EFFORT:-high} + OPENAI_EMBEDDING_MODEL: ${OPENAI_EMBEDDING_MODEL:-text-embedding-3-small} + DOCS_DIR: /app/docs + DATA_DIR: /app/data + TOP_K: ${TOP_K:-4} + MAX_HISTORY_MESSAGES: ${MAX_HISTORY_MESSAGES:-8} + volumes: + - ./docs:/app/docs + - ./data:/app/data + command: streamlit run app.py --server.address=0.0.0.0 --server.port=8501 diff --git a/docs/architecture.md b/docs/architecture.md new file mode 100644 index 0000000..2b5d180 --- /dev/null +++ b/docs/architecture.md @@ -0,0 +1,59 @@ +# Architecture + +This project is a local-first MVP of a domain-expert AI assistant for D2C/B2C subscription, telehealth, wellness, and weight-loss product analytics. 
The implementation intentionally avoids heavy infrastructure: Streamlit handles the UI, LangGraph handles the workflow, OpenAI handles LLM and embedding calls, and FAISS stores the local retrieval index. + +## Agent Workflow + +```mermaid +flowchart LR + User[User] --> UI[Streamlit Chat UI] + UI --> Orchestrator[PMOrchestrator] + Orchestrator --> Retrieve[retrieve] + Retrieve --> Route[route] + Route --> Growth[GrowthAgent] + Growth --> Subscription[SubscriptionAgent] + Subscription --> Compliance[ComplianceAgent] + Compliance --> Synthesize[synthesize] + Synthesize --> Final[Final PM Answer] + Final --> UI +``` + +The graph is intentionally linear. The router determines which subagents are active, but the orchestrator still keeps the execution path predictable and cheap. Growth and subscription analysis are included by default because most product questions in this domain require both funnel and economics thinking. + +## RAG Pipeline + +```mermaid +flowchart LR + Docs[docs/] --> Load[Load .md and .txt files] + Load --> Chunk[Chunking with overlap] + Chunk --> Embed[OpenAI text embeddings] + Embed --> Index[FAISS local index] + Index --> Retrieve[Retrieve top-k chunks] + Retrieve --> Evidence[Evidence block] + Evidence --> Workflow[LangGraph LLM workflow] +``` + +The knowledge base is deliberately simple: + +- `docs/growth` contains funnel, activation, pricing, paywall, and experimentation material. +- `docs/retention` contains churn, lifecycle, and subscription economics material. +- `docs/compliance` contains ad policy and health-claim guardrails. +- `docs/competitors` contains market and digital health context. + +## Main Components + +- `app.py`: Streamlit chat UI, debug sidebar, demo mode, and runtime wiring. +- `src/pm_agent/orchestrator.py`: LangGraph workflow, retrieval, routing, subagent calls, synthesis, token aggregation. +- `src/pm_agent/prompts.py`: domain-specific prompts and compressed subagent output contract. 
+- `src/pm_agent/rag.py`: document loading, chunking, FAISS indexing, and retrieval. +- `src/pm_agent/llm.py`: OpenAI chat/embedding wrapper with compatibility retries and fallback model handling. +- `src/pm_agent/memory.py`: compact conversation summary memory. +- `src/pm_agent/demo.py`: deterministic local demo client used for screenshots and API-key-free demos. + +## Design Choices + +- Linear graph over autonomous loops keeps cost and debugging predictable. +- Prompt-routed subagents provide domain specialization without agent swarm complexity. +- Structured subagent outputs make synthesis easier to inspect and test. +- Local FAISS keeps the MVP fast, cheap, and simple to run. +- The debug sidebar exposes active subagents, retrieved chunks, token usage, and raw subagent outputs for portfolio review. diff --git a/docs/assets/debug_sidebar_subagents.png b/docs/assets/debug_sidebar_subagents.png new file mode 100644 index 0000000..93903b2 Binary files /dev/null and b/docs/assets/debug_sidebar_subagents.png differ diff --git a/docs/assets/demo_flow.gif b/docs/assets/demo_flow.gif new file mode 100644 index 0000000..ab963ba Binary files /dev/null and b/docs/assets/demo_flow.gif differ diff --git a/docs/assets/final_answer_example.png b/docs/assets/final_answer_example.png new file mode 100644 index 0000000..1d76c8c Binary files /dev/null and b/docs/assets/final_answer_example.png differ diff --git a/docs/assets/main_chat_ui.png b/docs/assets/main_chat_ui.png new file mode 100644 index 0000000..00225eb Binary files /dev/null and b/docs/assets/main_chat_ui.png differ diff --git a/docs/assets/retrieved_chunks.png b/docs/assets/retrieved_chunks.png new file mode 100644 index 0000000..9250501 Binary files /dev/null and b/docs/assets/retrieved_chunks.png differ diff --git a/docs/demo_questions.md b/docs/demo_questions.md new file mode 100644 index 0000000..aa95b23 --- /dev/null +++ b/docs/demo_questions.md @@ -0,0 +1,17 @@ +# Demo Questions + +Use these prompts to 
demonstrate the agent's domain reasoning, RAG grounding, structured subagent workflow, and debug sidebar. + +1. We have a D2C weight-loss subscription funnel with paid social traffic, quiz onboarding, a 7-day trial, and high month-1 churn. What should we fix first? + +2. Our checkout conversion is stable, but CAC increased 35% and first renewal retention dropped. Diagnose the likely issue and propose a 14-day plan. + +3. Design a pricing test for a telehealth-enabled weight-loss subscription with a monthly plan, annual upsell, and optional provider consult. + +4. Review this ad claim for Meta and Google Ads risk: "Lose 20 pounds fast with our clinically proven plan, no diet or exercise required." + +5. Our quiz start rate is strong, but only 38% complete the quiz and 18% reach checkout. How would you instrument and improve this funnel? + +6. We want to reduce cancellation without dark patterns. What save-flow and lifecycle interventions should we test? + +7. Given CAC of $95, ARPU of $89/month, 68% gross margin, and monthly churn of 12%, estimate LTV/payback and identify the highest-leverage product lever. diff --git a/docs/evals.md b/docs/evals.md new file mode 100644 index 0000000..a96d0d7 --- /dev/null +++ b/docs/evals.md @@ -0,0 +1,36 @@ +# Lightweight Evaluation Guide + +This MVP uses lightweight qualitative evals plus mocked unit tests. The goal is not benchmark perfection; it is to verify that the assistant behaves like a credible senior PM for subscription, growth, and telehealth product questions. + +## Evaluation Criteria + +Score each answer from 1 to 5. + +| Criterion | What Good Looks Like | +|---|---| +| Relevance | Directly answers the user's product question and stays in the D2C/B2C subscription or telehealth context. | +| Groundedness | Uses retrieved knowledge as evidence and avoids unsupported facts or invented citations. | +| Practical Value | Produces concrete recommendations, metrics, experiments, owners, and decision rules. 
| +| Compliance Risk | Avoids guaranteed health claims, diagnosis/treatment advice, policy-unsafe ad copy, and manipulative retention tactics. | +| Actionability | Gives a PM or growth team a realistic next step for the next 7-30 days. | + +## Suggested Manual Eval Flow + +1. Run the app in demo mode or with an OpenAI key. +2. Ask the prompts from `docs/demo_questions.md`. +3. Open the debug sidebar and confirm active subagents match the query. +4. Check that retrieved chunks are plausible for the question. +5. Score the final answer on the five criteria above. +6. Record weak answers and improve prompts, source docs, or routing keywords. + +## Automated Checks + +The repository also includes mocked tests and scripts that do not call OpenAI: + +```bash +pytest +python scripts/run_evals.py +python scripts/run_stress_tests.py +``` + +These checks cover routing, JSON extraction/normalization, chunking, FAISS indexing with fake embeddings, memory summary refresh behavior, settings loading, and boundary/stress behavior. 
diff --git a/package.json b/package.json new file mode 100644 index 0000000..2dd1a5d --- /dev/null +++ b/package.json @@ -0,0 +1,15 @@ +{ + "name": "product-manager", + "version": "1.0.0", + "description": "Lightweight local-first MVP for a domain-expert PM assistant in D2C/B2C weight-loss subscription and telehealth products.", + "main": "index.js", + "directories": { + "doc": "docs" + }, + "scripts": { + "test": "echo \"Error: no test specified\" && exit 1" + }, + "keywords": [], + "author": "", + "license": "ISC" +} diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..20c8418 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +pythonpath = src +testpaths = tests +addopts = -q diff --git a/requirements.txt b/requirements.txt index 4a7c1d4..acc2e16 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,5 @@ faiss-cpu>=1.8.0 numpy>=1.26.0 python-dotenv>=1.0.1 trafilatura>=2.0.0 +requests>=2.32.0 +pytest>=8.2.0 diff --git a/scripts/ingest_docs.py b/scripts/ingest_docs.py index 13ce795..ab57a84 100644 --- a/scripts/ingest_docs.py +++ b/scripts/ingest_docs.py @@ -7,17 +7,23 @@ sys.path.insert(0, str(ROOT / "src")) from pm_agent.config import load_settings +from pm_agent.demo import DemoLLMClient from pm_agent.llm import OpenAIClient from pm_agent.rag import RAGStore def main() -> None: settings = load_settings() - llm_client = OpenAIClient( - api_key=settings.openai_api_key, - model=settings.openai_model, - embedding_model=settings.embedding_model, - ) + if settings.demo_mode: + llm_client = DemoLLMClient() + else: + llm_client = OpenAIClient( + api_key=settings.openai_api_key, + model=settings.openai_model, + embedding_model=settings.embedding_model, + fallback_model=settings.fallback_openai_model, + reasoning_effort=settings.openai_reasoning_effort, + ) rag_store = RAGStore(settings=settings, llm_client=llm_client) stats = rag_store.build_from_docs(docs_dir=settings.docs_dir) diff --git a/scripts/run_all_quality_checks.sh 
b/scripts/run_all_quality_checks.sh index 39a9d5b..541a8c6 100755 --- a/scripts/run_all_quality_checks.sh +++ b/scripts/run_all_quality_checks.sh @@ -4,12 +4,16 @@ set -euo pipefail cd "$(dirname "$0")/.." export PYTHONPATH="src" +PYTHON_BIN="${PYTHON:-python3}" +if [[ -x ".venv/bin/python" ]]; then + PYTHON_BIN=".venv/bin/python" +fi echo "[1/3] Running unit and boundary tests..." -python -m unittest discover -s tests -p 'test_*.py' -v +"$PYTHON_BIN" -m pytest echo "[2/3] Running evals..." -python scripts/run_evals.py +"$PYTHON_BIN" scripts/run_evals.py echo "[3/3] Running stress tests..." -python scripts/run_stress_tests.py +"$PYTHON_BIN" scripts/run_stress_tests.py diff --git a/scripts/run_evals.py b/scripts/run_evals.py index d706f9f..21ac037 100644 --- a/scripts/run_evals.py +++ b/scripts/run_evals.py @@ -1,9 +1,13 @@ from __future__ import annotations import json +import sys import tempfile from pathlib import Path +ROOT = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(ROOT / "src")) + from pm_agent.config import Settings from pm_agent.llm import UsageStats from pm_agent.orchestrator import PMOrchestrator diff --git a/scripts/run_stress_tests.py b/scripts/run_stress_tests.py index cadaf3c..0db46d1 100644 --- a/scripts/run_stress_tests.py +++ b/scripts/run_stress_tests.py @@ -2,9 +2,13 @@ import random import statistics +import sys import time from pathlib import Path +ROOT = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(ROOT / "src")) + from pm_agent.config import Settings from pm_agent.llm import UsageStats from pm_agent.orchestrator import PMOrchestrator diff --git a/src/pm_agent/config.py b/src/pm_agent/config.py index 35f7512..78cefa1 100644 --- a/src/pm_agent/config.py +++ b/src/pm_agent/config.py @@ -10,14 +10,15 @@ @dataclass(frozen=True) class Settings: openai_api_key: str - openai_model: str = "gpt-5.3" - fallback_openai_model: str = "" + openai_model: str = "gpt-5.4" + fallback_openai_model: str = "gpt-5.4-mini" 
openai_reasoning_effort: str = "high" embedding_model: str = "text-embedding-3-small" docs_dir: Path = Path("docs") data_dir: Path = Path("data") top_k: int = 4 max_history_messages: int = 8 + demo_mode: bool = False @property def faiss_index_path(self) -> Path: @@ -31,8 +32,9 @@ def chunk_store_path(self) -> Path: def load_settings() -> Settings: load_dotenv() + demo_mode = os.getenv("DEMO_MODE", "false").strip().lower() in {"1", "true", "yes", "on"} api_key = os.getenv("OPENAI_API_KEY", "").strip() - if not api_key: + if not api_key and not demo_mode: raise ValueError("OPENAI_API_KEY is required. Add it to .env or environment variables.") docs_dir = Path(os.getenv("DOCS_DIR", "docs")) @@ -40,13 +42,14 @@ def load_settings() -> Settings: data_dir.mkdir(parents=True, exist_ok=True) return Settings( - openai_api_key=api_key, - openai_model=os.getenv("OPENAI_MODEL", "gpt-5.3"), - fallback_openai_model=os.getenv("OPENAI_FALLBACK_MODEL", "").strip(), + openai_api_key=api_key or "demo-mode-no-api-key", + openai_model=os.getenv("OPENAI_MODEL", "gpt-5.4"), + fallback_openai_model=os.getenv("OPENAI_FALLBACK_MODEL", "gpt-5.4-mini").strip(), openai_reasoning_effort=os.getenv("OPENAI_REASONING_EFFORT", "high").strip(), embedding_model=os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-3-small"), docs_dir=docs_dir, data_dir=data_dir, top_k=int(os.getenv("TOP_K", "4")), max_history_messages=int(os.getenv("MAX_HISTORY_MESSAGES", "8")), + demo_mode=demo_mode, ) diff --git a/src/pm_agent/demo.py b/src/pm_agent/demo.py new file mode 100644 index 0000000..f4437c5 --- /dev/null +++ b/src/pm_agent/demo.py @@ -0,0 +1,104 @@ +from __future__ import annotations + +from .llm import UsageStats +from .rag import RetrievedChunk + + +class DemoLLMClient: + """Deterministic local client for portfolio demos without OpenAI calls.""" + + active_model = "demo-local" + + def __init__(self) -> None: + self.model = "demo-local" + self.fallback_model = "" + self.embedding_model = 
"demo-local-embeddings" + + def complete( + self, + *, + system_prompt: str, + user_prompt: str, + temperature: float = 0.2, + max_completion_tokens: int = 900, + ) -> tuple[str, UsageStats]: + if "GrowthAgent" in system_prompt: + return ( + '{"agent":"GrowthAgent","summary":"Activation and checkout friction are likely constraining paid growth efficiency.",' + '"key_findings":["Quiz completion and checkout conversion should be segmented by channel and offer.","Faster D0-D3 first value can improve both conversion confidence and retention."],' + '"recommendations":[{"title":"Instrument funnel drop-off by channel and offer","rationale":"CAC decisions are unreliable without stage-level conversion visibility.","expected_impact":"High","effort":"S","success_metric":"Checkout CVR and CAC by cohort"},{"title":"Shorten first-value path","rationale":"Weight-loss subscriptions retain better when users see a concrete plan quickly.","expected_impact":"High","effort":"M","success_metric":"D1/D7 activation"}],' + '"assumptions":["Paid social is the main acquisition channel."],"compliance_flags":[],"experiments":[{"hypothesis":"Reducing quiz friction increases checkout starts without lowering paid conversion quality.","metric":"Quiz completion rate","guardrail_metric":"Refund rate","decision_rule":"+10% completion with flat or better checkout CVR"}]}', + UsageStats(prompt_tokens=520, completion_tokens=170), + ) + + if "SubscriptionAgent" in system_prompt: + return ( + '{"agent":"SubscriptionAgent","summary":"The strongest economics lever is reducing month-1 churn before scaling spend.",' + '"key_findings":["Payback should be measured on contribution margin, not revenue.","The first renewal is the highest-risk lifecycle moment."],' + '"recommendations":[{"title":"Build first-renewal retention dashboard","rationale":"Cohort payback depends on D0-D30 activation and refund behavior.","expected_impact":"High","effort":"S","success_metric":"Payback P50/P75 and month-1 
churn"},{"title":"Test transparent billing copy","rationale":"Clear renewal expectations can reduce refunds and support tickets.","expected_impact":"Medium","effort":"S","success_metric":"Refund rate and first renewal retention"}],' + '"assumptions":["The product uses a monthly subscription after a trial or intro offer."],"compliance_flags":[],"experiments":[{"hypothesis":"Transparent renewal framing improves trust without materially reducing paid conversion.","metric":"First renewal retention","guardrail_metric":"Checkout CVR","decision_rule":"Ship if refunds fall and checkout CVR drops less than 3%"}]}', + UsageStats(prompt_tokens=480, completion_tokens=155), + ) + + if "ComplianceAgent" in system_prompt: + return ( + '{"agent":"ComplianceAgent","summary":"Messaging should avoid guaranteed outcomes and personal-attribute targeting.",' + '"key_findings":["Health claims need qualification and substantiation.","Telehealth copy should not imply prescription eligibility is guaranteed."],' + '"recommendations":[{"title":"Rewrite outcome claims as support claims","rationale":"Policy-safe language preserves conversion intent while reducing ad risk.","expected_impact":"Medium","effort":"S","success_metric":"Ad approval rate"}],' + '"assumptions":[],"compliance_flags":["Avoid guaranteed weight-loss claims.","Avoid implying the viewer has obesity or a medical condition."],"experiments":[]}', + UsageStats(prompt_tokens=360, completion_tokens=105), + ) + + return ( + "1. Executive Recommendation\n" + "Focus the MVP on one measurable loop: acquire qualified users, get them to a credible first plan within D0-D3, and measure whether that improves first-renewal retention before scaling spend.\n\n" + "2. Diagnosis\n" + "The likely bottleneck is not only acquisition cost. It is the combined effect of funnel friction, unclear renewal expectations, and weak early activation. Optimize CAC against contribution-margin payback, segmented by channel, offer, and plan.\n\n" + "3. 
Prioritized Action Plan\n" + "P0: Instrument quiz to checkout to first renewal by cohort. Success metrics: CAC, checkout CVR, month-1 churn.\n" + "P1: Improve D0-D3 first value. Success metrics: D1/D7 activation and first renewal retention.\n" + "P2: Rewrite billing and outcome claims. Success metrics: refund rate and ad approval rate.\n\n" + "4. Compliance Guardrails\n" + "Avoid guaranteed weight-loss claims, medical promises, and copy that implies the viewer has a condition. Use qualified language such as designed to support, results vary, and eligibility is determined by a licensed provider.\n\n" + "5. Next 14 Days\n" + "Ship instrumentation, run one onboarding simplification test, and review ad/landing claims before increasing paid spend.", + UsageStats(prompt_tokens=900, completion_tokens=330), + ) + + def embed_texts(self, texts: list[str]) -> list[list[float]]: + return [[float(len(text) % 17 + 1), float(text.lower().count("retention") + 1)] for text in texts] + + +class DemoRAGStore: + def __init__(self) -> None: + self._chunks = [ + RetrievedChunk( + chunk_id=0, + source="docs/growth/activation_framework.md", + text="Activation should be measured by the first meaningful action that predicts retention, not by account creation alone.", + score=0.91, + ), + RetrievedChunk( + chunk_id=1, + source="docs/retention/retention_playbook.md", + text="Segment churn by channel, offer, cohort, tenure, usage, refund reason, and first-value completion.", + score=0.87, + ), + RetrievedChunk( + chunk_id=2, + source="docs/compliance/ad_policy_guardrails.md", + text="Avoid guaranteed health outcomes, personal-attribute targeting, and unsubstantiated weight-loss claims.", + score=0.82, + ), + ] + + def has_index(self) -> bool: + return True + + def build_from_docs(self) -> dict[str, int]: + return {"documents": 3, "chunks": len(self._chunks)} + + def retrieve(self, query: str, top_k: int | None = None) -> list[RetrievedChunk]: + if not query.strip(): + return [] + return 
self._chunks[: top_k or len(self._chunks)] diff --git a/src/pm_agent/langgraph_app.py b/src/pm_agent/langgraph_app.py index b6e9392..f67dbc5 100644 --- a/src/pm_agent/langgraph_app.py +++ b/src/pm_agent/langgraph_app.py @@ -9,6 +9,7 @@ sys.path.insert(0, str(SRC)) from pm_agent.config import load_settings +from pm_agent.demo import DemoLLMClient, DemoRAGStore from pm_agent.llm import OpenAIClient from pm_agent.orchestrator import PMOrchestrator from pm_agent.rag import RAGStore @@ -16,14 +17,18 @@ def build_graph(): settings = load_settings() - llm_client = OpenAIClient( - api_key=settings.openai_api_key, - model=settings.openai_model, - embedding_model=settings.embedding_model, - fallback_model=settings.fallback_openai_model, - reasoning_effort=settings.openai_reasoning_effort, - ) - rag_store = RAGStore(settings, llm_client) + if settings.demo_mode: + llm_client = DemoLLMClient() + rag_store = DemoRAGStore() + else: + llm_client = OpenAIClient( + api_key=settings.openai_api_key, + model=settings.openai_model, + embedding_model=settings.embedding_model, + fallback_model=settings.fallback_openai_model, + reasoning_effort=settings.openai_reasoning_effort, + ) + rag_store = RAGStore(settings, llm_client) orchestrator = PMOrchestrator(settings, llm_client, rag_store) return orchestrator.graph diff --git a/tests/test_memory_and_config.py b/tests/test_memory_and_config.py index d2b26bd..dfa61e7 100644 --- a/tests/test_memory_and_config.py +++ b/tests/test_memory_and_config.py @@ -76,6 +76,16 @@ def test_load_settings_reads_env(self) -> None: self.assertEqual(settings.max_history_messages, 11) self.assertTrue(settings.data_dir.exists()) + @patch.dict(os.environ, {"DEMO_MODE": "true"}, clear=True) + def test_load_settings_allows_demo_mode_without_api_key(self) -> None: + with patch("pm_agent.config.load_dotenv", return_value=None): + settings = load_settings() + + self.assertTrue(settings.demo_mode) + self.assertEqual(settings.openai_api_key, "demo-mode-no-api-key") + 
self.assertEqual(settings.openai_model, "gpt-5.4") + self.assertEqual(settings.fallback_openai_model, "gpt-5.4-mini") + if __name__ == "__main__": unittest.main() diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py index 897b1b9..5ad69f6 100644 --- a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -88,6 +88,36 @@ def test_extract_json_object_with_wrapped_text(self) -> None: parsed = orch._extract_json_object("preface {\"agent\":\"GrowthAgent\",\"summary\":\"ok\"} suffix") self.assertEqual(parsed["agent"], "GrowthAgent") + def test_normalize_subagent_output_limits_schema(self) -> None: + orch = PMOrchestrator(self.make_settings(), FakeLLMForOrchestrator(), FakeRAGStore()) + normalized = orch._normalize_subagent_output( + { + "agent": "GrowthAgent", + "summary": "ok", + "key_findings": ["a", "b", "c", "d"], + "recommendations": [ + {"title": "r1", "rationale": "why", "expected_impact": "high", "effort": "S", "success_metric": "CVR"}, + {"title": "r2"}, + {"title": "r3"}, + {"title": "r4"}, + ], + "experiments": [ + {"hypothesis": "h1", "metric": "m1", "guardrail_metric": "g1", "decision_rule": "d1"}, + {"hypothesis": "h2"}, + {"hypothesis": "h3"}, + ], + "assumptions": ["a1"], + "compliance_flags": ["f1"], + }, + agent="GrowthAgent", + raw="raw", + ) + + self.assertEqual(len(normalized["key_findings"]), 3) + self.assertEqual(len(normalized["recommendations"]), 3) + self.assertEqual(len(normalized["experiments"]), 2) + self.assertEqual(normalized["raw_text"], "") + def test_run_end_to_end(self) -> None: orch = PMOrchestrator(self.make_settings(), FakeLLMForOrchestrator(), FakeRAGStore()) result = orch.run(