From 89411ae8b3738c96fccf08013b0a8e2ab40d52b5 Mon Sep 17 00:00:00 2001
From: Strange Def <strangedef@gmail.com>
Date: Wed, 10 Jun 2026 11:24:37 +0700
Subject: [PATCH] feat: implement exclude_titles filtering in discovery engines

Added an `exclude_titles` filter across the JobSpy, Workday, and SmartExtract
discovery engines to filter out jobs whose titles contain any of the excluded
title keywords specified in the user's searches.yaml configuration.
---
 src/applypilot/discovery/jobspy.py       | 25 ++++++++++++----
 src/applypilot/discovery/smartextract.py | 38 +++++++++++++++++-------
 src/applypilot/discovery/workday.py      | 28 +++++++++++++++--
 3 files changed, 72 insertions(+), 19 deletions(-)

diff --git a/src/applypilot/discovery/jobspy.py b/src/applypilot/discovery/jobspy.py
index b5e54ff4..eb67722d 100644
--- a/src/applypilot/discovery/jobspy.py
+++ b/src/applypilot/discovery/jobspy.py
@@ -15,7 +15,7 @@
 from jobspy import scrape_jobs
 
 from applypilot import config
-from applypilot.database import get_connection, init_db, store_jobs
+from applypilot.database import get_connection, init_db
 
 log = logging.getLogger(__name__)
 
@@ -115,6 +115,17 @@ def _location_ok(location: str | None, accept: list[str], reject: list[str]) ->
     return False
 
 
+def _title_ok(title: str | None, exclude_titles: list[str]) -> bool:
+    """Return False when *title* contains any excluded keyword (case-insensitive)."""
+    if not title or not exclude_titles:
+        return True  # nothing to match against, so the job is kept
+    t_lower = title.lower()
+    # Blank/None entries are skipped: "" is a substring of every title and would
+    # otherwise reject all jobs; str() tolerates non-string config values.
+    needles = (str(ex).lower() for ex in exclude_titles if ex is not None and str(ex).strip())
+    return not any(n in t_lower for n in needles)
+
+
 # -- DB storage (JobSpy DataFrame -> SQLite) ---------------------------------
 
 def store_jobspy_results(conn: sqlite3.Connection, df, source_label: str) -> tuple[int, int]:
@@ -129,7 +140,6 @@ def store_jobspy_results(conn: sqlite3.Connection, df, source_label: str) -> tup
             continue
 
         title = str(row.get("title", "")) if str(row.get("title", "")) != "nan" else None
-        company = str(row.get("company", "")) if str(row.get("company", "")) != "nan" else None
         location_str = str(row.get("location", "")) if str(row.get("location", "")) != "nan" else None
 
         # Build salary string from min/max
@@ -195,6 +205,7 @@ def _run_one_search(
     accept_locs: list[str],
     reject_locs: list[str],
     glassdoor_map: dict,
+    exclude_titles: list[str],
 ) -> dict:
     """Run a single search query and store results in DB."""
     s = search
@@ -268,11 +279,14 @@ def _run_one_search(
         log.info("[%s] 0 results", label)
         return {"new": 0, "existing": 0, "errors": 0, "filtered": 0, "total": 0, "label": label}
 
-    # Filter by location before storing
+    # Filter by location and title before storing
     before = len(df)
     df = df[df.apply(lambda row: _location_ok(
         str(row.get("location", "")) if str(row.get("location", "")) != "nan" else None,
         accept_locs, reject_locs,
+    ) and _title_ok(
+        str(row.get("title", "")) if str(row.get("title", "")) != "nan" else None,
+        exclude_titles,
     ), axis=1)]
     filtered = before - len(df)
 
@@ -281,7 +295,7 @@ def _run_one_search(
 
     msg = f"[{label}] {before} results -> {new} new, {existing} dupes"
     if filtered:
-        msg += f", {filtered} filtered (location)"
+        msg += f", {filtered} filtered (location/title)"
     log.info(msg)
 
     return {"new": new, "existing": existing, "errors": 0, "filtered": filtered, "total": before, "label": label}
@@ -377,6 +391,7 @@ def _full_crawl(
     defaults = search_cfg.get("defaults", {})
     glassdoor_map = search_cfg.get("glassdoor_location_map", {})
     accept_locs, reject_locs = _load_location_config(search_cfg)
+    exclude_titles = search_cfg.get("exclude_titles", [])
 
     if tiers:
         queries = [q for q in queries if q.get("tier") in tiers]
@@ -411,7 +426,7 @@ def _full_crawl(
         result = _run_one_search(
             s, sites, results_per_site, hours_old,
             proxy_config, defaults, max_retries,
-            accept_locs, reject_locs, glassdoor_map,
+            accept_locs, reject_locs, glassdoor_map, exclude_titles
         )
         completed += 1
         total_new += result["new"]
diff --git a/src/applypilot/discovery/smartextract.py b/src/applypilot/discovery/smartextract.py
index cf49a9a2..5661592b 100644
--- a/src/applypilot/discovery/smartextract.py
+++ b/src/applypilot/discovery/smartextract.py
@@ -20,17 +20,15 @@
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime, timezone
-from pathlib import Path
 from urllib.parse import quote_plus
 
-import httpx
 import yaml
 from bs4 import BeautifulSoup
 from playwright.sync_api import sync_playwright
 
 from applypilot import config
 from applypilot.config import CONFIG_DIR
-from applypilot.database import get_connection, init_db, store_jobs, get_stats
+from applypilot.database import init_db, get_stats
 from applypilot.llm import get_client
 
 log = logging.getLogger(__name__)
@@ -73,6 +71,17 @@ def _location_ok(location: str | None, accept: list[str], reject: list[str]) ->
     return False
 
 
+def _title_ok(title: str | None, exclude_titles: list[str]) -> bool:
+    """Return False when *title* contains any excluded keyword (case-insensitive)."""
+    if not title or not exclude_titles:
+        return True  # nothing to match against, so the job is kept
+    t_lower = title.lower()
+    # Blank/None entries are skipped: "" is a substring of every title and would
+    # otherwise reject all jobs; str() tolerates non-string config values.
+    needles = (str(ex).lower() for ex in exclude_titles if ex is not None and str(ex).strip())
+    return not any(n in t_lower for n in needles)
+
+
 # -- Site configuration from YAML --------------------------------------------
 
 def load_sites() -> list[dict]:
@@ -92,6 +101,7 @@ def _store_jobs_filtered(
     strategy: str,
     accept_locs: list[str],
     reject_locs: list[str],
+    exclude_titles: list[str],
 ) -> tuple[int, int]:
     """Store jobs with location filtering. Returns (new, existing)."""
     now = datetime.now(timezone.utc).isoformat()
@@ -106,6 +116,9 @@ def _store_jobs_filtered(
         if not _location_ok(job.get("location"), accept_locs, reject_locs):
             filtered += 1
             continue
+        if not _title_ok(job.get("title"), exclude_titles):
+            filtered += 1
+            continue
         try:
             conn.execute(
                 "INSERT INTO jobs (url, title, salary, description, location, site, strategy, discovered_at) "
@@ -118,7 +131,7 @@ def _store_jobs_filtered(
             existing += 1
 
     if filtered:
-        log.info("Filtered %d jobs (wrong location)", filtered)
+        log.info("Filtered %d jobs (wrong location or title)", filtered)
     conn.commit()
     return new, existing
 
@@ -424,7 +437,7 @@ def format_strategy_briefing(intel: dict) -> str:
             sections.append(f"\nJSON-LD: {len(job_postings)} JobPosting entries found (usable!)")
             sections.append(f"First JobPosting:\n{json.dumps(job_postings[0], indent=2)[:3000]}")
         else:
-            sections.append(f"\nJSON-LD: NO JobPosting entries (json_ld strategy will NOT work)")
+            sections.append("\nJSON-LD: NO JobPosting entries (json_ld strategy will NOT work)")
         if other:
             types = [j.get("@type", "?") if isinstance(j, dict) else "?" for j in other]
             sections.append(f"Other JSON-LD types (NOT job data): {types}")
@@ -439,7 +452,7 @@ def format_strategy_briefing(intel: dict) -> str:
             sections.append(f"  Status: {resp['status']} | Size: {resp['size']:,} chars | Type: {resp.get('type', '?')}")
             if "first_item_keys" in resp:
                 sections.append(f"  Item keys: {resp['first_item_keys']}")
-                sections.append(f"  Sample: {json.dumps(resp.get('first_item_sample', {}), indent=2)[:1000]}")
+                sections.append(f"  Sample: {json.dumps(resp.get('first_item_sample', {}), indent=2)[:100]}")
             if "keys" in resp:
                 sections.append(f"  Object keys: {resp['keys']}")
             for k, v in resp.items():
@@ -447,17 +460,17 @@ def format_strategy_briefing(intel: dict) -> str:
                     arr_name = k.replace("nested_", "")
                     sections.append(f"  .{arr_name}: array of {v['count']} items")
                     sections.append(f"    Item keys: {v['first_item_keys']}")
-                    sections.append(f"    Sample: {json.dumps(v.get('first_item_sample', {}), indent=2)[:1000]}")
+                    sections.append(f"    Sample: {json.dumps(v.get('first_item_sample', {}), indent=2)[:100]}")
                     for sk, sv in v.items():
                         if sk.startswith("first_item.") and isinstance(sv, dict):
                             sub_name = sk.replace("first_item.", "")
                             if "count" in sv:
                                 sections.append(f"    .{arr_name}[0].{sub_name}: array of {sv['count']} items")
                                 sections.append(f"      Item keys: {sv['first_item_keys']}")
-                                sections.append(f"      Sample: {json.dumps(sv.get('first_item_sample', {}), indent=2)[:1500]}")
+                                sections.append(f"      Sample: {json.dumps(sv.get('first_item_sample', {}), indent=2)[:150]}")
                             elif "keys" in sv:
                                 sections.append(f"    .{arr_name}[0].{sub_name}: object with keys {sv['keys']}")
-                                sections.append(f"      Sample: {json.dumps(sv.get('sample', {}), indent=2)[:1500]}")
+                                sections.append(f"      Sample: {json.dumps(sv.get('sample', {}), indent=2)[:150]}")
     else:
         sections.append("\nAPI RESPONSES: none intercepted")
 
@@ -1016,6 +1029,7 @@ def _run_all(
     targets: list[dict],
     accept_locs: list[str],
     reject_locs: list[str],
+    exclude_titles: list[str],
     workers: int = 1,
 ) -> dict:
     """Run smart extract on all targets.
@@ -1038,7 +1052,8 @@ def _process_result(r: dict, target: dict) -> None:
         if jobs:
             new, existing = _store_jobs_filtered(conn, jobs, target["name"],
                                                   r.get("strategy", "?"),
-                                                  accept_locs, reject_locs)
+                                                  accept_locs, reject_locs,
+                                                  exclude_titles)
             total_new += new
             total_existing += existing
             log.info("DB: +%d new, %d already existed", new, existing)
@@ -1103,6 +1118,7 @@ def run_smart_extract(
     """
     search_cfg = config.load_search_config()
     accept_locs, reject_locs = _load_location_filter(search_cfg)
+    exclude_titles = search_cfg.get("exclude_titles", [])
 
     targets = build_scrape_targets(sites=sites, search_cfg=search_cfg)
 
@@ -1115,4 +1131,4 @@ def run_smart_extract(
     log.info("Sites: %d searchable, %d static | Total targets: %d (workers=%d)",
              search_sites, static_sites, len(targets), workers)
 
-    return _run_all(targets, accept_locs, reject_locs, workers=workers)
+    return _run_all(targets, accept_locs, reject_locs, exclude_titles, workers=workers)
diff --git a/src/applypilot/discovery/workday.py b/src/applypilot/discovery/workday.py
index cef69fe4..d3621cca 100644
--- a/src/applypilot/discovery/workday.py
+++ b/src/applypilot/discovery/workday.py
@@ -71,6 +71,17 @@ def _location_ok(location: str | None, accept: list[str], reject: list[str]) ->
     return False
 
 
+def _title_ok(title: str | None, exclude_titles: list[str]) -> bool:
+    """Return False when *title* contains any excluded keyword (case-insensitive)."""
+    if not title or not exclude_titles:
+        return True  # nothing to match against, so the job is kept
+    t_lower = title.lower()
+    # Blank/None entries are skipped: "" is a substring of every title and would
+    # otherwise reject all jobs; str() tolerates non-string config values.
+    needles = (str(ex).lower() for ex in exclude_titles if ex is not None and str(ex).strip())
+    return not any(n in t_lower for n in needles)
+
+
 # -- HTML stripper -----------------------------------------------------------
 
 class _HTMLStripper(HTMLParser):
@@ -194,6 +205,7 @@ def search_employer(
     max_results: int = 0,
     accept_locs: list[str] | None = None,
     reject_locs: list[str] | None = None,
+    exclude_titles: list[str] | None = None,
 ) -> list[dict]:
     """Search an employer, paginate through all results, optionally filter by location."""
     log.info("%s: searching \"%s\"...", employer["name"], search_text)
@@ -225,6 +237,9 @@ def search_employer(
                 if not _location_ok(loc, accept_locs, reject_locs):
                     continue
 
+            if exclude_titles and not _title_ok(j.get("title", ""), exclude_titles):
+                continue
+
             all_jobs.append({
                 "title": j.get("title", ""),
                 "location": loc,
@@ -246,7 +261,7 @@ def search_employer(
             break
 
     log.info("%s: %d jobs found%s", employer["name"], len(all_jobs),
-             " (filtered)" if location_filter else "")
+             " (filtered)" if location_filter or exclude_titles else "")
     return all_jobs
 
 
@@ -347,6 +362,7 @@ def _process_one(
     location_filter: bool,
     accept_locs: list[str],
     reject_locs: list[str],
+    exclude_titles: list[str],
 ) -> dict:
     """Search one employer, fetch details, store results."""
     emp = employers[employer_key]
@@ -357,6 +373,7 @@ def _process_one(
             location_filter=location_filter,
             accept_locs=accept_locs,
             reject_locs=reject_locs,
+            exclude_titles=exclude_titles,
         )
     except Exception as e:
         log.error("%s: ERROR searching '%s': %s", emp["name"], search_text, e)
@@ -390,6 +407,7 @@ def scrape_employers(
     max_results: int = 0,
     accept_locs: list[str] | None = None,
     reject_locs: list[str] | None = None,
+    exclude_titles: list[str] | None = None,
     workers: int = 1,
 ) -> dict:
     """Run full scrape: search -> filter -> detail -> store.
@@ -404,6 +422,8 @@ def scrape_employers(
         accept_locs = []
     if reject_locs is None:
         reject_locs = []
+    if exclude_titles is None:
+        exclude_titles = []
 
     # Ensure DB schema
     init_db()
@@ -423,7 +443,7 @@ def scrape_employers(
             futures = {
                 pool.submit(
                     _process_one, key, employers, search_text,
-                    location_filter, accept_locs, reject_locs,
+                    location_filter, accept_locs, reject_locs, exclude_titles
                 ): key
                 for key in valid_keys
             }
@@ -446,7 +466,7 @@ def scrape_employers(
         for key in valid_keys:
             result = _process_one(
                 key, employers, search_text,
-                location_filter, accept_locs, reject_locs,
+                location_filter, accept_locs, reject_locs, exclude_titles
             )
             completed += 1
             total_new += result["new"]
@@ -493,6 +513,7 @@ def run_workday_discovery(employers: dict | None = None, workers: int = 1) -> di
     search_cfg = config.load_search_config()
     queries_cfg = search_cfg.get("queries", [])
     accept_locs, reject_locs = _load_location_filter(search_cfg)
+    exclude_titles = search_cfg.get("exclude_titles", [])
 
     # Default to tier 1-2 queries for workday scraping
     max_tier = search_cfg.get("workday_max_tier", 2)
@@ -526,6 +547,7 @@ def run_workday_discovery(employers: dict | None = None, workers: int = 1) -> di
             location_filter=location_filter,
             accept_locs=accept_locs,
             reject_locs=reject_locs,
+            exclude_titles=exclude_titles,
             workers=workers,
         )
         grand_new += result["new"]