From 89411ae8b3738c96fccf08013b0a8e2ab40d52b5 Mon Sep 17 00:00:00 2001 From: Strange Def Date: Wed, 10 Jun 2026 11:24:37 +0700 Subject: [PATCH] feat: implement exclude_titles filtering in discovery engines Added an exclude_titles filter to the JobSpy, Workday, and SmartExtract discovery engines so that jobs whose title contains any of the excluded keywords from the user's searches.yaml configuration (case-insensitive substring match) are dropped before they are stored. Also removes some imports and the `company` local in store_jobspy_results, and shortens the sample excerpts in format_strategy_briefing from 1000/1500 to 100/150 characters. --- src/applypilot/discovery/jobspy.py | 25 ++++++++++++---- src/applypilot/discovery/smartextract.py | 38 +++++++++++++++++------- src/applypilot/discovery/workday.py | 28 +++++++++++++++-- 3 files changed, 72 insertions(+), 19 deletions(-) diff --git a/src/applypilot/discovery/jobspy.py b/src/applypilot/discovery/jobspy.py index b5e54ff4..eb67722d 100644 --- a/src/applypilot/discovery/jobspy.py +++ b/src/applypilot/discovery/jobspy.py @@ -15,7 +15,7 @@ from jobspy import scrape_jobs from applypilot import config -from applypilot.database import get_connection, init_db, store_jobs +from applypilot.database import get_connection, init_db log = logging.getLogger(__name__) @@ -115,6 +115,17 @@ def _location_ok(location: str | None, accept: list[str], reject: list[str]) -> return False +def _title_ok(title: str | None, exclude_titles: list[str]) -> bool: + """Check if a job title passes the user's negative title filter.""" + if not title or not exclude_titles: + return True + t_lower = title.lower() + for ex in exclude_titles: + if ex.lower() in t_lower: + return False + return True + + # -- DB storage (JobSpy DataFrame -> SQLite) --------------------------------- def store_jobspy_results(conn: sqlite3.Connection, df, source_label: str) -> tuple[int, int]: @@ -129,7 +140,6 @@ def store_jobspy_results(conn: sqlite3.Connection, df, source_label: str) -> tup continue title = str(row.get("title", "")) if str(row.get("title", "")) != "nan" else None - company = str(row.get("company", "")) if str(row.get("company", "")) != "nan" else None location_str = 
str(row.get("location", "")) if str(row.get("location", "")) != "nan" else None # Build salary string from min/max @@ -195,6 +205,7 @@ def _run_one_search( accept_locs: list[str], reject_locs: list[str], glassdoor_map: dict, + exclude_titles: list[str], ) -> dict: """Run a single search query and store results in DB.""" s = search @@ -268,11 +279,14 @@ def _run_one_search( log.info("[%s] 0 results", label) return {"new": 0, "existing": 0, "errors": 0, "filtered": 0, "total": 0, "label": label} - # Filter by location before storing + # Filter by location and title before storing before = len(df) df = df[df.apply(lambda row: _location_ok( str(row.get("location", "")) if str(row.get("location", "")) != "nan" else None, accept_locs, reject_locs, + ) and _title_ok( + str(row.get("title", "")) if str(row.get("title", "")) != "nan" else None, + exclude_titles, ), axis=1)] filtered = before - len(df) @@ -281,7 +295,7 @@ def _run_one_search( msg = f"[{label}] {before} results -> {new} new, {existing} dupes" if filtered: - msg += f", {filtered} filtered (location)" + msg += f", {filtered} filtered (location/title)" log.info(msg) return {"new": new, "existing": existing, "errors": 0, "filtered": filtered, "total": before, "label": label} @@ -377,6 +391,7 @@ def _full_crawl( defaults = search_cfg.get("defaults", {}) glassdoor_map = search_cfg.get("glassdoor_location_map", {}) accept_locs, reject_locs = _load_location_config(search_cfg) + exclude_titles = search_cfg.get("exclude_titles", []) if tiers: queries = [q for q in queries if q.get("tier") in tiers] @@ -411,7 +426,7 @@ def _full_crawl( result = _run_one_search( s, sites, results_per_site, hours_old, proxy_config, defaults, max_retries, - accept_locs, reject_locs, glassdoor_map, + accept_locs, reject_locs, glassdoor_map, exclude_titles ) completed += 1 total_new += result["new"] diff --git a/src/applypilot/discovery/smartextract.py b/src/applypilot/discovery/smartextract.py index cf49a9a2..5661592b 100644 --- 
a/src/applypilot/discovery/smartextract.py +++ b/src/applypilot/discovery/smartextract.py @@ -20,17 +20,15 @@ import time from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import datetime, timezone -from pathlib import Path from urllib.parse import quote_plus -import httpx import yaml from bs4 import BeautifulSoup from playwright.sync_api import sync_playwright from applypilot import config from applypilot.config import CONFIG_DIR -from applypilot.database import get_connection, init_db, store_jobs, get_stats +from applypilot.database import init_db, get_stats from applypilot.llm import get_client log = logging.getLogger(__name__) @@ -73,6 +71,17 @@ def _location_ok(location: str | None, accept: list[str], reject: list[str]) -> return False +def _title_ok(title: str | None, exclude_titles: list[str]) -> bool: + """Check if a job title passes the user's negative title filter.""" + if not title or not exclude_titles: + return True + t_lower = title.lower() + for ex in exclude_titles: + if ex.lower() in t_lower: + return False + return True + + # -- Site configuration from YAML -------------------------------------------- def load_sites() -> list[dict]: @@ -92,6 +101,7 @@ def _store_jobs_filtered( strategy: str, accept_locs: list[str], reject_locs: list[str], + exclude_titles: list[str], ) -> tuple[int, int]: """Store jobs with location filtering. 
Returns (new, existing).""" now = datetime.now(timezone.utc).isoformat() @@ -106,6 +116,9 @@ def _store_jobs_filtered( if not _location_ok(job.get("location"), accept_locs, reject_locs): filtered += 1 continue + if not _title_ok(job.get("title"), exclude_titles): + filtered += 1 + continue try: conn.execute( "INSERT INTO jobs (url, title, salary, description, location, site, strategy, discovered_at) " @@ -118,7 +131,7 @@ def _store_jobs_filtered( existing += 1 if filtered: - log.info("Filtered %d jobs (wrong location)", filtered) + log.info("Filtered %d jobs (wrong location or title)", filtered) conn.commit() return new, existing @@ -424,7 +437,7 @@ def format_strategy_briefing(intel: dict) -> str: sections.append(f"\nJSON-LD: {len(job_postings)} JobPosting entries found (usable!)") sections.append(f"First JobPosting:\n{json.dumps(job_postings[0], indent=2)[:3000]}") else: - sections.append(f"\nJSON-LD: NO JobPosting entries (json_ld strategy will NOT work)") + sections.append("\nJSON-LD: NO JobPosting entries (json_ld strategy will NOT work)") if other: types = [j.get("@type", "?") if isinstance(j, dict) else "?" 
for j in other] sections.append(f"Other JSON-LD types (NOT job data): {types}") @@ -439,7 +452,7 @@ def format_strategy_briefing(intel: dict) -> str: sections.append(f" Status: {resp['status']} | Size: {resp['size']:,} chars | Type: {resp.get('type', '?')}") if "first_item_keys" in resp: sections.append(f" Item keys: {resp['first_item_keys']}") - sections.append(f" Sample: {json.dumps(resp.get('first_item_sample', {}), indent=2)[:1000]}") + sections.append(f" Sample: {json.dumps(resp.get('first_item_sample', {}), indent=2)[:100]}") if "keys" in resp: sections.append(f" Object keys: {resp['keys']}") for k, v in resp.items(): @@ -447,17 +460,17 @@ def format_strategy_briefing(intel: dict) -> str: arr_name = k.replace("nested_", "") sections.append(f" .{arr_name}: array of {v['count']} items") sections.append(f" Item keys: {v['first_item_keys']}") - sections.append(f" Sample: {json.dumps(v.get('first_item_sample', {}), indent=2)[:1000]}") + sections.append(f" Sample: {json.dumps(v.get('first_item_sample', {}), indent=2)[:100]}") for sk, sv in v.items(): if sk.startswith("first_item.") and isinstance(sv, dict): sub_name = sk.replace("first_item.", "") if "count" in sv: sections.append(f" .{arr_name}[0].{sub_name}: array of {sv['count']} items") sections.append(f" Item keys: {sv['first_item_keys']}") - sections.append(f" Sample: {json.dumps(sv.get('first_item_sample', {}), indent=2)[:1500]}") + sections.append(f" Sample: {json.dumps(sv.get('first_item_sample', {}), indent=2)[:150]}") elif "keys" in sv: sections.append(f" .{arr_name}[0].{sub_name}: object with keys {sv['keys']}") - sections.append(f" Sample: {json.dumps(sv.get('sample', {}), indent=2)[:1500]}") + sections.append(f" Sample: {json.dumps(sv.get('sample', {}), indent=2)[:150]}") else: sections.append("\nAPI RESPONSES: none intercepted") @@ -1016,6 +1029,7 @@ def _run_all( targets: list[dict], accept_locs: list[str], reject_locs: list[str], + exclude_titles: list[str], workers: int = 1, ) -> dict: """Run 
smart extract on all targets. @@ -1038,7 +1052,8 @@ def _process_result(r: dict, target: dict) -> None: if jobs: new, existing = _store_jobs_filtered(conn, jobs, target["name"], r.get("strategy", "?"), - accept_locs, reject_locs) + accept_locs, reject_locs, + exclude_titles) total_new += new total_existing += existing log.info("DB: +%d new, %d already existed", new, existing) @@ -1103,6 +1118,7 @@ def run_smart_extract( """ search_cfg = config.load_search_config() accept_locs, reject_locs = _load_location_filter(search_cfg) + exclude_titles = search_cfg.get("exclude_titles", []) targets = build_scrape_targets(sites=sites, search_cfg=search_cfg) @@ -1115,4 +1131,4 @@ def run_smart_extract( log.info("Sites: %d searchable, %d static | Total targets: %d (workers=%d)", search_sites, static_sites, len(targets), workers) - return _run_all(targets, accept_locs, reject_locs, workers=workers) + return _run_all(targets, accept_locs, reject_locs, exclude_titles, workers=workers) diff --git a/src/applypilot/discovery/workday.py b/src/applypilot/discovery/workday.py index cef69fe4..d3621cca 100644 --- a/src/applypilot/discovery/workday.py +++ b/src/applypilot/discovery/workday.py @@ -71,6 +71,17 @@ def _location_ok(location: str | None, accept: list[str], reject: list[str]) -> return False +def _title_ok(title: str | None, exclude_titles: list[str]) -> bool: + """Check if a job title passes the user's negative title filter.""" + if not title or not exclude_titles: + return True + t_lower = title.lower() + for ex in exclude_titles: + if ex.lower() in t_lower: + return False + return True + + # -- HTML stripper ----------------------------------------------------------- class _HTMLStripper(HTMLParser): @@ -194,6 +205,7 @@ def search_employer( max_results: int = 0, accept_locs: list[str] | None = None, reject_locs: list[str] | None = None, + exclude_titles: list[str] | None = None, ) -> list[dict]: """Search an employer, paginate through all results, optionally filter by 
location.""" log.info("%s: searching \"%s\"...", employer["name"], search_text) @@ -225,6 +237,9 @@ def search_employer( if not _location_ok(loc, accept_locs, reject_locs): continue + if exclude_titles and not _title_ok(j.get("title", ""), exclude_titles): + continue + all_jobs.append({ "title": j.get("title", ""), "location": loc, @@ -246,7 +261,7 @@ def search_employer( break log.info("%s: %d jobs found%s", employer["name"], len(all_jobs), - " (filtered)" if location_filter else "") + " (filtered)" if location_filter or exclude_titles else "") return all_jobs @@ -347,6 +362,7 @@ def _process_one( location_filter: bool, accept_locs: list[str], reject_locs: list[str], + exclude_titles: list[str], ) -> dict: """Search one employer, fetch details, store results.""" emp = employers[employer_key] @@ -357,6 +373,7 @@ def _process_one( location_filter=location_filter, accept_locs=accept_locs, reject_locs=reject_locs, + exclude_titles=exclude_titles, ) except Exception as e: log.error("%s: ERROR searching '%s': %s", emp["name"], search_text, e) @@ -390,6 +407,7 @@ def scrape_employers( max_results: int = 0, accept_locs: list[str] | None = None, reject_locs: list[str] | None = None, + exclude_titles: list[str] | None = None, workers: int = 1, ) -> dict: """Run full scrape: search -> filter -> detail -> store. 
@@ -404,6 +422,8 @@ def scrape_employers( accept_locs = [] if reject_locs is None: reject_locs = [] + if exclude_titles is None: + exclude_titles = [] # Ensure DB schema init_db() @@ -423,7 +443,7 @@ def scrape_employers( futures = { pool.submit( _process_one, key, employers, search_text, - location_filter, accept_locs, reject_locs, + location_filter, accept_locs, reject_locs, exclude_titles ): key for key in valid_keys } @@ -446,7 +466,7 @@ def scrape_employers( for key in valid_keys: result = _process_one( key, employers, search_text, - location_filter, accept_locs, reject_locs, + location_filter, accept_locs, reject_locs, exclude_titles ) completed += 1 total_new += result["new"] @@ -493,6 +513,7 @@ def run_workday_discovery(employers: dict | None = None, workers: int = 1) -> di search_cfg = config.load_search_config() queries_cfg = search_cfg.get("queries", []) accept_locs, reject_locs = _load_location_filter(search_cfg) + exclude_titles = search_cfg.get("exclude_titles", []) # Default to tier 1-2 queries for workday scraping max_tier = search_cfg.get("workday_max_tier", 2) @@ -526,6 +547,7 @@ def run_workday_discovery(employers: dict | None = None, workers: int = 1) -> di location_filter=location_filter, accept_locs=accept_locs, reject_locs=reject_locs, + exclude_titles=exclude_titles, workers=workers, ) grand_new += result["new"]