-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathproperties_scraper.py
More file actions
1032 lines (901 loc) · 42.7 KB
/
properties_scraper.py
File metadata and controls
1032 lines (901 loc) · 42.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
"""BC cheap property scraper — targets hidden/obscure listings under $30k.
Sources covered (in order of obscurity):
1. Craigslist BC regions (real estate by owner — cheap section)
2. Kijiji BC (land for sale)
3. UsedEverywhere BC (smaller, regional)
4. Facebook Marketplace (requires manual cookie — skipped by default)
5. BC Government Crown Land (rare sales/leases)
6. Tax sale properties (municipal tax sales — hidden gems)
7. Estate sale listings (smaller newspapers)
8. Abandoned/forfeited mineral claims that include surface rights
The philosophy: popular listings on REALTOR.ca, Zillow, Point2Homes are picked
over within hours. The real bargains are on obscure regional sites, tax sales,
and by-owner ads in small-town classifieds.
"""
import hashlib
import json
import re
import sys
import time
from urllib.parse import urljoin, urlparse
try:
import requests
from bs4 import BeautifulSoup
except ImportError:
print("Missing deps: pip install requests beautifulsoup4", file=sys.stderr)
sys.exit(1)
import db
# Browser-like request headers — classifieds sites commonly reject the
# default python-requests User-Agent, so we present as desktop Chrome.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept-Language": "en-CA,en;q=0.9",
}
# Per-request HTTP timeout (seconds) used by every fetch in this module.
TIMEOUT = 20
# Hard price ceiling in CAD — listings parsed above this are filtered out.
MAX_PRICE = 30000
# BC regions — each has its own craigslist + used.ca subdomain
BC_REGIONS = [
    # (region_key, display_name, craigslist_subdomain, used_subdomain)
    # used_subdomain is None where there is no UsedEverywhere site to hit.
    ("vancouver", "Vancouver/Lower Mainland", "vancouver", "usedvancouver"),
    ("victoria", "Vancouver Island South", "victoria", "usedvictoria"),
    ("nanaimo", "Vancouver Island Central", "nanaimo", "usednanaimo"),
    ("comoxvalley", "Vancouver Island North", "comoxvalley", "usedcomoxvalley"),
    ("kamloops", "Kamloops/Thompson", "kamloops", "usedkamloops"),
    ("kelowna", "Okanagan", "kelowna", "usedkelowna"),
    ("kootenays", "Kootenays", "kootenays", "usedkootenays"),
    ("cariboo", "Cariboo", "cariboo", None),
    ("princegeorge", "Northern BC", "princegeorge", None),
    ("skeena", "Skeena/NW BC", "skeena", None),
    ("sunshine", "Sunshine Coast", "sunshine", None),
    ("whistler", "Sea to Sky", "whistler", None),
]
# Keywords that suggest a HIDDEN deal (not listed on MLS)
# Counted by _score_listing (capped bonus) and _is_hidden (>= 2 hits).
HIDDEN_KEYWORDS = [
    "by owner", "fsbo", "no realtor", "private sale", "cash only",
    "needs work", "fixer", "as is", "tear down", "motivated seller",
    "estate sale", "tax sale", "recreational", "off grid", "off-grid",
    "remote", "crown land", "bush lot", "hunting", "trapper", "cabin",
    "shack", "hermit", "acreage", "raw land", "unimproved"
]
# Keywords that PENALIZE (commercial/scam indicators)
BAD_KEYWORDS = [
    "rent to own", "lease to own", "timeshare", "investment opportunity",
    "multiple units available", "financing available",
]
# Rental / non-freehold / developed indicators — REJECT outright
REJECT_KEYWORDS = [
    "for rent", "per month", "monthly", "/month", "/mo", "rental",
    "for lease", "tenancy", "tenant", "bedroom", "bedrooms", "bed",
    "beds", "bdrm", "bdrms", "bath", "baths", "condo", "apartment",
    "suite", "basement", "townhouse", "duplex", "roommate", "sublet",
    "furnished", "strata", "leasehold", "room for", "room for rent",
    "mobile home park", "pad rental",
]
# Positive land keywords — must match at least one for sale-type scrapes
LAND_KEYWORDS = [
    "land", "lot", "acre", "acreage", "parcel", "raw land", "vacant",
    "bush", "timber", "recreational", "rural", "off grid", "off-grid",
    "cabin", "homestead", "wilderness", "hunting", "crown", "unimproved",
]
def _word_hit(text: str, keywords: list) -> bool:
"""Match keywords on word boundaries — avoids 'Parkland' → 'land'."""
t = (text or "").lower()
for k in keywords:
# multi-word phrases: plain substring is fine (they already contain a space)
if " " in k or "/" in k or "-" in k:
if k in t:
return True
continue
# single words: require word boundary
if re.search(rf"\b{re.escape(k)}\b", t):
return True
return False
def _is_rental_or_developed(text: str) -> bool:
    """Return True when the text reads like a rental ad or a developed
    residential unit (condo/suite/etc.) rather than bare freehold land."""
    rejected = _word_hit(text, REJECT_KEYWORDS)
    return rejected
def _is_land(text: str) -> bool:
    """Return True when the text carries at least one raw-land indicator."""
    return bool(_word_hit(text, LAND_KEYWORDS))
def _looks_like_bc(text: str, url: str = "") -> bool:
"""Guard against cross-province Kijiji 'related ads' bleeding into results."""
blob = f"{text} {url}".lower()
# Hard-reject obvious other-province markers
other = [
"newfoundland", "labrador", "nova scotia", "new brunswick", "quebec",
"ontario", "manitoba", "saskatchewan", "alberta", "pei", "yukon",
"nunavut", "northwest territories",
"/st-johns/", "/halifax/", "/toronto/", "/calgary/", "/edmonton/",
"/winnipeg/", "/regina/", "/montreal/", "/ottawa/",
]
if any(m in blob for m in other):
return False
return True
# Keyword table for _classify_type(). Buckets are tried in declaration
# order and the first bucket with a hit wins — e.g. a "cabin on 5 acres"
# ad classifies as "land" because the land keywords are checked first.
PROPERTY_TYPES = {
    "land": ["acres", "acre", "lot", "land", "parcel", "raw land"],
    "cabin": ["cabin", "shack", "hut", "cottage"],
    "mobile": ["mobile", "manufactured", "trailer", "rv"],
    "house": ["house", "home", "bungalow", "bedroom"],
    "recreational": ["hunting", "fishing", "recreational", "rec"],
}
def _parse_price(text: str) -> int | None:
"""Extract first dollar amount from text."""
if not text:
return None
# Match $X,XXX or $XXXXX
m = re.search(r'\$\s*([\d,]+)', text)
if m:
try:
return int(m.group(1).replace(",", ""))
except ValueError:
return None
return None
def _parse_acres(text: str) -> float | None:
if not text:
return None
m = re.search(r'(\d+(?:\.\d+)?)\s*ac(?:re)?s?', text, re.IGNORECASE)
if m:
try:
return float(m.group(1))
except ValueError:
return None
return None
def _classify_type(text: str) -> str:
    """Best-effort property-type bucket for listing text.

    Buckets from PROPERTY_TYPES are tried in declaration order; the first
    one with a hit wins, else 'other'. Single-word keywords are matched on
    word boundaries — the previous substring test let 'rv' fire inside
    'harvest'/'carved' and 'rec' inside 'recent'. Multi-word keywords
    ('raw land') are matched as plain substrings, mirroring _word_hit().
    """
    t = (text or "").lower()
    for ptype, keywords in PROPERTY_TYPES.items():
        for k in keywords:
            if " " in k:
                if k in t:
                    return ptype
            elif re.search(rf"\b{re.escape(k)}\b", t):
                return ptype
    return "other"
def _score_listing(data: dict) -> float:
    """Score 0-100 for how attractive this listing is.

    Starts at 50; low price and real acreage add, commercial/scam markers
    subtract, and the result is clamped to [0, 100]. A missing price is
    treated as 999999 so it earns no price bonus.
    """
    price = data.get("price") or 999999
    text = f"{data.get('title','')} {data.get('description','')}".lower()
    score = 50.0

    # Price tiers — cheaper is better; first ceiling that fits wins.
    for ceiling, bonus in ((5000, 30), (10000, 25), (15000, 18),
                           (20000, 12), (25000, 6)):
        if price <= ceiling:
            score += bonus
            break

    # Hidden-listing indicators, capped at +20.
    hidden_hits = sum(1 for k in HIDDEN_KEYWORDS if k in text)
    score += min(hidden_hits * 4, 20)

    # Acreage tiers — land with acreage is gold; first floor reached wins.
    acres = data.get("size_acres") or 0
    for floor, bonus in ((20, 15), (10, 10), (5, 6), (1, 3)):
        if acres >= floor:
            score += bonus
            break

    # Commercial/scam markers carry a heavy penalty each.
    score -= 15 * sum(1 for k in BAD_KEYWORDS if k in text)

    # Too-good-to-be-true: sub-$500 asks are probably scams or typos.
    if price and price < 500:
        score -= 20

    return max(0.0, min(100.0, score))
def _is_hidden(data: dict) -> bool:
    """True when the listing shows two or more by-owner/obscure markers."""
    blob = f"{data.get('title','')} {data.get('description','')}".lower()
    matches = [k for k in HIDDEN_KEYWORDS if k in blob]
    return len(matches) >= 2
# ── Craigslist scraper ────────────────────────────────
def scrape_craigslist(region_key: str, subdomain: str, max_price: int = MAX_PRICE) -> list[dict]:
    """Scrape craigslist real estate for sale (land only) for a BC region.
    Categories we hit (sale only, never rentals):
      rea = real estate for sale — all
      reo = real estate — by owner
    We then filter for land/acreage keywords and reject anything that smells
    like a rental, condo, suite, or leasehold.

    Returns a list of listing dicts (title/price/location/region/score/...).
    Network and parse errors are logged to stderr and never raised.
    """
    listings = []
    cats = [
        f"https://{subdomain}.craigslist.org/search/rea?max_price={max_price}&sort=priceasc",
        f"https://{subdomain}.craigslist.org/search/reo?max_price={max_price}&sort=priceasc",
    ]
    for url in cats:
        try:
            r = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
            if r.status_code != 200:
                continue
            soup = BeautifulSoup(r.text, "html.parser")
            # Craigslist modern layout uses result-row li elements
            for item in soup.select("li.cl-static-search-result, li.result-row, div.cl-search-result"):
                try:
                    # Selector order matters: prefer the specific title
                    # anchors, fall back to the first anchor in the card.
                    title_el = item.select_one("a.posting-title, a.result-title, a")
                    if not title_el:
                        continue
                    title = title_el.get_text(strip=True)
                    link = title_el.get("href", "")
                    if link and not link.startswith("http"):
                        # Relative href — absolutize against the search URL.
                        link = urljoin(url, link)
                    price_el = item.select_one(".result-price, .priceinfo, .price")
                    price_text = price_el.get_text(strip=True) if price_el else ""
                    price = _parse_price(price_text)
                    location_el = item.select_one(".result-hood, .location, .nearby")
                    # Craigslist wraps the hood in parentheses — strip them.
                    location = location_el.get_text(strip=True).strip("()") if location_el else ""
                    # NOTE: a row with no parsed price passes through here;
                    # only a parsed over-budget price disqualifies it.
                    if not title or (price and price > max_price):
                        continue
                    # Reject joke/test ads priced under $500
                    if price is not None and price < 500:
                        continue
                    # Only undeveloped land, freehold — reject rentals/condos/leasehold
                    combined = f"{title} {location}"
                    if not _looks_like_bc(combined, link):
                        continue
                    if _is_rental_or_developed(combined):
                        continue
                    if not _is_land(combined):
                        continue
                    data = {
                        "title": title,
                        "price": price,
                        "location": location,
                        "region": region_key,
                        "property_type": _classify_type(title),
                        "size_acres": _parse_acres(title),
                        # Search-result rows carry no body text, so the
                        # title doubles as the description.
                        "description": title,
                        "listing_url": link,
                        "source": "craigslist",
                        "posted_date": None,
                    }
                    data["score"] = _score_listing(data)
                    data["is_hidden"] = _is_hidden(data)
                    listings.append(data)
                except Exception:
                    # One malformed card should not abort the whole page.
                    continue
        except Exception as e:
            print(f" craigslist {subdomain} error: {e}", file=sys.stderr)
        # Light throttle between category fetches.
        time.sleep(0.5)
    return listings
# ── Kijiji scraper ────────────────────────────────────
def scrape_kijiji(max_price: int = MAX_PRICE) -> list[dict]:
    """Scrape Kijiji BC for land/vacant lots.

    Returns a list of listing dicts. Rows without a parsed price are kept —
    hydrate_kijiji_listings() later backfills them from the detail pages.
    Errors are printed to stderr and never raised.
    """
    listings = []
    # Kijiji BC — LAND FOR SALE ONLY (category 641 = Land, l9007 = BC province).
    # We hit BC-wide AND the low-traffic rural sub-locations nobody searches.
    urls = [
        f"https://www.kijiji.ca/b-land-for-sale/british-columbia/c641l9007?price=__,{max_price}",
        # Cariboo Area (100 Mile, Williams Lake, Quesnel) — hidden rural
        f"https://www.kijiji.ca/b-land-for-sale/cariboo-area/c641l1700296?price=__,{max_price}",
        # Prince George / Northern BC — low traffic
        f"https://www.kijiji.ca/b-land-for-sale/prince-george/c641l1700295?price=__,{max_price}",
        # Fort St John / Peace Country — very obscure
        f"https://www.kijiji.ca/b-land-for-sale/fort-st-john/c641l80015?price=__,{max_price}",
        # Kootenays — Nelson/Cranbrook area
        f"https://www.kijiji.ca/b-land-for-sale/kootenays/c641l1700299?price=__,{max_price}",
        # Comox Valley / North Island
        f"https://www.kijiji.ca/b-land-for-sale/comox-valley/c641l1700298?price=__,{max_price}",
    ]
    for url in urls:
        try:
            r = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
            if r.status_code != 200:
                continue
            soup = BeautifulSoup(r.text, "html.parser")
            # Kijiji uses various class names, try multiple
            for item in soup.select('[data-testid*="listing"], [class*="listing-"], li.regular-ad'):
                try:
                    title_el = item.select_one('a[class*="title"], h3 a, .title a')
                    if not title_el:
                        continue
                    title = title_el.get_text(strip=True)
                    link = title_el.get("href", "")
                    if link and not link.startswith("http"):
                        link = urljoin("https://www.kijiji.ca", link)
                    price_el = item.select_one('[class*="price"], .price')
                    price_text = price_el.get_text(strip=True) if price_el else ""
                    price = _parse_price(price_text)
                    location_el = item.select_one('[class*="location"], .location')
                    location = location_el.get_text(strip=True) if location_el else ""
                    desc_el = item.select_one('[class*="description"], .description')
                    desc = desc_el.get_text(strip=True) if desc_el else title
                    # Missing price does NOT disqualify; over-budget does.
                    if not title or (price and price > max_price):
                        continue
                    combined = f"{title} {desc} {location}"
                    # Kijiji "related ads" bleed in from other provinces.
                    if not _looks_like_bc(combined, link):
                        continue
                    if _is_rental_or_developed(combined):
                        continue
                    if not _is_land(combined):
                        continue
                    data = {
                        "title": title,
                        "price": price,
                        "location": location,
                        "region": _guess_region(location),
                        "property_type": _classify_type(f"{title} {desc}"),
                        "size_acres": _parse_acres(f"{title} {desc}"),
                        "description": desc,
                        "listing_url": link,
                        "source": "kijiji",
                    }
                    data["score"] = _score_listing(data)
                    data["is_hidden"] = _is_hidden(data)
                    listings.append(data)
                except Exception:
                    # Skip malformed cards, keep scraping the page.
                    continue
        except Exception as e:
            print(f" kijiji error: {e}", file=sys.stderr)
        # Throttle between location pages.
        time.sleep(1)
    return listings
# ── Used.ca scraper (regional classifieds) ────────────
def scrape_used(subdomain: str, region_key: str, max_price: int = MAX_PRICE) -> list[dict]:
    """Scrape UsedEverywhere regional sites — these are hidden gems.

    Tries both the modern and legacy URL patterns for the given subdomain
    (e.g. 'usedvictoria'). Returns a list of listing dicts; errors are
    printed to stderr and never raised.
    """
    listings = []
    # Try both modern and legacy Used.ca URL patterns; land/vacant-land only
    candidates = [
        f"https://www.{subdomain}.com/real-estate-for-sale/land-for-sale?price_max={max_price}",
        f"https://www.{subdomain}.com/real-estate/vacant-land?price_max={max_price}",
    ]
    for url in candidates:
        try:
            r = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
            if r.status_code != 200:
                continue
            soup = BeautifulSoup(r.text, "html.parser")
            for item in soup.select('[class*="listing"], .ad-item, article'):
                try:
                    title_el = item.select_one('a, h2, h3')
                    if not title_el:
                        continue
                    title = title_el.get_text(strip=True)
                    # Only an anchor carries an href; a bare heading leaves
                    # the link (and thus listing_url) empty.
                    link = title_el.get("href", "") if title_el.name == "a" else ""
                    if link and not link.startswith("http"):
                        link = urljoin(f"https://www.{subdomain}.com", link)
                    price_el = item.select_one('[class*="price"]')
                    price = _parse_price(price_el.get_text() if price_el else "")
                    # Missing price passes; parsed over-budget price fails.
                    if not title or (price and price > max_price):
                        continue
                    if _is_rental_or_developed(title):
                        continue
                    if not _is_land(title):
                        continue
                    data = {
                        "title": title,
                        "price": price,
                        "location": "",
                        "region": region_key,
                        "property_type": _classify_type(title),
                        "size_acres": _parse_acres(title),
                        # Cards expose only the title, so it doubles as the
                        # description.
                        "description": title,
                        "listing_url": link,
                        "source": "used.ca",
                    }
                    data["score"] = _score_listing(data)
                    data["is_hidden"] = _is_hidden(data)
                    listings.append(data)
                except Exception:
                    # Skip malformed cards.
                    continue
        except Exception as e:
            print(f" used.ca {subdomain} error: {e}", file=sys.stderr)
        # Light throttle between URL patterns.
        time.sleep(0.5)
    return listings
# ── Obscure rural sources ─────────────────────────────
#
# Craigslist and Kijiji are picked over. The real hidden deals in BC live on
# small regional FSBO sites, land-only specialty sites, and local community
# boards in the north/interior that nobody outside the area searches.
#
# Each entry: (label, url). The parser is generic — it pulls any anchor
# whose surrounding block contains a $ price ≤ max_price and land keywords.
# (label, url) pairs consumed by scrape_obscure_source().
OBSCURE_SOURCES = [
    # LandQuest — BC rural/recreational specialist, hidden listings
    ("landquest", "https://www.landquest.com/listings"),
    ("landquest-bare", "https://www.landquest.com/buy/bare-land"),
    # For Sale By Owner Canada — FSBO, BC acreages
    ("forsalebyowner-bc", "https://www.forsalebyowner.ca/real-estate/canada/any-city/BC/homes/for-sale-rent-lease?page=1&type=7"),
    # ComFree (legacy FSBO, now PropertyGuys)
    ("comfree-bc-land", "https://www.comfree.com/bc/vacant-land"),
    # PropertyGuys — BC FSBO
    ("propertyguys-bc", "https://www.propertyguys.com/listings/british-columbia"),
    # REALTOR.ca — tiny obscure subregions that get ignored
    ("realtor-cariboo", "https://www.realtor.ca/bc/cariboo/land-for-sale"),
    ("realtor-northernrockies", "https://www.realtor.ca/bc/northern-rockies/land-for-sale"),
    ("realtor-oliver-rural", "https://www.realtor.ca/bc/oliver-rural/real-estate"),
    # Trovit aggregator — pulls from small-town boards
    ("trovit-cariboo", "https://property.trovit.ca/acres-cariboo"),
    ("trovit-peace", "https://property.trovit.ca/acres-peace-river"),
    ("trovit-kootenay", "https://property.trovit.ca/acres-kootenay"),
    # Cariboo Listings — local-only site
    ("cariboolistings", "https://www.cariboolistings.ca/land/"),
    # Remax Remote Areas BC — specifically flagged as remote
    ("remax-remote-bc", "https://www.remax.ca/bc/remote-areas-land-for-sale"),
]
def scrape_obscure_source(label: str, url: str, max_price: int = MAX_PRICE) -> list[dict]:
    """Generic scraper for small regional real-estate pages.
    Uses a forgiving strategy: grab every anchor, look at its surrounding
    block for a dollar amount and land keywords. Hidden-deal sites almost
    never use the same HTML structure twice, so we can't hard-code selectors.

    *label* is recorded as the listing source. Returns a list of listing
    dicts; errors are printed to stderr and never raised.
    """
    listings = []
    try:
        r = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
        if r.status_code != 200:
            return listings
        soup = BeautifulSoup(r.text, "html.parser")
        # Find anchors that look like listing links
        seen = set()
        for a in soup.find_all("a", href=True):
            try:
                link = a["href"]
                if link.startswith("#") or link.startswith("javascript:"):
                    continue
                # De-dupe on the raw href (before absolutizing).
                if link in seen:
                    continue
                seen.add(link)
                if not link.startswith("http"):
                    link = urljoin(url, link)
                # Pull title + surrounding context (parent block)
                title = a.get_text(" ", strip=True)
                # Very short anchor text is nav chrome, not a listing.
                if not title or len(title) < 10:
                    continue
                parent = a.find_parent(["li", "article", "div"])
                context = parent.get_text(" ", strip=True) if parent else title
                context = context[:600]
                price = _parse_price(context)
                # Unlike the big-site scrapers, a price is REQUIRED here:
                # present, within budget, and not a sub-$500 joke figure.
                if not price or price > max_price or price < 500:
                    continue
                if not _looks_like_bc(f"{context} {title}", link):
                    continue
                if _is_rental_or_developed(f"{context} {title}"):
                    continue
                if not _is_land(f"{context} {title}"):
                    continue
                data = {
                    "title": title[:200],
                    "price": price,
                    "location": "",
                    "region": _guess_region(context),
                    "property_type": _classify_type(context),
                    "size_acres": _parse_acres(context),
                    "description": context,
                    "listing_url": link,
                    "source": label,
                }
                data["score"] = _score_listing(data)
                data["is_hidden"] = _is_hidden(data)
                listings.append(data)
            except Exception:
                # Skip anchors we cannot parse.
                continue
    except Exception as e:
        print(f" {label} error: {e}", file=sys.stderr)
    return listings
# ── Kijiji detail-page hydration ──────────────────────
#
# Kijiji list pages load prices in JS, so our scraper often gets title+URL
# but no price. This walks each saved Kijiji row and fetches the listing
# detail page, which IS server-rendered and has price, description, location,
# and acreage. Dramatically improves data quality without touching a browser.
def hydrate_kijiji_listings(limit: int = 200, verbose: bool = True) -> int:
    """Re-visit saved Kijiji rows with null price and pull full details.
    Returns number of rows updated.

    Kijiji list pages load prices in JS, so the list scraper often stores
    title+URL without a price. The listing detail pages ARE server-rendered,
    so each saved price-less row is re-fetched and price, location,
    description, and acreage are backfilled. Rows that turn out to be
    rentals, out-of-province, or over MAX_PRICE are deleted outright.
    """

    def _drop(rid, reason: str) -> None:
        # A row that should never have been saved — delete and report it.
        # (Previously this open/DELETE/commit/close/print sequence was
        # duplicated verbatim for each of the three rejection reasons.)
        c = db.get_conn()
        c.execute("DELETE FROM properties WHERE id=?", (rid,))
        c.commit()
        c.close()
        if verbose:
            print(f" [{rid}] DROPPED ({reason})")

    conn = db.get_conn()
    rows = conn.execute(
        """SELECT id, listing_url FROM properties
           WHERE source = 'kijiji' AND price IS NULL
           AND listing_url IS NOT NULL
           LIMIT ?""",
        (limit,)
    ).fetchall()
    conn.close()
    if verbose:
        print(f"\n=== Hydrating {len(rows)} Kijiji listings ===\n")
    updated = 0
    for row in rows:
        rid = row["id"]
        url = row["listing_url"]
        try:
            r = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
            if r.status_code != 200:
                continue
            soup = BeautifulSoup(r.text, "html.parser")
            # Price: SSR pages expose it via an itemprop meta tag; fall
            # back to the first dollar figure near the top of the body.
            price = None
            price_meta = soup.find("meta", attrs={"itemprop": "price"})
            if price_meta and price_meta.get("content"):
                price = _parse_price("$" + price_meta["content"])
            if not price:
                body_text = soup.get_text(" ", strip=True)[:3000]
                price = _parse_price(body_text)
            # Location: itemprop address span, else og:locality meta.
            location = ""
            addr = soup.find("span", attrs={"itemprop": "address"})
            if addr:
                location = addr.get_text(" ", strip=True)
            else:
                loc_meta = soup.find("meta", attrs={"property": "og:locality"})
                if loc_meta:
                    location = loc_meta.get("content", "")
            # Description: itemprop description div, else meta description.
            desc = ""
            desc_el = soup.find("div", attrs={"itemprop": "description"})
            if desc_el:
                desc = desc_el.get_text(" ", strip=True)[:2000]
            else:
                meta_desc = soup.find("meta", attrs={"name": "description"})
                if meta_desc:
                    desc = meta_desc.get("content", "")[:2000]
            combined = f"{desc} {location}"
            # Re-validate with the richer detail-page text: freehold land only.
            if _is_rental_or_developed(combined):
                _drop(rid, "rental/developed")
                continue
            if not _looks_like_bc(combined, url):
                _drop(rid, "not BC")
                continue
            if price and price > MAX_PRICE:
                _drop(rid, f"${price:,} > ${MAX_PRICE:,}")
                continue
            acres = _parse_acres(combined)
            region = _guess_region(f"{location} {desc[:500]}")
            # Recompute score with fresh data
            fake = {
                "title": "",
                "description": desc,
                "price": price,
                "size_acres": acres,
            }
            score = _score_listing(fake)
            is_hidden = 1 if _is_hidden(fake) else 0
            conn = db.get_conn()
            conn.execute(
                """UPDATE properties
                   SET price = COALESCE(?, price),
                       location = COALESCE(NULLIF(?, ''), location),
                       description = COALESCE(NULLIF(?, ''), description),
                       size_acres = COALESCE(?, size_acres),
                       region = COALESCE(NULLIF(?, 'unknown'), region),
                       score = ?,
                       is_hidden = ?
                   WHERE id = ?""",
                (price, location, desc, acres, region, score, is_hidden, rid)
            )
            conn.commit()
            conn.close()
            updated += 1
            if verbose and updated % 10 == 0:
                print(f" ...{updated} hydrated")
        except Exception as e:
            if verbose:
                print(f" [{rid}] error: {e}")
            continue
        time.sleep(0.5)  # be polite
    if verbose:
        print(f"\n=== Hydrated {updated}/{len(rows)} listings ===\n")
    return updated
# ── Tax Sale Listings ─────────────────────────────────
def scrape_bc_tax_sales(verbose: bool = True) -> list[dict]:
    """BC municipal tax sales — the REAL hidden-gem source.
    Tax sales happen on the last Monday of September every year. Properties
    go for the upset price (usually 2-3 years of unpaid taxes = often $2k-$15k)
    regardless of actual value. The owner has a 1-year redemption period, so
    roughly 90% get redeemed, but the other 10% become yours for pennies.
    We hit taxsaleshub.ca (BC aggregator) and a handful of specific
    municipalities known for cheap rural land at tax sale.

    Returns a list of listing dicts; every one is flagged is_hidden=True and
    carries a score bonus. Errors are printed to stderr and never raised.
    """
    listings = []
    # BC-wide aggregators (catch everything, including municipalities we don't list below)
    urls = [
        ("taxsaleshub", "https://taxsaleshub.ca/british-columbia/"),
        ("taxsaleshub-bc", "https://taxsaleshub.ca/properties/?refinementList%5BprovinceName%5D%5B0%5D=British%20Columbia"),
        ("taxsalecompass", "https://taxsalecompass.ca/province-british-columbia.php"),
    ]
    if verbose:
        print(f"\n === BC Tax Sale Scraper ===", flush=True)
    # ── Pass 1: aggregator sites, filtered to taxsaleshub listing URLs ──
    for source_label, url in urls:
        if verbose:
            print(f" Tax aggregator: {source_label}...", end=" ", flush=True)
        # Remember the count so we can report how many this source added.
        before = len(listings)
        try:
            r = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
            if r.status_code != 200:
                continue
            soup = BeautifulSoup(r.text, "html.parser")
            # Grab every anchor — tax sale pages are usually long lists of
            # links to municipal tax sale pages or individual property entries
            seen = set()
            for a in soup.find_all("a", href=True):
                try:
                    link = a["href"]
                    if link.startswith("#") or link.startswith("javascript:") or link.startswith("mailto:"):
                        continue
                    if link in seen:
                        continue
                    seen.add(link)
                    if not link.startswith("http"):
                        link = urljoin(url, link)
                    title = a.get_text(" ", strip=True)
                    if not title or len(title) < 5 or len(title) > 300:
                        continue
                    # Get context from parent block
                    parent = a.find_parent(["li", "tr", "article", "div"])
                    context = parent.get_text(" ", strip=True)[:800] if parent else title
                    # Skip navigation / footer / unrelated links
                    low = title.lower()
                    if any(skip in low for skip in ["home", "contact", "about", "privacy", "terms", "login", "sign up", "subscribe", "facebook", "twitter", "share", "see all", "guides", "wikipedia"]):
                        continue
                    # Must be an actual BC tax sale LISTING URL, not a nav link.
                    # taxsaleshub uses /british-columbia/<muni>/<hash>/
                    # If the link doesn't match that shape, skip.
                    link_lower = link.lower()
                    # HARD REQUIREMENT: must be a taxsaleshub listing URL —
                    # /british-columbia/<muni>/<hash>/ — nothing else accepted
                    if "taxsaleshub.ca" not in link_lower:
                        continue
                    if "/british-columbia/" not in link_lower:
                        continue
                    parts = [p for p in link_lower.replace("https://", "").replace("http://", "").split("/") if p]
                    # expected: ['taxsaleshub.ca', 'british-columbia', '<muni>', '<hash>']
                    if len(parts) < 4:
                        continue
                    # Must look tax-sale-related
                    if not any(k in context.lower() for k in ["tax sale", "upset", "delinquent", "arrears", "folio", "pid ", "lot ", "parcel"]):
                        continue
                    # Price is optional here — upset prices are often absent.
                    price = _parse_price(context)
                    data = {
                        "title": title[:200],
                        "description": context[:500],
                        "price": price,
                        "location": "",
                        "region": _guess_region(context),
                        "property_type": "land",
                        "size_acres": _parse_acres(context),
                        "listing_url": link,
                        "source": f"tax-sale-{source_label}",
                        "is_hidden": True,
                        "notes": "BC municipal tax sale — check redemption period",
                    }
                    data["score"] = _score_listing(data) + 10  # bonus for tax sales
                    listings.append(data)
                except Exception:
                    # Skip anchors we cannot parse.
                    continue
        except Exception as e:
            print(f" tax sale {source_label} error: {e}", file=sys.stderr)
        if verbose:
            print(f"{len(listings) - before} found", flush=True)
    # Verified BC municipal tax-sale pages — focus on rural/northern/interior
    # where land is actually cheap. Urban ones (Vancouver, West Van, Abbotsford)
    # included so the scraper catches the rare bargain but real gems are up north.
    municipal_pages = [
        # ── Cariboo / Interior North ── (gold mine for cheap rural land)
        ("quesnel", "https://www.quesnel.ca/municipal-services/taxes-utilities/tax-sale"),
        ("williams-lake", "https://www.williamslake.ca/650/Tax-Sale"),
        ("100-mile-house", "http://www.100milehouse.com/district-services/property-taxes-assessment"),
        # ── Peace River (cheapest land in the province) ──
        ("fort-st-john", "https://www.fortstjohn.ca/municipal-services/property-taxes/annual-tax-sale"),
        ("dawson-creek", "https://www.dawsoncreek.ca/en/Home-Property-Utilities/tax-sales.aspx"),
        # ── Northwest BC / Skeena ──
        ("smithers", "https://www.smithers.ca/propertytaxes"),
        # ── Kootenays ──
        ("trail", "https://trail.ca/en/news/notice-of-2025-tax-sale.aspx"),
        ("merritt", "https://www.merritt.ca/taxsale/"),
        # ── Okanagan / Thompson ──
        ("kelowna", "https://www.kelowna.ca/city-hall/property-taxes/tax-sale"),
        ("salmon-arm", "https://www.salmonarm.ca/151/Property-Tax-Sale"),
        ("lake-country", "https://www.lakecountry.bc.ca/taxsale"),
        # ── Sunshine Coast / Islands (rare deals) ──
        ("sechelt", "https://www.sechelt.ca/en/our-government/tax-sales.aspx"),
        ("powell-river", "https://powellriver.ca/pages/annual-tax-sale"),
        ("cumberland", "https://cumberland.ca/tax-sale/"),
        ("courtenay", "https://www.courtenay.ca/services/property-taxes/tax-sales"),
        # ── Lower Mainland (unlikely but free check) ──
        ("maple-ridge", "https://www.mapleridge.ca/your-government/property-taxes/tax-sale"),
        ("abbotsford", "https://www.abbotsford.ca/city-services/property-taxes/tax-sale"),
        ("white-rock", "https://www.whiterockcity.ca/227/Tax-Sale"),
        ("west-vancouver", "https://westvancouver.ca/annual-property-tax-sale"),
        ("vancouver-city", "https://vancouver.ca/home-property-development/auction-of-tax-sale-property.aspx"),
        # ── Taxsaleshub per-municipality BC subpages (deep-link aggregator) ──
        ("tsh-powell-river","https://taxsaleshub.ca/british-columbia/powell-river/"),
    ]
    # Auto-discover additional BC municipality subpages from taxsaleshub index
    try:
        r = requests.get("https://taxsaleshub.ca/british-columbia/", headers=HEADERS, timeout=TIMEOUT)
        if r.status_code == 200:
            soup = BeautifulSoup(r.text, "html.parser")
            discovered = set()
            for a in soup.find_all("a", href=True):
                href = a["href"]
                # Match /british-columbia/<muni>/ where <muni> isn't a hash slug
                m = re.match(r"^https?://taxsaleshub\.ca/british-columbia/([a-z-]+)/?$", href)
                if m:
                    muni = m.group(1)
                    if muni and muni not in [p[0] for p in municipal_pages]:
                        discovered.add(muni)
            for muni in discovered:
                municipal_pages.append(
                    (f"tsh-{muni}", f"https://taxsaleshub.ca/british-columbia/{muni}/")
                )
    except Exception:
        # Discovery is best-effort; the hard-coded list still runs.
        pass
    if verbose:
        print(f" Checking {len(municipal_pages)} BC municipalities...", flush=True)
    # ── Pass 2: municipal pages, parsed as HTML tables ──
    for muni, url in municipal_pages:
        muni_before = len(listings)
        try:
            r = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
            if r.status_code != 200:
                if verbose:
                    print(f" {muni}: HTTP {r.status_code}", flush=True)
                continue
            soup = BeautifulSoup(r.text, "html.parser")
            # Look for tables (most municipalities list tax-sale properties in tables)
            for table in soup.find_all("table"):
                for row in table.find_all("tr")[1:]:  # skip header
                    cells = [c.get_text(" ", strip=True) for c in row.find_all(["td", "th"])]
                    if not cells or len(cells) < 2:
                        continue
                    row_text = " | ".join(cells)
                    if len(row_text) < 15:
                        continue
                    # Must have a dollar amount OR look like a folio/PID/legal description
                    price = _parse_price(row_text)
                    if not price and not any(k in row_text.lower() for k in ["pid", "folio", "lot ", "parcel", "plan"]):
                        continue
                    if price and price > MAX_PRICE:
                        continue
                    data = {
                        "title": f"Tax Sale ({muni}): {cells[0][:150]}",
                        "description": row_text[:500],
                        "price": price,
                        "location": muni,
                        "region": _guess_region(muni + " " + row_text),
                        "property_type": "land",
                        "size_acres": _parse_acres(row_text),
                        "listing_url": url,
                        "source": f"tax-sale-{muni}",
                        "is_hidden": True,
                        "notes": f"Municipal tax sale listing from {muni}",
                    }
                    data["score"] = _score_listing(data) + 15  # strong bonus
                    listings.append(data)
        except Exception as e:
            print(f" {muni} error: {e}", file=sys.stderr)
        added = len(listings) - muni_before
        if verbose and added > 0:
            print(f" {muni}: +{added} listings", flush=True)
        # Throttle between municipal sites.
        time.sleep(0.5)
    return listings
# ── Crown Land Info ───────────────────────────────────
def scrape_crown_land_info() -> list[dict]:
    """BC Crown land — occasionally available for sale or nominal rent.
    Most Crown land is not for sale, but there's a disposition process.

    Returns a single static informational pseudo-listing (no network I/O);
    it is scored high so it surfaces alongside scraped results.
    """
    # Static reference entry — Crown land has no scrapable listing page,
    # so we emit one fixed pointer to the government disposition process.
    description = (
        "Crown land in BC is rarely sold outright but can be leased, "
        "licensed, or purchased through application. Contact FrontCounter BC. "
        "Homesite applications (rural areas) allow purchase of up to 2 hectares "
        "at market value. Remote areas sometimes have unadvertised availability."
    )
    info_entry = {
        "title": "BC Crown Land Dispositions — Application Process",
        "description": description,
        "listing_url": "https://www2.gov.bc.ca/gov/content/industry/natural-resource-use/land-use/crown-land",
        "source": "crown-land-info",
        "price": None,  # not a priced listing — informational only
        "region": "bc-wide",
        "property_type": "info",
        "notes": "Apply via FrontCounter BC — sometimes bargains for homesite applications",
        "is_hidden": True,
        "score": 70.0,  # fixed score; bypasses _score_listing
    }
    return [info_entry]
# ── Region guesser ────────────────────────────────────
_REGION_KEYWORDS = {
"kootenay": ["kootenay", "nelson", "cranbrook", "trail", "rossland", "kimberley", "fernie", "revelstoke", "creston"],
"cariboo": ["cariboo", "quesnel", "williams lake", "100 mile", "likely", "barkerville", "horsefly"],
"north": ["prince george", "fort st john", "dawson creek", "mackenzie", "smithers", "terrace", "prince rupert", "stewart", "atlin"],
"okanagan": ["okanagan", "kelowna", "vernon", "penticton", "osoyoos", "oliver", "westbank"],
"kamloops": ["kamloops", "merritt", "ashcroft", "cache creek", "logan lake", "chase"],
"island": ["victoria", "nanaimo", "courtenay", "comox", "campbell river", "port hardy", "tofino", "ucluelet", "duncan", "sooke"],
"coast": ["sunshine coast", "sechelt", "gibsons", "powell river", "bella coola"],
"vancouver": ["vancouver", "burnaby", "surrey", "langley", "abbotsford", "chilliwack", "hope", "mission", "maple ridge"],
}
def _guess_region(text: str) -> str:
if not text:
return "unknown"
t = text.lower()
for region, keywords in _REGION_KEYWORDS.items():
if any(k in t for k in keywords):
return region
return "unknown"
# ── Main scrape runner ────────────────────────────────
def scrape_all_bc(max_price: int = MAX_PRICE, verbose: bool = True) -> int:
"""Run all scrapers and insert into database. Returns new-insert count."""
db.init_db()
total_new = 0
total_seen = 0
if verbose:
print(f"\n=== BC Cheap Property Hunt (max ${max_price:,}) ===\n")
# 1. Craigslist — all BC regions
for region_key, display, cl_sub, used_sub in BC_REGIONS:
if verbose:
print(f" Craigslist {display}...", end=" ", flush=True)
listings = scrape_craigslist(region_key, cl_sub, max_price)
new_count = 0
for data in listings:
total_seen += 1
if db.insert_property(data):
new_count += 1
total_new += 1
if verbose:
print(f"{len(listings)} found, {new_count} new")
time.sleep(1.5)
# 2. Kijiji BC
if verbose:
print(f"\n Kijiji BC...", end=" ", flush=True)
listings = scrape_kijiji(max_price)
new_count = 0
for data in listings:
total_seen += 1
if db.insert_property(data):
new_count += 1
total_new += 1
if verbose:
print(f"{len(listings)} found, {new_count} new")
# 3. Used.ca regional sites
for region_key, display, cl_sub, used_sub in BC_REGIONS:
if not used_sub:
continue
if verbose:
print(f" Used.ca {display}...", end=" ", flush=True)
listings = scrape_used(used_sub, region_key, max_price)
new_count = 0
for data in listings:
total_seen += 1
if db.insert_property(data):
new_count += 1
total_new += 1
if verbose:
print(f"{len(listings)} found, {new_count} new")
time.sleep(1)
# 4. Obscure rural / FSBO / specialty sources
if verbose:
print()
for label, url in OBSCURE_SOURCES:
if verbose: