From ea983096a94834f8264ac5e7020c9fc5ec6a8b5e Mon Sep 17 00:00:00 2001 From: Tianye Song Date: Wed, 27 May 2026 10:23:50 +0800 Subject: [PATCH 1/2] =?UTF-8?q?feat(domain-skills):=20add=20=E8=B4=9D?= =?UTF-8?q?=E5=A3=B3=E6=89=BE=E6=88=BF=20(ke.com)=20scraping=20skill?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds agent-workspace/domain-skills/ke-com/scraping.md with three field-tested approaches for extracting housing data from ke.com. All approaches verified live across Beijing, Shanghai, and Guangzhou on 2026-04-28. Approach 1 — 二手房 listing search (/ershoufang/): scrapes 30 resale listings per city via http_get. Extracts title, URL, total price (万), unit price (元/平), floor/year/layout/area/orientation, community name, tags (满五年/地铁 etc.), and follower count. Approach 2 — 租房 listing search (/zufang/): scrapes 30 rental listings per city via http_get. Extracts title, URL, price (元/月), district, neighborhood, community name, and full description string. Approach 3 — property detail page: requires browser (goto + js). Returns full property details via CSS selectors: area, layout, floor, orientation, decoration, community, district, description, and tags. Covers 8 confirmed city subdomains (bj/sh/gz/sz/cd/wh/hz/nj), URL patterns for all page types, and all known gotchas: district filters redirect to login page, page 2+ returns CAPTCHA, detail pages always require browser, rental pages follow a 302 before serving HTML. --- .../domain-skills/ke-com/scraping.md | 278 ++++++++++++++++++ 1 file changed, 278 insertions(+) create mode 100644 agent-workspace/domain-skills/ke-com/scraping.md diff --git a/agent-workspace/domain-skills/ke-com/scraping.md b/agent-workspace/domain-skills/ke-com/scraping.md new file mode 100644 index 00000000..1a949761 --- /dev/null +++ b/agent-workspace/domain-skills/ke-com/scraping.md @@ -0,0 +1,278 @@ +# 贝壳找房 (ke.com) — Data Extraction + +Field-tested against ke.com on 2026-04-28. 
+No authentication required for listing search. +City-wide first-page listing requests work via `http_get` without a browser. + +--- + +## TL;DR + +贝壳找房 (`{city}.ke.com`) returns full HTML for the **first page** of any +search via `http_get`. Subsequent pages and individual property detail pages +are protected by CAPTCHA and require a browser session. + +**What you can do with `http_get` (no browser):** +- Scrape the first 30 listings of any city-wide search (二手房 or 租房) +- Extract: title, URL, price, floor/year/layout/area/orientation, community + name, tags (满五年/地铁 etc.), follower count +- Switch cities by changing the subdomain (`bj`, `sh`, `gz`, `sz`, etc.) + +**What requires a browser (`goto` + `js`):** +- District/neighborhood filtering (e.g. `/ershoufang/chaoyangqu/`) — redirects + to login page via `http_get` +- Page 2 and beyond (302 redirect → CAPTCHA on direct `http_get`) +- Individual property detail pages (CAPTCHA page returned) + +--- + +## Approach 1: 二手房 Listing Search + +`GET https://{city}.ke.com/ershoufang/` + +Returns the first 30 listings for a city. District/neighborhood filtering +requires a logged-in browser session — `http_get` on `/ershoufang/{district}/` +redirects to the login page. + +```python +from helpers import http_get +import re + +# City subdomain codes (confirmed working): +# bj=北京, sh=上海, gz=广州, sz=深圳, cd=成都, wh=武汉, hz=杭州, nj=南京 + +def ke_search_ershoufang(city="bj"): + """Search 二手房 (resale housing) listings on ke.com. + + Args: + city: City subdomain, e.g. 'bj' (北京), 'sh' (上海), 'gz' (广州), + 'sz' (深圳), 'cd' (成都), 'hz' (杭州), 'wh' (武汉), 'nj' (南京) + + Returns up to 30 listings from the first page. Each listing contains: + title, url, total_price, unit_price, info (floor/year/layout/area/orientation), + community, tags, followers. + + Note: district/neighborhood filtering requires a logged-in browser session. 
+ """ + url = f"https://{city}.ke.com/ershoufang/" + html = http_get(url) + + ul = re.search(r'', html, re.DOTALL) + if not ul: + return [] + + listings = [] + for li in re.findall(r'
  • (.*?)
  • ', ul.group(1), re.DOTALL): + title = re.search(r'title="([^"]+)"', li) + href = re.search(r'href="(https://[a-z]+\.ke\.com/ershoufang/\d+\.html)"', li) + total = re.search(r'class="totalPrice[^"]*"[^>]*>.*?]*>\s*(\d+)\s*.*?', li, re.DOTALL) + unit = re.search(r'(\d[\d,]+元/平)', li) + info_m = re.search(r'class="houseInfo"[^>]*>(.*?)', li, re.DOTALL) + pos_m = re.search(r'class="positionInfo"[^>]*>(.*?)', li, re.DOTALL) + tags = re.findall(r'class="(?:taxfree|five|subway|matching)[^"]*"[^>]*>\s*([^<]+?)\s*<', li) + follow = re.search(r'(\d+)人关注', li) + + info = re.sub(r'\s+', ' ', re.sub(r'<[^>]+>', ' ', info_m.group(1))).strip() if info_m else None + pos = re.sub(r'\s+', ' ', re.sub(r'<[^>]+>', ' ', pos_m.group(1))).strip() if pos_m else None + + if not title: + continue + listings.append({ + "title": title.group(1), + "url": href.group(1) if href else None, + "total_price": f"{total.group(1)}万" if total else None, # e.g. "890万" + "unit_price": unit.group(1) if unit else None, # e.g. "61,720元/平" + "info": info, # e.g. "高楼层 (共24层) | 2002年 | 3室2厅 | 144.2平米 | 西南" + "community": pos, # e.g. "盛和家园" + "tags": [t.strip() for t in tags], # e.g. ["满五年", "地铁"] + "followers": int(follow.group(1)) if follow else None, + }) + return listings + +listings = ke_search_ershoufang(city="bj") +# [ +# { +# "title": "盛和家园 满五年唯一 不临街 观景房 电梯刷卡进入", +# "url": "https://bj.ke.com/ershoufang/101133688738.html", +# "total_price": "890万", +# "unit_price": "61,720元/平", +# "info": "高楼层 (共24层) | 2002年 | 3室2厅 | 144.2平米 | 西南", +# "community": "盛和家园", +# "tags": ["满五年"], +# "followers": 221, +# }, +# ... # up to 30 listings +# ] +``` + +--- + +## Approach 2: 租房 Listing Search + +`GET https://{city}.ke.com/zufang/` + +```python +from helpers import http_get +import re + +def ke_search_zufang(city="bj"): + """Search 租房 (rental) listings on ke.com. + + Args: + city: City subdomain, e.g. 'bj', 'sh', 'gz' + + Returns up to 30 rental listings from the first page. 
+ Note: ke.com issues a 302 on first hit; http_get follows the redirect + automatically and the final response contains full listing HTML. + """ + html = http_get(f"https://{city}.ke.com/zufang/") + + listings = [] + # Rental pages use a different HTML structure from resale pages + blocks = re.findall( + r'href="(/zufang/[A-Z]{2}[\w]+\.html)"[^>]*title="([^"]+)"', + html + ) + prices = re.findall(r'(\d+)', html) + descs = re.findall( + r'class="content__list--item--des"[^>]*>(.*?)

    ', + html, re.DOTALL + ) + + for i, (path_url, title) in enumerate(blocks): + desc_raw = descs[i] if i < len(descs) else "" + desc_clean = re.sub(r'\s+', ' ', re.sub(r'<[^>]+>', '|', desc_raw)).strip() + parts = [p for p in (x.strip() for x in desc_clean.split('|')) if p and p != '/'] + + listings.append({ + "title": title, + "url": f"https://{city}.ke.com{path_url}", + "price": f"{prices[i]}元/月" if i < len(prices) else None, + "district": parts[0] if len(parts) > 0 else None, # e.g. "海淀区" + "area_name": parts[1] if len(parts) > 1 else None, # e.g. "马甸" + "community": parts[2] if len(parts) > 2 else None, # e.g. "月季园" + "desc": desc_clean, + }) + return listings + +rentals = ke_search_zufang(city="bj") +# [ +# { +# "title": "整租·月季园 2室1厅 南/北", +# "url": "https://bj.ke.com/zufang/BJ2143777429737439232.html", +# "price": "6300元/月", +# "district": "海淀区", +# "area_name": "马甸", +# "community": "月季园", +# "desc": "海淀区 | 马甸 | 月季园 / 57.41㎡ / 南 北 / 2室1厅1卫 / 中楼层 (21层)", +# }, +# ... +# ] +``` + +--- + +## Approach 3: Property Detail Page (Browser Required) + +Individual property pages (`/ershoufang/{id}.html`, `/zufang/{id}.html`) return +a CAPTCHA page via `http_get`. Use the browser to load them. + +```python +from helpers import goto, wait_for_load, wait, js + +def ke_get_detail(property_url): + """Fetch full property details from a ke.com listing page. + + Requires browser. property_url comes from ke_search_ershoufang() or + ke_search_zufang(). CSS selectors verified against ke.com detail pages. 
+ """ + goto(property_url) + wait_for_load() + wait(2) + + return js(""" + ({ + title: document.querySelector('.mainInfo')?.innerText?.trim() + || document.querySelector('h1.title')?.innerText?.trim(), + total_price: document.querySelector('.total')?.innerText?.trim(), + unit_price: document.querySelector('.unitPriceValue')?.innerText?.trim(), + area: document.querySelector('.area .mainInfo')?.innerText?.trim(), + layout: document.querySelector('.room .mainInfo')?.innerText?.trim(), + floor: document.querySelector('.floor .mainInfo')?.innerText?.trim(), + orientation: document.querySelector('.toward .mainInfo')?.innerText?.trim(), + decoration: document.querySelector('.decoration .mainInfo')?.innerText?.trim(), + community: document.querySelector('.communityName .info')?.innerText?.trim(), + district: document.querySelector('.areaName .info')?.innerText?.trim(), + description: document.querySelector('.seller-desc')?.innerText?.trim(), + tags: Array.from(document.querySelectorAll('.tag-list .content')) + .map(e => e.innerText.trim()).filter(Boolean), + }) + """) +``` + +--- + +## URL Reference + +### City Subdomains + +| 城市 | 子域名 | 二手房 URL | +|------|--------|-----------| +| 北京 | `bj` | `https://bj.ke.com/ershoufang/` | +| 上海 | `sh` | `https://sh.ke.com/ershoufang/` | +| 广州 | `gz` | `https://gz.ke.com/ershoufang/` | +| 深圳 | `sz` | `https://sz.ke.com/ershoufang/` | +| 成都 | `cd` | `https://cd.ke.com/ershoufang/` | +| 杭州 | `hz` | `https://hz.ke.com/ershoufang/` | +| 武汉 | `wh` | `https://wh.ke.com/ershoufang/` | +| 南京 | `nj` | `https://nj.ke.com/ershoufang/` | + +### URL Pattern + +``` +# 二手房 (resale) — city-wide first page only via http_get +https://{city}.ke.com/ershoufang/ + +# 租房 (rental) — city-wide first page only via http_get +https://{city}.ke.com/zufang/ + +# 新房 (new development) +https://{city}.ke.com/loupan/ + +# District filter (browser required — http_get redirects to login) +https://{city}.ke.com/ershoufang/{district}/ + +# Pagination (browser required — 
http_get returns CAPTCHA) +https://{city}.ke.com/ershoufang/pg{n}/ +https://{city}.ke.com/ershoufang/{district}/pg{n}/ + +# Property detail (browser required — http_get returns CAPTCHA) +https://{city}.ke.com/ershoufang/{house_id}.html +https://{city}.ke.com/zufang/{house_id}.html +``` + +- **Common Beijing District Slugs** (browser required for district filtering): +`chaoyangqu` 朝阳 / `haidianqu` 海淀 / `xichengqu` 西城 / `dongchengqu` 东城 / +`fengtaiqu` 丰台 / `shijingshanqu` 石景山 / `changpingqu` 昌平 / `tongzhouqu` 通州 + +--- + +## Gotchas + +- **Only the city-wide first page is accessible via `http_get`** — district filters + (e.g. `/ershoufang/chaoyangqu/`) redirect to a login page. Page 2+ redirects + to a CAPTCHA page. Both require a logged-in browser session. +- **Detail pages always return CAPTCHA via `http_get`** — even for the very + first request. Always use `goto()` + `js()` for detail pages. +- **Rental pages (`/zufang/`) issue a 302 on first hit** — `http_get` follows + redirects automatically; the final response (141KB+) contains full listing HTML. +- **Use the city subdomain, not `www.ke.com`** — `bj.ke.com` works; + `www.ke.com` redirects to the homepage without listing data. +- **`info` field format for 二手房**: pipe-separated string, e.g. + `"高楼层 (共24层) | 2002年 | 3室2厅 | 144.2平米 | 西南"`. Split on `|` and + strip whitespace to extract individual attributes. +- **Rental `desc` field format**: pipe/slash-separated, e.g. + `"海淀区 | 马甸 | 月季园 / 57.41㎡ / 南 北 / 2室1厅1卫 / 中楼层"`. +- **30 listings per page** — confirmed for both 二手房 and 租房. +- **`total_price` is in 万元** (10,000 RMB), e.g. `"890万"` = 8,900,000 RMB. + `unit_price` is 元/平方米, e.g. `"61,720元/平"`. From 8db19bcd138334d1c5a4d3a4c86ad56025498687 Mon Sep 17 00:00:00 2001 From: Tianye Song Date: Wed, 27 May 2026 10:36:45 +0800 Subject: [PATCH 2/2] fix(ke-com): rewrite rental parser to avoid index misalignment Independent global regex scans for prices and descriptions returned different counts than listing blocks (e.g. 
Shanghai: 20 blocks vs 30 prices), causing price/desc to silently belong to the wrong listing. Rewrite ke_search_zufang to split on data-house_code boundaries and extract all fields per-chunk, with a targeted price selector that matches only the listing price element. --- .../domain-skills/ke-com/scraping.md | 60 +++++++++++-------- 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/agent-workspace/domain-skills/ke-com/scraping.md b/agent-workspace/domain-skills/ke-com/scraping.md index 1a949761..638955a2 100644 --- a/agent-workspace/domain-skills/ke-com/scraping.md +++ b/agent-workspace/domain-skills/ke-com/scraping.md @@ -122,36 +122,48 @@ def ke_search_zufang(city="bj"): city: City subdomain, e.g. 'bj', 'sh', 'gz' Returns up to 30 rental listings from the first page. - Note: ke.com issues a 302 on first hit; http_get follows the redirect - automatically and the final response contains full listing HTML. + Each listing's fields are extracted from its own HTML chunk keyed by + data-house_code, so price and description always correspond to the + correct listing with no index misalignment. """ html = http_get(f"https://{city}.ke.com/zufang/") + # Split on each listing boundary so that price/desc are extracted from + # the same chunk as href/title — avoids silent misalignment from + # independent global regex scans. + chunks = re.split(r'(?=data-house_code=")', html) listings = [] - # Rental pages use a different HTML structure from resale pages - blocks = re.findall( - r'href="(/zufang/[A-Z]{2}[\w]+\.html)"[^>]*title="([^"]+)"', - html - ) - prices = re.findall(r'(\d+)', html) - descs = re.findall( - r'class="content__list--item--des"[^>]*>(.*?)

    ', - html, re.DOTALL - ) - - for i, (path_url, title) in enumerate(blocks): - desc_raw = descs[i] if i < len(descs) else "" - desc_clean = re.sub(r'\s+', ' ', re.sub(r'<[^>]+>', '|', desc_raw)).strip() - parts = [p for p in (x.strip() for x in desc_clean.split('|')) if p and p != '/'] + for chunk in chunks: + if not chunk.startswith('data-house_code="'): + continue + href = re.search(r'href="(/zufang/[A-Z]{2}[\w]+\.html)"', chunk) + title = re.search(r'title="([^"]+)"', chunk) + # Price lives in: 6300 元/月 + price = re.search(r'class="content__list--item-price">(\d+)', chunk) + desc = re.search( + r'class="content__list--item--des"[^>]*>(.*?)

    ', + chunk, re.DOTALL + ) + if not href or not title: + continue + + desc_clean = district = area_name = community = None + if desc: + desc_clean = re.sub(r'\s+', ' ', re.sub(r'<[^>]+>', '|', desc.group(1))).strip() + parts = [p for p in (x.strip() for x in desc_clean.split('|')) + if p and p not in ('/', '-')] + district = parts[0] if parts else None # e.g. "海淀区" + area_name = parts[1] if len(parts) > 1 else None # e.g. "马甸" + community = parts[2] if len(parts) > 2 else None # e.g. "月季园" listings.append({ - "title": title, - "url": f"https://{city}.ke.com{path_url}", - "price": f"{prices[i]}元/月" if i < len(prices) else None, - "district": parts[0] if len(parts) > 0 else None, # e.g. "海淀区" - "area_name": parts[1] if len(parts) > 1 else None, # e.g. "马甸" - "community": parts[2] if len(parts) > 2 else None, # e.g. "月季园" - "desc": desc_clean, + "title": title.group(1).strip(), + "url": f"https://{city}.ke.com{href.group(1)}", + "price": f"{price.group(1)}元/月" if price else None, + "district": district, + "area_name": area_name, + "community": community, + "desc": desc_clean, }) return listings