From 96244baa91cfcbdb897f3bf0e5698a6eb85c8a9c Mon Sep 17 00:00:00 2001 From: Richard Zowalla Date: Mon, 4 May 2026 20:46:19 +0200 Subject: [PATCH 1/3] feat(playwright): add JsRenderingDetector parse filter Heuristically flags URLs whose content looks JavaScript-rendered by inspecting SPA framework fingerprints, noscript blocks, empty hydration roots, and a thin-content fallback. Sets a routing metadata key so DelegatorProtocol can dispatch subsequent fetches to Playwright while the bulk of the crawl stays on a cheap HTTP client. --- docs/src/main/asciidoc/configuration.adoc | 54 ++++ external/playwright/README.md | 74 ++++++ .../parsefilter/JsRenderingDetector.java | 243 ++++++++++++++++++ .../parsefilter/JsRenderingDetectorTest.java | 148 +++++++++++ 4 files changed, 519 insertions(+) create mode 100644 external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/parsefilter/JsRenderingDetector.java create mode 100644 external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/parsefilter/JsRenderingDetectorTest.java diff --git a/docs/src/main/asciidoc/configuration.adoc b/docs/src/main/asciidoc/configuration.adoc index 7526546c3..b0e5147d0 100644 --- a/docs/src/main/asciidoc/configuration.adoc +++ b/docs/src/main/asciidoc/configuration.adoc @@ -494,6 +494,60 @@ See the link:https://github.com/apache/stormcrawler/tree/main/external/playwrigh | playwright.load.event | - | Page load event to wait for (e.g., "domcontentloaded", "networkidle"). |=== +===== JS rendering detection + +Browser fetching is much more expensive than a plain HTTP fetch, so most operators only want +Playwright on URLs that actually need it. The `JsRenderingDetector` parse filter inspects the +parsed page from a cheap fetch and sets a metadata flag (default `fetch.with=playwright`) on URLs +that look JavaScript-rendered. 
Pair it with link:https://github.com/apache/stormcrawler/blob/main/core/src/main/java/org/apache/stormcrawler/protocol/DelegatorProtocol.java[DelegatorProtocol] +to route subsequent fetches of those URLs to the Playwright protocol while leaving everything else +on a fast HTTP client. + +Detection signals (cheapest first, short-circuiting): + +* SPA framework fingerprints in raw HTML — `data-reactroot`, `ng-version=`, `__NEXT_DATA__`, + `window.__NUXT__`, `data-svelte-h=`, `data-vue-app`, `data-astro-cid`. +* `<noscript>` blocks containing language like _"enable JavaScript"_. +* Empty SPA hydration roots: `
` / `#app` / `#__next` / `#__nuxt`. +* Outcome-based fallback: at least one `"); + Assertions.assertTrue( + p.get("u").getMetadata().getFirstValue("fetch.with.reason").startsWith("empty-root:")); + } + + @Test + void thinContentWithScriptIsFlagged() throws Exception { + final JsRenderingDetector d = detector("{\"minTextLength\":50,\"minOutlinks\":1}"); + // No fingerprints, no empty-root pattern; only the outcome-based path fires + final String html = + "

Hi

"; + final ParseResult p = applyTo(d, "u", html); + Assertions.assertNotNull(p.get("u").getMetadata().getFirstValue("fetch.with")); + Assertions.assertTrue( + p.get("u").getMetadata().getFirstValue("fetch.with.reason").startsWith("thin-content:")); + } + + @Test + void plainHtmlIsNotFlagged() throws Exception { + final JsRenderingDetector d = detector("{}"); + final String html = + "

" + + "A".repeat(1000) + + "

xyz"; + final ParseResult p = applyTo(d, "u", html); + Assertions.assertNull(p.get("u").getMetadata().getFirstValue("fetch.with")); + } + + @Test + void shortPageWithoutScriptIsNotFlagged() throws Exception { + // outcome-based fallback is gated on at least one "); + applyTo( + d, + "u", + "
"); Assertions.assertTrue( - p.get("u").getMetadata().getFirstValue("fetch.with.reason").startsWith("empty-root:")); + p.get("u") + .getMetadata() + .getFirstValue("fetch.with.reason") + .startsWith("empty-root:")); } @Test @@ -86,7 +100,10 @@ void thinContentWithScriptIsFlagged() throws Exception { final ParseResult p = applyTo(d, "u", html); Assertions.assertNotNull(p.get("u").getMetadata().getFirstValue("fetch.with")); Assertions.assertTrue( - p.get("u").getMetadata().getFirstValue("fetch.with.reason").startsWith("thin-content:")); + p.get("u") + .getMetadata() + .getFirstValue("fetch.with.reason") + .startsWith("thin-content:")); } @Test @@ -114,11 +131,7 @@ void skipsIfAlreadyFetchedByPlaywright() throws Exception { final ParseResult parse = new ParseResult(); // simulate metadata coming from a Playwright fetch parse.get("u").getMetadata().setValue(HttpProtocol.MD_KEY_END, "2026-05-04T00:00:00Z"); - d.filter( - "u", - "
".getBytes(StandardCharsets.UTF_8), - null, - parse); + d.filter("u", "
".getBytes(StandardCharsets.UTF_8), null, parse); Assertions.assertNull(parse.get("u").getMetadata().getFirstValue("fetch.with")); } @@ -133,7 +146,8 @@ void skipsIfAlreadyFlagged() throws Exception { null, parse); // not overwritten, no reason added - Assertions.assertEquals("playwright", parse.get("u").getMetadata().getFirstValue("fetch.with")); + Assertions.assertEquals( + "playwright", parse.get("u").getMetadata().getFirstValue("fetch.with")); Assertions.assertNull(parse.get("u").getMetadata().getFirstValue("fetch.with.reason")); } @@ -159,8 +173,7 @@ void requiredMessageMatchesAnywhere() throws Exception { d, "u", "
Loading...
"); - Assertions.assertEquals( - "playwright", p.get("u").getMetadata().getFirstValue("fetch.with")); + Assertions.assertEquals("playwright", p.get("u").getMetadata().getFirstValue("fetch.with")); Assertions.assertEquals( "required-message:Loading...", p.get("u").getMetadata().getFirstValue("fetch.with.reason"));