diff --git a/docs/src/main/asciidoc/configuration.adoc b/docs/src/main/asciidoc/configuration.adoc index 7526546c3..de2d39c97 100644 --- a/docs/src/main/asciidoc/configuration.adoc +++ b/docs/src/main/asciidoc/configuration.adoc @@ -494,6 +494,64 @@ See the link:https://github.com/apache/stormcrawler/tree/main/external/playwrigh | playwright.load.event | - | Page load event to wait for (e.g., "domcontentloaded", "networkidle"). |=== +===== JS rendering detection + +Browser fetching is much more expensive than a plain HTTP fetch, so most operators only want +Playwright on URLs that actually need it. The `JsRenderingDetector` parse filter inspects the +parsed page from a cheap fetch and sets a metadata flag (default `fetch.with=playwright`) on URLs +that look JavaScript-rendered. Pair it with link:https://github.com/apache/stormcrawler/blob/main/core/src/main/java/org/apache/stormcrawler/protocol/DelegatorProtocol.java[DelegatorProtocol] +to route subsequent fetches of those URLs to the Playwright protocol while leaving everything else +on a fast HTTP client. + +Detection signals (cheapest first, short-circuiting): + +* SPA framework fingerprints in raw HTML — `data-reactroot`, `ng-version=`, `__NEXT_DATA__`, + `window.__NUXT__`, `data-svelte-h=`, `data-vue-app`, `data-astro-cid`, `<noscript>` blocks containing language like _"enable JavaScript"_. +* Empty SPA hydration roots: `
` / `#app` / `#__next` / `#__nuxt`. +* Outcome-based fallback: at least one `"); + Assertions.assertTrue( + p.get("u") + .getMetadata() + .getFirstValue("fetch.with.reason") + .startsWith("empty-root:")); + } + + @Test + void thinContentWithScriptIsFlagged() throws Exception { + final JsRenderingDetector d = detector("{\"minTextLength\":50,\"minOutlinks\":1}"); + // No fingerprints, no empty-root pattern; only the outcome-based path fires + final String html = + "

Hi

"; + final ParseResult p = applyTo(d, "u", html); + Assertions.assertNotNull(p.get("u").getMetadata().getFirstValue("fetch.with")); + Assertions.assertTrue( + p.get("u") + .getMetadata() + .getFirstValue("fetch.with.reason") + .startsWith("thin-content:")); + } + + @Test + void plainHtmlIsNotFlagged() throws Exception { + final JsRenderingDetector d = detector("{}"); + final String html = + "

" + + "A".repeat(1000) + + "

xyz"; + final ParseResult p = applyTo(d, "u", html); + Assertions.assertNull(p.get("u").getMetadata().getFirstValue("fetch.with")); + } + + @Test + void shortPageWithoutScriptIsNotFlagged() throws Exception { + // outcome-based fallback is gated on at least one