From 62378b228410a159554014229a8ed5111f84305a Mon Sep 17 00:00:00 2001 From: Richard Zowalla Date: Mon, 4 May 2026 20:39:18 +0200 Subject: [PATCH 1/4] feat(playwright): add PageAction extension point with built-in actions Introduces a configurable post-navigate chain loaded from JSON via playwright.page.actions.config.file, so site-specific DOM transformations no longer require subclassing the protocol. Ships built-ins for overlay dismissal, clickable expansion, scroll-to-bottom, wait-for-selector, JS evaluation, and screenshot capture. --- docs/src/main/asciidoc/configuration.adoc | 62 +++- docs/src/main/asciidoc/extending.adoc | 51 +++ external/playwright/README.md | 70 ++++ external/playwright/playwright-conf.yaml | 7 + .../protocol/playwright/HttpProtocol.java | 8 + .../protocol/playwright/PageAction.java | 57 ++++ .../protocol/playwright/PageActions.java | 153 +++++++++ .../actions/DismissOverlayAction.java | 102 ++++++ .../playwright/actions/EvaluateAction.java | 95 ++++++ .../actions/ExpandClickablesAction.java | 144 ++++++++ .../playwright/actions/ScreenshotAction.java | 97 ++++++ .../actions/ScrollToBottomAction.java | 81 +++++ .../actions/WaitForSelectorAction.java | 112 ++++++ .../playwright/PageActionsLiveTest.java | 322 ++++++++++++++++++ .../protocol/playwright/PageActionsTest.java | 89 +++++ .../actions/ActionConfigureTest.java | 173 ++++++++++ .../test/resources/page-actions-fixture.html | 64 ++++ .../test/resources/page-actions.chain.json | 24 ++ .../test/resources/page-actions.empty.json | 3 + .../test/resources/page-actions.invalid.json | 9 + .../src/test/resources/page-actions.live.json | 38 +++ .../test/resources/page-actions.single.json | 14 + 22 files changed, 1771 insertions(+), 4 deletions(-) create mode 100644 external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/PageAction.java create mode 100644 external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/PageActions.java create mode 100644 
external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/DismissOverlayAction.java create mode 100644 external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/EvaluateAction.java create mode 100644 external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ExpandClickablesAction.java create mode 100644 external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ScreenshotAction.java create mode 100644 external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ScrollToBottomAction.java create mode 100644 external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/WaitForSelectorAction.java create mode 100644 external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/PageActionsLiveTest.java create mode 100644 external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/PageActionsTest.java create mode 100644 external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/actions/ActionConfigureTest.java create mode 100644 external/playwright/src/test/resources/page-actions-fixture.html create mode 100644 external/playwright/src/test/resources/page-actions.chain.json create mode 100644 external/playwright/src/test/resources/page-actions.empty.json create mode 100644 external/playwright/src/test/resources/page-actions.invalid.json create mode 100644 external/playwright/src/test/resources/page-actions.live.json create mode 100644 external/playwright/src/test/resources/page-actions.single.json diff --git a/docs/src/main/asciidoc/configuration.adoc b/docs/src/main/asciidoc/configuration.adoc index 7526546c3..34b60290b 100644 --- a/docs/src/main/asciidoc/configuration.adoc +++ b/docs/src/main/asciidoc/configuration.adoc @@ -488,11 +488,65 @@ See the link:https://github.com/apache/stormcrawler/tree/main/external/playwrigh |=== | key | default value | description 
-| playwright.cdp.url | - | Chrome DevTools Protocol URL for connecting to an existing browser instance. -| playwright.remote.ws | - | Remote WebSocket URL for Playwright (alternative to CDP). -| playwright.skip.download | true | Skip automatic browser download. Set to false to let Playwright manage its own browser. -| playwright.load.event | - | Page load event to wait for (e.g., "domcontentloaded", "networkidle"). +| playwright.cdp.url | - | Chrome DevTools Protocol URL for connecting to an existing browser instance (e.g. `http://localhost:9222`). Mutually exclusive with `playwright.remote.ws`. +| playwright.remote.ws | - | Remote WebSocket URL for Playwright (alternative to CDP, e.g. `ws://localhost:3000/`). +| playwright.skip.download | false | Skip automatic browser download. Implicitly forced to `true` when `playwright.cdp.url` or `playwright.remote.ws` is set. +| playwright.load.event | load | Page load event to wait for. One of `load`, `domcontentloaded`, `networkidle`. +| playwright.skip.resource.types | - | List of resource types aborted during navigation (`document`, `stylesheet`, `image`, `media`, `font`, `script`, `texttrack`, `xhr`, `fetch`, `eventsource`, `websocket`, `manifest`, `other`). +| playwright.evaluations | - | List of JavaScript expressions evaluated after load; each JSON-serialised result is stored in response metadata under the expression itself. +| playwright.capture.content.on.error | false | If `true`, also capture `page.content()` for non-2xx responses — useful for SPAs that return a stub then hydrate via JS. +| playwright.override.status.on.content | false | When content was captured for a non-2xx response, override the reported HTTP status with `200`. The original status is preserved under the `playwright.origin.status` response metadata key. No-op unless `playwright.capture.content.on.error` is also `true`. 
+| playwright.page.actions.config.file | - | JSON file declaring an ordered chain of `PageAction` implementations applied after navigate succeeds and before content capture. See _Page actions_ below. +|=== + +===== Page actions + +The Playwright protocol exposes a `PageAction` extension point — an ordered chain of post-navigate +DOM transformations loaded from a JSON file referenced by `playwright.page.actions.config.file`. Use +this to plug site-specific behaviour (tab/accordion expansion, cookie-banner dismissal, +infinite-scroll, custom `evaluate()` calls, screenshotting, ...) into the protocol without +subclassing it. The chain runs only when content would otherwise be captured (on 2xx, or on non-2xx +when `playwright.capture.content.on.error` is `true`). Per-action failures are logged and swallowed +so one bad action cannot abort the rest of the chain. + +[source,json] +---- +{ + "org.apache.stormcrawler.protocol.playwright.PageActions": [ + { + "class": "org.apache.stormcrawler.protocol.playwright.actions.DismissOverlayAction", + "name": "cookies", + "params": { "selectors": ["#cookie-accept"] } + }, + { + "class": "org.apache.stormcrawler.protocol.playwright.actions.ExpandClickablesAction", + "name": "tabs", + "params": { + "selectors": [".tab-widget .tab-header"], + "root": ".tab-widget", + "body": ".tab-widget-body", + "waitMs": 300 + } + } + ] +} +---- + +Built-in actions: + +[cols="1,3", options="header"] |=== +| Class | Purpose + +| `ExpandClickablesAction` | Clicks every element matching the configured selectors and clones the resulting body container into a hidden cache under the same widget root, so `page.content()` ends up containing the HTML of every tab/accordion panel rather than only the active one. +| `EvaluateAction` | Evaluates a list of JavaScript expressions and stores each JSON-serialised result in response metadata. 
+| `ScrollToBottomAction` | Repeatedly scrolls to the bottom of the page until the document height stops growing, the step cap is reached, or the time budget elapses — useful for infinite-scroll feeds. +| `WaitForSelectorAction` | Waits for a selector to reach an `attached` / `detached` / `visible` / `hidden` state. Soft-fails on timeout by default; set `required: true` to fail. +| `DismissOverlayAction` | Dismisses cookie banners, GDPR walls, newsletter modals, etc. by clicking the first match of each selector, and optionally removes sticky overlays from the DOM via `removeSelectors`. +| `ScreenshotAction` | Captures a screenshot of the page and stores it base64-encoded in response metadata. For diagnostics / small-volume use; larger crawls should write to a blob store. +|=== + +See the link:https://github.com/apache/stormcrawler/tree/main/external/playwright[playwright module README] for the full parameter list of each built-in action and a guide on writing your own. ==== Language ID diff --git a/docs/src/main/asciidoc/extending.adoc b/docs/src/main/asciidoc/extending.adoc index f6db54abc..7277f88b9 100644 --- a/docs/src/main/asciidoc/extending.adoc +++ b/docs/src/main/asciidoc/extending.adoc @@ -162,6 +162,57 @@ https.protocol.implementation: "com.example.MyProtocol" Use the link:https://github.com/apache/stormcrawler/blob/main/core/src/main/java/org/apache/stormcrawler/protocol/DelegatorProtocol.java[DelegatorProtocol] when you need to route URLs to different protocol implementations based on metadata or URL patterns. +==== Custom Page Action (Playwright) + +The link:https://github.com/apache/stormcrawler/tree/main/external/playwright[Playwright protocol] exposes a `PageAction` extension point so you can plug site-specific post-navigate behaviour (tab/accordion expansion, cookie-banner dismissal, infinite-scroll, custom `evaluate()` calls, screenshotting, ...) into the protocol without subclassing it. 
Actions are loaded as an ordered chain from a JSON file referenced by `playwright.page.actions.config.file` and follow the same `Configurable` lifecycle as URL/parse filters. The chain runs after `page.navigate()` succeeds and before `page.content()` is captured, so any DOM mutations land in the rendered content returned by the protocol. + +A handful of built-in actions ship with the module — `DismissOverlayAction`, `ExpandClickablesAction`, `ScrollToBottomAction`, `EvaluateAction`, `WaitForSelectorAction`, `ScreenshotAction`. Reach for a custom action when none of those fit. Extend `PageAction` and implement `apply`: + +[source,java] +---- +import org.apache.stormcrawler.protocol.playwright.PageAction; +import org.apache.stormcrawler.Metadata; +import com.fasterxml.jackson.databind.JsonNode; +import com.microsoft.playwright.Page; +import java.util.Map; + +public class MyPageAction extends PageAction { + + private String selector; + + @Override + public void configure(Map stormConf, JsonNode params) { + if (!params.has("selector")) { + throw new IllegalArgumentException("MyPageAction requires 'selector'"); + } + this.selector = params.get("selector").asText(); + } + + @Override + public void apply(Page page, String url, + Metadata sourceMetadata, Metadata responseMetadata) { + page.locator(selector).click(); + } +} +---- + +Reference the action by its fully-qualified class name in the chain JSON: + +[source,json] +---- +{ + "org.apache.stormcrawler.protocol.playwright.PageActions": [ + { + "class": "com.example.MyPageAction", + "name": "my-action", + "params": { "selector": "#load-more" } + } + ] +} +---- + +Per-action failures in `apply()` are logged and swallowed by the chain wrapper so one bad action cannot abort the rest. If you need a hard failure on misconfiguration, throw from `configure()` — that propagates at topology start-up, before any URL is fetched. 
+ ==== Custom Bolt or Spout For a bolt that emits on the status stream (like fetchers and parsers), extend link:https://github.com/apache/stormcrawler/blob/main/core/src/main/java/org/apache/stormcrawler/bolt/StatusEmitterBolt.java[StatusEmitterBolt]: diff --git a/external/playwright/README.md b/external/playwright/README.md index 725bea647..76b560ac0 100644 --- a/external/playwright/README.md +++ b/external/playwright/README.md @@ -40,6 +40,7 @@ The setting `playwright.skip.download` to `true` in the configuration will assum | `playwright.evaluations` | _empty_ | List of JavaScript expressions evaluated on the page after load. Each result is JSON-serialized and stored in the response metadata under the expression itself as the key. | | `playwright.capture.content.on.error` | `false` | By default the rendered DOM is only captured when the origin returns a 2xx status. Set to `true` to also capture `page.content()` for non-2xx responses — useful for Single-Page Applications that return a non-2xx stub document and then hydrate the real content via JavaScript. | | `playwright.override.status.on.content` | `false` | When the rendered DOM was captured for a non-2xx response, override the reported HTTP status with `200` so downstream components treat the URL as `FETCHED`. The original origin status is preserved in the response metadata under the key `playwright.origin.status`. No-op unless `playwright.capture.content.on.error` is also `true`. | +| `playwright.page.actions.config.file` | _unset_ | Path to a JSON file declaring an ordered chain of `PageAction` implementations applied after `page.navigate()` succeeds and before `page.content()` is captured. Use this to plug site-specific post-navigate behaviour (tab/accordion expansion, cookie-banner dismissal, scroll-to-bottom, custom `evaluate()` calls, ...) into the protocol without subclassing it. The chain runs only when content would otherwise be captured (i.e. 
on 2xx, or on non-2xx if `playwright.capture.content.on.error` is `true`). | Per-URL metadata triggers: @@ -47,3 +48,72 @@ Per-URL metadata triggers: |---|---| | `playwright.trace` | If present on the input metadata, a Playwright trace zip is recorded for the navigation and its path is returned in the response metadata under the same key. | +## Page actions + +Custom post-navigate behaviour is added by implementing `PageAction` and listing the implementation in the JSON file referenced by `playwright.page.actions.config.file`. Actions follow the same `Configurable` pattern as URL/parse filters and are loaded as an ordered chain. A failure in one action is logged and swallowed so the rest of the chain still runs. + +```json +{ + "org.apache.stormcrawler.protocol.playwright.PageActions": [ + { + "class": "org.apache.stormcrawler.protocol.playwright.actions.ExpandClickablesAction", + "name": "tabs", + "params": { + "selectors": [".tab-widget .tab-header"], + "root": ".tab-widget", + "body": ".tab-widget-body", + "waitMs": 300 + } + } + ] +} +``` + +### Built-in actions + +| Class | Purpose | +|---|---| +| `ExpandClickablesAction` | Clicks every element matching the configured selectors and clones the resulting body container into a hidden cache under the same widget root, so `page.content()` ends up containing the HTML of every tab/accordion panel rather than only the originally active one. Anchors with an `href` are skipped. Parameters: `selectors` (array, required), `root` (string, required), `body` (string, required), `waitMs` (int, default `200`), `clickTimeoutMs` (int, default `2000`). | +| `EvaluateAction` | Evaluates a list of JavaScript expressions on the page and stores the JSON-serialised result of each in the response metadata. 
Parameters: `expressions` (array of strings, required), `keyPrefix` (string, optional — when set, results are stored under `keyPrefix + index`; when unset, each result is stored under the expression itself, as with the legacy `playwright.evaluations` setting). | +| `ScrollToBottomAction` | Repeatedly scrolls to the bottom of the page until the document height stops growing, the step cap is reached, or the time budget elapses — useful for infinite-scroll feeds. Parameters: `waitMs` (int, default `500`), `maxSteps` (int, default `20`), `maxDurationMs` (int, default `15000`). | +| `WaitForSelectorAction` | Waits for a selector to reach a given state before allowing the chain to proceed. By default a timeout is treated as a soft failure (logged and swallowed); set `required: true` to fail the action on timeout. Parameters: `selector` (string, required), `state` (one of `attached`, `detached`, `visible`, `hidden` — default `visible`), `timeoutMs` (int, default `5000`), `required` (bool, default `false`). | +| `DismissOverlayAction` | Dismisses cookie banners, GDPR walls, newsletter modals, etc. by clicking the first match of each selector, and optionally removes sticky overlays by deleting matching elements from the DOM. Missing elements and click failures are silently skipped. Parameters: `selectors` (array of strings), `removeSelectors` (array of strings), `timeoutMs` (int, default `1500`). At least one of `selectors` or `removeSelectors` must be non-empty. | +| `ScreenshotAction` | Captures a screenshot of the page and stores it base64-encoded in the response metadata. Intended for diagnostics and small-volume use; larger crawls should write to a blob store instead. Parameters: `metadataKey` (string, default `playwright.screenshot`), `fullPage` (bool, default `false`), `type` (`png` or `jpeg`, default `png`), `quality` (int 0-100, only honoured for JPEG).
| + +### Writing your own action + +Extend `PageAction` (which extends `AbstractConfigurable`) and implement `apply(Page, url, sourceMetadata, responseMetadata)`. Read parameters from the supplied `JsonNode` in `configure()` and validate at load time — anything thrown there propagates through `PageActions.fromConf()` and stops the topology from starting with a misconfigured chain. + +```java +public class MyAction extends PageAction { + + private String selector; + + @Override + public void configure(final Map stormConf, final JsonNode params) { + if (!params.has("selector")) { + throw new IllegalArgumentException("MyAction requires 'selector'"); + } + this.selector = params.get("selector").asText(); + } + + @Override + public void apply(final Page page, final String url, + final Metadata sourceMetadata, final Metadata responseMetadata) { + page.locator(selector).click(); + } +} +``` + +Reference it from the chain JSON via its fully-qualified class name. Per-action failures are logged and swallowed by the chain wrapper so one bad action cannot abort the rest — if you need a hard failure, raise it from `configure()`, not from `apply()`. + +## Tests + +The page-action support is covered by three test classes: + +- `PageActionsTest` — JSON loader / chain construction (no browser). +- `actions/ActionConfigureTest` — `configure()` validation for every built-in (no browser). +- `PageActionsLiveTest` — end-to-end browser tests for the chain and individual actions. + +The live tests use the same `assumeTrue("false".equals(System.getProperty("CI_ENV", "false")))` gate as `ProtocolTest`, so they run locally (`mvn test`) but skip on CI runners launched with `-DCI_ENV=true`. They expect a usable Chromium — either via `mvn exec:java -e -Dexec.mainClass=com.microsoft.playwright.CLI -Dexec.args="install chromium"` or by pointing `playwright.cdp.url` at an existing browser.
+ diff --git a/external/playwright/playwright-conf.yaml b/external/playwright/playwright-conf.yaml index 1020537b0..79f232af6 100644 --- a/external/playwright/playwright-conf.yaml +++ b/external/playwright/playwright-conf.yaml @@ -36,3 +36,10 @@ config: # playwright.capture.content.on.error is also true. # playwright.override.status.on.content: false + # JSON file declaring an ordered chain of PageAction implementations applied + # after navigate() succeeds and before page.content() is captured. Use this + # to plug in site-specific post-navigate behaviour (tab/accordion expansion, + # cookie-banner dismissal, infinite-scroll, custom evaluate calls, ...) + # without subclassing the protocol. See README for the JSON shape. + # playwright.page.actions.config.file: "page-actions.json" + diff --git a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/HttpProtocol.java b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/HttpProtocol.java index b19e17010..301b106dd 100644 --- a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/HttpProtocol.java +++ b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/HttpProtocol.java @@ -79,6 +79,8 @@ public class HttpProtocol extends AbstractHttpProtocol { private WaitUntilState loadEvent; + private PageActions pageActions = PageActions.emptyPageActions; + @Override public void configure(final Config conf) { super.configure(conf); @@ -172,6 +174,9 @@ public void configure(final Config conf) { // expressions to evaluate evaluations = ConfUtils.loadListFromConf(MD_EVALUATIONS, conf); + + // optional chain of page actions applied after navigate, before content capture + pageActions = PageActions.fromConf(conf); } @Override @@ -260,6 +265,8 @@ public ProtocolResponse getProtocolOutput(String url, Metadata md) throws Except boolean contentCaptured = false; if (fetched || captureContentOnError) { + // run any configured post-navigate 
actions before capturing content + pageActions.apply(page, url, md, responseMetaData); // retrieve the rendered content content = page.content().getBytes(StandardCharsets.UTF_8); contentCaptured = true; @@ -320,6 +327,7 @@ private Proxy getProxy(String proxyserver, String proxyuser, String proxypwd) { public void cleanup() { synchronized (this) { super.cleanup(); + pageActions.cleanup(); context.close(); browser.close(); } diff --git a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/PageAction.java b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/PageAction.java new file mode 100644 index 000000000..8acbcd47b --- /dev/null +++ b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/PageAction.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.stormcrawler.protocol.playwright; + +import com.microsoft.playwright.Page; +import org.apache.storm.task.IBolt; +import org.apache.stormcrawler.Metadata; +import org.apache.stormcrawler.util.AbstractConfigurable; +import org.jetbrains.annotations.NotNull; + +/** + * A pluggable post-navigate page transformation. 
Each implementation is invoked after {@code + * page.navigate()} succeeds and before {@code page.content()} is captured, so any DOM mutations it + * makes are reflected in the rendered content returned by the protocol. + * + *

Actions are loaded as an ordered chain via {@link PageActions} from a JSON file referenced by + * the {@code playwright.page.actions.config.file} configuration key. They follow the same + * {@link org.apache.stormcrawler.util.Configurable} pattern as URL/parse filters. + */ +public abstract class PageAction extends AbstractConfigurable { + + /** + * Apply this action to the page. + * + * @param page the live Playwright {@link Page}, already navigated to {@code url} + * @param url the URL being fetched + * @param sourceMetadata input metadata associated with the URL (read-only intent) + * @param responseMetadata response metadata being built up; actions may add diagnostics here + */ + public abstract void apply( + @NotNull final Page page, + @NotNull final String url, + @NotNull final Metadata sourceMetadata, + @NotNull final Metadata responseMetadata) + throws Exception; + + /** + * Release any resources held by the action. See {@link IBolt#cleanup()} for more details. + */ + public void cleanup() { + // nothing to do here + } +} diff --git a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/PageActions.java b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/PageActions.java new file mode 100644 index 000000000..23e840a0c --- /dev/null +++ b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/PageActions.java @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.stormcrawler.protocol.playwright; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.microsoft.playwright.Page; +import java.io.IOException; +import java.io.InputStream; +import java.util.List; +import java.util.Map; +import org.apache.commons.lang3.StringUtils; +import org.apache.stormcrawler.JSONResource; +import org.apache.stormcrawler.Metadata; +import org.apache.stormcrawler.util.ConfUtils; +import org.apache.stormcrawler.util.Configurable; +import org.jetbrains.annotations.NotNull; +import org.slf4j.LoggerFactory; + +/** + * Ordered chain of {@link PageAction}s loaded from a JSON configuration file. The file is + * referenced by the {@code playwright.page.actions.config.file} configuration key and follows the + * same shape as URL/parse filter configs: + * + *

{@code
+ * {
+ *   "org.apache.stormcrawler.protocol.playwright.PageActions": [
+ *     { "class": "...ExpandClickablesAction", "name": "tabs",
+ *       "params": { "selectors": [".tab .header"], "root": ".tab", "body": ".tab-body" } }
+ *   ]
+ * }
+ * }
+ * + * @see Configurable#createConfiguredInstance(Class, Class, Map, JsonNode) + */ +public class PageActions implements JSONResource { + + public static final String CONFIG_KEY = "playwright.page.actions.config.file"; + + public static final PageActions emptyPageActions = new PageActions(); + + private static final org.slf4j.Logger LOG = LoggerFactory.getLogger(PageActions.class); + + private PageAction[] actions = new PageAction[0]; + + private final String configFile; + + private final Map stormConf; + + private PageActions() { + this.configFile = null; + this.stormConf = null; + } + + public PageActions(final Map stormConf, final String configFile) + throws IOException { + this.configFile = configFile; + this.stormConf = stormConf; + try { + loadJSONResources(); + } catch (final Exception e) { + throw new IOException("Unable to build JSON object from file " + configFile, e); + } + } + + /** Loads and configures the chain from the storm config, or returns an empty chain. */ + public static PageActions fromConf(final Map stormConf) { + final String configFile = ConfUtils.getString(stormConf, CONFIG_KEY); + if (StringUtils.isNotBlank(configFile)) { + try { + return new PageActions(stormConf, configFile); + } catch (final IOException e) { + final String message = + "Exception caught while loading PageActions from " + configFile; + LOG.error(message); + throw new RuntimeException(message, e); + } + } + return PageActions.emptyPageActions; + } + + @Override + public String getResourceFile() { + return this.configFile; + } + + @Override + public void loadJSONResources(final InputStream inputStream) throws IOException { + final ObjectMapper mapper = new ObjectMapper(); + final JsonNode confNode = mapper.readValue(inputStream, JsonNode.class); + final List list = + Configurable.createConfiguredInstance( + this.getClass(), PageAction.class, stormConf, confNode); + actions = list.toArray(new PageAction[0]); + } + + /** Run every action in order. 
Failures are logged and swallowed so one bad action cannot + * abort the rest of the chain. */ + public void apply( + @NotNull final Page page, + @NotNull final String url, + @NotNull final Metadata sourceMetadata, + @NotNull final Metadata responseMetadata) { + for (final PageAction action : actions) { + final long start = System.currentTimeMillis(); + try { + action.apply(page, url, sourceMetadata, responseMetadata); + } catch (final Exception e) { + LOG.warn( + "PageAction {} ({}) failed for {}: {}", + action.getClass().getName(), + action.getName(), + url, + e.getMessage()); + } + LOG.debug( + "PageAction {} took {} msec", + action.getClass().getName(), + System.currentTimeMillis() - start); + } + } + + public void cleanup() { + for (final PageAction action : actions) { + try { + action.cleanup(); + } catch (final Exception e) { + LOG.warn( + "PageAction {} cleanup failed: {}", + action.getClass().getName(), + e.getMessage()); + } + } + } + + public int size() { + return actions.length; + } +} diff --git a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/DismissOverlayAction.java b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/DismissOverlayAction.java new file mode 100644 index 000000000..e1ba2cc48 --- /dev/null +++ b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/DismissOverlayAction.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.stormcrawler.protocol.playwright.actions; + +import com.fasterxml.jackson.databind.JsonNode; +import com.microsoft.playwright.ElementHandle; +import com.microsoft.playwright.Page; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import org.apache.stormcrawler.Metadata; +import org.apache.stormcrawler.protocol.playwright.PageAction; +import org.jetbrains.annotations.NotNull; +import org.slf4j.LoggerFactory; + +/** + * Dismisses cookie banners, GDPR walls, paywalls, newsletter modals, etc. by clicking the first + * matching element of each configured selector. Each click is independently bounded by + * {@code timeoutMs}; missing elements and click failures are silently skipped, so it is safe to + * pass an over-broad set of fallback selectors. + * + *

Parameters

+ * + *
    + *
  • {@code selectors} (array of strings; optional when {@code removeSelectors} is set) + *
  • {@code timeoutMs} (optional, int, default 1500): per-click timeout + *
  • {@code removeSelectors} (optional, array of strings): elements matching these selectors + * are removed from the DOM after the clicks (useful for sticky overlays that don't have a + * close button) + *
+ */ +public class DismissOverlayAction extends PageAction { + + private static final org.slf4j.Logger LOG = LoggerFactory.getLogger(DismissOverlayAction.class); + + private static final String REMOVE_JS = "el => el.remove()"; + + private List selectors = List.of(); + private List removeSelectors = List.of(); + private int timeoutMs = 1500; + + @Override + public void configure( + @NotNull final Map stormConf, @NotNull final JsonNode params) { + if (params == null || params.isMissingNode() || params.isNull()) return; + this.selectors = readStringArray(params, "selectors"); + this.removeSelectors = readStringArray(params, "removeSelectors"); + if (params.has("timeoutMs")) this.timeoutMs = params.get("timeoutMs").asInt(this.timeoutMs); + if (selectors.isEmpty() && removeSelectors.isEmpty()) { + throw new IllegalArgumentException( + "DismissOverlayAction requires non-empty 'selectors' or 'removeSelectors'"); + } + } + + @Override + public void apply( + @NotNull final Page page, + @NotNull final String url, + @NotNull final Metadata sourceMetadata, + @NotNull final Metadata responseMetadata) { + for (final String selector : selectors) { + try { + final ElementHandle handle = page.querySelector(selector); + if (handle == null) continue; + handle.click(new ElementHandle.ClickOptions().setTimeout(timeoutMs)); + } catch (final Exception e) { + LOG.debug("Could not click overlay {} on {}: {}", selector, url, e.getMessage()); + } + } + for (final String selector : removeSelectors) { + try { + for (final ElementHandle handle : page.querySelectorAll(selector)) { + handle.evaluate(REMOVE_JS); + } + } catch (final Exception e) { + LOG.debug("Could not remove overlay {} on {}: {}", selector, url, e.getMessage()); + } + } + } + + private static List readStringArray(final JsonNode params, final String key) { + final JsonNode node = params.get(key); + if (node == null || !node.isArray()) return List.of(); + final List list = new ArrayList<>(node.size()); + node.forEach(n -> 
list.add(n.asText())); + return list; + } +} diff --git a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/EvaluateAction.java b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/EvaluateAction.java new file mode 100644 index 000000000..7b200f6f4 --- /dev/null +++ b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/EvaluateAction.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.stormcrawler.protocol.playwright.actions; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.microsoft.playwright.Page; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import org.apache.stormcrawler.Metadata; +import org.apache.stormcrawler.protocol.playwright.PageAction; +import org.jetbrains.annotations.NotNull; +import org.slf4j.LoggerFactory; + +/** + * Evaluates a list of JavaScript expressions on the page and stores the JSON-serialised result of + * each in the response metadata. + * + *

Parameters

+ * + *
    + *
  • {@code expressions} (required, array of strings): JS expressions to evaluate + *
  • {@code keyPrefix} (optional, string): if set, the metadata key for each expression is + * {@code keyPrefix + index}; otherwise the expression itself is used as the key (matches the + * legacy {@code playwright.evaluations} behaviour) + *
+ */ +public class EvaluateAction extends PageAction { + + private static final org.slf4j.Logger LOG = LoggerFactory.getLogger(EvaluateAction.class); + + private final ObjectMapper mapper = new ObjectMapper(); + + private List expressions = List.of(); + private String keyPrefix; + + @Override + public void configure( + @NotNull final Map stormConf, @NotNull final JsonNode params) { + if (params == null || params.isMissingNode() || params.isNull()) return; + final JsonNode exprs = params.get("expressions"); + if (exprs != null && exprs.isArray()) { + final List list = new ArrayList<>(exprs.size()); + exprs.forEach(n -> list.add(n.asText())); + this.expressions = list; + } + if (params.has("keyPrefix")) this.keyPrefix = params.get("keyPrefix").asText(); + if (expressions.isEmpty()) { + throw new IllegalArgumentException("EvaluateAction requires non-empty 'expressions'"); + } + } + + @Override + public void apply( + @NotNull final Page page, + @NotNull final String url, + @NotNull final Metadata sourceMetadata, + @NotNull final Metadata responseMetadata) { + for (int i = 0; i < expressions.size(); i++) { + final String expression = expressions.get(i); + try { + final Object result = page.evaluate(expression); + if (result == null) continue; + final String json = + mapper.writerWithDefaultPrettyPrinter().writeValueAsString(result); + final String key = keyPrefix == null ? 
expression : keyPrefix + i; + responseMetadata.setValue(key, json); + } catch (final JsonProcessingException e) { + LOG.debug( + "Could not serialise result of {} on {}: {}", + expression, + url, + e.getMessage()); + } catch (final Exception e) { + LOG.debug("Evaluate {} failed on {}: {}", expression, url, e.getMessage()); + } + } + } +} diff --git a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ExpandClickablesAction.java b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ExpandClickablesAction.java new file mode 100644 index 000000000..a73fb108a --- /dev/null +++ b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ExpandClickablesAction.java @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.stormcrawler.protocol.playwright.actions; + +import com.fasterxml.jackson.databind.JsonNode; +import com.microsoft.playwright.ElementHandle; +import com.microsoft.playwright.Page; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import org.apache.stormcrawler.Metadata; +import org.apache.stormcrawler.protocol.playwright.PageAction; +import org.jetbrains.annotations.NotNull; +import org.slf4j.LoggerFactory; + +/** + * Clicks every element matching a list of selectors and, after each click, clones the rendered + * body container into a hidden cache under the same widget root. After the action runs, + * {@link Page#content()} contains the HTML of every panel a tab/accordion would normally hide + * behind user interaction — useful for SPAs whose visible markup depends on the active tab. + * + *

Anchor elements with an {@code href} are skipped to avoid following links. + * + *

Parameters

+ * + *
    + *
  • {@code selectors} (required, array): selectors whose matches will be clicked + *
  • {@code root} (required, string): selector for the widget root containing both the + * clickable and its body + *
  • {@code body} (required, string): selector for the body container that should be cached + *
  • {@code waitMs} (optional, int, default 200): time to wait after each click before caching + *
  • {@code clickTimeoutMs} (optional, int, default 2000): per-click timeout + *
+ */ +public class ExpandClickablesAction extends PageAction { + + private static final org.slf4j.Logger LOG = + LoggerFactory.getLogger(ExpandClickablesAction.class); + + /** + * Walks up to the configured root, finds the body, and appends a clone of it into a hidden + * cache element under the same root. Repeated calls accumulate every clicked panel. + */ + private static final String CACHE_BODY_JS = + "(el, opts) => {\n" + + " const widget = el.closest(opts.root);\n" + + " if (!widget) return;\n" + + " const bodyEl = widget.querySelector(opts.body);\n" + + " if (!bodyEl) return;\n" + + " let cache = widget.querySelector(':scope > .__sc_cache');\n" + + " if (!cache) {\n" + + " cache = document.createElement('div');\n" + + " cache.className = '__sc_cache';\n" + + " cache.setAttribute('aria-hidden', 'true');\n" + + " cache.style.display = 'none';\n" + + " widget.appendChild(cache);\n" + + " }\n" + + " cache.appendChild(bodyEl.cloneNode(true));\n" + + "}"; + + private static final String IS_LINK_JS = "e => e.tagName === 'A' && !!e.getAttribute('href')"; + + private List selectors = List.of(); + private String rootSelector; + private String bodySelector; + private int waitMs = 200; + private int clickTimeoutMs = 2000; + + @Override + public void configure( + @NotNull final Map stormConf, @NotNull final JsonNode params) { + if (params == null || params.isMissingNode() || params.isNull()) { + return; + } + final JsonNode sels = params.get("selectors"); + if (sels != null && sels.isArray()) { + final List list = new ArrayList<>(sels.size()); + sels.forEach(n -> list.add(n.asText())); + this.selectors = list; + } + if (params.has("root")) this.rootSelector = params.get("root").asText(); + if (params.has("body")) this.bodySelector = params.get("body").asText(); + if (params.has("waitMs")) this.waitMs = params.get("waitMs").asInt(this.waitMs); + if (params.has("clickTimeoutMs")) + this.clickTimeoutMs = params.get("clickTimeoutMs").asInt(this.clickTimeoutMs); + + if 
(rootSelector == null || bodySelector == null) { + throw new IllegalArgumentException( + "ExpandClickablesAction requires both 'root' and 'body' selectors"); + } + } + + @Override + public void apply( + @NotNull final Page page, + @NotNull final String url, + @NotNull final Metadata sourceMetadata, + @NotNull final Metadata responseMetadata) { + final Map opts = new HashMap<>(); + opts.put("root", rootSelector); + opts.put("body", bodySelector); + + int clicked = 0; + for (final String selector : selectors) { + final List handles; + try { + handles = page.querySelectorAll(selector); + } catch (final Exception e) { + LOG.debug("Selector {} failed on {}: {}", selector, url, e.getMessage()); + continue; + } + for (final ElementHandle handle : handles) { + try { + if (Boolean.TRUE.equals(handle.evaluate(IS_LINK_JS))) { + continue; + } + handle.scrollIntoViewIfNeeded(); + handle.click(new ElementHandle.ClickOptions().setTimeout(clickTimeoutMs)); + page.waitForTimeout(waitMs); + handle.evaluate(CACHE_BODY_JS, opts); + clicked++; + } catch (final Exception e) { + LOG.debug("Skipping click on {} for {}: {}", selector, url, e.getMessage()); + } + } + } + LOG.debug("ExpandClickablesAction clicked {} elements on {}", clicked, url); + } +} diff --git a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ScreenshotAction.java b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ScreenshotAction.java new file mode 100644 index 000000000..e8902baa9 --- /dev/null +++ b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ScreenshotAction.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.stormcrawler.protocol.playwright.actions; + +import com.fasterxml.jackson.databind.JsonNode; +import com.microsoft.playwright.Page; +import com.microsoft.playwright.options.ScreenshotType; +import java.util.Base64; +import java.util.Map; +import org.apache.stormcrawler.Metadata; +import org.apache.stormcrawler.protocol.playwright.PageAction; +import org.jetbrains.annotations.NotNull; +import org.slf4j.LoggerFactory; + +/** + * Captures a screenshot of the page and stores it base64-encoded in the response metadata. Larger + * crawls should write to a blob store instead — this action is intended for diagnostics, sample + * runs, and small-volume use cases where carrying the image alongside the document is convenient. + * + *

Parameters

+ * + *
    + *
  • {@code metadataKey} (optional, string, default {@code playwright.screenshot}): metadata + * key under which the base64 string is stored + *
  • {@code fullPage} (optional, bool, default false): capture the entire scrollable page + *
  • {@code type} (optional, string, default {@code png}): {@code png} or {@code jpeg} + *
  • {@code quality} (optional, int, 0-100): only honoured for {@code jpeg} + *
+ */
public class ScreenshotAction extends PageAction {

    private static final org.slf4j.Logger LOG = LoggerFactory.getLogger(ScreenshotAction.class);

    /** Metadata key used when {@code metadataKey} is not configured. */
    public static final String DEFAULT_METADATA_KEY = "playwright.screenshot";

    /** Metadata key under which the base64-encoded image is stored. */
    private String metadataKey = DEFAULT_METADATA_KEY;

    /** Whether the whole scrollable page is captured instead of the viewport. */
    private boolean fullPage = false;

    /** Image format of the screenshot. */
    private ScreenshotType type = ScreenshotType.PNG;

    /** JPEG quality (0-100); null leaves the choice to Playwright. Not applied to PNG. */
    private Integer quality;

    /**
     * Reads {@code metadataKey}, {@code fullPage}, {@code type} and {@code quality} from the
     * action parameters. When no parameters are supplied (null, missing or JSON-null node) the
     * defaults are kept.
     *
     * @param stormConf topology configuration (not read by this action)
     * @param params JSON parameters of this action
     * @throws IllegalArgumentException if {@code type} is neither png nor jpeg/jpg, or if
     *     {@code quality} lies outside 0-100
     */
    @Override
    public void configure(
            @NotNull final Map<String, Object> stormConf, @NotNull final JsonNode params) {
        if (params == null || params.isMissingNode() || params.isNull()) {
            return;
        }
        if (params.has("metadataKey")) {
            this.metadataKey = params.get("metadataKey").asText();
        }
        if (params.has("fullPage")) {
            this.fullPage = params.get("fullPage").asBoolean(false);
        }
        if (params.has("type")) {
            // Locale.ROOT: the default locale may lowercase 'I' to a dotless i (e.g. Turkish),
            // which would make "JPEG" unrecognisable
            final String t = params.get("type").asText().toLowerCase(java.util.Locale.ROOT);
            switch (t) {
                case "jpeg":
                case "jpg":
                    this.type = ScreenshotType.JPEG;
                    break;
                case "png":
                    this.type = ScreenshotType.PNG;
                    break;
                default:
                    throw new IllegalArgumentException(
                            "Unknown screenshot type '" + t + "' (expected png or jpeg)");
            }
        }
        if (params.has("quality")) {
            final int q = params.get("quality").asInt();
            // fail fast: an out-of-range value would otherwise only surface as a debug-level
            // screenshot failure on every single page
            if (q < 0 || q > 100) {
                throw new IllegalArgumentException(
                        "Screenshot quality must be between 0 and 100 but was " + q);
            }
            this.quality = q;
        }
    }

    /**
     * Takes a screenshot of the page and stores it base64-encoded in {@code responseMetadata}
     * under the configured key. A failing screenshot is logged at debug level and otherwise
     * ignored.
     *
     * @param page page to capture
     * @param url URL being fetched (used for logging only)
     * @param sourceMetadata not used by this action
     * @param responseMetadata receives the base64-encoded image
     */
    @Override
    public void apply(
            @NotNull final Page page,
            @NotNull final String url,
            @NotNull final Metadata sourceMetadata,
            @NotNull final Metadata responseMetadata) {
        final Page.ScreenshotOptions options =
                new Page.ScreenshotOptions().setFullPage(fullPage).setType(type);
        // quality is only handed to Playwright for JPEG output
        if (type == ScreenshotType.JPEG && quality != null) {
            options.setQuality(quality);
        }
        try {
            final byte[] bytes = page.screenshot(options);
            responseMetadata.setValue(metadataKey, Base64.getEncoder().encodeToString(bytes));
        } catch (final Exception e) {
            LOG.debug("Screenshot failed for {}: {}", url, e.getMessage());
        }
    }
}
diff --git a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ScrollToBottomAction.java 
b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ScrollToBottomAction.java new file mode 100644 index 000000000..56d28c8de --- /dev/null +++ b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ScrollToBottomAction.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.stormcrawler.protocol.playwright.actions; + +import com.fasterxml.jackson.databind.JsonNode; +import com.microsoft.playwright.Page; +import java.util.Map; +import org.apache.stormcrawler.Metadata; +import org.apache.stormcrawler.protocol.playwright.PageAction; +import org.jetbrains.annotations.NotNull; +import org.slf4j.LoggerFactory; + +/** + * Repeatedly scrolls to the bottom of the page until the document height stops growing, the max + * number of steps is reached, or {@code maxDurationMs} elapses. Useful for infinite-scroll feeds + * that lazy-load on viewport entry. + * + *

Parameters

+ * + *
    + *
  • {@code waitMs} (optional, int, default 500): time to wait after each scroll before + * re-measuring height + *
  • {@code maxSteps} (optional, int, default 20): hard cap on scroll iterations + *
  • {@code maxDurationMs} (optional, int, default 15000): hard cap on total time spent + *
+ */ +public class ScrollToBottomAction extends PageAction { + + private static final org.slf4j.Logger LOG = LoggerFactory.getLogger(ScrollToBottomAction.class); + + private static final String SCROLL_JS = "() => window.scrollTo(0, document.body.scrollHeight)"; + private static final String HEIGHT_JS = "() => document.body.scrollHeight"; + + private int waitMs = 500; + private int maxSteps = 20; + private int maxDurationMs = 15_000; + + @Override + public void configure( + @NotNull final Map stormConf, @NotNull final JsonNode params) { + if (params == null || params.isMissingNode() || params.isNull()) return; + if (params.has("waitMs")) this.waitMs = params.get("waitMs").asInt(this.waitMs); + if (params.has("maxSteps")) this.maxSteps = params.get("maxSteps").asInt(this.maxSteps); + if (params.has("maxDurationMs")) + this.maxDurationMs = params.get("maxDurationMs").asInt(this.maxDurationMs); + } + + @Override + public void apply( + @NotNull final Page page, + @NotNull final String url, + @NotNull final Metadata sourceMetadata, + @NotNull final Metadata responseMetadata) { + final long deadline = System.currentTimeMillis() + maxDurationMs; + long previousHeight = -1; + int steps = 0; + while (steps < maxSteps && System.currentTimeMillis() < deadline) { + final long height = ((Number) page.evaluate(HEIGHT_JS)).longValue(); + if (height == previousHeight) break; + previousHeight = height; + page.evaluate(SCROLL_JS); + page.waitForTimeout(waitMs); + steps++; + } + LOG.debug("ScrollToBottomAction stopped after {} steps on {}", steps, url); + } +} diff --git a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/WaitForSelectorAction.java b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/WaitForSelectorAction.java new file mode 100644 index 000000000..e78b5cf4e --- /dev/null +++ b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/WaitForSelectorAction.java @@ -0,0 
+1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.stormcrawler.protocol.playwright.actions; + +import com.fasterxml.jackson.databind.JsonNode; +import com.microsoft.playwright.Page; +import com.microsoft.playwright.options.WaitForSelectorState; +import java.util.Map; +import org.apache.stormcrawler.Metadata; +import org.apache.stormcrawler.protocol.playwright.PageAction; +import org.jetbrains.annotations.NotNull; +import org.slf4j.LoggerFactory; + +/** + * Waits for a selector to reach a given state before allowing the chain to proceed. By default a + * timeout is treated as a soft failure (logged and swallowed) so the rest of the chain still runs; + * set {@code required: true} to make it propagate. + * + *

Parameters

+ * + *
    + *
  • {@code selector} (required, string) + *
  • {@code state} (optional, string, default {@code visible}): one of {@code attached}, + * {@code detached}, {@code visible}, {@code hidden} + *
  • {@code timeoutMs} (optional, int, default 5000) + *
  • {@code required} (optional, bool, default false): if true, a timeout aborts the action + * (and is logged and swallowed by the chain wrapper) + *
+ */ +public class WaitForSelectorAction extends PageAction { + + private static final org.slf4j.Logger LOG = + LoggerFactory.getLogger(WaitForSelectorAction.class); + + private String selector; + private WaitForSelectorState state = WaitForSelectorState.VISIBLE; + private int timeoutMs = 5000; + private boolean required = false; + + @Override + public void configure( + @NotNull final Map stormConf, @NotNull final JsonNode params) { + if (params == null || params.isMissingNode() || params.isNull()) { + throw new IllegalArgumentException("WaitForSelectorAction requires 'selector'"); + } + if (params.has("selector")) this.selector = params.get("selector").asText(); + if (params.has("timeoutMs")) this.timeoutMs = params.get("timeoutMs").asInt(this.timeoutMs); + if (params.has("required")) this.required = params.get("required").asBoolean(false); + if (params.has("state")) { + final String s = params.get("state").asText().toUpperCase(); + switch (s) { + case "ATTACHED": + this.state = WaitForSelectorState.ATTACHED; + break; + case "DETACHED": + this.state = WaitForSelectorState.DETACHED; + break; + case "HIDDEN": + this.state = WaitForSelectorState.HIDDEN; + break; + case "VISIBLE": + this.state = WaitForSelectorState.VISIBLE; + break; + default: + throw new IllegalArgumentException( + "Unknown state '" + + s + + "' (expected attached/detached/visible/hidden)"); + } + } + if (selector == null || selector.isEmpty()) { + throw new IllegalArgumentException("WaitForSelectorAction requires 'selector'"); + } + } + + @Override + public void apply( + @NotNull final Page page, + @NotNull final String url, + @NotNull final Metadata sourceMetadata, + @NotNull final Metadata responseMetadata) + throws Exception { + try { + page.waitForSelector( + selector, + new Page.WaitForSelectorOptions().setState(state).setTimeout(timeoutMs)); + } catch (final Exception e) { + if (required) throw e; + LOG.debug( + "Selector {} did not reach state {} within {}ms on {}: {}", + selector, + state, 
+ timeoutMs, + url, + e.getMessage()); + } + } +} diff --git a/external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/PageActionsLiveTest.java b/external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/PageActionsLiveTest.java new file mode 100644 index 000000000..b8ddfd506 --- /dev/null +++ b/external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/PageActionsLiveTest.java @@ -0,0 +1,322 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.stormcrawler.protocol.playwright; + +import static org.junit.jupiter.api.Assumptions.assumeTrue; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.microsoft.playwright.Browser; +import com.microsoft.playwright.BrowserContext; +import com.microsoft.playwright.Page; +import com.microsoft.playwright.Playwright; +import com.fasterxml.jackson.databind.JsonNode; +import java.nio.charset.StandardCharsets; +import java.util.Base64; +import java.util.concurrent.TimeUnit; +import org.apache.storm.Config; +import org.apache.stormcrawler.Metadata; +import org.apache.stormcrawler.protocol.AbstractProtocolTest; +import org.apache.stormcrawler.protocol.ProtocolResponse; +import org.apache.stormcrawler.protocol.playwright.actions.DismissOverlayAction; +import org.apache.stormcrawler.protocol.playwright.actions.EvaluateAction; +import org.apache.stormcrawler.protocol.playwright.actions.ExpandClickablesAction; +import org.apache.stormcrawler.protocol.playwright.actions.ScreenshotAction; +import org.apache.stormcrawler.protocol.playwright.actions.ScrollToBottomAction; +import org.apache.stormcrawler.protocol.playwright.actions.WaitForSelectorAction; +import org.eclipse.jetty.server.Handler; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Timeout; + +/** + * Live, browser-driven tests for the {@link PageActions} chain and individual {@code PageAction} + * implementations. Requires a working Playwright/Chrome install (or a {@code playwright.cdp.url}) + * and is skipped when {@code CI_ENV=true}, mirroring the gate used by {@link ProtocolTest}. 
+ */ +class PageActionsLiveTest extends AbstractProtocolTest { + + private static final String USER_AGENT = "StormCrawlerTest"; + private static final String FIXTURE_PATH = "/page-actions-fixture.html"; + + @Override + protected Handler[] getHandlers() { + return new Handler[] {new LocalResourceHandler(), new WildcardResourceHandler()}; + } + + @BeforeEach + void setup() { + assumeTrue("false".equals(System.getProperty("CI_ENV", "false"))); + } + + private HttpProtocol getProtocol(final String pageActionsConfigFile) { + final Config conf = new Config(); + conf.put("http.agent.name", USER_AGENT); + final String cdpurl = System.getProperty("playwright.cdp.url"); + if (cdpurl != null) { + conf.put("playwright.cdp.url", cdpurl); + } + if (pageActionsConfigFile != null) { + conf.put(PageActions.CONFIG_KEY, pageActionsConfigFile); + } + final HttpProtocol protocol = new HttpProtocol(); + protocol.configure(conf); + return protocol; + } + + private String url() { + return "http://localhost:" + HTTP_PORT + FIXTURE_PATH; + } + + @Test + @Timeout(value = 2, unit = TimeUnit.MINUTES) + void chainAppliesAllActions() throws Exception { + final HttpProtocol protocol = getProtocol("page-actions.live.json"); + try { + final ProtocolResponse response = protocol.getProtocolOutput(url(), new Metadata()); + Assertions.assertEquals(200, response.getStatusCode()); + + final String content = new String(response.getContent(), StandardCharsets.UTF_8); + + // DismissOverlayAction: paywall removed from DOM, cookie banner click also removes it + Assertions.assertFalse( + content.contains("PAYWALL_CONTENT_REMOVED"), + "DismissOverlayAction should have removed the paywall element"); + Assertions.assertFalse( + content.contains("id=\"cookie-overlay\""), + "DismissOverlayAction click should have triggered the overlay removal"); + + // ExpandClickablesAction: every tab body should now be cached under the widget root + Assertions.assertTrue( + content.contains("CONTENT_TAB1"), "tab1 body should be 
cached"); + Assertions.assertTrue( + content.contains("CONTENT_TAB2"), "tab2 body should be cached"); + Assertions.assertTrue( + content.contains("CONTENT_TAB3"), "tab3 body should be cached"); + Assertions.assertTrue( + content.contains("__sc_cache"), "hidden cache element should be present"); + + // ScrollToBottomAction: lazy-loaded chunks should have appeared + Assertions.assertTrue( + content.contains("LAZY_LOADED_1"), + "ScrollToBottomAction should have triggered lazy loading"); + + // EvaluateAction: title is JSON-serialised under the expression key + final String title = + response.getMetadata().getFirstValue("document.title"); + Assertions.assertNotNull(title); + Assertions.assertTrue( + title.contains("StormCrawler PageActions Fixture"), + "EvaluateAction should have stored the title; got " + title); + + // ScreenshotAction: base64-encoded PNG under the configured key + final String shot = response.getMetadata().getFirstValue("test.screenshot"); + Assertions.assertNotNull(shot, "ScreenshotAction should have stored a screenshot"); + final byte[] decoded = Base64.getDecoder().decode(shot); + Assertions.assertTrue(decoded.length > 0); + // PNG magic bytes + Assertions.assertEquals((byte) 0x89, decoded[0]); + Assertions.assertEquals((byte) 0x50, decoded[1]); + Assertions.assertEquals((byte) 0x4E, decoded[2]); + Assertions.assertEquals((byte) 0x47, decoded[3]); + } finally { + protocol.cleanup(); + } + } + + @Test + @Timeout(value = 2, unit = TimeUnit.MINUTES) + void emptyChainProducesUntouchedContent() throws Exception { + final HttpProtocol protocol = getProtocol(null); + try { + final ProtocolResponse response = protocol.getProtocolOutput(url(), new Metadata()); + Assertions.assertEquals(200, response.getStatusCode()); + final String content = new String(response.getContent(), StandardCharsets.UTF_8); + // without DismissOverlayAction the paywall is still in the DOM + Assertions.assertTrue(content.contains("PAYWALL_CONTENT_REMOVED")); + // without 
ExpandClickablesAction only the initial body is rendered + Assertions.assertTrue(content.contains("INITIAL_BODY")); + Assertions.assertFalse(content.contains("CONTENT_TAB2")); + } finally { + protocol.cleanup(); + } + } + + @Test + @Timeout(value = 2, unit = TimeUnit.MINUTES) + void chainSwallowsActionFailures() throws Exception { + // Build an in-process chain by configuring HttpProtocol with a config file whose first + // action targets selectors that don't exist; the second action must still run. + final HttpProtocol protocol = getProtocol("page-actions.live.json"); + try { + // Drive a no-content page and assert we still get a response without exception + final String missing = "http://localhost:" + HTTP_PORT + "/does-not-exist.html"; + final ProtocolResponse response = protocol.getProtocolOutput(missing, new Metadata()); + // 404 → without captureContentOnError we get empty content but no thrown exception + Assertions.assertEquals(404, response.getStatusCode()); + Assertions.assertEquals(0, response.getContent().length); + } finally { + protocol.cleanup(); + } + } + + /** Drives a single action against a live page bypassing the protocol — useful for failure + * paths that the chain wrapper otherwise swallows. 
*/ + @Test + @Timeout(value = 30, unit = TimeUnit.SECONDS) + void waitForSelectorRequiredPropagates() throws Exception { + final WaitForSelectorAction action = new WaitForSelectorAction(); + action.configure( + java.util.Map.of(), + new ObjectMapper() + .readTree("{\"selector\":\"#never\",\"timeoutMs\":250,\"required\":true}")); + + try (final Playwright pw = Playwright.create(); + final Browser browser = pw.chromium().launch(); + final BrowserContext ctx = browser.newContext(); + final Page page = ctx.newPage()) { + page.navigate(url()); + Assertions.assertThrows( + Exception.class, + () -> action.apply(page, url(), new Metadata(), new Metadata())); + } + } + + @Test + @Timeout(value = 30, unit = TimeUnit.SECONDS) + void waitForSelectorSoftTimeoutReturnsCleanly() throws Exception { + final WaitForSelectorAction action = new WaitForSelectorAction(); + action.configure( + java.util.Map.of(), + new ObjectMapper() + .readTree("{\"selector\":\"#never\",\"timeoutMs\":250,\"required\":false}")); + + try (final Playwright pw = Playwright.create(); + final Browser browser = pw.chromium().launch(); + final BrowserContext ctx = browser.newContext(); + final Page page = ctx.newPage()) { + page.navigate(url()); + // required=false: if #never does not appear within 250 ms, apply() must still return normally + action.apply(page, url(), new Metadata(), new Metadata()); + } + } + + @Test + @Timeout(value = 30, unit = TimeUnit.SECONDS) + void evaluateActionStoresJsonResult() throws Exception { + final EvaluateAction action = new EvaluateAction(); + action.configure( + java.util.Map.of(), + new ObjectMapper().readTree("{\"expressions\":[\"({a:1,b:'two'})\"]}")); + + try (final Playwright pw = Playwright.create(); + final Browser browser = pw.chromium().launch(); + final BrowserContext ctx = browser.newContext(); + final Page page = ctx.newPage()) { + page.navigate(url()); + final Metadata md = new Metadata(); + action.apply(page, url(), new Metadata(), md); + // the value is looked up under the expression text itself — presumably the key used when + // no keyPrefix is configured; confirm against EvaluateAction + final String stored = md.getFirstValue("({a:1,b:'two'})"); 
Assertions.assertNotNull(stored); + // sanity-check that it parses back as JSON containing the right values + final JsonNode parsed = new ObjectMapper().readTree(stored); + Assertions.assertEquals(1, parsed.get("a").asInt()); + Assertions.assertEquals("two", parsed.get("b").asText()); + } + } + + @Test + @Timeout(value = 30, unit = TimeUnit.SECONDS) + void scrollToBottomTerminatesWithoutLazyContent() throws Exception { + // No-lazy fixture (uses the existing dynamic-scraping.html) → height never grows + final ScrollToBottomAction action = new ScrollToBottomAction(); + action.configure( + java.util.Map.of(), + new ObjectMapper().readTree("{\"waitMs\":50,\"maxSteps\":3,\"maxDurationMs\":2000}")); + + try (final Playwright pw = Playwright.create(); + final Browser browser = pw.chromium().launch(); + final BrowserContext ctx = browser.newContext(); + final Page page = ctx.newPage()) { + page.navigate("http://localhost:" + HTTP_PORT + "/dynamic-scraping.html"); + // should return cleanly (height stops growing immediately). NOTE(review): the page loaded + // is dynamic-scraping.html but url() is passed as the URL argument — confirm this is intended + action.apply(page, url(), new Metadata(), new Metadata()); + } + } + + @Test + @Timeout(value = 30, unit = TimeUnit.SECONDS) + void expandClickablesIsNoOpWhenSelectorsMatchNothing() throws Exception { + final ExpandClickablesAction action = new ExpandClickablesAction(); + action.configure( + java.util.Map.of(), + new ObjectMapper() + .readTree( + "{\"selectors\":[\".does-not-exist\"],\"root\":\".x\",\"body\":\".y\",\"waitMs\":50}")); + + try (final Playwright pw = Playwright.create(); + final Browser browser = pw.chromium().launch(); + final BrowserContext ctx = browser.newContext(); + final Page page = ctx.newPage()) { + page.navigate(url()); + // no matches → nothing to click → no exception + action.apply(page, url(), new Metadata(), new Metadata()); + } + } + + @Test + @Timeout(value = 30, unit = TimeUnit.SECONDS) + void dismissOverlayHandlesMissingSelectorsSilently() throws Exception { + final DismissOverlayAction action = new 
DismissOverlayAction(); + action.configure( + java.util.Map.of(), + new ObjectMapper() + .readTree( + "{\"selectors\":[\"#nope\"],\"removeSelectors\":[\".also-nope\"]}")); + + try (final Playwright pw = Playwright.create(); + final Browser browser = pw.chromium().launch(); + final BrowserContext ctx = browser.newContext(); + final Page page = ctx.newPage()) { + page.navigate(url()); + // missing elements are skipped, not raised + action.apply(page, url(), new Metadata(), new Metadata()); + } + } + + @Test + @Timeout(value = 30, unit = TimeUnit.SECONDS) + void screenshotActionCapturesPng() throws Exception { + final ScreenshotAction action = new ScreenshotAction(); + action.configure( + java.util.Map.of(), + new ObjectMapper().readTree("{\"type\":\"png\",\"fullPage\":true}")); + + try (final Playwright pw = Playwright.create(); + final Browser browser = pw.chromium().launch(); + final BrowserContext ctx = browser.newContext(); + final Page page = ctx.newPage()) { + page.navigate(url()); + final Metadata md = new Metadata(); + action.apply(page, url(), new Metadata(), md); + // no metadataKey is configured above, so the value is read from the default key and must + // decode from Base64 to a non-empty byte array. NOTE(review): the bytes are not checked + // for a PNG signature + final String shot = md.getFirstValue(ScreenshotAction.DEFAULT_METADATA_KEY); + Assertions.assertNotNull(shot); + final byte[] decoded = Base64.getDecoder().decode(shot); + Assertions.assertTrue(decoded.length > 0); + } + } +} diff --git a/external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/PageActionsTest.java b/external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/PageActionsTest.java new file mode 100644 index 000000000..db66110ec --- /dev/null +++ b/external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/PageActionsTest.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.stormcrawler.protocol.playwright; + +import java.util.Map; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +/** Tests for building a {@code PageActions} chain via {@code fromConf}; no browser is started. */ +class PageActionsTest { + + @Test + void emptyConfigReturnsEmptyChain() { + final PageActions actions = PageActions.fromConf(Map.of()); + Assertions.assertSame(PageActions.emptyPageActions, actions); + Assertions.assertEquals(0, actions.size()); + } + + @Test + void blankConfigPathReturnsEmptyChain() { + final PageActions actions = PageActions.fromConf(Map.of(PageActions.CONFIG_KEY, " ")); + Assertions.assertSame(PageActions.emptyPageActions, actions); + } + + @Test + void emptyJsonChainHasZeroActions() { + final PageActions actions = + PageActions.fromConf( + Map.of(PageActions.CONFIG_KEY, "page-actions.empty.json")); + Assertions.assertEquals(0, actions.size()); + } + + @Test + void singleActionChainLoads() { + final PageActions actions = + PageActions.fromConf( + Map.of(PageActions.CONFIG_KEY, "page-actions.single.json")); + Assertions.assertEquals(1, actions.size()); + } + + @Test + void multiActionChainLoadsInOrder() { + final PageActions actions = + PageActions.fromConf( + Map.of(PageActions.CONFIG_KEY, "page-actions.chain.json")); + // NOTE(review): only the number of loaded actions is asserted; their order is not checked + Assertions.assertEquals(4, actions.size()); + } + + @Test + void missingConfigFileRaises() { + Assertions.assertThrows( + RuntimeException.class, + () -> + PageActions.fromConf( 
Map.of(PageActions.CONFIG_KEY, "page-actions.does-not-exist.json"))); + } + + @Test + void invalidActionParamsRaiseAtLoadTime() { + // ExpandClickablesAction throws if root/body are missing — must propagate from configure() + Assertions.assertThrows( + RuntimeException.class, + () -> + PageActions.fromConf( + Map.of(PageActions.CONFIG_KEY, "page-actions.invalid.json"))); + } + + @Test + void emptyChainApplyIsNoOp() { + // apply() is not exercised here: it needs a real Page, which in turn needs a browser. + // The test therefore only calls cleanup() on the shared empty chain (it must not throw) + // and checks that the chain reports zero actions. + PageActions.emptyPageActions.cleanup(); + Assertions.assertEquals(0, PageActions.emptyPageActions.size()); + } +} diff --git a/external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/actions/ActionConfigureTest.java b/external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/actions/ActionConfigureTest.java new file mode 100644 index 000000000..8c8184d4a --- /dev/null +++ b/external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/actions/ActionConfigureTest.java @@ -0,0 +1,173 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.stormcrawler.protocol.playwright.actions; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import java.util.Map; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +/** + * Pure unit tests for the configure() side of each built-in {@code PageAction}. They do not need a + * browser so they always run. Tests named {@code ...Accepts...} contain no assertions: they pass + * when configure() returns without throwing. + */ +class ActionConfigureTest { + + private static final ObjectMapper MAPPER = new ObjectMapper(); + /** Empty configuration map passed as the first argument of every configure() call. */ + private static final Map EMPTY_CONF = Map.of(); + + /** Parses a JSON literal into the params node handed to configure(). */ + private static JsonNode params(final String json) throws Exception { + return MAPPER.readTree(json); + } + + // --- ExpandClickablesAction --- + + @Test + void expandClickablesRequiresRootAndBody() throws Exception { + final ExpandClickablesAction action = new ExpandClickablesAction(); + Assertions.assertThrows( + IllegalArgumentException.class, + () -> action.configure(EMPTY_CONF, params("{\"selectors\":[\".x\"]}"))); + } + + @Test + void expandClickablesAcceptsValidParams() throws Exception { + final ExpandClickablesAction action = new ExpandClickablesAction(); + action.configure( + EMPTY_CONF, + params( + "{\"selectors\":[\".tab .header\"],\"root\":\".tab\",\"body\":\".tab-body\",\"waitMs\":50}")); + } + + // --- EvaluateAction --- + + @Test + void evaluateRequiresNonEmptyExpressions() throws Exception { + final EvaluateAction action = new EvaluateAction(); + // both a missing 'expressions' key and an empty 'expressions' array must be rejected + Assertions.assertThrows( + IllegalArgumentException.class, () -> action.configure(EMPTY_CONF, params("{}"))); + Assertions.assertThrows( + IllegalArgumentException.class, + () -> action.configure(EMPTY_CONF, params("{\"expressions\":[]}"))); + } + + @Test + void evaluateAcceptsExpressions() throws Exception { + final EvaluateAction action = new EvaluateAction(); + action.configure( + EMPTY_CONF, params("{\"expressions\":[\"window.location.href\"],\"keyPrefix\":\"e\"}")); + } + + // --- ScrollToBottomAction --- + + @Test + void 
scrollToBottomAcceptsEmptyConfig() throws Exception { + final ScrollToBottomAction action = new ScrollToBottomAction(); + action.configure(EMPTY_CONF, params("{}")); + } + + @Test + void scrollToBottomAcceptsOverrides() throws Exception { + final ScrollToBottomAction action = new ScrollToBottomAction(); + action.configure( + EMPTY_CONF, + params("{\"waitMs\":100,\"maxSteps\":5,\"maxDurationMs\":2000}")); + } + + // --- WaitForSelectorAction --- + + @Test + void waitForSelectorRequiresSelector() throws Exception { + final WaitForSelectorAction action = new WaitForSelectorAction(); + Assertions.assertThrows( + IllegalArgumentException.class, () -> action.configure(EMPTY_CONF, params("{}"))); + } + + @Test + void waitForSelectorRejectsUnknownState() throws Exception { + final WaitForSelectorAction action = new WaitForSelectorAction(); + Assertions.assertThrows( + IllegalArgumentException.class, + () -> + action.configure( + EMPTY_CONF, + params("{\"selector\":\"#x\",\"state\":\"sideways\"}"))); + } + + @Test + void waitForSelectorAcceptsAllValidStates() throws Exception { + // configures a fresh action for each lower-case state name; any exception fails the test + for (final String state : new String[] {"attached", "detached", "visible", "hidden"}) { + final WaitForSelectorAction action = new WaitForSelectorAction(); + action.configure( + EMPTY_CONF, + params( + "{\"selector\":\"#x\",\"state\":\"" + + state + + "\",\"timeoutMs\":250,\"required\":true}")); + } + } + + // --- DismissOverlayAction --- + + @Test + void dismissOverlayRequiresAtLeastOneList() throws Exception { + final DismissOverlayAction action = new DismissOverlayAction(); + Assertions.assertThrows( + IllegalArgumentException.class, () -> action.configure(EMPTY_CONF, params("{}"))); + } + + @Test + void dismissOverlayAcceptsClickList() throws Exception { + final DismissOverlayAction action = new DismissOverlayAction(); + action.configure(EMPTY_CONF, params("{\"selectors\":[\"#cookie-accept\"]}")); + } + + @Test + void dismissOverlayAcceptsRemoveListOnly() throws Exception { + final 
DismissOverlayAction action = new DismissOverlayAction(); + action.configure(EMPTY_CONF, params("{\"removeSelectors\":[\".paywall\"]}")); + } + + // --- ScreenshotAction --- + + @Test + void screenshotAcceptsEmptyConfig() throws Exception { + final ScreenshotAction action = new ScreenshotAction(); + action.configure(EMPTY_CONF, params("{}")); + } + + @Test + void screenshotRejectsUnknownType() throws Exception { + final ScreenshotAction action = new ScreenshotAction(); + // "webp" is not one of the accepted type names, so configure() must reject it + Assertions.assertThrows( + IllegalArgumentException.class, + () -> action.configure(EMPTY_CONF, params("{\"type\":\"webp\"}"))); + } + + @Test + void screenshotAcceptsPngAndJpeg() throws Exception { + final ScreenshotAction png = new ScreenshotAction(); + png.configure(EMPTY_CONF, params("{\"type\":\"png\",\"fullPage\":true}")); + + final ScreenshotAction jpeg = new ScreenshotAction(); + jpeg.configure( + EMPTY_CONF, + params( + "{\"type\":\"jpeg\",\"quality\":80,\"metadataKey\":\"my.shot\"}")); + } +} diff --git a/external/playwright/src/test/resources/page-actions-fixture.html b/external/playwright/src/test/resources/page-actions-fixture.html new file mode 100644 index 000000000..499d154e7 --- /dev/null +++ b/external/playwright/src/test/resources/page-actions-fixture.html @@ -0,0 +1,64 @@ + + + + + StormCrawler PageActions Fixture + + + +
PAYWALL_CONTENT_REMOVED
+ +
+ + + +
INITIAL_BODY
+
+ +
+ + + + diff --git a/external/playwright/src/test/resources/page-actions.chain.json b/external/playwright/src/test/resources/page-actions.chain.json new file mode 100644 index 000000000..4a06ab233 --- /dev/null +++ b/external/playwright/src/test/resources/page-actions.chain.json @@ -0,0 +1,24 @@ +{ + "org.apache.stormcrawler.protocol.playwright.PageActions": [ + { + "class": "org.apache.stormcrawler.protocol.playwright.actions.DismissOverlayAction", + "name": "cookies", + "params": { "selectors": ["#cookie-accept"] } + }, + { + "class": "org.apache.stormcrawler.protocol.playwright.actions.ScrollToBottomAction", + "name": "scroll", + "params": { "waitMs": 50, "maxSteps": 3, "maxDurationMs": 500 } + }, + { + "class": "org.apache.stormcrawler.protocol.playwright.actions.EvaluateAction", + "name": "perf", + "params": { "expressions": ["JSON.stringify(performance.timing)"] } + }, + { + "class": "org.apache.stormcrawler.protocol.playwright.actions.ScreenshotAction", + "name": "shot", + "params": { "fullPage": true, "type": "png" } + } + ] +} diff --git a/external/playwright/src/test/resources/page-actions.empty.json b/external/playwright/src/test/resources/page-actions.empty.json new file mode 100644 index 000000000..0d49d678e --- /dev/null +++ b/external/playwright/src/test/resources/page-actions.empty.json @@ -0,0 +1,3 @@ +{ + "org.apache.stormcrawler.protocol.playwright.PageActions": [] +} diff --git a/external/playwright/src/test/resources/page-actions.invalid.json b/external/playwright/src/test/resources/page-actions.invalid.json new file mode 100644 index 000000000..a2285781c --- /dev/null +++ b/external/playwright/src/test/resources/page-actions.invalid.json @@ -0,0 +1,9 @@ +{ + "org.apache.stormcrawler.protocol.playwright.PageActions": [ + { + "class": "org.apache.stormcrawler.protocol.playwright.actions.ExpandClickablesAction", + "name": "missing-root-and-body", + "params": { "selectors": [".tab"] } + } + ] +} diff --git 
a/external/playwright/src/test/resources/page-actions.live.json b/external/playwright/src/test/resources/page-actions.live.json new file mode 100644 index 000000000..f1c17066d --- /dev/null +++ b/external/playwright/src/test/resources/page-actions.live.json @@ -0,0 +1,38 @@ +{ + "org.apache.stormcrawler.protocol.playwright.PageActions": [ + { + "class": "org.apache.stormcrawler.protocol.playwright.actions.DismissOverlayAction", + "name": "cookies-and-paywall", + "params": { + "selectors": ["#cookie-accept"], + "removeSelectors": [".paywall"] + } + }, + { + "class": "org.apache.stormcrawler.protocol.playwright.actions.ExpandClickablesAction", + "name": "tabs", + "params": { + "selectors": [".tab-widget .tab-header"], + "root": ".tab-widget", + "body": ".tab-widget-body", + "waitMs": 50, + "clickTimeoutMs": 2000 + } + }, + { + "class": "org.apache.stormcrawler.protocol.playwright.actions.ScrollToBottomAction", + "name": "scroll", + "params": { "waitMs": 200, "maxSteps": 5, "maxDurationMs": 5000 } + }, + { + "class": "org.apache.stormcrawler.protocol.playwright.actions.EvaluateAction", + "name": "title", + "params": { "expressions": ["document.title"] } + }, + { + "class": "org.apache.stormcrawler.protocol.playwright.actions.ScreenshotAction", + "name": "shot", + "params": { "fullPage": false, "type": "png", "metadataKey": "test.screenshot" } + } + ] +} diff --git a/external/playwright/src/test/resources/page-actions.single.json b/external/playwright/src/test/resources/page-actions.single.json new file mode 100644 index 000000000..32c1a77a6 --- /dev/null +++ b/external/playwright/src/test/resources/page-actions.single.json @@ -0,0 +1,14 @@ +{ + "org.apache.stormcrawler.protocol.playwright.PageActions": [ + { + "class": "org.apache.stormcrawler.protocol.playwright.actions.ExpandClickablesAction", + "name": "tabs", + "params": { + "selectors": [".tab .header"], + "root": ".tab", + "body": ".tab-body", + "waitMs": 50 + } + } + ] +} From 
c0a162fa06d64363ab230de6949d4974e9998f98 Mon Sep 17 00:00:00 2001 From: Richard Zowalla Date: Mon, 4 May 2026 21:00:58 +0200 Subject: [PATCH 2/4] style(playwright): apply checkstyle braces and license-line spacing --- .../protocol/playwright/PageAction.java | 1 + .../protocol/playwright/PageActions.java | 1 + .../actions/DismissOverlayAction.java | 17 ++++++++++++---- .../playwright/actions/EvaluateAction.java | 13 +++++++++--- .../actions/ExpandClickablesAction.java | 16 +++++++++++---- .../playwright/actions/ScreenshotAction.java | 17 ++++++++++++---- .../actions/ScrollToBottomAction.java | 20 ++++++++++++++----- .../actions/WaitForSelectorAction.java | 17 ++++++++++++---- .../playwright/PageActionsLiveTest.java | 3 ++- .../protocol/playwright/PageActionsTest.java | 1 + .../actions/ActionConfigureTest.java | 1 + 11 files changed, 82 insertions(+), 25 deletions(-) diff --git a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/PageAction.java b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/PageAction.java index 8acbcd47b..efc12c872 100644 --- a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/PageAction.java +++ b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/PageAction.java @@ -14,6 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.stormcrawler.protocol.playwright; import com.microsoft.playwright.Page; diff --git a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/PageActions.java b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/PageActions.java index 23e840a0c..235710cde 100644 --- a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/PageActions.java +++ b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/PageActions.java @@ -14,6 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.stormcrawler.protocol.playwright; import com.fasterxml.jackson.databind.JsonNode; diff --git a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/DismissOverlayAction.java b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/DismissOverlayAction.java index e1ba2cc48..494dfe4fd 100644 --- a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/DismissOverlayAction.java +++ b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/DismissOverlayAction.java @@ -14,6 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.stormcrawler.protocol.playwright.actions; import com.fasterxml.jackson.databind.JsonNode; @@ -56,10 +57,14 @@ public class DismissOverlayAction extends PageAction { @Override public void configure( @NotNull final Map stormConf, @NotNull final JsonNode params) { - if (params == null || params.isMissingNode() || params.isNull()) return; + if (params == null || params.isMissingNode() || params.isNull()) { + return; + } this.selectors = readStringArray(params, "selectors"); this.removeSelectors = readStringArray(params, "removeSelectors"); - if (params.has("timeoutMs")) this.timeoutMs = params.get("timeoutMs").asInt(this.timeoutMs); + if (params.has("timeoutMs")) { + this.timeoutMs = params.get("timeoutMs").asInt(this.timeoutMs); + } if (selectors.isEmpty() && removeSelectors.isEmpty()) { throw new IllegalArgumentException( "DismissOverlayAction requires non-empty 'selectors' or 'removeSelectors'"); @@ -75,7 +80,9 @@ public void apply( for (final String selector : selectors) { try { final ElementHandle handle = page.querySelector(selector); - if (handle == null) continue; + if (handle == null) { + continue; + } handle.click(new ElementHandle.ClickOptions().setTimeout(timeoutMs)); } catch (final Exception e) { LOG.debug("Could not click overlay {} on {}: {}", selector, url, e.getMessage()); @@ -94,7 +101,9 @@ public void apply( private static List readStringArray(final JsonNode params, final String key) { final JsonNode node = params.get(key); - if (node == null || !node.isArray()) return List.of(); + if (node == null || !node.isArray()) { + return List.of(); + } final List list = new ArrayList<>(node.size()); node.forEach(n -> list.add(n.asText())); return list; diff --git a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/EvaluateAction.java b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/EvaluateAction.java index 7b200f6f4..cfd104c72 100644 --- 
a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/EvaluateAction.java +++ b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/EvaluateAction.java @@ -14,6 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.stormcrawler.protocol.playwright.actions; import com.fasterxml.jackson.core.JsonProcessingException; @@ -53,14 +54,18 @@ public class EvaluateAction extends PageAction { @Override public void configure( @NotNull final Map stormConf, @NotNull final JsonNode params) { - if (params == null || params.isMissingNode() || params.isNull()) return; + if (params == null || params.isMissingNode() || params.isNull()) { + return; + } final JsonNode exprs = params.get("expressions"); if (exprs != null && exprs.isArray()) { final List list = new ArrayList<>(exprs.size()); exprs.forEach(n -> list.add(n.asText())); this.expressions = list; } - if (params.has("keyPrefix")) this.keyPrefix = params.get("keyPrefix").asText(); + if (params.has("keyPrefix")) { + this.keyPrefix = params.get("keyPrefix").asText(); + } if (expressions.isEmpty()) { throw new IllegalArgumentException("EvaluateAction requires non-empty 'expressions'"); } @@ -76,7 +81,9 @@ public void apply( final String expression = expressions.get(i); try { final Object result = page.evaluate(expression); - if (result == null) continue; + if (result == null) { + continue; + } final String json = mapper.writerWithDefaultPrettyPrinter().writeValueAsString(result); final String key = keyPrefix == null ? 
expression : keyPrefix + i; diff --git a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ExpandClickablesAction.java b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ExpandClickablesAction.java index a73fb108a..77bf82851 100644 --- a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ExpandClickablesAction.java +++ b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ExpandClickablesAction.java @@ -14,6 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.stormcrawler.protocol.playwright.actions; import com.fasterxml.jackson.databind.JsonNode; @@ -93,11 +94,18 @@ public void configure( sels.forEach(n -> list.add(n.asText())); this.selectors = list; } - if (params.has("root")) this.rootSelector = params.get("root").asText(); - if (params.has("body")) this.bodySelector = params.get("body").asText(); - if (params.has("waitMs")) this.waitMs = params.get("waitMs").asInt(this.waitMs); - if (params.has("clickTimeoutMs")) + if (params.has("root")) { + this.rootSelector = params.get("root").asText(); + } + if (params.has("body")) { + this.bodySelector = params.get("body").asText(); + } + if (params.has("waitMs")) { + this.waitMs = params.get("waitMs").asInt(this.waitMs); + } + if (params.has("clickTimeoutMs")) { this.clickTimeoutMs = params.get("clickTimeoutMs").asInt(this.clickTimeoutMs); + } if (rootSelector == null || bodySelector == null) { throw new IllegalArgumentException( diff --git a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ScreenshotAction.java b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ScreenshotAction.java index e8902baa9..5ca843e9e 100644 --- 
a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ScreenshotAction.java +++ b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ScreenshotAction.java @@ -14,6 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.stormcrawler.protocol.playwright.actions; import com.fasterxml.jackson.databind.JsonNode; @@ -55,9 +56,15 @@ public class ScreenshotAction extends PageAction { @Override public void configure( @NotNull final Map stormConf, @NotNull final JsonNode params) { - if (params == null || params.isMissingNode() || params.isNull()) return; - if (params.has("metadataKey")) this.metadataKey = params.get("metadataKey").asText(); - if (params.has("fullPage")) this.fullPage = params.get("fullPage").asBoolean(false); + if (params == null || params.isMissingNode() || params.isNull()) { + return; + } + if (params.has("metadataKey")) { + this.metadataKey = params.get("metadataKey").asText(); + } + if (params.has("fullPage")) { + this.fullPage = params.get("fullPage").asBoolean(false); + } if (params.has("type")) { final String t = params.get("type").asText().toLowerCase(); switch (t) { @@ -73,7 +80,9 @@ public void configure( "Unknown screenshot type '" + t + "' (expected png or jpeg)"); } } - if (params.has("quality")) this.quality = params.get("quality").asInt(); + if (params.has("quality")) { + this.quality = params.get("quality").asInt(); + } } @Override diff --git a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ScrollToBottomAction.java b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ScrollToBottomAction.java index 56d28c8de..97d407d5a 100644 --- a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ScrollToBottomAction.java +++ 
b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ScrollToBottomAction.java @@ -14,6 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.stormcrawler.protocol.playwright.actions; import com.fasterxml.jackson.databind.JsonNode; @@ -52,11 +53,18 @@ public class ScrollToBottomAction extends PageAction { @Override public void configure( @NotNull final Map stormConf, @NotNull final JsonNode params) { - if (params == null || params.isMissingNode() || params.isNull()) return; - if (params.has("waitMs")) this.waitMs = params.get("waitMs").asInt(this.waitMs); - if (params.has("maxSteps")) this.maxSteps = params.get("maxSteps").asInt(this.maxSteps); - if (params.has("maxDurationMs")) + if (params == null || params.isMissingNode() || params.isNull()) { + return; + } + if (params.has("waitMs")) { + this.waitMs = params.get("waitMs").asInt(this.waitMs); + } + if (params.has("maxSteps")) { + this.maxSteps = params.get("maxSteps").asInt(this.maxSteps); + } + if (params.has("maxDurationMs")) { this.maxDurationMs = params.get("maxDurationMs").asInt(this.maxDurationMs); + } } @Override @@ -70,7 +78,9 @@ public void apply( int steps = 0; while (steps < maxSteps && System.currentTimeMillis() < deadline) { final long height = ((Number) page.evaluate(HEIGHT_JS)).longValue(); - if (height == previousHeight) break; + if (height == previousHeight) { + break; + } previousHeight = height; page.evaluate(SCROLL_JS); page.waitForTimeout(waitMs); diff --git a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/WaitForSelectorAction.java b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/WaitForSelectorAction.java index e78b5cf4e..957bc59f3 100644 --- a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/WaitForSelectorAction.java +++ 
b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/WaitForSelectorAction.java @@ -14,6 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.stormcrawler.protocol.playwright.actions; import com.fasterxml.jackson.databind.JsonNode; @@ -57,9 +58,15 @@ public void configure( if (params == null || params.isMissingNode() || params.isNull()) { throw new IllegalArgumentException("WaitForSelectorAction requires 'selector'"); } - if (params.has("selector")) this.selector = params.get("selector").asText(); - if (params.has("timeoutMs")) this.timeoutMs = params.get("timeoutMs").asInt(this.timeoutMs); - if (params.has("required")) this.required = params.get("required").asBoolean(false); + if (params.has("selector")) { + this.selector = params.get("selector").asText(); + } + if (params.has("timeoutMs")) { + this.timeoutMs = params.get("timeoutMs").asInt(this.timeoutMs); + } + if (params.has("required")) { + this.required = params.get("required").asBoolean(false); + } if (params.has("state")) { final String s = params.get("state").asText().toUpperCase(); switch (s) { @@ -99,7 +106,9 @@ public void apply( selector, new Page.WaitForSelectorOptions().setState(state).setTimeout(timeoutMs)); } catch (final Exception e) { - if (required) throw e; + if (required) { + throw e; + } LOG.debug( "Selector {} did not reach state {} within {}ms on {}: {}", selector, diff --git a/external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/PageActionsLiveTest.java b/external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/PageActionsLiveTest.java index b8ddfd506..5c1ebe652 100644 --- a/external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/PageActionsLiveTest.java +++ b/external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/PageActionsLiveTest.java @@ -14,16 +14,17 @@ * See the License for the 
specific language governing permissions and * limitations under the License. */ + package org.apache.stormcrawler.protocol.playwright; import static org.junit.jupiter.api.Assumptions.assumeTrue; +import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import com.microsoft.playwright.Browser; import com.microsoft.playwright.BrowserContext; import com.microsoft.playwright.Page; import com.microsoft.playwright.Playwright; -import com.fasterxml.jackson.databind.JsonNode; import java.nio.charset.StandardCharsets; import java.util.Base64; import java.util.concurrent.TimeUnit; diff --git a/external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/PageActionsTest.java b/external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/PageActionsTest.java index db66110ec..ce9ddd1b9 100644 --- a/external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/PageActionsTest.java +++ b/external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/PageActionsTest.java @@ -14,6 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.stormcrawler.protocol.playwright; import java.util.Map; diff --git a/external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/actions/ActionConfigureTest.java b/external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/actions/ActionConfigureTest.java index 8c8184d4a..22e4e0114 100644 --- a/external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/actions/ActionConfigureTest.java +++ b/external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/actions/ActionConfigureTest.java @@ -14,6 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.stormcrawler.protocol.playwright.actions; import com.fasterxml.jackson.databind.JsonNode; From b6017fad38354a762ff55daf8c731c533c0dc3b6 Mon Sep 17 00:00:00 2001 From: Richard Zowalla Date: Mon, 4 May 2026 21:03:15 +0200 Subject: [PATCH 3/4] style(playwright): apply google-java-format to PageAction chain --- .../protocol/playwright/PageAction.java | 8 +++---- .../protocol/playwright/PageActions.java | 6 +++-- .../actions/DismissOverlayAction.java | 12 +++++----- .../actions/ExpandClickablesAction.java | 12 +++++----- .../playwright/actions/ScreenshotAction.java | 4 ++-- .../actions/WaitForSelectorAction.java | 8 +++---- .../playwright/PageActionsLiveTest.java | 24 +++++++++---------- .../protocol/playwright/PageActionsTest.java | 13 +++++----- .../actions/ActionConfigureTest.java | 9 ++++--- 9 files changed, 47 insertions(+), 49 deletions(-) diff --git a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/PageAction.java b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/PageAction.java index efc12c872..7b5b73039 100644 --- a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/PageAction.java +++ b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/PageAction.java @@ -29,8 +29,8 @@ * makes are reflected in the rendered content returned by the protocol. * *

Actions are loaded as an ordered chain via {@link PageActions} from a JSON file referenced by - * the {@code playwright.page.actions.config.file} configuration key. They follow the same - * {@link org.apache.stormcrawler.util.Configurable} pattern as URL/parse filters. + * the {@code playwright.page.actions.config.file} configuration key. They follow the same {@link + * org.apache.stormcrawler.util.Configurable} pattern as URL/parse filters. */ public abstract class PageAction extends AbstractConfigurable { @@ -49,9 +49,7 @@ public abstract void apply( @NotNull final Metadata responseMetadata) throws Exception; - /** - * Release any resources held by the action. See {@link IBolt#cleanup()} for more details. - */ + /** Release any resources held by the action. See {@link IBolt#cleanup()} for more details. */ public void cleanup() { // nothing to do here } diff --git a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/PageActions.java b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/PageActions.java index 235710cde..12ebde54d 100644 --- a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/PageActions.java +++ b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/PageActions.java @@ -109,8 +109,10 @@ public void loadJSONResources(final InputStream inputStream) throws IOException actions = list.toArray(new PageAction[0]); } - /** Run every action in order. Failures are logged and swallowed so one bad action cannot - * abort the rest of the chain. */ + /** + * Run every action in order. Failures are logged and swallowed so one bad action cannot abort + * the rest of the chain. 
+ */ public void apply( @NotNull final Page page, @NotNull final String url, diff --git a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/DismissOverlayAction.java b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/DismissOverlayAction.java index 494dfe4fd..fa6f22fe9 100644 --- a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/DismissOverlayAction.java +++ b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/DismissOverlayAction.java @@ -30,18 +30,18 @@ /** * Dismisses cookie banners, GDPR walls, paywalls, newsletter modals, etc. by clicking the first - * matching element of each configured selector. Each click is independently bounded by - * {@code timeoutMs}; missing elements and click failures are silently skipped, so it is safe to - * pass an over-broad set of fallback selectors. + * matching element of each configured selector. Each click is independently bounded by {@code + * timeoutMs}; missing elements and click failures are silently skipped, so it is safe to pass an + * over-broad set of fallback selectors. * *

Parameters

* *
    *
  • {@code selectors} (required, array of strings) *
  • {@code timeoutMs} (optional, int, default 1500): per-click timeout - *
  • {@code removeSelectors} (optional, array of strings): elements matching these selectors - * are removed from the DOM after the clicks (useful for sticky overlays that don't have a - * close button) + *
  • {@code removeSelectors} (optional, array of strings): elements matching these selectors are + * removed from the DOM after the clicks (useful for sticky overlays that don't have a close + * button) *
*/ public class DismissOverlayAction extends PageAction { diff --git a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ExpandClickablesAction.java b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ExpandClickablesAction.java index 77bf82851..fb6b31991 100644 --- a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ExpandClickablesAction.java +++ b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ExpandClickablesAction.java @@ -30,10 +30,10 @@ import org.slf4j.LoggerFactory; /** - * Clicks every element matching a list of selectors and, after each click, clones the rendered - * body container into a hidden cache under the same widget root. After the action runs, - * {@link Page#content()} contains the HTML of every panel a tab/accordion would normally hide - * behind user interaction — useful for SPAs whose visible markup depends on the active tab. + * Clicks every element matching a list of selectors and, after each click, clones the rendered body + * container into a hidden cache under the same widget root. After the action runs, {@link + * Page#content()} contains the HTML of every panel a tab/accordion would normally hide behind user + * interaction — useful for SPAs whose visible markup depends on the active tab. * *

Anchor elements with an {@code href} are skipped to avoid following links. * @@ -41,8 +41,8 @@ * *

    *
  • {@code selectors} (required, array): selectors whose matches will be clicked - *
  • {@code root} (required, string): selector for the widget root containing both the - * clickable and its body + *
  • {@code root} (required, string): selector for the widget root containing both the clickable + * and its body *
  • {@code body} (required, string): selector for the body container that should be cached *
  • {@code waitMs} (optional, int, default 200): time to wait after each click before caching *
  • {@code clickTimeoutMs} (optional, int, default 2000): per-click timeout diff --git a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ScreenshotAction.java b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ScreenshotAction.java index 5ca843e9e..a24c8ea5a 100644 --- a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ScreenshotAction.java +++ b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ScreenshotAction.java @@ -35,8 +35,8 @@ *

    Parameters

    * *
      - *
    • {@code metadataKey} (optional, string, default {@code playwright.screenshot}): metadata - * key under which the base64 string is stored + *
    • {@code metadataKey} (optional, string, default {@code playwright.screenshot}): metadata key + * under which the base64 string is stored *
    • {@code fullPage} (optional, bool, default false): capture the entire scrollable page *
    • {@code type} (optional, string, default {@code png}): {@code png} or {@code jpeg} *
    • {@code quality} (optional, int, 0-100): only honoured for {@code jpeg} diff --git a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/WaitForSelectorAction.java b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/WaitForSelectorAction.java index 957bc59f3..2a07f2c16 100644 --- a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/WaitForSelectorAction.java +++ b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/WaitForSelectorAction.java @@ -35,11 +35,11 @@ * *
        *
      • {@code selector} (required, string) - *
      • {@code state} (optional, string, default {@code visible}): one of {@code attached}, - * {@code detached}, {@code visible}, {@code hidden} + *
      • {@code state} (optional, string, default {@code visible}): one of {@code attached}, {@code + * detached}, {@code visible}, {@code hidden} *
      • {@code timeoutMs} (optional, int, default 5000) - *
      • {@code required} (optional, bool, default false): if true, a timeout aborts the action - * (and is logged and swallowed by the chain wrapper) + *
      • {@code required} (optional, bool, default false): if true, a timeout aborts the action (and + * is logged and swallowed by the chain wrapper) *
      */ public class WaitForSelectorAction extends PageAction { diff --git a/external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/PageActionsLiveTest.java b/external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/PageActionsLiveTest.java index 5c1ebe652..38d096884 100644 --- a/external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/PageActionsLiveTest.java +++ b/external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/PageActionsLiveTest.java @@ -102,12 +102,9 @@ void chainAppliesAllActions() throws Exception { "DismissOverlayAction click should have triggered the overlay removal"); // ExpandClickablesAction: every tab body should now be cached under the widget root - Assertions.assertTrue( - content.contains("CONTENT_TAB1"), "tab1 body should be cached"); - Assertions.assertTrue( - content.contains("CONTENT_TAB2"), "tab2 body should be cached"); - Assertions.assertTrue( - content.contains("CONTENT_TAB3"), "tab3 body should be cached"); + Assertions.assertTrue(content.contains("CONTENT_TAB1"), "tab1 body should be cached"); + Assertions.assertTrue(content.contains("CONTENT_TAB2"), "tab2 body should be cached"); + Assertions.assertTrue(content.contains("CONTENT_TAB3"), "tab3 body should be cached"); Assertions.assertTrue( content.contains("__sc_cache"), "hidden cache element should be present"); @@ -117,8 +114,7 @@ void chainAppliesAllActions() throws Exception { "ScrollToBottomAction should have triggered lazy loading"); // EvaluateAction: title is JSON-serialised under the expression key - final String title = - response.getMetadata().getFirstValue("document.title"); + final String title = response.getMetadata().getFirstValue("document.title"); Assertions.assertNotNull(title); Assertions.assertTrue( title.contains("StormCrawler PageActions Fixture"), @@ -175,8 +171,10 @@ void chainSwallowsActionFailures() throws Exception { } } - /** Drives a single action against a live 
page bypassing the protocol — useful for failure - * paths that the chain wrapper otherwise swallows. */ + /** + * Drives a single action against a live page bypassing the protocol — useful for failure paths + * that the chain wrapper otherwise swallows. + */ @Test @Timeout(value = 30, unit = TimeUnit.SECONDS) void waitForSelectorRequiredPropagates() throws Exception { @@ -204,7 +202,8 @@ void waitForSelectorSoftTimeoutReturnsCleanly() throws Exception { action.configure( java.util.Map.of(), new ObjectMapper() - .readTree("{\"selector\":\"#never\",\"timeoutMs\":250,\"required\":false}")); + .readTree( + "{\"selector\":\"#never\",\"timeoutMs\":250,\"required\":false}")); try (final Playwright pw = Playwright.create(); final Browser browser = pw.chromium().launch(); @@ -247,7 +246,8 @@ void scrollToBottomTerminatesWithoutLazyContent() throws Exception { final ScrollToBottomAction action = new ScrollToBottomAction(); action.configure( java.util.Map.of(), - new ObjectMapper().readTree("{\"waitMs\":50,\"maxSteps\":3,\"maxDurationMs\":2000}")); + new ObjectMapper() + .readTree("{\"waitMs\":50,\"maxSteps\":3,\"maxDurationMs\":2000}")); try (final Playwright pw = Playwright.create(); final Browser browser = pw.chromium().launch(); diff --git a/external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/PageActionsTest.java b/external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/PageActionsTest.java index ce9ddd1b9..718e21948 100644 --- a/external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/PageActionsTest.java +++ b/external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/PageActionsTest.java @@ -39,24 +39,21 @@ void blankConfigPathReturnsEmptyChain() { @Test void emptyJsonChainHasZeroActions() { final PageActions actions = - PageActions.fromConf( - Map.of(PageActions.CONFIG_KEY, "page-actions.empty.json")); + PageActions.fromConf(Map.of(PageActions.CONFIG_KEY, 
"page-actions.empty.json")); Assertions.assertEquals(0, actions.size()); } @Test void singleActionChainLoads() { final PageActions actions = - PageActions.fromConf( - Map.of(PageActions.CONFIG_KEY, "page-actions.single.json")); + PageActions.fromConf(Map.of(PageActions.CONFIG_KEY, "page-actions.single.json")); Assertions.assertEquals(1, actions.size()); } @Test void multiActionChainLoadsInOrder() { final PageActions actions = - PageActions.fromConf( - Map.of(PageActions.CONFIG_KEY, "page-actions.chain.json")); + PageActions.fromConf(Map.of(PageActions.CONFIG_KEY, "page-actions.chain.json")); Assertions.assertEquals(4, actions.size()); } @@ -66,7 +63,9 @@ void missingConfigFileRaises() { RuntimeException.class, () -> PageActions.fromConf( - Map.of(PageActions.CONFIG_KEY, "page-actions.does-not-exist.json"))); + Map.of( + PageActions.CONFIG_KEY, + "page-actions.does-not-exist.json"))); } @Test diff --git a/external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/actions/ActionConfigureTest.java b/external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/actions/ActionConfigureTest.java index 22e4e0114..31d076fcb 100644 --- a/external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/actions/ActionConfigureTest.java +++ b/external/playwright/src/test/java/org/apache/stormcrawler/protocol/playwright/actions/ActionConfigureTest.java @@ -71,7 +71,8 @@ void evaluateRequiresNonEmptyExpressions() throws Exception { void evaluateAcceptsExpressions() throws Exception { final EvaluateAction action = new EvaluateAction(); action.configure( - EMPTY_CONF, params("{\"expressions\":[\"window.location.href\"],\"keyPrefix\":\"e\"}")); + EMPTY_CONF, + params("{\"expressions\":[\"window.location.href\"],\"keyPrefix\":\"e\"}")); } // --- ScrollToBottomAction --- @@ -86,8 +87,7 @@ void scrollToBottomAcceptsEmptyConfig() throws Exception { void scrollToBottomAcceptsOverrides() throws Exception { final ScrollToBottomAction action 
= new ScrollToBottomAction(); action.configure( - EMPTY_CONF, - params("{\"waitMs\":100,\"maxSteps\":5,\"maxDurationMs\":2000}")); + EMPTY_CONF, params("{\"waitMs\":100,\"maxSteps\":5,\"maxDurationMs\":2000}")); } // --- WaitForSelectorAction --- @@ -168,7 +168,6 @@ void screenshotAcceptsPngAndJpeg() throws Exception { final ScreenshotAction jpeg = new ScreenshotAction(); jpeg.configure( EMPTY_CONF, - params( - "{\"type\":\"jpeg\",\"quality\":80,\"metadataKey\":\"my.shot\"}")); + params("{\"type\":\"jpeg\",\"quality\":80,\"metadataKey\":\"my.shot\"}")); } } From cb088437beeb6ddeac3dffdc96155707ca053728 Mon Sep 17 00:00:00 2001 From: Richard Zowalla Date: Mon, 4 May 2026 21:10:18 +0200 Subject: [PATCH 4/4] fix(playwright): use Locale.ROOT for case conversion in actions --- .../protocol/playwright/actions/ScreenshotAction.java | 3 ++- .../protocol/playwright/actions/WaitForSelectorAction.java | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ScreenshotAction.java b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ScreenshotAction.java index a24c8ea5a..58cbccade 100644 --- a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ScreenshotAction.java +++ b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/ScreenshotAction.java @@ -21,6 +21,7 @@ import com.microsoft.playwright.Page; import com.microsoft.playwright.options.ScreenshotType; import java.util.Base64; +import java.util.Locale; import java.util.Map; import org.apache.stormcrawler.Metadata; import org.apache.stormcrawler.protocol.playwright.PageAction; @@ -66,7 +67,7 @@ public void configure( this.fullPage = params.get("fullPage").asBoolean(false); } if (params.has("type")) { - final String t = params.get("type").asText().toLowerCase(); + final String t = 
params.get("type").asText().toLowerCase(Locale.ROOT); switch (t) { case "jpeg": case "jpg": diff --git a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/WaitForSelectorAction.java b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/WaitForSelectorAction.java index 2a07f2c16..00b4aa24b 100644 --- a/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/WaitForSelectorAction.java +++ b/external/playwright/src/main/java/org/apache/stormcrawler/protocol/playwright/actions/WaitForSelectorAction.java @@ -20,6 +20,7 @@ import com.fasterxml.jackson.databind.JsonNode; import com.microsoft.playwright.Page; import com.microsoft.playwright.options.WaitForSelectorState; +import java.util.Locale; import java.util.Map; import org.apache.stormcrawler.Metadata; import org.apache.stormcrawler.protocol.playwright.PageAction; @@ -68,7 +69,7 @@ public void configure( this.required = params.get("required").asBoolean(false); } if (params.has("state")) { - final String s = params.get("state").asText().toUpperCase(); + final String s = params.get("state").asText().toUpperCase(Locale.ROOT); switch (s) { case "ATTACHED": this.state = WaitForSelectorState.ATTACHED;