diff --git a/samples/browser-harness-webscraping/.env.template b/samples/browser-harness-webscraping/.env.template new file mode 100644 index 0000000..7b9c02c --- /dev/null +++ b/samples/browser-harness-webscraping/.env.template @@ -0,0 +1,6 @@ +# Microsoft Playwright Service - Environment Variables +# Copy this file to .env and fill in your values + +# Playwright Service (Required for all samples) +PLAYWRIGHT_SERVICE_URL= +PLAYWRIGHT_SERVICE_ACCESS_TOKEN= \ No newline at end of file diff --git a/samples/browser-harness-webscraping/README.md b/samples/browser-harness-webscraping/README.md new file mode 100644 index 0000000..983dd07 --- /dev/null +++ b/samples/browser-harness-webscraping/README.md @@ -0,0 +1,86 @@ +# Parallel Web Scraping with Browser-Harness + Playwright Workspaces + +This sample demonstrates how to use [browser-harness](https://github.com/browser-use/browser-harness) with [Playwright Workspaces (PWW)](https://aka.ms/pww/docs) to run 10+ parallel remote browser sessions for web scraping, with LiveView for real-time debuggability. + +## Overview + +When you need to scrape data from many pages simultaneously — product prices, inventory levels, competitor catalogs — you need parallel browser sessions. This sample shows how to: + +1. **Connect browser-harness** to PWW's remote CDP endpoint +1. **Spawn 10+ parallel browser sessions** — each with its own isolated browser +1. **Scrape product data** from multiple pages concurrently + +## Prerequisites + +- **Azure subscription** with permissions to create Playwright Workspaces +- **Playwright Workspace** & a **Playwright Service Access Token**. 
[Information on how to create a workspace](https://learn.microsoft.com/en-us/azure/app-testing/playwright-workspaces/quickstart-run-end-to-end-tests?tabs=playwrightcli&pivots=playwright-test-runner) and [how to create an access token](https://learn.microsoft.com/en-us/azure/app-testing/playwright-workspaces/how-to-manage-access-tokens) +- **Python 3.10+** +- **Git** installed + +## Quick Start + +### 1. Install Dependencies + +```bash +pip install -r requirements.txt +``` + +### 2. Set Up Environment Variables + +Copy `.env.template` to `.env` and fill in your values: + +```bash +cp .env.template .env +``` + +Required variables: +``` +PLAYWRIGHT_SERVICE_URL= +PLAYWRIGHT_SERVICE_ACCESS_TOKEN= +``` + + +### Use the setup prompt to set up browser-harness to connect to Playwright Service browsers + +In a coding agent of your choice like Codex/Claude Code, use the following prompt: + +```text +Set up https://github.com/browser-use/browser-harness for me. + +Read `install.md` and follow the steps to install browser-harness and connect it to my Playwright Workspaces remote browsers. + +Get the SERVICE_URL needed for provisioning remote browsers by running the `get_cdp_browsers_endpoint()` function from `playwright_service_client.py`. + +Then update your skill to follow the two-step connection flow for Playwright remote browsers: + +1. HTTP GET the SERVICE_URL (allow 60-90s for the browser to spin up). Parse the JSON response to extract the `sessionUrl` (a wss:// WebSocket URL). +2. Set BU_CDP_WS to the resolved sessionUrl in .env, then restart the daemon ONCE. + +IMPORTANT: + +- Do NOT kill or restart the daemon after the session is connected — the remote browser is destroyed when the WebSocket connection closes. +- Do NOT set shouldRedirect=true; use shouldRedirect=false and manually resolve the sessionUrl. +- The cold start takes 30-90s. Use a generous timeout on the initial HTTP GET. 
+- After connecting, verify with: browser-harness <<'PY'\nprint(page_info())\nPY + +Once connected, confirm with a screenshot that the remote browser is alive. +``` + +#### Start scraping with the power of browser-harness and Playwright Remote Browsers + +Once this is done, you can ask your agent to use browser-harness with Playwright remote browsers to perform web scraping. Use a prompt similar to this: + +```text + +Go to the e-commerce websites Website 1 and Website 2 in India and search for gifts under 500 for 10-year-old kids that are useful, reusable, and not single-use. +Delivery in Bengaluru should be within 3 days. At least 5 pieces of the item should be available. +Create independent Playwright Service remote browser sessions per +website and use one sub-agent per website to browse in parallel using browser-harness. Close each remote session after scraping. + +``` + +## More Resources + +- [Playwright Workspaces Documentation](https://aka.ms/pww/docs) +- [Browser-Harness GitHub](https://github.com/browser-use/browser-harness) +- [PWW Pricing](https://aka.ms/pww/pricing) diff --git a/samples/browser-harness-webscraping/playwright_service_client.py b/samples/browser-harness-webscraping/playwright_service_client.py new file mode 100644 index 0000000..676df35 --- /dev/null +++ b/samples/browser-harness-webscraping/playwright_service_client.py @@ -0,0 +1,93 @@ +""" +Microsoft Playwright Service - Python Client + +Get a service URL for obtaining remote CDP browsers. 
+ +---------------------------------------- +📌 Prerequisites +---------------------------------------- +pip install python-dotenv + +---------------------------------------- +📌 Environment Variables +---------------------------------------- +PLAYWRIGHT_SERVICE_URL=wss://.api.playwright.microsoft.com/playwrightworkspaces//browsers +PLAYWRIGHT_SERVICE_ACCESS_TOKEN=your_access_token + +---------------------------------------- +📌 How to Use +---------------------------------------- + from playwright_service_client import get_cdp_browsers_endpoint + + endpoint = get_cdp_browsers_endpoint() +""" + +import re +import os +from dotenv import load_dotenv + +load_dotenv() + + +class PlaywrightServiceError(Exception): + """Exception for Playwright Service errors.""" + pass + + +# URL pattern: wss://.api.playwright.microsoft.com/playwrightworkspaces//browsers +_URL_PATTERN = re.compile( + r'wss://(\w+)\.api\.playwright\.microsoft\.com/playwrightworkspaces/([^/]+)/browsers' +) + + +def _parse_url(url: str) -> tuple[str, str]: + """Extract region and workspace ID from service URL.""" + match = _URL_PATTERN.match(url) + if not match: + raise PlaywrightServiceError( + f"Invalid PLAYWRIGHT_SERVICE_URL format: {url}\n" + f"Expected: wss://.api.playwright.microsoft.com/playwrightworkspaces//browsers" + ) + return match.group(1), match.group(2) + + +def get_cdp_browsers_endpoint( + service_url: str | None = None, + access_token: str | None = None +) -> str: + """ + Get the SERVICE_URL that an agent can use to get browsers that it can connect to via CDP + Args: + service_url: Service URL (defaults to PLAYWRIGHT_SERVICE_URL env var) + access_token: Access token (defaults to PLAYWRIGHT_SERVICE_ACCESS_TOKEN env var) + + Returns: + URL for getting CDP browsers + + Example: + SERVICE_URL = await get_cdp_browsers_endpoint() + """ + # Get credentials from env vars if not provided + service_url = service_url or os.getenv("PLAYWRIGHT_SERVICE_URL") + access_token = access_token or 
os.getenv("PLAYWRIGHT_SERVICE_ACCESS_TOKEN") + + if not service_url: + raise PlaywrightServiceError( + "PLAYWRIGHT_SERVICE_URL environment variable is not set.\n" + "Expected: wss://.api.playwright.microsoft.com/playwrightworkspaces//browsers" + ) + if not access_token: + raise PlaywrightServiceError( + "PLAYWRIGHT_SERVICE_ACCESS_TOKEN environment variable is not set." + ) + + # Parse URL to get region and workspace ID + region, workspace_id = _parse_url(service_url) + + # Build API URL + api_url = ( + f"https://{region}.api.playwright.microsoft.com" + f"/playwrightworkspaces/{workspace_id}/browsers" + f"?os=linux&browser=chromium&playwrightVersion=cdp&shouldRedirect=false") + + return api_url diff --git a/samples/browser-harness-webscraping/requirements.txt b/samples/browser-harness-webscraping/requirements.txt new file mode 100644 index 0000000..566cccb --- /dev/null +++ b/samples/browser-harness-webscraping/requirements.txt @@ -0,0 +1 @@ +python-dotenv