diff --git a/.dockerignore b/.dockerignore index 9979d3ca7f..c4d54d61e4 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,6 +1,88 @@ -output -__pycache__ .DS_Store -venv -.venv -data +._* +/.heartbeat.json +/package.json +/package-lock.json +__pycache__ +**/__pycache__ +*.pyc +*.pyo +*.py[cod] +*$py.class +.mypy_cache/ +.pytest_cache/ +.ruff_cache/ +.uv-cache/ +.github/ +.pdm-build/ +.pdm-python +.eggs/ +.git/ +!.git/ +.git/* +.vscode/ +!.git/HEAD +!.git/packed-refs +!.git/refs/ +!.git/refs/heads/ +!.git/refs/heads/** + +venv/ +.venv/ +.venv-old/ +.docker_venv/ +.docker-venv/ +node_modules/ +abx-dl/ +abxpkg/ +abx-plugins/ +abxbus/ +chrome/ +chromeprofile/ +chrome_profile/ +lib/ +out/ +users/ +archive/ +crawls/ +snapshots/ +logs/ +archivebox-docker-smoke*/ +archivebox-compose-smoke*/ +docker-test/ +docker-test*/ +core +*.core + +pdm.dev.lock +pdm.lock + +docs/ +build/ +dist/ +brew_dist/ +deb_dist/ +pip_dist/ +assets/ +docker/ +website/ +typings/ + +tmp/ +.tmp/ +data/ +data*/ +- +personas/ +sources/ +output/ +index.sqlite3 +index.sqlite3-wal +queue.sqlite3 +*.sqlite* +data.* +.archivebox_id +ArchiveBox.conf +*.stdout +*.stderr +*.log diff --git a/.github/.readthedocs.yaml b/.github/.readthedocs.yaml new file mode 100644 index 0000000000..2cefab193a --- /dev/null +++ b/.github/.readthedocs.yaml @@ -0,0 +1,26 @@ +# Read the Docs config for https://docs.archivebox.io +# https://docs.readthedocs.io/en/stable/config-file/v2.html + +version: 2 + +submodules: + include: all + recursive: true + +build: + os: ubuntu-22.04 + tools: + python: "3.12" + #nodejs: "20" # not needed unless we need the full archivebox to run while building docs for some reason + +sphinx: + configuration: docs/conf.py + +formats: + - pdf + - epub + +# https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html +python: + install: + - requirements: docs/requirements.txt diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 76273c5413..f78490a6fe 100644 --- 
a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -1 +1,38 @@ -Make sure check in with me first or confirm your desired features line up with our roadmap: https://github.com/pirate/ArchiveBox#roadmap +# Contribution Process + +1. Confirm your desired features fit into our bigger project goals [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap). +2. Open an issue with your planned implementation to discuss +3. Check in with me before starting development to make sure your work won't conflict with or duplicate existing work +4. Set up your dev environment, make some changes, and test using the test input files +5. Commit, push, and submit a PR and wait for review feedback +6. Have patience, don't abandon your PR! We love contributors but we all have day jobs and don't always have time to respond to notifications instantly. If you want a faster response, ping @theSquashSH on twitter or Patreon. + +**Useful links:** + +- https://github.com/ArchiveBox/ArchiveBox/issues +- https://github.com/ArchiveBox/ArchiveBox/pulls +- https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap +- https://github.com/ArchiveBox/ArchiveBox/wiki/Install#manual-setup + +### Development Setup + +```bash +git clone https://github.com/ArchiveBox/ArchiveBox +cd ArchiveBox +# Ideally do this in a virtualenv +pip install -e '.[dev]' # or use: pipenv install --dev +``` + +### Running Tests + +```bash +./bin/lint.sh +./bin/test.sh +./bin/build.sh +``` + +For more common tasks see the `Development` section at the bottom of the README. + +### Getting Help + +Open issues on GitHub or message me https://sweeting.me/#contact. 
diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 0000000000..72dea7c5a8 --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1,2 @@ +github: ["ArchiveBox", "pirate"] +custom: ["https://donate.archivebox.io", "https://swag.archivebox.io"] diff --git a/.github/ISSUE_TEMPLATE/1-bug_report.yml b/.github/ISSUE_TEMPLATE/1-bug_report.yml new file mode 100644 index 0000000000..f6e0b20ce4 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/1-bug_report.yml @@ -0,0 +1,198 @@ +name: ๐Ÿž Bug report +description: Report a bug or error you encountered in ArchiveBox +title: "Bug: ..." +assignees: + - pirate +type: 'Bug' +body: + - type: markdown + attributes: + value: | + *Please note:* it is normal to see errors occasionally for some extractors on some URLs (not every extractor will work on every type of page). + Please report archiving errors if you are seeing them *consistently across many URLs* or if they are *preventing you from using ArchiveBox*. + + - type: textarea + id: description + attributes: + label: Provide a screenshot and describe the bug + description: | + Attach a screenshot and describe what the issue is, what you expected to happen, and if relevant, the *URLs you were trying to archive*. + placeholder: | + Got a bunch of 'singlefile was unable to archive this page' errors when trying to archive URLs from this site: https://example.com/xyz ... + I also tried to archive the same URLs using `singlefile` directly and some of them worked but not all of them. etc. ... + validations: + required: true + + - type: textarea + id: steps_to_reproduce + attributes: + label: Steps to reproduce + description: Please provide the exact steps you took to trigger the issue (including any shell commands run, URLs visited, buttons clicked, etc.). + render: markdown + placeholder: | + 1. Started ArchiveBox by running: `docker run -v $PWD:/data -p 8000:8000 archivebox/archivebox` in iTerm2 + 2. Went to the https://127.0.0.1:8000/add/ page in Google Chrome + 3. 
Typed 'https://example.com/xyz' into the 'Add URL' input field + 4. Clicked the 'Add+' button + 5. Got a 500 error and saw the errors below in terminal + validations: + required: true + + - type: textarea + id: logs + attributes: + label: Logs or errors + description: "Paste any terminal output, logs, or errors (check `data/logs/errors.log` as well)." + placeholder: | + โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ + โ”‚ [2024-11-02 19:54:28] ArchiveBox v0.8.6rc0: archivebox add https://example.com#1234567 โ”‚ + โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ + + [+] [2024-11-02 19:54:29] Adding 1 links to index (crawl depth=0)... + > Saved verbatim input to sources/1730577269-import.txt + > Parsed 1 URLs from input (Generic TXT) + ... + render: shell + validations: + required: false + + - type: textarea + id: version + attributes: + label: ArchiveBox Version + description: | + **REQUIRED:** Run the `archivebox version` command inside your collection dir and paste the *full output* here (*not just the version number*). 
+ For Docker Compose run: `docker compose run archivebox version` + For plain Docker run: `docker run -v $PWD:/data archivebox/archivebox version` + render: shell + placeholder: | + 0.8.6 + ArchiveBox v0.8.6rc0 COMMIT_HASH=721427a BUILD_TIME=2024-10-21 12:57:02 1729515422 + IN_DOCKER=False IN_QEMU=False ARCH=arm64 OS=Darwin PLATFORM=macOS-15.1-arm64-arm-64bit PYTHON=Cpython (venv) + EUID=502:20 UID=502:20 FS_UID=502:20 FS_PERMS=644 FS_ATOMIC=True FS_REMOTE=False + DEBUG=False IS_TTY=True SUDO=False ID=dfa11485:aa78ad45 SEARCH_BACKEND=ripgrep LDAP=False + + Binary Dependencies: + โˆš python 3.14.0 venv_pip ~/.venv/bin/python + โˆš django 6.0 venv_pip ~/.venv/lib/python3.14/site-packages/django/__init__.py + โˆš sqlite 2.6.0 venv_pip ~/.venv/lib/python3.14/site-packages/django/db/backends/sqlite3/base.py + โˆš pip 24.3.1 venv_pip ~/.venv/bin/pip + ... + validations: + required: true + + - type: dropdown + id: install_method + validations: + required: true + attributes: + label: How did you install the version of ArchiveBox you are using? + multiple: false + options: + - pip + - apt + - brew + - nix + - Docker (or Podman/LXC/K8s/TrueNAS/Proxmox/etc) + - Other + + - type: dropdown + id: operating_system + validations: + required: true + attributes: + label: What operating system are you running on? + description: | + Please note we are *unable to provide support for Windows users* unless you are using [Docker on Windows](https://github.com/ArchiveBox/archivebox#:~:text=windows%20without%20docker). + multiple: false + options: + - Linux (Ubuntu/Debian/Arch/Alpine/etc.) + - macOS (including Docker on macOS) + - BSD (FreeBSD/OpenBSD/NetBSD/etc.) + - Windows (including WSL, WSL2, Docker Desktop on Windows) + - Other + + - type: checkboxes + id: filesystem + attributes: + label: What type of drive are you using to store your ArchiveBox data? 
+ description: Are you using a [remote filesystem](https://github.com/ArchiveBox/ArchiveBox/wiki/Setting-Up-Storage#supported-remote-filesystems) or FUSE mount for `data/` OR `data/archive`? + options: + - label: "some of `data/` is on a local SSD or NVMe drive" + required: false + - label: "some of `data/` is on a spinning hard drive or external USB drive" + required: false + - label: "some of `data/` is on a network mount (e.g. NFS/SMB/Ceph/GlusterFS/etc.)" + required: false + - label: "some of `data/` is on a FUSE mount (e.g. SSHFS/RClone/S3/B2/Google Drive/Dropbox/etc.)" + required: false + + + - type: textarea + id: docker_compose_yml + attributes: + label: Docker Compose Configuration + description: "If using Docker Compose, please share your full `docker-compose.yml` file. If using plain Docker, paste the `docker run ...` command you use." + placeholder: | + services: + archivebox: + image: archivebox/archivebox:latest + ports: + - 8000:8000 + volumes: + - ./data:/data + environment: + - ADMIN_USERNAME=admin + - ADMIN_PASSWORD=******** + - ALLOWED_HOSTS=* + - CSRF_TRUSTED_ORIGINS=https://archivebox.example.com + - PUBLIC_INDEX=True + - PUBLIC_SNAPSHOTS=True + - PUBLIC_ADD_VIEW=False + ... + + archivebox_scheduler: + image: archivebox/archivebox:latest + command: schedule --foreground --update --every=day + environment: + ... + + ... + render: shell + validations: + required: false + + - type: textarea + id: configuration + attributes: + label: ArchiveBox Configuration + description: "Please share your full `data/ArchiveBox.conf` file here." + render: shell + placeholder: | + [SERVER_CONFIG] + SECRET_KEY = "*********************" + + WGET_RESTRICT_FILE_NAMES=windows + USE_SYSTEM_WGET=true + CHECK_SSL_VALIDITY=false + ... + validations: + required: false + + + - type: markdown + attributes: + value: | + --- + + We strive to answer issues as quickly as possible, it usually takes us *about a ~week* to respond. 
+ Make sure your `data/` is [**fully backed up**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#disk-layout) before trying anything suggested here, **we are not responsible for data loss**. + + In the meantime please consider: + + - ๐Ÿ’ฐ [Donating to support ArchiveBox open-source](https://github.com/ArchiveBox/ArchiveBox/wiki/Donations) + - ๐Ÿ‘จโ€โœˆ๏ธ [Hiring us for corporate deployments](https://docs.monadical.com/s/archivebox-consulting-services) with professional support, custom feature development, and help with CAPTCHAs/rate-limits + - ๐Ÿ” [Searching the Documentation](https://docs.archivebox.io/) for answers to common questions + - ๐Ÿ“š Reading the [Troubleshooting Guide](https://github.com/ArchiveBox/ArchiveBox/wiki) + - โœจ Testing out a newer [`BETA` release](https://github.com/ArchiveBox/ArchiveBox/releases) (issues are often already fixed in our latest `BETA` releases) + diff --git a/.github/ISSUE_TEMPLATE/2-feature_request.yml b/.github/ISSUE_TEMPLATE/2-feature_request.yml new file mode 100644 index 0000000000..b6985776f3 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/2-feature_request.yml @@ -0,0 +1,128 @@ +name: ๐Ÿ’ก Feature or enhancement request +description: Suggest an idea or improvement for this project +title: "Feature Request: ..." +assignees: + - pirate +type: 'Enhancement' +labels: 'status: idea phase' +body: + - type: dropdown + id: suggestion_type + validations: + required: true + attributes: + label: "What type of suggestion are you making?" + multiple: false + options: + - New extractor / type of content to save + - Proposing a new feature + - Modification of existing behavior + - Web UI or UX design improvement + + - type: textarea + id: current_problem + attributes: + label: "What is the problem that your feature request solves?" + description: | + Describe the problem or need that your feature request solves, feel free to include any screenshots or examples. + placeholder: | + e.g. 
I need to be able to archive spanish and french subtitle files from a particular movie site https://example.com/somevideos that's going down soon. + validations: + required: true + + - type: textarea + id: proposed_solution + attributes: + label: "What is your proposed solution?" + description: | + Describe the ideal specific solution you'd want, *and whether it fits into any broader scope of changes*. + placeholder: | + e.g. I specifically need a new archive method to look for multilingual subtitle files related to pages. + The bigger picture solution is the ability for custom user scripts to be run in a puppeteer context during archiving. + validations: + required: true + + - type: textarea + id: workarounds_tried + attributes: + label: "What hacks or alternative solutions have you tried to solve the problem?" + description: | + A description of any alternative approaches, workarounds, or other solutions you've considered to fix the problem. + placeholder: | + e.g. I wait for archivebox to finish archiving the page, then I manually run `yt-dlp --subs ` inside + the `data/archive//` directory to download the subtitle files and add them to the snapshot folder. + validations: + required: true + + - type: textarea + id: version + attributes: + label: Share the entire output of the `archivebox version` command for the current version you are using. + description: | + DO NOT JUST ENTER "the latest version" OR YOUR ISSUE WILL BE CLOSED. + We need to know what version of ArchiveBox and what feature flags you're currently running with in order to contextualize your feature request. + Sometimes we've already fixed the issues in newer BETA versions, sometimes features already exist but may not be available in your specific environment. + + Run the `archivebox version` command inside your current collection dir and paste the *full output* here (*not just the version number*). 
+ For Docker Compose run: `docker compose run archivebox version` + For plain Docker run: `docker run -v $PWD:/data archivebox/archivebox version` + render: shell + placeholder: | + 0.8.6 + ArchiveBox v0.8.6rc0 COMMIT_HASH=721427a BUILD_TIME=2024-10-21 12:57:02 1729515422 + IN_DOCKER=False IN_QEMU=False ARCH=arm64 OS=Darwin PLATFORM=macOS-15.1-arm64-arm-64bit PYTHON=Cpython (venv) + EUID=502:20 UID=502:20 FS_UID=502:20 FS_PERMS=644 FS_ATOMIC=True FS_REMOTE=False + DEBUG=False IS_TTY=True SUDO=False ID=dfa11485:aa78ad45 SEARCH_BACKEND=ripgrep LDAP=False + + Binary Dependencies: + โˆš python 3.14.0 venv_pip ~/.venv/bin/python + โˆš django 6.0 venv_pip ~/.venv/lib/python3.14/site-packages/django/__init__.py + โˆš sqlite 2.6.0 venv_pip ~/.venv/lib/python3.14/site-packages/django/db/backends/sqlite3/base.py + โˆš pip 24.3.1 venv_pip ~/.venv/bin/pip + ... + validations: + required: true + + - type: checkboxes + id: priority + attributes: + label: "How badly do you want this new feature?" + options: + - label: "It's an urgent deal-breaker, I can't live without it" + required: false + - label: "It's important to add it in the near-mid term future" + required: false + - label: "It would be nice to have eventually" + required: false + - label: "I'm willing to [start a PR](https://github.com/ArchiveBox/ArchiveBox#archivebox-development) to develop this myself" + required: false + - label: "I have [donated money](https://github.com/ArchiveBox/ArchiveBox/wiki/Donations) to go towards fixing this issue" + required: false + + - type: checkboxes + id: satisfaction_survey + attributes: + label: Mini Survey + description: How do you like ArchiveBox so far? 
+ options: + - label: "I like ArchiveBox so far / would recommend it to a friend" + required: false + - label: "I've had a lot of difficulty getting ArchiveBox set up" + required: false + - label: "I would pay $10/mo for a hosted version of ArchiveBox if it had this feature" + required: false + + - type: markdown + attributes: + value: | + --- + + We strive to answer issues as quickly as possible, it usually takes us *about a ~week* to respond. + Make sure your `data/` is [**fully backed up**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#disk-layout) before trying any workarounds or BETAs suggested here, **we are not responsible for data loss**. + + In the meantime please consider: + + - ๐Ÿ’ฐ [Donating to support ArchiveBox open-source](https://github.com/ArchiveBox/ArchiveBox/wiki/Donations) + - ๐Ÿ“Š [Hiring us for corporate deployments](https://docs.monadical.com/s/archivebox-consulting-services) with professional support, custom feature development, and help with CAPTCHAs/rate-limits + - ๐Ÿ” [Searching the Documentation](https://docs.archivebox.io/) for answers to common questions + - โœจ Testing out a newer [`BETA` release](https://github.com/ArchiveBox/ArchiveBox/releases) (issues are often already fixed in our latest `BETA` releases) diff --git a/.github/ISSUE_TEMPLATE/3-documentation_change.yml b/.github/ISSUE_TEMPLATE/3-documentation_change.yml new file mode 100644 index 0000000000..c711f0897a --- /dev/null +++ b/.github/ISSUE_TEMPLATE/3-documentation_change.yml @@ -0,0 +1,52 @@ +name: ๐Ÿ“‘ Documentation improvement +description: Submit an idea or correction for the Wiki documentation +title: "Documentation: ..." +labels: 'touches: docs' +type: 'Enhancement' +assignees: + - pirate +body: + - type: markdown + attributes: + value: | + If you prefer, you can submit a [Pull Request](https://github.com/ArchiveBox/docs) on https://github.com/ArchiveBox/docs to edit the docs directly instead. 
+ + - type: input + id: page_url + validations: + required: true + attributes: + label: "What is the URL of the page you'd like to see improved?" + placeholder: e.g. https://github.com/ArchiveBox/docs/wiki/Install + + - type: input + id: section_title + validations: + required: true + attributes: + label: "What is the title of the relevant section?" + placeholder: e.g. Option B. Automatic Setup Script + + - type: textarea + id: suggested_edit + attributes: + label: "What is the suggested edit?" + placeholder: | + e.g. Please document how to run the automatic setup script for ArchiveBox on TempleOS. + Attach images, screenshots, code snippets, etc. anything you think would help. + validations: + required: true + + - type: markdown + attributes: + value: | + --- + + We strive to address issues as quickly as possible, it usually takes us *about a ~week* to respond. + + In the meantime please consider: + + - ๐Ÿ’ฐ [Donating to support ArchiveBox open-source](https://github.com/ArchiveBox/ArchiveBox/wiki/Donations) + - ๐Ÿ‘จโ€โœˆ๏ธ [Hiring us for corporate deployments](https://docs.monadical.com/s/archivebox-consulting-services) with professional support, custom feature development, and help with CAPTCHAs/rate-limits + - ๐Ÿ” [Checking out the new ReadTheDocs Documentation](https://docs.archivebox.io/) + - โœจ Helping us test a newer [`BETA` release](https://github.com/ArchiveBox/ArchiveBox/releases) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md deleted file mode 100644 index 66a2d21bd1..0000000000 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ /dev/null @@ -1,32 +0,0 @@ ---- -name: ๐Ÿž Bug report -about: Create a report to help us improve -title: '' -labels: '' -assignees: '' - ---- - -(please fill out the following information, feel free to delete sections if they're not applicable or if long issue templates annoy you) - -#### Describe the bug -A description of what the bug is, what you expected to happen, -and any relevant 
context about issue. - -#### Steps to reproduce - -1. Ran ArchiveBox with the following config '...' -2. Saw this output during archiving '....' -3. UI didn't show the thing I was expecting '....' - -#### Screenshots or log output - -If applicable, post any relevant screenshots or copy/pasted terminal output from ArchiveBox. -If you're reporting a parsing / importing error, **you must paste a copy of your redacted import file here**. - -#### Software versions - - - OS: ([e.g. macOS 10.14] the operating system you're running ArchiveBox on) - - ArchiveBox version: (`git rev-parse HEAD | head -c7` [e.g. d798117] commit ID of the version you're running) - - Python version: (`python3 --version` [e.g. 3.7.0]) - - Chrome version: (`chromium-browser --version` [e.g. 73.1.2.3] if relevant to bug) diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000000..110053ccbc --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,11 @@ +blank_issues_enabled: false +contact_links: + - name: ❓ Ask a question or start a discussion + url: https://github.com/ArchiveBox/ArchiveBox/discussions + about: "Ask a question, get support, or start a design discussion (to report a problem please use '🐞 Bug report' instead)" + - name: 💬 Chat with the dev team & community on Zulip + url: https://zulip.archivebox.io + about: "Join us on our Zulip forum to chat with the developers and other users (it's similar to Discord but self-hosted)." + - name: 💁‍♂️ Hire us for professional support with fast response times + url: https://docs.monadical.com/s/archivebox-consulting-services + about: "We provide hosting, development, and support, including on-prem/cloud w/ SSO & storage, CAPTCHA-solving, proxies, etc." 
diff --git a/.github/ISSUE_TEMPLATE/documentation_change.md b/.github/ISSUE_TEMPLATE/documentation_change.md deleted file mode 100644 index dc3c27414a..0000000000 --- a/.github/ISSUE_TEMPLATE/documentation_change.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -name: ๐Ÿ“‘ Documentation change -about: Submit a suggestion for the Wiki documentation -title: '' -labels: '' -assignees: '' - ---- - -## Wiki Page URL - - -## Suggested Edit - -... diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md deleted file mode 100644 index 0f9423f56a..0000000000 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ /dev/null @@ -1,38 +0,0 @@ ---- -name: ๐Ÿ’ก Feature request -about: Suggest an idea for this project -title: '' -labels: '' -assignees: '' - ---- - -(feel free to delete this template and write your own issue description if you don't find it helpful) - -## Type - - - [ ] General Question or Disussion - - [ ] Propose a brand new feature - - [ ] Request modification of existing behavior or design - -## What is the problem that your feature request solves -e.g. I need to be able to archive spanish and french subtitle files -from a particular movie site that's going down soon. - -## Describe the ideal specific solution you'd want, and whether it fits into any broader scope of changes -e.g. I specifically need a new archive method to look for multilingual subtitle files related to pages. -The bigger picture solution is the ability for custom user scripts to be run in a puppeteer context during archiving. - -## What hacks or alternative solutions have you tried to solve the problem? -A clear and concise description of any alternative solutions or features you've considered. - -## How badly do you want this new feature? 
- - - [ ] It's an urgent deal-breaker, I cant live without it - - [ ] It's important to add it in the near-mid term future - - [ ] It would be nice to have eventually - ---- - - - [ ] I'm willing to contribute to development / fixing this issue - - [ ] I like ArchiveBox so far / would recommend it to a friend diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index c903d1a92b..5727af79f7 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,10 +1,12 @@ -**IMPORTANT: Do not submit PRs with only formatting / PEP8 / line length changes, I will close them with great prejudice. The PEP8 checks I don't follow are intentional. PRs for minor bugfixes, typos, etc are fine.** + # Summary -e.g. This PR fixes ABC or adds the ability to do XYZ... + -**Related issues: #XYZ** (delete this line if there are no related issues) +# Related issues + + # Changes these areas @@ -13,9 +15,4 @@ e.g. This PR fixes ABC or adds the ability to do XYZ... 
- [ ] Command line interface - [ ] Configuration options - [ ] Internal architecture -- [ ] Archived data layout on disk - -# Roadmap Goals - -This PR helps us move towards xyz roadmap goal, as outlined here: https://github.com/pirate/ArchiveBox#roadmap -(delete this section if it's just a bugfix / simple PR) +- [ ] Snapshot data layout on disk diff --git a/.github/SECURITY.md b/.github/SECURITY.md new file mode 100644 index 0000000000..8fae71e187 --- /dev/null +++ b/.github/SECURITY.md @@ -0,0 +1,34 @@ +# Security Policy + +--- + +## Security Information + +Please see this wiki page for important notices about ArchiveBox security, publishing your archives securely, and the dangers of executing archived JS: + +https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview + +Also see this section of the README about important caveats when running ArchiveBox: + +https://github.com/ArchiveBox/ArchiveBox?tab=readme-ov-file#caveats + +You can also read these pages for more information about ArchiveBox's internals, development environment, DB schema, and more: + +- https://github.com/ArchiveBox/ArchiveBox#archive-layout +- https://github.com/ArchiveBox/ArchiveBox#archivebox-development +- https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives +- https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting + +--- + +## Reporting a Vulnerability + +We use Github's built-in [Private Reporting](https://docs.github.com/en/code-security/security-advisories/guidance-on-reporting-and-writing-information-about-vulnerabilities/privately-reporting-a-security-vulnerability) feature to accept vulnerability reports. + +1. Go to the Security tab on our Github repo: https://github.com/ArchiveBox/ArchiveBox/security + +2. Click the ["Report a Vulnerability"](https://github.com/ArchiveBox/ArchiveBox/security/advisories/new) button + +3. 
Fill out the form to submit the details of the report and it will be securely sent to the maintainers + +You can also contact the maintainers via our public [Zulip Chat Server zulip.archivebox.io](https://zulip.archivebox.io) or [Twitter DMs @ArchiveBoxApp](https://twitter.com/ArchiveBoxApp). diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000000..edc253a66e --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,25 @@ +# To get started with Dependabot version updates, you'll need to specify which +# package ecosystems to update and where the package manifests are located. +# Please see the documentation for all configuration options: +# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file + +version: 2 +updates: + - package-ecosystem: "pip" + directory: "/" + target-branch: "dev" + schedule: + interval: "monthly" + groups: + pip: + patterns: + - "*" + - package-ecosystem: "npm" + directory: "/" + target-branch: "dev" + schedule: + interval: "monthly" + groups: + npm: + patterns: + - "*" diff --git a/.github/workflows/claude.yml b/.github/workflows/claude.yml new file mode 100644 index 0000000000..2feee2e38c --- /dev/null +++ b/.github/workflows/claude.yml @@ -0,0 +1,49 @@ +name: Claude Code + +on: + issue_comment: + types: [created] + pull_request_review_comment: + types: [created] + issues: + types: [opened, assigned] + pull_request_review: + types: [submitted] + +jobs: + claude: + if: | + (github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude')) || + (github.event_name == 'pull_request_review_comment' && contains(github.event.comment.body, '@claude')) || + (github.event_name == 'pull_request_review' && contains(github.event.review.body, '@claude')) || + (github.event_name == 'issues' && (contains(github.event.issue.body, '@claude') || contains(github.event.issue.title, '@claude'))) + runs-on: ubuntu-latest + 
permissions: + contents: read + pull-requests: write + issues: write + id-token: write + actions: read # Required for Claude to read CI results on PRs + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 1 + + - name: Run Claude Code + id: claude + uses: anthropics/claude-code-action@v1 + with: + anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }} + + # This is an optional setting that allows Claude to read CI results on PRs + additional_permissions: | + actions: read + + # Optional: Give a custom prompt to Claude. If this is not specified, Claude will perform the instructions specified in the comment that tagged it. + # prompt: 'Update the pull request description to include a summary of changes.' + + # Optional: Add claude_args to customize behavior and configuration + # See https://github.com/anthropics/claude-code-action/blob/main/docs/usage.md + # or https://code.claude.com/docs/en/cli-reference for available options + claude_args: '--allowed-tools Bash(gh pr:*)' diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 0000000000..78bff0577b --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,96 @@ +# For most projects, this workflow file will not need changing; you simply need +# to commit it to your repository. +# +# You may wish to alter this file to override the set of languages analyzed, +# or to provide custom queries or build logic. +# +# ******** NOTE ******** +# We have attempted to detect the languages in your repository. Please check +# the `language` matrix defined below to confirm you have the correct set of +# supported CodeQL languages. 
+# +name: "CodeQL" + +on: + push: + branches: [ "dev" ] + pull_request: + branches: [ "dev" ] + schedule: + - cron: '33 17 * * 6' + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + analyze: + name: Analyze (${{ matrix.language }}) + # Runner size impacts CodeQL analysis time. To learn more, please see: + # - https://gh.io/recommended-hardware-resources-for-running-codeql + # - https://gh.io/supported-runners-and-hardware-resources + # - https://gh.io/using-larger-runners (GitHub.com only) + # Consider using larger runners or machines with greater resources for possible analysis time improvements. + runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }} + timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }} + permissions: + # required for all workflows + security-events: write + + # required to fetch internal or private CodeQL packs + packages: read + + # only required for workflows in private repositories + actions: read + contents: read + + strategy: + fail-fast: false + matrix: + include: + - language: python + build-mode: none + # CodeQL supports the following values keywords for 'language': 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift' + # Use `c-cpp` to analyze code written in C, C++ or both + # Use 'java-kotlin' to analyze code written in Java, Kotlin or both + # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both + # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis, + # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning. 
+ # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how + # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: ${{ matrix.language }} + build-mode: ${{ matrix.build-mode }} + # If you wish to specify custom queries, you can do so here or in a config file. + # By default, queries listed here will override any specified in a config file. + # Prefix the list here with "+" to use these queries and those in the config file. + + # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs + # queries: security-extended,security-and-quality + + # If the analyze step fails for one of the languages you are analyzing with + # "We were unable to automatically build your code", modify the matrix above + # to set the build mode to "manual" for that language. Then modify this step + # to build your code. + # ℹ️ Command-line programs to run using the OS shell. 
+ # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun + - if: matrix.build-mode == 'manual' + run: | + echo 'If you are using a "manual" build mode for one or more of the' \ + 'languages you are analyzing, replace this with the commands to build' \ + 'your code, for example:' + echo ' make bootstrap' + echo ' make release' + exit 1 + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 + with: + category: "/language:${{matrix.language}}" diff --git a/.github/workflows/deploy-publicsite.yml b/.github/workflows/deploy-publicsite.yml new file mode 100644 index 0000000000..daaf5a217e --- /dev/null +++ b/.github/workflows/deploy-publicsite.yml @@ -0,0 +1,43 @@ +name: Deploy Publicsite to GitHub Pages + +on: + push: + branches: + - dev + paths: + - publicsite/** + - .github/workflows/deploy-publicsite.yml + workflow_dispatch: + +permissions: + contents: read + pages: write + id-token: write + +concurrency: + group: github-pages-publicsite + cancel-in-progress: true + +jobs: + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 1 + + - name: Setup Pages + uses: actions/configure-pages@v5 + + - name: Upload artifact + uses: actions/upload-pages-artifact@v3 + with: + path: ./publicsite + + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml new file mode 100644 index 0000000000..c6a51d8b93 --- /dev/null +++ b/.github/workflows/docker.yml @@ -0,0 +1,335 @@ +name: Build Docker image + +on: + workflow_dispatch: + workflow_call: + push: + branches: + - "**" + tags: + - 'v*' + # pull_request: + +env: + DOCKERHUB_IMAGE: archivebox/archivebox + GHCR_IMAGE: ghcr.io/archivebox/archivebox + ABX_DL_IMAGE: archivebox/abx-dl:latest + +permissions: + contents: 
read + packages: write + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build: + name: build ${{ matrix.platform }} + runs-on: ${{ matrix.runner }} + strategy: + fail-fast: false + matrix: + include: + - platform: linux/amd64 + runner: ubuntu-24.04 + cache_scope: docker-amd64 + artifact_name: digest-linux-amd64 + - platform: linux/arm64 + runner: ubuntu-24.04-arm + cache_scope: docker-arm64 + artifact_name: digest-linux-arm64 + steps: + - name: Checkout + uses: actions/checkout@v4 + # with: + # submodules: true + # fetch-depth: 1 + + - name: Wait for released ArchiveBox deps on PyPI + run: | + python3 - <<'PY' + import json + import re + import sys + import time + import tomllib + import urllib.request + + watched = {"abxbus", "abxpkg", "abx-plugins", "abx-dl"} + deps = tomllib.loads(open("pyproject.toml", "rb").read().decode())["project"]["dependencies"] + required = {} + for dep in deps: + for name in watched: + match = re.match(rf"{re.escape(name)}\s*(==|>=)\s*([^,;\s]+)", dep) + if match: + required[name] = match.group(2) + + deadline = time.monotonic() + 300 + missing = required.copy() + while missing and time.monotonic() < deadline: + for name, version in list(missing.items()): + with urllib.request.urlopen(f"https://pypi.org/pypi/{name}/json", timeout=20) as resp: + releases = json.load(resp)["releases"] + if version in releases: + print(f"{name} {version} is available on PyPI") + missing.pop(name) + else: + print(f"{name} {version} is not available on PyPI yet") + if missing: + time.sleep(10) + + if missing: + print(f"Missing PyPI releases after wait: {missing}", file=sys.stderr) + sys.exit(1) + PY + + - name: Set up Docker Buildx + id: buildx + uses: docker/setup-buildx-action@v3 + with: + version: latest + install: true + + - name: Builder instance name + run: echo ${{ steps.buildx.outputs.name }} + + - name: Available platforms + run: echo ${{ steps.buildx.outputs.platforms }} + + - name: Wait for 
published abx-dl image + id: abx_dl_image + shell: bash + run: | + set -Eeuo pipefail + deadline=$((SECONDS + 1800)) + until docker buildx imagetools inspect "${ABX_DL_IMAGE}" >/tmp/abx-dl-image.json; do + if (( SECONDS >= deadline )); then + echo "Timed out waiting for published ${ABX_DL_IMAGE}" >&2 + exit 1 + fi + echo "${ABX_DL_IMAGE} is not published yet; waiting..." + sleep 30 + done + + echo "image=${ABX_DL_IMAGE}" >> "$GITHUB_OUTPUT" + docker buildx imagetools inspect "${ABX_DL_IMAGE}" + + - name: Login to Docker Hub + uses: docker/login-action@v3 + if: github.event_name != 'pull_request' + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Collect Docker labels + id: docker_meta + shell: bash + run: | + set -Eeuo pipefail + VERSION="$(python3 - <<'PY' + import tomllib + + with open("pyproject.toml", "rb") as f: + print(tomllib.load(f)["project"]["version"]) + PY + )" + + { + echo 'labels<> "$GITHUB_OUTPUT" + + echo "[+] Building ${{ matrix.platform }} for ${VERSION} using ${{ steps.abx_dl_image.outputs.image }}" + + - name: Build and push digest + id: docker_build + uses: docker/build-push-action@v6 + with: + context: ./ + file: ./Dockerfile + builder: ${{ steps.buildx.outputs.name }} + push: ${{ github.event_name != 'pull_request' }} + tags: | + ${{ env.DOCKERHUB_IMAGE }} + ${{ env.GHCR_IMAGE }} + labels: ${{ steps.docker_meta.outputs.labels }} + build-args: | + ABX_DL_IMAGE=${{ steps.abx_dl_image.outputs.image }} + cache-from: type=gha,scope=${{ matrix.cache_scope }} + cache-to: type=gha,mode=max,scope=${{ matrix.cache_scope }} + platforms: ${{ matrix.platform }} + outputs: type=image,push-by-digest=true,name-canonical=true,push=true + + - name: Image digest + run: echo ${{ steps.docker_build.outputs.digest }} + + - name: Export 
digest + shell: bash + run: | + set -Eeuo pipefail + mkdir -p /tmp/digests + digest="${{ steps.docker_build.outputs.digest }}" + touch "/tmp/digests/${digest#sha256:}" + + - name: Upload digest + uses: actions/upload-artifact@v4 + with: + name: ${{ matrix.artifact_name }} + path: /tmp/digests/* + if-no-files-found: error + retention-days: 1 + + publish: + name: publish multiarch tags + runs-on: ubuntu-24.04 + needs: + - build + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + version: latest + install: true + + - name: Login to Docker Hub + uses: docker/login-action@v3 + if: github.event_name != 'pull_request' + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Download digests + uses: actions/download-artifact@v4 + with: + path: /tmp/digests + pattern: digest-* + merge-multiple: true + + - name: Collect Docker tags + id: docker_meta + shell: bash + run: | + set -Eeuo pipefail + VERSION="$(python3 - <<'PY' + import tomllib + + with open("pyproject.toml", "rb") as f: + print(tomllib.load(f)["project"]["version"]) + PY + )" + BRANCH_TAG="$(printf '%s' "${GITHUB_REF_NAME}" | tr -c 'A-Za-z0-9_.-' '-' | sed -E 's/^-+//; s/-+$//; s/-+/-/g' | cut -c1-128)" + SHORT_SHA="${GITHUB_SHA::12}" + test -n "$BRANCH_TAG" + test -n "$SHORT_SHA" + + { + echo 'dockerhub_tags<> "$GITHUB_OUTPUT" + + echo "[+] Publishing Docker Hub tags:" + if [[ "${GITHUB_REF_NAME}" == "main" ]]; then + printf '%s\n' "${DOCKERHUB_IMAGE}:latest" + fi + printf '%s\n' "${DOCKERHUB_IMAGE}:${BRANCH_TAG}" "${DOCKERHUB_IMAGE}:${VERSION}" "${DOCKERHUB_IMAGE}:sha-${SHORT_SHA}" + echo "[+] Publishing GHCR tags:" + if [[ "${GITHUB_REF_NAME}" == "main" ]]; then + printf '%s\n' "${GHCR_IMAGE}:latest" + 
fi + printf '%s\n' "${GHCR_IMAGE}:${BRANCH_TAG}" "${GHCR_IMAGE}:${VERSION}" "${GHCR_IMAGE}:sha-${SHORT_SHA}" + + - name: Create Docker Hub manifest + shell: bash + run: | + set -Eeuo pipefail + mapfile -t DIGESTS < <(find /tmp/digests -maxdepth 1 -type f -printf '%f\n' | sort) + [[ "${#DIGESTS[@]}" -gt 0 ]] + + TAG_ARGS=() + while IFS= read -r tag; do + [[ -n "$tag" ]] && TAG_ARGS+=(--tag "$tag") + done <<< "${{ steps.docker_meta.outputs.dockerhub_tags }}" + + REFS=() + for digest in "${DIGESTS[@]}"; do + REFS+=("${DOCKERHUB_IMAGE}@sha256:${digest}") + done + + docker buildx imagetools create "${TAG_ARGS[@]}" "${REFS[@]}" + + - name: Create GHCR manifest + shell: bash + run: | + set -Eeuo pipefail + mapfile -t DIGESTS < <(find /tmp/digests -maxdepth 1 -type f -printf '%f\n' | sort) + [[ "${#DIGESTS[@]}" -gt 0 ]] + + TAG_ARGS=() + while IFS= read -r tag; do + [[ -n "$tag" ]] && TAG_ARGS+=(--tag "$tag") + done <<< "${{ steps.docker_meta.outputs.ghcr_tags }}" + + REFS=() + for digest in "${DIGESTS[@]}"; do + REFS+=("${GHCR_IMAGE}@sha256:${digest}") + done + + docker buildx imagetools create "${TAG_ARGS[@]}" "${REFS[@]}" + + - name: Inspect published images + shell: bash + run: | + set -Eeuo pipefail + while IFS= read -r tag; do + [[ -n "$tag" ]] && docker buildx imagetools inspect "$tag" + done <<< "${{ steps.docker_meta.outputs.dockerhub_tags }}" + while IFS= read -r tag; do + [[ -n "$tag" ]] && docker buildx imagetools inspect "$tag" + done <<< "${{ steps.docker_meta.outputs.ghcr_tags }}" + + - name: Update README + uses: peter-evans/dockerhub-description@v4 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + repository: archivebox/archivebox diff --git a/.github/workflows/duplicate-issue-detection.yml b/.github/workflows/duplicate-issue-detection.yml new file mode 100644 index 0000000000..98dcd8394a --- /dev/null +++ b/.github/workflows/duplicate-issue-detection.yml @@ -0,0 +1,59 @@ +name: Duplicate Issue Detection + +on: 
+ issues: + types: [opened] + +jobs: + check-duplicates: + runs-on: ubuntu-latest + permissions: + contents: read + issues: write + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 1 + + - name: Install opencode + run: curl -fsSL https://opencode.ai/install | bash + + - name: Check for duplicate issues + env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + OPENCODE_PERMISSION: | + { + "bash": { + "gh issue*": "allow", + "gh pr*": "allow", + "*": "deny" + }, + "webfetch": "allow" + } + run: | + opencode run -m anthropic/claude-haiku-4-5 "A new issue has been created: + + Issue number: + ${{ github.event.issue.number }} + + Lookup this issue and search through existing issues and PRs (excluding #${{ github.event.issue.number }}) in this repository to find any potential duplicates of this new issue. + Consider: + 1. Similar titles or descriptions + 2. Same error messages or symptoms + 3. Related functionality or components + 4. Similar feature requests + + If you find any potential duplicates, please comment on the new issue with: + - A brief explanation of why it might be a duplicate + - Links to the potentially duplicate issues or PRs + - A suggestion to check those issues first + + Use this format for the comment: + 'This issue might be a duplicate of existing issues. Please check: + - #[issue_number]: [brief description of similarity] + + Feel free to ignore if none of these address your specific case.' + + If no clear duplicates are found, do not comment." 
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000000..2bb365bb9f --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,51 @@ +name: Run linters + +on: + workflow_dispatch: + push: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +env: + UV_NO_SOURCES: "1" + PYTHONPATH: ${{ github.workspace }}/abxpkg:${{ github.workspace }}/abx-plugins:${{ github.workspace }}/abx-dl + +jobs: + lint: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + with: + submodules: true + fetch-depth: 1 + + - name: Clone abxpkg + run: git clone --depth=1 https://github.com/ArchiveBox/abxpkg.git abxpkg + + - name: Clone abx-plugins + run: git clone --depth=1 https://github.com/ArchiveBox/abx-plugins.git abx-plugins + + - name: Clone abx-dl + run: git clone --depth=1 https://github.com/ArchiveBox/abx-dl.git abx-dl + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.13" + architecture: x64 + + - name: Install uv + uses: astral-sh/setup-uv@v6 + with: + version: "0.10.6" + enable-cache: false + + - name: Install dependencies with uv + run: | + uv venv + uv pip install --all-groups -e ./abxpkg -e ./abx-plugins -e ./abx-dl -e ".[sonic,debug]" + + - name: Run prek + run: uv run --no-sync --no-sources prek run --all-files diff --git a/.github/workflows/pip.yml b/.github/workflows/pip.yml new file mode 100755 index 0000000000..8954235bd2 --- /dev/null +++ b/.github/workflows/pip.yml @@ -0,0 +1,109 @@ +name: Build Pip package + +on: + workflow_dispatch: + workflow_call: + push: + branches: + - '**' + tags: + - 'v*' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +env: + PYTHON_VERSION: "3.13" + +jobs: + build: + permissions: + id-token: write + + runs-on: ubuntu-24.04 + environment: pypi + steps: + - uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v6 + with: + version: "0.10.6" + 
enable-cache: false + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + architecture: x64 + + - name: APT install archivebox dev + run dependencies + uses: awalsh128/cache-apt-pkgs-action@latest + with: + packages: ripgrep build-essential python3-dev python3-setuptools libssl-dev libldap2-dev libsasl2-dev zlib1g-dev libatomic1 gnupg2 curl wget python3-ldap python3-msgpack python3-mutagen python3-regex python3-pycryptodome procps + version: 1.0 + + - name: Wait for released ArchiveBox deps on PyPI + run: | + python - <<'PY' + import json + import re + import sys + import time + import tomllib + import urllib.request + + watched = {"abxbus", "abxpkg", "abx-plugins", "abx-dl"} + deps = tomllib.loads(open("pyproject.toml", "rb").read().decode())["project"]["dependencies"] + required = {} + for dep in deps: + for name in watched: + match = re.match(rf"{re.escape(name)}\s*(==|>=)\s*([^,;\s]+)", dep) + if match: + required[name] = match.group(2) + + deadline = time.monotonic() + 300 + missing = required.copy() + while missing and time.monotonic() < deadline: + for name, version in list(missing.items()): + with urllib.request.urlopen(f"https://pypi.org/pypi/{name}/json", timeout=20) as resp: + releases = json.load(resp)["releases"] + if version in releases: + print(f"{name} {version} is available on PyPI") + missing.pop(name) + else: + print(f"{name} {version} is not available on PyPI yet") + if missing: + time.sleep(10) + + if missing: + print(f"Missing PyPI releases after wait: {missing}", file=sys.stderr) + sys.exit(1) + PY + + - name: UV install archivebox dev + run sub-dependencies + run: uv sync --all-extras --no-install-project --no-install-workspace --no-sources --no-cache + + - name: UV build archivebox and archivebox/pkgs/* packages + run: | + uv build --all + + - name: Publish new package wheels and sdists to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + # ignore when publish to PyPI fails due 
to duplicate tag + continue-on-error: true + with: + password: ${{ secrets.PYPI_PAT_SECRET }} + + - name: UV install archivebox and archivebox/pkgs/* locally for tests + run: uv sync --all-extras --no-sources --no-cache + + - name: UV run archivebox init + archivebox version + run: | + mkdir -p data && cd data + uv run --no-sync --no-sources archivebox init \ + && uv run --no-sync --no-sources archivebox version + # && uv run archivebox add 'https://example.com' \ + # && uv run archivebox status \ + # || (echo "UV Failed to run archivebox!" && exit 1) diff --git a/.github/workflows/release-runner.yml b/.github/workflows/release-runner.yml new file mode 100644 index 0000000000..4fe4b65c0d --- /dev/null +++ b/.github/workflows/release-runner.yml @@ -0,0 +1,50 @@ +name: Release State + +on: + push: + branches: + - '**' + workflow_dispatch: + +permissions: + contents: write + id-token: write + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + release-state: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + submodules: true + ref: ${{ github.ref_name }} + + - uses: actions/setup-python@v5 + with: + python-version: "3.13" + + - uses: astral-sh/setup-uv@v6 + with: + version: "0.10.6" + enable-cache: false + + - uses: actions/setup-node@v4 + with: + node-version: 22 + + - name: Configure git identity + run: | + git config user.name "github-actions[bot]" + git config user.email "41898282+github-actions[bot]@users.noreply.github.com" + + - name: Run release script + env: + DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + GH_TOKEN: ${{ github.token }} + PYPI_PAT_SECRET: ${{ secrets.PYPI_PAT_SECRET }} + run: ./bin/release.sh diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000000..41436c487b --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,41 @@ +name: Release + +# Orchestrates the full release 
pipeline: +# 1. Build and publish pip package to PyPI +# 2. Build Homebrew formula after pip +# 3. Build Docker images in parallel +# +# Individual workflows also run on push for CI (see their own triggers). +# Manual fallback for the hosted release pipeline. The normal prerelease loop +# is run by bin/release.sh locally so published GitHub releases do not launch a +# duplicate release job. + +on: + workflow_dispatch: + +permissions: + contents: write + packages: write + id-token: write + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + pip: + name: Publish to PyPI + uses: ./.github/workflows/pip.yml + secrets: inherit + + homebrew: + name: Update Homebrew formula + needs: pip + uses: ./.github/workflows/homebrew.yml + secrets: inherit + + docker: + name: Build Docker images + needs: pip + uses: ./.github/workflows/docker.yml + secrets: inherit diff --git a/.github/workflows/test-parallel.yml b/.github/workflows/test-parallel.yml new file mode 100644 index 0000000000..4ac4d084fe --- /dev/null +++ b/.github/workflows/test-parallel.yml @@ -0,0 +1,369 @@ +name: Parallel Tests + +on: + workflow_dispatch: + pull_request: + branches: [dev, main, master] + push: + branches: [dev] + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +env: + PYTHONIOENCODING: utf-8 + PYTHONLEGACYWINDOWSSTDIO: utf-8 + USE_COLOR: False + UV_NO_SOURCES: "1" + +jobs: + discover-tests: + name: Discover test files + runs-on: ubuntu-24.04 + outputs: + test-files: ${{ steps.set-matrix.outputs.test-files }} + plugin-tests: ${{ steps.set-plugin-matrix.outputs.plugin-tests }} + + steps: + - uses: actions/checkout@v4 + with: + submodules: true + fetch-depth: 1 + + - name: Discover test files + id: set-matrix + run: | + # Find all main test files + all_tests=$(find archivebox/tests -maxdepth 1 -name "test_*.py" -type f | sort) + + # Create JSON array with test file info + json_array="[" + first=true + for 
test_file in $all_tests; do + if [ "$first" = true ]; then + first=false + else + json_array+="," + fi + + # Extract a display name for the test + name="main/$(basename $test_file .py | sed 's/^test_//')" + needs_chromium=false + if grep -Eiq '(chrom|archivewebpage|PLUGINS=.*title|--plugins=.*title|SAVE_TITLE.*[Tt]rue)' "$test_file"; then + needs_chromium=true + fi + needs_sonic=false + if grep -Eiq "shutil\\.which\\([\"']sonic|SEARCH_BACKEND_ENGINE=.*sonic|worker_sonic" "$test_file"; then + needs_sonic=true + fi + + json_array+="{\"path\":\"$test_file\",\"name\":\"$name\",\"needs_chromium\":$needs_chromium,\"needs_sonic\":$needs_sonic}" + done + json_array+="]" + + echo "test-files=$json_array" >> $GITHUB_OUTPUT + echo "Found $(echo $all_tests | wc -w) test files" + echo "$json_array" | jq '.' + + - name: Clone abx-plugins + run: git clone --depth=1 https://github.com/ArchiveBox/abx-plugins.git abx-plugins + + - name: Discover plugin tests + id: set-plugin-matrix + run: | + plugin_tests=$(find abx-plugins/abx_plugins/plugins -maxdepth 2 -type d -name tests | sed 's#abx-plugins/abx_plugins/plugins/##; s#/tests##' | sort) + + json_array="[" + first=true + for plugin_name in $plugin_tests; do + if [ "$first" = true ]; then + first=false + else + json_array+="," + fi + + needs_chromium=false + if grep -Riq "chrom" "abx-plugins/abx_plugins/plugins/$plugin_name"; then + needs_chromium=true + fi + needs_sonic=false + if [ "$plugin_name" = "search_backend_sonic" ]; then + needs_sonic=true + fi + + json_array+="{\"plugin\":\"$plugin_name\",\"name\":\"plugin/$plugin_name\",\"needs_chromium\":$needs_chromium,\"needs_sonic\":$needs_sonic}" + done + json_array+="]" + + echo "plugin-tests=$json_array" >> $GITHUB_OUTPUT + echo "Found $(echo $plugin_tests | wc -w) plugin test suites" + echo "$json_array" | jq '.' 
+ + run-tests: + name: ${{ matrix.test.name }} + runs-on: ubuntu-24.04 + needs: discover-tests + env: + PYTHONPATH: ${{ github.workspace }}/abxpkg:${{ github.workspace }}/abx-plugins:${{ github.workspace }}/abx-dl + CHROME_HEADLESS: "true" + CHROME_SANDBOX: "false" + PERSONAS_DIR: /tmp/abx-personas + CHROME_USER_DATA_DIR: /tmp/abx-personas/Default/chrome_profile + + strategy: + fail-fast: false + matrix: + test: ${{ fromJson(needs.discover-tests.outputs.test-files) }} + python: ["3.13"] + + steps: + - uses: actions/checkout@v4 + with: + submodules: true + fetch-depth: 1 + + - name: Clone abxpkg + run: git clone --depth=1 https://github.com/ArchiveBox/abxpkg.git abxpkg + + - name: Clone abx-plugins + run: git clone --depth=1 https://github.com/ArchiveBox/abx-plugins.git abx-plugins + + - name: Clone abx-dl + run: git clone --depth=1 https://github.com/ArchiveBox/abx-dl.git abx-dl + + - name: Set up Python ${{ matrix.python }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python }} + architecture: x64 + + - name: Install uv + uses: astral-sh/setup-uv@v6 + with: + version: "0.10.6" + + - name: Set up Node JS + uses: actions/setup-node@v4 + with: + node-version: 22 + + - name: Cache uv + uses: actions/cache@v4 + with: + path: ~/.cache/uv + key: ${{ runner.os }}-${{ matrix.python }}-uv-${{ hashFiles('pyproject.toml') }} + restore-keys: | + ${{ runner.os }}-${{ matrix.python }}-uv- + + - uses: awalsh128/cache-apt-pkgs-action@latest + with: + packages: git ripgrep build-essential python3-dev python3-setuptools libssl-dev libldap2-dev libsasl2-dev zlib1g-dev libatomic1 python3-minimal gnupg2 curl wget python3-ldap python3-msgpack python3-mutagen python3-regex python3-pycryptodome procps + version: 1.1 + + - name: Install dependencies with uv + run: | + uv venv + uv pip install --group dev -e ./abxpkg -e ./abx-plugins -e ./abx-dl -e ".[all]" + + - name: Install Chrome through ArchiveBox + if: ${{ matrix.test.needs_chromium }} + env: + 
MIN_CHROMIUM_MAJOR: "149" + LIB_DIR: ${{ runner.temp }}/abx-lib + ABXPKG_LIB_DIR: ${{ runner.temp }}/abx-lib + ABXPKG_INSTALL_TIMEOUT: "900" + ABX_CI_DATA_DIR: ${{ runner.temp }}/archivebox-ci-data + run: | + set -euo pipefail + mkdir -p "$LIB_DIR" "$ABX_CI_DATA_DIR" + echo "LIB_DIR=$LIB_DIR" >> "$GITHUB_ENV" + echo "ABXPKG_LIB_DIR=$ABXPKG_LIB_DIR" >> "$GITHUB_ENV" + + uv run --project "$GITHUB_WORKSPACE" --directory "$ABX_CI_DATA_DIR" --no-sync --no-sources archivebox install chrome + + candidate="$(uv run --project "$GITHUB_WORKSPACE" --directory "$ABX_CI_DATA_DIR" --no-sync --no-sources archivebox shell -c 'from archivebox.machine.models import Binary; binary = Binary.objects.filter(name="chromium", status="installed").order_by("-modified_at").first(); print(binary.abspath if binary else "")' | tail -n 1)" + if [ ! -x "$candidate" ]; then + echo "ArchiveBox did not install an executable Chromium binary: ${candidate:-not found}" >&2 + exit 1 + fi + version="$("$candidate" --version || true)" + major="$(printf '%s\n' "$version" | sed -E 's/.* ([0-9]+)\..*/\1/' | head -1)" + case "$major" in + ''|*[!0-9]*) major=0 ;; + esac + if [ "$major" -lt "$MIN_CHROMIUM_MAJOR" ]; then + echo "ArchiveBox installed Chromium is too old: ${version:-unknown}" >&2 + exit 1 + fi + echo "Using ArchiveBox-installed Chromium: $candidate ($version)" + echo "CHROME_BINARY=$candidate" >> "$GITHUB_ENV" + + - name: Install Sonic through ArchiveBox + if: ${{ matrix.test.needs_sonic }} + env: + LIB_DIR: ${{ runner.temp }}/abx-lib + ABXPKG_LIB_DIR: ${{ runner.temp }}/abx-lib + ABXPKG_INSTALL_TIMEOUT: "900" + ABX_CI_DATA_DIR: ${{ runner.temp }}/archivebox-ci-data + run: | + set -euo pipefail + mkdir -p "$LIB_DIR" "$ABX_CI_DATA_DIR" + echo "LIB_DIR=$LIB_DIR" >> "$GITHUB_ENV" + echo "ABXPKG_LIB_DIR=$ABXPKG_LIB_DIR" >> "$GITHUB_ENV" + + uv run --project "$GITHUB_WORKSPACE" --directory "$ABX_CI_DATA_DIR" --no-sync --no-sources archivebox install search_backend_sonic + + candidate="$(uv run --project 
"$GITHUB_WORKSPACE" --directory "$ABX_CI_DATA_DIR" --no-sync --no-sources archivebox shell -c 'from archivebox.machine.models import Binary; binary = Binary.objects.filter(name="sonic", status="installed").order_by("-modified_at").first(); print(binary.abspath if binary else "")' | tail -n 1)" + if [ ! -x "$candidate" ]; then + echo "ArchiveBox did not install an executable Sonic binary: ${candidate:-not found}" >&2 + exit 1 + fi + version="$("$candidate" --version || true)" + echo "Using ArchiveBox-installed Sonic: $candidate (${version:-version unknown})" + echo "SONIC_BINARY=$candidate" >> "$GITHUB_ENV" + echo "$(dirname "$candidate")" >> "$GITHUB_PATH" + + - name: Run test - ${{ matrix.test.name }} + run: | + mkdir -p tests/out + set +e + uv run --no-sync --no-sources pytest -xvs "${{ matrix.test.path }}" --basetemp=tests/out --ignore=archivebox/pkgs + status=$? + set -e + if [ "$status" -eq 5 ]; then + echo "No tests collected from ${{ matrix.test.path }}; treating as an empty test module." 
+ exit 0 + fi + exit "$status" + + plugin-tests: + name: ${{ matrix.plugin.name }} + runs-on: ubuntu-24.04 + needs: discover-tests + env: + PYTHONPATH: ${{ github.workspace }}/abxpkg:${{ github.workspace }}/abx-plugins:${{ github.workspace }}/abx-dl + CHROME_HEADLESS: "true" + CHROME_SANDBOX: "false" + PERSONAS_DIR: /tmp/abx-personas + CHROME_USER_DATA_DIR: /tmp/abx-personas/Default/chrome_profile + + strategy: + fail-fast: false + matrix: + plugin: ${{ fromJson(needs.discover-tests.outputs.plugin-tests) }} + + steps: + - uses: actions/checkout@v4 + with: + submodules: true + fetch-depth: 1 + + - name: Clone abxpkg + run: git clone --depth=1 https://github.com/ArchiveBox/abxpkg.git abxpkg + + - name: Clone abx-plugins + run: git clone --depth=1 https://github.com/ArchiveBox/abx-plugins.git abx-plugins + + - name: Clone abx-dl + run: git clone --depth=1 https://github.com/ArchiveBox/abx-dl.git abx-dl + + - name: Set up Python 3.13 + uses: actions/setup-python@v5 + with: + python-version: "3.13" + architecture: x64 + + - name: Install uv + uses: astral-sh/setup-uv@v6 + with: + version: "0.10.6" + + - name: Set up Node JS + uses: actions/setup-node@v4 + with: + node-version: 22 + + - name: Cache uv + uses: actions/cache@v4 + with: + path: ~/.cache/uv + key: ${{ runner.os }}-3.13-uv-${{ hashFiles('pyproject.toml') }} + restore-keys: | + ${{ runner.os }}-3.13-uv- + + - uses: awalsh128/cache-apt-pkgs-action@latest + with: + packages: git ripgrep build-essential python3-dev python3-setuptools libssl-dev libldap2-dev libsasl2-dev zlib1g-dev libatomic1 python3-minimal gnupg2 curl wget python3-ldap python3-msgpack python3-mutagen python3-regex python3-pycryptodome procps + version: 1.1 + + - name: Install dependencies with uv + run: | + uv venv + uv pip install --group dev -e ./abxpkg -e ./abx-plugins -e ./abx-dl -e ".[all]" + + - name: Install Chrome through ArchiveBox + if: ${{ matrix.plugin.needs_chromium }} + env: + MIN_CHROMIUM_MAJOR: "149" + LIB_DIR: ${{ runner.temp 
}}/abx-lib + ABXPKG_LIB_DIR: ${{ runner.temp }}/abx-lib + ABXPKG_INSTALL_TIMEOUT: "900" + ABX_CI_DATA_DIR: ${{ runner.temp }}/archivebox-ci-data + run: | + set -euo pipefail + mkdir -p "$LIB_DIR" "$ABX_CI_DATA_DIR" + echo "LIB_DIR=$LIB_DIR" >> "$GITHUB_ENV" + echo "ABXPKG_LIB_DIR=$ABXPKG_LIB_DIR" >> "$GITHUB_ENV" + + uv run --project "$GITHUB_WORKSPACE" --directory "$ABX_CI_DATA_DIR" --no-sync --no-sources archivebox install chrome + + candidate="$(uv run --project "$GITHUB_WORKSPACE" --directory "$ABX_CI_DATA_DIR" --no-sync --no-sources archivebox shell -c 'from archivebox.machine.models import Binary; binary = Binary.objects.filter(name="chromium", status="installed").order_by("-modified_at").first(); print(binary.abspath if binary else "")' | tail -n 1)" + if [ ! -x "$candidate" ]; then + echo "ArchiveBox did not install an executable Chromium binary: ${candidate:-not found}" >&2 + exit 1 + fi + version="$("$candidate" --version || true)" + major="$(printf '%s\n' "$version" | sed -E 's/.* ([0-9]+)\..*/\1/' | head -1)" + case "$major" in + ''|*[!0-9]*) major=0 ;; + esac + if [ "$major" -lt "$MIN_CHROMIUM_MAJOR" ]; then + echo "ArchiveBox installed Chromium is too old: ${version:-unknown}" >&2 + exit 1 + fi + echo "Using ArchiveBox-installed Chromium: $candidate ($version)" + echo "CHROME_BINARY=$candidate" >> "$GITHUB_ENV" + + - name: Install Sonic through ArchiveBox + if: ${{ matrix.plugin.needs_sonic }} + env: + LIB_DIR: ${{ runner.temp }}/abx-lib + ABXPKG_LIB_DIR: ${{ runner.temp }}/abx-lib + ABXPKG_INSTALL_TIMEOUT: "900" + ABX_CI_DATA_DIR: ${{ runner.temp }}/archivebox-ci-data + run: | + set -euo pipefail + mkdir -p "$LIB_DIR" "$ABX_CI_DATA_DIR" + echo "LIB_DIR=$LIB_DIR" >> "$GITHUB_ENV" + echo "ABXPKG_LIB_DIR=$ABXPKG_LIB_DIR" >> "$GITHUB_ENV" + + uv run --project "$GITHUB_WORKSPACE" --directory "$ABX_CI_DATA_DIR" --no-sync --no-sources archivebox install search_backend_sonic + + candidate="$(uv run --project "$GITHUB_WORKSPACE" --directory "$ABX_CI_DATA_DIR" 
--no-sync --no-sources archivebox shell -c 'from archivebox.machine.models import Binary; binary = Binary.objects.filter(name="sonic", status="installed").order_by("-modified_at").first(); print(binary.abspath if binary else "")' | tail -n 1)" + if [ ! -x "$candidate" ]; then + echo "ArchiveBox did not install an executable Sonic binary: ${candidate:-not found}" >&2 + exit 1 + fi + version="$("$candidate" --version || true)" + echo "Using ArchiveBox-installed Sonic: $candidate (${version:-version unknown})" + echo "SONIC_BINARY=$candidate" >> "$GITHUB_ENV" + echo "$(dirname "$candidate")" >> "$GITHUB_PATH" + + - name: Run plugin tests - ${{ matrix.plugin.name }} + env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + TWOCAPTCHA_API_KEY: ${{ secrets.TWOCAPTCHA_API_KEY }} + API_KEY_2CAPTCHA: ${{ secrets.TWOCAPTCHA_API_KEY }} + run: | + uv run --no-sync --no-sources bash ./bin/test_plugins.sh "${{ matrix.plugin.plugin }}" --no-coverage diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100755 index 0000000000..9c9a6616c6 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,156 @@ +name: Run tests +on: [push] + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +env: + DOCKER_IMAGE: archivebox-ci + PYTHONIOENCODING: utf-8 + PYTHONLEGACYWINDOWSSTDIO: utf-8 + USE_COLOR: False + UV_NO_SOURCES: "1" + +jobs: + python_tests: + runs-on: ${{ matrix.os }} + env: + PYTHONPATH: ${{ github.workspace }}/abxpkg:${{ github.workspace }}/abx-plugins:${{ github.workspace }}/abx-dl + + strategy: + matrix: + os: [ubuntu-22.04] + # os: [ubuntu-22.04, macos-latest, windows-latest] + python: ["3.13"] + + steps: + - uses: actions/checkout@v4 + with: + submodules: true + fetch-depth: 1 + + - name: Clone abxpkg + run: git clone --depth=1 https://github.com/ArchiveBox/abxpkg.git abxpkg + + - name: Clone abx-plugins + run: git clone --depth=1 https://github.com/ArchiveBox/abx-plugins.git abx-plugins + + - name: 
Clone abx-dl + run: git clone --depth=1 https://github.com/ArchiveBox/abx-dl.git abx-dl + + ### Setup Python & JS Languages + - name: Set up Python ${{ matrix.python }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python }} + architecture: x64 + + - name: Install uv + uses: astral-sh/setup-uv@v4 + with: + version: "0.10.6" + + - name: Set up Node JS + uses: actions/setup-node@v4 + with: + node-version: 22 + + ### Install Python & JS Dependencies + - name: Cache uv + uses: actions/cache@v3 + with: + path: ~/.cache/uv + key: ${{ runner.os }}-${{ matrix.python }}-uv-${{ hashFiles('pyproject.toml') }} + restore-keys: | + ${{ runner.os }}-${{ matrix.python }}-uv- + + - uses: awalsh128/cache-apt-pkgs-action@latest + with: + packages: ripgrep build-essential python3-dev python3-setuptools libssl-dev libldap2-dev libsasl2-dev zlib1g-dev libatomic1 python3-minimal gnupg2 curl wget python3-ldap python3-msgpack python3-mutagen python3-regex python3-pycryptodome procps + version: 1.0 + + - name: Install dependencies with uv + run: | + uv venv + uv pip install --group dev -e ./abxpkg -e ./abx-plugins -e ./abx-dl -e ".[all]" + + ### Run the tests + - name: Directory listing for debugging + run: | + pwd + ls + + - name: Archivebox version + run: | + mkdir -p tests/out/data + DATA_DIR="$PWD/tests/out/data" uv run --no-sync --no-sources archivebox version + + - name: Test built package with pytest + # TODO: remove this exception for windows once we get tests passing on that platform + if: ${{ !contains(matrix.os, 'windows') }} + run: | + mkdir -p tests/out + uv run --no-sync --no-sources pytest -s archivebox/tests --basetemp=tests/out --ignore=archivebox/pkgs + + - name: Run plugin tests + if: ${{ !contains(matrix.os, 'windows') }} + env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + TWOCAPTCHA_API_KEY: ${{ secrets.TWOCAPTCHA_API_KEY }} + API_KEY_2CAPTCHA: ${{ secrets.TWOCAPTCHA_API_KEY }} + run: | + uv run --no-sync --no-sources bash 
./bin/test_plugins.sh --no-coverage + + docker_tests: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + with: + submodules: true + fetch-depth: 1 + + # TODO: as of 2020-11 this helper layer broke, upgrade and re-enable this once it's usable again + # - uses: satackey/action-docker-layer-caching@v0.0.8 + + - name: Build image + run: | + docker build . -t "$DOCKER_IMAGE" + + - name: Init data dir + run: | + mkdir "${{ github.workspace }}/data" + docker run -v "${{ github.workspace }}/data":/data "$DOCKER_IMAGE" init + + - name: Run test server + run: | + sudo bash -c 'echo "127.0.0.1 www.test-nginx-1.local www.test-nginx-2.local" >> /etc/hosts' + docker run --name www-nginx -p 80:80 -d nginx + + - name: Add link + run: | + docker run -v "$PWD"/data:/data --network host "$DOCKER_IMAGE" add http://www.test-nginx-1.local + + - name: Add stdin link + run: | + echo "http://www.test-nginx-2.local" | docker run -i --network host -v "$PWD"/data:/data "$DOCKER_IMAGE" add + + - name: List links + run: | + docker run -v "$PWD"/data:/data "$DOCKER_IMAGE" list | grep -q "www.test-nginx-1.local" || { echo "The site 1 isn't in the list"; exit 1; } + docker run -v "$PWD"/data:/data "$DOCKER_IMAGE" list | grep -q "www.test-nginx-2.local" || { echo "The site 2 isn't in the list"; exit 1; } + + - name: Start docker-compose stack + run: | + docker-compose run archivebox init + docker-compose up -d + sleep 5 + curl --silent --location 'http://127.0.0.1:8000' | grep 'ArchiveBox' + curl --silent --location 'http://127.0.0.1:8000/static/admin/js/jquery.init.js' | grep 'window.django' + + - name: Check added urls show up in index + run: | + docker-compose run archivebox add 'http://example.com/#test_docker' --index-only + curl --silent --location 'http://127.0.0.1:8000' | grep 'http://example.com/#test_docker' + docker-compose down || true diff --git a/.github/workflows/update-homebrew-tap.yml b/.github/workflows/update-homebrew-tap.yml new file mode 100644 index 
0000000000..ef2475ec76 --- /dev/null +++ b/.github/workflows/update-homebrew-tap.yml @@ -0,0 +1,35 @@ +name: Update Homebrew tap + +on: + push: + branches: [dev] + workflow_dispatch: + +permissions: + contents: read + +concurrency: + group: update-homebrew-tap-${{ github.ref }} + cancel-in-progress: true + +jobs: + dispatch: + runs-on: ubuntu-latest + + steps: + - name: Dispatch tap update + env: + HOMEBREW_TAP_TOKEN: ${{ secrets.HOMEBREW_TAP_TOKEN }} + run: | + if [ -z "$HOMEBREW_TAP_TOKEN" ]; then + echo "HOMEBREW_TAP_TOKEN is not configured; tap hourly fallback will update the formula." + exit 0 + fi + + curl --fail-with-body \ + -X POST \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${HOMEBREW_TAP_TOKEN}" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/ArchiveBox/homebrew-archivebox/dispatches \ + -d '{"event_type":"archivebox-dev-updated"}' diff --git a/.gitignore b/.gitignore index 5a6fcf3dcd..1f6cdd5024 100644 --- a/.gitignore +++ b/.gitignore @@ -1,20 +1,68 @@ -# OS cruft .DS_Store ._* - -# python +/.heartbeat.json +/package.json +/package-lock.json +*.pyc __pycache__/ -venv -.venv -archivebox/.venv -archivebox/venv +.mypy_cache/ +.eggs/ +tests/out/ -# vim -.swp* +# Coverage +.coverage +.coverage.* +coverage.json +coverage/ +htmlcov/ -# output artifacts -output -output/ -data +# Python and Node dependencies +venv/ +.venv/ +.docker-venv/ +node_modules/ +typings/ + +# Ignore dev lockfiles (should always be built fresh) +pdm.dev.lock +requirements-dev.txt + +# Packaging artifacts +requirements.txt +.pdm-python +.pdm-build +archivebox.egg-info +archivebox-*.tar.gz +build/ +dist/ + +# Data folders +lib/ +out/ +tmp/ data/ -archivebox/output +data*/ +archivebox/tests/data/ +archive/ +output/ +logs/ +/- +/personas/ +/sources/ +index.sqlite3 +queue.sqlite3 +*.sqlite* +data.* +.archivebox_id +ArchiveBox.conf +*.stdout +*.stderr +*.log +.tmp/ + +# vim +*.sw? 
+.vscode + +# Local Claude Code task locks (not project state) +.claude/scheduled_tasks.lock diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index 9bbb6b2c0a..0000000000 --- a/.gitmodules +++ /dev/null @@ -1,3 +0,0 @@ -[submodule "docs"] - path = docs - url = https://github.com/pirate/ArchiveBox.wiki.git diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000..132f77319b --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,67 @@ +default_language_version: + python: python3.13 + +repos: + - repo: https://github.com/asottile/yesqa + rev: v1.5.0 + hooks: + - id: yesqa + + - repo: https://github.com/codespell-project/codespell + rev: v2.4.1 + hooks: + - id: codespell + additional_dependencies: + - tomli + + - repo: https://github.com/asottile/pyupgrade + rev: v3.20.0 + hooks: + - id: pyupgrade + args: [--py313-plus] + + - repo: https://github.com/asottile/add-trailing-comma + rev: v3.1.0 + hooks: + - id: add-trailing-comma + + - repo: local + hooks: + - id: ruff-format + name: ruff-format + entry: uv run --active --no-sync --no-sources ruff format + language: system + types_or: [python, pyi] + - id: ruff-check + name: ruff-check + entry: uv run --active --no-sync --no-sources ruff check --fix + language: system + types_or: [python, pyi] + + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: check-ast + - id: check-toml + - id: check-yaml + exclude: ^\.github/workflows/homebrew\.yml$ + - id: check-json + - id: check-merge-conflict + - id: check-symlinks + - id: destroyed-symlinks + - id: check-case-conflict + - id: check-illegal-windows-names + - id: check-shebang-scripts-are-executable + exclude: ^(archivebox/.*\.py|archivebox/tests/.*\.py|archivebox/personas/export_browser_state\.js)$ + - id: mixed-line-ending + - id: fix-byte-order-marker + - id: end-of-file-fixer + - id: detect-private-key + - id: debug-statements + - id: forbid-submodules + exclude: ^docs$ + - id: 
check-added-large-files + args: ["--maxkb=600"] + - id: name-tests-test + args: ["--pytest-test-first"] + exclude: ^archivebox/tests/(data/|fixtures\.py$|migrations_helpers\.py$) diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000000..6077e39c5f --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,14 @@ +I found 31 unique `no mocking` prompts in recent Codex history, across 21 session transcripts. Only one transcript literally started with `no mocking`; most had it later as testing guidance. Consolidated advice: + +- Tests must hit real user-facing code paths: CLI commands, REST/API calls, browser UI, real hooks, real ArchiveBox data dirs, real pytest fixtures, and real subprocess/binary behavior. +- No mocking, faking, simulating, monkey patching, handwritten fake objects, fake buses, fake hooks, fake binaries, fake handlers, or direct-post shortcuts when the user path is through UI/extension/CLI. +- No skipped, xfailed, flaky, or โ€œworks around platformโ€ tests. Flakiness is treated as a bug, especially on macOS/browser flows. +- Prefer live integration tests over narrow unit tests when behavior depends on browsers, binaries, ArchiveBox crawls, plugins, LLMs, or server state. +- Assertions must validate real correctness: returned values, exit codes, DB rows, filesystem contents, field values, uploaded files, rendered output, and side effects. โ€œNo error occurredโ€ or โ€œattribute existsโ€ is not enough. +- Start fixes with failing red tests that reproduce the missing behavior or regression, then implement the minimal fix and confirm the test passes. +- Use realistic setup patterns โ€œlike a user wouldโ€: events + bus + handlers, real browser pages/CDP sessions, real URLs or `pytest-httpserver`, real rows, real snapshots, real installs, real local browser/server state. +- For ArchiveBox/API tests, use existing `conftest.py` fixtures and test harnesses, real test DB rows/data dirs, and user-facing commands/APIs rather than bespoke helpers. 
+- For browser/extension tests, trigger behavior through the real extension UI or actual browser session, not direct posting or mocked browser/session objects. +- For binary/provider tests, use real binaries and real installs; verify constraints and final installed package metadata, not just install success. +- For coverage quality, keep tests strict, deterministic, grouped consistently, and use a few larger realistic tests when that gives better surface coverage than many tiny fake unit tests. +- Avoid weakening test coverage, adding compatibility/shim/fallback layers, or guessing from code shape. Trace root causes, verify assumptions with tests/scripts, and let real type/parse errors surface normally. diff --git a/CNAME b/CNAME deleted file mode 100644 index 4ff42236ef..0000000000 --- a/CNAME +++ /dev/null @@ -1 +0,0 @@ -archivebox.io \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index c53e5c7a6c..f510faa9fc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,72 +1,237 @@ -# This Dockerfile for ArchiveBox installs the following in a container: -# - curl, wget, python3, youtube-dl, google-chrome-unstable -# - ArchiveBox -# Usage: -# docker build github.com/pirate/ArchiveBox -t archivebox -# echo 'https://example.com' | docker run -i --mount type=bind,source=./data,target=/data archivebox /bin/archive -# docker run --mount type=bind,source=./data,target=/data archivebox /bin/archive 'https://example.com/some/rss/feed.xml' -# Documentation: -# https://github.com/pirate/ArchiveBox/wiki/Docker#docker - -FROM node:11-slim -LABEL maintainer="Nick Sweeting " - -RUN apt-get update \ - && apt-get install -yq --no-install-recommends \ - git wget curl youtube-dl gnupg2 libgconf-2-4 python3 python3-pip \ - && rm -rf /var/lib/apt/lists/* +# syntax=docker/dockerfile:1.7 + +# Multistage ArchiveBox Dockerfile that consumes the abx-dl runtime image. +# abx-dl owns Python, Node, Chromium, and downloader plugin runtimes. 
+# ArchiveBox owns ripgrep, sonic, supervisor, Django, and the app runtime. +# Build abx-dl first, then point this file at it: +# docker buildx build ../abx-dl -f ../abx-dl/Dockerfile \ +# --build-context abxbus=../abxbus \ +# --build-context abxpkg=../abxpkg \ +# --build-context abx-plugins=../abx-plugins \ +# -t archivebox/abx-dl:dev +# docker buildx build . -f Dockerfile \ +# --build-arg ABX_DL_IMAGE=archivebox/abx-dl:latest \ +# -t archivebox:multistage + +ARG ABX_DL_IMAGE=archivebox/abx-dl:latest + +FROM archivebox/sonic:1.4.9 AS sonic +FROM ${ABX_DL_IMAGE} AS archivebox-runtime-base -# Install latest chrome package and fonts to support major charsets (Chinese, Japanese, Arabic, Hebrew, Thai and a few others) -RUN apt-get update && apt-get install -y wget --no-install-recommends \ - && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \ - && sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' \ - && apt-get update \ - && apt-get install -y google-chrome-unstable fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst ttf-freefont \ - --no-install-recommends \ - && rm -rf /var/lib/apt/lists/* \ - && rm -rf /src/*.deb - -# It's a good idea to use dumb-init to help prevent zombie chrome processes. -ADD https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64 /usr/local/bin/dumb-init -RUN chmod +x /usr/local/bin/dumb-init - -# Uncomment to skip the chromium download when installing puppeteer. If you do, -# you'll need to launch puppeteer with: -# browser.launch({executablePath: 'google-chrome-unstable'}) -ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD true - -# Install puppeteer so it's available in the container. -RUN npm i puppeteer - -# Add user so we don't need --no-sandbox. 
-RUN groupadd -r pptruser && useradd -r -g pptruser -G audio,video pptruser \ - && mkdir -p /home/pptruser/Downloads \ - && chown -R pptruser:pptruser /home/pptruser \ - && chown -R pptruser:pptruser /node_modules - -# Install the ArchiveBox repository and pip requirements -COPY . /home/pptruser/app -RUN mkdir -p /data \ - && chown -R pptruser:pptruser /data \ - && ln -s /data /home/pptruser/app/archivebox/output \ - && ln -s /home/pptruser/app/bin/* /bin/ \ - && ln -s /home/pptruser/app/bin/archivebox /bin/archive \ - && chown -R pptruser:pptruser /home/pptruser/app/archivebox - # && pip3 install -r /home/pptruser/app/archivebox/requirements.txt - -VOLUME /data - -ENV LANG=C.UTF-8 \ +ARG TARGETPLATFORM +ARG TARGETOS +ARG TARGETARCH +ARG TARGETVARIANT + +ENV TZ=UTC \ LANGUAGE=en_US:en \ LC_ALL=C.UTF-8 \ + LANG=C.UTF-8 \ + DEBIAN_FRONTEND=noninteractive \ + APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1 \ PYTHONIOENCODING=UTF-8 \ - CHROME_SANDBOX=False \ - CHROME_BINARY=google-chrome-unstable \ - OUTPUT_DIR=/data + PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 \ + PIP_NO_COMPILE=1 \ + PIP_ONLY_BINARY=aiohttp \ + npm_config_loglevel=error + +ENV PYTHON_VERSION=3.13 \ + NODE_VERSION=24 + +ENV ARCHIVEBOX_USER=archivebox \ + DEFAULT_ARCHIVEBOX_UID=911 \ + DEFAULT_ARCHIVEBOX_GID=911 \ + IN_DOCKER=True + +ENV CODE_DIR=/app \ + DATA_DIR=/data \ + CONFIG_DIR=/opt/archivebox \ + LIB_DIR=/opt/archivebox/lib \ + ABXPKG_LIB_DIR=/opt/archivebox/lib \ + PLAYWRIGHT_BROWSERS_PATH=/opt/archivebox/lib/playwright/cache \ + PERSONAS_DIR=/data/personas \ + CHROME_USER_DATA_DIR=/data/personas/Default/chrome_profile \ + CHROME_HEADLESS=true \ + CHROME_SANDBOX=false \ + CHROME_ISOLATION=crawl \ + CHROME_ARGS_EXTRA='["--disable-gpu","--disable-features=Translate,OptimizationGuideModelDownloading,MediaRouter"]' + +ENV TMP_DIR=/tmp/archivebox \ + PIP_VENV_PYTHON=/venv/bin/python3 \ + GOOGLE_API_KEY=no \ + GOOGLE_DEFAULT_CLIENT_ID=no \ + 
GOOGLE_DEFAULT_CLIENT_SECRET=no + +ENV HOME=/home/archivebox \ + XDG_CONFIG_HOME=/home/archivebox/.config \ + XDG_CACHE_HOME=/home/archivebox/.cache \ + ABXPKG_INSTALL_TIMEOUT=600 \ + ABXPKG_POSTINSTALL_SCRIPTS=True \ + ABXPKG_MIN_RELEASE_AGE=0 \ + TIMEOUT=600 + +ENV UV_COMPILE_BYTECODE=false \ + UV_PYTHON_PREFERENCE=managed \ + UV_PYTHON_INSTALL_DIR=/opt/uv/python \ + UV_LINK_MODE=copy \ + UV_PROJECT_ENVIRONMENT=/venv \ + VIRTUAL_ENV=/venv \ + PATH="/venv/bin:/opt/node/bin:$PATH" + +SHELL ["/bin/bash", "-o", "pipefail", "-o", "errexit", "-o", "errtrace", "-o", "nounset", "-c"] +WORKDIR "$CODE_DIR" + +RUN cp /VERSION.txt /ABX-DL-VERSION.txt \ + && (echo "[i] Docker build for ArchiveBox multistage starting..." \ + && echo "PLATFORM=${TARGETPLATFORM} ARCH=$(uname -m) (${TARGETARCH} ${TARGETVARIANT})" \ + && echo "BUILD_START_TIME=$(date +"%Y-%m-%d %H:%M:%S %s") TZ=${TZ} LANG=${LANG}" \ + && uname -a \ + && sed -n '1,7p' /etc/os-release \ + && which node && node --version \ + && which uv && uv self version \ + ) | tee -a /VERSION.txt + +RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \ + echo "[+] APT Installing ArchiveBox search dependency ripgrep for $TARGETPLATFORM..." \ + && apt-get update -qq \ + && apt-get install -qq -y --no-install-recommends ripgrep \ + && rm -rf /var/lib/apt/lists/* + +FROM archivebox-runtime-base AS archivebox-builder + +WORKDIR "$CODE_DIR" +RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \ + --mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \ + --mount=type=bind,source=pyproject.toml,target=/app/pyproject.toml \ + echo "[+] UV Installing ArchiveBox dependencies from pyproject.toml..." 
\ + && echo 'Binary::apt::APT::Keep-Downloaded-Packages "1";' > /etc/apt/apt.conf.d/99keep-cache \ + && echo 'APT::Install-Recommends "0";' > /etc/apt/apt.conf.d/99no-install-recommends \ + && echo 'APT::Install-Suggests "0";' > /etc/apt/apt.conf.d/99no-install-suggests \ + && rm -f /etc/apt/apt.conf.d/docker-clean \ + && apt-get update -qq \ + && apt-get install -qq -y --no-install-recommends \ + build-essential gcc libldap2-dev libsasl2-dev libssl-dev \ + && /usr/bin/uv venv --clear /venv --python "${PYTHON_VERSION}" \ + && /usr/bin/uv pip install setuptools pip wheel \ + && /usr/bin/uv sync \ + --refresh \ + --no-dev \ + --inexact \ + --no-install-project \ + --no-install-workspace \ + --no-sources \ + && (find /venv/lib/python3.*/site-packages -type f -name '*.so' -exec strip --strip-unneeded {} + 2>/dev/null || true) \ + && rm -f /venv/bin/uv /venv/bin/uvx \ + && apt-get purge -y build-essential gcc libldap2-dev libsasl2-dev libssl-dev \ + && apt-get autoremove -y \ + && rm -rf /var/lib/apt/lists/* + +COPY --chown=root:root --chmod=755 "." "$CODE_DIR/" +RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked,id=uv-$TARGETARCH$TARGETVARIANT \ + echo "[*] Installing ArchiveBox Python source code from $CODE_DIR..." 
\ + && COMMIT_HASH="$( \ + if [[ -f "$CODE_DIR/.git/HEAD" ]]; then \ + HEAD_REF="$(cat "$CODE_DIR/.git/HEAD")"; \ + if [[ "$HEAD_REF" =~ ^[0-9a-fA-F]{40}$ ]]; then \ + echo "$HEAD_REF"; \ + elif [[ "$HEAD_REF" == ref:\ * ]]; then \ + REF_PATH="${HEAD_REF#ref: }"; \ + cat "$CODE_DIR/.git/$REF_PATH" 2>/dev/null || awk -v ref="$REF_PATH" '$2 == ref {print $1}' "$CODE_DIR/.git/packed-refs" 2>/dev/null || true; \ + fi; \ + fi)" \ + && if [[ "$COMMIT_HASH" =~ ^[0-9a-fA-F]{40}$ ]]; then echo "COMMIT_HASH=$COMMIT_HASH" | tee -a /VERSION.txt; fi \ + && /usr/bin/uv pip install --no-deps "$CODE_DIR" \ + && rm -f /venv/bin/uv /venv/bin/uvx \ + && (/usr/bin/uv pip show archivebox && which archivebox) | tee -a /VERSION.txt \ + && rm -rf "$CODE_DIR/.git" + +FROM archivebox-runtime-base + +LABEL name="archivebox" \ + maintainer="Nick Sweeting " \ + description="All-in-one self-hosted internet archiving solution" \ + homepage="https://github.com/ArchiveBox/ArchiveBox" \ + documentation="https://github.com/ArchiveBox/ArchiveBox/wiki/Docker" \ + org.opencontainers.image.title="ArchiveBox" \ + org.opencontainers.image.vendor="ArchiveBox" \ + org.opencontainers.image.description="All-in-one self-hosted internet archiving solution" \ + org.opencontainers.image.source="https://github.com/ArchiveBox/ArchiveBox" \ + com.docker.image.source.entrypoint="Dockerfile" + +COPY --from=sonic /usr/local/bin/sonic /usr/local/bin/sonic +COPY --chown=root:root --chmod=755 "etc/sonic.cfg" /etc/sonic.cfg + +COPY --from=archivebox-builder /opt/uv/python /opt/uv/python +COPY --from=archivebox-builder /venv /venv +COPY --from=archivebox-builder /app /app +COPY --from=archivebox-builder /VERSION.txt /VERSION.txt + +RUN echo "[*] Setting up $ARCHIVEBOX_USER user uid=${DEFAULT_ARCHIVEBOX_UID}..." 
\ + && printf 'export PATH="/venv/bin:/opt/node/bin:$PATH"\n' > /etc/profile.d/archivebox-path.sh \ + && ln -sf /venv/bin/archivebox /usr/local/bin/archivebox \ + && ln -sf /venv/bin/daphne /usr/local/bin/daphne \ + && ln -sf /venv/bin/supervisord /usr/local/bin/supervisord \ + && ln -sf /venv/bin/supervisorctl /usr/local/bin/supervisorctl \ + && getent group "$ARCHIVEBOX_USER" >/dev/null || groupadd --system "$ARCHIVEBOX_USER" \ + && id -u "$ARCHIVEBOX_USER" >/dev/null 2>&1 || useradd --system --create-home --gid "$ARCHIVEBOX_USER" --groups audio,video "$ARCHIVEBOX_USER" \ + && usermod --append --groups audio,video "$ARCHIVEBOX_USER" \ + && [[ "$(id -u "$ARCHIVEBOX_USER")" == "$DEFAULT_ARCHIVEBOX_UID" ]] || usermod -u "$DEFAULT_ARCHIVEBOX_UID" "$ARCHIVEBOX_USER" \ + && [[ "$(id -g "$ARCHIVEBOX_USER")" == "$DEFAULT_ARCHIVEBOX_GID" ]] || groupmod -g "$DEFAULT_ARCHIVEBOX_GID" "$ARCHIVEBOX_USER" \ + && (which sonic && sonic --version) | tee -a /VERSION.txt \ + && install -d -o "$DEFAULT_ARCHIVEBOX_UID" -g "$DEFAULT_ARCHIVEBOX_GID" "$DATA_DIR" "$TMP_DIR" "$CONFIG_DIR" "$LIB_DIR" "$PLAYWRIGHT_BROWSERS_PATH" \ + && install -d -o "$DEFAULT_ARCHIVEBOX_UID" -g "$DEFAULT_ARCHIVEBOX_GID" "/home/$ARCHIVEBOX_USER" "/home/$ARCHIVEBOX_USER/.cache" \ + && install -d -o "$DEFAULT_ARCHIVEBOX_UID" -g "$DEFAULT_ARCHIVEBOX_GID" "/home/$ARCHIVEBOX_USER/.cache/abxbus/semaphores" "/home/$ARCHIVEBOX_USER/.cache/pnpm" "/home/$ARCHIVEBOX_USER/.cache/uv" \ + && chown "$DEFAULT_ARCHIVEBOX_UID:$DEFAULT_ARCHIVEBOX_GID" "$DATA_DIR" "$TMP_DIR" "$LIB_DIR" "$PLAYWRIGHT_BROWSERS_PATH" "/home/$ARCHIVEBOX_USER/.cache/abxbus" "/home/$ARCHIVEBOX_USER/.cache/abxbus/semaphores" \ + && openssl rand -hex 16 > /etc/machine-id \ + && echo -e "\nARCHIVEBOX_USER=$ARCHIVEBOX_USER ARCHIVEBOX_UID=$(id -u "$ARCHIVEBOX_USER") ARCHIVEBOX_GID=$(id -g "$ARCHIVEBOX_USER")" | tee -a /VERSION.txt \ + && echo -e "TMP_DIR=$TMP_DIR\nLIB_DIR=$LIB_DIR\nPLAYWRIGHT_BROWSERS_PATH=$PLAYWRIGHT_BROWSERS_PATH\nMACHINE_ID=$(cat 
/etc/machine-id)\n" | tee -a /VERSION.txt + +WORKDIR "$DATA_DIR" +RUN echo "[+] Initializing image collection..." \ + && find "$DATA_DIR" -mindepth 1 -maxdepth 1 -exec rm -rf {} + \ + && archivebox init \ + && (chown "$DEFAULT_ARCHIVEBOX_UID:$DEFAULT_ARCHIVEBOX_GID" \ + "$DATA_DIR" "$DATA_DIR"/.archivebox_id "$DATA_DIR"/ArchiveBox.conf "$DATA_DIR"/index.sqlite3 \ + "$DATA_DIR"/logs "$DATA_DIR"/logs/* "$DATA_DIR"/sources \ + "$DATA_DIR"/archive "$DATA_DIR"/archive/users "$DATA_DIR"/personas \ + "$DATA_DIR"/tmp "$DATA_DIR"/tmp/* \ + "$CONFIG_DIR" "$CONFIG_DIR"/config.env "$CONFIG_DIR"/derived.env \ + "$TMP_DIR" "$LIB_DIR" "$PLAYWRIGHT_BROWSERS_PATH" "/home/$ARCHIVEBOX_USER/.cache" \ + 2>/dev/null || true) \ + && find "$TMP_DIR" -mindepth 1 -maxdepth 1 -exec rm -rf {} + + +RUN chmod +x "$CODE_DIR"/bin/*.sh \ + && chmod g+w "$TMP_DIR" "$LIB_DIR" "$PLAYWRIGHT_BROWSERS_PATH" + +RUN "$LIB_DIR/playwright/bin/chromium" --version | tee -a /VERSION.txt \ + && "$LIB_DIR/uv/packages/papers-dl/venv/bin/papers-dl" --version | tee -a /VERSION.txt \ + && /usr/bin/rg --version | head -1 | tee -a /VERSION.txt \ + && /usr/local/bin/sonic --version | tee -a /VERSION.txt \ + && /venv/bin/supervisord --version | tee -a /VERSION.txt \ + && for forbidden_bin in gcc g++ make; do ! 
command -v "$forbidden_bin" || (echo "Unexpected build tool in runtime: $forbidden_bin=$(command -v "$forbidden_bin")" >&2 && exit 1); done \ + && stat -c "%U:%G %a %n" "$CONFIG_DIR" "$LIB_DIR" "$PLAYWRIGHT_BROWSERS_PATH" \ + && setpriv --reuid="$ARCHIVEBOX_USER" --regid="$ARCHIVEBOX_USER" --init-groups test -w "$CONFIG_DIR" \ + && setpriv --reuid="$ARCHIVEBOX_USER" --regid="$ARCHIVEBOX_USER" --init-groups test -w "$LIB_DIR" \ + && setpriv --reuid="$ARCHIVEBOX_USER" --regid="$ARCHIVEBOX_USER" --init-groups archivebox version 2>&1 | tee -a /VERSION.txt \ + && chown -R "$DEFAULT_ARCHIVEBOX_UID:$DEFAULT_ARCHIVEBOX_GID" "/home/$ARCHIVEBOX_USER/.cache" \ + && setpriv --reuid="$ARCHIVEBOX_USER" --regid="$ARCHIVEBOX_USER" --init-groups test -w "/home/$ARCHIVEBOX_USER/.cache/abxbus/semaphores" \ + && setpriv --reuid="$ARCHIVEBOX_USER" --regid="$ARCHIVEBOX_USER" --init-groups test -w "/home/$ARCHIVEBOX_USER/.cache/uv" \ + && setpriv --reuid="$ARCHIVEBOX_USER" --regid="$ARCHIVEBOX_USER" --init-groups abx-dl install \ + && rm -rf /root/.cache /var/cache/apt/* /var/lib/apt/lists/* + +RUN (echo -e "\n\n[โˆš] Finished ArchiveBox multistage Docker build successfully." 
\ + && echo -e "PLATFORM=${TARGETPLATFORM} ARCH=$(uname -m) (${TARGETARCH} ${TARGETVARIANT})" \ + && echo -e "BUILD_END_TIME=$(date +"%Y-%m-%d %H:%M:%S %s")\n\n" \ + ) | tee -a /VERSION.txt + +WORKDIR "$DATA_DIR" +VOLUME "$DATA_DIR" +EXPOSE 8000 -# Run everything from here on out as non-privileged user -USER pptruser -WORKDIR /home/pptruser/app +HEALTHCHECK --interval=30s --timeout=20s --retries=15 \ + CMD curl --fail --silent --show-error --max-time 5 --connect-timeout 2 'http://admin.archivebox.localhost:8000/health/' | grep -q 'OK' -ENTRYPOINT ["dumb-init", "--"] -CMD ["/bin/archive"] +ENTRYPOINT ["dumb-init", "--", "/app/bin/docker_entrypoint.sh"] +CMD ["archivebox", "server", "--init", "0.0.0.0:8000"] diff --git a/LICENSE b/LICENSE index 8d78f3beca..f9a7bc7604 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2019 Nick Sweeting +Copyright (c) 2026 Nick Sweeting Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 1622c3938a..e5fca21f70 100644 --- a/README.md +++ b/README.md @@ -1,278 +1,1677 @@ -
- -

ArchiveBox
The open-source self-hosted web archive.

- -โ–ถ๏ธ Quickstart | -Demo | -Github | -Documentation | -Info & Motivation | -Community | -Roadmap - -
-"Your own personal internet archive" (网站存档 / 爬虫)
-
+
+ +

ArchiveBox
Open-source self-hosted web archiving.

+ +
+ +โ–ถ๏ธ Quickstart | Demo | GitHub | Documentation | Info & Motivation | Community + +
- - - - - - -
+     -*๐Ÿ’ฅ Attention: Big API changes are coming soon! Check out [v0.4.0](https://github.com/pirate/ArchiveBox/pull/207) and help us test it! ๐Ÿ’ฅ* +
+
+
-**ArchiveBox takes a list of website URLs you want to archive, and creates a local, static, browsable HTML clone of the content from those websites (it saves HTML, JS, media files, PDFs, images and more).** +**ArchiveBox is a self-hosted app that lets you preserve content from websites in a variety of formats.** -You can use it to preserve access to websites you care about by storing them locally offline. ArchiveBox imports lists of URLs, renders the pages in a headless, autheticated, user-scriptable browser, and then archives the content in multiple redundant common formats (HTML, PDF, PNG, WARC) that will last long after the originals disappear off the internet. It automatically extracts assets and media from pages and saves them in easily-accessible folders, with out-of-the-box support for extracting git repositories, audio, video, subtitles, images, PDFs, and more. +We aim to make your data immediately useful, and kept in formats that other programs can read directly. As output, we save standard HTML, PNG, PDF, TXT, JSON, WARC, SQLite, all guaranteed to be readable for decades to come. ArchiveBox also has a CLI, REST API, and webhooks so you can set up integrations with other services. -#### How does it work? +Without active preservation effort, everything on the internet eventually disappears or degrades. + +*ArchiveBox is an open source tool that lets organizations & individuals archive both public & private web content while retaining control over their data. It can be used to save copies of bookmarks, preserve evidence for legal cases, backup photos from FB/Insta/Flickr or media from YT/Soundcloud/etc., save research papers, and more...* +
+ +> โžก๏ธ Get ArchiveBox with `pip install archivebox` on [Linux](#quickstart)/[macOS](#quickstart), or via **[Docker](#quickstart)** โญ๏ธ on any OS. + +*Once installed, you can interact with it through the: [Browser Extension](https://github.com/ArchiveBox/archivebox-browser-extension), [CLI](#usage), [self-hosted web interface](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive), [Python API](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#python-shell-usage), or [filesystem](#static-archive-exporting).* + +
+
+
+ +๐Ÿ“ฅ **You can feed ArchiveBox URLs one at a time, or schedule regular imports** from your bookmarks or history, social media feeds or RSS, link-saving services like Pocket/Pinboard, our [Browser Extension](https://github.com/ArchiveBox/archivebox-browser-extension), and more. +See Input Formats for a full list of supported input formats... + +
+ +snapshot detail page + +**It saves snapshots of the URLs you feed it in several redundant formats.** +It also detects any content featured *inside* pages & extracts it out into a folder: +- ๐ŸŒ **HTML**/**Any websites** โžก๏ธ `original HTML+CSS+JS`, `singlefile HTML`, `screenshot PNG`, `PDF`, `WARC`, `title`, `article text`, `favicon`, `headers`, ... +- ๐ŸŽฅ **Social Media**/**News** โžก๏ธ `post content TXT`, `comments`, `title`, `author`, `images`, ... +- ๐ŸŽฌ **YouTube**/**SoundCloud**/etc. โžก๏ธ `MP3/MP4`s, `subtitles`, `metadata`, `thumbnail`, ... +- ๐Ÿ’พ **Github**/**Gitlab**/etc. links โžก๏ธ `clone of GIT source code`, `README`, `images`, ... +- โœจ *and more, see [Output Formats](#output-formats) below...* + +You can run ArchiveBox as a Docker web app to manage these snapshots, or continue accessing the same collection using the `pip`-installed CLI, Python API, and SQLite3 APIs. +All the ways of using it are equivalent, and provide matching features like adding tags, scheduling regular crawls, viewing logs, and more... + +
+
+ +๐Ÿ› ๏ธ ArchiveBox uses [standard tools](#dependencies) like Chrome, [`wget`](https://www.gnu.org/software/wget/), & [`yt-dlp`](https://github.com/yt-dlp/yt-dlp), and stores data in [ordinary files & folders](#archive-layout). +*(no complex proprietary formats, all data is readable without needing to run ArchiveBox)* + +The goal is to sleep soundly knowing the part of the internet you care about will be automatically preserved in durable, easily accessible formats [for decades](#background--motivation) after it goes down. -```bash -echo 'http://example.com' | ./archive -``` -After installing the dependencies, just pipe some new links into the `./archive` command to start your archive. -ArchiveBox is written in Python 3.5 and uses wget, Chrome headless, youtube-dl, pywb, and other common unix tools to save each page you add in multiple redundant formats. It doesn't require a constantly running server or backend, just open the generated `output/index.html` in a browser to view the archive. It can import and export links as JSON (among other formats), so it's easy to script or hook up to other APIs. If you run it on a schedule and import from browser history or bookmarks regularly, you can sleep soundly knowing that the slice of the internet you care about will be automatically preserved in multiple, durable long-term formats that will be accessible for decades (or longer). +
+
+ + +**๐Ÿ“ฆ  Install ArchiveBox using your preferred method: `docker` / `pip` / `apt` / etc. ([see full Quickstart below](#quickstart)).** -
-CLI Screenshot -Desktop index screenshot -Desktop details page Screenshot
-Demo | Usage | Screenshots +
Expand for quick copy-pastable install commands...   ⤵️ +
+
# Option A: Get ArchiveBox with Docker Compose (recommended):
+mkdir -p ~/archivebox/data && cd ~/archivebox
+curl -fsSL 'https://docker-compose.archivebox.io' > docker-compose.yml   # edit options in this file as-needed
+docker compose run archivebox init
+# docker compose run archivebox add 'https://example.com'
+# docker compose run archivebox help
+# docker compose up
+
+
+# Option B: Or use it as a plain Docker container: +mkdir -p ~/archivebox/data && cd ~/archivebox/data +docker run -it -v $PWD:/data archivebox/archivebox init +# docker run -it -v $PWD:/data archivebox/archivebox add 'https://example.com' +# docker run -it -v $PWD:/data archivebox/archivebox help +# docker run -it -v $PWD:/data -p 8000:8000 archivebox/archivebox +
+
+# Option C: Or install it with your preferred pkg manager (see Quickstart below for apt, brew, and more) +pip install archivebox +mkdir -p ~/archivebox/data && cd ~/archivebox/data +archivebox init +archivebox install +# archivebox add 'https://example.com' +# archivebox help +# archivebox server 0.0.0.0:8000 +
+
+# Option D: Or use the optional auto setup script to install it +curl -fsSL 'https://get.archivebox.io' | bash +
+
+Open http://web.archivebox.localhost:8000 for the public UI and http://admin.archivebox.localhost:8000 for the admin UI ➡️
+Set BIND_ADDR to change the base domain; web. and admin. subdomains are used automatically. +
+
+ + +
+

+bookshelf graphic   logo   bookshelf graphic +

+Demo | Screenshots | Usage
. . . . . . . . . . . . . . . . . . . . . . . . . . . . -

+

+cli init screenshot +cli init screenshot +server snapshot admin screenshot +server snapshot details page screenshot +

+
+ +## Key Features -## Quickstart +- [**Free & open source**](https://github.com/ArchiveBox/ArchiveBox/blob/dev/LICENSE), own your own data & maintain your privacy by self-hosting +- [**Powerful CLI**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage) with [modular dependencies](#dependencies) and [support for Google Drive/NFS/SMB/S3/B2/etc.](https://github.com/ArchiveBox/ArchiveBox/wiki/Setting-Up-Storage) +- [**Comprehensive documentation**](https://github.com/ArchiveBox/ArchiveBox/wiki), [active development](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap), and [rich community](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) +- [**Extracts a wide variety of content out-of-the-box**](https://github.com/ArchiveBox/ArchiveBox/issues/51): [media (yt-dlp), articles (readability), code (git), etc.](#output-formats) +- [**Supports scheduled/realtime importing**](https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving) from [many types of sources](#input-formats) +- [**Uses standard, durable, long-term formats**](#output-formats) like HTML, JSON, PDF, PNG, MP4, TXT, and WARC +- [**Powerful CLI**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage), [**self-hosted web UI**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#UI-Usage), [Python API](https://docs.archivebox.io/en/dev/apidocs/archivebox/archivebox.html) (BETA), [REST API](https://github.com/ArchiveBox/ArchiveBox/issues/496) (ALPHA), or [desktop app](https://github.com/ArchiveBox/electron-archivebox) +- [**Saves all pages to archive.org as well**](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_archive_dot_org) by default for redundancy (can be [disabled](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#stealth-mode) for local-only mode) +- Advanced users: support for archiving [content requiring login/paywall/cookies](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#chrome_user_data_dir) (see wiki 
security caveats!) +- Planned: support for running [JS during archiving](https://github.com/ArchiveBox/ArchiveBox/issues/51) to adblock, [autoscroll](https://github.com/ArchiveBox/ArchiveBox/issues/80), [modal-hide](https://github.com/ArchiveBox/ArchiveBox/issues/175), [thread-expand](https://github.com/ArchiveBox/ArchiveBox/issues/345) -ArchiveBox has [3 main dependencies](https://github.com/pirate/ArchiveBox/wiki/Install#dependencies) beyond `python3`: `wget`, `chromium`, and `youtube-dl`. -To get started, you can [install them manually](https://github.com/pirate/ArchiveBox/wiki/Install) using your system's package manager, use the [automated helper script](https://github.com/pirate/ArchiveBox/wiki/Quickstart), or use the official [Docker](https://github.com/pirate/ArchiveBox/wiki/Docker) container. All three dependencies are optional if [disabled](https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles) in settings. +
+ +## 🤝 Professional Integration + +ArchiveBox is free for everyone to self-host, but we also provide support, security review, and custom integrations to help NGOs, governments, and other organizations [run ArchiveBox professionally](https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102): + +- **Journalists:** + `crawling during research`, `preserving cited pages`, `fact-checking & review` +- **Lawyers:** + `collecting & preserving evidence`, `detecting changes`, `tagging & review` +- **Researchers:** + `analyzing social media trends`, `getting LLM training data`, `crawling pipelines` +- **Individuals:** + `saving bookmarks`, `preserving portfolio content`, `legacy / memoirs archival` +- **Governments:** + `snapshotting public service sites`, `recordkeeping compliance` + +> ***[Contact us](https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102)** if your org wants help using ArchiveBox professionally.* +> We offer: setup & support, CAPTCHA/ratelimit unblocking, SSO, audit logging/chain-of-custody, and more +> *ArchiveBox is a 🏛️ 501(c)(3) [nonprofit FSP](https://hackclub.com/hcb/) and all our work supports open-source development.* + +
+ +
+
+grassgrass +
+ + + +# Quickstart + +**🖥  [Supported OSs](https://github.com/ArchiveBox/ArchiveBox/wiki/Install#supported-systems):** Linux/BSD, macOS, Windows (Docker)   **👾  CPUs:** `amd64` (`x86_64`), `arm64`, `arm7` (raspi>=3)
+ +
+ +#### ✳️  Easy Setup + +
+Docker docker-compose (macOS/Linux/Windows)   👈  recommended   (click to expand) +
+👍 Docker Compose is recommended for the easiest install/update UX + best security + all extras out-of-the-box. +

+
    +
  1. Install Docker on your system (if not already installed).
  2. +
  3. Download the docker-compose.yml file into a new empty directory (can be anywhere). +
    mkdir -p ~/archivebox/data && cd ~/archivebox
    +# Read and edit docker-compose.yml options as-needed after downloading
    +curl -fsSL 'https://docker-compose.archivebox.io' > docker-compose.yml
    +
  4. +
  5. Run the initial setup to create an admin user (or set ADMIN_USERNAME/ADMIN_PASSWORD in docker-compose.yml) +
    docker compose run archivebox init
    +
  6. +
  7. Next steps: Start the server then login to the Web UI http://archivebox.localhost:8000 ⇢ Admin. +
    docker compose up
    +# completely optional, CLI can always be used without running a server
    +# docker compose run [-T] archivebox [subcommand] [--help]
    +docker compose run archivebox add 'https://example.com'
    +docker compose run archivebox help
    +
    +For more info, see Install: Docker Compose in the Wiki. ➡️ +
  8. +
+ +See below for more usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive. +

+
+ +
+Docker docker run (macOS/Linux/Windows) +
+
    +
  1. Install Docker on your system (if not already installed).
  2. +
  3. Create a new empty directory and initialize your collection (can be anywhere). +
    mkdir -p ~/archivebox/data && cd ~/archivebox/data
    +docker run -v $PWD:/data -it archivebox/archivebox init
    +
    +
  4. +
  5. Optional: Start the server then login to the Web UI http://archivebox.localhost:8000 ⇢ Admin. +
    docker run -v $PWD:/data -p 8000:8000 archivebox/archivebox
    +# completely optional, CLI can always be used without running a server
    +# docker run -v $PWD:/data -it archivebox/archivebox [subcommand] [--help]
    +docker run -v $PWD:/data -it archivebox/archivebox help
    +
    +For more info, see Install: Docker Compose in the Wiki. ➡️ +
  6. +
+ +See below for more usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive. +

+
+ +
+curl sh automatic setup script bash auto-setup script (macOS/Linux) +
+
    +
  1. Install Docker on your system (optional, highly recommended but not required).
  2. +
  3. Run the automatic setup script. +
    curl -fsSL 'https://get.archivebox.io' | bash
    +For more info, see Install: Bare Metal in the Wiki. ➡️ +
  4. +
+ +See below for more usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive.
+See setup.sh for the source code of the auto-install script.
+See "Against curl | sh as an install method" blog post for my thoughts on the shortcomings of this install method. +

+
+ +
+ +#### 🛠  Package Manager Setup + + + + +
+Pip pip (macOS/Linux/BSD) +
+
    + +
  1. Install Python >= v3.13 and Node >= v22 on your system (if not already installed).
  2. +
  3. Install the ArchiveBox package using pip3 (or uvx). +
    pip3 install --upgrade archivebox
    +archivebox version
    +# install any missing extras shown using apt/brew/pkg/etc. see Wiki for instructions
    +#    python@3.13 node curl wget git ripgrep ...
    +
    +See the Install: Bare Metal Wiki for full install instructions for each OS... +
  4. +
  5. Create a new empty directory and initialize your collection (can be anywhere). +
    mkdir -p ~/archivebox/data && cd ~/archivebox/data   # for example
    +archivebox init     # initialize a new collection
    +archivebox install  # install all the runtime dependencies (e.g. chrome, single-file, yt-dlp, etc.)
    +
    +
  6. +
  7. Optional: Start the server then login to the Web UI http://archivebox.localhost:8000 ⇢ Admin. +
    archivebox server 0.0.0.0:8000
    +# completely optional, CLI can always be used without running a server
    +# archivebox [subcommand] [--help]
    +archivebox help
    +
    +
  8. +
+ +See below for more usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive.
+
+See the pip-archivebox repo for more details about this distribution. +

+
+ + +
+aptitude apt (Ubuntu/Debian/etc.) +
+
    +
  1. Add the third-party ArchiveBox apt repo and install archivebox. +
    echo 'deb [trusted=yes] https://archivebox.github.io/debian-archivebox dev main' | sudo tee /etc/apt/sources.list.d/archivebox.list
    +sudo apt update
    +sudo apt install archivebox
    +archivebox version                         # make sure the package is installed
    +
    +
  2. +
  3. Create a new empty directory and initialize your collection (can be anywhere). +
    mkdir -p ~/archivebox/data && cd ~/archivebox/data
    +archivebox init
    +sudo archivebox install
    +archivebox add 'https://example.com'
    +
    +
    +
  4. +
  5. Optional: Start the server then login to the Web UI http://archivebox.localhost:8000 ⇢ Admin. +
    archivebox server 0.0.0.0:8000
    +# completely optional, CLI can always be used without running a server
    +# archivebox [subcommand] [--help]
    +archivebox help
    +
    +
  6. +
+See below for more usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive.
+See the debian-archivebox repo for more details about this distribution. +

+
+ +
+homebrew brew (macOS only) +
+
    +
  1. Install Homebrew on your system (if not already installed).
  2. +
  3. Install the ArchiveBox package using brew. +
    brew tap archivebox/archivebox
    +brew install archivebox
    +archivebox version                         # make sure all dependencies are installed
    +
    +See the Install: Bare Metal Wiki for more granular instructions for macOS... ➡️ +
  4. +
  5. Create a new empty directory and initialize your collection (can be anywhere). +
    mkdir -p ~/archivebox/data && cd ~/archivebox/data
    +archivebox init
    +archivebox install
    +
    +
  6. +
  7. Optional: Start the server then login to the Web UI http://archivebox.localhost:8000 ⇢ Admin. +
    archivebox server 0.0.0.0:8000
    +# completely optional, CLI can always be used without running a server
    +# archivebox [subcommand] [--help]
    +archivebox help
    +

    +
  8. +
+ +See below for more usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive.
+See the homebrew-archivebox repo for more details about this distribution. +

+
+ +
+Arch pacman / FreeBSD pkg / Nix nix (Arch/FreeBSD/NixOS/more) +
+ +> *Warning: These are contributed by external volunteers and may lag behind the official `pip` channel.* + + +See below for usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive. +

+
+ +
+ +#### 🎗  Other Options + +
+Docker docker + Electron electron Desktop App (macOS/Linux/Windows) +
+
    +
  1. Install Docker on your system (if not already installed).
  2. +
  3. Download a binary release for your OS or build the native app from source
    + +
  4. +
+ +
+✨ Alpha (contributors wanted!): for more info, see the Electron ArchiveBox repo. +
+
+ +
+Self-hosting Platforms TrueNAS / UNRAID / YunoHost / Cloudron / etc. (self-hosting solutions) +
+ +> *Warning: These are contributed by external volunteers and may lag behind the official `pip` channel.* + + +See below for usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive. +

+
+ +
+paid Paid hosting solutions (cloud VPS) +
+ +For more discussion on managed and paid hosting options see here: Issue #531. + +
+ +
+ +#### ➡️  Next Steps + +- Import URLs from some of the supported [Input Formats](#input-formats) or view the supported [Output Formats](#output-formats)... +- (Optional) Create a persona and import browser cookies to archive logged-in sites: `archivebox persona create --import=chrome personal` +- Tweak your UI or archiving behavior [Configuration](#configuration), read about some of the [Caveats](#caveats), or [Troubleshoot](https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting) +- Read about the [Dependencies](#dependencies) used for archiving, the [Upgrading Process](https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives), or the [Archive Layout](#archive-layout) on disk... +- Or check out our full [Documentation](#documentation) or [Community Wiki](#internet-archiving-ecosystem)... + +
+ +### Usage + +#### โšก๏ธ  CLI Usage + +ArchiveBox commands can be run in a terminal [directly on your host](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#cli-usage), or via [Docker](https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#usage-1)/[Docker Compose](https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#usage). +(depending on how you chose to install it above) + + + +```bash +mkdir -p ~/archivebox/data # create a new data dir anywhere +cd ~/archivebox/data # IMPORTANT: cd into the directory + +# archivebox [subcommand] [--help] +archivebox version +archivebox help -# 2. Download ArchiveBox -git clone https://github.com/pirate/ArchiveBox.git && cd ArchiveBox +# equivalent: docker compose run archivebox [subcommand] [--help] +docker compose run archivebox help -# 3. Add your first links to your archive -echo 'https://example.com' | ./archive # pass URLs to archive via stdin +# equivalent: docker run -it -v $PWD:/data archivebox/archivebox [subcommand] [--help] +docker run -it -v $PWD:/data archivebox/archivebox help -./archive https://getpocket.com/users/example/feed/all # or import an RSS/JSON/XML/TXT feed +# optional: import your browser cookies into a persona for logged-in archiving +archivebox persona create --import=chrome personal +# supported: chrome/chromium/brave/edge (Chromium-based only) +# use --profile to target a specific profile (e.g. Default, Profile 1) +# re-running import merges/dedupes cookies.txt (by domain/path/name) but replaces chrome_user_data ``` -Once you've added your first links, open `output/index.html` in a browser to view the archive. [DEMO: archive.sweeting.me](https://archive.sweeting.me) -For more information, see the [full Quickstart guide](https://github.com/pirate/ArchiveBox/wiki/Quickstart), [Usage](https://github.com/pirate/ArchiveBox/wiki/Usage), and [Configuration](https://github.com/pirate/ArchiveBox/wiki/Configuration) docs. 
+#### ArchiveBox Subcommands + +- `archivebox` `help`/`version` to see the list of available subcommands / currently installed version info +- `archivebox` `setup`/`init`/`config`/`status`/`shell`/`manage` to administer your collection +- `archivebox` `add`/`schedule` to pull in fresh URLs from [bookmarks/history/RSS/etc.](#input-formats) +- `archivebox` `list`/`update`/`remove` to manage existing Snapshots in your collection + +
+
+curl sh automatic setup script CLI Usage Examples: non-Docker +
+

+# make sure you have pip-installed ArchiveBox and it's available in your $PATH first  
+
+# archivebox [subcommand] [--help] +mkdir -p ~/archivebox/data && cd ~/archivebox/data +archivebox init +sudo archivebox install +archivebox add 'https://example.com' +archivebox version # get archivebox version info + check dependencies +archivebox help # get list of archivebox subcommands that can be run +
+For more info, see our Usage: CLI Usage wiki. ➡️ +
+ +
+ +
+Docker CLI Usage Examples: Docker Compose +
+

+# make sure you have `docker-compose.yml` from the Quickstart instructions first
+
+# docker compose run archivebox [subcommand] [--help] +docker compose run archivebox init +docker compose run archivebox install +docker compose run archivebox version +docker compose run archivebox help +docker compose run archivebox add 'https://example.com' +# to start webserver: docker compose up +
+For more info, see our Usage: Docker Compose CLI wiki. ➡️ +
+ +
+ +
+Docker CLI Usage Examples: Docker +
+

+# make sure you create and cd into a new empty directory first  
+
+# docker run -it -v $PWD:/data archivebox/archivebox [subcommand] [--help] +docker run -v $PWD:/data -it archivebox/archivebox init +docker run -v $PWD:/data -it archivebox/archivebox install +docker run -v $PWD:/data -it archivebox/archivebox version +docker run -v $PWD:/data -it archivebox/archivebox help +docker run -v $PWD:/data -it archivebox/archivebox add 'https://example.com' +# to start webserver: docker run -v $PWD:/data -it -p 8000:8000 archivebox/archivebox +
+For more info, see our Usage: Docker CLI wiki. ➡️ +
+ +
+ +
+🗄  SQL/Python/Filesystem Usage +

+archivebox shell           # explore the Python library API in a REPL
+sqlite3 ./index.sqlite3    # run SQL queries directly on your index
+ls ./archive/*/index.html  # or inspect snapshot data directly on the filesystem
+
+For more info, see our Python Shell, SQL API, and Disk Layout wikis. ➡️ +
+ + +
+ +
+🖥  Web UI & API Usage +

+# Start the server on bare metal (pip/apt/brew/etc):
+archivebox manage createsuperuser              # create a new admin user via CLI
+archivebox server 0.0.0.0:8000                 # start the server
+
+# Or with Docker Compose: +nano docker-compose.yml # setup initial ADMIN_USERNAME & ADMIN_PASSWORD +docker compose up # start the server +
+# Or with a Docker container: +docker run -v $PWD:/data -it archivebox/archivebox archivebox manage createsuperuser +docker run -v $PWD:/data -it -p 8000:8000 archivebox/archivebox +
+ +Open http://web.archivebox.localhost:8000 for the public UI and http://admin.archivebox.localhost:8000 for the admin UI ➡️
+Set BIND_ADDR to change the base domain; web. and admin. subdomains are used automatically. +

+For more info, see our Usage: Web UI wiki. ➡️ +

+Optional: Change permissions to allow non-logged-in users + +

+archivebox config --set PUBLIC_ADD_VIEW=True   # allow guests to submit URLs 
+archivebox config --set PUBLIC_SNAPSHOTS=True  # allow guests to see snapshot content
+archivebox config --set PUBLIC_INDEX=True      # allow guests to see list of all snapshots
+# or
+docker compose run archivebox config --set ...
+
+# restart the server to apply any config changes
+
+
+ +
+
+ +> [!TIP] +> Whether in Docker or not, ArchiveBox commands work the same way, and can be used to access the same data on-disk. +> For example, you could run the Web UI in Docker Compose, and run one-off commands with `pip`-installed ArchiveBox. + +
+Expand to show comparison...
+ +

+archivebox add --depth=1 'https://example.com'                     # add a URL with pip-installed archivebox on the host
+docker compose run archivebox add --depth=1 'https://example.com'                       # or w/ Docker Compose
+docker run -it -v $PWD:/data archivebox/archivebox add --depth=1 'https://example.com'  # or w/ Docker, all equivalent
+
+ +For more info, see our Docker wiki. ➡️ + +
+ + +
+
+grassgrass +
+
+ +
+. . . . . . . . . . . . . . . . . . . . . . . . . . . . +

+DEMO: https://demo.archivebox.io
+Usage | Configuration | Caveats +
+
-*(`pip install archivebox` will be available in the near future, follow our [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap) for progress)* +
--- -
- +
+lego
+
+ # Overview -Because modern websites are complicated and often rely on dynamic content, -ArchiveBox archives the sites in **several different formats** beyond what public -archiving services like Archive.org and Archive.is are capable of saving. Using multiple -methods and the market-dominant browser to execute JS ensures we can save even the most -complex, finicky websites in at least a few high-quality, long-term data formats. + -ArchiveBox imports a list of URLs from stdin, remote URL, or file, then adds the pages to a local archive folder using wget to create a browsable HTML clone, youtube-dl to extract media, and a full instance of Chrome headless for PDF, Screenshot, and DOM dumps, and more... +## Input Formats: How to pass URLs into ArchiveBox for saving -Running `./archive` adds only new, unique links into `output/` on each run. Because it will ignore duplicates and only archive each link the first time you add it, you can schedule it to [run on a timer](https://github.com/pirate/ArchiveBox/wiki/Scheduled-Archiving) and re-import all your feeds multiple times a day. It will run quickly even if the feeds are large, because it's only archiving the newest links since the last run. For each link, it runs through all the archive methods. Methods that fail will save `None` and be automatically retried on the next run, methods that succeed save their output into the data folder and are never retried/overwritten by subsequent runs. Support for saving multiple snapshots of each site over time will be [added soon](https://github.com/pirate/ArchiveBox/issues/179) (along with the ability to view diffs of the changes between runs). -All the archived links are stored by date bookmarked in `output/archive/`, and everything is indexed nicely with JSON & HTML files. The intent is for all the content to be viewable with common software in 50 - 100 years without needing to run ArchiveBox in a VM. 
+- From the official ArchiveBox Browser Extension + Provides realtime archiving of browsing history or selected pages from Chrome/Chromium/Firefox browsers. -#### Can import links from many formats: +- From manual imports of URLs from RSS, JSON, CSV, TXT, SQL, HTML, Markdown, etc. files + ArchiveBox supports ingesting URLs in [any text-based format](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Import-a-list-of-URLs-from-a-text-file). + +- From manually exported [browser history](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) or [browser bookmarks](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) (in Netscape format) + Instructions: Chrome, Firefox, Safari, IE, Opera, and more... + +- From URLs visited through a [MITM Proxy](https://mitmproxy.org/) with [`archivebox-proxy`](https://github.com/ArchiveBox/archivebox-proxy) + Provides [realtime archiving](https://github.com/ArchiveBox/ArchiveBox/issues/577) of all traffic from any device going through the proxy. + +- From bookmarking services or social media (e.g. Twitter bookmarks, Reddit saved posts, etc.) + Instructions: Pocket, Pinboard, Instapaper, Shaarli, Delicious, Reddit Saved, Wallabag, Unmark.it, OneTab, Firefox Sync, and more... -```bash -echo 'http://example.com' | ./archive -./archive ~/Downloads/firefox_bookmarks_export.html -./archive https://example.com/some/rss/feed.xml -``` - - Browser history or bookmarks exports (Chrome, Firefox, Safari, IE, Opera, and more) - - RSS, XML, JSON, CSV, SQL, HTML, Markdown, TXT, or any other text-based format - - Pocket, Pinboard, Instapaper, Shaarli, Delicious, Reddit Saved Posts, Wallabag, Unmark.it, OneTab, and more -See the [Usage: CLI](https://github.com/pirate/ArchiveBox/wiki/Usage#CLI-Usage) page for documentation and examples. 
+ -#### Saves lots of useful stuff for each imported link: + + +```bash +# archivebox add --help +archivebox add 'https://example.com/some/page' +archivebox add --depth=1 --plugins=parse_rss_urls "file://$HOME/Downloads/some_feed.xml" +archivebox add --depth=1 'https://news.ycombinator.com#2020-12-12' +echo 'http://example.com' | archivebox add +echo 'any text with urls in it' | archivebox add + +# if using Docker, add -i when piping stdin: +# echo 'https://example.com' | docker run -v $PWD:/data -i archivebox/archivebox add +# if using Docker Compose, add -T when piping stdin / stdout: +# echo 'https://example.com' | docker compose run -T archivebox add +``` + + + +See the [Usage: CLI](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage) page for documentation and examples. - - **Index:** `index.html` & `index.json` HTML and JSON index files containing metadata and details - - **Title:** `title` title of the site - - **Favicon:** `favicon.ico` favicon of the site - - **WGET Clone:** `example.com/page-name.html` wget clone of the site, with .html appended if not present - - **WARC:** `warc/.gz` gzipped WARC of all the resources fetched while archiving - - **PDF:** `output.pdf` Printed PDF of site using headless chrome - - **Screenshot:** `screenshot.png` 1440x900 screenshot of site using headless chrome - - **DOM Dump:** `output.html` DOM Dump of the HTML after rendering using headless chrome - - **URL to Archive.org:** `archive.org.txt` A link to the saved site on archive.org - - **Audio & Video:** `media/` all audio/video files + playlists, including subtitles & metadata with youtube-dl - - **Source Code:** `git/` clone of any repository found on github, bitbucket, or gitlab links - - *More coming soon! 
See the [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap)...* +It also includes a built-in scheduled import feature with `archivebox schedule`, handled by the same orchestrator that powers `archivebox server`, so you can pull in URLs from RSS feeds and websites regularly without a separate cron container. -It does everything out-of-the-box by default, but you can disable or tweak [individual archive methods](https://github.com/pirate/ArchiveBox/wiki/Configuration) via environment variables or config file. +
-If you're importing URLs with secret tokens in them (e.g Google Docs, CodiMD notepads, etc), you may want to disable some of these methods to avoid leaking private URLs to 3rd party APIs during the archiving process. See the [Security Overview](https://github.com/pirate/ArchiveBox/wiki/Security-Overview#stealth-mode) page for more details. -## Key Features + + +## Output Formats: What ArchiveBox saves for each URL + + + + +For each web page added, ArchiveBox creates a Snapshot folder and preserves its content as ordinary files inside the folder (e.g. HTML, PDF, PNG, JSON, etc.). + +It uses all available methods out-of-the-box, but you can disable extractors and fine-tune the [configuration](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) as-needed. + +
+
+Expand to see the full list of ways it saves each page... + + +data/archive/{Snapshot.id}/
+
    +
  • Index: index.html & index.json HTML and JSON index files containing metadata and details
  • +
  • Title, Favicon, Headers Response headers, site favicon, and parsed site title
  • +
  • SingleFile: singlefile.html HTML snapshot rendered with headless Chrome using SingleFile
  • +
  • Wget Clone: example.com/page-name.html wget clone of the site with warc/TIMESTAMP.gz
  • +
  • Chrome Headless
      +
    • PDF: output.pdf Printed PDF of site using headless chrome
    • +
    • Screenshot: screenshot.png 1440x900 screenshot of site using headless chrome
    • +
    • DOM Dump: output.html DOM Dump of the HTML after rendering using headless chrome
    • +
  • +
  • Article Text: article.html/json Article text extraction using Readability & Mercury
  • +
  • Archive.org Permalink: archive.org.txt A link to the saved site on archive.org
  • +
  • Audio & Video: media/ all audio/video files + playlists, including subtitles & metadata w/ yt-dlp
  • +
  • Source Code: git/ clone of any repository found on GitHub, Bitbucket, or GitLab links
  • +
  • More coming soon! See the Roadmap...
  • +
+
+
+ +## Configuration + + + +ArchiveBox can be configured via environment variables, by using the `archivebox config` CLI, or by editing `./ArchiveBox.conf`. +
+
+Expand to see examples... +
archivebox config                               # view the entire config
+archivebox config --get CHROME_BINARY           # view a specific value
+
+archivebox config --set CHROME_BINARY=chromium # persist a config using CLI +# OR +echo CHROME_BINARY=chromium >> ArchiveBox.conf # persist a config using file +# OR +env CHROME_BINARY=chromium archivebox ... # run with a one-off config +
+These methods also work the same way when run inside Docker, see the Docker Configuration wiki page for details. +

+ +The configuration is documented here: **[Configuration Wiki](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration)**, and loaded from: [`archivebox/config/`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/config/). + + +
+Expand to see the most common options to tweak... +

+# e.g. archivebox config --set TIMEOUT=120
+# or   docker compose run archivebox config --set TIMEOUT=120
+
+TIMEOUT=240 # default: 60 add more seconds on slower networks +CHECK_SSL_VALIDITY=False # default: True False = allow saving URLs w/ bad SSL +
+PUBLIC_INDEX=True # default: True whether anon users can view index +PUBLIC_SNAPSHOTS=True # default: True whether anon users can view pages +PUBLIC_ADD_VIEW=False # default: False whether anon users can add new URLs +
+USER_AGENT="Mozilla/5.0 ..." # change this to get around bot blocking +
+
+
+ +## Dependencies + +To achieve high-fidelity archives in as many situations as possible, ArchiveBox depends on a variety of 3rd-party libraries and tools that specialize in extracting different types of content. + +> Under the hood, ArchiveBox uses [Django](https://www.djangoproject.com/start/overview/) to power its [Web UI](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#ui-usage), [Django Ninja](https://django-ninja.dev/) for the REST API, and [SQLite](https://www.sqlite.org/locrsf.html) + the filesystem to provide [fast & durable metadata storage](https://www.sqlite.org/locrsf.html) w/ [deterministic upgrades](https://stackoverflow.com/a/39976321/2156113). + +ArchiveBox bundles industry-standard tools like [Google Chrome](https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install), [`wget`, `yt-dlp`, `readability`, etc.](#dependencies) internally, and its operation can be [tuned, secured, and extended](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) as-needed for many different applications. + +
+
+Expand to learn more about ArchiveBox's internals & dependencies...
+ +
+

TIP: For better security while running ArchiveBox, and to avoid polluting your host system with a bunch of sub-dependencies that you need to keep up-to-date, it is strongly recommended to use the ⭐️ official Docker image which provides everything in an easy container with simple one-liner upgrades.

+
+ +
    +
  • Language: Python >=3.13
  • +
  • Backend: Django + Django-Ninja for REST API
  • +
  • Frontend: Django Admin + Vanilla HTML, CSS, JS
  • +
  • Web Server: Django + daphne (ASGI)
  • +
  • Database: Django ORM saving to SQLite3 ./data/index.sqlite3
  • +
  • Job Queue: Custom orchestrator using supervisord for worker management
  • +
  • Build/test/lint: uv / pyright+ty+pytest / ruff
  • +
  • Subdependencies: abxpkg installs apt/brew/pip/npm pkgs at runtime (e.g. yt-dlp, singlefile, readability, git)
  • +
+ + +These optional subdependencies used for archiving sites include: + +archivebox --version CLI output screenshot showing dependencies installed + +
    +
  • chromium / chrome (for screenshots, PDF, DOM HTML, and headless JS scripts)
  • +
  • node & npm (for readability, mercury, and singlefile)
  • +
  • wget (for plain HTML, static files, and WARC saving)
  • +
  • curl (for fetching headers, favicon, and posting to Archive.org)
  • +
  • yt-dlp or youtube-dl (for audio, video, and subtitles)
  • +
  • git (for cloning git repos)
  • +
  • singlefile (for saving into a self-contained html file)
  • +
  • postlight/parser (for discussion threads, forums, and articles)
  • +
  • readability (for articles and long text content)
  • +
  • and more as we grow...
  • +
+ +You don't need to install every dependency to use ArchiveBox. ArchiveBox will automatically disable extractors that rely on dependencies that aren't installed, based on what is configured and available in your $PATH. + +If not using Docker, make sure to keep the dependencies up-to-date yourself and check that ArchiveBox isn't reporting any incompatibility with the versions you install. + +
#install python3 and archivebox with your system package manager
+# apt/brew/pip/etc install ... (see Quickstart instructions above)
+
+which -a archivebox # see where you have installed archivebox +archivebox install # auto install all the extractors and extras +archivebox --version # see info and check validity of installed dependencies +
+ +Installing directly on Windows without Docker or WSL/WSL2/Cygwin is not officially supported (I cannot respond to Windows support tickets), but some advanced users have reported getting it working. + +

Learn More

+ + +
+
+ + +## Archive Layout + +All of ArchiveBox's state (SQLite DB, content, config, logs, etc.) is stored in a single folder per collection. + +
+
+Expand to learn more about the layout of ArchiveBox's data on-disk...
+ +Data folders can be created anywhere (`~/archivebox/data` or `$PWD/data` as seen in our examples), and you can create as many data folders as you want to hold different collections. +All archivebox CLI commands are designed to be run from inside an ArchiveBox data folder, starting with archivebox init to initialize a new collection inside an empty directory. + +
mkdir -p ~/archivebox/data && cd ~/archivebox/data   # just an example, can be anywhere
+archivebox init
+ +The on-disk layout is optimized to be easy to browse by hand and durable long-term. The main index is a standard index.sqlite3 database in the root of the data folder (it can also be exported as static JSON/HTML), and the archive snapshots are organized by date-added timestamp in the data/archive/ subfolder. + + + + +
data/
+    index.sqlite3
+    ArchiveBox.conf
+    archive/
+        ...
+        1617687755/
+            index.html
+            index.json
+            screenshot.png
+            media/some_video.mp4
+            warc/1617687755.warc.gz
+            git/somerepo.git
+            ...
+
+ +Each snapshot subfolder data/archive/TIMESTAMP/ includes a static index.json and index.html describing its contents, and the snapshot extractor outputs are plain files within the folder. + +

Learn More

+ + +
+
+ + +## Static Archive Exporting + +You can export your index as static HTML using `archivebox list` (so you can view it without an ArchiveBox server). + +
+
+Expand to learn how to export your ArchiveBox collection...
+ +
+

NOTE: These exports are not paginated; exporting many URLs or the entire archive at once may be slow. Use the filtering CLI flags on the archivebox list command to export specific Snapshots or ranges.

+
+ +
# archivebox list --help
+archivebox list --html --with-headers > index.html     # export to static html table
+archivebox list --json --with-headers > index.json     # export to json blob
+archivebox list --csv=timestamp,url,title > index.csv  # export to csv spreadsheet
 
- - [**Free & open source**](https://github.com/pirate/ArchiveBox/blob/master/LICENSE), doesn't require signing up for anything, stores all data locally
- - [**Few dependencies**](https://github.com/pirate/ArchiveBox/wiki/Install#dependencies) and [simple command line interface](https://github.com/pirate/ArchiveBox/wiki/Usage#CLI-Usage)
- - [**Comprehensive documentation**](https://github.com/pirate/ArchiveBox/wiki), [active development](https://github.com/pirate/ArchiveBox/wiki/Roadmap), and [rich community](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community)
- - **Doesn't require a constantly-running server**, proxy, or native app
- - Easy to set up **[scheduled importing](https://github.com/pirate/ArchiveBox/wiki/Scheduled-Archiving) from multiple sources**
- - Uses common, **durable, [long-term formats](#saves-lots-of-useful-stuff-for-each-imported-link)** like HTML, JSON, PDF, PNG, and WARC
- - **Suitable for paywalled / [authenticated content](https://github.com/pirate/ArchiveBox/wiki/Configuration#chrome_user_data_dir)** (can use your cookies)
- - Can [**run scripts during archiving**](https://github.com/pirate/ArchiveBox/issues/51) to [scroll pages](https://github.com/pirate/ArchiveBox/issues/80), [close modals](https://github.com/pirate/ArchiveBox/issues/175), expand comment threads, etc.
- - Can also [**mirror content to 3rd-party archiving services**](https://github.com/pirate/ArchiveBox/wiki/Configuration#submit_archive_dot_org) automatically for redundancy
+# (if using Docker Compose, add the -T flag when piping)
+# docker compose run -T archivebox list --html 'https://example.com' > index.html
+
-## Background & Motivation +The paths in the static exports are relative, make sure to keep them next to your `./archive` folder when backing them up or viewing them. -Vast treasure troves of knowledge are lost every day on the internet to link rot. As a society, we have an imperative to preserve some important parts of that treasure, just like we preserve our books, paintings, and music in physical libraries long after the originals go out of print or fade into obscurity. +

Learn More

-Whether it's to resist censorship by saving articles before they get taken down or edited, or -just to save a collection of early 2010's flash games you love to play, having the tools to -archive internet content enables to you save the stuff you care most about before it disappears. + -
-
- Image from WTF is Link Rot?...
+
+
+ + +
+security graphic
-The balance between the permanence and ephemeral nature of content on the internet is part of what makes it beautiful. -I don't think everything should be preserved in an automated fashion, making all content permanent and never removable, but I do think people should be able to decide for themselves and effectively archive specific content that they care about. +## Caveats -## Comparison to Other Projects +### Archiving Private Content -โ–ถ **Check out our [community page](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community) for an index of web archiving initiatives and projects.** + - The aim of ArchiveBox is to go beyond what the Wayback Machine and other public archiving services can do, by adding a headless browser to replay sessions accurately, and by automatically extracting all the content in multiple redundant formats that will survive being passed down to historians and archivists through many generations. +If you're importing pages with private content or URLs containing secret tokens you don't want public (e.g Google Docs, paywalled content, unlisted videos, etc.), **you may want to disable some of the extractor methods to avoid leaking that content to 3rd party APIs or the public**. -#### User Interface & Intended Purpose +
+
+Expand to learn about privacy, permissions, and user accounts... -ArchiveBox differentiates itself from [similar projects](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects) by being a simple, one-shot CLI inferface for users to ingest bulk feeds of URLs over extended periods, as opposed to being a backend service that ingests individual, manually-submitted URLs from a web UI. -An alternative tool [pywb](https://github.com/webrecorder/pywb) allows you to run a browser through an always-running archiving proxy which records the traffic to WARC files. ArchiveBox intends to support this style of live proxy-archiving using `pywb` in the future, but for now it only ingests lists of links at a time via browser history, bookmarks, RSS, etc. +
# don't save private content to ArchiveBox, e.g.:
+archivebox add 'https://docs.google.com/document/d/12345somePrivateDocument'
+archivebox add 'https://vimeo.com/somePrivateVideo'
 
-#### Private Local Archives vs Centralized Public Archives
+# restrict the main index, Snapshot content, and Add Page to authenticated users as-needed:
+archivebox config --set PUBLIC_INDEX=False
+archivebox config --set PUBLIC_SNAPSHOTS=False
+archivebox config --set PUBLIC_ADD_VIEW=False
+archivebox manage createsuperuser
+
-Unlike crawler software that starts from a seed URL and works outwards, or public tools like Archive.org designed for users to manually submit links from the public internet, ArchiveBox tries to be a set-and-forget archiver suitable for archiving your entire browsing history, RSS feeds, or bookmarks, including private/authenticated content that you wouldn't otherwise share with a centralized service. Also by having each user store their own content locally, we can save much larger portions of everyone's browsing history than a shared centralized service would be able to handle. +
+

CAUTION: Assume anyone viewing your archives will be able to see any cookies, session tokens, or private URLs passed to ArchiveBox during archiving. +Make sure to secure your ArchiveBox data and don't share snapshots with others without stripping out sensitive headers and content first.

+
-#### Storage Requirements +

Learn More

-Because ArchiveBox is designed to ingest a firehose of browser history and bookmark feeds to a local disk, it can be much more disk-space intensive than a centralized service like the Internet Archive or Archive.today. However, as storage space gets cheaper and compression improves, you should be able to use it continuously over the years without having to delete anything. In my experience, ArchiveBox uses about 5gb per 1000 articles, but your milage may vary depending on which options you have enabled and what types of sites you're archiving. By default, it archives everything in as many formats as possible, meaning it takes more space than a using a single method, but more content is accurately replayable over extended periods of time. Storage requirements can be reduced by using a compressed/deduplicated filesystem like ZFS/BTRFS, or by setting `FETCH_MEDIA=False` to skip audio & video files. + -## Learn more +
+
- -Whether you want learn which organizations are the big players in the web archiving space, want to find a specific open source tool for your web archiving need, or just want to see where archivists hang out online, our Community Wiki page serves as an index of the broader web archiving community. Check it out to learn about some of the coolest web archiving projects and communities on the web! +### Security Risks of Viewing Archived JS - +Be aware that malicious archived JS can access the contents of other pages in your archive when viewed. Because the Web UI serves all viewed snapshots from a single domain, they share a request context and **typical CSRF/CORS/XSS/CSP protections do not work to prevent cross-site request attacks**. See the [Security Overview](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#stealth-mode) page and [Issue #239](https://github.com/ArchiveBox/ArchiveBox/issues/239) for more details. - - [Community Wiki](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community) - + [The Master Lists](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#The-Master-Lists) - *Community-maintained indexes of archiving tools and institutions.* - + [Web Archiving Software](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects) - *Open source tools and projects in the internet archiving space.* - + [Reading List](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Reading-List) - *Articles, posts, and blogs relevant to ArchiveBox and web archiving in general.* - + [Communities](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Communities) - *A collection of the most active internet archiving communities and initiatives.* - - Check out the ArchiveBox [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap) and [Changelog](https://github.com/pirate/ArchiveBox/wiki/Changelog) - - Learn why archiving the internet is important by reading the "[On the 
Importance of Web Archiving](https://parameters.ssrc.org/2018/09/on-the-importance-of-web-archiving/)" blog post. - - Or reach out to me for questions and comments via [@theSquashSH](https://twitter.com/thesquashSH) on Twitter. - ---- - -# Documentation - +
+
+Expand to see risks and mitigations... -We use the [Github wiki system](https://github.com/pirate/ArchiveBox/wiki) for documentation. -You can also access the docs locally by looking in the [`ArchiveBox/docs/`](https://github.com/pirate/ArchiveBox/wiki/Home) folder. +
# visiting an archived page with malicious JS:
+https://127.0.0.1:8000/archive/1602401954/example.com/index.html
 
-## Getting Started
+# example.com/index.js can now make a request to read everything from:
+https://127.0.0.1:8000/index.html
+https://127.0.0.1:8000/archive/*
+# then example.com/index.js can send it off to some evil server
+
- - [Quickstart](https://github.com/pirate/ArchiveBox/wiki/Quickstart) - - [Install](https://github.com/pirate/ArchiveBox/wiki/Install) - - [Docker](https://github.com/pirate/ArchiveBox/wiki/Docker) +
+

NOTE: Only the wget & dom extractor methods execute archived JS when viewing snapshots; all other archive methods produce static output that does not execute JS on viewing.
+If you are worried about these issues ^ you should disable these extractors using:
archivebox config --set SAVE_WGET=False SAVE_DOM=False.

+
-## Reference +

Learn More

+ - - [Usage](https://github.com/pirate/ArchiveBox/wiki/Usage) - - [Configuration](https://github.com/pirate/ArchiveBox/wiki/Configuration) - - [Supported Sources](https://github.com/pirate/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) - - [Supported Outputs](https://github.com/pirate/ArchiveBox/wiki#can-save-these-things-for-each-site) - - [Scheduled Archiving](https://github.com/pirate/ArchiveBox/wiki/Scheduled-Archiving) - - [Publishing Your Archive](https://github.com/pirate/ArchiveBox/wiki/Publishing-Your-Archive) - - [Chromium Install](https://github.com/pirate/ArchiveBox/wiki/Install-Chromium) - - [Security Overview](https://github.com/pirate/ArchiveBox/wiki/Security-Overview) - - [Troubleshooting](https://github.com/pirate/ArchiveBox/wiki/Troubleshooting) +
+
-## More Info - - [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap) - - [Changelog](https://github.com/pirate/ArchiveBox/wiki/Changelog) - - [Donations](https://github.com/pirate/ArchiveBox/wiki/Donations) - - [Background & Motivation](https://github.com/pirate/ArchiveBox#background--motivation) - - [Web Archiving Community](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community) +### Working Around Sites that Block Archiving + +For various reasons, many large sites (Reddit, Twitter, Cloudflare, etc.) actively block archiving or bots in general. There are a number of approaches to work around this, and we also provide consulting services to help here. + +
+
+Click to learn how to set up user agents, cookies, and site logins... +
+ + + + +In the future we plan on adding support for running JS scripts during archiving to block ads, cookie popups, modals, and fix other issues. Follow here for progress: Issue #51. + +
+
+ + +### Saving Multiple Snapshots of a Single URL + +ArchiveBox appends a hash with the current date `https://example.com#2020-10-24` to differentiate when a single URL is archived multiple times. + + +
+
+Click to learn how the Re-Snapshot feature works... +
+ + +Because ArchiveBox uniquely identifies snapshots by URL, it must use a workaround to take multiple snapshots of the same URL (otherwise they would show up as a single Snapshot entry). It makes the URLs of repeated snapshots unique by adding a hash with the archive date at the end: + +
archivebox add 'https://example.com#2020-10-24'
+...
+archivebox add 'https://example.com#2020-10-25'
+
+ +The Re-Snapshot button in the Admin UI is a shortcut for this hash-date multi-snapshotting workaround. + +Improved support for saving multiple snapshots of a single URL without this hash-date workaround will be added eventually (along with the ability to view diffs of the changes between runs). + +

Learn More

+ + + +
+
+ +### Storage Requirements + +Because ArchiveBox is designed to ingest a large volume of URLs with multiple copies of each URL stored by different 3rd-party tools, it can be quite disk-space intensive. There are also some special requirements when using filesystems like NFS/SMB/FUSE. + +
+
+Click to learn more about ArchiveBox's filesystem and hosting requirements... +
+ +
    +
  • ArchiveBox can use anywhere from ~1gb per 1000 Snapshots, to ~50gb per 1000 Snapshots, mostly dependent on whether you're saving video/audio using YTDLP_ENABLED=True and whether you lower YTDLP_MAX_SIZE=750m.
  • +
  • Disk usage can be reduced by using a compressed/deduplicated filesystem like ZFS/BTRFS, or by turning off extractor methods you don't need. You can also deduplicate content with a tool like fdupes or rdfind. +
  • +
  • Don't store large collections on older filesystems like EXT3/FAT as they may not be able to handle more than 50k directory entries in the data/archive/ folder. +
  • +
  • Try to keep the data/index.sqlite3 file on a local drive (not a network mount) or SSD for maximum performance, however the data/archive/ folder can be on a network mount or slower HDD.
  • +
  • If using Docker or NFS/SMB/FUSE for the data/archive/ folder, make sure the mounted data directory is writable by its intended owner and consider disabling root_squash on your fileshare server. +
  • +
+ +

Learn More

+ + + + +
+
+ --- -# Screenshots +
+ + +## Screenshots + +
+ + + + + + + + + + + + + + + + +
+brew install archivebox
+archivebox version +
+archivebox init
+
+archivebox add + +archivebox data dir +
+archivebox server + +archivebox server add + +archivebox server list + +archivebox server detail +
+
+
-
-CLI Screenshot -Desktop index screenshot -Desktop details page Screenshot -Mobile details page screenshot +
+
+paisley graphic
---- -
-

- +# Background & Motivation + +ArchiveBox aims to enable more of the internet to be saved from deterioration by empowering people to self-host their own archives. The intent is for all the web content you care about to be viewable with common software in 50 - 100 years without needing to run ArchiveBox or other specialized software to replay it. + + +
+
+Click to read more about why archiving is important and how to do it ethically...
-This project is maintained mostly in my spare time with the help from generous contributors. -

-Contributor Spotlight:

- - - - - - - + +Vast treasure troves of knowledge are lost every day on the internet to link rot. As a society, we have an imperative to preserve some important parts of that treasure, just like we preserve our books, paintings, and music in physical libraries long after the originals go out of print or fade into obscurity. + +Whether it's to resist censorship by saving news articles before they get taken down or edited, or just to save a collection of early 2010's flash games you loved to play, having the tools to archive internet content enables you to save the stuff you care most about before it disappears. + +
+
+Image from Perma.cc...
+
+ +The balance between the permanence and ephemeral nature of content on the internet is part of what makes it beautiful. I don't think everything should be preserved in an automated fashion--making all content permanent and never removable, but I do think people should be able to decide for themselves and effectively archive specific content that they care about, just like libraries do. Without the work of archivists saving physical books, manuscripts, and paintings we wouldn't have any knowledge of our ancestors' history. I believe archiving the web is just as important to provide the same benefit to future generations. + +ArchiveBox's stance is that duplication of other people's content is only ethical if it: + +- A. doesn't deprive the original creators of revenue and +- B. is responsibly curated by an individual/institution. + +In the U.S., libraries, researchers, and archivists are allowed to duplicate copyrighted materials under "fair use" for private study, scholarship, or research. Archive.org's non-profit preservation work is covered under fair use in the US, and they properly handle unethical content/DMCA/GDPR removal requests to maintain good standing in the eyes of the law. + +As long as you A. don't try to profit off pirating copyrighted content and B. have processes in place to respond to removal requests, many countries allow you to use software like ArchiveBox to ethically and responsibly archive any web content you can view. That being said, ArchiveBox is not liable for how you choose to operate the software. You must research your own local laws and regulations, and get proper legal counsel if you plan to host a public instance (start by putting your DMCA/GDPR contact info in FOOTER_INFO and changing your instance's branding using CUSTOM_TEMPLATES_DIR). + +

- + + +## Comparison to Other Projects + +comparison + + +> **Check out our [community wiki](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) for a list of alternative web archiving tools and orgs.** + +ArchiveBox gained momentum in the internet archiving industry because it uniquely combines 3 things: + +- **it's distributed:** users own their data instead of entrusting it to one big central provider +- **it's future-proof:** saving in *multiple formats* and extracting out raw TXT, PNG, PDF, MP4, etc. files +- **it's extensible:** with powerful APIs, flexible storage, and a big community adding new extractors regularly +
+
+Expand for a more direct comparison to Archive.org and specific open-source alternatives...
+ +ArchiveBox tries to be a robust, set-and-forget archiving solution suitable for archiving RSS feeds, bookmarks, or your entire browsing history (beware, it may be too big to store), including private/authenticated content that you wouldn't otherwise share with a centralized service like Archive.org. + +

Comparison With Centralized Public Archives

+ +Not all content is suitable to be archived on a centralized, publicly accessible platform. Archive.org doesn't offer the ability to save things behind login walls for good reason, as the content may not have been intended for a public audience. ArchiveBox exists to fill that gap by letting everyone save what they have access to on an individual basis, and to encourage decentralized archiving that's less susceptible to censorship or natural disasters. + +By having users store their content locally or within their organizations, we can also save much larger portions of the internet than a centralized service has the disk capacity to handle. The eventual goal is to work towards federated archiving where users can share portions of their collections with each other, and with central archives on a case-by-case basis. + +

Comparison With Other Self-Hosted Archiving Options

+ +ArchiveBox differentiates itself from [similar self-hosted projects](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects) by providing both a comprehensive CLI interface for managing your archive, a Web UI that can be used either independently or together with the CLI, and a simple on-disk data format that can be used without either. + + +*If you want better fidelity for very complex interactive pages with heavy JS/streams/API requests, check out [ArchiveWeb.page](https://archiveweb.page) and [ReplayWeb.page](https://replayweb.page).* + +*If you want more bookmark categorization and note-taking features, check out [Memex](https://github.com/WorldBrain/Memex), [Hoarder](https://github.com/hoarder-app/hoarder), [LinkWarden](https://github.com/linkwarden/linkwarden), [Archivy](https://archivy.github.io/), or [LinkAce](https://www.linkace.org/).* + +*If you need more advanced recursive spider/crawling ability beyond `--depth=1`, check out [Browsertrix](https://github.com/webrecorder/browsertrix-crawler), [Photon](https://github.com/s0md3v/Photon), or [Scrapy](https://scrapy.org/) and pipe the outputted URLs into ArchiveBox.* + +For more alternatives, see our [list here](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects)... + +ArchiveBox is neither the highest fidelity nor the simplest tool available for self-hosted archiving, rather it's a jack-of-all-trades that tries to do most things well by default. We encourage you to try these other tools made by our friends if ArchiveBox isn't suited to your needs. + +
+
- - - -

+ + +## Internet Archiving Ecosystem + + + +
+Our Community Wiki strives to be a comprehensive index of the web archiving industry... +
+ +- [Community Wiki](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) + - [Web Archiving Software](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#web-archiving-projects) + _List of ArchiveBox alternatives and open source projects in the internet archiving space._ + - [Awesome-Web-Archiving Lists](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#the-master-lists) + _Community-maintained indexes of archiving tools and institutions like `iipc/awesome-web-archiving`._ + - [Reading List](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#reading-list) + _Articles, posts, and blogs relevant to ArchiveBox and web archiving in general._ + - [Communities](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#communities) + _A collection of the most active internet archiving communities and initiatives._ +- Check out the ArchiveBox [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) and [Changelog](https://github.com/ArchiveBox/ArchiveBox/wiki/Changelog) +- Learn why archiving the internet is important by reading the "[On the Importance of Web Archiving](https://items.ssrc.org/parameters/on-the-importance-of-web-archiving/)" blog post. +- Reach out to me for questions and comments via [@ArchiveBoxApp](https://twitter.com/ArchiveBoxApp) or [@theSquashSH](https://twitter.com/thesquashSH) on Twitter + +
+ +
+ +**Need help building a custom archiving solution?** + +> ✨ **[Hire the team that built ArchiveBox](https://zulip.archivebox.io/#narrow/stream/167-enterprise/topic/welcome/near/1191102) to solve archiving for your org.** ([@ArchiveBoxApp](https://twitter.com/ArchiveBoxApp)) + +
+ + +
+documentation graphic +
+ +# Documentation + + + +We use the [ArchiveBox GitHub Wiki](https://github.com/ArchiveBox/ArchiveBox/wiki) for documentation. +There is also a mirror available on Read the Docs (though it's sometimes outdated). + +> ✏️ You can submit docs changes & suggestions in our dedicated repo [`ArchiveBox/docs`](https://github.com/ArchiveBox/docs). + +## Getting Started + +- [Quickstart](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart) +- [Install](https://github.com/ArchiveBox/ArchiveBox/wiki/Install) +- [Docker](https://github.com/ArchiveBox/ArchiveBox/wiki/Docker) +- [Usage](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage) +- [Configuration](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) +- [Supported Sources](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) +- [Supported Outputs](https://github.com/ArchiveBox/ArchiveBox/wiki#can-save-these-things-for-each-site) +- [Scheduled Archiving](https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving) + +## Advanced + +- [Security Overview](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview) +- [Cookies & Sessions Setup](https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install#setting-up-a-chromium-user-profile) (archiving sites that require logins) +- [Setting up the Search Backends](https://github.com/ArchiveBox/ArchiveBox/wiki/Setting-up-Search) (choosing ripgrep, Sonic, or FTS5) +- [Setting up Local/Remote Storages](https://github.com/ArchiveBox/ArchiveBox/wiki/Setting-up-Storage) (S3/B2/Google Drive/SMB/NFS/etc.) +- [Setting up Authentication & Permissions](https://github.com/ArchiveBox/ArchiveBox/wiki/Setting-up-Authentication) (SSO/LDAP/OAuth/API Keys/etc.) 
+- [Publishing Your Archive](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive) (sharing your archive server with others) +- [Chromium Install Options](https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install) (installing and configuring ArchiveBox's Chrome) +- [Upgrading or Merging Archives](https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives) +- [Troubleshooting](https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting) + +## Developers + +- [Developer Documentation](https://github.com/ArchiveBox/ArchiveBox#archivebox-development) +- [Python API](https://docs.archivebox.io/) +- [REST API](https://demo.archivebox.io/api) (alpha) + +## More Info + +- [Bug Tracker](https://github.com/ArchiveBox/ArchiveBox/issues) +- [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) +- [Changelog](https://github.com/ArchiveBox/ArchiveBox/releases) +- [Donations](https://github.com/ArchiveBox/ArchiveBox/wiki/Donations) +- [Background & Motivation](https://github.com/ArchiveBox/ArchiveBox#background--motivation) +- [Web Archiving Community](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) + +
+ +--- + +
+development +
+ +# ArchiveBox Development + +All contributions to ArchiveBox are welcomed! Check our [issues](https://github.com/ArchiveBox/ArchiveBox/issues) and [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) for things to work on, and please open an issue to discuss your proposed implementation before working on things! Otherwise we may have to close your PR if it doesn't align with our roadmap. + +For low hanging fruit / easy first tickets, see: ArchiveBox/Issues `#good first ticket` `#help wanted`. + +**Python API Documentation:** https://docs.archivebox.io/en/dev/archivebox.html#module-archivebox.main + +**Internal Architecture Diagrams:** https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams + + +### Setup the dev environment + +
Click to expand... + +#### 1. Setup the monorepo + +First make sure you have `uv` installed: https://docs.astral.sh/uv/getting-started/installation/ + + +```bash +git clone https://github.com/ArchiveBox/monorepo +cd monorepo +./bin/setup.sh + +# activate the monorepo venv, then cd into archivebox +source .venv/bin/activate +cd archivebox +``` + +Repos included in monorepo setup: + +- https://github.com/ArchiveBox/abxbus +- https://github.com/ArchiveBox/abxpkg +- https://github.com/ArchiveBox/abx-plugins +- https://github.com/ArchiveBox/abx-dl +- https://github.com/ArchiveBox/ArchiveBox + + +#### 2. Option A: Install the Python, JS, and system dependencies directly on your machine + + +```bash +# Install ArchiveBox runtime dependencies +mkdir -p data && cd data +archivebox init +archivebox install # detect and install all extractor dependencies + +# Run the development server w/ autoreloading (but no bg workers) +archivebox server --debug --reload 0.0.0.0:8000 + +# Run the production server (with bg workers but no autoreloading) +archivebox server 0.0.0.0:8000 +``` + +#### 2. Option B: Build the docker container and use that for development instead + + +```bash +# Optional: develop via docker by mounting the code dir into the container +# if you edit e.g. 
./archivebox/core/models.py on the docker host, runserver +# inside the container will reload and pick up your changes +./bin/build_docker.sh dev + +docker run -it -v $PWD/data:/data archivebox/archivebox:dev init --install + +# Run the development server w/ autoreloading (but no bg workers) +docker run -it -v $PWD/data:/data -v $PWD/archivebox:/app/archivebox -p 8000:8000 archivebox/archivebox:dev server --debug --reload 0.0.0.0:8000 + +# Run the production server (with bg workers but no autoreloading) +docker run -it -v $PWD/data:/data -v $PWD/archivebox:/app/archivebox -p 8000:8000 archivebox/archivebox:dev server + +# (remove the --reload flag and add the --nothreading flag when profiling with the django debug toolbar) +# When using --reload, make sure any files you create can be read by the user in the Docker container, eg with 'chmod a+rX'. +``` + +
+ +### Common development tasks + +See the `./bin/` folder and read the source of the bash scripts within. +You can also run all these in Docker. For more examples see the GitHub Actions CI/CD tests that are run: `.github/workflows/*.yaml`. + +#### Run in DEBUG mode + +
Click to expand... + + +```bash +# set up persistent DEBUG=True for all runs +archivebox config --set DEBUG=True + +# OR you can run a dev server with DEBUG=True in a few ways: +archivebox server --debug --reload 0.0.0.0:8000 +# or +archivebox server --debug 0.0.0.0:8000 +# or +env DEBUG=True daphne -b 0.0.0.0 -p 8000 archivebox.core.asgi:application +``` + +https://stackoverflow.com/questions/1074212/how-can-i-see-the-raw-sql-queries-django-is-running + +
+ +#### Install and run a specific GitHub branch + +
Click to expand... + +##### Use a Pre-Built Image + +If you're looking for the latest `dev` Docker image, it's often available pre-built on Docker Hub; simply pull and use `archivebox/archivebox:dev`. + + +```bash +docker pull archivebox/archivebox:dev +docker run archivebox/archivebox:dev version +# verify the BUILD_TIME and COMMIT_HASH in the output are recent +``` + +##### Build Branch from Source + +You can also build and run any branch yourself from source, for example to build & use `dev` locally: + + +```bash +# docker-compose.yml: +services: + archivebox: + image: archivebox/archivebox:dev + build: 'https://github.com/ArchiveBox/ArchiveBox.git#dev' + ... + +# or with plain Docker: +docker build -t archivebox:dev https://github.com/ArchiveBox/ArchiveBox.git#dev +docker run -it -v $PWD:/data archivebox:dev init + +# or with pip: +pip install 'git+https://github.com/ArchiveBox/ArchiveBox@dev' +npm install 'git+https://github.com/ArchiveBox/ArchiveBox.git#dev' +archivebox install +``` + +
+ +#### Run the linters / tests + +
Click to expand... + + +```bash +./bin/lint.sh +./bin/test.sh +``` +(uses `ruff`, `pyright`, `ty`, and `pytest -s`) + +
+ + +#### Make DB migrations, enter Django shell, other dev helper commands + +
Click to expand... + + +```bash +# generate the database migrations after changes to models.py +cd archivebox/ +./manage.py makemigrations + +# enter a python shell or a SQL shell +cd path/to/test/data/ +archivebox shell +archivebox manage dbshell + +# generate a graph of the ORM models +brew install graphviz +pip install pydot graphviz +archivebox manage graph_models -a -o orm.png +open orm.png + +# list all models with field db info and methods +archivebox manage list_model_info --all --signature --db-type --field-class + +# print all django settings +archivebox manage print_settings +archivebox manage print_settings --format=yaml # pip install pyyaml + +# autogenerate an admin.py from given app models +archivebox manage admin_generator core > core/admin.py + +# dump db data to a script that re-populates it +archivebox manage dumpscript core > scripts/testdata.py +archivebox manage reset core +archivebox manage runscript testdata + +# resetdb and clear all data! +archivebox manage reset_db + +# use django-tui to interactively explore commands +uv pip install django-tui +# ensure django-tui is in INSTALLED_APPS: core/settings.py +archivebox manage tui +``` + +ArchiveBox ORM models relationship graph + +- https://django-extensions.readthedocs.io/en/latest/command_extensions.html +- https://stackoverflow.com/questions/1074212/how-can-i-see-the-raw-sql-queries-django-is-running +- https://github.com/anze3db/django-tui (explore `manage.py` commands as TUI) +- https://github.com/bloomberg/memray (advanced python profiler) +- https://github.com/laixintao/flameshow (display flamegraphs in terminal) +- https://github.com/taliraj/django-migrations-tui (explore migrations as TUI) + +
+ +#### Contributing a new extractor + + +Extractors are maintained in a separate repo here: https://github.com/ArchiveBox/abx-plugins (included in monorepo setup). + +Copy a similar plugin as a template to modify, then open a new PR to add it in that repo. + + +#### Build the docs, pip package, and docker image + +
Click to expand... + +(Normally CI takes care of this, but these scripts can be run to do it manually) + +```bash +./bin/build.sh + +# or individually: +./bin/build_docs.sh +./bin/build_pip.sh +./bin/build_docker.sh +``` + +
+ +#### Roll a release + +
Click to expand... + +(Normally CI takes care of this, but these scripts can be run to do it manually) + +```bash +./bin/release.sh + +# or individually: +./bin/release_docs.sh +./bin/release_pip.sh +./bin/release_docker.sh +``` + +
+ +--- + +## Further Reading + + + +- [ArchiveBox.io Website](https://archivebox.io) / [ArchiveBox Github (Source Code)](https://github.com/ArchiveBox/ArchiveBox) / [ArchiveBox Demo Server](https://demo.archivebox.io) +- [Documentation (Github Wiki)](https://github.com/ArchiveBox/ArchiveBox/wiki) / [API Reference Docs (ReadTheDocs)](https://docs.archivebox.io) / [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) / [Changelog](https://github.com/ArchiveBox/ArchiveBox/releases) +- [Bug Tracker (Github Issues)](https://github.com/ArchiveBox/ArchiveBox/issues) / [Discussions (Github Discussions)](https://github.com/ArchiveBox/ArchiveBox/discussions) / [Community Chat Forum (Zulip)](https://zulip.archivebox.io) +- Find us on social media: [Twitter `@ArchiveBoxApp`](https://twitter.com/ArchiveBoxApp), [LinkedIn](https://www.linkedin.com/company/archivebox/), [YouTube](https://www.youtube.com/@ArchiveBoxApp), [SaaSHub](https://www.saashub.com/archivebox), [Alternative.to](https://alternativeto.net/software/archivebox/about/), [Reddit](https://www.reddit.com/r/ArchiveBox/) + +--- + +
+
+🏛️ Contact us for professional support 💬
+

+   +   +   +   +   + +
+ArchiveBox operates as a US 501(c)(3) nonprofit FSP (sponsored by HCB), direct donations are tax-deductible. +

+  +  +

+
+✨ Have spare CPU/disk/bandwidth after all your 网站存档爬 and want to help the world?
Check out our Good Karma Kit...
diff --git a/_config.yml b/_config.yml deleted file mode 100644 index c50ff38dab..0000000000 --- a/_config.yml +++ /dev/null @@ -1 +0,0 @@ -theme: jekyll-theme-merlot \ No newline at end of file diff --git a/archive b/archive deleted file mode 120000 index 041799a6f4..0000000000 --- a/archive +++ /dev/null @@ -1 +0,0 @@ -bin/archivebox \ No newline at end of file diff --git a/archivebox/.flake8 b/archivebox/.flake8 new file mode 100644 index 0000000000..bb7176bd1f --- /dev/null +++ b/archivebox/.flake8 @@ -0,0 +1,6 @@ +[flake8] +ignore = D100,D101,D102,D103,D104,D105,D202,D203,D205,D400,E131,E241,E252,E266,E272,E701,E731,W293,W503,W291,W391 +select = F,E9,W +max-line-length = 130 +max-complexity = 10 +exclude = migrations,tests,node_modules,vendor,venv,.venv,.venv2,.docker-venv,data,data* diff --git a/archivebox/README.md b/archivebox/README.md new file mode 120000 index 0000000000..32d46ee883 --- /dev/null +++ b/archivebox/README.md @@ -0,0 +1 @@ +../README.md \ No newline at end of file diff --git a/archivebox/__init__.py b/archivebox/__init__.py old mode 100644 new mode 100755 index e69de29bb2..d8d691093f --- a/archivebox/__init__.py +++ b/archivebox/__init__.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python3 + +# Welcome to the ArchiveBox source code! Thanks for checking it out! +# +# "We are swimming upstream against a great torrent of disorganization. +# In this, our main obligation is to establish arbitrary enclaves of order and system. +# It is the greatest possible victory to be, to continue to be, and to have been. +# No defeat can deprive us of the success of having existed for some moment of time +# in a universe that seems indifferent to us." +# --Norbert Wiener + +__package__ = "archivebox" + +import os +import sys +from pathlib import Path +from typing import Protocol, cast + + +class _ReconfigurableStream(Protocol): + def reconfigure(self, *, line_buffering: bool) -> object: ... 
+ + +# Force unbuffered output for real-time logs +if hasattr(sys.stdout, "reconfigure"): + cast(_ReconfigurableStream, sys.stdout).reconfigure(line_buffering=True) + cast(_ReconfigurableStream, sys.stderr).reconfigure(line_buffering=True) +os.environ["PYTHONUNBUFFERED"] = "1" + +ASCII_LOGO = """ + โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ•— โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ•— โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ•—โ–ˆโ–ˆโ•— โ–ˆโ–ˆโ•—โ–ˆโ–ˆโ•—โ–ˆโ–ˆโ•— โ–ˆโ–ˆโ•—โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ•— โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ•— โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ•— โ–ˆโ–ˆโ•— โ–ˆโ–ˆโ•— +โ–ˆโ–ˆโ•”โ•โ•โ–ˆโ–ˆโ•—โ–ˆโ–ˆโ•”โ•โ•โ–ˆโ–ˆโ•—โ–ˆโ–ˆโ•”โ•โ•โ•โ•โ•โ–ˆโ–ˆโ•‘ โ–ˆโ–ˆโ•‘โ–ˆโ–ˆโ•‘โ–ˆโ–ˆโ•‘ โ–ˆโ–ˆโ•‘โ–ˆโ–ˆโ•”โ•โ•โ•โ•โ• โ–ˆโ–ˆโ•”โ•โ•โ–ˆโ–ˆโ•—โ–ˆโ–ˆโ•”โ•โ•โ•โ–ˆโ–ˆโ•—โ•šโ–ˆโ–ˆโ•—โ–ˆโ–ˆโ•”โ• +โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ•‘โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ•”โ•โ–ˆโ–ˆโ•‘ โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ•‘โ–ˆโ–ˆโ•‘โ–ˆโ–ˆโ•‘ โ–ˆโ–ˆโ•‘โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ•— โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ•”โ•โ–ˆโ–ˆโ•‘ โ–ˆโ–ˆโ•‘ โ•šโ–ˆโ–ˆโ–ˆโ•”โ• +โ–ˆโ–ˆโ•”โ•โ•โ–ˆโ–ˆโ•‘โ–ˆโ–ˆโ•”โ•โ•โ–ˆโ–ˆโ•—โ–ˆโ–ˆโ•‘ โ–ˆโ–ˆโ•”โ•โ•โ–ˆโ–ˆโ•‘โ–ˆโ–ˆโ•‘โ•šโ–ˆโ–ˆโ•— โ–ˆโ–ˆโ•”โ•โ–ˆโ–ˆโ•”โ•โ•โ• โ–ˆโ–ˆโ•”โ•โ•โ–ˆโ–ˆโ•—โ–ˆโ–ˆโ•‘ โ–ˆโ–ˆโ•‘ โ–ˆโ–ˆโ•”โ–ˆโ–ˆโ•— +โ–ˆโ–ˆโ•‘ โ–ˆโ–ˆโ•‘โ–ˆโ–ˆโ•‘ โ–ˆโ–ˆโ•‘โ•šโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ•—โ–ˆโ–ˆโ•‘ โ–ˆโ–ˆโ•‘โ–ˆโ–ˆโ•‘ โ•šโ–ˆโ–ˆโ–ˆโ–ˆโ•”โ• โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ•— โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ•”โ•โ•šโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ•”โ•โ–ˆโ–ˆโ•”โ• โ–ˆโ–ˆโ•— +โ•šโ•โ• โ•šโ•โ•โ•šโ•โ• โ•šโ•โ• โ•šโ•โ•โ•โ•โ•โ•โ•šโ•โ• โ•šโ•โ•โ•šโ•โ• โ•šโ•โ•โ•โ• โ•šโ•โ•โ•โ•โ•โ•โ• โ•šโ•โ•โ•โ•โ•โ• โ•šโ•โ•โ•โ•โ•โ• โ•šโ•โ• โ•šโ•โ• +""" + +PACKAGE_DIR = Path(__file__).resolve().parent + +# # Add PACKAGE_DIR to sys.path - required for Django migrations to import models +# # Migrations reference models like 'machine.Binary' which need to be importable +# if str(PACKAGE_DIR) not in sys.path: +# sys.path.append(str(PACKAGE_DIR)) + +os.environ["DJANGO_SETTINGS_MODULE"] = "archivebox.core.settings" +os.environ["TZ"] = "UTC" + +# detect ArchiveBox user's UID/GID based on data dir ownership +from 
.config.permissions import drop_privileges # noqa + +drop_privileges() + +from .misc.checks import check_not_root, check_not_inside_source_dir, check_io_encoding # noqa + +check_not_root() +check_not_inside_source_dir() +check_io_encoding() + +from .config.version import VERSION # noqa + + +__version__ = VERSION +__author__ = "ArchiveBox" +__license__ = "MIT" + + +def __getattr__(name: str): + if name == "CONSTANTS": + from .config.constants import CONSTANTS + + os.environ.setdefault("MACHINE_ID", CONSTANTS.MACHINE_ID) + return CONSTANTS + if name == "DATA_DIR": + from .config.paths import DATA_DIR + + return DATA_DIR + if name == "VERSION": + return VERSION + if name in ("BUILTIN_PLUGINS_DIR", "USER_PLUGINS_DIR", "ALL_PLUGINS", "LOADED_PLUGINS"): + from abx_plugins import get_plugins_dir + from .config.constants import CONSTANTS + + builtin_plugins_dir = Path(get_plugins_dir()).resolve() + user_plugins_dir = CONSTANTS.USER_PLUGINS_DIR + plugins = { + "builtin": builtin_plugins_dir, + "user": user_plugins_dir, + } + values = { + "BUILTIN_PLUGINS_DIR": builtin_plugins_dir, + "USER_PLUGINS_DIR": user_plugins_dir, + "ALL_PLUGINS": plugins, + "LOADED_PLUGINS": plugins, + } + return values[name] + raise AttributeError(name) + + +__all__ = ( + "ASCII_LOGO", + "ASCII_ICON", + "PACKAGE_DIR", + "DATA_DIR", + "CONSTANTS", + "VERSION", + "BUILTIN_PLUGINS_DIR", + "USER_PLUGINS_DIR", + "ALL_PLUGINS", + "LOADED_PLUGINS", +) + +ASCII_ICON = """ +โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ 
+โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ +โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ +โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ +โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ +โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ +โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ + โ–ˆโ–ˆ โ–ˆโ–ˆ + โ–ˆโ–ˆ โ–ˆโ–ˆ + โ–ˆโ–ˆ โ–ˆโ–ˆ + โ–ˆโ–ˆ โ–ˆโ–ˆ + โ–ˆโ–ˆ โ–ˆโ–ˆ + โ–ˆโ–ˆ โ–ˆโ–ˆ + โ–ˆโ–ˆ โ–ˆโ–ˆ + โ–ˆโ–ˆ โ–ˆโ–ˆ + โ–ˆโ–ˆ โ–ˆโ–ˆ + โ–ˆโ–ˆ โ–ˆโ–ˆ + โ–ˆโ–ˆ โ–ˆโ–ˆ + โ–ˆโ–ˆ โ–ˆโ–ˆ + โ–ˆโ–ˆ 
โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ โ–ˆโ–ˆ + โ–ˆโ–ˆ โ–ˆโ–ˆ โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ โ–ˆ โ–ˆโ–ˆ + โ–ˆโ–ˆ โ–ˆโ–ˆ โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ โ–ˆ โ–ˆโ–ˆ + โ–ˆโ–ˆ โ–ˆโ–ˆ โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ โ–ˆ โ–ˆโ–ˆ + โ–ˆโ–ˆ โ–ˆโ–ˆ โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ โ–ˆ โ–ˆโ–ˆ + โ–ˆโ–ˆ โ–ˆโ–ˆ โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ โ–ˆ โ–ˆโ–ˆ + โ–ˆโ–ˆ โ–ˆโ–ˆ โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ โ–ˆ โ–ˆโ–ˆ + โ–ˆโ–ˆ โ–ˆโ–ˆ โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ โ–ˆ โ–ˆโ–ˆ + โ–ˆโ–ˆ โ–ˆโ–ˆ โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ โ–ˆ โ–ˆโ–ˆ + โ–ˆโ–ˆ โ–ˆโ–ˆ โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ โ–ˆ โ–ˆโ–ˆ + โ–ˆโ–ˆ โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ โ–ˆโ–ˆ + โ–ˆโ–ˆ โ–ˆโ–ˆ + โ–ˆโ–ˆ โ–ˆโ–ˆ + โ–ˆโ–ˆ โ–ˆโ–ˆ + โ–ˆโ–ˆ โ–ˆโ–ˆ + โ–ˆโ–ˆ โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ โ–ˆโ–ˆ + โ–ˆโ–ˆ โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ โ–ˆโ–ˆ + โ–ˆโ–ˆ โ–ˆโ–ˆ + โ–ˆโ–ˆ โ–ˆโ–ˆ + โ–ˆโ–ˆ โ–ˆโ–ˆ + โ–ˆโ–ˆ โ–ˆโ–ˆ + โ–ˆโ–ˆ โ–ˆโ–ˆ + โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ +""" diff --git a/archivebox/__main__.py b/archivebox/__main__.py new file mode 100755 index 0000000000..fe4b74f445 --- /dev/null +++ b/archivebox/__main__.py 
@@ -0,0 +1,20 @@ +#!/usr/bin/env python3 +"""This is the entrypoint for python -m archivebox ...""" + +__package__ = "archivebox" + +import archivebox # noqa # make sure monkey patches are applied before anything else +import sys + +from .cli import main + +ASCII_LOGO_MINI = r""" + _ _ _ ____ + / \ _ __ ___| |__ (_)_ _____| __ ) _____ __ + / _ \ | '__/ __| '_ \| \ \ / / _ \ _ \ / _ \ \/ / + / ___ \| | | (__| | | | |\ V / __/ |_) | (_) > < + /_/ \_\_| \___|_| |_|_| \_/ \___|____/ \___/_/\_\ +""" + +if __name__ == "__main__": + main(args=sys.argv[1:]) diff --git a/archivebox/api/__init__.py b/archivebox/api/__init__.py new file mode 100644 index 0000000000..24b3281374 --- /dev/null +++ b/archivebox/api/__init__.py @@ -0,0 +1 @@ +__package__ = "archivebox.api" diff --git a/archivebox/api/admin.py b/archivebox/api/admin.py new file mode 100644 index 0000000000..f54942c570 --- /dev/null +++ b/archivebox/api/admin.py @@ -0,0 +1,120 @@ +__package__ = "archivebox.api" + +from django import forms +from django.contrib import admin +from django.http import HttpRequest +from django.utils.text import capfirst +from signal_webhooks.admin import WebhookAdmin, WebhookModelForm +from signal_webhooks.settings import webhook_settings +from signal_webhooks.utils import get_webhook_model, model_from_reference + +from archivebox.base_models.admin import BaseModelAdmin + +from archivebox.api.models import APIToken + + +def _webhook_fields(*names: str) -> tuple[str, ...]: + model_fields = {field.name for field in get_webhook_model()._meta.fields} + return tuple(name for name in names if name in model_fields) + + +class APITokenAdmin(BaseModelAdmin): + list_display = ("created_at", "id", "created_by", "token_redacted", "expires") + sort_fields = ("id", "created_at", "created_by", "expires") + readonly_fields = ("created_at", "modified_at") + search_fields = ("id", "created_by__username", "token") + + fieldsets = ( + ( + "Token", + { + "fields": ("token", "expires"), + "classes": ("card",), 
+ }, + ), + ( + "Owner", + { + "fields": ("created_by",), + "classes": ("card",), + }, + ), + ( + "Timestamps", + { + "fields": ("created_at", "modified_at"), + "classes": ("card",), + }, + ), + ) + + list_filter = ("created_by",) + ordering = ["-created_at"] + list_per_page = 100 + + +class OutboundWebhookAdminForm(WebhookModelForm): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.fields["ref"] = forms.ChoiceField( + label=self.fields["ref"].label, + help_text=self.fields["ref"].help_text, + choices=[ + (ref, f"{capfirst(model_from_reference(ref, check_hooks=False)._meta.verbose_name_plural)} ({ref})") + for ref in sorted(webhook_settings.HOOKS) + ], + ) + + +class CustomWebhookAdmin(WebhookAdmin, BaseModelAdmin): + form = OutboundWebhookAdminForm + list_display = ("created_at", "created_by", "id", *WebhookAdmin.list_display) + sort_fields = _webhook_fields("created_at", "created_by", "id", "ref", "endpoint", "last_success", "last_failure") + readonly_fields = _webhook_fields("created_at", "modified_at", *WebhookAdmin.readonly_fields) + + fieldsets = ( + ( + "Webhook", + { + "fields": _webhook_fields("name", "signal", "ref", "endpoint", "headers", "keep_last_response"), + "classes": ("card", "wide"), + }, + ), + ( + "Authentication", + { + "fields": _webhook_fields("auth_token"), + "classes": ("card",), + }, + ), + ( + "Status", + { + "fields": _webhook_fields("enabled", "last_success", "last_failure", "last_response"), + "classes": ("card",), + }, + ), + ( + "Owner", + { + "fields": _webhook_fields("created_by"), + "classes": ("card",), + }, + ), + ( + "Timestamps", + { + "fields": _webhook_fields("created_at", "modified_at"), + "classes": ("card",), + }, + ), + ) + + def lookup_allowed(self, lookup: str, value: str, request: HttpRequest | None = None) -> bool: + """Preserve WebhookAdmin's auth token filter with Django's current admin signature.""" + return not lookup.startswith("auth_token") and 
admin.ModelAdmin.lookup_allowed(self, lookup, value, request) + + +def register_admin(admin_site: admin.AdminSite) -> None: + admin_site.register(APIToken, APITokenAdmin) + admin_site.register(get_webhook_model(), CustomWebhookAdmin) diff --git a/archivebox/api/apps.py b/archivebox/api/apps.py new file mode 100644 index 0000000000..94e2f6e558 --- /dev/null +++ b/archivebox/api/apps.py @@ -0,0 +1,14 @@ +__package__ = "archivebox.api" + +from django.apps import AppConfig + + +class APIConfig(AppConfig): + name = "archivebox.api" + label = "api" + + +def register_admin(admin_site): + from archivebox.api.admin import register_admin + + register_admin(admin_site) diff --git a/archivebox/api/auth.py b/archivebox/api/auth.py new file mode 100644 index 0000000000..70d387664a --- /dev/null +++ b/archivebox/api/auth.py @@ -0,0 +1,137 @@ +__package__ = "archivebox.api" + +from datetime import timedelta + +from django.utils import timezone +from django.http import HttpRequest +from django.contrib.auth import authenticate +from django.contrib.auth.models import User + +from ninja.security import HttpBearer, APIKeyQuery, APIKeyHeader +from ninja.errors import HttpError + + +def get_or_create_api_token(user: User | None): + from archivebox.api.models import APIToken + + if user and user.is_superuser: + api_tokens = APIToken.objects.filter(created_by_id=user.pk, expires__gt=timezone.now()) + if api_tokens.exists(): + # unexpired token exists, use it + api_token = api_tokens.last() + else: + # does not exist, create a new one + api_token = APIToken.objects.create(created_by_id=user.pk, expires=timezone.now() + timedelta(days=30)) + + if api_token is None: + return None + assert api_token.is_valid(), f"API token is not valid {api_token}" + + return api_token + return None + + +def auth_using_token(token: str | None, request: HttpRequest | None = None) -> User | None: + """Given an API token string, check if a corresponding non-expired APIToken exists, and return its user""" + from 
archivebox.api.models import APIToken # lazy import model to avoid loading it at urls.py import time + + user: User | None = None + + submitted_empty_form = str(token).strip() in ("string", "", "None", "null") + if not submitted_empty_form: + try: + api_token = APIToken.objects.get(token=token) + if api_token.is_valid() and isinstance(api_token.created_by, User): + user = api_token.created_by + if request is not None: + setattr(request, "_api_token", api_token) + except APIToken.DoesNotExist: + pass + + return user + + +def token_from_request(request: HttpRequest) -> str: + token = request.GET.get("api_key") or request.headers.get("X-ArchiveBox-API-Key") or "" + auth_header = request.headers.get("Authorization", "") + if not token and auth_header.lower().startswith("bearer "): + token = auth_header.split(None, 1)[1].strip() + return token + + +def authenticated_user_from_request(request: HttpRequest) -> User | None: + user = request.user + if user.is_authenticated and user.is_active: + return user + + token = token_from_request(request) + token_user = auth_using_token(token=token, request=request) if token else None + if token_user and token_user.is_active: + request.user = token_user + return token_user + return None + + +def auth_using_password(username: str | None, password: str | None, request: HttpRequest | None = None) -> User | None: + """Given a username and password, check if they are valid and return the corresponding user""" + user: User | None = None + + submitted_empty_form = (username, password) in (("string", "string"), ("", ""), (None, None)) + if not submitted_empty_form: + authenticated_user = authenticate( + username=username, + password=password, + ) + if isinstance(authenticated_user, User): + user = authenticated_user + return user + + +### Base Auth Types + + +def _require_superuser(user: User | None, request: HttpRequest, auth_method: str) -> User | None: + if user and user.pk: + request.user = user + setattr(request, "_api_auth_method", 
auth_method) + if not user.is_superuser: + raise HttpError(403, "Valid credentials but User does not have permission (make sure user.is_superuser=True)") + return user + + +### Django-Ninja-Provided Auth Methods + + +class HeaderTokenAuth(APIKeyHeader): + """Allow authenticating by passing X-ArchiveBox-API-Key: xyz as a request header""" + + param_name = "X-ArchiveBox-API-Key" + + def authenticate(self, request: HttpRequest, key: str | None) -> User | None: + return _require_superuser(auth_using_token(token=key, request=request), request, self.__class__.__name__) + + +class BearerTokenAuth(HttpBearer): + """Allow authenticating by passing Authorization: Bearer xyz as a request header""" + + def authenticate(self, request: HttpRequest, token: str) -> User | None: + return _require_superuser(auth_using_token(token=token, request=request), request, self.__class__.__name__) + + +class QueryParamTokenAuth(APIKeyQuery): + """Allow authenticating by passing api_key=xyz as a GET/POST query parameter""" + + param_name = "api_key" + + def authenticate(self, request: HttpRequest, key: str | None) -> User | None: + return _require_superuser(auth_using_token(token=key, request=request), request, self.__class__.__name__) + + +### Enabled Auth Methods + +API_AUTH_METHODS = [ + HeaderTokenAuth(), + BearerTokenAuth(), + QueryParamTokenAuth(), + # django_auth_superuser, # django admin cookie auth, not secure to use with csrf=False +] diff --git a/archivebox/api/middleware.py b/archivebox/api/middleware.py new file mode 100644 index 0000000000..8932762dae --- /dev/null +++ b/archivebox/api/middleware.py @@ -0,0 +1,32 @@ +__package__ = "archivebox.api" + +from django.http import HttpResponse + + +class ApiCorsMiddleware: + """Attach permissive CORS headers for API routes (token-based auth).""" + + def __init__(self, get_response): + self.get_response = get_response + + def __call__(self, request): + if request.path.startswith("/api/"): + if request.method == "OPTIONS" and 
request.META.get("HTTP_ACCESS_CONTROL_REQUEST_METHOD"): + response = HttpResponse(status=204) + return self._add_cors_headers(request, response) + + response = self.get_response(request) + return self._add_cors_headers(request, response) + + return self.get_response(request) + + def _add_cors_headers(self, request, response): + origin = request.META.get("HTTP_ORIGIN") + if not origin: + return response + + response["Access-Control-Allow-Origin"] = "*" + response["Access-Control-Allow-Methods"] = "GET, POST, PUT, PATCH, DELETE, OPTIONS" + response["Access-Control-Allow-Headers"] = "Authorization, X-ArchiveBox-API-Key, Content-Type, X-CSRFToken" + response["Access-Control-Max-Age"] = "600" + return response diff --git a/archivebox/api/migrations/0001_initial.py b/archivebox/api/migrations/0001_initial.py new file mode 100644 index 0000000000..1f3e6f3dda --- /dev/null +++ b/archivebox/api/migrations/0001_initial.py @@ -0,0 +1,239 @@ +# Generated by hand on 2025-12-29 +# Creates APIToken and OutboundWebhook tables using raw SQL + +from django.db import migrations, models +import django.db.models.deletion +import django.utils.timezone +from django.conf import settings +from archivebox.uuid_compat import uuid7 +from archivebox.base_models.models import get_or_create_system_user_pk +import archivebox.api.models +import signal_webhooks.fields +import signal_webhooks.utils + + +class Migration(migrations.Migration): + initial = True + + dependencies = [ + ("auth", "0012_alter_user_first_name_max_length"), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.SeparateDatabaseAndState( + database_operations=[ + migrations.RunSQL( + sql=""" + -- Create api_apitoken table + CREATE TABLE IF NOT EXISTS api_apitoken ( + id TEXT PRIMARY KEY NOT NULL, + created_at DATETIME NOT NULL, + modified_at DATETIME NOT NULL, + + token VARCHAR(32) NOT NULL UNIQUE, + expires DATETIME, + + created_by_id INTEGER NOT NULL, + + FOREIGN KEY (created_by_id) 
REFERENCES auth_user(id) ON DELETE CASCADE + ); + CREATE INDEX IF NOT EXISTS api_apitoken_created_by_id_idx ON api_apitoken(created_by_id); + CREATE INDEX IF NOT EXISTS api_apitoken_created_at_idx ON api_apitoken(created_at); + CREATE INDEX IF NOT EXISTS api_apitoken_token_idx ON api_apitoken(token); + + -- Create api_outboundwebhook table + CREATE TABLE IF NOT EXISTS api_outboundwebhook ( + id TEXT PRIMARY KEY NOT NULL, + created_at DATETIME NOT NULL, + modified_at DATETIME NOT NULL, + + name VARCHAR(255) NOT NULL UNIQUE, + signal VARCHAR(255) NOT NULL, + ref VARCHAR(1024) NOT NULL, + endpoint VARCHAR(2048) NOT NULL, + headers TEXT NOT NULL DEFAULT '{}', + auth_token TEXT NOT NULL DEFAULT '', + enabled BOOLEAN NOT NULL DEFAULT 1, + keep_last_response BOOLEAN NOT NULL DEFAULT 0, + created DATETIME NOT NULL, + updated DATETIME NOT NULL, + last_response TEXT NOT NULL DEFAULT '', + last_success DATETIME, + last_failure DATETIME, + + created_by_id INTEGER NOT NULL, + + FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE CASCADE + ); + CREATE INDEX IF NOT EXISTS api_outboundwebhook_created_by_id_idx ON api_outboundwebhook(created_by_id); + CREATE INDEX IF NOT EXISTS api_outboundwebhook_created_at_idx ON api_outboundwebhook(created_at); + CREATE INDEX IF NOT EXISTS api_outboundwebhook_name_idx ON api_outboundwebhook(name); + CREATE INDEX IF NOT EXISTS api_outboundwebhook_ref_idx ON api_outboundwebhook(ref); + """, + reverse_sql=""" + DROP TABLE IF EXISTS api_outboundwebhook; + DROP TABLE IF EXISTS api_apitoken; + """, + ), + ], + state_operations=[ + migrations.CreateModel( + name="APIToken", + fields=[ + ("id", models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)), + ("created_at", models.DateTimeField(db_index=True, default=django.utils.timezone.now)), + ("modified_at", models.DateTimeField(auto_now=True)), + ("token", models.CharField(default=archivebox.api.models.generate_secret_token, max_length=32, 
unique=True)), + ("expires", models.DateTimeField(blank=True, null=True)), + ( + "created_by", + models.ForeignKey( + default=get_or_create_system_user_pk, + on_delete=django.db.models.deletion.CASCADE, + to=settings.AUTH_USER_MODEL, + ), + ), + ], + options={ + "verbose_name": "API Key", + "verbose_name_plural": "API Keys", + "app_label": "api", + }, + ), + migrations.CreateModel( + name="OutboundWebhook", + fields=[ + ("id", models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)), + ("created_at", models.DateTimeField(db_index=True, default=django.utils.timezone.now)), + ("modified_at", models.DateTimeField(auto_now=True)), + ( + "name", + models.CharField(db_index=True, help_text="Webhook name.", max_length=255, unique=True, verbose_name="name"), + ), + ( + "signal", + models.CharField( + choices=[ + ("CREATE", "Create"), + ("UPDATE", "Update"), + ("DELETE", "Delete"), + ("M2M", "M2M changed"), + ("CREATE_OR_UPDATE", "Create or Update"), + ("CREATE_OR_DELETE", "Create or Delete"), + ("CREATE_OR_M2M", "Create or M2M changed"), + ("UPDATE_OR_DELETE", "Update or Delete"), + ("UPDATE_OR_M2M", "Update or M2M changed"), + ("DELETE_OR_M2M", "Delete or M2M changed"), + ("CREATE_UPDATE_OR_DELETE", "Create, Update or Delete"), + ("CREATE_UPDATE_OR_M2M", "Create, Update or M2M changed"), + ("CREATE_DELETE_OR_M2M", "Create, Delete or M2M changed"), + ("UPDATE_DELETE_OR_M2M", "Update, Delete or M2M changed"), + ("CREATE_UPDATE_DELETE_OR_M2M", "Create, Update or Delete, or M2M changed"), + ], + help_text="Signal the webhook fires to.", + max_length=255, + verbose_name="signal", + ), + ), + ( + "ref", + models.CharField( + db_index=True, + help_text="Dot import notation to the model the webhook is for.", + max_length=1023, + validators=[signal_webhooks.utils.model_from_reference], + verbose_name="referenced model", + ), + ), + ( + "endpoint", + models.URLField(help_text="Target endpoint for this webhook.", max_length=2047, 
verbose_name="endpoint"), + ), + ( + "headers", + models.JSONField( + blank=True, + default=dict, + help_text="Headers to send with the webhook request.", + validators=[signal_webhooks.utils.is_dict], + verbose_name="headers", + ), + ), + ( + "auth_token", + signal_webhooks.fields.TokenField( + blank=True, + default="", + help_text="Authentication token to use in an Authorization header.", + max_length=8000, + validators=[signal_webhooks.utils.decode_cipher_key], + verbose_name="authentication token", + ), + ), + ("enabled", models.BooleanField(default=True, help_text="Is this webhook enabled?", verbose_name="enabled")), + ( + "keep_last_response", + models.BooleanField( + default=False, + help_text="Should the webhook keep a log of the latest response it got?", + verbose_name="keep last response", + ), + ), + ( + "created", + models.DateTimeField(auto_now_add=True, help_text="When the webhook was created.", verbose_name="created"), + ), + ( + "updated", + models.DateTimeField(auto_now=True, help_text="When the webhook was last updated.", verbose_name="updated"), + ), + ( + "last_response", + models.CharField( + blank=True, + default="", + help_text="Latest response to this webhook.", + max_length=8000, + verbose_name="last response", + ), + ), + ( + "last_success", + models.DateTimeField( + default=None, + help_text="When the webhook last succeeded.", + null=True, + verbose_name="last success", + ), + ), + ( + "last_failure", + models.DateTimeField( + default=None, + help_text="When the webhook last failed.", + null=True, + verbose_name="last failure", + ), + ), + ( + "created_by", + models.ForeignKey( + default=get_or_create_system_user_pk, + on_delete=django.db.models.deletion.CASCADE, + to=settings.AUTH_USER_MODEL, + ), + ), + ], + options={ + "verbose_name": "API Outbound Webhook", + "app_label": "api", + }, + ), + migrations.AddConstraint( + model_name="outboundwebhook", + constraint=models.UniqueConstraint(fields=["ref", "endpoint"], 
name="prevent_duplicate_hooks_api_outboundwebhook"), + ), + ], + ), + ] diff --git a/archivebox/requirements.txt b/archivebox/api/migrations/__init__.py similarity index 100% rename from archivebox/requirements.txt rename to archivebox/api/migrations/__init__.py diff --git a/archivebox/api/models.py b/archivebox/api/models.py new file mode 100755 index 0000000000..bb0e02ca2c --- /dev/null +++ b/archivebox/api/models.py @@ -0,0 +1,54 @@ +__package__ = "archivebox.api" + +import secrets +from archivebox.uuid_compat import CompactUUIDField, uuid7 + +from django.conf import settings +from django.db import models +from django.utils import timezone +from django_stubs_ext.db.models import TypedModelMeta +from signal_webhooks.models import WebhookBase + +from archivebox.base_models.models import get_or_create_system_user_pk + + +def generate_secret_token() -> str: + return secrets.token_hex(16) + + +class APIToken(models.Model): + id = CompactUUIDField(primary_key=True, default=uuid7, editable=False, unique=True) + created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False) + created_at = models.DateTimeField(default=timezone.now, db_index=True) + modified_at = models.DateTimeField(auto_now=True) + token = models.CharField(max_length=32, default=generate_secret_token, unique=True) + expires = models.DateTimeField(null=True, blank=True) + + class Meta(TypedModelMeta): + app_label = "api" + verbose_name = "API Key" + verbose_name_plural = "API Keys" + + def __str__(self) -> str: + return self.token + + @property + def token_redacted(self): + return f"************{self.token[-4:]}" + + def is_valid(self, for_date=None): + return not self.expires or self.expires >= (for_date or timezone.now()) + + +class OutboundWebhook(WebhookBase): + id = CompactUUIDField(primary_key=True, default=uuid7, editable=False, unique=True) + created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, 
default=get_or_create_system_user_pk, null=False) + created_at = models.DateTimeField(default=timezone.now, db_index=True) + modified_at = models.DateTimeField(auto_now=True) + + class Meta(WebhookBase.Meta): + app_label = "api" + verbose_name = "API Outbound Webhook" + + def __str__(self) -> str: + return f"[{self.id}] {self.ref} -> {self.endpoint}" diff --git a/archivebox/api/urls.py b/archivebox/api/urls.py new file mode 100644 index 0000000000..ced60588dc --- /dev/null +++ b/archivebox/api/urls.py @@ -0,0 +1,29 @@ +__package__ = "archivebox.api" + +from urllib.parse import quote + +from django.http import HttpRequest, HttpResponseRedirect +from django.shortcuts import redirect +from django.urls import path +from django.views.generic.base import RedirectView + +from archivebox.core.routes_util import build_web_url +from .v1_api import urls as v1_api_urls + + +def archive_redirect_view(request: HttpRequest, url: str) -> HttpResponseRedirect: + if request.META.get("QUERY_STRING"): + url = f"{url}?{request.META['QUERY_STRING']}" + return redirect(build_web_url(f"/web/{quote(url, safe=':/')}", request=request)) + + +urlpatterns = [ + path("", RedirectView.as_view(url="/api/v1/docs")), + path("archive/", archive_redirect_view, name="api-archive-redirect"), + path("v1/", RedirectView.as_view(url="/api/v1/docs")), + path("v1/", v1_api_urls), + path("v1", RedirectView.as_view(url="/api/v1/docs")), + # ... v2 can be added here ... 
+ # path("v2/", v2_api_urls), + # path("v2", RedirectView.as_view(url='/api/v2/docs')), +] diff --git a/archivebox/api/v1_api.py b/archivebox/api/v1_api.py new file mode 100644 index 0000000000..a6784e6d93 --- /dev/null +++ b/archivebox/api/v1_api.py @@ -0,0 +1,136 @@ +__package__ = "archivebox.api" + + +from io import StringIO +from traceback import format_exception +from contextlib import redirect_stdout, redirect_stderr + +from django.http import HttpRequest, HttpResponse +from django.core.exceptions import ObjectDoesNotExist, EmptyResultSet, PermissionDenied +from django.contrib.auth.models import User + +from ninja import NinjaAPI, Swagger + +# TODO: explore adding https://eadwincode.github.io/django-ninja-extra/ + +from archivebox.config import VERSION +from archivebox.config.version import get_COMMIT_HASH + +from archivebox.api.auth import API_AUTH_METHODS +from archivebox.api.models import APIToken + + +COMMIT_HASH = get_COMMIT_HASH() or "unknown" + +html_description = f""" +

Welcome to your ArchiveBox server's REST API [v1 ALPHA] homepage!

+
+WARNING: This API is still in an early development stage and may change! +
+ +Served by ArchiveBox v{VERSION} ({COMMIT_HASH[:8]}), API powered by django-ninja. +""" + + +def register_urls(api: NinjaAPI) -> NinjaAPI: + api.add_router("/auth/", "archivebox.api.v1_auth.router") + api.add_router("/core/", "archivebox.api.v1_core.router") + api.add_router("/crawls/", "archivebox.api.v1_crawls.router") + api.add_router("/cli/", "archivebox.api.v1_cli.router") + api.add_router("/machine/", "archivebox.api.v1_machine.router") + api.add_router("/personas/", "archivebox.api.v1_personas.router") + return api + + +class NinjaAPIWithIOCapture(NinjaAPI): + def create_temporal_response(self, request: HttpRequest) -> HttpResponse: + stdout, stderr = StringIO(), StringIO() + + with redirect_stderr(stderr): + with redirect_stdout(stdout): + setattr(request, "stdout", stdout) + setattr(request, "stderr", stderr) + + response = super().create_temporal_response(request) + + # Disable caching of API responses entirely + response["Cache-Control"] = "no-store" + + # Add debug stdout and stderr headers to response + response["X-ArchiveBox-Stdout"] = stdout.getvalue().replace("\n", "\\n")[:200] + response["X-ArchiveBox-Stderr"] = stderr.getvalue().replace("\n", "\\n")[:200] + # response['X-ArchiveBox-View'] = self.get_openapi_operation_id(request) or 'Unknown' + + # Add Auth Headers to response + api_token_attr = request.__dict__.get("_api_token") + api_token = api_token_attr if isinstance(api_token_attr, APIToken) else None + token_expiry = api_token.expires.isoformat() if api_token and api_token.expires else "Never" + + response["X-ArchiveBox-Auth-Method"] = str(request.__dict__.get("_api_auth_method", "None")) + response["X-ArchiveBox-Auth-Expires"] = token_expiry + response["X-ArchiveBox-Auth-Token-Id"] = str(api_token.id) if api_token else "None" + response["X-ArchiveBox-Auth-User-Id"] = str(request.user.pk) if request.user.pk else "None" + response["X-ArchiveBox-Auth-User-Username"] = request.user.username if isinstance(request.user, User) else "None" + + # 
import ipdb; ipdb.set_trace() + # print('RESPONDING NOW', response) + + return response + + +api = NinjaAPIWithIOCapture( + title="ArchiveBox API", + description=html_description, + version=VERSION, + auth=API_AUTH_METHODS, + urls_namespace="api-1", + docs=Swagger(settings={"persistAuthorization": True}), + # docs_decorator=login_required, + # renderer=ORJSONRenderer(), +) +api = register_urls(api) +urls = api.urls + + +@api.exception_handler(Exception) +def generic_exception_handler(request, err): + status = 503 + if isinstance(err, (ObjectDoesNotExist, EmptyResultSet, PermissionDenied)): + status = 404 + + print("".join(format_exception(err))) + + return api.create_response( + request, + { + "succeeded": False, + "message": f"{err.__class__.__name__}: {err}", + "errors": [ + "".join(format_exception(err)), + # or send simpler parent-only traceback: + # *([str(err.__context__)] if getattr(err, '__context__', None) else []), + ], + }, + status=status, + ) + + +# import orjson +# from ninja.renderers import BaseRenderer +# class ORJSONRenderer(BaseRenderer): +# media_type = "application/json" +# def render(self, request, data, *, response_status): +# return { +# "success": True, +# "errors": [], +# "result": data, +# "stdout": ansi_to_html(stdout.getvalue().strip()), +# "stderr": ansi_to_html(stderr.getvalue().strip()), +# } +# return orjson.dumps(data) diff --git a/archivebox/api/v1_auth.py b/archivebox/api/v1_auth.py new file mode 100644 index 0000000000..e8c61e173f --- /dev/null +++ b/archivebox/api/v1_auth.py @@ -0,0 +1,65 @@ +__package__ = "archivebox.api" + +from django.http import HttpRequest + +from ninja import Router, Schema + +from archivebox.api.auth import auth_using_token, auth_using_password, get_or_create_api_token + + +router = Router(tags=["Authentication"], auth=None) + + +class PasswordAuthSchema(Schema): + """Schema for a /get_api_token request""" + + username: str | None = None + password: str | None = None + + +@router.post( + 
"/get_api_token", + auth=None, + summary="Generate an API token for a given username & password (or currently logged-in user)", +) # auth=None because they are not authed yet +def get_api_token(request: HttpRequest, auth_data: PasswordAuthSchema): + user = auth_using_password( + username=auth_data.username, + password=auth_data.password, + request=request, + ) + + if user and user.is_superuser: + api_token = get_or_create_api_token(user) + assert api_token is not None, "Failed to create API token" + return { + "success": True, + "user_id": str(user.pk), + "username": user.username, + "token": api_token.token, + "expires": api_token.expires.isoformat() if api_token.expires else None, + } + + return {"success": False, "errors": ["Invalid credentials"]} + + +class TokenAuthSchema(Schema): + """Schema for a /check_api_token request""" + + token: str + + +@router.post( + "/check_api_token", + auth=None, + summary="Validate an API token to make sure its valid and non-expired", +) # auth=None because they are not authed yet +def check_api_token(request: HttpRequest, token_data: TokenAuthSchema): + user = auth_using_token( + token=token_data.token, + request=request, + ) + if user: + return {"success": True, "user_id": str(user.pk)} + + return {"success": False, "user_id": None} diff --git a/archivebox/api/v1_cli.py b/archivebox/api/v1_cli.py new file mode 100644 index 0000000000..ea175f256c --- /dev/null +++ b/archivebox/api/v1_cli.py @@ -0,0 +1,313 @@ +__package__ = "archivebox.api" + +import json +from io import StringIO +from typing import Any +from enum import Enum + +from django.http import HttpRequest + +from ninja import Router, Schema +from ninja.errors import HttpError +from pydantic import Field + +from archivebox.misc.util import ansi_to_html +from archivebox.core.models import SnapshotQuerySet + + +# from .auth import API_AUTH_METHODS + +# router for API that exposes archivebox cli subcommands as REST endpoints +router = Router(tags=["ArchiveBox CLI 
Sub-Commands"]) + + +# Schemas + +JSONType = list[Any] | dict[str, Any] | bool | int | str | None +FILTER_PATTERNS_EXAMPLES = [["https://example.com"]] + + +class CLICommandResponseSchema(Schema): + success: bool + errors: list[str] + result: JSONType + result_format: str = "str" + stdout: str + stderr: str + + +FilterTypeChoices = Enum( + "FilterTypeChoices", + {filter_type: filter_type for filter_type in SnapshotQuerySet.FILTER_TYPE_CHOICES}, + type=str, +) + + +class AddCommandSchema(Schema): + urls: list[str] + snapshot_ids: list[str] | None = None + tag: str = "" + depth: int = 0 + max_urls: int = 0 + crawl_max_size: int = 0 + crawl_timeout: int = 0 + snapshot_max_size: int = 0 + parser: str = "auto" + plugins: str = "" + only_new: bool | None = None + update: bool = False + overwrite: bool = False + index_only: bool = False + + +class SnapshotFilterCommandSchema(Schema): + after: float | None = 0 + before: float | None = None + filter_type: str | None = FilterTypeChoices.substring + filter_patterns: list[str] | None = Field(default=None, examples=FILTER_PATTERNS_EXAMPLES) + status: str | None = None + url__icontains: str | None = None + url__istartswith: str | None = None + tag: str | None = None + crawl_id: str | None = None + limit: int | None = None + sort: str | None = None + search: str | None = None + + +class UpdateCommandSchema(SnapshotFilterCommandSchema): + resume: str | None = None + batch_size: int = 100 + continuous: bool = False + index_only: bool = False + migrate_only: bool = False + + +class ScheduleCommandSchema(Schema): + import_path: str | None = None + add: bool = False + show: bool = False + foreground: bool = False + run_all: bool = False + quiet: bool = False + every: str | None = None + tag: str = "" + depth: int = 0 + only_new: bool | None = None + update: bool = False + overwrite: bool = False + clear: bool = False + + +class ListCommandSchema(SnapshotFilterCommandSchema): + as_json: bool = True + as_html: bool = False + as_csv: str 
| None = "timestamp,url" + with_headers: bool = False + + +class RemoveCommandSchema(SnapshotFilterCommandSchema): + filter_type: str = FilterTypeChoices.exact + timeout: float = 60.0 + + +def snapshot_filter_kwargs(args: SnapshotFilterCommandSchema, *, default_filter_type: str) -> dict[str, Any]: + kwargs = args.dict() + kwargs["filter_patterns"] = kwargs.get("filter_patterns") or [] + kwargs["filter_type"] = kwargs.get("filter_type") or default_filter_type + return kwargs + + +@router.post("/add", response=CLICommandResponseSchema, summary="archivebox add [args] [urls]") +def cli_add(request: HttpRequest, args: AddCommandSchema): + from archivebox.cli.archivebox_add import add + + config_overrides: dict[str, object] = {} + if args.only_new is not None: + config_overrides["ONLY_NEW"] = bool(args.only_new) + if args.update or args.overwrite: + config_overrides["ONLY_NEW"] = False + crawl, snapshots = add( + urls=args.urls, + snapshot_ids=args.snapshot_ids, + tag=args.tag, + depth=args.depth, + max_urls=args.max_urls, + crawl_max_size=args.crawl_max_size, + crawl_timeout=args.crawl_timeout, + snapshot_max_size=args.snapshot_max_size, + index_only=args.index_only, + plugins=args.plugins, + parser=args.parser, + bg=True, # Always run in background for API calls + created_by_id=request.user.pk, + config=config_overrides or None, + ) + + snapshot_ids = [str(snapshot_id) for snapshot_id in snapshots.values_list("id", flat=True)] + result_payload = { + "crawl_id": str(crawl.id), + "num_snapshots": len(snapshot_ids), + "snapshot_ids": snapshot_ids, + "queued_urls": args.urls, + } + stdout = request.__dict__.get("stdout") + stderr = request.__dict__.get("stderr") + + return { + "success": True, + "errors": [], + "result": result_payload, + "result_format": "json", + "stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "", + "stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "", + } + + 
+@router.post("/update", response=CLICommandResponseSchema, summary="archivebox update [args] [filter_patterns]") +def cli_update(request: HttpRequest, args: UpdateCommandSchema): + from archivebox.cli.archivebox_update import _build_filtered_snapshots_queryset, update + from archivebox.core.snapshot_status import normalize_snapshot_status + + try: + status = normalize_snapshot_status(args.status) + except ValueError as err: + raise HttpError(400, str(err)) from err + + update_kwargs = snapshot_filter_kwargs(args, default_filter_type=FilterTypeChoices.substring) + update_kwargs["status"] = status + update_kwargs["stop_daemon_stack"] = False + + is_filtered_update = any( + (update_kwargs.get(key) for key in (*SnapshotQuerySet.FILTER_ARG_KEYS, "resume") if key != "filter_type"), + ) + matched_snapshot_ids = [] + if is_filtered_update: + matched_snapshot_ids = [ + str(snapshot_id) for snapshot_id in _build_filtered_snapshots_queryset(**update_kwargs).values_list("id", flat=True) + ] + + update(**update_kwargs) + stdout = request.__dict__.get("stdout") + stderr = request.__dict__.get("stderr") + return { + "success": True, + "errors": [], + "result": { + "matched_count": len(matched_snapshot_ids), + "snapshot_ids": matched_snapshot_ids, + } + if is_filtered_update + else None, + "stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "", + "stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "", + } + + +@router.post("/schedule", response=CLICommandResponseSchema, summary="archivebox schedule [args] [import_path]") +def cli_schedule(request: HttpRequest, args: ScheduleCommandSchema): + from archivebox.cli.archivebox_schedule import schedule + + config_overrides: dict[str, object] = {} + if args.only_new is not None: + config_overrides["ONLY_NEW"] = bool(args.only_new) + if args.update or args.overwrite: + config_overrides["ONLY_NEW"] = False + result = schedule( + import_path=args.import_path, + 
add=args.add, + show=args.show, + foreground=args.foreground, + run_all=args.run_all, + quiet=args.quiet, + clear=args.clear, + every=args.every, + tag=args.tag, + depth=args.depth, + config=config_overrides or None, + ) + + stdout = request.__dict__.get("stdout") + stderr = request.__dict__.get("stderr") + return { + "success": True, + "errors": [], + "result": result, + "result_format": "json", + "stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "", + "stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "", + } + + +@router.post("/search", response=CLICommandResponseSchema, summary="archivebox search [args] [filter_patterns]") +def cli_search(request: HttpRequest, args: ListCommandSchema): + from archivebox.cli.archivebox_snapshot import build_snapshot_queryset + + search_kwargs = snapshot_filter_kwargs(args, default_filter_type=FilterTypeChoices.substring) + as_json = search_kwargs.pop("as_json") + as_html = search_kwargs.pop("as_html") + as_csv = search_kwargs.pop("as_csv") + with_headers = search_kwargs.pop("with_headers") + try: + snapshots = build_snapshot_queryset(**search_kwargs).select_related("crawl", "crawl__created_by") + except ValueError as err: + raise HttpError(400, str(err)) from err + + result_format = "txt" + if as_json: + result_format = "json" + result = [ + json.loads(json.dumps(snapshot.to_dict(extended=True), default=str)) + for snapshot in snapshots.prefetch_related("tags").iterator(chunk_size=500) + ] + elif as_html: + result_format = "html" + result = "\n".join(snapshot.url for snapshot in snapshots.iterator(chunk_size=500)) + elif as_csv: + result_format = "csv" + cols = [col.strip() for col in as_csv.split(",") if col.strip()] + rows = [snapshot.to_csv(cols=cols, separator=",") for snapshot in snapshots.prefetch_related("tags").iterator(chunk_size=500)] + result = "\n".join((",".join(cols), *rows) if with_headers else rows) + else: + result = "\n".join(snapshot.url 
for snapshot in snapshots.iterator(chunk_size=500)) + + stdout = request.__dict__.get("stdout") + stderr = request.__dict__.get("stderr") + return { + "success": True, + "errors": [], + "result": result, + "result_format": result_format, + "stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "", + "stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "", + } + + +@router.post("/remove", response=CLICommandResponseSchema, summary="archivebox remove [args] [filter_patterns]") +def cli_remove(request: HttpRequest, args: RemoveCommandSchema): + from archivebox.cli.archivebox_remove import remove + from archivebox.core.models import Snapshot + + remove_kwargs = snapshot_filter_kwargs(args, default_filter_type=FilterTypeChoices.exact) + timeout_arg = remove_kwargs.pop("timeout") + timeout = min(float(timeout_arg if timeout_arg is not None else 60.0), 60.0) + try: + snapshots_to_remove = Snapshot.objects.order_by("-created_at").search(**remove_kwargs) + except ValueError as err: + raise HttpError(400, str(err)) from err + + result = remove( + yes=True, # no way to interactively ask for confirmation via API, so we force yes + snapshots=snapshots_to_remove, + timeout=timeout, + ) + stdout = request.__dict__.get("stdout") + stderr = request.__dict__.get("stderr") + return { + "success": bool(result["success"]), + "errors": [str(result["error"])] if result["error"] else [], + "result": result, + "result_format": "json", + "stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "", + "stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "", + } diff --git a/archivebox/api/v1_core.py b/archivebox/api/v1_core.py new file mode 100644 index 0000000000..a90ed6ecf5 --- /dev/null +++ b/archivebox/api/v1_core.py @@ -0,0 +1,1455 @@ +__package__ = "archivebox.api" + +import math +import json +import mimetypes +import re +from collections import defaultdict 
+from pathlib import Path, PurePosixPath +from uuid import UUID +from typing import Union, Any, Annotated +from datetime import datetime, time + +from django.db import transaction +from django.db.models import Model, Q +from django.http import HttpRequest, HttpResponse +from django.http.multipartparser import MultiPartParser, MultiPartParserError +from django.core.exceptions import ValidationError +from django.core.files.storage import FileSystemStorage +from django.contrib.auth import get_user_model +from django.contrib.auth.models import User +from django.shortcuts import redirect +from django.utils import timezone +from django.utils.dateparse import parse_date, parse_datetime +from django.utils.feedgenerator import Rss201rev2Feed + +from ninja import Router, Schema, FilterLookup, FilterSchema, Query, Form, UploadedFile +from ninja.pagination import paginate, PaginationBase +from ninja.errors import HttpError + +from archivebox.core.models import Snapshot, ArchiveResult, Tag +from archivebox.core.permissions import public_snapshots_queryset +from archivebox.api.auth import authenticated_user_from_request +from archivebox.config.common import get_config +from archivebox.core.routes_util import build_web_url +from archivebox.misc.util import filter_queryset_by_uuid_substring, validate_url_length +from archivebox.core.tag_util import ( + add_snapshot_counts, + build_tag_cards, + delete_tag as delete_tag_record, + export_tag_snapshots_jsonl, + export_tag_urls, + get_matching_tags, + get_or_create_tag, + get_tag_by_ref, + normalize_created_by_filter, + normalize_created_year_filter, + normalize_has_snapshots_filter, + normalize_tag_sort, + rename_tag as rename_tag_record, +) +from archivebox.crawls.models import Crawl +from archivebox.api.v1_crawls import CrawlSchema, get_crawl_by_ref +from archivebox.search.config import get_search_mode, get_search_mode_backend +from archivebox.search.query import apply_snapshot_search +from archivebox.core.snapshot_status import 
filter_snapshots_by_status, normalize_snapshot_status + + +router = Router(tags=["Core Models"]) + +ARCHIVERESULT_UPLOAD_HOOK_NAME = "on_Snapshot__archivebox_browser_extension_upload" +ARCHIVERESULT_UPLOAD_PLUGIN_RE = re.compile(r"^[A-Za-z0-9_.-]{1,32}$") + + +class CustomPagination(PaginationBase): + class Input(PaginationBase.Input): + limit: int = 200 + offset: int = 0 + page: int = 0 + + class Output(PaginationBase.Output): + count: int + total_items: int + total_pages: int + page: int + limit: int + offset: int + num_items: int + items: list[Any] + + def paginate_queryset(self, queryset, pagination: Input, request: HttpRequest, **params): + limit = min(pagination.limit, 500) + offset = pagination.offset or (pagination.page * limit) + total = queryset.values("pk").distinct().count() if queryset.query.distinct else queryset.count() + total_pages = math.ceil(total / limit) + current_page = math.ceil(offset / (limit + 1)) + items = queryset[offset : offset + limit] + return { + "count": total, + "total_items": total, + "total_pages": total_pages, + "page": current_page, + "limit": limit, + "offset": offset, + "num_items": len(items), + "items": items, + } + + +### ArchiveResult ######################################################################### + + +class MinimalArchiveResultSchema(Schema): + TYPE: str = "core.models.ArchiveResult" + id: UUID + created_at: datetime | None + modified_at: datetime | None + created_by_id: str + created_by_username: str + status: str + retry_at: datetime | None = None + plugin: str + hook_name: str + process_id: UUID | None + cmd_version: str | None + cmd: list[str] | None + pwd: str | None + output_str: str + output_json: dict[str, Any] | None + output_files: dict[str, dict[str, Any]] | None + output_size: int + output_mimetypes: str + start_ts: datetime | None + end_ts: datetime | None + + @staticmethod + def resolve_created_by_id(obj): + return str(obj.created_by.pk) + + @staticmethod + def resolve_created_by_username(obj) -> 
str: + return obj.created_by.username + + @staticmethod + def resolve_output_files(obj): + return obj.output_file_map() + + @staticmethod + def resolve_output_mimetypes(obj) -> str: + mime_sizes: dict[str, int] = defaultdict(int) + for metadata in obj.output_file_map().values(): + if not isinstance(metadata, dict): + continue + mimetype = str(metadata.get("mimetype") or "").strip() + try: + size = max(int(metadata.get("size") or 0), 0) + except (TypeError, ValueError): + size = 0 + if mimetype and size: + mime_sizes[mimetype] += size + if mime_sizes: + return ",".join(mime for mime, _size in sorted(mime_sizes.items(), key=lambda item: item[1], reverse=True)) + return obj.output_mimetypes or "" + + +class ArchiveResultSchema(MinimalArchiveResultSchema): + TYPE: str = "core.models.ArchiveResult" + snapshot_id: UUID + snapshot_timestamp: str + snapshot_url: str + snapshot_tags: list[str] + + @staticmethod + def resolve_snapshot_timestamp(obj): + return obj.snapshot.timestamp + + @staticmethod + def resolve_snapshot_url(obj): + return obj.snapshot.url + + @staticmethod + def resolve_snapshot_id(obj): + return obj.snapshot_id + + @staticmethod + def resolve_snapshot_tags(obj): + return sorted(tag.name for tag in obj.snapshot.tags.all()) + + +class ArchiveResultFilterSchema(FilterSchema): + id: Annotated[str | None, FilterLookup(["id__startswith", "snapshot__id__startswith", "snapshot__timestamp__startswith"])] = None + search: Annotated[ + str | None, + FilterLookup( + [ + "snapshot__url__icontains", + "snapshot__title__icontains", + "snapshot__tags__name__icontains", + "plugin", + "output_str__icontains", + "id__startswith", + "snapshot__id__startswith", + "snapshot__timestamp__startswith", + ], + ), + ] = None + snapshot_id: Annotated[str | None, FilterLookup(["snapshot__id__startswith", "snapshot__timestamp__startswith"])] = None + snapshot_url: Annotated[str | None, FilterLookup("snapshot__url__icontains")] = None + snapshot_tag: Annotated[str | None, 
FilterLookup("snapshot__tags__name__icontains")] = None + status: Annotated[str | None, FilterLookup("status")] = None + output_str: Annotated[str | None, FilterLookup("output_str__icontains")] = None + plugin: Annotated[str | None, FilterLookup("plugin__icontains")] = None + hook_name: Annotated[str | None, FilterLookup("hook_name__icontains")] = None + process_id: Annotated[str | None, FilterLookup("process__id__startswith")] = None + cmd: Annotated[str | None, FilterLookup("cmd__0__icontains")] = None + pwd: Annotated[str | None, FilterLookup("pwd__icontains")] = None + cmd_version: Annotated[str | None, FilterLookup("cmd_version")] = None + created_at: Annotated[datetime | None, FilterLookup("created_at")] = None + created_at__gte: Annotated[datetime | None, FilterLookup("created_at__gte")] = None + created_at__lt: Annotated[datetime | None, FilterLookup("created_at__lt")] = None + + +@router.get("/archiveresults", response=list[ArchiveResultSchema], url_name="get_archiveresult") +@paginate(CustomPagination) +def get_archiveresults(request: HttpRequest, filters: Query[ArchiveResultFilterSchema]): + """List all ArchiveResult entries matching these filters.""" + queryset = filters.filter(ArchiveResult.objects.all()) + if filters.search or filters.snapshot_tag: + return queryset.distinct() + return queryset + + +def _uuid_ref_query(field_name: str, ref: str) -> Q: + raw_ref = str(ref or "").strip() + query = Q(**{f"{field_name}__startswith": raw_ref}) + if raw_ref: + query |= Q(**{f"{field_name}__icontains": raw_ref}) + try: + parsed_uuid = UUID(raw_ref) + except (TypeError, ValueError): + normalized_ref = raw_ref.replace("-", "") + if normalized_ref and normalized_ref != raw_ref: + query |= Q(**{f"{field_name}__startswith": normalized_ref}) + query |= Q(**{f"{field_name}__icontains": normalized_ref}) + else: + query |= Q(**{field_name: parsed_uuid}) + query |= Q(**{f"{field_name}__startswith": parsed_uuid.hex}) + return query + + 
+@router.get("/archiveresult/{archiveresult_id}", response=ArchiveResultSchema, url_name="get_archiveresult") +def get_archiveresult(request: HttpRequest, archiveresult_id: str): + """Get a specific ArchiveResult by id.""" + return ArchiveResult.objects.get(_uuid_ref_query("id", archiveresult_id)) + + +def _normalize_uploaded_archiveresult_plugin(plugin: str) -> str: + normalized = str(plugin or "").strip().strip("/") + if not ARCHIVERESULT_UPLOAD_PLUGIN_RE.fullmatch(normalized): + raise HttpError(400, "Invalid ArchiveResult plugin name") + return normalized + + +def _normalize_uploaded_archiveresult_output_path(output_path: str, *, filename: str) -> str: + raw_path = str(output_path or filename or "").strip().replace("\\", "/") + if not raw_path: + raise HttpError(400, "ArchiveResult output path is required") + + path = PurePosixPath(raw_path) + if path.is_absolute() or any(part in ("", ".", "..") for part in path.parts): + raise HttpError(400, "Invalid ArchiveResult output path") + + return str(path) + + +def _parse_archiveresult_output_json(output_json: str | None) -> dict[str, Any] | None: + if not output_json: + return None + try: + parsed = json.loads(output_json) + except json.JSONDecodeError as err: + raise HttpError(400, "ArchiveResult output_json must be valid JSON") from err + if parsed is None: + return None + if not isinstance(parsed, dict): + raise HttpError(400, "ArchiveResult output_json must be a JSON object") + return parsed + + +def _get_archiveresult_upload_data(request: HttpRequest): + cached = request.__dict__.get("_archiveresult_upload_data") + if cached is not None: + return cached + + if request.method.upper() == "PATCH" and request.content_type.startswith("multipart/"): + try: + data = MultiPartParser(request.META, request, request.upload_handlers, request.encoding).parse() + except MultiPartParserError as err: + raise HttpError(400, f"Invalid ArchiveResult multipart upload: {err}") from err + else: + data = (request.POST, request.FILES) + 
+ setattr(request, "_archiveresult_upload_data", data) + return data + + +def _get_archiveresult_upload_files(request: HttpRequest, *, allow_empty: bool = False) -> list[UploadedFile]: + _post, request_files = _get_archiveresult_upload_data(request) + files = [*request_files.getlist("files"), *request_files.getlist("file")] + if not files and not allow_empty: + raise HttpError(400, "At least one ArchiveResult file is required") + return files + + +def _get_archiveresult_upload_form_values(request: HttpRequest, *field_names: str) -> list[str]: + request_post, _files = _get_archiveresult_upload_data(request) + values: list[str] = [] + for field_name in field_names: + values.extend(str(value) for value in request_post.getlist(field_name)) + if len(values) == 1: + value = values[0].strip() + if value.startswith("["): + try: + parsed = json.loads(value) + except json.JSONDecodeError: + parsed = None + if isinstance(parsed, list): + return [str(item) for item in parsed] + return values + + +def _get_archiveresult_upload_form_value(request: HttpRequest, *field_names: str) -> str: + request_post, _files = _get_archiveresult_upload_data(request) + for field_name in field_names: + value = request_post.get(field_name) + if value is not None: + return str(value) + return "" + + +def _parse_archiveresult_upload_int(value: str, field_name: str, *, default: int | None = None) -> int: + if value == "" and default is not None: + return default + try: + parsed = int(value) + except (TypeError, ValueError) as err: + raise HttpError(400, f"ArchiveResult {field_name} must be an integer") from err + if parsed < 0: + raise HttpError(400, f"ArchiveResult {field_name} must be non-negative") + return parsed + + +def _summarize_archiveresult_output_files(output_files: dict[str, dict[str, Any]]) -> tuple[int, str]: + mime_sizes: dict[str, int] = defaultdict(int) + total_size = 0 + for metadata in output_files.values(): + if not isinstance(metadata, dict): + continue + try: + size = 
def _get_snapshot_by_ref(snapshot_id: str):
    """Fetch one Snapshot (with its crawl and crawl owner) by id reference or timestamp prefix.

    The first lookup accepts either an id match or a ``timestamp`` that starts
    with ``snapshot_id``; when that finds nothing the id-only lookup is
    retried, so a missing snapshot surfaces as ``Snapshot.DoesNotExist``.
    """
    snapshots = Snapshot.objects.select_related("crawl__created_by")
    id_or_timestamp = _uuid_ref_query("id", snapshot_id) | Q(timestamp__startswith=snapshot_id)
    try:
        match = snapshots.get(id_or_timestamp)
    except Snapshot.DoesNotExist:
        match = snapshots.get(_uuid_ref_query("id", snapshot_id))
    return match
def _write_archiveresult_files(
    request: HttpRequest,
    snapshot: Snapshot,
    plugin_name: str,
    *,
    existing_output_files: dict[str, dict[str, Any]] | None = None,
    allow_empty: bool = False,
) -> dict[str, dict[str, Any]]:
    """Persist uploaded files under ``snapshot.output_dir / plugin_name``.

    Two upload modes are supported:

    * chunked - selected when the ``chunk_output_path`` form field is set;
      exactly one file part is appended to the destination file.
    * whole-file - every uploaded file replaces any existing file at its
      destination path.

    Args:
        request: Request carrying the multipart files and form fields.
        snapshot: Snapshot whose ``output_dir`` is the storage root.
        plugin_name: Sub-directory of the snapshot dir to write into.
        existing_output_files: Metadata map to start from; it is copied, not
            mutated.
        allow_empty: Passed to ``_get_archiveresult_upload_files``; when true
            a request without files is accepted and the copied map is
            returned unchanged.

    Returns:
        Map of relative output path -> ``{"extension", "mimetype", "size"}``
        (plus an ``"upload"`` progress dict for chunked uploads).

    Raises:
        HttpError: 400 for invalid form values, 409 when a chunk's offset or
            the final assembled size does not match what is on disk.
    """
    files = _get_archiveresult_upload_files(request, allow_empty=allow_empty)
    # Both plural and singular field names are accepted for per-file values.
    output_paths = _get_archiveresult_upload_form_values(request, "output_paths", "output_path")
    mime_types = _get_archiveresult_upload_form_values(request, "mime_types", "mime_type")
    chunk_output_path = _get_archiveresult_upload_form_value(request, "chunk_output_path")

    snapshot_dir = snapshot.output_dir
    plugin_dir = snapshot_dir / plugin_name
    storage = FileSystemStorage(location=str(plugin_dir))
    # Copy so the caller's map is never modified in place.
    output_files = dict(existing_output_files or {})

    if not files:
        return output_files

    if chunk_output_path:
        # --- chunked upload: append one chunk to the destination file ---
        if len(files) != 1:
            raise HttpError(400, "Exactly one ArchiveResult file chunk is required")

        uploaded_file = files[0]
        relative_output_path = _normalize_uploaded_archiveresult_output_path(
            chunk_output_path,
            filename=uploaded_file.name,
        )
        # No default is passed, so each chunk_* field must be a valid integer.
        chunk_index = _parse_archiveresult_upload_int(
            _get_archiveresult_upload_form_value(request, "chunk_index"),
            "chunk_index",
        )
        chunk_count = _parse_archiveresult_upload_int(
            _get_archiveresult_upload_form_value(request, "chunk_count"),
            "chunk_count",
        )
        chunk_offset = _parse_archiveresult_upload_int(
            _get_archiveresult_upload_form_value(request, "chunk_offset"),
            "chunk_offset",
        )
        chunk_total_size = _parse_archiveresult_upload_int(
            _get_archiveresult_upload_form_value(request, "chunk_total_size"),
            "chunk_total_size",
        )

        if chunk_count < 1:
            raise HttpError(400, "ArchiveResult chunk_count must be at least 1")
        if chunk_index >= chunk_count:
            raise HttpError(400, "ArchiveResult chunk_index must be less than chunk_count")
        if chunk_total_size and chunk_offset > chunk_total_size:
            raise HttpError(400, "ArchiveResult chunk_offset cannot exceed chunk_total_size")

        # The first chunk restarts the upload: drop whatever was there before.
        if chunk_index == 0 and chunk_offset == 0 and storage.exists(relative_output_path):
            storage.delete(relative_output_path)

        # The declared offset must equal the bytes already on disk, otherwise
        # the append below would leave a gap or duplicate data.
        current_size = storage.size(relative_output_path) if storage.exists(relative_output_path) else 0
        if current_size != chunk_offset:
            raise HttpError(
                409,
                f"ArchiveResult chunk offset mismatch for {relative_output_path}: expected {current_size}, got {chunk_offset}",
            )

        Path(storage.path(relative_output_path)).parent.mkdir(parents=True, exist_ok=True)
        with storage.open(relative_output_path, "ab") as destination:
            for chunk in uploaded_file.chunks():
                destination.write(chunk)

        size = storage.size(relative_output_path)
        upload_complete = chunk_index + 1 == chunk_count
        # Only the last chunk is checked against the declared total size.
        if upload_complete and size != chunk_total_size:
            raise HttpError(
                409,
                f"ArchiveResult chunk size mismatch for {relative_output_path}: expected {chunk_total_size}, got {size}",
            )

        guessed_mime = mimetypes.guess_type(relative_output_path)[0]
        # Mimetype precedence: explicit form value, then the upload's content type, then a filename guess.
        output_mime_type = (mime_types[0] if mime_types else "") or uploaded_file.content_type or guessed_mime or "application/octet-stream"
        output_files[relative_output_path] = {
            "extension": PurePosixPath(relative_output_path).suffix.lower().lstrip("."),
            "mimetype": output_mime_type,
            "size": size,
            "upload": {
                "chunked": True,
                "chunk_index": chunk_index,
                "chunk_count": chunk_count,
                "chunks_received": chunk_index + 1,
                "complete": upload_complete,
            },
        }

        return output_files

    # --- whole-file upload: each file replaces its destination ---
    for index, uploaded_file in enumerate(files):
        # Files without a matching output_paths entry fall back to their own filename.
        relative_output_path = _normalize_uploaded_archiveresult_output_path(
            output_paths[index] if index < len(output_paths) else "",
            filename=uploaded_file.name,
        )
        # Delete any existing file at this path before saving the new upload.
        if storage.exists(relative_output_path):
            storage.delete(relative_output_path)
        saved_output_path = storage.save(relative_output_path, uploaded_file)
        size = storage.size(saved_output_path)
        guessed_mime = mimetypes.guess_type(saved_output_path)[0]
        output_mime_type = (
            (mime_types[index] if index < len(mime_types) else "")
            or uploaded_file.content_type
            or guessed_mime
            or "application/octet-stream"
        )
        # Metadata is keyed by the path storage.save() actually returned.
        output_files[saved_output_path] = {
            "extension": PurePosixPath(saved_output_path).suffix.lower().lstrip("."),
            "mimetype": output_mime_type,
            "size": size,
        }

    return output_files
@router.patch("/archiveresult/{archiveresult_id}", response=ArchiveResultSchema, url_name="patch_archiveresult")
def patch_archiveresult(
    request: HttpRequest,
    archiveresult_id: str,
):
    """Append or replace files on an existing ArchiveResult."""
    # Flow: write the uploaded files, merge their metadata with the freshest
    # stored map, then apply the optional output_str / status / output_json
    # form fields and save only the columns that changed.
    result = ArchiveResult.objects.select_related("snapshot__crawl__created_by").get(_uuid_ref_query("id", archiveresult_id))
    output_files = _write_archiveresult_files(
        request,
        result.snapshot,
        result.plugin,
        existing_output_files=result.output_file_map(),
    )
    # Re-read output_files/status after the file write; entries produced by
    # this request take precedence over the re-read ones in the merge below.
    latest_result = ArchiveResult.objects.only("output_files", "status").get(pk=result.pk)
    output_files = {
        **latest_result.output_file_map(),
        **output_files,
    }
    output_size, output_mimetypes = _summarize_archiveresult_output_files(output_files)

    # Columns that are always written; optional ones are appended below.
    update_fields = ["output_files", "output_size", "output_mimetypes", "end_ts", "modified_at"]
    result.output_files = output_files
    result.output_size = output_size
    result.output_mimetypes = output_mimetypes
    result.end_ts = timezone.now()
    # Read via the upload helper, which also handles multipart PATCH bodies.
    output_str = _get_archiveresult_upload_form_value(request, "output_str")
    status = _get_archiveresult_upload_form_value(request, "status")
    output_json = _get_archiveresult_upload_form_value(request, "output_json")
    if output_str:
        result.output_str = output_str
        update_fields.append("output_str")
    if status:
        normalized_status = ArchiveResult.normalize_status(status)
        # A request for STARTED never overrides a stored status that is no longer STARTED.
        if normalized_status == ArchiveResult.StatusChoices.STARTED and latest_result.status != ArchiveResult.StatusChoices.STARTED:
            normalized_status = latest_result.status
        result.status = normalized_status
        update_fields.append("status")
    elif latest_result.status == ArchiveResult.StatusChoices.QUEUED and ArchiveResult.output_files_upload_complete(output_files):
        # No explicit status: a queued result is promoted once its uploads are complete.
        result.status = ArchiveResult.StatusChoices.SUCCEEDED
        update_fields.append("status")
    if output_json:
        result.output_json = _parse_archiveresult_output_json(output_json)
        update_fields.append("output_json")

    result.save(update_fields=update_fields)
    # Snapshot maintenance is queued for every status except STARTED.
    if result.status != ArchiveResult.StatusChoices.STARTED:
        _queue_archiveresult_snapshot_maintenance(result.snapshot)

    return result
+class SnapshotSchema(Schema): + TYPE: str = "core.models.Snapshot" + id: UUID + created_by_id: str + created_by_username: str + created_at: datetime + modified_at: datetime + status: str + retry_at: datetime | None + bookmarked_at: datetime + downloaded_at: datetime | None + url: str + tags: list[str] + title: str | None + timestamp: str + archive_path: str + archive_size: int + output_size: int + num_archiveresults: int + archiveresults: list[MinimalArchiveResultSchema] + + @staticmethod + def resolve_created_by_id(obj): + return str(obj.created_by.pk) + + @staticmethod + def resolve_created_by_username(obj): + return obj.created_by.username + + @staticmethod + def resolve_tags(obj): + return sorted(tag.name for tag in obj.tags.all()) + + @staticmethod + def resolve_archive_size(obj): + return int(obj.archive_size or 0) + + @staticmethod + def resolve_output_size(obj): + return SnapshotSchema.resolve_archive_size(obj) + + @staticmethod + def resolve_num_archiveresults(obj, context): + return obj.archiveresult_set.all().distinct().count() + + @staticmethod + def resolve_archiveresults(obj, context): + if bool(context["request"].__dict__.get("with_archiveresults", False)): + return obj.archiveresult_set.all().distinct() + return ArchiveResult.objects.none() + + +class SnapshotUpdateSchema(Schema): + action: str | None = None + status: str | None = None + retry_at: datetime | None = None + tags: list[str] | None = None + + +class SnapshotCreateSchema(Schema): + url: str + crawl_id: str | None = None + depth: int = 0 + title: str | None = None + tags: list[str] | None = None + status: str | None = None + + +class SnapshotDeleteResponseSchema(Schema): + success: bool + snapshot_id: str + crawl_id: str + deleted_count: int + + +def normalize_tag_list(tags: list[str] | None = None) -> list[str]: + return [tag.strip() for tag in (tags or []) if tag and tag.strip()] + + +def _parse_rss_before(before: str | None) -> datetime: + if not before: + return timezone.now() + + 
value = before.strip() + parsed_dt = None + + if len(value) == 8 and value.isdigit(): + parsed_date = datetime.strptime(value, "%Y%m%d").date() + else: + parsed_dt = parse_datetime(value) + parsed_date = None if parsed_dt else parse_date(value) + + if parsed_dt is None: + if parsed_date is None: + raise HttpError(400, "before must be an ISO datetime, YYYY-MM-DD, or YYYYMMDD") + parsed_dt = datetime.combine(parsed_date, time.max) + + if timezone.is_naive(parsed_dt): + parsed_dt = timezone.make_aware(parsed_dt, timezone.get_current_timezone()) + return parsed_dt + + +def _filter_snapshots_for_rss( + *, + crawl_id: str = "", + created_by: str = "", + before: str | None = None, + limit: int = 50, +): + limit = max(1, min(int(limit or 50), 500)) + before_dt = _parse_rss_before(before) + queryset = ( + Snapshot.objects.select_related("crawl__created_by") + .prefetch_related("tags") + .only( + "id", + "url", + "title", + "timestamp", + "bookmarked_at", + "created_at", + "modified_at", + "fs_version", + "crawl_id", + "crawl__id", + "crawl__created_by_id", + "crawl__created_by__id", + "crawl__created_by__username", + ) + .filter(bookmarked_at__lte=before_dt) + ) + crawl_id = crawl_id.strip() + if crawl_id: + matching_crawl_pks = list(filter_queryset_by_uuid_substring(Crawl.objects.all(), crawl_id).values_list("pk", flat=True)[:100]) + queryset = queryset.filter(crawl_id__in=matching_crawl_pks) + + created_by = created_by.strip() + if created_by: + created_by_query = Q(crawl__created_by__username__iexact=created_by) + user_model = get_user_model() + try: + prepared_pk = user_model._meta.pk.get_prep_value(created_by) + except (TypeError, ValueError, ValidationError): + prepared_pk = None + if prepared_pk not in (None, ""): + created_by_query |= Q(crawl__created_by_id=prepared_pk) + queryset = queryset.filter(created_by_query) + + return queryset.order_by("-bookmarked_at", "-created_at", "-id")[:limit] + + +def _snapshots_rss_response( + request: HttpRequest, + *, + snapshots, 
# Query-string filters accepted by the snapshot list endpoint. Each annotated
# field is bound to the ORM lookup(s) named in its FilterLookup.
class SnapshotFilterSchema(FilterSchema):
    # `id` tries several lookups: id prefix, id suffix, or timestamp prefix.
    id: Annotated[str | None, FilterLookup(["id__istartswith", "id__iendswith", "timestamp__startswith"])] = None
    # Ownership is resolved through the snapshot's crawl.
    created_by_id: Annotated[str | None, FilterLookup("crawl__created_by_id")] = None
    created_by_username: Annotated[str | None, FilterLookup("crawl__created_by__username__icontains")] = None
    created_at__gte: Annotated[datetime | None, FilterLookup("created_at__gte")] = None
    created_at__lt: Annotated[datetime | None, FilterLookup("created_at__lt")] = None
    created_at: Annotated[datetime | None, FilterLookup("created_at")] = None
    modified_at: Annotated[datetime | None, FilterLookup("modified_at")] = None
    modified_at__gte: Annotated[datetime | None, FilterLookup("modified_at__gte")] = None
    modified_at__lt: Annotated[datetime | None, FilterLookup("modified_at__lt")] = None
    # search / search_mode / status carry no FilterLookup; their filter_*
    # methods below return an empty Q, and get_snapshots in this file applies
    # them itself (filter_snapshots_by_status / apply_snapshot_search).
    search: str | None = None
    search_mode: str | None = None
    status: str | None = None
    url: Annotated[str | None, FilterLookup("url")] = None
    tag: Annotated[str | None, FilterLookup("tags__name")] = None
    title: Annotated[str | None, FilterLookup("title__icontains")] = None
    timestamp: Annotated[str | None, FilterLookup("timestamp__startswith")] = None
    bookmarked_at__gte: Annotated[datetime | None, FilterLookup("bookmarked_at__gte")] = None
    bookmarked_at__lt: Annotated[datetime | None, FilterLookup("bookmarked_at__lt")] = None

    # Returns an empty Q, so `search` adds no condition at this layer.
    def filter_search(self, value: str | None) -> Q:
        return Q()

    # Returns an empty Q, so `search_mode` adds no condition at this layer.
    def filter_search_mode(self, value: str | None) -> Q:
        return Q()

    # Returns an empty Q, so `status` adds no condition at this layer.
    def filter_status(self, value: str | None) -> Q:
        return Q()
@router.post("/snapshots", response=SnapshotSchema, url_name="create_snapshot")
def create_snapshot(request: HttpRequest, data: SnapshotCreateSchema):
    # Create (or fetch) the Snapshot for a URL, optionally inside an existing
    # crawl. Raises HttpError 400 for an invalid status, an empty or over-long
    # URL, or a depth outside 0..4. (Kept as comments rather than a docstring
    # so the generated API description is unchanged.)
    tags = normalize_tag_list(data.tags)
    try:
        status = normalize_snapshot_status(data.status)
    except ValueError as err:
        raise HttpError(400, str(err)) from err

    # Normalise the URL once and use that value everywhere: previously the
    # stripped form was validated while the unstripped form was stored and
    # used for the get_or_create lookup.
    url = data.url.strip()
    if not url:
        raise HttpError(400, "URL is required")
    try:
        validate_url_length(url)
    except ValueError as err:
        raise HttpError(400, str(err)) from err
    if data.depth not in (0, 1, 2, 3, 4):
        raise HttpError(400, "depth must be between 0 and 4")

    if data.crawl_id:
        crawl = get_crawl_by_ref(data.crawl_id)
        crawl_tags = normalize_tag_list(crawl.tags_str.split(","))
        # Explicit request tags win; otherwise inherit the crawl's tags.
        tags = tags or crawl_tags
    else:
        crawl = Crawl.objects.create(
            urls=url,
            max_depth=max(data.depth, 0),
            tags_str=",".join(tags),
            status=Crawl.StatusChoices.QUEUED,
            retry_at=timezone.now(),
            created_by=request.user if isinstance(request.user, User) else None,
        )

    snapshot_defaults = {
        "depth": data.depth,
        "title": data.title,
        "timestamp": str(timezone.now().timestamp()),
        "status": status or Snapshot.StatusChoices.QUEUED,
        "retry_at": timezone.now(),
    }
    snapshot, _ = Snapshot.objects.get_or_create(
        url=url,
        crawl=crawl,
        defaults=snapshot_defaults,
    )

    # For a pre-existing snapshot the defaults were ignored, so apply any
    # explicitly requested title/status changes here.
    update_fields: list[str] = []
    if data.title is not None and snapshot.title != data.title:
        snapshot.title = data.title
        update_fields.append("title")
    if status is not None and snapshot.status != status:
        snapshot.status = status
        update_fields.append("status")
    if update_fields:
        update_fields.append("modified_at")
        snapshot.save(update_fields=update_fields)

    if tags:
        snapshot.save_tags(tags)

    # Best effort: a symlink failure must not fail the API request.
    try:
        snapshot.ensure_crawl_symlink()
    except Exception:
        pass

    setattr(request, "with_archiveresults", False)
    return snapshot
@router.delete("/snapshot/{snapshot_id}", response=SnapshotDeleteResponseSchema, url_name="delete_snapshot")
def delete_snapshot(request: HttpRequest, snapshot_id: str):
    # Resolve the snapshot through the same lookup as the GET endpoint, record
    # its identifiers while the row still exists, then delete it and report
    # how many rows the delete removed.
    target = get_snapshot(request, snapshot_id, with_archiveresults=False)
    summary = {
        "success": True,
        "snapshot_id": str(target.id),
        "crawl_id": str(target.crawl.pk),
    }
    removed_total, _per_model = target.delete()
    summary["deleted_count"] = removed_total
    return summary
@router.get(
    "/any/{id}",
    response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, CrawlSchema],
    url_name="get_any",
    summary="Get any object by its ID",
)
def get_any(request: HttpRequest, id: str):
    """Get any object by its ID (e.g. snapshot, archiveresult, tag, crawl, etc.)."""
    request.with_snapshots = False
    request.with_archiveresults = False

    def _lookup_crawl(req: HttpRequest, ref: str):
        # Imported lazily, and only once the other lookups have failed.
        from archivebox.api.v1_crawls import get_crawl

        return get_crawl(req, ref)

    # Try each object type in turn; the first lookup that yields a model
    # instance wins and the client is redirected to its canonical API URL.
    for lookup in (get_snapshot, get_archiveresult, get_tag, _lookup_crawl):
        try:
            found = lookup(request, id)
            if not isinstance(found, Model):
                continue
            return redirect(
                f"/api/v1/{found._meta.app_label}/{found._meta.model_name}/{found.pk}?{request.META['QUERY_STRING']}",
            )
        except Exception:
            # Any failure just means "not this type"; move on to the next one.
            continue

    raise HttpError(404, "Object with given ID not found")
def _get_snapshot_for_tag_edit(snapshot_ref: str) -> Snapshot:
    """Resolve a snapshot reference from the tag editor to a Snapshot row.

    Resolution order: full UUID (with or without dashes), exact timestamp
    (references of 14+ characters), then id/timestamp prefix. Ambiguous
    matches resolve to the first row returned. Raises HttpError 400 for an
    empty reference and 404 when nothing matches.
    """
    ref = str(snapshot_ref or "").strip().lower()
    if not ref:
        raise HttpError(400, "Snapshot id is required")

    candidates = Snapshot.objects.only("id")

    # 1) Full UUID: 32 hex digits once dashes are removed.
    hex_digits = ref.replace("-", "")
    if len(hex_digits) == 32 and all(char in "0123456789abcdef-" for char in ref):
        try:
            return candidates.get(pk=hex_digits)
        except (Snapshot.DoesNotExist, ValueError):
            pass

    # 2) Exact timestamp match.
    if len(ref) >= 14:
        try:
            return candidates.get(timestamp=ref)
        except Snapshot.DoesNotExist:
            pass
        except Snapshot.MultipleObjectsReturned:
            first_exact = candidates.filter(timestamp=ref).first()
            if first_exact is not None:
                return first_exact

    # 3) Prefix match on either id or timestamp.
    try:
        return candidates.get(Q(id__startswith=ref) | Q(timestamp__startswith=ref))
    except Snapshot.DoesNotExist:
        raise HttpError(404, "Snapshot not found") from None
    except Snapshot.MultipleObjectsReturned:
        first_prefixed = candidates.filter(Q(id__startswith=ref) | Q(timestamp__startswith=ref)).first()
        if first_prefixed is None:
            raise HttpError(404, "Snapshot not found")
        return first_prefixed
@router.delete("/tag/{tag_id}", response=TagDeleteResponseSchema, url_name="delete_tag")
def delete_tag(request: HttpRequest, tag_id: int):
    """Delete the referenced tag and report how many rows were removed.

    Raises:
        HttpError: 404 when no tag matches ``tag_id``.
    """
    try:
        doomed_tag = get_tag_by_ref(tag_id)
    except Tag.DoesNotExist as err:
        raise HttpError(404, "Tag not found") from err

    # delete_tag_record returns a pair; only its first element is reported.
    rows_removed, _ = delete_tag_record(doomed_tag)

    # The id from the request path is echoed back, not read from the deleted row.
    return dict(
        success=True,
        tag_id=int(tag_id),
        deleted_count=rows_removed,
    )
@router.post("/tags/remove-from-snapshot/", response=TagSnapshotResponseSchema, url_name="tags_remove_from_snapshot")
def tags_remove_from_snapshot(request: HttpRequest, data: TagSnapshotRequestSchema):
    """Remove a tag from a snapshot.

    The tag is looked up by ``tag_id`` when one is given, otherwise by a
    case-insensitive match on the stripped ``tag_name``.

    Raises:
        HttpError: 400 when neither ``tag_id`` nor ``tag_name`` is supplied,
            404 when the tag cannot be found. Snapshot lookup errors from
            ``_get_snapshot_for_tag_edit`` propagate unchanged.
    """
    snapshot = _get_snapshot_for_tag_edit(data.snapshot_id)

    # Get the tag
    if data.tag_id:
        try:
            tag = Tag.objects.get(pk=data.tag_id)
        except Tag.DoesNotExist as err:
            # Chain the cause, like the other tag endpoints in this module.
            raise HttpError(404, "Tag not found") from err
    elif data.tag_name:
        wanted_name = data.tag_name.strip()
        # A case-insensitive match can hit several tags that differ only by
        # case; .get() would then raise MultipleObjectsReturned and surface as
        # a server error. Prefer the exact-case tag, else the first match.
        matches = Tag.objects.filter(name__iexact=wanted_name)
        tag = matches.filter(name=wanted_name).first()
        if tag is None:
            tag = matches.first()
        if tag is None:
            raise HttpError(404, "Tag not found")
    else:
        raise HttpError(400, "Either tag_name or tag_id is required")

    # Remove the tag from the snapshot
    snapshot.tags.remove(tag.pk)

    return {
        "success": True,
        "tag_id": tag.pk,
        "tag_name": tag.name,
    }
resolve_config(obj): + # Redact credential values so REST responses can never leak the raw + # token/secret/api-key that the operator stored in Crawl.config. + from archivebox.config.common import redact_sensitive_config + + return redact_sensitive_config(obj.config) + + @staticmethod + def resolve_snapshots(obj, context): + if bool(context["request"].__dict__.get("with_snapshots", False)): + return obj.snapshot_set.all().distinct() + return Snapshot.objects.none() + + +class CrawlUpdateSchema(Schema): + action: str | None = None + status: str | None = None + retry_at: datetime | None = None + tags: list[str] | None = None + tags_str: str | None = None + + +class CrawlCreateSchema(Schema): + urls: list[str] + max_depth: int = 0 + tags: list[str] | None = None + tags_str: str = "" + label: str = "" + notes: str = "" + config: dict = {} + + +class CrawlDeleteResponseSchema(Schema): + success: bool + crawl_id: str + deleted_count: int + deleted_snapshots: int + + +def normalize_tag_list(tags: list[str] | None = None, tags_str: str = "") -> list[str]: + if tags is not None: + return [tag.strip() for tag in tags if tag and tag.strip()] + return [tag.strip() for tag in tags_str.split(",") if tag.strip()] + + +@router.get("/crawls", response=list[CrawlSchema], url_name="get_crawls") +def get_crawls(request: HttpRequest): + return Crawl.objects.all().distinct() + + +@router.post("/crawls", response=CrawlSchema, url_name="create_crawl") +def create_crawl(request: HttpRequest, data: CrawlCreateSchema): + urls = [url.strip() for url in data.urls if url and url.strip()] + if not urls: + raise HttpError(400, "At least one URL is required") + if data.max_depth not in (0, 1, 2, 3, 4): + raise HttpError(400, "max_depth must be between 0 and 4") + + tags = normalize_tag_list(data.tags, data.tags_str) + config = dict(data.config or {}) + config.setdefault("PERMISSIONS", str(get_config().PERMISSIONS)) + crawl = Crawl.objects.create( + urls="\n".join(urls), + max_depth=data.max_depth, 
def crawl_file(request: HttpRequest, crawl_id: str, path: str):
    """Serve one file from a crawl's output directory.

    Args:
        request: Incoming request; used to identify the caller.
        crawl_id: Crawl reference passed to ``get_crawl_by_ref``.
        path: File path relative to the crawl's output directory.

    Returns:
        A ``FileResponse`` for the file, with caching disabled and
        ``X-Content-Type-Options: nosniff`` set.

    Raises:
        HttpError: 404 when the crawl does not exist, when the caller may not
            view it, or when the path is not a regular file inside the
            crawl's output directory.
    """
    # Try to resolve the crawl first; if it doesn't exist, return 404.
    try:
        crawl = get_crawl_by_ref(crawl_id)
    except Crawl.DoesNotExist:
        raise HttpError(404, "Crawl not found")

    user = authenticated_user_from_request(request)

    # Gate access using the same model as SnapshotView/can_view_snapshot:
    # admins always pass; owners can see their own crawls; otherwise the crawl
    # must be PUBLIC or UNLISTED. Don't disclose existence of private crawls.
    if not is_admin_user(request):
        permissions = normalize_permissions(crawl.permissions)
        is_owner = bool(user and crawl.created_by_id == user.id)
        if not is_owner and permissions not in {PERMISSIONS_PUBLIC, PERMISSIONS_UNLISTED}:
            # Same status and message as a missing crawl, so the two cases look alike.
            raise HttpError(404, "Crawl not found")

    # Resolve both paths so ".." segments and symlinks are expanded before the
    # containment check below.
    crawl_root = Path(crawl.output_dir).resolve()
    file_path = (crawl_root / path).resolve()
    # Reject anything that is not a regular file strictly below the crawl root.
    if not file_path.is_file() or crawl_root not in file_path.parents:
        raise HttpError(404, "Crawl file not found")

    response = FileResponse(file_path.open("rb"))
    # Disable caching and content-type sniffing on the served file.
    response["Cache-Control"] = "no-store, no-cache, max-age=0, must-revalidate"
    response["Pragma"] = "no-cache"
    response["Expires"] = "0"
    response["X-Content-Type-Options"] = "nosniff"
    return response
@router.delete("/crawl/{crawl_id}", response=CrawlDeleteResponseSchema, url_name="delete_crawl")
def delete_crawl(request: HttpRequest, crawl_id: str):
    """Delete a crawl and report what was removed.

    The resolved id and the snapshot count are read before the delete call,
    so the response describes the crawl as it was just before removal.
    """
    target = get_crawl_by_ref(crawl_id)

    resolved_id = str(target.id)
    snapshots_before = target.snapshot_set.count()

    # delete() returns a pair; only its first element is reported.
    rows_removed, _ = target.delete()

    return dict(
        success=True,
        crawl_id=resolved_id,
        deleted_count=rows_removed,
        deleted_snapshots=snapshots_before,
    )
class BinarySchema(Schema):
    """Schema for Binary model."""

    # Constant type tag included with every serialized record.
    TYPE: str = "machine.Binary"
    id: UUID
    created_at: datetime
    modified_at: datetime
    machine_id: UUID
    # Not read directly from the row: filled in by resolve_machine_hostname.
    machine_hostname: str
    name: str
    binproviders: str
    binprovider: str
    abspath: str
    version: str
    sha256: str
    status: str
    # Filled in by resolve_is_valid.
    is_valid: bool
    num_uses_succeeded: int
    num_uses_failed: int

    @staticmethod
    def resolve_machine_hostname(obj) -> str:
        """Return the hostname of the related machine (follows obj.machine)."""
        return obj.machine.hostname

    @staticmethod
    def resolve_is_valid(obj) -> bool:
        """Return the object's own is_valid attribute."""
        return obj.is_valid
@router.get("/machine/{machine_id}", response=MachineSchema, url_name="get_machine")
def get_machine(request: HttpRequest, machine_id: str):
    """Get a specific machine by ID."""
    from archivebox.machine.models import Machine
    from django.db.models import Q

    # The path value may be either an id prefix or a hostname (case-insensitive).
    matches_id_prefix = Q(id__startswith=machine_id)
    matches_hostname = Q(hostname__iexact=machine_id)
    return Machine.objects.get(matches_id_prefix | matches_hostname)
@router.get("/binary/by-name/{name}", response=list[BinarySchema], url_name="get_binaries_by_name")
def get_binaries_by_name(request: HttpRequest, name: str):
    """Get all binaries with the given name."""
    from archivebox.machine.models import Binary

    # Name comparison is case-insensitive; the machine relation is joined
    # because the response schema reads each binary's machine hostname.
    matching = Binary.objects.select_related("machine").filter(name__iexact=name)
    return [*matching]
def browser_settings_to_config(extension_persona_id: str, settings: PersonaBrowserSettingsSchema) -> dict[str, Any]:
    """Translate browser-extension settings into Persona config keys.

    Args:
        extension_persona_id: Identifier of the persona inside the extension;
            stored under ``BROWSER_EXTENSION_PERSONA_ID``.
        settings: Browser settings sent by the extension. Empty strings and
            ``None`` values are treated as "not provided" and add no keys.

    Returns:
        A new dict that always contains the extension persona id and a UTC
        sync timestamp (ISO 8601 with a trailing ``Z``), plus one or more
        keys for every setting that was provided.
    """
    # Local import: this module only imports the ``datetime`` class.
    from datetime import timezone as dt_timezone

    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # time and drop tzinfo so the stored text keeps its "...Z" form instead
    # of gaining a "+00:00" suffix.
    synced_at = datetime.now(dt_timezone.utc).replace(tzinfo=None).isoformat() + "Z"

    config: dict[str, Any] = {
        "BROWSER_EXTENSION_PERSONA_ID": extension_persona_id,
        "BROWSER_EXTENSION_SYNCED_AT": synced_at,
    }

    if settings.user_agent:
        # One browser user agent is copied to all four user-agent keys.
        config.update(
            {
                "USER_AGENT": settings.user_agent,
                "CHROME_USER_AGENT": settings.user_agent,
                "WGET_USER_AGENT": settings.user_agent,
                "CURL_USER_AGENT": settings.user_agent,
            },
        )
    if settings.viewport_size:
        config.update(
            {
                "RESOLUTION": settings.viewport_size,
                "CHROME_RESOLUTION": settings.viewport_size,
            },
        )
    # Compared against None explicitly, so a scale factor of 0 is still stored.
    if settings.viewport_device_scale_factor is not None:
        config["BROWSER_DEVICE_SCALE_FACTOR"] = settings.viewport_device_scale_factor
    if settings.language:
        config["BROWSER_LANGUAGE"] = settings.language
    if settings.timezone:
        config["BROWSER_TIMEZONE"] = settings.timezone
    if settings.geolocation:
        config["BROWSER_GEOLOCATION"] = settings.geolocation

    return config
@router.post("/sync", response=PersonaSyncResponseSchema, url_name="sync_persona")
def sync_persona(request: HttpRequest, payload: PersonaSyncSchema):
    """
    Create or update a Persona from a browser extension profile export.

    The extension sends browser settings plus portable auth artifacts. The server
    keeps browser override settings in Persona.config and writes cookies.txt /
    auth.json into the persona directory for extractors to consume.

    Raises:
        HttpError: 400 when the stripped persona name fails validation.
    """
    # Local import: this module imports Router and Schema from ninja but not HttpError.
    from ninja.errors import HttpError

    name = payload.name.strip()
    is_valid, error_message = validate_persona_name(name)
    if not is_valid:
        # Report bad input as a 400, as the tag endpoints do, instead of
        # letting a bare ValueError escape the view.
        raise HttpError(400, str(error_message))

    persona = find_persona(payload.extension_persona_id, name)
    created = persona is None
    if persona is None:
        persona = Persona(name=name)
        if request.user.is_authenticated:
            persona.created_by = request.user

    # Merge rather than replace, so config keys not sent by the extension survive.
    persona.config = {
        **(persona.config or {}),
        **browser_settings_to_config(payload.extension_persona_id, payload.settings),
    }
    persona.save()
    persona.ensure_dirs()

    cookies_written = False
    if payload.cookies_txt.strip():
        # Explicit encoding so the file does not depend on the server locale.
        (persona.path / "cookies.txt").write_text(payload.cookies_txt, encoding="utf-8")
        cookies_written = True

    auth_written = False
    if payload.auth_json:
        (persona.path / "auth.json").write_text(
            json.dumps(payload.auth_json, indent=2, sort_keys=True) + "\n",
            encoding="utf-8",
        )
        auth_written = True

    return {
        "success": True,
        "created": created,
        "persona": persona,
        "cookies_file_written": cookies_written,
        "auth_file_written": auth_written,
    }
def transaction_on_commit_task_handler(hook: Callable[..., None], **kwargs: Any) -> None:
    """Register the webhook to be sent via ``transaction.on_commit``.

    Both registering and sending are best-effort: any exception is logged
    as a warning with its traceback and is not re-raised.
    """

    def _deliver() -> None:
        # Runs later, when the on_commit callback is invoked.
        try:
            sync_task_handler(hook, **kwargs)
        except Exception:
            logger.warning("Outbound webhook failed after transaction commit.", exc_info=True)

    try:
        transaction.on_commit(_deliver)
    except Exception:
        logger.warning("Could not schedule outbound webhook after transaction commit.", exc_info=True)
def print_help():
    """Print the command-line help text to stdout.

    The text covers the documentation link, how to open the generated index,
    and four example invocations (stdin, bookmarks file, RSS feed URL, and
    a timestamp).
    """
    print('ArchiveBox: The self-hosted internet archive.\n')
    print("Documentation:")
    print(" https://github.com/pirate/ArchiveBox/wiki\n")
    print("UI Usage:")
    print(" Open output/index.html to view your archive.\n")
    print("CLI Usage:")
    print(" echo 'https://example.com' | ./archive\n")
    print(" ./archive ~/Downloads/bookmarks_export.html\n")
    print(" ./archive https://example.com/feed.rss\n")
    print(" ./archive 15109948213.123\n")
def update_archive_data(import_path=None, resume=None):
    """Load the link index, archive the selected links, then rewrite the index.

    ``import_path`` is forwarded to ``load_links_index`` and ``resume`` to
    ``links_after_timestamp``. Ctrl-C logs a paused state and exits with
    status 0; any other error prints a blank line and propagates.
    """
    # Load the index (merging anything from import_path) and write it back
    # before any archiving starts.
    all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path=import_path)
    write_links_index(out_dir=OUTPUT_DIR, links=all_links)

    # Pick which links to run the archive methods on.
    if ONLY_NEW:
        pending = new_links
    else:
        pending = all_links
    total = len(pending)
    log_archiving_started(total, resume)

    # Pre-set so the interrupt handler works even if no link was reached.
    idx, link = 0, 0
    try:
        for idx, link in enumerate(links_after_timestamp(pending, resume)):
            archive_link(os.path.join(ARCHIVE_DIR, link['timestamp']), link)

    except KeyboardInterrupt:
        last_timestamp = link['timestamp'] if link else link
        log_archiving_paused(total, idx, last_timestamp)
        raise SystemExit(0)

    except BaseException:
        print()
        raise

    log_archiving_finished(total)

    # Reload the index from disk and write it out again, marked finished.
    all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
    write_links_index(out_dir=OUTPUT_DIR, links=all_links, finished=True)
def archive_link(link_dir, link):
    """Run every enabled archive method for one link and record the results.

    Creates ``link_dir`` if it is missing, reloads the link via
    ``load_json_link_index``, runs each method whose ``should_run`` check
    passes, appends each result to ``link['history'][method_name]``, and
    then writes the per-link index and patches the main index.

    Returns the updated link. Any exception is printed and re-raised.
    """

    # (history key, predicate deciding whether to run, function doing the work);
    # methods run in the order listed here.
    ARCHIVE_METHODS = (
        ('title', should_fetch_title, fetch_title),
        ('favicon', should_fetch_favicon, fetch_favicon),
        ('wget', should_fetch_wget, fetch_wget),
        ('pdf', should_fetch_pdf, fetch_pdf),
        ('screenshot', should_fetch_screenshot, fetch_screenshot),
        ('dom', should_fetch_dom, fetch_dom),
        ('git', should_fetch_git, fetch_git),
        ('media', should_fetch_media, fetch_media),
        ('archive_org', should_fetch_archive_dot_org, archive_dot_org),
    )

    try:
        is_new = not os.path.exists(link_dir)
        if is_new:
            os.makedirs(link_dir)

        link = load_json_link_index(link_dir, link)
        log_link_archiving_started(link_dir, link, is_new)
        # Tally of outcomes, keyed by each result's 'status' value plus 'skipped'.
        stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}

        for method_name, should_run, method_function in ARCHIVE_METHODS:
            # Make sure the history list exists before anything is appended.
            if method_name not in link['history']:
                link['history'][method_name] = []

            if should_run(link_dir, link):
                log_archive_method_started(method_name)

                result = method_function(link_dir, link)
                link['history'][method_name].append(result)

                stats[result['status']] += 1
                log_archive_method_finished(result)
            else:
                stats['skipped'] += 1

        # print(' ', stats)

        write_link_index(link_dir, link)
        patch_links_index(link)
        log_link_archiving_finished(link_dir, link, is_new, stats)

    except Exception as err:
        print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
        raise

    return link
'failed' - output = err - finally: - timer.end() - - return { - 'cmd': cmd, - 'pwd': link_dir, - 'output': output, - 'status': status, - **timer.stats, - } - -def should_fetch_wget(link_dir, link): - output_path = wget_output_path(link) - if output_path and os.path.exists(os.path.join(link_dir, output_path)): - return False - - return FETCH_WGET - - -def fetch_wget(link_dir, link, timeout=TIMEOUT): - """download full site using wget""" - - if FETCH_WARC: - warc_dir = os.path.join(link_dir, 'warc') - os.makedirs(warc_dir, exist_ok=True) - warc_path = os.path.join('warc', str(int(datetime.now().timestamp()))) - - # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html - output = None - cmd = [ - WGET_BINARY, - # '--server-response', # print headers for better error parsing - '--no-verbose', - '--adjust-extension', - '--convert-links', - '--force-directories', - '--backup-converted', - '--span-hosts', - '--no-parent', - '-e', 'robots=off', - '--restrict-file-names=windows', - '--timeout={}'.format(timeout), - *(('--compression=auto',) if WGET_AUTO_COMPRESSION else ()), - *(() if FETCH_WARC else ('--timestamping',)), - *(('--warc-file={}'.format(warc_path),) if FETCH_WARC else ()), - *(('--page-requisites',) if FETCH_WGET_REQUISITES else ()), - *(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()), - *(('--load-cookies', COOKIES_FILE) if COOKIES_FILE else ()), - *((() if CHECK_SSL_VALIDITY else ('--no-check-certificate', '--no-hsts'))), - link['url'], - ] - status = 'succeeded' - timer = TimedProgress(timeout, prefix=' ') - try: - result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout) - output = wget_output_path(link) - - # parse out number of files downloaded from last line of stderr: - # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)" - output_tail = [ - line.strip() - for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:] - if line.strip() - ] - files_downloaded = ( - 
int(output_tail[-1].strip().split(' ', 2)[1] or 0) - if 'Downloaded:' in output_tail[-1] - else 0 - ) - - # Check for common failure cases - if result.returncode > 0 and files_downloaded < 1: - hints = ( - 'Got wget response code: {}.'.format(result.returncode), - *output_tail, - ) - if b'403: Forbidden' in result.stderr: - raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints) - if b'404: Not Found' in result.stderr: - raise ArchiveError('404 Not Found', hints) - if b'ERROR 500: Internal Server Error' in result.stderr: - raise ArchiveError('500 Internal Server Error', hints) - raise ArchiveError('Got an error from the server', hints) - except Exception as err: - status = 'failed' - output = err - finally: - timer.end() - - return { - 'cmd': cmd, - 'pwd': link_dir, - 'output': output, - 'status': status, - **timer.stats, - } - -def should_fetch_pdf(link_dir, link): - if is_static_file(link['url']): - return False - - if os.path.exists(os.path.join(link_dir, 'output.pdf')): - return False - - return FETCH_PDF - - -def fetch_pdf(link_dir, link, timeout=TIMEOUT): - """print PDF of site to file using chrome --headless""" - - output = 'output.pdf' - cmd = [ - *chrome_args(TIMEOUT=timeout), - '--print-to-pdf', - link['url'], - ] - status = 'succeeded' - timer = TimedProgress(timeout, prefix=' ') - try: - result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout) - - if result.returncode: - hints = (result.stderr or result.stdout).decode() - raise ArchiveError('Failed to print PDF', hints) - - chmod_file('output.pdf', cwd=link_dir) - except Exception as err: - status = 'failed' - output = err - finally: - timer.end() - - return { - 'cmd': cmd, - 'pwd': link_dir, - 'output': output, - 'status': status, - **timer.stats, - } - -def should_fetch_screenshot(link_dir, link): - if is_static_file(link['url']): - return False - - if os.path.exists(os.path.join(link_dir, 'screenshot.png')): - return False - - return FETCH_SCREENSHOT - -def 
fetch_screenshot(link_dir, link, timeout=TIMEOUT): - """take screenshot of site using chrome --headless""" - - output = 'screenshot.png' - cmd = [ - *chrome_args(TIMEOUT=timeout), - '--screenshot', - link['url'], - ] - status = 'succeeded' - timer = TimedProgress(timeout, prefix=' ') - try: - result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout) - - if result.returncode: - hints = (result.stderr or result.stdout).decode() - raise ArchiveError('Failed to take screenshot', hints) - - chmod_file(output, cwd=link_dir) - except Exception as err: - status = 'failed' - output = err - finally: - timer.end() - - return { - 'cmd': cmd, - 'pwd': link_dir, - 'output': output, - 'status': status, - **timer.stats, - } - -def should_fetch_dom(link_dir, link): - if is_static_file(link['url']): - return False - - if os.path.exists(os.path.join(link_dir, 'output.html')): - return False - - return FETCH_DOM - -def fetch_dom(link_dir, link, timeout=TIMEOUT): - """print HTML of site to file using chrome --dump-html""" - - output = 'output.html' - output_path = os.path.join(link_dir, output) - cmd = [ - *chrome_args(TIMEOUT=timeout), - '--dump-dom', - link['url'] - ] - status = 'succeeded' - timer = TimedProgress(timeout, prefix=' ') - try: - with open(output_path, 'w+') as f: - result = run(cmd, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout) - - if result.returncode: - hints = result.stderr.decode() - raise ArchiveError('Failed to fetch DOM', hints) - - chmod_file(output, cwd=link_dir) - except Exception as err: - status = 'failed' - output = err - finally: - timer.end() - - return { - 'cmd': cmd, - 'pwd': link_dir, - 'output': output, - 'status': status, - **timer.stats, - } - -def should_fetch_git(link_dir, link): - if is_static_file(link['url']): - return False - - if os.path.exists(os.path.join(link_dir, 'git')): - return False - - is_clonable_url = ( - (domain(link['url']) in GIT_DOMAINS) - or (extension(link['url']) == 'git') - ) - if not 
is_clonable_url: - return False - - return FETCH_GIT - - -def fetch_git(link_dir, link, timeout=TIMEOUT): - """download full site using git""" - - output = 'git' - output_path = os.path.join(link_dir, 'git') - os.makedirs(output_path, exist_ok=True) - cmd = [ - GIT_BINARY, - 'clone', - '--mirror', - '--recursive', - *(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')), - without_query(without_fragment(link['url'])), - ] - status = 'succeeded' - timer = TimedProgress(timeout, prefix=' ') - try: - result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1) - - if result.returncode == 128: - # ignore failed re-download when the folder already exists - pass - elif result.returncode > 0: - hints = 'Got git response code: {}.'.format(result.returncode) - raise ArchiveError('Failed git download', hints) - - except Exception as err: - status = 'failed' - output = err - finally: - timer.end() - - return { - 'cmd': cmd, - 'pwd': link_dir, - 'output': output, - 'status': status, - **timer.stats, - } - - -def should_fetch_media(link_dir, link): - if is_static_file(link['url']): - return False - - if os.path.exists(os.path.join(link_dir, 'media')): - return False - - return FETCH_MEDIA - -def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT): - """Download playlists or individual video, audio, and subtitles using youtube-dl""" - - output = 'media' - output_path = os.path.join(link_dir, 'media') - os.makedirs(output_path, exist_ok=True) - cmd = [ - YOUTUBEDL_BINARY, - '--write-description', - '--write-info-json', - '--write-annotations', - '--yes-playlist', - '--write-thumbnail', - '--no-call-home', - '--no-check-certificate', - '--user-agent', - '--all-subs', - '--extract-audio', - '--keep-video', - '--ignore-errors', - '--geo-bypass', - '--audio-format', 'mp3', - '--audio-quality', '320K', - '--embed-thumbnail', - '--add-metadata', - *(() if CHECK_SSL_VALIDITY else ('--no-check-certificate',)), - link['url'], - ] - status = 'succeeded' - timer 
= TimedProgress(timeout, prefix=' ') - try: - result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1) - chmod_file(output, cwd=link_dir) - if result.returncode: - if (b'ERROR: Unsupported URL' in result.stderr - or b'HTTP Error 404' in result.stderr - or b'HTTP Error 403' in result.stderr - or b'URL could be a direct video link' in result.stderr - or b'Unable to extract container ID' in result.stderr): - # These happen too frequently on non-media pages to warrant printing to console - pass - else: - hints = ( - 'Got youtube-dl response code: {}.'.format(result.returncode), - *result.stderr.decode().split('\n'), - ) - raise ArchiveError('Failed to download media', hints) - except Exception as err: - status = 'failed' - output = err - finally: - timer.end() - - return { - 'cmd': cmd, - 'pwd': link_dir, - 'output': output, - 'status': status, - **timer.stats, - } - - -def should_fetch_archive_dot_org(link_dir, link): - if is_static_file(link['url']): - return False - - if os.path.exists(os.path.join(link_dir, 'archive.org.txt')): - # if open(path, 'r').read().strip() != 'None': - return False - - return SUBMIT_ARCHIVE_DOT_ORG - -def archive_dot_org(link_dir, link, timeout=TIMEOUT): - """submit site to archive.org for archiving via their service, save returned archive url""" - - output = 'archive.org.txt' - archive_org_url = None - submit_url = 'https://web.archive.org/save/{}'.format(link['url']) - cmd = [ - CURL_BINARY, - '--location', - '--head', - '--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(GIT_SHA), # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from - '--max-time', str(timeout), - *(() if CHECK_SSL_VALIDITY else ('--insecure',)), - submit_url, - ] - status = 'succeeded' - timer = TimedProgress(timeout, prefix=' ') - try: - result = run(cmd, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout) - content_location, errors = 
parse_archive_dot_org_response(result.stdout) - if content_location: - archive_org_url = 'https://web.archive.org{}'.format(content_location[0]) - elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]: - archive_org_url = None - # raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link['url']))) - elif errors: - raise ArchiveError(', '.join(errors)) - else: - raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.') - except Exception as err: - status = 'failed' - output = err - finally: - timer.end() - - if not isinstance(output, Exception): - # instead of writing None when archive.org rejects the url write the - # url to resubmit it to archive.org. This is so when the user visits - # the URL in person, it will attempt to re-archive it, and it'll show the - # nicer error message explaining why the url was rejected if it fails. - archive_org_url = archive_org_url or submit_url - with open(os.path.join(link_dir, output), 'w', encoding='utf-8') as f: - f.write(archive_org_url) - chmod_file('archive.org.txt', cwd=link_dir) - output = archive_org_url - - return { - 'cmd': cmd, - 'pwd': link_dir, - 'output': output, - 'status': status, - **timer.stats, - } - -def parse_archive_dot_org_response(response): - # Parse archive.org response headers - headers = defaultdict(list) - - # lowercase all the header names and store in dict - for header in response.splitlines(): - if b':' not in header or not header.strip(): - continue - name, val = header.decode().split(':', 1) - headers[name.lower().strip()].append(val.strip()) - - # Get successful archive url in "content-location" header or any errors - content_location = headers['content-location'] - errors = headers['x-archive-wayback-runtime-error'] - return content_location, errors diff --git a/archivebox/base_models/__init__.py b/archivebox/base_models/__init__.py new file mode 100644 index 0000000000..7c4b68536c --- /dev/null +++ 
b/archivebox/base_models/__init__.py @@ -0,0 +1 @@ +__package__ = "archivebox.base_models" diff --git a/archivebox/base_models/admin.py b/archivebox/base_models/admin.py new file mode 100644 index 0000000000..887dae39b5 --- /dev/null +++ b/archivebox/base_models/admin.py @@ -0,0 +1,891 @@ +"""Base admin classes for models using UUIDv7.""" + +__package__ = "archivebox.base_models" + +import json +import uuid +from collections.abc import Mapping +from typing import NotRequired, TypedDict, cast + +from django import forms +from django.contrib import admin +from django.db import models +from django.forms.renderers import BaseRenderer +from django.http import HttpRequest, QueryDict +from django.urls import path, register_converter +from django.utils.safestring import SafeString, mark_safe +from django_object_actions import DjangoObjectActions + + +class HexUUIDConverter: + """URL path converter that canonicalizes UUIDs to their 32-char hex form. + + Accepts both the hyphenated (``aaaaaaaa-bbbb-...``) and bare-hex + (``aaaaaaaabbbb...``) UUID strings on the way in (Django's UUIDField + parses either), but ``to_url`` always emits the bare-hex form. This is + what makes ``reverse("admin:app_model_change", args=[obj.pk])`` produce + ``/admin/app/model/06a1a8facb0d.../change/`` instead of the default + hyphenated rendering — admin links throughout the app reverse through + this converter once ``BaseModelAdmin.get_urls`` swaps in + ``<hexuuid:object_id>`` below. + """ + + regex = r"[0-9a-fA-F]{32}|[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}" + + def to_python(self, value: str) -> str: + # Strip hyphens but stay as a string — Django admin treats the + # captured object_id as a string and calls ``model._meta.pk.to_python`` + # itself, so we don't want to short-circuit that.
+ return value.replace("-", "") + + def to_url(self, value) -> str: + if isinstance(value, uuid.UUID): + return value.hex + return str(value).replace("-", "") + + +register_converter(HexUUIDConverter, "hexuuid") + + +class ConfigOption(TypedDict): + plugin: str + type: str | list[str] + default: object + description: str + enum: NotRequired[list[object]] + pattern: NotRequired[str] + minimum: NotRequired[int | float] + maximum: NotRequired[int | float] + + +class KeyValueWidget(forms.Widget): + """ + A widget that renders JSON dict as editable key-value input fields + with + and - buttons to add/remove rows. + Includes autocomplete for available config keys from the plugin system. + """ + + template_name = "" # We render manually + + class Media: + css = { + "all": [], + } + js = [] + + def _get_config_options(self) -> dict[str, ConfigOption]: + """Get available config options from plugins.""" + try: + from archivebox.config.common import config_field_metadata + + options: dict[str, ConfigOption] = {} + for key, metadata in config_field_metadata().items(): + option_type = metadata.get("type", "string") + option: ConfigOption = { + "plugin": str(metadata.get("plugin", "archivebox")), + "type": cast(str | list[str], option_type if isinstance(option_type, (str, list)) else str(option_type)), + "default": metadata.get("default", ""), + "description": str(metadata.get("description", "")), + } + schema = metadata.get("schema") + if isinstance(schema, Mapping): + for schema_key in ("enum", "pattern", "minimum", "maximum"): + if schema_key in schema: + option[schema_key] = schema[schema_key] + options[key] = option + return options + except Exception: + return {} + + def _parse_value(self, value: object) -> dict[str, object]: + # Parse JSON value to dict + if value is None: + return {} + if isinstance(value, str): + try: + parsed = json.loads(value) if value else {} + except json.JSONDecodeError: + return {} + return parsed if isinstance(parsed, dict) else {} + if 
isinstance(value, Mapping): + return {str(key): item for key, item in value.items()} + return {} + + def render( + self, + name: str, + value: object, + attrs: Mapping[str, str] | None = None, + renderer: BaseRenderer | None = None, + ) -> SafeString: + data = self._parse_value(value) + + widget_id = attrs.get("id", name) if attrs else name + config_options = self._get_config_options() + + # Build datalist options + datalist_options = "\n".join( + f'<option value="{self._escape(key)}">{self._escape(opt["description"][:60] or opt["type"])}</option>' + for key, opt in sorted(config_options.items()) + ) + + # Build config metadata as JSON for JS + config_meta_json = json.dumps(config_options) + + html = f''' + <div id="{widget_id}_container" class="key-value-editor" style="width: 100%; max-width: none;"> + <datalist id="{widget_id}_keys"> + {datalist_options} + </datalist> + <div id="{widget_id}_rows" class="key-value-rows"> + ''' + + # Render existing key-value pairs + for key, val in data.items(): + val_str = json.dumps(val) if not isinstance(val, str) else val + html += self._render_row(widget_id, key, val_str) + + # Always add one empty row for new entries + html += self._render_row(widget_id, "", "") + + html += f''' + </div> + <div style="display: flex; gap: 8px; align-items: center; margin-top: 8px;"> + <button type="button" onclick="addKeyValueRow_{widget_id}()" + style="padding: 4px 12px; cursor: pointer; background: #417690; color: white; border: none; border-radius: 4px;"> + + Add Row + </button> + </div> + <input type="hidden" name="{name}" id="{widget_id}" value=""> + <script> + (function() {{ + var configMeta_{widget_id} = {config_meta_json}; + var rowCounter_{widget_id} = 0; + + function stringifyValue_{widget_id}(value) {{ + return typeof value === 'string' ? 
value : JSON.stringify(value); + }} + + function getTypes_{widget_id}(meta) {{ + if (!meta || meta.type === undefined || meta.type === null) {{ + return []; + }} + return Array.isArray(meta.type) ? meta.type : [meta.type]; + }} + + function getMetaForKey_{widget_id}(key) {{ + if (!key) {{ + return null; + }} + + var explicitMeta = configMeta_{widget_id}[key]; + if (explicitMeta) {{ + return Object.assign({{ key: key }}, explicitMeta); + }} + + if (key.endsWith('_BINARY')) {{ + return {{ + key: key, + plugin: 'custom', + type: 'string', + default: '', + description: 'Path to binary executable', + }}; + }} + + if (isRegexConfigKey_{widget_id}(key)) {{ + return {{ + key: key, + plugin: 'custom', + type: 'string', + default: '', + description: 'Regex pattern list', + }}; + }} + + return null; + }} + + function describeMeta_{widget_id}(meta) {{ + if (!meta) {{ + return ''; + }} + + var details = ''; + if (Array.isArray(meta.enum) && meta.enum.length) {{ + details = 'Allowed: ' + meta.enum.map(stringifyValue_{widget_id}).join(', '); + }} else {{ + var types = getTypes_{widget_id}(meta); + if (types.length) {{ + details = 'Expected: ' + types.join(' or '); + }} + }} + + if (meta.minimum !== undefined || meta.maximum !== undefined) {{ + var bounds = []; + if (meta.minimum !== undefined) bounds.push('min ' + meta.minimum); + if (meta.maximum !== undefined) bounds.push('max ' + meta.maximum); + details += (details ? 
' ' : '') + '(' + bounds.join(', ') + ')'; + }} + + return [meta.description || '', details].filter(Boolean).join(' '); + }} + + function getExampleInput_{widget_id}(key, meta) {{ + var types = getTypes_{widget_id}(meta); + if (key.endsWith('_BINARY')) {{ + return 'Example: wget or /usr/bin/wget'; + }} + if (key.endsWith('_ARGS_EXTRA') || key.endsWith('_ARGS')) {{ + return 'Example: ["--extra-arg"]'; + }} + if (types.includes('array')) {{ + return 'Example: ["value"]'; + }} + if (types.includes('object')) {{ + return 'Example: {{"key": "value"}}'; + }} + return ''; + }} + + function isRegexConfigKey_{widget_id}(key) {{ + return key === 'URL_ALLOWLIST' || + key === 'URL_DENYLIST' || + key.endsWith('_PATTERN') || + key.includes('REGEX'); + }} + + function isSimpleFilterPattern_{widget_id}(pattern) {{ + return /^[\\w.*:-]+$/.test(pattern); + }} + + function validateRegexPattern_{widget_id}(pattern) {{ + if (!pattern || isSimpleFilterPattern_{widget_id}(pattern)) {{ + return ''; + }} + + try {{ + new RegExp(pattern); + }} catch (error) {{ + return error && error.message ? 
error.message : 'Invalid regex'; + }} + return ''; + }} + + function validateRegexConfig_{widget_id}(key, raw, typeName) {{ + if (typeName === 'object') {{ + var parsed; + try {{ + parsed = JSON.parse(raw); + }} catch (error) {{ + return {{ ok: false, value: raw, message: 'Must be valid JSON' }}; + }} + if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) {{ + return {{ ok: false, value: parsed, message: 'Must be a JSON object' }}; + }} + for (var regexKey in parsed) {{ + var objectRegexError = validateRegexPattern_{widget_id}(regexKey); + if (objectRegexError) {{ + return {{ ok: false, value: parsed, message: 'Invalid regex key "' + regexKey + '": ' + objectRegexError }}; + }} + }} + return {{ ok: true, value: parsed, message: '' }}; + }} + + var patterns = raw.split(/[\\n,]+/).map(function(pattern) {{ + return pattern.trim(); + }}).filter(Boolean); + for (var i = 0; i < patterns.length; i++) {{ + var regexError = validateRegexPattern_{widget_id}(patterns[i]); + if (regexError) {{ + return {{ ok: false, value: raw, message: 'Invalid regex "' + patterns[i] + '": ' + regexError }}; + }} + }} + return {{ ok: true, value: raw, message: '' }}; + }} + + function validateBinaryValue_{widget_id}(raw) {{ + if (!raw) {{ + return {{ ok: true, value: raw, message: '' }}; + }} + + if (/['"`]/.test(raw)) {{ + return {{ ok: false, value: raw, message: 'Binary paths cannot contain quotes' }}; + }} + + if (/[;&|<>$(){{}}\\[\\]!]/.test(raw)) {{ + return {{ ok: false, value: raw, message: 'Binary paths can only be a binary name or absolute path' }}; + }} + + if (raw.startsWith('/')) {{ + if (/^[A-Za-z0-9_./+\\- ]+$/.test(raw)) {{ + return {{ ok: true, value: raw, message: '' }}; + }} + return {{ ok: false, value: raw, message: 'Absolute paths may only contain path-safe characters' }}; + }} + + if (/^[A-Za-z0-9_.+-]+$/.test(raw)) {{ + return {{ ok: true, value: raw, message: '' }}; + }} + + return {{ ok: false, value: raw, message: 'Enter a binary name like wget or an 
absolute path like /usr/bin/wget' }}; + }} + + function parseValue_{widget_id}(raw) {{ + try {{ + if (raw === 'true') return true; + if (raw === 'false') return false; + if (raw === 'null') return null; + if (raw !== '' && !isNaN(raw)) return Number(raw); + if ((raw.startsWith('{{') && raw.endsWith('}}')) || + (raw.startsWith('[') && raw.endsWith(']')) || + (raw.startsWith('"') && raw.endsWith('"'))) {{ + return JSON.parse(raw); + }} + }} catch (error) {{ + return raw; + }} + return raw; + }} + + function sameValue_{widget_id}(left, right) {{ + return left === right || JSON.stringify(left) === JSON.stringify(right); + }} + + function parseTypedValue_{widget_id}(raw, typeName, meta) {{ + var numberValue; + var parsed; + + if (typeName && meta && meta.key && isRegexConfigKey_{widget_id}(meta.key)) {{ + return validateRegexConfig_{widget_id}(meta.key, raw, typeName); + }} + + if (typeName === 'string' && meta && meta.key && meta.key.endsWith('_BINARY')) {{ + return validateBinaryValue_{widget_id}(raw); + }} + + if (typeName === 'string') {{ + if (meta.pattern) {{ + try {{ + if (!(new RegExp(meta.pattern)).test(raw)) {{ + return {{ ok: false, value: raw, message: 'Must match pattern ' + meta.pattern }}; + }} + }} catch (error) {{}} + }} + return {{ ok: true, value: raw, message: '' }}; + }} + + if (typeName === 'integer') {{ + if (!/^-?\\d+$/.test(raw)) {{ + return {{ ok: false, value: raw, message: 'Must be an integer' }}; + }} + numberValue = Number(raw); + if (meta.minimum !== undefined && numberValue < meta.minimum) {{ + return {{ ok: false, value: numberValue, message: 'Must be at least ' + meta.minimum }}; + }} + if (meta.maximum !== undefined && numberValue > meta.maximum) {{ + return {{ ok: false, value: numberValue, message: 'Must be at most ' + meta.maximum }}; + }} + return {{ ok: true, value: numberValue, message: '' }}; + }} + + if (typeName === 'number') {{ + if (raw === '' || isNaN(raw)) {{ + return {{ ok: false, value: raw, message: 'Must be a number' 
}}; + }} + numberValue = Number(raw); + if (meta.minimum !== undefined && numberValue < meta.minimum) {{ + return {{ ok: false, value: numberValue, message: 'Must be at least ' + meta.minimum }}; + }} + if (meta.maximum !== undefined && numberValue > meta.maximum) {{ + return {{ ok: false, value: numberValue, message: 'Must be at most ' + meta.maximum }}; + }} + return {{ ok: true, value: numberValue, message: '' }}; + }} + + if (typeName === 'boolean') {{ + var lowered = raw.toLowerCase(); + if (lowered === 'true' || raw === '1') return {{ ok: true, value: true, message: '' }}; + if (lowered === 'false' || raw === '0') return {{ ok: true, value: false, message: '' }}; + return {{ ok: false, value: raw, message: 'Must be true or false' }}; + }} + + if (typeName === 'null') {{ + return raw === 'null' + ? {{ ok: true, value: null, message: '' }} + : {{ ok: false, value: raw, message: 'Must be null' }}; + }} + + if (typeName === 'array' || typeName === 'object') {{ + try {{ + parsed = JSON.parse(raw); + }} catch (error) {{ + return {{ ok: false, value: raw, message: 'Must be valid JSON' }}; + }} + + if (typeName === 'array' && Array.isArray(parsed)) {{ + return {{ ok: true, value: parsed, message: '' }}; + }} + if (typeName === 'object' && parsed && typeof parsed === 'object' && !Array.isArray(parsed)) {{ + return {{ ok: true, value: parsed, message: '' }}; + }} + + return {{ + ok: false, + value: parsed, + message: typeName === 'array' ? 'Must be a JSON array' : 'Must be a JSON object', + }}; + }} + + return {{ ok: true, value: parseValue_{widget_id}(raw), message: '' }}; + }} + + function validateValueAgainstMeta_{widget_id}(raw, meta) {{ + if (!meta || raw === '') {{ + return {{ state: 'neutral', value: raw, message: '' }}; + }} + + var enumValues = Array.isArray(meta.enum) ? 
meta.enum : []; + var types = getTypes_{widget_id}(meta); + if (!types.length) {{ + types = ['string']; + }} + + var error = 'Invalid value'; + for (var i = 0; i < types.length; i++) {{ + var candidate = parseTypedValue_{widget_id}(raw, types[i], meta); + if (!candidate.ok) {{ + error = candidate.message || error; + continue; + }} + if (enumValues.length && !enumValues.some(function(enumValue) {{ + return sameValue_{widget_id}(enumValue, candidate.value) || stringifyValue_{widget_id}(enumValue) === raw; + }})) {{ + error = 'Must be one of: ' + enumValues.map(stringifyValue_{widget_id}).join(', '); + continue; + }} + return {{ state: 'valid', value: candidate.value, message: '' }}; + }} + + return {{ state: 'invalid', value: raw, message: error }}; + }} + + function ensureRowId_{widget_id}(row) {{ + if (!row.dataset.rowId) {{ + row.dataset.rowId = String(rowCounter_{widget_id}++); + }} + return row.dataset.rowId; + }} + + function setRowHelp_{widget_id}(row) {{ + var keyInput = row.querySelector('.kv-key'); + var help = row.querySelector('.kv-help'); + if (!keyInput || !help) {{ + return; + }} + + var key = keyInput.value.trim(); + if (!key) {{ + help.textContent = ''; + return; + }} + + var meta = getMetaForKey_{widget_id}(key); + if (meta) {{ + var extra = isRegexConfigKey_{widget_id}(key) + ? ((meta.type === 'object' || (Array.isArray(meta.type) && meta.type.includes('object'))) + ? ' Expected: JSON object with regex keys.' 
+ : ' Expected: valid regex.') + : ''; + var example = getExampleInput_{widget_id}(key, meta); + help.textContent = [describeMeta_{widget_id}(meta) + extra, example].filter(Boolean).join(' '); + }} else {{ + help.textContent = 'Custom key'; + }} + }} + + function configureValueInput_{widget_id}(row) {{ + var keyInput = row.querySelector('.kv-key'); + var valueInput = row.querySelector('.kv-value'); + var datalist = row.querySelector('.kv-value-options'); + if (!keyInput || !valueInput || !datalist) {{ + return; + }} + + var rowId = ensureRowId_{widget_id}(row); + datalist.id = '{widget_id}_value_options_' + rowId; + + var meta = getMetaForKey_{widget_id}(keyInput.value.trim()); + var enumValues = Array.isArray(meta && meta.enum) ? meta.enum : []; + var types = getTypes_{widget_id}(meta); + if (!enumValues.length && types.includes('boolean')) {{ + enumValues = ['True', 'False']; + }} + if (enumValues.length) {{ + datalist.innerHTML = enumValues.map(function(enumValue) {{ + return '<option value="' + stringifyValue_{widget_id}(enumValue).replace(/"/g, '"') + '"></option>'; + }}).join(''); + valueInput.setAttribute('list', datalist.id); + }} else {{ + datalist.innerHTML = ''; + valueInput.removeAttribute('list'); + }} + }} + + function setValueValidationState_{widget_id}(input, state, message) {{ + if (!input) {{ + return; + }} + + if (state === 'valid') {{ + input.style.borderColor = '#2da44e'; + input.style.boxShadow = '0 0 0 1px rgba(45, 164, 78, 0.18)'; + input.style.backgroundColor = '#f6ffed'; + }} else if (state === 'invalid') {{ + input.style.borderColor = '#cf222e'; + input.style.boxShadow = '0 0 0 1px rgba(207, 34, 46, 0.18)'; + input.style.backgroundColor = '#fff8f8'; + }} else {{ + input.style.borderColor = '#ccc'; + input.style.boxShadow = 'none'; + input.style.backgroundColor = ''; + }} + input.title = message || ''; + }} + + function applyValueValidation_{widget_id}(row) {{ + var keyInput = row.querySelector('.kv-key'); + var valueInput = 
row.querySelector('.kv-value'); + if (!keyInput || !valueInput) {{ + return; + }} + + var key = keyInput.value.trim(); + if (!key) {{ + setValueValidationState_{widget_id}(valueInput, 'neutral', ''); + return; + }} + + var meta = getMetaForKey_{widget_id}(key); + if (!meta) {{ + setValueValidationState_{widget_id}(valueInput, 'neutral', ''); + return; + }} + + var validation = validateValueAgainstMeta_{widget_id}(valueInput.value.trim(), meta); + setValueValidationState_{widget_id}(valueInput, validation.state, validation.message); + }} + + function coerceValueForStorage_{widget_id}(key, raw) {{ + var meta = getMetaForKey_{widget_id}(key); + if (!meta) {{ + return parseValue_{widget_id}(raw); + }} + + var validation = validateValueAgainstMeta_{widget_id}(raw, meta); + return validation.state === 'valid' ? validation.value : raw; + }} + + function initializeRows_{widget_id}() {{ + var container = document.getElementById('{widget_id}_rows'); + container.querySelectorAll('.key-value-row').forEach(function(row) {{ + ensureRowId_{widget_id}(row); + configureValueInput_{widget_id}(row); + setRowHelp_{widget_id}(row); + applyValueValidation_{widget_id}(row); + }}); + }} + + function updateHiddenField_{widget_id}() {{ + var container = document.getElementById('{widget_id}_rows'); + var rows = container.querySelectorAll('.key-value-row'); + var result = {{}}; + rows.forEach(function(row) {{ + var keyInput = row.querySelector('.kv-key'); + var valInput = row.querySelector('.kv-value'); + if (keyInput && valInput && keyInput.value.trim()) {{ + var key = keyInput.value.trim(); + var val = valInput.value.trim(); + result[key] = coerceValueForStorage_{widget_id}(key, val); + }} + }}); + document.getElementById('{widget_id}').value = JSON.stringify(result); + }} + + window.addKeyValueRow_{widget_id} = function() {{ + var container = document.getElementById('{widget_id}_rows'); + var newRow = document.createElement('div'); + newRow.className = 'key-value-row'; + 
newRow.style.cssText = 'margin-bottom: 6px;'; + newRow.innerHTML = '<div class="kv-inputs" style="display: flex; gap: 8px; align-items: center;">' + + '<input type="text" class="kv-key" placeholder="KEY" list="{widget_id}_keys" ' + + 'style="flex: 1; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;">' + + '<input type="text" class="kv-value" placeholder="value" ' + + 'style="flex: 2; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;">' + + '<datalist class="kv-value-options"></datalist>' + + '<button type="button" onclick="removeKeyValueRow_{widget_id}(this)" ' + + 'style="padding: 4px 10px; cursor: pointer; background: #ba2121; color: white; border: none; border-radius: 4px; font-weight: bold;">−</button>' + + '</div>' + + '<div class="kv-help" style="margin-top: 4px; font-size: 11px; color: #666; font-style: italic;"></div>'; + container.appendChild(newRow); + ensureRowId_{widget_id}(newRow); + configureValueInput_{widget_id}(newRow); + setRowHelp_{widget_id}(newRow); + applyValueValidation_{widget_id}(newRow); + updateHiddenField_{widget_id}(); + newRow.querySelector('.kv-key').focus(); + }}; + + window.removeKeyValueRow_{widget_id} = function(btn) {{ + var row = btn.closest('.key-value-row'); + row.remove(); + updateHiddenField_{widget_id}(); + }}; + + window.updateHiddenField_{widget_id} = updateHiddenField_{widget_id}; + + function focusConfigKeyFromHash_{widget_id}() {{ + // Deep-link affordance: ``…/change/#SOME_KEY`` jumps directly + // to (or creates) the matching row in this editor. Used by + // the in-banner "pin via admin" link and the + // ``→ Edit <KEY> in Machine.config`` shortcut on the live + // config detail page.
+ var hash = (window.location.hash || '').replace(/^#/, '').trim(); + if (!hash || !/^[A-Z][A-Z0-9_]*$/.test(hash)) {{ + return; + }} + var container = document.getElementById('{widget_id}_rows'); + if (!container) {{ + return; + }} + var match = null; + container.querySelectorAll('.key-value-row').forEach(function(row) {{ + if (match) {{ return; }} + var keyInput = row.querySelector('.kv-key'); + if (keyInput && keyInput.value.trim() === hash) {{ + match = row; + }} + }}); + if (!match) {{ + // No existing row for this key โ€” prepopulate one with the + // key filled in but value left blank so the operator just + // types/pastes the value and hits save. + window.addKeyValueRow_{widget_id}(); + var rows = container.querySelectorAll('.key-value-row'); + match = rows[rows.length - 1]; + var keyInput = match.querySelector('.kv-key'); + if (keyInput) {{ + keyInput.value = hash; + keyInput.dispatchEvent(new Event('input', {{ bubbles: true }})); + }} + }} + if (!match) {{ + return; + }} + match.scrollIntoView({{ behavior: 'smooth', block: 'center' }}); + var prevOutline = match.style.outline; + match.style.outline = '2px solid #f59e0b'; + match.style.outlineOffset = '2px'; + match.style.transition = 'outline 1.2s ease-out'; + setTimeout(function() {{ + match.style.outline = prevOutline || 'none'; + }}, 1400); + var valueInput = match.querySelector('.kv-value'); + if (valueInput) {{ + valueInput.focus(); + try {{ valueInput.setSelectionRange(valueInput.value.length, valueInput.value.length); }} catch (e) {{}} + }} + }} + + // Initialize on load + document.addEventListener('DOMContentLoaded', function() {{ + initializeRows_{widget_id}(); + updateHiddenField_{widget_id}(); + focusConfigKeyFromHash_{widget_id}(); + }}); + // Also run immediately in case DOM is already ready + if (document.readyState !== 'loading') {{ + initializeRows_{widget_id}(); + updateHiddenField_{widget_id}(); + focusConfigKeyFromHash_{widget_id}(); + }} + + window.addEventListener('hashchange', 
focusConfigKeyFromHash_{widget_id}); + + // Update on any input change + var rowsEl_{widget_id} = document.getElementById('{widget_id}_rows'); + + rowsEl_{widget_id}.addEventListener('input', function(event) {{ + var row = event.target.closest('.key-value-row'); + if (!row) {{ + return; + }} + + if (event.target.classList.contains('kv-key')) {{ + configureValueInput_{widget_id}(row); + setRowHelp_{widget_id}(row); + }} + + if (event.target.classList.contains('kv-key') || event.target.classList.contains('kv-value')) {{ + applyValueValidation_{widget_id}(row); + updateHiddenField_{widget_id}(); + }} + }}); + }})(); + </script> + </div> + ''' + return mark_safe(html) + + def _render_row(self, widget_id: str, key: str, value: str) -> str: + from archivebox.config.common import is_sensitive_config_key + + # Sensitive keys (``*TOKEN*``, ``*SECRET*``, ``*API_KEY*``, ``*APIKEY*``) are + # rendered write-only: the input is a password field with a placeholder + # showing the value is set, but the raw value is NEVER sent to the browser. + # When the user submits the form with the field left blank, the + # ``ConfigEditorMixin.save_model`` hook re-merges the previously-saved + # value so leaving it untouched is a no-op rather than a destructive clear. 
+ is_sensitive = is_sensitive_config_key(key) + has_value = bool(value) + if is_sensitive: + input_type = "password" + rendered_value = "" + placeholder = ( + "โ€ขโ€ขโ€ขโ€ขโ€ขโ€ข (saved โ€” enter new value to replace, clear by deleting row)" if has_value else "value (will be saved write-only)" + ) + extra_attrs = ' autocomplete="off" data-sensitive="1"' + (' data-had-value="1"' if has_value else "") + else: + input_type = "text" + rendered_value = self._escape(value) + placeholder = "value" + extra_attrs = "" + return f''' + <div class="key-value-row" style="margin-bottom: 6px;"> + <div class="kv-inputs" style="display: flex; gap: 8px; align-items: center;"> + <input type="text" class="kv-key" value="{self._escape(key)}" placeholder="KEY" list="{widget_id}_keys" + style="flex: 1; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;"> + <input type="{input_type}" class="kv-value" value="{rendered_value}" placeholder="{self._escape(placeholder)}"{extra_attrs} + style="flex: 2; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;"> + <datalist class="kv-value-options"></datalist> + <button type="button" onclick="removeKeyValueRow_{widget_id}(this)" + style="padding: 4px 10px; cursor: pointer; background: #ba2121; color: white; border: none; border-radius: 4px; font-weight: bold;">โˆ’</button> + </div> + <div class="kv-help" style="margin-top: 4px; font-size: 11px; color: #666; font-style: italic;"></div> + </div> + ''' + + def _escape(self, s: object) -> str: + """Escape HTML special chars in attribute values.""" + if not s: + return "" + return str(s).replace("&", "&").replace("<", "<").replace(">", ">").replace('"', """) + + def value_from_datadict( + self, + data: QueryDict | Mapping[str, object], + files: object, + name: str, + ) -> str: + value = data.get(name, "{}") + return value if isinstance(value, str) else "{}" + + +class ConfigEditorMixin(admin.ModelAdmin): 
def save_model(self, request: HttpRequest, obj, form, change):
    """Preserve write-only redacted credentials on save.

    The KeyValueWidget renders sensitive keys (``*TOKEN*``, ``*SECRET*``,
    ``*API_KEY*``, ``*APIKEY*``) with an empty value and a password input, so
    the real value never leaves the server. On submit, an empty value (or the
    ``********`` redaction placeholder) for a sensitive key that was
    previously set means "leave untouched", not "clear it". We honor that
    here by re-merging the stored value before the row is written.
    Explicitly removing the row in the UI still clears it: the key is gone
    from the submitted JSON, so there is nothing to merge over.

    The merge only runs when both the stored and the submitted config are
    JSON objects. Any other submitted JSON value (for example a list) is
    passed through to the normal save path untouched instead of raising
    while being coerced to a dict.
    """
    from archivebox.config.common import is_sensitive_config_key

    if change and obj.pk and obj.config is not None:
        # Falsy submissions are treated as an empty mapping.
        submitted = obj.config or {}
        try:
            stored = type(obj).objects.filter(pk=obj.pk).values_list("config", flat=True).first() or {}
        except Exception:
            # Best effort: if the previous config cannot be read there is
            # nothing to restore, so fall through and save what was submitted.
            stored = {}
        if isinstance(stored, dict) and isinstance(submitted, dict):
            merged = dict(submitted)
            for key, new_value in submitted.items():
                if not is_sensitive_config_key(key):
                    continue
                # A real replacement value was typed in: keep it.
                if new_value not in (None, "") and new_value != "********":
                    continue
                # Blank or redacted submission: restore the saved secret if any.
                if key in stored:
                    merged[key] = stored[key]
            obj.config = merged
    super().save_model(request, obj, form, change)
+ """ + info = self.opts.app_label, self.opts.model_name + object_routes = [ + path("<hexuuid:object_id>/history/", self.admin_site.admin_view(self.history_view), name="%s_%s_history" % info), + path("<hexuuid:object_id>/delete/", self.admin_site.admin_view(self.delete_view), name="%s_%s_delete" % info), + path("<hexuuid:object_id>/change/", self.admin_site.admin_view(self.change_view), name="%s_%s_change" % info), + ] + # Append after super().get_urls() so our patterns are the + # *last-registered* ones with the canonical admin URL names โ€” Django's + # reverse() picks the later registration when names collide, which is + # how we make ``reverse("admin:app_model_change", args=[obj.pk])`` + # emit the hex form. The original ``<path:object_id>`` routes stay in + # place as a fallback for non-UUID PKs and for resolving inbound + # hyphenated URLs (the ``hexuuid`` regex accepts both forms anyway). + return super().get_urls() + object_routes diff --git a/archivebox/base_models/apps.py b/archivebox/base_models/apps.py new file mode 100644 index 0000000000..82bd72f8bf --- /dev/null +++ b/archivebox/base_models/apps.py @@ -0,0 +1,7 @@ +# from django.apps import AppConfig + + +# class BaseModelsConfig(AppConfig): +# default_auto_field = 'django.db.models.BigAutoField' + +# name = 'base_models' diff --git a/archivebox/base_models/migrations/__init__.py b/archivebox/base_models/migrations/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/archivebox/base_models/models.py b/archivebox/base_models/models.py new file mode 100755 index 0000000000..296bf74182 --- /dev/null +++ b/archivebox/base_models/models.py @@ -0,0 +1,303 @@ +"""Base models using UUIDv7 for all id fields.""" + +__package__ = "archivebox.base_models" + +import json +import shutil +from typing import Any + +from archivebox.uuid_compat import CompactUUIDField, uuid7 +from pathlib import Path + +from django.db import models +from django.db.models import F +from django.db import 
def normalize_config_json_values(config: Any) -> Any:
    """Unwrap config values that were stored as JSON-encoded strings.

    A value such as ``'"hello"'`` (a string whose content is itself a JSON
    string literal) is replaced by the decoded text ``'hello'``. Every other
    value is kept as-is, including strings that merely start and end with a
    double quote but are not valid JSON.

    Non-dict input is returned unchanged. Dict input is never mutated; a new
    dict with the same keys in the same order is returned.
    """
    if not isinstance(config, dict):
        return config

    def _unwrap(raw: Any) -> Any:
        # Only strings of the form "...": at least two characters, quoted on both ends.
        is_quoted = isinstance(raw, str) and len(raw) >= 2 and raw.startswith('"') and raw.endswith('"')
        if not is_quoted:
            return raw
        try:
            parsed = json.loads(raw)
        except ValueError:
            # Looks quoted but is not valid JSON: leave it alone.
            return raw
        return parsed if isinstance(parsed, str) else raw

    return {name: _unwrap(raw) for name, raw in config.items()}
class ModelWithHealthStats(models.Model):
    """Abstract mixin that adds success/failure usage counters and a health score."""

    num_uses_failed = models.PositiveIntegerField(default=0)
    num_uses_succeeded = models.PositiveIntegerField(default=0)

    class Meta(TypedModelMeta):
        abstract = True

    @property
    def admin_change_url(self) -> str:
        """Path of this object's change page in the Django admin."""
        meta = self._meta
        return f"/admin/{meta.app_label}/{meta.model_name}/{self.pk}/change/"

    @property
    def health(self) -> int:
        """Percentage (0-100) of recorded uses that succeeded; 0 when nothing is recorded."""
        succeeded = self.num_uses_succeeded
        # Clamp the denominator to 1 so an unused object does not divide by zero.
        attempts = max(self.num_uses_failed + succeeded, 1)
        return round((succeeded / attempts) * 100)

    def increment_health_stats(self, success: bool):
        """Atomically bump the success or failure counter with an F() expression.

        The update is issued as a single queryset ``update()`` against this
        row, and also writes ``modified_at`` (the concrete model is expected
        to provide that column; it is not declared on this mixin).
        """
        if success:
            counter = "num_uses_succeeded"
        else:
            counter = "num_uses_failed"
        changes = {
            counter: F(counter) + 1,
            "modified_at": timezone.now(),
        }
        type(self).objects.filter(pk=self.pk).update(**changes)
class ModelWithDeleteAfter(models.Model):
    """Abstract mixin giving a model an optional expiry timestamp (``delete_at``).

    ``delete_at`` is filled in from the ``DELETE_AFTER`` config value the first
    time the object is saved without one, and ``delete_expired()`` removes rows
    whose ``delete_at`` has passed.
    """

    # When non-empty, delete_expired() only deletes rows whose ``status`` is one of these.
    delete_after_final_statuses: tuple[str, ...] = ()
    delete_at = models.DateTimeField(default=None, null=True, blank=True, db_index=True)

    class Meta(TypedModelMeta):
        abstract = True

    def save(self, *args, **kwargs):
        """Derive ``delete_at`` if it is unset, then save.

        When the caller restricts the write with ``update_fields`` and a
        ``delete_at`` value is present, ``delete_at`` is appended to that list
        (without duplicates) so the value is actually persisted.
        """
        requested_fields = kwargs.get("update_fields")
        if self.delete_at is None:
            self.set_delete_at_from_config()
        if requested_fields is not None and self.delete_at is not None:
            kwargs["update_fields"] = tuple(dict.fromkeys([*requested_fields, "delete_at"]))
        super().save(*args, **kwargs)

    def get_delete_after_config_value(self):
        """Return the ``DELETE_AFTER`` setting from the resolved config."""
        from archivebox.config.common import get_config

        resolved = get_config(include_machine=False, resolve_plugins=False)
        return resolved.DELETE_AFTER

    def set_delete_at_from_config(self, config_value=None) -> bool:
        """Set ``delete_at`` from a delete-after duration, if it is not already set.

        Uses ``config_value`` when given, otherwise the configured
        ``DELETE_AFTER`` value. Returns ``True`` when ``delete_at`` was
        assigned, ``False`` when it was already set or no duration applies.
        Does not save the object.
        """
        if self.delete_at is not None:
            return False

        from archivebox.config.common import parse_delete_after

        if config_value is None:
            raw_setting = self.get_delete_after_config_value()
        else:
            raw_setting = config_value
        lifetime = parse_delete_after(raw_setting)
        if lifetime is None:
            return False

        # Expiry is measured from creation time, falling back to "now".
        anchor = self.created_at or timezone.now()
        self.delete_at = anchor + lifetime
        return True

    @classmethod
    def missing_delete_at_candidates(cls):
        """Queryset of rows that should be backfilled with ``delete_at``; empty by default."""
        return cls.objects.none()

    @classmethod
    def delete_expired(cls, *, batch_size: int = 100, backfill_missing: bool = True) -> int:
        """Delete up to ``batch_size`` expired rows and return how many were deleted.

        With ``backfill_missing`` set, up to ``batch_size`` rows from
        ``missing_delete_at_candidates()`` first get a ``delete_at`` written
        (only if the row still has none at update time).
        """
        if backfill_missing:
            candidates = cls.missing_delete_at_candidates().order_by("created_at", "pk")[:batch_size]
            for row in list(candidates):
                if not row.set_delete_at_from_config():
                    continue
                cls.objects.filter(pk=row.pk, delete_at__isnull=True).update(
                    delete_at=row.delete_at,
                    modified_at=timezone.now(),
                )

        # Keep the expiration sweep anchored on delete_at. Some large tables
        # have millions of final-status rows but almost no retained rows; adding
        # the status filter before SQLite has narrowed by delete_at can make the
        # runner scan the hot status index before it claims new work.
        overdue = cls.objects.filter(delete_at__isnull=False, delete_at__lte=timezone.now())
        due_pks = list(overdue.order_by("delete_at", "pk").values_list("pk", flat=True)[:batch_size])
        if not due_pks:
            return 0

        doomed = cls.objects.filter(pk__in=due_pks)
        if cls.delete_after_final_statuses:
            doomed = doomed.filter(status__in=cls.delete_after_final_statuses)

        deleted = 0
        # Materialize the rows first, then delete them one at a time via
        # Model.delete() rather than a bulk queryset delete.
        for row in list(doomed.order_by("delete_at", "pk")):
            row.delete()
            deleted += 1
        return deleted
@classmethod
def validate_output_paths_for_delete(cls, paths) -> tuple[Path, ...]:
    """Check that every path lies inside DATA_DIR before it may be deleted.

    A path is accepted when either of these is located under the resolved
    DATA_DIR:

    * its absolute form, compared lexically, but only when it contains no
      ``..`` component. ``Path.absolute()`` does not collapse ``..``, so
      without this restriction ``DATA_DIR/../elsewhere`` would pass a purely
      lexical prefix check while pointing outside DATA_DIR.
    * its fully resolved form (symlinks and ``..`` followed).

    Returns the accepted paths, unmodified and in input order.

    Raises:
        ValueError: if any path is outside DATA_DIR.
    """
    data_dir = CONSTANTS.DATA_DIR.resolve()
    safe_paths = []
    for raw_path in paths:
        path = Path(raw_path)
        absolute = path.absolute()
        # Short-circuit keeps resolve() lazy: it only runs when the lexical check fails.
        is_safe = (".." not in absolute.parts and absolute.is_relative_to(data_dir)) or path.resolve().is_relative_to(
            data_dir,
        )
        if not is_safe:
            raise ValueError(f"Refusing to delete output path outside DATA_DIR: {path}")
        safe_paths.append(path)
    return tuple(safe_paths)
class ArchiveBoxGroup(click.Group):
    """Click group that resolves archivebox subcommands lazily from dotted import paths."""

    meta_commands = {
        "help": "archivebox.cli.archivebox_help.main",
        "version": "archivebox.cli.archivebox_version.main",
        "mcp": "archivebox.cli.archivebox_mcp.main",
    }
    setup_commands = {
        "init": "archivebox.cli.archivebox_init.main",
        "install": "archivebox.cli.archivebox_install.main",
    }
    # Model commands (CRUD operations via subcommands)
    model_commands = {
        "crawl": "archivebox.cli.archivebox_crawl.main",
        "snapshot": "archivebox.cli.archivebox_snapshot.main",
        "archiveresult": "archivebox.cli.archivebox_archiveresult.main",
        "tag": "archivebox.cli.archivebox_tag.main",
        "binary": "archivebox.cli.archivebox_binary.main",
        "process": "archivebox.cli.archivebox_process.main",
        "machine": "archivebox.cli.archivebox_machine.main",
        "persona": "archivebox.cli.archivebox_persona.main",
    }
    archive_commands = {
        # High-level commands
        "add": "archivebox.cli.archivebox_add.main",
        "extract": "archivebox.cli.archivebox_extract.main",
        "list": "archivebox.cli.archivebox_list.main",
        "remove": "archivebox.cli.archivebox_remove.main",
        "run": "archivebox.cli.archivebox_run.main",
        "update": "archivebox.cli.archivebox_update.main",
        "status": "archivebox.cli.archivebox_status.main",
        "search": "archivebox.cli.archivebox_search.main",
        "config": "archivebox.cli.archivebox_config.main",
        "schedule": "archivebox.cli.archivebox_schedule.main",
        "server": "archivebox.cli.archivebox_server.main",
        "shell": "archivebox.cli.archivebox_shell.main",
        "manage": "archivebox.cli.archivebox_manage.main",
        # Introspection commands
        "pluginmap": "archivebox.cli.archivebox_pluginmap.main",
    }
    all_subcommands = {
        **meta_commands,
        **setup_commands,
        **model_commands,
        **archive_commands,
    }
    # Old command name -> current command name.
    renamed_commands = {
        "setup": "install",
        "import": "add",
        "archive": "add",
    }

    @classmethod
    def get_canonical_name(cls, cmd_name):
        """Map a possibly-renamed command name to its current name."""
        return cls.renamed_commands.get(cmd_name, cmd_name)

    @classmethod
    def _needs_django_for_lazy_import(cls, cmd_name: str) -> bool:
        """True for archive/model commands, unless a help or version flag is on the command line."""
        if any(flag in ("-h", "--help", "--version") for flag in sys.argv[1:]):
            return False
        return cmd_name in cls.archive_commands or cmd_name in cls.model_commands

    @classmethod
    def _setup_django_for_lazy_import(cls, cmd_name: str) -> None:
        """Run Django setup before importing a command module that needs it (no-op if already ready)."""
        if cls._needs_django_for_lazy_import(cmd_name):
            from django.apps import apps

            if not apps.ready:
                from archivebox.config.django import setup_django

                setup_django()

    def get_command(self, ctx, cmd_name):
        """Resolve ``cmd_name`` to a command, following renames and importing on demand."""
        # handle renamed commands
        if cmd_name in self.renamed_commands:
            old_name, cmd_name = cmd_name, self.renamed_commands[cmd_name]
            STDERR.print(
                f" [violet]Hint:[/violet] `archivebox {old_name}` has been renamed to `archivebox {cmd_name}`",
            )
            ctx.invoked_subcommand = cmd_name

        # anything unknown goes through click's default command lookup
        if cmd_name not in self.all_subcommands:
            return super().get_command(ctx, cmd_name)

        # known subcommands are imported lazily
        self._setup_django_for_lazy_import(cmd_name)
        return self._lazy_load(cmd_name)

    @classmethod
    def _lazy_load(cls, cmd_name_or_path):
        """Import and return the callable for a subcommand name or a dotted ``module.func`` path.

        Raises ``ValueError`` when the loaded callable has no docstring.
        """
        import_path = cls.all_subcommands.get(cmd_name_or_path)
        if import_path is None:
            # Not a registered name: treat the argument itself as a dotted path.
            import_path = cmd_name_or_path
        modname, funcname = import_path.rsplit(".", 1)

        module = import_module(modname)
        func = vars(module)[funcname]

        if func.__doc__ is None:
            raise ValueError(f"lazy loading of {import_path} failed - no docstring found on method")

        return func
os.environ["ARCHIVEBOX_RUNSERVER"] = "1" + if "--reload" in sys.argv: + os.environ["ARCHIVEBOX_AUTORELOAD"] = "1" + + from archivebox.config.django import setup_django + from archivebox.misc.checks import check_data_folder, check_migrations + + setup_django() + if os.environ.get("ARCHIVEBOX_WANTS_INIT") == "1" and subcommand not in ("init", "help"): + # Universal `--init` was passed: build/upgrade the data folder before + # the regular preflight runs, so it succeeds on a fresh dir and an + # out-of-date schema both. Drop the env var afterwards so spawned + # subprocesses (supervisord workers, daphne, runner, etc.) inherit + # a clean env and don't re-trigger init in every child. + from archivebox.cli.archivebox_init import init as archivebox_init + + archivebox_init(quick=True) + os.environ.pop("ARCHIVEBOX_WANTS_INIT", None) + check_data_folder() + if subcommand != "update": + check_migrations(auto_apply=True) + except Exception as e: + STDERR.print(f"[red][X] Error setting up Django or checking data folder: {e}[/red]") + if subcommand not in ("manage", "shell"): # not all management commands need django to be setup beforehand + raise + + +def main(args=None, prog_name=None): + # show `docker run archivebox xyz` in help messages if running in docker + IN_DOCKER = os.environ.get("IN_DOCKER", False) in ("1", "true", "True", "TRUE", "yes") + IS_TTY = sys.stdin.isatty() + prog_name = prog_name or (f"docker compose run{'' if IS_TTY else ' -T'} archivebox" if IN_DOCKER else "archivebox") + + previous_unraisablehook = sys.unraisablehook + + def ignore_shutdown_unraisable(unraisable): + if isinstance(unraisable.exc_value, (KeyboardInterrupt, SystemExit)): + return + previous_unraisablehook(unraisable) + + sys.unraisablehook = ignore_shutdown_unraisable + try: + cli(args=args, prog_name=prog_name, standalone_mode=False) + except click.Abort: + STDERR.print("\n[red][X] Got CTRL+C. 
def _collect_input_urls(args: tuple[str, ...], *, parser: str = "auto") -> list[str]:
    """Gather validated URLs from CLI args or stdin records.

    Each record may carry a single ``url`` string and/or a multi-line ``urls``
    string; in the latter, blank lines and lines starting with ``#`` are
    skipped. URLs are returned in the order encountered. The ``parser``
    argument is accepted but not used here.

    Raises:
        click.BadParameter: if any URL fails validation.
    """
    from archivebox.misc.jsonl import read_args_or_stdin
    from archivebox.misc.util import validate_url

    def _checked(candidate: str) -> str:
        # Surface validation failures as a click usage error pointing at the URL argument.
        try:
            return validate_url(candidate)
        except ValueError as err:
            raise click.BadParameter(str(err), param_hint="URL") from err

    collected: list[str] = []
    for record in read_args_or_stdin(args):
        single = record.get("url")
        if isinstance(single, str) and single:
            collected.append(_checked(single))

        multi = record.get("urls")
        if not isinstance(multi, str):
            continue
        for raw_line in multi.splitlines():
            entry = raw_line.strip()
            if not entry or entry.startswith("#"):
                continue
            collected.append(_checked(entry))

    return collected
@enforce_types
def add(
    urls: str | list[str],
    snapshot_ids: list[str] | None = None,
    depth: int | str = 0,
    max_urls: int = 0,
    crawl_max_size: int | str = 0,
    crawl_timeout: int = 0,
    snapshot_max_size: int | str = 0,
    crawl_max_concurrent_snapshots: int | None = None,
    tag: str = "",
    url_allowlist: str = "",
    url_denylist: str = "",
    parser: str = "auto",
    plugins: str = "",
    persona: str = "Default",
    index_only: bool = False,
    bg: bool = False,
    created_by_id: int | None = None,
    config: dict[str, Any] | None = None,
) -> tuple[Crawl, QuerySet[Snapshot]]:
    """Add a new URL or list of URLs to your archive.

    The flow is:
    1. Save URLs to sources file
    2. Create Crawl with URLs and max_depth
    3. Crawl runner creates Snapshots from Crawl URLs (depth=0)
    4. Crawl runner runs parser extractors on root snapshots
    5. Parser extractors output to urls.jsonl
    6. URLs are added to Crawl.urls and child Snapshots are created
    7. Repeat until max_depth is reached
    """
    # NOTE: the docstring above is passed to @docstring(add.__doc__) on main()
    # below, so its text is left untouched and the parameter contract is
    # documented here instead.
    #
    # Args:
    #   urls: a str is stored as-is; a list is joined with newlines.
    #   snapshot_ids: accepted for interface compatibility; not referenced in this body.
    #   depth: coerced with int(); must be 0-4. Stored as max_depth=depth + 1.
    #   max_urls / crawl_timeout: ints >= 0; a falsy value is left out of the crawl config.
    #   crawl_max_size / snapshot_max_size: parsed with parse_filesize_to_bytes(); must be >= 0.
    #   crawl_max_concurrent_snapshots: None falls back to the runtime config value;
    #       must be >= 1. Only frozen into the crawl config when passed explicitly
    #       and different from the persona's effective value.
    #   tag, url_allowlist, url_denylist, parser, plugins: stored on the crawl /
    #       crawl config when non-default.
    #   persona: blank/None falls back to "Default"; the Persona row is get_or_create'd.
    #   index_only: create the Crawl with retry_at=None and return without running.
    #   bg: queue the crawl and return without starting a foreground runner.
    #   created_by_id: falls back to get_or_create_system_user_pk().
    #   config: extra crawl config overrides, applied last.
    #
    # Returns:
    #   (crawl, crawl.snapshot_set.all()), or (crawl, empty queryset) when index_only.
    #
    # Raises:
    #   ValueError: a numeric limit is out of range.
    #   SystemExit: the foreground runner failed or was interrupted.

    from rich import print

    depth = int(depth)
    max_urls = int(max_urls or 0)
    crawl_max_size = parse_filesize_to_bytes(crawl_max_size)
    crawl_timeout = int(crawl_timeout or 0)
    snapshot_max_size = parse_filesize_to_bytes(snapshot_max_size)
    from archivebox.config.permissions import USER, HOSTNAME
    from archivebox.config.common import get_config

    config_overrides = dict(config or {})
    runtime_config = get_config()
    # Remember whether the caller passed a value before the default is filled in.
    crawl_max_concurrent_snapshots_override = crawl_max_concurrent_snapshots is not None
    if crawl_max_concurrent_snapshots is None:
        crawl_max_concurrent_snapshots = runtime_config.CRAWL_MAX_CONCURRENT_SNAPSHOTS
    crawl_max_concurrent_snapshots = int(crawl_max_concurrent_snapshots)

    if depth not in (0, 1, 2, 3, 4):
        raise ValueError("Depth must be 0-4")
    if max_urls < 0:
        raise ValueError("max_urls must be >= 0")
    if crawl_max_size < 0:
        raise ValueError("crawl_max_size must be >= 0")
    if crawl_timeout < 0:
        raise ValueError("crawl_timeout must be >= 0")
    if snapshot_max_size < 0:
        raise ValueError("snapshot_max_size must be >= 0")
    if crawl_max_concurrent_snapshots < 1:
        raise ValueError("crawl_max_concurrent_snapshots must be >= 1")

    # import models once django is set up
    from archivebox.crawls.models import Crawl
    from archivebox.core.models import Snapshot
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.personas.models import Persona
    from archivebox.misc.logging_util import printable_filesize
    from archivebox.misc.system import get_dir_size
    from archivebox.core.shutdown_util import foreground_parent_watchdog, foreground_shutdown_signals
    from django.utils import timezone

    created_by_id = created_by_id or get_or_create_system_user_pk()
    started_at = timezone.now()

    source_text = urls if isinstance(urls, str) else "\n".join(str(url) for url in urls)

    # 2. Create a new Crawl with inline URLs
    cli_args = [*sys.argv]
    # sys.argv can be empty when embedded in another interpreter; don't index blindly.
    if cli_args and cli_args[0].lower().endswith("archivebox"):
        cli_args[0] = "archivebox"
    cmd_str = " ".join(cli_args)

    timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")

    persona_name = (persona or "Default").strip() or "Default"
    plugins = plugins or ""
    persona_obj, _ = Persona.objects.get_or_create(name=persona_name)
    persona_obj.ensure_dirs()
    effective_persona_config = get_config(persona=persona_obj)

    # Only non-default values are frozen into the crawl config.
    crawl_config = {
        "PERMISSIONS": str(effective_persona_config.PERMISSIONS),
        **({"INDEX_ONLY": True} if index_only else {}),
        **({"PLUGINS": plugins} if plugins else {}),
        **(
            {"CRAWL_MAX_CONCURRENT_SNAPSHOTS": crawl_max_concurrent_snapshots}
            if crawl_max_concurrent_snapshots_override
            and crawl_max_concurrent_snapshots != int(effective_persona_config.CRAWL_MAX_CONCURRENT_SNAPSHOTS)
            else {}
        ),
        **({"CRAWL_MAX_URLS": max_urls} if max_urls else {}),
        **({"CRAWL_MAX_SIZE": crawl_max_size} if crawl_max_size else {}),
        **({"CRAWL_TIMEOUT": crawl_timeout} if crawl_timeout else {}),
        **({"SNAPSHOT_MAX_SIZE": snapshot_max_size} if snapshot_max_size else {}),
        **({"PARSER": parser} if parser != "auto" else {}),
        **({"URL_ALLOWLIST": url_allowlist} if url_allowlist else {}),
        **({"URL_DENYLIST": url_denylist} if url_denylist else {}),
    }
    # Caller-supplied overrides (e.g. {"ONLY_NEW": False}) are the highest
    # priority for crawl-frozen keys. Runtime-derived execution keys are
    # stripped by Crawl.save() and rederived when hooks run.
    crawl_config.update(config_overrides)

    # Crawl.urls is the user's submitted source, not a derived work queue for
    # this add path. Keeping it byte-for-byte readable is what lets API/UI/CLI
    # callers audit or resume imports without losing RSS/Netscape/JSON metadata
    # that is not representable as one plain URL per line.
    crawl = Crawl.objects.create(
        urls=source_text,
        # The internal root snapshot occupies depth 0. URLs discovered from the
        # submitted source become normal child snapshots at depth 1, so the
        # effective crawl limit must be one hop deeper than the user requested.
        max_depth=depth + 1,
        tags_str=tag,
        persona_id=persona_obj.id,
        label=f"{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]",
        created_by_id=created_by_id,
        status=Crawl.StatusChoices.QUEUED,
        retry_at=None if index_only else timezone.now(),
        config=crawl_config,
    )

    # Parser plugins consume this root snapshot through the normal Snapshot
    # hook lifecycle. ArchiveBox does not select parser plugins or call them
    # directly; non-parser plugins cheaply no-result unsupported internal input.
    root_snapshot = Snapshot.objects.create(
        url=Snapshot.INTERNAL_INPUT_URL,
        crawl=crawl,
        depth=0,
        title="stdin.txt",
    )
    staticfile_dir = root_snapshot.output_dir / "staticfile"
    staticfile_dir.mkdir(parents=True, exist_ok=True)
    (staticfile_dir / "stdin.txt").write_text(source_text, encoding="utf-8")

    print(f"[green]\\[+] Created Crawl {crawl.id} with max_depth={depth}[/green]")
    # Resolve the URL list once instead of calling get_urls_list() twice.
    submitted_urls = crawl.get_urls_list()
    first_url = submitted_urls[0] if submitted_urls else ""
    print(f" [dim]First URL: {first_url}[/dim]")

    # 3. The CrawlMachine will create Snapshots from all URLs when started
    # Parser extractors run on snapshots and discover more URLs
    # Discovered URLs become child Snapshots (depth+1)

    if index_only:
        print("[yellow]\\[*] Index-only mode - URLs queued, runner not started[/yellow]")
        return crawl, crawl.snapshot_set.none()

    # 5. Start the crawl runner to process the queue
    # The runner will:
    # - Process Crawl -> create Snapshots from all URLs
    # - Process Snapshots -> run extractors
    # - Parser extractors discover new URLs -> create child Snapshots
    # - Repeat until max_depth reached

    if bg:
        # Background mode: just queue work and return (background runner via server will pick it up).
        print(
            "[yellow]\\[*] URLs queued. The background runner will process them (run `archivebox server` or `archivebox run --daemon` if not already running).[/yellow]",
        )
    else:
        # Foreground mode: run full crawl runner until all work is done
        print("[green]\\[*] Starting crawl runner to process crawl...[/green]")
        from archivebox.machine.models import Process
        from archivebox.core.takeover_util import command_owns_foreground_runner, current_command, standby_until_foreground_runner_needed
        from archivebox.workers.supervisord_util import get_existing_supervisord_process, run_runner_worker, stop_own_supervisord_process

        command = current_command(Process.TypeChoices.ADD, data_dir=CONSTANTS.DATA_DIR, url=first_url)
        exit_code = 0
        try:
            try:
                with foreground_shutdown_signals(first_signal_message=None), foreground_parent_watchdog():
                    while True:
                        standby_until_foreground_runner_needed(command, data_dir=CONSTANTS.DATA_DIR)
                        exit_code = run_runner_worker(
                            ["--crawl-id", str(crawl.id)],
                            name=f"worker_runner_add_{os.getpid()}",
                            interactive_interrupts=True,
                        )
                        crawl.refresh_from_db(fields=["status", "retry_at"])
                        if exit_code == 0 and crawl.status == crawl.StatusChoices.SEALED:
                            break
                        # A shared supervisord can be stopped by another
                        # foreground owner (for example `archivebox server`
                        # shutting down) and stop the one-shot add runner while
                        # this add's crawl is still runnable. Keep the add
                        # process alive so it can re-enter the normal
                        # foreground-runner ownership loop and finish its crawl.
                        supervisor_gone = exit_code == 1 and get_existing_supervisord_process(quiet=True) is None
                        if crawl.status in crawl.RUNNABLE_STATES and (exit_code in (0, 130, 143) or supervisor_gone):
                            continue
                        if exit_code == 0:
                            break
                        if not command_owns_foreground_runner(command, data_dir=CONSTANTS.DATA_DIR):
                            continue
                        raise SystemExit(exit_code)
            except KeyboardInterrupt:
                exit_code = 130
                print("\n[red][X] archivebox add interrupted.[/red]")
                print("[yellow]Hint: resume this crawl with:[/yellow]")
                print(f" [green]archivebox run --crawl-id={crawl.id}[/green]")
                raise SystemExit(exit_code)
        finally:
            # Runs on success, failure and interrupt: record the exit code and
            # stop the supervisord process owned by this command.
            command.mark_exited(exit_code=exit_code)
            stop_own_supervisord_process()

        # Print summary for foreground runs
        try:
            crawl.refresh_from_db()
            try:
                from django.db.models import Count, Sum

                totals = crawl.snapshot_set.aggregate(snapshot_count=Count("id"), total_bytes=Sum("output_size"))
                snapshots_count = int(totals["snapshot_count"] or 0)
                total_bytes = int(totals["total_bytes"] or 0)
            except Exception:
                # Fall back to counting rows and measuring the output dir on disk.
                snapshots_count = crawl.snapshot_set.count()
                total_bytes, _, _ = get_dir_size(crawl.output_dir)
            total_size = printable_filesize(total_bytes)
            total_time = timezone.now() - started_at
            total_seconds = int(total_time.total_seconds())
            mins, secs = divmod(total_seconds, 60)
            hours, mins = divmod(mins, 60)
            if hours:
                duration_str = f"{hours}h {mins}m {secs}s"
            elif mins:
                duration_str = f"{mins}m {secs}s"
            else:
                duration_str = f"{secs}s"

            # Output dir relative to DATA_DIR
            try:
                rel_output = Path(crawl.output_dir).relative_to(CONSTANTS.DATA_DIR)
                rel_output_str = f"./{rel_output}"
            except Exception:
                rel_output_str = str(crawl.output_dir)

            from archivebox.core.routes_util import build_admin_url

            admin_url = build_admin_url(f"/admin/crawls/crawl/{crawl.id}/change/", config=config)

            print("\n[bold]crawl output saved to:[/bold]")
            print(f" {rel_output_str}")
            print(f" {admin_url}")
            print(f"\n[bold]total urls snapshotted:[/bold] {snapshots_count}")
            print(f"[bold]total size:[/bold] {total_size}")
            print(f"[bold]total time:[/bold] {duration_str}")
        except Exception:
            # Summary is best-effort; avoid failing the command if something goes wrong
            pass

    # 6. Return the list of Snapshots in this crawl
    snapshots = crawl.snapshot_set.all()
    return crawl, snapshots


@click.command()
@click.option(
    "--depth",
    "-d",
    type=click.Choice([str(i) for i in range(5)]),
    default="0",
    help="Recursively archive linked pages up to N hops away",
)
@click.option("--max-urls", type=int, default=0, help="Maximum number of URLs to snapshot for this crawl (0 = unlimited)")
@click.option("--crawl-max-size", default="0", help="Maximum total crawl size in bytes or units like 45mb / 1gb (0 = unlimited)")
@click.option("--crawl-timeout", type=int, default=0, help="Maximum total crawl runtime in seconds (0 = unlimited)")
@click.option("--snapshot-max-size", default="0", help="Maximum per-snapshot size in bytes or units like 45mb / 1gb (0 = unlimited)")
@click.option("--crawl-max-concurrent-snapshots", type=int, default=None, help="Maximum snapshots to archive concurrently within one crawl")
@click.option("--tag", "-t", default="", help="Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3")
@click.option("--url-allowlist", "--domain-allowlist", default="", help="Comma-separated URL/domain allowlist for this crawl")
@click.option("--url-denylist", "--domain-denylist", default="", help="Comma-separated URL/domain denylist for this crawl")
@click.option("--parser", default="auto", help="Parser for reading input URLs (auto, txt, html, rss, json, jsonl, netscape, ...)")
@click.option("--plugins", "-p", default="", help="Comma-separated list of plugins to run e.g. title,favicon,screenshot,singlefile,...")
@click.option("--persona", default="Default", help="Authentication profile to use when archiving")
@click.option(
    "--only-new/--no-only-new",
    "only_new",
    default=None,
    help="Skip URLs that already have a snapshot (default: inherit from ONLY_NEW config). "
    "Pass --no-only-new to force re-archive of URLs that already exist.",
)
@click.option("--index-only", is_flag=True, help="Just add the URLs to the index without archiving them now")
@click.option("--overwrite", is_flag=True, help="Re-archive URLs even if they already exist (alias for --no-only-new)")
@click.option("--update", is_flag=True, help="Re-archive URLs even if they already exist (alias for --no-only-new)")
@click.option("--bg", is_flag=True, help="Run archiving in background (queue work and return immediately)")
@click.argument("urls", nargs=-1, type=click.Path())
@docstring(add.__doc__)
def main(**kwargs):
    """Add a new URL or list of URLs to your archive"""
    # CLI entry point: gathers input from arguments or piped stdin, validates
    # the numeric/size options as click parameter errors, translates the
    # --only-new / --overwrite / --update flags into a config override, then
    # delegates to add().

    from archivebox.core.shutdown_util import foreground_parent_watchdog, foreground_shutdown_signals

    with foreground_shutdown_signals(), foreground_parent_watchdog():
        raw_urls = kwargs.pop("urls")
        if raw_urls:
            urls = _collect_input_urls(raw_urls, parser=kwargs.get("parser", "auto"))
        elif not sys.stdin.isatty():
            urls = sys.stdin.read()
        else:
            urls = []
        # Whitespace-only text (e.g. `echo "" | archivebox add`) carries no URLs;
        # reject it here instead of creating an empty Crawl. The text itself is
        # not stripped so a real submission stays byte-for-byte intact.
        if not urls or (isinstance(urls, str) and not urls.strip()):
            raise click.UsageError("No URLs provided. Pass URLs as arguments or via stdin.")
        if int(kwargs.get("max_urls") or 0) < 0:
            raise click.BadParameter("max_urls must be 0 or a positive integer.", param_hint="--max-urls")
        if int(kwargs.get("crawl_timeout") or 0) < 0:
            raise click.BadParameter("crawl_timeout must be 0 or a positive integer.", param_hint="--crawl-timeout")
        try:
            kwargs["crawl_max_size"] = parse_filesize_to_bytes(kwargs.get("crawl_max_size"))
        except ValueError as err:
            raise click.BadParameter(str(err), param_hint="--crawl-max-size") from err
        try:
            kwargs["snapshot_max_size"] = parse_filesize_to_bytes(kwargs.get("snapshot_max_size"))
        except ValueError as err:
            raise click.BadParameter(str(err), param_hint="--snapshot-max-size") from err
        if kwargs.get("crawl_max_concurrent_snapshots") is not None and int(kwargs["crawl_max_concurrent_snapshots"]) < 1:
            raise click.BadParameter("crawl_max_concurrent_snapshots must be at least 1.", param_hint="--crawl-max-concurrent-snapshots")

        # Translate --only-new/--no-only-new into a crawl config override.
        # add() takes config overrides as a dict; no per-flag kwargs.
        overwrite = kwargs.pop("overwrite", False)
        update = kwargs.pop("update", False)
        only_new = kwargs.pop("only_new", None)
        if overwrite or update:
            only_new = False
        if only_new is not None:
            kwargs["config"] = {"ONLY_NEW": bool(only_new)}

        add(urls=urls, **kwargs)


if __name__ == "__main__":
    main()
"""
archivebox archiveresult <action> [args...] [--filters]

Manage ArchiveResult records (plugin extraction results).

Actions:
    create - Create ArchiveResults for Snapshots (queue extractions)
    list - List ArchiveResults as JSONL (with optional filters)
    update - Update ArchiveResults from stdin JSONL
    delete - Delete ArchiveResults from stdin JSONL

Examples:
    # Create ArchiveResults for snapshots (queue for extraction)
    archivebox snapshot list --status=queued | archivebox archiveresult create
    archivebox archiveresult create --plugin=screenshot --snapshot-id=<uuid>

    # List with filters
    archivebox archiveresult list --status=failed
    archivebox archiveresult list --plugin=screenshot --status=succeeded

    # Update (reset failed extractions to queued)
    archivebox archiveresult list --status=failed | archivebox archiveresult update --status=queued

    # Delete
    archivebox archiveresult list --plugin=singlefile | archivebox archiveresult delete --yes

    # Re-run failed extractions
    archivebox archiveresult list --status=failed | archivebox run
"""

__package__ = "archivebox.cli"
__command__ = "archivebox archiveresult"

import sys

import rich_click as click
from rich import print as rprint

from archivebox.cli.cli_util import apply_filters


def build_archiveresult_request(snapshot_id: str, plugin: str, hook_name: str = "", status: str = "queued") -> dict:
    """Build one ArchiveResult request record as a plain dict.

    Args:
        snapshot_id: ID of the snapshot the request targets (stringified).
        plugin: Plugin name to run.
        hook_name: Optional hook name within the plugin.
        status: Status to put on the request.

    Returns:
        A dict with ``type`` set to ``"ArchiveResult"`` plus the given fields.
    """
    return {
        "type": "ArchiveResult",
        "snapshot_id": str(snapshot_id),
        "plugin": plugin,
        "hook_name": hook_name,
        "status": status,
    }


# =============================================================================
# CREATE
# =============================================================================


def create_archiveresults(
    snapshot_id: str | None = None,
    plugin: str | None = None,
    status: str = "queued",
) -> int:
    """
    Create ArchiveResult request records for Snapshots.

    Reads Snapshot records from stdin and emits ArchiveResult request JSONL.
    Pass-through: Non-Snapshot/ArchiveResult records are output unchanged.
    If --plugin is specified, only emits requests for that plugin.
    Otherwise, emits requests for all enabled snapshot hooks.

    Records are only written when stdout is not a TTY; the summary line on
    stderr is printed either way.

    Args:
        snapshot_id: When given, target this snapshot and do not read stdin.
        plugin: When given, emit a single request per snapshot for this plugin.
        status: Status placed on every emitted request.

    Exit codes:
        0: Success
        1: Failure
    """
    from archivebox.config.common import get_config
    from archivebox.plugins.hooks import discover_hooks
    from archivebox.misc.jsonl import read_stdin, write_record, TYPE_SNAPSHOT
    from archivebox.core.models import Snapshot

    is_tty = sys.stdout.isatty()

    # If snapshot_id provided directly, use that
    if snapshot_id:
        try:
            snapshots = [Snapshot.objects.get(id=snapshot_id)]
            pass_through_records = []
        except Snapshot.DoesNotExist:
            rprint(f"[red]Snapshot not found: {snapshot_id}[/red]", file=sys.stderr)
            return 1
    else:
        # Read from stdin
        records = list(read_stdin())
        if not records:
            rprint("[yellow]No Snapshot records provided via stdin[/yellow]", file=sys.stderr)
            return 1

        # Separate snapshot records from pass-through records
        snapshot_ids = []
        pass_through_records = []

        for record in records:
            record_type = record.get("type", "")

            if record_type:
                # Every typed record (Snapshot, ArchiveResult, Crawl, Tag, ...)
                # is passed through unchanged; Snapshot records additionally
                # contribute their id to the work list.
                pass_through_records.append(record)
                if record_type == TYPE_SNAPSHOT and record.get("id"):
                    snapshot_ids.append(record["id"])
            elif record.get("id"):
                # Untyped record with id - assume it's a snapshot ID
                snapshot_ids.append(record["id"])

        # Output pass-through records first
        if not is_tty:
            for record in pass_through_records:
                write_record(record)

        if not snapshot_ids:
            if pass_through_records:
                rprint(f"[dim]Passed through {len(pass_through_records)} records, no new snapshots to process[/dim]", file=sys.stderr)
                return 0
            rprint("[yellow]No valid Snapshot IDs in input[/yellow]", file=sys.stderr)
            return 1

        snapshots = list(Snapshot.objects.filter(id__in=snapshot_ids))

    if not snapshots:
        rprint("[yellow]No matching snapshots found[/yellow]", file=sys.stderr)
        return 0 if pass_through_records else 1

    created_count = 0
    for snapshot in snapshots:
        if plugin:
            if not is_tty:
                write_record(build_archiveresult_request(snapshot.id, plugin, status=status))
            created_count += 1
        else:
            # No plugin requested: one request per hook discovered for this snapshot's config.
            config = get_config(crawl=snapshot.crawl, snapshot=snapshot)
            hooks = discover_hooks("Snapshot", config=config)
            for hook_path in hooks:
                hook_name = hook_path.stem
                plugin_name = hook_path.parent.name
                if not is_tty:
                    write_record(build_archiveresult_request(snapshot.id, plugin_name, hook_name=hook_name, status=status))
                created_count += 1

    rprint(f"[green]Created {created_count} archive result request records[/green]", file=sys.stderr)
    return 0


# =============================================================================
# LIST
# =============================================================================


def list_archiveresults(
    status: str | None = None,
    plugin: str | None = None,
    snapshot_id: str | None = None,
    limit: int | None = None,
) -> int:
    """
    List ArchiveResults as JSONL with optional filters.

    On a TTY each result is printed as a coloured one-line summary; otherwise
    each result is written as a JSON record. A count is printed to stderr.

    Args:
        status: Filter value passed to apply_filters() as ``status``.
        plugin: Filter value passed to apply_filters() as ``plugin``.
        snapshot_id: Filter value passed to apply_filters() as ``snapshot_id``.
        limit: Passed to apply_filters() as ``limit``.

    Exit codes:
        0: Success (even if no results)
    """
    from archivebox.misc.jsonl import write_record
    from archivebox.core.models import ArchiveResult

    is_tty = sys.stdout.isatty()

    # select_related: the TTY rendering reads result.snapshot.url for every
    # row, which would otherwise cost one extra query per result.
    queryset = ArchiveResult.objects.select_related("snapshot").order_by("-start_ts")

    # Apply filters
    filter_kwargs = {
        "status": status,
        "plugin": plugin,
        "snapshot_id": snapshot_id,
    }
    queryset = apply_filters(queryset, filter_kwargs, limit=limit)

    # Built once rather than on every loop iteration.
    status_colors = {
        "queued": "yellow",
        "started": "blue",
        "succeeded": "green",
        "failed": "red",
        "skipped": "dim",
        "noresults": "dim",
        "backoff": "magenta",
    }

    count = 0
    for result in queryset:
        if is_tty:
            status_color = status_colors.get(result.status, "dim")
            rprint(
                f"[{status_color}]{result.status:10}[/{status_color}] {result.plugin:15} [dim]{result.id}[/dim] {result.snapshot.url[:40]}",
            )
        else:
            write_record(result.to_json())
        count += 1

    rprint(f"[dim]Listed {count} archive results[/dim]", file=sys.stderr)
    return 0


# =============================================================================
# UPDATE
# =============================================================================


def update_archiveresults(
    status: str | None = None,
) -> int:
    """
    Update ArchiveResults from stdin JSONL.

    Reads ArchiveResult records from stdin and applies updates.
    Uses PATCH semantics - only specified fields are updated.
    Records without an ``id`` are skipped; unknown ids are reported on stderr.

    Args:
        status: New status to set on every matched result (ignored when falsy).

    Exit codes:
        0: Success
        1: No input or error
    """
    from archivebox.misc.jsonl import read_stdin, write_record
    from archivebox.core.models import ArchiveResult

    is_tty = sys.stdout.isatty()

    records = list(read_stdin())
    if not records:
        rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
        return 1

    updated_count = 0
    for record in records:
        result_id = record.get("id")
        if not result_id:
            continue

        try:
            result = ArchiveResult.objects.get(id=result_id)

            # Apply updates from CLI flags
            if status:
                result.status = status

            result.save()
            updated_count += 1

            if not is_tty:
                write_record(result.to_json())

        except ArchiveResult.DoesNotExist:
            rprint(f"[yellow]ArchiveResult not found: {result_id}[/yellow]", file=sys.stderr)
            continue

    rprint(f"[green]Updated {updated_count} archive results[/green]", file=sys.stderr)
    return 0


# =============================================================================
# DELETE
# =============================================================================


def delete_archiveresults(yes: bool = False, dry_run: bool = False) -> int:
    """
    Delete ArchiveResults from stdin JSONL.

    Requires --yes flag to confirm deletion. A dry run is evaluated before the
    --yes check, so ``--dry-run`` alone previews without deleting.

    Args:
        yes: Confirm the deletion.
        dry_run: Only report what would be deleted (first 10 rows are listed).

    Exit codes:
        0: Success
        1: No input or missing --yes flag
    """
    from archivebox.misc.jsonl import read_stdin
    from archivebox.core.models import ArchiveResult

    records = list(read_stdin())
    if not records:
        rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
        return 1

    result_ids = [r.get("id") for r in records if r.get("id")]

    if not result_ids:
        rprint("[yellow]No valid archive result IDs in input[/yellow]", file=sys.stderr)
        return 1

    results = ArchiveResult.objects.filter(id__in=result_ids)
    count = results.count()

    if count == 0:
        rprint("[yellow]No matching archive results found[/yellow]", file=sys.stderr)
        return 0

    if dry_run:
        rprint(f"[yellow]Would delete {count} archive results (dry run)[/yellow]", file=sys.stderr)
        # select_related avoids one snapshot query per previewed row.
        for result in results.select_related("snapshot")[:10]:
            rprint(f" [dim]{result.id}[/dim] {result.plugin} {result.snapshot.url[:40]}", file=sys.stderr)
        if count > 10:
            rprint(f" ... and {count - 10} more", file=sys.stderr)
        return 0

    if not yes:
        rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
        return 1

    # Perform deletion
    deleted_count, _ = results.delete()
    rprint(f"[green]Deleted {deleted_count} archive results[/green]", file=sys.stderr)
    return 0


# =============================================================================
# CLI Commands
# =============================================================================
# The docstrings on the click commands below are left unchanged.


@click.group()
def main():
    """Manage ArchiveResult records (plugin extraction results)."""
    pass


@main.command("create")
@click.option("--snapshot-id", help="Snapshot ID to create results for")
@click.option("--plugin", "-p", help="Plugin name (e.g., screenshot, singlefile)")
@click.option("--status", "-s", default="queued", help="Initial status (default: queued)")
def create_cmd(snapshot_id: str | None, plugin: str | None, status: str):
    """Create ArchiveResults for Snapshots from stdin JSONL."""
    sys.exit(create_archiveresults(snapshot_id=snapshot_id, plugin=plugin, status=status))


@main.command("list")
@click.option("--status", "-s", help="Filter by status (queued, started, succeeded, failed, skipped)")
@click.option("--plugin", "-p", help="Filter by plugin name")
@click.option("--snapshot-id", help="Filter by snapshot ID")
@click.option("--limit", "-n", type=int, help="Limit number of results")
def list_cmd(
    status: str | None,
    plugin: str | None,
    snapshot_id: str | None,
    limit: int | None,
):
    """List ArchiveResults as JSONL."""
    sys.exit(
        list_archiveresults(
            status=status,
            plugin=plugin,
            snapshot_id=snapshot_id,
            limit=limit,
        ),
    )


@main.command("update")
@click.option("--status", "-s", help="Set status")
def update_cmd(status: str | None):
    """Update ArchiveResults from stdin JSONL."""
    sys.exit(update_archiveresults(status=status))


@main.command("delete")
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
def delete_cmd(yes: bool, dry_run: bool):
    """Delete ArchiveResults from stdin JSONL."""
    sys.exit(delete_archiveresults(yes=yes, dry_run=dry_run))


if __name__ == "__main__":
    main()
"""
archivebox binary <action> [args...] [--filters]

Manage Binary records (detected executables like chrome, wget, etc.).

Actions:
    create - Create/register a Binary
    list - List Binaries as JSONL (with optional filters)
    update - Update Binaries from stdin JSONL
    delete - Delete Binaries from stdin JSONL

Examples:
    # List all binaries
    archivebox binary list

    # List specific binary
    archivebox binary list --name=chrome

    # List binaries with specific version
    archivebox binary list --version__icontains=120

    # Delete old binary entries
    archivebox binary list --name=chrome | archivebox binary delete --yes
"""

__package__ = "archivebox.cli"
__command__ = "archivebox binary"

import sys

import rich_click as click
from rich import print as rprint

from archivebox.cli.cli_util import apply_filters


# =============================================================================
# CREATE
# =============================================================================


def create_binary(
    name: str,
    abspath: str,
    version: str = "",
) -> int:
    """
    Register a Binary for the current machine.

    The record is written to stdout only when stdout is not a TTY; a status
    line always goes to stderr. Any exception raised while registering is
    reported on stderr and turned into exit code 1.

    Exit codes:
        0: Success
        1: Failure
    """
    from archivebox.misc.jsonl import write_record
    from archivebox.machine.models import Binary

    stdout_is_tty = sys.stdout.isatty()

    if not (name and abspath):
        rprint("[red]Both --name and --abspath are required[/red]", file=sys.stderr)
        return 1

    try:
        from archivebox.machine.models import Machine

        # Look for an identical row first so the status line can distinguish
        # a fresh registration from a repeat.
        identity = {
            "machine": Machine.current(),
            "name": name,
            "abspath": abspath,
            "version": version,
        }
        already_registered = Binary.objects.filter(**identity).exists()

        # Mirror the Binary model lifecycle used elsewhere in the system so CLI
        # records are owned by the current machine and can be safely piped into
        # `archivebox run` without creating invalid rows missing machine_id.
        payload = {
            "name": name,
            "abspath": abspath,
            "version": version,
            "binproviders": "env",
            "binprovider": "env",
        }
        binary = Binary.from_json(payload)
        if binary is None:
            raise ValueError("failed to create binary record")

        if not stdout_is_tty:
            write_record(binary.to_json())

        if already_registered:
            rprint(f"[dim]Binary already exists: {name} at {abspath}[/dim]", file=sys.stderr)
        else:
            rprint(f"[green]Created binary: {name} at {abspath}[/green]", file=sys.stderr)

        return 0

    except Exception as e:
        rprint(f"[red]Error creating binary: {e}[/red]", file=sys.stderr)
        return 1


# =============================================================================
# LIST
# =============================================================================


def list_binaries(
    name: str | None = None,
    abspath__icontains: str | None = None,
    version__icontains: str | None = None,
    limit: int | None = None,
) -> int:
    """
    Print Binary records, optionally filtered.

    On a TTY each binary is shown as a one-line summary; otherwise each one is
    written as a JSON record. The number listed is reported on stderr.

    Exit codes:
        0: Success (even if no results)
    """
    from archivebox.misc.jsonl import write_record
    from archivebox.machine.models import Binary

    stdout_is_tty = sys.stdout.isatty()

    matches = apply_filters(
        Binary.objects.all().order_by("name", "-modified_at", "-created_at"),
        {
            "name": name,
            "abspath__icontains": abspath__icontains,
            "version__icontains": version__icontains,
        },
        limit=limit,
    )

    listed = 0  # stays 0 when the queryset is empty
    for listed, binary in enumerate(matches, start=1):
        if stdout_is_tty:
            rprint(f"[cyan]{binary.name:20}[/cyan] [dim]{binary.version:15}[/dim] {binary.abspath}")
        else:
            write_record(binary.to_json())

    rprint(f"[dim]Listed {listed} binaries[/dim]", file=sys.stderr)
    return 0


# =============================================================================
# UPDATE
# =============================================================================


def update_binaries(
    version: str | None = None,
    abspath: str | None = None,
) -> int:
    """
    Apply the given field values to every Binary named on stdin.

    Input is JSONL; records without an ``id`` are skipped and unknown ids are
    reported on stderr. Only truthy arguments are applied (PATCH semantics).

    Exit codes:
        0: Success
        1: No input or error
    """
    from archivebox.misc.jsonl import read_stdin, write_record
    from archivebox.machine.models import Binary

    stdout_is_tty = sys.stdout.isatty()

    incoming = list(read_stdin())
    if not incoming:
        rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
        return 1

    touched = 0
    for entry in incoming:
        binary_id = entry.get("id")
        if not binary_id:
            continue

        try:
            target = Binary.objects.get(id=binary_id)

            # Apply updates from CLI flags
            if version:
                target.version = version
            if abspath:
                target.abspath = abspath

            target.save()
            touched += 1

            if not stdout_is_tty:
                write_record(target.to_json())

        except Binary.DoesNotExist:
            rprint(f"[yellow]Binary not found: {binary_id}[/yellow]", file=sys.stderr)
            continue

    rprint(f"[green]Updated {touched} binaries[/green]", file=sys.stderr)
    return 0


# =============================================================================
# DELETE
# =============================================================================


def delete_binaries(yes: bool = False, dry_run: bool = False) -> int:
    """
    Delete every Binary named on stdin (JSONL).

    ``dry_run`` is checked before ``yes``, so a dry run previews the rows
    without needing confirmation; otherwise ``yes`` is required.

    Exit codes:
        0: Success
        1: No input or missing --yes flag
    """
    from archivebox.misc.jsonl import read_stdin
    from archivebox.machine.models import Binary

    incoming = list(read_stdin())
    if not incoming:
        rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
        return 1

    wanted_ids = [found for found in (entry.get("id") for entry in incoming) if found]
    if not wanted_ids:
        rprint("[yellow]No valid binary IDs in input[/yellow]", file=sys.stderr)
        return 1

    doomed = Binary.objects.filter(id__in=wanted_ids)
    total = doomed.count()

    if total == 0:
        rprint("[yellow]No matching binaries found[/yellow]", file=sys.stderr)
        return 0

    if dry_run:
        rprint(f"[yellow]Would delete {total} binaries (dry run)[/yellow]", file=sys.stderr)
        for binary in doomed:
            rprint(f" {binary.name} {binary.abspath}", file=sys.stderr)
        return 0

    if not yes:
        rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
        return 1

    # Perform deletion
    removed, _ = doomed.delete()
    rprint(f"[green]Deleted {removed} binaries[/green]", file=sys.stderr)
    return 0


# =============================================================================
# CLI Commands
# =============================================================================
# The docstrings on the click commands below are left unchanged.


@click.group()
def main():
    """Manage Binary records (detected executables)."""
    pass


@main.command("create")
@click.option("--name", "-n", required=True, help="Binary name (e.g., chrome, wget)")
@click.option("--abspath", "-p", required=True, help="Absolute path to binary")
@click.option("--version", "-v", default="", help="Binary version")
def create_cmd(name: str, abspath: str, version: str):
    """Create/register a Binary."""
    exit_code = create_binary(name=name, abspath=abspath, version=version)
    sys.exit(exit_code)


@main.command("list")
@click.option("--name", "-n", help="Filter by name")
@click.option("--abspath__icontains", help="Filter by path contains")
@click.option("--version__icontains", help="Filter by version contains")
@click.option("--limit", type=int, help="Limit number of results")
def list_cmd(
    name: str | None,
    abspath__icontains: str | None,
    version__icontains: str | None,
    limit: int | None,
):
    """List Binaries as JSONL."""
    exit_code = list_binaries(
        name=name,
        abspath__icontains=abspath__icontains,
        version__icontains=version__icontains,
        limit=limit,
    )
    sys.exit(exit_code)


@main.command("update")
@click.option("--version", "-v", help="Set version")
@click.option("--abspath", "-p", help="Set path")
def update_cmd(version: str | None, abspath: str | None):
    """Update Binaries from stdin JSONL."""
    exit_code = update_binaries(version=version, abspath=abspath)
    sys.exit(exit_code)


@main.command("delete")
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
def delete_cmd(yes: bool, dry_run: bool):
    """Delete Binaries from stdin JSONL."""
    exit_code = delete_binaries(yes=yes, dry_run=dry_run)
    sys.exit(exit_code)


if __name__ == "__main__":
    main()
check_data_folder + from archivebox.misc.logging_util import printable_config + from abx_plugins.plugins.base.utils import resolve_alias + from archivebox.config.collection import write_config_file + from archivebox.config import CONSTANTS_CONFIG + from archivebox.config.common import ArchiveBoxConfig, get_config, get_all_configs + from archivebox.plugins.discovery import discover_plugin_configs + + check_data_folder() + + FLAT_CONFIG = get_config().as_dict() + runtime_derived_keys = ArchiveBoxConfig.runtime_derived_config_keys() + readonly_config = {key: val for key, val in CONSTANTS_CONFIG.items() if key.isupper() and isinstance(val, Path)} + writable_config = {key: val for key, val in FLAT_CONFIG.items() if key not in runtime_derived_keys and key not in readonly_config} + readable_config = {**writable_config, **readonly_config} + CONFIGS = get_all_configs() + plugin_schemas = { + plugin_name: schema.get("properties", {}) for plugin_name, schema in discover_plugin_configs().items() if isinstance(schema, dict) + } + core_config_aliases = { + alias.upper(): field_name + for field_name, field in ArchiveBoxConfig.model_fields.items() + for alias in (field_name, str(field.alias or "")) + if alias + } + + config_options: list[str] = list(kwargs.pop("key=value", []) or keys or [f"{key}={val}" for key, val in kwargs.items()]) + no_args = not (get or set or reset or config_options) + + matching_config = {} + if search: + if config_options: + config_options = [ + core_config_aliases.get(key.upper().strip()) or resolve_alias(key.upper().strip(), plugin_schemas) for key in config_options + ] + matching_config = {key: readable_config[key] for key in config_options if key in readable_config} + for config_section in CONFIGS.values(): + aliases = {str(field.alias): field_name for field_name, field in type(config_section).model_fields.items() if field.alias} + + for search_key in config_options: + # search all aliases in the section + for alias_key, key in aliases.items(): + if 
key in readable_config and search_key.lower() in alias_key.lower(): + matching_config[key] = dict(config_section)[key] + + # search all keys and values in the section + for existing_key, value in dict(config_section).items(): + if existing_key in readable_config and ( + search_key.lower() in existing_key.lower() or search_key.lower() in str(value).lower() + ): + matching_config[existing_key] = value + for existing_key, value in readonly_config.items(): + if search_key.lower() in existing_key.lower() or search_key.lower() in str(value).lower(): + matching_config[existing_key] = value + + print(printable_config(matching_config)) + raise SystemExit(not matching_config) + + elif get or no_args: + if config_options: + config_options = [ + core_config_aliases.get(key.upper().strip()) or resolve_alias(key.upper().strip(), plugin_schemas) for key in config_options + ] + matching_config = {key: readable_config[key] for key in config_options if key in readable_config} + failed_config = [key for key in config_options if key not in readable_config] + if failed_config: + print("\n[red][X] These options failed to get[/red]") + print(" {}".format("\n ".join(config_options))) + raise SystemExit(1) + else: + matching_config = readable_config + + # Display core config sections + for config_section in CONFIGS.values(): + section_header = config_section.toml_section_header + if isinstance(section_header, str) and section_header: + print(f"[grey53]\\[{section_header}][/grey53]") + else: + print("[grey53]\\[CONSTANTS] # (read-only)[/grey53]") + + kv_in_section = {key: val for key, val in dict(config_section).items() if key in matching_config} + print(_format_toml(kv_in_section)) + print("[grey53]################################################################[/grey53]") + + readonly_keys = {key: val for key, val in readonly_config.items() if key in matching_config} + if readonly_keys: + print("[grey53]\\[CONSTANTS] # (read-only)[/grey53]") + print(_format_toml(readonly_keys)) + 
print("[grey53]################################################################[/grey53]") + + plugin_keys = {} + + # Collect all plugin config keys + for schema in plugin_schemas.values(): + for key in schema.keys(): + if key in matching_config and key in writable_config: + plugin_keys[key] = matching_config[key] + + # Display all plugin config in single [PLUGINS] section + if plugin_keys: + print("[grey53]\\[PLUGINS][/grey53]") + print(_format_toml(plugin_keys)) + print("[grey53]################################################################[/grey53]") + + raise SystemExit(not matching_config) + + elif set: + new_config = {} + failed_options = [] + for line in config_options: + if line.startswith("#") or not line.strip(): + continue + if "=" not in line: + print("[red][X] Config KEY=VALUE must have an = sign in it[/red]") + print(f" {line}") + raise SystemExit(2) + + raw_key, val = line.split("=", 1) + raw_key = raw_key.upper().strip() + key = core_config_aliases.get(raw_key) or resolve_alias(raw_key, plugin_schemas) + if key != raw_key: + print( + f"[yellow][i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.[/yellow]", + ) + + if key in writable_config: + new_config[key] = val.strip() + else: + failed_options.append(line) + + if new_config: + before = FLAT_CONFIG + matching_config = write_config_file(new_config) + after = get_config().as_dict() + print(printable_config(matching_config)) + + side_effect_changes = {} + for key, val in after.items(): + if key in FLAT_CONFIG and (str(before[key]) != str(after[key])) and (key not in matching_config): + side_effect_changes[key] = after[key] + + if side_effect_changes: + print(file=sys.stderr) + print("[yellow][i] Note: This change also affected these other options that depended on it:[/yellow]", file=sys.stderr) + print(" {}".format(printable_config(side_effect_changes, prefix=" ")), file=sys.stderr) + + if failed_options: + print() + print("[red][X] These options 
failed to set (check for typos):[/red]") + print(" {}".format("\n ".join(failed_options))) + raise SystemExit(1) + + elif reset: + print("[red][X] This command is not implemented yet.[/red]") + print(" Please manually remove the relevant lines from your config file:") + raise SystemExit(2) + + else: + print("[red][X] You must pass either --get or --set, or no arguments to get the whole config.[/red]") + print(" archivebox config") + print(" archivebox config --get SOME_KEY") + print(" archivebox config --set SOME_KEY=SOME_VALUE") + raise SystemExit(2) + + +@click.command() +@click.option("--search", is_flag=True, help="Search config KEYs, VALUEs, and ALIASES for the given term") +@click.option("--get", is_flag=True, help="Get the value for the given config KEYs") +@click.option("--set", is_flag=True, help="Set the given KEY=VALUE config values") +@click.option("--reset", is_flag=True, help="Reset the given KEY config values to their defaults") +@click.argument("KEY=VALUE", nargs=-1, type=str) +@docstring(config.__doc__) +def main(**kwargs) -> None: + config(**kwargs) + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_crawl.py b/archivebox/cli/archivebox_crawl.py new file mode 100644 index 0000000000..5e703c3ec4 --- /dev/null +++ b/archivebox/cli/archivebox_crawl.py @@ -0,0 +1,406 @@ +#!/usr/bin/env python3 + +""" +archivebox crawl <action> [args...] [--filters] + +Manage Crawl records. 
+ +Actions: + create - Create Crawl jobs from URLs + list - List Crawls as JSONL (with optional filters) + update - Update Crawls from stdin JSONL + delete - Delete Crawls from stdin JSONL + +Examples: + # Create + archivebox crawl create https://example.com https://foo.com --depth=1 + archivebox crawl create --tag=news https://example.com + + # List with filters + archivebox crawl list --status=queued + archivebox crawl list --urls__icontains=example.com + + # Update + archivebox crawl list --status=started | archivebox crawl update --status=queued + + # Delete + archivebox crawl list --urls__icontains=spam.com | archivebox crawl delete --yes + + # Full pipeline + archivebox crawl create https://example.com | archivebox snapshot create | archivebox run +""" + +__package__ = "archivebox.cli" +__command__ = "archivebox crawl" + +import sys +from collections.abc import Iterable + +import rich_click as click +from rich import print as rprint + +from archivebox.cli.cli_util import apply_filters + + +# ============================================================================= +# CREATE +# ============================================================================= + + +def create_crawl( + urls: Iterable[str], + depth: int = 0, + tag: str = "", + status: str = "queued", + created_by_id: int | None = None, +) -> int: + """ + Create a Crawl job from URLs. + + Takes URLs as args or stdin, creates one Crawl with all URLs, outputs JSONL. + Pass-through: Records that are not URLs are output unchanged (for piping). 
+ + Exit codes: + 0: Success + 1: Failure + """ + from archivebox.misc.jsonl import read_args_or_stdin, write_record, TYPE_CRAWL + from archivebox.misc.util import validate_url + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + + created_by_id = created_by_id or get_or_create_system_user_pk() + is_tty = sys.stdout.isatty() + + # Collect all input records + records = list(read_args_or_stdin(urls)) + + if not records: + rprint("[yellow]No URLs provided. Pass URLs as arguments or via stdin.[/yellow]", file=sys.stderr) + return 1 + + # Separate pass-through records from URL records + url_list = [] + pass_through_records = [] + + for record in records: + record_type = record.get("type", "") + + # Pass-through: output records that aren't URL/Crawl types + if record_type and record_type != TYPE_CRAWL and not record.get("url") and not record.get("urls"): + pass_through_records.append(record) + continue + + # Handle existing Crawl records (just pass through with id) + if record_type == TYPE_CRAWL and record.get("id"): + pass_through_records.append(record) + continue + + # Collect URLs + url = record.get("url") + if url: + try: + url_list.append(validate_url(str(url))) + except ValueError as err: + rprint(f"[red]Invalid URL: {err}[/red]", file=sys.stderr) + return 1 + + # Handle 'urls' field (newline-separated) + urls_field = record.get("urls") + if urls_field: + for line in urls_field.split("\n"): + line = line.strip() + if line and not line.startswith("#"): + try: + url_list.append(validate_url(line)) + except ValueError as err: + rprint(f"[red]Invalid URL: {err}[/red]", file=sys.stderr) + return 1 + + # Output pass-through records first + if not is_tty: + for record in pass_through_records: + write_record(record) + + if not url_list: + if pass_through_records: + # If we had pass-through records but no URLs, that's OK + rprint(f"[dim]Passed through {len(pass_through_records)} records, no new URLs[/dim]", 
def list_crawls(
    status: str | None = None,
    urls__icontains: str | None = None,
    max_depth: int | None = None,
    limit: int | None = None,
) -> int:
    """
    Emit Crawl records, newest first, narrowed by any filters that were given.

    When stdout is a terminal each crawl is printed as one coloured summary
    line; otherwise each crawl is passed to write_record() as its JSON form.
    A total is always reported on stderr.

    Exit codes:
        0: Success (even if no results)
    """
    from archivebox.misc.jsonl import write_record
    from archivebox.crawls.models import Crawl

    interactive = sys.stdout.isatty()
    colors_by_status = {
        "queued": "yellow",
        "started": "blue",
        "sealed": "green",
    }

    # Filtering and the optional limit are delegated to the shared CLI helper.
    crawls = apply_filters(
        Crawl.objects.all().order_by("-created_at"),
        {
            "status": status,
            "urls__icontains": urls__icontains,
            "max_depth": max_depth,
        },
        limit=limit,
    )

    listed = 0
    for listed, crawl in enumerate(crawls, start=1):
        if not interactive:
            write_record(crawl.to_json())
            continue

        color = colors_by_status.get(crawl.status, "dim")
        preview = crawl.urls[:50].replace("\n", " ")
        rprint(f"[{color}]{crawl.status:8}[/{color}] [dim]{crawl.id}[/dim] {preview}...")

    rprint(f"[dim]Listed {listed} crawls[/dim]", file=sys.stderr)
    return 0
def delete_crawls(yes: bool = False, dry_run: bool = False) -> int:
    """
    Delete Crawls from stdin JSONL.

    Reads records from stdin, collects their "id" values and deletes the
    matching Crawl rows. Requires --yes flag to confirm deletion; with
    dry_run the matching crawls are only listed.

    Args:
        yes: Must be True for anything to be deleted.
        dry_run: List what would be deleted and return without deleting
            (takes precedence over ``yes``).

    Exit codes:
        0: Success (including "nothing matched" and dry runs)
        1: No input or missing --yes flag
    """
    from archivebox.misc.jsonl import read_stdin
    from archivebox.crawls.models import Crawl

    records = list(read_stdin())
    if not records:
        rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
        return 1

    crawl_ids = [record_id for record in records if (record_id := record.get("id"))]

    if not crawl_ids:
        rprint("[yellow]No valid crawl IDs in input[/yellow]", file=sys.stderr)
        return 1

    crawls = Crawl.objects.filter(id__in=crawl_ids)
    count = crawls.count()

    if count == 0:
        rprint("[yellow]No matching crawls found[/yellow]", file=sys.stderr)
        return 0

    if dry_run:
        rprint(f"[yellow]Would delete {count} crawls (dry run)[/yellow]", file=sys.stderr)
        for crawl in crawls:
            url_preview = crawl.urls[:50].replace("\n", " ")
            rprint(f" [dim]{crawl.id}[/dim] {url_preview}...", file=sys.stderr)
        return 0

    if not yes:
        rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
        return 1

    # Perform deletion.
    # QuerySet.delete() returns (total rows deleted, {model label: rows}); the
    # total also counts rows removed through cascades, so report the Crawl
    # entry on its own instead of presenting the total as a number of crawls.
    deleted_total, deleted_by_model = crawls.delete()
    deleted_count = deleted_by_model.get(Crawl._meta.label, deleted_total)
    rprint(f"[green]Deleted {deleted_count} crawls[/green]", file=sys.stderr)

    related_count = deleted_total - deleted_count
    if related_count > 0:
        rprint(f"[dim]Also deleted {related_count} related records[/dim]", file=sys.stderr)
    return 0
+@click.option("--status", "-s", help="Filter by status (queued, started, sealed)") +@click.option("--urls__icontains", help="Filter by URLs contains") +@click.option("--max-depth", type=int, help="Filter by max depth") +@click.option("--limit", "-n", type=int, help="Limit number of results") +def list_cmd( + status: str | None, + urls__icontains: str | None, + max_depth: int | None, + limit: int | None, +): + """List Crawls as JSONL.""" + sys.exit( + list_crawls( + status=status, + urls__icontains=urls__icontains, + max_depth=max_depth, + limit=limit, + ), + ) + + +@main.command("update") +@click.option("--status", "-s", help="Set status") +@click.option("--max-depth", type=int, help="Set max depth") +def update_cmd(status: str | None, max_depth: int | None): + """Update Crawls from stdin JSONL.""" + sys.exit(update_crawls(status=status, max_depth=max_depth)) + + +@main.command("delete") +@click.option("--yes", "-y", is_flag=True, help="Confirm deletion") +@click.option("--dry-run", is_flag=True, help="Show what would be deleted") +def delete_cmd(yes: bool, dry_run: bool): + """Delete Crawls from stdin JSONL.""" + sys.exit(delete_crawls(yes=yes, dry_run=dry_run)) + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_extract.py b/archivebox/cli/archivebox_extract.py new file mode 100644 index 0000000000..aec80e60e6 --- /dev/null +++ b/archivebox/cli/archivebox_extract.py @@ -0,0 +1,470 @@ +#!/usr/bin/env python3 + +""" +archivebox extract [snapshot_ids...] [--plugins=NAMES] + +Run plugins on Snapshots. Accepts snapshot IDs as arguments, from stdin, or via JSONL. 
def process_archiveresult_by_id(archiveresult_id: str) -> int:
    """
    Re-run extraction for a single ArchiveResult by ID.

    ArchiveResults are projected status rows, not queued work items. Re-running
    a single result means resetting that row and queueing its parent snapshot
    through the shared crawl runner with the corresponding plugin selected.

    Args:
        archiveresult_id: Reference to the ArchiveResult, resolved through
            ``_uuid_ref_query("id", ...)``.

    Returns:
        0 when the result ends up succeeded, with no results, or in any other
        non-failed state; 1 when the row is not found, the crawl's processing
        lock cannot be claimed, the result is failed, or an exception is raised.
    """
    from rich import print as rprint
    from django.utils import timezone
    from archivebox.core.models import ArchiveResult
    from archivebox.api.v1_core import _uuid_ref_query
    from archivebox.services.runner import run_crawl

    try:
        archiveresult = ArchiveResult.objects.get(_uuid_ref_query("id", archiveresult_id))
    except ArchiveResult.DoesNotExist:
        rprint(f"[red]ArchiveResult {archiveresult_id} not found[/red]", file=sys.stderr)
        return 1

    rprint(f"[blue]Extracting {archiveresult.plugin} for {archiveresult.snapshot.url}[/blue]", file=sys.stderr)

    try:
        # Read the paused flag before the row is reset below.
        was_paused = archiveresult.snapshot.is_paused
        archiveresult.reset_for_retry()
        snapshot = archiveresult.snapshot
        if not was_paused:
            snapshot.queue_for_extraction()
        else:
            # A paused snapshot may still accept explicit maintenance for one
            # ArchiveResult, but this path must not transition it back to
            # queued/startable work. Guard: only set retry_at while the row is
            # still paused — concurrent resume would otherwise see a stale
            # retry_at marker.
            snapshot.safe_update(
                {"retry_at": timezone.now()},
                refresh=False,
                extra_filter={"status": snapshot.StatusChoices.PAUSED},
            )
        crawl = snapshot.crawl
        if not crawl.claim_processing_lock(lock_seconds=10):
            # NOTE(review): for a paused snapshot this returns with retry_at
            # already bumped and without restore_paused_scheduler_marker();
            # presumably the runner that owns the crawl restores it - confirm.
            rprint(
                f"[yellow]Crawl {crawl.id} is already owned by another runner[/yellow]",
                file=sys.stderr,
            )
            return 1

        try:
            run_crawl(str(snapshot.crawl_id), snapshot_ids=[str(snapshot.id)], selected_plugins=[archiveresult.plugin])
        finally:
            if was_paused:
                snapshot.restore_paused_scheduler_marker()
        archiveresult.refresh_from_db()

        # These messages carry rich markup, so they must go through rprint: the
        # module-level print here is the builtin and would emit the tags literally.
        if archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED:
            rprint(f"[green]Extraction succeeded: {archiveresult.output_str}[/green]")
            return 0
        elif archiveresult.status == ArchiveResult.StatusChoices.NORESULTS:
            rprint(f"[dim]Extraction completed with no results: {archiveresult.output_str}[/dim]")
            return 0
        elif archiveresult.status == ArchiveResult.StatusChoices.FAILED:
            rprint(f"[red]Extraction failed: {archiveresult.output_str}[/red]", file=sys.stderr)
            return 1
        else:
            # Still in progress or backoff - not a failure
            rprint(f"[yellow]Extraction status: {archiveresult.status}[/yellow]")
            return 0

    except Exception as e:
        # CLI boundary: report any failure as exit code 1 instead of a traceback.
        rprint(f"[red]Extraction error: {type(e).__name__}: {e}[/red]", file=sys.stderr)
        return 1
+ + Exit codes: + 0: Success + 1: Failure + """ + from rich import print as rprint + from django.utils import timezone + + from archivebox.misc.jsonl import ( + read_args_or_stdin, + write_record, + TYPE_SNAPSHOT, + TYPE_ARCHIVERESULT, + ) + from archivebox.core.models import Snapshot + from archivebox.core.models import ArchiveResult + from archivebox.services.runner import run_crawl + from abx_dl.models import discover_plugins + + is_tty = sys.stdout.isatty() + + # Parse comma-separated plugins list once (reused in creation and filtering) + plugins_list = [p.strip() for p in plugins.split(",") if p.strip()] if plugins else [] + + # Parse stdin/args exactly once per CLI invocation. + # `main()` may already have consumed stdin to distinguish Snapshot input from + # ArchiveResult IDs; if so, it must pass the parsed records through here + # instead of asking this helper to reread an already-drained pipe. + if records is None: + records = list(read_args_or_stdin(args)) + + if not records: + rprint("[yellow]No snapshots provided. 
Pass snapshot IDs as arguments or via stdin.[/yellow]", file=sys.stderr) + return 1 + + # Gather snapshot IDs and optional plugin constraints to process + snapshot_ids = set() + requested_plugins_by_snapshot: dict[str, set[str]] = defaultdict(set) + for record in records: + record_type = record.get("type") + + if record_type == TYPE_SNAPSHOT: + snapshot_id = record.get("id") + if snapshot_id: + snapshot_ids.add(str(snapshot_id)) + elif record.get("url"): + # Look up by URL (get most recent if multiple exist) + snap = Snapshot.objects.filter(url=record["url"]).order_by("-created_at").first() + if snap: + snapshot_ids.add(str(snap.id)) + else: + rprint(f"[yellow]Snapshot not found for URL: {record['url']}[/yellow]", file=sys.stderr) + + elif record_type == TYPE_ARCHIVERESULT: + snapshot_id = record.get("snapshot_id") + if snapshot_id: + snapshot_ids.add(str(snapshot_id)) + plugin_name = record.get("plugin") + if plugin_name and not plugins_list: + requested_plugins_by_snapshot[str(snapshot_id)].add(str(plugin_name)) + + elif "id" in record: + # Assume it's a snapshot ID + snapshot_ids.add(str(record["id"])) + + if not snapshot_ids: + rprint("[red]No valid snapshot IDs found in input[/red]", file=sys.stderr) + return 1 + + existing_snapshots = list(Snapshot.objects.filter(id__in=snapshot_ids).values_list("id", "crawl_id")) + existing_snapshot_ids = {str(snapshot_id) for snapshot_id, _crawl_id in existing_snapshots} + existing_crawl_ids = {str(crawl_id) for _snapshot_id, crawl_id in existing_snapshots} + missing_snapshot_ids = sorted(str(snapshot_id) for snapshot_id in snapshot_ids - existing_snapshot_ids) + for snapshot_id in missing_snapshot_ids: + rprint(f"[yellow]Snapshot {snapshot_id} not found[/yellow]", file=sys.stderr) + + # Queue only the target plugin rows. Bulk updates keep large reindex runs + # from doing one SELECT+UPDATE per snapshot/plugin before hooks even start. 
+ requested_pairs: set[tuple[str, str]] = set() + if plugins_list: + requested_pairs.update((snapshot_id, plugin_name) for snapshot_id, plugin_name in product(existing_snapshot_ids, plugins_list)) + else: + requested_pairs.update( + (snapshot_id, plugin_name) + for snapshot_id, plugin_names in requested_plugins_by_snapshot.items() + if snapshot_id in existing_snapshot_ids + for plugin_name in plugin_names + ) + plugins_by_name = discover_plugins(runtime="archivebox") + requested_rows: set[tuple[str, str, str]] = set() + for snapshot_id, plugin_name in requested_pairs: + plugin = plugins_by_name.get(plugin_name) + hooks = plugin.filter_hooks("Snapshot") if plugin is not None else [] + if hooks: + requested_rows.update((snapshot_id, plugin_name, hook.name) for hook in hooks) + else: + requested_rows.add((snapshot_id, plugin_name, "")) + + queued_rows: set[tuple[str, str, str]] = set() + if preserve_queued and requested_rows: + queued_rows = { + (str(snapshot_id), plugin_name, hook_name) + for snapshot_id, plugin_name, hook_name in ArchiveResult.objects.filter( + snapshot_id__in=existing_snapshot_ids, + plugin__in={plugin_name for _snapshot_id, plugin_name, _hook_name in requested_rows}, + status=ArchiveResult.StatusChoices.QUEUED, + ).values_list("snapshot_id", "plugin", "hook_name") + } + rows_to_queue = requested_rows - queued_rows + + reset_fields = { + "status": ArchiveResult.StatusChoices.QUEUED, + "output_str": "", + "output_json": None, + "output_files": {}, + "output_size": 0, + "output_mimetypes": "", + "start_ts": None, + "end_ts": None, + "modified_at": timezone.now(), + } + if rows_to_queue and plugins_list: + rows_to_reset_by_hook: dict[tuple[str, str], set[str]] = defaultdict(set) + for snapshot_id, plugin_name, hook_name in rows_to_queue: + rows_to_reset_by_hook[(plugin_name, hook_name)].add(snapshot_id) + for (plugin_name, hook_name), plugin_snapshot_ids in rows_to_reset_by_hook.items(): + 
ArchiveResult.objects.filter(snapshot_id__in=plugin_snapshot_ids, plugin=plugin_name, hook_name=hook_name).update( + **reset_fields, + ) + elif rows_to_queue and requested_plugins_by_snapshot: + snapshot_ids_by_hook: dict[tuple[str, str], set[str]] = defaultdict(set) + for snapshot_id, plugin_name, hook_name in rows_to_queue: + snapshot_ids_by_hook[(plugin_name, hook_name)].add(snapshot_id) + for (plugin_name, hook_name), plugin_snapshot_ids in snapshot_ids_by_hook.items(): + ArchiveResult.objects.filter(snapshot_id__in=plugin_snapshot_ids, plugin=plugin_name, hook_name=hook_name).update( + **reset_fields, + ) + existing_rows = ( + { + (str(snapshot_id), plugin_name, hook_name) + for snapshot_id, plugin_name, hook_name in ArchiveResult.objects.filter( + snapshot_id__in=existing_snapshot_ids, + plugin__in={plugin_name for _snapshot_id, plugin_name, _hook_name in rows_to_queue}, + ).values_list("snapshot_id", "plugin", "hook_name") + } + if rows_to_queue + else set() + ) + missing_rows = rows_to_queue - existing_rows + if missing_rows: + ArchiveResult.objects.bulk_create( + [ + ArchiveResult( + snapshot_id=snapshot_id, + plugin=plugin_name, + hook_name=hook_name, + status=ArchiveResult.StatusChoices.QUEUED, + ) + for snapshot_id, plugin_name, hook_name in sorted(missing_rows) + ], + batch_size=500, + ) + + processed_count = len(existing_snapshot_ids) + queue_at = timezone.now() + if existing_snapshot_ids: + if requested_rows: + # Targeted ArchiveResult retries use retry_at as the scheduling + # signal and keep sealed snapshots sealed so extractors are not + # re-run outside the explicitly queued plugin rows. Paused snapshots + # also keep status=paused here: `retry_at` only asks the orchestrator + # to process the queued plugin rows, and run_due_snapshot restores + # retry_at=MAX afterward instead of resuming the snapshot lifecycle. 
+ affected_snapshot_ids = {snapshot_id for snapshot_id, _plugin_name, _hook_name in rows_to_queue} + if preserve_queued and queued_rows: + queued_snapshot_ids = {snapshot_id for snapshot_id, _plugin_name, _hook_name in queued_rows} + affected_snapshot_ids.update( + str(snapshot_id) + for snapshot_id in Snapshot.objects.filter(id__in=queued_snapshot_ids) + .filter(retry_at__gt=queue_at) + .values_list("id", flat=True) + ) + affected_snapshot_ids.update( + str(snapshot_id) + for snapshot_id in Snapshot.objects.filter(id__in=queued_snapshot_ids, retry_at__isnull=True).values_list( + "id", + flat=True, + ) + ) + for snapshot in Snapshot.objects.filter(id__in=affected_snapshot_ids).only("id", "status", "modified_at"): + # Guard the read-time status so we never bump retry_at on a + # row that's been re-queued / started by a concurrent runner. + snapshot.safe_update( + {"retry_at": queue_at, "modified_at": queue_at}, + refresh=False, + extra_filter={"status": snapshot.status}, + ) + else: + # No plugin rows were requested, so this is a full snapshot retry. 
+ for snapshot in Snapshot.objects.filter(id__in=existing_snapshot_ids).only("id", "status", "retry_at", "modified_at"): + snapshot.safe_update( + { + "status": Snapshot.StatusChoices.QUEUED, + "retry_at": queue_at, + "current_step": 0, + "modified_at": queue_at, + }, + refresh=False, + extra_filter={"status": snapshot.status}, + ) + if existing_crawl_ids and not requested_rows: + from archivebox.crawls.models import Crawl + + for crawl in Crawl.objects.filter(id__in=existing_crawl_ids).only("id", "status", "retry_at", "modified_at"): + update_fields = { + "retry_at": queue_at, + "modified_at": queue_at, + } + if crawl.status != Crawl.StatusChoices.STARTED: + update_fields["status"] = Crawl.StatusChoices.QUEUED + crawl.safe_update( + update_fields, + refresh=False, + extra_filter={"status": crawl.status}, + ) + + if processed_count == 0: + rprint("[red]No snapshots to process[/red]", file=sys.stderr) + return 1 + + if show_progress: + rprint(f"[blue]Queued {processed_count} snapshots for extraction[/blue]", file=sys.stderr) + + # Run orchestrator if --wait (default) + if wait: + if show_progress: + rprint("[blue]Running plugins...[/blue]", file=sys.stderr) + snapshot_ids_by_crawl: dict[str, set[str]] = defaultdict(set) + for snapshot_id, crawl_id in existing_snapshots: + snapshot_ids_by_crawl[str(crawl_id)].add(str(snapshot_id)) + + for crawl_id, crawl_snapshot_ids in snapshot_ids_by_crawl.items(): + from archivebox.crawls.models import Crawl + + crawl = Crawl.objects.get(id=crawl_id) + if not crawl.claim_processing_lock(lock_seconds=10): + rprint( + f"[yellow]Crawl {crawl_id} is already owned by another runner[/yellow]", + file=sys.stderr, + ) + return 1 + selected_plugins = ( + plugins_list + or sorted( + {plugin for snapshot_id in crawl_snapshot_ids for plugin in requested_plugins_by_snapshot.get(str(snapshot_id), set())}, + ) + or None + ) + run_crawl( + crawl_id, + snapshot_ids=sorted(crawl_snapshot_ids), + selected_plugins=selected_plugins, + 
def is_archiveresult_id(value: str) -> bool:
    """Return True when *value* matches at least one stored ArchiveResult.

    The lookup condition is built by ``_uuid_ref_query("id", value)``; only
    the existence of a matching row is checked, no rows are loaded.
    """
    # Imported lazily, in the same order as before, so importing this CLI
    # module does not pull in the models at import time.
    from archivebox.core.models import ArchiveResult
    from archivebox.api.v1_core import _uuid_ref_query

    id_condition = _uuid_ref_query("id", value)
    matching_results = ArchiveResult.objects.filter(id_condition)
    return matching_results.exists()
Pass as arguments or via stdin.[/yellow]", file=sys.stderr) + sys.exit(1) + + # Check if input looks like existing ArchiveResult IDs to process + all_are_archiveresult_ids = all(is_archiveresult_id(r.get("id") or r.get("url", "")) for r in records) + + if all_are_archiveresult_ids: + # Process existing ArchiveResults by ID + from rich import print as rprint + + exit_code = 0 + for record in records: + archiveresult_id = record.get("id") or record.get("url") + if not isinstance(archiveresult_id, str): + rprint(f"[red]Invalid ArchiveResult input: {record}[/red]", file=sys.stderr) + exit_code = 1 + continue + result = process_archiveresult_by_id(archiveresult_id) + if result != 0: + exit_code = result + sys.exit(exit_code) + else: + # Default behavior: run plugins on Snapshots from input + sys.exit(run_plugins(args, records=records, plugins=plugins, wait=wait)) + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_help.py b/archivebox/cli/archivebox_help.py new file mode 100755 index 0000000000..390618c221 --- /dev/null +++ b/archivebox/cli/archivebox_help.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +__package__ = "archivebox.cli" +__command__ = "archivebox help" + +import ast +import importlib.util +import os +from pathlib import Path + +import click +from rich import print +from rich.panel import Panel + + +def _command_doc(cmd: str, import_path: str) -> str: + def first_doc_line(docstring: str | None) -> str: + lines = (docstring or "").splitlines() + return lines[0] if lines else "" + + modname, _ = import_path.rsplit(".", 1) + spec = importlib.util.find_spec(modname) + if spec is None or spec.origin is None: + return "" + + try: + tree = ast.parse(Path(spec.origin).read_text()) + except OSError: + return "" + + for name in (cmd, "main"): + for node in tree.body: + if isinstance(node, ast.FunctionDef) and node.name == name: + return first_doc_line(ast.get_docstring(node)) + + return first_doc_line(ast.get_docstring(tree)) + + +def help() 
-> None: + """Print the ArchiveBox help message and usage""" + + from archivebox.cli import ArchiveBoxGroup + from archivebox.config import CONSTANTS + from archivebox.config.common import get_config + from archivebox.config.permissions import IN_DOCKER + from archivebox.misc.logging_util import log_cli_command + + log_cli_command("help", [], None, ".") + + COMMANDS_HELP_TEXT = ( + "\n ".join(f"[green]{cmd.ljust(20)}[/green] {_command_doc(cmd, path)}" for cmd, path in ArchiveBoxGroup.meta_commands.items()) + + "\n\n " + + "\n ".join(f"[green]{cmd.ljust(20)}[/green] {_command_doc(cmd, path)}" for cmd, path in ArchiveBoxGroup.setup_commands.items()) + + "\n\n " + + "\n ".join( + f"[green]{cmd.ljust(20)}[/green] {_command_doc(cmd, path)}" for cmd, path in ArchiveBoxGroup.archive_commands.items() + ) + ) + + DOCKER_USAGE = ( + """ +[dodger_blue3]Docker Usage:[/dodger_blue3] + [grey53]# using Docker Compose:[/grey53] + [blue]docker compose run[/blue] [dark_green]archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53] + + [grey53]# using Docker:[/grey53] + [blue]docker run[/blue] -v [light_slate_blue]$PWD:/data[/light_slate_blue] [grey53]-p 8000:8000[/grey53] -it [dark_green]archivebox/archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53] +""" + if IN_DOCKER + else "" + ) + DOCKER_DOCS = ( + "\n [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Docker[/link]" + if IN_DOCKER + else "" + ) + DOCKER_OUTSIDE_HINT = "\n [grey53]# outside of Docker:[/grey53]" if IN_DOCKER else "" + DOCKER_CMD_PREFIX = "[blue]docker ... 
[/blue]" if IN_DOCKER else "" + + print(f"""{DOCKER_USAGE} +[deep_sky_blue4]Usage:[/deep_sky_blue4]{DOCKER_OUTSIDE_HINT} + [dark_green]archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53] + +[deep_sky_blue4]Commands:[/deep_sky_blue4] + {COMMANDS_HELP_TEXT} + +[deep_sky_blue4]Documentation:[/deep_sky_blue4] + [link=https://github.com/ArchiveBox/ArchiveBox/wiki]https://github.com/ArchiveBox/ArchiveBox/wiki[/link]{DOCKER_DOCS} + [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#cli-usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Usage[/link] + [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration]https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration[/link] +""") + + get_config() + if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and CONSTANTS.ARCHIVE_DIR.is_dir(): + pretty_out_dir = str(CONSTANTS.DATA_DIR).replace(str(Path("~").expanduser()), "~") + EXAMPLE_USAGE = f""" +[light_slate_blue]DATA DIR[/light_slate_blue]: [yellow]{pretty_out_dir}[/yellow] + +[violet]Hint:[/violet] [i]Common maintenance tasks:[/i] + [dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# make sure database is up-to-date (safe to run multiple times)[/grey53] + [dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# make sure plugins are up-to-date (wget, chrome, singlefile, etc.)[/grey53] + [dark_green]archivebox[/dark_green] [green]status[/green] [grey53]# get a health checkup report on your collection[/grey53] + [dark_green]archivebox[/dark_green] [green]update[/green] [grey53]# retry any previously failed or interrupted archiving tasks[/grey53] + +[violet]Hint:[/violet] [i]More example usage:[/i] + [dark_green]archivebox[/dark_green] [green]add[/green] --depth=1 "https://example.com/some/page" + [dark_green]archivebox[/dark_green] [green]list[/green] --sort=timestamp --csv=timestamp,downloaded_at,url,title + [dark_green]archivebox[/dark_green] 
[green]schedule[/green] --every=day --depth=1 "https://example.com/some/feed.rss" + [dark_green]archivebox[/dark_green] [green]server[/green] [blue]0.0.0.0:8000[/blue] [grey53]# Start the Web UI / API server[/grey53] +""" + print( + Panel( + EXAMPLE_USAGE, + expand=False, + border_style="grey53", + title="[green3]:white_check_mark: A collection [light_slate_blue]DATA DIR[/light_slate_blue] is currently active[/green3]", + subtitle="Commands run inside this dir will only apply to this collection.", + ), + ) + else: + DATA_SETUP_HELP = "\n" + if IN_DOCKER: + DATA_SETUP_HELP += "[violet]Hint:[/violet] When using Docker, you need to mount a volume to use as your data dir:\n" + DATA_SETUP_HELP += " docker run [violet]-v /some/path/data:/data[/violet] archivebox/archivebox ...\n\n" + DATA_SETUP_HELP += "To load an [dark_blue]existing[/dark_blue] collection:\n" + DATA_SETUP_HELP += " 1. [green]cd[/green] ~/archivebox/data [grey53]# go into existing [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n" + DATA_SETUP_HELP += f" 2. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]init[/green] [grey53]# migrate to latest version (safe to run multiple times)[/grey53]\n" + DATA_SETUP_HELP += f" 3. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]install[/green] [grey53]# auto-update all plugins (wget, chrome, singlefile, etc.)[/grey53]\n" + DATA_SETUP_HELP += f" 4. {DOCKER_CMD_PREFIX}[dark_green]archivebox[/dark_green] [green]help[/green] [grey53]# ...get help with next steps... [/grey53]\n\n" + DATA_SETUP_HELP += "To start a [sea_green1]new[/sea_green1] collection:\n" + DATA_SETUP_HELP += " 1. [green]mkdir[/green] ~/archivebox/data [grey53]# create a new, empty [light_slate_blue]DATA DIR[/light_slate_blue] (can be anywhere)[/grey53]\n" + DATA_SETUP_HELP += " 2. [green]cd[/green] ~/archivebox/data [grey53]# cd into the new directory[/grey53]\n" + DATA_SETUP_HELP += f" 3. 
def _display_data_path(path: Path, data_dir: Path) -> str:
    """Format *path* for display, relative to *data_dir* when it lies inside it.

    Both arguments are resolved to absolute paths first. A path inside the
    data dir is rendered as ``./<relative path>``; any other path is
    rendered as its resolved absolute form.
    """
    resolved_target = Path(path).resolve()
    resolved_root = Path(data_dir).resolve()

    try:
        inside_root = resolved_target.relative_to(resolved_root)
    except ValueError:
        # relative_to() raises when the target is not under the data dir.
        return str(resolved_target)

    return f"./{inside_root}"
archivebox.misc.db import apply_migrations + from archivebox.misc.checks import check_migrations + + config = get_config() + + # if os.access(out_dir / CONSTANTS.JSON_INDEX_FILENAME, os.F_OK): + # print("[red]:warning: This folder contains a JSON index. It is deprecated, and will no longer be kept up to date automatically.[/red]", file=sys.stderr) + # print("[red] You can run `archivebox list --json --with-headers > static_index.json` to manually generate it.[/red]", file=sys.stderr) + + is_empty = not len(set(os.listdir(CONSTANTS.DATA_DIR)) - CONSTANTS.ALLOWED_IN_DATA_DIR) + existing_index = os.path.isfile(CONSTANTS.DATABASE_FILE) + if is_empty and not existing_index: + print(f"[turquoise4][+] Initializing a new ArchiveBox v{VERSION} collection...[/turquoise4]") + print("[green]----------------------------------------------------------------------[/green]") + elif existing_index: + # TODO: properly detect and print the existing version in current index as well + print(f"[green][*] Verifying and updating existing ArchiveBox collection to v{VERSION}...[/green]") + print("[green]----------------------------------------------------------------------[/green]") + else: + if force: + print("[red][!] 
This folder appears to already have files in it, but no index.sqlite3 is present.[/red]") + print("[red] Because --force was passed, ArchiveBox will initialize anyway (which may overwrite existing files).[/red]") + else: + print( + "[red][X] This folder appears to already have files in it, but no index.sqlite3 present.[/red]\n\n" + " You must run init in a completely empty directory, or an existing data folder.\n\n" + " [violet]Hint:[/violet] To import an existing data folder make sure to cd into the folder first, \n" + " then run and run 'archivebox init' to pick up where you left off.\n\n" + " (Always make sure your data folder is backed up first before updating ArchiveBox)", + ) + raise SystemExit(2) + + if existing_index: + print("\n[green][*] Verifying archive folder structure...[/green]") + else: + print("\n[green][+] Building archive folder structure...[/green]") + + archive_path = _display_data_path(CONSTANTS.ARCHIVE_DIR, CONSTANTS.DATA_DIR) + sources_path = _display_data_path(CONSTANTS.SOURCES_DIR, CONSTANTS.DATA_DIR) + logs_path = _display_data_path(CONSTANTS.LOGS_DIR, CONSTANTS.DATA_DIR) + print(f" + {archive_path}, {sources_path}, {logs_path}...") + Path(CONSTANTS.SOURCES_DIR).mkdir(exist_ok=True) + CONSTANTS.ARCHIVE_DIR.mkdir(parents=True, exist_ok=True) + CONSTANTS.USERS_DIR.mkdir(parents=True, exist_ok=True) + Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True) + for path in (Path(CONSTANTS.SOURCES_DIR), CONSTANTS.ARCHIVE_DIR, CONSTANTS.USERS_DIR, Path(CONSTANTS.LOGS_DIR)): + path.chmod(int(config.OUTPUT_PERMISSIONS, base=8) | 0o111) + + print(f" + {_display_data_path(CONSTANTS.CONFIG_FILE, CONSTANTS.DATA_DIR)}...") + + # create the .archivebox_id file with a unique ID for this collection + from archivebox.config.paths import _get_collection_id + + _get_collection_id(CONSTANTS.DATA_DIR, force_create=True) + + # create the ArchiveBox.conf file + write_config_file({"SECRET_KEY": config.SECRET_KEY}) + + if os.access(CONSTANTS.DATABASE_FILE, os.F_OK): + 
print("\n[green][*] Verifying main SQL index and running any migrations needed...[/green]") + else: + print("\n[green][+] Building main SQL index and running initial migrations...[/green]") + + from archivebox.config.django import setup_django + + setup_django() + previous_wants_init = os.environ.get("ARCHIVEBOX_WANTS_INIT") + os.environ["ARCHIVEBOX_WANTS_INIT"] = "1" + try: + check_migrations(blocking=True, auto_apply=False) + + for migration_line in apply_migrations(CONSTANTS.DATA_DIR): + sys.stdout.write(f" {migration_line}\n") + finally: + if previous_wants_init is None: + os.environ.pop("ARCHIVEBOX_WANTS_INIT", None) + else: + os.environ["ARCHIVEBOX_WANTS_INIT"] = previous_wants_init + + assert os.path.isfile(CONSTANTS.DATABASE_FILE) and os.access(CONSTANTS.DATABASE_FILE, os.R_OK) + print() + print(f" โˆš {_display_data_path(CONSTANTS.DATABASE_FILE, CONSTANTS.DATA_DIR)}") + + # from django.contrib.auth.models import User + # call_command("createsuperuser", interactive=True) + + print() + print("[dodger_blue3][*] Checking links from indexes and archive folders (safe to Ctrl+C)...[/dodger_blue3]") + + from archivebox.core.models import Snapshot + + snapshot_count = 0 + + if existing_index: + snapshot_count = Snapshot.objects.count() + print(f" โˆš Loaded {snapshot_count} links from existing main index.") + + print(" > Skipping orphan snapshot import during init.") + print() + print(" [violet]Hint:[/violet] To import orphaned snapshot directories and reconcile filesystem state, run:") + print(" archivebox update") + + print("\n[green]----------------------------------------------------------------------[/green]") + + from django.contrib.auth.models import User + + config = get_config() + if (config.ADMIN_USERNAME and config.ADMIN_PASSWORD) and not User.objects.filter( + username=config.ADMIN_USERNAME, + ).exists(): + print("[green][+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.[/green]") + 
User.objects.create_superuser(username=config.ADMIN_USERNAME, password=config.ADMIN_PASSWORD) + + if existing_index: + print("[green][โˆš] Done. Verified and updated the existing ArchiveBox collection.[/green]") + else: + print(f"[green][โˆš] Done. A new ArchiveBox collection was initialized ({snapshot_count} links).[/green]") + + CONSTANTS.PERSONAS_DIR.mkdir(parents=True, exist_ok=True) + CONSTANTS.DEFAULT_TMP_DIR.mkdir(parents=True, exist_ok=True) + + from archivebox.config.paths import get_or_create_working_tmp_dir, get_or_create_working_lib_dir + + config = get_config() + config.TMP_DIR.mkdir(parents=True, exist_ok=True) + config.LIB_DIR.mkdir(parents=True, exist_ok=True) + (config.LIB_DIR / "bin").mkdir(parents=True, exist_ok=True) + + working_tmp_dir = get_or_create_working_tmp_dir(autofix=True, quiet=True) + if working_tmp_dir: + working_tmp_dir.mkdir(parents=True, exist_ok=True) + + working_lib_dir = get_or_create_working_lib_dir(autofix=True, quiet=True) + if working_lib_dir: + working_lib_dir.mkdir(parents=True, exist_ok=True) + + if install: + from archivebox.cli.archivebox_install import install as install_method + + install_method() + + if Snapshot.objects.count() < 25: # hide the hints for experienced users + print() + print(" [violet]Hint:[/violet] To view your archive index, run:") + print( + " archivebox server # then visit [deep_sky_blue4][link=http://127.0.0.1:8000]http://127.0.0.1:8000[/link][/deep_sky_blue4]", + ) + print() + print(" To add new links, you can run:") + print(" archivebox add < ~/some/path/to/list_of_links.txt") + print() + print(" For more usage and examples, run:") + print(" archivebox help") + + +@click.command() +@click.option("--force", "-f", is_flag=True, help="Ignore unrecognized files in current directory and initialize anyway") +@click.option("--quick", "-q", is_flag=True, help="Run any updates or migrations without rechecking all snapshot dirs") +@click.option("--install", "-s", is_flag=True, help="Automatically install 
dependencies and extras used for archiving") +@docstring(init.__doc__) +def main(**kwargs) -> None: + init(**kwargs) + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_install.py b/archivebox/cli/archivebox_install.py new file mode 100755 index 0000000000..b0d592f220 --- /dev/null +++ b/archivebox/cli/archivebox_install.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 + +__package__ = "archivebox.cli" + +import os + +import rich_click as click +from rich import print + +from archivebox.misc.util import docstring, enforce_types + + +@enforce_types +def install(binaries: tuple[str, ...] = (), binproviders: str = "*", dry_run: bool = False) -> None: + """Detect and install ArchiveBox dependencies by running the abx-dl install flow + + Examples: + archivebox install # Install all dependencies + archivebox install wget curl # Install only wget and curl + archivebox install --binproviders=pip yt-dlp # Install yt-dlp using only pip + archivebox install --binproviders=brew,apt # Install all deps using only brew or apt + """ + + from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP + from archivebox.config import CONSTANTS + from archivebox.misc.logging import stderr + from archivebox.cli.archivebox_init import init + + archive_dir = CONSTANTS.ARCHIVE_DIR + + if dry_run: + print("[dim]Dry run - would detect ArchiveBox dependencies and run the abx-dl install flow[/dim]") + return + + if not (os.access(archive_dir, os.R_OK) and archive_dir.is_dir()): + init() # must init full index because we need a db to store Binary entries in + + # Show what we're installing + if binaries: + print(f"\n[green][+] Installing specific binaries: {', '.join(binaries)}[/green]") + else: + print("\n[green][+] Detecting and installing all ArchiveBox dependencies...[/green]") + + if binproviders != "*": + print(f"[green][+] Using providers: {binproviders}[/green]") + + if IS_ROOT: + EUID = os.geteuid() + print() + print(f"[yellow]:warning: 
Running as UID=[blue]{EUID}[/blue].[/yellow]") + print(f" DATA_DIR will be owned by [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue].") + print() + + # Set up Django + from archivebox.config.django import setup_django + + setup_django() + + plugin_names = list(binaries) + if binproviders != "*": + plugin_names.extend(provider.strip() for provider in binproviders.split(",") if provider.strip()) + + print("[+] Running installer via abx-dl bus...") + print() + + from archivebox.services.runner import run_install + + run_install(plugin_names=plugin_names or None) + + print() + + # Check for superuser + from django.contrib.auth import get_user_model + + User = get_user_model() + + if not User.objects.filter(is_superuser=True).exclude(username="system").exists(): + stderr("\n[+] Don't forget to create a new admin user for the Web UI...", color="green") + stderr(" archivebox manage createsuperuser") + + print() + + # Show version to display full status including installed binaries + # Django is already loaded, so just import and call the function directly + from archivebox.cli.archivebox_version import version as show_version + + show_version(quiet=False) + + +@click.command() +@click.argument("binaries", nargs=-1, type=str, required=False) +@click.option( + "--binproviders", + "-p", + default="*", + help="Comma-separated list of providers to use (pip,npm,brew,apt,env,custom) or * for all", + show_default=True, +) +@click.option("--dry-run", "-d", is_flag=True, help="Show what would happen without actually running", default=False) +@docstring(install.__doc__) +def main(**kwargs) -> None: + install(**kwargs) + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_list.py b/archivebox/cli/archivebox_list.py new file mode 100644 index 0000000000..1a1fe2642c --- /dev/null +++ b/archivebox/cli/archivebox_list.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python3 + +__package__ = "archivebox.cli" +__command__ = "archivebox list" + +import sys + +import rich_click 
as click + +from archivebox.cli.archivebox_snapshot import list_snapshots, snapshot_filter_options, snapshot_output_options + + +@click.command() +@snapshot_output_options +@snapshot_filter_options(default_filter_type="substring") +def main(**kwargs) -> None: + """List Snapshots.""" + sys.exit(list_snapshots(**kwargs)) + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_machine.py b/archivebox/cli/archivebox_machine.py new file mode 100644 index 0000000000..405f0ccb7a --- /dev/null +++ b/archivebox/cli/archivebox_machine.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python3 + +""" +archivebox machine <action> [--filters] + +Manage Machine records (system-managed, mostly read-only). + +Machine records track the host machines where ArchiveBox runs. +They are created automatically by the system and are primarily for debugging. + +Actions: + list - List Machines as JSONL (with optional filters) + +Examples: + # List all machines + archivebox machine list + + # List machines by hostname + archivebox machine list --hostname__icontains=myserver +""" + +__package__ = "archivebox.cli" +__command__ = "archivebox machine" + +import sys + +import rich_click as click +from rich import print as rprint + +from archivebox.cli.cli_util import apply_filters + + +# ============================================================================= +# LIST +# ============================================================================= + + +def list_machines( + hostname__icontains: str | None = None, + os_platform: str | None = None, + limit: int | None = None, +) -> int: + """ + List Machines as JSONL with optional filters. 
+ + Exit codes: + 0: Success (even if no results) + """ + from archivebox.misc.jsonl import write_record + from archivebox.machine.models import Machine + + is_tty = sys.stdout.isatty() + + queryset = Machine.objects.all().order_by("-created_at") + + # Apply filters + filter_kwargs = { + "hostname__icontains": hostname__icontains, + "os_platform": os_platform, + } + queryset = apply_filters(queryset, filter_kwargs, limit=limit) + + count = 0 + for machine in queryset: + if is_tty: + rprint(f"[cyan]{machine.hostname:30}[/cyan] [dim]{machine.os_platform:10}[/dim] {machine.id}") + else: + write_record(machine.to_json()) + count += 1 + + rprint(f"[dim]Listed {count} machines[/dim]", file=sys.stderr) + return 0 + + +# ============================================================================= +# CLI Commands +# ============================================================================= + + +@click.group() +def main(): + """Manage Machine records (read-only, system-managed).""" + pass + + +@main.command("list") +@click.option("--hostname__icontains", help="Filter by hostname contains") +@click.option("--os-platform", help="Filter by OS platform") +@click.option("--limit", "-n", type=int, help="Limit number of results") +def list_cmd(hostname__icontains: str | None, os_platform: str | None, limit: int | None): + """List Machines as JSONL.""" + sys.exit( + list_machines( + hostname__icontains=hostname__icontains, + os_platform=os_platform, + limit=limit, + ), + ) + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_manage.py b/archivebox/cli/archivebox_manage.py new file mode 100644 index 0000000000..491f23b962 --- /dev/null +++ b/archivebox/cli/archivebox_manage.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 + +__package__ = "archivebox.cli" + +import rich_click as click +from archivebox.misc.util import docstring, enforce_types + + +@enforce_types +def manage(args: list[str] | None = None) -> None: + """Run an ArchiveBox Django management 
command""" + + from archivebox.config.common import get_config + from archivebox.misc.logging import stderr + + config = get_config() + if (args and "createsuperuser" in args) and (config.IN_DOCKER and not config.IS_TTY): + stderr("[!] Warning: you need to pass -it to use interactive commands in docker", color="lightyellow") + stderr(" docker run -it archivebox manage {}".format(" ".join(args or ["..."])), color="lightyellow") + stderr("") + + from django.core.management import execute_from_command_line + + execute_from_command_line(["manage.py", *(args or ["help"])]) + + +@click.command(add_help_option=False, context_settings=dict(ignore_unknown_options=True)) +@click.argument("args", nargs=-1) +@docstring(manage.__doc__) +def main(args: list[str] | None = None) -> None: + manage(args=args) + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_mcp.py b/archivebox/cli/archivebox_mcp.py new file mode 100644 index 0000000000..cbc2ba19e1 --- /dev/null +++ b/archivebox/cli/archivebox_mcp.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python3 +""" +archivebox mcp + +Start the Model Context Protocol (MCP) server in stdio mode. +Exposes all ArchiveBox CLI commands as MCP tools for AI agents. +""" + +__package__ = "archivebox.cli" +__command__ = "archivebox mcp" + +import rich_click as click + +from archivebox.misc.util import docstring, enforce_types + + +@enforce_types +def mcp(): + """ + Start the MCP server in stdio mode for AI agent control. + + The MCP (Model Context Protocol) server exposes all ArchiveBox CLI commands + as tools that AI agents can discover and execute. It communicates via JSON-RPC + 2.0 over stdin/stdout. 
+ + Example usage with an MCP client: + archivebox mcp < requests.jsonl > responses.jsonl + + Or interactively: + archivebox mcp + {"jsonrpc":"2.0","id":1,"method":"initialize","params":{}} + {"jsonrpc":"2.0","id":2,"method":"tools/list","params":{}} + """ + + from archivebox.mcp.server import run_mcp_server + + # Run the stdio server (blocks until stdin closes) + run_mcp_server() + + +@click.command() +@docstring(mcp.__doc__) +def main(**kwargs): + """Start the MCP server in stdio mode""" + mcp() + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_persona.py b/archivebox/cli/archivebox_persona.py new file mode 100644 index 0000000000..8d82759fd4 --- /dev/null +++ b/archivebox/cli/archivebox_persona.py @@ -0,0 +1,642 @@ +#!/usr/bin/env python3 + +""" +archivebox persona <action> [args...] [--filters] + +Manage Persona records (browser profiles for archiving). + +Actions: + create - Create Personas + list - List Personas as JSONL (with optional filters) + update - Update Personas from stdin JSONL + delete - Delete Personas from stdin JSONL + +Examples: + # Create a new persona + archivebox persona create work + archivebox persona create --import=chrome personal + archivebox persona create --import=edge work + + # List all personas + archivebox persona list + + # Delete a persona + archivebox persona list --name=old | archivebox persona delete --yes +""" + +__package__ = "archivebox.cli" +__command__ = "archivebox persona" + +import os +import sys +import shutil +import platform +from pathlib import Path +from collections.abc import Iterable + +import rich_click as click +from rich import print as rprint + +from archivebox.cli.cli_util import apply_filters +from archivebox.personas import importers as persona_importers + + +# ============================================================================= +# Browser Profile Locations +# ============================================================================= + + +def 
get_chrome_user_data_dir() -> Path | None: + """Get the default Chrome user data directory for the current platform.""" + system = platform.system() + home = Path.home() + + if system == "Darwin": # macOS + candidates = [ + home / "Library" / "Application Support" / "Google" / "Chrome", + home / "Library" / "Application Support" / "Chromium", + ] + elif system == "Linux": + candidates = [ + home / ".config" / "google-chrome", + home / ".config" / "chromium", + home / ".config" / "chrome", + home / "snap" / "chromium" / "common" / "chromium", + ] + elif system == "Windows": + local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local")) + candidates = [ + local_app_data / "Google" / "Chrome" / "User Data", + local_app_data / "Chromium" / "User Data", + ] + else: + candidates = [] + + for candidate in candidates: + if candidate.exists() and (candidate / "Default").exists(): + return candidate + + return None + + +def get_brave_user_data_dir() -> Path | None: + """Get the default Brave user data directory for the current platform.""" + system = platform.system() + home = Path.home() + + if system == "Darwin": + candidates = [ + home / "Library" / "Application Support" / "BraveSoftware" / "Brave-Browser", + ] + elif system == "Linux": + candidates = [ + home / ".config" / "BraveSoftware" / "Brave-Browser", + ] + elif system == "Windows": + local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local")) + candidates = [ + local_app_data / "BraveSoftware" / "Brave-Browser" / "User Data", + ] + else: + candidates = [] + + for candidate in candidates: + if candidate.exists() and (candidate / "Default").exists(): + return candidate + + return None + + +def get_edge_user_data_dir() -> Path | None: + """Get the default Edge user data directory for the current platform.""" + system = platform.system() + home = Path.home() + + if system == "Darwin": + candidates = [ + home / "Library" / "Application Support" / "Microsoft Edge", + ] + elif 
system == "Linux": + candidates = [ + home / ".config" / "microsoft-edge", + home / ".config" / "microsoft-edge-beta", + home / ".config" / "microsoft-edge-dev", + ] + elif system == "Windows": + local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local")) + candidates = [ + local_app_data / "Microsoft" / "Edge" / "User Data", + ] + else: + candidates = [] + + for candidate in candidates: + if candidate.exists() and (candidate / "Default").exists(): + return candidate + + return None + + +def get_browser_binary(browser: str) -> str | None: + system = platform.system() + home = Path.home() + browser = browser.lower() + + if system == "Darwin": + candidates = { + "chrome": ["/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"], + "chromium": ["/Applications/Chromium.app/Contents/MacOS/Chromium"], + "brave": ["/Applications/Brave Browser.app/Contents/MacOS/Brave Browser"], + "edge": ["/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge"], + }.get(browser, []) + elif system == "Linux": + candidates = { + "chrome": [ + "/usr/bin/google-chrome", + "/usr/bin/google-chrome-stable", + "/usr/bin/google-chrome-beta", + "/usr/bin/google-chrome-unstable", + ], + "chromium": ["/usr/bin/chromium", "/usr/bin/chromium-browser"], + "brave": ["/usr/bin/brave-browser", "/usr/bin/brave-browser-beta", "/usr/bin/brave-browser-nightly"], + "edge": [ + "/usr/bin/microsoft-edge", + "/usr/bin/microsoft-edge-stable", + "/usr/bin/microsoft-edge-beta", + "/usr/bin/microsoft-edge-dev", + ], + }.get(browser, []) + elif system == "Windows": + local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local")) + candidates = { + "chrome": [ + str(local_app_data / "Google" / "Chrome" / "Application" / "chrome.exe"), + "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", + "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe", + ], + "chromium": [str(local_app_data / "Chromium" / "Application" / "chrome.exe")], + "brave": [ 
+ str(local_app_data / "BraveSoftware" / "Brave-Browser" / "Application" / "brave.exe"), + "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe", + "C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe", + ], + "edge": [ + str(local_app_data / "Microsoft" / "Edge" / "Application" / "msedge.exe"), + "C:\\Program Files\\Microsoft\\Edge\\Application\\msedge.exe", + "C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe", + ], + }.get(browser, []) + else: + candidates = [] + + for candidate in candidates: + if candidate and Path(candidate).exists(): + return candidate + + return None + + +BROWSER_PROFILE_FINDERS = { + "chrome": get_chrome_user_data_dir, + "chromium": get_chrome_user_data_dir, # Same locations + "brave": get_brave_user_data_dir, + "edge": get_edge_user_data_dir, +} + +CHROMIUM_BROWSERS = {"chrome", "chromium", "brave", "edge"} + + +# ============================================================================= +# Cookie Extraction via CDP +# ============================================================================= + +# ============================================================================= +# Validation Helpers +# ============================================================================= + + +def validate_persona_name(name: str) -> tuple[bool, str]: + """ + Validate persona name to prevent path traversal attacks. + + Returns: + (is_valid, error_message): tuple indicating if name is valid + """ + if not name or not name.strip(): + return False, "Persona name cannot be empty" + + # Check for path separators + if "/" in name or "\\" in name: + return False, "Persona name cannot contain path separators (/ or \\)" + + # Check for parent directory references + if ".." 
in name: + return False, "Persona name cannot contain parent directory references (..)" + + # Check for hidden files/directories + if name.startswith("."): + return False, "Persona name cannot start with a dot (.)" + + # Ensure name doesn't contain null bytes or other dangerous chars + if "\x00" in name or "\n" in name or "\r" in name: + return False, "Persona name contains invalid characters" + + return True, "" + + +def ensure_path_within_personas_dir(persona_path: Path) -> bool: + """ + Verify that a persona path is within PERSONAS_DIR. + + This is a safety check to prevent path traversal attacks where + a malicious persona name could cause operations on paths outside + the expected PERSONAS_DIR. + + Returns: + True if path is safe, False otherwise + """ + from archivebox.config.constants import CONSTANTS + + try: + # Resolve both paths to absolute paths + personas_dir = CONSTANTS.PERSONAS_DIR.resolve() + resolved_path = persona_path.resolve() + + # Check if resolved_path is a child of personas_dir + return resolved_path.is_relative_to(personas_dir) + except (ValueError, RuntimeError): + return False + + +# ============================================================================= +# CREATE +# ============================================================================= + + +def create_personas( + names: Iterable[str], + import_from: str | None = None, + profile: str | None = None, +) -> int: + """ + Create Personas from names. + + If --import is specified, copy the browser profile to the persona directory + and extract cookies. + + Exit codes: + 0: Success + 1: Failure + """ + from archivebox.misc.jsonl import write_record + from archivebox.personas.models import Persona + + is_tty = sys.stdout.isatty() + name_list = list(names) if names else [] + + if not name_list: + rprint("[yellow]No persona names provided. 
Pass names as arguments.[/yellow]", file=sys.stderr) + return 1 + + # Validate import source if specified + source_profile_dir = None + if import_from: + import_from = import_from.lower() + if import_from not in BROWSER_PROFILE_FINDERS: + rprint(f"[red]Unknown browser: {import_from}[/red]", file=sys.stderr) + rprint(f"[dim]Supported browsers: {', '.join(BROWSER_PROFILE_FINDERS.keys())}[/dim]", file=sys.stderr) + return 1 + + source_profile_dir = BROWSER_PROFILE_FINDERS[import_from]() + if not source_profile_dir: + rprint(f"[red]Could not find {import_from} profile directory[/red]", file=sys.stderr) + return 1 + + rprint(f"[dim]Found {import_from} profile: {source_profile_dir}[/dim]", file=sys.stderr) + + if profile is None and (source_profile_dir / "Default").exists(): + profile = "Default" + + browser_binary = get_browser_binary(import_from) + if browser_binary: + rprint(f"[dim]Using {import_from} binary: {browser_binary}[/dim]", file=sys.stderr) + + created_count = 0 + for name in name_list: + name = name.strip() + if not name: + continue + + # Validate persona name to prevent path traversal + is_valid, error_msg = persona_importers.validate_persona_name(name) + if not is_valid: + rprint(f'[red]Invalid persona name "{name}": {error_msg}[/red]', file=sys.stderr) + continue + + persona, created = Persona.objects.get_or_create(name=name) + + if created: + persona.ensure_dirs() + created_count += 1 + rprint(f"[green]Created persona: {name}[/green]", file=sys.stderr) + else: + rprint(f"[dim]Persona already exists: {name}[/dim]", file=sys.stderr) + + cookies_file = Path(persona.path) / "cookies.txt" + + # Import browser profile if requested + if import_from in CHROMIUM_BROWSERS and source_profile_dir is not None: + try: + import_source = persona_importers.resolve_browser_import_source(import_from, profile_dir=profile) + import_result = persona_importers.import_persona_from_source( + persona, + import_source, + copy_profile=True, + import_cookies=True, + 
capture_storage=False, + ) + except Exception as e: + rprint(f"[red]Failed to import browser profile: {e}[/red]", file=sys.stderr) + return 1 + + if import_result.profile_copied: + rprint("[green]Copied browser profile to persona[/green]", file=sys.stderr) + if import_result.cookies_imported: + rprint(f"[green]Extracted cookies to {cookies_file}[/green]", file=sys.stderr) + elif not import_result.profile_copied: + rprint("[yellow]Could not import cookies automatically.[/yellow]", file=sys.stderr) + + for warning in import_result.warnings: + rprint(f"[yellow]{warning}[/yellow]", file=sys.stderr) + + if not is_tty: + write_record( + { + "id": str(persona.id), + "name": persona.name, + "path": str(persona.path), + "CHROME_USER_DATA_DIR": persona.CHROME_USER_DATA_DIR, + "COOKIES_FILE": persona.COOKIES_FILE, + }, + ) + + rprint(f"[green]Created {created_count} new persona(s)[/green]", file=sys.stderr) + return 0 + + +# ============================================================================= +# LIST +# ============================================================================= + + +def list_personas( + name: str | None = None, + name__icontains: str | None = None, + limit: int | None = None, +) -> int: + """ + List Personas as JSONL with optional filters. 
+ + Exit codes: + 0: Success (even if no results) + """ + from archivebox.misc.jsonl import write_record + from archivebox.personas.models import Persona + + is_tty = sys.stdout.isatty() + + queryset = Persona.objects.all().order_by("name") + + # Apply filters + filter_kwargs = { + "name": name, + "name__icontains": name__icontains, + } + queryset = apply_filters(queryset, filter_kwargs, limit=limit) + + count = 0 + for persona in queryset: + cookies_status = "[green]โœ“[/green]" if persona.COOKIES_FILE else "[dim]โœ—[/dim]" + chrome_status = "[green]โœ“[/green]" if Path(persona.CHROME_USER_DATA_DIR).exists() else "[dim]โœ—[/dim]" + + if is_tty: + rprint(f"[cyan]{persona.name:20}[/cyan] cookies:{cookies_status} chrome:{chrome_status} [dim]{persona.path}[/dim]") + else: + write_record( + { + "id": str(persona.id), + "name": persona.name, + "path": str(persona.path), + "CHROME_USER_DATA_DIR": persona.CHROME_USER_DATA_DIR, + "COOKIES_FILE": persona.COOKIES_FILE, + }, + ) + count += 1 + + rprint(f"[dim]Listed {count} persona(s)[/dim]", file=sys.stderr) + return 0 + + +# ============================================================================= +# UPDATE +# ============================================================================= + + +def update_personas(name: str | None = None) -> int: + """ + Update Personas from stdin JSONL. + + Reads Persona records from stdin and applies updates. + Uses PATCH semantics - only specified fields are updated. 
+ + Exit codes: + 0: Success + 1: No input or error + """ + from archivebox.misc.jsonl import read_stdin, write_record + from archivebox.personas.models import Persona + + is_tty = sys.stdout.isatty() + + records = list(read_stdin()) + if not records: + rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr) + return 1 + + updated_count = 0 + for record in records: + persona_id = record.get("id") + old_name = record.get("name") + + if not persona_id and not old_name: + continue + + try: + if persona_id: + persona = Persona.objects.get(id=persona_id) + else: + persona = Persona.objects.get(name=old_name) + + # Apply updates from CLI flags + if name: + # Validate new name to prevent path traversal + is_valid, error_msg = persona_importers.validate_persona_name(name) + if not is_valid: + rprint(f'[red]Invalid new persona name "{name}": {error_msg}[/red]', file=sys.stderr) + continue + + # Rename the persona directory too + old_path = persona.path + persona.name = name + new_path = persona.path + + if old_path.exists() and old_path != new_path: + shutil.move(str(old_path), str(new_path)) + + persona.save() + + updated_count += 1 + + if not is_tty: + write_record( + { + "id": str(persona.id), + "name": persona.name, + "path": str(persona.path), + }, + ) + + except Persona.DoesNotExist: + rprint(f"[yellow]Persona not found: {persona_id or old_name}[/yellow]", file=sys.stderr) + continue + + rprint(f"[green]Updated {updated_count} persona(s)[/green]", file=sys.stderr) + return 0 + + +# ============================================================================= +# DELETE +# ============================================================================= + + +def delete_personas(yes: bool = False, dry_run: bool = False) -> int: + """ + Delete Personas from stdin JSONL. + + Requires --yes flag to confirm deletion. 
+ + Exit codes: + 0: Success + 1: No input or missing --yes flag + """ + from archivebox.misc.jsonl import read_stdin + from archivebox.personas.models import Persona + + records = list(read_stdin()) + if not records: + rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr) + return 1 + + # Collect persona IDs or names + persona_ids = [] + persona_names = [] + for r in records: + if r.get("id"): + persona_ids.append(r["id"]) + elif r.get("name"): + persona_names.append(r["name"]) + + if not persona_ids and not persona_names: + rprint("[yellow]No valid persona IDs or names in input[/yellow]", file=sys.stderr) + return 1 + + from django.db.models import Q + + query = Q() + if persona_ids: + query |= Q(id__in=persona_ids) + if persona_names: + query |= Q(name__in=persona_names) + + personas = Persona.objects.filter(query) + count = personas.count() + + if count == 0: + rprint("[yellow]No matching personas found[/yellow]", file=sys.stderr) + return 0 + + if dry_run: + rprint(f"[yellow]Would delete {count} persona(s) (dry run)[/yellow]", file=sys.stderr) + for persona in personas: + rprint(f" {persona.name} ({persona.path})", file=sys.stderr) + return 0 + + if not yes: + rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr) + return 1 + + # Delete persona directories and database records + deleted_count = 0 + for persona in personas: + persona_path = persona.path + + # Safety check: ensure path is within PERSONAS_DIR before deletion + if not ensure_path_within_personas_dir(persona_path): + rprint(f'[red]Security error: persona path "{persona_path}" is outside PERSONAS_DIR. 
Skipping deletion.[/red]', file=sys.stderr) + continue + + if persona_path.exists(): + shutil.rmtree(persona_path) + persona.delete() + deleted_count += 1 + + rprint(f"[green]Deleted {deleted_count} persona(s)[/green]", file=sys.stderr) + return 0 + + +# ============================================================================= +# CLI Commands +# ============================================================================= + + +@click.group() +def main(): + """Manage Persona records (browser profiles).""" + pass + + +@main.command("create") +@click.argument("names", nargs=-1) +@click.option("--import", "import_from", help="Import profile from browser (chrome, chromium, brave, edge)") +@click.option("--profile", help="Profile directory name under the user data dir (e.g. Default, Profile 1)") +def create_cmd(names: tuple, import_from: str | None, profile: str | None): + """Create Personas, optionally importing from a browser profile.""" + sys.exit(create_personas(names, import_from=import_from, profile=profile)) + + +@main.command("list") +@click.option("--name", help="Filter by exact name") +@click.option("--name__icontains", help="Filter by name contains") +@click.option("--limit", "-n", type=int, help="Limit number of results") +def list_cmd(name: str | None, name__icontains: str | None, limit: int | None): + """List Personas as JSONL.""" + sys.exit(list_personas(name=name, name__icontains=name__icontains, limit=limit)) + + +@main.command("update") +@click.option("--name", "-n", help="Set new name") +def update_cmd(name: str | None): + """Update Personas from stdin JSONL.""" + sys.exit(update_personas(name=name)) + + +@main.command("delete") +@click.option("--yes", "-y", is_flag=True, help="Confirm deletion") +@click.option("--dry-run", is_flag=True, help="Show what would be deleted") +def delete_cmd(yes: bool, dry_run: bool): + """Delete Personas from stdin JSONL.""" + sys.exit(delete_personas(yes=yes, dry_run=dry_run)) + + +if __name__ == "__main__": + main() 
diff --git a/archivebox/cli/archivebox_pluginmap.py b/archivebox/cli/archivebox_pluginmap.py new file mode 100644 index 0000000000..e63c1272ff --- /dev/null +++ b/archivebox/cli/archivebox_pluginmap.py @@ -0,0 +1,233 @@ +#!/usr/bin/env python3 + +__package__ = "archivebox.cli" + + +import rich_click as click + +from archivebox.misc.util import docstring, enforce_types + + +EVENT_FLOW_DIAGRAM = """ +โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” +โ”‚ ArchiveBox / abx-dl Flow โ”‚ +โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค +โ”‚ โ”‚ +โ”‚ InstallEvent โ”‚ +โ”‚ โ””โ”€ config.json > required_binaries โ”‚ +โ”‚ โ””โ”€ BinaryRequestEvent โ”‚ +โ”‚ โ””โ”€ abxpkg BinaryService builtin providers โ”‚ +โ”‚ โ””โ”€ BinaryEvent โ”‚ +โ”‚ โ””โ”€ BinaryCacheService / project cache backend โ”‚ +โ”‚ โ”‚ +โ”‚ CrawlEvent โ”‚ +โ”‚ โ””โ”€ CrawlSetupEvent โ”‚ +โ”‚ โ””โ”€ on_CrawlSetup__* โ”‚ +โ”‚ โ”‚ +โ”‚ CrawlStartEvent โ”‚ +โ”‚ โ””โ”€ SnapshotEvent โ”‚ +โ”‚ โ””โ”€ on_Snapshot__* โ”‚ +โ”‚ โ””โ”€ ArchiveResult / Snapshot / Tag โ”‚ +โ”‚ โ”‚ +โ”‚ SnapshotCleanupEvent -> internal cleanup, no direct hook family โ”‚ +โ”‚ CrawlCleanupEvent -> internal cleanup, no direct hook family โ”‚ +โ”‚ โ”‚ +โ”‚ ArchiveBox projects bus events into the DB; it no longer drives plugin โ”‚ +โ”‚ execution through the old queued model executor. 
โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ +""" + + +@enforce_types +def pluginmap( + show_disabled: bool = False, + event: str | None = None, + quiet: bool = False, +) -> dict: + """ + Show the current abx-dl event phases and their associated plugin hooks. + + This command reflects the new bus-driven runtime, not the legacy ArchiveBox + state-machine executor. Event names are normalized to hook prefixes by + stripping a trailing `Event`, then ArchiveBox checks whether any matching + `on_{EventFamily}__*` scripts actually exist. + """ + from rich.console import Console + from rich.table import Table + from rich.panel import Panel + from rich import box + + from archivebox.plugins.hooks import ( + discover_hooks, + is_background_hook, + normalize_hook_event_name, + ) + from archivebox.plugins.discovery import ( + BUILTIN_PLUGINS_DIR, + USER_PLUGINS_DIR, + ) + + console = Console() + prnt = console.print + + event_phases = { + "InstallEvent": { + "description": "Pre-run dependency phase. Enabled plugins emit BinaryRequest events from config.json required_binaries.", + "emits": ["BinaryRequestEvent", "BinaryEvent", "ProcessEvent"], + }, + "BinaryRequestEvent": { + "description": "Binary resolution phase. abxpkg BinaryService resolves or installs requested binaries using built-in providers.", + "emits": ["BinaryEvent", "ProcessEvent"], + "direct_hooks": False, + }, + "BinaryEvent": { + "description": "Resolved binary metadata event. 
ArchiveBoxBinaryService projects it into the ArchiveBox DB binary cache.", + "emits": [], + "direct_hooks": False, + }, + "CrawlEvent": { + "description": "Root crawl lifecycle event emitted by the runner.", + "emits": ["CrawlSetupEvent", "CrawlStartEvent", "CrawlCleanupEvent", "CrawlCompletedEvent"], + }, + "CrawlSetupEvent": { + "description": "Crawl-scoped setup phase. on_CrawlSetup hooks launch/configure shared daemons and runtime state.", + "emits": ["ProcessEvent"], + }, + "SnapshotEvent": { + "description": "Per-snapshot extraction phase. on_Snapshot hooks emit ArchiveResult, Snapshot, and Tag records.", + "emits": ["ArchiveResultEvent", "SnapshotEvent", "TagEvent", "ProcessEvent"], + }, + "SnapshotCleanupEvent": { + "description": "Internal snapshot cleanup phase.", + "emits": ["ProcessKillEvent"], + }, + "CrawlCleanupEvent": { + "description": "Internal crawl cleanup phase.", + "emits": ["ProcessKillEvent"], + }, + } + + if event: + requested = str(event).strip() + if requested in event_phases: + event_phases = {requested: event_phases[requested]} + else: + normalized_requested = normalize_hook_event_name(requested) + matched_name = next((name for name in event_phases if normalize_hook_event_name(name) == normalized_requested), None) + if matched_name is None: + prnt(f'[red]Error: Unknown event "{requested}". 
Available: {", ".join(event_phases.keys())}[/red]') + return {} + event_phases = {matched_name: event_phases[matched_name]} + + result = { + "events": {}, + "plugins_dir": str(BUILTIN_PLUGINS_DIR), + "user_plugins_dir": str(USER_PLUGINS_DIR), + } + + if not quiet: + prnt() + prnt("[bold cyan]ArchiveBox Plugin Map[/bold cyan]") + prnt(f"[dim]Built-in plugins: {BUILTIN_PLUGINS_DIR}[/dim]") + prnt(f"[dim]User plugins: {USER_PLUGINS_DIR}[/dim]") + prnt() + prnt( + Panel( + EVENT_FLOW_DIAGRAM, + title="[bold green]Event Flow[/bold green]", + border_style="green", + expand=False, + ), + ) + prnt() + + for event_name, info in event_phases.items(): + hook_event = normalize_hook_event_name(event_name) + has_direct_hooks = info.get("direct_hooks", True) + hooks = discover_hooks(event_name, filter_disabled=not show_disabled) if has_direct_hooks else [] + + hook_infos = [] + for hook_path in hooks: + plugin_name = hook_path.parent.name + hook_infos.append( + { + "path": str(hook_path), + "name": hook_path.name, + "plugin": plugin_name, + "is_background": is_background_hook(hook_path.name), + "extension": hook_path.suffix, + }, + ) + + result["events"][event_name] = { + "description": info["description"], + "hook_event": hook_event, + "emits": info["emits"], + "hooks": hook_infos, + "hook_count": len(hook_infos), + } + + if quiet: + continue + + title_suffix = f" -> on_{hook_event}__*" if hook_infos else "" + table = Table( + title=f"[bold yellow]{event_name}[/bold yellow]{title_suffix} ({len(hooks)} hooks)", + box=box.ROUNDED, + show_header=True, + header_style="bold magenta", + ) + table.add_column("Plugin", style="cyan", width=20) + table.add_column("Hook Name", style="green") + table.add_column("BG", justify="center", width=4) + table.add_column("Type", justify="center", width=5) + + if hook_infos: + for hook in sorted(hook_infos, key=lambda h: h["name"]): + bg_marker = "[yellow]bg[/yellow]" if hook["is_background"] else "" + table.add_row( + hook["plugin"], + hook["name"], 
+ bg_marker, + hook["extension"].lstrip("."), + ) + else: + table.add_row("[dim]-[/dim]", "[dim]No direct hooks[/dim]", "", "") + + prnt(table) + prnt(f"[dim]{info['description']}[/dim]") + if info["emits"]: + prnt(f"[dim]Emits: {', '.join(info['emits'])}[/dim]") + if not hook_infos and has_direct_hooks: + prnt(f"[dim]No direct on_{hook_event}__* scripts are currently defined for this event family.[/dim]") + elif not has_direct_hooks: + prnt("[dim]No direct plugin hook family. This event is handled by services.[/dim]") + prnt() + + if not quiet: + total_hooks = sum(event_info["hook_count"] for event_info in result["events"].values()) + prnt(f"[bold]Total hooks discovered: {total_hooks}[/bold]") + prnt() + prnt("[dim]Hook naming convention: on_{EventFamily}__{XX}_{description}[.bg].{ext}[/dim]") + prnt("[dim]Event names are normalized with a simple `Event` suffix strip before hook discovery.[/dim]") + prnt("[dim]If no `on_{EventFamily}__*` scripts exist, the event is shown as having no direct hooks.[/dim]") + prnt() + + return result + + +@click.command() +@click.option("--show-disabled", "-a", is_flag=True, help="Show hooks from disabled plugins too") +@click.option("--event", "-e", type=str, default=None, help="Filter to specific event (e.g. InstallEvent, SnapshotEvent)") +@click.option("--quiet", "-q", is_flag=True, help="Output JSON only, no tables") +@docstring(pluginmap.__doc__) +def main(**kwargs): + import json + + result = pluginmap(**kwargs) + if kwargs.get("quiet"): + print(json.dumps(result, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_process.py b/archivebox/cli/archivebox_process.py new file mode 100644 index 0000000000..4a94a365a1 --- /dev/null +++ b/archivebox/cli/archivebox_process.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 + +""" +archivebox process <action> [--filters] + +Manage Process records (system-managed, mostly read-only). + +Process records track executions of binaries during extraction. 
+They are created automatically by the system and are primarily for debugging. + +Actions: + list - List Processes as JSONL (with optional filters) + +Examples: + # List all processes + archivebox process list + + # List processes by binary + archivebox process list --binary-name=chrome + + # List recent processes + archivebox process list --limit=10 +""" + +__package__ = "archivebox.cli" +__command__ = "archivebox process" + +import sys + +import rich_click as click +from rich import print as rprint + +from archivebox.cli.cli_util import apply_filters + + +# ============================================================================= +# LIST +# ============================================================================= + + +def list_processes( + binary_name: str | None = None, + machine_id: str | None = None, + limit: int | None = None, +) -> int: + """ + List Processes as JSONL with optional filters. + + Exit codes: + 0: Success (even if no results) + """ + from archivebox.misc.jsonl import write_record + from archivebox.machine.models import Process + + is_tty = sys.stdout.isatty() + + queryset = Process.objects.all().select_related("binary", "machine").order_by("-started_at", "-created_at") + + # Apply filters + filter_kwargs = {} + if binary_name: + filter_kwargs["binary__name"] = binary_name + if machine_id: + filter_kwargs["machine_id"] = machine_id + + queryset = apply_filters(queryset, filter_kwargs, limit=limit) + + count = 0 + for process in queryset: + if is_tty: + binary_name_str = process.binary.name if process.binary else "unknown" + exit_code = process.exit_code if process.exit_code is not None else "?" 
+ status_color = "green" if process.exit_code == 0 else "red" if process.exit_code else "yellow" + rprint(f"[{status_color}]exit={exit_code:3}[/{status_color}] [cyan]{binary_name_str:15}[/cyan] [dim]{process.id}[/dim]") + else: + write_record(process.to_json()) + count += 1 + + rprint(f"[dim]Listed {count} processes[/dim]", file=sys.stderr) + return 0 + + +# ============================================================================= +# CLI Commands +# ============================================================================= + + +@click.group() +def main(): + """Manage Process records (read-only, system-managed).""" + pass + + +@main.command("list") +@click.option("--binary-name", "-b", help="Filter by binary name") +@click.option("--machine-id", "-m", help="Filter by machine ID") +@click.option("--limit", "-n", type=int, help="Limit number of results") +def list_cmd(binary_name: str | None, machine_id: str | None, limit: int | None): + """List Processes as JSONL.""" + sys.exit( + list_processes( + binary_name=binary_name, + machine_id=machine_id, + limit=limit, + ), + ) + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_remove.py b/archivebox/cli/archivebox_remove.py new file mode 100644 index 0000000000..791c69e68d --- /dev/null +++ b/archivebox/cli/archivebox_remove.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python3 + +__package__ = "archivebox.cli" +__command__ = "archivebox remove" + +import time +from pathlib import Path +from collections.abc import Iterable + +import rich_click as click + +from django.db import OperationalError +from django.db.models import QuerySet + +from archivebox.config import CONSTANTS +from archivebox.config.django import setup_django +from archivebox.misc.util import enforce_types, docstring +from archivebox.misc.checks import check_data_folder +from archivebox.misc.logging_util import ( + log_list_started, + log_list_finished, + log_removal_started, + log_removal_finished, + TimedProgress, +) +from 
archivebox.cli.archivebox_snapshot import snapshot_filter_options + + +@enforce_types +def remove( + filter_patterns: Iterable[str] = (), + filter_type: str = "exact", + snapshots: QuerySet | None = None, + after: float | None = None, + before: float | None = None, + yes: bool = False, + out_dir: Path = CONSTANTS.DATA_DIR, + timeout: float | None = None, + **kwargs, +) -> dict[str, object]: + """Remove the specified URLs from the archive""" + + setup_django() + check_data_folder() + timeout = float(timeout) if timeout is not None else None + + from archivebox.core.models import Snapshot + + filter_kwargs = { + **kwargs, + "filter_patterns": filter_patterns, + "filter_type": filter_type, + "after": after, + "before": before, + } + pattern_list = list(filter_patterns) + + log_list_started(pattern_list or None, filter_type) + timer = TimedProgress(360, prefix=" ") + try: + if snapshots is None: + snapshots = Snapshot.objects.order_by("-created_at").search(**filter_kwargs) + # Freeze the target set up-front so a concurrent daemon writing new + # snapshots can't extend the deletion under us, and so the cursor isn't + # held open across the per-row deletes below. + snapshot_pks = list(snapshots.values_list("pk", flat=True)) + finally: + timer.end() + + if not snapshot_pks: + log_removal_finished(0, 0) + raise SystemExit(1) + + if not yes: + log_list_finished(snapshots) + log_removal_started(snapshots, yes=False) + + from archivebox.search.query import flush_search_index + + started_at = time.monotonic() + deadline = started_at + timeout if timeout is not None else None + + # Search-index flush touches a separate backend (FTS / sonic), not the + # main index.sqlite3 writer lock, so it's safe to do once up front. + flush_search_index(snapshots=snapshots) + + # Delete one snapshot at a time. Each ``.delete()`` is its own short + # Django-atomic block, so the writer lock is released between rows and + # an in-flight daemon transaction can interleave instead of deadlocking. 
+ # Filesystem cleanup for each row is scheduled via ``transaction.on_commit`` + # in ``base_models/models.py`` and runs AFTER its row's tx commits โ€” so + # rmtree doesn't hold the lock either. + # + # The SQLite retry wrapper in core/sqlite_backend/base.py re-raises lock + # errors when called inside an atomic block (because it can't safely + # release+reacquire a transaction), so we wrap each row's delete in our + # own retry loop at this outer (non-atomic) level. Each attempt is a + # fresh atomic; an exception cleanly rolls it back before we sleep. + retry_interval = 1.0 + deleted_snapshot_pks = [] + timed_out = False + timeout_error = "" + for index, pk in enumerate(snapshot_pks): + if deadline is not None and time.monotonic() >= deadline: + timed_out = True + timeout_error = f"Remove timed out after {timeout:g}s with {len(snapshot_pks) - index} snapshots remaining." + break + while True: + try: + deleted_count, _ = Snapshot.objects.filter(pk=pk).delete() + if deleted_count: + deleted_snapshot_pks.append(pk) + break + except OperationalError as err: + if "database is locked" not in str(err): + raise + remaining_time = deadline - time.monotonic() if deadline is not None else None + if remaining_time is not None and remaining_time <= 0: + timed_out = True + timeout_error = f"Remove timed out after {timeout:g}s while waiting for the database lock." 
+ break + time.sleep(min(retry_interval, remaining_time) if remaining_time is not None else retry_interval) + if timed_out: + break + + all_snapshots = Snapshot.objects.all() + remaining_count = all_snapshots.count() + deleted_snapshot_id_set = set(deleted_snapshot_pks) + remaining_snapshot_pks = [snapshot_id for snapshot_id in snapshot_pks if snapshot_id not in deleted_snapshot_id_set] + log_removal_finished(remaining_count, len(deleted_snapshot_pks)) + + return { + "removed_count": len(deleted_snapshot_pks), + "removed_snapshot_ids": [str(snapshot_id) for snapshot_id in deleted_snapshot_pks], + "not_removed_count": len(remaining_snapshot_pks), + "not_removed_snapshot_ids": [str(snapshot_id) for snapshot_id in remaining_snapshot_pks], + "success": not timed_out, + "error": timeout_error, + "timeout": timeout, + } + + +@click.command() +@click.option("--yes", is_flag=True, help="Remove links instantly without prompting to confirm") +@click.option("--timeout", type=float, default=None, help="Maximum seconds to spend deleting snapshots") +@snapshot_filter_options(default_filter_type="exact") +@docstring(remove.__doc__) +def main(**kwargs): + """Remove the specified URLs from the archive""" + result = remove(**kwargs) + if not result["success"]: + raise SystemExit(124) + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_run.py b/archivebox/cli/archivebox_run.py new file mode 100644 index 0000000000..5037175cbc --- /dev/null +++ b/archivebox/cli/archivebox_run.py @@ -0,0 +1,483 @@ +#!/usr/bin/env python3 + +""" +archivebox run [--daemon] [--crawl-id=...] [--snapshot-id=...] [--binary-id=...] + +Unified command for processing queued work on the shared abx-dl bus. 
def process_stdin_records() -> int:
    """
    Process JSONL records from stdin.

    Create-or-update behavior:
    - Records WITHOUT id: Create via Model.from_json(), then queue
    - Records WITH id: Lookup existing, re-queue for processing
      (Crawl / Snapshot / Binary records whose id is not found fall back to
      Model.from_json(); an ArchiveResult whose id is not found is resolved
      through its ``snapshot_id`` field instead)

    Outputs JSONL of all processed records (for chaining). Records are only
    written when stdout is not a TTY.

    Handles any record type: Crawl, Snapshot, ArchiveResult, Binary and
    BinaryRequest. A record with no ``type`` but with a ``url`` is treated as
    a Snapshot. Records of any other type are passed through to the output
    unchanged and are not counted as queued.
    Auto-cascades: Crawl → Snapshots → ArchiveResults.

    A failure while handling one record is reported on stderr and that record
    is skipped; it does not change the exit code.

    Returns exit code: 0 when there was nothing to do or all targeted work was
    dispatched, 1 when a targeted crawl's processing lock could not be claimed
    (remaining crawls are then not run).
    """
    from django.utils import timezone

    from archivebox.misc.jsonl import (
        read_stdin,
        write_record,
        TYPE_CRAWL,
        TYPE_SNAPSHOT,
        TYPE_ARCHIVERESULT,
        TYPE_BINARYREQUEST,
        TYPE_BINARY,
    )
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.core.models import Snapshot, ArchiveResult
    from archivebox.crawls.models import Crawl
    from archivebox.core.shutdown_util import foreground_parent_watchdog, foreground_shutdown_signals
    from archivebox.machine.models import Binary
    from archivebox.services.runner import run_binary, run_crawl

    # Drain stdin up front so the whole batch is known before anything is queued.
    records = list(read_stdin())
    is_tty = sys.stdout.isatty()

    if not records:
        return 0  # Nothing to process

    created_by_id = get_or_create_system_user_pk()
    queued_count = 0
    output_records = []
    # Bookkeeping used after the loop to decide what each run_crawl() call covers:
    #   full_crawl_ids            - crawls named directly by a Crawl record
    #   snapshot_ids_by_crawl     - snapshots targeted individually, grouped by crawl
    #   plugin_names_by_crawl     - plugins named by ArchiveResult records, grouped by crawl
    #   run_all_plugins_for_crawl - crawls hit by a Crawl or Snapshot record (no plugin filter)
    #   binary_ids                - Binary rows to hand to run_binary()
    full_crawl_ids: set[str] = set()
    snapshot_ids_by_crawl: dict[str, set[str]] = defaultdict(set)
    plugin_names_by_crawl: dict[str, set[str]] = defaultdict(set)
    run_all_plugins_for_crawl: set[str] = set()
    binary_ids: list[str] = []

    for record in records:
        record_type = record.get("type", "")
        record_id = record.get("id")

        try:
            if record_type == TYPE_CRAWL:
                if record_id:
                    # Existing crawl - re-queue
                    try:
                        crawl = Crawl.objects.get(id=record_id)
                    except Crawl.DoesNotExist:
                        crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id})
                else:
                    # New crawl - create it
                    crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id})

                if crawl:
                    crawl.update_and_requeue(
                        status=Crawl.StatusChoices.QUEUED,
                        retry_at=timezone.now(),
                    )
                    full_crawl_ids.add(str(crawl.id))
                    run_all_plugins_for_crawl.add(str(crawl.id))
                    output_records.append(crawl.to_json())
                    queued_count += 1

            # Untyped records that carry a "url" are handled as snapshots too.
            elif record_type == TYPE_SNAPSHOT or (record.get("url") and not record_type):
                if record_id:
                    # Existing snapshot - re-queue
                    try:
                        snapshot = Snapshot.objects.get(id=record_id)
                    except Snapshot.DoesNotExist:
                        snapshot = Snapshot.from_json(record, overrides={"created_by_id": created_by_id})
                else:
                    # New snapshot - create it
                    snapshot = Snapshot.from_json(record, overrides={"created_by_id": created_by_id})

                if snapshot:
                    snapshot.queue_for_extraction()
                    crawl_id = str(snapshot.crawl_id)
                    snapshot_ids_by_crawl[crawl_id].add(str(snapshot.id))
                    run_all_plugins_for_crawl.add(crawl_id)
                    output_records.append(snapshot.to_json())
                    queued_count += 1

            elif record_type == TYPE_ARCHIVERESULT:
                if record_id:
                    # Existing archiveresult - re-queue
                    try:
                        archiveresult = ArchiveResult.objects.get(id=record_id)
                    except ArchiveResult.DoesNotExist:
                        archiveresult = None
                else:
                    archiveresult = None

                snapshot_id = record.get("snapshot_id")
                plugin_name = record.get("plugin")
                snapshot = None
                if archiveresult:
                    # Only results in one of these states are reset; results in
                    # any other state are left as they are.
                    if archiveresult.status in [
                        ArchiveResult.StatusChoices.FAILED,
                        ArchiveResult.StatusChoices.SKIPPED,
                        ArchiveResult.StatusChoices.NORESULTS,
                        ArchiveResult.StatusChoices.BACKOFF,
                    ]:
                        archiveresult.reset_for_retry()
                    snapshot = archiveresult.snapshot
                    # The plugin named in the record wins over the stored one.
                    plugin_name = plugin_name or archiveresult.plugin
                elif snapshot_id:
                    # No stored ArchiveResult: fall back to the snapshot the record points at.
                    try:
                        snapshot = Snapshot.objects.get(id=snapshot_id)
                    except Snapshot.DoesNotExist:
                        snapshot = None

                if snapshot:
                    snapshot.queue_for_extraction()
                    crawl_id = str(snapshot.crawl_id)
                    snapshot_ids_by_crawl[crawl_id].add(str(snapshot.id))
                    if plugin_name:
                        plugin_names_by_crawl[crawl_id].add(str(plugin_name))
                    # Echo the raw record when there is no stored row to serialize.
                    output_records.append(record if not archiveresult else archiveresult.to_json())
                    queued_count += 1

            elif record_type in {TYPE_BINARYREQUEST, TYPE_BINARY}:
                if record_id:
                    try:
                        binary = Binary.objects.get(id=record_id)
                    except Binary.DoesNotExist:
                        binary = Binary.from_json(record)
                else:
                    binary = Binary.from_json(record)

                if binary:
                    binary.retry_at = timezone.now()
                    # An already-installed binary keeps its status; everything else is re-queued.
                    if binary.status != Binary.StatusChoices.INSTALLED:
                        binary.status = Binary.StatusChoices.QUEUED
                    binary.save()
                    binary_ids.append(str(binary.id))
                    output_records.append(binary.to_json())
                    queued_count += 1

            else:
                # Unknown type - pass through
                output_records.append(record)

        except Exception as e:
            # Best effort per record: report and move on to the next one.
            rprint(f"[yellow]Error processing record: {e}[/yellow]", file=sys.stderr)
            continue

    # Output all processed records (for chaining)
    if not is_tty:
        for rec in output_records:
            write_record(rec)

    if queued_count == 0:
        rprint("[yellow]No records to process[/yellow]", file=sys.stderr)
        return 0

    rprint(f"[blue]Processing {queued_count} records...[/blue]", file=sys.stderr)

    # Binaries are run first, before any crawl is started.
    for binary_id in binary_ids:
        run_binary(binary_id)

    # Every crawl that was named directly or owns a targeted snapshot gets one run_crawl() call.
    targeted_crawl_ids = full_crawl_ids | set(snapshot_ids_by_crawl)
    if targeted_crawl_ids:
        for crawl_id in sorted(targeted_crawl_ids):
            try:
                crawl = Crawl.objects.get(id=crawl_id)
            except Crawl.DoesNotExist:
                continue
            if not crawl.claim_processing_lock(lock_seconds=10):
                # Aborts the whole batch, including crawls not yet visited.
                rprint(f"[yellow]Crawl {crawl_id} is already owned by another runner[/yellow]", file=sys.stderr)
                return 1
            with foreground_shutdown_signals(), foreground_parent_watchdog():
                # snapshot_ids / selected_plugins are None for crawls that were
                # targeted as a whole; presumably run_crawl() treats None as
                # "no filter" - confirm against archivebox.services.runner.
                run_crawl(
                    crawl_id,
                    snapshot_ids=None if crawl_id in full_crawl_ids else sorted(snapshot_ids_by_crawl[crawl_id]),
                    selected_plugins=None if crawl_id in run_all_plugins_for_crawl else sorted(plugin_names_by_crawl[crawl_id]),
                    selected_plugins_are_explicit=False,
                )
    return 0
+ """ + from archivebox.config import CONSTANTS + from archivebox.core.shutdown_util import foreground_parent_watchdog, foreground_shutdown_signals + from archivebox.machine.models import Machine, Process + from archivebox.core.takeover_util import enter_single_runner_gate, standby_until_foreground_runner_needed + from archivebox.core.recovery_util import recover_orchestrator_state + from archivebox.services.runner import run_pending_crawls + + Machine.current() + current = Process.current() + root_command = current.root + if daemon and root_command.process_type in ( + Process.TypeChoices.SERVER, + Process.TypeChoices.ADD, + Process.TypeChoices.UPDATE, + ): + # Server-owned daemon runners are persistent supervisor workers, but + # foreground add/update commands are allowed to borrow runner/sonic + # leadership without taking down Daphne. Waiting here keeps the worker + # on the normal runner path while preventing a server restart loop from + # immediately stealing the single-runner gate back from the newer CLI. + standby_until_foreground_runner_needed(root_command, data_dir=CONSTANTS.DATA_DIR) + if not enter_single_runner_gate(current, data_dir=CONSTANTS.DATA_DIR): + current.mark_exited() + return 0 + + recover_orchestrator_state(include_chrome=True) + if crawl_id: + from django.utils import timezone + from archivebox.crawls.models import Crawl + + crawl = Crawl.objects.filter(id=crawl_id, status__in=Crawl.RUNNABLE_STATES).first() + now = timezone.now() + # Only re-lease when the row is unscheduled (retry_at IS NULL) or its + # existing lease has already expired. A future retry_at means another + # worker is already scheduled โ€” don't clobber. + if crawl is not None and (crawl.retry_at is None or crawl.retry_at <= now): + # extra_filter pins the read-time retry_at so a concurrent worker + # that grabbed the lease between our SELECT and UPDATE wins. 
@click.command()
@click.option("--daemon", "-d", is_flag=True, help="Run forever (don't exit on idle)")
@click.option("--crawl-id", help="Run the crawl runner for a specific crawl only")
@click.option("--snapshot-id", help="Run one snapshot through its crawl")
@click.option("--binary-id", help="Run one queued binary install directly on the bus")
@click.option("--maintenance-only", is_flag=True, help="Only process due maintenance ticks on sealed/paused snapshots")
@click.option(
    "--maintenance-batch-size",
    type=int,
    default=None,
    hidden=True,
    help="Limit queued maintenance snapshots claimed per scheduler tick",
)
@click.option("--no-stdin", is_flag=True, hidden=True, help="Run the scheduler even when stdin is not a TTY")
def main(
    daemon: bool,
    crawl_id: str | None,
    snapshot_id: str | None,
    binary_id: str | None,
    maintenance_only: bool,
    maintenance_batch_size: int | None,
    no_stdin: bool,
):
    """
    Process queued work.

    Modes:
    - No args + stdin piped: Process piped JSONL records
    - No args + TTY: Run the crawl runner for all work
    - --crawl-id: Run the crawl runner for that crawl only
    - --snapshot-id: Run one snapshot through its crawl only
    - --binary-id: Run one queued binary install directly on the bus
    """
    # Dispatch order (first match wins, every branch ends in sys.exit):
    #   1. --daemon with no id option   -> long-running runner
    #   2. --snapshot-id                -> run_snapshot_worker()
    #   3. --binary-id                  -> run_binary()
    #   4. --crawl-id                   -> run_runner() for that crawl, never as daemon
    #   5. --maintenance-only           -> run_runner(maintenance_only=True)
    #   6. piped stdin (unless --no-stdin) -> process_stdin_records()
    #   7. otherwise                    -> run_runner()
    # The id options are None when not passed on the command line.
    from archivebox.core.shutdown_util import foreground_parent_watchdog, foreground_shutdown_signals

    if daemon and not snapshot_id and not binary_id and not crawl_id:
        try:
            # Set the daemon marker env var before the runner starts.
            os.environ[RUNNER_DAEMON_ENV] = "1"
            with (
                # Daemon mode: signals go to _exit_daemon_runner_on_signal
                # instead of raising, and the parent watchdog is disabled.
                foreground_shutdown_signals(
                    on_signal=_exit_daemon_runner_on_signal,
                    raise_on_first_signal=False,
                ),
                foreground_parent_watchdog(enabled=False),
            ):
                sys.exit(run_runner(daemon=True, maintenance_only=maintenance_only, maintenance_batch_size=maintenance_batch_size))
        except KeyboardInterrupt:
            # Ctrl+C is a normal way to stop the runner, so exit 0.
            sys.exit(0)

    with foreground_shutdown_signals(), foreground_parent_watchdog(enabled=not daemon):
        if snapshot_id:
            sys.exit(run_snapshot_worker(snapshot_id))

        if binary_id:
            try:
                from archivebox.services.runner import run_binary

                run_binary(binary_id)
                # SystemExit is not an Exception subclass, so it is not caught below.
                sys.exit(0)
            except KeyboardInterrupt:
                sys.exit(0)
            except Exception as e:
                rprint(f"[red]Runner error: {type(e).__name__}: {e}[/red]", file=sys.stderr)
                import traceback

                traceback.print_exc()
                sys.exit(1)

        if crawl_id:
            # A single-crawl run always uses daemon=False, even if --daemon was passed.
            sys.exit(
                run_runner(
                    daemon=False,
                    crawl_id=crawl_id,
                    maintenance_only=maintenance_only,
                    maintenance_batch_size=maintenance_batch_size,
                ),
            )

        if maintenance_only:
            sys.exit(run_runner(daemon=daemon, maintenance_only=True, maintenance_batch_size=maintenance_batch_size))

        # Piped stdin means "process these records" unless --no-stdin overrides it.
        if not no_stdin and not sys.stdin.isatty():
            sys.exit(process_stdin_records())
        else:
            sys.exit(run_runner(daemon=daemon, maintenance_only=maintenance_only, maintenance_batch_size=maintenance_batch_size))
@enforce_types
def schedule(
    add: bool = False,
    show: bool = False,
    clear: bool = False,
    foreground: bool = False,
    run_all: bool = False,
    quiet: bool = False,
    every: str | None = None,
    tag: str = "",
    depth: int | str = 0,
    import_path: str | None = None,
    config: dict[str, object] | None = None,
):
    """Manage database-backed scheduled crawls processed by the crawl runner."""
    # NOTE: the docstring above is reused verbatim as the CLI help text (see
    # main's @docstring decorator), so the detailed contract lives here.
    #
    # The actions are independent and run in this fixed order when several
    # flags are combined: clear -> create (every/add) -> show -> run_all ->
    # foreground.
    #
    # Parameters:
    #   add / every  - create a schedule; `every` is the interval (alias or
    #                  cron expression), defaulting to "day" when only `add`
    #                  is given.
    #   import_path  - source to crawl on each run; when omitted the schedule
    #                  is a maintenance "archivebox://update" schedule and
    #                  `tag` / `depth` are ignored for it.
    #   tag, depth   - applied to the template crawl of an import schedule.
    #   config       - extra config keys merged over the template defaults.
    #   clear        - disable every currently enabled schedule.
    #   show         - print the enabled schedules.
    #   run_all      - enqueue every enabled schedule immediately.
    #   foreground   - run the crawl runner in this process (daemon=True).
    #   quiet        - skip the default listing printed when no action flag
    #                  was given.
    #
    # Returns a dict with the keys "created_schedule_ids", "disabled_count",
    # "run_all_enqueued" and "active_schedule_ids".

    from django.utils import timezone

    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl, CrawlSchedule
    from archivebox.crawls.schedule_util import validate_schedule
    from archivebox.services.runner import run_pending_crawls

    # Copy so the caller's dict is never mutated or shared with the template.
    config_overrides = dict(config or {})
    # The CLI passes depth as a string choice; normalize it once.
    depth = int(depth)
    result: dict[str, object] = {
        "created_schedule_ids": [],
        "disabled_count": 0,
        "run_all_enqueued": 0,
        "active_schedule_ids": [],
    }

    def _active_schedules():
        return CrawlSchedule.objects.filter(is_enabled=True).select_related("template").order_by("created_at")

    if clear:
        disabled_count = CrawlSchedule.objects.filter(is_enabled=True).update(
            is_enabled=False,
            modified_at=timezone.now(),
        )
        result["disabled_count"] = disabled_count
        print(f"[green]\\[√] Disabled {disabled_count} scheduled crawl(s).[/green]")

    if every or add:
        schedule_str = (every or "day").strip()
        validate_schedule(schedule_str)

        created_by_id = get_or_create_system_user_pk()
        is_update_schedule = not import_path
        template_urls = import_path or "archivebox://update"
        # The label is truncated to 64 characters before being stored.
        template_label = (f"Scheduled import: {template_urls}" if import_path else "Scheduled ArchiveBox update")[:64]
        template_notes = (
            f"Created by archivebox schedule for {template_urls}"
            if import_path
            else "Created by archivebox schedule to queue recurring archivebox://update maintenance crawls."
        )

        # The template crawl is created SEALED with no retry_at, so it is not
        # itself picked up as queued work.
        template = Crawl.objects.create(
            urls=template_urls,
            max_depth=0 if is_update_schedule else depth,
            tags_str="" if is_update_schedule else tag,
            label=template_label,
            notes=template_notes,
            created_by_id=created_by_id,
            status=Crawl.StatusChoices.SEALED,
            retry_at=None,
            config={
                "DEPTH": 0 if is_update_schedule else depth,
                "SCHEDULE_KIND": "update" if is_update_schedule else "crawl",
                # Caller-supplied overrides (e.g. {"ONLY_NEW": False}) win over the
                # template defaults. Anything left unset falls through to the
                # standard config stack at crawl-resolution time.
                **config_overrides,
            },
        )
        crawl_schedule = CrawlSchedule.objects.create(
            template=template,
            schedule=schedule_str,
            is_enabled=True,
            label=template_label,
            notes=template_notes,
            created_by_id=created_by_id,
        )
        result["created_schedule_ids"] = [str(crawl_schedule.id)]

        schedule_type = "maintenance update" if is_update_schedule else "crawl"
        print(f"[green]\\[√] Created scheduled {schedule_type}.[/green]")
        print(f" id={crawl_schedule.id}")
        print(f" every={crawl_schedule.schedule}")
        print(f" next_run={crawl_schedule.next_run_at.isoformat()}")
        if import_path:
            print(f" source={import_path}")

    # Queried after clear/create so the list reflects this invocation's changes.
    schedules = list(_active_schedules())
    result["active_schedule_ids"] = [str(active.id) for active in schedules]

    if show:
        if schedules:
            print(f"[green]\\[*] Active scheduled crawls: {len(schedules)}[/green]")
            for scheduled_crawl in schedules:
                template = scheduled_crawl.template
                print(
                    f" - id={scheduled_crawl.id} every={scheduled_crawl.schedule} "
                    f"next_run={scheduled_crawl.next_run_at.isoformat()} "
                    f"source={template.urls.splitlines()[0] if template.urls else ''}",
                )
        else:
            print("[yellow]\\[*] No scheduled crawls are enabled.[/yellow]")

    if run_all:
        enqueued = 0
        # One shared timestamp so every schedule is enqueued for the same instant.
        now = timezone.now()
        for scheduled_crawl in schedules:
            scheduled_crawl.enqueue(queued_at=now)
            enqueued += 1
        result["run_all_enqueued"] = enqueued
        print(f"[green]\\[*] Enqueued {enqueued} scheduled crawl(s) immediately.[/green]")
        if enqueued:
            print(
                "[yellow]\\[*] Start `archivebox server`, `archivebox run --daemon`, or `archivebox schedule --foreground` to process the queued crawls.[/yellow]",
            )

    if foreground:
        print(
            "[green]\\[*] Starting global crawl runner in foreground mode. It will materialize scheduled crawls and process queued work.[/green]",
        )
        run_pending_crawls(daemon=True)

    if quiet:
        return result

    # No action flag at all: default to listing what is currently scheduled.
    if not any((every, add, show, clear, foreground, run_all)):
        if schedules:
            print("[green]\\[*] Active scheduled crawls:[/green]")
            for scheduled_crawl in schedules:
                print(f" - {scheduled_crawl.id} every={scheduled_crawl.schedule} next_run={scheduled_crawl.next_run_at.isoformat()}")
        else:
            print("[yellow]\\[*] No scheduled crawls are enabled.[/yellow]")

    return result
" + "Pass --no-only-new to force re-archive on each scheduled run.", +) +@click.option("--clear", is_flag=True, help="Disable all currently enabled schedules") +@click.option("--show", is_flag=True, help="Print all currently enabled schedules") +@click.option("--foreground", "-f", is_flag=True, help="Run the global crawl runner in the foreground (no crontab required)") +@click.option("--run-all", is_flag=True, help="Enqueue all enabled schedules immediately and process them once") +@click.argument("import_path", required=False) +@docstring(schedule.__doc__) +def main(**kwargs): + """Manage database-backed scheduled crawls processed by the crawl runner.""" + only_new = kwargs.pop("only_new", None) + if only_new is not None: + kwargs["config"] = {"ONLY_NEW": bool(only_new)} + schedule(**kwargs) + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_search.py b/archivebox/cli/archivebox_search.py new file mode 100644 index 0000000000..c192bed9e5 --- /dev/null +++ b/archivebox/cli/archivebox_search.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python3 + +__package__ = "archivebox.cli" +__command__ = "archivebox search" + +from archivebox.cli.archivebox_list import main + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_server.py b/archivebox/cli/archivebox_server.py new file mode 100644 index 0000000000..65576ec14c --- /dev/null +++ b/archivebox/cli/archivebox_server.py @@ -0,0 +1,413 @@ +#!/usr/bin/env python3 + +__package__ = "archivebox.cli" + +import sys +import os +import socket +import subprocess +import time +from collections.abc import Iterable + +import rich_click as click +from rich import print + +from archivebox.config import CONSTANTS +from archivebox.misc.util import docstring, enforce_types + + +import re as _re + +_IPV4_RE = _re.compile(r"^\d{1,3}(?:\.\d{1,3}){3}$") +_IPV6_CHARS_RE = _re.compile(r"^[0-9a-fA-F:.]+$") +_LOCAL_BIND_HOSTS = frozenset({"0.0.0.0", "::", "::0", "127.0.0.1", "::1"}) + + +def 
def _split_bind_spec(spec: str) -> tuple[str, str]:
    """Break a bind spec into its ``(host, port)`` halves.

    Accepted shapes are ``host:port``, ``host`` alone, or ``port`` alone. A
    half that was not supplied comes back as ``""`` so the caller can apply
    its own defaults. Bracketed IPv6 literals (``[::1]`` / ``[::1]:8000``)
    keep their brackets in the returned host. No validation happens here.
    """
    cleaned = (spec or "").strip()
    if not cleaned:
        return "", ""

    if cleaned[0] == "[":
        # Bracketed IPv6: the host runs through the closing bracket and an
        # optional ``:port`` may follow it.
        closing = cleaned.find("]")
        if closing < 0:
            # Malformed (no closing bracket); hand it back whole so the
            # validator can reject it.
            return cleaned, ""
        bracketed_host = cleaned[: closing + 1]
        remainder = cleaned[closing + 1 :]
        return bracketed_host, (remainder[1:] if remainder.startswith(":") else "")

    if ":" not in cleaned:
        # A lone token: all digits means a port, anything else is a host.
        return ("", cleaned) if cleaned.isdigit() else (cleaned, "")

    # The LAST colon separates host from port.
    host_part, _sep, port_part = cleaned.rpartition(":")
    return host_part, port_part
def _print_server_startup_warnings(config, host: str, port: str) -> None:
    """Emit the security / routing hints shown once when the server starts.

    Called only from ``archivebox server`` so that other entry points (manage
    shell, plugin lookups, etc.) do not repeat the banner on every config load.

    Args:
        config: Loaded config object; reads ``IS_LOWER_SECURITY_MODE``,
            ``SERVER_SECURITY_MODE``, ``BASE_URL`` and ``USES_SUBDOMAIN_ROUTING``.
        host: Bind host the server will listen on.
        port: Bind port the server will listen on.
    """

    def say(*lines: str) -> None:
        # One print call per line, in order.
        for line in lines:
            print(line)

    if config.IS_LOWER_SECURITY_MODE:
        say(
            f"[yellow][!] WARNING: ArchiveBox is running with SERVER_SECURITY_MODE={config.SERVER_SECURITY_MODE}[/yellow]",
            "[yellow] Archived pages may share an origin with privileged app routes in this mode.[/yellow]",
            "[yellow] To switch to the safer isolated setup:[/yellow]",
            "[yellow] 1. Set SERVER_SECURITY_MODE=safe-subdomains-fullreplay[/yellow]",
            "[yellow] 2. Point *.archivebox.localhost (or your chosen base domain) at this server[/yellow]",
            "[yellow] 3. Configure wildcard DNS/TLS or your reverse proxy so admin., web., api., and snapshot subdomains resolve[/yellow]",
        )
        print()

    # ``config.BASE_URL`` is the merged value (env > Machine.config > file >
    # default), i.e. what the running server will actually use. Checking
    # ``os.environ["BASE_URL"]`` alone used to fire the "not set" warning
    # noisily for users who pinned BASE_URL via Machine.config or
    # ArchiveBox.conf rather than via env.
    base_url = (config.BASE_URL or "").strip()
    if base_url:
        # BASE_URL is pinned, so the only thing left to surface is a port
        # mismatch: a bind port different from BASE_URL's explicit port
        # usually means the server was started with the wrong
        # ``archivebox server PORT`` argument (or one side was not updated
        # after moving the listener). Reverse-proxy setups typically omit the
        # port (``https://archive.example.com``), so only an explicit port is
        # compared - otherwise every proxy deployment would be nagged.
        from urllib.parse import urlparse

        try:
            pinned_port = urlparse(base_url).port
        except (ValueError, TypeError):
            pinned_port = None

        ports_disagree = pinned_port is not None and str(pinned_port) != str(port)
        if ports_disagree:
            say(
                f"[yellow][!] BASE_URL ({base_url}) port {pinned_port} does not match the port the server is running on ({port}). "
                "Make sure this is intentional![/yellow]",
            )
            print()
        return

    # Users upgrading from 0.7.3 who already had CSRF_TRUSTED_ORIGINS set get
    # that value used silently as the implicit BASE_URL by get_base_url().
    # Show what was picked so they know where links / redirects go, and how
    # to make it explicit.
    from archivebox.core.routes_util import derive_base_url_from_csrf

    csrf_derived = derive_base_url_from_csrf(config)
    if csrf_derived:
        say(
            f"[yellow][!] BASE_URL is not set; auto-derived [bold]{csrf_derived}[/bold] from a single CSRF_TRUSTED_ORIGINS entry.[/yellow]",
            "[yellow] Links / redirects / cookies will use that origin. To silence this hint, set BASE_URL[/yellow]",
            f"[yellow] explicitly: [bold]BASE_URL={csrf_derived}[/bold] (matches your existing CSRF_TRUSTED_ORIGINS).[/yellow]",
        )
        print()
        return

    # BASE_URL was not set explicitly. The remaining cases carry very
    # different risk, so each gets a tailored hint - new users coming from the
    # 0.7.x single-domain world need to know whether the default is fine.
    if _bind_host_looks_like_ip(host):
        # Real IP literal: subdomain routing can't work and URLs leak the IP.
        # This is the most urgent case.
        say(
            f"[yellow][!] WARNING: BASE_URL is not set and BIND_ADDR resolves to an IP literal ({host}).[/yellow]",
            "[yellow] Snapshot / admin / api URLs will be generated with the IP, and subdomain[/yellow]",
            "[yellow] routing cannot work against an IP address. Set BASE_URL explicitly, e.g.[/yellow]",
            "[yellow] BASE_URL=https://archive.example.com archivebox server 0.0.0.0:8000[/yellow]",
        )
        if config.USES_SUBDOMAIN_ROUTING:
            say(
                "[yellow] Or switch SERVER_SECURITY_MODE to a one-domain mode if you can't run a hostname.[/yellow]",
            )
        print()
        return

    # Loopback / wildcard bind. The routes_util default of
    # http://archivebox.localhost:PORT works in a browser on the same machine,
    # but anything else (reverse proxy, k8s ingress, LAN client) needs
    # BASE_URL set. (Real hostnames can't reach this branch - the bind
    # validator rejects them upfront.)
    say(
        "[yellow][!] BASE_URL is not set. Generated URLs will fall back to http://archivebox.localhost:<port>.[/yellow]",
        "[yellow] That's fine for local browsing on this machine. Set BASE_URL when running behind[/yellow]",
        "[yellow] a reverse proxy / ingress / public hostname, e.g.[/yellow]",
        "[yellow] BASE_URL=https://archive.example.com archivebox server 0.0.0.0:8000[/yellow]",
    )
    print()
+ bind_spec = next((arg for arg in runserver_args if arg), "") + host, port = _parse_and_validate_bind_spec(bind_spec) + + if daemonize and os.environ.get("ARCHIVEBOX_SERVER_DAEMON_CHILD") != "1": + log_path = CONSTANTS.LOGS_DIR / "server.log" + log_path.parent.mkdir(parents=True, exist_ok=True) + daemon_env = os.environ.copy() + daemon_env["ARCHIVEBOX_SERVER_DAEMON_CHILD"] = "1" + daemon_cmd = [sys.executable, "-m", "archivebox", "server"] + if debug: + daemon_cmd.append("--debug") + if reload: + daemon_cmd.append("--reload") + if nothreading: + daemon_cmd.append("--nothreading") + daemon_cmd.extend(runserver_args) + with log_path.open("a", encoding="utf-8") as log_file: + proc = subprocess.Popen( + daemon_cmd, + cwd=os.getcwd(), + env=daemon_env, + stdin=subprocess.DEVNULL, + stdout=log_file, + stderr=log_file, + start_new_session=True, + ) + deadline = time.monotonic() + 30 + while time.monotonic() < deadline: + if proc.poll() is not None: + print(f"[red][X] ArchiveBox daemon server exited early with code {proc.returncode}. See {log_path}[/red]") + sys.exit(proc.returncode or 1) + try: + with socket.create_connection((host, int(port)), timeout=0.25): + break + except OSError: + time.sleep(0.1) + else: + print(f"[yellow][!] ArchiveBox daemon server pid={proc.pid} is still starting. 
See {log_path}[/yellow]") + return + + os.environ["BIND_ADDR"] = f"{host}:{port}" + from archivebox.core.routes_util import get_base_url + + base_url = get_base_url().rstrip("/") + admin_url = f"{base_url}/admin/" + + from archivebox.workers.supervisord_util import ( + active_supervisord_runtime_components, + format_runtime_components, + start_server_workers, + stop_existing_supervisord_process, + is_port_in_use, + ) + from archivebox.machine.models import Process + from archivebox.core.takeover_util import ( + command_owns_runtime_stack, + current_command, + foreground_runner_owner, + runtime_stack_owner, + standby_until_runtime_stack_needed, + ) + from archivebox.core.shutdown_util import foreground_parent_watchdog, foreground_shutdown_signals + + if run_in_debug: + print("[green][+] Starting ArchiveBox webserver in DEBUG mode...[/green]") + else: + print("[green][+] Starting ArchiveBox webserver...[/green]") + print( + f" [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [dim]BIND_ADDR[/dim] [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]", + ) + print( + f" [green]>[/green] Log in to ArchiveBox Admin UI on [dim]BASE_URL [/dim] [deep_sky_blue3][link={admin_url}]{admin_url}[/link][/deep_sky_blue3]", + ) + print(" > Writing ArchiveBox error log to ./logs/errors.log") + print() + + # Reload config after we've set os.environ["BIND_ADDR"] above so the + # security-mode + base-url warnings see the effective values. 
+ runtime_config = get_config() + _print_server_startup_warnings(runtime_config, host, port) + bind_url = f"http://{host}:{port}" + command = current_command(Process.TypeChoices.SERVER, data_dir=CONSTANTS.DATA_DIR, url=bind_url) + + def still_owns_runtime_stack() -> bool: + from django.db import connections + + try: + return command_owns_runtime_stack(command, data_dir=CONSTANTS.DATA_DIR) + finally: + connections.close_all() + + shutdown_state = None + try: + with ( + foreground_shutdown_signals() as shutdown_state, + foreground_parent_watchdog(enabled=os.environ.get("ARCHIVEBOX_SERVER_DAEMON_CHILD") != "1"), + ): + while True: + standby_result = standby_until_runtime_stack_needed(command, data_dir=CONSTANTS.DATA_DIR) + older_owner = runtime_stack_owner(data_dir=CONSTANTS.DATA_DIR, exclude_id=command.id) or foreground_runner_owner( + data_dir=CONSTANTS.DATA_DIR, + exclude_id=command.id, + ) + takeover_components = active_supervisord_runtime_components(config=config) + if older_owner and takeover_components: + print( + "[yellow][*] Taking over " + f"{format_runtime_components(takeover_components)} from older existing archivebox process (pid={older_owner.pid}).[/yellow]", + ) + stop_existing_supervisord_process() + if is_port_in_use(host, int(port)): + print(f"[red][X] Error: Port {port} is already in use[/red]") + print(f" Another process outside this ArchiveBox runtime is listening on {host}:{port}") + sys.exit(1) + + result = start_server_workers( + host=host, + port=port, + daemonize=False, + debug=run_in_debug, + reload=reload, + nothreading=nothreading, + keep_running=still_owns_runtime_stack, + should_stop_supervisord=still_owns_runtime_stack, + resumed_from_pid=standby_result.get("previous_owner_pid") if standby_result.get("resumed") else None, + ) + if result == "interrupted": + break + if not still_owns_runtime_stack(): + continue + if result == "exited": + print("[yellow][*] Runtime stack exited while this parent is still leader; restarting...[/yellow]") + 
continue + break + except KeyboardInterrupt: + pass + finally: + if not shutdown_state or not shutdown_state.signal_name: + command.mark_exited() + print("\n[i][green][๐ŸŸฉ] ArchiveBox server shut down gracefully.[/green][/i]") + + +@click.command() +@click.argument("runserver_args", nargs=-1) +@click.option("--reload", is_flag=True, help="Enable auto-reloading when code or templates change") +@click.option("--debug", is_flag=True, help="Enable DEBUG=True mode with more verbose errors") +@click.option("--nothreading", is_flag=True, help="Force runserver to run in single-threaded mode") +@click.option("--daemonize", is_flag=True, help="Run the server in the background as a daemon") +@docstring(server.__doc__) +def main(**kwargs): + server(**kwargs) + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_shell.py b/archivebox/cli/archivebox_shell.py new file mode 100644 index 0000000000..f487cf52a9 --- /dev/null +++ b/archivebox/cli/archivebox_shell.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 + +__package__ = "archivebox.cli" + +from collections.abc import Iterable + +import rich_click as click + +from archivebox.misc.util import docstring + + +def shell(args: Iterable[str] = ()) -> None: + """Enter an interactive ArchiveBox Django shell""" + + from django.core.management import call_command, get_commands + + shell_command = "shell_plus" if "shell_plus" in get_commands() else "shell" + call_command(shell_command, *args) + + +@click.command(add_help_option=False, context_settings=dict(ignore_unknown_options=True)) +@click.argument("args", nargs=-1) +@docstring(shell.__doc__) +def main(args: Iterable[str] = ()) -> None: + shell(args=args) + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_snapshot.py b/archivebox/cli/archivebox_snapshot.py new file mode 100644 index 0000000000..c1509e77e2 --- /dev/null +++ b/archivebox/cli/archivebox_snapshot.py @@ -0,0 +1,494 @@ +#!/usr/bin/env python3 + +""" +archivebox snapshot 
<action> [args...] [--filters] + +Manage Snapshot records. + +Actions: + create - Create Snapshots from URLs or Crawl JSONL + list - List Snapshots as JSONL (with optional filters) + update - Update Snapshots from stdin JSONL + delete - Delete Snapshots from stdin JSONL + +Examples: + # Create + archivebox snapshot create https://example.com --tag=news + archivebox crawl create https://example.com | archivebox snapshot create + + # List with filters + archivebox snapshot list --status=queued + archivebox snapshot list --url__icontains=example.com + + # Update + archivebox snapshot list --tag=old | archivebox snapshot update --tag=new + + # Delete + archivebox snapshot list --url__icontains=spam.com | archivebox snapshot delete --yes +""" + +__package__ = "archivebox.cli" +__command__ = "archivebox snapshot" + +import sys +from collections.abc import Iterable + +import rich_click as click +from rich import print as rprint +from django.db.models import QuerySet + +SNAPSHOT_FILTER_TYPE_CHOICES = ("exact", "substring", "regex", "domain", "tag", "timestamp") +SNAPSHOT_LIST_CHUNK_SIZE = 100 + + +# ============================================================================= +# CREATE +# ============================================================================= + + +def create_snapshots( + urls: Iterable[str], + tag: str = "", + status: str = "queued", + depth: int = 0, + created_by_id: int | None = None, +) -> int: + """ + Create Snapshots from URLs or stdin JSONL (Crawl or Snapshot records). + Pass-through: Records that are not Crawl/Snapshot/URL are output unchanged. 
+ + Exit codes: + 0: Success + 1: Failure + """ + from archivebox.misc.jsonl import ( + read_args_or_stdin, + write_record, + TYPE_SNAPSHOT, + TYPE_CRAWL, + ) + from archivebox.misc.util import validate_url + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.core.models import Snapshot + from archivebox.crawls.models import Crawl + + created_by_id = created_by_id or get_or_create_system_user_pk() + is_tty = sys.stdout.isatty() + + # Collect all input records + records = list(read_args_or_stdin(urls)) + + if not records: + rprint("[yellow]No URLs or Crawls provided. Pass URLs as arguments or via stdin.[/yellow]", file=sys.stderr) + return 1 + + # Process each record - handle Crawls and plain URLs/Snapshots + created_snapshots = [] + pass_through_count = 0 + + for record in records: + record_type = record.get("type", "") + + try: + if record_type == TYPE_CRAWL: + # Pass through the Crawl record itself first + if not is_tty: + write_record(record) + + # Input is a Crawl - get or create it, then create Snapshots for its URLs + crawl = None + crawl_id = record.get("id") + if crawl_id: + try: + crawl = Crawl.objects.get(id=crawl_id) + except Crawl.DoesNotExist: + crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id}) + else: + crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id}) + + if not crawl: + continue + + # Create snapshots for each URL in the crawl + for url in crawl.get_urls_list(): + try: + validate_url(url) + except ValueError as err: + rprint(f"[red]Error creating snapshot: {err}[/red]", file=sys.stderr) + continue + merged_tags = crawl.tags_str + if tag: + merged_tags = f"{merged_tags},{tag}" if merged_tags else tag + snapshot_record = { + "url": url, + "tags": merged_tags, + "crawl_id": str(crawl.id), + "depth": depth, + "status": status, + } + snapshot = Snapshot.from_json(snapshot_record, overrides={"created_by_id": created_by_id}) + if snapshot: + 
created_snapshots.append(snapshot) + if not is_tty: + write_record(snapshot.to_json()) + + elif record_type == TYPE_SNAPSHOT or record.get("url"): + # Input is a Snapshot or plain URL + if record.get("url"): + validate_url(str(record["url"])) + if tag and not record.get("tags"): + record["tags"] = tag + if status: + record["status"] = status + record["depth"] = record.get("depth", depth) + + snapshot = Snapshot.from_json(record, overrides={"created_by_id": created_by_id}) + if snapshot: + created_snapshots.append(snapshot) + if not is_tty: + write_record(snapshot.to_json()) + + else: + # Pass-through: output records we don't handle + if not is_tty: + write_record(record) + pass_through_count += 1 + + except Exception as e: + rprint(f"[red]Error creating snapshot: {e}[/red]", file=sys.stderr) + continue + + if not created_snapshots: + if pass_through_count > 0: + rprint(f"[dim]Passed through {pass_through_count} records, no new snapshots[/dim]", file=sys.stderr) + return 0 + rprint("[red]No snapshots created[/red]", file=sys.stderr) + return 1 + + rprint(f"[green]Created {len(created_snapshots)} snapshots[/green]", file=sys.stderr) + + if is_tty: + for snapshot in created_snapshots: + rprint(f" [dim]{snapshot.id}[/dim] {snapshot.url[:60]}", file=sys.stderr) + + return 0 + + +# ============================================================================= +# LIST +# ============================================================================= + + +def snapshot_filter_options(*, default_filter_type: str): + def decorate(func): + for decorator in reversed( + ( + click.option("--status", "-s", help="Filter by status (queued, started, sealed)"), + click.option("--url__icontains", help="Filter by URL contains"), + click.option("--url__istartswith", help="Filter by URL starts with"), + click.option("--tag", "-t", help="Filter by tag name"), + click.option("--crawl-id", help="Filter by crawl ID"), + click.option("--limit", "-n", type=int, help="Limit number of results"), + 
click.option("--sort", "-o", type=str, help="Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at"), + click.option("--search", help="Search mode to use for positional query"), + click.option("--before", type=float, help="Only snapshots bookmarked before timestamp"), + click.option("--after", type=float, help="Only snapshots bookmarked after timestamp"), + click.option( + "--filter-type", + "-f", + type=click.Choice(SNAPSHOT_FILTER_TYPE_CHOICES), + default=default_filter_type, + help="Type of pattern matching to use for positional filters", + ), + click.argument("filter_patterns", nargs=-1), + ), + ): + func = decorator(func) + return func + + return decorate + + +def snapshot_output_options(func): + for decorator in reversed( + ( + click.option("--csv", "-C", type=str, help="Print output as CSV with the provided fields, e.g.: timestamp,url,title"), + click.option("--json", "as_json", is_flag=True, help="Print output as a JSON array"), + click.option("--html", "as_html", is_flag=True, help="Print output as HTML"), + click.option("--with-headers", is_flag=True, help="Include column headers in structured output"), + ), + ): + func = decorator(func) + return func + + +def build_snapshot_queryset( + **kwargs, +) -> QuerySet: + from archivebox.core.models import Snapshot + + return Snapshot.objects.order_by("-created_at").search(**kwargs) + + +def list_snapshots( + csv: str | None = None, + as_json: bool = False, + as_html: bool = False, + with_headers: bool = False, + **kwargs, +) -> int: + """ + List Snapshots as JSONL with optional filters. 
+ + Exit codes: + 0: Success (even if no results) + """ + from archivebox.misc.jsonl import write_record + + output_formats = sum(bool(output_format) for output_format in (csv, as_json, as_html)) + if output_formats > 1: + rprint("[red]Choose only one output format: --csv, --json, or --html[/red]", file=sys.stderr) + return 2 + if with_headers and not output_formats: + rprint("[red]--with-headers requires --csv, --json, or --html[/red]", file=sys.stderr) + return 2 + + is_tty = sys.stdout.isatty() and not output_formats + + try: + queryset = build_snapshot_queryset(**kwargs) + except ValueError as err: + rprint(f"[red]{err}[/red]", file=sys.stderr) + return 2 + + count = 0 + if as_json: + queryset = queryset.prefetch_related("tags") + output = queryset.to_json(with_headers=with_headers) + sys.stdout.write(output) + if output and not output.endswith("\n"): + sys.stdout.write("\n") + rprint(f"[dim]Listed {queryset.count()} snapshots[/dim]", file=sys.stderr) + return 0 + + if as_html: + queryset = queryset.prefetch_related("tags") + output = queryset.to_html(with_headers=with_headers) + sys.stdout.write(output) + if output and not output.endswith("\n"): + sys.stdout.write("\n") + rprint(f"[dim]Listed {queryset.count()} snapshots[/dim]", file=sys.stderr) + return 0 + + if csv: + cols = [col.strip() for col in csv.split(",") if col.strip()] + if not cols: + rprint("[red]No CSV columns provided[/red]", file=sys.stderr) + return 2 + if with_headers: + sys.stdout.write(",".join(cols)) + sys.stdout.write("\n") + for snapshot in queryset.prefetch_related("tags").iterator(chunk_size=SNAPSHOT_LIST_CHUNK_SIZE): + sys.stdout.write(snapshot.to_csv(cols=cols, separator=",")) + sys.stdout.write("\n") + count += 1 + rprint(f"[dim]Listed {count} snapshots[/dim]", file=sys.stderr) + return 0 + + if not is_tty: + for snapshot in queryset.prefetch_related("tags").iterator(chunk_size=SNAPSHOT_LIST_CHUNK_SIZE): + write_record(snapshot.to_json()) + count += 1 + rprint(f"[dim]Listed {count} 
snapshots[/dim]", file=sys.stderr) + return 0 + + for snapshot in queryset.iterator(chunk_size=SNAPSHOT_LIST_CHUNK_SIZE): + status_color = { + "queued": "yellow", + "started": "blue", + "sealed": "green", + }.get(snapshot.status, "dim") + rprint(f"[{status_color}]{snapshot.status:8}[/{status_color}] [dim]{snapshot.id}[/dim] {snapshot.url[:60]}") + count += 1 + + rprint(f"[dim]Listed {count} snapshots[/dim]", file=sys.stderr) + return 0 + + +# ============================================================================= +# UPDATE +# ============================================================================= + + +def update_snapshots( + status: str | None = None, + tag: str | None = None, +) -> int: + """ + Update Snapshots from stdin JSONL. + + Reads Snapshot records from stdin and applies updates. + Uses PATCH semantics - only specified fields are updated. + + Exit codes: + 0: Success + 1: No input or error + """ + from django.utils import timezone + + from archivebox.misc.jsonl import read_stdin, write_record + from archivebox.core.models import Snapshot + + is_tty = sys.stdout.isatty() + + records = list(read_stdin()) + if not records: + rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr) + return 1 + + updated_count = 0 + for record in records: + snapshot_id = record.get("id") + if not snapshot_id: + continue + + try: + snapshot = Snapshot.objects.get(id=snapshot_id) + + if status: + if status not in Snapshot.StatusChoices.values: + rprint(f"[red]Invalid snapshot status: {status}[/red]", file=sys.stderr) + continue + if status == Snapshot.StatusChoices.SEALED: + snapshot.cancel() + elif status == Snapshot.StatusChoices.PAUSED: + snapshot.pause() + elif status == Snapshot.StatusChoices.QUEUED: + if snapshot.status == Snapshot.StatusChoices.PAUSED: + snapshot.resume() + else: + snapshot.update_and_requeue(status=Snapshot.StatusChoices.QUEUED, retry_at=timezone.now()) + elif status == Snapshot.StatusChoices.STARTED: + 
snapshot.update_and_requeue(status=Snapshot.StatusChoices.STARTED, retry_at=timezone.now()) + if tag: + from archivebox.core.models import Tag + + tag_obj, _ = Tag.objects.get_or_create(name=tag) + snapshot.tags.add(tag_obj) + snapshot.safe_update({"modified_at": timezone.now()}, refresh=False) + + if not status and not tag: + snapshot.safe_update({"modified_at": timezone.now()}, refresh=False) + updated_count += 1 + + if not is_tty: + snapshot.refresh_from_db() + write_record(snapshot.to_json()) + + except Snapshot.DoesNotExist: + rprint(f"[yellow]Snapshot not found: {snapshot_id}[/yellow]", file=sys.stderr) + continue + + rprint(f"[green]Updated {updated_count} snapshots[/green]", file=sys.stderr) + return 0 + + +# ============================================================================= +# DELETE +# ============================================================================= + + +def delete_snapshots(yes: bool = False, dry_run: bool = False) -> int: + """ + Delete Snapshots from stdin JSONL. + + Requires --yes flag to confirm deletion. 
+ + Exit codes: + 0: Success + 1: No input or missing --yes flag + """ + from archivebox.misc.jsonl import read_stdin + from archivebox.core.models import Snapshot + + records = list(read_stdin()) + if not records: + rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr) + return 1 + + snapshot_ids = [r.get("id") for r in records if r.get("id")] + + if not snapshot_ids: + rprint("[yellow]No valid snapshot IDs in input[/yellow]", file=sys.stderr) + return 1 + + snapshots = Snapshot.objects.filter(id__in=snapshot_ids) + count = snapshots.count() + + if count == 0: + rprint("[yellow]No matching snapshots found[/yellow]", file=sys.stderr) + return 0 + + if dry_run: + rprint(f"[yellow]Would delete {count} snapshots (dry run)[/yellow]", file=sys.stderr) + for snapshot in snapshots: + rprint(f" [dim]{snapshot.id}[/dim] {snapshot.url[:60]}", file=sys.stderr) + return 0 + + if not yes: + rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr) + return 1 + + # Perform deletion + deleted_count, _ = snapshots.delete() + rprint(f"[green]Deleted {deleted_count} snapshots[/green]", file=sys.stderr) + return 0 + + +# ============================================================================= +# CLI Commands +# ============================================================================= + + +@click.group() +def main(): + """Manage Snapshot records.""" + pass + + +@main.command("create") +@click.argument("urls", nargs=-1) +@click.option("--tag", "-t", default="", help="Comma-separated tags to add") +@click.option("--status", "-s", default="queued", help="Initial status (default: queued)") +@click.option("--depth", "-d", type=int, default=0, help="Crawl depth (default: 0)") +def create_cmd(urls: tuple, tag: str, status: str, depth: int): + """Create Snapshots from URLs or stdin JSONL.""" + sys.exit(create_snapshots(urls, tag=tag, status=status, depth=depth)) + + +@main.command("list") +@snapshot_output_options 
+@snapshot_filter_options(default_filter_type="substring") +def list_cmd(**kwargs): + """List Snapshots as JSONL.""" + sys.exit(list_snapshots(**kwargs)) + + +@main.command("update") +@click.option("--status", "-s", help="Set status") +@click.option("--tag", "-t", help="Add tag") +def update_cmd(status: str | None, tag: str | None): + """Update Snapshots from stdin JSONL.""" + sys.exit(update_snapshots(status=status, tag=tag)) + + +@main.command("delete") +@click.option("--yes", "-y", is_flag=True, help="Confirm deletion") +@click.option("--dry-run", is_flag=True, help="Show what would be deleted") +def delete_cmd(yes: bool, dry_run: bool): + """Delete Snapshots from stdin JSONL.""" + sys.exit(delete_snapshots(yes=yes, dry_run=dry_run)) + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_status.py b/archivebox/cli/archivebox_status.py new file mode 100644 index 0000000000..96f8c03502 --- /dev/null +++ b/archivebox/cli/archivebox_status.py @@ -0,0 +1,160 @@ +#!/usr/bin/env python3 + +__package__ = "archivebox.cli" + +from pathlib import Path + +import rich_click as click +from rich import print + +from archivebox.misc.util import enforce_types, docstring +from archivebox.config import CONSTANTS +from archivebox.config.common import get_config +from archivebox.misc.system import get_dir_size +from archivebox.misc.logging_util import printable_filesize + + +MAX_STATUS_FS_DIR_SCAN = 5000 + + +@enforce_types +def status(out_dir: Path = CONSTANTS.DATA_DIR) -> None: + """Print out some info and statistics about the archive collection""" + + from django.contrib.auth import get_user_model + from django.db.models import Sum + from django.db.models.functions import Coalesce + from archivebox.core.models import ArchiveResult, Snapshot + + config = get_config() + User = get_user_model() + + print("[green]\\[*] Scanning archive main index...[/green]") + print(f"[yellow] {out_dir}/*[/yellow]") + num_bytes, num_dirs, num_files = get_dir_size(out_dir, 
recursive=False, pattern="index.") + size = printable_filesize(num_bytes) + print(f" Index size: {size} across {num_files} files") + print() + + snapshots_qs = Snapshot.objects.all() + num_sql_links = snapshots_qs.count() + archive_dir = CONSTANTS.ARCHIVE_DIR + legacy_snapshot_dirs = [] + if archive_dir.exists(): + legacy_snapshot_dirs = [ + entry for entry in archive_dir.iterdir() if entry.is_dir() and not entry.is_symlink() and Snapshot.is_legacy_archive_dir(entry) + ] + print(f" > SQL Main Index: {num_sql_links} links".ljust(36), f"(found in {CONSTANTS.SQL_INDEX_FILENAME})") + print(f" > JSON Link Details: {len(legacy_snapshot_dirs)} links".ljust(36), f"(found in {archive_dir.name}/*/index.json)") + print() + print("[green]\\[*] Scanning archive data directories...[/green]") + users_dir = CONSTANTS.USERS_DIR + scan_roots = [root for root in (archive_dir, users_dir) if root.exists()] + scan_roots_display = ", ".join(str(root) for root in scan_roots) if scan_roots else str(archive_dir) + print(f"[yellow] {scan_roots_display}[/yellow]") + do_precise_fs_scan = num_sql_links <= MAX_STATUS_FS_DIR_SCAN + if do_precise_fs_scan: + num_bytes = num_dirs = num_files = 0 + for root in scan_roots: + root_bytes, root_dirs, root_files = get_dir_size(root) + num_bytes += root_bytes + num_dirs += root_dirs + num_files += root_files + else: + num_bytes = snapshots_qs.aggregate(total=Coalesce(Sum("output_size"), 0))["total"] or 0 + num_dirs = 0 + num_files = ArchiveResult.objects.exclude(output_files__in=["", "{}"]).count() + size = printable_filesize(num_bytes) + if do_precise_fs_scan: + print(f" Size: {size} across {num_files} files in {num_dirs} directories") + else: + print(f" Size: {size} across {num_files} DB-tracked output records") + + # Use DB as source of truth for snapshot status + num_indexed = num_sql_links + num_archived = snapshots_qs.filter(status=Snapshot.StatusChoices.SEALED).count() + num_unarchived = max(num_indexed - num_archived, 0) + print(f" > indexed: 
{num_indexed}".ljust(36), "(total snapshots in DB)") + print(f" > archived: {num_archived}".ljust(36), "(snapshots with archived content)") + print(f" > unarchived: {num_unarchived}".ljust(36), "(snapshots pending archiving)") + + # Count snapshot directories on filesystem across both legacy and current layouts. + if do_precise_fs_scan: + links = list(snapshots_qs) + expected_snapshot_dirs = {str(Path(snapshot.output_dir).resolve()) for snapshot in links if Path(snapshot.output_dir).exists()} + discovered_snapshot_dirs = {str(entry.resolve()) for entry in legacy_snapshot_dirs} + + if users_dir.exists(): + discovered_snapshot_dirs.update( + str(entry.resolve()) for entry in users_dir.glob(f"*/{CONSTANTS.SNAPSHOTS_DIR_NAME}/*/*/*") if entry.is_dir() + ) + + orphaned_dirs = sorted(discovered_snapshot_dirs - expected_snapshot_dirs) + num_present = len(discovered_snapshot_dirs) + num_valid = len(discovered_snapshot_dirs & expected_snapshot_dirs) + else: + orphaned_dirs = [] + num_present = num_archived + num_valid = num_archived + print() + print(f" > present: {num_present}".ljust(36), "(snapshot directories on disk)") + print(f" > [green]valid:[/green] {num_valid}".ljust(36), " (directories with matching DB entry)") + + num_orphaned = len(orphaned_dirs) + print(f" > [red]orphaned:[/red] {num_orphaned}".ljust(36), " (directories without matching DB entry)") + + if num_indexed: + print(" [violet]Hint:[/violet] You can list snapshots by status like so:") + print(" [green]archivebox list --status=<status> (e.g. 
sealed, queued, etc.)[/green]") + + if orphaned_dirs: + print(" [violet]Hint:[/violet] To automatically import orphaned data directories into the main index, run:") + print(" [green]archivebox init[/green]") + + print() + print("[green]\\[*] Scanning recent archive changes and user logins:[/green]") + print(f"[yellow] {CONSTANTS.LOGS_DIR}/*[/yellow]") + admin_users = User.objects.filter(is_superuser=True).exclude(username="system") + users = [user.get_username() for user in admin_users] + print(f" UI users {len(users)}: {', '.join(users)}") + last_login = admin_users.order_by("last_login").last() + if last_login: + print(f" Last UI login: {last_login.get_username()} @ {str(last_login.last_login)[:16]}") + last_downloaded = Snapshot.objects.order_by("downloaded_at").last() + if last_downloaded: + print(f" Last changes: {str(last_downloaded.downloaded_at)[:16]}") + + if not users: + print() + print(" [violet]Hint:[/violet] You can create an admin user by running:") + print(" [green]archivebox manage createsuperuser[/green]") + + print() + recent_snapshots = snapshots_qs.order_by( + "-downloaded_at", + "-modified_at", + )[:10] + for snapshot in recent_snapshots: + if not snapshot.downloaded_at: + continue + print( + ( + "[grey53] " + f" > {str(snapshot.downloaded_at)[:16]} " + f"[{snapshot.num_outputs} {('X', 'โˆš')[snapshot.status == Snapshot.StatusChoices.SEALED]} {printable_filesize(snapshot.output_size or 0)}] " + f'"{snapshot.title}": {snapshot.url}' + "[/grey53]" + )[: config.TERM_WIDTH], + ) + print("[grey53] ...") + + +@click.command() +@docstring(status.__doc__) +def main(**kwargs): + """Print out some info and statistics about the archive collection""" + status(**kwargs) + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_tag.py b/archivebox/cli/archivebox_tag.py new file mode 100644 index 0000000000..ff6692a740 --- /dev/null +++ b/archivebox/cli/archivebox_tag.py @@ -0,0 +1,299 @@ +#!/usr/bin/env python3 + +""" +archivebox tag 
<action> [args...] [--filters] + +Manage Tag records. + +Actions: + create - Create Tags + list - List Tags as JSONL (with optional filters) + update - Update Tags from stdin JSONL + delete - Delete Tags from stdin JSONL + +Examples: + # Create + archivebox tag create news tech science + archivebox tag create "important stuff" + + # List + archivebox tag list + archivebox tag list --name__icontains=news + + # Update (rename tags) + archivebox tag list --name=oldname | archivebox tag update --name=newname + + # Delete + archivebox tag list --name=unused | archivebox tag delete --yes +""" + +__package__ = "archivebox.cli" +__command__ = "archivebox tag" + +import sys +from collections.abc import Iterable + +import rich_click as click +from rich import print as rprint + +from archivebox.cli.cli_util import apply_filters + + +# ============================================================================= +# CREATE +# ============================================================================= + + +def create_tags(names: Iterable[str]) -> int: + """ + Create Tags from names. + + Exit codes: + 0: Success + 1: Failure + """ + from archivebox.misc.jsonl import write_record + from archivebox.core.models import Tag + + is_tty = sys.stdout.isatty() + + # Convert to list if needed + name_list = list(names) if names else [] + + if not name_list: + rprint("[yellow]No tag names provided. 
Pass names as arguments.[/yellow]", file=sys.stderr) + return 1 + + created_count = 0 + for name in name_list: + name = name.strip() + if not name: + continue + + tag, created = Tag.objects.get_or_create(name=name) + + if not is_tty: + write_record(tag.to_json()) + + if created: + created_count += 1 + rprint(f"[green]Created tag: {name}[/green]", file=sys.stderr) + else: + rprint(f"[dim]Tag already exists: {name}[/dim]", file=sys.stderr) + + rprint(f"[green]Created {created_count} new tags[/green]", file=sys.stderr) + return 0 + + +# ============================================================================= +# LIST +# ============================================================================= + + +def list_tags( + name: str | None = None, + name__icontains: str | None = None, + limit: int | None = None, +) -> int: + """ + List Tags as JSONL with optional filters. + + Exit codes: + 0: Success (even if no results) + """ + from archivebox.misc.jsonl import write_record + from archivebox.core.models import Tag + + is_tty = sys.stdout.isatty() + + queryset = Tag.objects.all().order_by("name") + + # Apply filters + filter_kwargs = { + "name": name, + "name__icontains": name__icontains, + } + queryset = apply_filters(queryset, filter_kwargs, limit=limit) + + count = 0 + for tag in queryset: + snapshot_count = tag.snapshot_set.count() + if is_tty: + rprint(f"[cyan]{tag.name:30}[/cyan] [dim]({snapshot_count} snapshots)[/dim]") + else: + write_record(tag.to_json()) + count += 1 + + rprint(f"[dim]Listed {count} tags[/dim]", file=sys.stderr) + return 0 + + +# ============================================================================= +# UPDATE +# ============================================================================= + + +def update_tags(name: str | None = None) -> int: + """ + Update Tags from stdin JSONL. + + Reads Tag records from stdin and applies updates. + Uses PATCH semantics - only specified fields are updated. 
+ + Exit codes: + 0: Success + 1: No input or error + """ + from archivebox.misc.jsonl import read_stdin, write_record + from archivebox.core.models import Tag + + is_tty = sys.stdout.isatty() + + records = list(read_stdin()) + if not records: + rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr) + return 1 + + updated_count = 0 + for record in records: + tag_id = record.get("id") + old_name = record.get("name") + + if not tag_id and not old_name: + continue + + try: + if tag_id: + tag = Tag.objects.get(id=tag_id) + else: + tag = Tag.objects.get(name=old_name) + + # Apply updates from CLI flags + if name: + tag.name = name + tag.save() + + updated_count += 1 + + if not is_tty: + write_record(tag.to_json()) + + except Tag.DoesNotExist: + rprint(f"[yellow]Tag not found: {tag_id or old_name}[/yellow]", file=sys.stderr) + continue + + rprint(f"[green]Updated {updated_count} tags[/green]", file=sys.stderr) + return 0 + + +# ============================================================================= +# DELETE +# ============================================================================= + + +def delete_tags(yes: bool = False, dry_run: bool = False) -> int: + """ + Delete Tags from stdin JSONL. + + Requires --yes flag to confirm deletion. 
+ + Exit codes: + 0: Success + 1: No input or missing --yes flag + """ + from archivebox.misc.jsonl import read_stdin + from archivebox.core.models import Tag + + records = list(read_stdin()) + if not records: + rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr) + return 1 + + # Collect tag IDs or names + tag_ids = [] + tag_names = [] + for r in records: + if r.get("id"): + tag_ids.append(r["id"]) + elif r.get("name"): + tag_names.append(r["name"]) + + if not tag_ids and not tag_names: + rprint("[yellow]No valid tag IDs or names in input[/yellow]", file=sys.stderr) + return 1 + + from django.db.models import Q + + query = Q() + if tag_ids: + query |= Q(id__in=tag_ids) + if tag_names: + query |= Q(name__in=tag_names) + + tags = Tag.objects.filter(query) + count = tags.count() + + if count == 0: + rprint("[yellow]No matching tags found[/yellow]", file=sys.stderr) + return 0 + + if dry_run: + rprint(f"[yellow]Would delete {count} tags (dry run)[/yellow]", file=sys.stderr) + for tag in tags: + rprint(f" {tag.name}", file=sys.stderr) + return 0 + + if not yes: + rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr) + return 1 + + # Perform deletion + deleted_count, _ = tags.delete() + rprint(f"[green]Deleted {deleted_count} tags[/green]", file=sys.stderr) + return 0 + + +# ============================================================================= +# CLI Commands +# ============================================================================= + + +@click.group() +def main(): + """Manage Tag records.""" + pass + + +@main.command("create") +@click.argument("names", nargs=-1) +def create_cmd(names: tuple): + """Create Tags from names.""" + sys.exit(create_tags(names)) + + +@main.command("list") +@click.option("--name", help="Filter by exact name") +@click.option("--name__icontains", help="Filter by name contains") +@click.option("--limit", "-n", type=int, help="Limit number of results") +def list_cmd(name: str | None, name__icontains: 
str | None, limit: int | None): + """List Tags as JSONL.""" + sys.exit(list_tags(name=name, name__icontains=name__icontains, limit=limit)) + + +@main.command("update") +@click.option("--name", "-n", help="Set new name") +def update_cmd(name: str | None): + """Update Tags from stdin JSONL.""" + sys.exit(update_tags(name=name)) + + +@main.command("delete") +@click.option("--yes", "-y", is_flag=True, help="Confirm deletion") +@click.option("--dry-run", is_flag=True, help="Show what would be deleted") +def delete_cmd(yes: bool, dry_run: bool): + """Delete Tags from stdin JSONL.""" + sys.exit(delete_tags(yes=yes, dry_run=dry_run)) + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py new file mode 100644 index 0000000000..c978f2a115 --- /dev/null +++ b/archivebox/cli/archivebox_update.py @@ -0,0 +1,1048 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +__package__ = "archivebox.cli" + +import os +import asyncio +import shlex +import time + +from typing import TYPE_CHECKING, Any +from collections.abc import Iterable +from pathlib import Path + +import rich_click as click + +from archivebox.misc.util import enforce_types, docstring +from archivebox.cli.archivebox_snapshot import snapshot_filter_options + +if TYPE_CHECKING: + from django.db.models import QuerySet + from archivebox.core.models import Snapshot + from archivebox.crawls.models import Crawl + + +def _get_snapshot_crawl(snapshot: Snapshot) -> Crawl | None: + from django.core.exceptions import ObjectDoesNotExist + + try: + return snapshot.crawl + except ObjectDoesNotExist: + return None + + +def _get_search_indexing_plugins() -> list[str]: + from archivebox.config.common import get_config + from archivebox.plugins.hooks import discover_hooks + from archivebox.plugins.discovery import get_search_backends + + available_backends = set(get_search_backends()) + return sorted( + plugin_name + for plugin_name in { + 
hook.parent.name + for hook in discover_hooks("Snapshot", config=get_config()) + if hook.parent.name.startswith("search_backend_") and "index" in hook.name.lower() + } + if plugin_name.startswith("search_backend_") and plugin_name.removeprefix("search_backend_") in available_backends + ) + + +def _build_filtered_snapshots_queryset( + **kwargs, +): + from archivebox.core.models import Snapshot + from archivebox.cli.archivebox_snapshot import build_snapshot_queryset + + limit = kwargs.pop("limit", None) + snapshots = build_snapshot_queryset(**kwargs) + if kwargs.get("resume"): + snapshots = snapshots.filter(timestamp__lte=kwargs["resume"]) + snapshots = snapshots.select_related("crawl") + if limit is not None and limit > 0: + snapshot_ids = list(snapshots.values_list("id", flat=True)[:limit]) + snapshots = Snapshot.objects.filter(id__in=snapshot_ids).select_related("crawl") + + return snapshots + + +def reindex_snapshots( + snapshots: QuerySet[Snapshot, Snapshot], + *, + search_plugins: list[str], + batch_size: int, + collect_ids: bool = False, + wait_for_turn=None, +) -> dict[str, Any]: + from archivebox.cli.archivebox_extract import run_plugins + from archivebox.core.models import ArchiveResult + from abx_dl.models import discover_plugins + + stats: dict[str, Any] = {"processed": 0, "requested": 0, "queued": 0, "skipped_queued": 0, "reindexed": 0, "snapshot_ids": []} + records: list[dict[str, str]] = [] + plugins_by_name = discover_plugins(runtime="archivebox") + required_hooks_by_plugin = { + plugin_name: frozenset(hook.name for hook in plugins_by_name[plugin_name].filter_hooks("Snapshot")) + for plugin_name in search_plugins + if plugin_name in plugins_by_name + } + + total = snapshots.count() + print(f"[*] Reindexing {total} snapshots with search plugins: {', '.join(search_plugins)}") + + def run_batch() -> None: + if not records: + return + if wait_for_turn: + wait_for_turn() + batch_records = list(records) + snapshot_ids = {record["snapshot_id"] for record in 
batch_records} + plugin_names = {record["plugin"] for record in batch_records} + queued_rows = { + (str(snapshot_id), plugin_name, hook_name) + for snapshot_id, plugin_name, hook_name in ArchiveResult.objects.filter( + snapshot_id__in=snapshot_ids, + plugin__in=plugin_names, + status=ArchiveResult.StatusChoices.QUEUED, + ).values_list("snapshot_id", "plugin", "hook_name") + } + records_to_queue = [] + for record in batch_records: + snapshot_id = record["snapshot_id"] + plugin_name = record["plugin"] + required_hooks = required_hooks_by_plugin.get(plugin_name, frozenset()) + if required_hooks and all((snapshot_id, plugin_name, hook_name) in queued_rows for hook_name in required_hooks): + stats["skipped_queued"] += 1 + continue + records_to_queue.append(record) + if not records_to_queue: + print( + f" [{stats['processed']}/{total}] Already queued {len(batch_records)} index jobs", + ) + records.clear() + return + # `archivebox update --index-only` intentionally breaks the usual + # "runner discovers work" rule by inserting synthetic queued + # ArchiveResult rows for search backends. run_plugins() keeps this as + # statement-sized UPDATE/bulk_create work, then bumps Snapshot.retry_at + # so the orchestrator owns actual hook execution. Paused snapshots stay + # PAUSED; run_due_snapshot restores retry_at=MAX after targeted rows + # finish. 
+ exit_code = run_plugins( + args=(), + records=records_to_queue, + wait=False, + emit_results=False, + show_progress=False, + preserve_queued=True, + ) + if exit_code != 0: + raise SystemExit(exit_code) + stats["queued"] += len(records_to_queue) + print( + f" [{stats['processed']}/{total}] Queued {len(records_to_queue)} index jobs for orchestrator", + ) + records.clear() + + for snapshot in snapshots.select_related("crawl").paged_iterator(chunk_size=batch_size): + try: + stats["processed"] += 1 + + if _get_snapshot_crawl(snapshot) is None: + continue + + if collect_ids: + stats["snapshot_ids"].append(str(snapshot.id)) + for plugin_name in search_plugins: + records.append( + { + "type": "ArchiveResult", + "snapshot_id": str(snapshot.id), + "plugin": plugin_name, + }, + ) + stats["requested"] += 1 + if len(records) >= batch_size: + run_batch() + except KeyboardInterrupt as err: + err.archivebox_resume = snapshot.timestamp + raise + + run_batch() + return stats + + +@enforce_types +def update( + filter_patterns: Iterable[str] = (), + filter_type: str = "exact", + status: str | None = None, + url__icontains: str | None = None, + url__istartswith: str | None = None, + tag: str | None = None, + crawl_id: str | None = None, + limit: int | None = None, + sort: str | None = None, + search: str | None = None, + before: float | None = None, + after: float | None = None, + resume: str | None = None, + batch_size: int = 500, + continuous: bool = False, + index_only: bool = False, + migrate_only: bool = False, + stop_daemon_stack: bool = True, +) -> None: + """ + Update snapshots: migrate old dirs, reconcile DB, and re-queue for archiving. 
+ + Three-phase operation (without filters): + - Phase 1: Drain old archive/ dirs by moving to new fs location (0.8.x โ†’ 0.9.x) + - Phase 2: O(n) scan over entire DB from most recent to least recent + - No orphan scans needed (trust 1:1 mapping between DB and filesystem after phase 1) + + With filters: Only phase 2 (DB query), no filesystem operations. + Without filters: All phases (full update). + """ + + from rich import print + from archivebox.config import CONSTANTS + from archivebox.config.django import setup_django + + setup_django() + from archivebox.misc.checks import check_migrations + + # This must be the first database operation in `archivebox update`. + # Old 0.7.x/0.8.x collections may not have current machine/process/crawl + # tables yet, and even "harmless" runtime-stack bookkeeping uses current + # ORM models. Apply Django migrations before creating Process rows, checking + # runtime ownership, queuing retry_at maintenance ticks, or touching any + # lazy Snapshot.save() filesystem migration path. 
+ print("[*] Checking for pending migrations...") + check_migrations(auto_apply=True) + + from archivebox.machine.models import Process + from archivebox.core.shutdown_util import foreground_parent_watchdog, foreground_shutdown_signals, raise_if_shutdown_requested + from archivebox.core.takeover_util import ( + command_owns_foreground_runner, + current_command, + ensure_daemon_stack, + foreground_runner_owner, + standby_until_foreground_runner_needed, + ) + from archivebox.workers.supervisord_util import run_runner_worker, stop_own_supervisord_process + + command = current_command(Process.TypeChoices.UPDATE, data_dir=CONSTANTS.DATA_DIR) + + def still_owns_foreground_runner() -> bool: + from django.db import connections + + try: + return command_owns_foreground_runner(command, data_dir=CONSTANTS.DATA_DIR) + finally: + connections.close_all() + + def wait_for_turn() -> None: + raise_if_shutdown_requested() + standby_until_foreground_runner_needed(command, data_dir=CONSTANTS.DATA_DIR) + raise_if_shutdown_requested() + + def run_scoped_runner(*args: str, ensure_daemon_reason: str | None = None) -> None: + while True: + wait_for_turn() + if ensure_daemon_reason: + ensure_daemon_stack(reason=ensure_daemon_reason) + exit_code = run_runner_worker( + list(args), + name=f"worker_runner_update_{os.getpid()}", + keep_running=still_owns_foreground_runner, + ) + if exit_code == 0: + return + if not command_owns_foreground_runner(command, data_dir=CONSTANTS.DATA_DIR): + continue + raise SystemExit(exit_code) + + is_filtered_update = any( + ( + filter_patterns, + status, + url__icontains, + url__istartswith, + tag, + crawl_id, + limit, + sort, + search, + before, + after, + ), + ) + touched_snapshot_ids: set[str] = set() + exit_code = 0 + + try: + wait_for_turn() + + with foreground_shutdown_signals(), foreground_parent_watchdog(): + while True: + do_migrate = migrate_only or not index_only + do_index = index_only or not migrate_only + do_run_until_idle = do_migrate or do_index + 
ran_post_migrate_runner = False + full_update_empty = False + maintenance_work_queued = False + runner_work_queued = False + + if do_migrate: + if ( + filter_patterns + or status + or url__icontains + or url__istartswith + or tag + or crawl_id + or limit + or sort + or search + or before + or after + ): + print("[*] Processing filtered snapshots from database...") + stats = process_filtered_snapshots( + filter_patterns=filter_patterns, + filter_type=filter_type, + status=status, + url__icontains=url__icontains, + url__istartswith=url__istartswith, + tag=tag, + crawl_id=crawl_id, + limit=limit, + sort=sort, + search=search, + before=before, + after=after, + resume=resume, + batch_size=batch_size, + queue_for_archiving=do_run_until_idle, + wait_for_turn=wait_for_turn, + ) + print_stats(stats) + touched_snapshot_ids.update(stats.get("snapshot_ids", [])) + maintenance_work_queued = stats.get("queued", 0) > 0 + runner_work_queued = runner_work_queued or maintenance_work_queued + else: + stats_combined = {"phase1": {}, "phase2": {}} + + print("[*] Phase 1: Draining old archive/ directories (0.8.x โ†’ 0.9.x migration)...") + stats_combined["phase1"] = drain_old_archive_dirs( + resume_from=resume, + batch_size=batch_size, + ) + + print("[*] Phase 2: Processing all database snapshots (most recent first)...") + stats_combined["phase2"] = process_all_db_snapshots( + batch_size=batch_size, + resume=resume, + wait_for_turn=wait_for_turn, + ) + print_combined_stats(stats_combined) + full_update_empty = ( + stats_combined["phase1"].get("processed", 0) == 0 and stats_combined["phase2"].get("snapshots", 0) == 0 + ) + maintenance_work_queued = any( + ( + stats_combined["phase1"].get("queued", 0), + stats_combined["phase2"].get("queued", 0), + stats_combined["phase2"].get("crawls_sealed", 0), + ), + ) + runner_work_queued = runner_work_queued or maintenance_work_queued + + if do_run_until_idle: + # Filesystem migration is maintenance on existing + # Snapshot rows: Snapshot.save() 
moves archive/<ts> to + # the current output_dir and preserves the lifecycle + # status. Drain those retry_at ticks before queuing + # search backfill below. Otherwise the sealed/paused + # runner branch correctly sees queued ArchiveResult + # rows first, runs the targeted plugins, and may leave + # the fs_version maintenance tick hidden behind that + # plugin work until another update pass. + if full_update_empty: + print("[*] No snapshots or legacy archive directories found; skipping filesystem maintenance runner.") + elif not maintenance_work_queued: + print("[*] No filesystem maintenance work queued; skipping filesystem maintenance runner.") + else: + print("[*] Phase 3: Running filesystem maintenance until idle...") + if full_update_empty: + pass + elif not maintenance_work_queued: + pass + elif is_filtered_update: + if not touched_snapshot_ids: + print("[*] No matching snapshots queued work for the runner.") + for snapshot_id in sorted(touched_snapshot_ids): + run_scoped_runner("--snapshot-id", snapshot_id) + else: + run_scoped_runner("--maintenance-only", "--maintenance-batch-size", str(batch_size)) + ran_post_migrate_runner = True + + if do_index: + if full_update_empty: + print("[*] No snapshots found; skipping search indexing backfill.") + else: + ensure_daemon_stack(reason="search indexing") + search_plugins = _get_search_indexing_plugins() + if not search_plugins: + print("[*] No search indexing plugins are available, nothing to backfill.") + else: + snapshots = _build_filtered_snapshots_queryset( + filter_patterns=filter_patterns, + filter_type=filter_type, + status=status, + url__icontains=url__icontains, + url__istartswith=url__istartswith, + tag=tag, + crawl_id=crawl_id, + limit=limit, + sort=sort, + search=search, + before=before, + after=after, + resume=resume, + ) + from django.db.models import Exists, OuterRef, Q + from django.utils import timezone + from archivebox.core.models import ArchiveResult, Snapshot + + scoped_snapshot_ids = 
snapshots.order_by().values("id") if is_filtered_update else None + queued_index_results = ArchiveResult.objects.filter( + status=ArchiveResult.StatusChoices.QUEUED, + plugin__in=search_plugins, + ) + if scoped_snapshot_ids is not None: + queued_index_results = queued_index_results.filter(snapshot_id__in=scoped_snapshot_ids) + + if queued_index_results.exists(): + runner_work_queued = True + now = timezone.now() + queued_result_for_snapshot = queued_index_results.filter(snapshot_id=OuterRef("pk")) + snapshots_to_wake = ( + Snapshot.objects.filter( + status__in=(Snapshot.StatusChoices.SEALED, Snapshot.StatusChoices.PAUSED), + ) + .annotate( + has_queued_index_result=Exists(queued_result_for_snapshot), + ) + .filter( + has_queued_index_result=True, + ) + .filter( + Q(retry_at__isnull=True) | Q(retry_at__gt=now), + ) + ) + if scoped_snapshot_ids is not None: + snapshots_to_wake = snapshots_to_wake.filter(id__in=scoped_snapshot_ids) + woken_count = snapshots_to_wake.update( + retry_at=now, + modified_at=now, + ) + print( + "[*] Existing queued search index jobs found; " + f"skipping backfill scan and waking {woken_count} snapshot(s) for the runner.", + ) + else: + collect_index_ids = ( + is_filtered_update + or foreground_runner_owner( + data_dir=CONSTANTS.DATA_DIR, + exclude_id=command.id, + ) + is not None + ) + stats = reindex_snapshots( + snapshots, + search_plugins=search_plugins, + batch_size=batch_size, + collect_ids=collect_index_ids, + wait_for_turn=wait_for_turn, + ) + print_index_stats(stats) + touched_snapshot_ids.update(stats.get("snapshot_ids", [])) + runner_work_queued = runner_work_queued or stats["queued"] > 0 + + if do_run_until_idle and (do_index or not ran_post_migrate_runner): + # Search/index backfill intentionally queues targeted + # ArchiveResult rows without reopening sealed/paused + # snapshots. This second runner pass drains those plugin + # rows after filesystem maintenance has had its own turn. 
+ # For a normal unfiltered `archivebox update`, keep the + # historical final pass broad enough to resume genuinely + # queued/interrupted crawl work after maintenance is done. + if full_update_empty: + print("[*] No snapshots found; skipping queued/interrupted crawl runner.") + elif not runner_work_queued: + print("[*] No queued/interrupted crawl work found; skipping queued/interrupted crawl runner.") + else: + print("[*] Phase 3: Running queued/interrupted crawl work until idle...") + if full_update_empty: + pass + elif not runner_work_queued: + pass + elif touched_snapshot_ids and is_filtered_update: + if not touched_snapshot_ids: + print("[*] No matching snapshots queued work for the runner.") + for snapshot_id in sorted(touched_snapshot_ids): + run_scoped_runner("--snapshot-id", snapshot_id) + else: + run_scoped_runner( + *(["--maintenance-only", "--maintenance-batch-size", str(batch_size)] if index_only or migrate_only else []), + ensure_daemon_reason="search indexing" if do_index else None, + ) + + if not continuous: + break + + print("[yellow]Sleeping 60s before next pass...[/yellow]") + time.sleep(60) + resume = None + except (KeyboardInterrupt, asyncio.CancelledError) as err: + exit_code = 130 + exact_resume = err.__dict__.get("archivebox_resume") + resume_cmd = ["archivebox", "update"] + if migrate_only: + resume_cmd.append("--migrate-only") + if index_only: + resume_cmd.append("--index-only") + if batch_size != 500: + resume_cmd.extend(["--batch-size", str(batch_size)]) + if exact_resume or resume: + resume_cmd.extend(["--resume", str(exact_resume or resume)]) + if before is not None: + resume_cmd.extend(["--before", str(before)]) + if after is not None: + resume_cmd.extend(["--after", str(after)]) + if filter_type != "exact": + resume_cmd.extend(["--filter-type", filter_type]) + if status: + resume_cmd.extend(["--status", status]) + if url__icontains: + resume_cmd.extend(["--url__icontains", url__icontains]) + if url__istartswith: + 
resume_cmd.extend(["--url__istartswith", url__istartswith]) + if tag: + resume_cmd.extend(["--tag", tag]) + if crawl_id: + resume_cmd.extend(["--crawl-id", crawl_id]) + if limit: + resume_cmd.extend(["--limit", str(limit)]) + if sort: + resume_cmd.extend(["--sort", sort]) + if search: + resume_cmd.extend(["--search", search]) + resume_cmd.extend(str(pattern) for pattern in filter_patterns) + print("\n[red][X] archivebox update interrupted.[/red]") + print("[yellow]Hint: resume this idempotent update with:[/yellow]") + print(f" [green]{shlex.join(resume_cmd)}[/green]") + raise SystemExit(exit_code) + except SystemExit as err: + if isinstance(err.code, int): + exit_code = err.code + elif err.code: + exit_code = 1 + raise + finally: + command.mark_exited(exit_code=exit_code) + if stop_daemon_stack: + stop_own_supervisord_process() + + +def drain_old_archive_dirs(resume_from: str | None = None, batch_size: int = 500) -> dict[str, int]: + """ + Drain old archive/ directories (0.8.x โ†’ 0.9.x migration). + + Only processes real directories (skips symlinks - those are already migrated). + For each old dir found in archive/: + 1. Load or create DB snapshot + 2. Trigger fs migration on save() to move to data/archive/users/{user}/... + 3. Leave symlink in archive/ pointing to new location + + After this drains, archive/ should only contain symlinks and we can trust + 1:1 mapping between DB and filesystem. 
+ """ + from archivebox.core.models import Snapshot + from archivebox.config import CONSTANTS + from archivebox.crawls.models import Crawl + from django.utils import timezone + + stats = {"processed": 0, "migrated": 0, "queued": 0, "skipped": 0, "invalid": 0} + crawl_url_lines: dict[str, list[str]] = {} + crawl_url_sets: dict[str, set[str]] = {} + dirty_crawl_ids: set[str] = set() + + archive_dir = CONSTANTS.ARCHIVE_DIR + if not archive_dir.exists(): + return stats + + last_crawl_id = None + while True: + crawl_qs = Crawl.objects.filter(label__startswith="[migration] orphaned").order_by("id") + if last_crawl_id is not None: + crawl_qs = crawl_qs.filter(id__gt=last_crawl_id) + crawl_batch = list(crawl_qs[:batch_size]) + if not crawl_batch: + break + for crawl in crawl_batch: + last_crawl_id = crawl.id + url_entries = crawl._iter_url_lines() + existing_urls = {url for _raw_line, url in url_entries if url} + lines = (crawl.urls or "").splitlines() + changed = False + for url in crawl.snapshot_set.order_by("timestamp").values_list("url", flat=True): + if url not in existing_urls: + lines.append(url) + existing_urls.add(url) + changed = True + if changed: + Crawl.objects.filter(pk=crawl.pk).update(urls="\n".join(lines), modified_at=timezone.now()) + + # Scan for real directories only (skip symlinks - they're already migrated) + all_entries = list(os.scandir(archive_dir)) + entries = [ + (e.stat().st_mtime, e.path) + for e in all_entries + if e.is_dir(follow_symlinks=False) and Snapshot.is_legacy_archive_dir(Path(e.path)) # Skip symlinks and 0.9.x roots + ] + entries.sort(reverse=True) # Newest first + print(f"[*] Found {len(entries)} old directories to drain") + + for mtime, entry_path in entries: + entry_path = Path(entry_path) + + # Resume from timestamp if specified + if resume_from and entry_path.name > resume_from: + continue + + stats["processed"] += 1 + + # Try to load existing snapshot from DB + snapshot = Snapshot.load_from_directory(entry_path) + + if not 
snapshot: + # Not in DB - create new snapshot record + snapshot = Snapshot.create_from_directory(entry_path) + if not snapshot: + # Invalid directory - move to invalid/ + Snapshot.move_directory_to_invalid(entry_path) + stats["invalid"] += 1 + print(f" [{stats['processed']}] Invalid: {entry_path.name}") + continue + + try: + snapshot.status = Snapshot.StatusChoices.SEALED + snapshot.retry_at = timezone.now() + # Snapshot.save() owns URL validation and filesystem/index side + # effects. Do not use bulk_create() here; it bypasses save(). + snapshot.save() + + crawl = _get_snapshot_crawl(snapshot) + if crawl is not None: + crawl_cache_key = str(crawl.id) + existing_urls = crawl_url_sets.get(crawl_cache_key) + if existing_urls is None: + url_entries = crawl._iter_url_lines() + existing_urls = {url for _raw_line, url in url_entries if url} + crawl_url_sets[crawl_cache_key] = existing_urls + crawl_url_lines[crawl_cache_key] = (crawl.urls or "").splitlines() + if snapshot.url not in existing_urls: + crawl_url_lines[crawl_cache_key].append(snapshot.url) + existing_urls.add(snapshot.url) + dirty_crawl_ids.add(crawl_cache_key) + + stats["queued"] += 1 + print(f" [{stats['processed']}] Imported orphaned snapshot and queued migration: {entry_path.name}") + except Exception as e: + stats["skipped"] += 1 + print(f" [{stats['processed']}] Skipped (error: {e}): {entry_path.name}") + continue + + # Ensure snapshot has a valid crawl (migration 0024 may have failed) + has_valid_crawl = _get_snapshot_crawl(snapshot) is not None + + if not has_valid_crawl: + # Create a new crawl (created_by will default to system user) + crawl = Crawl.objects.create(urls=snapshot.url) + # Use safe_update() to avoid save() hooks and keep the SQLite + # write to one statement while the migration loop does filesystem + # work outside any transaction. The modified_at CAS prevents this + # repair scan from overwriting a newer Snapshot edit. 
+ if not snapshot.safe_update( + {"crawl": crawl}, + refresh=False, + extra_filter={"modified_at": snapshot.modified_at}, + ): + stats["skipped"] += 1 + print(f" [{stats['processed']}] Skipped stale snapshot repair: {entry_path.name}") + continue + snapshot.crawl = crawl + + # Check if needs migration (0.8.x โ†’ 0.9.x) + try: + if snapshot.fs_migration_needed: + if snapshot.safe_update( + {"retry_at": timezone.now(), "modified_at": timezone.now()}, + refresh=False, + extra_filter={"modified_at": snapshot.modified_at}, + ): + stats["queued"] += 1 + print(f" [{stats['processed']}] Queued filesystem migration: {entry_path.name}") + else: + stats["skipped"] += 1 + print(f" [{stats['processed']}] Skipped stale filesystem migration row: {entry_path.name}") + else: + stats["skipped"] += 1 + except Exception as e: + stats["skipped"] += 1 + print(f" [{stats['processed']}] Skipped (error: {e}): {entry_path.name}") + + if stats["processed"] % batch_size == 0: + for crawl_id in tuple(dirty_crawl_ids): + Crawl.objects.filter(pk=crawl_id).update( + urls="\n".join(crawl_url_lines[crawl_id]), + modified_at=timezone.now(), + ) + dirty_crawl_ids.clear() + + for crawl_id in tuple(dirty_crawl_ids): + Crawl.objects.filter(pk=crawl_id).update( + urls="\n".join(crawl_url_lines[crawl_id]), + modified_at=timezone.now(), + ) + dirty_crawl_ids.clear() + return stats + + +def process_all_db_snapshots(batch_size: int = 500, resume: str | None = None, wait_for_turn=None) -> dict[str, int]: + """ + O(n) scan over entire DB from most recent to least recent. + + For each snapshot: + 1. Reconcile index.json with DB (merge titles, tags, archive results) + 2. Mark migrated snapshots sealed unless explicitly re-queued elsewhere + + No orphan detection needed - we trust 1:1 mapping between DB and filesystem + after Phase 1 has drained all old archive/ directories. 
+ """ + from archivebox.core.models import Snapshot + from archivebox.crawls.models import Crawl + from django.db.models import Q + from django.utils import timezone + + stats = { + "processed": 0, + "scanned_dirs": 0, + "updated_json": 0, + "updated_db": 0, + "queued": 0, + "sealed": 0, + "crawls_sealed": 0, + } + current_fs_version = Snapshot._fs_current_version() + + queryset = Snapshot.objects.all() + if resume: + queryset = queryset.filter(timestamp__lte=resume) + total = queryset.count() + stats["snapshots"] = total + print(f"[*] Processing {total} snapshots from database (most recent first)...") + + def update_in_batches(rows, *, label: str, **updates) -> int: + updated = 0 + checked = 0 + while True: + if wait_for_turn: + wait_for_turn() + batch = list(rows.only("id", "modified_at").order_by("-timestamp")[:batch_size]) + if not batch: + if updated: + print(f" [{label}] complete: {updated} rows updated") + return updated + checked += len(batch) + print(f" [{label}] updating next {len(batch)} rows (seen {checked})...") + for snapshot in batch: + # This maintenance scan intentionally bypasses save(); it is + # only normalizing scheduler fields, and Snapshot.save() may + # do filesystem migration work that belongs in the runner. + # Guard each single-row UPDATE with modified_at so stale scan + # pages cannot overwrite newer runner/admin writes. 
+ updated += int( + snapshot.safe_update( + updates, + refresh=False, + extra_filter={"modified_at": snapshot.modified_at}, + ), + ) + print(f" [{label}] updated {updated} rows so far") + + now = timezone.now() + updated_rows = update_in_batches( + queryset.exclude( + status__in=[ + Snapshot.StatusChoices.QUEUED, + Snapshot.StatusChoices.STARTED, + Snapshot.StatusChoices.PAUSED, + Snapshot.StatusChoices.SEALED, + ], + ), + label="snapshot status normalization", + status=Snapshot.StatusChoices.SEALED, + retry_at=None, + modified_at=now, + ) + stats["sealed"] += updated_rows + stats["updated_db"] += updated_rows + + fs_version_rows = queryset.exclude(fs_version=current_fs_version).filter(Q(retry_at__isnull=True) | Q(retry_at__gt=now)) + stale_batch = [] + + def queue_stale_fs_batch() -> None: + if not stale_batch: + return + if wait_for_turn: + wait_for_turn() + now = timezone.now() + # Do not bump fs_version here. The orchestrator calls Snapshot.save(), + # which performs the idempotent filesystem migration and commits the new + # fs_version in the same serialized worker path as normal crawls. + updated = 0 + for snapshot in stale_batch: + # Each row gets its own short autocommit UPDATE because this scan + # can touch millions of snapshots while a server is also alive. + # The modified_at predicate is the CAS guard: if the runner or + # admin changed the snapshot after paged_iterator read it, skip it + # and let the newer state decide whether migration is still due. 
+ updated += int( + snapshot.safe_update( + { + "retry_at": now, + "modified_at": now, + }, + refresh=False, + extra_filter={"fs_version": snapshot.fs_version}, + ), + ) + stats["processed"] += len(stale_batch) + stats["updated_db"] += updated + stats["queued"] += updated + print(f" [{stats['processed']}/{total}] Queued {updated} filesystem migrations for orchestrator...") + stale_batch.clear() + + for snapshot in ( + fs_version_rows.only("id", "crawl_id", "timestamp", "fs_version", "modified_at") + .order_by("-timestamp") + .paged_iterator(chunk_size=batch_size) + ): + try: + stale_batch.append(snapshot) + if len(stale_batch) >= batch_size: + queue_stale_fs_batch() + except KeyboardInterrupt as err: + err.archivebox_resume = snapshot.timestamp + raise + queue_stale_fs_batch() + + now = timezone.now() + # Crawls with no open child snapshots are already finished. Seal them here + # instead of waking the foreground runner; otherwise migration/update can + # accidentally re-enter full crawl execution for historical rows. 
def process_filtered_snapshots(
    filter_patterns: Iterable[str],
    filter_type: str,
    status: str | None,
    url__icontains: str | None,
    url__istartswith: str | None,
    tag: str | None,
    crawl_id: str | None,
    limit: int | None,
    sort: str | None,
    search: str | None,
    before: float | None,
    after: float | None,
    resume: str | None,
    batch_size: int,
    queue_for_archiving: bool = True,
    wait_for_turn=None,
) -> dict[str, Any]:
    """Walk the snapshots selected by the CLI filters and optionally re-queue them.

    Only database rows are touched here; ``updated_json`` is reported but
    never incremented by this function.

    Args:
        filter_patterns, filter_type, status, url__icontains,
        url__istartswith, tag, crawl_id, limit, sort, search, before, after,
        resume: forwarded unchanged to ``_build_filtered_snapshots_queryset``.
        batch_size: chunk size for ``paged_iterator`` and the cadence (in
            scanned rows) of both ``wait_for_turn`` calls and progress output.
        queue_for_archiving: when true, each snapshot is set back to
            ``QUEUED`` with fresh ``retry_at`` / ``modified_at`` values.
        wait_for_turn: optional zero-argument callable invoked before every
            batch of rows.

    Returns:
        A stats dict with the keys ``processed``, ``updated_json``,
        ``updated_db``, ``queued`` and ``snapshot_ids``.

    Raises:
        KeyboardInterrupt: re-raised with ``archivebox_resume`` set to the
            timestamp of the snapshot that was being handled.
    """
    from archivebox.core.models import Snapshot
    from django.utils import timezone

    stats: dict[str, Any] = {
        "processed": 0,
        "updated_json": 0,
        "updated_db": 0,
        "queued": 0,
        "snapshot_ids": [],
    }

    queryset = _build_filtered_snapshots_queryset(
        filter_patterns=filter_patterns,
        filter_type=filter_type,
        status=status,
        url__icontains=url__icontains,
        url__istartswith=url__istartswith,
        tag=tag,
        crawl_id=crawl_id,
        limit=limit,
        sort=sort,
        search=search,
        before=before,
        after=after,
        resume=resume,
    )

    total = queryset.count()
    print(f"[*] Found {total} matching snapshots")

    for snap in queryset.select_related("crawl").paged_iterator(chunk_size=batch_size):
        # Yield to the caller-supplied scheduler hook at the start of each batch.
        if wait_for_turn and stats["processed"] % batch_size == 0:
            wait_for_turn()
        stats["processed"] += 1

        # Rows whose crawl reference cannot be resolved are counted but not touched.
        if _get_snapshot_crawl(snap) is None:
            continue

        try:
            stats["snapshot_ids"].append(str(snap.id))

            changes = {}
            rows_changed = 0
            if not isinstance(snap.current_step, int):
                changes["current_step"] = 0
            if queue_for_archiving:
                changes["status"] = Snapshot.StatusChoices.QUEUED
                changes["retry_at"] = timezone.now()
                changes["modified_at"] = timezone.now()

            if changes:
                # Deliberately a row-level update rather than save(): save()
                # triggers output-dir hooks, which must not run while SQLite
                # holds the write lock for this state change. Index-only
                # maintenance is routed through reindex_snapshots/run_plugins
                # so paused snapshots stay paused. The row was read earlier by
                # paged_iterator(), so the modified_at value seen at read time
                # is passed as a compare-and-set filter: a stale CLI scan then
                # cannot overwrite a newer runner/admin change to the same row.
                rows_changed = int(
                    snap.safe_update(
                        changes,
                        refresh=False,
                        extra_filter={"modified_at": snap.modified_at},
                    ),
                )
                stats["updated_db"] += rows_changed

            if queue_for_archiving:
                stats["queued"] += rows_changed
        except KeyboardInterrupt as err:
            # Record where to resume from before propagating the interrupt.
            err.archivebox_resume = snap.timestamp
            raise
        except Exception as e:
            # A snapshot that cannot be processed is reported and skipped.
            print(f" [!] Skipping snapshot {snap.id}: {e}")
            continue

        if stats["processed"] % batch_size == 0:
            print(f" [{stats['processed']}/{total}] Processed...")

    return stats
def _format_binary_abspath(
    abspath: str,
    *,
    pwd: Path,
    lib_dir: Path,
    personas_dir: Path,
    home: Path,
) -> str:
    """Abbreviate a binary's absolute path relative to well-known directories.

    The path is expanded and resolved, then matched against the base
    directories in precedence order: ``pwd`` (``./``), ``lib_dir`` and the
    ``LIB_DIR`` environment variable (``LIB_DIR/``), ``personas_dir``
    (``PERSONAS_DIR/``) and ``home`` (``~/``). The first base containing the
    path wins. Each base is tried both as given and in resolved form.

    Args:
        abspath: path of the binary; a leading ``~`` is expanded.
        pwd: current working directory.
        lib_dir: configured library directory.
        personas_dir: configured personas directory.
        home: the user's home directory.

    Returns:
        ``"<label>/<relative posix path>"`` when the path is inside a base,
        the bare label (``"."``, ``"LIB_DIR"``, ``"PERSONAS_DIR"`` or ``"~"``)
        when the path *is* that base, otherwise the resolved path in POSIX
        form.
    """
    path = Path(abspath).expanduser()
    try:
        normalized = path.resolve(strict=False)
    except Exception:
        # Best-effort: fall back to the unresolved path.
        normalized = path

    # Order defines precedence; the env-provided LIB_DIR is optional.
    labelled_bases: list[tuple[Path, str]] = [(pwd, "."), (lib_dir, "LIB_DIR")]
    env_lib_dir = os.environ.get("LIB_DIR")
    if env_lib_dir:
        labelled_bases.append((Path(env_lib_dir), "LIB_DIR"))
    labelled_bases.append((personas_dir, "PERSONAS_DIR"))
    labelled_bases.append((home, "~"))

    for base, label in labelled_bases:
        candidates = [base]
        try:
            candidates.append(base.resolve(strict=False))
        except (OSError, RuntimeError):
            # Same best-effort policy as for the path itself: if the base
            # cannot be resolved, only the unresolved form is compared.
            pass

        for candidate in candidates:
            try:
                relative = normalized.relative_to(candidate)
            except ValueError:
                continue

            relative_str = relative.as_posix()
            # relative_to() returns Path(".") when both paths are identical,
            # so "." (not "") is what marks "the path is the base itself".
            if relative_str in ("", "."):
                return label
            return f"{label}/{relative_str}"

    return normalized.as_posix()
get_code_locations + from archivebox.misc.logging_util import printable_folder_status + from archivebox.config.common import get_config + + console = Console() + prnt = console.print + + # Check if LDAP is enabled (simple config lookup) + config = get_config() + LDAP_ENABLED = config.get("LDAP_ENABLED", False) + + p = platform.uname() + COMMIT_HASH = get_COMMIT_HASH() + prnt( + f"[dark_green]ArchiveBox[/dark_green] [dark_goldenrod]v{CONSTANTS.VERSION}[/dark_goldenrod]", + f"COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else 'unknown'}", + f"BUILD_TIME={get_BUILD_TIME()}", + ) + prnt( + f"IN_DOCKER={IN_DOCKER}", + f"IN_QEMU={config.IN_QEMU}", + f"ARCH={p.machine}", + f"OS={p.system}", + f"PLATFORM={platform.platform()}", + f"PYTHON={sys.implementation.name.title()}" + (" (venv)" if CONSTANTS.IS_INSIDE_VENV else ""), + ) + + try: + OUTPUT_IS_REMOTE_FS = get_data_locations().DATA_DIR.is_mount or get_data_locations().ARCHIVE_DIR.is_mount + except Exception: + OUTPUT_IS_REMOTE_FS = False + + try: + DATA_DIR_STAT = CONSTANTS.DATA_DIR.stat() + prnt( + f"EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} ARCHIVEBOX_USER={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}", + f"FS_UID={DATA_DIR_STAT.st_uid}:{DATA_DIR_STAT.st_gid}", + f"FS_PERMS={config.OUTPUT_PERMISSIONS}", + f"FS_ATOMIC={config.ENFORCE_ATOMIC_WRITES}", + f"FS_REMOTE={OUTPUT_IS_REMOTE_FS}", + ) + except Exception: + prnt( + f"EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} ARCHIVEBOX_USER={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}", + ) + + prnt( + f"DEBUG={config.DEBUG}", + f"IS_TTY={config.IS_TTY}", + f"SUDO={CONSTANTS.IS_ROOT}", + f"ID={CONSTANTS.MACHINE_ID}:{CONSTANTS.COLLECTION_ID}", + f"SEARCH_BACKEND={config.SEARCH_BACKEND_ENGINE}", + f"LDAP={LDAP_ENABLED}", + ) + prnt() + + if not (os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and os.access(CONSTANTS.CONFIG_FILE, os.R_OK)): + PANEL_TEXT = "\n".join( + ( + "", + "[violet]Hint:[/violet] [green]cd[/green] into a collection 
[blue]DATA_DIR[/blue] and run [green]archivebox version[/green] again...", + " [grey53]OR[/grey53] run [green]archivebox init[/green] to create a new collection in the current dir.", + "", + " [i][grey53](this is [red]REQUIRED[/red] if you are opening a Github Issue to get help)[/grey53][/i]", + "", + ), + ) + prnt( + Panel( + PANEL_TEXT, + expand=False, + border_style="grey53", + title="[red]:exclamation: No collection [blue]DATA_DIR[/blue] is currently active[/red]", + subtitle="Full version info is only available when inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]", + ), + ) + prnt() + return [] + + prnt("[pale_green1][i] Binary Dependencies:[/pale_green1]") + failures = [] + + # Setup Django before importing models + try: + from archivebox.config.django import setup_django + + setup_django() + + from archivebox.machine.models import Machine, Binary + + machine = Machine.current() + + if isinstance(binaries, str): + requested_names = {name.strip() for name in binaries.split(",") if name.strip()} + else: + requested_names = {name for name in (binaries or ()) if name} + + db_binaries: dict[str, Binary] = {} + for binary in Binary.objects.filter(machine=machine).order_by("name", "-modified_at"): + db_binaries.setdefault(binary.name, binary) + + all_binary_names = sorted(requested_names or set(db_binaries.keys())) + + if not all_binary_names: + prnt("", "[grey53]No binaries detected. 
Run [green]archivebox install[/green] to detect dependencies.[/grey53]") + else: + any_available = False + compact_paths = console.is_terminal + for name in all_binary_names: + if requested_names and name not in requested_names: + continue + + installed = db_binaries.get(name) + if installed and installed.is_valid: + display_name = Path(name).expanduser().name if ("/" in name or name.startswith("~")) else name + display_path = ( + _format_binary_abspath( + installed.abspath, + pwd=Path.cwd(), + lib_dir=config.LIB_DIR, + personas_dir=CONSTANTS.PERSONAS_DIR, + home=Path.home(), + ) + if compact_paths + else installed.abspath + ) + rendered_path = _render_binary_abspath(display_path) if compact_paths else display_path + version_str = (installed.version or "unknown")[:15] + provider = (installed.binprovider or "env")[:8] + prnt( + "", + "[green]โˆš[/green]", + "", + display_name.ljust(18), + version_str.ljust(16), + provider.ljust(8), + rendered_path, + overflow="ignore", + crop=False, + ) + any_available = True + continue + + status = ( + "[grey53]not recorded[/grey53]" if name in requested_names and installed is None else "[grey53]not installed[/grey53]" + ) + prnt("", "[red]X[/red]", "", name.ljust(18), status, overflow="ignore", crop=False) + failures.append(name) + + if not any_available: + prnt("", "[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]") + + # Show hint if no binaries are installed yet + has_any_installed = Binary.objects.filter(machine=machine).exclude(abspath="").exists() + if not has_any_installed: + prnt() + prnt("", "[grey53]Run [green]archivebox install[/green] to detect and install dependencies.[/grey53]") + + except Exception as e: + # Handle database errors gracefully (locked, missing, etc.) 
+ prnt() + prnt("", f"[yellow]Warning: Could not query binaries from database: {e}[/yellow]") + prnt("", "[grey53]Run [green]archivebox init[/green] and [green]archivebox install[/green] to set up dependencies.[/grey53]") + + if not binaries: + # Show code and data locations + prnt() + prnt("[deep_sky_blue3][i] Code locations:[/deep_sky_blue3]") + try: + for name, path in get_code_locations().items(): + if isinstance(name, str) and isinstance(path, dict): + prnt(printable_folder_status(name, path), overflow="ignore", crop=False) + except Exception as e: + prnt(f" [red]Error getting code locations: {e}[/red]") + + prnt() + if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) or os.access(CONSTANTS.CONFIG_FILE, os.R_OK): + prnt("[bright_yellow][i] Data locations:[/bright_yellow]") + try: + for name, path in get_data_locations().items(): + if isinstance(name, str) and isinstance(path, dict): + prnt(printable_folder_status(name, path), overflow="ignore", crop=False) + except Exception as e: + prnt(f" [red]Error getting data locations: {e}[/red]") + + try: + from archivebox.misc.checks import check_data_dir_permissions + + check_data_dir_permissions() + except Exception: + pass + else: + prnt() + prnt("[red][i] Data locations:[/red] (not in a data directory)") + + prnt() + + if failures: + prnt("[red]Error:[/red] [yellow]Failed to detect the following binaries:[/yellow]") + prnt(f" [red]{', '.join(failures)}[/red]") + prnt() + prnt("[violet]Hint:[/violet] To install missing binaries automatically, run:") + prnt(" [green]archivebox install[/green]") + prnt() + return failures + + +@click.command() +@click.option( + "--quiet", + "-q", + is_flag=True, + help="Only print ArchiveBox version number and nothing else. (equivalent to archivebox --version)", +) +@click.option( + "--binaries", + "-b", + help="Select binaries to detect DEFAULT=curl,wget,git,yt-dlp,chrome,single-file,readability-extractor,postlight-parser,... 
(all)", +) +@docstring(version.__doc__) +def main(**kwargs): + failures = version(**kwargs) + if failures: + raise SystemExit(1) + + +if __name__ == "__main__": + main() diff --git a/archivebox/cli/cli_util.py b/archivebox/cli/cli_util.py new file mode 100644 index 0000000000..799624e2ea --- /dev/null +++ b/archivebox/cli/cli_util.py @@ -0,0 +1,44 @@ +""" +Shared CLI utilities for ArchiveBox commands. + +This module contains common utilities used across multiple CLI commands, +extracted to avoid code duplication. +""" + +__package__ = "archivebox.cli" + + +def apply_filters(queryset, filter_kwargs: dict, limit: int | None = None): + """ + Apply Django-style filters from CLI kwargs to a QuerySet. + + Supports: --status=queued, --url__icontains=example, --id__in=uuid1,uuid2 + + Args: + queryset: Django QuerySet to filter + filter_kwargs: Dict of filter key-value pairs from CLI + limit: Optional limit on results + + Returns: + Filtered QuerySet + + Example: + queryset = Snapshot.objects.all() + filter_kwargs = {'status': 'queued', 'url__icontains': 'example.com'} + filtered = apply_filters(queryset, filter_kwargs, limit=10) + """ + filters = {} + for key, value in filter_kwargs.items(): + if value is None or key in ("limit", "offset"): + continue + # Handle CSV lists for __in filters + if key.endswith("__in") and isinstance(value, str): + value = [v.strip() for v in value.split(",")] + filters[key] = value + + if filters: + queryset = queryset.filter(**filters) + if limit: + queryset = queryset[:limit] + + return queryset diff --git a/archivebox/config.py b/archivebox/config.py deleted file mode 100644 index 23a92ebf77..0000000000 --- a/archivebox/config.py +++ /dev/null @@ -1,272 +0,0 @@ -import os -import re -import sys -import shutil - -from subprocess import run, PIPE, DEVNULL - -# ****************************************************************************** -# Documentation: https://github.com/pirate/ArchiveBox/wiki/Configuration -# Use the 'env' command to pass 
config options to ArchiveBox. e.g.: -# env USE_COLOR=True CHROME_BINARY=google-chrome ./archive export.html -# ****************************************************************************** - -IS_TTY = sys.stdout.isatty() -USE_COLOR = os.getenv('USE_COLOR', str(IS_TTY) ).lower() == 'true' -SHOW_PROGRESS = os.getenv('SHOW_PROGRESS', str(IS_TTY) ).lower() == 'true' -ONLY_NEW = os.getenv('ONLY_NEW', 'False' ).lower() == 'true' -MEDIA_TIMEOUT = int(os.getenv('MEDIA_TIMEOUT', '3600')) -TIMEOUT = int(os.getenv('TIMEOUT', '60')) -OUTPUT_PERMISSIONS = os.getenv('OUTPUT_PERMISSIONS', '755' ) -FOOTER_INFO = os.getenv('FOOTER_INFO', 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.',) - -FETCH_WGET = os.getenv('FETCH_WGET', 'True' ).lower() == 'true' -FETCH_WGET_REQUISITES = os.getenv('FETCH_WGET_REQUISITES', 'True' ).lower() == 'true' -FETCH_PDF = os.getenv('FETCH_PDF', 'True' ).lower() == 'true' -FETCH_SCREENSHOT = os.getenv('FETCH_SCREENSHOT', 'True' ).lower() == 'true' -FETCH_DOM = os.getenv('FETCH_DOM', 'True' ).lower() == 'true' -FETCH_WARC = os.getenv('FETCH_WARC', 'True' ).lower() == 'true' -FETCH_GIT = os.getenv('FETCH_GIT', 'True' ).lower() == 'true' -FETCH_MEDIA = os.getenv('FETCH_MEDIA', 'True' ).lower() == 'true' -FETCH_FAVICON = os.getenv('FETCH_FAVICON', 'True' ).lower() == 'true' -FETCH_TITLE = os.getenv('FETCH_TITLE', 'True' ).lower() == 'true' -SUBMIT_ARCHIVE_DOT_ORG = os.getenv('SUBMIT_ARCHIVE_DOT_ORG', 'True' ).lower() == 'true' - -CHECK_SSL_VALIDITY = os.getenv('CHECK_SSL_VALIDITY', 'True' ).lower() == 'true' -RESOLUTION = os.getenv('RESOLUTION', '1440,2000' ) -GIT_DOMAINS = os.getenv('GIT_DOMAINS', 'github.com,bitbucket.org,gitlab.com').split(',') -WGET_USER_AGENT = os.getenv('WGET_USER_AGENT', 'ArchiveBox/{GIT_SHA} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}') -COOKIES_FILE = os.getenv('COOKIES_FILE', None) -CHROME_USER_DATA_DIR = os.getenv('CHROME_USER_DATA_DIR', None) 
-CHROME_HEADLESS = os.getenv('CHROME_HEADLESS', 'True' ).lower() == 'true' -CHROME_USER_AGENT = os.getenv('CHROME_USER_AGENT', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36') - -CURL_BINARY = os.getenv('CURL_BINARY', 'curl') -GIT_BINARY = os.getenv('GIT_BINARY', 'git') -WGET_BINARY = os.getenv('WGET_BINARY', 'wget') -YOUTUBEDL_BINARY = os.getenv('YOUTUBEDL_BINARY', 'youtube-dl') -CHROME_BINARY = os.getenv('CHROME_BINARY', None) - -URL_BLACKLIST = os.getenv('URL_BLACKLIST', None) - -try: - OUTPUT_DIR = os.path.abspath(os.getenv('OUTPUT_DIR')) -except Exception: - OUTPUT_DIR = None - - -# ****************************************************************************** -# **************************** Derived Settings ******************************** -# ****************************************************************************** - -REPO_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')) -if not OUTPUT_DIR: - OUTPUT_DIR = os.path.join(REPO_DIR, 'output') - -ARCHIVE_DIR_NAME = 'archive' -SOURCES_DIR_NAME = 'sources' -ARCHIVE_DIR = os.path.join(OUTPUT_DIR, ARCHIVE_DIR_NAME) -SOURCES_DIR = os.path.join(OUTPUT_DIR, SOURCES_DIR_NAME) - -PYTHON_PATH = os.path.join(REPO_DIR, 'archivebox') -TEMPLATES_DIR = os.path.join(PYTHON_PATH, 'templates') - -CHROME_SANDBOX = os.getenv('CHROME_SANDBOX', 'True').lower() == 'true' -USE_CHROME = FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM -USE_WGET = FETCH_WGET or FETCH_WGET_REQUISITES or FETCH_WARC -WGET_AUTO_COMPRESSION = USE_WGET and WGET_BINARY and (not run([WGET_BINARY, "--compression=auto", "--help"], stdout=DEVNULL, stderr=DEVNULL).returncode) - -URL_BLACKLIST = URL_BLACKLIST and re.compile(URL_BLACKLIST, re.IGNORECASE) - -########################### Environment & Dependencies ######################### - -try: - ### Terminal Configuration - TERM_WIDTH = shutil.get_terminal_size((100, 10)).columns - ANSI = { - 'reset': '\033[00;00m', 
- 'lightblue': '\033[01;30m', - 'lightyellow': '\033[01;33m', - 'lightred': '\033[01;35m', - 'red': '\033[01;31m', - 'green': '\033[01;32m', - 'blue': '\033[01;34m', - 'white': '\033[01;37m', - 'black': '\033[01;30m', - } - if not USE_COLOR: - # dont show colors if USE_COLOR is False - ANSI = {k: '' for k in ANSI.keys()} - - - if not CHROME_BINARY: - # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev - default_executable_paths = ( - 'chromium-browser', - 'chromium', - '/Applications/Chromium.app/Contents/MacOS/Chromium', - 'google-chrome', - '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', - 'google-chrome-stable', - 'google-chrome-beta', - 'google-chrome-canary', - '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary', - 'google-chrome-unstable', - 'google-chrome-dev', - ) - for name in default_executable_paths: - full_path_exists = shutil.which(name) - if full_path_exists: - CHROME_BINARY = name - break - else: - CHROME_BINARY = 'chromium-browser' - # print('[i] Using Chrome binary: {}'.format(shutil.which(CHROME_BINARY) or CHROME_BINARY)) - - if CHROME_USER_DATA_DIR is None: - # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev - default_profile_paths = ( - '~/.config/chromium', - '~/Library/Application Support/Chromium', - '~/AppData/Local/Chromium/User Data', - '~/.config/google-chrome', - '~/Library/Application Support/Google/Chrome', - '~/AppData/Local/Google/Chrome/User Data', - '~/.config/google-chrome-stable', - '~/.config/google-chrome-beta', - '~/Library/Application Support/Google/Chrome Canary', - '~/AppData/Local/Google/Chrome SxS/User Data', - '~/.config/google-chrome-unstable', - '~/.config/google-chrome-dev', - ) - for path in default_profile_paths: - full_path = os.path.expanduser(path) - if os.path.exists(full_path): - CHROME_USER_DATA_DIR = full_path - break - # print('[i] Using Chrome data dir: {}'.format(os.path.abspath(CHROME_USER_DATA_DIR))) - - CHROME_OPTIONS = { - 'TIMEOUT': TIMEOUT, - 
'RESOLUTION': RESOLUTION, - 'CHECK_SSL_VALIDITY': CHECK_SSL_VALIDITY, - 'CHROME_BINARY': CHROME_BINARY, - 'CHROME_HEADLESS': CHROME_HEADLESS, - 'CHROME_SANDBOX': CHROME_SANDBOX, - 'CHROME_USER_AGENT': CHROME_USER_AGENT, - 'CHROME_USER_DATA_DIR': CHROME_USER_DATA_DIR, - } - - - ### Check Python environment - python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor)) - if python_vers < 3.5: - print('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset'])) - print(' See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.') - raise SystemExit(1) - - if sys.stdout.encoding.upper() not in ('UTF-8', 'UTF8'): - print('[X] Your system is running python3 scripts with a bad locale setting: {} (it should be UTF-8).'.format(sys.stdout.encoding)) - print(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)') - print('') - print(' Confirm that it\'s fixed by opening a new shell and running:') - print(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8') - print('') - print(' Alternatively, run this script with:') - print(' env PYTHONIOENCODING=UTF-8 ./archive.py export.html') - - ### Get code version by parsing git log - GIT_SHA = 'unknown' - try: - GIT_SHA = run([GIT_BINARY, 'rev-list', '-1', 'HEAD', './'], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode() - except Exception: - print('[!] Warning: unable to determine git version, is git installed and in your $PATH?') - - ### Get absolute path for cookies file - try: - COOKIES_FILE = os.path.abspath(COOKIES_FILE) if COOKIES_FILE else None - except Exception: - print('[!] 
Warning: unable to get full path to COOKIES_FILE, are you sure you specified it correctly?') - raise - - ### Make sure curl is installed - if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG: - if run(['which', CURL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode: - print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI)) - print(' Install it, then confirm it works with: {} --version'.format(CURL_BINARY)) - print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') - raise SystemExit(1) - - ### Make sure wget is installed and calculate version - if FETCH_WGET or FETCH_WARC: - if run(['which', WGET_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode: - print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI)) - print(' Install it, then confirm it works with: {} --version'.format(WGET_BINARY)) - print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') - raise SystemExit(1) - - WGET_VERSION = 'unknown' - try: - wget_vers_str = run([WGET_BINARY, "--version"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode() - WGET_VERSION = wget_vers_str.split('\n')[0].split(' ')[2] - except Exception: - if USE_WGET: - print('[!] 
Warning: unable to determine wget version, is wget installed and in your $PATH?') - - WGET_USER_AGENT = WGET_USER_AGENT.format(GIT_SHA=GIT_SHA[:9], WGET_VERSION=WGET_VERSION) - - ### Make sure chrome is installed and calculate version - if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM: - if run(['which', CHROME_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode: - print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset'])) - print(' Install it, then confirm it works with: {} --version'.format(CHROME_BINARY)) - print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') - raise SystemExit(1) - - # parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04 - try: - result = run([CHROME_BINARY, '--version'], stdout=PIPE) - version_str = result.stdout.decode('utf-8') - version_lines = re.sub("(Google Chrome|Chromium) (\\d+?)\\.(\\d+?)\\.(\\d+?).*?$", "\\2", version_str).split('\n') - version = [l for l in version_lines if l.isdigit()][-1] - if int(version) < 59: - print(version_lines) - print('{red}[X] Chrome version must be 59 or greater for headless PDF, screenshot, and DOM saving{reset}'.format(**ANSI)) - print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') - raise SystemExit(1) - except (IndexError, TypeError, OSError): - print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI)) - print(' Install it, then confirm it works with: {} --version'.format(CHROME_BINARY)) - print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.') - raise SystemExit(1) - - CHROME_VERSION = 'unknown' - try: - chrome_vers_str = run([CHROME_BINARY, "--version"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode() - CHROME_VERSION = [v for v in chrome_vers_str.strip().split(' ') if v.replace('.', '').isdigit()][0] - except Exception: - if USE_CHROME: - print('[!] 
def __getattr__(name: str):
    """Resolve the package's exported config names on first attribute access.

    Each supported name is imported from its submodule only when requested:
    ``VERSION`` from ``.version``, ``CONSTANTS`` / ``CONSTANTS_CONFIG`` from
    ``.constants``, and ``PACKAGE_DIR`` / ``DATA_DIR`` from ``.paths``.

    Raises:
        AttributeError: for any other name (the message is the name itself).
    """
    if name == "VERSION":
        from .version import VERSION

        return VERSION

    if name == "CONSTANTS" or name == "CONSTANTS_CONFIG":
        from .constants import CONSTANTS, CONSTANTS_CONFIG

        return CONSTANTS if name == "CONSTANTS" else CONSTANTS_CONFIG

    if name == "PACKAGE_DIR" or name == "DATA_DIR":
        from .paths import PACKAGE_DIR, DATA_DIR

        return PACKAGE_DIR if name == "PACKAGE_DIR" else DATA_DIR

    raise AttributeError(name)
"CONSTANTS_CONFIG", "PACKAGE_DIR", "DATA_DIR", "VERSION") diff --git a/archivebox/config/collection.py b/archivebox/config/collection.py new file mode 100644 index 0000000000..549b139a05 --- /dev/null +++ b/archivebox/config/collection.py @@ -0,0 +1,348 @@ +__package__ = "archivebox.config" + +import io +import json +import os +from collections.abc import Mapping +from typing import Any + +from archivebox.config.constants import CONSTANTS +from archivebox.config.configset import CaseConfigParser +from archivebox.misc.logging import AttrDict + + +CONFIG_FILE_HEADER = ( + "# This is the config file for your ArchiveBox collection.\n" + "#\n" + "# You can add options here manually in INI format, or automatically by running:\n" + "# archivebox config --set KEY=VALUE\n" + "#\n" + "# This file is kept in sync 1:1 with Machine.config in the index DB โ€”\n" + "# editing either side propagates to the other. ``archivebox init`` reads\n" + "# this file on startup; the admin Machine.config editor writes both.\n" + "#\n" + "# Full reference:\n" + "# https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration\n" + "\n" +) + + +# Recursion guard for the bidirectional file<->DB mirror. Bidirectional sync +# means a write on one side always triggers an update on the other; without +# this flag, ``Machine.save -> mirror to file -> write_config_file -> mirror +# back to Machine -> Machine.save -> ...`` would loop forever. Module-level +# scalar is fine here โ€” daphne handles each request on its own thread and the +# admin/CLI write paths are inherently serialized through the DB. +_MIRROR_IN_PROGRESS: bool = False +# One-time-per-process startup sync, gated so subsequent ``Machine.current()`` +# calls collapse to a single boolean check. +_INITIAL_SYNC_DONE: bool = False + + +def _coerce_to_str_dict(config: Any) -> dict[str, str]: + """Project an arbitrary config payload to flat ``{UPPER_KEY: str}`` form. 
def _load_file_config_dict() -> tuple[dict[str, str], float | None]:
    """Return ``(flat_dict, mtime)`` for ``ArchiveBox.conf`` (``({}, None)`` if missing).

    ``flat_dict`` merges the options of every INI section into one mapping
    with upper-cased keys; ``mtime`` is the file's modification time as
    reported by ``stat()`` before parsing.
    """
    config_path = CONSTANTS.CONFIG_FILE
    try:
        mtime = config_path.stat().st_mtime
    except FileNotFoundError:
        return {}, None
    parser = CaseConfigParser()
    # Read explicitly as UTF-8: the rest of this module compares the file as
    # UTF-8 text and the generated header contains non-ASCII characters, so
    # relying on the locale's default encoding could mis-decode values.
    parser.read(config_path, encoding="utf-8")
    flat = {key.upper(): value for section in parser.sections() for key, value in parser.items(section)}
    return flat, mtime
that + # this process hasn't loaded a schema for). + return "SERVER_CONFIG" + + +def _render_config_file_content(config: dict[str, str]) -> str: + """Render a flat config dict to INI text, grouped by inferred section.""" + from archivebox.config.common import get_all_configs + from archivebox.plugins.discovery import discover_plugin_configs + + config_sections = get_all_configs() + plugin_configs = discover_plugin_configs() + + parser = CaseConfigParser() + for key, val in sorted(config.items()): + section = _resolve_section_for_key(key, config_sections, plugin_configs) + if section not in parser: + parser[section] = {} + parser[section][key] = "" if val is None else str(val) + + buf = io.StringIO() + buf.write(CONFIG_FILE_HEADER) + parser.write(buf) + return buf.getvalue() + + +def _write_file_if_changed(content: str) -> bool: + """Atomic-write ``ArchiveBox.conf`` only when contents actually differ. + + Skipping unchanged writes is the difference between every Machine.save in + a hot autodetection loop costing one disk write vs. zero. + """ + from archivebox.misc.system import atomic_write + + config_path = CONSTANTS.CONFIG_FILE + try: + existing = config_path.read_text(encoding="utf-8") if config_path.exists() else None + except OSError: + existing = None + if existing == content: + return False + atomic_write(config_path, content) + return True + + +def mirror_machine_config_to_file(config: Any) -> None: + """Rewrite ``ArchiveBox.conf`` so it mirrors ``Machine.config`` exactly. + + Called from ``Machine.save`` after the row is committed. Recursion-guarded + so the matching write_config_file -> Machine.save bounce doesn't loop. 
def _coerce_from_str_dict(file_config: dict[str, str]) -> dict[str, Any]:
    """Inverse of ``_coerce_to_str_dict``: turn INI strings back into native values.

    Composite values are stored in the INI file as JSON text. Before the
    file contents are placed in ``Machine.config`` those strings are decoded
    again, so consumers that expect a ``dict`` / ``list`` do not receive a
    JSON string instead.

    Two passes are made over a copy of ``file_config``:

    * Fields declared on ``ArchiveBoxConfig`` whose value is a non-empty
      string are decoded with the settings source's own
      ``field_is_complex`` / ``prepare_field_value``, i.e. per annotation.
    * Undeclared keys whose non-empty string value starts with ``{`` or
      ``[`` are passed through ``json.loads``; values that fail to parse are
      kept as the original string.

    Returns a new dict; ``file_config`` itself is not modified.
    """
    from archivebox.config.common import ArchiveBoxConfig
    from archivebox.config.configset import IniConfigSettingsSource

    settings_source = IniConfigSettingsSource(ArchiveBoxConfig)
    schema_fields = ArchiveBoxConfig.model_fields
    declared_names = set(schema_fields)
    native: dict[str, Any] = dict(file_config)

    # Pass 1: declared fields, decoded according to their annotation.
    for name, field_info in schema_fields.items():
        text = native.get(name)
        if isinstance(text, str) and text and settings_source.field_is_complex(field_info):
            native[name] = settings_source.prepare_field_value(name, field_info, text, True)

    # Pass 2: undeclared keys that look like JSON objects / arrays.
    for key, text in list(native.items()):
        looks_like_json = (
            key not in declared_names
            and isinstance(text, str)
            and text.startswith(("{", "["))
        )
        if not looks_like_json:
            continue
        try:
            native[key] = json.loads(text)
        except (TypeError, ValueError):
            # Not valid JSON after all: keep the raw string untouched.
            pass
    return native
def write_config_file(config: dict[str, str]) -> AttrDict:
    """Merge ``config`` into ``ArchiveBox.conf``, validate, then mirror to Machine.config.

    Backwards-compatible signature: callers (CLI ``archivebox config --set``
    and the init flow) pass a partial dict of keys to upsert.

    The previous file contents are kept in ``ArchiveBox.conf.bak`` while the
    new contents are validated by calling ``get_config()``. If that call
    raises, the backup is restored over the config file and the exception is
    re-raised.

    Args:
        config: Keys to upsert. Each key is filed under the INI section
            chosen by ``_resolve_section_for_key``.

    Returns:
        An ``AttrDict`` mapping each upper-cased input key to the value
        reported for it by ``get_config().as_dict()`` after the write
        (``None`` when the key is absent there).

    Raises:
        BaseException: whatever ``get_config()`` raised while re-parsing the
            updated file, after the previous contents have been restored.
    """
    import logging

    from archivebox.config.common import get_all_configs
    from archivebox.plugins.discovery import discover_plugin_configs
    from archivebox.misc.system import atomic_write

    global _MIRROR_IN_PROGRESS

    config_path = CONSTANTS.CONFIG_FILE

    if not os.access(config_path, os.F_OK):
        atomic_write(config_path, CONFIG_FILE_HEADER)

    config_file = CaseConfigParser()
    # The file is always written as UTF-8 below, so parse it as UTF-8 too
    # instead of falling back to the locale's default encoding.
    config_file.read(config_path, encoding="utf-8")

    with open(config_path, encoding="utf-8") as old:
        atomic_write(f"{config_path}.bak", old.read())

    config_sections = get_all_configs()
    plugin_configs = discover_plugin_configs()

    # Set up sections in empty config file
    for key, val in config.items():
        section_name = _resolve_section_for_key(key, config_sections, plugin_configs)
        if section_name in config_file:
            existing_config = dict(config_file[section_name])
        else:
            existing_config = {}

        config_file[section_name] = AttrDict({**existing_config, key: val})

    with open(config_path, "w+", encoding="utf-8") as new:
        config_file.write(new)

    updated_config = {}
    try:
        # validate the updated_config by attempting to re-parse it
        from archivebox.config.common import get_config

        updated_config = get_config().as_dict()
    except BaseException:  # lgtm [py/catch-base-exception]
        # something went horribly wrong, revert to the previous version
        with open(f"{config_path}.bak", encoding="utf-8") as old:
            atomic_write(config_path, old.read())

        raise

    if os.access(f"{config_path}.bak", os.F_OK):
        os.remove(f"{config_path}.bak")

    # Mirror the post-write file state into Machine.config so the DB stays
    # 1:1 with the on-disk file. Recursion-guarded so Machine.save's own
    # mirror-back doesn't loop us.
    if not _MIRROR_IN_PROGRESS:
        _MIRROR_IN_PROGRESS = True
        try:
            flat, _mtime = _load_file_config_dict()
            _mirror_file_to_machine_config(flat)
        except Exception:
            # Still best-effort: the file write already succeeded, so a failed
            # DB mirror must not fail the caller. Record it (at debug level, to
            # stay quiet by default) instead of discarding it, so a file/DB
            # divergence can be diagnosed.
            logging.getLogger(__name__).debug(
                "Could not mirror ArchiveBox.conf into Machine.config",
                exc_info=True,
            )
        finally:
            _MIRROR_IN_PROGRESS = False

    return AttrDict({key.upper(): updated_config.get(key.upper()) for key in config.keys()})
def resolve_delete_after_config_value(*configs: Mapping[str, Any] | None) -> str:
    """Return the first truthy ``DELETE_AFTER`` found in ``configs``, as a string.

    The configs are consulted in the order given; ``None`` entries are
    skipped, and so are entries whose ``DELETE_AFTER`` is missing or falsy.
    When no config supplies a value the result is ``"0"``.
    """
    candidates = (layer.get("DELETE_AFTER") for layer in configs if layer is not None)
    # Lazy scan: configs after the first truthy value are never queried.
    chosen = next((candidate for candidate in candidates if candidate), None)
    if chosen is None:
        return "0"
    return str(chosen)
def redact_sensitive_config(config: Mapping[str, Any] | None) -> dict[str, Any]:
    """Return a copy of ``config`` with credential values replaced by ``********``.

    Intended for config dicts that cross an API / export / debug-dump
    boundary. A value is replaced when its key is reported as sensitive by
    ``is_sensitive_config_key`` and the value is neither ``None`` nor the
    empty string; empty values pass through unchanged so callers can still
    tell "unset" from "set-but-hidden".

    ``None`` and any non-``Mapping`` input yield an empty dict.
    """
    # ``None`` is not a Mapping, so one check covers both early-exit cases.
    if not isinstance(config, Mapping):
        return {}
    return {
        key: (
            SENSITIVE_CONFIG_VALUE_REDACTED
            if is_sensitive_config_key(str(key)) and value not in (None, "")
            else value
        )
        for key, value in config.items()
    }
not None + and (not only_crawl_execution or ArchiveBoxConfig.scope_for_key(str(key)) == _SCOPE_CRAWL_EXECUTION) + and (not exclude_runtime_derived or str(key) not in runtime_derived_keys) + and (not exclude_crawl_execution or ArchiveBoxConfig.scope_for_key(str(key)) != _SCOPE_CRAWL_EXECUTION) + ) + } + if not json_safe: + return filtered + return {key: value for key, value in json.loads(json.dumps(filtered, default=str)).items() if value is not None} + + +def build_crawl_config_snapshot( + *, + persona: Any = None, + overrides: Mapping[str, Any] | None = None, + base_config: ArchiveBoxBaseConfig | Mapping[str, object] | None = None, +) -> dict[str, Any]: + """Build the frozen crawl config stored on Crawl.config at creation time.""" + explicit_overrides = set(overrides or {}) + plugin_owned_keys = set(_plugin_config_properties(PLUGIN_CONFIG_SCHEMAS)) - set(ArchiveBoxBaseConfig.model_fields) + effective = get_config(persona=persona, base_config=base_config) + frozen = effective.for_crawl_frozen(persona=persona) + for key in ("BIND_ADDR", "BASE_URL", "CSRF_TRUSTED_ORIGINS", "SERVER_SECURITY_MODE"): + value = getattr(effective, key, None) + if value is not None: + frozen[key] = value + if persona is not None: + persona_config = persona.get_derived_config() + for key in plugin_owned_keys - explicit_overrides: + if key in persona_config: + frozen.pop(key, None) + if overrides: + resolved = get_config(base_config=frozen, overrides=overrides, include_machine=False) + resolved_payload = normalize_runtime_config(resolved) + frozen = resolved.for_crawl_frozen(persona=persona) + for key in ("BIND_ADDR", "BASE_URL", "CSRF_TRUSTED_ORIGINS", "SERVER_SECURITY_MODE"): + value = getattr(resolved, key, None) + if value is not None: + frozen[key] = value + for key in plugin_owned_keys & explicit_overrides: + if ArchiveBoxConfig.scope_for_key(key) == _SCOPE_CRAWL_FROZEN and key in resolved_payload: + frozen[key] = resolved_payload[key] + if persona is not None: + persona_config = 
def rprint(*args, file=None, **kwargs):
    """Print through the module's shared rich consoles.

    Output goes to the stderr console only when ``file`` is ``sys.stderr``
    itself; every other ``file`` value (including ``None``) uses the stdout
    console. All other arguments are forwarded to ``Console.print``.
    """
    if file is sys.stderr:
        target = _STDERR_CONSOLE
    else:
        target = _STDOUT_CONSOLE
    target.print(*args, **kwargs)
Field(default=CONSTANTS.DEFAULT_LIB_DIR, json_schema_extra={"scope": _SCOPE_CRAWL_EXECUTION}) + + OUTPUT_PERMISSIONS: str = Field(default="644") + ENFORCE_ATOMIC_WRITES: bool = Field(default=True) + ALLOW_NO_UNIX_SOCKETS: bool = Field(default=False, alias="ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS") + + +class GeneralConfig(BaseConfigSet): + toml_section_header: str = "GENERAL_CONFIG" + _scope: str = PrivateAttr(default=_SCOPE_SERVER) + + TAG_SEPARATOR_PATTERN: str = Field(default=r"[,]") + + +class ServerConfig(BaseConfigSet): + toml_section_header: str = "SERVER_CONFIG" + _scope: str = PrivateAttr(default=_SCOPE_SERVER) + + SERVER_SECURITY_MODES: ClassVar[tuple[str, ...]] = ( + "safe-subdomains-fullreplay", + "safe-onedomain-nojsreplay", + "unsafe-onedomain-noadmin", + "danger-onedomain-fullreplay", + ) + + SECRET_KEY: str = Field(default_factory=lambda: "".join(secrets.choice("abcdefghijklmnopqrstuvwxyz0123456789_") for _ in range(50))) + BIND_ADDR: str = Field(default="127.0.0.1:8000") + BASE_URL: str = Field(default="") + ALLOWED_HOSTS: str = Field(default="*") + CSRF_TRUSTED_ORIGINS: str = Field(default="") + SERVER_SECURITY_MODE: str = Field(default="safe-subdomains-fullreplay") + + SNAPSHOTS_PER_PAGE: int = Field(default=50, ge=1) + FOOTER_INFO: str = Field( + default="Content is hosted for personal archiving purposes only. 
Contact server owner for any takedown requests.", + ) + PUBLIC_INDEX: bool = Field(default=True) + PUBLIC_ADD_VIEW: bool = Field(default=False) + + ADMIN_USERNAME: str | None = Field(default=None) + ADMIN_PASSWORD: str | None = Field(default=None) + + REVERSE_PROXY_USER_HEADER: str = Field(default="Remote-User") + REVERSE_PROXY_WHITELIST: str = Field(default="") + LOGOUT_REDIRECT_URL: str = Field(default="/") + + @field_validator("SERVER_SECURITY_MODE", mode="after") + def validate_server_security_mode(cls, v: str) -> str: + mode = (v or "").strip().lower() + if mode not in cls.SERVER_SECURITY_MODES: + raise ValueError(f"SERVER_SECURITY_MODE must be one of: {', '.join(cls.SERVER_SECURITY_MODES)}") + return mode + + @property + def USES_SUBDOMAIN_ROUTING(self) -> bool: + return self.SERVER_SECURITY_MODE == "safe-subdomains-fullreplay" + + @property + def ENABLES_FULL_JS_REPLAY(self) -> bool: + return self.SERVER_SECURITY_MODE in ( + "safe-subdomains-fullreplay", + "unsafe-onedomain-noadmin", + "danger-onedomain-fullreplay", + ) + + @property + def CONTROL_PLANE_ENABLED(self) -> bool: + return self.SERVER_SECURITY_MODE != "unsafe-onedomain-noadmin" + + @property + def BLOCK_UNSAFE_METHODS(self) -> bool: + return self.SERVER_SECURITY_MODE == "unsafe-onedomain-noadmin" + + @property + def SHOULD_NEUTER_RISKY_REPLAY(self) -> bool: + return self.SERVER_SECURITY_MODE == "safe-onedomain-nojsreplay" + + @property + def IS_UNSAFE_MODE(self) -> bool: + return self.SERVER_SECURITY_MODE == "unsafe-onedomain-noadmin" + + @property + def IS_DANGEROUS_MODE(self) -> bool: + return self.SERVER_SECURITY_MODE == "danger-onedomain-fullreplay" + + @property + def IS_LOWER_SECURITY_MODE(self) -> bool: + return self.SERVER_SECURITY_MODE in ( + "unsafe-onedomain-noadmin", + "danger-onedomain-fullreplay", + ) + + +class DatabaseConfig(BaseConfigSet): + toml_section_header: str = "DATABASE_CONFIG" + _scope: str = PrivateAttr(default=_SCOPE_SERVER) + + DATABASE_NAME: str = 
Field(default=str(CONSTANTS.DATABASE_FILE), alias="ARCHIVEBOX_DATABASE_NAME") + SQLITE_JOURNAL_MODE: str = Field( + default="WAL", + alias="ARCHIVEBOX_SQLITE_JOURNAL_MODE", + pattern=r"(?i)^(DELETE|TRUNCATE|PERSIST|MEMORY|WAL|OFF)$", + ) + SQLITE_MMAP_SIZE: int = Field( + default=0 if CONSTANTS.IN_DOCKER else 134217728, + alias="ARCHIVEBOX_SQLITE_MMAP_SIZE", + ge=0, + ) + SQLITE_BUSY_TIMEOUT: int = Field(default=30000, alias="ARCHIVEBOX_SQLITE_BUSY_TIMEOUT", ge=0) + SQLITE_LOCK_RETRY_TIMEOUT: float = Field(default=60.0, alias="ARCHIVEBOX_SQLITE_LOCK_RETRY_TIMEOUT", ge=0) + SQLITE_LOCK_RETRY_INTERVAL: float = Field(default=5.0, alias="ARCHIVEBOX_SQLITE_LOCK_RETRY_INTERVAL", gt=0) + + +class ArchivingConfig(BaseConfigSet): + toml_section_header: str = "ARCHIVING_CONFIG" + _scope: str = PrivateAttr(default=_SCOPE_CRAWL_FROZEN) + + PLUGINS: str = Field( + default="", + description="Comma-separated plugin selection for this run. Empty means use enabled plugin defaults.", + ) + + ONLY_NEW: bool = Field(default=True) + INDEX_ONLY: bool = Field(default=False) + + TIMEOUT: int = Field(default=60) + CRAWL_MAX_URLS: int = Field(default=0) + CRAWL_MAX_SIZE: int = Field(default=0) + CRAWL_TIMEOUT: int = Field(default=0, description="Maximum total crawl runtime in seconds (0 = unlimited).") + CRAWL_MAX_CONCURRENT_SNAPSHOTS: int = Field( + default=4, + description="Maximum number of snapshots to archive concurrently within one crawl.", + ) + SNAPSHOT_MAX_SIZE: int = Field(default=0) + + RESOLUTION: str = Field(default="1440,2000") + CHECK_SSL_VALIDITY: bool = Field(default=True) + USER_AGENT: str = Field( + default=f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)", + ) + COOKIES_FILE: Path | None = Field(default=None) + + URL_DENYLIST: str = Field(default=r"\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$", 
alias="URL_BLACKLIST") + URL_ALLOWLIST: str | None = Field(default=None, alias="URL_WHITELIST") + + DEFAULT_PERSONA: str = Field(default="Default", json_schema_extra={"scope": _SCOPE_CRAWL_EXECUTION}) + PERMISSIONS: str = Field( + default="public", + description="Snapshot visibility: public lists and serves content, unlisted serves direct links only, private requires admin login.", + ) + DELETE_AFTER: str = Field( + default="0", + description=( + "Automatically delete Crawl, Snapshot, ArchiveResult, and Process rows after this duration. " + "Use 0, '', or None to disable. Allowed units: h/hr/hour, d/day, w/week, mo/month, y/year; " + "minimum non-zero duration is 1h." + ), + ) + + def warn_if_invalid(self) -> None: + if int(self.TIMEOUT) < 5: + rprint(f"[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.TIMEOUT} seconds)[/red]", file=sys.stderr) + rprint(" You must allow *at least* 5 seconds for indexing and archive methods to run successfully.", file=sys.stderr) + rprint(" (Setting it to somewhere between 30 and 3000 seconds is recommended)", file=sys.stderr) + rprint(file=sys.stderr) + rprint(" If you want to make ArchiveBox run faster, disable specific archive methods instead:", file=sys.stderr) + rprint(" https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles", file=sys.stderr) + rprint(file=sys.stderr) + + @field_validator("CHECK_SSL_VALIDITY", mode="after") + def validate_check_ssl_validity(cls, v): + """SIDE EFFECT: disable "you really shouldnt disable ssl" warnings emitted by requests""" + if not v: + import urllib3 + + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + return v + + @field_validator("DELETE_AFTER", mode="before") + @classmethod + def validate_delete_after(cls, value): + parse_delete_after(value) + if value is None: + return "0" + return str(value).strip() or "0" + + @field_validator("PERMISSIONS", mode="before") + @classmethod + def validate_permissions(cls, value): + 
def parse_delete_after(value) -> timedelta | None:
    """Parse a ``DELETE_AFTER`` setting into a ``timedelta``.

    Args:
        value: ``None``, or anything whose ``str()`` is a "disabled" marker
            (``""``, ``0``, ``none``, ``false``, ``no``, ``off``) or an
            integer amount followed by a unit, e.g. ``1h``, ``7d``,
            ``4 weeks``, ``6mo``, ``1y``. Matching is case-insensitive and
            ignores surrounding whitespace.

    Returns:
        ``None`` when deletion is disabled (including a zero amount such as
        ``0d``), otherwise the duration. A month counts as 30 days and a year
        as 365 days.

    Raises:
        ValueError: if the text is not a recognised duration, is shorter than
            one hour, or is too large to be represented as a ``timedelta``.
    """
    if value is None:
        return None

    raw = str(value).strip().lower()
    if raw in ("", "0", "none", "false", "no", "off"):
        return None

    match = re.fullmatch(r"(\d+)\s*(h|hr|hrs|hour|hours|d|day|days|w|week|weeks|mo|month|months|y|yr|yrs|year|years)", raw)
    if not match:
        raise ValueError("DELETE_AFTER must be 0 or a duration like 1h, 7d, 4w, 6mo, or 1y.")

    amount = int(match.group(1))
    unit = match.group(2)
    if amount <= 0:
        return None

    try:
        if unit in ("h", "hr", "hrs", "hour", "hours"):
            duration = timedelta(hours=amount)
        elif unit in ("d", "day", "days"):
            duration = timedelta(days=amount)
        elif unit in ("w", "week", "weeks"):
            duration = timedelta(weeks=amount)
        elif unit in ("mo", "month", "months"):
            duration = timedelta(days=30 * amount)
        else:
            duration = timedelta(days=365 * amount)
    except OverflowError as err:
        # timedelta raises OverflowError for out-of-range magnitudes. Report
        # it as the same exception type as every other invalid input so
        # callers only need to handle ValueError.
        raise ValueError("DELETE_AFTER duration is too large.") from err

    # Defensive floor: every unit accepted above is at least one hour, so this
    # only triggers if a sub-hour unit is ever added to the pattern.
    if duration < timedelta(hours=1):
        raise ValueError("DELETE_AFTER must be 0 or at least 1h.")
    return duration
+ + +def _plugin_user_config(config: Mapping[str, object]) -> dict[str, str]: + return {key: _plugin_user_config_value(value) for key, value in config.items()} + + +def _discover_plugin_config_schemas() -> PluginSchemaDocuments: + from archivebox.plugins.discovery import discover_plugin_configs + + schemas: PluginSchemaDocuments = {} + if BASE_CONFIG_PATH.exists(): + schemas["base"] = json.loads(BASE_CONFIG_PATH.read_text()) + schemas.update(discover_plugin_configs()) + return schemas + + +def _plugin_config_properties(plugin_schemas: PluginSchemaDocuments) -> dict[str, dict[str, Any]]: + properties: dict[str, dict[str, Any]] = {} + for schema in plugin_schemas.values(): + schema_properties = schema.get("properties") or {} + if isinstance(schema_properties, dict): + properties.update(schema_properties) + return properties + + +def _plugin_config_model(plugin_schemas: PluginSchemaDocuments) -> type[BaseModel]: + return build_config_model("ArchiveBoxPluginConfig", _plugin_config_properties(plugin_schemas)) + + +@lru_cache(maxsize=1) +def _archivebox_config_input_names() -> set[str]: + names = set(ArchiveBoxConfig.model_fields) + for field in ArchiveBoxConfig.model_fields.values(): + if isinstance(field.alias, str): + names.add(field.alias) + return names + + +class ArchiveBoxBaseConfig( + ShellConfig, + StorageConfig, + GeneralConfig, + ServerConfig, + DatabaseConfig, + ArchivingConfig, + SearchBackendConfig, + LDAPConfig, +): + """Merged, typed ArchiveBox config. + + Core ArchiveBox fields are declared above. Plugin-owned fields are added to + the concrete ArchiveBoxConfig model from plugin JSONSchema below, so + ArchiveBox does not hardcode any individual plugin config names. 
+ """ + + model_config = SettingsConfigDict( + env_prefix="", + extra="ignore", + validate_default=True, + use_enum_values=True, + arbitrary_types_allowed=True, + populate_by_name=True, + ) + + computed_config_keys: ClassVar[tuple[str, ...]] = COMPUTED_CONFIG_KEYS + + @classmethod + def _core_config_classes(cls) -> tuple[type[BaseConfigSet], ...]: + return ( + ShellConfig, + StorageConfig, + GeneralConfig, + ServerConfig, + DatabaseConfig, + ArchivingConfig, + SearchBackendConfig, + LDAPConfig, + ) + + @classmethod + def _core_field_scope(cls, key: str) -> str | None: + if key == "toml_section_header": + return _SCOPE_SERVER + for config_cls in cls._core_config_classes(): + field = config_cls.model_fields.get(key) + if field is None: + continue + default_scope = str(config_cls.__private_attributes__["_scope"].default) + extra = field.json_schema_extra + if isinstance(extra, dict) and "scope" in extra: + return str(extra["scope"]) + return default_scope + if key in ArchiveBoxBaseConfig.model_fields: + field = ArchiveBoxBaseConfig.model_fields[key] + extra = field.json_schema_extra + if isinstance(extra, dict) and "scope" in extra: + return str(extra["scope"]) + return _SCOPE_SERVER + return None + + @classmethod + def _plugin_field_scope(cls, key: str) -> str | None: + scope = None + for plugin_name, schema in PLUGIN_CONFIG_SCHEMAS.items(): + properties = schema.get("properties") if isinstance(schema, dict) else None + if not isinstance(properties, dict) or key not in properties: + continue + prop_schema = properties.get(key) or {} + if isinstance(prop_schema, Mapping) and prop_schema.get("x-scope"): + scope = str(prop_schema["x-scope"]) + elif scope is None: + scope = _SCOPE_CRAWL_FROZEN + return scope + + @classmethod + @lru_cache(maxsize=None) + def scope_for_key(cls, key: str) -> str: + for plugin_name, schema in PLUGIN_CONFIG_SCHEMAS.items(): + properties = schema.get("properties") if isinstance(schema, dict) else None + if isinstance(properties, dict) and key 
== f"{str(plugin_name).upper()}_ENABLED" and key in properties: + return _SCOPE_CRAWL_EXECUTION + if key.endswith("_BINARY"): + return _SCOPE_CRAWL_EXECUTION + return cls._core_field_scope(key) or cls._plugin_field_scope(key) or _SCOPE_SERVER + + @classmethod + @lru_cache(maxsize=1) + def _scope_by_key(cls) -> dict[str, str]: + return {key: cls.scope_for_key(key) for key in cls.model_fields} + + @classmethod + @lru_cache(maxsize=1) + def _crawl_frozen_keys(cls) -> frozenset[str]: + return frozenset(key for key, scope in cls._scope_by_key().items() if scope == _SCOPE_CRAWL_FROZEN) + + @classmethod + @lru_cache(maxsize=1) + def _crawl_runtime_keys(cls) -> frozenset[str]: + return frozenset(key for key, scope in cls._scope_by_key().items() if scope in {_SCOPE_CRAWL_FROZEN, _SCOPE_CRAWL_EXECUTION}) + + @classmethod + @lru_cache(maxsize=1) + def runtime_derived_config_keys(cls) -> frozenset[str]: + runtime_derived_keys = { + "ABX_INSTALL_CACHE", + "ACTIVE_PERSONA", + "CHROME_DOWNLOADS_DIR", + "CHROME_USER_DATA_DIR", + "CRAWL_DIR", + "DEFAULT_PERSONA", + "EXTRA_CONTEXT", + "SNAP_DIR", + } + return frozenset( + key for key, scope in cls._scope_by_key().items() if scope == _SCOPE_CRAWL_EXECUTION and key in runtime_derived_keys + ) + + def _scoped_config(self, *, include_execution: bool) -> dict[str, Any]: + keys = type(self)._crawl_runtime_keys() if include_execution else type(self)._crawl_frozen_keys() + payload = self.model_dump(mode="json") + return {key: payload[key] for key in keys if payload.get(key) is not None} + + def for_crawl(self) -> dict[str, Any]: + """Config scoped to crawl execution, without runtime object overlays.""" + config = self._scoped_config(include_execution=True) + for key in type(self).runtime_derived_config_keys(): + config.pop(key, None) + return config + + def for_crawl_frozen(self, *, persona: Any = None) -> dict[str, Any]: + """Config safe to persist permanently on Crawl.config.""" + frozen = self._scoped_config(include_execution=False) + if 
persona is not None: + persona_config = dict(persona.config or {}) + scope_by_key = type(self)._scope_by_key() + for key in persona.get_derived_config(): + if key not in persona_config and scope_by_key.get(key) == _SCOPE_CRAWL_EXECUTION: + frozen.pop(key, None) + return frozen + + def for_crawl_runtime( + self, + *, + crawl: Any = None, + snapshot: Any = None, + persona: Any = None, + runtime_overrides: Mapping[str, Any] | None = None, + extra_context: Mapping[str, Any] | None = None, + crawl_output_dir: Any = None, + snapshot_output_dir: Any = None, + ) -> dict[str, Any]: + """Config payload safe to pass to crawl/snapshot hook execution.""" + config = self.for_crawl() + config["DATA_DIR"] = str(CONSTANTS.DATA_DIR) + scope_by_key = type(self)._scope_by_key() + model_fields = type(self).model_fields + # ArchiveBox owns SEARCH_BACKEND_ENGINE and uses it during model + # validation to derive the selected backend's *_ENABLED flag. Hooks + # only receive the backend-local flags, never the selector itself. 
+ config.pop("SEARCH_BACKEND_ENGINE", None) + if persona is not None: + for key, value in persona.get_derived_config().items(): + if scope_by_key.get(key) == _SCOPE_CRAWL_EXECUTION: + config[key] = value + + if crawl is not None: + for key, value in dict(crawl.config or {}).items(): + if key in model_fields and scope_by_key.get(key) != _SCOPE_CRAWL_EXECUTION: + config[key] = value + config["CRAWL_DIR"] = str(crawl_output_dir if crawl_output_dir is not None else crawl.output_dir) + + if snapshot is not None: + for key, value in dict(snapshot.config or {}).items(): + if key in model_fields and scope_by_key.get(key) != _SCOPE_CRAWL_EXECUTION: + config[key] = value + config["SNAP_DIR"] = str(snapshot_output_dir if snapshot_output_dir is not None else snapshot.output_dir) + + if runtime_overrides: + config.update(normalize_runtime_config(runtime_overrides, json_safe=False)) + + if extra_context: + context: dict[str, Any] = {} + if config.get("EXTRA_CONTEXT"): + parsed_extra_context = json.loads(str(config["EXTRA_CONTEXT"])) + if not isinstance(parsed_extra_context, dict): + raise TypeError("EXTRA_CONTEXT must decode to an object") + context = parsed_extra_context + context.update(dict(extra_context)) + config["EXTRA_CONTEXT"] = json.dumps(context, separators=(",", ":"), sort_keys=True) + + return config + + @model_validator(mode="after") + def resolve_runtime_paths(self): + lib_dir = self.LIB_DIR.expanduser() + if not lib_dir.is_absolute(): + lib_dir = CONSTANTS.DATA_DIR / lib_dir + self.LIB_DIR = lib_dir.resolve() + + return self + + @model_validator(mode="after") + def derive_plugin_enabled_config(self): + plugin_names = _normalize_plugins_config_value(self.PLUGINS) + selected_plugins = _plugins_with_required_plugins(plugin_names) if plugin_names else set() + search_backend = self.SEARCH_BACKEND_ENGINE.strip().lower() + if search_backend: + selected_plugins.add(f"search_backend_{search_backend}") + for plugin_name, enabled_key in _plugin_enabled_config_keys().items(): 
+ if plugin_names or plugin_name in selected_plugins: + setattr(self, enabled_key, plugin_name in selected_plugins) + return self + + +def _build_archivebox_config_model(plugin_schemas: PluginSchemaDocuments) -> type[ArchiveBoxBaseConfig]: + core_fields = set(ArchiveBoxBaseConfig.model_fields) + plugin_fields: dict[str, Any] = { + key: (field.annotation, field) for key, field in _plugin_config_model(plugin_schemas).model_fields.items() if key not in core_fields + } + return cast( + type[ArchiveBoxBaseConfig], + create_model( + "ArchiveBoxConfig", + __base__=ArchiveBoxBaseConfig, + __module__=__name__, + **plugin_fields, + ), + ) + + +PLUGIN_CONFIG_SCHEMAS = _discover_plugin_config_schemas() +ArchiveBoxConfig = _build_archivebox_config_model(PLUGIN_CONFIG_SCHEMAS) + + +def _normalize_plugins_config_value(value: Any) -> set[str]: + if value is None: + return set() + if isinstance(value, str): + raw = value.strip() + if not raw: + return set() + if raw.startswith("["): + try: + parsed = json.loads(raw) + except json.JSONDecodeError: + parsed = None + if isinstance(parsed, list): + return {str(plugin).strip().lower() for plugin in parsed if str(plugin).strip()} + return {plugin.strip().lower() for plugin in raw.split(",") if plugin.strip()} + if isinstance(value, (list, tuple, set)): + return {str(plugin).strip().lower() for plugin in value if str(plugin).strip()} + normalized = str(value).strip().lower() + return {normalized} if normalized else set() + + +@lru_cache(maxsize=1) +def _plugin_enabled_config_keys() -> dict[str, str]: + enabled_keys: dict[str, str] = {} + for plugin_name, schema in PLUGIN_CONFIG_SCHEMAS.items(): + properties = schema.get("properties") if isinstance(schema, dict) else None + if not isinstance(properties, dict): + continue + enabled_key = f"{str(plugin_name).upper()}_ENABLED" + if enabled_key in properties and ArchiveBoxConfig.scope_for_key(enabled_key) == _SCOPE_CRAWL_EXECUTION: + enabled_keys[str(plugin_name).lower()] = enabled_key + 
return enabled_keys + + +def _plugins_with_required_plugins(plugin_names: set[str]) -> set[str]: + selected = set(plugin_names) + pending = list(selected) + while pending: + plugin_name = pending.pop() + schema = PLUGIN_CONFIG_SCHEMAS.get(plugin_name, {}) + required_plugins = schema.get("required_plugins") if isinstance(schema, dict) else None + if not isinstance(required_plugins, list): + continue + for required_plugin in required_plugins: + required_plugin_name = str(required_plugin).strip().lower() + if required_plugin_name and required_plugin_name not in selected: + selected.add(required_plugin_name) + pending.append(required_plugin_name) + return selected + + +def get_live_config_url(key: str) -> str: + return f"{LIVE_CONFIG_BASE_URL}{quote(key)}/" + + +@lru_cache(maxsize=1) +def config_field_metadata() -> dict[str, dict[str, Any]]: + """Return one centralized metadata map for core and plugin config fields.""" + metadata: dict[str, dict[str, Any]] = {} + for key, field in ArchiveBoxConfig.model_fields.items(): + if ArchiveBoxConfig.scope_for_key(key) == _SCOPE_CRAWL_EXECUTION or key in ArchiveBoxConfig.computed_config_keys: + continue + default = field.default + try: + json.dumps(default) + except TypeError: + default = str(default) + metadata[key] = { + "plugin": "archivebox", + "section": find_config_section(key), + "type": config_field_type(key), + "default": default, + "description": field.description or "", + "scope": ArchiveBoxConfig.scope_for_key(key), + "sensitive": is_sensitive_config_key(key), + } + for plugin_name, schema in PLUGIN_CONFIG_SCHEMAS.items(): + properties = schema.get("properties") if isinstance(schema, dict) else None + if not isinstance(properties, dict): + continue + for key, prop in properties.items(): + if not isinstance(prop, Mapping): + continue + if ArchiveBoxConfig.scope_for_key(key) == _SCOPE_CRAWL_EXECUTION: + continue + metadata[key] = { + **metadata.get(key, {}), + "plugin": plugin_name, + "section": "PLUGINS", + "type": 
prop.get("type", metadata.get(key, {}).get("type", "string")), + "default": prop.get("default", metadata.get(key, {}).get("default", "")), + "description": prop.get("description", metadata.get(key, {}).get("description", "")), + "scope": ArchiveBoxConfig.scope_for_key(key), + "sensitive": bool(prop.get("x-sensitive")) or is_sensitive_config_key(key), + "schema": dict(prop), + } + return metadata + + +def find_config_section(key: str) -> str: + from archivebox.config import CONSTANTS_CONFIG + + if key in CONSTANTS_CONFIG: + return "CONSTANT" + for section_id, section in get_all_configs().items(): + if key in type(section).model_fields: + return section_id + if key in _plugin_config_properties(PLUGIN_CONFIG_SCHEMAS): + return "PLUGINS" + return "DYNAMIC" + + +def find_config_default(key: str) -> str: + from archivebox.config import CONSTANTS_CONFIG + + if key in CONSTANTS_CONFIG: + return str(CONSTANTS_CONFIG[key]) + + field = ArchiveBoxConfig.model_fields.get(key) + if field is None: + return "" + default_val = field.default + if callable(default_val): + default_val = inspect.getsource(default_val).split("lambda", 1)[-1].split(":", 1)[-1].replace("\n", " ").strip() + if default_val.count(")") > default_val.count("("): + default_val = default_val[:-1] + else: + default_val = str(default_val) + return default_val + + +def config_field_type(key: str) -> str: + field = ArchiveBoxConfig.model_fields.get(key) + if field is None: + return "str" + annotation = field.annotation + try: + return annotation.__name__ + except AttributeError: + return str(annotation) + + +def find_config_type(key: str) -> str: + return config_field_type(key) + + +def find_config_source(key: str, merged_config: Mapping[str, Any]) -> str: + """Determine where a config value comes from.""" + from archivebox.machine.models import Machine + + machine = Machine.current() + if machine.config and key in machine.config: + return "Machine" + + if key in os.environ: + return "Environment" + + file_config = 
BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE) + if key in file_config: + return "File" + + if key in _plugin_config_properties(PLUGIN_CONFIG_SCHEMAS): + return "Plugin Default" + + return "Default" + + +def get_request_config(request: Any, *, resolve_plugins: bool = False) -> ArchiveBoxBaseConfig: + """Return the per-request ArchiveBox config, upgrading to plugin resolution if needed.""" + request_state = request.__dict__ + request_config = request_state.get("archivebox_config") + request_config_resolves_plugins = bool(request_state.get("_archivebox_config_resolves_plugins", False)) + if request_config is None or (resolve_plugins and not request_config_resolves_plugins): + request_config = get_config(resolve_plugins=resolve_plugins) + request.archivebox_config = request_config + request._archivebox_config_resolves_plugins = resolve_plugins + return request_config + + +def get_config( + defaults: ConfigOverrides | None = None, + overrides: ConfigOverrides | None = None, + base_config: ArchiveBoxBaseConfig | Mapping[str, object] | None = None, + persona: Any = None, + crawl: Any = None, + snapshot: Any = None, + machine: Any = None, + include_machine: bool = True, + resolve_plugins: bool = True, + redact_sensitive: bool = False, +) -> ArchiveBoxBaseConfig: + """ + Get merged config from all sources. + + Defaults are hydrated by pydantic from core/plugin defaults, + ArchiveBox.conf, and environment variables. Persisted Machine/Persona + values then apply for live crawl-execution scope, while Crawl/Snapshot + rows apply their frozen crawl-scope values. Explicit overrides win last. + + Crawl-execution config is not persisted on Crawl.config. It is rederived + from current Machine/Persona state and hydrated process defaults each time. 
+ """ + if crawl is None and snapshot is not None: + crawl = snapshot.crawl + + crawl_config_base = crawl is not None and base_config is None + + if include_machine and machine is None: + try: + from django.apps import apps + + if apps.ready: + from archivebox.machine.models import Machine + + machine = Machine.current() + except Exception: + machine = None + + if persona is None and crawl is not None: + persona = crawl.resolve_persona() + + config_data: ConfigPayload = dict(defaults or {}) + base_config_payload: ConfigPayload = {} + if crawl_config_base: + config_data.update( + normalize_runtime_config(ArchiveBoxConfig().model_dump(mode="json"), exclude_runtime_derived=True, json_safe=False), + ) + config_data.update(normalize_runtime_config(dict(crawl.config or {}), exclude_crawl_execution=True, json_safe=False)) + elif base_config is not None: + if isinstance(base_config, ArchiveBoxBaseConfig): + base_config_payload.update(base_config.model_dump(mode="json")) + else: + base_config_payload.update(dict(base_config)) + config_data.update(normalize_runtime_config(base_config_payload, exclude_runtime_derived=True, json_safe=False)) + else: + config_data.update( + normalize_runtime_config(ArchiveBoxConfig().model_dump(mode="json"), exclude_runtime_derived=True, json_safe=False), + ) + legacy_permissions = permissions_from_legacy_public_flags({**BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE), **os.environ}) + if legacy_permissions: + config_data["PERMISSIONS"] = legacy_permissions + + scope_overrides: ConfigPayload = {} + + if include_machine and machine is not None and machine.config: + from archivebox.machine.models import _sanitize_machine_config + + scope_overrides.update( + normalize_runtime_config( + _sanitize_machine_config(machine.config, lib_dir=config_data.get("LIB_DIR")), + only_crawl_execution=crawl_config_base, + exclude_runtime_derived=True, + json_safe=False, + ), + ) + + if persona is not None: + persona_config = normalize_runtime_config( + 
persona.get_derived_config(), + exclude_runtime_derived=not crawl_config_base, + json_safe=False, + ) + if crawl_config_base: + scope_by_key = ArchiveBoxConfig._scope_by_key() + crawl_keys = set(dict(crawl.config or {})) + persona_config = { + key: value + for key, value in persona_config.items() + if scope_by_key.get(key) == _SCOPE_CRAWL_EXECUTION or key not in crawl_keys + } + scope_overrides.update(persona_config) + + if crawl is not None and crawl.config and not crawl_config_base: + scope_overrides.update(normalize_runtime_config(crawl.config, exclude_crawl_execution=True, json_safe=False)) + + if snapshot is not None and snapshot.config: + scope_overrides.update(normalize_runtime_config(snapshot.config, exclude_crawl_execution=True, json_safe=False)) + + if overrides: + scope_overrides.update(normalize_runtime_config(overrides, exclude_crawl_execution=True, json_safe=False)) + + legacy_scope_permissions = permissions_from_legacy_public_flags(scope_overrides) + if legacy_scope_permissions: + scope_overrides["PERMISSIONS"] = legacy_scope_permissions + + archivebox_scope_overrides = {key: value for key, value in scope_overrides.items() if key in _archivebox_config_input_names()} + config_data.update(archivebox_scope_overrides) + + if resolve_plugins: + plugin_schemas = {plugin_name: schema for plugin_name, schema in PLUGIN_CONFIG_SCHEMAS.items() if isinstance(schema, dict)} + plugin_global_config = {key: str(value) if isinstance(value, Path) else value for key, value in config_data.items()} + plugin_user_config = _plugin_user_config( + { + **normalize_runtime_config(config_data, only_crawl_execution=True, json_safe=False), + **scope_overrides, + }, + ) + if not crawl_config_base: + plugin_user_config = { + **normalize_runtime_config( + BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE), + exclude_runtime_derived=True, + json_safe=False, + ), + **plugin_user_config, + } + plugin_sections = resolve_plugin_configs( + plugin_schemas, + 
global_config=plugin_global_config, + user_config=plugin_user_config, + environ={}, + ) + for plugin_config in plugin_sections.values(): + for key, value in plugin_config.items(): + if key in ArchiveBoxBaseConfig.model_fields and key not in archivebox_scope_overrides and key not in base_config_payload: + continue + config_data[key] = value + if base_config_payload: + config_data.update( + { + key: value + for key, value in normalize_runtime_config(base_config_payload, exclude_runtime_derived=True, json_safe=False).items() + if key in _archivebox_config_input_names() + }, + ) + if crawl_config_base: + config_data.update(normalize_runtime_config(dict(crawl.config or {}), exclude_crawl_execution=True, json_safe=False)) + config_data.update(archivebox_scope_overrides) + + # Decode JSON-encoded complex values (dict/list fields) that came from + # string-only sources before validation. ``IniConfigSettingsSource`` does + # this for the ArchiveBox.conf path, but Machine.config (mirrored from the + # INI via ``_coerce_to_str_dict``) and plugin/env scope overrides bypass + # pydantic-settings sources entirely — they feed JSON strings directly + # into ``model_validate``, which rejects ``"{...}"`` for a ``dict[str, str]`` + # field. Run pydantic-settings' own complex-value decoder here so every + # source converges on the same shape before validation. 
+ _complex_decoder = IniConfigSettingsSource(ArchiveBoxConfig) + for _field_name, _field in ArchiveBoxConfig.model_fields.items(): + if _field_name not in config_data: + continue + _raw = config_data[_field_name] + if not isinstance(_raw, str) or not _raw: + continue + if _complex_decoder.field_is_complex(_field): + config_data[_field_name] = _complex_decoder.prepare_field_value( + _field_name, + _field, + _raw, + True, + ) + + config = ArchiveBoxConfig.model_validate(config_data) + if redact_sensitive: + for key in type(config).model_fields: + value = config[key] + if is_sensitive_config_key(key) and value not in (None, ""): + setattr(config, key, SENSITIVE_CONFIG_VALUE_REDACTED) + os.environ["LIB_DIR"] = str(config.LIB_DIR) + os.environ["ABXPKG_LIB_DIR"] = str(config.LIB_DIR) + archiving_warning_key = (config.TIMEOUT, config.USE_COLOR) + if archiving_warning_key not in _WARNED_ARCHIVING_CONFIGS: + config.warn_if_invalid() + _WARNED_ARCHIVING_CONFIGS.add(archiving_warning_key) + return config + + +def get_all_configs() -> dict[str, BaseConfigSet]: + """Get all config section objects as a dictionary.""" + return { + "SHELL_CONFIG": ShellConfig(), + "STORAGE_CONFIG": StorageConfig(), + "GENERAL_CONFIG": GeneralConfig(), + "SERVER_CONFIG": ServerConfig(), + "DATABASE_CONFIG": DatabaseConfig(), + "ARCHIVING_CONFIG": ArchivingConfig(), + "SEARCH_BACKEND_CONFIG": SearchBackendConfig(), + "LDAP_CONFIG": LDAPConfig(), + } diff --git a/archivebox/config/configset.py b/archivebox/config/configset.py new file mode 100644 index 0000000000..d4b7a959d4 --- /dev/null +++ b/archivebox/config/configset.py @@ -0,0 +1,211 @@ +"""Pydantic-backed config loading for ArchiveBox.""" + +__package__ = "archivebox.config" + +from pathlib import Path +from typing import Any, ClassVar +from configparser import ConfigParser + +from pydantic_settings import BaseSettings, PydanticBaseSettingsSource, SettingsConfigDict + +COMPUTED_CONFIG_KEYS = ( + "TERM_WIDTH", + "COMMIT_HASH", + "BUILD_TIME", + 
"USES_SUBDOMAIN_ROUTING", + "ENABLES_FULL_JS_REPLAY", + "CONTROL_PLANE_ENABLED", + "BLOCK_UNSAFE_METHODS", + "SHOULD_NEUTER_RISKY_REPLAY", + "IS_UNSAFE_MODE", + "IS_DANGEROUS_MODE", + "IS_LOWER_SECURITY_MODE", + "URL_ALLOWLIST_PTN", + "URL_DENYLIST_PTN", +) + + +class CaseConfigParser(ConfigParser): + def optionxform(self, optionstr: str) -> str: + return optionstr + + +# ``IniConfigSettingsSource.get_field_value`` is called once per pydantic field +# (hundreds of fields across the GlobalConfig + plugin configs). Without +# caching, every field re-opens and re-parses ``ArchiveBox.conf`` from disk — +# that's the dominant cost of ``get_config()`` (each call was ~130ms; profile +# showed 632 ``parser.read(config_path)`` invocations per call). The cache is +# keyed on (path, mtime) so external edits to the file still get picked up. +_INI_CACHE: dict[tuple[str, float], dict[str, Any]] = {} + + +def _read_ini_config_cached(config_path_str: str) -> dict[str, Any]: + config_path = Path(config_path_str) + try: + mtime = config_path.stat().st_mtime + except FileNotFoundError: + return {} + cache_key = (str(config_path), mtime) + cached = _INI_CACHE.get(cache_key) + if cached is not None: + return cached + parser = CaseConfigParser() + parser.read(config_path) + flat = {key.upper(): value for section in parser.sections() for key, value in parser.items(section)} + _INI_CACHE.clear() + _INI_CACHE[cache_key] = flat + return flat + + +class IniConfigSettingsSource(PydanticBaseSettingsSource): + """ + Custom settings source that reads from ArchiveBox.conf (INI format). + Flattens all sections into a single namespace. + """ + + def get_field_value(self, field: Any, field_name: str) -> tuple[Any, str, bool]: + config_vals = self._load_config_file() + field_value = config_vals.get(field_name.upper()) + # Mark complex-typed fields (``dict``/``list``) so the parent + # ``prepare_field_value`` JSON-decodes the INI string before pydantic + # validates against the dict/list annotation. 
Without this, e.g. + # ``ABX_INSTALL_CACHE`` arrives as a raw JSON string and pydantic + # rejects it with ``Input should be a valid dictionary``. + value_is_complex = bool(field_value is not None and self.field_is_complex(field)) + return field_value, field_name, value_is_complex + + def __call__(self) -> dict[str, Any]: + # Use the per-field path (``get_field_value`` + ``prepare_field_value``) + # so complex types get JSON-decoded. The previous flat-dict return + # skipped pydantic-settings' complex-value handling entirely. + result: dict[str, Any] = {} + for field_name, field in self.settings_cls.model_fields.items(): + value, key, value_is_complex = self.get_field_value(field, field_name) + if value is None: + continue + prepared = self.prepare_field_value(field_name, field, value, value_is_complex) + if prepared is not None: + result[key] = prepared + return result + + def _load_config_file(self) -> dict[str, Any]: + try: + from archivebox.config.constants import CONSTANTS + + config_path = CONSTANTS.CONFIG_FILE + except ImportError: + return {} + + return _read_ini_config_cached(str(config_path)) + + +class BaseConfigSet(BaseSettings): + """ + Base class for config sections. + + Automatically loads values from (highest to lowest priority): + 1. Environment variables + 2. ArchiveBox.conf file (INI format, flattened) + 3. 
Default values + + Subclasses define fields with defaults and types: + + class ShellConfig(BaseConfigSet): + DEBUG: bool = Field(default=False) + USE_COLOR: bool = Field(default=True) + """ + + model_config = SettingsConfigDict( + env_prefix="", + extra="ignore", + validate_default=True, + populate_by_name=True, + ) + computed_config_keys: ClassVar[tuple[str, ...]] = () + + @classmethod + def settings_customise_sources( + cls, + settings_cls: type[BaseSettings], + init_settings: PydanticBaseSettingsSource, + env_settings: PydanticBaseSettingsSource, + dotenv_settings: PydanticBaseSettingsSource, + file_secret_settings: PydanticBaseSettingsSource, + ) -> tuple[PydanticBaseSettingsSource, ...]: + """ + Define the order of settings sources (first = highest priority). + """ + return ( + init_settings, # 1. Passed to __init__ + env_settings, # 2. Environment variables + IniConfigSettingsSource(settings_cls), # 3. ArchiveBox.conf file + # dotenv_settings, # Skip .env files + # file_secret_settings, # Skip secrets files + ) + + @classmethod + def load_from_file(cls, config_path: Path) -> dict[str, str]: + """Load config values from INI file.""" + if not config_path.exists(): + return {} + + parser = CaseConfigParser() + parser.read(config_path) + + # Flatten all sections into single namespace + return {key.upper(): value for section in parser.sections() for key, value in parser.items(section)} + + def __getitem__(self, key: str) -> Any: + if key in type(self).model_fields: + return getattr(self, key) + if self.__pydantic_extra__ and key in self.__pydantic_extra__: + return self.__pydantic_extra__[key] + if key in self.computed_config_keys: + return getattr(self, key) + raise KeyError(key) + + def __setitem__(self, key: str, value: Any) -> None: + if key in type(self).model_fields: + object.__setattr__(self, key, value) + return + if key in self.computed_config_keys: + raise KeyError(f"{key} is computed and cannot be set") + if self.model_config.get("extra") != "allow": + 
raise KeyError(f"Unknown config key: {key}") + extra = self.__pydantic_extra__ + if extra is None: + extra = {} + object.__setattr__(self, "__pydantic_extra__", extra) + extra[key] = value + + def update(self, *args, **kwargs) -> None: + values = dict(*args, **kwargs) + for key, value in values.items(): + if key in self.computed_config_keys: + continue + self[key] = value + + def __contains__(self, key: str) -> bool: + return ( + key in type(self).model_fields + or bool(self.__pydantic_extra__ and key in self.__pydantic_extra__) + or key in self.computed_config_keys + ) + + def get(self, key: str, default: Any = None) -> Any: + return self[key] if key in self else default + + def as_dict(self) -> dict[str, Any]: + data = self.model_dump() + for key in self.computed_config_keys: + data[key] = getattr(self, key) + return data + + def items(self): + return self.as_dict().items() + + def keys(self): + return self.as_dict().keys() + + def values(self): + return self.as_dict().values() diff --git a/archivebox/config/constants.py b/archivebox/config/constants.py new file mode 100644 index 0000000000..3696ed59c4 --- /dev/null +++ b/archivebox/config/constants.py @@ -0,0 +1,298 @@ +""" +Constants are for things that never change at runtime. +(but they can change from run-to-run or machine-to-machine) + +DATA_DIR will never change at runtime, but you can run +archivebox from inside a different DATA_DIR on the same machine. + +This is loaded very early in the archivebox startup flow, so nothing in this file +or imported from this file should import anything from archivebox.config.common, +django, other INSTALLED_APPS, or anything else that is not in a standard library. 
+""" + +__package__ = "archivebox.config" + +import re +import sys + +from pathlib import Path + +from platformdirs import user_config_path + +from archivebox.misc.logging import AttrDict, DEFAULT_CLI_COLORS + +from .paths import ( + PACKAGE_DIR, + DATA_DIR, + ARCHIVE_DIR, + USERS_DIR, + _env_path, + get_collection_id, + get_machine_id, + get_machine_type, +) +from .permissions import ( + IS_ROOT, + IN_DOCKER, + RUNNING_AS_UID, + RUNNING_AS_GID, + DEFAULT_UID, + DEFAULT_GID, + ARCHIVEBOX_USER, + ARCHIVEBOX_GROUP, +) +from .version import detect_installed_version + +###################### Config ########################## + + +class ConstantsDict: + PACKAGE_DIR: Path = PACKAGE_DIR + DATA_DIR: Path = DATA_DIR + ARCHIVE_DIR: Path = ARCHIVE_DIR + USERS_DIR: Path = USERS_DIR + + MACHINE_TYPE: str = get_machine_type() + MACHINE_ID: str = get_machine_id() + COLLECTION_ID: str = get_collection_id(DATA_DIR) + + # Host system + VERSION: str = detect_installed_version(PACKAGE_DIR) + IN_DOCKER: bool = IN_DOCKER + + # Permissions + IS_ROOT: bool = IS_ROOT + ARCHIVEBOX_USER: int = ARCHIVEBOX_USER + ARCHIVEBOX_GROUP: int = ARCHIVEBOX_GROUP + RUNNING_AS_UID: int = RUNNING_AS_UID + RUNNING_AS_GID: int = RUNNING_AS_GID + DEFAULT_UID: int = DEFAULT_UID + DEFAULT_GID: int = DEFAULT_GID + IS_INSIDE_VENV: bool = sys.prefix != sys.base_prefix + + # Source code dirs + PACKAGE_DIR_NAME: str = PACKAGE_DIR.name + TEMPLATES_DIR_NAME: str = "templates" + TEMPLATES_DIR: Path = PACKAGE_DIR / TEMPLATES_DIR_NAME + STATIC_DIR_NAME: str = "static" + STATIC_DIR: Path = TEMPLATES_DIR / STATIC_DIR_NAME + + # Data dirs + ARCHIVE_DIR_NAME: str = "archive" + USERS_DIR_NAME: str = "users" + SNAPSHOTS_DIR_NAME: str = "snapshots" + CRAWLS_DIR_NAME: str = "crawls" + SOURCES_DIR_NAME: str = "sources" + PERSONAS_DIR_NAME: str = "personas" + CACHE_DIR_NAME: str = "cache" + LOGS_DIR_NAME: str = "logs" + CUSTOM_PLUGINS_DIR_NAME: str = "custom_plugins" + CUSTOM_TEMPLATES_DIR_NAME: str = "custom_templates" + 
ARCHIVE_DIR: Path = ARCHIVE_DIR + USERS_DIR: Path = USERS_DIR + SOURCES_DIR: Path = DATA_DIR / SOURCES_DIR_NAME + PERSONAS_DIR: Path = DATA_DIR / PERSONAS_DIR_NAME + LOGS_DIR: Path = DATA_DIR / LOGS_DIR_NAME + CACHE_DIR: Path = DATA_DIR / CACHE_DIR_NAME + CUSTOM_TEMPLATES_DIR: Path = DATA_DIR / CUSTOM_TEMPLATES_DIR_NAME + USER_PLUGINS_DIR: Path = DATA_DIR / CUSTOM_PLUGINS_DIR_NAME + + # Data dir files + CONFIG_FILENAME: str = "ArchiveBox.conf" + SQL_INDEX_FILENAME: str = "index.sqlite3" + CONFIG_FILE: Path = DATA_DIR / CONFIG_FILENAME + DATABASE_FILE: Path = DATA_DIR / SQL_INDEX_FILENAME + + JSON_INDEX_FILENAME: str = "index.json" + JSONL_INDEX_FILENAME: str = "index.jsonl" + HTML_INDEX_FILENAME: str = "index.html" + ROBOTS_TXT_FILENAME: str = "robots.txt" + FAVICON_FILENAME: str = "favicon.ico" + + # Runtime dirs + TMP_DIR_NAME: str = "tmp" + DEFAULT_TMP_DIR: Path = DATA_DIR / TMP_DIR_NAME / MACHINE_ID # ./data/tmp/abc3244323 + + LIB_DIR_NAME: str = "lib" + DEFAULT_LIB_DIR: Path = _env_path( + "LIB_DIR", + user_config_path("abx") / LIB_DIR_NAME, + ) + + RESERVED_ARCHIVE_DIR_NAMES: frozenset[str] = frozenset( + ( + USERS_DIR_NAME, + SNAPSHOTS_DIR_NAME, + CRAWLS_DIR_NAME, + "invalid", + ".DS_Store", + ), + ) + + # Config constants + TIMEZONE: str = "UTC" + DEFAULT_CLI_COLORS: dict[str, str] = DEFAULT_CLI_COLORS + DISABLED_CLI_COLORS: dict[str, str] = AttrDict({k: "" for k in DEFAULT_CLI_COLORS}) + + # Hard safety limits (seconds) + MAX_HOOK_RUNTIME_SECONDS: int = 60 * 60 * 12 # 12 hours + MAX_SNAPSHOT_RUNTIME_SECONDS: int = 60 * 60 * 12 # 12 hours + + ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE + + STATICFILE_EXTENSIONS: frozenset[str] = frozenset( + ( + # 99.999% of the time, URLs ending in these extensions are static files + # that can be downloaded as-is, not html pages that need to be rendered + "gif", + "jpeg", + "jpg", + "png", + "tif", + "tiff", + "wbmp", + "ico", + "jng", + "bmp", + "svg", + "svgz", + "webp", + "ps", + "eps", + 
"ai", + "mp3", + "mp4", + "m4a", + "mpeg", + "mpg", + "mkv", + "mov", + "webm", + "m4v", + "flv", + "wmv", + "avi", + "ogg", + "ts", + "m3u8", + "pdf", + "txt", + "rtf", + "rtfd", + "doc", + "docx", + "ppt", + "pptx", + "xls", + "xlsx", + "atom", + "rss", + "css", + "js", + "json", + "dmg", + "iso", + "img", + "rar", + "war", + "hqx", + "zip", + "gz", + "bz2", + "7z", + # Less common extensions to consider adding later + # jar, swf, bin, com, exe, dll, deb + # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm, + # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf, + # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml + # These are always treated as pages, not as static files, never add them: + # html, htm, shtml, xhtml, xml, aspx, php, cgi + ), + ) + + PIP_RELATED_NAMES: frozenset[str] = frozenset( + ( + ".venv", + "venv", + "virtualenv", + ".virtualenv", + ), + ) + NPM_RELATED_NAMES: frozenset[str] = frozenset( + ( + "node_modules", + "package.json", + "package-lock.json", + "yarn.lock", + ), + ) + + # When initializing archivebox in a new directory, we check to make sure the dir is + # actually empty so that we dont clobber someone's home directory or desktop by accident. + # These files are exceptions to the is_empty check when we're trying to init a new dir, + # as they could be from a previous archivebox version, system artifacts, dependencies, etc. 
+ ALLOWED_IN_DATA_DIR: frozenset[str] = frozenset( + ( + *PIP_RELATED_NAMES, + *NPM_RELATED_NAMES, + ### Dirs: + ARCHIVE_DIR_NAME, + SOURCES_DIR_NAME, + LOGS_DIR_NAME, + CACHE_DIR_NAME, + LIB_DIR_NAME, + TMP_DIR_NAME, + PERSONAS_DIR_NAME, + CUSTOM_TEMPLATES_DIR_NAME, + CUSTOM_PLUGINS_DIR_NAME, + "invalid", + "users", + "machine", + # Backwards compatibility with old directory names + "user_plugins", # old name for USER_PLUGINS_DIR (now 'plugins') + "user_templates", # old name for CUSTOM_TEMPLATES_DIR (now 'templates') + "static", # created by old static exports <v0.6.0 + "sonic", # created by plugin-managed Sonic FTS worker + ".git", + ".svn", + ### Files: + CONFIG_FILENAME, + SQL_INDEX_FILENAME, + f"{SQL_INDEX_FILENAME}-wal", + f"{SQL_INDEX_FILENAME}-shm", + "search.sqlite3", + "queue.sqlite3", + "queue.sqlite3-wal", + "queue.sqlite3-shm", + JSON_INDEX_FILENAME, + JSONL_INDEX_FILENAME, + HTML_INDEX_FILENAME, + ROBOTS_TXT_FILENAME, + FAVICON_FILENAME, + CONFIG_FILENAME, + f"{CONFIG_FILENAME}.bak", + f".{CONFIG_FILENAME}.bak", + "static_index.json", + ".DS_Store", + ".gitignore", + "lost+found", + ".DS_Store", + ".env", + ".collection_id", + ".archivebox_id", + "Dockerfile", + ), + ) + + @classmethod + def __getitem__(cls, key: str): + # so it behaves like a dict[key] == dict.key or object attr + return getattr(cls, key) + + +CONSTANTS = ConstantsDict +CONSTANTS_CONFIG = AttrDict({key: value for key, value in CONSTANTS.__dict__.items() if key.isupper() and not key.startswith("_")}) + +# add all key: values to globals() for easier importing, e.g.: +# from archivebox.config.constants import IS_ROOT, PERSONAS_DIR, ... 
+# globals().update(CONSTANTS) diff --git a/archivebox/config/django.py b/archivebox/config/django.py new file mode 100644 index 0000000000..12354e0db8 --- /dev/null +++ b/archivebox/config/django.py @@ -0,0 +1,167 @@ +__package__ = "archivebox.config" + +import os +import sys +import subprocess + +from datetime import datetime, timezone + +from rich.console import Console + +import django +import django.db + +from archivebox.misc import logging + +from .constants import CONSTANTS +from .common import get_config + +CONFIG = get_config() + +if not CONFIG.USE_COLOR: + os.environ["NO_COLOR"] = "1" +if not CONFIG.SHOW_PROGRESS: + os.environ["TERM"] = "dumb" + +STDOUT = CONSOLE = Console() +STDERR = Console(stderr=True) +logging.CONSOLE = CONSOLE + + +DJANGO_SET_UP = False + + +def setup_django(check_db=False, in_memory_db=False) -> None: + from rich.panel import Panel + + global DJANGO_SET_UP + + if DJANGO_SET_UP: + # raise Exception('django is already set up!') + # TODO: figure out why CLI entrypoints with init_pending are running this twice sometimes + return + + # Third-party patches are only needed once Django/apps are about to load. + # Keeping them out of archivebox.__init__ avoids paying Django/Daphne setup + # cost for cheap CLI startup paths like `archivebox <cmd> --help`. 
+ import archivebox.misc.monkey_patches # noqa: F401 + + from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, SudoPermission + + # if running as root, chown the data dir to the archivebox user to make sure it's accessible to the archivebox user + if IS_ROOT and ARCHIVEBOX_USER != 0: + with SudoPermission(uid=0): + # running as root is a special case where it's ok to be a bit slower + # make sure data dir is always owned by the correct user + subprocess.run(["chown", f"{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}", str(CONSTANTS.DATA_DIR)], stderr=subprocess.DEVNULL) + if CONSTANTS.DATA_DIR.exists(): + for child in CONSTANTS.DATA_DIR.iterdir(): + subprocess.run(["chown", f"{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}", str(child)], stderr=subprocess.DEVNULL) + + # Suppress the "database access during app initialization" warning + # This warning can be triggered during django.setup() but is safe to ignore + # since we're doing intentional setup operations + import warnings + + warnings.filterwarnings( + "ignore", + message=".*Accessing the database during app initialization.*", + category=RuntimeWarning, + ) + + try: + from django.core.management import call_command + + if in_memory_db: + raise Exception("dont use this anymore") + + # some commands dont store a long-lived sqlite3 db file on disk. 
+ # in those cases we create a temporary in-memory db and run the migrations + # immediately to get a usable in-memory-database at startup + os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:") + django.setup() + + call_command("migrate", interactive=False, verbosity=0) + else: + # Otherwise use default sqlite3 file-based database and initialize django + # without running migrations automatically (user runs them manually by calling init) + try: + django.setup() + except Exception as e: + is_using_meta_cmd = any(ignored_subcommand in sys.argv for ignored_subcommand in ("help", "version", "--help", "--version")) + if not is_using_meta_cmd: + # show error message to user only if they're not running a meta command / just trying to get help + STDERR.print() + STDERR.print( + Panel( + f"\n[red]{e.__class__.__name__}[/red]: [yellow]{e}[/yellow]\nPlease check your config and [blue]DATA_DIR[/blue] permissions.\n", + title="\n\n[red][X] Error while trying to load database![/red]", + subtitle="[grey53]NO WRITES CAN BE PERFORMED[/grey53]", + expand=False, + style="bold red", + ), + ) + STDERR.print() + import traceback + + traceback.print_exc() + return + + from archivebox.core.settings_logging import ERROR_LOG as DEFAULT_ERROR_LOG + + # log startup message to the error log + error_log = DEFAULT_ERROR_LOG + with open(error_log, "a", encoding="utf-8") as f: + command = " ".join(sys.argv) + ts = datetime.now(timezone.utc).strftime("%Y-%m-%d__%H:%M:%S") + config = get_config() + f.write(f"\n> {command}; TS={ts} VERSION={CONSTANTS.VERSION} IN_DOCKER={config.IN_DOCKER} IS_TTY={config.IS_TTY}\n") + + if check_db: + # make sure the data dir is owned by a non-root user + if CONSTANTS.DATA_DIR.stat().st_uid == 0: + STDERR.print("[red][X] Error: ArchiveBox DATA_DIR cannot be owned by root![/red]") + STDERR.print(f" {CONSTANTS.DATA_DIR}") + STDERR.print() + STDERR.print("[violet]Hint:[/violet] Are you running archivebox in the right folder? 
class LDAPConfig(BaseConfigSet):
    """
    Settings for authenticating ArchiveBox users against an LDAP directory.

    Only loads and validates if django-auth-ldap is installed.
    These settings integrate with Django's LDAP authentication backend.
    Everything defaults to disabled/unset; the connection options only become
    mandatory once LDAP_ENABLED is turned on (see validate_ldap_config).
    """

    toml_section_header: str = "LDAP_CONFIG"
    _scope: str = PrivateAttr(default="server")

    # Master switch: nothing below is validated while this is False.
    LDAP_ENABLED: bool = Field(default=False)

    # Connection / bind settings, all four are required when LDAP_ENABLED=True.
    LDAP_SERVER_URI: str | None = Field(default=None)
    LDAP_BIND_DN: str | None = Field(default=None)
    LDAP_BIND_PASSWORD: str | None = Field(default=None)
    LDAP_USER_BASE: str | None = Field(default=None)

    # User lookup filter and attribute names, each with a usable default.
    LDAP_USER_FILTER: str = Field(default="(uid=%(user)s)")
    LDAP_USERNAME_ATTR: str = Field(default="username")
    LDAP_FIRSTNAME_ATTR: str = Field(default="givenName")
    LDAP_LASTNAME_ATTR: str = Field(default="sn")
    LDAP_EMAIL_ATTR: str = Field(default="mail")
    LDAP_CREATE_SUPERUSER: bool = Field(default=False)

    def validate_ldap_config(self) -> tuple[bool, str]:
        """
        Check that the mandatory LDAP options are filled in.

        Returns:
            ``(True, "")`` when LDAP is disabled or fully configured, otherwise
            ``(False, message)`` where the message names every unset option.
        """
        ldap_is_off = not self.LDAP_ENABLED
        if ldap_is_off:
            return True, ""

        mandatory_options = (
            "LDAP_SERVER_URI",
            "LDAP_BIND_DN",
            "LDAP_BIND_PASSWORD",
            "LDAP_USER_BASE",
        )
        # An option counts as missing when it is falsy (None or empty string).
        missing = [option for option in mandatory_options if not getattr(self, option)]

        if not missing:
            return True, ""

        return False, f"LDAP_* config options must all be set if LDAP_ENABLED=True\nMissing: {', '.join(missing)}"
source code dir +DATA_DIR: Path = Path(os.getcwd()).resolve() # archivebox user data dir +MAX_TMP_SOCKET_URL_LENGTH = 90 +SUPERVISORD_SOCKET_FILENAME = "supervisord.sock" + + +def _env_path(key: str, default: Path) -> Path: + path = Path(os.environ.get(key) or default).expanduser() + if not path.is_absolute(): + path = DATA_DIR / path + return path.resolve() + + +ARCHIVE_DIR: Path = DATA_DIR / "archive" # archivebox snapshot data dir +USERS_DIR: Path = ARCHIVE_DIR / "users" # archivebox user-scoped crawl/snapshot data dir + +IN_DOCKER = os.environ.get("IN_DOCKER", False) in ("1", "true", "True", "TRUE", "yes") + +DATABASE_FILE = DATA_DIR / "index.sqlite3" + +############################################################################################# + + +def _get_collection_id(DATA_DIR=DATA_DIR, force_create=False) -> str: + collection_id_file = DATA_DIR / ".archivebox_id" + + try: + return collection_id_file.read_text().strip() + except (OSError, FileNotFoundError, PermissionError): + pass + + # hash the machine_id + collection dir path + creation time to get a unique collection_id + machine_id = get_machine_id() + collection_path = DATA_DIR.resolve() + try: + creation_date = DATA_DIR.stat().st_ctime + except Exception: + creation_date = datetime.now().isoformat() + collection_id = hashlib.sha256(f"{machine_id}:{collection_path}@{creation_date}".encode()).hexdigest()[:8] + + try: + # only persist collection_id file if we already have an index.sqlite3 file present + # otherwise we might be running in a directory that is not a collection, no point creating cruft files + collection_is_active = os.path.isfile(DATABASE_FILE) and os.path.isdir(ARCHIVE_DIR) and os.access(DATA_DIR, os.W_OK) + if collection_is_active or force_create: + collection_id_file.write_text(collection_id) + + # if we're running as root right now, make sure the collection_id file is owned by the archivebox user + if IS_ROOT: + with SudoPermission(uid=0): + if ARCHIVEBOX_USER == 0: + 
@cache
def get_machine_id() -> str:
    """Get a short, stable, unique ID for the current machine (e.g. abc45678)

    Tries the optional ``machineid`` package first, then a SHA-256 of
    ``uuid.getnode()``; returns ``"unknown"`` if both strategies fail.
    The result is cached for the lifetime of the process.
    """
    try:
        import machineid

        return machineid.hashed_id("archivebox")[:8]
    except Exception:
        # machineid is missing or failed, fall through to the stdlib strategy
        pass

    try:
        import uuid
        import hashlib

        node_digest = hashlib.sha256(str(uuid.getnode()).encode()).hexdigest()
        return node_digest[:8]
    except Exception:
        return "unknown"
def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = None, fallback=True, chown=True) -> bool:
    """Check whether ``dir_path`` is writable when acting as ``uid``.

    A probe file (``.permissions_test``) is written and removed inside
    ``SudoPermission(uid=uid, fallback=fallback)``. If that fails and ``chown``
    is true, the directory is chowned to ``uid:gid`` and the probe is retried
    exactly once.

    Args:
        dir_path: directory to probe.
        uid: user id to act as; ``None`` means the current effective uid.
        gid: group id used for the chown repair; ``None`` means the current
            effective gid.
        fallback: forwarded to ``SudoPermission``.
        chown: whether to attempt the ownership repair before giving up.

    Returns:
        True if the probe file could be written and removed, else False.
    """
    # Compare against None explicitly: 0 (root) is a legitimate uid/gid and
    # must not be swapped for the current user, which `uid or current` would do.
    uid = os.geteuid() if uid is None else uid
    gid = os.getegid() if gid is None else gid

    test_file = dir_path / ".permissions_test"
    try:
        with SudoPermission(uid=uid, fallback=fallback):
            test_file.exists()
            test_file.write_text(f"Checking if uid={uid} gid={gid} can write to dir")
            test_file.unlink()
            return True
    except (OSError, PermissionError):
        if chown:
            # try fixing it using sudo permissions
            with SudoPermission(uid=uid, fallback=fallback):
                subprocess.run(["chown", f"{uid}:{gid}", str(dir_path)], stderr=subprocess.DEVNULL)
            # chown=False on the retry guarantees the recursion stops after one attempt
            return dir_is_writable(dir_path, uid=uid, gid=gid, fallback=fallback, chown=False)
    return False
def create_and_chown_dir(dir_path: Path) -> None:
    """Ensure a runtime directory exists and belongs to the ArchiveBox user.

    Missing parents are created. Ownership is only touched on ``dir_path``
    itself, and only when the directory was just created or is not already
    owned by ``ARCHIVEBOX_USER:ARCHIVEBOX_GROUP``. A failing chown is ignored.
    """
    already_present = dir_path.exists()
    dir_path.mkdir(parents=True, exist_ok=True)

    try:
        dir_stat = dir_path.stat()
    except OSError:
        # cannot inspect the directory, so there is nothing more to do
        return

    if already_present:
        owner_matches = dir_stat.st_uid == ARCHIVEBOX_USER
        if owner_matches and dir_stat.st_gid == ARCHIVEBOX_GROUP:
            return

    with SudoPermission(uid=0, fallback=True):
        try:
            os.chown(dir_path, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP)
        except (OSError, PermissionError):
            pass
def get_or_create_working_tmp_dir(autofix=True, quiet=True, config: "ArchiveBoxConfig | None" = None, **config_kwargs):
    """Pick the first TMP_DIR candidate that passes ``check_tmp_dir``.

    Candidates come from ``tmp_dir_candidates(config)`` and are tried in order.
    If none passes the full check, the first candidate that exists, is writable
    and has a short enough socket path is used instead. With ``autofix`` the
    chosen directory is exported as ``TMP_DIR`` in ``os.environ`` whenever it
    differs from ``config.TMP_DIR``.

    Returns the chosen path, or None when nothing is usable and ``quiet`` is
    true. Raises OSError when nothing is usable and ``quiet`` is false.
    """
    from archivebox.config.common import get_config
    from archivebox.misc.checks import check_tmp_dir

    config = config or get_config(**config_kwargs)

    def _adopt(chosen):
        # export the winner so later config loads pick up the same directory
        if autofix and config.TMP_DIR != chosen:
            os.environ["TMP_DIR"] = str(chosen)
        return chosen

    candidates = tmp_dir_candidates(config)
    fallback_candidate = None
    for candidate in candidates:
        try:
            create_and_chown_dir(candidate)
        except Exception:
            pass

        if check_tmp_dir(candidate, throw=False, quiet=True, must_exist=True):
            return _adopt(candidate)

        if fallback_candidate is not None:
            # a fallback is already remembered; only a fully valid dir can beat it
            continue
        try:
            usable = candidate.exists() and dir_is_writable(candidate) and tmp_dir_socket_path_is_short_enough(candidate)
        except Exception:
            continue
        if usable:
            fallback_candidate = candidate

    # Some sandboxed environments disallow AF_UNIX binds entirely.
    # Fall back to the shortest writable path so read-only CLI commands can still run,
    # and let later permission checks surface the missing socket support if needed.
    if fallback_candidate:
        return _adopt(fallback_candidate)

    if not quiet:
        raise OSError(f"ArchiveBox is unable to find a writable TMP_DIR, tried {candidates}!")
    return None
+ CANDIDATES = [config.LIB_DIR] + + for candidate in CANDIDATES: + try: + create_and_chown_dir(candidate) + except Exception: + pass + if check_lib_dir(candidate, throw=False, quiet=True, must_exist=True): + if autofix and config.LIB_DIR != candidate: + os.environ["LIB_DIR"] = str(candidate) + return candidate + + if not quiet: + raise OSError(f"ArchiveBox is unable to find a writable LIB_DIR, tried {CANDIDATES}!") + + +def get_data_locations(config: "ArchiveBoxConfig | None" = None, **config_kwargs): + from archivebox.config.constants import CONSTANTS + from archivebox.config.common import get_config + from archivebox.misc.logging import AttrDict + + config = config or get_config(**config_kwargs) + try: + tmp_dir = get_or_create_working_tmp_dir(autofix=True, quiet=True, config=config) or config.TMP_DIR + except Exception: + tmp_dir = config.TMP_DIR + + return AttrDict( + { + "DATA_DIR": { + "path": DATA_DIR.resolve(), + "enabled": True, + "is_valid": os.path.isdir(DATA_DIR) and os.access(DATA_DIR, os.R_OK) and os.access(DATA_DIR, os.W_OK), + "is_mount": os.path.ismount(DATA_DIR.resolve()), + }, + "CONFIG_FILE": { + "path": CONSTANTS.CONFIG_FILE.resolve(), + "enabled": True, + "is_valid": os.path.isfile(CONSTANTS.CONFIG_FILE) + and os.access(CONSTANTS.CONFIG_FILE, os.R_OK) + and os.access(CONSTANTS.CONFIG_FILE, os.W_OK), + }, + "SQL_INDEX": { + "path": DATABASE_FILE.resolve(), + "enabled": True, + "is_valid": os.path.isfile(DATABASE_FILE) and os.access(DATABASE_FILE, os.R_OK) and os.access(DATABASE_FILE, os.W_OK), + "is_mount": os.path.ismount(DATABASE_FILE.resolve()), + }, + "ARCHIVE_DIR": { + "path": CONSTANTS.ARCHIVE_DIR.resolve(), + "enabled": True, + "is_valid": os.path.isdir(CONSTANTS.ARCHIVE_DIR) + and os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) + and os.access(CONSTANTS.ARCHIVE_DIR, os.W_OK), + "is_mount": os.path.ismount(CONSTANTS.ARCHIVE_DIR.resolve()), + }, + "USERS_DIR": { + "path": CONSTANTS.USERS_DIR.resolve(), + "enabled": 
def get_code_locations(config: "ArchiveBoxConfig | None" = None, **config_kwargs):
    """Describe the code-related directories ArchiveBox depends on.

    Returns an ``AttrDict`` keyed by location name. Each entry is a dict with
    ``path`` (resolved), ``enabled`` and ``is_valid``. LIB_DIR is obtained via
    ``get_or_create_working_lib_dir`` and falls back to ``config.LIB_DIR`` if
    that returns nothing or raises.
    """
    from archivebox.config.constants import CONSTANTS
    from archivebox.config.common import get_config
    from archivebox.misc.logging import AttrDict

    config = config or get_config(**config_kwargs)
    try:
        lib_dir = get_or_create_working_lib_dir(autofix=True, quiet=True, config=config) or config.LIB_DIR
    except Exception:
        lib_dir = config.LIB_DIR

    def _optional_readable_dir(directory):
        # optional dirs are only "enabled" when they exist, and valid when also readable
        return {
            "path": directory.resolve(),
            "enabled": os.path.isdir(directory),
            "is_valid": os.path.isdir(directory) and os.access(directory, os.R_OK),  # read
        }

    locations = {}
    locations["PACKAGE_DIR"] = {
        "path": (PACKAGE_DIR).resolve(),
        "enabled": True,
        "is_valid": os.access(PACKAGE_DIR / "__main__.py", os.X_OK),  # executable
    }
    # NOTE(review): validity here is computed from CONSTANTS.STATIC_DIR although the
    # reported path is CONSTANTS.TEMPLATES_DIR -- confirm this is intentional.
    locations["TEMPLATES_DIR"] = {
        "path": CONSTANTS.TEMPLATES_DIR.resolve(),
        "enabled": True,
        "is_valid": os.access(CONSTANTS.STATIC_DIR, os.R_OK) and os.access(CONSTANTS.STATIC_DIR, os.X_OK),  # read + list
    }
    locations["CUSTOM_TEMPLATES_DIR"] = _optional_readable_dir(CONSTANTS.CUSTOM_TEMPLATES_DIR)
    locations["USER_PLUGINS_DIR"] = _optional_readable_dir(CONSTANTS.USER_PLUGINS_DIR)
    locations["LIB_DIR"] = {
        "path": lib_dir.resolve(),
        "enabled": True,
        "is_valid": os.path.isdir(lib_dir) and os.access(lib_dir, os.R_OK) and os.access(lib_dir, os.W_OK),  # read + write
    }
    return AttrDict(locations)
cast(str, max([socket.gethostname(), platform.node()], key=len)) + +IS_ROOT = RUNNING_AS_UID == 0 +IN_DOCKER = os.environ.get("IN_DOCKER", False) in ("1", "true", "True", "TRUE", "yes") +# IN_DOCKER_COMPOSE = # TODO: figure out a way to detect if running in docker compose + + +FALLBACK_UID = RUNNING_AS_UID or SUDO_UID +FALLBACK_GID = RUNNING_AS_GID or SUDO_GID +try: + ARCHIVEBOX_ACCOUNT = pwd.getpwnam("archivebox") +except KeyError: + ARCHIVEBOX_ACCOUNT = None + +if DATA_DIR_UID != 0: + ARCHIVEBOX_USER = DATA_DIR_UID + ARCHIVEBOX_GROUP = DATA_DIR_GID +elif RUNNING_AS_UID == 0 and ARCHIVEBOX_ACCOUNT is not None: + ARCHIVEBOX_USER = ARCHIVEBOX_ACCOUNT.pw_uid + ARCHIVEBOX_GROUP = ARCHIVEBOX_ACCOUNT.pw_gid +else: + ARCHIVEBOX_USER = EUID or RUNNING_AS_UID or FALLBACK_UID + ARCHIVEBOX_GROUP = EGID or RUNNING_AS_GID or FALLBACK_GID +if not USER: + try: + # alternative method 1 to get username + USER = pwd.getpwuid(ARCHIVEBOX_USER).pw_name + except Exception: + pass + +if not USER: + try: + # alternative method 2 to get username + import getpass + + USER = getpass.getuser() + except Exception: + pass + +if not USER: + try: + # alternative method 3 to get username + USER = os.getlogin() or "archivebox" + except Exception: + USER = "archivebox" + +ARCHIVEBOX_USER_EXISTS = False +try: + pwd.getpwuid(ARCHIVEBOX_USER) + ARCHIVEBOX_USER_EXISTS = True +except Exception: + ARCHIVEBOX_USER_EXISTS = False + + +############################################################################################# + + +def drop_privileges(): + """If running as root, drop privileges to the data dir owner or archivebox user.""" + + # Always run ArchiveBox as the user that owns the data dir, or as the + # archivebox service account when the data dir is root-owned. 
@contextmanager
def SudoPermission(uid=0, fallback=False):
    """Attempt to run code with sudo permissions for a given user (or root)

    Switches the effective uid to ``uid`` for the duration of the ``with``
    body, then switches it to ``ARCHIVEBOX_USER``. When the process already
    runs as ``uid`` nothing is changed. With ``fallback=True`` a failed switch
    is tolerated and the body runs with whatever euid is in effect; otherwise
    a PermissionError is raised.
    """
    already_target_user = os.geteuid() == uid
    if already_target_user:
        # no need to change effective UID, we are already that user
        yield
        return

    def _switch_euid(target_uid, failure_message):
        # a failed switch is only fatal when the caller did not allow a fallback
        try:
            os.seteuid(target_uid)
        except PermissionError as err:
            if not fallback:
                raise PermissionError(failure_message) from err

    _switch_euid(uid, f"Not enough permissions to run code as uid={uid}, please retry with sudo")
    try:
        # hand control to the caller's with-body
        yield
    finally:
        # afterwards the effective UID goes to the DATA_DIR owner
        _switch_euid(ARCHIVEBOX_USER, f"Failed to revert uid={uid} back to {ARCHIVEBOX_USER} after running code with sudo")
@cache
def detect_installed_version(PACKAGE_DIR: Path = PACKAGE_DIR):
    """Autodetect the installed archivebox version.

    Lookup order:
      1. the installed distribution's metadata (``importlib.metadata``),
      2. the first ``version = ...`` line of ``pyproject.toml`` next to the package,
      3. the literal ``"dev"``.

    Args:
        PACKAGE_DIR: archivebox source dir; ``pyproject.toml`` is looked up in its parent.

    Returns:
        The detected version string, or ``"dev"`` when nothing could be read.
    """
    try:
        # if in production install, use pip-installed package metadata
        return importlib.metadata.version("archivebox").strip()
    except importlib.metadata.PackageNotFoundError:
        pass

    try:
        # if in dev Git repo dir, use pyproject.toml file
        pyproject_text = (PACKAGE_DIR.parent / "pyproject.toml").read_text(encoding="utf-8")
    except OSError:
        # Missing (e.g. while building docs) or unreadable pyproject.toml: this
        # function runs at import time, so degrade to "dev" instead of crashing.
        return "dev"

    for line in pyproject_text.split("\n"):
        if line.startswith("version = "):
            return line.split(" = ", 1)[-1].strip('"').strip()

    return "dev"
@cache
def get_BUILD_TIME() -> str:
    """Return a human-readable build timestamp for this installation.

    Inside Docker the value after the last ``BUILD_END_TIME=`` in
    ``/VERSION.txt`` is returned as-is. Otherwise (or if that file cannot be
    read) the modification time of ``PACKAGE_DIR / "README.md"`` is formatted
    as ``YYYY-MM-DD HH:MM:SS <epoch-seconds>`` in local time.
    """
    if IN_DOCKER:
        try:
            # if we're in the archivebox official docker image, /VERSION.txt will contain the build time
            docker_build_end_time = Path("/VERSION.txt").read_text().rsplit("BUILD_END_TIME=")[-1].split("\n", 1)[0]
            return docker_build_end_time
        except Exception:
            pass

    src_last_modified_unix_timestamp = (PACKAGE_DIR / "README.md").stat().st_mtime
    src_last_modified = datetime.fromtimestamp(src_last_modified_unix_timestamp)
    # "%s" is a platform-specific strftime directive, so append the epoch
    # seconds ourselves to get the same output on every platform.
    return f"{src_last_modified:%Y-%m-%d %H:%M:%S} {int(src_last_modified_unix_timestamp)}"
when the user runs a long-running command +# subcommand_run_by_user = sys.argv[3] if len(sys.argv) > 3 else 'help' +# long_running_commands = ('add', 'schedule', 'update', 'status', 'server') +# if subcommand_run_by_user not in long_running_commands: +# return None + +# github_releases_api = "https://api.github.com/repos/ArchiveBox/ArchiveBox/releases" +# response = requests.get(github_releases_api) +# if response.status_code != 200: +# stderr(f'[!] Warning: GitHub API call to check for new ArchiveBox version failed! (status={response.status_code})', color='lightyellow', config=config) +# return None +# all_releases = response.json() + +# installed_version = parse_version_string(config['VERSION']) + +# # find current version or nearest older version (to link to) +# current_version = None +# for idx, release in enumerate(all_releases): +# release_version = parse_version_string(release['tag_name']) +# if release_version <= installed_version: +# current_version = release +# break + +# current_version = current_version or all_releases[-1] + +# # recommended version is whatever comes after current_version in the release list +# # (perhaps too conservative to only recommend upgrading one version at a time, but it's safest) +# try: +# recommended_version = all_releases[idx+1] +# except IndexError: +# recommended_version = None + +# return {'recommended_version': recommended_version, 'current_version': current_version} + +# def can_upgrade(config): +# if config['VERSIONS_AVAILABLE'] and config['VERSIONS_AVAILABLE']['recommended_version']: +# recommended_version = parse_version_string(config['VERSIONS_AVAILABLE']['recommended_version']['tag_name']) +# current_version = parse_version_string(config['VERSIONS_AVAILABLE']['current_version']['tag_name']) +# return recommended_version > current_version +# return False + + +VERSION: str = detect_installed_version() diff --git a/archivebox/config/views.py b/archivebox/config/views.py new file mode 100644 index 
def get_installed_binary_change_url(name: str, binary: Binary | None) -> str | None:
    """Build the admin change-page link for an installed Binary row.

    Returns None when there is no row or the row has no id. Otherwise returns
    ``binary.admin_change_url`` with a ``_changelist_filters`` query parameter
    whose value is the url-encoded search ``q=<name>``.
    """
    has_saved_record = binary is not None and bool(binary.id)
    if not has_saved_record:
        return None

    change_page = binary.admin_change_url
    # the filter string is itself url-encoded, then encoded again as a parameter value
    search_filter = urlencode({"q": name})
    query_string = urlencode({"_changelist_filters": search_filter})
    return f"{change_page}?{query_string}"
indent + if indent == 0: + indent_str = "\n" # put extra newline between top-level entries + + if isinstance(obj, dict): + if not obj: + return "{}" + result = "\n" + for key, value in obj.items(): + result += f"{indent_str}{key}:{obj_to_yaml(value, indent + 1)}\n" + return result + + elif isinstance(obj, list): + if not obj: + return "[]" + result = "\n" + for item in obj: + result += f"{indent_str}- {obj_to_yaml(item, indent + 1).lstrip()}\n" + return result.rstrip() + + elif isinstance(obj, str): + if "\n" in obj: + return f" |\n{indent_str} " + obj.replace("\n", f"\n{indent_str} ") + else: + return f" {obj}" + + elif isinstance(obj, (int, float, bool)): + return f" {str(obj)}" + + elif callable(obj): + source = ( + "\n".join("" if "def " in line else line for line in inspect.getsource(obj).split("\n") if line.strip()) + .split("lambda: ")[-1] + .rstrip(",") + ) + return f" {indent_str} " + source.replace("\n", f"\n{indent_str} ") + + else: + return f" {str(obj)}" + + +def _binary_sort_key(binary: Binary) -> tuple[int, int, int, Any]: + return ( + int(binary.status == Binary.StatusChoices.INSTALLED), + int(bool(binary.version)), + int(bool(binary.abspath)), + binary.modified_at, + ) + + +def get_db_binaries_by_name() -> dict[str, Binary]: + """Group Binary rows by a URL-safe canonical name. + + Hooks occasionally emit ``BinaryEvent.name`` carrying an abspath rather + than a short binary name (see ``services/binary_service.py``). That used + to leak ``name='/Users/.../bin/foo'`` rows into the DB, which then broke + ``/admin/environment/binaries`` because the admin URL regex is + ``(?P<key>[^/]+)``. Canonicalize at the keying step so duplicates fold + into the real binary and the admin link key stays slash-free regardless + of legacy DB state. 
+ """ + from archivebox.machine.models import _canonical_binary_name + + grouped: dict[str, list[Binary]] = {} + binary_name_aliases = { + "youtube-dl": "yt-dlp", + } + for binary in Binary.objects.all(): + canonical_name = _canonical_binary_name(binary.name) + canonical_name = binary_name_aliases.get(canonical_name, canonical_name) + if not canonical_name: + continue + grouped.setdefault(canonical_name, []).append(binary) + + return {name: max(records, key=_binary_sort_key) for name, records in grouped.items()} + + +@render_with_table_view +def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext: + assert is_superuser(request), "Must be a superuser to view configuration settings." + + rows = { + "Binary Name": [], + "Found Version": [], + "Provided By": [], + "Found Abspath": [], + } + + db_binaries = get_db_binaries_by_name() + all_binary_names = sorted(db_binaries.keys()) + + for name in all_binary_names: + binary = db_binaries.get(name) + binary_is_valid = bool(binary and binary.is_valid) + + rows["Binary Name"].append(ItemLink(name, key=name)) + + if binary_is_valid: + rows["Found Version"].append(f"✅ {binary.version}" if binary.version else "✅ found") + rows["Provided By"].append(binary.binprovider or "-") + rows["Found Abspath"].append(binary.abspath or "-") + else: + rows["Found Version"].append("❌ missing") + rows["Provided By"].append("-") + rows["Found Abspath"].append("-") + + return TableContext( + title="Binaries", + table=rows, + ) + + +@render_with_item_view +def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: + assert is_superuser(request), "Must be a superuser to view configuration settings."
+ + key = { + "youtube-dl": "yt-dlp", + }.get(key, key) + db_binary = get_db_binaries_by_name().get(key) + binary_is_valid = bool(db_binary and db_binary.is_valid) + if binary_is_valid: + binary_data = db_binary.to_json() + section: SectionData = { + "name": key, + "description": mark_safe(render_binary_detail_description(key, binary_data, db_binary)), + "fields": { + "name": key, + "binprovider": db_binary.binprovider or "-", + "abspath": db_binary.abspath or "not found", + "version": db_binary.version or "unknown", + "sha256": db_binary.sha256, + "status": db_binary.status, + }, + "help_texts": {}, + } + return ItemContext( + slug=key, + title=key, + data=[section], + ) + + section: SectionData = { + "name": key, + "description": "No persisted Binary record found", + "fields": { + "name": key, + "binprovider": db_binary.binprovider if db_binary else "not recorded", + "abspath": db_binary.abspath if db_binary else "not recorded", + "version": db_binary.version if db_binary else "N/A", + "status": db_binary.status if db_binary else "unrecorded", + }, + "help_texts": {}, + } + return ItemContext( + slug=key, + title=key, + data=[section], + ) + + +@render_with_table_view +def worker_list_view(request: HttpRequest, **kwargs) -> TableContext: + assert is_superuser(request), "Must be a superuser to view configuration settings." 
+ + rows = { + "Name": [], + "Type": [], + "State": [], + "PID": [], + "Started": [], + "Command": [], + "Logfile": [], + "Exit Status": [], + } + + from archivebox.workers.supervisord_util import get_existing_supervisord_process + + supervisor = get_existing_supervisord_process() + if supervisor is None: + return TableContext( + title="No running worker processes", + table=rows, + ) + + all_config: dict[str, dict[str, object]] = {} + config_items = supervisor.getAllConfigInfo() + if not isinstance(config_items, list): + config_items = [] + for config_data in config_items: + if not isinstance(config_data, dict): + continue + config_name = config_data.get("name") + if not isinstance(config_name, str): + continue + all_config[config_name] = config_data + + # Collect every PID we plan to show so we can resolve them to Process rows + # in a single query. supervisord's per-worker description carries the pid + # in the form ``pid 12345, uptime 0:01:23`` (or just the bare ``pid`` + # placeholder when stopped); we ignore non-numeric values. 
+ process_items = supervisor.getAllProcessInfo() + if not isinstance(process_items, list): + process_items = [] + + def _parse_worker_pid_and_uptime(description: str) -> tuple[int | None, str]: + body = description.replace("pid ", "", 1) + pid_part, _, uptime_part = body.partition(", uptime ") + try: + return int(pid_part.strip()), uptime_part.strip() + except ValueError: + return None, "" + + pids: set[int] = set() + supervisor_pid = supervisor.getPID() + if isinstance(supervisor_pid, int): + pids.add(supervisor_pid) + for proc_data in process_items: + if not isinstance(proc_data, dict): + continue + pid_int, _ = _parse_worker_pid_and_uptime(str(proc_data.get("description") or "")) + if pid_int is not None: + pids.add(pid_int) + + pid_to_process_id: dict[int, str] = {} + pid_to_process_type: dict[int, str] = {} + if pids: + try: + from archivebox.machine.models import Machine, Process + + for row in ( + Process.objects.filter(machine=Machine.current(), pid__in=pids) + .order_by("pid", "-started_at", "-created_at") + .only("id", "pid", "process_type") + ): + if row.pid in pid_to_process_id: + continue # keep the most recent row per PID + pid_to_process_id[row.pid] = str(row.id) + pid_to_process_type[row.pid] = row.process_type + + except Exception: + pass + + def _pid_cell(pid_value: int | None, uptime_str: str = ""): + if pid_value is None: + return "" + pid_text = str(pid_value) + process_id = pid_to_process_id.get(pid_value) + if process_id: + link = format_html('<a href="/admin/machine/process/{}/change/">{}</a>', process_id, pid_text) + else: + link = format_html("{}", pid_text) + if uptime_str: + return format_html("{}, uptime {}", link, uptime_str) + return link + + # Add top row for supervisord process manager. supervisord exposes its + # state + pid over XML-RPC but not its own start time / exit status / uptime, + # so we read those from the OS process (or fall back to the Process row + # recorded in _record_supervisord_process). 
Exit status stays blank while + # it's RUNNING — supervisord wouldn't be answering RPC if it had exited. + rows["Name"].append(ItemLink("supervisord", key="supervisord")) + rows["Type"].append("supervisord") + supervisor_state = supervisor.getState() + state_name = str(supervisor_state.get("statename") if isinstance(supervisor_state, dict) else "") + rows["State"].append(state_name) + + supervisor_started = "" + supervisor_uptime = "" + try: + import time as _time + + import psutil + + ps_proc = psutil.Process(supervisor_pid) + create_time = ps_proc.create_time() + supervisor_started = format_parsed_datetime(create_time) + seconds = max(int(_time.time() - create_time), 0) + hours, remainder = divmod(seconds, 3600) + minutes, secs = divmod(remainder, 60) + supervisor_uptime = f"{hours}:{minutes:02d}:{secs:02d}" + except Exception: + try: + from archivebox.machine.models import Machine, Process + + row = ( + Process.objects.filter( + machine=Machine.current(), + process_type=Process.TypeChoices.SUPERVISORD, + pid=supervisor_pid, + ) + .order_by("-started_at") + .first() + ) + if row and row.started_at: + supervisor_started = row.started_at.strftime("%Y-%m-%d %H:%M:%S") + seconds = max(int((timezone.now() - row.started_at).total_seconds()), 0) + hours, remainder = divmod(seconds, 3600) + minutes, secs = divmod(remainder, 60) + supervisor_uptime = f"{hours}:{minutes:02d}:{secs:02d}" + except Exception: + pass + + rows["PID"].append(_pid_cell(supervisor_pid if isinstance(supervisor_pid, int) else None, supervisor_uptime)) + rows["Started"].append(supervisor_started or "-") + + rows["Command"].append("supervisord --configuration=tmp/supervisord.conf") + rows["Logfile"].append( + format_html( + '<a href="/admin/environment/logs/{}/">{}</a>', + "supervisord", + "logs/supervisord.log", + ), + ) + rows["Exit Status"].append("" if state_name == "RUNNING" else "-") + + # Add a row for each worker process managed by supervisord + for proc_data in process_items: + if not
isinstance(proc_data, dict): + continue + proc_name = str(proc_data.get("name") or "") + proc_description = str(proc_data.get("description") or "") + proc_start = proc_data.get("start") + proc_logfile = str(proc_data.get("stdout_logfile") or "") + proc_config = all_config.get(proc_name, {}) + pid_int, uptime_str = _parse_worker_pid_and_uptime(proc_description) + + rows["Name"].append(ItemLink(proc_name, key=proc_name)) + # Prefer the Process row's process_type when we have one (e.g. "worker", + # "hook"); otherwise fall back to the generic "worker" label since + # everything in this loop is supervisord-managed. + rows["Type"].append(pid_to_process_type.get(pid_int, "worker") if pid_int else "worker") + rows["State"].append(str(proc_data.get("statename") or "")) + rows["PID"].append(_pid_cell(pid_int, uptime_str)) + rows["Started"].append(format_parsed_datetime(proc_start)) + rows["Command"].append(str(proc_config.get("command") or "")) + rows["Logfile"].append( + format_html( + '<a href="/admin/environment/logs/{}/">{}</a>', + proc_logfile.split("/")[-1].split(".")[0], + proc_logfile, + ), + ) + rows["Exit Status"].append(str(proc_data.get("exitstatus") or "")) + + return TableContext( + title="Running worker processes", + table=rows, + ) + + +@render_with_item_view +def worker_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: + assert is_superuser(request), "Must be a superuser to view configuration settings." 
+ + from archivebox.workers.supervisord_util import get_existing_supervisord_process, get_worker, get_sock_file, CONFIG_FILE_NAME + + SOCK_FILE = get_sock_file() + CONFIG_FILE = SOCK_FILE.parent / CONFIG_FILE_NAME + + supervisor = get_existing_supervisord_process() + if supervisor is None: + return ItemContext( + slug="none", + title="error: No running supervisord process.", + data=[], + ) + + all_config: list[dict[str, object]] = [] + config_items = supervisor.getAllConfigInfo() + if not isinstance(config_items, list): + config_items = [] + for config_data in config_items: + if isinstance(config_data, dict): + all_config.append(config_data) + + if key == "supervisord": + relevant_config = CONFIG_FILE.read_text() + relevant_logs = str(supervisor.readLog(0, 10_000_000)) + start_ts = [line for line in relevant_logs.split("\n") if "RPC interface 'supervisor' initialized" in line][-1].split(",", 1)[0] + start_dt = parse_date(start_ts) + uptime = str(timezone.now() - start_dt).split(".")[0] if start_dt else "" + supervisor_state = supervisor.getState() + + proc: dict[str, object] = { + "name": "supervisord", + "pid": supervisor.getPID(), + "statename": str(supervisor_state.get("statename") if isinstance(supervisor_state, dict) else ""), + "start": start_ts, + "stop": None, + "exitstatus": "", + "stdout_logfile": "logs/supervisord.log", + "description": f"pid 000, uptime {uptime}", + } + else: + worker_data = get_worker(supervisor, key) + proc = worker_data if isinstance(worker_data, dict) else {} + relevant_config = next((config for config in all_config if config.get("name") == key), {}) + log_result = supervisor.tailProcessStdoutLog(key, 0, 10_000_000) + relevant_logs = str(log_result[0] if isinstance(log_result, tuple) else log_result) + + section: SectionData = { + "name": key, + "description": key, + "fields": { + "Command": str(proc.get("name") or ""), + "PID": str(proc.get("pid") or ""), + "State": str(proc.get("statename") or ""), + "Started": 
format_parsed_datetime(proc.get("start")), + "Stopped": format_parsed_datetime(proc.get("stop")), + "Exit Status": str(proc.get("exitstatus") or ""), + "Logfile": str(proc.get("stdout_logfile") or ""), + "Uptime": str(str(proc.get("description") or "").split("uptime ", 1)[-1]), + "Config": obj_to_yaml(relevant_config) if isinstance(relevant_config, dict) else str(relevant_config), + "Logs": relevant_logs, + }, + "help_texts": {"Uptime": "How long the process has been running ([days:]hours:minutes:seconds)"}, + } + + return ItemContext( + slug=key, + title=key, + data=[section], + ) + + +@render_with_table_view +def log_list_view(request: HttpRequest, **kwargs) -> TableContext: + assert is_superuser(request), "Must be a superuser to view configuration settings." + + log_files: list[Path] = [] + for logfile in sorted(CONSTANTS.LOGS_DIR.glob("*.log"), key=os.path.getmtime)[::-1]: + if isinstance(logfile, Path): + log_files.append(logfile) + + rows = { + "Name": [], + "Last Updated": [], + "Size": [], + "Most Recent Lines": [], + } + + # Add a row for each *.log file found in LOGS_DIR (most recently modified first) + for logfile in log_files: + st = logfile.stat() + rows["Name"].append(ItemLink("logs" + str(logfile).rsplit("/logs", 1)[-1], key=logfile.name)) + rows["Last Updated"].append(format_parsed_datetime(st.st_mtime)) + rows["Size"].append(f"{st.st_size // 1000} kb") + + with open(logfile, "rb") as f: + try: + f.seek(-1024, os.SEEK_END) + except OSError: + f.seek(0) + last_lines = f.read().decode("utf-8", errors="replace").split("\n") + non_empty_lines = [line for line in last_lines if line.strip()] + rows["Most Recent Lines"].append(non_empty_lines[-1] if non_empty_lines else "") + + return TableContext( + title="Debug Log files", + table=rows, + ) + + +@render_with_item_view +def log_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: + assert is_superuser(request), "Must be a superuser to view configuration settings."
+ + log_file = [logfile for logfile in CONSTANTS.LOGS_DIR.glob("*.log") if key in logfile.name][0] + + log_text = log_file.read_text() + log_stat = log_file.stat() + + section: SectionData = { + "name": key, + "description": key, + "fields": { + "Path": str(log_file), + "Size": f"{log_stat.st_size // 1000} kb", + "Last Updated": format_parsed_datetime(log_stat.st_mtime), + "Tail": "\n".join(log_text[-10_000:].split("\n")[-20:]), + "Full Log": log_text, + }, + } + + return ItemContext( + slug=key, + title=key, + data=[section], + ) diff --git a/archivebox/core/__init__.py b/archivebox/core/__init__.py new file mode 100644 index 0000000000..013c1431ad --- /dev/null +++ b/archivebox/core/__init__.py @@ -0,0 +1,9 @@ +__package__ = "archivebox.core" +__order__ = 100 + + +def register_admin(admin_site): + """Register the core.models views (Snapshot, ArchiveResult, Tag, etc.) with the admin site""" + from archivebox.core.admin import register_admin as do_register + + do_register(admin_site) diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py new file mode 100644 index 0000000000..9c95418307 --- /dev/null +++ b/archivebox/core/admin.py @@ -0,0 +1,17 @@ +__package__ = "archivebox.core" + +from django.contrib.auth import get_user_model + + +from archivebox.core.models import Snapshot, ArchiveResult, Tag +from archivebox.core.admin_tags import TagAdmin +from archivebox.core.admin_snapshots import SnapshotAdmin +from archivebox.core.admin_archiveresults import ArchiveResultAdmin +from archivebox.core.admin_users import CustomUserAdmin + + +def register_admin(admin_site): + admin_site.register(get_user_model(), CustomUserAdmin) + admin_site.register(ArchiveResult, ArchiveResultAdmin) + admin_site.register(Snapshot, SnapshotAdmin) + admin_site.register(Tag, TagAdmin) diff --git a/archivebox/core/admin_archiveresults.py b/archivebox/core/admin_archiveresults.py new file mode 100644 index 0000000000..08e4e5fbdf --- /dev/null +++ 
b/archivebox/core/admin_archiveresults.py @@ -0,0 +1,832 @@ +__package__ = "archivebox.core" + +import html +import json +import os +import shlex +from pathlib import Path +from urllib.parse import quote +from functools import reduce +from operator import and_ + +from django.contrib import admin +from django.db.models import Min, Prefetch, Q, TextField +from django.db.models.functions import Cast +from django.utils.html import format_html +from django.utils.safestring import mark_safe +from django.core.exceptions import ValidationError +from django.urls import reverse, resolve +from django.utils import timezone +from django.utils.text import smart_split + +from archivebox.misc.paginators import AcceleratedPaginator +from archivebox.base_models.admin import BaseModelAdmin +from archivebox.plugins.discovery import get_plugin_icon +from archivebox.plugins.views import LIVE_PLUGIN_BASE_URL +from archivebox.core.routes_util import build_snapshot_url +from archivebox.core.widgets import InlineTagEditorWidget +from archivebox.machine.env_util import env_to_shell_exports + + +from archivebox.core.models import ArchiveResult, Snapshot + + +def _get_replay_source_url(result: ArchiveResult) -> str: + process = result.process_record + return str((process.url if process else None) or result.snapshot.url or "") + + +def build_abx_dl_display_command(result: ArchiveResult) -> str: + source_url = _get_replay_source_url(result) + plugin_name = str(result.plugin or "").strip() + cmd = ["abx-dl"] + if plugin_name: + cmd.append(f"--plugins={plugin_name}") + if source_url: + cmd.append(source_url) + return shlex.join(cmd) + + +def build_abx_dl_replay_command(result: ArchiveResult, config=None) -> str: + display_command = build_abx_dl_display_command(result) + process = result.process + env_items = env_to_shell_exports(process.env if process else {}) + if config is not None: + result.snapshot._runtime_config = config + snapshot_dir = shlex.quote(str(result.pwd or result.snapshot_dir)) + 
if env_items: + return f"cd {snapshot_dir}; env {env_items} {display_command}" + return f"cd {snapshot_dir}; {display_command}" + + +def get_plugin_admin_url(plugin_name: str) -> str: + from archivebox.plugins.discovery import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, iter_plugin_dirs + + plugin_dir = next((path.resolve() for path in iter_plugin_dirs() if path.name == plugin_name), None) + if plugin_dir: + builtin_root = BUILTIN_PLUGINS_DIR.resolve() + if plugin_dir.is_relative_to(builtin_root): + return f"{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/" + + user_root = USER_PLUGINS_DIR.resolve() + if plugin_dir.is_relative_to(user_root): + return f"{LIVE_PLUGIN_BASE_URL}user.{quote(plugin_name)}/" + + return f"{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/" + + +def get_process_link_label(process) -> str: + if process.pid: + return str(process.pid) + return str(process.id)[-8:] + + +def render_archiveresults_list(archiveresults_qs, limit=50, config=None): + """Render a nice inline list view of archive results with status, plugin, output, and actions.""" + + result_ids = list(archiveresults_qs.order_by("plugin").values_list("pk", flat=True)[:limit]) + if not result_ids: + return mark_safe('<div style="color: #64748b; font-style: italic; padding: 16px 0;">No Archive Results yet...</div>') + + results_by_id = { + result.pk: result + for result in ArchiveResult.objects.filter(pk__in=result_ids).select_related( + "snapshot", + "snapshot__crawl", + "snapshot__crawl__created_by", + "process", + "process__machine", + ) + } + results = [results_by_id[result_id] for result_id in result_ids if result_id in results_by_id] + + if not results: + return mark_safe('<div style="color: #64748b; font-style: italic; padding: 16px 0;">No Archive Results yet...</div>') + + # Status colors + status_colors = { + "succeeded": ("#166534", "#dcfce7"), # green + "failed": ("#991b1b", "#fee2e2"), # red + "queued": ("#6b7280", "#f3f4f6"), # gray + "started": ("#92400e", "#fef3c7"), # 
amber + "paused": ("#1d4ed8", "#dbeafe"), # blue + "backoff": ("#92400e", "#fef3c7"), + "skipped": ("#475569", "#f1f5f9"), + "noresults": ("#475569", "#f1f5f9"), + } + + rows = [] + for idx, result in enumerate(results): + status = result.status or "queued" + color, bg = status_colors.get(status, ("#6b7280", "#f3f4f6")) + output_files = result.output_files or {} + if isinstance(output_files, dict): + output_file_count = len(output_files) + elif isinstance(output_files, (list, tuple, set)): + output_file_count = len(output_files) + elif isinstance(output_files, str): + try: + parsed = json.loads(output_files) + output_file_count = len(parsed) if isinstance(parsed, (dict, list, tuple, set)) else 0 + except Exception: + output_file_count = 0 + else: + output_file_count = 0 + + # Get plugin icon + icon = get_plugin_icon(result.plugin) + + # Format timestamp + end_time = result.end_ts.strftime("%Y-%m-%d %H:%M:%S") if result.end_ts else "-" + + process = result.process_record + process_display = "-" + if process: + process_display = f''' + <a href="{reverse("admin:machine_process_change", args=[process.id])}" + style="color: #2563eb; text-decoration: none; font-family: ui-monospace, monospace; font-size: 12px;" + title="View process">{get_process_link_label(process)}</a> + ''' + + machine_display = "-" + if process and process.machine_id: + machine_display = f''' + <a href="{reverse("admin:machine_machine_change", args=[process.machine_id])}" + style="color: #2563eb; text-decoration: none; font-size: 12px;" + title="View machine">{process.machine.hostname}</a> + ''' + + # Truncate output for display + full_output = result.output_str_for_display() or "-" + output_display = full_output[:60] + if len(full_output) > 60: + output_display += "..." 
+ + display_cmd = build_abx_dl_display_command(result) + replay_cmd = build_abx_dl_replay_command(result, config=config) + cmd_str_escaped = html.escape(display_cmd) + cmd_attr = html.escape(replay_cmd, quote=True) + + # Build output link - use embed_path() which checks output_files first + embed_path = result.embed_path() + snapshot_id = str(result.snapshot_id) + if embed_path and result.status == "succeeded": + output_link = build_snapshot_url(snapshot_id, embed_path, config=config) + else: + output_link = build_snapshot_url(snapshot_id, "", config=config) + + # Get version - try cmd_version field + version = result.cmd_version if result.cmd_version else "-" + + # Unique ID for this row's expandable output + row_id = f"output_{idx}_{str(result.id)[:8]}" + + rows.append(f''' + <tr style="border-bottom: 1px solid #f1f5f9; transition: background 0.15s;" onmouseover="this.style.background='#f8fafc'" onmouseout="this.style.background='transparent'"> + <td style="padding: 10px 12px; white-space: nowrap;"> + <a href="{reverse("admin:core_archiveresult_change", args=[result.id])}" + style="color: #2563eb; text-decoration: none; font-family: ui-monospace, monospace; font-size: 11px;" + title="View/edit archive result"> + <code>{str(result.id)[-8:]}</code> + </a> + </td> + <td style="padding: 10px 12px; white-space: nowrap;"> + <span style="display: inline-block; padding: 3px 10px; border-radius: 12px; + font-size: 11px; font-weight: 600; text-transform: uppercase; + color: {color}; background: {bg};">{status}</span> + </td> + <td style="padding: 10px 12px; white-space: nowrap; font-size: 20px;" title="{result.plugin}"> + {icon} + </td> + <td style="padding: 10px 12px; font-weight: 500; color: #334155;"> + <a href="{output_link}" target="_blank" + style="color: #334155; text-decoration: none;" + title="View output fullscreen" + onmouseover="this.style.color='#2563eb'; this.style.textDecoration='underline';" + onmouseout="this.style.color='#334155'; 
this.style.textDecoration='none';"> + {result.plugin} + </a> + </td> + <td style="padding: 10px 12px; max-width: 280px;"> + <span onclick="document.getElementById('{row_id}').open = !document.getElementById('{row_id}').open" + style="color: #2563eb; text-decoration: none; font-family: ui-monospace, monospace; font-size: 12px; cursor: pointer;" + title="Click to expand full output"> + {output_display} + </span> + </td> + <td style="padding: 10px 12px; white-space: nowrap; color: #64748b; font-size: 12px; text-align: right;"> + {output_file_count} + </td> + <td style="padding: 10px 12px; white-space: nowrap; color: #64748b; font-size: 12px;"> + {end_time} + </td> + <td style="padding: 10px 12px; white-space: nowrap;"> + {process_display} + </td> + <td style="padding: 10px 12px; white-space: nowrap;"> + {machine_display} + </td> + <td style="padding: 10px 12px; white-space: nowrap; font-family: ui-monospace, monospace; font-size: 11px; color: #64748b;"> + {version} + </td> + <td style="padding: 10px 8px; white-space: nowrap;"> + <div style="display: flex; gap: 4px;"> + <a href="{output_link}" target="_blank" + style="padding: 4px 8px; background: #f1f5f9; border-radius: 4px; color: #475569; text-decoration: none; font-size: 11px;" + title="View output">📄</a> + <a href="{reverse("admin:core_archiveresult_change", args=[result.id])}" + style="padding: 4px 8px; background: #f1f5f9; border-radius: 4px; color: #475569; text-decoration: none; font-size: 11px;" + title="Edit">✏️</a> + </div> + </td> + </tr> + <tr style="border-bottom: 1px solid #e2e8f0;"> + <td colspan="11" style="padding: 0 12px 10px 12px;"> + <details id="{row_id}" style="margin: 0;"> + <summary style="cursor: pointer; font-size: 11px; color: #94a3b8; user-select: none;"> + Details & Output + </summary> + <div style="margin-top: 8px; padding: 10px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 6px; max-height: 200px; overflow: auto;"> + <div style="font-size: 11px; color: #64748b;
margin-bottom: 8px;"> + <span style="margin-right: 16px;"><b>ID:</b> <code>{str(result.id)}</code></span> + <span style="margin-right: 16px;"><b>Version:</b> <code>{version}</code></span> + <span style="margin-right: 16px;"><b>PWD:</b> <code>{result.pwd or "-"}</code></span> + </div> + <div style="font-size: 11px; color: #64748b; margin-bottom: 8px;"> + <b>Output:</b> + </div> + <pre style="margin: 0; padding: 8px; background: #1e293b; border-radius: 4px; color: #e2e8f0; font-size: 12px; white-space: pre-wrap; word-break: break-all; max-height: 120px; overflow: auto;">{full_output}</pre> + <div style="font-size: 11px; color: #64748b; margin-top: 8px;"> + <b>Command:</b> + </div> + <div style="position: relative; margin: 0; padding: 8px 56px 8px 8px; background: #1e293b; border-radius: 4px;"> + <button type="button" + data-command="{cmd_attr}" + onclick="(function(btn){{var text=btn.dataset.command||''; if(navigator.clipboard&&navigator.clipboard.writeText){{navigator.clipboard.writeText(text);}} else {{var ta=document.createElement('textarea'); ta.value=text; document.body.appendChild(ta); ta.select(); document.execCommand('copy'); document.body.removeChild(ta);}}}})(this); return false;" + style="position: absolute; top: 6px; right: 6px; padding: 2px 8px; border: 0; border-radius: 4px; background: #334155; color: #e2e8f0; font-size: 11px; cursor: pointer;"> + Copy + </button> + <code title="{cmd_attr}" style="display: block; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; color: #e2e8f0; font-size: 11px;">{cmd_str_escaped}</code> + </div> + </div> + </details> + </td> + </tr> + ''') + + total_count = archiveresults_qs.count() + footer = "" + if total_count > limit: + footer = f""" + <tr> + <td colspan="11" style="padding: 12px; text-align: center; color: #64748b; font-size: 13px; background: #f8fafc;"> + Showing {limit} of {total_count} results   + <a href="/admin/core/archiveresult/?snapshot__id__exact={results[0].snapshot_id if results else ""}" 
+ style="color: #2563eb;">View all →</a> + </td> + </tr> + """ + + return mark_safe(f""" + <div style="border: 1px solid #e2e8f0; border-radius: 8px; overflow: hidden; background: #fff; width: 100%;"> + <table style="width: 100%; border-collapse: collapse; font-size: 14px;"> + <thead> + <tr style="background: #f8fafc; border-bottom: 2px solid #e2e8f0;"> + <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Details</th> + <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Status</th> + <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; width: 32px;"></th> + <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Plugin</th> + <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Output</th> + <th style="padding: 10px 12px; text-align: right; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Files</th> + <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Completed</th> + <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Process</th> + <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Machine</th> + <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Version</th> + <th
style="padding: 10px 8px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Actions</th> + </tr> + </thead> + <tbody> + {"".join(rows)} + {footer} + </tbody> + </table> + </div> + """) + + +class ArchiveResultInline(admin.TabularInline): + name = "Archive Results Log" + model = ArchiveResult + parent_model = Snapshot + # fk_name = 'snapshot' + extra = 0 + sort_fields = ("end_ts", "plugin", "output_str", "status", "cmd_version") + readonly_fields = ("id", "result_id", "completed", "command", "version") + fields = ("start_ts", "end_ts", *readonly_fields, "plugin", "cmd", "cmd_version", "pwd", "status", "output_str") + # exclude = ('id',) + ordering = ("end_ts",) + show_change_link = True + # # classes = ['collapse'] + + def get_parent_object_from_request(self, request): + resolved = resolve(request.path_info) + try: + return self.parent_model.objects.get(pk=resolved.kwargs["object_id"]) + except (self.parent_model.DoesNotExist, ValidationError): + return None + + @admin.display( + description="Completed", + ordering="end_ts", + ) + def completed(self, obj): + return format_html('<p style="white-space: nowrap">{}</p>', obj.end_ts.strftime("%Y-%m-%d %H:%M:%S")) + + def result_id(self, obj): + return format_html( + '<a href="{}"><code style="font-size: 10px">[{}]</code></a>', + reverse("admin:core_archiveresult_change", args=(obj.id,)), + str(obj.id)[:8], + ) + + def command(self, obj): + return format_html("<small><code>{}</code></small>", " ".join(obj.cmd or [])) + + def version(self, obj): + return format_html("<small><code>{}</code></small>", obj.cmd_version or "-") + + def get_formset(self, request, obj=None, **kwargs): + formset = super().get_formset(request, obj, **kwargs) + snapshot = self.get_parent_object_from_request(request) + base_fields = formset.form.base_fields + snapshot_output_dir = str(snapshot.output_dir) if snapshot else "" + + # import ipdb; ipdb.set_trace() + # 
formset.form.base_fields['id'].widget = formset.form.base_fields['id'].hidden_widget() + + # default values for new entries + base_fields["status"].initial = "succeeded" + base_fields["start_ts"].initial = timezone.now() + base_fields["end_ts"].initial = timezone.now() + base_fields["cmd_version"].initial = "-" + base_fields["pwd"].initial = snapshot_output_dir + base_fields["cmd"].initial = '["-"]' + base_fields["output_str"].initial = "Manually recorded cmd output..." + + if obj is not None: + # hidden values for existing entries and new entries + base_fields["start_ts"].widget = base_fields["start_ts"].hidden_widget() + base_fields["end_ts"].widget = base_fields["end_ts"].hidden_widget() + base_fields["cmd"].widget = base_fields["cmd"].hidden_widget() + base_fields["pwd"].widget = base_fields["pwd"].hidden_widget() + base_fields["cmd_version"].widget = base_fields["cmd_version"].hidden_widget() + return formset + + def get_readonly_fields(self, request, obj=None): + if obj is not None: + return self.readonly_fields + else: + return [] + + +class ArchiveResultAdmin(BaseModelAdmin): + list_select_related = () + list_display = ( + "details_link", + "zip_link", + "created_at", + "snapshot_info", + "tags_inline", + "status_badge", + "plugin_with_icon", + "process_link", + "machine_link", + "cmd_str", + "output_str_display", + ) + list_display_links = None + sort_fields = ("id", "created_at", "plugin", "status") + readonly_fields = ( + "admin_actions", + "cmd", + "cmd_version", + "pwd", + "cmd_str", + "snapshot_info", + "tags_str", + "created_at", + "modified_at", + "output_summary", + "plugin_with_icon", + "process_link", + ) + search_fields = ( + "snapshot__id", + "snapshot__url", + "snapshot__tags__name", + "snapshot__crawl_id", + "plugin", + "hook_name", + "output_str", + "output_json", + "process__cmd", + ) + autocomplete_fields = ["snapshot"] + + fieldsets = ( + ( + "Actions", + { + "fields": ("admin_actions",), + "classes": ("card", "wide"), + }, + ), + ( + 
"Snapshot", + { + "fields": ("snapshot", "snapshot_info", "tags_str"), + "classes": ("card", "wide"), + }, + ), + ( + "Plugin", + { + "fields": ("plugin_with_icon", "process_link", "status"), + "classes": ("card",), + }, + ), + ( + "Timing", + { + "fields": ("start_ts", "end_ts", "created_at", "modified_at"), + "classes": ("card",), + }, + ), + ( + "Command", + { + "fields": ("cmd", "cmd_str", "cmd_version", "pwd"), + "classes": ("card",), + }, + ), + ( + "Output", + { + "fields": ("output_str", "output_json", "output_files", "output_size", "output_mimetypes", "output_summary"), + "classes": ("card", "wide"), + }, + ), + ) + + list_filter = ("status", "plugin", "start_ts") + ordering = ["-start_ts"] + list_per_page = 50 + + paginator = AcceleratedPaginator + save_on_top = True + show_full_result_count = False + + actions = ["delete_selected"] + + class Meta: + verbose_name = "Archive Result" + verbose_name_plural = "Archive Results" + + def change_view(self, request, object_id, form_url="", extra_context=None): + self.request = request + return super().change_view(request, object_id, form_url, extra_context) + + def changelist_view(self, request, extra_context=None): + self.request = request + saved_list_per_page = self.list_per_page + self.list_per_page = request.archivebox_config.SNAPSHOTS_PER_PAGE + try: + return super().changelist_view(request, extra_context) + finally: + self.list_per_page = saved_list_per_page + + def get_queryset(self, request): + ordering = request.GET.get("o") + ordering_fields = set() + if ordering: + for part in ordering.split("."): + if not part: + continue + try: + idx = abs(int(part)) - 1 + except ValueError: + continue + if 0 <= idx < len(self.list_display): + ordering_fields.add(self.list_display[idx]) + + qs = ( + super() + .get_queryset(request) + .defer( + "notes", + "output_json", + ) + .prefetch_related( + Prefetch( + "snapshot", + queryset=Snapshot.objects.defer("config", "notes").prefetch_related("crawl__created_by", "tags"), 
+ ), + "process__machine", + ) + ) + if "tags_inline" in ordering_fields: + qs = qs.annotate(snapshot_first_tag=Min("snapshot__tags__name")) + return qs + + def get_search_results(self, request, queryset, search_term): + if not search_term: + return queryset, False + + queryset = queryset.annotate( + snapshot_id_text=Cast("snapshot__id", output_field=TextField()), + snapshot_crawl_id_text=Cast("snapshot__crawl_id", output_field=TextField()), + output_json_text=Cast("output_json", output_field=TextField()), + cmd_text=Cast("process__cmd", output_field=TextField()), + ) + + search_bits = [ + bit[1:-1] if len(bit) >= 2 and bit[0] == bit[-1] and bit[0] in {'"', "'"} else bit for bit in smart_split(search_term) + ] + search_bits = [bit.strip() for bit in search_bits if bit.strip()] + if not search_bits: + return queryset, False + + filters = [] + for bit in search_bits: + filters.append( + Q(snapshot_id_text__icontains=bit) + | Q(snapshot__url__icontains=bit) + | Q(snapshot__tags__name__icontains=bit) + | Q(snapshot_crawl_id_text__icontains=bit) + | Q(plugin__icontains=bit) + | Q(hook_name__icontains=bit) + | Q(output_str__icontains=bit) + | Q(output_json_text__icontains=bit) + | Q(cmd_text__icontains=bit), + ) + + return queryset.filter(reduce(and_, filters)).distinct(), True + + def get_snapshot_view_url(self, result: ArchiveResult) -> str: + request = self.request + return build_snapshot_url(str(result.snapshot_id), request=request, config=request.archivebox_config) + + def get_output_view_url(self, result: ArchiveResult) -> str: + request = self.request + config = request.archivebox_config + output_path = result.embed_path() + if not output_path: + output_path = result.plugin or "" + return build_snapshot_url(str(result.snapshot_id), output_path, request=request, config=config) + + def get_output_files_url(self, result: ArchiveResult) -> str: + request = self.request + return f"{build_snapshot_url(str(result.snapshot_id), result.plugin, request=request, 
config=request.archivebox_config)}/?files=1" + + def get_output_zip_url(self, result: ArchiveResult) -> str: + return f"{self.get_output_files_url(result)}&download=zip" + + @admin.display(description="Details", ordering="id") + def details_link(self, result): + return format_html( + '<a href="{}"><code>{}</code></a>', + reverse("admin:core_archiveresult_change", args=[result.id]), + str(result.id)[-8:], + ) + + @admin.display(description="Zip") + def zip_link(self, result): + return format_html( + '<a href="{}" class="archivebox-zip-button" data-loading-mode="spinner-only" onclick="return window.archiveboxHandleZipClick(this, event);" style="display:inline-flex; align-items:center; justify-content:center; gap:4px; width:48px; min-width:48px; height:24px; padding:0; box-sizing:border-box; border-radius:999px; border:1px solid #bfdbfe; background:#eff6ff; color:#1d4ed8; font-size:11px; font-weight:600; line-height:1; text-decoration:none;"><span class="archivebox-zip-spinner" aria-hidden="true"></span><span class="archivebox-zip-label">โฌ‡ ZIP</span></a>', + self.get_output_zip_url(result), + ) + + @admin.display( + description="Snapshot", + ordering="snapshot__url", + ) + def snapshot_info(self, result): + snapshot_id = str(result.snapshot_id) + request = self.request + return format_html( + '<a href="{}"><b><code>[{}]</code></b>   {}   {}</a><br/>', + build_snapshot_url(snapshot_id, "index.html", request=request, config=request.archivebox_config), + snapshot_id[:8], + result.snapshot.bookmarked_at.strftime("%Y-%m-%d %H:%M"), + result.snapshot.url[:128], + ) + + @admin.display( + description="Snapshot Tags", + ) + def tags_str(self, result): + return result.snapshot.tags_str() + + @admin.display(description="Tags", ordering="snapshot_first_tag") + def tags_inline(self, result): + widget = InlineTagEditorWidget(snapshot_id=str(result.snapshot_id), editable=False) + tags_html = widget.render( + name=f"tags_{result.snapshot_id}", + value=result.snapshot.tags.all(), + 
attrs={"id": f"tags_{result.snapshot_id}"}, + snapshot_id=str(result.snapshot_id), + ) + return mark_safe(f'<span class="tags-inline-editor">{tags_html}</span>') + + @admin.display(description="Status", ordering="status") + def status_badge(self, result): + status = result.status or ArchiveResult.StatusChoices.QUEUED + return format_html( + '<span class="status-badge {} status-{}">{}</span>', + status, + status, + result.get_status_display() or status, + ) + + @admin.display(description="Plugin", ordering="plugin") + def plugin_with_icon(self, result): + icon = get_plugin_icon(result.plugin) + return format_html( + '<a href="{}" title="{}">{}</a> <a href="{}"><code>{}</code></a>', + get_plugin_admin_url(result.plugin), + result.plugin, + icon, + get_plugin_admin_url(result.plugin), + result.plugin, + ) + + @admin.display(description="Process", ordering="process__pid") + def process_link(self, result): + process = result.process_record + if not process: + return "-" + process_label = get_process_link_label(process) + return format_html( + '<a href="{}"><code>{}</code></a>', + reverse("admin:machine_process_change", args=[process.id]), + process_label, + ) + + @admin.display(description="Machine", ordering="process__machine__hostname") + def machine_link(self, result): + process = result.process_record + if not process or not process.machine_id: + return "-" + machine = process.machine + return format_html( + '<a href="{}">{}</a>', + reverse("admin:machine_machine_change", args=[machine.id]), + machine.hostname, + ) + + @admin.display(description="Command") + def cmd_str(self, result): + request = self.request + display_cmd = build_abx_dl_display_command(result) + replay_cmd = build_abx_dl_replay_command(result, config=request.archivebox_config) + return format_html( + """ + <div style="position: relative; width: 100%; max-width: 100%; overflow: hidden; box-sizing: border-box;"> + <button type="button" + data-command="{}" + onclick="(function(btn){{var 
text=btn.dataset.command||''; if(navigator.clipboard&&navigator.clipboard.writeText){{navigator.clipboard.writeText(text);}} else {{var ta=document.createElement('textarea'); ta.value=text; document.body.appendChild(ta); ta.select(); document.execCommand('copy'); document.body.removeChild(ta);}}}})(this); return false;" + style="position: absolute; top: 6px; right: 6px; z-index: 1; padding: 2px 8px; border: 0; border-radius: 4px; background: #e2e8f0; color: #334155; font-size: 11px; cursor: pointer;"> + Copy + </button> + <code title="{}" style="display: block; width: 100%; max-width: 100%; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; padding: 8px 56px 8px 8px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 6px; font-size: 11px; box-sizing: border-box;"> + {} + </code> + </div> + """, + replay_cmd, + replay_cmd, + display_cmd, + ) + + def output_display(self, result): + request = self.request + config = request.archivebox_config + # Determine output link path - use embed_path() which checks output_files + embed_path = result.embed_path() + output_path = embed_path if (result.status == "succeeded" and embed_path) else "index.html" + snapshot_id = str(result.snapshot_id) + return format_html( + '<a href="{}" class="output-link">โ†—๏ธ</a><pre>{}</pre>', + build_snapshot_url(snapshot_id, output_path, request=request, config=config), + result.output_str_for_display(), + ) + + @admin.display(description="Output", ordering="output_str") + def output_str_display(self, result): + output_text = str(result.output_str_for_display() or "").strip() + if not output_text: + return "-" + + request = self.request + live_path = result.embed_path() + if live_path: + return format_html( + '<a href="{}" title="{}"><code>{}</code></a>', + build_snapshot_url(str(result.snapshot_id), live_path, request=request, config=request.archivebox_config), + output_text, + output_text, + ) + + return format_html( + '<span title="{}">{}</span>', + output_text, + 
output_text, + ) + + @admin.display(description="") + def admin_actions(self, result): + return format_html( + """ + <div style="display:flex; flex-wrap:wrap; gap:12px; align-items:center;"> + <a class="btn" style="display:inline-flex; align-items:center; gap:6px; padding:10px 16px; background:#f8fafc; border:1px solid #e2e8f0; border-radius:8px; color:#334155; text-decoration:none; font-size:14px; font-weight:500; transition:all 0.15s;" + href="{}" + onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';" + onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';"> + ๐Ÿ“„ View Output + </a> + <a class="btn" style="display:inline-flex; align-items:center; gap:6px; padding:10px 16px; background:#f8fafc; border:1px solid #e2e8f0; border-radius:8px; color:#334155; text-decoration:none; font-size:14px; font-weight:500; transition:all 0.15s;" + href="{}" + onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';" + onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';"> + ๐Ÿ“ Output files + </a> + <a class="btn archivebox-zip-button" style="display:inline-flex; align-items:center; gap:6px; padding:10px 16px; background:#eff6ff; border:1px solid #bfdbfe; border-radius:8px; color:#1d4ed8; text-decoration:none; font-size:14px; font-weight:500; transition:all 0.15s;" + href="{}" + data-loading-label="Preparing..." 
+ onclick="return window.archiveboxHandleZipClick(this, event);" + onmouseover="this.style.background='#dbeafe'; this.style.borderColor='#93c5fd';" + onmouseout="this.style.background='#eff6ff'; this.style.borderColor='#bfdbfe';"> + <span class="archivebox-zip-spinner" aria-hidden="true"></span> + <span class="archivebox-zip-label">โฌ‡ Download Zip</span> + </a> + <a class="btn" style="display:inline-flex; align-items:center; gap:6px; padding:10px 16px; background:#f8fafc; border:1px solid #e2e8f0; border-radius:8px; color:#334155; text-decoration:none; font-size:14px; font-weight:500; transition:all 0.15s;" + href="{}" + onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';" + onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';"> + ๐Ÿ—‚ Snapshot + </a> + </div> + """, + self.get_output_view_url(result), + self.get_output_files_url(result), + self.get_output_zip_url(result), + self.get_snapshot_view_url(result), + ) + + def output_summary(self, result): + snapshot_dir = Path(result.snapshot.output_dir) + output_html = format_html( + '<pre style="display: inline-block">{}</pre><br/>', + result.output_str_for_display(), + ) + snapshot_id = str(result.snapshot_id) + request = self.request + output_html += format_html( + '<a href="{}#all">See result files ...</a><br/><pre><code>', + build_snapshot_url(snapshot_id, "index.html", request=request, config=request.archivebox_config), + ) + embed_path = result.embed_path() or "" + path_from_embed = snapshot_dir / (embed_path or "") + output_html += format_html( + '<i style="padding: 1px">{}</i><b style="padding-right: 20px">/</b><i>{}</i><br/><hr/>', + str(snapshot_dir), + str(embed_path), + ) + if os.access(path_from_embed, os.R_OK): + root_dir = str(path_from_embed) + else: + root_dir = str(snapshot_dir) + + # print(root_dir, str(list(os.walk(root_dir)))) + + for root, dirs, files in os.walk(root_dir): + depth = root.replace(root_dir, "").count(os.sep) + 1 + if depth > 
2: + continue + indent = " " * 4 * (depth) + output_html += format_html('<b style="padding: 1px">{}{}/</b><br/>', indent, os.path.basename(root)) + indentation_str = " " * 4 * (depth + 1) + for filename in sorted(files): + is_hidden = filename.startswith(".") + output_html += format_html( + '<span style="opacity: {}.2">{}{}</span><br/>', + int(not is_hidden), + indentation_str, + filename.strip(), + ) + + return output_html + mark_safe("</code></pre>") + + +def register_admin(admin_site): + admin_site.register(ArchiveResult, ArchiveResultAdmin) diff --git a/archivebox/core/admin_site.py b/archivebox/core/admin_site.py new file mode 100644 index 0000000000..ec34d25123 --- /dev/null +++ b/archivebox/core/admin_site.py @@ -0,0 +1,159 @@ +__package__ = "archivebox.core" + +from typing import TYPE_CHECKING, Any + +from django.contrib import admin +from django.db import DatabaseError, connection +from admin_data_views.admin import ( + admin_data_index_view as adv_admin_data_index_view, + get_admin_data_urls as adv_get_admin_data_urls, + get_app_list as adv_get_app_list, +) + +from archivebox.config import VERSION +from archivebox.config.version import get_COMMIT_HASH + +if TYPE_CHECKING: + from django.http import HttpRequest + from django.template.response import TemplateResponse + from django.urls import URLPattern, URLResolver + + from admin_data_views.typing import AppDict + + +class ArchiveBoxAdmin(admin.AdminSite): + site_header = "ArchiveBox" + index_title = "Admin Views" + site_title = "Admin" + namespace = "admin" + + def each_context(self, request: "HttpRequest") -> dict[str, Any]: + context = super().each_context(request) + context["VERSION"] = VERSION + context["STATIC_CACHE_KEY"] = (get_COMMIT_HASH() or VERSION or "dev").strip() + return context + + @staticmethod + def _format_object_count(count: int) -> tuple[int, str, str]: + if count >= 1_000_000_000: + count_label = f"{count / 1_000_000_000:.1f}B" + elif count >= 1_000_000: + count_label = f"{count / 
1_000_000:.1f}M" + elif count >= 1_000: + count_label = f"{count / 1_000:.1f}K" + else: + count_label = f"{count:,}" + count_label = count_label.replace(".0", "") + return count, count_label, f"Object count: {count:,}" + + def _set_model_object_count( + self, + models_by_table: dict[str, list[dict[str, Any]]], + table: str, + count: int, + title: str | None = None, + ) -> None: + models = models_by_table.get(table) + if not models: + return + count, count_label, count_title = self._format_object_count(count) + if title: + count_title = title + for model in models: + model["object_count"] = count + model["object_count_label"] = count_label + model["object_count_title"] = count_title + + def get_app_list(self, request: "HttpRequest", app_label: str | None = None) -> list["AppDict"]: + if app_label is None: + return adv_get_app_list(self, request) + return adv_get_app_list(self, request, app_label) + + def admin_data_index_view(self, request: "HttpRequest", **kwargs: Any) -> "TemplateResponse": + return adv_admin_data_index_view(self, request, **kwargs) + + def index(self, request: "HttpRequest", extra_context: dict[str, Any] | None = None) -> "TemplateResponse": + response = super().index(request, extra_context) + if connection.vendor != "sqlite": + return response + + models_by_table: dict[str, list[dict[str, Any]]] = {} + for app in response.context_data.get("app_list", []): + for model in app.get("models", []): + model_class = model.get("model") + if not model_class or not model.get("perms", {}).get("view"): + continue + models_by_table.setdefault(model_class._meta.db_table, []).append(model) + + if not models_by_table: + return response + + try: + with connection.cursor() as cursor: + cursor.execute("SELECT tbl, stat FROM sqlite_stat1") + for table, stat in cursor.fetchall(): + try: + count = int(str(stat).split()[0]) + except (IndexError, TypeError, ValueError): + continue + self._set_model_object_count( + models_by_table, + table, + count, + title=f"Approximate 
count from SQLite stats: {count:,}", + ) + models_by_table.pop(table, None) + except DatabaseError: + pass + + for table in list(models_by_table): + try: + with connection.cursor() as cursor: + cursor.execute(f"SELECT COUNT(*) FROM {connection.ops.quote_name(table)}") + count = int(cursor.fetchone()[0]) + except DatabaseError: + continue + self._set_model_object_count(models_by_table, table, count) + models_by_table.pop(table, None) + return response + + def get_admin_data_urls(self) -> list["URLResolver | URLPattern"]: + return adv_get_admin_data_urls(self) + + def get_urls(self) -> list["URLResolver | URLPattern"]: + return self.get_admin_data_urls() + super().get_urls() + + +archivebox_admin = ArchiveBoxAdmin() +# Note: delete_selected is enabled per-model via actions = ['delete_selected'] in each ModelAdmin +# TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel + + +############### Admin Data View sections are defined in settings.ADMIN_DATA_VIEWS ######### + + +def register_admin_site(): + """Replace the default admin site with our custom ArchiveBox admin site.""" + from django.contrib import admin + from django.contrib.admin import sites + + admin.site = archivebox_admin + sites.site = archivebox_admin + + # Register admin views for each app + # (Previously handled by ABX plugin system, now called directly) + from archivebox.core.admin import register_admin as register_core_admin + from archivebox.crawls.admin import register_admin as register_crawls_admin + from archivebox.api.admin import register_admin as register_api_admin + from archivebox.machine.admin import register_admin as register_machine_admin + from archivebox.personas.admin import register_admin as register_personas_admin + from archivebox.workers.admin import register_admin as register_workers_admin + + register_core_admin(archivebox_admin) + register_crawls_admin(archivebox_admin) + register_api_admin(archivebox_admin) + 
register_machine_admin(archivebox_admin) + register_personas_admin(archivebox_admin) + register_workers_admin(archivebox_admin) + + return archivebox_admin diff --git a/archivebox/core/admin_snapshots.py b/archivebox/core/admin_snapshots.py new file mode 100644 index 0000000000..4c95e7522a --- /dev/null +++ b/archivebox/core/admin_snapshots.py @@ -0,0 +1,1546 @@ +__package__ = "archivebox.core" + +import json +from functools import lru_cache +from types import SimpleNamespace + +from django.contrib import admin, messages +from django.urls import path, reverse +from django.shortcuts import get_object_or_404, redirect +from django.http import JsonResponse, HttpResponseBadRequest, HttpResponseNotAllowed +from django.utils import timezone +from django.utils.html import format_html, format_html_join +from django.utils.safestring import mark_safe +from django.db.models import Q, Count, Exists, F, OuterRef, Prefetch +from django import forms +from django.template import Template, RequestContext +from django.contrib.admin.helpers import ActionForm + +from archivebox.config.common import get_config +from archivebox.misc.util import htmldecode, urldecode +from archivebox.misc.paginators import AcceleratedPaginator +from archivebox.misc.logging_util import printable_filesize +from archivebox.search.admin import SearchResultsAdminMixin, SearchResultsChangeList +from archivebox.search.views import admin_snapshot_search_stream_view +from archivebox.core.routes_util import build_snapshot_url, build_web_url +from archivebox.core.tag_util import get_or_create_tag +from archivebox.plugins.hooks import discover_hooks +from archivebox.plugins.discovery import get_plugin_icon, get_plugin_name, get_plugins + +from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin + +from archivebox.core.models import Tag, Snapshot, ArchiveResult +from archivebox.crawls.models import Crawl +from archivebox.core.admin_archiveresults import render_archiveresults_list +from 
archivebox.core.preview_util import EXTENSION_SCREENSHOT_PLUGIN +from archivebox.progressmonitor.views import progress_endpoint +from archivebox.core.permissions import ( + PERMISSIONS_CHOICES, + PERMISSIONS_META, + get_snapshot_permissions, + normalize_permissions, +) +from archivebox.core.widgets import TagEditorWidget, InlineTagEditorWidget + + +# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False} +GLOBAL_CONTEXT = {} + +SNAPSHOT_PERMISSION_META = PERMISSIONS_META + + +@lru_cache(maxsize=1) +def _plugin_sort_order() -> dict[str, int]: + return {get_plugin_name(plugin): idx for idx, plugin in enumerate(get_plugins())} + + +class SnapshotActionForm(ActionForm): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # Define tags field in __init__ to avoid database access during app initialization + self.fields["tags"] = forms.CharField( + label="", + required=False, + widget=TagEditorWidget(), + ) + + def clean_tags(self): + """Parse comma-separated tag names without touching the DB.""" + tags_str = self.cleaned_data.get("tags", "") + if not tags_str: + return [] + + return [name.strip() for name in tags_str.split(",") if name.strip()] + + # TODO: allow selecting actions for specific extractor plugins? is this useful? 
+ # plugin = forms.ChoiceField( + # choices=ArchiveResult.PLUGIN_CHOICES, + # required=False, + # widget=forms.MultileChoiceField(attrs={'class': "form-control"}) + # ) + + +class TagNameListFilter(admin.SimpleListFilter): + title = "By tag name" + parameter_name = "tag" + + def lookups(self, request, model_admin): + selected = self.value() + tags = list(Tag.objects.order_by("name").only("id", "name")[:100]) + if selected and selected.isdigit() and all(str(tag.pk) != selected for tag in tags): + selected_tag = Tag.objects.filter(pk=int(selected)).only("id", "name").first() + if selected_tag: + tags.insert(0, selected_tag) + return [(str(tag.pk), tag.name) for tag in tags] + + def queryset(self, request, queryset): + if self.value(): + return queryset.filter(tags__id=self.value()) + return queryset + + +class SnapshotPermissionsListFilter(admin.SimpleListFilter): + title = "permission" + parameter_name = "permissions" + + def lookups(self, request, model_admin): + return PERMISSIONS_CHOICES + + def queryset(self, request, queryset): + value = self.value() + if value: + return queryset.filter(permissions=value) + return queryset + + +class SnapshotStatusListFilter(admin.SimpleListFilter): + title = "snapshot status" + parameter_name = "snapshot_status" + + def lookups(self, request, model_admin): + return Snapshot.StatusChoices.choices + + def queryset(self, request, queryset): + value = self.value() + if value in Snapshot.StatusChoices.values: + return queryset.filter(status=value) + return queryset + + +class SnapshotArchiveStateListFilter(admin.SimpleListFilter): + title = "archive state" + parameter_name = "archive_state" + + def lookups(self, request, model_admin): + return ( + ("downloaded", "Downloaded"), + ("not_downloaded", "Not downloaded"), + ("has_output", "Has saved files"), + ("empty_output", "No saved files"), + ("has_title", "Has title"), + ("missing_title", "Missing title"), + ) + + def queryset(self, request, queryset): + value = self.value() + if 
value == "downloaded": + return queryset.filter(downloaded_at__isnull=False) + if value == "not_downloaded": + return queryset.filter(downloaded_at__isnull=True) + if value == "has_output": + return queryset.filter(output_size__gt=0) + if value == "empty_output": + return queryset.filter(output_size=0) + if value == "has_title": + return queryset.exclude(Q(title__isnull=True) | Q(title="")) + if value == "missing_title": + return queryset.filter(Q(title__isnull=True) | Q(title="")) + return queryset + + +class SnapshotSizeListFilter(admin.SimpleListFilter): + title = "size" + parameter_name = "size" + + def lookups(self, request, model_admin): + return ( + ("1gb", ">1GB"), + ("500mb", ">500MB"), + ("250mb", ">250MB"), + ("100mb", ">100MB"), + ("50mb", ">50MB"), + ("25mb", ">25MB"), + ) + + def queryset(self, request, queryset): + value = self.value() + thresholds = { + "1gb": 1024 * 1024 * 1024, + "500mb": 500 * 1024 * 1024, + "250mb": 250 * 1024 * 1024, + "100mb": 100 * 1024 * 1024, + "50mb": 50 * 1024 * 1024, + "25mb": 25 * 1024 * 1024, + } + if value in thresholds: + return queryset.filter(output_size__gt=thresholds[value]) + return queryset + + +class SnapshotResultHealthListFilter(admin.SimpleListFilter): + title = "ArchiveResult status" + parameter_name = "archiveresult_status" + SNAPSHOT_FIRST_VALUES = {"succeeded"} + + def lookups(self, request, model_admin): + return ( + ("none", "No ArchiveResults"), + ("has_results", "Has ArchiveResults"), + ("succeeded", ">50% succeeded"), + ("failed", ">50% failed"), + ("running", ">50% running"), + ("pending", ">50% queued"), + ("noresults", ">50% noresults"), + ) + + def queryset(self, request, queryset): + value = self.value() + if value: + results = ArchiveResult.objects.filter(snapshot_id=OuterRef("pk")) + if value == "none": + return queryset.annotate(has_results=Exists(results)).filter(has_results=False) + if value == "has_results": + return queryset.annotate(has_results=Exists(results)).filter(has_results=True) 
+ status_by_value = { + "succeeded": ArchiveResult.StatusChoices.SUCCEEDED, + "failed": ArchiveResult.StatusChoices.FAILED, + "running": ArchiveResult.StatusChoices.STARTED, + "pending": (ArchiveResult.StatusChoices.QUEUED, ArchiveResult.StatusChoices.BACKOFF), + "backoff": ArchiveResult.StatusChoices.BACKOFF, + "noresults": ArchiveResult.StatusChoices.NORESULTS, + } + if value in status_by_value: + status = status_by_value[value] + if value in self.SNAPSHOT_FIRST_VALUES: + # "succeeded" is overwhelmingly common in large collections. + # Scan Snapshots in admin order and do indexed per-snapshot + # probes so page 1 can stop after list_per_page matches. + queryset = queryset.alias( + total_results=ArchiveResult.snapshot_count_expr(), + matching_results=ArchiveResult.snapshot_count_expr(status=status), + ).filter(matching_results__gt=F("total_results") / 2) + return queryset + + # Rare statuses are faster status-first: use the + # (status, snapshot_id) index to find candidate snapshots. + snapshot_ids = ArchiveResult.cached_snapshot_ids_with_majority_status(status) + queryset = queryset.filter(pk__in=snapshot_ids) + queryset._archivebox_count_hint = len(snapshot_ids) + queryset.query._archivebox_count_hint = queryset._archivebox_count_hint + return queryset + return queryset + + +class SnapshotChangeList(SearchResultsChangeList): + def __init__(self, request, *args, **kwargs): + super().__init__(request, *args, **kwargs) + resolver_name = request.resolver_match.url_name + self.embedded_changelist = request.GET.get("_embedded") == "crawl" + self.snapshot_is_grid_view = not self.embedded_changelist and ( + resolver_name == "grid" or request.path.rstrip("/").endswith("/grid") + ) + + def _attach_archiveresult_summaries(self): + snapshot_ids = [obj.pk for obj in self.result_list] + if not snapshot_ids: + return + + results_by_snapshot = {snapshot_id: [] for snapshot_id in snapshot_ids} + seen_plugins = {snapshot_id: set() for snapshot_id in snapshot_ids} + rows = ( + 
ArchiveResult.objects.filter(snapshot_id__in=snapshot_ids, status=ArchiveResult.StatusChoices.SUCCEEDED, output_size__gt=0) + .order_by("snapshot_id", "plugin") + .values_list("snapshot_id", "plugin", "status", "output_size") + ) + for snapshot_id, plugin, status, output_size in rows.iterator(chunk_size=1000): + if plugin in seen_plugins[snapshot_id]: + continue + seen_plugins[snapshot_id].add(plugin) + results_by_snapshot[snapshot_id].append(SimpleNamespace(plugin=plugin, status=status, output_size=output_size)) + + for obj in self.result_list: + obj.__dict__["_admin_archiveresults"] = results_by_snapshot[obj.pk] + + def get_results(self, request): + super().get_results(request) + if request.GET.get("_embedded") == "crawl": + self.full_result_count = self.result_count + self.show_full_result_count = True + self._attach_archiveresult_summaries() + + +class SnapshotAdminForm(forms.ModelForm): + """Custom form for Snapshot admin with tag editor widget.""" + + tags_editor = forms.CharField( + label="Tags", + required=False, + widget=TagEditorWidget(), + help_text="Type tag names and press Enter or Space to add. Click ร— to remove.", + ) + permissions_config = forms.ChoiceField( + label="Permissions", + choices=PERMISSIONS_CHOICES, + required=True, + help_text="Per-snapshot visibility. 
Matching the crawl/persona default clears the per-snapshot override.", + ) + + class Meta: + model = Snapshot + fields = "__all__" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # Initialize tags_editor with current tags + if self.instance and self.instance.pk: + self.initial["tags_editor"] = ",".join( + sorted(tag.name for tag in self.instance.tags.all()), + ) + self.initial["permissions_config"] = get_snapshot_permissions(self.instance) + + def save(self, commit=True): + instance = super().save(commit=False) + permissions = self.cleaned_data["permissions_config"] + config = dict(instance.config or {}) + config["PERMISSIONS"] = permissions + instance.config = config + + # Handle tags_editor field + if commit: + instance.save() + self._save_m2m() + + # Parse and save tags from tags_editor + tags_str = self.cleaned_data.get("tags_editor", "") + if tags_str: + tag_names = [name.strip() for name in tags_str.split(",") if name.strip()] + tags = [] + for name in tag_names: + tag, _ = Tag.objects.get_or_create( + name__iexact=name, + defaults={"name": name}, + ) + tag = Tag.objects.filter(name__iexact=name).first() or tag + tags.append(tag) + instance.tags.set(tags) + else: + instance.tags.clear() + + return instance + + +class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): + form = SnapshotAdminForm + raw_id_fields = ("crawl", "parent_snapshot") + list_select_related = () + list_display = ( + "permissions_badge", + "created_at", + "preview_icon", + "title_str", + "tags_inline", + "status_with_progress", + "files", + "size_with_stats", + ) + list_display_links = ("created_at",) + sort_fields = ("title_str", "created_at", "status", "crawl") + readonly_fields = ( + "admin_actions", + "snapshot_summary", + "url_favicon", + "tags_badges", + "imported_timestamp", + "created_at", + "modified_at", + "downloaded_at", + "output_dir", + "archiveresults_list", + ) + search_fields = ("id", "url", "timestamp", "title", 
"tags__name") + list_filter = ( + SnapshotPermissionsListFilter, + SnapshotStatusListFilter, + SnapshotResultHealthListFilter, + SnapshotArchiveStateListFilter, + SnapshotSizeListFilter, + "created_at", + "downloaded_at", + "crawl__created_by", + TagNameListFilter, + ) + + fieldsets = ( + ( + "Actions", + { + "fields": ("admin_actions",), + "classes": ("card", "actions-card"), + }, + ), + ( + "Snapshot", + { + "fields": ("snapshot_summary",), + "classes": ("card",), + }, + ), + ( + "URL", + { + "fields": (("url_favicon", "url"), ("title", "tags_badges")), + "classes": ("card", "wide"), + }, + ), + ( + "Tags", + { + "fields": ("tags_editor", "permissions_config"), + "classes": ("card",), + }, + ), + ( + "Status", + { + "fields": ("status", "retry_at"), + "classes": ("card",), + }, + ), + ( + "Timestamps", + { + "fields": ("bookmarked_at", "created_at", "modified_at", "downloaded_at"), + "classes": ("card",), + }, + ), + ( + "Relations", + { + "fields": ("crawl",), + "classes": ("card",), + }, + ), + ( + "Config", + { + "fields": ("config",), + "description": '<span style="display:block; margin:-4px 0 6px; font-size:11px; line-height:1.35; color:#94a3b8;">Uses <code>Crawl.config</code> by default. 
Only set per-snapshot overrides here when needed.</span>', + "classes": ("card",), + }, + ), + ( + "Files", + { + "fields": ("output_dir",), + "classes": ("card",), + }, + ), + ( + "Archive Results", + { + "fields": ("archiveresults_list",), + "classes": ("card", "wide"), + }, + ), + ) + + ordering = ["-created_at"] + actions = [ + "add_tags", + "remove_tags", + "resnapshot_snapshot", + "update_snapshots", + "overwrite_snapshots", + "set_snapshot_permissions", + "delete_snapshots", + ] + inlines = [] # Removed TagInline, using TagEditorWidget instead + list_per_page = 50 + + action_form = SnapshotActionForm + paginator = AcceleratedPaginator + + save_on_top = True + show_full_result_count = True + + def get_changelist(self, request, **kwargs): + return SnapshotChangeList + + def get_ordering(self, request): + if request.GET.get("o"): + return [] + return super().get_ordering(request) + + def change_view(self, request, object_id, form_url="", extra_context=None): + self.request = request + extra_context = extra_context or {} + extra_context["CONFIG"] = request.archivebox_config + snapshot = self.get_object(request, object_id) + if snapshot and snapshot.status in { + Snapshot.StatusChoices.QUEUED, + Snapshot.StatusChoices.STARTED, + Snapshot.StatusChoices.PAUSED, + }: + extra_context["progress_auto_expand"] = True + extra_context["progress_endpoint"] = progress_endpoint("snapshot", snapshot.id) + return super().change_view(request, object_id, form_url, extra_context | GLOBAL_CONTEXT) + + def changelist_view(self, request, extra_context=None): + self.request = request + saved_list_per_page = self.list_per_page + embedded_changelist = request.GET.get("_embedded") == "crawl" + if embedded_changelist: + try: + requested_per_page = int(request.GET.get("per_page", "200")) + except ValueError: + requested_per_page = 200 + self.list_per_page = min(max(200, requested_per_page), 500) + else: + self.list_per_page = request.archivebox_config.SNAPSHOTS_PER_PAGE + extra_context = 
extra_context or {} + extra_context["embedded_changelist"] = embedded_changelist + extra_context["CONFIG"] = request.archivebox_config + try: + try: + return super().changelist_view(request, extra_context | GLOBAL_CONTEXT) + except Exception as e: + self.message_user(request, f"Error occurred while loading the page: {str(e)} {request.GET} {request.POST}") + return super().changelist_view(request, GLOBAL_CONTEXT) + finally: + self.list_per_page = saved_list_per_page + + def get_actions(self, request): + actions = super().get_actions(request) + if not actions: + return {} + actions.pop("delete_selected", None) + return actions + + def lookup_allowed(self, lookup, value, request=None): + if lookup in {"crawl__id__exact", "crawl_id__exact", "crawl_id"}: + return True + return super().lookup_allowed(lookup, value, request=request) + + def get_snapshot_view_url(self, obj: Snapshot) -> str: + request = self.request + return build_snapshot_url(str(obj.id), request=request, config=request.archivebox_config) + + def get_snapshot_files_url(self, obj: Snapshot) -> str: + request = self.request + return f"{build_snapshot_url(str(obj.id), request=request, config=request.archivebox_config)}/?files=1" + + def get_snapshot_zip_url(self, obj: Snapshot) -> str: + return f"{self.get_snapshot_files_url(obj)}&download=zip" + + def get_urls(self): + urls = super().get_urls() + custom_urls = [ + path("grid/", self.admin_site.admin_view(self.grid_view), name="grid"), + path("search-stream/", self.admin_site.admin_view(self.search_stream_view), name="core_snapshot_search_stream"), + path("<path:object_id>/redo-failed/", self.admin_site.admin_view(self.redo_failed_view), name="core_snapshot_redo_failed"), + path( + "<path:object_id>/set-permissions/", + self.admin_site.admin_view(self.set_permissions_view), + name="core_snapshot_set_permissions", + ), + ] + return custom_urls + urls + + def search_stream_view(self, request): + return admin_snapshot_search_stream_view(self, request) + + def 
set_permissions_view(self, request, object_id): + if request.method != "POST": + return HttpResponseNotAllowed(["POST"]) + + permissions = (request.POST.get("permissions") or "").strip().lower() + if permissions not in dict(PERMISSIONS_CHOICES): + return HttpResponseBadRequest("Invalid permissions value") + + snapshot = get_object_or_404(Snapshot, pk=object_id) + config = dict(snapshot.config or {}) + config["PERMISSIONS"] = permissions + + # Keep the quick-edit write to one targeted UPDATE so SQLite only holds + # the write lock for the permission/config change itself. safe_update() + # keeps this from overwriting a concurrent admin/runner edit to the + # same Snapshot after the row was loaded above. + snapshot.safe_update({"config": config, "modified_at": timezone.now()}, refresh=False) + icon, label, fg, bg = SNAPSHOT_PERMISSION_META[permissions] + return JsonResponse({"permissions": permissions, "icon": icon, "label": label, "fg": fg, "bg": bg}) + + @admin.action(description="Permissions โ–พ") + def set_snapshot_permissions(self, request, queryset): + permissions = (request.POST.get("permissions") or "").strip().lower() + if permissions not in dict(PERMISSIONS_CHOICES): + messages.error(request, "Choose a valid permissions value.") + return + updated = self.update_snapshot_permissions(queryset, permissions) + messages.success(request, f"Set permissions to {permissions} on {updated} snapshot(s).") + + def update_snapshot_permissions(self, queryset, permissions): + now = timezone.now() + updated = 0 + batch = [] + snapshots = queryset.select_related(None).only("id", "config").prefetch_related(None) + for snapshot in snapshots.iterator(chunk_size=500): + config = dict(snapshot.config or {}) + config["PERMISSIONS"] = permissions + snapshot.config = config + snapshot.modified_at = now + batch.append(snapshot) + if len(batch) >= 500: + Snapshot.objects.bulk_update(batch, ["config", "modified_at"], batch_size=500) + updated += len(batch) + batch.clear() + if batch: + 
Snapshot.objects.bulk_update(batch, ["config", "modified_at"], batch_size=500) + updated += len(batch) + return updated + + def redo_failed_view(self, request, object_id): + snapshot = get_object_or_404(Snapshot, pk=object_id) + + if request.method == "POST": + retried = snapshot.retry_failed_archiveresults() + if retried: + messages.success( + request, + f"Queued {retried} failed/skipped extractors for retry on this snapshot.", + ) + else: + messages.info( + request, + "No failed/skipped extractors were found on this snapshot.", + ) + + return redirect(snapshot.admin_change_url) + + def get_queryset(self, request): + self.request = request + ordering_fields = self._get_ordering_fields(request) + needs_files_sort = "files" in ordering_fields + needs_tags_sort = "tags_inline" in ordering_fields + is_change_view = request.resolver_match.url_name == "core_snapshot_change" + prefetches = ["tags"] + if is_change_view: + prefetches.append( + Prefetch( + "archiveresult_set", + queryset=ArchiveResult.objects.only( + "id", + "snapshot_id", + "plugin", + "status", + "output_size", + "output_files", + ), + ), + ) + else: + prefetches.append( + Prefetch( + "crawl", + queryset=Crawl.objects.select_related("created_by").only( + "id", + "created_by_id", + "created_by__id", + "created_by__username", + ), + ), + ) + + qs = super().get_queryset(request) + if is_change_view: + qs = qs.select_related("crawl__created_by").defer("notes") + else: + qs = qs.only( + "id", + "created_at", + "url", + "timestamp", + "bookmarked_at", + "crawl_id", + "title", + "status", + "fs_version", + "output_size", + "permissions", + ) + qs = qs.prefetch_related(*prefetches) + if needs_files_sort: + qs = qs.annotate( + ar_succeeded_count=ArchiveResult.snapshot_count_expr(status=ArchiveResult.StatusChoices.SUCCEEDED), + ) + if needs_tags_sort: + qs = qs.annotate(tag_count=Count("tags", distinct=True)) + + return qs + + @admin.display(description="๐Ÿ‘", ordering="permissions") + def permissions_badge(self, 
obj): + permissions = obj.__dict__.get("snapshot_permissions") + if permissions is None: + permissions = obj.permissions + permissions = normalize_permissions(permissions) + icon, label, fg, bg = SNAPSHOT_PERMISSION_META[permissions] + menu_items = format_html_join( + "", + ( + '<button type="button" class="snapshot-permissions-menu-item{}" data-permissions="{}">' + '<span class="snapshot-permissions-icon" aria-hidden="true" style="color:{}; background:{};">{}</span>' + "<span>{}</span>" + "</button>" + ), + ( + ( + " is-active" if choice_value == permissions else "", + choice_value, + choice_fg, + choice_bg, + choice_icon, + choice_label, + ) + for choice_value, choice_label in PERMISSIONS_CHOICES + for choice_icon, _choice_title, choice_fg, choice_bg in [SNAPSHOT_PERMISSION_META[choice_value]] + ), + ) + return format_html( + '<span class="snapshot-permissions-quick" data-current-permissions="{}" data-permissions-url="{}">' + '<button type="button" class="snapshot-permissions-button snapshot-permissions-{}" title="{}" aria-label="Change snapshot permissions: {}" aria-expanded="false">' + '<span class="snapshot-permissions-icon" aria-hidden="true" style="color:{}; background:{};">{}</span>' + "</button>" + '<span class="snapshot-permissions-menu" role="menu" hidden>{}</span>' + "</span>", + permissions, + reverse(f"{self.admin_site.name}:core_snapshot_set_permissions", args=[obj.pk]), + permissions, + label, + label, + fg, + bg, + icon, + menu_items, + ) + + @admin.display(description="Imported Timestamp") + def imported_timestamp(self, obj): + context = RequestContext( + self.request, + { + "bookmarked_date": obj.bookmarked_at, + "timestamp": obj.timestamp, + }, + ) + + html = Template("""{{bookmarked_date}} (<code>{{timestamp}}</code>)""") + return mark_safe(html.render(context)) + + # pretty_time = obj.bookmarked.strftime('%Y-%m-%d %H:%M:%S') + # return f'{pretty_time} ({obj.timestamp})' + + # TODO: figure out a different way to do this, you cant nest forms so 
this doenst work + # def action(self, obj): + # # csrfmiddlewaretoken: Wa8UcQ4fD3FJibzxqHN3IYrrjLo4VguWynmbzzcPYoebfVUnDovon7GEMYFRgsh0 + # # action: update_snapshots + # # select_across: 0 + # # _selected_action: 76d29b26-2a88-439e-877c-a7cca1b72bb3 + # return format_html( + # ''' + # <form action="/admin/core/snapshot/" method="post" onsubmit="e => e.stopPropagation()"> + # <input type="hidden" name="csrfmiddlewaretoken" value="{}"> + # <input type="hidden" name="_selected_action" value="{}"> + # <button name="update_snapshots">Check</button> + # <button name="update_titles">Pull title + favicon</button> + # <button name="update_snapshots">Update</button> + # <button name="overwrite_snapshots">Re-Archive (overwrite)</button> + # <button name="delete_snapshots">Permanently delete</button> + # </form> + # ''', + # csrf.get_token(self.request), + # obj.pk, + # ) + + @admin.display(description="") + def admin_actions(self, obj): + summary_url = self.get_snapshot_view_url(obj) + files_url = self.get_snapshot_files_url(obj) + zip_url = self.get_snapshot_zip_url(obj) + redo_failed_url = f"/admin/core/snapshot/{obj.pk}/redo-failed/" + return format_html( + """ + <div style="display: flex; flex-wrap: wrap; gap: 12px; align-items: center;"> + <a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; color: #334155; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;" + href="{}" + onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';" + onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';"> + ๐Ÿ“„ View Snapshot + </a> + <a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; color: #334155; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;" + href="{}" + 
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';" + onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';"> + ๐Ÿ“ All files + </a> + <a class="btn archivebox-zip-button" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #eff6ff; border: 1px solid #bfdbfe; border-radius: 8px; color: #1d4ed8; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;" + href="{}" + data-loading-label="Preparing..." + onclick="return window.archiveboxHandleZipClick(this, event);" + onmouseover="this.style.background='#dbeafe'; this.style.borderColor='#93c5fd';" + onmouseout="this.style.background='#eff6ff'; this.style.borderColor='#bfdbfe';"> + <span class="archivebox-zip-spinner" aria-hidden="true"></span> + <span class="archivebox-zip-label">โฌ‡ Download Zip</span> + </a> + <a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; color: #334155; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;" + href="{}" + target="_blank" + onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';" + onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';"> + ๐Ÿ”— Original URL + </a> + + <span style="border-left: 1px solid #e2e8f0; height: 24px; margin: 0 4px;"></span> + + <a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #eff6ff; border: 1px solid #bfdbfe; border-radius: 8px; color: #1e40af; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;" + href="/admin/core/snapshot/?id__exact={}" + title="Create a fresh new snapshot of this URL" + onmouseover="this.style.background='#dbeafe';" + onmouseout="this.style.background='#eff6ff';"> + ๐Ÿ†• Snapshot Again + </a> + <button type="submit" + formaction="{}" + 
formmethod="post" + formnovalidate + class="btn" + style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #ecfdf5; border: 1px solid #a7f3d0; border-radius: 8px; color: #065f46; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s; cursor: pointer;" + title="Redo failed extractors (missing outputs)" + onmouseover="this.style.background='#d1fae5';" + onmouseout="this.style.background='#ecfdf5';"> + ๐Ÿ” Retry Failed Extractors + </button> + <a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #fffbeb; border: 1px solid #fde68a; border-radius: 8px; color: #92400e; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;" + href="/admin/core/snapshot/?id__exact={}" + title="Re-run all extractors (overwrite existing)" + onmouseover="this.style.background='#fef3c7';" + onmouseout="this.style.background='#fffbeb';"> + ๐Ÿ”„ Reset & Retry All Extractors + </a> + <a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #fef2f2; border: 1px solid #fecaca; border-radius: 8px; color: #991b1b; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;" + href="/admin/core/snapshot/?id__exact={}" + title="Permanently delete this snapshot" + onmouseover="this.style.background='#fee2e2';" + onmouseout="this.style.background='#fef2f2';"> + โ˜ ๏ธ Delete + </a> + </div> + """, + summary_url, + files_url, + zip_url, + obj.url, + obj.pk, + redo_failed_url, + obj.pk, + obj.pk, + ) + + def status_info(self, obj): + request = self.request + config = request.archivebox_config + favicon_url = build_snapshot_url(str(obj.id), "favicon.ico", request=request, config=config) + return format_html( + """ + Archived: {} ({} files {})     + Favicon: <img src="{}" style="height: 20px"/>     + Extension: {}     + """, + "โœ…" if obj.is_archived else "โŒ", + obj.num_outputs, + 
self.size(obj) or "0kb", + favicon_url, + obj.extension or "-", + ) + + @admin.display(description="Archive Results") + def archiveresults_list(self, obj): + request = self.request + return render_archiveresults_list(obj.archiveresult_set.all(), limit=8, config=request.archivebox_config) + + @admin.display( + description="Title", + ordering="title", + ) + def title_str(self, obj): + request = self.request + config = request.archivebox_config + title_raw = (obj.title or "").strip() + url_raw = (obj.url or "").strip() + title_normalized = title_raw.lower() + url_normalized = url_raw.lower() + show_title = bool(title_raw) and title_normalized != "pending..." and title_normalized != url_normalized + css_class = "fetched" if show_title else "pending" + + detail_url = build_web_url(f"/{obj.archive_path_from_db}/index.html", request=request, config=config) + title_html = "" + if show_title: + title_html = format_html( + '<a href="{}"><b class="status-{}">{}</b></a>', + detail_url, + css_class, + urldecode(htmldecode(title_raw))[:128], + ) + + return format_html( + "{}" + '<div style="font-size: 11px; color: #64748b; margin-top: 2px;">' + '<a href="{}"><code style="user-select: all;">{}</code></a>' + "</div>", + title_html, + url_raw or obj.url, + (url_raw or obj.url)[:128], + ) + + @admin.display(description="Tags", ordering="tag_count") + def tags_inline(self, obj): + widget = InlineTagEditorWidget(snapshot_id=str(obj.pk), editable=True) + tags = self._get_prefetched_tags(obj) + tags_html = widget.render( + name=f"tags_inline_{obj.pk}", + value=tags if tags is not None else obj.tags.all(), + attrs={"id": f"tags_inline_{obj.pk}"}, + snapshot_id=str(obj.pk), + ) + return mark_safe(f'<span class="tags-inline-editor tags-inline-editor--compact">{tags_html}</span>') + + @admin.display(description="Tags") + def tags_badges(self, obj): + widget = InlineTagEditorWidget(snapshot_id=str(obj.pk), editable=False) + tags = self._get_prefetched_tags(obj) + tags_html = widget.render( + 
name=f"tags_readonly_{obj.pk}", + value=tags if tags is not None else obj.tags.all(), + attrs={"id": f"tags_readonly_{obj.pk}"}, + snapshot_id=str(obj.pk), + ) + return mark_safe(f'<span class="tags-inline-editor">{tags_html}</span>') + + def _get_preview_data(self, obj): + request = self.request + config = request.archivebox_config + results = self._get_prefetched_results(obj) + if results is not None: + has_screenshot = any(r.plugin == "screenshot" for r in results) + has_favicon = any(r.plugin == "favicon" for r in results) + has_extension_screenshot = any(r.plugin == EXTENSION_SCREENSHOT_PLUGIN for r in results) + else: + available_plugins = set( + obj.archiveresult_set.filter(plugin__in=("screenshot", EXTENSION_SCREENSHOT_PLUGIN, "favicon")).values_list( + "plugin", + flat=True, + ), + ) + has_screenshot = "screenshot" in available_plugins + has_favicon = "favicon" in available_plugins + has_extension_screenshot = EXTENSION_SCREENSHOT_PLUGIN in available_plugins + + extension_screenshot_urls = [ + build_snapshot_url(str(obj.id), f"{EXTENSION_SCREENSHOT_PLUGIN}/screenshot-1.png", request=request, config=config), + build_snapshot_url(str(obj.id), f"{EXTENSION_SCREENSHOT_PLUGIN}/screenshot.png", request=request, config=config), + ] + + if not has_screenshot and not has_extension_screenshot and not has_favicon: + return None + + if has_screenshot or has_extension_screenshot: + img_url = build_snapshot_url(str(obj.id), "screenshot/screenshot.png", request=request, config=config) + fallbacks = extension_screenshot_urls + img_alt = "Screenshot" + preview_class = "screenshot" + else: + img_url = build_snapshot_url(str(obj.id), "favicon/favicon.ico", request=request, config=config) + fallbacks = [ + build_snapshot_url(str(obj.id), "favicon.ico", request=request, config=config), + ] + img_alt = "Favicon" + preview_class = "favicon" + + fallback_list = ",".join(fallbacks) + onerror_js = ( + "this.dataset.fallbacks && this.dataset.fallbacks.length ? 
" + "(this.src=this.dataset.fallbacks.split(',').shift(), " + "this.dataset.fallbacks=this.dataset.fallbacks.split(',').slice(1).join(',')) : " + "this.remove()" + ) + + return { + "img_url": img_url, + "img_alt": img_alt, + "preview_class": preview_class, + "onerror_js": onerror_js, + "fallback_list": fallback_list, + } + + @admin.display(description="", empty_value="") + def url_favicon(self, obj): + preview = self._get_preview_data(obj) + if not preview: + return "" + + request = self.request + config = request.archivebox_config + favicon_url = build_snapshot_url(str(obj.id), "favicon/favicon.ico", request=request, config=config) + fallback_list = ",".join([build_snapshot_url(str(obj.id), "favicon.ico", request=request, config=config)]) + onerror_js = ( + "this.dataset.fallbacks && this.dataset.fallbacks.length ? " + "(this.src=this.dataset.fallbacks.split(',').shift(), " + "this.dataset.fallbacks=this.dataset.fallbacks.split(',').slice(1).join(',')) : " + "this.closest('a') && this.closest('a').remove()" + ) + + return format_html( + '<a href="{}" title="Open favicon" style="display:inline-flex; align-items:center; justify-content:center; width:32px; height:32px;">' + '<img src="{}" alt="Favicon" decoding="async" loading="lazy" onerror="{}" data-fallbacks="{}" ' + 'style="display:block; width:24px; height:24px; border-radius:6px; border:1px solid #e2e8f0; background:#fff; object-fit:contain; padding:2px;">' + "</a>", + favicon_url, + favicon_url, + onerror_js, + fallback_list, + ) + + @admin.display(description="Preview", empty_value="") + def preview_icon(self, obj): + preview = self._get_preview_data(obj) + if not preview: + return None + + return format_html( + '<img src="{}" alt="{}" class="snapshot-preview {}" decoding="async" loading="lazy" onerror="{}" data-fallbacks="{}">', + preview["img_url"], + preview["img_alt"], + preview["preview_class"], + preview["onerror_js"], + preview["fallback_list"], + ) + + @admin.display(description=" ", empty_value="") + 
def snapshot_summary(self, obj): + request = self.request + config = request.archivebox_config + preview = self._get_preview_data(obj) + stats = self._get_progress_stats(obj) + archive_size = stats["output_size"] or 0 + size_txt = printable_filesize(archive_size) if archive_size else "pending" + screenshot_html = "" + + if preview: + screenshot_html = format_html( + '<a href="{href}" title="Open snapshot live view" style="display:block; flex:0 0 220px; width:220px;">' + '<img src="{src}" alt="{alt}" decoding="async" loading="lazy" onerror="{onerror}" data-fallbacks="{fallbacks}" ' + 'style="display:block; width:100%; max-width:220px; aspect-ratio: 16 / 10; object-fit: cover; object-position: top; ' + 'border-radius: 10px; border: 1px solid #e2e8f0; background: #f8fafc;">' + "</a>", + href=build_web_url(f"/{obj.archive_path}", request=request, config=config), + src=preview["img_url"], + alt=preview["img_alt"], + onerror=preview["onerror_js"], + fallbacks=preview["fallback_list"], + ) + + return format_html( + '<div style="display:flex; gap:16px; align-items:flex-start;">' + "{}" + '<div style="min-width:0; flex:1;">' + '<div style="font: 600 12px/1.4 -apple-system,BlinkMacSystemFont,Segoe UI,Roboto,Helvetica Neue,Arial,sans-serif; color:#64748b; text-transform:uppercase; letter-spacing:0.04em; margin-bottom:4px;">snap_dir size</div>' + '<div style="font: 700 28px/1.1 -apple-system,BlinkMacSystemFont,Segoe UI,Roboto,Helvetica Neue,Arial,sans-serif; color:#0f172a; margin-bottom:8px;">{}</div>' + '<div style="font-size:13px; line-height:1.5; color:#64748b;">' + 'Open <a href="{}"><code>{}</code></a> to inspect files.' 
@admin.display(
    description="Files Saved",
    ordering="ar_succeeded_count",
)
def files(self, obj):
    """Render one icon link per plugin that produced output for *obj*.

    Only results whose status is ``SUCCEEDED`` and whose ``output_size`` is
    truthy are considered, and only the first such result per plugin is kept.
    Plugins are ordered by their rank in ``_plugin_sort_order()`` (unranked
    plugins last), then by name. At most 14 icons are shown; the remainder
    is summarised as a ``+N`` badge. Returns a dimmed ``...`` placeholder
    when no plugin has output yet.
    """
    max_visible = 14
    unranked = 9999  # sort position for plugins missing from the order map

    results = self._get_prefetched_results(obj)
    if results is None:
        # No prefetched rows available: load only the columns used below.
        results = obj.archiveresult_set.only("plugin", "status", "output_size")

    plugins_with_output: dict[str, ArchiveResult] = {}
    for result in results:
        if result.status != ArchiveResult.StatusChoices.SUCCEEDED:
            continue
        if not result.output_size:
            continue
        plugins_with_output.setdefault(result.plugin, result)

    if not plugins_with_output:
        return mark_safe('<span style="opacity: 0.35;">...</span>')

    # Look the order map up once instead of once per element inside the
    # sort key.
    sort_order = _plugin_sort_order()
    sorted_results = sorted(
        plugins_with_output.values(),
        key=lambda result: (sort_order.get(result.plugin, unranked), result.plugin),
    )
    visible_results = sorted_results[:max_visible]
    hidden_count = len(sorted_results) - len(visible_results)

    request = self.request
    config = request.archivebox_config
    output = []
    for result in visible_results:
        icon = mark_safe(get_plugin_icon(result.plugin))
        if not icon.strip():
            # Plugins without an icon are silently left out of the strip.
            continue
        output.append(
            format_html(
                '<a href="{}" class="exists-True" title="{}">{}</a>',
                build_web_url(f"/{obj.archive_path_from_db}/{result.plugin}/", request=request, config=config),
                result.plugin,
                icon,
            ),
        )
    if hidden_count > 0:
        output.append(
            format_html(
                '<span title="{} more outputs">+{}</span>',
                hidden_count,
                hidden_count,
            ),
        )

    return format_html(
        '<span class="files-icons files-icons--compact" style="font-size: 1em; opacity: 0.8;">{}</span>',
        mark_safe("".join(output)),
    )
@admin.display(
    description="Status",
    ordering="status",
)
def status_with_progress(self, obj):
    """Show status with progress bar for in-progress snapshots.

    For a snapshot whose status is ``"started"`` and whose stats report at
    least one hook, renders a spinner, a ``done/total hooks`` counter, a
    gradient bar (succeeded, then failed, then the rest) and a line with the
    succeeded / failed / running counts. Every other case renders a coloured
    badge containing the upper-cased status.
    """
    stats = self._get_progress_stats(obj)
    total = stats["total"]

    # For started snapshots, show progress bar
    if obj.status == "started" and total > 0:
        succeeded = stats["succeeded"]
        failed = stats["failed"]
        # "skipped" is optional in stats produced by obj.get_progress_stats()
        # (_get_progress_stats reads it with .get), so do not index it directly.
        skipped = stats.get("skipped", 0)

        # Gradient stop positions in percent; total > 0 is guaranteed here.
        succeeded_stop = int(succeeded / total * 100)
        failed_stop = int((succeeded + failed) / total * 100)

        return format_html(
            """<div style="min-width: 90px;">
                <div style="display: flex; align-items: center; gap: 6px; margin-bottom: 4px;">
                    <span class="snapshot-progress-spinner"></span>
                    <span style="font-size: 11px; color: #64748b;">{}/{} hooks</span>
                </div>
                <div style="background: #e2e8f0; border-radius: 4px; height: 6px; overflow: hidden;">
                    <div style="background: linear-gradient(90deg, #10b981 0%, #10b981 {}%, #ef4444 {}%, #ef4444 {}%, #3b82f6 {}%, #3b82f6 100%);
                                width: {}%; height: 100%; transition: width 0.3s;"></div>
                </div>
                <div style="font-size: 10px; color: #94a3b8; margin-top: 2px;">
                    ✓{} ✗{} ⏳{}
                </div>
            </div>""",
            succeeded + failed + skipped,
            total,
            succeeded_stop,
            succeeded_stop,
            failed_stop,
            failed_stop,
            stats["percent"],
            succeeded,
            failed,
            stats["running"],
        )

    # Status badge colors: (foreground, background)
    status_colors = {
        "queued": ("#f59e0b", "#fef3c7"),  # amber
        "started": ("#3b82f6", "#dbeafe"),  # blue
        "paused": ("#1d4ed8", "#dbeafe"),  # blue
        "sealed": ("#10b981", "#d1fae5"),  # green
        "succeeded": ("#10b981", "#d1fae5"),  # green
        "failed": ("#ef4444", "#fee2e2"),  # red
        "backoff": ("#f59e0b", "#fef3c7"),  # amber
        "skipped": ("#6b7280", "#f3f4f6"),  # gray
    }
    # Unknown statuses fall back to the gray palette.
    fg_color, bg_color = status_colors.get(obj.status, ("#6b7280", "#f3f4f6"))

    # For other statuses, show simple badge
    return format_html(
        '<span style="display: inline-block; padding: 2px 8px; border-radius: 12px; '
        'font-size: 11px; font-weight: 500; background: {}; color: {};">{}</span>',
        bg_color,
        fg_color,
        obj.status.upper(),
    )
stats.get("noresults", 0) + stats["total"] = total + stats["pending"] = max(total - completed - stats["running"], 0) + stats["percent"] = int((completed / total * 100) if total > 0 else 0) + obj._admin_progress_stats = stats + return stats + + expected_total = self._get_expected_hook_total(obj) + observed_total = len(results) + total = max(observed_total, expected_total) + succeeded = sum(1 for r in results if r.status == "succeeded") + failed = sum(1 for r in results if r.status == "failed") + running = sum(1 for r in results if r.status == "started") + skipped = sum(1 for r in results if r.status == "skipped") + noresults = sum(1 for r in results if r.status == "noresults") + pending = max(total - succeeded - failed - running - skipped - noresults, 0) + completed = succeeded + failed + skipped + noresults + percent = int((completed / total * 100) if total > 0 else 0) + is_sealed = obj.status not in (obj.StatusChoices.QUEUED, obj.StatusChoices.STARTED, obj.StatusChoices.PAUSED) + stats = { + "total": total, + "succeeded": succeeded, + "failed": failed, + "running": running, + "pending": pending, + "skipped": skipped, + "noresults": noresults, + "percent": percent, + "output_size": obj.output_size or 0, + "is_sealed": is_sealed, + } + obj._admin_progress_stats = stats + return stats + + def _get_prefetched_results(self, obj): + if "_admin_archiveresults" in obj.__dict__: + return obj.__dict__["_admin_archiveresults"] + if "archiveresult_set" in obj.__dict__.get("_prefetched_objects_cache", {}): + return obj.archiveresult_set.all() + return None + + def _get_expected_hook_total(self, obj) -> int: + try: + request = self.request + if request.resolver_match.url_name in {"core_snapshot_changelist", "core_snapshot_change"}: + return 0 + + crawl = obj.crawl + snapshot_config = obj.config or {} + crawl_config = crawl.config or {} + has_scoped_config = bool(snapshot_config or crawl_config) + + if request is not None and not has_scoped_config: + cached_total = 
request.__dict__.get("archivebox_expected_snapshot_hook_total") + if cached_total is None: + config = request.archivebox_config + cached_total = len(discover_hooks("Snapshot", config=config)) + request.archivebox_expected_snapshot_hook_total = cached_total + return cached_total + + if request is not None: + scoped_cache = request.__dict__.get("archivebox_expected_snapshot_hook_totals_by_scope") + if scoped_cache is None: + scoped_cache = {} + request.archivebox_expected_snapshot_hook_totals_by_scope = scoped_cache + if snapshot_config: + cache_key = ("snapshot", json.dumps(snapshot_config, sort_keys=True, default=str)) + else: + cache_key = ("crawl", json.dumps(crawl_config, sort_keys=True, default=str)) + cached_total = scoped_cache.get(cache_key) + if cached_total is None: + config = get_config(crawl=crawl, snapshot=obj if snapshot_config else None) + cached_total = len(discover_hooks("Snapshot", config=config)) + scoped_cache[cache_key] = cached_total + return cached_total + + return len(discover_hooks("Snapshot", config=get_config(crawl=crawl, snapshot=obj if snapshot_config else None))) + except Exception: + return 0 + + def _get_prefetched_tags(self, obj): + prefetched_cache = obj.__dict__.get("_prefetched_objects_cache", {}) + if "tags" in prefetched_cache: + return list(prefetched_cache["tags"]) + return None + + def _get_ordering_fields(self, request): + ordering = request.GET.get("o") + if not ordering: + return set() + fields = set() + for part in ordering.split("."): + if not part: + continue + try: + idx = abs(int(part)) - 1 + except ValueError: + continue + if 0 <= idx < len(self.list_display): + fields.add(self.list_display[idx]) + return fields + + @admin.display( + description="Original URL", + ordering="url", + ) + def url_str(self, obj): + return format_html( + '<a href="{}"><code style="user-select: all;">{}</code></a>', + obj.url, + obj.url[:128], + ) + + @admin.display(description="Health", ordering="health") + def health_display(self, obj): + 
@admin.action(
    description="🆕 Archive Now",
)
def resnapshot_snapshot(self, request, queryset):
    """Queue one new crawl that re-archives the URLs of the selected snapshots.

    Emits an info message and returns ``None`` when nothing is selected or
    none of the selected snapshots has a URL. Otherwise creates the crawl in
    the background via ``add`` and redirects to the new crawl's admin change
    page.
    """
    snapshots = list(queryset)
    if not snapshots:
        messages.info(request, "No snapshots selected.")
        return

    # Only snapshots that actually carry a URL are submitted, so the success
    # message below reports this list's length rather than len(snapshots).
    url_list = [snapshot.url for snapshot in snapshots if snapshot.url]
    if not url_list:
        messages.info(request, "No valid snapshot URLs were found to archive.")
        return

    from archivebox.cli.archivebox_add import add

    # "Archive Now" is an explicit user re-archive — force ONLY_NEW=False
    # on the resulting crawl so existing snapshots don't cause the crawl to
    # seal immediately with zero new snapshots (the default ONLY_NEW=True
    # would skip any URLs that have ever been archived before).
    crawl, _ = add(urls="\n".join(url_list), bg=True, config={"ONLY_NEW": False})

    messages.success(
        request,
        f"Created 1 queued crawl with {len(url_list)} URL(s). The background runner will create snapshots and process them.",
    )

    # Redirect to the new crawl's admin page so the user lands on the
    # work-in-progress crawl, not the old snapshot they re-archived from.
    # A snapshot-view redirect would race the runner — the new snapshot
    # may sit queued for a while before the runner creates the DB row.
    return redirect(f"/admin/crawls/crawl/{crawl.id}/change/#snapshots")
@admin.action(
    description="+",
)
def add_tags(self, request, queryset):
    """Attach every tag named in the POSTed ``tags`` field to the selection.

    ``tags`` is a comma-separated string; blank entries are ignored. Each
    name is resolved through ``get_or_create_tag`` (credited to the current
    user when authenticated), then linked to every selected snapshot with a
    conflict-ignoring bulk insert. Warns and returns early when ``tags`` is
    empty.
    """
    from archivebox.core.models import SnapshotTag

    raw_tags = request.POST.get("tags", "")
    if not raw_tags:
        messages.warning(request, "No tags specified.")
        return

    stripped_names = [piece.strip() for piece in raw_tags.split(",")]
    resolved_tags = [
        get_or_create_tag(
            tag_name,
            created_by=request.user if request.user.is_authenticated else None,
        )[0]
        for tag_name in stripped_names
        if tag_name
    ]

    # Materialise the ids once so the same list is reused for every tag
    # (values_list also works when the action is applied across all pages).
    target_ids = list(queryset.values_list("id", flat=True))

    for resolved in resolved_tags:
        links = [SnapshotTag(snapshot_id=target_id, tag_id=resolved.pk) for target_id in target_ids]
        # ignore_conflicts skips snapshot/tag pairs that already exist.
        SnapshotTag.objects.bulk_create(links, ignore_conflicts=True, batch_size=1000)

    messages.success(
        request,
        f"Added {len(resolved_tags)} tag(s) to {len(target_ids)} Snapshot(s).",
    )
class TagAdminForm(forms.ModelForm):
    """Admin form for ``Tag`` that requires a non-blank, trimmed name."""

    class Meta:
        model = Tag
        fields = "__all__"
        # Plain text input with browser autocomplete and spellcheck disabled;
        # the data attribute is a hook for the admin page's scripts.
        widgets = {
            "name": forms.TextInput(
                attrs={
                    "placeholder": "research, receipts, product-design...",
                    "autocomplete": "off",
                    "spellcheck": "false",
                    "data-tag-name-input": "1",
                },
            ),
        }

    def clean_name(self):
        """Return the submitted name stripped of surrounding whitespace.

        Raises ``forms.ValidationError`` when nothing is left after stripping.
        """
        submitted = self.cleaned_data.get("name")
        trimmed = submitted.strip() if submitted else ""
        if trimmed:
            return trimmed
        raise forms.ValidationError("Tag name is required.")
def get_fieldsets(self, request: HttpRequest, obj: Tag | None = None):
    """Pick the fieldset layout: ``fieldsets`` when editing, ``add_fieldsets`` when creating."""
    if obj:
        return self.fieldsets
    return self.add_fieldsets
def render_change_form(self, request, context, add=False, change=False, form_url="", obj=None):
    """Add "similar tags" data to the change-form context, then render it.

    The lookup name is the POSTed ``name`` (stripped), falling back to the
    existing object's name. Up to 12 tag cards are built, filtered by that
    name when there is one, and the tag being edited is excluded from them.
    """
    lookup_name = (request.POST.get("name") or "").strip()
    if obj and not lookup_name:
        lookup_name = obj.name

    card_kwargs = {"request": request, "limit": 12, "preview_limit": 0}
    if lookup_name:
        # Without a name the unfiltered card list is shown instead.
        card_kwargs["query"] = lookup_name
    cards = build_tag_cards(**card_kwargs)

    if obj:
        # A tag should not be listed as similar to itself.
        cards = [entry for entry in cards if entry["id"] != obj.pk]

    context.update(
        {
            "tag_search_api_url": reverse("api-1:search_tags"),
            "tag_similar_cards": cards,
            "tag_similar_query": lookup_name,
        },
    )
    return super().render_change_form(request, context, add=add, change=change, form_url=form_url, obj=obj)
@admin.display(description="Snapshots")
def snapshots(self, tag: Tag):
    """Render up to ten snapshots that use *tag* as a stack of link cards.

    Snapshots are ordered by ``downloaded_at``, ``created_at`` and ``pk``,
    all descending. Each card links to the snapshot's admin change page and
    shows its favicon, its title (or URL when untitled) and its URL, each
    cut to 120 characters. A final link opens the snapshot list filtered by
    this tag and shows the total count. When no snapshot uses the tag, a
    short notice with the same filtered link is returned instead.
    """
    recent = tag.snapshot_set.select_related("crawl__created_by").order_by("-downloaded_at", "-created_at", "-pk")[:10]
    if not recent:
        return mark_safe(
            '<p style="margin:0;color:#64748b;">No snapshots use this tag yet. '
            f'<a href="/admin/core/snapshot/?tags__id__exact={tag.id}">Open filtered snapshot list</a>.</p>',
        )

    # Count only once we know the total will be displayed; the empty branch
    # above never uses it, so counting first wasted a query.
    total_count = tag.snapshot_set.count()

    cards = []
    for snapshot in recent:
        title = (snapshot.title or "").strip() or snapshot.url
        cards.append(
            format_html(
                """
                <a href="{}" style="display:flex;align-items:center;gap:10px;padding:10px 12px;border:1px solid #e2e8f0;border-radius:12px;background:#fff;text-decoration:none;color:#0f172a;">
                    <img src="{}" alt="" style="width:18px;height:18px;border-radius:4px;flex:0 0 auto;" onerror="this.style.display='none'">
                    <span style="min-width:0;">
                        <strong style="display:block;white-space:nowrap;overflow:hidden;text-overflow:ellipsis;">{}</strong>
                        <code style="display:block;color:#64748b;white-space:nowrap;overflow:hidden;text-overflow:ellipsis;">{}</code>
                    </span>
                </a>
                """,
                reverse("admin:core_snapshot_change", args=[snapshot.pk]),
                build_snapshot_url(str(snapshot.pk), "favicon.ico"),
                title[:120],
                snapshot.url[:120],
            ),
        )

    cards.append(
        format_html(
            '<a href="/admin/core/snapshot/?tags__id__exact={}" style="display:inline-flex;margin-top:10px;font-weight:600;">View all {} tagged snapshots</a>',
            tag.id,
            total_count,
        ),
    )
    # Every card was produced by format_html, so the joined markup is already escaped.
    return mark_safe('<div style="display:grid;gap:10px;">' + "".join(cards) + "</div>")
def snapshot_rss_badge(self, obj, api_token: str = ""):
    """Render an "RSS" badge linking to the snapshot feed for user *obj*.

    The feed URL filters by ``created_by=<username>`` with ``limit=50`` and,
    when *api_token* is non-empty, appends it as ``api_key``.
    """
    query_pairs = [("created_by", obj.username), ("limit", 50)]
    if api_token:
        query_pairs.append(("api_key", api_token))
    feed_url = "/api/v1/core/snapshots.rss?" + urlencode(query_pairs)

    badge_markup = (
        '<a href="{}" title="Snapshot RSS feed for {}" '
        'style="display:inline-flex;align-items:center;gap:5px;padding:3px 8px;border-radius:4px;'
        "background:#fff3e0;border:1px solid #f59e0b;color:#7c2d12;font-weight:700;"
        'font-size:12px;line-height:1.2;text-decoration:none;white-space:nowrap;">'
        '<span aria-hidden="true" style="display:inline-block;width:8px;height:8px;border-radius:50%;'
        'background:#f97316;box-shadow:0 0 0 3px rgba(249,115,22,.18);"></span>'
        "RSS</a>"
    )
    return format_html(badge_markup, feed_url, obj.username)
@admin.display(description="Tags")
def tag_set(self, obj):
    """List the ten most recently modified tags of user *obj* as admin links.

    The comma-separated tag links are followed by a link to the tag
    changelist filtered by this user, labelled with the total tag count.
    """
    total_count = obj.tag_set.count()
    tag_links = ", ".join(
        format_html(
            '<code><a href="/admin/core/tag/{}/change"><b>{}</b></a></code>',
            tag.pk,
            tag.name,
        )
        for tag in obj.tag_set.order_by("-modified_at")[:10]
    )
    # The footer anchor is closed with </a>; it previously ended in a second
    # opening <a>, which left the link unclosed in the rendered page.
    footer = format_html(
        '<br/><a href="/admin/core/tag/?created_by__id__exact={}">{} total records...</a>',
        obj.pk,
        total_count,
    )
    # Both pieces were built with format_html, so concatenating them is safe.
    return mark_safe(tag_links + footer)
class CoreConfig(AppConfig):
    """Django app config for ``archivebox.core``."""

    name = "archivebox.core"
    label = "core"

    def ready(self):
        """Wire up the admin site, delete signals and state machines at app load.

        When running under the ArchiveBox runserver, the current ``Process``
        row is also marked as a running ``worker_runserver`` worker.
        """
        import sys
        from django.utils.autoreload import DJANGO_AUTORELOAD_ENV

        from archivebox.core.admin_site import register_admin_site

        register_admin_site()
        from archivebox.base_models.models import ModelWithOutputDir

        ModelWithOutputDir.register_delete_signal()

        # Importing the models registers their state machines with the
        # registry; makemigrations must not touch state machines this early.
        running_makemigrations = "makemigrations" in sys.argv
        if not running_makemigrations:
            from archivebox.core import models  # noqa: F401

        env = os.environ
        under_runserver = env.get("ARCHIVEBOX_RUNSERVER") == "1"
        autoreload_enabled = env.get("ARCHIVEBOX_AUTORELOAD") == "1"
        # With autoreload enabled, only the process that has Django's
        # autoreload env var set to "true" prepares the runtime.
        prepare_runtime = under_runserver and (
            not autoreload_enabled or env.get(DJANGO_AUTORELOAD_ENV) == "true"
        )
        if not prepare_runtime:
            return

        from archivebox.config import CONSTANTS
        from archivebox.machine.models import Process

        Process.current().mark_running(
            process_type=Process.TypeChoices.WORKER,
            worker_type="worker_runserver",
            pwd=str(CONSTANTS.DATA_DIR),
            url=env.get("ARCHIVEBOX_RUNSERVER_BIND_URL") or "",
            timeout=CONSTANTS.MAX_HOOK_RUNTIME_SECONDS,
        )
``shutdown(wait=True)`` then blocks the event loop until + that orphan finishes. + + Under heavy SQLite contention (the load-test scenario that surfaced + this on cabbage) those orphan threads can take 30 seconds each waiting + for write locks, and the daphne loop is single-threaded โ€” so every + such orphan stalls every other in-flight request, healthchecks time + out, and the container goes ``unhealthy``. + + Switching to ``shutdown(wait=False)`` queues the sentinel and returns + immediately; the worker thread still exits cleanly once its current + task finishes, and asgiref's ``WeakKeyDictionary`` releases the + executor as soon as the request's context is GC'd. No per-request + teardown guarantee is lost โ€” there was no caller relying on it. + """ + from asgiref import sync as _asgiref_sync + + original_aexit = _asgiref_sync.ThreadSensitiveContext.__aexit__ + + async def __aexit__(self, exc, value, tb): # type: ignore[no-redef] + if not self.token: + return + executor = _asgiref_sync.SyncToAsync.context_to_thread_executor.pop(self, None) + if executor is not None: + executor.shutdown(wait=False) + _asgiref_sync.SyncToAsync.thread_sensitive_context.reset(self.token) + + # Idempotent: only patch once even if asgi.py is reloaded. 
+ if getattr(original_aexit, "_archivebox_patched", False): + return + __aexit__._archivebox_patched = True # type: ignore[attr-defined] + _asgiref_sync.ThreadSensitiveContext.__aexit__ = __aexit__ + + +_patch_thread_sensitive_context_shutdown() + +# Standard Django ASGI application (no websockets/channels needed) +application = get_asgi_application() + +# If websocket support is needed later, install channels and use: +# from channels.routing import ProtocolTypeRouter, URLRouter +# from channels.auth import AuthMiddlewareStack +# from channels.security.websocket import AllowedHostsOriginValidator +# from archivebox.core.routing import websocket_urlpatterns +# +# application = ProtocolTypeRouter({ +# "http": get_asgi_application(), +# "websocket": AllowedHostsOriginValidator( +# AuthMiddlewareStack(URLRouter(websocket_urlpatterns)) +# ), +# }) diff --git a/archivebox/core/context_processors.py b/archivebox/core/context_processors.py new file mode 100644 index 0000000000..623265e512 --- /dev/null +++ b/archivebox/core/context_processors.py @@ -0,0 +1,9 @@ +from archivebox.config import VERSION +from archivebox.config.version import get_COMMIT_HASH + + +def archivebox_globals(request): + return { + "VERSION": VERSION, + "STATIC_CACHE_KEY": (get_COMMIT_HASH() or VERSION or "dev").strip(), + } diff --git a/archivebox/core/forms.py b/archivebox/core/forms.py new file mode 100644 index 0000000000..283dc21fd1 --- /dev/null +++ b/archivebox/core/forms.py @@ -0,0 +1,459 @@ +__package__ = "archivebox.core" + +import re +from decimal import Decimal, InvalidOperation, ROUND_CEILING + +from django import forms + +from archivebox.misc.util import URL_REGEX, find_all_urls, parse_filesize_to_bytes +from taggit.utils import edit_string_for_tags, parse_tags +from archivebox.base_models.admin import KeyValueWidget +from archivebox.crawls.schedule_util import validate_schedule +from archivebox.config.common import get_config, parse_delete_after +from archivebox.core.permissions import 
PERMISSIONS_CHOICES, PERMISSIONS_PUBLIC, filter_personas_by_permissions, is_admin_user +from archivebox.core.widgets import TagEditorWidget, URLFiltersWidget +from archivebox.plugins.discovery import get_plugins +from archivebox.plugins.forms import ( + PLUGIN_GROUP_DEFINITIONS, + TIMEOUT_INPUT_PATTERN, + PluginConfigFormMixin, + get_choice_field, +) +from archivebox.personas.models import Persona + +DEPTH_CHOICES = ( + ("0", "depth = 0 (archive just these URLs)"), + ("1", "depth = 1 (+ URLs one hop away)"), + ("2", "depth = 2 (+ URLs two hops away)"), + ("3", "depth = 3 (+ URLs three hops away)"), + ("4", "depth = 4 (+ URLs four hops away)"), +) + + +class AddLinkForm(PluginConfigFormMixin, forms.Form): + allow_crawl_execution_config_fields = False + + # Basic fields + url = forms.CharField( + label="URLs", + strip=True, + widget=forms.Textarea( + attrs={ + "data-url-regex": URL_REGEX.pattern, + "placeholder": ( + "\n", + "Enter URL(s) to archive. Any format is ok: one per line, CSV, JSON, embedded in text, etc." + "Examples:\n\n" + "https://example.com\n\n" + "https://news.ycombinator.com,https://news.google.com\n\n" + "Or any text-based content [containing URLs](https://github.com/ArchiveBox/ArchiveBox)...", + ), + }, + ), + required=True, + ) + tag = forms.CharField( + label="Tags", + strip=True, + required=False, + widget=TagEditorWidget(), + ) + depth = forms.ChoiceField( + label="Archive depth", + choices=DEPTH_CHOICES, + initial="0", + widget=forms.RadioSelect(attrs={"class": "depth-selection"}), + ) + max_urls = forms.IntegerField( + label="Max crawl URLs", + required=False, + min_value=0, + initial=0, + widget=forms.NumberInput( + attrs={ + "min": 0, + "step": 1, + "placeholder": "0 = unlimited", + }, + ), + ) + crawl_max_size = forms.CharField( + label="Max crawl size", + required=False, + initial="0", + widget=forms.TextInput( + attrs={ + "placeholder": "0 = unlimited, or e.g. 
45mb / 1gb", + }, + ), + ) + crawl_timeout = forms.CharField( + label="Max crawl time", + required=False, + initial=0, + widget=forms.TextInput( + attrs={ + "pattern": TIMEOUT_INPUT_PATTERN, + "title": "Use 0, integer seconds, or a duration like 1.5m or 1hr. Non-zero values must be greater than 10 seconds.", + "placeholder": "0, 300, 1.5m, or 1hr", + }, + ), + ) + timeout = forms.CharField( + label="Max subtask time", + required=False, + widget=forms.TextInput( + attrs={ + "pattern": TIMEOUT_INPUT_PATTERN, + "title": "Use integer seconds or a duration like 1.5m or 1hr. Non-zero values must be greater than 10 seconds.", + "placeholder": "60, 1.5m, or 1hr", + }, + ), + ) + snapshot_max_size = forms.CharField( + label="Max snapshot size", + required=False, + initial="0", + widget=forms.TextInput( + attrs={ + "placeholder": "0 = unlimited, or e.g. 45mb / 1gb", + }, + ), + ) + delete_after = forms.CharField( + label="Delete after", + required=False, + initial="0", + widget=forms.TextInput( + attrs={ + "placeholder": "0 = keep forever, or e.g. 
1d / 6mo", + }, + ), + ) + crawl_max_concurrent_snapshots = forms.IntegerField( + label="Max in parallel", + required=False, + min_value=1, + widget=forms.NumberInput( + attrs={ + "min": 1, + "step": 1, + }, + ), + ) + notes = forms.CharField( + label="Notes", + strip=True, + required=False, + widget=forms.TextInput( + attrs={ + "placeholder": "Optional notes about this crawl", + }, + ), + ) + url_filters = forms.Field( + label="URL allowlist / denylist", + required=False, + widget=URLFiltersWidget(source_selector='textarea[name="url"]'), + ) + + # Plugin groups + main_plugins = forms.MultipleChoiceField( + label="Main", + required=False, + widget=forms.CheckboxSelectMultiple, + choices=[], # populated in __init__ + ) + page_setup_plugins = forms.MultipleChoiceField( + label="Page Setup", + required=False, + widget=forms.CheckboxSelectMultiple, + choices=[], + ) + media_plugins = forms.MultipleChoiceField( + label="Media", + required=False, + widget=forms.CheckboxSelectMultiple, + choices=[], + ) + text_plugins = forms.MultipleChoiceField( + label="Text", + required=False, + widget=forms.CheckboxSelectMultiple, + choices=[], + ) + metadata_plugins = forms.MultipleChoiceField( + label="Metadata", + required=False, + widget=forms.CheckboxSelectMultiple, + choices=[], + ) + postprocessing_plugins = forms.MultipleChoiceField( + label="Postprocessing", + required=False, + widget=forms.CheckboxSelectMultiple, + choices=[], + ) + other_plugins = forms.MultipleChoiceField( + label="Other", + required=False, + widget=forms.CheckboxSelectMultiple, + choices=[], + ) + + # Advanced options + schedule = forms.CharField( + label="Repeat schedule", + max_length=64, + required=False, + widget=forms.TextInput( + attrs={ + "placeholder": "e.g., daily, weekly, 0 */6 * * * (every 6 hours)", + }, + ), + ) + persona = forms.ModelChoiceField( + label="Persona (configuration profile)", + required=False, + queryset=Persona.objects.none(), + empty_label=None, + to_field_name="name", + ) + 
permissions = forms.ChoiceField( + label="Permissions", + choices=PERMISSIONS_CHOICES, + initial="public", + required=True, + ) + start_paused = forms.BooleanField( + label="Start paused", + initial=False, + required=False, + ) + config = forms.JSONField( + label="Custom config overrides", + widget=KeyValueWidget(), + initial=dict, + required=False, + ) + + def __init__(self, *args, **kwargs): + self.request = kwargs.pop("request", None) + self.can_override_crawl_config = bool(self.request and is_admin_user(self.request)) + super().__init__(*args, **kwargs) + + default_persona = Persona.get_or_create_default() + persona_queryset = Persona.objects.order_by("name") + if not self.can_override_crawl_config: + persona_queryset = filter_personas_by_permissions(persona_queryset, {PERMISSIONS_PUBLIC}) + self.fields["persona"].queryset = persona_queryset + + selected_persona = persona_queryset.filter(id=default_persona.id).first() or persona_queryset.first() + default_config = get_config(persona=selected_persona) if selected_persona else get_config() + if selected_persona: + self.fields["persona"].initial = selected_persona.name + self.fields["permissions"].initial = default_config.PERMISSIONS + self.fields["timeout"].initial = default_config.TIMEOUT + self.fields["crawl_max_concurrent_snapshots"].initial = default_config.CRAWL_MAX_CONCURRENT_SNAPSHOTS + self.fields["delete_after"].initial = default_config.DELETE_AFTER + self.fields["url_filters"].initial = { + "allowlist": "", + "denylist": "", + "same_domain_only": False, + "subpaths_only": False, + "only_new": bool(default_config.ONLY_NEW), + } + + if self.is_bound: + selected_persona = ( + persona_queryset.filter(name=str(self.data.get(self.add_prefix("persona")) or "")).first() or selected_persona + ) + if self.can_override_crawl_config: + self.build_plugin_groups(get_config(persona=selected_persona) if selected_persona else get_config()) + else: + all_plugins = get_plugins() + for field_name, *_rest, plugin_names in 
PLUGIN_GROUP_DEFINITIONS: + get_choice_field(self, field_name).choices = [(p, p) for p in all_plugins if p in plugin_names] + get_choice_field(self, "other_plugins").choices = [(p, p) for p in all_plugins] + self.plugin_groups = [] + + def clean(self): + cleaned_data = super().clean() or {} + + if not self.can_override_crawl_config: + cleaned_data["plugins"] = [] + cleaned_data["plugin_config"] = {} + cleaned_data["config"] = {} + return cleaned_data + + # Combine all plugin groups into single list + all_selected_plugins = [] + for field in [ + "main_plugins", + "page_setup_plugins", + "media_plugins", + "text_plugins", + "metadata_plugins", + "postprocessing_plugins", + "other_plugins", + ]: + selected = cleaned_data.get(field) + if isinstance(selected, list): + all_selected_plugins.extend(selected) + + # Store combined list for easy access + cleaned_data["plugins"] = all_selected_plugins + + plugin_config_overrides = self.clean_plugin_config_overrides(get_config(persona=cleaned_data.get("persona"))) + custom_config = cleaned_data.get("config") or {} + if not isinstance(custom_config, dict): + custom_config = {} + cleaned_data["plugin_config"] = plugin_config_overrides + cleaned_data["config"] = {**plugin_config_overrides, **custom_config} + + return cleaned_data + + def clean_url(self): + value = self.cleaned_data.get("url") or "" + if not list(find_all_urls(value)): + raise forms.ValidationError("Enter at least one valid URL.") + return value + + def clean_url_filters(self): + from archivebox.crawls.models import Crawl + + value = self.cleaned_data.get("url_filters") or {} + return { + "allowlist": "\n".join(Crawl.split_filter_patterns(value.get("allowlist", ""))), + "denylist": "\n".join(Crawl.split_filter_patterns(value.get("denylist", ""))), + "same_domain_only": bool(value.get("same_domain_only")), + "subpaths_only": bool(value.get("subpaths_only")), + "only_new": bool(value.get("only_new")), + } + + def clean_max_urls(self): + value = 
self.cleaned_data.get("max_urls") + return int(value or 0) + + def clean_crawl_max_size(self): + raw_value = str(self.cleaned_data.get("crawl_max_size") or "").strip() + if not raw_value: + return 0 + try: + value = parse_filesize_to_bytes(raw_value) + except ValueError as err: + raise forms.ValidationError(str(err)) + if value < 0: + raise forms.ValidationError("Max crawl size must be 0 or a positive number of bytes.") + return value + + def clean_crawl_timeout(self): + return self._clean_timeout_seconds(self.cleaned_data.get("crawl_timeout"), "Max crawl time", blank_value=0) + + def clean_timeout(self): + return self._clean_timeout_seconds(self.cleaned_data.get("timeout"), "Max subtask time", blank_value=None) + + def _clean_timeout_seconds(self, raw_value, field_label: str, *, blank_value): + raw_value = str(raw_value or "").strip().lower() + if not raw_value: + return blank_value + if raw_value.isdigit(): + value = int(raw_value) + else: + match = re.fullmatch(r"(\d+(?:\.\d+)?)\s*(s|sec|secs|second|seconds|m|min|mins|minute|minutes|h|hr|hrs|hour|hours)", raw_value) + if not match: + raise forms.ValidationError(f"{field_label} must be seconds or a duration like 1.5m or 1hr.") + amount_str, unit = match.groups() + try: + amount = Decimal(amount_str) + except InvalidOperation as err: + raise forms.ValidationError(f"{field_label} must be seconds or a duration like 1.5m or 1hr.") from err + multiplier = 1 + if unit in {"m", "min", "mins", "minute", "minutes"}: + multiplier = 60 + elif unit in {"h", "hr", "hrs", "hour", "hours"}: + multiplier = 60 * 60 + value = int((amount * multiplier).to_integral_value(rounding=ROUND_CEILING)) + if 0 < value <= 10: + raise forms.ValidationError(f"{field_label} must be 0 or greater than 10 seconds.") + return value + + def clean_snapshot_max_size(self): + raw_value = str(self.cleaned_data.get("snapshot_max_size") or "").strip() + if not raw_value: + return 0 + try: + value = parse_filesize_to_bytes(raw_value) + except ValueError as 
err: + raise forms.ValidationError(str(err)) + if value < 0: + raise forms.ValidationError("Max snapshot size must be 0 or a positive number of bytes.") + return value + + def clean_delete_after(self): + raw_value = str(self.cleaned_data.get("delete_after") or "0").strip() or "0" + try: + parse_delete_after(raw_value) + except ValueError as err: + raise forms.ValidationError(str(err)) + return raw_value + + def clean_crawl_max_concurrent_snapshots(self): + value = self.cleaned_data.get("crawl_max_concurrent_snapshots") + if value in (None, ""): + value = get_config().CRAWL_MAX_CONCURRENT_SNAPSHOTS + value = int(value) + if value < 1: + raise forms.ValidationError("Max concurrent snapshots must be at least 1.") + return value + + def clean_schedule(self): + schedule = (self.cleaned_data.get("schedule") or "").strip() + if not schedule: + return "" + + try: + validate_schedule(schedule) + except ValueError as err: + raise forms.ValidationError(str(err)) + + return schedule + + +class TagWidget(forms.TextInput): + def format_value(self, value): + if value is not None and not isinstance(value, str): + value = edit_string_for_tags(value) + return super().format_value(value) + + +class TagField(forms.CharField): + widget = TagWidget + + def clean(self, value): + value = super().clean(value) + try: + return parse_tags(value) + except ValueError: + raise forms.ValidationError( + "Please provide a comma-separated list of tags.", + ) + + def has_changed(self, initial, data): + # Always return False if the field is disabled since self.bound_data + # always uses the initial value in this case. 
+ if self.disabled: + return False + + try: + cleaned_data = self.clean(data) + except forms.ValidationError: + cleaned_data = data + + initial_value = [] if initial is None else initial + + if not isinstance(initial_value, list): + initial_value = list(initial_value) + + normalized_initial = sorted(tag.name for tag in initial_value) + return normalized_initial != cleaned_data diff --git a/archivebox/core/management/__init__.py b/archivebox/core/management/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/archivebox/core/management/commands/__init__.py b/archivebox/core/management/commands/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/archivebox/core/management/commands/archivebox.py b/archivebox/core/management/commands/archivebox.py new file mode 100644 index 0000000000..4e663fe862 --- /dev/null +++ b/archivebox/core/management/commands/archivebox.py @@ -0,0 +1,17 @@ +__package__ = "archivebox" + +from django.core.management.base import BaseCommand + +from archivebox.cli import main as run_cli + + +class Command(BaseCommand): + help = "Run an ArchiveBox CLI subcommand (e.g. 
add, remove, list, etc)" + + def add_arguments(self, parser): + parser.add_argument("subcommand", type=str, help="The subcommand you want to run") + parser.add_argument("command_args", nargs="*", help="Arguments to pass to the subcommand") + + def handle(self, *args, **kwargs): + command_args = [kwargs["subcommand"], *kwargs["command_args"]] + run_cli(args=command_args) diff --git a/archivebox/core/middleware.py b/archivebox/core/middleware.py new file mode 100644 index 0000000000..6a99cea525 --- /dev/null +++ b/archivebox/core/middleware.py @@ -0,0 +1,362 @@ +__package__ = "archivebox.core" + +import ipaddress +import re +from pathlib import Path +from django.conf import settings +from django.utils import timezone +from django.contrib.auth.middleware import RemoteUserMiddleware +from django.contrib.auth.models import AnonymousUser +from django.core.exceptions import ImproperlyConfigured +from django.shortcuts import redirect +from django.contrib.staticfiles import finders +from django.utils.http import http_date +from django.http import HttpResponseForbidden, HttpResponseNotModified + +from archivebox.config.common import get_config +from archivebox.config import VERSION +from archivebox.config.version import get_COMMIT_HASH +from archivebox.core.routes_util import ( + build_snapshot_url, + build_admin_url, + build_web_url, + get_api_host, + get_admin_host, + get_base_host, + get_listen_host, + get_listen_subdomain, + get_web_host, + host_matches, + is_snapshot_subdomain, + split_host_port, +) +from archivebox.core.views import SnapshotHostView, OriginalDomainHostView + + +ADMIN_LOGIN_HINT_COOKIE = "archivebox_admin_logged_in" + + +def _admin_login_hint_cookie_domain(config) -> str | None: + """Resolve the parent domain to scope the cross-subdomain login hint. 
+ + NOTE: this cookie carries only the single bit "user is logged in on + admin somewhere"; it MUST NOT be confused with the session cookie, + which stays admin-host-scoped (see core/settings.py + SESSION_COOKIE_DOMAIN comment โ€” admin/web is a security boundary). + + Returns the hostname portion of ``get_base_host`` (which respects + ``BASE_URL`` and falls back to the local-bind mapping). Strips the + port โ€” cookie ``Domain=`` attributes don't include ports. Returns + ``None`` when subdomain routing is off, the base host is empty, or + the base host is an IP / bare ``localhost`` (browsers reject + cross-host cookies for those). + """ + if not config.USES_SUBDOMAIN_ROUTING: + return None + base_host = get_base_host(config=config) + if not base_host: + return None + hostname, _port = split_host_port(base_host) + if not hostname or hostname == "localhost": + return None + try: + ipaddress.ip_address(hostname) + except ValueError: + return hostname + return None + + +def detect_timezone(request, activate: bool = True): + gmt_offset = (request.COOKIES.get("GMT_OFFSET") or "").strip() + tz = None + if gmt_offset.replace("-", "").isdigit(): + tz = timezone.get_fixed_timezone(int(gmt_offset)) + if activate: + timezone.activate(tz) + # print('GMT_OFFSET', gmt_offset, tz) + return tz + + +def TimezoneMiddleware(get_response): + def middleware(request): + detect_timezone(request, activate=True) + return get_response(request) + + return middleware + + +def AdminCookieIsolationMiddleware(get_response): + def middleware(request): + response = get_response(request) + + config = request.__dict__.get("archivebox_config") + if config is None: + config = get_config(resolve_plugins=False) + request.archivebox_config = config + if not config.USES_SUBDOMAIN_ROUTING: + return response + + request_host = (request.get_host() or "").lower() + if host_matches(request_host, get_admin_host(config=config)): + return response + + if host_matches(request_host, get_web_host(config=config)): + 
for cookie_name in tuple(response.cookies.keys()): + if cookie_name != ADMIN_LOGIN_HINT_COOKIE: + response.cookies.pop(cookie_name, None) + return response + + response.cookies.pop(settings.SESSION_COOKIE_NAME, None) + response.cookies.pop(settings.CSRF_COOKIE_NAME, None) + return response + + return middleware + + +def CacheControlMiddleware(get_response): + snapshot_path_re = re.compile(r"^/[^/]+/\\d{8}/[^/]+/[0-9a-fA-F-]{8,36}/") + static_cache_key = (get_COMMIT_HASH() or VERSION or "dev").strip() + + def middleware(request): + response = get_response(request) + + if request.path.startswith("/static/"): + rel_path = request.path[len("/static/") :] + static_path = finders.find(rel_path) + if static_path: + try: + mtime = Path(static_path).stat().st_mtime + except OSError: + mtime = None + etag = f'"{static_cache_key}:{int(mtime) if mtime else 0}"' + inm = request.META.get("HTTP_IF_NONE_MATCH") + if inm: + inm_list = [item.strip() for item in inm.split(",")] + if etag in inm_list or etag.strip('"') in [i.strip('"') for i in inm_list]: + not_modified = HttpResponseNotModified() + not_modified.headers["ETag"] = etag + not_modified.headers["Cache-Control"] = "public, max-age=31536000, immutable" + if mtime: + not_modified.headers["Last-Modified"] = http_date(mtime) + return not_modified + response.headers["ETag"] = etag + response.headers["Cache-Control"] = "public, max-age=31536000, immutable" + if mtime and not response.headers.get("Last-Modified"): + response.headers["Last-Modified"] = http_date(mtime) + return response + + if "/archive/" in request.path or "/static/" in request.path or snapshot_path_re.match(request.path): + if not response.get("Cache-Control"): + config = request.__dict__.get("archivebox_config") + if config is None: + config = get_config(resolve_plugins=False) + request.archivebox_config = config + policy = "private" if config.PERMISSIONS == "private" else "public" + response["Cache-Control"] = f"{policy}, max-age=60, 
stale-while-revalidate=300" + # print('Set Cache-Control header to', response['Cache-Control']) + return response + + return middleware + + +def ServerSecurityModeMiddleware(get_response): + blocked_prefixes = ("/admin", "/accounts", "/api", "/add", "/web") + allowed_methods = {"GET", "HEAD", "OPTIONS"} + + def middleware(request): + config = request.__dict__.get("archivebox_config") + if config is None: + config = get_config(resolve_plugins=False) + request.archivebox_config = config + + if config.CONTROL_PLANE_ENABLED: + return get_response(request) + + request.user = AnonymousUser() + request._cached_user = request.user + + if request.method.upper() not in allowed_methods: + return HttpResponseForbidden("ArchiveBox is running with the control plane disabled in this security mode.") + + for prefix in blocked_prefixes: + if request.path == prefix or request.path.startswith(f"{prefix}/"): + return HttpResponseForbidden("ArchiveBox is running with the control plane disabled in this security mode.") + + return get_response(request) + + return middleware + + +def HostRoutingMiddleware(get_response): + snapshot_path_re = re.compile( + r"^/(?P<username>[^/]+)/(?P<date>\d{4}(?:\d{2})?(?:\d{2})?)/(?P<domain>[^/]+)/(?P<snapshot_id>[0-9a-fA-F-]{8,36})(?:/(?P<path>.*))?$", + ) + + def middleware(request): + if request.path in {"/health", "/health/"}: + return get_response(request) + + request_host = (request.get_host() or "").lower() + config = request.__dict__.get("archivebox_config") + if config is None: + config = get_config(resolve_plugins=False) + request.archivebox_config = config + admin_host = get_admin_host(config=config) + web_host = get_web_host(config=config) + api_host = get_api_host(config=config) + listen_host = get_listen_host(config=config) + subdomain = get_listen_subdomain(request_host, config=config) + + # Framework-owned assets must bypass snapshot/original-domain replay routing. 
+ # Otherwise pages on snapshot subdomains can receive HTML for JS/CSS requests. + if request.path.startswith("/static/") or request.path in {"/favicon.ico", "/robots.txt"}: + return get_response(request) + + if config.USES_SUBDOMAIN_ROUTING and config.BASE_URL and not host_matches(request_host, admin_host): + add_should_redirect = not config.PUBLIC_ADD_VIEW and (request.path == "/add" or request.path.startswith("/add/")) + if ( + request.path == "/admin" + or request.path.startswith("/admin/") + or request.path == "/accounts" + or request.path.startswith("/accounts/") + or add_should_redirect + ): + target = build_admin_url(request.path, request=request) + if request.META.get("QUERY_STRING"): + target = f"{target}?{request.META['QUERY_STRING']}" + return redirect(target) + + if subdomain and is_snapshot_subdomain(subdomain): + view = SnapshotHostView.as_view() + return view(request, snapshot_id=subdomain, path=request.path.lstrip("/")) + + # In subdomain mode with no explicit BASE_URL we can't safely emit + # ``admin.``/``web.``/``snap-*.`` redirects: every URL builder uses the + # request's own Host (via the request-host fallback in get_base_url), + # so prepending ``admin.`` to whatever the client sent produces a + # redirect chain of ``admin.admin.admin.<host>``. Pass the request + # through; the misconfig banner on the rendered page tells the user + # to pin BASE_URL so the redirects can resume. 
+ if config.USES_SUBDOMAIN_ROUTING and not config.BASE_URL: + return get_response(request) + + if not config.USES_SUBDOMAIN_ROUTING: + if host_matches(request_host, listen_host): + return get_response(request) + + req_host, req_port = split_host_port(request_host) + listen_host_only, listen_port = split_host_port(listen_host) + if req_host.endswith(f".{listen_host_only}"): + if not listen_port or not req_port or listen_port == req_port: + target = build_web_url(request.path, request=request) + if request.META.get("QUERY_STRING"): + target = f"{target}?{request.META['QUERY_STRING']}" + return redirect(target) + + return get_response(request) + + if host_matches(request_host, admin_host): + snapshot_match = snapshot_path_re.match(request.path) + if config.USES_SUBDOMAIN_ROUTING and snapshot_match: + snapshot_id = snapshot_match.group("snapshot_id") + replay_path = (snapshot_match.group("path") or "").strip("/") + if replay_path == "index.html": + replay_path = "" + target = build_snapshot_url(snapshot_id, replay_path, request=request) + if request.META.get("QUERY_STRING"): + target = f"{target}?{request.META['QUERY_STRING']}" + return redirect(target) + response = get_response(request) + hint_cookie_domain = _admin_login_hint_cookie_domain(config) + if request.user.is_authenticated and not request.path.startswith("/admin/logout"): + response.set_cookie( + ADMIN_LOGIN_HINT_COOKIE, + "1", + max_age=1209600, + domain=hint_cookie_domain, + secure=request.is_secure(), + httponly=True, + samesite="Lax", + ) + else: + response.delete_cookie(ADMIN_LOGIN_HINT_COOKIE, domain=hint_cookie_domain, samesite="Lax") + return response + + if host_matches(request_host, api_host): + request.user = AnonymousUser() + request._cached_user = request.user + if request.path.startswith("/admin"): + target = build_admin_url(request.path, request=request) + if request.META.get("QUERY_STRING"): + target = f"{target}?{request.META['QUERY_STRING']}" + return redirect(target) + if not 
request.path.startswith("/api/"): + target_path = f"/api{request.path if request.path.startswith('/') else f'/{request.path}'}" + if request.META.get("QUERY_STRING"): + target_path = f"{target_path}?{request.META['QUERY_STRING']}" + return redirect(target_path) + return get_response(request) + + if host_matches(request_host, web_host): + if request.COOKIES.get(ADMIN_LOGIN_HINT_COOKIE) == "1" and (request.path == "/public" or request.path.startswith("/public/")): + target = build_admin_url("/admin/core/snapshot/", request=request) + return redirect(target) + request.user = AnonymousUser() + request._cached_user = request.user + return get_response(request) + + if subdomain: + view = OriginalDomainHostView.as_view() + return view(request, domain=subdomain, path=request.path.lstrip("/")) + + if host_matches(request_host, listen_host): + target = build_web_url(request.path, request=request) + if request.META.get("QUERY_STRING"): + target = f"{target}?{request.META['QUERY_STRING']}" + return redirect(target) + + if (admin_host or web_host) and config.BASE_URL: + # Only force a canonical-host redirect when BASE_URL was set + # explicitly. If BASE_URL is empty (e.g. 0.7.3 โ†’ 0.9.0 upgrade + # where the user has CSRF_TRUSTED_ORIGINS but never set BASE_URL), + # the subdomain we'd redirect to may not actually resolve in the + # user's reverse proxy โ€” serve the request as-is instead and let + # the misconfig banner surface the problem in the page. 
+ target = build_web_url(request.path, request=request) + if target: + if request.META.get("QUERY_STRING"): + target = f"{target}?{request.META['QUERY_STRING']}" + return redirect(target) + + return get_response(request) + + return middleware + + +class ReverseProxyAuthMiddleware(RemoteUserMiddleware): + header = "HTTP_REMOTE_USER" + + def process_request(self, request): + config = request.__dict__.get("archivebox_config") + if config is None: + config = get_config(resolve_plugins=False) + request.archivebox_config = config + self.header = "HTTP_{normalized}".format(normalized=config.REVERSE_PROXY_USER_HEADER.replace("-", "_").upper()) + if config.REVERSE_PROXY_WHITELIST == "": + return + + ip = request.META.get("REMOTE_ADDR") + if not isinstance(ip, str): + return + + for cidr in config.REVERSE_PROXY_WHITELIST.split(","): + try: + network = ipaddress.ip_network(cidr) + except ValueError: + raise ImproperlyConfigured( + "The REVERSE_PROXY_WHITELIST config parameter is in invalid format, or " + "contains invalid CIDR. 
Correct format is a coma-separated list of IPv4/IPv6 CIDRs.", + ) + + if ipaddress.ip_address(ip) in network: + return super().process_request(request) diff --git a/archivebox/core/migrations/0001_initial.py b/archivebox/core/migrations/0001_initial.py new file mode 100644 index 0000000000..f64cdccab1 --- /dev/null +++ b/archivebox/core/migrations/0001_initial.py @@ -0,0 +1,25 @@ +# Generated by Django 2.2 on 2019-05-01 03:27 + +from django.db import migrations, models +import uuid + + +class Migration(migrations.Migration): + initial = True + + dependencies = [] + + operations = [ + migrations.CreateModel( + name="Snapshot", + fields=[ + ("id", models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)), + ("url", models.URLField(unique=True)), + ("timestamp", models.CharField(default=None, max_length=32, null=True, unique=True)), + ("title", models.CharField(default=None, max_length=128, null=True)), + ("tags", models.CharField(default=None, max_length=256, null=True)), + ("added", models.DateTimeField(auto_now_add=True)), + ("updated", models.DateTimeField(default=None, null=True)), + ], + ), + ] diff --git a/archivebox/core/migrations/0002_auto_20200625_1521.py b/archivebox/core/migrations/0002_auto_20200625_1521.py new file mode 100644 index 0000000000..ff825ba642 --- /dev/null +++ b/archivebox/core/migrations/0002_auto_20200625_1521.py @@ -0,0 +1,17 @@ +# Generated by Django 3.0.7 on 2020-06-25 15:21 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0001_initial"), + ] + + operations = [ + migrations.AlterField( + model_name="snapshot", + name="timestamp", + field=models.CharField(default=None, max_length=32, null=True), + ), + ] diff --git a/archivebox/core/migrations/0003_auto_20200630_1034.py b/archivebox/core/migrations/0003_auto_20200630_1034.py new file mode 100644 index 0000000000..0d378f07b0 --- /dev/null +++ 
b/archivebox/core/migrations/0003_auto_20200630_1034.py @@ -0,0 +1,37 @@ +# Generated by Django 3.0.7 on 2020-06-30 10:34 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0002_auto_20200625_1521"), + ] + + operations = [ + migrations.AlterField( + model_name="snapshot", + name="added", + field=models.DateTimeField(auto_now_add=True, db_index=True), + ), + migrations.AlterField( + model_name="snapshot", + name="tags", + field=models.CharField(db_index=True, default=None, max_length=256, null=True), + ), + migrations.AlterField( + model_name="snapshot", + name="timestamp", + field=models.CharField(db_index=True, default=None, max_length=32, null=True), + ), + migrations.AlterField( + model_name="snapshot", + name="title", + field=models.CharField(db_index=True, default=None, max_length=128, null=True), + ), + migrations.AlterField( + model_name="snapshot", + name="updated", + field=models.DateTimeField(db_index=True, default=None, null=True), + ), + ] diff --git a/archivebox/core/migrations/0004_auto_20200713_1552.py b/archivebox/core/migrations/0004_auto_20200713_1552.py new file mode 100644 index 0000000000..02f2738c95 --- /dev/null +++ b/archivebox/core/migrations/0004_auto_20200713_1552.py @@ -0,0 +1,18 @@ +# Generated by Django 3.0.7 on 2020-07-13 15:52 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0003_auto_20200630_1034"), + ] + + operations = [ + migrations.AlterField( + model_name="snapshot", + name="timestamp", + field=models.CharField(db_index=True, default=None, max_length=32, unique=True), + preserve_default=False, + ), + ] diff --git a/archivebox/core/migrations/0005_auto_20200728_0326.py b/archivebox/core/migrations/0005_auto_20200728_0326.py new file mode 100644 index 0000000000..8b1c32e5c8 --- /dev/null +++ b/archivebox/core/migrations/0005_auto_20200728_0326.py @@ -0,0 +1,27 @@ +# Generated by Django 3.0.7 on 
2020-07-28 03:26 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0004_auto_20200713_1552"), + ] + + operations = [ + migrations.AlterField( + model_name="snapshot", + name="tags", + field=models.CharField(blank=True, db_index=True, max_length=256, null=True), + ), + migrations.AlterField( + model_name="snapshot", + name="title", + field=models.CharField(blank=True, db_index=True, max_length=128, null=True), + ), + migrations.AlterField( + model_name="snapshot", + name="updated", + field=models.DateTimeField(blank=True, db_index=True, null=True), + ), + ] diff --git a/archivebox/core/migrations/0006_auto_20201012_1520.py b/archivebox/core/migrations/0006_auto_20201012_1520.py new file mode 100644 index 0000000000..103a28776a --- /dev/null +++ b/archivebox/core/migrations/0006_auto_20201012_1520.py @@ -0,0 +1,64 @@ +# Generated by Django 3.0.8 on 2020-10-12 15:20 + +from django.db import migrations, models +from django.utils.text import slugify + + +def forwards_func(apps, schema_editor): + SnapshotModel = apps.get_model("core", "Snapshot") + TagModel = apps.get_model("core", "Tag") + + snapshots = SnapshotModel.objects.all() + for snapshot in snapshots: + tag_set = {tag.strip() for tag in (snapshot.tags_old or "").split(",")} + tag_set.discard("") + + for tag in tag_set: + to_add, _ = TagModel.objects.get_or_create(name=tag, defaults={"slug": slugify(tag)}) + snapshot.tags.add(to_add) + + +def reverse_func(apps, schema_editor): + SnapshotModel = apps.get_model("core", "Snapshot") + + snapshots = SnapshotModel.objects.all() + for snapshot in snapshots: + tags = snapshot.tags.values_list("name", flat=True) + snapshot.tags_old = ",".join([tag for tag in tags]) + snapshot.save() + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0005_auto_20200728_0326"), + ] + + operations = [ + migrations.RenameField( + model_name="snapshot", + old_name="tags", + new_name="tags_old", + ), + 
migrations.CreateModel( + name="Tag", + fields=[ + ("id", models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), + ("name", models.CharField(max_length=100, unique=True, verbose_name="name")), + ("slug", models.SlugField(max_length=100, unique=True, verbose_name="slug")), + ], + options={ + "verbose_name": "Tag", + "verbose_name_plural": "Tags", + }, + ), + migrations.AddField( + model_name="snapshot", + name="tags", + field=models.ManyToManyField(to="core.Tag"), + ), + migrations.RunPython(forwards_func, reverse_func), + migrations.RemoveField( + model_name="snapshot", + name="tags_old", + ), + ] diff --git a/archivebox/core/migrations/0007_archiveresult.py b/archivebox/core/migrations/0007_archiveresult.py new file mode 100644 index 0000000000..fcdfcd1c03 --- /dev/null +++ b/archivebox/core/migrations/0007_archiveresult.py @@ -0,0 +1,157 @@ +# Generated by Django 3.0.8 on 2020-11-04 12:25 + +import json +from pathlib import Path + +from django.db import migrations, models +import django.db.models.deletion + +# Handle old vs new import paths +try: + from archivebox.config import CONSTANTS + + ARCHIVE_DIR = CONSTANTS.ARCHIVE_DIR +except ImportError: + ARCHIVE_DIR = Path("./archive") + +try: + from archivebox.misc.util import to_json +except ImportError: + try: + from index.json import to_json + except ImportError: + to_json = lambda x: json.dumps(x, indent=4, default=str) + +try: + JSONField = models.JSONField +except AttributeError: + import jsonfield + + JSONField = jsonfield.JSONField + + +def forwards_func(apps, schema_editor): + Snapshot = apps.get_model("core", "Snapshot") + ArchiveResult = apps.get_model("core", "ArchiveResult") + + snapshots = Snapshot.objects.all() + for snapshot in snapshots: + out_dir = Path(ARCHIVE_DIR) / snapshot.timestamp + + try: + with open(out_dir / "index.json") as f: + fs_index = json.load(f) + except Exception: + continue + + history = fs_index["history"] + + for extractor in history: + for 
result in history[extractor]: + try: + ArchiveResult.objects.create( + extractor=extractor, + snapshot=snapshot, + pwd=result["pwd"], + cmd=result.get("cmd") or [], + cmd_version=result.get("cmd_version") or "unknown", + start_ts=result["start_ts"], + end_ts=result["end_ts"], + status=result["status"], + output=result.get("output") or "null", + ) + except Exception as e: + print( + " ! Skipping import due to missing/invalid index.json:", + out_dir, + e, + "(open an issue with this index.json for help)", + ) + + +def verify_json_index_integrity(snapshot): + results = snapshot.archiveresult_set.all() + out_dir = Path(ARCHIVE_DIR) / snapshot.timestamp + with open(out_dir / "index.json") as f: + index = json.load(f) + + history = index["history"] + index_results = [result for extractor in history for result in history[extractor]] + flattened_results = [result["start_ts"] for result in index_results] + + missing_results = [result for result in results if result.start_ts.isoformat() not in flattened_results] + + for missing in missing_results: + index["history"][missing.extractor].append( + { + "cmd": missing.cmd, + "cmd_version": missing.cmd_version, + "end_ts": missing.end_ts.isoformat(), + "start_ts": missing.start_ts.isoformat(), + "pwd": missing.pwd, + "output": missing.output, + "schema": "ArchiveResult", + "status": missing.status, + }, + ) + + json_index = to_json(index) + with open(out_dir / "index.json", "w") as f: + f.write(json_index) + + +def reverse_func(apps, schema_editor): + Snapshot = apps.get_model("core", "Snapshot") + ArchiveResult = apps.get_model("core", "ArchiveResult") + for snapshot in Snapshot.objects.all(): + verify_json_index_integrity(snapshot) + + ArchiveResult.objects.all().delete() + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0006_auto_20201012_1520"), + ] + + operations = [ + migrations.CreateModel( + name="ArchiveResult", + fields=[ + ("id", models.AutoField(auto_created=True, primary_key=True, 
serialize=False, verbose_name="ID")), + ("cmd", JSONField()), + ("pwd", models.CharField(max_length=256)), + ("cmd_version", models.CharField(max_length=32)), + ( + "status", + models.CharField(choices=[("succeeded", "succeeded"), ("failed", "failed"), ("skipped", "skipped")], max_length=16), + ), + ("output", models.CharField(max_length=512)), + ("start_ts", models.DateTimeField()), + ("end_ts", models.DateTimeField()), + ( + "extractor", + models.CharField( + choices=[ + ("title", "title"), + ("favicon", "favicon"), + ("wget", "wget"), + ("singlefile", "singlefile"), + ("pdf", "pdf"), + ("screenshot", "screenshot"), + ("dom", "dom"), + ("readability", "readability"), + ("mercury", "mercury"), + ("git", "git"), + ("media", "media"), + ("headers", "headers"), + ("archivedotorg", "archivedotorg"), + ], + max_length=32, + ), + ), + ("snapshot", models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to="core.Snapshot")), + ], + ), + migrations.RunPython(forwards_func, reverse_func), + ] diff --git a/archivebox/core/migrations/0008_auto_20210105_1421.py b/archivebox/core/migrations/0008_auto_20210105_1421.py new file mode 100644 index 0000000000..68c408e7e4 --- /dev/null +++ b/archivebox/core/migrations/0008_auto_20210105_1421.py @@ -0,0 +1,17 @@ +# Generated by Django 3.1.3 on 2021-01-05 14:21 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0007_archiveresult"), + ] + + operations = [ + migrations.AlterField( + model_name="archiveresult", + name="cmd_version", + field=models.CharField(blank=True, default=None, max_length=32, null=True), + ), + ] diff --git a/archivebox/core/migrations/0009_auto_20210216_1038.py b/archivebox/core/migrations/0009_auto_20210216_1038.py new file mode 100644 index 0000000000..41747426ac --- /dev/null +++ b/archivebox/core/migrations/0009_auto_20210216_1038.py @@ -0,0 +1,17 @@ +# Generated by Django 3.1.3 on 2021-02-16 10:38 + +from django.db import migrations, 
models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0008_auto_20210105_1421"), + ] + + operations = [ + migrations.AlterField( + model_name="snapshot", + name="updated", + field=models.DateTimeField(auto_now=True, db_index=True, null=True), + ), + ] diff --git a/archivebox/core/migrations/0010_auto_20210216_1055.py b/archivebox/core/migrations/0010_auto_20210216_1055.py new file mode 100644 index 0000000000..14bc18fd01 --- /dev/null +++ b/archivebox/core/migrations/0010_auto_20210216_1055.py @@ -0,0 +1,17 @@ +# Generated by Django 3.1.3 on 2021-02-16 10:55 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0009_auto_20210216_1038"), + ] + + operations = [ + migrations.AlterField( + model_name="archiveresult", + name="start_ts", + field=models.DateTimeField(db_index=True), + ), + ] diff --git a/archivebox/core/migrations/0011_auto_20210216_1331.py b/archivebox/core/migrations/0011_auto_20210216_1331.py new file mode 100644 index 0000000000..a1f6e7539c --- /dev/null +++ b/archivebox/core/migrations/0011_auto_20210216_1331.py @@ -0,0 +1,40 @@ +# Generated by Django 3.1.3 on 2021-02-16 13:31 + +from django.db import migrations, models +import uuid + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0010_auto_20210216_1055"), + ] + + operations = [ + migrations.AddField( + model_name="archiveresult", + name="uuid", + field=models.UUIDField(default=uuid.uuid4, editable=False), + ), + migrations.AlterField( + model_name="archiveresult", + name="extractor", + field=models.CharField( + choices=[ + ("title", "title"), + ("favicon", "favicon"), + ("headers", "headers"), + ("singlefile", "singlefile"), + ("pdf", "pdf"), + ("screenshot", "screenshot"), + ("dom", "dom"), + ("wget", "wget"), + ("readability", "readability"), + ("mercury", "mercury"), + ("git", "git"), + ("media", "media"), + ("archivedotorg", "archivedotorg"), + ], + max_length=32, + ), + ), + 
] diff --git a/archivebox/core/migrations/0012_auto_20210216_1425.py b/archivebox/core/migrations/0012_auto_20210216_1425.py new file mode 100644 index 0000000000..27beb89797 --- /dev/null +++ b/archivebox/core/migrations/0012_auto_20210216_1425.py @@ -0,0 +1,22 @@ +# Generated by Django 3.1.3 on 2021-02-16 14:25 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0011_auto_20210216_1331"), + ] + + operations = [ + migrations.AlterField( + model_name="archiveresult", + name="cmd_version", + field=models.CharField(blank=True, default=None, max_length=128, null=True), + ), + migrations.AlterField( + model_name="archiveresult", + name="output", + field=models.CharField(max_length=1024), + ), + ] diff --git a/archivebox/core/migrations/0013_auto_20210218_0729.py b/archivebox/core/migrations/0013_auto_20210218_0729.py new file mode 100644 index 0000000000..a774f156fa --- /dev/null +++ b/archivebox/core/migrations/0013_auto_20210218_0729.py @@ -0,0 +1,17 @@ +# Generated by Django 3.1.3 on 2021-02-18 07:29 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0012_auto_20210216_1425"), + ] + + operations = [ + migrations.AlterField( + model_name="snapshot", + name="title", + field=models.CharField(blank=True, db_index=True, max_length=256, null=True), + ), + ] diff --git a/archivebox/core/migrations/0014_auto_20210218_0729.py b/archivebox/core/migrations/0014_auto_20210218_0729.py new file mode 100644 index 0000000000..d14211a6fd --- /dev/null +++ b/archivebox/core/migrations/0014_auto_20210218_0729.py @@ -0,0 +1,17 @@ +# Generated by Django 3.1.3 on 2021-02-18 07:29 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0013_auto_20210218_0729"), + ] + + operations = [ + migrations.AlterField( + model_name="snapshot", + name="title", + field=models.CharField(blank=True, 
db_index=True, max_length=1024, null=True), + ), + ] diff --git a/archivebox/core/migrations/0015_auto_20210218_0730.py b/archivebox/core/migrations/0015_auto_20210218_0730.py new file mode 100644 index 0000000000..e2d99cdb9b --- /dev/null +++ b/archivebox/core/migrations/0015_auto_20210218_0730.py @@ -0,0 +1,17 @@ +# Generated by Django 3.1.3 on 2021-02-18 07:30 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0014_auto_20210218_0729"), + ] + + operations = [ + migrations.AlterField( + model_name="snapshot", + name="title", + field=models.CharField(blank=True, db_index=True, max_length=512, null=True), + ), + ] diff --git a/archivebox/core/migrations/0016_auto_20210218_1204.py b/archivebox/core/migrations/0016_auto_20210218_1204.py new file mode 100644 index 0000000000..1b9961729a --- /dev/null +++ b/archivebox/core/migrations/0016_auto_20210218_1204.py @@ -0,0 +1,17 @@ +# Generated by Django 3.1.3 on 2021-02-18 12:04 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0015_auto_20210218_0730"), + ] + + operations = [ + migrations.AlterField( + model_name="snapshot", + name="tags", + field=models.ManyToManyField(blank=True, to="core.Tag"), + ), + ] diff --git a/archivebox/core/migrations/0017_auto_20210219_0211.py b/archivebox/core/migrations/0017_auto_20210219_0211.py new file mode 100644 index 0000000000..4a9a4c827a --- /dev/null +++ b/archivebox/core/migrations/0017_auto_20210219_0211.py @@ -0,0 +1,17 @@ +# Generated by Django 3.1.3 on 2021-02-19 02:11 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0016_auto_20210218_1204"), + ] + + operations = [ + migrations.AlterField( + model_name="tag", + name="slug", + field=models.SlugField(blank=True, max_length=100, unique=True, verbose_name="slug"), + ), + ] diff --git 
a/archivebox/core/migrations/0018_auto_20210327_0952.py b/archivebox/core/migrations/0018_auto_20210327_0952.py new file mode 100644 index 0000000000..dc5b2d1f58 --- /dev/null +++ b/archivebox/core/migrations/0018_auto_20210327_0952.py @@ -0,0 +1,22 @@ +# Generated by Django 3.1.3 on 2021-03-27 09:52 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0017_auto_20210219_0211"), + ] + + operations = [ + migrations.AlterField( + model_name="tag", + name="name", + field=models.CharField(max_length=100, unique=True), + ), + migrations.AlterField( + model_name="tag", + name="slug", + field=models.SlugField(blank=True, max_length=100, unique=True), + ), + ] diff --git a/archivebox/core/migrations/0019_auto_20210401_0654.py b/archivebox/core/migrations/0019_auto_20210401_0654.py new file mode 100644 index 0000000000..846bb61961 --- /dev/null +++ b/archivebox/core/migrations/0019_auto_20210401_0654.py @@ -0,0 +1,17 @@ +# Generated by Django 3.1.3 on 2021-04-01 06:54 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0018_auto_20210327_0952"), + ] + + operations = [ + migrations.AlterField( + model_name="snapshot", + name="url", + field=models.URLField(db_index=True, unique=True), + ), + ] diff --git a/archivebox/core/migrations/0020_auto_20210410_1031.py b/archivebox/core/migrations/0020_auto_20210410_1031.py new file mode 100644 index 0000000000..610eaa43b6 --- /dev/null +++ b/archivebox/core/migrations/0020_auto_20210410_1031.py @@ -0,0 +1,22 @@ +# Generated by Django 3.1.8 on 2021-04-10 10:31 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0019_auto_20210401_0654"), + ] + + operations = [ + migrations.AlterField( + model_name="archiveresult", + name="id", + field=models.AutoField(primary_key=True, serialize=False, verbose_name="ID"), + ), + migrations.AlterField( + 
model_name="tag", + name="id", + field=models.AutoField(primary_key=True, serialize=False, verbose_name="ID"), + ), + ] diff --git a/archivebox/core/migrations/0021_auto_20220914_0934.py b/archivebox/core/migrations/0021_auto_20220914_0934.py new file mode 100644 index 0000000000..3f757723fd --- /dev/null +++ b/archivebox/core/migrations/0021_auto_20220914_0934.py @@ -0,0 +1,34 @@ +# Generated by Django 3.1.14 on 2022-09-14 09:34 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0020_auto_20210410_1031"), + ] + + operations = [ + migrations.AlterField( + model_name="archiveresult", + name="extractor", + field=models.CharField( + choices=[ + ("favicon", "favicon"), + ("headers", "headers"), + ("singlefile", "singlefile"), + ("pdf", "pdf"), + ("screenshot", "screenshot"), + ("dom", "dom"), + ("wget", "wget"), + ("title", "title"), + ("readability", "readability"), + ("mercury", "mercury"), + ("git", "git"), + ("media", "media"), + ("archivedotorg", "archivedotorg"), + ], + max_length=32, + ), + ), + ] diff --git a/archivebox/core/migrations/0022_auto_20231023_2008.py b/archivebox/core/migrations/0022_auto_20231023_2008.py new file mode 100644 index 0000000000..43dd1a69b3 --- /dev/null +++ b/archivebox/core/migrations/0022_auto_20231023_2008.py @@ -0,0 +1,35 @@ +# Generated by Django 3.1.14 on 2023-10-23 20:08 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0021_auto_20220914_0934"), + ] + + operations = [ + migrations.AlterField( + model_name="archiveresult", + name="extractor", + field=models.CharField( + choices=[ + ("favicon", "favicon"), + ("headers", "headers"), + ("singlefile", "singlefile"), + ("pdf", "pdf"), + ("screenshot", "screenshot"), + ("dom", "dom"), + ("wget", "wget"), + ("title", "title"), + ("readability", "readability"), + ("mercury", "mercury"), + ("htmltotext", "htmltotext"), + ("git", "git"), + ("media", 
"media"), + ("archivedotorg", "archivedotorg"), + ], + max_length=32, + ), + ), + ] diff --git a/archivebox/core/migrations/0023_upgrade_to_0_9_0.py b/archivebox/core/migrations/0023_upgrade_to_0_9_0.py new file mode 100644 index 0000000000..1dfb13ed71 --- /dev/null +++ b/archivebox/core/migrations/0023_upgrade_to_0_9_0.py @@ -0,0 +1,574 @@ +# Generated by hand on 2025-12-29 +# Upgrades core app from v0.7.2/v0.8.6rc0 (migration 0022) to v0.9.0 using raw SQL +# Handles both fresh installs and upgrades from v0.7.2/v0.8.6rc0 + +from django.db import migrations, models, connection +import django.utils.timezone +from uuid import UUID +from datetime import datetime +import json + + +PROGRESS_EVERY = 10000 + + +def get_table_columns(table_name): + """Get list of column names for a table.""" + cursor = connection.cursor() + cursor.execute(f"PRAGMA table_info({table_name})") + return {row[1] for row in cursor.fetchall()} + + +def normalize_cmd(cmd): + if not cmd: + return "[]" + try: + parsed = json.loads(cmd) + if isinstance(parsed, list): + return json.dumps([str(part) for part in parsed]) + except (TypeError, json.JSONDecodeError): + pass + return json.dumps(str(cmd).split()) + + +def normalize_status(status): + return { + "success": "succeeded", + "succeded": "succeeded", + "succeeded": "succeeded", + "failed": "failed", + "skipped": "skipped", + "noresults": "noresults", + "queued": "queued", + "started": "started", + "backoff": "backoff", + }.get(str(status or "").strip().lower(), "failed") + + +def upgrade_core_tables(apps, schema_editor): + """Upgrade core tables from v0.7.2 or v0.8.6rc0 to v0.9.0.""" + from archivebox.uuid_compat import uuid7 + + cursor = connection.cursor() + + # Check if core_archiveresult table exists + cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='core_archiveresult'") + if not cursor.fetchone(): + # Fresh install - no migration needed, tables will be created by later migrations + return + + # Check if table has 
any rows + cursor.execute("SELECT COUNT(*) FROM core_archiveresult") + row_count = cursor.fetchone()[0] + has_data = row_count > 0 + + # Detect which version we're migrating from + archiveresult_cols = get_table_columns("core_archiveresult") + has_uuid = "uuid" in archiveresult_cols + has_abid = "abid" in archiveresult_cols + + if has_data: + source_schema = "0.8.x abid" if has_abid and not has_uuid else "0.8.x uuid" if has_uuid else "0.7.x" + print(f" - Rebuilding core tables from {source_schema} schema ({row_count} ArchiveResults)...") + + # ============================================================================ + # PART 1: Upgrade core_archiveresult table + # ============================================================================ + # Create minimal table with only OLD fields that exist in v0.7.2/v0.8.6rc0 + # Migration 0025 will add the NEW fields (plugin, hook_name, output_files, etc.) + cursor.execute(""" + CREATE TABLE IF NOT EXISTS core_archiveresult_new ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + uuid TEXT, + snapshot_id TEXT NOT NULL, + cmd TEXT, + pwd VARCHAR(256), + cmd_version VARCHAR(128), + start_ts DATETIME, + end_ts DATETIME, + status VARCHAR(15) NOT NULL DEFAULT 'queued', + extractor VARCHAR(32), + output VARCHAR(1024), + created_at DATETIME, + modified_at DATETIME, + + FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE + ); + """) + + if has_data: + has_archiveresult_created_at = "created_at" in archiveresult_cols + has_archiveresult_modified_at = "modified_at" in archiveresult_cols + archiveresult_select_cols = [ + "id", + "snapshot_id", + "cmd", + "pwd", + "cmd_version", + "start_ts", + "end_ts", + "status", + "extractor", + "output", + ] + if has_archiveresult_created_at: + archiveresult_select_cols.append("created_at") + if has_archiveresult_modified_at: + archiveresult_select_cols.append("modified_at") + + if has_uuid and not has_abid: + # Migrating from v0.7.2+ (has uuid column) + print(f" copying {row_count} 
ArchiveResults...") + select_cols = ["id", "uuid", "snapshot_id", "cmd", "pwd", "cmd_version", "start_ts", "end_ts", "status", "extractor", "output"] + if has_archiveresult_created_at: + select_cols.append("created_at") + if has_archiveresult_modified_at: + select_cols.append("modified_at") + cursor.execute(f"SELECT {', '.join(select_cols)} FROM core_archiveresult") + old_records = cursor.fetchall() + for i, record in enumerate(old_records, start=1): + values = dict(zip(select_cols, record)) + try: + new_uuid = UUID(str(values["uuid"])).hex + except (TypeError, ValueError): + new_uuid = uuid7().hex + start_ts = values["start_ts"] or datetime.now().isoformat() + end_ts = values["end_ts"] or start_ts + cursor.execute( + """ + INSERT OR IGNORE INTO core_archiveresult_new ( + id, uuid, snapshot_id, cmd, pwd, cmd_version, + start_ts, end_ts, status, extractor, output, + created_at, modified_at + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + """, + ( + values["id"], + new_uuid, + values["snapshot_id"], + normalize_cmd(values["cmd"]), + values["pwd"] or "", + values["cmd_version"] or "", + start_ts, + end_ts, + normalize_status(values["status"] or "queued"), + values["extractor"] or "", + values["output"] or "", + values.get("created_at") or start_ts, + values.get("modified_at") or end_ts, + ), + ) + if i % PROGRESS_EVERY == 0: + print(f" copied {i}/{len(old_records)} ArchiveResults...") + elif has_abid and not has_uuid: + # Migrating from v0.8.6rc0 (has abid instead of uuid) + print(f" copying {row_count} ArchiveResults...") + cursor.execute(f"SELECT {', '.join(archiveresult_select_cols)} FROM core_archiveresult") + old_records = cursor.fetchall() + for i, record in enumerate(old_records, start=1): + values = dict(zip(archiveresult_select_cols, record)) + try: + new_uuid = UUID(str(values["id"])).hex + except (TypeError, ValueError): + new_uuid = uuid7().hex + start_ts = values["start_ts"] or datetime.now().isoformat() + end_ts = values["end_ts"] or start_ts + 
cursor.execute( + """ + INSERT OR IGNORE INTO core_archiveresult_new ( + uuid, snapshot_id, cmd, pwd, cmd_version, + start_ts, end_ts, status, extractor, output, + created_at, modified_at + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + """, + ( + new_uuid, + values["snapshot_id"], + normalize_cmd(values["cmd"]), + values["pwd"] or "", + values["cmd_version"] or "", + start_ts, + end_ts, + normalize_status(values["status"] or "queued"), + values["extractor"] or "", + values["output"] or "", + values.get("created_at") or start_ts, + values.get("modified_at") or end_ts, + ), + ) + if i % PROGRESS_EVERY == 0: + print(f" copied {i}/{len(old_records)} ArchiveResults...") + else: + # Migrating from v0.7.2 (no uuid or abid column - generate fresh UUIDs) + print(f" copying {row_count} ArchiveResults...") + cursor.execute(f"SELECT {', '.join(archiveresult_select_cols)} FROM core_archiveresult") + old_records = cursor.fetchall() + for i, record in enumerate(old_records, start=1): + values = dict(zip(archiveresult_select_cols, record)) + new_uuid = uuid7().hex + start_ts = values["start_ts"] or datetime.now().isoformat() + end_ts = values["end_ts"] or start_ts + cursor.execute( + """ + INSERT OR IGNORE INTO core_archiveresult_new ( + id, uuid, snapshot_id, cmd, pwd, cmd_version, + start_ts, end_ts, status, extractor, output, + created_at, modified_at + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ """, + ( + values["id"], + new_uuid, + values["snapshot_id"], + normalize_cmd(values["cmd"]), + values["pwd"] or "", + values["cmd_version"] or "", + start_ts, + end_ts, + normalize_status(values["status"] or "queued"), + values["extractor"] or "", + values["output"] or "", + values.get("created_at") or start_ts, + values.get("modified_at") or end_ts, + ), + ) + if i % PROGRESS_EVERY == 0: + print(f" copied {i}/{len(old_records)} ArchiveResults...") + print(f" copied {len(old_records)} ArchiveResults") + + cursor.execute("DROP TABLE IF EXISTS core_archiveresult;") + cursor.execute("ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult;") + + # Don't create indexes - migration 0025 will handle them + + # ============================================================================ + # PART 2: Upgrade core_snapshot table + # ============================================================================ + # Create table with NEW field names for timestamps (bookmarked_at, created_at, modified_at) + # and all other fields needed by later migrations + cursor.execute(""" + CREATE TABLE IF NOT EXISTS core_snapshot_new ( + id TEXT PRIMARY KEY NOT NULL, + url TEXT NOT NULL, + timestamp VARCHAR(32) NOT NULL UNIQUE, + title VARCHAR(512), + crawl_id TEXT, + parent_snapshot_id TEXT, + + bookmarked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + + downloaded_at DATETIME, + status VARCHAR(15) NOT NULL DEFAULT 'queued', + retry_at DATETIME, + + depth INTEGER NOT NULL DEFAULT 0, + fs_version VARCHAR(10) NOT NULL DEFAULT '0.8.0', + config TEXT NOT NULL DEFAULT '{}', + notes TEXT NOT NULL DEFAULT '', + num_uses_succeeded INTEGER NOT NULL DEFAULT 0, + num_uses_failed INTEGER NOT NULL DEFAULT 0, + current_step INTEGER NOT NULL DEFAULT 0, + + FOREIGN KEY (crawl_id) REFERENCES crawls_crawl(id) ON DELETE CASCADE, + FOREIGN KEY (parent_snapshot_id) REFERENCES 
core_snapshot(id) ON DELETE SET NULL + ); + """) + + # Check if core_snapshot exists (it should) + cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='core_snapshot'") + if cursor.fetchone(): + # Check if table has any rows + cursor.execute("SELECT COUNT(*) FROM core_snapshot") + snapshot_has_data = cursor.fetchone()[0] > 0 + + if snapshot_has_data: + # Detect which version we're migrating from + snapshot_cols = get_table_columns("core_snapshot") + has_added = "added" in snapshot_cols + has_bookmarked_at = "bookmarked_at" in snapshot_cols + + if has_added and not has_bookmarked_at: + # Migrating from v0.7.2 (has added/updated fields) + print(" copying Snapshots from 0.7.x schema...") + # timestamp is the legacy bookmark/import timestamp and archive/{timestamp} identity. + # added is the DB row creation/import time, and updated was renamed to downloaded_at in 0.8.x. + cursor.execute(""" + INSERT OR IGNORE INTO core_snapshot_new ( + id, url, timestamp, title, + bookmarked_at, created_at, modified_at, downloaded_at, + status + ) + SELECT + id, url, timestamp, title, + COALESCE( + CASE + WHEN CAST(timestamp AS REAL) BETWEEN 788918400 AND 2082758400 + THEN datetime(CAST(timestamp AS REAL), 'unixepoch') + END, + added, + CURRENT_TIMESTAMP + ) as bookmarked_at, + COALESCE(added, CURRENT_TIMESTAMP) as created_at, + COALESCE(updated, added, CURRENT_TIMESTAMP) as modified_at, + updated as downloaded_at, + CASE + WHEN EXISTS ( + SELECT 1 FROM core_archiveresult + WHERE core_archiveresult.snapshot_id = core_snapshot.id + AND core_archiveresult.status IN ('queued', 'started', 'backoff') + ) + THEN 'queued' + WHEN EXISTS ( + SELECT 1 FROM core_archiveresult + WHERE core_archiveresult.snapshot_id = core_snapshot.id + ) + THEN 'sealed' + ELSE 'queued' + END as status + FROM core_snapshot; + """) + print(f" copied {cursor.rowcount} Snapshots") + elif has_bookmarked_at and not has_added: + # Migrating from v0.8.6rc0 (already has 
bookmarked_at/created_at/modified_at) + print(" copying Snapshots from 0.8.x schema...") + # Check what fields exist + has_status = "status" in snapshot_cols + has_retry_at = "retry_at" in snapshot_cols + has_crawl_id = "crawl_id" in snapshot_cols + + # Build column list based on what exists + insert_cols = ["id", "url", "timestamp", "title", "bookmarked_at", "created_at", "modified_at", "downloaded_at"] + select_cols = ["id", "url", "timestamp", "title", "bookmarked_at", "created_at", "modified_at", "downloaded_at"] + if has_crawl_id: + insert_cols.append("crawl_id") + select_cols.append("REPLACE(crawl_id, '-', '')") + if has_status: + insert_cols.append("status") + select_cols.append( + """ + CASE + WHEN status IN ('sealed', 'started', 'paused') THEN status + WHEN EXISTS ( + SELECT 1 FROM core_archiveresult + WHERE core_archiveresult.snapshot_id = core_snapshot.id + AND core_archiveresult.status IN ('queued', 'started', 'backoff') + ) + THEN status + WHEN EXISTS ( + SELECT 1 FROM core_archiveresult + WHERE core_archiveresult.snapshot_id = core_snapshot.id + ) + THEN 'sealed' + ELSE status + END + """, + ) + if has_retry_at: + insert_cols.append("retry_at") + select_cols.append("retry_at") + + cursor.execute(f""" + INSERT OR IGNORE INTO core_snapshot_new ({", ".join(insert_cols)}) + SELECT {", ".join(select_cols)} + FROM core_snapshot; + """) + print(f" copied {cursor.rowcount} Snapshots") + else: + print(f"Warning: Unexpected Snapshot schema - has_added={has_added}, has_bookmarked_at={has_bookmarked_at}") + + cursor.execute("DROP TABLE IF EXISTS core_snapshot;") + cursor.execute("ALTER TABLE core_snapshot_new RENAME TO core_snapshot;") + + # Create indexes + cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_url_idx ON core_snapshot(url);") + cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_timestamp_idx ON core_snapshot(timestamp);") + cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_bookmarked_at_idx ON core_snapshot(bookmarked_at);") + 
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_crawl_id_idx ON core_snapshot(crawl_id);") + cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_status_idx ON core_snapshot(status);") + cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_retry_at_idx ON core_snapshot(retry_at);") + cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_created_at_idx ON core_snapshot(created_at);") + cursor.execute("CREATE UNIQUE INDEX IF NOT EXISTS core_snapshot_url_crawl_unique ON core_snapshot(url, crawl_id);") + + # ============================================================================ + # PART 3: Upgrade core_tag table + # ============================================================================ + cursor.execute(""" + CREATE TABLE IF NOT EXISTS core_tag_new ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + + name VARCHAR(100) NOT NULL UNIQUE, + slug VARCHAR(100) NOT NULL UNIQUE, + + created_by_id INTEGER, + + FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE CASCADE + ); + """) + + cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='core_tag'") + if cursor.fetchone(): + # Check if table has any rows + cursor.execute("SELECT COUNT(*) FROM core_tag") + tag_has_data = cursor.fetchone()[0] > 0 + + if tag_has_data: + cursor.execute("PRAGMA table_info(core_tag)") + tag_id_type = None + for row in cursor.fetchall(): + if row[1] == "id": # row[1] is column name + tag_id_type = row[2] # row[2] is type + break + + if tag_id_type and "char" in tag_id_type.lower(): + # v0.8.6rc0: Tag IDs are UUIDs, need to convert to INTEGER + print(" converting Tag IDs from UUID to integers...") + + # Get all tags with their UUIDs + cursor.execute("SELECT id, name, slug, created_at, modified_at, created_by_id FROM core_tag ORDER BY name") + tags = cursor.fetchall() + + # Create mapping from old UUID to new INTEGER ID + 
uuid_to_int_map = {} + for i, tag in enumerate(tags, start=1): + old_id, name, slug, created_at, modified_at, created_by_id = tag + uuid_to_int_map[old_id] = i + # Insert with new INTEGER ID + cursor.execute( + """ + INSERT OR IGNORE INTO core_tag_new (id, name, slug, created_at, modified_at, created_by_id) + VALUES (?, ?, ?, ?, ?, ?) + """, + (i, name, slug, created_at, modified_at, created_by_id), + ) + if i % PROGRESS_EVERY == 0: + print(f" copied {i}/{len(tags)} Tags...") + + # Update snapshot_tags to use new INTEGER IDs + cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='core_snapshot_tags'") + if cursor.fetchone(): + cursor.execute("SELECT id, snapshot_id, tag_id FROM core_snapshot_tags") + snapshot_tags = cursor.fetchall() + + # Delete old entries + cursor.execute("DELETE FROM core_snapshot_tags") + + # Re-insert with new integer tag IDs + for i, (st_id, snapshot_id, old_tag_id) in enumerate(snapshot_tags, start=1): + new_tag_id = uuid_to_int_map.get(old_tag_id) + if new_tag_id: + cursor.execute( + """ + INSERT OR IGNORE INTO core_snapshot_tags (id, snapshot_id, tag_id) + VALUES (?, ?, ?) 
+ """, + (st_id, snapshot_id, new_tag_id), + ) + if i % PROGRESS_EVERY == 0: + print(f" copied {i}/{len(snapshot_tags)} SnapshotTag rows...") + print(f" copied {len(tags)} Tags") + else: + # v0.7.2: Tag IDs are already INTEGER + print(" copying Tags from 0.7.x schema...") + cursor.execute(""" + INSERT OR IGNORE INTO core_tag_new (id, name, slug) + SELECT id, name, slug + FROM core_tag; + """) + print(f" copied {cursor.rowcount} Tags") + + cursor.execute("DROP TABLE IF EXISTS core_tag;") + cursor.execute("ALTER TABLE core_tag_new RENAME TO core_tag;") + + # Create indexes + cursor.execute("CREATE INDEX IF NOT EXISTS core_tag_created_at_idx ON core_tag(created_at);") + cursor.execute("CREATE INDEX IF NOT EXISTS core_tag_created_by_id_idx ON core_tag(created_by_id);") + + if has_data: + print(" โœ“ Core table rebuild complete") + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0022_auto_20231023_2008"), + ("crawls", "0002_upgrade_from_0_8_6"), + ("auth", "0012_alter_user_first_name_max_length"), + ] + + operations = [ + migrations.SeparateDatabaseAndState( + database_operations=[ + migrations.RunPython( + upgrade_core_tables, + reverse_code=migrations.RunPython.noop, + ), + ], + state_operations=[ + # NOTE: We do NOT remove extractor/output for ArchiveResult! + # They are still in the database and will be removed by migration 0025 + # after copying their data to plugin/output_str. + # However, for Snapshot, we DO remove added/updated and ADD the new timestamp fields + # because the SQL above already transformed them. 
+ migrations.RemoveField(model_name="snapshot", name="added"), + migrations.RemoveField(model_name="snapshot", name="updated"), + migrations.AddField( + model_name="snapshot", + name="bookmarked_at", + field=models.DateTimeField(db_index=True, default=django.utils.timezone.now), + ), + migrations.AddField( + model_name="snapshot", + name="created_at", + field=models.DateTimeField(db_index=True, default=django.utils.timezone.now), + ), + migrations.AddField( + model_name="snapshot", + name="modified_at", + field=models.DateTimeField(auto_now=True), + ), + migrations.AddField( + model_name="snapshot", + name="downloaded_at", + field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True), + ), + # Declare fs_version (already created in database with DEFAULT '0.8.0') + migrations.AddField( + model_name="snapshot", + name="fs_version", + field=models.CharField( + max_length=10, + default="0.8.0", + help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). 
def create_default_crawl_and_assign_snapshots(apps, schema_editor):
    """
    Create a default crawl for migrated snapshots and assign all snapshots without a crawl to it.
    Uses raw SQL because the app registry isn't fully populated during migrations.

    Steps, in order:
      1. Return early when no core_snapshot row has a NULL crawl_id.
      2. Collect the URLs of the orphaned snapshots (newline-joined) for the crawl's ``urls`` column.
      3. Insert an auth_user row with id=1 if none exists, since the crawl is created with created_by_id=1.
      4. Insert one 'sealed' crawls_crawl row, adapting to whichever optional columns exist.
      5. Point every snapshot with a NULL crawl_id at that crawl.

    Args:
        apps: Historical app registry passed by RunPython (not used here).
        schema_editor: Schema editor passed by RunPython (not used here).
    """
    # Imports are kept local to the function; the global django connection is used
    # rather than schema_editor.connection.
    from django.db import connection
    import uuid as uuid_lib
    from datetime import datetime

    cursor = connection.cursor()

    # Check if there are any snapshots without a crawl
    cursor.execute("SELECT COUNT(*) FROM core_snapshot WHERE crawl_id IS NULL")
    snapshots_without_crawl = cursor.fetchone()[0]

    if snapshots_without_crawl == 0:
        print("โœ“ Fresh install or all snapshots already have crawls")
        return

    # Gather the orphaned snapshot URLs in a stable order; empty/NULL urls are dropped
    # and the rest are stored newline-separated on the new crawl.
    cursor.execute("SELECT url FROM core_snapshot WHERE crawl_id IS NULL ORDER BY bookmarked_at, timestamp")
    crawl_urls = "\n".join(url for (url,) in cursor.fetchall() if url)

    # Get or create system user (pk=1)
    # The placeholder password '!' is not a valid password hash.
    # NOTE(review): presumably this makes the account unusable for login - confirm.
    cursor.execute("SELECT id FROM auth_user WHERE id = 1")
    if not cursor.fetchone():
        cursor.execute(
            """
            INSERT INTO auth_user (id, password, is_superuser, username, first_name, last_name, email, is_staff, is_active, date_joined)
            VALUES (1, '!', 1, 'system', '', '', '', 1, 1, ?)
        """,
            [datetime.now().isoformat()],
        )

    # Create a default crawl for migrated snapshots.
    # Depending on migration graph order, later crawls migrations may already
    # have removed output_dir by the time this data migration runs.
    crawl_id = uuid_lib.uuid4().hex  # 32-char hex, no dashes
    now = datetime.now().isoformat()
    cursor.execute("PRAGMA table_info(crawls_crawl)")
    crawl_columns = {row[1] for row in cursor.fetchall()}  # row[1] is the column name
    # Optional columns are only included when present in the live table. Their values
    # are SQL literals (not bound parameters) taken from the fixed lists below, so no
    # external input is interpolated into the statement.
    default_columns: list[str] = []
    default_values: list[str] = []
    if "output_dir" in crawl_columns:
        default_columns.append("output_dir")
        default_values.append("''")
    for column in ("max_size", "max_urls", "crawl_max_size", "snapshot_max_size"):
        if column in crawl_columns:
            default_columns.append(column)
            default_values.append("0")
    # Each fragment carries its own trailing ", " so that it can be spliced into the
    # middle of the column/value lists, and collapses to "" when nothing is optional.
    default_columns_sql = f"{', '.join(default_columns)}, " if default_columns else ""
    default_values_sql = f"{', '.join(default_values)}, " if default_values else ""

    # '{{}}' is an escaped literal '{}' (empty JSON object) inside the f-string.
    cursor.execute(
        f"""
        INSERT INTO crawls_crawl (
            id, created_at, modified_at, num_uses_succeeded, num_uses_failed,
            urls, max_depth, tags_str, label, notes, {default_columns_sql}
            status, retry_at, created_by_id, schedule_id, config, persona_id
        ) VALUES (?, ?, ?, 0, 0, ?, 0, '', 'Migrated from v0.7.2/v0.8.6',
                  'Auto-created crawl for migrated snapshots', {default_values_sql}
                  'sealed', ?, 1, NULL, '{{}}', NULL)
    """,
        [crawl_id, now, now, crawl_urls, now],
    )

    # Assign all snapshots without a crawl to the default crawl
    cursor.execute("UPDATE core_snapshot SET crawl_id = ? WHERE crawl_id IS NULL", [crawl_id])

    print(f"โœ“ Assigned {snapshots_without_crawl} snapshots to default crawl {crawl_id}")
def copy_old_fields_to_new(apps, schema_editor):
    """Backfill the renamed ArchiveResult columns from their legacy counterparts.

    Runs after the AddField operations of this migration. Each backfill is
    applied only when every column it touches is present on
    core_archiveresult, and none of them overwrites an existing non-empty
    value.
    """
    cursor = connection.cursor()

    # Discover which columns the live table actually has.
    cursor.execute("PRAGMA table_info(core_archiveresult)")
    present = {info[1] for info in cursor.fetchall()}

    # (columns that must all exist, statement to run), applied in this order:
    #   extractor -> plugin, output -> output_str, then the row metadata.
    # created_at/modified_at are created and preserved by migration 0023, so
    # the metadata statements only fill rows that are still NULL or empty.
    backfills = (
        (
            ("extractor", "plugin"),
            "UPDATE core_archiveresult SET plugin = COALESCE(extractor, '') WHERE plugin = '' OR plugin IS NULL",
        ),
        (
            ("output", "output_str"),
            "UPDATE core_archiveresult SET output_str = COALESCE(output, '') WHERE output_str = '' OR output_str IS NULL",
        ),
        (
            ("start_ts", "created_at"),
            "UPDATE core_archiveresult SET created_at = COALESCE(start_ts, CURRENT_TIMESTAMP) WHERE created_at IS NULL OR created_at = ''",
        ),
        (
            ("end_ts", "modified_at"),
            "UPDATE core_archiveresult SET modified_at = COALESCE(end_ts, start_ts, created_at, CURRENT_TIMESTAMP) WHERE modified_at IS NULL OR modified_at = ''",
        ),
    )
    for needed, statement in backfills:
        if present.issuperset(needed):
            cursor.execute(statement)

    # Snapshot timestamps (added -> bookmarked_at, updated -> modified_at) and
    # UUIDs were already handled by migration 0023; nothing to copy for them.
field=models.PositiveIntegerField(default=0), + ), + migrations.AddField( + model_name="archiveresult", + name="output_files", + field=models.JSONField(default=dict, help_text="Dict of {relative_path: {metadata}}"), + ), + migrations.AddField( + model_name="archiveresult", + name="output_json", + field=models.JSONField(blank=True, default=None, help_text="Structured metadata (headers, redirects, etc.)", null=True), + ), + migrations.AddField( + model_name="archiveresult", + name="output_mimetypes", + field=models.CharField(blank=True, default="", help_text="CSV of mimetypes sorted by size", max_length=512), + ), + migrations.AddField( + model_name="archiveresult", + name="output_size", + field=models.BigIntegerField(default=0, help_text="Total bytes of all output files"), + ), + migrations.AddField( + model_name="archiveresult", + name="output_str", + field=models.TextField(blank=True, default="", help_text="Human-readable output summary"), + ), + migrations.AddField( + model_name="archiveresult", + name="plugin", + field=models.CharField(db_index=True, default="", max_length=32), + ), + migrations.AddField( + model_name="archiveresult", + name="retry_at", + field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True), + ), + # NOTE: bookmarked_at, created_at, and downloaded_at already added by migration 0023 + # 0023 rebuilds core_snapshot with these columns already present so it + # can preserve legacy status/retry_at/config data in one tight SQLite + # table copy. Keep this migration state-only: running normal AddField + # operations here causes SQLite to rebuild the table from the pre-0025 + # state and repopulate existing columns with defaults, reopening sealed + # migrated rows as queued. 
+ migrations.SeparateDatabaseAndState( + database_operations=[], + state_operations=[ + migrations.AddField( + model_name="snapshot", + name="config", + field=models.JSONField(default=dict), + ), + migrations.AddField( + model_name="snapshot", + name="current_step", + field=models.PositiveSmallIntegerField( + db_index=True, + default=0, + help_text="Current hook step being executed (0-9). Used for sequential hook execution.", + ), + ), + migrations.AddField( + model_name="snapshot", + name="depth", + field=models.PositiveSmallIntegerField(db_index=True, default=0), + ), + migrations.AddField( + model_name="snapshot", + name="notes", + field=models.TextField(blank=True, default=""), + ), + migrations.AddField( + model_name="snapshot", + name="num_uses_failed", + field=models.PositiveIntegerField(default=0), + ), + migrations.AddField( + model_name="snapshot", + name="num_uses_succeeded", + field=models.PositiveIntegerField(default=0), + ), + migrations.AddField( + model_name="snapshot", + name="parent_snapshot", + field=models.ForeignKey( + blank=True, + help_text="Parent snapshot that discovered this URL (for recursive crawling)", + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="child_snapshots", + to="core.snapshot", + ), + ), + migrations.AddField( + model_name="snapshot", + name="retry_at", + field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True), + ), + migrations.AddField( + model_name="snapshot", + name="status", + field=models.CharField( + choices=[("queued", "Queued"), ("started", "Started"), ("sealed", "Sealed")], + db_index=True, + default="queued", + max_length=15, + ), + ), + ], + ), + migrations.AddField( + model_name="tag", + name="created_at", + field=models.DateTimeField(db_index=True, default=django.utils.timezone.now, null=True), + ), + migrations.AddField( + model_name="tag", + name="created_by", + field=models.ForeignKey( + 
default=archivebox.base_models.models.get_or_create_system_user_pk, + null=True, + on_delete=django.db.models.deletion.CASCADE, + related_name="tag_set", + to=settings.AUTH_USER_MODEL, + ), + ), + migrations.AddField( + model_name="tag", + name="modified_at", + field=models.DateTimeField(auto_now=True), + ), + # Copy data from old field names to new field names after AddField operations + migrations.RunPython( + copy_old_fields_to_new, + reverse_code=migrations.RunPython.noop, + ), + # Now remove the old ArchiveResult fields after data has been copied + migrations.RemoveField( + model_name="archiveresult", + name="extractor", + ), + migrations.RemoveField( + model_name="archiveresult", + name="output", + ), + # NOTE: Snapshot's added/updated were already removed by migration 0023 + migrations.AlterField( + model_name="archiveresult", + name="end_ts", + field=models.DateTimeField(blank=True, default=None, null=True), + ), + migrations.AlterField( + model_name="archiveresult", + name="id", + field=models.AutoField(editable=False, primary_key=True, serialize=False), + ), + migrations.AlterField( + model_name="archiveresult", + name="start_ts", + field=models.DateTimeField(blank=True, default=None, null=True), + ), + migrations.AlterField( + model_name="archiveresult", + name="status", + field=models.CharField( + choices=[ + ("queued", "Queued"), + ("started", "Started"), + ("backoff", "Waiting to retry"), + ("succeeded", "Succeeded"), + ("failed", "Failed"), + ("skipped", "Skipped"), + ], + db_index=True, + default="queued", + max_length=15, + ), + ), + migrations.AlterField( + model_name="archiveresult", + name="uuid", + field=models.UUIDField(blank=True, db_index=True, default=uuid7, null=True), + ), + migrations.AlterField( + model_name="snapshot", + name="crawl", + field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name="snapshot_set", to="crawls.crawl"), + ), + migrations.AlterField( + model_name="snapshot", + name="id", + 
field=models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True), + ), + migrations.AlterField( + model_name="snapshot", + name="tags", + field=models.ManyToManyField( + blank=True, + related_name="snapshot_set", + through="core.SnapshotTag", + through_fields=("snapshot", "tag"), + to="core.tag", + ), + ), + migrations.AlterField( + model_name="snapshot", + name="timestamp", + field=models.CharField(db_index=True, editable=False, max_length=32, unique=True), + ), + migrations.AlterField( + model_name="snapshot", + name="url", + field=models.URLField(db_index=True), + ), + migrations.AlterField( + model_name="tag", + name="slug", + field=models.SlugField(editable=False, max_length=100, unique=True), + ), + migrations.AddConstraint( + model_name="snapshot", + constraint=models.UniqueConstraint(fields=("url", "crawl"), name="unique_url_per_crawl"), + ), + migrations.AddConstraint( + model_name="snapshot", + constraint=models.UniqueConstraint(fields=("timestamp",), name="unique_timestamp"), + ), + ] diff --git a/archivebox/core/migrations/0026_add_process_to_archiveresult.py b/archivebox/core/migrations/0026_add_process_to_archiveresult.py new file mode 100644 index 0000000000..7381b98ee1 --- /dev/null +++ b/archivebox/core/migrations/0026_add_process_to_archiveresult.py @@ -0,0 +1,34 @@ +# Generated by Django 6.0 on 2026-01-01 23:28 + +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0025_alter_archiveresult_options_alter_snapshot_options_and_more"), + ("machine", "0007_add_process_type_and_parent"), + ] + + operations = [ + migrations.RemoveField( + model_name="archiveresult", + name="num_uses_failed", + ), + migrations.RemoveField( + model_name="archiveresult", + name="num_uses_succeeded", + ), + migrations.AddField( + model_name="archiveresult", + name="process", + field=models.OneToOneField( + blank=True, + help_text="Process 
def parse_cmd_field(cmd_raw):
    """
    Parse a legacy ArchiveResult ``cmd`` value into a list of argument strings.

    The stored value can be:
    1. JSON array string: '["wget", "-p", "url"]'
    2. JSON-encoded string: '"wget -p url"'
    3. Space-separated string: 'wget -p url'
    4. NULL/empty

    Args:
        cmd_raw: Raw column value (a string) or None.

    Returns:
        List of strings; empty when there is no command.
    """
    if not cmd_raw:
        return []

    cmd_raw = cmd_raw.strip()

    if not cmd_raw:
        return []

    # Only attempt JSON decoding when the value looks like a JSON array or a
    # JSON string, so ordinary commands never go through a failed parse.
    if cmd_raw.startswith(("[", '"')):
        try:
            parsed = json.loads(cmd_raw)
        except json.JSONDecodeError:
            parsed = None

        if isinstance(parsed, list):
            return [str(x) for x in parsed]
        if isinstance(parsed, str):
            # A JSON-encoded string: unwrap it first so the surrounding JSON
            # quotes do not end up glued to the first and last arguments.
            cmd_raw = parsed

    # Fallback: split by whitespace (simple approach, doesn't handle quoted strings)
    # This is acceptable since old cmd fields were mostly simple commands
    return cmd_raw.split()
SQLite + machine_id = uuid7().hex + now = datetime.now().isoformat() + + # Check which columns exist (schema differs between 0.8.x and 0.9.x) + cursor.execute("PRAGMA table_info(machine_machine)") + machine_cols = {row[1] for row in cursor.fetchall()} + + # Build INSERT statement based on available columns + if "config" in machine_cols: + # 0.9.x schema with config column + cursor.execute( + """ + INSERT INTO machine_machine ( + id, created_at, modified_at, guid, hostname, + hw_in_docker, hw_in_vm, hw_manufacturer, hw_product, hw_uuid, + os_arch, os_family, os_platform, os_release, os_kernel, + stats, config, num_uses_failed, num_uses_succeeded + ) VALUES (?, ?, ?, ?, ?, 0, 0, '', '', '', + '', '', '', '', '', '{}', '{}', 0, 0) + """, + [machine_id, now, now, guid, hostname], + ) + else: + # 0.8.x schema without config column + cursor.execute( + """ + INSERT INTO machine_machine ( + id, created_at, modified_at, guid, hostname, + hw_in_docker, hw_in_vm, hw_manufacturer, hw_product, hw_uuid, + os_arch, os_family, os_platform, os_release, os_kernel, + stats, num_uses_failed, num_uses_succeeded + ) VALUES (?, ?, ?, ?, ?, 0, 0, '', '', '', + '', '', '', '', '', '{}', 0, 0) + """, + [machine_id, now, now, guid, hostname], + ) + + return machine_id + + +def get_or_create_binary(cursor, machine_id, name, abspath, version): + """ + Get or create Binary record. + + Args: + cursor: DB cursor + machine_id: Machine FK + name: Binary name (basename of command) + abspath: Absolute path to binary (or just name if path unknown) + version: Version string + + Returns: + binary_id (str) + """ + from datetime import datetime + + # If abspath is just a name without slashes, it's not a full path + # Store it in both fields for simplicity + if "/" not in abspath: + # Not a full path - store as-is + pass + + # Check if binary exists with same machine, name, abspath, version + cursor.execute( + """ + SELECT id FROM machine_binary + WHERE machine_id = ? AND name = ? AND abspath = ? 
def map_status(old_status):
    """
    Translate a legacy ArchiveResult status into a Process (status, exit_code) pair.

    Mapping:
        queued, backoff -> ("queued", None)
        started         -> ("running", None)
        succeeded       -> ("exited", 0)
        failed          -> ("exited", 1)
        skipped         -> ("exited", None)
        anything else   -> ("queued", None)

    Args:
        old_status: One of: queued, started, backoff, succeeded, failed, skipped

    Returns:
        (process_status, exit_code) tuple
    """
    # Terminal states all become "exited"; only the exit code differs.
    # Skipped results exited without error, so they carry no exit code.
    exit_codes = {"succeeded": 0, "failed": 1, "skipped": None}
    if old_status in exit_codes:
        return ("exited", exit_codes[old_status])

    if old_status == "started":
        return ("running", None)

    # queued, backoff, and any unrecognized value are treated as not-yet-run.
    return ("queued", None)
+ + Returns: + process_id (str) + """ + from datetime import datetime + + # Django UUIDField stores UUIDs as 32-char hex (no dashes) in SQLite + process_id = uuid7().hex + now = datetime.now().isoformat() + + # Convert cmd array to JSON + cmd_json = json.dumps(cmd) + + # Set retry_at to now for queued processes, NULL otherwise + retry_at = now if status == "queued" else None + + cursor.execute("PRAGMA table_info(machine_process)") + process_cols = {row[1] for row in cursor.fetchall()} + values_by_col = { + "id": process_id, + "created_at": now, + "modified_at": now, + "machine_id": machine_id, + "parent_id": None, + "process_type": "hook", + "worker_type": "", + "pwd": pwd, + "cmd": cmd_json, + "env": "{}", + "timeout": 120, + "pid": None, + "exit_code": exit_code, + "stdout": "", + "stderr": "", + "started_at": started_at, + "ended_at": ended_at, + "binary_id": binary_id, + "iface_id": None, + "url": None, + "status": status, + "retry_at": retry_at, + "num_uses_failed": 0, + "num_uses_succeeded": 0, + } + insert_cols = [col for col in values_by_col if col in process_cols] + placeholders = ", ".join(["?"] * len(insert_cols)) + cursor.execute( + f""" + INSERT INTO machine_process ({", ".join(insert_cols)}) + VALUES ({placeholders}) + """, + [values_by_col[col] for col in insert_cols], + ) + + return process_id + + +def copy_archiveresult_data_to_process(apps, schema_editor): + """ + Copy old ArchiveResult execution data (cmd, pwd, cmd_version) to Process records. + + For each ArchiveResult without a process_id: + 1. Parse cmd field (handle both JSON array and space-separated string) + 2. Extract binary name/path from cmd[0] + 3. Get or create Binary record with machine, name, abspath, version + 4. Create Process record with mapped fields + 5. 
Link ArchiveResult.process_id to new Process + + Status mapping: + - queued โ†’ queued (exit_code=None) + - started โ†’ running (exit_code=None) + - backoff โ†’ queued (exit_code=None) + - succeeded โ†’ exited (exit_code=0) + - failed โ†’ exited (exit_code=1) + - skipped โ†’ exited (exit_code=None) + """ + cursor = connection.cursor() + + # Check if old fields still exist (skip if fresh install or already migrated) + cursor.execute("PRAGMA table_info(core_archiveresult)") + cols = {row[1] for row in cursor.fetchall()} + + if "cmd" not in cols or "pwd" not in cols or "cmd_version" not in cols: + print(" โœ“ ArchiveResult process metadata already migrated") + return + + # Check if process_id field exists (should exist from 0026) + if "process_id" not in cols: + print("โœ— ERROR: process_id field not found. Migration 0026 must run first.") + return + + # Get or create Machine.current() + machine_id = get_or_create_current_machine(cursor) + + # Get ArchiveResults without process_id that have cmd data + # Use plugin (extractor was renamed to plugin in migration 0025) + cursor.execute(""" + SELECT id, snapshot_id, plugin, cmd, pwd, cmd_version, + status, start_ts, end_ts, created_at + FROM core_archiveresult + WHERE process_id IS NULL + AND (cmd IS NOT NULL OR pwd IS NOT NULL) + """) + + results = cursor.fetchall() + + if not results: + print(" โœ“ No ArchiveResults need Process migration") + return + + print(f" - Migrating {len(results)} ArchiveResults to Process rows...") + + migrated_count = 0 + skipped_count = 0 + error_count = 0 + + for i, row in enumerate(results): + ar_id, snapshot_id, plugin, cmd_raw, pwd, cmd_version, status, start_ts, end_ts, created_at = row + + try: + # Parse cmd field + cmd_array = parse_cmd_field(cmd_raw) + + # Extract binary info from cmd[0] if available + binary_id = None + if cmd_array and cmd_array[0]: + binary_name = Path(cmd_array[0]).name or plugin # Fallback to plugin name + binary_abspath = cmd_array[0] + binary_version = 
cmd_version or "" + + # Get or create Binary record + binary_id = get_or_create_binary( + cursor, + machine_id, + binary_name, + binary_abspath, + binary_version, + ) + + # Map status + process_status, exit_code = map_status(status) + + # Set timestamps + started_at = start_ts or created_at + ended_at = end_ts if process_status == "exited" else None + + # Create Process record + process_id = create_process( + cursor=cursor, + machine_id=machine_id, + pwd=pwd or "", + cmd=cmd_array, + status=process_status, + exit_code=exit_code, + started_at=started_at, + ended_at=ended_at, + binary_id=binary_id, + ) + + # Link ArchiveResult to Process + cursor.execute( + "UPDATE core_archiveresult SET process_id = ? WHERE id = ?", + [process_id, ar_id], + ) + + migrated_count += 1 + if migrated_count % PROGRESS_EVERY == 0: + print(f" migrated {migrated_count}/{len(results)} ArchiveResults...") + + except Exception as e: + print(f"โœ— Error migrating ArchiveResult {ar_id}: {e}") + import traceback + + traceback.print_exc() + error_count += 1 + continue + + print(f" โœ“ Process migration complete: {migrated_count} migrated, {skipped_count} skipped, {error_count} errors") + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0026_add_process_to_archiveresult"), + ("machine", "0007_add_process_type_and_parent"), + ] + + operations = [ + # First, copy data from old fields to Process + migrations.RunPython( + copy_archiveresult_data_to_process, + reverse_code=migrations.RunPython.noop, + ), + # Now safe to remove old fields (moved from 0025) + migrations.RemoveField( + model_name="archiveresult", + name="cmd", + ), + migrations.RemoveField( + model_name="archiveresult", + name="cmd_version", + ), + migrations.RemoveField( + model_name="archiveresult", + name="pwd", + ), + ] diff --git a/archivebox/core/migrations/0028_alter_snapshot_fs_version.py b/archivebox/core/migrations/0028_alter_snapshot_fs_version.py new file mode 100644 index 0000000000..1459f4ef84 --- 
/dev/null +++ b/archivebox/core/migrations/0028_alter_snapshot_fs_version.py @@ -0,0 +1,21 @@ +# Generated by Django 6.0 on 2026-01-02 08:43 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0027_copy_archiveresult_to_process"), + ] + + operations = [ + migrations.AlterField( + model_name="snapshot", + name="fs_version", + field=models.CharField( + default="0.9.0", + help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().', + max_length=10, + ), + ), + ] diff --git a/archivebox/core/migrations/0029_migrate_archiveresult_to_uuid_pk.py b/archivebox/core/migrations/0029_migrate_archiveresult_to_uuid_pk.py new file mode 100644 index 0000000000..64045aad67 --- /dev/null +++ b/archivebox/core/migrations/0029_migrate_archiveresult_to_uuid_pk.py @@ -0,0 +1,220 @@ +# Generated by hand on 2026-01-02 +# Migrate ArchiveResult from integer PK to UUID PK (matching Snapshot) + +from django.db import migrations, models, connection +from uuid import UUID +from archivebox.uuid_compat import uuid7 + + +PROGRESS_EVERY = 10000 + + +def migrate_archiveresult_id_to_uuid(apps, schema_editor): + """ + Migrate ArchiveResult from integer PK to UUID PK (clean one-step migration). + + Handles both migration paths: + - 0.7.x: ArchiveResult has integer id, NO uuid field โ†’ generate new UUIDs + - 0.8.x: ArchiveResult has integer id + optional uuid field โ†’ reuse existing UUIDs + + Strategy: + 1. Create new table with UUID as primary key (no temporary columns) + 2. Generate UUIDs for records missing them (0.7.x) or reuse existing (0.8.x) + 3. Copy all data with UUID as new id + 4. Drop old table, rename new table + 5. 
Recreate indexes + + Result: Clean schema with ONLY id as UUIDField (no old_id, no uuid) + """ + cursor = connection.cursor() + + # Check if table exists and has data + cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='core_archiveresult'") + if not cursor.fetchone(): + print(" โœ“ ArchiveResult table does not exist, skipping UUID PK migration") + return + + cursor.execute("SELECT COUNT(*) FROM core_archiveresult") + row_count = cursor.fetchone()[0] + + # Don't skip if table is empty - we still need to recreate to remove uuid column + # (fresh installs create table with uuid from 0025, but model expects no uuid after 0029) + + if row_count == 0: + print(" - Rebuilding empty ArchiveResult table with UUID primary keys...") + else: + print(f" - Migrating {row_count} ArchiveResults from integer IDs to UUID primary keys...") + + # Step 0: Check if machine_process table exists, if not NULL out process_id values + cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='machine_process'") + machine_process_exists = cursor.fetchone() is not None + + if not machine_process_exists: + print(" machine_process is unavailable; clearing process_id references...") + cursor.execute("UPDATE core_archiveresult SET process_id = NULL WHERE process_id IS NOT NULL") + + # Step 1: Create new table with UUID as primary key (clean - no old_id or uuid columns) + cursor.execute(""" + CREATE TABLE core_archiveresult_new ( + id TEXT PRIMARY KEY NOT NULL, + created_at DATETIME NOT NULL, + modified_at DATETIME NOT NULL, + + snapshot_id TEXT NOT NULL, + plugin VARCHAR(32) NOT NULL, + hook_name VARCHAR(255) NOT NULL DEFAULT '', + + status VARCHAR(15) NOT NULL DEFAULT 'queued', + retry_at DATETIME, + + start_ts DATETIME, + end_ts DATETIME, + + output_str TEXT NOT NULL DEFAULT '', + output_json TEXT, + output_files TEXT NOT NULL DEFAULT '{}', + output_size BIGINT NOT NULL DEFAULT 0, + output_mimetypes VARCHAR(512) NOT NULL DEFAULT '', + + config TEXT NOT 
NULL DEFAULT '{}', + notes TEXT NOT NULL DEFAULT '', + num_uses_succeeded INTEGER NOT NULL DEFAULT 0, + num_uses_failed INTEGER NOT NULL DEFAULT 0, + + process_id TEXT, + + FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE, + FOREIGN KEY (process_id) REFERENCES machine_process(id) ON DELETE SET NULL + ); + """) + + # Step 2: Generate UUIDs for records that don't have them + # Check if uuid column exists (0.8.x has it, 0.7.x doesn't) + cursor.execute("PRAGMA table_info(core_archiveresult)") + columns = cursor.fetchall() + col_names = [col[1] for col in columns] + has_uuid_column = "uuid" in col_names + + if has_uuid_column: + cursor.execute("SELECT id, uuid FROM core_archiveresult") + records = cursor.fetchall() + id_to_uuid = {} + for old_id, existing_uuid in records: + if existing_uuid: + # Normalize existing UUID to 32-char hex format (Django SQLite UUIDField format) + # (existing UUIDs might be stored with or without dashes in old schema) + id_to_uuid[old_id] = UUID(existing_uuid).hex + else: + # Generate new UUIDv7 (time-ordered) as 32-char hex + id_to_uuid[old_id] = uuid7().hex + else: + # 0.7.x path: no uuid column, generate new UUIDs for all records + cursor.execute("SELECT id FROM core_archiveresult") + records = cursor.fetchall() + id_to_uuid = {old_id: uuid7().hex for (old_id,) in records} + + # Step 3: Copy data with UUIDs as new primary key + cursor.execute("SELECT * FROM core_archiveresult") + old_records = cursor.fetchall() + + # col_names already fetched in Step 2 + inserted_count = 0 + for i, record in enumerate(old_records): + old_id = record[col_names.index("id")] + new_uuid = id_to_uuid[old_id] + + # Build insert with new structure + values = {col_names[i]: record[i] for i in range(len(col_names))} + + # List of fields to copy (all fields from new schema except id, old_id, uuid) + fields_to_copy = [ + "created_at", + "modified_at", + "snapshot_id", + "plugin", + "hook_name", + "status", + "retry_at", + "start_ts", + 
"end_ts", + "output_str", + "output_json", + "output_files", + "output_size", + "output_mimetypes", + "config", + "notes", + "num_uses_succeeded", + "num_uses_failed", + "process_id", + ] + + # Build INSERT statement (only copy fields that exist in source) + existing_fields = [f for f in fields_to_copy if f in values] + + placeholders = ", ".join(["?"] * (len(existing_fields) + 1)) # +1 for id + field_list = "id, " + ", ".join(existing_fields) + + insert_values = [new_uuid] + [values.get(f) for f in existing_fields] + + try: + cursor.execute( + f"INSERT INTO core_archiveresult_new ({field_list}) VALUES ({placeholders})", + insert_values, + ) + inserted_count += 1 + if inserted_count % PROGRESS_EVERY == 0: + print(f" copied {inserted_count}/{len(old_records)} ArchiveResults...") + except Exception as e: + print(f"[0029] ERROR inserting record {old_id}: {e}") + if i == 0: + print(f"[0029] First record values: {insert_values[:5]}...") + raise + + if old_records: + print(f" copied {inserted_count}/{len(old_records)} ArchiveResults") + + # Step 4: Replace old table with new table + cursor.execute("DROP TABLE core_archiveresult") + cursor.execute("ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult") + + # Step 5: Create indexes + cursor.execute("CREATE INDEX core_archiveresult_snapshot_id_idx ON core_archiveresult(snapshot_id)") + cursor.execute("CREATE INDEX core_archiveresult_plugin_idx ON core_archiveresult(plugin)") + cursor.execute("CREATE INDEX core_archiveresult_status_idx ON core_archiveresult(status)") + cursor.execute("CREATE INDEX core_archiveresult_retry_at_idx ON core_archiveresult(retry_at)") + cursor.execute("CREATE INDEX core_archiveresult_created_at_idx ON core_archiveresult(created_at)") + cursor.execute("CREATE INDEX core_archiveresult_hook_name_idx ON core_archiveresult(hook_name)") + cursor.execute("CREATE INDEX core_archiveresult_process_id_idx ON core_archiveresult(process_id)") + + print(f" โœ“ ArchiveResult UUID primary key migration 
complete ({row_count} records)") + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0028_alter_snapshot_fs_version"), + ] + + operations = [ + migrations.SeparateDatabaseAndState( + database_operations=[ + migrations.RunPython( + migrate_archiveresult_id_to_uuid, + reverse_code=migrations.RunPython.noop, + ), + ], + state_operations=[ + # Remove uuid field (was added in 0025, we're merging it into id) + migrations.RemoveField( + model_name="archiveresult", + name="uuid", + ), + # Change id from AutoField to UUIDField (absorbing the uuid field) + migrations.AlterField( + model_name="archiveresult", + name="id", + field=models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True), + ), + ], + ), + ] diff --git a/archivebox/core/migrations/0030_alter_archiveresult_id.py b/archivebox/core/migrations/0030_alter_archiveresult_id.py new file mode 100644 index 0000000000..398cca9824 --- /dev/null +++ b/archivebox/core/migrations/0030_alter_archiveresult_id.py @@ -0,0 +1,19 @@ +# Generated by Django 6.0 on 2026-01-02 10:02 + +from django.db import migrations, models + +from archivebox.uuid_compat import uuid7 + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0029_migrate_archiveresult_to_uuid_pk"), + ] + + operations = [ + migrations.AlterField( + model_name="archiveresult", + name="id", + field=models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True), + ), + ] diff --git a/archivebox/core/migrations/0031_add_archiveresult_snapshot_status_index.py b/archivebox/core/migrations/0031_add_archiveresult_snapshot_status_index.py new file mode 100644 index 0000000000..4d31b51866 --- /dev/null +++ b/archivebox/core/migrations/0031_add_archiveresult_snapshot_status_index.py @@ -0,0 +1,16 @@ +# Generated by Codex on 2026-01-21 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0030_alter_archiveresult_id"), + ] + 
+ operations = [ + migrations.AddIndex( + model_name="archiveresult", + index=models.Index(fields=["snapshot", "status"], name="archiveresult_snap_status_idx"), + ), + ] diff --git a/archivebox/core/migrations/0032_remove_archiveresult_retry_at.py b/archivebox/core/migrations/0032_remove_archiveresult_retry_at.py new file mode 100644 index 0000000000..7883195089 --- /dev/null +++ b/archivebox/core/migrations/0032_remove_archiveresult_retry_at.py @@ -0,0 +1,14 @@ +from django.db import migrations + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0031_add_archiveresult_snapshot_status_index"), + ] + + operations = [ + migrations.RemoveField( + model_name="archiveresult", + name="retry_at", + ), + ] diff --git a/archivebox/core/migrations/0033_alter_archiveresult_status.py b/archivebox/core/migrations/0033_alter_archiveresult_status.py new file mode 100644 index 0000000000..8f2315cd25 --- /dev/null +++ b/archivebox/core/migrations/0033_alter_archiveresult_status.py @@ -0,0 +1,28 @@ +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0032_remove_archiveresult_retry_at"), + ] + + operations = [ + migrations.AlterField( + model_name="archiveresult", + name="status", + field=models.CharField( + choices=[ + ("queued", "Queued"), + ("started", "Started"), + ("backoff", "Waiting to retry"), + ("succeeded", "Succeeded"), + ("failed", "Failed"), + ("skipped", "Skipped"), + ("noresults", "No Results"), + ], + db_index=True, + default="queued", + max_length=16, + ), + ), + ] diff --git a/archivebox/core/migrations/0034_remove_tag_slug.py b/archivebox/core/migrations/0034_remove_tag_slug.py new file mode 100644 index 0000000000..ed1581a1ca --- /dev/null +++ b/archivebox/core/migrations/0034_remove_tag_slug.py @@ -0,0 +1,14 @@ +from django.db import migrations + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0033_alter_archiveresult_status"), + ] + + operations = [ + 
migrations.RemoveField( + model_name="tag", + name="slug", + ), + ] diff --git a/archivebox/core/migrations/0035_archiveresult_archiveresult_start_idx.py b/archivebox/core/migrations/0035_archiveresult_archiveresult_start_idx.py new file mode 100644 index 0000000000..40082c3491 --- /dev/null +++ b/archivebox/core/migrations/0035_archiveresult_archiveresult_start_idx.py @@ -0,0 +1,17 @@ +# Generated by Django 6.0.5 on 2026-05-24 10:26 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0034_remove_tag_slug"), + ("machine", "0013_alter_machine_config"), + ] + + operations = [ + migrations.AddIndex( + model_name="archiveresult", + index=models.Index(fields=["-start_ts", "-id"], name="archiveresult_start_idx"), + ), + ] diff --git a/archivebox/core/migrations/0036_snapshot_snapshot_public_order_idx.py b/archivebox/core/migrations/0036_snapshot_snapshot_public_order_idx.py new file mode 100644 index 0000000000..0c0be44a71 --- /dev/null +++ b/archivebox/core/migrations/0036_snapshot_snapshot_public_order_idx.py @@ -0,0 +1,17 @@ +# Generated by Django 6.0.5 on 2026-05-24 10:28 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0035_archiveresult_archiveresult_start_idx"), + ("crawls", "0007_remove_crawl_crawl_admin_order_idx_and_more"), + ] + + operations = [ + migrations.AddIndex( + model_name="snapshot", + index=models.Index(fields=["-bookmarked_at", "-created_at"], name="snapshot_public_order_idx"), + ), + ] diff --git a/archivebox/core/migrations/0037_alter_snapshot_url_max_length.py b/archivebox/core/migrations/0037_alter_snapshot_url_max_length.py new file mode 100644 index 0000000000..433bb40bdc --- /dev/null +++ b/archivebox/core/migrations/0037_alter_snapshot_url_max_length.py @@ -0,0 +1,15 @@ +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", 
"0036_snapshot_snapshot_public_order_idx"), + ] + + operations = [ + migrations.AlterField( + model_name="snapshot", + name="url", + field=models.CharField(db_index=True, max_length=65535), + ), + ] diff --git a/archivebox/core/migrations/0038_snapshot_progress_idx.py b/archivebox/core/migrations/0038_snapshot_progress_idx.py new file mode 100644 index 0000000000..8f35d45aa2 --- /dev/null +++ b/archivebox/core/migrations/0038_snapshot_progress_idx.py @@ -0,0 +1,16 @@ +# Generated by ArchiveBox on 2026-05-27 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0037_alter_snapshot_url_max_length"), + ] + + operations = [ + migrations.AddIndex( + model_name="snapshot", + index=models.Index(fields=["crawl", "status", "modified_at"], name="snapshot_progress_idx"), + ), + ] diff --git a/archivebox/core/migrations/0039_alter_archiveresult_process.py b/archivebox/core/migrations/0039_alter_archiveresult_process.py new file mode 100644 index 0000000000..7277b6d308 --- /dev/null +++ b/archivebox/core/migrations/0039_alter_archiveresult_process.py @@ -0,0 +1,26 @@ +# Generated by Django 6.0.5 on 2026-05-27 20:11 + +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0038_snapshot_progress_idx"), + ("machine", "0015_process_progress_indexes"), + ] + + operations = [ + migrations.AlterField( + model_name="archiveresult", + name="process", + field=models.OneToOneField( + blank=True, + help_text="Process execution details for this archive result", + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="archiveresult", + to="machine.process", + ), + ), + ] diff --git a/archivebox/core/migrations/0040_archiveresult_delete_at_snapshot_delete_at.py b/archivebox/core/migrations/0040_archiveresult_delete_at_snapshot_delete_at.py new file mode 100644 index 0000000000..67ee565c6f --- /dev/null +++ 
b/archivebox/core/migrations/0040_archiveresult_delete_at_snapshot_delete_at.py @@ -0,0 +1,22 @@ +# Generated by Django 6.0.5 on 2026-05-27 20:40 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0039_alter_archiveresult_process"), + ] + + operations = [ + migrations.AddField( + model_name="archiveresult", + name="delete_at", + field=models.DateTimeField(blank=True, db_index=True, default=None, null=True), + ), + migrations.AddField( + model_name="snapshot", + name="delete_at", + field=models.DateTimeField(blank=True, db_index=True, default=None, null=True), + ), + ] diff --git a/archivebox/core/migrations/0041_snapshot_permissions.py b/archivebox/core/migrations/0041_snapshot_permissions.py new file mode 100644 index 0000000000..92b074330b --- /dev/null +++ b/archivebox/core/migrations/0041_snapshot_permissions.py @@ -0,0 +1,23 @@ +# Generated by Django 6.0.5 on 2026-05-28 07:25 + +import django.db.models.fields.json +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0040_archiveresult_delete_at_snapshot_delete_at"), + ] + + operations = [ + migrations.AddField( + model_name="snapshot", + name="permissions", + field=models.GeneratedField( + db_index=True, + db_persist=True, + expression=django.db.models.fields.json.KeyTextTransform("PERMISSIONS", "config"), + output_field=models.CharField(max_length=16, null=True), + ), + ), + ] diff --git a/archivebox/core/migrations/0042_snapshot_output_size.py b/archivebox/core/migrations/0042_snapshot_output_size.py new file mode 100644 index 0000000000..10b716718b --- /dev/null +++ b/archivebox/core/migrations/0042_snapshot_output_size.py @@ -0,0 +1,43 @@ +# Generated by Django 6.0.5 on 2026-05-28 08:22 + +from django.db import migrations, models +from django.db.models import Sum + + +def backfill_snapshot_output_size(apps, schema_editor): + Snapshot = apps.get_model("core", "Snapshot") + 
ArchiveResult = apps.get_model("core", "ArchiveResult") + batch = [] + rows = ArchiveResult.objects.values("snapshot_id").annotate(total_size=Sum("output_size")).order_by() + for row in rows.iterator(chunk_size=2000): + batch.append(Snapshot(id=row["snapshot_id"], output_size=row["total_size"] or 0)) + if len(batch) >= 2000: + Snapshot.objects.bulk_update(batch, ["output_size"], batch_size=2000) + batch = [] + if batch: + Snapshot.objects.bulk_update(batch, ["output_size"], batch_size=2000) + + +def clear_snapshot_output_size(apps, schema_editor): + Snapshot = apps.get_model("core", "Snapshot") + Snapshot.objects.update(output_size=0) + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0041_snapshot_permissions"), + ] + + operations = [ + migrations.AddField( + model_name="snapshot", + name="output_size", + field=models.BigIntegerField( + db_index=True, + default=0, + editable=False, + help_text="Total bytes of all ArchiveResult output files", + ), + ), + migrations.RunPython(backfill_snapshot_output_size, clear_snapshot_output_size), + ] diff --git a/archivebox/core/migrations/0043_archiveresult_retry_at.py b/archivebox/core/migrations/0043_archiveresult_retry_at.py new file mode 100644 index 0000000000..1c36c42549 --- /dev/null +++ b/archivebox/core/migrations/0043_archiveresult_retry_at.py @@ -0,0 +1,17 @@ +# Generated by Django 6.0.5 on 2026-05-28 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0042_snapshot_output_size"), + ] + + operations = [ + migrations.AddField( + model_name="archiveresult", + name="retry_at", + field=models.DateTimeField(blank=True, db_index=True, default=None, null=True), + ), + ] diff --git a/archivebox/core/migrations/0044_alter_archiveresult_status_alter_snapshot_status.py b/archivebox/core/migrations/0044_alter_archiveresult_status_alter_snapshot_status.py new file mode 100644 index 0000000000..4ca5981288 --- /dev/null +++ 
b/archivebox/core/migrations/0044_alter_archiveresult_status_alter_snapshot_status.py @@ -0,0 +1,41 @@ +# Generated by Django 6.0.5 on 2026-05-28 12:04 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0043_archiveresult_retry_at"), + ] + + operations = [ + migrations.AlterField( + model_name="archiveresult", + name="status", + field=models.CharField( + choices=[ + ("queued", "Queued"), + ("started", "Started"), + ("paused", "Paused"), + ("backoff", "Waiting to retry"), + ("succeeded", "Succeeded"), + ("failed", "Failed"), + ("skipped", "Skipped"), + ("noresults", "No Results"), + ], + db_index=True, + default="queued", + max_length=16, + ), + ), + migrations.AlterField( + model_name="snapshot", + name="status", + field=models.CharField( + choices=[("queued", "Queued"), ("started", "Started"), ("paused", "Paused"), ("sealed", "Sealed")], + db_index=True, + default="queued", + max_length=15, + ), + ), + ] diff --git a/archivebox/core/migrations/0045_archiveresult_unique_hook.py b/archivebox/core/migrations/0045_archiveresult_unique_hook.py new file mode 100644 index 0000000000..fb8ff322e6 --- /dev/null +++ b/archivebox/core/migrations/0045_archiveresult_unique_hook.py @@ -0,0 +1,45 @@ +from django.db import migrations, models + + +def deduplicate_archiveresults_per_hook(apps, schema_editor): + """Drop duplicate ArchiveResult rows per (snapshot, plugin, hook_name). + + Real long-lived collections (cabbage's demo, beta-tester DBs) accumulated + multiple rows per hook over the dev rc chain. The next operation adds a + UniqueConstraint on that tuple; without this cleanup pass the constraint + fails with ``UNIQUE constraint failed`` mid-migration and bricks startup. + Keep the row with the highest id (most recent) for each tuple. 
+ """ + ArchiveResult = apps.get_model("core", "ArchiveResult") + duplicate_groups = ( + ArchiveResult.objects.values("snapshot_id", "plugin", "hook_name").annotate(count=models.Count("id")).filter(count__gt=1) + ) + for group in duplicate_groups.iterator(chunk_size=200): + lookup = { + "snapshot_id": group["snapshot_id"], + "plugin": group["plugin"], + "hook_name": group["hook_name"], + } + keep = ArchiveResult.objects.filter(**lookup).order_by("-id").values_list("id", flat=True).first() + if keep is not None: + ArchiveResult.objects.filter(**lookup).exclude(id=keep).delete() + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0044_alter_archiveresult_status_alter_snapshot_status"), + ] + + operations = [ + migrations.RunPython( + deduplicate_archiveresults_per_hook, + reverse_code=migrations.RunPython.noop, + ), + migrations.AddConstraint( + model_name="archiveresult", + constraint=models.UniqueConstraint( + fields=("snapshot", "plugin", "hook_name"), + name="unique_archiveresult_per_snapshot_hook", + ), + ), + ] diff --git a/archivebox/core/migrations/0046_repair_snapshot_permissions.py b/archivebox/core/migrations/0046_repair_snapshot_permissions.py new file mode 100644 index 0000000000..53703c90f2 --- /dev/null +++ b/archivebox/core/migrations/0046_repair_snapshot_permissions.py @@ -0,0 +1,43 @@ +from django.db import migrations + + +def _repair_snapshot_permissions(apps, schema_editor): + """Backfill ``core_snapshot.permissions`` if a prior squash-skewed + ``django_migrations`` row claimed ``0041_snapshot_permissions`` was + applied but the column never actually landed on the table. + + Beta-tester / cabbage-style DBs upgraded incrementally through the + 0.8.x โ†’ 0.9.x rc chain have a 0041_snapshot_permissions entry with a + different historical effect (the name was reused across squashes), + so the runtime model's ``snapshot.permissions`` ``GeneratedField`` has + no underlying column. 
Fresh installs added the column via 0041 and + this is a no-op. + """ + cursor = schema_editor.connection.cursor() + # ``table_xinfo`` lists STORED/VIRTUAL generated columns; ``table_info`` + # silently drops them, so a prior 0041 that landed the STORED column + # would otherwise look absent here and we'd ALTER TABLE -> duplicate. + cursor.execute("PRAGMA table_xinfo(core_snapshot)") + existing_cols = {row[1] for row in cursor.fetchall()} + if "permissions" in existing_cols: + return + # See crawls/0016 โ€” SQLite ALTER TABLE only allows VIRTUAL generated + # columns; STORED would error with "cannot add a STORED column". The + # runtime model's ``db_persist=True`` is only honored for fresh installs + # where 0041 added the column during initial table creation. + cursor.execute( + "ALTER TABLE core_snapshot ADD COLUMN permissions varchar(16) GENERATED ALWAYS AS (json_extract(config, '$.PERMISSIONS')) VIRTUAL", + ) + cursor.execute( + "CREATE INDEX IF NOT EXISTS core_snapshot_permissions_idx ON core_snapshot (permissions)", + ) + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0045_archiveresult_unique_hook"), + ] + + operations = [ + migrations.RunPython(_repair_snapshot_permissions, migrations.RunPython.noop), + ] diff --git a/archivebox/core/migrations/0047_archiveresult_status_snapshot_index.py b/archivebox/core/migrations/0047_archiveresult_status_snapshot_index.py new file mode 100644 index 0000000000..2c31aaeb83 --- /dev/null +++ b/archivebox/core/migrations/0047_archiveresult_status_snapshot_index.py @@ -0,0 +1,28 @@ +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0046_repair_snapshot_permissions"), + ] + + operations = [ + migrations.SeparateDatabaseAndState( + database_operations=[ + migrations.RunSQL( + sql="CREATE INDEX IF NOT EXISTS archiveresult_status_snap_idx ON core_archiveresult (status, snapshot_id)", + reverse_sql="DROP INDEX IF EXISTS 
archiveresult_status_snap_idx", + ), + ], + state_operations=[ + migrations.AddIndex( + model_name="archiveresult", + index=models.Index(fields=["status", "snapshot"], name="archiveresult_status_snap_idx"), + ), + ], + ), + migrations.RunSQL( + sql="ANALYZE core_archiveresult", + reverse_sql=migrations.RunSQL.noop, + ), + ] diff --git a/archivebox/core/migrations/0048_remove_archiveresult_config.py b/archivebox/core/migrations/0048_remove_archiveresult_config.py new file mode 100644 index 0000000000..fe95e94926 --- /dev/null +++ b/archivebox/core/migrations/0048_remove_archiveresult_config.py @@ -0,0 +1,14 @@ +from django.db import migrations + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0047_archiveresult_status_snapshot_index"), + ] + + operations = [ + migrations.RemoveField( + model_name="archiveresult", + name="config", + ), + ] diff --git a/archivebox/core/migrations/0049_alter_snapshot_url.py b/archivebox/core/migrations/0049_alter_snapshot_url.py new file mode 100644 index 0000000000..18a30c37be --- /dev/null +++ b/archivebox/core/migrations/0049_alter_snapshot_url.py @@ -0,0 +1,17 @@ +# Generated by Django 6.0.6 on 2026-06-04 22:16 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0048_remove_archiveresult_config"), + ] + + operations = [ + migrations.AlterField( + model_name="snapshot", + name="url", + field=models.TextField(db_index=True), + ), + ] diff --git a/archivebox/core/migrations/0050_snapshot_permissions_not_null.py b/archivebox/core/migrations/0050_snapshot_permissions_not_null.py new file mode 100644 index 0000000000..78f065df7c --- /dev/null +++ b/archivebox/core/migrations/0050_snapshot_permissions_not_null.py @@ -0,0 +1,69 @@ +import django.db.models.fields.json +import django.db.models.functions.comparison +from django.db import migrations, models +from django.db.models import Q + + +VALID_PERMISSIONS = {"public", "unlisted", "private"} 
# Number of Snapshot rows read and written per page during hydration.
BATCH_SIZE = 1000


def normalize_permissions(value, default="private"):
    """Return ``value`` stripped and lower-cased if it is a known permission, else ``default``.

    ``None`` and other falsy inputs are treated as the empty string and
    therefore resolve to ``default``.
    """
    value = str(value or "").strip().lower()
    return value if value in VALID_PERMISSIONS else default


def hydrate_snapshot_permissions(apps, schema_editor):
    """Write an explicit ``config["PERMISSIONS"]`` onto every Snapshot that lacks a valid one.

    The value is taken from the parent Crawl's ``permissions`` (normalized,
    falling back to ``"private"``).

    Rows are processed in materialized primary-key pages rather than through a
    single streaming ``.iterator()`` cursor.  The filter is on ``permissions``,
    which is derived from ``config``; updating ``config`` while a cursor is
    still reading the same table can make SQLite skip or repeat rows.  Each
    page is fully fetched before it is written, and the ``pk > last_pk``
    cursor guarantees forward progress.
    """
    Snapshot = apps.get_model("core", "Snapshot")
    db_alias = schema_editor.connection.alias
    missing_permissions = Q(permissions__isnull=True) | (Q(permissions__isnull=False) & ~Q(permissions__in=VALID_PERMISSIONS))
    snapshots = Snapshot.objects.using(db_alias)

    last_pk = None
    while True:
        page = snapshots.filter(missing_permissions).order_by("pk")
        if last_pk is not None:
            page = page.filter(pk__gt=last_pk)
        # list(...) closes the read cursor before any write is issued.
        batch = list(page.select_related("crawl").only("id", "config", "crawl__permissions")[:BATCH_SIZE])
        if not batch:
            break

        for snapshot in batch:
            config = dict(snapshot.config or {})
            config["PERMISSIONS"] = normalize_permissions(snapshot.crawl.permissions)
            snapshot.config = config

        snapshots.bulk_update(batch, ["config"], batch_size=BATCH_SIZE)
        last_pk = batch[-1].pk


class Migration(migrations.Migration):
    dependencies = [
        ("crawls", "0016_hydrate_crawl_permissions"),
        ("core", "0049_alter_snapshot_url"),
    ]

    operations = [
        # 1. Backfill config["PERMISSIONS"] so the regenerated column has data to read.
        migrations.RunPython(hydrate_snapshot_permissions, migrations.RunPython.noop),
        # 2. Drop and re-add the generated column with a "private" fallback expression.
        migrations.RemoveField(
            model_name="snapshot",
            name="permissions",
        ),
        migrations.AddField(
            model_name="snapshot",
            name="permissions",
            field=models.GeneratedField(
                db_index=True,
                db_persist=True,
                editable=False,
                expression=django.db.models.functions.comparison.Coalesce(
                    django.db.models.fields.json.KeyTextTransform("PERMISSIONS", "config"),
                    models.Value("private"),
                    output_field=models.CharField(max_length=16),
                ),
                output_field=models.CharField(max_length=16),
            ),
        ),
    ]
0000000000..e69de29bb2 diff --git a/archivebox/core/models.py b/archivebox/core/models.py new file mode 100755 index 0000000000..42d97603f0 --- /dev/null +++ b/archivebox/core/models.py @@ -0,0 +1,4792 @@ +__package__ = "archivebox.core" + +from typing import TYPE_CHECKING, Optional, Any +from collections.abc import Iterable, Mapping, Sequence +import uuid +from archivebox.uuid_compat import CompactUUIDField, uuid7 +from datetime import datetime, timedelta + +import os +import json +from pathlib import Path +from urllib.parse import urlparse + +from statemachine import State, registry + +from django.db import models, transaction +from django.db.models import Case, F, Q, QuerySet, Sum, Value, When +from django.db.models.functions import Coalesce, Concat +from django.db.models.fields.json import KT +from django.utils.functional import cached_property +from django.utils.text import slugify +from django.utils import timezone +from django.core.cache import cache +from django.urls import reverse_lazy +from django.contrib import admin +from django.conf import settings +from django.core.exceptions import ObjectDoesNotExist, ValidationError +from django.utils.safestring import mark_safe + +from archivebox.config import CONSTANTS +from archivebox.config.common import get_config, rprint +from archivebox.misc.system import atomic_write +from archivebox.misc.util import ( + parse_date, + domain as url_domain, + to_json, + ts_to_date_str, + urlencode, + htmlencode, + urldecode, + validate_url, +) +from archivebox.plugins.discovery import ( + get_plugins, + get_plugin_name, + get_plugin_icon, +) +from archivebox.base_models.models import ( + ModelWithUUID, + ModelWithDeleteAfter, + ModelWithOutputDir, + ModelWithConfig, + ModelWithNotes, + ModelWithHealthStats, + get_or_create_system_user_pk, +) +from archivebox.workers.models import ACTIVE_STATE_LEASE_SECONDS, RETRY_AT_MAX, ModelWithStateMachine, BaseStateMachine +from archivebox.crawls.models import Crawl +from 
class UngroupedSubquery(models.Subquery):
    """Subquery that is deliberately kept out of the enclosing query's GROUP BY."""

    def get_group_by_cols(self):
        # Contribute no columns, so the outer query never groups by this subquery.
        no_columns: list = []
        return no_columns
class SnapshotQuerySet(models.QuerySet):
    """Custom QuerySet for Snapshot model with export methods that persist through .filter() etc."""

    def bulk_create(self, objs, *args, **kwargs):
        """Bulk-insert snapshots after making sure each has ``config["PERMISSIONS"]``.

        Permissions missing from a snapshot's config are inherited from its
        Crawl.  Values already available on a loaded ``obj.crawl`` are reused;
        the rest are fetched in a single query so that
        ``ensure_permissions_config`` does not fall back to one lookup per
        object.
        """
        from archivebox.core.permissions import PERMISSIONS_VALUES

        objs = list(objs)
        crawl_permissions_by_id = {}
        missing_crawl_ids = set()

        for obj in objs:
            if not isinstance(obj, self.model):
                continue
            config = dict(obj.config or {})
            permission = str(config.get("PERMISSIONS") or "").strip().lower()
            if permission in PERMISSIONS_VALUES or not obj.crawl_id:
                continue
            crawl = getattr(obj, "crawl", None)
            loaded_permissions = getattr(crawl, "permissions", None)
            if loaded_permissions:
                # Reuse what is already in memory instead of re-querying it later.
                crawl_permissions_by_id[str(obj.crawl_id)] = loaded_permissions
            else:
                missing_crawl_ids.add(str(obj.crawl_id))

        missing_crawl_ids -= crawl_permissions_by_id.keys()
        if missing_crawl_ids:
            crawl_permissions_by_id.update(
                (str(crawl_id), permissions)
                for crawl_id, permissions in Crawl.objects.filter(pk__in=missing_crawl_ids).values_list("pk", "permissions")
            )

        for obj in objs:
            if isinstance(obj, self.model):
                obj.ensure_permissions_config(crawl_permissions=crawl_permissions_by_id.get(str(obj.crawl_id)))
        return super().bulk_create(objs, *args, **kwargs)

    def _offset_pages(self, chunk_size: int):
        """Yield every row using bounded LIMIT/OFFSET pages (fallback when keyset paging is unsafe)."""
        offset = 0
        while True:
            batch = list(self[offset : offset + chunk_size])
            if not batch:
                break
            yield from batch
            offset += chunk_size

    def paged_iterator(self, chunk_size: int = 500):
        """
        Iterate snapshots using bounded keyset pages instead of one streaming cursor.

        Django's iterator(chunk_size=...) still keeps a single SQLite SELECT
        cursor open until the full queryset is exhausted. That is fine for
        read-only exports, but update/migration code does filesystem work and
        writes while iterating; a long-lived read cursor there can stretch lock
        waits across thousands of rows. This respects the queryset's existing
        filters, order_by(), select_related(), and prefetch_related() state; if
        no ordering is defined, it falls back to primary-key order.

        Keyset paging is only used when every ordering term is a plain,
        non-nullable model field and at least one of them is unique; otherwise
        rows are paged with LIMIT/OFFSET.
        """
        pk_field = self.model._meta.pk.name
        raw_ordering = tuple(self.query.order_by or self.model._meta.ordering or (pk_field,))

        # Expression orderings and random ("?") ordering have no usable keyset cursor.
        if any(not isinstance(term, str) or term == "?" for term in raw_ordering):
            yield from self._offset_pages(chunk_size)
            return

        ordering = []
        for term in raw_ordering:
            descending = term.startswith("-")
            field_name = term[1:] if descending else term
            if field_name == "pk":
                field_name = pk_field
            ordering.append(f"-{field_name}" if descending else field_name)

        ordered_field_names = [term[1:] if term.startswith("-") else term for term in ordering]
        try:
            has_nullable_term = any(self.model._meta.get_field(field_name).null for field_name in ordered_field_names)
        except Exception:
            # Deliberate best-effort: any term that cannot be resolved to a concrete
            # model field (annotation, related lookup, ...) uses offset paging.
            has_nullable_term = True
        if has_nullable_term:
            yield from self._offset_pages(chunk_size)
            return

        # Without a unique term, ties could straddle a page boundary and be skipped.
        unique_field_names = {pk_field, *(field.name for field in self.model._meta.fields if field.unique)}
        if not any(field_name in unique_field_names for field_name in ordered_field_names):
            yield from self._offset_pages(chunk_size)
            return

        value_field_names = tuple(dict.fromkeys([*ordered_field_names, pk_field]))
        pk_idx = value_field_names.index(pk_field)
        # Cursor values are keyed by field name, so an ordering that repeats a
        # field cannot misalign them with the de-duplicated selected columns.
        last_row = None
        while True:
            batch_qs = self.order_by(*ordering)
            if last_row is not None:
                # Row-value comparison: (a, b, c) > (A, B, C) expanded into
                # a > A OR (a = A AND b > B) OR (a = A AND b = B AND c > C).
                page_filter = models.Q()
                for idx, term in enumerate(ordering):
                    descending = term.startswith("-")
                    field_name = term[1:] if descending else term
                    prefix = {name: last_row[name] for name in ordered_field_names[:idx]}
                    comparison = "lt" if descending else "gt"
                    page_filter |= models.Q(**prefix, **{f"{field_name}__{comparison}": last_row[field_name]})
                batch_qs = batch_qs.filter(page_filter)

            batch_rows = list(batch_qs.values_list(*value_field_names)[:chunk_size])
            if not batch_rows:
                break

            snapshot_ids = [row[pk_idx] for row in batch_rows]
            snapshots_by_id = {snapshot.pk: snapshot for snapshot in self.filter(pk__in=snapshot_ids).order_by()}

            for row in batch_rows:
                snapshot = snapshots_by_id.get(row[pk_idx])
                # A row may have been deleted between the two queries.
                if snapshot is not None:
                    yield snapshot

            last_row = dict(zip(value_field_names, batch_rows[-1]))

    # =========================================================================
    # Filtering Methods
    # =========================================================================

    # Maps a --filter-type name to a function building the Q object for one pattern.
    FILTER_TYPES = {
        "exact": lambda pattern: models.Q(url=pattern),
        "substring": lambda pattern: models.Q(url__icontains=pattern),
        "regex": lambda pattern: models.Q(url__iregex=pattern),
        "domain": lambda pattern: (
            models.Q(url__istartswith=f"http://{pattern}")
            | models.Q(url__istartswith=f"https://{pattern}")
            | models.Q(url__istartswith=f"ftp://{pattern}")
        ),
        "tag": lambda pattern: models.Q(tags__name=pattern),
        "timestamp": lambda pattern: models.Q(timestamp=pattern),
    }
    FILTER_TYPE_CHOICES = tuple(FILTER_TYPES)
    FILTER_ARG_KEYS = (
        "after",
        "before",
        "filter_type",
        "filter_patterns",
        "status",
        "url__icontains",
        "url__istartswith",
        "tag",
        "crawl_id",
        "limit",
        "sort",
        "search",
    )
    # Keys that search() handles itself instead of passing straight to .filter().
    SPECIAL_FILTER_ARG_KEYS = frozenset({"filter_patterns", "filter_type", "query", "search", "tag", "before", "after", "limit", "sort"})

    def filter_by_patterns(self, patterns: list[str], filter_type: str = "exact") -> "SnapshotQuerySet":
        """Filter snapshots by URL patterns using specified filter type.

        Patterns are OR-ed together.  An unknown ``filter_type`` prints an
        error to stderr and raises ``SystemExit(2)``.
        """
        from archivebox.misc.logging import stderr

        q_filter = models.Q()
        for pattern in patterns:
            try:
                q_filter = q_filter | self.FILTER_TYPES[filter_type](pattern)
            except KeyError:
                stderr()
                stderr(f"[X] Got invalid pattern for --filter-type={filter_type}:", color="red")
                stderr(f"    {pattern}")
                raise SystemExit(2)
        return self.filter(q_filter)

    def search(self, **kwargs) -> "SnapshotQuerySet":
        """Apply CLI/API-style filter arguments and return the narrowed queryset.

        Recognised keys are listed in ``FILTER_ARG_KEYS`` plus ``query``.  Any
        other non-None key whose first ``__`` segment names a model field is
        passed to ``.filter()``.  ``before``/``after`` are UNIX timestamps
        compared against ``bookmarked_at``.  A positive ``limit`` slices the
        result, so the returned queryset may not be filterable further.
        """
        from datetime import timezone as dt_timezone

        from archivebox.core.snapshot_status import filter_snapshots_by_status
        from archivebox.search.query import apply_snapshot_search

        queryset = self
        filter_patterns = tuple(str(pattern) for pattern in kwargs.get("filter_patterns") or ())
        filter_type = kwargs.get("filter_type") or "substring"
        query = kwargs.get("query")
        if isinstance(query, (list, tuple)):
            query = " ".join(str(part) for part in query)
        # With a search mode set and no explicit query, the patterns become the query text.
        query = (query or (" ".join(filter_patterns) if kwargs.get("search") else "")).strip()

        field_names = {field.name for field in self.model._meta.get_fields()}
        field_names.update(field.attname for field in self.model._meta.fields)
        field_filters = {
            key: value
            for key, value in kwargs.items()
            if value is not None and key not in self.SPECIAL_FILTER_ARG_KEYS and key.split("__", 1)[0] in field_names
        }
        status = field_filters.pop("status", None)
        queryset = filter_snapshots_by_status(queryset, status)
        if field_filters:
            queryset = queryset.filter(**field_filters)
        if kwargs.get("tag"):
            queryset = queryset.filter(tags__name__iexact=kwargs["tag"])
        if kwargs.get("before") is not None:
            queryset = queryset.filter(bookmarked_at__lt=datetime.fromtimestamp(float(kwargs["before"]), tz=dt_timezone.utc))
        if kwargs.get("after") is not None:
            queryset = queryset.filter(bookmarked_at__gt=datetime.fromtimestamp(float(kwargs["after"]), tz=dt_timezone.utc))

        if query:
            queryset = apply_snapshot_search(
                queryset,
                query,
                search_mode=kwargs.get("search"),
                ordering=("-created_at",) if not kwargs.get("sort") else None,
                max_results=kwargs.get("limit"),
                skip_backend_when_metadata_satisfies_limit=True,
                include_metadata_for_forced_backend=True,
            )
        elif filter_patterns:
            queryset = queryset.filter_by_patterns(list(filter_patterns), filter_type)

        if kwargs.get("sort"):
            queryset = queryset.order_by(kwargs["sort"])
        elif not queryset.query.order_by:
            queryset = queryset.order_by("-created_at")

        limit = kwargs.get("limit")
        if limit is not None and limit > 0:
            queryset = queryset[:limit]

        return queryset

    # =========================================================================
    # Export Methods
    # =========================================================================

    def to_json(self, with_headers: bool = False) -> str:
        """Generate JSON index from snapshots.

        Returns a JSON list of snapshot dicts, or, when ``with_headers`` is
        true, an object holding index metadata plus that list under ``links``.
        """
        import sys
        from datetime import datetime, timezone as tz
        from archivebox.config import VERSION

        config = get_config()

        MAIN_INDEX_HEADER = (
            {
                "info": "This is an index of site data archived by ArchiveBox: The self-hosted web archive.",
                "schema": "archivebox.index.json",
                "copyright_info": config.FOOTER_INFO,
                "meta": {
                    "project": "ArchiveBox",
                    "version": VERSION,
                    "git_sha": VERSION,
                    "website": "https://ArchiveBox.io",
                    "docs": "https://github.com/ArchiveBox/ArchiveBox/wiki",
                    "source": "https://github.com/ArchiveBox/ArchiveBox",
                    "issues": "https://github.com/ArchiveBox/ArchiveBox/issues",
                    "dependencies": {},
                },
            }
            if with_headers
            else {}
        )

        snapshot_dicts = [s.to_dict(extended=True) for s in self.iterator(chunk_size=500)]

        if with_headers:
            output = {
                **MAIN_INDEX_HEADER,
                "num_links": len(snapshot_dicts),
                "updated": datetime.now(tz.utc),
                "last_run_cmd": sys.argv,
                "links": snapshot_dicts,
            }
        else:
            output = snapshot_dicts
        return to_json(output, indent=4, sort_keys=True)

    def to_csv(self, cols: list[str] | None = None, header: bool = True, separator: str = ",", ljust: int = 0) -> str:
        """Generate CSV output from snapshots.

        NOTE: with ``header=False`` the header slot is an empty string, so the
        output starts with a blank line; kept as-is for existing consumers.
        """
        cols = cols or ["timestamp", "is_archived", "url"]
        header_str = separator.join(col.ljust(ljust) for col in cols) if header else ""
        row_strs = (s.to_csv(cols=cols, ljust=ljust, separator=separator) for s in self.iterator(chunk_size=500))
        return "\n".join((header_str, *row_strs))

    def to_html(self, with_headers: bool = True) -> str:
        """Generate main index HTML from snapshots.

        Renders ``static_index.html`` when ``with_headers`` is true, otherwise
        ``minimal_index.html``.
        """
        from datetime import datetime, timezone as tz
        from django.template.loader import render_to_string
        from archivebox.config import VERSION
        from archivebox.config.version import get_COMMIT_HASH

        config = get_config()

        template = "static_index.html" if with_headers else "minimal_index.html"
        snapshot_list = list(self.iterator(chunk_size=500))
        # One timestamp for both fields so they can never disagree across midnight.
        now = datetime.now(tz.utc)

        return render_to_string(
            template,
            {
                "version": VERSION,
                "git_sha": get_COMMIT_HASH() or VERSION,
                "num_links": str(len(snapshot_list)),
                "date_updated": now.strftime("%Y-%m-%d"),
                "time_updated": now.strftime("%Y-%m-%d %H:%M"),
                "links": snapshot_list,
                "FOOTER_INFO": config.FOOTER_INFO,
            },
        )
during bulk operations + # Views/templates can add .prefetch_related('tags', 'archiveresult_set') where needed + return super().get_queryset() + + # ========================================================================= + # Import Methods + # ========================================================================= + + def remove(self, atomic: bool = False) -> tuple: + """Remove snapshots from the database""" + from django.db import transaction + + if atomic: + with transaction.atomic(): + return self.get_queryset().delete() + return self.get_queryset().delete() + + +class Snapshot(ModelWithDeleteAfter, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine): + INTERNAL_INPUT_URL = "archivebox://internal" + + id = CompactUUIDField(primary_key=True, default=uuid7, editable=False, unique=True) + created_at = models.DateTimeField(default=timezone.now, db_index=True) + modified_at = models.DateTimeField(auto_now=True) + + # Stored as a variable-length TextField so short URLs don't reserve space and very long + # URLs (up to MAX_URL_LENGTH chars, enforced in save()) are supported, while keeping a + # normal index so exact, prefix, and substring lookups all stay fast. 
+ url = models.TextField(db_index=True) # URLs can appear in multiple crawls + timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False) + bookmarked_at = models.DateTimeField(default=timezone.now, db_index=True) + crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, null=False, related_name="snapshot_set", db_index=True) # type: ignore[assignment] + parent_snapshot = models.ForeignKey( + "self", + on_delete=models.SET_NULL, + null=True, + blank=True, + related_name="child_snapshots", + db_index=True, + help_text="Parent snapshot that discovered this URL (for recursive crawling)", + ) + + title = models.CharField(max_length=512, null=True, blank=True, db_index=True) + downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True) + depth = models.PositiveSmallIntegerField(default=0, db_index=True) # 0 for root snapshot, 1+ for discovered URLs + fs_version = models.CharField( + max_length=10, + default="0.9.0", + help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().', + ) + current_step = models.PositiveSmallIntegerField( + default=0, + db_index=True, + help_text="Current hook step being executed (0-9). 
Used for sequential hook execution.", + ) + + retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now) + status = ModelWithStateMachine.StatusField( + choices=ModelWithStateMachine.StatusChoices, + default=ModelWithStateMachine.StatusChoices.QUEUED, + ) + config = models.JSONField(default=dict, null=False, blank=False, editable=True) + permissions = models.GeneratedField( + expression=Coalesce(KT("config__PERMISSIONS"), Value("private"), output_field=models.CharField(max_length=16)), + output_field=models.CharField(max_length=16, null=False), + db_persist=True, + db_index=True, + editable=False, + ) + output_size = models.BigIntegerField( + default=0, + db_index=True, + editable=False, + help_text="Total bytes of all ArchiveResult output files", + ) + notes = models.TextField(blank=True, null=False, default="") + # output_dir is computed via @cached_property from fs_version and get_storage_path_for_version() + + tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name="snapshot_set", through_fields=("snapshot", "tag")) + + state_machine_name = "archivebox.core.models.SnapshotMachine" + state_field_name = "status" + retry_at_field_name = "retry_at" + StatusChoices = ModelWithStateMachine.StatusChoices + active_state = StatusChoices.STARTED + delete_after_final_statuses = (StatusChoices.SEALED,) + RUNNABLE_STATES = (StatusChoices.QUEUED, StatusChoices.STARTED) + OPEN_STATES = (*RUNNABLE_STATES, StatusChoices.PAUSED) + + crawl_id: uuid.UUID + parent_snapshot_id: uuid.UUID | None + _prefetched_objects_cache: dict[str, Any] + + objects = SnapshotManager() + archiveresult_set: models.Manager["ArchiveResult"] + + if TYPE_CHECKING: + + @property + def sm(self) -> "SnapshotMachine": ... 
+ + class Meta( + ModelWithDeleteAfter.Meta, + ModelWithOutputDir.Meta, + ModelWithConfig.Meta, + ModelWithNotes.Meta, + ModelWithHealthStats.Meta, + ModelWithStateMachine.Meta, + ): + app_label = "core" + verbose_name = "Snapshot" + verbose_name_plural = "Snapshots" + indexes = [ + models.Index(fields=["-bookmarked_at", "-created_at"], name="snapshot_public_order_idx"), + models.Index(fields=["crawl", "status", "modified_at"], name="snapshot_progress_idx"), + ] + constraints = [ + # Allow same URL in different crawls, but not duplicates within same crawl + models.UniqueConstraint(fields=["url", "crawl"], name="unique_url_per_crawl"), + # Global timestamp uniqueness for 1:1 symlink mapping + models.UniqueConstraint(fields=["timestamp"], name="unique_timestamp"), + ] + + def __str__(self): + return f"[{self.id}] {self.url[:64]}" + + @classmethod + def crawl_count_subquery(cls, *, status: str | None = None, outer_ref: str = "pk") -> QuerySet: + """Return a scalar subquery counting Snapshots for one outer Crawl.""" + qs = cls.objects.filter(crawl_id=models.OuterRef(outer_ref)) + if status is not None: + qs = qs.filter(status=status) + return qs.order_by().values("crawl_id").annotate(count=models.Count("pk")).values("count") + + @classmethod + def crawl_count_expr(cls, *, status: str | None = None, outer_ref: str = "pk"): + # Use scalar subqueries for sortable Crawl admin counters: SQLite can + # probe the (crawl_id, status, modified_at) index per Crawl row instead + # of joining/grouping all visible Snapshot rows. 
+ return Coalesce( + models.Subquery(cls.crawl_count_subquery(status=status, outer_ref=outer_ref), output_field=models.IntegerField()), + models.Value(0), + ) + + @classmethod + def crawl_total_and_status_counts(cls, crawl_ids: Iterable[Any], *, status: str) -> dict[str, dict[str, int]]: + """Return total and status-filtered Snapshot counts keyed by Crawl ID.""" + crawl_ids = list(crawl_ids) + if not crawl_ids: + return {} + return { + str(row["crawl_id"]): { + "total": row["total"], + "status": row["status_count"], + } + for row in cls.objects.filter(crawl_id__in=crawl_ids) + .values("crawl_id") + .annotate( + total=models.Count("pk"), + status_count=models.Count("pk", filter=Q(status=status)), + ) + } + + def update_and_requeue(self, **kwargs) -> bool: + """ + Update this Snapshot through the shared retry_at ownership path. + + Any non-final Snapshot work means the parent Crawl must also be visible + to the runner. Keep that invariant here so CLI/admin callers do not + hand-edit the parent Crawl state every time they retry a hook. 
+ """ + updated = super().update_and_requeue(**kwargs) + if not updated: + return False + + next_status = kwargs.get("status", self.status) + if next_status not in (self.StatusChoices.QUEUED, self.StatusChoices.STARTED) or not self.crawl_id: + return True + + crawl = self.crawl + crawl_status = crawl.StatusChoices.STARTED if crawl.status == crawl.StatusChoices.STARTED else crawl.StatusChoices.QUEUED + crawl.update_and_requeue( + status=crawl_status, + retry_at=kwargs.get("retry_at") or timezone.now(), + ) + return True + + def queue_for_extraction(self, *, when=None) -> bool: + """Queue this Snapshot for the runner using the normal state path.""" + return self.update_and_requeue( + status=self.StatusChoices.QUEUED, + retry_at=when or timezone.now(), + current_step=0, + ) + + def pause(self, *, save: bool = True) -> bool: + paused = super().pause(save=save) + if paused and self.pk: + ArchiveResult.pause_queryset(self.archiveresult_set.all()) + return paused + + def resume(self, *, when: datetime | None = None, save: bool = True) -> bool: + resumed = super().resume(when=when, save=save) + if resumed and self.pk: + ArchiveResult.resume_queryset(self.archiveresult_set.all(), when=when) + return resumed + + def restore_paused_scheduler_marker(self) -> None: + """ + Keep explicit maintenance from accidentally resuming paused snapshots. + + Targeted jobs such as `archivebox update --index-only` may bump + retry_at so the orchestrator can run only queued search ArchiveResult + rows. After that maintenance pass, the lifecycle must remain PAUSED and + retry_at must go back to MAX until a real resume transition happens. 
+ """ + type(self).objects.filter( + pk=self.pk, + status=self.StatusChoices.PAUSED, + ).update( + retry_at=RETRY_AT_MAX, + modified_at=timezone.now(), + ) + self.status = self.StatusChoices.PAUSED + self.retry_at = RETRY_AT_MAX + + def reconcile_parent_lifecycle(self, *, lock_seconds: int = 60) -> bool | None: + """ + Follow parent Crawl pause/seal state before any Snapshot work runs. + + Crawl.pause()/cancel() only wake child rows. The runner claims each due + Snapshot and lets this method perform the actual child transition, so + cancellation stays fast and Snapshot cleanup still runs from the normal + state-machine owner. + """ + parent_status = Crawl.objects.filter(id=self.crawl_id).values_list("status", flat=True).first() + if parent_status == Crawl.StatusChoices.SEALED and self.status != self.StatusChoices.SEALED: + if not self.claim_processing_lock(lock_seconds=lock_seconds): + return False + self.refresh_from_db() + parent_status = Crawl.objects.filter(id=self.crawl_id).values_list("status", flat=True).first() + if parent_status == Crawl.StatusChoices.SEALED and self.status != self.StatusChoices.SEALED: + self.sm.seal() + return True + + if parent_status == Crawl.StatusChoices.PAUSED and self.status not in (self.StatusChoices.PAUSED, self.StatusChoices.SEALED): + if not self.claim_processing_lock(lock_seconds=lock_seconds): + return False + self.refresh_from_db() + parent_status = Crawl.objects.filter(id=self.crawl_id).values_list("status", flat=True).first() + if parent_status == Crawl.StatusChoices.PAUSED and self.status not in ( + self.StatusChoices.PAUSED, + self.StatusChoices.SEALED, + ): + self.pause() + return True + + return None + + def finalize_completed_upload_results(self) -> int: + now = timezone.now() + result_ids = [] + upload_results = ( + self.archiveresult_set.filter( + status=ArchiveResult.StatusChoices.QUEUED, + hook_name="on_Snapshot__archivebox_browser_extension_upload", + output_size__gt=0, + ) + .exclude(output_files={}) + 
.only("id", "output_files") + ) + for result in upload_results: + if ArchiveResult.output_files_upload_complete(result.output_files or {}): + result_ids.append(result.id) + if not result_ids: + return 0 + # Browser-extension uploads are already-finished external writes. If the + # PATCH request saved files but omitted status, finalize only this + # Snapshot's complete uploads without scanning ArchiveResult globally. + return ArchiveResult.objects.filter(id__in=result_ids, status=ArchiveResult.StatusChoices.QUEUED).update( + status=ArchiveResult.StatusChoices.SUCCEEDED, + modified_at=now, + ) + + def reset_abandoned_results(self) -> tuple[int, int]: + reset_count = 0 + running_count = 0 + for result in self.archiveresult_set.filter( + status__in=[ArchiveResult.StatusChoices.STARTED, ArchiveResult.StatusChoices.BACKOFF], + ).select_related("process"): + process = result.process + if process is not None and process.is_running: + running_count += 1 + continue + result.reset_for_retry() + reset_count += 1 + return reset_count, running_count + + def cancel(self) -> None: + if self.status != self.StatusChoices.SEALED: + self.sm.seal() + + def get_delete_after_config_value(self): + from archivebox.config.common import resolve_delete_after_config_value + + return resolve_delete_after_config_value(self.config, self.crawl.config) + + @classmethod + def missing_delete_at_candidates(cls): + return cls.objects.filter(delete_at__isnull=True).filter( + Q(config__has_key="DELETE_AFTER") | Q(crawl__config__has_key="DELETE_AFTER"), + ) + + @classmethod + def is_archivebox_internal_url(cls, url: str, *, config: Mapping[str, Any] | Any | None = None) -> bool: + parsed = urlparse((url or "").strip()) + if parsed.scheme not in ("http", "https") or not parsed.hostname: + return False + + from archivebox.core.routes_util import ( + get_admin_host, + get_api_host, + get_base_host, + get_listen_host, + get_web_host, + split_host_port, + ) + + if config is None: + config = get_config() + elif 
isinstance(config, Mapping): + route_config = config + + class RouteConfig: + BIND_ADDR = str(route_config.get("BIND_ADDR") or "") + BASE_URL = str(route_config.get("BASE_URL") or "") + CSRF_TRUSTED_ORIGINS = str(route_config.get("CSRF_TRUSTED_ORIGINS") or "") + SERVER_SECURITY_MODE = str(route_config.get("SERVER_SECURITY_MODE") or "") + + @property + def USES_SUBDOMAIN_ROUTING(self) -> bool: + return self.SERVER_SECURITY_MODE == "safe-subdomains-fullreplay" + + config = RouteConfig() + host = parsed.hostname.lower().strip(".") + port = str(parsed.port) if parsed.port else None + protected_subdomains = {"admin", "web", "api"} + protected_hosts: set[tuple[str, str | None]] = set() + protected_roots: set[tuple[str, str | None]] = set() + for host_value in ( + get_listen_host(config=config), + get_base_host(config=config), + get_admin_host(config=config), + get_web_host(config=config), + get_api_host(config=config), + ): + if not host_value: + continue + protected_host, protected_port = split_host_port(host_value) + protected_host = protected_host.strip(".") + if not protected_host: + continue + protected_hosts.add((protected_host, protected_port)) + if protected_host in {"", "0.0.0.0", "::", "127.0.0.1", "::1", "localhost"}: + for local_alias in ("127.0.0.1", "localhost"): + protected_hosts.add((local_alias, protected_port)) + parts = protected_host.split(".", 1) + if len(parts) == 2 and (parts[0] in protected_subdomains or parts[0].startswith("snap-")): + protected_roots.add((parts[1], protected_port)) + else: + protected_roots.add((protected_host, protected_port)) + + for protected_host, protected_port in protected_hosts: + if host == protected_host and (protected_port is None or port == protected_port): + return True + + if config.USES_SUBDOMAIN_ROUTING: + for protected_root, protected_port in protected_roots: + if protected_port is not None and port != protected_port: + continue + if not protected_root or not host.endswith(f".{protected_root}"): + continue + 
subdomain = host[: -(len(protected_root) + 1)] + if subdomain in protected_subdomains or subdomain.startswith("snap-"): + return True + + return False + + @property + def created_by(self): + """Convenience property to access the user who created this snapshot via its crawl.""" + return self.crawl.created_by + + @property + def process_set(self): + """Get all Process objects related to this snapshot's ArchiveResults.""" + from archivebox.machine.models import Process + + return Process.objects.filter(archiveresult__snapshot_id=self.id) + + @property + def binary_set(self): + """Get all Binary objects used by processes related to this snapshot.""" + from archivebox.machine.models import Binary + + return Binary.objects.filter(process_set__archiveresult__snapshot_id=self.id).distinct() + + def ensure_permissions_config(self, crawl_permissions: str | None = None) -> bool: + config = dict(self.config or {}) + permission = str(config.get("PERMISSIONS") or "").strip().lower() + from archivebox.core.permissions import PERMISSIONS_PUBLIC, PERMISSIONS_VALUES, normalize_permissions + + if permission not in PERMISSIONS_VALUES: + if self.crawl_id: + if not crawl_permissions: + crawl_permissions = Crawl.objects.filter(pk=self.crawl_id).values_list("permissions", flat=True).first() + config["PERMISSIONS"] = normalize_permissions( + crawl_permissions, + default=PERMISSIONS_PUBLIC, + ) + self.config = config + return True + elif config.get("PERMISSIONS") != permission: + config["PERMISSIONS"] = permission + self.config = config + return True + return False + + def validate_url_for_archiving(self, *, config: Mapping[str, Any] | Any | None = None) -> None: + if self.is_internal_input_url(): + return + + try: + validate_url(self.url or "") + except ValueError as err: + raise ValidationError({"url": str(err)}) from err + + if self.is_archivebox_internal_url(self.url, config=config): + raise ValidationError({"url": "ArchiveBox cannot archive its own admin, web, api, or snapshot URLs."}) 
def save(self, *args, **kwargs):
    """
    Validate, normalise and persist the snapshot, then schedule post-commit work.

    Before the row is written this fills a missing ``PERMISSIONS`` config
    entry, validates the URL (only when the row is new or ``url`` is being
    written), defaults ``bookmarked_at``/``timestamp``, normalises the title,
    and migrates the on-disk directory of an existing snapshot when needed.
    After ``super().save()`` it registers ``transaction.on_commit`` callbacks
    that create symlinks, attach the crawl's tags, append the URL to
    ``Crawl.urls`` and clean up a migrated legacy directory.

    ``update_fields`` in *kwargs* is extended whenever this method changes a
    field the caller did not list, so the change is still written.
    """
    update_fields = kwargs.get("update_fields")
    # URL validation is skipped for partial saves that do not touch `url`.
    validate_url_field = self._state.adding or update_fields is None or "url" in update_fields
    crawl_config_for_save = None
    crawl_permissions_for_save = None
    if self.crawl_id and validate_url_field:
        # Fetch only the two crawl columns needed below instead of the whole row.
        crawl_row = Crawl.objects.filter(pk=self.crawl_id).values("config", "permissions").first()
        if crawl_row:
            crawl_config_for_save = crawl_row.get("config") or {}
            crawl_permissions_for_save = crawl_row.get("permissions")

    if self.ensure_permissions_config(crawl_permissions=crawl_permissions_for_save):
        if update_fields is not None:
            # dict.fromkeys() de-duplicates while preserving order.
            kwargs["update_fields"] = tuple(dict.fromkeys([*update_fields, "config"]))

    if validate_url_field:
        self.validate_url_for_archiving(config=crawl_config_for_save if self.crawl_id else None)

    if not self.bookmarked_at:
        self.bookmarked_at = self.created_at or timezone.now()
    if not self.timestamp:
        self.timestamp = str(self.bookmarked_at.timestamp())

    if self._state.adding or update_fields is None or "title" in update_fields:
        self.title = self._normalize_title_candidate(self.title, snapshot_url=self.url or "") or None

    # Migrate filesystem if needed (happens automatically on save)
    existing_snapshot = self.pk and not self._state.adding
    if existing_snapshot and self.fs_migration_needed:
        self.migrate_filesystem_to_current_version()
        # Re-read: update_fields may have been replaced above.
        update_fields = kwargs.get("update_fields")
        if update_fields is not None:
            kwargs["update_fields"] = tuple(dict.fromkeys([*update_fields, "fs_version", "modified_at"]))
    elif existing_snapshot:
        # fs_version is already current, but the files may still sit in a
        # different real (non-symlink) directory; move them to the current path.
        current_dir = self.get_storage_path_for_version(self._fs_current_version())
        source_dir = Path(self.output_dir)
        if source_dir.exists() and source_dir != current_dir and not source_dir.is_symlink():
            self.migrate_filesystem_to_current_version(source_dir=source_dir)

    super().save(*args, **kwargs)

    from django.db import transaction

    def finish_snapshot_save():
        """Post-commit work: symlinks, crawl tag fan-out, and Crawl.urls append."""
        self.ensure_legacy_archive_symlink()
        self.ensure_crawl_symlink()
        crawl = Crawl.objects.filter(pk=self.crawl_id).first()
        if crawl is None:
            return
        crawl_tag_names = crawl.current_tag_names()
        if crawl_tag_names:
            # Snapshots can be created by parser hook side-effect records,
            # direct ORM creates, or legacy crawl URL expansion. Crawl tags
            # are user-facing metadata on the whole import, so attach them
            # at the Snapshot.save() boundary instead of relying on every
            # caller to remember to duplicate this fanout logic.
            tags_by_name = {tag.name: tag for tag in Tag.objects.filter(name__in=crawl_tag_names)}
            missing_tags = [Tag(name=name) for name in crawl_tag_names if name not in tags_by_name]
            if missing_tags:
                Tag.objects.bulk_create(missing_tags, ignore_conflicts=True)
                # Re-query after the bulk insert to pick up the new rows.
                tags_by_name = {tag.name: tag for tag in Tag.objects.filter(name__in=crawl_tag_names)}
            self.tags.add(*[tag.pk for name in crawl_tag_names if (tag := tags_by_name.get(name))])
        # Snapshot.save() normally appends newly created URLs to Crawl.urls
        # so legacy/direct crawls can keep their queue text in sync. For
        # internal-input crawls that would corrupt the original submitted
        # import text; parsed URLs are represented by child Snapshot rows.
        if crawl.has_internal_input_root():
            return
        if not crawl.url_passes_filters(self.url, snapshot=self, use_effective_config=False):
            return
        # Best-effort skip if our URL is already recorded on the crawl;
        # the atomic UPDATE below is what actually prevents clobbering.
        crawl.refresh_from_db(fields=["urls"])
        if self.url in {url for _raw_line, url in crawl._iter_url_lines() if url}:
            return
        now = timezone.now()
        # Atomic append: SQLite reads `urls` inside the UPDATE statement,
        # so concurrent appends never clobber each other (no read-then-write
        # window, no CAS retry needed). A rare duplicate URL on a race is
        # harmless - downstream consumers dedupe via Snapshot uniqueness.
        text = models.TextField()
        type(crawl).objects.filter(pk=crawl.pk).update(
            urls=Case(
                # Empty or NULL urls: store the URL alone, without a leading newline.
                When(Q(urls="") | Q(urls__isnull=True), then=Value(self.url, output_field=text)),
                default=Concat(
                    "urls",
                    Value("\n", output_field=text),
                    Value(self.url, output_field=text),
                    output_field=text,
                ),
                output_field=text,
            ),
            modified_at=now,
        )
        crawl.modified_at = now

    # get_or_create/update_or_create wrap save() in atomic(); defer filesystem
    # work and crawl maintenance so SQLite commits before touching the disk.
    transaction.on_commit(finish_snapshot_save)

    # Set by migrate_filesystem_to_current_version(); read through __dict__ so
    # only a value stored on this instance is picked up.
    migration_cleanup = self.__dict__.get("_pending_fs_migration_cleanup")
    if migration_cleanup:
        old_dir, new_dir = migration_cleanup
        transaction.on_commit(lambda: self._cleanup_old_migration_dir(old_dir, new_dir))
        delattr(self, "_pending_fs_migration_cleanup")

    # if is_new:
    #     from archivebox.misc.logging_util import log_worker_event
    #     log_worker_event(
    #         worker_type='DB',
    #         event='Created Snapshot',
    #         indent_level=2,
    #         url=self.url,
    #         metadata={
    #             'id': str(self.id),
    #             'crawl_id': str(self.crawl_id),
    #             'depth': self.depth,
    #             'status': self.status,
    #         },
    #     )
snapshot needs filesystem migration""" + return self.fs_version != self._fs_current_version() + + def _fs_next_version(self, version: str) -> str: + """Get next version in migration chain (0.7/0.8 had same layout, only 0.8โ†’0.9 migration needed)""" + # Treat 0.7.0 and 0.8.0 as equivalent (both used archive/{timestamp}) + if version in ("0.7.0", "0.8.0"): + return "0.9.0" + if version in ("0.9.0", "0.9.1", "0.9.2", "0.9.3"): + return "0.9.4" + return self._fs_current_version() + + @staticmethod + def is_legacy_archive_dir(path: Path) -> bool: + """Return True for old-style archive/{timestamp} snapshot directories.""" + if path.name in CONSTANTS.RESERVED_ARCHIVE_DIR_NAMES or path.name.startswith("."): + return False + try: + ts_int = int(float(path.name)) + except (TypeError, ValueError, OverflowError): + return False + return 788918400 <= ts_int <= 2082758400 + + def migrate_filesystem_to_current_version(self, source_dir: Path | None = None, config: "ArchiveBoxBaseConfig | None" = None) -> None: + """ + Copy legacy snapshot output into the current layout and defer old-dir cleanup. + + The ordering is intentionally crash-safe: + 1. Copy from the legacy directory into the new directory idempotently. + 2. Verify the new directory has every old file. + 3. Convert metadata in the new directory. + 4. Update fs_version in memory for the caller to save. + 5. Cleanup is scheduled only after the DB commit succeeds. 
def _fs_migrate_from_0_7_0_to_0_9_0(self, source_dir: Path | None = None, config: "ArchiveBoxBaseConfig | None" = None):
    """
    Migrate a 0.7.0 snapshot directory to the 0.9.0 layout.

    Delegates to ``_fs_migrate_legacy_to_0_9_0`` and returns its result
    unchanged; *source_dir* and *config* are forwarded as given.
    """
    return self._fs_migrate_legacy_to_0_9_0(source_dir=source_dir, config=config)
def _fs_migrate_legacy_to_0_9_0(
    self,
    source_dir: Path | None = None,
    target_dir: Path | None = None,
    config: "ArchiveBoxBaseConfig | None" = None,
):
    """
    Migrate from flat to nested structure.

    0.8.x: archive/{timestamp}/
    0.9.x: archive/users/{user}/snapshots/YYYYMMDD/{domain}/{uuid}/

    Args:
        source_dir: Directory to migrate from; defaults to the 0.8.0 storage path.
        target_dir: Directory to migrate into; defaults to the 0.9.0 storage path.
        config: Accepted for signature parity with the other migration steps;
            it is not read in this method.

    Returns:
        ``(old_dir, new_dir)`` when files were moved or copied, so the caller
        can schedule cleanup of ``old_dir``; ``None`` when there is nothing to
        migrate (same path, source is a symlink, or source is missing).

    Raises:
        Exception: if, after copying, a source file is missing from the target
            or differs from it in size or content.
    """
    import filecmp
    import shutil

    old_dir = Path(source_dir) if source_dir else self.get_storage_path_for_version("0.8.0")
    new_dir = Path(target_dir) if target_dir else self.get_storage_path_for_version("0.9.0")

    if old_dir == new_dir:
        return None

    # A symlink at the old path is skipped; no files are copied from it.
    if old_dir.is_symlink():
        return None

    if not old_dir.exists():
        # Nothing left to copy; only make sure the target's index is in JSONL form.
        if new_dir.exists():
            self.convert_index_json_to_jsonl(output_dir=new_dir)
            return None
        return None

    if not new_dir.exists():
        new_dir.parent.mkdir(parents=True, exist_ok=True)
        try:
            # Fast path: move the whole directory with a single rename.
            old_dir.rename(new_dir)
            self.convert_index_json_to_jsonl(output_dir=new_dir)
            return (old_dir, new_dir)
        except OSError:
            # Fall back to the copy path below. Note that an OSError raised by
            # the index conversion is swallowed here as well.
            pass

    new_dir.mkdir(parents=True, exist_ok=True)

    # Copy all files idempotently. If a previous attempt already converted
    # index.json to index.jsonl, recopying index.json is harmless; conversion
    # below removes it again after ensuring index.jsonl exists.
    for old_file in old_dir.rglob("*"):
        if not old_file.is_file():
            continue

        rel_path = old_file.relative_to(old_dir)
        new_file = new_dir / rel_path

        # Skip if already copied
        if new_file.exists():
            # Cheap size check first, then a full byte-for-byte comparison.
            if new_file.stat().st_size == old_file.stat().st_size and filecmp.cmp(old_file, new_file, shallow=False):
                continue

        new_file.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(old_file, new_file)

    # Verify all copied
    old_files = {f.relative_to(old_dir): f.stat().st_size for f in old_dir.rglob("*") if f.is_file()}
    new_files = {f.relative_to(new_dir): f.stat().st_size for f in new_dir.rglob("*") if f.is_file()}

    if old_files.keys() != new_files.keys():
        # Extra files in the target are tolerated; only missing ones are errors.
        missing = old_files.keys() - new_files.keys()
        missing.discard(Path(CONSTANTS.JSON_INDEX_FILENAME))
        if missing:
            raise Exception(f"Migration incomplete: missing {missing}")

    for rel_path, old_size in old_files.items():
        # The legacy JSON index is exempt from the size/content checks.
        if rel_path == Path(CONSTANTS.JSON_INDEX_FILENAME):
            continue
        if new_files.get(rel_path) != old_size:
            raise Exception(f"Migration incomplete: size mismatch for {rel_path}")
        if not filecmp.cmp(old_dir / rel_path, new_dir / rel_path, shallow=False):
            raise Exception(f"Migration incomplete: content mismatch for {rel_path}")

    # Convert index.json to index.jsonl in the new directory.
    self.convert_index_json_to_jsonl(output_dir=new_dir)

    return (old_dir, new_dir)
@staticmethod
def extract_domain_from_url(url: str) -> str:
    """
    Derive the per-domain directory name used by the 0.9.x storage layout.

    http(s) URLs yield the hostname, with "_<port>" appended when an explicit
    port is present (any ":" in that combined value becomes "_"). file URLs
    yield "localhost", other schemes yield the scheme name, and anything that
    cannot be parsed yields "unknown".

    Examples:
        https://example.com:8080 -> example.com_8080
        https://sub.example.com  -> sub.example.com
        file:///path             -> localhost
        data:text/html           -> data
    """
    from urllib.parse import urlparse

    fallback = "unknown"
    try:
        pieces = urlparse(url)
        scheme = pieces.scheme
        if scheme not in ("http", "https"):
            if scheme == "file":
                return "localhost"
            return scheme or fallback
        explicit_port = pieces.port
        if not explicit_port:
            return pieces.hostname or fallback
        return f"{pieces.hostname}_{explicit_port}".replace(":", "_")
    except Exception:
        # Includes the ValueError raised for an invalid port.
        return fallback
@classmethod
def load_from_directory(cls, snapshot_dir: Path) -> Optional["Snapshot"]:
    """
    Load the existing Snapshot row that corresponds to a directory on disk.

    Reads index.jsonl (preferred) or index.json, extracts url + timestamp and
    looks the snapshot up in the DB. Never creates a snapshot.

    Lookup rules:
      * No usable index data, or no url in it: match on timestamp alone.
      * Otherwise match on (url, timestamp). If that misses and the folder
        name is not itself a valid timestamp, fall back to a prefix match,
        because old index files may hold a truncated timestamp
        (index "1767000340" vs DB "1767000340.624737").

    An unreadable, undecodable or non-dict index is treated as "no index data".

    ONLY used by: archivebox update (for orphan detection)

    Args:
        snapshot_dir: Directory that may contain an index file.

    Returns:
        The matching Snapshot, or None if nothing matches or no valid
        timestamp can be determined.
    """
    from archivebox.machine.models import Process

    queryset = cls.objects.select_related("crawl__created_by")

    def find_by_timestamp(timestamp: str | None):
        """Return the first snapshot with exactly this timestamp, if any."""
        if not timestamp:
            return None
        # filter().first() covers the "none", "one" and "several" cases in a
        # single query with the same result as get() plus both fallbacks.
        return queryset.filter(timestamp=timestamp).first()

    # Try index.jsonl first (new format), then index.json (legacy)
    jsonl_path = snapshot_dir / CONSTANTS.JSONL_INDEX_FILENAME
    json_path = snapshot_dir / CONSTANTS.JSON_INDEX_FILENAME

    data = None
    if jsonl_path.exists():
        try:
            records = Process.parse_records_from_text(jsonl_path.read_text(encoding="utf-8"))
            for record in records:
                if isinstance(record, dict) and record.get("type") == "Snapshot":
                    data = record
                    break
        except (OSError, ValueError):
            # ValueError also covers UnicodeDecodeError from a corrupt file.
            pass
    elif json_path.exists():
        try:
            with open(json_path, encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError):
            # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
            pass

    # A syntactically valid index whose top level is not an object is unusable.
    if not isinstance(data, dict):
        data = None

    index_timestamp = data.get("timestamp") if data else None
    url = data.get("url") if data else None

    if not url:
        return find_by_timestamp(
            cls._select_best_timestamp(
                index_timestamp=index_timestamp,
                folder_name=snapshot_dir.name,
            ),
        )

    # Get timestamp - prefer index file, fallback to folder name
    timestamp = cls._select_best_timestamp(
        index_timestamp=index_timestamp,
        folder_name=snapshot_dir.name,
    )
    folder_timestamp = cls._select_best_timestamp(
        index_timestamp=None,
        folder_name=snapshot_dir.name,
    )

    if not timestamp:
        return None

    same_url = queryset.filter(url=url)
    snapshot = same_url.filter(timestamp=timestamp).first()
    if snapshot is not None:
        return snapshot

    # Do not fuzzy-match when the legacy folder name itself is a valid
    # timestamp; distinct dirs like 1508259732 and 1508259732.0 must
    # remain distinct snapshots.
    if not folder_timestamp or timestamp != folder_timestamp:
        return same_url.filter(timestamp__startswith=timestamp).first()
    return None
+ + ONLY used by: archivebox update (for orphan import) + """ + from archivebox.machine.models import Process + + # Try index.jsonl first (new format), then index.json (legacy) + jsonl_path = snapshot_dir / CONSTANTS.JSONL_INDEX_FILENAME + json_path = snapshot_dir / CONSTANTS.JSON_INDEX_FILENAME + + data = None + if jsonl_path.exists(): + try: + records = Process.parse_records_from_text(jsonl_path.read_text()) + for record in records: + if record.get("type") == "Snapshot": + data = record + break + except OSError: + pass + elif json_path.exists(): + try: + with open(json_path) as f: + data = json.load(f) + except (json.JSONDecodeError, OSError): + pass + + if not data or not data.get("url"): + archive_org_path = snapshot_dir / "archive.org.txt" + try: + archived_url = archive_org_path.read_text(encoding="utf-8", errors="replace").strip().splitlines()[0].strip() + except (IndexError, OSError): + archived_url = "" + + if archived_url.startswith(("http://", "https://")): + if "://web.archive.org/web/" in archived_url and "/web/" in archived_url: + archive_target = archived_url.split("/web/", 1)[1].split("/", 1) + if len(archive_target) == 2: + candidate = archive_target[1] + if not candidate.startswith(("http://", "https://")) and "/" in candidate: + candidate = candidate.split("/", 1)[1] + if candidate.startswith(("http://", "https://")): + archived_url = candidate + + data = { + "url": archived_url, + "timestamp": snapshot_dir.name, + "title": "", + } + + if not data: + return None + + url = data.get("url") + if not url: + return None + + # Get and validate timestamp + timestamp = cls._select_best_timestamp( + index_timestamp=data.get("timestamp"), + folder_name=snapshot_dir.name, + ) + + if not timestamp: + return None + + # Ensure uniqueness (reuses existing logic from create_or_update_from_dict) + timestamp = cls._ensure_unique_timestamp(url, timestamp) + + # Detect version + fs_version = cls._detect_fs_version_from_index(data) + + system_user_id = 
get_or_create_system_user_pk() + catchall_crawl, _ = Crawl.objects.get_or_create( + label="[migration] orphaned snapshots", + defaults={ + "urls": f"# Orphaned snapshot: {url}", + "max_depth": 0, + "created_by_id": system_user_id, + }, + ) + if cls.objects.filter(crawl=catchall_crawl, url=url).exists(): + catchall_crawl = Crawl.objects.create( + label=f"[migration] orphaned snapshot {timestamp}", + urls=url, + max_depth=0, + created_by_id=system_user_id, + ) + + snapshot_kwargs = { + "url": url, + "timestamp": timestamp, + "title": data.get("title", ""), + "fs_version": fs_version, + "crawl": catchall_crawl, + } + try: + bookmarked_at = parse_date(data.get("bookmarked_at") or timestamp) + except (TypeError, ValueError, OSError): + bookmarked_at = None + try: + created_at = parse_date(data.get("created_at")) + except (TypeError, ValueError, OSError): + created_at = None + if bookmarked_at: + snapshot_kwargs["bookmarked_at"] = bookmarked_at + if created_at: + snapshot_kwargs["created_at"] = created_at + + return cls( + **snapshot_kwargs, + ) + + @staticmethod + def _select_best_timestamp(index_timestamp: object | None, folder_name: str) -> str | None: + """ + Select best timestamp from index.json vs folder name. + + Validates range (1995-2035). When a valid legacy folder name is + available it is the stable filesystem identity, so preserve it over + normalized variants like "1508259732.0" found in old index files. 
+ """ + + def is_valid_timestamp(ts: object | None) -> bool: + if not isinstance(ts, (str, int, float)): + return False + try: + ts_int = int(float(ts)) + # 1995-01-01 to 2035-12-31 + return 788918400 <= ts_int <= 2082758400 + except (TypeError, ValueError, OverflowError): + return False + + index_valid = is_valid_timestamp(index_timestamp) if index_timestamp else False + folder_valid = is_valid_timestamp(folder_name) + + if folder_valid: + return str(folder_name).strip() + if index_valid and index_timestamp is not None: + return str(index_timestamp).strip() + return None + + @classmethod + def _ensure_unique_timestamp(cls, url: str, timestamp: str) -> str: + """ + Ensure timestamp is globally unique. + If there is a collision, add a tiny fractional suffix until unique. + """ + candidate = str(timestamp) + base = float(timestamp) + suffix = 0 + while cls.objects.filter(timestamp=candidate).exists(): + suffix += 1 + candidate = f"{base + (suffix / 1_000_000):.6f}".rstrip("0").rstrip(".") + return candidate + + @staticmethod + def _detect_fs_version_from_index(data: dict) -> str: + """ + Detect fs_version from index.json structure. + + - Has fs_version field: use it + - Has history dict: 0.7.0 + - Has archive_results list: 0.8.0 + - Default: 0.7.0 + """ + if "fs_version" in data: + return data["fs_version"] + if "history" in data and "archive_results" not in data: + return "0.7.0" + if "archive_results" in data: + return "0.8.0" + return "0.7.0" + + # ========================================================================= + # Index.json Reconciliation + # ========================================================================= + + def reconcile_with_index(self, output_dir: Path | None = None, update_existing_archive_results: bool = True): + """ + Merge index.json/index.jsonl with DB. DB is source of truth. 
+ + - Title: longest non-URL + - Tags: union + - ArchiveResults: keep both (by plugin+start_ts) + + Converts index.json to index.jsonl if needed, then writes back in JSONL format. + + Used by: archivebox update (to sync index with DB) + """ + import json + + # Try to convert index.json to index.jsonl first + output_dir = Path(output_dir) if output_dir is not None else Path(self.output_dir) + self.convert_index_json_to_jsonl(output_dir=output_dir) + + # Check for index.jsonl (preferred) or index.json (legacy) + jsonl_path = output_dir / CONSTANTS.JSONL_INDEX_FILENAME + json_path = output_dir / CONSTANTS.JSON_INDEX_FILENAME + + index_data = {} + + if jsonl_path.exists(): + # Read from JSONL format + jsonl_data = self.read_index_jsonl(output_dir=output_dir) + if jsonl_data["snapshot"]: + index_data = jsonl_data["snapshot"] + # Convert archive_results list to expected format + index_data["archive_results"] = jsonl_data["archive_results"] + elif json_path.exists(): + # Fallback to legacy JSON format + try: + with open(json_path) as f: + index_data = json.load(f) + except (OSError, TypeError, ValueError, json.JSONDecodeError): + pass + + # Merge title + self._merge_title_from_index(index_data) + + # Merge tags + self._merge_tags_from_index(index_data) + + # Merge ArchiveResults + self._merge_archive_results_from_index(index_data, update_existing=update_existing_archive_results) + if not self._normalize_title_candidate(self.title, snapshot_url=self.url): + title_results = ( + self.archiveresult_set.filter( + plugin="title", + status=ArchiveResult.StatusChoices.SUCCEEDED, + ) + .exclude(output_str="") + .order_by("-start_ts", "-end_ts", "-created_at") + ) + for title_result in title_results.only("output_str"): + result_title = self._normalize_title_candidate(title_result.output_str, snapshot_url=self.url) + if result_title: + self.title = result_title + break + + # Write back in JSONL format + self.write_index_jsonl(output_dir=output_dir) + + def 
reconcile_with_index_json(self, output_dir: Path | None = None, update_existing_archive_results: bool = True): + """Deprecated: use reconcile_with_index() instead.""" + return self.reconcile_with_index(output_dir=output_dir, update_existing_archive_results=update_existing_archive_results) + + def _merge_title_from_index(self, index_data: dict): + """Merge title - prefer longest non-URL title.""" + index_title = self._normalize_title_candidate(index_data.get("title"), snapshot_url=self.url) + db_title = self._normalize_title_candidate(self.title, snapshot_url=self.url) + + candidates = [t for t in [index_title, db_title] if t] + if candidates: + best_title = max(candidates, key=len) + if self.title != best_title: + self.title = best_title + elif self.title: + self.title = None + + def _merge_tags_from_index(self, index_data: dict): + """Merge tags - union of both sources.""" + from django.db import transaction + + index_tags = set(index_data.get("tags", "").split(",")) if index_data.get("tags") else set() + index_tags = {t.strip() for t in index_tags if t.strip()} + + db_tags = set(self.tags.values_list("name", flat=True)) + + new_tags = index_tags - db_tags + if new_tags: + with transaction.atomic(): + for tag_name in new_tags: + tag, _ = Tag.objects.get_or_create(name=tag_name) + self.tags.add(tag) + + def _merge_archive_results_from_index(self, index_data: dict, update_existing: bool = True): + """Merge ArchiveResults one row per hook; retries update the existing row.""" + existing = {(ar.plugin, ar.hook_name): ar for ar in ArchiveResult.objects.filter(snapshot=self)} + if update_existing: + for archiveresult in existing.values(): + normalized_status = ArchiveResult.normalize_status(archiveresult.status) + if archiveresult.status != normalized_status: + archiveresult.status = normalized_status + archiveresult.save(update_fields=["status", "modified_at"]) + + # Handle 0.8.x format (archive_results list) + for result_data in index_data.get("archive_results", []): + 
def _create_archive_result_if_missing(self, result_data: dict, existing: dict, update_existing: bool = True):
    """
    Create an ArchiveResult from one index record, or refresh the matching row.

    Args:
        result_data: A single archive-result record read from an index file.
            Legacy keys ('extractor', 'output') and current keys
            ('plugin', 'output_str') are both accepted.
        existing: Mapping of (plugin, hook_name) -> ArchiveResult for rows that
            are already known. A newly created row is added to this mapping.
        update_existing: When False, a record that matches an entry in
            `existing` is left untouched.

    Records without a usable plugin name are ignored.
    """
    from dateutil import parser
    from django.db import transaction
    from archivebox.machine.models import Machine, Process

    def parse_ts(raw):
        """Parse a timestamp value; naive results are made aware in the current timezone."""
        parsed = None
        if raw:
            try:
                parsed = parser.parse(raw)
                if parsed and timezone.is_naive(parsed):
                    parsed = timezone.make_aware(parsed, timezone.get_current_timezone())
            except (TypeError, ValueError, OverflowError):
                pass
        return parsed

    # Support both old 'extractor' and new 'plugin' keys for backwards compat.
    # str(... or "") keeps a null / non-string value from crashing the slice.
    plugin = str(result_data.get("plugin") or result_data.get("extractor") or "")[:32]
    if not plugin:
        return

    start_ts = parse_ts(result_data.get("start_ts"))
    end_ts = parse_ts(result_data.get("end_ts"))

    # Support both 'output' (legacy) and 'output_str' (new JSONL) field names.
    # Legacy records can carry an explicit null here, so fall back to "".
    output_str = result_data.get("output_str") or result_data.get("output") or ""
    status = ArchiveResult.normalize_status(result_data.get("status") or ArchiveResult.StatusChoices.FAILED)
    cmd = result_data.get("cmd") or []
    pwd = result_data.get("pwd") or ""
    output_files = ArchiveResult._normalize_output_files(result_data.get("output_files"))
    output_size = ArchiveResult._coerce_output_file_size(result_data.get("output_size"))
    output_json = result_data.get("output_json")
    output_mimetypes = result_data.get("output_mimetypes") or ""
    hook_name = result_data.get("hook_name") or ""

    existing_result = existing.get((plugin, hook_name))
    if existing_result:
        if not update_existing:
            return

        # (field, new value, whether the record actually supplies this field).
        # status is always compared; output_size is applied whenever the key is
        # present (even if it is 0); every other field only when non-empty.
        candidates = (
            ("status", status, True),
            ("output_str", output_str, bool(output_str)),
            ("output_json", output_json, bool(output_json)),
            ("output_files", output_files, bool(output_files)),
            ("output_size", output_size, "output_size" in result_data),
            ("output_mimetypes", output_mimetypes, bool(output_mimetypes)),
            ("start_ts", start_ts, bool(start_ts)),
            ("end_ts", end_ts, bool(end_ts)),
        )
        changed = []
        for field_name, new_value, supplied in candidates:
            if supplied and getattr(existing_result, field_name) != new_value:
                setattr(existing_result, field_name, new_value)
                changed.append(field_name)
        if changed:
            existing_result.save(update_fields=[*changed, "modified_at"])
        return

    # Machine.current() can probe the host and sanitize config. Do that before
    # atomic() so the transaction below only covers the two related row writes.
    machine = Machine.current() if cmd or pwd else None
    process = None
    with transaction.atomic():
        if machine is not None:
            process = Process.objects.create(
                machine=machine,
                process_type=Process.TypeChoices.HOOK,
                worker_type="archiveresult",
                cmd=cmd,
                pwd=pwd,
                status=Process.StatusChoices.EXITED,
                exit_code=0 if status in ("succeeded", "skipped", "noresults") else 1,
                started_at=start_ts,
                ended_at=end_ts,
            )

        archiveresult = ArchiveResult.objects.create(
            snapshot=self,
            plugin=plugin,
            hook_name=hook_name,
            status=status,
            output_str=output_str,
            output_json=output_json,
            output_files=output_files,
            output_size=output_size,
            output_mimetypes=output_mimetypes,
            start_ts=start_ts,
            end_ts=end_ts,
            process=process,
        )
    existing[(plugin, hook_name)] = archiveresult
+ + Each line is a JSON record with a 'type' field: + - Snapshot: snapshot metadata (crawl_id, url, tags, etc.) + - ArchiveResult: extractor results (plugin, status, output, etc.) + - Binary: binary info used for the extraction + - Process: process execution details (cmd, exit_code, timing, etc.) + """ + import json + + output_dir = Path(output_dir) if output_dir is not None else Path(self.output_dir) + index_path = output_dir / CONSTANTS.JSONL_INDEX_FILENAME + index_path.parent.mkdir(parents=True, exist_ok=True) + + # Track unique binaries and processes to avoid duplicates + binaries_seen = set() + processes_seen = set() + + tmp_index_path = index_path.with_name(f".{index_path.name}.tmp") + with open(tmp_index_path, "w") as f: + # Write Snapshot record first (to_json includes crawl_id, fs_version) + f.write(json.dumps(self.to_json()) + "\n") + + # Write ArchiveResult records with their associated Binary and Process + # Use select_related to optimize queries + for ar in self.archiveresult_set.select_related("process__binary").order_by("start_ts"): + process = ar.process_record + # Write Binary record if not already written + if process and process.binary and process.binary_id not in binaries_seen: + binaries_seen.add(process.binary_id) + f.write(json.dumps(process.binary.to_json()) + "\n") + + # Write Process record if not already written + if process and process.id not in processes_seen: + processes_seen.add(process.id) + f.write(json.dumps(process.to_json()) + "\n") + + # Write ArchiveResult record + f.write(json.dumps(ar.to_json(snapshot_output_dir=output_dir)) + "\n") + os.replace(tmp_index_path, index_path) + + def read_index_jsonl(self, output_dir: Path | None = None) -> dict: + """ + Read index.jsonl and return parsed records grouped by type. 
def convert_index_json_to_jsonl(self, output_dir: Path | None = None) -> bool:
    """
    Convert a legacy index.json into the flat index.jsonl format.

    Reads the existing index.json, writes index.jsonl atomically (temp file +
    rename), then removes index.json.

    Args:
        output_dir: Directory holding the index files; defaults to self.output_dir.

    Returns:
        True if a conversion was performed. False if index.jsonl already exists
        (any leftover index.json is removed), or if index.json is missing,
        unreadable, not valid JSON, or not a JSON object.
    """
    import json

    output_dir = Path(output_dir) if output_dir is not None else Path(self.output_dir)
    json_path = output_dir / CONSTANTS.JSON_INDEX_FILENAME
    jsonl_path = output_dir / CONSTANTS.JSONL_INDEX_FILENAME

    def remove_legacy_index() -> None:
        """Best-effort removal of index.json; a failure must not undo the conversion."""
        try:
            json_path.unlink(missing_ok=True)
        except OSError:
            pass

    def build_result_record(result_data: dict, fallback_plugin: str = "") -> dict:
        """Map one legacy result entry onto an ArchiveResult JSONL record."""
        record = {
            "type": "ArchiveResult",
            "snapshot_id": str(self.id),
            # Support both old 'extractor' and new 'plugin' keys for backwards compat
            "plugin": result_data.get("plugin") or result_data.get("extractor") or fallback_plugin,
            "hook_name": result_data.get("hook_name", ""),
            "status": result_data.get("status") or ArchiveResult.StatusChoices.FAILED,
            "output_str": result_data.get("output_str") or result_data.get("output", ""),
            "output_json": result_data.get("output_json"),
            "output_files": result_data.get("output_files"),
            "output_size": result_data.get("output_size"),
            "output_mimetypes": result_data.get("output_mimetypes", ""),
            "start_ts": result_data.get("start_ts"),
            "end_ts": result_data.get("end_ts"),
        }
        # cmd/pwd are only carried over when the legacy entry actually has them.
        for optional_key in ("cmd", "pwd"):
            if result_data.get(optional_key):
                record[optional_key] = result_data[optional_key]
        return record

    # Skip if already converted
    if jsonl_path.exists():
        remove_legacy_index()
        return False

    # A missing file raises FileNotFoundError (an OSError); malformed JSON and
    # undecodable bytes both raise ValueError subclasses.
    try:
        with open(json_path, encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError):
        return False
    if not isinstance(data, dict):
        return False

    records = [
        {
            "type": "Snapshot",
            "id": str(self.id),
            "crawl_id": str(self.crawl_id) if self.crawl_id else None,
            "url": data.get("url", self.url),
            "timestamp": data.get("timestamp", self.timestamp),
            "title": data.get("title", self.title or ""),
            "tags": data.get("tags", ""),
            "fs_version": data.get("fs_version", "0.7.0"),
            "bookmarked_at": data.get("bookmarked_at"),
            "created_at": data.get("created_at"),
        },
    ]

    # Handle 0.8.x/0.9.x format (archive_results list)
    archive_results = data.get("archive_results")
    if isinstance(archive_results, list):
        records.extend(build_result_record(entry) for entry in archive_results if isinstance(entry, dict))

    # Handle 0.7.x format (history dict keyed by plugin/extractor name)
    history = data.get("history")
    if isinstance(history, dict):
        for plugin, result_list in history.items():
            if not isinstance(result_list, list):
                continue
            records.extend(build_result_record(entry, plugin) for entry in result_list if isinstance(entry, dict))

    jsonl_path.parent.mkdir(parents=True, exist_ok=True)
    tmp_jsonl_path = jsonl_path.with_name(f".{jsonl_path.name}.tmp")
    with open(tmp_jsonl_path, "w", encoding="utf-8") as f:
        f.write("".join(json.dumps(record) + "\n" for record in records))
    os.replace(tmp_jsonl_path, jsonl_path)

    # Remove old index.json after successful conversion
    remove_legacy_index()

    return True
@classmethod
def _merge_snapshots(cls, snapshots: Sequence["Snapshot"]):
    """
    Merge exact duplicates into the first snapshot of the sequence.

    The first element is kept; for every other element its files that are
    missing from the keeper's directory are copied over, its tags and
    ArchiveResults are moved to the keeper, and the duplicate row is deleted.

    The duplicate's directory is removed only after the database merge has
    committed, so a failed merge never leaves a row whose files are gone.
    """
    import shutil
    from django.db import transaction

    if not snapshots:
        return

    keeper, *duplicates = snapshots
    keeper_dir = Path(keeper.output_dir)

    for dup in duplicates:
        dup_dir = Path(dup.output_dir)
        # Both snapshots can resolve to the same directory; never copy a
        # directory onto itself or delete the keeper's files.
        has_own_dir = dup_dir.exists() and dup_dir != keeper_dir

        # Merge files (existing keeper files win)
        if has_own_dir:
            for dup_file in dup_dir.rglob("*"):
                if not dup_file.is_file():
                    continue
                keeper_file = keeper_dir / dup_file.relative_to(dup_dir)
                if not keeper_file.exists():
                    keeper_file.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(dup_file, keeper_file)

        with transaction.atomic():
            # Merge tags
            keeper.tags.add(*dup.tags.all())

            # Move ArchiveResults
            ArchiveResult.objects.filter(snapshot=dup).update(
                snapshot=keeper,
                modified_at=timezone.now(),
            )

            # Delete
            dup.delete()

        # Best-effort cleanup, same as before: a leftover directory is not fatal.
        if has_own_dir:
            shutil.rmtree(dup_dir, ignore_errors=True)
plugins have succeeded for this snapshot""" + from django.utils.html import format_html + + compact_icons = self.__dict__.get("_icons_compact", False) + cache_key = f"result_icons:{self.pk}:{'compact' if compact_icons else 'full'}:{(self.downloaded_at or self.modified_at or self.created_at or self.bookmarked_at).timestamp()}" + + def calc_icons(): + if compact_icons and self.status == self.StatusChoices.STARTED: + progress_stats = self.__dict__.get("_icons_progress_stats") or self.get_progress_stats() + total = int(progress_stats.get("total") or 0) + succeeded = int(progress_stats.get("succeeded") or 0) + failed = int(progress_stats.get("failed") or 0) + skipped = int(progress_stats.get("skipped") or 0) + noresults = int(progress_stats.get("noresults") or 0) + running = int(progress_stats.get("running") or 0) + completed = succeeded + failed + skipped + noresults + percent = int((completed / total * 100) if total > 0 else 0) + return format_html( + '<div class="snapshot-files-progress" title="{} of {} hooks complete" style="min-width: 96px;">' + '<div style="display: flex; align-items: center; gap: 6px; margin-bottom: 4px;">' + '<span class="snapshot-progress-spinner" style="display: inline-block; width: 12px; height: 12px; border: 2px solid #e2e8f0; border-top-color: #3b82f6; border-radius: 50%; animation: snapshot-spin 0.8s linear infinite;"></span>' + '<span style="font-size: 11px; color: #64748b;">{}/{} hooks</span>' + "</div>" + '<div style="background: #e2e8f0; border-radius: 4px; height: 6px; overflow: hidden;">' + '<div style="background: #3b82f6; width: {}%; height: 100%; transition: width 0.3s;"></div>' + "</div>" + '<div style="font-size: 10px; color: #94a3b8; margin-top: 2px;">' + "โœ“{} โœ—{} โณ{}" + "</div>" + "</div>", + completed, + total, + completed, + total, + percent, + succeeded, + failed, + running, + ) + + precomputed_archive_results = self.__dict__.get("_icons_archive_results") + prefetched_cache = 
self.__dict__.get("_prefetched_objects_cache", {}) + if precomputed_archive_results is not None and compact_icons: + archive_results = {plugin: True for plugin in precomputed_archive_results} + elif "archiveresult_set" in prefetched_cache: + archive_results = { + r.plugin: r + for r in self.archiveresult_set.all() + if r.status == "succeeded" and (compact_icons or r.output_files or r.output_str) + } + else: + # Filter for results that have either output_files or output_str + from django.db.models import Q + + archive_results_qs = self.archiveresult_set.filter(status="succeeded") + if not compact_icons: + archive_results_qs = archive_results_qs.filter(Q(output_files__isnull=False) | ~Q(output_str="")) + archive_results = {r.plugin: r for r in archive_results_qs} + + archive_path = path or self.archive_path + output = "" + output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a>' + + # Get all plugins from hooks system (sorted by numeric prefix) + all_plugins = self.__dict__.get("_icons_plugin_names") + if all_plugins is None and not compact_icons: + all_plugins = [get_plugin_name(e) for e in get_plugins()] + elif all_plugins is None: + all_plugins = [] + ordered_plugins = [plugin for plugin in all_plugins if plugin in archive_results] + ordered_plugins.extend(sorted(set(archive_results) - set(ordered_plugins))) + + for plugin in ordered_plugins: + result = archive_results.get(plugin) + existing = result is True or bool( + result and result.status == "succeeded" and (compact_icons or result.output_files or result.output_str), + ) + if not existing: + continue + icon = mark_safe(get_plugin_icon(plugin)) + + # Skip plugins with empty icons that have no output + # (e.g., staticfile only shows when there's actual output) + if not icon.strip(): + continue + + embed_path = f"{plugin}/" if compact_icons else result.embed_path() + output += format_html( + output_template, + archive_path, + embed_path, + str(bool(existing)), + plugin, + icon, + ) + + return 
format_html( + '<span class="files-icons" style="font-size: 1em; opacity: 0.8; display: inline-grid; grid-auto-flow: column; grid-auto-columns: auto; grid-template-rows: repeat(4, auto); gap: 0 0; justify-content: start; align-content: start;">{}</span>', + mark_safe(output), + ) + + if compact_icons and self.status == self.StatusChoices.STARTED: + return calc_icons() + + cache_result = cache.get(cache_key) + if cache_result: + return cache_result + + fresh_result = calc_icons() + cache.set(cache_key, fresh_result, timeout=60 * 60 * 24) + return fresh_result + + @property + def api_url(self) -> str: + return str(reverse_lazy("api-1:get_snapshot", args=[self.id])) + + def get_absolute_url(self): + return f"/{self.archive_path}" + + @cached_property + def domain(self) -> str: + return url_domain(self.url) + + @property + def title_stripped(self) -> str: + return (self.title or "").strip() + + @staticmethod + def _normalize_title_candidate(candidate: str | None, *, snapshot_url: str) -> str: + title = " ".join(line.strip() for line in str(candidate or "").splitlines() if line.strip()).strip() + if not title: + return "" + if title.lower() in {"pending...", "no title found", "unable to detect page title"}: + return "" + if title == snapshot_url: + return "" + if title.startswith(("http://", "https://")): + return "" + if "/" in title and title.lower().endswith(".txt"): + return "" + return title + + @property + def resolved_title(self) -> str: + stored_title = self._normalize_title_candidate(self.title, snapshot_url=self.url) + if stored_title: + return stored_title + + title_results = ( + self.archiveresult_set.filter( + plugin="title", + status=ArchiveResult.StatusChoices.SUCCEEDED, + ) + .exclude(output_str="") + .order_by("-start_ts", "-end_ts", "-created_at") + ) + for title_result in title_results.only("output_str"): + result_title = self._normalize_title_candidate(title_result.output_str, snapshot_url=self.url) + if result_title: + return result_title + + 
@cached_property
def hashes_index(self) -> dict[str, dict[str, Any]]:
    """
    Load hashes/hashes.json from the output directory as a path-keyed index.

    Two on-disk layouts are understood:
      - {"files": [{"path": ..., "size": ..., "hash": ...}, ...]}
      - {"<path>": {"size": ..., "mime_type": ..., "hash": ...}, ...}

    Returns:
        Mapping of path (trailing "/" removed) to a dict with keys
        'size', 'is_dir' and 'hash'. Empty when the file is missing,
        unreadable, not valid JSON, or not a JSON object.
    """
    hashes_path = self.output_dir / "hashes" / "hashes.json"
    try:
        data = json.loads(hashes_path.read_text(encoding="utf-8"))
    except (OSError, ValueError, RecursionError):
        # Missing/unreadable file, bad encoding, malformed or absurdly nested JSON.
        return {}
    if not isinstance(data, dict):
        return {}

    def entry_size(entry: dict[str, Any]) -> Any:
        # Accept the same size keys in both layouts.
        return entry.get("size") or entry.get("num_bytes") or entry.get("bytes") or 0

    def entry_hash(entry: dict[str, Any]) -> Any:
        return entry.get("hash") or entry.get("hash_sha256")

    index: dict[str, dict[str, Any]] = {}
    files = data.get("files")
    if isinstance(files, list):
        for entry in files:
            if not isinstance(entry, dict):
                continue
            raw_path = str(entry.get("path") or "").strip()
            path = raw_path.rstrip("/")
            if not path:
                continue
            index[path] = {
                "size": entry_size(entry),
                # A trailing slash marks a directory even without an is_dir flag.
                "is_dir": bool(entry.get("is_dir")) or raw_path.endswith("/"),
                "hash": entry_hash(entry),
            }
        return index

    for raw_key, entry in data.items():
        if not isinstance(entry, dict) or raw_key == ".":
            continue
        raw_path = str(raw_key)
        path = raw_path.rstrip("/")
        if not path:
            continue
        index[path] = {
            "size": entry_size(entry),
            "is_dir": bool(entry.get("mime_type") == "inode/directory" or raw_path.endswith("/")),
            "hash": entry_hash(entry),
        }
    return index
def ensure_crawl_symlink(self, *, crawl_dir: Path | None = None, snapshot_dir: Path | None = None) -> None:
    """
    Ensure snapshot is symlinked under its crawl output directory.

    The link lives at <crawl_dir>/<SNAPSHOTS_DIR_NAME>/<domain>/<snapshot id>
    and points (relatively) at the snapshot directory.

    Args:
        crawl_dir: Crawl output directory; looked up from self.crawl when omitted.
        snapshot_dir: Link target; defaults to self.output_dir.

    Does nothing when the snapshot has no crawl, when a real file/directory
    already occupies the link path, or when the link cannot be created.
    """
    import os

    if crawl_dir is None:
        if not self.crawl_id:
            return
        try:
            crawl = self.crawl
        except ObjectDoesNotExist:
            crawl = None
        if crawl is None:
            crawl = Crawl.objects.filter(id=self.crawl_id).select_related("created_by").first()
        if not crawl:
            return
        crawl_dir = Path(crawl.output_dir)

    domain = self.extract_domain_from_url(self.url)

    link_path = Path(crawl_dir) / CONSTANTS.SNAPSHOTS_DIR_NAME / domain / str(self.id)
    link_parent = link_path.parent
    link_parent.mkdir(parents=True, exist_ok=True)

    target = Path(snapshot_dir) if snapshot_dir is not None else Path(self.output_dir)
    if link_path.is_symlink():
        # Same guard as ensure_legacy_archive_symlink: a stale or looping link
        # must be replaced, not crash the caller.
        try:
            if link_path.resolve() == target.resolve():
                return
        except (OSError, RuntimeError):
            pass
        link_path.unlink(missing_ok=True)
    elif link_path.exists():
        # A real file or directory is in the way; leave it alone.
        return

    try:
        rel_target = os.path.relpath(target, link_parent)
        link_path.symlink_to(rel_target, target_is_directory=True)
    except (OSError, ValueError):
        # ValueError: relpath cannot relate paths on different drives (Windows).
        return
def save_tags(self, tags: Iterable[str] = ()) -> None:
    """
    Replace this snapshot's tags with the given tag names.

    Names are stripped of surrounding whitespace, blank names are dropped and
    repeated names are collapsed, so " news " and "news" map to the same Tag.
    Missing Tag rows are created.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order.
    names = list(dict.fromkeys(name for name in (tag.strip() for tag in tags) if name))
    tag_ids = [Tag.objects.get_or_create(name=name)[0].pk for name in names]
    self.tags.clear()
    self.tags.add(*tag_ids)
def to_json(self) -> dict:
    """
    Convert Snapshot model instance to a JSON-serializable dict.

    Includes all fields needed to fully reconstruct/identify this snapshot.
    'archive_size' and 'output_size' carry the same value (self.archive_size).
    'crawl_id' is None, not the string "None", when the snapshot has no crawl.
    """
    from archivebox.config import VERSION

    archive_size = self.archive_size

    return {
        "type": "Snapshot",
        "schema_version": VERSION,
        "id": str(self.id),
        # str(None) would serialize as the truthy string "None" and be read
        # back as a real crawl id.
        "crawl_id": str(self.crawl_id) if self.crawl_id else None,
        "url": self.url,
        "title": self.title,
        "tags": self.tags_str(),
        "bookmarked_at": self.bookmarked_at.isoformat() if self.bookmarked_at else None,
        "created_at": self.created_at.isoformat() if self.created_at else None,
        "timestamp": self.timestamp,
        "depth": self.depth,
        "status": self.status,
        "fs_version": self.fs_version,
        "archive_size": archive_size,
        "output_size": archive_size,
    }
field_name in ("bookmarked_at", "retry_at", "created_at", "modified_at"): + if value and isinstance(value, str): + value = parse_date(value) + + # Update field if value is provided and different + if value is not None and getattr(snapshot, field_name) != value: + setattr(snapshot, field_name, value) + update_fields.append(field_name) + + if update_fields: + snapshot.save(update_fields=update_fields + ["modified_at"]) + + return snapshot + except Snapshot.DoesNotExist: + # ID not found, fall through to create-by-URL logic + pass + + from archivebox.misc.util import fix_url_from_markdown, sanitize_extracted_url + + url = sanitize_extracted_url(fix_url_from_markdown(str(record.get("url") or "").strip())) + if not url: + return None + + # Determine or create crawl (every snapshot must have a crawl) + crawl = overrides.get("crawl") + parent_snapshot = overrides.get("snapshot") # Parent snapshot + created_by_id = overrides.get("created_by_id") or ( + parent_snapshot.created_by.pk if parent_snapshot else get_or_create_system_user_pk() + ) + + import sys + + record_crawl_id = record.get("crawl_id") + if record_crawl_id and crawl and str(crawl.id) != str(record_crawl_id): + rprint( + f"[yellow]โš ๏ธ Snapshot.from_json crawl mismatch: record has crawl_id={record_crawl_id}, overrides has crawl={crawl.id}[/yellow]", + file=sys.stderr, + ) + + # If no crawl provided, inherit from parent or auto-create one + if not crawl: + if parent_snapshot: + # Inherit crawl from parent snapshot + crawl = parent_snapshot.crawl + else: + # Auto-create a single-URL crawl + from archivebox.crawls.models import Crawl + from archivebox.config import CONSTANTS + + timestamp_str = timezone.now().strftime("%Y-%m-%d__%H-%M-%S") + sources_file = CONSTANTS.SOURCES_DIR / f"{timestamp_str}__auto_crawl.txt" + sources_file.parent.mkdir(parents=True, exist_ok=True) + sources_file.write_text(url) + + crawl = Crawl.objects.create( + urls=url, + max_depth=0, + label=f"auto-created for {url[:50]}", + 
created_by_id=created_by_id, + ) + rprint(f"[red]โš ๏ธ Snapshot.from_json auto-created new crawl {crawl.id} for url={url}[/red]", file=sys.stderr) + + # Parser hooks emit child Snapshot records through this generic + # dispatcher after crawling an internal import root. Those child rows + # must inherit crawl-level tags just like direct Crawl.urls snapshots; + # otherwise `archivebox add --tag ... < bookmarks.html` loses the tag + # unless the parser format also happened to provide its own tags. + tags_raw = record.get("tags", "") + tag_list = list(crawl.current_tag_names()) if crawl else [] + if isinstance(tags_raw, list): + tag_list.extend(tag.strip() for tag in tags_raw if tag.strip()) + elif tags_raw: + tag_list.extend(tag.strip() for tag in re.split(config.TAG_SEPARATOR_PATTERN, tags_raw) if tag.strip()) + tag_list = list(dict.fromkeys(tag_list)) + + # Check for existing snapshot with same URL in same crawl + # (URLs can exist in multiple crawls, but should be unique within a crawl) + snapshot = Snapshot.objects.filter(url=url, crawl=crawl).order_by("-created_at").first() + + title = record.get("title") + timestamp = record.get("timestamp") + timestamp_for_bookmark = Snapshot._select_best_timestamp(index_timestamp=timestamp, folder_name="") + try: + bookmarked_at = parse_date(record.get("bookmarked_at") or timestamp_for_bookmark) + except (TypeError, ValueError, OSError): + bookmarked_at = None + try: + created_at = parse_date(record.get("created_at")) + except (TypeError, ValueError, OSError): + created_at = None + + if snapshot: + # Update existing snapshot + if title and (not snapshot.title or len(title) > len(snapshot.title or "")): + snapshot.title = title + snapshot.save(update_fields=["title", "modified_at"]) + else: + # Create new snapshot + if timestamp: + while Snapshot.objects.filter(timestamp=timestamp).exists(): + timestamp = str(float(timestamp) + 1.0) + + create_kwargs = { + "url": url, + "timestamp": timestamp, + "title": title, + "crawl": 
crawl, + } + if bookmarked_at: + create_kwargs["bookmarked_at"] = bookmarked_at + if created_at: + create_kwargs["created_at"] = created_at + snapshot = Snapshot.objects.create(**create_kwargs) + + # Update tags + if tag_list: + existing_tags = set(snapshot.tags.values_list("name", flat=True)) + new_tags = set(tag_list) | existing_tags + snapshot.save_tags(new_tags) + + # Queue for extraction and update additional fields + update_fields = [] + + if queue_for_extraction: + if snapshot.status != Snapshot.StatusChoices.PAUSED: + snapshot.status = Snapshot.StatusChoices.QUEUED + update_fields.append("status") + snapshot.retry_at = timezone.now() + update_fields.append("retry_at") + + # Update additional fields if provided + for field_name in ("depth", "parent_snapshot_id", "crawl_id", "bookmarked_at", "created_at", "downloaded_at"): + value = record.get(field_name) + if field_name in ("bookmarked_at", "created_at", "downloaded_at") and value and isinstance(value, str): + value = parse_date(value) + if value is not None and getattr(snapshot, field_name) != value: + setattr(snapshot, field_name, value) + update_fields.append(field_name) + + if update_fields: + snapshot.save(update_fields=update_fields + ["modified_at"]) + + snapshot.ensure_crawl_symlink() + + return snapshot + + def create_pending_archiveresults(self, hooks: Iterable[tuple[str, str]] | None = None) -> list["ArchiveResult"]: + """ + Create ArchiveResult records for all enabled hooks. + + Uses the hooks system to discover available hooks from: + - abx_plugins/plugins/*/on_Snapshot__*.{py,sh,js} + - data/custom_plugins/*/on_Snapshot__*.{py,sh,js} + + Creates one ArchiveResult per hook (not per plugin), with hook_name set. + This enables step-based execution where all hooks in a step can run in parallel. + """ + try: + self.validate_url_for_archiving() + except ValidationError as err: + rprint(f"[yellow][!] Skipping blocked snapshot URL: {(self.url or '')[:120]}... 
({err})[/yellow]") + return [] + + if hooks is None: + from archivebox.plugins.hooks import discover_hooks + from archivebox.config.common import get_config + + # Compatibility path for direct model callers. The runner passes its + # abx-dl hook inventory explicitly so queued rows match execution. + config = get_config(crawl=self.crawl, snapshot=self) + hooks = ((hook_path.parent.name, hook_path.stem) for hook_path in discover_hooks("Snapshot", config=config)) + archiveresults = [] + + for plugin, hook_name in hooks: + # ArchiveResult output is one filesystem directory per plugin hook, so + # retries must update this row in place instead of creating siblings. + archiveresult, _created = ArchiveResult.objects.get_or_create( + snapshot=self, + plugin=plugin, + hook_name=hook_name, + defaults={ + "status": ArchiveResult.INITIAL_STATE, + }, + ) + if archiveresult.status == ArchiveResult.INITIAL_STATE: + archiveresults.append(archiveresult) + + return archiveresults + + def is_finished_processing(self) -> bool: + """ + Check if all ArchiveResults are finished. + + Note: This is only called for observability/progress tracking. + The shared runner owns execution and does not poll this. + """ + # Check if any ARs are still pending/started + pending = self.archiveresult_set.exclude( + status__in=ArchiveResult.FINAL_STATES, + ).exists() + + return not pending + + def get_progress_stats(self) -> dict: + """ + Get progress statistics for this snapshot's archiving process. 
+ + Returns dict with: + - total: Total number of archive results + - succeeded: Number of succeeded results + - failed: Number of failed results + - running: Number of currently running results + - pending: Number of pending/queued results + - percent: Completion percentage (0-100) + - output_size: Total output size in bytes + - is_sealed: Whether the snapshot is in a final state + """ + from django.db.models import Sum + + results = self.archiveresult_set.all() + + counts = ArchiveResult.status_counts( + results, + ( + ArchiveResult.StatusChoices.SUCCEEDED, + ArchiveResult.StatusChoices.FAILED, + ArchiveResult.StatusChoices.STARTED, + ArchiveResult.StatusChoices.SKIPPED, + ArchiveResult.StatusChoices.NORESULTS, + ), + ) + succeeded = counts.get(ArchiveResult.StatusChoices.SUCCEEDED, 0) + failed = counts.get(ArchiveResult.StatusChoices.FAILED, 0) + running = counts.get(ArchiveResult.StatusChoices.STARTED, 0) + skipped = counts.get(ArchiveResult.StatusChoices.SKIPPED, 0) + noresults = counts.get(ArchiveResult.StatusChoices.NORESULTS, 0) + total = results.count() + pending = total - succeeded - failed - running - skipped - noresults + + # Calculate percentage (succeeded + failed + skipped + noresults as completed) + completed = succeeded + failed + skipped + noresults + percent = int((completed / total * 100) if total > 0 else 0) + + # Sum output sizes + output_size = results.aggregate(total_size=Sum("output_size"))["total_size"] or 0 + + # Check if sealed + is_sealed = self.status not in (self.StatusChoices.QUEUED, self.StatusChoices.STARTED) + + return { + "total": total, + "succeeded": succeeded, + "failed": failed, + "running": running, + "pending": pending, + "skipped": skipped, + "noresults": noresults, + "percent": percent, + "output_size": output_size, + "is_sealed": is_sealed, + } + + def retry_failed_archiveresults(self) -> int: + """ + Reset failed/skipped ArchiveResults to queued for retry. + + Returns count of ArchiveResults reset. 
+ """ + retryable_results = ArchiveResult.objects.filter( + snapshot=self, + status__in=[ + ArchiveResult.StatusChoices.FAILED, + ArchiveResult.StatusChoices.SKIPPED, + ArchiveResult.StatusChoices.NORESULTS, + ], + ) + legacy_result_count = retryable_results.filter(hook_name="").count() + now = timezone.now() + count = retryable_results.exclude(hook_name="").update( + status=ArchiveResult.StatusChoices.QUEUED, + output_str="", + output_json=None, + output_files={}, + output_size=0, + output_mimetypes="", + start_ts=None, + end_ts=None, + modified_at=now, + ) + + if count + legacy_result_count > 0: + self.refresh_from_db(fields=["modified_at", "retry_at", "status"]) + self.queue_for_extraction(when=now) + + return count + legacy_result_count + + # ========================================================================= + # URL Helper Properties (migrated from Link schema) + # ========================================================================= + + @cached_property + def url_hash(self) -> str: + from hashlib import sha256 + + return sha256(self.url.encode()).hexdigest()[:8] + + @cached_property + def scheme(self) -> str: + return self.url.split("://")[0] + + @cached_property + def path(self) -> str: + parts = self.url.split("://", 1) + return "/" + parts[1].split("/", 1)[1] if len(parts) > 1 and "/" in parts[1] else "/" + + @cached_property + def basename(self) -> str: + return self.path.split("/")[-1] + + @cached_property + def extension(self) -> str: + basename = self.basename + return basename.split(".")[-1] if "." 
in basename else "" + + @cached_property + def base_url(self) -> str: + return f"{self.scheme}://{self.domain}" + + @cached_property + def is_static(self) -> bool: + static_extensions = {".pdf", ".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg", ".mp4", ".mp3", ".wav", ".webm"} + return any(self.url.lower().endswith(ext) for ext in static_extensions) + + @cached_property + def is_archived(self) -> bool: + cached_is_archived = self.__dict__.get("_is_archived_cached") + if cached_is_archived is not None: + return bool(cached_is_archived) + + if self.downloaded_at or self.status == self.StatusChoices.SEALED: + return True + + output_paths = ( + self.domain, + "output.html", + "output.pdf", + "screenshot.png", + "singlefile.html", + "readability/content.html", + "mercury/content.html", + "htmltotext.txt", + "media", + "git", + ) + output_dir = Path(self.output_dir) + return any((output_dir / path).exists() for path in output_paths) + + # ========================================================================= + # Date/Time Properties (migrated from Link schema) + # ========================================================================= + + @cached_property + def bookmarked_date(self) -> str | None: + if self.bookmarked_at: + return self._ts_to_date_str(self.bookmarked_at) + if self.timestamp: + return str(self.timestamp) + return None + + @cached_property + def downloaded_datestr(self) -> str | None: + return self._ts_to_date_str(self.downloaded_at) if self.downloaded_at else None + + @cached_property + def archive_dates(self) -> list[datetime]: + return [result.start_ts for result in self.archiveresult_set.all() if result.start_ts] + + @cached_property + def oldest_archive_date(self) -> datetime | None: + dates = self.archive_dates + return min(dates) if dates else None + + @cached_property + def newest_archive_date(self) -> datetime | None: + dates = self.archive_dates + return max(dates) if dates else None + + @cached_property + def num_outputs(self) -> int: + if 
"num_outputs_cached" in self.__dict__: + return int(self.__dict__["num_outputs_cached"] or 0) + + prefetched_cache = self.__dict__.get("_prefetched_objects_cache", {}) + if "archiveresult_set" in prefetched_cache: + return sum(1 for result in self.archiveresult_set.all() if result.status == "succeeded") + + return self.archiveresult_set.filter(status="succeeded").count() + + @cached_property + def num_failures(self) -> int: + if "num_failures_cached" in self.__dict__: + return int(self.__dict__["num_failures_cached"] or 0) + + prefetched_cache = self.__dict__.get("_prefetched_objects_cache", {}) + if "archiveresult_set" in prefetched_cache: + return sum(1 for result in self.archiveresult_set.all() if result.status == "failed") + + return self.archiveresult_set.filter(status="failed").count() + + # ========================================================================= + # Output Path Methods (migrated from Link schema) + # ========================================================================= + + def latest_outputs(self, status: str | None = None) -> dict[str, Any]: + """Get the latest output that each plugin produced""" + from archivebox.plugins.discovery import get_plugins + from django.db.models import Q + + latest: dict[str, Any] = {} + for plugin in get_plugins(): + results = self.archiveresult_set.filter(plugin=plugin) + if status is not None: + results = results.filter(status=status) + # Filter for results with output_files or output_str + results = results.filter(Q(output_files__isnull=False) | ~Q(output_str="")).order_by("-start_ts") + result = results.first() + # Return embed_path() for backwards compatibility + latest[plugin] = result.embed_path() if result else None + return latest + + def discover_outputs(self, include_filesystem_fallback: bool = True) -> list[dict]: + """Discover output files from ArchiveResults and filesystem.""" + from archivebox.misc.util import ts_to_date_str + + ArchiveResult = self.archiveresult_set.model + snap_dir = 
Path(self.output_dir) + outputs: list[dict] = [] + seen: set[str] = set() + + text_exts = (".json", ".jsonl", ".txt", ".csv", ".tsv", ".xml", ".yml", ".yaml", ".md", ".log") + + def is_metadata_path(path: str | None) -> bool: + lower = (path or "").lower() + return lower.endswith(text_exts) + + def is_compact_path(path: str | None) -> bool: + lower = (path or "").lower() + return lower.endswith(text_exts) + + hashes_index = self.hashes_index if include_filesystem_fallback else {} + for result in self.archiveresult_set.all().order_by("start_ts"): + output_file_map = result.output_file_map() + embed_path = result.embed_path_db(output_file_map=output_file_map) + if not embed_path and include_filesystem_fallback: + embed_path = result.embed_path() + if not embed_path or embed_path.strip() in (".", "/", "./"): + continue + size = ( + result.output_size + or sum(result._coerce_output_file_size(metadata.get("size")) for metadata in output_file_map.values()) + or hashes_index.get(embed_path, {}).get("size") + or 0 + ) + if not size and include_filesystem_fallback and not hashes_index: + abs_path = snap_dir / embed_path + if not abs_path.exists(): + continue + if abs_path.is_dir(): + if not any(p.is_file() for p in abs_path.rglob("*")): + continue + size = sum(p.stat().st_size for p in abs_path.rglob("*") if p.is_file()) + else: + size = abs_path.stat().st_size + plugin_lower = (result.plugin or "").lower() + if plugin_lower in ("ytdlp", "yt-dlp", "youtube-dl"): + plugin_dir = snap_dir / result.plugin + if plugin_dir.exists(): + try: + size = sum(p.stat().st_size for p in plugin_dir.rglob("*") if p.is_file()) + except OSError: + pass + outputs.append( + { + "name": result.plugin, + "path": embed_path, + "ts": ts_to_date_str(result.end_ts), + "size": size or 0, + "is_metadata": is_metadata_path(embed_path), + "is_compact": is_compact_path(embed_path), + "result": result, + }, + ) + seen.add(result.plugin) + + if hashes_index: + grouped_hash_outputs: dict[str, dict[str, 
dict[str, Any]]] = {} + ignored_roots = {"index.html", "index.json", "index.jsonl", "favicon.ico", "warc", "hashes"} + for rel_path, meta in hashes_index.items(): + parts = Path(rel_path).parts + if len(parts) < 2: + continue + root = parts[0] + if root.startswith(".") or root in seen or root in ignored_roots: + continue + child_path = str(Path(*parts[1:])) + grouped_hash_outputs.setdefault(root, {})[child_path] = meta + + fallback_ts = ts_to_date_str(self.downloaded_at or self.created_at) + for root, root_entries in grouped_hash_outputs.items(): + fallback_path = ArchiveResult._fallback_output_file_path(list(root_entries.keys()), root, root_entries) + if not fallback_path: + continue + fallback_meta = root_entries.get(fallback_path, {}) + outputs.append( + { + "name": root, + "path": f"{root}/{fallback_path}", + "ts": fallback_ts, + "size": int(fallback_meta.get("size") or 0), + "is_metadata": is_metadata_path(fallback_path), + "is_compact": is_compact_path(fallback_path), + "result": None, + }, + ) + seen.add(root) + + if not include_filesystem_fallback or hashes_index: + return outputs + if not snap_dir.is_dir(): + return outputs + + embeddable_exts = { + "html", + "htm", + "mhtml", + "mht", + "pdf", + "txt", + "md", + "json", + "jsonl", + "csv", + "tsv", + "png", + "jpg", + "jpeg", + "gif", + "webp", + "svg", + "ico", + "mp4", + "webm", + "mp3", + "opus", + "ogg", + "wav", + } + + for entry in snap_dir.iterdir(): + if entry.name in ("index.html", "index.json", "favicon.ico", "warc"): + continue + if entry.is_dir(): + plugin = entry.name + if plugin in seen: + continue + best_file = ArchiveResult._find_best_output_file(entry, plugin) + if not best_file: + continue + best_file_stat = best_file.stat() + rel_path = str(best_file.relative_to(snap_dir)) + outputs.append( + { + "name": plugin, + "path": rel_path, + "ts": ts_to_date_str(best_file_stat.st_mtime or 0), + "size": best_file_stat.st_size or 0, + "is_metadata": is_metadata_path(rel_path), + "is_compact": 
is_compact_path(rel_path), + "result": None, + }, + ) + seen.add(plugin) + elif entry.is_file(): + ext = entry.suffix.lstrip(".").lower() + if ext not in embeddable_exts: + continue + plugin = entry.stem + if plugin in seen: + continue + entry_stat = entry.stat() + outputs.append( + { + "name": plugin, + "path": entry.name, + "ts": ts_to_date_str(entry_stat.st_mtime or 0), + "size": entry_stat.st_size or 0, + "is_metadata": is_metadata_path(entry.name), + "is_compact": is_compact_path(entry.name), + "result": None, + }, + ) + seen.add(plugin) + + return outputs + + # ========================================================================= + # Serialization Methods + # ========================================================================= + + def to_dict(self, extended: bool = False) -> dict[str, Any]: + """Convert Snapshot to a dictionary (replacement for Link._asdict())""" + from archivebox.core.routes_util import build_snapshot_url + + archive_size = self.archive_size + + result = { + "TYPE": "core.models.Snapshot", + "id": str(self.id), + "crawl_id": str(self.crawl_id), + "url": self.url, + "timestamp": self.timestamp, + "title": self.title, + "tags": sorted(tag.name for tag in self.tags.all()), + "downloaded_at": self.downloaded_at.isoformat() if self.downloaded_at else None, + "bookmarked_at": self.bookmarked_at.isoformat() if self.bookmarked_at else None, + "created_at": self.created_at.isoformat() if self.created_at else None, + "modified_at": self.modified_at.isoformat() if self.modified_at else None, + "retry_at": self.retry_at.isoformat() if self.retry_at else None, + "depth": self.depth, + "status": self.status, + "fs_version": self.fs_version, + # Computed properties + "domain": self.domain, + "scheme": self.scheme, + "base_url": self.base_url, + "path": self.path, + "basename": self.basename, + "extension": self.extension, + "is_static": self.is_static, + "is_archived": self.is_archived, + "archive_path": self.archive_path, + "archive_url": 
build_snapshot_url(str(self.id), "index.html"), + "output_dir": self.output_dir, + "link_dir": self.output_dir, # backwards compatibility alias + "archive_size": archive_size, + "output_size": archive_size, + "bookmarked_date": self.bookmarked_date, + "downloaded_datestr": self.downloaded_datestr, + "num_outputs": self.num_outputs, + "num_failures": self.num_failures, + } + return result + + def to_json_str(self, indent: int = 4) -> str: + """Convert to JSON string (legacy method, use to_json() for dict)""" + return to_json(self.to_dict(extended=True), indent=indent) + + def to_csv(self, cols: list[str] | None = None, separator: str = ",", ljust: int = 0) -> str: + """Convert to CSV string""" + data = self.to_dict() + cols = cols or ["timestamp", "is_archived", "url"] + invalid_cols = [col for col in dict.fromkeys(cols) if col not in data] + if invalid_cols: + supported_cols = ", ".join(sorted(data)) + raise ValueError(f"Invalid CSV field(s): {', '.join(invalid_cols)}\nSupported CSV fields: {supported_cols}") + return separator.join(to_json(data[col], indent=None).ljust(ljust) for col in cols) + + def write_json_details(self, out_dir: Path | str | None = None) -> None: + """Write JSON index file for this snapshot to its output directory""" + output_dir = Path(out_dir) if out_dir is not None else self.output_dir + path = output_dir / CONSTANTS.JSON_INDEX_FILENAME + atomic_write(str(path), self.to_dict(extended=True)) + + def write_html_details(self, out_dir: Path | str | None = None) -> None: + """Write HTML detail page for this snapshot to its output directory""" + from django.template.loader import render_to_string + from archivebox.core.widgets import TagEditorWidget + from archivebox.misc.logging_util import printable_filesize + + output_dir = Path(out_dir) if out_dir is not None else self.output_dir + TITLE_LOADING_MSG = "Not yet archived..." 
+ + preview_priority = [ + "singlefile", + "screenshot", + "wget", + "dom", + "pdf", + "readability", + ] + + outputs = self.discover_outputs(include_filesystem_fallback=True) + loose_items, failed_items = self.get_detail_page_auxiliary_items(outputs) + outputs_by_plugin = {out["name"]: out for out in outputs} + output_size = sum(int(out.get("size") or 0) for out in outputs) + is_archived = bool(outputs or self.downloaded_at or self.status == self.StatusChoices.SEALED) + + best_preview_path = "about:blank" + best_result = {"path": "about:blank", "result": None} + for plugin in preview_priority: + out = outputs_by_plugin.get(plugin) + if out and out.get("path"): + best_preview_path = str(out["path"]) + best_result = out + break + + if best_preview_path == "about:blank" and outputs: + best_preview_path = str(outputs[0].get("path") or "about:blank") + best_result = outputs[0] + tag_widget = TagEditorWidget() + context = { + **self.to_dict(extended=True), + "snapshot": self, + "title": htmlencode(self.resolved_title or (self.base_url if is_archived else TITLE_LOADING_MSG)), + "url_str": htmlencode(urldecode(self.base_url)), + "archive_url": urlencode(f"warc/{self.timestamp}" or (self.domain if is_archived else "")) or "about:blank", + "extension": self.extension or "html", + "tags": self.tags_str() or "untagged", + "size": printable_filesize(output_size) if output_size else "pending", + "status": "archived" if is_archived else "not yet archived", + "status_color": "success" if is_archived else "danger", + "oldest_archive_date": ts_to_date_str(self.oldest_archive_date), + "best_preview_path": best_preview_path, + "best_result": best_result, + "archiveresults": outputs, + "loose_items": loose_items, + "failed_items": failed_items, + "related_snapshots": [], + "related_years": [], + "title_tags": [{"name": tag.name, "style": tag_widget._tag_style(tag.name)} for tag in self.tags.all().order_by("name")], + } + rendered_html = render_to_string("core/snapshot.html", context) 
+ atomic_write(str(output_dir / CONSTANTS.HTML_INDEX_FILENAME), rendered_html) + + # ========================================================================= + # Helper Methods + # ========================================================================= + + def get_detail_page_auxiliary_items( + self, + outputs: list[dict] | None = None, + hidden_card_plugins: set[str] | None = None, + ) -> tuple[list[dict[str, object]], list[dict[str, object]]]: + if outputs is None: + outputs = self.discover_outputs(include_filesystem_fallback=True) + hidden_card_plugins = hidden_card_plugins or set() + accounted_entries: set[str] = set() + for output in outputs: + output_name = str(output.get("name") or "") + if output_name: + accounted_entries.add(output_name) + output_path = str(output.get("path") or "") + if not output_path: + continue + parts = Path(output_path).parts + if parts: + accounted_entries.add(parts[0]) + + ignore_names = {".DS_Store", "index.html", "index.json", "index.jsonl", "favicon.ico"} + loose_items: list[dict[str, object]] = [] + if self.hashes_index: + grouped: dict[str, dict[str, object]] = {} + for rel_path, meta in self.hashes_index.items(): + parts = Path(rel_path).parts + if not parts: + continue + root = parts[0] + if root.startswith(".") or root in ignore_names or root in accounted_entries: + continue + entry = grouped.setdefault( + root, + { + "name": root, + "path": root, + "is_dir": len(parts) > 1 or bool(meta.get("is_dir")), + "size": 0, + }, + ) + entry["is_dir"] = bool(entry.get("is_dir")) or len(parts) > 1 or bool(meta.get("is_dir")) + entry["size"] = int(entry.get("size") or 0) + int(meta.get("size") or 0) + loose_items = sorted(grouped.values(), key=lambda item: str(item["name"]).lower()) + + ArchiveResult = self.archiveresult_set.model + failed_items: list[dict[str, object]] = [] + seen_failed: set[str] = set() + for result in self.archiveresult_set.all().order_by("start_ts"): + if result.status != ArchiveResult.StatusChoices.FAILED: + 
continue + root = str(result.plugin or "").strip() + if not root or root in seen_failed: + continue + seen_failed.add(root) + failed_items.append( + { + "name": f"{get_plugin_name(root)} ({result.status})", + "path": root, + "is_dir": True, + "size": int(result.output_size or 0), + }, + ) + + return loose_items, failed_items + + @staticmethod + def _ts_to_date_str(dt: datetime | None) -> str | None: + return dt.strftime("%Y-%m-%d %H:%M:%S") if dt else None + + +# ============================================================================= +# Snapshot State Machine +# ============================================================================= + + +class SnapshotMachine(BaseStateMachine): + """ + State machine for managing Snapshot lifecycle. + + Hook Lifecycle: + โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” + โ”‚ QUEUED State โ”‚ + โ”‚ โ€ข Waiting for snapshot to be ready โ”‚ + โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + โ†“ tick() when can_start() + โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” + โ”‚ STARTED State โ†’ enter_started() โ”‚ + โ”‚ 1. snapshot.run() โ”‚ + โ”‚ โ€ข discover_hooks('Snapshot') โ†’ finds all plugin hooks โ”‚ + โ”‚ โ€ข create_pending_archiveresults() โ†’ creates ONE โ”‚ + โ”‚ ArchiveResult per hook (NO execution yet) โ”‚ + โ”‚ 2. The shared abx-dl runner executes hooks and the โ”‚ + โ”‚ projector updates ArchiveResult rows from events โ”‚ + โ”‚ 3. 
Advance through steps 0-9 as foreground hooks complete โ”‚ + โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + โ†“ tick() when is_finished() + โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” + โ”‚ SEALED State โ†’ enter_sealed() โ”‚ + โ”‚ โ€ข cleanup() โ†’ kills any background hooks still running โ”‚ + โ”‚ โ€ข Set retry_at=None (no more processing) โ”‚ + โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + + https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams + """ + + model_attr_name = "snapshot" + + # States + queued = State(value=Snapshot.StatusChoices.QUEUED, initial=True) + started = State(value=Snapshot.StatusChoices.STARTED) + paused = State(value=Snapshot.StatusChoices.PAUSED) + sealed = State(value=Snapshot.StatusChoices.SEALED, final=True) + + # Tick Event (polled by workers) + tick = ( + queued.to(sealed, cond="has_finished_archive_results") + | queued.to.itself(unless="can_start") + | queued.to(started, cond="can_start") + | started.to(sealed, cond="is_finished") + | paused.to.itself() + ) + + # Manual event (can also be triggered by last ArchiveResult finishing) + seal = queued.to(sealed) | started.to(sealed) | paused.to(sealed) + pause_requested = queued.to(paused) | started.to(paused) + resume_requested = paused.to(queued) + + snapshot: Snapshot + + def can_start(self) -> bool: + can_start = bool(self.snapshot.url) + return can_start + + def is_finished(self) -> bool: + """Check if all ArchiveResults for this snapshot are finished.""" + return self.snapshot.is_finished_processing() + + def 
has_finished_archive_results(self) -> bool: + """A queued snapshot with only final projected rows was interrupted after hook completion.""" + results = self.snapshot.archiveresult_set.all() + return results.exists() and not results.exclude(status__in=ArchiveResult.FINAL_STATES).exists() + + @queued.enter + def enter_queued(self): + self.snapshot.update_and_requeue( + retry_at=timezone.now(), + status=Snapshot.StatusChoices.QUEUED, + ) + + @paused.enter + def enter_paused(self): + self.snapshot.update_and_requeue( + retry_at=RETRY_AT_MAX, + status=Snapshot.StatusChoices.PAUSED, + ) + + @started.enter + def enter_started(self): + """Just mark as started. The shared runner creates ArchiveResults and runs hooks.""" + owned_retry_at = self.snapshot.retry_at + now = timezone.now() + lease_until = now + timedelta(seconds=ACTIVE_STATE_LEASE_SECONDS) + # The runner owns queued Snapshot startup through retry_at. Creating + # pending ArchiveResult rows immediately before tick() can touch + # Snapshot.modified_at, so using modified_at CAS here would reject the + # legitimate owner. Keep the write to the scheduler columns only. + updated = Snapshot.objects.filter( + pk=self.snapshot.pk, + retry_at=owned_retry_at, + status=Snapshot.StatusChoices.QUEUED, + ).update( + status=Snapshot.StatusChoices.STARTED, + retry_at=lease_until, + modified_at=now, + ) + if updated != 1: + self.snapshot.refresh_from_db() + return + self.snapshot.status = Snapshot.StatusChoices.STARTED + self.snapshot.retry_at = lease_until + self.snapshot.modified_at = now + + @sealed.enter + def enter_sealed(self): + now = timezone.now() + owned_retry_at = self.snapshot.retry_at + # The runner owns this row via retry_at. Commit the final lifecycle + # state before cleanup so late projectors can update metadata without + # tripping a modified_at CAS while the row still looks QUEUED/STARTED. 
+ updated = ( + type(self.snapshot) + .objects.filter( + pk=self.snapshot.pk, + retry_at=owned_retry_at, + status__in=[ + Snapshot.StatusChoices.QUEUED, + Snapshot.StatusChoices.STARTED, + Snapshot.StatusChoices.PAUSED, + ], + ) + .update( + status=Snapshot.StatusChoices.SEALED, + retry_at=None, + modified_at=now, + ) + ) + if updated != 1: + self.snapshot.refresh_from_db() + return + + self.snapshot.status = Snapshot.StatusChoices.SEALED + self.snapshot.retry_at = None + self.snapshot.modified_at = now + + # Clean up background hooks after the final state is visible in DB. + self.snapshot.cleanup() + + # Crawl finalization is handled by the runner/CrawlService cleanup + # phase. Sealing the parent crawl here races recursive discovery: + # Snapshot hooks can write urls.jsonl just before this state transition, + # and the runner still needs to enqueue those child snapshots. + + +class ArchiveResult(ModelWithDeleteAfter, ModelWithOutputDir, ModelWithNotes): + class StatusChoices(models.TextChoices): + QUEUED = "queued", "Queued" + STARTED = "started", "Started" + PAUSED = "paused", "Paused" + BACKOFF = "backoff", "Waiting to retry" + SUCCEEDED = "succeeded", "Succeeded" + FAILED = "failed", "Failed" + SKIPPED = "skipped", "Skipped" + NORESULTS = "noresults", "No Results" + + INITIAL_STATE = StatusChoices.QUEUED + ACTIVE_STATE = StatusChoices.STARTED + FINAL_STATES = ( + StatusChoices.SUCCEEDED, + StatusChoices.FAILED, + StatusChoices.SKIPPED, + StatusChoices.NORESULTS, + ) + FINAL_OR_ACTIVE_STATES = (*FINAL_STATES, ACTIVE_STATE) + delete_after_final_statuses = FINAL_STATES + + @classmethod + def normalize_status(cls, status: str | None) -> str: + return { + "success": cls.StatusChoices.SUCCEEDED, + "succeded": cls.StatusChoices.SUCCEEDED, + "succeeded": cls.StatusChoices.SUCCEEDED, + "failed": cls.StatusChoices.FAILED, + "skipped": cls.StatusChoices.SKIPPED, + "noresults": cls.StatusChoices.NORESULTS, + "queued": cls.StatusChoices.QUEUED, + "started": 
cls.StatusChoices.STARTED, + "paused": cls.StatusChoices.PAUSED, + "backoff": cls.StatusChoices.BACKOFF, + }.get(str(status or "").strip().lower(), cls.StatusChoices.FAILED) + + @staticmethod + def output_files_upload_complete(output_files: dict[str, dict[str, Any]]) -> bool: + if not output_files: + return False + for metadata in output_files.values(): + upload = metadata.get("upload") if isinstance(metadata, dict) else None + if isinstance(upload, dict) and upload.get("chunked") and not upload.get("complete"): + return False + return True + + @classmethod + def get_plugin_choices(cls): + """Get plugin choices from discovered hooks (for forms/admin).""" + plugins = [get_plugin_name(e) for e in get_plugins()] + return tuple((e, e) for e in plugins) + + @classmethod + def snapshot_count_subquery(cls, *, status: str | None = None, outer_ref: str = "pk") -> QuerySet: + """Return a scalar subquery counting ArchiveResults for one outer Snapshot. + + Use this instead of filtered join aggregates for per-row Snapshot counts: + the scalar form lets SQLite probe the covering ``(snapshot_id, status)`` + or ``(status, snapshot_id)`` indexes once per visible Snapshot row, + instead of joining and grouping the whole candidate Snapshot queryset. 
+ """ + qs = cls.objects.filter(snapshot_id=models.OuterRef(outer_ref)) + if status is not None: + qs = qs.filter(status=status) + return qs.order_by().values("snapshot_id").annotate(count=models.Count("*")).values("count") + + @classmethod + def snapshot_half_count_subquery(cls, *, outer_ref: str = "snapshot_id") -> QuerySet: + return ( + cls.objects.filter(snapshot_id=models.OuterRef(outer_ref)) + .order_by() + .values("snapshot_id") + .annotate(half=models.Count("*") / models.Value(2)) + .values("half") + ) + + @classmethod + def snapshot_count_expr(cls, *, status: str | None = None, outer_ref: str = "pk"): + return Coalesce( + models.Subquery(cls.snapshot_count_subquery(status=status, outer_ref=outer_ref), output_field=models.IntegerField()), + models.Value(0), + ) + + @classmethod + def status_counts(cls, queryset: QuerySet | None = None, statuses: Iterable[str] | None = None) -> dict[str, int]: + """Count requested statuses with separate indexed COUNT probes.""" + qs = queryset if queryset is not None else cls.objects.all() + return {status: qs.filter(status=status).count() for status in (statuses or cls.StatusChoices.values)} + + @classmethod + def snapshot_ids_with_majority_status(cls, status: str | Iterable[str]) -> QuerySet: + """Return Snapshot IDs where more than half of ArchiveResults have ``status``. + + Start from ArchiveResult.status for every majority-status filter. The + ``(status, snapshot_id)`` index keeps the plan predictable even when a + user's collection has an unusual status distribution. 
+ """ + statuses = tuple(status) if not isinstance(status, str) else (status,) + total_half = UngroupedSubquery(cls.snapshot_half_count_subquery(outer_ref="snapshot_id"), output_field=models.IntegerField()) + return ( + cls.objects.filter(status__in=statuses) + .order_by() + .values("snapshot_id") + .annotate( + matching_results=models.Count("*"), + total_half=total_half, + ) + .filter(matching_results__gt=models.F("total_half")) + .values("snapshot_id") + ) + + @classmethod + def cached_snapshot_ids_with_majority_status(cls, status: str | Iterable[str], *, timeout: int = 60) -> tuple[str, ...]: + statuses = tuple(status) if not isinstance(status, str) else (status,) + cache_key = f"archivebox:archiveresult:majority_status:{':'.join(sorted(statuses))}" + cached_ids = cache.get(cache_key) + if cached_ids is not None: + return tuple(cached_ids) + + snapshot_ids = tuple( + str(snapshot_id) for snapshot_id in cls.snapshot_ids_with_majority_status(statuses).values_list("snapshot_id", flat=True) + ) + cache.set(cache_key, snapshot_ids, timeout=timeout) + return snapshot_ids + + @classmethod + def clear_majority_status_cache(cls) -> None: + cache.delete_many( + [ + *(f"archivebox:archiveresult:majority_status:{status}" for status in cls.StatusChoices.values), + f"archivebox:archiveresult:majority_status:{':'.join(sorted((cls.StatusChoices.BACKOFF, cls.StatusChoices.QUEUED)))}", + ], + ) + + # UUID primary key (migrated from integer in 0029) + id = CompactUUIDField(primary_key=True, default=uuid7, editable=False, unique=True) + created_at = models.DateTimeField(default=timezone.now, db_index=True) + modified_at = models.DateTimeField(auto_now=True) + + snapshot: Snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) # type: ignore + # No choices= constraint - plugin names come from plugin system and can be any string + plugin = models.CharField(max_length=32, blank=False, null=False, db_index=True, default="") + hook_name = models.CharField( + max_length=255, + 
blank=True, + default="", + db_index=True, + help_text="Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)", + ) + + # Process FK - tracks execution details (cmd, pwd, stdout, stderr, etc.) + # Added POST-v0.9.0, will be added in a separate migration + process = models.OneToOneField( + "machine.Process", + on_delete=models.SET_NULL, + null=True, + blank=True, + related_name="archiveresult", + help_text="Process execution details for this archive result", + ) + + # New output fields (replacing old 'output' field) + output_str = models.TextField(blank=True, default="", help_text="Human-readable output summary") + output_json = models.JSONField(null=True, blank=True, default=None, help_text="Structured metadata (headers, redirects, etc.)") + output_files = models.JSONField(default=dict, help_text="Dict of {relative_path: {metadata}}") + output_size = models.BigIntegerField(default=0, help_text="Total bytes of all output files") + output_mimetypes = models.CharField(max_length=512, blank=True, default="", help_text="CSV of mimetypes sorted by size") + + start_ts = models.DateTimeField(default=None, null=True, blank=True) + end_ts = models.DateTimeField(default=None, null=True, blank=True) + + status = models.CharField(max_length=16, choices=StatusChoices.choices, default=StatusChoices.QUEUED, db_index=True) + retry_at = models.DateTimeField(default=None, null=True, blank=True, db_index=True) + notes = models.TextField(blank=True, null=False, default="") + # output_dir is computed via @property from snapshot.output_dir / plugin + + snapshot_id: uuid.UUID + process_id: uuid.UUID | None + + class Meta( + ModelWithDeleteAfter.Meta, + ModelWithOutputDir.Meta, + ModelWithNotes.Meta, + ): + app_label = "core" + verbose_name = "Archive Result" + verbose_name_plural = "Archive Results Log" + indexes = [ + models.Index(fields=["snapshot", "status"], name="archiveresult_snap_status_idx"), + models.Index(fields=["status", "snapshot"], 
name="archiveresult_status_snap_idx"), + models.Index(fields=["-start_ts", "-id"], name="archiveresult_start_idx"), + ] + constraints = [ + models.UniqueConstraint(fields=["snapshot", "plugin", "hook_name"], name="unique_archiveresult_per_snapshot_hook"), + ] + + def __str__(self): + return f"[{self.id}] {self.snapshot.url[:64]} -> {self.plugin}" + + @staticmethod + def _format_output_line_for_display(line: str) -> str: + raw_line = str(line or "") + stripped = raw_line.strip() + if not stripped or "://" in stripped or not stripped.startswith(("/", "~/")): + return raw_line + try: + data_dir = CONSTANTS.DATA_DIR.expanduser().resolve(strict=False) + rel_path = Path(stripped).expanduser().resolve(strict=False).relative_to(data_dir) + except (OSError, ValueError): + return raw_line + return f"{raw_line[: len(raw_line) - len(raw_line.lstrip())]}./{rel_path}{raw_line[len(raw_line.rstrip()) :]}" + + def output_str_for_display(self) -> str: + return "\n".join(self._format_output_line_for_display(line) for line in str(self.output_str or "").splitlines()) + + def get_delete_after_config_value(self): + snapshot = self.snapshot + from archivebox.config.common import resolve_delete_after_config_value + + return resolve_delete_after_config_value(snapshot.config, snapshot.crawl.config) + + @classmethod + def missing_delete_at_candidates(cls): + return cls.objects.filter(delete_at__isnull=True).filter( + Q(snapshot__config__has_key="DELETE_AFTER") | Q(snapshot__crawl__config__has_key="DELETE_AFTER"), + ) + + @property + def created_by(self): + """Convenience property to access the user who created this archive result via its snapshot's crawl.""" + return self.snapshot.crawl.created_by + + def to_json(self, *, snapshot_output_dir: Path | None = None) -> dict: + """ + Convert ArchiveResult model instance to a JSON-serializable dict. 
+ """ + from archivebox.config import VERSION + + process = self.process_record + pwd = ( + process.pwd + if process and process.pwd + else str((snapshot_output_dir / self.plugin) if snapshot_output_dir is not None else self.output_dir) + ) + cmd = process.cmd if process else [] + cmd_version = process.cmd_version if process else "" + + record = { + "type": "ArchiveResult", + "schema_version": VERSION, + "id": str(self.id), + "snapshot_id": str(self.snapshot_id), + "plugin": self.plugin, + "hook_name": self.hook_name, + "status": self.status, + "output_str": self.output_str, + "start_ts": self.start_ts.isoformat() if self.start_ts else None, + "end_ts": self.end_ts.isoformat() if self.end_ts else None, + } + # Include optional fields if set + if self.output_json: + record["output_json"] = self.output_json + if self.output_files: + record["output_files"] = self.output_files + if self.output_size: + record["output_size"] = self.output_size + if self.output_mimetypes: + record["output_mimetypes"] = self.output_mimetypes + if pwd: + record["pwd"] = pwd + if cmd: + record["cmd"] = cmd + if cmd_version: + record["cmd_version"] = cmd_version + if process: + record["process_id"] = str(process.id) + return record + + @staticmethod + def from_json(record: dict[str, Any], overrides: dict[str, Any] | None = None): + """ + Create/update ArchiveResult from JSON dict. + + Args: + record: JSON dict with 'snapshot_id', 'plugin', etc. + overrides: Optional dict of field overrides + + Returns: + ArchiveResult instance or None + """ + snapshot_id = record.get("snapshot_id") + plugin = record.get("plugin") + + if not snapshot_id or not plugin: + return None + + # Try to get existing by ID first + result_id = record.get("id") + if result_id: + try: + return ArchiveResult.objects.get(id=result_id) + except ArchiveResult.DoesNotExist: + pass + + # Get or create by snapshot_id + plugin + hook_name. 
The filesystem has a + # single output dir for each hook, so retries update that same DB row. + try: + snapshot = Snapshot.objects.get(id=snapshot_id) + + result, _ = ArchiveResult.objects.get_or_create( + snapshot=snapshot, + plugin=plugin, + hook_name=record.get("hook_name", ""), + defaults={ + "status": record.get("status", "queued"), + "output_str": record.get("output_str", ""), + }, + ) + return result + except Snapshot.DoesNotExist: + return None + + def save(self, *args, **kwargs): + is_new = self._state.adding + update_fields = kwargs.get("update_fields") + refresh_snapshot_size = ( + is_new + or update_fields is None + or "output_size" in update_fields + or "snapshot" in update_fields + or "snapshot_id" in update_fields + ) + old_snapshot_id = None + old_output_size = 0 + if refresh_snapshot_size and not is_new: + old_values = type(self).objects.filter(pk=self.pk).values("snapshot_id", "output_size").first() + if old_values: + old_snapshot_id = old_values["snapshot_id"] + old_output_size = int(old_values["output_size"] or 0) + + # ArchiveResult rows are updated on every plugin event. Resolving + # DELETE_AFTER here is deceptively expensive because the effective + # value lives on Snapshot/Crawl config, so a save of an already-loaded + # result can still materialize parent objects and parse config. The + # orchestrator owns the repair pass for these rows instead: it fills + # missing delete_at values from fresh Snapshot/Crawl config when the + # queue is idle, outside the hook-result write hot path. 
+ # Skip ModelWithOutputDir.save() to avoid creating index.json in plugin directories + # Call the Django Model.save() directly instead + models.Model.save(self, *args, **kwargs) + if refresh_snapshot_size: + current_snapshot_id = self.snapshot_id + snapshot_ids = {snapshot_id for snapshot_id in (old_snapshot_id, current_snapshot_id) if snapshot_id} + current_output_size = int(self.output_size or 0) + if len(snapshot_ids) > 1: + # Moving an ArchiveResult between Snapshots is rare and cannot + # be represented as a single delta on one parent row. Keep the + # conservative aggregate fallback for that shape. + transaction.on_commit(lambda: type(self).refresh_snapshot_output_sizes(snapshot_ids)) + elif current_snapshot_id: + # Hook-result projection updates ArchiveResult rows at very + # high frequency during indexing. Re-aggregating every sibling + # row for the parent Snapshot on each save turns those short + # writes into a table-scan hot path. For the common case where + # the result stays attached to the same Snapshot, the persisted + # parent total is exactly the old total plus this row's size + # delta; F() keeps that update atomic with concurrent result + # saves for other plugins on the same Snapshot. 
+ size_delta = current_output_size if is_new else current_output_size - old_output_size + if size_delta: + transaction.on_commit( + lambda: Snapshot.objects.filter(pk=current_snapshot_id).update( + output_size=F("output_size") + size_delta, + modified_at=timezone.now(), + ), + ) + if is_new or update_fields is None or "status" in update_fields or "snapshot" in update_fields or "snapshot_id" in update_fields: + transaction.on_commit(type(self).clear_majority_status_cache) + + # if is_new: + # from archivebox.misc.logging_util import log_worker_event + # log_worker_event( + # worker_type='DB', + # event='Created ArchiveResult', + # indent_level=3, + # plugin=self.plugin, + # metadata={ + # 'id': str(self.id), + # 'snapshot_id': str(self.snapshot_id), + # 'snapshot_url': str(self.snapshot.url)[:64], + # 'status': self.status, + # }, + # ) + + def delete(self, *args, **kwargs): + snapshot_id = self.snapshot_id + deleted = super().delete(*args, **kwargs) + if snapshot_id: + transaction.on_commit(lambda: type(self).refresh_snapshot_output_sizes({snapshot_id})) + transaction.on_commit(type(self).clear_majority_status_cache) + return deleted + + @staticmethod + def refresh_snapshot_output_sizes(snapshot_ids): + for snapshot_id in snapshot_ids: + total_size = ArchiveResult.objects.filter(snapshot_id=snapshot_id).aggregate(total_size=Sum("output_size"))["total_size"] or 0 + Snapshot.objects.filter(pk=snapshot_id).update( + output_size=total_size, + modified_at=timezone.now(), + ) + + @cached_property + def snapshot_dir(self): + return Path(self.snapshot.output_dir) + + @cached_property + def url(self): + return self.snapshot.url + + @property + def api_url(self) -> str: + return str(reverse_lazy("api-1:get_archiveresult", args=[self.id])) + + def get_absolute_url(self): + return f"/{self.snapshot.archive_path}/{self.plugin}" + + def reset_for_retry(self, *, save: bool = True) -> None: + self.status = self.StatusChoices.QUEUED + self.retry_at = None + self.output_str = "" + 
self.output_json = None + self.output_files = {} + self.output_size = 0 + self.output_mimetypes = "" + self.start_ts = None + self.end_ts = None + if save: + self.save( + update_fields=[ + "status", + "retry_at", + "output_str", + "output_json", + "output_files", + "output_size", + "output_mimetypes", + "start_ts", + "end_ts", + "modified_at", + ], + ) + + @property + def is_paused(self) -> bool: + return self.status == self.StatusChoices.PAUSED + + @classmethod + def pause_queryset(cls, queryset) -> int: + return queryset.exclude(status__in=[*cls.FINAL_STATES, cls.StatusChoices.PAUSED]).update( + status=cls.StatusChoices.PAUSED, + retry_at=RETRY_AT_MAX, + modified_at=timezone.now(), + ) + + @classmethod + def resume_queryset(cls, queryset, *, when: datetime | None = None) -> int: + return queryset.filter(status=cls.StatusChoices.PAUSED).update( + status=cls.StatusChoices.QUEUED, + retry_at=when or timezone.now(), + modified_at=timezone.now(), + ) + + def pause(self, *, save: bool = True) -> bool: + if self.status in self.FINAL_STATES: + return False + if self.is_paused: + return False + self.status = self.StatusChoices.PAUSED + self.retry_at = RETRY_AT_MAX + if save: + self.pause_queryset(type(self).objects.filter(pk=self.pk)) + self.refresh_from_db() + return True + + def resume(self, *, when: datetime | None = None, save: bool = True) -> bool: + if not self.is_paused: + return False + self.status = self.StatusChoices.QUEUED + self.retry_at = when or timezone.now() + if save: + self.resume_queryset(type(self).objects.filter(pk=self.pk), when=self.retry_at) + self.refresh_from_db() + return True + + @property + def plugin_module(self) -> Any | None: + # Hook scripts are now used instead of Python plugin modules + # The plugin name maps to hooks in abx_plugins/plugins/{plugin}/ + return None + + @staticmethod + def _normalize_output_files(raw_output_files: Any) -> dict[str, dict[str, Any]]: + def _enrich_metadata(path: str, metadata: dict[str, Any]) -> dict[str, 
Any]: + normalized = dict(metadata) + if "extension" not in normalized: + normalized["extension"] = Path(path).suffix.lower().lstrip(".") + if "mimetype" not in normalized: + from abx_dl.output_files import guess_mimetype + + guessed = guess_mimetype(path) + if guessed: + normalized["mimetype"] = guessed + return normalized + + if raw_output_files is None: + return {} + if isinstance(raw_output_files, str): + try: + raw_output_files = json.loads(raw_output_files) + except json.JSONDecodeError: + return {} + if isinstance(raw_output_files, dict): + normalized: dict[str, dict[str, Any]] = {} + for path, metadata in raw_output_files.items(): + if not path: + continue + metadata_dict = dict(metadata) if isinstance(metadata, dict) else {} + metadata_dict.pop("path", None) + normalized[str(path)] = _enrich_metadata(str(path), metadata_dict) + return normalized + if isinstance(raw_output_files, (list, tuple, set)): + normalized: dict[str, dict[str, Any]] = {} + for item in raw_output_files: + if isinstance(item, str): + normalized[item] = _enrich_metadata(item, {}) + continue + if not isinstance(item, dict): + continue + path = str(item.get("path") or "").strip() + if not path: + continue + normalized[path] = _enrich_metadata( + path, + {key: value for key, value in item.items() if key != "path" and value not in (None, "")}, + ) + return normalized + return {} + + @staticmethod + def _coerce_output_file_size(value: Any) -> int: + try: + return max(int(value or 0), 0) + except (TypeError, ValueError): + return 0 + + def output_file_map(self) -> dict[str, dict[str, Any]]: + return self._normalize_output_files(self.output_files) + + def output_file_paths(self) -> list[str]: + return list(self.output_file_map().keys()) + + def output_file_count(self) -> int: + return len(self.output_file_paths()) + + def output_size_from_files(self) -> int: + return sum(self._coerce_output_file_size(metadata.get("size")) for metadata in self.output_file_map().values()) + + def 
update_output_metadata_from_filesystem(self, snapshot_dir: Path | None = None, save: bool = True) -> bool: + from collections import defaultdict + from abx_dl.output_files import guess_mimetype + + if self.plugin == "title": + return False + + snapshot_dir = Path(snapshot_dir or self.snapshot.output_dir) + exclude_names = {"stdout.log", "stderr.log", "process.pid", "hook.pid", "listener.pid"} + output_files: dict[str, dict[str, Any]] = {} + mime_sizes: dict[str, int] = defaultdict(int) + total_size = 0 + + def add_file(file_path: Path, rel_path: str, *, root_relative: bool = False) -> None: + nonlocal total_size + try: + if not file_path.is_file() or file_path.name in exclude_names: + return + stat = file_path.stat() + except OSError: + return + mime_type = guess_mimetype(file_path) or "application/octet-stream" + metadata = { + "extension": file_path.suffix.lower().lstrip("."), + "mimetype": mime_type, + "size": stat.st_size, + } + if root_relative: + metadata["root_relative"] = True + output_files[rel_path] = metadata + mime_sizes[mime_type] += stat.st_size + total_size += stat.st_size + + for raw_line in str(self.output_str or "").splitlines(): + raw_output = raw_line.strip().lstrip("/") + if not raw_output or raw_output in {".", "./", "/"} or "://" in raw_output or raw_output.startswith("/"): + continue + if not self._looks_like_output_path(raw_output, self.plugin): + continue + + raw_path = Path(raw_output) + if raw_output.startswith(f"{self.plugin}/"): + plugin_relative = raw_output.removeprefix(f"{self.plugin}/") + add_file(snapshot_dir / raw_output, plugin_relative) + elif len(raw_path.parts) == 1: + add_file(snapshot_dir / self.plugin / raw_output, raw_output) + add_file(snapshot_dir / raw_output, raw_output, root_relative=True) + else: + add_file(snapshot_dir / self.plugin / raw_output, raw_output) + add_file(snapshot_dir / raw_output, raw_output, root_relative=True) + + plugin_dir = snapshot_dir / self.plugin + if not output_files and 
plugin_dir.is_dir(): + for file_path in plugin_dir.rglob("*"): + if not file_path.is_file() or ".hooks" in file_path.parts: + continue + add_file(file_path, str(file_path.relative_to(plugin_dir))) + + if not output_files: + return False + + sorted_mimes = sorted(mime_sizes.items(), key=lambda item: item[1], reverse=True) + output_mimetypes = ",".join(mime for mime, _ in sorted_mimes) + if self.output_files == output_files and self.output_size == total_size and self.output_mimetypes == output_mimetypes: + return False + + self.output_files = output_files + self.output_size = total_size + self.output_mimetypes = output_mimetypes + self.modified_at = timezone.now() + if save: + self.save(update_fields=["output_files", "output_size", "output_mimetypes", "modified_at"]) + return True + + def output_exists(self) -> bool: + return os.path.exists(Path(self.snapshot_dir) / self.plugin) + + @staticmethod + def _looks_like_output_path(raw_output: str | None, plugin_name: str | None = None) -> bool: + value = str(raw_output or "").strip() + if value in ("", ".", "./", "/"): + return False + if plugin_name and value.startswith(f"{plugin_name}/"): + return True + if Path(value).is_absolute(): + return True + if Path(value).suffix: + return True + if "/" in value and "\\" not in value and " " not in value: + left, _, right = value.partition("/") + if left and right and all(ch.isalnum() or ch in "+-." 
for ch in left + right): + return False + return False + + def _existing_output_path(self, raw_output: str | None) -> str | None: + value = str(raw_output or "").strip() + if not value: + return None + + output_path = Path(value) + snapshot_dir = Path(self.snapshot_dir).resolve(strict=False) + candidates: list[str] = [] + + if output_path.is_absolute(): + try: + candidates.append(str(output_path.resolve(strict=False).relative_to(snapshot_dir))) + except (OSError, ValueError): + return None + elif value.startswith(f"{self.plugin}/"): + candidates.append(value) + elif len(output_path.parts) == 1: + candidates.append(f"{self.plugin}/{value}") + else: + candidates.append(value) + + output_file_map = self.output_file_map() + hashes_index = self.snapshot.hashes_index + for relative_path in candidates: + if relative_path in hashes_index: + return relative_path + + if relative_path in output_file_map: + return relative_path + + plugin_relative = relative_path.removeprefix(f"{self.plugin}/") + if plugin_relative in output_file_map: + return relative_path + + candidate = snapshot_dir / relative_path + try: + if candidate.is_file(): + return relative_path + except OSError: + continue + + return None + + @staticmethod + def _fallback_output_file_path( + output_file_paths: Sequence[str], + plugin_name: str | None = None, + output_file_map: dict[str, dict[str, Any]] | None = None, + ) -> str | None: + ignored = {"stdout.log", "stderr.log", "hook.pid", "listener.pid"} + candidates = [ + path + for path in output_file_paths + if Path(path).name not in ignored and Path(path).suffix.lower() not in (".pid", ".log", ".sh") + ] + if not candidates: + return None + + output_file_map = output_file_map or {} + preferred_names = [ + "index.html", + "index.htm", + "output.html", + "content.html", + "article.html", + "snapshot.mhtml", + "snapshot.mht", + "output.pdf", + "index.pdf", + "content.txt", + "output.txt", + "index.txt", + "index.md", + "index.json", + "article.json", + ] + for 
preferred_name in preferred_names: + for candidate in candidates: + if Path(candidate).name.lower() == preferred_name: + return candidate + + ext_groups = ( + (".html", ".htm", ".mhtml", ".mht", ".pdf"), + (".png", ".jpg", ".jpeg", ".gif", ".webp", ".svg", ".ico"), + (".json", ".jsonl", ".txt", ".md", ".csv", ".tsv"), + (".mp4", ".webm", ".mp3", ".opus", ".ogg", ".wav"), + ) + for ext_group in ext_groups: + group_candidates = [candidate for candidate in candidates if Path(candidate).suffix.lower() in ext_group] + if group_candidates: + return max( + group_candidates, + key=lambda path: ArchiveResult._coerce_output_file_size(output_file_map.get(path, {}).get("size")), + ) + + return None + + @staticmethod + def _find_best_output_file(dir_path: Path, plugin_name: str | None = None) -> Path | None: + if not dir_path.exists() or not dir_path.is_dir(): + return None + file_map: dict[str, dict[str, Any]] = {} + file_count = 0 + max_scan = 500 + for file_path in dir_path.rglob("*"): + file_count += 1 + if file_count > max_scan: + break + if file_path.is_dir() or file_path.name.startswith("."): + continue + rel_path = str(file_path.relative_to(dir_path)) + try: + size = file_path.stat().st_size + except OSError: + size = 0 + file_map[rel_path] = {"size": size} + + fallback_path = ArchiveResult._fallback_output_file_path(list(file_map.keys()), plugin_name, file_map) + if not fallback_path: + return None + return dir_path / fallback_path + + def embed_path_db(self, output_file_map: dict[str, dict[str, Any]] | None = None) -> str | None: + output_file_map = output_file_map if output_file_map is not None else self.output_file_map() + + def is_root_relative(path: str) -> bool: + metadata = output_file_map.get(path) or {} + return bool(isinstance(metadata, dict) and metadata.get("root_relative")) + + if self.output_str: + raw_output = str(self.output_str).strip() + if self._looks_like_output_path(raw_output, self.plugin): + output_path = Path(raw_output) + if 
output_path.is_absolute(): + return None + + candidates: list[str] = [] + if raw_output.startswith(f"{self.plugin}/"): + candidates.append(raw_output) + elif len(output_path.parts) == 1: + candidates.append(f"{self.plugin}/{raw_output}") + candidates.append(raw_output) + else: + candidates.append(raw_output) + + if not output_file_map: + return self._existing_output_path(raw_output) + + if raw_output in output_file_map and is_root_relative(raw_output): + return raw_output + + for relative_path in candidates: + plugin_relative = relative_path.removeprefix(f"{self.plugin}/") + if relative_path in output_file_map: + return f"{self.plugin}/{relative_path}" if not relative_path.startswith(f"{self.plugin}/") else relative_path + if plugin_relative in output_file_map: + return f"{self.plugin}/{plugin_relative}" + + output_file_paths = list(output_file_map.keys()) + if output_file_paths: + fallback_path = self._fallback_output_file_path(output_file_paths, self.plugin, output_file_map) + if fallback_path: + if is_root_relative(fallback_path): + return fallback_path + return f"{self.plugin}/{fallback_path}" + + return None + + def embed_path(self) -> str | None: + """ + Get the relative path to the embeddable output file for this result. + + This is intentionally DB-backed only so snapshot/admin rendering stays + fast and predictable without filesystem probes. + """ + return self.embed_path_db() + + @property + def output_dir_name(self) -> str: + return self.plugin + + @property + def output_dir_parent(self) -> str: + return str(Path(self.snapshot.output_dir).relative_to(CONSTANTS.DATA_DIR)) + + # Properties that delegate to Process model (for backwards compatibility) + # These properties will replace the direct fields after migration is complete + # They allow existing code to continue using archiveresult.pwd, .cmd, etc. 
+ + # Note: After migration 3 creates Process records and migration 5 removes the old fields, + # these properties provide seamless access to Process data through ArchiveResult + + # Uncommented after migration 3 completed - properties now active + @property + def process_record(self): + if not self.process_id: + return None + try: + return self.process + except ObjectDoesNotExist: + return None + + @property + def pwd(self) -> str: + """Working directory, derived from the snapshot/plugin path if the Process row is gone.""" + process = self.process_record + return process.pwd if process and process.pwd else str(self.output_dir) + + @property + def cmd(self) -> list: + """Command array (from Process).""" + process = self.process_record + return process.cmd if process else [] + + @property + def cmd_version(self) -> str: + """Command version (from Process.binary).""" + process = self.process_record + return process.cmd_version if process else "" + + @property + def binary(self): + """Binary FK (from Process).""" + process = self.process_record + return process.binary if process else None + + @property + def iface(self): + """Network interface FK (from Process).""" + process = self.process_record + return process.iface if process else None + + @property + def machine(self): + """Machine FK (from Process).""" + process = self.process_record + return process.machine if process else None + + @property + def timeout(self) -> int: + """Timeout in seconds (from Process).""" + process = self.process_record + return process.timeout if process else 120 + + def save_search_index(self): + pass + + def update_from_output(self): + """ + Update this ArchiveResult from filesystem logs and output files. + + Used for Snapshot cleanup / orphan recovery when a hook's output exists + on disk but the projector did not finalize the row in the database. 
+ + Updates: + - status, output_str, output_json from ArchiveResult JSONL record + - output_files, output_size, output_mimetypes by walking filesystem + - end_ts, cmd, cmd_version, binary FK + - Processes side-effect records (Snapshot, Tag, etc.) via process_hook_records() + """ + from collections import defaultdict + from pathlib import Path + from django.utils import timezone + from abx_dl.output_files import guess_mimetype + from archivebox.plugins.hooks import process_hook_records, extract_records_from_process + from archivebox.machine.models import Process + + plugin_dir = Path(self.pwd) if self.pwd else None + if not plugin_dir or not plugin_dir.exists(): + self.status = self.StatusChoices.FAILED + self.output_str = "Output directory not found" + self.end_ts = timezone.now() + self.save() + return + + records = [] + process = self.process_record + if process: + records = extract_records_from_process(process) + + if not records: + stdout_file = plugin_dir / "stdout.log" + stdout = stdout_file.read_text(errors="replace") if stdout_file.exists() else "" + records = Process.parse_records_from_text(stdout) + + # Find ArchiveResult record and update status/output from it + ar_records = [r for r in records if r.get("type") == "ArchiveResult"] + if ar_records: + hook_data = ar_records[0] + + # Update status + status_map = { + "succeeded": self.StatusChoices.SUCCEEDED, + "failed": self.StatusChoices.FAILED, + "skipped": self.StatusChoices.SKIPPED, + "noresults": self.StatusChoices.NORESULTS, + } + self.status = status_map.get(hook_data.get("status", "failed"), self.StatusChoices.FAILED) + + # Update output fields + self.output_str = hook_data.get("output_str") or hook_data.get("output") or "" + self.output_json = hook_data.get("output_json") + + # Update cmd fields + if hook_data.get("cmd"): + if process: + process.cmd = hook_data["cmd"] + process.save() + self._set_binary_from_cmd(hook_data["cmd"]) + # Note: cmd_version is derived from binary.version, not stored on 
Process + else: + # No ArchiveResult record: treat background hooks or clean exits as skipped + is_background = False + try: + from archivebox.plugins.hooks import is_background_hook + + is_background = bool(self.hook_name and is_background_hook(self.hook_name)) + except Exception: + pass + + if is_background or (process and process.exit_code == 0): + self.status = self.StatusChoices.SKIPPED + self.output_str = "Hook did not output ArchiveResult record" + else: + self.status = self.StatusChoices.FAILED + self.output_str = "Hook did not output ArchiveResult record" + + # Walk filesystem and populate output_files, output_size, output_mimetypes + exclude_names = {"stdout.log", "stderr.log", "process.pid", "hook.pid", "listener.pid"} + mime_sizes = defaultdict(int) + total_size = 0 + output_files = {} + + for file_path in plugin_dir.rglob("*"): + if not file_path.is_file(): + continue + if ".hooks" in file_path.parts: + continue + if file_path.name in exclude_names: + continue + + try: + stat = file_path.stat() + mime_type = guess_mimetype(file_path) or "application/octet-stream" + + relative_path = str(file_path.relative_to(plugin_dir)) + output_files[relative_path] = { + "extension": file_path.suffix.lower().lstrip("."), + "mimetype": mime_type, + "size": stat.st_size, + } + mime_sizes[mime_type] += stat.st_size + total_size += stat.st_size + except OSError: + continue + + self.output_files = output_files + self.output_size = total_size + sorted_mimes = sorted(mime_sizes.items(), key=lambda x: x[1], reverse=True) + self.output_mimetypes = ",".join(mime for mime, _ in sorted_mimes) + + # Update timestamps + self.end_ts = timezone.now() + + self.save() + + # Process side-effect records (filter Snapshots for depth/URL) + filtered_records = [] + for record in records: + record_type = record.get("type") + + # Skip ArchiveResult records (already processed above) + if record_type == "ArchiveResult": + continue + + # Filter Snapshot records for depth/URL constraints + if 
record_type == "Snapshot": + url = record.get("url") + if not url: + continue + + depth = record.get("depth", self.snapshot.depth + 1) + if depth > self.snapshot.crawl.max_depth: + continue + + if not self._url_passes_filters(url): + continue + + filtered_records.append(record) + + # Process filtered records with unified dispatcher + overrides = { + "snapshot": self.snapshot, + "crawl": self.snapshot.crawl, + "created_by_id": self.created_by.pk, + } + process_hook_records(filtered_records, overrides=overrides) + + # Cleanup PID files (keep logs even if empty so they can be tailed) + pid_file = plugin_dir / "hook.pid" + pid_file.unlink(missing_ok=True) + + def _set_binary_from_cmd(self, cmd: list) -> None: + """ + Find Binary for command and set binary FK. + + Tries matching by absolute path first, then by binary name. + Only matches binaries on the current machine. + """ + if not cmd: + return + + from archivebox.machine.models import Machine + + bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd + machine = Machine.current() + + # Try matching by absolute path first + binary = Binary.objects.filter( + abspath=bin_path_or_name, + machine=machine, + ).first() + + if binary: + process = self.process_record + if process: + process.binary = binary + process.save() + return + + # Fallback: match by binary name + bin_name = Path(bin_path_or_name).name + binary = Binary.objects.filter( + name=bin_name, + machine=machine, + ).first() + + if binary: + process = self.process_record + if process: + process.binary = binary + process.save() + + def _url_passes_filters(self, url: str) -> bool: + """Check if URL passes URL_ALLOWLIST and URL_DENYLIST config filters. + + Uses the centralized config resolver so frozen crawl/snapshot values + and live Machine/Persona execution values apply in their scoped order. 
def is_admin_user(request: HttpRequest) -> bool:
    """Return True when ``request.user`` is authenticated, active, and staff.

    The three flags are checked in that order and evaluation stops at the
    first falsy one.
    """
    account = request.user
    required_flags = ("is_authenticated", "is_active", "is_staff")
    return all(getattr(account, flag) for flag in required_flags)
def filter_snapshots_by_permissions(queryset: QuerySet, *, direct: bool = False, allowed_permissions: set[str] | None = None) -> QuerySet:
    """Narrow a queryset to rows whose ``permissions`` value is allowed.

    Args:
        queryset: Queryset to filter; only its ``filter()`` method is used.
        direct: Selects the default allow-list when none is given:
            public + unlisted when True, public only when False.
        allowed_permissions: Explicit allow-list. ``None`` means "use the
            default for ``direct``". An empty set is honoured as-is and
            therefore matches no rows.

    Returns:
        The result of ``queryset.filter(permissions__in=[...])`` with the
        allowed values in sorted order.
    """
    if allowed_permissions is None:
        # Fall back only when the caller gave no allow-list at all. A truthiness
        # check here would silently widen an explicit empty allow-list to the
        # public defaults, which is the wrong direction for an access filter.
        allowed_permissions = {PERMISSIONS_PUBLIC, PERMISSIONS_UNLISTED} if direct else {PERMISSIONS_PUBLIC}
    return queryset.filter(permissions__in=sorted(allowed_permissions))
_is_signal_interrupted_exit(exit_code: int | None) -> bool: + return exit_code is not None and (exit_code < 0 or exit_code >= 128) + + +def recover_orchestrator_state(*, include_chrome: bool = False) -> dict[str, int]: + from archivebox.crawls.models import Crawl + from archivebox.core.models import ArchiveResult, Snapshot + from archivebox.services.archive_result_service import _collect_output_metadata + from archivebox.machine.models import Process + from django.core.exceptions import ValidationError + from django.db.models import Exists, OuterRef, Q, Subquery, Value + from django.db.models.functions import Coalesce + + now = timezone.now() + recovery_console = Console(stderr=True, highlight=False, soft_wrap=True) + cleaned = { + "processes_stale_running": Process.cleanup_stale_running(), + "processes_orphaned_workers": Process.cleanup_orphaned_workers(), + "chrome_processes_orphaned": Process.cleanup_orphaned_chrome() if include_chrome else 0, + "crawls_queued_without_retry_at": 0, + "snapshots_queued_without_retry_at": 0, + "archiveresults_backoff": 0, + "snapshots_queued_plugin_rows_waiting_on_stale_lease": 0, + "archiveresults_started_without_running_process": 0, + "archiveresults_missing_for_orphaned_hook_processes": 0, + "snapshots_started_without_running_results": 0, + "crawls_started_with_due_snapshots": 0, + "crawls_started_waiting_on_future_snapshots": 0, + "crawls_started_without_active_snapshots": 0, + } + + running_archiveresults = ArchiveResult.objects.filter( + snapshot_id=OuterRef("pk"), + status=ArchiveResult.StatusChoices.STARTED, + process__status=Process.StatusChoices.RUNNING, + ) + active_child_snapshots = Snapshot.objects.filter( + crawl_id=OuterRef("pk"), + status__in=Snapshot.OPEN_STATES, + ) + due_child_snapshots = active_child_snapshots.exclude(status=Snapshot.StatusChoices.PAUSED).filter( + Q(retry_at__isnull=True) | Q(retry_at__lte=now), + ) + next_future_child_retry = Subquery( + 
active_child_snapshots.filter(retry_at__gt=now).order_by("retry_at").values("retry_at")[:1], + ) + + # Broken lock repair: QUEUED rows with retry_at=NULL are invisible to the + # queue. Set only the scheduling field so the runner owns the next tick. + cleaned["crawls_queued_without_retry_at"] = Crawl.objects.filter( + status=Crawl.StatusChoices.QUEUED, + retry_at__isnull=True, + ).update(retry_at=now, modified_at=now) + cleaned["snapshots_queued_without_retry_at"] = Snapshot.objects.filter( + status=Snapshot.StatusChoices.QUEUED, + retry_at__isnull=True, + crawl__status__in=Crawl.RUNNABLE_STATES, + ).update(retry_at=now, modified_at=now) + backoff_results = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.BACKOFF) + orphaned_results = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.STARTED).exclude( + process__status=Process.StatusChoices.RUNNING, + ) + # ArchiveResult has no retry_at scheduler. Wake only the parent Snapshots + # for result rows we are about to repair, then requeue those exact rows. + # Using subqueries keeps million-row recovery in SQLite instead of building + # Python ID lists or scanning all sealed snapshots. + Snapshot.objects.filter( + id__in=backoff_results.values("snapshot_id"), + status__in=[Snapshot.StatusChoices.SEALED, Snapshot.StatusChoices.PAUSED], + retry_at__isnull=True, + ).update(retry_at=now, modified_at=now) + Snapshot.objects.filter( + id__in=orphaned_results.values("snapshot_id"), + status__in=[Snapshot.StatusChoices.SEALED, Snapshot.StatusChoices.PAUSED], + retry_at__isnull=True, + ).update(retry_at=now, modified_at=now) + cleaned["archiveresults_backoff"] = backoff_results.update(status=ArchiveResult.StatusChoices.QUEUED, modified_at=now) + # Targeted plugin rows on final/paused Snapshots are scheduled through the + # parent Snapshot.retry_at. 
retry_at=NULL is the normal idle marker for a + # sealed Snapshot and must not be interpreted as queued work just because + # old/synthetic ArchiveResult rows exist. If takeover kills the runner + # after it leases the Snapshot but before queued ArchiveResult rows finish, + # the rows remain QUEUED while retry_at sits in the future. Recovery runs + # only after this runner has won the single-runner gate, so it can safely + # unlock those stale plugin leases for immediate processing instead of + # waiting out the previous owner's full lock timeout. + queued_plugin_results = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.QUEUED) + cleaned["snapshots_queued_plugin_rows_waiting_on_stale_lease"] = ( + Snapshot.objects.filter( + id__in=queued_plugin_results.values("snapshot_id"), + status__in=[Snapshot.StatusChoices.SEALED, Snapshot.StatusChoices.PAUSED], + ) + .filter(retry_at__gt=now) + .update(retry_at=now, modified_at=now) + ) + # Impossible state repair: STARTED ArchiveResults without a live Process + # have no owner left to emit completion. Requeue only the result row; the + # snapshot/crawl schedulers will pick up normal retry processing. + cleaned["archiveresults_started_without_running_process"] = orphaned_results.update( + status=ArchiveResult.StatusChoices.QUEUED, + process=None, + modified_at=now, + ) + orphaned_hook_processes = Process.objects.filter( + process_type=Process.TypeChoices.HOOK, + archiveresult__isnull=True, + ).exclude(status=Process.StatusChoices.RUNNING) + for process in orphaned_hook_processes.only("id", "pwd", "cmd", "process_type", "status"): + hook_script_name = process.hook_script_name + if not hook_script_name or not process.pwd: + continue + plugin_dir = Path(process.pwd) + try: + # Old or synthetic hook Process rows can point at arbitrary paths. + # Only paths whose parent directory is a valid Snapshot id can be + # reconstructed into ArchiveResult rows. 
+ snapshot = Snapshot.objects.filter(id=plugin_dir.parent.name).first() + except ValidationError: + continue + if snapshot is None: + continue + result, created = ArchiveResult.objects.get_or_create( + snapshot=snapshot, + plugin=plugin_dir.name, + hook_name=Path(hook_script_name).stem, + defaults={ + "status": ArchiveResult.StatusChoices.QUEUED, + }, + ) + if result.status == ArchiveResult.StatusChoices.QUEUED: + requeue_snapshot = False + # A runner can die after the hook Process exits but before the + # ProcessCompletedEvent projector links/finalizes ArchiveResult. + # Reconstruct only that exact hook row from the durable Process row. + output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir) + result.process = process + if _is_signal_interrupted_exit(process.exit_code): + # The owning runner died or was asked to stop while the hook was + # still active. Keep the work item queued so takeover retries the + # same hook; treating an unknown signal exit as success would + # silently skip unfinished side effects. 
+ result.output_files = {} + result.output_size = 0 + result.output_mimetypes = "" + result.output_str = "" + result.status = ArchiveResult.StatusChoices.QUEUED + requeue_snapshot = True + else: + result.output_files = output_files + result.output_size = output_size + result.output_mimetypes = output_mimetypes + result.output_str = process.stderr if process.exit_code not in (0, None) else "" + result.status = ( + ArchiveResult.StatusChoices.FAILED + if process.exit_code not in (0, None) + else (ArchiveResult.StatusChoices.SUCCEEDED if output_files else ArchiveResult.StatusChoices.NORESULTS) + ) + result.save( + update_fields=[ + "process", + "output_files", + "output_size", + "output_mimetypes", + "output_str", + "status", + "modified_at", + ], + ) + if requeue_snapshot: + Snapshot.objects.filter(id=snapshot.id).update(retry_at=now, modified_at=now) + if created: + cleaned["archiveresults_missing_for_orphaned_hook_processes"] += 1 + Snapshot.objects.filter(id=snapshot.id).update(retry_at=now, modified_at=now) + started_snapshots = Snapshot.objects.filter(status=Snapshot.StatusChoices.STARTED).filter( + Q(retry_at__isnull=True) | Q(retry_at__gt=now), + ) + + # Broken lock repair: STARTED + retry_at=NULL or retry_at in the future + # means "owned by an active runner". Recovery only runs from the current + # elected runner after Process cleanup has proven old owners are gone, so + # STARTED rows with no live ArchiveResult process should not wait out the + # previous runner's full lease before the new runner can resume them. + # We only unlock scheduling; normal Snapshot runner code owns the next + # transition and side effects. + cleaned["snapshots_started_without_running_results"] = ( + started_snapshots.annotate(has_running_results=Exists(running_archiveresults)) + .filter(has_running_results=False) + .update( + retry_at=now, + modified_at=now, + ) + ) + + # Broken lock repair: STARTED + retry_at=NULL is an orphaned ownership + # lease. 
Recovery only unlocks scheduling; the runner owns any subsequent + # state-machine transition, including sealing rows whose children/results + # are already final. + recoverable_started_crawls = Crawl.objects.filter(status=Crawl.StatusChoices.STARTED).filter( + Q(retry_at__isnull=True) | Q(retry_at__gt=now), + ) + + due_started_crawls = recoverable_started_crawls.annotate(has_due_child=Exists(due_child_snapshots)).filter(has_due_child=True) + cleaned["crawls_started_with_due_snapshots"] = due_started_crawls.update(retry_at=now, modified_at=now) + future_started_crawls = recoverable_started_crawls.annotate( + has_active_child=Exists(active_child_snapshots), + has_due_child=Exists(due_child_snapshots), + next_child_retry=next_future_child_retry, + ).filter(has_active_child=True, has_due_child=False) + cleaned["crawls_started_waiting_on_future_snapshots"] = future_started_crawls.update( + retry_at=Coalesce("next_child_retry", Value(now)), + modified_at=now, + ) + finished_started_crawls = recoverable_started_crawls.annotate(has_active_child=Exists(active_child_snapshots)).filter( + has_active_child=False, + ) + cleaned["crawls_started_without_active_snapshots"] = finished_started_crawls.update(retry_at=now, modified_at=now) + + repair_messages = { + "processes_stale_running": ( + "Closing {count} interrupted process(es) " + "(ArchiveBox may have been interrupted before it was able to record that they stopped; any affected work can now be retried)." + ), + "processes_orphaned_workers": ( + "Closing {count} interrupted extractor process(es) " + "(ArchiveBox may have been interrupted before it was able to record their result; affected extractor results can now be retried)." + ), + "chrome_processes_orphaned": ( + "Stopping {count} leftover browser process(es) " + "(ArchiveBox may have been interrupted before it was able to close them; this frees browser resources and avoids duplicate browser sessions)." 
+ ), + "crawls_queued_without_retry_at": ( + "Starting {count} Crawl(s) that were queued but never started " + "(ArchiveBox may have been interrupted before it was able to begin archiving them)." + ), + "snapshots_queued_without_retry_at": ( + "Starting {count} Snapshot(s) that were queued but never started " + "(ArchiveBox may have been interrupted before it was able to archive those URLs)." + ), + "archiveresults_backoff": ( + "Retrying {count} extractor result(s) that were waiting to retry " + "(ArchiveBox may have been interrupted before it was able to try them again; affected outputs will be retried)." + ), + "archiveresults_started_without_running_process": ( + "Retrying {count} extractor result(s) that were interrupted before finishing " + "(ArchiveBox may have been interrupted before it was able to save their final status; partial files will be overwritten with fresh results upon retry)." + ), + "snapshots_started_without_running_results": ( + "Resuming {count} Snapshot(s) that were interrupted before finishing " + "(ArchiveBox may have been interrupted before it was able to finish archiving them; missing outputs will be retried)." + ), + "crawls_started_with_due_snapshots": ( + "Resuming {count} Crawl(s) with pending URLs ready to archive " + "(ArchiveBox may have been interrupted before it was able to archive the remaining URLs; pending URLs will continue)." + ), + "crawls_started_waiting_on_future_snapshots": ( + "Resuming {count} Crawl(s) with URLs waiting for a later retry " + "(ArchiveBox may have been interrupted before it was able to retry delayed URLs; they will retry later)." + ), + "crawls_started_without_active_snapshots": ( + "Finalizing {count} Crawl(s) that finished URL processing but were not closed cleanly " + "(ArchiveBox may have been interrupted before it was able to save the final crawl status; archived data is not changed)." 
def split_host_port(host: str) -> tuple[str, str | None]:
    """Split a ``host[:port]`` string into ``(hostname, port)``.

    Args:
        host: Host header / bind address such as ``example.com:8000`` or
            ``[::1]:9292``. Falsy values are treated as an empty string.

    Returns:
        A tuple of the lowercased hostname (IPv6 brackets removed by
        ``urlparse``) and the port as a string, or ``None`` when there is
        no usable port. Malformed input never raises: an unparseable host
        is returned lowercased as-is, and a non-numeric or out-of-range
        port is reported as ``None``.
    """
    host = host or ""
    try:
        parsed = urlparse(f"//{host}")
    except ValueError:
        # urlparse rejects e.g. unbalanced IPv6 brackets outright.
        return host.lower(), None

    hostname = (parsed.hostname or host).lower()
    try:
        port = parsed.port
    except ValueError:
        # Raised for non-numeric / out-of-range ports and for bare IPv6
        # literals like "::1", whose trailing groups look like a port.
        port = None
    return hostname, (str(port) if port else None)
+ netloc = parsed.netloc + while netloc.startswith("*."): + netloc = netloc[2:] + if not netloc: + return "" + return f"{parsed.scheme}://{netloc}" + + +def normalize_base_url(value: str | None) -> str: + return _normalize_base_url(value) + + +def _csrf_trusted_origins(config) -> list[str]: + raw = (config.CSRF_TRUSTED_ORIGINS or "").strip() + if not raw: + return [] + seen: list[str] = [] + for entry in raw.split(","): + normalized = _normalize_base_url(entry.strip()) + if normalized and normalized not in seen: + seen.append(normalized) + return seen + + +def _allowed_hosts(config) -> set[str]: + raw = (config.ALLOWED_HOSTS or "").strip() + if not raw: + return set() + return {entry.strip().lower() for entry in raw.split(",") if entry.strip() and entry.strip() != "*"} + + +def derive_base_url_from_csrf(config: dict[str, Any] | None = None, **config_kwargs: Any) -> str: + """Pick a single CSRF_TRUSTED_ORIGINS entry to act as the implicit BASE_URL. + + 0.7.3 โ†’ 0.9.0 upgrade path: any reverse-proxied 0.7.3 deployment already + had ``CSRF_TRUSTED_ORIGINS=https://archive.example.com`` set (required for + admin login to work). On upgrade, ``BASE_URL`` is the new knob โ€” but it + defaults to empty, and falling through to ``BIND_ADDR`` produces an + unreachable URL like ``http://0.0.0.0:8000``. If the user has exactly one + CSRF origin we treat it as the implicit BASE_URL so links/redirects keep + pointing at the public hostname they already configured. + + Returns ``""`` when the inference is ambiguous (multiple origins) or + impossible (none set) so callers fall through to their next strategy. 
+ """ + config = config or get_config(**config_kwargs) + origins = _csrf_trusted_origins(config) + if len(origins) == 1: + return origins[0] + return "" + + +def get_listen_host(config: dict[str, Any] | None = None, **config_kwargs: Any) -> str: + config = config or get_config(**config_kwargs) + return (config.BIND_ADDR or "").strip() + + +def get_listen_parts(config: dict[str, Any] | None = None, **config_kwargs: Any) -> tuple[str, str | None]: + config = config or get_config(**config_kwargs) + return split_host_port(get_listen_host(config=config)) + + +def _with_port(host: str, port: str | None) -> str: + return f"{host}:{port}" if port else host + + +def strip_role_subdomain(host: str) -> str: + """Strip leading ``admin.`` / ``web.`` / ``api.`` / ``snap-*.`` + labels from a host (preserving the port). Strips repeatedly so an + already-compounded host like ``snap-X.snap-X.<base>`` reduces all the + way down to ``<base>``. + + Used when we want to recover the canonical base host from a request that + arrived on a role subdomain โ€” otherwise builders that prepend their own + role label (e.g. ``snap-X.``) compound onto the existing prefix and you + get ``snap-X.snap-X.snap-X.<base>`` on every click. + """ + if not host: + return "" + hostname, port = split_host_port(host) + while hostname and "." in hostname: + head, _sep, rest = hostname.partition(".") + if head in _ROLE_SUBDOMAIN_LABELS or _SNAPSHOT_SUBDOMAIN_RE.match(head): + hostname = rest + continue + break + return _with_port(hostname, port) + + +def _is_local_bind_host(host: str) -> bool: + return host in {"", "0.0.0.0", "::", "127.0.0.1", "::1", "localhost"} + + +def canonical_base_host_for_request(request_host: str) -> str: + """Strip role subdomains and remap loopback hostnames to ``archivebox.localhost``. 
+ + Used by the banner suggestion and the in-browser pin endpoint: when the + user is hitting the server on raw ``localhost:9292`` or ``127.0.0.1:9292`` + we want to suggest the wildcard-friendly ``archivebox.localhost`` family + instead, so the eventual pinned ``BASE_URL`` plays nicely with subdomain + routing without forcing the user to add a /etc/hosts entry. + """ + hostname, port = split_host_port(strip_role_subdomain(request_host or "")) + if _is_local_bind_host(hostname): + hostname = "archivebox.localhost" + return _with_port(hostname, port) + + +def _root_host_from_listen(config: dict[str, Any] | None = None, **config_kwargs: Any) -> str: + config = config or get_config(**config_kwargs) + listen_host, listen_port = get_listen_parts(config=config) + root_host = "archivebox.localhost" if _is_local_bind_host(listen_host) else listen_host + return _with_port(root_host, listen_port) if root_host else "" + + +def get_base_url(request=None, config: dict[str, Any] | None = None, **config_kwargs: Any) -> str: + config = config or get_config(**config_kwargs) + override = _normalize_base_url(config.BASE_URL) + if override: + return override + + # A) Implicit BASE_URL from a single CSRF_TRUSTED_ORIGINS entry. Catches + # 0.7.3 โ†’ 0.9.0 upgrades where users already set CSRF_TRUSTED_ORIGINS + # for their reverse-proxy login but never set BASE_URL. 
+ csrf_derived = derive_base_url_from_csrf(config) + if csrf_derived: + return csrf_derived + + scheme = request.scheme if request else "http" + if request: + req_host, req_port = split_host_port(request.get_host()) + if req_host.endswith(".archivebox.localhost"): + return f"{scheme}://{_with_port('archivebox.localhost', req_port)}" + if _is_local_bind_host(req_host): + return f"{scheme}://{_with_port('archivebox.localhost', req_port)}" + # C) Per-request fallback: when ``BASE_URL`` is unset and CSRF didn't + # give us a single origin, trust the request's Host header โ€” but first + # peel off any ``admin.`` / ``web.`` / ``api.`` / ``snap-*.`` label. + # Otherwise the URL builders below prepend their own role label onto a + # host that already carries one, producing the ``snap-X.snap-X.snap-X`` + # compounding bug. Django has already admitted the host via + # ALLOWED_HOSTS; the misconfig banner surfaces the case where the + # resulting URL doesn't match what the operator probably intended. 
def get_snapshot_subdomain(snapshot_id: str) -> str:
    """Build the ``snap-<suffix>`` subdomain label for a snapshot id.

    Args:
        snapshot_id: Snapshot id as a string or any object whose ``str()``
            is the id (e.g. a ``uuid.UUID``). Falsy values are treated as
            an empty id.

    Returns:
        ``snap-`` followed by the last 12 hex digits of the id, lowercased
        (or all of them when the id has fewer than 12 hex digits).
    """
    # Coerce first: re.sub() raises TypeError on non-string ids such as UUID objects.
    hex_digits = re.sub(r"[^0-9a-fA-F]", "", str(snapshot_id or ""))
    # Slicing already copes with ids shorter than 12 digits.
    return f"snap-{hex_digits[-12:].lower()}"
def host_matches(request_host: str, target_host: str) -> bool:
    """Report whether a request host refers to the same host as a target.

    Hostnames are compared after ``split_host_port`` normalisation. Ports
    only disqualify a match when both sides specify one and they differ;
    a missing port on either side is treated as compatible. An empty
    request or target host never matches.
    """
    if not (request_host and target_host):
        return False
    actual_name, actual_port = split_host_port(request_host)
    wanted_name, wanted_port = split_host_port(target_host)
    ports_conflict = bool(wanted_port and actual_port) and wanted_port != actual_port
    return actual_name == wanted_name and not ports_conflict
override = _normalize_base_url(config.BASE_URL) + if override: + return urlparse(override).scheme + if request: + return request.scheme + return "http" + + +def _build_base_url_for_host(host: str, request=None, config: dict[str, Any] | None = None) -> str: + if not host: + return "" + scheme = _scheme_from_request(request, config=config) + return f"{scheme}://{host}" + + +def get_admin_base_url(request=None, config: dict[str, Any] | None = None, **config_kwargs: Any) -> str: + config = config or get_config(**config_kwargs) + if not config.USES_SUBDOMAIN_ROUTING: + return get_base_url(request=request, config=config) + return _build_base_url_for_host(_build_base_host("admin", request=request, config=config), request=request, config=config) + + +def get_web_base_url(request=None, config: dict[str, Any] | None = None, **config_kwargs: Any) -> str: + config = config or get_config(**config_kwargs) + if not config.USES_SUBDOMAIN_ROUTING: + return get_base_url(request=request, config=config) + return _build_base_url_for_host(_build_base_host("web", request=request, config=config), request=request, config=config) + + +def get_api_base_url(request=None, config: dict[str, Any] | None = None, **config_kwargs: Any) -> str: + config = config or get_config(**config_kwargs) + if not config.USES_SUBDOMAIN_ROUTING: + return get_base_url(request=request, config=config) + return _build_base_url_for_host(_build_base_host("api", request=request, config=config), request=request, config=config) + + +def get_snapshot_base_url(snapshot_id: str, request=None, config: dict[str, Any] | None = None, **config_kwargs: Any) -> str: + config = config or get_config(**config_kwargs) + if not config.USES_SUBDOMAIN_ROUTING: + return _build_url(get_web_base_url(request=request, config=config), f"/snapshot/{str(snapshot_id).replace('-', '')}") + return _build_base_url_for_host( + _build_base_host(get_snapshot_subdomain(snapshot_id), request=request, config=config), + request=request, + config=config, + ) 
+ + +def get_original_base_url(domain: str, request=None, config: dict[str, Any] | None = None, **config_kwargs: Any) -> str: + config = config or get_config(**config_kwargs) + if not config.USES_SUBDOMAIN_ROUTING: + return _build_url(get_web_base_url(request=request, config=config), f"/original/{domain}") + return _build_base_url_for_host(_build_base_host(domain, request=request, config=config), request=request, config=config) + + +def build_admin_url(path: str = "", request=None, config: dict[str, Any] | None = None, **config_kwargs: Any) -> str: + return _build_url(get_admin_base_url(request, config=config, **config_kwargs), path) + + +def build_web_url(path: str = "", request=None, config: dict[str, Any] | None = None, **config_kwargs: Any) -> str: + return _build_url(get_web_base_url(request, config=config, **config_kwargs), path) + + +def build_snapshot_url(snapshot_id: str, path: str = "", request=None, config: dict[str, Any] | None = None, **config_kwargs: Any) -> str: + return _build_url(get_snapshot_base_url(snapshot_id, request=request, config=config, **config_kwargs), path) + + +def build_original_url(domain: str, path: str = "", request=None, config: dict[str, Any] | None = None, **config_kwargs: Any) -> str: + return _build_url(get_original_base_url(domain, request=request, config=config, **config_kwargs), path) + + +def _build_url(base_url: str, path: str) -> str: + if not base_url: + if not path: + return "" + return path if path.startswith("/") else f"/{path}" + if not path: + return base_url + return f"{base_url}{path if path.startswith('/') else f'/{path}'}" diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py new file mode 100644 index 0000000000..d4e8f8776b --- /dev/null +++ b/archivebox/core/settings.py @@ -0,0 +1,681 @@ +__package__ = "archivebox.core" + +import os +import sys +import inspect +import importlib + +from pathlib import Path + +from django.conf.locale.en import formats as en_formats # type: ignore + +import 
archivebox + +from archivebox.config.constants import CONSTANTS +from archivebox.config.common import get_config +from archivebox.core.routes_util import get_api_base_url, get_admin_base_url, get_base_url, normalize_base_url +from .settings_logging import SETTINGS_LOGGING + + +IS_MIGRATING = "makemigrations" in sys.argv[:3] or "migrate" in sys.argv[:3] +IS_TESTING = "test" in sys.argv[:3] or "PYTEST_CURRENT_TEST" in os.environ +IS_SHELL = "shell" in sys.argv[:3] or "shell_plus" in sys.argv[:3] +IS_GETTING_VERSION_OR_HELP = "version" in sys.argv or "help" in sys.argv or "--version" in sys.argv or "--help" in sys.argv +CONFIG = get_config() +PACKAGE_DIR = CONSTANTS.PACKAGE_DIR + +################################################################################ +### ArchiveBox Plugin Settings +################################################################################ + +ALL_PLUGINS = archivebox.ALL_PLUGINS +LOADED_PLUGINS = archivebox.LOADED_PLUGINS + +################################################################################ +### Django Core Settings +################################################################################ + +WSGI_APPLICATION = "archivebox.core.wsgi.application" +ASGI_APPLICATION = "archivebox.core.asgi.application" +ROOT_URLCONF = "archivebox.core.urls" + +LOGIN_URL = "/accounts/login/" +LOGOUT_REDIRECT_URL = CONFIG.LOGOUT_REDIRECT_URL + +PASSWORD_RESET_URL = "/accounts/password_reset/" +APPEND_SLASH = True + +DEBUG = CONFIG.DEBUG or ("--debug" in sys.argv) + + +INSTALLED_APPS = [ + "daphne", + # Django default apps + "django.contrib.auth", + "django.contrib.contenttypes", + "django.contrib.sessions", + "django.contrib.messages", + "django.contrib.staticfiles", + "django.contrib.admin", + # 3rd-party apps from PyPI + "signal_webhooks", # handles REST API outbound webhooks + "django_object_actions", # provides easy Django Admin action buttons on change views + # Our ArchiveBox-provided apps (use fully qualified names) + # NOTE: 
Order matters! Apps with migrations that depend on other apps must come AFTER their dependencies + # "archivebox.config", # ArchiveBox config settings (no models, not a real Django app) + "archivebox.plugins", # plugin discovery, hook helpers, config UI, and plugin metadata views + "archivebox.search", # search backend query helpers, admin search UI, and daemon integrations + "archivebox.machine", # handles collecting and storing information about the host machine, network interfaces, binaries, etc. + "archivebox.workers", # handles starting and managing background workers and processes (orchestrators and actors) + "archivebox.personas", # handles Persona and session management + "archivebox.core", # core django model with Snapshot, ArchiveResult, etc. (crawls depends on this) + "archivebox.crawls", # handles Crawl and CrawlSchedule models and management (depends on core) + "archivebox.progressmonitor", # live progress endpoint and admin monitor template + "archivebox.api", # Django-Ninja-based Rest API interfaces, config, APIToken model, etc. 
# LDAP Authentication Configuration
# Only applied when LDAP_ENABLED=True. Both failure modes below are fatal on
# purpose: a deployment that asked for LDAP must not silently start without it.
# (Previously the ImportError re-raised below was swallowed by an enclosing
# `except ImportError: pass`, so the server came up with LDAP quietly disabled.)
if CONFIG.LDAP_ENABLED:
    # Validate LDAP configuration
    is_valid, error_msg = CONFIG.validate_ldap_config()
    if not is_valid:
        from rich import print as _rich_print

        _rich_print(f"[red][X] Error: {error_msg}[/red]")
        raise ValueError(error_msg)

    try:
        # Try to import django-auth-ldap (will fail if not installed).
        # Keep the try body to the imports only so unrelated errors are not
        # misreported as a missing dependency.
        LDAPSearch = importlib.import_module("django_auth_ldap.config").LDAPSearch
        ldap = importlib.import_module("ldap")
    except ImportError as e:
        from rich import print as _rich_print

        _rich_print("[red][X] Error: LDAP_ENABLED=True but required LDAP libraries are not installed![/red]")
        _rich_print(f"[red]    {e}[/red]")
        _rich_print("[yellow]    To install LDAP support, run:[/yellow]")
        _rich_print("[yellow]        pip install archivebox[ldap][/yellow]")
        _rich_print("[yellow]    Or manually:[/yellow]")
        _rich_print("[yellow]        apt install build-essential python3-dev libsasl2-dev libldap2-dev libssl-dev[/yellow]")
        _rich_print("[yellow]        pip install python-ldap django-auth-ldap[/yellow]")
        raise

    # Configure LDAP authentication
    AUTH_LDAP_SERVER_URI = CONFIG.LDAP_SERVER_URI
    AUTH_LDAP_BIND_DN = CONFIG.LDAP_BIND_DN
    AUTH_LDAP_BIND_PASSWORD = CONFIG.LDAP_BIND_PASSWORD

    # Configure user search (falls back to 2 if the ldap module has no SCOPE_SUBTREE attribute)
    AUTH_LDAP_USER_SEARCH = LDAPSearch(
        CONFIG.LDAP_USER_BASE,
        getattr(ldap, "SCOPE_SUBTREE", 2),
        CONFIG.LDAP_USER_FILTER,
    )

    # Map LDAP attributes to Django user model fields
    AUTH_LDAP_USER_ATTR_MAP = {
        "username": CONFIG.LDAP_USERNAME_ATTR,
        "first_name": CONFIG.LDAP_FIRSTNAME_ATTR,
        "last_name": CONFIG.LDAP_LASTNAME_ATTR,
        "email": CONFIG.LDAP_EMAIL_ATTR,
    }

    # Use custom LDAP backend that supports LDAP_CREATE_SUPERUSER
    AUTHENTICATION_BACKENDS = [
        "archivebox.ldap.auth.ArchiveBoxLDAPBackend",
        "django.contrib.auth.backends.RemoteUserBackend",
        "django.contrib.auth.backends.ModelBackend",
    ]
os.access(CONSTANTS.CUSTOM_TEMPLATES_DIR, os.R_OK) +STATICFILES_DIRS = [ + *([str(CONSTANTS.CUSTOM_TEMPLATES_DIR / "static")] if CUSTOM_TEMPLATES_ENABLED else []), + # *[ + # str(plugin_dir / 'static') + # for plugin_dir in PLUGIN_DIRS.values() + # if (plugin_dir / 'static').is_dir() + # ], + # Additional static file dirs from plugins + str(PACKAGE_DIR / TEMPLATES_DIR_NAME / "static"), +] + +TEMPLATE_DIRS = [ + *([str(CONSTANTS.CUSTOM_TEMPLATES_DIR)] if CUSTOM_TEMPLATES_ENABLED else []), + # *[ + # str(plugin_dir / 'templates') + # for plugin_dir in PLUGIN_DIRS.values() + # if (plugin_dir / 'templates').is_dir() + # ], + # Additional template dirs from plugins + str(PACKAGE_DIR / TEMPLATES_DIR_NAME / "core"), + str(PACKAGE_DIR / TEMPLATES_DIR_NAME / "admin"), + str(PACKAGE_DIR / TEMPLATES_DIR_NAME), +] + +TEMPLATES = [ + { + "BACKEND": "django.template.backends.django.DjangoTemplates", + "DIRS": TEMPLATE_DIRS, + "APP_DIRS": True, + "OPTIONS": { + "context_processors": [ + "django.template.context_processors.debug", + "django.template.context_processors.request", + "django.contrib.auth.context_processors.auth", + "django.contrib.messages.context_processors.messages", + "archivebox.core.context_processors.archivebox_globals", + ], + }, + }, +] + + +################################################################################ +### External Service Settings +################################################################################ + +# CACHE_DB_FILENAME = 'cache.sqlite3' +# CACHE_DB_PATH = CONSTANTS.CACHE_DIR / CACHE_DB_FILENAME +# CACHE_DB_TABLE = 'django_cache' + +DATABASE_NAME = CONFIG.DATABASE_NAME +SQLITE_JOURNAL_MODE = CONFIG.SQLITE_JOURNAL_MODE +SQLITE_MMAP_SIZE = CONFIG.SQLITE_MMAP_SIZE + +SQLITE_CONNECTION_OPTIONS = { + "ENGINE": "archivebox.core.sqlite_backend", + "TIME_ZONE": CONSTANTS.TIMEZONE, + "OPTIONS": { + # https://gcollazo.com/optimal-sqlite-settings-for-django/ + # https://litestream.io/tips/#busy-timeout + # 
https://docs.djangoproject.com/en/5.1/ref/databases/#setting-pragma-options + "timeout": CONFIG.SQLITE_BUSY_TIMEOUT / 1000, + "check_same_thread": False, + # Keep SQLite on Django's default deferred transaction mode. BEGIN + # IMMEDIATE grabs the write lock as soon as atomic() opens, which is + # exactly what hurts ArchiveBox on large collections where Python code + # may do filesystem work before the actual row write. Deferred BEGIN + # keeps writes statement-scoped unless a caller explicitly opens a + # transaction around multiple writes. + "transaction_mode": None, + "init_command": ( + "PRAGMA foreign_keys=ON;" + f"PRAGMA busy_timeout = {CONFIG.SQLITE_BUSY_TIMEOUT};" + f"PRAGMA journal_mode = {SQLITE_JOURNAL_MODE};" + "PRAGMA synchronous = NORMAL;" + "PRAGMA temp_store = MEMORY;" + f"PRAGMA mmap_size = {SQLITE_MMAP_SIZE};" + "PRAGMA journal_size_limit = 67108864;" + "PRAGMA cache_size = 2000;" + ), + }, +} + +DATABASES = { + "default": { + "NAME": DATABASE_NAME, + **SQLITE_CONNECTION_OPTIONS, + }, + # "filestore": { + # "NAME": CONSTANTS.FILESTORE_DATABASE_FILE, + # **SQLITE_CONNECTION_OPTIONS, + # }, + # 'cache': { + # 'NAME': CACHE_DB_PATH, + # **SQLITE_CONNECTION_OPTIONS, + # }, +} +MIGRATION_MODULES = {"signal_webhooks": None} + +# Django requires DEFAULT_AUTO_FIELD to subclass AutoField (BigAutoField, SmallAutoField, etc.) +# Cannot use UUIDField here until Django 6.0 introduces DEFAULT_PK_FIELD setting +# For now: manually add `id = CompactUUIDField(primary_key=True, default=uuid7, ...)` to all models +# OR inherit from ModelWithUUID base class which provides UUID primary key +DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField" + + +# class FilestoreDBRouter: +# """ +# A router to store all the File models in the filestore.sqlite3 database. +# This data just mirrors what is in the file system, so we want to keep it in a separate database +# from the main index database to avoid contention. 
+# """ + +# route_app_labels = {"filestore"} +# db_name = "filestore" + +# def db_for_read(self, model, **hints): +# if model._meta.app_label in self.route_app_labels: +# return self.db_name +# return 'default' + +# def db_for_write(self, model, **hints): +# if model._meta.app_label in self.route_app_labels: +# return self.db_name +# return 'default' + +# def allow_relation(self, obj1, obj2, **hints): +# if obj1._meta.app_label in self.route_app_labels or obj2._meta.app_label in self.route_app_labels: +# return obj1._meta.app_label == obj2._meta.app_label +# return None + +# def allow_migrate(self, db, app_label, model_name=None, **hints): +# if app_label in self.route_app_labels: +# return db == self.db_name +# return db == "default" + +DATABASE_ROUTERS = [] + +CACHES = { + "default": {"BACKEND": "django.core.cache.backends.locmem.LocMemCache"}, + # 'sqlite': {'BACKEND': 'django.core.cache.backends.db.DatabaseCache', 'LOCATION': 'cache'}, + # 'dummy': {'BACKEND': 'django.core.cache.backends.dummy.DummyCache'}, + # 'filebased': {"BACKEND": "django.core.cache.backends.filebased.FileBasedCache", "LOCATION": CACHE_DIR / 'cache_filebased'}, +} + +EMAIL_BACKEND = "django.core.mail.backends.console.EmailBackend" + + +STORAGES = { + "default": { + "BACKEND": "django.core.files.storage.FileSystemStorage", + }, + "staticfiles": { + "BACKEND": "django.contrib.staticfiles.storage.StaticFilesStorage", + }, + "archive": { + "BACKEND": "django.core.files.storage.FileSystemStorage", + "OPTIONS": { + "base_url": "/archive/", + "location": CONSTANTS.ARCHIVE_DIR, + }, + }, + # "snapshots": { + # "BACKEND": "django.core.files.storage.FileSystemStorage", + # "OPTIONS": { + # "base_url": "/snapshots/", + # "location": CONSTANTS.USERS_DIR, + # }, + # }, + # "personas": { + # "BACKEND": "django.core.files.storage.FileSystemStorage", + # "OPTIONS": { + # "base_url": "/personas/", + # "location": PERSONAS_DIR, + # }, + # }, +} + +CHANNEL_LAYERS = {"default": {"BACKEND": 
"channels.layers.InMemoryChannelLayer"}} + +################################################################################ +### Security Settings +################################################################################ + +# Persist SECRET_KEY on first use. Data dirs created before init wrote a +# SECRET_KEY line โ€” or those whose ArchiveBox.conf was hand-edited to remove +# it โ€” would otherwise sign session cookies with a fresh random key on every +# boot (because the pydantic field's default_factory regenerates), logging +# users out on every server restart. ``archivebox init`` writes this for new +# collections; this branch is the recovery path for the rest. +# +# We can't check ``CONFIG.SECRET_KEY`` for "missing" โ€” pydantic's +# default_factory already filled it with a fresh random value. We have to +# inspect the conf file directly to know whether the value will survive. +SECRET_KEY = CONFIG.SECRET_KEY +try: + from archivebox.config.configset import BaseConfigSet as _BaseConfigSet + + _persisted_keys = _BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE) + _secret_persisted = bool((_persisted_keys.get("SECRET_KEY") or "").strip()) +except Exception: + _secret_persisted = True # err on the side of NOT touching disk +if not _secret_persisted: + try: + from archivebox.config.collection import write_config_file + + write_config_file({"SECRET_KEY": SECRET_KEY}) + except Exception: + # Read-only mount, missing data dir, mid-init race โ€” fall back to the + # in-memory random key. The user will get logged out on the next boot + # but the server still comes up. 
ALLOWED_HOSTS = [host.strip() for host in CONFIG.ALLOWED_HOSTS.split(",") if host.strip()]
# De-duplicate while keeping the configured order (a set would make the
# resulting list order vary between runs).
CSRF_TRUSTED_ORIGINS = list(dict.fromkeys(origin.strip() for origin in CONFIG.CSRF_TRUSTED_ORIGINS.split(",") if origin.strip()))

admin_base_url = normalize_base_url(get_admin_base_url())
if admin_base_url and admin_base_url not in CSRF_TRUSTED_ORIGINS:
    CSRF_TRUSTED_ORIGINS.append(admin_base_url)

api_base_url = normalize_base_url(get_api_base_url())
if api_base_url and api_base_url not in CSRF_TRUSTED_ORIGINS:
    CSRF_TRUSTED_ORIGINS.append(api_base_url)

# Auto-extend CSRF_TRUSTED_ORIGINS with the explicit ALLOWED_HOSTS entries so
# users who set ALLOWED_HOSTS=archivebox.example.com don't have to also list
# https://archivebox.example.com under CSRF_TRUSTED_ORIGINS. (The previous
# per-host WARNING print was just noise — the auto-append below is the actual
# fix, and the effective CSRF_TRUSTED_ORIGINS gets surfaced once at startup
# from archivebox_server.py.)
for hostname in ALLOWED_HOSTS:
    if hostname == "*":
        continue
    if hostname.startswith("."):
        # Django's ALLOWED_HOSTS treats ".example.com" as example.com plus all
        # of its subdomains; "https://.example.com" is not a usable origin, so
        # emit the bare domain and the "*." wildcard form instead.
        bare_domain = hostname.lstrip(".")
        if not bare_domain:
            continue
        https_endpoints = (f"https://{bare_domain}", f"https://*.{bare_domain}")
    else:
        https_endpoints = (f"https://{hostname}",)
    for https_endpoint in https_endpoints:
        if https_endpoint not in CSRF_TRUSTED_ORIGINS:
            CSRF_TRUSTED_ORIGINS.append(https_endpoint)
+# get_base_url() also covers deployments that only set CSRF_TRUSTED_ORIGINS (the +# implicit-BASE_URL fallback used on 0.7.x->0.9.x upgrades), so HTTPS hardening +# isn't lost until BASE_URL is migrated. A plain-http base (e.g. local +# http://archivebox.localhost:8000) keeps the defaults below. +BASE_URL_IS_HTTPS = get_base_url(config=CONFIG).strip().lower().startswith("https://") +if BASE_URL_IS_HTTPS: + SECURE_PROXY_SSL_HEADER = ("HTTP_X_FORWARDED_PROTO", "https") + +CSRF_COOKIE_SECURE = BASE_URL_IS_HTTPS +SESSION_COOKIE_SECURE = BASE_URL_IS_HTTPS +SESSION_COOKIE_HTTPONLY = True +SESSION_COOKIE_NAME = f"archivebox_sessionid_{CONSTANTS.COLLECTION_ID}" +CSRF_COOKIE_NAME = f"archivebox_csrftoken_{CONSTANTS.COLLECTION_ID}" +# Auth cookies are intentionally scoped to the exact host that set them so +# the admin session is NOT readable from web.* / api.* โ€” that +# split is a security boundary, not a UX choice. Subdomains that need to +# render an "is the user logged in?" indicator must use the single-bit +# `archivebox_admin_logged_in` hint cookie set by core/middleware.py +# (which IS scoped to the listen-host parent), never widen these. 
+SESSION_COOKIE_DOMAIN = None +CSRF_COOKIE_DOMAIN = None +SESSION_COOKIE_AGE = 1209600 # 2 weeks +SESSION_EXPIRE_AT_BROWSER_CLOSE = False +SESSION_SAVE_EVERY_REQUEST = False + +SESSION_ENGINE = "django.contrib.sessions.backends.db" + +AUTH_PASSWORD_VALIDATORS = [ + {"NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator"}, + {"NAME": "django.contrib.auth.password_validation.MinimumLengthValidator"}, + {"NAME": "django.contrib.auth.password_validation.CommonPasswordValidator"}, + {"NAME": "django.contrib.auth.password_validation.NumericPasswordValidator"}, +] + +DATA_UPLOAD_MAX_NUMBER_FIELDS = None +DATA_UPLOAD_MAX_MEMORY_SIZE = 26_214_400 # 25MB + +################################################################################ +### Shell Settings +################################################################################ + +SHELL_PLUS = "ipython" +SHELL_PLUS_PRINT_SQL = False +IPYTHON_ARGUMENTS = ["--no-confirm-exit", "--no-banner"] +IPYTHON_KERNEL_DISPLAY_NAME = "ArchiveBox Django Shell" +if IS_SHELL: + os.environ["PYTHONSTARTUP"] = str(PACKAGE_DIR / "misc" / "shell_welcome_message.py") + + +################################################################################ +### Internationalization & Localization Settings +################################################################################ + +LANGUAGE_CODE = "en-us" +USE_I18N = True +USE_TZ = True +DATETIME_FORMAT = "Y-m-d h:i:s A" +SHORT_DATETIME_FORMAT = "Y-m-d h:i:s A" +TIME_ZONE = CONSTANTS.TIMEZONE # django convention is TIME_ZONE, archivebox config uses TIMEZONE, they are equivalent + +en_formats.DATETIME_FORMAT = DATETIME_FORMAT # monkey patch en_format default with our preferred format +en_formats.SHORT_DATETIME_FORMAT = SHORT_DATETIME_FORMAT + + +################################################################################ +### Logging Settings +################################################################################ + +LOGGING = SETTINGS_LOGGING + + 
+################################################################################ +### REST API Outbound Webhooks settings +################################################################################ + +# Add default webhook configuration to the User model +SIGNAL_WEBHOOKS_CUSTOM_MODEL = "archivebox.api.models.OutboundWebhook" +SIGNAL_WEBHOOKS: dict[str, object] = { + "TIMEOUT": 30, + "TASK_HANDLER": "archivebox.api.webhooks.transaction_on_commit_task_handler", + "ERROR_HANDLER": "archivebox.api.webhooks.warning_error_handler", + "HOOKS": { + # ... is a special sigil value that means "use the default autogenerated hooks" + "django.contrib.auth.models.User": ..., + "archivebox.crawls.models.Crawl": ..., + "archivebox.core.models.Snapshot": ..., + "archivebox.core.models.ArchiveResult": ..., + "archivebox.core.models.Tag": ..., + "archivebox.api.models.APIToken": ..., + "archivebox.personas.models.Persona": ..., + "archivebox.machine.models.Machine": ..., + "archivebox.machine.models.Binary": ..., + "archivebox.machine.models.Process": ..., + }, +} + +################################################################################ +### Admin Data View Settings +################################################################################ + +ADMIN_DATA_VIEWS = { + "NAME": "Environment", + "URLS": [ + { + "route": "config/", + "view": "archivebox.core.views.live_config_list_view", + "name": "Configuration", + "items": { + "route": "<str:key>/", + "view": "archivebox.core.views.live_config_value_view", + "name": "config_val", + }, + }, + { + "route": "binaries/", + "view": "archivebox.config.views.binaries_list_view", + "name": "Dependencies", + "items": { + "route": "<str:key>/", + "view": "archivebox.config.views.binary_detail_view", + "name": "binary", + }, + }, + { + "route": "plugins/", + "view": "archivebox.plugins.views.plugins_list_view", + "name": "Plugins", + "items": { + "route": "<str:key>/", + "view": "archivebox.plugins.views.plugin_detail_view", + 
"name": "plugin", + }, + }, + { + "route": "workers/", + "view": "archivebox.config.views.worker_list_view", + "name": "Workers", + "items": { + "route": "<str:key>/", + "view": "archivebox.config.views.worker_detail_view", + "name": "worker", + }, + }, + { + "route": "logs/", + "view": "archivebox.config.views.log_list_view", + "name": "Logs", + "items": { + "route": "<str:key>/", + "view": "archivebox.config.views.log_detail_view", + "name": "log", + }, + }, + # Additional admin data views from plugins + ], +} + + +################################################################################ +### Debug Settings +################################################################################ + +# only enable debug toolbar when in DEBUG mode with --nothreading (it doesnt work in multithreaded mode) +DEBUG_TOOLBAR = False +DEBUG_TOOLBAR = DEBUG_TOOLBAR and DEBUG and ("--nothreading" in sys.argv) and ("--reload" not in sys.argv) +if DEBUG_TOOLBAR: + try: + import debug_toolbar # noqa + + DEBUG_TOOLBAR = True + except ImportError: + DEBUG_TOOLBAR = False + +if DEBUG_TOOLBAR: + INSTALLED_APPS = [*INSTALLED_APPS, "debug_toolbar"] + INTERNAL_IPS = ["0.0.0.0", "127.0.0.1", "*"] + DEBUG_TOOLBAR_CONFIG = { + "SHOW_TOOLBAR_CALLBACK": lambda request: True, + "RENDER_PANELS": True, + } + DEBUG_TOOLBAR_PANELS = [ + "debug_toolbar.panels.history.HistoryPanel", + "debug_toolbar.panels.versions.VersionsPanel", + "debug_toolbar.panels.timer.TimerPanel", + "debug_toolbar.panels.settings.SettingsPanel", + "debug_toolbar.panels.headers.HeadersPanel", + "debug_toolbar.panels.request.RequestPanel", + "debug_toolbar.panels.sql.SQLPanel", + "debug_toolbar.panels.staticfiles.StaticFilesPanel", + # 'debug_toolbar.panels.templates.TemplatesPanel', + "debug_toolbar.panels.cache.CachePanel", + "debug_toolbar.panels.signals.SignalsPanel", + "debug_toolbar.panels.logging.LoggingPanel", + "debug_toolbar.panels.redirects.RedirectsPanel", + "debug_toolbar.panels.profiling.ProfilingPanel", + 
# https://github.com/bensi94/Django-Requests-Tracker (improved version of django-debug-toolbar)
# Must delete archivebox/templates/admin to use because it relies on some things we override
# visit /__requests_tracker__/ to access
DEBUG_REQUESTS_TRACKER = True
DEBUG_REQUESTS_TRACKER = DEBUG_REQUESTS_TRACKER and DEBUG
if DEBUG_REQUESTS_TRACKER:
    try:
        import requests_tracker
    except ImportError:
        # Optional debug-only dependency: same treatment as debug_toolbar and
        # django_autotyping above, so DEBUG=True does not crash settings
        # import when the package is not installed.
        DEBUG_REQUESTS_TRACKER = False

if DEBUG_REQUESTS_TRACKER:
    INSTALLED_APPS += ["requests_tracker"]
    MIDDLEWARE += ["requests_tracker.middleware.requests_tracker_middleware"]
    INTERNAL_IPS = ["127.0.0.1", "10.0.2.2", "0.0.0.0", "*"]

    # Mutate TEMPLATE_DIRS in place: TEMPLATES[0]["DIRS"] refers to this same list object.
    TEMPLATE_DIRS.insert(0, str(Path(inspect.getfile(requests_tracker)).parent / "templates"))

    REQUESTS_TRACKER_CONFIG = {
        "TRACK_SQL": True,
        "ENABLE_STACKTRACES": False,
        "IGNORE_PATHS_PATTERNS": (
            r".*/favicon\.ico",
            r".*\.png",
            r"/admin/jsi18n/",
        ),
        "IGNORE_SQL_PATTERNS": (
            r"^SELECT .* FROM django_migrations WHERE app = 'requests_tracker'",
            r"^SELECT .* FROM django_migrations WHERE app = 'auth'",
        ),
    }
# URL path patterns whose access-log lines / "Not Found" warnings are pure noise.
IGNORABLE_URL_PATTERNS = [
    re.compile(r"/.*/?apple-touch-icon.*\.png"),
    re.compile(r"/.*/?favicon\.ico"),
    re.compile(r"/.*/?robots\.txt"),
    re.compile(r"/.*/?.*\.(css|js)\.map"),
    re.compile(r"/static/.*"),
    re.compile(r"/admin/jsi18n/"),
]


def _compile_ignorable_logline_patterns(url_patterns) -> tuple:
    """Expand each URL pattern into the two log-line regexes that should be dropped."""
    compiled = []
    for url_pattern in url_patterns:
        # access-log form, e.g. '"GET /static/admin/js/SelectBox.js HTTP/1.1" 304 0'
        compiled.append(re.compile(f'"GET {url_pattern.pattern} HTTP/.*" (2..|30.|404) .+$', re.I | re.M))
        # "Not Found: <path>" warning form
        compiled.append(re.compile(f"Not Found: {url_pattern.pattern}", re.I | re.M))
    return tuple(compiled)


class NoisyRequestsFilter(logging.Filter):
    """Drop request log lines for the URLs listed in ``IGNORABLE_URL_PATTERNS``.

    A record is rejected when its message starts with either a ``"GET <url> HTTP/..."``
    access-log entry with a 2xx / 30x / 404 status, or a ``Not Found: <url>`` warning,
    for any ignorable URL pattern. Everything else passes through.
    """

    def __init__(self, name: str = "") -> None:
        super().__init__(name)
        # Build the regexes once per filter instead of on every log record.
        self._ignorable_loglines = _compile_ignorable_logline_patterns(IGNORABLE_URL_PATTERNS)

    def filter(self, record) -> bool:
        logline = record.getMessage()
        # '"GET /api/v1/docs HTTP/1.1" 200 1023'
        # '"GET /static/admin/js/SelectFilter2.js HTTP/1.1" 200 15502'
        # '"GET /static/admin/js/SelectBox.js HTTP/1.1" 304 0'
        # '"GET /admin/jsi18n/ HTTP/1.1" 200 3352'
        # '"GET /admin/api/apitoken/0191bbf8-fd5e-0b8c-83a8-0f32f048a0af/change/ HTTP/1.1" 200 28778'
        return not any(pattern.match(logline) for pattern in self._ignorable_loglines)


class DaphneCloseTimeoutFilter(logging.Filter):
    """Drop daphne's noisy "killed slow response after client disconnect" warning.

    Daphne emits this whenever a request handler is still running when the
    client disconnects (e.g. iframe gets navigated away mid-response while
    fetching favicon / screenshot / preview html). For our use case these are
    always benign — the disconnect is the browser cancelling a request, not a
    server-side fault — so we suppress them outright rather than spamming
    WARNING. Other daphne.server lines pass through unchanged.
    """

    def filter(self, record) -> bool:
        if record.name != "daphne.server":
            return True
        logline = record.getMessage()
        if (
            "Application instance" in logline
            and "for connection <WebRequest" in logline
            and "took too long to shut down" in logline
            and "was killed" in logline
        ):
            return False
        return True


class AsyncioCancelledShieldFilter(logging.Filter):
    """Drop asyncio's "CancelledError exception in shielded future" noise.

    When a browser disconnects mid-request, daphne cancels the asgi task and
    asgiref's ``sync_to_async`` shields the synchronous Django view via
    ``asyncio.shield(exec_coro)``. The shield wakes up to find its parent
    cancelled and re-raises ``CancelledError``; asyncio's default exception
    handler then logs the full traceback via the ``asyncio`` logger. There's
    nothing the server can do (the client is already gone), so these tracebacks
    are pure noise and cause hundreds of lines of spam per disconnect.

    Only records from the ``asyncio`` logger are considered. Such a record is
    dropped when its message contains the shielded-future text, or when its
    ``exc_info`` carries a ``CancelledError`` (sub)class. All other asyncio
    records, and every record from other loggers, pass through unchanged.
    """

    def filter(self, record) -> bool:
        if record.name != "asyncio":
            return True
        if "CancelledError exception in shielded future" in record.getMessage():
            return False
        exc_info = record.exc_info
        if exc_info and exc_info[0] is not None:
            import asyncio

            try:
                if issubclass(exc_info[0], asyncio.CancelledError):
                    return False
            except TypeError:
                # exc_info[0] was not a class; treat the record as not cancellation-related
                pass
        return True


class CustomOutboundWebhookLogFormatter(logging.Formatter):
    """Formatter that relabels ``HTTP Request: `` log lines as ``OutboundWebhook: ``."""

    def format(self, record):
        result = super().format(record)
        return result.replace("HTTP Request: ", "OutboundWebhook: ")


class StripANSIColorCodesFilter(logging.Filter):
    """Strip ANSI color codes from a record's message; never rejects a record.

    Messages are only rewritten when they contain an ESC-prefixed sequence or a
    literal ``[m``. Rewriting bakes the formatted message into ``record.msg``
    and clears ``record.args`` so later formatting does not re-apply the args.
    """

    _ansi_re = re.compile(r"\x1b\[[0-9;]*m")  # full escape sequence, e.g. "\x1b[31m"
    _bare_re = re.compile(r"\[[0-9;]*m")  # leftover sequence whose ESC byte was already lost

    def filter(self, record) -> bool:
        msg = record.getMessage()
        if isinstance(msg, str) and ("\x1b[" in msg or "[m" in msg):
            msg = self._ansi_re.sub("", msg)
            msg = self._bare_re.sub("", msg)
            record.msg = msg
            record.args = ()
        return True
Logging to temp file: {ERROR_LOG}') + pass + +LOG_LEVEL_DATABASE = "WARNING" # change to DEBUG to log all SQL queries +LOG_LEVEL_REQUEST = "WARNING" # if DEBUG else 'WARNING' + +if LOG_LEVEL_DATABASE == "DEBUG": + db_logger = logging.getLogger("django.db.backends") + db_logger.setLevel(logging.DEBUG) + db_logger.addHandler(logging.StreamHandler()) + + +SETTINGS_LOGGING = { + "version": 1, + "disable_existing_loggers": False, + "formatters": { + "rich": { + "datefmt": "[%Y-%m-%d %H:%M:%S]", + "format": "%(name)s %(message)s", + }, + "outbound_webhooks": { + "()": CustomOutboundWebhookLogFormatter, + "datefmt": "[%Y-%m-%d %H:%M:%S]", + }, + }, + "filters": { + "noisyrequestsfilter": { + "()": NoisyRequestsFilter, + }, + "daphneclosetimeout": { + "()": DaphneCloseTimeoutFilter, + }, + "asynciocancelledshield": { + "()": AsyncioCancelledShieldFilter, + }, + "stripansi": { + "()": StripANSIColorCodesFilter, + }, + "require_debug_false": { + "()": "django.utils.log.RequireDebugFalse", + }, + "require_debug_true": { + "()": "django.utils.log.RequireDebugTrue", + }, + }, + "handlers": { + "default": { + "class": "rich.logging.RichHandler", + "formatter": "rich", + "level": "DEBUG", + "markup": False, + "enable_link_path": False, + "rich_tracebacks": False, # Use standard Python tracebacks (no frame/box) + "console": STDERR, + "filters": ["noisyrequestsfilter", "daphneclosetimeout", "asynciocancelledshield", "stripansi"], + }, + "logfile": { + "level": "INFO", + "class": "logging.handlers.RotatingFileHandler", + "filename": ERROR_LOG, + "maxBytes": 1024 * 1024 * 25, # 25 MB + "backupCount": 10, + "formatter": "rich", + "filters": ["noisyrequestsfilter", "daphneclosetimeout", "asynciocancelledshield", "stripansi"], + }, + "outbound_webhooks": { + "class": "rich.logging.RichHandler", + "markup": False, + "enable_link_path": False, + "rich_tracebacks": False, # Use standard Python tracebacks (no frame/box) + "formatter": "outbound_webhooks", + }, + # "mail_admins": { + # 
"level": "ERROR", + # "filters": ["require_debug_false"], + # "class": "django.utils.log.AdminEmailHandler", + # }, + "null": { + "class": "logging.NullHandler", + }, + }, + "root": { + "handlers": ["default", "logfile"], + "level": "INFO", + "formatter": "rich", + }, + "loggers": { + "api": { + "handlers": ["default", "logfile"], + "level": "DEBUG", + "propagate": False, + }, + "checks": { + "handlers": ["default", "logfile"], + "level": "DEBUG", + "propagate": False, + }, + "core": { + "handlers": ["default", "logfile"], + "level": "DEBUG", + "propagate": False, + }, + "httpx": { + "handlers": ["outbound_webhooks"], + "level": "INFO", + "formatter": "outbound_webhooks", + "propagate": False, + }, + "django": { + "handlers": ["default", "logfile"], + "level": "INFO", + "filters": ["noisyrequestsfilter"], + "propagate": False, + }, + "django.utils.autoreload": { + "propagate": False, + "handlers": [], + "level": "ERROR", + }, + "django.channels.server": { + # see archivebox.misc.monkey_patches.ModifiedAccessLogGenerator for dedicated daphne server logging settings + "propagate": False, + "handlers": ["default", "logfile"], + "level": "INFO", + "filters": ["noisyrequestsfilter"], + }, + "django.server": { # logs all requests (2xx, 3xx, 4xx) + "propagate": False, + "handlers": ["default", "logfile"], + "level": "INFO", + "filters": ["noisyrequestsfilter"], + }, + "django.request": { # only logs 4xx and 5xx errors + "propagate": False, + "handlers": ["default", "logfile"], + "level": "ERROR", + "filters": ["noisyrequestsfilter"], + }, + "django.db.backends": { + "propagate": False, + "handlers": ["default"], + "level": LOG_LEVEL_DATABASE, + }, + }, +} diff --git a/archivebox/core/shutdown_util.py b/archivebox/core/shutdown_util.py new file mode 100644 index 0000000000..f927336008 --- /dev/null +++ b/archivebox/core/shutdown_util.py @@ -0,0 +1,194 @@ +from __future__ import annotations + +import os +import signal +import subprocess +import sys +import threading +from 
collections.abc import Callable, Iterator +from contextlib import contextmanager +from dataclasses import dataclass + +import psutil + + +@dataclass +class ShutdownSignalState: + """Tracks the exact OS signal that asked a foreground command to exit.""" + + signal_name: str | None = None + + +_active_shutdown_state: ShutdownSignalState | None = None + + +def raise_if_shutdown_requested() -> None: + """Let long foreground loops honor a signal even if Python ignored it once.""" + + if _active_shutdown_state and _active_shutdown_state.signal_name: + raise KeyboardInterrupt + + +def configured_stopwaitsecs(workers: list[dict[str, str]] | tuple[dict[str, str], ...], *, default: int = 5, buffer: int = 5) -> int: + """Return a deterministic shutdown bound from generated worker definitions.""" + + stop_grace_seconds = default + for worker in workers: + try: + stop_grace_seconds = max(stop_grace_seconds, int(worker.get("stopwaitsecs") or default) + buffer) + except (TypeError, ValueError): + continue + return stop_grace_seconds + + +def wait_popen_and_kill_children( + proc: subprocess.Popen, + children: list[psutil.Process], + *, + timeout: float, + kill_timeout: float = 2.0, +) -> None: + """Wait for a Popen parent and then hard-kill any surviving descendants.""" + + try: + proc.wait(timeout=timeout) + except subprocess.TimeoutExpired: + proc.kill() + proc.wait(timeout=kill_timeout) + kill_remaining_processes(children, timeout=kill_timeout) + + +def wait_psutil_and_kill_children( + proc: psutil.Process, + children: list[psutil.Process], + *, + timeout: float, + kill_timeout: float = 2.0, +) -> None: + """Wait for a psutil parent and then hard-kill any surviving descendants.""" + + try: + if proc.status() == psutil.STATUS_ZOMBIE: + # Another ArchiveBox foreground parent owns this Popen and must reap + # it. 
By the time supervisord is a zombie it has already stopped + # accepting work, so the caller can clear stale pid/socket files + # without blocking for a process it cannot reap itself. + kill_remaining_processes(children, timeout=kill_timeout) + return + proc.wait(timeout=timeout) + except psutil.TimeoutExpired: + proc.kill() + kill_remaining_processes(children, timeout=kill_timeout) + try: + proc.wait(timeout=kill_timeout) + except (psutil.NoSuchProcess, psutil.TimeoutExpired): + pass + + +def kill_remaining_processes(processes: list[psutil.Process], *, timeout: float = 2.0) -> None: + _gone, alive = psutil.wait_procs(processes, timeout=timeout) + for process in alive: + try: + process.kill() + except psutil.NoSuchProcess: + pass + psutil.wait_procs(alive, timeout=timeout) + + +@contextmanager +def foreground_shutdown_signals( + handled_signals: tuple[signal.Signals, ...] = (signal.SIGHUP, signal.SIGINT, signal.SIGTERM), + *, + first_signal_message: str | None = "\n[🛑] Got {signal_name}, stopping gracefully...\n", + on_signal: Callable[[signal.Signals], None] | None = None, + raise_on_first_signal: bool = True, +) -> Iterator[ShutdownSignalState]: + """Install foreground signal handlers that print an immediate exit notice. + + Some log-tail loops intentionally swallow KeyboardInterrupt so that callers + can centralize cleanup in finally blocks. The handler writes the signal name + immediately, then raises KeyboardInterrupt to break out of the blocking read. 
+ """ + + global _active_shutdown_state + + if threading.current_thread() is not threading.main_thread(): + yield ShutdownSignalState() + return + + state = ShutdownSignalState() + previous_active_state = _active_shutdown_state + previous_handlers = {sig: signal.getsignal(sig) for sig in handled_signals} + + def raise_keyboard_interrupt(signum, _frame): + sig = signal.Signals(signum) + already_requested = state.signal_name is not None + if not already_requested: + state.signal_name = sig.name + if first_signal_message is not None: + os.write(sys.stdout.fileno(), first_signal_message.format(signal_name=state.signal_name).encode()) + if on_signal is not None: + on_signal(sig) + # Foreground `archivebox add` uses the first signal to abort the active + # hook through the bus-facing runner code, then reserves the second + # signal for hard foreground-command shutdown. Server/update/run and + # other non-interactive commands raise immediately so their finally + # blocks can stop owned children without prompting. + if already_requested: + os.write(sys.stdout.fileno(), f"\n[🛑] Got {sig.name} again, exiting immediately.\n".encode()) + os._exit(130) + if raise_on_first_signal: + raise KeyboardInterrupt + + try: + _active_shutdown_state = state + for sig in handled_signals: + signal.signal(sig, raise_keyboard_interrupt) + yield state + finally: + if state.signal_name and previous_active_state is not None and not previous_active_state.signal_name: + previous_active_state.signal_name = state.signal_name + _active_shutdown_state = previous_active_state + for sig, previous_handler in previous_handlers.items(): + signal.signal(sig, previous_handler) + + +@contextmanager +def foreground_parent_watchdog( + *, + enabled: bool = True, + check_interval: float = 2.0, + shutdown_signal: signal.Signals = signal.SIGTERM, +) -> Iterator[None]: + """Ask a foreground command to exit if its launcher/wrapper disappears. 
+ + `uv run archivebox ...` and similar wrappers can be killed without + delivering a signal to the real Python child. If that child keeps crawling + as an orphan, it can hold SQLite write locks long after the user-facing + command timed out. This watchdog is only for foreground command lifetimes; + daemon/supervisord workers should not use it because their parent may + intentionally hand them off. + """ + + original_ppid = os.getppid() + if not enabled or original_ppid <= 1: + yield + return + + stopped = threading.Event() + + def watch_parent() -> None: + while not stopped.wait(check_interval): + if os.getppid() == original_ppid: + continue + sys.stderr.write("\n[🛑] ArchiveBox parent process exited; stopping foreground command gracefully...\n") + sys.stderr.flush() + os.kill(os.getpid(), shutdown_signal) + return + + thread = threading.Thread(target=watch_parent, name="archivebox-parent-watchdog", daemon=True) + thread.start() + try: + yield + finally: + stopped.set() diff --git a/archivebox/core/snapshot_status.py b/archivebox/core/snapshot_status.py new file mode 100644 index 0000000000..f9df3808ad --- /dev/null +++ b/archivebox/core/snapshot_status.py @@ -0,0 +1,25 @@ +__package__ = "archivebox.core" + +from django.db.models import QuerySet + + +def snapshot_status_values() -> tuple[str, ...]: + from archivebox.core.models import Snapshot + + return tuple(Snapshot.StatusChoices.values) + + +def normalize_snapshot_status(status: str | None) -> str | None: + value = str(status or "").strip().lower() + if not value: + return None + + valid_statuses = snapshot_status_values() + if value not in valid_statuses: + raise ValueError(f"Invalid snapshot status: {status}. 
Expected one of: {', '.join(valid_statuses)}") + return value + + +def filter_snapshots_by_status(queryset: QuerySet, status: str | None) -> QuerySet: + value = normalize_snapshot_status(status) + return queryset.filter(status=value) if value else queryset diff --git a/archivebox/core/sqlite_backend/__init__.py b/archivebox/core/sqlite_backend/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/archivebox/core/sqlite_backend/base.py b/archivebox/core/sqlite_backend/base.py new file mode 100644 index 0000000000..731ef077c2 --- /dev/null +++ b/archivebox/core/sqlite_backend/base.py @@ -0,0 +1,159 @@ +from __future__ import annotations + +import sqlite3 +import time +from collections.abc import Mapping +from itertools import tee +import re + +from django.db.backends.sqlite3.base import DatabaseWrapper as DjangoSQLiteDatabaseWrapper +from django.db.backends.sqlite3.base import SQLiteCursorWrapper as DjangoSQLiteCursorWrapper + + +def _sqlite_lock_retry_timeout() -> float: + from django.conf import settings + + return settings.CONFIG.SQLITE_LOCK_RETRY_TIMEOUT + + +def _sqlite_lock_retry_interval() -> float: + from django.conf import settings + + return settings.CONFIG.SQLITE_LOCK_RETRY_INTERVAL + + +def _format_sql(query: str, params=None) -> str: + compact = " ".join(str(query).split()) + match = re.match(r'^(INSERT INTO|UPDATE|DELETE FROM|SELECT) "?([A-Za-z0-9_]+)"?', compact, flags=re.IGNORECASE) + if match: + compact = f"{match.group(1).upper()} {match.group(2)}" + if params is not None: + if isinstance(params, str): + params_summary = params + elif isinstance(params, (tuple, list)): + preview = ", ".join(repr(param)[:60] for param in params[:4]) + params_summary = f"{len(params)} params: {preview}" + elif isinstance(params, Mapping): + preview = ", ".join(f"{key}={repr(value)[:60]}" for key, value in list(params.items())[:4]) + params_summary = f"{len(params)} params: {preview}" + else: + params_summary = repr(params)[:120] + compact = 
f"{compact} ({params_summary})" + return compact[:260] + + +def _log_locked_database(query: str, params=None, *, attempt: int, elapsed: float, retry_interval: float) -> None: + from rich.console import Console + + from archivebox.misc.db import log_sqlite_lock_holders + + console = Console(stderr=True) + console.print( + f"[yellow][*] SQLite database is locked for {elapsed:.0f}s; retrying in {retry_interval:g}s... attempt={attempt}[/yellow]", + ) + console.print(f"[yellow] Query: {_format_sql(query, params)}[/yellow]") + log_sqlite_lock_holders(console) + + +def _connection_in_transaction(connection) -> bool: + try: + return bool(connection and connection.in_transaction) + except (AttributeError, sqlite3.Error): + return False + + +def _recover_non_atomic_connection(db_wrapper, query: str) -> None: + if db_wrapper is None or getattr(db_wrapper, "in_atomic_block", False): + return + connection = getattr(db_wrapper, "connection", None) + if not _connection_in_transaction(connection): + return + try: + connection.rollback() + except sqlite3.Error: + return + + +def _is_inside_atomic(db_wrapper) -> bool: + return bool(db_wrapper is not None and getattr(db_wrapper, "in_atomic_block", False)) + + +def _abort_locked_database(query: str, params=None, *, elapsed: float, db_wrapper=None) -> None: + _recover_non_atomic_connection(db_wrapper, query) + raise sqlite3.OperationalError( + f"SQLite database remained locked for {elapsed:.0f}s while running {_format_sql(query, params)}; " + "aborting instead of retrying indefinitely", + ) + + +def _retry_locked_database(action, query: str, params=None, *, db_wrapper=None): + attempt = 0 + started_at = time.monotonic() + while True: + try: + return action() + except (sqlite3.OperationalError, Exception) as err: + from archivebox.misc.db import sqlite_lock_error + + if not sqlite_lock_error(err): + raise + attempt += 1 + elapsed = time.monotonic() - started_at + retry_timeout = _sqlite_lock_retry_timeout() + retry_interval = 
_sqlite_lock_retry_interval() + _log_locked_database(query, params, attempt=attempt, elapsed=elapsed, retry_interval=retry_interval) + # If SQLite raised while Django is in autocommit mode, do not keep a + # partially-open sqlite transaction around while waiting. Explicit + # transaction.atomic() callers keep their normal transaction boundary. + _recover_non_atomic_connection(db_wrapper, query) + if _is_inside_atomic(db_wrapper): + raise + if retry_timeout and elapsed >= retry_timeout: + _abort_locked_database(query, params, elapsed=elapsed, db_wrapper=db_wrapper) + time.sleep(retry_interval) + + +class SQLiteCursorWrapper(DjangoSQLiteCursorWrapper): + def execute(self, query, params=None): + if params is None: + return _retry_locked_database( + lambda: super(SQLiteCursorWrapper, self).execute(query), + query, + db_wrapper=getattr(self, "db_wrapper", None), + ) + param_names = list(params) if isinstance(params, Mapping) else None + converted_query = self.convert_query(query, param_names=param_names) + return _retry_locked_database( + lambda: super(DjangoSQLiteCursorWrapper, self).execute(converted_query, params), + converted_query, + params, + db_wrapper=getattr(self, "db_wrapper", None), + ) + + def executemany(self, query, param_list): + peekable, param_list = tee(iter(param_list)) + if (params := next(peekable, None)) and isinstance(params, Mapping): + param_names = list(params) + else: + param_names = None + converted_query = self.convert_query(query, param_names=param_names) + param_list = tuple(param_list) + return _retry_locked_database( + lambda: super(DjangoSQLiteCursorWrapper, self).executemany(converted_query, param_list), + converted_query, + f"{len(param_list)} parameter sets", + db_wrapper=getattr(self, "db_wrapper", None), + ) + + +class DatabaseWrapper(DjangoSQLiteDatabaseWrapper): + def create_cursor(self, name=None): + cursor = self.connection.cursor(factory=SQLiteCursorWrapper) + cursor.db_wrapper = self + return cursor + + def _commit(self): + 
return _retry_locked_database(lambda: super(DatabaseWrapper, self)._commit(), "COMMIT", db_wrapper=self) + + def _rollback(self): + return _retry_locked_database(lambda: super(DatabaseWrapper, self)._rollback(), "ROLLBACK", db_wrapper=self) diff --git a/archivebox/core/tag_util.py b/archivebox/core/tag_util.py new file mode 100644 index 0000000000..04e7ef9b22 --- /dev/null +++ b/archivebox/core/tag_util.py @@ -0,0 +1,294 @@ +from __future__ import annotations + +import json +from collections import defaultdict +from typing import Any +from urllib.parse import unquote + +from django.contrib.auth.models import User +from django.db.models import Count, Exists, F, OuterRef, QuerySet +from django.db.models.functions import Lower +from django.http import HttpRequest +from django.urls import reverse + +from archivebox.config.common import get_config +from archivebox.core.routes_util import build_snapshot_url, build_web_url +from archivebox.core.models import Snapshot, SnapshotTag, Tag + + +TAG_SNAPSHOT_PREVIEW_LIMIT = 10 +TAG_SORT_CHOICES = ( + ("name_asc", "Name A-Z"), + ("name_desc", "Name Z-A"), + ("created_desc", "Created newest"), + ("created_asc", "Created oldest"), + ("snapshots_desc", "Most snapshots"), + ("snapshots_asc", "Fewest snapshots"), +) +TAG_HAS_SNAPSHOTS_CHOICES = ( + ("all", "All"), + ("yes", "Has snapshots"), + ("no", "No snapshots"), +) + + +def normalize_tag_name(name: str) -> str: + return (name or "").strip() + + +def normalize_tag_sort(sort: str = "created_desc") -> str: + valid_sorts = {key for key, _label in TAG_SORT_CHOICES} + return sort if sort in valid_sorts else "created_desc" + + +def normalize_has_snapshots_filter(value: str = "all") -> str: + valid_filters = {key for key, _label in TAG_HAS_SNAPSHOTS_CHOICES} + return value if value in valid_filters else "all" + + +def normalize_created_by_filter(created_by: str = "") -> str: + return created_by if str(created_by).isdigit() else "" + + +def normalize_created_year_filter(year: str = "") 
-> str: + year = (year or "").strip() + return year if len(year) == 4 and year.isdigit() else "" + + +def get_matching_tags( + query: str = "", + sort: str = "created_desc", + created_by: str = "", + year: str = "", + has_snapshots: str = "all", +) -> QuerySet[Tag]: + sort = normalize_tag_sort(sort) + has_snapshots = normalize_has_snapshots_filter(has_snapshots) + needs_snapshot_counts = sort.startswith("snapshots_") + + queryset = Tag.objects.select_related("created_by") + if needs_snapshot_counts: + queryset = queryset.annotate(num_snapshots=Count("snapshot_set", distinct=True)) + + query = normalize_tag_name(query) + if query: + queryset = queryset.filter(name__icontains=query) + + created_by = normalize_created_by_filter(created_by) + if created_by: + queryset = queryset.filter(created_by_id=int(created_by)) + + year = normalize_created_year_filter(year) + if year: + queryset = queryset.filter(created_at__year=int(year)) + + if has_snapshots != "all" and not needs_snapshot_counts: + queryset = queryset.annotate(has_snapshot=Exists(SnapshotTag.objects.filter(tag_id=OuterRef("pk")))) + + if has_snapshots == "yes": + queryset = queryset.filter(num_snapshots__gt=0) if needs_snapshot_counts else queryset.filter(has_snapshot=True) + elif has_snapshots == "no": + queryset = queryset.filter(num_snapshots=0) if needs_snapshot_counts else queryset.filter(has_snapshot=False) + + if sort == "name_asc": + queryset = queryset.order_by(Lower("name"), "id") + elif sort == "name_desc": + queryset = queryset.order_by(Lower("name").desc(), "-id") + elif sort == "created_asc": + queryset = queryset.order_by(F("created_at").asc(nulls_first=True), "id", Lower("name")) + elif sort == "snapshots_desc": + queryset = queryset.order_by(F("num_snapshots").desc(nulls_last=True), F("created_at").desc(nulls_last=True), "-id", Lower("name")) + elif sort == "snapshots_asc": + queryset = queryset.order_by(F("num_snapshots").asc(nulls_first=True), Lower("name"), "id") + else: + queryset = 
queryset.order_by(F("created_at").desc(nulls_last=True), "-id", Lower("name")) + + return queryset + + +def add_snapshot_counts(tags: list[Tag], snapshot_queryset: QuerySet[Snapshot] | None = None) -> None: + tag_ids = [tag.pk for tag in tags] + if not tag_ids: + return + + queryset = SnapshotTag.objects.filter(tag_id__in=tag_ids) + if snapshot_queryset is not None: + queryset = queryset.filter(snapshot_id__in=snapshot_queryset.values("id")) + counts = {int(row["tag_id"]): row["num_snapshots"] for row in queryset.values("tag_id").annotate(num_snapshots=Count("snapshot_id"))} + for tag in tags: + tag.num_snapshots = counts.get(int(tag.pk), 0) + + +def get_tag_creator_choices() -> list[tuple[str, str]]: + rows = ( + Tag.objects.filter(created_by__isnull=False) + .values_list("created_by_id", "created_by__username") + .order_by(Lower("created_by__username"), "created_by_id") + .distinct() + ) + return [(str(user_id), username or f"User {user_id}") for user_id, username in rows] + + +def get_tag_year_choices() -> list[str]: + years = Tag.objects.exclude(created_at__isnull=True).dates("created_at", "year", order="DESC") + return [str(year.year) for year in years] + + +def get_tag_by_ref(tag_ref: str | int) -> Tag: + if isinstance(tag_ref, int): + return Tag.objects.get(pk=tag_ref) + + ref = str(tag_ref).strip() + if ref.isdigit(): + return Tag.objects.get(pk=int(ref)) + + decoded = unquote(ref) + return Tag.objects.get(name__iexact=decoded) + + +def get_or_create_tag(name: str, created_by: User | None = None) -> tuple[Tag, bool]: + normalized_name = normalize_tag_name(name) + if not normalized_name: + raise ValueError("Tag name is required") + + existing = Tag.objects.filter(name__iexact=normalized_name).first() + if existing: + return existing, False + + tag = Tag.objects.create( + name=normalized_name, + created_by=created_by, + ) + return tag, True + + +def rename_tag(tag: Tag, name: str) -> Tag: + normalized_name = normalize_tag_name(name) + if not normalized_name: 
+ raise ValueError("Tag name is required") + + existing = Tag.objects.filter(name__iexact=normalized_name).exclude(pk=tag.pk).first() + if existing: + raise ValueError(f'Tag "{existing.name}" already exists') + + if tag.name != normalized_name: + tag.name = normalized_name + tag.save() + return tag + + +def delete_tag(tag: Tag) -> tuple[int, dict[str, int]]: + return tag.delete() + + +def export_tag_urls(tag: Tag) -> str: + urls = tag.snapshot_set.order_by("-downloaded_at", "-created_at", "-pk").values_list("url", flat=True) + return "\n".join(urls) + + +def export_tag_snapshots_jsonl(tag: Tag) -> str: + snapshots = tag.snapshot_set.order_by("-downloaded_at", "-created_at", "-pk").prefetch_related("tags") + return "\n".join(json.dumps(snapshot.to_json()) for snapshot in snapshots) + + +def _display_snapshot_title(snapshot: Snapshot) -> str: + title = (snapshot.title or "").strip() + url = (snapshot.url or "").strip() + if not title: + return url + + normalized_title = title.lower() + if normalized_title == "pending..." 
or normalized_title == url.lower(): + return url + return title + + +def _build_snapshot_preview(snapshot: Snapshot, request: HttpRequest | None = None, config: Any | None = None) -> dict[str, Any]: + return { + "id": str(snapshot.pk), + "title": _display_snapshot_title(snapshot), + "url": snapshot.url, + "favicon_url": build_snapshot_url(str(snapshot.pk), "favicon.ico", request=request, config=config), + "admin_url": reverse("admin:core_snapshot_change", args=[snapshot.pk]), + "archive_url": build_web_url(f"/{snapshot.archive_path_from_db}/index.html", request=request, config=config), + "downloaded_at": snapshot.downloaded_at.isoformat() if snapshot.downloaded_at else None, + } + + +def _build_snapshot_preview_map( + tags: list[Tag], + request: HttpRequest | None = None, + preview_limit: int = TAG_SNAPSHOT_PREVIEW_LIMIT, +) -> dict[int, list[dict[str, Any]]]: + tag_ids = [tag.pk for tag in tags] + if not tag_ids or preview_limit <= 0: + return {} + + snapshot_tags = ( + SnapshotTag.objects.filter(tag_id__in=tag_ids) + .select_related("snapshot__crawl__created_by") + .order_by( + "tag_id", + F("snapshot__downloaded_at").desc(nulls_last=True), + F("snapshot__created_at").desc(nulls_last=True), + F("snapshot_id").desc(), + ) + ) + + preview_map: dict[int, list[dict[str, Any]]] = defaultdict(list) + config = get_config() + for snapshot_tag in snapshot_tags: + previews = preview_map[snapshot_tag.tag_id] + if len(previews) >= preview_limit: + continue + previews.append(_build_snapshot_preview(snapshot_tag.snapshot, request=request, config=config)) + return preview_map + + +def build_tag_card(tag: Tag, snapshot_previews: list[dict[str, Any]] | None = None) -> dict[str, Any]: + count = tag.__dict__.get("num_snapshots") + if count is None: + count = tag.snapshot_set.count() + return { + "id": tag.pk, + "name": tag.name, + "slug": tag.slug, + "num_snapshots": count, + "filter_url": f"/admin/core/snapshot/?tags__id__exact={tag.pk}", + "edit_url": 
f"/admin/core/tag/{tag.pk}/change/", + "export_urls_url": f"/api/v1/core/tag/{tag.pk}/urls.txt", + "export_jsonl_url": f"/api/v1/core/tag/{tag.pk}/snapshots.jsonl", + "rename_url": f"/api/v1/core/tag/{tag.pk}/rename", + "delete_url": f"/api/v1/core/tag/{tag.pk}/", + "snapshots": snapshot_previews or [], + } + + +def build_tag_cards( + query: str = "", + request: HttpRequest | None = None, + limit: int | None = None, + preview_limit: int = TAG_SNAPSHOT_PREVIEW_LIMIT, + sort: str = "created_desc", + created_by: str = "", + year: str = "", + has_snapshots: str = "all", +) -> list[dict[str, Any]]: + sort = normalize_tag_sort(sort) + has_snapshots = normalize_has_snapshots_filter(has_snapshots) + needs_snapshot_count_annotation = sort.startswith("snapshots_") + queryset = get_matching_tags( + query=query, + sort=sort, + created_by=created_by, + year=year, + has_snapshots=has_snapshots, + ) + if limit is not None: + queryset = queryset[:limit] + + tags = list(queryset) + if not needs_snapshot_count_annotation: + add_snapshot_counts(tags) + preview_map = _build_snapshot_preview_map(tags, request=request, preview_limit=preview_limit) + return [build_tag_card(tag, snapshot_previews=preview_map.get(tag.pk, [])) for tag in tags] diff --git a/archivebox/core/takeover_util.py b/archivebox/core/takeover_util.py new file mode 100644 index 0000000000..d5b39eb16a --- /dev/null +++ b/archivebox/core/takeover_util.py @@ -0,0 +1,347 @@ +from __future__ import annotations + +import time +import sys +from pathlib import Path + +from django.db import IntegrityError +from django.utils import timezone +from archivebox.config import CONSTANTS +from archivebox.config.common import rprint + +RUNNER_ACTIVE_WORKER_TYPE = "worker_runner" +RUNNER_WAITING_WORKER_TYPE = "runner_waiting" +RUNNER_GATE_WORKER_TYPES = (RUNNER_ACTIVE_WORKER_TYPE, RUNNER_WAITING_WORKER_TYPE, "") + + +def runtime_stack_owner_types(): + from archivebox.machine.models import Process + + return ( + 
Process.TypeChoices.SERVER, + Process.TypeChoices.ORCHESTRATOR, + ) + + +def foreground_runner_owner_types(): + from archivebox.machine.models import Process + + return ( + Process.TypeChoices.SERVER, + Process.TypeChoices.ADD, + Process.TypeChoices.UPDATE, + ) + + +def current_command(process_type: str, *, data_dir: str | Path, url: str | None = None): + from archivebox.machine.models import Process + + proc = Process.current() + proc.mark_running(process_type=process_type, pwd=str(data_dir), url=url, timeout=CONSTANTS.MAX_HOOK_RUNTIME_SECONDS) + return proc + + +def live_processes(*, process_type: str, data_dir: str | Path, url: str | None = None): + from archivebox.machine.models import Machine, Process + + qs = Process.objects.filter( + machine=Machine.current(), + process_type=process_type, + status=Process.StatusChoices.RUNNING, + pwd=str(data_dir), + ) + if url is not None: + qs = qs.filter(url=url) + return [proc for proc in qs.order_by("-created_at", "-modified_at").iterator(chunk_size=50) if proc.is_running] + + +def newest_live_process(*, process_type: str, data_dir: str | Path, url: str | None = None): + processes = live_processes(process_type=process_type, data_dir=data_dir, url=url) + return processes[0] if processes else None + + +def command_is_newest(command, *, process_type: str, data_dir: str | Path, url: str | None = None) -> bool: + leader = newest_live_process(process_type=process_type, data_dir=data_dir, url=url) + return bool(leader and leader.id == command.id) + + +def runtime_stack_owner(*, data_dir: str | Path, exclude_id=None): + from archivebox.machine.models import Machine, Process + + machine = Machine.current() + base_qs = Process.objects.filter( + machine=machine, + status=Process.StatusChoices.RUNNING, + pwd=str(data_dir), + process_type__in=runtime_stack_owner_types(), + ) + if exclude_id is not None: + base_qs = base_qs.exclude(id=exclude_id) + + for qs in ( + # Only server parents own HTTP runtime leadership. 
Foreground add/update + # commands can own runner/sonic components, but server startup must never + # wait behind them before binding Daphne. + base_qs.filter(process_type=Process.TypeChoices.SERVER), + # A foreground `archivebox run` process is allowed to own the runtime + # stack when no server/add parent is alive. A runner launched by + # supervisord is only a child worker; after its parent is killed it must + # not keep stealing leadership from the next foreground command. + base_qs.filter(process_type=Process.TypeChoices.ORCHESTRATOR).exclude(parent__process_type=Process.TypeChoices.SUPERVISORD), + ): + for proc in qs.order_by("-created_at", "-modified_at").iterator(chunk_size=50): + if proc.is_running: + return proc + proc.mark_exited(exit_code=proc.exit_code if proc.exit_code is not None else 0) + return None + + +def command_owns_runtime_stack(command, *, data_dir: str | Path) -> bool: + owner = runtime_stack_owner(data_dir=data_dir) + return bool(owner and owner.id == command.id) + + +def foreground_runner_owner(*, data_dir: str | Path, exclude_id=None): + from archivebox.machine.models import Machine, Process + + machine = Machine.current() + qs = Process.objects.filter( + machine=machine, + status=Process.StatusChoices.RUNNING, + pwd=str(data_dir), + process_type__in=foreground_runner_owner_types(), + ) + if exclude_id is not None: + qs = qs.exclude(id=exclude_id) + for proc in qs.order_by("-created_at", "-modified_at").iterator(chunk_size=50): + if proc.is_running: + return proc + proc.mark_exited(exit_code=proc.exit_code if proc.exit_code is not None else 0) + return None + + +def command_owns_foreground_runner(command, *, data_dir: str | Path) -> bool: + owner = foreground_runner_owner(data_dir=data_dir) + return bool(owner and owner.id == command.id) + + +def runtime_stack_component_label(*, owner=None, data_dir: str | Path) -> str: + try: + from archivebox.config.common import get_config + from archivebox.workers.supervisord_util import 
active_supervisord_runtime_components + + components = active_supervisord_runtime_components(config=get_config()) + except Exception: + components = [] + + names = list(components) + if not names and owner is not None: + from archivebox.machine.models import Process + + if owner.process_type == Process.TypeChoices.SERVER: + names = ["orchestrator", "server"] + elif owner.process_type == Process.TypeChoices.ORCHESTRATOR: + names = ["orchestrator"] + + return ", ".join(dict.fromkeys(names)) or "runtime stack" + + +def ensure_daemon_stack(*, reason: str = ""): + from archivebox.config.common import get_config + from archivebox.workers.supervisord_util import ( + get_existing_supervisord_process, + get_or_create_supervisord_process, + get_sonic_supervisord_worker_from_plugin, + get_worker, + start_worker, + ) + + config = get_config() + sonic_worker = get_sonic_supervisord_worker_from_plugin(config) + if sonic_worker is None: + return None + + from abx_plugins.plugins.search_backend_sonic.daemon import is_port_listening, prepare_sonic_daemon + + sonic_event = prepare_sonic_daemon(config) + if is_port_listening(sonic_event.host, sonic_event.port): + return { + "name": sonic_event.worker_name, + "statename": "RUNNING", + "description": f"existing Sonic daemon at {sonic_event.url}", + } + + supervisor = get_existing_supervisord_process() or get_or_create_supervisord_process(daemonize=False) + worker = get_worker(supervisor, sonic_worker["name"]) + if isinstance(worker, dict) and worker.get("statename") in ("STARTING", "RUNNING"): + return worker + + if reason: + rprint(f"[yellow][*] Starting daemon stack for {reason}...[/yellow]") + return start_worker(supervisor, sonic_worker) + + +def healthy_orchestrator(*, data_dir: str | Path): + from archivebox.machine.models import Machine, Process + from archivebox.workers.supervisord_util import get_existing_supervisord_process, get_worker + + supervisor = get_existing_supervisord_process() + worker = get_worker(supervisor, 
"worker_runner") if supervisor else None + if isinstance(worker, dict) and worker.get("statename") in ("STARTING", "RUNNING"): + return worker + + for proc in Process.objects.filter( + machine=Machine.current(), + process_type=Process.TypeChoices.ORCHESTRATOR, + status=Process.StatusChoices.RUNNING, + pwd=str(data_dir), + ).order_by("-created_at"): + if proc.is_running: + return proc + return None + + +def _runner_sort_key(process): + return (process.started_at or process.created_at, process.created_at, str(process.id)) + + +def live_runner_processes(*, data_dir: str | Path, exclude_id=None): + from archivebox.machine.models import Machine, Process + + machine = Machine.current() + Process.cleanup_stale_running(machine=machine) + qs = Process.objects.filter( + machine=machine, + status=Process.StatusChoices.RUNNING, + process_type=Process.TypeChoices.ORCHESTRATOR, + worker_type__in=RUNNER_GATE_WORKER_TYPES, + pwd=str(data_dir), + ) + if exclude_id is not None: + qs = qs.exclude(id=exclude_id) + return [process for process in qs.order_by("started_at", "created_at").iterator(chunk_size=20) if process.is_running] + + +def enter_single_runner_gate(command, *, data_dir: str | Path, graceful_timeout: float = 5.0) -> bool: + """ + Admit exactly one active runner for this DATA_DIR using Process rows. + + The current process is a real OS process while it waits, so we keep its + Process row RUNNING but mark worker_type=runner_waiting. Only the process + that wins takeover is promoted to worker_type=worker_runner, which is + protected by a partial unique DB constraint. Older runners are terminated + and fully waited out before promotion, so the runner work loop never overlaps. 
+ """ + from archivebox.machine.models import Process + + command.mark_running( + process_type=Process.TypeChoices.ORCHESTRATOR, + worker_type=RUNNER_WAITING_WORKER_TYPE, + pwd=str(data_dir), + timeout=CONSTANTS.MAX_HOOK_RUNTIME_SECONDS, + ) + while True: + runners = live_runner_processes(data_dir=data_dir) + if all(process.id != command.id for process in runners): + command.refresh_from_db() + command.mark_running( + process_type=Process.TypeChoices.ORCHESTRATOR, + worker_type=RUNNER_WAITING_WORKER_TYPE, + pwd=str(data_dir), + timeout=CONSTANTS.MAX_HOOK_RUNTIME_SECONDS, + ) + runners = live_runner_processes(data_dir=data_dir) + + newest = max(runners, key=_runner_sort_key) + if newest.id != command.id: + rprint( + f"[yellow][*] Newer ArchiveBox runner pid={newest.pid} is taking over; exiting this runner.[/yellow]", + file=sys.stderr, + ) + return False + + older_runners = [process for process in runners if process.id != command.id] + if older_runners: + for process in older_runners: + rprint(f"[yellow][*] Stopping older ArchiveBox runner process (pid={process.pid})...[/yellow]", file=sys.stderr) + process.kill_tree(graceful_timeout=graceful_timeout) + time.sleep(0.1) + continue + + try: + command.mark_running( + process_type=Process.TypeChoices.ORCHESTRATOR, + worker_type=RUNNER_ACTIVE_WORKER_TYPE, + pwd=str(data_dir), + timeout=CONSTANTS.MAX_HOOK_RUNTIME_SECONDS, + ) + return True + except IntegrityError: + # A hard-killed runner may leave the unique active-runner row behind. + # The next loop starts by proving each RUNNING Process row still maps + # to a live OS process, then marks dead rows EXITED before retrying. 
def standby_until_leader_needed(command, *, process_type: str, data_dir: str | Path, url: str | None = None, interval: float = 2.0) -> None:
    """Block until ``command`` is the newest process of its kind.

    While another process is newer, each iteration reaps the foreground
    supervisord process, announces the current leader once, sleeps for
    ``interval`` seconds and then bumps ``command.modified_at`` and saves it.

    Args:
        command: Process row for the current command.
        process_type: Process type passed through to the leadership checks.
        data_dir: Collection directory passed through to the leadership checks.
        url: Optional URL passed through to the leadership checks.
        interval: Seconds to sleep between checks.
    """
    from archivebox.workers.supervisord_util import reap_foreground_supervisord_process

    lookup = {"process_type": process_type, "data_dir": data_dir, "url": url}
    already_announced = False
    while True:
        if command_is_newest(command, **lookup):
            return
        reap_foreground_supervisord_process()
        if not already_announced:
            leader = newest_live_process(**lookup)
            leader_pid = leader.pid if leader else "unknown"
            rprint(f"[yellow][*] Standing by; newer ArchiveBox process pid={leader_pid} is running the orchestrator and server.[/yellow]")
            already_announced = True
        time.sleep(interval)
        # Touch the row after every wait so modified_at keeps advancing.
        command.modified_at = timezone.now()
        command.save(update_fields=["modified_at"])
def standby_until_foreground_runner_needed(command, *, data_dir: str | Path, interval: float = 2.0) -> dict[str, object]:
    """Block until ``command`` owns the foreground runner for ``data_dir``.

    While another process owns it, each iteration reaps the foreground
    supervisord process, prints a one-time takeover notice to stderr, sleeps
    for ``interval`` seconds and then bumps ``command.modified_at``.

    Returns:
        ``{"resumed": bool, "previous_owner_pid": ...}`` where ``resumed`` is
        true only if this call actually had to wait, and
        ``previous_owner_pid`` is the pid announced at that time (the string
        ``"unknown"`` when no owner row was found, ``None`` if it never waited).
    """
    from archivebox.workers.supervisord_util import reap_foreground_supervisord_process

    waited = False
    previous_owner_pid = None
    while True:
        if command_owns_foreground_runner(command, data_dir=data_dir):
            break
        reap_foreground_supervisord_process()
        if not waited:
            owner = foreground_runner_owner(data_dir=data_dir)
            owner_pid = owner.pid if owner else "unknown"
            previous_owner_pid = owner_pid
            notice = (
                f"[yellow][*] A newer archivebox process took over the orchestrator, sonic "
                f"(pid={owner_pid}). Work will continue there, and will resume here if that process exits and work still remains.[/yellow]"
            )
            rprint(notice, file=sys.stderr)
            waited = True
        time.sleep(interval)
        # Touch the row after every wait so modified_at keeps advancing.
        command.modified_at = timezone.now()
        command.save(update_fields=["modified_at"])
    return {"resumed": waited, "previous_owner_pid": previous_owner_pid}
Truncator + +from pathlib import Path + +from abx_plugins.plugins.archivewebpage.replay_preview import is_replay_target as is_archivewebpage_replay_target + +from archivebox.plugins.discovery import ( + get_plugin_icon, + get_plugin_template, + get_plugin_name, +) +from archivebox.core.routes_util import ( + canonical_base_host_for_request, + get_admin_base_url, + get_web_base_url, + get_snapshot_base_url, + build_snapshot_url, +) + + +register = template.Library() + +_TEXT_PREVIEW_EXTS = (".json", ".jsonl", ".txt", ".csv", ".tsv", ".xml", ".yml", ".yaml", ".md", ".log") +_IMAGE_PREVIEW_EXTS = (".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".ico", ".avif") +_MHTML_PREVIEW_EXTS = (".mhtml", ".mht") + +_MEDIA_FILE_EXTS = { + ".mp4", + ".webm", + ".mkv", + ".avi", + ".mov", + ".flv", + ".wmv", + ".m4v", + ".mpg", + ".mpeg", + ".ts", + ".m2ts", + ".mts", + ".3gp", + ".3g2", + ".ogv", + ".mp3", + ".m4a", + ".aac", + ".ogg", + ".oga", + ".opus", + ".wav", + ".flac", + ".alac", + ".aiff", + ".wma", + ".mka", + ".ac3", + ".eac3", + ".dts", +} + + +def _normalize_output_files(output_files: Any) -> dict[str, dict[str, Any]]: + if isinstance(output_files, dict): + normalized: dict[str, dict[str, Any]] = {} + for path, metadata in output_files.items(): + if not path: + continue + normalized[str(path)] = dict(metadata) if isinstance(metadata, dict) else {} + return normalized + return {} + + +def _snapshot_id(value: Any) -> Any: + from archivebox.core.models import Snapshot + + return value.id if isinstance(value, Snapshot) else value + + +def _coerce_output_file_size(value: Any) -> int | None: + try: + return max(int(value or 0), 0) + except (TypeError, ValueError): + return None + + +def _count_media_files(result) -> int: + try: + output_files = _normalize_output_files(result.output_files or {}) + except Exception: + output_files = {} + + if output_files: + return sum(1 for path in output_files.keys() if Path(path).suffix.lower() in _MEDIA_FILE_EXTS) + + try: + plugin_dir 
def _list_media_files(result) -> list[dict]:
    """List the media files produced by an archive result.

    Entries recorded in ``result.output_files`` with a media extension are
    preferred. If none are found and the plugin directory exists on disk,
    the first 2000 directory entries under it are scanned instead.

    Returns:
        A list of ``{"name", "path", "size"}`` dicts sorted by lower-cased
        name. ``path`` is ``<plugin>/<relative path>`` and ``size`` may be
        ``None``. An empty list is returned when the plugin directory path
        cannot be built.
    """
    try:
        plugin_dir = Path(result.snapshot_dir) / result.plugin
    except Exception:
        return []

    recorded = _normalize_output_files(result.output_files or {})
    found: list[tuple[Path, int | None]] = []
    for raw_path, metadata in recorded.items():
        relative = Path(raw_path)
        if relative.suffix.lower() not in _MEDIA_FILE_EXTS:
            continue
        found.append((relative, _coerce_output_file_size(metadata.get("size"))))

    if not found and plugin_dir.exists():
        scan_limit = 2000
        for position, entry in enumerate(plugin_dir.rglob("*")):
            if position >= scan_limit:
                break
            if not entry.is_file() or entry.suffix.lower() not in _MEDIA_FILE_EXTS:
                continue
            try:
                relative = entry.relative_to(plugin_dir)
            except ValueError:
                continue
            try:
                byte_count = entry.stat().st_size
            except OSError:
                byte_count = None
            found.append((relative, byte_count))

    listing: list[dict] = [
        {
            "name": relative.name,
            "path": str(Path(result.plugin) / relative),
            "size": byte_count,
        }
        for relative, byte_count in found
    ]
    listing.sort(key=lambda entry: entry["name"].lower())
    return listing
output_file.resolve() + snap_dir = Path(snapshot_dir).resolve() + if snap_dir not in output_file.parents and output_file != snap_dir: + return None + except Exception: + return None + + if output_file.exists() and output_file.is_file(): + return output_file + return None + + +def _is_text_preview_path(raw_output_path: str | None) -> bool: + return (raw_output_path or "").lower().endswith(_TEXT_PREVIEW_EXTS) + + +def _is_image_preview_path(raw_output_path: str | None) -> bool: + return (raw_output_path or "").lower().endswith(_IMAGE_PREVIEW_EXTS) + + +def _is_root_snapshot_output_path(raw_output_path: str | None) -> bool: + normalized = str(raw_output_path or "").strip().lower() + return normalized in ("", ".", "./", "/", "index.html", "index.json") + + +def _build_snapshot_files_url(snapshot_id: str, request=None, config=None) -> str: + return build_snapshot_url(str(snapshot_id), "/?files=1", request=request, config=config) + + +def _build_snapshot_preview_url(snapshot_id: str, path: str = "", request=None, config=None) -> str: + if path == "about:blank": + return path + if _is_root_snapshot_output_path(path): + return _build_snapshot_files_url(snapshot_id, request=request, config=config) + url = build_snapshot_url(str(snapshot_id), path, request=request, config=config) + if not ( + _is_text_preview_path(path) + or _is_image_preview_path(path) + or (path or "").lower().endswith(_MHTML_PREVIEW_EXTS) + or is_archivewebpage_replay_target(path or "") + ): + return url + separator = "&" if "?" in url else "?" 
@register.filter
def file_size(num_bytes: int | float) -> str:
    """Format a byte count as a human-readable size string.

    The value is divided by 1024 until its magnitude drops below 1024 and is
    rendered with one decimal place and a ``Bytes``/``KB``/``MB``/``GB``/``TB``
    suffix.

    Args:
        num_bytes: Size in bytes. ``None`` and other falsy values are treated
            as 0; numeric strings are accepted.

    Returns:
        The formatted size, e.g. ``"1.5 KB"``. When the value cannot be
        interpreted as a number, ``str(num_bytes or "")`` is returned instead
        of raising, because an exception inside a template filter surfaces
        as a server error.
    """
    try:
        size = float(num_bytes or 0)
    except (TypeError, ValueError, OverflowError):
        return str(num_bytes or "")

    for unit in ("Bytes", "KB", "MB", "GB"):
        if -1024.0 < size < 1024.0:
            return f"{size:3.1f} {unit}"
        size /= 1024.0
    return f"{size:3.1f} TB"
@register.inclusion_tag("system_warnings_banner.html", takes_context=True)
def system_warnings_banner(context):
    """Render the top-of-page warning banner for one of the conditions below,
    in priority order (highest first):

    1. ``mode="unconfigured"`` — ``BASE_URL`` is empty. Security/correctness
       issue: until it's pinned, generated URLs can echo any Host the client
       sends, and admin/web/api routing has no canonical anchor.
    2. ``mode="unsafe"`` — ``SERVER_SECURITY_MODE`` is a non-subdomain
       mode. Archived pages share an origin with privileged routes.
    3. ``mode="low_disk"`` — ``DATA_DIR`` has <1 GiB free; new archive
       jobs will start failing on ENOSPC.
    4. ``mode="high_memory"`` — virtual memory utilization at/above 95%; the
       host is one OOM-kill from a crash.
    5. ``mode="high_load"`` — 15-minute load average exceeds 3 × CPU count
       (the kernel's own sustained-load EMA, so no rolling buffer of ours is
       needed).

    Config/security warnings come first because they affect correctness +
    security and need explicit operator action; host-health warnings come
    after and reuse ``machine.detect.get_host_stats`` (the same function that
    populates the Machine admin page), cached for 30s.

    Returns a dict for the template: always a ``mode`` key (empty string when
    nothing needs reporting) plus the pre-formatted figures for that mode.
    """
    config = context.get("CONFIG")
    if config is None:
        # No config in the template context: load one without resolving plugins.
        from archivebox.config.common import get_config

        config = get_config(resolve_plugins=False)

    if not config.BASE_URL:
        return _unconfigured_banner_context(context.get("request"))
    if not config.USES_SUBDOMAIN_ROUTING:
        return {"mode": "unsafe"}

    stats = _machine_health_stats()
    free_gb = stats.get("disk_data_free_gb")
    if isinstance(free_gb, (int, float)) and free_gb < _LOW_DISK_THRESHOLD_GB:
        return {"mode": "low_disk", "free_gb": f"{free_gb:.2f}"}

    mem_pct = stats.get("mem_virt_used_pct")
    if isinstance(mem_pct, (int, float)) and mem_pct >= _HIGH_MEMORY_THRESHOLD_PCT:
        return {"mode": "high_memory", "mem_pct": f"{mem_pct:.1f}"}

    cpu_load = stats.get("cpu_load") or ()
    cpu_count = stats.get("cpu_count") or 1
    # ``cpu_load`` is the (1min, 5min, 15min) tuple from psutil.getloadavg();
    # we take the 15-min figure because the operator's threshold was
    # "sustained for 15min" and the kernel already maintains that EMA.
    load_15 = cpu_load[2] if isinstance(cpu_load, (list, tuple)) and len(cpu_load) >= 3 else None
    if isinstance(load_15, (int, float)) and load_15 > _HIGH_LOAD_MULTIPLE * cpu_count:
        return {
            "mode": "high_load",
            "load_15": f"{load_15:.2f}",
            "cpu_count": cpu_count,
            "load_threshold": _HIGH_LOAD_MULTIPLE * cpu_count,
        }

    return {"mode": ""}
@register.simple_tag(takes_context=True)
def url_replace(context, **kwargs):
    """Return the current query string with the given parameters replaced.

    Usage: ``?{% url_replace page=2 %}``

    Args:
        context: Template context; must contain ``request``.
        **kwargs: Query parameters to set on a copy of ``request.GET``.

    Returns:
        The URL-encoded query string (without the leading ``?``).
    """
    query = context["request"].GET.copy()
    for key, value in kwargs.items():
        # Item assignment replaces the key's value list. QueryDict.update()
        # appends instead, which turned ?page=1 into page=1&page=2.
        query[key] = value
    return query.urlencode()
@register.simple_tag(takes_context=True)
def snapshot_url(context, snapshot, path: str = "") -> str:
    """Build the URL for ``path`` inside a snapshot.

    ``snapshot`` may be a Snapshot instance or an id; ``request`` and
    ``CONFIG`` are taken from the template context when present.
    """
    target_id = str(_snapshot_id(snapshot))
    routing = {"request": context.get("request"), "config": context.get("CONFIG")}
    return build_snapshot_url(target_id, path, **routing)
timezone.localtime(bookmarked_at) + date_text = bookmarked_at.strftime("%Y-%m-%d") + time_text = bookmarked_at.strftime("%H:%M") + sort_value = str(bookmarked_at.timestamp()) + title_time = f"Bookmarked: {bookmarked_at} ({timestamp})" + else: + date_text = "" + time_text = "" + sort_value = "" + title_time = f"Bookmarked: ({timestamp})" + + url = getattr(link, "url", "") or "" + title = getattr(link, "title", "") or "" + is_pending = status in {"queued", "started", "backoff"} + title_text = title or ("Loading..." if is_pending else url) + tags_str = link.tags_str() if callable(getattr(link, "tags_str", None)) else getattr(link, "tags_str", "") + tag_html = "".join(f'<span class="snapshot-tag">{escape(tag)}</span>' for tag in (tags_str or "").split(",") if tag) + if tag_html: + tag_cell = f'<span class="snapshot-tags">{tag_html}</span>' + else: + tag_cell = '<span class="empty-value">...</span>' + + num_outputs = int(getattr(link, "num_outputs", 0) or 0) + icons = link.icons() if callable(getattr(link, "icons", None)) else getattr(link, "icons", "") + icons_cell = str(icons) if icons else '<span class="empty-value">...</span>' + archive_size = int(getattr(link, "archive_size", 0) or 0) + size_cell = file_size(archive_size) if archive_size else '<span class="empty-value">...</span>' + output_plural = "" if num_outputs == 1 else "s" + + if is_pending: + preview_html = ( + '<span class="snapshot-preview snapshot-preview-spinner" aria-label="Archiving in progress">' + f'<img src="{escape(static("spinner.gif"))}" alt="" decoding="async" loading="lazy">' + "</span>" + ) + else: + preview_html = ( + f'<img src="{escape(screenshot_plugin_url)}" ' + f'data-fallbacks="{escape(extension_screenshot_1_url)},{escape(extension_screenshot_url)}" ' + 'onerror="nextPublicSnapshotPreview(this)" class="snapshot-preview screenshot" alt="" decoding="async" loading="lazy">' + ) + + html = f""" +<tr class="snapshot-row status-{escape(status)}"> + <td class="snapshot-time" 
@register.simple_tag(takes_context=True)
def snapshot_preview_url(context, snapshot, path: str = "") -> str:
    """Build the preview URL for ``path`` inside a snapshot.

    ``snapshot`` may be a Snapshot instance or an id; ``request`` and
    ``CONFIG`` are taken from the template context when present. The URL
    itself is produced by ``_build_snapshot_preview_url``.
    """
    target_id = str(_snapshot_id(snapshot))
    routing = {"request": context.get("request"), "config": context.get("CONFIG")}
    return _build_snapshot_preview_url(target_id, path, **routing)
+ + Usage: {% plugin_icon "screenshot" %} + """ + icon_html = get_plugin_icon(plugin) + return mark_safe( + f'<span class="abx-plugin-icon" style="display:inline-flex; width:20px; height:20px; align-items:center; justify-content:center;">{icon_html}</span>', + ) + + +@register.simple_tag(takes_context=True) +def plugin_card(context, result) -> str: + """ + Render the card template for an archive result. + + Usage: {% plugin_card result %} + + Context variables passed to template: + - result: ArchiveResult object + - snapshot: Parent Snapshot object + - output_path: Path to output relative to snapshot dir (from embed_path()) + - plugin: Plugin base name + """ + from archivebox.core.models import ArchiveResult + + if result is None or not isinstance(result, ArchiveResult): + return "" + + plugin = get_plugin_name(result.plugin) + template_str = get_plugin_template(plugin, "card") + + # Use embed_path() for the display path + raw_output_path = result.embed_path() or "" + output_url = build_snapshot_url( + str(result.snapshot_id), + raw_output_path or "", + request=context.get("request"), + config=context.get("CONFIG"), + ) + + icon_html = get_plugin_icon(plugin) + plugin_lower = (plugin or "").lower() + media_file_count = _count_media_files(result) if plugin_lower in ("ytdlp", "yt-dlp", "youtube-dl") else 0 + media_files = _list_media_files(result) if plugin_lower in ("ytdlp", "yt-dlp", "youtube-dl") else [] + if media_files: + snapshot_id = str(result.snapshot_id) + request = context.get("request") + config = context.get("CONFIG") + for item in media_files: + path = item.get("path") or "" + item["url"] = build_snapshot_url(snapshot_id, path, request=request, config=config) if path else "" + + output_lower = (raw_output_path or "").lower() + force_text_preview = output_lower.endswith(_TEXT_PREVIEW_EXTS) + + # Create a mini template and render it with context + try: + if template_str and raw_output_path and str(raw_output_path).strip() not in (".", "/", "./") and not 
@register.simple_tag
def output_card(snapshot, output_path: str, plugin: str) -> str:
    """Render a compact card for one output file of a snapshot.

    A text snippet preview of the file is used when one can be rendered;
    otherwise a fallback card labelled ``text`` (for text-preview
    extensions) or ``output`` is returned.
    """
    base_name = get_plugin_name(plugin)
    icon = get_plugin_icon(base_name)

    snippet_card = _render_text_file_preview(snapshot.output_dir, output_path, base_name, icon)
    if snippet_card:
        return mark_safe(snippet_card)

    if (output_path or "").lower().endswith(_TEXT_PREVIEW_EXTS):
        label = "text"
    else:
        label = "output"
    return mark_safe(_render_fallback_card(base_name, icon, label))
@register.simple_tag(takes_context=True)
def api_token(context) -> str:
    """
    Return an API token string for the logged-in user, creating one if needed.

    Returns an empty string when the context has no request, the request has
    no user, the user is not authenticated, or no token could be obtained.
    """
    from archivebox.api.auth import get_or_create_api_token

    request = context.get("request")
    # context.get() yields None when the template was rendered without a
    # request; fall through to the anonymous case instead of raising.
    user = getattr(request, "user", None)
    if not user or not user.is_authenticated:
        return ""

    token = get_or_create_api_token(user)
    return token.token if token else ""
CONSTANTS.STATIC_DIR, "path": "favicon.ico"}), + path("docs/", RedirectView.as_view(url="https://github.com/ArchiveBox/ArchiveBox/wiki"), name="Docs"), + re_path(r"^admin/agent/?(?=$|opencode)", include("abx_plugins.plugins.opencode.urls")), + re_path(r"^(?P<path>assets/.*)$", opencode_proxy_view, name="opencode-assets"), + path("public/search-stream/", public_snapshot_search_stream_view, name="public-search-stream"), + path("public/", PublicIndexView.as_view(), name="public-index"), + path("public.html", RedirectView.as_view(url="/public/"), name="public-index-html"), + path("archive/", RedirectView.as_view(url="/")), + path("archive/<path:path>", SnapshotView.as_view(), name="Snapshot"), + re_path(r"^snapshot\/(?P<snapshot_id>[0-9a-fA-F-]{8,36})(?:\/(?P<path>.*))?$", SnapshotReplayView.as_view(), name="snapshot-replay"), + re_path(r"^original\/(?P<domain>[^/]+)(?:\/(?P<path>.*))?$", OriginalDomainReplayView.as_view(), name="original-replay"), + re_path(r"^web/(?P<url>(?!\d{4}(?:\d{2})?(?:\d{2})?(?:/|$)).+)$", WebAddView.as_view(), name="web-add"), + re_path( + r"^(?P<username>[^/]+)/(?P<date>\d{4}(?:\d{2})?(?:\d{2})?)/(?P<url>https?://.*)$", + SnapshotPathView.as_view(), + name="snapshot-path-url", + ), + re_path( + r"^(?P<username>[^/]+)/(?P<date>\d{4}(?:\d{2})?(?:\d{2})?)/(?P<domain>[^/]+)(?:/(?P<snapshot_id>[0-9a-fA-F-]{8,36})(?:/(?P<path>.*))?)?$", + SnapshotPathView.as_view(), + name="snapshot-path", + ), + re_path(r"^(?P<username>[^/]+)/(?P<url>https?://.*)$", SnapshotPathView.as_view(), name="snapshot-path-url-nodate"), + re_path( + r"^(?P<username>[^/]+)/(?P<domain>[^/]+)(?:/(?P<snapshot_id>[0-9a-fA-F-]{8,36})(?:/(?P<path>.*))?)?$", + SnapshotPathView.as_view(), + name="snapshot-path-nodate", + ), + path("admin/core/snapshot/add/", RedirectView.as_view(url="/add/")), + path("admin/core/snapshot/replay-auth/", SnapshotReplayAuthView.as_view(), name="snapshot-replay-auth"), + path("add/", AddView.as_view(), name="add"), + # ``query_string=True`` preserves 
the ``?next=โ€ฆ`` param that Django's + # auth/login mixins append, so e.g. ``UserPassesTestMixin`` redirecting + # an unauthenticated ``/add`` visitor to ``/accounts/login/?next=/add/`` + # carries the ``next`` through to ``/admin/login/`` and lands them at + # ``/add/`` after login instead of the admin homepage. + path("accounts/login/", RedirectView.as_view(url="/admin/login/", query_string=True)), + path("accounts/logout/", RedirectView.as_view(url="/admin/logout/", query_string=True)), + path("accounts/", include("django.contrib.auth.urls")), + path("progress.json", live_progress_view, name="live_progress"), + path("admin/", archivebox_admin.urls), + path("api/", include("archivebox.api.urls"), name="api"), + path("health/", HealthCheckView.as_view(), name="healthcheck"), + path("error/", lambda request: _raise_test_error(request)), + # path('jet_api/', include('jet_django.urls')), Enable to use https://www.jetadmin.io/integrations/django + path("index.html", RedirectView.as_view(url="/")), + path("", HomepageView.as_view(), name="Home"), +] + + +def _raise_test_error(_request: HttpRequest): + raise ZeroDivisionError("Intentional test error route") + + +if getattr(settings, "DEBUG_TOOLBAR", False): + urlpatterns += [path("__debug__/", include("debug_toolbar.urls"))] + +if getattr(settings, "DEBUG_REQUESTS_TRACKER", False) and find_spec("requests_tracker"): + urlpatterns += [path("__requests_tracker__/", include("requests_tracker.urls"))] + + +# # Proposed FUTURE URLs spec +# path('', HomepageView) +# path('/add', AddView) +# path('/public', PublicIndexView) +# path('/snapshot/:slug', SnapshotView) + +# path('/admin', admin.site.urls) +# path('/accounts', django.contrib.auth.urls) + +# # Proposed REST API spec +# # :slugs can be uuid, short_uuid, or any of the unique index_fields +# path('api/v1/'), +# path('api/v1/core/' [GET]) +# path('api/v1/core/snapshot/', [GET, POST, PUT]), +# path('api/v1/core/snapshot/:slug', [GET, PATCH, DELETE]), +# 
path('api/v1/core/archiveresult', [GET, POST, PUT]), +# path('api/v1/core/archiveresult/:slug', [GET, PATCH, DELETE]), +# path('api/v1/core/tag/', [GET, POST, PUT]), +# path('api/v1/core/tag/:slug', [GET, PATCH, DELETE]), + +# path('api/v1/cli/', [GET]) +# path('api/v1/cli/{add,list,config,...}', [POST]), # pass query as kwargs directly to `run_subcommand` and return stdout, stderr, exitcode + +# path('api/v1/extractors/', [GET]) +# path('api/v1/extractors/:extractor/', [GET]), +# path('api/v1/extractors/:extractor/:func', [GET, POST]), # pass query as args directly to chosen function + +# future, just an idea: +# path('api/v1/scheduler/', [GET]) +# path('api/v1/scheduler/task/', [GET, POST, PUT]), +# path('api/v1/scheduler/task/:slug', [GET, PATCH, DELETE]), diff --git a/archivebox/core/views.py b/archivebox/core/views.py new file mode 100644 index 0000000000..f7aa21018e --- /dev/null +++ b/archivebox/core/views.py @@ -0,0 +1,1819 @@ +__package__ = "archivebox.core" + +import json +import os +import posixpath +from glob import glob, escape +from django.utils import timezone +from typing import cast +from pathlib import Path +from urllib.parse import quote, urlparse + +from django.shortcuts import render, redirect +from django.http import HttpRequest, HttpResponse, Http404, HttpResponseForbidden, QueryDict +from django.utils.html import format_html +from django.utils.safestring import mark_safe +from django.views import View +from django.views.generic.list import ListView +from django.views.generic import FormView +from django.db.models import Case, IntegerField, Q, Value, When +from django.core.paginator import InvalidPage +from django.contrib import messages +from django.conf import settings +from django.contrib.auth import HASH_SESSION_KEY, SESSION_KEY, get_user_model +from django.contrib.auth.mixins import UserPassesTestMixin +from django.contrib.sessions.models import Session +from django.core import signing +from django.views.decorators.csrf import 
csrf_exempt +from django.utils.decorators import method_decorator + +from admin_data_views.typing import TableContext, ItemContext, SectionData +from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink + +from abx_plugins.plugins.archivewebpage import replay_preview as archivewebpage_replay + +from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, VERSION +from archivebox.config.common import ( + SENSITIVE_CONFIG_VALUE_REDACTED, + find_config_default, + find_config_section, + find_config_source, + find_config_type, + get_config, + get_all_configs, + get_request_config, + _plugin_config_properties, + redact_sensitive_config, +) +from archivebox.config.common import PLUGIN_CONFIG_SCHEMAS +from archivebox.config.configset import BaseConfigSet +from archivebox.misc.paginators import AcceleratedPaginator +from archivebox.misc.util import ( + base_url, + filter_queryset_by_uuid_substring, + htmlencode, + ts_to_date_str, + urldecode, + without_fragment, +) +from archivebox.misc.serve_static import serve_static_with_byterange_support +from archivebox.misc.logging_util import printable_filesize +from archivebox.search.config import ( + get_search_mode, + get_search_mode_backend, + get_search_mode_base, + get_search_mode_options, +) +from archivebox.search.views import get_cached_public_search_state + +from archivebox.core.models import ArchiveResult, Snapshot, SnapshotTag +from archivebox.core.permissions import ( + PERMISSIONS_PRIVATE, + PERMISSIONS_PUBLIC, + PERMISSIONS_UNLISTED, + can_view_snapshot, + direct_snapshots_queryset, + filter_personas_by_permissions, + get_snapshot_permissions, + is_admin_user, + public_snapshots_queryset, +) +from archivebox.core.routes_util import ( + build_admin_url, + build_snapshot_url, + build_web_url, + get_admin_host, + get_snapshot_host, + get_snapshot_lookup_key, + get_web_host, + host_matches, +) +from archivebox.core.forms import AddLinkForm +from archivebox.plugins.forms import 
get_plugin_config_binary_urls +from archivebox.crawls.models import Crawl +from archivebox.plugins.discovery import discover_plugin_configs +from archivebox.plugins.views import get_config_definition_link +from archivebox.progressmonitor.views import live_progress_view, progress_endpoint + + +def _files_index_target(snapshot: Snapshot, archivefile: str | None) -> str: + target = archivefile or "" + if target == "index.html": + target = "" + fullpath = Path(snapshot.output_dir) / target + if fullpath.is_file(): + target = str(Path(target).parent) + if target == ".": + target = "" + return target + + +def _find_snapshot_by_ref(snapshot_ref: str) -> Snapshot | None: + lookup = get_snapshot_lookup_key(snapshot_ref) + if not lookup: + return None + + if len(lookup) == 12 and "-" not in lookup: + return Snapshot.objects.filter(id__endswith=lookup).order_by("-created_at", "-downloaded_at").first() + + try: + return Snapshot.objects.get(pk=lookup) + except Snapshot.DoesNotExist: + try: + return Snapshot.objects.get(id__startswith=lookup) + except Snapshot.DoesNotExist: + return None + except Snapshot.MultipleObjectsReturned: + return Snapshot.objects.filter(id__startswith=lookup).first() + + +def _admin_login_redirect_or_forbidden(request: HttpRequest): + if get_request_config(request).CONTROL_PLANE_ENABLED: + return redirect(f"/admin/login/?next={request.path}") + return HttpResponseForbidden("ArchiveBox is running with the control plane disabled in this security mode.") + + +REPLAY_AUTH_SALT = "archivebox.private-snapshot-replay" +REPLAY_COOKIE_PREFIX = f"archivebox_replay_{CONSTANTS.COLLECTION_ID}_" +REPLAY_GRANT_MAX_AGE = 60 + + +def _replay_cookie_name(snapshot: Snapshot) -> str: + return f"{REPLAY_COOKIE_PREFIX}{str(snapshot.id).replace('-', '')[-12:]}" + + +def _clean_replay_next(path: str | None) -> str: + """Only allow same-snap relative replay paths; grants must never redirect off-host.""" + path = f"/{(path or 'index.html').lstrip('/')}" + parsed = 
urlparse(path) + if parsed.scheme or parsed.netloc or path.startswith("//"): + return "/index.html" + return path + + +def _replay_payload_is_valid(payload: dict, snapshot: Snapshot) -> bool: + """A replay cookie is not its own auth source; it must point at a live admin session. + + Replayed pages can execute hostile JS, so admin cookies stay host-only on admin.*. + The snap host gets only this host-only HttpOnly cookie, and every request checks + that the original Django session still exists and still belongs to an active staff user. + Logout, session expiry, user deletion/deactivation, or password auth-hash rotation all + make the replay cookie inert without needing admin.* to delete a cookie on snap-*. + """ + if payload.get("snapshot_id") != str(snapshot.id): + return False + try: + session = Session.objects.get(session_key=str(payload.get("session_key") or "")) + session_data = session.get_decoded() + user_id = str(session_data.get(SESSION_KEY) or "") + auth_hash = str(session_data.get(HASH_SESSION_KEY) or "") + user = get_user_model().objects.get(pk=user_id) + except Exception: + return False + return ( + str(payload.get("user_id")) == user_id + and str(payload.get("auth_hash") or "") == auth_hash + and user.is_active + and user.is_staff + and auth_hash == user.get_session_auth_hash() + ) + + +def _has_replay_cookie(request: HttpRequest, snapshot: Snapshot) -> bool: + value = request.COOKIES.get(_replay_cookie_name(snapshot)) + if not value: + return False + try: + payload = signing.loads(value, salt=REPLAY_AUTH_SALT, max_age=settings.SESSION_COOKIE_AGE) + except signing.BadSignature: + return False + return isinstance(payload, dict) and _replay_payload_is_valid(payload, snapshot) + + +def _private_snapshot_auth_redirect(request: HttpRequest, snapshot: Snapshot, path: str = "", *, preserve_query: bool = True): + next_path = _clean_replay_next(path or "index.html") + if preserve_query and request.META.get("QUERY_STRING"): + next_path = 
f"{next_path}?{request.META['QUERY_STRING']}" + target = build_admin_url( + f"/admin/core/snapshot/replay-auth/?snapshot={snapshot.id}&next={quote(next_path, safe='')}", + request=request, + ) + return redirect(target) + + +def _replay_auth_response(request: HttpRequest, snapshot: Snapshot): + try: + payload = signing.loads(str(request.GET.get("grant") or ""), salt=REPLAY_AUTH_SALT, max_age=REPLAY_GRANT_MAX_AGE) + except signing.BadSignature: + return _private_snapshot_auth_redirect(request, snapshot, "index.html", preserve_query=False) + + if not isinstance(payload, dict) or not _replay_payload_is_valid(payload, snapshot): + return _private_snapshot_auth_redirect(request, snapshot, "index.html", preserve_query=False) + + cookie_value = signing.dumps(payload, salt=REPLAY_AUTH_SALT) + response = redirect(_clean_replay_next(request.GET.get("next"))) + response.set_cookie( + _replay_cookie_name(snapshot), + cookie_value, + max_age=settings.SESSION_COOKIE_AGE, + secure=request.is_secure(), + httponly=True, + samesite="Lax", + ) + return response + + +class SnapshotReplayAuthView(View): + """Admin-only handoff that lets a snap host mint its own replay cookie. + + admin.* cannot set a host-only cookie for snap-* (browsers forbid that), and + widening the real Django session cookie to *.archivebox.localhost would let XSS + in replayed pages hit the admin UI. Instead admin.* proves the user is logged in + with a short URL grant, then snap-* validates it and sets a snap-host-only cookie. 
+ """ + + def get(self, request: HttpRequest): + if not is_admin_user(request): + return redirect(f"{build_admin_url('/admin/login/', request=request)}?next={quote(request.get_full_path(), safe='')}") + + snapshot = _find_snapshot_by_ref(str(request.GET.get("snapshot") or "")) + if not snapshot: + raise Http404 + + payload = { + "snapshot_id": str(snapshot.id), + "user_id": str(request.user.pk), + "session_key": request.session.session_key, + "auth_hash": request.user.get_session_auth_hash(), + } + grant = signing.dumps(payload, salt=REPLAY_AUTH_SALT) + next_path = _clean_replay_next(request.GET.get("next")) + target = build_snapshot_url(str(snapshot.id), "_auth", request=request, config=get_request_config(request)) + return redirect(f"{target}?grant={quote(grant, safe='')}&next={quote(next_path, safe='')}") + + +class HomepageView(View): + def get(self, request): + request_config = get_request_config(request) + if request.user.is_authenticated and request_config.CONTROL_PLANE_ENABLED: + return redirect("/admin/core/snapshot/") + + if request_config.PUBLIC_INDEX: + return redirect("/public") + + return _admin_login_redirect_or_forbidden(request) + + +class SnapshotView(View): + # render static html index from filesystem archive/<timestamp>/index.html + + @staticmethod + def find_snapshots_for_url(path: str): + """Return a queryset of snapshots matching a URL-ish path. URL only โ€” never tries ID matching. + + Use ``find_snapshots_for_id`` separately if you also want to match by snapshot UUID. + """ + + def _fragmentless_url_query(url: str) -> Q: + # Use a range comparison (url >= 'canonical#' AND url < 'canonical#\U0010ffff') + # instead of LIKE/__startswith โ€” SQLite's case-insensitive LIKE bypasses the + # url index and forces a full-table scan over ~1M rows (~250ms). The range + # form lets SQLite use a MULTI-INDEX OR and stays under 1ms. 
+ canonical = without_fragment(url) + return Q(url=canonical) | (Q(url__gte=f"{canonical}#") & Q(url__lt=f"{canonical}#\U0010ffff")) + + normalized = without_fragment(path) + if path.startswith(("http://", "https://")): + # exact url match (indexed) โ€” fastest path + qs = Snapshot.objects.filter(_fragmentless_url_query(path)) + if qs.exists(): + return qs + normalized = normalized.split("://", 1)[1] + + # try exact match on full url (without scheme) + qs = Snapshot.objects.filter( + _fragmentless_url_query("http://" + normalized) | _fragmentless_url_query("https://" + normalized), + ) + if qs.exists(): + return qs + + # fall back to match on exact base_url + base = base_url(normalized) + qs = Snapshot.objects.filter( + _fragmentless_url_query("http://" + base) | _fragmentless_url_query("https://" + base), + ) + if qs.exists(): + return qs + + # fall back to matching base_url as prefix + return Snapshot.objects.filter(Q(url__startswith="http://" + base) | Q(url__startswith="https://" + base)) + + @staticmethod + def find_snapshots_for_id(slug: str): + """Return a queryset of snapshots matching a (possibly truncated) UUID via prefix or suffix. + + Strips non-hex characters from ``slug`` (so input with or without hyphens both work). + Requires at least 8 hex chars โ€” shorter inputs return an empty queryset to avoid + scanning the entire snapshots table on too-broad matches. + """ + return filter_queryset_by_uuid_substring(Snapshot.objects.all(), slug) + + @staticmethod + def render_live_index(request, snapshot): + TITLE_LOADING_MSG = "Not yet archived..." + from archivebox.core.widgets import TagEditorWidget + + # Reuse the middleware-attached config; never re-bootstrap from env + plugin + # schemas just to render a snapshot page (that pays ~30ms for no reason). 
+ runtime_config = get_request_config(request) + snapshot._runtime_config = runtime_config + snapshot_permissions = get_snapshot_permissions(snapshot) + hidden_card_plugins = {"archivedotorg", "favicon", "title"} + outputs = [ + out + for out in snapshot.discover_outputs(include_filesystem_fallback=True) + if (out.get("size") or 0) > 0 and out.get("name") not in hidden_card_plugins + ] + archiveresults = {out["name"]: out for out in outputs} + hash_index = snapshot.hashes_index + accounted_entries: set[str] = set() + for output in outputs: + output_name = output.get("name") or "" + if output_name: + accounted_entries.add(output_name) + output_path = output.get("path") or "" + if not output_path: + continue + parts = Path(output_path).parts + if parts: + accounted_entries.add(parts[0]) + + loose_items, failed_items = snapshot.get_detail_page_auxiliary_items(outputs, hidden_card_plugins=hidden_card_plugins) + preview_priority = [ + "singlefile", + "screenshot", + "wget", + "dom", + "pdf", + "readability", + ] + preferred_types = tuple(preview_priority) + output_order = {result_type: index for index, result_type in enumerate(archiveresults.keys())} + + best_result = {"path": "about:blank", "result": None} + for result_type in preferred_types: + if result_type in archiveresults: + best_result = archiveresults[result_type] + break + + related_snapshots_qs = ( + SnapshotView.find_snapshots_for_url(snapshot.url) + .select_related("crawl", "crawl__created_by") + .annotate( + num_outputs_cached=ArchiveResult.snapshot_count_expr(status=ArchiveResult.StatusChoices.SUCCEEDED), + num_failures_cached=ArchiveResult.snapshot_count_expr(status=ArchiveResult.StatusChoices.FAILED), + ) + ) + related_snapshots = list( + related_snapshots_qs.exclude(id=snapshot.id).order_by("-bookmarked_at", "-created_at", "-timestamp")[:25], + ) + related_years_map: dict[int, list[Snapshot]] = {} + for snap in [snapshot, *related_snapshots]: + snap_dt = snap.bookmarked_at or snap.created_at or 
snap.downloaded_at + if not snap_dt: + continue + related_years_map.setdefault(snap_dt.year, []).append(snap) + related_years = [] + for year, snaps in related_years_map.items(): + snaps_sorted = sorted( + snaps, + key=lambda s: s.bookmarked_at or s.created_at or s.downloaded_at or timezone.now(), + reverse=True, + ) + related_years.append( + { + "year": year, + "latest": snaps_sorted[0], + "snapshots": snaps_sorted, + }, + ) + related_years.sort(key=lambda item: item["year"], reverse=True) + + warc_path = next( + (rel_path for rel_path in hash_index if rel_path.startswith("warc/") and ".warc" in Path(rel_path).name), + "warc/", + ) + + ordered_outputs = sorted( + archiveresults.values(), + key=lambda r: ( + preferred_types.index(r["name"]) if r["name"] in preferred_types else len(preferred_types), + output_order.get(r["name"], len(output_order)), + ), + ) + if best_result["path"] == "about:blank" and ordered_outputs: + best_result = ordered_outputs[0] + non_compact_outputs = [out for out in ordered_outputs if not out.get("is_compact") and not out.get("is_metadata")] + compact_outputs = [out for out in ordered_outputs if out.get("is_compact") or out.get("is_metadata")] + tag_widget = TagEditorWidget() + output_size = sum(int(out.get("size") or 0) for out in ordered_outputs) + has_outputs = bool(ordered_outputs) + is_archived = has_outputs or snapshot.status == Snapshot.StatusChoices.SEALED + snapshot_status = str(snapshot.status or "").lower() + status_label_by_state = { + "queued": ("queued", "info"), + "started": ("running", "warning"), + "paused": ("paused", "default"), + "sealed": ("archived", "success"), + } + if has_outputs and not is_archived: + status_label, status_color = ("partial", "warning") + elif has_outputs: + status_label, status_color = ("archived", "success") + else: + status_label, status_color = status_label_by_state.get(snapshot_status, ("not yet archived", "danger")) + + context = { + "id": str(snapshot.id), + "snapshot_id": str(snapshot.id), 
+ "progress_endpoint": progress_endpoint("snapshot", snapshot.id), + "url": snapshot.url, + "archive_path": snapshot.archive_path_from_db, + "title": htmlencode(snapshot.resolved_title or (snapshot.base_url if is_archived else TITLE_LOADING_MSG)), + "extension": snapshot.extension or "html", + "tags": snapshot.tags_str() or "untagged", + "size": printable_filesize(output_size) if output_size else "โ€”", + "status": status_label, + "status_color": status_color, + "snapshot_state": snapshot_status, + "has_outputs": has_outputs, + "snapshot_permissions": snapshot_permissions, + "snapshot_permissions_icon": { + "public": "๐Ÿ‘ฅ", + "unlisted": "๐Ÿ”—", + "private": "๐Ÿ”’", + }.get(snapshot_permissions, "๐Ÿ‘ฅ"), + "bookmarked_date": snapshot.bookmarked_date, + "downloaded_datestr": snapshot.downloaded_datestr, + "num_outputs": snapshot.num_outputs, + "num_failures": snapshot.num_failures, + "oldest_archive_date": ts_to_date_str(snapshot.oldest_archive_date), + "warc_path": warc_path, + "archiveresults": [*non_compact_outputs, *compact_outputs], + "best_result": best_result, + "snapshot": snapshot, # Pass the snapshot object for template tags + "CONFIG": runtime_config, + "related_snapshots": related_snapshots, + "related_years": related_years, + "loose_items": loose_items, + "failed_items": failed_items, + "title_tags": [{"name": tag.name, "style": tag_widget._tag_style(tag.name)} for tag in snapshot.tags.all().order_by("name")], + } + return render(template_name="core/snapshot.html", request=request, context=context) + + def get(self, request, path): + snapshot = None + + try: + slug, archivefile = path.split("/", 1) + except (IndexError, ValueError): + slug, archivefile = path.split("/", 1)[0], "index.html" + + # slug is a timestamp + if slug.replace(".", "").isdigit(): + # missing trailing slash -> redirect to index + if "/" not in path: + return redirect(f"{path}/index.html") + + try: + try: + snapshot = Snapshot.objects.get(Q(timestamp=slug) | Q(id__startswith=slug)) 
+ if not can_view_snapshot(request, snapshot): + return _private_snapshot_auth_redirect(request, snapshot, archivefile or "index.html") + canonical_base = snapshot.url_path + if canonical_base != snapshot.legacy_archive_path: + target_path = f"/{canonical_base}/{archivefile or 'index.html'}" + query = request.META.get("QUERY_STRING") + if query: + target_path = f"{target_path}?{query}" + return redirect(target_path) + + if request.GET.get("files"): + target_path = _files_index_target(snapshot, archivefile) + response = serve_static_with_byterange_support( + request, + target_path, + document_root=snapshot.output_dir, + show_indexes=True, + is_archive_replay=True, + ) + elif archivefile == "index.html": + # if they requested snapshot index, serve live rendered template instead of static html + response = self.render_live_index(request, snapshot) + else: + target = build_snapshot_url(str(snapshot.id), archivefile, request=request) + query = request.META.get("QUERY_STRING") + if query: + target = f"{target}?{query}" + return redirect(target) + response["Link"] = f'<{snapshot.url}>; rel="canonical"' + return response + except Snapshot.DoesNotExist: + if Snapshot.objects.filter(timestamp__startswith=slug).exists(): + raise Snapshot.MultipleObjectsReturned + else: + raise + except Snapshot.DoesNotExist: + # Snapshot does not exist + return HttpResponse( + format_html( + ( + "<center><br/><br/><br/>" + "No Snapshot directories match the given timestamp/ID: <code>{}</code><br/><br/>" + 'You can <a href="/add/" target="_top">add a new Snapshot</a>, or return to the <a href="/" target="_top">Main Index</a>' + "</center>" + ), + slug, + path, + ), + content_type="text/html", + status=404, + ) + except Snapshot.MultipleObjectsReturned: + snapshot_hrefs = mark_safe("<br/>").join( + format_html( + '{} <a href="/{}/index.html"><b><code>{}</code></b></a> {} <b>{}</b>', + snap.bookmarked_at.strftime("%Y-%m-%d %H:%M:%S"), + snap.archive_path, + snap.timestamp, + snap.url, + 
snap.title_stripped[:64] or "", + ) + for snap in direct_snapshots_queryset(request, Snapshot.objects.filter(timestamp__startswith=slug)) + .only("url", "timestamp", "title", "bookmarked_at") + .order_by("-bookmarked_at") + ) + return HttpResponse( + format_html( + ("Multiple Snapshots match the given timestamp/ID <code>{}</code><br/><pre>"), + slug, + ) + + snapshot_hrefs + + format_html('</pre><br/>Choose a Snapshot to proceed or go back to the <a href="/" target="_top">Main Index</a>'), + content_type="text/html", + status=404, + ) + except Http404: + assert snapshot # (Snapshot.DoesNotExist is already handled above) + + # Snapshot dir exists but file within does not e.g. 124235.324234/screenshot.png + return HttpResponse( + format_html( + ( + "<html><head>" + "<title>Snapshot Not Found" + #'' + "" + "



" + f'Snapshot [{snapshot.timestamp}]: {snapshot.url}
' + f"was queued on {str(snapshot.bookmarked_at).split('.')[0]}, " + f'but no files have been saved yet in:
{snapshot.timestamp}/' + "{}" + f"

" + "It's possible {} " + f"during the last capture on {str(snapshot.bookmarked_at).split('.')[0]},
or that the archiving process has not completed yet.
" + f"
# run this cmd to finish/retry archiving this Snapshot
" + f'archivebox update -t timestamp {snapshot.timestamp}


' + '
' + "Next steps:
" + f'- list all the Snapshot files .*
' + f'- view the Snapshot ./index.html
' + f'- go to the Snapshot admin to edit
' + f'- go to the Snapshot actions to re-archive
' + '- or return to the main index...
' + "
" + "" + ), + archivefile if str(archivefile) != "None" else "", + f"the {archivefile} resource could not be fetched" + if str(archivefile) != "None" + else "the original site was not available", + ), + content_type="text/html", + status=404, + ) + + # slug is either a URL or a (possibly truncated) snapshot UUID + def _resolve_snapshots_for_slug(slug: str): + # full URLs go straight to the url-only path (fast, indexed) + if "://" in slug: + return SnapshotView.find_snapshots_for_url(slug) + # short uuid-shaped slugs (>=8 hex chars after stripping non-hex) try id matching first + id_qs = SnapshotView.find_snapshots_for_id(slug) + if id_qs.exists(): + return id_qs + return SnapshotView.find_snapshots_for_url(slug) + + try: + try: + snapshot = direct_snapshots_queryset(request, _resolve_snapshots_for_slug(path)).get() + except Snapshot.DoesNotExist: + raise + except Snapshot.DoesNotExist: + return HttpResponse( + format_html( + ( + "



" + "No Snapshots match the given url: {}


" + 'Return to the Main Index, or:

' + '+ Add a new Snapshot for {}

' + "
" + ), + base_url(path), + path if "://" in path else f"https://{path}", + path, + ), + content_type="text/html", + status=404, + ) + except Snapshot.MultipleObjectsReturned: + snapshots = direct_snapshots_queryset(request, _resolve_snapshots_for_slug(path)) + snapshot_hrefs = mark_safe("
").join( + format_html( + '{} {} {} {} {}', + snap.bookmarked_at.strftime("%Y-%m-%d %H:%M:%S"), + str(snap.id)[:8], + snap.archive_path, + snap.timestamp, + snap.url, + snap.title_stripped[:64] or "", + ) + for snap in snapshots.only("url", "timestamp", "title", "bookmarked_at").order_by("-bookmarked_at") + ) + return HttpResponse( + format_html( + ("Multiple Snapshots match the given URL {}
"),
+                    base_url(path),
+                )
+                + snapshot_hrefs
+                + format_html('

Choose a Snapshot to proceed or go back to the Main Index'), + content_type="text/html", + status=404, + ) + + target_path = f"/{snapshot.archive_path}/index.html" + query = request.META.get("QUERY_STRING") + if query: + target_path = f"{target_path}?{query}" + return redirect(target_path) + + +class SnapshotPathView(View): + """Serve snapshots by the new URL scheme: /////...""" + + def get( + self, + request, + username: str, + date: str | None = None, + domain: str | None = None, + snapshot_id: str | None = None, + path: str = "", + url: str | None = None, + ): + if username == "system": + return redirect(request.path.replace("/system/", "/web/", 1)) + + if date and domain and domain == date: + raise Http404 + + requested_url = url + if not requested_url and domain and domain.startswith(("http://", "https://")): + requested_url = domain + + snapshot = None + snapshots_qs = direct_snapshots_queryset(request, Snapshot.objects.select_related("crawl", "crawl__created_by")) + if snapshot_id: + snapshot = _find_snapshot_by_ref(snapshot_id) + if snapshot and not can_view_snapshot(request, snapshot): + return _private_snapshot_auth_redirect(request, snapshot, path or "index.html") + else: + # fuzzy lookup by date + domain/url (most recent) + username_lookup = "system" if username == "web" else username + if requested_url: + qs = ( + SnapshotView.find_snapshots_for_url(requested_url) + .select_related("crawl", "crawl__created_by") + .filter( + crawl__created_by__username=username_lookup, + ) + ) + else: + qs = snapshots_qs.filter(crawl__created_by__username=username_lookup) + + if date: + try: + if len(date) == 4: + qs = qs.filter(bookmarked_at__year=int(date)) + elif len(date) == 6: + qs = qs.filter(bookmarked_at__year=int(date[:4]), bookmarked_at__month=int(date[4:6])) + elif len(date) == 8: + qs = qs.filter( + bookmarked_at__year=int(date[:4]), + bookmarked_at__month=int(date[4:6]), + bookmarked_at__day=int(date[6:8]), + ) + except ValueError: + pass + + if 
requested_url: + snapshot = qs.order_by("-bookmarked_at", "-created_at", "-timestamp").first() + else: + requested_domain = domain or "" + if requested_domain.startswith(("http://", "https://")): + requested_domain = Snapshot.extract_domain_from_url(requested_domain) + else: + requested_domain = Snapshot.extract_domain_from_url(f"https://{requested_domain}") + + # Prefer exact domain matches + matches = [ + s for s in qs.order_by("-bookmarked_at", "-created_at") if Snapshot.extract_domain_from_url(s.url) == requested_domain + ] + snapshot = matches[0] if matches else qs.order_by("-bookmarked_at", "-created_at", "-timestamp").first() + + if not snapshot: + return HttpResponse( + format_html( + ( + "



" + "No Snapshots match the given id or url: {}


" + 'Return to the Main Index' + "
" + ), + snapshot_id or requested_url or domain, + ), + content_type="text/html", + status=404, + ) + + canonical_base = snapshot.url_path + if date: + requested_base = f"{username}/{date}/{domain or url or ''}" + else: + requested_base = f"{username}/{domain or url or ''}" + if snapshot_id: + requested_base = f"{requested_base}/{snapshot_id}" + if canonical_base != requested_base: + target = f"/{canonical_base}/{path or 'index.html'}" + query = request.META.get("QUERY_STRING") + if query: + target = f"{target}?{query}" + return redirect(target) + + archivefile = path or "index.html" + if archivefile != "index.html" and not request.GET.get("files"): + target = build_snapshot_url(str(snapshot.id), archivefile, request=request) + query = request.META.get("QUERY_STRING") + if query: + target = f"{target}?{query}" + return redirect(target) + + if request.GET.get("files"): + target_path = _files_index_target(snapshot, archivefile) + return serve_static_with_byterange_support( + request, + target_path, + document_root=snapshot.output_dir, + show_indexes=True, + is_archive_replay=True, + ) + + if archivefile == "index.html": + return SnapshotView.render_live_index(request, snapshot) + + return serve_static_with_byterange_support( + request, + archivefile, + document_root=snapshot.output_dir, + show_indexes=True, + is_archive_replay=True, + ) + + +def _safe_archive_relpath(path: str) -> str | None: + if not path: + return "" + cleaned = posixpath.normpath(path) + cleaned = cleaned.lstrip("/") + if cleaned.startswith("..") or "/../" in f"/{cleaned}/": + return None + return cleaned + + +def _coerce_sort_timestamp(value: str | float | None) -> float: + if value is None: + return 0.0 + try: + return float(value) + except (TypeError, ValueError): + return 0.0 + + +def _snapshot_sort_key(match_path: str, cache: dict[str, float]) -> tuple[float, str]: + parts = Path(match_path).parts + date_str = "" + snapshot_id = "" + try: + idx = parts.index("snapshots") + date_str = 
parts[idx + 1] + snapshot_id = parts[idx + 3] + except Exception: + return (_coerce_sort_timestamp(date_str), match_path) + + if snapshot_id not in cache: + snapshot = Snapshot.objects.filter(id=snapshot_id).only("bookmarked_at", "created_at", "downloaded_at", "timestamp").first() + if snapshot: + snap_dt = snapshot.bookmarked_at or snapshot.created_at or snapshot.downloaded_at + cache[snapshot_id] = snap_dt.timestamp() if snap_dt else _coerce_sort_timestamp(snapshot.timestamp) + else: + cache[snapshot_id] = _coerce_sort_timestamp(date_str) + + return (cache[snapshot_id], match_path) + + +def _snapshot_id_from_replay_path(path: Path) -> str | None: + parts = path.parts + try: + responses_idx = parts.index("responses") + except ValueError: + return None + return parts[responses_idx - 1] if responses_idx > 0 else None + + +def _replay_path_visible(request: HttpRequest, path: Path) -> bool: + snapshot_id = _snapshot_id_from_replay_path(path) + if not snapshot_id: + return False + snapshot = Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").first() + if not snapshot or (not can_view_snapshot(request, snapshot) and not _has_replay_cookie(request, snapshot)): + return False + request.archivebox_config = get_request_config(request, resolve_plugins=False) + return True + + +def _latest_response_match(request: HttpRequest, domain: str, rel_path: str, *, data_root: Path) -> tuple[Path, Path] | None: + if not domain or not rel_path: + return None + domain = domain.split(":", 1)[0].lower() + # TODO: optimize by querying output_files in DB instead of globbing filesystem + escaped_domain = escape(domain) + escaped_path = escape(rel_path) + pattern = str(data_root / "*" / "snapshots" / "*" / escaped_domain / "*" / "responses" / escaped_domain / escaped_path) + matches = glob(pattern) + if not matches: + return None + + sort_cache: dict[str, float] = {} + best_paths = sorted(matches, key=lambda match_path: _snapshot_sort_key(match_path, 
sort_cache), reverse=True) + best_path = next((Path(match_path) for match_path in best_paths if _replay_path_visible(request, Path(match_path))), None) + if best_path is None: + return None + parts = best_path.parts + try: + responses_idx = parts.index("responses") + except ValueError: + return None + responses_root = Path(*parts[: responses_idx + 1]) + rel_to_root = Path(*parts[responses_idx + 1 :]) + return responses_root, rel_to_root + + +def _latest_responses_root(request: HttpRequest, domain: str, *, data_root: Path) -> Path | None: + if not domain: + return None + domain = domain.split(":", 1)[0].lower() + escaped_domain = escape(domain) + pattern = str(data_root / "*" / "snapshots" / "*" / escaped_domain / "*" / "responses" / escaped_domain) + matches = glob(pattern) + if not matches: + return None + + sort_cache: dict[str, float] = {} + best_paths = sorted(matches, key=lambda match_path: _snapshot_sort_key(match_path, sort_cache), reverse=True) + return next((Path(match_path) for match_path in best_paths if _replay_path_visible(request, Path(match_path))), None) + + +def _latest_snapshot_for_domain(request: HttpRequest, domain: str) -> Snapshot | None: + if not domain: + return None + + requested_domain = domain.split(":", 1)[0].lower() + snapshots = direct_snapshots_queryset( + request, + SnapshotView.find_snapshots_for_url(f"https://{requested_domain}"), + ).order_by("-bookmarked_at", "-created_at", "-timestamp") + for snapshot in snapshots: + if Snapshot.extract_domain_from_url(snapshot.url).lower() == requested_domain: + return snapshot + return None + + +def _original_request_url(domain: str, path: str = "", query_string: str = "") -> str: + normalized_domain = (domain or "").split(":", 1)[0].lower() + normalized_path = (path or "").lstrip("/") + if normalized_path in ("", "index.html"): + normalized_path = "" + target = f"https://{normalized_domain}" + if normalized_path: + target = f"{target}/{normalized_path}" + if query_string: + target = 
f"{target}?{query_string}" + return target + + +def _serve_responses_path(request, responses_root: Path, rel_path: str, show_indexes: bool): + candidates: list[str] = [] + rel_path = rel_path or "" + if rel_path.endswith("/"): + rel_path = f"{rel_path}index.html" + if "." not in Path(rel_path).name: + candidates.append(f"{rel_path.rstrip('/')}/index.html") + candidates.append(rel_path) + + for candidate in candidates: + try: + return serve_static_with_byterange_support( + request, + candidate, + document_root=str(responses_root), + show_indexes=show_indexes, + is_archive_replay=True, + ) + except Http404: + pass + + if rel_path.endswith("index.html"): + rel_dir = rel_path[: -len("index.html")] + try: + return serve_static_with_byterange_support( + request, + rel_dir, + document_root=str(responses_root), + show_indexes=True, + is_archive_replay=True, + ) + except Http404: + return None + return None + + +def _serve_snapshot_replay(request: HttpRequest, snapshot: Snapshot, path: str = ""): + rel_path = path or "" + request_config = get_request_config( + request, + resolve_plugins=rel_path.startswith("replay/") or rel_path == "replay", + ) + request.archivebox_config = request_config + request.archivebox_snapshot_url = snapshot.url + snapshot._runtime_config = request_config + + if rel_path.startswith("replay/") or rel_path == "replay": + response = archivewebpage_replay.serve_replay_asset_response(rel_path, request_config, HttpResponse) + if response is not None: + return response + + if rel_path == "progress.json": + # Host routing forwards every snap-* path to SnapshotHostView, so we forward + # /progress.json on through to the same view used everywhere else. The caller + # passes snapshot_id explicitly in the query string โ€” we don't read it from the + # subdomain (this keeps the endpoint identical across all security modes). 
+ return live_progress_view(request) + + is_directory_request = bool(path) and path.endswith("/") + show_indexes = bool(request.GET.get("files")) or (request_config.USES_SUBDOMAIN_ROUTING and is_directory_request) + if not show_indexes and (not rel_path or rel_path == "index.html"): + return SnapshotView.render_live_index(request, snapshot) + + if not rel_path or rel_path.endswith("/"): + if show_indexes: + rel_path = rel_path.rstrip("/") + else: + rel_path = f"{rel_path}index.html" + rel_path = _safe_archive_relpath(rel_path) + if rel_path is None: + raise Http404 + + try: + return serve_static_with_byterange_support( + request, + rel_path, + document_root=snapshot.output_dir, + show_indexes=show_indexes, + is_archive_replay=True, + ) + except Http404: + pass + + host = urlparse(snapshot.url).hostname or snapshot.domain + responses_root = Path(snapshot.output_dir) / "responses" / host + if responses_root.exists(): + response = _serve_responses_path(request, responses_root, rel_path, show_indexes) + if response is not None: + return response + + raise Http404 + + +def _serve_original_domain_replay(request: HttpRequest, domain: str, path: str = ""): + request_config = get_request_config(request, resolve_plugins=False) + request.archivebox_config = request_config + requested_root_index = path in ("", "index.html") or path.endswith("/") + rel_path = path or "" + if not rel_path or rel_path.endswith("/"): + rel_path = f"{rel_path}index.html" + rel_path = _safe_archive_relpath(rel_path) + if rel_path is None: + raise Http404 + + domain = domain.lower() + match = _latest_response_match(request, domain, rel_path, data_root=CONSTANTS.USERS_DIR) + if not match and "." not in Path(rel_path).name: + index_path = f"{rel_path.rstrip('/')}/index.html" + match = _latest_response_match(request, domain, index_path, data_root=CONSTANTS.USERS_DIR) + if not match and "." 
class SnapshotHostView(View):
    """Serve a snapshot's archived files from that snapshot's canonical host."""

    def get(self, request, snapshot_id: str, path: str = ""):
        """Serve ``path`` for the snapshot, redirecting to its canonical host if needed.

        Raises ``Http404`` when ``snapshot_id`` resolves to no snapshot.
        """
        request_config = get_request_config(request)
        snapshot = _find_snapshot_by_ref(snapshot_id)
        if not snapshot:
            raise Http404

        # The "_auth" endpoint is answered before the visibility check.
        if path == "_auth":
            return _replay_auth_response(request, snapshot)

        allowed = can_view_snapshot(request, snapshot) or _has_replay_cookie(request, snapshot)
        if not allowed:
            return _private_snapshot_auth_redirect(request, snapshot, path)

        snapshot_pk = str(snapshot.id)
        expected_host = get_snapshot_host(snapshot_pk, config=request_config)
        if host_matches(request.get_host(), expected_host):
            return _serve_snapshot_replay(request, snapshot, path)

        # Requested on another host: redirect to the snapshot URL, keeping the query string.
        location = build_snapshot_url(snapshot_pk, path, request=request, config=request_config)
        query_string = request.META.get("QUERY_STRING")
        if query_string:
            location = f"{location}?{query_string}"
        return redirect(location)


class SnapshotReplayView(View):
    """Serve a snapshot's archived files on a one-domain replay path."""

    def get(self, request, snapshot_id: str, path: str = ""):
        """Serve ``path`` for the snapshot; ``Http404`` if the snapshot is unknown."""
        snapshot = _find_snapshot_by_ref(snapshot_id)
        if not snapshot:
            raise Http404

        if path == "_auth":
            return _replay_auth_response(request, snapshot)

        allowed = can_view_snapshot(request, snapshot) or _has_replay_cookie(request, snapshot)
        if not allowed:
            return _private_snapshot_auth_redirect(request, snapshot, path)

        return _serve_snapshot_replay(request, snapshot, path)


class OriginalDomainHostView(View):
    """Serve original-domain replay content when routed by host."""

    def get(self, request, domain: str, path: str = ""):
        """Delegate to ``_serve_original_domain_replay``."""
        return _serve_original_domain_replay(request, domain=domain, path=path)


class OriginalDomainReplayView(View):
    """Serve original-domain replay content on a one-domain replay path."""

    def get(self, request, domain: str, path: str = ""):
        """Delegate to ``_serve_original_domain_replay``."""
        return _serve_original_domain_replay(request, domain=domain, path=path)
def paginate_queryset(self, queryset, page_size):
    """Paginate the public index.

    With a non-blank ``q`` parameter the stock paginator is used. Otherwise
    the paginator is built over ``range(<public snapshot count>)`` and the
    page's objects are loaded separately through
    ``_ordered_public_page_from_order_index``.

    Raises:
        Http404: if the requested page number is invalid.
    """
    if self.request.GET.get("q", default="").strip():
        return super().paginate_queryset(queryset, page_size)

    total_public = self.get_exact_public_snapshot_count()
    paginator = self.get_paginator(range(total_public), page_size)

    # URL kwarg wins over the query parameter; default to the first page.
    requested_page = self.kwargs.get(self.page_kwarg) or self.request.GET.get(self.page_kwarg) or 1

    try:
        page = paginator.page(requested_page)
    except InvalidPage as err:
        raise Http404(f"Invalid page ({requested_page}): {err}") from err

    page_items = self._ordered_public_page_from_order_index(page_number=page.number, page_size=page_size)
    page.object_list = page_items
    return paginator, page, page_items, page.has_other_pages()
runtime_config.COMMIT_HASH, + "FOOTER_INFO": runtime_config.FOOTER_INFO, + "WEB_BASE_URL": build_web_url(request=self.request, config=runtime_config), + "search_mode": search_mode, + "search_mode_options": get_search_mode_options(config=runtime_config), + "public_search_stream_pending": public_search_pending, + } + context["show_search_index_hint"] = bool( + query + and not public_search_pending + and get_search_mode_base(search_mode, config=runtime_config) == "deep" + and search_mode_backend + and context["paginator"].count == 0, + ) + snapshots = list(context.get("object_list") or ()) + icons_by_snapshot: dict[str, set[str]] = {str(snapshot.id): set() for snapshot in snapshots} + tag_names_by_snapshot: dict[str, list[str]] = {str(snapshot.id): [] for snapshot in snapshots} + progress_by_snapshot: dict[str, dict[str, int]] = { + str(snapshot.id): { + "total": 0, + "succeeded": 0, + "failed": 0, + "running": 0, + "skipped": 0, + "noresults": 0, + } + for snapshot in snapshots + } + if icons_by_snapshot: + for snapshot_id, tag_name in ( + SnapshotTag.objects.filter(snapshot_id__in=icons_by_snapshot.keys()) + .order_by("tag__name") + .values_list("snapshot_id", "tag__name") + .iterator(chunk_size=1000) + ): + tag_names_by_snapshot[str(snapshot_id)].append(tag_name) + + for snapshot_id, plugin, status in ( + ArchiveResult.objects.filter( + snapshot_id__in=icons_by_snapshot.keys(), + ) + .exclude(plugin="") + .values_list("snapshot_id", "plugin", "status") + .iterator(chunk_size=1000) + ): + snapshot_key = str(snapshot_id) + progress = progress_by_snapshot[snapshot_key] + progress["total"] += 1 + if status == ArchiveResult.StatusChoices.SUCCEEDED: + icons_by_snapshot[snapshot_key].add(plugin) + progress["succeeded"] += 1 + elif status == ArchiveResult.StatusChoices.FAILED: + progress["failed"] += 1 + elif status == ArchiveResult.StatusChoices.STARTED: + progress["running"] += 1 + elif status == ArchiveResult.StatusChoices.SKIPPED: + progress["skipped"] += 1 + elif 
def get_exact_public_snapshot_count(self) -> int:
    """Count snapshots that are neither private nor unlisted."""
    private_total = Snapshot.objects.filter(permissions=PERMISSIONS_PRIVATE).count()
    unlisted_total = Snapshot.objects.filter(permissions=PERMISSIONS_UNLISTED).count()
    return Snapshot.objects.count() - (private_total + unlisted_total)


def get_queryset(self, **kwargs):
    """Return the public snapshots to list, honouring a cached search result.

    Without a ``q`` parameter the whole public queryset is returned. With
    one, results come only from the cached public search state (stored on
    ``self.public_search_state``): its ids are returned in their cached
    order, and an empty queryset is returned when nothing is cached or the
    cached id list is empty.
    """
    visible = public_snapshots_queryset(super().get_queryset(**kwargs))
    visible = visible.only(*self._base_public_snapshot_fields())

    search_text = self.request.GET.get("q", default="").strip()
    if not search_text:
        return visible

    state = get_cached_public_search_state(self.request)
    self.public_search_state = state
    if state is None:
        return visible.none()

    ranked_ids = state.get("ids") or []
    if not ranked_ids:
        return visible.none()

    # Annotate each row with its position in the cached id list so the
    # database can return them in that order.
    whens = [When(pk=pk, then=Value(position)) for position, pk in enumerate(ranked_ids)]
    rank = Case(*whens, output_field=IntegerField())
    return visible.filter(pk__in=ranked_ids).annotate(search_rank=rank).order_by("search_rank", *self.ordering)


def get(self, *args, **kwargs):
    """Show the public index, or send the visitor elsewhere.

    Authenticated users are redirected to the admin snapshot list; when
    ``PUBLIC_INDEX`` is off the request is handed to
    ``_admin_login_redirect_or_forbidden``.
    """
    if self.request.user.is_authenticated:
        return redirect("/admin/core/snapshot/")
    if not get_request_config(self.request).PUBLIC_INDEX:
        return _admin_login_redirect_or_forbidden(self.request)
    return super().get(*args, **kwargs)
name="dispatch") +class AddView(UserPassesTestMixin, FormView): + template_name = "add.html" + form_class = AddLinkForm + + def get_initial(self): + """Prefill the AddLinkForm with the 'url' GET parameter""" + if self.request.method == "GET": + url = self.request.GET.get("url", None) + if url: + return {"url": url if "://" in url else f"https://{url}"} + + return super().get_initial() + + def get_form_kwargs(self): + kwargs = super().get_form_kwargs() + kwargs["request"] = self.request + return kwargs + + def test_func(self): + return get_request_config(self.request).PUBLIC_ADD_VIEW or self.request.user.is_authenticated + + def _can_override_crawl_config(self) -> bool: + return is_admin_user(self.request) + + def _get_custom_config_overrides(self, form: AddLinkForm) -> dict: + custom_config = form.cleaned_data.get("config") or {} + + if not isinstance(custom_config, dict): + return {} + + if not self._can_override_crawl_config(): + return {} + + return {str(key): value for key, value in custom_config.items() if not str(key).endswith("_BINARY")} + + def get_context_data(self, **kwargs): + context = super().get_context_data(**kwargs) + request_config = get_request_config(self.request, resolve_plugins=True) + required_search_plugin = f"search_backend_{request_config.SEARCH_BACKEND_ENGINE}".strip() + can_override_crawl_config = self._can_override_crawl_config() + plugin_configs = discover_plugin_configs() if can_override_crawl_config else {} + public_persona_config_keys = { + "CRAWL_MAX_CONCURRENT_SNAPSHOTS", + "DELETE_AFTER", + "ONLY_NEW", + "PERMISSIONS", + "TIMEOUT", + } + persona_queryset = context["form"].fields["persona"].queryset + if not can_override_crawl_config: + persona_queryset = filter_personas_by_permissions(persona_queryset, {PERMISSIONS_PUBLIC}) + persona_config_map = {} + for persona in persona_queryset.order_by("name"): + effective_config = get_config(persona=persona) + effective_config_redacted = get_config(persona=persona, 
redact_sensitive=True).model_dump(mode="json") + if can_override_crawl_config: + raw_config = redact_sensitive_config(persona.config or {}) + effective_config_json = effective_config_redacted + binary_urls = get_plugin_config_binary_urls(effective_config) + else: + raw_config = {} + effective_config_json = {key: effective_config_redacted.get(key) for key in public_persona_config_keys} + binary_urls = {} + persona_config_map[persona.name] = { + "config": raw_config, + "effective_config": effective_config_json, + "binary_urls": binary_urls, + } + recent_personas = list(persona_queryset.order_by("-created_at", "name")[:5]) + plugin_dependency_map = {} + if can_override_crawl_config: + plugin_dependency_map = { + plugin_name: [ + str(required_plugin).strip() + for required_plugin in (schema.get("required_plugins") or []) + if str(required_plugin).strip() + ] + for plugin_name, schema in plugin_configs.items() + if isinstance(schema.get("required_plugins"), list) and schema.get("required_plugins") + } + return { + **context, + "title": "Create Crawl", + # We can't just call request.build_absolute_uri in the template, because it would include query parameters + "absolute_add_path": self.request.build_absolute_uri(self.request.path), + "web_base_url": build_web_url("", request=self.request), + "VERSION": VERSION, + "FOOTER_INFO": request_config.FOOTER_INFO, + "required_search_plugin": required_search_plugin, + "plugin_dependency_map_json": json.dumps(plugin_dependency_map, sort_keys=True), + "persona_config_map_json": json.dumps(persona_config_map, sort_keys=True, default=str), + "recent_personas": recent_personas, + "can_override_crawl_config": can_override_crawl_config, + "stdout": "", + } + + def _create_crawl_from_form(self, form, *, created_by_id=None) -> Crawl: + from archivebox.cli.archivebox_add import add + + urls = form.cleaned_data["url"] + print(f"[+] Adding URL: {urls}") + + # Extract all form fields + tag = form.cleaned_data["tag"] + depth = 
int(form.cleaned_data["depth"]) + max_urls = int(form.cleaned_data.get("max_urls") or 0) + crawl_max_size = int(form.cleaned_data.get("crawl_max_size") or 0) + crawl_timeout = int(form.cleaned_data.get("crawl_timeout") or 0) + timeout = form.cleaned_data.get("timeout") + snapshot_max_size = int(form.cleaned_data.get("snapshot_max_size") or 0) + delete_after = str(form.cleaned_data.get("delete_after") or "0").strip() or "0" + crawl_max_concurrent_snapshots = int(form.cleaned_data["crawl_max_concurrent_snapshots"]) + permissions = str(form.cleaned_data.get("permissions") or "public").strip().lower() + can_override_crawl_config = self._can_override_crawl_config() + plugins = ",".join(form.cleaned_data.get("plugins", [])) if can_override_crawl_config else "" + schedule = form.cleaned_data.get("schedule", "").strip() if can_override_crawl_config else "" + persona = form.cleaned_data.get("persona") + start_paused = form.cleaned_data.get("start_paused", False) if can_override_crawl_config else False + notes = form.cleaned_data.get("notes", "") + url_filters = form.cleaned_data.get("url_filters") or {} + plugin_config = form.cleaned_data.get("plugin_config") or {} + if not isinstance(plugin_config, dict): + plugin_config = {} + if not can_override_crawl_config: + plugin_config = {} + custom_config = self._get_custom_config_overrides(form) + custom_config.pop("DEFAULT_PERSONA", None) + custom_config.pop("PERMISSIONS", None) + if persona: + persona.ensure_dirs() + + if created_by_id is None: + if self.request.user.is_authenticated: + created_by_id = self.request.user.pk + else: + from archivebox.base_models.models import get_or_create_system_user_pk + + created_by_id = get_or_create_system_user_pk() + + config = {} + effective_config = get_config(persona=persona) if persona else get_config() + if delete_after != str(effective_config.DELETE_AFTER): + config["DELETE_AFTER"] = delete_after + if timeout is not None and int(timeout) != int(effective_config.TIMEOUT): + 
config["TIMEOUT"] = int(timeout) + + config.update(plugin_config) + config.update(custom_config) + if bool(url_filters.get("only_new")) != bool(effective_config.ONLY_NEW): + config["ONLY_NEW"] = bool(url_filters.get("only_new")) + crawl, _snapshots = add( + urls=urls, + depth=depth, + max_urls=max_urls, + crawl_max_size=crawl_max_size, + crawl_timeout=crawl_timeout, + snapshot_max_size=snapshot_max_size, + crawl_max_concurrent_snapshots=crawl_max_concurrent_snapshots, + tag=tag, + url_allowlist=url_filters.get("allowlist") or "", + url_denylist=url_filters.get("denylist") or "", + plugins=plugins, + persona=persona.name if persona else "Default", + bg=True, + created_by_id=created_by_id, + config=config, + ) + if notes: + crawl.safe_update({"notes": notes}, refresh=False) + if permissions and crawl.config.get("PERMISSIONS") != permissions: + next_config = {**crawl.config, "PERMISSIONS": permissions} + crawl.safe_update({"config": next_config}, refresh=True) + if start_paused: + crawl.pause() + + # 3. create a CrawlSchedule if schedule is provided + if schedule: + from archivebox.crawls.models import CrawlSchedule + + crawl_schedule = CrawlSchedule.objects.create( + template=crawl, + schedule=schedule, + is_enabled=True, + config=config, + label=crawl.label, + notes=f"Auto-created from add page. 
{notes}".strip(), + created_by_id=created_by_id, + ) + crawl.schedule = crawl_schedule + crawl.safe_update({"schedule": crawl_schedule}, refresh=False) + + return crawl + + def form_valid(self, form): + crawl = self._create_crawl_from_form(form) + + urls = form.cleaned_data["url"] + schedule = form.cleaned_data.get("schedule", "").strip() + rough_url_count = len([url for url in urls.splitlines() if url.strip()]) + + # Build success message with schedule link if created + schedule_msg = "" + if schedule and crawl.schedule_id: + schedule_msg = f" and scheduled to repeat {schedule}" + + messages.success( + self.request, + mark_safe( + f"Created crawl with {rough_url_count} starting URL(s){schedule_msg}. Snapshots will be created and archived in the background. View Crawl โ†’", + ), + ) + + # Orchestrator (managed by supervisord) will pick up the queued crawl + return redirect(crawl.admin_change_url) + + +class WebAddView(AddView): + def _latest_snapshot_for_url(self, requested_url: str): + return ( + direct_snapshots_queryset( + self.request, + SnapshotView.find_snapshots_for_url(requested_url), + ) + .order_by("-bookmarked_at", "-created_at", "-timestamp") + .first() + ) + + def _normalize_add_url(self, requested_url: str) -> str: + if requested_url.startswith(("http://", "https://")): + return requested_url + return f"https://{requested_url}" + + def dispatch(self, request, *args, **kwargs): + requested_url = urldecode(kwargs.get("url", "") or "") + if requested_url: + snapshot = self._latest_snapshot_for_url(requested_url) + if snapshot: + return redirect(f"/{snapshot.url_path}") + + request_host = (request.get_host() or "").lower() + if request.user.is_authenticated and not get_request_config(request).PUBLIC_ADD_VIEW and host_matches(request_host, get_web_host()): + return redirect(build_admin_url(request.get_full_path(), request=request)) + + if not self.test_func(): + if host_matches(request_host, get_web_host()): + return 
redirect(build_admin_url(request.get_full_path(), request=request)) + if host_matches(request_host, get_admin_host()): + next_url = quote(request.get_full_path(), safe="/:?=&") + return redirect(f"{build_admin_url('/admin/login/', request=request)}?next={next_url}") + return HttpResponse( + format_html( + ( + "



" + "No Snapshots match the given url: {}


" + 'Return to the Main Index' + "
def get(self, request: HttpRequest, *args: object, **kwargs: object):
    """Redirect to the latest snapshot of a URL, creating a crawl for it if none exists.

    The URL is read from the ``url`` kwarg (or the first positional arg) and
    URL-decoded. If no snapshot is found, an add form is populated with the
    form's own initial values, validated, and used to create a crawl plus a
    snapshot, to which the visitor is then redirected.

    Raises:
        Http404: if no URL was supplied.
    """
    raw_url = kwargs.get("url")
    if not raw_url:
        raw_url = args[0] if args else ""
    requested_url = urldecode(str(raw_url))
    if not requested_url:
        raise Http404

    existing = self._latest_snapshot_for_url(requested_url)
    if existing:
        return redirect(f"/{existing.url_path}")

    add_url = self._normalize_add_url(requested_url)
    assert self.form_class is not None
    defaults_form = self.form_class()
    fields = defaults_form.fields

    # Field name -> value used when the form declares no (truthy) initial value.
    fallbacks = (
        ("depth", "0"),
        ("max_urls", 0),
        ("crawl_max_size", "0"),
        ("crawl_timeout", 0),
        ("timeout", 0),
        ("snapshot_max_size", "0"),
        ("delete_after", "0"),
    )
    payload = {"url": add_url}
    for field_name, fallback in fallbacks:
        payload[field_name] = fields[field_name].initial or fallback
    # This one is passed through as-is, with no fallback.
    payload["crawl_max_concurrent_snapshots"] = fields["crawl_max_concurrent_snapshots"].initial
    payload["persona"] = fields["persona"].initial or "Default"
    payload["permissions"] = fields["permissions"].initial or "public"
    payload["config"] = "{}"

    form_data = QueryDict(mutable=True)
    form_data.update(payload)
    if fields["start_paused"].initial:
        form_data["start_paused"] = "on"

    form = self.form_class(data=form_data)
    if not form.is_valid():
        return self.form_invalid(form)

    crawl = self._create_crawl_from_form(form)
    snapshot_record = {"url": add_url, "tags": form.cleaned_data.get("tag", "")}
    snapshot = Snapshot.from_json(snapshot_record, overrides={"crawl": crawl})
    assert snapshot is not None
    return redirect(f"/{snapshot.url_path}")
request + """ + return HttpResponse("OK", content_type="text/plain", status=200) + + +@render_with_table_view +def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext: + CONFIGS = get_all_configs() + + assert request.user.is_superuser, "Must be a superuser to view configuration settings." + + merged_config = get_config(redact_sensitive=True) + + rows = { + "Section": [], + "Key": [], + "Type": [], + "Value": [], + "Source": [], + "Default": [], + # "Documentation": [], + # "Aliases": [], + } + + for section_id, section in reversed(list(CONFIGS.items())): + for key in dict(section).keys(): + rows["Section"].append(section_id) # section.replace('_', ' ').title().replace(' Config', '') + rows["Key"].append(ItemLink(key, key=key)) + rows["Type"].append(format_html("{}", find_config_type(key))) + + # Use merged config value (includes machine overrides) + actual_value = merged_config.get(key, dict(section)[key]) + rows["Value"].append(mark_safe(f"{actual_value}")) + + # Show where the value comes from + source = find_config_source(key, merged_config) + source_colors = {"Machine": "purple", "Environment": "blue", "File": "green", "Plugin Default": "teal", "Default": "gray"} + rows["Source"].append(format_html('{}', source_colors.get(source, "gray"), source)) + + rows["Default"].append( + mark_safe( + f'{find_config_default(key) or "See here..."}', + ), + ) + # rows['Documentation'].append(mark_safe(f'Wiki: {key}')) + # rows['Aliases'].append(', '.join(find_config_aliases(key))) + + section = "CONSTANT" + for key in CONSTANTS_CONFIG.keys(): + rows["Section"].append(section) # section.replace('_', ' ').title().replace(' Config', '') + rows["Key"].append(ItemLink(key, key=key)) + rows["Type"].append(format_html("{}", type(CONSTANTS_CONFIG[key]).__name__)) + rows["Value"].append(format_html("{}", redact_sensitive_config(CONSTANTS_CONFIG).get(key))) + rows["Source"].append(mark_safe('Constant')) + rows["Default"].append( + mark_safe( + 
f'{find_config_default(key) or "See here..."}', + ), + ) + # rows['Documentation'].append(mark_safe(f'Wiki: {key}')) + # rows['Aliases'].append('') + + return TableContext( + title="Computed Configuration Values", + table=rows, + ) + + +@render_with_item_view +def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: + from archivebox.machine.models import Machine + + CONFIGS = get_all_configs() + + assert request.user.is_superuser, "Must be a superuser to view configuration settings." + + merged_config = get_config(redact_sensitive=True) + + # Determine all sources for this config value + sources_info = [] + + # Machine config + machine = Machine.current() + machine_admin_url = machine.admin_change_url + if machine.config and key in machine.config: + sources_info.append(("Machine", redact_sensitive_config(machine.config).get(key), "purple")) + + # Environment variable + if key in os.environ: + sources_info.append(("Environment", redact_sensitive_config(os.environ).get(key), "blue")) + + # Config file value + if CONSTANTS.CONFIG_FILE.exists(): + file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE) + if key in file_config: + sources_info.append(("File", redact_sensitive_config(file_config).get(key), "green")) + + # Default value + default_val = find_config_default(key) + if key in _plugin_config_properties(PLUGIN_CONFIG_SCHEMAS): + sources_info.append(("Plugin Default", default_val, "gray")) + elif default_val: + sources_info.append(("Default", default_val, "gray")) + + # Final computed value + config_source = find_config_source(key, merged_config) + final_value = merged_config.get(key, CONFIGS.get(key, None)) + is_redacted = final_value == SENSITIVE_CONFIG_VALUE_REDACTED + + # Build sources display + sources_html = "
".join([f'{source}: {value}' for source, value, color in sources_info]) + + # aliases = USER_CONFIG.get(key, {}).get("aliases", []) + aliases = [] + + if key in CONSTANTS_CONFIG: + section_header = mark_safe( + f'[CONSTANTS]   {key}   (read-only, hardcoded by ArchiveBox)', + ) + elif key in merged_config: + section_header = mark_safe( + f'data / ArchiveBox.conf   [{find_config_section(key)}]   {key}', + ) + else: + section_header = mark_safe( + f'[DYNAMIC CONFIG]   {key}   (read-only, calculated at runtime)', + ) + + definition_url, definition_label = get_config_definition_link(key) + + section_data = cast( + SectionData, + { + "name": section_header, + "description": None, + "fields": { + "Key": key, + "Type": find_config_type(key), + "Value": final_value, + "Currently read from": config_source, + }, + "help_texts": { + "Key": mark_safe(f""" + Documentation   + + Aliases: {", ".join(aliases)} + + """), + "Type": mark_safe(f''' + + See full definition in {definition_label}... + + '''), + "Value": mark_safe(f''' + { + 'Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)

' + if is_redacted + else "" + } +


+ Configuration Sources (highest priority first):

+ {sources_html} +

+

+ To change this value, edit data/ArchiveBox.conf or run: +

+ archivebox config --set {key}="{ + val.strip("'") if (val := find_config_default(key)) else str(final_value).strip("'") + }" +

+ '''), + "Currently read from": mark_safe(f""" + The value shown in the "Value" field comes from the {config_source} source. +

+ Priority order (highest to lowest): +
    +
  1. Machine - Machine-specific overrides + {f'
    โ†’ Edit {key} in Machine.config for this server' if machine_admin_url else ""} +
  2. +
  3. Environment - process defaults from environment variables
  4. +
  5. File - data/ArchiveBox.conf
  6. +
  7. Plugin Default - Default value from plugin config.json
  8. +
  9. Default - Default value from code
  10. +
+ {f'
Tip: To override {key} on this machine, edit the Machine.config field and add:
{{"\\"{key}\\": "your_value_here"}}' if machine_admin_url and key not in CONSTANTS_CONFIG else ""} + """), + }, + }, + ) + + return ItemContext( + slug=key, + title=key, + data=[section_data], + ) diff --git a/archivebox/core/widgets.py b/archivebox/core/widgets.py new file mode 100644 index 0000000000..c4c600478b --- /dev/null +++ b/archivebox/core/widgets.py @@ -0,0 +1,738 @@ +__package__ = "archivebox.core" + +import json +import re +import hashlib +from django import forms +from django.db.models.manager import BaseManager +from django.db.models.query import QuerySet +from django.utils.html import escape +from django.utils.safestring import mark_safe + + +class TagEditorWidget(forms.Widget): + """ + A widget that renders tags as clickable pills with inline editing. + - Displays existing tags alphabetically as styled pills with X remove button + - Text input with HTML5 datalist for autocomplete suggestions + - Press Enter or Space to create new tags (auto-creates if doesn't exist) + - Uses AJAX for autocomplete and tag creation + """ + + template_name = "" # We render manually + + class Media: + css = {"all": []} + js = [] + + def __init__(self, attrs=None, snapshot_id=None): + self.snapshot_id = snapshot_id + super().__init__(attrs) + + def _escape(self, value): + """Escape HTML entities in value.""" + return escape(str(value)) if value else "" + + def _normalize_id(self, value): + """Normalize IDs for HTML + JS usage (letters, digits, underscore; JS-safe start).""" + normalized = re.sub(r"[^A-Za-z0-9_]", "_", str(value)) + if not normalized or not re.match(r"[A-Za-z_]", normalized): + normalized = f"t_{normalized}" + return normalized + + def _tag_style(self, value): + """Compute a stable pastel color style for a tag value.""" + tag = (value or "").strip().lower() + digest = hashlib.md5(tag.encode("utf-8")).hexdigest() + hue = int(digest[:4], 16) % 360 + bg = f"hsl({hue}, 70%, 92%)" + border = f"hsl({hue}, 60%, 82%)" + fg = f"hsl({hue}, 35%, 28%)" + return 
f"--tag-bg: {bg}; --tag-border: {border}; --tag-fg: {fg};" + + def render(self, name, value, attrs=None, renderer=None): + """ + Render the tag editor widget. + + Args: + name: Field name + value: Can be: + - QuerySet of Tag objects (from M2M field) + - List of tag names + - Comma-separated string of tag names + - None + attrs: HTML attributes + renderer: Not used + """ + # Parse value to get list of tag names + tags = [] + if value: + if isinstance(value, (BaseManager, QuerySet)): + tags = sorted([tag.name for tag in value.all()]) + elif isinstance(value, (list, tuple)): + from archivebox.core.models import Tag + + if value and isinstance(value[0], Tag): # List of Tag objects + tags = sorted([tag.name for tag in value]) + else: # List of strings or IDs + # Could be tag IDs from form submission + from archivebox.core.models import Tag + + tag_names = [] + for v in value: + if isinstance(v, str) and not v.isdigit(): + tag_names.append(v) + else: + try: + tag = Tag.objects.get(pk=v) + tag_names.append(tag.name) + except (Tag.DoesNotExist, ValueError): + if isinstance(v, str): + tag_names.append(v) + tags = sorted(tag_names) + elif isinstance(value, str): + tags = sorted([t.strip() for t in value.split(",") if t.strip()]) + + widget_id_raw = attrs.get("id", name) if attrs else name + widget_id = self._normalize_id(widget_id_raw) + + # Build pills HTML + pills_html = "" + for tag in tags: + pills_html += f''' + + {self._escape(tag)} + + + ''' + + # Build the widget HTML + html = f''' +
+
+ {pills_html} +
+ + + +
+ + + ''' + + return mark_safe(html) + + +class URLFiltersWidget(forms.Widget): + """Render URL allowlist / denylist controls with same-domain autofill.""" + + template_name = "" + + def __init__(self, attrs=None, *, source_selector='textarea[name="url"]'): + self.source_selector = source_selector + super().__init__(attrs) + + def render(self, name, value, attrs=None, renderer=None): + value = value if isinstance(value, dict) else {} + widget_id_raw = attrs.get("id", name) if attrs else name + widget_id = re.sub(r"[^A-Za-z0-9_]", "_", str(widget_id_raw)) or name + value = value or {} + allowlist = escape(value.get("allowlist", "") or "") + denylist = escape(value.get("denylist", "") or "") + same_domain_checked = " checked" if value.get("same_domain_only") else "" + subpaths_checked = " checked" if value.get("subpaths_only") else "" + only_new_checked = " checked" if value.get("only_new") else "" + + return mark_safe(f''' +
+ +
+
+
+ + Regex patterns or domains to include, one pattern per line. +
+ +
+
+
+ + Regex patterns or domains to exclude, one pattern per line. +
+ +
+
+ + + +
These values can be one regex pattern or domain per line. URL_DENYLIST takes precedence over URL_ALLOWLIST.
+ +
+ ''') + + def value_from_datadict(self, data, files, name): + return { + "allowlist": data.get(f"{name}_allowlist", ""), + "denylist": data.get(f"{name}_denylist", ""), + "same_domain_only": data.get(f"{name}_same_domain_only") in ("1", "on", "true"), + "subpaths_only": data.get(f"{name}_subpaths_only") in ("1", "on", "true"), + "only_new": data.get(f"{name}_only_new") in ("1", "on", "true"), + } + + +class InlineTagEditorWidget(TagEditorWidget): + """ + Inline version of TagEditorWidget for use in list views. + Includes AJAX save functionality for immediate persistence. + """ + + def __init__(self, attrs=None, snapshot_id=None, editable=True): + super().__init__(attrs, snapshot_id) + self.snapshot_id = snapshot_id + self.editable = editable + + def render(self, name, value, attrs=None, renderer=None, snapshot_id=None): + """Render inline tag editor with AJAX save.""" + # Use snapshot_id from __init__ or from render call + snapshot_id = snapshot_id or self.snapshot_id + + # Parse value to get list of tag dicts with id and name + tag_data = [] + if value: + if isinstance(value, (BaseManager, QuerySet)): + for tag in value.all(): + tag_data.append({"id": tag.pk, "name": tag.name}) + tag_data.sort(key=lambda x: x["name"].lower()) + elif isinstance(value, (list, tuple)): + from archivebox.core.models import Tag + + if value and isinstance(value[0], Tag): + for tag in value: + tag_data.append({"id": tag.pk, "name": tag.name}) + tag_data.sort(key=lambda x: x["name"].lower()) + + widget_id_raw = f"inline_tags_{snapshot_id}" if snapshot_id else (attrs.get("id", name) if attrs else name) + widget_id = self._normalize_id(widget_id_raw) + + # Build pills HTML with filter links + pills_html = "" + for td in tag_data: + remove_button = "" + if self.editable: + remove_button = ( + f'' + ) + pills_html += f''' + + {self._escape(td["name"])} + {remove_button} + + ''' + + tags_json = escape(json.dumps(tag_data)) + input_html = "" + readonly_class = " readonly" if not self.editable 
else "" + if self.editable: + input_html = f''' + + + ''' + + html = f''' + + + {pills_html} + + {input_html} + + ''' + + return mark_safe(html) diff --git a/archivebox/core/wsgi.py b/archivebox/core/wsgi.py new file mode 100644 index 0000000000..1b667177a3 --- /dev/null +++ b/archivebox/core/wsgi.py @@ -0,0 +1,16 @@ +""" +WSGI config for archivebox project. + +It exposes the WSGI callable as a module-level variable named ``application``. + +For more information on this file, see +https://docs.djangoproject.com/en/2.1/howto/deployment/wsgi/ +""" + +import archivebox # noqa +from archivebox.config.django import setup_django +from django.core.wsgi import get_wsgi_application + +setup_django(in_memory_db=False, check_db=True) + +application = get_wsgi_application() diff --git a/archivebox/crawls/__init__.py b/archivebox/crawls/__init__.py new file mode 100644 index 0000000000..b47f54cadd --- /dev/null +++ b/archivebox/crawls/__init__.py @@ -0,0 +1,8 @@ +__package__ = "archivebox.crawls" +__order__ = 100 + + +def register_admin(admin_site): + from .admin import register_admin as register_crawls_admin + + register_crawls_admin(admin_site) diff --git a/archivebox/crawls/admin.py b/archivebox/crawls/admin.py new file mode 100644 index 0000000000..9f43c48da0 --- /dev/null +++ b/archivebox/crawls/admin.py @@ -0,0 +1,1331 @@ +__package__ = "archivebox.crawls" + +from copy import copy +import json +from urllib.parse import urlencode, urlparse + +from django import forms +from django.core.paginator import Paginator +from django.http import JsonResponse, HttpRequest, HttpResponseBadRequest, HttpResponseNotAllowed +from django.shortcuts import get_object_or_404, redirect +from django.template.response import TemplateResponse +from django.template.loader import render_to_string +from django.urls import path, reverse +from django.utils.html import escape, format_html, format_html_join +from django.utils import timezone +from django.utils.safestring import mark_safe +from 
django.contrib import admin, messages +from django.db.models import Count, F, Q + + +from django_object_actions import action + +from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin + +from archivebox.core.models import ArchiveResult, Snapshot +from archivebox.core.permissions import ( + PERMISSIONS_CHOICES, + PERMISSIONS_META, + PERMISSIONS_PRIVATE, + PERMISSIONS_PUBLIC, + PERMISSIONS_UNLISTED, + PERMISSIONS_VALUES, + normalize_permissions, +) +from archivebox.core.widgets import TagEditorWidget, URLFiltersWidget +from archivebox.crawls.models import Crawl, CrawlSchedule +from archivebox.misc.paginators import AcceleratedPaginator +from archivebox.progressmonitor.views import progress_endpoint +from archivebox.workers.models import RETRY_AT_MAX + + +class MaxDepthListFilter(admin.SimpleListFilter): + title = "max depth" + parameter_name = "max_depth" + + def lookups(self, request, model_admin): + return [(str(depth), str(depth)) for depth in range(5)] + + def queryset(self, request, queryset): + value = self.value() + if value is not None and value.isdigit(): + return queryset.filter(max_depth=int(value)) + return queryset + + +def render_snapshots_list(snapshots_qs, request=None, crawl=None, page_size=50, prefix="snapshots"): + """Render a nice inline list view of snapshots with status, title, URL, and progress.""" + + query_param = f"{prefix}_q" + status_param = f"{prefix}_status" + page_param = f"{prefix}_page" + query = (request.GET.get(query_param, "") if request is not None else "").strip() + status_filter = (request.GET.get(status_param, "") if request is not None else "").strip() + valid_statuses = {choice[0] for choice in Snapshot.StatusChoices.choices} + + filtered_qs = snapshots_qs + if query: + from archivebox.misc.util import filter_queryset_by_uuid_substring + + id_match_pks = list(filter_queryset_by_uuid_substring(Snapshot.objects.all(), query).values_list("pk", flat=True)[:100]) + filtered_qs = 
filtered_qs.filter(Q(pk__in=id_match_pks) | Q(url__icontains=query) | Q(title__icontains=query)) + if status_filter in valid_statuses: + filtered_qs = filtered_qs.filter(status=status_filter) + + # Keep ArchiveResult counters as scalar subqueries so the paginated + # Snapshot queryset does not become a join+GROUP BY over every result row. + snapshots_qs = filtered_qs.order_by("-created_at").annotate( + total_results=ArchiveResult.snapshot_count_expr(), + succeeded_results=ArchiveResult.snapshot_count_expr(status=ArchiveResult.StatusChoices.SUCCEEDED), + failed_results=ArchiveResult.snapshot_count_expr(status=ArchiveResult.StatusChoices.FAILED), + started_results=ArchiveResult.snapshot_count_expr(status=ArchiveResult.StatusChoices.STARTED), + skipped_results=ArchiveResult.snapshot_count_expr(status=ArchiveResult.StatusChoices.SKIPPED), + snapshot_permissions=F("permissions"), + ) + + page_number = request.GET.get(page_param, 1) if request is not None else 1 + paginator = Paginator(snapshots_qs, page_size) + page_obj = paginator.get_page(page_number) + snapshots = page_obj.object_list + total_count = paginator.count + + def querystring(**updates): + if request is None: + return "#" + params = request.GET.copy() + for key, value in updates.items(): + if value in (None, ""): + params.pop(key, None) + else: + params[key] = str(value) + return f"?{params.urlencode()}" if params else "?" + + preserved_inputs = "" + if request is not None: + managed_params = {query_param, status_param, page_param} + preserved_inputs = "".join( + f'' + for key, values in request.GET.lists() + if key not in managed_params + for value in values + ) + + status_options = "".join( + f'' + for value, label in Snapshot.StatusChoices.choices + ) + + controls = f""" +
+
+ {preserved_inputs} + + + + + {f'Clear' if query or status_filter else ""} +
+
+ {page_obj.start_index() if total_count else 0}-{page_obj.end_index() if total_count else 0} of {total_count} +
+
+ """ + + if not snapshots: + return mark_safe(f""" +
+ {controls} +
No Snapshots found.
+
+ """) + + # Status colors matching Django admin and progress monitor + status_colors = { + "queued": ("#6c757d", "#f8f9fa"), # gray + "started": ("#856404", "#fff3cd"), # amber + "paused": ("#1d4ed8", "#dbeafe"), # blue + "sealed": ("#155724", "#d4edda"), # green + "failed": ("#721c24", "#f8d7da"), # red + } + + rows = [] + for snapshot in snapshots: + status = snapshot.status or "queued" + color, bg = status_colors.get(status, ("#6c757d", "#f8f9fa")) + permissions = snapshot.snapshot_permissions + permission_icon = { + PERMISSIONS_PUBLIC: "๐Ÿ‘", + PERMISSIONS_UNLISTED: "๐Ÿ”—", + PERMISSIONS_PRIVATE: "๐Ÿ”’", + }[permissions] + permission_fg, permission_bg = { + PERMISSIONS_PUBLIC: ("#047857", "#d1fae5"), + PERMISSIONS_UNLISTED: ("#1d4ed8", "#dbeafe"), + PERMISSIONS_PRIVATE: ("#991b1b", "#fee2e2"), + }[permissions] + + # Calculate progress + total = snapshot.total_results + succeeded = snapshot.succeeded_results + failed = snapshot.failed_results + running = snapshot.started_results + skipped = snapshot.skipped_results + done = succeeded + failed + skipped + pending = max(total - done - running, 0) + progress_pct = int((done / total) * 100) if total > 0 else 0 + progress_text = f"{done}/{total}" if total > 0 else "-" + progress_title = f"{succeeded} succeeded, {failed} failed, {running} running, {pending} pending, {skipped} skipped" + progress_color = "#28a745" + if failed: + progress_color = "#dc3545" + elif running: + progress_color = "#17a2b8" + elif pending: + progress_color = "#ffc107" + + # Truncate title and URL + snapshot_title = snapshot.title or "Untitled" + title = snapshot_title[:60] + if len(snapshot_title) > 60: + title += "..." + url_display = snapshot.url[:50] + if len(snapshot.url) > 50: + url_display += "..." 
+ delete_button = "" + exclude_button = "" + if crawl is not None: + delete_url = reverse("admin:crawls_crawl_snapshot_delete", args=[crawl.pk, snapshot.pk]) + exclude_url = reverse("admin:crawls_crawl_snapshot_exclude_domain", args=[crawl.pk, snapshot.pk]) + delete_button = f''' + + ''' + exclude_button = f''' + + ''' + + # Format date + date_str = snapshot.created_at.strftime("%Y-%m-%d %H:%M") if snapshot.created_at else "-" + + rows.append(f''' + + + {status} + + + {permission_icon} + + + + + + + + {escape(title)} + + + {escape(url_display)} + + +
+
+
+
+ {progress_text} +
+ + + {date_str} + + {f'
{exclude_button}{delete_button}
' if crawl is not None else ""} + + ''') + + pagination = "" + if paginator.num_pages > 1: + pagination = f""" +
+ {"Previous" if page_obj.has_previous() else "Previous"} + Page {page_obj.number} of {paginator.num_pages} + {"Next" if page_obj.has_next() else "Next"} +
+ """ + + return mark_safe(f""" +
+ {controls} + + + + + + + + + + + { + '' if crawl is not None else "" + } + + + + {"".join(rows)} + +
Status๐Ÿ”’TitleURLProgressCreatedActions
+ {pagination} +
+ { + ''' + + ''' + if crawl is not None + else "" + } + """) + + +class URLFiltersField(forms.Field): + widget = URLFiltersWidget(source_selector="#id_urls") + + def to_python(self, value): + if isinstance(value, dict): + return value + return {"allowlist": "", "denylist": "", "same_domain_only": False, "subpaths_only": False, "only_new": False} + + +class CrawlAdminForm(forms.ModelForm): + """Custom form for Crawl admin to render urls field as textarea.""" + + tags_editor = forms.CharField( + label="Tags", + required=False, + widget=TagEditorWidget(), + help_text="Type tag names and press Enter or Space to add. Click ร— to remove.", + ) + url_filters = URLFiltersField( + label="URL Filters", + required=False, + help_text="Set URL_ALLOWLIST / URL_DENYLIST for this crawl.", + ) + + class Meta: + model = Crawl + fields = "__all__" + widgets = { + "urls": forms.Textarea( + attrs={ + "rows": 8, + "style": "width: 100%; font-family: monospace; font-size: 13px;", + "placeholder": "https://example.com\nhttps://example2.com\n# Comments start with #", + }, + ), + "notes": forms.Textarea( + attrs={ + "rows": 1, + "style": "width: 100%; min-height: 0; resize: vertical;", + }, + ), + } + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + config = dict(self.instance.config or {}) if self.instance and self.instance.pk else {} + if self.instance and self.instance.pk: + self.initial["tags_editor"] = self.instance.tags_str + effective_only_new = self.effective_only_new(self.instance if self.instance and self.instance.pk else None) + derived_filter_toggles = self.derive_filter_toggles( + self.instance.urls if self.instance and self.instance.pk else "", + config.get("URL_ALLOWLIST", ""), + ) + self.initial["url_filters"] = { + "allowlist": config.get("URL_ALLOWLIST", ""), + "denylist": config.get("URL_DENYLIST", ""), + "same_domain_only": derived_filter_toggles["same_domain_only"], + "subpaths_only": derived_filter_toggles["subpaths_only"], + "only_new": 
effective_only_new, + } + + @staticmethod + def extract_url_line(line): + line = str(line or "").strip() + if not line or line.startswith("#"): + return "" + if line.startswith("{"): + try: + return str(json.loads(line).get("url", "")).strip() + except (TypeError, ValueError, json.JSONDecodeError): + return "" + return line + + @staticmethod + def regex_escape(text): + escaped = "" + for char in str(text or ""): + escaped += f"\\{char}" if char in r".*+?^${}()|[]\\" else char + return escaped + + @classmethod + def generated_host_allowlist(cls, urls): + seen = set() + domains = [] + for raw_line in str(urls or "").splitlines(): + url = cls.extract_url_line(raw_line) + if not url: + continue + parsed = urlparse(url) + domain = (parsed.hostname or "").lower() + if not domain or domain in seen: + continue + seen.add(domain) + domains.append(domain) + if not domains: + return "" + return "^https?://(" + "|".join(cls.regex_escape(domain) for domain in domains) + ")([:/]|$)" + + @staticmethod + def subpath_prefix(pathname): + path = str(pathname or "/") + while "//" in path: + path = path.replace("//", "/") + if not path or path == "/": + return "/" + if path.endswith("/"): + return path + last_slash = path.rfind("/") + last_part = path[last_slash + 1 :] + if "." 
in last_part: + return path[: last_slash + 1] or "/" + return path + + @staticmethod + def parsed_host_and_port(parsed): + host = (parsed.hostname or "").lower() + if not host: + return "" + try: + port = parsed.port + except ValueError: + port = None + return f"{host}:{port}" if port is not None else host + + @classmethod + def generated_subpath_allowlist(cls, urls): + seen = set() + paths = [] + for raw_line in str(urls or "").splitlines(): + url = cls.extract_url_line(raw_line) + if not url: + continue + parsed = urlparse(url) + domain = (parsed.hostname or "").lower() + if domain: + seen.add(domain) + host = cls.parsed_host_and_port(parsed) + path = cls.subpath_prefix(parsed.path) + path_key = f"{host}{path}" + if not host or path_key in seen: + continue + seen.add(path_key) + paths.append((host, path)) + if not paths: + return "" + patterns = [] + for host, path in paths: + if path == "/": + patterns.append(f"^https?://{cls.regex_escape(host)}([/?#]|$)") + elif path.endswith("/"): + patterns.append(f"^https?://{cls.regex_escape(host)}{cls.regex_escape(path)}") + else: + patterns.append(f"^https?://{cls.regex_escape(host)}{cls.regex_escape(path)}([/?#]|$)") + return "\n".join(patterns) + + @classmethod + def derive_filter_toggles(cls, urls, allowlist): + normalized_allowlist = "\n".join(Crawl.split_filter_patterns(allowlist)) + if not normalized_allowlist: + return {"same_domain_only": False, "subpaths_only": False} + if normalized_allowlist == cls.generated_subpath_allowlist(urls): + return {"same_domain_only": True, "subpaths_only": True} + if normalized_allowlist == cls.generated_host_allowlist(urls): + return {"same_domain_only": True, "subpaths_only": False} + return {"same_domain_only": False, "subpaths_only": False} + + @staticmethod + def effective_only_new(crawl=None): + from archivebox.config.common import get_config + + if crawl is not None: + return bool(get_config(crawl=crawl, resolve_plugins=False).ONLY_NEW) + return 
bool(get_config(resolve_plugins=False).ONLY_NEW) + + @staticmethod + def inherited_only_new(crawl): + crawl_without_only_new = copy(crawl) + config = dict(crawl.config or {}) + config.pop("ONLY_NEW", None) + crawl_without_only_new.config = config + return CrawlAdminForm.effective_only_new(crawl_without_only_new) + + def clean_tags_editor(self): + tags_str = self.cleaned_data.get("tags_editor", "") + tag_names = [] + seen = set() + for raw_name in tags_str.split(","): + name = raw_name.strip() + if not name: + continue + lowered = name.lower() + if lowered in seen: + continue + seen.add(lowered) + tag_names.append(name) + return ",".join(tag_names) + + def clean_url_filters(self): + value = self.cleaned_data.get("url_filters") or {} + return { + "allowlist": "\n".join(Crawl.split_filter_patterns(value.get("allowlist", ""))), + "denylist": "\n".join(Crawl.split_filter_patterns(value.get("denylist", ""))), + "same_domain_only": bool(value.get("same_domain_only")), + "subpaths_only": bool(value.get("subpaths_only")), + "only_new": bool(value.get("only_new")), + } + + def save(self, commit=True): + instance = super().save(commit=False) + instance.tags_str = self.cleaned_data.get("tags_editor", "") + if f"{self.add_prefix('url_filters')}_allowlist" in self.data or f"{self.add_prefix('url_filters')}_denylist" in self.data: + url_filters = self.cleaned_data.get("url_filters") or {} + instance.set_url_filters( + url_filters.get("allowlist", ""), + url_filters.get("denylist", ""), + ) + config = dict(instance.config or {}) + only_new = bool(url_filters.get("only_new")) + inherited_only_new = self.inherited_only_new(instance) + if only_new != inherited_only_new: + config["ONLY_NEW"] = only_new + else: + config.pop("ONLY_NEW", None) + instance.config = config + if commit: + instance.save() + instance.apply_crawl_config_filters() + self._save_m2m() + return instance + + +class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): + form = CrawlAdminForm + change_form_template = 
"admin/crawls/crawl/change_form.html" + list_select_related = () + paginator = AcceleratedPaginator + show_full_result_count = False + list_display = ( + "short_id", + "permissions_badge", + "created_at", + "owner", + "depth", + "status_with_stop_reason", + "pause_resume_control", + "label", + "notes", + "urls_preview", + "schedule_str", + "retry_at", + "num_archived_snapshots", + "num_total_snapshots", + ) + sort_fields = ( + "id", + "created_at", + "created_by", + "max_depth", + "label", + "notes", + "schedule_str", + "status", + "retry_at", + ) + search_fields = ( + "id", + "created_by__username", + "max_depth", + "label", + "notes", + "schedule_id", + "status", + "urls", + ) + + readonly_fields = ("created_at", "modified_at", "stop_reason_display") + + fieldsets = ( + ( + "URLs", + { + "fields": ("urls", "url_filters"), + "classes": ("card", "wide"), + }, + ), + ( + "Overview", + { + "fields": ( + ("label", "status", "retry_at", "schedule", "created_by", "created_at", "modified_at"), + ("max_depth",), + ("stop_reason_display",), + ("notes", "tags_editor"), + ), + "classes": ("card", "wide", "crawl-admin-overview"), + }, + ), + ( + "Config", + { + "fields": ("config",), + "classes": ("card", "wide", "crawl-admin-config"), + }, + ), + ) + add_fieldsets = ( + ( + "URLs", + { + "fields": ("urls", "url_filters"), + "classes": ("card", "wide"), + }, + ), + ( + "Overview", + { + "fields": ( + ("label", "status", "retry_at", "schedule", "created_by"), + ("max_depth",), + ("notes", "tags_editor"), + ), + "classes": ("card", "wide", "crawl-admin-overview"), + }, + ), + ( + "Config", + { + "fields": ("config",), + "classes": ("card", "wide", "crawl-admin-config"), + }, + ), + ) + + list_filter = (MaxDepthListFilter, "schedule", "created_by", "status", "retry_at") + ordering = ["-created_at", "-retry_at"] + list_per_page = 50 + actions = [ + "pause_selected_crawls", + "resume_selected_crawls", + "seal_selected_crawls", + "delete_selected_batched", + 
"set_crawl_permissions", + ] + change_actions = ["recrawl"] + + def __init__(self, model, admin_site): + super().__init__(model, admin_site) + self.crawl_admin_base_config = None + self.stop_reason_cache = {} + + class Media: + css = {"all": ("admin/crawls/crawl_change.css",)} + js = ("admin/crawls/crawl_admin.js",) + + def changelist_view(self, request, extra_context=None): + self.request = request + self.crawl_admin_base_config = request.archivebox_config + self.stop_reason_cache = {} + response = super().changelist_view(request, extra_context) + if not isinstance(response, TemplateResponse): + return response + cl = response.context_data.get("cl") + if cl is not None and not self.should_annotate_snapshot_counts(request): + self.hydrate_visible_snapshot_counts(cl.result_list) + return response + + def should_annotate_snapshot_counts(self, request): + ordering = request.GET.get("o", "") + if not ordering: + return False + list_display = list(self.get_list_display(request)) + count_positions = { + str(list_display.index("num_archived_snapshots") + 1), + str(list_display.index("num_total_snapshots") + 1), + } + return any(part.lstrip("-") in count_positions for part in ordering.split(".")) + + def hydrate_visible_snapshot_counts(self, crawls): + crawl_list = list(crawls) + crawl_ids = [crawl.pk for crawl in crawl_list] + if not crawl_ids: + return + counts = Snapshot.crawl_total_and_status_counts(crawl_ids, status=Snapshot.StatusChoices.SEALED) + for crawl in crawl_list: + row = counts.get(str(crawl.pk), {}) + crawl.num_snapshots_cached = row.get("total", 0) + crawl.num_archived_snapshots_cached = row.get("status", 0) + + def get_queryset(self, request): + """Keep joins page-local while computing per-row snapshot counts in the page query.""" + queryset = ( + super() + .get_queryset(request) + .prefetch_related( + "created_by", + "persona", + "schedule__template", + ) + ) + if self.should_annotate_snapshot_counts(request): + queryset = queryset.annotate( + 
num_snapshots_cached=Snapshot.crawl_count_expr(), + num_archived_snapshots_cached=Snapshot.crawl_count_expr(status=Snapshot.StatusChoices.SEALED), + ) + return queryset + + def change_view(self, request, object_id, form_url="", extra_context=None): + self.request = request + self.crawl_admin_base_config = request.archivebox_config + self.stop_reason_cache = {} + crawl = self.get_object(request, object_id) + if crawl: + self.hydrate_visible_snapshot_counts([crawl]) + extra_context = { + **(extra_context or {}), + "crawl_stop_reason": self.stop_reason_for_crawl(crawl) if crawl else "", + "crawl_snapshots_changelist": self.snapshots_changelist(crawl) if crawl else "", + } + if crawl and crawl.status in { + Crawl.StatusChoices.QUEUED, + Crawl.StatusChoices.STARTED, + Crawl.StatusChoices.PAUSED, + }: + extra_context["progress_auto_expand"] = True + extra_context["progress_endpoint"] = progress_endpoint("crawl", crawl.id) + return super().change_view(request, object_id, form_url, extra_context) + + def add_view(self, request, form_url="", extra_context=None): + self.request = request + return super().add_view(request, form_url, extra_context) + + def get_fieldsets(self, request, obj=None): + return self.fieldsets if obj else self.add_fieldsets + + def get_urls(self): + urls = super().get_urls() + custom_urls = [ + path( + "/snapshot//delete/", + self.admin_site.admin_view(self.delete_snapshot_view), + name="crawls_crawl_snapshot_delete", + ), + path( + "/snapshot//exclude-domain/", + self.admin_site.admin_view(self.exclude_domain_view), + name="crawls_crawl_snapshot_exclude_domain", + ), + path( + "/set-permissions/", + self.admin_site.admin_view(self.set_permissions_view), + name="crawls_crawl_set_permissions", + ), + ] + return custom_urls + urls + + def get_actions(self, request): + actions = super().get_actions(request) + actions.pop("delete_selected", None) + return actions + + @admin.action(description="Delete") + def delete_selected_batched(self, request, 
queryset): + """Delete crawls in a single transaction to avoid SQLite concurrency issues.""" + from django.db import transaction + + total = queryset.count() + + # Get list of IDs to delete first (outside transaction) + ids_to_delete = list(queryset.values_list("pk", flat=True)) + + # Delete everything in a single atomic transaction + with transaction.atomic(): + deleted_count, _ = Crawl.objects.filter(pk__in=ids_to_delete).delete() + + messages.success(request, f"Successfully deleted {total} crawls ({deleted_count} total objects including related records).") + + @admin.action(description="Pause") + def pause_selected_crawls(self, request, queryset): + # Admin changelist actions must stay set-based. Calling crawl.pause() + # here fans out into per-crawl Snapshot/ArchiveResult writes and can + # hold SQLite behind the request for minutes on large archives. The + # Crawl row is the scheduler signal; the runner observes PAUSED and + # owns child-row lifecycle work. + paused = queryset.exclude(status__in=Crawl.INACTIVE_STATES).update( + status=Crawl.StatusChoices.PAUSED, + retry_at=RETRY_AT_MAX, + modified_at=timezone.now(), + ) + if paused: + messages.success(request, f"Paused {paused} crawl(s). The runner will stop scheduling new work on the next sweep.") + else: + messages.warning(request, "No active crawls were selected to pause.") + + @admin.action(description="Resume") + def resume_selected_crawls(self, request, queryset): + # Keep resume symmetrical with pause: one tight scheduler UPDATE, no + # save() hooks and no child fanout in the request path. Paused child + # rows become runnable through their own resume/maintenance paths. + resumed = queryset.filter(status__in=Crawl.INACTIVE_STATES).update( + status=Crawl.StatusChoices.QUEUED, + retry_at=timezone.now(), + modified_at=timezone.now(), + ) + if resumed: + messages.success(request, f"Resumed {resumed} crawl(s). 
The runner will pick them up on the next sweep.") + else: + messages.warning(request, "No paused or sealed crawls were selected to resume.") + + @admin.action(description="Seal") + def seal_selected_crawls(self, request, queryset): + now = timezone.now() + crawl_ids = list(queryset.exclude(status=Crawl.StatusChoices.SEALED).values_list("pk", flat=True)) + if not crawl_ids: + messages.warning(request, "No unsealed crawls were selected to seal.") + return + + Snapshot.objects.filter( + crawl_id__in=crawl_ids, + status__in=Snapshot.OPEN_STATES, + ).filter( + Q(retry_at__isnull=True) | Q(retry_at__gt=now), + ).update( + retry_at=now, + modified_at=now, + ) + sealed = ( + Crawl.objects.filter(pk__in=crawl_ids) + .exclude(status=Crawl.StatusChoices.SEALED) + .update( + status=Crawl.StatusChoices.SEALED, + retry_at=now, + modified_at=now, + ) + ) + messages.success(request, f"Sealed {sealed} crawl(s). The runner will finish cleanup on the next sweep.") + + @admin.action(description="Permissions โ–พ") + def set_crawl_permissions(self, request, queryset): + permissions = (request.POST.get("permissions") or "").strip().lower() + if permissions not in PERMISSIONS_VALUES: + messages.error(request, "Choose a valid permissions value.") + return + updated = self.update_crawl_permissions(queryset, permissions) + messages.success(request, f"Set permissions to {permissions} on {updated} crawl(s).") + + def update_crawl_permissions(self, queryset, permissions): + now = timezone.now() + updated = 0 + batch = [] + crawls_to_update = [] + for crawl in queryset.only("id", "config", "permissions").iterator(chunk_size=500): + old_permissions = crawl.permissions + config = dict(crawl.config or {}) + config["PERMISSIONS"] = permissions + crawl.config = config + crawl.modified_at = now + crawls_to_update.append((crawl, old_permissions)) + batch.append(crawl) + if len(batch) >= 500: + Crawl.objects.bulk_update(batch, ["config", "modified_at"], batch_size=500) + updated += len(batch) + 
batch.clear() + if batch: + Crawl.objects.bulk_update(batch, ["config", "modified_at"], batch_size=500) + updated += len(batch) + for crawl, old_permissions in crawls_to_update: + crawl.update_child_snapshot_permissions(old_permissions, permissions) + return updated + + @action(label="Recrawl", description="Create a new crawl with the same settings", methods=("POST",)) + def recrawl(self, request, obj): + """Duplicate this crawl as a new crawl with the same URLs and settings.""" + + # Validate URLs (required for crawl to start) + if not obj.urls: + messages.error(request, "Cannot recrawl: original crawl has no URLs.") + return redirect("admin:crawls_crawl_change", obj.id) + + new_crawl = Crawl.create_scheduler_row( + urls=obj.urls, + max_depth=obj.max_depth, + tags_str=obj.tags_str, + config=obj.config, + schedule=obj.schedule, + label=f"{obj.label} (recrawl)" if obj.label else "", + notes=obj.notes, + created_by=request.user, + status=Crawl.StatusChoices.QUEUED, + retry_at=timezone.now(), + ) + + messages.success(request, f"Created new crawl {new_crawl.id} with the same settings. 
It will start processing shortly.") + + return redirect("admin:crawls_crawl_change", new_crawl.id) + + @admin.display(description="Stop Reason") + def stop_reason_display(self, obj): + reason = self.stop_reason_for_crawl(obj) if obj else "" + if not reason: + return mark_safe('None') + return format_html('{}', reason) + + def stop_reason_for_crawl(self, obj): + if obj.pk in self.stop_reason_cache: + return self.stop_reason_cache[obj.pk] + + output_dir = obj.output_dir + config = self.limit_config_for_crawl(obj, output_dir) + reason = obj.stop_reason( + config=config, + output_dir=output_dir, + num_snapshots=obj.num_snapshots_cached, + num_sealed_snapshots=obj.num_archived_snapshots_cached, + ) + self.stop_reason_cache[obj.pk] = reason + return reason + + def limit_config_for_crawl(self, obj, output_dir): + from archivebox.config.common import get_config + + return get_config(crawl=obj).for_crawl_runtime( + crawl=obj, + persona=obj.resolve_persona(), + crawl_output_dir=output_dir, + ) + + @admin.display(description="Status", ordering="status") + def status_with_stop_reason(self, obj): + status = "PAUSED" if obj.is_paused else str(obj.status or "").upper() + reason = self.stop_reason_for_crawl(obj) if obj.is_paused or obj.status == Crawl.StatusChoices.SEALED else "" + if reason: + reason_label = reason.removeprefix("crawl_").replace("_", " ") + return format_html( + '{}{}', + obj.status, + status, + reason, + reason_label, + ) + return format_html('{}', obj.status, status) + + @admin.display(description="ID", ordering="id") + def short_id(self, obj): + short_id = str(obj.pk)[-8:] + return format_html('{}', obj.admin_change_url, short_id) + + @admin.display(description="Owner", ordering="created_by") + def owner(self, obj): + return obj.created_by + + @admin.display(description="Depth", ordering="max_depth") + def depth(self, obj): + return obj.max_depth + + @admin.display(description="๐Ÿ‘", ordering="permissions") + def permissions_badge(self, obj): + permissions = 
normalize_permissions(obj.permissions) + icon, label, fg, bg = PERMISSIONS_META[permissions] + menu_items = format_html_join( + "", + ( + '" + ), + ( + ( + " is-active" if choice_value == permissions else "", + choice_value, + choice_fg, + choice_bg, + choice_icon, + choice_label, + ) + for choice_value, choice_label in PERMISSIONS_CHOICES + for choice_icon, _choice_title, choice_fg, choice_bg in [PERMISSIONS_META[choice_value]] + ), + ) + return format_html( + '' + '" + '' + "", + permissions, + reverse(f"{self.admin_site.name}:crawls_crawl_set_permissions", args=[obj.pk]), + permissions, + label, + label, + fg, + bg, + icon, + menu_items, + ) + + @admin.display(description="Pause") + def pause_resume_control(self, obj): + if obj.is_paused or obj.status == Crawl.StatusChoices.SEALED: + reason = "paused" if obj.is_paused else (self.stop_reason_for_crawl(obj) or "sealed") + return format_html( + '', + obj.pk, + reason, + ) + return format_html( + '', + obj.pk, + ) + + @admin.display(description="Archived", ordering="num_archived_snapshots_cached") + def num_archived_snapshots(self, obj): + return obj.num_archived_snapshots_cached + + @admin.display(description="Snapshots", ordering="num_snapshots_cached") + def num_total_snapshots(self, obj): + return obj.num_snapshots_cached + + @admin.display(description="Snapshots") + def snapshots_changelist(self, obj): + request = self.request + snapshot_changelist = reverse("admin:core_snapshot_changelist") + scoped_params = {"crawl_id": str(obj.pk)} + full_url = f"{snapshot_changelist}?{urlencode(scoped_params)}" + + snapshot_admin = self.admin_site._registry[Snapshot] + changelist_request = copy(request) + changelist_request.method = "GET" + changelist_request.path = snapshot_changelist + changelist_request.GET = request.GET.copy() + changelist_request.GET.update( + { + **scoped_params, + "_embedded": "crawl", + "per_page": "200", + }, + ) + changelist_request.POST = request.POST.copy() + changelist_request.POST.clear() + + 
response = snapshot_admin.changelist_view( + changelist_request, + extra_context={"embedded_changelist": True}, + ) + context = { + **response.context_data, + "snapshot_changelist_url": full_url, + "crawl": obj, + } + return mark_safe(render_to_string("admin/crawls/crawl/snapshots_changelist.html", context, request=request)) + + def delete_snapshot_view(self, request: HttpRequest, object_id: str, snapshot_id: str): + if request.method != "POST": + return HttpResponseNotAllowed(["POST"]) + + crawl = get_object_or_404(Crawl, pk=object_id) + snapshot = get_object_or_404(Snapshot, pk=snapshot_id, crawl=crawl) + + if snapshot.status == Snapshot.StatusChoices.STARTED: + snapshot.cancel_running_hooks() + + removed_urls = crawl.prune_url(snapshot.url) + snapshot.delete() + return JsonResponse( + { + "ok": True, + "snapshot_id": str(snapshot.id), + "removed_urls": removed_urls, + }, + ) + + def exclude_domain_view(self, request: HttpRequest, object_id: str, snapshot_id: str): + if request.method != "POST": + return HttpResponseNotAllowed(["POST"]) + + crawl = get_object_or_404(Crawl, pk=object_id) + snapshot = get_object_or_404(Snapshot, pk=snapshot_id, crawl=crawl) + result = crawl.exclude_domain(snapshot.url) + return JsonResponse( + { + "ok": True, + **result, + }, + ) + + def set_permissions_view(self, request: HttpRequest, object_id: str): + if request.method != "POST": + return HttpResponseNotAllowed(["POST"]) + + permissions = (request.POST.get("permissions") or "").strip().lower() + if permissions not in PERMISSIONS_VALUES: + return HttpResponseBadRequest("Invalid permissions value") + + crawl = get_object_or_404(Crawl, pk=object_id) + self.update_crawl_permissions(Crawl.objects.filter(pk=crawl.pk), permissions) + icon, label, fg, bg = PERMISSIONS_META[permissions] + return JsonResponse({"permissions": permissions, "icon": icon, "label": label, "fg": fg, "bg": bg}) + + @admin.display(description="Schedule", ordering="schedule") + def schedule_str(self, obj): + if 
not obj.schedule: + return mark_safe("None") + return format_html('{}', obj.schedule.admin_change_url, obj.schedule) + + @admin.display(description="URLs", ordering="urls") + def urls_preview(self, obj): + first_url = next((line.strip() for line in (obj.urls or "").splitlines() if line.strip() and not line.strip().startswith("#")), "") + return first_url[:80] + "..." if len(first_url) > 80 else first_url + + @admin.display(description="URLs") + def urls_editor(self, obj): + """Editor for crawl URLs.""" + widget_id = f"crawl_urls_{obj.pk}" + + # Escape for safe HTML embedding + escaped_urls = (obj.urls or "").replace("&", "&").replace("<", "<").replace(">", ">").replace('"', """) + + # Count lines for auto-expand logic + line_count = len((obj.urls or "").split("\n")) + uri_rows = min(max(3, line_count), 10) + + html = f''' +
+ +
+ + +

+ {line_count} URL{"s" if line_count != 1 else ""} ยท Note: URLs displayed here for reference only +

+
+
+ ''' + return mark_safe(html) + + +class CrawlScheduleAdmin(BaseModelAdmin): + list_display = ("id", "created_at", "created_by", "label", "notes", "template_str", "crawls", "num_crawls", "num_snapshots") + sort_fields = ("id", "created_at", "created_by", "label", "notes", "template_str") + search_fields = ("id", "created_by__username", "label", "notes", "schedule_id", "template_id", "template__urls") + + readonly_fields = ("created_at", "modified_at", "crawls", "snapshots") + + fieldsets = ( + ( + "Schedule Info", + { + "fields": ("label", "notes"), + "classes": ("card",), + }, + ), + ( + "Configuration", + { + "fields": ("schedule", "template"), + "classes": ("card",), + }, + ), + ( + "Metadata", + { + "fields": ("created_by", "created_at", "modified_at"), + "classes": ("card",), + }, + ), + ( + "Crawls", + { + "fields": ("crawls",), + "classes": ("card", "wide"), + }, + ), + ( + "Snapshots", + { + "fields": ("snapshots",), + "classes": ("card", "wide"), + }, + ), + ) + + list_filter = ("created_by",) + ordering = ["-created_at"] + list_per_page = 100 + actions = ["delete_selected"] + + def get_queryset(self, request): + self.request = request + return ( + super() + .get_queryset(request) + .select_related("created_by", "template") + .annotate( + crawl_count=Count("crawl", distinct=True), + snapshot_count=Count("crawl__snapshot_set", distinct=True), + ) + ) + + def change_view(self, request, object_id, form_url="", extra_context=None): + self.request = request + return super().change_view(request, object_id, form_url, extra_context) + + def add_view(self, request, form_url="", extra_context=None): + return redirect("/add/#schedule") + + def get_fieldsets(self, request, obj=None): + if obj is None: + return tuple(fieldset for fieldset in self.fieldsets if fieldset[0] not in {"Crawls", "Snapshots"}) + return self.fieldsets + + def save_model(self, request, obj, form, change): + if not obj.created_by_id and request.user.is_authenticated: + obj.created_by = 
request.user + super().save_model(request, obj, form, change) + + @admin.display(description="Template", ordering="template") + def template_str(self, obj): + return format_html('{}', obj.template.admin_change_url, obj.template) + + @admin.display(description="# Crawls", ordering="crawl_count") + def num_crawls(self, obj): + count = obj.__dict__.get("crawl_count") + if count is None: + count = obj.crawl_set.count() + return count + + @admin.display(description="# Snapshots", ordering="snapshot_count") + def num_snapshots(self, obj): + count = obj.__dict__.get("snapshot_count") + if count is None: + count = Snapshot.objects.filter(crawl__schedule=obj).count() + return count + + def crawls(self, obj): + return format_html_join( + "
", + ' - {}', + ((crawl.admin_change_url, crawl) for crawl in obj.crawl_set.all().order_by("-created_at")[:20]), + ) or mark_safe("No Crawls yet...") + + def snapshots(self, obj): + crawl_ids = obj.crawl_set.values_list("pk", flat=True) + return render_snapshots_list( + Snapshot.objects.filter(crawl_id__in=crawl_ids), + request=self.request, + prefix="schedule_snapshots", + ) + + +def register_admin(admin_site): + admin_site.register(Crawl, CrawlAdmin) + admin_site.register(CrawlSchedule, CrawlScheduleAdmin) diff --git a/archivebox/crawls/apps.py b/archivebox/crawls/apps.py new file mode 100644 index 0000000000..b9e5ed660f --- /dev/null +++ b/archivebox/crawls/apps.py @@ -0,0 +1,15 @@ +from django.apps import AppConfig + + +class CrawlsConfig(AppConfig): + default_auto_field = "django.db.models.BigAutoField" + name = "archivebox.crawls" + label = "crawls" + + def ready(self): + """Import models to register state machines with the registry""" + import sys + + # Skip during makemigrations to avoid premature state machine access + if "makemigrations" not in sys.argv: + from archivebox.crawls.models import CrawlMachine # noqa: F401 diff --git a/archivebox/crawls/migrations/0001_initial.py b/archivebox/crawls/migrations/0001_initial.py new file mode 100644 index 0000000000..c90b52ad85 --- /dev/null +++ b/archivebox/crawls/migrations/0001_initial.py @@ -0,0 +1,177 @@ +# Generated by hand on 2025-12-29 +# Creates Crawl and CrawlSchedule tables using raw SQL + +from django.db import migrations, models +import django.db.models.deletion +import django.utils.timezone +import django.core.validators +from django.conf import settings +from archivebox.uuid_compat import uuid7 +from archivebox.base_models.models import get_or_create_system_user_pk + + +class Migration(migrations.Migration): + initial = True + + dependencies = [ + ("auth", "0012_alter_user_first_name_max_length"), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + 
migrations.SeparateDatabaseAndState( + database_operations=[ + migrations.RunSQL( + sql=""" + -- Create crawls_crawlschedule table first (circular FK will be added later) + CREATE TABLE IF NOT EXISTS crawls_crawlschedule ( + id TEXT PRIMARY KEY NOT NULL, + created_at DATETIME NOT NULL, + modified_at DATETIME NOT NULL, + num_uses_succeeded INTEGER NOT NULL DEFAULT 0, + num_uses_failed INTEGER NOT NULL DEFAULT 0, + + schedule VARCHAR(64) NOT NULL, + is_enabled BOOLEAN NOT NULL DEFAULT 1, + label VARCHAR(64) NOT NULL DEFAULT '', + notes TEXT NOT NULL DEFAULT '', + + template_id TEXT NOT NULL, + created_by_id INTEGER NOT NULL, + + FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE CASCADE + ); + CREATE INDEX IF NOT EXISTS crawls_crawlschedule_created_at_idx ON crawls_crawlschedule(created_at); + CREATE INDEX IF NOT EXISTS crawls_crawlschedule_created_by_id_idx ON crawls_crawlschedule(created_by_id); + CREATE INDEX IF NOT EXISTS crawls_crawlschedule_template_id_idx ON crawls_crawlschedule(template_id); + + -- Create crawls_crawl table + CREATE TABLE IF NOT EXISTS crawls_crawl ( + id TEXT PRIMARY KEY NOT NULL, + created_at DATETIME NOT NULL, + modified_at DATETIME NOT NULL, + num_uses_succeeded INTEGER NOT NULL DEFAULT 0, + num_uses_failed INTEGER NOT NULL DEFAULT 0, + + urls TEXT NOT NULL, + config TEXT, + max_depth INTEGER NOT NULL DEFAULT 0, + tags_str VARCHAR(1024) NOT NULL DEFAULT '', + persona_id TEXT, + label VARCHAR(64) NOT NULL DEFAULT '', + notes TEXT NOT NULL DEFAULT '', + output_dir VARCHAR(512) NOT NULL DEFAULT '', + + status VARCHAR(15) NOT NULL DEFAULT 'queued', + retry_at DATETIME, + created_by_id INTEGER NOT NULL, + schedule_id TEXT, + + FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE CASCADE, + FOREIGN KEY (schedule_id) REFERENCES crawls_crawlschedule(id) ON DELETE SET NULL + ); + CREATE INDEX IF NOT EXISTS crawls_crawl_status_idx ON crawls_crawl(status); + CREATE INDEX IF NOT EXISTS crawls_crawl_retry_at_idx ON 
crawls_crawl(retry_at); + CREATE INDEX IF NOT EXISTS crawls_crawl_created_at_idx ON crawls_crawl(created_at); + CREATE INDEX IF NOT EXISTS crawls_crawl_created_by_id_idx ON crawls_crawl(created_by_id); + CREATE INDEX IF NOT EXISTS crawls_crawl_schedule_id_idx ON crawls_crawl(schedule_id); + """, + reverse_sql=""" + DROP TABLE IF EXISTS crawls_crawl; + DROP TABLE IF EXISTS crawls_crawlschedule; + """, + ), + ], + state_operations=[ + migrations.CreateModel( + name="CrawlSchedule", + fields=[ + ("id", models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)), + ("created_at", models.DateTimeField(db_index=True, default=django.utils.timezone.now)), + ("modified_at", models.DateTimeField(auto_now=True)), + ("num_uses_succeeded", models.PositiveIntegerField(default=0)), + ("num_uses_failed", models.PositiveIntegerField(default=0)), + ("schedule", models.CharField(max_length=64)), + ("is_enabled", models.BooleanField(default=True)), + ("label", models.CharField(blank=True, default="", max_length=64)), + ("notes", models.TextField(blank=True, default="")), + ( + "created_by", + models.ForeignKey( + default=get_or_create_system_user_pk, + on_delete=django.db.models.deletion.CASCADE, + to=settings.AUTH_USER_MODEL, + ), + ), + ], + options={ + "verbose_name": "Scheduled Crawl", + "verbose_name_plural": "Scheduled Crawls", + "app_label": "crawls", + }, + ), + migrations.CreateModel( + name="Crawl", + fields=[ + ("id", models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)), + ("created_at", models.DateTimeField(db_index=True, default=django.utils.timezone.now)), + ("modified_at", models.DateTimeField(auto_now=True)), + ("num_uses_succeeded", models.PositiveIntegerField(default=0)), + ("num_uses_failed", models.PositiveIntegerField(default=0)), + ("urls", models.TextField(help_text="Newline-separated list of URLs to crawl")), + ("config", models.JSONField(blank=True, default=dict, null=True)), + ( 
+ "max_depth", + models.PositiveSmallIntegerField( + default=0, + validators=[django.core.validators.MinValueValidator(0), django.core.validators.MaxValueValidator(4)], + ), + ), + ("tags_str", models.CharField(blank=True, default="", max_length=1024)), + ("persona_id", models.UUIDField(blank=True, null=True)), + ("label", models.CharField(blank=True, default="", max_length=64)), + ("notes", models.TextField(blank=True, default="")), + ("output_dir", models.CharField(blank=True, default="", max_length=512)), + ( + "status", + models.CharField( + choices=[("queued", "Queued"), ("started", "Started"), ("sealed", "Sealed")], + db_index=True, + default="queued", + max_length=15, + ), + ), + ("retry_at", models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True)), + ( + "created_by", + models.ForeignKey( + default=get_or_create_system_user_pk, + on_delete=django.db.models.deletion.CASCADE, + to=settings.AUTH_USER_MODEL, + ), + ), + ( + "schedule", + models.ForeignKey( + blank=True, + editable=True, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + to="crawls.crawlschedule", + ), + ), + ], + options={ + "verbose_name": "Crawl", + "verbose_name_plural": "Crawls", + "app_label": "crawls", + }, + ), + migrations.AddField( + model_name="crawlschedule", + name="template", + field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to="crawls.crawl"), + ), + ], + ), + ] diff --git a/archivebox/crawls/migrations/0002_upgrade_from_0_8_6.py b/archivebox/crawls/migrations/0002_upgrade_from_0_8_6.py new file mode 100644 index 0000000000..eee69b1a06 --- /dev/null +++ b/archivebox/crawls/migrations/0002_upgrade_from_0_8_6.py @@ -0,0 +1,156 @@ +# Generated by hand on 2025-12-31 +# Upgrades crawls_crawl table from v0.8.6rc0 to v0.9.0 + +from django.db import migrations, connection + + +def upgrade_crawl_table_from_v086(apps, schema_editor): + """Upgrade crawls_crawl table from v0.8.6rc0 schema to v0.9.0 schema.""" + cursor = 
connection.cursor() + + # Check if crawls_crawl table exists + cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='crawls_crawl'") + if not cursor.fetchone(): + return + + # Detect schema version + cursor.execute("PRAGMA table_info(crawls_crawl)") + crawl_cols = {row[1] for row in cursor.fetchall()} + has_seed_id = "seed_id" in crawl_cols + has_urls = "urls" in crawl_cols + + cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='crawls_crawlschedule'") + if cursor.fetchone(): + cursor.execute("PRAGMA table_info(crawls_crawlschedule)") + schedule_cols = {row[1] for row in cursor.fetchall()} + schedule_sets = [] + if "id" in schedule_cols: + schedule_sets.append("id = REPLACE(id, '-', '')") + if "template_id" in schedule_cols: + schedule_sets.append( + "template_id = CASE " + "WHEN template_id IS NOT NULL " + "AND LENGTH(REPLACE(template_id, '-', '')) = 32 " + "AND REPLACE(template_id, '-', '') NOT GLOB '*[^0-9A-Fa-f]*' " + "THEN REPLACE(template_id, '-', '') ELSE NULL END", + ) + if schedule_sets: + cursor.execute(f"UPDATE crawls_crawlschedule SET {', '.join(schedule_sets)}") + + crawl_sets = [] + if "id" in crawl_cols: + crawl_sets.append("id = REPLACE(id, '-', '')") + if "persona_id" in crawl_cols: + crawl_sets.append( + "persona_id = CASE " + "WHEN persona_id IS NOT NULL " + "AND LENGTH(REPLACE(persona_id, '-', '')) = 32 " + "AND REPLACE(persona_id, '-', '') NOT GLOB '*[^0-9A-Fa-f]*' " + "THEN REPLACE(persona_id, '-', '') ELSE NULL END", + ) + if "schedule_id" in crawl_cols: + crawl_sets.append( + "schedule_id = CASE " + "WHEN schedule_id IS NOT NULL " + "AND LENGTH(REPLACE(schedule_id, '-', '')) = 32 " + "AND REPLACE(schedule_id, '-', '') NOT GLOB '*[^0-9A-Fa-f]*' " + "THEN REPLACE(schedule_id, '-', '') ELSE NULL END", + ) + if crawl_sets: + cursor.execute(f"UPDATE crawls_crawl SET {', '.join(crawl_sets)}") + + cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='core_snapshot'") + if 
cursor.fetchone(): + cursor.execute("PRAGMA table_info(core_snapshot)") + snapshot_cols = {row[1] for row in cursor.fetchall()} + if "crawl_id" in snapshot_cols: + cursor.execute( + """ + UPDATE core_snapshot + SET crawl_id = CASE + WHEN crawl_id IS NOT NULL + AND LENGTH(REPLACE(crawl_id, '-', '')) = 32 + AND REPLACE(crawl_id, '-', '') NOT GLOB '*[^0-9A-Fa-f]*' + THEN REPLACE(crawl_id, '-', '') + ELSE NULL + END + """, + ) + + # Only upgrade if we have v0.8.6rc0 schema + if not (has_seed_id and not has_urls): + return + + # Check if table has any rows + cursor.execute("SELECT COUNT(*) FROM crawls_crawl") + has_data = cursor.fetchone()[0] > 0 + + # v0.8.6rc0 schema - upgrade to v0.9.0 + if has_data: + print("Upgrading crawls_crawl from v0.8.6rc0 to v0.9.0...") + + cursor.execute(""" + CREATE TABLE IF NOT EXISTS crawls_crawl_new ( + id TEXT PRIMARY KEY NOT NULL, + created_at DATETIME NOT NULL, + modified_at DATETIME NOT NULL, + num_uses_succeeded INTEGER NOT NULL DEFAULT 0, + num_uses_failed INTEGER NOT NULL DEFAULT 0, + + urls TEXT NOT NULL, + config TEXT, + max_depth INTEGER NOT NULL DEFAULT 0, + tags_str VARCHAR(1024) NOT NULL DEFAULT '', + persona_id TEXT, + label VARCHAR(64) NOT NULL DEFAULT '', + notes TEXT NOT NULL DEFAULT '', + output_dir VARCHAR(512) NOT NULL DEFAULT '', + + status VARCHAR(15) NOT NULL DEFAULT 'queued', + retry_at DATETIME, + created_by_id INTEGER NOT NULL, + schedule_id TEXT, + + FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE CASCADE, + FOREIGN KEY (schedule_id) REFERENCES crawls_crawlschedule(id) ON DELETE SET NULL + ); + """) + + if has_data: + cursor.execute(""" + INSERT OR IGNORE INTO crawls_crawl_new ( + id, created_at, modified_at, num_uses_succeeded, num_uses_failed, + urls, config, max_depth, tags_str, persona_id, label, notes, output_dir, + status, retry_at, created_by_id, schedule_id + ) + SELECT + REPLACE(id, '-', ''), created_at, modified_at, num_uses_succeeded, num_uses_failed, + '', config, max_depth, tags_str, 
REPLACE(persona_id, '-', ''), '', '', '', + status, retry_at, created_by_id, REPLACE(schedule_id, '-', '') + FROM crawls_crawl; + """) + + cursor.execute("DROP TABLE crawls_crawl;") + cursor.execute("ALTER TABLE crawls_crawl_new RENAME TO crawls_crawl;") + + cursor.execute("CREATE INDEX IF NOT EXISTS crawls_crawl_status_idx ON crawls_crawl(status);") + cursor.execute("CREATE INDEX IF NOT EXISTS crawls_crawl_retry_at_idx ON crawls_crawl(retry_at);") + cursor.execute("CREATE INDEX IF NOT EXISTS crawls_crawl_created_at_idx ON crawls_crawl(created_at);") + cursor.execute("CREATE INDEX IF NOT EXISTS crawls_crawl_created_by_id_idx ON crawls_crawl(created_by_id);") + cursor.execute("CREATE INDEX IF NOT EXISTS crawls_crawl_schedule_id_idx ON crawls_crawl(schedule_id);") + + if has_data: + print("โœ“ crawls_crawl upgraded to v0.9.0") + + +class Migration(migrations.Migration): + dependencies = [ + ("crawls", "0001_initial"), + ] + + operations = [ + migrations.RunPython( + upgrade_crawl_table_from_v086, + reverse_code=migrations.RunPython.noop, + ), + ] diff --git a/archivebox/crawls/migrations/0003_remove_crawlschedule_num_uses_failed_and_more.py b/archivebox/crawls/migrations/0003_remove_crawlschedule_num_uses_failed_and_more.py new file mode 100644 index 0000000000..d8d38f37cb --- /dev/null +++ b/archivebox/crawls/migrations/0003_remove_crawlschedule_num_uses_failed_and_more.py @@ -0,0 +1,20 @@ +# Generated by Django 6.0 on 2026-01-01 23:36 + +from django.db import migrations + + +class Migration(migrations.Migration): + dependencies = [ + ("crawls", "0002_upgrade_from_0_8_6"), + ] + + operations = [ + migrations.RemoveField( + model_name="crawlschedule", + name="num_uses_failed", + ), + migrations.RemoveField( + model_name="crawlschedule", + name="num_uses_succeeded", + ), + ] diff --git a/archivebox/crawls/migrations/0004_remove_crawl_output_dir.py b/archivebox/crawls/migrations/0004_remove_crawl_output_dir.py new file mode 100644 index 0000000000..3d68253062 --- 
/dev/null +++ b/archivebox/crawls/migrations/0004_remove_crawl_output_dir.py @@ -0,0 +1,16 @@ +# Generated by Django 6.0 on 2026-01-05 01:09 + +from django.db import migrations + + +class Migration(migrations.Migration): + dependencies = [ + ("crawls", "0003_remove_crawlschedule_num_uses_failed_and_more"), + ] + + operations = [ + migrations.RemoveField( + model_name="crawl", + name="output_dir", + ), + ] diff --git a/archivebox/crawls/migrations/0005_add_crawl_limits.py b/archivebox/crawls/migrations/0005_add_crawl_limits.py new file mode 100644 index 0000000000..c931816227 --- /dev/null +++ b/archivebox/crawls/migrations/0005_add_crawl_limits.py @@ -0,0 +1,31 @@ +# Generated by Django 6.0 on 2026-03-23 00:00 + +import django.core.validators +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("crawls", "0004_remove_crawl_output_dir"), + ] + + operations = [ + migrations.AddField( + model_name="crawl", + name="max_size", + field=models.BigIntegerField( + default=0, + help_text="Maximum total archived output size in bytes for this crawl (0 = unlimited).", + validators=[django.core.validators.MinValueValidator(0)], + ), + ), + migrations.AddField( + model_name="crawl", + name="max_urls", + field=models.IntegerField( + default=0, + help_text="Maximum number of URLs to snapshot for this crawl (0 = unlimited).", + validators=[django.core.validators.MinValueValidator(0)], + ), + ), + ] diff --git a/archivebox/crawls/migrations/0006_crawl_crawl_admin_order_idx.py b/archivebox/crawls/migrations/0006_crawl_crawl_admin_order_idx.py new file mode 100644 index 0000000000..b96aca7d55 --- /dev/null +++ b/archivebox/crawls/migrations/0006_crawl_crawl_admin_order_idx.py @@ -0,0 +1,18 @@ +# Generated by Django 6.0.5 on 2026-05-24 10:26 + +from django.conf import settings +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("crawls", "0005_add_crawl_limits"), + 
migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.AddIndex( + model_name="crawl", + index=models.Index(fields=["-created_at", "-retry_at"], name="crawl_admin_order_idx"), + ), + ] diff --git a/archivebox/crawls/migrations/0007_remove_crawl_crawl_admin_order_idx_and_more.py b/archivebox/crawls/migrations/0007_remove_crawl_crawl_admin_order_idx_and_more.py new file mode 100644 index 0000000000..77f51da85b --- /dev/null +++ b/archivebox/crawls/migrations/0007_remove_crawl_crawl_admin_order_idx_and_more.py @@ -0,0 +1,22 @@ +# Generated by Django 6.0.5 on 2026-05-24 10:28 + +from django.conf import settings +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("crawls", "0006_crawl_crawl_admin_order_idx"), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.RemoveIndex( + model_name="crawl", + name="crawl_admin_order_idx", + ), + migrations.AddIndex( + model_name="crawl", + index=models.Index(fields=["-created_at", "-retry_at", "-id"], name="crawl_admin_order_idx"), + ), + ] diff --git a/archivebox/crawls/migrations/0008_split_crawl_snapshot_size_limits.py b/archivebox/crawls/migrations/0008_split_crawl_snapshot_size_limits.py new file mode 100644 index 0000000000..ccb8f4709c --- /dev/null +++ b/archivebox/crawls/migrations/0008_split_crawl_snapshot_size_limits.py @@ -0,0 +1,36 @@ +# Generated by Django 6.0 on 2026-05-24 + +import django.core.validators +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("crawls", "0007_remove_crawl_crawl_admin_order_idx_and_more"), + ] + + operations = [ + migrations.RenameField( + model_name="crawl", + old_name="max_size", + new_name="crawl_max_size", + ), + migrations.AddField( + model_name="crawl", + name="snapshot_max_size", + field=models.BigIntegerField( + default=0, + help_text="Maximum archived output size in bytes for each 
snapshot (0 = unlimited).", + validators=[django.core.validators.MinValueValidator(0)], + ), + ), + migrations.AlterField( + model_name="crawl", + name="crawl_max_size", + field=models.BigIntegerField( + default=0, + help_text="Maximum total archived output size in bytes for this crawl (0 = unlimited).", + validators=[django.core.validators.MinValueValidator(0)], + ), + ), + ] diff --git a/archivebox/crawls/migrations/0009_crawl_progress_status_idx.py b/archivebox/crawls/migrations/0009_crawl_progress_status_idx.py new file mode 100644 index 0000000000..a1059d4379 --- /dev/null +++ b/archivebox/crawls/migrations/0009_crawl_progress_status_idx.py @@ -0,0 +1,16 @@ +# Generated by ArchiveBox on 2026-05-27 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("crawls", "0008_split_crawl_snapshot_size_limits"), + ] + + operations = [ + migrations.AddIndex( + model_name="crawl", + index=models.Index(fields=["status", "-modified_at"], name="crawl_progress_status_idx"), + ), + ] diff --git a/archivebox/crawls/migrations/0010_crawl_delete_at.py b/archivebox/crawls/migrations/0010_crawl_delete_at.py new file mode 100644 index 0000000000..85a1f7f8e1 --- /dev/null +++ b/archivebox/crawls/migrations/0010_crawl_delete_at.py @@ -0,0 +1,17 @@ +# Generated by Django 6.0.5 on 2026-05-27 20:40 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("crawls", "0009_crawl_progress_status_idx"), + ] + + operations = [ + migrations.AddField( + model_name="crawl", + name="delete_at", + field=models.DateTimeField(blank=True, db_index=True, default=None, null=True), + ), + ] diff --git a/archivebox/crawls/migrations/0011_move_crawl_limits_to_config.py b/archivebox/crawls/migrations/0011_move_crawl_limits_to_config.py new file mode 100644 index 0000000000..ed6df28f1f --- /dev/null +++ b/archivebox/crawls/migrations/0011_move_crawl_limits_to_config.py @@ -0,0 +1,38 @@ +from django.db 
def move_limit_fields_to_config(apps, schema_editor):
    """Copy the per-crawl limit columns into ``Crawl.config`` before they are removed.

    Forward step of a ``RunPython`` operation. For every crawl, each truthy
    value in ``max_urls`` / ``crawl_max_size`` / ``snapshot_max_size`` is
    written to the matching ``CRAWL_MAX_URLS`` / ``CRAWL_MAX_SIZE`` /
    ``SNAPSHOT_MAX_SIZE`` config key (overwriting an existing key). Falsy
    values (``0`` / ``NULL``) are skipped, and rows whose config would not
    change are not written.

    Args:
        apps: Historical app registry supplied by the migration executor.
        schema_editor: Schema editor for the database being migrated; only its
            connection alias is used, to route the ORM queries.
    """
    Crawl = apps.get_model("crawls", "Crawl")
    # Route reads and writes to the database that is actually being migrated,
    # matching the other data migrations in this app.
    db_alias = schema_editor.connection.alias
    # Insertion order matters: it is the order the keys get appended to config.
    column_to_config_key = {
        "max_urls": "CRAWL_MAX_URLS",
        "crawl_max_size": "CRAWL_MAX_SIZE",
        "snapshot_max_size": "SNAPSHOT_MAX_SIZE",
    }
    rows = Crawl.objects.using(db_alias).values("id", "config", *column_to_config_key).iterator(chunk_size=1000)
    for row in rows:
        original_config = row["config"] or {}
        config = dict(original_config)
        for column, config_key in column_to_config_key.items():
            if row[column]:
                config[config_key] = row[column]
        if config != original_config:
            Crawl.objects.using(db_alias).filter(id=row["id"]).update(config=config)
def clear_stale_persona_ids(apps, _schema_editor):
    """Set ``persona_id`` to NULL on crawls that point at a Persona row that does not exist.

    Forward step of a ``RunPython`` operation; the schema editor argument is
    accepted for the RunPython signature but not used.
    """
    crawl_model = apps.get_model("crawls", "Crawl")
    persona_model = apps.get_model("personas", "Persona")

    # Kept as a lazy queryset so it is embedded as a subquery, not materialized.
    known_persona_ids = persona_model.objects.values_list("id", flat=True)
    crawls_with_persona = crawl_model.objects.filter(persona_id__isnull=False)
    orphaned_crawls = crawls_with_persona.exclude(persona_id__in=known_persona_ids)
    orphaned_crawls.update(persona_id=None)
# Permission labels accepted in a crawl config's PERMISSIONS key.
VALID_PERMISSIONS = {"public", "unlisted", "private"}
# Number of rows accumulated before a batch of UPDATEs is flushed.
BATCH_SIZE = 1000


def normalize_permissions(value, default):
    """Return ``value`` stripped and lower-cased if it is a valid permission, else ``default``.

    Falsy values (``None``, ``""``, ``0``) are treated as the empty string and
    therefore fall back to ``default``.
    """
    candidate = str(value or "").strip().lower()
    if candidate in VALID_PERMISSIONS:
        return candidate
    return default
def id_values(pk):
    """Return two textual spellings of a primary key: as written, and with hyphens removed.

    ``uuid.UUID`` instances yield their canonical hyphenated string and their
    32-character hex form. Any other value is stringified, and the second
    element is that string with every ``-`` stripped.
    """
    if not isinstance(pk, uuid.UUID):
        text = str(pk)
        return text, text.replace("-", "")
    return str(pk), pk.hex
def hydrate_crawl_permissions(apps, schema_editor):
    """Write an explicit ``PERMISSIONS`` key into every crawl config that lacks a valid one.

    Forward step of a ``RunPython`` operation. Crawls whose generated
    ``permissions`` column is NULL or not one of ``VALID_PERMISSIONS`` are
    selected. For each, the value is resolved from the layered config (base
    config, then the persona's config if it is a dict, then the crawl's own
    config, later layers winning) and stored back into the crawl's config via
    batched raw UPDATEs.

    Args:
        apps: Historical app registry supplied by the migration executor.
        schema_editor: Schema editor for the database being migrated.
    """
    Crawl = apps.get_model("crawls", "Crawl")
    base_config = raw_base_config(apps)
    default_permissions = resolve_permissions(base_config, "public")
    table_name = schema_editor.quote_name(Crawl._meta.db_table)
    missing_permissions = Q(permissions__isnull=True) | (Q(permissions__isnull=False) & ~Q(permissions__in=VALID_PERMISSIONS))
    batch = []

    # Use the cursor as a context manager so it is closed even when a batch
    # UPDATE or the column backfill raises; the bare cursor() call leaked it.
    with schema_editor.connection.cursor() as cursor:
        # Must run before the ORM query below, which filters on ``permissions``.
        _ensure_permissions_column(cursor)

        for crawl in Crawl.objects.filter(missing_permissions).select_related("persona").iterator(chunk_size=BATCH_SIZE):
            config = dict(crawl.config or {})
            resolved = dict(base_config)
            if crawl.persona_id:
                persona_config = crawl.persona.config or {}
                if isinstance(persona_config, dict):
                    resolved.update(persona_config)
            resolved.update(config)
            config["PERMISSIONS"] = resolve_permissions(resolved, default_permissions)
            batch.append((crawl.id, config))
            if len(batch) >= BATCH_SIZE:
                flush_batch(cursor, table_name, batch)
                batch.clear()

        # Flush whatever is left over from the final partial batch.
        flush_batch(cursor, table_name, batch)
+ """ + table_name = "crawls_crawl" + connection = schema_editor.connection + with connection.cursor() as cursor: + columns = {column.name for column in connection.introspection.get_table_description(cursor, table_name)} + stale_present = [col for col in LEGACY_COLUMNS if col in columns] + if not stale_present: + return + # Backfill config with any non-null values from the stale columns so + # nothing gets silently dropped. ``CRAWL_MAX_URLS`` / ``CRAWL_MAX_SIZE`` + # / ``SNAPSHOT_MAX_SIZE`` are the canonical config keys. + col_to_config_key = { + "max_urls": "CRAWL_MAX_URLS", + "crawl_max_size": "CRAWL_MAX_SIZE", + "snapshot_max_size": "SNAPSHOT_MAX_SIZE", + } + select_cols = ", ".join(schema_editor.quote_name(c) for c in stale_present) + cursor.execute(f"SELECT id, config, {select_cols} FROM {schema_editor.quote_name(table_name)}") + rows = cursor.fetchall() + import json + + Crawl = apps.get_model("crawls", "Crawl") + for row in rows: + crawl_id, config_raw = row[0], row[1] + stale_values = row[2:] + try: + config = dict(json.loads(config_raw)) if isinstance(config_raw, str) else dict(config_raw or {}) + except (TypeError, ValueError): + config = {} + mutated = False + for col_name, value in zip(stale_present, stale_values, strict=False): + if value in (None, 0, "0", ""): + continue + key = col_to_config_key.get(col_name) + if key and key not in config: + config[key] = value + mutated = True + if mutated: + Crawl.objects.filter(id=crawl_id).update(config=config) + for col_name in stale_present: + schema_editor.execute( + f"ALTER TABLE {schema_editor.quote_name(table_name)} DROP COLUMN {schema_editor.quote_name(col_name)}", + ) + + +class Migration(migrations.Migration): + dependencies = [ + ("crawls", "0016_hydrate_crawl_permissions"), + ] + + operations = [ + migrations.RunPython(drop_stale_crawl_limit_columns, migrations.RunPython.noop), + ] diff --git a/archivebox/crawls/migrations/0018_freeze_crawl_config_snapshots.py 
def freeze_existing_crawl_configs(apps, schema_editor):
    """Replace each crawl's stored config with the snapshot built by ``build_crawl_config_snapshot``.

    Forward step of a ``RunPython`` operation. Every crawl's current config is
    passed as ``overrides`` (together with a lightweight stand-in for its
    persona, if any) to ``build_crawl_config_snapshot``. Crawls whose result
    differs from what is stored are rewritten with ``bulk_update`` in batches
    of ``BATCH_SIZE``. Results are memoized per (persona id, canonical config
    JSON), so identical inputs are only resolved once.

    Args:
        apps: Historical app registry supplied by the migration executor.
        schema_editor: Schema editor for the database being migrated; its
            connection alias routes all ORM queries.
    """
    from archivebox.config.common import build_crawl_config_snapshot
    from archivebox.config.constants import CONSTANTS
    from archivebox.personas.models import derive_persona_config

    class PersonaConfigSnapshot:
        """Plain object carrying a persona's name and config plus ``get_derived_config()``.

        NOTE(review): presumably this mirrors the part of the real Persona
        interface that ``build_crawl_config_snapshot`` reads; confirm there.
        """

        def __init__(self, persona):
            self.name = persona.name
            self.config = dict(persona.config or {})

        def get_derived_config(self):
            return derive_persona_config(name=self.name, config=self.config, persona_dir=CONSTANTS.PERSONAS_DIR / self.name)

    Crawl = apps.get_model("crawls", "Crawl")
    Persona = apps.get_model("personas", "Persona")
    db_alias = schema_editor.connection.alias
    crawls = Crawl.objects.using(db_alias)

    # Fetch only the distinct persona ids. Iterating the (id, persona_id,
    # config) queryset directly would evaluate and cache every crawl's config
    # in memory, defeating the chunked iterator() pass below.
    persona_ids = {
        persona_id
        for persona_id in crawls.filter(persona_id__isnull=False).values_list("persona_id", flat=True).distinct()
        if persona_id
    }
    personas = {persona.pk: PersonaConfigSnapshot(persona) for persona in Persona.objects.using(db_alias).filter(pk__in=persona_ids)}

    rows = crawls.values_list("id", "persona_id", "config")
    frozen_cache = {}
    pending = []
    for crawl_id, persona_id, current_config in rows.iterator(chunk_size=BATCH_SIZE):
        current_config = dict(current_config or {})
        cache_key = (persona_id, _config_cache_key(current_config))
        if cache_key not in frozen_cache:
            frozen_cache[cache_key] = build_crawl_config_snapshot(
                persona=personas.get(persona_id),
                overrides=current_config,
            )
        frozen_config = frozen_cache[cache_key]
        if frozen_config != current_config:
            pending.append(Crawl(id=crawl_id, config=frozen_config))
        if len(pending) >= BATCH_SIZE:
            _flush_updates(Crawl, db_alias, pending)

    # Write the final partial batch.
    _flush_updates(Crawl, db_alias, pending)
class CrawlSchedule(ModelWithUUID, ModelWithNotes):
    """A recurring schedule that creates new ``Crawl`` rows copied from a template ``Crawl``."""

    id = CompactUUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
    created_at = models.DateTimeField(default=timezone.now, db_index=True)
    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
    modified_at = models.DateTimeField(auto_now=True)

    # Crawl whose urls/max_depth/tags/persona/notes are copied by enqueue().
    template: "Crawl" = models.ForeignKey("Crawl", on_delete=models.CASCADE, null=False, blank=False)  # type: ignore
    # Schedule expression; stripped and passed to validate_schedule() on save.
    schedule = models.CharField(max_length=64, blank=False, null=False)
    is_enabled = models.BooleanField(default=True)
    # Passed as ``overrides`` to build_crawl_config_snapshot() in enqueue().
    config = models.JSONField(default=dict, null=True, blank=True)
    label = models.CharField(max_length=64, blank=True, null=False, default="")
    notes = models.TextField(blank=True, null=False, default="")

    # Reverse accessor for the ``Crawl.schedule`` foreign key.
    crawl_set: models.Manager["Crawl"]

    class Meta(ModelWithUUID.Meta, ModelWithNotes.Meta):
        app_label = "crawls"
        verbose_name = "Scheduled Crawl"
        verbose_name_plural = "Scheduled Crawls"

    def __str__(self) -> str:
        """Return ``[<id>] <first 64 chars of the template's urls> @ <schedule>``."""
        urls_preview = self.template.urls[:64] if self.template and self.template.urls else ""
        return f"[{self.id}] {urls_preview} @ {self.schedule}"

    @property
    def api_url(self) -> str:
        """URL of this object under the ``api-1:get_any`` route."""
        return str(reverse_lazy("api-1:get_any", args=[self.id]))

    def save(self, *args, **kwargs):
        """Normalize and validate the schedule, save, then link the template back to this schedule.

        NOTE(review): ``validate_schedule`` is defined elsewhere; presumably it
        raises on an invalid expression, confirm in ``schedule_util``.
        """
        self.schedule = (self.schedule or "").strip()
        validate_schedule(self.schedule)
        # An empty label falls back to the template's label.
        self.label = self.label or (self.template.label if self.template else "")
        super().save(*args, **kwargs)
        if self.template:
            # Persist the back-reference on the template row, then mirror it on
            # the in-memory instance (refresh=False is passed to safe_update).
            self.template.safe_update(
                {
                    "schedule_id": self.pk,
                    "modified_at": timezone.now(),
                },
                refresh=False,
            )
            self.template.schedule_id = self.pk
            self.template.schedule = self

    @property
    def last_run_at(self):
        """``created_at`` of the newest crawl in ``crawl_set``, else the template's, else this schedule's."""
        latest_crawl = self.crawl_set.order_by("-created_at").first()
        if latest_crawl:
            return latest_crawl.created_at
        if self.template:
            return self.template.created_at
        return self.created_at

    @property
    def next_run_at(self):
        """Result of ``next_run_for_schedule(schedule, last_run_at)``."""
        return next_run_for_schedule(self.schedule, self.last_run_at)

    def is_due(self, now=None) -> bool:
        """Return whether the schedule is enabled and ``next_run_at`` is at or before ``now`` (default: current time)."""
        now = now or timezone.now()
        return self.is_enabled and self.next_run_at <= now

    def enqueue(self, queued_at=None) -> "Crawl":
        """Create and return a new QUEUED ``Crawl`` copied from the template.

        ``queued_at`` (default: current time) becomes the new crawl's
        ``retry_at``. The crawl's config is built by
        ``build_crawl_config_snapshot`` from the template's persona and this
        schedule's ``config`` as overrides.
        """
        from archivebox.config.common import build_crawl_config_snapshot

        queued_at = queued_at or timezone.now()
        template = self.template
        # The template's label wins; the schedule's label is only a fallback.
        label = template.label or self.label
        persona = template.persona if template.persona_id else None

        return Crawl.objects.create(
            urls=template.urls,
            config=build_crawl_config_snapshot(persona=persona, overrides=self.config or {}),
            max_depth=template.max_depth,
            tags_str=template.tags_str,
            persona_id=template.persona_id,
            label=label,
            notes=template.notes,
            schedule=self,
            status=Crawl.StatusChoices.QUEUED,
            retry_at=queued_at,
            created_by=template.created_by,
        )
def __str__(self):
    """Return ``[...<last 8 chars of id>] <first URL, max 120 chars>``.

    The first URL is the first line of ``urls`` that, once stripped, is
    non-empty and does not start with ``#``; it is empty if there is none.
    """
    first_url = ""
    for raw_line in (self.urls or "").splitlines():
        stripped = raw_line.strip()
        if stripped and not stripped.startswith("#"):
            first_url = stripped
            break
    # Only the tail of the id is shown, leaving more room for the URL.
    id_tail = str(self.id)[-8:]
    return f"[...{id_tail}] {first_url[:120]}"
def cancel(self) -> None:
    """Seal this crawl in the database and on the instance, making it due immediately.

    Child snapshots are woken first through
    ``schedule_child_snapshots_for_sealing()``. The crawl row is then moved to
    SEALED with ``retry_at`` set to now, unless it is already SEALED.
    """
    cancelled_at = timezone.now()
    sealed = self.StatusChoices.SEALED
    self.schedule_child_snapshots_for_sealing()
    # Cancellation can arrive from an admin/API request while the runner holds
    # the crawl lease. It is an idempotent user command rather than a stale
    # iterator write, so a plain conditional UPDATE (no CAS) is used. The write
    # stays a single small row update; the runner later claims the SEALED+due
    # row to run cleanup hooks.
    not_yet_sealed = type(self).objects.filter(pk=self.pk).exclude(status=sealed)
    not_yet_sealed.update(status=sealed, retry_at=cancelled_at, modified_at=cancelled_at)
    self.status = sealed
    self.retry_at = cancelled_at
@classmethod
def missing_delete_at_candidates(cls):
    """Return crawls whose config has a ``DELETE_AFTER`` key but whose ``delete_at`` is NULL."""
    without_delete_at = cls.objects.filter(delete_at__isnull=True)
    return without_delete_at.filter(config__has_key="DELETE_AFTER")
def update_child_snapshot_permissions(self, old_permissions: str | None, new_permissions: str | None) -> int:
    """Set ``config["PERMISSIONS"]`` on child snapshots that still carry the old value.

    Affects snapshots of this crawl whose ``permissions`` equals
    ``old_permissions`` or is NULL. Each gets the normalized
    ``new_permissions`` and a fresh ``modified_at``, written with
    ``bulk_update`` in chunks of 500.

    Returns:
        The number of snapshots updated.
    """
    from archivebox.core.models import Snapshot

    target_permissions = normalize_permissions(new_permissions)
    stamped_at = timezone.now()
    matches_old_or_unset = Q(permissions=old_permissions) | Q(permissions__isnull=True)
    stale_snapshots = self.snapshot_set.filter(matches_old_or_unset).only("id", "config").iterator(chunk_size=500)

    total = 0
    # Take 500 rows at a time from the streaming iterator so memory stays bounded.
    while chunk := list(islice(stale_snapshots, 500)):
        for snapshot in chunk:
            new_config = dict(snapshot.config or {})
            new_config["PERMISSIONS"] = target_permissions
            snapshot.config = new_config
            snapshot.modified_at = stamped_at
        Snapshot.objects.bulk_update(chunk, ["config", "modified_at"], batch_size=500)
        total += len(chunk)
    return total
str(reverse_lazy("api-1:get_crawl", args=[self.id])) + + @staticmethod + def parse_tag_names(tags: Iterable[str] | str, *, pattern: str = r",") -> list[str]: + raw_tags = re.split(pattern, tags) if isinstance(tags, str) else tags + tag_names: list[str] = [] + seen: set[str] = set() + for raw_tag in raw_tags: + tag_name = str(raw_tag or "").strip() + if not tag_name: + continue + lowered = tag_name.lower() + if lowered in seen: + continue + seen.add(lowered) + tag_names.append(tag_name) + return tag_names + + def current_tag_names(self) -> list[str]: + current_tags_str = type(self).objects.filter(pk=self.pk).values_list("tags_str", flat=True).first() if self.pk else self.tags_str + if current_tags_str is not None: + self.tags_str = current_tags_str + return self.parse_tag_names(self.tags_str or "") + + def apply_snapshot_tag_diff(self, *, added_tag_names: Iterable[str], removed_tag_names: Iterable[str]) -> None: + from archivebox.core.models import Snapshot, SnapshotTag, Tag + + added_names = self.parse_tag_names(added_tag_names) + removed_names = self.parse_tag_names(removed_tag_names) + if not added_names and not removed_names: + return + + if added_names: + tags_by_name = {tag.name: tag for tag in Tag.objects.filter(name__in=added_names)} + missing_tags = [Tag(name=name) for name in added_names if name not in tags_by_name] + if missing_tags: + # One small write for missing tag rows, followed by chunked + # M2M fanout below; avoid per-snapshot get_or_create loops. 
def to_json(self) -> dict:
    """Serialize this Crawl into a JSON-compatible record.

    The record is tagged with ``type="Crawl"`` and the current ``VERSION`` as
    ``schema_version``. ``config`` is passed through ``redact_sensitive_config``
    before being included, and ``created_at`` is emitted as an ISO-8601 string
    (or ``None`` when unset).
    """
    from archivebox.config import VERSION
    from archivebox.config.common import redact_sensitive_config

    record: dict = {"type": "Crawl", "schema_version": VERSION, "id": str(self.id)}
    for field_name in ("urls", "status", "max_depth"):
        record[field_name] = getattr(self, field_name)
    record["config"] = redact_sensitive_config(self.config)
    for field_name in ("tags_str", "label"):
        record[field_name] = getattr(self, field_name)
    created = self.created_at
    record["created_at"] = created.isoformat() if created else None
    return record
@property
def output_dir(self) -> Path:
    """Filesystem directory for this crawl's output.

    The path is ``USERS_DIR / <creator username> / CRAWLS_DIR_NAME /
    <YYYYMMDD of created_at> / <domain> / <id>``, where the domain comes from
    the first non-blank, non-``#`` line of ``urls`` (``"unknown"`` when there is
    none). If that directory does not exist but a sibling named with the
    hyphenated UUID form of the id does, the sibling is returned instead.
    """
    from archivebox.config import CONSTANTS
    from archivebox.core.models import Snapshot

    day = self.created_at.strftime("%Y%m%d")
    stripped_lines = (line.strip() for line in StringIO(self.urls or ""))
    first_url = next((text for text in stripped_lines if text and not text.startswith("#")), "")
    domain = "unknown" if not first_url else Snapshot.extract_domain_from_url(first_url)

    user_crawls_dir = CONSTANTS.USERS_DIR / self.created_by.username / CONSTANTS.CRAWLS_DIR_NAME
    primary_dir = user_crawls_dir / day / domain / str(self.id)
    hyphenated_dir = primary_dir.with_name(str(uuid.UUID(hex=self.id.hex)))
    # Only fall back to the hyphenated directory when it is the one that exists.
    if primary_dir.exists():
        return primary_dir
    return hyphenated_dir if hyphenated_dir.exists() else primary_dir
url.strip().startswith("#")] + + def has_internal_input_root(self) -> bool: + """Return True when Crawl.urls is preserved source text, not the work queue. + + `archivebox add` creates a synthetic root snapshot to run parser hooks + through the same Snapshot lifecycle as every other extractor. In that + mode the raw submitted import text must remain in Crawl.urls forever; + parsed URLs live as child Snapshot rows and should not be appended back. + """ + from archivebox.core.models import Snapshot + + return self.snapshot_set.filter(url=Snapshot.INTERNAL_INPUT_URL, depth=0).exists() + + @staticmethod + def normalize_domain(value: str) -> str: + candidate = (value or "").strip().lower() + if not candidate: + return "" + if "://" not in candidate and "/" not in candidate: + candidate = f"https://{candidate.lstrip('.')}" + try: + parsed = urlparse(candidate) + hostname = parsed.hostname or "" + if not hostname: + return "" + if parsed.port: + return f"{hostname}_{parsed.port}" + return hostname + except Exception: + return "" + + @staticmethod + def split_filter_patterns(value) -> list[str]: + patterns = [] + seen = set() + if isinstance(value, list): + raw_values = value + elif isinstance(value, str): + raw_values = value.splitlines() + else: + raw_values = [] + + for raw_value in raw_values: + pattern = str(raw_value or "").strip() + if not pattern or pattern in seen: + continue + seen.add(pattern) + patterns.append(pattern) + return patterns + + @classmethod + def _pattern_matches_url(cls, url: str, pattern: str) -> bool: + normalized_pattern = str(pattern or "").strip() + if not normalized_pattern: + return False + + if re.fullmatch(r"[\w.*:-]+", normalized_pattern): + wildcard_only_subdomains = normalized_pattern.startswith("*.") + normalized_domain = cls.normalize_domain( + normalized_pattern[2:] if wildcard_only_subdomains else normalized_pattern, + ) + normalized_url_domain = cls.normalize_domain(url) + if not normalized_domain or not normalized_url_domain: + 
def get_url_allowlist(self, *, use_effective_config: bool = False, snapshot=None) -> list[str]:
    """Return the URL allowlist patterns that apply to this crawl.

    Args:
        use_effective_config: When true, the crawl config is re-read via
            ``get_current_config(refresh=True)`` before use.
        snapshot: Optional snapshot whose non-empty ``config`` is layered on
            top of the crawl config.

    Returns:
        ``URL_ALLOWLIST`` split into patterns by ``split_filter_patterns``.
    """
    config = self.get_current_config(refresh=True) if use_effective_config else self.get_current_config()
    snapshot_config = None if snapshot is None else snapshot.config
    if snapshot_config:
        # Snapshot-level settings take precedence over crawl-level ones.
        config.update(snapshot_config)
    raw_allowlist = config.get("URL_ALLOWLIST", "")
    return self.split_filter_patterns(raw_allowlist)
def _iter_url_lines(self) -> list[tuple[str, str]]:
    """Parse ``self.urls`` into ``(raw_line, url)`` pairs.

    Blank lines are dropped. Comment lines (starting with ``#``) are kept with
    an empty URL so callers can preserve them when rewriting ``urls``. A line
    holding a JSON object contributes its stripped ``"url"`` value (empty when
    missing or falsy). Every other line is treated as a plain URL.

    Returns:
        One ``(right-stripped original line, extracted url)`` tuple per
        non-blank line, in file order.
    """
    entries: list[tuple[str, str]] = []
    for raw_line in (self.urls or "").splitlines():
        stripped = raw_line.strip()
        if not stripped:
            continue

        kept_line = raw_line.rstrip()
        if stripped.startswith("#"):
            entries.append((kept_line, ""))
            continue

        try:
            entry = json.loads(stripped)
        except json.JSONDecodeError:
            entries.append((kept_line, stripped))
            continue

        if isinstance(entry, dict):
            url = str(entry.get("url", "") or "").strip()
        else:
            # Lines such as `123`, `null`, `"text"` or `[...]` are valid JSON but
            # not JSONL records; calling .get() on them used to raise
            # AttributeError. Treat them like any other plain-text line.
            url = stripped
        entries.append((kept_line, url))
    return entries
self.save(update_fields=["urls", "modified_at"]) + return removed_urls + + def prune_url(self, url: str) -> int: + target = (url or "").strip() + removed = self.prune_urls(lambda candidate: candidate == target) + return len(removed) + + def exclude_domain(self, domain: str) -> dict[str, int | str | bool]: + normalized_domain = self.normalize_domain(domain) + if not normalized_domain: + return { + "domain": "", + "created": False, + "removed_urls": 0, + "deleted_snapshots": 0, + } + + domains = self.get_url_denylist(use_effective_config=False) + created = normalized_domain not in domains + if created: + domains.append(normalized_domain) + self.set_url_filters( + self.get_url_allowlist(use_effective_config=False), + domains, + ) + self.save(update_fields=["config", "modified_at"]) + + filter_result = self.apply_crawl_config_filters() + + return { + "domain": normalized_domain, + "created": created, + "removed_urls": filter_result["removed_urls"], + "deleted_snapshots": filter_result["deleted_snapshots"], + } + + def get_system_task(self) -> str | None: + urls = self.get_urls_list() + if len(urls) != 1: + return None + system_url = urls[0].strip().lower() + if system_url.startswith("archivebox://"): + return system_url + return None + + def resolve_persona(self): + from archivebox.personas.models import Persona + + if self.persona_id: + return Persona.objects.filter(id=self.persona_id).first() + + return None + + @staticmethod + def _config_value(config: Mapping[str, Any] | Any, key: str, default: Any = None) -> Any: + if isinstance(config, Mapping): + return config.get(key, default) + return config[key] if key in config else default + + @classmethod + def create_scheduler_row(cls, **kwargs) -> "Crawl": + from archivebox.base_models.models import normalize_config_json_values + from archivebox.config.common import build_crawl_config_snapshot + + now = timezone.now() + kwargs.setdefault("created_at", now) + kwargs.setdefault("modified_at", now) + config = 
def limit_stop_reason(
    self,
    *,
    config: Mapping[str, Any] | Any | None = None,
    output_dir: Path | None = None,
    num_snapshots: int | None = None,
) -> str:
    """Return why a crawl limit requires this crawl to stop, or ``""``.

    Two checks run in order:
    1. If ``<output_dir>/.abx-dl/limits.json`` exists, the stop reason reported
       by ``CrawlLimitState.from_config(config)`` is returned when non-empty.
    2. If ``CRAWL_MAX_URLS`` is positive and both the snapshot count and
       ``count_urls_for_limit()`` have reached it, ``"crawl_max_urls"``.

    Args:
        config: Runtime config; built from ``get_config(...).for_crawl_runtime``
            when omitted.
        output_dir: Crawl output directory; defaults to ``self.output_dir``.
        num_snapshots: Precomputed snapshot count; queried when omitted.

    Returns:
        A stop-reason string, or ``""`` when no limit has been hit.
    """
    from abx_dl.limits import CrawlLimitState

    crawl_dir = self.output_dir if output_dir is None else output_dir
    runtime_config = config
    if runtime_config is None:
        from archivebox.config.common import get_config

        base_config = get_config(crawl=self, include_machine=False)
        runtime_config = base_config.for_crawl_runtime(
            crawl=self,
            persona=self.resolve_persona(),
            crawl_output_dir=crawl_dir,
        )

    if (crawl_dir / ".abx-dl" / "limits.json").exists():
        reason = CrawlLimitState.from_config(runtime_config).get_stop_reason()
        if reason:
            return reason

    url_cap = int(self._config_value(runtime_config, "CRAWL_MAX_URLS", 0) or 0)
    snapshot_total = self.snapshot_set.count() if num_snapshots is None else num_snapshots
    cap_reached = url_cap > 0 and snapshot_total >= url_cap and self.count_urls_for_limit() >= url_cap
    return "crawl_max_urls" if cap_reached else ""
def add_url(self, entry: dict) -> bool:
    """
    Add a URL to the crawl queue if not already present.

    The URL is cleaned with ``fix_url_from_markdown`` and
    ``sanitize_extracted_url`` and then appended to ``self.urls`` as one JSONL
    line, which is saved immediately.

    Args:
        entry: dict with 'url', optional 'depth' (defaults to 1), 'title',
            'timestamp', 'tags', 'via_snapshot', 'plugin'

    Returns:
        True if the URL was added. False if it was skipped because it is
        empty, fails ``validate_url_length``, is rejected by the URL filters,
        exceeds ``max_depth``, already exists as a snapshot or queued line for
        this crawl, or the crawl has no remaining URL capacity.
    """
    from archivebox.misc.util import fix_url_from_markdown, sanitize_extracted_url

    url = sanitize_extracted_url(fix_url_from_markdown(str(entry.get("url", "") or "").strip()))
    if not url:
        return False
    try:
        validate_url_length(url)
    except ValueError:
        return False
    if not self.url_passes_filters(url):
        return False

    # Skip if depth exceeds max_depth
    depth = entry.get("depth", 1)
    if depth > self.max_depth:
        return False

    # Skip if already a Snapshot for this crawl
    if self.snapshot_set.filter(url=url).exists():
        return False

    # Skip if already queued in urls (plain or JSONL lines)
    if any(queued_url == url for _raw_line, queued_url in self._iter_url_lines()):
        return False

    if not self.has_remaining_url_capacity():
        return False

    # Append as JSONL. `urls` may be None/empty on a fresh row; the sibling
    # readers already tolerate that, so the writer must as well.
    jsonl_entry = json.dumps({**entry, "url": url})
    self.urls = ((self.urls or "").rstrip() + "\n" + jsonl_entry).lstrip("\n")
    self.save(update_fields=["urls", "modified_at"])
    return True
+ + Returns: + List of newly created Snapshot objects + """ + from archivebox.core.models import Snapshot, Tag + from archivebox.misc.util import fix_url_from_markdown, sanitize_extracted_url + + if self.status == self.StatusChoices.SEALED: + return [] + # Internal-input crawls preserve the submitted text verbatim in + # Crawl.urls. The root snapshot's parser hooks are the only supported + # path for turning that text into child snapshots, otherwise a later + # runner pass could reinterpret plain URL-looking lines as direct + # depth-0 work and bypass format-specific metadata parsing. + if self.has_internal_input_root(): + return [] + + created_snapshots = [] + crawl_tag_names = self.current_tag_names() + tags_by_name: dict[str, Tag] = {} + + for line in self.urls.splitlines(): + if not line.strip(): + continue + config = self.get_current_config(refresh=True) + only_new_urls = bool(self._config_value(config, "ONLY_NEW", True)) + + # Parse JSONL or plain URL + try: + entry = json.loads(line) + snapshot_id = entry.get("id") or entry.get("snapshot_id") + url = sanitize_extracted_url(fix_url_from_markdown(str(entry.get("url", "") or "").strip())) + depth = entry.get("depth", 0) + title = entry.get("title") + timestamp = entry.get("timestamp") + tag_names = [*crawl_tag_names, *self.parse_tag_names(entry.get("tags", ""))] + except json.JSONDecodeError: + snapshot_id = None + url = sanitize_extracted_url(fix_url_from_markdown(line.strip())) + depth = 0 + title = None + timestamp = None + tag_names = crawl_tag_names + + if not url: + continue + try: + validate_url(url) + except ValueError as err: + print(f"[yellow][!] Skipping invalid snapshot URL: {url[:120]}... ({err})[/yellow]") + continue + if Snapshot.is_archivebox_internal_url(url, config=config): + print(f"[yellow][!] 
Skipping internal ArchiveBox snapshot URL: {url}[/yellow]") + continue + if not self.url_passes_filters(url, use_effective_config=False): + continue + if only_new_urls and Snapshot.objects.filter(url=url).exists(): + continue + + # Skip if depth exceeds max_depth + if depth > self.max_depth: + continue + + # Stop creating new snapshots once the crawl-wide URL cap is reached. + if not self.has_remaining_snapshot_capacity(): + break + + defaults = { + "depth": depth, + "title": title, + "timestamp": timestamp or str(timezone.now().timestamp()), + "status": Snapshot.INITIAL_STATE, + "retry_at": timezone.now(), + # Note: created_by removed in 0.9.0 - Snapshot inherits from Crawl + } + try: + # Intentionally avoid get_or_create/update_or_create here: + # Django wraps those helpers in atomic(), and Snapshot.save() schedules + # filesystem/crawl maintenance callbacks. Keeping this as explicit + # read-then-save lets SQLite commit each write immediately unless the + # caller deliberately wrapped us in transaction.atomic(). + if snapshot_id: + snapshot = Snapshot.objects.filter(id=snapshot_id).first() + if snapshot: + created = False + for field, value in { + **defaults, + "url": url, + "crawl": self, + }.items(): + setattr(snapshot, field, value) + snapshot.save(update_fields=["depth", "title", "timestamp", "status", "retry_at", "url", "crawl", "modified_at"]) + else: + snapshot = Snapshot(id=snapshot_id, url=url, crawl=self, **defaults) + snapshot.save() + created = True + else: + snapshot = Snapshot.objects.filter(url=url, crawl=self).first() + if snapshot: + created = False + else: + try: + snapshot = Snapshot(url=url, crawl=self, **defaults) + snapshot.save() + created = True + except IntegrityError: + snapshot = Snapshot.objects.get(url=url, crawl=self) + created = False + except ValidationError as err: + print(f"[yellow][!] 
Skipping blocked snapshot URL: {url} ({err})[/yellow]") + continue + + if created: + created_snapshots.append(snapshot) + if tag_names: + missing_names = [tag_name for tag_name in tag_names if tag_name not in tags_by_name] + if missing_names: + tags_by_name.update({tag.name: tag for tag in Tag.objects.filter(name__in=missing_names)}) + missing_tags = [Tag(name=tag_name) for tag_name in missing_names if tag_name not in tags_by_name] + if missing_tags: + # Create tag rows in bulk, then attach through the M2M + # relation without clearing any non-crawl snapshot tags. + Tag.objects.bulk_create(missing_tags, ignore_conflicts=True) + tags_by_name.update({tag.name: tag for tag in Tag.objects.filter(name__in=missing_names)}) + snapshot.tags.add(*[tag.pk for tag_name in tag_names if (tag := tags_by_name.get(tag_name))]) + + # Symlink creation touches the filesystem and can be slow on remote disks. + # Defer it until after any active DB transaction commits so SQLite does + # not hold a write lock while mkdir/symlink work runs. 
+ transaction.on_commit(lambda snapshot=snapshot: snapshot.ensure_crawl_symlink()) + + return created_snapshots + + def create_discovered_snapshot( + self, + parent_snapshot, + *, + url: str, + depth: int, + title: str = "", + tags: str = "", + created_by_id: int | None = None, + ): + """Create one child snapshot if it passes crawl filters and limits.""" + snapshots = self.create_discovered_snapshots( + parent_snapshot, + [{"url": url, "title": title, "tags": tags}], + depth=depth, + created_by_id=created_by_id, + ) + return snapshots[0] if snapshots else None + + def create_discovered_snapshots( + self, + parent_snapshot, + records: Iterable[Mapping[str, Any]], + *, + depth: int, + created_by_id: int | None = None, + ) -> list["Snapshot"]: + """Create child snapshots from discovered URL records after filtering and deduping once.""" + from archivebox.core.models import Snapshot, SnapshotTag, Tag + from archivebox.misc.util import fix_url_from_markdown, sanitize_extracted_url + + if self.status == self.StatusChoices.SEALED: + return [] + + if depth > self.max_depth: + return [] + + crawl_tag_names = self.current_tag_names() + config = self.get_current_config(refresh=True) + if parent_snapshot is not None and parent_snapshot.config: + config.update(parent_snapshot.config) + allowlist = self.split_filter_patterns(config.get("URL_ALLOWLIST", "")) + denylist = self.split_filter_patterns(config.get("URL_DENYLIST", "")) + + def metadata_score(record: Mapping[str, Any]) -> int: + # Multiple parsers can discover the same URL from one import root. + # Keep the record with the richest user-facing metadata so generic + # text/HTML extraction does not erase RSS/Netscape/JSON fields. 
+ return sum(bool(record.get(field)) for field in ("title", "bookmarked_at", "timestamp", "tags")) + + deduped_records: dict[str, Mapping[str, Any]] = {} + for record in records: + url = sanitize_extracted_url(fix_url_from_markdown(str(record.get("url") or "").strip())) + if not url: + continue + try: + validate_url(url) + except ValueError as err: + print(f"[yellow][!] Skipping invalid discovered snapshot URL: {url[:120]}... ({err})[/yellow]") + continue + if Snapshot.is_archivebox_internal_url(url, config=config): + print(f"[yellow][!] Skipping internal ArchiveBox discovered snapshot URL: {url}[/yellow]") + continue + if self.url_passes_compiled_filters(url, allowlist=allowlist, denylist=denylist): + existing_record = deduped_records.get(url) + if existing_record is None or metadata_score(record) > metadata_score(existing_record): + deduped_records[url] = record + + if not deduped_records: + return [] + + existing_in_crawl = { + snapshot.url: snapshot for snapshot in self.snapshot_set.prefetch_related("tags").filter(url__in=deduped_records.keys()) + } + for url, snapshot in existing_in_crawl.items(): + record = deduped_records[url] + update_fields = [] + title = Snapshot._normalize_title_candidate(str(record.get("title") or "").strip()[:512], snapshot_url=url) + if title and (not snapshot.title or len(title) > len(snapshot.title or "")): + snapshot.title = title + update_fields.append("title") + bookmarked_at = None + try: + bookmarked_at = parse_date(record.get("bookmarked_at") or record.get("timestamp")) + except (TypeError, ValueError, OSError): + pass + if bookmarked_at and snapshot.bookmarked_at != bookmarked_at: + snapshot.bookmarked_at = bookmarked_at + update_fields.append("bookmarked_at") + if update_fields: + snapshot.save(update_fields=[*update_fields, "modified_at"]) + tag_names = { + *crawl_tag_names, + *self.parse_tag_names( + str(record.get("tags") or ""), + pattern=self._config_value(config, "TAG_SEPARATOR_PATTERN", r"[,]"), + ), + } + if 
tag_names: + tag_ids = [Tag.objects.get_or_create(name=tag_name)[0].pk for tag_name in tag_names] + snapshot.tags.add(*tag_ids) + + existing_scope = Snapshot.objects if bool(self._config_value(config, "ONLY_NEW", True)) else self.snapshot_set + existing_urls = set(existing_scope.filter(url__in=deduped_records.keys()).values_list("url", flat=True)) + urls = [url for url in deduped_records.keys() if url not in existing_urls] + remaining = self.remaining_snapshot_capacity() + if remaining is not None: + urls = urls[:remaining] + if not urls: + return [] + + now = timezone.now() + snapshots = [] + for index, url in enumerate(urls): + record = deduped_records[url] + bookmarked_at = now + try: + bookmarked_at = parse_date(record.get("bookmarked_at") or record.get("timestamp")) or now + except (TypeError, ValueError, OSError): + pass + snapshots.append( + Snapshot( + url=url, + timestamp=str((now + timedelta(microseconds=index)).timestamp()), + title=Snapshot._normalize_title_candidate( + str(record.get("title") or "").strip()[:512], + snapshot_url=url, + ) + or None, + crawl=self, + parent_snapshot=parent_snapshot, + depth=depth, + status=Snapshot.StatusChoices.QUEUED, + retry_at=now, + bookmarked_at=bookmarked_at, + created_at=now, + ), + ) + for snapshot in snapshots: + snapshot.set_delete_at_from_config(self._config_value(config, "DELETE_AFTER", "0")) + + created_snapshots = [] + for snapshot in snapshots: + try: + # Snapshot.save() owns URL validation and filesystem/index side + # effects. Do not use bulk_create() here; it bypasses save(). + snapshot.save() + except IntegrityError: + continue + except ValidationError as err: + print(f"[yellow][!] 
Skipping blocked discovered snapshot URL: {snapshot.url} ({err})[/yellow]") + continue + created_snapshots.append(snapshot) + if not created_snapshots: + return [] + + crawl_urls = {url for _raw_line, url in self._iter_url_lines() if url} + new_url_lines = [snapshot.url for snapshot in created_snapshots if snapshot.url not in crawl_urls] + # For internal-input crawls, Crawl.urls is the immutable source text. + # Child snapshots are the parsed/indexed representation, so appending + # discovered URLs here would both duplicate state and destroy the exact + # import artifact users submitted through CLI/API/UI. + if new_url_lines and not self.has_internal_input_root(): + self.urls = (self.urls.rstrip() + "\n" + "\n".join(new_url_lines)).lstrip("\n") + self.save(update_fields=["urls", "modified_at"]) + + tag_names_by_url: dict[str, set[str]] = {} + for snapshot in created_snapshots: + tag_names = { + *crawl_tag_names, + *self.parse_tag_names( + str(deduped_records[snapshot.url].get("tags") or ""), + pattern=self._config_value(config, "TAG_SEPARATOR_PATTERN", r"[,]"), + ), + } + if tag_names: + tag_names_by_url[snapshot.url] = tag_names + # Snapshot.save() handles model-level validation. The crawl symlink + # can still wait until after commit so SQLite does not hold a write + # lock while touching the filesystem. 
def install_declared_binaries(self, binary_names: set[str], machine=None) -> None:
    """
    Install crawl-declared Binary rows without violating the retry_at lock lifecycle.

    Correct calling pattern:
    1. Crawl hooks declare Binary records and queue them with retry_at <= now
    2. Exactly one actor claims each Binary by moving retry_at into the future
    3. Only that owner executes `.sm.tick()` and performs install side effects
    4. Everyone else waits for the claimed owner to finish instead of launching
       a second install against shared state such as the pip or npm trees

    This helper follows that contract by claiming each Binary before ticking
    it, and by waiting when another worker already owns the row. That keeps
    synchronous crawl execution compatible with the shared background runner and
    avoids duplicate installs of the same dependency.

    Args:
        binary_names: Names of the Binary rows that must reach INSTALLED.
            An empty set makes this a no-op.
        machine: Machine whose Binary rows are considered; defaults to
            ``Machine.current()``.

    Raises:
        RuntimeError: If any named Binary for this machine is still not
            INSTALLED once the loop below exits (deadline reached, or a pass
            made no progress with nobody else holding a lock).
    """
    import time
    from archivebox.machine.models import Binary, Machine

    if not binary_names:
        return

    machine = machine or Machine.current()
    lock_seconds = 600
    # Overall time budget: one lock period per requested binary (never less
    # than a single lock period).
    deadline = time.monotonic() + max(lock_seconds, len(binary_names) * lock_seconds)

    while time.monotonic() < deadline:
        # Re-query on every pass so rows installed by other workers drop out.
        unresolved_binaries = list(
            Binary.objects.filter(
                machine=machine,
                name__in=binary_names,
            )
            .exclude(
                status=Binary.StatusChoices.INSTALLED,
            )
            .order_by("name"),
        )
        if not unresolved_binaries:
            return

        claimed_any = False
        waiting_on_existing_owner = False
        # Captured once per pass; used below to decide whether a row's
        # retry_at lock is still in the future.
        now = timezone.now()

        for binary in unresolved_binaries:
            try:
                # presumably returns truthy when this caller claimed and ticked
                # the row — confirm against Binary.tick_claimed
                if binary.tick_claimed(lock_seconds=lock_seconds):
                    claimed_any = True
                    continue
            except Exception:
                # NOTE(review): any error from tick_claimed is swallowed and
                # counted as progress, so the outer loop re-queries right away
                # without sleeping. A row that keeps raising is retried until
                # the deadline and only surfaces via the final RuntimeError.
                claimed_any = True
                continue

            # Not claimed by us: reload to see what the row looks like now.
            binary.refresh_from_db()
            if binary.status == Binary.StatusChoices.INSTALLED:
                claimed_any = True
                continue
            if binary.retry_at and binary.retry_at > now:
                # retry_at is in the future, i.e. the row is currently locked.
                waiting_on_existing_owner = True

        if claimed_any:
            # Progress was made this pass; re-evaluate immediately.
            continue
        if waiting_on_existing_owner:
            # Nothing to do ourselves; poll again shortly.
            time.sleep(0.5)
            continue
        # No progress and nothing locked by anyone: stop looping and let the
        # final check below report what is still unresolved.
        break

    # Final verification after the loop ends for any reason.
    unresolved_binaries = list(
        Binary.objects.filter(
            machine=machine,
            name__in=binary_names,
        )
        .exclude(
            status=Binary.StatusChoices.INSTALLED,
        )
        .order_by("name"),
    )
    if unresolved_binaries:
        binary_details = ", ".join(
            f"{binary.name} (status={binary.status}, retry_at={binary.retry_at})" for binary in unresolved_binaries
        )
        raise RuntimeError(
            f"Crawl dependencies failed to install before continuing: {binary_details}",
        )
+ + Returns: + The root Snapshot for this crawl, or None for system crawls that don't create snapshots + """ + import time + from archivebox.plugins.hooks import run_hook, discover_hooks, process_hook_records + from archivebox.config.common import get_config + from archivebox.machine.models import Binary, Machine + + def get_runtime_config(): + return get_config(crawl=self).for_crawl_runtime( + crawl=self, + persona=persona, + runtime_overrides=persona_runtime_overrides, + ) + + system_task = self.get_system_task() + if system_task == "archivebox://update": + from archivebox.cli.archivebox_update import process_all_db_snapshots + + process_all_db_snapshots() + return None + + machine = Machine.current() + declared_binary_names: set[str] = set() + persona_runtime_overrides: dict[str, str] = {} + persona = self.resolve_persona() + if persona: + base_runtime_config = get_config(crawl=self, persona=persona) + chrome_binary = str(base_runtime_config.get("CHROME_BINARY") or "") + persona_runtime_overrides = persona.prepare_runtime_for_crawl( + crawl=self, + chrome_binary=chrome_binary, + ) + + executed_crawl_hooks: set[str] = set() + + def run_crawl_hook(hook: Path) -> set[str]: + executed_crawl_hooks.add(str(hook)) + primary_url = next( + (line.strip() for line in self.urls.splitlines() if line.strip()), + self.urls.strip(), + ) + + hook_start = time.time() + plugin_name = hook.parent.name + output_dir = self.output_dir / plugin_name + output_dir.mkdir(parents=True, exist_ok=True) + + process = run_hook( + hook, + output_dir=output_dir, + config=get_runtime_config(), + crawl_id=str(self.id), + source_url=self.urls, + url=primary_url, + snapshot_id=str(self.id), + ) + hook_elapsed = time.time() - hook_start + if hook_elapsed > 0.5: + print(f"[yellow]โฑ๏ธ Hook {hook.name} took {hook_elapsed:.2f}s[/yellow]") + + if process.status == process.StatusChoices.RUNNING: + if process.poll() is None: + return set() + + from archivebox.plugins.hooks import 
extract_records_from_process + + records = [] + # A hook can exit before its completed Process metadata is visible. + # Give successful hooks a brief chance to flush JSONL stdout into + # the Process row before downstream hooks. + for delay in (0.0, 0.05, 0.1, 0.25, 0.5): + if delay: + time.sleep(delay) + records = extract_records_from_process(process) + if records: + break + if records: + print(f"[cyan]๐Ÿ“ Processing {len(records)} records from {hook.name}[/cyan]") + for record in records[:3]: + print(f" Record: type={record.get('type')}, keys={list(record.keys())[:5]}") + if system_task: + records = [record for record in records if record.get("type") in ("BinaryRequest", "Binary")] + overrides = {"crawl": self} + stats = process_hook_records(records, overrides=overrides) + if stats: + print(f"[green]โœ“ Created: {stats}[/green]") + + hook_binary_names = { + str(record.get("name")).strip() + for record in records + if record.get("type") in ("BinaryRequest", "Binary") and record.get("name") + } + hook_binary_names.discard("") + if hook_binary_names: + declared_binary_names.update(hook_binary_names) + return hook_binary_names + + def resolve_provider_binaries(binary_names: set[str]) -> set[str]: + if not binary_names: + return set() + + resolved_binary_names = set(binary_names) + + while True: + unresolved_binaries = list( + Binary.objects.filter( + machine=machine, + name__in=resolved_binary_names, + ) + .exclude( + status=Binary.StatusChoices.INSTALLED, + ) + .order_by("name"), + ) + if not unresolved_binaries: + return resolved_binary_names + + needed_provider_names: set[str] = set() + for binary in unresolved_binaries: + allowed_binproviders = binary._allowed_binproviders() + if allowed_binproviders is None: + continue + needed_provider_names.update(allowed_binproviders) + + if not needed_provider_names: + return resolved_binary_names + + provider_hooks = [ + hook + for hook in discover_hooks("Crawl", filter_disabled=False, config=get_runtime_config()) + if 
hook.parent.name in needed_provider_names and str(hook) not in executed_crawl_hooks + ] + if not provider_hooks: + return resolved_binary_names + + for hook in provider_hooks: + resolved_binary_names.update(run_crawl_hook(hook)) + + hooks = discover_hooks("Crawl", config=get_runtime_config()) + + for hook in hooks: + hook_binary_names = run_crawl_hook(hook) + if hook_binary_names: + self.install_declared_binaries(resolve_provider_binaries(hook_binary_names), machine=machine) + + # Safety check: don't create snapshots if any crawl-declared dependency + # is still unresolved after all crawl hooks have run. + self.install_declared_binaries(declared_binary_names, machine=machine) + + # Create snapshots from all URLs in self.urls + if system_task: + leaked_snapshots = self.snapshot_set.all() + if leaked_snapshots.exists(): + leaked_count = leaked_snapshots.count() + leaked_snapshots.delete() + print(f"[yellow]โš ๏ธ Removed {leaked_count} leaked snapshot(s) created during system crawl {system_task}[/yellow]") + return None + + self.create_snapshots_from_urls() + + # Return first snapshot for this crawl (newly created or existing) + # This ensures the crawl doesn't seal if snapshots exist, even if they weren't just created + return self.snapshot_set.first() + + def is_finished(self) -> bool: + """Check if crawl is finished (all snapshots sealed or no snapshots exist).""" + from archivebox.core.models import Snapshot + + # Check if any snapshots exist for this crawl + snapshots = Snapshot.objects.filter(crawl=self) + + # If no snapshots exist, allow finishing (e.g., system crawls that only run setup hooks) + if not snapshots.exists(): + return True + + # If snapshots exist, check if all are sealed + if snapshots.filter( + status__in=[ + Snapshot.StatusChoices.QUEUED, + Snapshot.StatusChoices.STARTED, + Snapshot.StatusChoices.PAUSED, + ], + ).exists(): + return False + + return True + + def cleanup(self): + """Clean up background hooks and run on_CrawlEnd hooks.""" + from 
archivebox.plugins.hooks import run_hook, discover_hooks + + # Clean up .pid files from output directory + if self.output_dir.exists(): + for pid_file in self.output_dir.glob("**/*.pid"): + pid_file.unlink(missing_ok=True) + + persona = self.resolve_persona() + if persona: + persona.cleanup_runtime_for_crawl(self) + + # Run on_CrawlEnd hooks + from archivebox.config.common import get_config + + config = get_config(crawl=self) + + hooks = discover_hooks("CrawlEnd", config=config) + + for hook in hooks: + plugin_name = hook.parent.name + output_dir = self.output_dir / plugin_name + output_dir.mkdir(parents=True, exist_ok=True) + + process = run_hook( + hook, + output_dir=output_dir, + config=config, + crawl_id=str(self.id), + source_url=self.urls, # Pass full newline-separated URLs + ) + + # Log failures but don't block + if process.exit_code != 0: + print(f"[yellow]โš ๏ธ CrawlEnd hook failed: {hook.name}[/yellow]") + + +# ============================================================================= +# State Machines +# ============================================================================= + + +class CrawlMachine(BaseStateMachine): + crawl: Crawl + + """ + State machine for managing Crawl lifecycle. + + Hook Lifecycle: + โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” + โ”‚ QUEUED State โ”‚ + โ”‚ โ€ข Waiting for crawl to be ready (has URLs) โ”‚ + โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + โ†“ tick() when can_start() + โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” + โ”‚ STARTED State โ†’ enter_started() โ”‚ + โ”‚ 1. 
crawl.run() โ”‚ + โ”‚ โ€ข discover_hooks('Crawl') โ†’ finds all crawl hooks โ”‚ + โ”‚ โ€ข For each hook: โ”‚ + โ”‚ - run_hook(script, output_dir, ...) โ”‚ + โ”‚ - Parse JSONL from hook output โ”‚ + โ”‚ - process_hook_records() โ†’ creates Snapshots โ”‚ + โ”‚ โ€ข create_snapshots_from_urls() โ†’ from self.urls field โ”‚ + โ”‚ โ”‚ + โ”‚ 2. Snapshots process independently with their own โ”‚ + โ”‚ state machines (see SnapshotMachine) โ”‚ + โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + โ†“ tick() when is_finished() + โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” + โ”‚ SEALED State โ†’ enter_sealed() โ”‚ + โ”‚ โ€ข cleanup() โ†’ runs on_CrawlEnd hooks, kills background โ”‚ + โ”‚ โ€ข Set retry_at=None (no more processing) โ”‚ + โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + """ + + model_attr_name = "crawl" + + # States + queued = State(value=Crawl.StatusChoices.QUEUED, initial=True) + started = State(value=Crawl.StatusChoices.STARTED) + paused = State(value=Crawl.StatusChoices.PAUSED) + sealed = State(value=Crawl.StatusChoices.SEALED, final=True) + + # Tick Event (polled by workers) + tick = ( + queued.to(sealed, cond="has_finished_snapshots") + | queued.to.itself(unless="can_start") + | queued.to(started, cond="can_start") + | started.to(sealed, cond="is_finished") + | paused.to.itself() + ) + + # Manual event (triggered by last Snapshot sealing, or by direct + # index-only/bg creation when every requested URL is rejected before any + # Snapshot rows exist). 
+ seal = queued.to(sealed) | started.to(sealed) | paused.to(sealed) + pause_requested = queued.to(paused) | started.to(paused) + resume_requested = paused.to(queued) + + def can_start(self) -> bool: + if not self.crawl.urls: + print(f"[red]โš ๏ธ Crawl {self.crawl.id} cannot start: no URLs[/red]") + return False + urls_list = self.crawl.get_urls_list() + if not urls_list: + print(f"[red]โš ๏ธ Crawl {self.crawl.id} cannot start: no valid URLs in urls field[/red]") + return False + return True + + def is_finished(self) -> bool: + """Check if all Snapshots for this crawl are finished.""" + return self.crawl.is_finished() + + def has_finished_snapshots(self) -> bool: + """A queued crawl with only final Snapshot rows was interrupted before sealing.""" + from archivebox.core.models import Snapshot + + snapshots = self.crawl.snapshot_set.all() + return snapshots.exists() and not snapshots.exclude(status=Snapshot.StatusChoices.SEALED).exists() + + @queued.enter + def enter_queued(self): + self.crawl.update_and_requeue( + retry_at=timezone.now(), + status=Crawl.StatusChoices.QUEUED, + ) + + @started.enter + def enter_started(self): + import sys + + print(f"[cyan]๐Ÿ”„ CrawlMachine.enter_started() - creating snapshots for {self.crawl.id}[/cyan]", file=sys.stderr) + + try: + # Run the crawl - runs hooks, processes JSONL, creates snapshots + first_snapshot = self.crawl.run() + + if first_snapshot: + print( + f"[cyan]๐Ÿ”„ Created {self.crawl.snapshot_set.count()} snapshot(s), first: {first_snapshot.url}[/cyan]", + file=sys.stderr, + ) + # Update status to STARTED + # Set retry_at to near future so tick() can poll and check is_finished() + self.crawl.update_and_requeue( + retry_at=timezone.now() + timedelta(seconds=2), + status=Crawl.StatusChoices.STARTED, + ) + else: + # No snapshots (system crawl that only runs setup hooks) + print("[cyan]๐Ÿ”„ No snapshots created, sealing crawl immediately[/cyan]", file=sys.stderr) + # Seal immediately since there's no work to do + 
self.seal() + + except Exception as e: + print(f"[red]โš ๏ธ Crawl {self.crawl.id} failed to start: {e}[/red]") + import traceback + + traceback.print_exc() + raise + + @paused.enter + def enter_paused(self): + self.crawl.update_and_requeue( + retry_at=RETRY_AT_MAX, + status=Crawl.StatusChoices.PAUSED, + ) + self.crawl.schedule_child_snapshots_for_pause() + + @sealed.enter + def enter_sealed(self): + now = timezone.now() + self.crawl.status = Crawl.StatusChoices.SEALED + self.crawl.retry_at = None + # Guard: never seal a row that a concurrent writer flipped to PAUSED. + # Sealing is idempotent (SEALEDโ†’SEALED is a no-op rewrite), so + # status__in covers both the QUEUED/STARTEDโ†’SEALED transition and the + # rare re-entry case. + updated = self.crawl.safe_update( + { + "status": Crawl.StatusChoices.SEALED, + "retry_at": None, + "modified_at": now, + }, + refresh=False, + extra_filter={ + "status__in": [ + Crawl.StatusChoices.QUEUED, + Crawl.StatusChoices.STARTED, + Crawl.StatusChoices.SEALED, + ], + }, + ) + if not updated: + self.crawl.refresh_from_db() + return + self.crawl.modified_at = now + + self.crawl.schedule_child_snapshots_for_sealing() + # Clean up background hooks and run on_CrawlEnd hooks after the final + # state is visible so cleanup projectors cannot resurrect the crawl. 
+ self.crawl.cleanup() + + +# ============================================================================= +# Register State Machines +# ============================================================================= + +# Manually register state machines with python-statemachine registry +# (normally auto-discovered from statemachines.py, but we define them here for clarity) +registry.register(CrawlMachine) diff --git a/archivebox/crawls/schedule_util.py b/archivebox/crawls/schedule_util.py new file mode 100644 index 0000000000..a5307f990b --- /dev/null +++ b/archivebox/crawls/schedule_util.py @@ -0,0 +1,43 @@ +from __future__ import annotations + +from datetime import datetime + +from croniter import croniter + + +SCHEDULE_ALIASES: dict[str, str] = { + "minute": "* * * * *", + "minutely": "* * * * *", + "hour": "0 * * * *", + "hourly": "0 * * * *", + "day": "0 0 * * *", + "daily": "0 0 * * *", + "week": "0 0 * * 0", + "weekly": "0 0 * * 0", + "month": "0 0 1 * *", + "monthly": "0 0 1 * *", + "year": "0 0 1 1 *", + "yearly": "0 0 1 1 *", +} + + +def normalize_schedule(schedule: str) -> str: + normalized = (schedule or "").strip() + if not normalized: + raise ValueError("Schedule cannot be empty.") + + return SCHEDULE_ALIASES.get(normalized.lower(), normalized) + + +def validate_schedule(schedule: str) -> str: + normalized = normalize_schedule(schedule) + if not croniter.is_valid(normalized): + raise ValueError( + "Invalid schedule. 
Use an alias like daily/weekly/monthly or a cron expression such as '0 */6 * * *'.", + ) + return normalized + + +def next_run_for_schedule(schedule: str, after: datetime) -> datetime: + normalized = validate_schedule(schedule) + return croniter(normalized, after).get_next(datetime) diff --git a/archivebox/index.py b/archivebox/index.py deleted file mode 100644 index 3f4ada3f8c..0000000000 --- a/archivebox/index.py +++ /dev/null @@ -1,270 +0,0 @@ -import os -import json - -from datetime import datetime -from string import Template -try: - from distutils.dir_util import copy_tree -except ImportError: - print('[X] Missing "distutils" python package. To install it, run:') - print(' pip install distutils') - -from config import ( - OUTPUT_DIR, - TEMPLATES_DIR, - GIT_SHA, - FOOTER_INFO, -) -from util import ( - chmod_file, - urlencode, - derived_link_info, - check_link_structure, - check_links_structure, - wget_output_path, - latest_output, -) -from parse import parse_links -from links import validate_links -from logs import ( - log_indexing_process_started, - log_indexing_started, - log_indexing_finished, - log_parsing_started, - log_parsing_finished, -) - -TITLE_LOADING_MSG = 'Not yet archived...' 
- - -### Homepage index for all the links - -def write_links_index(out_dir, links, finished=False): - """create index.html file for a given list of links""" - - log_indexing_process_started() - check_links_structure(links) - - log_indexing_started(out_dir, 'index.json') - write_json_links_index(out_dir, links) - log_indexing_finished(out_dir, 'index.json') - - log_indexing_started(out_dir, 'index.html') - write_html_links_index(out_dir, links, finished=finished) - log_indexing_finished(out_dir, 'index.html') - -def load_links_index(out_dir=OUTPUT_DIR, import_path=None): - """parse and load existing index with any new links from import_path merged in""" - - existing_links = [] - if out_dir: - existing_links = parse_json_links_index(out_dir) - check_links_structure(existing_links) - - new_links = [] - if import_path: - # parse and validate the import file - log_parsing_started(import_path) - raw_links, parser_name = parse_links(import_path) - new_links = validate_links(raw_links) - check_links_structure(new_links) - - # merge existing links in out_dir and new links - all_links = validate_links(existing_links + new_links) - check_links_structure(all_links) - num_new_links = len(all_links) - len(existing_links) - - if import_path and parser_name: - log_parsing_finished(num_new_links, parser_name) - - return all_links, new_links - -def write_json_links_index(out_dir, links): - """write the json link index to a given path""" - - check_links_structure(links) - - path = os.path.join(out_dir, 'index.json') - - index_json = { - 'info': 'ArchiveBox Index', - 'help': 'https://github.com/pirate/ArchiveBox', - 'version': GIT_SHA, - 'num_links': len(links), - 'updated': str(datetime.now().timestamp()), - 'links': links, - } - - with open(path, 'w', encoding='utf-8') as f: - json.dump(index_json, f, indent=4, default=str) - - chmod_file(path) - -def parse_json_links_index(out_dir=OUTPUT_DIR): - """parse a archive index json file and return the list of links""" - index_path = 
os.path.join(out_dir, 'index.json') - if os.path.exists(index_path): - with open(index_path, 'r', encoding='utf-8') as f: - links = json.load(f)['links'] - check_links_structure(links) - return links - - return [] - -def write_html_links_index(out_dir, links, finished=False): - """write the html link index to a given path""" - - check_links_structure(links) - - path = os.path.join(out_dir, 'index.html') - - copy_tree(os.path.join(TEMPLATES_DIR, 'static'), os.path.join(out_dir, 'static')) - - with open(os.path.join(out_dir, 'robots.txt'), 'w+') as f: - f.write('User-agent: *\nDisallow: /') - - with open(os.path.join(TEMPLATES_DIR, 'index.html'), 'r', encoding='utf-8') as f: - index_html = f.read() - - with open(os.path.join(TEMPLATES_DIR, 'index_row.html'), 'r', encoding='utf-8') as f: - link_row_html = f.read() - - full_links_info = (derived_link_info(link) for link in links) - - link_rows = '\n'.join( - Template(link_row_html).substitute(**{ - **link, - 'title': ( - link['title'] - or (link['base_url'] if link['is_archived'] else TITLE_LOADING_MSG) - ), - 'favicon_url': ( - os.path.join('archive', link['timestamp'], 'favicon.ico') - # if link['is_archived'] else 'data:image/gif;base64,R0lGODlhAQABAAD/ACwAAAAAAQABAAACADs=' - ), - 'archive_url': urlencode( - wget_output_path(link) or 'index.html' - ), - }) - for link in full_links_info - ) - - template_vars = { - 'num_links': len(links), - 'date_updated': datetime.now().strftime('%Y-%m-%d'), - 'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'), - 'footer_info': FOOTER_INFO, - 'git_sha': GIT_SHA, - 'short_git_sha': GIT_SHA[:8], - 'rows': link_rows, - 'status': 'finished' if finished else 'running', - } - - with open(path, 'w', encoding='utf-8') as f: - f.write(Template(index_html).substitute(**template_vars)) - - chmod_file(path) - - -def patch_links_index(link, out_dir=OUTPUT_DIR): - """hack to in-place update one row's info in the generated index html""" - - title = link['title'] or 
latest_output(link)['title'] - successful = len(tuple(filter(None, latest_output(link).values()))) - - # Patch JSON index - changed = False - json_file_links = parse_json_links_index(out_dir) - for saved_link in json_file_links: - if saved_link['url'] == link['url']: - saved_link['title'] = title - saved_link['history'] = link['history'] - changed = True - break - if changed: - write_json_links_index(out_dir, json_file_links) - - # Patch HTML index - html_path = os.path.join(out_dir, 'index.html') - html = open(html_path, 'r').read().split('\n') - for idx, line in enumerate(html): - if title and (' 1234.1, 1234.2""" - - timestamp = timestamp.split('.')[0] - nonce = 0 - - # first try 152323423 before 152323423.0 - if timestamp not in used_timestamps: - return timestamp - - new_timestamp = '{}.{}'.format(timestamp, nonce) - while new_timestamp in used_timestamps: - nonce += 1 - new_timestamp = '{}.{}'.format(timestamp, nonce) - - return new_timestamp - - diff --git a/archivebox/logs.py b/archivebox/logs.py deleted file mode 100644 index 4dc2c05150..0000000000 --- a/archivebox/logs.py +++ /dev/null @@ -1,201 +0,0 @@ -import sys -from datetime import datetime -from config import ANSI, REPO_DIR, OUTPUT_DIR - - -# globals are bad, mmkay -_LAST_RUN_STATS = { - 'skipped': 0, - 'succeeded': 0, - 'failed': 0, - - 'parsing_start_ts': 0, - 'parsing_end_ts': 0, - - 'indexing_start_ts': 0, - 'indexing_end_ts': 0, - - 'archiving_start_ts': 0, - 'archiving_end_ts': 0, - - 'links': {}, -} - -def pretty_path(path): - """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc""" - return path.replace(REPO_DIR + '/', '') - - -### Parsing Stage - -def log_parsing_started(source_file): - start_ts = datetime.now() - _LAST_RUN_STATS['parse_start_ts'] = start_ts - print('{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format( - start_ts.strftime('%Y-%m-%d %H:%M:%S'), - source_file.rsplit('/', 1)[-1], - **ANSI, - )) - -def 
log_parsing_finished(num_new_links, parser_name): - print(' > Adding {} new links to index (parsed import as {})'.format( - num_new_links, - parser_name, - )) - - -### Indexing Stage - -def log_indexing_process_started(): - start_ts = datetime.now() - _LAST_RUN_STATS['index_start_ts'] = start_ts - print('{green}[*] [{}] Saving main index files...{reset}'.format( - start_ts.strftime('%Y-%m-%d %H:%M:%S'), - **ANSI, - )) - -def log_indexing_started(out_dir, out_file): - sys.stdout.write(' > {}/{}'.format(pretty_path(out_dir), out_file)) - -def log_indexing_finished(out_dir, out_file): - end_ts = datetime.now() - _LAST_RUN_STATS['index_end_ts'] = end_ts - print('\r โˆš {}/{}'.format(pretty_path(out_dir), out_file)) - - -### Archiving Stage - -def log_archiving_started(num_links, resume): - start_ts = datetime.now() - _LAST_RUN_STATS['start_ts'] = start_ts - if resume: - print('{green}[โ–ถ] [{}] Resuming archive updating for {} pages starting from {}...{reset}'.format( - start_ts.strftime('%Y-%m-%d %H:%M:%S'), - num_links, - resume, - **ANSI, - )) - else: - print('{green}[โ–ถ] [{}] Updating content for {} pages in archive...{reset}'.format( - start_ts.strftime('%Y-%m-%d %H:%M:%S'), - num_links, - **ANSI, - )) - -def log_archiving_paused(num_links, idx, timestamp): - end_ts = datetime.now() - _LAST_RUN_STATS['end_ts'] = end_ts - print() - print('\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format( - **ANSI, - now=end_ts.strftime('%Y-%m-%d %H:%M:%S'), - idx=idx+1, - timestamp=timestamp, - total=num_links, - )) - print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', ''))) - print(' Continue where you left off by running:') - print(' {} {}'.format( - pretty_path(sys.argv[0]), - timestamp, - )) - -def log_archiving_finished(num_links): - end_ts = datetime.now() - _LAST_RUN_STATS['end_ts'] = end_ts - seconds = end_ts.timestamp() - _LAST_RUN_STATS['start_ts'].timestamp() - if seconds > 60: - 
duration = '{0:.2f} min'.format(seconds / 60, 2) - else: - duration = '{0:.2f} sec'.format(seconds, 2) - - print('{}[โˆš] [{}] Update of {} pages complete ({}){}'.format( - ANSI['green'], - end_ts.strftime('%Y-%m-%d %H:%M:%S'), - num_links, - duration, - ANSI['reset'], - )) - print(' - {} links skipped'.format(_LAST_RUN_STATS['skipped'])) - print(' - {} links updated'.format(_LAST_RUN_STATS['succeeded'])) - print(' - {} links had errors'.format(_LAST_RUN_STATS['failed'])) - print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', ''))) - - -def log_link_archiving_started(link_dir, link, is_new): - # [*] [2019-03-22 13:46:45] "Log Structured Merge Trees - ben stopford" - # http://www.benstopford.com/2015/02/14/log-structured-merge-trees/ - # > output/archive/1478739709 - - print('\n[{symbol_color}{symbol}{reset}] [{symbol_color}{now}{reset}] "{title}"'.format( - symbol_color=ANSI['green' if is_new else 'black'], - symbol='+' if is_new else '*', - now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'), - title=link['title'] or link['url'], - **ANSI, - )) - print(' {blue}{url}{reset}'.format(url=link['url'], **ANSI)) - print(' {} {}'.format( - '>' if is_new else 'โˆš', - pretty_path(link_dir), - )) - -def log_link_archiving_finished(link_dir, link, is_new, stats): - total = sum(stats.values()) - - if stats['failed'] > 0 : - _LAST_RUN_STATS['failed'] += 1 - elif stats['skipped'] == total: - _LAST_RUN_STATS['skipped'] += 1 - else: - _LAST_RUN_STATS['succeeded'] += 1 - - -def log_archive_method_started(method): - print(' > {}'.format(method)) - -def log_archive_method_finished(result): - """quote the argument with whitespace in a command so the user can - copy-paste the outputted string directly to run the cmd - """ - required_keys = ('cmd', 'pwd', 'output', 'status', 'start_ts', 'end_ts') - assert ( - isinstance(result, dict) - and all(key in result for key in required_keys) - and ('output' in result) - ), 'Archive method did not return a 
valid result.' - - # Prettify CMD string and make it safe to copy-paste by quoting arguments - quoted_cmd = ' '.join( - '"{}"'.format(arg) if ' ' in arg else arg - for arg in result['cmd'] - ) - - if result['status'] == 'failed': - # Prettify error output hints string and limit to five lines - hints = getattr(result['output'], 'hints', None) or () - if hints: - hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n') - hints = ( - ' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset']) - for line in hints[:5] if line.strip() - ) - - # Collect and prefix output lines with indentation - output_lines = [ - '{}Failed:{} {}{}'.format( - ANSI['red'], - result['output'].__class__.__name__.replace('ArchiveError', ''), - result['output'], - ANSI['reset'] - ), - *hints, - '{}Run to see full output:{}'.format(ANSI['lightred'], ANSI['reset']), - ' cd {};'.format(result['pwd']), - ' {}'.format(quoted_cmd), - ] - print('\n'.join( - ' {}'.format(line) - for line in output_lines - if line - )) diff --git a/archivebox/machine/__init__.py b/archivebox/machine/__init__.py new file mode 100644 index 0000000000..36a1de6e7a --- /dev/null +++ b/archivebox/machine/__init__.py @@ -0,0 +1 @@ +__package__ = "archivebox.machine" diff --git a/archivebox/machine/admin.py b/archivebox/machine/admin.py new file mode 100644 index 0000000000..889f676956 --- /dev/null +++ b/archivebox/machine/admin.py @@ -0,0 +1,773 @@ +__package__ = "archivebox.machine" + +import json +import shlex +from pathlib import Path + +from django.contrib import admin, messages +from django.db.models import DurationField, ExpressionWrapper, F +from django.db.models.functions import Coalesce, Now +from django.shortcuts import redirect +from django.utils import timezone +from django.utils.html import format_html +from django.utils.safestring import mark_safe +from django_object_actions import action + +from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin +from 
archivebox.misc.logging_util import printable_filesize +from archivebox.machine.env_util import env_to_dotenv_text +from archivebox.machine.models import Machine, NetworkInterface, Binary, Process + + +def _render_copy_block(text: str, *, multiline: bool = False): + if multiline: + return format_html( + """ +
+ +
{}
+
+ """, + text, + text, + text, + ) + return format_html( + """ +
+ + + {} + +
+ """, + text, + text, + text, + ) + + +def _format_process_duration_seconds(started_at, ended_at) -> str: + if not started_at: + return "-" + + end_time = ended_at or timezone.now() + seconds = max((end_time - started_at).total_seconds(), 0.0) + if seconds < 1: + return f"{seconds:.2f}s" + if seconds < 10 and seconds != int(seconds): + return f"{seconds:.1f}s" + return f"{int(seconds)}s" + + +class MachineAdmin(ConfigEditorMixin, BaseModelAdmin): + list_display = ( + "id_display", + "created_at", + "hostname", + "ips", + "os_platform", + "hw_in_docker", + "hw_in_vm", + "hw_manufacturer", + "hw_product", + "os_arch", + "os_family", + "os_release", + "hw_uuid", + "health_display", + ) + sort_fields = ( + "id", + "created_at", + "hostname", + "ips", + "os_platform", + "hw_in_docker", + "hw_in_vm", + "hw_manufacturer", + "hw_product", + "os_arch", + "os_family", + "os_release", + "hw_uuid", + ) + + readonly_fields = ("guid", "created_at", "modified_at", "ips") + + fieldsets = ( + ( + "Identity", + { + "fields": ("hostname", "guid", "ips"), + "classes": ("card",), + }, + ), + ( + "Hardware", + { + "fields": ("hw_manufacturer", "hw_product", "hw_uuid", "hw_in_docker", "hw_in_vm"), + "classes": ("card",), + }, + ), + ( + "Operating System", + { + "fields": ("os_platform", "os_family", "os_arch", "os_kernel", "os_release"), + "classes": ("card",), + }, + ), + ( + "Statistics", + { + "fields": ("stats", "num_uses_succeeded", "num_uses_failed"), + "classes": ("card",), + }, + ), + ( + "Configuration", + { + "fields": ("config",), + "classes": ("card", "wide"), + "description": mark_safe( + '
' + "Heads up: saving here also rewrites " + "data/ArchiveBox.conf on disk to match โ€” the two stores are " + "kept in 1:1 sync, so any keys you remove here will be removed from the file " + "too. Edits to ArchiveBox.conf (or archivebox config --set) " + "propagate back into this field on the next request." + "
", + ), + }, + ), + ( + "Timestamps", + { + "fields": ("created_at", "modified_at"), + "classes": ("card",), + }, + ), + ) + + list_filter = ("hw_in_docker", "hw_in_vm", "os_arch", "os_family", "os_platform") + ordering = ["-created_at"] + list_per_page = 100 + actions = ["delete_selected"] + + @admin.display(description="Public IP", ordering="networkinterface__ip_public") + def ips(self, machine): + return format_html( + '{}', + machine.id, + ", ".join(machine.networkinterface_set.values_list("ip_public", flat=True)), + ) + + @admin.display(description="Health", ordering="health") + def health_display(self, obj): + h = obj.health + color = "green" if h >= 80 else "orange" if h >= 50 else "red" + return format_html('{}', color, h) + + @admin.display(description="ID", ordering="id") + def id_display(self, machine): + # Highlight the row representing the machine that ``Machine.current()`` + # resolves to in this process โ€” that's the one whose ``config`` is + # actually being applied at runtime. Important to surface here because + # a collection can accumulate stale Machine rows from prior hosts (VM + # snapshots, container rebuilds, hostname changes), and editing the + # wrong one silently produces "I set BASE_URL but it didn't stick." 
+ from archivebox.machine.models import Machine + + try: + current_id = str(Machine.current().pk) + except Exception: + current_id = None + + machine_id = str(machine.pk) + short_id = machine_id[:8] + if current_id and machine_id == current_id: + return format_html( + 'โ˜… CURRENT' + '{}', + short_id, + ) + return format_html( + '{}', + short_id, + ) + + +class NetworkInterfaceAdmin(BaseModelAdmin): + list_display = ( + "id", + "created_at", + "machine_info", + "ip_public", + "dns_server", + "isp", + "country", + "region", + "city", + "iface", + "ip_local", + "mac_address", + "health_display", + ) + sort_fields = ( + "id", + "created_at", + "machine_info", + "ip_public", + "dns_server", + "isp", + "country", + "region", + "city", + "iface", + "ip_local", + "mac_address", + ) + search_fields = ( + "id", + "machine__id", + "iface", + "ip_public", + "ip_local", + "mac_address", + "dns_server", + "hostname", + "isp", + "city", + "region", + "country", + ) + + readonly_fields = ("machine", "created_at", "modified_at", "mac_address", "ip_public", "ip_local", "dns_server") + + fieldsets = ( + ( + "Machine", + { + "fields": ("machine",), + "classes": ("card",), + }, + ), + ( + "Network", + { + "fields": ("iface", "ip_public", "ip_local", "mac_address", "dns_server"), + "classes": ("card",), + }, + ), + ( + "Location", + { + "fields": ("hostname", "isp", "city", "region", "country"), + "classes": ("card",), + }, + ), + ( + "Usage", + { + "fields": ("num_uses_succeeded", "num_uses_failed"), + "classes": ("card",), + }, + ), + ( + "Timestamps", + { + "fields": ("created_at", "modified_at"), + "classes": ("card",), + }, + ), + ) + + list_filter = ("isp", "country", "region") + ordering = ["-created_at"] + list_per_page = 100 + actions = ["delete_selected"] + + @admin.display(description="Machine", ordering="machine__id") + def machine_info(self, iface): + return format_html( + '[{}]   {}', + iface.machine.id, + str(iface.machine.id)[:8], + iface.machine.hostname, + ) + + 
@admin.display(description="Health", ordering="health")
def health_display(self, obj):
    """Render ``obj.health`` together with a colour picked by threshold.

    The colour is ``green`` at 80 and above, ``orange`` from 50 up to (but
    not including) 80, and ``red`` below 50.
    """
    score = obj.health
    if score >= 80:
        shade = "green"
    elif score >= 50:
        shade = "orange"
    else:
        shade = "red"
    # NOTE(review): the template shows one placeholder for two arguments;
    # markup may have been lost in this copy — confirm against upstream.
    return format_html('{}', shade, score)
"archiveresult_link", + "snapshot_link", + "crawl_link", + "cmd_str", + "status_badge", + "duration_display", + "exit_code", + "pid", + "output_summary", + "binary_info", + ) + sort_fields = ( + "id", + "created_at", + "machine_info", + "archiveresult_link", + "snapshot_link", + "crawl_link", + "cmd_str", + "status_badge", + "duration_display", + "exit_code", + "pid", + "output_summary", + "binary_info", + ) + search_fields = ("id", "machine__id", "binary__name", "cmd", "pwd", "stdout", "stderr") + + readonly_fields = ( + "created_at", + "modified_at", + "machine", + "binary_link", + "iface_link", + "archiveresult_link", + "snapshot_link", + "crawl_link", + "cmd_display", + "env_display", + "stdout_display", + "stderr_display", + "archiveresult_output_display", + "timeout", + "pid", + "exit_code", + "url", + "started_at", + "ended_at", + "duration_display", + ) + + fieldsets = ( + ( + "Process Info", + { + "fields": ("machine", "archiveresult_link", "snapshot_link", "crawl_link", "status", "retry_at"), + "classes": ("card",), + }, + ), + ( + "Command", + { + "fields": ("cmd_display", "pwd", "env_display", "timeout"), + "classes": ("card", "wide"), + }, + ), + ( + "Execution", + { + "fields": ("binary_link", "iface_link", "pid", "exit_code", "url"), + "classes": ("card",), + }, + ), + ( + "Timing", + { + "fields": ("started_at", "ended_at", "duration_display"), + "classes": ("card",), + }, + ), + ( + "Output", + { + "fields": ("stdout_display", "stderr_display", "archiveresult_output_display"), + "classes": ("card", "wide", "collapse"), + }, + ), + ( + "Timestamps", + { + "fields": ("created_at", "modified_at"), + "classes": ("card",), + }, + ), + ) + + list_filter = ("status", "exit_code", "machine_id") + ordering = ["-created_at"] + list_per_page = 100 + actions = ["kill_processes", "delete_selected"] + change_actions = ["kill_process"] + + def get_queryset(self, request): + return ( + super() + .get_queryset(request) + .select_related( + "machine", + "binary", + 
def _terminate_processes(self, request, processes):
    """Terminate every still-running process in ``processes`` and report the outcome.

    Args:
        request: Admin request that the status messages are attached to.
        processes: Iterable of ``Process`` rows (a queryset or a plain list).

    Returns:
        ``(terminated, skipped)``. ``skipped`` counts every process that was
        not terminated: those that were already stopped plus those for which
        ``terminate()`` returned a falsy result.
    """
    terminated = 0
    already_stopped = 0
    failed = 0

    for process in processes:
        if process.status == Process.StatusChoices.EXITED or not process.is_running:
            already_stopped += 1
        elif process.terminate():
            terminated += 1
        else:
            # Previously folded into the "already exited" count, which hid
            # kill attempts that did not succeed behind an INFO message.
            failed += 1

    if terminated:
        self.message_user(
            request,
            f"Killed {terminated} running process{'es' if terminated != 1 else ''}.",
            level=messages.SUCCESS,
        )
    if already_stopped:
        self.message_user(
            request,
            f"Skipped {already_stopped} process{'es' if already_stopped != 1 else ''} "
            f"that {'were' if already_stopped != 1 else 'was'} already exited.",
            level=messages.INFO,
        )
    if failed:
        self.message_user(
            request,
            f"Failed to kill {failed} process{'es' if failed != 1 else ''}.",
            level=messages.WARNING,
        )

    # Keep the original two-value contract: everything not killed is "skipped".
    return terminated, already_stopped + failed
@admin.display(description="Command", ordering="cmd")
def cmd_str(self, process):
    """Return a compact, single-line rendering of ``process.cmd`` for the list view.

    Only this column is shortened; ``process.cmd`` itself is not modified.
    For a list command the first three arguments are shown, followed by
    `` ...`` when more exist, and a leading argument that starts with ``/``
    or ``~`` is reduced to its final path component (so
    ``/Users/.../.venv/bin/python -m archivebox foo`` reads as
    ``python -m archivebox ...``). Any other non-empty value is stringified.
    The text is cut to 80 characters before rendering; an empty command
    yields ``"-"``.
    """
    raw = process.cmd
    if not raw:
        return "-"

    if not isinstance(raw, list):
        text = str(raw)
    else:
        head = [str(token) for token in raw[:3]]
        if head and head[0].startswith(("/", "~")):
            # Looks like a filesystem path: keep only the executable's name.
            head[0] = Path(head[0]).name
        suffix = " ..." if len(raw) > 3 else ""
        text = " ".join(head) + suffix

    return format_html('{}', text[:80])
@admin.display(description="Command")
def cmd_display(self, process):
    """Render the full command of ``process`` via ``_render_copy_block``.

    A list command is joined with ``shlex.join`` after stringifying each
    argument; any other non-empty value is stringified as-is. An empty
    command yields ``"-"``.
    """
    argv = process.cmd
    if not argv:
        return "-"
    rendered = shlex.join(str(token) for token in argv) if isinstance(argv, list) else str(argv)
    return _render_copy_block(rendered)
def get_vm_info():
    """Collect best-effort container/VM flags and hardware identity for this host.

    Every probe is optional: a missing file, a missing tool, or a failing
    command leaves the corresponding default in place.

    Returns:
        A dict with the keys ``hw_in_docker``, ``hw_in_vm``,
        ``hw_manufacturer``, ``hw_product`` and ``hw_uuid``.
    """
    hw_in_docker = os.getenv("IN_DOCKER", "") in ("1", "true", "True", "TRUE")
    hw_in_vm = False
    try:
        # check for traces of docker/containerd/podman in cgroup
        with open("/proc/self/cgroup") as procfile:
            for line in procfile:
                if any(runtime in line for runtime in ("docker", "containerd", "podman")):
                    hw_in_docker = True
    except (OSError, ValueError):
        # No procfs (e.g. macOS) or an undecodable file: keep the env-based answer.
        pass

    hw_manufacturer = "Docker" if hw_in_docker else "Unknown"
    hw_product = "Container" if hw_in_docker else "Unknown"
    hw_uuid = machineid.id()

    if platform.system().lower() == "darwin":
        # Get macOS machine info
        hw_manufacturer = "Apple"
        hw_product = "Mac"
        try:
            # Hardware:
            #   Hardware Overview:
            #     Model Name: Mac Studio
            #     Model Identifier: Mac13,1
            #     Model Number: MJMV3LL/A
            #     ...
            #     Serial Number (system): M230YYTD77
            #     Hardware UUID: 39A12B50-1972-5910-8BEE-235AD20C8EE3
            #     ...
            result = subprocess.run(["system_profiler", "SPHardwareDataType"], capture_output=True, text=True, check=True)
            for line in result.stdout.split("\n"):
                if "Model Name:" in line:
                    hw_product = line.split(":", 1)[-1].strip()
                elif "Model Identifier:" in line:
                    hw_product += " " + line.split(":", 1)[-1].strip()
                elif "Hardware UUID:" in line:
                    hw_uuid = line.split(":", 1)[-1].strip()
        except (OSError, ValueError, subprocess.SubprocessError):
            pass
    else:
        # get Linux machine info
        try:
            # Getting SMBIOS data from sysfs.
            # SMBIOS 2.8 present.
            # Handle 0x0100, DMI type 1, 27 bytes
            # System Information
            #     Manufacturer: DigitalOcean
            #     Product Name: Droplet
            #     Serial Number: 411922099
            #     UUID: fb65f41c-ec24-4539-beaf-f941903bdb2c
            #     ...
            #     Family: DigitalOcean_Droplet
            dmidecode = subprocess.run(["dmidecode", "-t", "system"], capture_output=True, text=True, check=True)
            for line in dmidecode.stdout.split("\n"):
                if "Manufacturer:" in line:
                    hw_manufacturer = line.split(":", 1)[-1].strip()
                elif "Product Name:" in line:
                    hw_product = line.split(":", 1)[-1].strip()
                elif "UUID:" in line:
                    hw_uuid = line.split(":", 1)[-1].strip()
        except (OSError, ValueError, subprocess.SubprocessError):
            pass

    # Check for VM fingerprint in manufacturer/product name. Both fields are
    # inspected: some hypervisors identify themselves only in the
    # manufacturer field, and "vbox" alone never matches "VirtualBox".
    fingerprint = f"{hw_manufacturer} {hw_product}".lower()
    if any(marker in fingerprint for marker in ("qemu", "vbox", "virtualbox", "lxc", "vm")):
        hw_in_vm = True

    # Check for QEMU explicitly in pmap output
    try:
        result = subprocess.run(["pmap", "1"], capture_output=True, text=True, check=True)
        if "qemu" in result.stdout.lower():
            hw_in_vm = True
    except (OSError, ValueError, subprocess.SubprocessError):
        pass

    return {
        "hw_in_docker": hw_in_docker,
        "hw_in_vm": hw_in_vm,
        "hw_manufacturer": hw_manufacturer,
        "hw_product": hw_product,
        "hw_uuid": hw_uuid,
    }
def _dig_server_address(dig_output):
    """Return the resolver address from dig's ``;; SERVER:`` line, or ``""`` if absent."""
    marker = ";; SERVER: "
    for line in dig_output.splitlines():
        if line.startswith(marker):
            # e.g. ";; SERVER: 192.168.1.1#53(192.168.1.1) (UDP)" -> "192.168.1.1"
            return line[len(marker):].split("#", 1)[0].strip()
    return ""


def _first_nameserver(resolv_conf_text):
    """Return the address of the first ``nameserver`` entry, or ``""`` if there is none."""
    for line in resolv_conf_text.splitlines():
        fields = line.split()
        if len(fields) >= 2 and fields[0] == "nameserver":
            return fields[1]
    return ""


def get_isp_info(ip=None, timeout=5):
    """Look up ISP / location details for ``ip`` and the system DNS resolver.

    Every lookup is best-effort: a failed request leaves the affected fields
    at ``"Unknown"``, and an undeterminable resolver falls back to
    ``"127.0.0.1"`` with a printed warning.

    Args:
        ip: Public IP to describe. When falsy it is fetched from api.ipify.org.
        timeout: Seconds to wait for each HTTP request and for ``dig``.

    Returns:
        A dict with the keys ``isp``, ``city``, ``region``, ``country`` and
        ``dns_server``.
    """
    # Get public IP
    if not ip:
        try:
            with urllib.request.urlopen("https://api.ipify.org", timeout=timeout) as response:
                ip = response.read().decode("utf8").strip()
        except Exception:
            # Deliberate best-effort: any network or decoding failure means "unknown".
            ip = None

    # Get ISP name, city, and country (skipped when no IP is known, instead
    # of querying a URL built from the literal text "None")
    data = {}
    if ip:
        try:
            with urllib.request.urlopen(f"https://ipapi.co/{ip}/json/", timeout=timeout) as response:
                parsed = json.loads(response.read().decode())
            if isinstance(parsed, dict):
                data = parsed
        except Exception:
            # Deliberate best-effort, as above.
            pass

    # ``or`` rather than a .get() default so JSON nulls also become "Unknown".
    isp = data.get("org") or "Unknown"
    city = data.get("city") or "Unknown"
    region = data.get("region") or "Unknown"
    country = data.get("country_name") or "Unknown"

    # Get system DNS resolver servers: ask dig first, then fall back to
    # resolv.conf whenever dig is missing, fails, or prints no SERVER line.
    dns_server = ""
    try:
        dig = subprocess.run(
            ["dig", "example.com", "A"],
            capture_output=True,
            text=True,
            check=True,
            timeout=timeout,
        )
        dns_server = _dig_server_address(dig.stdout)
    except (OSError, ValueError, subprocess.SubprocessError):
        pass

    if not dns_server:
        try:
            dns_server = _first_nameserver(Path("/etc/resolv.conf").read_text())
        except (OSError, ValueError):
            pass

    if not dns_server:
        dns_server = "127.0.0.1"
        print(f"[red]:warning: WARNING: Could not determine DNS server, using {dns_server}[/red]")

    return {
        "isp": isp,
        "city": city,
        "region": region,
        "country": country,
        "dns_server": dns_server,
    }
psutil.virtual_memory() + try: + swap_usage = psutil.swap_memory() + swap_used_pct = swap_usage.percent + swap_used_gb = round(swap_usage.used / 1024 / 1024 / 1024, 3) + swap_free_gb = round(swap_usage.free / 1024 / 1024 / 1024, 3) + except OSError: + # Some sandboxed environments deny access to swap stats + swap_used_pct = 0.0 + swap_used_gb = 0.0 + swap_free_gb = 0.0 + return { + "cpu_boot_time": datetime.fromtimestamp(psutil.boot_time()).isoformat(), + "cpu_count": psutil.cpu_count(logical=False), + "cpu_load": psutil.getloadavg(), + # "cpu_pct": psutil.cpu_percent(interval=1), + "mem_virt_used_pct": mem_usage.percent, + "mem_virt_used_gb": round(mem_usage.used / 1024 / 1024 / 1024, 3), + "mem_virt_free_gb": round(mem_usage.free / 1024 / 1024 / 1024, 3), + "mem_swap_used_pct": swap_used_pct, + "mem_swap_used_gb": swap_used_gb, + "mem_swap_free_gb": swap_free_gb, + "disk_tmp_used_pct": tmp_usage.percent, + "disk_tmp_used_gb": round(tmp_usage.used / 1024 / 1024 / 1024, 3), + "disk_tmp_free_gb": round(tmp_usage.free / 1024 / 1024 / 1024, 3), # in GB + "disk_app_used_pct": app_usage.percent, + "disk_app_used_gb": round(app_usage.used / 1024 / 1024 / 1024, 3), + "disk_app_free_gb": round(app_usage.free / 1024 / 1024 / 1024, 3), + "disk_data_used_pct": data_usage.percent, + "disk_data_used_gb": round(data_usage.used / 1024 / 1024 / 1024, 3), + "disk_data_free_gb": round(data_usage.free / 1024 / 1024 / 1024, 3), + } + except Exception: + return {} + + +def get_host_guid() -> str: + return machineid.hashed_id("archivebox") + + +# Example usage +if __name__ == "__main__": + host_info = { + "guid": get_host_guid(), + "os": get_os_info(), + "vm": get_vm_info(), + "net": get_host_network(), + "stats": get_host_stats(), + } + print(host_info) + +# { +# 'guid': '1cd2dd279f8a854...6943f2384437991a', +# 'os': { +# 'os_arch': 'arm64', +# 'os_family': 'darwin', +# 'os_platform': 'macOS-14.6.1-arm64-arm-64bit', +# 'os_kernel': 'Darwin Kernel Version 23.6.0: Mon Jul 29 21:14:30 PDT 
def stringify_env_value(value: Any) -> str:
    """Convert a config value into the text form used for an env variable.

    Strings pass through unchanged, ``None`` becomes the empty string,
    booleans become ``"True"`` / ``"False"``, and everything else is
    serialized as compact JSON (no spaces after separators).
    """
    if isinstance(value, str):
        return value
    if value is None:
        return ""
    if isinstance(value, bool):
        # Checked before the JSON fallback, which would emit "true"/"false".
        return str(value)
    return json.dumps(value, separators=(",", ":"))
def env_to_dotenv_text(env: dict[str, Any] | None) -> str:
    """Render ``env`` as newline-separated ``KEY=value`` lines.

    The mapping is first passed through ``redact_env``. Keys are emitted in
    sorted order with their values stringified and shell-quoted; entries
    whose value is ``None`` or whose key does not fully match
    ``SHELL_ENV_KEY_RE`` are left out.
    """
    visible = redact_env(env)
    lines = []
    for name in sorted(visible):
        raw = visible[name]
        if raw is None or not SHELL_ENV_KEY_RE.fullmatch(str(name)):
            continue
        lines.append(f"{name}={shlex.quote(stringify_env_value(raw))}")
    return "\n".join(lines)
hw_uuid VARCHAR(255) NOT NULL, + + os_arch VARCHAR(15) NOT NULL, + os_family VARCHAR(15) NOT NULL, + os_platform VARCHAR(63) NOT NULL, + os_release VARCHAR(63) NOT NULL, + os_kernel VARCHAR(255) NOT NULL, + + stats TEXT, + config TEXT + ); + CREATE INDEX IF NOT EXISTS machine_machine_guid_idx ON machine_machine(guid); + + -- Create machine_networkinterface table + CREATE TABLE IF NOT EXISTS machine_networkinterface ( + id TEXT PRIMARY KEY NOT NULL, + created_at DATETIME NOT NULL, + modified_at DATETIME NOT NULL, + num_uses_succeeded INTEGER NOT NULL DEFAULT 0, + num_uses_failed INTEGER NOT NULL DEFAULT 0, + + machine_id TEXT NOT NULL, + iface VARCHAR(15) NOT NULL, + ip_public VARCHAR(39) NOT NULL, + ip_local VARCHAR(39) NOT NULL, + mac_address VARCHAR(17) NOT NULL, + dns_server VARCHAR(39) NOT NULL, + hostname VARCHAR(256) NOT NULL, + isp VARCHAR(256) NOT NULL, + city VARCHAR(100) NOT NULL, + region VARCHAR(100) NOT NULL, + country VARCHAR(100) NOT NULL, + + FOREIGN KEY (machine_id) REFERENCES machine_machine(id) ON DELETE CASCADE + ); + CREATE INDEX IF NOT EXISTS machine_networkinterface_machine_id_idx ON machine_networkinterface(machine_id); + + -- Create machine_binary table + CREATE TABLE IF NOT EXISTS machine_binary ( + id TEXT PRIMARY KEY NOT NULL, + created_at DATETIME NOT NULL, + modified_at DATETIME NOT NULL, + num_uses_succeeded INTEGER NOT NULL DEFAULT 0, + num_uses_failed INTEGER NOT NULL DEFAULT 0, + + machine_id TEXT NOT NULL, + name VARCHAR(63) NOT NULL, + binproviders VARCHAR(127) NOT NULL DEFAULT 'env', + overrides TEXT NOT NULL DEFAULT '{}', + + binprovider VARCHAR(31) NOT NULL DEFAULT '', + abspath VARCHAR(255) NOT NULL DEFAULT '', + version VARCHAR(32) NOT NULL DEFAULT '', + sha256 VARCHAR(64) NOT NULL DEFAULT '', + + status VARCHAR(16) NOT NULL DEFAULT 'queued', + retry_at DATETIME, + output_dir VARCHAR(255) NOT NULL DEFAULT '', + + FOREIGN KEY (machine_id) REFERENCES machine_machine(id) ON DELETE CASCADE, + UNIQUE(machine_id, name, abspath, 
version, sha256) + ); + CREATE INDEX IF NOT EXISTS machine_binary_machine_id_idx ON machine_binary(machine_id); + CREATE INDEX IF NOT EXISTS machine_binary_name_idx ON machine_binary(name); + CREATE INDEX IF NOT EXISTS machine_binary_status_idx ON machine_binary(status); + CREATE INDEX IF NOT EXISTS machine_binary_retry_at_idx ON machine_binary(retry_at); + + """, + reverse_sql=""" + DROP TABLE IF EXISTS machine_binary; + DROP TABLE IF EXISTS machine_networkinterface; + DROP TABLE IF EXISTS machine_machine; + """, + ), + ], + state_operations=[ + migrations.CreateModel( + name="Machine", + fields=[ + ("id", models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)), + ("created_at", models.DateTimeField(db_index=True, default=django.utils.timezone.now)), + ("modified_at", models.DateTimeField(auto_now=True)), + ("guid", models.CharField(default=None, editable=False, max_length=64, unique=True)), + ("hostname", models.CharField(default=None, max_length=63)), + ("hw_in_docker", models.BooleanField(default=False)), + ("hw_in_vm", models.BooleanField(default=False)), + ("hw_manufacturer", models.CharField(default=None, max_length=63)), + ("hw_product", models.CharField(default=None, max_length=63)), + ("hw_uuid", models.CharField(default=None, max_length=255)), + ("os_arch", models.CharField(default=None, max_length=15)), + ("os_family", models.CharField(default=None, max_length=15)), + ("os_platform", models.CharField(default=None, max_length=63)), + ("os_release", models.CharField(default=None, max_length=63)), + ("os_kernel", models.CharField(default=None, max_length=255)), + ("stats", models.JSONField(blank=True, default=dict, null=True)), + ( + "config", + models.JSONField( + blank=True, + default=dict, + help_text="Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)", + null=True, + ), + ), + ("num_uses_succeeded", models.PositiveIntegerField(default=0)), + ("num_uses_failed", 
models.PositiveIntegerField(default=0)), + ], + options={ + "app_label": "machine", + }, + ), + migrations.CreateModel( + name="NetworkInterface", + fields=[ + ("id", models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)), + ("created_at", models.DateTimeField(db_index=True, default=django.utils.timezone.now)), + ("modified_at", models.DateTimeField(auto_now=True)), + ("mac_address", models.CharField(default=None, editable=False, max_length=17)), + ("ip_public", models.GenericIPAddressField(default=None, editable=False)), + ("ip_local", models.GenericIPAddressField(default=None, editable=False)), + ("dns_server", models.GenericIPAddressField(default=None, editable=False)), + ("hostname", models.CharField(default=None, max_length=63)), + ("iface", models.CharField(default=None, max_length=15)), + ("isp", models.CharField(default=None, max_length=63)), + ("city", models.CharField(default=None, max_length=63)), + ("region", models.CharField(default=None, max_length=63)), + ("country", models.CharField(default=None, max_length=63)), + ("machine", models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, to="machine.machine")), + ("num_uses_succeeded", models.PositiveIntegerField(default=0)), + ("num_uses_failed", models.PositiveIntegerField(default=0)), + ], + options={ + "unique_together": {("machine", "ip_public", "ip_local", "mac_address", "dns_server")}, + "app_label": "machine", + }, + ), + migrations.CreateModel( + name="Binary", + fields=[ + ("id", models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)), + ("created_at", models.DateTimeField(db_index=True, default=django.utils.timezone.now)), + ("modified_at", models.DateTimeField(auto_now=True)), + ("name", models.CharField(blank=True, db_index=True, default="", max_length=63)), + ( + "binproviders", + models.CharField( + blank=True, + default="env", + help_text="Comma-separated list of allowed providers: 
apt,brew,pip,npm,env", + max_length=127, + ), + ), + ( + "overrides", + models.JSONField( + blank=True, + default=dict, + help_text="Provider-specific overrides: {'apt': {'install_args': ['pkg']}, ...}", + ), + ), + ( + "binprovider", + models.CharField( + blank=True, + default="", + help_text="Provider that successfully installed this binary", + max_length=31, + ), + ), + ("abspath", models.CharField(blank=True, default="", max_length=255)), + ("version", models.CharField(blank=True, default="", max_length=32)), + ("sha256", models.CharField(blank=True, default="", max_length=64)), + ( + "status", + models.CharField( + choices=[("queued", "Queued"), ("started", "Started"), ("succeeded", "Succeeded"), ("failed", "Failed")], + db_index=True, + default="queued", + max_length=16, + ), + ), + ( + "retry_at", + models.DateTimeField( + blank=True, + db_index=True, + default=django.utils.timezone.now, + help_text="When to retry this binary installation", + null=True, + ), + ), + ( + "output_dir", + models.CharField( + blank=True, + default="", + help_text="Directory where installation hook logs are stored", + max_length=255, + ), + ), + ("machine", models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to="machine.machine")), + ("num_uses_succeeded", models.PositiveIntegerField(default=0)), + ("num_uses_failed", models.PositiveIntegerField(default=0)), + ], + options={ + "verbose_name": "Binary", + "verbose_name_plural": "Binaries", + "unique_together": {("machine", "name", "abspath", "version", "sha256")}, + "app_label": "machine", + }, + ), + ], + ), + ] diff --git a/archivebox/machine/migrations/0005_converge_binary_model.py b/archivebox/machine/migrations/0005_converge_binary_model.py new file mode 100644 index 0000000000..c1e89175c1 --- /dev/null +++ b/archivebox/machine/migrations/0005_converge_binary_model.py @@ -0,0 +1,92 @@ +# Generated by hand on 2026-01-01 +# Converges machine app for 0.8.6rc0 โ†’ 0.9.x migration path +# Drops old Binary table and 
def converge_binary_table(apps, schema_editor):
    """
    Drop machine_installedbinary if it exists (0.8.6rc0 path).
    Create machine_binary if it doesn't exist (needed by Process model).
    If machine_binary already exists, add whichever of the newer columns are
    missing and backfill blank values.

    Args:
        apps: historical app registry supplied by RunPython (unused here).
        schema_editor: schema editor for the database being migrated; all SQL
            below runs on its connection.

    The SQL uses sqlite_master and PRAGMA table_info, so it is SQLite-specific.
    """
    # Use the connection this migration is actually running against rather than
    # the process-wide default connection, matching the other hand-written
    # migrations in this app.
    cursor = schema_editor.connection.cursor()

    # Check what tables exist
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name IN ('machine_installedbinary', 'machine_binary')")
    existing_tables = {row[0] for row in cursor.fetchall()}

    # Drop old Binary table if it exists (0.8.6rc0 path)
    if "machine_installedbinary" in existing_tables:
        print(" - Removing old machine_installedbinary table...")
        cursor.execute("DROP TABLE IF EXISTS machine_installedbinary")

    # Create Binary table if it doesn't exist.
    # This handles the case where 0.8.6rc0's 0001_initial didn't create it.
    if "machine_binary" not in existing_tables:
        print(" - Creating machine_binary table...")
        # NOTE(review): the status default here is 'succeeded' while the
        # ALTER TABLE branch below uses 'installed' -- confirm which is intended.
        cursor.execute("""
            CREATE TABLE machine_binary (
                id TEXT PRIMARY KEY NOT NULL,
                created_at DATETIME NOT NULL,
                modified_at DATETIME NOT NULL,
                num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
                num_uses_failed INTEGER NOT NULL DEFAULT 0,
                machine_id TEXT NOT NULL REFERENCES machine_machine(id) ON DELETE CASCADE,
                name VARCHAR(63) NOT NULL,
                binproviders VARCHAR(255) NOT NULL DEFAULT 'env',
                overrides TEXT NOT NULL DEFAULT '{}',
                binprovider VARCHAR(63) NOT NULL DEFAULT 'env',
                abspath VARCHAR(255) NOT NULL,
                version VARCHAR(128) NOT NULL,
                sha256 VARCHAR(64) NOT NULL DEFAULT '',
                status VARCHAR(16) NOT NULL DEFAULT 'succeeded',
                retry_at DATETIME NULL,
                output_dir VARCHAR(255) NOT NULL DEFAULT ''
            )
        """)

        # Create indexes
        cursor.execute("CREATE INDEX machine_binary_machine_id_idx ON machine_binary(machine_id)")
        cursor.execute("CREATE INDEX machine_binary_name_idx ON machine_binary(name)")
        cursor.execute("CREATE INDEX machine_binary_abspath_idx ON machine_binary(abspath)")

        print(" ✓ machine_binary table ready")
    else:
        print(" - Converging existing machine_binary table...")
        cursor.execute("PRAGMA table_info(machine_binary)")
        binary_cols = {row[1] for row in cursor.fetchall()}

        # Old 0.8.x data dirs already have machine_binary, but with the
        # pre-abxpkg shape. Converge it here before later migrations and
        # runtime code expect Binary.binproviders / Binary.status to exist.
        if "binproviders" not in binary_cols:
            cursor.execute("ALTER TABLE machine_binary ADD COLUMN binproviders VARCHAR(255) NOT NULL DEFAULT 'env'")
        if "overrides" not in binary_cols:
            cursor.execute("ALTER TABLE machine_binary ADD COLUMN overrides TEXT NOT NULL DEFAULT '{}'")
        if "status" not in binary_cols:
            cursor.execute("ALTER TABLE machine_binary ADD COLUMN status VARCHAR(16) NOT NULL DEFAULT 'installed'")
        if "retry_at" not in binary_cols:
            cursor.execute("ALTER TABLE machine_binary ADD COLUMN retry_at DATETIME NULL")
        if "output_dir" not in binary_cols:
            cursor.execute("ALTER TABLE machine_binary ADD COLUMN output_dir VARCHAR(255) NOT NULL DEFAULT ''")

        # Backfill blank values. A freshly added binproviders column is already
        # filled with its 'env' default, so the binprovider fallback only
        # applies to rows whose binproviders is an empty string.
        cursor.execute(
            "UPDATE machine_binary SET binproviders = COALESCE(NULLIF(binproviders, ''), COALESCE(NULLIF(binprovider, ''), 'env'))",
        )
        cursor.execute("UPDATE machine_binary SET overrides = COALESCE(NULLIF(overrides, ''), '{}')")
        cursor.execute("UPDATE machine_binary SET status = COALESCE(NULLIF(status, ''), 'installed')")
        print(" ✓ machine_binary table ready")
class Migration(migrations.Migration):
    """Create the ``Process`` model for the ``machine`` app.

    Runs after ``0005_converge_binary_model``. The model's foreign keys point
    at ``machine.binary``, ``machine.networkinterface`` and ``machine.machine``.
    """

    dependencies = [
        ("machine", "0005_converge_binary_model"),
    ]

    operations = [
        migrations.CreateModel(
            name="Process",
            fields=[
                ("id", models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
                ("created_at", models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
                ("modified_at", models.DateTimeField(auto_now=True)),
                ("pwd", models.CharField(blank=True, default="", help_text="Working directory for process execution", max_length=512)),
                ("cmd", models.JSONField(blank=True, default=list, help_text="Command as array of arguments")),
                ("env", models.JSONField(blank=True, default=dict, help_text="Environment variables for process")),
                ("timeout", models.IntegerField(default=120, help_text="Timeout in seconds")),
                ("pid", models.IntegerField(blank=True, default=None, help_text="OS process ID", null=True)),
                ("exit_code", models.IntegerField(blank=True, default=None, help_text="Process exit code (0 = success)", null=True)),
                ("stdout", models.TextField(blank=True, default="", help_text="Standard output from process")),
                ("stderr", models.TextField(blank=True, default="", help_text="Standard error from process")),
                ("started_at", models.DateTimeField(blank=True, default=None, help_text="When process was launched", null=True)),
                ("ended_at", models.DateTimeField(blank=True, default=None, help_text="When process completed/terminated", null=True)),
                (
                    "url",
                    models.URLField(
                        blank=True,
                        default=None,
                        help_text="Connection URL (CDP endpoint, sonic server, etc.)",
                        max_length=2048,
                        null=True,
                    ),
                ),
                (
                    "status",
                    models.CharField(
                        choices=[("queued", "Queued"), ("running", "Running"), ("exited", "Exited")],
                        db_index=True,
                        default="queued",
                        max_length=16,
                    ),
                ),
                (
                    "retry_at",
                    models.DateTimeField(
                        blank=True,
                        db_index=True,
                        default=django.utils.timezone.now,
                        help_text="When to retry this process",
                        null=True,
                    ),
                ),
                # binary and iface are nullable and use SET_NULL, so deleting the
                # referenced row keeps the Process row.
                (
                    "binary",
                    models.ForeignKey(
                        blank=True,
                        help_text="Binary used by this process",
                        null=True,
                        on_delete=django.db.models.deletion.SET_NULL,
                        related_name="process_set",
                        to="machine.binary",
                    ),
                ),
                (
                    "iface",
                    models.ForeignKey(
                        blank=True,
                        help_text="Network interface used by this process",
                        null=True,
                        on_delete=django.db.models.deletion.SET_NULL,
                        related_name="process_set",
                        to="machine.networkinterface",
                    ),
                ),
                # machine is required and cascades: deleting a Machine deletes
                # its Process rows.
                (
                    "machine",
                    models.ForeignKey(
                        help_text="Machine where this process executed",
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="process_set",
                        to="machine.machine",
                    ),
                ),
            ],
            options={
                "verbose_name": "Process",
                "verbose_name_plural": "Processes",
                "indexes": [
                    models.Index(fields=["machine", "status", "retry_at"], name="machine_pro_machine_5e3a87_idx"),
                    models.Index(fields=["binary", "exit_code"], name="machine_pro_binary__7bd19c_idx"),
                ],
            },
        ),
    ]
class Migration(migrations.Migration):
    """Add the indexed, blank-by-default ``worker_type`` field to ``Process``."""

    dependencies = [
        ("machine", "0007_add_process_type_and_parent"),
    ]

    operations = [
        migrations.AddField(
            model_name="process",
            name="worker_type",
            field=models.CharField(
                blank=True,
                db_index=True,
                # Empty string rather than NULL for rows without a worker type.
                default="",
                help_text="Worker type name for WORKER processes (crawl, snapshot, archiveresult)",
                max_length=32,
            ),
        ),
    ]
def remove_output_dir_if_exists(apps, schema_editor):
    """Drop ``machine_binary.output_dir`` when the column is present.

    The column list is read with ``PRAGMA table_info``. If ``output_dir`` is
    absent the function does nothing, so the migration is safe on databases
    that never had the column.
    """
    db_cursor = schema_editor.connection.cursor()
    db_cursor.execute("PRAGMA table_info(machine_binary)")
    # Index 1 of each PRAGMA table_info row is the column name.
    has_output_dir = any(info[1] == "output_dir" for info in db_cursor.fetchall())

    if has_output_dir:
        binary_model = apps.get_model("machine", "Binary")
        stale_field = binary_model._meta.get_field("output_dir")
        schema_editor.remove_field(binary_model, stale_field)
def add_machine_config_if_missing(apps, schema_editor):
    """Add a ``config`` TEXT column to ``machine_machine`` unless it already exists.

    Existing columns are read with ``PRAGMA table_info``, so running this
    twice is harmless.
    """
    db_cursor = schema_editor.connection.cursor()
    db_cursor.execute("PRAGMA table_info(machine_machine)")
    # Index 1 of each PRAGMA table_info row is the column name.
    present_columns = [info[1] for info in db_cursor.fetchall()]
    if "config" in present_columns:
        return
    db_cursor.execute("ALTER TABLE machine_machine ADD COLUMN config TEXT")
class Migration(migrations.Migration):
    """Add the nullable, indexed ``delete_at`` timestamp to ``Process``."""

    dependencies = [
        ("machine", "0015_process_progress_indexes"),
    ]

    operations = [
        migrations.AddField(
            model_name="process",
            name="delete_at",
            # Defaults to NULL, so existing rows get no value.
            field=models.DateTimeField(blank=True, db_index=True, default=None, null=True),
        ),
    ]
class Migration(migrations.Migration):
    """Replace two ``Process`` indexes with shorter-named ones.

    ``machine_pro_progress_recent_idx`` and ``machine_pro_progress_running_idx``
    are removed, and ``mach_proc_recent_idx`` and ``mach_proc_running_idx`` are
    added in their place.
    """

    dependencies = [
        ("machine", "0016_process_delete_at"),
    ]

    operations = [
        # Drop the long-named indexes first...
        migrations.RemoveIndex(
            model_name="process",
            name="machine_pro_progress_recent_idx",
        ),
        migrations.RemoveIndex(
            model_name="process",
            name="machine_pro_progress_running_idx",
        ),
        # ...then add the short-named replacements.
        migrations.AddIndex(
            model_name="process",
            index=models.Index(fields=["machine", "process_type", "-modified_at"], name="mach_proc_recent_idx"),
        ),
        migrations.AddIndex(
            model_name="process",
            index=models.Index(fields=["machine", "status", "process_type"], name="mach_proc_running_idx"),
        ),
    ]
def dedupe_network_interfaces(apps, schema_editor):
    """Delete duplicate ``NetworkInterface`` rows, keeping the newest per identity.

    Rows are grouped by (machine, ip_public, ip_local, mac_address, dns_server).
    In every group with more than one row, the row that sorts first by
    ``-modified_at, -created_at`` is kept and the others are deleted.
    """
    NetworkInterface = apps.get_model("machine", "NetworkInterface")
    identity_fields = ("machine_id", "ip_public", "ip_local", "mac_address", "dns_server")

    groups_with_dupes = NetworkInterface.objects.values(*identity_fields).annotate(count=Count("id")).filter(count__gt=1)

    for group in groups_with_dupes.iterator(chunk_size=100):
        identity = {field: group[field] for field in identity_fields}
        survivor = NetworkInterface.objects.filter(**identity).order_by("-modified_at", "-created_at").first()
        if survivor is None:
            continue
        NetworkInterface.objects.filter(**identity).exclude(id=survivor.id).delete()
def _cmd_array(value):
    """Coerce a stored ``cmd`` value into a list of arguments.

    A list is returned unchanged. A string is parsed as JSON: a JSON list is
    returned, and any other JSON value yields ``[]``. A string that is not
    valid JSON becomes a one-element list, or ``[]`` when empty. Every other
    input type yields ``[]``.
    """
    if isinstance(value, list):
        return value
    if not isinstance(value, str):
        return []

    try:
        decoded = json.loads(value)
    except json.JSONDecodeError:
        # Not JSON: treat a non-empty raw string as a single argument.
        if value:
            return [value]
        return []

    if isinstance(decoded, list):
        return decoded
    return []
def _get_or_create_binary(Binary, machine_id, reference):
    """Find or create the ``Binary`` row on a machine for a command reference.

    Args:
        Binary: the historical ``Binary`` model class.
        machine_id: primary key of the owning machine.
        reference: the command reference string (a path or bare name).

    Returns:
        ``None`` when ``reference`` is blank. Otherwise the newest row on the
        machine whose ``abspath`` equals the reference, else the newest row
        whose ``name`` equals the reference's final path component, else a
        newly created placeholder row.
    """
    reference = str(reference or "").strip()
    if not reference:
        return None

    name = Path(reference).name or reference
    machine_binaries = Binary.objects.filter(machine_id=machine_id)

    # Try the exact path first, then fall back to the bare name.
    for criteria in ({"abspath": reference}, {"name": name}):
        match = machine_binaries.filter(**criteria).order_by("-modified_at", "-created_at").first()
        if match is not None:
            return match

    created_ts = timezone.now()
    placeholder_values = {
        "machine_id": machine_id,
        "created_at": created_ts,
        "modified_at": created_ts,
        "name": name[:63],
        "binproviders": "env",
        "overrides": {},
        "binprovider": "env",
        "abspath": reference[:255],
        "version": "",
        "sha256": "",
        "status": "installed",
        "retry_at": None,
    }
    return Binary.objects.create(**placeholder_values)
class Migration(migrations.Migration):
    """Data migration that runs ``repair_process_binary_iface_links``.

    The reverse operation is a no-op.
    """

    # Django does not wrap a non-atomic migration in a single transaction.
    # NOTE(review): the reason for opting out is not stated in this file --
    # presumably so the per-row saves are not held in one transaction; confirm.
    atomic = False

    dependencies = [
        ("machine", "0019_single_active_runner_constraint"),
    ]

    operations = [
        migrations.RunPython(repair_process_binary_iface_links, migrations.RunPython.noop),
    ]
archivebox.config.common import rprint +from archivebox.base_models.models import ModelWithDeleteAfter, ModelWithHealthStats, normalize_config_json_values +from archivebox.workers.models import BaseStateMachine, ModelWithStateMachine +from .detect import get_host_guid, get_os_info, get_vm_info, get_host_network, get_host_stats + +_psutil: Any | None = None +try: + import psutil as _psutil_import + + PSUTIL_AVAILABLE = True +except ImportError: + PSUTIL_AVAILABLE = False +else: + _psutil = _psutil_import + +if TYPE_CHECKING: + import psutil + from archivebox.core.models import ArchiveResult +else: + psutil = cast(Any, _psutil) + +_CURRENT_MACHINE: Machine | None = None +_CURRENT_INTERFACE: NetworkInterface | None = None +_CURRENT_BINARIES: dict[str, Binary] = {} +_CURRENT_PROCESS: Process | None = None + +MACHINE_RECHECK_INTERVAL = 7 * 24 * 60 * 60 +NETWORK_INTERFACE_RECHECK_INTERVAL = 1 * 60 * 60 +BINARY_RECHECK_INTERVAL = 1 * 30 * 60 +PROCESS_RECHECK_INTERVAL = 60 # Re-validate every 60 seconds +PID_REUSE_WINDOW = timedelta(hours=24) # Max age for considering a PID match valid +PROCESS_TIMEOUT_GRACE = timedelta(seconds=30) # Extra margin before force-cleaning timed-out RUNNING rows +START_TIME_TOLERANCE = 5.0 # Seconds tolerance for start time matching + + +def _default_exit_code_for_unowned_process(process_type: str) -> int: + # Hooks are externally visible work items. If their owning runner disappeared + # before recording the real exit code, retrying is safer than converting an + # unknown interrupted extraction into a durable success/no-result row. 
def _canonical_binary_name(name: Any) -> str:
    """Return the bare binary name for a name-or-path value.

    Args:
        name: a binary name or path; ``None`` and other falsy values are
            treated as an empty string.

    Returns:
        The stripped value. If it contains ``/`` or ``\\``, or starts with
        ``~``, the final component of the user-expanded path is returned
        instead.
    """
    name = str(name or "").strip()
    if "/" in name or "\\" in name or name.startswith("~"):
        path = Path(name)
        try:
            return path.expanduser().name
        except RuntimeError:
            # expanduser() raises RuntimeError when the home directory cannot
            # be resolved (e.g. "~unknownuser/bin/wget"). Fall back to the
            # unexpanded final component instead of crashing.
            return path.name
    return name
def _sanitize_machine_config(config: dict[str, Any] | None, *, lib_dir: str | Path | None = None) -> dict[str, Any]:
    """Return a sanitized copy of ``Machine.config``; the input is not mutated.

    Drops stale ``*_BINARY`` overrides whose path no longer exists or whose
    path falls outside of ``LIB_DIR`` (so a binary uninstall or a lib_dir
    move clears the override automatically). Non-``_BINARY`` keys
    (``BASE_URL``, ``SERVER_SECURITY_MODE``, plugin tunables, etc.) are
    passed through: they are arbitrary config overrides and not ours to filter.

    Args:
        config: the stored config mapping; anything that is not a dict
            yields ``{}``.
        lib_dir: when given, path-like ``*_BINARY`` values must live under
            this directory to be kept.

    Returns:
        A new dict with invalid ``*_BINARY`` entries removed.
    """
    if not isinstance(config, dict):
        return {}

    sanitized = dict(config)
    active_lib_dir = Path(lib_dir).expanduser().absolute() if lib_dir else None
    for key, value in list(sanitized.items()):
        if not str(key).endswith("_BINARY"):
            continue
        if not isinstance(value, str):
            continue
        value = value.strip()
        if not value:
            sanitized.pop(key, None)
            continue
        # Only path-like values are checked on disk; bare names such as
        # "wget" are kept as-is.
        if "/" in value or value.startswith("~"):
            try:
                path = Path(value).expanduser()
                if not path.exists():
                    sanitized.pop(key, None)
                    continue
                if active_lib_dir is not None:
                    resolved_path = path.absolute()
                    try:
                        resolved_path.relative_to(active_lib_dir)
                    except ValueError:
                        sanitized.pop(key, None)
            except (OSError, RuntimeError):
                # OSError: the path could not be inspected. RuntimeError:
                # expanduser() could not resolve a "~user" prefix. Either way
                # the override is unusable, so drop it.
                sanitized.pop(key, None)
    return sanitized
    @classmethod
    def current(cls, refresh: bool = False) -> Machine:
        """Return the ``Machine`` row for this host, creating or refreshing it as needed.

        The result is cached in the module-level ``_CURRENT_MACHINE``. The
        cached object is returned while it is younger than
        ``MACHINE_RECHECK_INTERVAL`` seconds, measured from its ``modified_at``.
        Otherwise the row is looked up by host GUID and created if missing.
        If the row exists but is stale, its host facts are re-detected and
        saved.

        Args:
            refresh: when true, discard the cached object and hit the database.

        Returns:
            The current machine, after its config has been sanitized.
        """
        global _CURRENT_MACHINE
        if refresh:
            _CURRENT_MACHINE = None
        if _CURRENT_MACHINE:
            if timezone.now() < _CURRENT_MACHINE.modified_at + timedelta(seconds=MACHINE_RECHECK_INTERVAL):
                # One-time-per-process reconciliation between ArchiveBox.conf
                # and Machine.config. Fast-path: bool check + early-return when
                # the sync has already run, so the cached-machine return path
                # stays sub-microsecond.
                try:
                    from archivebox.config.collection import sync_machine_and_file

                    sync_machine_and_file(_CURRENT_MACHINE)
                except Exception:
                    # Best-effort: a failed sync must not prevent returning the machine.
                    pass
                return _CURRENT_MACHINE
            else:
                # Cached object is too old; fall through to reload it.
                _CURRENT_MACHINE = None

        host_guid = get_host_guid()
        try:
            _CURRENT_MACHINE = cls.objects.get(guid=host_guid)
        except cls.DoesNotExist:
            # First time this host is seen: record its detected facts.
            _CURRENT_MACHINE = cls.objects.create(
                guid=host_guid,
                hostname=socket.gethostname(),
                **get_os_info(),
                **get_vm_info(),
                stats=get_host_stats(),
            )
        else:
            # Existing row: re-detect host facts only when the row is stale.
            if timezone.now() >= _CURRENT_MACHINE.modified_at + timedelta(seconds=MACHINE_RECHECK_INTERVAL):
                for key, value in {
                    "hostname": socket.gethostname(),
                    **get_os_info(),
                    **get_vm_info(),
                    "stats": get_host_stats(),
                }.items():
                    setattr(_CURRENT_MACHINE, key, value)
                _CURRENT_MACHINE.save(
                    update_fields=[
                        "hostname",
                        "hw_in_docker",
                        "hw_in_vm",
                        "hw_manufacturer",
                        "hw_product",
                        "hw_uuid",
                        "os_arch",
                        "os_family",
                        "os_platform",
                        "os_release",
                        "os_kernel",
                        "stats",
                        "modified_at",
                    ],
                )
        machine = cls._sanitize_config(_CURRENT_MACHINE)
        # Same one-time sync as the cached-return path. Triggers here on the
        # very first ``Machine.current()`` call in a process before any
        # cached return can occur.
        try:
            from archivebox.config.collection import sync_machine_and_file

            sync_machine_and_file(_CURRENT_MACHINE)
        except Exception:
            # Best-effort, as above.
            pass
        return machine
+ """ + from archivebox.config import VERSION + from archivebox.config.common import redact_sensitive_config + + return { + "type": "Machine", + "schema_version": VERSION, + "id": str(self.id), + "guid": self.guid, + "hostname": self.hostname, + "hw_in_docker": self.hw_in_docker, + "hw_in_vm": self.hw_in_vm, + "hw_manufacturer": self.hw_manufacturer, + "hw_product": self.hw_product, + "hw_uuid": self.hw_uuid, + "os_arch": self.os_arch, + "os_family": self.os_family, + "os_platform": self.os_platform, + "os_kernel": self.os_kernel, + "os_release": self.os_release, + "stats": self.stats, + "config": redact_sensitive_config(self.config), + } + + @staticmethod + def from_json(record: dict[str, Any], overrides: dict[str, Any] | None = None): + """ + Update Machine config from JSON dict. + + Args: + record: JSON dict with 'config': {key: value} patch + overrides: Not used + + Returns: + Machine instance or None + """ + config_patch = _sanitize_machine_config(record.get("config")) + if config_patch: + machine = Machine.current() + machine.config = _sanitize_machine_config(machine.config) + machine.config.update(config_patch) + machine.save(update_fields=["config"]) + return machine + return None + + def save(self, *args, **kwargs): + normalized_config = normalize_config_json_values(self.config) + if normalized_config != self.config: + self.config = normalized_config + update_fields = kwargs.get("update_fields") + if update_fields is not None: + kwargs["update_fields"] = tuple(dict.fromkeys([*update_fields, "config"])) + + # Drop the ``Machine.current()`` module-level cache on every save so + # config edits (admin form, from_json, etc.) become live in the same + # process without waiting out the 7-day ``MACHINE_RECHECK_INTERVAL``. + # The cache reads ``_CURRENT_MACHINE.modified_at`` and that value + # never moves forward on the cached object even when the row is + # updated in the DB, so without this we'd keep serving stale + # ``machine.config`` (incl. 
``BASE_URL``) until the worker restarts. + update_fields = kwargs.get("update_fields") + super().save(*args, **kwargs) + global _CURRENT_MACHINE + if _CURRENT_MACHINE is not None and _CURRENT_MACHINE.pk == self.pk: + _CURRENT_MACHINE = None + + # Mirror Machine.config into ArchiveBox.conf so the two stores stay + # 1:1. Skipped when ``update_fields`` is set and doesn't touch + # ``config`` (binary autodetection + ``hostname``/``stats`` refreshes + # save fields we don't need to disk-mirror, which keeps hot paths + # zero-IO). Errors during mirroring are swallowed: the DB write + # already succeeded and we don't want a config-file write hiccup to + # turn a routine save into a 500. + if update_fields is not None and "config" not in update_fields: + return + try: + from archivebox.config.collection import mirror_machine_config_to_file + + mirror_machine_config_to_file(self.config) + except Exception: + pass + + +class NetworkInterfaceManager(models.Manager): + def current(self) -> NetworkInterface: + return NetworkInterface.current() + + +class NetworkInterface(ModelWithHealthStats): + id = CompactUUIDField(primary_key=True, default=uuid7, editable=False, unique=True) + created_at = models.DateTimeField(default=timezone.now, db_index=True) + modified_at = models.DateTimeField(auto_now=True) + machine = models.ForeignKey(Machine, on_delete=models.CASCADE, default=None, null=False) + mac_address = models.CharField(max_length=17, default=None, null=False, editable=False) + ip_public = models.GenericIPAddressField(default=None, null=False, editable=False) + ip_local = models.GenericIPAddressField(default=None, null=False, editable=False) + dns_server = models.GenericIPAddressField(default=None, null=False, editable=False) + hostname = models.CharField(max_length=63, default=None, null=False) + iface = models.CharField(max_length=15, default=None, null=False) + isp = models.CharField(max_length=63, default=None, null=False) + city = models.CharField(max_length=63, 
default=None, null=False) + region = models.CharField(max_length=63, default=None, null=False) + country = models.CharField(max_length=63, default=None, null=False) + # num_uses_failed = models.PositiveIntegerField(default=0) # from ModelWithHealthStats + # num_uses_succeeded = models.PositiveIntegerField(default=0) # from ModelWithHealthStats + + objects = NetworkInterfaceManager() # pyright: ignore[reportIncompatibleVariableOverride] + machine_id: uuid.UUID + + class Meta(ModelWithHealthStats.Meta): + app_label = "machine" + unique_together = (("machine", "ip_public", "ip_local", "mac_address", "dns_server"),) + constraints = [ + models.UniqueConstraint( + fields=["machine", "ip_public", "ip_local", "mac_address", "dns_server"], + name="unique_network_interface_identity", + ), + ] + + @classmethod + def current(cls, refresh: bool = False) -> NetworkInterface: + global _CURRENT_INTERFACE + machine = Machine.current(refresh=refresh) + if _CURRENT_INTERFACE and _CURRENT_INTERFACE.machine_id == machine.id: + if not refresh: + # Callers that pass refresh=False are asking for attribution to + # the currently known interface, not for public-IP/ISP probing. + # Maintenance paths create many short-lived services per run; + # expiring this in-memory object by age forced every crawl to + # hit external network APIs even though Process rows only need + # a stable existing FK. Active downloading paths opt into live + # detection with refresh=True. 
+ return _CURRENT_INTERFACE + if timezone.now() < _CURRENT_INTERFACE.modified_at + timedelta(seconds=NETWORK_INTERFACE_RECHECK_INTERVAL): + return _CURRENT_INTERFACE + _CURRENT_INTERFACE = None + + if not refresh: + _CURRENT_INTERFACE = cls.objects.filter(machine=machine).order_by("-modified_at", "-created_at").first() + if _CURRENT_INTERFACE is not None: + return _CURRENT_INTERFACE + + net_info = get_host_network() + lookup = dict( + machine=machine, + ip_public=net_info.pop("ip_public"), + ip_local=net_info.pop("ip_local"), + mac_address=net_info.pop("mac_address"), + dns_server=net_info.pop("dns_server"), + ) + _CURRENT_INTERFACE = cls.objects.filter(**lookup).order_by("-modified_at", "-created_at").first() + if _CURRENT_INTERFACE is None: + try: + _CURRENT_INTERFACE = cls.objects.create(**lookup, **net_info) + except IntegrityError: + _CURRENT_INTERFACE = cls.objects.filter(**lookup).order_by("-modified_at", "-created_at").first() + if _CURRENT_INTERFACE is None: + raise + else: + # Avoid update_or_create() here: command startup calls this before + # leadership handoff, and SQLite should not take a write lock unless + # the cached interface metadata is actually stale. 
+ if refresh or timezone.now() >= _CURRENT_INTERFACE.modified_at + timedelta(seconds=NETWORK_INTERFACE_RECHECK_INTERVAL): + updates = ["modified_at"] + for key, value in net_info.items(): + if _CURRENT_INTERFACE.__dict__.get(key) != value: + setattr(_CURRENT_INTERFACE, key, value) + updates.append(key) + if len(updates) > 1: + _CURRENT_INTERFACE.save(update_fields=updates) + return _CURRENT_INTERFACE + + +class BinaryManager(models.Manager): + def get_from_db_or_cache(self, name: str, abspath: str = "", version: str = "", sha256: str = "", binprovider: str = "env") -> Binary: + """Get or create a Binary record from the database or cache.""" + cached = _CURRENT_BINARIES.get(name) + if cached and timezone.now() < cached.modified_at + timedelta(seconds=BINARY_RECHECK_INTERVAL): + return cached + _CURRENT_BINARIES[name], _ = self.update_or_create( + machine=Machine.current(), + name=name, + binprovider=binprovider, + version=version, + abspath=abspath, + sha256=sha256, + ) + return _CURRENT_BINARIES[name] + + def get_valid_binary(self, name: str, machine: Machine | None = None) -> Binary | None: + """Get a valid Binary for the given name on the current machine, or None if not found.""" + machine = machine or Machine.current() + return ( + self.filter( + machine=machine, + name__iexact=name, + ) + .exclude(abspath="") + .exclude(abspath__isnull=True) + .order_by("-modified_at") + .first() + ) + + +class Binary(ModelWithHealthStats, ModelWithStateMachine): + """ + Tracks a binary on a specific machine. + + Simple state machine with 2 states: + - queued: Binary needs to be installed + - installed: Binary installed successfully (abspath, version, sha256 populated) + + Installation is synchronous during queued→installed transition. + If installation fails, Binary stays in queued with retry_at set for later retry. + + State machine calls run(), which emits an abxpkg BinaryRequestEvent through + the ArchiveBox runner and installs the binary using the specified providers. 
+ """ + + class StatusChoices(models.TextChoices): + QUEUED = "queued", "Queued" + INSTALLED = "installed", "Installed" + + id = CompactUUIDField(primary_key=True, default=uuid7, editable=False, unique=True) + created_at = models.DateTimeField(default=timezone.now, db_index=True) + modified_at = models.DateTimeField(auto_now=True) + machine = models.ForeignKey(Machine, on_delete=models.CASCADE, null=False) + + # Binary metadata + name = models.CharField(max_length=63, default="", null=False, blank=True, db_index=True) + binproviders = models.CharField( + max_length=127, + default="env", + null=False, + blank=True, + help_text="Comma-separated list of allowed providers: apt,brew,pip,npm,env", + ) + overrides = models.JSONField( + default=dict, + blank=True, + help_text="Provider-specific overrides: {'apt': {'install_args': ['pkg']}, ...}", + ) + + # Installation results (populated after installation) + binprovider = models.CharField( + max_length=31, + default="", + null=False, + blank=True, + help_text="Provider that successfully installed this binary", + ) + abspath = models.CharField(max_length=255, default="", null=False, blank=True) + version = models.CharField(max_length=32, default="", null=False, blank=True) + sha256 = models.CharField(max_length=64, default="", null=False, blank=True) + + # State machine fields + status = ModelWithStateMachine.StatusField(choices=StatusChoices.choices, default=StatusChoices.QUEUED, max_length=16) + retry_at = ModelWithStateMachine.RetryAtField( + default=timezone.now, + help_text="When to retry this binary installation", + ) + + # Health stats + num_uses_failed = models.PositiveIntegerField(default=0) + num_uses_succeeded = models.PositiveIntegerField(default=0) + + machine_id: uuid.UUID + + state_machine_name: str | None = "archivebox.machine.models.BinaryMachine" + active_state: str = StatusChoices.QUEUED + warn_on_save_outside_runner = False + + objects = BinaryManager() # pyright: 
ignore[reportIncompatibleVariableOverride] + + if TYPE_CHECKING: + + @property + def sm(self) -> BinaryMachine: ... + + class Meta(ModelWithHealthStats.Meta, ModelWithStateMachine.Meta): + app_label = "machine" + verbose_name = "Binary" + verbose_name_plural = "Binaries" + unique_together = (("machine", "name", "abspath", "version", "sha256"),) + + def __str__(self) -> str: + return f"{self.name}@{self.binprovider}+{self.abspath}@{self.version}" + + @property + def is_valid(self) -> bool: + """A binary is valid if it has a resolved path and is marked installed.""" + return bool(self.abspath) and self.status == self.StatusChoices.INSTALLED + + @cached_property + def binary_info(self) -> dict: + """Return info about the binary.""" + return { + "name": self.name, + "abspath": self.abspath, + "version": self.version, + "binprovider": self.binprovider, + "is_valid": self.is_valid, + } + + @property + def output_dir(self) -> Path: + """ + Get output directory for this binary's hook logs. + Path: data/machines/{machine_uuid}/binaries/{binary_name}/{binary_uuid} + """ + return CONSTANTS.DATA_DIR / "machines" / str(self.machine_id) / "binaries" / self.name / str(self.id) + + def to_json(self) -> dict: + """ + Convert Binary model instance to a JSON-serializable dict. + """ + from archivebox.config import VERSION + + is_installed = bool(self.abspath and self.version) + return { + "type": "Binary" if is_installed else "BinaryRequest", + "schema_version": VERSION, + "id": str(self.id), + "machine_id": str(self.machine_id), + "name": self.name, + "binproviders": self.binproviders, + "overrides": self.overrides, + "binprovider": self.binprovider, + "abspath": self.abspath, + "version": self.version, + "sha256": self.sha256, + "status": self.status, + } + + @staticmethod + def from_json(record: dict[str, Any], overrides: dict[str, Any] | None = None): + """ + Create/update Binary from JSON dict. + + Handles two cases: + 1. 
From binaries.json: creates queued binary with name, binproviders, overrides + 2. From hook output: updates binary with abspath, version, sha256, binprovider + + Args: + record: JSON dict with 'name' and either: + - 'binproviders', 'overrides' (from binaries.json) + - 'abspath', 'version', 'sha256', 'binprovider' (from hook output) + overrides: Not used + + Returns: + Binary instance or None + """ + name = _canonical_binary_name(record.get("name")) + if not name: + return None + + machine = Machine.current() + overrides = overrides or {} + binary_overrides = record.get("overrides", {}) + normalized_overrides = binary_overrides if isinstance(binary_overrides, dict) else {} + + # Case 1: Already installed (from on_Crawl hooks) - has abspath AND binproviders + # This happens when on_Crawl hooks detect already-installed binaries + abspath = record.get("abspath") + version = record.get("version") + binproviders = record.get("binproviders") + + if abspath and version and binproviders: + # Binary is already installed, create INSTALLED record with binproviders filter + binary, _ = Binary.objects.update_or_create( + machine=machine, + name=name, + defaults={ + "abspath": abspath, + "version": version, + "sha256": record.get("sha256", ""), + "binprovider": record.get("binprovider", "env"), + "binproviders": binproviders, # Preserve the filter + "status": Binary.StatusChoices.INSTALLED, + "retry_at": None, + }, + ) + from archivebox.config.common import get_config + + binary.symlink_to_lib_bin_after_commit(get_config().LIB_DIR / "bin") + return binary + + # Case 2: From binaries.json - create queued binary (needs installation) + if "binproviders" in record or ("overrides" in record and not abspath): + binary, _ = Binary.objects.update_or_create( + machine=machine, + name=name, + defaults={ + "binproviders": record.get("binproviders", "env"), + "overrides": normalized_overrides, + "status": Binary.StatusChoices.QUEUED, + "retry_at": timezone.now(), + }, + ) + return binary + + 
# Case 3: From Binary output - update with installation results + if abspath and version: + binary, _ = Binary.objects.update_or_create( + machine=machine, + name=name, + defaults={ + "abspath": abspath, + "version": version, + "sha256": record.get("sha256", ""), + "binprovider": record.get("binprovider", "env"), + "status": Binary.StatusChoices.INSTALLED, + "retry_at": None, + }, + ) + from archivebox.config.common import get_config + + binary.symlink_to_lib_bin_after_commit(get_config().LIB_DIR / "bin") + return binary + + return None + + def _allowed_binproviders(self) -> set[str] | None: + """Return the allowed binproviders for this binary, or None for wildcard.""" + providers = str(self.binproviders or "").strip() + if not providers or providers == "*": + return None + return {provider.strip() for provider in providers.split(",") if provider.strip()} + + def run(self): + """ + Execute binary installation through the ArchiveBox binary runner. + """ + from archivebox.services.runner import run_binary + + run_binary(str(self.id)) + + def cleanup(self): + """ + Clean up background binary installation hooks. + + Called by state machine if needed (not typically used for binaries + since installations are foreground, but included for consistency). + """ + + # Clean up .pid files from output directory + output_dir = self.output_dir + if output_dir.exists(): + for pid_file in output_dir.glob("**/*.pid"): + pid_file.unlink(missing_ok=True) + + def symlink_to_lib_bin(self, lib_bin_dir: str | Path) -> Path | None: + """ + Symlink this binary into a derived lib/bin directory for human-facing convenience. + + After a binary is installed by any binprovider (pip, npm, brew, apt, etc), + we can optionally expose a flat convenience directory for shell users. + ArchiveBox/abx-dl runtime lookup must use the provider-specific LIB_DIR + paths, not this indirection. 
+ + Args: + lib_bin_dir: Path to the derived convenience bin dir (e.g., /data/lib/bin) + + Returns: + Path to the created symlink, or None if symlinking failed + + Example: + >>> binary = Binary.objects.get(name='yt-dlp') + >>> binary.symlink_to_lib_bin('/data/lib/arm64-darwin/bin') + Path('/data/lib/arm64-darwin/bin/yt-dlp') + """ + import sys + from pathlib import Path + + if not self.abspath: + return None + + binary_abspath = Path(self.abspath).resolve() + lib_bin_dir = Path(lib_bin_dir).resolve() + binary_parts = binary_abspath.parts + try: + app_index = next(index for index, part in enumerate(binary_parts) if part.endswith(".app")) + except StopIteration: + app_index = -1 + + # Create the derived convenience bin dir if it doesn't exist. + try: + lib_bin_dir.mkdir(parents=True, exist_ok=True) + except (OSError, PermissionError) as e: + print(f"Failed to create lib/bin convenience dir {lib_bin_dir}: {e}", file=sys.stderr) + return None + + # Expose the canonical Binary.name in the convenience bin dir. Some providers point + # abspath at implementation files like cli.js or manifest.json; those + # are valid targets, but they are not user-facing binary names. 
+ binary_name = _canonical_binary_name(self.name) or binary_abspath.name + symlink_path = lib_bin_dir / binary_name + + if app_index != -1 and len(binary_parts) > app_index + 2 and binary_parts[app_index + 1 : app_index + 3] == ("Contents", "MacOS"): + if symlink_path.exists() or symlink_path.is_symlink(): + try: + symlink_path.unlink() + except (OSError, PermissionError) as e: + print(f"Failed to remove existing file at {symlink_path}: {e}", file=sys.stderr) + return None + return binary_abspath + + # Remove existing symlink/file if it exists + if symlink_path.exists() or symlink_path.is_symlink(): + try: + # Check if it's already pointing to the right place + if symlink_path.is_symlink() and symlink_path.resolve() == binary_abspath: + # Already correctly symlinked, nothing to do + return symlink_path + + # Remove old symlink/file + symlink_path.unlink() + except (OSError, PermissionError) as e: + print(f"Failed to remove existing file at {symlink_path}: {e}", file=sys.stderr) + return None + + # Create new symlink + try: + symlink_path.symlink_to(binary_abspath) + return symlink_path + except (OSError, PermissionError) as e: + print(f"Failed to create symlink {symlink_path} -> {binary_abspath}: {e}", file=sys.stderr) + return None + + def symlink_to_lib_bin_after_commit(self, lib_bin_dir: str | Path) -> None: + """ + Symlink after the current DB transaction commits. + + Binary rows are projections of provider/hook state and are allowed to be + updated directly, but filesystem writes must not run while an outer + transaction is still open. Refetch after commit so the symlink points at + the committed row, not a possibly-rolled-back in-memory value. 
+ """ + binary_id = self.id + lib_bin_path = Path(lib_bin_dir) + + def create_symlink() -> None: + binary = type(self).objects.filter(id=binary_id).first() + if binary is not None: + binary.symlink_to_lib_bin(lib_bin_path) + + transaction.on_commit(create_symlink) + + +# ============================================================================= +# Process Model +# ============================================================================= + + +class ProcessManager(models.Manager): + """Manager for Process model.""" + + def current(self) -> Process: + """Get the Process record for the current OS process.""" + return Process.current() + + def get_by_pid(self, pid: int, machine: Machine | None = None) -> Process | None: + """ + Find a Process by PID with proper validation against PID reuse. + + IMPORTANT: PIDs are reused by the OS! This method: + 1. Filters by machine (required - PIDs are only unique per machine) + 2. Filters by time window (processes older than 24h are stale) + 3. Validates via psutil that start times match + + Args: + pid: OS process ID + machine: Machine instance (defaults to current machine) + + Returns: + Process if found and validated, None otherwise + """ + if not PSUTIL_AVAILABLE: + return None + + machine = machine or Machine.current() + + # Get the actual process start time from OS + try: + os_proc = psutil.Process(pid) + os_start_time = os_proc.create_time() + except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess): + # Process doesn't exist - any DB record with this PID is stale + return None + + # Query candidates: same machine, same PID, recent, still RUNNING + candidates = self.filter( + machine=machine, + pid=pid, + status=Process.StatusChoices.RUNNING, + started_at__gte=timezone.now() - PID_REUSE_WINDOW, + ).order_by("-started_at") + + for candidate in candidates: + # Validate start time matches (within tolerance) + if candidate.started_at: + db_start_time = candidate.started_at.timestamp() + if abs(db_start_time 
- os_start_time) < START_TIME_TOLERANCE: + return candidate + + return None + + def create_for_archiveresult(self, archiveresult, **kwargs): + """ + Create a Process record for an ArchiveResult. + + Called during migration and when creating new ArchiveResults. + """ + iface = kwargs.get("iface") or NetworkInterface.current() + + # Defaults from ArchiveResult if not provided + defaults = { + "machine": iface.machine, + "pwd": kwargs.get("pwd") or str(archiveresult.snapshot.output_dir / archiveresult.plugin), + "cmd": kwargs.get("cmd") or [], + "status": "queued", + "timeout": kwargs.get("timeout", 120), + "env": kwargs.get("env", {}), + "iface": iface, + } + defaults.update(kwargs) + + process = self.create(**defaults) + return process + + +class Process(ModelWithDeleteAfter, models.Model): + """ + Tracks a single OS process execution. + + Process represents the actual subprocess spawned to execute a hook. + One Process can optionally be associated with an ArchiveResult (via OneToOne), + but Process can also exist standalone for internal operations. + + Follows the unified state machine pattern: + - queued: Process ready to launch + - running: Process actively executing + - exited: Process completed (check exit_code for success/failure) + + State machine calls launch() to spawn the process and monitors its lifecycle. 
+ """ + + class StatusChoices(models.TextChoices): + QUEUED = "queued", "Queued" + RUNNING = "running", "Running" + EXITED = "exited", "Exited" + + class TypeChoices(models.TextChoices): + SUPERVISORD = "supervisord", "Supervisord" + ORCHESTRATOR = "orchestrator", "Orchestrator" + SERVER = "server", "Server" + UPDATE = "update", "Update" + ADD = "add", "Add" + SEARCH = "search", "Search" + WORKER = "worker", "Worker" + CLI = "cli", "CLI" + HOOK = "hook", "Hook" + BINARY = "binary", "Binary" + + # Primary fields + id = CompactUUIDField(primary_key=True, default=uuid7, editable=False, unique=True) + created_at = models.DateTimeField(default=timezone.now, db_index=True) + modified_at = models.DateTimeField(auto_now=True) + + # Machine FK - required (every process runs on a machine) + machine = models.ForeignKey( + Machine, + on_delete=models.CASCADE, + null=False, + related_name="process_set", + help_text="Machine where this process executed", + ) + + # Parent process (optional) + parent = models.ForeignKey( + "self", + on_delete=models.SET_NULL, + null=True, + blank=True, + related_name="children", + help_text="Parent process that spawned this process", + ) + + # Process type (cli, worker, orchestrator, binary, supervisord) + process_type = models.CharField( + max_length=16, + choices=TypeChoices.choices, + default=TypeChoices.CLI, + db_index=True, + help_text="Type of process (cli, worker, orchestrator, binary, supervisord)", + ) + + # Worker type (only for WORKER processes: crawl, snapshot, archiveresult) + worker_type = models.CharField( + max_length=32, + default="", + null=False, + blank=True, + db_index=True, + help_text="Worker role name for worker/orchestrator subprocesses", + ) + + # Execution metadata + pwd = models.CharField( + max_length=512, + default="", + null=False, + blank=True, + help_text="Working directory for process execution", + ) + cmd = models.JSONField( + default=list, + null=False, + blank=True, + help_text="Command as array of arguments", 
+ ) + env = models.JSONField( + default=dict, + null=False, + blank=True, + help_text="Environment variables for process", + ) + timeout = models.IntegerField( + default=120, + null=False, + help_text="Timeout in seconds", + ) + + # Process results + pid = models.IntegerField( + default=None, + null=True, + blank=True, + help_text="OS process ID", + ) + exit_code = models.IntegerField( + default=None, + null=True, + blank=True, + help_text="Process exit code (0 = success)", + ) + stdout = models.TextField( + default="", + null=False, + blank=True, + help_text="Standard output from process", + ) + stderr = models.TextField( + default="", + null=False, + blank=True, + help_text="Standard error from process", + ) + + # Timing + started_at = models.DateTimeField( + default=None, + null=True, + blank=True, + help_text="When process was launched", + ) + ended_at = models.DateTimeField( + default=None, + null=True, + blank=True, + help_text="When process completed/terminated", + ) + + # Optional FKs + binary = models.ForeignKey( + Binary, + on_delete=models.SET_NULL, + null=True, + blank=True, + related_name="process_set", + help_text="Binary used by this process", + ) + iface = models.ForeignKey( + NetworkInterface, + on_delete=models.SET_NULL, + null=True, + blank=True, + related_name="process_set", + help_text="Network interface used by this process", + ) + + # Optional connection URL (for CDP, sonic, etc.) 
+ url = models.URLField( + max_length=2048, + default=None, + null=True, + blank=True, + help_text="Connection URL (CDP endpoint, sonic server, etc.)", + ) + + # Reverse relation to ArchiveResult (OneToOne from AR side) + # archiveresult: OneToOneField defined on ArchiveResult model + + # State machine fields + status = models.CharField( + max_length=16, + choices=StatusChoices.choices, + default=StatusChoices.QUEUED, + db_index=True, + ) + retry_at = models.DateTimeField( + default=timezone.now, + null=True, + blank=True, + db_index=True, + help_text="When to retry this process", + ) + + machine_id: uuid.UUID + parent_id: uuid.UUID | None + binary_id: uuid.UUID | None + children: models.Manager[Process] + archiveresult: ArchiveResult + + state_machine_name: str = "archivebox.machine.models.ProcessMachine" + delete_after_final_statuses = (StatusChoices.EXITED,) + + objects = ProcessManager() # pyright: ignore[reportIncompatibleVariableOverride] + + class Meta(ModelWithDeleteAfter.Meta): + app_label = "machine" + verbose_name = "Process" + verbose_name_plural = "Processes" + indexes = [ + models.Index(fields=["machine", "status", "retry_at"]), + models.Index(fields=["binary", "exit_code"]), + models.Index(fields=["pid", "started_at"]), + models.Index(fields=["process_type", "worker_type", "pwd", "started_at"]), + models.Index(fields=["machine", "process_type", "-modified_at"], name="mach_proc_recent_idx"), + models.Index(fields=["machine", "status", "process_type"], name="mach_proc_running_idx"), + ] + constraints = [ + models.UniqueConstraint( + fields=["machine", "pwd"], + condition=Q(status="running", process_type="orchestrator", worker_type="worker_runner"), + name="single_active_runner_per_data_dir", + ), + ] + + def __str__(self) -> str: + cmd_str = " ".join(self.cmd[:3]) if self.cmd else "(no cmd)" + return f"Process[{self.id}] {cmd_str} ({self.status})" + + def get_delete_after_config_value(self): + value = self.env.get("DELETE_AFTER") + if value not in 
(None, ""): + return value + value = (self.machine.config or {}).get("DELETE_AFTER") + if value not in (None, ""): + return value + return "0" + + @classmethod + def missing_delete_at_candidates(cls): + return cls.objects.filter(delete_at__isnull=True).filter( + Q(env__has_key="DELETE_AFTER") | Q(machine__config__has_key="DELETE_AFTER"), + ) + + # Properties that delegate to related objects + @property + def cmd_version(self) -> str: + """Get version from associated binary.""" + return self.binary.version if self.binary else "" + + @property + def bin_abspath(self) -> str: + """Get absolute path from associated binary.""" + return self.binary.abspath if self.binary else "" + + @property + def plugin(self) -> str: + """Get plugin name from associated ArchiveResult (if any).""" + try: + return self.archiveresult.plugin + except Process.archiveresult.RelatedObjectDoesNotExist: + return "" + + @property + def hook_name(self) -> str: + """Get hook name from associated ArchiveResult (if any).""" + try: + return self.archiveresult.hook_name + except Process.archiveresult.RelatedObjectDoesNotExist: + return "" + + def to_json(self) -> dict: + """ + Convert Process model instance to a JSON-serializable dict. 
+ """ + from archivebox.config import VERSION + + record = { + "type": "Process", + "schema_version": VERSION, + "id": str(self.id), + "machine_id": str(self.machine_id), + "cmd": self.cmd, + "pwd": self.pwd, + "status": self.status, + "exit_code": self.exit_code, + "started_at": self.started_at.isoformat() if self.started_at else None, + "ended_at": self.ended_at.isoformat() if self.ended_at else None, + } + # Include optional fields if set + if self.binary_id: + record["binary_id"] = str(self.binary_id) + if self.pid: + record["pid"] = self.pid + if self.timeout: + record["timeout"] = self.timeout + return record + + def hydrate_binary_from_context(self, *, plugin_name: str = "", hook_path: str = "") -> Binary | None: + machine = self.machine if self.machine_id else Machine.current() + + references: list[str] = [] + for key in _get_process_binary_env_keys(plugin_name, hook_path, self.env): + value = str(self.env.get(key) or "").strip() + if value and value not in references: + references.append(value) + + if self.cmd: + cmd_0 = str(self.cmd[0]).strip() + if cmd_0 and cmd_0 not in references: + references.append(cmd_0) + + for reference in references: + binary = _find_existing_binary_for_reference(machine, reference) + if binary: + self.binary = binary + return binary + + return None + + @classmethod + def parse_records_from_text(cls, text: str) -> list[dict]: + """Parse JSONL records from raw text using the shared JSONL parser.""" + from archivebox.misc.jsonl import parse_line + + records: list[dict] = [] + if not text: + return records + for line in text.splitlines(): + record = parse_line(line) + if record and record.get("type"): + records.append(record) + return records + + def get_records(self) -> list[dict]: + """Parse JSONL records from this process's stdout.""" + stdout = self.stdout + if not stdout and self.stdout_file and self.stdout_file.exists(): + stdout = self.stdout_file.read_text(errors="replace") + return self.parse_records_from_text(stdout or "") 
def safe_update(self, update_fields: dict[str, Any], *, refresh: bool = True, extra_filter: dict[str, Any] | None = None) -> bool:
    """
    Apply a compare-and-swap UPDATE to this row.

    The UPDATE only matches while the row's ``modified_at`` still equals the
    value held by this instance (plus any ``extra_filter`` conditions), so a
    concurrent writer makes the call a no-op instead of being overwritten.
    ``save()`` is never used.

    Args:
        update_fields: Column values to write. ``modified_at`` is filled in
            with the current time when the caller did not supply it.
        refresh: Reload this instance from the DB afterwards (a row that has
            disappeared is tolerated).
        extra_filter: Additional queryset filter kwargs that must also match.

    Returns:
        True when exactly one row was updated, False otherwise.
    """
    model = type(self)

    payload = dict(update_fields)
    if "modified_at" not in payload:
        payload["modified_at"] = timezone.now()

    # The modified_at equality is the "compare" half of the compare-and-swap.
    matching = model.objects.filter(pk=self.pk, modified_at=self.modified_at)
    if extra_filter:
        matching = matching.filter(**extra_filter)
    rows_changed = matching.update(**payload)

    if refresh:
        try:
            self.refresh_from_db()
        except model.DoesNotExist:
            pass

    return rows_changed == 1
def mark_exited(self, *, exit_code: int = 0) -> None:
    """
    Record that this process has exited with ``exit_code``.

    Nothing is written when the row is already EXITED with the same exit
    code. Otherwise status, exit code and ``retry_at`` are updated, an
    existing ``ended_at`` is preserved (the current time is used only when
    none is set), and the touched columns are saved.

    Args:
        exit_code: Exit status to store (defaults to 0).
    """
    exited = self.StatusChoices.EXITED
    already_recorded = self.status == exited and self.exit_code == exit_code
    if already_recorded:
        return

    self.status = exited
    self.exit_code = exit_code
    if not self.ended_at:
        self.ended_at = timezone.now()
    self.retry_at = None
    self.save(update_fields=["status", "exit_code", "ended_at", "retry_at", "modified_at"])
PID reuse cannot + # happen while this process is alive, so pid + present started_at/cmd is + # enough here; the slower psutil validation below remains the fallback + # for missing/stale cache. + if ( + _CURRENT_PROCESS + and _CURRENT_PROCESS.pid == current_pid + and _CURRENT_PROCESS.status == cls.StatusChoices.RUNNING + and timezone.now() < _CURRENT_PROCESS.modified_at + timedelta(seconds=PROCESS_RECHECK_INTERVAL) + and _CURRENT_PROCESS.started_at is not None + and bool(_CURRENT_PROCESS.cmd) + ): + return _CURRENT_PROCESS + + machine = Machine.current() + iface = NetworkInterface.current() + + # Check cache validity + if _CURRENT_PROCESS: + # Verify: same PID, same machine, cache not expired + if ( + _CURRENT_PROCESS.pid == current_pid + and _CURRENT_PROCESS.machine_id == machine.id + and timezone.now() < _CURRENT_PROCESS.modified_at + timedelta(seconds=PROCESS_RECHECK_INTERVAL) + ): + if _CURRENT_PROCESS.iface_id != iface.id: + _CURRENT_PROCESS.iface = iface + _CURRENT_PROCESS.save(update_fields=["iface", "modified_at"]) + _CURRENT_PROCESS.ensure_log_files() + return _CURRENT_PROCESS + _CURRENT_PROCESS = None + + # Get actual process start time from OS for validation + os_start_time = None + if PSUTIL_AVAILABLE: + try: + os_proc = psutil.Process(current_pid) + os_start_time = os_proc.create_time() + except (psutil.NoSuchProcess, psutil.AccessDenied): + pass + + # Try to find existing Process for this PID on this machine + # Filter by: machine + PID + RUNNING + recent + start time matches + if os_start_time: + existing = ( + cls.objects.filter( + machine=machine, + pid=current_pid, + status=cls.StatusChoices.RUNNING, + started_at__gte=timezone.now() - PID_REUSE_WINDOW, + ) + .order_by("-started_at") + .first() + ) + + if existing and existing.started_at: + db_start_time = existing.started_at.timestamp() + if abs(db_start_time - os_start_time) < START_TIME_TOLERANCE: + _CURRENT_PROCESS = existing + if existing.iface_id != iface.id: + existing.iface = iface + 
existing.save(update_fields=["iface", "modified_at"]) + _CURRENT_PROCESS.ensure_log_files() + return existing + + # No valid existing record - create new one + parent = cls._find_parent_process(machine) + process_type = cls._detect_process_type() + + # Use psutil cmdline if available (matches what proc() will validate against) + # Otherwise fall back to sys.argv + cmd = sys.argv + if PSUTIL_AVAILABLE: + try: + os_proc = psutil.Process(current_pid) + cmd = os_proc.cmdline() + except (psutil.NoSuchProcess, psutil.AccessDenied): + pass + + # Use psutil start time if available (more accurate than timezone.now()) + if os_start_time: + started_at = datetime.fromtimestamp(os_start_time, tz=timezone.get_current_timezone()) + else: + started_at = timezone.now() + + _CURRENT_PROCESS = cls.objects.create( + machine=machine, + parent=parent, + process_type=process_type, + cmd=cmd, + pwd=os.getcwd(), + pid=current_pid, + started_at=started_at, + status=cls.StatusChoices.RUNNING, + iface=iface, + ) + _CURRENT_PROCESS.ensure_log_files() + return _CURRENT_PROCESS + + @classmethod + def _find_parent_process(cls, machine: Machine | None = None) -> Process | None: + """ + Find the parent Process record by looking up PPID. + + IMPORTANT: Validates against PID reuse by checking: + 1. Same machine (PIDs are only unique per machine) + 2. Start time matches OS process start time + 3. Process is still RUNNING and recent + + Returns None if parent is not an ArchiveBox process. 
@classmethod
def _detect_process_type(cls) -> str:
    """
    Classify the current process by inspecting ``sys.argv``.

    Rules are checked in order and the first match wins: supervisord,
    runner_watch worker, server, update, add, search/list, run
    (orchestrator), any other archivebox command (CLI), and finally BINARY.

    ``archivebox add --bg`` is a fire-and-forget queue write — it does not
    run the runner or own the runtime stack — so it is reported as CLI
    rather than ADD. ADD is reserved for the foreground ``archivebox add``
    flow that takes over the runtime stack via
    ``current_command(TypeChoices.ADD, ...)``. Reporting ``--bg`` as ADD
    makes ``runtime_stack_owner`` treat it as a newer stack owner for the few
    seconds it is alive, knocks the running ``archivebox server`` out of
    leadership, and triggers a supervisord tear-down + respawn cycle (~5s of
    dead time per add). Deciding this at insert time avoids any window where
    the row briefly exists as ADD before a higher-level demotion.

    Returns:
        One of the ``TypeChoices`` values.
    """
    kinds = cls.TypeChoices
    args = [str(arg) for arg in sys.argv]
    command_line = " ".join(args).lower()
    program = Path(args[0]).name.lower() if args else ""

    def mentions(*needles: str) -> bool:
        # Substring match against the lower-cased, space-joined argv.
        return any(needle in command_line for needle in needles)

    has_supervisord_config = any(arg.startswith("--configuration=") for arg in args[1:])
    if program == "supervisord" or has_supervisord_config:
        return kinds.SUPERVISORD
    if mentions("runner_watch"):
        return kinds.WORKER
    if mentions("archivebox server"):
        return kinds.SERVER
    if mentions("archivebox update"):
        return kinds.UPDATE
    if mentions("archivebox add"):
        # Background adds only enqueue work, so they must not claim ADD.
        return kinds.CLI if "--bg" in sys.argv else kinds.ADD
    if mentions("archivebox search", "archivebox list"):
        return kinds.SEARCH
    if mentions("archivebox run"):
        return kinds.ORCHESTRATOR
    if mentions("archivebox"):
        return kinds.CLI
    return kinds.BINARY
@property
def root(self) -> Process:
    """Walk up the ``parent`` links and return the topmost process (the one with no ``parent_id``)."""
    node = self
    while True:
        if not node.parent_id:
            return node
        node = node.parent
@property
def proc(self) -> psutil.Process | None:
    """
    Return a validated ``psutil.Process`` for this record, or None.

    A handle is returned only when all of the following hold:
    1. psutil is available and this record has a PID
    2. the record belongs to the current machine
    3. a process with that PID exists in the OS
    4. its OS start time matches ``started_at`` within START_TIME_TOLERANCE
       (skipped when ``started_at`` is unset)
    5. the recorded binary (``cmd[0]``) appears in the OS command line, by
       full value or by basename (skipped when the command line is empty or
       cannot be read)

    None is returned when the process has exited, when the PID was reused by
    a different process, or when the process disappears while it is being
    validated. This property never raises psutil errors, so callers can rely
    on the None contract even if the process exits mid-check.
    """
    if not PSUTIL_AVAILABLE:
        return None

    # Can't get psutil.Process if we don't have a PID
    if not self.pid:
        return None

    # Can't validate processes on other machines
    if self.machine_id != Machine.current().id:
        return None

    try:
        os_proc = psutil.Process(self.pid)
        # create_time() is inside the try: the process may vanish (or deny
        # access) between the lookup and this call.
        if self.started_at:
            start_delta = abs(os_proc.create_time() - self.started_at.timestamp())
            if start_delta > START_TIME_TOLERANCE:
                return None  # PID has been reused by a different process
    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
        return None  # Gone, or start time cannot be verified

    # Extra safety: the recorded binary should appear in the live command line.
    db_binary = self.cmd[0] if self.cmd else ""
    if db_binary:
        try:
            os_cmdline = os_proc.cmdline()
        except (psutil.AccessDenied, psutil.ZombieProcess):
            return os_proc  # Can't check cmdline, trust the start time match
        except psutil.NoSuchProcess:
            return None  # Exited while we were validating it

        if os_cmdline:
            db_binary_name = Path(db_binary).name
            cmd_matches = any(arg == db_binary or Path(arg).name == db_binary_name for arg in os_cmdline if arg)
            if not cmd_matches:
                return None  # Different command, PID reused

    return os_proc
def get_children_pids(self) -> list[int]:
    """
    List the PIDs of all OS-level descendants of this process.

    The list comes from the validated ``self.proc`` handle (recursive
    children), not from DB rows.

    Returns:
        Descendant PIDs, or an empty list when there is no validated OS
        process or it cannot be inspected.
    """
    os_proc = self.proc
    if not os_proc:
        return []

    try:
        descendants = os_proc.children(recursive=True)
        return [descendant.pid for descendant in descendants]
    except (psutil.NoSuchProcess, psutil.AccessDenied):
        return []
def tail_stdout(self, lines: int = 50, follow: bool = False):
    """
    Tail the stdout log file (like `tail -n` or `tail -f`).

    The log is decoded with ``errors="replace"`` so that arbitrary bytes
    written by the subprocess can never make the tail fail or silently
    produce nothing.

    Args:
        lines: Number of trailing lines to show (default 50). Zero or a
            negative value shows no existing lines.
        follow: If True, first yield roughly the last ``lines`` lines
            (estimated at 100 bytes per line) and then keep yielding new
            lines as they are appended. This mode never returns on its own;
            the caller stops iterating to end it.

    Yields:
        Lines from stdout without their trailing newline. Nothing is yielded
        when the log file is missing or unreadable.
    """
    log_path = self.stdout_file
    if not log_path or not log_path.exists():
        return

    # [-0:] would select the whole file, so normalise before slicing.
    lines = max(lines, 0)

    if not follow:
        # Just get last N lines (tail -n)
        if lines == 0:
            return
        try:
            content = log_path.read_text(errors="replace")
        except OSError:
            return  # Log vanished or is unreadable: nothing to show
        yield from content.splitlines()[-lines:]
        return

    # Follow mode - yield new lines as they appear (tail -f)
    import time

    # errors="replace": the seek below can land inside a multi-byte
    # character, which must not raise while the partial line is discarded.
    with open(log_path, errors="replace") as f:
        f.seek(0, 2)  # Seek to end
        file_size = f.tell()
        # Rough estimate: 100 bytes per line
        seek_pos = max(0, file_size - (lines * 100))
        f.seek(seek_pos)

        # Skip partial line if we seeked to middle
        if seek_pos > 0:
            f.readline()

        # Yield existing lines
        for line in f:
            yield line.rstrip("\n")

        # Now follow for new lines
        while True:
            line = f.readline()
            if line:
                yield line.rstrip("\n")
            else:
                time.sleep(0.1)  # Wait before checking again
def pipe_stdout(self, lines: int = 10, follow: bool = True):
    """
    Copy this process's stdout log to ``sys.stdout``.

    Each line produced by ``tail_stdout`` is printed and flushed
    immediately.

    Args:
        lines: Number of initial lines to show
        follow: If True, keep printing new lines as they appear
    """
    import sys

    tailed = self.tail_stdout(lines=lines, follow=follow)
    for entry in tailed:
        print(entry, file=sys.stdout, flush=True)
def ensure_log_files(self) -> None:
    """
    Create the runtime directory and empty stdout/stderr log files if missing.

    Best-effort: does nothing when there is no runtime directory, and stops
    quietly at the first filesystem error. Existing log files are left
    untouched apart from their timestamp.
    """
    log_dir = self.runtime_dir
    if not log_dir:
        return

    try:
        log_dir.mkdir(parents=True, exist_ok=True)
        for log_path in (self.stdout_file, self.stderr_file):
            if not log_path:
                continue
            log_path.parent.mkdir(parents=True, exist_ok=True)
            log_path.touch(exist_ok=True)
    except OSError:
        return
def kill(self, signal_num: int = 15) -> bool:
    """
    Send a signal to this process and mark the record EXITED.

    The signal goes through the validated ``self.proc`` handle, so nothing
    is signalled when the PID is gone or now belongs to another process.

    Args:
        signal_num: Signal to send (default SIGTERM=15)

    Returns:
        True if the signal was delivered, False if there was no matching
        process or delivery failed.
    """
    exited = self.StatusChoices.EXITED
    target = self.proc

    if target is not None:
        try:
            # Safe to signal - the handle was validated against our record
            target.send_signal(signal_num)

            # Standard Unix convention: 128 + signal number
            self.exit_code = 128 + signal_num
            self.ended_at = timezone.now()
            self.status = exited
            self.save()
        except (psutil.NoSuchProcess, psutil.AccessDenied, ProcessLookupError):
            # Process went away between the proc check and the signal
            self.status = exited
            self.ended_at = self.ended_at or timezone.now()
            self.save()
            return False
        return True

    # No matching OS process (exited or PID recycled) - only fix up the row
    if self.status != exited:
        self.status = exited
        self.ended_at = self.ended_at or timezone.now()
        self.save()
    return False
def wait(self, timeout: int | None = None) -> int:
    """
    Wait for process to exit, polling periodically.

    Args:
        timeout: Max seconds to wait. A falsy value (None or 0) falls back
            to ``self.timeout``. When that is also None the wait is
            unbounded, except for hook processes, which are always capped at
            ``CONSTANTS.MAX_HOOK_RUNTIME_SECONDS``.

    Returns:
        exit_code

    Raises:
        TimeoutError if process doesn't exit in time
    """
    import time

    timeout = timeout or self.timeout
    if self.process_type == self.TypeChoices.HOOK:
        # Imported only on the hook path, the only place it is needed.
        from archivebox.config.constants import CONSTANTS

        hook_cap = int(CONSTANTS.MAX_HOOK_RUNTIME_SECONDS)
        timeout = hook_cap if timeout is None else min(int(timeout), hook_cap)

    # Monotonic clock: wall-clock adjustments must not stretch or cut the wait.
    deadline = None if timeout is None else time.monotonic() + timeout

    while True:
        exit_code = self.poll()
        if exit_code is not None:
            return exit_code

        if deadline is not None and time.monotonic() > deadline:
            raise TimeoutError(f"Process {self.id} did not exit within {timeout}s")

        time.sleep(0.1)
def kill_tree(self, graceful_timeout: float = 2.0) -> int:
    """
    Kill this process and all its children (OS children, not DB children) in parallel.

    SIGTERM is sent to every descendant and then to the process itself. All
    of them are then awaited together for up to ``graceful_timeout`` seconds
    with ``psutil.wait_procs``, and whatever is still alive is sent SIGKILL.

    Waiting through psutil handles (instead of probing raw PIDs with signal
    0) reaps children of the current Python process, so an exited-but-
    unreaped child is recognised as gone instead of being mistaken for a
    survivor and force-killed. It also avoids signalling a PID that has
    since been reused by an unrelated process.

    Args:
        graceful_timeout: Seconds to wait after SIGTERM before SIGKILL

    Returns:
        Number of processes killed (including self)
    """
    import signal

    exited = self.StatusChoices.EXITED
    proc = self.proc
    if proc is None:
        # Already dead
        if self.status != exited:
            self.status = exited
            self.ended_at = self.ended_at or timezone.now()
            self.save()
        return 0

    killed_count = 0
    used_sigkill = False

    try:
        # Phase 1: SIGTERM the whole tree, children first, then the parent
        targets = proc.children(recursive=True) + [proc]
        for target in targets:
            try:
                target.terminate()
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                pass

        # Phase 2: wait for all of them together under one shared deadline
        gone, alive = psutil.wait_procs(targets, timeout=graceful_timeout)
        killed_count += len(gone)

        # Phase 3: SIGKILL any stragglers that exceeded the timeout
        for straggler in alive:
            try:
                straggler.kill()
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                continue
            killed_count += 1
            used_sigkill = True

        # Update self status (standard Unix convention: 128 + signal number)
        if used_sigkill:
            self.exit_code = 128 + signal.SIGKILL
        else:
            self.exit_code = 128 + signal.SIGTERM if killed_count > 0 else 0
        self.status = exited
        self.ended_at = timezone.now()
        self.save()

        return killed_count

    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
        # Process tree already dead
        self.status = exited
        self.ended_at = self.ended_at or timezone.now()
        self.save()
        return killed_count
@classmethod
def stop_all(cls, process_type: str | None = None, machine: Machine | None = None, graceful: bool = True) -> int:
    """
    Stop every running process that matches the given filters.

    Args:
        process_type: Filter by TypeChoices
        machine: Filter by machine
        graceful: If True, call terminate() on each process, otherwise kill()

    Returns:
        Number of processes whose terminate()/kill() call reported success
    """
    running = cls.get_running(process_type=process_type, machine=machine)
    # Exactly one stop call is made per process; only truthy results count.
    outcomes = (record.terminate() if graceful else record.kill() for record in running)
    return sum(1 for succeeded in outcomes if succeeded)
@classmethod
def cleanup_orphaned_chrome(cls) -> int:
    """
    Kill orphaned Chrome processes using chrome_utils.js killZombieChrome.

    Runs ``node chrome_utils.js killZombieChrome <crawls dir>`` once for every
    ``<USERS_DIR>/<user>/crawls`` directory and adds up the counts the helper
    prints on stdout.

    This is best-effort cleanup: if the plugin package, the helper script,
    the users directory or any crawls directory is missing it returns 0
    instead of raising. A failure for one crawl root is reported and skipped,
    so counts already collected from other roots are still returned.

    Returns:
        Number of zombie Chrome processes killed
    """
    import subprocess
    from importlib.resources import files

    try:
        chrome_utils = files("abx_plugins.plugins.chrome").joinpath("chrome_utils.js")
    except ImportError:
        # Plugin package is not installed, so there is no helper to run.
        return 0
    # is_file() is part of the Traversable protocol; exists() is not.
    if not chrome_utils.is_file():
        return 0

    users_dir = CONSTANTS.USERS_DIR
    if not users_dir.is_dir():
        return 0

    crawl_roots = [
        user_dir / "crawls"
        for user_dir in users_dir.iterdir()
        if user_dir.is_dir() and (user_dir / "crawls").is_dir()
    ]
    if not crawl_roots:
        return 0

    killed = 0
    for crawl_root in crawl_roots:
        try:
            result = subprocess.run(
                ["node", str(chrome_utils), "killZombieChrome", str(crawl_root)],
                capture_output=True,
                timeout=30,
                text=True,
            )
            if result.returncode == 0:
                killed += int(result.stdout.strip())
        except FileNotFoundError as e:
            # `node` itself is missing: every remaining root would fail the same way.
            rprint(f"[red]Failed to cleanup orphaned Chrome: {e}[/red]")
            break
        except (subprocess.TimeoutExpired, ValueError) as e:
            # Timeout or unparsable count for this root only; keep going.
            rprint(f"[red]Failed to cleanup orphaned Chrome: {e}[/red]")

    if killed > 0:
        rprint(f"[yellow]๐Ÿงน Cleaned up {killed} orphaned Chrome processes[/yellow]")
    return killed
+ """ + cleaned = 0 + + running_children = cls.objects.filter( + process_type__in=[cls.TypeChoices.WORKER, cls.TypeChoices.HOOK], + status=cls.StatusChoices.RUNNING, + ) + + # Recovery can run against damaged DB state; stream rows so a large + # orphaned Process backlog cannot be materialized in memory at once. + for proc in running_children.iterator(chunk_size=100): + if not proc.is_running: + proc.mark_exited( + exit_code=proc.exit_code if proc.exit_code is not None else _default_exit_code_for_unowned_process(proc.process_type), + ) + cleaned += 1 + continue + + root = proc.root + # Standalone worker/hook process (run directly) + if root.id == proc.id and root.process_type in (cls.TypeChoices.WORKER, cls.TypeChoices.HOOK): + continue + + # If root is an active ArchiveBox command/orchestrator, keep it. + if ( + root.process_type + in ( + cls.TypeChoices.ORCHESTRATOR, + cls.TypeChoices.SERVER, + cls.TypeChoices.UPDATE, + cls.TypeChoices.ADD, + cls.TypeChoices.SEARCH, + cls.TypeChoices.CLI, + ) + and root.is_running + ): + continue + + proc.mark_exited( + exit_code=proc.exit_code if proc.exit_code is not None else _default_exit_code_for_unowned_process(proc.process_type), + ) + cleaned += 1 + + if cleaned: + rprint(f"[yellow]๐Ÿงน Cleaned up {cleaned} orphaned worker/hook process record(s)[/yellow]") + return cleaned + + +# ============================================================================= +# Binary State Machine +# ============================================================================= + + +class BinaryMachine(BaseStateMachine): + """ + State machine for managing Binary installation lifecycle. 
+ + Simple 2-state machine: + โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” + โ”‚ QUEUED State โ”‚ + โ”‚ โ€ข Binary needs to be installed โ”‚ + โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + โ†“ tick() when can_install() + โ†“ Synchronous installation during transition + โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” + โ”‚ INSTALLED State โ”‚ + โ”‚ โ€ข Binary installed (abspath, version, sha256 set) โ”‚ + โ”‚ โ€ข Health stats incremented โ”‚ + โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + + If installation fails, Binary stays in QUEUED with retry_at bumped. + """ + + model_attr_name = "binary" + binary: Binary + + # States + queued = State(value=Binary.StatusChoices.QUEUED, initial=True) + installed = State(value=Binary.StatusChoices.INSTALLED, final=True) + + # Tick Event - install happens during transition + tick = queued.to.itself(unless="can_install") | queued.to(installed, cond="can_install", on="on_install") + + def can_install(self) -> bool: + """Check if binary installation can start.""" + return bool(self.binary.name and self.binary.binproviders) + + @queued.enter + def enter_queued(self): + """Binary is queued for installation.""" + self.binary.update_and_requeue( + retry_at=timezone.now(), + status=Binary.StatusChoices.QUEUED, + ) + + def on_install(self): + """Called during queuedโ†’installed transition. 
class ProcessMachine(BaseStateMachine):
    """
    State machine for managing Process (OS subprocess) lifecycle.

    Process Lifecycle:
    ┌─────────────────────────────────────────────────────────────┐
    │ QUEUED State                                                │
    │ • Process ready to launch, waiting for resources            │
    └─────────────────────────────────────────────────────────────┘
                    ↓ tick() when can_start()
    ┌─────────────────────────────────────────────────────────────┐
    │ RUNNING State → enter_running()                             │
    │   1. process.launch()                                       │
    │      • Spawn subprocess with cmd, pwd, env, timeout         │
    │      • Set pid, started_at                                  │
    │      • Process runs in background or foreground             │
    │   2. Monitor process completion                             │
    │      • Check exit code when process completes               │
    └─────────────────────────────────────────────────────────────┘
                    ↓ tick() checks is_exited()
    ┌─────────────────────────────────────────────────────────────┐
    │ EXITED State                                                │
    │ • Process completed (exit_code set)                         │
    │ • Health stats incremented                                  │
    │ • stdout/stderr captured                                    │
    └─────────────────────────────────────────────────────────────┘

    Note: This is a simpler state machine than ArchiveResult.
    Process is just about execution lifecycle. ArchiveResult handles
    the archival-specific logic (status, output parsing, etc.).

    The diagram shows the intended lifecycle. As written in this class,
    enter_running() does not spawn anything: it records a placeholder
    exit_code of 0, so is_exited() is already true on the next tick().
    enter_exited() only clears retry_at and records status and ended_at;
    no health-stat or stdout/stderr handling happens in this class.
    """

    # Name of the attribute that holds the model instance this machine drives.
    model_attr_name = "process"
    process: Process

    # States
    queued = State(value=Process.StatusChoices.QUEUED, initial=True)
    running = State(value=Process.StatusChoices.RUNNING)
    exited = State(value=Process.StatusChoices.EXITED, final=True)

    # Tick Event - transitions based on conditions
    # Each state either loops on itself or advances, depending on its guard.
    tick = (
        queued.to.itself(unless="can_start")
        | queued.to(running, cond="can_start")
        | running.to.itself(unless="is_exited")
        | running.to(exited, cond="is_exited")
    )

    # Additional events (for explicit control)
    # Unlike tick, these transitions carry no guard conditions.
    launch = queued.to(running)
    kill = running.to(exited)

    def can_start(self) -> bool:
        """Check if process can start (has cmd and machine)."""
        return bool(self.process.cmd and self.process.machine)

    def is_exited(self) -> bool:
        """Check if process has exited (exit_code is set)."""
        return self.process.exit_code is not None

    @queued.enter
    def enter_queued(self):
        """Process is queued for execution."""
        # retry_at=now: no delay is requested before the next attempt.
        self.process.update_and_requeue(
            retry_at=timezone.now(),
            status=Process.StatusChoices.QUEUED,
        )

    @running.enter
    def enter_running(self):
        """Start process execution."""
        # Lock the process while it runs
        # retry_at is pushed out by the process's own timeout (in seconds).
        self.process.update_and_requeue(
            retry_at=timezone.now() + timedelta(seconds=self.process.timeout),
            status=Process.StatusChoices.RUNNING,
            started_at=timezone.now(),
        )

        # Launch the subprocess
        # NOTE: This is a placeholder - actual launch logic would
        # be implemented based on how hooks currently spawn processes
        # For now, Process is a data model that tracks execution metadata
        # The actual subprocess spawning is still handled by run_hook()

        # Mark as immediately exited for now (until we refactor run_hook)
        # In the future, this would actually spawn the subprocess
        self.process.exit_code = 0  # Placeholder
        self.process.save()

    @exited.enter
    def enter_exited(self):
        """Process has exited."""
        # retry_at=None: nothing further is scheduled for this record.
        self.process.update_and_requeue(
            retry_at=None,
            status=Process.StatusChoices.EXITED,
            ended_at=timezone.now(),
        )
makemigrations), you can comment out this check temporarily + + allowed_commands = ["makemigrations", "migrate", "startapp", "squashmigrations", "generate_stubs", "test"] + + if not any(cmd in sys.argv for cmd in allowed_commands): + print("[X] Don't run ./manage.py directly (unless you are a developer running makemigrations):") + print() + print(" Hint: Use these archivebox CLI commands instead of the ./manage.py equivalents:") + print(" archivebox init (migrates the database to latest version)") + print(" archivebox server (runs the Django web server)") + print(" archivebox shell (opens an iPython Django shell with all models imported)") + print(" archivebox manage [cmd] (any other management commands)") + raise SystemExit(2) + + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "core.settings") + try: + from django.core.management import execute_from_command_line + except ImportError as exc: + raise ImportError( + "Couldn't import Django. Are you sure it's installed and " + "available on your PYTHONPATH environment variable? Did you " + "forget to activate a virtual environment?", + ) from exc + execute_from_command_line(sys.argv) diff --git a/archivebox/mcp/README.md b/archivebox/mcp/README.md new file mode 100644 index 0000000000..8b0aa42b08 --- /dev/null +++ b/archivebox/mcp/README.md @@ -0,0 +1,138 @@ +# ArchiveBox MCP Server + +Model Context Protocol (MCP) server for ArchiveBox that exposes all CLI commands as tools for AI agents. + +## Overview + +This is a lightweight, stateless MCP server that dynamically introspects ArchiveBox's Click CLI commands and exposes them as MCP tools. It requires **zero manual schema definitions** - everything is auto-generated from the existing CLI metadata. 
+ +## Features + +- ✅ **Auto-discovery**: Dynamically discovers all 19+ ArchiveBox CLI commands +- ✅ **Zero duplication**: Reuses existing Click command definitions, types, and help text +- ✅ **Auto-sync**: Changes to CLI commands are automatically reflected in MCP tools +- ✅ **Stateless**: No database models or state management required +- ✅ **Lightweight**: ~400 lines of code in `server.py` + +## Usage + +### Start the MCP Server + +```bash +archivebox mcp +``` + +The server runs in stdio mode, reading JSON-RPC 2.0 requests from stdin and writing responses to stdout. + +### Example Client + +```python +import subprocess +import json + +# Start MCP server +proc = subprocess.Popen( + ['archivebox', 'mcp'], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + text=True +) + +# Send initialize request +request = {"jsonrpc": "2.0", "id": 1, "method": "initialize", "params": {}} +proc.stdin.write(json.dumps(request) + '\n') +proc.stdin.flush() + +# Read response +response = json.loads(proc.stdout.readline()) +print(response) +``` + +### Example Requests + +**Initialize:** +```json +{"jsonrpc":"2.0","id":1,"method":"initialize","params":{}} +``` + +**List all available tools:** +```json +{"jsonrpc":"2.0","id":2,"method":"tools/list","params":{}} +``` + +**Call a tool:** +```json +{ + "jsonrpc":"2.0", + "id":3, + "method":"tools/call", + "params":{ + "name":"version", + "arguments":{"quiet":true} + } +} +``` + +## Supported MCP Methods + +- `initialize` - Handshake and capability negotiation +- `tools/list` - List all available CLI commands as MCP tools +- `tools/call` - Execute a CLI command with arguments + +## Available Tools + +The server exposes all ArchiveBox CLI commands: + +**Meta**: `help`, `version`, `mcp` +**Setup**: `init`, `install` +**Archive**: `add`, `remove`, `update`, `search`, `status`, `config` +**Workers**: `orchestrator`, `worker` +**Tasks**: `crawl`, `snapshot`, `extract` +**Server**: `server`, `schedule` +**Utilities**: `shell`, `manage` + +## Architecture 
+ +### Dynamic Introspection + +Instead of manually defining schemas, the server uses Click's introspection API to automatically generate MCP tool definitions: + +```python +# Auto-discover commands +from archivebox.cli import ArchiveBoxGroup +cli_group = ArchiveBoxGroup() +all_commands = cli_group.all_subcommands + +# Auto-generate schemas from Click metadata +for cmd_name in all_commands: + click_cmd = cli_group.get_command(None, cmd_name) + # Extract params, types, help text, etc. + tool_schema = click_command_to_mcp_tool(cmd_name, click_cmd) +``` + +### Tool Execution + +Commands are executed using Click's `CliRunner`: + +```python +from click.testing import CliRunner + +runner = CliRunner() +result = runner.invoke(click_command, args) +``` + +## Files + +- `server.py` (~350 lines) - Core MCP server with Click introspection +- `archivebox/cli/archivebox_mcp.py` (~50 lines) - CLI entry point +- `apps.py`, `__init__.py` - Django app boilerplate + +## MCP Specification + +Implements the [MCP 2025-11-25 specification](https://modelcontextprotocol.io/specification/2025-11-25). + +## Sources + +- [MCP Specification](https://modelcontextprotocol.io/specification/2025-11-25) +- [MCP Introduction](https://www.anthropic.com/news/model-context-protocol) +- [MCP GitHub](https://github.com/modelcontextprotocol/modelcontextprotocol) diff --git a/archivebox/mcp/__init__.py b/archivebox/mcp/__init__.py new file mode 100644 index 0000000000..dd4a67f328 --- /dev/null +++ b/archivebox/mcp/__init__.py @@ -0,0 +1,8 @@ +__package__ = "archivebox.mcp" + +""" +Model Context Protocol (MCP) server for ArchiveBox. + +Exposes all ArchiveBox CLI commands as MCP tools via dynamic Click introspection. +Provides a JSON-RPC 2.0 interface over stdio for AI agents to control ArchiveBox. 
class MCPJSONEncoder(json.JSONEncoder):
    """JSON encoder that copes with Click sentinel objects and other non-JSON values."""

    def default(self, o):
        # Click's sentinel class is private and may be absent, so look it up
        # defensively instead of importing it.
        sentinel_cls = getattr(click.core, "_SentinelClass", None)
        is_click_sentinel = isinstance(sentinel_cls, type) and isinstance(o, sentinel_cls)
        if is_click_sentinel:
            return None

        # Tuples are emitted as JSON arrays.
        if isinstance(o, tuple):
            return list(o)

        # Give the base encoder a chance first; fall back to the string form
        # for anything it rejects.
        try:
            encoded = super().default(o)
        except TypeError:
            encoded = str(o)
        return encoded
def click_command_to_mcp_tool(cmd_name: str, click_command: click.Command) -> dict[str, Any]:
    """
    Convert a Click command to an MCP tool definition with JSON Schema.

    Introspects the Click command's parameters to automatically generate
    the input schema without manual definition.

    A parameter is described as a JSON array when it can take several values:
    either a repeatable option (``multiple=True``) or a parameter with
    ``nargs`` other than 1, such as a variadic positional argument. For those
    parameters the description and default are attached to the array itself,
    not to its ``items`` schema, because they describe the whole list.

    Args:
        cmd_name: Name the tool is exposed under.
        click_command: Click command whose ``params`` are introspected.

    Returns:
        Dict with ``name``, ``description`` and ``inputSchema`` keys.
    """

    properties: dict[str, dict[str, Any]] = {}
    required: list[str] = []

    # Extract parameters from Click command
    for param in click_command.params:
        # Skip internal parameters
        if param.name is None or param.name in ("help", "version"):
            continue

        value_schema = click_type_to_json_schema_type(param.type)
        help_text = getattr(param, "help", None)

        # A callable default is only resolved by Click at parse time, so it
        # has no meaningful JSON representation and is left out.
        default = param.default
        has_default = default is not None and default != () and not callable(default)

        # A composite type already maps to an array on its own; wrap it again
        # only when the option itself is repeatable.
        takes_many = bool(param.multiple) or (param.nargs != 1 and value_schema.get("type") != "array")

        if takes_many:
            list_schema: dict[str, Any] = {
                "type": "array",
                "items": value_schema,
                "description": help_text or value_schema.get("description", f"Multiple {param.name} values"),
            }
            if has_default:
                list_schema["default"] = list(default) if isinstance(default, tuple) else default
            properties[param.name] = list_schema
        else:
            # Add description from Click help text
            if help_text:
                value_schema["description"] = help_text
            if has_default:
                value_schema["default"] = default
            properties[param.name] = value_schema

        # Mark as required if Click requires it
        if param.required:
            required.append(param.name)

    return {
        "name": cmd_name,
        "description": click_command.help or click_command.short_help or f"Run archivebox {cmd_name} command",
        "inputSchema": {
            "type": "object",
            "properties": properties,
            "required": required,
        },
    }
isinstance(value, list): + # Multiple values for an option (rare) + for item in value: + args.append(f"--{param_name}") + args.append(str(item)) + elif value is not None: + args.append(f"--{param_name}") + args.append(str(value)) + + # Add positional arguments at the end + args.extend(positional_args) + + # Execute the command + try: + result = runner.invoke(click_command, args, catch_exceptions=False) + + # Format output as MCP content + content = [] + + if result.output: + content.append( + { + "type": "text", + "text": result.output, + }, + ) + + if result.stderr_bytes: + stderr_text = result.stderr_bytes.decode("utf-8", errors="replace") + if stderr_text.strip(): + content.append( + { + "type": "text", + "text": f"[stderr]\n{stderr_text}", + }, + ) + + # Check exit code + is_error = result.exit_code != 0 + + if is_error and not content: + content.append( + { + "type": "text", + "text": f"Command failed with exit code {result.exit_code}", + }, + ) + + return { + "content": content or [{"type": "text", "text": "(no output)"}], + "isError": is_error, + } + + except Exception as e: + # Capture any exceptions during execution + error_trace = traceback.format_exc() + return { + "content": [ + { + "type": "text", + "text": f"Error executing {cmd_name}: {str(e)}\n\n{error_trace}", + }, + ], + "isError": True, + } + + +class MCPServer: + """ + Model Context Protocol server for ArchiveBox. + + Provides JSON-RPC 2.0 interface over stdio, dynamically exposing + all Click commands as MCP tools. 
+ """ + + def __init__(self): + # Import here to avoid circular imports + from archivebox.cli import ArchiveBoxGroup + + self.cli_group = ArchiveBoxGroup() + self.protocol_version = "2025-11-25" + self._tool_cache = {} # Cache loaded Click commands + + def get_click_command(self, cmd_name: str) -> click.Command | None: + """Get a Click command by name, with caching""" + if cmd_name not in self._tool_cache: + if cmd_name not in self.cli_group.all_subcommands: + return None + self._tool_cache[cmd_name] = self.cli_group.get_command(click.Context(self.cli_group), cmd_name) + return self._tool_cache[cmd_name] + + def handle_initialize(self, params: dict) -> dict: + """Handle MCP initialize request""" + return { + "protocolVersion": self.protocol_version, + "capabilities": { + "tools": {}, + }, + "serverInfo": { + "name": "archivebox-mcp", + "version": VERSION, + }, + } + + def handle_tools_list(self, params: dict) -> dict: + """Handle MCP tools/list request - returns all available CLI commands as tools""" + tools = [] + + for cmd_name in self.cli_group.all_subcommands.keys(): + click_cmd = self.get_click_command(cmd_name) + if click_cmd: + try: + tool_def = click_command_to_mcp_tool(cmd_name, click_cmd) + tools.append(tool_def) + except Exception as e: + # Log but don't fail - skip problematic commands + print(f"Warning: Could not generate tool for {cmd_name}: {e}", file=sys.stderr) + + return {"tools": tools} + + def handle_tools_call(self, params: dict) -> dict: + """Handle MCP tools/call request - executes a CLI command""" + tool_name = params.get("name") + arguments = params.get("arguments", {}) + + if not tool_name: + raise ValueError("Missing required parameter: name") + + click_cmd = self.get_click_command(tool_name) + if not click_cmd: + raise ValueError(f"Unknown tool: {tool_name}") + + # Execute the command and return MCP-formatted result + return execute_click_command(tool_name, click_cmd, arguments) + + def handle_request(self, request: dict) -> dict: + """ 
+ Handle a JSON-RPC 2.0 request and return response. + + Supports MCP methods: initialize, tools/list, tools/call + """ + + method = request.get("method") + params = request.get("params", {}) + request_id = request.get("id") + + try: + # Route to appropriate handler + if method == "initialize": + result = self.handle_initialize(params) + elif method == "tools/list": + result = self.handle_tools_list(params) + elif method == "tools/call": + result = self.handle_tools_call(params) + else: + # Method not found + return { + "jsonrpc": "2.0", + "id": request_id, + "error": { + "code": -32601, + "message": f"Method not found: {method}", + }, + } + + # Success response + return { + "jsonrpc": "2.0", + "id": request_id, + "result": result, + } + + except Exception as e: + # Error response + error_trace = traceback.format_exc() + return { + "jsonrpc": "2.0", + "id": request_id, + "error": { + "code": -32603, + "message": str(e), + "data": error_trace, + }, + } + + def run_stdio_server(self): + """ + Run the MCP server in stdio mode. + + Reads JSON-RPC requests from stdin (one per line), + writes JSON-RPC responses to stdout (one per line). 
+ """ + + # Read requests from stdin line by line + for line in sys.stdin: + line = line.strip() + if not line: + continue + + try: + # Parse JSON-RPC request + request = json.loads(line) + + # Handle request + response = self.handle_request(request) + + # Write response to stdout (use custom encoder for Click types) + print(json.dumps(response, cls=MCPJSONEncoder), flush=True) + + except json.JSONDecodeError as e: + # Invalid JSON + error_response = { + "jsonrpc": "2.0", + "id": None, + "error": { + "code": -32700, + "message": "Parse error", + "data": str(e), + }, + } + print(json.dumps(error_response, cls=MCPJSONEncoder), flush=True) + + +def run_mcp_server(): + """Main entry point for MCP server""" + server = MCPServer() + server.run_stdio_server() diff --git a/archivebox/misc/__init__.py b/archivebox/misc/__init__.py new file mode 100644 index 0000000000..1619d0560c --- /dev/null +++ b/archivebox/misc/__init__.py @@ -0,0 +1 @@ +__package__ = "archivebox.misc" diff --git a/archivebox/misc/checks.py b/archivebox/misc/checks.py new file mode 100644 index 0000000000..9e623f7c8f --- /dev/null +++ b/archivebox/misc/checks.py @@ -0,0 +1,409 @@ +__package__ = "archivebox.misc" + +import os +import signal +import sys +import time +import threading +from contextlib import contextmanager +from pathlib import Path + +from rich import print +from rich.panel import Panel + +# DO NOT ADD ANY TOP-LEVEL IMPORTS HERE to anything other than builtin python libraries +# this file is imported by archivebox/__init__.py +# and any imports here will be imported by EVERYTHING else +# so this file should only be used for pure python checks +# that don't need to import other parts of ArchiveBox + +# if a check needs to import other parts of ArchiveBox, +# the imports should be done inside the check function +# and you should make sure if you need to import any django stuff +# that the check is called after django.setup() has been called + + +def _migration_interrupt_message(*, 
before_apply: bool = False) -> str: + status = "Migration cancelled before any changes were applied." if before_apply else "Migration interrupted." + return ( + f"\n[X] {status}\n" + " Database migrations are atomic; interrupted migration work is rolled back or left unapplied,\n" + " so no partially-applied migration is recorded and no data loss has occurred.\n\n" + " To continue the upgrade, run:\n" + " archivebox init\n" + ) + + +@contextmanager +def _exit_on_migration_interrupt(): + if threading.current_thread() is not threading.main_thread(): + yield + return + + handled_signals = (signal.SIGINT, signal.SIGTERM) + previous_handlers = {sig: signal.getsignal(sig) for sig in handled_signals} + + def handle_shutdown(_signum, _frame): + try: + os.write(sys.stderr.fileno(), _migration_interrupt_message().encode()) + except Exception: + pass + os._exit(130) + + try: + for sig in handled_signals: + signal.signal(sig, handle_shutdown) + yield + finally: + for sig, previous_handler in previous_handlers.items(): + signal.signal(sig, previous_handler) + + +def check_data_folder(config=None, **config_kwargs) -> None: + from archivebox import DATA_DIR + from archivebox.config import CONSTANTS + from archivebox.config.common import get_config + from archivebox.config.paths import create_and_chown_dir, get_or_create_working_tmp_dir, get_or_create_working_lib_dir + + config = config or get_config(**config_kwargs) + archive_dir = CONSTANTS.ARCHIVE_DIR + archive_dir_exists = os.path.isdir(archive_dir) + if not archive_dir_exists: + print("[red][X] No archivebox index found in the current directory.[/red]", file=sys.stderr) + print(f" {DATA_DIR}", file=sys.stderr) + print(file=sys.stderr) + print(" [violet]Hint[/violet]: Are you running archivebox in the right folder?", file=sys.stderr) + print(" cd path/to/your/archive/folder", file=sys.stderr) + print(" archivebox [command]", file=sys.stderr) + print(file=sys.stderr) + print(" [violet]Hint[/violet]: To create a new archive 
collection or import existing data in this folder, run:", file=sys.stderr) + print(" archivebox init", file=sys.stderr) + raise SystemExit(2) + + # Create data dir subdirs + create_and_chown_dir(CONSTANTS.SOURCES_DIR) + create_and_chown_dir(CONSTANTS.USERS_DIR) + create_and_chown_dir(CONSTANTS.PERSONAS_DIR / "Default") + create_and_chown_dir(CONSTANTS.LOGS_DIR) + # create_and_chown_dir(CONSTANTS.CACHE_DIR) + + # Create /tmp and /lib dirs if they don't exist + get_or_create_working_tmp_dir(autofix=True, quiet=False, config=config) + get_or_create_working_lib_dir(autofix=True, quiet=False, config=config) + + # Check data dir permissions, /tmp, and /lib permissions + check_data_dir_permissions(config=config) + + +def check_migrations(*, blocking: bool = True, auto_apply: bool = False, cancel_delay: int = 3) -> list[str]: + from archivebox import DATA_DIR + from archivebox.misc.db import apply_migrations, migration_state, pending_migrations + + pending, missing_from_code, rollback_targets = migration_state() + is_migrating = any(arg in sys.argv for arg in ["makemigrations", "migrate", "init"]) or os.environ.get("ARCHIVEBOX_WANTS_INIT") == "1" + + if missing_from_code: + print( + "[red][X] This collection was migrated by a newer version of ArchiveBox than the one currently running.[/red]", + file=sys.stderr, + ) + print(f" {DATA_DIR}", file=sys.stderr) + print(file=sys.stderr) + print(" [violet]Hint:[/violet] Upgrade ArchiveBox / pull the latest Docker image, then restart:", file=sys.stderr) + print(" docker compose pull && docker compose up -d", file=sys.stderr) + print(file=sys.stderr) + print(" Applied migrations missing from this build:", file=sys.stderr) + for migration in missing_from_code[:10]: + print(f" {migration}", file=sys.stderr) + if len(missing_from_code) > 10: + print(f" ... 
and {len(missing_from_code) - 10} more", file=sys.stderr) + print(file=sys.stderr) + print( + " If you are intentionally trying to downgrade, switch back to the newer version temporarily", + file=sys.stderr, + ) + print( + " and run this to downgrade the DB version (back up your DB first!):", + file=sys.stderr, + ) + for app, target in sorted(rollback_targets.items()): + print(f" archivebox manage migrate {app} {target}", file=sys.stderr) + raise SystemExit(3) + + if pending and not is_migrating: + print("[red][X] This collection was created with an older version of ArchiveBox and must be upgraded first.[/red]", file=sys.stderr) + print(f" {DATA_DIR}", file=sys.stderr) + print(file=sys.stderr) + print( + f" [violet]Hint:[/violet] To upgrade it to the latest version and apply the {len(pending)} pending migrations, run:", + file=sys.stderr, + ) + print(" archivebox init", file=sys.stderr) + if auto_apply: + print(file=sys.stderr) + print( + f"[yellow][*] ArchiveBox will apply migrations automatically in {cancel_delay}s. Press CTRL+C to cancel.[/yellow]", + file=sys.stderr, + ) + try: + time.sleep(cancel_delay) + except KeyboardInterrupt: + print(_migration_interrupt_message(before_apply=True), file=sys.stderr) + raise SystemExit(130) from None + + # Always delegate to Django's migration executor. It records each + # migration only after it succeeds, so power loss or SIGKILL leaves + # unapplied work visible here and the next startup resumes normally. 
def check_io_encoding():
    """Exit with status 2 unless Python's standard streams use UTF-8.

    The encoding is read from the first available stream among
    ``sys.__stdout__``, ``sys.stdout``, ``sys.__stderr__`` and ``sys.stderr``.
    A missing stream or a stream without an encoding is reported as a bad
    locale rather than crashing with ``AttributeError``.

    Raises:
        SystemExit: with code 2 when the detected encoding is not UTF-8.
    """
    stream = sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr
    python_encoding = (getattr(stream, "encoding", None) or "unknown").upper().replace("UTF8", "UTF-8")

    if python_encoding == "UTF-8":
        return

    print(
        f"[red][X] Your system is running python3 scripts with a bad locale setting: {python_encoding} (it should be UTF-8).[/red]",
        file=sys.stderr,
    )
    print(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)', file=sys.stderr)
    print(' Or if you\'re using ubuntu/debian, run "dpkg-reconfigure locales"', file=sys.stderr)
    # Spacer goes to stderr like the rest of the message (it used to leak to stdout).
    print(file=sys.stderr)
    print(" Confirm that it's fixed by opening a new shell and running:", file=sys.stderr)
    print(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8', file=sys.stderr)
    raise SystemExit(2)
def check_not_inside_source_dir():
    """Refuse to run when the current directory is the ArchiveBox source checkout.

    The working directory counts as the source tree when it contains both
    ``archivebox/__init__.py`` and ``pyproject.toml``. The check is skipped
    when ``pytest`` or ``unittest`` has been imported.

    Raises:
        SystemExit: when run from the source tree outside of a test run.
    """
    here = Path(os.getcwd()).resolve()
    source_markers = (here / "archivebox" / "__init__.py", here / "pyproject.toml")

    if not all(marker.exists() for marker in source_markers):
        return
    if any(runner in sys.modules for runner in ("pytest", "unittest")):
        return

    raise SystemExit("[!] Cannot run from source dir, cd to a data folder first")
[blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue] ({USER}).[/yellow]", + ) + elif data_owned_by_root: + STDERR.print( + "\n[yellow]:warning: Warning: ArchiveBox [blue]DATA_DIR[/blue] appears to be owned by [red]root[/red]. If this is an NFS or mapped volume and writes work, no change is required.[/yellow]", + ) + elif data_owner_doesnt_match: + STDERR.print( + f"\n[yellow]:warning: Warning: ArchiveBox [blue]DATA_DIR[/blue] is currently owned by [red]{data_dir_uid}:{data_dir_gid}[/red], but ArchiveBox user is [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue] ({USER})! (ArchiveBox may not be able to write to the data dir)[/yellow]", + ) + + if data_not_writable: + STDERR.print( + f"[violet]Hint:[/violet] Change the current ownership [red]{data_dir_uid}[/red]:{data_dir_gid} to the user & group that will run ArchiveBox, e.g.:", + ) + STDERR.print(f" [grey53]sudo[/grey53] chown -R [blue]{DEFAULT_UID}:{DEFAULT_GID}[/blue] {DATA_DIR.resolve()}") + STDERR.print(" Avoid recursive chown on very large archives unless you know the full tree needs repair.") + STDERR.print() + STDERR.print("[blue]More info:[/blue]") + STDERR.print( + " [link=https://github.com/ArchiveBox/ArchiveBox#storage-requirements]https://github.com/ArchiveBox/ArchiveBox#storage-requirements[/link]", + ) + STDERR.print( + " [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions]https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions[/link]", + ) + STDERR.print( + " [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid]https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid[/link]", + ) + STDERR.print( + " [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts]https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts[/link]", + ) + + from archivebox.config.common import get_config + + config = config or 
get_config(**config_kwargs) + try: + tmp_dir = get_or_create_working_tmp_dir(autofix=True, quiet=True, config=config) or config.TMP_DIR + except Exception: + tmp_dir = config.TMP_DIR + + try: + lib_dir = get_or_create_working_lib_dir(autofix=True, quiet=True, config=config) or config.LIB_DIR + except Exception: + lib_dir = config.LIB_DIR + + # Check /tmp dir permissions + check_tmp_dir(tmp_dir, throw=False, must_exist=True, config=config) + + # Check /lib dir permissions + check_lib_dir(lib_dir, throw=False, must_exist=True, config=config) + + # Derive directory mode from file mode by OR-ing the execute bits (matches + # the old DIR_OUTPUT_PERMISSIONS=755 vs OUTPUT_PERMISSIONS=644 convention). + os.umask(0o777 - (int(config.OUTPUT_PERMISSIONS, base=8) | 0o111)) + + +def check_tmp_dir(tmp_dir=None, throw=False, quiet=False, must_exist=True, config=None, **config_kwargs): + from archivebox.config.paths import ( + MAX_TMP_SOCKET_URL_LENGTH, + SUPERVISORD_SOCKET_FILENAME, + assert_dir_can_contain_unix_sockets, + dir_is_writable, + get_or_create_working_tmp_dir, + tmp_dir_socket_path_is_short_enough, + ) + from archivebox.misc.logging import STDERR + from archivebox.misc.logging_util import pretty_path + from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP + from archivebox.config.common import get_config + + config = config or get_config(**config_kwargs) + tmp_dir = tmp_dir or config.TMP_DIR + socket_file = tmp_dir.absolute().resolve() / SUPERVISORD_SOCKET_FILENAME + + if not must_exist and not os.path.isdir(tmp_dir): + # just check that its viable based on its length (because dir may not exist yet, we cant check if its writable) + return tmp_dir_socket_path_is_short_enough(tmp_dir) + + tmp_is_valid = False + try: + tmp_is_valid = dir_is_writable(tmp_dir) + if not config.ALLOW_NO_UNIX_SOCKETS: + tmp_is_valid = tmp_is_valid and assert_dir_can_contain_unix_sockets(tmp_dir) + assert tmp_is_valid, f"ArchiveBox user 
def check_lib_dir(lib_dir: Path | None = None, throw=False, quiet=False, must_exist=True, config=None, **config_kwargs):
    """Check that LIB_DIR is writable by the ArchiveBox user.

    Args:
        lib_dir: Directory to check; defaults to ``config.LIB_DIR``.
        throw: Raise ``OSError`` instead of returning ``False`` on failure.
        quiet: Suppress the explanatory error panel printed to STDERR.
        must_exist: When false, a directory that does not exist yet is
            accepted without further checks.
        config: Config object to use; built from ``config_kwargs`` if omitted.

    Returns:
        ``True`` if the directory is usable, ``False`` otherwise.

    Raises:
        OSError: if the directory is unusable and ``throw`` is true.
    """
    from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
    from archivebox.misc.logging import STDERR
    from archivebox.misc.logging_util import pretty_path
    from archivebox.config.paths import dir_is_writable, get_or_create_working_lib_dir
    from archivebox.config.common import get_config

    config = config or get_config(**config_kwargs)
    lib_dir = lib_dir or config.LIB_DIR

    if not must_exist and not os.path.isdir(lib_dir):
        return True

    try:
        # Raise explicitly instead of using `assert`: assertions are stripped
        # under `python -O`, which would turn this check into a no-op.
        if not dir_is_writable(lib_dir):
            raise OSError(f"ArchiveBox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} is unable to write to LIB_DIR={lib_dir}")
        return True
    except Exception as e:
        if not quiet:
            # The suggestion is only a hint; a failure while computing it must
            # not escape and break the throw=False contract.
            try:
                suggested_lib_dir = get_or_create_working_lib_dir(autofix=False, quiet=True) or config.LIB_DIR
            except Exception:
                suggested_lib_dir = config.LIB_DIR

            STDERR.print()
            ERROR_TEXT = "\n".join(
                (
                    "",
                    f"[red]:cross_mark: ArchiveBox is unable to use LIB_DIR={pretty_path(lib_dir)}[/red]",
                    f" [yellow]{e}[/yellow]",
                    "",
                    "[blue]Info:[/blue] [grey53]The LIB_DIR is used to store ArchiveBox auto-installed plugin library and binary dependencies.",
                    f" - It [red]must[/red] be readable and writable by the ArchiveBox user ({ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}).",
                    " - It [yellow]should[/yellow] be on a local (ideally fast) drive like an SSD or HDD (not on a network drive or external HDD).",
                    " - It [yellow]should[/yellow] be able to hold at least 1GB of data (some dependencies like Chrome can be large).[/grey53]",
                    "",
                    "[violet]Hint:[/violet] Fix it by setting LIB_DIR to a path that meets these requirements, e.g.:",
                    f" [green]archivebox config --set LIB_DIR={suggested_lib_dir}[/green]",
                    "",
                ),
            )
            STDERR.print(
                Panel(
                    ERROR_TEXT,
                    expand=False,
                    border_style="red",
                    title="[red]:cross_mark: Error with configured LIB_DIR[/red]",
                    subtitle="[yellow]Dependencies may not auto-install properly until fixed.[/yellow]",
                ),
            )
            STDERR.print()
        if throw:
            raise OSError(f"LIB_DIR={lib_dir} is invalid, ArchiveBox is unable to use it and dependencies will fail to install.") from e
    return False
def run_db_analyze_batch(
    remaining: list[str] | None,
    *,
    max_seconds_per_table: float = 120.0,
) -> list[str]:
    """Advance one step of a batched SQLite ``ANALYZE`` sweep.

    Without periodic ANALYZE the optimizer's table stats go stale as
    snapshot/archiveresult tables grow, causing it to start large joins from
    ``auth_user`` instead of using the indexed url column and blowing snapshot
    detail page render time from ~50ms to ~500ms+.

    The whole sweep is spread across many calls instead of running as one
    blocking ``ANALYZE``: pass ``None`` to start a fresh sweep (this call
    enumerates user tables and runs ``ANALYZE`` on the first one); pass the
    returned list to advance one more table on each subsequent call. An
    empty return value means the sweep is complete (or has been aborted) and
    the next caller should pass ``None`` again. Caller is responsible for
    throttling new sweeps and enforcing a hard upper bound on total sweep
    wall time.

    Args:
        remaining: Tables still to analyze, or ``None`` to start a new sweep.
        max_seconds_per_table: Wall-time budget for the single ``ANALYZE``
            run by this call; ``0`` or less disables the limit.

    Returns:
        The tables left to analyze after this call (``[]`` when done).

    Safety guarantees:

    - **Never raises**: every database call is wrapped. A failure while
      enumerating tables returns ``[]`` (abandoning the sweep); a failure or
      timeout while analyzing one table skips that table and returns the
      rest, so the orchestrator never crashes on maintenance errors.
    - **Bounded per-call wall time**: a SQLite progress handler aborts the
      current ``ANALYZE`` statement once ``max_seconds_per_table`` is
      exceeded, so a single pathological table cannot wedge the call.
    - **Cleanup**: the cursor is closed by its context manager and the
      progress handler is removed in a ``finally`` block.
    - Silent no-op on non-SQLite backends.
    """
    from django.db import connection

    if connection.vendor != "sqlite":
        return []

    if remaining is None:
        try:
            with connection.cursor() as cursor:
                cursor.execute(
                    "SELECT name FROM sqlite_master WHERE type = 'table' AND name NOT LIKE 'sqlite_%' ORDER BY name",
                )
                remaining = [row[0] for row in cursor.fetchall()]
        except Exception:
            return []

    if not remaining:
        return []

    next_table, *rest = remaining

    # ``connection.connection`` is None until Django has opened the database.
    # When resuming a sweep no cursor has been created yet in this call, so
    # open the connection first; otherwise the time limit below is skipped.
    try:
        connection.ensure_connection()
    except Exception:
        # Fall through: the ANALYZE attempt below fails and this table is skipped.
        pass
    raw_conn = connection.connection

    progress_handler_set = False
    if raw_conn is not None and max_seconds_per_table > 0:
        deadline = time.monotonic() + max_seconds_per_table
        try:
            # A non-zero return from the handler makes SQLite abort the statement.
            raw_conn.set_progress_handler(lambda: 1 if time.monotonic() > deadline else 0, 10000)
            progress_handler_set = True
        except Exception:
            progress_handler_set = False

    # Double any embedded quote so an unusual table name cannot break the statement.
    quoted_table = next_table.replace('"', '""')
    try:
        with connection.cursor() as cursor:
            cursor.execute(f'ANALYZE "{quoted_table}"')
    except Exception:
        # Aborted by progress handler, locked db, or any other failure — skip
        # this table and continue the sweep. ANALYZE is idempotent so we can
        # retry on the next 24hr sweep.
        pass
    finally:
        if progress_handler_set and raw_conn is not None:
            try:
                raw_conn.set_progress_handler(None, 0)
            except Exception:
                pass
    return rest
def retry_sqlite_locks(action: Callable[[], Any], *, label: str, stderr: TextIO | None = None) -> Any:
    """Call ``action`` until it stops failing with a SQLite "database is locked" error.

    Any other exception propagates unchanged. After each lock failure all
    Django connections are closed, the processes holding the database open are
    listed, and the call is retried after 5 seconds. There is no retry limit.

    Args:
        action: Zero-argument callable to run.
        label: Description of the work, used in the retry message.
        stderr: Stream for status output; the console's stderr when omitted.

    Returns:
        Whatever ``action`` returns on its first successful call.
    """
    from django.db import OperationalError, connections
    from rich.console import Console

    lock_error_types = (OperationalError, SQLiteOperationalError)
    reporter = Console(file=stderr or None, stderr=stderr is None)

    while True:
        try:
            return action()
        except lock_error_types as exc:
            if not sqlite_lock_error(exc):
                raise

        # Only reached after a "database is locked" failure.
        connections.close_all()
        reporter.print(f"[yellow][*] SQLite database is locked while {label}; retrying in 5s...[/yellow]")
        log_sqlite_lock_holders(reporter)
        with reporter.status("[yellow]Waiting for SQLite database lock to clear...[/yellow]", spinner="dots"):
            time.sleep(5.0)
# Migration names that previously existed in ArchiveBox's source tree but have
# since been deleted (squashed away, renamed, moved between apps, etc.). DBs
# upgraded incrementally through 0.8.x → 0.9.x dev rcs accumulate rows for
# these in ``django_migrations``; the newer-DB guard added in 65dc2521 would
# otherwise refuse to start with "applied migrations missing from this build"
# and brick beta-tester collections. We deliberately do NOT use Django's
# ``replaces=`` for these because Django's all-or-none replaces semantics
# splits the migration graph when only a *subset* of the replaces list is
# applied (which is exactly what happens for users at different intermediate
# dev branch states). This set is the authoritative compat list — extend it
# when squashing more migrations away. Generated from
# ``git log --diff-filter=D --name-only`` over each app's migrations/ tree.
HISTORICAL_GHOST_MIGRATIONS: frozenset[tuple[str, str]] = frozenset(
    {
        # core: 0023→0075 sequence plus every transient dev rename
        ("core", "0002_auto_20190417_0739"),
        ("core", "0006_auto_20200915_2006"),
        ("core", "0023_alter_archiveresult_options_archiveresult_abid_and_more"),
        ("core", "0023_new_schema"),
        ("core", "0024_auto_20240513_1143"),
        ("core", "0024_b_clear_config_fields"),
        ("core", "0024_c_disable_fk_checks"),
        ("core", "0024_d_fix_crawls_config"),
        ("core", "0024_f_add_snapshot_config"),
        ("core", "0024_snapshot_crawl"),
        ("core", "0025_allow_duplicate_urls_per_crawl"),
        ("core", "0025_alter_archiveresult_uuid"),
        ("core", "0025_cleanup_schema"),
        ("core", "0026_archiveresult_created_archiveresult_created_by_and_more"),
        ("core", "0026_final_field_adjustments"),
        ("core", "0026_remove_archiveresult_output_dir_and_more"),
        ("core", "0027_alter_archiveresult_created_by_and_more"),
        ("core", "0027_alter_archiveresult_hook_name_alter_archiveresult_id_and_more"),
        ("core", "0027_update_snapshot_ids"),
        ("core", "0028_alter_archiveresult_uuid"),
        ("core", "0028_snapshot_fs_version"),
        ("core", "0029_alter_archiveresult_id"),
        ("core", "0029_archiveresult_hook_fields"),
        ("core", "0030_alter_archiveresult_uuid"),
        ("core", "0030_migrate_output_field"),
        ("core", "0031_alter_archiveresult_id_alter_archiveresult_uuid_and_more"),
        ("core", "0031_snapshot_parent_snapshot"),
        ("core", "0032_alter_archiveresult_binary_and_more"),
        ("core", "0032_alter_archiveresult_id"),
        ("core", "0033_rename_extractor_add_hook_name"),
        ("core", "0033_rename_id_archiveresult_old_id"),
        ("core", "0034_alter_archiveresult_old_id_alter_archiveresult_uuid"),
        ("core", "0034_snapshot_current_step"),
        ("core", "0035_remove_archiveresult_uuid_archiveresult_id"),
        ("core", "0035_snapshot_crawl_non_nullable_remove_created_by"),
        ("core", "0036_alter_archiveresult_id_alter_archiveresult_old_id"),
        ("core", "0036_remove_archiveresult_created_by"),
        ("core", "0037_remove_archiveresult_output_dir_and_more"),
        ("core", "0037_rename_id_snapshot_old_id"),
        ("core", "0038_fix_missing_columns"),
        ("core", "0038_rename_uuid_snapshot_id"),
        ("core", "0039_fix_num_uses_values"),
        ("core", "0039_rename_snapshot_archiveresult_snapshot_old"),
        ("core", "0040_archiveresult_snapshot"),
        ("core", "0041_alter_archiveresult_snapshot_and_more"),
        ("core", "0042_remove_archiveresult_snapshot_old"),
        ("core", "0043_alter_archiveresult_snapshot_alter_snapshot_id_and_more"),
        ("core", "0044_alter_archiveresult_snapshot_alter_tag_uuid_and_more"),
        ("core", "0045_alter_snapshot_old_id"),
        ("core", "0046_alter_archiveresult_snapshot_alter_snapshot_id_and_more"),
        ("core", "0047_alter_snapshottag_unique_together_and_more"),
        ("core", "0048_alter_archiveresult_snapshot_and_more"),
        ("core", "0049_rename_snapshot_snapshottag_snapshot_old_and_more"),
        ("core", "0050_alter_snapshottag_snapshot_old"),
        ("core", "0051_snapshottag_snapshot_alter_snapshottag_snapshot_old"),
        ("core", "0052_alter_snapshottag_unique_together_and_more"),
        ("core", "0053_remove_snapshottag_snapshot_old"),
        ("core", "0054_alter_snapshot_timestamp"),
        ("core", "0055_alter_tag_slug"),
        ("core", "0056_remove_tag_uuid"),
        ("core", "0057_rename_id_tag_old_id"),
        ("core", "0058_alter_tag_old_id"),
        ("core", "0059_tag_id"),
        ("core", "0060_alter_tag_id"),
        ("core", "0061_rename_tag_snapshottag_old_tag_and_more"),
        ("core", "0062_alter_snapshottag_old_tag"),
        ("core", "0063_snapshottag_tag_alter_snapshottag_old_tag"),
        ("core", "0064_alter_snapshottag_unique_together_and_more"),
        ("core", "0065_remove_snapshottag_old_tag"),
        ("core", "0066_alter_snapshottag_tag_alter_tag_id_alter_tag_old_id"),
        ("core", "0067_alter_snapshottag_tag"),
        ("core", "0068_alter_archiveresult_options"),
        ("core", "0069_alter_archiveresult_created_alter_snapshot_added_and_more"),
        ("core", "0070_alter_archiveresult_created_by_alter_snapshot_added_and_more"),
        ("core", "0071_remove_archiveresult_old_id_remove_snapshot_old_id_and_more"),
        ("core", "0072_rename_added_snapshot_bookmarked_at_and_more"),
        ("core", "0073_rename_created_archiveresult_created_at_and_more"),
        ("core", "0074_alter_snapshot_downloaded_at"),
        ("core", "0075_crawl"),
        # api: pre-squash 0001_squashed plus 0002→0009 chain
        ("api", "0001_squashed"),
        ("api", "0002_alter_apitoken_options"),
        ("api", "0002_alter_outboundwebhook_options_and_more"),
        ("api", "0003_alter_apitoken_created_by_and_more"),
        ("api", "0003_rename_user_apitoken_created_by_apitoken_abid_and_more"),
        ("api", "0004_alter_apitoken_id_alter_apitoken_uuid"),
        ("api", "0004_rename_user_apitoken_created_by_apitoken_modified_and_more"),
        ("api", "0005_remove_apitoken_uuid_remove_outboundwebhook_uuid_and_more"),
        ("api", "0006_remove_outboundwebhook_uuid_apitoken_id_and_more"),
        ("api", "0007_alter_apitoken_created_by"),
        ("api", "0008_alter_apitoken_created_alter_apitoken_created_by_and_more"),
        ("api", "0009_rename_created_apitoken_created_at_and_more"),
        # machine: pre-squash 0001_squashed plus transient 0002→0005 renames
        ("machine", "0001_squashed"),
        ("machine", "0002_alter_dependency_bin_name_and_more"),
        ("machine", "0002_alter_machine_stats_installedbinary"),
        ("machine", "0002_process_parent_and_type"),
        ("machine", "0002_rename_custom_cmds_to_overrides"),
        ("machine", "0003_alter_dependency_id_alter_installedbinary_dependency_and_more"),
        ("machine", "0003_alter_installedbinary_options_and_more"),
        ("machine", "0004_alter_installedbinary_abspath_and_more"),
        ("machine", "0004_drop_dependency_table"),
        ("machine", "0004_rename_installedbinary_to_binary"),
        ("machine", "0005_binary_binproviders_binary_output_dir_and_more"),
        # crawls: transient dev renames around the seed-model removal
        ("crawls", "0002_delete_outlink"),
        ("crawls", "0002_drop_seed_model"),
        ("crawls", "0002_upgrade_to_0_9_0"),
        ("crawls", "0003_alter_crawl_output_dir"),
        ("crawls", "0004_alter_crawl_output_dir"),
        ("crawls", "0005_drop_seed_id_column"),
        ("crawls", "0006_alter_crawl_config_alter_crawl_output_dir_and_more"),
    },
)
+ squashed_replaced: set[tuple[str, str]] = set() + app_labels = {app_config.label for app_config in apps.get_app_configs()} + loader = MigrationLoader(connection=None, ignore_no_migrations=True, load=False) + loader.load_disk() + for (app_label, migration_name), migration in loader.disk_migrations.items(): + disk_migrations.add((app_label, migration_name)) + for replaced_app, replaced_name in migration.replaces or (): + squashed_replaced.add((replaced_app, replaced_name)) + + applied = {(app, name) for app, name in applied if app in app_labels} + pending = [f"{app}.{name}" for app, name in sorted(disk_migrations - applied)] + missing_pairs = sorted(applied - disk_migrations - squashed_replaced - HISTORICAL_GHOST_MIGRATIONS) + missing_from_code = [f"{app}.{name}" for app, name in missing_pairs] + rollback_targets = { + app: ( + max(name for disk_app, name in disk_migrations if disk_app == app) + if any(disk_app == app for disk_app, _name in disk_migrations) + else "zero" + ) + for app, _name in missing_pairs + } + return pending, missing_from_code, rollback_targets + + +@enforce_types +def pending_migrations(out_dir: Path = CONSTANTS.DATA_DIR) -> list[str]: + """Return migration files on disk that have not been applied yet.""" + pending, _missing_from_code, _rollback_targets = migration_state(out_dir=out_dir) + return pending + + +@enforce_types +def apply_migrations( + out_dir: Path = CONSTANTS.DATA_DIR, + stdout: TextIO | None = None, + stderr: TextIO | None = None, + verbosity: int = 1, +) -> list[str]: + """Apply pending Django migrations""" + from django.core.management import call_command + + with migration_lock(stdout=stderr or stdout): + if not pending_migrations(): + return [] + + if stdout is not None: + retry_sqlite_locks( + lambda: call_command("migrate", interactive=False, database="default", stdout=stdout, stderr=stderr, verbosity=verbosity), + label="applying migrations", + stderr=stderr, + ) + return [] + + def migrate() -> StringIO: + out1 = 
StringIO() + call_command("migrate", interactive=False, database="default", stdout=out1, verbosity=verbosity) + out1.seek(0) + return out1 + + out1 = retry_sqlite_locks(migrate, label="applying migrations") + + return [line.strip() for line in out1.readlines() if line.strip()] diff --git a/archivebox/misc/hashing.py b/archivebox/misc/hashing.py new file mode 100644 index 0000000000..872cb9a435 --- /dev/null +++ b/archivebox/misc/hashing.py @@ -0,0 +1,260 @@ +# Bootable hashing helpers (file content + mtime/size cache, mime detection). +# MUST NOT import archivebox.config, archivebox.core, or Django โ€” stdlib only. + +import hashlib +import mimetypes +from functools import lru_cache +from pathlib import Path +from collections.abc import Callable +from datetime import datetime + + +@lru_cache(maxsize=1024) +def _cached_file_hash(filepath: str, size: int, mtime: float) -> str: + """Internal function to calculate file hash with cache key based on path, size and mtime.""" + sha256_hash = hashlib.sha256() + + with open(filepath, "rb") as f: + for chunk in iter(lambda: f.read(4096), b""): + sha256_hash.update(chunk) + + return sha256_hash.hexdigest() + + +@lru_cache(maxsize=10) +def hash_file(file_path: Path, pwd: Path | None = None) -> str: + """Calculate SHA256 hash of a file with caching based on path, size and mtime.""" + pwd = Path(pwd) if pwd else None + file_path = Path(file_path) + if not file_path.is_absolute(): + file_path = pwd / file_path if pwd else file_path.absolute() + + abs_path = file_path.resolve() + stat_info = abs_path.stat() + + return _cached_file_hash( + str(abs_path), + stat_info.st_size, + stat_info.st_mtime, + ) + + +@lru_cache(maxsize=10) +def get_dir_hashes(dir_path: Path, pwd: Path | None = None, filter_func: Callable | None = None, max_depth: int = -1) -> dict[str, str]: + """Calculate SHA256 hashes for all files and directories recursively.""" + pwd = Path(pwd) if pwd else None + dir_path = Path(dir_path) + if not dir_path.is_absolute(): + 
dir_path = pwd / dir_path if pwd else dir_path.absolute() + + if not dir_path.is_dir(): + raise ValueError(f"Not a directory: {dir_path}") + if max_depth < -1: + raise ValueError(f"max_depth must be >= -1, got {max_depth}") + + # Get all files recursively + all_files = get_dir_entries( + dir_path, + pwd=pwd, + recursive=True, + include_files=True, + include_dirs=False, + filter_func=filter_func, + ) + + hashes: dict[str, str] = {} + hashable_summary = [] + + # Calculate hashes for all files + for subfile in all_files: + subfile_path = dir_path / subfile + sha256_hash = hash_file(subfile_path) + hashes[subfile] = sha256_hash + hashable_summary.append(f"{sha256_hash} ./{subfile}") + + # Calculate hashes for all directories + subdirs = get_dir_entries( + dir_path, + pwd=pwd, + recursive=True, + include_files=False, + include_dirs=True, + include_hidden=False, + filter_func=filter_func, + max_depth=max_depth, + ) + + for subdir in subdirs: + subdir_path = dir_path / subdir + subdir_hashes = get_dir_hashes( + subdir_path, + filter_func=filter_func, + max_depth=0, + ) + hashes[subdir] = subdir_hashes["."] + + # Filter results by max_depth + if max_depth >= 0: + hashes = {path: value for path, value in hashes.items() if len(Path(path).parts) <= max_depth + 1} + + # Calculate root directory hash + hashable_summary.sort() + root_sha256 = hashlib.sha256("\n".join(hashable_summary).encode()).hexdigest() + hashes["."] = root_sha256 + + return hashes + + +@lru_cache(maxsize=128) +def get_dir_entries( + dir_path: Path, + pwd: Path | None = None, + recursive: bool = True, + include_files: bool = True, + include_dirs: bool = True, + include_hidden: bool = False, + filter_func: Callable | None = None, + max_depth: int = -1, +) -> tuple[str, ...]: + """Get filtered list of directory entries.""" + pwd = Path(pwd) if pwd else None + dir_path = Path(dir_path) + if not dir_path.is_absolute(): + dir_path = pwd / dir_path if pwd else dir_path.absolute() + + results = [] + + def 
process_path(path: Path, depth: int): + if not include_hidden and path.name.startswith("."): + return False + if max_depth >= 0 and depth > max_depth: + return False + if filter_func: + info = { + "abspath": str(path.absolute()), + "relpath": str(path.relative_to(dir_path)), + } + if not filter_func(info): + return False + return True + + for path in dir_path.rglob("*") if recursive else dir_path.glob("*"): + current_depth = len(path.relative_to(dir_path).parts) + + if path.is_file() and include_files and process_path(path, current_depth): + results.append(str(path.relative_to(dir_path))) + elif path.is_dir() and include_dirs and process_path(path, current_depth): + results.append(str(path.relative_to(dir_path))) + + if not recursive: + break + + return tuple(sorted(results)) # Make immutable for caching + + +@lru_cache(maxsize=1024) +def get_dir_sizes(dir_path: Path, pwd: Path | None = None, **kwargs) -> dict[str, int]: + """Calculate sizes for all files and directories recursively.""" + sizes: dict[str, int] = {} + hashes = get_dir_hashes(dir_path, pwd=pwd, **kwargs) + dir_path = Path(dir_path) + + for path_key in hashes: + full_path = dir_path / path_key + if full_path.is_file(): + sizes[path_key] = full_path.stat().st_size + else: + total = 0 + for file_path in full_path.rglob("*"): + if file_path.is_file() and not file_path.name.startswith("."): + total += file_path.stat().st_size + sizes[path_key + "/"] = total + + return sizes + + +@lru_cache(maxsize=10) +def get_dir_info(dir_path: Path, pwd: Path | None = None, filter_func: Callable | None = None, max_depth: int = -1) -> dict: + """Get detailed information about directory contents including hashes and sizes.""" + pwd = Path(pwd) if pwd else None + dir_path = Path(dir_path) + if not dir_path.is_absolute(): + dir_path = pwd / dir_path if pwd else dir_path.absolute() + + hashes = get_dir_hashes(dir_path, pwd=pwd, filter_func=filter_func, max_depth=max_depth) + sizes = get_dir_sizes(str(dir_path), pwd=pwd, 
filter_func=filter_func, max_depth=max_depth) + + num_total_subpaths = sum(1 for name in hashes if name != ".") + details = {} + + for filename, sha256_hash in sorted(hashes.items()): + abs_path = (dir_path / filename).resolve() + stat_info = abs_path.stat() + num_subpaths = sum(1 for p in hashes if p.startswith(filename + "/")) + is_dir = abs_path.is_dir() + if is_dir: + mime_type = "inode/directory" + basename = abs_path.name + extension = "" + num_bytes = sizes[filename + "/"] + if filename == ".": + num_subpaths = num_total_subpaths + else: + filename += "/" + num_subpaths = num_subpaths + else: # is_file + num_subpaths = None + mime_type = mimetypes.guess_type(str(abs_path))[0] + extension = abs_path.suffix + basename = abs_path.name.rsplit(extension, 1)[0] + num_bytes = sizes[filename] + + details[filename] = { + "basename": basename, + "mime_type": mime_type, + "extension": extension, + "num_subpaths": num_subpaths, + "num_bytes": num_bytes, + "hash_sha256": sha256_hash, + "created_at": datetime.fromtimestamp(stat_info.st_ctime).isoformat(), + "modified_at": datetime.fromtimestamp(stat_info.st_mtime).isoformat(), + } + + if filter_func and not filter_func(details[filename]): + del details[filename] + + return details + + +if __name__ == "__main__": + import json + + dir_info = get_dir_info(Path("."), max_depth=6) + with open(".hashes.json", "w") as f: + json.dump(dir_info, f, indent=4) + print("Wrote .hashes.json") + +# Example output: +# { +# ".": { +# "basename": "misc", +# "mime_type": "inode/directory", +# "extension": "", +# "num_subpaths": 25, +# "num_bytes": 214677, +# "hash_sha256": "addfacf88b2ff6b564846415fb7b21dcb7e63ee4e911bc0aec255ee354958530", +# "created_at": "2024-12-04T00:08:38.537449", +# "modified_at": "2024-12-04T00:08:38.537449" +# }, +# "__init__.py": { +# "basename": "__init__", +# "mime_type": "text/x-python", +# "extension": ".py", +# "num_subpaths": null, +# "num_bytes": 32, +# "hash_sha256": 
"b0e5e7ff17db3b60535cf664282787767c336e3e203a43e21b6326c6fe457551", +# "created_at": "2024-10-08T00:51:41.001359", +# "modified_at": "2024-10-08T00:51:41.001359" +# }, +# ... +# } diff --git a/archivebox/misc/jsonl.py b/archivebox/misc/jsonl.py new file mode 100644 index 0000000000..d987be92da --- /dev/null +++ b/archivebox/misc/jsonl.py @@ -0,0 +1,158 @@ +""" +JSONL (JSON Lines) utilities for ArchiveBox. + +Provides functions for reading, writing, and processing typed JSONL records. +All CLI commands that accept stdin can read both plain URLs and typed JSONL. + +CLI Pipeline: + archivebox crawl URL -> {"type": "Crawl", "id": "...", "urls": "...", ...} + archivebox snapshot -> {"type": "Snapshot", "id": "...", "url": "...", ...} + archivebox extract -> {"type": "ArchiveResult", "id": "...", "snapshot_id": "...", ...} + +Typed JSONL Format: + {"type": "Crawl", "id": "...", "urls": "...", "max_depth": 0, ...} + {"type": "Snapshot", "id": "...", "url": "https://example.com", "title": "...", ...} + {"type": "ArchiveResult", "id": "...", "snapshot_id": "...", "plugin": "...", ...} + {"type": "Tag", "name": "..."} + +Plain URLs (also supported): + https://example.com + https://foo.com + +Local filesystem paths and file:// URLs are intentionally not accepted here. +Pipe file contents through stdin instead of passing a path/URI as a crawl URL. +""" + +__package__ = "archivebox.misc" + +# Bootable: stdlib only. MUST NOT import archivebox.config, archivebox.core, or Django. +# Used by CLI commands at entry, before Django setup. 
+ +import sys +import json +from typing import Any, TextIO +from collections.abc import Iterable, Iterator +from pathlib import Path + + +# Type constants for JSONL records +TYPE_SNAPSHOT = "Snapshot" +TYPE_ARCHIVERESULT = "ArchiveResult" +TYPE_TAG = "Tag" +TYPE_CRAWL = "Crawl" +TYPE_BINARYREQUEST = "BinaryRequest" +TYPE_BINARY = "Binary" +TYPE_PROCESS = "Process" +TYPE_MACHINE = "Machine" + +VALID_TYPES = { + TYPE_SNAPSHOT, + TYPE_ARCHIVERESULT, + TYPE_TAG, + TYPE_CRAWL, + TYPE_BINARYREQUEST, + TYPE_BINARY, + TYPE_PROCESS, + TYPE_MACHINE, +} + + +def parse_line(line: str) -> dict[str, Any] | None: + """ + Parse a single line of input as either JSONL or plain URL. + + Returns a dict with at minimum {'type': '...', 'url': '...'} or None if invalid. + """ + line = line.strip() + if not line or line.startswith("#"): + return None + + # Try to parse as JSON first + if line.startswith("{"): + try: + record = json.loads(line) + # If it has a type, validate it + if "type" in record and record["type"] not in VALID_TYPES: + # Unknown type, treat as raw data + pass + # If it has url but no type, assume Snapshot + if "url" in record and "type" not in record: + record["type"] = TYPE_SNAPSHOT + return record + except json.JSONDecodeError: + pass + + # Treat as plain URL if it looks like one + if line.startswith("http://") or line.startswith("https://"): + return {"type": TYPE_SNAPSHOT, "url": line} + + # Could be a snapshot ID (UUID with dashes or compact 32-char hex) + if len(line) == 36 and line.count("-") == 4: + return {"type": TYPE_SNAPSHOT, "id": line} + if len(line) == 32: + try: + int(line, 16) + except ValueError: + pass + else: + return {"type": TYPE_SNAPSHOT, "id": line} + + # Unknown format, skip + return None + + +def read_stdin(stream: TextIO | None = None) -> Iterator[dict[str, Any]]: + """ + Read JSONL or plain URLs from stdin. + + Yields parsed records as dicts. + Supports both JSONL format and plain URLs (one per line). 
+ """ + active_stream: TextIO = sys.stdin if stream is None else stream + + # Don't block if stdin is a tty with no input + if active_stream.isatty(): + return + + for line in active_stream: + record = parse_line(line) + if record: + yield record + + +def read_file(path: Path) -> Iterator[dict[str, Any]]: + """ + Read JSONL or plain URLs from a file. + + Yields parsed records as dicts. + """ + with open(path) as f: + for line in f: + record = parse_line(line) + if record: + yield record + + +def read_args_or_stdin(args: Iterable[str], stream: TextIO | None = None) -> Iterator[dict[str, Any]]: + """ + Read from CLI arguments if provided, otherwise from stdin. + + Handles both URLs and JSONL from either source. + Does not expand local file path arguments; pipe file contents via stdin. + """ + if args: + for arg in args: + record = parse_line(arg) + if record: + yield record + else: + yield from read_stdin(stream) + + +def write_record(record: dict[str, Any], stream: TextIO | None = None) -> None: + """ + Write a single JSONL record to stdout (or provided stream). + """ + active_stream: TextIO = sys.stdout if stream is None else stream + active_stream.write(json.dumps(record) + "\n") + active_stream.flush() diff --git a/archivebox/misc/logging.py b/archivebox/misc/logging.py new file mode 100644 index 0000000000..0049ae1232 --- /dev/null +++ b/archivebox/misc/logging.py @@ -0,0 +1,123 @@ +__package__ = "archivebox.misc" + +# Bootable logging primitives (Rich console, ANSI colors, stdout/stderr helpers). +# MUST NOT import archivebox.config, archivebox.core, or Django โ€” this module is +# loaded via config/constants.py during pre-bootstrap, before settings or apps +# are ready. Post-bootstrap CLI logging helpers live in logging_util.py. 
+ +import sys +from collections import defaultdict +from random import randint + +from rich.console import Console +from rich.highlighter import Highlighter + +# SETUP RICH CONSOLE / TTY detection / COLOR / PROGRESS BARS +# Disable wrapping - use soft_wrap=True and large width so text flows naturally +# Colors are preserved, just no hard line breaks inserted +CONSOLE = Console(width=32768, soft_wrap=True, force_terminal=True) +STDERR = Console(stderr=True, width=32768, soft_wrap=True, force_terminal=True) +IS_TTY = sys.stdout.isatty() + + +class AttrDict(dict): + def __init__(self, *args, **kwargs): + super().__init__() + self.update(*args, **kwargs) + + @classmethod + def _wrap(cls, value): + if isinstance(value, dict) and not isinstance(value, AttrDict): + return cls(value) + if isinstance(value, list): + return [cls._wrap(item) for item in value] + if isinstance(value, tuple): + return tuple(cls._wrap(item) for item in value) + return value + + def __setitem__(self, key, value): + super().__setitem__(key, self._wrap(value)) + + def update(self, *args, **kwargs): + for key, value in dict(*args, **kwargs).items(): + self[key] = value + + def __getattr__(self, key): + try: + return self[key] + except KeyError as err: + raise AttributeError(key) from err + + +class RainbowHighlighter(Highlighter): + def highlight(self, text): + for index in range(len(text)): + text.stylize(f"color({randint(90, 98)})", index, index + 1) + + +rainbow = RainbowHighlighter() + + +DEFAULT_CLI_COLORS = AttrDict( + { + "reset": "\033[00;00m", + "lightblue": "\033[01;30m", + "lightyellow": "\033[01;33m", + "lightred": "\033[01;35m", + "red": "\033[01;31m", + "green": "\033[01;32m", + "blue": "\033[01;34m", + "white": "\033[01;37m", + "black": "\033[01;30m", + }, +) +ANSI = AttrDict({k: "" for k in DEFAULT_CLI_COLORS.keys()}) + +COLOR_DICT = defaultdict( + lambda: [(0, 0, 0), (0, 0, 0)], + { + "00": [(0, 0, 0), (0, 0, 0)], + "30": [(0, 0, 0), (0, 0, 0)], + "31": [(255, 0, 0), (128, 0, 0)], + 
"32": [(0, 200, 0), (0, 128, 0)], + "33": [(255, 255, 0), (128, 128, 0)], + "34": [(0, 0, 255), (0, 0, 128)], + "35": [(255, 0, 255), (128, 0, 128)], + "36": [(0, 255, 255), (0, 128, 128)], + "37": [(255, 255, 255), (255, 255, 255)], + }, +) + + +# Logging Helpers (DEPRECATED, use rich.print instead going forward) +def stdout(*args, color: str | None = None, prefix: str = "", config: dict | None = None) -> None: + ansi = DEFAULT_CLI_COLORS if (config or {}).get("USE_COLOR") else ANSI + + if color: + strs = [ansi[color], " ".join(str(a) for a in args), ansi["reset"], "\n"] + else: + strs = [" ".join(str(a) for a in args), "\n"] + + sys.stdout.write(prefix + "".join(strs)) + + +def stderr(*args, color: str | None = None, prefix: str = "", config: dict | None = None) -> None: + ansi = DEFAULT_CLI_COLORS if (config or {}).get("USE_COLOR") else ANSI + + if color: + strs = [ansi[color], " ".join(str(a) for a in args), ansi["reset"], "\n"] + else: + strs = [" ".join(str(a) for a in args), "\n"] + + sys.stderr.write(prefix + "".join(strs)) + + +def hint(text: tuple[str, ...] | list[str] | str, prefix=" ", config: dict | None = None) -> None: + ansi = DEFAULT_CLI_COLORS if (config or {}).get("USE_COLOR") else ANSI + + if isinstance(text, str): + stderr(f"{prefix}{ansi['lightred']}Hint:{ansi['reset']} {text}") + else: + stderr(f"{prefix}{ansi['lightred']}Hint:{ansi['reset']} {text[0]}") + for line in text[1:]: + stderr(f"{prefix} {line}") diff --git a/archivebox/misc/logging_util.py b/archivebox/misc/logging_util.py new file mode 100644 index 0000000000..ac4b2718c6 --- /dev/null +++ b/archivebox/misc/logging_util.py @@ -0,0 +1,482 @@ +__package__ = "archivebox" + +# Post-bootstrap CLI logging helpers (event loggers, progress bars, formatters). +# Requires archivebox.config to be loaded โ€” imports CONSTANTS/get_config and +# references Django ORM types. For pre-bootstrap logging primitives use +# misc/logging.py, which has no archivebox or Django dependencies. 
+ +import re +import os +import sys +import time + +from math import log +from multiprocessing import Process +from pathlib import Path + +from datetime import datetime, timezone +from typing import Any, Optional, IO, TYPE_CHECKING, cast +from collections.abc import Iterable + +if TYPE_CHECKING: + from archivebox.core.models import Snapshot + +from rich import print +from rich.panel import Panel + +from archivebox.config import CONSTANTS, VERSION +from archivebox.config.common import get_config +from archivebox.misc.util import enforce_types +from archivebox.misc.logging import ANSI + + +class TimedProgress: + """Show a progress bar and measure elapsed time until .end() is called""" + + def __init__(self, seconds, prefix="", config=None, **config_kwargs): + + config = config or get_config(**config_kwargs) + self.SHOW_PROGRESS = config.SHOW_PROGRESS + self.ANSI = config.ANSI + self.TERM_WIDTH = config.TERM_WIDTH + + if self.SHOW_PROGRESS: + self.p = Process(target=progress_bar, args=(seconds, prefix, self.ANSI)) + self.p.start() + + self.stats = {"start_ts": datetime.now(timezone.utc), "end_ts": None} + + def end(self): + """immediately end progress, clear the progressbar line, and save end_ts""" + + end_ts = datetime.now(timezone.utc) + self.stats["end_ts"] = end_ts + + if self.SHOW_PROGRESS: + # terminate if we havent already terminated + try: + # kill the progress bar subprocess + try: + self.p.close() # must be closed *before* its terminnated + except (KeyboardInterrupt, SystemExit): + print() + raise + except BaseException: # lgtm [py/catch-base-exception] + pass + self.p.terminate() + time.sleep(0.1) + # sometimes the timer doesn't terminate properly, then blocks at the join until + # the full time has elapsed. sending a kill tries to avoid that. 
+ try: + self.p.kill() + except Exception: + pass + + # clear whole terminal line + try: + sys.stdout.write("\r{}{}\r".format((" " * self.TERM_WIDTH), self.ANSI["reset"])) + except (OSError, BrokenPipeError): + # ignore when the parent proc has stopped listening to our stdout + pass + except ValueError: + pass + + +@enforce_types +def progress_bar(seconds: int, prefix: str = "", ANSI: dict[str, str] = ANSI, config=None, **config_kwargs) -> None: + """show timer in the form of progress bar, with percentage and seconds remaining""" + output_buf = sys.stdout or sys.__stdout__ or sys.stderr or sys.__stderr__ + chunk = "โ–ˆ" if output_buf and output_buf.encoding.upper() == "UTF-8" else "#" + config = config or get_config(**config_kwargs) + last_width = config.TERM_WIDTH + chunks = last_width - len(prefix) - 20 # number of progress chunks to show (aka max bar width) + try: + for s in range(seconds * chunks): + max_width = config.TERM_WIDTH + if max_width < last_width: + # when the terminal size is shrunk, we have to write a newline + # otherwise the progress bar will keep wrapping incorrectly + sys.stdout.write("\r\n") + sys.stdout.flush() + chunks = max_width - len(prefix) - 20 + pct_complete = s / chunks / seconds * 100 + log_pct = (log(pct_complete or 1, 10) / 2) * 100 # everyone likes faster progress bars ;) + bar_width = round(log_pct / (100 / chunks)) + last_width = max_width + + # โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ 0.9% (1/60sec) + sys.stdout.write( + "\r{}{}{}{} {}% ({}/{}sec)".format( + prefix, + ANSI["green" if pct_complete < 80 else "lightyellow"], + (chunk * bar_width).ljust(chunks), + ANSI["reset"], + round(pct_complete, 1), + round(s / chunks), + seconds, + ), + ) + sys.stdout.flush() + time.sleep(1 / chunks) + + # โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ 100.0% (60/60sec) + sys.stdout.write( + "\r{}{}{}{} {}% ({}/{}sec)".format( + prefix, + ANSI["red"], + chunk * 
chunks, + ANSI["reset"], + 100.0, + seconds, + seconds, + ), + ) + sys.stdout.flush() + # uncomment to have it disappear when it hits 100% instead of staying full red: + # time.sleep(0.5) + # sys.stdout.write('\r{}{}\r'.format((' ' * get_config().TERM_WIDTH), ANSI['reset'])) + # sys.stdout.flush() + except (KeyboardInterrupt, BrokenPipeError): + print() + + +def log_cli_command(subcommand: str, subcommand_args: Iterable[str] = (), stdin: str | IO | None = None, pwd: str = "."): + args = " ".join(subcommand_args) + version_msg = "[dark_magenta]\\[{now}][/dark_magenta] [dark_red]ArchiveBox[/dark_red] [dark_goldenrod]v{VERSION}[/dark_goldenrod]: [green4]archivebox [green3]{subcommand}[green2] {args}[/green2]".format( + now=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"), + VERSION=VERSION, + subcommand=subcommand, + args=args, + ) + # stderr() + # stderr('[bright_black] > {pwd}[/]'.format(pwd=pwd, **ANSI)) + # stderr() + print(Panel(version_msg), file=sys.stderr) + + +def log_list_started(filter_patterns: list[str] | None, filter_type: str): + print(f"[green][*] Finding links in the archive index matching these {filter_type} patterns:[/]") + print(" {}".format(" ".join(filter_patterns or ()))) + + +def log_list_finished(snapshots): + from archivebox.core.models import Snapshot + + print() + print("---------------------------------------------------------------------------------------------------") + csv_queryset = cast(Any, Snapshot.objects.filter(pk__in=[s.pk for s in snapshots])) + print(csv_queryset.to_csv(cols=["timestamp", "is_archived", "num_outputs", "url"], header=True, ljust=16, separator=" | ")) + print("---------------------------------------------------------------------------------------------------") + print() + + +def log_removal_started(snapshots, yes: bool): + from django.db.models import QuerySet + + count = snapshots.count() if isinstance(snapshots, QuerySet) else len(snapshots) + print(f"[yellow3][i] Found {count} matching URLs to 
remove.[/]") + file_counts = [s.num_outputs for s in snapshots if os.access(s.output_dir, os.R_OK)] + print( + f" {count} Links will be deleted from the index and their archived content folders will be deleted from disk.\n" + f" ({len(file_counts)} data folders with {sum(file_counts)} archived files will be deleted!)", + ) + + if not yes: + print() + print(f"[yellow3][?] Do you want to proceed with removing these {count} links?[/]") + try: + assert input(" y/[n]: ").lower() == "y" + except (KeyboardInterrupt, EOFError, AssertionError): + raise SystemExit(0) + + +def log_removal_finished(remaining_links: int, removed_links: int): + if remaining_links == 0 and removed_links == 0: + print() + print("[red1][X] No matching links found.[/]") + else: + total_before = remaining_links + removed_links + print() + print(f"[red1][โˆš] Removed {removed_links} out of {total_before} links from the archive index.[/]") + print(f" Index now contains {remaining_links} links.") + + +### Helpers + + +@enforce_types +def pretty_path(path: Path | str, pwd: Path | str = CONSTANTS.DATA_DIR, color: bool = True) -> str: + """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc""" + pwd = str(Path(pwd)) # .resolve() + path = str(path) + + if not path: + return path + + # replace long absolute paths with ./ relative ones to save on terminal output width + if path.startswith(pwd) and (pwd != "/") and path != pwd: + if color: + path = path.replace(pwd, "[light_slate_blue].[/light_slate_blue]", 1) + else: + path = path.replace(pwd, ".", 1) + + # quote paths containing spaces + if " " in path: + path = f'"{path}"' + + # replace home directory with ~ for shorter output + path = path.replace(str(Path("~").expanduser()), "~") + + return path + + +@enforce_types +def printable_filesize(num_bytes: int | float) -> str: + for count in ["Bytes", "KB", "MB", "GB"]: + if num_bytes > -1024.0 and num_bytes < 1024.0: + return f"{num_bytes:3.1f} {count}" + num_bytes /= 1024.0 + return 
"{:3.1f} {}".format(num_bytes, "TB") + + +@enforce_types +def format_duration(seconds: float) -> str: + """Format duration in human-readable form.""" + if seconds < 1: + return f"{seconds * 1000:.0f}ms" + elif seconds < 60: + return f"{seconds:.1f}s" + elif seconds < 3600: + minutes = int(seconds // 60) + secs = int(seconds % 60) + return f"{minutes}min {secs}s" if secs else f"{minutes}min" + else: + hours = int(seconds // 3600) + minutes = int((seconds % 3600) // 60) + return f"{hours}hr {minutes}min" if minutes else f"{hours}hr" + + +@enforce_types +def truncate_url(url: str, max_length: int = 60) -> str: + """Truncate URL to max_length, keeping domain and adding ellipsis.""" + if len(url) <= max_length: + return url + # Try to keep the domain and beginning of path + if "://" in url: + protocol, rest = url.split("://", 1) + if "/" in rest: + domain, path = rest.split("/", 1) + available = max_length - len(protocol) - len(domain) - 6 # for "://", "/", "..." + if available > 10: + return f"{protocol}://{domain}/{path[:available]}..." + # Fallback: just truncate + return url[: max_length - 3] + "..." + + +@enforce_types +def log_worker_event( + worker_type: str, + event: str, + indent_level: int = 0, + pid: int | None = None, + worker_id: str | None = None, + url: str | None = None, + plugin: str | None = None, + metadata: dict[str, Any] | None = None, + error: Exception | None = None, +) -> None: + """ + Log a worker event with structured metadata and indentation. + + Args: + worker_type: Type of worker (Orchestrator, CrawlWorker, SnapshotWorker) + event: Event name (Starting, Completed, Failed, etc.) 
+ indent_level: Indentation level (0=Orchestrator, 1=CrawlWorker, 2=SnapshotWorker) + pid: Process ID + worker_id: Worker ID (UUID for workers) + url: URL being processed (for SnapshotWorker) + plugin: Plugin name (for hook processes) + metadata: Dict of metadata to show in curly braces + error: Exception if event is an error + """ + indent = " " * indent_level + + from rich.markup import escape + + # Build worker identifier (without URL/plugin) + worker_parts = [worker_type] + # Don't add pid/worker_id for DB operations (they happen in whatever process is running) + if pid and worker_type != "DB": + worker_parts.append(f"pid={pid}") + if worker_id and worker_type in ("CrawlWorker", "Orchestrator") and worker_type != "DB": + worker_parts.append(f"id={worker_id}") + + # Build worker label parts for brackets (shown inside brackets) + worker_label_base = worker_parts[0] + worker_bracket_content = ", ".join(worker_parts[1:]) if len(worker_parts) > 1 else None + + # Build URL/plugin display (shown AFTER the label, outside brackets) + url_extractor_parts = [] + if url: + url_extractor_parts.append(f"url: {escape(url)}") + if plugin: + url_extractor_parts.append(f"extractor: {escape(plugin)}") + + url_extractor_str = " | ".join(url_extractor_parts) if url_extractor_parts else "" + + # Build metadata string + metadata_str = "" + if metadata: + # Format metadata nicely + meta_parts = [] + for k, v in metadata.items(): + if isinstance(v, float): + # Format floats nicely (durations, sizes) + if "duration" in k.lower(): + meta_parts.append(f"{k}: {format_duration(v)}") + elif "size" in k.lower(): + meta_parts.append(f"{k}: {printable_filesize(int(v))}") + else: + meta_parts.append(f"{k}: {v:.2f}") + elif isinstance(v, int): + # Format integers - check if it's a size + if "size" in k.lower() or "bytes" in k.lower(): + meta_parts.append(f"{k}: {printable_filesize(v)}") + else: + meta_parts.append(f"{k}: {v}") + elif isinstance(v, (list, tuple)): + meta_parts.append(f"{k}: 
{len(v)}") + else: + meta_parts.append(f"{k}: {v}") + metadata_str = " | ".join(meta_parts) + + # Determine color based on event + color = "white" + if event in ("Starting...", "Started", "STARTED", "Started in background"): + color = "green" + elif event.startswith("Created"): + color = "cyan" # DB creation events + elif event in ("Completed", "COMPLETED", "All work complete"): + color = "blue" + elif event in ("Failed", "ERROR", "Failed to spawn worker"): + color = "red" + elif event in ("Shutting down", "SHUTDOWN"): + color = "grey53" + + # Build final message + error_str = f" {type(error).__name__}: {error}" if error else "" + from archivebox.misc.logging import CONSOLE, STDERR + from rich.text import Text + + # Create a Rich Text object for proper formatting + # Text.append() treats content as literal (no markup parsing) + text = Text() + text.append(indent) + text.append(worker_label_base, style=color) + + # Add bracketed content if present (using Text.append to avoid markup issues) + if worker_bracket_content: + text.append("[", style=color) + text.append(worker_bracket_content, style=color) + text.append("]", style=color) + + text.append(f" {event}{error_str}", style=color) + + # Add URL/plugin info first (more important) + if url_extractor_str: + text.append(f" | {url_extractor_str}") + + # Then add other metadata + if metadata_str: + text.append(f" | {metadata_str}") + + # Stdout is reserved for JSONL records whenever commands are piped together. + # Route worker/DB progress to stderr in non-TTY contexts so pipelines like + # `archivebox snapshot list | archivebox run` keep stdout machine-readable. 
@enforce_types
def printable_folders(folders: dict[str, Optional["Snapshot"]], with_headers: bool = False) -> str:
    """Render one ``<folder> <url> "<title>"`` line per entry of *folders*.

    Args:
        folders: Mapping of folder name to its snapshot (or a falsy value).
        with_headers: Accepted for interface compatibility; not used here.

    Returns:
        The lines joined with newlines. For a falsy snapshot the falsy value
        itself is interpolated in place of the url and title.
    """
    lines = []
    for folder, snapshot in folders.items():
        url = snapshot and snapshot.url
        title = snapshot and snapshot.title
        lines.append(f'{folder} {url} "{title}"')
    return "\n".join(lines)


@enforce_types
def printable_config(config: dict, prefix: str = "") -> str:
    """Render *config* as ``KEY=value`` lines, skipping dict and callable values.

    Lines are joined with ``"\\n" + prefix`` (the first line is not prefixed).
    """
    scalar_items = ((key, val) for key, val in config.items() if not isinstance(val, dict) and not callable(val))
    return f"\n{prefix}".join(f"{key}={val}" for key, val in scalar_items)


@enforce_types
def printable_folder_status(name: str, folder: dict) -> str:
    """Format a one-line, rich-markup status row for a data folder.

    Args:
        name: Label shown in the first column.
        folder: Dict read for the keys ``enabled``, ``is_valid``, ``path`` and
            (optionally) ``is_mount``.

    Returns:
        Space-joined columns: status symbol, name, file count / size, note, path.
    """
    if not folder["enabled"]:
        color, symbol, note, num_files = "grey53", "-", "unused", "-"
    elif folder["is_valid"]:
        # The check mark was stored as mojibake ("โˆš"); restore the intended U+221A.
        color, symbol, note, num_files = "green", "√", "valid", ""
    else:
        color, symbol, note, num_files = "red", "X", "invalid", "?"

    if folder["path"]:
        if os.access(folder["path"], os.R_OK):
            try:
                if os.path.isdir(folder["path"]):
                    num_files = f"{len(os.listdir(folder['path']))} files"
                else:
                    num_files = printable_filesize(Path(folder["path"]).stat().st_size)
            except OSError:
                # Covers PermissionError plus races where the path vanishes or
                # changes type between the access() check and the read.
                num_files = "error"
        else:
            num_files = "missing"

        if folder.get("is_mount"):
            # add symbol @ next to filecount if path is a remote filesystem mount
            num_files = f"{num_files} @" if num_files else "@"

    path = pretty_path(folder["path"])

    columns = (
        f"[{color}]",
        symbol,
        "[/]",
        name.ljust(21).replace("DATA_DIR", "[light_slate_blue]DATA_DIR[/light_slate_blue]"),
        num_files.ljust(14).replace("missing", "[grey53]missing[/grey53]"),
        f"[{color}]",
        note.ljust(8),
        "[/]",
        path.ljust(76),
    )
    return " ".join(columns)


@enforce_types
def printable_dependency_version(name: str, dependency: dict) -> str:
    """Format a one-line, ANSI-coloured status row for a dependency.

    Args:
        name: Label shown in the first column.
        dependency: Dict read for the keys ``enabled``, ``is_valid``,
            ``version`` and ``path``.

    Returns:
        Space-joined columns: status symbol, name, version, note, path.
    """
    color, symbol, note, version = "red", "X", "invalid", "?"

    if dependency["enabled"]:
        if dependency["is_valid"]:
            # The check mark was stored as mojibake ("โˆš"); restore the intended U+221A.
            color, symbol, note = "green", "√", "valid"

            # Coerce to str so a missing (None) or non-string version shows "?"
            # instead of raising TypeError inside re.search().
            parsed_version_num = re.search(r"[\d\.]+", str(dependency["version"] or ""))
            if parsed_version_num:
                version = f"v{parsed_version_num[0]}"
    else:
        color, symbol, note, version = "lightyellow", "-", "disabled", "-"

    path = pretty_path(dependency["path"])

    columns = (
        ANSI[color],
        symbol,
        ANSI["reset"],
        name.ljust(21),
        version.ljust(14),
        ANSI[color],
        note.ljust(8),
        ANSI["reset"],
        path.ljust(76),
    )
    return " ".join(columns)
# Hide site-packages/sonic/client.py:115: SyntaxWarning
# https://github.com/xmonader/python-sonic-client/pull/18
warnings.filterwarnings("ignore", category=SyntaxWarning, module="sonic")


# Matches credential-like query parameters; group 1 is the "?key=" / "&key=" prefix, group 2 the value.
SENSITIVE_QUERY_PARAM_RE = re.compile(r"(?i)([?&](?:api_key|token|access_token|password|secret)=)([^&#\s]+)")


# Make daphne log requests quieter and easier to read
class ModifiedAccessLogGenerator(access.AccessLogGenerator):
    """Kludge workaround until daphne uses the Python logging framework. https://github.com/django/daphne/pull/473/files"""

    # Request suffixes that are never worth a log line (static assets, favicons, ...).
    _QUIET_SUFFIXES = (
        "/favicon.ico",
        "/robots.txt",
        "/screenshot.png",
        ".css",
        ".js",
        ".woff",
        ".ttf",
    )

    def __call__(self, protocol, action, details):
        """Log completed HTTP requests via ``write_entry``; defer everything else to daphne."""
        if protocol == "http" and action == "complete":
            self.write_entry(
                host=details["client"],
                date=datetime.datetime.now(),
                request="%(method)s %(path)s" % details,
                status=details["status"],
                length=details["size"],
                time_taken=details.get("time_taken"),
            )
            return
        return super().__call__(protocol, action, details)

    def write_entry(self, host, date, request, status=None, length=None, ident=None, user=None, time_taken=None):
        """Write one access-log line to ``self.stream`` unless the request is noise.

        Sensitive query parameter values are replaced with ``[REDACTED]`` before
        any filtering or output. ``length``, ``ident`` and ``user`` are accepted
        for signature compatibility and not written.
        """
        request = SENSITIVE_QUERY_PARAM_RE.sub(r"\1[REDACTED]", request)

        # Ignore noisy requests to staticfiles / favicons / etc.
        if "GET /static/" in request:
            return
        if "GET /health/" in request:
            return
        if "GET /progress.json" in request and (time_taken is None or time_taken < 1.0):
            return
        if "GET /api/v1/crawls/crawl/" in request and "/files/chrome_screencast/latest.jpg" in request:
            return
        if "GET /admin/jsi18n/" in request:
            return
        if request.endswith(self._QUIET_SUFFIXES):
            return
        if str(status) in ("404", "304"):
            return

        # NOTE(review): daphne appears to pass client=None when no peer address is
        # known (e.g. unix sockets) - confirm; guard so logging never raises.
        host = host or "-"
        if host.startswith("127."):
            client = "localhost"
        else:
            # Drop only the trailing ":port" so IPv6 addresses such as "::1:5000"
            # are not reduced to an empty string.
            address, has_port, _port = host.rpartition(":")
            client = address if has_port else host

        # clean up the log format to mostly match the same format as django.conf.settings.LOGGING rich formats
        self.stream.write(
            "%s HTTP %s %s %s\n"
            % (
                date.strftime("%Y-%m-%d %H:%M:%S"),
                request,
                status or "-",
                client,
            ),
        )


access.AccessLogGenerator.write_entry = ModifiedAccessLogGenerator.write_entry  # type: ignore


# --- archivebox/misc/paginators.py (next file in the diff, shares this line range) ---
class AcceleratedPaginator(Paginator):
    """Paginator that accepts exact count hints for already-optimized query paths."""

    @cached_property
    def count(self):
        """Return the total object count, preferring an ``_archivebox_count_hint``.

        The hint is looked up in the queryset's ``__dict__`` first and then in
        its ``query``'s ``__dict__``; a callable hint is called to obtain the
        count. Without a hint, or for non-QuerySet object lists, the stock
        ``Paginator.count`` is used.
        """
        if not isinstance(self.object_list, QuerySet):
            return super().count

        count_hint = self.object_list.__dict__.get("_archivebox_count_hint")
        if count_hint is None:
            count_hint = self.object_list.query.__dict__.get("_archivebox_count_hint")
        if count_hint is None:
            return super().count

        return count_hint() if callable(count_hint) else count_hint
b/archivebox/misc/serve_static.py new file mode 100644 index 0000000000..b0fc4a0df8 --- /dev/null +++ b/archivebox/misc/serve_static.py @@ -0,0 +1,1137 @@ +import html +import json +import re +import os +import sys +import stat +import asyncio +import posixpath +import mimetypes +import importlib +import queue +import threading +import time +import zipfile +from datetime import datetime +from collections.abc import Callable +from pathlib import Path +from urllib.parse import urlencode + +from django import template +from django.core.handlers.asgi import ASGIRequest +from django.contrib.staticfiles import finders +from django.template import TemplateDoesNotExist, loader +from django.views import static +from django.http import StreamingHttpResponse, Http404, HttpResponse, HttpResponseNotModified +from django.utils._os import safe_join +from django.utils.http import http_date +from django.utils.translation import gettext as _ +from abx_plugins.plugins.archivewebpage import replay_preview as archivewebpage_replay +from archivebox.config.common import get_config +from archivebox.misc.logging_util import printable_filesize + + +_HASHES_CACHE: dict[Path, tuple[float, dict[str, str]]] = {} + + +def _load_hash_map(snapshot_dir: Path) -> dict[str, str] | None: + hashes_path = snapshot_dir / "hashes" / "hashes.json" + if not hashes_path.exists(): + return None + try: + mtime = hashes_path.stat().st_mtime + except OSError: + return None + + cached = _HASHES_CACHE.get(hashes_path) + if cached and cached[0] == mtime: + return cached[1] + + try: + data = json.loads(hashes_path.read_text(encoding="utf-8")) + except Exception: + return None + + file_map = {str(entry.get("path")): entry.get("hash") for entry in data.get("files", []) if entry.get("path")} + _HASHES_CACHE[hashes_path] = (mtime, file_map) + return file_map + + +def _hash_for_path(document_root: Path, rel_path: str) -> str | None: + file_map = _load_hash_map(document_root) + if not file_map: + return None + return 
file_map.get(rel_path) + + +def _resolve_archive_path(document_root: str | Path, rel_path: str) -> tuple[Path, str]: + rel_path = posixpath.normpath(rel_path).lstrip("/") if rel_path else "" + fullpath = Path(safe_join(document_root, rel_path)) + if os.access(fullpath, os.R_OK): + return fullpath, rel_path + + root = Path(document_root) + current = root + resolved_parts: list[str] = [] + for part in Path(rel_path).parts: + exact = current / part + if os.access(exact, os.R_OK): + current = exact + resolved_parts.append(part) + continue + + folded_part = part.casefold() + try: + match = next((child for child in current.iterdir() if child.name.casefold() == folded_part), None) + except OSError: + match = None + if match is None: + return fullpath, rel_path + + current = match + resolved_parts.append(match.name) + + return current, posixpath.join(*resolved_parts) if resolved_parts else "" + + +def _cache_policy(config=None, **config_kwargs) -> str: + config = config or get_config(resolve_plugins=False, **config_kwargs) + return "private" if config.PERMISSIONS == "private" else "public" + + +def _render_mhtml_preview_document(filename: str, output_path: str) -> str: + from archivebox.plugins.discovery import get_plugin_template + + template_str = get_plugin_template("chrome_mhtml", "full", fallback=False) + if not template_str: + raise FileNotFoundError("chrome_mhtml/templates/full.html") + + tpl = template.Engine(debug=False).from_string(template_str) + return tpl.render( + template.Context( + { + "output_path": output_path, + "output_path_raw": filename, + "plugin": "chrome_mhtml", + }, + ), + ) + + +def _format_direntry_timestamp(stat_result: os.stat_result) -> str: + timestamp = stat_result.st_birthtime if sys.platform == "darwin" else stat_result.st_mtime + return datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d %H:%M") + + +def _safe_zip_stem(name: str) -> str: + safe_name = re.sub(r"[^A-Za-z0-9._-]+", "-", name).strip("._-") + return safe_name or "archivebox" 
+ + +class _StreamingQueueWriter: + """Expose a write-only file-like object so zipfile can stream into a queue.""" + + def __init__(self, output_queue: queue.Queue[bytes | BaseException | object]) -> None: + self.output_queue = output_queue + self.position = 0 + + def write(self, data: bytes) -> int: + if data: + self.output_queue.put(data) + self.position += len(data) + return len(data) + + def tell(self) -> int: + return self.position + + def flush(self) -> None: + return None + + def close(self) -> None: + return None + + def writable(self) -> bool: + return True + + def seekable(self) -> bool: + return False + + +def _iter_visible_files(root: Path): + """Yield non-hidden files in a stable order so ZIP output is deterministic.""" + + for current_root, dirnames, filenames in os.walk(root): + dirnames[:] = sorted(dirname for dirname in dirnames if not dirname.startswith(".")) + for filename in sorted(name for name in filenames if not name.startswith(".")): + yield Path(current_root) / filename + + +def _build_directory_zip_response( + fullpath: Path, + path: str, + *, + is_archive_replay: bool, + use_async_stream: bool, + config=None, +) -> StreamingHttpResponse: + root_name = _safe_zip_stem(fullpath.name or Path(path).name or "archivebox") + sentinel = object() + output_queue: queue.Queue[bytes | BaseException | object] = queue.Queue(maxsize=8) + initial_chunk_target = 64 * 1024 + initial_chunk_wait = 0.05 + + def build_zip() -> None: + # zipfile wants a write-only file object. Feed those bytes straight into + # a queue so the response can stream them out as soon as they are ready. 
+ writer = _StreamingQueueWriter(output_queue) + try: + with zipfile.ZipFile(writer, mode="w", compression=zipfile.ZIP_DEFLATED, compresslevel=6) as zip_file: + for entry in _iter_visible_files(fullpath): + rel_parts = entry.relative_to(fullpath).parts + arcname = Path(root_name, *rel_parts).as_posix() + zip_file.write(entry, arcname) + except BaseException as err: + output_queue.put(err) + finally: + output_queue.put(sentinel) + + threading.Thread(target=build_zip, name=f"zip-stream-{root_name}", daemon=True).start() + + def iter_zip_chunks(): + # Emit a meaningful first chunk quickly so browsers show the download + # immediately instead of waiting on dozens of tiny ZIP header writes. + first_chunk = bytearray() + initial_deadline = time.monotonic() + initial_chunk_wait + + while True: + timeout = max(initial_deadline - time.monotonic(), 0) if len(first_chunk) < initial_chunk_target else None + try: + chunk = output_queue.get(timeout=timeout) if timeout is not None else output_queue.get() + except queue.Empty: + if first_chunk: + yield bytes(first_chunk) + first_chunk.clear() + continue + chunk = output_queue.get() + + if chunk is sentinel: + if first_chunk: + yield bytes(first_chunk) + break + if isinstance(chunk, BaseException): + raise chunk + if len(first_chunk) < initial_chunk_target: + first_chunk.extend(chunk) + if len(first_chunk) >= initial_chunk_target or time.monotonic() >= initial_deadline: + yield bytes(first_chunk) + first_chunk.clear() + continue + yield chunk + + async def stream_zip_async(): + # Django ASGI buffers sync StreamingHttpResponse iterators by consuming + # them into a list. Drive the same sync iterator from a worker thread so + # Daphne can send each chunk as it arrives instead of buffering the ZIP. 
+ iterator = iter(iter_zip_chunks()) + while True: + chunk = await asyncio.to_thread(next, iterator, None) + if chunk is None: + break + yield chunk + + response = StreamingHttpResponse( + stream_zip_async() if use_async_stream else iter_zip_chunks(), + content_type="application/zip", + ) + response.headers["Content-Disposition"] = f'attachment; filename="{root_name}.zip"' + response.headers["Cache-Control"] = f"{_cache_policy(config=config)}, max-age=60, stale-while-revalidate=300" + response.headers["Last-Modified"] = http_date(fullpath.stat().st_mtime) + response.headers["X-Accel-Buffering"] = "no" + return _apply_archive_replay_headers( + response, + fullpath=fullpath, + content_type="application/zip", + is_archive_replay=is_archive_replay, + config=config, + ) + + +async def _stream_ranged_file_async(ranged_file: "RangedFileReader"): + iterator = iter(ranged_file) + try: + while True: + chunk = await asyncio.to_thread(next, iterator, None) + if chunk is None: + break + yield chunk + finally: + ranged_file.close() + + +def _render_directory_index(request, path: str, fullpath: Path) -> HttpResponse: + try: + template = loader.select_template( + [ + "static/directory_index.html", + "static/directory_index", + ], + ) + except TemplateDoesNotExist: + return static.directory_index(path, fullpath) + + entries = [] + file_list = [] + visible_entries = sorted( + (entry for entry in fullpath.iterdir() if not entry.name.startswith(".")), + key=lambda entry: (not entry.is_dir(), entry.name.lower()), + ) + for entry in visible_entries: + url = str(entry.relative_to(fullpath)) + if entry.is_dir(): + url += "/" + file_list.append(url) + + stat_result = entry.stat() + entries.append( + { + "name": url, + "url": url, + "is_dir": entry.is_dir(), + "size": "โ€”" if entry.is_dir() else printable_filesize(stat_result.st_size), + "timestamp": _format_direntry_timestamp(stat_result), + }, + ) + + zip_query = request.GET.copy() + zip_query["download"] = "zip" + zip_url = request.path 
+ if zip_query: + zip_url = f"{zip_url}?{zip_query.urlencode()}" + + context = { + "directory": f"{path}/", + "file_list": file_list, + "entries": entries, + "zip_url": zip_url, + } + return HttpResponse(template.render(context)) + + +# Ensure common web types are mapped consistently across platforms. +mimetypes.add_type("text/html", ".html") +mimetypes.add_type("text/html", ".htm") +mimetypes.add_type("text/css", ".css") +mimetypes.add_type("application/javascript", ".js") +mimetypes.add_type("application/json", ".json") +mimetypes.add_type("application/x-ndjson", ".jsonl") +mimetypes.add_type("text/markdown", ".md") +mimetypes.add_type("text/yaml", ".yml") +mimetypes.add_type("text/yaml", ".yaml") +mimetypes.add_type("text/csv", ".csv") +mimetypes.add_type("text/tab-separated-values", ".tsv") +mimetypes.add_type("application/xml", ".xml") +mimetypes.add_type("image/svg+xml", ".svg") +mimetypes.add_type("multipart/related", ".mhtml") +mimetypes.add_type("multipart/related", ".mht") + +try: + _markdown = importlib.import_module("markdown").markdown +except ImportError: + _markdown: Callable[..., str] | None = None + +MARKDOWN_INLINE_LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)\s]+(?:\([^)]*\)[^)\s]*)*)\)") +MARKDOWN_INLINE_IMAGE_RE = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)") +MARKDOWN_BOLD_RE = re.compile(r"\*\*([^*]+)\*\*") +MARKDOWN_ITALIC_RE = re.compile(r"(?]*>") +HTML_BODY_RE = re.compile(r"]*>(.*)", flags=re.IGNORECASE | re.DOTALL) +RISKY_REPLAY_MIMETYPES = { + "text/html", + "application/xhtml+xml", + "image/svg+xml", +} +RISKY_REPLAY_EXTENSIONS = {".html", ".htm", ".xhtml", ".svg", ".svgz"} +RISKY_REPLAY_MARKERS = ( + " str: + candidate = text + body_match = HTML_BODY_RE.search(candidate) + if body_match: + candidate = body_match.group(1) + candidate = re.sub(r"^\s*]*>", "", candidate, flags=re.IGNORECASE) + candidate = re.sub(r"

\s*$", "", candidate, flags=re.IGNORECASE) + return candidate.strip() + + +def _looks_like_markdown(text: str) -> bool: + lower = text.lower() + if "" in lower: + return False + md_markers = 0 + md_markers += len(re.findall(r"^\s{0,3}#{1,6}\s+\S", text, flags=re.MULTILINE)) + md_markers += len(re.findall(r"^\s*[-*+]\s+\S", text, flags=re.MULTILINE)) + md_markers += len(re.findall(r"^\s*\d+\.\s+\S", text, flags=re.MULTILINE)) + md_markers += text.count("[TOC]") + md_markers += len(MARKDOWN_INLINE_LINK_RE.findall(text)) + md_markers += text.count("\n---") + text.count("\n***") + return md_markers >= 6 + + +def _render_text_preview_document(text: str, title: str) -> str: + escaped_title = html.escape(title) + escaped_text = html.escape(text) + return f""" + + + + + {escaped_title} + + + +
{escaped_title}
+
{escaped_text}
+ +""" + + +def _render_image_preview_document(image_url: str, title: str) -> str: + escaped_title = html.escape(title) + escaped_url = html.escape(image_url, quote=True) + return f""" + + + + + {escaped_title} + + + +
+ {escaped_title} +
+ +""" + + +def _render_markdown_fallback(text: str) -> str: + if _markdown is not None and not HTML_TAG_RE.search(text): + try: + return _markdown( + text, + extensions=["extra", "toc", "sane_lists"], + output_format="html", + ) + except Exception: + pass + + lines = text.splitlines() + headings = [] + + def slugify(value: str) -> str: + slug = re.sub(r"[^A-Za-z0-9]+", "-", value).strip("-") + return slug or "section" + + for raw_line in lines: + heading_match = re.match(r"^\s{0,3}(#{1,6})\s+(.*)$", raw_line) + if heading_match: + level = len(heading_match.group(1)) + content = heading_match.group(2).strip() + headings.append((level, content, slugify(content))) + + html_lines = [] + in_code = False + in_ul = False + in_ol = False + in_blockquote = False + + def render_inline(markup: str) -> str: + content = MARKDOWN_INLINE_IMAGE_RE.sub(r'\1', markup) + content = MARKDOWN_INLINE_LINK_RE.sub(r'\1', content) + content = MARKDOWN_BOLD_RE.sub(r"\1", content) + content = MARKDOWN_ITALIC_RE.sub(r"\1", content) + return content + + def close_lists(): + nonlocal in_ul, in_ol + if in_ul: + html_lines.append("") + in_ul = False + if in_ol: + html_lines.append("") + in_ol = False + + for raw_line in lines: + line = raw_line.rstrip("\n") + stripped = line.strip() + + if stripped.startswith("```"): + if in_code: + html_lines.append("") + in_code = False + else: + close_lists() + if in_blockquote: + html_lines.append("") + in_blockquote = False + html_lines.append("
")
+                in_code = True
+            continue
+
+        if in_code:
+            html_lines.append(html.escape(line))
+            continue
+
+        if not stripped:
+            close_lists()
+            if in_blockquote:
+                html_lines.append("")
+                in_blockquote = False
+            html_lines.append("
") + continue + + heading_match = re.match(r"^\s*((?:<[^>]+>\s*)*)(#{1,6})\s+(.*)$", line) + if heading_match: + close_lists() + if in_blockquote: + html_lines.append("") + in_blockquote = False + leading_tags = heading_match.group(1).strip() + level = len(heading_match.group(2)) + content = heading_match.group(3).strip() + if leading_tags: + html_lines.append(leading_tags) + html_lines.append(f'{render_inline(content)}') + continue + + if stripped in ("---", "***"): + close_lists() + html_lines.append("
") + continue + + if stripped.startswith("> "): + if not in_blockquote: + close_lists() + html_lines.append("
") + in_blockquote = True + content = stripped[2:] + html_lines.append(render_inline(content)) + continue + else: + if in_blockquote: + html_lines.append("
") + in_blockquote = False + + ul_match = re.match(r"^\s*[-*+]\s+(.*)$", line) + if ul_match: + if in_ol: + html_lines.append("") + in_ol = False + if not in_ul: + html_lines.append("
    ") + in_ul = True + html_lines.append(f"
  • {render_inline(ul_match.group(1))}
  • ") + continue + + ol_match = re.match(r"^\s*\d+\.\s+(.*)$", line) + if ol_match: + if in_ul: + html_lines.append("
") + in_ul = False + if not in_ol: + html_lines.append("
    ") + in_ol = True + html_lines.append(f"
  1. {render_inline(ol_match.group(1))}
  2. ") + continue + + close_lists() + + # Inline conversions (leave raw HTML intact) + if stripped == "[TOC]": + toc_items = [] + for level, title, slug in headings: + toc_items.append( + f'
  3. {title}
  4. ', + ) + html_lines.append( + '", + ) + continue + + html_lines.append(f"

    {render_inline(line)}

    ") + + close_lists() + if in_blockquote: + html_lines.append("") + if in_code: + html_lines.append("
def _content_type_base(content_type: str) -> str:
    """Return the lower-cased media type of *content_type* without parameters.

    ``"Text/HTML; charset=utf-8"`` becomes ``"text/html"``; a falsy value
    becomes ``""``.
    """
    return (content_type or "").split(";", 1)[0].strip().lower()


def _is_risky_replay_document(fullpath: Path, content_type: str) -> bool:
    """Return True if the file should be treated as an HTML/SVG-like document.

    Checks, in order: the file extension against ``RISKY_REPLAY_EXTENSIONS``,
    the base content type against ``RISKY_REPLAY_MIMETYPES``, and finally
    whether any of ``RISKY_REPLAY_MARKERS`` occurs in the lower-cased first
    4096 bytes of the file. Unreadable files are reported as not risky.
    """
    if fullpath.suffix.lower() in RISKY_REPLAY_EXTENSIONS:
        return True

    if _content_type_base(content_type) in RISKY_REPLAY_MIMETYPES:
        return True

    # Unknown archived response paths often have no extension. Sniff a small prefix
    # so one-domain no-JS mode still catches HTML/SVG documents.
    try:
        # Read only the prefix: read_bytes() would pull the entire file (which
        # may be a multi-gigabyte media file or WACZ) into memory first.
        with fullpath.open("rb") as file_obj:
            head = file_obj.read(4096).decode("utf-8", errors="ignore").lower()
    except OSError:
        return False

    return any(marker in head for marker in RISKY_REPLAY_MARKERS)


def _apply_archive_replay_headers(
    response: "HttpResponse",
    *,
    fullpath: Path,
    content_type: str,
    is_archive_replay: bool,
    config=None,
    **config_kwargs,
) -> "HttpResponse":
    """Add archive-replay security headers to *response* and return it.

    Nothing is changed unless *is_archive_replay* is true. Otherwise
    ``X-Content-Type-Options`` and ``X-ArchiveBox-Security-Mode`` are set if
    absent, and when ``config.SHOULD_NEUTER_RISKY_REPLAY`` is on and the file
    looks like an HTML/SVG document, a sandboxing script-free
    ``Content-Security-Policy`` is applied (overwriting any existing one)
    together with a default ``Referrer-Policy``.

    Args:
        response: Response whose ``headers`` mapping is updated in place.
        fullpath: File being served; inspected to decide whether it is risky.
        content_type: Content type the file is served with.
        is_archive_replay: Whether the response replays archived content.
        config: Optional config; loaded via ``get_config`` (with
            ``**config_kwargs``) when omitted.
    """
    if not is_archive_replay:
        return response

    response.headers.setdefault("X-Content-Type-Options", "nosniff")
    config = config or get_config(resolve_plugins=False, **config_kwargs)
    response.headers.setdefault("X-ArchiveBox-Security-Mode", config.SERVER_SECURITY_MODE)

    if config.SHOULD_NEUTER_RISKY_REPLAY and _is_risky_replay_document(fullpath, content_type):
        response.headers["Content-Security-Policy"] = (
            "sandbox; "
            "default-src 'self' data: blob:; "
            "script-src 'none'; "
            "object-src 'none'; "
            "base-uri 'none'; "
            "form-action 'none'; "
            "connect-src 'none'; "
            "worker-src 'none'; "
            "frame-ancestors 'self'; "
            "style-src 'self' 'unsafe-inline' data: blob:; "
            "img-src 'self' data: blob:; "
            "media-src 'self' data: blob:; "
            "font-src 'self' data: blob:;"
        )
        # NOTE(review): the collapsed source does not show whether this line sits
        # inside the `if`; placed here because it directly follows the CSP - confirm.
        response.headers.setdefault("Referrer-Policy", "no-referrer")

    return response
def _is_asgi_request(request) -> bool:
    """Return True if *request* is an ``ASGIRequest`` or has a ``scope`` entry in its ``__dict__``."""
    return isinstance(request, ASGIRequest) or "scope" in request.__dict__


def _set_validator_headers(response, *, mtime: float, etag: str | None, config) -> None:
    """Set ``Last-Modified``, ``ETag`` and ``Cache-Control`` on *response*.

    With an ETag the response is cached as immutable for a year; without one
    it gets a 60 second max-age with stale-while-revalidate.
    """
    response.headers["Last-Modified"] = http_date(mtime)
    if etag:
        response.headers["ETag"] = etag
        response.headers["Cache-Control"] = f"{_cache_policy(config=config)}, max-age=31536000, immutable"
    else:
        response.headers["Cache-Control"] = f"{_cache_policy(config=config)}, max-age=60, stale-while-revalidate=300"


def _build_inline_response(
    body,
    *,
    content_type: str,
    fullpath: Path,
    mtime: float,
    etag: str | None,
    encoding: str | None,
    is_archive_replay: bool,
    config,
):
    """Wrap an in-memory *body* in an inline ``HttpResponse`` with cache and replay headers."""
    response = HttpResponse(body, content_type=content_type)
    _set_validator_headers(response, mtime=mtime, etag=etag, config=config)
    response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"'
    if encoding:
        response.headers["Content-Encoding"] = encoding
    return _apply_archive_replay_headers(
        response,
        fullpath=fullpath,
        content_type=content_type,
        is_archive_replay=is_archive_replay,
        config=config,
    )


def serve_static_with_byterange_support(request, path, document_root=None, show_indexes=False, is_archive_replay: bool = False):
    """
    Overrides Django's built-in django.views.static.serve function to support byte range requests.
    This allows you to do things like seek into the middle of a huge mp4 or WACZ without downloading the whole file.
    https://github.com/satchamo/django/commit/2ce75c5c4bee2a858c0214d136bfcd351fcde11d

    Args:
        request: Incoming request; ``GET["preview"]``, ``GET["download"]`` and the
            ``If-None-Match`` / ``If-Modified-Since`` / ``Range`` headers are honoured.
        path: Requested path relative to *document_root*.
        document_root: Directory the path is resolved under (required).
        show_indexes: Allow directory listings and ``?download=zip`` for directories.
        is_archive_replay: Apply archive-replay security headers to the response.

    Returns:
        A directory index, ZIP stream, HTML preview, 304, 206, 416 or plain
        streamed file response.

    Raises:
        Http404: If the path is unreadable, or is a directory while
            *show_indexes* is false.
    """
    assert document_root
    config = request.__dict__.get("archivebox_config")
    if config is None:
        config = get_config(resolve_plugins=False)
    fullpath, path = _resolve_archive_path(document_root, path)

    if os.access(fullpath, os.R_OK) and fullpath.is_dir():
        if not show_indexes:
            raise Http404(_("Directory indexes are not allowed here."))
        if request.GET.get("download") == "zip":
            return _build_directory_zip_response(
                fullpath,
                path,
                is_archive_replay=is_archive_replay,
                use_async_stream=_is_asgi_request(request),
                config=config,
            )
        response = _render_directory_index(request, path, fullpath)
        response.headers["Cache-Control"] = f"{_cache_policy(config=config)}, max-age=60, stale-while-revalidate=300"
        response.headers["Last-Modified"] = http_date(fullpath.stat().st_mtime)
        return _apply_archive_replay_headers(
            response,
            fullpath=fullpath,
            content_type="text/html",
            is_archive_replay=is_archive_replay,
            config=config,
        )
    if not os.access(fullpath, os.R_OK):
        # The curly quotes were stored as mojibake ("โ€œ…โ€"); restored to U+201C/U+201D.
        raise Http404(_("“%(path)s” does not exist") % {"path": fullpath})

    statobj = fullpath.stat()
    mtime = statobj.st_mtime

    # A content hash recorded for this file (if any) doubles as a strong ETag.
    etag = None
    file_hash = _hash_for_path(Path(document_root), path)
    if file_hash:
        etag = f'"{file_hash}"'

    if etag:
        inm = request.META.get("HTTP_IF_NONE_MATCH")
        if inm:
            # Compare with surrounding quotes stripped on both sides.
            client_tags = {item.strip().strip('"') for item in inm.split(",")}
            if etag.strip('"') in client_tags:
                not_modified = HttpResponseNotModified()
                not_modified.headers["ETag"] = etag
                not_modified.headers["Cache-Control"] = f"{_cache_policy(config=config)}, max-age=31536000, immutable"
                not_modified.headers["Last-Modified"] = http_date(mtime)
                return _apply_archive_replay_headers(
                    not_modified,
                    fullpath=fullpath,
                    content_type="",
                    is_archive_replay=is_archive_replay,
                    config=config,
                )

    content_type, encoding = mimetypes.guess_type(str(fullpath))
    content_type = content_type or "application/octet-stream"
    # Add charset for text-like types (best guess), but don't override the type.
    is_text_like = content_type.startswith("text/") or content_type in {
        "application/json",
        "application/javascript",
        "application/xml",
        "application/x-ndjson",
        "image/svg+xml",
    }
    if is_text_like and "charset=" not in content_type:
        content_type = f"{content_type}; charset=utf-8"

    wants_preview = bool(request.GET.get("preview"))
    preview_as_text_html = wants_preview and is_text_like and not content_type.startswith(("text/html", "image/svg+xml"))
    preview_as_image_html = wants_preview and content_type.startswith("image/") and not content_type.startswith("image/svg+xml")
    preview_as_mhtml_html = wants_preview and fullpath.suffix.lower() in {".mhtml", ".mht"}
    preview_as_archivewebpage_html = wants_preview and archivewebpage_replay.is_replay_target(fullpath.name)

    # Respect the If-Modified-Since header for non-markdown responses.
    if not content_type.startswith(("text/plain", "text/html")):
        if not static.was_modified_since(request.META.get("HTTP_IF_MODIFIED_SINCE"), mtime):
            return _apply_archive_replay_headers(
                HttpResponseNotModified(),
                fullpath=fullpath,
                content_type=content_type,
                is_archive_replay=is_archive_replay,
                config=config,
            )

    # Keyword arguments shared by every in-memory HTML/text response below.
    inline_kwargs = {
        "fullpath": fullpath,
        "mtime": mtime,
        "etag": etag,
        "encoding": encoding,
        "is_archive_replay": is_archive_replay,
        "config": config,
    }

    # Each preview below is best-effort: on any failure fall through and serve
    # the raw file instead of turning a preview problem into an error page.

    # Wrap text-like outputs in HTML when explicitly requested for iframe previewing.
    if preview_as_text_html:
        try:
            max_preview_size = 10 * 1024 * 1024
            if statobj.st_size <= max_preview_size:
                decoded = fullpath.read_text(encoding="utf-8", errors="replace")
                wrapped = _render_text_preview_document(decoded, fullpath.name)
                return _build_inline_response(wrapped, content_type="text/html; charset=utf-8", **inline_kwargs)
        except Exception:
            pass

    if preview_as_image_html:
        try:
            preview_query = request.GET.copy()
            preview_query.pop("preview", None)
            raw_image_url = request.path
            if preview_query:
                raw_image_url = f"{raw_image_url}?{urlencode(list(preview_query.lists()), doseq=True)}"
            wrapped = _render_image_preview_document(raw_image_url, fullpath.name)
            return _build_inline_response(wrapped, content_type="text/html; charset=utf-8", **inline_kwargs)
        except Exception:
            pass

    if preview_as_archivewebpage_html:
        try:
            raw_query = request.GET.copy()
            raw_query.pop("preview", None)
            raw_output_path = request.path
            if raw_query:
                raw_output_path = f"{raw_output_path}?{raw_query.urlencode()}"
            body, preview_content_type, headers = archivewebpage_replay.render_preview_response(
                fullpath.name,
                raw_output_path,
                wacz_path=fullpath,
                # getattr: a request without this attribute used to raise an
                # AttributeError that was swallowed below, silently disabling the preview.
                fallback_url=getattr(request, "archivebox_snapshot_url", None) or "",
                last_modified=http_date(mtime),
                etag=etag or "",
                cache_control=(
                    f"{_cache_policy(config=config)}, max-age=31536000, immutable"
                    if etag
                    else f"{_cache_policy(config=config)}, max-age=60, stale-while-revalidate=300"
                ),
                content_encoding=encoding or "",
            )
            response = HttpResponse(body, content_type=preview_content_type)
            for key, value in headers.items():
                response.headers[key] = value
            return response
        except Exception:
            pass

    if preview_as_mhtml_html:
        try:
            raw_query = request.GET.copy()
            raw_query.pop("preview", None)
            raw_output_path = request.path
            if raw_query:
                raw_output_path = f"{raw_output_path}?{raw_query.urlencode()}"
            rendered = _render_mhtml_preview_document(fullpath.name, raw_output_path)
            response = HttpResponse(rendered, content_type="text/html; charset=utf-8")
            _set_validator_headers(response, mtime=mtime, etag=etag, config=config)
            response.headers["Content-Disposition"] = f'inline; filename="{fullpath.stem}.html"'
            response.headers["X-Content-Type-Options"] = "nosniff"
            response.headers["Content-Security-Policy"] = (
                "default-src 'self' data: blob:; "
                "script-src 'unsafe-inline'; "
                "style-src 'unsafe-inline' data: blob:; "
                "connect-src 'self'; "
                "frame-src 'self' data: blob:; "
                "object-src 'none'; "
                "base-uri 'none'; "
                "form-action 'none';"
            )
            if encoding:
                response.headers["Content-Encoding"] = encoding
            return response
        except Exception:
            pass

    # Heuristic fix: some archived HTML outputs (e.g. mercury content.html)
    # are stored with HTML-escaped markup or markdown sources. If so, render sensibly.
    if content_type.startswith(("text/plain", "text/html")):
        try:
            max_unescape_size = 10 * 1024 * 1024  # 10MB cap to avoid heavy memory use
            if statobj.st_size <= max_unescape_size:
                decoded = fullpath.read_bytes().decode("utf-8", errors="replace")
                # Count escaped entities, not raw brackets: with "<" / ">" the
                # comparison below measured nothing about escaping.
                escaped_count = decoded.count("&lt;") + decoded.count("&gt;")
                tag_count = decoded.count("<")
                mostly_escaped = bool(escaped_count and escaped_count > tag_count * 2)
                if mostly_escaped:
                    decoded = html.unescape(decoded)
                markdown_candidate = _extract_markdown_candidate(decoded)
                if _looks_like_markdown(markdown_candidate):
                    wrapped = _render_markdown_document(markdown_candidate)
                    return _build_inline_response(wrapped, content_type="text/html; charset=utf-8", **inline_kwargs)
                if mostly_escaped:
                    return _build_inline_response(decoded, content_type=content_type, **inline_kwargs)
        except Exception:
            pass

    # Validate the Range header before opening the file so the 416 path cannot
    # leak an open file handle.
    is_regular_file = stat.S_ISREG(statobj.st_mode)
    size = statobj.st_size
    byte_range = None
    if is_regular_file and "HTTP_RANGE" in request.META:
        try:
            ranges = parse_range_header(request.META["HTTP_RANGE"], size)
        except ValueError:
            ranges = None
        # only handle syntactically valid headers, that are simple (no
        # multipart byteranges)
        if ranges is not None and len(ranges) == 1:
            start, stop = ranges[0]
            if stop > size:
                # requested range not satisfiable; tell the client the real length
                unsatisfiable = HttpResponse(status=416)
                unsatisfiable.headers["Content-Range"] = f"bytes */{size}"
                return unsatisfiable
            byte_range = (start, stop)

    # setup response object
    ranged_file = RangedFileReader(open(fullpath, "rb"))
    response = StreamingHttpResponse(
        _stream_ranged_file_async(ranged_file) if _is_asgi_request(request) else ranged_file,
        content_type=content_type,
    )
    _set_validator_headers(response, mtime=mtime, etag=etag, config=config)
    if is_text_like:
        response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"'
    if content_type.startswith("image/"):
        # Honour the configured cache policy: a hard-coded "public" let shared
        # caches store images from private archives.
        response.headers["Cache-Control"] = f"{_cache_policy(config=config)}, max-age=604800, immutable"

    # handle byte-range requests by serving chunk of file
    if is_regular_file:
        response["Content-Length"] = size
        response["Accept-Ranges"] = "bytes"
        response["X-Django-Ranges-Supported"] = "1"
        if byte_range is not None:
            start, stop = byte_range
            ranged_file.start = start
            ranged_file.stop = stop
            response["Content-Range"] = "bytes %d-%d/%d" % (start, stop - 1, size)
            response["Content-Length"] = stop - start
            response.status_code = 206
    if encoding:
        response.headers["Content-Encoding"] = encoding
    return _apply_archive_replay_headers(
        response,
        fullpath=fullpath,
        content_type=content_type,
        is_archive_replay=is_archive_replay,
        config=config,
    )
+ """ + + normalized_path = posixpath.normpath(path).lstrip("/") + absolute_path = finders.find(normalized_path) + if not absolute_path: + if path.endswith("/") or path == "": + raise Http404("Directory indexes are not allowed here.") + raise Http404("'%s' could not be found" % path) + document_root, path = os.path.split(absolute_path) + return serve_static_with_byterange_support(request, path, document_root=document_root, **kwargs) + + +def parse_range_header(header, resource_size): + """ + Parses a range header into a list of two-tuples (start, stop) where `start` + is the starting byte of the range (inclusive) and `stop` is the ending byte + position of the range (exclusive). + Returns None if the value of the header is not syntactically valid. + https://github.com/satchamo/django/commit/2ce75c5c4bee2a858c0214d136bfcd351fcde11d + """ + if not header or "=" not in header: + return None + + ranges = [] + units, range_ = header.split("=", 1) + units = units.strip().lower() + + if units != "bytes": + return None + + for val in range_.split(","): + val = val.strip() + if "-" not in val: + return None + + if val.startswith("-"): + # suffix-byte-range-spec: this form specifies the last N bytes of an + # entity-body + start = resource_size + int(val) + if start < 0: + start = 0 + stop = resource_size + else: + # byte-range-spec: first-byte-pos "-" [last-byte-pos] + start, stop = val.split("-", 1) + start = int(start) + # the +1 is here since we want the stopping point to be exclusive, whereas in + # the HTTP spec, the last-byte-pos is inclusive + stop = int(stop) + 1 if stop else resource_size + if start >= stop: + return None + + ranges.append((start, stop)) + + return ranges + + +class RangedFileReader: + """ + Wraps a file like object with an iterator that runs over part (or all) of + the file defined by start and stop. Blocks of block_size will be returned + from the starting position, up to, but not including the stop point. 
+ https://github.com/satchamo/django/commit/2ce75c5c4bee2a858c0214d136bfcd351fcde11d + """ + + block_size = 8192 + + def __init__(self, file_like, start=0, stop=float("inf"), block_size=None): + self.f = file_like + self.block_size = block_size or RangedFileReader.block_size + self.start = start + self.stop = stop + + def __iter__(self): + try: + self.f.seek(self.start) + position = self.start + while position < self.stop: + data = self.f.read(min(self.block_size, self.stop - position)) + if not data: + break + + yield data + position += len(data) + finally: + self.close() + + def close(self): + self.f.close() diff --git a/archivebox/misc/shell_welcome_message.py b/archivebox/misc/shell_welcome_message.py new file mode 100644 index 0000000000..39ffeaa9a0 --- /dev/null +++ b/archivebox/misc/shell_welcome_message.py @@ -0,0 +1,64 @@ +__package__ = "archivebox.core" + +from rich.console import Console + +# helpful imports that make the shell easier to work with out-of-the-box: +import re # noqa +import os # noqa +import sys # noqa +import json # noqa +import psutil # noqa +import django # noqa +import pydantic # noqa +import requests # noqa +import subprocess # noqa +import archivebox +from django.utils import timezone # noqa +from datetime import datetime, timedelta # noqa + +from archivebox import CONSTANTS # noqa +from archivebox.cli import * # noqa +from archivebox.config.common import get_config +from archivebox.misc.logging import AttrDict # noqa + + +if __name__ == "__main__": + CONFIG = get_config() + + # load the rich extension for ipython for pretty printing + # https://rich.readthedocs.io/en/stable/introduction.html#ipython-extension + get_ipython().run_line_magic("load_ext", "rich") # type: ignore # noqa + + # prnt = print with cropping using ... 
ellipsis for helptext that doesn't matter that much + console = Console() + prnt = lambda *args, **kwargs: console.print(*args, overflow="ellipsis", soft_wrap=True, **kwargs) + + # print the welcome message + prnt("[green]import re, os, sys, psutil, subprocess, requests, json, pydantic, django[/]") + prnt("[yellow4]# ArchiveBox Imports[/]") + prnt("[yellow4]import archivebox[/]") + prnt("[yellow4]from archivebox.cli import *[/]") + prnt("[yellow4]from archivebox.misc.logging import AttrDict[/]") + prnt() + + if console.width >= 80: + from archivebox.misc.logging import rainbow + + prnt(rainbow(archivebox.ASCII_LOGO)) + + prnt("[i] :heavy_dollar_sign: Welcome to the ArchiveBox Shell!") + prnt( + " [deep_sky_blue4]Docs:[/deep_sky_blue4] [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage[/link]", + ) + prnt( + " [link=https://docs.archivebox.io/en/dev/apidocs/archivebox/archivebox.html]https://docs.archivebox.io/en/dev/apidocs/archivebox/archivebox.html[/link]", + ) + prnt() + prnt(" :grey_question: [violet]Hint[/] [i]Here are some examples to get started:[/]") + prnt( + " add[blink][deep_sky_blue4]?[/deep_sky_blue4][/blink] [grey53]# add ? after anything to get help[/]", + ) + prnt(' add("https://example.com/some/new/url") [grey53]# call CLI methods from the shell[/]') + prnt(' snap = Snapshot.objects.filter(url__contains="https://example.com").last() [grey53]# query for individual snapshots[/]') + prnt(" snap.archiveresult_set.all() [grey53]# see extractor plugin results[/]") + prnt(' bool(re.compile(CONFIG.URL_DENYLIST).search("https://example.com/abc.exe")) [grey53]# test out a config change[/]') diff --git a/archivebox/misc/system.py b/archivebox/misc/system.py new file mode 100644 index 0000000000..13e937db36 --- /dev/null +++ b/archivebox/misc/system.py @@ -0,0 +1,80 @@ +__package__ = "archivebox.misc" + +# Post-bootstrap filesystem utilities (atomic_write, get_dir_size). 
+# Requires archivebox.config (uses get_config for permissions/output settings). +# Not safe to import pre-bootstrap. + +import os + +from json import dump +from pathlib import Path + +from atomicwrites import atomic_write as lib_atomic_write + +from archivebox.config.common import get_config +from archivebox.misc.util import enforce_types, ExtendedEncoder + + +@enforce_types +def atomic_write(path: Path | str, contents: dict | str | bytes, overwrite: bool = True, config=None, **config_kwargs) -> None: + """Safe atomic write to filesystem by writing to temp file + atomic rename""" + + mode = "wb+" if isinstance(contents, bytes) else "w" + encoding = None if isinstance(contents, bytes) else "utf-8" # enforce utf-8 on all text writes + + # print('\n> Atomic Write:', mode, path, len(contents), f'overwrite={overwrite}') + try: + with lib_atomic_write(path, mode=mode, overwrite=overwrite, encoding=encoding) as f: + if isinstance(contents, dict): + dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder) + elif isinstance(contents, (bytes, str)): + f.write(contents) + except OSError as e: + config = config or get_config(**config_kwargs) + if config.ENFORCE_ATOMIC_WRITES: + print(f"[X] OSError: Failed to write {path} with fcntl.F_FULLFSYNC. 
({e})") + print( + " You can store the archive/ subfolder on a hard drive or network share that doesn't support support synchronous writes,", + ) + print( + " but the main folder containing the index.sqlite3 and ArchiveBox.conf files must be on a filesystem that supports FSYNC.", + ) + raise SystemExit(1) + + # retry the write without forcing FSYNC (aka atomic mode) + with open(path, mode=mode, encoding=encoding) as f: + if isinstance(contents, dict): + dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder) + elif isinstance(contents, (bytes, str)): + f.write(contents) + + # set file permissions + config = config or get_config(**config_kwargs) + os.chmod(path, int(config.OUTPUT_PERMISSIONS, base=8)) + + +@enforce_types +def get_dir_size(path: str | Path, recursive: bool = True, pattern: str | None = None) -> tuple[int, int, int]: + """get the total disk size of a given directory, optionally summing up + recursively and limiting to a given filter list + """ + num_bytes, num_dirs, num_files = 0, 0, 0 + try: + for entry in os.scandir(path): + if (pattern is not None) and (pattern not in entry.path): + continue + if entry.is_dir(follow_symlinks=False): + if not recursive: + continue + num_dirs += 1 + bytes_inside, dirs_inside, files_inside = get_dir_size(entry.path) + num_bytes += bytes_inside + num_dirs += dirs_inside + num_files += files_inside + else: + num_bytes += entry.stat(follow_symlinks=False).st_size + num_files += 1 + except OSError: + # e.g. 
FileNameTooLong or other error while trying to read dir + pass + return num_bytes, num_dirs, num_files diff --git a/archivebox/misc/toml_util.py b/archivebox/misc/toml_util.py new file mode 100644 index 0000000000..33daaa4065 --- /dev/null +++ b/archivebox/misc/toml_util.py @@ -0,0 +1,119 @@ +from typing import Any, cast +from collections.abc import Callable + +import json +import ast +import inspect +import toml +import re +import configparser + +from pathlib import Path, PosixPath + +from pydantic.json_schema import GenerateJsonSchema +from pydantic_core import to_jsonable_python + +JSONValue = str | bool | int | None | list["JSONValue"] + +TOML_HEADER = "# Converted from INI to TOML format: https://toml.io/en/\n\n" + + +def load_ini_value(val: str) -> JSONValue: + """Convert lax INI values into strict TOML-compliant (JSON) values""" + if val.lower() in ("true", "yes", "1"): + return True + if val.lower() in ("false", "no", "0"): + return False + if val.isdigit(): + return int(val) + + try: + return ast.literal_eval(val) + except Exception: + pass + + try: + return json.loads(val) + except Exception: + pass + + return val + + +def convert(ini_str: str) -> str: + """Convert a string of INI config into its TOML equivalent (warning: strips comments)""" + + config = configparser.ConfigParser() + setattr(config, "optionxform", str) # capitalize key names + config.read_string(ini_str) + + # Initialize an empty dictionary to store the TOML representation + toml_dict = {} + + # Iterate over each section in the INI configuration + for section in config.sections(): + toml_dict[section] = {} + + # Iterate over each key-value pair in the section + for key, value in config.items(section): + parsed_value = load_ini_value(value) + + # Convert the parsed value to its TOML-compatible JSON representation + toml_dict[section.upper()][key.upper()] = json.dumps(parsed_value) + + # Build the TOML string + toml_str = TOML_HEADER + for section, items in toml_dict.items(): + toml_str += 
f"[{section}]\n" + for key, value in items.items(): + toml_str += f"{key} = {value}\n" + toml_str += "\n" + + return toml_str.strip() + + +class JSONSchemaWithLambdas(GenerateJsonSchema): + """ + Encode lambda functions in default values properly. + Usage: + >>> json.dumps(value, encoder=JSONSchemaWithLambdas()) + """ + + def encode_default(self, dft: Any) -> Any: + config = self._config + if isinstance(dft, Callable): + return "{{lambda " + inspect.getsource(dft).split("=lambda ")[-1].strip()[:-1] + "}}" + return to_jsonable_python( + dft, + timedelta_mode=config.ser_json_timedelta, + bytes_mode=config.ser_json_bytes, + serialize_unknown=True, + ) + + # for computed_field properties render them like this instead: + # inspect.getsource(field.wrapped_property.fget).split('def ', 1)[-1].split('\n', 1)[-1].strip().strip('return '), + + +def better_toml_dump_str(val: Any) -> str: + try: + dump_str = cast(Callable[[Any], str], toml.encoder._dump_str) + return dump_str(val) + except Exception: + # if we hit any of toml's numerous encoding bugs, + # fall back to using json representation of string + return json.dumps(str(val)) + + +class CustomTOMLEncoder(toml.encoder.TomlEncoder): + """ + Custom TomlEncoder to work around https://github.com/uiri/toml's many encoding bugs. + >>> toml.dumps(value, encoder=CustomTOMLEncoder()) + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + dump_funcs = cast(dict[Any, Callable[[Any], str]], self.dump_funcs) + dump_funcs[Path] = lambda x: json.dumps(str(x)) + dump_funcs[PosixPath] = lambda x: json.dumps(str(x)) + dump_funcs[str] = better_toml_dump_str + dump_funcs[re.RegexFlag] = better_toml_dump_str diff --git a/archivebox/misc/util.py b/archivebox/misc/util.py new file mode 100644 index 0000000000..f8ff1e680a --- /dev/null +++ b/archivebox/misc/util.py @@ -0,0 +1,525 @@ +__package__ = "archivebox.misc" + +# Bootable utility functions (URL parsing, date parsing, JSON encoding, decorators). 
+# MUST NOT import archivebox.config, archivebox.core, or Django โ€” this module is +# loaded by hooks.py and other early code paths. Only depends on stdlib and the +# bootable .logging module. + +import re +import json as pyjson +import http.cookiejar +from decimal import Decimal, InvalidOperation + +from typing import Any +from collections.abc import Callable +from pathlib import Path +from inspect import signature +from functools import wraps +from urllib.parse import urlparse, quote, unquote +from html import escape, unescape +from datetime import datetime, timezone + +from .logging import COLOR_DICT + + +def filter_queryset_by_uuid_substring(queryset, slug: str, field: str = "id"): + """Filter a queryset to UUID-column matches by prefix or suffix (case-insensitive). + + Avoids ``id__icontains`` (an unindexed full-table scan over the UUID column) by + stripping non-hex chars from ``slug`` and matching with ``istartswith`` / + ``iendswith``. Returns an empty queryset for inputs with fewer than 8 hex chars + to avoid overly broad matches. A full 32-char hex string falls back to an + exact-equality lookup. 
+ """ + from django.db.models import Q + + normalized = re.sub(r"[^0-9a-fA-F]", "", slug or "").lower() + if len(normalized) < 8: + return queryset.none() + if len(normalized) == 32: + return queryset.filter(**{field: normalized}) + prefix = f"{field}__istartswith" + suffix = f"{field}__iendswith" + return queryset.filter(Q(**{prefix: normalized}) | Q(**{suffix: normalized})) + + +### Parsing Helpers + +# All of these are (str) -> str +# shortcuts to: https://docs.python.org/3/library/urllib.parse.html#url-parsing +scheme = lambda url: urlparse(url).scheme.lower() +without_scheme = lambda url: urlparse(url)._replace(scheme="").geturl().strip("//") +without_query = lambda url: urlparse(url)._replace(query="").geturl().strip("//") +without_fragment = lambda url: urlparse(url)._replace(fragment="").geturl().strip("//") +without_path = lambda url: urlparse(url)._replace(path="", fragment="", query="").geturl().strip("//") +path = lambda url: urlparse(url).path +basename = lambda url: urlparse(url).path.rsplit("/", 1)[-1] +domain = lambda url: urlparse(url).netloc +query = lambda url: urlparse(url).query +fragment = lambda url: urlparse(url).fragment +extension = lambda url: basename(url).rsplit(".", 1)[-1].lower() if "." 
in basename(url) else "" +base_url = lambda url: without_scheme(url) # uniq base url used to dedupe links + +urlencode = lambda s: s and quote(s, encoding="utf-8", errors="replace") +urldecode = lambda s: s and unquote(s) +htmlencode = lambda s: s and escape(s, quote=True) +htmldecode = lambda s: s and unescape(s) + + +def ts_to_date_str(ts: Any) -> str | None: + parsed = parse_date(ts) + return None if parsed is None else parsed.strftime("%Y-%m-%d %H:%M") + + +COLOR_REGEX = re.compile(r"\[(?P\d+)(;(?P\d+)(;(?P\d+))?)?m") + + +# https://mathiasbynens.be/demo/url-regex +URL_REGEX = re.compile( + r"(?=(" + r"http[s]?://" # start matching from allowed schemes + r"(?:[a-zA-Z]|[0-9]" # followed by allowed alphanum characters + r"|[-_$@.&+!*\(\),]" # or allowed symbols (keep hyphen first to match literal hyphen) + r"|[^\u0000-\u007F])+" # or allowed unicode bytes + r'[^\]\[<>"\'\s]+' # stop parsing at these symbols + r"))", + re.IGNORECASE | re.UNICODE, +) + +# Maximum supported URL length. Very long URLs are rare but must be supported correctly +# (e.g. data: URLs, deeply nested query strings). The Snapshot.url column is stored as a +# variable-length TextField (so short URLs don't reserve space and very long URLs still +# fit) while keeping a normal index on the field so exact, prefix, and substring lookups +# all keep working. 
+MAX_URL_LENGTH = 65535 + +QUOTE_DELIMITERS = ( + '"', + "'", + "`", + "โ€œ", + "โ€", + "โ€˜", + "โ€™", +) +QUOTE_ENTITY_DELIMITERS = ( + """, + """, + """, + "'", + "'", + "'", +) +URL_ENTITY_REPLACEMENTS = ( + ("&", "&"), + ("&", "&"), + ("&", "&"), +) + +FILESIZE_UNITS: dict[str, int] = { + "": 1, + "b": 1, + "byte": 1, + "bytes": 1, + "k": 1024, + "kb": 1024, + "kib": 1024, + "m": 1024**2, + "mb": 1024**2, + "mib": 1024**2, + "g": 1024**3, + "gb": 1024**3, + "gib": 1024**3, + "t": 1024**4, + "tb": 1024**4, + "tib": 1024**4, +} + + +def sanitize_extracted_url(url: str) -> str: + """Trim quote garbage and dangling prose punctuation from an extracted URL candidate.""" + cleaned = (url or "").strip() + if not cleaned: + return cleaned + + lower_cleaned = cleaned.lower() + cut_index = len(cleaned) + + for delimiter in QUOTE_DELIMITERS: + found_index = cleaned.find(delimiter) + if found_index != -1: + cut_index = min(cut_index, found_index) + + for delimiter in QUOTE_ENTITY_DELIMITERS: + found_index = lower_cleaned.find(delimiter) + if found_index != -1: + cut_index = min(cut_index, found_index) + + cleaned = cleaned[:cut_index].strip() + lower_cleaned = cleaned.lower() + for entity, replacement in URL_ENTITY_REPLACEMENTS: + while entity in lower_cleaned: + entity_index = lower_cleaned.find(entity) + cleaned = cleaned[:entity_index] + replacement + cleaned[entity_index + len(entity) :] + lower_cleaned = cleaned.lower() + + cleaned = cleaned.rstrip(".,;:!?\\'\"") + cleaned = cleaned.rstrip('"') + + return cleaned + + +def validate_url_length(url: str) -> str: + if len(url) > MAX_URL_LENGTH: + raise ValueError(f"URL is too long ({len(url)} characters). 
Maximum length is {MAX_URL_LENGTH} characters.") + return url + + +def validate_url(url: str) -> str: + url = validate_url_length((url or "").strip()) + parsed = urlparse(url) + if parsed.scheme.lower() not in ("http", "https") or not parsed.hostname: + raise ValueError("URL must start with http:// or https:// and include a hostname.") + return url + + +def parens_are_matched(string: str, open_char="(", close_char=")"): + """check that all parentheses in a string are balanced and nested properly""" + count = 0 + for c in string: + if c == open_char: + count += 1 + elif c == close_char: + count -= 1 + if count < 0: + return False + return count == 0 + + +def fix_url_from_markdown(url_str: str) -> str: + """ + cleanup a regex-parsed url that may contain dangling trailing parens from markdown link syntax + helpful to fix URLs parsed from markdown e.g. + input: https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def).somemoretext + result: https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def + + IMPORTANT ASSUMPTION: valid urls wont have unbalanced or incorrectly nested parentheses + e.g. this will fail the user actually wants to ingest a url like 'https://example.com/some_wei)(rd_url' + in that case it will return https://example.com/some_wei (truncated up to the first unbalanced paren) + This assumption is true 99.9999% of the time, and for the rare edge case the user can use url_list parser. + """ + trimmed_url = url_str + if len(trimmed_url) > 2048: + return trimmed_url + + # cut off one trailing character at a time + # until parens are balanced e.g. 
/a(b)c).x(y)z -> /a(b)c + trim_attempts = 0 + while trimmed_url and not parens_are_matched(trimmed_url) and trim_attempts < 256: + trimmed_url = trimmed_url[:-1] + trim_attempts += 1 + + if not trimmed_url or not parens_are_matched(trimmed_url): + return url_str + + # make sure trimmed url is still valid + if any(match == trimmed_url for match in re.findall(URL_REGEX, trimmed_url)): + return trimmed_url + + return url_str + + +def split_comma_separated_urls(url: str): + offset = 0 + while True: + http_index = url.find("http://", 1) + https_index = url.find("https://", 1) + next_indices = [idx for idx in (http_index, https_index) if idx != -1] + if not next_indices: + yield offset, url + return + + next_index = min(next_indices) + if url[next_index - 1] != ",": + yield offset, url + return + + yield offset, url[: next_index - 1] + offset += next_index + url = url[next_index:] + + +def find_all_urls(urls_str: str): + skipped_starts = set() + for match in re.finditer(URL_REGEX, urls_str): + if match.start() in skipped_starts: + continue + + cleaned_match = sanitize_extracted_url(fix_url_from_markdown(match.group(1))) + for offset, url in split_comma_separated_urls(cleaned_match): + if offset: + skipped_starts.add(match.start() + offset) + yield url + + +def parse_filesize_to_bytes(value: str | int | float | None) -> int: + """ + Parse a byte count from an integer or human-readable string like 45mb or 2 GB. 
+ """ + if value is None: + return 0 + + if isinstance(value, bool): + raise ValueError("Size value must be an integer or size string.") + + if isinstance(value, int): + return value + + if isinstance(value, float): + if not value.is_integer(): + raise ValueError("Size value must resolve to a whole number of bytes.") + return int(value) + + raw_value = str(value).strip() + if not raw_value: + return 0 + + if raw_value.isdigit(): + return int(raw_value) + + match = re.fullmatch(r"(?i)(\d+(?:\.\d+)?)\s*([a-z]+)", raw_value) + if not match: + raise ValueError(f"Invalid size value: {value}") + + amount_str, unit_str = match.groups() + multiplier = FILESIZE_UNITS.get(unit_str.lower()) + if multiplier is None: + raise ValueError(f"Unknown size unit: {unit_str}") + + try: + amount = Decimal(amount_str) + except InvalidOperation as err: + raise ValueError(f"Invalid size value: {value}") from err + + return int(amount * multiplier) + + +def enforce_types(func): + """ + Enforce function arg and kwarg types at runtime using its python3 type hints + Simpler version of pydantic @validate_call decorator + """ + # TODO: check return type as well + + @wraps(func) + def typechecked_function(*args, **kwargs): + sig = signature(func) + + def check_argument_type(arg_key, arg_val): + try: + annotation = sig.parameters[arg_key].annotation + except KeyError: + annotation = None + + if annotation is not None and annotation.__class__ is type: + if not isinstance(arg_val, annotation): + raise TypeError( + "{}(..., {}: {}) got unexpected {} argument {}={}".format( + func.__name__, + arg_key, + annotation.__name__, + type(arg_val).__name__, + arg_key, + str(arg_val)[:64], + ), + ) + + # check args + for arg_val, arg_key in zip(args, sig.parameters): + check_argument_type(arg_key, arg_val) + + # check kwargs + for arg_key, arg_val in kwargs.items(): + check_argument_type(arg_key, arg_val) + + return func(*args, **kwargs) + + return typechecked_function + + +def docstring(text: str | None): + 
"""attach the given docstring to the decorated function""" + + def decorator(func): + if text: + func.__doc__ = text + return func + + return decorator + + +@enforce_types +def parse_date(date: Any) -> datetime | None: + """Parse unix timestamps, iso format, and human-readable strings""" + + if date is None: + return None + + if isinstance(date, datetime): + if date.tzinfo is None: + return date.replace(tzinfo=timezone.utc) + + offset = date.utcoffset() + assert offset == datetime.now(timezone.utc).utcoffset(), "Refusing to load a non-UTC date!" + return date + + if isinstance(date, (float, int)): + date = str(date) + + if isinstance(date, str): + normalized = date.strip() + if not normalized: + raise ValueError(f"Tried to parse invalid date string! {date}") + + try: + return datetime.fromtimestamp(float(normalized), tz=timezone.utc) + except (TypeError, ValueError, OSError): + pass + + try: + iso_date = normalized.replace("Z", "+00:00") + parsed_date = datetime.fromisoformat(iso_date) + if parsed_date.tzinfo is None: + return parsed_date.replace(tzinfo=timezone.utc) + return parsed_date.astimezone(timezone.utc) + except ValueError: + pass + + from dateparser import parse as dateparser + + parsed_date = dateparser(normalized, settings={"TIMEZONE": "UTC"}) + if parsed_date is None: + raise ValueError(f"Tried to parse invalid date string! {date}") + return parsed_date.astimezone(timezone.utc) + + raise ValueError(f"Tried to parse invalid date! 
{date}") + + +@enforce_types +def download_url(url: str, timeout: int | None = None, config=None, **config_kwargs) -> str: + """Download the contents of a remote url and return the text""" + + import requests + from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding + + from archivebox.config.common import get_config + + config = config or get_config(**config_kwargs) + timeout = timeout or config.TIMEOUT + session = requests.Session() + + if config.COOKIES_FILE and Path(config.COOKIES_FILE).is_file(): + cookie_jar = http.cookiejar.MozillaCookieJar(config.COOKIES_FILE) + cookie_jar.load(ignore_discard=True, ignore_expires=True) + for cookie in cookie_jar: + if cookie.value is not None: + session.cookies.set(cookie.name, cookie.value, domain=cookie.domain, path=cookie.path) + + response = session.get( + url, + headers={"User-Agent": config.USER_AGENT}, + verify=config.CHECK_SSL_VALIDITY, + timeout=timeout, + ) + + content_type = response.headers.get("Content-Type", "") + encoding = http_content_type_encoding(content_type) or html_body_declared_encoding(response.text) + + if encoding is not None: + response.encoding = encoding + + try: + return response.text + except UnicodeDecodeError: + # if response is non-test (e.g. image or other binary files), just return the filename instead + return url.rsplit("/", 1)[-1] + + +@enforce_types +def ansi_to_html(text: str) -> str: + """ + Based on: https://stackoverflow.com/questions/19212665/python-converting-ansi-color-codes-to-html + Simple way to render colored CLI stdout/stderr in HTML properly, Textual/rich is probably better though. + """ + + TEMPLATE = '
' + text = text.replace("[m", "
") + + def single_sub(match): + argsdict = match.groupdict() + if argsdict["arg_3"] is None: + if argsdict["arg_2"] is None: + _, color = 0, argsdict["arg_1"] + else: + _, color = argsdict["arg_1"], argsdict["arg_2"] + else: + _, color = argsdict["arg_3"], argsdict["arg_2"] + + return TEMPLATE.format(COLOR_DICT[color][0]) + + return COLOR_REGEX.sub(single_sub, text) + + +class ExtendedEncoder(pyjson.JSONEncoder): + """ + Extended json serializer that supports serializing several model + fields and objects + """ + + def default(self, o): + cls_name = o.__class__.__name__ + + if isinstance(o, tuple) and "_asdict" in vars(type(o)): + return o._asdict() + + elif isinstance(o, bytes): + return o.decode() + + elif isinstance(o, datetime): + return o.isoformat() + + elif isinstance(o, Exception): + return f"{o.__class__.__name__}: {o}" + + elif isinstance(o, Path): + return str(o) + + elif cls_name in ("dict_items", "dict_keys", "dict_values"): + return list(o) + + elif isinstance(o, Callable): + return str(o) + + # Try dict/list conversion as fallback + try: + return dict(o) + except Exception: + pass + + try: + return list(o) + except Exception: + pass + + try: + return str(o) + except Exception: + pass + + return pyjson.JSONEncoder.default(self, o) + + +@enforce_types +def to_json(obj: Any, indent: int | None = 4, sort_keys: bool = True) -> str: + """Serialize object to JSON string with extended type support""" + return pyjson.dumps(obj, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder) diff --git a/archivebox/parse.py b/archivebox/parse.py deleted file mode 100644 index baaa447e86..0000000000 --- a/archivebox/parse.py +++ /dev/null @@ -1,315 +0,0 @@ -""" -Everything related to parsing links from input sources. - -For a list of supported services, see the README.md. -For examples of supported import formats see tests/. 
- -Link: { - 'url': 'https://example.com/example/?abc=123&xyc=345#lmnop', - 'timestamp': '1544212312.4234', - 'title': 'Example.com Page Title', - 'tags': 'abc,def', - 'sources': [ - 'output/sources/ril_export.html', - 'output/sources/getpocket.com-1523422111.txt', - 'output/sources/stdin-234234112312.txt' - ] -} -""" - -import re -import json - -from datetime import datetime -import xml.etree.ElementTree as etree - -from config import TIMEOUT -from util import ( - str_between, - URL_REGEX, - check_url_parsing_invariants, - TimedProgress, -) - - -def parse_links(source_file): - """parse a list of URLs with their metadata from an - RSS feed, bookmarks export, or text file - """ - - check_url_parsing_invariants() - PARSERS = ( - # Specialized parsers - ('Pocket HTML', parse_pocket_html_export), - ('Pinboard RSS', parse_pinboard_rss_export), - ('Shaarli RSS', parse_shaarli_rss_export), - ('Medium RSS', parse_medium_rss_export), - - # General parsers - ('Netscape HTML', parse_netscape_html_export), - ('Generic RSS', parse_rss_export), - ('Generic JSON', parse_json_export), - - # Fallback parser - ('Plain Text', parse_plain_text_export), - ) - timer = TimedProgress(TIMEOUT * 4) - with open(source_file, 'r', encoding='utf-8') as file: - for parser_name, parser_func in PARSERS: - try: - links = list(parser_func(file)) - if links: - timer.end() - return links, parser_name - except Exception as err: - # Parsers are tried one by one down the list, and the first one - # that succeeds is used. To see why a certain parser was not used - # due to error or format incompatibility, uncomment this line: - # print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err)) - pass - - timer.end() - return [], 'Failed to parse' - - -### Import Parser Functions - -def parse_pocket_html_export(html_file): - """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)""" - - html_file.seek(0) - pattern = re.compile("^\\s*
  • (.+)
  • ", re.UNICODE) - for line in html_file: - # example line - #
  • example title
  • - match = pattern.search(line) - if match: - url = match.group(1).replace('http://www.readability.com/read?url=', '') # remove old readability prefixes to get original url - time = datetime.fromtimestamp(float(match.group(2))) - tags = match.group(3) - title = match.group(4).replace(' โ€” Readability', '').replace('http://www.readability.com/read?url=', '') - - yield { - 'url': url, - 'timestamp': str(time.timestamp()), - 'title': title or None, - 'tags': tags or '', - 'sources': [html_file.name], - } - - -def parse_json_export(json_file): - """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)""" - - json_file.seek(0) - links = json.load(json_file) - json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z') - - for link in links: - # example line - # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}] - if link: - # Parse URL - url = link.get('href') or link.get('url') or link.get('URL') - if not url: - raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]') - - # Parse the timestamp - ts_str = str(datetime.now().timestamp()) - if link.get('timestamp'): - # chrome/ff histories use a very precise timestamp - ts_str = str(link['timestamp'] / 10000000) - elif link.get('time'): - ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp()) - elif link.get('created_at'): - ts_str = str(json_date(link['created_at']).timestamp()) - elif link.get('created'): - ts_str = str(json_date(link['created']).timestamp()) - elif link.get('date'): - ts_str = str(json_date(link['date']).timestamp()) - elif link.get('bookmarked'): - ts_str = str(json_date(link['bookmarked']).timestamp()) - elif link.get('saved'): - ts_str = str(json_date(link['saved']).timestamp()) - - # Parse the title - title = 
None - if link.get('title'): - title = link['title'].strip() or None - elif link.get('description'): - title = link['description'].replace(' โ€” Readability', '').strip() or None - elif link.get('name'): - title = link['name'].strip() or None - - yield { - 'url': url, - 'timestamp': ts_str, - 'title': title, - 'tags': link.get('tags') or '', - 'sources': [json_file.name], - } - - -def parse_rss_export(rss_file): - """Parse RSS XML-format files into links""" - - rss_file.seek(0) - items = rss_file.read().split('') - items = items[1:] if items else [] - for item in items: - # example item: - # - # <![CDATA[How JavaScript works: inside the V8 engine]]> - # Unread - # https://blog.sessionstack.com/how-javascript-works-inside - # https://blog.sessionstack.com/how-javascript-works-inside - # Mon, 21 Aug 2017 14:21:58 -0500 - # - - trailing_removed = item.split('', 1)[0] - leading_removed = trailing_removed.split('', 1)[-1].strip() - rows = leading_removed.split('\n') - - def get_row(key): - return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0] - - url = str_between(get_row('link'), '', '') - ts_str = str_between(get_row('pubDate'), '', '') - time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z") - title = str_between(get_row('title'), '')[1:] - for entry in entries: - # example entry: - # - # Aktuelle Trojaner-Welle: Emotet lauert in gefรƒยคlschten Rechnungsmails | heise online - # - # https://demo.shaarli.org/?cEV4vw - # 2019-01-30T06:06:01+00:00 - # 2019-01-30T06:06:01+00:00 - #

    Permalink

    ]]> - # - - trailing_removed = entry.split('', 1)[0] - leading_removed = trailing_removed.strip() - rows = leading_removed.split('\n') - - def get_row(key): - return [r.strip() for r in rows if r.strip().startswith('<{}'.format(key))][0] - - title = str_between(get_row('title'), '', '').strip() - url = str_between(get_row('link'), '') - ts_str = str_between(get_row('published'), '', '') - time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z") - - yield { - 'url': url, - 'timestamp': str(time.timestamp()), - 'title': title or None, - 'tags': '', - 'sources': [rss_file.name], - } - - -def parse_netscape_html_export(html_file): - """Parse netscape-format bookmarks export files (produced by all browsers)""" - - html_file.seek(0) - pattern = re.compile("]*>(.+)", re.UNICODE | re.IGNORECASE) - for line in html_file: - # example line - #
    example bookmark title - - match = pattern.search(line) - if match: - url = match.group(1) - time = datetime.fromtimestamp(float(match.group(2))) - - yield { - 'url': url, - 'timestamp': str(time.timestamp()), - 'title': match.group(3).strip() or None, - 'tags': '', - 'sources': [html_file.name], - } - - -def parse_pinboard_rss_export(rss_file): - """Parse Pinboard RSS feed files into links""" - - rss_file.seek(0) - root = etree.parse(rss_file).getroot() - items = root.findall("{http://purl.org/rss/1.0/}item") - for item in items: - url = item.find("{http://purl.org/rss/1.0/}link").text - tags = item.find("{http://purl.org/dc/elements/1.1/}subject").text if item.find("{http://purl.org/dc/elements/1.1/}subject") else None - title = item.find("{http://purl.org/rss/1.0/}title").text.strip() if item.find("{http://purl.org/rss/1.0/}title").text.strip() else None - ts_str = item.find("{http://purl.org/dc/elements/1.1/}date").text if item.find("{http://purl.org/dc/elements/1.1/}date").text else None - - # Pinboard includes a colon in its date stamp timezone offsets, which - # Python can't parse. 
Remove it: - if ts_str and ts_str[-3:-2] == ":": - ts_str = ts_str[:-3]+ts_str[-2:] - - if ts_str: - time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z") - else: - time = datetime.now() - - yield { - 'url': url, - 'timestamp': str(time.timestamp()), - 'title': title or None, - 'tags': tags or '', - 'sources': [rss_file.name], - } - - -def parse_medium_rss_export(rss_file): - """Parse Medium RSS feed files into links""" - - rss_file.seek(0) - root = etree.parse(rss_file).getroot() - items = root.find("channel").findall("item") - for item in items: - url = item.find("link").text - title = item.find("title").text.strip() - ts_str = item.find("pubDate").text - time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z") - - yield { - 'url': url, - 'timestamp': str(time.timestamp()), - 'title': title or None, - 'tags': '', - 'sources': [rss_file.name], - } - - -def parse_plain_text_export(text_file): - """Parse raw links from each line in a text file""" - - text_file.seek(0) - for line in text_file.readlines(): - urls = re.findall(URL_REGEX, line) if line.strip() else () - for url in urls: - yield { - 'url': url, - 'timestamp': str(datetime.now().timestamp()), - 'title': None, - 'tags': '', - 'sources': [text_file.name], - } diff --git a/archivebox/personas/__init__.py b/archivebox/personas/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/archivebox/personas/admin.py b/archivebox/personas/admin.py new file mode 100644 index 0000000000..143cf4573f --- /dev/null +++ b/archivebox/personas/admin.py @@ -0,0 +1,190 @@ +__package__ = "archivebox.personas" + +import shutil + +from django.contrib import admin, messages +from django.utils.html import format_html, format_html_join + +from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin +from archivebox.personas.forms import PersonaAdminForm +from archivebox.personas.importers import discover_local_browser_profiles +from archivebox.personas.models import Persona + + +class 
PersonaAdmin(ConfigEditorMixin, BaseModelAdmin): + form = PersonaAdminForm + change_form_template = "admin/personas/persona/change_form.html" + + list_display = ("name", "created_by", "created_at", "chrome_profile_state", "cookies_state", "auth_state") + search_fields = ("name", "created_by__username") + list_filter = ("created_by",) + ordering = ["name"] + list_per_page = 100 + readonly_fields = ("id", "created_at", "persona_paths", "import_artifact_status") + + add_fieldsets = ( + ( + "Persona", + { + "fields": ("name", "created_by", "permissions"), + "classes": ("card", "persona-card-primary"), + }, + ), + ( + "Browser Import", + { + "fields": ( + "import_mode", + "import_discovered_profile", + "import_source", + "import_profile_name", + "import_copy_profile", + "import_extract_cookies", + "import_capture_storage", + ), + "classes": ("card", "wide"), + }, + ), + ( + "Advanced", + { + "fields": ("config",), + "classes": ("card", "wide"), + }, + ), + ) + + change_fieldsets = ( + add_fieldsets[0], + ( + "Timestamps", + { + "fields": ("id", "created_at"), + "classes": ("card", "persona-card-timestamps"), + }, + ), + add_fieldsets[1], + add_fieldsets[2], + ( + "Artifacts", + { + "fields": ("persona_paths", "import_artifact_status"), + "classes": ("card", "wide"), + }, + ), + ) + + @admin.display(description="Chrome Profile") + def chrome_profile_state(self, obj: Persona) -> str: + return "yes" if (obj.path / "chrome_profile").exists() else "no" + + @admin.display(description="cookies.txt") + def cookies_state(self, obj: Persona) -> str: + return "yes" if obj.COOKIES_FILE else "no" + + @admin.display(description="auth.json") + def auth_state(self, obj: Persona) -> str: + return "yes" if obj.AUTH_STORAGE_FILE else "no" + + @admin.display(description="Persona Paths") + def persona_paths(self, obj: Persona) -> str: + return format_html( + "
    " + "
    Persona root{}
    " + "
    chrome_profile{}
    " + "
    chrome_downloads{}
    " + "
    cookies.txt{}
    " + "
    auth.json{}
    " + "
    ", + obj.path, + obj.CHROME_USER_DATA_DIR, + obj.CHROME_DOWNLOADS_DIR, + obj.COOKIES_FILE or (obj.path / "cookies.txt"), + obj.AUTH_STORAGE_FILE or (obj.path / "auth.json"), + ) + + @admin.display(description="Import Artifacts") + def import_artifact_status(self, obj: Persona) -> str: + entries = [ + ("Browser profile", (obj.path / "chrome_profile").exists(), obj.CHROME_USER_DATA_DIR), + ("cookies.txt", bool(obj.COOKIES_FILE), obj.COOKIES_FILE or (obj.path / "cookies.txt")), + ("auth.json", bool(obj.AUTH_STORAGE_FILE), obj.AUTH_STORAGE_FILE or (obj.path / "auth.json")), + ] + return format_html( + "
    {}
    ", + format_html_join( + "", + "
    {}{}{}
    ", + ( + ( + label, + "abx-artifact-state abx-artifact-state--yes" if enabled else "abx-artifact-state abx-artifact-state--no", + "present" if enabled else "missing", + path, + ) + for label, enabled, path in entries + ), + ), + ) + + def get_fieldsets(self, request, obj=None): + return self.change_fieldsets if obj else self.add_fieldsets + + def get_form(self, request, obj=None, change=False, **kwargs): + if kwargs.get("fields"): + kwargs["fields"] = [field for field in kwargs["fields"] if field != "permissions"] + return super().get_form(request, obj=obj, change=change, **kwargs) + + def render_change_form(self, request, context, add=False, change=False, form_url="", obj=None): + context["detected_profile_count"] = len(discover_local_browser_profiles()) + return super().render_change_form(request, context, add=add, change=change, form_url=form_url, obj=obj) + + def save_model(self, request, obj, form, change): + old_path = None + new_path = None + if change: + previous = Persona.objects.get(pk=obj.pk) + if previous.name != obj.name: + old_path = previous.path + new_path = obj.path + + super().save_model(request, obj, form, change) + + if old_path and new_path and old_path != new_path and old_path.exists(): + if new_path.exists(): + raise FileExistsError(f"Cannot rename Persona directory because the destination already exists: {new_path}") + shutil.move(str(old_path), str(new_path)) + + obj.ensure_dirs() + + import_result = form.apply_import(obj) + if import_result is None: + return + + completed_actions = [] + if import_result.profile_copied: + completed_actions.append("profile copied") + if import_result.cookies_imported: + completed_actions.append("cookies.txt generated") + if import_result.storage_captured: + completed_actions.append("auth.json captured") + if import_result.user_agent_imported: + completed_actions.append("USER_AGENT copied") + + if completed_actions: + messages.success( + request, + f"Imported {', '.join(completed_actions)} from 
{import_result.source.display_label}.", + ) + else: + messages.warning( + request, + f"Persona saved, but no browser artifacts were imported from {import_result.source.display_label}.", + ) + + for warning in import_result.warnings: + messages.warning(request, warning) + + +def register_admin(admin_site: admin.AdminSite) -> None: + admin_site.register(Persona, PersonaAdmin) diff --git a/archivebox/personas/apps.py b/archivebox/personas/apps.py new file mode 100644 index 0000000000..df45c2668a --- /dev/null +++ b/archivebox/personas/apps.py @@ -0,0 +1,7 @@ +from django.apps import AppConfig + + +class PersonasConfig(AppConfig): + default_auto_field = "django.db.models.BigAutoField" + name = "archivebox.personas" + label = "personas" diff --git a/archivebox/personas/export_browser_state.js b/archivebox/personas/export_browser_state.js new file mode 100644 index 0000000000..77b394f95d --- /dev/null +++ b/archivebox/personas/export_browser_state.js @@ -0,0 +1,210 @@ +#!/usr/bin/env node +/** + * Export cookies and open-tab storage from a Chromium profile or live CDP URL. 
+ * + * Environment variables: + * ARCHIVEBOX_ABX_PLUGINS_DIR Absolute path to abx_plugins/plugins + * CHROME_USER_DATA_DIR Local Chromium user-data directory to launch + * CHROME_CDP_URL Existing browser CDP URL to attach to + * COOKIES_OUTPUT_FILE Optional output path for Netscape cookies.txt + * AUTH_STORAGE_OUTPUT_FILE Optional output path for auth.json + * CHROME_BINARY Optional browser binary override + * NODE_MODULES_DIR Optional node_modules path for puppeteer-core + */ + +const fs = require('fs'); +const os = require('os'); +const path = require('path'); + +const pluginsDir = process.env.ARCHIVEBOX_ABX_PLUGINS_DIR || process.env.ABX_PLUGINS_DIR; +if (!pluginsDir) { + console.error('ARCHIVEBOX_ABX_PLUGINS_DIR is required'); + process.exit(1); +} + +const baseUtils = require(path.join(pluginsDir, 'base', 'utils.js')); +baseUtils.ensureNodeModuleResolution(module); + +const chromeUtils = require(path.join(pluginsDir, 'chrome', 'chrome_utils.js')); +const puppeteer = require('puppeteer-core'); + +function cookieToNetscape(cookie) { + let domain = cookie.domain; + if (!domain.startsWith('.') && !cookie.hostOnly) { + domain = '.' + domain; + } + + const includeSubdomains = domain.startsWith('.') ? 'TRUE' : 'FALSE'; + const cookiePath = cookie.path || '/'; + const secure = cookie.secure ? 'TRUE' : 'FALSE'; + const expiry = cookie.expires && cookie.expires > 0 ? 
Math.floor(cookie.expires).toString() : '0'; + + return `${domain}\t${includeSubdomains}\t${cookiePath}\t${secure}\t${expiry}\t${cookie.name}\t${cookie.value}`; +} + +function writeCookiesFile(cookies, outputPath) { + const lines = [ + '# Netscape HTTP Cookie File', + '# https://curl.se/docs/http-cookies.html', + '# This file was generated by ArchiveBox persona cookie extraction', + '#', + '# Format: domain\\tincludeSubdomains\\tpath\\tsecure\\texpiry\\tname\\tvalue', + '', + ]; + + for (const cookie of cookies) { + lines.push(cookieToNetscape(cookie)); + } + + fs.mkdirSync(path.dirname(outputPath), { recursive: true }); + fs.writeFileSync(outputPath, lines.join('\n') + '\n'); +} + +async function collectStorage(browser) { + const localStorage = {}; + const sessionStorage = {}; + const pages = await browser.pages(); + + for (const page of pages) { + try { + const url = page.url(); + if (!url || url === 'about:blank') continue; + if (url.startsWith('chrome:') || url.startsWith('edge:') || url.startsWith('devtools:')) continue; + + const payload = await page.evaluate(() => ({ + origin: window.location.origin, + localStorage: Object.fromEntries(Object.entries(window.localStorage)), + sessionStorage: Object.fromEntries(Object.entries(window.sessionStorage)), + })); + + if (!payload.origin || payload.origin === 'null') continue; + if (Object.keys(payload.localStorage || {}).length > 0) { + localStorage[payload.origin] = payload.localStorage; + } + if (Object.keys(payload.sessionStorage || {}).length > 0) { + sessionStorage[payload.origin] = payload.sessionStorage; + } + } catch (error) { + // Ignore pages that cannot be inspected via evaluate(). 
+ } + } + + return { localStorage, sessionStorage }; +} + +async function openBrowser() { + const cdpUrl = process.env.CHROME_CDP_URL || ''; + if (cdpUrl) { + const browser = await chromeUtils.connectToBrowserEndpoint(puppeteer, cdpUrl, { defaultViewport: null }); + return { + browser, + async cleanup() { + try { + await browser.disconnect(); + } catch (error) {} + }, + sourceDescription: cdpUrl, + }; + } + + const userDataDir = process.env.CHROME_USER_DATA_DIR; + if (!userDataDir) { + throw new Error('Either CHROME_USER_DATA_DIR or CHROME_CDP_URL is required'); + } + if (!fs.existsSync(userDataDir)) { + throw new Error(`User data directory does not exist: ${userDataDir}`); + } + + const outputDir = fs.mkdtempSync(path.join(os.tmpdir(), 'abx-browser-state-')); + const binary = process.env.CHROME_BINARY || chromeUtils.findAnyChromiumBinary(); + if (!binary) { + throw new Error('Could not find a Chromium binary for browser state export'); + } + + const launched = await chromeUtils.launchChromium({ + binary, + outputDir, + userDataDir, + headless: true, + killZombies: false, + }); + + if (!launched.success) { + throw new Error(launched.error || 'Chrome launch failed'); + } + + const browser = await chromeUtils.connectToBrowserEndpoint(puppeteer, launched.cdpUrl, { defaultViewport: null }); + + return { + browser, + async cleanup() { + try { + await browser.disconnect(); + } catch (error) {} + try { + await chromeUtils.killChrome(launched.pid, outputDir); + } catch (error) {} + try { + fs.rmSync(outputDir, { recursive: true, force: true }); + } catch (error) {} + }, + sourceDescription: userDataDir, + }; +} + +async function main() { + const cookiesOutput = process.env.COOKIES_OUTPUT_FILE || ''; + const authOutput = process.env.AUTH_STORAGE_OUTPUT_FILE || ''; + if (!cookiesOutput && !authOutput) { + throw new Error('COOKIES_OUTPUT_FILE or AUTH_STORAGE_OUTPUT_FILE is required'); + } + + const { browser, cleanup, sourceDescription } = await openBrowser(); + + try { + 
const session = await browser.target().createCDPSession(); + const browserVersion = await session.send('Browser.getVersion'); + const cookieResult = await session.send('Storage.getCookies'); + const cookies = cookieResult?.cookies || []; + const { localStorage, sessionStorage } = await collectStorage(browser); + const userAgent = browserVersion?.userAgent || ''; + + if (cookiesOutput) { + writeCookiesFile(cookies, cookiesOutput); + } + + if (authOutput) { + fs.mkdirSync(path.dirname(authOutput), { recursive: true }); + fs.writeFileSync( + authOutput, + JSON.stringify( + { + TYPE: 'auth', + SOURCE: sourceDescription, + captured_at: new Date().toISOString(), + user_agent: userAgent, + cookies, + localStorage, + sessionStorage, + }, + null, + 2, + ) + '\n', + ); + } + + console.error( + `[+] Exported ${cookies.length} cookies` + + `${authOutput ? ` and ${Object.keys(localStorage).length + Object.keys(sessionStorage).length} storage origins` : ''}` + + `${userAgent ? ' with browser USER_AGENT' : ''}` + + ` from ${sourceDescription}`, + ); + } finally { + await cleanup(); + } +} + +main().catch((error) => { + console.error(`ERROR: ${error.message}`); + process.exit(1); +}); diff --git a/archivebox/personas/forms.py b/archivebox/personas/forms.py new file mode 100644 index 0000000000..9b7cdfb9cc --- /dev/null +++ b/archivebox/personas/forms.py @@ -0,0 +1,205 @@ +__package__ = "archivebox.personas" + +from typing import Any + +from django import forms +from django.utils.safestring import mark_safe + +from archivebox.config.common import get_config +from archivebox.core.permissions import PERMISSIONS_CHOICES +from archivebox.plugins.forms import PluginConfigFormMixin +from archivebox.personas.importers import ( + PersonaImportResult, + PersonaImportSource, + discover_local_browser_profiles, + import_persona_from_source, + resolve_custom_import_source, + validate_persona_name, +) +from archivebox.personas.models import Persona + + +def _mode_label(title: str, description: 
str) -> str: + return mark_safe( + f'{title}{description}', + ) + + +class PersonaAdminForm(PluginConfigFormMixin, forms.ModelForm): + permissions = forms.ChoiceField( + label="Permissions", + choices=PERMISSIONS_CHOICES, + required=True, + help_text="Default visibility for crawls and snapshots created using this persona.", + ) + import_mode = forms.ChoiceField( + required=False, + initial="none", + label="Bootstrap this persona", + widget=forms.RadioSelect, + choices=( + ("none", _mode_label("Blank Persona", "Create the persona without importing browser state yet.")), + ("discovered", _mode_label("Use a detected profile", "Pick from Chromium profiles auto-discovered on this host.")), + ( + "custom", + _mode_label( + "Use a custom path or CDP URL", + "Paste an absolute Chromium path or attach to a live browser debugging endpoint.", + ), + ), + ), + help_text="These options run after the Persona row is saved, using the same backend import helpers as the CLI.", + ) + import_discovered_profile = forms.ChoiceField( + required=False, + label="Autodiscovered profiles", + widget=forms.RadioSelect, + choices=(), + help_text="Detected from local Chrome, Chromium, Brave, and Edge profile roots.", + ) + import_source = forms.CharField( + required=False, + label="Absolute path or CDP URL", + widget=forms.TextInput( + attrs={ + "placeholder": "/Users/alice/Library/Application Support/Google/Chrome or http://127.0.0.1:9222 or ws://127.0.0.1:9222/devtools/browser/...", + "style": "width: 100%; font-family: monospace;", + }, + ), + help_text="Accepts an absolute Chromium user-data dir, an exact profile dir, or a live HTTP/WS CDP endpoint.", + ) + import_profile_name = forms.CharField( + required=False, + label="Profile directory name", + widget=forms.TextInput( + attrs={ + "placeholder": "Default or Profile 1", + "style": "width: 100%; font-family: monospace;", + }, + ), + help_text="Only used when the custom path points at a browser root containing multiple profiles.", + ) + 
import_copy_profile = forms.BooleanField( + required=False, + initial=True, + label="Copy browser profile into this persona", + help_text="Copies the chosen Chromium user-data tree into `chrome_profile` for future archiving runs.", + ) + import_extract_cookies = forms.BooleanField( + required=False, + initial=True, + label="Generate `cookies.txt`", + help_text="Extracts cookies through Chrome DevTools Protocol and writes a Netscape cookie jar for wget/curl-based plugins.", + ) + import_capture_storage = forms.BooleanField( + required=False, + initial=True, + label="Capture open-tab storage into `auth.json`", + help_text="Snapshots currently open tab `localStorage` / `sessionStorage` values by origin. This is most useful for live CDP imports.", + ) + + class Meta: + model = Persona + fields = ("name", "created_by", "config") + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + self.discovered_profiles = discover_local_browser_profiles() + self._resolved_import_source: PersonaImportSource | None = None + + self.fields["import_mode"].widget.attrs["class"] = "abx-import-mode" + self.fields["import_discovered_profile"].widget.attrs["class"] = "abx-profile-picker" + self.fields["permissions"].initial = str((self.instance.config or {}).get("PERMISSIONS") or "public").strip().lower() + + if self.discovered_profiles: + self.fields["import_discovered_profile"].choices = [ + (profile.choice_value, profile.as_choice_label()) for profile in self.discovered_profiles + ] + else: + self.fields["import_discovered_profile"].choices = [] + self.fields["import_discovered_profile"].help_text = ( + "No local Chromium profiles were detected on this host right now. " + "Use the custom path/CDP option if the browser data lives elsewhere." 
+ ) + + self.build_plugin_groups(get_config(persona=self.instance) if self.instance and self.instance.pk else get_config()) + + def clean_name(self) -> str: + name = str(self.cleaned_data.get("name") or "").strip() + is_valid, error_message = validate_persona_name(name) + if not is_valid: + raise forms.ValidationError(error_message) + return name + + def clean(self) -> dict[str, Any]: + cleaned_data = super().clean() + self._resolved_import_source = None + + import_mode = str(cleaned_data.get("import_mode") or "none").strip() or "none" + manual_config = cleaned_data.get("config") or {} + if not isinstance(manual_config, dict): + manual_config = {} + manual_config["PERMISSIONS"] = cleaned_data.get("permissions") or "public" + plugin_config_overrides = self.clean_plugin_config_overrides(get_config()) + cleaned_data["plugin_config"] = plugin_config_overrides + cleaned_data["config"] = { + **{key: value for key, value in manual_config.items() if key not in self.plugin_config_keys()}, + **plugin_config_overrides, + } + + if import_mode == "none": + return cleaned_data + + if import_mode == "discovered": + selection = str(cleaned_data.get("import_discovered_profile") or "").strip() + if not selection: + self.add_error("import_discovered_profile", "Choose one of the discovered profiles to import.") + return cleaned_data + try: + self._resolved_import_source = PersonaImportSource.from_choice_value(selection) + except ValueError as err: + self.add_error("import_discovered_profile", str(err)) + return cleaned_data + elif import_mode == "custom": + raw_value = str(cleaned_data.get("import_source") or "").strip() + if not raw_value: + self.add_error("import_source", "Provide an absolute Chromium profile path or a CDP URL.") + return cleaned_data + try: + self._resolved_import_source = resolve_custom_import_source( + raw_value, + profile_dir=str(cleaned_data.get("import_profile_name") or "").strip() or None, + ) + except ValueError as err: + self.add_error("import_source", 
str(err)) + return cleaned_data + else: + self.add_error("import_mode", "Choose how this Persona should be bootstrapped.") + return cleaned_data + + copy_profile = bool(cleaned_data.get("import_copy_profile")) + import_cookies = bool(cleaned_data.get("import_extract_cookies")) + capture_storage = bool(cleaned_data.get("import_capture_storage")) + + if self._resolved_import_source.kind == "cdp": + if not (import_cookies or capture_storage): + self.add_error( + "import_extract_cookies", + "CDP imports can only capture cookies and/or open-tab storage. Profile copying is not available for a remote browser endpoint.", + ) + elif not (copy_profile or import_cookies or capture_storage): + raise forms.ValidationError("Select at least one import action.") + + return cleaned_data + + def apply_import(self, persona: Persona) -> PersonaImportResult | None: + if not self._resolved_import_source: + return None + + return import_persona_from_source( + persona, + self._resolved_import_source, + copy_profile=bool(self.cleaned_data.get("import_copy_profile")), + import_cookies=bool(self.cleaned_data.get("import_extract_cookies")), + capture_storage=bool(self.cleaned_data.get("import_capture_storage")), + ) diff --git a/archivebox/personas/importers.py b/archivebox/personas/importers.py new file mode 100644 index 0000000000..42cbff1b02 --- /dev/null +++ b/archivebox/personas/importers.py @@ -0,0 +1,857 @@ +""" +Shared persona browser discovery/import helpers. + +These helpers are used by both the CLI and the Django admin so Persona import +behavior stays consistent regardless of where it is triggered from. 
+""" + +from __future__ import annotations + +import json +import os +import platform +import shutil +import subprocess +import tempfile +from dataclasses import dataclass, field +from pathlib import Path +from typing import TYPE_CHECKING +from urllib.parse import urlparse + +from django.utils.html import format_html +from django.utils.safestring import SafeString + +if TYPE_CHECKING: + from archivebox.personas.models import Persona + + +BROWSER_LABELS = { + "chrome": "Google Chrome", + "chromium": "Chromium", + "brave": "Brave", + "edge": "Microsoft Edge", + "custom": "Custom Path", + "persona": "Persona Template", +} + +BROWSER_PROFILE_DIR_NAMES = ( + "Default", + "Profile ", + "Guest Profile", +) + +VOLATILE_PROFILE_COPY_PATTERNS = ( + "Cache", + "Code Cache", + "GPUCache", + "ShaderCache", + "Service Worker", + "GCM Store", + "*.log", + "Crashpad", + "BrowserMetrics", + "BrowserMetrics-spare.pma", + "SingletonLock", + "SingletonSocket", + "SingletonCookie", +) + +PERSONA_PROFILE_DIR_CANDIDATES = ( + "chrome_profile", + "chrome_user_data", +) + + +@dataclass(frozen=True) +class PersonaImportSource: + kind: str + browser: str = "custom" + source_name: str | None = None + user_data_dir: Path | None = None + profile_dir: str | None = None + browser_binary: str | None = None + cdp_url: str | None = None + + @property + def browser_label(self) -> str: + return BROWSER_LABELS.get(self.browser, self.browser.title()) + + @property + def profile_path(self) -> Path | None: + if not self.user_data_dir or not self.profile_dir: + return None + return self.user_data_dir / self.profile_dir + + @property + def display_label(self) -> str: + if self.kind == "cdp": + return self.cdp_url or "CDP URL" + profile_suffix = f" / {self.profile_dir}" if self.profile_dir else "" + source_prefix = f": {self.source_name}" if self.source_name else "" + return f"{self.browser_label}{source_prefix}{profile_suffix}" + + @property + def choice_value(self) -> str: + return json.dumps( + { + 
"kind": self.kind, + "browser": self.browser, + "source_name": self.source_name or "", + "user_data_dir": str(self.user_data_dir) if self.user_data_dir else "", + "profile_dir": self.profile_dir or "", + "browser_binary": self.browser_binary or "", + "cdp_url": self.cdp_url or "", + }, + sort_keys=True, + ) + + def as_choice_label(self) -> SafeString: + path_str = str(self.profile_path or self.user_data_dir or self.cdp_url or "") + binary_suffix = f"Using {self.browser_binary}" if self.browser_binary else "Will auto-detect a Chromium binary" + return format_html( + '{}{}{}', + self.display_label, + binary_suffix, + path_str, + ) + + @classmethod + def from_choice_value(cls, value: str) -> PersonaImportSource: + try: + payload = json.loads(value) + except json.JSONDecodeError as err: + raise ValueError("Invalid discovered profile selection.") from err + + if payload.get("kind") != "browser-profile": + raise ValueError("Invalid discovered profile selection.") + + user_data_dir = Path(str(payload.get("user_data_dir") or "")).expanduser() + profile_dir = str(payload.get("profile_dir") or "").strip() + browser = str(payload.get("browser") or "custom").strip().lower() or "custom" + source_name = str(payload.get("source_name") or "").strip() or None + browser_binary = str(payload.get("browser_binary") or "").strip() or None + + return resolve_browser_profile_source( + browser=browser, + source_name=source_name, + user_data_dir=user_data_dir, + profile_dir=profile_dir, + browser_binary=browser_binary, + ) + + +@dataclass +class PersonaImportResult: + source: PersonaImportSource + profile_copied: bool = False + cookies_imported: bool = False + storage_captured: bool = False + user_agent_imported: bool = False + warnings: list[str] = field(default_factory=list) + + @property + def did_work(self) -> bool: + return self.profile_copied or self.cookies_imported or self.storage_captured or self.user_agent_imported + + +def get_chrome_user_data_dir() -> Path | None: + """Get the 
default Chrome user data directory for the current platform.""" + system = platform.system() + home = Path.home() + + if system == "Darwin": + candidates = [ + home / "Library" / "Application Support" / "Google" / "Chrome", + home / "Library" / "Application Support" / "Chromium", + ] + elif system == "Linux": + candidates = [ + home / ".config" / "google-chrome", + home / ".config" / "chromium", + home / ".config" / "chrome", + home / "snap" / "chromium" / "common" / "chromium", + ] + elif system == "Windows": + local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local")) + candidates = [ + local_app_data / "Google" / "Chrome" / "User Data", + local_app_data / "Chromium" / "User Data", + ] + else: + candidates = [] + + for candidate in candidates: + if candidate.exists() and _list_profile_names(candidate): + return candidate + + return None + + +def get_brave_user_data_dir() -> Path | None: + """Get the default Brave user data directory for the current platform.""" + system = platform.system() + home = Path.home() + + if system == "Darwin": + candidates = [ + home / "Library" / "Application Support" / "BraveSoftware" / "Brave-Browser", + ] + elif system == "Linux": + candidates = [ + home / ".config" / "BraveSoftware" / "Brave-Browser", + ] + elif system == "Windows": + local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local")) + candidates = [ + local_app_data / "BraveSoftware" / "Brave-Browser" / "User Data", + ] + else: + candidates = [] + + for candidate in candidates: + if candidate.exists() and _list_profile_names(candidate): + return candidate + + return None + + +def get_edge_user_data_dir() -> Path | None: + """Get the default Edge user data directory for the current platform.""" + system = platform.system() + home = Path.home() + + if system == "Darwin": + candidates = [ + home / "Library" / "Application Support" / "Microsoft Edge", + ] + elif system == "Linux": + candidates = [ + home / ".config" / 
"microsoft-edge", + home / ".config" / "microsoft-edge-beta", + home / ".config" / "microsoft-edge-dev", + ] + elif system == "Windows": + local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local")) + candidates = [ + local_app_data / "Microsoft" / "Edge" / "User Data", + ] + else: + candidates = [] + + for candidate in candidates: + if candidate.exists() and _list_profile_names(candidate): + return candidate + + return None + + +def get_browser_binary(browser: str) -> str | None: + system = platform.system() + home = Path.home() + browser = browser.lower() + + if system == "Darwin": + candidates = { + "chrome": ["/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"], + "chromium": ["/Applications/Chromium.app/Contents/MacOS/Chromium"], + "brave": ["/Applications/Brave Browser.app/Contents/MacOS/Brave Browser"], + "edge": ["/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge"], + }.get(browser, []) + elif system == "Linux": + candidates = { + "chrome": [ + "/usr/bin/google-chrome", + "/usr/bin/google-chrome-stable", + "/usr/bin/google-chrome-beta", + "/usr/bin/google-chrome-unstable", + ], + "chromium": ["/usr/bin/chromium", "/usr/bin/chromium-browser"], + "brave": ["/usr/bin/brave-browser", "/usr/bin/brave-browser-beta", "/usr/bin/brave-browser-nightly"], + "edge": [ + "/usr/bin/microsoft-edge", + "/usr/bin/microsoft-edge-stable", + "/usr/bin/microsoft-edge-beta", + "/usr/bin/microsoft-edge-dev", + ], + }.get(browser, []) + elif system == "Windows": + local_app_data = Path(os.environ.get("LOCALAPPDATA", home / "AppData" / "Local")) + candidates = { + "chrome": [ + str(local_app_data / "Google" / "Chrome" / "Application" / "chrome.exe"), + "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", + "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe", + ], + "chromium": [str(local_app_data / "Chromium" / "Application" / "chrome.exe")], + "brave": [ + str(local_app_data / "BraveSoftware" / "Brave-Browser" / 
def validate_persona_name(name: str) -> tuple[bool, str]:
    """Check that a persona name is safe to use as a directory name.

    Returns:
        ``(True, "")`` when the name is acceptable, otherwise ``(False,
        message)`` where ``message`` describes the first rule that was
        violated. Rules are evaluated in a fixed order.
    """
    if not (name and name.strip()):
        return False, "Persona name cannot be empty"

    # Ordered (is_violated, message) pairs; the first violation wins.
    rules = (
        (
            lambda: any(separator in name for separator in ("/", "\\")),
            "Persona name cannot contain path separators (/ or \\)",
        ),
        (
            lambda: ".." in name,
            "Persona name cannot contain parent directory references (..)",
        ),
        (
            lambda: name.startswith("."),
            "Persona name cannot start with a dot (.)",
        ),
        (
            lambda: any(char in name for char in ("\x00", "\n", "\r")),
            "Persona name contains invalid characters",
        ),
    )
    for is_violated, message in rules:
        if is_violated():
            return False, message

    return True, ""
def resolve_browser_profile_source(
    browser: str,
    user_data_dir: Path,
    profile_dir: str,
    source_name: str | None = None,
    browser_binary: str | None = None,
) -> PersonaImportSource:
    """Validate a (user data dir, profile name) pair and wrap it as an import source.

    Args:
        browser: Browser label stored on the resulting source.
        user_data_dir: Root directory that contains the profile directory.
            ``~`` is expanded and relative paths are resolved.
        profile_dir: Name of a single directory directly under
            ``user_data_dir`` (for example ``"Default"``).
        source_name: Optional label stored on the resulting source.
        browser_binary: Optional browser executable path stored on the source.

    Returns:
        A ``PersonaImportSource`` of kind ``"browser-profile"``.

    Raises:
        ValueError: If the root does not exist, ``profile_dir`` is blank or is
            not a plain directory name, or the resulting path does not pass
            ``_looks_like_profile_dir``.
    """
    resolved_root = user_data_dir.expanduser()
    if not resolved_root.is_absolute():
        resolved_root = resolved_root.resolve()
    if not resolved_root.exists():
        raise ValueError(f"Profile root does not exist: {resolved_root}")
    if not profile_dir.strip():
        raise ValueError("Profile directory name cannot be empty.")

    # ``profile_dir`` can come from serialized form input. Joining a value
    # that has separators, ``..`` or an absolute path would leave
    # ``resolved_root`` (an absolute right-hand side discards the root).
    if profile_dir in {".", ".."} or Path(profile_dir).name != profile_dir:
        raise ValueError(f"Profile directory must be a single directory name: {profile_dir!r}")

    profile_path = resolved_root / profile_dir
    if not _looks_like_profile_dir(profile_path):
        raise ValueError(f"Profile directory does not look valid: {profile_path}")

    return PersonaImportSource(
        kind="browser-profile",
        browser=browser,
        source_name=source_name,
        user_data_dir=resolved_root,
        profile_dir=profile_dir,
        browser_binary=browser_binary,
    )
resolve_custom_import_source(raw_value: str, profile_dir: str | None = None) -> PersonaImportSource: + raw_value = raw_value.strip() + if not raw_value: + raise ValueError("Provide an absolute browser profile path or a CDP URL.") + + if _looks_like_cdp_url(raw_value): + return PersonaImportSource(kind="cdp", cdp_url=raw_value) + + source_path = Path(raw_value).expanduser() + if not source_path.is_absolute(): + raise ValueError("Custom browser path must be an absolute path.") + if not source_path.exists(): + raise ValueError(f"Custom browser path does not exist: {source_path}") + + explicit_profile = profile_dir.strip() if profile_dir else "" + if _looks_like_profile_dir(source_path): + if explicit_profile and explicit_profile != source_path.name: + raise ValueError("Profile name does not match the provided profile directory path.") + return resolve_browser_profile_source( + browser="custom", + user_data_dir=source_path.parent.resolve(), + profile_dir=source_path.name, + ) + + chosen_profile = explicit_profile or pick_default_profile_dir(source_path) + if not chosen_profile: + raise ValueError( + "Could not find a Chromium profile in that directory. 
def pick_default_profile_dir(user_data_dir: Path) -> str | None:
    """Choose which profile to use when the caller did not name one.

    Returns:
        ``"Default"`` when ``_list_profile_names`` reports it for
        ``user_data_dir``, otherwise the first reported profile name, or
        ``None`` when no profiles are reported.
    """
    available = _list_profile_names(user_data_dir)
    if "Default" in available:
        return "Default"
    return available[0] if available else None
def export_browser_state(
    *,
    user_data_dir: Path | None = None,
    cdp_url: str | None = None,
    profile_dir: str | None = None,
    chrome_binary: str | None = None,
    cookies_output_file: Path | None = None,
    auth_output_file: Path | None = None,
) -> tuple[bool, dict | None, str]:
    """Run ``export_browser_state.js`` under Node and collect cookies/auth state.

    The script is configured entirely through environment variables. When an
    output file already exists, the script writes to a scratch file instead
    and the result is merged into the existing file. Auth storage is always
    exported, to a scratch file if no ``auth_output_file`` was requested, so
    the payload can still be returned to the caller.

    Args:
        user_data_dir: Local browser user data directory to launch against.
        cdp_url: URL of an already-running browser to connect to instead.
        profile_dir: Profile directory name, appended to ``CHROME_ARGS_EXTRA``
            as ``--profile-directory=...``.
        chrome_binary: Browser executable exported as ``CHROME_BINARY``.
        cookies_output_file: Destination cookies file, if cookies are wanted.
        auth_output_file: Destination auth storage JSON file, if wanted.

    Returns:
        ``(success, auth_payload, message)``. ``message`` is the script's
        stderr/stdout (or an error description); ``auth_payload`` is the
        parsed auth storage on success and ``None`` on failure.
    """
    if not user_data_dir and not cdp_url:
        return False, None, "Missing browser source."

    from abx_plugins import get_plugins_dir
    from archivebox.config.common import get_config

    state_script = Path(__file__).with_name("export_browser_state.js")
    if not state_script.exists():
        return False, None, f"Browser state export script not found at {state_script}"

    node_modules_dir = get_config().LIB_DIR / "pnpm" / "packages" / "chrome" / "node_modules"
    chrome_plugin_dir = Path(get_plugins_dir()).resolve()

    env = os.environ.copy()
    env["NODE_MODULES_DIR"] = str(node_modules_dir)
    env["ARCHIVEBOX_ABX_PLUGINS_DIR"] = str(chrome_plugin_dir)

    if user_data_dir:
        env["CHROME_USER_DATA_DIR"] = str(user_data_dir)
    if cdp_url:
        env["CHROME_CDP_URL"] = cdp_url
        env["CHROME_IS_LOCAL"] = "false"
    if chrome_binary:
        env["CHROME_BINARY"] = str(chrome_binary)
    if profile_dir:
        # CHROME_ARGS_EXTRA may already be set, either as a JSON list or as a
        # comma-separated string; normalise to a JSON list and append ours.
        existing_extra = env.get("CHROME_ARGS_EXTRA", "").strip()
        args_list: list[str] = []
        if existing_extra:
            json_args = None
            if existing_extra.startswith("["):
                try:
                    json_args = json.loads(existing_extra)
                except ValueError:
                    json_args = None  # looked like JSON but is not; fall back to comma-splitting
            if isinstance(json_args, list):
                args_list.extend(str(arg) for arg in json_args)
            else:
                args_list.extend(part.strip() for part in existing_extra.split(",") if part.strip())
        args_list.append(f"--profile-directory={profile_dir}")
        env["CHROME_ARGS_EXTRA"] = json.dumps(args_list)

    temp_dir: Path | None = None
    try:
        tmp_cookies_file: Path | None = None
        tmp_auth_file: Path | None = None

        if cookies_output_file and cookies_output_file.exists():
            temp_dir = Path(tempfile.mkdtemp(prefix="ab_browser_state_"))
            tmp_cookies_file = temp_dir / "cookies.txt"
            env["COOKIES_OUTPUT_FILE"] = str(tmp_cookies_file)
        elif cookies_output_file:
            env["COOKIES_OUTPUT_FILE"] = str(cookies_output_file)

        if auth_output_file and not auth_output_file.exists():
            env["AUTH_STORAGE_OUTPUT_FILE"] = str(auth_output_file)
        else:
            # Either the destination already exists (merge afterwards) or no
            # destination was requested (export anyway to read the payload).
            temp_dir = temp_dir or Path(tempfile.mkdtemp(prefix="ab_browser_state_"))
            tmp_auth_file = temp_dir / "auth.json"
            env["AUTH_STORAGE_OUTPUT_FILE"] = str(tmp_auth_file)

        try:
            result = subprocess.run(
                ["node", str(state_script)],
                env=env,
                capture_output=True,
                text=True,
                timeout=120,
            )
        except subprocess.TimeoutExpired:
            return False, None, "Browser state export timed out."
        except FileNotFoundError:
            return False, None, "Node.js was not found, so ArchiveBox could not extract browser state."
        except Exception as err:
            # Boundary handler: failures are reported to the caller as a message.
            return False, None, f"Browser state export failed: {err}"

        if result.returncode != 0:
            message = (result.stderr or result.stdout or "").strip() or "Browser state export failed."
            return False, None, message

        auth_payload: dict | None = None
        if cookies_output_file and tmp_cookies_file and tmp_cookies_file.exists():
            _merge_netscape_cookies(cookies_output_file, tmp_cookies_file)
        if auth_output_file and tmp_auth_file and tmp_auth_file.exists():
            _merge_auth_storage(auth_output_file, tmp_auth_file)
            auth_payload = _load_auth_storage(tmp_auth_file)
        elif auth_output_file and auth_output_file.exists():
            auth_payload = _load_auth_storage(auth_output_file)
        elif tmp_auth_file and tmp_auth_file.exists():
            auth_payload = _load_auth_storage(tmp_auth_file)

        return True, auth_payload, (result.stderr or result.stdout or "").strip()
    finally:
        # The scratch directory can hold exported cookies/auth state; remove
        # it on every exit path, including timeouts and script failures.
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)
def _parse_netscape_cookies(path: Path) -> dict[tuple[str, str, str], tuple[str, str, str, str, str, str, str]]:
    """Parse a Netscape-format cookies file into a dict keyed by cookie identity.

    Args:
        path: Cookies file to read. A missing file yields an empty dict.

    Returns:
        A mapping of ``(domain, path, name)`` to the seven tab-separated
        fields of that cookie line. The key's domain has any ``#HttpOnly_``
        prefix removed, so the HttpOnly and plain form of one cookie share a
        key. The stored tuple keeps the domain field exactly as read, so the
        line can be written back unchanged.
    """
    http_only_prefix = "#HttpOnly_"

    cookies: dict[tuple[str, str, str], tuple[str, str, str, str, str, str, str]] = {}
    if not path.exists():
        return cookies

    for line in path.read_text().splitlines():
        if not line:
            continue
        # In this format HttpOnly cookies are written as ``#HttpOnly_<domain>``.
        # They look like comments but are real cookie rows and must be kept.
        if line.startswith("#") and not line.startswith(http_only_prefix):
            continue
        parts = line.split("\t")
        if len(parts) < 7:
            continue
        domain, include_subdomains, cookie_path, secure, expiry, name, value = parts[:7]
        key = (domain.removeprefix(http_only_prefix), cookie_path, name)
        cookies[key] = (domain, include_subdomains, cookie_path, secure, expiry, name, value)
    return cookies
def _load_auth_storage(path: Path) -> dict:
    """Load an auth storage JSON file, falling back to an empty payload.

    Args:
        path: JSON file to read.

    Returns:
        The decoded JSON object. A fresh empty payload (``TYPE``, ``cookies``,
        ``localStorage``, ``sessionStorage``) is returned instead when the
        file is missing, cannot be read or decoded, is not valid JSON, or
        does not contain a JSON object.
    """
    # Built per call so callers can mutate the result without sharing state.
    empty_payload: dict = {
        "TYPE": "auth",
        "cookies": [],
        "localStorage": {},
        "sessionStorage": {},
    }
    if not path.exists():
        return empty_payload
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError:
        # a corrupt file is treated like a missing one instead of crashing.
        return empty_payload
    return payload if isinstance(payload, dict) else empty_payload
def _apply_imported_user_agent(persona: Persona, auth_payload: dict | None) -> bool:
    """Store the user agent from an exported auth payload in the persona config.

    Args:
        persona: Object whose ``config`` is updated and saved.
        auth_payload: Exported auth payload; its ``user_agent`` entry is used.

    Returns:
        ``True`` when ``USER_AGENT`` was changed and the persona saved;
        ``False`` when the payload is empty, has no user agent, or the
        persona already has the same value.
    """
    imported_agent = str((auth_payload or {}).get("user_agent") or "").strip()
    if not imported_agent:
        return False

    current_config = dict(persona.config or {})
    if current_config.get("USER_AGENT") == imported_agent:
        return False

    # Assign a new dict instead of mutating the existing config in place.
    persona.config = {**current_config, "USER_AGENT": imported_agent}
    persona.save(update_fields=["config"])
    return True
def normalize_permissions(value, default):
    """Return ``value`` as a canonical permissions string, or ``default``.

    ``value`` is stringified, stripped and lower-cased; falsy values count as
    empty. The result is returned only if it is in ``VALID_PERMISSIONS``.
    """
    candidate = str(value or "").strip().lower()
    if candidate in VALID_PERMISSIONS:
        return candidate
    return default
def _ensure_permissions_column(cursor):
    """Backfill the ``permissions`` generated column on ``personas_persona``.

    Long-lived dev DBs have ``personas/0003_persona_permissions`` marked
    applied in ``django_migrations`` but the historical migration with that
    name predates the current GeneratedField design — the column never made
    it onto the table. Without this guard, the hydration query below fails
    with ``no such column: personas_persona.permissions``. Fresh installs
    already have the column, so this is a no-op on first-time setup.

    Args:
        cursor: Database cursor used to run the statements below. The
            statements use SQLite ``PRAGMA`` and ``json_extract`` syntax.
    """
    # ``table_info`` hides generated columns (SQLite docs: "this command
    # does not include the generated columns"). Fresh installs add
    # ``permissions`` as a STORED GeneratedField via 0003, which
    # ``table_xinfo`` reports but ``table_info`` does not — so the latter
    # would lie to us and we'd try to ALTER an already-present column.
    cursor.execute("PRAGMA table_xinfo(personas_persona)")
    # Each pragma row describes one column; index 1 is the column name.
    existing_cols = {row[1] for row in cursor.fetchall()}
    if "permissions" in existing_cols:
        return
    # See crawls/0016 — SQLite ALTER TABLE only allows VIRTUAL generated
    # columns; the runtime model's STORED declaration only applies to fresh
    # installs where the column lands during initial table creation.
    cursor.execute(
        "ALTER TABLE personas_persona "
        "ADD COLUMN permissions varchar(16) "
        "GENERATED ALWAYS AS (json_extract(config, '$.PERMISSIONS')) VIRTUAL",
    )
    # IF NOT EXISTS keeps the index creation safe to repeat.
    cursor.execute(
        "CREATE INDEX IF NOT EXISTS personas_persona_permissions_idx ON personas_persona (permissions)",
    )
def derive_persona_config(*, name: str, config: Mapping[str, Any] | None, persona_dir: Path) -> dict[str, Any]:
    """Build a persona's effective config by filling in paths under ``persona_dir``.

    Args:
        name: Persona name, always written to ``ACTIVE_PERSONA``.
        config: Explicit overrides; values present here are never replaced
            (except ``ACTIVE_PERSONA``). The mapping itself is not mutated.
        persona_dir: Directory the derived paths point into.

    Returns:
        A new dict holding ``config`` plus ``CHROME_USER_DATA_DIR`` and
        ``CHROME_DOWNLOADS_DIR`` defaults, and ``COOKIES_FILE`` /
        ``AUTH_STORAGE_FILE`` only when those files exist on disk.
    """
    merged: dict[str, Any] = dict(config or {})

    # Directory settings default to fixed subdirectories of the persona dir.
    directory_defaults = (
        ("CHROME_USER_DATA_DIR", "chrome_profile"),
        ("CHROME_DOWNLOADS_DIR", "chrome_downloads"),
    )
    for key, subdir in directory_defaults:
        merged.setdefault(key, str(persona_dir / subdir))

    # File settings are only advertised when the file is actually present.
    optional_files = (
        ("COOKIES_FILE", "cookies.txt"),
        ("AUTH_STORAGE_FILE", "auth.json"),
    )
    for key, filename in optional_files:
        candidate = persona_dir / filename
        if key not in merged and candidate.exists():
            merged[key] = str(candidate)

    merged["ACTIVE_PERSONA"] = name
    return merged
archivebox.config.common import get_config + + config["PERMISSIONS"] = normalize_permissions(get_config(include_machine=True).PERMISSIONS) + self.config = config + update_fields = kwargs.get("update_fields") + if update_fields is not None: + kwargs["update_fields"] = tuple(dict.fromkeys([*update_fields, "config"])) + super().save(*args, **kwargs) + + def __str__(self) -> str: + return self.name + + @property + def path(self) -> Path: + """Path to persona directory under PERSONAS_DIR.""" + from archivebox.config.constants import CONSTANTS + + return CONSTANTS.PERSONAS_DIR / self.name + + @property + def CHROME_USER_DATA_DIR(self) -> str: + """Derived path to Chrome user data directory for this persona.""" + return str(self.path / "chrome_profile") + + @property + def CHROME_DOWNLOADS_DIR(self) -> str: + """Derived path to Chrome downloads directory for this persona.""" + return str(self.path / "chrome_downloads") + + @property + def COOKIES_FILE(self) -> str: + """Derived path to cookies.txt file for this persona (if exists).""" + cookies_path = self.path / "cookies.txt" + return str(cookies_path) if cookies_path.exists() else "" + + @property + def AUTH_STORAGE_FILE(self) -> str: + """Derived path to auth.json for this persona (if it exists).""" + auth_path = self.path / "auth.json" + return str(auth_path) if auth_path.exists() else "" + + def get_derived_config(self) -> dict: + """ + Get config dict with derived paths filled in. 
+ + Returns dict with: + - All values from self.config JSONField + - CHROME_USER_DATA_DIR (derived from persona path) + - CHROME_DOWNLOADS_DIR (derived from persona path) + - COOKIES_FILE (derived from persona path, if file exists) + - AUTH_STORAGE_FILE (derived from persona path, if file exists) + - ACTIVE_PERSONA (set to this persona's name) + """ + return derive_persona_config(name=self.name, config=self.config, persona_dir=self.path) + + def ensure_dirs(self) -> None: + """Create persona directories if they don't exist.""" + self.path.mkdir(parents=True, exist_ok=True) + (self.path / "chrome_profile").mkdir(parents=True, exist_ok=True) + (self.path / "chrome_downloads").mkdir(parents=True, exist_ok=True) + + def cleanup_chrome_profile(self, profile_dir: Path) -> bool: + """Remove volatile Chrome state that should never be reused across launches.""" + cleaned = False + + if not profile_dir.exists(): + return False + + for path in profile_dir.rglob("*"): + if path.name in VOLATILE_PROFILE_FILE_NAMES: + try: + path.unlink() + cleaned = True + except OSError: + pass + + for dirname in VOLATILE_PROFILE_DIR_NAMES: + for path in profile_dir.rglob(dirname): + if not path.is_dir(): + continue + shutil.rmtree(path, ignore_errors=True) + cleaned = True + + for path in profile_dir.rglob("*.log"): + try: + path.unlink() + cleaned = True + except OSError: + pass + + return cleaned + + def cleanup_chrome(self) -> bool: + """Clean up volatile Chrome state for this persona's base profile.""" + return self.cleanup_chrome_profile(self.path / "chrome_profile") + + @contextmanager + def lock_runtime_for_crawl(self): + lock_path = self.path / ".archivebox-crawl-profile.lock" + lock_path.parent.mkdir(parents=True, exist_ok=True) + + with lock_path.open("w") as lock_file: + if fcntl is not None: + fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX) + try: + yield + finally: + if fcntl is not None: + fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN) + + def runtime_root_for_crawl(self, crawl) 
-> Path: + return Path(crawl.output_dir) / ".persona" / self.name + + def runtime_profile_dir_for_crawl(self, crawl) -> Path: + return self.runtime_root_for_crawl(crawl) / "chrome_profile" + + def runtime_downloads_dir_for_crawl(self, crawl) -> Path: + return self.runtime_root_for_crawl(crawl) / "chrome_downloads" + + def runtime_root_for_snapshot(self, snapshot) -> Path: + return Path(snapshot.output_dir) / ".persona" / self.name + + def runtime_profile_dir_for_snapshot(self, snapshot) -> Path: + return self.runtime_root_for_snapshot(snapshot) / "chrome_profile" + + def runtime_downloads_dir_for_snapshot(self, snapshot) -> Path: + return self.runtime_root_for_snapshot(snapshot) / "chrome_downloads" + + def copy_chrome_profile(self, source_dir: Path, destination_dir: Path) -> None: + destination_dir.parent.mkdir(parents=True, exist_ok=True) + shutil.rmtree(destination_dir, ignore_errors=True) + destination_dir.mkdir(parents=True, exist_ok=True) + + copy_cmd: list[str] | None = None + source_contents = f"{source_dir}/." 
+ + if sys.platform == "darwin": + copy_cmd = ["cp", "-cR", source_contents, str(destination_dir)] + elif sys.platform.startswith("linux"): + copy_cmd = ["cp", "-a", source_contents, str(destination_dir)] + + if copy_cmd: + result = subprocess.run(copy_cmd, capture_output=True, text=True) + if result.returncode == 0: + return + + shutil.rmtree(destination_dir, ignore_errors=True) + destination_dir.mkdir(parents=True, exist_ok=True) + + shutil.copytree(source_dir, destination_dir, symlinks=True, dirs_exist_ok=True) + + def prepare_runtime_for_crawl(self, crawl, chrome_binary: str = "") -> dict[str, str]: + self.ensure_dirs() + + template_dir = Path(self.CHROME_USER_DATA_DIR) + runtime_root = self.runtime_root_for_crawl(crawl) + runtime_profile_dir = self.runtime_profile_dir_for_crawl(crawl) + runtime_downloads_dir = self.runtime_downloads_dir_for_crawl(crawl) + + with self.lock_runtime_for_crawl(): + if runtime_root.exists(): + shutil.rmtree(runtime_root, ignore_errors=True) + if template_dir.exists() and any(template_dir.iterdir()): + self.copy_chrome_profile(template_dir, runtime_profile_dir) + else: + runtime_profile_dir.mkdir(parents=True, exist_ok=True) + + runtime_downloads_dir.mkdir(parents=True, exist_ok=True) + self.cleanup_chrome_profile(runtime_profile_dir) + + (runtime_root / "persona_name.txt").write_text(self.name) + (runtime_root / "template_dir.txt").write_text(str(template_dir)) + if chrome_binary: + (runtime_root / "chrome_binary.txt").write_text(chrome_binary) + + return { + "CHROME_USER_DATA_DIR": str(runtime_profile_dir), + "CHROME_DOWNLOADS_DIR": str(runtime_downloads_dir), + } + + def prepare_runtime_for_snapshot(self, snapshot, chrome_binary: str = "") -> dict[str, str]: + crawl_runtime_profile_dir = self.runtime_profile_dir_for_crawl(snapshot.crawl) + template_dir = crawl_runtime_profile_dir if crawl_runtime_profile_dir.exists() else Path(self.CHROME_USER_DATA_DIR) + runtime_root = self.runtime_root_for_snapshot(snapshot) + 
runtime_profile_dir = self.runtime_profile_dir_for_snapshot(snapshot) + runtime_downloads_dir = self.runtime_downloads_dir_for_snapshot(snapshot) + + if runtime_root.exists(): + shutil.rmtree(runtime_root, ignore_errors=True) + if template_dir.exists() and any(template_dir.iterdir()): + self.copy_chrome_profile(template_dir, runtime_profile_dir) + else: + runtime_profile_dir.mkdir(parents=True, exist_ok=True) + + runtime_downloads_dir.mkdir(parents=True, exist_ok=True) + self.cleanup_chrome_profile(runtime_profile_dir) + + (runtime_root / "persona_name.txt").write_text(self.name) + (runtime_root / "template_dir.txt").write_text(str(template_dir)) + if chrome_binary: + (runtime_root / "chrome_binary.txt").write_text(chrome_binary) + + return { + "CHROME_USER_DATA_DIR": str(runtime_profile_dir), + "CHROME_DOWNLOADS_DIR": str(runtime_downloads_dir), + } + + def cleanup_runtime_for_crawl(self, crawl) -> None: + shutil.rmtree(Path(crawl.output_dir) / ".persona", ignore_errors=True) + + @classmethod + def get_or_create_default(cls) -> "Persona": + """Get or create the Default persona.""" + persona, _ = cls.objects.get_or_create(name="Default") + return persona + + @classmethod + def cleanup_chrome_all(cls) -> int: + """Clean up Chrome state files for all personas.""" + cleaned = 0 + for persona in cls.objects.all(): + if persona.cleanup_chrome(): + cleaned += 1 + return cleaned diff --git a/archivebox/plugins/__init__.py b/archivebox/plugins/__init__.py new file mode 100644 index 0000000000..4ac8b362ee --- /dev/null +++ b/archivebox/plugins/__init__.py @@ -0,0 +1 @@ +__package__ = "archivebox.plugins" diff --git a/archivebox/plugins/apps.py b/archivebox/plugins/apps.py new file mode 100644 index 0000000000..9d3c3bb653 --- /dev/null +++ b/archivebox/plugins/apps.py @@ -0,0 +1,9 @@ +__package__ = "archivebox.plugins" + +from django.apps import AppConfig + + +class PluginsConfig(AppConfig): + default_auto_field = "django.db.models.BigAutoField" + name = "archivebox.plugins" 
def iter_plugin_dirs() -> list[Path]:
    """Collect the plugin directories found under the built-in and user plugin roots.

    Roots that do not exist are skipped, as are entries that are not
    directories or whose name starts with an underscore.
    """
    return [
        candidate
        for root in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR)
        if root.exists()
        for candidate in root.iterdir()
        if candidate.is_dir() and not candidate.name.startswith("_")
    ]


@lru_cache(maxsize=1)
def get_plugins() -> list[str]:
    """
    Return the sorted, de-duplicated names of all discovered plugin directories.

    A directory counts as a plugin when it contains at least one hook file
    (``on_*__*.*``), a ``config.json``, or a ``templates/icon.html``. That
    includes non-extractor plugins such as binary providers and shared base
    plugins. The result is cached for the lifetime of the process.
    """

    def _looks_like_plugin(directory: Path) -> bool:
        return (
            any(directory.glob("on_*__*.*"))
            or (directory / "config.json").exists()
            or (directory / "templates" / "icon.html").exists()
        )

    return sorted({directory.name for directory in iter_plugin_dirs() if _looks_like_plugin(directory)})


def get_plugin_name(plugin: str) -> str:
    """
    Strip a leading numeric ordering prefix from a plugin name.

    Examples:
        '10_title' -> 'title'
        '26_readability' -> 'readability'
        '50_parse_html_urls' -> 'parse_html_urls'

    Names without a ``<digits>_`` prefix are returned unchanged.
    """
    prefix, separator, remainder = plugin.partition("_")
    if separator and prefix.isdigit():
        return remainder
    return plugin


def get_enabled_plugins(config: ConfigLookup | None = None, **config_kwargs: Any) -> list[str]:
    """
    Return the names of the plugins that are currently enabled.

    When ``config`` is omitted it is loaded via ``get_config(**config_kwargs)``.
    A truthy ``PLUGINS`` config value is treated as an explicit override and
    returned as a cleaned list. Otherwise every discovered plugin is kept
    when its special config reports it as enabled.
    """
    if config is None:
        from archivebox.config.common import get_config

        config = get_config(**config_kwargs)

    def _non_blank(items: Iterable[Any]) -> list[str]:
        # Stringify, trim, and drop entries that end up empty.
        trimmed = (str(item).strip() for item in items)
        return [item for item in trimmed if item]

    def normalize_enabled_plugins(value: Any) -> list[str]:
        if value is None:
            return []
        if isinstance(value, (list, tuple, set)):
            return _non_blank(value)
        if not isinstance(value, str):
            return _non_blank([value])

        raw = value.strip()
        if not raw:
            return []
        if raw.startswith("["):
            # Looks like a JSON array; fall back to comma splitting if it is not one.
            try:
                parsed = json.loads(raw)
            except json.JSONDecodeError:
                parsed = None
            if isinstance(parsed, list):
                return _non_blank(parsed)
        return _non_blank(raw.split(","))

    plugins_override = config.get("PLUGINS")
    if plugins_override:
        return normalize_enabled_plugins(plugins_override)

    return [name for name in get_plugins() if get_plugin_special_config(name, config)["enabled"]]
def discover_plugins_that_provide_interface(
    module_name: str,
    required_attrs: list[str],
    plugin_prefix: str | None = None,
) -> dict[str, Any]:
    """
    Discover plugins that provide a specific Python module with required interface.

    This enables dynamic plugin discovery for features like search backends,
    storage backends, etc. without hardcoding imports.

    Args:
        module_name: File stem looked up inside each plugin dir (``<module_name>.py``).
        required_attrs: Names the loaded module must define at module level.
        plugin_prefix: When given, only plugin dirs whose name starts with it
            are considered, and the prefix is removed from the returned key.

    Returns:
        Mapping of backend name to the loaded module object. Plugins whose
        module is missing, incomplete, or fails to import are left out.
    """
    import importlib.util

    backends = {}

    for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
        if not base_dir.exists():
            continue

        for plugin_dir in base_dir.iterdir():
            if not plugin_dir.is_dir():
                continue

            plugin_name = plugin_dir.name
            if plugin_prefix and not plugin_name.startswith(plugin_prefix):
                continue

            module_path = plugin_dir / f"{module_name}.py"
            if not module_path.exists():
                continue

            try:
                spec = importlib.util.spec_from_file_location(
                    f"archivebox.dynamic_plugins.{plugin_name}.{module_name}",
                    module_path,
                )
                if spec is None or spec.loader is None:
                    continue

                module = importlib.util.module_from_spec(spec)
                spec.loader.exec_module(module)

                if not all(attr in vars(module) for attr in required_attrs):
                    continue

                if plugin_prefix:
                    backend_name = plugin_name[len(plugin_prefix) :]
                else:
                    backend_name = plugin_name

                backends[backend_name] = module

            except Exception:
                # Deliberately best-effort: one broken plugin module must not
                # prevent the remaining plugins from being discovered.
                continue

    return backends


def get_search_backends() -> dict[str, Any]:
    """
    Discover all available search backend plugins.

    Search backends must provide a search.py module with:
    - search(query: str) -> List[str] (returns snapshot IDs)
    - flush(snapshot_ids: Iterable[str]) -> None
    """
    return discover_plugins_that_provide_interface(
        module_name="search",
        required_attrs=["search", "flush"],
        plugin_prefix="search_backend_",
    )


@lru_cache(maxsize=1)
def discover_plugin_configs() -> dict[str, dict[str, Any]]:
    """
    Discover all plugin config.json schemas.

    Each plugin can define a config.json file with JSONSchema defining
    its configuration options. This is intentionally cached because these
    schemas are plugin package metadata, not live user config; runtime values
    still come from env/db config at each callsite.

    Returns:
        Mapping of plugin directory name to its schema dict. Only schemas that
        are JSON objects with ``"type": "object"`` and a ``"properties"`` key
        are included. Files that cannot be read or decoded are reported on
        stderr and skipped.
    """
    configs = {}

    for plugin_dir in iter_plugin_dirs():
        config_path = plugin_dir / "config.json"
        if not config_path.exists():
            continue

        try:
            # Read as UTF-8 explicitly so parsing does not depend on the
            # process locale.
            with open(config_path, encoding="utf-8") as f:
                schema = json.load(f)
        except (ValueError, OSError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError, so
            # a single undecodable file cannot abort discovery of the rest.
            import sys

            print(f"Warning: Failed to load config.json from {plugin_dir.name}: {e}", file=sys.stderr)
            continue

        if not isinstance(schema, dict):
            continue
        if schema.get("type") != "object":
            continue
        if "properties" not in schema:
            continue

        configs[plugin_dir.name] = schema

    return configs
+ + Each plugin can define a config.json file with JSONSchema defining + its configuration options. This is intentionally cached because these + schemas are plugin package metadata, not live user config; runtime values + still come from env/db config at each callsite. + """ + configs = {} + + for plugin_dir in iter_plugin_dirs(): + config_path = plugin_dir / "config.json" + if not config_path.exists(): + continue + + try: + with open(config_path) as f: + schema = json.load(f) + + if not isinstance(schema, dict): + continue + if schema.get("type") != "object": + continue + if "properties" not in schema: + continue + + configs[plugin_dir.name] = schema + + except (json.JSONDecodeError, OSError) as e: + import sys + + print(f"Warning: Failed to load config.json from {plugin_dir.name}: {e}", file=sys.stderr) + continue + + return configs + + +def get_plugin_special_config(plugin_name: str, config: ConfigLookup, _visited: set[str] | None = None) -> PluginSpecialConfig: + """ + Extract special config keys for a plugin following naming conventions. 
+ + ArchiveBox recognizes 3 special config key patterns per plugin: + - {PLUGIN}_ENABLED: Enable/disable toggle (default True) + - {PLUGIN}_TIMEOUT: Plugin-specific timeout (fallback to TIMEOUT, default 300) + - {PLUGIN}_BINARY: Primary binary path (default to plugin_name) + """ + plugin_upper = plugin_name.upper() + + plugins_whitelist = config.get("PLUGINS", "") + if plugins_whitelist: + plugin_configs = discover_plugin_configs() + plugin_names = {p.strip().lower() for p in plugins_whitelist.split(",") if p.strip()} + pending = list(plugin_names) + + while pending: + current = pending.pop() + schema = plugin_configs.get(current, {}) + required_plugins = schema.get("required_plugins", []) + if not isinstance(required_plugins, list): + continue + + for required_plugin in required_plugins: + required_plugin_name = str(required_plugin).strip().lower() + if not required_plugin_name or required_plugin_name in plugin_names: + continue + plugin_names.add(required_plugin_name) + pending.append(required_plugin_name) + + if plugin_name.lower() not in plugin_names: + enabled = False + else: + enabled_key = f"{plugin_upper}_ENABLED" + enabled = config.get(enabled_key) + if enabled is None: + enabled = True + elif isinstance(enabled, str): + enabled = enabled.lower() not in ("false", "0", "no", "") + else: + enabled_key = f"{plugin_upper}_ENABLED" + enabled = config.get(enabled_key) + if enabled is None: + enabled = True + elif isinstance(enabled, str): + enabled = enabled.lower() not in ("false", "0", "no", "") + + plugin_configs = discover_plugin_configs() + plugin_name_lower = plugin_name.lower() + + if enabled: + visited = _visited or set() + if plugin_name_lower not in visited: + next_visited = visited | {plugin_name_lower} + schema = plugin_configs.get(plugin_name_lower, {}) + required_plugins = schema.get("required_plugins", []) + if isinstance(required_plugins, list): + for required_plugin in required_plugins: + required_plugin_name = str(required_plugin).strip() + if 
@lru_cache(maxsize=None)
def get_plugin_template(plugin: str, template_name: str, fallback: bool = True) -> str | None:
    """
    Load the text of a plugin's ``templates/<template_name>.html`` file.

    Args:
        plugin: Plugin name (e.g., 'screenshot', '15_singlefile')
        template_name: One of 'icon', 'card', 'full'
        fallback: If True, return default template if plugin template not found

    Returns:
        The template text from the first matching plugin directory that has
        the file. Otherwise the entry from DEFAULT_TEMPLATES (or "") when
        ``fallback`` is true, else None.
    """
    base_name = get_plugin_name(plugin)
    if base_name in ("yt-dlp", "youtube-dl"):
        base_name = "ytdlp"

    prefixed_suffix = f"_{base_name}"
    template_filename = f"{template_name}.html"

    for plugin_dir in iter_plugin_dirs():
        dir_name = plugin_dir.name
        # Match either the bare name or a numerically prefixed directory.
        if dir_name != base_name and not dir_name.endswith(prefixed_suffix):
            continue
        candidate = plugin_dir / "templates" / template_filename
        if candidate.exists():
            return candidate.read_text()

    if not fallback:
        return None
    return DEFAULT_TEMPLATES.get(template_name, "")


@lru_cache(maxsize=None)
def get_plugin_icon(plugin: str) -> str:
    """
    Return the plugin's icon markup, marked safe for template rendering.

    Uses the stripped contents of the plugin's icon.html template, or a
    generic fallback glyph when the plugin has no (non-empty) icon template.
    """
    icon_markup = get_plugin_template(plugin, "icon", fallback=False)
    if not icon_markup:
        return mark_safe("๐Ÿ“")
    return mark_safe(icon_markup.strip())
"parse_rss_urls", + "parse_txt_urls", + "claudecode", + "claudecodecleanup", + "claudecodeextract", + ), + ), +) +HIDDEN_PLUGIN_CONFIG_UI_PLUGINS = { + "apt", + "base", + "bash", + "brew", + "cargo", + "chromewebstore", + "env", + "media", + "npm", + "opencode", + "pip", + "puppeteer", + "search_backend_ripgrep", + "search_backend_sonic", + "search_backend_sqlite", + "ssl", +} +TIMEOUT_INPUT_PATTERN = r"(0|[1-9][0-9]*|[0-9]+(?:\.[0-9]+)?\s*(?:s|sec|secs|second|seconds|m|min|mins|minute|minutes|h|hr|hrs|hour|hours))" + + +def get_plugin_choices(): + """Get available extractor plugins from discovered hooks.""" + return [(name, name) for name in get_plugins()] + + +def get_plugin_choice_label(plugin_name: str, plugin_configs: dict[str, dict]) -> str: + schema = plugin_configs.get(plugin_name, {}) + description = str(schema.get("description") or "").strip() + if not description: + return plugin_name + icon_html = get_plugin_icon(plugin_name) + + return format_html( + '{}{}', + icon_html, + plugin_name, + ) + + +def get_choice_field(form: forms.Form, name: str) -> forms.ChoiceField: + field = form.fields[name] + if not isinstance(field, forms.ChoiceField): + raise TypeError(f"{name} must be a ChoiceField") + return field + + +def _plugin_config_input_name(plugin_name: str, config_key: str) -> str: + return f"{PLUGIN_CONFIG_FIELD_PREFIX}{plugin_name}__{config_key}" + + +def _schema_types(schema: Mapping[str, Any]) -> list[str]: + raw_type = schema.get("type") or "string" + if isinstance(raw_type, list): + return [str(item) for item in raw_type] + return [str(raw_type)] + + +def _jsonish(value: Any) -> str: + if isinstance(value, str): + return value + return json.dumps(value, sort_keys=True, default=str) + + +def _same_config_value(left: Any, right: Any) -> bool: + return json.dumps(left, sort_keys=True, default=str) == json.dumps(right, sort_keys=True, default=str) + + +def _coerce_plugin_config_value(raw_value: Any, schema: Mapping[str, Any]) -> Any: + schema_types = 
class PluginConfigFormMixin:
    """Form mixin that builds the grouped plugin picker and per-plugin config inputs.

    The methods read ``self.fields``, ``self.data``, ``self.is_bound`` and call
    ``self.add_error``, so the mixin is meant to be combined with a Django form.
    """

    # Group dicts consumed by the template; populated by build_plugin_groups().
    plugin_groups: list[dict[str, Any]]
    # When False, only config keys whose scope is "crawl_frozen" are shown/accepted.
    allow_crawl_execution_config_fields = True

    def build_plugin_groups(self, runtime_config: Mapping[str, Any] | None = None) -> None:
        """Populate the group choice fields and ``self.plugin_groups``.

        Plugins not named in PLUGIN_GROUP_DEFINITIONS and not hidden via
        HIDDEN_PLUGIN_CONFIG_UI_PLUGINS are collected into an "Other" group.
        Groups with no discovered plugin are omitted.
        """
        all_plugins = get_plugins()
        plugin_configs = discover_plugin_configs()
        runtime_config = runtime_config or get_config()
        self.plugin_config_binary_urls = get_plugin_config_binary_urls(runtime_config)
        # The last element of every group definition is its tuple of plugin names.
        grouped_plugins = set().union(*(group[-1] for group in PLUGIN_GROUP_DEFINITIONS))
        other_plugins = tuple(sorted(set(all_plugins) - grouped_plugins - HIDDEN_PLUGIN_CONFIG_UI_PLUGINS))

        for field_name, *_rest, plugin_names in PLUGIN_GROUP_DEFINITIONS:
            if field_name in self.fields:
                get_choice_field(self, field_name).choices = [
                    (p, get_plugin_choice_label(p, plugin_configs)) for p in plugin_names if p in all_plugins
                ]

        if "other_plugins" in self.fields:
            get_choice_field(self, "other_plugins").choices = [(p, get_plugin_choice_label(p, plugin_configs)) for p in other_plugins]

        group_specs = (
            *PLUGIN_GROUP_DEFINITIONS,
            ("other_plugins", "Other", "", "", "", other_plugins),
        )
        # Resolved once here and shared by every card.
        binary_url_lookup = _build_required_binary_url_lookup(plugin_configs, runtime_config)
        self.plugin_groups = [
            {
                "field_name": field_name,
                "title": title,
                "note": note,
                "dom_id": dom_id,
                "select_all_group": select_all_group,
                "show_selectors": field_name in self.fields,
                "plugins": self._build_plugin_cards(field_name, plugin_names, plugin_configs, runtime_config, binary_url_lookup),
            }
            for field_name, title, note, dom_id, select_all_group, plugin_names in group_specs
            if any(plugin in all_plugins for plugin in plugin_names)
        ]

    def _build_plugin_cards(
        self,
        field_name: str,
        plugin_names: Iterable[str],
        plugin_configs: dict[str, dict[str, Any]],
        runtime_config: Mapping[str, Any],
        binary_url_lookup: Mapping[str, str] | None = None,
    ) -> list[dict[str, Any]]:
        """Build one card dict per plugin in a group.

        When the form has a field named ``field_name`` its choices and selected
        values are used; otherwise the cards come from ``plugin_names`` filtered
        to discovered plugins, with nothing selected.
        """
        if field_name in self.fields:
            choices = list(get_choice_field(self, field_name).choices)
            selected_values = set(self.data.getlist(field_name)) if self.is_bound else set(get_choice_field(self, field_name).initial or [])
        else:
            all_plugins = get_plugins()
            choices = [(p, get_plugin_choice_label(p, plugin_configs)) for p in plugin_names if p in all_plugins]
            selected_values = set()

        cards = []
        for index, (plugin_name, label) in enumerate(choices):
            schema = plugin_configs.get(str(plugin_name), {})
            properties = schema.get("properties") or {}
            enabled_config_key = f"{str(plugin_name).upper()}_ENABLED"
            enabled_prop_schema = properties.get(enabled_config_key)
            # Only expose the toggle key when the schema declares it as a boolean.
            if not isinstance(enabled_prop_schema, dict) or "boolean" not in _schema_types(enabled_prop_schema):
                enabled_config_key = ""
            config_fields = [
                self._build_plugin_config_field(str(plugin_name), str(config_key), prop_schema, runtime_config)
                for config_key, prop_schema in properties.items()
                if (
                    isinstance(prop_schema, dict)
                    and str(config_key) not in CONSTANTS_CONFIG
                    and (self.allow_crawl_execution_config_fields or ArchiveBoxConfig.scope_for_key(str(config_key)) == "crawl_frozen")
                )
            ]
            cards.append(
                {
                    "name": str(plugin_name),
                    "label": label,
                    "checked": str(plugin_name) in selected_values,
                    "checkbox_id": f"id_{field_name}_{index}",
                    "enabled_config_key": enabled_config_key,
                    "description": str(schema.get("description") or "").strip(),
                    "source_url": f"https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/{plugin_name}",
                    "docs_url": f"https://archivebox.github.io/abx-plugins/#{plugin_name}",
                    "required_plugins": [str(item) for item in schema.get("required_plugins") or []],
                    "required_binary_links": _build_required_binary_links(
                        schema.get("required_binaries") or [],
                        runtime_config,
                        binary_url_lookup,
                    ),
                    "config_fields": config_fields,
                    "config_count": len(config_fields),
                },
            )
        return cards

    def _build_plugin_config_field(
        self,
        plugin_name: str,
        config_key: str,
        prop_schema: Mapping[str, Any],
        runtime_config: Mapping[str, Any],
    ) -> dict[str, Any]:
        """Describe one config input (kind, value, options, limits) for rendering.

        The current value comes from ``runtime_config`` (falling back to the
        schema default) unless the form is bound and submitted this input, in
        which case the submitted value is used, coerced when possible.
        Sensitive keys never have their value placed in ``"value"``.
        """
        schema_types = _schema_types(prop_schema)
        enum = prop_schema.get("enum")
        input_name = _plugin_config_input_name(plugin_name, config_key)
        current_value = runtime_config.get(config_key, prop_schema.get("default", ""))
        if self.is_bound and input_name in self.data:
            try:
                current_value = _coerce_plugin_config_value(self.data.get(input_name), prop_schema)
            except (TypeError, ValueError, json.JSONDecodeError, forms.ValidationError):
                # Keep the raw submission so the user sees what they typed.
                current_value = self.data.get(input_name)

        default_value = prop_schema.get("default", "")
        fallback_key = prop_schema.get("x-fallback")
        # A fallback key is displayed as "{KEY}" in place of a literal default.
        default_display = f"{{{fallback_key}}}" if fallback_key else default_value
        from archivebox.config.common import is_sensitive_config_key

        is_sensitive = bool(prop_schema.get("x-sensitive")) or is_sensitive_config_key(config_key)
        input_value = "" if is_sensitive else _jsonish(current_value)
        field_kind = "text"
        input_type = "text"
        options = []

        if "boolean" in schema_types:
            field_kind = "boolean"
            input_value = "true" if bool(current_value) else "false"
        elif isinstance(enum, list) and enum:
            field_kind = "select"
            options = [
                {
                    "value": str(option),
                    "label": str(option),
                    "selected": str(option) == str(current_value),
                }
                for option in enum
            ]
        elif "integer" in schema_types or "number" in schema_types:
            field_kind = "number"
            input_type = "number"
        elif "array" in schema_types or "object" in schema_types:
            field_kind = "json"
            input_value = "" if is_sensitive else json.dumps(current_value, indent=2, sort_keys=True, default=str)
        elif is_sensitive:
            input_type = "password"
        else:
            input_value = "" if is_sensitive else str(current_value)

        return {
            "key": config_key,
            "input_name": input_name,
            "kind": field_kind,
            "input_type": input_type,
            "value": input_value,
            "checked": bool(current_value),
            "options": options,
            "description": str(prop_schema.get("description") or "").strip(),
            "default": _jsonish(default_display),
            # Sensitive values are summarized as "configured" instead of shown.
            "current": "configured"
            if is_sensitive and current_value
            else (str(current_value) if "string" in schema_types else _jsonish(current_value)),
            "current_url": self.plugin_config_binary_urls.get(config_key, "") if str(config_key).endswith("_BINARY") else "",
            "is_sensitive": is_sensitive,
            "minimum": prop_schema.get("minimum"),
            "maximum": prop_schema.get("maximum"),
            "pattern": prop_schema.get("pattern"),
            "type_label": " / ".join(schema_types),
        }

    def clean_plugin_config_overrides(self, effective_config: Mapping[str, Any] | None = None) -> dict[str, Any]:
        """Collect submitted plugin config values that differ from the effective config.

        Returns an empty dict for an unbound form. Coercion failures and
        conflicting values for the same key submitted under two plugins are
        reported via ``self.add_error("config", ...)`` and left out of the result.
        """
        if not self.is_bound:
            return {}

        effective_config = effective_config or get_config()
        overrides: dict[str, Any] = {}
        # Remembers which plugin supplied each key, for the conflict message.
        sources: dict[str, str] = {}

        for plugin_name, schema in discover_plugin_configs().items():
            for config_key, prop_schema in (schema.get("properties") or {}).items():
                if not isinstance(prop_schema, dict):
                    continue

                input_name = _plugin_config_input_name(plugin_name, config_key)
                if input_name not in self.data:
                    continue
                if str(config_key) in CONSTANTS_CONFIG:
                    continue
                if not self.allow_crawl_execution_config_fields and ArchiveBoxConfig.scope_for_key(str(config_key)) != "crawl_frozen":
                    continue

                raw_value: Any = self.data.get(input_name)
                # Array-of-enum inputs are read with getlist() to get all submitted values.
                if "array" in _schema_types(prop_schema) and isinstance(prop_schema.get("enum"), list):
                    raw_value = self.data.getlist(input_name)

                from archivebox.config.common import SENSITIVE_CONFIG_VALUE_REDACTED, is_sensitive_config_key

                # A blank or redacted sensitive input is skipped, so no override is recorded for it.
                if (prop_schema.get("x-sensitive") or is_sensitive_config_key(config_key)) and raw_value in (
                    "",
                    SENSITIVE_CONFIG_VALUE_REDACTED,
                ):
                    continue

                try:
                    coerced_value = _coerce_plugin_config_value(raw_value, prop_schema)
                except (TypeError, ValueError, json.JSONDecodeError) as err:
                    self.add_error("config", forms.ValidationError(f"{config_key}: {err}"))
                    continue
                except forms.ValidationError as err:
                    self.add_error("config", forms.ValidationError(f"{config_key}: {err.messages[0]}"))
                    continue

                base_value = effective_config.get(config_key, prop_schema.get("default", ""))
                # Values equal to the effective config are not overrides.
                if _same_config_value(coerced_value, base_value):
                    continue

                existing_value = overrides.get(config_key)
                if config_key in overrides and not _same_config_value(existing_value, coerced_value):
                    self.add_error(
                        "config",
                        forms.ValidationError(
                            f"{config_key} was set differently under {sources[config_key]} and {plugin_name}. Set it once in Custom config overrides.",
                        ),
                    )
                    continue

                overrides[config_key] = coerced_value
                sources[config_key] = plugin_name

        return overrides

    def plugin_config_keys(self) -> set[str]:
        """Return every config key declared (with a dict schema) by any plugin's config.json."""
        return {
            str(config_key)
            for schema in discover_plugin_configs().values()
            for config_key, prop_schema in (schema.get("properties") or {}).items()
            if isinstance(prop_schema, dict)
        }
forms.ValidationError(f"{config_key}: {err.messages[0]}")) + continue + + base_value = effective_config.get(config_key, prop_schema.get("default", "")) + if _same_config_value(coerced_value, base_value): + continue + + existing_value = overrides.get(config_key) + if config_key in overrides and not _same_config_value(existing_value, coerced_value): + self.add_error( + "config", + forms.ValidationError( + f"{config_key} was set differently under {sources[config_key]} and {plugin_name}. Set it once in Custom config overrides.", + ), + ) + continue + + overrides[config_key] = coerced_value + sources[config_key] = plugin_name + + return overrides + + def plugin_config_keys(self) -> set[str]: + return { + str(config_key) + for schema in discover_plugin_configs().values() + for config_key, prop_schema in (schema.get("properties") or {}).items() + if isinstance(prop_schema, dict) + } + + +_BINARY_TEMPLATE_PATTERN = re.compile(r"\{([A-Z_][A-Z0-9_]*)\}") + + +def _resolve_required_binary_name(template_name: str, runtime_config: Mapping[str, Any]) -> str: + if "{" not in template_name: + return template_name + + def _replace(match: re.Match[str]) -> str: + key = match.group(1) + try: + value = runtime_config.get(key) + except Exception: + value = None + if value is None or value == "": + return match.group(0) + return str(value) + + resolved = _BINARY_TEMPLATE_PATTERN.sub(_replace, template_name).strip() + if not resolved: + return template_name + return Path(resolved).name if "/" in resolved else resolved + + +def _iter_required_binary_names( + required_binaries: Iterable[Any], + runtime_config: Mapping[str, Any], +) -> Iterable[str]: + for item in required_binaries or []: + if not isinstance(item, dict): + continue + raw_name = str(item.get("name") or "").strip() + if not raw_name: + continue + resolved = _resolve_required_binary_name(raw_name, runtime_config) + if resolved: + yield resolved + + +def _build_required_binary_url_lookup( + plugin_configs: Mapping[str, 
def _build_required_binary_links(
    required_binaries: list[dict[str, Any]],
    runtime_config: Mapping[str, Any],
    binary_url_lookup: Mapping[str, str] | None = None,
) -> list[dict[str, str]]:
    """
    Build ``{"name", "url"}`` link dicts for a plugin's required binaries.

    Args:
        required_binaries: Raw ``required_binaries`` entries from a plugin schema.
        runtime_config: Config used to resolve templated binary names.
        binary_url_lookup: Optional precomputed ``name -> url`` mapping; names that
            are absent (or map to a falsy URL) fall back to the environment binary URL.

    Returns:
        One link dict per distinct resolved name, in first-seen order.
    """
    from archivebox.config.views import get_environment_binary_url

    known_urls = binary_url_lookup or {}
    # dict.fromkeys drops duplicate names while keeping first-seen order.
    distinct_names = dict.fromkeys(_iter_required_binary_names(required_binaries, runtime_config))
    return [
        {"name": binary_name, "url": known_urls.get(binary_name) or get_environment_binary_url(binary_name)}
        for binary_name in distinct_names
    ]
discover_plugin_configs().values() + for config_key, prop_schema in (schema.get("properties") or {}).items() + if isinstance(prop_schema, dict) and str(config_key).endswith("_BINARY") + } + urls: dict[str, str] = {} + machine = Machine.current() + for key in binary_keys: + value = str(runtime_config.get(key) or "").strip() + if not value: + continue + name = Path(value).name if "/" in value else value + binary = Binary.objects.get_valid_binary(value, machine=machine) + if binary is None and "/" in value: + binary = ( + Binary.objects.exclude(abspath="") + .exclude(abspath__isnull=True) + .filter(machine=machine, abspath=value) + .order_by("-modified_at") + .first() + ) + if binary is None and name != value: + binary = Binary.objects.get_valid_binary(name, machine=machine) + binary_name = binary.name if binary is not None else name + urls[key] = get_installed_binary_change_url(binary_name, binary) or get_environment_binary_url(name) + return urls diff --git a/archivebox/plugins/hooks.py b/archivebox/plugins/hooks.py new file mode 100644 index 0000000000..4b596195ff --- /dev/null +++ b/archivebox/plugins/hooks.py @@ -0,0 +1,632 @@ +""" +Hook discovery and execution helpers for ArchiveBox plugins. + +ArchiveBox no longer drives plugin execution itself during normal crawls. +`abx-dl` owns the live runtime and emits typed bus events; ArchiveBox mainly: + +- discovers hook files for inspection / docs / legacy direct execution helpers +- executes individual hook scripts when explicitly requested +- parses hook stdout JSONL records into ArchiveBox models when needed + +Hook-backed event families are discovered from filenames like: + on_CrawlSetup__* + on_Snapshot__* + +Internal bus event names are normalized to the corresponding +`on_{EventFamily}__*` prefix by a simple string transform. If no scripts exist +for that prefix, discovery returns `[]`. + +Directory structure: + abx_plugins/plugins//on___. (built-in package) + data/custom_plugins//on___. 
(user) + +Hook contract: + Input: --url= (and other --key=value args) + Output: JSONL records to stdout, files to $PWD + Exit: 0 = success, non-zero = failure + +Execution order: + - Hooks are named with two-digit prefixes (00-99) and sorted lexicographically by filename + - Foreground hooks run sequentially in that order + - Background hooks (.bg suffix) run concurrently and do not block foreground progress + - After all foreground hooks complete, background hooks receive SIGTERM and must finalize + +Hook naming convention: + on_{EventFamily}__{run_order}_{description}[.bg].{ext} + +API: + discover_hooks(event) -> List[Path] Find hook scripts for a hook-backed event family + run_hook(script, ...) -> Process Execute a hook script directly + is_background_hook(name) -> bool Check if hook is background (.bg suffix) +""" + +__package__ = "archivebox.plugins" + +import json +import os +from collections.abc import Mapping +from pathlib import Path +from typing import TYPE_CHECKING, Any, Optional, Protocol, TypeGuard, runtime_checkable + +from archivebox.config.constants import CONSTANTS +from archivebox.config.version import VERSION +from archivebox.misc.util import fix_url_from_markdown, sanitize_extracted_url +from archivebox.plugins.discovery import ( + BUILTIN_PLUGINS_DIR, + USER_PLUGINS_DIR, + ConfigLookup, + get_plugin_special_config, +) + +if TYPE_CHECKING: + from archivebox.machine.models import Process + + +@runtime_checkable +class ConfigDump(Protocol): + def as_dict(self) -> dict[str, Any]: ... 
def is_background_hook(hook_name: str) -> bool:
    """
    Report whether a hook filename marks the hook as a background hook.

    A name counts as background when it contains either marker:
      - ``.bg.``          (e.g. ``on_Snapshot__10_chrome_tab.daemon.bg.js``)
      - ``__background``  (older naming convention, still accepted)

    Args:
        hook_name: Hook filename, e.g. 'on_Snapshot__63_media.finite.bg.py'.

    Returns:
        True if either marker occurs anywhere in ``hook_name``, otherwise False.

    Examples:
        is_background_hook('on_Snapshot__10_chrome_tab.daemon.bg.js') -> True
        is_background_hook('on_Snapshot__50_wget.py') -> False
        is_background_hook('on_Snapshot__63_media.finite.bg.py') -> True
    """
    background_markers = (".bg.", "__background")
    return any(marker in hook_name for marker in background_markers)
def discover_hooks(
    event_name: str,
    filter_disabled: bool = True,
    config: ConfigLookup | None = None,
    **config_kwargs: Any,
) -> list[Path]:
    """
    Find all hook scripts for an event family.

    Searches the built-in and user plugin directories for ``on_{EventFamily}__*``
    scripts with a ``.sh``, ``.py`` or ``.js`` extension, both inside plugin
    subdirectories and directly in the plugins root.

    Hook naming convention uses numeric prefixes to control order:
        on_Snapshot__10_title.py          # runs first
        on_Snapshot__15_singlefile.py     # runs second
        on_Snapshot__26_readability.py    # runs later

    Args:
        event_name: Hook event family or event class name
            (e.g. 'CrawlSetupEvent', 'Snapshot'). A trailing ``Event`` is stripped.
        filter_disabled: If True (default), drop hooks whose plugin reports
            ``enabled`` as falsy via ``get_plugin_special_config``. Hooks that live
            directly in a plugins root directory are never filtered.
        config: Optional pre-merged config used for the enabled check.
        **config_kwargs: Forwarded to ``get_config()`` when ``config`` is not supplied
            and filtering is requested.

    Returns:
        De-duplicated hook paths sorted by filename, with the full path as a
        tie-break so equal filenames from different directories come back in a
        stable order. Returns ``[]`` when the event name is empty or normalizes
        to ``BinaryRequest``.

    Examples:
        discover_hooks('Snapshot', config=config)
        discover_hooks('Snapshot')                         # uses get_config() defaults
        discover_hooks('Snapshot', filter_disabled=False)  # include disabled plugins
    """
    hook_event_name = normalize_hook_event_name(event_name)
    if not hook_event_name:
        return []
    # NOTE(review): BinaryRequest is explicitly excluded from discovery; the reason
    # is not visible in this function.
    if hook_event_name == "BinaryRequest":
        return []

    hooks: list[Path] = []
    for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
        if not base_dir.exists():
            continue

        for ext in ("sh", "py", "js"):
            # Hooks inside plugin subdirectories.
            hooks.extend(base_dir.glob(f"*/on_{hook_event_name}__*.{ext}"))
            # Hooks placed directly in the plugins directory.
            hooks.extend(base_dir.glob(f"on_{hook_event_name}__*.{ext}"))

    if filter_disabled:
        # Lazy import avoids a circular dependency with the config package.
        if config is None:
            from archivebox.config.common import get_config

            config = get_config(**config_kwargs)

        # Resolve the roots once instead of once per hook.
        plugin_roots = {BUILTIN_PLUGINS_DIR.resolve(), USER_PLUGINS_DIR.resolve()}

        enabled_hooks: list[Path] = []
        for hook in hooks:
            if hook.parent.resolve() in plugin_roots:
                # Not inside a plugin subdir, so there is no plugin to check: keep it.
                enabled_hooks.append(hook)
                continue

            # e.g. abx_plugins/plugins/wget/on_Snapshot__50_wget.py -> 'wget'
            plugin_name = hook.parent.name
            if get_plugin_special_config(plugin_name, config)["enabled"]:
                enabled_hooks.append(hook)

        hooks = enabled_hooks

    # Sort by filename (not full path) so numeric prefixes drive the order. The
    # path tie-break keeps same-named hooks stable despite set iteration order.
    return sorted(set(hooks), key=lambda p: (p.name, str(p)))
Mapping[str, Any] | None = None, + timeout: int | None = None, + parent: Optional["Process"] = None, + **kwargs: Any, +) -> "Process": + """ + Execute a hook script with the given arguments using Process model. + + This is the low-level hook executor that creates a Process record and + uses Process.launch() for subprocess management. + + Config is passed to hooks via environment variables. Crawl/snapshot callers + should pass the runtime config produced by for_crawl_runtime(). + + Args: + script: Path to the hook script (.sh, .py, or .js) + output_dir: Working directory for the script (where output files go) + config: Optional runtime config dict from for_crawl_runtime(). + If omitted, pass scope/override args using kwargs prefixed with config_. + timeout: Maximum execution time in seconds + If None, auto-detects from PLUGINNAME_TIMEOUT config (fallback to TIMEOUT, default 300) + parent: Optional parent Process (for tracking worker->hook hierarchy) + **kwargs: Arguments passed to the script as --key=value + + Returns: + Process model instance (use process.exit_code, process.stdout, process.get_records()) + + Example: + from archivebox.config.common import get_config + config = get_config(crawl=my_crawl, snapshot=my_snapshot).for_crawl_runtime(crawl=my_crawl, snapshot=my_snapshot) + process = run_hook(hook_path, output_dir, config=config, url=url, snapshot_id=id) + if process.status == 'exited': + records = process.get_records() # Get parsed JSONL output + """ + from archivebox.machine.models import Process, Machine, NetworkInterface + from archivebox.config.common import get_config, normalize_runtime_config + + config_scope = {key.removeprefix("config_"): kwargs.pop(key) for key in list(kwargs) if key.startswith("config_")} + config_overrides = _config_to_overrides(config) + resolved_config = get_config(overrides=config_overrides, **config_scope) + hook_config = normalize_runtime_config( + config_overrides if config is not None else resolved_config.for_crawl(), + 
json_safe=False, + ) + + # Auto-detect timeout from plugin config if not explicitly provided + if timeout is None: + plugin_name = script.parent.name + plugin_config = get_plugin_special_config(plugin_name, resolved_config) + timeout = plugin_config["timeout"] + if timeout: + timeout = min(int(timeout), int(CONSTANTS.MAX_HOOK_RUNTIME_SECONDS)) + + # Get current machine + machine = Machine.current() + iface = NetworkInterface.current(refresh=True) + machine = iface.machine + + # Auto-detect parent process if not explicitly provided + # This enables automatic hierarchy tracking: Worker -> Hook + if parent is None: + try: + parent = Process.current() + except Exception: + # If Process.current() fails (e.g., not in a worker context), leave parent as None + pass + + if not script.exists(): + # Create a failed Process record for hooks that don't exist + process = Process.objects.create( + machine=machine, + iface=iface, + parent=parent, + process_type=Process.TypeChoices.HOOK, + pwd=str(output_dir), + cmd=["echo", f"Hook script not found: {script}"], + timeout=timeout, + status=Process.StatusChoices.EXITED, + exit_code=1, + stderr=f"Hook script not found: {script}", + ) + return process + + # Python hooks carry their runtime contract in the shebang + # (usually `abxpkg run --script python3`), so execute them directly. + # For shell/JS hooks we still dispatch through the conventional + # interpreter because those hooks do not need per-script Python env setup. 
+ ext = script.suffix.lower() + if ext == ".sh": + cmd = ["bash", str(script)] + elif ext == ".py": + cmd = [str(script)] + elif ext == ".js": + cmd = ["node", str(script)] + else: + # Try to execute directly (assumes shebang) + cmd = [str(script)] + + # Build CLI arguments from kwargs + for key, value in kwargs.items(): + # Skip keys that start with underscore (internal parameters) + if key.startswith("_"): + continue + + arg_key = f"--{key.replace('_', '-')}" + if isinstance(value, bool): + if value: + cmd.append(arg_key) + elif value is not None and value != "": + # JSON-encode complex values, use str for simple ones + # Skip empty strings to avoid --key= which breaks argument parsers + if isinstance(value, (dict, list)): + cmd.append(f"{arg_key}={json.dumps(value)}") + else: + # Ensure value is converted to string and strip whitespace + str_value = str(value).strip() + if str_value: # Only add if non-empty after stripping + cmd.append(f"{arg_key}={str_value}") + + # Set up environment with base paths + env = os.environ.copy() + env["DATA_DIR"] = str(CONSTANTS.DATA_DIR) + env["LIBRARY_VERSION"] = VERSION + env.setdefault("MACHINE_ID", os.environ.get("MACHINE_ID", CONSTANTS.MACHINE_ID)) + snap_dir = hook_config.get("SNAP_DIR") or _model_output_dir_from_child_path(output_dir, CONSTANTS.SNAPSHOTS_DIR_NAME) + crawl_dir = hook_config.get("CRAWL_DIR") or _model_output_dir_from_child_path(output_dir, CONSTANTS.CRAWLS_DIR_NAME) + if snap_dir: + env["SNAP_DIR"] = str(snap_dir) + if crawl_dir: + env["CRAWL_DIR"] = str(crawl_dir) + + # Export runtime library roots; abx-dl/abxpkg own executable lookup env. + lib_dir = hook_config.get("LIB_DIR") + if lib_dir: + env["LIB_DIR"] = str(lib_dir) + env["ABXPKG_LIB_DIR"] = str(lib_dir) + + # Set Node.js module resolution paths. + # NODE_PATH may be a path list, but NODE_MODULES_DIR is a single canonical directory. 
+ node_modules_dir = hook_config.get("NODE_MODULES_DIR") + if not node_modules_dir and lib_dir: + node_modules_dir = Path(lib_dir) / "pnpm" / "packages" / "chrome" / "node_modules" + + node_path_parts = [part for part in str(hook_config.get("NODE_PATH") or "").split(os.pathsep) if part] + if node_modules_dir: + node_modules_dir = Path(node_modules_dir) + node_modules_dir.mkdir(parents=True, exist_ok=True) + node_modules_dir_str = str(node_modules_dir) + env["NODE_MODULES_DIR"] = node_modules_dir_str + env["NODE_MODULE_DIR"] = node_modules_dir_str + if node_modules_dir_str not in node_path_parts: + node_path_parts.append(node_modules_dir_str) + if node_path_parts: + env["NODE_PATH"] = os.pathsep.join(node_path_parts) + + # Export all config values to environment (already merged by get_config()) + # Skip keys we've already handled specially above (PATH, LIB_DIR, NODE_PATH, etc.) + SKIP_KEYS = { + "PATH", + "LIB_DIR", + "ABXPKG_LIB_DIR", + "NODE_PATH", + "NODE_MODULES_DIR", + "NODE_MODULE_DIR", + "DATA_DIR", + "MACHINE_ID", + "SNAP_DIR", + "CRAWL_DIR", + } + for key, value in hook_config.items(): + if key in SKIP_KEYS: + continue # Already handled specially above, don't overwrite + if value is None: + continue + elif isinstance(value, bool): + env[key] = "true" if value else "false" + elif isinstance(value, (list, dict)): + env[key] = json.dumps(value) + else: + env[key] = str(value) + + # Create output directory if needed + output_dir.mkdir(parents=True, exist_ok=True) + + # Detect if this is a background hook. + # Background hooks use the .bg. filename marker. + # Old convention: __background in stem (for backwards compatibility) + is_background = ".bg." 
def extract_records_from_process(process: "Process") -> list[dict[str, Any]]:
    """
    Extract JSONL records from a Process and tag them with plugin metadata.

    Each record gets ``plugin``, ``hook_name`` and ``plugin_hook`` via
    ``setdefault``, so values already present in a record are kept.

    Args:
        process: Object exposing ``get_records()``, ``pwd`` and ``cmd``.

    Returns:
        The records returned by ``process.get_records()`` (mutated in place),
        or ``[]`` when there are none.
    """
    records = process.get_records()
    if not records:
        return []

    # The plugin name is the last component of the process working directory.
    plugin_name = Path(process.pwd).name if process.pwd else "unknown"

    # The hook script is not always cmd[1]: hooks run through an interpreter look
    # like ["bash", script, ...] while directly executed hooks look like
    # [script, "--key=value", ...]. Prefer whichever of the first two entries is
    # named like a hook ("on_*"), and fall back to cmd[1] as before.
    cmd = [str(part) for part in (process.cmd or [])]
    plugin_hook = next(
        (part for part in cmd[:2] if Path(part).name.startswith("on_")),
        cmd[1] if len(cmd) > 1 else "",
    )
    hook_name = Path(plugin_hook).name if plugin_hook else "unknown"

    for record in records:
        record.setdefault("plugin", plugin_name)
        record.setdefault("hook_name", hook_name)
        record.setdefault("plugin_hook", plugin_hook)

    return records
+ """ + urls = [] + + # Look in each immediate subdirectory for urls.jsonl + if not snapshot_dir.exists(): + return urls + + for subdir in snapshot_dir.iterdir(): + if not subdir.is_dir(): + continue + + urls_file = subdir / "urls.jsonl" + if not urls_file.exists(): + continue + + try: + from archivebox.machine.models import Process + + text = urls_file.read_text() + for entry in Process.parse_records_from_text(text): + if entry.get("url"): + entry["url"] = sanitize_extracted_url(fix_url_from_markdown(str(entry["url"]).strip())) + if not entry["url"]: + continue + # Track which parser plugin found this URL + entry["plugin"] = subdir.name + urls.append(entry) + except Exception: + pass + + return urls + + +# ============================================================================= +# Hook Result Processing Helpers +# ============================================================================= + + +def process_hook_records(records: list[dict[str, Any]], overrides: dict[str, Any] | None = None) -> dict[str, int]: + """ + Process JSONL records emitted by hook stdout. + + This handles hook-emitted record types such as Snapshot, Tag, and Binary. + It does not process internal bus lifecycle events, since those + are not emitted as JSONL records by hook subprocesses. + + Args: + records: List of JSONL record dicts from result['records'] + overrides: Dict with 'snapshot', 'crawl', 'dependency', 'created_by_id', etc. 
+ + Returns: + Dict with counts by record type + """ + stats = {} + overrides = overrides or {} + + for record in records: + record_type = record.get("type") + if not record_type: + continue + + # Skip ArchiveResult records (they update the calling ArchiveResult, not create new ones) + if record_type == "ArchiveResult": + continue + + try: + # Dispatch to appropriate model's from_json() method + if record_type == "Snapshot": + from archivebox.core.models import Snapshot + + if record.get("url"): + record = { + **record, + "url": sanitize_extracted_url(fix_url_from_markdown(str(record["url"]).strip())), + } + if not record["url"]: + continue + + # Check if discovered snapshot exceeds crawl max_depth + snapshot_depth = record.get("depth", 0) + crawl = overrides.get("crawl") + if crawl and snapshot_depth > crawl.max_depth: + # Skip - this URL was discovered but exceeds max crawl depth + continue + + obj = Snapshot.from_json(record.copy(), overrides) + if obj: + stats["Snapshot"] = stats.get("Snapshot", 0) + 1 + + elif record_type == "Tag": + from archivebox.core.models import Tag + + obj = Tag.from_json(record.copy(), overrides) + if obj: + stats["Tag"] = stats.get("Tag", 0) + 1 + + elif record_type == "Binary": + from archivebox.machine.models import Binary + + obj = Binary.from_json(record.copy(), overrides) + if obj: + stats[record_type] = stats.get(record_type, 0) + 1 + + else: + import sys + + print(f"Warning: Unknown record type '{record_type}' from hook output", file=sys.stderr) + + except Exception as e: + import sys + + print(f"Warning: Failed to create {record_type}: {e}", file=sys.stderr) + continue + + return stats diff --git a/archivebox/plugins/views.py b/archivebox/plugins/views.py new file mode 100644 index 0000000000..e23a89c402 --- /dev/null +++ b/archivebox/plugins/views.py @@ -0,0 +1,454 @@ +__package__ = "archivebox.plugins" + +import html +import json +import re +from typing import Any +from collections.abc import Callable +from urllib.parse 
import quote + +from django.http import HttpRequest +from django.utils.html import format_html +from django.utils.safestring import mark_safe + +from admin_data_views.typing import ItemContext, SectionData, TableContext +from admin_data_views.utils import ItemLink, render_with_item_view, render_with_table_view + +from archivebox.config.common import get_live_config_url +from archivebox.config.views import get_environment_binary_url, is_superuser +from archivebox.plugins.discovery import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, discover_plugin_configs, iter_plugin_dirs + + +ABX_PLUGINS_DOCS_BASE_URL = "https://archivebox.github.io/abx-plugins/" +ABX_PLUGINS_GITHUB_BASE_URL = "https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/" +LIVE_PLUGIN_BASE_URL = "/admin/environment/plugins/" + + +JSON_TOKEN_RE = re.compile( + r'(?P"(?:\\u[a-fA-F0-9]{4}|\\[^u]|[^\\"])*")(?=\s*:)' + r'|(?P"(?:\\u[a-fA-F0-9]{4}|\\[^u]|[^\\"])*")' + r"|(?P\btrue\b|\bfalse\b)" + r"|(?P\bnull\b)" + r"|(?P-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)", +) + + +def render_code_block(text: str, *, highlighted: bool = False) -> str: + code = html.escape(text, quote=False) + + if highlighted: + + def _wrap_token(match: re.Match[str]) -> str: + styles = { + "key": "color: #0550ae;", + "string": "color: #0a7f45;", + "boolean": "color: #8250df; font-weight: 600;", + "null": "color: #6e7781; font-style: italic;", + "number": "color: #b35900;", + } + token_type = next(name for name, value in match.groupdict().items() if value is not None) + return f'{match.group(0)}' + + code = JSON_TOKEN_RE.sub(_wrap_token, code) + + return ( + '
    '
    +        '"
    +        f"{code}"
    +        "
    " + ) + + +def render_highlighted_json_block(value: Any) -> str: + return render_code_block(json.dumps(value, indent=2, ensure_ascii=False), highlighted=True) + + +def get_plugin_docs_url(plugin_name: str) -> str: + return f"{ABX_PLUGINS_DOCS_BASE_URL}#{plugin_name}" + + +def get_plugin_hook_source_url(plugin_name: str, hook_name: str) -> str: + return f"{ABX_PLUGINS_GITHUB_BASE_URL}{quote(plugin_name)}/{quote(hook_name)}" + + +def get_machine_admin_url() -> str | None: + from archivebox.machine.models import Machine + + return Machine.current().admin_change_url + + +def render_code_tag_list(values: list[str]) -> str: + if not values: + return '(none)' + + tags = "".join( + str( + format_html( + '{}', + value, + ), + ) + for value in values + ) + return f'
    {tags}
    ' + + +def render_link_tag_list(values: list[str], url_resolver: Callable[[str], str] | None = None) -> str: + if not values: + return '(none)' + + tags = [] + for value in values: + if url_resolver is None: + tags.append( + str( + format_html( + '{}', + value, + ), + ), + ) + else: + tags.append( + str( + format_html( + '' + '{}' + "", + url_resolver(value), + value, + ), + ), + ) + return f'
    {"".join(tags)}
    ' + + +def render_plugin_metadata_html(config: dict[str, Any]) -> str: + required_binaries = [ + str(item.get("name")) for item in (config.get("required_binaries") or []) if isinstance(item, dict) and item.get("name") + ] + rows = ( + ("Title", config.get("title") or "(none)"), + ("Description", config.get("description") or "(none)"), + ("Required Plugins", mark_safe(render_link_tag_list(config.get("required_plugins") or [], get_plugin_docs_url))), + ("Required Binaries", mark_safe(render_link_tag_list(required_binaries, get_environment_binary_url))), + ("Output MIME Types", mark_safe(render_code_tag_list(config.get("output_mimetypes") or []))), + ) + + rendered_rows = "".join( + str( + format_html( + '
    {}
    {}
    ', + label, + value, + ), + ) + for label, value in rows + ) + return f'
    {rendered_rows}
    ' + + +def render_property_links(prop_name: str, prop_info: dict[str, Any], machine_admin_url: str | None) -> str: + links = [ + str(format_html('Computed value', get_live_config_url(prop_name))), + ] + if machine_admin_url: + links.append(str(format_html('Edit override', machine_admin_url))) + + fallback = prop_info.get("x-fallback") + if isinstance(fallback, str) and fallback: + links.append(str(format_html('Fallback: {}', get_live_config_url(fallback), fallback))) + + aliases = prop_info.get("x-aliases") or [] + if isinstance(aliases, list): + for alias in aliases: + if isinstance(alias, str) and alias: + links.append(str(format_html('Alias: {}', get_live_config_url(alias), alias))) + + default = prop_info.get("default") + if prop_name.endswith("_BINARY") and isinstance(default, str) and default: + links.append(str(format_html('Binary: {}', get_environment_binary_url(default), default))) + + return "   ".join(links) + + +def render_config_properties_html(properties: dict[str, Any], machine_admin_url: str | None) -> str: + header_links = [ + str(format_html('Dependencies', "/admin/environment/binaries/")), + str(format_html('Installed Binaries', "/admin/machine/binary/")), + ] + if machine_admin_url: + header_links.insert(0, str(format_html('Machine Config Editor', machine_admin_url))) + + cards = [ + f'
    {"   |   ".join(header_links)}
    ', + ] + + for prop_name, prop_info in properties.items(): + prop_type = prop_info.get("type", "unknown") + if isinstance(prop_type, list): + prop_type = " | ".join(str(type_name) for type_name in prop_type) + prop_desc = prop_info.get("description", "") + + default_html = "" + if "default" in prop_info: + default_html = str( + format_html( + '
    Default: {}
    ', + prop_info["default"], + ), + ) + + description_html = prop_desc or mark_safe('(no description)') + cards.append( + str( + format_html( + '
    ' + '
    ' + '{}' + ' ({})' + "
    " + '
    {}
    ' + '
    {}
    ' + "{}" + "
    ", + get_live_config_url(prop_name), + prop_name, + prop_type, + description_html, + mark_safe(render_property_links(prop_name, prop_info, machine_admin_url)), + mark_safe(default_html), + ), + ), + ) + + return "".join(cards) + + +def render_hook_links_html(plugin_name: str, hooks: list[str], source: str) -> str: + if not hooks: + return '(none)' + + items = [] + for hook_name in hooks: + if source == "builtin": + items.append( + str( + format_html( + '', + get_plugin_hook_source_url(plugin_name, hook_name), + hook_name, + ), + ), + ) + else: + items.append( + str( + format_html( + '
    {}
    ', + hook_name, + ), + ), + ) + return "".join(items) + + +def get_filesystem_plugins() -> dict[str, dict[str, Any]]: + """Discover plugins from filesystem directories.""" + plugins = {} + + for base_dir, source in [(BUILTIN_PLUGINS_DIR, "builtin"), (USER_PLUGINS_DIR, "user")]: + if not base_dir.exists(): + continue + + for plugin_dir in base_dir.iterdir(): + if plugin_dir.is_dir() and not plugin_dir.name.startswith("_"): + plugin_id = f"{source}.{plugin_dir.name}" + + hooks = [] + for ext in ("sh", "py", "js"): + hooks.extend(plugin_dir.glob(f"on_*__*.{ext}")) + + config_file = plugin_dir / "config.json" + config_data = None + if config_file.exists(): + try: + with open(config_file) as f: + config_data = json.load(f) + except (json.JSONDecodeError, OSError): + config_data = None + + plugins[plugin_id] = { + "id": plugin_id, + "name": plugin_dir.name, + "path": str(plugin_dir), + "source": source, + "hooks": [str(h.name) for h in hooks], + "config": config_data, + } + + return plugins + + +def find_plugin_for_config_key(key: str) -> str | None: + for plugin_name, schema in discover_plugin_configs().items(): + if key in (schema.get("properties") or {}): + return plugin_name + return None + + +def get_config_definition_link(key: str) -> tuple[str, str]: + plugin_name = find_plugin_for_config_key(key) + if not plugin_name: + return ( + f"https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{quote(key)}&type=code", + "archivebox/config", + ) + + plugin_dir = next((path.resolve() for path in iter_plugin_dirs() if path.name == plugin_name), None) + if plugin_dir: + builtin_root = BUILTIN_PLUGINS_DIR.resolve() + if plugin_dir.is_relative_to(builtin_root): + return ( + f"{ABX_PLUGINS_GITHUB_BASE_URL}{quote(plugin_name)}/config.json", + f"abx_plugins/plugins/{plugin_name}/config.json", + ) + + user_root = USER_PLUGINS_DIR.resolve() + if plugin_dir.is_relative_to(user_root): + return ( + f"{LIVE_PLUGIN_BASE_URL}user.{quote(plugin_name)}/", + 
f"data/custom_plugins/{plugin_name}/config.json", + ) + + return ( + f"{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/", + f"abx_plugins/plugins/{plugin_name}/config.json", + ) + + +@render_with_table_view +def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext: + assert is_superuser(request), "Must be a superuser to view configuration settings." + + rows = { + "Name": [], + "Source": [], + "Path": [], + "Hooks": [], + "Config": [], + } + + plugins = get_filesystem_plugins() + + for plugin_id, plugin in plugins.items(): + rows["Name"].append(ItemLink(plugin["name"], key=plugin_id)) + rows["Source"].append(plugin["source"]) + rows["Path"].append(format_html("{}", plugin["path"])) + rows["Hooks"].append(", ".join(plugin["hooks"]) or "(none)") + + if plugin.get("config"): + config_properties = plugin["config"].get("properties", {}) + config_count = len(config_properties) + rows["Config"].append(f"โœ… {config_count} properties" if config_count > 0 else "โœ… present") + else: + rows["Config"].append("โŒ none") + + if not plugins: + rows["Name"].append("(no plugins found)") + rows["Source"].append("-") + rows["Path"].append(mark_safe("abx_plugins/plugins/ or data/custom_plugins/")) + rows["Hooks"].append("-") + rows["Config"].append("-") + + return TableContext( + title="Installed plugins", + table=rows, + ) + + +@render_with_item_view +def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: + assert is_superuser(request), "Must be a superuser to view configuration settings." 
+ + plugins = get_filesystem_plugins() + + plugin = plugins.get(key) + if not plugin: + return ItemContext( + slug=key, + title=f"Plugin not found: {key}", + data=[], + ) + + docs_url = get_plugin_docs_url(plugin["name"]) + machine_admin_url = get_machine_admin_url() + fields = { + "id": plugin["id"], + "name": plugin["name"], + "source": plugin["source"], + } + + sections: list[SectionData] = [ + { + "name": plugin["name"], + "description": format_html( + '{}
    ABX Plugin Docs', + plugin["path"], + docs_url, + ), + "fields": fields, + "help_texts": {}, + }, + ] + + if plugin["hooks"]: + sections.append( + { + "name": "Hooks", + "description": mark_safe(render_hook_links_html(plugin["name"], plugin["hooks"], plugin["source"])), + "fields": {}, + "help_texts": {}, + }, + ) + + if plugin.get("config"): + sections.append( + { + "name": "Plugin Metadata", + "description": mark_safe(render_plugin_metadata_html(plugin["config"])), + "fields": {}, + "help_texts": {}, + }, + ) + + sections.append( + { + "name": "config.json", + "description": mark_safe(render_highlighted_json_block(plugin["config"])), + "fields": {}, + "help_texts": {}, + }, + ) + + config_properties = plugin["config"].get("properties", {}) + if config_properties: + sections.append( + { + "name": "Config Properties", + "description": mark_safe(render_config_properties_html(config_properties, machine_admin_url)), + "fields": {}, + "help_texts": {}, + }, + ) + + return ItemContext( + slug=key, + title=plugin["name"], + data=sections, + ) diff --git a/archivebox/progressmonitor/__init__.py b/archivebox/progressmonitor/__init__.py new file mode 100644 index 0000000000..e3f7e93a39 --- /dev/null +++ b/archivebox/progressmonitor/__init__.py @@ -0,0 +1 @@ +__package__ = "archivebox.progressmonitor" diff --git a/archivebox/progressmonitor/apps.py b/archivebox/progressmonitor/apps.py new file mode 100644 index 0000000000..15fe1d5836 --- /dev/null +++ b/archivebox/progressmonitor/apps.py @@ -0,0 +1,8 @@ +__package__ = "archivebox.progressmonitor" + +from django.apps import AppConfig + + +class ProgressMonitorConfig(AppConfig): + name = "archivebox.progressmonitor" + label = "progressmonitor" diff --git a/archivebox/progressmonitor/templates/progressmonitor/progress_monitor.html b/archivebox/progressmonitor/templates/progressmonitor/progress_monitor.html new file mode 100644 index 0000000000..7ddbb6510e --- /dev/null +++ 
b/archivebox/progressmonitor/templates/progressmonitor/progress_monitor.html @@ -0,0 +1,1941 @@ + + +
    +
    +
    +
    + + Runner stopped + +
    +
    +
    + Crawls + 0 active · 0 queued +
    +
    + Snapshots + 0 active · 0 queued +
    +
    + Downloads + 0 active · 0 queued +
    +
    + Indexing + 0 active · 0 queued +
    +
    +
    +
    + +
    +
    + +
    +
    +
    No active crawls
    +
    +
    + +
    +
    + + diff --git a/archivebox/progressmonitor/views.py b/archivebox/progressmonitor/views.py new file mode 100644 index 0000000000..53ea71109c --- /dev/null +++ b/archivebox/progressmonitor/views.py @@ -0,0 +1,961 @@ +__package__ = "archivebox.progressmonitor" + +from functools import lru_cache +from pathlib import Path +from typing import Literal + +from django.db.models import CharField, Count, Q, Sum +from django.db.models.functions import Cast +from django.http import HttpResponse, JsonResponse +from django.utils import timezone + +from abx_dl.events import PROCESS_EXIT_SKIPPED + +from archivebox.config import CONSTANTS +from archivebox.config.common import get_config +from archivebox.core.routes_util import build_snapshot_url, build_web_url, get_api_base_url +from archivebox.core.permissions import can_view_snapshot, is_admin_user +from archivebox.plugins.discovery import discover_plugin_configs +from archivebox.misc.logging_util import printable_filesize + + +def progress_endpoint(scope: Literal["crawl", "snapshot"] | None = None, object_id: object | None = None) -> str: + """Return the canonical same-origin progress endpoint for monitor embeds.""" + if not scope or object_id is None: + return "/progress.json" + return f"/progress.json?{scope}_id={str(object_id).replace('-', '')}" + + +@lru_cache(maxsize=1) +def _live_progress_plugin_names() -> tuple[frozenset[str], frozenset[str]]: + plugin_configs = discover_plugin_configs() + download_plugin_names = frozenset( + plugin_name + for plugin_name, plugin_config in plugin_configs.items() + if plugin_config.get("output_mimetypes") and not plugin_name.startswith("search_backend_") + ) + indexing_plugin_names = frozenset(plugin_name for plugin_name in plugin_configs if plugin_name.startswith("search_backend_")) + return download_plugin_names, indexing_plugin_names + + +def live_progress_view(request): + """Simple JSON endpoint for live progress status - used by admin progress monitor.""" + try: + from 
archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot, ArchiveResult + from archivebox.machine.models import Process, Machine + + snapshot_id_filter = (request.GET.get("snapshot_id") or "").strip().replace("-", "") + crawl_id_filter = (request.GET.get("crawl_id") or "").strip().replace("-", "") + is_admin = is_admin_user(request) + + scoped_snapshot = None + if snapshot_id_filter: + import uuid as _uuid + + try: + _uuid.UUID(snapshot_id_filter) + except (TypeError, ValueError): + return JsonResponse({"error": "Invalid snapshot_id"}, status=400) + scoped_snapshot = Snapshot.objects.filter(id=snapshot_id_filter).select_related("crawl").first() + if scoped_snapshot is None or not can_view_snapshot(request, scoped_snapshot): + return JsonResponse({"error": "Permission denied"}, status=403) + elif crawl_id_filter: + # Crawl-only scope still requires staff: there's no per-crawl ACL helper, + # and a crawl can mix snapshot permissions levels. + if not is_admin: + return JsonResponse({"error": "Permission denied"}, status=403) + else: + if not is_admin: + return JsonResponse({"error": "Permission denied"}, status=403) + + request_config = request.archivebox_config + now = timezone.now() + crawl_scope = Crawl.objects.all() + snapshot_scope = Snapshot.objects.all() + archiveresult_scope = ArchiveResult.objects.all() + if is_admin and not request.user.is_superuser: + crawl_scope = crawl_scope.filter(created_by=request.user) + snapshot_scope = snapshot_scope.filter(crawl__created_by=request.user) + archiveresult_scope = archiveresult_scope.filter(snapshot__crawl__created_by=request.user) + if scoped_snapshot is not None: + snapshot_scope = Snapshot.objects.filter(id=scoped_snapshot.id) + crawl_scope = Crawl.objects.filter(id=scoped_snapshot.crawl_id) + archiveresult_scope = ArchiveResult.objects.filter(snapshot_id=scoped_snapshot.id) + elif crawl_id_filter: + snapshot_scope = snapshot_scope.filter(crawl_id=crawl_id_filter) + crawl_scope = 
crawl_scope.filter(id=crawl_id_filter) + archiveresult_scope = archiveresult_scope.filter(snapshot__crawl_id=crawl_id_filter) + + def is_current_run_timestamp(event_ts, run_started_at) -> bool: + if run_started_at is None: + return True + if event_ts is None: + return False + return event_ts >= run_started_at + + def archiveresult_matches_current_run(ar, run_started_at) -> bool: + if run_started_at is None: + return True + if ar.status in ( + ArchiveResult.StatusChoices.QUEUED, + ArchiveResult.StatusChoices.STARTED, + ArchiveResult.StatusChoices.BACKOFF, + ): + return True + event_ts = ar.end_ts or ar.start_ts or ar.modified_at or ar.created_at + return is_current_run_timestamp(event_ts, run_started_at) + + def hook_details(hook_name: str, plugin: str = "setup") -> tuple[str, str, str, str]: + normalized_hook_name = Path(hook_name).name if hook_name else "" + if not normalized_hook_name: + return (plugin, plugin, "unknown", "") + + phase = "unknown" + if normalized_hook_name == "InstallEvent": + phase = "install" + elif normalized_hook_name.startswith("on_CrawlSetup__"): + phase = "crawl" + elif normalized_hook_name.startswith("on_Snapshot__"): + phase = "snapshot" + + label = normalized_hook_name + if "__" in normalized_hook_name: + label = normalized_hook_name.split("__", 1)[1] + label = label.rsplit(".", 1)[0] + if len(label) > 3 and label[:2].isdigit() and label[2] == "_": + label = label[3:] + label = label.replace("_", " ").strip() or plugin + + return (plugin, label, phase, normalized_hook_name) + + def process_label(cmd: list[str] | None) -> tuple[str, str, str, str]: + hook_path = "" + if isinstance(cmd, list) and cmd: + first = cmd[0] + if isinstance(first, str): + hook_path = first + + if not hook_path: + return ("", "setup", "unknown", "") + + return hook_details(Path(hook_path).name, plugin=Path(hook_path).parent.name or "setup") + + def archiveresult_output_path(ar) -> str | None: + output_file_map = ar.output_files if isinstance(ar.output_files, 
dict) else {} + + def is_root_relative(path: str) -> bool: + metadata = output_file_map.get(path) or {} + return bool(isinstance(metadata, dict) and metadata.get("root_relative")) + + if ar.output_str: + raw_output = str(ar.output_str).strip() + if ar._looks_like_output_path(raw_output, ar.plugin): + output_path = Path(raw_output) + if output_path.is_absolute(): + return None + + if raw_output.startswith(f"{ar.plugin}/"): + candidates = [raw_output] + elif len(output_path.parts) == 1: + candidates = [f"{ar.plugin}/{raw_output}", raw_output] + else: + candidates = [raw_output] + + if raw_output in output_file_map and is_root_relative(raw_output): + return raw_output + + for relative_path in candidates: + plugin_relative = relative_path.removeprefix(f"{ar.plugin}/") + if relative_path in output_file_map: + return f"{ar.plugin}/{relative_path}" if not relative_path.startswith(f"{ar.plugin}/") else relative_path + if plugin_relative in output_file_map: + return f"{ar.plugin}/{plugin_relative}" + + output_file_paths = list(output_file_map.keys()) + if output_file_paths: + fallback_path = ArchiveResult._fallback_output_file_path(output_file_paths, ar.plugin, output_file_map) + if fallback_path: + if is_root_relative(fallback_path): + return fallback_path + return f"{ar.plugin}/{fallback_path}" + + return None + + def snapshot_output_url(snapshot, output_path: str) -> str: + return build_snapshot_url(str(snapshot["id"]), output_path, request=request, config=request_config) + + def snapshot_archive_path(snapshot) -> str: + if snapshot["fs_version"] in ("0.7.0", "0.8.0"): + return f"{CONSTANTS.ARCHIVE_DIR_NAME}/{snapshot['timestamp']}" + crawl = crawls_by_id.get(str(snapshot["crawl_id"])) + username = "web" + if crawl is not None and crawl["created_by_id"]: + username = crawl["created_by__username"] + if username == "system": + username = "web" + date_base = snapshot["bookmarked_at"] or snapshot["created_at"] + date_str = date_base.strftime("%Y%m%d") if date_base else 
"unknown" + domain = Snapshot.extract_domain_from_url(snapshot["url"]) + return f"{username}/{date_str}/{domain}/{snapshot['id']}" + + def snapshot_view_url(snapshot, output_path: str = "") -> str: + anchor = f"#{output_path}" if output_path else "" + return build_web_url( + f"/{snapshot_archive_path(snapshot)}/index.html{anchor}", + request=request, + config=request_config, + ) + + def snapshot_display_url(url: str) -> str: + url = str(url or "") + return url if len(url) <= 96 else f"{url[:93]}..." + + api_base = get_api_base_url(request=request, config=request_config) if scoped_snapshot is not None else "" + + def screencast_frame_url(crawl_id: str, crawl_dir: Path) -> str: + frame_path = crawl_dir / "chrome_screencast" / "latest.jpg" + try: + frame_stat = frame_path.stat() + except OSError: + return "" + if frame_stat.st_size <= 0: + return "" + if now.timestamp() - frame_stat.st_mtime > 15: + return "" + rel = f"/api/v1/crawls/crawl/{crawl_id}/files/chrome_screencast/latest.jpg?v={frame_stat.st_mtime_ns}" + return f"{api_base}{rel}" if api_base else rel + + machine_id = Machine.current().id + orchestrator_proc = ( + Process.objects.filter( + machine_id=machine_id, + process_type=Process.TypeChoices.ORCHESTRATOR, + status=Process.StatusChoices.RUNNING, + ) + .only("id", "pid", "started_at", "machine_id", "process_type", "status") + .order_by("-started_at") + .first() + if machine_id is not None + else None + ) + runner_worker = None + orchestrator_proc_running = bool(orchestrator_proc and orchestrator_proc.is_running) + if not orchestrator_proc_running: + try: + from archivebox.workers.supervisord_util import get_existing_supervisord_process, get_worker + + supervisor = get_existing_supervisord_process(quiet=True) + runner_worker = get_worker(supervisor, "worker_runner") if supervisor else None + except Exception: + runner_worker = None + + runner_worker_running = bool(runner_worker and runner_worker.get("statename") in ("STARTING", "RUNNING")) + 
runner_worker_pid = runner_worker.get("pid") if runner_worker else None + orchestrator_running = orchestrator_proc_running or runner_worker_running + orchestrator_pid = orchestrator_proc.pid if orchestrator_proc_running and orchestrator_proc else runner_worker_pid + + # Get model counts by status + crawl_status_counts = Crawl.status_counts( + crawl_scope, + (Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED, Crawl.StatusChoices.PAUSED), + ) + crawls_queued = crawl_status_counts.get(Crawl.StatusChoices.QUEUED, 0) + crawls_active = crawl_status_counts.get(Crawl.StatusChoices.STARTED, 0) + + # Get recent crawls (last 24 hours) + from datetime import timedelta + + one_day_ago = now - timedelta(days=1) + paused_crawl_cutoff = now - timedelta(hours=12) + crawls_recent = crawl_scope.filter(created_at__gte=one_day_ago).count() + + snapshot_status_counts = Snapshot.status_counts( + snapshot_scope, + Snapshot.OPEN_STATES, + ) + snapshots_queued = snapshot_status_counts.get(Snapshot.StatusChoices.QUEUED, 0) + snapshots_active = snapshot_status_counts.get(Snapshot.StatusChoices.STARTED, 0) + + download_plugin_names, indexing_plugin_names = _live_progress_plugin_names() + result_statuses = ( + ArchiveResult.StatusChoices.QUEUED, + ArchiveResult.StatusChoices.STARTED, + ) + archiveresult_status_counts = ArchiveResult.status_counts(archiveresult_scope, result_statuses) + download_scope = archiveresult_scope.filter( + plugin__in=download_plugin_names, + snapshot__status__in=Snapshot.RUNNABLE_STATES, + snapshot__crawl__status__in=Crawl.RUNNABLE_STATES, + ) + indexing_scope = archiveresult_scope.filter(plugin__in=indexing_plugin_names) + download_status_counts = ArchiveResult.status_counts(download_scope, result_statuses) + indexing_status_counts = ArchiveResult.status_counts(indexing_scope, result_statuses) + archiveresults_queued = archiveresult_status_counts.get(ArchiveResult.StatusChoices.QUEUED, 0) + archiveresults_active = 
archiveresult_status_counts.get(ArchiveResult.StatusChoices.STARTED, 0) + + downloads_queued = download_status_counts.get(ArchiveResult.StatusChoices.QUEUED, 0) + downloads_active = download_status_counts.get(ArchiveResult.StatusChoices.STARTED, 0) + indexing_queued = indexing_status_counts.get(ArchiveResult.StatusChoices.QUEUED, 0) + indexing_active = indexing_status_counts.get(ArchiveResult.StatusChoices.STARTED, 0) + + # Build hierarchical active crawls with nested snapshots and archive results + max_active_crawls = 10 + max_queued_crawls = 10 + max_started_snapshots_per_crawl = 50 + max_queued_snapshots_per_crawl = 50 + + active_crawl_fields = ( + "id", + "created_at", + "created_by_id", + "modified_at", + "urls", + "config", + "max_depth", + "tags_str", + "persona_id", + "status", + "retry_at", + "label", + "created_by__id", + "created_by__username", + ) + started_crawls = list( + crawl_scope.filter(status=Crawl.StatusChoices.STARTED) + .values(*active_crawl_fields) + .order_by("-modified_at")[:max_active_crawls], + ) + paused_crawls = list( + crawl_scope.filter( + Q(status=Crawl.StatusChoices.PAUSED, created_at__gte=paused_crawl_cutoff) + | Q( + status=Crawl.StatusChoices.PAUSED, + snapshot_set__status__in=Snapshot.RUNNABLE_STATES, + snapshot_set__retry_at__lte=now, + ) + | Q( + status=Crawl.StatusChoices.PAUSED, + snapshot_set__archiveresult__status=ArchiveResult.StatusChoices.QUEUED, + ), + ) + .values(*active_crawl_fields) + .distinct() + .order_by("-modified_at")[:max_active_crawls], + ) + queued_crawls = list( + crawl_scope.filter(status=Crawl.StatusChoices.QUEUED).values(*active_crawl_fields).order_by("-modified_at")[:max_queued_crawls], + ) + queued_crawls_hidden = max(crawls_queued - len(queued_crawls), 0) + active_crawls_list = started_crawls + paused_crawls + queued_crawls + for crawl in active_crawls_list: + crawl["id"] = str(crawl["id"]) + if crawl["persona_id"]: + crawl["persona_id"] = str(crawl["persona_id"]) + persona_details_by_id: dict[str, 
dict[str, str]] = {} + persona_details_by_name: dict[str, dict[str, str]] = {} + persona_objects_by_id = {} + persona_objects_by_name = {} + persona_ids = {crawl["persona_id"] for crawl in active_crawls_list if crawl["persona_id"]} + persona_names = {"Default"} if any(not crawl["persona_id"] for crawl in active_crawls_list) else set() + if persona_ids or persona_names: + from archivebox.personas.models import Persona + + for persona in Persona.objects.filter(Q(id__in=persona_ids) | Q(name__in=persona_names)).only("id", "name", "config"): + persona_details = { + "name": persona.name, + "admin_url": f"/admin/personas/persona/{persona.pk}/change/", + } + persona_details_by_id[str(persona.id)] = persona_details + persona_details_by_name[persona.name] = persona_details + persona_objects_by_id[str(persona.id)] = persona + persona_objects_by_name[persona.name] = persona + active_crawl_ids = [crawl["id"] for crawl in active_crawls_list] + active_crawl_objects = {} + if active_crawl_ids: + for crawl_obj in Crawl.objects.filter(id__in=active_crawl_ids).select_related("created_by", "persona"): + crawl_obj._runtime_config = request_config + active_crawl_objects[str(crawl_obj.id)] = crawl_obj + snapshot_counts_by_crawl: dict[str, dict[str, int]] = {str(crawl_id): {} for crawl_id in active_crawl_ids} + cancelled_snapshot_counts_by_crawl: dict[str, int] = {str(crawl_id): 0 for crawl_id in active_crawl_ids} + crawl_output_sizes_by_crawl: dict[str, int] = {str(crawl_id): 0 for crawl_id in active_crawl_ids} + queued_snapshot_overflow_by_crawl: dict[str, int] = {str(crawl_id): 0 for crawl_id in active_crawl_ids} + active_snapshot_scope = snapshot_scope.filter(crawl_id__in=active_crawl_ids) + if active_crawl_ids: + for row in active_snapshot_scope.values("crawl_id", "status").annotate(count=Count("id")): + snapshot_counts_by_crawl.setdefault(str(row["crawl_id"]), {})[row["status"]] = row["count"] + + for row in ( + active_snapshot_scope.filter(status=Snapshot.StatusChoices.SEALED, 
downloaded_at__isnull=True) + .values("crawl_id") + .annotate(count=Count("id")) + ): + cancelled_snapshot_counts_by_crawl[str(row["crawl_id"])] = row["count"] + + for row in ( + active_snapshot_scope.filter( + status=Snapshot.StatusChoices.SEALED, + ) + .values("crawl_id") + .annotate(size=Sum("output_size")) + ): + crawl_output_sizes_by_crawl[str(row["crawl_id"])] = int(row["size"] or 0) + + crawl_process_pids: dict[str, int] = {} + snapshot_process_pids: dict[str, int] = {} + process_records_by_crawl: dict[str, list[tuple[dict[str, object], object | None]]] = {} + process_records_by_snapshot: dict[str, list[tuple[dict[str, object], object | None]]] = {} + seen_process_records: set[str] = set() + crawls_by_id = {str(crawl["id"]): crawl for crawl in active_crawls_list} + started_snapshot_fields = ( + "id_str", + "created_at", + "modified_at", + "url", + "timestamp", + "bookmarked_at", + "crawl_id_str", + "title", + "downloaded_at", + "fs_version", + "status", + ) + queued_snapshot_fields = ( + "id_str", + "url", + "crawl_id_str", + "title", + "status", + ) + snapshots = [] + for crawl_id in active_crawl_ids: + crawl_snapshot_scope = active_snapshot_scope.filter(crawl_id=crawl_id) + snapshots.extend( + crawl_snapshot_scope.filter(status=Snapshot.StatusChoices.STARTED) + .annotate(id_str=Cast("id", CharField()), crawl_id_str=Cast("crawl_id", CharField())) + .values(*started_snapshot_fields) + .order_by("-modified_at")[:max_started_snapshots_per_crawl], + ) + queued_snapshots = list( + crawl_snapshot_scope.filter(status=Snapshot.StatusChoices.QUEUED) + .annotate(id_str=Cast("id", CharField()), crawl_id_str=Cast("crawl_id", CharField())) + .values( + *queued_snapshot_fields, + ) + .order_by("modified_at")[:max_queued_snapshots_per_crawl], + ) + queued_snapshot_overflow_by_crawl[str(crawl_id)] = max( + snapshot_counts_by_crawl.get(str(crawl_id), {}).get(Snapshot.StatusChoices.QUEUED, 0) - len(queued_snapshots), + 0, + ) + snapshots.extend(queued_snapshots) + + for 
snapshot in snapshots: + # Process.pwd points at Snapshot.output_dir, which uses CompactUUID + # hex path components. Keep progress IDs compact too so process rows + # can be matched without carrying dashed/undashed variants. + snapshot["id"] = str(snapshot.pop("id_str")).replace("-", "") + snapshot["crawl_id"] = str(snapshot.pop("crawl_id_str")).replace("-", "") + snapshots_by_id = {str(snapshot["id"]): snapshot for snapshot in snapshots} + displayed_snapshots_by_crawl: dict[str, list[Snapshot]] = {str(crawl_id): [] for crawl_id in active_crawl_ids} + for snapshot in snapshots: + crawl_snapshots = displayed_snapshots_by_crawl.setdefault(str(snapshot["crawl_id"]), []) + crawl_snapshots.append(snapshot) + displayed_snapshot_ids = [ + snapshot["id"] for crawl_snapshots in displayed_snapshots_by_crawl.values() for snapshot in crawl_snapshots + ] + detailed_snapshot_ids = [snapshot["id"] for snapshot in snapshots if snapshot["status"] != Snapshot.StatusChoices.QUEUED] + process_value_fields = ("id", "process_type", "status", "pwd", "cmd", "pid", "exit_code", "started_at", "modified_at") + if active_crawl_ids or displayed_snapshot_ids: + process_scope = Process.objects.filter( + machine_id=machine_id, + process_type__in=[ + Process.TypeChoices.HOOK, + Process.TypeChoices.BINARY, + ], + ) + running_processes = process_scope.filter(status=Process.StatusChoices.RUNNING).values(*process_value_fields) + recent_processes = ( + process_scope.filter(modified_at__gte=now - timedelta(minutes=10)).values(*process_value_fields).order_by("-modified_at") + ) + else: + running_processes = Process.objects.none() + recent_processes = Process.objects.none() + + archiveresults_by_snapshot: dict[str, list[ArchiveResult]] = {str(snapshot_id): [] for snapshot_id in detailed_snapshot_ids} + if detailed_snapshot_ids: + displayed_archiveresults = ( + archiveresult_scope.filter(snapshot_id__in=detailed_snapshot_ids) + .select_related("process") + .only( + "id", + "snapshot_id", + "plugin", + 
"hook_name", + "status", + "output_str", + "output_files", + "output_size", + "start_ts", + "end_ts", + "created_at", + "modified_at", + "process_id", + "process__id", + "process__pid", + "process__started_at", + "process__timeout", + ) + .order_by("snapshot_id", "start_ts", "created_at") + ) + for archiveresult in displayed_archiveresults: + archiveresults_by_snapshot.setdefault(str(archiveresult.snapshot_id), []).append(archiveresult) + + def find_snapshot_for_process(proc_pwd: Path) -> Snapshot | None: + for path_part in reversed(proc_pwd.parts): + snapshot = snapshots_by_id.get(path_part) + if snapshot: + return snapshot + return None + + def find_crawl_for_process(proc_pwd: Path) -> Crawl | None: + for path_part in reversed(proc_pwd.parts): + crawl = crawls_by_id.get(path_part) + if crawl: + return crawl + return None + + running_worker_ids: set[str] = set() + for proc in running_processes: + if not proc["pwd"]: + continue + proc_pwd = Path(proc["pwd"]) + matched_snapshot = find_snapshot_for_process(proc_pwd) + matched_crawl = ( + crawls_by_id.get(str(matched_snapshot["crawl_id"])) if matched_snapshot is not None else find_crawl_for_process(proc_pwd) + ) + if matched_snapshot is None: + if matched_crawl is None: + continue + crawl_id = str(matched_crawl["id"]) + snapshot_id = "" + else: + crawl_id = str(matched_snapshot["crawl_id"]) + snapshot_id = str(matched_snapshot["id"]) + running_worker_ids.add(str(proc["id"])) + _plugin, _label, phase, _hook_name = process_label(proc["cmd"]) + if crawl_id and proc["pid"]: + crawl_process_pids.setdefault(crawl_id, proc["pid"]) + if phase == "snapshot" and snapshot_id and proc["pid"]: + snapshot_process_pids.setdefault(snapshot_id, proc["pid"]) + + for proc in recent_processes: + if not proc["pwd"]: + continue + proc_pwd = Path(proc["pwd"]) + matched_snapshot = find_snapshot_for_process(proc_pwd) + matched_crawl = ( + crawls_by_id.get(str(matched_snapshot["crawl_id"])) if matched_snapshot is not None else 
find_crawl_for_process(proc_pwd) + ) + if matched_snapshot is None and matched_crawl is None: + continue + crawl_id = str(matched_snapshot["crawl_id"] if matched_snapshot is not None else matched_crawl["id"]) + snapshot_id = str(matched_snapshot["id"]) if matched_snapshot is not None else "" + + plugin, label, phase, hook_name = process_label(proc["cmd"]) + + record_scope = str(snapshot_id) if phase == "snapshot" and snapshot_id else str(crawl_id) + proc_key = f"{record_scope}:{plugin}:{label}:{proc['status']}:{proc['exit_code']}" + if proc_key in seen_process_records: + continue + seen_process_records.add(proc_key) + + status = ( + "started" + if proc["status"] == Process.StatusChoices.RUNNING + else ( + "skipped" + if proc["exit_code"] == PROCESS_EXIT_SKIPPED or (phase == "binary" and proc["exit_code"] not in (None, 0)) + else ("failed" if proc["exit_code"] not in (None, 0) else "succeeded") + ) + ) + payload: dict[str, object] = { + "id": str(proc["id"]), + "plugin": plugin, + "label": label, + "hook_name": hook_name, + "status": status, + "phase": phase, + "source": "process", + "process_id": str(proc["id"]), + } + if status == "started" and proc["pid"]: + payload["pid"] = proc["pid"] + proc_started_at = proc["started_at"] or proc["modified_at"] + if phase == "snapshot" and snapshot_id: + process_records_by_snapshot.setdefault(snapshot_id, []).append((payload, proc_started_at)) + elif crawl_id: + process_records_by_crawl.setdefault(crawl_id, []).append((payload, proc_started_at)) + + active_crawls = [] + total_workers = len(running_worker_ids) + for crawl in active_crawls_list: + crawl_id = str(crawl["id"]) + crawl_snapshot_counts = snapshot_counts_by_crawl.get(crawl_id, {}) + total_snapshots = sum(crawl_snapshot_counts.values()) + completed_snapshots = crawl_snapshot_counts.get(Snapshot.StatusChoices.SEALED, 0) + started_snapshots = crawl_snapshot_counts.get(Snapshot.StatusChoices.STARTED, 0) + pending_snapshots = 
crawl_snapshot_counts.get(Snapshot.StatusChoices.QUEUED, 0) + cancelled_snapshots = cancelled_snapshot_counts_by_crawl.get(crawl_id, 0) + + # Count URLs in the crawl (for when snapshots haven't been created yet) + urls_count = 0 + if crawl["urls"]: + urls_count = len([u for u in crawl["urls"].split("\n") if u.strip() and not u.startswith("#")]) + + # Calculate crawl progress + crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0 + crawl_run_started_at = crawl["created_at"] + crawl_setup_plugins = [ + payload + for payload, proc_started_at in process_records_by_crawl.get(crawl_id, []) + if is_current_run_timestamp(proc_started_at, crawl_run_started_at) + ] + crawl_setup_total = len(crawl_setup_plugins) + crawl_setup_completed = sum(1 for item in crawl_setup_plugins if item.get("status") == "succeeded") + crawl_setup_failed = sum(1 for item in crawl_setup_plugins if item.get("status") == "failed") + crawl_setup_pending = sum(1 for item in crawl_setup_plugins if item.get("status") == "queued") + crawl_screencast_url = screencast_frame_url(crawl_id, active_crawl_objects[crawl_id].output_dir) + crawl_screencast_link = f"/admin/crawls/crawl/{crawl_id.replace('-', '')}/change/" if crawl_screencast_url else "" + + # Get active snapshots for this crawl (already prefetched) + active_snapshots_for_crawl = [] + for snapshot in displayed_snapshots_by_crawl.get(crawl_id, []): + snapshot_run_started_at = snapshot.get("downloaded_at") or snapshot.get("created_at") + # Get archive results only for displayed active snapshots. Large crawls can + # contain thousands of sealed snapshots, and prefetching all their results + # makes the progress endpoint compete with the runner. 
+ snapshot_results = [ + ar + for ar in archiveresults_by_snapshot.get(str(snapshot["id"]), []) + if archiveresult_matches_current_run(ar, snapshot_run_started_at) + ] + if snapshot["status"] == Snapshot.StatusChoices.QUEUED: + snapshot_results = [] + + plugin_progress_values: list[int] = [] + all_plugins: list[dict[str, object]] = [] + seen_plugin_keys: set[str] = set() + snapshot_title = ( + str(snapshot["title"] or "") + if snapshot["status"] == Snapshot.StatusChoices.QUEUED + else Snapshot._normalize_title_candidate(snapshot["title"], snapshot_url=snapshot["url"]) + ) + snapshot_favicon_url = "" + snapshot_preview_url = "" + snapshot_preview_link = "" + snapshot_screencast_url = "" + snapshot_screencast_link = "" + snapshot_fallback_urls: list[str] = [] + result_by_plugin = {result.plugin: result for result in snapshot_results} + title_result = result_by_plugin.get("title") + if not snapshot_title and title_result is not None and title_result.status == ArchiveResult.StatusChoices.SUCCEEDED: + snapshot_title = Snapshot._normalize_title_candidate(title_result.output_str, snapshot_url=snapshot["url"]) + favicon_result = result_by_plugin.get("favicon") + if favicon_result is not None and favicon_result.status == ArchiveResult.StatusChoices.SUCCEEDED: + favicon_path = archiveresult_output_path(favicon_result) or "favicon/favicon.ico" + snapshot_favicon_url = snapshot_output_url(snapshot, favicon_path) + screenshot_result = result_by_plugin.get("screenshot") + if screenshot_result is not None and screenshot_result.status == ArchiveResult.StatusChoices.SUCCEEDED: + snapshot_preview_link = snapshot_view_url(snapshot) + screenshot_path = archiveresult_output_path(screenshot_result) or "screenshot/screenshot.png" + snapshot_preview_url = snapshot_output_url(snapshot, screenshot_path) + snapshot_preview_link = snapshot_view_url(snapshot, screenshot_path) + if snapshot_favicon_url: + snapshot_fallback_urls.append(snapshot_favicon_url) + elif snapshot_favicon_url: + 
snapshot_preview_url = snapshot_favicon_url + + if snapshot["status"] == Snapshot.StatusChoices.STARTED: + snapshot_screencast_url = screencast_frame_url(crawl_id, active_crawl_objects[crawl_id].output_dir) + snapshot_screencast_link = snapshot_view_url(snapshot) if snapshot_screencast_url else "" + + def plugin_sort_key(ar): + status_order = { + ArchiveResult.StatusChoices.STARTED: 0, + ArchiveResult.StatusChoices.QUEUED: 1, + ArchiveResult.StatusChoices.SUCCEEDED: 2, + ArchiveResult.StatusChoices.NORESULTS: 3, + ArchiveResult.StatusChoices.FAILED: 4, + } + return (status_order.get(ar.status, 5), ar.plugin, ar.hook_name or "") + + for ar in sorted(snapshot_results, key=plugin_sort_key): + status = ar.status + process = ar.process_record + progress_value = 0 + if status in ( + ArchiveResult.StatusChoices.SUCCEEDED, + ArchiveResult.StatusChoices.FAILED, + ArchiveResult.StatusChoices.SKIPPED, + ArchiveResult.StatusChoices.NORESULTS, + ): + progress_value = 100 + elif status == ArchiveResult.StatusChoices.STARTED: + started_at = ar.start_ts or (process.started_at if process else None) + timeout = process.timeout if process else 120 + if started_at and timeout: + elapsed = max(0.0, (now - started_at).total_seconds()) + progress_value = int(min(99, max(1, (elapsed / float(timeout)) * 100))) + else: + progress_value = 1 + else: + progress_value = 0 + + plugin_progress_values.append(progress_value) + plugin, label, phase, hook_name = hook_details(ar.hook_name or ar.plugin, plugin=ar.plugin) + + plugin_payload = { + "id": str(ar.id), + "plugin": ar.plugin, + "label": label, + "hook_name": hook_name, + "phase": phase, + "status": status, + "process_id": str(process.id) if process else None, + "admin_url": f"/admin/core/archiveresult/{ar.id}/change/", + } + output_path = archiveresult_output_path(ar) + if output_path: + plugin_payload["output_path"] = output_path + plugin_payload["output_url"] = snapshot_view_url(snapshot, output_path) + if status == 
ArchiveResult.StatusChoices.STARTED and process: + plugin_payload["pid"] = process.pid + if status == ArchiveResult.StatusChoices.STARTED: + plugin_payload["progress"] = progress_value + plugin_payload["timeout"] = process.timeout if process else 120 + plugin_payload["source"] = "archiveresult" + all_plugins.append(plugin_payload) + seen_plugin_keys.add(str(process.id) if process else f"{ar.plugin}:{hook_name}") + + for proc_payload, proc_started_at in process_records_by_snapshot.get(str(snapshot["id"]), []): + if not is_current_run_timestamp(proc_started_at, snapshot_run_started_at): + continue + proc_key = str(proc_payload.get("process_id") or f"{proc_payload.get('plugin')}:{proc_payload.get('hook_name')}") + if proc_key in seen_plugin_keys: + continue + seen_plugin_keys.add(proc_key) + all_plugins.append(proc_payload) + + proc_status = proc_payload.get("status") + if proc_status in ("succeeded", "failed", "skipped"): + plugin_progress_values.append(100) + elif proc_status == "started": + plugin_progress_values.append(1) + else: + plugin_progress_values.append(0) + + total_plugins = len(all_plugins) + completed_plugins = sum(1 for item in all_plugins if item.get("status") == "succeeded") + failed_plugins = sum(1 for item in all_plugins if item.get("status") == "failed") + pending_plugins = sum(1 for item in all_plugins if item.get("status") == "queued") + + snapshot_progress = int(sum(plugin_progress_values) / len(plugin_progress_values)) if plugin_progress_values else 0 + worker_state = "running" if snapshot_process_pids.get(str(snapshot["id"])) else "waiting" + if ( + snapshot["status"] == Snapshot.StatusChoices.STARTED + and worker_state == "waiting" + and not all_plugins + and snapshot["modified_at"] + and (now - snapshot["modified_at"]).total_seconds() > 30 + ): + worker_state = "waiting" if orchestrator_running else "crashed" + + if snapshot["status"] == Snapshot.StatusChoices.QUEUED and not snapshot_process_pids.get(str(snapshot["id"])): + compact_snapshot 
= [ + str(snapshot["id"]), + snapshot_display_url(snapshot["url"]), + ] + if snapshot_title: + compact_snapshot.append(snapshot_title) + active_snapshots_for_crawl.append(compact_snapshot) + continue + + snapshot_payload = { + "id": str(snapshot["id"]), + "url": snapshot_display_url(snapshot["url"]), + "title": snapshot_title, + "status": snapshot["status"], + "worker_state": worker_state, + } + if snapshot["status"] != Snapshot.StatusChoices.QUEUED or all_plugins or snapshot_process_pids.get(str(snapshot["id"])): + snapshot_payload.update( + { + "view_url": snapshot_view_url(snapshot), + "started": (snapshot["downloaded_at"] or snapshot["created_at"]).isoformat() + if (snapshot["downloaded_at"] or snapshot["created_at"]) + else None, + "progress": snapshot_progress, + "total_plugins": total_plugins, + "completed_plugins": completed_plugins, + "failed_plugins": failed_plugins, + "pending_plugins": pending_plugins, + "all_plugins": all_plugins, + }, + ) + if snapshot_favicon_url: + snapshot_payload["favicon_url"] = snapshot_favicon_url + if snapshot_preview_url: + snapshot_payload["preview_url"] = snapshot_preview_url + snapshot_payload["preview_link"] = snapshot_preview_link + if snapshot_screencast_url: + snapshot_payload["screencast_url"] = snapshot_screencast_url + snapshot_payload["screencast_link"] = snapshot_screencast_link + if snapshot_fallback_urls: + snapshot_payload["preview_fallbacks"] = snapshot_fallback_urls + if snapshot_process_pids.get(str(snapshot["id"])): + snapshot_payload["worker_pid"] = snapshot_process_pids[str(snapshot["id"])] + + active_snapshots_for_crawl.append(snapshot_payload) + + # Check if crawl can start (for debugging stuck crawls) + can_start = bool(crawl["urls"]) + urls_preview = crawl["urls"][:60] if crawl["urls"] else None + crawl_tags = [tag.strip() for tag in (crawl["tags_str"] or "").replace("\n", ",").split(",") if tag.strip()] + persona_details = persona_details_by_id.get(str(crawl["persona_id"])) if crawl["persona_id"] 
else None + persona_name = persona_details["name"] if persona_details else "Default" + persona_details = persona_details or persona_details_by_name.get(persona_name) + crawl_output_size = crawl_output_sizes_by_crawl.get(crawl_id, 0) + avg_snapshot_size = int(crawl_output_size / completed_snapshots) if completed_snapshots else 0 + crawl_obj = active_crawl_objects[crawl_id] + effective_crawl_config = get_config(crawl=crawl_obj, resolve_plugins=False) + max_urls = int(effective_crawl_config.CRAWL_MAX_URLS or 0) + crawl_max_size = int(effective_crawl_config.CRAWL_MAX_SIZE or 0) + crawl_timeout = int(effective_crawl_config.CRAWL_TIMEOUT or 0) + snapshot_max_size = int(effective_crawl_config.SNAPSHOT_MAX_SIZE or 0) + + # Check if retry_at is in the future (would prevent worker from claiming) + retry_at_future = crawl["retry_at"] > now if crawl["retry_at"] else False + is_paused = crawl_obj.is_paused + seconds_until_retry = ( + 0 if is_paused else int((crawl["retry_at"] - now).total_seconds()) if crawl["retry_at"] and retry_at_future else 0 + ) + crawl_worker_state = ( + "running" + if crawl_process_pids.get(crawl_id) + or any(isinstance(snapshot, dict) and snapshot.get("worker_pid") for snapshot in active_snapshots_for_crawl) + else "waiting" + ) + if is_paused: + crawl_worker_state = "paused" + elif ( + crawl["status"] == Crawl.StatusChoices.STARTED + and crawl_worker_state == "waiting" + and (started_snapshots or pending_snapshots) + ): + crawl_worker_state = "waiting" if orchestrator_running else "crashed" + + active_crawls.append( + { + "id": crawl_id, + "label": (next((line.strip() for line in (crawl["urls"] or "").splitlines() if line.strip()), "") or crawl_id)[:60], + "status": crawl["status"], + "is_paused": is_paused, + "started": crawl["created_at"].isoformat() if crawl["created_at"] else None, + "progress": crawl_progress, + "created_by": crawl["created_by__username"], + "persona": persona_name, + "persona_admin_url": persona_details["admin_url"] if 
persona_details else None, + "max_depth": crawl["max_depth"], + "max_urls": max_urls, + "max_crawl_size": crawl_max_size, + "crawl_timeout": crawl_timeout, + "max_snapshot_size": snapshot_max_size, + "max_crawl_size_display": printable_filesize(crawl_max_size) if crawl_max_size else "unlimited", + "crawl_timeout_display": f"{crawl_timeout}s" if crawl_timeout else "unlimited", + "max_snapshot_size_display": printable_filesize(snapshot_max_size) if snapshot_max_size else "unlimited", + "crawl_output_size": crawl_output_size, + "avg_snapshot_size": avg_snapshot_size, + "crawl_output_size_display": printable_filesize(crawl_output_size) if crawl_output_size else "0 B", + "avg_snapshot_size_display": printable_filesize(avg_snapshot_size) if avg_snapshot_size else "0 B", + "tags": crawl_tags, + "urls_count": urls_count, + "total_snapshots": total_snapshots, + "completed_snapshots": completed_snapshots, + "started_snapshots": started_snapshots, + "failed_snapshots": 0, + "pending_snapshots": pending_snapshots, + "cancelled_snapshots": cancelled_snapshots, + "setup_plugins": crawl_setup_plugins, + "setup_total_plugins": crawl_setup_total, + "setup_completed_plugins": crawl_setup_completed, + "setup_failed_plugins": crawl_setup_failed, + "setup_pending_plugins": crawl_setup_pending, + "screencast_url": crawl_screencast_url, + "screencast_link": crawl_screencast_link, + "active_snapshots": active_snapshots_for_crawl, + "queued_snapshots_hidden": queued_snapshot_overflow_by_crawl.get(crawl_id, 0), + "can_start": can_start, + "urls_preview": urls_preview, + "retry_at_future": retry_at_future, + "seconds_until_retry": seconds_until_retry, + "worker_pid": crawl_process_pids.get(crawl_id), + "worker_state": crawl_worker_state, + }, + ) + + payload = { + "is_admin": is_admin, + "scope": { + "snapshot_id": str(scoped_snapshot.id) if scoped_snapshot is not None else "", + "crawl_id": crawl_id_filter, + }, + "orchestrator_running": orchestrator_running, + "orchestrator_pid": 
orchestrator_pid, + "total_workers": total_workers, + "crawls_active": crawls_active, + "crawls_queued": crawls_queued, + "crawls_recent": crawls_recent, + "snapshots_active": snapshots_active, + "snapshots_queued": snapshots_queued, + "archiveresults_active": archiveresults_active, + "archiveresults_queued": archiveresults_queued, + "downloads_active": downloads_active, + "downloads_queued": downloads_queued, + "indexing_active": indexing_active, + "indexing_queued": indexing_queued, + "active_crawls": active_crawls, + "queued_crawls_hidden": queued_crawls_hidden, + "server_time": timezone.now().isoformat(), + } + try: + import ujson + + return HttpResponse(ujson.dumps(payload), content_type="application/json") + except ImportError: + return JsonResponse(payload) + except Exception as e: + import traceback + + return JsonResponse( + { + "error": str(e), + "traceback": traceback.format_exc(), + "orchestrator_running": False, + "total_workers": 0, + "crawls_active": 0, + "crawls_queued": 0, + "crawls_recent": 0, + "snapshots_active": 0, + "snapshots_queued": 0, + "archiveresults_active": 0, + "archiveresults_queued": 0, + "downloads_active": 0, + "downloads_queued": 0, + "indexing_active": 0, + "indexing_queued": 0, + "active_crawls": [], + "server_time": timezone.now().isoformat(), + }, + status=500, + ) diff --git a/archivebox/purge.py b/archivebox/purge.py deleted file mode 100755 index e2e4e97c56..0000000000 --- a/archivebox/purge.py +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/env python3 - -import re -from argparse import ArgumentParser -from os.path import exists, join -from shutil import rmtree -from typing import List - -from config import ARCHIVE_DIR, OUTPUT_DIR -from index import (parse_json_links_index, write_html_links_index, - write_json_links_index) - - -def cleanup_index(regexes: List[str], proceed: bool, delete: bool) -> None: - if not exists(join(OUTPUT_DIR, 'index.json')): - exit('index.json is missing; nothing to do') - - compiled = [re.compile(r) 
for r in regexes] - links = parse_json_links_index(OUTPUT_DIR) - filtered = [] - remaining = [] - - for l in links: - url = l['url'] - for r in compiled: - if r.search(url): - filtered.append((l, r)) - break - else: - remaining.append(l) - - if not filtered: - exit('Search did not match any entries.') - - print('Filtered out {}/{} urls:'.format(len(filtered), len(links))) - - for link, regex in filtered: - url = link['url'] - print(' {url} via {regex}'.format(url=url, regex=regex.pattern)) - - if not proceed: - answer = input('Remove {} entries from index? [y/n] '.format( - len(filtered))) - proceed = answer.strip().lower() in ('y', 'yes') - - if not proceed: - exit('Aborted') - - write_json_links_index(OUTPUT_DIR, remaining) - write_html_links_index(OUTPUT_DIR, remaining) - - if delete: - for link, _ in filtered: - data_dir = join(ARCHIVE_DIR, link['timestamp']) - if exists(data_dir): - rmtree(data_dir) - - -if __name__ == '__main__': - p = ArgumentParser('Index purging tool') - p.add_argument( - '--regex', - '-r', - action='append', - help='Regular expression matching URLs to purge', - ) - p.add_argument( - '--delete', - '-d', - action='store_true', - default=False, - help='Delete webpage files from archive', - ) - p.add_argument( - '--yes', - '-y', - action='store_true', - default=False, - help='Do not prompt for confirmation', - ) - - args = p.parse_args() - if args.regex: - cleanup_index(args.regex, proceed=args.yes, delete=args.delete) - else: - p.print_help() diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py new file mode 100644 index 0000000000..ff37bca4b6 --- /dev/null +++ b/archivebox/search/__init__.py @@ -0,0 +1 @@ +__package__ = "archivebox.search" diff --git a/archivebox/search/admin.py b/archivebox/search/admin.py new file mode 100644 index 0000000000..988ee51f61 --- /dev/null +++ b/archivebox/search/admin.py @@ -0,0 +1,99 @@ +__package__ = "archivebox.search" + +from django.contrib import admin +from 
class SearchResultsChangeList(ChangeList):
    """Admin ChangeList that remembers which ArchiveBox search mode is active."""

    def __init__(self, request, *args, **kwargs):
        """Resolve the search mode and its backend, then let Django build the list.

        The mode attributes are assigned before ``super().__init__`` so they are
        already set while the parent constructor runs.
        """
        archivebox_config = request.archivebox_config
        requested_mode = request.GET.get("search_mode")
        self.search_mode = get_search_mode(requested_mode, config=archivebox_config)
        self.search_mode_backend = get_search_mode_backend(self.search_mode, config=archivebox_config)
        super().__init__(request, *args, **kwargs)
        self.embedded_changelist = request.GET.get("_embedded") == "crawl"

    def get_results(self, request):
        """Run the stock result lookup, then decide whether to show the index hint.

        The hint flag is set only for an empty Snapshot result of a non-empty
        query when the active mode is a "deep" mode with a backend selected.
        """
        super().get_results(request)
        show_hint = False
        if self.opts.model_name == "snapshot" and self.query and self.result_count == 0:
            mode_base = get_search_mode_base(self.search_mode, config=request.archivebox_config)
            show_hint = mode_base == "deep" and bool(self.search_mode_backend)
        self.show_search_index_hint = show_hint

    def get_filters_params(self, params=None):
        """Return the admin filter params without the UI-only query-string keys."""
        lookup_params = super().get_filters_params(params)
        for ui_only_key in ("search_mode", "_embedded", "per_page"):
            lookup_params.pop(ui_only_key, None)
        return lookup_params


class SearchResultsAdminMixin(admin.ModelAdmin):
    """ModelAdmin base that sends changelist searches through ArchiveBox search modes."""

    show_search_mode_selector = True

    def get_changelist(self, request, **kwargs):
        """Use the search-mode-aware ChangeList for this admin."""
        return SearchResultsChangeList

    def get_default_search_mode(self):
        """Return the default search mode resolved from ``self.request``'s config."""
        return get_default_search_mode(config=self.request.archivebox_config)

    def get_search_mode_options(self):
        """Return the selector options resolved from ``self.request``'s config."""
        return get_search_mode_options(config=self.request.archivebox_config)

    def get_search_results(self, request, queryset, search_term: str):
        """Filter *queryset* for *search_term* and return ``(queryset, use_distinct)``.

        Blank terms fall through to Django's default behaviour. Non-embedded
        Snapshot changelists are answered only from the cached ID list
        (ranked in cached order); a missing or empty cache yields no rows.
        Otherwise "meta" modes use Django's field search, and the embedded
        crawl changelist asks the search index, returning no rows on error.
        """
        search_term = search_term.strip()
        if not search_term:
            return super().get_search_results(request, queryset, search_term)

        archivebox_config = request.archivebox_config
        search_mode = get_search_mode(request.GET.get("search_mode"), config=archivebox_config)
        is_embedded_crawl = request.GET.get("_embedded") == "crawl"

        if queryset.model._meta.label_lower == "core.snapshot" and not is_embedded_crawl:
            cached_ids = get_cached_admin_search_ids(request)
            if not cached_ids:
                # Covers both "nothing cached yet" (None) and "cached, but empty".
                return queryset.none(), False
            rank_cases = [
                When(pk=snapshot_id, then=Value(position))
                for position, snapshot_id in enumerate(cached_ids)
            ]
            ranked = (
                queryset.filter(pk__in=cached_ids)
                .annotate(search_rank=Case(*rank_cases, output_field=IntegerField()))
                .order_by("search_rank")
            )
            return ranked, False

        if get_search_mode_base(search_mode, config=archivebox_config) == "meta":
            meta_queryset, needs_distinct = super().get_search_results(request, queryset, search_term)
            return meta_queryset, needs_distinct

        if not is_embedded_crawl:
            return queryset.none(), False

        try:
            index_matches = query_search_index(
                search_term,
                search_mode=search_mode,
                config=archivebox_config,
            )
            return queryset.filter(pk__in=index_matches.values("pk")), False
        except Exception as err:
            # Best-effort: a broken backend must not take the admin page down.
            print(f"[!] Error while using search backend: {err.__class__.__name__} {err}")
            return queryset.none(), False
Error while using search backend: {err.__class__.__name__} {err}") + return queryset.none(), False + return queryset.none(), False diff --git a/archivebox/search/apps.py b/archivebox/search/apps.py new file mode 100644 index 0000000000..4e58c7d77a --- /dev/null +++ b/archivebox/search/apps.py @@ -0,0 +1,11 @@ +__package__ = "archivebox.search" + +from django.apps import AppConfig + + +class SearchConfig(AppConfig): + """Register search templates and admin integration with Django.""" + + default_auto_field = "django.db.models.BigAutoField" + name = "archivebox.search" + verbose_name = "Search" diff --git a/archivebox/search/backends.py b/archivebox/search/backends.py new file mode 100644 index 0000000000..135faad7cf --- /dev/null +++ b/archivebox/search/backends.py @@ -0,0 +1,69 @@ +__package__ = "archivebox.search" + +import os +from contextlib import contextmanager +from typing import Any + +from archivebox.config.common import get_config + + +_search_backends_cache: dict | None = None + + +@contextmanager +def search_backend_env(config: dict[str, Any] | None = None, **config_kwargs: Any): + """Temporarily expose resolved search config through os.environ for backend code.""" + config = config or get_config(**config_kwargs) + updates = {} + for key, value in config.items(): + if not str(key).startswith("SEARCH_BACKEND_"): + continue + if value is None: + continue + if isinstance(value, (str, int, float, bool, os.PathLike)): + updates[str(key)] = str(value) + previous = {key: os.environ.get(key) for key in updates} + os.environ.update(updates) + try: + yield + finally: + for key, value in previous.items(): + if value is None: + os.environ.pop(key, None) + else: + os.environ[key] = value + + +def normalize_search_backend_name(backend_name: str | None) -> str: + """Normalize a backend name for config and plugin lookup.""" + return (backend_name or "").strip().lower().replace("-", "_") + + +def get_available_backends() -> dict: + """Discover search backend plugin 
def get_backend(config: dict[str, Any] | None = None, **config_kwargs: Any) -> Any:
    """Return the search backend module selected by ``SEARCH_BACKEND_ENGINE``.

    The configured name is normalized and looked up among the discovered
    backends; if it is missing, the ``ripgrep`` backend is used when present.

    Raises:
        RuntimeError: neither the configured backend nor ``ripgrep`` is available.
    """
    config = config or get_config(**config_kwargs)
    requested_name = normalize_search_backend_name(config.SEARCH_BACKEND_ENGINE)
    backends = get_available_backends()

    # Preference order: the configured engine first, then the ripgrep fallback.
    for candidate in (requested_name, "ripgrep"):
        if candidate in backends:
            return backends[candidate]

    available = list(backends.keys())
    raise RuntimeError(
        f'Search backend "{requested_name}" not found. Available backends: {available or "none"}',
    )
def get_search_mode_options(config: dict[str, Any] | None = None, **config_kwargs: Any) -> list[dict[str, str]]:
    """Return the ``{"value", "label"}`` choices offered by search mode selectors.

    The list always starts with the ``meta`` and ``contents`` entries. One
    ``deep:<backend>`` entry follows per discovered backend (configured
    backend first, the rest alphabetically); with no backends a single plain
    ``deep`` entry is appended instead.
    """
    config = config or get_config(**config_kwargs)
    backends = get_available_backends()
    configured_backend = normalize_search_backend_name(config.SEARCH_BACKEND_ENGINE)

    backend_names = [configured_backend] if configured_backend in backends else []
    backend_names += [name for name in sorted(backends) if name != configured_backend]

    # NOTE(review): the "contents" value is labelled "deep" here; kept as-is,
    # confirm with the selector templates whether that label is intentional.
    options = [
        {"value": "meta", "label": "meta"},
        {"value": "contents", "label": "deep"},
    ]
    if not backend_names:
        options.append({"value": "deep", "label": "deep"})
        return options

    for backend_name in backend_names:
        qualified_mode = f"deep:{backend_name}"
        options.append({"value": qualified_mode, "label": qualified_mode})
    return options
def escape_like_query(query: str) -> str:
    """Backslash-escape ``\\``, ``%`` and ``_`` so *query* is literal inside a LIKE pattern."""
    escaped = query
    # The backslash must be handled first, otherwise the escapes added for
    # "%" and "_" would themselves get doubled.
    for special in ("\\", "%", "_"):
        escaped = escaped.replace(special, "\\" + special)
    return escaped
def prioritize_metadata_matches(
    base_queryset: QuerySet,
    metadata_queryset: QuerySet,
    fulltext_queryset: QuerySet,
    *,
    deep_queryset: QuerySet | None = None,
    ordering: list[str] | tuple[str, ...] | None = None,
) -> QuerySet:
    """Combine metadata, full-text and deep hits, ranking them in that order.

    Up to ``MAX_SEARCH_RANK_IDS + 1`` primary keys are read from each source.
    When every source stays within ``MAX_SEARCH_RANK_IDS`` the result is
    annotated with ``search_rank`` (0 metadata, 1 full-text, 2 otherwise) and,
    if *ordering* is given, ordered by rank first. When any source overflows,
    the sources are unioned through subqueries without a rank annotation.
    Returns an empty queryset when no source produced a hit.
    """

    def capped_pks(queryset):
        # One extra row is fetched so an overflow can be detected below.
        return list(queryset.values_list("pk", flat=True).distinct()[: MAX_SEARCH_RANK_IDS + 1])

    metadata_ids = capped_pks(metadata_queryset)
    already_ranked = set(metadata_ids)

    fulltext_ids = [pk for pk in capped_pks(fulltext_queryset) if pk not in already_ranked]
    already_ranked.update(fulltext_ids)

    deep_ids = []
    if deep_queryset is not None:
        deep_ids = [pk for pk in capped_pks(deep_queryset) if pk not in already_ranked]

    if not (metadata_ids or fulltext_ids or deep_ids):
        return base_queryset.none()

    overflowed = any(len(ids) > MAX_SEARCH_RANK_IDS for ids in (metadata_ids, fulltext_ids, deep_ids))
    if overflowed:
        # Too many IDs to enumerate in a CASE expression: filter via subqueries instead.
        search_filter = Q()
        sources = (
            (metadata_ids, metadata_queryset),
            (fulltext_ids, fulltext_queryset),
            (deep_ids, deep_queryset),
        )
        for ids, source_queryset in sources:
            if ids and source_queryset is not None:
                search_filter |= Q(pk__in=source_queryset.values("pk").distinct())
        unranked = base_queryset.filter(search_filter)
        if ordering is not None:
            unranked = unranked.order_by(*ordering)
        return unranked.distinct()

    rank_expression = Case(
        When(pk__in=metadata_ids, then=Value(0)),
        When(pk__in=fulltext_ids, then=Value(1)),
        default=Value(2),
        output_field=IntegerField(),
    )
    ranked = base_queryset.filter(pk__in=[*metadata_ids, *fulltext_ids, *deep_ids]).annotate(
        search_rank=rank_expression,
    )
    if ordering is not None:
        ranked = ranked.order_by("search_rank", *ordering)
    return ranked.distinct()
def iter_query_search_ids(
    query: str,
    search_mode: str | None = None,
    config: dict[str, Any] | None = None,
    max_results: int | None = None,
    **config_kwargs: Any,
):
    """Yield de-duplicated snapshot IDs from the search backend(s) for *query*.

    Args:
        query: Search text handed to each backend.
        search_mode: Mode string (optionally ``mode:backend``); ``None`` means
            ``"contents"``. ``meta`` modes yield nothing.
        config: Resolved config; when falsy it is built from *config_kwargs*.
        max_results: Stop after this many distinct IDs (falsy means no limit).

    Backend selection: a backend named in *search_mode* is used alone; "deep"
    mode tries the configured backend, then the others, with ``ripgrep`` last;
    otherwise the configured backend, falling back to ``ripgrep``.

    Raises:
        RuntimeError: the requested or configured backend is not available.
        Exception: whatever a backend raised. Outside un-forced "deep" mode the
            first failure propagates; in "deep" mode failures are tolerated as
            long as at least one backend completed.
    """
    config = config or get_config(**config_kwargs)
    search_mode = "contents" if search_mode is None else get_search_mode(search_mode, config=config)
    search_mode_base = get_search_mode_base(search_mode, config=config)
    forced_backend = get_search_mode_backend(search_mode, config=config)
    if search_mode_base == "meta":
        return

    backends = get_available_backends()
    configured_backend = normalize_search_backend_name(config.SEARCH_BACKEND_ENGINE)
    if forced_backend:
        if forced_backend not in backends:
            raise RuntimeError(
                f'Search backend "{forced_backend}" not found. Available backends: {list(backends) or "none"}',
            )
        backend_names = [forced_backend]
    elif search_mode_base == "deep":
        backend_names = [
            *([configured_backend] if configured_backend in backends and configured_backend != "ripgrep" else []),
            *(name for name in backends if name not in {configured_backend, "ripgrep"}),
            *(["ripgrep"] if "ripgrep" in backends else []),
        ]
    elif configured_backend in backends:
        backend_names = [configured_backend]
    elif "ripgrep" in backends:
        backend_names = ["ripgrep"]
    else:
        # Nothing usable: let get_backend() raise its standard "not found"
        # error. Pass the config resolved above so the check and its message
        # refer to the same configuration this call was made with, instead of
        # re-resolving defaults that may name a different backend.
        get_backend(config=config)
        return

    if "sonic" in backend_names:
        from archivebox.core.takeover_util import ensure_daemon_stack

        ensure_daemon_stack(reason="search query")

    errors: list[Exception] = []
    successful_backends = 0
    seen: set[str] = set()
    try:
        for backend_name in backend_names:
            backend = backends[backend_name]
            try:
                # Backends read their settings from the environment, so it must
                # stay patched while their (possibly lazy) results are consumed.
                with search_backend_env(config=config):
                    if backend_name == "ripgrep":
                        ids = backend.iter_search(query, search_mode=search_mode_base)
                    else:
                        ids = backend.search(query)
                    for snapshot_id in ids:
                        if snapshot_id in seen:
                            continue
                        seen.add(snapshot_id)
                        yield snapshot_id
                        if max_results and len(seen) >= max_results:
                            return
                successful_backends += 1
            except Exception as err:
                errors.append(err)
                # Only un-forced "deep" mode keeps going after a backend fails.
                if search_mode_base != "deep" or forced_backend:
                    raise
    except Exception as err:
        stderr()
        stderr(
            f"[X] The search backend threw an exception={err}:",
            color="red",
        )
        raise
    else:
        # Every "deep" backend failed: surface the first error rather than
        # pretending the search simply had no hits.
        if not successful_backends and errors and search_mode_base == "deep":
            raise errors[0]
def register_sonic_daemon_event_handler(bus) -> None:
    """Register a ``ProcessStdoutEvent`` handler on *bus* that requires a reachable Sonic daemon.

    The handler reacts only to stdout lines that are JSON objects with
    ``"type": "SonicDaemonStartEvent"``. It then polls for up to 30 seconds
    until the announced host/port is listening, and raises ``RuntimeError``
    describing the supervisord/worker state if that never happens.
    """

    async def on_ProcessStdoutEvent__require_sonic_daemon(event: ProcessStdoutEvent) -> None:
        try:
            record = json.loads(event.line)
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass: non-JSON output is ignored.
            return
        is_start_record = isinstance(record, dict) and record.get("type") == "SonicDaemonStartEvent"
        if not is_start_record:
            return

        from abx_plugins.plugins.search_backend_sonic.daemon import (
            SonicDaemonStartEvent,
            is_port_listening,
        )
        from archivebox.workers.supervisord_util import get_existing_supervisord_process, get_worker

        daemon_event = SonicDaemonStartEvent.from_record(record)

        def port_is_open() -> bool:
            return is_port_listening(daemon_event.host, daemon_event.port)

        # Worker states in which it is still worth waiting for the port.
        waitable_states = {"STARTING", "RUNNING", "BACKOFF", "STOPPING"}
        give_up_at = time.monotonic() + 30.0
        last_worker = None

        while time.monotonic() < give_up_at:
            if port_is_open():
                return

            supervisor = get_existing_supervisord_process(quiet=True)
            worker = None if supervisor is None else get_worker(supervisor, daemon_event.worker_name)
            if isinstance(worker, dict):
                last_worker = worker
                if worker.get("statename") not in waitable_states:
                    break

            await asyncio.sleep(0.5)

        # One final probe before reporting a failure.
        if port_is_open():
            return

        supervisor = get_existing_supervisord_process(quiet=True)
        if supervisor is None:
            raise RuntimeError("Sonic search backend is required, but ArchiveBox supervisord is not running")

        worker = get_worker(supervisor, daemon_event.worker_name)
        if worker is None and last_worker is not None:
            # Fall back to the last state seen while polling.
            worker = last_worker
        if not worker:
            raise RuntimeError(f"Sonic search backend worker is not configured: {daemon_event.worker_name}")
        if worker.get("statename") != "RUNNING":
            raise RuntimeError(
                f"Sonic search backend worker is {worker.get('statename')}: {worker.get('description')}",
            )
        raise RuntimeError(f"Sonic search backend is not listening at {daemon_event.url}")

    bus.on(ProcessStdoutEvent, on_ProcessStdoutEvent__require_sonic_daemon)
+ payload = json.dumps( + { + "user": str(request.user.pk or "anon"), + "url": url or request.get_full_path(), + }, + sort_keys=True, + ) + return f"abx:admin-search:{hashlib.sha256(payload.encode()).hexdigest()}" + + +def get_public_search_cache_key(request, url: str | None = None) -> str: + """Build the cache key for one public search URL.""" + payload = json.dumps( + { + "url": url or request.get_full_path(), + }, + sort_keys=True, + ) + return f"abx:public-search:{hashlib.sha256(payload.encode()).hexdigest()}" + + +def get_cached_admin_search_ids(request) -> list[str] | None: + """Return streamed admin search IDs from Django cache.""" + cached = cache.get(get_admin_search_cache_key(request)) + if isinstance(cached, dict): + return cached.get("ids") or [] + return None + + +def get_cached_public_search_ids(request) -> list[str] | None: + """Return streamed public search IDs from Django cache.""" + cached = get_cached_public_search_state(request) + if isinstance(cached, dict): + return cached.get("ids") or [] + return None + + +def get_cached_public_search_state(request) -> dict | None: + """Return streamed public search state from Django cache.""" + cached = cache.get(get_public_search_cache_key(request)) + return cached if isinstance(cached, dict) else None + + +def iter_url_search_prefixes(query: str): + """Yield URL prefixes that can use indexed startswith scans for common search input.""" + query = query.strip().lower() + if not query or any(char.isspace() for char in query): + return + + prefixes = [] + + def add(prefix: str): + if prefix and prefix not in prefixes: + prefixes.append(prefix) + + add(query) + if "://" in query: + parsed = urlsplit(query) + if parsed.scheme and parsed.netloc: + host = parsed.netloc + path = parsed.path or "" + if parsed.query: + path = f"{path}?{parsed.query}" + if host.startswith("www."): + add(f"{parsed.scheme}://{host[4:]}{path}") + else: + add(f"{parsed.scheme}://www.{host}{path}") + else: + trimmed = query.lstrip("/") + 
def url_prefix_upper_bound(prefix: str) -> str:
    """Return the exclusive upper bound for an indexed URL prefix range.

    Every string that starts with ``prefix`` sorts (by code point) below the
    returned value, so a ``prefix <= url < bound`` range scan selects exactly
    the URLs with that prefix.

    Args:
        prefix: The URL prefix to bound.

    Returns:
        ``prefix`` with its last incrementable character replaced by the next
        code point. An empty string is returned when no finite bound exists:
        for an empty prefix, or for one made only of U+10FFFF characters.
    """
    # U+10FFFF is the highest code point, so it cannot be incremented
    # (chr(0x110000) raises ValueError). Dropping such trailing characters and
    # bumping the character before them yields the same bound.
    trimmed = prefix.rstrip("\U0010ffff")
    if not trimmed:
        return ""

    successor = ord(trimmed[-1]) + 1
    # Lone surrogates cannot be encoded as UTF-8 when the bound is passed as a
    # query parameter; skip to the first code point after the surrogate range.
    if 0xD800 <= successor <= 0xDFFF:
        successor = 0xE000
    return f"{trimmed[:-1]}{chr(successor)}"
Q(timestamp__startswith=query) | Q(title__istartswith=query), + Q(url__icontains=query), + Q(title__icontains=query), + Q(tags__name__icontains=query), + Q(notes__icontains=query), + ] + for wave in waves: + for pk in queryset.filter(wave).values_list("pk", flat=True).distinct().iterator(chunk_size=500): + if pk in seen: + continue + seen.add(pk) + yield pk + + crawl_metadata_wave = Q(crawl__notes__icontains=query) | Q(crawl__label__icontains=query) | Q(crawl__created_by__username=query) + if not seen: + for pk in queryset.filter(crawl_metadata_wave).values_list("pk", flat=True).distinct().iterator(chunk_size=500): + seen.add(pk) + yield pk + + config_wave = crawl_config_values_search_wave(query) + if config_wave is not None and not seen: + for pk in queryset.filter(config_wave).values_list("pk", flat=True).distinct().iterator(chunk_size=500): + seen.add(pk) + yield pk + + +def normalize_search_result_id(snapshot_id) -> str | None: + """Return a compact Snapshot ID string from a search provider result.""" + snapshot_id = str(snapshot_id).strip().lower().replace("-", "") + if len(snapshot_id) != 32: + return None + return snapshot_id + + +def iter_filtered_search_result_ids(iterator, queryset, *, flush_max_delay=0.05): + """Yield provider IDs that still match the filtered queryset. + + This is the single intersection/dedupe path used for metadata and every + search backend. It flushes by elapsed time so sparse providers stream rows + as soon as IDs are found instead of waiting for a fixed batch size. 
def snapshot_search_stream_response(query, base_queryset, *, search_mode, config, cache_key, thread_name):
    """Stream Snapshot search progress and cache matching IDs for a list view.

    The search runs in a daemon thread. Each progress update is one text line
    holding the number of IDs found so far followed by 4096 spaces of padding
    (presumably to push small updates through proxy/browser buffers - TODO
    confirm). The matching IDs themselves are written to the Django cache
    under ``cache_key`` as ``{"ids": [...], "done": bool}``.

    Args:
        query: Search text; an empty query returns an empty streaming response.
        base_queryset: Filtered Snapshot queryset that results are intersected with.
        search_mode: Search mode passed through to ``iter_search_result_ids``.
        config: Resolved config passed through to ``iter_search_result_ids``.
        cache_key: Cache key that receives the ID list.
        thread_name: Name given to the search thread.

    Returns:
        A ``text/plain`` ``StreamingHttpResponse`` with ``X-Accel-Buffering: no``.
    """
    if not query:
        return StreamingHttpResponse((), content_type="text/plain")

    async def snapshot_ids():
        seen = set()
        ids = []
        last_sent = 0
        last_sent_at = time.monotonic()
        stream_max_delay = 0.05
        stream_padding = " " * 4096
        cache.set(cache_key, {"ids": [], "done": False}, SEARCH_RESULT_CACHE_TTL)
        # Bounded so a slow client applies back-pressure to the search thread.
        queue = Queue(maxsize=8)
        stop_event = threading.Event()

        def emit(item):
            # Retry with a short timeout so the producer notices stop_event
            # instead of blocking forever on a full queue nobody is reading.
            while not stop_event.is_set():
                try:
                    queue.put(item, timeout=0.1)
                    return
                except Full:
                    continue

        def publish_count(done=False):
            nonlocal last_sent, last_sent_at
            cache.set(cache_key, {"ids": list(ids), "done": done}, SEARCH_RESULT_CACHE_TTL)
            last_sent = len(ids)
            last_sent_at = time.monotonic()
            emit(f"{last_sent}{stream_padding}\n")

        def run_search():
            iterator = None
            try:
                iterator = iter_search_result_ids(query, base_queryset, search_mode=search_mode, config=config)
                for snapshot_id in iterator:
                    if stop_event.is_set():
                        break
                    snapshot_id = normalize_search_result_id(snapshot_id)
                    if not snapshot_id or snapshot_id in seen:
                        continue
                    seen.add(snapshot_id)
                    ids.append(snapshot_id)
                    # Publish the first hit immediately, then at most once per
                    # stream_max_delay seconds.
                    if len(ids) == 1 or time.monotonic() - last_sent_at >= stream_max_delay:
                        publish_count()
                if not stop_event.is_set() and len(ids) != last_sent:
                    publish_count(done=True)
            except BaseException as err:
                # Hand the failure to the consumer, which re-raises it.
                emit(err)
            finally:
                if iterator is not None:
                    try:
                        iterator.close()
                    except AttributeError:
                        pass
                cache.set(cache_key, {"ids": list(ids), "done": True}, SEARCH_RESULT_CACHE_TTL)
                # None is the end-of-stream sentinel for the consumer loop.
                emit(None)

        threading.Thread(target=run_search, name=thread_name, daemon=True).start()
        yield f"0{stream_padding}\n"
        try:
            while True:
                item = await asyncio.to_thread(queue.get)
                if item is None:
                    break
                if isinstance(item, BaseException):
                    raise item
                yield item
        finally:
            stop_event.set()
            # If this generator is cancelled or closed while the to_thread()
            # worker is blocked in queue.get(), that worker cannot be
            # interrupted, and once stop_event is set the producer no longer
            # puts anything (not even the sentinel). Wake the worker ourselves
            # so it does not occupy an executor thread forever. A full queue
            # means no getter can be blocked, so Full is safe to ignore.
            try:
                queue.put_nowait(None)
            except Full:
                pass

    response = StreamingHttpResponse(snapshot_ids(), content_type="text/plain")
    response["X-Accel-Buffering"] = "no"
    return response
def public_snapshot_search_stream_view(request):
    """Stream search progress for the public Snapshot index and cache the matching IDs.

    Returns a 403 response when the visitor is not authenticated and the
    config's ``PUBLIC_INDEX`` is falsy. Otherwise the work is delegated to
    ``snapshot_search_stream_response``, restricted to the queryset returned
    by ``public_snapshots_queryset``.
    """
    from archivebox.config.common import get_request_config
    from archivebox.core.models import Snapshot
    from archivebox.core.permissions import public_snapshots_queryset

    # Prefer the config already attached to the request, if there is one.
    config = getattr(request, "archivebox_config", None)
    if not config:
        config = get_request_config(request, resolve_plugins=False)

    if not (request.user.is_authenticated or config.PUBLIC_INDEX):
        return HttpResponseForbidden("Public index is disabled")

    params = request.GET
    search_text = (params.get("q") or "").strip()
    mode = get_search_mode(params.get("search_mode"), config=config)
    # The cache is keyed on the list-view URL the results are meant for, which
    # the caller may pass explicitly as ?search_url=.
    cache_url = params.get("search_url") or request.get_full_path()
    visible_snapshots = public_snapshots_queryset(Snapshot.objects.all())
    result_cache_key = get_public_search_cache_key(request, cache_url)

    return snapshot_search_stream_response(
        search_text,
        visible_snapshots,
        search_mode=mode,
        config=config,
        cache_key=result_cache_key,
        thread_name="public-snapshot-search-stream",
    )
def _perf_trace(label):
    """Build a decorator that reports the wall-clock duration of each call.

    Timing is active only while the ``ARCHIVEBOX_PERF_TRACE`` environment
    variable equals ``"1"``; the variable is read on every call. When active,
    one ``PERF_TRACE label=<label> ms=<elapsed>`` line is printed to stderr
    after the wrapped callable returns or raises. Coroutine functions get an
    async wrapper, everything else a plain one.
    """

    def tracing_enabled():
        return os.environ.get("ARCHIVEBOX_PERF_TRACE") == "1"

    def report(began):
        duration_ms = (time.perf_counter() - began) * 1000
        print(f"PERF_TRACE label={label} ms={duration_ms:.3f}", file=sys.stderr, flush=True)

    def decorator(func):
        if not inspect.iscoroutinefunction(func):

            @wraps(func)
            def sync_wrapper(*args, **kwargs):
                if not tracing_enabled():
                    return func(*args, **kwargs)
                began = time.perf_counter()
                try:
                    return func(*args, **kwargs)
                finally:
                    report(began)

            return sync_wrapper

        @wraps(func)
        async def async_wrapper(*args, **kwargs):
            if not tracing_enabled():
                return await func(*args, **kwargs)
            began = time.perf_counter()
            try:
                return await func(*args, **kwargs)
            finally:
                report(began)

        return async_wrapper

    return decorator
def _coerce_output_file_size(value: Any) -> int:
    """Convert a reported file size into a non-negative integer.

    Args:
        value: Size as found in output-file metadata (int, float, numeric
            string, ``None``, ...).

    Returns:
        ``int(value)`` clamped to zero or above; ``0`` for falsy values and
        for anything that cannot be converted.
    """
    try:
        return max(int(value or 0), 0)
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers infinite floats (e.g. a JSON ``Infinity``);
        # ValueError covers NaN and non-numeric strings.
        return 0
def _has_structured_output_metadata(output_files: dict[str, dict]) -> bool:
    """Report whether any output file carries extension, mimetype or size metadata.

    Args:
        output_files: Mapping of relative output path to its metadata dict.

    Returns:
        ``True`` if at least one metadata dict contains an ``"extension"``,
        ``"mimetype"`` or ``"size"`` key. Entries whose metadata is not a dict
        are ignored.
    """
    structured_keys = ("extension", "mimetype", "size")
    for metadata in output_files.values():
        # Without this guard a None entry raises TypeError and a string entry
        # is matched by substring rather than by key.
        if not isinstance(metadata, dict):
            continue
        if any(key in metadata for key in structured_keys):
            return True
    return False
def _extract_snapshot_title(snapshot_output_dir: str, plugin: str, output_str: str, *, snapshot_url: str) -> str:
    """Return the title produced by the ``title`` plugin, or ``""`` if there is none.

    ``<snapshot_output_dir>/title/title.txt`` is preferred; the plugin's
    ``output_str`` is the fallback. Both candidates are passed through
    ``_normalize_snapshot_title``.

    Args:
        snapshot_output_dir: Snapshot output directory containing the
            ``title`` plugin directory.
        plugin: Name of the plugin that produced the result; anything other
            than ``"title"`` yields ``""``.
        output_str: The plugin's reported output string.
        snapshot_url: Snapshot URL, forwarded to the normalizer.

    Returns:
        The normalized title, or ``""`` when no usable title is available.
    """
    if plugin != "title":
        return ""

    title_file = Path(snapshot_output_dir) / "title" / "title.txt"
    try:
        # Read directly instead of checking exists() first: a missing file is
        # just another OSError, and the file cannot vanish between two calls.
        raw_title = title_file.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        # A title file that is not valid UTF-8 (e.g. a truncated write) must
        # not abort the caller; fall back to output_str.
        raw_title = ""

    if raw_title:
        file_title = _normalize_snapshot_title(raw_title, snapshot_url=snapshot_url)
        if file_title:
            return file_title

    return _normalize_snapshot_title(output_str, snapshot_url=snapshot_url)
def _iter_archiveresult_records(stdout: str) -> list[dict]:
    """Collect the ``ArchiveResult`` JSON records found in a hook's stdout.

    Each line is stripped. Lines that do not start with ``{`` or that fail to
    parse as JSON are ignored, and only parsed objects whose ``"type"`` is
    ``"ArchiveResult"`` are kept, in their original order.
    """

    def parse(candidate: str):
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            return None

    stripped_lines = (raw.strip() for raw in stdout.splitlines())
    parsed = (parse(text) for text in stripped_lines if text.startswith("{"))
    return [entry for entry in parsed if entry is not None and entry.get("type") == "ArchiveResult"]
+ """ + from archivebox.core.models import ArchiveResult, Snapshot + from archivebox.machine.models import Process + + with _perf_span("archivebox.ArchiveResultService.on_ArchiveResultEvent.snapshot_lookup"): + snapshot = Snapshot.objects.filter(id=event.snapshot_id).select_related("crawl", "crawl__created_by").first() + if snapshot is None: + return + + with _perf_span("archivebox.ArchiveResultService.on_ArchiveResultEvent.plugin_dir"): + plugin_dir = ( + Path(process_started.output_dir) + if process_started is not None and process_started.output_dir + else Path(snapshot.output_dir) / event.plugin + ) + with _perf_span("archivebox.ArchiveResultService.on_ArchiveResultEvent.resolve_output_metadata"): + output_files, output_size, output_mimetypes = _resolve_output_metadata(event.output_files, plugin_dir) + + process = None + if process_started is not None: + with _perf_span("archivebox.ArchiveResultService.on_ArchiveResultEvent.process_lookup"): + started_at = parse_event_datetime(process_started.start_ts) + if started_at is None: + raise ValueError("ProcessStartedEvent.start_ts is required") + process_query = Process.objects.filter( + pwd=process_started.output_dir, + cmd=[process_started.hook_path, *process_started.hook_args], + started_at=started_at, + ) + if process_started.pid: + process_query = process_query.filter(pid=process_started.pid) + process = process_query.order_by("-modified_at").first() + + with _perf_span("archivebox.ArchiveResultService.on_ArchiveResultEvent.prepare_defaults"): + start_ts = parse_event_datetime(event.start_ts) + end_ts = parse_event_datetime(event.end_ts) or timezone.now() + defaults = { + "status": _normalize_status(event.status), + "output_str": event.output_str, + "output_json": event.output_json, + "output_files": output_files, + "output_size": output_size, + "output_mimetypes": output_mimetypes, + "start_ts": start_ts or timezone.now(), + "end_ts": end_ts, + } + if process is not None: + defaults["process_id"] = process.id + 
if event.error: + defaults["notes"] = event.error + + with _perf_span("archivebox.ArchiveResultService.on_ArchiveResultEvent.result_lookup"): + result = ArchiveResult.objects.filter( + snapshot=snapshot, + plugin=event.plugin, + hook_name=event.hook_name, + ).first() + if result is None: + try: + with _perf_span("archivebox.ArchiveResultService.on_ArchiveResultEvent.result_create"): + result = ArchiveResult.objects.create( + snapshot=snapshot, + plugin=event.plugin, + hook_name=event.hook_name, + **defaults, + ) + except IntegrityError: + with _perf_span("archivebox.ArchiveResultService.on_ArchiveResultEvent.result_get_after_integrity"): + result = ArchiveResult.objects.get( + snapshot=snapshot, + plugin=event.plugin, + hook_name=event.hook_name, + ) + + with _perf_span("archivebox.ArchiveResultService.on_ArchiveResultEvent.diff_fields"): + update_fields = [] + for field, value in defaults.items(): + if result.__dict__[field] != value: + setattr(result, field, value) + update_fields.append(field) + if update_fields: + with _perf_span("archivebox.ArchiveResultService.on_ArchiveResultEvent.result_update"): + result.save(update_fields=[*update_fields, "modified_at"]) + + if result.status == ArchiveResult.StatusChoices.QUEUED: + # ArchiveResult has no retry_at column. If a shutdown/takeover projects + # a killed hook back to QUEUED, wake the parent Snapshot/Crawl so the + # next runner retries that exact hook instead of waiting on a stale + # active-state lease. 
class ArchiveResultService(BaseService):
    """Persist ArchiveResult events and derive them from completed hook processes.

    Two bus handlers are registered:

    * ``on_ArchiveResultEvent__save_to_db`` writes an ``ArchiveResultEvent``
      to the database via ``_save_archiveresult_event_to_db``.
    * ``on_ProcessCompletedEvent__save_to_db`` turns a completed
      ``on_Snapshot*`` hook process into an ``ArchiveResultEvent``, either
      from the record the hook printed on stdout or from its exit code.
    """

    LISTENS_TO = [ArchiveResultEvent, ProcessCompletedEvent]
    # NOTE(review): empty although on_ProcessCompletedEvent__save_to_db emits
    # ArchiveResultEvent - confirm whether BaseService expects it listed here.
    EMITS = []

    def __init__(self, bus):
        # event_ids of ProcessCompletedEvents already handled, so a repeated
        # delivery is ignored. NOTE(review): entries are never removed.
        self._completed_process_event_ids: set[str] = set()
        # One asyncio.Lock per (snapshot_id, plugin, hook_name), so saves for
        # the same result row do not interleave. NOTE(review): never pruned.
        self._save_locks: dict[tuple[str, str, str], asyncio.Lock] = {}
        super().__init__(bus)
        self.bus.on(ArchiveResultEvent, self.on_ArchiveResultEvent__save_to_db)
        self.bus.on(ProcessCompletedEvent, self.on_ProcessCompletedEvent__save_to_db)

    @_perf_trace("archivebox.ArchiveResultService.on_ArchiveResultEvent__save_to_db")
    async def on_ArchiveResultEvent__save_to_db(self, event: ArchiveResultEvent) -> None:
        """Save one ArchiveResultEvent to the database.

        Looks up the past ``ProcessStartedEvent`` that ``event`` is a child of
        (may be ``None``) and passes both to ``_save_archiveresult_event_to_db``,
        run through ``sync_to_async(..., thread_sensitive=True)`` while holding
        the lock for this (snapshot, plugin, hook) key.
        """
        with _perf_span("archivebox.ArchiveResultService.on_ArchiveResultEvent.find_process_started"):
            # past=True / future=False: search already-seen events only.
            process_started = await self.bus.find(
                ProcessStartedEvent,
                past=True,
                future=False,
                where=lambda candidate: self.bus.event_is_child_of(event, candidate),
            )

        key = (str(event.snapshot_id), event.plugin, event.hook_name)
        # setdefault builds a fresh Lock on every call but keeps the first one
        # stored for the key.
        lock = self._save_locks.setdefault(key, asyncio.Lock())
        async with lock:
            await sync_to_async(_save_archiveresult_event_to_db, thread_sensitive=True)(event, process_started)

    @_perf_trace("archivebox.ArchiveResultService.on_ProcessCompletedEvent__save_to_db")
    async def on_ProcessCompletedEvent__save_to_db(self, event: ProcessCompletedEvent) -> None:
        """Emit an ArchiveResultEvent for a completed ``on_Snapshot*`` hook process.

        Events already seen (by ``event_id``), hooks whose name does not start
        with ``on_Snapshot``, and processes with no parent ``SnapshotEvent``
        are ignored. If the hook printed an ``ArchiveResult`` record on stdout
        the event is built from it; otherwise the status is derived from the
        exit code and the presence of content files.

        Raises:
            RuntimeError: If the hook printed more than one ArchiveResult record.
        """
        if event.event_id in self._completed_process_event_ids:
            return
        self._completed_process_event_ids.add(event.event_id)

        if not event.hook_name.startswith("on_Snapshot"):
            return
        with _perf_span("archivebox.ArchiveResultService.on_ProcessCompletedEvent.find_snapshot_event"):
            snapshot_event = await self.bus.find(
                SnapshotEvent,
                past=True,
                future=False,
                where=lambda candidate: self.bus.event_is_child_of(event, candidate),
            )
        if snapshot_event is None:
            return

        with _perf_span("archivebox.ArchiveResultService.on_ProcessCompletedEvent.parse_stdout_records"):
            records = _iter_archiveresult_records(event.stdout)
        if records:
            if len(records) > 1:
                raise RuntimeError(
                    f"Hook {event.plugin_name}:{event.hook_name} emitted {len(records)} ArchiveResult records; expected exactly one",
                )
            for record in records:
                record_status = _normalize_status(record.get("status") or "")
                # NOTE(review): _normalize_status maps an empty status to
                # "failed", so `not record_status` looks unreachable and a
                # record with no status is treated as failed whatever the exit
                # code - confirm this is intended.
                record_failed = record_status == "failed" or (not record_status and event.exit_code not in (0, PROCESS_EXIT_SKIPPED))
                with _perf_span("archivebox.ArchiveResultService.on_ProcessCompletedEvent.emit_archive_result_record"):
                    # Fields missing from the record fall back to values from
                    # the process/snapshot events; stderr becomes the error
                    # text only for failed results.
                    await event.emit(
                        ArchiveResultEvent(
                            snapshot_id=record.get("snapshot_id") or snapshot_event.snapshot_id,
                            plugin=record.get("plugin") or event.plugin_name,
                            hook_name=record.get("hook_name") or event.hook_name,
                            status=record_status,
                            output_str=record.get("output_str") or "",
                            output_json=record.get("output_json") if isinstance(record.get("output_json"), dict) else None,
                            output_files=event.output_files,
                            start_ts=event.start_ts,
                            end_ts=event.end_ts,
                            error=record.get("error") or (event.stderr if record_failed else ""),
                        ),
                    ).now()
            return

        # TODO: consider moving this fallback derivation into abx-dl itself.
        # First try both patterns: if the whole abx-dl process crashes, restarting
        # the snapshot may be enough, but don't guess before validating it.
        process_interrupted = _is_signal_interrupted_exit(event.exit_code)
        process_failed = event.exit_code not in (0, PROCESS_EXIT_SKIPPED) and not process_interrupted
        with _perf_span("archivebox.ArchiveResultService.on_ProcessCompletedEvent.emit_archive_result_fallback"):
            # No record on stdout: interrupted -> "queued"; other non-zero,
            # non-skipped exit -> "failed"; otherwise "succeeded" when content
            # files exist, else "noresult".
            await event.emit(
                ArchiveResultEvent(
                    snapshot_id=snapshot_event.snapshot_id,
                    plugin=event.plugin_name,
                    hook_name=event.hook_name,
                    status=(
                        "queued"
                        if process_interrupted
                        else "failed"
                        if process_failed
                        else ("succeeded" if _has_content_files(event.output_files) else "noresult")
                    ),
                    output_str=event.stderr if process_failed else "",
                    output_files=event.output_files,
                    start_ts=event.start_ts,
                    end_ts=event.end_ts,
                    error=event.stderr if process_failed else "",
                ),
            ).now()
persisted_overrides = _persisted_overrides_for_request(request) + native_overrides = request.overrides or {} + requested_provider_names = _provider_names(request.binproviders) + await sync_to_async(get_config, thread_sensitive=True)() + + installed_qs = ( + Binary.objects.filter(machine=machine, name=binary_name, status=Binary.StatusChoices.INSTALLED) + .exclude(abspath="") + .exclude(abspath__isnull=True) + .order_by("-modified_at") + ) + async for installed in installed_qs: + installed_path = Path(installed.abspath).expanduser().resolve(strict=False) + if not await sync_to_async(installed_path.exists, thread_sensitive=True)(): + await _mark_binary_queued(installed) + continue + if persisted_overrides and installed.overrides != persisted_overrides: + await _mark_binary_queued(installed) + continue + + provider_name = (installed.binprovider or installed.binproviders.split(",", 1)[0]).strip() + if provider_name and provider_name not in requested_provider_names: + await _mark_binary_queued(installed) + continue + + provider = _provider_for_name(provider_name, installed.name, native_overrides) + if await sync_to_async(_cached_provider_path_is_stale, thread_sensitive=True)(installed_path, provider, installed.name): + await _mark_binary_queued(installed) + continue + + binary_env = BinProvider.build_exec_env(providers=[provider], base_env={}) if provider is not None else {} + provider_names = _provider_names(installed.binproviders or request.binproviders or "env") + return AbxBinary.model_validate( + { + "name": request.name, + "description": request.description, + "binproviders": _providers_for_names(provider_names), + "overrides": native_overrides, + "loaded_binprovider": provider, + "loaded_abspath": installed.abspath, + "loaded_version": installed.version or None, + "loaded_sha256": installed.sha256 or None, + "env": binary_env, + }, + ) + + existing = await Binary.objects.filter(machine=machine, name=binary_name).order_by("-modified_at").afirst() + if existing is 
None: + await Binary.objects.acreate( + machine=machine, + name=binary_name, + binproviders=_binproviders_to_str(request.binproviders), + overrides=persisted_overrides, + status=Binary.StatusChoices.QUEUED, + ) + else: + changed = False + requested_binproviders = _binproviders_to_str(request.binproviders) + if requested_binproviders and existing.binproviders != requested_binproviders: + existing.binproviders = requested_binproviders + changed = True + if persisted_overrides and existing.overrides != persisted_overrides: + existing.overrides = persisted_overrides + changed = True + if existing.status != Binary.StatusChoices.QUEUED: + existing.status = Binary.StatusChoices.QUEUED + existing.retry_at = None + changed = True + if changed: + await existing.asave(update_fields=["binproviders", "overrides", "status", "retry_at", "modified_at"]) + return None + + async def set(self, request: BinaryRequestEvent | None, binary: AbxBinary) -> None: + from archivebox.machine.models import Binary, Machine, _canonical_binary_name + + machine = await sync_to_async(Machine.current, thread_sensitive=True)() + binary_name = _canonical_binary_name(binary.name) + if not binary_name: + return + request_context = request.extra_context if request is not None else {} + binary_id = str(request_context.get("binary_id") or "") + if binary_id: + existing = await Binary.objects.filter(id=binary_id).afirst() + else: + existing = None + if existing is None: + existing, _created = await Binary.objects.aget_or_create( + machine=machine, + name=binary_name, + defaults={"status": Binary.StatusChoices.QUEUED}, + ) + + existing.abspath = str(binary.loaded_abspath or "") + if binary.loaded_version: + existing.version = str(binary.loaded_version) + if binary.loaded_sha256: + existing.sha256 = str(binary.loaded_sha256) + existing.binproviders = _binproviders_to_str( + request.binproviders if request is not None else [provider.name for provider in binary.binproviders], + ) + if binary.loaded_binprovider 
is not None: + existing.binprovider = binary.loaded_binprovider.name + existing.overrides = _persisted_overrides_for_request(request) if request is not None else binary.overrides + existing.status = Binary.StatusChoices.INSTALLED + existing.retry_at = None + await existing.asave( + update_fields=["abspath", "version", "sha256", "binproviders", "binprovider", "overrides", "status", "retry_at", "modified_at"], + ) + + async def invalidate(self, request: BinaryRequestEvent, binary: AbxBinary, reason: str) -> None: + from archivebox.machine.models import Binary, Machine, _canonical_binary_name + + machine = await sync_to_async(Machine.current, thread_sensitive=True)() + binary_name = _canonical_binary_name(request.name) + if not binary_name: + return + installed = ( + await Binary.objects.filter(machine=machine, name=binary_name, status=Binary.StatusChoices.INSTALLED) + .exclude(abspath="") + .exclude(abspath__isnull=True) + .order_by("-modified_at") + .afirst() + ) + if installed is None: + return + installed.status = Binary.StatusChoices.QUEUED + installed.retry_at = None + await installed.asave(update_fields=["status", "retry_at", "modified_at"]) + + +class ArchiveBoxBinaryService(BaseService): + """Preserve ArchiveBox's legacy Binary Process rows around abxpkg requests.""" + + LISTENS_TO = [BinaryRequestEvent, BinaryEvent] + EMITS: list[type[BaseEvent]] = [] + + def __init__(self, bus: EventBus): + super().__init__(bus) + self.process_ids_by_request_id: dict[str, str] = {} + self._missing_finalize_tasks: set[asyncio.Task] = set() + self.bus.on(BinaryRequestEvent, self.on_BinaryRequestEvent__project_process) + self.bus.on(BinaryRequestEvent, self.on_BinaryRequestEvent__schedule_missing_finalize) + self.bus.on(BinaryEvent, self.on_BinaryEvent__finalize_process) + + async def on_BinaryRequestEvent__project_process(self, request: BinaryRequestEvent) -> None: + from archivebox.machine.models import Machine, Process, _canonical_binary_name + + machine = await 
sync_to_async(Machine.current, thread_sensitive=True)() + binary_name = _canonical_binary_name(request.name) + if not binary_name: + return + binary = await self._get_or_create_binary(machine, binary_name, request) + started_at = timezone.now() + output_dir = self._process_output_dir(binary, request) + await sync_to_async(output_dir.mkdir, thread_sensitive=True)(parents=True, exist_ok=True) + process = await Process.objects.acreate( + machine=machine, + iface=None, + process_type=Process.TypeChoices.BINARY, + worker_type="", + pwd=str(output_dir), + cmd=self._process_cmd(request), + env={}, + timeout=int(request.event_timeout or request.install_timeout or 600), + pid=None, + url=None, + started_at=started_at, + ended_at=None, + stdout="", + stderr="", + exit_code=None, + status=Process.StatusChoices.RUNNING, + retry_at=None, + binary=binary, + ) + self.process_ids_by_request_id[request.event_id] = str(process.id) + + async def on_BinaryEvent__finalize_process(self, event: BinaryEvent) -> None: + from archivebox.machine.models import Binary, Process, _canonical_binary_name + + request = await self.bus.find( + BinaryRequestEvent, + past=True, + future=False, + where=lambda candidate: self.bus.event_is_child_of(event, candidate), + ) + request = request if isinstance(request, BinaryRequestEvent) else None + process_id = self.process_ids_by_request_id.pop(request.event_id, "") if request is not None else "" + if not process_id: + return + process = await Process.objects.filter(id=process_id).select_related("binary").afirst() + if process is None: + return + binary_name = _canonical_binary_name(event.name) + binary = process.binary + if binary is not None and binary_name: + binary.abspath = event.abspath + if event.version: + binary.version = str(event.version) + if event.sha256: + binary.sha256 = str(event.sha256) + binary.binproviders = event.binproviders or binary.binproviders + binary.binprovider = event.binprovider or binary.binprovider + binary.status = 
Binary.StatusChoices.INSTALLED + binary.retry_at = None + await binary.asave( + update_fields=["abspath", "version", "sha256", "binproviders", "binprovider", "status", "retry_at", "modified_at"], + ) + process.ended_at = timezone.now() + process.stdout = json.dumps(self._binary_event_json(event, binary)) + "\n" + process.stderr = "" + process.exit_code = 0 + process.status = Process.StatusChoices.EXITED + await process.asave(update_fields=["ended_at", "stdout", "stderr", "exit_code", "status", "modified_at"]) + if binary is not None: + await sync_to_async(self._write_binary_index, thread_sensitive=True)(binary, process, Path(process.pwd)) + + async def _get_or_create_binary(self, machine, binary_name: str, request: BinaryRequestEvent): + from archivebox.machine.models import Binary + + binary_id = str(request.extra_context.get("binary_id") or "") + if binary_id: + binary = await Binary.objects.filter(id=binary_id).afirst() + if binary is not None: + return binary + binary = await Binary.objects.filter(machine=machine, name=binary_name).order_by("-modified_at").afirst() + if binary is not None: + return binary + return await Binary.objects.acreate( + machine=machine, + name=binary_name, + binproviders=_binproviders_to_str(request.binproviders), + overrides=_persisted_overrides_for_request(request), + status=Binary.StatusChoices.QUEUED, + ) + + def _process_cmd(self, request: BinaryRequestEvent) -> list[str]: + cmd = [ + "abxpkg", + "install", + f"--name={request.name}", + f"--binproviders={_binproviders_to_str(request.binproviders)}", + ] + if request.overrides: + cmd.append(f"--overrides={json.dumps(request.overrides, sort_keys=True)}") + return cmd + + def _binary_event_json(self, event: BinaryEvent, binary) -> dict[str, Any]: + if binary is not None: + data = binary.to_json() + else: + data = {"type": "Binary", "name": event.name} + data.update( + { + "type": "Binary", + "name": event.name, + "binproviders": event.binproviders, + "binprovider": event.binprovider, 
+ "abspath": event.abspath, + "version": str(event.version or ""), + "sha256": event.sha256 or "", + "status": "installed", + }, + ) + return data + + async def _finalize_missing_process(self, request: BinaryRequestEvent) -> None: + from archivebox.machine.models import Process + + process_id = self.process_ids_by_request_id.pop(request.event_id, "") + if not process_id: + return + process = await Process.objects.filter(id=process_id).afirst() + if process is None or process.status == Process.StatusChoices.EXITED: + return + process.ended_at = timezone.now() + process.stderr = f"Binary request did not resolve: {request.name}" + process.exit_code = 1 + process.status = Process.StatusChoices.EXITED + await process.asave(update_fields=["ended_at", "stderr", "exit_code", "status", "modified_at"]) + + async def _finalize_request_when_done(self, request: BinaryRequestEvent) -> None: + try: + await request.wait(timeout=request.event_timeout) + except TimeoutError: + await self._finalize_missing_process(request) + return + binary_event = await self.bus.find( + BinaryEvent, + child_of=request, + past=True, + future=False, + name=request.name, + where=lambda candidate: bool(candidate.abspath), + ) + if not isinstance(binary_event, BinaryEvent): + await self._finalize_missing_process(request) + + def _schedule_missing_finalize(self, request: BinaryRequestEvent) -> None: + task = asyncio.create_task(self._finalize_request_when_done(request)) + self._missing_finalize_tasks.add(task) + task.add_done_callback(lambda done: self._missing_finalize_tasks.discard(done) or (None if done.cancelled() else done.exception())) + + async def flush_missing_finalizers(self) -> None: + if self._missing_finalize_tasks: + await asyncio.gather(*tuple(self._missing_finalize_tasks), return_exceptions=False) + + async def on_BinaryRequestEvent__schedule_missing_finalize(self, request: BinaryRequestEvent) -> None: + self._schedule_missing_finalize(request) + + def _process_output_dir(self, binary, 
request: BinaryRequestEvent) -> Path: + raw_output_dir = str(request.extra_context.get("output_dir") or "").strip() + if raw_output_dir: + output_dir = Path(raw_output_dir).expanduser() + if output_dir.name == str(binary.id): + return output_dir.parent + return output_dir + return binary.output_dir.parent + + def _write_binary_index(self, binary, process, output_dir: Path) -> None: + output_dir.mkdir(parents=True, exist_ok=True) + index_path = output_dir / "index.jsonl" + with index_path.open("w", encoding="utf-8") as f: + f.write(json.dumps(binary.to_json()) + "\n") + f.write(json.dumps(process.to_json()) + "\n") + + +def _provider_names(binproviders: str | list[str] | None) -> list[str]: + if isinstance(binproviders, str): + raw_names = [part.strip() for part in binproviders.split(",")] + elif binproviders: + raw_names = [str(part).strip() for part in binproviders] + else: + raw_names = ["env"] + names: list[str] = [] + for name in raw_names: + if name and name not in names: + names.append(name) + return names or ["env"] + + +def _binproviders_to_str(binproviders: str | list[str] | None) -> str: + return ",".join(_provider_names(binproviders)) + + +def _providers_for_names(names: list[str]) -> list[BinProvider]: + providers: list[BinProvider] = [] + for name in names: + provider_class = PROVIDER_CLASS_BY_NAME.get(name) + if provider_class is not None: + providers.append(provider_class()) + return providers + + +def _provider_for_name(provider_name: str, binary_name: str, overrides: dict[str, Any] | None) -> BinProvider | None: + provider_class = PROVIDER_CLASS_BY_NAME.get(provider_name) + if provider_class is None: + return None + provider = provider_class() + provider_overrides = overrides.get(provider_name) if isinstance(overrides, dict) else None + if isinstance(provider_overrides, dict): + provider = provider.get_provider_with_overrides( + overrides={binary_name: provider_overrides}, + ) + return provider + + +async def _mark_binary_queued(binary) -> None: + 
from archivebox.machine.models import Binary + + if binary.status == Binary.StatusChoices.QUEUED: + return + binary.status = Binary.StatusChoices.QUEUED + binary.retry_at = None + await binary.asave(update_fields=["status", "retry_at", "modified_at"]) + + +def _cached_provider_path_is_stale(installed_path: Path, provider: BinProvider | None, binary_name: str) -> bool: + if provider is None: + return False + current_abspath = provider.get_abspath(binary_name, quiet=True, no_cache=True) + if not current_abspath: + return True + return Path(current_abspath).expanduser().resolve(strict=False) != installed_path + + +def _persisted_overrides_for_request(request: BinaryRequestEvent | None) -> dict[str, Any]: + if request is None: + return {} + raw_overrides = request.extra_context.get("raw_overrides") + if isinstance(raw_overrides, Mapping): + return dict(raw_overrides) + return dict(request.overrides or {}) diff --git a/archivebox/services/crawl_service.py b/archivebox/services/crawl_service.py new file mode 100644 index 0000000000..0ef951aceb --- /dev/null +++ b/archivebox/services/crawl_service.py @@ -0,0 +1,105 @@ +from __future__ import annotations + +from datetime import timedelta + +from django.utils import timezone + +from abx_dl.events import CrawlCleanupEvent, CrawlCompletedEvent, CrawlSetupEvent, CrawlStartEvent +from abx_dl.services.base import BaseService +from archivebox.workers.models import ACTIVE_STATE_LEASE_SECONDS + + +class CrawlService(BaseService): + LISTENS_TO = [CrawlSetupEvent, CrawlStartEvent, CrawlCleanupEvent, CrawlCompletedEvent] + EMITS = [] + + def __init__(self, bus, *, crawl_id: str): + self.crawl_id = crawl_id + super().__init__(bus) + self.bus.on(CrawlSetupEvent, self.on_CrawlSetupEvent__save_to_db) + self.bus.on(CrawlStartEvent, self.on_CrawlStartEvent__save_to_db) + self.bus.on(CrawlCleanupEvent, self.on_CrawlCleanupEvent__save_to_db) + self.bus.on(CrawlCompletedEvent, self.on_CrawlCompletedEvent__save_to_db) + + async def 
on_CrawlSetupEvent__save_to_db(self, event: CrawlSetupEvent) -> None: + from archivebox.crawls.models import Crawl + + await ( + Crawl.objects.filter(id=self.crawl_id) + .exclude( + status__in=Crawl.INACTIVE_STATES, + ) + .aupdate( + status=Crawl.StatusChoices.STARTED, + retry_at=timezone.now() + timedelta(seconds=ACTIVE_STATE_LEASE_SECONDS), + modified_at=timezone.now(), + ) + ) + + async def on_CrawlStartEvent__save_to_db(self, event: CrawlStartEvent) -> None: + from archivebox.crawls.models import Crawl + + await ( + Crawl.objects.filter(id=self.crawl_id) + .exclude( + status__in=Crawl.INACTIVE_STATES, + ) + .aupdate( + status=Crawl.StatusChoices.STARTED, + retry_at=timezone.now() + timedelta(seconds=ACTIVE_STATE_LEASE_SECONDS), + modified_at=timezone.now(), + ) + ) + + async def on_CrawlCleanupEvent__save_to_db(self, event: CrawlCleanupEvent) -> None: + from archivebox.crawls.models import Crawl + + # Cleanup is still inside the active crawl lifecycle. Snapshot hooks may + # have just written discovery output that the runner consumes before the + # completion phase, so only CrawlCompleted/finalize_run_state makes the + # final sealed-vs-requeue decision. 
+ await ( + Crawl.objects.filter(id=self.crawl_id) + .exclude( + status__in=Crawl.INACTIVE_STATES, + ) + .aupdate( + status=Crawl.StatusChoices.STARTED, + retry_at=timezone.now(), + modified_at=timezone.now(), + ) + ) + + async def on_CrawlCompletedEvent__save_to_db(self, event: CrawlCompletedEvent) -> None: + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + + crawl = await Crawl.objects.aget(id=self.crawl_id) + if crawl.is_paused or crawl.status == Crawl.StatusChoices.SEALED: + return + is_finished = not await crawl.snapshot_set.filter(status__in=Snapshot.OPEN_STATES).aexists() + if not is_finished: + await ( + Crawl.objects.filter(id=self.crawl_id) + .exclude( + status__in=Crawl.INACTIVE_STATES, + ) + .aupdate( + status=Crawl.StatusChoices.STARTED, + retry_at=timezone.now(), + modified_at=timezone.now(), + ) + ) + return + + await ( + Crawl.objects.filter(id=self.crawl_id) + .exclude( + status__in=Crawl.INACTIVE_STATES, + ) + .aupdate( + status=Crawl.StatusChoices.SEALED, + retry_at=None, + modified_at=timezone.now(), + ) + ) diff --git a/archivebox/services/machine_service.py b/archivebox/services/machine_service.py new file mode 100644 index 0000000000..602cd788a0 --- /dev/null +++ b/archivebox/services/machine_service.py @@ -0,0 +1,67 @@ +from __future__ import annotations + +from typing import Any + +from asgiref.sync import sync_to_async + +from abx_dl.events import MachineEvent +from abx_dl.services.base import BaseService + + +def _is_binary_event_key(key: str) -> bool: + """``MachineEvent`` projector only ever writes binary-related state. + + ``Machine.config`` mirrors ``ArchiveBox.conf`` so arbitrary user keys can + legitimately live there — but they get there through the file ↔ DB sync, + not through events.
Letting events write arbitrary keys would let an + untrusted plugin overwrite security-sensitive user config (the file ↔ DB + mirror is a security boundary), so the projector strips anything that + isn't a binary path or the binary install cache. + """ + if key.startswith("ABX_") and key.endswith("CACHE"): + return True + return key.endswith("_BINARY") + + +def _strip_to_binary_keys(config: dict[str, Any] | None) -> dict[str, Any]: + if not isinstance(config, dict): + return {} + return {key: value for key, value in config.items() if _is_binary_event_key(str(key))} + + +class MachineService(BaseService): + LISTENS_TO = [MachineEvent] + EMITS = [] + + def __init__(self, bus): + super().__init__(bus) + self.bus.on(MachineEvent, self.on_MachineEvent__save_to_db) + + async def on_MachineEvent__save_to_db(self, event: MachineEvent) -> None: + from archivebox.machine.models import Machine + + if event.config_type != "derived": + return + + machine = await sync_to_async(Machine.current, thread_sensitive=True)() + old_config = dict(machine.config or {}) + config = dict(old_config) + + if event.config is not None: + binary_only = _strip_to_binary_keys(event.config) + config.update(binary_only) + elif event.method == "update": + key = event.key.replace("config/", "", 1).strip() + if key and _is_binary_event_key(key): + config[key] = event.value + elif event.method == "unset": + key = event.key.replace("config/", "", 1).strip() + if key and _is_binary_event_key(key): + config.pop(key, None) + else: + return + + if config == old_config: + return + machine.config = config + await machine.asave(update_fields=["config", "modified_at"]) diff --git a/archivebox/services/process_service.py b/archivebox/services/process_service.py new file mode 100644 index 0000000000..3cd9e26ca3 --- /dev/null +++ b/archivebox/services/process_service.py @@ -0,0 +1,215 @@ +from __future__ import annotations + +import asyncio +from datetime import datetime +from typing import ClassVar + +from
asgiref.sync import sync_to_async +from django.utils import timezone + +from abxbus import BaseEvent +from abx_dl.events import CrawlCleanupEvent, CrawlCompletedEvent, ProcessCompletedEvent, ProcessStartedEvent +from abx_dl.services.base import BaseService + + +def parse_event_datetime(value: str | None): + if not value: + return None + try: + dt = datetime.fromisoformat(value) + except ValueError: + return None + if timezone.is_naive(dt): + return timezone.make_aware(dt, timezone.get_current_timezone()) + return dt + + +def current_network_interface_with_machine(): + from archivebox.machine.models import NetworkInterface + + current_iface = NetworkInterface.current() + return NetworkInterface.objects.select_related("machine").get(id=current_iface.id) + + +class ProcessService(BaseService): + LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [ + ProcessStartedEvent, + ProcessCompletedEvent, + CrawlCleanupEvent, + CrawlCompletedEvent, + ] + EMITS: ClassVar[list[type[BaseEvent]]] = [] + + def __init__(self, bus): + self._iface = None + self._completed_queue: asyncio.Queue[ProcessCompletedEvent | None] = asyncio.Queue() + self._completed_worker: asyncio.Task | None = None + super().__init__(bus) + self.bus.on(ProcessStartedEvent, self.on_ProcessStartedEvent__save_to_db) + self.bus.on(ProcessCompletedEvent, self.on_ProcessCompletedEvent__save_to_db) + self.bus.on(CrawlCleanupEvent, self.on_CrawlCleanupEvent__flush_completed) + self.bus.on(CrawlCompletedEvent, self.on_CrawlCompletedEvent__flush_completed) + + async def current_iface(self): + if self._iface is None: + self._iface = await sync_to_async(current_network_interface_with_machine, thread_sensitive=True)() + return self._iface + + async def on_ProcessStartedEvent__save_to_db(self, event: ProcessStartedEvent) -> None: + from archivebox.machine.models import Process + + iface = await self.current_iface() + process_type = event.process_type or Process.TypeChoices.HOOK + worker_type = event.worker_type or "" + 
started_at = parse_event_datetime(event.start_ts) + if started_at is None: + raise ValueError("ProcessStartedEvent.start_ts is required") + if event.pid: + process_query = Process.objects.filter(pid=event.pid, started_at=started_at) + else: + process_query = Process.objects.filter( + process_type=process_type, + worker_type=worker_type, + pwd=event.output_dir, + started_at=started_at, + ) + process = await process_query.order_by("-modified_at").afirst() + if process is None: + process = await Process.objects.acreate( + machine=iface.machine, + iface=iface, + process_type=process_type, + worker_type=worker_type, + pwd=event.output_dir, + cmd=[event.hook_path, *event.hook_args], + env=event.env, + timeout=event.timeout, + pid=event.pid or None, + url=event.url or None, + started_at=started_at, + status=Process.StatusChoices.RUNNING, + retry_at=None, + ) + elif process.iface_id != iface.id or process.machine_id != iface.machine_id: + process.iface = iface + process.machine = iface.machine + await process.asave(update_fields=["iface", "machine", "modified_at"]) + + process.pwd = event.output_dir + process.cmd = [event.hook_path, *event.hook_args] + process.env = event.env + process.timeout = event.timeout + process.pid = event.pid or None + process.url = event.url or process.url + process.process_type = process_type or process.process_type + process.worker_type = worker_type or process.worker_type + process.started_at = started_at + process.status = process.StatusChoices.RUNNING + process.retry_at = None + await sync_to_async(process.hydrate_binary_from_context, thread_sensitive=True)( + plugin_name=event.plugin_name, + hook_path=event.hook_path, + ) + await Process.objects.filter(id=process.id).aupdate( + pwd=process.pwd, + cmd=process.cmd, + env=process.env, + timeout=process.timeout, + pid=process.pid, + url=process.url, + process_type=process.process_type, + worker_type=process.worker_type, + started_at=process.started_at, + status=process.status, + 
retry_at=process.retry_at, + binary_id=process.binary_id, + modified_at=timezone.now(), + ) + + async def _completed_worker_loop(self) -> None: + while True: + event = await self._completed_queue.get() + try: + if event is None: + return + await self._save_completed_process_to_db(event) + finally: + self._completed_queue.task_done() + + def _ensure_completed_worker(self) -> None: + if self._completed_worker is None or self._completed_worker.done(): + self._completed_worker = asyncio.create_task(self._completed_worker_loop()) + + async def on_ProcessCompletedEvent__save_to_db(self, event: ProcessCompletedEvent) -> None: + self._ensure_completed_worker() + await self._completed_queue.put(event) + + async def flush_completed(self) -> None: + await self._completed_queue.join() + + async def on_CrawlCleanupEvent__flush_completed(self, event: CrawlCleanupEvent) -> None: + await self.flush_completed() + + async def on_CrawlCompletedEvent__flush_completed(self, event: CrawlCompletedEvent) -> None: + await self.flush_completed() + + async def _save_completed_process_to_db(self, event: ProcessCompletedEvent) -> None: + from archivebox.machine.models import Process + + iface = await self.current_iface() + process_type = event.process_type or Process.TypeChoices.HOOK + worker_type = event.worker_type or "" + started_at = parse_event_datetime(event.start_ts) + if started_at is None: + raise ValueError("ProcessCompletedEvent.start_ts is required") + if event.pid: + process_query = Process.objects.filter(pid=event.pid, started_at=started_at) + else: + process_query = Process.objects.filter( + process_type=process_type, + worker_type=worker_type, + pwd=event.output_dir, + started_at=started_at, + ) + process = await process_query.order_by("-modified_at").afirst() + if process is None: + await Process.objects.acreate( + machine=iface.machine, + iface=iface, + process_type=process_type, + worker_type=worker_type, + pwd=event.output_dir, + cmd=[event.hook_path, *event.hook_args], + 
env=event.env, + timeout=event.timeout, + pid=event.pid or None, + url=event.url or None, + started_at=started_at, + status=Process.StatusChoices.RUNNING, + retry_at=None, + ) + process = await process_query.order_by("-modified_at").afirst() + if process is None: + return + + missing_cmd = not process.cmd + updates = { + "machine_id": iface.machine_id, + "iface_id": iface.id, + "pwd": event.output_dir, + "pid": event.pid or process.pid, + "url": event.url or process.url, + "process_type": process_type or process.process_type, + "worker_type": worker_type or process.worker_type, + "started_at": started_at, + "ended_at": parse_event_datetime(event.end_ts) or timezone.now(), + "stdout": event.stdout, + "stderr": event.stderr, + "exit_code": event.exit_code, + "status": Process.StatusChoices.EXITED, + "retry_at": None, + "modified_at": timezone.now(), + } + if missing_cmd: + updates["cmd"] = [event.hook_path, *event.hook_args] + await Process.objects.filter(id=process.id).aupdate(**updates) diff --git a/archivebox/services/runner.py b/archivebox/services/runner.py new file mode 100644 index 0000000000..1010582f8e --- /dev/null +++ b/archivebox/services/runner.py @@ -0,0 +1,2339 @@ +from __future__ import annotations + +import asyncio +import contextvars +import os +import signal +import shutil +import sys +import threading +import time +from contextlib import nullcontext +from datetime import timedelta +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import Any + +from asgiref.sync import sync_to_async +from django.utils import timezone +from rich.console import Console +from rich.text import Text + +from abxpkg.binary_service import BinaryCacheService, BinaryRequestEvent, BinaryService +from abx_dl.events import ( + CrawlAbortEvent, + CrawlCleanupEvent, + CrawlCompletedEvent, + CrawlEvent, + CrawlSetupEvent, + CrawlStartEvent, + InstallEvent, + MachineEvent, + ProcessCompletedEvent, + ProcessEvent, + SnapshotCompletedEvent, + 
SnapshotEvent, + slow_warning_timeout, +) +from abx_dl.heartbeat import CrawlHeartbeat +from abx_dl.limits import CrawlLimitState +from abx_dl.models import Plugin, Snapshot as AbxSnapshot, discover_plugins, filter_plugins +from abx_dl.orchestrator import ( + compute_install_phase_timeout, + compute_phase_timeout, + create_bus, + get_install_plugins, + install_plugins as abx_install_plugins, + setup_services as setup_abx_services, +) +from abx_dl.services.process_service import ProcessService as HookProcessService +from abx_dl.services.binary_service import PluginBinariesService, split_abxpkg_binary_request_overrides +from abx_dl.services.snapshot_service import SnapshotService as HookSnapshotService +from abx_dl.cli import LiveBusUI +from abxbus import BaseEvent +from abxbus.event_bus import EventBus, get_current_event, in_handler_context +from abxbus.event_handler import EventHandlerAbortedError, EventHandlerCancelledError + +from archivebox.config.common import ArchiveBoxBaseConfig, normalize_runtime_config +from archivebox.misc.db import run_db_analyze_batch +from archivebox.core.shutdown_util import foreground_shutdown_signals, raise_if_shutdown_requested +from archivebox.search.sonic_daemon import register_sonic_daemon_event_handler +from archivebox.workers.models import ACTIVE_STATE_LEASE_SECONDS + +from .archive_result_service import ArchiveResultService +from .binary_service import ArchiveBoxBinaryService, ArchiveBoxDBBinaryCacheBackend +from .crawl_service import CrawlService +from .machine_service import MachineService +from .process_service import ProcessService as PersistedProcessService +from .snapshot_service import SnapshotService, finalize_completed_snapshot +from .tag_service import TagService + + +QUEUED_PLUGIN_RESULT_BATCH_SIZE = 100 + + +def _bus_name(prefix: str, identifier: str) -> str: + normalized = "".join(ch if ch.isalnum() else "_" for ch in identifier) + return f"{prefix}_{normalized}" + + +def _runner_short_id(identifier) -> str: + 
return str(identifier).replace("-", "")[-8:] + + +def _runner_label(value: str, *, reserve: int) -> str: + width = max(24, shutil.get_terminal_size(fallback=(120, 40)).columns - reserve) + value = " ".join(str(value or "").split()) + if len(value) <= width: + return value + return f"{value[: max(0, width - 3)]}..." + + +def _runner_console_line(*, crawl=None, crawl_id=None, snapshot=None, status: str = "STARTED") -> None: + crawl_id = crawl.id if crawl is not None else crawl_id + line = Text() + line.append(f"[Crawl#{_runner_short_id(crawl_id)}]", style="cyan bold") + line.append(" ") + if snapshot is not None: + line.append(f"[Snapshot#{_runner_short_id(snapshot.id)}]", style="magenta bold") + line.append(" ") + status_styles = { + "STARTED": "green bold", + "SEALED": "blue bold", + "PAUSED": "yellow bold", + } + line.append(f"[{status}]", style=status_styles.get(status, "white bold")) + line.append(" ") + prefix_width = len(line.plain) + if snapshot is not None: + label = snapshot.url + else: + label = (crawl.label or "").strip() + if not label: + label = (crawl.urls or "").partition("\n")[0].strip() or str(crawl_id) + line.append(_runner_label(label, reserve=prefix_width)) + Console(highlight=False).print(line) + + +def _count_selected_hooks(plugins: dict[str, Plugin], selected_plugins: list[str] | None) -> int: + selected = filter_plugins(plugins, selected_plugins) if selected_plugins else plugins + return sum(1 for plugin in selected.values() for hook in plugin.hooks if "CrawlSetup" in hook.name or "Snapshot" in hook.name) + + +def _discover_archivebox_plugins() -> dict[str, Plugin]: + return discover_plugins(runtime="archivebox") + + +def _runner_task_context() -> contextvars.Context: + context = contextvars.copy_context() + context.run(EventBus.current_event_context.set, None) + context.run(EventBus.current_handler_id_context.set, None) + context.run(EventBus.current_eventbus_context.set, None) + return context + + +def _is_external_task_cancelled(error: 
asyncio.CancelledError) -> bool: + return not isinstance(error, (EventHandlerAbortedError, EventHandlerCancelledError)) + + +async def _emit_machine_config( + bus, + *, + config: dict[str, Any], + derived_config: dict[str, Any], + parent_event=None, +) -> None: + user_config = normalize_runtime_config(config) + user_config["ABX_RUNTIME"] = "archivebox" + derived_machine_config = normalize_runtime_config(derived_config) + user_event = MachineEvent( + config=user_config, + config_type="user", + ) + if parent_event is not None: + user_event.event_parent_id = parent_event.event_id + await bus.emit(user_event).now() + if derived_machine_config: + derived_event = MachineEvent( + config=derived_machine_config, + config_type="derived", + ) + if parent_event is not None: + derived_event.event_parent_id = parent_event.event_id + await bus.emit(derived_event).now() + + +async def _run_event_now(event, timeout: float | None = None): + await event.now(timeout=timeout) + await event.wait(timeout=timeout) + await event.event_results_list() + return event + + +def ensure_background_runner(*, allow_under_pytest: bool = False) -> bool: + if os.environ.get("PYTEST_CURRENT_TEST") and not allow_under_pytest: + return False + + from archivebox.machine.models import Machine, Process + from archivebox.workers.supervisord_util import RUNNER_WORKER, get_existing_supervisord_process, get_worker, start_worker + + supervisor = get_existing_supervisord_process() + runner_worker = get_worker(supervisor, "worker_runner") if supervisor else None + if runner_worker and runner_worker.get("statename") in ("STARTING", "RUNNING"): + return False + + machine = Machine.current() + Process.cleanup_stale_running(machine=machine) + running_orchestrators = Process.objects.filter( + machine=machine, + status=Process.StatusChoices.RUNNING, + process_type=Process.TypeChoices.ORCHESTRATOR, + ) + if any(proc.is_running for proc in running_orchestrators): + return False + + if supervisor is not None: + try: + 
start_worker(supervisor, RUNNER_WORKER) + return True + except Exception: + pass + + return False + + +class CrawlRunner: + def __init__( + self, + crawl, + *, + snapshot_ids: list[str] | None = None, + selected_plugins: list[str] | None = None, + process_discovered_snapshots_inline: bool = True, + show_progress: bool = True, + interactive_interrupts: bool = False, + config_overrides: dict[str, Any] | None = None, + selected_plugins_are_explicit: bool = True, + ): + self.crawl = crawl + self.bus = create_bus(name=_bus_name("ArchiveBox", str(crawl.id)), total_timeout=3600.0) + self.plugins = _discover_archivebox_plugins() + HookProcessService(self.bus, emit_jsonl=False, interactive_tty=interactive_interrupts) + register_sonic_daemon_event_handler(self.bus) + PersistedProcessService(self.bus) + ArchiveBoxBinaryService(self.bus) + BinaryCacheService(self.bus, backend=ArchiveBoxDBBinaryCacheBackend()) + BinaryService(self.bus) + TagService(self.bus) + CrawlService(self.bus, crawl_id=str(crawl.id)) + MachineService(self.bus) + self.process_discovered_snapshots_inline = process_discovered_snapshots_inline + self.show_progress = show_progress + self.interactive_interrupts = interactive_interrupts + self.config_overrides = dict(config_overrides or {}) + + async def ignore_snapshot(_snapshot_id: str) -> None: + return None + + SnapshotService( + self.bus, + crawl_id=str(crawl.id), + schedule_snapshot=self.enqueue_snapshot if process_discovered_snapshots_inline else ignore_snapshot, + ) + ArchiveResultService(self.bus) + self.selected_plugins = selected_plugins + self.selected_plugins_from_args = selected_plugins is not None and selected_plugins_are_explicit + self.initial_snapshot_ids = snapshot_ids + self.snapshot_tasks: dict[str, asyncio.Task[None]] = {} + self.snapshot_semaphore = asyncio.Semaphore(1) + self.max_concurrent_snapshots = 1 + self.persona = None + self.base_config: ArchiveBoxBaseConfig | dict[str, Any] = {} + self.derived_config: dict[str, Any] = {} + 
self.primary_url = "" + self.crawl_output_dir = "" + self._live_stream = None + self.root_crawl_event_id: str | None = None + self.root_crawl_start_event_id: str | None = None + self._run_task: asyncio.Task[None] | None = None + self._skip_wait_until_idle = False + # This is intentionally a synchronous OS-signal side channel, not bus + # state. During SIGINT/SIGTERM/SIGHUP, asyncio.run() may already be + # cancelling tasks and closing the loop, so abxbus cannot be relied on + # for timely delivery of a final "stop now" event. + self._signal_abort_requested = False + self._last_lease_heartbeat_at = 0.0 + + def _request_abort_from_signal(self, _sig: signal.Signals) -> None: + if os.environ.get("ARCHIVEBOX_RUNNER_DAEMON") == "1": + os._exit(128 + int(_sig)) + already_requested = self._signal_abort_requested + self._signal_abort_requested = True + self._skip_wait_until_idle = True + # The foreground signal handler runs while the event loop may be in the + # middle of shutdown. Flip cheap in-memory flags here and let normal + # finally blocks do cleanup; only cancel the runner task immediately for + # non-interactive commands or for a second interrupt escalation. + if (not self.interactive_interrupts or already_requested) and self._run_task is not None and not self._run_task.done(): + self._run_task.cancel() + + async def crawl_is_cancelled(self) -> bool: + from archivebox.crawls.models import Crawl + + if self._signal_abort_requested: + return True + if self.allow_maintenance_on_inactive_crawl: + # SEALED is the normal terminal state of a finished crawl, not a + # cancellation signal for maintenance work on its already-sealed + # snapshots (search backend backfill, fs migration, etc.). When the + # runner is invoked with explicit snapshot_ids + selected_plugins, + # treat sealed as completed rather than cancelled so the requested + # maintenance hooks can actually run. 
+ return False + return await Crawl.objects.filter(id=self.crawl.id, status=Crawl.StatusChoices.SEALED).aexists() + + async def crawl_is_paused(self) -> bool: + from archivebox.crawls.models import Crawl + + crawl = await Crawl.objects.only("status").aget(id=self.crawl.id) + return crawl.is_paused + + async def watch_for_cancelled_crawl(self, parent_event: BaseEvent, *, poll_interval: float = 1.0) -> None: + while True: + await asyncio.sleep(poll_interval) + if not await self.crawl_is_cancelled(): + continue + abort_event = parent_event.emit(CrawlAbortEvent()) + await _run_event_now(abort_event, abort_event.event_timeout) + return + + def runtime_plugins(self) -> dict[str, Plugin]: + return filter_plugins(self.plugins, self.selected_plugins, include_providers=True) if self.selected_plugins else self.plugins + + @property + def allow_maintenance_on_inactive_crawl(self) -> bool: + """Run the requested hooks on a snapshot whose parent crawl is paused or sealed. + + Maintenance entry paths โ€” direct ``snapshot_ids + selected_plugins`` invocations + for search backend backfill, fs migration, plugin-targeted updates โ€” are + legitimately allowed to operate on finished/paused crawls. Without this gate, + ``crawl_is_cancelled`` would treat a SEALED parent as a cancellation signal + and short-circuit every guard before any hook ran, leaving the queued + ArchiveResult rows stuck and the orchestrator looping on them. 
+ """ + return bool(self.initial_snapshot_ids and self.selected_plugins) + + async def run(self) -> None: + heartbeat = CrawlHeartbeat( + Path(self.crawl_output_dir), + runtime="archivebox", + crawl_id=str(self.crawl.id), + ) + root_snapshot_id: str | None = None + bus_destroyed = False + try: + first_signal_message = ( + "\n[๐Ÿ›‘] Got {signal_name}, aborting the active hook...\n" + if self.interactive_interrupts + else "\n[๐Ÿ›‘] Got {signal_name}, stopping gracefully...\n" + ) + self._run_task = asyncio.current_task() + # Do not raise KeyboardInterrupt directly from an OS signal while + # the asyncio loop is active. Python can inject it into whichever + # task is currently running, which produces noisy "Task exception + # was never retrieved" logs from unrelated abxbus housekeeping + # tasks. _request_abort_from_signal() cancels the runner task + # cooperatively instead; repeated signals still hard-exit in the + # shared foreground signal handler. + with foreground_shutdown_signals( + first_signal_message=first_signal_message, + on_signal=self._request_abort_from_signal, + raise_on_first_signal=False, + ): + snapshot_ids = await sync_to_async(self.load_run_state, thread_sensitive=True)() + max_concurrent_snapshots = max(1, int(self.base_config.get("CRAWL_MAX_CONCURRENT_SNAPSHOTS", 1))) + self.max_concurrent_snapshots = max_concurrent_snapshots + self.snapshot_semaphore = asyncio.Semaphore(max_concurrent_snapshots) + live_ui = self._create_live_ui() + with live_ui if live_ui is not None else nullcontext(): + try: + await heartbeat.start() + await _emit_machine_config( + self.bus, + config=self.base_config, + derived_config=self.derived_config, + ) + if snapshot_ids: + root_snapshot_id = snapshot_ids[0] + await self.run_crawl(root_snapshot_id, snapshot_ids) + finally: + self._run_task = None + await heartbeat.stop() + await self.stop_snapshot_tasks() + try: + await self.bus.wait_until_idle(timeout=1.0 if self._skip_wait_until_idle else 30.0) + except TimeoutError: + 
pass + finally: + await self.bus.destroy(clear=False) + bus_destroyed = True + finally: + if not bus_destroyed: + self._run_task = None + await heartbeat.stop() + await self.stop_snapshot_tasks() + await self.bus.destroy(clear=False) + if self._live_stream is not None: + try: + self._live_stream.close() + except Exception: + pass + self._live_stream = None + await sync_to_async(self.finalize_run_state, thread_sensitive=True)() + + async def enqueue_snapshot(self, snapshot_id: str, crawl_start_event: CrawlStartEvent | None = None) -> None: + if await self.crawl_is_cancelled(): + return + if await self.crawl_is_paused() and not self.allow_maintenance_on_inactive_crawl: + return + task = self.snapshot_tasks.get(snapshot_id) + if task is not None and not task.done(): + return + current_event = crawl_start_event or get_current_event() + if isinstance(current_event, CrawlStartEvent): + task = asyncio.create_task(self.run_snapshot(snapshot_id, current_event), context=_runner_task_context()) + elif in_handler_context(): + return + else: + task = asyncio.create_task(self.run_snapshot(snapshot_id), context=_runner_task_context()) + self.snapshot_tasks[snapshot_id] = task + + async def stop_snapshot_tasks(self) -> None: + if not self.snapshot_tasks: + return + tasks = list(self.snapshot_tasks.values()) + if self._signal_abort_requested: + done = {task for task in tasks if task.done()} + pending = set(tasks) - done + else: + done, pending = await asyncio.wait(tasks, timeout=5.0) + for task in pending: + task.cancel() + await asyncio.gather(*done, *pending, return_exceptions=True) + self.snapshot_tasks.clear() + + async def wait_for_snapshot_tasks(self) -> None: + task_errors: list[Exception] = [] + stop_scheduling = False + while True: + pending_tasks: list[asyncio.Task[None]] = [] + for snapshot_id, task in list(self.snapshot_tasks.items()): + if task.done(): + if self.snapshot_tasks.get(snapshot_id) is task: + self.snapshot_tasks.pop(snapshot_id, None) + try: + task.result() 
+ except asyncio.CancelledError as err: + if _is_external_task_cancelled(err): + raise + stop_scheduling = True + except Exception as err: + task_errors.append(err) + stop_scheduling = True + continue + pending_tasks.append(task) + if not pending_tasks: + if task_errors: + if len(task_errors) == 1: + raise task_errors[0] + raise ExceptionGroup("One or more snapshot tasks failed", task_errors) + if stop_scheduling: + return + await self.enqueue_pending_snapshots_from_projection() + if not self.snapshot_tasks: + return + continue + await self.heartbeat_active_leases() + done, _pending = await asyncio.wait(pending_tasks, timeout=10.0, return_when=asyncio.FIRST_COMPLETED) + if not done: + continue + for task in done: + for snapshot_id, tracked_task in list(self.snapshot_tasks.items()): + if tracked_task is task: + self.snapshot_tasks.pop(snapshot_id, None) + break + try: + task.result() + except asyncio.CancelledError as err: + if _is_external_task_cancelled(err): + raise + stop_scheduling = True + except Exception as err: + task_errors.append(err) + stop_scheduling = True + if self.snapshot_tasks and ( + await self.crawl_is_cancelled() or (await self.crawl_is_paused() and not self.allow_maintenance_on_inactive_crawl) + ): + stop_scheduling = True + if not stop_scheduling: + await self.enqueue_pending_snapshots_from_projection() + + async def heartbeat_active_leases(self) -> None: + if self._run_task is None: + return + now_monotonic = time.monotonic() + if now_monotonic - self._last_lease_heartbeat_at < 10.0: + return + self._last_lease_heartbeat_at = now_monotonic + lease_until = timezone.now() + timedelta(seconds=ACTIVE_STATE_LEASE_SECONDS) + active_snapshot_ids = [snapshot_id for snapshot_id, task in self.snapshot_tasks.items() if not task.done()] + + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + + await Crawl.objects.filter(id=self.crawl.id, status=Crawl.StatusChoices.STARTED).aupdate( + retry_at=lease_until, + 
modified_at=timezone.now(), + ) + if active_snapshot_ids: + await Snapshot.objects.filter(id__in=active_snapshot_ids, status=Snapshot.StatusChoices.STARTED).aupdate( + retry_at=lease_until, + modified_at=timezone.now(), + ) + + async def drain_snapshot_tasks(self) -> None: + task_errors: list[Exception] = [] + while self.snapshot_tasks: + done, _pending = await asyncio.wait(list(self.snapshot_tasks.values()), return_when=asyncio.FIRST_COMPLETED) + for task in done: + for snapshot_id, tracked_task in list(self.snapshot_tasks.items()): + if tracked_task is task: + self.snapshot_tasks.pop(snapshot_id, None) + break + try: + task.result() + except asyncio.CancelledError as err: + if _is_external_task_cancelled(err): + raise + except Exception as err: + task_errors.append(err) + if task_errors: + if len(task_errors) == 1: + raise task_errors[0] + raise ExceptionGroup("One or more snapshot tasks failed", task_errors) + + async def enqueue_pending_snapshots_from_projection(self) -> None: + from archivebox.core.models import Snapshot + from archivebox.config.common import get_config + + if not isinstance(get_current_event(), CrawlStartEvent): + return + if await self.crawl_is_cancelled(): + return + if await self.crawl_is_paused() and not self.allow_maintenance_on_inactive_crawl: + return + + await sync_to_async(self.crawl.refresh_from_db, thread_sensitive=True)() + config = await sync_to_async(lambda: get_config(crawl=self.crawl), thread_sensitive=True)() + self.max_concurrent_snapshots = max(1, int(config["CRAWL_MAX_CONCURRENT_SNAPSHOTS"])) + + active_snapshot_ids = [snapshot_id for snapshot_id, task in self.snapshot_tasks.items() if not task.done()] + available_slots = max(0, self.max_concurrent_snapshots - len(active_snapshot_ids)) + if available_slots <= 0: + return + pending_snapshot_ids = await sync_to_async( + lambda: list( + self.crawl.snapshot_set.filter(status__in=Snapshot.RUNNABLE_STATES) + .exclude(id__in=active_snapshot_ids) + 
.filter(retry_at__lte=timezone.now()) + .order_by("depth", "created_at") + .values_list("id", flat=True)[:available_slots], + ), + thread_sensitive=True, + )() + for snapshot_id in pending_snapshot_ids: + if snapshot_id not in self.snapshot_tasks: + await self.enqueue_snapshot(snapshot_id) + + def load_run_state(self) -> list[str]: + from archivebox.config.common import get_config + from archivebox.core.models import Snapshot + from archivebox.plugins.hooks import discover_hooks + from archivebox.machine.models import Machine, NetworkInterface, Process + + self.primary_url = self.crawl.get_urls_list()[0] if self.crawl.get_urls_list() else "" + current_iface = NetworkInterface.current(refresh=not self.allow_maintenance_on_inactive_crawl) + current_process = Process.current() + if current_process.iface_id != current_iface.id or current_process.machine_id != current_iface.machine_id: + current_process.iface = current_iface + current_process.machine = current_iface.machine + current_process.save(update_fields=["iface", "machine", "modified_at"]) + self.persona = self.crawl.resolve_persona() + self.base_config = get_config(crawl=self.crawl) + self.derived_config = dict(Machine.current().config or {}) + self.crawl_output_dir = str(self.crawl.output_dir) + if self.persona: + self.base_config.update( + self.persona.prepare_runtime_for_crawl( + self.crawl, + chrome_binary=self.base_config["CHROME_BINARY"], + ), + ) + self.base_config.update(self.config_overrides) + if self.selected_plugins is None: + raw_plugins = str(self.base_config.get("PLUGINS") or "").strip() + if raw_plugins: + self.selected_plugins = [name.strip() for name in raw_plugins.split(",") if name.strip()] + else: + runtime_events = ("CrawlSetup", "CrawlCleanup", "Snapshot", "SnapshotCleanup") + runtime_plugins = { + hook.parent.name for event_name in runtime_events for hook in discover_hooks(event_name, config=self.base_config) + } + self.selected_plugins = sorted(runtime_plugins) or None + if 
self.initial_snapshot_ids: + # Direct snapshot maintenance paths are allowed to name paused + # snapshots explicitly. The runner still requires selected_plugins + # later, so this does not restart the crawl lifecycle. + return [str(snapshot_id) for snapshot_id in self.initial_snapshot_ids] + if self.crawl.is_paused: + return [] + pending_snapshots = list( + self.crawl.snapshot_set.filter(status__in=Snapshot.RUNNABLE_STATES) + .filter(retry_at__lte=timezone.now()) + .order_by("depth", "created_at"), + ) + if pending_snapshots: + return [str(snapshot.id) for snapshot in pending_snapshots] + if self.crawl.snapshot_set.exclude(status__in=[Snapshot.StatusChoices.SEALED, Snapshot.StatusChoices.PAUSED]).exists(): + return [] + created = self.crawl.create_snapshots_from_urls() + snapshots = created or list(self.crawl.snapshot_set.filter(depth=0).order_by("created_at")) + return [str(snapshot.id) for snapshot in snapshots] + + def finalize_run_state(self) -> None: + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + + if self.persona: + self.persona.cleanup_runtime_for_crawl(self.crawl) + crawl = Crawl.objects.get(id=self.crawl.id) + if crawl.status == Crawl.StatusChoices.SEALED: + return + if crawl.is_paused: + return + if crawl.is_finished(): + if crawl.status != Crawl.StatusChoices.SEALED: + if crawl.status == Crawl.StatusChoices.STARTED: + crawl.sm.seal() + else: + crawl.update_and_requeue( + status=Crawl.StatusChoices.SEALED, + retry_at=None, + ) + return + active_snapshots = crawl.snapshot_set.filter( + status__in=[ + Snapshot.StatusChoices.QUEUED, + Snapshot.StatusChoices.STARTED, + Snapshot.StatusChoices.PAUSED, + ], + ) + next_snapshot_retry = active_snapshots.order_by("retry_at", "created_at").values_list("retry_at", flat=True).first() + if crawl.status != Crawl.StatusChoices.STARTED: + crawl.update_and_requeue( + status=Crawl.StatusChoices.STARTED, + retry_at=crawl.retry_at or next_snapshot_retry or timezone.now(), + ) + 
return + crawl.update_and_requeue( + retry_at=crawl.retry_at or next_snapshot_retry or timezone.now(), + ) + + def _create_live_ui(self) -> LiveBusUI | None: + if not self.show_progress: + return None + stdout_is_tty = sys.stdout.isatty() + stderr_is_tty = sys.stderr.isatty() + interactive_tty = stdout_is_tty or stderr_is_tty + if not interactive_tty: + return None + stream = sys.stderr if stderr_is_tty else sys.stdout + if os.path.exists("/dev/tty"): + try: + self._live_stream = open("/dev/tty", "w", buffering=1, encoding=stream.encoding or "utf-8") + stream = self._live_stream + except OSError: + self._live_stream = None + try: + terminal_size = os.get_terminal_size(stream.fileno()) + terminal_width = terminal_size.columns + terminal_height = terminal_size.lines + except (AttributeError, OSError, ValueError): + terminal_size = shutil.get_terminal_size(fallback=(160, 40)) + terminal_width = terminal_size.columns + terminal_height = terminal_size.lines + ui_console = Console( + file=stream, + force_terminal=True, + width=terminal_width, + height=terminal_height, + _environ={ + "COLUMNS": str(terminal_width), + "LINES": str(terminal_height), + }, + ) + plugins_label = ", ".join(self.selected_plugins) if self.selected_plugins else f"all ({len(self.plugins)} available)" + live_ui = LiveBusUI( + self.bus, + total_hooks=_count_selected_hooks(self.plugins, self.selected_plugins), + timeout_seconds=self.base_config["TIMEOUT"], + ui_console=ui_console, + interactive_tty=True, + ) + live_ui.print_intro( + url=self.primary_url or "crawl", + output_dir=Path(self.crawl_output_dir), + plugins_label=plugins_label, + ) + return live_ui + + def load_snapshot_payload(self, snapshot_id: str) -> dict[str, Any]: + from archivebox.config.common import get_config + from archivebox.core.models import Snapshot + + snapshot = Snapshot.objects.select_related("crawl", "crawl__created_by").get(id=snapshot_id) + self.crawl = snapshot.crawl + self.persona = snapshot.crawl.resolve_persona() + 
self.base_config = get_config(crawl=snapshot.crawl, persona=self.persona) + self.base_config.update(self.config_overrides) + self.crawl_output_dir = str(snapshot.crawl.output_dir) + runtime_chrome_overrides = {} + if self.persona: + if str(self.base_config.get("CHROME_ISOLATION") or "crawl").lower() == "snapshot": + runtime_chrome_overrides.update( + self.persona.prepare_runtime_for_snapshot( + snapshot, + chrome_binary=self.base_config["CHROME_BINARY"], + ), + ) + else: + crawl_downloads_dir = self.persona.runtime_downloads_dir_for_crawl(snapshot.crawl) + crawl_downloads_dir.mkdir(parents=True, exist_ok=True) + runtime_chrome_overrides.update( + { + "CHROME_USER_DATA_DIR": str(self.persona.runtime_profile_dir_for_crawl(snapshot.crawl)), + "CHROME_DOWNLOADS_DIR": str(crawl_downloads_dir), + }, + ) + snapshot_output_dir = str(snapshot.output_dir) + tags = snapshot.tags_str() + config = self.base_config.for_crawl_runtime( + crawl=snapshot.crawl, + snapshot=snapshot, + persona=self.persona, + runtime_overrides=runtime_chrome_overrides, + extra_context={ + "snapshot_id": str(snapshot.id), + "snapshot_depth": snapshot.depth, + "snapshot_url": snapshot.url, + "snapshot_title": snapshot.title or "", + "snapshot_tags": tags, + }, + ) + normalized_config = normalize_runtime_config(config) + return { + "id": str(snapshot.id), + "url": snapshot.url, + "title": snapshot.title, + "timestamp": snapshot.timestamp, + "bookmarked_at": snapshot.bookmarked_at.isoformat() if snapshot.bookmarked_at else "", + "created_at": snapshot.created_at.isoformat() if snapshot.created_at else "", + "tags": tags, + "depth": snapshot.depth, + "status": snapshot.status, + "output_dir": snapshot_output_dir, + "config": normalized_config, + "_snapshot": snapshot, + } + + async def enqueue_discovered_snapshots_from_outputs(self, snapshot_payload: dict[str, Any]) -> None: + from archivebox.core.models import Snapshot + from archivebox.config.common import get_config + from archivebox.plugins.hooks 
import collect_urls_from_plugins + + await sync_to_async(self.crawl.refresh_from_db, thread_sensitive=True)() + if self.crawl.is_paused and not self.allow_maintenance_on_inactive_crawl: + return + if int(snapshot_payload["depth"]) >= self.crawl.max_depth: + return + + discovered_urls = await sync_to_async(collect_urls_from_plugins, thread_sensitive=True)(Path(snapshot_payload["output_dir"])) + if not discovered_urls: + return + + if self.crawl.status == self.crawl.StatusChoices.SEALED: + # Snapshot completion projectors can observe the root snapshot seal + # before the runner has consumed parser urls.jsonl output. A sealed + # crawl must not block those freshly discovered child snapshots; the + # runner is still inside the same crawl lifecycle and will seal it + # again after the discovered queue is empty. + await sync_to_async(self.crawl.update_and_requeue, thread_sensitive=True)( + status=self.crawl.StatusChoices.STARTED, + retry_at=timezone.now(), + ) + + parent_snapshot = await sync_to_async( + lambda: Snapshot.objects.select_related("crawl", "crawl__created_by").filter(id=snapshot_payload["id"]).first(), + thread_sensitive=True, + )() + if parent_snapshot is None: + return + config = await sync_to_async( + lambda: get_config(crawl=self.crawl, snapshot=parent_snapshot).for_crawl_runtime( + crawl=self.crawl, + snapshot=parent_snapshot, + persona=self.crawl.resolve_persona(), + crawl_output_dir=self.crawl.output_dir, + snapshot_output_dir=parent_snapshot.output_dir, + ), + thread_sensitive=True, + )() + if CrawlLimitState.from_config(config).get_stop_reason() in ("crawl_max_size", "crawl_timeout"): + return + + await sync_to_async(self.crawl.create_discovered_snapshots, thread_sensitive=True)( + parent_snapshot, + discovered_urls, + depth=parent_snapshot.depth + 1, + ) + if self.process_discovered_snapshots_inline and isinstance(get_current_event(), CrawlStartEvent): + await self.enqueue_pending_snapshots_from_projection() + + async def run_crawl(self, 
root_snapshot_id: str, snapshot_ids: list[str]) -> None: + snapshot = await sync_to_async(self.load_snapshot_payload, thread_sensitive=True)(root_snapshot_id) + config = normalize_runtime_config(snapshot["config"]) + derived_config = normalize_runtime_config(self.derived_config) + output_dir = Path(self.crawl_output_dir) + plugins = self.runtime_plugins() + abx_snapshot = AbxSnapshot( + id=snapshot["id"], + url=snapshot["url"], + depth=int(snapshot["depth"]), + crawl_id=str(self.crawl.id), + ) + setup_hooks = [(plugin, hook) for plugin in plugins.values() for hook in plugin.filter_hooks("CrawlSetup")] + crawl_setup_phase_timeout = compute_phase_timeout(setup_hooks, config) + install_phase_timeout = compute_install_phase_timeout(get_install_plugins(plugins), config) + snapshot_hooks = [(plugin, hook) for plugin in plugins.values() for hook in plugin.filter_hooks("Snapshot")] + max_snapshot_count = max(1, int(config.get("CRAWL_MAX_URLS") or len(snapshot_ids) or 1)) + snapshot_phase_timeout = compute_phase_timeout(snapshot_hooks, config) + 120.0 + all_snapshots_phase_timeout = snapshot_phase_timeout * max_snapshot_count + crawl_cleanup_phase_timeout = crawl_setup_phase_timeout + crawl_lifecycle_timeout = ( + crawl_setup_phase_timeout + + all_snapshots_phase_timeout + + crawl_cleanup_phase_timeout + + CrawlCompletedEvent.model_fields["event_timeout"].default + + 30.0 + ) + await _emit_machine_config(self.bus, config=config, derived_config=derived_config) + install_cancel_watcher: asyncio.Task[None] | None = None + install_event = self.bus.emit( + InstallEvent( + url=snapshot["url"], + snapshot_id=snapshot["id"], + output_dir=str(output_dir), + event_timeout=install_phase_timeout, + event_handler_slow_timeout=slow_warning_timeout(install_phase_timeout), + ), + ) + + async def on_archivebox_InstallEvent(event: InstallEvent) -> None: + nonlocal install_cancel_watcher + if event.event_id != install_event.event_id: + return + install_cancel_watcher = 
asyncio.create_task(self.watch_for_cancelled_crawl(event)) + + on_archivebox_InstallEvent.__name__ = "on_archivebox_InstallEvent__cancel_watcher" + self.bus.on(InstallEvent, on_archivebox_InstallEvent) + setup_abx_services( + self.bus, + plugins=plugins, + url=snapshot["url"], + snapshot=abx_snapshot, + output_dir=output_dir, + install_enabled=True, + crawl_setup_enabled=True, + crawl_event_enabled=False, + crawl_start_enabled=False, + snapshot_cleanup_enabled=False, + crawl_cleanup_enabled=True, + crawl_completed_enabled=False, + crawl_setup_phase_timeout=crawl_setup_phase_timeout, + snapshot_phase_timeout=crawl_setup_phase_timeout, + snapshot_cleanup_phase_timeout=crawl_setup_phase_timeout, + crawl_cleanup_phase_timeout=crawl_setup_phase_timeout, + persist_derived=False, + auto_install=True, + emit_jsonl=False, + abort_requested=self.crawl_is_cancelled, + MachineService=None, + PluginBinariesService=PluginBinariesService, + BinaryCacheService=None, + BinaryService=None, + ProcessService=None, + ArchiveResultService=None, + TagService=None, + SnapshotService=None, + ) + try: + await _run_event_now(install_event, install_phase_timeout) + finally: + if install_cancel_watcher is not None: + install_cancel_watcher.cancel() + await asyncio.gather(install_cancel_watcher, return_exceptions=True) + + async def on_archivebox_CrawlStartEvent(event: CrawlStartEvent) -> None: + if event.event_id != self.root_crawl_start_event_id: + return + for snapshot_id in snapshot_ids: + if sum(1 for task in self.snapshot_tasks.values() if not task.done()) >= self.max_concurrent_snapshots: + break + if await self.crawl_is_cancelled(): + break + if await self.crawl_is_paused() and not self.allow_maintenance_on_inactive_crawl: + break + await self.enqueue_snapshot(snapshot_id) + await self.wait_for_snapshot_tasks() + + async def on_archivebox_CrawlEvent(event: CrawlEvent) -> None: + if event.event_id != self.root_crawl_event_id: + return + cancel_watcher = 
asyncio.create_task(self.watch_for_cancelled_crawl(event)) + try: + try: + if not await self.crawl_is_cancelled() and ( + not await self.crawl_is_paused() or self.allow_maintenance_on_inactive_crawl + ): + await _run_event_now( + event.emit( + CrawlSetupEvent( + url=snapshot["url"], + snapshot_id=snapshot["id"], + output_dir=str(output_dir), + event_timeout=crawl_setup_phase_timeout, + event_handler_slow_timeout=slow_warning_timeout(crawl_setup_phase_timeout), + ), + ), + crawl_setup_phase_timeout, + ) + if not await self.crawl_is_cancelled() and ( + not await self.crawl_is_paused() or self.allow_maintenance_on_inactive_crawl + ): + crawl_start_event = CrawlStartEvent( + url=snapshot["url"], + snapshot_id=snapshot["id"], + output_dir=str(output_dir), + event_timeout=all_snapshots_phase_timeout, + event_handler_timeout=all_snapshots_phase_timeout + 30.0, + event_handler_slow_timeout=slow_warning_timeout(all_snapshots_phase_timeout), + ) + self.root_crawl_start_event_id = crawl_start_event.event_id + await _run_event_now(event.emit(crawl_start_event), None) + finally: + if self.snapshot_tasks: + await self.drain_snapshot_tasks() + cleanup_event = event.emit( + CrawlCleanupEvent( + url=snapshot["url"], + snapshot_id=snapshot["id"], + output_dir=str(output_dir), + event_timeout=crawl_setup_phase_timeout, + event_handler_slow_timeout=slow_warning_timeout(crawl_setup_phase_timeout), + ), + ) + # Cleanup owns ProcessKillEvent emission for crawl-scoped + # setup hooks. Even during OS-signal shutdown we must drive + # it synchronously before bus teardown; otherwise daemon/bg + # setup hooks can outlive the foreground runner that + # launched them. _run_event_now() is already bounded by the + # crawl setup timeout and cleanup handlers provide their own + # hook-level grace periods. 
+ await _run_event_now(cleanup_event, crawl_setup_phase_timeout) + finally: + cancel_watcher.cancel() + await asyncio.gather(cancel_watcher, return_exceptions=True) + completed_event = event.emit( + CrawlCompletedEvent( + url=snapshot["url"], + snapshot_id=snapshot["id"], + output_dir=str(output_dir), + ), + ) + # Same signal lifecycle as CrawlCleanupEvent above: completion is a + # normal bus event unless the interpreter is already unwinding from + # SIGINT/SIGTERM/SIGHUP, where synchronous bus delivery is no + # longer a dependable shutdown primitive. + if not self._signal_abort_requested: + await _run_event_now(completed_event, CrawlCompletedEvent.model_fields["event_timeout"].default) + + on_archivebox_CrawlStartEvent.__name__ = "on_archivebox_CrawlStartEvent__run_snapshots" + on_archivebox_CrawlEvent.__name__ = "on_archivebox_CrawlEvent__run_recursive_crawl" + self.bus.on(CrawlStartEvent, on_archivebox_CrawlStartEvent) + self.bus.on(CrawlEvent, on_archivebox_CrawlEvent) + + crawl_event = CrawlEvent( + url=snapshot["url"], + snapshot_id=snapshot["id"], + output_dir=str(output_dir), + event_timeout=crawl_lifecycle_timeout, + event_handler_timeout=crawl_lifecycle_timeout + 30.0, + event_handler_slow_timeout=slow_warning_timeout(crawl_lifecycle_timeout), + ) + self.root_crawl_event_id = crawl_event.event_id + await _run_event_now(self.bus.emit(crawl_event), None) + if await self.crawl_is_cancelled(): + self._skip_wait_until_idle = True + return + for plugin, hook in setup_hooks: + if hook.is_background: + continue + process_event = await self.bus.find( + ProcessEvent, + past=True, + future=crawl_setup_phase_timeout, + where=lambda candidate, plugin_name=plugin.name, hook_name=hook.name: ( + self.bus.event_is_child_of(candidate, crawl_event) + and candidate.plugin_name == plugin_name + and candidate.hook_name == hook_name + and candidate.output_dir == str(output_dir / plugin_name) + ), + ) + if process_event is None: + raise RuntimeError(f"Crawl setup hook 
{plugin.name}:{hook.name} did not start") + completed_process = await self.bus.find( + ProcessCompletedEvent, + child_of=process_event, + past=True, + future=crawl_setup_phase_timeout, + ) + if completed_process is None: + raise RuntimeError(f"Crawl setup hook {plugin.name}:{hook.name} did not complete") + await completed_process.wait(timeout=crawl_setup_phase_timeout) + await completed_process.event_results_list() + if completed_process.status == "failed": + raise RuntimeError(f"Crawl setup hook {plugin.name}:{hook.name} failed") + + async def run_snapshot(self, snapshot_id: str, crawl_start_event: CrawlStartEvent | None = None) -> None: + async with self.snapshot_semaphore: + crawl_start_event = crawl_start_event or get_current_event() + if not isinstance(crawl_start_event, CrawlStartEvent): + raise RuntimeError("Snapshot events must be emitted from a CrawlStartEvent handler") + snapshot = await sync_to_async(self.load_snapshot_payload, thread_sensitive=True)(snapshot_id) + if snapshot["status"] == "sealed" and not self.selected_plugins: + await sync_to_async(run_snapshot_maintenance, thread_sensitive=True)(snapshot_id) + return + config = normalize_runtime_config(snapshot["config"]) + snapshot_config_plugins = [name.strip() for name in str(config.get("PLUGINS") or "").split(",") if name.strip()] + snapshot_selected_plugins = ( + self.selected_plugins if self.selected_plugins_from_args else (snapshot_config_plugins or self.selected_plugins) + ) + selected_hooks_by_plugin = None + if snapshot["status"] == "started": + _reset_count, running_count = await sync_to_async(snapshot["_snapshot"].reset_abandoned_results, thread_sensitive=True)() + if running_count: + await sync_to_async( + lambda: snapshot["_snapshot"].update_and_requeue( + retry_at=timezone.now() + timedelta(seconds=ACTIVE_STATE_LEASE_SECONDS), + ), + thread_sensitive=True, + )() + return + if await sync_to_async(snapshot["_snapshot"].is_finished_processing, thread_sensitive=True)(): + await 
sync_to_async(finalize_completed_snapshot, thread_sensitive=True)( + snapshot["id"], + output_dir=Path(snapshot["output_dir"]), + ) + return + if not self.selected_plugins_from_args: + queued_plugins, selected_hooks_by_plugin = await sync_to_async( + queued_plugins_and_hooks_for_snapshot, + thread_sensitive=True, + )(snapshot["id"]) + if queued_plugins: + if snapshot_selected_plugins: + queued_plugins = [plugin for plugin in queued_plugins if plugin in snapshot_selected_plugins] + selected_hooks_by_plugin = { + plugin: hooks for plugin, hooks in (selected_hooks_by_plugin or {}).items() if plugin in queued_plugins + } + snapshot_selected_plugins = queued_plugins + elif not self.selected_plugins_from_args: + queued_plugins, selected_hooks_by_plugin = await sync_to_async( + queued_plugins_and_hooks_for_snapshot, + thread_sensitive=True, + )(snapshot["id"]) + if queued_plugins: + if snapshot_selected_plugins: + queued_plugins = [plugin for plugin in queued_plugins if plugin in snapshot_selected_plugins] + selected_hooks_by_plugin = { + plugin: hooks for plugin, hooks in (selected_hooks_by_plugin or {}).items() if plugin in queued_plugins + } + snapshot_selected_plugins = queued_plugins + if snapshot["depth"] > 0 and CrawlLimitState.from_config(snapshot["config"]).get_stop_reason() in ( + "crawl_max_size", + "crawl_timeout", + ): + await sync_to_async(self.seal_snapshot_due_to_limit, thread_sensitive=True)(snapshot_id) + return + derived_config = normalize_runtime_config(self.derived_config) + output_dir = Path(snapshot["output_dir"]) + plugins = ( + filter_plugins(self.plugins, snapshot_selected_plugins, include_providers=True) + if snapshot_selected_plugins + else self.plugins + ) + if selected_hooks_by_plugin is not None: + await sync_to_async(fail_unavailable_queued_hooks, thread_sensitive=True)( + snapshot["id"], + selected_hooks_by_plugin, + plugins, + ) + remaining_queued_plugins = await sync_to_async( + queued_plugins_for_snapshot, + thread_sensitive=True, + 
)(snapshot["id"]) + if snapshot_selected_plugins and remaining_queued_plugins: + remaining_queued_plugins = [plugin for plugin in remaining_queued_plugins if plugin in snapshot_selected_plugins] + if not remaining_queued_plugins: + await sync_to_async(run_snapshot_maintenance, thread_sensitive=True)(snapshot_id, output_dir=output_dir) + return + snapshot_selected_plugins = remaining_queued_plugins + plugins = filter_plugins(self.plugins, snapshot_selected_plugins, include_providers=True) + selected_hooks_by_plugin = include_background_prerequisite_hooks(selected_hooks_by_plugin, plugins) + abx_snapshot = AbxSnapshot( + id=snapshot["id"], + url=snapshot["url"], + depth=int(snapshot["depth"]), + crawl_id=str(self.crawl.id), + ) + snapshot_hooks = [(plugin, hook) for plugin in plugins.values() for hook in plugin.filter_hooks("Snapshot")] + snapshot_phase_timeout = compute_phase_timeout(snapshot_hooks, config) + 120.0 + await _emit_machine_config(self.bus, config=config, derived_config=derived_config, parent_event=crawl_start_event) + snapshot_service = HookSnapshotService( + self.bus, + url=snapshot["url"], + snapshot=abx_snapshot, + output_dir=output_dir, + plugins=plugins, + snapshot_phase_timeout=snapshot_phase_timeout, + snapshot_cleanup_enabled=True, + snapshot_cleanup_phase_timeout=snapshot_phase_timeout, + abort_requested=self.crawl_is_cancelled, + selected_hooks_by_plugin=selected_hooks_by_plugin, + ) + try: + snapshot_event = SnapshotEvent( + url=snapshot["url"], + snapshot_id=snapshot["id"], + output_dir=str(output_dir), + depth=int(snapshot["depth"]), + event_timeout=snapshot_phase_timeout, + event_handler_timeout=snapshot_phase_timeout, + event_handler_slow_timeout=slow_warning_timeout(snapshot_phase_timeout), + ) + snapshot_event.event_parent_id = crawl_start_event.event_id + emitted_snapshot_event = self.bus.emit(snapshot_event) + await _run_event_now(emitted_snapshot_event, snapshot_phase_timeout) + completed_snapshot = await self.bus.find( + 
SnapshotCompletedEvent, + child_of=emitted_snapshot_event, + past=True, + future=snapshot_phase_timeout, + ) + if completed_snapshot is None: + raise RuntimeError(f"Snapshot {snapshot_id} did not complete") + await completed_snapshot.wait(timeout=snapshot_phase_timeout) + await completed_snapshot.event_results_list() + # SnapshotCompletedEvent is the normal projection path, but the + # runner is the scheduler owner. Finalize idempotently here too + # so a completed snapshot cannot remain STARTED if the event was + # observed before its DB projector advanced the state machine. + crawl_limit_stop_reason = CrawlLimitState.from_config(config).get_stop_reason() + await sync_to_async(finalize_completed_snapshot, thread_sensitive=True)( + snapshot_id, + output_dir=output_dir, + crawl_limit_stop_reason=crawl_limit_stop_reason, + ) + if snapshot["status"] == "sealed": + await sync_to_async(run_snapshot_maintenance, thread_sensitive=True)(snapshot_id, output_dir=output_dir) + return + await self.enqueue_discovered_snapshots_from_outputs(snapshot) + await sync_to_async( + lambda: ( + self.crawl.sm.seal() + if self.crawl.status == self.crawl.StatusChoices.STARTED + and not self.crawl.snapshot_set.filter( + status__in=self.crawl.snapshot_set.model.OPEN_STATES, + ).exists() + else None + ), + thread_sensitive=True, + )() + finally: + snapshot_service.close() + + def seal_snapshot_due_to_limit(self, snapshot_id: str) -> None: + from archivebox.core.models import Snapshot + + snapshot = Snapshot.objects.select_related("crawl", "crawl__created_by").filter(id=snapshot_id).first() + if snapshot is None or snapshot.status == Snapshot.StatusChoices.SEALED: + return + if snapshot.status == Snapshot.StatusChoices.STARTED: + snapshot.sm.seal() + return + snapshot.update_and_requeue( + status=Snapshot.StatusChoices.SEALED, + retry_at=None, + ) + + +def run_crawl( + crawl_id: str, + *, + snapshot_ids: list[str] | None = None, + selected_plugins: list[str] | None = None, + 
process_discovered_snapshots_inline: bool = True, + show_progress: bool = True, + interactive_interrupts: bool = False, + config_overrides: dict[str, Any] | None = None, + selected_plugins_are_explicit: bool = True, +) -> None: + from archivebox.crawls.models import Crawl + from django.db import close_old_connections + + def run_in_current_thread() -> None: + close_old_connections() + try: + crawl = Crawl.objects.get(id=crawl_id) + asyncio.run( + CrawlRunner( + crawl, + snapshot_ids=snapshot_ids, + selected_plugins=selected_plugins, + process_discovered_snapshots_inline=process_discovered_snapshots_inline, + show_progress=show_progress, + interactive_interrupts=interactive_interrupts, + config_overrides=config_overrides, + selected_plugins_are_explicit=selected_plugins_are_explicit, + ).run(), + ) + finally: + close_old_connections() + + if threading.current_thread() is threading.main_thread(): + run_in_current_thread() + return + + errors: list[BaseException] = [] + + def run_in_worker_thread() -> None: + try: + run_in_current_thread() + except BaseException as err: + errors.append(err) + + worker = threading.Thread(target=run_in_worker_thread, name=f"archivebox-crawl-{crawl_id}") + worker.start() + worker.join() + if errors: + raise errors[0] + + +async def _run_binary(binary_id: str) -> None: + from archivebox.config.common import get_config + from archivebox.machine.models import Binary, Machine + + binary = await Binary.objects.aget(id=binary_id) + plugins = _discover_archivebox_plugins() + config = get_config(include_machine=False) + machine = await sync_to_async(Machine.current, thread_sensitive=True)() + derived_config = normalize_runtime_config(machine.config) + config = config.for_crawl() + config = normalize_runtime_config(config) + bus = create_bus(name=_bus_name("ArchiveBox_binary", str(binary.id)), total_timeout=1800.0) + process_service = PersistedProcessService(bus) + binary_process_service = ArchiveBoxBinaryService(bus) + BinaryCacheService(bus, 
backend=ArchiveBoxDBBinaryCacheBackend()) + BinaryService(bus) + TagService(bus) + ArchiveResultService(bus) + MachineService(bus) + setup_abx_services( + bus, + plugins=plugins, + install_enabled=False, + crawl_setup_enabled=False, + crawl_start_enabled=False, + snapshot_cleanup_enabled=False, + crawl_cleanup_enabled=False, + persist_derived=False, + auto_install=True, + emit_jsonl=False, + BinaryCacheService=None, + BinaryService=None, + ) + await _emit_machine_config(bus, config=config, derived_config=derived_config) + + native_overrides, override_extra_context = split_abxpkg_binary_request_overrides(binary.overrides or None) + + try: + await bus.emit( + BinaryRequestEvent( + name=binary.name, + binproviders=binary.binproviders, + overrides=native_overrides or None, + extra_context={ + "plugin_name": "archivebox", + "hook_name": "archivebox_binary_run", + "output_dir": str(binary.output_dir), + "binary_id": str(binary.id), + "machine_id": str(binary.machine_id), + **override_extra_context, + }, + ), + ).now(first_result=True) + finally: + await bus.wait_until_idle() + await binary_process_service.flush_missing_finalizers() + await process_service.flush_completed() + + +def run_binary(binary_id: str) -> None: + asyncio.run(_run_binary(binary_id)) + + +def queued_plugins_and_hooks_for_snapshot(snapshot_id: str) -> tuple[list[str] | None, dict[str, set[str] | None] | None]: + from archivebox.core.models import ArchiveResult + + queued_results = list( + ArchiveResult.objects.filter( + snapshot_id=snapshot_id, + status=ArchiveResult.StatusChoices.QUEUED, + ) + .exclude(plugin="") + .only("id", "plugin", "hook_name"), + ) + + selected_hooks_by_plugin: dict[str, set[str] | None] = {} + queued_plugins = sorted({result.plugin for result in queued_results}) + for result in queued_results: + # hook_name is the modern scheduler identity. Empty hook_name rows are + # legacy plugin-level work and must keep running the whole plugin. 
+ if not result.hook_name: + selected_hooks_by_plugin[result.plugin] = None + elif result.plugin not in selected_hooks_by_plugin: + selected_hooks_by_plugin[result.plugin] = {result.hook_name} + elif selected_hooks_by_plugin[result.plugin] is not None: + selected_hooks_by_plugin[result.plugin].add(result.hook_name) + if queued_plugins: + return queued_plugins, selected_hooks_by_plugin + return None, None + + +def queued_plugins_for_snapshot(snapshot_id: str) -> list[str] | None: + queued_plugins, _selected_hooks_by_plugin = queued_plugins_and_hooks_for_snapshot(snapshot_id) + return queued_plugins + + +def fail_unavailable_queued_hooks( + snapshot_id: str, + selected_hooks_by_plugin: dict[str, set[str] | None], + plugins: dict[str, Plugin], +) -> None: + from archivebox.core.models import ArchiveResult + + now = timezone.now() + for plugin_name, selected_hook_names in selected_hooks_by_plugin.items(): + if selected_hook_names is None or plugin_name not in plugins: + continue + available_hook_names = { + name for hook in plugins[plugin_name].filter_hooks("Snapshot") for name in (hook.name, Path(hook.name).stem) + } + missing_hook_names = [hook_name for hook_name in selected_hook_names if hook_name not in available_hook_names] + if not missing_hook_names: + continue + # Hook-level resume rows are durable scheduler state. If a plugin is + # installed but no longer exposes a queued hook, mark that row failed so + # the snapshot is not retried forever with no hook left to execute. 
+ ArchiveResult.objects.filter( + snapshot_id=snapshot_id, + plugin=plugin_name, + hook_name__in=missing_hook_names, + status=ArchiveResult.StatusChoices.QUEUED, + ).update( + status=ArchiveResult.StatusChoices.FAILED, + start_ts=now, + end_ts=now, + output_str="Queued hook is no longer available in the installed plugin", + ) + + +def include_background_prerequisite_hooks( + selected_hooks_by_plugin: dict[str, set[str] | None], + plugins: dict[str, Plugin], +) -> dict[str, set[str] | None]: + expanded: dict[str, set[str] | None] = {} + for plugin_name, selected_hook_names in selected_hooks_by_plugin.items(): + if selected_hook_names is None or plugin_name not in plugins: + expanded[plugin_name] = selected_hook_names + continue + plugin_hooks = sorted(plugins[plugin_name].filter_hooks("Snapshot"), key=lambda hook: hook.sort_key) + selected_sort_keys = [ + hook.sort_key for hook in plugin_hooks if hook.name in selected_hook_names or Path(hook.name).stem in selected_hook_names + ] + if not selected_sort_keys: + expanded[plugin_name] = set(selected_hook_names) + continue + first_selected_sort_key = min(selected_sort_keys) + expanded_hook_names = set(selected_hook_names) + # Earlier background hooks publish live resources (e.g. Chrome tabs) + # needed by later foreground hooks, but completed foreground hooks stay + # final and are not rerun during hook-level resume. 
+ for hook in plugin_hooks: + if hook.is_background and hook.sort_key < first_selected_sort_key: + expanded_hook_names.add(hook.name) + expanded_hook_names.add(Path(hook.name).stem) + expanded[plugin_name] = expanded_hook_names + return expanded + + +def snapshot_hooks_for_pending_archiveresults(snapshot) -> list[tuple[str, str]]: + from archivebox.config.common import get_config + + config = get_config(crawl=snapshot.crawl, snapshot=snapshot) + snapshot_plugin_names = [name.strip() for name in str((snapshot.config or {}).get("PLUGINS") or "").split(",") if name.strip()] + crawl_plugin_names = [name.strip() for name in str((snapshot.crawl.config or {}).get("PLUGINS") or "").split(",") if name.strip()] + config_plugin_names = [name.strip() for name in str(config.PLUGINS or "").split(",") if name.strip()] + plugin_names = snapshot_plugin_names or crawl_plugin_names or config_plugin_names + plugins = ( + filter_plugins(_discover_archivebox_plugins(), plugin_names, include_providers=True) + if plugin_names + else _discover_archivebox_plugins() + ) + return sorted((plugin.name, hook.name) for plugin in plugins.values() for hook in plugin.filter_hooks("Snapshot")) + + +def run_snapshot_maintenance(snapshot_id: str, *, output_dir: Path | None = None) -> bool: + from archivebox.core.models import ArchiveResult, Snapshot + + snapshot = Snapshot.objects.select_related("crawl", "crawl__created_by").filter(id=snapshot_id).first() + if snapshot is None: + return False + + has_queued_results = snapshot.archiveresult_set.filter(status=ArchiveResult.StatusChoices.QUEUED).exists() + # retry_at is the scheduler signal for both lifecycle work and targeted + # maintenance. Filesystem migration/json rewriting is independent from + # queued ArchiveResult rows, so run it whenever this helper is called. 
+ # The only thing queued rows change is the next scheduler value: + # - no queued rows left: clear retry_at because maintenance is done + # - queued rows remain: leave the Snapshot due so the sealed/paused runner + # branch can process those targeted plugin rows on the next tick + # This avoids reopening final/paused snapshots while also avoiding stranded + # queued ArchiveResults that have no independent scheduler. + current_retry_at = snapshot.retry_at + next_retry_at = timezone.now() if has_queued_results else None + snapshot.retry_at = next_retry_at + if snapshot.fs_migration_needed: + snapshot.save(update_fields=["retry_at", "modified_at"]) + else: + updated = snapshot.safe_update( + {"retry_at": next_retry_at}, + refresh=False, + extra_filter={ + "status": snapshot.StatusChoices.SEALED, + "retry_at": current_retry_at, + }, + ) + if not updated: + return False + snapshot.write_index_jsonl(output_dir=output_dir) + return True + + +def run_due_crawl(crawl, *, lock_seconds: int, interactive_interrupts: bool = False) -> bool: + try: + crawl.refresh_from_db(fields=["status", "retry_at", "modified_at"]) + except type(crawl).DoesNotExist: + return False + + if crawl.is_paused: + _runner_console_line(crawl=crawl, status="PAUSED") + return True + if crawl.status in (crawl.StatusChoices.QUEUED, crawl.StatusChoices.STARTED): + from archivebox.core.models import Snapshot + + now = timezone.now() + snapshot_count = crawl.snapshot_set.count() + due_active_snapshots = crawl.snapshot_set.filter( + status__in=Snapshot.RUNNABLE_STATES, + retry_at__lte=now, + ).exists() + if snapshot_count and due_active_snapshots: + # Child Snapshot rows own active work. Do not rewrite the parent + # row unless it is still the same STARTED row we selected; this + # avoids hot-looping on the parent while child work is ready without + # resurrecting a user cancellation that sealed the crawl after + # selection. 
+ crawl.safe_update( + { + "status": crawl.StatusChoices.STARTED, + "retry_at": now + timedelta(seconds=ACTIVE_STATE_LEASE_SECONDS), + "modified_at": now, + }, + refresh=False, + extra_filter={"status": crawl.StatusChoices.STARTED}, + ) + return True + if snapshot_count and not due_active_snapshots: + if crawl.is_finished(): + if not crawl.claim_processing_lock(lock_seconds=lock_seconds): + return False + crawl.refresh_from_db() + crawl.sm.tick() + return True + + # retry_at is the only queue/ownership signal the runner sees. + # Clearing it on an unfinished crawl hides the row forever, so keep + # future snapshots scheduled and repair NULL queued child locks here. + unlocked_children = crawl.snapshot_set.filter( + status=Snapshot.StatusChoices.QUEUED, + retry_at__isnull=True, + ).update( + retry_at=now, + modified_at=now, + ) + if unlocked_children: + crawl.update_and_requeue(status=crawl.StatusChoices.STARTED, retry_at=now) + return True + + next_snapshot_retry = ( + crawl.snapshot_set.filter( + status__in=Snapshot.OPEN_STATES, + retry_at__gt=now, + ) + .order_by("retry_at", "created_at") + .values_list("retry_at", flat=True) + .first() + ) + crawl.update_and_requeue( + status=crawl.StatusChoices.STARTED, + retry_at=next_snapshot_retry or now + timedelta(seconds=10), + ) + return True + if not crawl.claim_processing_lock(lock_seconds=lock_seconds): + return False + crawl.refresh_from_db() + if crawl.status == crawl.StatusChoices.STARTED and crawl.is_finished(): + crawl.sm.tick() + return True + _runner_console_line(crawl=crawl) + run_crawl(str(crawl.id), process_discovered_snapshots_inline=True, interactive_interrupts=interactive_interrupts) + return True + + if crawl.status == crawl.StatusChoices.SEALED: + if not type(crawl).claim_for_worker(crawl, lock_seconds=lock_seconds): + return False + _runner_console_line(crawl=crawl, status="SEALED") + crawl.cleanup() + crawl.update_and_requeue(retry_at=None) + return True + + crawl.update_and_requeue(retry_at=None) + 
return True + + +def run_due_snapshot(snapshot, *, lock_seconds: int, interactive_interrupts: bool = False, runtime_config=None) -> bool: + from archivebox.core.models import Snapshot + + try: + snapshot = Snapshot.objects.get(pk=snapshot.pk) + except Snapshot.DoesNotExist: + return False + parent_reconciled = snapshot.reconcile_parent_lifecycle(lock_seconds=lock_seconds) + if parent_reconciled is not None: + return parent_reconciled + + if snapshot.is_paused: + selected_plugins = queued_plugins_for_snapshot(str(snapshot.id)) + if snapshot.fs_migration_needed and Snapshot.claim_for_worker(snapshot, lock_seconds=lock_seconds): + _runner_console_line(crawl_id=snapshot.crawl_id, snapshot=snapshot) + run_snapshot_maintenance(str(snapshot.id)) + if not selected_plugins: + # No targeted plugin rows remain, so put paused snapshots back + # behind the indefinite retry_at marker. If queued plugin rows + # remain, continue into the targeted plugin path below and let + # its finally block restore the paused marker after completion. + snapshot.restore_paused_scheduler_marker() + return True + snapshot.refresh_from_db() + if not selected_plugins: + # Paused is a real lifecycle state; retry_at=MAX is only the + # orchestrator selection marker. If a direct maintenance/update + # command bumps retry_at on a paused snapshot but there are no + # targeted ArchiveResult rows to run, restore the scheduler marker + # without changing status. + snapshot.restore_paused_scheduler_marker() + return True + if not Snapshot.claim_for_worker(snapshot, lock_seconds=lock_seconds): + return False + try: + _runner_console_line(crawl_id=snapshot.crawl_id, snapshot=snapshot) + # Explicit maintenance, e.g. `archivebox update --index-only`, may + # need to run search/index hooks for a paused snapshot. That should + # not resume the crawl or make unrelated queued work runnable, so + # selected_plugins is required and the paused state is restored in + # the finally block below. 
+ run_crawl( + str(snapshot.crawl_id), + snapshot_ids=[str(snapshot.id)], + selected_plugins=selected_plugins, + process_discovered_snapshots_inline=True, + interactive_interrupts=interactive_interrupts, + selected_plugins_are_explicit=False, + ) + finally: + # Targeted plugin rows can complete while the Snapshot remains + # paused. Put retry_at back at MAX so the orchestrator leaves the + # paused lifecycle alone until an explicit resume transition. + snapshot.restore_paused_scheduler_marker() + return True + if snapshot.status == Snapshot.StatusChoices.SEALED: + if not Snapshot.claim_for_worker(snapshot, lock_seconds=lock_seconds): + return False + snapshot.refresh_from_db() + snapshot.finalize_completed_upload_results() + maintenance_ran = False + if snapshot.fs_migration_needed: + # Final snapshots can still need filesystem/index maintenance after + # a data-dir migration, but queued ArchiveResult rows are the actual + # runnable work. Do the metadata rewrite first, then continue into + # the targeted plugin path in the same tick so large migrations do + # not starve search/index backfills behind a full maintenance pass. 
+ maintenance_ran = run_snapshot_maintenance(str(snapshot.id)) + snapshot.refresh_from_db() + selected_plugins = queued_plugins_for_snapshot(str(snapshot.id)) + if selected_plugins: + search_only_plugins = all(plugin.startswith("search_backend_") for plugin in selected_plugins) + _runner_console_line(crawl_id=snapshot.crawl_id, snapshot=snapshot) + run_crawl( + str(snapshot.crawl_id), + snapshot_ids=[str(snapshot.id)], + selected_plugins=selected_plugins, + process_discovered_snapshots_inline=True, + interactive_interrupts=interactive_interrupts, + selected_plugins_are_explicit=False, + ) + if search_only_plugins: + from archivebox.core.models import ArchiveResult + + has_queued_results = ArchiveResult.objects.filter( + snapshot_id=snapshot.id, + status=ArchiveResult.StatusChoices.QUEUED, + ).exists() + if not has_queued_results: + type(snapshot).objects.filter( + pk=snapshot.pk, + status=snapshot.StatusChoices.SEALED, + ).update( + retry_at=None, + modified_at=timezone.now(), + ) + return True + if maintenance_ran: + return True + return run_snapshot_maintenance(str(snapshot.id)) + + if snapshot.status == Snapshot.StatusChoices.STARTED: + _reset_count, running_count = snapshot.reset_abandoned_results() + if running_count: + snapshot.update_and_requeue(retry_at=timezone.now() + timedelta(seconds=ACTIVE_STATE_LEASE_SECONDS)) + return True + + if not snapshot.claim_processing_lock(lock_seconds=lock_seconds): + return False + snapshot.refresh_from_db() + if snapshot.status == Snapshot.StatusChoices.QUEUED: + if snapshot.archiveresult_set.exists() and snapshot.is_finished_processing(): + snapshot.sm.tick() + snapshot.refresh_from_db() + if snapshot.status == Snapshot.StatusChoices.SEALED: + _runner_console_line(crawl_id=snapshot.crawl_id, snapshot=snapshot, status="SEALED") + return True + # The runner owns queued Snapshot setup. Create missing enabled hook + # rows before ticking so maintenance-only final rows, e.g. 
search + # backfill on a paused snapshot, cannot make queued -> sealed skip the + # real extraction work after resume. + snapshot.create_pending_archiveresults(hooks=snapshot_hooks_for_pending_archiveresults(snapshot)) + snapshot.sm.tick() + snapshot.refresh_from_db() + if snapshot.status == Snapshot.StatusChoices.SEALED: + _runner_console_line(crawl_id=snapshot.crawl_id, snapshot=snapshot, status="SEALED") + return True + if snapshot.status == Snapshot.StatusChoices.STARTED and snapshot.archiveresult_set.exists() and snapshot.is_finished_processing(): + snapshot.sm.tick() + snapshot.refresh_from_db() + if snapshot.status == Snapshot.StatusChoices.SEALED: + _runner_console_line(crawl_id=snapshot.crawl_id, snapshot=snapshot, status="SEALED") + return True + _runner_console_line(crawl_id=snapshot.crawl_id, snapshot=snapshot) + run_crawl( + str(snapshot.crawl_id), + snapshot_ids=[str(snapshot.id)], + selected_plugins=queued_plugins_for_snapshot(str(snapshot.id)), + process_discovered_snapshots_inline=True, + interactive_interrupts=interactive_interrupts, + selected_plugins_are_explicit=False, + ) + snapshot.refresh_from_db() + if queued_plugins_for_snapshot(str(snapshot.id)): + # Hook-level resume work is tracked by queued ArchiveResult rows, not by + # the Snapshot lease. If a partial pass returns with rows still queued, + # wake the Snapshot immediately so takeover does not wait out a stale + # active-state lock before running the remaining hooks. 
+ snapshot.update_and_requeue(retry_at=timezone.now()) + return True + + +def run_due_binary(binary, *, lock_seconds: int) -> bool: + binary_name = str(binary.name or "") + binary_path = Path(binary_name).expanduser() + if (binary_path.is_absolute() or binary_name.startswith("~")) and not binary_path.exists(): + binary.retry_at = None + binary.save(update_fields=["retry_at", "modified_at"]) + return True + if not binary.claim_processing_lock(lock_seconds=lock_seconds): + return False + run_binary(str(binary.id)) + return True + + +async def _run_install(plugin_names: list[str] | None = None) -> None: + from archivebox.config.common import get_config + from archivebox.machine.models import Machine + + plugins = _discover_archivebox_plugins() + config = get_config(include_machine=False) + machine = await sync_to_async(Machine.current, thread_sensitive=True)() + derived_config = normalize_runtime_config(machine.config) + config = config.for_crawl() + config = normalize_runtime_config(config) + bus = create_bus(name="ArchiveBox_install", total_timeout=3600.0) + PersistedProcessService(bus) + ArchiveBoxBinaryService(bus) + BinaryCacheService(bus, backend=ArchiveBoxDBBinaryCacheBackend()) + BinaryService(bus) + TagService(bus) + ArchiveResultService(bus) + MachineService(bus) + await _emit_machine_config(bus, config=config, derived_config=derived_config) + live_stream = None + bus_destroyed = False + + try: + selected_plugins = filter_plugins(plugins, list(plugin_names), include_providers=True) if plugin_names else plugins + if not selected_plugins: + return + plugins_label = ", ".join(plugin_names) if plugin_names else f"all ({len(plugins)} available)" + timeout_seconds = config["TIMEOUT"] + stdout_is_tty = sys.stdout.isatty() + stderr_is_tty = sys.stderr.isatty() + interactive_tty = stdout_is_tty or stderr_is_tty + ui_console = None + live_ui = None + + if interactive_tty: + stream = sys.stderr if stderr_is_tty else sys.stdout + if os.path.exists("/dev/tty"): + try: + 
live_stream = open("/dev/tty", "w", buffering=1, encoding=stream.encoding or "utf-8") + stream = live_stream + except OSError: + live_stream = None + try: + terminal_size = os.get_terminal_size(stream.fileno()) + terminal_width = terminal_size.columns + terminal_height = terminal_size.lines + except (AttributeError, OSError, ValueError): + terminal_size = shutil.get_terminal_size(fallback=(160, 40)) + terminal_width = terminal_size.columns + terminal_height = terminal_size.lines + ui_console = Console( + file=stream, + force_terminal=True, + width=terminal_width, + height=terminal_height, + _environ={ + "COLUMNS": str(terminal_width), + "LINES": str(terminal_height), + }, + ) + + with TemporaryDirectory(prefix="archivebox-install-") as temp_dir: + output_dir = Path(temp_dir) + if ui_console is not None: + live_ui = LiveBusUI( + bus, + total_hooks=_count_selected_hooks(selected_plugins, None), + timeout_seconds=timeout_seconds, + ui_console=ui_console, + interactive_tty=interactive_tty, + ) + live_ui.print_intro( + url="install", + output_dir=output_dir, + plugins_label=plugins_label, + ) + with live_ui if live_ui is not None else nullcontext(): + try: + await abx_install_plugins( + plugin_names=plugin_names, + plugins=plugins, + output_dir=output_dir, + config_overrides=config, + derived_config_overrides=derived_config, + emit_jsonl=False, + bus=bus, + MachineService=None, + BinaryCacheService=None, + BinaryService=None, + ) + finally: + try: + await bus.wait_until_idle() + finally: + await bus.destroy(clear=False) + bus_destroyed = True + if live_ui is not None: + live_ui.print_summary(output_dir=output_dir) + finally: + if not bus_destroyed: + await bus.destroy(clear=False) + try: + if live_stream is not None: + live_stream.close() + except Exception: + pass + + +def run_install(*, plugin_names: list[str] | None = None) -> None: + asyncio.run(_run_install(plugin_names=plugin_names)) + + +def _first_due_id(queryset): + return queryset.order_by("retry_at", 
"created_at").values_list("id", flat=True).first() + + +def _run_due_crawl_status(status: str, *, crawl_id: str | None, lock_seconds: int, interactive_interrupts: bool) -> bool: + from archivebox.crawls.models import Crawl + + due_crawls = Crawl.objects.filter( + retry_at__lte=timezone.now(), + status=status, + ) + if crawl_id: + due_crawls = due_crawls.filter(id=crawl_id) + due_crawl_id = _first_due_id(due_crawls) + if due_crawl_id is None: + return False + due_crawl = Crawl.objects.filter(id=due_crawl_id).first() + if due_crawl is None: + return True + run_due_crawl( + due_crawl, + lock_seconds=lock_seconds, + interactive_interrupts=interactive_interrupts, + ) + return True + + +def _run_due_snapshot_query(queryset, *, lock_seconds: int, interactive_interrupts: bool, runtime_config) -> bool: + due_snapshot_id = _first_due_id(queryset) + return _run_due_snapshot_id( + due_snapshot_id, + lock_seconds=lock_seconds, + interactive_interrupts=interactive_interrupts, + runtime_config=runtime_config, + ) + + +def _run_due_snapshot_id(snapshot_id, *, lock_seconds: int, interactive_interrupts: bool, runtime_config) -> bool: + from archivebox.core.models import Snapshot + + due_snapshot_id = snapshot_id + if due_snapshot_id is None: + return False + due_snapshot = Snapshot.objects.filter(id=due_snapshot_id).first() + if due_snapshot is None: + return True + run_due_snapshot( + due_snapshot, + lock_seconds=lock_seconds, + interactive_interrupts=interactive_interrupts, + runtime_config=runtime_config, + ) + return True + + +def _run_due_queued_plugin_result( + plugin_names: frozenset[str], + *, + crawl_id: str | None, + lock_seconds: int, + interactive_interrupts: bool, + runtime_config, + batch_size: int = QUEUED_PLUGIN_RESULT_BATCH_SIZE, +) -> bool: + from archivebox.core.models import ArchiveResult, Snapshot + from django.db.models import Exists, OuterRef + + if not plugin_names: + return False + now = timezone.now() + queued_results = ArchiveResult.objects.filter( + 
snapshot_id=OuterRef("pk"), + status=ArchiveResult.StatusChoices.QUEUED, + plugin__in=plugin_names, + ) + first_due_query = ( + ArchiveResult.objects.filter( + status=ArchiveResult.StatusChoices.QUEUED, + plugin__in=plugin_names, + snapshot__retry_at__lte=now, + snapshot__status=Snapshot.StatusChoices.SEALED, + ) + .filter(**({"snapshot__crawl_id": crawl_id} if crawl_id else {})) + .values("snapshot_id", "snapshot__crawl_id")[:1] + ) + first_due_results = list(first_due_query) + if not first_due_results: + return False + root_crawl_id = str(first_due_results[0]["snapshot__crawl_id"]) + + due_snapshots = Snapshot.objects.filter( + retry_at__lte=now, + status=Snapshot.StatusChoices.SEALED, + ).filter(Exists(queued_results)) + if crawl_id: + due_snapshots = due_snapshots.filter(crawl_id=crawl_id) + batch_candidates = list( + # The crawl picker above starts from enabled queued ArchiveResult rows + # and uses a sliced LIMIT 1. Do not use QuerySet.first() here: it adds + # ordering and can turn this hot scheduler check into a temp-sort over + # hundreds of thousands of plugin rows. Once a crawl is selected, + # sibling order is irrelevant; the crawl_id/status index can fetch this + # small local batch directly while EXISTS proves the enabled queued + # plugin rows via the existing ArchiveResult unique index. 
+ due_snapshots.filter(crawl_id=root_crawl_id).order_by()[:batch_size], + ) + if not batch_candidates: + return False + + selected_plugins: list[str] | None = None + claimed_snapshot_ids: list[str] = [] + for snapshot in batch_candidates: + snapshot_selected_plugins = [ + plugin_name for plugin_name in (queued_plugins_for_snapshot(str(snapshot.id)) or []) if plugin_name in plugin_names + ] + if not snapshot_selected_plugins: + continue + if selected_plugins is None: + selected_plugins = snapshot_selected_plugins + if snapshot_selected_plugins != selected_plugins: + continue + claimed = Snapshot.claim_for_worker(snapshot, lock_seconds=lock_seconds) + if not claimed: + continue + snapshot.refresh_from_db() + snapshot.finalize_completed_upload_results() + if snapshot.fs_migration_needed: + run_snapshot_maintenance(str(snapshot.id)) + snapshot.refresh_from_db() + if snapshot.status != Snapshot.StatusChoices.SEALED: + continue + claimed_snapshot_ids.append(str(snapshot.id)) + _runner_console_line(crawl_id=snapshot.crawl_id, snapshot=snapshot) + + if not claimed_snapshot_ids or selected_plugins is None: + return True + + config_overrides = { + "CRAWL_MAX_CONCURRENT_SNAPSHOTS": batch_size, + } + for plugin_name in selected_plugins: + if plugin_name.startswith("search_backend_"): + config_overrides[f"{plugin_name.upper()}_ENABLED"] = True + + run_crawl( + root_crawl_id, + snapshot_ids=claimed_snapshot_ids, + selected_plugins=selected_plugins, + process_discovered_snapshots_inline=True, + interactive_interrupts=interactive_interrupts, + config_overrides=config_overrides, + selected_plugins_are_explicit=False, + ) + if all(plugin.startswith("search_backend_") for plugin in selected_plugins): + queued_results = ArchiveResult.objects.filter( + snapshot_id=OuterRef("pk"), + status=ArchiveResult.StatusChoices.QUEUED, + plugin__in=selected_plugins, + ) + Snapshot.objects.filter( + id__in=claimed_snapshot_ids, + status=Snapshot.StatusChoices.SEALED, + ).annotate( + 
has_queued_results=Exists(queued_results), + ).filter( + has_queued_results=False, + ).update( + retry_at=None, + modified_at=timezone.now(), + ) + return True + + +def _run_due_binary() -> bool: + from archivebox.machine.models import Binary + + due_binary_id = ( + Binary.objects.filter(retry_at__lte=timezone.now()) + .exclude(status=Binary.StatusChoices.INSTALLED) + .order_by("retry_at", "created_at") + .values_list("id", flat=True) + .first() + ) + if due_binary_id is None: + return False + due_binary = Binary.objects.filter(id=due_binary_id).first() + if due_binary is None: + return True + run_due_binary(due_binary, lock_seconds=60) + return True + + +def _fast_forward_same_path_snapshot_fs_versions(batch_size: int = 10000) -> bool: + from django.db import connection + + from archivebox.core.models import Snapshot, ArchiveResult + + now = timezone.now() + current_version = Snapshot._fs_current_version() + same_path_versions = ("0.9.0", "0.9.1", "0.9.2", "0.9.3") + with connection.cursor() as cursor: + cursor.execute( + """ + UPDATE core_snapshot + SET fs_version = %s, + retry_at = CASE + WHEN EXISTS ( + SELECT 1 + FROM core_archiveresult + WHERE core_archiveresult.snapshot_id = core_snapshot.id + AND core_archiveresult.status = %s + ) + THEN retry_at + ELSE NULL + END, + modified_at = %s + WHERE id IN ( + SELECT id + FROM core_snapshot + WHERE status = %s + AND retry_at <= %s + AND fs_version IN (%s, %s, %s, %s) + ORDER BY retry_at, created_at + LIMIT %s + ) + """, + [ + current_version, + ArchiveResult.StatusChoices.QUEUED, + now, + Snapshot.StatusChoices.SEALED, + now, + *same_path_versions, + batch_size, + ], + ) + return bool(cursor.rowcount) + + +def run_pending_crawls( + *, + daemon: bool = False, + crawl_id: str | None = None, + maintenance_only: bool = False, + interactive_interrupts: bool = False, + maintenance_batch_size: int = QUEUED_PLUGIN_RESULT_BATCH_SIZE, +) -> int: + from archivebox.config.common import get_config + from archivebox.crawls.models 
import Crawl, CrawlSchedule + from archivebox.core.models import ArchiveResult, Snapshot + from archivebox.plugins.discovery import discover_plugin_configs + from archivebox.plugins.hooks import discover_hooks + from archivebox.machine.models import Process + + crawl_claim_lock_seconds = 10 + runtime_config = get_config() + plugin_configs = discover_plugin_configs() + download_plugin_names = frozenset( + plugin_name + for plugin_name, plugin_config in plugin_configs.items() + if plugin_config.get("output_mimetypes") and not plugin_name.startswith("search_backend_") + ) + last_recovery_at = 0.0 + last_retention_at = 0.0 + last_retention_repair_at = 0.0 + last_analyze_at = 0.0 + analyze_queue: list[str] | None = None + analyze_sweep_started_at = 0.0 + orchestrator_started_at = time.monotonic() + while True: + raise_if_shutdown_requested() + now_monotonic = time.monotonic() + if now_monotonic - last_retention_at >= (60.0 if daemon else 1.0): + for model in (ArchiveResult, Snapshot, Crawl, Process): + # Keep the tight scheduler loop anchored on indexed delete_at + # columns only. Backfilling missing delete_at values has to read + # config JSON for models whose retention policy is scoped to a + # Crawl/Snapshot/Process. That repair is still required for + # correctness, but it belongs in the idle maintenance block + # below, not ahead of every claim attempt. + model.delete_expired(batch_size=100, backfill_missing=False) + last_retention_at = now_monotonic + + if daemon and crawl_id is None: + now = timezone.now() + for schedule in CrawlSchedule.objects.filter(is_enabled=True).select_related("template", "template__created_by"): + if schedule.is_due(now): + schedule.enqueue(queued_at=now) + + # Final-state download rows are always first: they have no parent crawl + # scheduler of their own, and leaving them behind makes the global + # counters report stale queued work while new crawls continue. 
+ if _run_due_queued_plugin_result( + download_plugin_names, + crawl_id=crawl_id, + lock_seconds=60, + interactive_interrupts=interactive_interrupts, + runtime_config=runtime_config, + batch_size=maintenance_batch_size, + ): + continue + + if _fast_forward_same_path_snapshot_fs_versions(): + continue + + if not maintenance_only: + active_snapshots = Snapshot.objects.filter( + retry_at__lte=timezone.now(), + crawl__status__in=Crawl.RUNNABLE_STATES, + status__in=Snapshot.RUNNABLE_STATES, + ) + if crawl_id: + active_snapshots = active_snapshots.filter(crawl_id=crawl_id) + if _run_due_snapshot_query( + active_snapshots, + lock_seconds=60, + interactive_interrupts=interactive_interrupts, + runtime_config=runtime_config, + ): + continue + + if not maintenance_only: + if _run_due_crawl_status( + Crawl.StatusChoices.QUEUED, + crawl_id=crawl_id, + lock_seconds=crawl_claim_lock_seconds, + interactive_interrupts=interactive_interrupts, + ): + continue + + if not maintenance_only: + if _run_due_crawl_status( + Crawl.StatusChoices.STARTED, + crawl_id=crawl_id, + lock_seconds=crawl_claim_lock_seconds, + interactive_interrupts=interactive_interrupts, + ): + continue + + if not maintenance_only: + # Canceled-crawl child sealing is important cleanup, but it must + # not starve live crawl work when a large bulk cancel leaves many + # children due at once. 
+ cancelling_snapshots = Snapshot.objects.filter( + retry_at__lte=timezone.now(), + crawl__status=Crawl.StatusChoices.SEALED, + status=Snapshot.StatusChoices.STARTED, + ) + if crawl_id: + cancelling_snapshots = cancelling_snapshots.filter(crawl_id=crawl_id) + if _run_due_snapshot_query( + cancelling_snapshots, + lock_seconds=60, + interactive_interrupts=interactive_interrupts, + runtime_config=runtime_config, + ): + continue + + if not maintenance_only: + pausing_snapshots = Snapshot.objects.filter( + retry_at__lte=timezone.now(), + crawl__status=Crawl.StatusChoices.PAUSED, + status__in=Snapshot.RUNNABLE_STATES, + ) + if crawl_id: + pausing_snapshots = pausing_snapshots.filter(crawl_id=crawl_id) + if _run_due_snapshot_query( + pausing_snapshots, + lock_seconds=60, + interactive_interrupts=interactive_interrupts, + runtime_config=runtime_config, + ): + continue + + # Final active-state fallback uses only the retry_at scheduler index and + # selects an id first. Keep final SEALED rows out of this broad path so + # large filesystem/index backfills cannot starve newly queued crawls. + due_snapshots = Snapshot.objects.filter( + retry_at__lte=timezone.now(), + status__in=Snapshot.OPEN_STATES, + ) + if maintenance_only: + due_snapshots = due_snapshots.filter(status=Snapshot.StatusChoices.PAUSED) + if crawl_id: + due_snapshots = due_snapshots.filter(crawl_id=crawl_id) + if _run_due_snapshot_query( + due_snapshots, + lock_seconds=60, + interactive_interrupts=interactive_interrupts, + runtime_config=runtime_config, + ): + continue + + # Search backend selection is live crawl-execution config, not an + # installed-plugin list. Old queued rows for a backend that is disabled + # by the current Machine/Crawl/Snapshot config must remain queued so + # they can run if the user re-enables that backend, but they should not + # launch a standalone hook process just to skip after imports/config + # hydration. 
Refreshing here preserves mid-run config edits while using + # the same enabled-hook discovery path that created ArchiveResult rows. + runtime_config = get_config() + search_plugin_names = frozenset( + hook.parent.name for hook in discover_hooks("Snapshot", config=runtime_config) if hook.parent.name.startswith("search_backend_") + ) + if _run_due_queued_plugin_result( + search_plugin_names, + crawl_id=crawl_id, + lock_seconds=60, + interactive_interrupts=interactive_interrupts, + runtime_config=runtime_config, + batch_size=maintenance_batch_size, + ): + continue + + # Broad final-state maintenance is intentionally a fallback. Specific + # queued plugin work above can use ArchiveResult's scheduler indexes; + # this branch may need to prove that no due sealed snapshot remains, so + # avoid paying that scan while targeted work is already available. + sealed_snapshots = Snapshot.objects.filter( + retry_at__lte=timezone.now(), + status=Snapshot.StatusChoices.SEALED, + ) + if search_plugin_names: + queued_search_snapshot_ids = ArchiveResult.objects.filter( + status=ArchiveResult.StatusChoices.QUEUED, + plugin__in=search_plugin_names, + ).values("snapshot_id") + sealed_snapshots = sealed_snapshots.exclude( + id__in=queued_search_snapshot_ids, + ) + if crawl_id: + sealed_snapshots = sealed_snapshots.filter(crawl_id=crawl_id) + if _run_due_snapshot_query( + sealed_snapshots, + lock_seconds=60, + interactive_interrupts=interactive_interrupts, + runtime_config=runtime_config, + ): + continue + + if not maintenance_only: + if _run_due_crawl_status( + Crawl.StatusChoices.SEALED, + crawl_id=crawl_id, + lock_seconds=crawl_claim_lock_seconds, + interactive_interrupts=interactive_interrupts, + ): + continue + + if crawl_id is None and not maintenance_only: + if _run_due_binary(): + continue + + now_monotonic = time.monotonic() + if now_monotonic - last_retention_repair_at >= (60.0 if daemon else 0.0): + for model in (ArchiveResult, Snapshot, Crawl, Process): + # No runnable work 
was found on this scheduler pass. This is + # the bounded repair point for missing retention deadlines, + # including ArchiveResult rows intentionally saved without + # delete_at in the plugin-result hot path. Running it here keeps + # DELETE_AFTER resolution fresh without making every hook event + # load parent Snapshot/Crawl config. + model.delete_expired(batch_size=100, backfill_missing=True) + last_retention_repair_at = now_monotonic + + if daemon: + now_monotonic = time.monotonic() + if now_monotonic - last_recovery_at >= 30.0: + from archivebox.core.recovery_util import recover_orchestrator_state + + recover_orchestrator_state() + last_recovery_at = now_monotonic + # SQLite query plans degrade as the snapshot/archiveresult tables grow + # past their last ANALYZE — stale stats make the optimizer start large + # joins from auth_user/crawl instead of using the url index, blowing the + # snapshot detail page out to ~500ms. Refresh stats at most once per + # 24hr while the queue is idle, and only after the orchestrator has + # been alive for at least an hour so short server boots / one-off work + # never pay the cost. The sweep is batched one table per idle tick; + # individual table ANALYZE statements abort after 2min (progress + # handler) and the whole sweep is hard-capped at 5min so a + # pathological table cannot wedge maintenance forever. Any failure + # inside the maintenance hook is swallowed — orchestrator must never + # be taken down by stats refresh. + try: + if ( + analyze_queue is None + and now_monotonic - orchestrator_started_at >= 3600.0 + and now_monotonic - last_analyze_at >= 86400.0 + ): + analyze_sweep_started_at = now_monotonic + analyze_queue = run_db_analyze_batch(None) + elif analyze_queue and now_monotonic - analyze_sweep_started_at >= 300.0: + # Sweep blew past the 5min hard cap — abandon what's left + # and don't retry until the next 24hr window. 
+ analyze_queue = None + last_analyze_at = now_monotonic + elif analyze_queue: + analyze_queue = run_db_analyze_batch(analyze_queue) + if analyze_queue is not None and not analyze_queue: + analyze_queue = None + last_analyze_at = now_monotonic + except Exception: + analyze_queue = None + last_analyze_at = now_monotonic + time.sleep(2.0) + continue + return 0 diff --git a/archivebox/services/snapshot_service.py b/archivebox/services/snapshot_service.py new file mode 100644 index 0000000000..8ae63c825b --- /dev/null +++ b/archivebox/services/snapshot_service.py @@ -0,0 +1,105 @@ +from __future__ import annotations + +import sys + +from asgiref.sync import sync_to_async +from django.utils import timezone +from django.core.exceptions import ValidationError +from rich import print as rprint +from abx_dl.events import SnapshotCompletedEvent, SnapshotEvent +from abx_dl.limits import CrawlLimitState +from abx_dl.services.base import BaseService + + +def finalize_completed_snapshot( + snapshot_id: str, + *, + output_dir=None, + crawl_limit_stop_reason: str | None = None, +) -> None: + from archivebox.core.models import Snapshot + + snapshot = Snapshot.objects.select_related("crawl", "crawl__created_by").filter(id=snapshot_id).first() + if snapshot is None: + return + + if snapshot.downloaded_at is None: + snapshot.downloaded_at = timezone.now() + snapshot.save(update_fields=["downloaded_at", "modified_at"]) + + stop_reason = crawl_limit_stop_reason if crawl_limit_stop_reason is not None else _crawl_limit_stop_reason(snapshot.crawl) + if snapshot.crawl_id and stop_reason in ("crawl_max_size", "crawl_timeout"): + Snapshot.objects.filter( + crawl_id=snapshot.crawl_id, + status=Snapshot.StatusChoices.QUEUED, + ).exclude(id=snapshot.id).update( + status=Snapshot.StatusChoices.SEALED, + retry_at=None, + modified_at=timezone.now(), + ) + + if snapshot.status == Snapshot.StatusChoices.QUEUED: + snapshot.sm.tick() + snapshot.refresh_from_db() + if snapshot.status == 
Snapshot.StatusChoices.STARTED and snapshot.is_finished_processing(): + snapshot.sm.seal() + snapshot.refresh_from_db() + + snapshot.write_index_jsonl(output_dir=output_dir) + + +def _crawl_limit_stop_reason(crawl) -> str: + from archivebox.config.common import get_config + + config_model = get_config(crawl=crawl) + config = config_model.for_crawl_runtime( + crawl=crawl, + persona=crawl.resolve_persona(), + ) + return CrawlLimitState.from_config(config).get_stop_reason() + + +class SnapshotService(BaseService): + LISTENS_TO = [SnapshotEvent, SnapshotCompletedEvent] + EMITS = [] + + def __init__(self, bus, *, crawl_id: str, schedule_snapshot): + self.crawl_id = crawl_id + self.schedule_snapshot = schedule_snapshot + super().__init__(bus) + self.bus.on(SnapshotEvent, self.on_SnapshotEvent) + self.bus.on(SnapshotCompletedEvent, self.on_SnapshotCompletedEvent) + + async def on_SnapshotEvent(self, event: SnapshotEvent) -> None: + from archivebox.core.models import Snapshot + + snapshot = await Snapshot.objects.filter(id=event.snapshot_id, crawl_id=self.crawl_id).afirst() + + if snapshot is not None: + if snapshot.is_paused: + return + if snapshot.status == Snapshot.StatusChoices.QUEUED: + try: + await sync_to_async(snapshot.sm.tick, thread_sensitive=True)() + except ValidationError as err: + if "ArchiveBox cannot archive its own admin, web, api, or snapshot URLs." 
not in str(err): + raise + await Snapshot.objects.filter(id=snapshot.id).aupdate( + status=Snapshot.StatusChoices.SEALED, + retry_at=None, + modified_at=timezone.now(), + ) + rprint( + f"[red][X] Refusing to archive ArchiveBox internal URL for security: {snapshot.url}[/red]", + file=sys.stderr, + ) + return + await sync_to_async(snapshot.refresh_from_db, thread_sensitive=True)() + elif snapshot.status != Snapshot.StatusChoices.STARTED: + return + if snapshot.status != Snapshot.StatusChoices.STARTED: + return + await sync_to_async(snapshot.ensure_crawl_symlink, thread_sensitive=True)() + + async def on_SnapshotCompletedEvent(self, event: SnapshotCompletedEvent) -> None: + await sync_to_async(finalize_completed_snapshot, thread_sensitive=True)(event.snapshot_id) diff --git a/archivebox/services/tag_service.py b/archivebox/services/tag_service.py new file mode 100644 index 0000000000..22d6685dc7 --- /dev/null +++ b/archivebox/services/tag_service.py @@ -0,0 +1,22 @@ +from __future__ import annotations + +from abx_dl.events import TagEvent +from abx_dl.services.base import BaseService + + +class TagService(BaseService): + LISTENS_TO = [TagEvent] + EMITS = [] + + def __init__(self, bus): + super().__init__(bus) + self.bus.on(TagEvent, self.on_TagEvent__save_to_db) + + async def on_TagEvent__save_to_db(self, event: TagEvent) -> None: + from archivebox.core.models import Snapshot, SnapshotTag, Tag + + snapshot = await Snapshot.objects.filter(id=event.snapshot_id).afirst() + if snapshot is None: + return + tag, _ = await Tag.objects.aget_or_create(name=event.name) + await SnapshotTag.objects.aget_or_create(snapshot=snapshot, tag=tag) diff --git a/archivebox/static b/archivebox/static new file mode 120000 index 0000000000..5d01044d31 --- /dev/null +++ b/archivebox/static @@ -0,0 +1 @@ +templates/static \ No newline at end of file diff --git a/archivebox/templates/admin/actions.html b/archivebox/templates/admin/actions.html new file mode 100644 index 0000000000..9015f8b2f6 
--- /dev/null +++ b/archivebox/templates/admin/actions.html @@ -0,0 +1,52 @@ +{% load i18n core_tags %} +
    +
    + {% block actions %} + {% block actions-form %} + {% for field in action_form %} + {% if field.name == "tags" %} + {{ field }} + {% else %} + {% if field.label %}{% else %}{{ field }}{% endif %} + {% endif %} + {% endfor %} + {% endblock %} + {% block actions-submit %} + + {% endblock %} + {% block actions-counter %} + {% if actions_selection_counter %} + + + 0 + / + {{ cl.result_list|length|intcomma }} + selected + + + {% if cl.opts.model_name == 'snapshot' %} + + {% if cl.full_result_count and cl.full_result_count != cl.result_count %} + {{ cl.result_count|intcomma }} + / + {{ cl.full_result_count|intcomma }} + total + {% else %} + {{ cl.result_count|intcomma }} + total + {% endif %} + + {% endif %} + {% if cl.result_count != cl.result_list|length %} + + + + {% endif %} + + {% endif %} + {% endblock %} + {% endblock %} +
    +
    diff --git a/archivebox/templates/admin/app_index.html b/archivebox/templates/admin/app_index.html new file mode 100644 index 0000000000..6868b497dd --- /dev/null +++ b/archivebox/templates/admin/app_index.html @@ -0,0 +1,18 @@ +{% extends "admin/index.html" %} +{% load i18n %} + +{% block bodyclass %}{{ block.super }} app-{{ app_label }}{% endblock %} + +{% if not is_popup %} +{% block breadcrumbs %} + +{% endblock %} +{% endif %} + +{% block sidebar %}{% endblock %} diff --git a/archivebox/templates/admin/auth/user/change_form.html b/archivebox/templates/admin/auth/user/change_form.html new file mode 100644 index 0000000000..b825a830f1 --- /dev/null +++ b/archivebox/templates/admin/auth/user/change_form.html @@ -0,0 +1,47 @@ +{% extends "admin/change_form.html" %} +{% load core_tags %} + +{% block extrastyle %} +{{ block.super }} + +{% endblock %} + +{% block object-tools-items %} +{% if original %} + {% api_token as api_token %} +
  • + + RSS + Snapshot Feed + +
  • +{% endif %} +{{ block.super }} +{% endblock %} diff --git a/archivebox/templates/admin/base.html b/archivebox/templates/admin/base.html new file mode 100644 index 0000000000..16ea008b5e --- /dev/null +++ b/archivebox/templates/admin/base.html @@ -0,0 +1,2472 @@ +{% load i18n static tz core_tags %} +{% get_current_language as LANGUAGE_CODE %} +{% get_current_language_bidi as LANGUAGE_BIDI %} + + + + + {% block title %}Home{% endblock %} | ArchiveBox + + {% block blockbots %} + + {% endblock %} + + + {% api_token as api_token %} + + {% block extrastyle %} + + {% endblock %} + + {% if LANGUAGE_BIDI %} + + {% endif %} + + {% block responsive %} + + + {% if LANGUAGE_BIDI %} + + {% endif %} + {% endblock %} + + + + + + + + + + {% block extrahead %}{% endblock %} + + + + + {% system_warnings_banner %} + {% include 'progressbar.html' %} + +
    + {% if not is_popup %} + + + {% if has_permission %} + {% include 'progressmonitor/progress_monitor.html' %} + {% endif %} + + {% block breadcrumbs %} + + {% endblock %} + {% endif %} + + {% block messages %} + {% if messages %} +
      + {% for message in messages %} + {{ message|capfirst }} + {% endfor %} +
    + {% endif %} + {% endblock messages %} + +
    + {% block pretitle %}{% endblock %} + {% block content_title %}{# {% if title %}

    {{ title }}

    {% endif %} #}{% endblock %} + {% block content %} + {% block object-tools %}{% endblock %} + {{ content }} + {% endblock %} + {% block sidebar %}{% endblock %} +
    +
    + + {% block footer %}{% endblock %} +
    + + {% comment %} + {% if user.is_authenticated and user.is_superuser and CAN_UPGRADE %} + + {% endif %} + {% endcomment %} + + + + + diff --git a/archivebox/templates/admin/change_list.html b/archivebox/templates/admin/change_list.html new file mode 100644 index 0000000000..3c711d9511 --- /dev/null +++ b/archivebox/templates/admin/change_list.html @@ -0,0 +1,82 @@ +{% extends "admin/base_site.html" %} +{% load i18n admin_urls static admin_list %} + +{% block title %}{% if cl.formset and cl.formset.errors %}{% translate "Error:" %} {% endif %}{{ block.super }}{% endblock %} +{% block extrastyle %} + {{ block.super }} + + {% if cl.formset %} + + {% endif %} + {% if cl.formset or action_form %} + + {% endif %} + {{ media.css }} + +{% endblock %} + +{% block extrahead %} +{{ block.super }} +{{ media.js }} + +{% endblock %} + +{% block bodyclass %}{{ block.super }} app-{{ opts.app_label }} model-{{ opts.model_name }} change-list{% if embedded_changelist %} embedded-change-list{% endif %}{% endblock %} + +{% if not is_popup %} +{% block breadcrumbs %} + +{% endblock %} +{% endif %} + +{% block coltype %}{% endblock %} + +{% block content %} +
    + {% block object-tools %} + {% if not embedded_changelist %} +
      + {% block object-tools-items %} + {% change_list_object_tools %} + {% endblock %} +
    + {% endif %} + {% endblock %} + {% if cl.formset and cl.formset.errors %} +

    + {% blocktranslate count counter=cl.formset.total_error_count %}Please correct the error below.{% plural %}Please correct the errors below.{% endblocktranslate %} +

    + {{ cl.formset.non_form_errors }} + {% endif %} + {% include "admin/change_list_panel.html" %} +
    +{% endblock %} diff --git a/archivebox/templates/admin/change_list_panel.html b/archivebox/templates/admin/change_list_panel.html new file mode 100644 index 0000000000..edea90a5e2 --- /dev/null +++ b/archivebox/templates/admin/change_list_panel.html @@ -0,0 +1,107 @@ +{% load i18n admin_list core_tags %} + +{% if cl.model_admin.show_search_mode_selector %} + {% with current_search_mode=cl.params.search_mode|default:cl.model_admin.get_default_search_mode %} +
    + {% endwith %} +{% else %} +
    +{% endif %} +
    +
    + {% search_form cl %} + {% if cl.date_hierarchy %}{% date_hierarchy cl %}{% endif %} + +
    {% csrf_token %} + {% if cl.formset %} +
    {{ cl.formset.management_form }}
    + {% endif %} + + {% if action_form and actions_on_top and cl.show_admin_actions %}{% admin_actions %}{% endif %} + {% if cl.snapshot_is_grid_view %} + {% snapshots_grid cl %} + {% else %} + {% result_list cl %} + {% endif %} + {% if action_form and actions_on_bottom and cl.show_admin_actions %}{% admin_actions %}{% endif %} + +
    +
    +
    + {% if cl.has_filters and not embedded_changelist %} +
    +

    + {% translate 'Filter' %} + +

    + {% if cl.is_facets_optional or cl.has_active_filters %}
    + {% if cl.is_facets_optional %}

    + {% if cl.add_facets %}{% translate "Hide counts" %} + {% else %}{% translate "Show counts" %}{% endif %} +

    {% endif %} + {% if cl.has_active_filters %}

    + ✖ {% translate "Clear all filters" %} +

    {% endif %} +
    {% endif %} + {% for spec in cl.filter_specs %}{% admin_list_filter cl spec %}{% endfor %} +
    + {% endif %} +
    + +{% if cl.has_filters and not embedded_changelist %} + +{% endif %} +{% include "admin/snapshot_search_stream.html" %} diff --git a/archivebox/templates/admin/change_list_results.html b/archivebox/templates/admin/change_list_results.html new file mode 100644 index 0000000000..2328c115e4 --- /dev/null +++ b/archivebox/templates/admin/change_list_results.html @@ -0,0 +1,62 @@ +{% load i18n core_tags %} +{% if result_hidden_fields %} +
    {# DIV for HTML validation #} +{% for item in result_hidden_fields %}{{ item }}{% endfor %} +
    +{% endif %} +{% if results %} +
    + +{% if cl.opts.model_name == "snapshot" %} ++ + + + + +{% elif cl.opts.model_name == "crawl" %} ++ + + + + + +{% endif %} + + +{% for header in result_headers %} +{% endfor %} + + + +{% for result in results %} +{% if result.form and result.form.non_field_errors %} + +{% endif %} +{% with row_obj=cl.result_list|index:forloop.counter0 %} +{% for item in result %}{{ item }}{% endfor %} +{% endwith %} +{% endfor %} + +
    + {% if header.sortable and header.sort_priority > 0 %} +
    + + {% if num_sorted_fields > 1 %}{{ header.sort_priority }}{% endif %} + +
    + {% endif %} +
    {% if header.sortable %}{{ header.text|capfirst }}{% else %}{{ header.text|capfirst }}{% endif %}
    +
    +
    {{ result.form.non_field_errors }}
    +
    +{% elif cl.show_search_index_hint %} +
    +

    + 0 results from {{ cl.search_mode }}. + If this looks wrong, the search index may need to be updated: + archivebox update --index-only +

    +
    +{% endif %} diff --git a/archivebox/templates/admin/core/archiveresult/change_list.html b/archivebox/templates/admin/core/archiveresult/change_list.html new file mode 100644 index 0000000000..ab7d70569f --- /dev/null +++ b/archivebox/templates/admin/core/archiveresult/change_list.html @@ -0,0 +1,142 @@ +{% extends "admin/base_site.html" %} +{% load i18n admin_urls static admin_list %} + +{% block title %}{% if cl.formset and cl.formset.errors %}{% translate "Error:" %} {% endif %}{{ block.super }}{% endblock %} +{% block extrastyle %} + {{ block.super }} + + {% if cl.formset %} + + {% endif %} + {% if cl.formset or action_form %} + + {% endif %} + {{ media.css }} + {% if not actions_on_top and not actions_on_bottom %} + + {% endif %} +{% endblock %} + +{% block extrahead %} +{{ block.super }} +{{ media.js }} +{% endblock %} + +{% block bodyclass %}{{ block.super }} app-{{ opts.app_label }} model-{{ opts.model_name }} change-list{% endblock %} + +{% if not is_popup %} +{% block breadcrumbs %} + +{% endblock %} +{% endif %} + +{% block coltype %}{% endblock %} + +{% block content %} +
    + {% block object-tools %} +
      + {% block object-tools-items %} + {% change_list_object_tools %} + {% endblock %} +
    + {% endblock %} + {% if cl.formset and cl.formset.errors %} +

    + {% if cl.formset.total_error_count == 1 %}{% translate "Please correct the error below." %}{% else %}{% translate "Please correct the errors below." %}{% endif %} +

    + {{ cl.formset.non_form_errors }} + {% endif %} +
    +
    +
    + {% block search %}{% search_form cl %}{% endblock %} + {% block date_hierarchy %}{% if cl.date_hierarchy %}{% date_hierarchy cl %}{% endif %}{% endblock %} + +
    {% csrf_token %} + {% if cl.formset %} +
    {{ cl.formset.management_form }}
    + {% endif %} + + {% block result_list %} + {% if action_form and actions_on_top and cl.show_admin_actions %}{% admin_actions %}{% endif %} + {% result_list cl %} + {% if action_form and actions_on_bottom and cl.show_admin_actions %}{% admin_actions %}{% endif %} + {% endblock %} + {% block pagination %} + + {% endblock %} +
    +
    +
    + {% if cl.has_filters %} +
    +

    + {% translate 'Filter' %} + +

    + {% if cl.has_active_filters %}

    + ✖ {% translate "Clear all filters" %} +

    {% endif %} + {% for spec in cl.filter_specs %}{% admin_list_filter cl spec %}{% endfor %} +
    + {% endif %} +
    +
    + {% if cl.has_filters %} + + {% endif %} +{% endblock %} diff --git a/archivebox/templates/admin/core/tag/change_form.html b/archivebox/templates/admin/core/tag/change_form.html new file mode 100644 index 0000000000..3c24f485fc --- /dev/null +++ b/archivebox/templates/admin/core/tag/change_form.html @@ -0,0 +1,268 @@ +{% extends "admin/change_form.html" %} + +{% block bodyclass %}{{ block.super }} app-core model-tag tag-form-page{% endblock %} + +{% block extrastyle %} +{{ block.super }} + +{% endblock %} + +{% block form_top %} +
    +
    +

    {% if add %}New Tag{% else %}Edit Tag{% endif %}

    +

    Similar tags are shown below while typing.

    +
    +
    +
    + Matches + Current tags +
    +
    + Links + Open filtered snapshots +
    +
    +
    +{{ block.super }} +{% endblock %} + +{% block after_field_sets %} +{{ block.super }} +
    +

    Similar Tags

    +

    Updates while typing.

    +
    +
    + +{{ tag_similar_cards|json_script:"abx-tag-similar-data" }} + + +{% endblock %} diff --git a/archivebox/templates/admin/core/tag/change_list.html b/archivebox/templates/admin/core/tag/change_list.html new file mode 100644 index 0000000000..ef3aa553c0 --- /dev/null +++ b/archivebox/templates/admin/core/tag/change_list.html @@ -0,0 +1,916 @@ +{% extends "admin/base_site.html" %} + +{% block bodyclass %}{{ block.super }} app-core model-tag change-list tag-admin-page{% endblock %} + +{% block object-tools %}{% endblock %} + +{% block extrastyle %} +{{ block.super }} + +{% endblock %} + +{% block content %} +
    +
    +
    + + +
    +
    + + + +
    +
    + +
    +
    + {% csrf_token %} +
    + + +
    +
    +
    +
    + +
    +
    +
    Loading tags...
    +
    +
    +
    + +{{ initial_tag_cards|json_script:"abx-tag-cards-data" }} + + +{% endblock %} diff --git a/archivebox/templates/admin/crawls/crawl/change_form.html b/archivebox/templates/admin/crawls/crawl/change_form.html new file mode 100644 index 0000000000..fb3452680e --- /dev/null +++ b/archivebox/templates/admin/crawls/crawl/change_form.html @@ -0,0 +1,49 @@ +{% extends "admin/change_form.html" %} +{% load add_preserved_filters from admin_urls %} + +{% block object-tools-items %} +{% if original %} +
  • + + Stop reason: + {% if crawl_stop_reason %} + {{ crawl_stop_reason }} + {% else %} + none + {% endif %} + + {% if original.status != "sealed" and not original.is_paused %} +
    + {% csrf_token %} + + + + +
    + {% endif %} + {% if original.status == "sealed" or original.is_paused %} +
    + {% csrf_token %} + + + + +
    + {% endif %} +
  • +{% endif %} +{% for tool in objectactions %} +
  • + {% url tools_view_name pk=object_id tool=tool.name as action_url %} + {% include 'django_object_actions/action_trigger.html' %} +
  • +{% endfor %} +{{ block.super }} +{% endblock %} + +{% block content %} +{{ block.super }} +{% if crawl_snapshots_changelist %} +
    {{ crawl_snapshots_changelist }}
    +{% endif %} +{% endblock %} diff --git a/archivebox/templates/admin/crawls/crawl/snapshots_changelist.html b/archivebox/templates/admin/crawls/crawl/snapshots_changelist.html new file mode 100644 index 0000000000..925d243b97 --- /dev/null +++ b/archivebox/templates/admin/crawls/crawl/snapshots_changelist.html @@ -0,0 +1,9 @@ +{% load i18n %} + +
    +
    + {% translate "Snapshots in this crawl" %} + {% translate "Open full changelist" %} +
    + {% include "admin/change_list_panel.html" with changelist_form_action=snapshot_changelist_url %} +
    diff --git a/archivebox/templates/admin/index.html b/archivebox/templates/admin/index.html new file mode 100644 index 0000000000..dd4ab44538 --- /dev/null +++ b/archivebox/templates/admin/index.html @@ -0,0 +1,670 @@ +{% extends "admin/base_site.html" %} +{% load i18n static log %} + +{% block extrastyle %} +{{ block.super }} + + +{% endblock %} + +{% block coltype %}colMS{% endblock %} +{% block bodyclass %}{{ block.super }} dashboard abx-admin-dashboard-page{% endblock %} +{% block nav-breadcrumbs %}{% endblock %} +{% block nav-sidebar %}{% endblock %} + +{% block content %} +{% if app_label %} +
    + {% include "admin/app_list.html" with app_list=app_list show_changelinks=True %} +
    +{% else %} +
    +
    +

    {{ title|default:_("Admin Views") }}

    +
    + + {% if app_list %} +
    +
    +
    +
    +

    {% translate "Main" %}

    +
    + +
    +
    + {% for app in app_list %} + {% for model in app.models %} + {% if model.object_name == "Crawl" or model.object_name == "CrawlSchedule" %} + {% include "admin/index_model_card.html" with model=model tone="main" %} + {% endif %} + {% endfor %} + {% endfor %} +
    +
    + {% for app in app_list %} + {% for model in app.models %} + {% if model.object_name == "Snapshot" %} + {% include "admin/index_model_card.html" with model=model tone="main" %} + {% endif %} + {% endfor %} + {% endfor %} +
    +
    + {% for app in app_list %} + {% for model in app.models %} + {% if model.object_name == "ArchiveResult" %} + {% include "admin/index_model_card.html" with model=model tone="main" %} + {% endif %} + {% endfor %} + {% endfor %} +
    +
    + +
    + {% for app in app_list %} + {% for model in app.models %} + {% if model.object_name == "Tag" %} + {% include "admin/index_model_card.html" with model=model tone="main" %} + {% endif %} + {% endfor %} + {% endfor %} + {% for app in app_list %} + {% for model in app.models %} + {% if model.object_name == "Persona" %} + {% include "admin/index_model_card.html" with model=model tone="main" %} + {% endif %} + {% endfor %} + {% endfor %} + {% for app in app_list %} + {% for model in app.models %} + {% if model.object_name == "User" %} + {% include "admin/index_model_card.html" with model=model tone="main" %} + {% endif %} + {% endfor %} + {% endfor %} +
    +
    + +
    +
    +

    {% translate "Environment" %}

    +
    +
    + {% for app in app_list %} + {% if app.name == "Environment" %} + {% for model in app.models %} + {% if model.name == "Configuration" %} + {% include "admin/index_model_card.html" with model=model tone="environment" %} + {% endif %} + {% endfor %} + {% for model in app.models %} + {% if model.name == "Dependencies" %} + {% include "admin/index_model_card.html" with model=model tone="environment" %} + {% endif %} + {% endfor %} + {% for model in app.models %} + {% if model.name == "Plugins" %} + {% include "admin/index_model_card.html" with model=model tone="environment" %} + {% endif %} + {% endfor %} + {% for model in app.models %} + {% if model.name == "Workers" %} + {% include "admin/index_model_card.html" with model=model tone="environment" %} + {% endif %} + {% endfor %} + {% for model in app.models %} + {% if model.name == "Logs" %} + {% include "admin/index_model_card.html" with model=model tone="environment" %} + {% endif %} + {% endfor %} + {% endif %} + {% endfor %} +
    +
    + +
    +
    +

    {% translate "Machine" %}

    +
    +
    + {% for app in app_list %} + {% for model in app.models %} + {% if model.object_name == "Machine" %} + {% include "admin/index_model_card.html" with model=model tone="machine" %} + {% endif %} + {% endfor %} + {% endfor %} + {% for app in app_list %} + {% for model in app.models %} + {% if model.object_name == "NetworkInterface" %} + {% include "admin/index_model_card.html" with model=model tone="machine" %} + {% endif %} + {% endfor %} + {% endfor %} + {% for app in app_list %} + {% for model in app.models %} + {% if model.object_name == "Binary" %} + {% include "admin/index_model_card.html" with model=model tone="machine" %} + {% endif %} + {% endfor %} + {% endfor %} + {% for app in app_list %} + {% for model in app.models %} + {% if model.object_name == "Process" %} + {% include "admin/index_model_card.html" with model=model tone="machine" %} + {% endif %} + {% endfor %} + {% endfor %} +
    +
    + +
    +
    +

    {% translate "REST API" %}

    +
    +
    + {% for app in app_list %} + {% for model in app.models %} + {% if model.object_name == "APIToken" or model.object_name == "OutboundWebhook" or model.object_name == "Webhook" %} + {% include "admin/index_model_card.html" with model=model tone="advanced" %} + {% endif %} + {% endfor %} + {% endfor %} +
    +
    +
    + + +
    + {% else %} +

    {% translate "You do not have permission to view or edit anything." %}

    + {% endif %} +
    +{% endif %} +{% endblock %} + +{% block sidebar %}{% endblock %} diff --git a/archivebox/templates/admin/index_model_card.html b/archivebox/templates/admin/index_model_card.html new file mode 100644 index 0000000000..c65489f80a --- /dev/null +++ b/archivebox/templates/admin/index_model_card.html @@ -0,0 +1,123 @@ +{% load i18n %} + +
    + {% if model.admin_url %} + + {% else %} +
    + {% endif %} + + + + + {% if model.object_name == "Persona" %} + {% translate "Personas & Configs" %} + {% elif model.object_name == "User" %} + {% translate "Admin Users" %} + {% elif model.name == "Configuration" %} + {% translate "Admin Configuration" %} + {% else %} + {{ model.name }} + {% endif %} + + {% if model.object_count_label %} + {{ model.object_count_label }} + {% endif %} + + + {% if model.object_name == "Crawl" %} + Create and monitor URL collection jobs. + {% elif model.object_name == "CrawlSchedule" %} + Recurring crawl definitions and timers. + {% elif model.object_name == "Snapshot" %} + Archived pages and page-level metadata. + {% elif model.object_name == "ArchiveResult" %} + Extractor outputs generated per snapshot. + {% elif model.object_name == "Tag" %} + Labels, saved groupings, and collection filters. + {% elif model.object_name == "Persona" %} + Browser profiles, cookies, and crawl identity. + {% elif model.object_name == "User" %} + Admin users, permissions, and access. + {% elif model.object_name == "Machine" %} + Host identity and runtime configuration. + {% elif model.object_name == "NetworkInterface" %} + Network interfaces and endpoint metadata. + {% elif model.object_name == "Binary" %} + Installed tools discovered by ArchiveBox. + {% elif model.object_name == "Process" %} + Hook, binary, and crawl process history. + {% elif model.object_name == "APIToken" %} + API credentials for automation. + {% elif model.object_name == "OutboundWebhook" or model.object_name == "Webhook" %} + Outbound webhook integrations. + {% elif model.name == "Configuration" %} + Live settings and plugin configuration. + {% elif model.name == "Dependencies" %} + Runtime binaries and installed packages. + {% elif model.name == "Plugins" %} + Enabled extractors and integration plugins. + {% elif model.name == "Workers" %} + Background worker and queue state. + {% elif model.name == "Logs" %} + Runtime logs and diagnostics. 
+ {% else %} + {{ model.object_name|default:model.name }} + {% endif %} + + + {% if model.admin_url %} + + {% else %} +
    + {% endif %} + {% if model.object_name == "Crawl" %} + + {% endif %} +
    diff --git a/archivebox/templates/admin/login.html b/archivebox/templates/admin/login.html new file mode 100644 index 0000000000..df5fc77d9e --- /dev/null +++ b/archivebox/templates/admin/login.html @@ -0,0 +1,92 @@ +{% extends "admin/base_site.html" %} +{% load i18n static core_tags %} + +{% block extrastyle %}{{ block.super }} +{{ form.media }} +{% endblock %} + +{% block bodyclass %}{{ block.super }} login{% endblock %} + + +{% block branding %}

    ArchiveBox Admin

    {% endblock %} +
    +
    +{% block usertools %} +{% endblock %} + +{% block nav-global %}{% endblock %} + +{% block content_title %} +
    + Log in to add, edit, and remove links from your archive. +


    +
    +{% endblock %} + +{% block breadcrumbs %}{% endblock %} + +{% block content %} +{% if form.errors and not form.non_field_errors %} +

    +{% if form.errors.items|length == 1 %}{% trans "Please correct the error below." %}{% else %}{% trans "Please correct the errors below." %}{% endif %} +

    +{% endif %} + +{% if form.non_field_errors %} +{% for error in form.non_field_errors %} +

    + {{ error }} +

    +{% endfor %} +{% endif %} + +
    + +{% if user.is_authenticated %} +

    +{% blocktrans trimmed %} + You are authenticated as {{ username }}, but are not authorized to + access this page. Would you like to login to a different account? +{% endblocktrans %} +

    +{% endif %} + +
    +
    {% csrf_token %} +
    + {{ form.username.errors }} + {{ form.username.label_tag }} {{ form.username }} +
    +
    + {{ form.password.errors }} + {{ form.password.label_tag }} {{ form.password }} + +
    +
    + +
    +
    + +
    +

    +
    +
    + To create a new admin user or reset a password, run:
    +
    +cd data/   # run commands inside your data folder
    +archivebox manage createsuperuser <username>
    +archivebox manage changepassword <username>
    +
    +
    +
    +
    + {% has_real_admin_users as real_admins_exist %} + {% if not real_admins_exist %} + (or set env vars ADMIN_USERNAME + ADMIN_PASSWORD) + {% endif %} +
    +
    + + +
    +{% endblock %} diff --git a/archivebox/templates/admin/personas/persona/change_form.html b/archivebox/templates/admin/personas/persona/change_form.html new file mode 100644 index 0000000000..efe74ff1c0 --- /dev/null +++ b/archivebox/templates/admin/personas/persona/change_form.html @@ -0,0 +1,272 @@ +{% extends "admin/change_form.html" %} +{% load static %} + +{% block bodyclass %}{{ block.super }} app-personas model-persona{% endblock %} + +{% block extrastyle %} +{{ block.super }} + + +{% endblock %} + +{% block extrahead %} +{{ block.super }} + +{% endblock %} + +{% block form_top %} +
    +
    +

    Create a new Persona (Authentication + Configuration Profile)

    +

    + Import cookies and settings from any Chrome-based browser by pasting in its Profile Path from chrome://version, + or by pasting in a CDP url from + chrome://inspect#remote-debugging (e.g. http://127.0.0.1:9222). +

    +
    +
    +{{ block.super }} +{% endblock %} + +{% block after_field_sets %} +{{ block.super }} +
    +

    Plugin Config

    +

    These typed controls update the same Persona config JSON shown above. Shared config keys stay synced across plugin sections.

    + {% include "plugins/plugin_config_grid.html" with plugin_groups=adminform.form.plugin_groups %} +
    +{% endblock %} diff --git a/archivebox/templates/admin/search_form.html b/archivebox/templates/admin/search_form.html new file mode 100644 index 0000000000..dc5c3ae302 --- /dev/null +++ b/archivebox/templates/admin/search_form.html @@ -0,0 +1,82 @@ +{% load i18n static %} +{% if cl.search_fields %} +
    +

    {% blocktranslate with name=cl.opts.verbose_name_plural %}Search {{ name }}{% endblocktranslate %}

    +
    +{% if cl.opts.model_name == 'snapshot' and not cl.embedded_changelist %} + {% if cl.snapshot_is_grid_view %} + + {% else %} + + {% endif %} +{% endif %} +{% if cl.embedded_changelist %} + +{% else %} + +{% endif %} +{% if cl.has_filters and not cl.embedded_changelist %} + +{% endif %} +
    +
    +{% endif %} diff --git a/archivebox/templates/admin/snapshot_search_stream.html b/archivebox/templates/admin/snapshot_search_stream.html new file mode 100644 index 0000000000..7b699e6143 --- /dev/null +++ b/archivebox/templates/admin/snapshot_search_stream.html @@ -0,0 +1,184 @@ +{% if opts.app_label == "core" and opts.model_name == "snapshot" and not embedded_changelist %} + +{% endif %} diff --git a/archivebox/templates/admin/snapshots_grid.html b/archivebox/templates/admin/snapshots_grid.html new file mode 100644 index 0000000000..88c21d1348 --- /dev/null +++ b/archivebox/templates/admin/snapshots_grid.html @@ -0,0 +1,266 @@ +{% load i18n admin_urls static admin_list %} +{% load core_tags %} + +{% block extrastyle %} + + +{% endblock %} + +{% block content %} +
    + {% for obj in results %} +
    +
    + + + {{obj.bookmarked_at}} + + {{ obj.archive_size | file_size }} +
    + + {{obj.url}} + +
    + {% if obj.status == 'started' %} +
    + + Archiving... +
    + {% else %} + {{ obj.icons|safe }} + {% endif %} +
    +
    + {% endfor %} +
    +
    +{% endblock %} diff --git a/archivebox/templates/admin/submit_line.html b/archivebox/templates/admin/submit_line.html new file mode 100644 index 0000000000..7e33bec1dc --- /dev/null +++ b/archivebox/templates/admin/submit_line.html @@ -0,0 +1,16 @@ +{% load i18n admin_urls %} +
    +{% block submit-row %} +{% if show_save and not original %}{% endif %} +{% if show_save_as_new %}{% endif %} +{% if show_save_and_continue %}{% endif %} +{% if show_close %} + {% url opts|admin_urlname:'changelist' as changelist_url %} + {% translate 'Close' %} +{% endif %} +{% if show_delete_link and original %} + {% url opts|admin_urlname:'delete' original.pk|admin_urlquote as delete_url %} + {% translate "Delete" %} +{% endif %} +{% endblock %} +
    diff --git a/archivebox/templates/core/add.html b/archivebox/templates/core/add.html new file mode 100644 index 0000000000..21232e5c50 --- /dev/null +++ b/archivebox/templates/core/add.html @@ -0,0 +1,1390 @@ +{% extends "core/base.html" %} + +{% load static %} +{% load i18n %} + +{% block breadcrumbs %} + +{% endblock %} + +{% block extra_head %} + +{% endblock %} + +{% block body %} +
    +
    + {% if stdout %} +

    Add new URLs to your archive: results

    +
    +                {{ stdout | safe }}
    +                

    +
    +
    +
    +   Add more URLs ➕ +
    + {% else %} + +
    {% csrf_token %} +
    +

    Create a new Crawl

    +

    + A Crawl is a job that processes URLs and creates Snapshots (archived copies) for each URL discovered. +
    The settings below apply to the entire crawl and all snapshots it creates. +

    +
    +
    +

    + + 💬 Crawl with AI   |   + Chrome Extension  Get the extension + + 💡 Tip: Instantly save a single URL by visiting: + {{ web_base_url }}/web/https://example.com/url_to_save +

    +
    + + +
    +
    +
    +
    +
    + {{ form.url.label_tag }} +
    0 URLs detected
    +
    +
    + + {{ form.url }} +
    +
    + +
    + {% if form.url.errors %} +
    {{ form.url.errors }}
    + {% endif %} +
    + +
    +
    +
    + + {{ form.tag.label_tag }} +
    + {{ form.tag }} + {% if form.tag.errors %} +
    {{ form.tag.errors }}
    + {% endif %} +
    Tags will be applied to all snapshots created by this crawl.
    +
    + +
    +
    + + {{ form.persona.label_tag }} +
    + {{ form.persona }} + {% if form.persona.errors %} +
    {{ form.persona.errors }}
    + {% endif %} +
    + Authentication + configuration settings to use when saving URLs (cookies, user agent, resolution, timeouts, etc.) + {% if can_override_crawl_config %} + Create new profile / import from Chrome -> + {% endif %} +
    +
    + +
    +
    + + {{ form.permissions.label_tag }} +
    + {{ form.permissions }} + {% if form.permissions.errors %} +
    {{ form.permissions.errors }}
    + {% endif %} +
    Public lists it. Unlisted only serves direct links. Private requires admin login.
    +
    +
    + +
    +
    + {{ form.depth.label_tag }} + {{ form.depth }} + {% if form.depth.errors %} +
    {{ form.depth.errors }}
    + {% endif %} +
    Controls how many links deep the crawl will follow from the starting URLs.
    +
    +
    + +
    + {{ form.url_filters }} + {% if form.url_filters.errors %} +
    {{ form.url_filters.errors }}
    + {% endif %} +
    +
    + +
    +
    +
    + + {{ form.max_urls.label_tag }} +
    + {{ form.max_urls }} + {% if form.max_urls.errors %} +
    {{ form.max_urls.errors }}
    + {% endif %} +
    0 = unlimited. Whole numbers, e.g. 25, 300.
    +
    + +
    +
    + + {{ form.crawl_max_size.label_tag }} +
    + {{ form.crawl_max_size }} + {% if form.crawl_max_size.errors %} +
    {{ form.crawl_max_size.errors }}
    + {% endif %} +
    0 = unlimited. Sizes: 45mb, 1.5gb, 2tb.
    +
    + +
    +
    + + {{ form.crawl_timeout.label_tag }} +
    + {{ form.crawl_timeout }} + {% if form.crawl_timeout.errors %} +
    {{ form.crawl_timeout.errors }}
    + {% endif %} +
    0 = unlimited. Must be >10s: 11, 1.5m, 1hr.
    +
    + +
    +
    + + {{ form.timeout.label_tag }} +
    + {{ form.timeout }} + {% if form.timeout.errors %} +
    {{ form.timeout.errors }}
    + {% endif %} +
    Must be >10s: 11, 1.5m, 1hr.
    +
    + +
    +
    + + {{ form.snapshot_max_size.label_tag }} +
    + {{ form.snapshot_max_size }} + {% if form.snapshot_max_size.errors %} +
    {{ form.snapshot_max_size.errors }}
    + {% endif %} +
    0 = unlimited. Sizes: 45mb, 1.5gb, 2tb.
    +
    + +
    +
    + + {{ form.delete_after.label_tag }} +
    + {{ form.delete_after }} + {% if form.delete_after.errors %} +
    {{ form.delete_after.errors }}
    + {% endif %} +
    0 = keep forever. Durations: 1hr, 30d, 3mo.
    +
    + +
    +
    + + {{ form.crawl_max_concurrent_snapshots.label_tag }} +
    + {{ form.crawl_max_concurrent_snapshots }} + {% if form.crawl_max_concurrent_snapshots.errors %} +
    {{ form.crawl_max_concurrent_snapshots.errors }}
    + {% endif %} +
    Whole numbers, e.g. 1, 4, 12.
    +
    +
    + +
    + {{ form.notes.label_tag }} + {{ form.notes }} + {% if form.notes.errors %} +
    {{ form.notes.errors }}
    + {% endif %} +
    Optional description for this crawl (visible in the admin interface).
    +
    + +
    + + {% if can_override_crawl_config %} + +
    +

    Crawl Plugins

    +

    + Select which archiving methods to run for all snapshots in this crawl. If none selected, all available plugins will be used. + View plugin details → +

    + +
    + Presets: + ➕ + {% for persona in recent_personas %} + + + ✎ + + {% endfor %} + + + +
    + + {% include "plugins/plugin_config_grid.html" with plugin_groups=form.plugin_groups %} +
    + + +
    +
    +

    Advanced Crawl Options

    +

    Additional settings that control how this crawl processes URLs and creates snapshots.

    + +
    + {{ form.schedule.label_tag }} + {{ form.schedule }} + {% if form.schedule.errors %} +
    {{ form.schedule.errors }}
    + {% endif %} +
    + Optional: Schedule this crawl to repeat automatically. Examples:
    + daily - Run once per day
    + weekly - Run once per week
    + 0 */6 * * * - Every 6 hours (cron format)
    + 0 0 * * 0 - Every Sunday at midnight (cron format) +
    +
    + +
    + {{ form.start_paused }} + {{ form.start_paused.label_tag }} + {% if form.start_paused.errors %} +
    {{ form.start_paused.errors }}
    + {% endif %} +
    Create the crawl in a paused state. No snapshots will be created until you resume it.
    +
    + +
    + {{ form.config.label_tag }} + {{ form.config }} + {% if form.config.errors %} +
    {{ form.config.errors }}
    + {% endif %} +
    + Override crawl-scoped config options (e.g., TIMEOUT, USER_AGENT, URL_ALLOWLIST, URL_DENYLIST, PLUGINS). *_BINARY paths are managed on Persona or Machine config. +
    +
    +
    +
    + {% endif %} + +
    + +
    +
    +


    + + {% if absolute_add_path %} + + {% endif %} + + {% endif %} +
    +{% endblock %} + +{% block footer %}{% endblock %} + +{% block sidebar %}{% endblock %} diff --git a/archivebox/templates/core/base.html b/archivebox/templates/core/base.html new file mode 100644 index 0000000000..144fc9788b --- /dev/null +++ b/archivebox/templates/core/base.html @@ -0,0 +1,90 @@ +{% load static tz admin_urls core_tags %} + + + + + Archived Sites + + + + + + {% api_token as api_token %} + + + + {% block extra_head %} + {% endblock %} + + + {% system_warnings_banner %} +
    + +
    + {% block body %} + + {% endblock %} +
    + {% block footer %} + + {% endblock %} +
    + + + diff --git a/archivebox/templates/core/index_row.html b/archivebox/templates/core/index_row.html new file mode 100644 index 0000000000..f4c3230959 --- /dev/null +++ b/archivebox/templates/core/index_row.html @@ -0,0 +1 @@ +{% load core_tags %}{% snapshot_index_row link %} diff --git a/archivebox/templates/core/minimal_index.html b/archivebox/templates/core/minimal_index.html new file mode 100644 index 0000000000..356133fdea --- /dev/null +++ b/archivebox/templates/core/minimal_index.html @@ -0,0 +1,28 @@ + +{% load core_tags %} + + + Archived Sites + + + + + + + + + + + + + + + + + {% for link in links %} + {% snapshot_index_row link %} + {% endfor %} + +
    SavedPreviewSnapshot ({{num_links}})TagsStatusFilesSize
    + + diff --git a/archivebox/templates/core/navigation.html b/archivebox/templates/core/navigation.html new file mode 100644 index 0000000000..8f2bd509ea --- /dev/null +++ b/archivebox/templates/core/navigation.html @@ -0,0 +1,40 @@ +{% load i18n static %} + +
    + Add โž•     + Crawls | + Snapshots | + Log | + Tags     + {% if user.is_authenticated and user.is_superuser %} + ๐Ÿ’ฌ AI | + {% endif %} + Docs | + API | + Admin +     + {% if user.is_authenticated %} + {% block welcome-msg %} + {% trans 'User' %} + {% firstof user.get_short_name user.get_username %}     + {% endblock %} + {% block userlinks %} + {% if user.has_usable_password %} + Account / + {% endif %} + {% trans 'Log out' %} + {% endblock %} + {% elif request.COOKIES.archivebox_admin_logged_in == "1" %} + {% comment %} + Authenticated on the admin host but the session cookie is admin-host- + scoped (security boundary โ€” web.* must NEVER see the session). + The hint cookie is the only signal that crosses, so we render the + logged-out state's `Account` / `Log out` links pointing at admin host + so the user can still reach those pages from web.*. + {% endcomment %} + Account / + {% trans 'Log out' %} + {% else %} + {% trans 'Log in' %} + {% endif %} +
    diff --git a/archivebox/templates/core/progressbar.html b/archivebox/templates/core/progressbar.html new file mode 100644 index 0000000000..01884e4c57 --- /dev/null +++ b/archivebox/templates/core/progressbar.html @@ -0,0 +1,50 @@ + + diff --git a/archivebox/templates/core/public_index.html b/archivebox/templates/core/public_index.html new file mode 100644 index 0000000000..11145ed47b --- /dev/null +++ b/archivebox/templates/core/public_index.html @@ -0,0 +1,726 @@ +{% extends "base.html" %} +{% load static tz %} +{% load core_tags %} + +{% block extra_head %} + + +{% endblock %} + +{% block body %} +
    +
    + +
    + {% if public_search_stream_pending %} + Searching... + {% else %} + {{ page_obj.start_index }}-{{ page_obj.end_index }} of {{ page_obj.paginator.count }} + (page {{ page_obj.number }} of {{ page_obj.paginator.num_pages }}) + {% endif %} +
    +
    +
    + + + + + + + + + + + + + + {% for link in object_list %} + {% snapshot_index_row link %} + {% empty %} + + + + {% endfor %} + +
    SavedPreviewSnapshot ({{ page_obj.paginator.count }})TagsStatusFilesSize
    + {% if public_search_stream_pending %} + Searching {{ search_mode }} for matching snapshots... + {% elif show_search_index_hint %} + 0 results from {{ search_mode }}. If this looks wrong, the search index may need to be updated: + archivebox update --index-only + {% else %} + No snapshots found. + {% endif %} +
    +
    +
    + Showing {{ page_obj.start_index }}-{{ page_obj.end_index }} of {{ page_obj.paginator.count }} total +
    + + {% if page_obj.has_previous %} + « first + previous + {% endif %} + + + Page {{ page_obj.number }} of {{ page_obj.paginator.num_pages }} + + + {% if page_obj.has_next %} + next + {% endif %} + {% if page_obj.has_next %} + last » + {% endif %} + +
    +
    + +{% endblock %} diff --git a/archivebox/templates/core/snapshot.html b/archivebox/templates/core/snapshot.html new file mode 100644 index 0000000000..69eedd85bd --- /dev/null +++ b/archivebox/templates/core/snapshot.html @@ -0,0 +1,1788 @@ +{% load static tz core_tags %} + + + + + {{title}} + + + + +
    + {% if snapshot_state == 'queued' or snapshot_state == 'started' or snapshot_state == 'paused' %} +
    + {% include "progressmonitor/progress_monitor.html" with progress_endpoint=progress_endpoint progress_scope="snapshot" %} +
    + {% endif %} + {% if has_outputs %} + + {% else %} +
    +
    + {% if snapshot_state == 'paused' %} +
    โธ๏ธ
    +

    Crawl paused

    +

    This snapshot's crawl is paused — no extractors have run yet.

    +

    Resume the crawl to start archiving content.

    + {% elif snapshot_state == 'started' %} +
    โณ
    +

    Archiving in progress…

    +

    Extractors are running. Results will appear automatically.

    + {% elif snapshot_state == 'queued' %} +
    🕓
    +

    Queued for archiving

    +

    This snapshot is waiting in the queue. Results will appear automatically once a worker picks it up.

    + {% else %} +
    📭
    +

    No archive outputs yet

    +

    This snapshot has no archived content to display.

    + {% endif %} + +
    +
    + {% endif %} +
    + + +
    +
    +
    +
    + {% web_base_url as web_base %} + + Archive Icon + ArchiveBox + +
    +
    + +
    + Favicon + {{title|truncatechars:120|safe}} + {% if title_tags %} + + {{ snapshot_permissions_icon }} {{ snapshot_permissions }} + {% for tag in title_tags %} + {{ tag.name }} + {% endfor %} + + {% else %} + + {{ snapshot_permissions_icon }} {{ snapshot_permissions }} + + {% endif %} + ▾ +
    +
    +
    +
    +
    + {{num_outputs}} + {% if num_failures %} + + {{num_failures}} errors + {% endif %} +
    + + + +
    + {% if related_years %} +
    + {% for entry in related_years %} + {% if entry.snapshots|length > 1 %} +
    + {{ entry.year }} + +
    + {% else %} + + {% endif %} + {% endfor %} +
    + {% endif %} +
    +
    + {% if related_snapshots %} +
    + + + {{ related_snapshots|length }} + {{oldest_archive_date|default:downloaded_datestr|default:bookmarked_date}} + + + +
    + {% else %} + + {{oldest_archive_date|default:downloaded_datestr|default:bookmarked_date}} + + {% endif %} +
    + +
    +
    +
    +
    +
    + + + {% for result in archiveresults %} + {% with display_path=result.path display_url='' preview_url='' %} + {% if display_path %}{% snapshot_url snapshot display_path as display_url %}{% endif %} + {% if display_path %}{% snapshot_preview_url snapshot display_path as preview_url %}{% endif %} +
    +
    +
    + ๐Ÿ“ + {% if display_path %} + โฌ‡๏ธ + {% endif %} +
    + {% if display_path %} + +

    {% plugin_icon result.name %} {{result.name|plugin_name|truncatechars:20}}{% if result.size %} ({{result.size|filesizeformat}}){% endif %}

    +
    + {% else %} +

    {% plugin_icon result.name %} {{result.name|plugin_name|truncatechars:20}}{% if result.size %} ({{result.size|filesizeformat}}){% endif %}

    + {% endif %} + {% if result.result %} + {% with plugin_base=result.name|plugin_name %} + {% if plugin_base == 'ytdlp' or plugin_base == 'yt-dlp' or plugin_base == 'youtube-dl' %} + {% plugin_card result.result %} + {% endif %} + {% endwith %} + {% endif %} +
    + {% if result.result and display_path %} + {% with plugin_base=result.name|plugin_name %} + {% if plugin_base != 'ytdlp' and plugin_base != 'yt-dlp' and plugin_base != 'youtube-dl' %} + {# Use plugin-specific card template when ArchiveResult is available #} +
    + + {% plugin_card result.result %} +
    + {% else %} + {# YT-DLP renders its file list in the body #} + {% endif %} + {% endwith %} + {% elif result.is_metadata and display_path %} +
    + + {% output_card snapshot display_path result.name %} +
    + {% elif display_path %} + {# Fall back to generic iframe for filesystem-discovered files #} +
    + + +
    + {% endif %} +
    + {% endwith %} + {% endfor %} + {% if loose_items %} +
    +
    +
    + ๐Ÿ“ +
    +

    📦 Other files

    +
    + {% for item in loose_items %} + {% if item.is_dir %} + 📁 {{item.name}} + {% else %} + 📄 {{item.name}} + {% endif %} + {% endfor %} +
    +
    +
    + {% endif %} + {% if failed_items %} +
    +
    +
    + ๐Ÿ“ +
    +

    โš ๏ธ Failed

    +
    + {% for item in failed_items %} + {% if item.is_dir %} + 📁 {{item.name}} + {% else %} + 📄 {{item.name}} + {% endif %} + {% endfor %} +
    +
    +
    + {% endif %} +
    +
    +
    + + + + + + diff --git a/archivebox/templates/core/static_index.html b/archivebox/templates/core/static_index.html new file mode 100644 index 0000000000..548132465a --- /dev/null +++ b/archivebox/templates/core/static_index.html @@ -0,0 +1,258 @@ +{% load static core_tags %} + + + + + Archived Sites + + + + + + + + + +
    +
    + +
    +
    + + + + + + + + + + + + + + + {% for link in links %} + {% snapshot_index_row link %} + {% endfor %} + +
    SavedPreviewSnapshot ({{num_links}})TagsStatusFilesSize
    + + + diff --git a/archivebox/templates/core/system_warnings_banner.html b/archivebox/templates/core/system_warnings_banner.html new file mode 100644 index 0000000000..9db4e112e8 --- /dev/null +++ b/archivebox/templates/core/system_warnings_banner.html @@ -0,0 +1,115 @@ +{% comment %} + Fixed red badge that hangs off the top center of the page. Five trigger + conditions (see ``system_warnings_banner`` in core_tags.py); precedence is + config/security first, then host-health: + + * mode="unconfigured" โ€” BASE_URL is not set. Always shown until the + operator pins it explicitly, even when CSRF + auto-derive or request-host fallback are keeping + the server functional. + * mode="unsafe" โ€” server is in a non-subdomain SERVER_SECURITY_MODE. + Archived content shares an origin with the admin UI. + * mode="low_disk" โ€” free space on DATA_DIR's filesystem is below 1GiB. + Archive jobs will fail until the operator frees space. + * mode="high_memory" โ€” virtual memory utilization above 95%; one OOM-kill + from a crash. + * mode="high_load" โ€” 15-min loadavg > 3 ร— cpu_count; saturated host. +{% endcomment %} +{% if mode == "low_disk" %} + +{% elif mode == "high_memory" %} + +{% elif mode == "high_load" %} + +{% elif mode == "unsafe" %} + +{% elif mode == "unconfigured" %} + +{% endif %} diff --git a/archivebox/templates/index.html b/archivebox/templates/index.html deleted file mode 100644 index 264deb4da2..0000000000 --- a/archivebox/templates/index.html +++ /dev/null @@ -1,196 +0,0 @@ - - - - Archived Sites - - - - - - - -
    -
    - Documentation   |   - Source   |   - Website -
    - -
    - Archived Sites - -
    - Last updated $time_updated -
    -
    - - - - - - - - - - $rows -
    BookmarkedSaved Link ($num_links)FilesOriginal URL
    - - - diff --git a/archivebox/templates/index_row.html b/archivebox/templates/index_row.html deleted file mode 100644 index d3174ec013..0000000000 --- a/archivebox/templates/index_row.html +++ /dev/null @@ -1,16 +0,0 @@ - - $bookmarked_date - - - - $title - $tags - - - - ๐Ÿ“„ - $num_outputs - - - $url - diff --git a/archivebox/templates/link_index.html b/archivebox/templates/link_index.html deleted file mode 100644 index 95aa6bb1a0..0000000000 --- a/archivebox/templates/link_index.html +++ /dev/null @@ -1,348 +0,0 @@ - - - - $title - - - - -
    -

    - - Archive Icon - - - โ–พ - - $title
    - - $base_url - -

    -
    - - - - - - - - - diff --git a/archivebox/templates/plugins/plugin_config_grid.html b/archivebox/templates/plugins/plugin_config_grid.html new file mode 100644 index 0000000000..383be367bb --- /dev/null +++ b/archivebox/templates/plugins/plugin_config_grid.html @@ -0,0 +1,760 @@ + +
    +
    + {% for group in plugin_groups %} + {% if group.plugins %} +
    + + + + {% if group.note %} + {{ group.note }} + {% endif %} + {% if group.show_selectors and group.select_all_group %} + + {% endif %} + +
    + {% for plugin in group.plugins %} +
    + +
    + {% if group.show_selectors %} + + + {% else %} +
    + {{ plugin.label }} +
    + {% endif %} + {% if plugin.description %} + + {% endif %} + Source + Docs + +
    +
    + {% if plugin.config_fields %} +
    + {% for field in plugin.config_fields %} +
    + + {% if field.kind == "boolean" %} + + + {% elif field.kind == "select" %} + + {% elif field.kind == "json" %} + + {% else %} + + {% endif %} + {% if field.description %} +
    {{ field.description }}
    + {% endif %} +
    + Current: + + {% if field.current_url %} + {{ field.current }} + {% else %} + {{ field.current }} + {% endif %} + + · Default: {{ field.default }} +
    +
    + {% endfor %} + {% if plugin.required_binary_links %} +
    + Uses: + {% for binary in plugin.required_binary_links %}{{ binary.name }}{% if not forloop.last %}, {% endif %}{% endfor %} +
    + {% endif %} +
    + {% else %} +
    + This plugin has no crawl-configurable options. + {% if plugin.required_binary_links %} +
    + Uses: + {% for binary in plugin.required_binary_links %}{{ binary.name }}{% if not forloop.last %}, {% endif %}{% endfor %} +
    + {% endif %} +
    + {% endif %} +
    +
    + {% endfor %} +
    +
    + {% endif %} + {% endfor %} +
    +
    + diff --git a/archivebox/templates/registration/password_change_form.html b/archivebox/templates/registration/password_change_form.html new file mode 100644 index 0000000000..6aff08740f --- /dev/null +++ b/archivebox/templates/registration/password_change_form.html @@ -0,0 +1,69 @@ +{% extends "admin/base_site.html" %} +{% load i18n static %} + +{% block title %}{% if form.errors %}{% translate "Error:" %} {% endif %}{{ block.super }}{% endblock %} +{% block extrastyle %}{{ block.super }}{% endblock %} +{% block userlinks %} + {% url 'django-admindocs-docroot' as docsroot %}{% if docsroot %}{% translate 'Documentation' %} / {% endif %} {% translate 'Change password' %} / +
    + {% csrf_token %} + +
    + {% include "admin/color_theme_toggle.html" %} +{% endblock %} +{% block breadcrumbs %} + +{% endblock %} + +{% block content %}
    + +
    {% csrf_token %} +
    +{% if form.errors %} +

    + {% blocktranslate count counter=form.errors.items|length %}Please correct the error below.{% plural %}Please correct the errors below.{% endblocktranslate %} +

    +{% endif %} + + +

    {% translate 'Please enter your old password, for security’s sake, and then enter your new password twice so we can verify you typed it in correctly.' %}

    + +
    + +
    + {{ form.old_password.errors }} +
    {{ form.old_password.label_tag }} {{ form.old_password }}
    +
    + +
    + {{ form.new_password1.errors }} +
    {{ form.new_password1.label_tag }} {{ form.new_password1 }}
    + {% if form.new_password1.help_text %} +
    {{ form.new_password1.help_text|safe }}
    + {% endif %} +
    + +
    + {{ form.new_password2.errors }} +
    {{ form.new_password2.label_tag }} {{ form.new_password2 }}
    + {% if form.new_password2.help_text %} +
    {{ form.new_password2.help_text|safe }}
    + {% endif %} +
    + +
    + +
    + + {% if perms.auth.change_user %} + {% translate 'Edit other user settings' %} → + {% endif %} +
    + +
    +
    + +{% endblock %} diff --git a/archivebox/templates/static/add.css b/archivebox/templates/static/add.css new file mode 100755 index 0000000000..55ed039e20 --- /dev/null +++ b/archivebox/templates/static/add.css @@ -0,0 +1,947 @@ +header { + font-family: "Roboto","Lucida Grande","DejaVu Sans","Bitstream Vera Sans",Verdana,Arial,sans-serif; + font-size: 13px; + color: white; + height: 30px; +} +.header-top { + color: white; +} + +.dashboard #content { + width: 100%; + margin-right: 0px; + margin-left: 0px; +} +#submit { + border: 1px solid rgba(0, 0, 0, 0.2); + padding: 10px; + border-radius: 4px; + background-color: #f5dd5d; + color: #333; + font-size: 18px; + font-weight: 800; +} +#add-form button[role="submit"]:hover { + background-color: #e5cd4d; +} +#add-form label { + display: block; + font-size: 16px; +} +#add-form textarea { + width: 100%; + min-height: 300px; +} +#delay-warning div { + border: 1px solid red; + border-radius: 4px; + margin: 10px; + padding: 10px; + font-size: 15px; + background-color: #8bc34a; +} +#stdout { + background-color: #fbfbfb; + padding: 10px 10px; + border-radius: 4px; + white-space: normal; +} +ul#id_depth { + list-style-type: none; + padding: 0; +} + +@keyframes spin { + 0% { + transform: rotate(0deg); + } + 100% { + transform: rotate(360deg); + } +} + +.loader { + border: 16px solid #f3f3f3; /* Light grey */ + border-top: 16px solid #3498db; /* Blue */ + border-radius: 50%; + width: 30px; + height: 30px; + box-sizing: border-box; + animation: spin 2s linear infinite; +} + + +textarea, select, input[type="text"], input[type="number"] { + border-radius: 4px; + border: 2px solid #004882; + box-shadow: 4px 4px 4px rgba(0,0,0,0.02); + width: 100%; + padding: 8px 12px; + font-family: inherit; + font-size: 14px; +} + +textarea { + min-height: 300px; +} + +input[type="text"], input[type="number"] { + min-height: 42px; +} + +textarea[rows="3"] { + min-height: 80px; +} + +select { + min-height: 40px; +} + +/* Crawl subtitle (grey 
explanation under the title) */ +.crawl-subtitle { + max-width: 760px; + margin: 4px auto 16px; + color: #6b7280; + font-size: 13px; + line-height: 1.5; +} + +.crawl-subtitle strong { + color: #475569; + font-weight: 600; +} + +/* Crawl explanation box */ +.crawl-explanation { + background-color: #e8f4f8; + border-left: 4px solid #004882; + padding: 15px 20px; + margin-bottom: 20px; + border-radius: 4px; +} + +.crawl-explanation p { + margin: 0; + line-height: 1.6; + color: #333; +} + +#add-form .crawl-tip a { + text-decoration: none !important; +} + +.crawl-tip-url { + padding: 2px 6px; + background: #f6f8fa; + border: 1px solid #d0d7de; + border-radius: 4px; + font-size: 13px; + white-space: nowrap; +} + +.crawl-tip-url-prefix { + color: #6b7280; +} + +.crawl-tip-url-target { + color: #116329; + font-weight: 600; +} + +/* Form sections */ +.form-section { + margin-bottom: 30px; + padding: 20px; + background-color: #f9f9f9; + border-radius: 8px; +} + +.form-section h3 { + margin-top: 0; + margin-bottom: 15px; + color: #004882; + font-size: 18px; +} + +.section-description { + margin: 0 0 15px 0; + color: #666; + font-size: 14px; + line-height: 1.5; +} + +.section-description a { + color: #004882; + text-decoration: none; + font-weight: 500; +} + +.section-description a:hover { + text-decoration: underline; +} + +.help-text code { + background-color: #f5f5f5; + padding: 2px 6px; + border-radius: 3px; + font-family: monospace; + font-size: 12px; + color: #333; +} + +.form-field { + margin-bottom: 20px; +} + +.tags-persona-row { + display: grid; + grid-template-columns: minmax(0, 6fr) minmax(240px, 3fr) minmax(170px, 2fr); + gap: 18px; + align-items: stretch; +} + +.tags-persona-row .form-field { + display: flex; + flex-direction: column; + min-width: 0; +} + +.tags-persona-row .tag-editor-container, +.tags-persona-row select[name="persona"], +.tags-persona-row select[name="permissions"] { + box-sizing: border-box; + min-height: 46px; +} + +.tags-persona-row 
select[name="persona"], +.tags-persona-row select[name="permissions"] { + height: 62px; +} + +.persona-field .help-text a { + display: inline-block; + margin-top: 2px; +} + +.settings-row { + display: grid; + grid-template-columns: minmax(220px, 300px) minmax(420px, 1fr); + gap: 18px; + align-items: start; + margin-bottom: 12px; +} + +.form-field label { + display: block; + font-size: 16px; + font-weight: 600; + margin-bottom: 8px; +} + +.field-label-with-icon { + display: flex; + align-items: center; + gap: 7px; + margin-bottom: 8px; +} + +.field-label-with-icon label { + margin-bottom: 0; +} + +.field-label-icon { + font-size: 17px; + line-height: 1; +} + +#same-domain-toggle-slot { + margin-top: 12px; + display: flex; + flex-direction: column; + align-items: flex-start; + gap: 5px; +} + +.crawl-limit-grid { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); + gap: 10px; + margin: 0 0 20px; +} + +.crawl-limit-field { + margin-bottom: 0; + padding: 10px; + border: 1px solid #dbe3ea; + border-radius: 6px; + background: #fff; + min-width: 0; +} + +.crawl-limit-field label { + margin-bottom: 6px; + font-size: 13px; + font-weight: 700; + color: #1f2937; +} + +.crawl-limit-field .field-label-with-icon { + gap: 5px; + margin-bottom: 6px; +} + +.crawl-limit-field .field-label-with-icon label { + margin-bottom: 0; +} + +.crawl-limit-field .field-label-icon { + font-size: 14px; +} + +.crawl-limit-field input { + min-height: 34px; + padding: 6px 8px; + font-size: 13px; +} + +.crawl-limit-field .help-text { + margin-top: 5px; + font-size: 11px; + line-height: 1.25; +} + +.field-header { + display: flex; + align-items: center; + gap: 10px; + margin-bottom: 8px; +} + +.field-header label { + margin-bottom: 0; +} + +.url-workbench { + display: grid; + grid-template-columns: minmax(0, 1fr) minmax(280px, 360px); + gap: 18px; + align-items: start; +} + +.url-editor-column { + min-width: 0; +} + +.url-editor-shell { + position: relative; +} + 
+.url-editor-shell textarea[name="url"] { + position: relative; + z-index: 2; + background: transparent; + color: #1f2937; + -webkit-text-fill-color: #1f2937; + caret-color: #1f2937; + min-height: 240px; + height: 240px; + line-height: 1.5; + resize: vertical; +} + +.url-editor-shell textarea[name="url"]::selection { + background: rgba(0, 72, 130, 0.18); +} + +.url-highlight-layer { + position: absolute; + inset: 2px; + z-index: 1; + margin: 0; + padding: 8px 12px; + overflow: auto; + pointer-events: none; + white-space: pre-wrap; + overflow-wrap: anywhere; + word-break: break-word; + font-family: inherit; + font-size: 14px; + line-height: 1.5; + color: transparent; + background: transparent; + border-radius: 2px; + scrollbar-width: none; +} + +.url-highlight-layer::-webkit-scrollbar { + display: none; +} + +.url-highlight-segment { + border-radius: 3px; +} + +.detected-urls-panel { + display: flex; + flex-direction: column; + margin-top: 38px; + min-height: 240px; + padding: 12px 14px; + background: linear-gradient(180deg, #fff 0%, #f6f8fb 100%); + border: 1px solid #d7e2eb; + border-radius: 8px; + overflow: hidden; +} + +.detected-url-limited .detected-url-value { + text-decoration: line-through; + opacity: 0.72; +} + +.detected-urls-header { + display: flex; + align-items: baseline; + justify-content: space-between; + gap: 12px; + margin-bottom: 10px; +} + +.detected-urls-summary { + font-size: 12px; + color: #5f6c78; +} + +.detected-urls-list { + flex: 1; + min-height: 0; + display: grid; + align-content: start; + gap: 8px; + overflow: auto; + padding-right: 4px; +} + +.detected-urls-empty { + padding: 8px 0; + color: #6b7280; + font-size: 13px; + line-height: 1.5; +} + +.detected-url-item { + display: grid; + gap: 8px; + padding: 10px 12px; + border-left: 4px solid var(--detected-url-border, #d0d7de); + border-radius: 6px; + background: linear-gradient(90deg, var(--detected-url-bg, rgba(0, 0, 0, 0.03)), rgba(255, 255, 255, 0.96) 28%); +} + 
+.detected-url-topline { + display: flex; + align-items: center; + justify-content: space-between; + gap: 8px; +} + +.detected-url-controls { + display: flex; + flex-wrap: nowrap; + gap: 6px; + min-width: 0; +} + +.detected-url-number { + width: 20px; + height: 20px; + display: inline-flex; + align-items: center; + justify-content: center; + border-radius: 999px; + background: rgba(15, 23, 42, 0.08); + color: #24303b; + font-size: 10px; + font-weight: 700; +} + +.detected-url-body { + min-width: 0; +} + +.detected-url-value { + display: block; + font-size: 12px; + line-height: 1.45; + color: #1f2937; + overflow-wrap: anywhere; +} + +.detected-url-toggle-btn { + flex: 0 0 auto; + display: inline-flex; + align-items: center; + justify-content: center; + padding: 4px 8px; + min-height: 24px; + border: 1px solid rgba(148, 163, 184, 0.4); + border-radius: 999px; + background: rgba(148, 163, 184, 0.12); + color: #64748b; + font-size: 11px; + font-weight: 700; + line-height: 1; + white-space: nowrap; + transition: background-color 120ms ease, border-color 120ms ease, color 120ms ease; + cursor: pointer; +} + +.detected-url-toggle-btn:hover { + background: rgba(15, 23, 42, 0.08); +} + +.detected-url-toggle-btn-inactive:hover { + border-color: rgba(180, 35, 24, 0.28); + background: rgba(180, 35, 24, 0.10); + color: #b42318; +} + +.detected-url-toggle-btn-active:hover { + border-color: rgba(22, 101, 52, 0.28); + background: rgba(22, 101, 52, 0.10); + color: #166534; +} + +.detected-url-toggle-btn-disabled, +.detected-url-toggle-btn-disabled:hover { + border-color: rgba(203, 213, 225, 0.55); + background: rgba(226, 232, 240, 0.45); + color: #94a3b8; + cursor: not-allowed; +} + +.detected-url-message { + margin-top: 4px; + font-size: 11px; + color: #617080; + line-height: 1.45; +} + +.detected-url-allowlisted .detected-url-value { + color: #166534; +} + +.detected-url-denied .detected-url-value { + color: #b42318; + text-decoration: line-through; + text-decoration-thickness: 
1.5px; +} + +.detected-url-denied .detected-url-message { + color: #b42318; +} + +.detected-url-filtered .detected-url-value { + color: #6b7280; +} + +.form-field .help-text { + font-size: 12px; + color: #666; + margin-top: 4px; + font-style: italic; +} + +.form-field .error { + color: #ba2121; + font-size: 13px; + margin-top: 4px; +} + +.tag-editor-container { + display: flex; + flex-wrap: wrap; + align-items: center; + gap: 6px; + padding: 8px 12px; + min-height: 44px; + background: #fff; + border: 2px solid #004882; + border-radius: 4px; + box-shadow: 4px 4px 4px rgba(0,0,0,0.02); + cursor: text; +} + +.tag-editor-container:focus-within { + border-color: #2c7ec1; +} + +.tag-pills { + display: flex; + flex-wrap: wrap; + gap: 6px; + align-items: center; +} + +.tag-pill { + display: inline-flex; + align-items: center; + gap: 4px; + padding: 4px 8px 4px 10px; + background: var(--tag-bg, #e2e8f0); + color: var(--tag-fg, #1e293b); + border-radius: 16px; + border: 1px solid var(--tag-border, #cbd5e1); + font-size: 13px; + font-weight: 500; +} + +.tag-remove-btn { + display: inline-flex; + align-items: center; + justify-content: center; + width: 16px; + height: 16px; + padding: 0; + margin: 0; + border: 1px solid rgba(15, 23, 42, 0.12); + border-radius: 50%; + background: rgba(15, 23, 42, 0.08); + color: inherit; + font-size: 14px; + line-height: 1; + cursor: pointer; +} + +.tag-inline-input { + flex: 1; + min-width: 120px; + padding: 4px 0; + border: none !important; + box-shadow: none !important; + outline: none; + background: transparent; +} + +.tag-inline-input::placeholder { + color: #7c8b98; +} + +.url-filters-widget textarea { + min-height: 58px; + font-family: monospace; + font-size: 13px; +} + +.url-filters-field > label { + display: none; +} + +.url-filters-grid { + display: grid; + grid-template-columns: repeat(2, minmax(0, 1fr)); + gap: 12px; +} + +.url-filter-label-row { + display: flex; + align-items: baseline; + flex-wrap: nowrap; + gap: 10px; + width: 
100%; + margin-bottom: 6px; +} + +.url-filters-column .url-filter-label { + display: block; + font-size: 14px; + margin-bottom: 0; +} + +.url-filter-label-main { + font-weight: 600; + white-space: nowrap; +} + +.url-filter-label-note { + display: inline-block; + flex: 0 0 auto; + margin-left: auto; + font-size: 12px; + color: #7a7a7a; + font-weight: 400; + font-style: italic; + text-align: right; + white-space: nowrap; +} + +.url-filters-toggle { + display: inline-flex !important; + align-items: center; + gap: 8px; + margin-top: 0; + font-size: 14px !important; + font-weight: 600; +} + +.url-filters-toggle input[type="checkbox"] { + width: auto; + margin: 0; +} + +.url-filters-toggle-with-help { + flex-wrap: wrap; + gap: 5px 8px; +} + +.url-filters-toggle-with-help small { + width: 100%; + margin-left: 24px; + color: #6d7882; + font-size: 12px; + font-weight: 400; + line-height: 1.25; +} + +.checkbox-field { + display: flex; + align-items: center; + gap: 10px; +} + +.checkbox-field input[type="checkbox"] { + width: auto; + margin: 0; +} + +.checkbox-field label { + margin: 0; + font-weight: normal; +} + +/* URL Counter */ +.url-counter { + display: inline-block; + padding: 4px 10px; + font-size: 13px; + font-weight: 600; + color: #666; + background-color: #f5f5f5; + border-radius: 4px; + border: 1px solid #ddd; +} + +.url-counter-positive { + color: #155724; + background-color: #d4edda; + border-color: #c3e6cb; +} + +@media (max-width: 1020px) { + .tags-persona-row { + grid-template-columns: 1fr; + } + + .settings-row { + grid-template-columns: 1fr; + } + + .url-workbench { + grid-template-columns: 1fr; + } + + .detected-urls-panel { + margin-top: 0; + } + + .url-filters-grid { + grid-template-columns: 1fr; + } +} + +/* Plugin Presets */ +.plugin-presets { + display: flex; + flex-wrap: wrap; + align-items: center; + gap: 8px; + margin-bottom: 18px; + padding: 15px; + background-color: #f8f9fa; + border: 1px solid #dee2e6; + border-radius: 6px; +} + +.preset-label { 
+ font-weight: 600; + color: #495057; + margin-right: 8px; +} + +.preset-btn { + display: inline-flex; + align-items: center; + padding: 6px 14px; + font-size: 13px; + font-weight: 500; + background-color: white; + border: 1px solid #ced4da; + border-radius: 4px; + cursor: pointer; + color: inherit; + text-decoration: none; + transition: all 0.2s; + white-space: nowrap; +} + +.preset-btn:hover { + background-color: #e9ecef; + border-color: #adb5bd; + text-decoration: none; + transform: translateY(-1px); + box-shadow: 0 2px 4px rgba(0,0,0,0.1); +} + +a.preset-btn, +a.preset-btn:visited, +a.preset-btn:hover, +a.preset-btn:focus, +a.preset-btn:active { + text-decoration: none !important; +} + +#add-form .plugin-presets a.preset-btn, +#add-form .plugin-presets a.preset-btn:link, +#add-form .plugin-presets a.preset-btn:visited, +#add-form .plugin-presets a.preset-btn:hover, +#add-form .plugin-presets a.preset-btn:focus, +#add-form .plugin-presets a.preset-btn:active { + text-decoration: none !important; +} + +.preset-btn:active { + transform: translateY(0); + box-shadow: none; +} + +.persona-preset-btn { + color: #004882; + border-color: #b7d3ea; + background-color: #f5fbff; +} + +.persona-preset-btn.persona-preset-btn-active { + color: #ffffff; + border-color: #004882; + background-color: #004882; + box-shadow: inset 0 1px 2px rgba(0, 0, 0, 0.25); +} + +.persona-preset-btn.persona-preset-btn-active:hover { + background-color: #003060; + border-color: #003060; +} + +.persona-preset-wrap { + display: inline-flex; + align-items: stretch; +} + +.persona-preset-wrap .persona-preset-btn { + border-top-right-radius: 0; + border-bottom-right-radius: 0; + padding-right: 10px; +} + +.persona-edit-btn { + margin-left: -1px; + padding: 6px 8px; + color: #004882; + border-color: #b7d3ea; + border-top-left-radius: 0; + border-bottom-left-radius: 0; + background-color: #eaf5ff; +} + +.persona-create-btn { + justify-content: center; + min-width: 28px; + padding: 6px 8px; + color: 
#116329; + border-color: #b6e3c6; + background-color: #f0fff4; +} + +/* Advanced section (collapsible) */ +.advanced-section { + background-color: white; + border: 1px solid #ddd; + border-radius: 6px; + padding: 15px; +} + +.advanced-section summary { + cursor: pointer; + user-select: none; + list-style: none; +} + +.advanced-section summary::-webkit-details-marker { + display: none; +} + +.advanced-section summary h3 { + display: inline-block; + margin: 0; + color: #004882; +} + +.advanced-section summary h3:before { + content: 'โ–ถ '; + display: inline-block; + transition: transform 0.2s; +} + +.advanced-section[open] summary h3:before { + transform: rotate(90deg); +} + +.advanced-section summary:hover { + color: #003060; +} + +.advanced-section[open] .form-field { + margin-top: 20px; +} + +/* Depth radio buttons */ +ul#id_depth li { + margin-bottom: 8px; +} + +/* Focus indicators for accessibility */ +input:focus, select:focus, textarea:focus, button:focus { + outline: 3px solid #4A90E2; + outline-offset: 2px; +} + +/* Responsive layout */ +@media (max-width: 768px) { + .crawl-limit-grid { + grid-template-columns: repeat(auto-fit, minmax(140px, 1fr)); + } + + .plugin-presets { + flex-direction: column; + align-items: stretch; + } + + .preset-label { + margin-bottom: 4px; + } + + .preset-btn { + width: 100%; + text-align: center; + } +} diff --git a/archivebox/templates/static/admin-inline-tags.js b/archivebox/templates/static/admin-inline-tags.js new file mode 100644 index 0000000000..d25aba13b2 --- /dev/null +++ b/archivebox/templates/static/admin-inline-tags.js @@ -0,0 +1,258 @@ +(function() { + function computeTagStyle(tagName) { + var hash = 0; + var name = String(tagName || '').toLowerCase(); + for (var i = 0; i < name.length; i++) { + hash = (hash * 31 + name.charCodeAt(i)) % 360; + } + return { + bg: 'hsl(' + hash + ', 70%, 92%)', + border: 'hsl(' + hash + ', 60%, 82%)', + fg: 'hsl(' + hash + ', 35%, 28%)' + }; + } + + function applyTagStyle(el, tagName) 
{ + var colors = computeTagStyle(tagName); + el.style.setProperty('--tag-bg', colors.bg); + el.style.setProperty('--tag-border', colors.border); + el.style.setProperty('--tag-fg', colors.fg); + } + + function getApiKey() { + return (window.ARCHIVEBOX_API_KEY || '').trim(); + } + + function buildApiUrl(path) { + var apiKey = getApiKey(); + if (!apiKey) return path; + var sep = path.indexOf('?') !== -1 ? '&' : '?'; + return path + sep + 'api_key=' + encodeURIComponent(apiKey); + } + + function getCSRFToken() { + var cookies = document.cookie.split(';'); + for (var i = 0; i < cookies.length; i++) { + var cookie = cookies[i].trim(); + if (cookie.startsWith('csrftoken=')) { + return cookie.substring('csrftoken='.length); + } + } + var input = document.querySelector('input[name="csrfmiddlewaretoken"]'); + return input ? input.value : ''; + } + + function buildApiHeaders() { + var headers = { + 'Content-Type': 'application/json' + }; + var apiKey = getApiKey(); + if (apiKey) headers['X-ArchiveBox-API-Key'] = apiKey; + var csrfToken = getCSRFToken(); + if (csrfToken) headers['X-CSRFToken'] = csrfToken; + return headers; + } + + function parseTags(el) { + if (el._tagData) return el._tagData; + var raw = el.dataset.tags || '[]'; + try { + el._tagData = JSON.parse(raw); + } catch (e) { + el._tagData = []; + } + return el._tagData; + } + + function setTags(el, tags) { + el._tagData = tags; + el.dataset.tags = JSON.stringify(tags); + } + + function rebuildPills(el) { + var tags = parseTags(el); + var container = el.querySelector('.tag-pills-inline'); + if (!container) return; + container.innerHTML = ''; + tags.forEach(function(td) { + var pill = document.createElement('span'); + pill.className = 'tag-pill'; + pill.setAttribute('data-tag', td.name); + pill.setAttribute('data-tag-id', td.id); + applyTagStyle(pill, td.name); + + var link = document.createElement('a'); + link.href = '/admin/core/snapshot/?tags__id__exact=' + td.id; + link.className = 'tag-link'; + link.textContent 
= td.name; + pill.appendChild(link); + + var removeBtn = document.createElement('button'); + removeBtn.type = 'button'; + removeBtn.className = 'tag-remove-btn'; + removeBtn.setAttribute('data-tag-id', td.id); + removeBtn.setAttribute('data-tag-name', td.name); + removeBtn.innerHTML = '×'; + pill.appendChild(removeBtn); + + container.appendChild(pill); + }); + } + + function addTag(el, tagName) { + tagName = String(tagName || '').trim(); + if (!tagName) return; + + var tags = parseTags(el); + var exists = tags.some(function(t) { + return t.name.toLowerCase() === tagName.toLowerCase(); + }); + if (exists) return; + + var snapshotId = el.dataset.snapshotId || ''; + fetch(buildApiUrl('/api/v1/core/tags/add-to-snapshot/'), { + method: 'POST', + headers: buildApiHeaders(), + body: JSON.stringify({ + snapshot_id: snapshotId, + tag_name: tagName + }) + }) + .then(function(response) { return response.json(); }) + .then(function(data) { + if (data.success) { + tags.push({ id: data.tag_id, name: data.tag_name }); + tags.sort(function(a, b) { return a.name.toLowerCase().localeCompare(b.name.toLowerCase()); }); + setTags(el, tags); + rebuildPills(el); + } + }) + .catch(function(err) { + console.error('Error adding tag:', err); + }); + } + + function removeTag(el, tagId) { + var snapshotId = el.dataset.snapshotId || ''; + fetch(buildApiUrl('/api/v1/core/tags/remove-from-snapshot/'), { + method: 'POST', + headers: buildApiHeaders(), + body: JSON.stringify({ + snapshot_id: snapshotId, + tag_id: tagId + }) + }) + .then(function(response) { return response.json(); }) + .then(function(data) { + if (data.success) { + var tags = parseTags(el).filter(function(t) { return t.id !== tagId; }); + setTags(el, tags); + rebuildPills(el); + } + }) + .catch(function(err) { + console.error('Error removing tag:', err); + }); + } + + var autocompleteTimers = new WeakMap(); + + function fetchAutocomplete(el, query, datalist) { + if (!datalist) return; + var existing = autocompleteTimers.get(el); + 
if (existing) window.clearTimeout(existing); + + var timer = window.setTimeout(function() { + if (!query || query.length < 1) { + datalist.innerHTML = ''; + return; + } + + fetch(buildApiUrl('/api/v1/core/tags/autocomplete/?q=' + encodeURIComponent(query))) + .then(function(response) { return response.json(); }) + .then(function(data) { + datalist.innerHTML = ''; + (data.tags || []).forEach(function(tag) { + var option = document.createElement('option'); + option.value = tag.name; + datalist.appendChild(option); + }); + }) + .catch(function(err) { + console.log('Autocomplete error:', err); + }); + }, 150); + + autocompleteTimers.set(el, timer); + } + + function handleContainerClick(event) { + var target = event.target; + var container = target.closest('.tag-editor-inline'); + if (!container) return; + + if (target.classList.contains('tag-remove-btn')) { + event.stopPropagation(); + event.preventDefault(); + var tagId = parseInt(target.getAttribute('data-tag-id'), 10); + if (tagId) removeTag(container, tagId); + return; + } + + if (!target.classList.contains('tag-link')) { + var input = container.querySelector('input.tag-inline-input-sm'); + if (input) input.focus(); + } + } + + function handleInputKeydown(event) { + var input = event.target; + if (!input || !input.matches('input.tag-inline-input-sm')) return; + var container = input.closest('.tag-editor-inline'); + if (!container) return; + + var value = input.value.trim(); + if (event.key === 'Enter' || event.keyCode === 13 || event.key === ' ' || event.key === ',') { + event.preventDefault(); + if (value) { + value.split(',').forEach(function(tag) { addTag(container, tag.trim()); }); + input.value = ''; + } + } + } + + function handleInputEvent(event) { + var input = event.target; + if (!input || !input.matches('input.tag-inline-input-sm')) return; + var container = input.closest('.tag-editor-inline'); + if (!container) return; + var datalist = container.querySelector('datalist'); + fetchAutocomplete(container, 
input.value, datalist); + } + + function handleInputFocus(event) { + var input = event.target; + if (!input || !input.matches('input.tag-inline-input-sm')) return; + input.placeholder = 'add tag...'; + } + + function handleInputBlur(event) { + var input = event.target; + if (!input || !input.matches('input.tag-inline-input-sm')) return; + input.placeholder = '+'; + } + + function init() { + document.addEventListener('click', handleContainerClick); + document.addEventListener('keydown', handleInputKeydown); + document.addEventListener('input', handleInputEvent); + document.addEventListener('focusin', handleInputFocus); + document.addEventListener('focusout', handleInputBlur); + } + + if (document.readyState === 'loading') { + document.addEventListener('DOMContentLoaded', init); + } else { + init(); + } +})(); diff --git a/archivebox/templates/static/admin.css b/archivebox/templates/static/admin.css new file mode 100755 index 0000000000..581fdcb75e --- /dev/null +++ b/archivebox/templates/static/admin.css @@ -0,0 +1,2976 @@ +/* * { + -webkit-box-sizing: border-box; + -moz-box-sizing: border-box; + box-sizing: border-box; +} */ + +#logo { + height: 30px; + vertical-align: -6px; + padding-right: 5px; +} +#site-name:hover a { + opacity: 0.9; +} +#site-name .loader { + height: 25px; + width: 25px; + display: inline-block; + border-width: 3px; + vertical-align: -3px; + margin-right: 5px; + margin-top: 2px; +} +#branding h1, #branding h1 a:link, #branding h1 a:visited { + color: mintcream; +} +#header { + background: #aa1e55; + padding: 6px 14px; +} +#content { + padding: 8px 8px; +} +#user-tools { + font-size: 13px; + +} +#user-tools a.navbar-add-link, +#user-tools a.navbar-add-link:link, +#user-tools a.navbar-add-link:visited { + border: 1px solid #fff; + border-radius: 4px; + padding: 2px 6px; +} + +div.breadcrumbs { + background: #772948; + color: #f5dd5d; + padding: 6px 15px; +} + +#toolbar #searchbar { + height: 25px; +} + +/* View Mode Switcher - Prominent Toggle */ 
+#snapshot-view-mode { + float: right; + margin-bottom: -40px; + display: inline-flex; + align-items: center; + margin-top: 3px; + margin-right: 10px; + font-size: 13px; + background: #f1f5f9; + border: 1px solid #e2e8f0; + border-radius: 8px; + padding: 2px; + gap: 2px; +} +#snapshot-view-mode a { + display: inline-flex; + align-items: center; + justify-content: center; + gap: 4px; + padding: 6px 12px; + color: #64748b; + text-decoration: none; + border-radius: 6px; + font-weight: 500; + transition: all 0.15s ease; + white-space: nowrap; +} +#snapshot-view-mode a:hover { + color: #334155; + background: #e2e8f0; +} +#snapshot-view-mode a.active { + background: #fff; + color: #1e293b; + box-shadow: 0 1px 3px rgba(0,0,0,0.1); +} +#snapshot-view-mode .view-icon { + font-size: 14px; + line-height: 1; +} + +.model-snapshot.change-list div.breadcrumbs, +.model-snapshot.change-list #content .object-tools { + display: none; +} + +.module h2, .module caption, .inline-group h2 { + background: #772948; +} + +#content .adv-data textarea { + width: 82vw; + max-width: 100%; + min-height: 100px; + height: auto; + background-color: #145454; + color: #f1f1fd; + font-size: 12px; + font-family: monospace; + border-radius: 8px; + line-height: 1.2; + padding: 6px 9px; +} + + +#content .object-tools { + margin-top: -35px; + margin-right: -10px; + float: right; +} + +body.change-list #content .object-tools { + margin-top: 0; + margin-right: 0; + float: none; + display: flex; + justify-content: flex-end; + clear: both; +} + +#content .object-tools a:link, +#content .object-tools a:visited, +#content .object-tools form button { + border-radius: 0px; + background-color: #f5dd5d; + color: #333; + font-size: 12px; + font-weight: 800; +} +#content .object-tools form { + display: inline; + margin: 0; +} +#content .object-tools form button { + border: 0; + cursor: pointer; +} + +#content .object-tools a.addlink { + background-blend-mode: difference; +} + +#content #changelist #toolbar { + 
padding: 0px; + background: none; + margin-bottom: 10px; + border-top: 0px; + border-bottom: 0px; + display: inline-flex; + width: auto; + max-width: 100%; + box-sizing: border-box; + position: relative; + z-index: 1; +} + +#content #changelist #toolbar #changelist-search, +#content #changelist #toolbar #changelist-search > div { + width: auto; + max-width: 100%; + flex: 0 1 auto; + box-sizing: border-box; +} + +#content #changelist #toolbar #searchbar { + width: clamp(180px, 32vw, 420px); + max-width: 100%; + flex: 0 1 auto; +} + +#content #changelist #toolbar form input[type="submit"] { + border-color: #aa1e55; +} + +#content #changelist-filter li.selected a { + color: #aa1e55; +} + + +#content #changelist .actions .button { + border-radius: 0px; + background-color: #f5dd5d; + color: #333; + font-size: 12px; + font-weight: 800; + margin-right: 4px; + box-shadow: 4px 4px 4px rgba(0,0,0,0.02); + border: 1px solid rgba(0,0,0,0.08); +} +#content #changelist .actions .button:hover { + border: 1px solid rgba(0,0,0,0.2); + opacity: 0.9; +} +#content #changelist .actions .button[name=verify_snapshots], #content #changelist .actions .button[name=update_titles] { + background-color: #dedede; + color: #333; +} +#content #changelist .actions .button[name=update_snapshots] { + background-color: #9ee54b; + color: #333; +} +#content #changelist .actions .button[name=resnapshot_snapshot] { + background-color:lightseagreen; + color: #333; +} +#content #changelist .actions .button[name=overwrite_snapshots] { + background-color: #ffaa31; + color: #333; + margin-left: 10px; +} +#content #changelist .actions .button[name=delete_snapshots] { + background-color: #f91f74; + color: rgb(255 248 252 / 64%); +} +#content #changelist .actions .button[name=add_tags] { +} +#content #changelist .actions .button[name=remove_tags] { + margin-right: 25px; +} + +#content #changelist .actions > label { + max-height: 25px; +} +#content #changelist .actions > label { + width: auto !important; + 
min-width: 90px; +} +#content #changelist .actions > label > select { + margin-top: 3px; +} + +body.change-list:not(.model-snapshot) #changelist .changelist-form-container > div { + display: flex; + flex-wrap: wrap; + align-items: flex-start; + gap: 10px 12px; + min-width: 0; +} + +body.change-list:not(.model-snapshot) #changelist #toolbar { + flex: 2 1 520px; + min-width: min(100%, 420px); + margin: 0 !important; +} + +body.change-list:not(.model-snapshot) #changelist #toolbar #changelist-search, +body.change-list:not(.model-snapshot) #changelist #toolbar #changelist-search > div { + display: flex !important; + flex-wrap: wrap !important; + align-items: center; + gap: 8px; + width: 100%; + min-width: 0; + max-width: 100%; + white-space: normal !important; +} + +body.change-list:not(.model-snapshot) #changelist #toolbar #searchbar { + flex: 1 1 220px; + min-width: 180px; + width: auto; +} + +body.change-list:not(.model-snapshot) #changelist #toolbar .search-mode-selector { + flex: 1 1 300px !important; + flex-wrap: wrap !important; + margin-left: 0 !important; + min-width: 260px; + white-space: normal !important; +} + +body.change-list #changelist #changelist-form { + display: contents; +} + +body.change-list:not(.model-snapshot) #changelist #changelist-form > .actions-top { + flex: 3 1 560px; + width: auto; + min-width: 0; + max-width: 100%; + box-sizing: border-box; + margin-left: auto; + flex-wrap: wrap !important; + overflow: visible; + align-items: flex-start; + align-content: flex-start; +} + +body.change-list:not(.model-snapshot) #changelist #changelist-form > .actions-top .actions-left { + flex: 0 1 auto; + flex-wrap: wrap !important; + min-width: 0; + align-items: flex-start; +} + +body.change-list:not(.model-snapshot) #changelist #changelist-form > .actions-top .actions-right { + margin-left: 0; + flex-wrap: wrap; + align-items: flex-start; +} + +body.change-list:not(.model-snapshot) #changelist #changelist-form > .actions-top .action-buttons { + display: 
flex !important; + flex-wrap: wrap; + align-items: center; + gap: 4px; + min-width: 0; +} + +body.change-list:not(.model-snapshot) #changelist .actions-tags-with-buttons { + display: inline-flex; + align-items: center; + gap: 4px; + flex: 1 1 280px; + min-width: 220px; + max-width: 420px; +} + +body.change-list:not(.model-snapshot) #changelist .actions-tags-with-buttons .tag-editor-container { + flex: 1 1 180px; + width: auto; + max-width: none; +} + +body.change-list:not(.model-snapshot) #content #changelist .actions-tags-with-buttons .button[name="add_tags"], +body.change-list:not(.model-snapshot) #content #changelist .actions-tags-with-buttons .button[name="remove_tags"] { + flex: 0 0 30px; + width: 30px; + min-width: 30px; + padding-left: 0; + padding-right: 0; + margin: 0; + justify-content: center; +} + +body.change-list:not(.model-snapshot) #changelist #changelist-form > .actions-bottom, +body.change-list:not(.model-snapshot) #changelist #changelist-form > .results, +body.change-list:not(.model-snapshot) #changelist #changelist-form > .changelist-footer, +body.change-list:not(.model-snapshot) #changelist .xfull { + flex: 1 0 100%; + width: 100%; + max-width: 100%; +} + +body.change-list:not(.model-snapshot) #changelist #changelist-form > .results, +body.change-list:not(.model-snapshot) #changelist #changelist-form > .changelist-footer, +body.change-list:not(.model-snapshot) #changelist #toolbar, +body.change-list:not(.model-snapshot) #changelist .xfull { + margin-right: 0 !important; +} + +body.change-list:not(.model-snapshot) #changelist #changelist-form > .results { + overflow-x: auto; + -webkit-overflow-scrolling: touch; +} + +.model-snapshot.change-list #result_list { + min-width: 1060px; +} + +@media (max-width: 1180px) { + body.change-list #changelist { + grid-template-columns: minmax(0, 1fr) !important; + } + + body.change-list #changelist-filter { + width: auto !important; + min-width: 0 !important; + max-width: none !important; + } + + 
body.change-list:not(.model-snapshot) #changelist #toolbar, + body.change-list:not(.model-snapshot) #changelist #changelist-form > .actions-top { + flex: 1 0 100%; + width: 100%; + margin-left: 0; + } + + body.change-list:not(.model-snapshot) #content #changelist #toolbar .small.quiet { + flex: 1 0 100%; + width: 100%; + white-space: normal; + text-align: left; + } + + body.change-list:not(.model-snapshot) #changelist #changelist-form > .actions-top { + max-width: 100%; + box-sizing: border-box; + } +} + +@media (max-width: 1180px) { + body.change-list #content, + body.change-list #content-main, + body.change-list #changelist, + body.change-list #changelist .changelist-form-container, + body.change-list:not(.model-snapshot) #changelist .changelist-form-container > div { + max-width: 100%; + min-width: 0; + box-sizing: border-box; + } + + body.change-list #changelist .changelist-form-container > div { + gap: 8px; + } + + body.change-list:not(.model-snapshot) #changelist #toolbar #changelist-search > div { + display: grid !important; + grid-template-columns: 24px minmax(0, 1fr) auto; + gap: 7px; + align-items: center; + } + + body.change-list:not(.model-snapshot) #content #changelist #toolbar label[for="searchbar"] { + grid-column: 1; + justify-content: center; + } + + body.change-list:not(.model-snapshot) #content #changelist #toolbar #searchbar { + grid-column: 2; + flex: none !important; + width: 100% !important; + min-width: 0; + } + + body.change-list:not(.model-snapshot) #content #changelist #toolbar form input[type="submit"] { + grid-column: 3; + flex: none !important; + width: auto !important; + min-width: 76px; + white-space: nowrap; + } + + body.change-list:not(.model-snapshot) #content #changelist #toolbar .search-mode-selector { + grid-column: 1 / -1; + display: grid !important; + grid-template-columns: repeat(3, minmax(0, 1fr)); + flex: none !important; + width: 100% !important; + max-width: 100%; + min-width: 0; + gap: 6px !important; + } + + 
body.change-list:not(.model-snapshot) #content #changelist #toolbar .small.quiet { + grid-column: 1 / -1; + flex: none !important; + width: 100% !important; + max-width: 100%; + min-width: 0; + white-space: normal; + text-align: left; + } + + body.change-list:not(.model-snapshot) #content #changelist #toolbar .search-mode-selector label { + justify-content: center; + min-width: 0 !important; + width: 100%; + padding-left: 8px !important; + padding-right: 8px !important; + } + + body.change-list:not(.model-snapshot) #changelist #changelist-form > .actions-top { + padding: 8px; + gap: 8px; + overflow-x: visible !important; + } + + body.change-list:not(.model-snapshot) #changelist #changelist-form > .actions-top .actions-left, + body.change-list:not(.model-snapshot) #changelist #changelist-form > .actions-top .actions-right, + body.change-list:not(.model-snapshot) #changelist #changelist-form > .actions-top .action-buttons { + width: 100%; + } + + body.change-list:not(.model-snapshot) #changelist #changelist-form > .actions-top .actions-right { + justify-content: flex-start; + } + + body.change-list:not(.model-snapshot) #changelist #changelist-form > .actions-top .actions-left { + display: grid !important; + grid-template-columns: 1fr; + gap: 8px; + white-space: normal !important; + } + + body.change-list:not(.model-snapshot) #changelist #changelist-form > .actions-top .action-buttons { + display: flex !important; + flex-wrap: wrap; + gap: 6px; + } + + body.change-list:not(.model-snapshot) #content #changelist .actions .button { + flex: 1 1 135px; + width: auto; + min-height: 30px; + margin: 0; + justify-content: center; + text-align: center; + white-space: nowrap; + } + + body.change-list:not(.model-snapshot) #content #changelist .actions .button[name="add_tags"], + body.change-list:not(.model-snapshot) #content #changelist .actions .button[name="remove_tags"] { + flex: 0 0 48px; + width: 48px; + } + + body.change-list:not(.model-snapshot) #changelist 
#changelist-form > .actions-top .action-buttons .button[name="resnapshot_snapshot"], + body.change-list:not(.model-snapshot) #changelist #changelist-form > .actions-top .action-buttons .button[name="update_snapshots"], + body.change-list:not(.model-snapshot) #changelist #changelist-form > .actions-top .action-buttons .button[name="overwrite_snapshots"], + body.change-list:not(.model-snapshot) #changelist #changelist-form > .actions-top .action-buttons .button[name="delete_snapshots"] { + margin-left: 0; + margin-right: 0; + } + + body.change-list:not(.model-snapshot) #changelist #changelist-form > .actions-top .actions-right { + display: grid !important; + grid-template-columns: repeat(2, minmax(0, 1fr)); + gap: 6px; + } + + body.change-list:not(.model-snapshot) #changelist #changelist-form > .actions-top .actions-tags-with-buttons { + width: 100% !important; + max-width: 100% !important; + flex-basis: 100% !important; + min-width: 0; + } + + body.change-list:not(.model-snapshot) #changelist #changelist-form > .actions-top .actions-tags-with-buttons .tag-editor-container { + width: auto !important; + max-width: none !important; + flex: 1 1 160px !important; + } + + body.change-list:not(.model-snapshot) #changelist #changelist-form > .actions-top .action-counter { + width: 100%; + white-space: normal; + } + +} + +/* Filter Sidebar - Improved Layout */ +#content #changelist-filter { + background: #fff; + border: 1px solid #e2e8f0; + border-radius: 10px; + box-shadow: 0 1px 3px rgba(0,0,0,0.05); + overflow: hidden; +} +#content #changelist-filter h2 { + border-radius: 0; + background: #f8fafc; + color: #475569; + font-size: 11px; + font-weight: 600; + text-transform: uppercase; + letter-spacing: 0.05em; + padding: 10px 12px; + margin: 0; + border-bottom: 1px solid #e2e8f0; +} +#content #changelist-filter h3 { + font-size: 11px; + font-weight: 600; + color: #64748b; + text-transform: uppercase; + letter-spacing: 0.03em; + padding: 10px 12px 4px; + margin: 0; + 
background: transparent; +} +#content #changelist-filter ul { + padding: 0 6px 8px; + margin: 0; + list-style: none; +} +#content #changelist-filter li { + margin: 0; +} +#content #changelist-filter li a { + display: block; + padding: 6px 10px; + color: #475569; + text-decoration: none; + font-size: 12px; + border-radius: 5px; + transition: background 0.15s ease, color 0.15s ease; + white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; +} +#content #changelist-filter li a:hover { + background: #f1f5f9; + color: #1e293b; +} +#content #changelist-filter li.selected a { + background: #eff6ff; + color: #2563eb; + font-weight: 500; +} +#content #changelist-filter-clear { + padding: 8px 12px; + margin: 0; + border-bottom: 1px solid #e2e8f0; + background: #fef2f2; +} +#content #changelist-filter-clear a { + color: #dc2626; + font-size: 12px; + font-weight: 500; + text-decoration: none; +} +#content #changelist-filter-clear a:hover { + text-decoration: underline; +} + +#changelist .paginator { + border-top: 0px; + border-bottom: 0px; +} + +@media (min-width: 767px) { + #content #changelist-filter { + top: 35px; + width: 160px; + margin-bottom: 35px; + } + + .change-list .filtered .results, + .change-list .filtered .paginator, + .filtered #toolbar, + .filtered div.xfull { + margin-right: 168px; + } +} + +#content a img.favicon { + height: 20px; + max-width: 28px; + vertical-align: -5px; + padding-right: 6px; +} + +#content img.snapshot-preview { + width: 30px; + height: 30px; + max-width: 30px; + max-height: 30px; + object-fit: contain; + border-radius: 4px; + display: block; + margin: 0 auto; +} + +#content img.snapshot-preview.screenshot { + width: 100px; + height: 100px; + max-width: 100px; + max-height: 100px; + object-fit: cover; +} + +#content th.field-preview_icon, +#content td.field-preview_icon { + width: 100px; + max-width: 100px; +} + +#content td, #content th { + vertical-align: middle; + padding: 4px; +} + +#content #changelist table input { + 
vertical-align: -2px; +} + +#content thead th .text a { + padding: 8px 4px; +} + +#content th.field-added, #content td.field-updated { + word-break: break-word; + min-width: 135px; + white-space: normal; +} + +#content th.field-title_str { + min-width: 300px; + padding-left: 2px; + padding-right: 2px; +} + +#content td.field-title_str { + padding-left: 2px; + padding-right: 2px; +} + +#content th.field-preview_icon, +#content td.field-preview_icon { + padding-left: 2px; + padding-right: 2px; +} + +#content th.column-created_at, +#content tbody th.field-created_at { + width: 1%; + white-space: nowrap; +} + +#content th.column-action-checkbox, +#content th.action-checkbox-column, +#content td.action-checkbox { + padding-left: 2px; + padding-right: 2px; +} + +#content th.field-preview_icon, +#content td.field-preview_icon { + padding-left: 2px; + padding-right: 2px; +} + +body.model-snapshot.change-list #result_list col.snapshot-permissions-col, +body.model-snapshot.change-list #result_list th.column-permissions_badge, +body.model-snapshot.change-list #result_list td.field-permissions_badge, +body.model-crawl.change-list #result_list col.snapshot-permissions-col, +body.model-crawl.change-list #result_list th.column-permissions_badge, +body.model-crawl.change-list #result_list td.field-permissions_badge { + width: 22px !important; + min-width: 22px !important; + max-width: 22px !important; +} + +body.model-snapshot.change-list #result_list th.column-permissions_badge, +body.model-snapshot.change-list #result_list td.field-permissions_badge, +body.model-crawl.change-list #result_list th.column-permissions_badge, +body.model-crawl.change-list #result_list td.field-permissions_badge { + padding-left: 0 !important; + padding-right: 0 !important; + text-align: center !important; + overflow: visible; +} + +body.model-snapshot.change-list #result_list th.column-permissions_badge .text, +body.model-snapshot.change-list #result_list th.column-permissions_badge .text span, 
+body.model-crawl.change-list #result_list th.column-permissions_badge .text, +body.model-crawl.change-list #result_list th.column-permissions_badge .text span { + display: inline-flex !important; + width: 22px !important; + min-width: 22px !important; + justify-content: center !important; + padding: 0 !important; +} + +body.model-crawl.change-list #result_list th.column-short_id, +body.model-crawl.change-list #result_list td.field-short_id { + width: 70px !important; + min-width: 70px !important; + max-width: 70px !important; +} + +body.model-crawl.change-list #result_list th.column-short_id, +body.model-crawl.change-list #result_list td.field-short_id { + padding-left: 4px !important; + padding-right: 4px !important; +} + +body.model-crawl.change-list #result_list td.field-short_id a { + display: inline-block; + max-width: 62px; + overflow: hidden; + text-overflow: clip; + white-space: nowrap; + font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", monospace; + font-size: 12px; +} + +.snapshot-permissions-quick { + position: relative; + display: inline-flex; + align-items: center; + justify-content: center; +} + +.snapshot-permissions-button { + display: inline-flex; + align-items: center; + justify-content: center; + width: 24px; + height: 24px; + padding: 0; + border: 0; + border-radius: 999px; + background: transparent; + cursor: pointer; +} + +.snapshot-permissions-button:hover .snapshot-permissions-icon, +.snapshot-permissions-button:focus .snapshot-permissions-icon { + box-shadow: 0 0 0 2px #bfdbfe; +} + +.snapshot-permissions-icon { + display: inline-flex; + align-items: center; + justify-content: center; + width: 18px; + height: 18px; + border-radius: 999px; + font-size: 11px; + line-height: 1; +} + +.snapshot-permissions-menu { + position: absolute; + top: 26px; + left: 0; + z-index: 30; + min-width: 126px; + padding: 4px; + border: 1px solid #cbd5e1; + border-radius: 8px; + background: #ffffff; + box-shadow: 0 10px 24px 
rgba(15, 23, 42, 0.18); +} + +.snapshot-permissions-menu-item { + display: flex; + align-items: center; + gap: 8px; + width: 100%; + height: 30px; + padding: 0 8px; + border: 0; + border-radius: 6px; + background: transparent; + color: #334155; + font-size: 12px; + font-weight: 600; + text-align: left; + cursor: pointer; +} + +.snapshot-permissions-menu-item:hover, +.snapshot-permissions-menu-item.is-active { + background: #eff6ff; + color: #1d4ed8; +} + +#content th.column-created_at, +#content tbody th.field-created_at { + padding-left: 6px; + padding-right: 6px; +} + +#content th.column-action-checkbox, +#content th.action-checkbox-column, +#content td.action-checkbox { + padding-left: 2px; + padding-right: 2px; +} + +#content th.field-status_with_progress, +#content td.field-status_with_progress { + padding-left: 2px; + padding-right: 2px; + width: 90px; + max-width: 90px; +} + +#content th.field-size_with_stats, +#content td.field-size_with_stats { + padding-left: 2px; + padding-right: 2px; +} + +#content th.field-files, +#content td.field-files { + padding-left: 2px; + padding-right: 2px; + width: 212px; + max-width: 212px; +} + +#content th.field-files, +#content td.field-files { + padding-left: 2px; + padding-right: 2px; + width: 212px; + max-width: 212px; +} + +#content th.field-size_with_stats, +#content td.field-size_with_stats { + padding-left: 2px; + padding-right: 2px; +} + +#content th.field-status_with_progress, +#content td.field-status_with_progress { + padding-left: 2px; + padding-right: 2px; + width: 90px; + max-width: 90px; +} + +#content th.field-tags_inline, +#content td.field-tags_inline { + max-width: 400px; + width: 380px; + min-width: 340px; + padding-left: 2px; + padding-right: 2px; +} + +#content td.field-tags_inline .tag-pills-inline { + flex-wrap: wrap; +} + +#content td.field-tags_inline .tag-editor-inline { + max-width: 400px; +} + +#content td.field-tags_inline .tag-editor-inline.readonly { + padding-right: 0; +} + +#content 
th.field-tags_inline, +#content td.field-tags_inline { + max-width: 220px; + width: 220px; + padding-left: 2px; + padding-right: 2px; +} + +#content td.field-tags_inline .tag-pills-inline { + flex-wrap: wrap; +} + +#content td.field-tags_inline .tag-editor-inline { + max-width: 220px; +} + +#content td.field-files { + white-space: nowrap; + width: 212px; + max-width: 212px; + overflow: hidden; +} +#content td.field-files .files-icons a { + display: inline-flex; + align-items: center; + justify-content: center; + padding: 0; + margin: 0; + line-height: 1; + width: 16px; + height: 16px; + min-width: 16px; +} +#content td.field-files .files-icons svg, +#content td.field-files .files-icons img { + display: block; + margin: 0; + width: 16px; + height: 16px; +} +#content td.field-files .exists-True { + opacity: 1; +} +#content td.field-files .exists-False { + opacity: 0.1; + filter: grayscale(100%); +} +#content td.field-size { + white-space: nowrap; +} + +#content td.field-url_str { + word-break: break-all; + min-width: 200px; +} + +#content tr b.status-pending { + font-weight: 200; + opacity: 0.6; +} + +.loader { + border: 16px solid #f3f3f3; /* Light grey */ + border-top: 16px solid #3498db; /* Blue */ + border-radius: 50%; + width: 30px; + height: 30px; + box-sizing: border-box; + animation: spin 2s linear infinite; +} + +@keyframes spin { + 0% { transform: rotate(0deg); } + 100% { transform: rotate(360deg); } +} + +.tag { + float: right; + border-radius: 5px; + background-color: #bfdfff; + padding: 2px 5px; + margin-left: 4px; + margin-top: 1px; +} + +.files-icons { + display: inline-flex; + flex-wrap: wrap; + gap: 2px; + vertical-align: middle; +} + +.files-icons--compact { + display: inline-grid; + grid-auto-flow: column; + grid-auto-columns: auto; + grid-template-rows: repeat(4, auto); + gap: 2px 4px; + justify-content: start; + align-content: start; + max-width: 212px; +} + +.files-icons a { + display: inline-flex; + align-items: center; + justify-content: 
center; + text-decoration: none; +} + +.files-icons .abx-output-icon { + width: 16px; + height: 16px; + display: inline-flex; + align-items: center; + justify-content: center; + border-radius: 3px; + color: #475569; + background: #f8fafc; + border: 1px solid #e2e8f0; + box-shadow: none; + font-size: 7px; + font-weight: 700; + line-height: 1; + letter-spacing: 0; +} + +.files-icons .abx-output-icon svg { + width: 16px; + height: 16px; + display: block; +} + +.exists-False { + opacity: 0.1; + filter: grayscale(100%); +} + + +#result_list tbody td.field-cmd_str pre, +#result_list tbody td.field-output_str pre { + max-width: 22vw; + word-wrap: anywhere; + white-space: break-spaces; + max-height: 40px; + overflow: hidden; + margin: 2px; + background-color: rgba(0,0,0,0.05); + padding: 1px 4px 16px 8px; + border-radius: 4px; +} + +body.model-archiveresult.change-list #result_list td.field-cmd_str { + width: 180px !important; + max-width: 180px !important; + min-width: 0 !important; +} + +body.model-archiveresult.change-list #result_list td.field-cmd_str > div, +body.model-archiveresult.change-list #result_list td.field-cmd_str code { + width: 100% !important; + max-width: 100% !important; + min-width: 0 !important; +} + +.archivebox-zip-button { + position: relative; +} + +.archivebox-zip-spinner { + display: none; + width: 1em; + height: 1em; + border: 2px solid #2563eb; + border-right-color: transparent; + border-radius: 999px; + animation: archivebox-zip-spin 0.75s linear infinite; + flex: 0 0 auto; + box-shadow: 0 0 0 1px rgba(255, 255, 255, 0.45); + vertical-align: -0.125em; +} + +.archivebox-zip-button.is-loading { + pointer-events: none !important; + opacity: 0.96; +} + +.archivebox-zip-button.is-loading .archivebox-zip-spinner { + display: inline-block; +} + +.archivebox-zip-button.is-loading .archivebox-zip-label { + opacity: 0.72; +} + +.archivebox-zip-button[data-loading-mode="spinner-only"].is-loading { + gap: 0 !important; + justify-content: center 
!important; +} + +.archivebox-zip-button[data-loading-mode="spinner-only"].is-loading .archivebox-zip-label { + flex: 0 0 0 !important; + width: 0 !important; + min-width: 0 !important; + margin: 0 !important; + padding: 0 !important; + overflow: hidden; + white-space: nowrap; + opacity: 0; +} + +@keyframes archivebox-zip-spin { + to { + transform: rotate(360deg); + } +} + +body.model-archiveresult.change-list #result_list { + table-layout: fixed; + width: 100%; +} + +body.model-archiveresult.change-list #result_list th.column-cmd_str, +body.model-archiveresult.change-list #result_list td.field-cmd_str { + width: 180px !important; + max-width: 180px !important; + min-width: 0 !important; + overflow: hidden !important; + box-sizing: border-box; +} + +body.model-archiveresult.change-list #result_list th.column-details_link, +body.model-archiveresult.change-list #result_list td.field-details_link { + width: 64px; + max-width: 64px; +} + +body.model-archiveresult.change-list #result_list th.column-zip_link, +body.model-archiveresult.change-list #result_list td.field-zip_link { + width: 62px; + max-width: 62px; + white-space: nowrap; +} + +body.model-archiveresult.change-list #result_list th.column-created_at, +body.model-archiveresult.change-list #result_list td.field-created_at { + width: 96px; + max-width: 96px; +} + +body.model-archiveresult.change-list #result_list th.column-snapshot_info, +body.model-archiveresult.change-list #result_list td.field-snapshot_info { + width: 150px; + max-width: 150px; +} + +body.model-archiveresult.change-list #result_list th.column-tags_inline, +body.model-archiveresult.change-list #result_list td.field-tags_inline { + width: 82px; + max-width: 82px; +} + +body.model-archiveresult.change-list #result_list th.column-status_badge, +body.model-archiveresult.change-list #result_list td.field-status_badge { + width: 84px; + max-width: 84px; +} + +body.model-archiveresult.change-list #result_list th.column-plugin_with_icon, 
+body.model-archiveresult.change-list #result_list td.field-plugin_with_icon { + width: 92px; + max-width: 92px; +} + +body.model-archiveresult.change-list #result_list th.column-process_link, +body.model-archiveresult.change-list #result_list td.field-process_link { + width: 56px; + white-space: nowrap; +} + +body.model-archiveresult.change-list #result_list th.column-machine_link, +body.model-archiveresult.change-list #result_list td.field-machine_link { + width: 108px; + max-width: 108px; +} + +body.model-archiveresult.change-list #result_list th.column-output_str_display, +body.model-archiveresult.change-list #result_list td.field-output_str_display { + width: 110px; + max-width: 110px; +} + +body.model-archiveresult.change-list #result_list td.field-snapshot_info a { + display: block; + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; +} + +body.model-archiveresult.change-list #result_list td.field-machine_link a, +body.model-archiveresult.change-list #result_list td.field-output_str_display a, +body.model-archiveresult.change-list #result_list td.field-output_str_display span, +body.model-archiveresult.change-list #result_list td.field-plugin_with_icon a:last-child { + display: block; + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; +} + +body.model-archiveresult.change-list #result_list td.field-cmd_str > div, +body.model-archiveresult.change-list #result_list td.field-cmd_str code { + width: 100% !important; + min-width: 0 !important; + max-width: 100% !important; + box-sizing: border-box; +} + +body.filters-collapsed #content #changelist-filter { + display: none !important; +} + +body.filters-collapsed .change-list .filtered .results, +body.filters-collapsed .change-list .filtered .paginator, +body.filters-collapsed .filtered #toolbar, +body.filters-collapsed .filtered div.xfull { + margin-right: 0 !important; +} + +body.filters-collapsed #content #changelist-filter { + display: none !important; +} + 
+body.filters-collapsed .change-list .filtered .results, +body.filters-collapsed .change-list .filtered .paginator, +body.filters-collapsed .filtered #toolbar, +body.filters-collapsed .filtered div.xfull { + margin-right: 0 !important; +} + +#result_list tbody td.field-extractor { + font-weight: 800; + font-variant: small-caps; +} + +#result_list tbody td.field-status, +#result_list tbody td.field-status_badge { + font-variant: small-caps; +} + +body.model-archiveresult.change-list #result_list tbody tr { + transition: background-color 0.15s ease, opacity 0.15s ease; +} + +body.model-archiveresult.change-list #result_list tbody tr:has(td.field-status_badge .status-badge.started), +body.model-archiveresult.change-list #result_list tbody tr:has(td.field-status_badge .status-badge.backoff) { + background: rgba(251, 191, 36, 0.14); +} + +body.model-archiveresult.change-list #result_list tbody tr:has(td.field-status_badge .status-badge.failed) { + background: rgba(239, 68, 68, 0.12); +} + +body.model-archiveresult.change-list #result_list tbody tr:has(td.field-status_badge .status-badge.succeeded) { + background: rgba(34, 197, 94, 0.11); +} + +body.model-archiveresult.change-list #result_list tbody tr:has(td.field-status_badge .status-badge.skipped), +body.model-archiveresult.change-list #result_list tbody tr:has(td.field-status_badge .status-badge.noresults) { + background: rgba(148, 163, 184, 0.10); + opacity: 0.82; +} + +.inline-group .tabular td.original p { + margin-top: -28px; +} + +tbody .output-link { + float: right; + margin-bottom: -25px; + margin-right: -3px; + margin-top: -4px; + opacity: 0.4; + box-shadow: 4px 4px 4px rgba(0,0,0,0.1); +} +tbody .output-link:hover {opacity: 1;} + + + +@keyframes fadeIn { + 0% { opacity: 0; } + 30% { opacity: 0.1;} + 100% { opacity: 1; } +} + +.fade-in-progress-url { + animation: fadeIn 14s; +} + +/* Snapshot Progress Spinner */ +.snapshot-progress-spinner { + display: inline-block; + width: 12px; + height: 12px; + border: 
2px solid #e2e8f0; + border-top-color: #3b82f6; + border-radius: 50%; + animation: snapshot-spin 0.8s linear infinite; +} + +@keyframes snapshot-spin { + to { transform: rotate(360deg); } +} + +/* Status Badges */ +.status-badge { + display: inline-block; + padding: 2px 8px; + border-radius: 12px; + font-size: 11px; + font-weight: 500; +} +.status-badge.queued { background: #fef3c7; color: #f59e0b; } +.status-badge.started { background: #dbeafe; color: #3b82f6; } +.status-badge.sealed { background: #d1fae5; color: #10b981; } +.status-badge.succeeded { background: #d1fae5; color: #10b981; } +.status-badge.failed { background: #fee2e2; color: #ef4444; } +.status-badge.backoff { background: #fef3c7; color: #f59e0b; } +.status-badge.skipped { background: #f3f4f6; color: #6b7280; } +.status-badge.noresults { background: #f1f5f9; color: #64748b; } + +/* Progress Bar */ +.snapshot-progress-bar { + background: #e2e8f0; + border-radius: 4px; + height: 6px; + overflow: hidden; +} +.snapshot-progress-bar-fill { + height: 100%; + transition: width 0.3s ease; + border-radius: 4px; +} + +@media (max-width: 1180px) { + .model-snapshot.change-list #result_list { + display: block; + min-width: 0 !important; + width: 100% !important; + border-collapse: separate; + } + + .model-snapshot.change-list #result_list thead { + display: none; + } + + .model-snapshot.change-list #result_list tbody { + display: grid; + grid-template-columns: repeat(2, minmax(0, 1fr)); + gap: 8px; + padding: 8px; + } + + .model-snapshot.change-list #result_list tbody tr { + display: grid; + grid-template-columns: 24px minmax(0, 1fr) auto; + grid-template-areas: + "check date status" + "check title title" + "check tags tags" + "check files size"; + gap: 5px 8px; + align-items: start; + min-width: 0; + margin: 0 !important; + padding: 10px !important; + border: 1px solid #e2e8f0; + border-radius: 8px; + background: #ffffff; + } + + .model-snapshot.change-list #result_list tbody tr.selected { + border-color: 
#facc15; + background: #fffde7; + } + + .model-snapshot.change-list #result_list tbody th, + .model-snapshot.change-list #result_list tbody td { + display: block; + width: auto !important; + min-width: 0 !important; + max-width: none !important; + padding: 0 !important; + border: 0 !important; + background: transparent !important; + box-sizing: border-box; + overflow: visible; + } + + .model-snapshot.change-list #result_list tbody td.action-checkbox { + grid-area: check; + padding-top: 2px !important; + } + + .model-snapshot.change-list #result_list tbody th.field-created_at, + .model-snapshot.change-list #result_list tbody td.field-created_at { + grid-area: date; + font-size: 12px; + color: #64748b; + white-space: nowrap; + } + + .model-snapshot.change-list #result_list tbody td.field-preview_icon { + display: none; + } + + .model-snapshot.change-list #result_list tbody td.field-title_str { + grid-area: title; + } + + .model-snapshot.change-list #result_list tbody td.field-title_str a { + display: block; + max-width: 100%; + overflow-wrap: anywhere; + } + + .model-snapshot.change-list #result_list tbody td.field-tags_inline { + grid-area: tags; + } + + .model-snapshot.change-list #result_list tbody td.field-status_with_progress { + grid-area: status; + justify-self: end; + } + + .model-snapshot.change-list #result_list tbody td.field-files { + grid-area: files; + white-space: normal; + } + + .model-snapshot.change-list #result_list tbody td.field-files .files-icons, + .model-snapshot.change-list #result_list tbody td.field-files .files-icons--compact { + display: flex !important; + flex-wrap: wrap; + max-width: 100%; + gap: 4px; + } + + .model-snapshot.change-list #result_list tbody td.field-size_with_stats { + grid-area: size; + justify-self: end; + color: #64748b; + font-size: 12px; + text-align: right; + white-space: nowrap; + } +} + +@media (max-width: 720px) { + .model-snapshot.change-list #changelist #toolbar #changelist-search > div { + 
grid-template-columns: minmax(0, 1fr) auto; + } + + .model-snapshot.change-list #changelist #toolbar label[for="searchbar"] { + top: 0; + } + + .model-snapshot.change-list #changelist #toolbar .search-mode-selector { + gap: 5px !important; + } + + .model-snapshot.change-list #changelist #toolbar .search-mode-selector label { + font-size: 12px !important; + } + + .model-snapshot.change-list #changelist #changelist-form > .actions-top .action-buttons { + grid-template-columns: repeat(2, minmax(0, 1fr)); + } + + .model-snapshot.change-list #result_list tbody { + grid-template-columns: minmax(0, 1fr); + padding: 8px; + } +} + +.model-snapshot.change-list .cards { + grid-template-columns: repeat(auto-fill, minmax(250px, 1fr)); + gap: 12px; + padding: 10px 8px; +} + +.model-snapshot.change-list .cards .card { + min-width: 0; +} + +.model-snapshot.change-list .cards .card .card-info { + display: grid; + grid-template-columns: minmax(0, 1fr) auto; + gap: 5px 8px; + align-items: center; + padding: 6px 8px; + text-align: left; +} + +.model-snapshot.change-list .cards .card .card-info:not(.card-meta) > a { + grid-column: 1 / -1; + justify-self: center; + max-width: 100%; +} + +.model-snapshot.change-list .cards .card .card-info.card-meta { + grid-template-columns: 18px minmax(0, 1fr) auto; +} + +.model-snapshot.change-list .cards .card .card-info.card-meta > a { + grid-column: auto; + justify-self: center; + text-align: center; +} + +.model-snapshot.change-list .cards .card .card-info .timestamp { + display: block; + max-width: 100%; + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; +} + +.model-snapshot.change-list .cards .card .card-info.card-meta .card-size { + max-width: 72px; + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; +} + +.model-snapshot.change-list .cards .card .card-info > div { + min-width: 0; +} + +.model-snapshot.change-list .cards .card .card-info .files-icons, +.model-snapshot.change-list .cards .card .card-outputs 
.files-icons { + display: inline-flex !important; + flex-flow: row wrap !important; + justify-content: center !important; + align-items: center !important; + gap: 2px !important; + max-width: 100%; +} + +.model-snapshot.change-list .cards .card .card-info .files-icons a, +.model-snapshot.change-list .cards .card .card-info .files-icons .abx-output-icon, +.model-snapshot.change-list .cards .card .card-outputs .files-icons a, +.model-snapshot.change-list .cards .card .card-outputs .files-icons .abx-output-icon { + flex: 0 0 16px; +} + +.model-snapshot.change-list .cards .card .card-info label { + display: inline-grid; + grid-template-columns: auto 18px; + gap: 8px; + align-items: center; + justify-self: end; + width: auto; + height: auto; + margin: 0; +} + +.model-snapshot.change-list .cards .card .card-info input[type=checkbox] { + float: none; + margin: 0; +} + +.model-snapshot.change-list .cards .card .card-thumbnail { + background: #fbfafb; +} + +@media (max-width: 900px) { + .model-snapshot.change-list .cards { + grid-template-columns: repeat(auto-fill, minmax(280px, 1fr)); + } +} + +@media (max-width: 430px) { + .model-snapshot.change-list .cards { + grid-template-columns: minmax(0, 1fr); + padding: 8px 6px; + } + + .model-snapshot.change-list .cards .card .card-info { + grid-template-columns: minmax(0, 1fr) auto; + padding: 6px; + } + + .model-snapshot.change-list .cards .card .card-info.card-meta { + grid-template-columns: 18px minmax(0, 1fr) auto; + } +} + +@media (max-width: 900px) { + .model-snapshot.change-list #result_list tbody { + grid-template-columns: minmax(0, 1fr); + } +} + +@media (max-width: 430px) { + .model-snapshot.change-list #content { + padding-left: 6px; + padding-right: 6px; + } + + .model-snapshot.change-list #changelist #toolbar, + .model-snapshot.change-list #changelist #changelist-form > .actions-top { + padding: 7px !important; + } + + .model-snapshot.change-list #changelist #toolbar form input[type="submit"] { + min-width: 72px; + 
padding-left: 10px; + padding-right: 10px; + } + + .model-snapshot.change-list #changelist .actions-tags-with-buttons { + grid-template-columns: minmax(0, 1fr) 34px 34px; + } + + .model-snapshot.change-list #result_list tbody tr { + grid-template-columns: 24px minmax(0, 1fr); + grid-template-areas: + "check date" + "check title" + "check tags" + "check status" + "check files" + "check size"; + } + + .model-snapshot.change-list #result_list tbody td.field-status_with_progress, + .model-snapshot.change-list #result_list tbody td.field-size_with_stats { + justify-self: start; + text-align: left; + } +} + +body.embedded-change-list #content { + padding: 0; +} + +body.embedded-change-list #header, +body.embedded-change-list .breadcrumbs, +body.embedded-change-list #progress, +body.embedded-change-list #progress-monitor, +body.embedded-change-list #footer { + display: none; +} + +body.embedded-change-list #content-main { + width: 100%; +} + +body.embedded-change-list #changelist { + margin: 0; + border-radius: 0; + box-shadow: none; +} + +body.embedded-change-list #changelist-filter { + display: none; +} + +body.embedded-change-list.model-snapshot.change-list #changelist .changelist-form-container { + display: block; +} + +body.embedded-change-list.model-snapshot.change-list #toolbar, +body.embedded-change-list.model-snapshot.change-list #changelist .actions { + border-left: 0; + border-right: 0; + border-radius: 0; +} + +/* Snapshot changelist final layout pass. + The toolbar, action bar, results, and footer are children of the wrapper inside + .changelist-form-container, so the row layout has to be applied there. 
*/ +.model-snapshot.change-list #changelist .changelist-form-container > div { + display: grid !important; + grid-template-columns: minmax(480px, 0.8fr) minmax(720px, 1.2fr); + gap: 8px; + align-items: stretch; +} + +.model-snapshot.change-list #changelist #toolbar { + grid-column: 1; + grid-row: 1; + margin: 0 !important; + box-sizing: border-box; +} + +.model-snapshot.change-list #changelist #changelist-form > .actions-top { + grid-column: 2; + grid-row: 1; + margin: 0; + align-self: stretch; + box-sizing: border-box; +} + +.model-snapshot.change-list #changelist #changelist-form > .cards, +.model-snapshot.change-list #changelist #changelist-form > .results, +.model-snapshot.change-list #changelist #changelist-form > .changelist-footer, +.model-snapshot.change-list #changelist .paginator, +.model-snapshot.change-list #changelist .xfull { + grid-column: 1 / -1; +} + +.model-snapshot.change-list #toolbar .changelist-toolbar-row { + display: grid; + grid-template-columns: 36px minmax(0, 1fr) 96px; + gap: 8px; + align-items: start; +} + +.model-snapshot.change-list #toolbar .snapshot-view-icon-toggle { + display: inline-flex; + width: 36px; + min-width: 36px; + height: 40px; + padding: 0; + margin: 0; + align-items: center; + justify-content: center; +} + +.model-snapshot.change-list #toolbar .snapshot-view-icon { + display: grid; + width: 15px; + height: 15px; + gap: 2px; +} + +.model-snapshot.change-list #toolbar .snapshot-view-icon span { + display: block; + min-width: 0; + min-height: 0; + border: 1px solid #64748b; + border-radius: 2px; + background: #f8fafc; +} + +.model-snapshot.change-list #toolbar .snapshot-view-icon-grid { + grid-template-columns: repeat(2, 1fr); + grid-template-rows: repeat(2, 1fr); +} + +.model-snapshot.change-list #toolbar .snapshot-view-icon-list { + grid-template-rows: repeat(3, 1fr); +} + +.model-snapshot.change-list #toolbar .filter-pane-toggle { + display: inline-flex; + width: 96px; + min-width: 96px; + height: 40px; + padding: 0 
10px; + margin: 0; + align-items: center; + justify-content: center; + white-space: nowrap; +} + +.model-snapshot.change-list #toolbar #changelist-search { + min-width: 0; +} + +.model-snapshot.change-list #toolbar .search-mode-selector { + grid-column: 1 / -1; + display: flex !important; + flex-wrap: nowrap !important; + gap: 6px !important; + width: 100%; + min-width: 0; + margin: 0 !important; + overflow: visible; +} + +.model-snapshot.change-list #toolbar .search-mode-selector .search-mode-option { + flex: 0 0 auto; + height: 28px; + justify-content: center; + padding: 4px 8px !important; + white-space: nowrap; + overflow: visible; +} + +.model-snapshot.change-list #toolbar .search-mode-selector .search-mode-meta, +.model-snapshot.change-list #toolbar .search-mode-selector .search-mode-deep { + min-width: 78px !important; +} + +.model-snapshot.change-list #toolbar .search-mode-selector .search-mode-contents { + min-width: 112px !important; +} + +.model-snapshot.change-list #toolbar .small.quiet { + grid-column: 1 / -1; + margin: 0 !important; + line-height: 1.35; +} + +.model-snapshot.change-list #changelist #changelist-form > .actions-top .actions-left { + display: grid !important; + grid-template-columns: minmax(170px, 0.65fr) minmax(280px, 1.35fr) auto; + gap: 8px; + align-items: center; + min-width: 0; + width: 100%; + white-space: normal !important; +} + +.model-snapshot.change-list #changelist .actions-tags-with-buttons { + display: grid !important; + grid-template-columns: minmax(0, 1fr) 34px 34px; + gap: 0; + align-items: stretch; + min-width: 0 !important; + width: 100% !important; + max-width: none !important; + height: 34px; + box-sizing: border-box; + border: 1px solid #cbd5e1; + border-radius: 8px; + background: #ffffff; + overflow: hidden; +} + +.model-snapshot.change-list #changelist .actions-tags-with-buttons .tag-editor-container { + width: auto !important; + max-width: none !important; + height: 32px !important; + min-height: 0 !important; + 
padding: 0 9px !important; + box-sizing: border-box; + flex: 1 1 auto !important; + border: 0; + border-radius: 0; + box-shadow: none; + overflow: hidden; + flex-wrap: nowrap; +} + +.model-snapshot.change-list #changelist .actions-tags-with-buttons .tag-inline-input { + min-width: 0; + height: auto; + min-height: 0; + padding: 0; + margin: 0; + border: 0; + border-radius: 0; + box-shadow: none; + background: transparent; +} + +.model-snapshot.change-list #changelist .actions-tags-with-buttons .button[name="add_tags"], +.model-snapshot.change-list #changelist .actions-tags-with-buttons .button[name="remove_tags"] { + width: 34px; + min-width: 34px; + height: 32px; + padding: 0; + margin: 0; + border: 0; + border-left: 1px solid #cbd5e1; + border-radius: 0; + justify-content: center; +} + +.model-snapshot.change-list #changelist #changelist-form > .actions-top .action-buttons { + display: grid !important; + grid-template-columns: repeat(4, minmax(0, 1fr)); + gap: 5px; + min-width: 0; + width: 100%; +} + +.model-snapshot.change-list #changelist #changelist-form > .actions-top .action-buttons .button { + width: 100%; + min-width: 0; + height: 34px; + padding-left: 6px; + padding-right: 6px; + font-size: 12px; + margin: 0; + justify-content: center; + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; +} + +.model-snapshot.change-list #changelist #changelist-form > .actions-top .action-counter { + align-self: center; + justify-self: start; + white-space: nowrap; +} + +@media (max-width: 1420px) { + .model-snapshot.change-list #changelist .changelist-form-container > div { + grid-template-columns: minmax(420px, 0.8fr) minmax(560px, 1.2fr); + } + + .model-snapshot.change-list #changelist #changelist-form > .actions-top .actions-left { + grid-template-columns: minmax(170px, 0.55fr) minmax(340px, 1.45fr); + } + + .model-snapshot.change-list #changelist #changelist-form > .actions-top .action-buttons .button { + font-size: 11px; + } + + 
.model-snapshot.change-list #changelist #changelist-form > .actions-top .action-summary { + grid-column: 1 / -1; + grid-row: auto; + } +} + +@media (max-width: 900px) { + .model-snapshot.change-list #changelist .changelist-form-container > div { + grid-template-columns: minmax(0, 1fr); + } + + .model-snapshot.change-list #changelist #toolbar, + .model-snapshot.change-list #changelist #changelist-form > .actions-top { + grid-column: 1; + grid-row: auto; + } + + .model-snapshot.change-list #changelist #changelist-form > .actions-top .action-buttons { + grid-template-columns: repeat(2, minmax(0, 1fr)); + } + + .model-snapshot.change-list #changelist #changelist-form > .actions-top .action-summary { + overflow-x: auto; + max-width: 100%; + } +} + +@media (max-width: 520px) { + .model-snapshot.change-list #toolbar .changelist-toolbar-row { + grid-template-columns: 36px minmax(0, 1fr) 84px; + } + + .model-snapshot.change-list #toolbar .filter-pane-toggle { + width: 84px; + min-width: 84px; + padding-left: 6px; + padding-right: 6px; + } + + .model-snapshot.change-list #toolbar .search-mode-selector label span[aria-hidden="true"] { + display: none; + } +} + +#toolbar .search-input-wrap { + position: relative; + display: block; + min-width: 0; +} + +#toolbar .search-input-addon { + position: absolute; + left: 1px; + top: 1px; + bottom: 1px; + z-index: 2; + display: inline-flex; + align-items: center; + gap: 4px; + box-sizing: border-box; + max-width: calc(100% - 2px); + padding: 0 8px; + border-right: 1px solid #cbd5e1; + border-radius: 7px 0 0 7px; + background: #f8fafc; +} + +#toolbar .search-input-addon label[for="searchbar"], +.model-snapshot.change-list #content #changelist #toolbar .search-input-wrap .search-input-addon label[for="searchbar"] { + position: static !important; + display: inline-flex; + align-items: center; + justify-content: center !important; + width: 16px !important; + min-width: 16px !important; + height: 16px !important; + margin: 0 !important; + 
transform: none !important; + opacity: 0.72; + pointer-events: none; +} + +#toolbar .search-input-addon label[for="searchbar"] img, +.model-snapshot.change-list #content #changelist #toolbar .search-input-wrap .search-input-addon label[for="searchbar"] img { + display: block; + width: 16px; + height: 16px; +} + +#toolbar .search-mode-select { + max-width: 82px; + height: 28px; + padding: 0 18px 0 0; + border: 0; + outline: 0; + background: transparent; + color: #475569; + font: 600 12px/1.2 -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif; + letter-spacing: 0; +} + +#toolbar .search-mode-select:focus { + color: #0f172a; +} + +#toolbar .search-input-wrap #searchbar, +.model-snapshot.change-list #content #changelist #toolbar .search-input-wrap #searchbar { + padding-left: 42px !important; +} + +#toolbar .search-input-wrap-with-mode #searchbar, +.model-snapshot.change-list #content #changelist #toolbar .search-input-wrap-with-mode #searchbar { + padding-left: 120px !important; +} + +.model-snapshot.change-list #content #changelist #toolbar #changelist-search > div { + grid-template-columns: minmax(0, 1fr) auto !important; +} + +.model-snapshot.change-list #content #changelist #toolbar .search-input-wrap { + grid-column: 1 !important; + grid-row: 1 !important; +} + +.model-snapshot.change-list #content #changelist #toolbar form input[type="submit"] { + grid-column: 2 !important; + grid-row: 1 !important; +} + +#content form .search-input-wrap { + display: block !important; + max-width: none !important; + box-sizing: border-box; +} + +#content form .search-input-addon { + position: absolute !important; + left: 1px !important; + top: 1px !important; + bottom: 1px !important; + width: 112px !important; + padding: 0 8px !important; +} + +#content form .search-mode-select { + position: static !important; + width: auto !important; + max-width: 82px !important; + min-width: 0 !important; + height: 28px !important; + padding-left: 0 
!important; +} + +#content form .search-input-wrap #searchbar { + width: 100% !important; + max-width: none !important; + box-sizing: border-box; +} + +/* Snapshot action row visual polish. Keep this last so older Django admin button + and tag-widget rules cannot leak inconsistent type styles into the row. */ +.model-snapshot.change-list #changelist #changelist-form > .actions-top, +.model-snapshot.change-list #changelist #changelist-form > .actions-top * { + font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif; + letter-spacing: 0; +} + +.model-snapshot.change-list #changelist #changelist-form > .actions-top { + padding: 10px 14px !important; + background: #ffffff; + border-color: #e2e8f0; + border-radius: 8px; +} + +.model-snapshot.change-list #changelist #changelist-form > .actions-top .actions-left { + align-items: center; +} + +.model-snapshot.change-list #changelist .actions-tags-with-buttons { + height: 40px; + border: 1px solid #cbd5e1; + border-radius: 8px; + background: #ffffff; +} + +.model-snapshot.change-list #changelist .actions-tags-with-buttons .tag-editor-container { + height: 38px !important; + min-height: 0 !important; + padding: 0 12px !important; + align-items: center; +} + +.model-snapshot.change-list #changelist .actions-tags-with-buttons .tag-inline-input { + font-size: 14px; + font-weight: 400; + color: #334155; +} + +.model-snapshot.change-list #changelist .actions-tags-with-buttons .tag-inline-input::placeholder { + color: #94a3b8; +} + +.model-snapshot.change-list #changelist .actions-tags-with-buttons .button[name="add_tags"], +.model-snapshot.change-list #changelist .actions-tags-with-buttons .button[name="remove_tags"] { + height: 38px; + width: 40px; + min-width: 40px; + font-size: 17px; + font-weight: 600; + line-height: 1; + color: #1f2937; + background: #f6e46f; + border-left: 1px solid #d9c759; + box-shadow: none; +} + +.model-snapshot.change-list #changelist #changelist-form > 
.actions-top .action-buttons { + grid-template-columns: repeat(4, minmax(84px, 1fr)); + gap: 6px; + align-items: center; +} + +.model-snapshot.change-list #changelist #changelist-form > .actions-top .action-buttons .button { + height: 40px; + min-width: 0; + padding: 0 10px; + border-radius: 6px; + border: 0; + box-shadow: none; + font-size: 14px !important; + font-weight: 600; + line-height: 1.2; + color: #102a2c; + text-align: center; +} + +.model-snapshot.change-list #changelist #changelist-form > .actions-top .action-buttons .button[name="resnapshot_snapshot"] { + background: #4fb5ad; + color: #082f2c; +} + +.model-snapshot.change-list #changelist #changelist-form > .actions-top .action-buttons .button[name="update_snapshots"] { + background: #a8df57; + color: #17310b; +} + +.model-snapshot.change-list #changelist #changelist-form > .actions-top .action-buttons .button[name="overwrite_snapshots"] { + background: #f7ad43; + color: #3b2405; +} + +.model-snapshot.change-list #changelist #changelist-form > .actions-top .action-buttons .button[name="delete_snapshots"] { + background: #e43d79; + color: #fff7fb; +} + +.model-snapshot.change-list #changelist #changelist-form > .actions-top .action-buttons .button[name="set_snapshot_permissions"] { + background: #f6e46f; + color: #1f2937; + text-decoration: none; +} + +.model-snapshot.change-list #changelist #changelist-form > .actions-top .action-permissions-wrapper { + display: inline-flex; + min-width: 0; +} + +.model-snapshot.change-list #changelist #changelist-form > .actions-top .action-permissions-wrapper > .button { + width: 100%; +} + +.model-snapshot.change-list #changelist #changelist-form > .actions-top .action-summary { + display: inline-flex; + flex-direction: column; + grid-column: 3; + grid-row: 1; + gap: 3px; + align-items: flex-start; + align-self: center; + justify-self: start; + color: #475569; + font-size: 14px; + font-weight: 500; + line-height: 1.2; + white-space: nowrap; +} + 
+.model-snapshot.change-list #changelist #changelist-form > .actions-top .action-summary .action-selected-count, +.model-snapshot.change-list #changelist #changelist-form > .actions-top .action-summary .action-total-count { + color: #475569; + font-size: 14px; + font-weight: 500; +} + +.model-snapshot.change-list #changelist #changelist-form > .actions-top .action-summary a { + color: #2563eb; + font-size: 14px; + font-weight: 600; + text-decoration-thickness: 1px; + text-underline-offset: 2px; +} + +.model-snapshot.change-list #changelist #changelist-form > .actions-top .action-summary .action-counter, +.model-snapshot.change-list #changelist #changelist-form > .actions-top .action-summary .question, +.model-snapshot.change-list #changelist #changelist-form > .actions-top .action-summary .all, +.model-snapshot.change-list #changelist #changelist-form > .actions-top .action-summary .clear { + display: none !important; +} + +@media (max-width: 1180px) { + .model-snapshot.change-list #changelist #changelist-form > .actions-top .action-buttons { + grid-template-columns: repeat(4, minmax(78px, 1fr)); + } + + .model-snapshot.change-list #changelist #changelist-form > .actions-top .action-buttons .button { + padding: 0 7px; + font-size: 13px !important; + } +} + +/* Snapshot search input group: mode dropdown + search icon inside the field. 
*/ +.model-snapshot.change-list #toolbar #changelist-search > div { + display: grid !important; + grid-template-columns: minmax(0, 1fr) auto; + gap: 8px; + align-items: center; +} + +.model-snapshot.change-list #toolbar .search-input-wrap { + grid-column: 1; + grid-row: 1; + position: relative; + display: grid; + grid-template-columns: auto minmax(0, 1fr); + align-items: center; + width: 100%; + min-width: 0; + height: 40px; + border: 1px solid #cbd5e1; + border-radius: 8px; + background: #ffffff; + overflow: hidden; + box-sizing: border-box; +} + +.model-snapshot.change-list #toolbar .search-input-addon { + position: relative !important; + left: auto !important; + top: auto !important; + bottom: auto !important; + z-index: auto; + display: inline-flex; + align-items: center; + justify-content: center; + gap: 2px; + height: 100%; + width: 46px; + min-width: 46px; + max-width: 46px; + padding: 0; + border-right: 1px solid #e2e8f0; + border-radius: 0; + background: #f8fafc; + box-sizing: border-box; + cursor: pointer; +} + +.model-snapshot.change-list #toolbar .search-input-wrap-with-mode .search-input-addon::before { + content: ""; + position: absolute; + left: 12px; + top: 10px; + z-index: 4; + display: block; + width: 11px; + height: 11px; + border: 2px solid #64748b; + border-radius: 999px; + background: transparent; + pointer-events: none; +} + +.model-snapshot.change-list #toolbar .search-input-wrap-with-mode .search-input-addon::after { + content: ""; + position: absolute; + left: 25px; + top: 24px; + z-index: 4; + display: block; + width: 7px; + height: 2px; + border-radius: 999px; + background: #64748b; + transform: rotate(45deg); + pointer-events: none; +} + +.model-snapshot.change-list #toolbar .search-input-wrap-with-mode .search-input-addon label[for="searchbar"], +.model-snapshot.change-list #content #changelist #toolbar .search-input-wrap-with-mode .search-input-addon label[for="searchbar"] { + position: absolute !important; + z-index: 0; + display: 
none !important; + align-items: center !important; + justify-content: center !important; + width: 17px !important; + min-width: 17px !important; + height: 17px !important; + margin: 0 !important; + transform: none !important; + opacity: 0; + pointer-events: none; +} + +.model-snapshot.change-list #toolbar .search-input-wrap-with-mode .search-input-addon label[for="searchbar"] img { + width: 17px; + height: 17px; + display: block; +} + +.model-snapshot.change-list #toolbar .search-mode-select { + appearance: none; + -webkit-appearance: none; + position: absolute; + inset: 0 auto 0 0; + width: 46px !important; + min-width: 46px !important; + max-width: 46px !important; + height: 100%; + padding: 0; + border: 0; + border-radius: 0; + background: transparent; + color: #111827; + font-size: 14px; + font-weight: 500; + line-height: 1; + outline: 0; + opacity: 1; + text-indent: -9999px; + cursor: pointer; + z-index: 3; +} + +.model-snapshot.change-list #toolbar .search-mode-select option { + background: #ffffff; + color: #111827; + font-size: 14px; + line-height: 1.4; + text-indent: 0; +} + +.model-snapshot.change-list #toolbar .search-mode-caret { + position: absolute; + right: 9px; + top: 50%; + display: block; + z-index: 2; + color: #64748b; + font-size: 10px; + line-height: 1; + pointer-events: none; + transform: translateY(-45%); +} + +.model-snapshot.change-list #toolbar .search-input-wrap #searchbar, +.model-snapshot.change-list #content #changelist #toolbar .search-input-wrap #searchbar { + grid-column: 2; + grid-row: 1; + width: 100% !important; + min-width: 0; + height: 100%; + margin: 0 !important; + padding: 0 12px !important; + border: 0; + border-radius: 0; + box-shadow: none; + background: #ffffff; + color: #111827; + font-size: 14px; + font-weight: 400; + box-sizing: border-box; +} + +.model-snapshot.change-list #toolbar .search-input-wrap #searchbar:focus { + outline: 0; + box-shadow: inset 0 0 0 1px #93c5fd; +} + +.model-snapshot.change-list #toolbar 
form input[type="submit"] { + grid-column: 2; + grid-row: 1; + height: 40px; + min-width: 86px; + padding: 0 16px; + margin: 0; + border-radius: 8px; + font-size: 14px; + font-weight: 600; +} + +#content #changelist #toolbar .changelist-search-submit { + border-color: #aa1e55; +} + +body.change-list:not(.model-snapshot) #changelist #toolbar .changelist-search-submit { + flex: 0 0 auto; +} + +.model-snapshot.change-list #toolbar .changelist-search-submit { + grid-column: 2; + grid-row: 1; + height: 40px; + min-width: 86px; + padding: 0 16px; + margin: 0; + border-radius: 8px; + font-size: 14px; + font-weight: 600; + white-space: nowrap; +} + +.snapshot-changelist-widget.model-snapshot.change-list #toolbar .changelist-toolbar-row { + grid-template-columns: minmax(0, 1fr); +} + +.snapshot-changelist-widget.model-snapshot.change-list #changelist .changelist-form-container > div { + grid-template-columns: minmax(0, 1fr); +} + +.snapshot-changelist-widget.model-snapshot.change-list #changelist #toolbar, +.snapshot-changelist-widget.model-snapshot.change-list #changelist #changelist-form > .actions-top, +.snapshot-changelist-widget.model-snapshot.change-list #changelist #changelist-form > .cards, +.snapshot-changelist-widget.model-snapshot.change-list #changelist #changelist-form > .results, +.snapshot-changelist-widget.model-snapshot.change-list #changelist #changelist-form > .changelist-footer, +.snapshot-changelist-widget.model-snapshot.change-list #changelist .paginator, +.snapshot-changelist-widget.model-snapshot.change-list #changelist .xfull { + grid-column: 1; + grid-row: auto; +} + +.snapshot-changelist-widget.model-snapshot.change-list #changelist { + display: block; + width: 100%; + max-width: 100%; + min-width: 0; + flex: none; + overflow: hidden; +} + +.snapshot-changelist-widget.model-snapshot.change-list #changelist #changelist-form > .results { + overflow-x: auto; +} + +.snapshot-changelist-widget.model-snapshot.change-list #changelist #changelist-form > 
.actions-top .actions-left { + grid-template-columns: minmax(0, 1fr); +} + +.snapshot-changelist-widget.model-snapshot.change-list #toolbar .search-mode-selector { + flex-wrap: wrap !important; +} + +.snapshot-changelist-widget.model-snapshot.change-list #toolbar #changelist-search { + grid-column: 1; + grid-row: 1; + display: block !important; + width: 100%; + min-width: 0; +} + +.snapshot-changelist-widget.model-snapshot.change-list #toolbar #changelist-search > div { + display: grid !important; + grid-template-columns: minmax(0, 1fr) auto; + gap: 8px; + align-items: center; + width: 100% !important; + min-width: 0; +} + +.snapshot-changelist-widget.model-snapshot.change-list #toolbar .search-input-wrap { + grid-column: 1 !important; + grid-row: 1 !important; + width: 100% !important; +} + +.snapshot-changelist-widget.model-snapshot.change-list #toolbar .changelist-search-submit { + grid-column: 2 !important; + grid-row: 1 !important; +} + +/* Crawl changelist: match the snapshot toolbar/action layout without the grid/list toggle. 
*/ +.model-crawl.change-list #changelist .changelist-form-container > div { + display: grid !important; + grid-template-columns: minmax(420px, 1fr) auto; + grid-template-areas: + "toolbar actions" + "results results" + "footer footer"; + gap: 10px 12px; + align-items: start; + min-width: 0; +} + +.model-crawl.change-list #toolbar { + grid-area: toolbar; + width: auto; + min-width: 0; + margin: 0 !important; + position: relative; + z-index: 2; +} + +.model-crawl.change-list #toolbar .changelist-toolbar-row { + display: grid; + grid-template-columns: minmax(0, 1fr) 96px; + gap: 8px; + align-items: start; +} + +.model-crawl.change-list #toolbar #changelist-search { + display: block !important; + width: 100% !important; + min-width: 0; + position: relative; + z-index: 2; + pointer-events: auto; +} + +.model-crawl.change-list #content #changelist #toolbar #changelist-search > div, +.model-crawl.change-list #toolbar #changelist-search > div { + display: grid !important; + grid-template-columns: minmax(0, 1fr) auto; + grid-template-areas: + "search submit" + "count count"; + gap: 8px; + align-items: center; + width: 100% !important; + min-width: 0; + flex-wrap: nowrap !important; +} + +.model-crawl.change-list #toolbar .search-input-wrap { + grid-area: search; + position: relative; + z-index: 3; + display: grid !important; + grid-template-columns: 46px minmax(0, 1fr); + align-items: center; + width: 100%; + min-width: 0; + height: 40px; + border: 1px solid #cbd5e1; + border-radius: 8px; + background: #ffffff; + overflow: hidden; + box-sizing: border-box; + pointer-events: auto; +} + +.model-crawl.change-list #toolbar .search-input-addon { + position: relative !important; + left: auto !important; + top: auto !important; + bottom: auto !important; + z-index: 1; + display: inline-flex; + align-items: center; + justify-content: center; + grid-column: 1; + grid-row: 1; + width: 46px !important; + min-width: 46px !important; + max-width: 46px !important; + height: 100%; + 
padding: 0 !important; + border-right: 1px solid #e2e8f0; + border-radius: 0; + background: #f8fafc; + box-sizing: border-box; +} + +.model-crawl.change-list #toolbar .search-input-addon label { + position: static !important; + display: inline-flex; + align-items: center; + justify-content: center; + width: 17px !important; + min-width: 17px !important; + height: 17px !important; + margin: 0 !important; + transform: none !important; + opacity: 0.72; + pointer-events: none; +} + +.model-crawl.change-list #toolbar .search-input-addon img { + display: block; + width: 17px; + height: 17px; +} + +.model-crawl.change-list #toolbar .search-input-wrap #searchbar, +.model-crawl.change-list #content #changelist #toolbar .search-input-wrap #searchbar { + grid-column: 2; + grid-row: 1; + width: 100% !important; + min-width: 0; + height: 100%; + margin: 0 !important; + padding: 0 12px !important; + border: 0; + border-radius: 0; + box-shadow: none; + background: #ffffff; + color: #111827; + font-size: 14px; + font-weight: 400; + box-sizing: border-box; + position: relative; + z-index: 4; + pointer-events: auto; +} + +.model-crawl.change-list #toolbar .changelist-search-submit, +.model-crawl.change-list #toolbar .filter-pane-toggle { + display: inline-flex; + align-items: center; + justify-content: center; + height: 40px; + margin: 0; + border-radius: 8px; + font-size: 14px; + font-weight: 600; + white-space: nowrap; +} + +.model-crawl.change-list #toolbar .changelist-search-submit { + grid-area: submit; + min-width: 86px; + padding: 0 16px; +} + +.model-crawl.change-list #content #changelist #toolbar .small.quiet { + grid-area: count; + margin: -2px 0 0; + color: #64748b; + font-size: 12px; + line-height: 1.2; +} + +.model-crawl.change-list #toolbar .filter-pane-toggle { + width: 96px; + min-width: 96px; + padding: 0 10px; +} + +.model-crawl.change-list #changelist #changelist-form > .actions-top { + grid-area: actions; + justify-self: end; + width: auto; + max-width: none; + 
min-width: 0; + margin: 0 !important; + padding: 0; + background: transparent; + border: 0; + overflow: visible; +} + +.model-crawl.change-list #changelist #changelist-form > .actions-top .actions-left { + display: flex !important; + align-items: center; + justify-content: flex-end; + gap: 8px; + min-width: 0; + white-space: nowrap; +} + +.model-crawl.change-list #changelist #changelist-form > .actions-top .action-buttons { + display: grid !important; + grid-auto-flow: column; + grid-auto-columns: minmax(74px, auto); + gap: 6px; + align-items: center; +} + +.model-crawl.change-list #changelist #changelist-form > .actions-top .action-buttons .button { + display: inline-flex; + align-items: center; + gap: 6px; + height: 40px; + min-width: 0; + padding: 0 10px; + border-radius: 6px; + border: 0; + box-shadow: none; + font-size: 14px !important; + font-weight: 600; + line-height: 1.2; + text-align: center; + justify-content: center; +} + +.model-crawl.change-list #content .object-tools { + display: none !important; +} + +.model-crawl.change-list #changelist #changelist-form > .actions-top .action-buttons .button::before { + display: inline-flex; + align-items: center; + justify-content: center; + width: 14px; + min-width: 14px; + font-size: 13px; + line-height: 1; +} + +.model-crawl.change-list #changelist #changelist-form > .actions-top .action-buttons .button[name="pause_selected_crawls"]::before { + content: "⏸"; +} + +.model-crawl.change-list #changelist #changelist-form > .actions-top .action-buttons .button[name="resume_selected_crawls"]::before { + content: "▶"; +} + +.model-crawl.change-list #changelist #changelist-form > .actions-top .action-buttons .button[name="seal_selected_crawls"]::before { + content: "✓"; +} + +.model-crawl.change-list #changelist #changelist-form > .actions-top .action-buttons .button[name="delete_selected_batched"]::before { + content: "🗑"; +} + +.model-crawl.change-list #changelist #changelist-form > .actions-top 
.action-buttons .button[name="set_crawl_permissions"]::before { + content: "👁"; +} + +.model-crawl.change-list #changelist #changelist-form > .actions-top .action-buttons .button[name="pause_selected_crawls"] { + background: #4fb5ad; + color: #082f2c; +} + +.model-crawl.change-list #changelist #changelist-form > .actions-top .action-buttons .button[name="resume_selected_crawls"] { + background: #a8df57; + color: #17310b; +} + +.model-crawl.change-list #changelist #changelist-form > .actions-top .action-buttons .button[name="seal_selected_crawls"] { + background: #f7ad43; + color: #3b2405; +} + +.model-crawl.change-list #changelist #changelist-form > .actions-top .action-buttons .button[name="delete_selected_batched"] { + background: #e43d79; + color: #fff7fb; +} + +.model-crawl.change-list #changelist #changelist-form > .actions-top .action-buttons .button[name="set_crawl_permissions"] { + background: #f6e46f; + color: #1f2937; +} + +.model-crawl.change-list #changelist #changelist-form > .actions-top .action-permissions-wrapper { + display: inline-flex; + min-width: 130px; +} + +.model-crawl.change-list #changelist #changelist-form > .actions-top .action-permissions-wrapper > .button { + width: 100%; +} + +.model-crawl.change-list #changelist #changelist-form > .actions-top .action-summary { + display: inline-flex; + align-items: center; + justify-content: center; + height: 40px; + min-height: 40px; + padding: 0 4px; + margin-left: 2px; + color: #475569; + font-size: 14px; + font-weight: 500; + line-height: 1; + white-space: nowrap; + overflow: visible; +} + +.model-crawl.change-list #changelist #changelist-form > .actions-top .action-summary .action-selected-count { + display: inline-flex; + align-items: center; + height: 40px; + color: inherit; + font: inherit; + line-height: 1; +} + +.model-crawl.change-list #result_list .crawl-status-group { + display: inline-flex; + align-items: center; + gap: 4px; + flex-wrap: nowrap; +} + +.model-crawl.change-list 
#result_list .crawl-status, +.model-crawl.change-list #result_list .crawl-status-reason { + display: inline-flex; + align-items: center; + min-height: 20px; + padding: 2px 7px; + border-radius: 999px; + font-size: 12px; + font-weight: 700; + line-height: 1.2; + white-space: nowrap; +} + +.model-crawl.change-list #result_list .crawl-status { + text-transform: uppercase; +} + +.model-crawl.change-list #result_list .crawl-status--queued { + background: #fef9c3; + color: #854d0e; +} + +.model-crawl.change-list #result_list .crawl-status--started { + background: #dcfce7; + color: #166534; +} + +.model-crawl.change-list #result_list .crawl-status--paused { + background: #fee2e2; + color: #991b1b; +} + +.model-crawl.change-list #result_list .crawl-status--sealed { + background: #dbeafe; + color: #1d4ed8; +} + +.model-crawl.change-list #result_list .crawl-status-reason { + background: #f1f5f9; + color: #475569; + min-height: 18px; + padding: 1px 6px; + font-size: 10px; + font-weight: 600; + text-transform: capitalize; +} + +.model-crawl.change-list #result_list .crawl-status-reason--crawl_max_urls, +.model-crawl.change-list #result_list .crawl-status-reason--crawl_max_size, +.model-crawl.change-list #result_list .crawl-status-reason--snapshot_max_size { + background: #fee2e2; + color: #991b1b; +} + +.model-crawl.change-list #result_list .crawl-status-reason--crawl_timeout { + background: #ffedd5; + color: #9a3412; +} + +.model-crawl.change-list #changelist #changelist-form > .results, +.model-crawl.change-list #changelist #changelist-form > .changelist-footer, +.model-crawl.change-list #changelist #changelist-form > .actions-bottom, +.model-crawl.change-list #changelist .xfull { + grid-area: auto; + grid-column: 1 / -1; + width: 100%; + max-width: 100%; +} + +.model-crawl.change-list #changelist #changelist-form > .results { + grid-area: results; + overflow-x: auto; + -webkit-overflow-scrolling: touch; +} + +.model-crawl.change-list #changelist #changelist-form > 
.changelist-footer { + grid-area: footer; +} + +@media (max-width: 1180px) { + .model-crawl.change-list #changelist .changelist-form-container > div { + grid-template-columns: minmax(0, 1fr); + grid-template-areas: + "toolbar" + "actions" + "results" + "footer"; + } + + .model-crawl.change-list #changelist #changelist-form > .actions-top { + justify-self: stretch; + width: 100%; + } + + .model-crawl.change-list #changelist #changelist-form > .actions-top .actions-left { + justify-content: flex-start; + overflow-x: auto; + padding-bottom: 2px; + } +} + +@media (max-width: 720px) { + .model-crawl.change-list #toolbar .changelist-toolbar-row { + grid-template-columns: minmax(0, 1fr); + } + + .model-crawl.change-list #toolbar .filter-pane-toggle { + width: 100%; + min-width: 0; + } +} + +body.change-list #changelist #changelist-form > .actions-top .action-buttons[hidden], +body.change-list #changelist #changelist-form > .actions-top .actions-tags-with-buttons[hidden] { + display: none !important; +} diff --git a/archivebox/templates/static/admin/crawls/crawl_admin.js b/archivebox/templates/static/admin/crawls/crawl_admin.js new file mode 100644 index 0000000000..6a258c3800 --- /dev/null +++ b/archivebox/templates/static/admin/crawls/crawl_admin.js @@ -0,0 +1,16 @@ +(function() { + document.addEventListener('click', function(event) { + var button = event.target.closest('.crawl-resume-row, .crawl-pause-row'); + if (!button) return; + + var form = document.getElementById('changelist-form'); + var actionSelect = form ? form.querySelector('select[name="action"]') : null; + if (!form || !actionSelect) return; + + form.querySelectorAll('input[name="_selected_action"]').forEach(function(checkbox) { + checkbox.checked = checkbox.value === button.getAttribute('data-crawl-id'); + }); + actionSelect.value = button.classList.contains('crawl-pause-row') ? 
'pause_selected_crawls' : 'resume_selected_crawls'; + form.submit(); + }); +})(); diff --git a/archivebox/templates/static/admin/crawls/crawl_change.css b/archivebox/templates/static/admin/crawls/crawl_change.css new file mode 100644 index 0000000000..fe03986276 --- /dev/null +++ b/archivebox/templates/static/admin/crawls/crawl_change.css @@ -0,0 +1,294 @@ +body.model-crawl.change-form #content-main form fieldset.crawl-admin-overview, +body.model-crawl.change-form #content-main form fieldset.crawl-admin-config { + flex: 1 1 100% !important; + max-width: 100% !important; + min-width: 100% !important; +} + +body.model-crawl.change-form #content { + max-width: none; +} + +body.model-crawl.change-form #content-main form fieldset.crawl-admin-overview > div { + padding: 14px; +} + +body.model-crawl.change-form #content-main form fieldset.crawl-admin-overview .form-row { + padding: 10px 0; +} + +body.model-crawl.change-form #content-main form fieldset.crawl-admin-overview .form-multiline { + display: grid !important; + grid-template-columns: repeat(4, minmax(0, 1fr)); + gap: 14px; + align-items: start; + width: 100%; + max-width: 100% !important; +} + +body.model-crawl.change-form #content-main form fieldset.crawl-admin-overview .form-multiline > div { + min-width: 0; + margin: 0 !important; +} + +body.model-crawl.change-form #content-main form fieldset.crawl-admin-overview .form-multiline input, +body.model-crawl.change-form #content-main form fieldset.crawl-admin-overview .form-multiline select, +body.model-crawl.change-form #content-main form fieldset.crawl-admin-overview .form-multiline textarea, +body.model-crawl.change-form #content-main form fieldset.crawl-admin-overview .form-multiline .readonly { + width: 100% !important; +} + +body.model-crawl.change-form #content-main form fieldset.crawl-admin-overview .form-row.field-max_depth.field-max_urls.field-crawl_max_size .help, +body.model-crawl.change-form #content-main form fieldset.crawl-admin-overview 
.timezonewarning { + display: none !important; +} + +body.model-crawl.change-form #content-main form fieldset.crawl-admin-overview .form-row.field-stop_reason_display { + padding: 4px 0; +} + +body.model-crawl.change-form #content-main form fieldset.crawl-admin-overview .form-row.field-stop_reason_display .readonly { + min-height: 0; + padding: 6px 8px !important; +} + +body.model-crawl.change-form #content-main form fieldset.crawl-admin-overview .field-notes, +body.model-crawl.change-form #content-main form fieldset.crawl-admin-overview .field-tags_editor { + min-width: 0; +} + +body.model-crawl.change-form #content-main form fieldset.crawl-admin-overview .form-row.field-notes.field-tags_editor .form-multiline { + grid-template-columns: repeat(2, minmax(0, 1fr)); +} + +body.model-crawl.change-form #content-main form fieldset.crawl-admin-overview .field-url_filters > div, +body.model-crawl.change-form #content-main form fieldset.crawl-admin-overview .field-url_filters #id_url_filters_container { + width: 100%; + max-width: none; + min-width: 0; +} + +body.model-crawl.change-form #content-main form fieldset.crawl-admin-config > div, +body.model-crawl.change-form #content-main form fieldset.crawl-admin-config .form-row, +body.model-crawl.change-form #content-main form fieldset.crawl-admin-config .field-config, +body.model-crawl.change-form #content-main form fieldset.crawl-admin-config .field-config > div, +body.model-crawl.change-form #content-main form fieldset.crawl-admin-config .key-value-editor { + width: 100% !important; + max-width: none !important; +} + +body.model-crawl.change-form #content-main form fieldset.crawl-admin-config .key-value-rows { + display: grid; + grid-template-columns: repeat(2, minmax(0, 1fr)); + gap: 12px; +} + +body.model-crawl.change-form #content-main form fieldset.crawl-admin-config .key-value-row { + margin: 0 !important; + padding: 12px; + border: 1px solid #e2e8f0; + border-radius: 8px; + background: #fbfdff; +} + 
+body.model-crawl.change-form #content-main form fieldset.crawl-admin-config .kv-inputs { /* three grid tracks; NOTE(review): presumably key, value and a fixed 34px control column, confirm against the widget markup */ + display: grid !important; + grid-template-columns: minmax(180px, 0.42fr) minmax(260px, 1fr) 34px; + gap: 10px !important; + align-items: center !important; +} +/* Key and value inputs fill their track; min-width: 0 and flex: none keep other sizing from widening them. */ +body.model-crawl.change-form #content-main form fieldset.crawl-admin-config .kv-key, +body.model-crawl.change-form #content-main form fieldset.crawl-admin-config .kv-value { + width: 100% !important; + min-width: 0; + flex: none !important; + font-size: 13.5px !important; + line-height: 1.45 !important; + padding: 8px 10px !important; +} +/* Help text: 12px, grey-blue, with a 7px top margin. */ +body.model-crawl.change-form #content-main form fieldset.crawl-admin-config .kv-help { + margin-top: 7px !important; + color: #526070 !important; + font-size: 12px !important; + line-height: 1.35; +} +/* Resume tool: inline flex row, children vertically centred with an 8px gap. */ +.archivebox-crawl-resume-tool { + display: inline-flex; + align-items: center; + gap: 8px; +} + +.crawl-resume-action-form { + display: inline-flex; + margin: 0; +} +/* Stop-reason chip: bordered, rounded, single-line label (white-space: nowrap). */ +.crawl-stop-reason, +.crawl-stop-reason-inline { + display: inline-flex; + align-items: center; + gap: 5px; + padding: 3px 8px; + border: 1px solid #cbd5e1; + border-radius: 6px; + background: #f8fafc; + color: #334155; + font-size: 12px; + line-height: 1.25; + white-space: nowrap; +} +/* Lighter grey text for the empty stop-reason variant and for muted resume text. */ +.crawl-stop-reason--empty, +.crawl-resume-muted { + color: #94a3b8; +} + +.crawl-status-group { + display: inline-flex; + align-items: center; + gap: 4px; + flex-wrap: nowrap; +} +/* Shared pill base for the status badge and the status-reason badge (999px radius gives fully rounded ends). */ +.crawl-status, +.crawl-status-reason { + display: inline-flex; + align-items: center; + min-height: 20px; + padding: 2px 7px; + border-radius: 999px; + color: #334155; + font-size: 12px; + font-weight: 700; + line-height: 1.2; + white-space: nowrap; +} + +.crawl-status { + text-transform: uppercase; +} +/* Per-status colours: queued = yellow, started = green, paused = red, sealed = blue. */ +.crawl-status--queued { + background: #fef9c3; + color: #854d0e; +} + +.crawl-status--started { + background: #dcfce7; + color: #166534; +} + +.crawl-status--paused { + background: #fee2e2; + color: #991b1b; +} + +.crawl-status--sealed { + background: 
#dbeafe; + color: #1d4ed8; +} +/* Reason badge: overrides the shared pill base with smaller metrics (18px / 10px / weight 600 instead of 20px / 12px / weight 700). */ +.crawl-status-reason { + background: #f1f5f9; + color: #475569; + min-height: 18px; + padding: 1px 6px; + font-size: 10px; + font-weight: 600; + text-transform: capitalize; +} +/* The three max-urls / max-size reasons use the same red pair as the paused status; the timeout reason uses orange. */ +.crawl-status-reason--crawl_max_urls, +.crawl-status-reason--crawl_max_size, +.crawl-status-reason--snapshot_max_size { + background: #fee2e2; + color: #991b1b; +} + +.crawl-status-reason--crawl_timeout { + background: #ffedd5; + color: #9a3412; +} + +.crawl-resume-row, +.crawl-resume-submit, +.crawl-pause-row, +.crawl-pause-submit { + cursor: pointer; +} +/* Snapshots fieldset: the fieldset and every wrapper down to the snapshots_changelist field are forced to 100% width. */ +body.model-crawl.change-form #content-main form fieldset.crawl-admin-snapshots, +body.model-crawl.change-form #content-main form fieldset.crawl-admin-snapshots > div, +body.model-crawl.change-form #content-main form fieldset.crawl-admin-snapshots .form-row, +body.model-crawl.change-form #content-main form fieldset.crawl-admin-snapshots .field-snapshots_changelist, +body.model-crawl.change-form #content-main form fieldset.crawl-admin-snapshots .field-snapshots_changelist > div { + flex: 1 1 100% !important; + max-width: 100% !important; + min-width: 100% !important; + width: 100% !important; +} + +body.model-crawl.change-form #content-main form fieldset.crawl-admin-snapshots > div { + padding: 0; +} + +body.model-crawl.change-form #content-main form fieldset.crawl-admin-snapshots .form-row, +body.model-crawl.change-form #content-main form fieldset.crawl-admin-snapshots .field-snapshots_changelist { + padding: 0; + border: 0; +} +/* Hide the field label of the snapshots_changelist field. */ +body.model-crawl.change-form #content-main form fieldset.crawl-admin-snapshots .field-snapshots_changelist > label { + display: none !important; +} +/* Reset padding, border, background and font on .readonly inside this fieldset. */ +body.model-crawl.change-form #content-main form fieldset.crawl-admin-snapshots .readonly { + padding: 0 !important; + border: 0 !important; + background: transparent !important; + font-family: inherit !important; + line-height: inherit !important; +} +/* Embed container: only the bottom corners are rounded; overflow: hidden clips children to that shape. */ +.crawl-snapshots-embed { + width: 100%; + overflow: hidden; + border-radius: 
0 0 12px 12px; + background: #fff; +} +/* Toolbar: flex row with its children pushed to opposite ends (space-between). */ +.crawl-snapshots-embed__toolbar { + display: flex; + align-items: center; + justify-content: space-between; + gap: 12px; + padding: 10px 14px; + border-bottom: 1px solid #e2e8f0; + background: #f8fafc; +} + +.crawl-snapshots-embed__toolbar strong { + color: #334155; + font-size: 13px; +} + +.crawl-snapshots-embed__toolbar .button { + padding: 6px 10px; + font-size: 12px; +} +/* At viewport widths of 900px or less, the multi-column grids and the key/value inputs collapse to a single column. */ +@media (max-width: 900px) { + body.model-crawl.change-form #content-main form fieldset.crawl-admin-overview .form-multiline, + body.model-crawl.change-form #content-main form fieldset.crawl-admin-overview .form-row.field-notes.field-tags_editor .form-multiline, + body.model-crawl.change-form #content-main form fieldset.crawl-admin-config .key-value-rows { + grid-template-columns: 1fr; + } + + body.model-crawl.change-form #content-main form fieldset.crawl-admin-config .kv-inputs { + grid-template-columns: 1fr; + align-items: stretch !important; + } +} diff --git a/archivebox/templates/static/archive.png b/archivebox/templates/static/archive.png old mode 100644 new mode 100755 index 009e561330..898abdd56e Binary files a/archivebox/templates/static/archive.png and b/archivebox/templates/static/archive.png differ diff --git a/archivebox/templates/static/bootstrap.min.css b/archivebox/templates/static/bootstrap.min.css new file mode 100755 index 0000000000..8538d2f582 --- /dev/null +++ b/archivebox/templates/static/bootstrap.min.css @@ -0,0 +1,6 @@ +/*! + * Bootstrap v4.0.0-alpha.6 (https://getbootstrap.com) + * Copyright 2011-2017 The Bootstrap Authors + * Copyright 2011-2017 Twitter, Inc. + * Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE) + *//*! 
normalize.css v5.0.0 | MIT License | github.com/necolas/normalize.css */html{font-family:sans-serif;line-height:1.15;-ms-text-size-adjust:100%;-webkit-text-size-adjust:100%}body{margin:0}article,aside,footer,header,nav,section{display:block}h1{font-size:2em;margin:.67em 0}figcaption,figure,main{display:block}figure{margin:1em 40px}hr{-webkit-box-sizing:content-box;box-sizing:content-box;height:0;overflow:visible}pre{font-family:monospace,monospace;font-size:1em}a{background-color:transparent;-webkit-text-decoration-skip:objects}a:active,a:hover{outline-width:0}abbr[title]{border-bottom:none;text-decoration:underline;text-decoration:underline dotted}b,strong{font-weight:inherit}b,strong{font-weight:bolder}code,kbd,samp{font-family:monospace,monospace;font-size:1em}dfn{font-style:italic}mark{background-color:#ff0;color:#000}small{font-size:80%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}audio,video{display:inline-block}audio:not([controls]){display:none;height:0}img{border-style:none}svg:not(:root){overflow:hidden}button,input,optgroup,select,textarea{font-family:sans-serif;font-size:100%;line-height:1.15;margin:0}button,input{overflow:visible}button,select{text-transform:none}[type=reset],[type=submit],button,html [type=button]{-webkit-appearance:button}[type=button]::-moz-focus-inner,[type=reset]::-moz-focus-inner,[type=submit]::-moz-focus-inner,button::-moz-focus-inner{border-style:none;padding:0}[type=button]:-moz-focusring,[type=reset]:-moz-focusring,[type=submit]:-moz-focusring,button:-moz-focusring{outline:1px dotted ButtonText}fieldset{border:1px solid silver;margin:0 2px;padding:.35em .625em 
.75em}legend{-webkit-box-sizing:border-box;box-sizing:border-box;color:inherit;display:table;max-width:100%;padding:0;white-space:normal}progress{display:inline-block;vertical-align:baseline}textarea{overflow:auto}[type=checkbox],[type=radio]{-webkit-box-sizing:border-box;box-sizing:border-box;padding:0}[type=number]::-webkit-inner-spin-button,[type=number]::-webkit-outer-spin-button{height:auto}[type=search]{-webkit-appearance:textfield;outline-offset:-2px}[type=search]::-webkit-search-cancel-button,[type=search]::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{-webkit-appearance:button;font:inherit}details,menu{display:block}summary{display:list-item}canvas{display:inline-block}template{display:none}[hidden]{display:none}@media print{*,::after,::before,blockquote::first-letter,blockquote::first-line,div::first-letter,div::first-line,li::first-letter,li::first-line,p::first-letter,p::first-line{text-shadow:none!important;-webkit-box-shadow:none!important;box-shadow:none!important}a,a:visited{text-decoration:underline}abbr[title]::after{content:" (" attr(title) ")"}pre{white-space:pre-wrap!important}blockquote,pre{border:1px solid #999;page-break-inside:avoid}thead{display:table-header-group}img,tr{page-break-inside:avoid}h2,h3,p{orphans:3;widows:3}h2,h3{page-break-after:avoid}.navbar{display:none}.badge{border:1px solid #000}.table{border-collapse:collapse!important}.table td,.table th{background-color:#fff!important}.table-bordered td,.table-bordered th{border:1px solid #ddd!important}}html{-webkit-box-sizing:border-box;box-sizing:border-box}*,::after,::before{-webkit-box-sizing:inherit;box-sizing:inherit}@-ms-viewport{width:device-width}html{-ms-overflow-style:scrollbar;-webkit-tap-highlight-color:transparent}body{font-family:-apple-system,system-ui,BlinkMacSystemFont,"Segoe UI",Roboto,"Helvetica 
Neue",Arial,sans-serif;font-size:1rem;font-weight:400;line-height:1.5;color:#292b2c;background-color:#fff}[tabindex="-1"]:focus{outline:0!important}h1,h2,h3,h4,h5,h6{margin-top:0;margin-bottom:.5rem}p{margin-top:0;margin-bottom:1rem}abbr[data-original-title],abbr[title]{cursor:help}address{margin-bottom:1rem;font-style:normal;line-height:inherit}dl,ol,ul{margin-top:0;margin-bottom:1rem}ol ol,ol ul,ul ol,ul ul{margin-bottom:0}dt{font-weight:700}dd{margin-bottom:.5rem;margin-left:0}blockquote{margin:0 0 1rem}a{color:#0275d8;text-decoration:none}a:focus,a:hover{color:#014c8c;text-decoration:underline}a:not([href]):not([tabindex]){color:inherit;text-decoration:none}a:not([href]):not([tabindex]):focus,a:not([href]):not([tabindex]):hover{color:inherit;text-decoration:none}a:not([href]):not([tabindex]):focus{outline:0}pre{margin-top:0;margin-bottom:1rem;overflow:auto}figure{margin:0 0 1rem}img{vertical-align:middle}[role=button]{cursor:pointer}[role=button],a,area,button,input,label,select,summary,textarea{-ms-touch-action:manipulation;touch-action:manipulation}table{border-collapse:collapse;background-color:transparent}caption{padding-top:.75rem;padding-bottom:.75rem;color:#636c72;text-align:left;caption-side:bottom}th{text-align:left}label{display:inline-block;margin-bottom:.5rem}button:focus{outline:1px dotted;outline:5px auto 
-webkit-focus-ring-color}button,input,select,textarea{line-height:inherit}input[type=checkbox]:disabled,input[type=radio]:disabled{cursor:not-allowed}input[type=date],input[type=time],input[type=datetime-local],input[type=month]{-webkit-appearance:listbox}textarea{resize:vertical}fieldset{min-width:0;padding:0;margin:0;border:0}legend{display:block;width:100%;padding:0;margin-bottom:.5rem;font-size:1.5rem;line-height:inherit}input[type=search]{-webkit-appearance:none}output{display:inline-block}[hidden]{display:none!important}.h1,.h2,.h3,.h4,.h5,.h6,h1,h2,h3,h4,h5,h6{margin-bottom:.5rem;font-family:inherit;font-weight:500;line-height:1.1;color:inherit}.h1,h1{font-size:2.5rem}.h2,h2{font-size:2rem}.h3,h3{font-size:1.75rem}.h4,h4{font-size:1.5rem}.h5,h5{font-size:1.25rem}.h6,h6{font-size:1rem}.lead{font-size:1.25rem;font-weight:300}.display-1{font-size:6rem;font-weight:300;line-height:1.1}.display-2{font-size:5.5rem;font-weight:300;line-height:1.1}.display-3{font-size:4.5rem;font-weight:300;line-height:1.1}.display-4{font-size:3.5rem;font-weight:300;line-height:1.1}hr{margin-top:1rem;margin-bottom:1rem;border:0;border-top:1px solid rgba(0,0,0,.1)}.small,small{font-size:80%;font-weight:400}.mark,mark{padding:.2em;background-color:#fcf8e3}.list-unstyled{padding-left:0;list-style:none}.list-inline{padding-left:0;list-style:none}.list-inline-item{display:inline-block}.list-inline-item:not(:last-child){margin-right:5px}.initialism{font-size:90%;text-transform:uppercase}.blockquote{padding:.5rem 1rem;margin-bottom:1rem;font-size:1.25rem;border-left:.25rem solid #eceeef}.blockquote-footer{display:block;font-size:80%;color:#636c72}.blockquote-footer::before{content:"\2014 \00A0"}.blockquote-reverse{padding-right:1rem;padding-left:0;text-align:right;border-right:.25rem solid #eceeef;border-left:0}.blockquote-reverse .blockquote-footer::before{content:""}.blockquote-reverse .blockquote-footer::after{content:"\00A0 
\2014"}.img-fluid{max-width:100%;height:auto}.img-thumbnail{padding:.25rem;background-color:#fff;border:1px solid #ddd;border-radius:.25rem;-webkit-transition:all .2s ease-in-out;-o-transition:all .2s ease-in-out;transition:all .2s ease-in-out;max-width:100%;height:auto}.figure{display:inline-block}.figure-img{margin-bottom:.5rem;line-height:1}.figure-caption{font-size:90%;color:#636c72}code,kbd,pre,samp{font-family:Menlo,Monaco,Consolas,"Liberation Mono","Courier New",monospace}code{padding:.2rem .4rem;font-size:90%;color:#bd4147;background-color:#f7f7f9;border-radius:.25rem}a>code{padding:0;color:inherit;background-color:inherit}kbd{padding:.2rem .4rem;font-size:90%;color:#fff;background-color:#292b2c;border-radius:.2rem}kbd kbd{padding:0;font-size:100%;font-weight:700}pre{display:block;margin-top:0;margin-bottom:1rem;font-size:90%;color:#292b2c}pre code{padding:0;font-size:inherit;color:inherit;background-color:transparent;border-radius:0}.pre-scrollable{max-height:340px;overflow-y:scroll}.container{position:relative;margin-left:auto;margin-right:auto;padding-right:15px;padding-left:15px}@media (min-width:576px){.container{padding-right:15px;padding-left:15px}}@media (min-width:768px){.container{padding-right:15px;padding-left:15px}}@media (min-width:992px){.container{padding-right:15px;padding-left:15px}}@media (min-width:1200px){.container{padding-right:15px;padding-left:15px}}@media (min-width:576px){.container{width:540px;max-width:100%}}@media (min-width:768px){.container{width:720px;max-width:100%}}@media (min-width:992px){.container{width:960px;max-width:100%}}@media (min-width:1200px){.container{width:1140px;max-width:100%}}.container-fluid{position:relative;margin-left:auto;margin-right:auto;padding-right:15px;padding-left:15px}@media (min-width:576px){.container-fluid{padding-right:15px;padding-left:15px}}@media (min-width:768px){.container-fluid{padding-right:15px;padding-left:15px}}@media 
(min-width:992px){.container-fluid{padding-right:15px;padding-left:15px}}@media (min-width:1200px){.container-fluid{padding-right:15px;padding-left:15px}}.row{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-wrap:wrap;-ms-flex-wrap:wrap;flex-wrap:wrap;margin-right:-15px;margin-left:-15px}@media (min-width:576px){.row{margin-right:-15px;margin-left:-15px}}@media (min-width:768px){.row{margin-right:-15px;margin-left:-15px}}@media (min-width:992px){.row{margin-right:-15px;margin-left:-15px}}@media (min-width:1200px){.row{margin-right:-15px;margin-left:-15px}}.no-gutters{margin-right:0;margin-left:0}.no-gutters>.col,.no-gutters>[class*=col-]{padding-right:0;padding-left:0}.col,.col-1,.col-10,.col-11,.col-12,.col-2,.col-3,.col-4,.col-5,.col-6,.col-7,.col-8,.col-9,.col-lg,.col-lg-1,.col-lg-10,.col-lg-11,.col-lg-12,.col-lg-2,.col-lg-3,.col-lg-4,.col-lg-5,.col-lg-6,.col-lg-7,.col-lg-8,.col-lg-9,.col-md,.col-md-1,.col-md-10,.col-md-11,.col-md-12,.col-md-2,.col-md-3,.col-md-4,.col-md-5,.col-md-6,.col-md-7,.col-md-8,.col-md-9,.col-sm,.col-sm-1,.col-sm-10,.col-sm-11,.col-sm-12,.col-sm-2,.col-sm-3,.col-sm-4,.col-sm-5,.col-sm-6,.col-sm-7,.col-sm-8,.col-sm-9,.col-xl,.col-xl-1,.col-xl-10,.col-xl-11,.col-xl-12,.col-xl-2,.col-xl-3,.col-xl-4,.col-xl-5,.col-xl-6,.col-xl-7,.col-xl-8,.col-xl-9{position:relative;width:100%;min-height:1px;padding-right:15px;padding-left:15px}@media 
(min-width:576px){.col,.col-1,.col-10,.col-11,.col-12,.col-2,.col-3,.col-4,.col-5,.col-6,.col-7,.col-8,.col-9,.col-lg,.col-lg-1,.col-lg-10,.col-lg-11,.col-lg-12,.col-lg-2,.col-lg-3,.col-lg-4,.col-lg-5,.col-lg-6,.col-lg-7,.col-lg-8,.col-lg-9,.col-md,.col-md-1,.col-md-10,.col-md-11,.col-md-12,.col-md-2,.col-md-3,.col-md-4,.col-md-5,.col-md-6,.col-md-7,.col-md-8,.col-md-9,.col-sm,.col-sm-1,.col-sm-10,.col-sm-11,.col-sm-12,.col-sm-2,.col-sm-3,.col-sm-4,.col-sm-5,.col-sm-6,.col-sm-7,.col-sm-8,.col-sm-9,.col-xl,.col-xl-1,.col-xl-10,.col-xl-11,.col-xl-12,.col-xl-2,.col-xl-3,.col-xl-4,.col-xl-5,.col-xl-6,.col-xl-7,.col-xl-8,.col-xl-9{padding-right:15px;padding-left:15px}}@media (min-width:768px){.col,.col-1,.col-10,.col-11,.col-12,.col-2,.col-3,.col-4,.col-5,.col-6,.col-7,.col-8,.col-9,.col-lg,.col-lg-1,.col-lg-10,.col-lg-11,.col-lg-12,.col-lg-2,.col-lg-3,.col-lg-4,.col-lg-5,.col-lg-6,.col-lg-7,.col-lg-8,.col-lg-9,.col-md,.col-md-1,.col-md-10,.col-md-11,.col-md-12,.col-md-2,.col-md-3,.col-md-4,.col-md-5,.col-md-6,.col-md-7,.col-md-8,.col-md-9,.col-sm,.col-sm-1,.col-sm-10,.col-sm-11,.col-sm-12,.col-sm-2,.col-sm-3,.col-sm-4,.col-sm-5,.col-sm-6,.col-sm-7,.col-sm-8,.col-sm-9,.col-xl,.col-xl-1,.col-xl-10,.col-xl-11,.col-xl-12,.col-xl-2,.col-xl-3,.col-xl-4,.col-xl-5,.col-xl-6,.col-xl-7,.col-xl-8,.col-xl-9{padding-right:15px;padding-left:15px}}@media 
(min-width:992px){.col,.col-1,.col-10,.col-11,.col-12,.col-2,.col-3,.col-4,.col-5,.col-6,.col-7,.col-8,.col-9,.col-lg,.col-lg-1,.col-lg-10,.col-lg-11,.col-lg-12,.col-lg-2,.col-lg-3,.col-lg-4,.col-lg-5,.col-lg-6,.col-lg-7,.col-lg-8,.col-lg-9,.col-md,.col-md-1,.col-md-10,.col-md-11,.col-md-12,.col-md-2,.col-md-3,.col-md-4,.col-md-5,.col-md-6,.col-md-7,.col-md-8,.col-md-9,.col-sm,.col-sm-1,.col-sm-10,.col-sm-11,.col-sm-12,.col-sm-2,.col-sm-3,.col-sm-4,.col-sm-5,.col-sm-6,.col-sm-7,.col-sm-8,.col-sm-9,.col-xl,.col-xl-1,.col-xl-10,.col-xl-11,.col-xl-12,.col-xl-2,.col-xl-3,.col-xl-4,.col-xl-5,.col-xl-6,.col-xl-7,.col-xl-8,.col-xl-9{padding-right:15px;padding-left:15px}}@media (min-width:1200px){.col,.col-1,.col-10,.col-11,.col-12,.col-2,.col-3,.col-4,.col-5,.col-6,.col-7,.col-8,.col-9,.col-lg,.col-lg-1,.col-lg-10,.col-lg-11,.col-lg-12,.col-lg-2,.col-lg-3,.col-lg-4,.col-lg-5,.col-lg-6,.col-lg-7,.col-lg-8,.col-lg-9,.col-md,.col-md-1,.col-md-10,.col-md-11,.col-md-12,.col-md-2,.col-md-3,.col-md-4,.col-md-5,.col-md-6,.col-md-7,.col-md-8,.col-md-9,.col-sm,.col-sm-1,.col-sm-10,.col-sm-11,.col-sm-12,.col-sm-2,.col-sm-3,.col-sm-4,.col-sm-5,.col-sm-6,.col-sm-7,.col-sm-8,.col-sm-9,.col-xl,.col-xl-1,.col-xl-10,.col-xl-11,.col-xl-12,.col-xl-2,.col-xl-3,.col-xl-4,.col-xl-5,.col-xl-6,.col-xl-7,.col-xl-8,.col-xl-9{padding-right:15px;padding-left:15px}}.col{-webkit-flex-basis:0;-ms-flex-preferred-size:0;flex-basis:0;-webkit-box-flex:1;-webkit-flex-grow:1;-ms-flex-positive:1;flex-grow:1;max-width:100%}.col-auto{-webkit-box-flex:0;-webkit-flex:0 0 auto;-ms-flex:0 0 auto;flex:0 0 auto;width:auto}.col-1{-webkit-box-flex:0;-webkit-flex:0 0 8.333333%;-ms-flex:0 0 8.333333%;flex:0 0 8.333333%;max-width:8.333333%}.col-2{-webkit-box-flex:0;-webkit-flex:0 0 16.666667%;-ms-flex:0 0 16.666667%;flex:0 0 16.666667%;max-width:16.666667%}.col-3{-webkit-box-flex:0;-webkit-flex:0 0 25%;-ms-flex:0 0 25%;flex:0 0 25%;max-width:25%}.col-4{-webkit-box-flex:0;-webkit-flex:0 0 33.333333%;-ms-flex:0 0 
33.333333%;flex:0 0 33.333333%;max-width:33.333333%}.col-5{-webkit-box-flex:0;-webkit-flex:0 0 41.666667%;-ms-flex:0 0 41.666667%;flex:0 0 41.666667%;max-width:41.666667%}.col-6{-webkit-box-flex:0;-webkit-flex:0 0 50%;-ms-flex:0 0 50%;flex:0 0 50%;max-width:50%}.col-7{-webkit-box-flex:0;-webkit-flex:0 0 58.333333%;-ms-flex:0 0 58.333333%;flex:0 0 58.333333%;max-width:58.333333%}.col-8{-webkit-box-flex:0;-webkit-flex:0 0 66.666667%;-ms-flex:0 0 66.666667%;flex:0 0 66.666667%;max-width:66.666667%}.col-9{-webkit-box-flex:0;-webkit-flex:0 0 75%;-ms-flex:0 0 75%;flex:0 0 75%;max-width:75%}.col-10{-webkit-box-flex:0;-webkit-flex:0 0 83.333333%;-ms-flex:0 0 83.333333%;flex:0 0 83.333333%;max-width:83.333333%}.col-11{-webkit-box-flex:0;-webkit-flex:0 0 91.666667%;-ms-flex:0 0 91.666667%;flex:0 0 91.666667%;max-width:91.666667%}.col-12{-webkit-box-flex:0;-webkit-flex:0 0 100%;-ms-flex:0 0 100%;flex:0 0 100%;max-width:100%}.pull-0{right:auto}.pull-1{right:8.333333%}.pull-2{right:16.666667%}.pull-3{right:25%}.pull-4{right:33.333333%}.pull-5{right:41.666667%}.pull-6{right:50%}.pull-7{right:58.333333%}.pull-8{right:66.666667%}.pull-9{right:75%}.pull-10{right:83.333333%}.pull-11{right:91.666667%}.pull-12{right:100%}.push-0{left:auto}.push-1{left:8.333333%}.push-2{left:16.666667%}.push-3{left:25%}.push-4{left:33.333333%}.push-5{left:41.666667%}.push-6{left:50%}.push-7{left:58.333333%}.push-8{left:66.666667%}.push-9{left:75%}.push-10{left:83.333333%}.push-11{left:91.666667%}.push-12{left:100%}.offset-1{margin-left:8.333333%}.offset-2{margin-left:16.666667%}.offset-3{margin-left:25%}.offset-4{margin-left:33.333333%}.offset-5{margin-left:41.666667%}.offset-6{margin-left:50%}.offset-7{margin-left:58.333333%}.offset-8{margin-left:66.666667%}.offset-9{margin-left:75%}.offset-10{margin-left:83.333333%}.offset-11{margin-left:91.666667%}@media 
(min-width:576px){.col-sm{-webkit-flex-basis:0;-ms-flex-preferred-size:0;flex-basis:0;-webkit-box-flex:1;-webkit-flex-grow:1;-ms-flex-positive:1;flex-grow:1;max-width:100%}.col-sm-auto{-webkit-box-flex:0;-webkit-flex:0 0 auto;-ms-flex:0 0 auto;flex:0 0 auto;width:auto}.col-sm-1{-webkit-box-flex:0;-webkit-flex:0 0 8.333333%;-ms-flex:0 0 8.333333%;flex:0 0 8.333333%;max-width:8.333333%}.col-sm-2{-webkit-box-flex:0;-webkit-flex:0 0 16.666667%;-ms-flex:0 0 16.666667%;flex:0 0 16.666667%;max-width:16.666667%}.col-sm-3{-webkit-box-flex:0;-webkit-flex:0 0 25%;-ms-flex:0 0 25%;flex:0 0 25%;max-width:25%}.col-sm-4{-webkit-box-flex:0;-webkit-flex:0 0 33.333333%;-ms-flex:0 0 33.333333%;flex:0 0 33.333333%;max-width:33.333333%}.col-sm-5{-webkit-box-flex:0;-webkit-flex:0 0 41.666667%;-ms-flex:0 0 41.666667%;flex:0 0 41.666667%;max-width:41.666667%}.col-sm-6{-webkit-box-flex:0;-webkit-flex:0 0 50%;-ms-flex:0 0 50%;flex:0 0 50%;max-width:50%}.col-sm-7{-webkit-box-flex:0;-webkit-flex:0 0 58.333333%;-ms-flex:0 0 58.333333%;flex:0 0 58.333333%;max-width:58.333333%}.col-sm-8{-webkit-box-flex:0;-webkit-flex:0 0 66.666667%;-ms-flex:0 0 66.666667%;flex:0 0 66.666667%;max-width:66.666667%}.col-sm-9{-webkit-box-flex:0;-webkit-flex:0 0 75%;-ms-flex:0 0 75%;flex:0 0 75%;max-width:75%}.col-sm-10{-webkit-box-flex:0;-webkit-flex:0 0 83.333333%;-ms-flex:0 0 83.333333%;flex:0 0 83.333333%;max-width:83.333333%}.col-sm-11{-webkit-box-flex:0;-webkit-flex:0 0 91.666667%;-ms-flex:0 0 91.666667%;flex:0 0 91.666667%;max-width:91.666667%}.col-sm-12{-webkit-box-flex:0;-webkit-flex:0 0 100%;-ms-flex:0 0 100%;flex:0 0 
100%;max-width:100%}.pull-sm-0{right:auto}.pull-sm-1{right:8.333333%}.pull-sm-2{right:16.666667%}.pull-sm-3{right:25%}.pull-sm-4{right:33.333333%}.pull-sm-5{right:41.666667%}.pull-sm-6{right:50%}.pull-sm-7{right:58.333333%}.pull-sm-8{right:66.666667%}.pull-sm-9{right:75%}.pull-sm-10{right:83.333333%}.pull-sm-11{right:91.666667%}.pull-sm-12{right:100%}.push-sm-0{left:auto}.push-sm-1{left:8.333333%}.push-sm-2{left:16.666667%}.push-sm-3{left:25%}.push-sm-4{left:33.333333%}.push-sm-5{left:41.666667%}.push-sm-6{left:50%}.push-sm-7{left:58.333333%}.push-sm-8{left:66.666667%}.push-sm-9{left:75%}.push-sm-10{left:83.333333%}.push-sm-11{left:91.666667%}.push-sm-12{left:100%}.offset-sm-0{margin-left:0}.offset-sm-1{margin-left:8.333333%}.offset-sm-2{margin-left:16.666667%}.offset-sm-3{margin-left:25%}.offset-sm-4{margin-left:33.333333%}.offset-sm-5{margin-left:41.666667%}.offset-sm-6{margin-left:50%}.offset-sm-7{margin-left:58.333333%}.offset-sm-8{margin-left:66.666667%}.offset-sm-9{margin-left:75%}.offset-sm-10{margin-left:83.333333%}.offset-sm-11{margin-left:91.666667%}}@media (min-width:768px){.col-md{-webkit-flex-basis:0;-ms-flex-preferred-size:0;flex-basis:0;-webkit-box-flex:1;-webkit-flex-grow:1;-ms-flex-positive:1;flex-grow:1;max-width:100%}.col-md-auto{-webkit-box-flex:0;-webkit-flex:0 0 auto;-ms-flex:0 0 auto;flex:0 0 auto;width:auto}.col-md-1{-webkit-box-flex:0;-webkit-flex:0 0 8.333333%;-ms-flex:0 0 8.333333%;flex:0 0 8.333333%;max-width:8.333333%}.col-md-2{-webkit-box-flex:0;-webkit-flex:0 0 16.666667%;-ms-flex:0 0 16.666667%;flex:0 0 16.666667%;max-width:16.666667%}.col-md-3{-webkit-box-flex:0;-webkit-flex:0 0 25%;-ms-flex:0 0 25%;flex:0 0 25%;max-width:25%}.col-md-4{-webkit-box-flex:0;-webkit-flex:0 0 33.333333%;-ms-flex:0 0 33.333333%;flex:0 0 33.333333%;max-width:33.333333%}.col-md-5{-webkit-box-flex:0;-webkit-flex:0 0 41.666667%;-ms-flex:0 0 41.666667%;flex:0 0 41.666667%;max-width:41.666667%}.col-md-6{-webkit-box-flex:0;-webkit-flex:0 0 50%;-ms-flex:0 0 
50%;flex:0 0 50%;max-width:50%}.col-md-7{-webkit-box-flex:0;-webkit-flex:0 0 58.333333%;-ms-flex:0 0 58.333333%;flex:0 0 58.333333%;max-width:58.333333%}.col-md-8{-webkit-box-flex:0;-webkit-flex:0 0 66.666667%;-ms-flex:0 0 66.666667%;flex:0 0 66.666667%;max-width:66.666667%}.col-md-9{-webkit-box-flex:0;-webkit-flex:0 0 75%;-ms-flex:0 0 75%;flex:0 0 75%;max-width:75%}.col-md-10{-webkit-box-flex:0;-webkit-flex:0 0 83.333333%;-ms-flex:0 0 83.333333%;flex:0 0 83.333333%;max-width:83.333333%}.col-md-11{-webkit-box-flex:0;-webkit-flex:0 0 91.666667%;-ms-flex:0 0 91.666667%;flex:0 0 91.666667%;max-width:91.666667%}.col-md-12{-webkit-box-flex:0;-webkit-flex:0 0 100%;-ms-flex:0 0 100%;flex:0 0 100%;max-width:100%}.pull-md-0{right:auto}.pull-md-1{right:8.333333%}.pull-md-2{right:16.666667%}.pull-md-3{right:25%}.pull-md-4{right:33.333333%}.pull-md-5{right:41.666667%}.pull-md-6{right:50%}.pull-md-7{right:58.333333%}.pull-md-8{right:66.666667%}.pull-md-9{right:75%}.pull-md-10{right:83.333333%}.pull-md-11{right:91.666667%}.pull-md-12{right:100%}.push-md-0{left:auto}.push-md-1{left:8.333333%}.push-md-2{left:16.666667%}.push-md-3{left:25%}.push-md-4{left:33.333333%}.push-md-5{left:41.666667%}.push-md-6{left:50%}.push-md-7{left:58.333333%}.push-md-8{left:66.666667%}.push-md-9{left:75%}.push-md-10{left:83.333333%}.push-md-11{left:91.666667%}.push-md-12{left:100%}.offset-md-0{margin-left:0}.offset-md-1{margin-left:8.333333%}.offset-md-2{margin-left:16.666667%}.offset-md-3{margin-left:25%}.offset-md-4{margin-left:33.333333%}.offset-md-5{margin-left:41.666667%}.offset-md-6{margin-left:50%}.offset-md-7{margin-left:58.333333%}.offset-md-8{margin-left:66.666667%}.offset-md-9{margin-left:75%}.offset-md-10{margin-left:83.333333%}.offset-md-11{margin-left:91.666667%}}@media (min-width:992px){.col-lg{-webkit-flex-basis:0;-ms-flex-preferred-size:0;flex-basis:0;-webkit-box-flex:1;-webkit-flex-grow:1;-ms-flex-positive:1;flex-grow:1;max-width:100%}.col-lg-auto{-webkit-box-flex:0;-webkit-flex:0 0 
auto;-ms-flex:0 0 auto;flex:0 0 auto;width:auto}.col-lg-1{-webkit-box-flex:0;-webkit-flex:0 0 8.333333%;-ms-flex:0 0 8.333333%;flex:0 0 8.333333%;max-width:8.333333%}.col-lg-2{-webkit-box-flex:0;-webkit-flex:0 0 16.666667%;-ms-flex:0 0 16.666667%;flex:0 0 16.666667%;max-width:16.666667%}.col-lg-3{-webkit-box-flex:0;-webkit-flex:0 0 25%;-ms-flex:0 0 25%;flex:0 0 25%;max-width:25%}.col-lg-4{-webkit-box-flex:0;-webkit-flex:0 0 33.333333%;-ms-flex:0 0 33.333333%;flex:0 0 33.333333%;max-width:33.333333%}.col-lg-5{-webkit-box-flex:0;-webkit-flex:0 0 41.666667%;-ms-flex:0 0 41.666667%;flex:0 0 41.666667%;max-width:41.666667%}.col-lg-6{-webkit-box-flex:0;-webkit-flex:0 0 50%;-ms-flex:0 0 50%;flex:0 0 50%;max-width:50%}.col-lg-7{-webkit-box-flex:0;-webkit-flex:0 0 58.333333%;-ms-flex:0 0 58.333333%;flex:0 0 58.333333%;max-width:58.333333%}.col-lg-8{-webkit-box-flex:0;-webkit-flex:0 0 66.666667%;-ms-flex:0 0 66.666667%;flex:0 0 66.666667%;max-width:66.666667%}.col-lg-9{-webkit-box-flex:0;-webkit-flex:0 0 75%;-ms-flex:0 0 75%;flex:0 0 75%;max-width:75%}.col-lg-10{-webkit-box-flex:0;-webkit-flex:0 0 83.333333%;-ms-flex:0 0 83.333333%;flex:0 0 83.333333%;max-width:83.333333%}.col-lg-11{-webkit-box-flex:0;-webkit-flex:0 0 91.666667%;-ms-flex:0 0 91.666667%;flex:0 0 91.666667%;max-width:91.666667%}.col-lg-12{-webkit-box-flex:0;-webkit-flex:0 0 100%;-ms-flex:0 0 100%;flex:0 0 
100%;max-width:100%}.pull-lg-0{right:auto}.pull-lg-1{right:8.333333%}.pull-lg-2{right:16.666667%}.pull-lg-3{right:25%}.pull-lg-4{right:33.333333%}.pull-lg-5{right:41.666667%}.pull-lg-6{right:50%}.pull-lg-7{right:58.333333%}.pull-lg-8{right:66.666667%}.pull-lg-9{right:75%}.pull-lg-10{right:83.333333%}.pull-lg-11{right:91.666667%}.pull-lg-12{right:100%}.push-lg-0{left:auto}.push-lg-1{left:8.333333%}.push-lg-2{left:16.666667%}.push-lg-3{left:25%}.push-lg-4{left:33.333333%}.push-lg-5{left:41.666667%}.push-lg-6{left:50%}.push-lg-7{left:58.333333%}.push-lg-8{left:66.666667%}.push-lg-9{left:75%}.push-lg-10{left:83.333333%}.push-lg-11{left:91.666667%}.push-lg-12{left:100%}.offset-lg-0{margin-left:0}.offset-lg-1{margin-left:8.333333%}.offset-lg-2{margin-left:16.666667%}.offset-lg-3{margin-left:25%}.offset-lg-4{margin-left:33.333333%}.offset-lg-5{margin-left:41.666667%}.offset-lg-6{margin-left:50%}.offset-lg-7{margin-left:58.333333%}.offset-lg-8{margin-left:66.666667%}.offset-lg-9{margin-left:75%}.offset-lg-10{margin-left:83.333333%}.offset-lg-11{margin-left:91.666667%}}@media (min-width:1200px){.col-xl{-webkit-flex-basis:0;-ms-flex-preferred-size:0;flex-basis:0;-webkit-box-flex:1;-webkit-flex-grow:1;-ms-flex-positive:1;flex-grow:1;max-width:100%}.col-xl-auto{-webkit-box-flex:0;-webkit-flex:0 0 auto;-ms-flex:0 0 auto;flex:0 0 auto;width:auto}.col-xl-1{-webkit-box-flex:0;-webkit-flex:0 0 8.333333%;-ms-flex:0 0 8.333333%;flex:0 0 8.333333%;max-width:8.333333%}.col-xl-2{-webkit-box-flex:0;-webkit-flex:0 0 16.666667%;-ms-flex:0 0 16.666667%;flex:0 0 16.666667%;max-width:16.666667%}.col-xl-3{-webkit-box-flex:0;-webkit-flex:0 0 25%;-ms-flex:0 0 25%;flex:0 0 25%;max-width:25%}.col-xl-4{-webkit-box-flex:0;-webkit-flex:0 0 33.333333%;-ms-flex:0 0 33.333333%;flex:0 0 33.333333%;max-width:33.333333%}.col-xl-5{-webkit-box-flex:0;-webkit-flex:0 0 41.666667%;-ms-flex:0 0 41.666667%;flex:0 0 41.666667%;max-width:41.666667%}.col-xl-6{-webkit-box-flex:0;-webkit-flex:0 0 50%;-ms-flex:0 0 
50%;flex:0 0 50%;max-width:50%}.col-xl-7{-webkit-box-flex:0;-webkit-flex:0 0 58.333333%;-ms-flex:0 0 58.333333%;flex:0 0 58.333333%;max-width:58.333333%}.col-xl-8{-webkit-box-flex:0;-webkit-flex:0 0 66.666667%;-ms-flex:0 0 66.666667%;flex:0 0 66.666667%;max-width:66.666667%}.col-xl-9{-webkit-box-flex:0;-webkit-flex:0 0 75%;-ms-flex:0 0 75%;flex:0 0 75%;max-width:75%}.col-xl-10{-webkit-box-flex:0;-webkit-flex:0 0 83.333333%;-ms-flex:0 0 83.333333%;flex:0 0 83.333333%;max-width:83.333333%}.col-xl-11{-webkit-box-flex:0;-webkit-flex:0 0 91.666667%;-ms-flex:0 0 91.666667%;flex:0 0 91.666667%;max-width:91.666667%}.col-xl-12{-webkit-box-flex:0;-webkit-flex:0 0 100%;-ms-flex:0 0 100%;flex:0 0 100%;max-width:100%}.pull-xl-0{right:auto}.pull-xl-1{right:8.333333%}.pull-xl-2{right:16.666667%}.pull-xl-3{right:25%}.pull-xl-4{right:33.333333%}.pull-xl-5{right:41.666667%}.pull-xl-6{right:50%}.pull-xl-7{right:58.333333%}.pull-xl-8{right:66.666667%}.pull-xl-9{right:75%}.pull-xl-10{right:83.333333%}.pull-xl-11{right:91.666667%}.pull-xl-12{right:100%}.push-xl-0{left:auto}.push-xl-1{left:8.333333%}.push-xl-2{left:16.666667%}.push-xl-3{left:25%}.push-xl-4{left:33.333333%}.push-xl-5{left:41.666667%}.push-xl-6{left:50%}.push-xl-7{left:58.333333%}.push-xl-8{left:66.666667%}.push-xl-9{left:75%}.push-xl-10{left:83.333333%}.push-xl-11{left:91.666667%}.push-xl-12{left:100%}.offset-xl-0{margin-left:0}.offset-xl-1{margin-left:8.333333%}.offset-xl-2{margin-left:16.666667%}.offset-xl-3{margin-left:25%}.offset-xl-4{margin-left:33.333333%}.offset-xl-5{margin-left:41.666667%}.offset-xl-6{margin-left:50%}.offset-xl-7{margin-left:58.333333%}.offset-xl-8{margin-left:66.666667%}.offset-xl-9{margin-left:75%}.offset-xl-10{margin-left:83.333333%}.offset-xl-11{margin-left:91.666667%}}.table{width:100%;max-width:100%;margin-bottom:1rem}.table td,.table th{padding:.75rem;vertical-align:top;border-top:1px solid #eceeef}.table thead th{vertical-align:bottom;border-bottom:2px solid #eceeef}.table 
tbody+tbody{border-top:2px solid #eceeef}.table .table{background-color:#fff}.table-sm td,.table-sm th{padding:.3rem}.table-bordered{border:1px solid #eceeef}.table-bordered td,.table-bordered th{border:1px solid #eceeef}.table-bordered thead td,.table-bordered thead th{border-bottom-width:2px}.table-striped tbody tr:nth-of-type(odd){background-color:rgba(0,0,0,.05)}.table-hover tbody tr:hover{background-color:rgba(0,0,0,.075)}.table-active,.table-active>td,.table-active>th{background-color:rgba(0,0,0,.075)}.table-hover .table-active:hover{background-color:rgba(0,0,0,.075)}.table-hover .table-active:hover>td,.table-hover .table-active:hover>th{background-color:rgba(0,0,0,.075)}.table-success,.table-success>td,.table-success>th{background-color:#dff0d8}.table-hover .table-success:hover{background-color:#d0e9c6}.table-hover .table-success:hover>td,.table-hover .table-success:hover>th{background-color:#d0e9c6}.table-info,.table-info>td,.table-info>th{background-color:#d9edf7}.table-hover .table-info:hover{background-color:#c4e3f3}.table-hover .table-info:hover>td,.table-hover .table-info:hover>th{background-color:#c4e3f3}.table-warning,.table-warning>td,.table-warning>th{background-color:#fcf8e3}.table-hover .table-warning:hover{background-color:#faf2cc}.table-hover .table-warning:hover>td,.table-hover .table-warning:hover>th{background-color:#faf2cc}.table-danger,.table-danger>td,.table-danger>th{background-color:#f2dede}.table-hover .table-danger:hover{background-color:#ebcccc}.table-hover .table-danger:hover>td,.table-hover .table-danger:hover>th{background-color:#ebcccc}.thead-inverse th{color:#fff;background-color:#292b2c}.thead-default th{color:#464a4c;background-color:#eceeef}.table-inverse{color:#fff;background-color:#292b2c}.table-inverse td,.table-inverse th,.table-inverse thead 
th{border-color:#fff}.table-inverse.table-bordered{border:0}.table-responsive{display:block;width:100%;overflow-x:auto;-ms-overflow-style:-ms-autohiding-scrollbar}.table-responsive.table-bordered{border:0}.form-control{display:block;width:100%;padding:.5rem .75rem;font-size:1rem;line-height:1.25;color:#464a4c;background-color:#fff;background-image:none;-webkit-background-clip:padding-box;background-clip:padding-box;border:1px solid rgba(0,0,0,.15);border-radius:.25rem;-webkit-transition:border-color ease-in-out .15s,-webkit-box-shadow ease-in-out .15s;transition:border-color ease-in-out .15s,-webkit-box-shadow ease-in-out .15s;-o-transition:border-color ease-in-out .15s,box-shadow ease-in-out .15s;transition:border-color ease-in-out .15s,box-shadow ease-in-out .15s;transition:border-color ease-in-out .15s,box-shadow ease-in-out .15s,-webkit-box-shadow ease-in-out .15s}.form-control::-ms-expand{background-color:transparent;border:0}.form-control:focus{color:#464a4c;background-color:#fff;border-color:#5cb3fd;outline:0}.form-control::-webkit-input-placeholder{color:#636c72;opacity:1}.form-control::-moz-placeholder{color:#636c72;opacity:1}.form-control:-ms-input-placeholder{color:#636c72;opacity:1}.form-control::placeholder{color:#636c72;opacity:1}.form-control:disabled,.form-control[readonly]{background-color:#eceeef;opacity:1}.form-control:disabled{cursor:not-allowed}select.form-control:not([size]):not([multiple]){height:calc(2.25rem + 2px)}select.form-control:focus::-ms-value{color:#464a4c;background-color:#fff}.form-control-file,.form-control-range{display:block}.col-form-label{padding-top:calc(.5rem - 1px * 2);padding-bottom:calc(.5rem - 1px * 2);margin-bottom:0}.col-form-label-lg{padding-top:calc(.75rem - 1px * 2);padding-bottom:calc(.75rem - 1px * 2);font-size:1.25rem}.col-form-label-sm{padding-top:calc(.25rem - 1px * 2);padding-bottom:calc(.25rem - 1px * 
2);font-size:.875rem}.col-form-legend{padding-top:.5rem;padding-bottom:.5rem;margin-bottom:0;font-size:1rem}.form-control-static{padding-top:.5rem;padding-bottom:.5rem;margin-bottom:0;line-height:1.25;border:solid transparent;border-width:1px 0}.form-control-static.form-control-lg,.form-control-static.form-control-sm,.input-group-lg>.form-control-static.form-control,.input-group-lg>.form-control-static.input-group-addon,.input-group-lg>.input-group-btn>.form-control-static.btn,.input-group-sm>.form-control-static.form-control,.input-group-sm>.form-control-static.input-group-addon,.input-group-sm>.input-group-btn>.form-control-static.btn{padding-right:0;padding-left:0}.form-control-sm,.input-group-sm>.form-control,.input-group-sm>.input-group-addon,.input-group-sm>.input-group-btn>.btn{padding:.25rem .5rem;font-size:.875rem;border-radius:.2rem}.input-group-sm>.input-group-btn>select.btn:not([size]):not([multiple]),.input-group-sm>select.form-control:not([size]):not([multiple]),.input-group-sm>select.input-group-addon:not([size]):not([multiple]),select.form-control-sm:not([size]):not([multiple]){height:1.8125rem}.form-control-lg,.input-group-lg>.form-control,.input-group-lg>.input-group-addon,.input-group-lg>.input-group-btn>.btn{padding:.75rem 1.5rem;font-size:1.25rem;border-radius:.3rem}.input-group-lg>.input-group-btn>select.btn:not([size]):not([multiple]),.input-group-lg>select.form-control:not([size]):not([multiple]),.input-group-lg>select.input-group-addon:not([size]):not([multiple]),select.form-control-lg:not([size]):not([multiple]){height:3.166667rem}.form-group{margin-bottom:1rem}.form-text{display:block;margin-top:.25rem}.form-check{position:relative;display:block;margin-bottom:.5rem}.form-check.disabled 
.form-check-label{color:#636c72;cursor:not-allowed}.form-check-label{padding-left:1.25rem;margin-bottom:0;cursor:pointer}.form-check-input{position:absolute;margin-top:.25rem;margin-left:-1.25rem}.form-check-input:only-child{position:static}.form-check-inline{display:inline-block}.form-check-inline .form-check-label{vertical-align:middle}.form-check-inline+.form-check-inline{margin-left:.75rem}.form-control-feedback{margin-top:.25rem}.form-control-danger,.form-control-success,.form-control-warning{padding-right:2.25rem;background-repeat:no-repeat;background-position:center right .5625rem;-webkit-background-size:1.125rem 1.125rem;background-size:1.125rem 1.125rem}.has-success .col-form-label,.has-success .custom-control,.has-success .form-check-label,.has-success .form-control-feedback,.has-success .form-control-label{color:#5cb85c}.has-success .form-control{border-color:#5cb85c}.has-success .input-group-addon{color:#5cb85c;border-color:#5cb85c;background-color:#eaf6ea}.has-success .form-control-success{background-image:url("data:image/svg+xml;charset=utf8,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 8 8'%3E%3Cpath fill='%235cb85c' d='M2.3 6.73L.6 4.53c-.4-1.04.46-1.4 1.1-.8l1.1 1.4 3.4-3.8c.6-.63 1.6-.27 1.2.7l-4 4.6c-.43.5-.8.4-1.1.1z'/%3E%3C/svg%3E")}.has-warning .col-form-label,.has-warning .custom-control,.has-warning .form-check-label,.has-warning .form-control-feedback,.has-warning .form-control-label{color:#f0ad4e}.has-warning .form-control{border-color:#f0ad4e}.has-warning .input-group-addon{color:#f0ad4e;border-color:#f0ad4e;background-color:#fff}.has-warning .form-control-warning{background-image:url("data:image/svg+xml;charset=utf8,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 8 8'%3E%3Cpath fill='%23f0ad4e' d='M4.4 5.324h-.8v-2.46h.8zm0 1.42h-.8V5.89h.8zM3.76.63L.04 7.075c-.115.2.016.425.26.426h7.397c.242 0 .372-.226.258-.426C6.726 4.924 5.47 2.79 4.253.63c-.113-.174-.39-.174-.494 0z'/%3E%3C/svg%3E")}.has-danger 
.col-form-label,.has-danger .custom-control,.has-danger .form-check-label,.has-danger .form-control-feedback,.has-danger .form-control-label{color:#d9534f}.has-danger .form-control{border-color:#d9534f}.has-danger .input-group-addon{color:#d9534f;border-color:#d9534f;background-color:#fdf7f7}.has-danger .form-control-danger{background-image:url("data:image/svg+xml;charset=utf8,%3Csvg xmlns='http://www.w3.org/2000/svg' fill='%23d9534f' viewBox='-2 -2 7 7'%3E%3Cpath stroke='%23d9534f' d='M0 0l3 3m0-3L0 3'/%3E%3Ccircle r='.5'/%3E%3Ccircle cx='3' r='.5'/%3E%3Ccircle cy='3' r='.5'/%3E%3Ccircle cx='3' cy='3' r='.5'/%3E%3C/svg%3E")}.form-inline{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-flow:row wrap;-ms-flex-flow:row wrap;flex-flow:row wrap;-webkit-box-align:center;-webkit-align-items:center;-ms-flex-align:center;align-items:center}.form-inline .form-check{width:100%}@media (min-width:576px){.form-inline label{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-box-align:center;-webkit-align-items:center;-ms-flex-align:center;align-items:center;-webkit-box-pack:center;-webkit-justify-content:center;-ms-flex-pack:center;justify-content:center;margin-bottom:0}.form-inline .form-group{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-box-flex:0;-webkit-flex:0 0 auto;-ms-flex:0 0 auto;flex:0 0 auto;-webkit-flex-flow:row wrap;-ms-flex-flow:row wrap;flex-flow:row wrap;-webkit-box-align:center;-webkit-align-items:center;-ms-flex-align:center;align-items:center;margin-bottom:0}.form-inline .form-control{display:inline-block;width:auto;vertical-align:middle}.form-inline .form-control-static{display:inline-block}.form-inline .input-group{width:auto}.form-inline .form-control-label{margin-bottom:0;vertical-align:middle}.form-inline 
.form-check{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-box-align:center;-webkit-align-items:center;-ms-flex-align:center;align-items:center;-webkit-box-pack:center;-webkit-justify-content:center;-ms-flex-pack:center;justify-content:center;width:auto;margin-top:0;margin-bottom:0}.form-inline .form-check-label{padding-left:0}.form-inline .form-check-input{position:relative;margin-top:0;margin-right:.25rem;margin-left:0}.form-inline .custom-control{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-box-align:center;-webkit-align-items:center;-ms-flex-align:center;align-items:center;-webkit-box-pack:center;-webkit-justify-content:center;-ms-flex-pack:center;justify-content:center;padding-left:0}.form-inline .custom-control-indicator{position:static;display:inline-block;margin-right:.25rem;vertical-align:text-bottom}.form-inline .has-feedback .form-control-feedback{top:0}}.btn{display:inline-block;font-weight:400;line-height:1.25;text-align:center;white-space:nowrap;vertical-align:middle;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none;border:1px solid transparent;padding:.5rem 1rem;font-size:1rem;border-radius:.25rem;-webkit-transition:all .2s ease-in-out;-o-transition:all .2s ease-in-out;transition:all .2s ease-in-out}.btn:focus,.btn:hover{text-decoration:none}.btn.focus,.btn:focus{outline:0;-webkit-box-shadow:0 0 0 2px rgba(2,117,216,.25);box-shadow:0 0 0 2px rgba(2,117,216,.25)}.btn.disabled,.btn:disabled{cursor:not-allowed;opacity:.65}.btn.active,.btn:active{background-image:none}a.btn.disabled,fieldset[disabled] a.btn{pointer-events:none}.btn-primary{color:#fff;background-color:#0275d8;border-color:#0275d8}.btn-primary:hover{color:#fff;background-color:#025aa5;border-color:#01549b}.btn-primary.focus,.btn-primary:focus{-webkit-box-shadow:0 0 0 2px rgba(2,117,216,.5);box-shadow:0 0 0 2px 
rgba(2,117,216,.5)}.btn-primary.disabled,.btn-primary:disabled{background-color:#0275d8;border-color:#0275d8}.btn-primary.active,.btn-primary:active,.show>.btn-primary.dropdown-toggle{color:#fff;background-color:#025aa5;background-image:none;border-color:#01549b}.btn-secondary{color:#292b2c;background-color:#fff;border-color:#ccc}.btn-secondary:hover{color:#292b2c;background-color:#e6e6e6;border-color:#adadad}.btn-secondary.focus,.btn-secondary:focus{-webkit-box-shadow:0 0 0 2px rgba(204,204,204,.5);box-shadow:0 0 0 2px rgba(204,204,204,.5)}.btn-secondary.disabled,.btn-secondary:disabled{background-color:#fff;border-color:#ccc}.btn-secondary.active,.btn-secondary:active,.show>.btn-secondary.dropdown-toggle{color:#292b2c;background-color:#e6e6e6;background-image:none;border-color:#adadad}.btn-info{color:#fff;background-color:#5bc0de;border-color:#5bc0de}.btn-info:hover{color:#fff;background-color:#31b0d5;border-color:#2aabd2}.btn-info.focus,.btn-info:focus{-webkit-box-shadow:0 0 0 2px rgba(91,192,222,.5);box-shadow:0 0 0 2px rgba(91,192,222,.5)}.btn-info.disabled,.btn-info:disabled{background-color:#5bc0de;border-color:#5bc0de}.btn-info.active,.btn-info:active,.show>.btn-info.dropdown-toggle{color:#fff;background-color:#31b0d5;background-image:none;border-color:#2aabd2}.btn-success{color:#fff;background-color:#5cb85c;border-color:#5cb85c}.btn-success:hover{color:#fff;background-color:#449d44;border-color:#419641}.btn-success.focus,.btn-success:focus{-webkit-box-shadow:0 0 0 2px rgba(92,184,92,.5);box-shadow:0 0 0 2px 
rgba(92,184,92,.5)}.btn-success.disabled,.btn-success:disabled{background-color:#5cb85c;border-color:#5cb85c}.btn-success.active,.btn-success:active,.show>.btn-success.dropdown-toggle{color:#fff;background-color:#449d44;background-image:none;border-color:#419641}.btn-warning{color:#fff;background-color:#f0ad4e;border-color:#f0ad4e}.btn-warning:hover{color:#fff;background-color:#ec971f;border-color:#eb9316}.btn-warning.focus,.btn-warning:focus{-webkit-box-shadow:0 0 0 2px rgba(240,173,78,.5);box-shadow:0 0 0 2px rgba(240,173,78,.5)}.btn-warning.disabled,.btn-warning:disabled{background-color:#f0ad4e;border-color:#f0ad4e}.btn-warning.active,.btn-warning:active,.show>.btn-warning.dropdown-toggle{color:#fff;background-color:#ec971f;background-image:none;border-color:#eb9316}.btn-danger{color:#fff;background-color:#d9534f;border-color:#d9534f}.btn-danger:hover{color:#fff;background-color:#c9302c;border-color:#c12e2a}.btn-danger.focus,.btn-danger:focus{-webkit-box-shadow:0 0 0 2px rgba(217,83,79,.5);box-shadow:0 0 0 2px rgba(217,83,79,.5)}.btn-danger.disabled,.btn-danger:disabled{background-color:#d9534f;border-color:#d9534f}.btn-danger.active,.btn-danger:active,.show>.btn-danger.dropdown-toggle{color:#fff;background-color:#c9302c;background-image:none;border-color:#c12e2a}.btn-outline-primary{color:#0275d8;background-image:none;background-color:transparent;border-color:#0275d8}.btn-outline-primary:hover{color:#fff;background-color:#0275d8;border-color:#0275d8}.btn-outline-primary.focus,.btn-outline-primary:focus{-webkit-box-shadow:0 0 0 2px rgba(2,117,216,.5);box-shadow:0 0 0 2px 
rgba(2,117,216,.5)}.btn-outline-primary.disabled,.btn-outline-primary:disabled{color:#0275d8;background-color:transparent}.btn-outline-primary.active,.btn-outline-primary:active,.show>.btn-outline-primary.dropdown-toggle{color:#fff;background-color:#0275d8;border-color:#0275d8}.btn-outline-secondary{color:#ccc;background-image:none;background-color:transparent;border-color:#ccc}.btn-outline-secondary:hover{color:#fff;background-color:#ccc;border-color:#ccc}.btn-outline-secondary.focus,.btn-outline-secondary:focus{-webkit-box-shadow:0 0 0 2px rgba(204,204,204,.5);box-shadow:0 0 0 2px rgba(204,204,204,.5)}.btn-outline-secondary.disabled,.btn-outline-secondary:disabled{color:#ccc;background-color:transparent}.btn-outline-secondary.active,.btn-outline-secondary:active,.show>.btn-outline-secondary.dropdown-toggle{color:#fff;background-color:#ccc;border-color:#ccc}.btn-outline-info{color:#5bc0de;background-image:none;background-color:transparent;border-color:#5bc0de}.btn-outline-info:hover{color:#fff;background-color:#5bc0de;border-color:#5bc0de}.btn-outline-info.focus,.btn-outline-info:focus{-webkit-box-shadow:0 0 0 2px rgba(91,192,222,.5);box-shadow:0 0 0 2px rgba(91,192,222,.5)}.btn-outline-info.disabled,.btn-outline-info:disabled{color:#5bc0de;background-color:transparent}.btn-outline-info.active,.btn-outline-info:active,.show>.btn-outline-info.dropdown-toggle{color:#fff;background-color:#5bc0de;border-color:#5bc0de}.btn-outline-success{color:#5cb85c;background-image:none;background-color:transparent;border-color:#5cb85c}.btn-outline-success:hover{color:#fff;background-color:#5cb85c;border-color:#5cb85c}.btn-outline-success.focus,.btn-outline-success:focus{-webkit-box-shadow:0 0 0 2px rgba(92,184,92,.5);box-shadow:0 0 0 2px 
rgba(92,184,92,.5)}.btn-outline-success.disabled,.btn-outline-success:disabled{color:#5cb85c;background-color:transparent}.btn-outline-success.active,.btn-outline-success:active,.show>.btn-outline-success.dropdown-toggle{color:#fff;background-color:#5cb85c;border-color:#5cb85c}.btn-outline-warning{color:#f0ad4e;background-image:none;background-color:transparent;border-color:#f0ad4e}.btn-outline-warning:hover{color:#fff;background-color:#f0ad4e;border-color:#f0ad4e}.btn-outline-warning.focus,.btn-outline-warning:focus{-webkit-box-shadow:0 0 0 2px rgba(240,173,78,.5);box-shadow:0 0 0 2px rgba(240,173,78,.5)}.btn-outline-warning.disabled,.btn-outline-warning:disabled{color:#f0ad4e;background-color:transparent}.btn-outline-warning.active,.btn-outline-warning:active,.show>.btn-outline-warning.dropdown-toggle{color:#fff;background-color:#f0ad4e;border-color:#f0ad4e}.btn-outline-danger{color:#d9534f;background-image:none;background-color:transparent;border-color:#d9534f}.btn-outline-danger:hover{color:#fff;background-color:#d9534f;border-color:#d9534f}.btn-outline-danger.focus,.btn-outline-danger:focus{-webkit-box-shadow:0 0 0 2px rgba(217,83,79,.5);box-shadow:0 0 0 2px rgba(217,83,79,.5)}.btn-outline-danger.disabled,.btn-outline-danger:disabled{color:#d9534f;background-color:transparent}.btn-outline-danger.active,.btn-outline-danger:active,.show>.btn-outline-danger.dropdown-toggle{color:#fff;background-color:#d9534f;border-color:#d9534f}.btn-link{font-weight:400;color:#0275d8;border-radius:0}.btn-link,.btn-link.active,.btn-link:active,.btn-link:disabled{background-color:transparent}.btn-link,.btn-link:active,.btn-link:focus{border-color:transparent}.btn-link:hover{border-color:transparent}.btn-link:focus,.btn-link:hover{color:#014c8c;text-decoration:underline;background-color:transparent}.btn-link:disabled{color:#636c72}.btn-link:disabled:focus,.btn-link:disabled:hover{text-decoration:none}.btn-group-lg>.btn,.btn-lg{padding:.75rem 
1.5rem;font-size:1.25rem;border-radius:.3rem}.btn-group-sm>.btn,.btn-sm{padding:.25rem .5rem;font-size:.875rem;border-radius:.2rem}.btn-block{display:block;width:100%}.btn-block+.btn-block{margin-top:.5rem}input[type=button].btn-block,input[type=reset].btn-block,input[type=submit].btn-block{width:100%}.fade{opacity:0;-webkit-transition:opacity .15s linear;-o-transition:opacity .15s linear;transition:opacity .15s linear}.fade.show{opacity:1}.collapse{display:none}.collapse.show{display:block}tr.collapse.show{display:table-row}tbody.collapse.show{display:table-row-group}.collapsing{position:relative;height:0;overflow:hidden;-webkit-transition:height .35s ease;-o-transition:height .35s ease;transition:height .35s ease}.dropdown,.dropup{position:relative}.dropdown-toggle::after{display:inline-block;width:0;height:0;margin-left:.3em;vertical-align:middle;content:"";border-top:.3em solid;border-right:.3em solid transparent;border-left:.3em solid transparent}.dropdown-toggle:focus{outline:0}.dropup .dropdown-toggle::after{border-top:0;border-bottom:.3em solid}.dropdown-menu{position:absolute;top:100%;left:0;z-index:1000;display:none;float:left;min-width:10rem;padding:.5rem 0;margin:.125rem 0 0;font-size:1rem;color:#292b2c;text-align:left;list-style:none;background-color:#fff;-webkit-background-clip:padding-box;background-clip:padding-box;border:1px solid rgba(0,0,0,.15);border-radius:.25rem}.dropdown-divider{height:1px;margin:.5rem 0;overflow:hidden;background-color:#eceeef}.dropdown-item{display:block;width:100%;padding:3px 1.5rem;clear:both;font-weight:400;color:#292b2c;text-align:inherit;white-space:nowrap;background:0 
0;border:0}.dropdown-item:focus,.dropdown-item:hover{color:#1d1e1f;text-decoration:none;background-color:#f7f7f9}.dropdown-item.active,.dropdown-item:active{color:#fff;text-decoration:none;background-color:#0275d8}.dropdown-item.disabled,.dropdown-item:disabled{color:#636c72;cursor:not-allowed;background-color:transparent}.show>.dropdown-menu{display:block}.show>a{outline:0}.dropdown-menu-right{right:0;left:auto}.dropdown-menu-left{right:auto;left:0}.dropdown-header{display:block;padding:.5rem 1.5rem;margin-bottom:0;font-size:.875rem;color:#636c72;white-space:nowrap}.dropdown-backdrop{position:fixed;top:0;right:0;bottom:0;left:0;z-index:990}.dropup .dropdown-menu{top:auto;bottom:100%;margin-bottom:.125rem}.btn-group,.btn-group-vertical{position:relative;display:-webkit-inline-box;display:-webkit-inline-flex;display:-ms-inline-flexbox;display:inline-flex;vertical-align:middle}.btn-group-vertical>.btn,.btn-group>.btn{position:relative;-webkit-box-flex:0;-webkit-flex:0 1 auto;-ms-flex:0 1 auto;flex:0 1 auto}.btn-group-vertical>.btn:hover,.btn-group>.btn:hover{z-index:2}.btn-group-vertical>.btn.active,.btn-group-vertical>.btn:active,.btn-group-vertical>.btn:focus,.btn-group>.btn.active,.btn-group>.btn:active,.btn-group>.btn:focus{z-index:2}.btn-group .btn+.btn,.btn-group .btn+.btn-group,.btn-group .btn-group+.btn,.btn-group .btn-group+.btn-group,.btn-group-vertical .btn+.btn,.btn-group-vertical .btn+.btn-group,.btn-group-vertical .btn-group+.btn,.btn-group-vertical .btn-group+.btn-group{margin-left:-1px}.btn-toolbar{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-box-pack:start;-webkit-justify-content:flex-start;-ms-flex-pack:start;justify-content:flex-start}.btn-toolbar 
.input-group{width:auto}.btn-group>.btn:not(:first-child):not(:last-child):not(.dropdown-toggle){border-radius:0}.btn-group>.btn:first-child{margin-left:0}.btn-group>.btn:first-child:not(:last-child):not(.dropdown-toggle){border-bottom-right-radius:0;border-top-right-radius:0}.btn-group>.btn:last-child:not(:first-child),.btn-group>.dropdown-toggle:not(:first-child){border-bottom-left-radius:0;border-top-left-radius:0}.btn-group>.btn-group{float:left}.btn-group>.btn-group:not(:first-child):not(:last-child)>.btn{border-radius:0}.btn-group>.btn-group:first-child:not(:last-child)>.btn:last-child,.btn-group>.btn-group:first-child:not(:last-child)>.dropdown-toggle{border-bottom-right-radius:0;border-top-right-radius:0}.btn-group>.btn-group:last-child:not(:first-child)>.btn:first-child{border-bottom-left-radius:0;border-top-left-radius:0}.btn-group .dropdown-toggle:active,.btn-group.open .dropdown-toggle{outline:0}.btn+.dropdown-toggle-split{padding-right:.75rem;padding-left:.75rem}.btn+.dropdown-toggle-split::after{margin-left:0}.btn-group-sm>.btn+.dropdown-toggle-split,.btn-sm+.dropdown-toggle-split{padding-right:.375rem;padding-left:.375rem}.btn-group-lg>.btn+.dropdown-toggle-split,.btn-lg+.dropdown-toggle-split{padding-right:1.125rem;padding-left:1.125rem}.btn-group-vertical{display:-webkit-inline-box;display:-webkit-inline-flex;display:-ms-inline-flexbox;display:inline-flex;-webkit-box-orient:vertical;-webkit-box-direction:normal;-webkit-flex-direction:column;-ms-flex-direction:column;flex-direction:column;-webkit-box-align:start;-webkit-align-items:flex-start;-ms-flex-align:start;align-items:flex-start;-webkit-box-pack:center;-webkit-justify-content:center;-ms-flex-pack:center;justify-content:center}.btn-group-vertical .btn,.btn-group-vertical 
.btn-group{width:100%}.btn-group-vertical>.btn+.btn,.btn-group-vertical>.btn+.btn-group,.btn-group-vertical>.btn-group+.btn,.btn-group-vertical>.btn-group+.btn-group{margin-top:-1px;margin-left:0}.btn-group-vertical>.btn:not(:first-child):not(:last-child){border-radius:0}.btn-group-vertical>.btn:first-child:not(:last-child){border-bottom-right-radius:0;border-bottom-left-radius:0}.btn-group-vertical>.btn:last-child:not(:first-child){border-top-right-radius:0;border-top-left-radius:0}.btn-group-vertical>.btn-group:not(:first-child):not(:last-child)>.btn{border-radius:0}.btn-group-vertical>.btn-group:first-child:not(:last-child)>.btn:last-child,.btn-group-vertical>.btn-group:first-child:not(:last-child)>.dropdown-toggle{border-bottom-right-radius:0;border-bottom-left-radius:0}.btn-group-vertical>.btn-group:last-child:not(:first-child)>.btn:first-child{border-top-right-radius:0;border-top-left-radius:0}[data-toggle=buttons]>.btn input[type=checkbox],[data-toggle=buttons]>.btn input[type=radio],[data-toggle=buttons]>.btn-group>.btn input[type=checkbox],[data-toggle=buttons]>.btn-group>.btn input[type=radio]{position:absolute;clip:rect(0,0,0,0);pointer-events:none}.input-group{position:relative;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;width:100%}.input-group .form-control{position:relative;z-index:2;-webkit-box-flex:1;-webkit-flex:1 1 auto;-ms-flex:1 1 auto;flex:1 1 auto;width:1%;margin-bottom:0}.input-group .form-control:active,.input-group .form-control:focus,.input-group .form-control:hover{z-index:3}.input-group .form-control,.input-group-addon,.input-group-btn{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-box-orient:vertical;-webkit-box-direction:normal;-webkit-flex-direction:column;-ms-flex-direction:column;flex-direction:column;-webkit-box-pack:center;-webkit-justify-content:center;-ms-flex-pack:center;justify-content:center}.input-group 
.form-control:not(:first-child):not(:last-child),.input-group-addon:not(:first-child):not(:last-child),.input-group-btn:not(:first-child):not(:last-child){border-radius:0}.input-group-addon,.input-group-btn{white-space:nowrap;vertical-align:middle}.input-group-addon{padding:.5rem .75rem;margin-bottom:0;font-size:1rem;font-weight:400;line-height:1.25;color:#464a4c;text-align:center;background-color:#eceeef;border:1px solid rgba(0,0,0,.15);border-radius:.25rem}.input-group-addon.form-control-sm,.input-group-sm>.input-group-addon,.input-group-sm>.input-group-btn>.input-group-addon.btn{padding:.25rem .5rem;font-size:.875rem;border-radius:.2rem}.input-group-addon.form-control-lg,.input-group-lg>.input-group-addon,.input-group-lg>.input-group-btn>.input-group-addon.btn{padding:.75rem 1.5rem;font-size:1.25rem;border-radius:.3rem}.input-group-addon input[type=checkbox],.input-group-addon input[type=radio]{margin-top:0}.input-group .form-control:not(:last-child),.input-group-addon:not(:last-child),.input-group-btn:not(:first-child)>.btn-group:not(:last-child)>.btn,.input-group-btn:not(:first-child)>.btn:not(:last-child):not(.dropdown-toggle),.input-group-btn:not(:last-child)>.btn,.input-group-btn:not(:last-child)>.btn-group>.btn,.input-group-btn:not(:last-child)>.dropdown-toggle{border-bottom-right-radius:0;border-top-right-radius:0}.input-group-addon:not(:last-child){border-right:0}.input-group 
.form-control:not(:first-child),.input-group-addon:not(:first-child),.input-group-btn:not(:first-child)>.btn,.input-group-btn:not(:first-child)>.btn-group>.btn,.input-group-btn:not(:first-child)>.dropdown-toggle,.input-group-btn:not(:last-child)>.btn-group:not(:first-child)>.btn,.input-group-btn:not(:last-child)>.btn:not(:first-child){border-bottom-left-radius:0;border-top-left-radius:0}.form-control+.input-group-addon:not(:first-child){border-left:0}.input-group-btn{position:relative;font-size:0;white-space:nowrap}.input-group-btn>.btn{position:relative;-webkit-box-flex:1;-webkit-flex:1 1 0%;-ms-flex:1 1 0%;flex:1 1 0%}.input-group-btn>.btn+.btn{margin-left:-1px}.input-group-btn>.btn:active,.input-group-btn>.btn:focus,.input-group-btn>.btn:hover{z-index:3}.input-group-btn:not(:last-child)>.btn,.input-group-btn:not(:last-child)>.btn-group{margin-right:-1px}.input-group-btn:not(:first-child)>.btn,.input-group-btn:not(:first-child)>.btn-group{z-index:2;margin-left:-1px}.input-group-btn:not(:first-child)>.btn-group:active,.input-group-btn:not(:first-child)>.btn-group:focus,.input-group-btn:not(:first-child)>.btn-group:hover,.input-group-btn:not(:first-child)>.btn:active,.input-group-btn:not(:first-child)>.btn:focus,.input-group-btn:not(:first-child)>.btn:hover{z-index:3}.custom-control{position:relative;display:-webkit-inline-box;display:-webkit-inline-flex;display:-ms-inline-flexbox;display:inline-flex;min-height:1.5rem;padding-left:1.5rem;margin-right:1rem;cursor:pointer}.custom-control-input{position:absolute;z-index:-1;opacity:0}.custom-control-input:checked~.custom-control-indicator{color:#fff;background-color:#0275d8}.custom-control-input:focus~.custom-control-indicator{-webkit-box-shadow:0 0 0 1px #fff,0 0 0 3px #0275d8;box-shadow:0 0 0 1px #fff,0 0 0 3px 
#0275d8}.custom-control-input:active~.custom-control-indicator{color:#fff;background-color:#8fcafe}.custom-control-input:disabled~.custom-control-indicator{cursor:not-allowed;background-color:#eceeef}.custom-control-input:disabled~.custom-control-description{color:#636c72;cursor:not-allowed}.custom-control-indicator{position:absolute;top:.25rem;left:0;display:block;width:1rem;height:1rem;pointer-events:none;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none;background-color:#ddd;background-repeat:no-repeat;background-position:center center;-webkit-background-size:50% 50%;background-size:50% 50%}.custom-checkbox .custom-control-indicator{border-radius:.25rem}.custom-checkbox .custom-control-input:checked~.custom-control-indicator{background-image:url("data:image/svg+xml;charset=utf8,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 8 8'%3E%3Cpath fill='%23fff' d='M6.564.75l-3.59 3.612-1.538-1.55L0 4.26 2.974 7.25 8 2.193z'/%3E%3C/svg%3E")}.custom-checkbox .custom-control-input:indeterminate~.custom-control-indicator{background-color:#0275d8;background-image:url("data:image/svg+xml;charset=utf8,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 4 4'%3E%3Cpath stroke='%23fff' d='M0 2h4'/%3E%3C/svg%3E")}.custom-radio .custom-control-indicator{border-radius:50%}.custom-radio .custom-control-input:checked~.custom-control-indicator{background-image:url("data:image/svg+xml;charset=utf8,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='-4 -4 8 8'%3E%3Ccircle r='3' fill='%23fff'/%3E%3C/svg%3E")}.custom-controls-stacked{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-box-orient:vertical;-webkit-box-direction:normal;-webkit-flex-direction:column;-ms-flex-direction:column;flex-direction:column}.custom-controls-stacked .custom-control{margin-bottom:.25rem}.custom-controls-stacked .custom-control+.custom-control{margin-left:0}.custom-select{display:inline-block;max-width:100%;height:calc(2.25rem + 
2px);padding:.375rem 1.75rem .375rem .75rem;line-height:1.25;color:#464a4c;vertical-align:middle;background:#fff url("data:image/svg+xml;charset=utf8,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 4 5'%3E%3Cpath fill='%23333' d='M2 0L0 2h4zm0 5L0 3h4z'/%3E%3C/svg%3E") no-repeat right .75rem center;-webkit-background-size:8px 10px;background-size:8px 10px;border:1px solid rgba(0,0,0,.15);border-radius:.25rem;-moz-appearance:none;-webkit-appearance:none}.custom-select:focus{border-color:#5cb3fd;outline:0}.custom-select:focus::-ms-value{color:#464a4c;background-color:#fff}.custom-select:disabled{color:#636c72;cursor:not-allowed;background-color:#eceeef}.custom-select::-ms-expand{opacity:0}.custom-select-sm{padding-top:.375rem;padding-bottom:.375rem;font-size:75%}.custom-file{position:relative;display:inline-block;max-width:100%;height:2.5rem;margin-bottom:0;cursor:pointer}.custom-file-input{min-width:14rem;max-width:100%;height:2.5rem;margin:0;filter:alpha(opacity=0);opacity:0}.custom-file-control{position:absolute;top:0;right:0;left:0;z-index:5;height:2.5rem;padding:.5rem 1rem;line-height:1.5;color:#464a4c;pointer-events:none;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none;background-color:#fff;border:1px solid rgba(0,0,0,.15);border-radius:.25rem}.custom-file-control:lang(en)::after{content:"Choose file..."}.custom-file-control::before{position:absolute;top:-1px;right:-1px;bottom:-1px;z-index:6;display:block;height:2.5rem;padding:.5rem 1rem;line-height:1.5;color:#464a4c;background-color:#eceeef;border:1px solid rgba(0,0,0,.15);border-radius:0 .25rem .25rem 0}.custom-file-control:lang(en)::before{content:"Browse"}.nav{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;padding-left:0;margin-bottom:0;list-style:none}.nav-link{display:block;padding:.5em 1em}.nav-link:focus,.nav-link:hover{text-decoration:none}.nav-link.disabled{color:#636c72;cursor:not-allowed}.nav-tabs{border-bottom:1px solid 
#ddd}.nav-tabs .nav-item{margin-bottom:-1px}.nav-tabs .nav-link{border:1px solid transparent;border-top-right-radius:.25rem;border-top-left-radius:.25rem}.nav-tabs .nav-link:focus,.nav-tabs .nav-link:hover{border-color:#eceeef #eceeef #ddd}.nav-tabs .nav-link.disabled{color:#636c72;background-color:transparent;border-color:transparent}.nav-tabs .nav-item.show .nav-link,.nav-tabs .nav-link.active{color:#464a4c;background-color:#fff;border-color:#ddd #ddd #fff}.nav-tabs .dropdown-menu{margin-top:-1px;border-top-right-radius:0;border-top-left-radius:0}.nav-pills .nav-link{border-radius:.25rem}.nav-pills .nav-item.show .nav-link,.nav-pills .nav-link.active{color:#fff;cursor:default;background-color:#0275d8}.nav-fill .nav-item{-webkit-box-flex:1;-webkit-flex:1 1 auto;-ms-flex:1 1 auto;flex:1 1 auto;text-align:center}.nav-justified .nav-item{-webkit-box-flex:1;-webkit-flex:1 1 100%;-ms-flex:1 1 100%;flex:1 1 100%;text-align:center}.tab-content>.tab-pane{display:none}.tab-content>.active{display:block}.navbar{position:relative;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-box-orient:vertical;-webkit-box-direction:normal;-webkit-flex-direction:column;-ms-flex-direction:column;flex-direction:column;padding:.5rem 1rem}.navbar-brand{display:inline-block;padding-top:.25rem;padding-bottom:.25rem;margin-right:1rem;font-size:1.25rem;line-height:inherit;white-space:nowrap}.navbar-brand:focus,.navbar-brand:hover{text-decoration:none}.navbar-nav{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-box-orient:vertical;-webkit-box-direction:normal;-webkit-flex-direction:column;-ms-flex-direction:column;flex-direction:column;padding-left:0;margin-bottom:0;list-style:none}.navbar-nav .nav-link{padding-right:0;padding-left:0}.navbar-text{display:inline-block;padding-top:.425rem;padding-bottom:.425rem}.navbar-toggler{-webkit-align-self:flex-start;-ms-flex-item-align:start;align-self:flex-start;padding:.25rem 
.75rem;font-size:1.25rem;line-height:1;background:0 0;border:1px solid transparent;border-radius:.25rem}.navbar-toggler:focus,.navbar-toggler:hover{text-decoration:none}.navbar-toggler-icon{display:inline-block;width:1.5em;height:1.5em;vertical-align:middle;content:"";background:no-repeat center center;-webkit-background-size:100% 100%;background-size:100% 100%}.navbar-toggler-left{position:absolute;left:1rem}.navbar-toggler-right{position:absolute;right:1rem}@media (max-width:575px){.navbar-toggleable .navbar-nav .dropdown-menu{position:static;float:none}.navbar-toggleable>.container{padding-right:0;padding-left:0}}@media (min-width:576px){.navbar-toggleable{-webkit-box-orient:horizontal;-webkit-box-direction:normal;-webkit-flex-direction:row;-ms-flex-direction:row;flex-direction:row;-webkit-flex-wrap:nowrap;-ms-flex-wrap:nowrap;flex-wrap:nowrap;-webkit-box-align:center;-webkit-align-items:center;-ms-flex-align:center;align-items:center}.navbar-toggleable .navbar-nav{-webkit-box-orient:horizontal;-webkit-box-direction:normal;-webkit-flex-direction:row;-ms-flex-direction:row;flex-direction:row}.navbar-toggleable .navbar-nav .nav-link{padding-right:.5rem;padding-left:.5rem}.navbar-toggleable>.container{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-wrap:nowrap;-ms-flex-wrap:nowrap;flex-wrap:nowrap;-webkit-box-align:center;-webkit-align-items:center;-ms-flex-align:center;align-items:center}.navbar-toggleable .navbar-collapse{display:-webkit-box!important;display:-webkit-flex!important;display:-ms-flexbox!important;display:flex!important;width:100%}.navbar-toggleable .navbar-toggler{display:none}}@media (max-width:767px){.navbar-toggleable-sm .navbar-nav .dropdown-menu{position:static;float:none}.navbar-toggleable-sm>.container{padding-right:0;padding-left:0}}@media 
(min-width:768px){.navbar-toggleable-sm{-webkit-box-orient:horizontal;-webkit-box-direction:normal;-webkit-flex-direction:row;-ms-flex-direction:row;flex-direction:row;-webkit-flex-wrap:nowrap;-ms-flex-wrap:nowrap;flex-wrap:nowrap;-webkit-box-align:center;-webkit-align-items:center;-ms-flex-align:center;align-items:center}.navbar-toggleable-sm .navbar-nav{-webkit-box-orient:horizontal;-webkit-box-direction:normal;-webkit-flex-direction:row;-ms-flex-direction:row;flex-direction:row}.navbar-toggleable-sm .navbar-nav .nav-link{padding-right:.5rem;padding-left:.5rem}.navbar-toggleable-sm>.container{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-wrap:nowrap;-ms-flex-wrap:nowrap;flex-wrap:nowrap;-webkit-box-align:center;-webkit-align-items:center;-ms-flex-align:center;align-items:center}.navbar-toggleable-sm .navbar-collapse{display:-webkit-box!important;display:-webkit-flex!important;display:-ms-flexbox!important;display:flex!important;width:100%}.navbar-toggleable-sm .navbar-toggler{display:none}}@media (max-width:991px){.navbar-toggleable-md .navbar-nav .dropdown-menu{position:static;float:none}.navbar-toggleable-md>.container{padding-right:0;padding-left:0}}@media (min-width:992px){.navbar-toggleable-md{-webkit-box-orient:horizontal;-webkit-box-direction:normal;-webkit-flex-direction:row;-ms-flex-direction:row;flex-direction:row;-webkit-flex-wrap:nowrap;-ms-flex-wrap:nowrap;flex-wrap:nowrap;-webkit-box-align:center;-webkit-align-items:center;-ms-flex-align:center;align-items:center}.navbar-toggleable-md .navbar-nav{-webkit-box-orient:horizontal;-webkit-box-direction:normal;-webkit-flex-direction:row;-ms-flex-direction:row;flex-direction:row}.navbar-toggleable-md .navbar-nav 
.nav-link{padding-right:.5rem;padding-left:.5rem}.navbar-toggleable-md>.container{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-wrap:nowrap;-ms-flex-wrap:nowrap;flex-wrap:nowrap;-webkit-box-align:center;-webkit-align-items:center;-ms-flex-align:center;align-items:center}.navbar-toggleable-md .navbar-collapse{display:-webkit-box!important;display:-webkit-flex!important;display:-ms-flexbox!important;display:flex!important;width:100%}.navbar-toggleable-md .navbar-toggler{display:none}}@media (max-width:1199px){.navbar-toggleable-lg .navbar-nav .dropdown-menu{position:static;float:none}.navbar-toggleable-lg>.container{padding-right:0;padding-left:0}}@media (min-width:1200px){.navbar-toggleable-lg{-webkit-box-orient:horizontal;-webkit-box-direction:normal;-webkit-flex-direction:row;-ms-flex-direction:row;flex-direction:row;-webkit-flex-wrap:nowrap;-ms-flex-wrap:nowrap;flex-wrap:nowrap;-webkit-box-align:center;-webkit-align-items:center;-ms-flex-align:center;align-items:center}.navbar-toggleable-lg .navbar-nav{-webkit-box-orient:horizontal;-webkit-box-direction:normal;-webkit-flex-direction:row;-ms-flex-direction:row;flex-direction:row}.navbar-toggleable-lg .navbar-nav .nav-link{padding-right:.5rem;padding-left:.5rem}.navbar-toggleable-lg>.container{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-wrap:nowrap;-ms-flex-wrap:nowrap;flex-wrap:nowrap;-webkit-box-align:center;-webkit-align-items:center;-ms-flex-align:center;align-items:center}.navbar-toggleable-lg .navbar-collapse{display:-webkit-box!important;display:-webkit-flex!important;display:-ms-flexbox!important;display:flex!important;width:100%}.navbar-toggleable-lg 
.navbar-toggler{display:none}}.navbar-toggleable-xl{-webkit-box-orient:horizontal;-webkit-box-direction:normal;-webkit-flex-direction:row;-ms-flex-direction:row;flex-direction:row;-webkit-flex-wrap:nowrap;-ms-flex-wrap:nowrap;flex-wrap:nowrap;-webkit-box-align:center;-webkit-align-items:center;-ms-flex-align:center;align-items:center}.navbar-toggleable-xl .navbar-nav .dropdown-menu{position:static;float:none}.navbar-toggleable-xl>.container{padding-right:0;padding-left:0}.navbar-toggleable-xl .navbar-nav{-webkit-box-orient:horizontal;-webkit-box-direction:normal;-webkit-flex-direction:row;-ms-flex-direction:row;flex-direction:row}.navbar-toggleable-xl .navbar-nav .nav-link{padding-right:.5rem;padding-left:.5rem}.navbar-toggleable-xl>.container{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-wrap:nowrap;-ms-flex-wrap:nowrap;flex-wrap:nowrap;-webkit-box-align:center;-webkit-align-items:center;-ms-flex-align:center;align-items:center}.navbar-toggleable-xl .navbar-collapse{display:-webkit-box!important;display:-webkit-flex!important;display:-ms-flexbox!important;display:flex!important;width:100%}.navbar-toggleable-xl .navbar-toggler{display:none}.navbar-light .navbar-brand,.navbar-light .navbar-toggler{color:rgba(0,0,0,.9)}.navbar-light .navbar-brand:focus,.navbar-light .navbar-brand:hover,.navbar-light .navbar-toggler:focus,.navbar-light .navbar-toggler:hover{color:rgba(0,0,0,.9)}.navbar-light .navbar-nav .nav-link{color:rgba(0,0,0,.5)}.navbar-light .navbar-nav .nav-link:focus,.navbar-light .navbar-nav .nav-link:hover{color:rgba(0,0,0,.7)}.navbar-light .navbar-nav .nav-link.disabled{color:rgba(0,0,0,.3)}.navbar-light .navbar-nav .active>.nav-link,.navbar-light .navbar-nav .nav-link.active,.navbar-light .navbar-nav .nav-link.open,.navbar-light .navbar-nav .open>.nav-link{color:rgba(0,0,0,.9)}.navbar-light .navbar-toggler{border-color:rgba(0,0,0,.1)}.navbar-light 
.navbar-toggler-icon{background-image:url("data:image/svg+xml;charset=utf8,%3Csvg viewBox='0 0 32 32' xmlns='http://www.w3.org/2000/svg'%3E%3Cpath stroke='rgba(0, 0, 0, 0.5)' stroke-width='2' stroke-linecap='round' stroke-miterlimit='10' d='M4 8h24M4 16h24M4 24h24'/%3E%3C/svg%3E")}.navbar-light .navbar-text{color:rgba(0,0,0,.5)}.navbar-inverse .navbar-brand,.navbar-inverse .navbar-toggler{color:#fff}.navbar-inverse .navbar-brand:focus,.navbar-inverse .navbar-brand:hover,.navbar-inverse .navbar-toggler:focus,.navbar-inverse .navbar-toggler:hover{color:#fff}.navbar-inverse .navbar-nav .nav-link{color:rgba(255,255,255,.5)}.navbar-inverse .navbar-nav .nav-link:focus,.navbar-inverse .navbar-nav .nav-link:hover{color:rgba(255,255,255,.75)}.navbar-inverse .navbar-nav .nav-link.disabled{color:rgba(255,255,255,.25)}.navbar-inverse .navbar-nav .active>.nav-link,.navbar-inverse .navbar-nav .nav-link.active,.navbar-inverse .navbar-nav .nav-link.open,.navbar-inverse .navbar-nav .open>.nav-link{color:#fff}.navbar-inverse .navbar-toggler{border-color:rgba(255,255,255,.1)}.navbar-inverse .navbar-toggler-icon{background-image:url("data:image/svg+xml;charset=utf8,%3Csvg viewBox='0 0 32 32' xmlns='http://www.w3.org/2000/svg'%3E%3Cpath stroke='rgba(255, 255, 255, 0.5)' stroke-width='2' stroke-linecap='round' stroke-miterlimit='10' d='M4 8h24M4 16h24M4 24h24'/%3E%3C/svg%3E")}.navbar-inverse .navbar-text{color:rgba(255,255,255,.5)}.card{position:relative;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-box-orient:vertical;-webkit-box-direction:normal;-webkit-flex-direction:column;-ms-flex-direction:column;flex-direction:column;background-color:#fff;border:1px solid rgba(0,0,0,.125);border-radius:.25rem}.card-block{-webkit-box-flex:1;-webkit-flex:1 1 auto;-ms-flex:1 1 auto;flex:1 1 
auto;padding:1.25rem}.card-title{margin-bottom:.75rem}.card-subtitle{margin-top:-.375rem;margin-bottom:0}.card-text:last-child{margin-bottom:0}.card-link:hover{text-decoration:none}.card-link+.card-link{margin-left:1.25rem}.card>.list-group:first-child .list-group-item:first-child{border-top-right-radius:.25rem;border-top-left-radius:.25rem}.card>.list-group:last-child .list-group-item:last-child{border-bottom-right-radius:.25rem;border-bottom-left-radius:.25rem}.card-header{padding:.75rem 1.25rem;margin-bottom:0;background-color:#f7f7f9;border-bottom:1px solid rgba(0,0,0,.125)}.card-header:first-child{border-radius:calc(.25rem - 1px) calc(.25rem - 1px) 0 0}.card-footer{padding:.75rem 1.25rem;background-color:#f7f7f9;border-top:1px solid rgba(0,0,0,.125)}.card-footer:last-child{border-radius:0 0 calc(.25rem - 1px) calc(.25rem - 1px)}.card-header-tabs{margin-right:-.625rem;margin-bottom:-.75rem;margin-left:-.625rem;border-bottom:0}.card-header-pills{margin-right:-.625rem;margin-left:-.625rem}.card-primary{background-color:#0275d8;border-color:#0275d8}.card-primary .card-footer,.card-primary .card-header{background-color:transparent}.card-success{background-color:#5cb85c;border-color:#5cb85c}.card-success .card-footer,.card-success .card-header{background-color:transparent}.card-info{background-color:#5bc0de;border-color:#5bc0de}.card-info .card-footer,.card-info .card-header{background-color:transparent}.card-warning{background-color:#f0ad4e;border-color:#f0ad4e}.card-warning .card-footer,.card-warning .card-header{background-color:transparent}.card-danger{background-color:#d9534f;border-color:#d9534f}.card-danger .card-footer,.card-danger 
.card-header{background-color:transparent}.card-outline-primary{background-color:transparent;border-color:#0275d8}.card-outline-secondary{background-color:transparent;border-color:#ccc}.card-outline-info{background-color:transparent;border-color:#5bc0de}.card-outline-success{background-color:transparent;border-color:#5cb85c}.card-outline-warning{background-color:transparent;border-color:#f0ad4e}.card-outline-danger{background-color:transparent;border-color:#d9534f}.card-inverse{color:rgba(255,255,255,.65)}.card-inverse .card-footer,.card-inverse .card-header{background-color:transparent;border-color:rgba(255,255,255,.2)}.card-inverse .card-blockquote,.card-inverse .card-footer,.card-inverse .card-header,.card-inverse .card-title{color:#fff}.card-inverse .card-blockquote .blockquote-footer,.card-inverse .card-link,.card-inverse .card-subtitle,.card-inverse .card-text{color:rgba(255,255,255,.65)}.card-inverse .card-link:focus,.card-inverse .card-link:hover{color:#fff}.card-blockquote{padding:0;margin-bottom:0;border-left:0}.card-img{border-radius:calc(.25rem - 1px)}.card-img-overlay{position:absolute;top:0;right:0;bottom:0;left:0;padding:1.25rem}.card-img-top{border-top-right-radius:calc(.25rem - 1px);border-top-left-radius:calc(.25rem - 1px)}.card-img-bottom{border-bottom-right-radius:calc(.25rem - 1px);border-bottom-left-radius:calc(.25rem - 1px)}@media (min-width:576px){.card-deck{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-flow:row wrap;-ms-flex-flow:row wrap;flex-flow:row wrap}.card-deck .card{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-box-flex:1;-webkit-flex:1 0 0%;-ms-flex:1 0 0%;flex:1 0 0%;-webkit-box-orient:vertical;-webkit-box-direction:normal;-webkit-flex-direction:column;-ms-flex-direction:column;flex-direction:column}.card-deck .card:not(:first-child){margin-left:15px}.card-deck .card:not(:last-child){margin-right:15px}}@media 
(min-width:576px){.card-group{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-flow:row wrap;-ms-flex-flow:row wrap;flex-flow:row wrap}.card-group .card{-webkit-box-flex:1;-webkit-flex:1 0 0%;-ms-flex:1 0 0%;flex:1 0 0%}.card-group .card+.card{margin-left:0;border-left:0}.card-group .card:first-child{border-bottom-right-radius:0;border-top-right-radius:0}.card-group .card:first-child .card-img-top{border-top-right-radius:0}.card-group .card:first-child .card-img-bottom{border-bottom-right-radius:0}.card-group .card:last-child{border-bottom-left-radius:0;border-top-left-radius:0}.card-group .card:last-child .card-img-top{border-top-left-radius:0}.card-group .card:last-child .card-img-bottom{border-bottom-left-radius:0}.card-group .card:not(:first-child):not(:last-child){border-radius:0}.card-group .card:not(:first-child):not(:last-child) .card-img-bottom,.card-group .card:not(:first-child):not(:last-child) .card-img-top{border-radius:0}}@media (min-width:576px){.card-columns{-webkit-column-count:3;-moz-column-count:3;column-count:3;-webkit-column-gap:1.25rem;-moz-column-gap:1.25rem;column-gap:1.25rem}.card-columns .card{display:inline-block;width:100%;margin-bottom:.75rem}}.breadcrumb{padding:.75rem 1rem;margin-bottom:1rem;list-style:none;background-color:#eceeef;border-radius:.25rem}.breadcrumb::after{display:block;content:"";clear:both}.breadcrumb-item{float:left}.breadcrumb-item+.breadcrumb-item::before{display:inline-block;padding-right:.5rem;padding-left:.5rem;color:#636c72;content:"/"}.breadcrumb-item+.breadcrumb-item:hover::before{text-decoration:underline}.breadcrumb-item+.breadcrumb-item:hover::before{text-decoration:none}.breadcrumb-item.active{color:#636c72}.pagination{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;padding-left:0;list-style:none;border-radius:.25rem}.page-item:first-child 
.page-link{margin-left:0;border-bottom-left-radius:.25rem;border-top-left-radius:.25rem}.page-item:last-child .page-link{border-bottom-right-radius:.25rem;border-top-right-radius:.25rem}.page-item.active .page-link{z-index:2;color:#fff;background-color:#0275d8;border-color:#0275d8}.page-item.disabled .page-link{color:#636c72;pointer-events:none;cursor:not-allowed;background-color:#fff;border-color:#ddd}.page-link{position:relative;display:block;padding:.5rem .75rem;margin-left:-1px;line-height:1.25;color:#0275d8;background-color:#fff;border:1px solid #ddd}.page-link:focus,.page-link:hover{color:#014c8c;text-decoration:none;background-color:#eceeef;border-color:#ddd}.pagination-lg .page-link{padding:.75rem 1.5rem;font-size:1.25rem}.pagination-lg .page-item:first-child .page-link{border-bottom-left-radius:.3rem;border-top-left-radius:.3rem}.pagination-lg .page-item:last-child .page-link{border-bottom-right-radius:.3rem;border-top-right-radius:.3rem}.pagination-sm .page-link{padding:.25rem .5rem;font-size:.875rem}.pagination-sm .page-item:first-child .page-link{border-bottom-left-radius:.2rem;border-top-left-radius:.2rem}.pagination-sm .page-item:last-child .page-link{border-bottom-right-radius:.2rem;border-top-right-radius:.2rem}.badge{display:inline-block;padding:.25em .4em;font-size:75%;font-weight:700;line-height:1;color:#fff;text-align:center;white-space:nowrap;vertical-align:baseline;border-radius:.25rem}.badge:empty{display:none}.btn 
.badge{position:relative;top:-1px}a.badge:focus,a.badge:hover{color:#fff;text-decoration:none;cursor:pointer}.badge-pill{padding-right:.6em;padding-left:.6em;border-radius:10rem}.badge-default{background-color:#636c72}.badge-default[href]:focus,.badge-default[href]:hover{background-color:#4b5257}.badge-primary{background-color:#0275d8}.badge-primary[href]:focus,.badge-primary[href]:hover{background-color:#025aa5}.badge-success{background-color:#5cb85c}.badge-success[href]:focus,.badge-success[href]:hover{background-color:#449d44}.badge-info{background-color:#5bc0de}.badge-info[href]:focus,.badge-info[href]:hover{background-color:#31b0d5}.badge-warning{background-color:#f0ad4e}.badge-warning[href]:focus,.badge-warning[href]:hover{background-color:#ec971f}.badge-danger{background-color:#d9534f}.badge-danger[href]:focus,.badge-danger[href]:hover{background-color:#c9302c}.jumbotron{padding:2rem 1rem;margin-bottom:2rem;background-color:#eceeef;border-radius:.3rem}@media (min-width:576px){.jumbotron{padding:4rem 2rem}}.jumbotron-hr{border-top-color:#d0d5d8}.jumbotron-fluid{padding-right:0;padding-left:0;border-radius:0}.alert{padding:.75rem 1.25rem;margin-bottom:1rem;border:1px solid transparent;border-radius:.25rem}.alert-heading{color:inherit}.alert-link{font-weight:700}.alert-dismissible .close{position:relative;top:-.75rem;right:-1.25rem;padding:.75rem 1.25rem;color:inherit}.alert-success{background-color:#dff0d8;border-color:#d0e9c6;color:#3c763d}.alert-success hr{border-top-color:#c1e2b3}.alert-success .alert-link{color:#2b542c}.alert-info{background-color:#d9edf7;border-color:#bcdff1;color:#31708f}.alert-info hr{border-top-color:#a6d5ec}.alert-info .alert-link{color:#245269}.alert-warning{background-color:#fcf8e3;border-color:#faf2cc;color:#8a6d3b}.alert-warning hr{border-top-color:#f7ecb5}.alert-warning .alert-link{color:#66512c}.alert-danger{background-color:#f2dede;border-color:#ebcccc;color:#a94442}.alert-danger hr{border-top-color:#e4b9b9}.alert-danger 
.alert-link{color:#843534}@-webkit-keyframes progress-bar-stripes{from{background-position:1rem 0}to{background-position:0 0}}@-o-keyframes progress-bar-stripes{from{background-position:1rem 0}to{background-position:0 0}}@keyframes progress-bar-stripes{from{background-position:1rem 0}to{background-position:0 0}}.progress{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;overflow:hidden;font-size:.75rem;line-height:1rem;text-align:center;background-color:#eceeef;border-radius:.25rem}.progress-bar{height:1rem;color:#fff;background-color:#0275d8}.progress-bar-striped{background-image:-webkit-linear-gradient(45deg,rgba(255,255,255,.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,.15) 50%,rgba(255,255,255,.15) 75%,transparent 75%,transparent);background-image:-o-linear-gradient(45deg,rgba(255,255,255,.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,.15) 50%,rgba(255,255,255,.15) 75%,transparent 75%,transparent);background-image:linear-gradient(45deg,rgba(255,255,255,.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,.15) 50%,rgba(255,255,255,.15) 75%,transparent 75%,transparent);-webkit-background-size:1rem 1rem;background-size:1rem 1rem}.progress-bar-animated{-webkit-animation:progress-bar-stripes 1s linear infinite;-o-animation:progress-bar-stripes 1s linear infinite;animation:progress-bar-stripes 1s linear infinite}.media{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-box-align:start;-webkit-align-items:flex-start;-ms-flex-align:start;align-items:flex-start}.media-body{-webkit-box-flex:1;-webkit-flex:1 1 0%;-ms-flex:1 1 0%;flex:1 1 0%}.list-group{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-box-orient:vertical;-webkit-box-direction:normal;-webkit-flex-direction:column;-ms-flex-direction:column;flex-direction:column;padding-left:0;margin-bottom:0}.list-group-item-action{width:100%;color:#464a4c;text-align:inherit}.list-group-item-action 
.list-group-item-heading{color:#292b2c}.list-group-item-action:focus,.list-group-item-action:hover{color:#464a4c;text-decoration:none;background-color:#f7f7f9}.list-group-item-action:active{color:#292b2c;background-color:#eceeef}.list-group-item{position:relative;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-flex-flow:row wrap;-ms-flex-flow:row wrap;flex-flow:row wrap;-webkit-box-align:center;-webkit-align-items:center;-ms-flex-align:center;align-items:center;padding:.75rem 1.25rem;margin-bottom:-1px;background-color:#fff;border:1px solid rgba(0,0,0,.125)}.list-group-item:first-child{border-top-right-radius:.25rem;border-top-left-radius:.25rem}.list-group-item:last-child{margin-bottom:0;border-bottom-right-radius:.25rem;border-bottom-left-radius:.25rem}.list-group-item:focus,.list-group-item:hover{text-decoration:none}.list-group-item.disabled,.list-group-item:disabled{color:#636c72;cursor:not-allowed;background-color:#fff}.list-group-item.disabled .list-group-item-heading,.list-group-item:disabled .list-group-item-heading{color:inherit}.list-group-item.disabled .list-group-item-text,.list-group-item:disabled .list-group-item-text{color:#636c72}.list-group-item.active{z-index:2;color:#fff;background-color:#0275d8;border-color:#0275d8}.list-group-item.active .list-group-item-heading,.list-group-item.active .list-group-item-heading>.small,.list-group-item.active .list-group-item-heading>small{color:inherit}.list-group-item.active .list-group-item-text{color:#daeeff}.list-group-flush .list-group-item{border-right:0;border-left:0;border-radius:0}.list-group-flush:first-child .list-group-item:first-child{border-top:0}.list-group-flush:last-child .list-group-item:last-child{border-bottom:0}.list-group-item-success{color:#3c763d;background-color:#dff0d8}a.list-group-item-success,button.list-group-item-success{color:#3c763d}a.list-group-item-success .list-group-item-heading,button.list-group-item-success 
.list-group-item-heading{color:inherit}a.list-group-item-success:focus,a.list-group-item-success:hover,button.list-group-item-success:focus,button.list-group-item-success:hover{color:#3c763d;background-color:#d0e9c6}a.list-group-item-success.active,button.list-group-item-success.active{color:#fff;background-color:#3c763d;border-color:#3c763d}.list-group-item-info{color:#31708f;background-color:#d9edf7}a.list-group-item-info,button.list-group-item-info{color:#31708f}a.list-group-item-info .list-group-item-heading,button.list-group-item-info .list-group-item-heading{color:inherit}a.list-group-item-info:focus,a.list-group-item-info:hover,button.list-group-item-info:focus,button.list-group-item-info:hover{color:#31708f;background-color:#c4e3f3}a.list-group-item-info.active,button.list-group-item-info.active{color:#fff;background-color:#31708f;border-color:#31708f}.list-group-item-warning{color:#8a6d3b;background-color:#fcf8e3}a.list-group-item-warning,button.list-group-item-warning{color:#8a6d3b}a.list-group-item-warning .list-group-item-heading,button.list-group-item-warning .list-group-item-heading{color:inherit}a.list-group-item-warning:focus,a.list-group-item-warning:hover,button.list-group-item-warning:focus,button.list-group-item-warning:hover{color:#8a6d3b;background-color:#faf2cc}a.list-group-item-warning.active,button.list-group-item-warning.active{color:#fff;background-color:#8a6d3b;border-color:#8a6d3b}.list-group-item-danger{color:#a94442;background-color:#f2dede}a.list-group-item-danger,button.list-group-item-danger{color:#a94442}a.list-group-item-danger .list-group-item-heading,button.list-group-item-danger 
.list-group-item-heading{color:inherit}a.list-group-item-danger:focus,a.list-group-item-danger:hover,button.list-group-item-danger:focus,button.list-group-item-danger:hover{color:#a94442;background-color:#ebcccc}a.list-group-item-danger.active,button.list-group-item-danger.active{color:#fff;background-color:#a94442;border-color:#a94442}.embed-responsive{position:relative;display:block;width:100%;padding:0;overflow:hidden}.embed-responsive::before{display:block;content:""}.embed-responsive .embed-responsive-item,.embed-responsive embed,.embed-responsive iframe,.embed-responsive object,.embed-responsive video{position:absolute;top:0;bottom:0;left:0;width:100%;height:100%;border:0}.embed-responsive-21by9::before{padding-top:42.857143%}.embed-responsive-16by9::before{padding-top:56.25%}.embed-responsive-4by3::before{padding-top:75%}.embed-responsive-1by1::before{padding-top:100%}.close{float:right;font-size:1.5rem;font-weight:700;line-height:1;color:#000;text-shadow:0 1px 0 #fff;opacity:.5}.close:focus,.close:hover{color:#000;text-decoration:none;cursor:pointer;opacity:.75}button.close{padding:0;cursor:pointer;background:0 0;border:0;-webkit-appearance:none}.modal-open{overflow:hidden}.modal{position:fixed;top:0;right:0;bottom:0;left:0;z-index:1050;display:none;overflow:hidden;outline:0}.modal.fade .modal-dialog{-webkit-transition:-webkit-transform .3s ease-out;transition:-webkit-transform .3s ease-out;-o-transition:-o-transform .3s ease-out;transition:transform .3s ease-out;transition:transform .3s ease-out,-webkit-transform .3s ease-out,-o-transform .3s ease-out;-webkit-transform:translate(0,-25%);-o-transform:translate(0,-25%);transform:translate(0,-25%)}.modal.show .modal-dialog{-webkit-transform:translate(0,0);-o-transform:translate(0,0);transform:translate(0,0)}.modal-open 
.modal{overflow-x:hidden;overflow-y:auto}.modal-dialog{position:relative;width:auto;margin:10px}.modal-content{position:relative;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-box-orient:vertical;-webkit-box-direction:normal;-webkit-flex-direction:column;-ms-flex-direction:column;flex-direction:column;background-color:#fff;-webkit-background-clip:padding-box;background-clip:padding-box;border:1px solid rgba(0,0,0,.2);border-radius:.3rem;outline:0}.modal-backdrop{position:fixed;top:0;right:0;bottom:0;left:0;z-index:1040;background-color:#000}.modal-backdrop.fade{opacity:0}.modal-backdrop.show{opacity:.5}.modal-header{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-box-align:center;-webkit-align-items:center;-ms-flex-align:center;align-items:center;-webkit-box-pack:justify;-webkit-justify-content:space-between;-ms-flex-pack:justify;justify-content:space-between;padding:15px;border-bottom:1px solid #eceeef}.modal-title{margin-bottom:0;line-height:1.5}.modal-body{position:relative;-webkit-box-flex:1;-webkit-flex:1 1 auto;-ms-flex:1 1 auto;flex:1 1 auto;padding:15px}.modal-footer{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-box-align:center;-webkit-align-items:center;-ms-flex-align:center;align-items:center;-webkit-box-pack:end;-webkit-justify-content:flex-end;-ms-flex-pack:end;justify-content:flex-end;padding:15px;border-top:1px solid #eceeef}.modal-footer>:not(:first-child){margin-left:.25rem}.modal-footer>:not(:last-child){margin-right:.25rem}.modal-scrollbar-measure{position:absolute;top:-9999px;width:50px;height:50px;overflow:scroll}@media (min-width:576px){.modal-dialog{max-width:500px;margin:30px auto}.modal-sm{max-width:300px}}@media (min-width:992px){.modal-lg{max-width:800px}}.tooltip{position:absolute;z-index:1070;display:block;font-family:-apple-system,system-ui,BlinkMacSystemFont,"Segoe UI",Roboto,"Helvetica 
Neue",Arial,sans-serif;font-style:normal;font-weight:400;letter-spacing:normal;line-break:auto;line-height:1.5;text-align:left;text-align:start;text-decoration:none;text-shadow:none;text-transform:none;white-space:normal;word-break:normal;word-spacing:normal;font-size:.875rem;word-wrap:break-word;opacity:0}.tooltip.show{opacity:.9}.tooltip.bs-tether-element-attached-bottom,.tooltip.tooltip-top{padding:5px 0;margin-top:-3px}.tooltip.bs-tether-element-attached-bottom .tooltip-inner::before,.tooltip.tooltip-top .tooltip-inner::before{bottom:0;left:50%;margin-left:-5px;content:"";border-width:5px 5px 0;border-top-color:#000}.tooltip.bs-tether-element-attached-left,.tooltip.tooltip-right{padding:0 5px;margin-left:3px}.tooltip.bs-tether-element-attached-left .tooltip-inner::before,.tooltip.tooltip-right .tooltip-inner::before{top:50%;left:0;margin-top:-5px;content:"";border-width:5px 5px 5px 0;border-right-color:#000}.tooltip.bs-tether-element-attached-top,.tooltip.tooltip-bottom{padding:5px 0;margin-top:3px}.tooltip.bs-tether-element-attached-top .tooltip-inner::before,.tooltip.tooltip-bottom .tooltip-inner::before{top:0;left:50%;margin-left:-5px;content:"";border-width:0 5px 5px;border-bottom-color:#000}.tooltip.bs-tether-element-attached-right,.tooltip.tooltip-left{padding:0 5px;margin-left:-3px}.tooltip.bs-tether-element-attached-right .tooltip-inner::before,.tooltip.tooltip-left .tooltip-inner::before{top:50%;right:0;margin-top:-5px;content:"";border-width:5px 0 5px 5px;border-left-color:#000}.tooltip-inner{max-width:200px;padding:3px 8px;color:#fff;text-align:center;background-color:#000;border-radius:.25rem}.tooltip-inner::before{position:absolute;width:0;height:0;border-color:transparent;border-style:solid}.popover{position:absolute;top:0;left:0;z-index:1060;display:block;max-width:276px;padding:1px;font-family:-apple-system,system-ui,BlinkMacSystemFont,"Segoe UI",Roboto,"Helvetica 
Neue",Arial,sans-serif;font-style:normal;font-weight:400;letter-spacing:normal;line-break:auto;line-height:1.5;text-align:left;text-align:start;text-decoration:none;text-shadow:none;text-transform:none;white-space:normal;word-break:normal;word-spacing:normal;font-size:.875rem;word-wrap:break-word;background-color:#fff;-webkit-background-clip:padding-box;background-clip:padding-box;border:1px solid rgba(0,0,0,.2);border-radius:.3rem}.popover.bs-tether-element-attached-bottom,.popover.popover-top{margin-top:-10px}.popover.bs-tether-element-attached-bottom::after,.popover.bs-tether-element-attached-bottom::before,.popover.popover-top::after,.popover.popover-top::before{left:50%;border-bottom-width:0}.popover.bs-tether-element-attached-bottom::before,.popover.popover-top::before{bottom:-11px;margin-left:-11px;border-top-color:rgba(0,0,0,.25)}.popover.bs-tether-element-attached-bottom::after,.popover.popover-top::after{bottom:-10px;margin-left:-10px;border-top-color:#fff}.popover.bs-tether-element-attached-left,.popover.popover-right{margin-left:10px}.popover.bs-tether-element-attached-left::after,.popover.bs-tether-element-attached-left::before,.popover.popover-right::after,.popover.popover-right::before{top:50%;border-left-width:0}.popover.bs-tether-element-attached-left::before,.popover.popover-right::before{left:-11px;margin-top:-11px;border-right-color:rgba(0,0,0,.25)}.popover.bs-tether-element-attached-left::after,.popover.popover-right::after{left:-10px;margin-top:-10px;border-right-color:#fff}.popover.bs-tether-element-attached-top,.popover.popover-bottom{margin-top:10px}.popover.bs-tether-element-attached-top::after,.popover.bs-tether-element-attached-top::before,.popover.popover-bottom::after,.popover.popover-bottom::before{left:50%;border-top-width:0}.popover.bs-tether-element-attached-top::before,.popover.popover-bottom::before{top:-11px;margin-left:-11px;border-bottom-color:rgba(0,0,0,.25)}.popover.bs-tether-element-attached-top::after,.popover.popover-botto
m::after{top:-10px;margin-left:-10px;border-bottom-color:#f7f7f7}.popover.bs-tether-element-attached-top .popover-title::before,.popover.popover-bottom .popover-title::before{position:absolute;top:0;left:50%;display:block;width:20px;margin-left:-10px;content:"";border-bottom:1px solid #f7f7f7}.popover.bs-tether-element-attached-right,.popover.popover-left{margin-left:-10px}.popover.bs-tether-element-attached-right::after,.popover.bs-tether-element-attached-right::before,.popover.popover-left::after,.popover.popover-left::before{top:50%;border-right-width:0}.popover.bs-tether-element-attached-right::before,.popover.popover-left::before{right:-11px;margin-top:-11px;border-left-color:rgba(0,0,0,.25)}.popover.bs-tether-element-attached-right::after,.popover.popover-left::after{right:-10px;margin-top:-10px;border-left-color:#fff}.popover-title{padding:8px 14px;margin-bottom:0;font-size:1rem;background-color:#f7f7f7;border-bottom:1px solid #ebebeb;border-top-right-radius:calc(.3rem - 1px);border-top-left-radius:calc(.3rem - 1px)}.popover-title:empty{display:none}.popover-content{padding:9px 14px}.popover::after,.popover::before{position:absolute;display:block;width:0;height:0;border-color:transparent;border-style:solid}.popover::before{content:"";border-width:11px}.popover::after{content:"";border-width:10px}.carousel{position:relative}.carousel-inner{position:relative;width:100%;overflow:hidden}.carousel-item{position:relative;display:none;width:100%}@media (-webkit-transform-3d){.carousel-item{-webkit-transition:-webkit-transform .6s ease-in-out;transition:-webkit-transform .6s ease-in-out;-o-transition:-o-transform .6s ease-in-out;transition:transform .6s ease-in-out;transition:transform .6s ease-in-out,-webkit-transform .6s ease-in-out,-o-transform .6s ease-in-out;-webkit-backface-visibility:hidden;backface-visibility:hidden;-webkit-perspective:1000px;perspective:1000px}}@supports ((-webkit-transform:translate3d(0,0,0)) or 
(transform:translate3d(0,0,0))){.carousel-item{-webkit-transition:-webkit-transform .6s ease-in-out;transition:-webkit-transform .6s ease-in-out;-o-transition:-o-transform .6s ease-in-out;transition:transform .6s ease-in-out;transition:transform .6s ease-in-out,-webkit-transform .6s ease-in-out,-o-transform .6s ease-in-out;-webkit-backface-visibility:hidden;backface-visibility:hidden;-webkit-perspective:1000px;perspective:1000px}}.carousel-item-next,.carousel-item-prev,.carousel-item.active{display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex}.carousel-item-next,.carousel-item-prev{position:absolute;top:0}@media (-webkit-transform-3d){.carousel-item-next.carousel-item-left,.carousel-item-prev.carousel-item-right{-webkit-transform:translate3d(0,0,0);transform:translate3d(0,0,0)}.active.carousel-item-right,.carousel-item-next{-webkit-transform:translate3d(100%,0,0);transform:translate3d(100%,0,0)}.active.carousel-item-left,.carousel-item-prev{-webkit-transform:translate3d(-100%,0,0);transform:translate3d(-100%,0,0)}}@supports ((-webkit-transform:translate3d(0,0,0)) or 
(transform:translate3d(0,0,0))){.carousel-item-next.carousel-item-left,.carousel-item-prev.carousel-item-right{-webkit-transform:translate3d(0,0,0);transform:translate3d(0,0,0)}.active.carousel-item-right,.carousel-item-next{-webkit-transform:translate3d(100%,0,0);transform:translate3d(100%,0,0)}.active.carousel-item-left,.carousel-item-prev{-webkit-transform:translate3d(-100%,0,0);transform:translate3d(-100%,0,0)}}.carousel-control-next,.carousel-control-prev{position:absolute;top:0;bottom:0;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-box-align:center;-webkit-align-items:center;-ms-flex-align:center;align-items:center;-webkit-box-pack:center;-webkit-justify-content:center;-ms-flex-pack:center;justify-content:center;width:15%;color:#fff;text-align:center;opacity:.5}.carousel-control-next:focus,.carousel-control-next:hover,.carousel-control-prev:focus,.carousel-control-prev:hover{color:#fff;text-decoration:none;outline:0;opacity:.9}.carousel-control-prev{left:0}.carousel-control-next{right:0}.carousel-control-next-icon,.carousel-control-prev-icon{display:inline-block;width:20px;height:20px;background:transparent no-repeat center center;-webkit-background-size:100% 100%;background-size:100% 100%}.carousel-control-prev-icon{background-image:url("data:image/svg+xml;charset=utf8,%3Csvg xmlns='http://www.w3.org/2000/svg' fill='%23fff' viewBox='0 0 8 8'%3E%3Cpath d='M4 0l-4 4 4 4 1.5-1.5-2.5-2.5 2.5-2.5-1.5-1.5z'/%3E%3C/svg%3E")}.carousel-control-next-icon{background-image:url("data:image/svg+xml;charset=utf8,%3Csvg xmlns='http://www.w3.org/2000/svg' fill='%23fff' viewBox='0 0 8 8'%3E%3Cpath d='M1.5 0l-1.5 1.5 2.5 2.5-2.5 2.5 1.5 1.5 
4-4-4-4z'/%3E%3C/svg%3E")}.carousel-indicators{position:absolute;right:0;bottom:10px;left:0;z-index:15;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-box-pack:center;-webkit-justify-content:center;-ms-flex-pack:center;justify-content:center;padding-left:0;margin-right:15%;margin-left:15%;list-style:none}.carousel-indicators li{position:relative;-webkit-box-flex:1;-webkit-flex:1 0 auto;-ms-flex:1 0 auto;flex:1 0 auto;max-width:30px;height:3px;margin-right:3px;margin-left:3px;text-indent:-999px;cursor:pointer;background-color:rgba(255,255,255,.5)}.carousel-indicators li::before{position:absolute;top:-10px;left:0;display:inline-block;width:100%;height:10px;content:""}.carousel-indicators li::after{position:absolute;bottom:-10px;left:0;display:inline-block;width:100%;height:10px;content:""}.carousel-indicators .active{background-color:#fff}.carousel-caption{position:absolute;right:15%;bottom:20px;left:15%;z-index:10;padding-top:20px;padding-bottom:20px;color:#fff;text-align:center}.align-baseline{vertical-align:baseline!important}.align-top{vertical-align:top!important}.align-middle{vertical-align:middle!important}.align-bottom{vertical-align:bottom!important}.align-text-bottom{vertical-align:text-bottom!important}.align-text-top{vertical-align:text-top!important}.bg-faded{background-color:#f7f7f7}.bg-primary{background-color:#0275d8!important}a.bg-primary:focus,a.bg-primary:hover{background-color:#025aa5!important}.bg-success{background-color:#5cb85c!important}a.bg-success:focus,a.bg-success:hover{background-color:#449d44!important}.bg-info{background-color:#5bc0de!important}a.bg-info:focus,a.bg-info:hover{background-color:#31b0d5!important}.bg-warning{background-color:#f0ad4e!important}a.bg-warning:focus,a.bg-warning:hover{background-color:#ec971f!important}.bg-danger{background-color:#d9534f!important}a.bg-danger:focus,a.bg-danger:hover{background-color:#c9302c!important}.bg-inverse{background-color:#292b2c!important}a.bg-inverse:fo
cus,a.bg-inverse:hover{background-color:#101112!important}.border-0{border:0!important}.border-top-0{border-top:0!important}.border-right-0{border-right:0!important}.border-bottom-0{border-bottom:0!important}.border-left-0{border-left:0!important}.rounded{border-radius:.25rem}.rounded-top{border-top-right-radius:.25rem;border-top-left-radius:.25rem}.rounded-right{border-bottom-right-radius:.25rem;border-top-right-radius:.25rem}.rounded-bottom{border-bottom-right-radius:.25rem;border-bottom-left-radius:.25rem}.rounded-left{border-bottom-left-radius:.25rem;border-top-left-radius:.25rem}.rounded-circle{border-radius:50%}.rounded-0{border-radius:0}.clearfix::after{display:block;content:"";clear:both}.d-none{display:none!important}.d-inline{display:inline!important}.d-inline-block{display:inline-block!important}.d-block{display:block!important}.d-table{display:table!important}.d-table-cell{display:table-cell!important}.d-flex{display:-webkit-box!important;display:-webkit-flex!important;display:-ms-flexbox!important;display:flex!important}.d-inline-flex{display:-webkit-inline-box!important;display:-webkit-inline-flex!important;display:-ms-inline-flexbox!important;display:inline-flex!important}@media (min-width:576px){.d-sm-none{display:none!important}.d-sm-inline{display:inline!important}.d-sm-inline-block{display:inline-block!important}.d-sm-block{display:block!important}.d-sm-table{display:table!important}.d-sm-table-cell{display:table-cell!important}.d-sm-flex{display:-webkit-box!important;display:-webkit-flex!important;display:-ms-flexbox!important;display:flex!important}.d-sm-inline-flex{display:-webkit-inline-box!important;display:-webkit-inline-flex!important;display:-ms-inline-flexbox!important;display:inline-flex!important}}@media 
(min-width:768px){.d-md-none{display:none!important}.d-md-inline{display:inline!important}.d-md-inline-block{display:inline-block!important}.d-md-block{display:block!important}.d-md-table{display:table!important}.d-md-table-cell{display:table-cell!important}.d-md-flex{display:-webkit-box!important;display:-webkit-flex!important;display:-ms-flexbox!important;display:flex!important}.d-md-inline-flex{display:-webkit-inline-box!important;display:-webkit-inline-flex!important;display:-ms-inline-flexbox!important;display:inline-flex!important}}@media (min-width:992px){.d-lg-none{display:none!important}.d-lg-inline{display:inline!important}.d-lg-inline-block{display:inline-block!important}.d-lg-block{display:block!important}.d-lg-table{display:table!important}.d-lg-table-cell{display:table-cell!important}.d-lg-flex{display:-webkit-box!important;display:-webkit-flex!important;display:-ms-flexbox!important;display:flex!important}.d-lg-inline-flex{display:-webkit-inline-box!important;display:-webkit-inline-flex!important;display:-ms-inline-flexbox!important;display:inline-flex!important}}@media 
(min-width:1200px){.d-xl-none{display:none!important}.d-xl-inline{display:inline!important}.d-xl-inline-block{display:inline-block!important}.d-xl-block{display:block!important}.d-xl-table{display:table!important}.d-xl-table-cell{display:table-cell!important}.d-xl-flex{display:-webkit-box!important;display:-webkit-flex!important;display:-ms-flexbox!important;display:flex!important}.d-xl-inline-flex{display:-webkit-inline-box!important;display:-webkit-inline-flex!important;display:-ms-inline-flexbox!important;display:inline-flex!important}}.flex-first{-webkit-box-ordinal-group:0;-webkit-order:-1;-ms-flex-order:-1;order:-1}.flex-last{-webkit-box-ordinal-group:2;-webkit-order:1;-ms-flex-order:1;order:1}.flex-unordered{-webkit-box-ordinal-group:1;-webkit-order:0;-ms-flex-order:0;order:0}.flex-row{-webkit-box-orient:horizontal!important;-webkit-box-direction:normal!important;-webkit-flex-direction:row!important;-ms-flex-direction:row!important;flex-direction:row!important}.flex-column{-webkit-box-orient:vertical!important;-webkit-box-direction:normal!important;-webkit-flex-direction:column!important;-ms-flex-direction:column!important;flex-direction:column!important}.flex-row-reverse{-webkit-box-orient:horizontal!important;-webkit-box-direction:reverse!important;-webkit-flex-direction:row-reverse!important;-ms-flex-direction:row-reverse!important;flex-direction:row-reverse!important}.flex-column-reverse{-webkit-box-orient:vertical!important;-webkit-box-direction:reverse!important;-webkit-flex-direction:column-reverse!important;-ms-flex-direction:column-reverse!important;flex-direction:column-reverse!important}.flex-wrap{-webkit-flex-wrap:wrap!important;-ms-flex-wrap:wrap!important;flex-wrap:wrap!important}.flex-nowrap{-webkit-flex-wrap:nowrap!important;-ms-flex-wrap:nowrap!important;flex-wrap:nowrap!important}.flex-wrap-reverse{-webkit-flex-wrap:wrap-reverse!important;-ms-flex-wrap:wrap-reverse!important;flex-wrap:wrap-reverse!important}.justify-content-start{-webkit-box
-pack:start!important;-webkit-justify-content:flex-start!important;-ms-flex-pack:start!important;justify-content:flex-start!important}.justify-content-end{-webkit-box-pack:end!important;-webkit-justify-content:flex-end!important;-ms-flex-pack:end!important;justify-content:flex-end!important}.justify-content-center{-webkit-box-pack:center!important;-webkit-justify-content:center!important;-ms-flex-pack:center!important;justify-content:center!important}.justify-content-between{-webkit-box-pack:justify!important;-webkit-justify-content:space-between!important;-ms-flex-pack:justify!important;justify-content:space-between!important}.justify-content-around{-webkit-justify-content:space-around!important;-ms-flex-pack:distribute!important;justify-content:space-around!important}.align-items-start{-webkit-box-align:start!important;-webkit-align-items:flex-start!important;-ms-flex-align:start!important;align-items:flex-start!important}.align-items-end{-webkit-box-align:end!important;-webkit-align-items:flex-end!important;-ms-flex-align:end!important;align-items:flex-end!important}.align-items-center{-webkit-box-align:center!important;-webkit-align-items:center!important;-ms-flex-align:center!important;align-items:center!important}.align-items-baseline{-webkit-box-align:baseline!important;-webkit-align-items:baseline!important;-ms-flex-align:baseline!important;align-items:baseline!important}.align-items-stretch{-webkit-box-align:stretch!important;-webkit-align-items:stretch!important;-ms-flex-align:stretch!important;align-items:stretch!important}.align-content-start{-webkit-align-content:flex-start!important;-ms-flex-line-pack:start!important;align-content:flex-start!important}.align-content-end{-webkit-align-content:flex-end!important;-ms-flex-line-pack:end!important;align-content:flex-end!important}.align-content-center{-webkit-align-content:center!important;-ms-flex-line-pack:center!important;align-content:center!important}.align-content-between{-webkit-align-content:space-b
etween!important;-ms-flex-line-pack:justify!important;align-content:space-between!important}.align-content-around{-webkit-align-content:space-around!important;-ms-flex-line-pack:distribute!important;align-content:space-around!important}.align-content-stretch{-webkit-align-content:stretch!important;-ms-flex-line-pack:stretch!important;align-content:stretch!important}.align-self-auto{-webkit-align-self:auto!important;-ms-flex-item-align:auto!important;-ms-grid-row-align:auto!important;align-self:auto!important}.align-self-start{-webkit-align-self:flex-start!important;-ms-flex-item-align:start!important;align-self:flex-start!important}.align-self-end{-webkit-align-self:flex-end!important;-ms-flex-item-align:end!important;align-self:flex-end!important}.align-self-center{-webkit-align-self:center!important;-ms-flex-item-align:center!important;-ms-grid-row-align:center!important;align-self:center!important}.align-self-baseline{-webkit-align-self:baseline!important;-ms-flex-item-align:baseline!important;align-self:baseline!important}.align-self-stretch{-webkit-align-self:stretch!important;-ms-flex-item-align:stretch!important;-ms-grid-row-align:stretch!important;align-self:stretch!important}@media 
(min-width:576px){.flex-sm-first{-webkit-box-ordinal-group:0;-webkit-order:-1;-ms-flex-order:-1;order:-1}.flex-sm-last{-webkit-box-ordinal-group:2;-webkit-order:1;-ms-flex-order:1;order:1}.flex-sm-unordered{-webkit-box-ordinal-group:1;-webkit-order:0;-ms-flex-order:0;order:0}.flex-sm-row{-webkit-box-orient:horizontal!important;-webkit-box-direction:normal!important;-webkit-flex-direction:row!important;-ms-flex-direction:row!important;flex-direction:row!important}.flex-sm-column{-webkit-box-orient:vertical!important;-webkit-box-direction:normal!important;-webkit-flex-direction:column!important;-ms-flex-direction:column!important;flex-direction:column!important}.flex-sm-row-reverse{-webkit-box-orient:horizontal!important;-webkit-box-direction:reverse!important;-webkit-flex-direction:row-reverse!important;-ms-flex-direction:row-reverse!important;flex-direction:row-reverse!important}.flex-sm-column-reverse{-webkit-box-orient:vertical!important;-webkit-box-direction:reverse!important;-webkit-flex-direction:column-reverse!important;-ms-flex-direction:column-reverse!important;flex-direction:column-reverse!important}.flex-sm-wrap{-webkit-flex-wrap:wrap!important;-ms-flex-wrap:wrap!important;flex-wrap:wrap!important}.flex-sm-nowrap{-webkit-flex-wrap:nowrap!important;-ms-flex-wrap:nowrap!important;flex-wrap:nowrap!important}.flex-sm-wrap-reverse{-webkit-flex-wrap:wrap-reverse!important;-ms-flex-wrap:wrap-reverse!important;flex-wrap:wrap-reverse!important}.justify-content-sm-start{-webkit-box-pack:start!important;-webkit-justify-content:flex-start!important;-ms-flex-pack:start!important;justify-content:flex-start!important}.justify-content-sm-end{-webkit-box-pack:end!important;-webkit-justify-content:flex-end!important;-ms-flex-pack:end!important;justify-content:flex-end!important}.justify-content-sm-center{-webkit-box-pack:center!important;-webkit-justify-content:center!important;-ms-flex-pack:center!important;justify-content:center!important}.justify-content-sm-between{-webk
it-box-pack:justify!important;-webkit-justify-content:space-between!important;-ms-flex-pack:justify!important;justify-content:space-between!important}.justify-content-sm-around{-webkit-justify-content:space-around!important;-ms-flex-pack:distribute!important;justify-content:space-around!important}.align-items-sm-start{-webkit-box-align:start!important;-webkit-align-items:flex-start!important;-ms-flex-align:start!important;align-items:flex-start!important}.align-items-sm-end{-webkit-box-align:end!important;-webkit-align-items:flex-end!important;-ms-flex-align:end!important;align-items:flex-end!important}.align-items-sm-center{-webkit-box-align:center!important;-webkit-align-items:center!important;-ms-flex-align:center!important;align-items:center!important}.align-items-sm-baseline{-webkit-box-align:baseline!important;-webkit-align-items:baseline!important;-ms-flex-align:baseline!important;align-items:baseline!important}.align-items-sm-stretch{-webkit-box-align:stretch!important;-webkit-align-items:stretch!important;-ms-flex-align:stretch!important;align-items:stretch!important}.align-content-sm-start{-webkit-align-content:flex-start!important;-ms-flex-line-pack:start!important;align-content:flex-start!important}.align-content-sm-end{-webkit-align-content:flex-end!important;-ms-flex-line-pack:end!important;align-content:flex-end!important}.align-content-sm-center{-webkit-align-content:center!important;-ms-flex-line-pack:center!important;align-content:center!important}.align-content-sm-between{-webkit-align-content:space-between!important;-ms-flex-line-pack:justify!important;align-content:space-between!important}.align-content-sm-around{-webkit-align-content:space-around!important;-ms-flex-line-pack:distribute!important;align-content:space-around!important}.align-content-sm-stretch{-webkit-align-content:stretch!important;-ms-flex-line-pack:stretch!important;align-content:stretch!important}.align-self-sm-auto{-webkit-align-self:auto!important;-ms-flex-item-align:auto!im
portant;-ms-grid-row-align:auto!important;align-self:auto!important}.align-self-sm-start{-webkit-align-self:flex-start!important;-ms-flex-item-align:start!important;align-self:flex-start!important}.align-self-sm-end{-webkit-align-self:flex-end!important;-ms-flex-item-align:end!important;align-self:flex-end!important}.align-self-sm-center{-webkit-align-self:center!important;-ms-flex-item-align:center!important;-ms-grid-row-align:center!important;align-self:center!important}.align-self-sm-baseline{-webkit-align-self:baseline!important;-ms-flex-item-align:baseline!important;align-self:baseline!important}.align-self-sm-stretch{-webkit-align-self:stretch!important;-ms-flex-item-align:stretch!important;-ms-grid-row-align:stretch!important;align-self:stretch!important}}@media (min-width:768px){.flex-md-first{-webkit-box-ordinal-group:0;-webkit-order:-1;-ms-flex-order:-1;order:-1}.flex-md-last{-webkit-box-ordinal-group:2;-webkit-order:1;-ms-flex-order:1;order:1}.flex-md-unordered{-webkit-box-ordinal-group:1;-webkit-order:0;-ms-flex-order:0;order:0}.flex-md-row{-webkit-box-orient:horizontal!important;-webkit-box-direction:normal!important;-webkit-flex-direction:row!important;-ms-flex-direction:row!important;flex-direction:row!important}.flex-md-column{-webkit-box-orient:vertical!important;-webkit-box-direction:normal!important;-webkit-flex-direction:column!important;-ms-flex-direction:column!important;flex-direction:column!important}.flex-md-row-reverse{-webkit-box-orient:horizontal!important;-webkit-box-direction:reverse!important;-webkit-flex-direction:row-reverse!important;-ms-flex-direction:row-reverse!important;flex-direction:row-reverse!important}.flex-md-column-reverse{-webkit-box-orient:vertical!important;-webkit-box-direction:reverse!important;-webkit-flex-direction:column-reverse!important;-ms-flex-direction:column-reverse!important;flex-direction:column-reverse!important}.flex-md-wrap{-webkit-flex-wrap:wrap!important;-ms-flex-wrap:wrap!important;flex-wrap:wrap!imp
ortant}.flex-md-nowrap{-webkit-flex-wrap:nowrap!important;-ms-flex-wrap:nowrap!important;flex-wrap:nowrap!important}.flex-md-wrap-reverse{-webkit-flex-wrap:wrap-reverse!important;-ms-flex-wrap:wrap-reverse!important;flex-wrap:wrap-reverse!important}.justify-content-md-start{-webkit-box-pack:start!important;-webkit-justify-content:flex-start!important;-ms-flex-pack:start!important;justify-content:flex-start!important}.justify-content-md-end{-webkit-box-pack:end!important;-webkit-justify-content:flex-end!important;-ms-flex-pack:end!important;justify-content:flex-end!important}.justify-content-md-center{-webkit-box-pack:center!important;-webkit-justify-content:center!important;-ms-flex-pack:center!important;justify-content:center!important}.justify-content-md-between{-webkit-box-pack:justify!important;-webkit-justify-content:space-between!important;-ms-flex-pack:justify!important;justify-content:space-between!important}.justify-content-md-around{-webkit-justify-content:space-around!important;-ms-flex-pack:distribute!important;justify-content:space-around!important}.align-items-md-start{-webkit-box-align:start!important;-webkit-align-items:flex-start!important;-ms-flex-align:start!important;align-items:flex-start!important}.align-items-md-end{-webkit-box-align:end!important;-webkit-align-items:flex-end!important;-ms-flex-align:end!important;align-items:flex-end!important}.align-items-md-center{-webkit-box-align:center!important;-webkit-align-items:center!important;-ms-flex-align:center!important;align-items:center!important}.align-items-md-baseline{-webkit-box-align:baseline!important;-webkit-align-items:baseline!important;-ms-flex-align:baseline!important;align-items:baseline!important}.align-items-md-stretch{-webkit-box-align:stretch!important;-webkit-align-items:stretch!important;-ms-flex-align:stretch!important;align-items:stretch!important}.align-content-md-start{-webkit-align-content:flex-start!important;-ms-flex-line-pack:start!important;align-content:flex-start!
important}.align-content-md-end{-webkit-align-content:flex-end!important;-ms-flex-line-pack:end!important;align-content:flex-end!important}.align-content-md-center{-webkit-align-content:center!important;-ms-flex-line-pack:center!important;align-content:center!important}.align-content-md-between{-webkit-align-content:space-between!important;-ms-flex-line-pack:justify!important;align-content:space-between!important}.align-content-md-around{-webkit-align-content:space-around!important;-ms-flex-line-pack:distribute!important;align-content:space-around!important}.align-content-md-stretch{-webkit-align-content:stretch!important;-ms-flex-line-pack:stretch!important;align-content:stretch!important}.align-self-md-auto{-webkit-align-self:auto!important;-ms-flex-item-align:auto!important;-ms-grid-row-align:auto!important;align-self:auto!important}.align-self-md-start{-webkit-align-self:flex-start!important;-ms-flex-item-align:start!important;align-self:flex-start!important}.align-self-md-end{-webkit-align-self:flex-end!important;-ms-flex-item-align:end!important;align-self:flex-end!important}.align-self-md-center{-webkit-align-self:center!important;-ms-flex-item-align:center!important;-ms-grid-row-align:center!important;align-self:center!important}.align-self-md-baseline{-webkit-align-self:baseline!important;-ms-flex-item-align:baseline!important;align-self:baseline!important}.align-self-md-stretch{-webkit-align-self:stretch!important;-ms-flex-item-align:stretch!important;-ms-grid-row-align:stretch!important;align-self:stretch!important}}@media 
(min-width:992px){.flex-lg-first{-webkit-box-ordinal-group:0;-webkit-order:-1;-ms-flex-order:-1;order:-1}.flex-lg-last{-webkit-box-ordinal-group:2;-webkit-order:1;-ms-flex-order:1;order:1}.flex-lg-unordered{-webkit-box-ordinal-group:1;-webkit-order:0;-ms-flex-order:0;order:0}.flex-lg-row{-webkit-box-orient:horizontal!important;-webkit-box-direction:normal!important;-webkit-flex-direction:row!important;-ms-flex-direction:row!important;flex-direction:row!important}.flex-lg-column{-webkit-box-orient:vertical!important;-webkit-box-direction:normal!important;-webkit-flex-direction:column!important;-ms-flex-direction:column!important;flex-direction:column!important}.flex-lg-row-reverse{-webkit-box-orient:horizontal!important;-webkit-box-direction:reverse!important;-webkit-flex-direction:row-reverse!important;-ms-flex-direction:row-reverse!important;flex-direction:row-reverse!important}.flex-lg-column-reverse{-webkit-box-orient:vertical!important;-webkit-box-direction:reverse!important;-webkit-flex-direction:column-reverse!important;-ms-flex-direction:column-reverse!important;flex-direction:column-reverse!important}.flex-lg-wrap{-webkit-flex-wrap:wrap!important;-ms-flex-wrap:wrap!important;flex-wrap:wrap!important}.flex-lg-nowrap{-webkit-flex-wrap:nowrap!important;-ms-flex-wrap:nowrap!important;flex-wrap:nowrap!important}.flex-lg-wrap-reverse{-webkit-flex-wrap:wrap-reverse!important;-ms-flex-wrap:wrap-reverse!important;flex-wrap:wrap-reverse!important}.justify-content-lg-start{-webkit-box-pack:start!important;-webkit-justify-content:flex-start!important;-ms-flex-pack:start!important;justify-content:flex-start!important}.justify-content-lg-end{-webkit-box-pack:end!important;-webkit-justify-content:flex-end!important;-ms-flex-pack:end!important;justify-content:flex-end!important}.justify-content-lg-center{-webkit-box-pack:center!important;-webkit-justify-content:center!important;-ms-flex-pack:center!important;justify-content:center!important}.justify-content-lg-between{-webk
it-box-pack:justify!important;-webkit-justify-content:space-between!important;-ms-flex-pack:justify!important;justify-content:space-between!important}.justify-content-lg-around{-webkit-justify-content:space-around!important;-ms-flex-pack:distribute!important;justify-content:space-around!important}.align-items-lg-start{-webkit-box-align:start!important;-webkit-align-items:flex-start!important;-ms-flex-align:start!important;align-items:flex-start!important}.align-items-lg-end{-webkit-box-align:end!important;-webkit-align-items:flex-end!important;-ms-flex-align:end!important;align-items:flex-end!important}.align-items-lg-center{-webkit-box-align:center!important;-webkit-align-items:center!important;-ms-flex-align:center!important;align-items:center!important}.align-items-lg-baseline{-webkit-box-align:baseline!important;-webkit-align-items:baseline!important;-ms-flex-align:baseline!important;align-items:baseline!important}.align-items-lg-stretch{-webkit-box-align:stretch!important;-webkit-align-items:stretch!important;-ms-flex-align:stretch!important;align-items:stretch!important}.align-content-lg-start{-webkit-align-content:flex-start!important;-ms-flex-line-pack:start!important;align-content:flex-start!important}.align-content-lg-end{-webkit-align-content:flex-end!important;-ms-flex-line-pack:end!important;align-content:flex-end!important}.align-content-lg-center{-webkit-align-content:center!important;-ms-flex-line-pack:center!important;align-content:center!important}.align-content-lg-between{-webkit-align-content:space-between!important;-ms-flex-line-pack:justify!important;align-content:space-between!important}.align-content-lg-around{-webkit-align-content:space-around!important;-ms-flex-line-pack:distribute!important;align-content:space-around!important}.align-content-lg-stretch{-webkit-align-content:stretch!important;-ms-flex-line-pack:stretch!important;align-content:stretch!important}.align-self-lg-auto{-webkit-align-self:auto!important;-ms-flex-item-align:auto!im
portant;-ms-grid-row-align:auto!important;align-self:auto!important}.align-self-lg-start{-webkit-align-self:flex-start!important;-ms-flex-item-align:start!important;align-self:flex-start!important}.align-self-lg-end{-webkit-align-self:flex-end!important;-ms-flex-item-align:end!important;align-self:flex-end!important}.align-self-lg-center{-webkit-align-self:center!important;-ms-flex-item-align:center!important;-ms-grid-row-align:center!important;align-self:center!important}.align-self-lg-baseline{-webkit-align-self:baseline!important;-ms-flex-item-align:baseline!important;align-self:baseline!important}.align-self-lg-stretch{-webkit-align-self:stretch!important;-ms-flex-item-align:stretch!important;-ms-grid-row-align:stretch!important;align-self:stretch!important}}@media (min-width:1200px){.flex-xl-first{-webkit-box-ordinal-group:0;-webkit-order:-1;-ms-flex-order:-1;order:-1}.flex-xl-last{-webkit-box-ordinal-group:2;-webkit-order:1;-ms-flex-order:1;order:1}.flex-xl-unordered{-webkit-box-ordinal-group:1;-webkit-order:0;-ms-flex-order:0;order:0}.flex-xl-row{-webkit-box-orient:horizontal!important;-webkit-box-direction:normal!important;-webkit-flex-direction:row!important;-ms-flex-direction:row!important;flex-direction:row!important}.flex-xl-column{-webkit-box-orient:vertical!important;-webkit-box-direction:normal!important;-webkit-flex-direction:column!important;-ms-flex-direction:column!important;flex-direction:column!important}.flex-xl-row-reverse{-webkit-box-orient:horizontal!important;-webkit-box-direction:reverse!important;-webkit-flex-direction:row-reverse!important;-ms-flex-direction:row-reverse!important;flex-direction:row-reverse!important}.flex-xl-column-reverse{-webkit-box-orient:vertical!important;-webkit-box-direction:reverse!important;-webkit-flex-direction:column-reverse!important;-ms-flex-direction:column-reverse!important;flex-direction:column-reverse!important}.flex-xl-wrap{-webkit-flex-wrap:wrap!important;-ms-flex-wrap:wrap!important;flex-wrap:wrap!im
portant}.flex-xl-nowrap{-webkit-flex-wrap:nowrap!important;-ms-flex-wrap:nowrap!important;flex-wrap:nowrap!important}.flex-xl-wrap-reverse{-webkit-flex-wrap:wrap-reverse!important;-ms-flex-wrap:wrap-reverse!important;flex-wrap:wrap-reverse!important}.justify-content-xl-start{-webkit-box-pack:start!important;-webkit-justify-content:flex-start!important;-ms-flex-pack:start!important;justify-content:flex-start!important}.justify-content-xl-end{-webkit-box-pack:end!important;-webkit-justify-content:flex-end!important;-ms-flex-pack:end!important;justify-content:flex-end!important}.justify-content-xl-center{-webkit-box-pack:center!important;-webkit-justify-content:center!important;-ms-flex-pack:center!important;justify-content:center!important}.justify-content-xl-between{-webkit-box-pack:justify!important;-webkit-justify-content:space-between!important;-ms-flex-pack:justify!important;justify-content:space-between!important}.justify-content-xl-around{-webkit-justify-content:space-around!important;-ms-flex-pack:distribute!important;justify-content:space-around!important}.align-items-xl-start{-webkit-box-align:start!important;-webkit-align-items:flex-start!important;-ms-flex-align:start!important;align-items:flex-start!important}.align-items-xl-end{-webkit-box-align:end!important;-webkit-align-items:flex-end!important;-ms-flex-align:end!important;align-items:flex-end!important}.align-items-xl-center{-webkit-box-align:center!important;-webkit-align-items:center!important;-ms-flex-align:center!important;align-items:center!important}.align-items-xl-baseline{-webkit-box-align:baseline!important;-webkit-align-items:baseline!important;-ms-flex-align:baseline!important;align-items:baseline!important}.align-items-xl-stretch{-webkit-box-align:stretch!important;-webkit-align-items:stretch!important;-ms-flex-align:stretch!important;align-items:stretch!important}.align-content-xl-start{-webkit-align-content:flex-start!important;-ms-flex-line-pack:start!important;align-content:flex-start
!important}.align-content-xl-end{-webkit-align-content:flex-end!important;-ms-flex-line-pack:end!important;align-content:flex-end!important}.align-content-xl-center{-webkit-align-content:center!important;-ms-flex-line-pack:center!important;align-content:center!important}.align-content-xl-between{-webkit-align-content:space-between!important;-ms-flex-line-pack:justify!important;align-content:space-between!important}.align-content-xl-around{-webkit-align-content:space-around!important;-ms-flex-line-pack:distribute!important;align-content:space-around!important}.align-content-xl-stretch{-webkit-align-content:stretch!important;-ms-flex-line-pack:stretch!important;align-content:stretch!important}.align-self-xl-auto{-webkit-align-self:auto!important;-ms-flex-item-align:auto!important;-ms-grid-row-align:auto!important;align-self:auto!important}.align-self-xl-start{-webkit-align-self:flex-start!important;-ms-flex-item-align:start!important;align-self:flex-start!important}.align-self-xl-end{-webkit-align-self:flex-end!important;-ms-flex-item-align:end!important;align-self:flex-end!important}.align-self-xl-center{-webkit-align-self:center!important;-ms-flex-item-align:center!important;-ms-grid-row-align:center!important;align-self:center!important}.align-self-xl-baseline{-webkit-align-self:baseline!important;-ms-flex-item-align:baseline!important;align-self:baseline!important}.align-self-xl-stretch{-webkit-align-self:stretch!important;-ms-flex-item-align:stretch!important;-ms-grid-row-align:stretch!important;align-self:stretch!important}}.float-left{float:left!important}.float-right{float:right!important}.float-none{float:none!important}@media (min-width:576px){.float-sm-left{float:left!important}.float-sm-right{float:right!important}.float-sm-none{float:none!important}}@media (min-width:768px){.float-md-left{float:left!important}.float-md-right{float:right!important}.float-md-none{float:none!important}}@media 
(min-width:992px){.float-lg-left{float:left!important}.float-lg-right{float:right!important}.float-lg-none{float:none!important}}@media (min-width:1200px){.float-xl-left{float:left!important}.float-xl-right{float:right!important}.float-xl-none{float:none!important}}.fixed-top{position:fixed;top:0;right:0;left:0;z-index:1030}.fixed-bottom{position:fixed;right:0;bottom:0;left:0;z-index:1030}.sticky-top{position:-webkit-sticky;position:sticky;top:0;z-index:1030}.sr-only{position:absolute;width:1px;height:1px;padding:0;margin:-1px;overflow:hidden;clip:rect(0,0,0,0);border:0}.sr-only-focusable:active,.sr-only-focusable:focus{position:static;width:auto;height:auto;margin:0;overflow:visible;clip:auto}.w-25{width:25%!important}.w-50{width:50%!important}.w-75{width:75%!important}.w-100{width:100%!important}.h-25{height:25%!important}.h-50{height:50%!important}.h-75{height:75%!important}.h-100{height:100%!important}.mw-100{max-width:100%!important}.mh-100{max-height:100%!important}.m-0{margin:0 0!important}.mt-0{margin-top:0!important}.mr-0{margin-right:0!important}.mb-0{margin-bottom:0!important}.ml-0{margin-left:0!important}.mx-0{margin-right:0!important;margin-left:0!important}.my-0{margin-top:0!important;margin-bottom:0!important}.m-1{margin:.25rem .25rem!important}.mt-1{margin-top:.25rem!important}.mr-1{margin-right:.25rem!important}.mb-1{margin-bottom:.25rem!important}.ml-1{margin-left:.25rem!important}.mx-1{margin-right:.25rem!important;margin-left:.25rem!important}.my-1{margin-top:.25rem!important;margin-bottom:.25rem!important}.m-2{margin:.5rem .5rem!important}.mt-2{margin-top:.5rem!important}.mr-2{margin-right:.5rem!important}.mb-2{margin-bottom:.5rem!important}.ml-2{margin-left:.5rem!important}.mx-2{margin-right:.5rem!important;margin-left:.5rem!important}.my-2{margin-top:.5rem!important;margin-bottom:.5rem!important}.m-3{margin:1rem 
1rem!important}.mt-3{margin-top:1rem!important}.mr-3{margin-right:1rem!important}.mb-3{margin-bottom:1rem!important}.ml-3{margin-left:1rem!important}.mx-3{margin-right:1rem!important;margin-left:1rem!important}.my-3{margin-top:1rem!important;margin-bottom:1rem!important}.m-4{margin:1.5rem 1.5rem!important}.mt-4{margin-top:1.5rem!important}.mr-4{margin-right:1.5rem!important}.mb-4{margin-bottom:1.5rem!important}.ml-4{margin-left:1.5rem!important}.mx-4{margin-right:1.5rem!important;margin-left:1.5rem!important}.my-4{margin-top:1.5rem!important;margin-bottom:1.5rem!important}.m-5{margin:3rem 3rem!important}.mt-5{margin-top:3rem!important}.mr-5{margin-right:3rem!important}.mb-5{margin-bottom:3rem!important}.ml-5{margin-left:3rem!important}.mx-5{margin-right:3rem!important;margin-left:3rem!important}.my-5{margin-top:3rem!important;margin-bottom:3rem!important}.p-0{padding:0 0!important}.pt-0{padding-top:0!important}.pr-0{padding-right:0!important}.pb-0{padding-bottom:0!important}.pl-0{padding-left:0!important}.px-0{padding-right:0!important;padding-left:0!important}.py-0{padding-top:0!important;padding-bottom:0!important}.p-1{padding:.25rem .25rem!important}.pt-1{padding-top:.25rem!important}.pr-1{padding-right:.25rem!important}.pb-1{padding-bottom:.25rem!important}.pl-1{padding-left:.25rem!important}.px-1{padding-right:.25rem!important;padding-left:.25rem!important}.py-1{padding-top:.25rem!important;padding-bottom:.25rem!important}.p-2{padding:.5rem .5rem!important}.pt-2{padding-top:.5rem!important}.pr-2{padding-right:.5rem!important}.pb-2{padding-bottom:.5rem!important}.pl-2{padding-left:.5rem!important}.px-2{padding-right:.5rem!important;padding-left:.5rem!important}.py-2{padding-top:.5rem!important;padding-bottom:.5rem!important}.p-3{padding:1rem 
1rem!important}.pt-3{padding-top:1rem!important}.pr-3{padding-right:1rem!important}.pb-3{padding-bottom:1rem!important}.pl-3{padding-left:1rem!important}.px-3{padding-right:1rem!important;padding-left:1rem!important}.py-3{padding-top:1rem!important;padding-bottom:1rem!important}.p-4{padding:1.5rem 1.5rem!important}.pt-4{padding-top:1.5rem!important}.pr-4{padding-right:1.5rem!important}.pb-4{padding-bottom:1.5rem!important}.pl-4{padding-left:1.5rem!important}.px-4{padding-right:1.5rem!important;padding-left:1.5rem!important}.py-4{padding-top:1.5rem!important;padding-bottom:1.5rem!important}.p-5{padding:3rem 3rem!important}.pt-5{padding-top:3rem!important}.pr-5{padding-right:3rem!important}.pb-5{padding-bottom:3rem!important}.pl-5{padding-left:3rem!important}.px-5{padding-right:3rem!important;padding-left:3rem!important}.py-5{padding-top:3rem!important;padding-bottom:3rem!important}.m-auto{margin:auto!important}.mt-auto{margin-top:auto!important}.mr-auto{margin-right:auto!important}.mb-auto{margin-bottom:auto!important}.ml-auto{margin-left:auto!important}.mx-auto{margin-right:auto!important;margin-left:auto!important}.my-auto{margin-top:auto!important;margin-bottom:auto!important}@media (min-width:576px){.m-sm-0{margin:0 0!important}.mt-sm-0{margin-top:0!important}.mr-sm-0{margin-right:0!important}.mb-sm-0{margin-bottom:0!important}.ml-sm-0{margin-left:0!important}.mx-sm-0{margin-right:0!important;margin-left:0!important}.my-sm-0{margin-top:0!important;margin-bottom:0!important}.m-sm-1{margin:.25rem .25rem!important}.mt-sm-1{margin-top:.25rem!important}.mr-sm-1{margin-right:.25rem!important}.mb-sm-1{margin-bottom:.25rem!important}.ml-sm-1{margin-left:.25rem!important}.mx-sm-1{margin-right:.25rem!important;margin-left:.25rem!important}.my-sm-1{margin-top:.25rem!important;margin-bottom:.25rem!important}.m-sm-2{margin:.5rem 
.5rem!important}.mt-sm-2{margin-top:.5rem!important}.mr-sm-2{margin-right:.5rem!important}.mb-sm-2{margin-bottom:.5rem!important}.ml-sm-2{margin-left:.5rem!important}.mx-sm-2{margin-right:.5rem!important;margin-left:.5rem!important}.my-sm-2{margin-top:.5rem!important;margin-bottom:.5rem!important}.m-sm-3{margin:1rem 1rem!important}.mt-sm-3{margin-top:1rem!important}.mr-sm-3{margin-right:1rem!important}.mb-sm-3{margin-bottom:1rem!important}.ml-sm-3{margin-left:1rem!important}.mx-sm-3{margin-right:1rem!important;margin-left:1rem!important}.my-sm-3{margin-top:1rem!important;margin-bottom:1rem!important}.m-sm-4{margin:1.5rem 1.5rem!important}.mt-sm-4{margin-top:1.5rem!important}.mr-sm-4{margin-right:1.5rem!important}.mb-sm-4{margin-bottom:1.5rem!important}.ml-sm-4{margin-left:1.5rem!important}.mx-sm-4{margin-right:1.5rem!important;margin-left:1.5rem!important}.my-sm-4{margin-top:1.5rem!important;margin-bottom:1.5rem!important}.m-sm-5{margin:3rem 3rem!important}.mt-sm-5{margin-top:3rem!important}.mr-sm-5{margin-right:3rem!important}.mb-sm-5{margin-bottom:3rem!important}.ml-sm-5{margin-left:3rem!important}.mx-sm-5{margin-right:3rem!important;margin-left:3rem!important}.my-sm-5{margin-top:3rem!important;margin-bottom:3rem!important}.p-sm-0{padding:0 0!important}.pt-sm-0{padding-top:0!important}.pr-sm-0{padding-right:0!important}.pb-sm-0{padding-bottom:0!important}.pl-sm-0{padding-left:0!important}.px-sm-0{padding-right:0!important;padding-left:0!important}.py-sm-0{padding-top:0!important;padding-bottom:0!important}.p-sm-1{padding:.25rem .25rem!important}.pt-sm-1{padding-top:.25rem!important}.pr-sm-1{padding-right:.25rem!important}.pb-sm-1{padding-bottom:.25rem!important}.pl-sm-1{padding-left:.25rem!important}.px-sm-1{padding-right:.25rem!important;padding-left:.25rem!important}.py-sm-1{padding-top:.25rem!important;padding-bottom:.25rem!important}.p-sm-2{padding:.5rem 
.5rem!important}.pt-sm-2{padding-top:.5rem!important}.pr-sm-2{padding-right:.5rem!important}.pb-sm-2{padding-bottom:.5rem!important}.pl-sm-2{padding-left:.5rem!important}.px-sm-2{padding-right:.5rem!important;padding-left:.5rem!important}.py-sm-2{padding-top:.5rem!important;padding-bottom:.5rem!important}.p-sm-3{padding:1rem 1rem!important}.pt-sm-3{padding-top:1rem!important}.pr-sm-3{padding-right:1rem!important}.pb-sm-3{padding-bottom:1rem!important}.pl-sm-3{padding-left:1rem!important}.px-sm-3{padding-right:1rem!important;padding-left:1rem!important}.py-sm-3{padding-top:1rem!important;padding-bottom:1rem!important}.p-sm-4{padding:1.5rem 1.5rem!important}.pt-sm-4{padding-top:1.5rem!important}.pr-sm-4{padding-right:1.5rem!important}.pb-sm-4{padding-bottom:1.5rem!important}.pl-sm-4{padding-left:1.5rem!important}.px-sm-4{padding-right:1.5rem!important;padding-left:1.5rem!important}.py-sm-4{padding-top:1.5rem!important;padding-bottom:1.5rem!important}.p-sm-5{padding:3rem 3rem!important}.pt-sm-5{padding-top:3rem!important}.pr-sm-5{padding-right:3rem!important}.pb-sm-5{padding-bottom:3rem!important}.pl-sm-5{padding-left:3rem!important}.px-sm-5{padding-right:3rem!important;padding-left:3rem!important}.py-sm-5{padding-top:3rem!important;padding-bottom:3rem!important}.m-sm-auto{margin:auto!important}.mt-sm-auto{margin-top:auto!important}.mr-sm-auto{margin-right:auto!important}.mb-sm-auto{margin-bottom:auto!important}.ml-sm-auto{margin-left:auto!important}.mx-sm-auto{margin-right:auto!important;margin-left:auto!important}.my-sm-auto{margin-top:auto!important;margin-bottom:auto!important}}@media (min-width:768px){.m-md-0{margin:0 0!important}.mt-md-0{margin-top:0!important}.mr-md-0{margin-right:0!important}.mb-md-0{margin-bottom:0!important}.ml-md-0{margin-left:0!important}.mx-md-0{margin-right:0!important;margin-left:0!important}.my-md-0{margin-top:0!important;margin-bottom:0!important}.m-md-1{margin:.25rem 
.25rem!important}.mt-md-1{margin-top:.25rem!important}.mr-md-1{margin-right:.25rem!important}.mb-md-1{margin-bottom:.25rem!important}.ml-md-1{margin-left:.25rem!important}.mx-md-1{margin-right:.25rem!important;margin-left:.25rem!important}.my-md-1{margin-top:.25rem!important;margin-bottom:.25rem!important}.m-md-2{margin:.5rem .5rem!important}.mt-md-2{margin-top:.5rem!important}.mr-md-2{margin-right:.5rem!important}.mb-md-2{margin-bottom:.5rem!important}.ml-md-2{margin-left:.5rem!important}.mx-md-2{margin-right:.5rem!important;margin-left:.5rem!important}.my-md-2{margin-top:.5rem!important;margin-bottom:.5rem!important}.m-md-3{margin:1rem 1rem!important}.mt-md-3{margin-top:1rem!important}.mr-md-3{margin-right:1rem!important}.mb-md-3{margin-bottom:1rem!important}.ml-md-3{margin-left:1rem!important}.mx-md-3{margin-right:1rem!important;margin-left:1rem!important}.my-md-3{margin-top:1rem!important;margin-bottom:1rem!important}.m-md-4{margin:1.5rem 1.5rem!important}.mt-md-4{margin-top:1.5rem!important}.mr-md-4{margin-right:1.5rem!important}.mb-md-4{margin-bottom:1.5rem!important}.ml-md-4{margin-left:1.5rem!important}.mx-md-4{margin-right:1.5rem!important;margin-left:1.5rem!important}.my-md-4{margin-top:1.5rem!important;margin-bottom:1.5rem!important}.m-md-5{margin:3rem 3rem!important}.mt-md-5{margin-top:3rem!important}.mr-md-5{margin-right:3rem!important}.mb-md-5{margin-bottom:3rem!important}.ml-md-5{margin-left:3rem!important}.mx-md-5{margin-right:3rem!important;margin-left:3rem!important}.my-md-5{margin-top:3rem!important;margin-bottom:3rem!important}.p-md-0{padding:0 0!important}.pt-md-0{padding-top:0!important}.pr-md-0{padding-right:0!important}.pb-md-0{padding-bottom:0!important}.pl-md-0{padding-left:0!important}.px-md-0{padding-right:0!important;padding-left:0!important}.py-md-0{padding-top:0!important;padding-bottom:0!important}.p-md-1{padding:.25rem 
.25rem!important}.pt-md-1{padding-top:.25rem!important}.pr-md-1{padding-right:.25rem!important}.pb-md-1{padding-bottom:.25rem!important}.pl-md-1{padding-left:.25rem!important}.px-md-1{padding-right:.25rem!important;padding-left:.25rem!important}.py-md-1{padding-top:.25rem!important;padding-bottom:.25rem!important}.p-md-2{padding:.5rem .5rem!important}.pt-md-2{padding-top:.5rem!important}.pr-md-2{padding-right:.5rem!important}.pb-md-2{padding-bottom:.5rem!important}.pl-md-2{padding-left:.5rem!important}.px-md-2{padding-right:.5rem!important;padding-left:.5rem!important}.py-md-2{padding-top:.5rem!important;padding-bottom:.5rem!important}.p-md-3{padding:1rem 1rem!important}.pt-md-3{padding-top:1rem!important}.pr-md-3{padding-right:1rem!important}.pb-md-3{padding-bottom:1rem!important}.pl-md-3{padding-left:1rem!important}.px-md-3{padding-right:1rem!important;padding-left:1rem!important}.py-md-3{padding-top:1rem!important;padding-bottom:1rem!important}.p-md-4{padding:1.5rem 1.5rem!important}.pt-md-4{padding-top:1.5rem!important}.pr-md-4{padding-right:1.5rem!important}.pb-md-4{padding-bottom:1.5rem!important}.pl-md-4{padding-left:1.5rem!important}.px-md-4{padding-right:1.5rem!important;padding-left:1.5rem!important}.py-md-4{padding-top:1.5rem!important;padding-bottom:1.5rem!important}.p-md-5{padding:3rem 3rem!important}.pt-md-5{padding-top:3rem!important}.pr-md-5{padding-right:3rem!important}.pb-md-5{padding-bottom:3rem!important}.pl-md-5{padding-left:3rem!important}.px-md-5{padding-right:3rem!important;padding-left:3rem!important}.py-md-5{padding-top:3rem!important;padding-bottom:3rem!important}.m-md-auto{margin:auto!important}.mt-md-auto{margin-top:auto!important}.mr-md-auto{margin-right:auto!important}.mb-md-auto{margin-bottom:auto!important}.ml-md-auto{margin-left:auto!important}.mx-md-auto{margin-right:auto!important;margin-left:auto!important}.my-md-auto{margin-top:auto!important;margin-bottom:auto!important}}@media (min-width:992px){.m-lg-0{margin:0 
0!important}.mt-lg-0{margin-top:0!important}.mr-lg-0{margin-right:0!important}.mb-lg-0{margin-bottom:0!important}.ml-lg-0{margin-left:0!important}.mx-lg-0{margin-right:0!important;margin-left:0!important}.my-lg-0{margin-top:0!important;margin-bottom:0!important}.m-lg-1{margin:.25rem .25rem!important}.mt-lg-1{margin-top:.25rem!important}.mr-lg-1{margin-right:.25rem!important}.mb-lg-1{margin-bottom:.25rem!important}.ml-lg-1{margin-left:.25rem!important}.mx-lg-1{margin-right:.25rem!important;margin-left:.25rem!important}.my-lg-1{margin-top:.25rem!important;margin-bottom:.25rem!important}.m-lg-2{margin:.5rem .5rem!important}.mt-lg-2{margin-top:.5rem!important}.mr-lg-2{margin-right:.5rem!important}.mb-lg-2{margin-bottom:.5rem!important}.ml-lg-2{margin-left:.5rem!important}.mx-lg-2{margin-right:.5rem!important;margin-left:.5rem!important}.my-lg-2{margin-top:.5rem!important;margin-bottom:.5rem!important}.m-lg-3{margin:1rem 1rem!important}.mt-lg-3{margin-top:1rem!important}.mr-lg-3{margin-right:1rem!important}.mb-lg-3{margin-bottom:1rem!important}.ml-lg-3{margin-left:1rem!important}.mx-lg-3{margin-right:1rem!important;margin-left:1rem!important}.my-lg-3{margin-top:1rem!important;margin-bottom:1rem!important}.m-lg-4{margin:1.5rem 1.5rem!important}.mt-lg-4{margin-top:1.5rem!important}.mr-lg-4{margin-right:1.5rem!important}.mb-lg-4{margin-bottom:1.5rem!important}.ml-lg-4{margin-left:1.5rem!important}.mx-lg-4{margin-right:1.5rem!important;margin-left:1.5rem!important}.my-lg-4{margin-top:1.5rem!important;margin-bottom:1.5rem!important}.m-lg-5{margin:3rem 3rem!important}.mt-lg-5{margin-top:3rem!important}.mr-lg-5{margin-right:3rem!important}.mb-lg-5{margin-bottom:3rem!important}.ml-lg-5{margin-left:3rem!important}.mx-lg-5{margin-right:3rem!important;margin-left:3rem!important}.my-lg-5{margin-top:3rem!important;margin-bottom:3rem!important}.p-lg-0{padding:0 
0!important}.pt-lg-0{padding-top:0!important}.pr-lg-0{padding-right:0!important}.pb-lg-0{padding-bottom:0!important}.pl-lg-0{padding-left:0!important}.px-lg-0{padding-right:0!important;padding-left:0!important}.py-lg-0{padding-top:0!important;padding-bottom:0!important}.p-lg-1{padding:.25rem .25rem!important}.pt-lg-1{padding-top:.25rem!important}.pr-lg-1{padding-right:.25rem!important}.pb-lg-1{padding-bottom:.25rem!important}.pl-lg-1{padding-left:.25rem!important}.px-lg-1{padding-right:.25rem!important;padding-left:.25rem!important}.py-lg-1{padding-top:.25rem!important;padding-bottom:.25rem!important}.p-lg-2{padding:.5rem .5rem!important}.pt-lg-2{padding-top:.5rem!important}.pr-lg-2{padding-right:.5rem!important}.pb-lg-2{padding-bottom:.5rem!important}.pl-lg-2{padding-left:.5rem!important}.px-lg-2{padding-right:.5rem!important;padding-left:.5rem!important}.py-lg-2{padding-top:.5rem!important;padding-bottom:.5rem!important}.p-lg-3{padding:1rem 1rem!important}.pt-lg-3{padding-top:1rem!important}.pr-lg-3{padding-right:1rem!important}.pb-lg-3{padding-bottom:1rem!important}.pl-lg-3{padding-left:1rem!important}.px-lg-3{padding-right:1rem!important;padding-left:1rem!important}.py-lg-3{padding-top:1rem!important;padding-bottom:1rem!important}.p-lg-4{padding:1.5rem 1.5rem!important}.pt-lg-4{padding-top:1.5rem!important}.pr-lg-4{padding-right:1.5rem!important}.pb-lg-4{padding-bottom:1.5rem!important}.pl-lg-4{padding-left:1.5rem!important}.px-lg-4{padding-right:1.5rem!important;padding-left:1.5rem!important}.py-lg-4{padding-top:1.5rem!important;padding-bottom:1.5rem!important}.p-lg-5{padding:3rem 
3rem!important}.pt-lg-5{padding-top:3rem!important}.pr-lg-5{padding-right:3rem!important}.pb-lg-5{padding-bottom:3rem!important}.pl-lg-5{padding-left:3rem!important}.px-lg-5{padding-right:3rem!important;padding-left:3rem!important}.py-lg-5{padding-top:3rem!important;padding-bottom:3rem!important}.m-lg-auto{margin:auto!important}.mt-lg-auto{margin-top:auto!important}.mr-lg-auto{margin-right:auto!important}.mb-lg-auto{margin-bottom:auto!important}.ml-lg-auto{margin-left:auto!important}.mx-lg-auto{margin-right:auto!important;margin-left:auto!important}.my-lg-auto{margin-top:auto!important;margin-bottom:auto!important}}@media (min-width:1200px){.m-xl-0{margin:0 0!important}.mt-xl-0{margin-top:0!important}.mr-xl-0{margin-right:0!important}.mb-xl-0{margin-bottom:0!important}.ml-xl-0{margin-left:0!important}.mx-xl-0{margin-right:0!important;margin-left:0!important}.my-xl-0{margin-top:0!important;margin-bottom:0!important}.m-xl-1{margin:.25rem .25rem!important}.mt-xl-1{margin-top:.25rem!important}.mr-xl-1{margin-right:.25rem!important}.mb-xl-1{margin-bottom:.25rem!important}.ml-xl-1{margin-left:.25rem!important}.mx-xl-1{margin-right:.25rem!important;margin-left:.25rem!important}.my-xl-1{margin-top:.25rem!important;margin-bottom:.25rem!important}.m-xl-2{margin:.5rem .5rem!important}.mt-xl-2{margin-top:.5rem!important}.mr-xl-2{margin-right:.5rem!important}.mb-xl-2{margin-bottom:.5rem!important}.ml-xl-2{margin-left:.5rem!important}.mx-xl-2{margin-right:.5rem!important;margin-left:.5rem!important}.my-xl-2{margin-top:.5rem!important;margin-bottom:.5rem!important}.m-xl-3{margin:1rem 1rem!important}.mt-xl-3{margin-top:1rem!important}.mr-xl-3{margin-right:1rem!important}.mb-xl-3{margin-bottom:1rem!important}.ml-xl-3{margin-left:1rem!important}.mx-xl-3{margin-right:1rem!important;margin-left:1rem!important}.my-xl-3{margin-top:1rem!important;margin-bottom:1rem!important}.m-xl-4{margin:1.5rem 
1.5rem!important}.mt-xl-4{margin-top:1.5rem!important}.mr-xl-4{margin-right:1.5rem!important}.mb-xl-4{margin-bottom:1.5rem!important}.ml-xl-4{margin-left:1.5rem!important}.mx-xl-4{margin-right:1.5rem!important;margin-left:1.5rem!important}.my-xl-4{margin-top:1.5rem!important;margin-bottom:1.5rem!important}.m-xl-5{margin:3rem 3rem!important}.mt-xl-5{margin-top:3rem!important}.mr-xl-5{margin-right:3rem!important}.mb-xl-5{margin-bottom:3rem!important}.ml-xl-5{margin-left:3rem!important}.mx-xl-5{margin-right:3rem!important;margin-left:3rem!important}.my-xl-5{margin-top:3rem!important;margin-bottom:3rem!important}.p-xl-0{padding:0 0!important}.pt-xl-0{padding-top:0!important}.pr-xl-0{padding-right:0!important}.pb-xl-0{padding-bottom:0!important}.pl-xl-0{padding-left:0!important}.px-xl-0{padding-right:0!important;padding-left:0!important}.py-xl-0{padding-top:0!important;padding-bottom:0!important}.p-xl-1{padding:.25rem .25rem!important}.pt-xl-1{padding-top:.25rem!important}.pr-xl-1{padding-right:.25rem!important}.pb-xl-1{padding-bottom:.25rem!important}.pl-xl-1{padding-left:.25rem!important}.px-xl-1{padding-right:.25rem!important;padding-left:.25rem!important}.py-xl-1{padding-top:.25rem!important;padding-bottom:.25rem!important}.p-xl-2{padding:.5rem .5rem!important}.pt-xl-2{padding-top:.5rem!important}.pr-xl-2{padding-right:.5rem!important}.pb-xl-2{padding-bottom:.5rem!important}.pl-xl-2{padding-left:.5rem!important}.px-xl-2{padding-right:.5rem!important;padding-left:.5rem!important}.py-xl-2{padding-top:.5rem!important;padding-bottom:.5rem!important}.p-xl-3{padding:1rem 1rem!important}.pt-xl-3{padding-top:1rem!important}.pr-xl-3{padding-right:1rem!important}.pb-xl-3{padding-bottom:1rem!important}.pl-xl-3{padding-left:1rem!important}.px-xl-3{padding-right:1rem!important;padding-left:1rem!important}.py-xl-3{padding-top:1rem!important;padding-bottom:1rem!important}.p-xl-4{padding:1.5rem 
1.5rem!important}.pt-xl-4{padding-top:1.5rem!important}.pr-xl-4{padding-right:1.5rem!important}.pb-xl-4{padding-bottom:1.5rem!important}.pl-xl-4{padding-left:1.5rem!important}.px-xl-4{padding-right:1.5rem!important;padding-left:1.5rem!important}.py-xl-4{padding-top:1.5rem!important;padding-bottom:1.5rem!important}.p-xl-5{padding:3rem 3rem!important}.pt-xl-5{padding-top:3rem!important}.pr-xl-5{padding-right:3rem!important}.pb-xl-5{padding-bottom:3rem!important}.pl-xl-5{padding-left:3rem!important}.px-xl-5{padding-right:3rem!important;padding-left:3rem!important}.py-xl-5{padding-top:3rem!important;padding-bottom:3rem!important}.m-xl-auto{margin:auto!important}.mt-xl-auto{margin-top:auto!important}.mr-xl-auto{margin-right:auto!important}.mb-xl-auto{margin-bottom:auto!important}.ml-xl-auto{margin-left:auto!important}.mx-xl-auto{margin-right:auto!important;margin-left:auto!important}.my-xl-auto{margin-top:auto!important;margin-bottom:auto!important}}.text-justify{text-align:justify!important}.text-nowrap{white-space:nowrap!important}.text-truncate{overflow:hidden;text-overflow:ellipsis;white-space:nowrap}.text-left{text-align:left!important}.text-right{text-align:right!important}.text-center{text-align:center!important}@media (min-width:576px){.text-sm-left{text-align:left!important}.text-sm-right{text-align:right!important}.text-sm-center{text-align:center!important}}@media (min-width:768px){.text-md-left{text-align:left!important}.text-md-right{text-align:right!important}.text-md-center{text-align:center!important}}@media (min-width:992px){.text-lg-left{text-align:left!important}.text-lg-right{text-align:right!important}.text-lg-center{text-align:center!important}}@media 
(min-width:1200px){.text-xl-left{text-align:left!important}.text-xl-right{text-align:right!important}.text-xl-center{text-align:center!important}}.text-lowercase{text-transform:lowercase!important}.text-uppercase{text-transform:uppercase!important}.text-capitalize{text-transform:capitalize!important}.font-weight-normal{font-weight:400}.font-weight-bold{font-weight:700}.font-italic{font-style:italic}.text-white{color:#fff!important}.text-muted{color:#636c72!important}a.text-muted:focus,a.text-muted:hover{color:#4b5257!important}.text-primary{color:#0275d8!important}a.text-primary:focus,a.text-primary:hover{color:#025aa5!important}.text-success{color:#5cb85c!important}a.text-success:focus,a.text-success:hover{color:#449d44!important}.text-info{color:#5bc0de!important}a.text-info:focus,a.text-info:hover{color:#31b0d5!important}.text-warning{color:#f0ad4e!important}a.text-warning:focus,a.text-warning:hover{color:#ec971f!important}.text-danger{color:#d9534f!important}a.text-danger:focus,a.text-danger:hover{color:#c9302c!important}.text-gray-dark{color:#292b2c!important}a.text-gray-dark:focus,a.text-gray-dark:hover{color:#101112!important}.text-hide{font:0/0 a;color:transparent;text-shadow:none;background-color:transparent;border:0}.invisible{visibility:hidden!important}.hidden-xs-up{display:none!important}@media (max-width:575px){.hidden-xs-down{display:none!important}}@media (min-width:576px){.hidden-sm-up{display:none!important}}@media (max-width:767px){.hidden-sm-down{display:none!important}}@media (min-width:768px){.hidden-md-up{display:none!important}}@media (max-width:991px){.hidden-md-down{display:none!important}}@media (min-width:992px){.hidden-lg-up{display:none!important}}@media (max-width:1199px){.hidden-lg-down{display:none!important}}@media (min-width:1200px){.hidden-xl-up{display:none!important}}.hidden-xl-down{display:none!important}.visible-print-block{display:none!important}@media 
print{.visible-print-block{display:block!important}}.visible-print-inline{display:none!important}@media print{.visible-print-inline{display:inline!important}}.visible-print-inline-block{display:none!important}@media print{.visible-print-inline-block{display:inline-block!important}}@media print{.hidden-print{display:none!important}}/*# sourceMappingURL=bootstrap.min.css.map */ diff --git a/archivebox/templates/static/chrome_extension_icon.png b/archivebox/templates/static/chrome_extension_icon.png new file mode 100644 index 0000000000..34092dff2e Binary files /dev/null and b/archivebox/templates/static/chrome_extension_icon.png differ diff --git a/archivebox/templates/static/directory_index.html b/archivebox/templates/static/directory_index.html new file mode 100644 index 0000000000..003f6f974f --- /dev/null +++ b/archivebox/templates/static/directory_index.html @@ -0,0 +1,396 @@ +{% load i18n %} + + + + + + + + {% blocktranslate %}Index of {{ directory }}{% endblocktranslate %} + + + +
    + + +
    +
    +

    ArchiveBox File Index

    +

    {% blocktranslate %}Index of {{ directory }}{% endblocktranslate %}

    +
    + {{ file_list|length }} item{{ file_list|length|pluralize }} +
    +
    {{ directory }}
    +
    + + {% if file_list or directory != "/" %} + + {% else %} +
    This directory is empty.
    + {% endif %} +
    +
    + + + diff --git a/archivebox/templates/static/favicon.ico b/archivebox/templates/static/favicon.ico new file mode 100755 index 0000000000..e372700647 Binary files /dev/null and b/archivebox/templates/static/favicon.ico differ diff --git a/archivebox/templates/static/jquery-3.7.1.slim.min.js b/archivebox/templates/static/jquery-3.7.1.slim.min.js new file mode 100755 index 0000000000..35906b9293 --- /dev/null +++ b/archivebox/templates/static/jquery-3.7.1.slim.min.js @@ -0,0 +1,2 @@ +/*! jQuery v3.7.1 -ajax,-ajax/jsonp,-ajax/load,-ajax/script,-ajax/var/location,-ajax/var/nonce,-ajax/var/rquery,-ajax/xhr,-manipulation/_evalUrl,-deprecated/ajax-event-alias,-effects,-effects/animatedSelector,-effects/Tween | (c) OpenJS Foundation and other contributors | jquery.org/license */ +!function(e,t){"use strict";"object"==typeof module&&"object"==typeof module.exports?module.exports=e.document?t(e,!0):function(e){if(!e.document)throw new Error("jQuery requires a window with a document");return t(e)}:t(e)}("undefined"!=typeof window?window:this,function(ie,e){"use strict";var oe=[],r=Object.getPrototypeOf,ae=oe.slice,g=oe.flat?function(e){return oe.flat.call(e)}:function(e){return oe.concat.apply([],e)},s=oe.push,se=oe.indexOf,n={},i=n.toString,ue=n.hasOwnProperty,o=ue.toString,a=o.call(Object),le={},v=function(e){return"function"==typeof e&&"number"!=typeof e.nodeType&&"function"!=typeof e.item},y=function(e){return null!=e&&e===e.window},m=ie.document,u={type:!0,src:!0,nonce:!0,noModule:!0};function b(e,t,n){var r,i,o=(n=n||m).createElement("script");if(o.text=e,t)for(r in u)(i=t[r]||t.getAttribute&&t.getAttribute(r))&&o.setAttribute(r,i);n.head.appendChild(o).parentNode.removeChild(o)}function x(e){return null==e?e+"":"object"==typeof e||"function"==typeof e?n[i.call(e)]||"object":typeof e}var t="3.7.1 
-ajax,-ajax/jsonp,-ajax/load,-ajax/script,-ajax/var/location,-ajax/var/nonce,-ajax/var/rquery,-ajax/xhr,-manipulation/_evalUrl,-deprecated/ajax-event-alias,-effects,-effects/animatedSelector,-effects/Tween",l=/HTML$/i,ce=function(e,t){return new ce.fn.init(e,t)};function c(e){var t=!!e&&"length"in e&&e.length,n=x(e);return!v(e)&&!y(e)&&("array"===n||0===t||"number"==typeof t&&0+~]|"+ge+")"+ge+"*"),b=new RegExp(ge+"|>"),A=new RegExp(g),D=new RegExp("^"+t+"$"),N={ID:new RegExp("^#("+t+")"),CLASS:new RegExp("^\\.("+t+")"),TAG:new RegExp("^("+t+"|[*])"),ATTR:new RegExp("^"+d),PSEUDO:new RegExp("^"+g),CHILD:new RegExp("^:(only|first|last|nth|nth-last)-(child|of-type)(?:\\("+ge+"*(even|odd|(([+-]|)(\\d*)n|)"+ge+"*(?:([+-]|)"+ge+"*(\\d+)|))"+ge+"*\\)|)","i"),bool:new RegExp("^(?:"+f+")$","i"),needsContext:new RegExp("^"+ge+"*[>+~]|:(even|odd|eq|gt|lt|nth|first|last)(?:\\("+ge+"*((?:-\\d)?\\d*)"+ge+"*\\)|)(?=[^-]|$)","i")},L=/^(?:input|select|textarea|button)$/i,j=/^h\d$/i,O=/^(?:#([\w-]+)|(\w+)|\.([\w-]+))$/,P=/[+~]/,H=new RegExp("\\\\[\\da-fA-F]{1,6}"+ge+"?|\\\\([^\\r\\n\\f])","g"),q=function(e,t){var n="0x"+e.slice(1)-65536;return t||(n<0?String.fromCharCode(n+65536):String.fromCharCode(n>>10|55296,1023&n|56320))},R=function(){V()},M=K(function(e){return!0===e.disabled&&fe(e,"fieldset")},{dir:"parentNode",next:"legend"});try{E.apply(oe=ae.call(ye.childNodes),ye.childNodes),oe[ye.childNodes.length].nodeType}catch(e){E={apply:function(e,t){me.apply(e,ae.call(t))},call:function(e){me.apply(e,ae.call(arguments,1))}}}function I(t,e,n,r){var i,o,a,s,u,l,c,f=e&&e.ownerDocument,d=e?e.nodeType:9;if(n=n||[],"string"!=typeof t||!t||1!==d&&9!==d&&11!==d)return n;if(!r&&(V(e),e=e||C,T)){if(11!==d&&(u=O.exec(t)))if(i=u[1]){if(9===d){if(!(a=e.getElementById(i)))return n;if(a.id===i)return E.call(n,a),n}else if(f&&(a=f.getElementById(i))&&I.contains(e,a)&&a.id===i)return E.call(n,a),n}else{if(u[2])return 
E.apply(n,e.getElementsByTagName(t)),n;if((i=u[3])&&e.getElementsByClassName)return E.apply(n,e.getElementsByClassName(i)),n}if(!(h[t+" "]||p&&p.test(t))){if(c=t,f=e,1===d&&(b.test(t)||m.test(t))){(f=P.test(t)&&X(e.parentNode)||e)==e&&le.scope||((s=e.getAttribute("id"))?s=ce.escapeSelector(s):e.setAttribute("id",s=k)),o=(l=Y(t)).length;while(o--)l[o]=(s?"#"+s:":scope")+" "+G(l[o]);c=l.join(",")}try{return E.apply(n,f.querySelectorAll(c)),n}catch(e){h(t,!0)}finally{s===k&&e.removeAttribute("id")}}}return re(t.replace(ve,"$1"),e,n,r)}function W(){var r=[];return function e(t,n){return r.push(t+" ")>x.cacheLength&&delete e[r.shift()],e[t+" "]=n}}function B(e){return e[k]=!0,e}function F(e){var t=C.createElement("fieldset");try{return!!e(t)}catch(e){return!1}finally{t.parentNode&&t.parentNode.removeChild(t),t=null}}function $(t){return function(e){return fe(e,"input")&&e.type===t}}function _(t){return function(e){return(fe(e,"input")||fe(e,"button"))&&e.type===t}}function z(t){return function(e){return"form"in e?e.parentNode&&!1===e.disabled?"label"in e?"label"in e.parentNode?e.parentNode.disabled===t:e.disabled===t:e.isDisabled===t||e.isDisabled!==!t&&M(e)===t:e.disabled===t:"label"in e&&e.disabled===t}}function U(a){return B(function(o){return o=+o,B(function(e,t){var n,r=a([],e.length,o),i=r.length;while(i--)e[n=r[i]]&&(e[n]=!(t[n]=e[n]))})})}function X(e){return e&&"undefined"!=typeof e.getElementsByTagName&&e}function V(e){var t,n=e?e.ownerDocument||e:ye;return n!=C&&9===n.nodeType&&n.documentElement&&(r=(C=n).documentElement,T=!ce.isXMLDoc(C),i=r.matches||r.webkitMatchesSelector||r.msMatchesSelector,r.msMatchesSelector&&ye!=C&&(t=C.defaultView)&&t.top!==t&&t.addEventListener("unload",R),le.getById=F(function(e){return r.appendChild(e).id=ce.expando,!C.getElementsByName||!C.getElementsByName(ce.expando).length}),le.disconnectedMatch=F(function(e){return i.call(e,"*")}),le.scope=F(function(){return C.querySelectorAll(":scope")}),le.cssHas=F(function(){try{return 
C.querySelector(":has(*,:jqfake)"),!1}catch(e){return!0}}),le.getById?(x.filter.ID=function(e){var t=e.replace(H,q);return function(e){return e.getAttribute("id")===t}},x.find.ID=function(e,t){if("undefined"!=typeof t.getElementById&&T){var n=t.getElementById(e);return n?[n]:[]}}):(x.filter.ID=function(e){var n=e.replace(H,q);return function(e){var t="undefined"!=typeof e.getAttributeNode&&e.getAttributeNode("id");return t&&t.value===n}},x.find.ID=function(e,t){if("undefined"!=typeof t.getElementById&&T){var n,r,i,o=t.getElementById(e);if(o){if((n=o.getAttributeNode("id"))&&n.value===e)return[o];i=t.getElementsByName(e),r=0;while(o=i[r++])if((n=o.getAttributeNode("id"))&&n.value===e)return[o]}return[]}}),x.find.TAG=function(e,t){return"undefined"!=typeof t.getElementsByTagName?t.getElementsByTagName(e):t.querySelectorAll(e)},x.find.CLASS=function(e,t){if("undefined"!=typeof t.getElementsByClassName&&T)return t.getElementsByClassName(e)},p=[],F(function(e){var t;r.appendChild(e).innerHTML="",e.querySelectorAll("[selected]").length||p.push("\\["+ge+"*(?:value|"+f+")"),e.querySelectorAll("[id~="+k+"-]").length||p.push("~="),e.querySelectorAll("a#"+k+"+*").length||p.push(".#.+[+~]"),e.querySelectorAll(":checked").length||p.push(":checked"),(t=C.createElement("input")).setAttribute("type","hidden"),e.appendChild(t).setAttribute("name","D"),r.appendChild(e).disabled=!0,2!==e.querySelectorAll(":disabled").length&&p.push(":enabled",":disabled"),(t=C.createElement("input")).setAttribute("name",""),e.appendChild(t),e.querySelectorAll("[name='']").length||p.push("\\["+ge+"*name"+ge+"*="+ge+"*(?:''|\"\")")}),le.cssHas||p.push(":has"),p=p.length&&new RegExp(p.join("|")),l=function(e,t){if(e===t)return a=!0,0;var n=!e.compareDocumentPosition-!t.compareDocumentPosition;return 
n||(1&(n=(e.ownerDocument||e)==(t.ownerDocument||t)?e.compareDocumentPosition(t):1)||!le.sortDetached&&t.compareDocumentPosition(e)===n?e===C||e.ownerDocument==ye&&I.contains(ye,e)?-1:t===C||t.ownerDocument==ye&&I.contains(ye,t)?1:o?se.call(o,e)-se.call(o,t):0:4&n?-1:1)}),C}for(e in I.matches=function(e,t){return I(e,null,null,t)},I.matchesSelector=function(e,t){if(V(e),T&&!h[t+" "]&&(!p||!p.test(t)))try{var n=i.call(e,t);if(n||le.disconnectedMatch||e.document&&11!==e.document.nodeType)return n}catch(e){h(t,!0)}return 0":{dir:"parentNode",first:!0}," ":{dir:"parentNode"},"+":{dir:"previousSibling",first:!0},"~":{dir:"previousSibling"}},preFilter:{ATTR:function(e){return e[1]=e[1].replace(H,q),e[3]=(e[3]||e[4]||e[5]||"").replace(H,q),"~="===e[2]&&(e[3]=" "+e[3]+" "),e.slice(0,4)},CHILD:function(e){return e[1]=e[1].toLowerCase(),"nth"===e[1].slice(0,3)?(e[3]||I.error(e[0]),e[4]=+(e[4]?e[5]+(e[6]||1):2*("even"===e[3]||"odd"===e[3])),e[5]=+(e[7]+e[8]||"odd"===e[3])):e[3]&&I.error(e[0]),e},PSEUDO:function(e){var t,n=!e[6]&&e[2];return N.CHILD.test(e[0])?null:(e[3]?e[2]=e[4]||e[5]||"":n&&A.test(n)&&(t=Y(n,!0))&&(t=n.indexOf(")",n.length-t)-n.length)&&(e[0]=e[0].slice(0,t),e[2]=n.slice(0,t)),e.slice(0,3))}},filter:{TAG:function(e){var t=e.replace(H,q).toLowerCase();return"*"===e?function(){return!0}:function(e){return fe(e,t)}},CLASS:function(e){var t=s[e+" "];return t||(t=new RegExp("(^|"+ge+")"+e+"("+ge+"|$)"))&&s(e,function(e){return t.test("string"==typeof e.className&&e.className||"undefined"!=typeof e.getAttribute&&e.getAttribute("class")||"")})},ATTR:function(n,r,i){return function(e){var t=I.attr(e,n);return null==t?"!="===r:!r||(t+="","="===r?t===i:"!="===r?t!==i:"^="===r?i&&0===t.indexOf(i):"*="===r?i&&-1:\x20\t\r\n\f]*)[\x20\t\r\n\f]*\/?>(?:<\/\1>|)$/i;function T(e,n,r){return v(n)?ce.grep(e,function(e,t){return!!n.call(e,t,e)!==r}):n.nodeType?ce.grep(e,function(e){return e===n!==r}):"string"!=typeof 
n?ce.grep(e,function(e){return-1)[^>]*|#([\w-]+))$/;(ce.fn.init=function(e,t,n){var r,i;if(!e)return this;if(n=n||E,"string"==typeof e){if(!(r="<"===e[0]&&">"===e[e.length-1]&&3<=e.length?[null,e,null]:k.exec(e))||!r[1]&&t)return!t||t.jquery?(t||n).find(e):this.constructor(t).find(e);if(r[1]){if(t=t instanceof ce?t[0]:t,ce.merge(this,ce.parseHTML(r[1],t&&t.nodeType?t.ownerDocument||t:m,!0)),C.test(r[1])&&ce.isPlainObject(t))for(r in t)v(this[r])?this[r](t[r]):this.attr(r,t[r]);return this}return(i=m.getElementById(r[2]))&&(this[0]=i,this.length=1),this}return e.nodeType?(this[0]=e,this.length=1,this):v(e)?void 0!==n.ready?n.ready(e):e(ce):ce.makeArray(e,this)}).prototype=ce.fn,E=ce(m);var S=/^(?:parents|prev(?:Until|All))/,A={children:!0,contents:!0,next:!0,prev:!0};function D(e,t){while((e=e[t])&&1!==e.nodeType);return e}ce.fn.extend({has:function(e){var t=ce(e,this),n=t.length;return this.filter(function(){for(var e=0;e\x20\t\r\n\f]*)/i,Ce=/^$|^module$|\/(?:java|ecma)script/i;re=m.createDocumentFragment().appendChild(m.createElement("div")),(be=m.createElement("input")).setAttribute("type","radio"),be.setAttribute("checked","checked"),be.setAttribute("name","t"),re.appendChild(be),le.checkClone=re.cloneNode(!0).cloneNode(!0).lastChild.checked,re.innerHTML="",le.noCloneChecked=!!re.cloneNode(!0).lastChild.defaultValue,re.innerHTML="",le.option=!!re.lastChild;var Te={thead:[1,"","
    "],col:[2,"","
    "],tr:[2,"","
    "],td:[3,"","
    "],_default:[0,"",""]};function Ee(e,t){var n;return n="undefined"!=typeof e.getElementsByTagName?e.getElementsByTagName(t||"*"):"undefined"!=typeof e.querySelectorAll?e.querySelectorAll(t||"*"):[],void 0===t||t&&fe(e,t)?ce.merge([e],n):n}function ke(e,t){for(var n=0,r=e.length;n",""]);var Se=/<|&#?\w+;/;function Ae(e,t,n,r,i){for(var o,a,s,u,l,c,f=t.createDocumentFragment(),d=[],p=0,h=e.length;p\s*$/g;function Re(e,t){return fe(e,"table")&&fe(11!==t.nodeType?t:t.firstChild,"tr")&&ce(e).children("tbody")[0]||e}function Me(e){return e.type=(null!==e.getAttribute("type"))+"/"+e.type,e}function Ie(e){return"true/"===(e.type||"").slice(0,5)?e.type=e.type.slice(5):e.removeAttribute("type"),e}function We(e,t){var n,r,i,o,a,s;if(1===t.nodeType){if(_.hasData(e)&&(s=_.get(e).events))for(i in _.remove(t,"handle events"),s)for(n=0,r=s[i].length;n
    ",2===yt.childNodes.length),ce.parseHTML=function(e,t,n){return"string"!=typeof e?[]:("boolean"==typeof t&&(n=t,t=!1),t||(le.createHTMLDocument?((r=(t=m.implementation.createHTMLDocument("")).createElement("base")).href=m.location.href,t.head.appendChild(r)):t=m),o=!n&&[],(i=C.exec(e))?[t.createElement(i[1])]:(i=Ae([e],t,o),o&&o.length&&ce(o).remove(),ce.merge([],i.childNodes)));var r,i,o},ce.offset={setOffset:function(e,t,n){var r,i,o,a,s,u,l=ce.css(e,"position"),c=ce(e),f={};"static"===l&&(e.style.position="relative"),s=c.offset(),o=ce.css(e,"top"),u=ce.css(e,"left"),("absolute"===l||"fixed"===l)&&-1<(o+u).indexOf("auto")?(a=(r=c.position()).top,i=r.left):(a=parseFloat(o)||0,i=parseFloat(u)||0),v(t)&&(t=t.call(e,n,ce.extend({},s))),null!=t.top&&(f.top=t.top-s.top+a),null!=t.left&&(f.left=t.left-s.left+i),"using"in t?t.using.call(e,f):c.css(f)}},ce.fn.extend({offset:function(t){if(arguments.length)return void 0===t?this:this.each(function(e){ce.offset.setOffset(this,t,e)});var e,n,r=this[0];return r?r.getClientRects().length?(e=r.getBoundingClientRect(),n=r.ownerDocument.defaultView,{top:e.top+n.pageYOffset,left:e.left+n.pageXOffset}):{top:0,left:0}:void 0},position:function(){if(this[0]){var e,t,n,r=this[0],i={top:0,left:0};if("fixed"===ce.css(r,"position"))t=r.getBoundingClientRect();else{t=this.offset(),n=r.ownerDocument,e=r.offsetParent||n.documentElement;while(e&&(e===n.body||e===n.documentElement)&&"static"===ce.css(e,"position"))e=e.parentNode;e&&e!==r&&1===e.nodeType&&((i=ce(e).offset()).top+=ce.css(e,"borderTopWidth",!0),i.left+=ce.css(e,"borderLeftWidth",!0))}return{top:t.top-i.top-ce.css(r,"marginTop",!0),left:t.left-i.left-ce.css(r,"marginLeft",!0)}}},offsetParent:function(){return this.map(function(){var e=this.offsetParent;while(e&&"static"===ce.css(e,"position"))e=e.offsetParent;return e||K})}}),ce.each({scrollLeft:"pageXOffset",scrollTop:"pageYOffset"},function(t,i){var o="pageYOffset"===i;ce.fn[t]=function(e){return R(this,function(e,t,n){var 
r;if(y(e)?r=e:9===e.nodeType&&(r=e.defaultView),void 0===n)return r?r[i]:e[t];r?r.scrollTo(o?r.pageXOffset:n,o?n:r.pageYOffset):e[t]=n},t,e,arguments.length)}}),ce.each(["top","left"],function(e,n){ce.cssHooks[n]=Qe(le.pixelPosition,function(e,t){if(t)return t=Ve(e,n),$e.test(t)?ce(e).position()[n]+"px":t})}),ce.each({Height:"height",Width:"width"},function(a,s){ce.each({padding:"inner"+a,content:s,"":"outer"+a},function(r,o){ce.fn[o]=function(e,t){var n=arguments.length&&(r||"boolean"!=typeof e),i=r||(!0===e||!0===t?"margin":"border");return R(this,function(e,t,n){var r;return y(e)?0===o.indexOf("outer")?e["inner"+a]:e.document.documentElement["client"+a]:9===e.nodeType?(r=e.documentElement,Math.max(e.body["scroll"+a],r["scroll"+a],e.body["offset"+a],r["offset"+a],r["client"+a])):void 0===n?ce.css(e,t,i):ce.style(e,t,n,i)},s,n?e:void 0,n)}})}),ce.fn.extend({bind:function(e,t,n){return this.on(e,null,t,n)},unbind:function(e,t){return this.off(e,null,t)},delegate:function(e,t,n,r){return this.on(t,e,n,r)},undelegate:function(e,t,n){return 1===arguments.length?this.off(e,"**"):this.off(t,e||"**",n)},hover:function(e,t){return this.on("mouseenter",e).on("mouseleave",t||e)}}),ce.each("blur focus focusin focusout resize scroll click dblclick mousedown mouseup mousemove mouseover mouseout mouseenter mouseleave change select submit keydown keypress keyup contextmenu".split(" "),function(e,n){ce.fn[n]=function(e,t){return 0.select2-results__options{max-height:200px;overflow-y:auto}.select2-container--default .select2-results__option .select2-results__option{padding-left:1em}.select2-container--default .select2-results__option .select2-results__option .select2-results__group{padding-left:0}.select2-container--default .select2-results__option .select2-results__option .select2-results__option{margin-left:-1em;padding-left:2em}.select2-container--default .select2-results__option .select2-results__option .select2-results__option 
.select2-results__option{margin-left:-2em;padding-left:3em}.select2-container--default .select2-results__option .select2-results__option .select2-results__option .select2-results__option .select2-results__option{margin-left:-3em;padding-left:4em}.select2-container--default .select2-results__option .select2-results__option .select2-results__option .select2-results__option .select2-results__option .select2-results__option{margin-left:-4em;padding-left:5em}.select2-container--default .select2-results__option .select2-results__option .select2-results__option .select2-results__option .select2-results__option .select2-results__option .select2-results__option{margin-left:-5em;padding-left:6em}.select2-container--default .select2-results__option--group{padding:0}.select2-container--default .select2-results__option--disabled{color:#999}.select2-container--default .select2-results__option--selected{background-color:#ddd}.select2-container--default .select2-results__option--highlighted.select2-results__option--selectable{background-color:#5897fb;color:white}.select2-container--default .select2-results__group{cursor:default;display:block;padding:6px}.select2-container--classic .select2-selection--single{background-color:#f7f7f7;border:1px solid #aaa;border-radius:4px;outline:0;background-image:-webkit-linear-gradient(top, #fff 50%, #eee 100%);background-image:-o-linear-gradient(top, #fff 50%, #eee 100%);background-image:linear-gradient(to bottom, #fff 50%, #eee 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#FFFFFFFF', endColorstr='#FFEEEEEE', GradientType=0)}.select2-container--classic .select2-selection--single:focus{border:1px solid #5897fb}.select2-container--classic .select2-selection--single .select2-selection__rendered{color:#444;line-height:28px}.select2-container--classic .select2-selection--single 
.select2-selection__clear{cursor:pointer;float:right;font-weight:bold;height:26px;margin-right:20px}.select2-container--classic .select2-selection--single .select2-selection__placeholder{color:#999}.select2-container--classic .select2-selection--single .select2-selection__arrow{background-color:#ddd;border:none;border-left:1px solid #aaa;border-top-right-radius:4px;border-bottom-right-radius:4px;height:26px;position:absolute;top:1px;right:1px;width:20px;background-image:-webkit-linear-gradient(top, #eee 50%, #ccc 100%);background-image:-o-linear-gradient(top, #eee 50%, #ccc 100%);background-image:linear-gradient(to bottom, #eee 50%, #ccc 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#FFEEEEEE', endColorstr='#FFCCCCCC', GradientType=0)}.select2-container--classic .select2-selection--single .select2-selection__arrow b{border-color:#888 transparent transparent transparent;border-style:solid;border-width:5px 4px 0 4px;height:0;left:50%;margin-left:-4px;margin-top:-2px;position:absolute;top:50%;width:0}.select2-container--classic[dir="rtl"] .select2-selection--single .select2-selection__clear{float:left}.select2-container--classic[dir="rtl"] .select2-selection--single .select2-selection__arrow{border:none;border-right:1px solid #aaa;border-radius:0;border-top-left-radius:4px;border-bottom-left-radius:4px;left:1px;right:auto}.select2-container--classic.select2-container--open .select2-selection--single{border:1px solid #5897fb}.select2-container--classic.select2-container--open .select2-selection--single .select2-selection__arrow{background:transparent;border:none}.select2-container--classic.select2-container--open .select2-selection--single .select2-selection__arrow b{border-color:transparent transparent #888 transparent;border-width:0 4px 5px 4px}.select2-container--classic.select2-container--open.select2-container--above 
.select2-selection--single{border-top:none;border-top-left-radius:0;border-top-right-radius:0;background-image:-webkit-linear-gradient(top, #fff 0%, #eee 50%);background-image:-o-linear-gradient(top, #fff 0%, #eee 50%);background-image:linear-gradient(to bottom, #fff 0%, #eee 50%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#FFFFFFFF', endColorstr='#FFEEEEEE', GradientType=0)}.select2-container--classic.select2-container--open.select2-container--below .select2-selection--single{border-bottom:none;border-bottom-left-radius:0;border-bottom-right-radius:0;background-image:-webkit-linear-gradient(top, #eee 50%, #fff 100%);background-image:-o-linear-gradient(top, #eee 50%, #fff 100%);background-image:linear-gradient(to bottom, #eee 50%, #fff 100%);background-repeat:repeat-x;filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#FFEEEEEE', endColorstr='#FFFFFFFF', GradientType=0)}.select2-container--classic .select2-selection--multiple{background-color:white;border:1px solid #aaa;border-radius:4px;cursor:text;outline:0;padding-bottom:5px;padding-right:5px}.select2-container--classic .select2-selection--multiple:focus{border:1px solid #5897fb}.select2-container--classic .select2-selection--multiple .select2-selection__clear{display:none}.select2-container--classic .select2-selection--multiple .select2-selection__choice{background-color:#e4e4e4;border:1px solid #aaa;border-radius:4px;display:inline-block;margin-left:5px;margin-top:5px;padding:0}.select2-container--classic .select2-selection--multiple .select2-selection__choice__display{cursor:default;padding-left:2px;padding-right:5px}.select2-container--classic .select2-selection--multiple .select2-selection__choice__remove{background-color:transparent;border:none;border-top-left-radius:4px;border-bottom-left-radius:4px;color:#888;cursor:pointer;font-size:1em;font-weight:bold;padding:0 4px}.select2-container--classic .select2-selection--multiple 
.select2-selection__choice__remove:hover{color:#555;outline:none}.select2-container--classic[dir="rtl"] .select2-selection--multiple .select2-selection__choice{margin-left:5px;margin-right:auto}.select2-container--classic[dir="rtl"] .select2-selection--multiple .select2-selection__choice__display{padding-left:5px;padding-right:2px}.select2-container--classic[dir="rtl"] .select2-selection--multiple .select2-selection__choice__remove{border-top-left-radius:0;border-bottom-left-radius:0;border-top-right-radius:4px;border-bottom-right-radius:4px}.select2-container--classic.select2-container--open .select2-selection--multiple{border:1px solid #5897fb}.select2-container--classic.select2-container--open.select2-container--above .select2-selection--multiple{border-top:none;border-top-left-radius:0;border-top-right-radius:0}.select2-container--classic.select2-container--open.select2-container--below .select2-selection--multiple{border-bottom:none;border-bottom-left-radius:0;border-bottom-right-radius:0}.select2-container--classic .select2-search--dropdown .select2-search__field{border:1px solid #aaa;outline:0}.select2-container--classic .select2-search--inline .select2-search__field{outline:0;box-shadow:none}.select2-container--classic .select2-dropdown{background-color:#fff;border:1px solid transparent}.select2-container--classic .select2-dropdown--above{border-bottom:none}.select2-container--classic .select2-dropdown--below{border-top:none}.select2-container--classic .select2-results>.select2-results__options{max-height:200px;overflow-y:auto}.select2-container--classic .select2-results__option--group{padding:0}.select2-container--classic .select2-results__option--disabled{color:grey}.select2-container--classic .select2-results__option--highlighted.select2-results__option--selectable{background-color:#3875d7;color:#fff}.select2-container--classic .select2-results__group{cursor:default;display:block;padding:6px}.select2-container--classic.select2-container--open 
.select2-dropdown{border-color:#5897fb} diff --git a/archivebox/templates/static/select2.min.js b/archivebox/templates/static/select2.min.js new file mode 100755 index 0000000000..445a67987c --- /dev/null +++ b/archivebox/templates/static/select2.min.js @@ -0,0 +1,2 @@ +/*! Select2 4.1.0-rc.0 | https://github.com/select2/select2/blob/master/LICENSE.md */ +!function(n){"function"==typeof define&&define.amd?define(["jquery"],n):"object"==typeof module&&module.exports?module.exports=function(e,t){return void 0===t&&(t="undefined"!=typeof window?require("jquery"):require("jquery")(e)),n(t),t}:n(jQuery)}(function(t){var e,n,s,p,r,o,h,f,g,m,y,v,i,a,_,s=((u=t&&t.fn&&t.fn.select2&&t.fn.select2.amd?t.fn.select2.amd:u)&&u.requirejs||(u?n=u:u={},g={},m={},y={},v={},i=Object.prototype.hasOwnProperty,a=[].slice,_=/\.js$/,h=function(e,t){var n,s,i=c(e),r=i[0],t=t[1];return e=i[1],r&&(n=x(r=l(r,t))),r?e=n&&n.normalize?n.normalize(e,(s=t,function(e){return l(e,s)})):l(e,t):(r=(i=c(e=l(e,t)))[0],e=i[1],r&&(n=x(r))),{f:r?r+"!"+e:e,n:e,pr:r,p:n}},f={require:function(e){return w(e)},exports:function(e){var t=g[e];return void 0!==t?t:g[e]={}},module:function(e){return{id:e,uri:"",exports:g[e],config:(t=e,function(){return y&&y.config&&y.config[t]||{}})};var t}},r=function(e,t,n,s){var i,r,o,a,l,c=[],u=typeof n,d=A(s=s||e);if("undefined"==u||"function"==u){for(t=!t.length&&n.length?["require","exports","module"]:t,a=0;a":">",'"':""","'":"'","/":"/"};return"string"!=typeof e?e:String(e).replace(/[&<>"'\/\\]/g,function(e){return t[e]})},s.__cache={};var n=0;return s.GetUniqueElementId=function(e){var t=e.getAttribute("data-select2-id");return null!=t||(t=e.id?"select2-data-"+e.id:"select2-data-"+(++n).toString()+"-"+s.generateChars(4),e.setAttribute("data-select2-id",t)),t},s.StoreData=function(e,t,n){e=s.GetUniqueElementId(e);s.__cache[e]||(s.__cache[e]={}),s.__cache[e][t]=n},s.GetData=function(e,t){var n=s.GetUniqueElementId(e);return 
t?s.__cache[n]&&null!=s.__cache[n][t]?s.__cache[n][t]:r(e).data(t):s.__cache[n]},s.RemoveData=function(e){var t=s.GetUniqueElementId(e);null!=s.__cache[t]&&delete s.__cache[t],e.removeAttribute("data-select2-id")},s.copyNonInternalCssClasses=function(e,t){var n=(n=e.getAttribute("class").trim().split(/\s+/)).filter(function(e){return 0===e.indexOf("select2-")}),t=(t=t.getAttribute("class").trim().split(/\s+/)).filter(function(e){return 0!==e.indexOf("select2-")}),t=n.concat(t);e.setAttribute("class",t.join(" "))},s}),u.define("select2/results",["jquery","./utils"],function(d,p){function s(e,t,n){this.$element=e,this.data=n,this.options=t,s.__super__.constructor.call(this)}return p.Extend(s,p.Observable),s.prototype.render=function(){var e=d('
      ');return this.options.get("multiple")&&e.attr("aria-multiselectable","true"),this.$results=e},s.prototype.clear=function(){this.$results.empty()},s.prototype.displayMessage=function(e){var t=this.options.get("escapeMarkup");this.clear(),this.hideLoading();var n=d(''),s=this.options.get("translations").get(e.message);n.append(t(s(e.args))),n[0].className+=" select2-results__message",this.$results.append(n)},s.prototype.hideMessages=function(){this.$results.find(".select2-results__message").remove()},s.prototype.append=function(e){this.hideLoading();var t=[];if(null!=e.results&&0!==e.results.length){e.results=this.sort(e.results);for(var n=0;n",{class:"select2-results__options select2-results__options--nested",role:"none"});i.append(l),o.append(a),o.append(i)}else this.template(e,t);return p.StoreData(t,"data",e),t},s.prototype.bind=function(t,e){var i=this,n=t.id+"-results";this.$results.attr("id",n),t.on("results:all",function(e){i.clear(),i.append(e.data),t.isOpen()&&(i.setClasses(),i.highlightFirstItem())}),t.on("results:append",function(e){i.append(e.data),t.isOpen()&&i.setClasses()}),t.on("query",function(e){i.hideMessages(),i.showLoading(e)}),t.on("select",function(){t.isOpen()&&(i.setClasses(),i.options.get("scrollAfterSelect")&&i.highlightFirstItem())}),t.on("unselect",function(){t.isOpen()&&(i.setClasses(),i.options.get("scrollAfterSelect")&&i.highlightFirstItem())}),t.on("open",function(){i.$results.attr("aria-expanded","true"),i.$results.attr("aria-hidden","false"),i.setClasses(),i.ensureHighlightVisible()}),t.on("close",function(){i.$results.attr("aria-expanded","false"),i.$results.attr("aria-hidden","true"),i.$results.removeAttr("aria-activedescendant")}),t.on("results:toggle",function(){var e=i.getHighlightedResults();0!==e.length&&e.trigger("mouseup")}),t.on("results:select",function(){var 
e,t=i.getHighlightedResults();0!==t.length&&(e=p.GetData(t[0],"data"),t.hasClass("select2-results__option--selected")?i.trigger("close",{}):i.trigger("select",{data:e}))}),t.on("results:previous",function(){var e,t=i.getHighlightedResults(),n=i.$results.find(".select2-results__option--selectable"),s=n.index(t);s<=0||(e=s-1,0===t.length&&(e=0),(s=n.eq(e)).trigger("mouseenter"),t=i.$results.offset().top,n=s.offset().top,s=i.$results.scrollTop()+(n-t),0===e?i.$results.scrollTop(0):n-t<0&&i.$results.scrollTop(s))}),t.on("results:next",function(){var e,t=i.getHighlightedResults(),n=i.$results.find(".select2-results__option--selectable"),s=n.index(t)+1;s>=n.length||((e=n.eq(s)).trigger("mouseenter"),t=i.$results.offset().top+i.$results.outerHeight(!1),n=e.offset().top+e.outerHeight(!1),e=i.$results.scrollTop()+n-t,0===s?i.$results.scrollTop(0):tthis.$results.outerHeight()||s<0)&&this.$results.scrollTop(n))},s.prototype.template=function(e,t){var n=this.options.get("templateResult"),s=this.options.get("escapeMarkup"),e=n(e,t);null==e?t.style.display="none":"string"==typeof e?t.innerHTML=s(e):d(t).append(e)},s}),u.define("select2/keys",[],function(){return{BACKSPACE:8,TAB:9,ENTER:13,SHIFT:16,CTRL:17,ALT:18,ESC:27,SPACE:32,PAGE_UP:33,PAGE_DOWN:34,END:35,HOME:36,LEFT:37,UP:38,RIGHT:39,DOWN:40,DELETE:46}}),u.define("select2/selection/base",["jquery","../utils","../keys"],function(n,s,i){function r(e,t){this.$element=e,this.options=t,r.__super__.constructor.call(this)}return s.Extend(r,s.Observable),r.prototype.render=function(){var e=n('');return this._tabindex=0,null!=s.GetData(this.$element[0],"old-tabindex")?this._tabindex=s.GetData(this.$element[0],"old-tabindex"):null!=this.$element.attr("tabindex")&&(this._tabindex=this.$element.attr("tabindex")),e.attr("title",this.$element.attr("title")),e.attr("tabindex",this._tabindex),e.attr("aria-disabled","false"),this.$selection=e},r.prototype.bind=function(e,t){var 
n=this,s=e.id+"-results";this.container=e,this.$selection.on("focus",function(e){n.trigger("focus",e)}),this.$selection.on("blur",function(e){n._handleBlur(e)}),this.$selection.on("keydown",function(e){n.trigger("keypress",e),e.which===i.SPACE&&e.preventDefault()}),e.on("results:focus",function(e){n.$selection.attr("aria-activedescendant",e.data._resultId)}),e.on("selection:update",function(e){n.update(e.data)}),e.on("open",function(){n.$selection.attr("aria-expanded","true"),n.$selection.attr("aria-owns",s),n._attachCloseHandler(e)}),e.on("close",function(){n.$selection.attr("aria-expanded","false"),n.$selection.removeAttr("aria-activedescendant"),n.$selection.removeAttr("aria-owns"),n.$selection.trigger("focus"),n._detachCloseHandler(e)}),e.on("enable",function(){n.$selection.attr("tabindex",n._tabindex),n.$selection.attr("aria-disabled","false")}),e.on("disable",function(){n.$selection.attr("tabindex","-1"),n.$selection.attr("aria-disabled","true")})},r.prototype._handleBlur=function(e){var t=this;window.setTimeout(function(){document.activeElement==t.$selection[0]||n.contains(t.$selection[0],document.activeElement)||t.trigger("blur",e)},1)},r.prototype._attachCloseHandler=function(e){n(document.body).on("mousedown.select2."+e.id,function(e){var t=n(e.target).closest(".select2");n(".select2.select2-container--open").each(function(){this!=t[0]&&s.GetData(this,"element").select2("close")})})},r.prototype._detachCloseHandler=function(e){n(document.body).off("mousedown.select2."+e.id)},r.prototype.position=function(e,t){t.find(".selection").append(e)},r.prototype.destroy=function(){this._detachCloseHandler(this.container)},r.prototype.update=function(e){throw new Error("The `update` method must be defined in child classes.")},r.prototype.isEnabled=function(){return!this.isDisabled()},r.prototype.isDisabled=function(){return this.options.get("disabled")},r}),u.define("select2/selection/single",["jquery","./base","../utils","../keys"],function(e,t,n,s){function 
i(){i.__super__.constructor.apply(this,arguments)}return n.Extend(i,t),i.prototype.render=function(){var e=i.__super__.render.call(this);return e[0].classList.add("select2-selection--single"),e.html(''),e},i.prototype.bind=function(t,e){var n=this;i.__super__.bind.apply(this,arguments);var s=t.id+"-container";this.$selection.find(".select2-selection__rendered").attr("id",s).attr("role","textbox").attr("aria-readonly","true"),this.$selection.attr("aria-labelledby",s),this.$selection.attr("aria-controls",s),this.$selection.on("mousedown",function(e){1===e.which&&n.trigger("toggle",{originalEvent:e})}),this.$selection.on("focus",function(e){}),this.$selection.on("blur",function(e){}),t.on("focus",function(e){t.isOpen()||n.$selection.trigger("focus")})},i.prototype.clear=function(){var e=this.$selection.find(".select2-selection__rendered");e.empty(),e.removeAttr("title")},i.prototype.display=function(e,t){var n=this.options.get("templateSelection");return this.options.get("escapeMarkup")(n(e,t))},i.prototype.selectionContainer=function(){return e("")},i.prototype.update=function(e){var t,n;0!==e.length?(n=e[0],t=this.$selection.find(".select2-selection__rendered"),e=this.display(n,t),t.empty().append(e),(n=n.title||n.text)?t.attr("title",n):t.removeAttr("title")):this.clear()},i}),u.define("select2/selection/multiple",["jquery","./base","../utils"],function(i,e,c){function r(e,t){r.__super__.constructor.apply(this,arguments)}return c.Extend(r,e),r.prototype.render=function(){var e=r.__super__.render.call(this);return e[0].classList.add("select2-selection--multiple"),e.html('
        '),e},r.prototype.bind=function(e,t){var n=this;r.__super__.bind.apply(this,arguments);var s=e.id+"-container";this.$selection.find(".select2-selection__rendered").attr("id",s),this.$selection.on("click",function(e){n.trigger("toggle",{originalEvent:e})}),this.$selection.on("click",".select2-selection__choice__remove",function(e){var t;n.isDisabled()||(t=i(this).parent(),t=c.GetData(t[0],"data"),n.trigger("unselect",{originalEvent:e,data:t}))}),this.$selection.on("keydown",".select2-selection__choice__remove",function(e){n.isDisabled()||e.stopPropagation()})},r.prototype.clear=function(){var e=this.$selection.find(".select2-selection__rendered");e.empty(),e.removeAttr("title")},r.prototype.display=function(e,t){var n=this.options.get("templateSelection");return this.options.get("escapeMarkup")(n(e,t))},r.prototype.selectionContainer=function(){return i('
      • ')},r.prototype.update=function(e){if(this.clear(),0!==e.length){for(var t=[],n=this.$selection.find(".select2-selection__rendered").attr("id")+"-choice-",s=0;s')).attr("title",s()),e.attr("aria-label",s()),e.attr("aria-describedby",n),a.StoreData(e[0],"data",t),this.$selection.prepend(e),this.$selection[0].classList.add("select2-selection--clearable"))},e}),u.define("select2/selection/search",["jquery","../utils","../keys"],function(s,a,l){function e(e,t,n){e.call(this,t,n)}return e.prototype.render=function(e){var t=this.options.get("translations").get("search"),n=s('');this.$searchContainer=n,this.$search=n.find("textarea"),this.$search.prop("autocomplete",this.options.get("autocomplete")),this.$search.attr("aria-label",t());e=e.call(this);return this._transferTabIndex(),e.append(this.$searchContainer),e},e.prototype.bind=function(e,t,n){var s=this,i=t.id+"-results",r=t.id+"-container";e.call(this,t,n),s.$search.attr("aria-describedby",r),t.on("open",function(){s.$search.attr("aria-controls",i),s.$search.trigger("focus")}),t.on("close",function(){s.$search.val(""),s.resizeSearch(),s.$search.removeAttr("aria-controls"),s.$search.removeAttr("aria-activedescendant"),s.$search.trigger("focus")}),t.on("enable",function(){s.$search.prop("disabled",!1),s._transferTabIndex()}),t.on("disable",function(){s.$search.prop("disabled",!0)}),t.on("focus",function(e){s.$search.trigger("focus")}),t.on("results:focus",function(e){e.data._resultId?s.$search.attr("aria-activedescendant",e.data._resultId):s.$search.removeAttr("aria-activedescendant")}),this.$selection.on("focusin",".select2-search--inline",function(e){s.trigger("focus",e)}),this.$selection.on("focusout",".select2-search--inline",function(e){s._handleBlur(e)}),this.$selection.on("keydown",".select2-search--inline",function(e){var 
t;e.stopPropagation(),s.trigger("keypress",e),s._keyUpPrevented=e.isDefaultPrevented(),e.which!==l.BACKSPACE||""!==s.$search.val()||0<(t=s.$selection.find(".select2-selection__choice").last()).length&&(t=a.GetData(t[0],"data"),s.searchRemoveChoice(t),e.preventDefault())}),this.$selection.on("click",".select2-search--inline",function(e){s.$search.val()&&e.stopPropagation()});var t=document.documentMode,o=t&&t<=11;this.$selection.on("input.searchcheck",".select2-search--inline",function(e){o?s.$selection.off("input.search input.searchcheck"):s.$selection.off("keyup.search")}),this.$selection.on("keyup.search input.search",".select2-search--inline",function(e){var t;o&&"input"===e.type?s.$selection.off("input.search input.searchcheck"):(t=e.which)!=l.SHIFT&&t!=l.CTRL&&t!=l.ALT&&t!=l.TAB&&s.handleSearch(e)})},e.prototype._transferTabIndex=function(e){this.$search.attr("tabindex",this.$selection.attr("tabindex")),this.$selection.attr("tabindex","-1")},e.prototype.createPlaceholder=function(e,t){this.$search.attr("placeholder",t.text)},e.prototype.update=function(e,t){var n=this.$search[0]==document.activeElement;this.$search.attr("placeholder",""),e.call(this,t),this.resizeSearch(),n&&this.$search.trigger("focus")},e.prototype.handleSearch=function(){var e;this.resizeSearch(),this._keyUpPrevented||(e=this.$search.val(),this.trigger("query",{term:e})),this._keyUpPrevented=!1},e.prototype.searchRemoveChoice=function(e,t){this.trigger("unselect",{data:t}),this.$search.val(t.text),this.handleSearch()},e.prototype.resizeSearch=function(){this.$search.css("width","25px");var e="100%";""===this.$search.attr("placeholder")&&(e=.75*(this.$search.val().length+1)+"em"),this.$search.css("width",e)},e}),u.define("select2/selection/selectionCss",["../utils"],function(n){function e(){}return e.prototype.render=function(e){var 
t=e.call(this),e=this.options.get("selectionCssClass")||"";return-1!==e.indexOf(":all:")&&(e=e.replace(":all:",""),n.copyNonInternalCssClasses(t[0],this.$element[0])),t.addClass(e),t},e}),u.define("select2/selection/eventRelay",["jquery"],function(o){function e(){}return e.prototype.bind=function(e,t,n){var s=this,i=["open","opening","close","closing","select","selecting","unselect","unselecting","clear","clearing"],r=["opening","closing","selecting","unselecting","clearing"];e.call(this,t,n),t.on("*",function(e,t){var n;-1!==i.indexOf(e)&&(t=t||{},n=o.Event("select2:"+e,{params:t}),s.$element.trigger(n),-1!==r.indexOf(e)&&(t.prevented=n.isDefaultPrevented()))})},e}),u.define("select2/translation",["jquery","require"],function(t,n){function s(e){this.dict=e||{}}return s.prototype.all=function(){return this.dict},s.prototype.get=function(e){return this.dict[e]},s.prototype.extend=function(e){this.dict=t.extend({},e.all(),this.dict)},s._cache={},s.loadPath=function(e){var t;return e in s._cache||(t=n(e),s._cache[e]=t),new s(s._cache[e])},s}),u.define("select2/diacritics",[],function(){return{"โ’ถ":"A","๏ผก":"A","ร€":"A","ร":"A","ร‚":"A","แบฆ":"A","แบค":"A","แบช":"A","แบจ":"A","รƒ":"A","ฤ€":"A","ฤ‚":"A","แบฐ":"A","แบฎ":"A","แบด":"A","แบฒ":"A","ศฆ":"A","ว ":"A","ร„":"A","วž":"A","แบข":"A","ร…":"A","วบ":"A","ว":"A","ศ€":"A","ศ‚":"A","แบ 
":"A","แบฌ":"A","แบถ":"A","แธ€":"A","ฤ„":"A","ศบ":"A","โฑฏ":"A","๊œฒ":"AA","ร†":"AE","วผ":"AE","วข":"AE","๊œด":"AO","๊œถ":"AU","๊œธ":"AV","๊œบ":"AV","๊œผ":"AY","โ’ท":"B","๏ผข":"B","แธ‚":"B","แธ„":"B","แธ†":"B","ษƒ":"B","ฦ‚":"B","ฦ":"B","โ’ธ":"C","๏ผฃ":"C","ฤ†":"C","ฤˆ":"C","ฤŠ":"C","ฤŒ":"C","ร‡":"C","แธˆ":"C","ฦ‡":"C","ศป":"C","๊œพ":"C","โ’น":"D","๏ผค":"D","แธŠ":"D","ฤŽ":"D","แธŒ":"D","แธ":"D","แธ’":"D","แธŽ":"D","ฤ":"D","ฦ‹":"D","ฦŠ":"D","ฦ‰":"D","๊น":"D","วฑ":"DZ","ว„":"DZ","วฒ":"Dz","ว…":"Dz","โ’บ":"E","๏ผฅ":"E","รˆ":"E","ร‰":"E","รŠ":"E","แป€":"E","แบพ":"E","แป„":"E","แป‚":"E","แบผ":"E","ฤ’":"E","แธ”":"E","แธ–":"E","ฤ”":"E","ฤ–":"E","ร‹":"E","แบบ":"E","ฤš":"E","ศ„":"E","ศ†":"E","แบธ":"E","แป†":"E","ศจ":"E","แธœ":"E","ฤ˜":"E","แธ˜":"E","แธš":"E","ฦ":"E","ฦŽ":"E","โ’ป":"F","๏ผฆ":"F","แธž":"F","ฦ‘":"F","๊ป":"F","โ’ผ":"G","๏ผง":"G","วด":"G","ฤœ":"G","แธ ":"G","ฤž":"G","ฤ ":"G","วฆ":"G","ฤข":"G","วค":"G","ฦ“":"G","๊ž ":"G","๊ฝ":"G","๊พ":"G","โ’ฝ":"H","๏ผจ":"H","ฤค":"H","แธข":"H","แธฆ":"H","ศž":"H","แธค":"H","แธจ":"H","แธช":"H","ฤฆ":"H","โฑง":"H","โฑต":"H","๊ž":"H","โ’พ":"I","๏ผฉ":"I","รŒ":"I","ร":"I","รŽ":"I","ฤจ":"I","ฤช":"I","ฤฌ":"I","ฤฐ":"I","ร":"I","แธฎ":"I","แปˆ":"I","ว":"I","ศˆ":"I","ศŠ":"I","แปŠ":"I","ฤฎ":"I","แธฌ":"I","ฦ—":"I","โ’ฟ":"J","๏ผช":"J","ฤด":"J","ษˆ":"J","โ“€":"K","๏ผซ":"K","แธฐ":"K","วจ":"K","แธฒ":"K","ฤถ":"K","แธด":"K","ฦ˜":"K","โฑฉ":"K","๊€":"K","๊‚":"K","๊„":"K","๊žข":"K","โ“":"L","๏ผฌ":"L","ฤฟ":"L","ฤน":"L","ฤฝ":"L","แธถ":"L","แธธ":"L","ฤป":"L","แธผ":"L","แธบ":"L","ล":"L","ศฝ":"L","โฑข":"L","โฑ ":"L","๊ˆ":"L","๊†":"L","๊ž€":"L","ว‡":"LJ","วˆ":"Lj","โ“‚":"M","๏ผญ":"M","แธพ":"M","แน€":"M","แน‚":"M","โฑฎ":"M","ฦœ":"M","โ“ƒ":"N","๏ผฎ":"N","วธ":"N","ลƒ":"N","ร‘":"N","แน„":"N","ล‡":"N","แน†":"N","ล…":"N","แนŠ":"N","แนˆ":"N","ศ 
":"N","ฦ":"N","๊ž":"N","๊žค":"N","วŠ":"NJ","ว‹":"Nj","โ“„":"O","๏ผฏ":"O","ร’":"O","ร“":"O","ร”":"O","แป’":"O","แป":"O","แป–":"O","แป”":"O","ร•":"O","แนŒ":"O","ศฌ":"O","แนŽ":"O","ลŒ":"O","แน":"O","แน’":"O","ลŽ":"O","ศฎ":"O","ศฐ":"O","ร–":"O","ศช":"O","แปŽ":"O","ล":"O","ว‘":"O","ศŒ":"O","ศŽ":"O","ฦ ":"O","แปœ":"O","แปš":"O","แป ":"O","แปž":"O","แปข":"O","แปŒ":"O","แป˜":"O","วช":"O","วฌ":"O","ร˜":"O","วพ":"O","ฦ†":"O","ฦŸ":"O","๊Š":"O","๊Œ":"O","ล’":"OE","ฦข":"OI","๊Ž":"OO","ศข":"OU","โ“…":"P","๏ผฐ":"P","แน”":"P","แน–":"P","ฦค":"P","โฑฃ":"P","๊":"P","๊’":"P","๊”":"P","โ“†":"Q","๏ผฑ":"Q","๊–":"Q","๊˜":"Q","ษŠ":"Q","โ“‡":"R","๏ผฒ":"R","ล”":"R","แน˜":"R","ล˜":"R","ศ":"R","ศ’":"R","แนš":"R","แนœ":"R","ล–":"R","แนž":"R","ษŒ":"R","โฑค":"R","๊š":"R","๊žฆ":"R","๊ž‚":"R","โ“ˆ":"S","๏ผณ":"S","แบž":"S","ลš":"S","แนค":"S","ลœ":"S","แน ":"S","ล ":"S","แนฆ":"S","แนข":"S","แนจ":"S","ศ˜":"S","ลž":"S","โฑพ":"S","๊žจ":"S","๊ž„":"S","โ“‰":"T","๏ผด":"T","แนช":"T","ลค":"T","แนฌ":"T","ศš":"T","ลข":"T","แนฐ":"T","แนฎ":"T","ลฆ":"T","ฦฌ":"T","ฦฎ":"T","ศพ":"T","๊ž†":"T","๊œจ":"TZ","โ“Š":"U","๏ผต":"U","ร™":"U","รš":"U","ร›":"U","ลจ":"U","แนธ":"U","ลช":"U","แนบ":"U","ลฌ":"U","รœ":"U","ว›":"U","ว—":"U","ว•":"U","ว™":"U","แปฆ":"U","ลฎ":"U","ลฐ":"U","ว“":"U","ศ”":"U","ศ–":"U","ฦฏ":"U","แปช":"U","แปจ":"U","แปฎ":"U","แปฌ":"U","แปฐ":"U","แปค":"U","แนฒ":"U","ลฒ":"U","แนถ":"U","แนด":"U","ษ„":"U","โ“‹":"V","๏ผถ":"V","แนผ":"V","แนพ":"V","ฦฒ":"V","๊ž":"V","ษ…":"V","๊ ":"VY","โ“Œ":"W","๏ผท":"W","แบ€":"W","แบ‚":"W","ลด":"W","แบ†":"W","แบ„":"W","แบˆ":"W","โฑฒ":"W","โ“":"X","๏ผธ":"X","แบŠ":"X","แบŒ":"X","โ“Ž":"Y","๏ผน":"Y","แปฒ":"Y","ร":"Y","ลถ":"Y","แปธ":"Y","ศฒ":"Y","แบŽ":"Y","ลธ":"Y","แปถ":"Y","แปด":"Y","ฦณ":"Y","ษŽ":"Y","แปพ":"Y","โ“":"Z","๏ผบ":"Z","ลน":"Z","แบ":"Z","ลป":"Z","ลฝ":"Z","แบ’":"Z","แบ”":"Z","ฦต":"Z","ศค":"Z","โฑฟ":"Z","โฑซ":"Z","๊ข":"Z","โ“":"a","๏ฝ":"a","แบš":"a","ร 
":"a","รก":"a","รข":"a","แบง":"a","แบฅ":"a","แบซ":"a","แบฉ":"a","รฃ":"a","ฤ":"a","ฤƒ":"a","แบฑ":"a","แบฏ":"a","แบต":"a","แบณ":"a","ศง":"a","วก":"a","รค":"a","วŸ":"a","แบฃ":"a","รฅ":"a","วป":"a","วŽ":"a","ศ":"a","ศƒ":"a","แบก":"a","แบญ":"a","แบท":"a","แธ":"a","ฤ…":"a","โฑฅ":"a","ษ":"a","๊œณ":"aa","รฆ":"ae","วฝ":"ae","วฃ":"ae","๊œต":"ao","๊œท":"au","๊œน":"av","๊œป":"av","๊œฝ":"ay","โ“‘":"b","๏ฝ‚":"b","แธƒ":"b","แธ…":"b","แธ‡":"b","ฦ€":"b","ฦƒ":"b","ษ“":"b","โ“’":"c","๏ฝƒ":"c","ฤ‡":"c","ฤ‰":"c","ฤ‹":"c","ฤ":"c","รง":"c","แธ‰":"c","ฦˆ":"c","ศผ":"c","๊œฟ":"c","โ†„":"c","โ““":"d","๏ฝ„":"d","แธ‹":"d","ฤ":"d","แธ":"d","แธ‘":"d","แธ“":"d","แธ":"d","ฤ‘":"d","ฦŒ":"d","ษ–":"d","ษ—":"d","๊บ":"d","วณ":"dz","ว†":"dz","โ“”":"e","๏ฝ…":"e","รจ":"e","รฉ":"e","รช":"e","แป":"e","แบฟ":"e","แป…":"e","แปƒ":"e","แบฝ":"e","ฤ“":"e","แธ•":"e","แธ—":"e","ฤ•":"e","ฤ—":"e","รซ":"e","แบป":"e","ฤ›":"e","ศ…":"e","ศ‡":"e","แบน":"e","แป‡":"e","ศฉ":"e","แธ":"e","ฤ™":"e","แธ™":"e","แธ›":"e","ษ‡":"e","ษ›":"e","ว":"e","โ“•":"f","๏ฝ†":"f","แธŸ":"f","ฦ’":"f","๊ผ":"f","โ“–":"g","๏ฝ‡":"g","วต":"g","ฤ":"g","แธก":"g","ฤŸ":"g","ฤก":"g","วง":"g","ฤฃ":"g","วฅ":"g","ษ 
":"g","๊žก":"g","แตน":"g","๊ฟ":"g","โ“—":"h","๏ฝˆ":"h","ฤฅ":"h","แธฃ":"h","แธง":"h","ศŸ":"h","แธฅ":"h","แธฉ":"h","แธซ":"h","แบ–":"h","ฤง":"h","โฑจ":"h","โฑถ":"h","ษฅ":"h","ฦ•":"hv","โ“˜":"i","๏ฝ‰":"i","รฌ":"i","รญ":"i","รฎ":"i","ฤฉ":"i","ฤซ":"i","ฤญ":"i","รฏ":"i","แธฏ":"i","แป‰":"i","ว":"i","ศ‰":"i","ศ‹":"i","แป‹":"i","ฤฏ":"i","แธญ":"i","ษจ":"i","ฤฑ":"i","โ“™":"j","๏ฝŠ":"j","ฤต":"j","วฐ":"j","ษ‰":"j","โ“š":"k","๏ฝ‹":"k","แธฑ":"k","วฉ":"k","แธณ":"k","ฤท":"k","แธต":"k","ฦ™":"k","โฑช":"k","๊":"k","๊ƒ":"k","๊…":"k","๊žฃ":"k","โ“›":"l","๏ฝŒ":"l","ล€":"l","ฤบ":"l","ฤพ":"l","แธท":"l","แธน":"l","ฤผ":"l","แธฝ":"l","แธป":"l","ลฟ":"l","ล‚":"l","ฦš":"l","ษซ":"l","โฑก":"l","๊‰":"l","๊ž":"l","๊‡":"l","ว‰":"lj","โ“œ":"m","๏ฝ":"m","แธฟ":"m","แน":"m","แนƒ":"m","ษฑ":"m","ษฏ":"m","โ“":"n","๏ฝŽ":"n","วน":"n","ล„":"n","รฑ":"n","แน…":"n","ลˆ":"n","แน‡":"n","ล†":"n","แน‹":"n","แน‰":"n","ฦž":"n","ษฒ":"n","ล‰":"n","๊ž‘":"n","๊žฅ":"n","วŒ":"nj","โ“ž":"o","๏ฝ":"o","รฒ":"o","รณ":"o","รด":"o","แป“":"o","แป‘":"o","แป—":"o","แป•":"o","รต":"o","แน":"o","ศญ":"o","แน":"o","ล":"o","แน‘":"o","แน“":"o","ล":"o","ศฏ":"o","ศฑ":"o","รถ":"o","ศซ":"o","แป":"o","ล‘":"o","ว’":"o","ศ":"o","ศ":"o","ฦก":"o","แป":"o","แป›":"o","แปก":"o","แปŸ":"o","แปฃ":"o","แป":"o","แป™":"o","วซ":"o","วญ":"o","รธ":"o","วฟ":"o","ษ”":"o","๊‹":"o","๊":"o","ษต":"o","ล“":"oe","ฦฃ":"oi","ศฃ":"ou","๊":"oo","โ“Ÿ":"p","๏ฝ":"p","แน•":"p","แน—":"p","ฦฅ":"p","แตฝ":"p","๊‘":"p","๊“":"p","๊•":"p","โ“ 
":"q","๏ฝ‘":"q","ษ‹":"q","๊—":"q","๊™":"q","โ“ก":"r","๏ฝ’":"r","ล•":"r","แน™":"r","ล™":"r","ศ‘":"r","ศ“":"r","แน›":"r","แน":"r","ล—":"r","แนŸ":"r","ษ":"r","ษฝ":"r","๊›":"r","๊žง":"r","๊žƒ":"r","โ“ข":"s","๏ฝ“":"s","รŸ":"s","ล›":"s","แนฅ":"s","ล":"s","แนก":"s","ลก":"s","แนง":"s","แนฃ":"s","แนฉ":"s","ศ™":"s","ลŸ":"s","ศฟ":"s","๊žฉ":"s","๊ž…":"s","แบ›":"s","โ“ฃ":"t","๏ฝ”":"t","แนซ":"t","แบ—":"t","ลฅ":"t","แนญ":"t","ศ›":"t","ลฃ":"t","แนฑ":"t","แนฏ":"t","ลง":"t","ฦญ":"t","สˆ":"t","โฑฆ":"t","๊ž‡":"t","๊œฉ":"tz","โ“ค":"u","๏ฝ•":"u","รน":"u","รบ":"u","รป":"u","ลฉ":"u","แนน":"u","ลซ":"u","แนป":"u","ลญ":"u","รผ":"u","วœ":"u","ว˜":"u","ว–":"u","วš":"u","แปง":"u","ลฏ":"u","ลฑ":"u","ว”":"u","ศ•":"u","ศ—":"u","ฦฐ":"u","แปซ":"u","แปฉ":"u","แปฏ":"u","แปญ":"u","แปฑ":"u","แปฅ":"u","แนณ":"u","ลณ":"u","แนท":"u","แนต":"u","ส‰":"u","โ“ฅ":"v","๏ฝ–":"v","แนฝ":"v","แนฟ":"v","ส‹":"v","๊Ÿ":"v","สŒ":"v","๊ก":"vy","โ“ฆ":"w","๏ฝ—":"w","แบ":"w","แบƒ":"w","ลต":"w","แบ‡":"w","แบ…":"w","แบ˜":"w","แบ‰":"w","โฑณ":"w","โ“ง":"x","๏ฝ˜":"x","แบ‹":"x","แบ":"x","โ“จ":"y","๏ฝ™":"y","แปณ":"y","รฝ":"y","ลท":"y","แปน":"y","ศณ":"y","แบ":"y","รฟ":"y","แปท":"y","แบ™":"y","แปต":"y","ฦด":"y","ษ":"y","แปฟ":"y","โ“ฉ":"z","๏ฝš":"z","ลบ":"z","แบ‘":"z","ลผ":"z","ลพ":"z","แบ“":"z","แบ•":"z","ฦถ":"z","ศฅ":"z","ษ€":"z","โฑฌ":"z","๊ฃ":"z","ฮ†":"ฮ‘","ฮˆ":"ฮ•","ฮ‰":"ฮ—","ฮŠ":"ฮ™","ฮช":"ฮ™","ฮŒ":"ฮŸ","ฮŽ":"ฮฅ","ฮซ":"ฮฅ","ฮ":"ฮฉ","ฮฌ":"ฮฑ","ฮญ":"ฮต","ฮฎ":"ฮท","ฮฏ":"ฮน","ฯŠ":"ฮน","ฮ":"ฮน","ฯŒ":"ฮฟ","ฯ":"ฯ…","ฯ‹":"ฯ…","ฮฐ":"ฯ…","ฯŽ":"ฯ‰","ฯ‚":"ฯƒ","โ€™":"'"}}),u.define("select2/data/base",["../utils"],function(n){function s(e,t){s.__super__.constructor.call(this)}return n.Extend(s,n.Observable),s.prototype.current=function(e){throw new Error("The `current` method must be defined in child classes.")},s.prototype.query=function(e,t){throw new Error("The `query` method must be defined in child 
classes.")},s.prototype.bind=function(e,t){},s.prototype.destroy=function(){},s.prototype.generateResultId=function(e,t){e=e.id+"-result-";return e+=n.generateChars(4),null!=t.id?e+="-"+t.id.toString():e+="-"+n.generateChars(4),e},s}),u.define("select2/data/select",["./base","../utils","jquery"],function(e,a,l){function n(e,t){this.$element=e,this.options=t,n.__super__.constructor.call(this)}return a.Extend(n,e),n.prototype.current=function(e){var t=this;e(Array.prototype.map.call(this.$element[0].querySelectorAll(":checked"),function(e){return t.item(l(e))}))},n.prototype.select=function(i){var e,r=this;if(i.selected=!0,null!=i.element&&"option"===i.element.tagName.toLowerCase())return i.element.selected=!0,void this.$element.trigger("input").trigger("change");this.$element.prop("multiple")?this.current(function(e){var t=[];(i=[i]).push.apply(i,e);for(var n=0;nthis.maximumInputLength?this.trigger("results:message",{message:"inputTooLong",args:{maximum:this.maximumInputLength,input:t.term,params:t}}):e.call(this,t,n)},e}),u.define("select2/data/maximumSelectionLength",[],function(){function e(e,t,n){this.maximumSelectionLength=n.get("maximumSelectionLength"),e.call(this,t,n)}return e.prototype.bind=function(e,t,n){var s=this;e.call(this,t,n),t.on("select",function(){s._checkIfMaximumSelected()})},e.prototype.query=function(e,t,n){var s=this;this._checkIfMaximumSelected(function(){e.call(s,t,n)})},e.prototype._checkIfMaximumSelected=function(e,t){var n=this;this.current(function(e){e=null!=e?e.length:0;0=n.maximumSelectionLength?n.trigger("results:message",{message:"maximumSelected",args:{maximum:n.maximumSelectionLength}}):t&&t()})},e}),u.define("select2/dropdown",["jquery","./utils"],function(t,e){function n(e,t){this.$element=e,this.options=t,n.__super__.constructor.call(this)}return e.Extend(n,e.Observable),n.prototype.render=function(){var e=t('');return 
e.attr("dir",this.options.get("dir")),this.$dropdown=e},n.prototype.bind=function(){},n.prototype.position=function(e,t){},n.prototype.destroy=function(){this.$dropdown.remove()},n}),u.define("select2/dropdown/search",["jquery"],function(r){function e(){}return e.prototype.render=function(e){var t=e.call(this),n=this.options.get("translations").get("search"),e=r('');return this.$searchContainer=e,this.$search=e.find("input"),this.$search.prop("autocomplete",this.options.get("autocomplete")),this.$search.attr("aria-label",n()),t.prepend(e),t},e.prototype.bind=function(e,t,n){var s=this,i=t.id+"-results";e.call(this,t,n),this.$search.on("keydown",function(e){s.trigger("keypress",e),s._keyUpPrevented=e.isDefaultPrevented()}),this.$search.on("input",function(e){r(this).off("keyup")}),this.$search.on("keyup input",function(e){s.handleSearch(e)}),t.on("open",function(){s.$search.attr("tabindex",0),s.$search.attr("aria-controls",i),s.$search.trigger("focus"),window.setTimeout(function(){s.$search.trigger("focus")},0)}),t.on("close",function(){s.$search.attr("tabindex",-1),s.$search.removeAttr("aria-controls"),s.$search.removeAttr("aria-activedescendant"),s.$search.val(""),s.$search.trigger("blur")}),t.on("focus",function(){t.isOpen()||s.$search.trigger("focus")}),t.on("results:all",function(e){null!=e.query.term&&""!==e.query.term||(s.showSearch(e)?s.$searchContainer[0].classList.remove("select2-search--hide"):s.$searchContainer[0].classList.add("select2-search--hide"))}),t.on("results:focus",function(e){e.data._resultId?s.$search.attr("aria-activedescendant",e.data._resultId):s.$search.removeAttr("aria-activedescendant")})},e.prototype.handleSearch=function(e){var t;this._keyUpPrevented||(t=this.$search.val(),this.trigger("query",{term:t})),this._keyUpPrevented=!1},e.prototype.showSearch=function(e,t){return!0},e}),u.define("select2/dropdown/hidePlaceholder",[],function(){function 
e(e,t,n,s){this.placeholder=this.normalizePlaceholder(n.get("placeholder")),e.call(this,t,n,s)}return e.prototype.append=function(e,t){t.results=this.removePlaceholder(t.results),e.call(this,t)},e.prototype.normalizePlaceholder=function(e,t){return t="string"==typeof t?{id:"",text:t}:t},e.prototype.removePlaceholder=function(e,t){for(var n=t.slice(0),s=t.length-1;0<=s;s--){var i=t[s];this.placeholder.id===i.id&&n.splice(s,1)}return n},e}),u.define("select2/dropdown/infiniteScroll",["jquery"],function(n){function e(e,t,n,s){this.lastParams={},e.call(this,t,n,s),this.$loadingMore=this.createLoadingMore(),this.loading=!1}return e.prototype.append=function(e,t){this.$loadingMore.remove(),this.loading=!1,e.call(this,t),this.showLoadingMore(t)&&(this.$results.append(this.$loadingMore),this.loadMoreIfNeeded())},e.prototype.bind=function(e,t,n){var s=this;e.call(this,t,n),t.on("query",function(e){s.lastParams=e,s.loading=!0}),t.on("query:append",function(e){s.lastParams=e,s.loading=!0}),this.$results.on("scroll",this.loadMoreIfNeeded.bind(this))},e.prototype.loadMoreIfNeeded=function(){var e=n.contains(document.documentElement,this.$loadingMore[0]);!this.loading&&e&&(e=this.$results.offset().top+this.$results.outerHeight(!1),this.$loadingMore.offset().top+this.$loadingMore.outerHeight(!1)<=e+50&&this.loadMore())},e.prototype.loadMore=function(){this.loading=!0;var e=n.extend({},{page:1},this.lastParams);e.page++,this.trigger("query:append",e)},e.prototype.showLoadingMore=function(e,t){return t.pagination&&t.pagination.more},e.prototype.createLoadingMore=function(){var e=n('
      • '),t=this.options.get("translations").get("loadingMore");return e.html(t(this.lastParams)),e},e}),u.define("select2/dropdown/attachBody",["jquery","../utils"],function(u,o){function e(e,t,n){this.$dropdownParent=u(n.get("dropdownParent")||document.body),e.call(this,t,n)}return e.prototype.bind=function(e,t,n){var s=this;e.call(this,t,n),t.on("open",function(){s._showDropdown(),s._attachPositioningHandler(t),s._bindContainerResultHandlers(t)}),t.on("close",function(){s._hideDropdown(),s._detachPositioningHandler(t)}),this.$dropdownContainer.on("mousedown",function(e){e.stopPropagation()})},e.prototype.destroy=function(e){e.call(this),this.$dropdownContainer.remove()},e.prototype.position=function(e,t,n){t.attr("class",n.attr("class")),t[0].classList.remove("select2"),t[0].classList.add("select2-container--open"),t.css({position:"absolute",top:-999999}),this.$container=n},e.prototype.render=function(e){var t=u(""),e=e.call(this);return t.append(e),this.$dropdownContainer=t},e.prototype._hideDropdown=function(e){this.$dropdownContainer.detach()},e.prototype._bindContainerResultHandlers=function(e,t){var n;this._containerResultsHandlersBound||(n=this,t.on("results:all",function(){n._positionDropdown(),n._resizeDropdown()}),t.on("results:append",function(){n._positionDropdown(),n._resizeDropdown()}),t.on("results:message",function(){n._positionDropdown(),n._resizeDropdown()}),t.on("select",function(){n._positionDropdown(),n._resizeDropdown()}),t.on("unselect",function(){n._positionDropdown(),n._resizeDropdown()}),this._containerResultsHandlersBound=!0)},e.prototype._attachPositioningHandler=function(e,t){var n=this,s="scroll.select2."+t.id,i="resize.select2."+t.id,r="orientationchange.select2."+t.id,t=this.$container.parents().filter(o.hasScroll);t.each(function(){o.StoreData(this,"select2-scroll-position",{x:u(this).scrollLeft(),y:u(this).scrollTop()})}),t.on(s,function(e){var t=o.GetData(this,"select2-scroll-position");u(this).scrollTop(t.y)}),u(window).on(s+" 
"+i+" "+r,function(e){n._positionDropdown(),n._resizeDropdown()})},e.prototype._detachPositioningHandler=function(e,t){var n="scroll.select2."+t.id,s="resize.select2."+t.id,t="orientationchange.select2."+t.id;this.$container.parents().filter(o.hasScroll).off(n),u(window).off(n+" "+s+" "+t)},e.prototype._positionDropdown=function(){var e=u(window),t=this.$dropdown[0].classList.contains("select2-dropdown--above"),n=this.$dropdown[0].classList.contains("select2-dropdown--below"),s=null,i=this.$container.offset();i.bottom=i.top+this.$container.outerHeight(!1);var r={height:this.$container.outerHeight(!1)};r.top=i.top,r.bottom=i.top+r.height;var o=this.$dropdown.outerHeight(!1),a=e.scrollTop(),l=e.scrollTop()+e.height(),c=ai.bottom+o,a={left:i.left,top:r.bottom},l=this.$dropdownParent;"static"===l.css("position")&&(l=l.offsetParent());i={top:0,left:0};(u.contains(document.body,l[0])||l[0].isConnected)&&(i=l.offset()),a.top-=i.top,a.left-=i.left,t||n||(s="below"),e||!c||t?!c&&e&&t&&(s="below"):s="above",("above"==s||t&&"below"!==s)&&(a.top=r.top-i.top-o),null!=s&&(this.$dropdown[0].classList.remove("select2-dropdown--below"),this.$dropdown[0].classList.remove("select2-dropdown--above"),this.$dropdown[0].classList.add("select2-dropdown--"+s),this.$container[0].classList.remove("select2-container--below"),this.$container[0].classList.remove("select2-container--above"),this.$container[0].classList.add("select2-container--"+s)),this.$dropdownContainer.css(a)},e.prototype._resizeDropdown=function(){var e={width:this.$container.outerWidth(!1)+"px"};this.options.get("dropdownAutoWidth")&&(e.minWidth=e.width,e.position="relative",e.width="auto"),this.$dropdown.css(e)},e.prototype._showDropdown=function(e){this.$dropdownContainer.appendTo(this.$dropdownParent),this._positionDropdown(),this._resizeDropdown()},e}),u.define("select2/dropdown/minimumResultsForSearch",[],function(){function 
e(e,t,n,s){this.minimumResultsForSearch=n.get("minimumResultsForSearch"),this.minimumResultsForSearch<0&&(this.minimumResultsForSearch=1/0),e.call(this,t,n,s)}return e.prototype.showSearch=function(e,t){return!(function e(t){for(var n=0,s=0;s');return e.attr("dir",this.options.get("dir")),this.$container=e,this.$container[0].classList.add("select2-container--"+this.options.get("theme")),r.StoreData(e[0],"element",this.$element),e},o}),u.define("jquery-mousewheel",["jquery"],function(e){return e}),u.define("jquery.select2",["jquery","jquery-mousewheel","./select2/core","./select2/defaults","./select2/utils"],function(i,e,r,t,o){var a;return null==i.fn.select2&&(a=["open","close","destroy"],i.fn.select2=function(t){if("object"==typeof(t=t||{}))return this.each(function(){var e=i.extend(!0,{},t);new r(i(this),e)}),this;if("string"!=typeof t)throw new Error("Invalid arguments for Select2: "+t);var n,s=Array.prototype.slice.call(arguments,1);return this.each(function(){var e=o.GetData(this,"select2");null==e&&window.console&&console.error&&console.error("The select2('"+t+"') method was called on an element that is not using Select2."),n=e[t].apply(e,s)}),-1 bool: + resolved = path.expanduser().resolve(strict=False) + if resolved == PYTEST_BASETEMP_ROOT or PYTEST_BASETEMP_ROOT in resolved.parents: + return False + return resolved == REPO_ROOT or REPO_ROOT in resolved.parents + + +def _assert_not_repo_path(path: Path, *, label: str) -> None: + if _is_repo_path(path): + raise AssertionError(f"{label} must not point inside the repo root during tests: {path}") + + +def _assert_safe_runtime_paths(*, cwd: Path | None = None, env: dict[str, str] | None = None) -> None: + if cwd is not None: + _assert_not_repo_path(cwd, label="cwd") + + for key in ("CRAWL_DIR", "SNAP_DIR"): + value = (env or {}).get(key) + if value: + _assert_not_repo_path(Path(value), label=key) + + +def _test_source_pythonpath() -> str: + entries: list[str] = [str(REPO_ROOT.resolve(strict=False))] + for 
repo_name in ("abxpkg", "abx-plugins", "abx-dl"): + for repo_path in (WORKSPACE_ROOT / repo_name, REPO_ROOT / repo_name): + if repo_path.exists(): + entries.append(str(repo_path.resolve(strict=False))) + break + return os.pathsep.join(entries) + + +def _set_test_source_pythonpath(env: dict[str, str]) -> None: + source_pythonpath = _test_source_pythonpath() + existing_entries = [ + str(Path(entry).expanduser().resolve(strict=False)) + for entry in (env.get("PYTHONPATH") or "").split(os.pathsep) + if entry and Path(entry).expanduser().is_absolute() + ] + entries = [entry for entry in [*source_pythonpath.split(os.pathsep), *existing_entries] if entry] + if entries: + env["PYTHONPATH"] = os.pathsep.join(dict.fromkeys(entries)) + else: + env.pop("PYTHONPATH", None) + + +def _sync_archivebox_test_data_dir(data_dir: Path) -> None: + from archivebox.config import constants as constants_mod + from archivebox.config import paths as paths_mod + + data_dir = data_dir.resolve() + archive_dir = data_dir / constants_mod.CONSTANTS.ARCHIVE_DIR_NAME + users_dir = archive_dir / constants_mod.CONSTANTS.USERS_DIR_NAME + + paths_mod.DATA_DIR = data_dir + paths_mod.ARCHIVE_DIR = archive_dir + paths_mod.USERS_DIR = users_dir + paths_mod.DATABASE_FILE = data_dir / constants_mod.CONSTANTS.SQL_INDEX_FILENAME + + constants_mod.CONSTANTS.DATA_DIR = data_dir + constants_mod.CONSTANTS.ARCHIVE_DIR = archive_dir + constants_mod.CONSTANTS.USERS_DIR = users_dir + constants_mod.CONSTANTS.COLLECTION_ID = paths_mod.get_collection_id(data_dir) + constants_mod.CONSTANTS.SOURCES_DIR = data_dir / constants_mod.CONSTANTS.SOURCES_DIR_NAME + constants_mod.CONSTANTS.PERSONAS_DIR = data_dir / constants_mod.CONSTANTS.PERSONAS_DIR_NAME + constants_mod.CONSTANTS.LOGS_DIR = data_dir / constants_mod.CONSTANTS.LOGS_DIR_NAME + constants_mod.CONSTANTS.CACHE_DIR = data_dir / constants_mod.CONSTANTS.CACHE_DIR_NAME + constants_mod.CONSTANTS.CUSTOM_TEMPLATES_DIR = data_dir / 
constants_mod.CONSTANTS.CUSTOM_TEMPLATES_DIR_NAME + constants_mod.CONSTANTS.USER_PLUGINS_DIR = data_dir / constants_mod.CONSTANTS.CUSTOM_PLUGINS_DIR_NAME + constants_mod.CONSTANTS.CONFIG_FILE = data_dir / constants_mod.CONSTANTS.CONFIG_FILENAME + constants_mod.CONSTANTS.DATABASE_FILE = data_dir / constants_mod.CONSTANTS.SQL_INDEX_FILENAME + constants_mod.CONSTANTS.DEFAULT_TMP_DIR = data_dir / constants_mod.CONSTANTS.TMP_DIR_NAME / constants_mod.CONSTANTS.MACHINE_ID + + constants_mod.CONSTANTS_CONFIG.update( + {key: value for key, value in constants_mod.CONSTANTS.__dict__.items() if key.isupper() and not key.startswith("_")}, + ) + + +# ============================================================================= +# CLI Helpers (defined before fixtures that use them) +# ============================================================================= + + +class ArchiveBoxCmdResult: + """Process-like result for completed and live ArchiveBox CLI commands.""" + + def __init__(self, args: list[str], process: subprocess.Popen) -> None: + self.args = args + self._process = process + self._stdout = None + self._stderr = None + + @property + def stdout(self): + if self._stdout is None: + return self._process.stdout + return self._stdout + + @property + def stderr(self): + if self._stderr is None: + return self._process.stderr + return self._stderr + + @property + def stdin(self): + return self._process.stdin + + @property + def returncode(self) -> int | None: + return self._process.returncode + + @property + def pid(self) -> int | None: + return self._process.pid + + def poll(self) -> int | None: + return self._process.poll() + + def wait(self, timeout: float | None = None) -> int | None: + return self._process.wait(timeout=timeout) + + def communicate(self, input=None, timeout: float | None = None): + self._stdout, self._stderr = self._process.communicate(input=input, timeout=timeout) + return self._stdout, self._stderr + + def terminate(self) -> None: + 
self._process.terminate() + + def kill(self) -> None: + self._process.kill() + + def send_signal(self, sig: int) -> None: + self._process.send_signal(sig) + + +def run_archivebox_cmd( + args: list[str], + *, + cwd: Path | None = None, + input: str | bytes | None = None, + timeout: int = 60, + env: dict[str, str] | None = None, + check: bool = False, + text: bool = True, + capture_output: bool = True, + stdout: Any = None, + stderr: Any = None, + stdin: Any = None, + wait: bool = True, + start_new_session: bool = False, + default_cli_env: bool = False, + disable_extractors: bool = False, + replace_env: bool = False, +) -> ArchiveBoxCmdResult: + """Run an ArchiveBox CLI command under test isolation.""" + cwd = cwd or Path.cwd() + cmd = ["archivebox", *args] + + _assert_not_repo_path(cwd, label="cwd") + + run_env = {} if replace_env else os.environ.copy() + if default_cli_env or disable_extractors or env is not None: + if default_cli_env: + run_env["USE_COLOR"] = "False" + run_env["SHOW_PROGRESS"] = "False" + if disable_extractors: + run_env.update( + { + "SAVE_ARCHIVEDOTORG": "False", + "SAVE_TITLE": "False", + "SAVE_FAVICON": "False", + "SAVE_WGET": "False", + "SAVE_WARC": "False", + "SAVE_PDF": "False", + "SAVE_SCREENSHOT": "False", + "SAVE_DOM": "False", + "SAVE_SINGLEFILE": "False", + "SAVE_READABILITY": "False", + "SAVE_MERCURY": "False", + "SAVE_GIT": "False", + "SAVE_YTDLP": "False", + "SAVE_HEADERS": "False", + "SAVE_HTMLTOTEXT": "False", + }, + ) + if env: + run_env.update(env) + _set_test_source_pythonpath(run_env) + + _assert_safe_runtime_paths(cwd=cwd, env=run_env) + + if stdin is not None: + assert input is None, "pass either input or stdin, not both" + if wait: + input = stdin + if isinstance(input, str): + text = True + + if capture_output: + stdout = subprocess.PIPE if stdout is None else stdout + stderr = subprocess.PIPE if stderr is None else stderr + + process = subprocess.Popen( + cmd, + stdin=subprocess.PIPE if wait and input is not None else 
stdin, + stdout=stdout, + stderr=stderr, + text=text, + cwd=cwd, + env=run_env, + start_new_session=start_new_session, + ) + result = ArchiveBoxCmdResult(cmd, process) + + if wait: + try: + result.communicate(input=input, timeout=timeout) + except subprocess.TimeoutExpired: + process.kill() + result.communicate() + raise + if check and result.returncode: + raise subprocess.CalledProcessError( + result.returncode, + cmd, + output=result.stdout, + stderr=result.stderr, + ) + + return result + + +def find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None: + candidates = {snapshot_id} + if len(snapshot_id) == 32: + candidates.add(f"{snapshot_id[:8]}-{snapshot_id[8:12]}-{snapshot_id[12:16]}-{snapshot_id[16:20]}-{snapshot_id[20:]}") + elif len(snapshot_id) == 36 and "-" in snapshot_id: + candidates.add(snapshot_id.replace("-", "")) + + for needle in candidates: + for path in data_dir.rglob(needle): + if path.is_dir(): + return path + return None + + +# ============================================================================= +# Fixtures +# ============================================================================= + + +def pytest_configure(): + import django + from django.apps import apps + + if not apps.ready: + django.setup() + + +@pytest.fixture(autouse=True) +def isolate_test_runtime(tmp_path, monkeypatch): + """ + Run each pytest test from an isolated temp cwd and restore env mutations. + + The maintained pytest suite lives under ``archivebox/tests``. Many of those + CLI tests shell out without passing ``cwd=`` explicitly, so the safest + contract is that every test starts in its own temp directory and any + in-process ``os.environ`` edits are rolled back afterwards. + + ArchiveBox derives DATA_DIR from cwd, so subprocess helpers pass the target + collection as cwd instead of using DATA_DIR as an override. 
+ """ + _assert_not_repo_path(tmp_path, label="tmp_path") + original_cwd = Path.cwd() + original_env = os.environ.copy() + original_chdir = os.chdir + original_popen = subprocess.Popen + os.chdir(tmp_path) + _sync_archivebox_test_data_dir(tmp_path) + os.environ.pop("DATA_DIR", None) + + def reset_machine_model_caches() -> None: + import archivebox.machine.models as machine_models + + machine_models._CURRENT_MACHINE = None + machine_models._CURRENT_INTERFACE = None + machine_models._CURRENT_PROCESS = None + machine_models._CURRENT_BINARIES.clear() + + def guarded_chdir(path: os.PathLike[str] | str) -> None: + _assert_not_repo_path(Path(path), label="cwd") + original_chdir(path) + _sync_archivebox_test_data_dir(Path(path)) + + def guarded_popen(*args: Any, **kwargs: Any): + cwd = kwargs.get("cwd") + env = kwargs.get("env") + if cwd is not None: + _assert_not_repo_path(Path(cwd), label="cwd") + _assert_safe_runtime_paths(cwd=Path(cwd) if cwd is not None else None, env=env) + return original_popen(*args, **kwargs) + + monkeypatch.setattr(os, "chdir", guarded_chdir) + monkeypatch.setattr(subprocess, "Popen", guarded_popen) + reset_machine_model_caches() + try: + _assert_safe_runtime_paths(cwd=Path.cwd(), env=os.environ) + yield + finally: + reset_machine_model_caches() + original_chdir(original_cwd) + _sync_archivebox_test_data_dir(original_cwd) + os.environ.clear() + os.environ.update(original_env) + + +def pytest_sessionfinish(session, exitstatus): + shutil.rmtree(SESSION_DATA_DIR, ignore_errors=True) + + +@pytest.fixture +def isolated_data_dir(tmp_path): + """ + Create isolated DATA_DIR for each test. + + Uses tmp_path for complete isolation. 
+ """ + data_dir = tmp_path / "archivebox_data" + data_dir.mkdir() + return data_dir + + +@pytest.fixture +def hermetic_lib_dir(tmp_path, monkeypatch): + """ + Point LIB_DIR at a tmp directory so the test can write fake binaries + without touching the real ``~/Library/Application Support/abx/lib`` (which + can contain symlinks to SIP-protected system binaries on macOS). + + Opt-in only: most tests should reuse the cached real LIB_DIR for speed โ€” + rebuilding from scratch per-test adds ~10ร— overhead. Use this only when + the test synthesizes binary paths or validates LIB_DIR-relative behavior. + """ + import archivebox.machine.models as machine_models + + lib_dir = tmp_path / "lib" + (lib_dir / "bin").mkdir(parents=True, exist_ok=True) + monkeypatch.setenv("LIB_DIR", str(lib_dir)) + monkeypatch.setenv("ABXPKG_LIB_DIR", str(lib_dir)) + machine_models._CURRENT_MACHINE = None + machine_models._CURRENT_PROCESS = None + return lib_dir + + +@pytest.fixture +def initialized_archive(tmp_path): + """ + Initialize ArchiveBox archive in isolated directory. + + Runs `archivebox init` via subprocess to set up database and directories. + """ + _cmd_result = run_archivebox_cmd( + ["init", "--quick"], + cwd=tmp_path, + timeout=60, + default_cli_env=True, + disable_extractors=True, + ) + stderr, returncode = _cmd_result.stderr, _cmd_result.returncode + assert returncode == 0, f"archivebox init failed: {stderr}" + return tmp_path + + +@pytest.fixture +def recursive_test_site(): + pages = { + "/": """ + + + Root + + + + About + Blog + Contact + + + """.strip().encode("utf-8"), + "/about": """ + + + Deep About + + + """.strip().encode("utf-8"), + "/blog": """ + + + Deep Blog + + + """.strip().encode("utf-8"), + "/contact": """ + + + Deep Contact + + + """.strip().encode("utf-8"), + "/deep/about": b"

        Deep About

        ", + "/deep/blog": b"

        Deep Blog

        ", + "/deep/contact": b"

        Deep Contact

        ", + "/favicon.ico": b"test-icon", + } + + class RecursiveHandler(BaseHTTPRequestHandler): + def do_GET(self): + body = pages.get(self.path) + if body is None: + self.send_response(404) + self.end_headers() + return + + self.send_response(200) + if self.path.endswith(".ico"): + self.send_header("Content-Type", "image/x-icon") + else: + self.send_header("Content-Type", "text/html; charset=utf-8") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + def log_message(self, format, *args): + return + + server = ThreadingHTTPServer(("127.0.0.1", 0), RecursiveHandler) + thread = Thread(target=server.serve_forever, daemon=True) + thread.start() + try: + base_url = f"http://127.0.0.1:{server.server_address[1]}" + yield { + "base_url": base_url, + "root_url": f"{base_url}/", + "child_urls": [f"{base_url}/about", f"{base_url}/blog", f"{base_url}/contact"], + "deep_urls": [f"{base_url}/deep/about", f"{base_url}/deep/blog", f"{base_url}/deep/contact"], + } + finally: + server.shutdown() + server.server_close() + thread.join(timeout=5) + + +@pytest.fixture +def archivebox_daemon_server(initialized_archive, free_tcp_port_factory): + """ + Start a real daemonized ArchiveBox server in this test's DATA_DIR and + always stop its supervisord before the test exits. 
+ """ + started: list[tuple[Path, dict[str, str]]] = [] + + def start(**env_overrides: str): + env_config = { + "SEARCH_BACKEND_SONIC_HOST_NAME": "127.0.0.1", + "SEARCH_BACKEND_SONIC_PORT": str(free_tcp_port_factory()), + **{key: str(value) for key, value in env_overrides.items()}, + } + env = cli_env( + live=True, + **env_config, + ) + port = free_tcp_port_factory() + result = run_archivebox_cmd( + ["server", "--daemonize", f"127.0.0.1:{port}"], + cwd=initialized_archive, + env=env, + timeout=90, + ) + assert result.returncode == 0, result.stderr or result.stdout + started.append((initialized_archive, env)) + return SimpleNamespace( + data_dir=initialized_archive, + env=env, + port=port, + worker_state=lambda: _archivebox_worker_state(initialized_archive, env), + wait_for_workers=lambda names, timeout=45: _wait_for_archivebox_workers(initialized_archive, env, names, timeout=timeout), + ) + + try: + yield start + finally: + for cwd, env in reversed(started): + _stop_archivebox_supervisord(cwd, env) + + +def wait_for_process(predicate: Callable[[psutil.Process, str], bool], *, timeout: float = 20.0) -> psutil.Process: + deadline = time.time() + timeout + last_seen: list[str] = [] + while time.time() < deadline: + last_seen = [] + for proc in psutil.process_iter(["pid", "ppid", "cmdline"]): + try: + cmdline = proc.info.get("cmdline") or [] + command = " ".join(cmdline) + last_seen.append(f"{proc.info.get('pid')} {proc.info.get('ppid')} {command}") + if predicate(proc, command): + return proc + except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess): + continue + time.sleep(0.2) + raise AssertionError("No matching live process found. 
Last seen:\n" + "\n".join(last_seen[-50:])) + + +def pid_is_alive(pid: int) -> bool: + try: + os.kill(pid, 0) + except ProcessLookupError: + return False + except PermissionError: + return True + return True + + +def wait_for_pid_to_disappear(pid: int, *, timeout: float = 20.0) -> None: + deadline = time.time() + timeout + while time.time() < deadline: + if not pid_is_alive(pid): + return + time.sleep(0.1) + raise AssertionError(f"PID {pid} is still running") + + +def cleanup_process_group(group_pid: int | None, *child_pids: int | None) -> None: + if group_pid and pid_is_alive(group_pid): + try: + os.killpg(group_pid, signal.SIGKILL) + except ProcessLookupError: + pass + except OSError: + try: + os.kill(group_pid, signal.SIGKILL) + except ProcessLookupError: + pass + for pid in child_pids: + if pid and pid_is_alive(pid): + try: + os.kill(pid, signal.SIGKILL) + except ProcessLookupError: + pass + + +def cli_env( + *, + port: int | None = None, + plugins_root: Path | None = None, + replace: bool = False, + disable_extractors: bool = False, + live: bool = False, + server: bool = False, + wget: bool = False, + **extra: str, +) -> dict[str, str]: + env = {} if replace else os.environ.copy() + _set_test_source_pythonpath(env) + env.update({"USE_COLOR": "False", "SHOW_PROGRESS": "False"}) + + if disable_extractors or live or server: + env.update( + { + "PLUGINS": "__archivebox_test_no_plugins__", + "SAVE_ARCHIVEDOTORG": "False", + "SAVE_TITLE": "False", + "SAVE_FAVICON": "False", + "SAVE_WGET": "False", + "SAVE_WARC": "False", + "SAVE_PDF": "False", + "SAVE_SCREENSHOT": "False", + "SAVE_DOM": "False", + "SAVE_SINGLEFILE": "False", + "SAVE_READABILITY": "False", + "SAVE_MERCURY": "False", + "SAVE_GIT": "False", + "SAVE_YTDLP": "False", + "SAVE_HEADERS": "False", + "SAVE_HTMLTOTEXT": "False", + }, + ) + + if live: + env.update( + { + "TIMEOUT": "60", + "WGET_TIMEOUT": "45", + "CRAWL_MAX_CONCURRENT_SNAPSHOTS": "1", + "PARSE_HTML_URLS_ENABLED": "True", + 
"PARSE_DOM_OUTLINKS_ENABLED": "False", + "SEARCH_BACKEND_ENGINE": "sqlite", + }, + ) + + if server: + assert port is not None, "port is required when server=True" + env.update( + { + "PLUGINS": "wget", + "BIND_ADDR": f"127.0.0.1:{port}", + "BASE_URL": f"http://archivebox.localhost:{port}", + "ALLOWED_HOSTS": "*", + "PUBLIC_ADD_VIEW": "True", + "TIMEOUT": "30", + "URL_ALLOWLIST": r"127\.0\.0\.1[:/].*|example\.com", + "SAVE_WGET": "True", + "USE_CHROME": "False", + }, + ) + + if wget: + env.update({"PLUGINS": "wget", "SAVE_WGET": "True"}) + + if plugins_root is not None: + env["ABX_PLUGINS_DIR"] = str(plugins_root) + + env.update(extra) + return env + + +def wait_for_port_open(host: str, port: int, *, timeout: float = 30.0) -> None: + deadline = time.time() + timeout + while time.time() < deadline: + try: + with socket.create_connection((host, port), timeout=0.25): + return + except OSError: + time.sleep(0.1) + raise AssertionError(f"server did not listen on {host}:{port}") + + +def wait_for_log(log_path: Path, text: str, *, timeout: float = 30.0) -> str: + deadline = time.time() + timeout + content = "" + while time.time() < deadline: + if log_path.exists(): + content = log_path.read_text(encoding="utf-8", errors="replace") + if text in content: + return content + time.sleep(0.1) + raise AssertionError(f"timed out waiting for {text!r} in {log_path}:\n{content}") + + +def wait_for_log_count(log_path: Path, text: str, count: int, *, timeout: float = 30.0) -> str: + deadline = time.time() + timeout + content = "" + while time.time() < deadline: + if log_path.exists(): + content = log_path.read_text(encoding="utf-8", errors="replace") + if content.count(text) >= count: + return content + time.sleep(0.1) + raise AssertionError(f"timed out waiting for {count} occurrences of {text!r} in {log_path}:\n{content}") + + +def wait_for_log_pattern(log_path: Path, pattern: str, *, timeout: float = 30.0) -> re.Match[str]: + deadline = time.time() + timeout + content = "" + while 
time.time() < deadline: + if log_path.exists(): + content = log_path.read_text(encoding="utf-8", errors="replace") + match = re.search(pattern, content) + if match: + return match + time.sleep(0.1) + raise AssertionError(f"timed out waiting for pattern {pattern!r} in {log_path}:\n{content}") + + +def supervisor_pid_from_log(log_path: Path) -> int: + content = log_path.read_text(encoding="utf-8", errors="replace") + matches = re.findall(r"Supervisord connected \(pid=(\d+)\)", content) + assert matches, content + return int(matches[-1]) + + +def worker_pid_from_log(log_path: Path, worker_name: str) -> int: + content = log_path.read_text(encoding="utf-8", errors="replace") + matches = re.findall(rf"Worker {re.escape(worker_name)}: started RUNNING \(pid (\d+),", content) + assert matches, content + return int(matches[-1]) + + +def wait_for_worker_pid_from_log(log_path: Path, worker_name: str, *, timeout: float = 45.0) -> int: + deadline = time.time() + timeout + last_error = "" + while time.time() < deadline: + try: + return worker_pid_from_log(log_path, worker_name) + except AssertionError as err: + last_error = str(err) + time.sleep(0.1) + raise AssertionError(last_error or f"timed out waiting for worker {worker_name!r} in {log_path}") + + +def pgrep_data_dir(data_dir: Path) -> list[str]: + result = subprocess.run(["pgrep", "-af", str(data_dir)], capture_output=True, text=True, timeout=5) + lines = [line for line in result.stdout.splitlines() if "pgrep -af" not in line] + + for runtime_root in (Path("/tmp/archivebox"), data_dir / "tmp"): + for config_path in runtime_root.glob("*/supervisord.conf"): + try: + config_text = config_path.read_text(encoding="utf-8", errors="replace") + except OSError: + continue + if str(data_dir) not in config_text: + continue + pid_path = config_path.with_name("supervisord.pid") + try: + pid = int(pid_path.read_text(encoding="utf-8").strip()) + except (OSError, ValueError): + continue + if not pid_is_alive(pid): + continue + ps_line = 
subprocess.run( + ["ps", "-p", str(pid), "-o", "pid=,ppid=,command="], + capture_output=True, + text=True, + timeout=5, + ).stdout.strip() + if ps_line: + lines.append(ps_line) + + return sorted(set(lines)) + + +def assert_no_processes_for_data_dir(data_dir: Path, *, timeout: float = 10.0) -> None: + deadline = time.time() + timeout + remaining: list[str] = [] + while time.time() < deadline: + remaining = pgrep_data_dir(data_dir) + if not remaining: + return + time.sleep(0.25) + raise AssertionError("processes still reference test DATA_DIR:\n" + "\n".join(remaining)) + + +def kill_processes_for_data_dir(data_dir: Path) -> None: + for line in pgrep_data_dir(data_dir): + try: + pid = int(line.split(None, 1)[0]) + except (IndexError, ValueError): + continue + if pid != os.getpid(): + try: + os.kill(pid, signal.SIGKILL) + except ProcessLookupError: + pass + + +def start_archivebox_server( + cwd: Path, + *, + port: int, + env: dict[str, str] | None = None, + daemonize: bool | None = None, + log_name: str | None = None, + wait_for_log_text: str | None = "Tailing worker logs", +): + if daemonize is None: + daemonize = log_name is None + + args = ["server", f"127.0.0.1:{port}"] + if daemonize: + args.insert(1, "--daemonize") + + log_path = cwd / log_name if log_name else None + log = log_path.open("w", encoding="utf-8") if log_path else None + proc = run_archivebox_cmd( + args, + cwd=cwd, + env=env or cli_env(live=True), + stdout=log if log else None, + stderr=subprocess.STDOUT if log else None, + text=daemonize, + start_new_session=not daemonize, + wait=daemonize, + ) + if log is not None: + log.close() + proc.log_path = log_path + if daemonize: + assert proc.returncode == 0, proc.stderr or proc.stdout + return proc + wait_for_port_open("127.0.0.1", port) + if log_path is not None and wait_for_log_text is not None: + wait_for_log(log_path, wait_for_log_text, timeout=30.0) + return proc + + +def stop_archivebox_process(proc: subprocess.Popen[str], sig=signal.SIGTERM, *, 
timeout: float = 15.0) -> str: + if proc.poll() is None: + try: + os.killpg(proc.pid, sig) + except (ProcessLookupError, OSError): + try: + os.kill(proc.pid, sig) + except ProcessLookupError: + pass + try: + stdout, _stderr = proc.communicate(timeout=timeout) + return stdout or "" + except subprocess.TimeoutExpired: + try: + os.killpg(proc.pid, signal.SIGKILL) + except (ProcessLookupError, OSError): + try: + os.kill(proc.pid, signal.SIGKILL) + except ProcessLookupError: + pass + stdout, _stderr = proc.communicate(timeout=5) + return stdout or "" + + +def run_queued_crawls(cwd: Path, env: dict[str, str] | None = None, timeout: int = 180) -> None: + script = """ +import json +from archivebox.crawls.models import Crawl +print(json.dumps([str(crawl_id) for crawl_id in Crawl.objects.order_by("created_at").values_list("id", flat=True)])) +""" + _cmd_result = run_archivebox_cmd(["manage", "shell", "-c", script], cwd=cwd, timeout=60, env=env) + stdout, stderr, returncode = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + assert returncode == 0, stderr or stdout + crawl_ids = json.loads(stdout.strip().splitlines()[-1]) + for crawl_id in crawl_ids: + _cmd_result = run_archivebox_cmd(["run", f"--crawl-id={crawl_id}"], cwd=cwd, timeout=timeout, env=env) + stdout, stderr, returncode = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + assert returncode == 0, f"archivebox run --crawl-id={crawl_id} failed:\nSTDOUT:\n{stdout}\nSTDERR:\n{stderr}" + + +def _run_archivebox_manage_shell(cwd: Path, env: dict[str, str], script: str, timeout: int = 60) -> str: + result = run_archivebox_cmd( + ["manage", "shell", "-c", script], + cwd=cwd, + env=env, + timeout=timeout, + ) + assert result.returncode == 0, result.stderr or result.stdout + return result.stdout + + +def _archivebox_worker_state(cwd: Path, env: dict[str, str]) -> dict[str, Any]: + stdout = _run_archivebox_manage_shell( + cwd, + env, + """ +import json +from archivebox.workers.supervisord_util 
import get_existing_supervisord_process, get_worker +supervisor = get_existing_supervisord_process(quiet=True) +workers = {} +if supervisor: + for name in ("worker_daphne", "worker_sonic", "worker_runner"): + workers[name] = get_worker(supervisor, name) +print(json.dumps(workers, default=str)) +""", + ) + return json.loads(stdout.strip().splitlines()[-1]) + + +def _stop_archivebox_supervisord(cwd: Path, env: dict[str, str]) -> None: + _run_archivebox_manage_shell( + cwd, + env, + "from archivebox.workers.supervisord_util import stop_existing_supervisord_process; stop_existing_supervisord_process()", + timeout=30, + ) + + +def _wait_for_archivebox_workers(cwd: Path, env: dict[str, str], names: tuple[str, ...] | list[str], timeout: int = 45) -> dict[str, Any]: + deadline = time.time() + timeout + state: dict[str, Any] = {} + while time.time() < deadline: + state = _archivebox_worker_state(cwd, env) + if all(state.get(name, {}).get("statename") == "RUNNING" for name in names): + return state + time.sleep(1) + return state + + +def stop_process(proc: subprocess.Popen[str]) -> tuple[str, str]: + if proc.poll() is None: + proc.terminate() + try: + return proc.communicate(timeout=5) + except subprocess.TimeoutExpired: + proc.kill() + return proc.communicate() + + +def run_python_cwd( + script: str, + cwd: Path, + timeout: int = 60, +) -> tuple[str, str, int]: + _assert_not_repo_path(cwd, label="cwd") + base_env = os.environ.copy() + _assert_safe_runtime_paths(cwd=cwd, env=base_env) + result = subprocess.run( + [sys.executable, "-"], + input=script, + capture_output=True, + text=True, + cwd=cwd, + env=base_env, + timeout=timeout, + ) + return result.stdout, result.stderr, result.returncode + + +# ============================================================================= +# Server/API Integration Helpers +# ============================================================================= + +API_TEST_HOST = "api.archivebox.localhost:8000" +ADMIN_TEST_HOST = 
"admin.archivebox.localhost:8000" +WEB_TEST_HOST = "web.archivebox.localhost:8000" + + +@pytest.fixture +def admin_user(request): + from django.contrib.auth import get_user_model + + username = f"admin_{abs(hash(request.node.nodeid))}" + return get_user_model().objects.create_superuser( + username=username, + email=f"{username}@example.com", + password="testpassword", + ) + + +@pytest.fixture +def admin_client(client, admin_user): + client.force_login(admin_user) + return client + + +@pytest.fixture +def crawl(admin_user, db): + from archivebox.crawls.models import Crawl + + return Crawl.objects.create( + urls="https://example.com\nhttps://example.org", + tags_str="alpha,beta", + created_by=admin_user, + ) + + +@pytest.fixture +def snapshot(crawl, db): + from archivebox.core.models import Snapshot + + return Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.STARTED, + ) + + +@pytest.fixture +def tagged_data(crawl, admin_user): + from archivebox.core.models import Snapshot, Tag + + tag = Tag.objects.create(name="Alpha Research", created_by=admin_user) + first = Snapshot.objects.create( + url="https://example.com/one", + title="Example One", + crawl=crawl, + ) + second = Snapshot.objects.create( + url="https://example.com/two", + title="Example Two", + crawl=crawl, + ) + first.tags.add(tag) + second.tags.add(tag) + return tag, [first, second] + + +@pytest.fixture +def api_admin_user(request): + from django.contrib.auth import get_user_model + + username = f"apiadmin_{abs(hash(request.node.nodeid))}" + return get_user_model().objects.create_superuser( + username=username, + email=f"{username}@example.com", + password="testpass123", + ) + + +@pytest.fixture +def api_token(api_admin_user): + from archivebox.api.auth import get_or_create_api_token + + token = get_or_create_api_token(api_admin_user) + assert token is not None + return token + + +@pytest.fixture +def api_headers(api_token) -> dict[str, str]: + return 
api_auth_headers(api_token.token, django_client=True) + + +def api_auth_headers(api_token: str, *, django_client: bool = False, port: int | None = None) -> dict[str, str]: + host = f"api.archivebox.localhost:{port}" if port is not None else API_TEST_HOST + if django_client: + return { + "HTTP_HOST": host, + "HTTP_X_ARCHIVEBOX_API_KEY": api_token, + } + return { + "Host": host, + "X-ArchiveBox-API-Key": api_token, + } + + +def wait_for_live_api(port: int, *, path: str = "/api/v1/docs"): + return wait_for_http(port, host=f"api.archivebox.localhost:{port}", path=path) + + +def live_api_request(port: int, method: str, path: str, *, api_token: str, timeout: int = 30, **kwargs): + return requests.request( + method, + f"http://127.0.0.1:{port}{path}", + headers=api_auth_headers(api_token, port=port), + timeout=timeout, + **kwargs, + ) + + +def api_client_request( + client, + method: str, + path: str, + *, + payload: dict[str, Any] | None = None, + api_token: str | None = None, + headers: dict[str, str] | None = None, + **kwargs, +): + request_kwargs = dict(kwargs) + if payload is not None: + request_kwargs["data"] = json.dumps(payload) + request_kwargs["content_type"] = "application/json" + if headers is None: + assert api_token is not None + headers = api_auth_headers(api_token, django_client=True) + request_kwargs.update(headers) + return getattr(client, method.lower())(path, **request_kwargs) + + +def init_archive(cwd: Path) -> None: + result = run_archivebox_cmd( + ["init", "--quick"], + cwd=cwd, + timeout=60, + ) + assert result.returncode == 0, result.stderr + + +def get_free_port() -> int: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.bind(("127.0.0.1", 0)) + return sock.getsockname()[1] + + +def stop_server(cwd: Path) -> None: + script = textwrap.dedent( + """ + import os + os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings') + import django + django.setup() + from archivebox.workers.supervisord_util import 
stop_existing_supervisord_process + stop_existing_supervisord_process() + print('stopped') + """, + ) + run_python_cwd(script, cwd=cwd, timeout=30) + + +def wait_for_http( + port: int, + host: str, + path: str = "/", + timeout: float = 30.0, + process: subprocess.Popen[str] | None = None, +) -> requests.Response: + deadline = time.time() + timeout + last_exc = None + while time.time() < deadline: + if process is not None and process.poll() is not None: + raise AssertionError(f"Server exited before becoming ready with code {process.returncode}") + try: + response = requests.get( + f"http://127.0.0.1:{port}{path}", + headers={"Host": host}, + timeout=2, + allow_redirects=False, + ) + if response.status_code < 500: + return response + last_exc = f"HTTP {response.status_code}" + except requests.RequestException as exc: + last_exc = exc + time.sleep(0.5) + raise AssertionError(f"Timed out waiting for HTTP on {host}: {last_exc}") + + +def make_latest_schedule_due(cwd: Path) -> None: + from archivebox.crawls.models import Crawl, CrawlSchedule + from archivebox.tests.test_orm_helpers import use_archivebox_db + from django.utils import timezone + + with use_archivebox_db(cwd): + schedule = CrawlSchedule.objects.order_by("-created_at").select_related("template").first() + assert schedule is not None + Crawl.objects.filter(pk=schedule.template_id).update( + created_at=timezone.now() - timedelta(days=2), + modified_at=timezone.now() - timedelta(days=2), + ) + + +def get_snapshot_file_text(cwd: Path, url: str) -> str: + script = textwrap.dedent( + f""" + import os + from pathlib import Path + + os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings') + import django + django.setup() + + from archivebox.core.models import Snapshot + + snapshot = Snapshot.objects.filter(url={url!r}).order_by('-created_at').first() + assert snapshot is not None, 'missing snapshot' + assert snapshot.status == 'sealed', snapshot.status + + snapshot_dir = Path(snapshot.output_dir) + 
candidates = [] + preferred_patterns = ( + 'wget/**/index.html', + 'wget/**/*.html', + 'trafilatura/content.html', + 'trafilatura/content.txt', + 'defuddle/content.html', + 'defuddle/content.txt', + ) + for pattern in preferred_patterns: + for candidate in snapshot_dir.glob(pattern): + if candidate.is_file(): + candidates.append(candidate) + + if not candidates: + for candidate in snapshot_dir.rglob('*'): + if not candidate.is_file(): + continue + rel = candidate.relative_to(snapshot_dir) + if rel.parts and rel.parts[0] == 'responses': + continue + if len(rel.parts) == 1 and rel.name == 'index.html': + continue + if candidate.suffix not in ('.html', '.htm', '.txt'): + continue + if candidate.name in ('stdout.log', 'stderr.log'): + continue + candidates.append(candidate) + + assert candidates, f'no captured html/txt files found in {{snapshot_dir}}' + print(candidates[0].read_text(errors='ignore')) + """, + ) + stdout, stderr, code = run_python_cwd(script, cwd=cwd, timeout=60) + assert code == 0, stderr + return stdout + + +def wait_for_snapshot_capture(cwd: Path, url: str, timeout: int = 180) -> str: + deadline = time.time() + timeout + last_error = None + while time.time() < deadline: + try: + return get_snapshot_file_text(cwd, url) + except AssertionError as err: + last_error = err + time.sleep(2) + raise AssertionError(f"timed out waiting for captured content for {url}: {last_error}") + + +def get_counts(cwd: Path, scheduled_url: str, one_shot_url: str) -> tuple[int, int, int]: + from archivebox.core.models import Snapshot + from archivebox.crawls.models import Crawl + from archivebox.tests.test_orm_helpers import use_archivebox_db + + with use_archivebox_db(cwd): + scheduled_snapshots = Snapshot.objects.filter(url=scheduled_url).count() + one_shot_snapshots = Snapshot.objects.filter(url=one_shot_url).count() + scheduled_crawls = Crawl.objects.filter(schedule__isnull=False, urls=scheduled_url).count() + return scheduled_snapshots, one_shot_snapshots, 
scheduled_crawls + + +def get_depth_counts(cwd: Path) -> dict[int, int]: + from archivebox.core.models import Snapshot + from archivebox.tests.test_orm_helpers import use_archivebox_db + + with use_archivebox_db(cwd): + return {depth: Snapshot.objects.filter(depth=depth).count() for depth in set(Snapshot.objects.values_list("depth", flat=True))} + + +def get_crawl_runtime_state(cwd: Path, crawl_id: str) -> dict[str, object]: + from archivebox.core.models import ArchiveResult + from archivebox.crawls.models import Crawl + from archivebox.tests.test_orm_helpers import use_archivebox_db + from archivebox.workers.models import RETRY_AT_MAX + + with use_archivebox_db(cwd): + crawl = Crawl.objects.get(id=crawl_id) + snapshots = list( + crawl.snapshot_set.order_by("created_at").values( + "id", + "url", + "status", + "retry_at", + ), + ) + results = list( + ArchiveResult.objects.filter(snapshot__crawl=crawl) + .order_by("snapshot_id", "plugin", "hook_name") + .values( + "snapshot_id", + "plugin", + "hook_name", + "status", + "retry_at", + "output_files", + "output_size", + ), + ) + + return { + "retry_at_max": RETRY_AT_MAX, + "crawl_status": crawl.status, + "crawl_retry_at": crawl.retry_at, + "snapshots": snapshots, + "results": results, + } + + +def create_admin_and_token(cwd: Path) -> str: + script = textwrap.dedent( + """ + import os + from datetime import timedelta + from django.utils import timezone + + os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings') + import django + django.setup() + + from django.contrib.auth import get_user_model + from archivebox.api.models import APIToken + + User = get_user_model() + user, _ = User.objects.get_or_create( + username='apitestadmin', + defaults={ + 'email': 'apitestadmin@example.com', + 'is_staff': True, + 'is_superuser': True, + }, + ) + user.is_staff = True + user.is_superuser = True + user.set_password('testpass123') + user.save() + + token = APIToken.objects.create( + created_by=user, + 
expires=timezone.now() + timedelta(days=1), + ) + print(token.token) + """, + ) + stdout, stderr, code = run_python_cwd(script, cwd=cwd, timeout=60) + assert code == 0, stderr + return stdout.strip().splitlines()[-1] + + +def wait_for_archive_outputs( + cwd: Path, + url: str, + timeout: int = 120, + interval: float = 1.0, +) -> bool: + script = textwrap.dedent( + f"""\ + from pathlib import Path + + import os + os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings') + import django + django.setup() + + from archivebox.core.models import Snapshot + + snapshot = Snapshot.objects.filter(url={url!r}).order_by('-created_at').first() + if snapshot is None or snapshot.status != 'sealed': + raise SystemExit(1) + + output_rel = None + for output in snapshot.discover_outputs(): + candidate = output.get('path') + if not candidate or candidate.startswith('responses/'): + continue + if Path(snapshot.output_dir, candidate).is_file(): + output_rel = candidate + break + if output_rel is None: + fallback = Path(snapshot.output_dir, 'index.jsonl') + if fallback.exists(): + output_rel = 'index.jsonl' + if output_rel is None: + snapshot_dir = Path(snapshot.output_dir) + for candidate in snapshot_dir.rglob('*'): + if not candidate.is_file(): + continue + rel_path = candidate.relative_to(snapshot_dir) + if rel_path.parts and rel_path.parts[0] == 'responses': + continue + if rel_path.name in {"stdout.log", "stderr.log"}: + continue + output_rel = str(rel_path) + break + if output_rel is None: + raise SystemExit(1) + + responses_root = Path(snapshot.output_dir) / 'responses' + if not responses_root.exists(): + raise SystemExit(1) + if not any(candidate.is_file() and snapshot.domain in candidate.relative_to(responses_root).parts for candidate in responses_root.rglob('*')): + raise SystemExit(1) + + print('READY') + """, + ) + + deadline = time.time() + timeout + while time.time() < deadline: + stdout, _stderr, returncode = run_python_cwd(script, cwd=cwd, timeout=30) + 
if returncode == 0 and "READY" in stdout: + return True + time.sleep(interval) + return False + + +def _get_machine_type() -> str: + import platform + + os_name = platform.system().lower() + arch = platform.machine().lower() + in_docker = os.environ.get("IN_DOCKER", "").lower() in ("1", "true", "yes") + suffix = "-docker" if in_docker else "" + return f"{arch}-{os_name}{suffix}" + + +def _find_cached_chrome(lib_dir: Path) -> Path | None: + candidates = [ + lib_dir / "puppeteer" / "chromium", + lib_dir / "puppeteer", + lib_dir / "ms-playwright", + lib_dir / "pnpm" / "packages" / "chrome" / "node_modules" / "puppeteer" / ".local-chromium", + ] + for base in candidates: + if not base.exists(): + continue + for path in base.rglob("Chromium.app/Contents/MacOS/Chromium"): + return path + for path in base.rglob("chrome-linux/chrome"): + return path + for path in base.rglob("chrome-linux64/chrome"): + return path + return None + + +def _find_system_browser() -> Path | None: + candidates = [ + Path("/usr/bin/chromium"), + Path("/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary"), + Path("/Applications/Chromium.app/Contents/MacOS/Chromium"), + Path("/usr/bin/chromium-browser"), + ] + for candidate in candidates: + if candidate.exists(): + return candidate + return None + + +@pytest.fixture(scope="class") +def real_archive_with_example(tmp_path_factory, request): + """ + Initialize archive and add https://example.com using responses only. + Uses cwd for DATA_DIR. 
+ """ + tmp_path = tmp_path_factory.mktemp("archivebox_data") + if request.cls is not None: + request.cls.data_dir = tmp_path + + _cmd_result = run_archivebox_cmd( + ["init", "--quick"], + cwd=tmp_path, + timeout=120, + ) + stdout, stderr, returncode = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + assert returncode == 0, f"archivebox init failed: {stderr}" + + _cmd_result = run_archivebox_cmd( + [ + "config", + "--set", + "BIND_ADDR=127.0.0.1:8000", + "BASE_URL=http://archivebox.localhost:8000", + "PUBLIC_INDEX=True", + "PUBLIC_ADD_VIEW=True", + ], + cwd=tmp_path, + ) + stdout, stderr, returncode = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + assert returncode == 0, f"archivebox config failed: {stderr}" + + add_env = { + "RESPONSES_ENABLED": "True", + "SHOW_PROGRESS": "False", + "USE_COLOR": "False", + "RESPONSES_TIMEOUT": "30", + } + system_browser = _find_system_browser() + if system_browser: + add_env["CHROME_BINARY"] = str(system_browser) + _cmd_result = run_archivebox_cmd( + ["add", "--depth=0", "--plugins=responses", "https://example.com"], + cwd=tmp_path, + timeout=600, + env=add_env, + ) + stdout, stderr, returncode = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + assert returncode == 0, f"archivebox add failed:\nSTDOUT:\n{stdout}\nSTDERR:\n{stderr}" + + ready = wait_for_archive_outputs(tmp_path, "https://example.com", timeout=60) + assert ready, f"archivebox add did not produce required outputs within timeout:\nSTDOUT:\n{stdout}\nSTDERR:\n{stderr}" + + return tmp_path + + +# ============================================================================= +# Output Assertions +# ============================================================================= + + +def parse_jsonl_output(stdout: str) -> list[dict[str, Any]]: + """Parse JSONL output into list of dicts via Process parser.""" + from archivebox.machine.models import Process + + return Process.parse_records_from_text(stdout or "") + + +def 
stdout_lines(stdout: str) -> list[str]: + return [line for line in stdout.splitlines() if line.strip()] + + +def assert_jsonl_only(stdout: str) -> None: + lines = stdout_lines(stdout) + assert lines, "Expected stdout to contain JSONL records" + assert all(line.lstrip().startswith("{") for line in lines), stdout + + +def assert_jsonl_contains_type(stdout: str, record_type: str, min_count: int = 1): + """Assert output contains at least min_count records of type.""" + records = parse_jsonl_output(stdout) + matching = [r for r in records if r.get("type") == record_type] + assert len(matching) >= min_count, f"Expected >= {min_count} {record_type}, got {len(matching)}" + return matching + + +def assert_jsonl_pass_through(stdout: str, input_records: list[dict[str, Any]]): + """Assert that input records appear in output (pass-through behavior).""" + output_records = parse_jsonl_output(stdout) + output_ids = {r.get("id") for r in output_records if r.get("id")} + + for input_rec in input_records: + input_id = input_rec.get("id") + if input_id: + assert input_id in output_ids, f"Input record {input_id} not found in output (pass-through failed)" + + +def assert_record_has_fields(record: dict[str, Any], required_fields: list[str]): + """Assert record has all required fields with non-None values.""" + for field in required_fields: + assert field in record, f"Record missing field: {field}" + assert record[field] is not None, f"Record field is None: {field}" + + +# ============================================================================= +# Test Data Factories +# ============================================================================= + + +def create_test_url(domain: str = "example.com", path: str | None = None) -> str: + """Generate unique test URL.""" + path = path or secrets.token_hex(4) + return f"https://{domain}/{path}" + + +def create_test_crawl_json(urls: list[str] | None = None, **kwargs) -> dict[str, Any]: + """Create Crawl JSONL record for testing.""" + urls = 
urls or [create_test_url()] + return { + "type": "Crawl", + "urls": "\n".join(urls), + "max_depth": kwargs.get("max_depth", 0), + "tags_str": kwargs.get("tags_str", ""), + "status": kwargs.get("status", "queued"), + **{k: v for k, v in kwargs.items() if k not in ("max_depth", "tags_str", "status")}, + } + + +def create_test_snapshot_json(url: str | None = None, **kwargs) -> dict[str, Any]: + """Create Snapshot JSONL record for testing.""" + return { + "type": "Snapshot", + "url": url or create_test_url(), + "tags_str": kwargs.get("tags_str", ""), + "status": kwargs.get("status", "queued"), + **{k: v for k, v in kwargs.items() if k not in ("tags_str", "status")}, + } diff --git a/archivebox/tests/firefox_export.html b/archivebox/tests/firefox_export.html deleted file mode 100644 index 99d0bd0e2f..0000000000 --- a/archivebox/tests/firefox_export.html +++ /dev/null @@ -1,34 +0,0 @@ - - - -Bookmarks -

        Bookmarks Menu

        - -

        -

        Recently Bookmarked -
        Recent Tags -

        Mozilla Firefox

        -

        -

        Help and Tutorials -
        Customize Firefox -
        Get Involved -
        About Us -

        -

        [Folder Name]

        -

        -

        firefox export bookmarks at DuckDuckGo -
        archive firefox bookmarks at DuckDuckGo -
        nodiscc (nodiscc) ยท GitHub -
        pirate/ArchiveBox ยท Github -
        Phonotactic Reconstruction of Encrypted VoIP Conversations -
        Firefox Bookmarks Archiver - gHacks Tech News -

        -

        Bookmarks Toolbar

        -
        Add bookmarks to this folder to see them displayed on the Bookmarks Toolbar -

        -

        Most Visited -
        Getting Started -

        -

        diff --git a/archivebox/tests/migrations_helpers.py b/archivebox/tests/migrations_helpers.py new file mode 100644 index 0000000000..dc4a4d63e1 --- /dev/null +++ b/archivebox/tests/migrations_helpers.py @@ -0,0 +1,1223 @@ +#!/usr/bin/env python3 +""" +Helper functions and schema definitions for migration tests. + +This module provides: +- Schema definitions for each major ArchiveBox version (0.4.x, 0.7.x, 0.8.x) +- Data seeding functions to populate test databases +- Helper functions to run archivebox commands and verify results +""" + +import json +import sqlite3 +from pathlib import Path +from datetime import datetime, timezone + +from archivebox.tests.conftest import cli_env, run_archivebox_cmd +from archivebox.uuid_compat import uuid7 + + +# ============================================================================= +# Schema Definitions for Each Version +# ============================================================================= + +SCHEMA_0_4 = """ +-- Django system tables (minimal) +CREATE TABLE IF NOT EXISTS django_migrations ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + app VARCHAR(255) NOT NULL, + name VARCHAR(255) NOT NULL, + applied DATETIME NOT NULL +); + +-- Core tables for 0.4.x +CREATE TABLE IF NOT EXISTS core_snapshot ( + id CHAR(32) PRIMARY KEY, + url VARCHAR(2000) NOT NULL UNIQUE, + timestamp VARCHAR(32) NOT NULL UNIQUE, + title VARCHAR(128), + tags VARCHAR(256), + added DATETIME NOT NULL, + updated DATETIME +); +CREATE INDEX IF NOT EXISTS core_snapshot_url ON core_snapshot(url); +CREATE INDEX IF NOT EXISTS core_snapshot_timestamp ON core_snapshot(timestamp); +CREATE INDEX IF NOT EXISTS core_snapshot_added ON core_snapshot(added); +""" + +SCHEMA_0_7 = """ +-- Django system tables (complete for 0.7.x) +CREATE TABLE IF NOT EXISTS django_migrations ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + app VARCHAR(255) NOT NULL, + name VARCHAR(255) NOT NULL, + applied DATETIME NOT NULL +); + +CREATE TABLE IF NOT EXISTS django_content_type ( + id 
INTEGER PRIMARY KEY AUTOINCREMENT, + app_label VARCHAR(100) NOT NULL, + model VARCHAR(100) NOT NULL, + UNIQUE(app_label, model) +); + +CREATE TABLE IF NOT EXISTS auth_permission ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + name VARCHAR(255) NOT NULL, + content_type_id INTEGER NOT NULL REFERENCES django_content_type(id), + codename VARCHAR(100) NOT NULL, + UNIQUE(content_type_id, codename) +); + +CREATE TABLE IF NOT EXISTS auth_group ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + name VARCHAR(150) NOT NULL UNIQUE +); + +CREATE TABLE IF NOT EXISTS auth_group_permissions ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + group_id INTEGER NOT NULL REFERENCES auth_group(id), + permission_id INTEGER NOT NULL REFERENCES auth_permission(id), + UNIQUE(group_id, permission_id) +); + +CREATE TABLE IF NOT EXISTS auth_user ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + password VARCHAR(128) NOT NULL, + last_login DATETIME, + is_superuser BOOL NOT NULL, + username VARCHAR(150) NOT NULL UNIQUE, + first_name VARCHAR(150) NOT NULL, + last_name VARCHAR(150) NOT NULL, + email VARCHAR(254) NOT NULL, + is_staff BOOL NOT NULL, + is_active BOOL NOT NULL, + date_joined DATETIME NOT NULL +); + +CREATE TABLE IF NOT EXISTS auth_user_groups ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + user_id INTEGER NOT NULL REFERENCES auth_user(id), + group_id INTEGER NOT NULL REFERENCES auth_group(id), + UNIQUE(user_id, group_id) +); + +CREATE TABLE IF NOT EXISTS auth_user_user_permissions ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + user_id INTEGER NOT NULL REFERENCES auth_user(id), + permission_id INTEGER NOT NULL REFERENCES auth_permission(id), + UNIQUE(user_id, permission_id) +); + +CREATE TABLE IF NOT EXISTS django_admin_log ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + action_time DATETIME NOT NULL, + object_id TEXT, + object_repr VARCHAR(200) NOT NULL, + action_flag SMALLINT UNSIGNED NOT NULL, + change_message TEXT NOT NULL, + content_type_id INTEGER REFERENCES django_content_type(id), + user_id INTEGER NOT NULL 
REFERENCES auth_user(id) +); + +CREATE TABLE IF NOT EXISTS django_session ( + session_key VARCHAR(40) NOT NULL PRIMARY KEY, + session_data TEXT NOT NULL, + expire_date DATETIME NOT NULL +); + +-- Core tables for 0.7.x +CREATE TABLE IF NOT EXISTS core_tag ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + name VARCHAR(100) NOT NULL UNIQUE, + slug VARCHAR(100) NOT NULL UNIQUE +); + +CREATE TABLE IF NOT EXISTS core_snapshot ( + id CHAR(32) PRIMARY KEY, + url VARCHAR(2000) NOT NULL UNIQUE, + timestamp VARCHAR(32) NOT NULL UNIQUE, + title VARCHAR(512), + added DATETIME NOT NULL, + updated DATETIME +); +CREATE INDEX IF NOT EXISTS core_snapshot_url ON core_snapshot(url); +CREATE INDEX IF NOT EXISTS core_snapshot_timestamp ON core_snapshot(timestamp); +CREATE INDEX IF NOT EXISTS core_snapshot_added ON core_snapshot(added); + +-- Many-to-many for snapshot tags +CREATE TABLE IF NOT EXISTS core_snapshot_tags ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + snapshot_id CHAR(32) NOT NULL REFERENCES core_snapshot(id), + tag_id INTEGER NOT NULL REFERENCES core_tag(id), + UNIQUE(snapshot_id, tag_id) +); + +CREATE TABLE IF NOT EXISTS core_archiveresult ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + snapshot_id CHAR(32) NOT NULL REFERENCES core_snapshot(id), + extractor VARCHAR(32) NOT NULL, + cmd TEXT, + pwd VARCHAR(256), + cmd_version VARCHAR(128), + output VARCHAR(1024), + start_ts DATETIME, + end_ts DATETIME, + status VARCHAR(16) NOT NULL +); +CREATE INDEX IF NOT EXISTS core_archiveresult_snapshot ON core_archiveresult(snapshot_id); +CREATE INDEX IF NOT EXISTS core_archiveresult_extractor ON core_archiveresult(extractor); + +-- Insert required content types +INSERT INTO django_content_type (app_label, model) VALUES +('contenttypes', 'contenttype'), +('auth', 'permission'), +('auth', 'group'), +('auth', 'user'), +('admin', 'logentry'), +('sessions', 'session'), +('core', 'snapshot'), +('core', 'archiveresult'), +('core', 'tag'); +""" + +SCHEMA_0_8 = """ +-- Django system tables (complete for 
0.8.x) +CREATE TABLE IF NOT EXISTS django_migrations ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + app VARCHAR(255) NOT NULL, + name VARCHAR(255) NOT NULL, + applied DATETIME NOT NULL +); + +CREATE TABLE IF NOT EXISTS django_content_type ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + app_label VARCHAR(100) NOT NULL, + model VARCHAR(100) NOT NULL, + UNIQUE(app_label, model) +); + +CREATE TABLE IF NOT EXISTS auth_permission ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + name VARCHAR(255) NOT NULL, + content_type_id INTEGER NOT NULL REFERENCES django_content_type(id), + codename VARCHAR(100) NOT NULL, + UNIQUE(content_type_id, codename) +); + +CREATE TABLE IF NOT EXISTS auth_group ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + name VARCHAR(150) NOT NULL UNIQUE +); + +CREATE TABLE IF NOT EXISTS auth_group_permissions ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + group_id INTEGER NOT NULL REFERENCES auth_group(id), + permission_id INTEGER NOT NULL REFERENCES auth_permission(id), + UNIQUE(group_id, permission_id) +); + +CREATE TABLE IF NOT EXISTS auth_user ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + password VARCHAR(128) NOT NULL, + last_login DATETIME, + is_superuser BOOL NOT NULL, + username VARCHAR(150) NOT NULL UNIQUE, + first_name VARCHAR(150) NOT NULL, + last_name VARCHAR(150) NOT NULL, + email VARCHAR(254) NOT NULL, + is_staff BOOL NOT NULL, + is_active BOOL NOT NULL, + date_joined DATETIME NOT NULL +); + +CREATE TABLE IF NOT EXISTS auth_user_groups ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + user_id INTEGER NOT NULL REFERENCES auth_user(id), + group_id INTEGER NOT NULL REFERENCES auth_group(id), + UNIQUE(user_id, group_id) +); + +CREATE TABLE IF NOT EXISTS auth_user_user_permissions ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + user_id INTEGER NOT NULL REFERENCES auth_user(id), + permission_id INTEGER NOT NULL REFERENCES auth_permission(id), + UNIQUE(user_id, permission_id) +); + +CREATE TABLE IF NOT EXISTS django_admin_log ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + 
action_time DATETIME NOT NULL, + object_id TEXT, + object_repr VARCHAR(200) NOT NULL, + action_flag SMALLINT UNSIGNED NOT NULL, + change_message TEXT NOT NULL, + content_type_id INTEGER REFERENCES django_content_type(id), + user_id INTEGER NOT NULL REFERENCES auth_user(id) +); + +CREATE TABLE IF NOT EXISTS django_session ( + session_key VARCHAR(40) NOT NULL PRIMARY KEY, + session_data TEXT NOT NULL, + expire_date DATETIME NOT NULL +); + +-- Machine app tables (added in 0.8.x) +CREATE TABLE IF NOT EXISTS machine_machine ( + id CHAR(36) PRIMARY KEY, + created_at DATETIME NOT NULL, + modified_at DATETIME, + guid VARCHAR(64) NOT NULL UNIQUE, + hostname VARCHAR(63), + hw_in_docker BOOLEAN NOT NULL DEFAULT 0, + hw_in_vm BOOLEAN NOT NULL DEFAULT 0, + hw_manufacturer VARCHAR(63), + hw_product VARCHAR(63), + hw_uuid VARCHAR(255), + os_arch VARCHAR(15), + os_family VARCHAR(15), + os_platform VARCHAR(63), + os_release VARCHAR(63), + os_kernel VARCHAR(255), + stats TEXT DEFAULT '{}', + config TEXT DEFAULT '{}', + num_uses_failed INTEGER NOT NULL DEFAULT 0, + num_uses_succeeded INTEGER NOT NULL DEFAULT 0 +); + +CREATE TABLE IF NOT EXISTS machine_networkinterface ( + id CHAR(36) PRIMARY KEY, + created_at DATETIME NOT NULL, + modified_at DATETIME, + machine_id CHAR(36) NOT NULL REFERENCES machine_machine(id), + mac_address VARCHAR(17), + ip_public VARCHAR(45), + ip_local VARCHAR(45), + dns_server VARCHAR(45), + hostname VARCHAR(63), + iface VARCHAR(15), + isp VARCHAR(63), + city VARCHAR(63), + region VARCHAR(63), + country VARCHAR(63), + num_uses_failed INTEGER NOT NULL DEFAULT 0, + num_uses_succeeded INTEGER NOT NULL DEFAULT 0 +); + +CREATE TABLE IF NOT EXISTS machine_dependency ( + id CHAR(36) PRIMARY KEY, + created_at DATETIME NOT NULL, + modified_at DATETIME, + bin_name VARCHAR(63) NOT NULL UNIQUE, + bin_providers VARCHAR(127) NOT NULL DEFAULT '*', + overrides TEXT DEFAULT '{}', + config TEXT DEFAULT '{}' +); + +CREATE TABLE IF NOT EXISTS machine_binary ( + id CHAR(36) 
PRIMARY KEY, + created_at DATETIME NOT NULL, + modified_at DATETIME, + machine_id CHAR(36) REFERENCES machine_machine(id), + dependency_id CHAR(36) REFERENCES machine_dependency(id), + name VARCHAR(63), + binprovider VARCHAR(31), + abspath VARCHAR(255), + version VARCHAR(32), + sha256 VARCHAR(64), + num_uses_failed INTEGER NOT NULL DEFAULT 0, + num_uses_succeeded INTEGER NOT NULL DEFAULT 0 +); + +-- API app tables (added in 0.8.x) +CREATE TABLE IF NOT EXISTS api_apitoken ( + id CHAR(36) PRIMARY KEY, + created_by_id INTEGER NOT NULL REFERENCES auth_user(id), + created_at DATETIME NOT NULL, + modified_at DATETIME, + token VARCHAR(32) NOT NULL UNIQUE, + expires DATETIME +); + +CREATE TABLE IF NOT EXISTS api_outboundwebhook ( + id CHAR(36) PRIMARY KEY, + created_by_id INTEGER NOT NULL REFERENCES auth_user(id), + created_at DATETIME NOT NULL, + modified_at DATETIME, + name VARCHAR(255) NOT NULL DEFAULT '', + signal VARCHAR(255) NOT NULL, + ref VARCHAR(255) NOT NULL, + endpoint VARCHAR(2083) NOT NULL, + headers TEXT DEFAULT '{}', + auth_token VARCHAR(4000) NOT NULL DEFAULT '', + enabled BOOLEAN NOT NULL DEFAULT 1, + keep_last_response BOOLEAN NOT NULL DEFAULT 0, + last_response TEXT NOT NULL DEFAULT '', + last_success DATETIME, + last_failure DATETIME, + num_uses_failed INTEGER NOT NULL DEFAULT 0, + num_uses_succeeded INTEGER NOT NULL DEFAULT 0 +); + +-- Core Tag table (AutoField PK in 0.8.x) +CREATE TABLE IF NOT EXISTS core_tag ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + name VARCHAR(100) NOT NULL UNIQUE, + slug VARCHAR(100) NOT NULL UNIQUE, + created_at DATETIME, + modified_at DATETIME, + created_by_id INTEGER REFERENCES auth_user(id) +); + +-- Crawls tables (new in 0.8.x) +CREATE TABLE IF NOT EXISTS crawls_crawlschedule ( + id CHAR(36) PRIMARY KEY, + created_at DATETIME NOT NULL, + created_by_id INTEGER NOT NULL REFERENCES auth_user(id), + modified_at DATETIME, + schedule VARCHAR(64) NOT NULL, + is_enabled BOOLEAN NOT NULL DEFAULT 1, + label VARCHAR(64) NOT NULL 
DEFAULT '', + notes TEXT NOT NULL DEFAULT '', + template_id CHAR(36) REFERENCES crawls_crawl(id), + num_uses_failed INTEGER NOT NULL DEFAULT 0, + num_uses_succeeded INTEGER NOT NULL DEFAULT 0 +); + +CREATE TABLE IF NOT EXISTS crawls_crawl ( + id CHAR(36) PRIMARY KEY, + created_at DATETIME NOT NULL, + created_by_id INTEGER NOT NULL REFERENCES auth_user(id), + modified_at DATETIME, + urls TEXT NOT NULL, + config TEXT DEFAULT '{}', + max_depth SMALLINT UNSIGNED NOT NULL DEFAULT 0, + tags_str VARCHAR(1024) NOT NULL DEFAULT '', + persona_id CHAR(36), + label VARCHAR(64) NOT NULL DEFAULT '', + notes TEXT NOT NULL DEFAULT '', + schedule_id CHAR(36), + output_dir VARCHAR(256) NOT NULL DEFAULT '', + status VARCHAR(16) NOT NULL DEFAULT 'queued', + retry_at DATETIME, + num_uses_failed INTEGER NOT NULL DEFAULT 0, + num_uses_succeeded INTEGER NOT NULL DEFAULT 0 +); + +-- Core Snapshot table (0.8.x with UUID PK, status, crawl FK) +CREATE TABLE IF NOT EXISTS core_snapshot ( + id CHAR(36) PRIMARY KEY, + created_by_id INTEGER NOT NULL REFERENCES auth_user(id), + created_at DATETIME NOT NULL, + modified_at DATETIME, + url VARCHAR(2000) NOT NULL, + timestamp VARCHAR(32) NOT NULL UNIQUE, + bookmarked_at DATETIME NOT NULL, + crawl_id CHAR(36) REFERENCES crawls_crawl(id), + title VARCHAR(512), + downloaded_at DATETIME, + depth SMALLINT UNSIGNED NOT NULL DEFAULT 0, + retry_at DATETIME, + status VARCHAR(16) NOT NULL DEFAULT 'queued', + config TEXT DEFAULT '{}', + notes TEXT NOT NULL DEFAULT '', + output_dir VARCHAR(256), + num_uses_failed INTEGER NOT NULL DEFAULT 0, + num_uses_succeeded INTEGER NOT NULL DEFAULT 0 +); +CREATE INDEX IF NOT EXISTS core_snapshot_url ON core_snapshot(url); +CREATE INDEX IF NOT EXISTS core_snapshot_timestamp ON core_snapshot(timestamp); +CREATE INDEX IF NOT EXISTS core_snapshot_created_at ON core_snapshot(created_at); + +-- Many-to-many for snapshot tags +CREATE TABLE IF NOT EXISTS core_snapshot_tags ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + snapshot_id 
def generate_uuid() -> str:
    """Generate a UUID string without dashes for SQLite.

    Returns:
        The ``.hex`` form of a freshly generated ``uuid7()`` value.
    """
    return uuid7().hex


def generate_timestamp() -> str:
    """Generate a timestamp string like ArchiveBox uses.

    Returns:
        The current UTC time formatted as ``YYYYmmddHHMMSS`` followed by the
        literal suffix ``.000000`` (the fractional part is always zero).
    """
    return datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S") + ".000000"


def seed_0_4_data(db_path: Path) -> dict[str, list]:
    """Seed a 0.4.x database with realistic test data.

    Inserts five rows into ``core_snapshot`` (tags stored as a comma-separated
    string column) and records ``core.0001_initial`` in ``django_migrations``.
    Both tables must already exist in the database at ``db_path``.

    Args:
        db_path: Path to the SQLite database file to populate.

    Returns:
        A dict with two keys: ``"snapshots"`` (one dict per inserted row with
        ``id``, ``url``, ``timestamp``, ``title`` and ``tags``) and
        ``"tags_str"`` (the raw comma-separated tag string of each snapshot).

    Raises:
        sqlite3.Error: If any statement fails. Nothing is committed in that
            case and the connection is still closed.
    """
    created_data: dict[str, list] = {
        "snapshots": [],
        "tags_str": [],
    }

    test_urls = [
        ("https://example.com/page1", "Example Page 1", "news,tech"),
        ("https://example.org/article", "Article Title", "blog,reading"),
        ("https://github.com/user/repo", "GitHub Repository", "code,github"),
        ("https://news.ycombinator.com/item?id=12345", "HN Discussion", "news,discussion"),
        ("https://en.wikipedia.org/wiki/Test", "Wikipedia Test", "reference,wiki"),
    ]

    conn = sqlite3.connect(str(db_path))
    try:
        cursor = conn.cursor()

        for i, (url, title, tags) in enumerate(test_urls):
            snapshot_id = generate_uuid()
            # One snapshot per day, 2024-01-01 .. 2024-01-05, all at 12:00:00.
            timestamp = f"2024010{i + 1}120000.000000"
            added = f"2024-01-0{i + 1} 12:00:00"

            cursor.execute(
                """
                INSERT INTO core_snapshot (id, url, timestamp, title, tags, added, updated)
                VALUES (?, ?, ?, ?, ?, ?, ?)
                """,
                (snapshot_id, url, timestamp, title, tags, added, added),
            )

            created_data["snapshots"].append(
                {
                    "id": snapshot_id,
                    "url": url,
                    "timestamp": timestamp,
                    "title": title,
                    "tags": tags,
                },
            )
            created_data["tags_str"].append(tags)

        cursor.execute("""
            INSERT INTO django_migrations (app, name, applied)
            VALUES ('core', '0001_initial', datetime('now'))
        """)

        conn.commit()
    finally:
        # Always release the file handle, even when an INSERT fails, so the
        # test database can be removed or reopened afterwards.
        conn.close()

    return created_data
def seed_0_7_data(db_path: Path) -> dict[str, list[dict]]:
    """Seed a 0.7.x database with realistic test data.

    Inserts one admin user, five tags, five snapshots (two tags and five
    archive results each) and marks the 0.7.x migrations (core up to 0022)
    as applied in ``django_migrations``. The tables must already exist.

    Args:
        db_path: Path to the SQLite database file to populate.

    Returns:
        A dict with the keys ``"users"``, ``"snapshots"``, ``"tags"`` and
        ``"archiveresults"``, each a list of dicts describing the inserted rows.

    Raises:
        sqlite3.Error: If any statement fails. Nothing is committed in that
            case and the connection is still closed.
    """
    created_data: dict[str, list[dict]] = {
        "users": [],
        "snapshots": [],
        "tags": [],
        "archiveresults": [],
    }

    tag_names = ["news", "tech", "blog", "reference", "code"]

    test_urls = [
        ("https://example.com/page1", "Example Page 1"),
        ("https://example.org/article", "Article Title"),
        ("https://github.com/user/repo", "GitHub Repository"),
        ("https://news.ycombinator.com/item?id=12345", "HN Discussion"),
        ("https://en.wikipedia.org/wiki/Test", "Wikipedia Test"),
    ]

    # Every snapshot gets the same five results, so define them once.
    extractors = ["title", "favicon", "screenshot", "singlefile", "wget"]
    statuses = ["succeeded", "succeeded", "failed", "succeeded", "skipped"]

    # Record migrations as applied (0.7.x migrations up to 0022)
    migrations = [
        ("contenttypes", "0001_initial"),
        ("contenttypes", "0002_remove_content_type_name"),
        ("auth", "0001_initial"),
        ("auth", "0002_alter_permission_name_max_length"),
        ("auth", "0003_alter_user_email_max_length"),
        ("auth", "0004_alter_user_username_opts"),
        ("auth", "0005_alter_user_last_login_null"),
        ("auth", "0006_require_contenttypes_0002"),
        ("auth", "0007_alter_validators_add_error_messages"),
        ("auth", "0008_alter_user_username_max_length"),
        ("auth", "0009_alter_user_last_name_max_length"),
        ("auth", "0010_alter_group_name_max_length"),
        ("auth", "0011_update_proxy_permissions"),
        ("auth", "0012_alter_user_first_name_max_length"),
        ("admin", "0001_initial"),
        ("admin", "0002_logentry_remove_auto_add"),
        ("admin", "0003_logentry_add_action_flag_choices"),
        ("sessions", "0001_initial"),
        ("core", "0001_initial"),
        ("core", "0002_auto_20200625_1521"),
        ("core", "0003_auto_20200630_1034"),
        ("core", "0004_auto_20200713_1552"),
        ("core", "0005_auto_20200728_0326"),
        ("core", "0006_auto_20201012_1520"),
        ("core", "0007_archiveresult"),
        ("core", "0008_auto_20210105_1421"),
        ("core", "0009_auto_20210216_1038"),
        ("core", "0010_auto_20210216_1055"),
        ("core", "0011_auto_20210216_1331"),
        ("core", "0012_auto_20210216_1425"),
        ("core", "0013_auto_20210218_0729"),
        ("core", "0014_auto_20210218_0729"),
        ("core", "0015_auto_20210218_0730"),
        ("core", "0016_auto_20210218_1204"),
        ("core", "0017_auto_20210219_0211"),
        ("core", "0018_auto_20210327_0952"),
        ("core", "0019_auto_20210401_0654"),
        ("core", "0020_auto_20210410_1031"),
        ("core", "0021_auto_20220914_0934"),
        ("core", "0022_auto_20231023_2008"),
    ]

    conn = sqlite3.connect(str(db_path))
    try:
        cursor = conn.cursor()

        # Create a user
        cursor.execute("""
            INSERT INTO auth_user (password, is_superuser, username, first_name, last_name,
                                   email, is_staff, is_active, date_joined)
            VALUES ('pbkdf2_sha256$test', 1, 'admin', 'Admin', 'User',
                    'admin@example.com', 1, 1, datetime('now'))
        """)
        user_id = cursor.lastrowid
        created_data["users"].append({"id": user_id, "username": "admin"})

        # Create 5 tags
        for name in tag_names:
            cursor.execute(
                """
                INSERT INTO core_tag (name, slug) VALUES (?, ?)
                """,
                (name, name.lower()),
            )
            tag_id = cursor.lastrowid
            created_data["tags"].append({"id": tag_id, "name": name, "slug": name.lower()})

        tags = created_data["tags"]

        # Create 5 snapshots
        for i, (url, title) in enumerate(test_urls):
            snapshot_id = generate_uuid()
            # One snapshot per day, 2024-01-01 .. 2024-01-05, all at 12:00:00.
            timestamp = f"2024010{i + 1}120000.000000"
            added = f"2024-01-0{i + 1} 12:00:00"

            cursor.execute(
                """
                INSERT INTO core_snapshot (id, url, timestamp, title, added, updated)
                VALUES (?, ?, ?, ?, ?, ?)
                """,
                (snapshot_id, url, timestamp, title, added, added),
            )

            created_data["snapshots"].append(
                {
                    "id": snapshot_id,
                    "url": url,
                    "timestamp": timestamp,
                    "title": title,
                },
            )

            # Assign 2 tags to each snapshot (this one and the next, wrapping around)
            tag_ids = [tags[i % len(tags)]["id"], tags[(i + 1) % len(tags)]["id"]]
            for tag_id in tag_ids:
                cursor.execute(
                    """
                    INSERT INTO core_snapshot_tags (snapshot_id, tag_id) VALUES (?, ?)
                    """,
                    (snapshot_id, tag_id),
                )

            # Create 5 archive results for each snapshot
            for j, (extractor, status) in enumerate(zip(extractors, statuses)):
                cursor.execute(
                    """
                    INSERT INTO core_archiveresult
                    (snapshot_id, extractor, cmd, pwd, cmd_version, output, start_ts, end_ts, status)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                    """,
                    (
                        snapshot_id,
                        extractor,
                        json.dumps([extractor, "--version"]),
                        f"/data/archive/{timestamp}",
                        "1.0.0",
                        # Only successful results point at an output file.
                        f"{extractor}/index.html" if status == "succeeded" else "",
                        f"2024-01-0{i + 1} 12:00:0{j}",
                        f"2024-01-0{i + 1} 12:00:1{j}",
                        status,
                    ),
                )

                created_data["archiveresults"].append(
                    {
                        "snapshot_id": snapshot_id,
                        "extractor": extractor,
                        "status": status,
                    },
                )

        cursor.executemany(
            """
            INSERT INTO django_migrations (app, name, applied)
            VALUES (?, ?, datetime('now'))
            """,
            migrations,
        )

        conn.commit()
    finally:
        # Always release the file handle, even when an INSERT fails, so the
        # test database can be removed or reopened afterwards.
        conn.close()

    return created_data
+ """, + (name, name.lower(), user_id), + ) + tag_id = cursor.lastrowid + created_data["tags"].append({"id": tag_id, "name": name, "slug": name.lower()}) + + # Create 2 Crawls (0.9.0 schema - no seeds) + test_crawls = [ + ("https://example.com\nhttps://example.org", 0, "Example Crawl"), + ("https://github.com/ArchiveBox", 1, "GitHub Crawl"), + ] + + for i, (urls, max_depth, label) in enumerate(test_crawls): + crawl_id = generate_uuid() + cursor.execute( + """ + INSERT INTO crawls_crawl (id, created_at, created_by_id, modified_at, urls, + config, max_depth, tags_str, label, status, retry_at, + num_uses_failed, num_uses_succeeded) + VALUES (?, datetime('now'), ?, datetime('now'), ?, '{}', ?, '', ?, 'queued', datetime('now'), 0, 0) + """, + (crawl_id, user_id, urls, max_depth, label), + ) + + created_data["crawls"].append( + { + "id": crawl_id, + "urls": urls, + "max_depth": max_depth, + "label": label, + }, + ) + + # Create 5 snapshots linked to crawls + test_urls = [ + ("https://example.com/page1", "Example Page 1", created_data["crawls"][0]["id"]), + ("https://example.org/article", "Article Title", created_data["crawls"][0]["id"]), + ("https://github.com/user/repo", "GitHub Repository", created_data["crawls"][1]["id"]), + ("https://news.ycombinator.com/item?id=12345", "HN Discussion", None), + ("https://en.wikipedia.org/wiki/Test", "Wikipedia Test", None), + ] + + for i, (url, title, crawl_id) in enumerate(test_urls): + snapshot_id = generate_uuid() + timestamp = f"2024010{i + 1}120000.000000" + created_at = f"2024-01-0{i + 1} 12:00:00" + + cursor.execute( + """ + INSERT INTO core_snapshot (id, created_by_id, created_at, modified_at, url, timestamp, + bookmarked_at, crawl_id, title, depth, status, config, notes) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, 0, 'queued', '{}', '') + """, + (snapshot_id, user_id, created_at, created_at, url, timestamp, created_at, crawl_id, title), + ) + + created_data["snapshots"].append( + { + "id": snapshot_id, + "url": url, + "timestamp": 
timestamp, + "title": title, + "crawl_id": crawl_id, + }, + ) + + # Assign 2 tags to each snapshot + tag_ids = [created_data["tags"][i % 5]["id"], created_data["tags"][(i + 1) % 5]["id"]] + for tag_id in tag_ids: + cursor.execute( + """ + INSERT INTO core_snapshot_tags (snapshot_id, tag_id) VALUES (?, ?) + """, + (snapshot_id, tag_id), + ) + + # Create 5 archive results for each snapshot + extractors = ["title", "favicon", "screenshot", "singlefile", "wget"] + statuses = ["succeeded", "succeeded", "failed", "succeeded", "skipped"] + + for j, (extractor, status) in enumerate(zip(extractors, statuses)): + result_uuid = generate_uuid() + cursor.execute( + """ + INSERT INTO core_archiveresult + (uuid, created_by_id, created_at, modified_at, snapshot_id, extractor, pwd, + cmd, cmd_version, output, start_ts, end_ts, status, retry_at, notes, output_dir) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now'), '', ?) + """, + ( + result_uuid, + user_id, + f"2024-01-0{i + 1} 12:00:0{j}", + f"2024-01-0{i + 1} 12:00:1{j}", + snapshot_id, + extractor, + f"/data/archive/{timestamp}", + json.dumps([extractor, "--version"]), + "1.0.0", + f"{extractor}/index.html" if status == "succeeded" else "", + f"2024-01-0{i + 1} 12:00:0{j}", + f"2024-01-0{i + 1} 12:00:1{j}", + status, + f"{extractor}", + ), + ) + + created_data["archiveresults"].append( + { + "uuid": result_uuid, + "snapshot_id": snapshot_id, + "extractor": extractor, + "status": status, + }, + ) + + # Record migrations as applied (0.8.x migrations) + migrations = [ + ("contenttypes", "0001_initial"), + ("contenttypes", "0002_remove_content_type_name"), + ("auth", "0001_initial"), + ("auth", "0002_alter_permission_name_max_length"), + ("auth", "0003_alter_user_email_max_length"), + ("auth", "0004_alter_user_username_opts"), + ("auth", "0005_alter_user_last_login_null"), + ("auth", "0006_require_contenttypes_0002"), + ("auth", "0007_alter_validators_add_error_messages"), + ("auth", 
"0008_alter_user_username_max_length"), + ("auth", "0009_alter_user_last_name_max_length"), + ("auth", "0010_alter_group_name_max_length"), + ("auth", "0011_update_proxy_permissions"), + ("auth", "0012_alter_user_first_name_max_length"), + ("admin", "0001_initial"), + ("admin", "0002_logentry_remove_auto_add"), + ("admin", "0003_logentry_add_action_flag_choices"), + ("sessions", "0001_initial"), + ("core", "0001_initial"), + ("core", "0002_auto_20200625_1521"), + ("core", "0003_auto_20200630_1034"), + ("core", "0004_auto_20200713_1552"), + ("core", "0005_auto_20200728_0326"), + ("core", "0006_auto_20201012_1520"), + ("core", "0007_archiveresult"), + ("core", "0008_auto_20210105_1421"), + ("core", "0009_auto_20210216_1038"), + ("core", "0010_auto_20210216_1055"), + ("core", "0011_auto_20210216_1331"), + ("core", "0012_auto_20210216_1425"), + ("core", "0013_auto_20210218_0729"), + ("core", "0014_auto_20210218_0729"), + ("core", "0015_auto_20210218_0730"), + ("core", "0016_auto_20210218_1204"), + ("core", "0017_auto_20210219_0211"), + ("core", "0018_auto_20210327_0952"), + ("core", "0019_auto_20210401_0654"), + ("core", "0020_auto_20210410_1031"), + ("core", "0021_auto_20220914_0934"), + ("core", "0022_auto_20231023_2008"), + # For 0.8.x (dev branch), record the migrations that 0023_new_schema replaces + ("core", "0023_alter_archiveresult_options_archiveresult_abid_and_more"), + ("core", "0024_auto_20240513_1143"), + ("core", "0025_alter_archiveresult_uuid"), + ("core", "0026_archiveresult_created_archiveresult_created_by_and_more"), + ("core", "0027_update_snapshot_ids"), + ("core", "0028_alter_archiveresult_uuid"), + ("core", "0029_alter_archiveresult_id"), + ("core", "0030_alter_archiveresult_uuid"), + ("core", "0031_alter_archiveresult_id_alter_archiveresult_uuid_and_more"), + ("core", "0032_alter_archiveresult_id"), + ("core", "0033_rename_id_archiveresult_old_id"), + ("core", "0034_alter_archiveresult_old_id_alter_archiveresult_uuid"), + ("core", 
"0035_remove_archiveresult_uuid_archiveresult_id"), + ("core", "0036_alter_archiveresult_id_alter_archiveresult_old_id"), + ("core", "0037_rename_id_snapshot_old_id"), + ("core", "0038_rename_uuid_snapshot_id"), + ("core", "0039_rename_snapshot_archiveresult_snapshot_old"), + ("core", "0040_archiveresult_snapshot"), + ("core", "0041_alter_archiveresult_snapshot_and_more"), + ("core", "0042_remove_archiveresult_snapshot_old"), + ("core", "0043_alter_archiveresult_snapshot_alter_snapshot_id_and_more"), + ("core", "0044_alter_archiveresult_snapshot_alter_tag_uuid_and_more"), + ("core", "0045_alter_snapshot_old_id"), + ("core", "0046_alter_archiveresult_snapshot_alter_snapshot_id_and_more"), + ("core", "0047_alter_snapshottag_unique_together_and_more"), + ("core", "0048_alter_archiveresult_snapshot_and_more"), + ("core", "0049_rename_snapshot_snapshottag_snapshot_old_and_more"), + ("core", "0050_alter_snapshottag_snapshot_old"), + ("core", "0051_snapshottag_snapshot_alter_snapshottag_snapshot_old"), + ("core", "0052_alter_snapshottag_unique_together_and_more"), + ("core", "0053_remove_snapshottag_snapshot_old"), + ("core", "0054_alter_snapshot_timestamp"), + ("core", "0055_alter_tag_slug"), + ("core", "0056_remove_tag_uuid"), + ("core", "0057_rename_id_tag_old_id"), + ("core", "0058_alter_tag_old_id"), + ("core", "0059_tag_id"), + ("core", "0060_alter_tag_id"), + ("core", "0061_rename_tag_snapshottag_old_tag_and_more"), + ("core", "0062_alter_snapshottag_old_tag"), + ("core", "0063_snapshottag_tag_alter_snapshottag_old_tag"), + ("core", "0064_alter_snapshottag_unique_together_and_more"), + ("core", "0065_remove_snapshottag_old_tag"), + ("core", "0066_alter_snapshottag_tag_alter_tag_id_alter_tag_old_id"), + ("core", "0067_alter_snapshottag_tag"), + ("core", "0068_alter_archiveresult_options"), + ("core", "0069_alter_archiveresult_created_alter_snapshot_added_and_more"), + ("core", "0070_alter_archiveresult_created_by_alter_snapshot_added_and_more"), + ("core", 
"0071_remove_archiveresult_old_id_remove_snapshot_old_id_and_more"), + ("core", "0072_rename_added_snapshot_bookmarked_at_and_more"), + ("core", "0073_rename_created_archiveresult_created_at_and_more"), + ("core", "0074_alter_snapshot_downloaded_at"), + # For 0.8.x: DO NOT record 0023_new_schema - it replaces 0023-0074 for fresh installs + # We already recorded 0023-0074 above, so Django will know the state + # For 0.8.x: Record original machine migrations (before squashing) + # DO NOT record 0001_squashed here - it replaces 0001-0004 for fresh installs + ("machine", "0001_initial"), + ("machine", "0002_alter_machine_stats_installedbinary"), + ("machine", "0003_alter_installedbinary_options_and_more"), + ("machine", "0004_alter_installedbinary_abspath_and_more"), + # Then the new migrations after squashing + ("machine", "0003_alter_dependency_id_alter_installedbinary_dependency_and_more"), + ("machine", "0004_drop_dependency_table"), + # Crawls must come before core.0024 because 0024_b depends on it + ("crawls", "0001_initial"), + # Core 0024 migrations chain (in dependency order) + ("core", "0024_b_clear_config_fields"), + ("core", "0024_c_disable_fk_checks"), + ("core", "0024_d_fix_crawls_config"), + ("core", "0024_snapshot_crawl"), + ("core", "0024_f_add_snapshot_config"), + ("core", "0025_allow_duplicate_urls_per_crawl"), + # For 0.8.x: Record original api migration (before squashing) + # DO NOT record 0001_squashed here - it replaces 0001 for fresh installs + ("api", "0001_initial"), + ("api", "0002_alter_apitoken_options"), + ("api", "0003_rename_user_apitoken_created_by_apitoken_abid_and_more"), + ("api", "0004_alter_apitoken_id_alter_apitoken_uuid"), + ("api", "0005_remove_apitoken_uuid_remove_outboundwebhook_uuid_and_more"), + ("api", "0006_remove_outboundwebhook_uuid_apitoken_id_and_more"), + ("api", "0007_alter_apitoken_created_by"), + ("api", "0008_alter_apitoken_created_alter_apitoken_created_by_and_more"), + ("api", 
"0009_rename_created_apitoken_created_at_and_more"), + # Note: crawls.0001_initial moved earlier (before core.0024) due to dependencies + # Stop here - 0.8.x ends at core.0025, crawls.0001, and we want to TEST the later migrations + # Do NOT record 0026+ as they need to be tested during migration + ] + + for app, name in migrations: + cursor.execute( + """ + INSERT INTO django_migrations (app, name, applied) + VALUES (?, ?, datetime('now')) + """, + (app, name), + ) + + conn.commit() + conn.close() + + return created_data + + +# ============================================================================= +# Helper Functions +# ============================================================================= + + +def run_archivebox_migration_cmd(data_dir: Path, args: list, timeout: int = 60, env: dict | None = None): + """Run archivebox command in subprocess with given data directory.""" + base_env = cli_env( + disable_extractors=True, + PLUGINS="__archivebox_test_no_plugins__", + ) + if env: + base_env.update(env) + + return run_archivebox_cmd( + args, + env=base_env, + cwd=data_dir, + timeout=timeout, + replace_env=True, + ) + + +def create_data_dir_structure(data_dir: Path): + """Create the basic ArchiveBox data directory structure.""" + (data_dir / "archive").mkdir(parents=True, exist_ok=True) + (data_dir / "sources").mkdir(parents=True, exist_ok=True) + (data_dir / "logs").mkdir(parents=True, exist_ok=True) + + +def verify_snapshot_count(db_path: Path, expected: int) -> tuple[bool, str]: + """Verify the number of snapshots in the database.""" + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + cursor.execute("SELECT COUNT(*) FROM core_snapshot") + count = cursor.fetchone()[0] + conn.close() + + if count == expected: + return True, f"Snapshot count OK: {count}" + return False, f"Snapshot count mismatch: expected {expected}, got {count}" + + +def verify_tag_count(db_path: Path, expected: int) -> tuple[bool, str]: + """Verify the number of tags in the 
database (exact match).""" + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + cursor.execute("SELECT COUNT(*) FROM core_tag") + count = cursor.fetchone()[0] + conn.close() + + if count == expected: + return True, f"Tag count OK: {count}" + return False, f"Tag count mismatch: expected {expected}, got {count}" + + +def verify_archiveresult_count(db_path: Path, expected: int) -> tuple[bool, str]: + """Verify the number of archive results in the database.""" + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + cursor.execute("SELECT COUNT(*) FROM core_archiveresult") + count = cursor.fetchone()[0] + conn.close() + + if count == expected: + return True, f"ArchiveResult count OK: {count}" + return False, f"ArchiveResult count mismatch: expected {expected}, got {count}" + + +def verify_snapshot_urls(db_path: Path, expected_urls: list[str]) -> tuple[bool, str]: + """Verify ALL expected URLs exist in snapshots.""" + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + cursor.execute("SELECT url FROM core_snapshot") + actual_urls = {row[0] for row in cursor.fetchall()} + conn.close() + + missing = set(expected_urls) - actual_urls + if not missing: + return True, "All URLs preserved" + return False, f"Missing URLs: {missing}" + + +def verify_snapshot_titles(db_path: Path, expected_titles: dict[str, str]) -> tuple[bool, str]: + """Verify ALL snapshot titles are preserved.""" + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + cursor.execute("SELECT url, title FROM core_snapshot") + actual = {row[0]: row[1] for row in cursor.fetchall()} + conn.close() + + mismatches = [] + for url, expected_title in expected_titles.items(): + if url not in actual: + mismatches.append(f"{url}: missing from database") + elif actual[url] != expected_title: + mismatches.append(f"{url}: expected '{expected_title}', got '{actual[url]}'") + + if not mismatches: + return True, "All titles preserved" + return False, f"Title mismatches: {mismatches}" + 
+ +def verify_foreign_keys(db_path: Path) -> tuple[bool, str]: + """Verify foreign key relationships are intact.""" + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + + # Check ArchiveResult -> Snapshot FK + cursor.execute(""" + SELECT COUNT(*) FROM core_archiveresult ar + WHERE NOT EXISTS (SELECT 1 FROM core_snapshot s WHERE s.id = ar.snapshot_id) + """) + orphaned_results = cursor.fetchone()[0] + + conn.close() + + if orphaned_results == 0: + return True, "Foreign keys intact" + return False, f"Found {orphaned_results} orphaned ArchiveResults" + + +def verify_all_snapshots_in_output(output: str, snapshots: list[dict]) -> tuple[bool, str]: + """Verify ALL snapshots appear in command output (not just one).""" + missing = [] + for snapshot in snapshots: + url_fragment = snapshot["url"][:30] + title = snapshot.get("title", "") + if url_fragment not in output and (not title or title not in output): + missing.append(snapshot["url"]) + + if not missing: + return True, "All snapshots found in output" + return False, f"Missing snapshots in output: {missing}" + + +def verify_crawl_count(db_path: Path, expected: int) -> tuple[bool, str]: + """Verify the number of crawls in the database.""" + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + cursor.execute("SELECT COUNT(*) FROM crawls_crawl") + count = cursor.fetchone()[0] + conn.close() + + if count == expected: + return True, f"Crawl count OK: {count}" + return False, f"Crawl count mismatch: expected {expected}, got {count}" + + +def verify_process_migration(db_path: Path, expected_archiveresult_count: int) -> tuple[bool, str]: + """ + Verify that ArchiveResults were properly migrated to Process records. + + Checks: + 1. All ArchiveResults have process_id set + 2. Process count matches ArchiveResult count + 3. Binary records created for unique cmd_version values + 4. 
Status mapping is correct + """ + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + + # Check all ArchiveResults have process_id + cursor.execute("SELECT COUNT(*) FROM core_archiveresult WHERE process_id IS NULL") + null_count = cursor.fetchone()[0] + + if null_count > 0: + conn.close() + return False, f"Found {null_count} ArchiveResults without process_id" + + # Check Process count + cursor.execute("SELECT COUNT(*) FROM machine_process") + process_count = cursor.fetchone()[0] + + if process_count != expected_archiveresult_count: + conn.close() + return False, f"Expected {expected_archiveresult_count} Processes, got {process_count}" + + # Check status mapping + cursor.execute(""" + SELECT ar.status, p.status, p.exit_code + FROM core_archiveresult ar + JOIN machine_process p ON ar.process_id = p.id + """) + + status_errors = [] + for ar_status, p_status, p_exit_code in cursor.fetchall(): + expected_p_status, expected_exit_code = { + "queued": ("queued", None), + "started": ("running", None), + "backoff": ("queued", None), + "succeeded": ("exited", 0), + "failed": ("exited", 1), + "skipped": ("exited", None), + }.get(ar_status, ("queued", None)) + + if p_status != expected_p_status: + status_errors.append(f"AR status {ar_status} โ†’ Process {p_status}, expected {expected_p_status}") + + if p_exit_code != expected_exit_code: + status_errors.append(f"AR status {ar_status} โ†’ exit_code {p_exit_code}, expected {expected_exit_code}") + + if status_errors: + conn.close() + return False, f"Status mapping errors: {'; '.join(status_errors[:5])}" + + conn.close() + return True, f"Process migration verified: {process_count} Processes created" diff --git a/archivebox/tests/pinboard_export.html b/archivebox/tests/pinboard_export.html deleted file mode 100644 index e12b5e4150..0000000000 --- a/archivebox/tests/pinboard_export.html +++ /dev/null @@ -1,12 +0,0 @@ - - -Pinboard Bookmarks -

        Bookmarks

        -
        -

        - -

        Algo VPN scripts -
        uLisp - -
        -

        diff --git a/archivebox/tests/pinboard_export.json b/archivebox/tests/pinboard_export.json deleted file mode 100644 index c39d08dddd..0000000000 --- a/archivebox/tests/pinboard_export.json +++ /dev/null @@ -1,8 +0,0 @@ -[{"href":"https:\/\/en.wikipedia.org\/wiki\/International_Typographic_Style","description":"International Typographic Style - Wikipedia, the free encyclopedia","extended":"","meta":"32f4cc916e6f5919cc19aceb10559cc1","hash":"3dd64e155e16731d20350bec6bef7cb5","time":"2016-06-07T11:27:08Z","shared":"no","toread":"yes","tags":""}, -{"href":"https:\/\/news.ycombinator.com\/item?id=11686984","description":"Announcing Certbot: EFF's Client for Let's Encrypt | Hacker News","extended":"","meta":"4a49602ba5d20ec3505c75d38ebc1d63","hash":"1c1acb53a5bd520e8529ce4f9600abee","time":"2016-05-13T05:46:16Z","shared":"no","toread":"yes","tags":""}, -{"href":"https:\/\/github.com\/google\/styleguide","description":"GitHub - google\/styleguide: Style guides for Google-originated open-source projects","extended":"","meta":"15a8d50f7295f18ccb6dd19cb689c68a","hash":"1028bf9872d8e4ea1b1858f4044abb58","time":"2016-02-24T08:49:25Z","shared":"no","toread":"no","tags":"code.style.guide programming reference web.dev"}, -{"href":"http:\/\/en.wikipedia.org\/wiki\/List_of_XML_and_HTML_character_entity_references","description":"List of XML and HTML character entity references - Wikipedia, the free encyclopedia","extended":"","meta":"6683a70f0f59c92c0bfd0bce653eab69","hash":"344d975c6251a8d460971fa2c43d9bbb","time":"2014-06-16T04:17:15Z","shared":"no","toread":"no","tags":"html reference web.dev typography"}, -{"href":"https:\/\/pushover.net\/","description":"Pushover: Simple Notifications for Android, iOS, and Desktop","extended":"","meta":"1e68511234d9390d10b7772c8ccc4b9e","hash":"bb93374ead8a937b18c7c46e13168a7d","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"app android"}, 
-{"href":"http:\/\/www.reddit.com\/r\/Android","description":"r\/android","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android 1"}, -{"href":"http:\/\/www.reddit.com\/r\/Android2","description":"r\/android","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e2","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android 2"}, -{"href":"http:\/\/www.reddit.com\/r\/Android3","description":"r\/android","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e4","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android 3"}] diff --git a/archivebox/tests/pinboard_export.rss b/archivebox/tests/pinboard_export.rss deleted file mode 100644 index a300720a61..0000000000 --- a/archivebox/tests/pinboard_export.rss +++ /dev/null @@ -1,46 +0,0 @@ - - - - Pinboard (private aaronmueller) - https://pinboard.in/u:aaronmueller/private/ - - - - - - - - - - - Mehkee - Mechanical Keyboard Parts & Accessories - 2018-11-08T21:29:32+00:00 - https://mehkee.com/ - aaronmueller - keyboard gadget diy - http://pinboard.in/ - http://pinboard.in/u:aaronmueller/b:xxx/ - - - - - - - - - - QMK Firmware - An open source firmware for AVR and ARM based keyboards - 2018-11-06T22:36:21+00:00 - https://qmk.fm/ - aaronmueller - firmware keyboard - http://pinboard.in/ - http://pinboard.in/u:aaronmueller/b:xxx/ - - - - - - - - diff --git a/archivebox/tests/pinboard_export.xml b/archivebox/tests/pinboard_export.xml deleted file mode 100644 index 9dce0f5469..0000000000 --- a/archivebox/tests/pinboard_export.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - - - diff --git a/archivebox/tests/pinboard_export_2.json b/archivebox/tests/pinboard_export_2.json deleted file mode 100644 index b106039cff..0000000000 --- a/archivebox/tests/pinboard_export_2.json +++ /dev/null @@ -1,2 
+0,0 @@ -[{"href":"https:\/\/github.com\/trailofbits\/algo","description":"Algo VPN scripts","extended":"","meta":"62325ba3b577683aee854d7f191034dc","hash":"18d708f67bb26d843b1cac4530bb52aa","time":"2018-11-19T08:38:53Z","shared":"no","toread":"yes","tags":"vpn scripts"}, -{"href":"http:\/\/www.ulisp.com\/","description":"uLisp","extended":"","meta":"7bd0c0ef31f69d1459e3d37366e742b3","hash":"2a17ae95925a03a5b9bb38cf7f6c6f9b","time":"2018-11-16T13:20:12Z","shared":"no","toread":"yes","tags":"arduino avr embedded lisp"}] diff --git a/archivebox/tests/pocket_export.html b/archivebox/tests/pocket_export.html deleted file mode 100644 index bb51c0c683..0000000000 --- a/archivebox/tests/pocket_export.html +++ /dev/null @@ -1,38 +0,0 @@ - - - - - - Pocket Export - - -

        Unread

        - - -

        Read Archive

        - - - diff --git a/archivebox/tests/rss_export.xml b/archivebox/tests/rss_export.xml deleted file mode 100644 index 69eb9bc29c..0000000000 --- a/archivebox/tests/rss_export.xml +++ /dev/null @@ -1,228 +0,0 @@ - - - - -My Reading List: Read and Unread -Items I've saved to read -http://readitlaterlist.com/users/nikisweeting/feed/all - - - - -<![CDATA[Cell signaling]]> -Unread -https://en.wikipedia.org/wiki/Cell_signaling -https://en.wikipedia.org/wiki/Cell_signaling -Mon, 30 Oct 2017 01:12:10 -0500 - - -<![CDATA[Hayflick limit]]> -Unread -https://en.wikipedia.org/wiki/Hayflick_limit -https://en.wikipedia.org/wiki/Hayflick_limit -Mon, 30 Oct 2017 01:11:38 -0500 - - -<![CDATA[Even moderate drinking by parents can upset children โ€“ย study]]> -Unread -https://theguardian.com/society/2017/oct/18/even-moderate-drinking-by-parents-can-upset-children-study?CMP=Share_AndroidApp_Signal -https://theguardian.com/society/2017/oct/18/even-moderate-drinking-by-parents-can-upset-children-study?CMP=Share_AndroidApp_Signal -Mon, 30 Oct 2017 01:11:30 -0500 - - -<![CDATA[How Merkle trees enable the decentralized Web]]> -Unread -https://taravancil.com/blog/how-merkle-trees-enable-decentralized-web -https://taravancil.com/blog/how-merkle-trees-enable-decentralized-web -Mon, 30 Oct 2017 01:11:30 -0500 - - -<![CDATA[Inertial navigation system]]> -Unread -https://en.wikipedia.org/wiki/Inertial_navigation_system -https://en.wikipedia.org/wiki/Inertial_navigation_system -Mon, 30 Oct 2017 01:10:10 -0500 - - -<![CDATA[Dead reckoning]]> -Unread -https://en.wikipedia.org/wiki/Dead_reckoning -https://en.wikipedia.org/wiki/Dead_reckoning -Mon, 30 Oct 2017 01:10:08 -0500 - - -<![CDATA[Calling Rust From Python]]> -Unread -https://bheisler.github.io/post/calling-rust-in-python -https://bheisler.github.io/post/calling-rust-in-python -Mon, 30 Oct 2017 01:04:33 -0500 - - -<![CDATA[Why would anyone choose Docker over fat binaries?]]> -Unread 
-http://smashcompany.com/technology/why-would-anyone-choose-docker-over-fat-binaries -http://smashcompany.com/technology/why-would-anyone-choose-docker-over-fat-binaries -Sun, 29 Oct 2017 14:57:25 -0500 - - -<![CDATA[]]> -Unread -https://heml.io -https://heml.io -Sun, 29 Oct 2017 14:55:26 -0500 - - -<![CDATA[A surprising amount of people want to be in North Korea]]> -Unread -https://blog.benjojo.co.uk/post/north-korea-dprk-bgp-geoip-fruad -https://blog.benjojo.co.uk/post/north-korea-dprk-bgp-geoip-fruad -Sat, 28 Oct 2017 05:41:41 -0500 - - -<![CDATA[Learning a Hierarchy]]> -Unread -https://blog.openai.com/learning-a-hierarchy -https://blog.openai.com/learning-a-hierarchy -Thu, 26 Oct 2017 16:43:48 -0500 - - -<![CDATA[High Performance Browser Networking]]> -Unread -https://hpbn.co -https://hpbn.co -Wed, 25 Oct 2017 19:05:24 -0500 - - -<![CDATA[What tender and juicy drama is going on at your school/workplace?]]> -Unread -https://reddit.com/r/AskReddit/comments/78nc2a/what_tender_and_juicy_drama_is_going_on_at_your/dovab2v -https://reddit.com/r/AskReddit/comments/78nc2a/what_tender_and_juicy_drama_is_going_on_at_your/dovab2v -Wed, 25 Oct 2017 18:05:58 -0500 - - -<![CDATA[Using an SSH Bastion Host]]> -Unread -https://blog.scottlowe.org/2015/11/21/using-ssh-bastion-host -https://blog.scottlowe.org/2015/11/21/using-ssh-bastion-host -Wed, 25 Oct 2017 11:38:47 -0500 - - -<![CDATA[Let's Define "undefined" | NathanShane.me]]> -Unread -https://nathanshane.me/blog/let's-define-undefined -https://nathanshane.me/blog/let's-define-undefined -Wed, 25 Oct 2017 11:32:59 -0500 - - -<![CDATA[Control theory]]> -Unread -https://en.wikipedia.org/wiki/Control_theory#Closed-loop_transfer_function -https://en.wikipedia.org/wiki/Control_theory#Closed-loop_transfer_function -Tue, 24 Oct 2017 22:57:43 -0500 - - -<![CDATA[J012-86-intractable.pdf]]> -Unread -http://mit.edu/~jnt/Papers/J012-86-intractable.pdf -http://mit.edu/~jnt/Papers/J012-86-intractable.pdf -Tue, 24 Oct 2017 22:56:32 -0500 - - 
-<![CDATA[Dynamic Programming: First Principles]]> -Unread -http://flawlessrhetoric.com/Dynamic-Programming-First-Principles -http://flawlessrhetoric.com/Dynamic-Programming-First-Principles -Tue, 24 Oct 2017 22:56:30 -0500 - - -<![CDATA[What Would Happen If There Were No Number 6?]]> -Unread -https://fivethirtyeight.com/features/what-would-happen-if-there-were-no-number-6 -https://fivethirtyeight.com/features/what-would-happen-if-there-were-no-number-6 -Tue, 24 Oct 2017 22:21:59 -0500 - - -<![CDATA[Ten Basic Rules for Adventure]]> -Unread -https://outsideonline.com/2252916/10-basic-rules-adventure -https://outsideonline.com/2252916/10-basic-rules-adventure -Tue, 24 Oct 2017 20:56:25 -0500 - - -<![CDATA[Insects Are In Serious Trouble]]> -Unread -https://theatlantic.com/science/archive/2017/10/oh-no/543390?single_page=true -https://theatlantic.com/science/archive/2017/10/oh-no/543390?single_page=true -Mon, 23 Oct 2017 23:10:10 -0500 - - -<![CDATA[Netflix/bless]]> -Unread -https://github.com/Netflix/bless -https://github.com/Netflix/bless -Mon, 23 Oct 2017 23:04:46 -0500 - - -<![CDATA[Getting Your First 10 Customers]]> -Unread -https://stripe.com/atlas/guides/starting-sales -https://stripe.com/atlas/guides/starting-sales -Mon, 23 Oct 2017 22:27:36 -0500 - - -<![CDATA[GPS Hardware]]> -Unread -https://novasummits.com/gps-hardware -https://novasummits.com/gps-hardware -Mon, 23 Oct 2017 04:44:40 -0500 - - -<![CDATA[Bicycle Tires and Tubes]]> -Unread -http://sheldonbrown.com/tires.html#pressure -http://sheldonbrown.com/tires.html#pressure -Mon, 23 Oct 2017 01:28:32 -0500 - - -<![CDATA[Tire light is on]]> -Unread -https://reddit.com/r/Justrolledintotheshop/comments/77zm9e/tire_light_is_on/doqbshe -https://reddit.com/r/Justrolledintotheshop/comments/77zm9e/tire_light_is_on/doqbshe -Mon, 23 Oct 2017 01:21:42 -0500 - - -<![CDATA[Bad_Salish_Boo ?? 
on Twitter]]> -Unread -https://t.co/PDLlNjACv9 -https://t.co/PDLlNjACv9 -Sat, 21 Oct 2017 06:48:07 -0500 - - -<![CDATA[Is an Open Marriage a Happier Marriage?]]> -Unread -https://nytimes.com/2017/05/11/magazine/is-an-open-marriage-a-happier-marriage.html -https://nytimes.com/2017/05/11/magazine/is-an-open-marriage-a-happier-marriage.html -Fri, 20 Oct 2017 13:08:52 -0500 - - -<![CDATA[The Invention of Monogamy]]> -Unread -https://thenib.com/the-invention-of-monogamy -https://thenib.com/the-invention-of-monogamy -Fri, 20 Oct 2017 12:19:00 -0500 - - -<![CDATA[Google Chrome May Add a Permission to Stop In-Browser Cryptocurrency Miners]]> -Unread -https://bleepingcomputer.com/news/google/google-chrome-may-add-a-permission-to-stop-in-browser-cryptocurrency-miners -https://bleepingcomputer.com/news/google/google-chrome-may-add-a-permission-to-stop-in-browser-cryptocurrency-miners -Fri, 20 Oct 2017 03:57:41 -0500 - - - - diff --git a/archivebox/tests/test_api_archiveresult.py b/archivebox/tests/test_api_archiveresult.py new file mode 100644 index 0000000000..0678292318 --- /dev/null +++ b/archivebox/tests/test_api_archiveresult.py @@ -0,0 +1 @@ +# Tests moved to test_api_v1_core_archiveresults.py and test_api_v1_core_archiveresult_archiveresult_id.py. diff --git a/archivebox/tests/test_api_cli.py b/archivebox/tests/test_api_cli.py new file mode 100644 index 0000000000..b4267b1b68 --- /dev/null +++ b/archivebox/tests/test_api_cli.py @@ -0,0 +1 @@ +# CLI endpoint tests moved to test_api_v1_cli_add.py and test_api_v1_cli_update.py. diff --git a/archivebox/tests/test_api_cli_schedule.py b/archivebox/tests/test_api_cli_schedule.py new file mode 100644 index 0000000000..def038026b --- /dev/null +++ b/archivebox/tests/test_api_cli_schedule.py @@ -0,0 +1 @@ +# CLI schedule endpoint tests moved to test_api_v1_cli_schedule.py. 
diff --git a/archivebox/tests/test_api_crawl.py b/archivebox/tests/test_api_crawl.py new file mode 100644 index 0000000000..4924867570 --- /dev/null +++ b/archivebox/tests/test_api_crawl.py @@ -0,0 +1 @@ +# Tests moved to test_api_v1_crawls_crawl_crawl_id.py. diff --git a/archivebox/tests/test_api_crud.py b/archivebox/tests/test_api_crud.py new file mode 100644 index 0000000000..98b61d355e --- /dev/null +++ b/archivebox/tests/test_api_crud.py @@ -0,0 +1 @@ +# Tests moved to test_api_v1_workflow_core_token_auth_side_effects.py and exact endpoint files under test_api_v1_core_*. diff --git a/archivebox/tests/test_api_delete_paths.py b/archivebox/tests/test_api_delete_paths.py new file mode 100644 index 0000000000..46113b2082 --- /dev/null +++ b/archivebox/tests/test_api_delete_paths.py @@ -0,0 +1 @@ +# Tests moved to test_api_v1_core_snapshot_snapshot_id.py and test_api_v1_crawls_crawl_crawl_id.py. diff --git a/archivebox/tests/test_api_personas.py b/archivebox/tests/test_api_personas.py new file mode 100644 index 0000000000..227efc93f7 --- /dev/null +++ b/archivebox/tests/test_api_personas.py @@ -0,0 +1 @@ +# Tests moved to test_api_v1_personas_sync.py and test_api_v1_personas_personas.py. diff --git a/archivebox/tests/test_api_remove.py b/archivebox/tests/test_api_remove.py new file mode 100644 index 0000000000..80a2395fb8 --- /dev/null +++ b/archivebox/tests/test_api_remove.py @@ -0,0 +1 @@ +# CLI remove endpoint tests moved to test_api_v1_cli_remove.py. diff --git a/archivebox/tests/test_api_rss.py b/archivebox/tests/test_api_rss.py new file mode 100644 index 0000000000..dd3acbbcf7 --- /dev/null +++ b/archivebox/tests/test_api_rss.py @@ -0,0 +1 @@ +# Tests moved to test_api_v1_core_snapshots_rss.py and test_api_v1_crawls_crawl_crawl_id.py. 
diff --git a/archivebox/tests/test_api_search.py b/archivebox/tests/test_api_search.py new file mode 100644 index 0000000000..df3443bebe --- /dev/null +++ b/archivebox/tests/test_api_search.py @@ -0,0 +1 @@ +# Tests moved to test_api_v1_core_snapshots.py. diff --git a/archivebox/tests/test_api_v1_auth_check_api_token.py b/archivebox/tests/test_api_v1_auth_check_api_token.py new file mode 100644 index 0000000000..d03e73d4d5 --- /dev/null +++ b/archivebox/tests/test_api_v1_auth_check_api_token.py @@ -0,0 +1,19 @@ +import pytest + +from archivebox.tests.conftest import API_TEST_HOST, api_client_request + + +pytestmark = pytest.mark.django_db(transaction=True) + + +def test_basic_success_case_request(client, tmp_path, api_token): + response = api_client_request( + client, + "post", + "/api/v1/auth/check_api_token", + payload={"token": api_token.token}, + headers={"HTTP_HOST": API_TEST_HOST}, + ) + + assert response.status_code == 200, response.content + assert response.json()["success"] is True diff --git a/archivebox/tests/test_api_v1_auth_get_api_token.py b/archivebox/tests/test_api_v1_auth_get_api_token.py new file mode 100644 index 0000000000..6a6f402733 --- /dev/null +++ b/archivebox/tests/test_api_v1_auth_get_api_token.py @@ -0,0 +1,22 @@ +import pytest + +from archivebox.tests.conftest import API_TEST_HOST, api_client_request + + +pytestmark = pytest.mark.django_db(transaction=True) + + +def test_basic_success_case_request(client, tmp_path, api_admin_user): + response = api_client_request( + client, + "post", + "/api/v1/auth/get_api_token", + payload={ + "username": api_admin_user.username, + "password": "testpass123", + }, + headers={"HTTP_HOST": API_TEST_HOST}, + ) + + assert response.status_code == 200, response.content + assert response.json()["success"] is True diff --git a/archivebox/tests/test_api_v1_cli_add.py b/archivebox/tests/test_api_v1_cli_add.py new file mode 100644 index 0000000000..1eb8933a55 --- /dev/null +++ 
b/archivebox/tests/test_api_v1_cli_add.py @@ -0,0 +1,394 @@ +import pytest +import json +from pathlib import Path + +from .conftest import ( + api_client_request, + cli_env, + create_admin_and_token, + get_free_port, + init_archive, + live_api_request, + start_archivebox_server, + stop_server, +) +from archivebox.core.models import Snapshot, SnapshotTag +from archivebox.crawls.models import Crawl +from archivebox.tests.test_orm_helpers import use_archivebox_db + +pytestmark = pytest.mark.django_db(transaction=True) + + +IMPORT_FORMAT_EXPECTATIONS = { + "rss": { + "url": "https://example.com/", + "title": "RSS Example Import", + "date": "2024-01-01", + "tags": {"rss-tag", "metadata"}, + }, + "netscape": { + "url": "https://www.iana.org/domains/reserved", + "title": "IANA Reserved Domains", + "date": "2024-01-02", + "tags": {"netscape-tag", "metadata"}, + }, + "dom": { + "url": "https://www.iana.org/help/example-domains", + }, + "json": { + "url": "https://example.com/?archivebox-json-import=1", + "title": "JSON Import Example", + "date": "2024-01-03", + "tags": {"json-tag", "metadata"}, + }, + "jsonl": { + "url": "https://example.com/?archivebox-jsonl-import=1", + "title": "JSONL Import Example", + "date": "2024-01-04", + "tags": {"jsonl-tag", "metadata"}, + }, + "txt": { + "url": "https://example.org/", + }, +} + + +def write_import_format_files(base_dir: Path) -> dict[str, Path]: + files = { + "rss": base_dir / "test_rss.xml", + "netscape": base_dir / "test_netscape.html", + "dom": base_dir / "test_dom.html", + "json": base_dir / "test_bookmarks.json", + "jsonl": base_dir / "test_bookmarks.jsonl", + "txt": base_dir / "test_urls.txt", + } + files["rss"].write_text( + """ + + + ArchiveBox RSS import fixture + https://example.com/ + ArchiveBox RSS import fixture + + RSS Example Import + https://example.com/ + https://example.com/ + Mon, 01 Jan 2024 00:00:00 GMT + rss-tag + metadata + + + +""", + encoding="utf-8", + ) + files["netscape"].write_text( + """ + +Bookmarks 
+

        Bookmarks

        +

        +

        IANA Reserved Domains +

        +""", + encoding="utf-8", + ) + files["dom"].write_text( + """ + + DOM import fixture + + IANA Example Domains + + +""", + encoding="utf-8", + ) + files["json"].write_text( + json.dumps( + { + "url": "https://example.com/?archivebox-json-import=1", + "title": "JSON Import Example", + "tags": ["json-tag", "metadata"], + "bookmarked_at": "2024-01-03T00:00:00+00:00", + }, + ) + + "\n", + encoding="utf-8", + ) + files["jsonl"].write_text( + json.dumps( + { + "url": "https://example.com/?archivebox-jsonl-import=1", + "title": "JSONL Import Example", + "tags": "jsonl-tag,metadata", + "bookmarked_at": "2024-01-04T00:00:00+00:00", + }, + ) + + "\n", + encoding="utf-8", + ) + files["txt"].write_text( + "Plain text import fixture containing https://example.org/ as a real live URL.\n", + encoding="utf-8", + ) + return files + + +IMPORT_FORMAT_ENV = { + "USE_COLOR": "False", + "SHOW_PROGRESS": "False", + "PLUGINS": "parse_html_urls,parse_jsonl_urls,parse_netscape_urls,parse_rss_urls,parse_txt_urls,wget,headers", + "SAVE_WGET": "True", + "SAVE_HEADERS": "True", + "USE_CHROME": "False", + "URL_ALLOWLIST": r"example\.com|example\.org|iana\.org|www\.iana\.org", +} + + +def wait_for_import_processing(cwd: Path, expected_urls: set[str], *, timeout: float = 120.0) -> None: + import time + + deadline = time.time() + timeout + while time.time() < deadline: + with use_archivebox_db(cwd): + snapshot_started = Snapshot.objects.filter(url__in=expected_urls).exists() + if snapshot_started: + return + time.sleep(1) + raise AssertionError("timed out waiting for import crawl processing to start") + + +def wait_for_expected_import_snapshots( + cwd: Path, + expected_urls: set[str], + *, + timeout: float = 180.0, + expected_tags: set[str] | None = None, +) -> None: + import time + + allowed_statuses = {Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED, Snapshot.StatusChoices.SEALED} + deadline = time.time() + timeout + while time.time() < deadline: + with 
use_archivebox_db(cwd): + snapshots = list(Snapshot.objects.filter(url__in=expected_urls).values("id", "url", "status")) + tag_names_by_snapshot_id = {} + if expected_tags and snapshots: + for snapshot_id, tag_name in SnapshotTag.objects.filter( + snapshot_id__in=[snapshot["id"] for snapshot in snapshots], + ).values_list("snapshot_id", "tag__name"): + tag_names_by_snapshot_id.setdefault(snapshot_id, set()).add(tag_name) + counts = {url: 0 for url in expected_urls} + bad_statuses = [] + missing_tags = {} + for snapshot in snapshots: + counts[snapshot["url"]] += 1 + if snapshot["status"] not in allowed_statuses: + bad_statuses.append((snapshot["url"], snapshot["status"])) + if expected_tags: + tag_names = tag_names_by_snapshot_id.get(snapshot["id"], set()) + missing = expected_tags - tag_names + if missing: + missing_tags[snapshot["url"]] = missing + if all(count == 1 for count in counts.values()) and not bad_statuses and not missing_tags: + return + time.sleep(1) + raise AssertionError( + f"timed out waiting for one queued/started/sealed snapshot per URL, got counts={counts}, bad_statuses={bad_statuses}, missing_tags={missing_tags}", + ) + + +def malicious_add_inputs(tmp_path: Path, *, safe_url: str) -> tuple[list[str], Path]: + other_crawl_source = tmp_path / "sources" / "other_crawl_source.txt" + other_crawl_source.parent.mkdir(parents=True, exist_ok=True) + other_crawl_source.write_text("https://example.com/not-owned-by-this-crawl\n", encoding="utf-8") + canary = tmp_path / "archivebox_shell_injection_canary" + return ( + [ + safe_url, + "file:///etc/hosts", + "/etc/hosts", + "../../../../etc/passwd", + f"file://{other_crawl_source}", + str(other_crawl_source), + f"'; touch {canary}; #", + f'" && touch {canary} && echo "', + f"$(touch {canary})", + f"`touch {canary}`", + """ + +]> + + + &localfile;file:///etc/passwd + + +""", + ], + canary, + ) + + +def assert_no_file_or_shell_payload_snapshots(cwd: Path, *, canary: Path) -> None: + with use_archivebox_db(cwd): 
+ snapshots = list(Snapshot.objects.all()) + assert not canary.exists() + assert not [snapshot.url for snapshot in snapshots if str(snapshot.url).startswith("file:")] + for forbidden in ("/etc/hosts", "/etc/passwd", "other_crawl_source", "archivebox_shell_injection_canary"): + assert not [snapshot.url for snapshot in snapshots if forbidden in str(snapshot.url)] + + +def test_basic_success_case_request(client, tmp_path, api_headers): + init_archive(tmp_path) + submitted_url = "https://example.com/api-cli-add-basic" + + response = api_client_request( + client, + "post", + "/api/v1/cli/add", + payload={ + "urls": [submitted_url], + "depth": 0, + "parser": "url_list", + "plugins": "__archivebox_test_no_plugins__", + "index_only": True, + }, + headers=api_headers, + ) + + assert response.status_code == 200, response.content + assert response.json()["success"] is True + crawl = Crawl.objects.get() + root_snapshot = Snapshot.objects.get() + assert crawl.urls == submitted_url + assert root_snapshot.url == Snapshot.INTERNAL_INPUT_URL + assert (root_snapshot.output_dir / "staticfile" / "stdin.txt").read_text(encoding="utf-8") == submitted_url + + +@pytest.mark.timeout(360) +def test_api_cli_add_import_text_formats_preserve_metadata_and_crawl_inner_urls(tmp_path): + """REST API add should accept rich import text and queue real inner URLs with metadata preserved.""" + init_archive(tmp_path) + import_files = write_import_format_files(tmp_path) + expected_urls = {case["url"] for case in IMPORT_FORMAT_EXPECTATIONS.values()} + port = get_free_port() + env = cli_env(port=port, server=True, **IMPORT_FORMAT_ENV) + api_token = create_admin_and_token(tmp_path) + + try: + start_archivebox_server(tmp_path, env=env, port=port) + for import_name, import_path in import_files.items(): + response = live_api_request( + port, + "post", + "/api/v1/cli/add", + api_token=api_token, + json={ + "urls": [import_path.read_text(encoding="utf-8")], + "depth": 0, + "tag": "api-import", + "plugins": 
IMPORT_FORMAT_ENV["PLUGINS"], + "index_only": False, + }, + ) + assert response.status_code == 200, response.text + body = response.json() + assert body["success"] is True + assert body["result"]["crawl_id"] + with use_archivebox_db(tmp_path): + crawl = Crawl.objects.get(id=body["result"]["crawl_id"]) + root_snapshot = crawl.snapshot_set.get(url=Snapshot.INTERNAL_INPUT_URL) + root_input = (root_snapshot.output_dir / "staticfile" / "stdin.txt").read_text(encoding="utf-8") + source_text = import_path.read_text(encoding="utf-8") + assert crawl.urls == source_text + assert root_input == source_text + + wait_for_import_processing(tmp_path, expected_urls) + stop_server(tmp_path) + start_archivebox_server(tmp_path, env=env, port=port) + wait_for_expected_import_snapshots(tmp_path, expected_urls) + + for import_name, expected in IMPORT_FORMAT_EXPECTATIONS.items(): + with use_archivebox_db(tmp_path): + snapshot = Snapshot.objects.filter(url=expected["url"]).order_by("-created_at").first() + assert snapshot is not None, f"{import_name} did not create Snapshot for {expected['url']}" + snapshot_id = str(snapshot.id) + + snapshot_response = live_api_request( + port, + "get", + f"/api/v1/core/snapshot/{snapshot_id}", + api_token=api_token, + ) + assert snapshot_response.status_code == 200, snapshot_response.text + assert snapshot_response.json()["url"] == expected["url"] + finally: + stop_server(tmp_path) + + with use_archivebox_db(tmp_path): + crawls = list(Crawl.objects.order_by("created_at")) + snapshots_by_url = {snapshot.url: snapshot for snapshot in Snapshot.objects.prefetch_related("tags").filter(url__in=expected_urls)} + tags_by_url = {snapshot.url: set(snapshot.tags.values_list("name", flat=True)) for snapshot in snapshots_by_url.values()} + + assert len(crawls) == len(import_files) + assert [crawl.urls for crawl in crawls] == [path.read_text(encoding="utf-8") for path in import_files.values()] + assert all(crawl.tags_str == "api-import" for crawl in crawls) + assert 
all(crawl.status in {Crawl.StatusChoices.STARTED, Crawl.StatusChoices.SEALED} for crawl in crawls) + assert len(snapshots_by_url) == len(expected_urls) + + for import_name, expected in IMPORT_FORMAT_EXPECTATIONS.items(): + snapshot = snapshots_by_url.get(expected["url"]) + assert snapshot is not None, f"{import_name} did not create Snapshot for {expected['url']}" + assert snapshot.status in {Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED, Snapshot.StatusChoices.SEALED} + if expected.get("title"): + assert snapshot.title == expected["title"] + if expected.get("date"): + assert snapshot.bookmarked_at.date().isoformat() == expected["date"] + if expected.get("tags"): + assert expected["tags"] | {"api-import"} <= tags_by_url[snapshot.url] + + +@pytest.mark.timeout(240) +def test_api_cli_add_rejects_file_path_and_shell_injection_payloads(tmp_path): + """REST add must not let path, file://, traversal, or shell strings become archiveable URLs.""" + init_archive(tmp_path) + safe_url = "https://example.com/?archivebox-api-security=1" + inputs, canary = malicious_add_inputs(tmp_path, safe_url=safe_url) + port = get_free_port() + env = cli_env(port=port, server=True, **IMPORT_FORMAT_ENV) + api_token = create_admin_and_token(tmp_path) + + try: + start_archivebox_server(tmp_path, env=env, port=port) + response = live_api_request( + port, + "post", + "/api/v1/cli/add", + api_token=api_token, + json={ + "urls": inputs, + "depth": 0, + "tag": "api-security", + "plugins": IMPORT_FORMAT_ENV["PLUGINS"], + "index_only": False, + }, + ) + assert response.status_code == 200, response.text + assert response.json()["success"] is True + + wait_for_expected_import_snapshots(tmp_path, {safe_url}, timeout=120) + finally: + stop_server(tmp_path) + + wait_for_expected_import_snapshots(tmp_path, {safe_url}, timeout=30, expected_tags={"api-security"}) + assert_no_file_or_shell_payload_snapshots(tmp_path, canary=canary) + with use_archivebox_db(tmp_path): + snapshot = 
Snapshot.objects.get(url=safe_url) + crawl = Crawl.objects.get() + assert crawl.status in {Crawl.StatusChoices.STARTED, Crawl.StatusChoices.SEALED} + assert snapshot.status in {Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED, Snapshot.StatusChoices.SEALED} + with use_archivebox_db(tmp_path): + tag_names = set(SnapshotTag.objects.filter(snapshot=snapshot).values_list("tag__name", flat=True)) + assert "api-security" in tag_names diff --git a/archivebox/tests/test_api_v1_cli_remove.py b/archivebox/tests/test_api_v1_cli_remove.py new file mode 100644 index 0000000000..f1eabc3127 --- /dev/null +++ b/archivebox/tests/test_api_v1_cli_remove.py @@ -0,0 +1,209 @@ +from datetime import datetime, timedelta +from pathlib import Path + +import pytest +from django.utils import timezone + +from archivebox.core.models import Snapshot, Tag +from archivebox.crawls.models import Crawl +from archivebox.tests.conftest import api_client_request, init_archive + + +pytestmark = pytest.mark.django_db(transaction=True) + + +def _crawl(user, urls: str, label: str) -> Crawl: + return Crawl.objects.create( + urls=urls, + created_by=user, + status=Crawl.StatusChoices.SEALED, + retry_at=None, + tags_str=label, + ) + + +def _snapshot(crawl: Crawl, url: str, *, status: str = Snapshot.StatusChoices.SEALED, tag: str = "", bookmarked_at=None) -> Snapshot: + snapshot = Snapshot.objects.create( + url=url, + crawl=crawl, + status=status, + retry_at=None, + bookmarked_at=bookmarked_at or timezone.now(), + ) + if tag: + tag_obj, _ = Tag.objects.get_or_create(name=tag) + snapshot.tags.add(tag_obj) + return snapshot + + +def _touch_output(snapshot: Snapshot) -> Path: + output_dir = Path(snapshot.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + (output_dir / "api-remove-test.txt").write_text(str(snapshot.id)) + return output_dir + + +def _bulk_timeout_snapshots(crawl: Crawl, *, count: int = 30000) -> tuple[list[Snapshot], dict[str, Path]]: + base = 
timezone.make_aware(datetime(2026, 2, 1, 12, 0, 0)) + snapshots = [ + Snapshot( + url=f"https://example.com/remove-timeout-{idx}", + crawl=crawl, + status=Snapshot.StatusChoices.SEALED, + retry_at=None, + timestamp=f"88{idx:030d}", + bookmarked_at=base + timedelta(seconds=idx), + created_at=base + timedelta(seconds=idx), + ) + for idx in range(count) + ] + Snapshot.objects.bulk_create(snapshots, batch_size=1000) + + sample = [*snapshots[-200:], *snapshots[:200]] + return snapshots, {str(snapshot.id): _touch_output(snapshot) for snapshot in sample} + + +def _post_remove(client, api_headers, body: dict): + return api_client_request( + client, + "post", + "/api/v1/cli/remove", + payload=body, + headers=api_headers, + ) + + +def test_cli_remove_api_removes_rows_and_respects_snapshot_filters(client, tmp_path, api_admin_user, api_headers): + init_archive(tmp_path) + base = timezone.make_aware(datetime(2026, 1, 1, 12, 0, 0)) + + crawl_a = _crawl(api_admin_user, "https://alpha.example.com/articles/needle", "crawl-a") + crawl_b = _crawl(api_admin_user, "https://beta.example.org/posts/needle", "crawl-b") + exact = _snapshot(crawl_a, "https://alpha.example.com/articles/exact", tag="api-keep", bookmarked_at=base) + keep = _snapshot(crawl_a, "https://alpha.example.com/articles/needle", tag="api-keep", bookmarked_at=base + timedelta(hours=1)) + wrong_status = _snapshot( + crawl_a, + "https://alpha.example.com/articles/needle-queued", + status=Snapshot.StatusChoices.QUEUED, + tag="api-keep", + bookmarked_at=base + timedelta(hours=2), + ) + other_crawl = _snapshot(crawl_b, "https://beta.example.org/posts/needle", tag="api-other", bookmarked_at=base + timedelta(hours=3)) + exact_dir = _touch_output(exact) + keep_dir = _touch_output(keep) + wrong_status_dir = _touch_output(wrong_status) + other_crawl_dir = _touch_output(other_crawl) + + exact_response = _post_remove( + client, + api_headers, + { + "filter_type": "exact", + "filter_patterns": [exact.url], + "timeout": 60, + }, + ) + 
assert exact_response.status_code == 200, exact_response.content + exact_payload = exact_response.json() + assert exact_payload["success"] is True + assert exact_payload["result"]["removed_count"] == 1 + assert exact_payload["result"]["removed_snapshot_ids"] == [str(exact.id)] + assert exact_payload["result"]["not_removed_count"] == 0 + assert not Snapshot.objects.filter(pk=exact.pk).exists() + assert not exact_dir.exists() + + filtered_response = _post_remove( + client, + api_headers, + { + "filter_type": "substring", + "filter_patterns": ["needle"], + "status": Snapshot.StatusChoices.SEALED, + "tag": "api-keep", + "url__istartswith": "https://alpha.example.com", + "crawl_id": str(crawl_a.id), + "after": (base + timedelta(minutes=30)).timestamp(), + "before": (base + timedelta(hours=2)).timestamp(), + "timeout": 60, + }, + ) + assert filtered_response.status_code == 200, filtered_response.content + filtered_payload = filtered_response.json() + assert filtered_payload["success"] is True + assert filtered_payload["result"]["removed_count"] == 1 + assert filtered_payload["result"]["removed_snapshot_ids"] == [str(keep.id)] + assert filtered_payload["result"]["not_removed_count"] == 0 + assert not Snapshot.objects.filter(pk=keep.pk).exists() + assert not keep_dir.exists() + assert Snapshot.objects.filter(pk=wrong_status.pk).exists() + assert Snapshot.objects.filter(pk=other_crawl.pk).exists() + assert wrong_status_dir.exists() + assert other_crawl_dir.exists() + + +def test_cli_remove_api_reports_timeout_and_clamps_timeout_to_sixty_seconds(client, tmp_path, api_admin_user, api_headers): + init_archive(tmp_path) + crawl = _crawl(api_admin_user, "https://example.com/remove-timeout-0", "timeout") + snapshots, output_dirs_by_id = _bulk_timeout_snapshots(crawl) + + timeout_response = _post_remove( + client, + api_headers, + { + "filter_type": "substring", + "filter_patterns": ["remove-timeout-"], + "timeout": 3, + }, + ) + assert timeout_response.status_code == 200, 
timeout_response.content + timeout_payload = timeout_response.json() + assert timeout_payload["success"] is False + assert timeout_payload["errors"] + assert set(timeout_payload["result"]) == { + "removed_count", + "removed_snapshot_ids", + "not_removed_count", + "not_removed_snapshot_ids", + "success", + "error", + "timeout", + } + assert timeout_payload["result"]["success"] is False + assert timeout_payload["result"]["timeout"] == 3.0 + assert timeout_payload["result"]["error"] + assert timeout_payload["result"]["removed_count"] == len(timeout_payload["result"]["removed_snapshot_ids"]) + assert timeout_payload["result"]["not_removed_count"] == len(timeout_payload["result"]["not_removed_snapshot_ids"]) + assert timeout_payload["result"]["removed_count"] > 0 + assert timeout_payload["result"]["not_removed_count"] > 0 + assert timeout_payload["result"]["removed_count"] + timeout_payload["result"]["not_removed_count"] == len(snapshots) + + removed_ids = set(timeout_payload["result"]["removed_snapshot_ids"]) + not_removed_ids = set(timeout_payload["result"]["not_removed_snapshot_ids"]) + assert Snapshot.objects.filter(url__icontains="remove-timeout-").count() == len(not_removed_ids) + assert removed_ids & set(output_dirs_by_id) + assert not_removed_ids & set(output_dirs_by_id) + for snapshot_id in removed_ids & set(output_dirs_by_id): + assert not Snapshot.objects.filter(pk=snapshot_id).exists() + assert not output_dirs_by_id[snapshot_id].exists() + for snapshot_id in not_removed_ids & set(output_dirs_by_id): + assert Snapshot.objects.filter(pk=snapshot_id).exists() + assert output_dirs_by_id[snapshot_id].exists() + + clamp_snapshot = _snapshot(crawl, "https://example.com/remove-timeout-clamp") + clamp_dir = _touch_output(clamp_snapshot) + clamp_response = _post_remove( + client, + api_headers, + { + "filter_type": "exact", + "filter_patterns": [clamp_snapshot.url], + "timeout": 999, + }, + ) + assert clamp_response.status_code == 200, clamp_response.content + 
clamp_payload = clamp_response.json() + assert clamp_payload["success"] is True + assert clamp_payload["result"]["timeout"] == 60.0 + assert clamp_payload["result"]["removed_snapshot_ids"] == [str(clamp_snapshot.id)] + assert not Snapshot.objects.filter(pk=clamp_snapshot.pk).exists() + assert not clamp_dir.exists() diff --git a/archivebox/tests/test_api_v1_cli_schedule.py b/archivebox/tests/test_api_v1_cli_schedule.py new file mode 100644 index 0000000000..97e919619b --- /dev/null +++ b/archivebox/tests/test_api_v1_cli_schedule.py @@ -0,0 +1,71 @@ +from io import StringIO + +import pytest +import requests +from django.test import RequestFactory + +from archivebox.api.v1_cli import ScheduleCommandSchema, cli_schedule +from archivebox.crawls.models import CrawlSchedule +from .conftest import ( + api_auth_headers, + cli_env, + create_admin_and_token, + get_free_port, + init_archive, + start_archivebox_server, + stop_server, + wait_for_http, +) + + +@pytest.mark.django_db +def test_schedule_api_creates_schedule_via_view_request(api_admin_user): + request = RequestFactory().post("/api/v1/cli/schedule") + request.user = api_admin_user + setattr(request, "stdout", StringIO()) + setattr(request, "stderr", StringIO()) + args = ScheduleCommandSchema( + every="daily", + import_path="https://example.com/feed.xml", + quiet=True, + ) + + response = cli_schedule(request, args) + + assert response["success"] is True + assert response["result_format"] == "json" + assert CrawlSchedule.objects.count() == 1 + assert len(response["result"]["created_schedule_ids"]) == 1 + + +@pytest.mark.django_db(transaction=True) +@pytest.mark.timeout(180) +def test_api_v1_cli_schedule_creates_schedule_over_server(tmp_path, recursive_test_site): + init_archive(tmp_path) + + port = get_free_port() + env = cli_env(port=port, server=True) + api_token = create_admin_and_token(tmp_path) + + try: + start_archivebox_server(tmp_path, env=env, port=port) + wait_for_http(port, 
host=f"api.archivebox.localhost:{port}", path="/api/v1/docs") + + response = requests.post( + f"http://127.0.0.1:{port}/api/v1/cli/schedule", + headers=api_auth_headers(api_token, port=port), + json={ + "every": "daily", + "import_path": recursive_test_site["root_url"], + "quiet": True, + }, + timeout=10, + ) + + assert response.status_code == 200, response.text + payload = response.json() + assert payload["success"] is True + assert payload["result_format"] == "json" + assert len(payload["result"]["created_schedule_ids"]) == 1 + finally: + stop_server(tmp_path) diff --git a/archivebox/tests/test_api_v1_cli_search.py b/archivebox/tests/test_api_v1_cli_search.py new file mode 100644 index 0000000000..7fa60b7410 --- /dev/null +++ b/archivebox/tests/test_api_v1_cli_search.py @@ -0,0 +1,31 @@ +import pytest + +from archivebox.core.models import Snapshot +from archivebox.crawls.models import Crawl +from .conftest import api_client_request + + +pytestmark = pytest.mark.django_db(transaction=True) + + +def test_basic_success_case_request(client, tmp_path, api_admin_user, api_headers): + crawl = Crawl.objects.create(urls="https://example.com/api-cli-search-basic", created_by=api_admin_user) + Snapshot.objects.create(url="https://example.com/api-cli-search-basic", crawl=crawl) + + response = api_client_request( + client, + "post", + "/api/v1/cli/search", + payload={ + "filter_patterns": ["https://example.com/api-cli-search-basic"], + "filter_type": "exact", + "as_json": True, + "as_html": False, + "as_csv": "", + "with_headers": False, + }, + headers=api_headers, + ) + + assert response.status_code == 200, response.content + assert response.json()["success"] is True diff --git a/archivebox/tests/test_api_v1_cli_update.py b/archivebox/tests/test_api_v1_cli_update.py new file mode 100644 index 0000000000..23d7557652 --- /dev/null +++ b/archivebox/tests/test_api_v1_cli_update.py @@ -0,0 +1,147 @@ +import json + +import pytest + +from .conftest import ( + api_client_request, + 
cli_env, + create_admin_and_token, + get_free_port, + init_archive, + live_api_request, + parse_jsonl_output, + run_archivebox_cmd, + start_archivebox_server, + stop_server, + wait_for_live_api, +) + +pytestmark = pytest.mark.django_db(transaction=True) + + +def test_cli_update_api_accepts_empty_json_without_traceback(client, tmp_path, api_headers): + init_archive(tmp_path) + + try: + response = api_client_request( + client, + "post", + "/api/v1/cli/update", + payload={}, + headers=api_headers, + ) + finally: + stop_server(tmp_path) + + assert response.status_code == 200, response.content + payload = response.json() + assert payload["success"] is True + assert "Traceback" not in response.content.decode() + + +@pytest.mark.timeout(180) +def test_cli_update_api_supports_all_snapshot_list_filters_with_real_rows(tmp_path): + env = cli_env(disable_extractors=True) + init_archive(tmp_path) + + records = [ + { + "type": "Snapshot", + "url": "https://alpha.example.com/articles/needle", + "title": "Needle Alpha", + "tags": "api-keep", + "timestamp": "1700000000", + "bookmarked_at": "2023-11-14T22:13:20+00:00", + }, + { + "type": "Snapshot", + "url": "https://beta.example.org/posts/haystack", + "title": "Haystack Beta", + "tags": "api-other", + "timestamp": "1710000000", + "bookmarked_at": "2024-03-09T16:00:00+00:00", + }, + { + "type": "Snapshot", + "url": "https://docs.archivebox.io/manual", + "title": "Manual Gamma", + "tags": "api-docs", + "timestamp": "1720000000", + "bookmarked_at": "2024-07-03T09:46:40+00:00", + }, + ] + stdin = "\n".join(json.dumps(record) for record in records) + "\n" + run_archivebox_cmd(["snapshot", "create"], cwd=tmp_path, stdin=stdin, env=env, check=True) + list_result = run_archivebox_cmd(["snapshot", "list", "--sort", "timestamp"], cwd=tmp_path, env=env, check=True) + initial_snapshots = {record["url"]: record for record in parse_jsonl_output(list_result.stdout) if record.get("type") == "Snapshot"} + alpha = 
initial_snapshots["https://alpha.example.com/articles/needle"] + run_archivebox_cmd( + ["snapshot", "update", "--status=paused"], + cwd=tmp_path, + stdin=json.dumps(alpha), + env=env, + check=True, + ) + + port = get_free_port() + env = { + **cli_env(port=port, server=True, PUBLIC_INDEX="True"), + **env, + } + api_token = create_admin_and_token(tmp_path) + + def assert_update_filter(label, body, expected_records): + response = live_api_request( + port, + "post", + "/api/v1/cli/update", + api_token=api_token, + json={**body, "batch_size": 100, "migrate_only": True}, + timeout=30, + ) + assert response.status_code == 200, f"{label}: {response.text}" + assert "Traceback" not in response.text + payload = response.json() + assert payload["success"] is True, label + expected_ids = {record["id"] for record in expected_records} + assert set(payload["result"]["snapshot_ids"]) == expected_ids, label + assert payload["result"]["matched_count"] == len(expected_ids), label + + try: + start_archivebox_server(tmp_path, env=env, port=port) + wait_for_live_api(port) + list_result = run_archivebox_cmd(["snapshot", "list", "--sort", "timestamp"], cwd=tmp_path, env=env, check=True) + snapshots = {record["url"]: record for record in parse_jsonl_output(list_result.stdout) if record.get("type") == "Snapshot"} + alpha = snapshots["https://alpha.example.com/articles/needle"] + beta = snapshots["https://beta.example.org/posts/haystack"] + gamma = snapshots["https://docs.archivebox.io/manual"] + status_result = run_archivebox_cmd( + ["snapshot", "list", "--status", alpha["status"]], + cwd=tmp_path, + env=env, + check=True, + ) + status_records = [record for record in parse_jsonl_output(status_result.stdout) if record.get("type") == "Snapshot"] + + cases = [ + ("status", {"status": alpha["status"]}, status_records), + ("filter_type exact", {"filter_type": "exact", "filter_patterns": [alpha["url"]]}, [alpha]), + ("filter_type substring", {"filter_type": "substring", "filter_patterns": 
["needle"]}, [alpha]), + ("filter_type regex", {"filter_type": "regex", "filter_patterns": [r"alpha\.example\.com/.+needle"]}, [alpha]), + ("filter_type domain", {"filter_type": "domain", "filter_patterns": ["alpha.example.com"]}, [alpha]), + ("filter_type tag", {"filter_type": "tag", "filter_patterns": ["api-keep"]}, [alpha]), + ("filter_type timestamp", {"filter_type": "timestamp", "filter_patterns": [alpha["timestamp"]]}, [alpha]), + ("url__icontains", {"url__icontains": "needle"}, [alpha]), + ("url__istartswith", {"url__istartswith": "https://alpha.example.com"}, [alpha]), + ("tag", {"tag": "api-keep"}, [alpha]), + ("crawl_id", {"crawl_id": alpha["crawl_id"]}, [alpha]), + ("limit and sort", {"limit": 1, "sort": "timestamp"}, [alpha]), + ("search", {"search": "meta", "filter_patterns": ["Needle Alpha"]}, [alpha]), + ("before", {"before": 1715000000}, [alpha, beta]), + ("after", {"after": 1715000000}, [gamma]), + ("resume", {"resume": beta["timestamp"]}, [alpha, beta]), + ] + for label, body, expected_records in cases: + assert_update_filter(label, body, expected_records) + finally: + stop_server(tmp_path) diff --git a/archivebox/tests/test_api_v1_cli_workflow_add_search_update_remove.py b/archivebox/tests/test_api_v1_cli_workflow_add_search_update_remove.py new file mode 100644 index 0000000000..175e4dd9c1 --- /dev/null +++ b/archivebox/tests/test_api_v1_cli_workflow_add_search_update_remove.py @@ -0,0 +1,161 @@ +import time + +import pytest + +from archivebox.core.models import Snapshot +from archivebox.crawls.models import Crawl +from archivebox.tests.test_orm_helpers import use_archivebox_db +from .conftest import ( + cli_env, + create_admin_and_token, + get_free_port, + init_archive, + live_api_request, + start_archivebox_server, + stop_server, + wait_for_live_api, +) + +pytestmark = pytest.mark.django_db(transaction=True) + + +@pytest.mark.timeout(180) +def test_cli_api_add_search_update_remove_over_server(tmp_path): + init_archive(tmp_path) + + port = 
get_free_port() + env = cli_env(port=port, server=True, PUBLIC_INDEX="True") + api_token = create_admin_and_token(tmp_path) + target_url = "https://example.com/" + + try: + start_archivebox_server(tmp_path, env=env, port=port) + wait_for_live_api(port) + + add_response = live_api_request( + port, + "post", + "/api/v1/cli/add", + api_token=api_token, + json={ + "urls": [target_url], + "tag": "api-cli", + "depth": 0, + "parser": "url_list", + "plugins": "parse_txt_urls,wget", + "update": True, + "overwrite": False, + "index_only": True, + }, + timeout=10, + ) + assert add_response.status_code == 200, add_response.text + add_payload = add_response.json() + assert add_payload["success"] is True + assert add_payload["result_format"] == "json" + assert add_payload["result"]["num_snapshots"] == 0 + crawl_id = add_payload["result"]["crawl_id"] + assert add_payload["result"]["snapshot_ids"] == [] + stop_server(tmp_path) + from archivebox.services.runner import run_crawl + + with use_archivebox_db(tmp_path): + run_crawl(crawl_id, show_progress=False) + start_archivebox_server(tmp_path, env=env, port=port) + wait_for_live_api(port) + + deadline = time.time() + 180 + snapshot_id = None + snapshot_status = None + while time.time() < deadline: + with use_archivebox_db(tmp_path): + snapshot = Snapshot.objects.filter(crawl_id=crawl_id, url=target_url).first() + if snapshot is not None: + snapshot_id = str(snapshot.id) + snapshot_status = snapshot.status + break + time.sleep(1) + assert snapshot_id is not None + assert snapshot_status is not None + + search_response = live_api_request( + port, + "post", + "/api/v1/cli/search", + api_token=api_token, + json={ + "filter_patterns": [target_url], + "filter_type": "exact", + "status": snapshot_status, + "sort": "bookmarked_at", + "as_json": True, + "as_html": False, + "as_csv": "", + "with_headers": False, + }, + timeout=10, + ) + assert search_response.status_code == 200, search_response.text + search_payload = search_response.json() + 
assert search_payload["success"] is True + assert search_payload["result_format"] == "json" + assert any(item["url"] == target_url for item in search_payload["result"]) + + update_response = live_api_request( + port, + "post", + "/api/v1/cli/update", + api_token=api_token, + json={ + "resume": None, + "after": 0, + "before": 4102444800, + "filter_type": "exact", + "filter_patterns": [target_url], + "batch_size": 1, + "continuous": False, + "migrate_only": True, + }, + timeout=20, + ) + assert update_response.status_code == 200, update_response.text + assert update_response.json()["success"] is True + stop_server(tmp_path) + start_archivebox_server(tmp_path, env=env, port=port) + wait_for_live_api(port) + + with use_archivebox_db(tmp_path): + crawl_obj = Crawl.objects.filter(pk=crawl_id).first() + crawl = (crawl_obj.max_depth, crawl_obj.tags_str, crawl_obj.config) if crawl_obj else None + + assert crawl is not None + assert crawl[0] == 1 + assert crawl[1] == "api-cli" + assert crawl[2]["INDEX_ONLY"] is True + + remove_response = live_api_request( + port, + "post", + "/api/v1/cli/remove", + api_token=api_token, + json={ + "delete": True, + "after": 0, + "before": 4102444800, + "filter_type": "exact", + "filter_patterns": [target_url], + }, + timeout=20, + ) + assert remove_response.status_code == 200, remove_response.text + remove_payload = remove_response.json() + assert remove_payload["success"] is True + assert remove_payload["result"]["removed_count"] == 1 + assert snapshot_id in remove_payload["result"]["removed_snapshot_ids"] + + with use_archivebox_db(tmp_path): + snapshot_count = Snapshot.objects.filter(pk=snapshot_id).count() + + assert snapshot_count == 0 + finally: + stop_server(tmp_path) diff --git a/archivebox/tests/test_api_v1_core_any_id.py b/archivebox/tests/test_api_v1_core_any_id.py new file mode 100644 index 0000000000..da4efe39ea --- /dev/null +++ b/archivebox/tests/test_api_v1_core_any_id.py @@ -0,0 +1,16 @@ +import pytest + +from 
archivebox.core.models import Snapshot +from archivebox.crawls.models import Crawl + + +pytestmark = pytest.mark.django_db(transaction=True) + + +def test_basic_success_case_request(client, tmp_path, api_admin_user, api_headers): + crawl = Crawl.objects.create(urls="https://example.com/any", created_by=api_admin_user) + snapshot = Snapshot.objects.create(url="https://example.com/any", crawl=crawl) + + response = client.get(f"/api/v1/core/any/{snapshot.id}", follow=True, **api_headers) + + assert response.status_code == 200, response.content diff --git a/archivebox/tests/test_api_v1_core_archiveresult_archiveresult_id.py b/archivebox/tests/test_api_v1_core_archiveresult_archiveresult_id.py new file mode 100644 index 0000000000..43285a2cb8 --- /dev/null +++ b/archivebox/tests/test_api_v1_core_archiveresult_archiveresult_id.py @@ -0,0 +1,69 @@ +import pytest +from django.core.files.uploadedfile import SimpleUploadedFile +from django.test.client import BOUNDARY, MULTIPART_CONTENT, encode_multipart + +from archivebox.core.models import ArchiveResult, Snapshot +from archivebox.crawls.models import Crawl + + +pytestmark = pytest.mark.django_db(transaction=True) + + +def test_basic_success_case_request(client, tmp_path, api_admin_user, api_headers): + crawl = Crawl.objects.create(urls="https://example.com/archiveresult-detail", created_by=api_admin_user) + snapshot = Snapshot.objects.create(url="https://example.com/archiveresult-detail", crawl=crawl) + result = ArchiveResult.objects.create( + snapshot=snapshot, + plugin="api-basic", + hook_name="on_Snapshot__api_basic", + status=ArchiveResult.StatusChoices.SUCCEEDED, + output_str="ok", + ) + + response = client.get(f"/api/v1/core/archiveresult/{result.id}", **api_headers) + + assert response.status_code == 200, response.content + + +def test_archiveresult_patch_upload_finalizes_queued_result(client, api_admin_user, api_headers): + crawl = Crawl.objects.create( + urls="https://example.com", + created_by=api_admin_user, + 
status=Crawl.StatusChoices.SEALED, + retry_at=None, + ) + snapshot = Snapshot.objects.create( + url="https://example.com/upload-patch", + crawl=crawl, + status=Snapshot.StatusChoices.SEALED, + retry_at=None, + ) + result = ArchiveResult.objects.create( + snapshot=snapshot, + plugin="dom", + hook_name="on_Snapshot__archivebox_browser_extension_upload", + status=ArchiveResult.StatusChoices.QUEUED, + ) + + response = client.generic( + "PATCH", + f"/api/v1/core/archiveresult/{result.id}", + encode_multipart( + BOUNDARY, + { + "files": SimpleUploadedFile("output.html", b"uploaded", content_type="text/html"), + "output_paths": "output.html", + "output_str": "output.html", + }, + ), + content_type=MULTIPART_CONTENT, + **api_headers, + ) + assert response.status_code == 200, response.content + + result.refresh_from_db() + snapshot.refresh_from_db() + assert result.status == ArchiveResult.StatusChoices.SUCCEEDED + assert result.output_str == "output.html" + assert snapshot.status == Snapshot.StatusChoices.SEALED + assert snapshot.retry_at is not None diff --git a/archivebox/tests/test_api_v1_core_archiveresults.py b/archivebox/tests/test_api_v1_core_archiveresults.py new file mode 100644 index 0000000000..09c3af9e7a --- /dev/null +++ b/archivebox/tests/test_api_v1_core_archiveresults.py @@ -0,0 +1,170 @@ +from datetime import timedelta + +import pytest +from django.db import connection +from django.test.utils import CaptureQueriesContext +from django.utils import timezone + +from archivebox.core.models import ArchiveResult, Snapshot +from archivebox.crawls.models import Crawl +from archivebox.tests.conftest import api_client_request + + +pytestmark = pytest.mark.django_db(transaction=True) + + +def test_archiveresult_upload_api_queues_snapshot_maintenance_without_finalizing(client, api_admin_user, api_headers): + crawl = Crawl.objects.create( + urls="https://example.com", + created_by=api_admin_user, + status=Crawl.StatusChoices.STARTED, + retry_at=timezone.now(), + ) + 
active_retry_at = timezone.now() + timedelta(minutes=5) + active_snapshot = Snapshot.objects.create( + url="https://example.com/active", + crawl=crawl, + status=Snapshot.StatusChoices.STARTED, + retry_at=active_retry_at, + ) + sealed_snapshot = Snapshot.objects.create( + url="https://example.com/sealed", + crawl=crawl, + status=Snapshot.StatusChoices.SEALED, + retry_at=None, + ) + + active_response = client.post( + "/api/v1/core/archiveresults", + { + "snapshot_id": str(active_snapshot.id), + "plugin": "chrome_extension_dom", + "hook_name": "on_Snapshot__archivebox_browser_extension_upload", + "status": ArchiveResult.StatusChoices.SUCCEEDED, + "output_str": "uploaded active snapshot output", + }, + **api_headers, + ) + assert active_response.status_code == 200, active_response.content + active_snapshot.refresh_from_db() + assert active_snapshot.status == Snapshot.StatusChoices.STARTED + assert active_snapshot.retry_at == active_retry_at + assert active_snapshot.downloaded_at is not None + + sealed_response = client.post( + "/api/v1/core/archiveresults", + { + "snapshot_id": str(sealed_snapshot.id), + "plugin": "chrome_extension_mhtml", + "hook_name": "on_Snapshot__archivebox_browser_extension_upload", + "status": ArchiveResult.StatusChoices.SUCCEEDED, + "output_str": "uploaded sealed snapshot output", + }, + **api_headers, + ) + assert sealed_response.status_code == 200, sealed_response.content + sealed_snapshot.refresh_from_db() + assert sealed_snapshot.status == Snapshot.StatusChoices.SEALED + assert sealed_snapshot.retry_at is not None + assert sealed_snapshot.downloaded_at is not None + + +def test_archiveresults_api_limit_uses_exact_count_without_full_row_distinct(client, api_headers): + snapshot_response = api_client_request( + client, + "post", + "/api/v1/core/snapshots", + payload={ + "url": "https://example.com/archive-result-pagination", + "title": "ArchiveResult pagination", + "status": Snapshot.StatusChoices.QUEUED, + }, + headers=api_headers, + ) + 
assert snapshot_response.status_code == 200, snapshot_response.content + snapshot_id = snapshot_response.json()["id"] + + for plugin_name in ("dom", "screenshot"): + result_response = client.post( + "/api/v1/core/archiveresults", + { + "snapshot_id": snapshot_id, + "plugin": plugin_name, + "hook_name": f"on_Snapshot__test_{plugin_name}", + "status": ArchiveResult.StatusChoices.SUCCEEDED, + "output_str": f"{plugin_name} output", + }, + **api_headers, + ) + assert result_response.status_code == 200, result_response.content + + total_archiveresults = ArchiveResult.objects.count() + with CaptureQueriesContext(connection) as captured_queries: + response = client.get( + "/api/v1/core/archiveresults?limit=1", + **api_headers, + ) + + assert response.status_code == 200, response.content + payload = response.json() + assert payload["count"] == total_archiveresults + assert payload["total_items"] == total_archiveresults + assert payload["limit"] == 1 + assert payload["num_items"] == 1 + + count_queries = [ + query["sql"] for query in captured_queries if "COUNT" in query["sql"].upper() and '"core_archiveresult"' in query["sql"] + ] + assert count_queries + assert not any("SELECT DISTINCT" in query.upper() for query in count_queries), count_queries + + +def test_archiveresults_api_join_filters_count_distinct_primary_keys(client, api_headers): + snapshot_response = api_client_request( + client, + "post", + "/api/v1/core/snapshots", + payload={ + "url": "https://example.com/archive-result-tag-pagination", + "title": "ArchiveResult tag pagination", + "tags": ["api-tag-pagination-one", "api-tag-pagination-two"], + "status": Snapshot.StatusChoices.QUEUED, + }, + headers=api_headers, + ) + assert snapshot_response.status_code == 200, snapshot_response.content + snapshot_id = snapshot_response.json()["id"] + + result_response = client.post( + "/api/v1/core/archiveresults", + { + "snapshot_id": snapshot_id, + "plugin": "dom", + "hook_name": "on_Snapshot__test_tag_pagination", + 
"status": ArchiveResult.StatusChoices.SUCCEEDED, + "output_str": "tag pagination output", + }, + **api_headers, + ) + assert result_response.status_code == 200, result_response.content + + with CaptureQueriesContext(connection) as captured_queries: + response = client.get( + "/api/v1/core/archiveresults?search=api-tag-pagination&limit=1", + **api_headers, + ) + + assert response.status_code == 200, response.content + payload = response.json() + assert payload["count"] == 1 + assert payload["total_items"] == 1 + assert payload["num_items"] == 1 + assert [item["id"] for item in payload["items"]] == [result_response.json()["id"]] + + count_queries = [ + query["sql"] for query in captured_queries if "COUNT" in query["sql"].upper() and '"core_archiveresult"' in query["sql"] + ] + assert count_queries + assert any("SELECT DISTINCT" in query.upper() for query in count_queries), count_queries + assert not any('"core_archiveresult"."output_files" AS' in query for query in count_queries), count_queries + assert not any('"core_archiveresult"."notes" AS' in query for query in count_queries), count_queries diff --git a/archivebox/tests/test_api_v1_core_snapshot_snapshot_id.py b/archivebox/tests/test_api_v1_core_snapshot_snapshot_id.py new file mode 100644 index 0000000000..4d3e75a228 --- /dev/null +++ b/archivebox/tests/test_api_v1_core_snapshot_snapshot_id.py @@ -0,0 +1,497 @@ +import json +import time +from pathlib import Path + +import pytest +from django.utils import timezone + +from archivebox.core.models import ArchiveResult, Snapshot +from archivebox.crawls.models import Crawl +from archivebox.tests.conftest import run_archivebox_cmd +from archivebox.tests.test_orm_helpers import use_archivebox_db +from archivebox.workers.models import RETRY_AT_MAX + +from .conftest import ( + api_client_request, + cli_env, + create_admin_and_token, + get_crawl_runtime_state, + get_free_port, + init_archive, + live_api_request, + start_archivebox_server, + stop_server, + 
wait_for_live_api, + wait_for_snapshot_capture, +) + + +pytestmark = pytest.mark.django_db(transaction=True) + + +def _seed_archiveresult( + snapshot: Snapshot, + *, + plugin: str, + hook_name: str, + status: str, + output_text: str = "", + output_path: str | None = None, +) -> ArchiveResult: + output_files = {} + output_size = 0 + output_mimetypes = "" + if output_path is not None: + output_bytes = output_text.encode() + absolute_path = Path(snapshot.output_dir) / output_path + absolute_path.parent.mkdir(parents=True, exist_ok=True) + absolute_path.write_bytes(output_bytes) + output_size = len(output_bytes) + output_mimetypes = "text/plain" + output_files[output_path] = { + "extension": Path(output_path).suffix.lstrip("."), + "mimetype": "text/plain", + "size": output_size, + } + + now = timezone.now() + return ArchiveResult.objects.create( + snapshot=snapshot, + plugin=plugin, + hook_name=hook_name, + status=status, + output_str=output_path or output_text, + output_files=output_files, + output_size=output_size, + output_mimetypes=output_mimetypes, + start_ts=now if status != ArchiveResult.StatusChoices.QUEUED else None, + end_ts=now if status in ArchiveResult.FINAL_STATES else None, + ) + + +def _snapshot_hook_name(plugin_name: str) -> str: + from abx_dl.models import discover_plugins + + plugin = discover_plugins().get(plugin_name) + assert plugin is not None, f"missing test plugin {plugin_name}" + hooks = plugin.filter_hooks("Snapshot") + assert hooks, f"missing Snapshot hooks for {plugin_name}" + return hooks[0].name + + +def _snapshot_state(cwd: Path, url: str) -> dict[str, object]: + with use_archivebox_db(cwd): + snapshot = Snapshot.objects.select_related("crawl", "crawl__created_by").get(url=url) + snapshot_dir = Path(snapshot.output_dir) + crawl_dir = Path(snapshot.crawl.output_dir) + crawl_link = crawl_dir / "snapshots" / Snapshot.extract_domain_from_url(snapshot.url) / str(snapshot.id) + results = list( + ArchiveResult.objects.filter(snapshot=snapshot) 
+ .order_by("plugin", "hook_name") + .values("plugin", "hook_name", "status", "output_files", "output_size"), + ) + return { + "id": str(snapshot.id), + "crawl_id": str(snapshot.crawl_id), + "status": snapshot.status, + "retry_at": snapshot.retry_at, + "downloaded_at": snapshot.downloaded_at, + "output_size": snapshot.output_size, + "snapshot_dir": snapshot_dir, + "crawl_dir": crawl_dir, + "crawl_link": crawl_link, + "results": results, + } + + +def _paused_snapshot_state(cwd: Path, snapshot_id: str) -> dict[str, object]: + with use_archivebox_db(cwd): + snapshot = Snapshot.objects.select_related("crawl").get(id=snapshot_id) + succeeded_results = ArchiveResult.objects.filter(snapshot=snapshot, status=ArchiveResult.StatusChoices.SUCCEEDED).count() + return { + "status": snapshot.status, + "retry_at": snapshot.retry_at, + "crawl_status": snapshot.crawl.status, + "succeeded_results": succeeded_results, + "snapshot_dir": Path(snapshot.output_dir), + } + + +def _wait_for_paused_scheduler_marker(cwd: Path, snapshot_id: str, timeout: int = 60) -> dict[str, object]: + deadline = time.time() + timeout + last_state: dict[str, object] = {} + while time.time() < deadline: + last_state = _paused_snapshot_state(cwd, snapshot_id) + if last_state["status"] == Snapshot.StatusChoices.PAUSED and last_state["retry_at"] == RETRY_AT_MAX: + return last_state + if last_state["status"] == Snapshot.StatusChoices.SEALED: + return last_state + time.sleep(1) + raise AssertionError(f"paused snapshot did not settle back to retry_at=MAX: {last_state}") + + +def _wait_for_crawl_snapshot_rows(cwd: Path, crawl_id: str, timeout: int = 45) -> dict[str, object]: + deadline = time.time() + timeout + latest_state: dict[str, object] | None = None + while time.time() < deadline: + latest_state = get_crawl_runtime_state(cwd, crawl_id) + if latest_state["snapshots"]: + return latest_state + time.sleep(0.2) + raise AssertionError(f"timed out waiting for snapshot rows for crawl {crawl_id}: {latest_state}") + + 
+def test_basic_success_case_request(client, tmp_path, api_admin_user, api_headers): + crawl = Crawl.objects.create(urls="https://example.com/snapshot-detail", created_by=api_admin_user) + snapshot = Snapshot.objects.create(url="https://example.com/snapshot-detail", crawl=crawl) + + response = client.get(f"/api/v1/core/snapshot/{snapshot.id}", **api_headers) + + assert response.status_code == 200, response.content + + +def test_snapshot_pause_resume_api_cascades_active_archiveresults_and_preserves_finished_rows( + tmp_path, + client, + recursive_test_site, +): + init_archive(tmp_path) + api_token = create_admin_and_token(tmp_path) + + with use_archivebox_db(tmp_path): + create_response = api_client_request( + client, + "post", + "/api/v1/core/snapshots", + api_token=api_token, + payload={ + "url": recursive_test_site["root_url"], + "depth": 0, + "title": "Snapshot pause target", + "tags": ["snapshot-pause-e2e"], + "status": "queued", + }, + ) + assert create_response.status_code == 200, create_response.content.decode() + snapshot_id = json.loads(create_response.content.decode())["id"] + snapshot = Snapshot.objects.get(id=snapshot_id) + + queued_result = _seed_archiveresult( + snapshot, + plugin="manualqueue", + hook_name="on_Snapshot__manual_queue", + status=ArchiveResult.StatusChoices.QUEUED, + ) + started_result = _seed_archiveresult( + snapshot, + plugin="manualstart", + hook_name="on_Snapshot__manual_start", + status=ArchiveResult.StatusChoices.STARTED, + ) + succeeded_result = _seed_archiveresult( + snapshot, + plugin="manualdone", + hook_name="on_Snapshot__manual_done", + status=ArchiveResult.StatusChoices.SUCCEEDED, + output_text="finished result should stay finished", + output_path="manualdone/final.txt", + ) + failed_result = _seed_archiveresult( + snapshot, + plugin="manualfail", + hook_name="on_Snapshot__manual_fail", + status=ArchiveResult.StatusChoices.FAILED, + output_text="failed result should stay failed", + ) + + invalid_response = 
api_client_request( + client, + "patch", + f"/api/v1/core/snapshot/{snapshot_id}", + api_token=api_token, + payload={"action": "hold"}, + ) + assert invalid_response.status_code == 400 + snapshot = Snapshot.objects.get(id=snapshot_id) + assert snapshot.status == Snapshot.StatusChoices.QUEUED + + pause_response = api_client_request( + client, + "patch", + f"/api/v1/core/snapshot/{snapshot_id}", + api_token=api_token, + payload={"action": "pause"}, + ) + assert pause_response.status_code == 200, pause_response.content.decode() + assert json.loads(pause_response.content.decode())["status"] == Snapshot.StatusChoices.PAUSED + + snapshot.refresh_from_db() + crawl = Crawl.objects.get(id=snapshot.crawl_id) + assert snapshot.status == Snapshot.StatusChoices.PAUSED + assert snapshot.retry_at == RETRY_AT_MAX + assert crawl.status == Crawl.StatusChoices.QUEUED + + active_rows = { + row.plugin: (row.status, row.retry_at) for row in ArchiveResult.objects.filter(id__in=[queued_result.id, started_result.id]) + } + assert active_rows == { + "manualqueue": (ArchiveResult.StatusChoices.PAUSED, RETRY_AT_MAX), + "manualstart": (ArchiveResult.StatusChoices.PAUSED, RETRY_AT_MAX), + } + + finished_rows = { + row.plugin: (row.status, row.retry_at, row.output_size) + for row in ArchiveResult.objects.filter(id__in=[succeeded_result.id, failed_result.id]) + } + assert finished_rows["manualdone"][0] == ArchiveResult.StatusChoices.SUCCEEDED + assert finished_rows["manualdone"][1] is None + assert finished_rows["manualdone"][2] == len("finished result should stay finished") + assert finished_rows["manualfail"] == (ArchiveResult.StatusChoices.FAILED, None, 0) + + succeeded_row = ArchiveResult.objects.get(id=succeeded_result.id) + output_path = Path(snapshot.output_dir) / next(iter(succeeded_row.output_files)) + assert output_path.read_text() == "finished result should stay finished" + + resume_response = api_client_request( + client, + "patch", + f"/api/v1/core/snapshot/{snapshot_id}", + 
api_token=api_token, + payload={"action": "resume"}, + ) + assert resume_response.status_code == 200, resume_response.content.decode() + assert json.loads(resume_response.content.decode())["status"] == Snapshot.StatusChoices.QUEUED + + snapshot.refresh_from_db() + crawl.refresh_from_db() + assert snapshot.status == Snapshot.StatusChoices.QUEUED + assert snapshot.retry_at is not None + assert snapshot.retry_at != RETRY_AT_MAX + assert crawl.status == Crawl.StatusChoices.QUEUED + assert crawl.retry_at is not None + assert crawl.retry_at != RETRY_AT_MAX + + resumed_rows = { + row.plugin: (row.status, row.retry_at) for row in ArchiveResult.objects.filter(id__in=[queued_result.id, started_result.id]) + } + assert resumed_rows["manualqueue"][0] == ArchiveResult.StatusChoices.QUEUED + assert resumed_rows["manualqueue"][1] is not None + assert resumed_rows["manualqueue"][1] != RETRY_AT_MAX + assert resumed_rows["manualstart"][0] == ArchiveResult.StatusChoices.QUEUED + assert resumed_rows["manualstart"][1] is not None + assert resumed_rows["manualstart"][1] != RETRY_AT_MAX + + assert ArchiveResult.objects.get(id=succeeded_result.id).status == ArchiveResult.StatusChoices.SUCCEEDED + assert ArchiveResult.objects.get(id=failed_result.id).status == ArchiveResult.StatusChoices.FAILED + assert output_path.read_text() == "finished result should stay finished" + + +def test_targeted_extract_retries_one_failed_archiveresult_while_snapshot_stays_paused( + tmp_path, + client, + recursive_test_site, +): + init_archive(tmp_path) + api_token = create_admin_and_token(tmp_path) + + with use_archivebox_db(tmp_path): + snapshot_response = api_client_request( + client, + "post", + "/api/v1/core/snapshots", + api_token=api_token, + payload={ + "url": recursive_test_site["root_url"], + "depth": 0, + "title": "Paused targeted retry", + "tags": ["targeted-extract-pause"], + "status": "queued", + }, + ) + assert snapshot_response.status_code == 200, snapshot_response.content.decode() + snapshot_id 
= json.loads(snapshot_response.content.decode())["id"] + snapshot = Snapshot.objects.get(id=snapshot_id) + + wget_result = _seed_archiveresult( + snapshot, + plugin="wget", + hook_name=_snapshot_hook_name("wget"), + status=ArchiveResult.StatusChoices.FAILED, + output_text="initial failure before targeted retry", + ) + unrelated_result = _seed_archiveresult( + snapshot, + plugin="manualqueue", + hook_name="on_Snapshot__manual_queue", + status=ArchiveResult.StatusChoices.QUEUED, + ) + finished_result = _seed_archiveresult( + snapshot, + plugin="manualdone", + hook_name="on_Snapshot__manual_done", + status=ArchiveResult.StatusChoices.SUCCEEDED, + output_text="finished row must survive targeted retry", + output_path="manualdone/targeted.txt", + ) + + pause_response = api_client_request( + client, + "patch", + f"/api/v1/core/snapshot/{snapshot_id}", + api_token=api_token, + payload={"action": "pause"}, + ) + assert pause_response.status_code == 200, pause_response.content.decode() + assert json.loads(pause_response.content.decode())["status"] == Snapshot.StatusChoices.PAUSED + + snapshot = Snapshot.objects.get(id=snapshot_id) + assert snapshot.status == Snapshot.StatusChoices.PAUSED + assert snapshot.retry_at == RETRY_AT_MAX + assert ArchiveResult.objects.get(id=wget_result.id).status == ArchiveResult.StatusChoices.FAILED + assert ArchiveResult.objects.get(id=unrelated_result.id).status == ArchiveResult.StatusChoices.PAUSED + finished_row = ArchiveResult.objects.get(id=finished_result.id) + finished_output_path = Path(snapshot.output_dir) / next(iter(finished_row.output_files)) + assert finished_output_path.read_text() == "finished row must survive targeted retry" + + env = cli_env( + port=get_free_port(), + PLUGINS="wget", + SAVE_WGET="True", + WGET_WARC_ENABLED="False", + URL_ALLOWLIST=r"127\.0\.0\.1[:/].*", + ) + extract = run_archivebox_cmd( + ["extract", str(wget_result.id)], + cwd=tmp_path, + env=env, + timeout=150, + ) + assert extract.returncode == 0, 
f"STDOUT:\n{extract.stdout}\nSTDERR:\n{extract.stderr}" + + with use_archivebox_db(tmp_path): + snapshot = Snapshot.objects.get(id=snapshot_id) + assert snapshot.status == Snapshot.StatusChoices.PAUSED + assert snapshot.retry_at == RETRY_AT_MAX + + retried_wget = ArchiveResult.objects.get(id=wget_result.id) + assert retried_wget.status == ArchiveResult.StatusChoices.SUCCEEDED + assert retried_wget.output_size > 0 + assert retried_wget.output_files + + unrelated = ArchiveResult.objects.get(id=unrelated_result.id) + assert unrelated.status == ArchiveResult.StatusChoices.PAUSED + assert unrelated.retry_at == RETRY_AT_MAX + + finished = ArchiveResult.objects.get(id=finished_result.id) + assert finished.status == ArchiveResult.StatusChoices.SUCCEEDED + assert finished.retry_at is None + assert finished_output_path.read_text() == "finished row must survive targeted retry" + + +@pytest.mark.timeout(240) +def test_paused_snapshot_survives_server_restart_and_resumes_via_api(tmp_path, recursive_test_site): + init_archive(tmp_path) + + port = get_free_port() + env = cli_env(port=port, server=True, PLUGINS="wget", SAVE_WGET="True") + api_token = create_admin_and_token(tmp_path) + + try: + start_archivebox_server(tmp_path, env=env, port=port) + wait_for_live_api(port) + + crawl_response = live_api_request( + port, + "post", + "/api/v1/crawls/crawls", + api_token=api_token, + json={ + "urls": [recursive_test_site["root_url"]], + "max_depth": 0, + "tags": ["snapshot-pause-restart-e2e"], + "config": {"PLUGINS": "wget", "URL_ALLOWLIST": r"127\.0\.0\.1[:/].*"}, + }, + timeout=10, + ) + assert crawl_response.status_code == 200, crawl_response.text + crawl_id = crawl_response.json()["id"] + crawl_state = _wait_for_crawl_snapshot_rows(tmp_path, crawl_id) + snapshot_id = crawl_state["snapshots"][0]["id"] + + pause_response = live_api_request( + port, + "patch", + f"/api/v1/crawls/crawl/{crawl_id}", + api_token=api_token, + json={"action": "pause"}, + timeout=10, + ) + assert 
pause_response.status_code == 200, pause_response.text + + current_state = _paused_snapshot_state(tmp_path, snapshot_id) + if current_state["status"] == Snapshot.StatusChoices.SEALED: + assert current_state["succeeded_results"] > 0 + return + + paused_state = _wait_for_paused_scheduler_marker(tmp_path, snapshot_id) + if paused_state["status"] == Snapshot.StatusChoices.SEALED: + assert paused_state["succeeded_results"] > 0 + return + assert paused_state["succeeded_results"] == 0 + assert not list((paused_state["snapshot_dir"] / "wget").rglob("*.html")) + + stop_server(tmp_path) + start_archivebox_server(tmp_path, env=env, port=port) + wait_for_live_api(port) + + restarted_state = _wait_for_paused_scheduler_marker(tmp_path, snapshot_id) + assert restarted_state["status"] == Snapshot.StatusChoices.PAUSED + assert restarted_state["succeeded_results"] == 0 + + resume_response = live_api_request( + port, + "patch", + f"/api/v1/core/snapshot/{snapshot_id}", + api_token=api_token, + json={"action": "resume"}, + timeout=10, + ) + assert resume_response.status_code == 200, resume_response.text + assert resume_response.json()["status"] == Snapshot.StatusChoices.QUEUED + + captured_text = wait_for_snapshot_capture(tmp_path, recursive_test_site["root_url"], timeout=180) + assert "Root" in captured_text + assert "About" in captured_text + + final_state = _snapshot_state(tmp_path, recursive_test_site["root_url"]) + assert final_state["status"] == Snapshot.StatusChoices.SEALED + assert final_state["downloaded_at"] is not None + assert any( + result["plugin"] == "wget" and result["status"] == ArchiveResult.StatusChoices.SUCCEEDED for result in final_state["results"] + ) + finally: + stop_server(tmp_path) + + +def test_rest_snapshot_delete_removes_output_dir(client, api_headers): + url = "https://example.com/delete-path-snapshot" + + response = api_client_request( + client, + "post", + "/api/v1/core/snapshots", + payload={"url": url, "depth": 0, "status": 
Snapshot.StatusChoices.QUEUED}, + headers=api_headers, + ) + assert response.status_code == 200, response.content.decode() + + snapshot = Snapshot.objects.get(url=url) + snapshot_dir = Path(snapshot.output_dir) + snapshot_dir.mkdir(parents=True, exist_ok=True) + (snapshot_dir / "delete-path-test.txt").write_text("snapshot output") + assert snapshot_dir.exists() + + response = client.delete(f"/api/v1/core/snapshot/{snapshot.id}", **api_headers) + assert response.status_code == 200, response.content.decode() + assert not Snapshot.objects.filter(pk=snapshot.pk).exists() + assert not snapshot_dir.exists() diff --git a/archivebox/tests/test_api_v1_core_snapshots.py b/archivebox/tests/test_api_v1_core_snapshots.py new file mode 100644 index 0000000000..54edfa92ba --- /dev/null +++ b/archivebox/tests/test_api_v1_core_snapshots.py @@ -0,0 +1,46 @@ +import pytest + +from archivebox.core.models import Snapshot +from archivebox.crawls.models import Crawl + + +pytestmark = pytest.mark.django_db(transaction=True) + + +def test_snapshots_api_filters_status_column_and_rejects_legacy_status(client, api_admin_user, api_headers): + crawl = Crawl.objects.create( + urls="https://example.com", + created_by=api_admin_user, + status=Crawl.StatusChoices.SEALED, + retry_at=None, + ) + Snapshot.objects.create( + url="https://example.com/api-status-queued", + crawl=crawl, + status=Snapshot.StatusChoices.QUEUED, + ) + sealed_snapshot = Snapshot.objects.create( + url="https://example.com/api-status-sealed", + crawl=crawl, + status=Snapshot.StatusChoices.SEALED, + retry_at=None, + ) + + response = client.get( + "/api/v1/core/snapshots", + {"status": "sealed"}, + **api_headers, + ) + assert response.status_code == 200, response.content + payload = response.json() + items = payload["items"] if isinstance(payload, dict) and "items" in payload else payload + assert [item["id"] for item in items] == [str(sealed_snapshot.id)] + assert [item["status"] for item in items] == ["sealed"] + + 
legacy_response = client.get( + "/api/v1/core/snapshots", + {"status": "unarchived"}, + **api_headers, + ) + assert legacy_response.status_code == 400 + assert "Invalid snapshot status" in legacy_response.content.decode() diff --git a/archivebox/tests/test_api_v1_core_snapshots_rss.py b/archivebox/tests/test_api_v1_core_snapshots_rss.py new file mode 100644 index 0000000000..c5f2d1a54b --- /dev/null +++ b/archivebox/tests/test_api_v1_core_snapshots_rss.py @@ -0,0 +1,115 @@ +from datetime import datetime +from typing import cast + +import pytest +from django.contrib.auth import get_user_model +from django.contrib.auth.models import UserManager +from django.utils import timezone + + +pytestmark = pytest.mark.django_db + + +User = get_user_model() +ADMIN_HOST = "admin.archivebox.localhost:8000" + + +@pytest.fixture +def other_user(db): + return cast(UserManager, User.objects).create_user( + username="rssother", + email="rssother@test.com", + password="testpassword", + ) + + +def make_snapshot(*, user, url: str, title: str, bookmarked_at: datetime): + from archivebox.core.models import Snapshot + from archivebox.crawls.models import Crawl + + crawl = Crawl.objects.create(urls=url, created_by=user) + snapshot = Snapshot.objects.create( + url=url, + title=title, + crawl=crawl, + bookmarked_at=bookmarked_at, + ) + return crawl, snapshot + + +def test_snapshots_rss_filters_by_user_and_orders_newest_first(client, api_token, api_admin_user, other_user): + from archivebox.core.models import Tag + + older_at = timezone.make_aware(datetime(2026, 5, 22, 8, 0, 0)) + newer_at = timezone.make_aware(datetime(2026, 5, 23, 8, 0, 0)) + _crawl, older_snapshot = make_snapshot( + user=api_admin_user, + url="https://example.com/rss-older", + title="Older & Escaped", + bookmarked_at=older_at, + ) + make_snapshot( + user=api_admin_user, + url="https://example.com/rss-newer", + title="Newer Snapshot", + bookmarked_at=newer_at, + ) + make_snapshot( + user=other_user, + 
url="https://example.com/rss-other-user", + title="Other User", + bookmarked_at=timezone.make_aware(datetime(2026, 5, 23, 9, 0, 0)), + ) + older_snapshot.tags.add(Tag.objects.create(name="rss-tag", created_by=api_admin_user)) + + response = client.get( + "/api/v1/core/snapshots.rss", + {"created_by": api_admin_user.username, "limit": 50, "api_key": api_token.token}, + HTTP_HOST=ADMIN_HOST, + ) + + assert response.status_code == 200 + assert response["Content-Type"].startswith("application/rss+xml") + body = response.content.decode() + assert "rss-tag" in body + assert "rss-other-user" not in body + assert body.index("rss-newer") < body.index("rss-older") + + +def test_snapshots_rss_supports_before_yyyymmdd_and_limit(client, api_token, api_admin_user): + make_snapshot( + user=api_admin_user, + url="https://example.com/rss-before-too-new", + title="Too New", + bookmarked_at=timezone.make_aware(datetime(2026, 5, 24, 8, 0, 0)), + ) + make_snapshot( + user=api_admin_user, + url="https://example.com/rss-before-keep-one", + title="Keep One", + bookmarked_at=timezone.make_aware(datetime(2026, 5, 23, 12, 0, 0)), + ) + make_snapshot( + user=api_admin_user, + url="https://example.com/rss-before-keep-two", + title="Keep Two", + bookmarked_at=timezone.make_aware(datetime(2026, 5, 22, 12, 0, 0)), + ) + + response = client.get( + "/api/v1/core/snapshots.rss", + {"created_by": str(api_admin_user.pk), "before": "20260523", "limit": 1, "api_key": api_token.token}, + HTTP_HOST=ADMIN_HOST, + ) + + assert response.status_code == 200 + body = response.content.decode() + assert "rss-before-too-new" not in body + assert "rss-before-keep-one" in body + assert "rss-before-keep-two" not in body diff --git a/archivebox/tests/test_api_v1_core_tag_tag_id.py b/archivebox/tests/test_api_v1_core_tag_tag_id.py new file mode 100644 index 0000000000..2da14fdb9a --- /dev/null +++ b/archivebox/tests/test_api_v1_core_tag_tag_id.py @@ -0,0 +1,15 @@ +import pytest + +from archivebox.core.models import Tag 
+ + +pytestmark = pytest.mark.django_db(transaction=True) + + +@pytest.mark.parametrize("request_method", ("get", "delete")) +def test_basic_success_case_request(client, tmp_path, api_admin_user, api_headers, request_method): + tag = Tag.objects.create(name="api-basic-tag", created_by=api_admin_user) + + response = getattr(client, request_method)(f"/api/v1/core/tag/{tag.id}", **api_headers) + + assert response.status_code == 200, response.content diff --git a/archivebox/tests/test_api_v1_core_tag_tag_id_rename.py b/archivebox/tests/test_api_v1_core_tag_tag_id_rename.py new file mode 100644 index 0000000000..c5c6542683 --- /dev/null +++ b/archivebox/tests/test_api_v1_core_tag_tag_id_rename.py @@ -0,0 +1,38 @@ +import pytest + +from archivebox.core.models import Tag +from archivebox.tests.conftest import ADMIN_TEST_HOST, api_client_request + + +pytestmark = pytest.mark.django_db(transaction=True) + + +def test_basic_success_case_request(client, tmp_path, api_admin_user, api_headers): + tag = Tag.objects.create(name="api-basic-tag", created_by=api_admin_user) + + response = api_client_request( + client, + "post", + f"/api/v1/core/tag/{tag.id}/rename", + payload={"name": "api-basic-renamed"}, + headers=api_headers, + ) + + assert response.status_code == 200, response.content + + +def test_tag_rename_api_updates_name(client, api_token, tagged_data): + tag, _ = tagged_data + + response = api_client_request( + client, + "post", + f"/api/v1/core/tag/{tag.id}/rename?api_key={api_token.token}", + payload={"name": "Alpha Archive"}, + headers={"HTTP_HOST": ADMIN_TEST_HOST}, + ) + + assert response.status_code == 200 + + tag.refresh_from_db() + assert tag.name == "Alpha Archive" diff --git a/archivebox/tests/test_api_v1_core_tag_tag_id_snapshots_jsonl.py b/archivebox/tests/test_api_v1_core_tag_tag_id_snapshots_jsonl.py new file mode 100644 index 0000000000..f595db6e92 --- /dev/null +++ b/archivebox/tests/test_api_v1_core_tag_tag_id_snapshots_jsonl.py @@ -0,0 +1,44 @@ +import 
json + +import pytest + +from archivebox.core.models import Snapshot, Tag +from archivebox.crawls.models import Crawl +from archivebox.tests.conftest import ADMIN_TEST_HOST + + +pytestmark = pytest.mark.django_db(transaction=True) + + +def test_basic_success_case_request(client, tmp_path, api_admin_user, api_headers): + tag = Tag.objects.create(name="api-basic-tag", created_by=api_admin_user) + crawl = Crawl.objects.create(urls="https://example.com/tag-jsonl-export", created_by=api_admin_user) + snapshot = Snapshot.objects.create(url="https://example.com/tag-jsonl-export", crawl=crawl) + snapshot.tags.add(tag) + + response = client.get(f"/api/v1/core/tag/{tag.id}/snapshots.jsonl", **api_headers) + + assert response.status_code == 200, response.content + + +def test_tag_snapshots_export_returns_jsonl(client, api_token, tagged_data): + tag, _ = tagged_data + + response = client.get( + f"/api/v1/core/tag/{tag.id}/snapshots.jsonl", + {"api_key": api_token.token}, + HTTP_HOST=ADMIN_TEST_HOST, + ) + + assert response.status_code == 200 + assert response["Content-Type"].startswith("application/x-ndjson") + assert f"tag-{tag.slug}-snapshots.jsonl" in response["Content-Disposition"] + rows = [json.loads(line) for line in response.content.decode().splitlines()] + rows_by_url = {row["url"]: row for row in rows} + assert set(rows_by_url) == {"https://example.com/one", "https://example.com/two"} + assert rows_by_url["https://example.com/one"]["type"] == "Snapshot" + assert rows_by_url["https://example.com/one"]["title"] == "Example One" + assert rows_by_url["https://example.com/two"]["type"] == "Snapshot" + assert rows_by_url["https://example.com/two"]["title"] == "Example Two" + for row in rows: + assert "Alpha Research" in row["tags"].split(",") diff --git a/archivebox/tests/test_api_v1_core_tag_tag_id_urls_txt.py b/archivebox/tests/test_api_v1_core_tag_tag_id_urls_txt.py new file mode 100644 index 0000000000..88aaf8618e --- /dev/null +++ 
b/archivebox/tests/test_api_v1_core_tag_tag_id_urls_txt.py @@ -0,0 +1,35 @@ +import pytest + +from archivebox.core.models import Snapshot, Tag +from archivebox.crawls.models import Crawl +from archivebox.tests.conftest import ADMIN_TEST_HOST + + +pytestmark = pytest.mark.django_db(transaction=True) + + +def test_basic_success_case_request(client, tmp_path, api_admin_user, api_headers): + tag = Tag.objects.create(name="api-basic-tag", created_by=api_admin_user) + crawl = Crawl.objects.create(urls="https://example.com/tag-url-export", created_by=api_admin_user) + snapshot = Snapshot.objects.create(url="https://example.com/tag-url-export", crawl=crawl) + snapshot.tags.add(tag) + + response = client.get(f"/api/v1/core/tag/{tag.id}/urls.txt", **api_headers) + + assert response.status_code == 200, response.content + + +def test_tag_urls_export_returns_plain_text_urls(client, api_token, tagged_data): + tag, snapshots = tagged_data + + response = client.get( + f"/api/v1/core/tag/{tag.id}/urls.txt", + {"api_key": api_token.token}, + HTTP_HOST=ADMIN_TEST_HOST, + ) + + assert response.status_code == 200 + assert response["Content-Type"].startswith("text/plain") + assert f"tag-{tag.slug}-urls.txt" in response["Content-Disposition"] + exported_urls = set(filter(None, response.content.decode().splitlines())) + assert exported_urls == {snapshot.url for snapshot in snapshots} diff --git a/archivebox/tests/test_api_v1_core_tags.py b/archivebox/tests/test_api_v1_core_tags.py new file mode 100644 index 0000000000..fcf7464a1f --- /dev/null +++ b/archivebox/tests/test_api_v1_core_tags.py @@ -0,0 +1,14 @@ +import pytest + +from archivebox.core.models import Tag + + +pytestmark = pytest.mark.django_db(transaction=True) + + +def test_basic_success_case_request(client, tmp_path, api_admin_user, api_headers): + Tag.objects.create(name="api-basic-tag", created_by=api_admin_user) + + response = client.get("/api/v1/core/tags", **api_headers) + + assert response.status_code == 200, 
response.content diff --git a/archivebox/tests/test_api_v1_core_tags_add_to_snapshot.py b/archivebox/tests/test_api_v1_core_tags_add_to_snapshot.py new file mode 100644 index 0000000000..c732ca3926 --- /dev/null +++ b/archivebox/tests/test_api_v1_core_tags_add_to_snapshot.py @@ -0,0 +1,25 @@ +import pytest + +from archivebox.core.models import Snapshot, Tag +from archivebox.crawls.models import Crawl +from archivebox.tests.conftest import api_client_request + + +pytestmark = pytest.mark.django_db(transaction=True) + + +def test_basic_success_case_request(client, api_admin_user, api_headers): + crawl = Crawl.objects.create(urls="https://example.com/tag-add", created_by=api_admin_user) + snapshot = Snapshot.objects.create(url="https://example.com/tag-add", crawl=crawl) + tag = Tag.objects.create(name="api-basic-add-tag", created_by=api_admin_user) + + response = api_client_request( + client, + "post", + "/api/v1/core/tags/add-to-snapshot/", + payload={"snapshot_id": str(snapshot.id), "tag_id": tag.id}, + headers=api_headers, + ) + + assert response.status_code == 200, response.content + assert response.json()["success"] is True diff --git a/archivebox/tests/test_api_v1_core_tags_autocomplete.py b/archivebox/tests/test_api_v1_core_tags_autocomplete.py new file mode 100644 index 0000000000..71f242ebf8 --- /dev/null +++ b/archivebox/tests/test_api_v1_core_tags_autocomplete.py @@ -0,0 +1,18 @@ +import pytest + +from archivebox.core.models import Tag + + +pytestmark = pytest.mark.django_db(transaction=True) + + +def test_basic_success_case_request(client, tmp_path, api_admin_user, api_token): + Tag.objects.create(name="api-basic-tag", created_by=api_admin_user) + + response = client.get( + "/api/v1/core/tags/autocomplete/", + {"q": "api-basic", "api_key": api_token.token}, + HTTP_HOST="api.archivebox.localhost:8000", + ) + + assert response.status_code == 200, response.content diff --git a/archivebox/tests/test_api_v1_core_tags_create.py 
b/archivebox/tests/test_api_v1_core_tags_create.py new file mode 100644 index 0000000000..c069c51552 --- /dev/null +++ b/archivebox/tests/test_api_v1_core_tags_create.py @@ -0,0 +1,19 @@ +import pytest + +from archivebox.tests.conftest import api_client_request + + +pytestmark = pytest.mark.django_db(transaction=True) + + +def test_basic_success_case_request(client, api_headers): + response = api_client_request( + client, + "post", + "/api/v1/core/tags/create/", + payload={"name": "api-basic-created-tag"}, + headers=api_headers, + ) + + assert response.status_code == 200, response.content + assert response.json()["success"] is True diff --git a/archivebox/tests/test_api_v1_core_tags_remove_from_snapshot.py b/archivebox/tests/test_api_v1_core_tags_remove_from_snapshot.py new file mode 100644 index 0000000000..2b56770c9d --- /dev/null +++ b/archivebox/tests/test_api_v1_core_tags_remove_from_snapshot.py @@ -0,0 +1,26 @@ +import pytest + +from archivebox.core.models import Snapshot, Tag +from archivebox.crawls.models import Crawl +from archivebox.tests.conftest import api_client_request + + +pytestmark = pytest.mark.django_db(transaction=True) + + +def test_basic_success_case_request(client, api_admin_user, api_headers): + crawl = Crawl.objects.create(urls="https://example.com/tag-remove", created_by=api_admin_user) + snapshot = Snapshot.objects.create(url="https://example.com/tag-remove", crawl=crawl) + tag = Tag.objects.create(name="api-basic-remove-tag", created_by=api_admin_user) + snapshot.tags.add(tag) + + response = api_client_request( + client, + "post", + "/api/v1/core/tags/remove-from-snapshot/", + payload={"snapshot_id": str(snapshot.id), "tag_id": tag.id}, + headers=api_headers, + ) + + assert response.status_code == 200, response.content + assert response.json()["success"] is True diff --git a/archivebox/tests/test_api_v1_core_tags_search.py b/archivebox/tests/test_api_v1_core_tags_search.py new file mode 100644 index 0000000000..47fe97f73c --- /dev/null 
+++ b/archivebox/tests/test_api_v1_core_tags_search.py @@ -0,0 +1,103 @@ +import pytest +from django.contrib.auth import get_user_model +from django.utils import timezone + +from archivebox.core.models import Snapshot, Tag +from archivebox.tests.conftest import ADMIN_TEST_HOST + + +pytestmark = pytest.mark.django_db(transaction=True) + + +def test_basic_success_case_request(client, tmp_path, api_admin_user, api_headers): + Tag.objects.create(name="api-basic-tag", created_by=api_admin_user) + + response = client.get("/api/v1/core/tags/search/", {"q": "api-basic"}, **api_headers) + + assert response.status_code == 200, response.content + + +def test_tag_search_api_returns_card_payload(client, api_token, tagged_data): + tag, snapshots = tagged_data + + response = client.get( + "/api/v1/core/tags/search/", + {"q": "Alpha", "api_key": api_token.token}, + HTTP_HOST=ADMIN_TEST_HOST, + ) + + assert response.status_code == 200 + payload = response.json() + assert payload["sort"] == "created_desc" + assert payload["created_by"] == "" + assert payload["year"] == "" + assert payload["has_snapshots"] == "all" + assert payload["tags"][0]["id"] == tag.id + assert payload["tags"][0]["name"] == "Alpha Research" + assert payload["tags"][0]["num_snapshots"] == 2 + assert payload["tags"][0]["snapshots"] == [] + assert payload["tags"][0]["export_jsonl_url"].endswith(f"/api/v1/core/tag/{tag.id}/snapshots.jsonl") + assert payload["tags"][0]["filter_url"].endswith(f"/admin/core/snapshot/?tags__id__exact={tag.id}") + assert {snap.url for snap in snapshots} == {"https://example.com/one", "https://example.com/two"} + + +def test_tag_search_api_default_includes_empty_tags_and_counts_linked_snapshots(client, api_token, tagged_data, api_admin_user): + linked_tag, _snapshots = tagged_data + empty_tag = Tag.objects.create(name="Empty Tag", created_by=api_admin_user) + + response = client.get( + "/api/v1/core/tags/search/", + {"api_key": api_token.token}, + HTTP_HOST=ADMIN_TEST_HOST, + ) + + 
assert response.status_code == 200 + payload = response.json() + cards_by_name = {tag["name"]: tag for tag in payload["tags"]} + assert payload["has_snapshots"] == "all" + assert cards_by_name["Alpha Research"]["id"] == linked_tag.id + assert cards_by_name["Alpha Research"]["num_snapshots"] == 2 + assert cards_by_name["Empty Tag"]["id"] == empty_tag.id + assert cards_by_name["Empty Tag"]["num_snapshots"] == 0 + + +def test_tag_search_api_respects_sort_and_filters(client, api_token, admin_user, crawl, tagged_data): + from datetime import datetime + + other_user = get_user_model().objects.create_user( + username="tagother", + email="tagother@test.com", + password="unused", + ) + tag_with_snapshots = tagged_data[0] + empty_tag = Tag.objects.create(name="Zulu Empty", created_by=other_user) + alpha_tag = Tag.objects.create(name="Alpha Empty", created_by=other_user) + Snapshot.objects.create( + url="https://example.com/three", + title="Example Three", + crawl=crawl, + ).tags.add(alpha_tag) + + Tag.objects.filter(pk=empty_tag.pk).update(created_at=timezone.make_aware(datetime(2024, 1, 1, 12, 0, 0))) + Tag.objects.filter(pk=alpha_tag.pk).update(created_at=timezone.make_aware(datetime(2025, 1, 1, 12, 0, 0))) + Tag.objects.filter(pk=tag_with_snapshots.pk).update(created_at=timezone.make_aware(datetime(2026, 1, 1, 12, 0, 0))) + + response = client.get( + "/api/v1/core/tags/search/", + { + "sort": "name_desc", + "created_by": str(other_user.pk), + "year": "2024", + "has_snapshots": "no", + "api_key": api_token.token, + }, + HTTP_HOST=ADMIN_TEST_HOST, + ) + + assert response.status_code == 200 + payload = response.json() + assert payload["sort"] == "name_desc" + assert payload["created_by"] == str(other_user.pk) + assert payload["year"] == "2024" + assert payload["has_snapshots"] == "no" + assert [tag["name"] for tag in payload["tags"]] == ["Zulu Empty"] diff --git a/archivebox/tests/test_api_v1_crawls_crawl_crawl_id.py b/archivebox/tests/test_api_v1_crawls_crawl_crawl_id.py 
new file mode 100644 index 0000000000..5e72c1a480 --- /dev/null +++ b/archivebox/tests/test_api_v1_crawls_crawl_crawl_id.py @@ -0,0 +1,661 @@ +import json +import time +from datetime import datetime, timedelta +from pathlib import Path +from typing import cast + +import pytest +from django.contrib.auth import get_user_model +from django.contrib.auth.models import UserManager +from django.utils import timezone + +from archivebox.core.models import ArchiveResult, Snapshot +from archivebox.crawls.models import Crawl +from archivebox.tests.test_orm_helpers import use_archivebox_db +from archivebox.workers.models import RETRY_AT_MAX + +from .conftest import ( + api_client_request, + cli_env, + create_admin_and_token, + get_crawl_runtime_state, + get_free_port, + init_archive, + live_api_request, + run_archivebox_cmd, + start_archivebox_server, + stop_server, + wait_for_live_api, + wait_for_snapshot_capture, +) + + +pytestmark = pytest.mark.django_db(transaction=True) +User = get_user_model() +ADMIN_HOST = "admin.archivebox.localhost:8000" + + +@pytest.fixture +def other_user(db): + return cast(UserManager, User.objects).create_user( + username="rssother", + email="rssother@test.com", + password="testpassword", + ) + + +def _seed_archiveresult( + snapshot: Snapshot, + *, + plugin: str, + hook_name: str, + status: str, + output_text: str = "", + output_path: str | None = None, +) -> ArchiveResult: + output_files = {} + output_size = 0 + output_mimetypes = "" + if output_path is not None: + output_bytes = output_text.encode() + absolute_path = Path(snapshot.output_dir) / output_path + absolute_path.parent.mkdir(parents=True, exist_ok=True) + absolute_path.write_bytes(output_bytes) + output_size = len(output_bytes) + output_mimetypes = "text/plain" + output_files[output_path] = { + "extension": Path(output_path).suffix.lstrip("."), + "mimetype": "text/plain", + "size": output_size, + } + + now = timezone.now() + return ArchiveResult.objects.create( + snapshot=snapshot, + 
plugin=plugin, + hook_name=hook_name, + status=status, + output_str=output_path or output_text, + output_files=output_files, + output_size=output_size, + output_mimetypes=output_mimetypes, + start_ts=now if status != ArchiveResult.StatusChoices.QUEUED else None, + end_ts=now if status in ArchiveResult.FINAL_STATES else None, + ) + + +def wait_for_crawl_snapshot_rows(cwd, crawl_id, timeout=45): + deadline = time.time() + timeout + latest_state = None + while time.time() < deadline: + latest_state = get_crawl_runtime_state(cwd, crawl_id) + if latest_state["snapshots"]: + return latest_state + time.sleep(0.2) + raise AssertionError(f"timed out waiting for runner to create snapshots for crawl {crawl_id}: {latest_state}") + + +def wait_for_crawl_child_snapshots_paused_or_sealed(cwd, crawl_id, timeout=45): + deadline = time.time() + timeout + latest_state = None + while time.time() < deadline: + latest_state = get_crawl_runtime_state(cwd, crawl_id) + snapshots = latest_state["snapshots"] + if snapshots and all(snapshot["status"] in {"paused", "sealed"} for snapshot in snapshots): + return latest_state + time.sleep(0.2) + raise AssertionError(f"timed out waiting for runner to pause or seal snapshots for crawl {crawl_id}: {latest_state}") + + +def wait_for_crawl_wget_success_or_sealed(cwd, crawl_id, timeout=240): + deadline = time.time() + timeout + latest_state = None + while time.time() < deadline: + latest_state = get_crawl_runtime_state(cwd, crawl_id) + wget_results = [result for result in latest_state["results"] if result["plugin"] == "wget"] + if ( + latest_state["snapshots"] + and latest_state["snapshots"][0]["status"] == "sealed" + and any(result["status"] == "succeeded" and result["output_size"] > 0 for result in wget_results) + ): + return latest_state + if ( + latest_state["crawl_status"] == "sealed" + and latest_state["snapshots"] + and latest_state["snapshots"][0]["status"] == "sealed" + and all(result["status"] not in {"queued", "started", "paused"} for 
result in latest_state["results"]) + ): + return latest_state + time.sleep(2) + raise AssertionError(f"timed out waiting for crawl resume completion for crawl {crawl_id}: {latest_state}") + + +def wait_for_sqlite_index_result(cwd, crawl_id, timeout=45): + deadline = time.time() + timeout + latest_state = None + while time.time() < deadline: + latest_state = get_crawl_runtime_state(cwd, crawl_id) + final_results = [ + result + for result in latest_state["results"] + if result["plugin"] == "search_backend_sqlite" and result["status"] not in {"queued", "started", "paused"} + ] + if final_results: + return latest_state + time.sleep(0.2) + raise AssertionError(f"timed out waiting for sqlite index result for crawl {crawl_id}: {latest_state}") + + +def make_snapshot(*, user, url: str, title: str, bookmarked_at: datetime): + crawl = Crawl.objects.create(urls=url, created_by=user) + snapshot = Snapshot.objects.create( + url=url, + title=title, + crawl=crawl, + bookmarked_at=bookmarked_at, + ) + return crawl, snapshot + + +def test_basic_success_case_request(client, tmp_path, api_admin_user, api_headers): + crawl = Crawl.objects.create(urls="https://example.com/crawl-detail", created_by=api_admin_user) + + response = client.get(f"/api/v1/crawls/crawl/{crawl.id}", **api_headers) + + assert response.status_code == 200, response.content + + +def test_crawl_pause_resume_api_cascades_archiveresults_and_leaves_finished_snapshot_results_alone( + tmp_path, + client, + recursive_test_site, +): + init_archive(tmp_path) + api_token = create_admin_and_token(tmp_path) + + with use_archivebox_db(tmp_path): + crawl_response = api_client_request( + client, + "post", + "/api/v1/crawls/crawls", + api_token=api_token, + payload={ + "urls": [recursive_test_site["root_url"]], + "max_depth": 0, + "tags": ["crawl-archiveresult-pause"], + "config": {"PLUGINS": "wget", "URL_ALLOWLIST": r"127\.0\.0\.1[:/].*"}, + }, + ) + assert crawl_response.status_code == 200, crawl_response.content.decode() + 
crawl_id = json.loads(crawl_response.content.decode())["id"] + from archivebox.services.runner import run_due_snapshot + + active_response = api_client_request( + client, + "post", + "/api/v1/core/snapshots", + api_token=api_token, + payload={ + "url": recursive_test_site["root_url"], + "crawl_id": crawl_id, + "depth": 0, + "title": "Active child", + "status": "queued", + }, + ) + assert active_response.status_code == 200, active_response.content.decode() + active_snapshot = Snapshot.objects.get(id=json.loads(active_response.content.decode())["id"]) + + sealed_response = api_client_request( + client, + "post", + "/api/v1/core/snapshots", + api_token=api_token, + payload={ + "url": recursive_test_site["child_urls"][0], + "crawl_id": crawl_id, + "depth": 0, + "title": "Already sealed child", + "status": "queued", + }, + ) + assert sealed_response.status_code == 200, sealed_response.content.decode() + sealed_snapshot_id = json.loads(sealed_response.content.decode())["id"] + sealed_snapshot = Snapshot.objects.get(id=sealed_snapshot_id) + sealed_done = _seed_archiveresult( + sealed_snapshot, + plugin="sealedone", + hook_name="on_Snapshot__sealed_done", + status=ArchiveResult.StatusChoices.SUCCEEDED, + output_text="sealed snapshot result remains finished", + output_path="sealedone/final.txt", + ) + sealed_snapshot.sm.seal() + sealed_snapshot.refresh_from_db() + assert sealed_snapshot.status == Snapshot.StatusChoices.SEALED + assert sealed_snapshot.retry_at is None + + active_queued = _seed_archiveresult( + active_snapshot, + plugin="manualqueue", + hook_name="on_Snapshot__manual_queue", + status=ArchiveResult.StatusChoices.QUEUED, + ) + active_started = _seed_archiveresult( + active_snapshot, + plugin="manualstart", + hook_name="on_Snapshot__manual_start", + status=ArchiveResult.StatusChoices.STARTED, + ) + active_done = _seed_archiveresult( + active_snapshot, + plugin="manualdone", + hook_name="on_Snapshot__manual_done", + status=ArchiveResult.StatusChoices.SUCCEEDED, + 
output_text="parent cascade should not rewrite finished rows", + output_path="manualdone/cascade.txt", + ) + pause_response = api_client_request( + client, + "patch", + f"/api/v1/crawls/crawl/{crawl_id}", + api_token=api_token, + payload={"action": "pause"}, + ) + assert pause_response.status_code == 200, pause_response.content.decode() + assert json.loads(pause_response.content.decode())["status"] == Crawl.StatusChoices.PAUSED + + active_snapshot.refresh_from_db() + sealed_snapshot.refresh_from_db() + crawl = Crawl.objects.get(id=crawl_id) + assert crawl.status == Crawl.StatusChoices.PAUSED + assert crawl.retry_at == RETRY_AT_MAX + assert active_snapshot.status == Snapshot.StatusChoices.QUEUED + assert active_snapshot.retry_at is not None + assert active_snapshot.retry_at <= timezone.now() + assert ArchiveResult.objects.get(id=active_queued.id).status == ArchiveResult.StatusChoices.QUEUED + assert ArchiveResult.objects.get(id=active_started.id).status == ArchiveResult.StatusChoices.STARTED + + assert run_due_snapshot(active_snapshot, lock_seconds=60) is True + active_snapshot.refresh_from_db() + sealed_snapshot.refresh_from_db() + assert active_snapshot.status == Snapshot.StatusChoices.PAUSED + assert active_snapshot.retry_at == RETRY_AT_MAX + assert sealed_snapshot.status == Snapshot.StatusChoices.SEALED + assert sealed_snapshot.retry_at is None + + paused_rows = { + row.plugin: (row.status, row.retry_at) for row in ArchiveResult.objects.filter(id__in=[active_queued.id, active_started.id]) + } + assert paused_rows == { + "manualqueue": (ArchiveResult.StatusChoices.PAUSED, RETRY_AT_MAX), + "manualstart": (ArchiveResult.StatusChoices.PAUSED, RETRY_AT_MAX), + } + + active_done_row = ArchiveResult.objects.get(id=active_done.id) + sealed_done_row = ArchiveResult.objects.get(id=sealed_done.id) + active_done_path = Path(active_snapshot.output_dir) / next(iter(active_done_row.output_files)) + sealed_done_path = Path(sealed_snapshot.output_dir) / 
next(iter(sealed_done_row.output_files)) + assert active_done_row.status == ArchiveResult.StatusChoices.SUCCEEDED + assert active_done_row.retry_at is None + assert active_done_path.read_text() == "parent cascade should not rewrite finished rows" + assert sealed_done_row.status == ArchiveResult.StatusChoices.SUCCEEDED + assert sealed_done_row.retry_at is None + assert sealed_done_path.read_text() == "sealed snapshot result remains finished" + + resume_response = api_client_request( + client, + "patch", + f"/api/v1/crawls/crawl/{crawl_id}", + api_token=api_token, + payload={"action": "resume"}, + ) + assert resume_response.status_code == 200, resume_response.content.decode() + assert json.loads(resume_response.content.decode())["status"] == Crawl.StatusChoices.QUEUED + + active_snapshot.refresh_from_db() + sealed_snapshot.refresh_from_db() + crawl.refresh_from_db() + assert crawl.status == Crawl.StatusChoices.QUEUED + assert crawl.retry_at is not None + assert crawl.retry_at != RETRY_AT_MAX + assert active_snapshot.status == Snapshot.StatusChoices.QUEUED + assert active_snapshot.retry_at is not None + assert active_snapshot.retry_at != RETRY_AT_MAX + assert sealed_snapshot.status == Snapshot.StatusChoices.SEALED + assert sealed_snapshot.retry_at is None + + resumed_rows = { + row.plugin: (row.status, row.retry_at) for row in ArchiveResult.objects.filter(id__in=[active_queued.id, active_started.id]) + } + assert resumed_rows["manualqueue"][0] == ArchiveResult.StatusChoices.QUEUED + assert resumed_rows["manualqueue"][1] is not None + assert resumed_rows["manualqueue"][1] != RETRY_AT_MAX + assert resumed_rows["manualstart"][0] == ArchiveResult.StatusChoices.QUEUED + assert resumed_rows["manualstart"][1] is not None + assert resumed_rows["manualstart"][1] != RETRY_AT_MAX + assert ArchiveResult.objects.get(id=active_done.id).status == ArchiveResult.StatusChoices.SUCCEEDED + assert ArchiveResult.objects.get(id=sealed_done.id).status == 
ArchiveResult.StatusChoices.SUCCEEDED + assert active_done_path.read_text() == "parent cascade should not rewrite finished rows" + assert sealed_done_path.read_text() == "sealed snapshot result remains finished" + + +@pytest.mark.timeout(240) +def test_crawl_pause_resume_api_survives_server_restart_and_processes_after_resume(tmp_path, recursive_test_site): + init_archive(tmp_path) + + port = get_free_port() + env = cli_env(port=port, server=True, PLUGINS="wget", SAVE_WGET="True") + api_token = create_admin_and_token(tmp_path) + + try: + start_archivebox_server(tmp_path, env=env, port=port) + wait_for_live_api(port) + + crawl_response = live_api_request( + port, + "post", + "/api/v1/crawls/crawls", + api_token=api_token, + json={ + "urls": [recursive_test_site["root_url"]], + "max_depth": 0, + "tags": ["pause-resume-e2e"], + "config": {"PLUGINS": "wget", "URL_ALLOWLIST": r"127\.0\.0\.1[:/].*"}, + }, + timeout=10, + ) + assert crawl_response.status_code == 200, crawl_response.text + crawl_id = crawl_response.json()["id"] + wait_for_crawl_snapshot_rows(tmp_path, crawl_id) + + pause_response = live_api_request( + port, + "patch", + f"/api/v1/crawls/crawl/{crawl_id}", + api_token=api_token, + json={"action": "pause"}, + timeout=10, + ) + assert pause_response.status_code == 200, pause_response.text + assert pause_response.json()["status"] == "paused" + + paused_state = wait_for_crawl_child_snapshots_paused_or_sealed(tmp_path, crawl_id) + assert paused_state["crawl_status"] == "paused" + assert paused_state["crawl_retry_at"] == paused_state["retry_at_max"] + assert len(paused_state["snapshots"]) == 1 + snapshot_finished_before_pause = paused_state["snapshots"][0]["status"] == "sealed" + if snapshot_finished_before_pause: + assert any(result["status"] == "succeeded" for result in paused_state["results"]) + else: + assert paused_state["snapshots"][0]["status"] == "paused" + assert paused_state["snapshots"][0]["retry_at"] == paused_state["retry_at_max"] + + 
stop_server(tmp_path) + start_archivebox_server(tmp_path, env=env, port=port) + wait_for_live_api(port) + + restarted_state = get_crawl_runtime_state(tmp_path, crawl_id) + assert restarted_state["crawl_status"] == "paused" + assert restarted_state["crawl_retry_at"] == restarted_state["retry_at_max"] + if snapshot_finished_before_pause: + assert restarted_state["snapshots"][0]["status"] == "sealed" + assert any(result["status"] == "succeeded" for result in restarted_state["results"]) + return + assert restarted_state["snapshots"][0]["status"] == "paused" + assert restarted_state["snapshots"][0]["retry_at"] == restarted_state["retry_at_max"] + assert not any(result["status"] == "succeeded" for result in restarted_state["results"]) + + resume_response = live_api_request( + port, + "patch", + f"/api/v1/crawls/crawl/{crawl_id}", + api_token=api_token, + json={"action": "resume"}, + timeout=10, + ) + assert resume_response.status_code == 200, resume_response.text + assert resume_response.json()["status"] == "queued" + + captured_text = wait_for_snapshot_capture(tmp_path, recursive_test_site["root_url"], timeout=180) + assert "Root" in captured_text + assert "About" in captured_text + + final_state = get_crawl_runtime_state(tmp_path, crawl_id) + assert final_state["snapshots"][0]["status"] == "sealed" + wget_results = [result for result in final_state["results"] if result["plugin"] == "wget"] + assert wget_results + assert any(result["status"] == "succeeded" and result["output_size"] > 0 for result in wget_results) + finally: + stop_server(tmp_path) + + +@pytest.mark.timeout(420) +def test_update_index_only_runs_paused_search_rows_and_resume_later_runs_crawl(tmp_path, recursive_test_site): + init_archive(tmp_path) + + port = get_free_port() + env = cli_env(port=port, server=True, PLUGINS="wget", SAVE_WGET="True") + api_token = create_admin_and_token(tmp_path) + + try: + start_archivebox_server(tmp_path, env=env, port=port) + wait_for_live_api(port) + + crawl_response = 
live_api_request( + port, + "post", + "/api/v1/crawls/crawls", + api_token=api_token, + json={ + "urls": [recursive_test_site["root_url"]], + "max_depth": 0, + "tags": ["paused-index-e2e"], + "config": {"PLUGINS": "wget", "URL_ALLOWLIST": r"127\.0\.0\.1[:/].*"}, + }, + timeout=10, + ) + assert crawl_response.status_code == 200, crawl_response.text + crawl_id = crawl_response.json()["id"] + wait_for_crawl_snapshot_rows(tmp_path, crawl_id) + + pause_response = live_api_request( + port, + "patch", + f"/api/v1/crawls/crawl/{crawl_id}", + api_token=api_token, + json={"action": "pause"}, + timeout=10, + ) + assert pause_response.status_code == 200, pause_response.text + paused_state = wait_for_crawl_child_snapshots_paused_or_sealed(tmp_path, crawl_id) + snapshot_finished_before_pause = paused_state["snapshots"][0]["status"] == "sealed" + finally: + stop_server(tmp_path) + + if snapshot_finished_before_pause: + indexed_state = get_crawl_runtime_state(tmp_path, crawl_id) + assert indexed_state["crawl_status"] == "paused" + assert indexed_state["crawl_retry_at"] == indexed_state["retry_at_max"] + assert indexed_state["snapshots"][0]["status"] == "sealed" + wget_results = [result for result in indexed_state["results"] if result["plugin"] == "wget"] + assert any(result["status"] == "succeeded" and result["output_size"] > 0 for result in wget_results) + captured_text = wait_for_snapshot_capture(tmp_path, recursive_test_site["root_url"], timeout=60) + assert "Root" in captured_text + assert "About" in captured_text + return + + update_env = cli_env( + port=port, + PLUGINS="search_backend_sqlite", + SEARCH_BACKEND_ENGINE="sqlite", + ) + update_process = run_archivebox_cmd( + [ + "update", + "--index-only", + "--crawl-id", + crawl_id, + "--limit", + "1", + "--batch-size", + "1", + ], + cwd=tmp_path, + env=update_env, + timeout=120, + ) + assert update_process.returncode == 0, update_process.stderr + + indexed_state = wait_for_sqlite_index_result(tmp_path, crawl_id) + assert 
indexed_state["crawl_status"] == "paused" + assert indexed_state["crawl_retry_at"] == indexed_state["retry_at_max"] + assert indexed_state["snapshots"][0]["status"] == "paused" + assert indexed_state["snapshots"][0]["retry_at"] == indexed_state["retry_at_max"] + search_results = [result for result in indexed_state["results"] if result["plugin"] == "search_backend_sqlite"] + assert search_results + assert any(result["status"] not in {"queued", "started", "paused"} for result in search_results) + + try: + start_archivebox_server(tmp_path, env=env, port=port) + wait_for_live_api(port) + + still_paused_state = get_crawl_runtime_state(tmp_path, crawl_id) + assert still_paused_state["crawl_status"] == "paused" + assert still_paused_state["snapshots"][0]["status"] == "paused" + assert not any(result["plugin"] == "wget" and result["status"] == "succeeded" for result in still_paused_state["results"]) + + resume_response = live_api_request( + port, + "patch", + f"/api/v1/crawls/crawl/{crawl_id}", + api_token=api_token, + json={"action": "resume"}, + timeout=10, + ) + assert resume_response.status_code == 200, resume_response.text + assert resume_response.json()["status"] == "queued" + + resumed_state = wait_for_crawl_wget_success_or_sealed(tmp_path, crawl_id, timeout=240) + + assert resumed_state["snapshots"][0]["status"] == "sealed" + wget_results = [result for result in resumed_state["results"] if result["plugin"] == "wget"] + wget_succeeded = any(result["status"] == "succeeded" and result["output_size"] > 0 for result in wget_results) + if wget_succeeded: + captured_text = wait_for_snapshot_capture(tmp_path, recursive_test_site["root_url"], timeout=60) + assert "Root" in captured_text + assert "About" in captured_text + else: + assert resumed_state["crawl_status"] == "sealed" + assert all(result["status"] not in {"queued", "started", "paused"} for result in resumed_state["results"]) + finally: + stop_server(tmp_path) + + +def 
test_crawl_cancel_api_defers_cleanup_to_runner(client, api_admin_user, api_headers): + from archivebox.services.runner import run_due_crawl + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by=api_admin_user, + status=Crawl.StatusChoices.STARTED, + retry_at=timezone.now() + timedelta(minutes=5), + ) + child = Snapshot.objects.create( + url="https://example.com/cancel-child", + crawl=crawl, + status=Snapshot.StatusChoices.STARTED, + retry_at=timezone.now() + timedelta(minutes=5), + ) + crawl.output_dir.mkdir(parents=True, exist_ok=True) + pid_file = crawl.output_dir / "cleanup-test.pid" + pid_file.write_text("12345") + + response = api_client_request( + client, + "patch", + f"/api/v1/crawls/crawl/{crawl.id}", + payload={"action": "cancel"}, + headers=api_headers, + ) + assert response.status_code == 200, response.content + + crawl.refresh_from_db() + child.refresh_from_db() + assert crawl.status == Crawl.StatusChoices.SEALED + assert crawl.retry_at is not None + assert crawl.retry_at <= timezone.now() + assert child.status == Snapshot.StatusChoices.STARTED + assert child.retry_at is not None + assert child.retry_at <= timezone.now() + assert pid_file.exists() + + assert run_due_crawl(crawl, lock_seconds=60) is True + crawl.refresh_from_db() + assert crawl.retry_at is None + assert not pid_file.exists() + + +def test_rest_crawl_delete_removes_crawl_and_snapshot_output_dirs(client, api_admin_user, api_headers): + url = "https://example.com/delete-path-crawl" + + crawl = Crawl.objects.create( + urls=url, + max_depth=0, + created_by=api_admin_user, + status=Crawl.StatusChoices.SEALED, + ) + snapshot = Snapshot.objects.create( + crawl=crawl, + url=url, + depth=0, + status=Snapshot.StatusChoices.SEALED, + ) + crawl_dir = Path(crawl.output_dir) + snapshot_dir = Path(snapshot.output_dir) + crawl_dir.mkdir(parents=True, exist_ok=True) + snapshot_dir.mkdir(parents=True, exist_ok=True) + (crawl_dir / "delete-path-crawl.txt").write_text("crawl output") + 
(snapshot_dir / "delete-path-snapshot.txt").write_text("snapshot output") + assert crawl_dir.exists() + assert snapshot_dir.exists() + + response = client.delete(f"/api/v1/crawls/crawl/{crawl.id}", **api_headers) + assert response.status_code == 200, response.content.decode() + assert not Crawl.objects.filter(pk=crawl.pk).exists() + assert not Snapshot.objects.filter(pk=snapshot.pk).exists() + assert not crawl_dir.exists() + assert not snapshot_dir.exists() + + +def test_crawl_as_rss_redirects_to_canonical_snapshots_feed(client, api_token, api_admin_user, other_user): + crawl, _snapshot = make_snapshot( + user=api_admin_user, + url="https://example.com/rss-crawl-feed", + title="Crawl Feed Snapshot", + bookmarked_at=timezone.make_aware(datetime(2026, 5, 23, 8, 0, 0)), + ) + make_snapshot( + user=other_user, + url="https://example.com/rss-crawl-other", + title="Other Crawl Snapshot", + bookmarked_at=timezone.make_aware(datetime(2026, 5, 23, 9, 0, 0)), + ) + + response = client.get( + f"/api/v1/crawls/crawl/{crawl.id}", + {"as_rss": "true", "limit": 50, "api_key": api_token.token}, + HTTP_HOST=ADMIN_HOST, + follow=True, + ) + + assert response.status_code == 200 + assert response.redirect_chain + redirect_url = response.redirect_chain[0][0] + assert redirect_url.startswith("/api/v1/core/snapshots.rss?") + assert f"crawl_id={crawl.id}" in redirect_url + assert "as_rss" not in redirect_url + assert response["Content-Type"].startswith("application/rss+xml") + body = response.content.decode() + assert "rss-crawl-feed" in body + assert "rss-crawl-other" not in body diff --git a/archivebox/tests/test_api_v1_crawls_crawl_crawl_id_files_filename.py b/archivebox/tests/test_api_v1_crawls_crawl_crawl_id_files_filename.py new file mode 100644 index 0000000000..b8b14317ba --- /dev/null +++ b/archivebox/tests/test_api_v1_crawls_crawl_crawl_id_files_filename.py @@ -0,0 +1,16 @@ +import pytest + +from archivebox.crawls.models import Crawl + + +pytestmark = 
pytest.mark.django_db(transaction=True) + + +def test_basic_success_case_request(client, tmp_path, api_admin_user, api_headers): + crawl = Crawl.objects.create(urls="https://example.com/crawl-file-root", created_by=api_admin_user) + crawl.output_dir.mkdir(parents=True, exist_ok=True) + (crawl.output_dir / "basic.txt").write_text("ok") + + response = client.get(f"/api/v1/crawls/crawl/{crawl.id}/files/basic.txt", **api_headers) + + assert response.status_code == 200, response.content diff --git a/archivebox/tests/test_api_v1_crawls_crawl_crawl_id_files_folder_filename.py b/archivebox/tests/test_api_v1_crawls_crawl_crawl_id_files_folder_filename.py new file mode 100644 index 0000000000..1488598225 --- /dev/null +++ b/archivebox/tests/test_api_v1_crawls_crawl_crawl_id_files_folder_filename.py @@ -0,0 +1,17 @@ +import pytest + +from archivebox.crawls.models import Crawl + + +pytestmark = pytest.mark.django_db(transaction=True) + + +def test_basic_success_case_request(client, tmp_path, api_admin_user, api_headers): + crawl = Crawl.objects.create(urls="https://example.com/crawl-file-nested", created_by=api_admin_user) + nested_dir = crawl.output_dir / "folder" + nested_dir.mkdir(parents=True, exist_ok=True) + (nested_dir / "basic.txt").write_text("ok") + + response = client.get(f"/api/v1/crawls/crawl/{crawl.id}/files/folder/basic.txt", **api_headers) + + assert response.status_code == 200, response.content diff --git a/archivebox/tests/test_api_v1_crawls_crawl_crawl_id_files_folder_subfolder_filename.py b/archivebox/tests/test_api_v1_crawls_crawl_crawl_id_files_folder_subfolder_filename.py new file mode 100644 index 0000000000..3a3fbca352 --- /dev/null +++ b/archivebox/tests/test_api_v1_crawls_crawl_crawl_id_files_folder_subfolder_filename.py @@ -0,0 +1,17 @@ +import pytest + +from archivebox.crawls.models import Crawl + + +pytestmark = pytest.mark.django_db(transaction=True) + + +def test_basic_success_case_request(client, tmp_path, api_admin_user, api_headers): + crawl 
= Crawl.objects.create(urls="https://example.com/crawl-file-deep", created_by=api_admin_user) + nested_dir = crawl.output_dir / "folder" / "subfolder" + nested_dir.mkdir(parents=True, exist_ok=True) + (nested_dir / "basic.txt").write_text("ok") + + response = client.get(f"/api/v1/crawls/crawl/{crawl.id}/files/folder/subfolder/basic.txt", **api_headers) + + assert response.status_code == 200, response.content diff --git a/archivebox/tests/test_api_v1_crawls_crawls.py b/archivebox/tests/test_api_v1_crawls_crawls.py new file mode 100644 index 0000000000..8a18d2cbe3 --- /dev/null +++ b/archivebox/tests/test_api_v1_crawls_crawls.py @@ -0,0 +1,38 @@ +from django.test import RequestFactory + +import pytest + + +pytestmark = pytest.mark.django_db(transaction=True) + + +def test_create_crawl_api_queues_crawl_without_spawning_runner(): + from django.contrib.auth import get_user_model + + from archivebox.api.v1_crawls import CrawlCreateSchema, create_crawl + + user = get_user_model().objects.create_superuser( + username="runner-api-admin", + email="runner-api-admin@example.com", + password="testpassword", + ) + request = RequestFactory().post("/api/v1/crawls") + request.user = user + + crawl = create_crawl( + request, + CrawlCreateSchema( + urls=["https://example.com"], + max_depth=0, + tags=[], + tags_str="", + label="", + notes="", + config={}, + ), + ) + + assert str(crawl.id) + assert crawl.status == "queued" + assert crawl.retry_at is not None + assert crawl.snapshot_set.count() == 0 diff --git a/archivebox/tests/test_api_v1_machine_binaries.py b/archivebox/tests/test_api_v1_machine_binaries.py new file mode 100644 index 0000000000..9ae80e67be --- /dev/null +++ b/archivebox/tests/test_api_v1_machine_binaries.py @@ -0,0 +1,22 @@ +import pytest + +from archivebox.machine.models import Binary, Machine + + +pytestmark = pytest.mark.django_db(transaction=True) + + +def test_basic_success_case_request(client, tmp_path, api_headers): + machine = Machine.current(refresh=True) + 
Binary.objects.create( + machine=machine, + name="api-basic-bin", + binprovider="env", + abspath="/usr/bin/env", + version="1.0", + status=Binary.StatusChoices.INSTALLED, + ) + + response = client.get("/api/v1/machine/binaries", **api_headers) + + assert response.status_code == 200, response.content diff --git a/archivebox/tests/test_api_v1_machine_binary_binary_id.py b/archivebox/tests/test_api_v1_machine_binary_binary_id.py new file mode 100644 index 0000000000..a8c1d54161 --- /dev/null +++ b/archivebox/tests/test_api_v1_machine_binary_binary_id.py @@ -0,0 +1,22 @@ +import pytest + +from archivebox.machine.models import Binary, Machine + + +pytestmark = pytest.mark.django_db(transaction=True) + + +def test_basic_success_case_request(client, tmp_path, api_headers): + machine = Machine.current(refresh=True) + binary = Binary.objects.create( + machine=machine, + name="api-basic-bin", + binprovider="env", + abspath="/usr/bin/env", + version="1.0", + status=Binary.StatusChoices.INSTALLED, + ) + + response = client.get(f"/api/v1/machine/binary/{binary.id}", **api_headers) + + assert response.status_code == 200, response.content diff --git a/archivebox/tests/test_api_v1_machine_binary_by_name_name.py b/archivebox/tests/test_api_v1_machine_binary_by_name_name.py new file mode 100644 index 0000000000..45744fa2b3 --- /dev/null +++ b/archivebox/tests/test_api_v1_machine_binary_by_name_name.py @@ -0,0 +1,22 @@ +import pytest + +from archivebox.machine.models import Binary, Machine + + +pytestmark = pytest.mark.django_db(transaction=True) + + +def test_basic_success_case_request(client, tmp_path, api_headers): + machine = Machine.current(refresh=True) + Binary.objects.create( + machine=machine, + name="api-basic-bin", + binprovider="env", + abspath="/usr/bin/env", + version="1.0", + status=Binary.StatusChoices.INSTALLED, + ) + + response = client.get("/api/v1/machine/binary/by-name/api-basic-bin", **api_headers) + + assert response.status_code == 200, response.content diff 
--git a/archivebox/tests/test_api_v1_machine_machine_current.py b/archivebox/tests/test_api_v1_machine_machine_current.py new file mode 100644 index 0000000000..ec303476cc --- /dev/null +++ b/archivebox/tests/test_api_v1_machine_machine_current.py @@ -0,0 +1,10 @@ +import pytest + + +pytestmark = pytest.mark.django_db(transaction=True) + + +def test_basic_success_case_request(client, tmp_path, api_headers): + response = client.get("/api/v1/machine/machine/current", **api_headers) + + assert response.status_code == 200, response.content diff --git a/archivebox/tests/test_api_v1_machine_machine_machine_id.py b/archivebox/tests/test_api_v1_machine_machine_machine_id.py new file mode 100644 index 0000000000..db5a5ea7a4 --- /dev/null +++ b/archivebox/tests/test_api_v1_machine_machine_machine_id.py @@ -0,0 +1,14 @@ +import pytest + +from archivebox.machine.models import Machine + + +pytestmark = pytest.mark.django_db(transaction=True) + + +def test_basic_success_case_request(client, tmp_path, api_headers): + machine = Machine.current(refresh=True) + + response = client.get(f"/api/v1/machine/machine/{machine.id}", **api_headers) + + assert response.status_code == 200, response.content diff --git a/archivebox/tests/test_api_v1_machine_machines.py b/archivebox/tests/test_api_v1_machine_machines.py new file mode 100644 index 0000000000..3101c2d169 --- /dev/null +++ b/archivebox/tests/test_api_v1_machine_machines.py @@ -0,0 +1,14 @@ +import pytest + +from archivebox.machine.models import Machine + + +pytestmark = pytest.mark.django_db(transaction=True) + + +def test_basic_success_case_request(client, tmp_path, api_headers): + Machine.current(refresh=True) + + response = client.get("/api/v1/machine/machines", **api_headers) + + assert response.status_code == 200, response.content diff --git a/archivebox/tests/test_api_v1_personas_personas.py b/archivebox/tests/test_api_v1_personas_personas.py new file mode 100644 index 0000000000..04cb74a77d --- /dev/null +++ 
b/archivebox/tests/test_api_v1_personas_personas.py @@ -0,0 +1,57 @@ +import pytest + +from archivebox.personas.models import Persona +from archivebox.tests.conftest import api_client_request + + +pytestmark = pytest.mark.django_db(transaction=True) + + +def test_personas_api_returns_paginated_envelope(client, api_headers): + for name in ("api-persona-alpha", "api-persona-bravo", "api-persona-charlie"): + create_response = api_client_request( + client, + "post", + "/api/v1/personas/sync", + payload={ + "extension_persona_id": f"extension-{name}", + "name": name, + "settings": {}, + "cookies_txt": "", + "auth_json": {}, + }, + headers=api_headers, + ) + assert create_response.status_code == 200, create_response.content + assert create_response.json()["created"] is True + + total_personas = Persona.objects.count() + response = client.get( + "/api/v1/personas/personas?limit=1", + **api_headers, + ) + + assert response.status_code == 200, response.content + payload = response.json() + assert isinstance(payload, dict) + assert set(payload) >= {"items", "count", "total_items", "total_pages", "page", "limit", "offset", "num_items"} + assert payload["count"] == total_personas + assert payload["total_items"] == total_personas + assert payload["limit"] == 1 + assert payload["offset"] == 0 + assert payload["num_items"] == 1 + assert len(payload["items"]) == 1 + + limit_two_response = client.get( + "/api/v1/personas/personas?limit=2", + **api_headers, + ) + assert limit_two_response.status_code == 200, limit_two_response.content + limit_two_payload = limit_two_response.json() + assert isinstance(limit_two_payload, dict) + assert limit_two_payload["count"] == total_personas + assert limit_two_payload["total_items"] == total_personas + assert limit_two_payload["limit"] == 2 + assert limit_two_payload["offset"] == 0 + assert limit_two_payload["num_items"] == 2 + assert len(limit_two_payload["items"]) == 2 diff --git a/archivebox/tests/test_api_v1_personas_sync.py 
b/archivebox/tests/test_api_v1_personas_sync.py new file mode 100644 index 0000000000..cea4d5d223 --- /dev/null +++ b/archivebox/tests/test_api_v1_personas_sync.py @@ -0,0 +1,25 @@ +import pytest + +from archivebox.tests.conftest import api_client_request + + +pytestmark = pytest.mark.django_db(transaction=True) + + +def test_basic_success_case_request(client, api_headers): + response = api_client_request( + client, + "post", + "/api/v1/personas/sync", + payload={ + "extension_persona_id": "extension-api-persona-basic", + "name": "api-persona-basic", + "settings": {}, + "cookies_txt": "", + "auth_json": {}, + }, + headers=api_headers, + ) + + assert response.status_code == 200, response.content + assert response.json()["success"] is True diff --git a/archivebox/tests/test_api_v1_workflow_core_token_auth_side_effects.py b/archivebox/tests/test_api_v1_workflow_core_token_auth_side_effects.py new file mode 100644 index 0000000000..a63c5aeac0 --- /dev/null +++ b/archivebox/tests/test_api_v1_workflow_core_token_auth_side_effects.py @@ -0,0 +1,222 @@ +import time + +import pytest +import requests + +from archivebox.core.models import Snapshot +from archivebox.crawls.models import Crawl +from archivebox.tests.conftest import ( + cli_env, + create_admin_and_token, + get_free_port, + init_archive, + live_api_request, + start_archivebox_server, + stop_server, + wait_for_live_api, +) +from archivebox.tests.test_orm_helpers import use_archivebox_db + + +pytestmark = pytest.mark.django_db(transaction=True) + + +def live_api_request_retrying_sqlite_lock(*args, attempts: int = 12, delay: float = 2.0, **kwargs): + response = None + for _attempt in range(attempts): + response = live_api_request(*args, **kwargs) + if response.status_code != 503 or "database is locked" not in response.text: + return response + time.sleep(delay) + assert response is not None + return response + + +@pytest.mark.timeout(180) +def 
test_core_api_workflow_uses_token_auth_and_persists_side_effects_over_server(tmp_path, recursive_test_site): + init_archive(tmp_path) + + port = get_free_port() + env = cli_env(port=port, server=True, PUBLIC_INDEX="True") + api_token = create_admin_and_token(tmp_path) + + try: + start_archivebox_server(tmp_path, env=env, port=port) + docs = wait_for_live_api(port) + assert docs.status_code == 200 + openapi = wait_for_live_api(port, path="/api/v1/openapi.json") + assert openapi.status_code == 200 + paths = openapi.json()["paths"] + assert "/api/v1/core/snapshots" in paths + assert "/api/v1/crawls/crawls" in paths + + unauth = requests.get( + f"http://127.0.0.1:{port}/api/v1/crawls/crawls", + headers={"Host": f"api.archivebox.localhost:{port}"}, + timeout=10, + ) + assert unauth.status_code in (401, 403) + bad_auth = requests.get( + f"http://127.0.0.1:{port}/api/v1/crawls/crawls", + headers={"Host": f"api.archivebox.localhost:{port}", "X-ArchiveBox-API-Key": "bad-token"}, + timeout=10, + ) + assert bad_auth.status_code in (401, 403) + + crawl_response = live_api_request( + port, + "post", + "/api/v1/crawls/crawls", + api_token=api_token, + json={ + "urls": [recursive_test_site["root_url"]], + "max_depth": 2, + "tags": ["api-depth-two"], + "label": "api crawl", + "notes": "created through REST API", + "config": { + "PLUGINS": "wget,parse_html_urls", + "URL_ALLOWLIST": r"127\.0\.0\.1[:/].*", + "CRAWL_MAX_URLS": 7, + "CRAWL_MAX_SIZE": "0", + "SNAPSHOT_MAX_SIZE": "0", + }, + }, + timeout=10, + ) + assert crawl_response.status_code == 200, crawl_response.text + crawl_payload = crawl_response.json() + crawl_id = crawl_payload["id"] + assert crawl_payload["max_depth"] == 2 + assert crawl_payload["tags_str"] == "api-depth-two" + assert crawl_payload["config"]["PLUGINS"] == "wget,parse_html_urls" + assert crawl_payload["config"]["CRAWL_MAX_URLS"] == 7 + + snapshot_response = live_api_request( + port, + "post", + "/api/v1/core/snapshots", + api_token=api_token, + json={ + 
"url": recursive_test_site["child_urls"][0], + "crawl_id": crawl_id, + "depth": 1, + "title": "API child snapshot", + "tags": ["api-child"], + "status": "queued", + }, + timeout=10, + ) + assert snapshot_response.status_code == 200, snapshot_response.text + snapshot_payload = snapshot_response.json() + snapshot_id = snapshot_payload["id"] + assert snapshot_payload["url"] == recursive_test_site["child_urls"][0] + assert snapshot_payload["tags"] == ["api-child"] + + patch_snapshot = live_api_request_retrying_sqlite_lock( + port, + "patch", + f"/api/v1/core/snapshot/{snapshot_id}", + api_token=api_token, + json={"status": "sealed", "tags": ["api-child", "api-patched"]}, + timeout=10, + ) + assert patch_snapshot.status_code == 200, patch_snapshot.text + assert patch_snapshot.json()["status"] == "sealed" + assert set(patch_snapshot.json()["tags"]) == {"api-child", "api-patched"} + + tag_create = live_api_request_retrying_sqlite_lock( + port, + "post", + "/api/v1/core/tags/create/", + api_token=api_token, + json={"name": "api-extra"}, + timeout=10, + ) + assert tag_create.status_code == 200, tag_create.text + tag_id = tag_create.json()["tag_id"] + + add_tag = live_api_request_retrying_sqlite_lock( + port, + "post", + "/api/v1/core/tags/add-to-snapshot/", + api_token=api_token, + json={"snapshot_id": snapshot_id, "tag_id": tag_id}, + timeout=10, + ) + assert add_tag.status_code == 200, add_tag.text + remove_tag = live_api_request_retrying_sqlite_lock( + port, + "post", + "/api/v1/core/tags/remove-from-snapshot/", + api_token=api_token, + json={"snapshot_id": snapshot_id, "tag_name": "api-extra"}, + timeout=10, + ) + assert remove_tag.status_code == 200, remove_tag.text + + crawl_patch = live_api_request_retrying_sqlite_lock( + port, + "patch", + f"/api/v1/crawls/crawl/{crawl_id}", + api_token=api_token, + json={"status": "sealed", "tags": ["api-sealed"]}, + timeout=10, + ) + assert crawl_patch.status_code == 200, crawl_patch.text + assert crawl_patch.json()["status"] == 
"sealed" + assert crawl_patch.json()["tags_str"] == "api-sealed" + + snapshots_list = live_api_request( + port, + "get", + "/api/v1/core/snapshots?tag=api-patched&with_archiveresults=true", + api_token=api_token, + timeout=10, + ) + assert snapshots_list.status_code == 200, snapshots_list.text + snapshot_items = snapshots_list.json()["items"] + assert len(snapshot_items) == 1 + assert snapshot_items[0]["id"] == snapshot_id + archiveresults = snapshot_items[0]["archiveresults"] + assert {result["plugin"] for result in archiveresults}.issubset({"wget", "parse_html_urls"}) + assert {result["status"] for result in archiveresults}.issubset({"queued"}) + + bearer_response = requests.get( + f"http://127.0.0.1:{port}/api/v1/crawls/crawl/{crawl_id}", + headers={"Host": f"api.archivebox.localhost:{port}", "Authorization": f"Bearer {api_token}"}, + timeout=10, + ) + assert bearer_response.status_code == 200, bearer_response.text + query_response = requests.get( + f"http://127.0.0.1:{port}/api/v1/crawls/crawl/{crawl_id}?api_key={api_token}", + headers={"Host": f"api.archivebox.localhost:{port}"}, + timeout=10, + ) + assert query_response.status_code == 200, query_response.text + + delete_snapshot = live_api_request( + port, + "delete", + f"/api/v1/core/snapshot/{snapshot_id}", + api_token=api_token, + timeout=10, + ) + assert delete_snapshot.status_code == 200, delete_snapshot.text + assert delete_snapshot.json()["success"] is True + + delete_crawl = live_api_request( + port, + "delete", + f"/api/v1/crawls/crawl/{crawl_id}", + api_token=api_token, + timeout=10, + ) + assert delete_crawl.status_code == 200, delete_crawl.text + assert delete_crawl.json()["success"] is True + + with use_archivebox_db(tmp_path): + assert Crawl.objects.filter(pk=crawl_id).count() == 0 + assert Snapshot.objects.filter(pk=snapshot_id).count() == 0 + finally: + stop_server(tmp_path) diff --git a/archivebox/tests/test_api_v1_workflow_frozen_crawl_config_sources.py 
b/archivebox/tests/test_api_v1_workflow_frozen_crawl_config_sources.py new file mode 100644 index 0000000000..9f68f3af75 --- /dev/null +++ b/archivebox/tests/test_api_v1_workflow_frozen_crawl_config_sources.py @@ -0,0 +1,66 @@ +from django.test import RequestFactory + +import pytest + + +pytestmark = pytest.mark.django_db(transaction=True) + + +SENSITIVE_SECRET = "raw-twocaptcha-secret-for-frozen-crawl-test" + + +@pytest.fixture +def archivebox_db(initialized_archive): + from archivebox.tests.test_orm_helpers import use_archivebox_db + + with use_archivebox_db(initialized_archive): + yield initialized_archive + + +def _user(username="frozen-config-admin"): + from django.contrib.auth import get_user_model + + return get_user_model().objects.create_superuser( + username=username, + email=f"{username}@example.com", + password="testpassword", + ) + + +def test_api_create_and_cli_add_store_full_frozen_config(archivebox_db): + from archivebox.api.v1_crawls import CrawlCreateSchema, CrawlSchema, create_crawl + from archivebox.cli.archivebox_add import add + from archivebox.config.common import SENSITIVE_CONFIG_VALUE_REDACTED + + user = _user("frozen-config-api-admin") + request = RequestFactory().post("/api/v1/crawls") + request.user = user + + api_crawl = create_crawl( + request, + CrawlCreateSchema( + urls=["https://example.com/api"], + max_depth=0, + tags=[], + tags_str="", + label="API frozen config", + notes="", + config={"TWOCAPTCHA_API_KEY": SENSITIVE_SECRET, "TIMEOUT": 33, "SECRET_KEY": "must-not-freeze", "PUBLIC_ADD_VIEW": True}, + ), + ) + assert "CHECK_SSL_VALIDITY" in api_crawl.config + assert api_crawl.config["TIMEOUT"] == 33 + assert api_crawl.config["TWOCAPTCHA_API_KEY"] == SENSITIVE_SECRET + assert "SECRET_KEY" not in api_crawl.config + assert "PUBLIC_ADD_VIEW" not in api_crawl.config + assert CrawlSchema.resolve_config(api_crawl)["TWOCAPTCHA_API_KEY"] == SENSITIVE_CONFIG_VALUE_REDACTED + + cli_crawl, _snapshots = add( + "https://example.com/cli", + 
bg=True, + created_by_id=user.pk, + config={"TWOCAPTCHA_API_KEY": SENSITIVE_SECRET, "TIMEOUT": 44}, + ) + assert "CHECK_SSL_VALIDITY" in cli_crawl.config + assert cli_crawl.config["TIMEOUT"] == 44 + assert cli_crawl.config["TWOCAPTCHA_API_KEY"] == SENSITIVE_SECRET diff --git a/archivebox/tests/test_archive_result_service.py b/archivebox/tests/test_archive_result_service.py new file mode 100644 index 0000000000..c310d732b3 --- /dev/null +++ b/archivebox/tests/test_archive_result_service.py @@ -0,0 +1,724 @@ +from pathlib import Path +import pytest + + +from abxpkg.binary_service import BinaryRequestEvent +from abx_dl.events import ArchiveResultEvent, ProcessEvent, ProcessStartedEvent +from abx_dl.orchestrator import create_bus +from abx_dl.output_files import OutputFile + + +pytestmark = pytest.mark.django_db(transaction=True) + + +def _cleanup_machine_process_rows() -> None: + from archivebox.machine.models import Process + + Process.objects.all().delete() + + +def _create_snapshot(): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + + crawl = Crawl( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + ) + crawl.save() + + snapshot = Snapshot( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.STARTED, + ) + snapshot.save() + return snapshot + + +def test_process_completed_projects_inline_archiveresult(): + from archivebox.core.models import ArchiveResult + from archivebox.services.archive_result_service import ArchiveResultService + import asyncio + + snapshot = _create_snapshot() + plugin_dir = Path(snapshot.output_dir) / "wget" + plugin_dir.mkdir(parents=True, exist_ok=True) + (plugin_dir / "index.html").write_text("ok") + + bus = create_bus(name="test_inline_archiveresult") + service = ArchiveResultService(bus) + + event = ArchiveResultEvent( + snapshot_id=str(snapshot.id), + plugin="wget", + 
hook_name="on_Snapshot__06_wget.finite.bg", + status="succeeded", + output_str="wget/index.html", + output_files=[OutputFile(path="index.html", extension="html", mimetype="text/html", size=15)], + start_ts="2026-03-22T12:00:00+00:00", + end_ts="2026-03-22T12:00:01+00:00", + ) + + async def emit_event() -> None: + await service.on_ArchiveResultEvent__save_to_db(event) + + asyncio.run(emit_event()) + + result = ArchiveResult.objects.get(snapshot=snapshot, plugin="wget", hook_name="on_Snapshot__06_wget.finite.bg") + assert result.status == ArchiveResult.StatusChoices.SUCCEEDED + assert result.output_str == "wget/index.html" + assert "index.html" in result.output_files + assert result.output_files["index.html"] == {"extension": "html", "mimetype": "text/html", "size": 15} + assert result.output_size == 15 + _cleanup_machine_process_rows() + + +def test_archiveresult_event_retry_updates_existing_hook_row(): + from archivebox.core.models import ArchiveResult + from archivebox.services.archive_result_service import ArchiveResultService + import asyncio + + snapshot = _create_snapshot() + plugin_dir = Path(snapshot.output_dir) / "wget" + plugin_dir.mkdir(parents=True, exist_ok=True) + (plugin_dir / "index.html").write_text("ok") + + service = ArchiveResultService(create_bus(name="test_archiveresult_retry_updates_existing_hook_row")) + first_event = ArchiveResultEvent( + snapshot_id=str(snapshot.id), + plugin="wget", + hook_name="on_Snapshot__06_wget.finite.bg", + status="failed", + output_str="timed out", + start_ts="2026-03-22T12:00:00+00:00", + end_ts="2026-03-22T12:00:01+00:00", + ) + retry_event = ArchiveResultEvent( + snapshot_id=str(snapshot.id), + plugin="wget", + hook_name="on_Snapshot__06_wget.finite.bg", + status="succeeded", + output_str="wget/index.html", + output_files=[OutputFile(path="index.html", extension="html", mimetype="text/html", size=15)], + start_ts="2026-03-22T12:01:00+00:00", + end_ts="2026-03-22T12:01:01+00:00", + ) + + async def emit_events() -> 
None: + await service.on_ArchiveResultEvent__save_to_db(first_event) + first_result_id = await ArchiveResult.objects.values_list("id", flat=True).aget( + snapshot=snapshot, + plugin="wget", + hook_name="on_Snapshot__06_wget.finite.bg", + ) + await service.on_ArchiveResultEvent__save_to_db(retry_event) + retry_result = await ArchiveResult.objects.aget( + snapshot=snapshot, + plugin="wget", + hook_name="on_Snapshot__06_wget.finite.bg", + ) + assert retry_result.id == first_result_id + assert retry_result.status == ArchiveResult.StatusChoices.SUCCEEDED + assert retry_result.output_str == "wget/index.html" + + asyncio.run(emit_events()) + + assert ArchiveResult.objects.filter(snapshot=snapshot, plugin="wget", hook_name="on_Snapshot__06_wget.finite.bg").count() == 1 + _cleanup_machine_process_rows() + + +def test_archiveresult_duplicate_hook_rows_are_rejected(): + from django.db import IntegrityError, transaction + from archivebox.core.models import ArchiveResult + + snapshot = _create_snapshot() + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="wget", + hook_name="on_Snapshot__06_wget.finite.bg", + status=ArchiveResult.StatusChoices.FAILED, + ) + + with pytest.raises(IntegrityError), transaction.atomic(): + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="wget", + hook_name="on_Snapshot__06_wget.finite.bg", + status=ArchiveResult.StatusChoices.SUCCEEDED, + ) + + +def test_process_completed_projects_synthetic_failed_archiveresult(): + from archivebox.core.models import ArchiveResult + from archivebox.services.archive_result_service import ArchiveResultService + import asyncio + + snapshot = _create_snapshot() + plugin_dir = Path(snapshot.output_dir) / "chrome" + plugin_dir.mkdir(parents=True, exist_ok=True) + + bus = create_bus(name="test_synthetic_archiveresult") + service = ArchiveResultService(bus) + + event = ArchiveResultEvent( + snapshot_id=str(snapshot.id), + plugin="chrome", + hook_name="on_Snapshot__11_chrome_wait", + status="failed", + 
output_str="Hook timed out after 60 seconds", + error="Hook timed out after 60 seconds", + start_ts="2026-03-22T12:00:00+00:00", + end_ts="2026-03-22T12:01:00+00:00", + ) + + async def emit_event() -> None: + await service.on_ArchiveResultEvent__save_to_db(event) + + asyncio.run(emit_event()) + + result = ArchiveResult.objects.get(snapshot=snapshot, plugin="chrome", hook_name="on_Snapshot__11_chrome_wait") + assert result.status == ArchiveResult.StatusChoices.FAILED + assert result.output_str == "Hook timed out after 60 seconds" + assert "Hook timed out" in result.notes + _cleanup_machine_process_rows() + + +def test_failed_title_archiveresult_does_not_overwrite_snapshot_title(): + from archivebox.core.models import ArchiveResult + from archivebox.services.archive_result_service import ArchiveResultService + import asyncio + + snapshot = _create_snapshot() + plugin_dir = Path(snapshot.output_dir) / "title" + plugin_dir.mkdir(parents=True, exist_ok=True) + + bus = create_bus(name="test_failed_title_does_not_update_snapshot") + service = ArchiveResultService(bus) + + event = ArchiveResultEvent( + snapshot_id=str(snapshot.id), + plugin="title", + hook_name="on_Snapshot__54_title.js", + status="failed", + output_str="No Chrome session found (chrome plugin must run first)", + error="No Chrome session found (chrome plugin must run first)", + start_ts="2026-03-22T12:00:00+00:00", + end_ts="2026-03-22T12:00:01+00:00", + ) + + async def emit_event() -> None: + await service.on_ArchiveResultEvent__save_to_db(event) + + asyncio.run(emit_event()) + + result = ArchiveResult.objects.get(snapshot=snapshot, plugin="title", hook_name="on_Snapshot__54_title.js") + assert result.status == ArchiveResult.StatusChoices.FAILED + assert result.output_str == "No Chrome session found (chrome plugin must run first)" + snapshot.refresh_from_db() + assert snapshot.title in (None, "") + assert snapshot.resolved_title == "" + _cleanup_machine_process_rows() + + +def 
test_snapshot_resolved_title_ignores_failed_title_output_str(): + from archivebox.core.models import ArchiveResult + + snapshot = _create_snapshot() + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="title", + hook_name="on_Snapshot__54_title.js", + status=ArchiveResult.StatusChoices.FAILED, + output_str="No Chrome session found (chrome plugin must run first)", + ) + + snapshot.refresh_from_db() + assert snapshot.title in (None, "") + assert snapshot.resolved_title == "" + _cleanup_machine_process_rows() + + +def test_snapshot_title_ignores_noresults_title_output_str(): + from archivebox.core.models import ArchiveResult + from archivebox.services.archive_result_service import ArchiveResultService + import asyncio + + snapshot = _create_snapshot() + plugin_dir = Path(snapshot.output_dir) / "title" + plugin_dir.mkdir(parents=True, exist_ok=True) + + bus = create_bus(name="test_noresults_title_does_not_update_snapshot") + service = ArchiveResultService(bus) + + event = ArchiveResultEvent( + snapshot_id=str(snapshot.id), + plugin="title", + hook_name="on_Snapshot__54_title.js", + status="noresults", + output_str="TimeoutError: Navigation timeout of 54172 ms exceeded", + start_ts="2026-03-22T12:00:00+00:00", + end_ts="2026-03-22T12:00:01+00:00", + ) + + async def emit_event() -> None: + await service.on_ArchiveResultEvent__save_to_db(event) + + asyncio.run(emit_event()) + + result = ArchiveResult.objects.get(snapshot=snapshot, plugin="title", hook_name="on_Snapshot__54_title.js") + assert result.status == ArchiveResult.StatusChoices.NORESULTS + assert result.output_str == "TimeoutError: Navigation timeout of 54172 ms exceeded" + snapshot.refresh_from_db() + assert snapshot.title in (None, "") + assert snapshot.resolved_title == "" + _cleanup_machine_process_rows() + + +def test_snapshot_save_normalizes_url_title_to_none(): + from archivebox.core.models import Snapshot + + snapshot = _create_snapshot() + snapshot.title = snapshot.url + 
snapshot.save(update_fields=["title", "modified_at"]) + + snapshot.refresh_from_db() + assert snapshot.title is None + assert snapshot.resolved_title == "" + + created = Snapshot.objects.create( + url="https://example.com/title-normalize-create", + title="https://example.com/title-normalize-create", + crawl=snapshot.crawl, + ) + + created.refresh_from_db() + assert created.title is None + assert created.resolved_title == "" + _cleanup_machine_process_rows() + + +def test_process_completed_projects_noresults_archiveresult(): + from archivebox.core.models import ArchiveResult + from archivebox.services.archive_result_service import ArchiveResultService + import asyncio + + snapshot = _create_snapshot() + plugin_dir = Path(snapshot.output_dir) / "title" + plugin_dir.mkdir(parents=True, exist_ok=True) + + bus = create_bus(name="test_noresults_archiveresult") + service = ArchiveResultService(bus) + + event = ArchiveResultEvent( + snapshot_id=str(snapshot.id), + plugin="title", + hook_name="on_Snapshot__54_title.js", + status="noresults", + output_str="No title found", + start_ts="2026-03-22T12:00:00+00:00", + end_ts="2026-03-22T12:00:01+00:00", + ) + + async def emit_event() -> None: + await service.on_ArchiveResultEvent__save_to_db(event) + + asyncio.run(emit_event()) + + result = ArchiveResult.objects.get(snapshot=snapshot, plugin="title", hook_name="on_Snapshot__54_title.js") + assert result.status == ArchiveResult.StatusChoices.NORESULTS + assert result.output_str == "No title found" + + +def test_retry_failed_archiveresults_requeues_snapshot_in_queued_state(): + from archivebox.core.models import ArchiveResult, Snapshot + + snapshot = _create_snapshot() + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="chrome", + hook_name="on_Snapshot__11_chrome_wait", + status=ArchiveResult.StatusChoices.FAILED, + output_str="timed out", + output_files={"stderr.log": {}}, + output_size=123, + output_mimetypes="text/plain", + ) + + reset_count = 
snapshot.retry_failed_archiveresults() + + snapshot.refresh_from_db() + result = ArchiveResult.objects.get(snapshot=snapshot, plugin="chrome", hook_name="on_Snapshot__11_chrome_wait") + assert reset_count == 1 + assert snapshot.status == Snapshot.StatusChoices.QUEUED + assert snapshot.retry_at is not None + assert snapshot.current_step == 0 + assert result.status == ArchiveResult.StatusChoices.QUEUED + assert result.output_str == "" + assert result.output_json is None + assert result.output_files == {} + assert result.output_size == 0 + assert result.output_mimetypes == "" + assert result.start_ts is None + assert result.end_ts is None + snapshot.refresh_from_db() + assert snapshot.title in (None, "") + _cleanup_machine_process_rows() + + +def test_retry_failed_archiveresults_preserves_legacy_plugin_rows_without_hook_name(): + from archivebox.core.models import ArchiveResult, Snapshot + + snapshot = _create_snapshot() + legacy_result = ArchiveResult.objects.create( + snapshot=snapshot, + plugin="wget", + hook_name="", + status=ArchiveResult.StatusChoices.FAILED, + output_str="legacy failure", + output_files={"index.html": {"size": 123}}, + output_size=123, + output_mimetypes="text/html", + ) + hook_result = ArchiveResult.objects.create( + snapshot=snapshot, + plugin="wget", + hook_name="on_Snapshot__06_wget.finite.bg", + status=ArchiveResult.StatusChoices.FAILED, + output_str="hook failure", + output_files={"stderr.log": {}}, + output_size=10, + output_mimetypes="text/plain", + ) + + reset_count = snapshot.retry_failed_archiveresults() + + snapshot.refresh_from_db() + snapshot.crawl.refresh_from_db() + legacy_result.refresh_from_db() + hook_result.refresh_from_db() + + assert reset_count == 2 + assert snapshot.status == Snapshot.StatusChoices.QUEUED + assert snapshot.retry_at is not None + assert snapshot.crawl.status == snapshot.crawl.StatusChoices.QUEUED + assert snapshot.crawl.retry_at is not None + assert legacy_result.status == 
ArchiveResult.StatusChoices.FAILED + assert legacy_result.output_str == "legacy failure" + assert legacy_result.output_files == {"index.html": {"size": 123}} + assert legacy_result.output_size == 123 + assert hook_result.status == ArchiveResult.StatusChoices.QUEUED + assert hook_result.output_str == "" + assert hook_result.output_files == {} + assert hook_result.output_size == 0 + _cleanup_machine_process_rows() + + +def test_process_completed_projects_snapshot_title_from_output_str(): + from archivebox.services.archive_result_service import ArchiveResultService + import asyncio + + snapshot = _create_snapshot() + plugin_dir = Path(snapshot.output_dir) / "title" + plugin_dir.mkdir(parents=True, exist_ok=True) + + bus = create_bus(name="test_snapshot_title_output_str") + service = ArchiveResultService(bus) + + event = ArchiveResultEvent( + snapshot_id=str(snapshot.id), + plugin="title", + hook_name="on_Snapshot__54_title.js", + status="succeeded", + output_str="Example Domain", + start_ts="2026-03-22T12:00:00+00:00", + end_ts="2026-03-22T12:00:01+00:00", + ) + + async def emit_event() -> None: + await service.on_ArchiveResultEvent__save_to_db(event) + + asyncio.run(emit_event()) + + snapshot.refresh_from_db() + assert snapshot.title == "Example Domain" + _cleanup_machine_process_rows() + + +def test_process_completed_projects_snapshot_title_from_title_file(): + from archivebox.services.archive_result_service import ArchiveResultService + import asyncio + + snapshot = _create_snapshot() + plugin_dir = Path(snapshot.output_dir) / "title" + plugin_dir.mkdir(parents=True, exist_ok=True) + (plugin_dir / "title.txt").write_text("Example Domain") + + bus = create_bus(name="test_snapshot_title_file") + service = ArchiveResultService(bus) + + event = ArchiveResultEvent( + snapshot_id=str(snapshot.id), + plugin="title", + hook_name="on_Snapshot__54_title.js", + status="noresults", + output_str="No title found", + output_files=[OutputFile(path="title.txt", extension="txt", 
mimetype="text/plain", size=14)], + start_ts="2026-03-22T12:00:00+00:00", + end_ts="2026-03-22T12:00:01+00:00", + ) + + async def emit_event() -> None: + await service.on_ArchiveResultEvent__save_to_db(event) + + asyncio.run(emit_event()) + + snapshot.refresh_from_db() + assert snapshot.title == "Example Domain" + _cleanup_machine_process_rows() + + +def test_snapshot_resolved_title_falls_back_to_title_file_without_db_title(): + from archivebox.core.models import ArchiveResult + + snapshot = _create_snapshot() + plugin_dir = Path(snapshot.output_dir) / "title" + plugin_dir.mkdir(parents=True, exist_ok=True) + (plugin_dir / "title.txt").write_text("Example Domain") + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="title", + hook_name="on_Snapshot__54_title.js", + status="noresults", + output_str="No title found", + output_files={"title.txt": {}}, + ) + + snapshot.refresh_from_db() + assert snapshot.title in (None, "") + assert snapshot.resolved_title == "Example Domain" + _cleanup_machine_process_rows() + + +def test_collect_output_metadata_preserves_file_metadata(): + from archivebox.services.archive_result_service import _resolve_output_metadata + + output_files, output_size, output_mimetypes = _resolve_output_metadata( + [OutputFile(path="index.html", extension="html", mimetype="text/html", size=42)], + Path("/tmp/does-not-need-to-exist"), + ) + + assert output_files == { + "index.html": { + "extension": "html", + "mimetype": "text/html", + "size": 42, + }, + } + assert output_size == 42 + assert output_mimetypes == "text/html" + + +def test_collect_output_metadata_detects_warc_gz_mimetype(tmp_path): + from archivebox.services.archive_result_service import _collect_output_metadata + + plugin_dir = tmp_path / "wget" + warc_file = plugin_dir / "warc" / "capture.warc.gz" + warc_file.parent.mkdir(parents=True, exist_ok=True) + warc_file.write_bytes(b"warc-bytes") + + output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir) + + 
assert output_files["warc/capture.warc.gz"] == { + "extension": "gz", + "mimetype": "application/warc", + "size": 10, + } + assert output_size == 10 + assert output_mimetypes == "application/warc" + + +@pytest.mark.django_db(transaction=True) +def test_process_started_hydrates_binary_and_iface_from_existing_binary_records(tmp_path): + from archivebox.machine.models import Binary, NetworkInterface + from archivebox.machine.models import Process as MachineProcess + from archivebox.services.process_service import ProcessService as ArchiveBoxProcessService + from abx_dl.services.process_service import ProcessService as DlProcessService + + iface = NetworkInterface.current() + machine = iface.machine + + binary = Binary.objects.create( + machine=machine, + name="postlight-parser", + abspath="/tmp/postlight-parser", + version="2.2.3", + binprovider="npm", + binproviders="npm", + status=Binary.StatusChoices.INSTALLED, + ) + + hook_path = tmp_path / "on_Snapshot__57_mercury.py" + hook_path.write_text("#!/bin/bash\nexit 0\n", encoding="utf-8") + hook_path.chmod(0o755) + output_dir = tmp_path / "mercury" + output_dir.mkdir() + + bus = create_bus(name="test_process_started_binary_hydration") + DlProcessService(bus, emit_jsonl=False, interactive_tty=False) + ArchiveBoxProcessService(bus) + + async def run_test() -> None: + await bus.emit( + ProcessEvent( + plugin_name="mercury", + hook_name="on_Snapshot__57_mercury.py", + hook_path=str(hook_path), + hook_args=["--url=https://example.com"], + is_background=False, + output_dir=str(output_dir), + env={ + "MERCURY_BINARY": binary.abspath, + "NODE_BINARY": "/tmp/node", + }, + timeout=60, + url="https://example.com", + ), + ).now() + started = await bus.find( + ProcessStartedEvent, + past=True, + future=False, + hook_name="on_Snapshot__57_mercury.py", + output_dir=str(output_dir), + ) + assert started is not None + await started.wait() + await started.event_results_list() + + import asyncio + + asyncio.run(run_test()) + + process = 
MachineProcess.objects.get( + pwd=str(output_dir), + cmd=[str(hook_path), "--url=https://example.com"], + ) + assert process.binary_id == binary.id + assert process.iface_id == iface.id + + +@pytest.mark.django_db(transaction=True) +def test_process_started_uses_node_binary_for_js_hooks_without_plugin_binary(tmp_path): + from archivebox.machine.models import Binary, NetworkInterface + from archivebox.machine.models import Process as MachineProcess + from archivebox.services.process_service import ProcessService as ArchiveBoxProcessService + from abx_dl.services.process_service import ProcessService as DlProcessService + + iface = NetworkInterface.current() + machine = iface.machine + + node = Binary.objects.create( + machine=machine, + name="node", + abspath="/tmp/node", + version="22.0.0", + binprovider="env", + binproviders="env", + status=Binary.StatusChoices.INSTALLED, + ) + + hook_path = tmp_path / "on_Snapshot__75_parse_dom_outlinks.js" + hook_path.write_text("#!/bin/bash\nexit 0\n", encoding="utf-8") + hook_path.chmod(0o755) + output_dir = tmp_path / "parse-dom-outlinks" + output_dir.mkdir() + + bus = create_bus(name="test_process_started_node_fallback") + DlProcessService(bus, emit_jsonl=False, interactive_tty=False) + ArchiveBoxProcessService(bus) + + async def run_test() -> None: + await bus.emit( + ProcessEvent( + plugin_name="parse_dom_outlinks", + hook_name="on_Snapshot__75_parse_dom_outlinks.js", + hook_path=str(hook_path), + hook_args=["--url=https://example.com"], + is_background=False, + output_dir=str(output_dir), + env={"NODE_BINARY": node.abspath}, + timeout=60, + url="https://example.com", + ), + ).now() + started = await bus.find( + ProcessStartedEvent, + past=True, + future=False, + hook_name="on_Snapshot__75_parse_dom_outlinks.js", + output_dir=str(output_dir), + ) + assert started is not None + await started.wait() + await started.event_results_list() + + import asyncio + + asyncio.run(run_test()) + + process = MachineProcess.objects.get( + 
pwd=str(output_dir), + cmd=[str(hook_path), "--url=https://example.com"], + ) + assert process.binary_id == node.id + assert process.iface_id == iface.id + + +def test_binary_event_reuses_existing_installed_binary_row(): + from archivebox.machine.models import Binary, Machine + from archivebox.services.binary_service import ArchiveBoxDBBinaryCacheBackend + from abxpkg import PROVIDER_CLASS_BY_NAME + from abxpkg.binary_service import BinaryCacheService, BinaryService + import asyncio + + machine = Machine.current() + wget_path = PROVIDER_CLASS_BY_NAME["env"]().get_abspath("wget", quiet=True, no_cache=True) + assert wget_path + + binary = Binary.objects.create( + machine=machine, + name="wget", + abspath=str(wget_path), + version="9.9.9", + binprovider="env", + binproviders="env,apt,brew", + status=Binary.StatusChoices.INSTALLED, + ) + + bus = create_bus(name="test_binary_event_reuses_existing_installed_binary_row") + BinaryCacheService(bus, backend=ArchiveBoxDBBinaryCacheBackend()) + BinaryService(bus) + event = BinaryRequestEvent( + name="wget", + binproviders=binary.binproviders, + extra_context={ + "plugin_name": "wget", + "output_dir": "/tmp/wget", + }, + ) + + async def run_event(): + await bus.emit(event).now() + await bus.wait_until_idle() + + asyncio.run(run_event()) + + binary.refresh_from_db() + assert Binary.objects.filter(machine=machine, name="wget").count() == 1 + assert binary.status == Binary.StatusChoices.INSTALLED + assert binary.abspath == str(wget_path) + assert binary.version == "9.9.9" + assert binary.binprovider == "env" + assert binary.binproviders == "env,apt,brew" diff --git a/archivebox/tests/test_archiveresult_pause.py b/archivebox/tests/test_archiveresult_pause.py new file mode 100644 index 0000000000..d197406019 --- /dev/null +++ b/archivebox/tests/test_archiveresult_pause.py @@ -0,0 +1,2 @@ +# test_crawl_pause_resume_api_cascades_archiveresults_and_leaves_finished_snapshot_results_alone moved to test_api_v1_crawls_crawl_crawl_id.py. 
+# test_targeted_extract_retries_one_failed_archiveresult_while_snapshot_stays_paused moved to test_api_v1_core_snapshot_snapshot_id.py. diff --git a/archivebox/tests/test_auth_ldap.py b/archivebox/tests/test_auth_ldap.py new file mode 100644 index 0000000000..dc143e6d6c --- /dev/null +++ b/archivebox/tests/test_auth_ldap.py @@ -0,0 +1,189 @@ +""" +LDAP authentication tests for ArchiveBox. + +Tests LDAP configuration, validation, and integration with Django. +Per CLAUDE.md: NO MOCKS, NO SKIPS - all tests use real code paths. +""" + +from importlib.util import find_spec + +from archivebox.tests.conftest import run_archivebox_cmd + + +class TestLDAPConfig: + """Test LDAP configuration loading and validation.""" + + def test_ldap_config_defaults(self): + """Test that LDAP config loads with correct defaults.""" + from archivebox.config.common import get_config + + # Check default values + config = get_config() + assert not config.LDAP_ENABLED + assert config.LDAP_SERVER_URI is None + assert config.LDAP_BIND_DN is None + assert config.LDAP_BIND_PASSWORD is None + assert config.LDAP_USER_BASE is None + assert config.LDAP_USER_FILTER == "(uid=%(user)s)" + assert config.LDAP_USERNAME_ATTR == "username" + assert config.LDAP_FIRSTNAME_ATTR == "givenName" + assert config.LDAP_LASTNAME_ATTR == "sn" + assert config.LDAP_EMAIL_ATTR == "mail" + assert not config.LDAP_CREATE_SUPERUSER + + def test_ldap_config_validation_disabled(self): + """Test that validation passes when LDAP is disabled.""" + from archivebox.config.ldap import LDAPConfig + + config = LDAPConfig(LDAP_ENABLED=False) + is_valid, error_msg = config.validate_ldap_config() + + assert is_valid + assert error_msg == "" + + def test_ldap_config_validation_missing_fields(self): + """Test that validation fails when required fields are missing.""" + from archivebox.config.ldap import LDAPConfig + + # Enable LDAP but don't provide required fields + config = LDAPConfig(LDAP_ENABLED=True) + is_valid, error_msg = 
config.validate_ldap_config() + + assert not is_valid + assert "LDAP_* config options must all be set" in error_msg + assert "LDAP_SERVER_URI" in error_msg + assert "LDAP_BIND_DN" in error_msg + assert "LDAP_BIND_PASSWORD" in error_msg + assert "LDAP_USER_BASE" in error_msg + + def test_ldap_config_validation_complete(self): + """Test that validation passes when all required fields are provided.""" + from archivebox.config.ldap import LDAPConfig + + config = LDAPConfig( + LDAP_ENABLED=True, + LDAP_SERVER_URI="ldap://ldap-test.localhost:389", + LDAP_BIND_DN="cn=admin,dc=example,dc=com", + LDAP_BIND_PASSWORD="password", + LDAP_USER_BASE="ou=users,dc=example,dc=com", + ) + is_valid, error_msg = config.validate_ldap_config() + + assert is_valid + assert error_msg == "" + + def test_ldap_config_in_get_config(self): + """Test that LDAP_CONFIG is included in the typed config sections.""" + from archivebox.config.common import get_all_configs + + all_config = get_all_configs() + assert "LDAP_CONFIG" in all_config + assert all_config["LDAP_CONFIG"].__class__.__name__ == "LDAPConfig" + + +class TestLDAPIntegration: + """Test LDAP integration with Django settings.""" + + def test_django_settings_without_ldap_enabled(self): + """Test that Django settings work correctly when LDAP is disabled.""" + # Import Django settings (LDAP_ENABLED should be False by default) + from django.conf import settings + + # Should have default authentication backends + assert "django.contrib.auth.backends.RemoteUserBackend" in settings.AUTHENTICATION_BACKENDS + assert "django.contrib.auth.backends.ModelBackend" in settings.AUTHENTICATION_BACKENDS + + # LDAP backend should not be present when disabled + ldap_backends = [b for b in settings.AUTHENTICATION_BACKENDS if "ldap" in b.lower()] + assert len(ldap_backends) == 0, "LDAP backend should not be present when LDAP_ENABLED=False" + + def test_django_settings_with_ldap_library_check(self): + """Test that Django settings check for LDAP libraries when 
enabled.""" + ldap_available = find_spec("django_auth_ldap") is not None and find_spec("ldap") is not None + + # If LDAP libraries are not available, settings should handle gracefully + if not ldap_available: + # Settings should have loaded without LDAP backend + from django.conf import settings + + ldap_backends = [b for b in settings.AUTHENTICATION_BACKENDS if "ldap" in b.lower()] + assert len(ldap_backends) == 0, "LDAP backend should not be present when libraries unavailable" + + +class TestLDAPAuthBackend: + """Test custom LDAP authentication backend.""" + + def test_ldap_backend_class_exists(self): + """Test that ArchiveBoxLDAPBackend class is defined.""" + from archivebox.ldap.auth import ArchiveBoxLDAPBackend + + assert ArchiveBoxLDAPBackend.authenticate_ldap_user is not None + + def test_ldap_backend_inherits_correctly(self): + """Test that ArchiveBoxLDAPBackend has correct inheritance.""" + from archivebox.ldap.auth import ArchiveBoxLDAPBackend + + # Should have authenticate_ldap_user method (from base or overridden) + assert callable(ArchiveBoxLDAPBackend.authenticate_ldap_user) + + +class TestArchiveBoxWithLDAP: + """Test ArchiveBox commands with LDAP configuration.""" + + def test_archivebox_init_without_ldap(self, tmp_path): + """Test that archivebox init works without LDAP enabled.""" + _cmd_result = run_archivebox_cmd( + ["init"], + cwd=tmp_path, + timeout=45, + env={"LDAP_ENABLED": "False"}, + default_cli_env=True, + disable_extractors=True, + ) + _, stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + # Should succeed + assert code == 0, f"archivebox init failed: {stderr}" + + def test_archivebox_version_with_ldap_config(self, tmp_path): + """Test that archivebox version works with LDAP config set.""" + _cmd_result = run_archivebox_cmd( + ["version"], + cwd=tmp_path, + timeout=10, + env={ + "LDAP_ENABLED": "False", + "LDAP_SERVER_URI": "ldap://ldap-test.localhost:389", + }, + default_cli_env=True, + 
disable_extractors=True, + ) + _, stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + # Should succeed + assert code == 0, f"archivebox version failed: {stderr}" + + +class TestLDAPConfigValidationInArchiveBox: + """Test LDAP config validation when running ArchiveBox commands.""" + + def test_archivebox_init_with_incomplete_ldap_config(self, tmp_path): + """Test that archivebox init fails with helpful error when LDAP config is incomplete.""" + _cmd_result = run_archivebox_cmd( + ["init"], + cwd=tmp_path, + timeout=45, + env={ + "LDAP_ENABLED": "True", + # Missing: LDAP_SERVER_URI, LDAP_BIND_DN, etc. + }, + default_cli_env=True, + disable_extractors=True, + ) + _, stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + # Should fail with validation error + assert code != 0, "Should fail with incomplete LDAP config" + + # Check error message + assert "LDAP_* config options must all be set" in stderr, f"Expected validation error message in: {stderr}" diff --git a/archivebox/tests/test_binary_service.py b/archivebox/tests/test_binary_service.py new file mode 100644 index 0000000000..62dfad26ac --- /dev/null +++ b/archivebox/tests/test_binary_service.py @@ -0,0 +1,260 @@ +import json +import os +import shutil +import uuid +import asyncio +from pathlib import Path + +import pytest + +from archivebox.machine.models import Binary, Machine, Process +from archivebox.tests.conftest import parse_jsonl_output, run_archivebox_cmd +from archivebox.tests.test_orm_helpers import use_archivebox_db + +pytestmark = pytest.mark.django_db(transaction=True) + + +def _link_real_binary(bin_dir: Path, name: str, *, source: str | None = None) -> Path: + bin_dir.mkdir(parents=True, exist_ok=True) + source_path = shutil.which(source or name) + assert source_path, f"{source or name} must be installed for this integration test" + link = bin_dir / name + link.unlink(missing_ok=True) + link.symlink_to(source_path) + return link + + +def 
_runtime_env(data_dir: Path, bin_dir: Path, *, lib_dir: Path | None = None) -> dict[str, str]: + archivebox_bin = shutil.which("archivebox") + assert archivebox_bin, "archivebox console script must be available for CLI tests" + lib_dir = lib_dir or data_dir / "lib" + return { + "LIB_DIR": str(lib_dir), + "ABXPKG_LIB_DIR": str(lib_dir), + "PATH": os.pathsep.join([str(bin_dir), str(Path(archivebox_bin).parent), "/usr/bin", "/bin", "/usr/sbin", "/sbin"]), + } + + +def test_binary_request_preserves_raw_overrides_in_db_while_using_native_event(): + from abxpkg.binary_service import BinaryCacheService, BinaryEvent, BinaryRequestEvent, BinaryService + from abx_dl.orchestrator import create_bus + from archivebox.services.binary_service import ArchiveBoxDBBinaryCacheBackend + + machine = Machine.current() + raw_overrides = { + "pip": { + "install_args": ["imagesize>=2.0.0"], + "module_name": "imagesize", + }, + } + binary = Binary.objects.create( + machine=machine, + name="sh", + abspath="/bin/sh", + version="1.0.0", + binprovider="env", + binproviders="env,pip", + overrides=raw_overrides, + status=Binary.StatusChoices.INSTALLED, + ) + bus = create_bus(name=f"test_binary_raw_overrides_{uuid.uuid4().hex[:8]}") + BinaryCacheService(bus, backend=ArchiveBoxDBBinaryCacheBackend()) + BinaryService(bus) + binary_events: list[BinaryEvent] = [] + + async def on_BinaryEvent(event: BinaryEvent) -> None: + binary_events.append(event) + + bus.on(BinaryEvent, on_BinaryEvent) + + async def run_event() -> None: + await bus.emit( + BinaryRequestEvent( + name="sh", + binproviders="env,pip", + overrides={"pip": {"install_args": ["imagesize>=2.0.0"]}}, + extra_context={ + "raw_overrides": raw_overrides, + "provider_metadata": {"pip": {"module_name": "imagesize"}}, + }, + ), + ).now() + await bus.wait_until_idle() + + asyncio.run(run_event()) + + binary.refresh_from_db() + assert binary.status == Binary.StatusChoices.INSTALLED + assert binary.overrides == raw_overrides + assert binary_events + 
assert binary_events[-1].overrides == {"pip": {"install_args": ["imagesize>=2.0.0"]}} + assert binary_events[-1].extra_context["raw_overrides"] == raw_overrides + + +def test_binary_request_installs_env_binary_and_recovers_stale_cache(initialized_archive, tmp_path): + name = f"abx-e2e-rg-{uuid.uuid4().hex[:8]}" + bootstrap_bin_dir = tmp_path / "realbin" + provider_bin_dir = initialized_archive / "lib" / "env" / "bin" + _link_real_binary(bootstrap_bin_dir, "uv") + _link_real_binary(provider_bin_dir, name, source="rg") + + _cmd_result = run_archivebox_cmd( + ["run"], + cwd=initialized_archive, + stdin=json.dumps({"type": "BinaryRequest", "name": name, "binproviders": "env"}) + "\n", + timeout=120, + env=_runtime_env(initialized_archive, bootstrap_bin_dir), + default_cli_env=True, + disable_extractors=True, + ) + stdout, stderr, returncode = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert returncode == 0, stderr + output_records = parse_jsonl_output(stdout) + assert any(record["type"] == "BinaryRequest" and record["name"] == name for record in output_records) + + with use_archivebox_db(initialized_archive): + binary = Binary.objects.get(name=name) + machine_id = str(binary.machine_id) + first_binary_id = str(binary.id) + first_abspath = Path(binary.abspath) + binary_processes = list(Process.objects.filter(process_type=Process.TypeChoices.BINARY).order_by("created_at")) + + assert binary.status == Binary.StatusChoices.INSTALLED + assert binary.version + assert binary.binprovider == "env" + assert binary.binproviders == "env" + assert first_abspath.exists() + assert first_abspath == provider_bin_dir / name + assert first_abspath.resolve() == Path(shutil.which("rg") or "").resolve() + assert first_abspath.is_relative_to(initialized_archive / "lib") + assert (initialized_archive / "lib" / "env" / "bin" / name).exists() + assert (initialized_archive / "machines" / machine_id / "binaries" / name / "index.jsonl").exists() + assert binary_processes + 
assert binary_processes[-1].status == Process.StatusChoices.EXITED + assert binary_processes[-1].exit_code == 0 + assert binary_processes[-1].ended_at is not None + assert binary_processes[-1].started_at < binary_processes[-1].ended_at + assert any(f"--name={name}" in arg for arg in binary_processes[-1].cmd) + + _cmd_result = run_archivebox_cmd( + ["version"], + cwd=initialized_archive, + timeout=60, + env=_runtime_env(initialized_archive, bootstrap_bin_dir), + default_cli_env=True, + disable_extractors=True, + ) + version_stdout, version_stderr, version_code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + assert version_code == 0, version_stderr + assert name in version_stdout + assert binary.version in version_stdout + + first_abspath.unlink() + _link_real_binary(bootstrap_bin_dir, name, source="rg") + + _cmd_result = run_archivebox_cmd( + ["run", f"--binary-id={first_binary_id}"], + cwd=initialized_archive, + timeout=120, + env=_runtime_env(initialized_archive, bootstrap_bin_dir), + default_cli_env=True, + disable_extractors=True, + ) + rerun_stdout, rerun_stderr, rerun_code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert rerun_code == 0, rerun_stdout + rerun_stderr + with use_archivebox_db(initialized_archive): + recovered = Binary.objects.get(pk=first_binary_id) + process_count = Process.objects.filter(process_type=Process.TypeChoices.BINARY).count() + + assert recovered.status == Binary.StatusChoices.INSTALLED + assert recovered.version == binary.version + assert Path(recovered.abspath).exists() + assert Path(recovered.abspath).resolve() == Path(shutil.which("rg") or "").resolve() + assert process_count >= 2 + + changed_lib_dir = tmp_path / "changed-lib" + changed_provider_bin_dir = changed_lib_dir / "env" / "bin" + _link_real_binary(changed_provider_bin_dir, name, source="rg") + + _cmd_result = run_archivebox_cmd( + ["run"], + cwd=initialized_archive, + stdin=json.dumps({"type": "BinaryRequest", "name": name, 
"binproviders": "env"}) + "\n", + timeout=120, + env=_runtime_env(initialized_archive, bootstrap_bin_dir, lib_dir=changed_lib_dir), + default_cli_env=True, + disable_extractors=True, + ) + relib_stdout, relib_stderr, relib_code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert relib_code == 0, relib_stdout + relib_stderr + with use_archivebox_db(initialized_archive): + relibbed = Binary.objects.get(pk=first_binary_id) + + assert relibbed.status == Binary.StatusChoices.INSTALLED + assert relibbed.version == binary.version + assert Path(relibbed.abspath) == changed_provider_bin_dir / name + assert Path(relibbed.abspath).exists() + assert Path(relibbed.abspath).resolve() == Path(shutil.which("rg") or "").resolve() + + +def test_missing_binary_request_stays_queued_then_recovers_when_provider_can_resolve(initialized_archive, tmp_path): + name = f"abx-missing-rg-{uuid.uuid4().hex[:8]}" + bootstrap_bin_dir = tmp_path / "realbin" + provider_bin_dir = initialized_archive / "lib" / "env" / "bin" + _link_real_binary(bootstrap_bin_dir, "uv") + + _cmd_result = run_archivebox_cmd( + ["run"], + cwd=initialized_archive, + stdin=json.dumps({"type": "BinaryRequest", "name": name, "binproviders": "env"}) + "\n", + timeout=120, + env=_runtime_env(initialized_archive, bootstrap_bin_dir), + default_cli_env=True, + disable_extractors=True, + ) + stdout, stderr, returncode = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert returncode == 0, stderr + assert any(record["type"] == "BinaryRequest" and record["name"] == name for record in parse_jsonl_output(stdout)) + + with use_archivebox_db(initialized_archive): + queued = Binary.objects.get(name=name) + queued_id = str(queued.id) + failed_process = Process.objects.filter(process_type=Process.TypeChoices.BINARY).latest("created_at") + machine_config = Machine.objects.get(pk=queued.machine_id).config or {} + + assert queued.status == Binary.StatusChoices.QUEUED + assert queued.abspath == "" + 
assert queued.retry_at is not None + assert failed_process.status == Process.StatusChoices.EXITED + assert failed_process.exit_code == 1 + assert f"{name.upper().replace('-', '_')}_BINARY" not in machine_config + assert not (provider_bin_dir / name).exists() + + _link_real_binary(provider_bin_dir, name, source="rg") + + _cmd_result = run_archivebox_cmd( + ["run", f"--binary-id={queued_id}"], + cwd=initialized_archive, + timeout=120, + env=_runtime_env(initialized_archive, bootstrap_bin_dir), + default_cli_env=True, + disable_extractors=True, + ) + recover_stdout, recover_stderr, recover_code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert recover_code == 0, recover_stdout + recover_stderr + with use_archivebox_db(initialized_archive): + recovered = Binary.objects.get(pk=queued_id) + process_exit_codes = list( + Process.objects.filter(process_type=Process.TypeChoices.BINARY).order_by("created_at").values_list("exit_code", flat=True), + ) + + assert recovered.status == Binary.StatusChoices.INSTALLED + assert recovered.version + assert Path(recovered.abspath).exists() + assert Path(recovered.abspath) == provider_bin_dir / name + assert process_exit_codes[-2:] == [1, 0] diff --git a/archivebox/tests/test_cli_add.py b/archivebox/tests/test_cli_add.py new file mode 100644 index 0000000000..6de356f478 --- /dev/null +++ b/archivebox/tests/test_cli_add.py @@ -0,0 +1,1146 @@ +#!/usr/bin/env python3 +""" +Comprehensive tests for archivebox add command. +Verify add creates snapshots in DB, crawls, source files, and archive directories. 
+""" + +import os +import json +from pathlib import Path + +import pytest +from django.db import connection +from django.utils import timezone + +from archivebox.core.models import ArchiveResult, Snapshot +from archivebox.crawls.models import Crawl +from archivebox.machine.models import Process +from archivebox.tests.conftest import ( + _find_system_browser, + cli_env, + find_snapshot_dir, + get_free_port, + run_archivebox_cmd, + run_queued_crawls, + start_archivebox_server, + stop_server, +) + +from archivebox.tests.test_orm_helpers import use_archivebox_db + +pytestmark = pytest.mark.django_db(transaction=True) + + +IMPORT_FORMAT_EXPECTATIONS = { + "rss": { + "url": "https://example.com/", + "title": "RSS Example Import", + "date": "2024-01-01", + "tags": {"rss-tag", "metadata"}, + }, + "netscape": { + "url": "https://www.iana.org/domains/reserved", + "title": "IANA Reserved Domains", + "date": "2024-01-02", + "tags": {"netscape-tag", "metadata"}, + }, + "dom": { + "url": "https://www.iana.org/help/example-domains", + }, + "json": { + "url": "https://example.com/?archivebox-json-import=1", + "title": "JSON Import Example", + "date": "2024-01-03", + "tags": {"json-tag", "metadata"}, + }, + "jsonl": { + "url": "https://example.com/?archivebox-jsonl-import=1", + "title": "JSONL Import Example", + "date": "2024-01-04", + "tags": {"jsonl-tag", "metadata"}, + }, + "txt": { + "url": "https://example.org/", + }, +} + + +def write_import_format_files(base_dir: Path) -> dict[str, Path]: + files = { + "rss": base_dir / "test_rss.xml", + "netscape": base_dir / "test_netscape.html", + "dom": base_dir / "test_dom.html", + "json": base_dir / "test_bookmarks.json", + "jsonl": base_dir / "test_bookmarks.jsonl", + "txt": base_dir / "test_urls.txt", + } + files["rss"].write_text( + """ + + + ArchiveBox RSS import fixture + https://example.com/ + ArchiveBox RSS import fixture + + RSS Example Import + https://example.com/ + https://example.com/ + Mon, 01 Jan 2024 00:00:00 GMT + 
rss-tag + metadata + + + +""", + encoding="utf-8", + ) + files["netscape"].write_text( + """ + +Bookmarks +

        Bookmarks

        +

        +

        IANA Reserved Domains +

        +""", + encoding="utf-8", + ) + files["dom"].write_text( + """ + + DOM import fixture + + IANA Example Domains + + +""", + encoding="utf-8", + ) + files["json"].write_text( + json.dumps( + { + "url": "https://example.com/?archivebox-json-import=1", + "title": "JSON Import Example", + "tags": ["json-tag", "metadata"], + "bookmarked_at": "2024-01-03T00:00:00+00:00", + }, + ) + + "\n", + encoding="utf-8", + ) + files["jsonl"].write_text( + json.dumps( + { + "url": "https://example.com/?archivebox-jsonl-import=1", + "title": "JSONL Import Example", + "tags": "jsonl-tag,metadata", + "bookmarked_at": "2024-01-04T00:00:00+00:00", + }, + ) + + "\n", + encoding="utf-8", + ) + files["txt"].write_text( + "Plain text import fixture containing https://example.org/ as a real live URL.\n", + encoding="utf-8", + ) + return files + + +IMPORT_FORMAT_ENV = { + "USE_COLOR": "False", + "SHOW_PROGRESS": "False", + "PLUGINS": "parse_html_urls,parse_jsonl_urls,parse_netscape_urls,parse_rss_urls,parse_txt_urls", + "SAVE_WGET": "False", + "SAVE_HEADERS": "False", + "USE_CHROME": "False", + "URL_ALLOWLIST": r"example\.com|example\.org|iana\.org|www\.iana\.org", +} + + +def wait_for_import_processing(cwd: Path, expected_urls: set[str], *, timeout: float = 120.0) -> None: + import time + + deadline = time.time() + timeout + while time.time() < deadline: + with use_archivebox_db(cwd): + snapshot_started = Snapshot.objects.filter(url__in=expected_urls).exists() + if snapshot_started: + return + time.sleep(1) + raise AssertionError("timed out waiting for import crawl processing to start") + + +def wait_for_expected_import_snapshots(cwd: Path, expected_urls: set[str], *, timeout: float = 180.0) -> None: + import time + + allowed_statuses = {Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED, Snapshot.StatusChoices.SEALED} + deadline = time.time() + timeout + while time.time() < deadline: + with use_archivebox_db(cwd): + rows = 
list(Snapshot.objects.filter(url__in=expected_urls).values_list("url", "status")) + counts = {url: 0 for url in expected_urls} + bad_statuses = [] + for url, status in rows: + counts[url] += 1 + if status not in allowed_statuses: + bad_statuses.append((url, status)) + if all(count == 1 for count in counts.values()) and not bad_statuses: + return + time.sleep(1) + raise AssertionError( + f"timed out waiting for one queued/started/sealed snapshot per URL, got counts={counts}, bad_statuses={bad_statuses}", + ) + + +def malicious_add_inputs(tmp_path: Path, *, safe_url: str) -> tuple[list[str], Path]: + other_crawl_source = tmp_path / "sources" / "other_crawl_source.txt" + other_crawl_source.parent.mkdir(parents=True, exist_ok=True) + other_crawl_source.write_text("https://example.com/not-owned-by-this-crawl\n", encoding="utf-8") + canary = tmp_path / "archivebox_shell_injection_canary" + return ( + [ + safe_url, + "file:///etc/hosts", + "/etc/hosts", + "../../../../etc/passwd", + f"file://{other_crawl_source}", + str(other_crawl_source), + f"'; touch {canary}; #", + f'" && touch {canary} && echo "', + f"$(touch {canary})", + f"`touch {canary}`", + """ + +]> + + + &localfile;file:///etc/passwd + + +""", + ], + canary, + ) + + +def assert_no_file_or_shell_payload_snapshots(cwd: Path, *, canary: Path) -> None: + with use_archivebox_db(cwd): + snapshots = list(Snapshot.objects.all()) + assert not canary.exists() + assert not [snapshot.url for snapshot in snapshots if str(snapshot.url).startswith("file:")] + for forbidden in ("/etc/hosts", "/etc/passwd", "other_crawl_source", "archivebox_shell_injection_canary"): + assert not [snapshot.url for snapshot in snapshots if forbidden in str(snapshot.url)] + + +def test_add_single_url_records_url_in_crawl(initialized_archive): + """Test that adding a single URL queues a crawl with the submitted URL.""" + env = cli_env(disable_extractors=True) + result = run_archivebox_cmd( + ["add", "--index-only", "--depth=0", 
"https://example.com"], + cwd=initialized_archive, + env=env, + ) + + assert result.returncode == 0 + + with use_archivebox_db(initialized_archive): + crawl = Crawl.objects.get() + root_snapshot = Snapshot.objects.get() + root_input = (root_snapshot.output_dir / "staticfile" / "stdin.txt").read_text(encoding="utf-8") + + assert crawl.urls == "https://example.com" + assert crawl.get_urls_list() == ["https://example.com"] + assert root_snapshot.url == Snapshot.INTERNAL_INPUT_URL + assert root_snapshot.depth == 0 + assert root_input == "https://example.com" + + +@pytest.mark.timeout(360) +def test_add_stdin_import_formats_preserve_metadata_and_crawl_inner_urls(initialized_archive): + """`archivebox add < import-file` should normalize rich import formats before crawling URLs.""" + import_files = write_import_format_files(initialized_archive) + expected_urls = {case["url"] for case in IMPORT_FORMAT_EXPECTATIONS.values()} + port = get_free_port() + env = cli_env(port=port, server=True, **IMPORT_FORMAT_ENV) + + for import_path in import_files.values(): + source_text = import_path.read_text(encoding="utf-8") + result = run_archivebox_cmd( + ["add", "--bg", "--depth=0", "--tag=cli-stdin-import"], + cwd=initialized_archive, + env=env, + stdin=source_text, + timeout=360, + ) + assert result.returncode == 0, result.stderr or result.stdout + with use_archivebox_db(initialized_archive): + crawl = Crawl.objects.order_by("-created_at").first() + assert crawl is not None + root_snapshot = crawl.snapshot_set.get(url=Snapshot.INTERNAL_INPUT_URL) + root_input = (root_snapshot.output_dir / "staticfile" / "stdin.txt").read_text(encoding="utf-8") + assert crawl.urls == source_text + assert root_input == source_text + + try: + start_archivebox_server(initialized_archive, env=env, port=port) + wait_for_import_processing(initialized_archive, expected_urls) + stop_server(initialized_archive) + start_archivebox_server(initialized_archive, env=env, port=port) + 
wait_for_expected_import_snapshots(initialized_archive, expected_urls) + + list_result = run_archivebox_cmd( + ["list", "--json"], + cwd=initialized_archive, + env=env, + timeout=60, + ) + assert list_result.returncode == 0, list_result.stderr or list_result.stdout + for expected_url in expected_urls: + assert expected_url in list_result.stdout + finally: + stop_server(initialized_archive) + + with use_archivebox_db(initialized_archive): + crawls = list(Crawl.objects.order_by("created_at")) + snapshots_by_url = {snapshot.url: snapshot for snapshot in Snapshot.objects.prefetch_related("tags").filter(url__in=expected_urls)} + tags_by_url = {snapshot.url: set(snapshot.tags.values_list("name", flat=True)) for snapshot in snapshots_by_url.values()} + + assert len(crawls) == len(import_files) + assert [crawl.urls for crawl in crawls] == [path.read_text(encoding="utf-8") for path in import_files.values()] + assert all(crawl.tags_str == "cli-stdin-import" for crawl in crawls) + assert all(crawl.status in {Crawl.StatusChoices.STARTED, Crawl.StatusChoices.SEALED} for crawl in crawls) + assert len(snapshots_by_url) == len(expected_urls) + + for import_name, expected in IMPORT_FORMAT_EXPECTATIONS.items(): + snapshot = snapshots_by_url.get(expected["url"]) + assert snapshot is not None, f"{import_name} did not create Snapshot for {expected['url']}" + assert snapshot.status in {Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED, Snapshot.StatusChoices.SEALED} + if expected.get("title"): + assert snapshot.title == expected["title"] + if expected.get("date"): + assert snapshot.bookmarked_at.date().isoformat() == expected["date"] + if expected.get("tags"): + assert expected["tags"] | {"cli-stdin-import"} <= tags_by_url[snapshot.url] + + +@pytest.mark.timeout(240) +def test_add_rejects_file_path_and_shell_injection_payloads(initialized_archive): + """CLI add must not turn user-supplied local paths or shell payloads into snapshots.""" + safe_url = 
"https://example.com/?archivebox-cli-security=1" + inputs, canary = malicious_add_inputs(initialized_archive, safe_url=safe_url) + port = get_free_port() + env = cli_env(port=port, server=True, **IMPORT_FORMAT_ENV) + + result = run_archivebox_cmd( + ["add", "--bg", "--depth=0", "--tag=cli-security"], + cwd=initialized_archive, + env=env, + stdin="\n".join(inputs), + timeout=120, + ) + assert result.returncode == 0, result.stderr or result.stdout + + try: + start_archivebox_server(initialized_archive, env=env, port=port) + wait_for_expected_import_snapshots(initialized_archive, {safe_url}, timeout=120) + finally: + stop_server(initialized_archive) + + assert_no_file_or_shell_payload_snapshots(initialized_archive, canary=canary) + with use_archivebox_db(initialized_archive): + snapshots = list(Snapshot.objects.filter(url=safe_url).values_list("url", "status", "tags__name")) + crawl = Crawl.objects.get() + assert crawl.status in {Crawl.StatusChoices.STARTED, Crawl.StatusChoices.SEALED} + assert len({url for url, _status, _tag in snapshots}) == 1 + assert all( + status in {Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED, Snapshot.StatusChoices.SEALED} + for _url, status, _tag in snapshots + ) + assert "cli-security" in {tag for _url, _status, tag in snapshots} + + +@pytest.mark.timeout(180) +def test_run_rejects_file_url_injected_directly_into_crawl_urls_with_sql(initialized_archive): + """Runner must validate Crawl.urls again when SQL bypasses normal add/create paths.""" + secret_url = "https://example.com/?archivebox-sql-crawl-file-secret=1" + local_source = initialized_archive / "not_owned_by_crawl_urls.txt" + local_source.write_text(f"{secret_url}\n", encoding="utf-8") + file_url = local_source.resolve().as_uri() + env = cli_env(**{**IMPORT_FORMAT_ENV, "PLUGINS": "parse_txt_urls", "SAVE_HEADERS": "False", "SAVE_WGET": "False"}) + + with use_archivebox_db(initialized_archive): + crawl = Crawl.objects.create( + 
urls="https://example.com/?archivebox-sql-crawl-control=1", + max_depth=2, + tags_str="sql-file-url", + status=Crawl.StatusChoices.QUEUED, + retry_at=timezone.now(), + ) + bad_jsonl = json.dumps({"type": "Snapshot", "url": file_url, "depth": 0, "tags": "sql-file-url"}) + with connection.cursor() as cursor: + cursor.execute( + f"UPDATE {Crawl._meta.db_table} SET urls = %s WHERE id = %s", + [bad_jsonl, crawl.id.hex], + ) + + result = run_archivebox_cmd( + ["run", f"--crawl-id={crawl.id}"], + cwd=initialized_archive, + env=env, + timeout=180, + ) + assert result.returncode == 0, result.stderr or result.stdout + + with use_archivebox_db(initialized_archive): + crawl.refresh_from_db() + snapshot_urls = set(Snapshot.objects.values_list("url", flat=True)) + + assert crawl.status in {Crawl.StatusChoices.STARTED, Crawl.StatusChoices.SEALED} + assert file_url not in snapshot_urls + assert secret_url not in snapshot_urls + + +@pytest.mark.timeout(180) +def test_run_rejects_depth_two_file_url_snapshot_injected_directly_with_sql(initialized_archive): + """A queued Snapshot row with a file:// URL must not be allowed to run hooks or recurse.""" + secret_url = "https://example.com/?archivebox-sql-depth2-file-secret=1" + local_source = initialized_archive / "not_owned_by_depth_two_snapshot.txt" + local_source.write_text(f"{secret_url}\n", encoding="utf-8") + file_url = local_source.resolve().as_uri() + env = cli_env(**{**IMPORT_FORMAT_ENV, "PLUGINS": "parse_txt_urls", "SAVE_HEADERS": "False", "SAVE_WGET": "False"}) + + with use_archivebox_db(initialized_archive): + crawl = Crawl.objects.create( + urls="https://example.com/?archivebox-sql-depth2-root=1", + max_depth=2, + tags_str="sql-depth2-file-url", + status=Crawl.StatusChoices.QUEUED, + retry_at=timezone.now(), + ) + root_snapshot = Snapshot.objects.create( + url="https://example.com/?archivebox-sql-depth2-root=1", + crawl=crawl, + depth=0, + status=Snapshot.StatusChoices.SEALED, + retry_at=None, + ) + injected_snapshot = 
Snapshot.objects.create( + url="https://example.com/?archivebox-sql-depth2-placeholder=1", + crawl=crawl, + parent_snapshot=root_snapshot, + depth=2, + status=Snapshot.StatusChoices.QUEUED, + retry_at=timezone.now(), + ) + with connection.cursor() as cursor: + cursor.execute( + f"UPDATE {Snapshot._meta.db_table} SET url = %s WHERE id = %s", + [file_url, injected_snapshot.id.hex], + ) + + result = run_archivebox_cmd( + ["run", f"--snapshot-id={injected_snapshot.id}"], + cwd=initialized_archive, + env=env, + timeout=180, + ) + assert result.returncode == 0, result.stderr or result.stdout + + with use_archivebox_db(initialized_archive): + injected_snapshot.refresh_from_db() + snapshot_urls = set(Snapshot.objects.values_list("url", flat=True)) + file_results = list( + ArchiveResult.objects.filter( + snapshot=injected_snapshot, + ).values_list("plugin", "status"), + ) + + assert injected_snapshot.url == file_url + assert secret_url not in snapshot_urls + assert file_results == [] + + +def test_add_bg_queues_internal_input_root_snapshot(initialized_archive): + """Background add stores submitted input on an internal root snapshot for the runner.""" + env = cli_env(disable_extractors=True) + result = run_archivebox_cmd( + ["add", "--bg", "--depth=0", "https://example.com"], + cwd=initialized_archive, + env=env, + ) + + assert result.returncode == 0 + + with use_archivebox_db(initialized_archive): + crawl = Crawl.objects.get() + root_snapshot = Snapshot.objects.get() + root_input = (root_snapshot.output_dir / "staticfile" / "stdin.txt").read_text(encoding="utf-8") + + assert crawl.status == Crawl.StatusChoices.QUEUED + assert crawl.retry_at is not None + assert crawl.urls == "https://example.com" + assert root_snapshot.url == Snapshot.INTERNAL_INPUT_URL + assert root_snapshot.depth == 0 + assert root_input == "https://example.com" + + +def test_add_index_only_rejected_urls_leave_empty_crawl_for_runner_to_seal(initialized_archive): + """Index-only add only creates the crawl; 
rejected URLs are sealed by the runner.""" + env = cli_env(disable_extractors=True) + result = run_archivebox_cmd( + [ + "add", + "--index-only", + "--depth=0", + "--url-denylist=example.com", + "https://example.com", + ], + cwd=initialized_archive, + env=env, + ) + + assert result.returncode == 0 + + with use_archivebox_db(initialized_archive): + crawl = Crawl.objects.get() + root_snapshot = Snapshot.objects.get() + + assert crawl.status == Crawl.StatusChoices.QUEUED + assert crawl.retry_at is None + assert crawl.urls == "https://example.com" + assert root_snapshot.url == Snapshot.INTERNAL_INPUT_URL + + run_queued_crawls(initialized_archive, env) + + with use_archivebox_db(initialized_archive): + crawl = Crawl.objects.get() + snapshot_urls = set(Snapshot.objects.values_list("url", flat=True)) + + assert crawl.status == Crawl.StatusChoices.SEALED + assert crawl.retry_at is None + assert crawl.urls == "https://example.com" + assert snapshot_urls == {Snapshot.INTERNAL_INPUT_URL} + + +def test_add_index_only_rejects_archivebox_internal_urls(initialized_archive): + """Index-only add must apply the same internal URL guard as snapshot creation.""" + env = cli_env(disable_extractors=True) + internal_urls = [ + "http://archivebox.localhost:9292/admin/", + "http://web.archivebox.localhost:9292/", + "http://api.archivebox.localhost:9292/api/v1/docs", + "http://snap-2fb8e923c58c.archivebox.localhost:9292/index.html", + ] + result = run_archivebox_cmd( + ["add", "--index-only", "--depth=0", *internal_urls], + cwd=initialized_archive, + env={**env, "BASE_URL": "http://archivebox.localhost:9292"}, + ) + + assert result.returncode == 0 + + with use_archivebox_db(initialized_archive): + crawl = Crawl.objects.get() + snapshot_urls = set(Snapshot.objects.values_list("url", flat=True)) + + assert crawl.urls == "\n".join(internal_urls) + assert crawl.status == Crawl.StatusChoices.QUEUED + assert crawl.retry_at is None + assert snapshot_urls == {Snapshot.INTERNAL_INPUT_URL} + + +def 
test_add_creates_crawl_record(initialized_archive): + """Test that add command creates a Crawl record in the database.""" + env = cli_env(disable_extractors=True) + run_archivebox_cmd( + ["add", "--index-only", "--depth=0", "https://example.com"], + cwd=initialized_archive, + env=env, + ) + + with use_archivebox_db(initialized_archive): + crawl_count = Crawl.objects.count() + + assert crawl_count == 1 + + +def test_add_creates_internal_input_file(initialized_archive): + """Test that add stores submitted text under the root snapshot staticfile output.""" + env = cli_env(disable_extractors=True) + run_archivebox_cmd( + ["add", "--index-only", "--depth=0", "https://example.com"], + cwd=initialized_archive, + env=env, + ) + + with use_archivebox_db(initialized_archive): + root_snapshot = Snapshot.objects.get() + source_content = (root_snapshot.output_dir / "staticfile" / "stdin.txt").read_text(encoding="utf-8") + assert source_content == "https://example.com" + + +def test_add_multiple_urls_single_command(initialized_archive): + """Test adding multiple URLs in a single command records one crawl.""" + env = cli_env(disable_extractors=True) + result = run_archivebox_cmd( + ["add", "--index-only", "--depth=0", "https://example.com", "https://example.org"], + cwd=initialized_archive, + env=env, + ) + + assert result.returncode == 0 + + with use_archivebox_db(initialized_archive): + crawl = Crawl.objects.get() + root_snapshot = Snapshot.objects.get() + root_input = (root_snapshot.output_dir / "staticfile" / "stdin.txt").read_text(encoding="utf-8") + + assert crawl.urls == "https://example.com\nhttps://example.org" + assert root_input == "https://example.com\nhttps://example.org" + + +def test_add_rejects_file_path_argument(initialized_archive): + """Local files must be piped through stdin, not passed as archiveable path arguments.""" + env = cli_env(disable_extractors=True) + urls_file = initialized_archive / "urls.txt" + 
urls_file.write_text("https://example.com\nhttps://example.org\n") + + result = run_archivebox_cmd( + ["add", "--index-only", "--depth=0", str(urls_file)], + cwd=initialized_archive, + env=env, + ) + + assert result.returncode != 0 + + with use_archivebox_db(initialized_archive): + assert Crawl.objects.count() == 0 + assert Snapshot.objects.count() == 0 + + assert "No URLs provided" in (result.stderr or result.stdout) + + +def test_add_with_depth_0_flag(initialized_archive): + """Test that --depth=0 flag is accepted and works.""" + env = cli_env(disable_extractors=True) + result = run_archivebox_cmd( + ["add", "--index-only", "--depth=0", "https://example.com"], + cwd=initialized_archive, + env=env, + ) + + assert result.returncode == 0 + assert "unrecognized arguments: --depth" not in result.stderr + + +def test_add_with_depth_1_flag(initialized_archive): + """Test that --depth=1 flag is accepted.""" + env = cli_env(disable_extractors=True) + result = run_archivebox_cmd( + ["add", "--index-only", "--depth=1", "https://example.com"], + cwd=initialized_archive, + env=env, + ) + + assert result.returncode == 0 + assert "unrecognized arguments: --depth" not in result.stderr + + +def test_add_rejects_invalid_depth_values(initialized_archive): + """Test that add rejects depth values outside the supported range.""" + env = cli_env(disable_extractors=True) + + for depth in ("5", "-1"): + result = run_archivebox_cmd( + ["add", "--index-only", f"--depth={depth}", "https://example.com"], + cwd=initialized_archive, + env=env, + ) + stderr = result.stderr.lower() + assert result.returncode != 0 + assert "invalid" in stderr or "not one of" in stderr + + +def test_add_with_tags(initialized_archive): + """Test adding URL with tags stores tags_str in crawl. + + With --index-only, Tag objects are not created until archiving happens. + Tags are stored as a string in the Crawl.tags_str field. 
+ """ + env = cli_env(disable_extractors=True) + run_archivebox_cmd( + ["add", "--index-only", "--depth=0", "--tag=test,example", "https://example.com"], + cwd=initialized_archive, + env=env, + ) + + with use_archivebox_db(initialized_archive): + tags_str = Crawl.objects.values_list("tags_str", flat=True).get() + + # Tags are stored as a comma-separated string in crawl + assert "test" in tags_str or "example" in tags_str + + +def test_add_records_selected_persona_on_crawl(initialized_archive): + """Test add persists the selected persona so browser config derives from it later.""" + env = cli_env(disable_extractors=True) + result = run_archivebox_cmd( + ["add", "--index-only", "--depth=0", "--persona=Default", "https://example.com"], + cwd=initialized_archive, + env=env, + ) + + assert result.returncode == 0 + + with use_archivebox_db(initialized_archive): + crawl = Crawl.objects.get() + + assert crawl.persona_id + assert "ACTIVE_PERSONA" not in crawl.config + assert (initialized_archive / "personas" / "Default" / "chrome_profile").is_dir() + + +def test_add_records_url_filter_overrides_on_crawl(initialized_archive): + env = cli_env(disable_extractors=True) + result = run_archivebox_cmd( + [ + "add", + "--index-only", + "--depth=0", + "--domain-allowlist=example.com,*.example.com", + "--domain-denylist=static.example.com", + "https://example.com", + ], + cwd=initialized_archive, + env=env, + ) + + assert result.returncode == 0 + + with use_archivebox_db(initialized_archive): + crawl = Crawl.objects.get() + + assert crawl.config["URL_ALLOWLIST"] == "example.com,*.example.com" + assert crawl.config["URL_DENYLIST"] == "static.example.com" + assert not (initialized_archive / "personas" / "Default" / "chrome_extensions").exists() + + +def test_add_duplicate_url_creates_separate_crawls(initialized_archive): + """Test that adding the same URL twice creates separate crawls. + + Each 'add' command creates a new Crawl. Multiple crawls can archive the same URL. 
+ This allows re-archiving URLs at different times. + """ + + env = cli_env(disable_extractors=True) + # Add URL first time + run_archivebox_cmd( + ["add", "--index-only", "--depth=0", "https://example.com"], + cwd=initialized_archive, + env=env, + ) + + # Add same URL second time with --update to opt out of ONLY_NEW. + run_archivebox_cmd( + ["add", "--index-only", "--update", "--depth=0", "https://example.com"], + cwd=initialized_archive, + env=env, + ) + + with use_archivebox_db(initialized_archive): + crawl_count = Crawl.objects.count() + root_inputs = [ + snapshot.output_dir.joinpath("staticfile", "stdin.txt").read_text(encoding="utf-8") + for snapshot in Snapshot.objects.order_by("created_at") + ] + + # Each add creates a new crawl with its own queued work. + assert crawl_count == 2 + assert root_inputs == ["https://example.com", "https://example.com"] + + +def test_add_with_overwrite_flag(initialized_archive): + """Test that --overwrite flag forces re-archiving.""" + env = cli_env(disable_extractors=True) + + # Add URL first time + run_archivebox_cmd( + ["add", "--index-only", "--depth=0", "https://example.com"], + cwd=initialized_archive, + env=env, + ) + + # Add with overwrite + result = run_archivebox_cmd( + ["add", "--index-only", "--overwrite", "https://example.com"], + cwd=initialized_archive, + env=env, + ) + + assert result.returncode == 0 + assert "unrecognized arguments: --overwrite" not in result.stderr + + +def test_snapshot_create_creates_current_output_directory(initialized_archive): + """Test the user-facing snapshot creation path creates an output directory.""" + env = cli_env(disable_extractors=True) + run_archivebox_cmd( + ["snapshot", "create", "https://example.com"], + cwd=initialized_archive, + env=env, + check=True, + ) + + with use_archivebox_db(initialized_archive): + snapshot_id = str(Snapshot.objects.values_list("id", flat=True).get()) + + snapshot_dir = find_snapshot_dir(initialized_archive, snapshot_id) + assert snapshot_dir is not 
None, f"Snapshot output directory not found for {snapshot_id}" + assert snapshot_dir.is_dir() + + +def test_add_help_shows_depth_and_tag_options(initialized_archive): + """Test that add --help documents the main filter and crawl options.""" + + result = run_archivebox_cmd( + ["add", "--help"], + ) + + assert result.returncode == 0 + assert "--depth" in result.stdout + assert "--max-urls" in result.stdout + assert "--crawl-max-size" in result.stdout + assert "--crawl-timeout" in result.stdout + assert "--snapshot-max-size" in result.stdout + assert "--tag" in result.stdout + + +def test_add_records_max_url_and_size_limits_on_crawl(initialized_archive): + env = cli_env(disable_extractors=True) + result = run_archivebox_cmd( + [ + "add", + "--index-only", + "--depth=1", + "--max-urls=3", + "--crawl-max-size=45mb", + "--crawl-timeout=120", + "--snapshot-max-size=5mb", + "https://example.com", + ], + cwd=initialized_archive, + env=env, + ) + + assert result.returncode == 0 + + columns = {field.name for field in Crawl._meta.local_fields} + with use_archivebox_db(initialized_archive): + config = Crawl.objects.values_list("config", flat=True).get() or {} + + assert {"max_urls", "crawl_max_size", "crawl_timeout", "snapshot_max_size"}.isdisjoint(columns) + assert config["CRAWL_MAX_URLS"] == 3 + assert config["CRAWL_MAX_SIZE"] == 45 * 1024 * 1024 + assert config["CRAWL_TIMEOUT"] == 120 + assert config["SNAPSHOT_MAX_SIZE"] == 5 * 1024 * 1024 + + +def test_add_without_args_shows_usage(initialized_archive): + """Test that add without URLs fails with a usage hint instead of crashing.""" + + result = run_archivebox_cmd( + ["add"], + ) + + combined = result.stdout + result.stderr + assert result.returncode != 0 + assert "usage" in combined.lower() or "url" in combined.lower() + + +def test_add_index_only_queues_crawl_without_starting_runner(initialized_archive): + """Test that --index-only creates only a queued crawl and returns fast.""" + env = cli_env(disable_extractors=True) + 
result = run_archivebox_cmd( + ["add", "--index-only", "--depth=0", "https://example.com"], + cwd=initialized_archive, + env=env, + timeout=30, # Should be fast + ) + + assert result.returncode == 0 + + with use_archivebox_db(initialized_archive): + crawl = Crawl.objects.get() + root_snapshot = Snapshot.objects.get() + + assert crawl.status == Crawl.StatusChoices.QUEUED + assert crawl.retry_at is None + assert crawl.urls == "https://example.com" + assert root_snapshot.url == Snapshot.INTERNAL_INPUT_URL + + +def test_add_index_only_creates_only_internal_root_snapshot(initialized_archive): + """Test that index-only add creates the input root but not parsed child snapshots.""" + env = cli_env(disable_extractors=True) + run_archivebox_cmd( + ["add", "--index-only", "--depth=0", "https://example.com"], + cwd=initialized_archive, + env=env, + ) + + with use_archivebox_db(initialized_archive): + crawl = Crawl.objects.get() + root_snapshot = Snapshot.objects.get() + + assert crawl.urls == "https://example.com" + assert root_snapshot.url == Snapshot.INTERNAL_INPUT_URL + + +def test_snapshot_create_sets_snapshot_timestamp(initialized_archive): + """Test the user-facing snapshot creation path sets a timestamp.""" + env = cli_env(disable_extractors=True) + run_archivebox_cmd( + ["snapshot", "create", "https://example.com"], + cwd=initialized_archive, + env=env, + check=True, + ) + + with use_archivebox_db(initialized_archive): + timestamp = Snapshot.objects.values_list("timestamp", flat=True).get() + + assert timestamp is not None + assert len(str(timestamp)) > 0 + + +@pytest.mark.timeout(180) +def test_cli_add_real_urls_with_options_writes_inspectable_outputs(initialized_archive): + + wget_urls = [ + "https://example.com", + "https://pirate.github.io/stress-tests/challenge.html", + ] + chrome_url = "https://example.com/?archivebox-chrome-flow=1" + env = os.environ.copy() + env.pop("CHROME_BINARY", None) + env.update( + { + "USE_COLOR": "false", + "SHOW_PROGRESS": "false", + 
"TIMEOUT": "60", + "SAVE_WGET": "true", + "SAVE_HEADERS": "false", + "SAVE_TITLE": "false", + "SAVE_READABILITY": "false", + "SAVE_SINGLEFILE": "false", + "SAVE_MERCURY": "false", + "SAVE_SCREENSHOT": "false", + "SAVE_PDF": "false", + "SAVE_DOM": "false", + "SAVE_ARCHIVEDOTORG": "false", + "SAVE_GIT": "false", + "SAVE_YTDLP": "false", + "SAVE_FAVICON": "false", + }, + ) + _cmd_result = run_archivebox_cmd( + [ + "add", + "--depth=0", + "--max-urls=2", + "--crawl-max-size=10mb", + "--tag=real-flow,challenge", + "--parser=url_list", + "--plugins=wget", + *wget_urls, + ], + cwd=initialized_archive, + env=env, + timeout=180, + ) + stdout, stderr, returncode = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + assert returncode == 0, stderr or stdout + + chrome_env = env | { + "SAVE_WGET": "false", + "SAVE_HEADERS": "true", + "SAVE_TITLE": "true", + "CHROME_HEADLESS": "true", + "CHROME_SANDBOX": "false", + "CHROME_ISOLATION": "snapshot", + } + system_browser = _find_system_browser() + if system_browser: + chrome_env["CHROME_BINARY"] = str(system_browser) + _cmd_result = run_archivebox_cmd( + ["install", "chrome"], + cwd=initialized_archive, + env=chrome_env, + timeout=600, + ) + install_stdout, install_stderr, install_returncode = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + assert install_returncode == 0, install_stderr or install_stdout + _cmd_result = run_archivebox_cmd( + [ + "add", + "--depth=0", + "--max-urls=1", + "--crawl-max-size=10mb", + "--tag=chrome-flow", + "--parser=url_list", + "--plugins=chrome,wget,headers,title", + chrome_url, + ], + cwd=initialized_archive, + env=chrome_env, + timeout=180, + ) + chrome_stdout, chrome_stderr, chrome_returncode = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + assert chrome_returncode == 0, chrome_stderr or chrome_stdout + + _cmd_result = run_archivebox_cmd( + ["list", "--tag=real-flow"], + cwd=initialized_archive, + env=env, + timeout=60, + ) + list_stdout, list_stderr, 
list_returncode = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + assert list_returncode == 0, list_stderr or list_stdout + listed = [json.loads(line) for line in list_stdout.splitlines() if line.strip()] + assert {item["url"] for item in listed} >= set(wget_urls) + + with use_archivebox_db(initialized_archive): + crawl = Crawl.objects.order_by("-created_at").values_list("max_depth", "tags_str", "config").first() + real_flow_crawl = Crawl.objects.filter(tags_str="real-flow,challenge").values_list("max_depth", "tags_str", "config").first() + snapshots = list(Snapshot.objects.order_by("url").values_list("id", "url", "depth", "status", "title")) + archive_results = list( + ArchiveResult.objects.select_related("snapshot") + .order_by("snapshot__url", "plugin") + .values_list("snapshot__url", "plugin", "status", "output_files", "output_size", "output_str"), + ) + processes = list(Process.objects.filter(process_type="hook").values_list("process_type", "status", "exit_code", "pwd", "cmd")) + + assert real_flow_crawl is not None + assert real_flow_crawl[0] == 1 + assert real_flow_crawl[1] == "real-flow,challenge" + real_flow_config = real_flow_crawl[2] or {} + assert real_flow_config["CRAWL_MAX_URLS"] == 2 + assert real_flow_config["CRAWL_MAX_SIZE"] == 10 * 1024 * 1024 + assert real_flow_config.get("SNAPSHOT_MAX_SIZE", 0) == 0 + assert "wget" in real_flow_config["PLUGINS"] + assert crawl is not None + assert crawl[1] == "chrome-flow" + assert "wget,headers,title" in json.dumps(crawl[2] or {}) + + snapshot_urls = {url for _id, url, _depth, _status, _title in snapshots} + assert snapshot_urls >= {*wget_urls, chrome_url} + assert all(depth == (0 if url == Snapshot.INTERNAL_INPUT_URL else 1) for _id, url, depth, _status, _title in snapshots) + + by_url_plugin = {(url, plugin): status for url, plugin, status, _files, _size, _output in archive_results} + assert by_url_plugin[("https://example.com", "wget")] == "succeeded" + assert 
by_url_plugin[("https://pirate.github.io/stress-tests/challenge.html", "wget")] == "succeeded" + assert by_url_plugin[(chrome_url, "headers")] == "succeeded" + assert by_url_plugin[(chrome_url, "title")] == "succeeded" + unexpected_results = [ + (url, plugin, status, output) for url, plugin, status, _files, _size, output in archive_results if status != "succeeded" + ] + assert not unexpected_results + + snapshot_root = initialized_archive / "archive/users/system/snapshots" + html_outputs = [path for path in snapshot_root.rglob("wget/**/*.html") if path.is_file()] + header_outputs = [path for path in snapshot_root.rglob("headers/**/headers.json") if path.is_file() and path.stat().st_size > 0] + title_outputs = [path for path in snapshot_root.rglob("title/title.txt") if path.is_file() and path.stat().st_size > 0] + index_outputs = [path for path in snapshot_root.rglob("index.jsonl") if path.is_file()] + assert html_outputs + assert header_outputs + assert any("example.com" in path.read_text(errors="ignore").lower() for path in header_outputs) + assert title_outputs + assert any("Example Domain" in path.read_text(errors="ignore") for path in title_outputs) + assert len(index_outputs) >= len(wget_urls) + 1 + + combined_html = "\n".join(path.read_text(errors="ignore") for path in html_outputs) + assert "Example Domain" in combined_html + assert "Browser Agent Challenge for AI Browser Drivers" in combined_html + + assert processes + assert any("wget" in (pwd or "") or "wget" in (cmd or "") for _type, _status, _exit, pwd, cmd in processes) + assert any("headers" in (pwd or "") or "headers" in (cmd or "") for _type, _status, _exit, pwd, cmd in processes) + + +@pytest.mark.timeout(180) +def test_cli_recursive_crawl_processes_discovered_html_urls(initialized_archive, recursive_test_site): + + env = os.environ.copy() + env.update( + { + "USE_COLOR": "false", + "SHOW_PROGRESS": "false", + "TIMEOUT": "60", + "SAVE_WGET": "true", + "SAVE_HEADERS": "false", + "SAVE_TITLE": 
"false", + "SAVE_READABILITY": "false", + "SAVE_SINGLEFILE": "false", + "SAVE_MERCURY": "false", + "SAVE_SCREENSHOT": "false", + "SAVE_PDF": "false", + "SAVE_DOM": "false", + "SAVE_ARCHIVEDOTORG": "false", + "SAVE_GIT": "false", + "SAVE_YTDLP": "false", + "SAVE_FAVICON": "false", + "PARSE_HTML_URLS_ENABLED": "true", + "PARSE_DOM_OUTLINKS_ENABLED": "false", + "URL_ALLOWLIST": r"127\.0\.0\.1[:/].*", + }, + ) + root_url = recursive_test_site["root_url"] + child_url = recursive_test_site["child_urls"][0] + + _cmd_result = run_archivebox_cmd( + [ + "add", + "--depth=2", + "--max-urls=2", + "--crawl-max-size=50mb", + "--tag=recursive-flow", + "--parser=url_list", + "--plugins=wget,parse_html_urls", + root_url, + ], + cwd=initialized_archive, + env=env, + timeout=180, + ) + stdout, stderr, returncode = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + assert returncode == 0, stderr or stdout + + with use_archivebox_db(initialized_archive): + crawl = Crawl.objects.order_by("-created_at").values_list("max_depth", "tags_str", "config").first() + snapshots = list(Snapshot.objects.order_by("depth", "url").values_list("url", "depth", "status")) + archive_results = list( + ArchiveResult.objects.select_related("snapshot") + .order_by("snapshot__depth", "snapshot__url", "plugin") + .values_list("snapshot__url", "plugin", "status", "output_files"), + ) + + assert crawl[0] == 3 + assert crawl[1] == "recursive-flow" + crawl_config = crawl[2] or {} + assert crawl_config["CRAWL_MAX_URLS"] == 2 + assert crawl_config["CRAWL_MAX_SIZE"] == 50 * 1024 * 1024 + assert crawl_config.get("SNAPSHOT_MAX_SIZE", 0) == 0 + assert (Snapshot.INTERNAL_INPUT_URL, 0, "sealed") in snapshots + assert (root_url, 1, "sealed") in snapshots + assert any(url == child_url and depth == 2 and status == "sealed" for url, depth, status in snapshots) + + by_url_plugin = {(url, plugin): status for url, plugin, status, _files in archive_results} + assert by_url_plugin[(root_url, "wget")] == "succeeded" + 
assert by_url_plugin[(root_url, "parse_html_urls")] == "succeeded" + assert by_url_plugin[(child_url, "wget")] == "succeeded" + + urls_outputs = list((initialized_archive / "archive/users/system/snapshots").rglob("parse_html_urls/urls.jsonl")) + assert urls_outputs + assert any(child_url in path.read_text() for path in urls_outputs) diff --git a/archivebox/tests/test_cli_archiveresult.py b/archivebox/tests/test_cli_archiveresult.py new file mode 100644 index 0000000000..681a8a1fda --- /dev/null +++ b/archivebox/tests/test_cli_archiveresult.py @@ -0,0 +1,466 @@ +""" +Tests for archivebox archiveresult CLI command. + +Tests cover: +- archiveresult create (from Snapshot JSONL, with --plugin, pass-through) +- archiveresult list (with filters) +- archiveresult update +- archiveresult delete +""" + +import json + +from archivebox.tests.conftest import ( + run_archivebox_cmd, + parse_jsonl_output, + create_test_url, +) + +PROJECTOR_TEST_ENV = { + "PLUGINS": "favicon", + "SAVE_FAVICON": "True", + "USE_COLOR": "False", + "SHOW_PROGRESS": "False", +} + + +class TestArchiveResultCreate: + """Tests for `archivebox archiveresult create`.""" + + def test_create_from_snapshot_jsonl(self, initialized_archive): + """Create archive results from Snapshot JSONL input.""" + url = create_test_url() + + # Create a snapshot first + _cmd_result = run_archivebox_cmd( + ["snapshot", "create", url], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout1, _, _ = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + snapshot = parse_jsonl_output(stdout1)[0] + + # Pipe snapshot to archiveresult create + _cmd_result = run_archivebox_cmd( + ["archiveresult", "create", "--plugin=title"], + stdin=json.dumps(snapshot), + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout2, stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0, f"Command failed: {stderr}" + + records = 
parse_jsonl_output(stdout2) + # Should have the Snapshot passed through and an ArchiveResult request emitted + types = [r.get("type") for r in records] + assert "Snapshot" in types + assert "ArchiveResult" in types + + ar = next(r for r in records if r["type"] == "ArchiveResult") + assert ar["plugin"] == "title" + assert "id" not in ar + + def test_create_with_specific_plugin(self, initialized_archive): + """Create archive result for specific plugin.""" + url = create_test_url() + _cmd_result = run_archivebox_cmd( + ["snapshot", "create", url], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout1, _, _ = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + snapshot = parse_jsonl_output(stdout1)[0] + + _cmd_result = run_archivebox_cmd( + ["archiveresult", "create", "--plugin=screenshot"], + stdin=json.dumps(snapshot), + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout2, _stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0 + records = parse_jsonl_output(stdout2) + ar_records = [r for r in records if r.get("type") == "ArchiveResult"] + assert len(ar_records) >= 1 + assert ar_records[0]["plugin"] == "screenshot" + + def test_create_pass_through_crawl(self, initialized_archive): + """Pass-through Crawl records unchanged.""" + url = create_test_url() + + # Create crawl and snapshot + _cmd_result = run_archivebox_cmd(["crawl", "create", url], cwd=initialized_archive, default_cli_env=True, disable_extractors=True) + stdout1, _, _ = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + crawl = parse_jsonl_output(stdout1)[0] + + _cmd_result = run_archivebox_cmd( + ["snapshot", "create"], + stdin=json.dumps(crawl), + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout2, _, _ = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + # Now pipe all to archiveresult create + 
_cmd_result = run_archivebox_cmd( + ["archiveresult", "create", "--plugin=title"], + stdin=stdout2, + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout3, _stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0 + records = parse_jsonl_output(stdout3) + + types = [r.get("type") for r in records] + assert "Crawl" in types + assert "Snapshot" in types + assert "ArchiveResult" in types + + def test_create_pass_through_only_when_no_snapshots(self, initialized_archive): + """Only pass-through records but no new snapshots returns success.""" + crawl_record = {"type": "Crawl", "id": "fake-id", "urls": "https://example.com"} + + _cmd_result = run_archivebox_cmd( + ["archiveresult", "create"], + stdin=json.dumps(crawl_record), + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + _stdout, stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0 + assert "Passed through" in stderr + + +class TestArchiveResultList: + """Tests for `archivebox archiveresult list`.""" + + def test_list_empty(self, initialized_archive): + """List with no archive results returns empty.""" + _cmd_result = run_archivebox_cmd( + ["archiveresult", "list"], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + _stdout, stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0 + assert "Listed 0 archive results" in stderr + + def test_list_filter_by_status(self, initialized_archive): + """Filter archive results by status.""" + # Create snapshot and materialize an archive result via the runner + url = create_test_url() + _cmd_result = run_archivebox_cmd( + ["snapshot", "create", url], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout1, _, _ = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + snapshot = 
parse_jsonl_output(stdout1)[0] + _cmd_result = run_archivebox_cmd( + ["archiveresult", "create", "--plugin=favicon"], + stdin=json.dumps(snapshot), + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout2, _, _ = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + run_archivebox_cmd( + ["run"], + stdin=stdout2, + cwd=initialized_archive, + timeout=120, + env=PROJECTOR_TEST_ENV, + default_cli_env=True, + disable_extractors=True, + ) + _cmd_result = run_archivebox_cmd( + ["archiveresult", "list", "--plugin=favicon"], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + created = parse_jsonl_output(_cmd_result.stdout)[0] + run_archivebox_cmd( + ["archiveresult", "update", "--status=queued"], + stdin=json.dumps(created), + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + + _cmd_result = run_archivebox_cmd( + ["archiveresult", "list", "--status=queued"], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout, _stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0 + records = parse_jsonl_output(stdout) + for r in records: + assert r["status"] == "queued" + + def test_list_filter_by_plugin(self, initialized_archive): + """Filter archive results by plugin.""" + url = create_test_url() + _cmd_result = run_archivebox_cmd( + ["snapshot", "create", url], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout1, _, _ = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + snapshot = parse_jsonl_output(stdout1)[0] + _cmd_result = run_archivebox_cmd( + ["archiveresult", "create", "--plugin=favicon"], + stdin=json.dumps(snapshot), + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout2, _, _ = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + run_archivebox_cmd( + ["run"], + stdin=stdout2, + 
cwd=initialized_archive, + timeout=120, + env=PROJECTOR_TEST_ENV, + default_cli_env=True, + disable_extractors=True, + ) + + _cmd_result = run_archivebox_cmd( + ["archiveresult", "list", "--plugin=favicon"], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout, _stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0 + records = parse_jsonl_output(stdout) + for r in records: + assert r["plugin"] == "favicon" + + def test_list_with_limit(self, initialized_archive): + """Limit number of results.""" + # Create multiple archive results + for _ in range(3): + url = create_test_url() + _cmd_result = run_archivebox_cmd( + ["snapshot", "create", url], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout1, _, _ = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + snapshot = parse_jsonl_output(stdout1)[0] + _cmd_result = run_archivebox_cmd( + ["archiveresult", "create", "--plugin=favicon"], + stdin=json.dumps(snapshot), + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout2, _, _ = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + run_archivebox_cmd( + ["run"], + stdin=stdout2, + cwd=initialized_archive, + timeout=120, + env=PROJECTOR_TEST_ENV, + default_cli_env=True, + disable_extractors=True, + ) + + _cmd_result = run_archivebox_cmd( + ["archiveresult", "list", "--limit=2"], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout, _stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0 + records = parse_jsonl_output(stdout) + assert len(records) == 2 + + +class TestArchiveResultUpdate: + """Tests for `archivebox archiveresult update`.""" + + def test_update_status(self, initialized_archive): + """Update archive result status.""" + url = create_test_url() + _cmd_result = run_archivebox_cmd( + ["snapshot", "create", 
url], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout1, _, _ = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + snapshot = parse_jsonl_output(stdout1)[0] + + _cmd_result = run_archivebox_cmd( + ["archiveresult", "create", "--plugin=favicon"], + stdin=json.dumps(snapshot), + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout2, _, _ = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + _cmd_result = run_archivebox_cmd( + ["run"], + stdin=stdout2, + cwd=initialized_archive, + timeout=120, + env=PROJECTOR_TEST_ENV, + default_cli_env=True, + disable_extractors=True, + ) + _stdout_run, _, _ = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + _cmd_result = run_archivebox_cmd( + ["archiveresult", "list", "--plugin=favicon"], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout_list, _, _ = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + ar = parse_jsonl_output(stdout_list)[0] + + _cmd_result = run_archivebox_cmd( + ["archiveresult", "update", "--status=failed"], + stdin=json.dumps(ar), + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout3, stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0 + assert "Updated 1 archive results" in stderr + + records = parse_jsonl_output(stdout3) + assert records[0]["status"] == "failed" + + +class TestArchiveResultDelete: + """Tests for `archivebox archiveresult delete`.""" + + def test_delete_requires_yes(self, initialized_archive): + """Delete requires --yes flag.""" + url = create_test_url() + _cmd_result = run_archivebox_cmd( + ["snapshot", "create", url], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout1, _, _ = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + snapshot = parse_jsonl_output(stdout1)[0] + + _cmd_result = 
run_archivebox_cmd( + ["archiveresult", "create", "--plugin=favicon"], + stdin=json.dumps(snapshot), + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout2, _, _ = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + _cmd_result = run_archivebox_cmd( + ["run"], + stdin=stdout2, + cwd=initialized_archive, + timeout=120, + env=PROJECTOR_TEST_ENV, + default_cli_env=True, + disable_extractors=True, + ) + _stdout_run, _, _ = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + _cmd_result = run_archivebox_cmd( + ["archiveresult", "list", "--plugin=favicon"], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout_list, _, _ = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + ar = parse_jsonl_output(stdout_list)[0] + + _cmd_result = run_archivebox_cmd( + ["archiveresult", "delete"], + stdin=json.dumps(ar), + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + _stdout, stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 1 + assert "--yes" in stderr + + def test_delete_with_yes(self, initialized_archive): + """Delete with --yes flag works.""" + url = create_test_url() + _cmd_result = run_archivebox_cmd( + ["snapshot", "create", url], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout1, _, _ = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + snapshot = parse_jsonl_output(stdout1)[0] + + _cmd_result = run_archivebox_cmd( + ["archiveresult", "create", "--plugin=favicon"], + stdin=json.dumps(snapshot), + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout2, _, _ = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + _cmd_result = run_archivebox_cmd( + ["run"], + stdin=stdout2, + cwd=initialized_archive, + timeout=120, + env=PROJECTOR_TEST_ENV, + default_cli_env=True, + 
disable_extractors=True, + ) + _stdout_run, _, _ = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + _cmd_result = run_archivebox_cmd( + ["archiveresult", "list", "--plugin=favicon"], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout_list, _, _ = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + ar = parse_jsonl_output(stdout_list)[0] + + _cmd_result = run_archivebox_cmd( + ["archiveresult", "delete", "--yes"], + stdin=json.dumps(ar), + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + _stdout, stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0 + assert "Deleted 1 archive results" in stderr diff --git a/archivebox/tests/test_cli_binary.py b/archivebox/tests/test_cli_binary.py new file mode 100644 index 0000000000..480728ea43 --- /dev/null +++ b/archivebox/tests/test_cli_binary.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 +""" +Tests for archivebox binary command. + +TODO: expand beyond command discovery into create/list/update/delete behavior. +""" + +from archivebox.tests.conftest import run_archivebox_cmd + + +def test_binary_help_runs_successfully(tmp_path): + """The binary command should be registered and expose help.""" + + result = run_archivebox_cmd(["binary", "--help"]) + + assert result.returncode == 0 + assert "binary" in result.stdout.lower() + assert "list" in result.stdout diff --git a/archivebox/tests/test_cli_config.py b/archivebox/tests/test_cli_config.py new file mode 100644 index 0000000000..ba017bbac3 --- /dev/null +++ b/archivebox/tests/test_cli_config.py @@ -0,0 +1,250 @@ +#!/usr/bin/env python3 +""" +Comprehensive tests for archivebox config command. +Verify config reads/writes ArchiveBox.conf file correctly. 
+""" + +from archivebox.tests.conftest import run_archivebox_cmd + + +def test_config_displays_all_config(initialized_archive): + """Test that config without args displays all configuration.""" + result = run_archivebox_cmd(["config"]) + + assert result.returncode == 0 + output = result.stdout + # Should show config sections + assert len(output) > 100 + # Should show at least some standard config keys + assert "TIMEOUT" in output or "OUTPUT_PERMISSIONS" in output + + +def test_config_shows_derived_collection_paths_but_not_runtime_dirs(initialized_archive): + """The CLI should expose collection paths, not per-crawl/per-snapshot runtime dirs.""" + run_archivebox_cmd(["init"], cwd=initialized_archive, check=True) + + result = run_archivebox_cmd(["config"], cwd=initialized_archive) + + assert result.returncode == 0, result.stderr + unwrapped_output = result.stdout.replace("\n", "") + assert "DATA_DIR" in result.stdout + assert result.stdout.count("\nDATA_DIR =") == 1 + assert str(initialized_archive) in unwrapped_output + assert "PERSONAS_DIR" in result.stdout + assert result.stdout.count("\nPERSONAS_DIR =") == 1 + assert "SNAP_DIR" not in result.stdout + assert "CRAWL_DIR" not in result.stdout + + +def test_config_get_derived_path_but_rejects_runtime_dir(initialized_archive): + run_archivebox_cmd(["init"], cwd=initialized_archive, check=True) + + data_dir = run_archivebox_cmd(["config", "--get", "DATA_DIR"], cwd=initialized_archive) + snap_dir = run_archivebox_cmd(["config", "--get", "SNAP_DIR"], cwd=initialized_archive) + + assert data_dir.returncode == 0, data_dir.stderr + assert "DATA_DIR" in data_dir.stdout + assert str(initialized_archive) in data_dir.stdout.replace("\n", "") + assert snap_dir.returncode != 0 + assert "SNAP_DIR =" not in snap_dir.stdout + + +def test_config_set_rejects_readonly_and_runtime_dirs(initialized_archive): + run_archivebox_cmd(["init"], cwd=initialized_archive, check=True) + + data_dir = run_archivebox_cmd( + ["config", "--set", 
f"DATA_DIR={initialized_archive / 'other'}"], + cwd=initialized_archive, + ) + crawl_dir = run_archivebox_cmd( + ["config", "--set", f"CRAWL_DIR={initialized_archive / 'crawl'}"], + cwd=initialized_archive, + ) + + assert data_dir.returncode != 0 + assert crawl_dir.returncode != 0 + content = (initialized_archive / "ArchiveBox.conf").read_text() + assert "DATA_DIR" not in content + assert "CRAWL_DIR" not in content + + +def test_config_get_specific_key(initialized_archive): + """Test that config --get KEY retrieves specific value.""" + result = run_archivebox_cmd( + ["config", "--get", "TIMEOUT"], + ) + + assert result.returncode == 0 + assert "TIMEOUT" in result.stdout + + +def test_config_set_writes_to_file(initialized_archive): + """Test that config --set KEY=VALUE writes to ArchiveBox.conf.""" + + result = run_archivebox_cmd( + ["config", "--set", "TIMEOUT=120"], + ) + + assert result.returncode == 0 + + # Verify config file was updated + config_file = initialized_archive / "ArchiveBox.conf" + assert config_file.exists() + + content = config_file.read_text() + assert "TIMEOUT" in content or "120" in content + + +def test_config_set_and_get_roundtrip(initialized_archive): + """Test that set value can be retrieved with get.""" + + # Set a unique value + run_archivebox_cmd( + ["config", "--set", "TIMEOUT=987"], + ) + + # Get the value back + result = run_archivebox_cmd( + ["config", "--get", "TIMEOUT"], + ) + + assert "987" in result.stdout + + +def test_config_set_multiple_values(initialized_archive): + """Test setting multiple config values at once.""" + + result = run_archivebox_cmd( + ["config", "--set", "TIMEOUT=111", "YTDLP_TIMEOUT=222"], + ) + + assert result.returncode == 0 + + # Verify both were written + config_file = initialized_archive / "ArchiveBox.conf" + content = config_file.read_text() + assert "111" in content + assert "222" in content + + +def test_config_set_invalid_key_fails(initialized_archive): + """Test that setting invalid config key 
fails.""" + + result = run_archivebox_cmd( + ["config", "--set", "TOTALLY_INVALID_KEY_XYZ=value"], + ) + + assert result.returncode != 0 + + +def test_config_set_requires_equals_sign(initialized_archive): + """Test that set requires KEY=VALUE format.""" + + result = run_archivebox_cmd( + ["config", "--set", "TIMEOUT"], + ) + + assert result.returncode != 0 + + +def test_config_search_finds_keys(initialized_archive): + """Test that config --search finds matching keys.""" + + result = run_archivebox_cmd( + ["config", "--search", "TIMEOUT"], + ) + + # Should find timeout-related config + assert "TIMEOUT" in result.stdout + + +def test_config_preserves_existing_values(initialized_archive): + """Test that setting new values preserves existing ones.""" + + # Set first value + run_archivebox_cmd( + ["config", "--set", "TIMEOUT=100"], + ) + + # Set second value + run_archivebox_cmd( + ["config", "--set", "YTDLP_TIMEOUT=200"], + ) + + # Verify both are in config file + config_file = initialized_archive / "ArchiveBox.conf" + content = config_file.read_text() + assert "TIMEOUT" in content + assert "YTDLP_TIMEOUT" in content + + +def test_config_file_is_valid_toml(initialized_archive): + """Test that config file remains valid TOML after set.""" + + run_archivebox_cmd( + ["config", "--set", "TIMEOUT=150"], + ) + + config_file = initialized_archive / "ArchiveBox.conf" + content = config_file.read_text() + + # Basic TOML validation - should have sections and key=value pairs + assert "[" in content or "=" in content + + +def test_config_updates_existing_value(initialized_archive): + """Test that setting same key twice updates the value.""" + + # Set initial value + run_archivebox_cmd( + ["config", "--set", "TIMEOUT=100"], + ) + + # Update to new value + run_archivebox_cmd( + ["config", "--set", "TIMEOUT=200"], + ) + + # Get current value + result = run_archivebox_cmd( + ["config", "--get", "TIMEOUT"], + ) + + # Should show updated value + assert "200" in result.stdout + + +def 
test_config_ignores_legacy_unknown_keys(tmp_path, initialized_archive): + """Old ArchiveBox.conf keys should not prevent startup during upgrades.""" + (tmp_path / "ArchiveBox.conf").write_text( + """ +[ARCHIVING_CONFIG] +MAX_MEDIA_SIZE = "750m" + +[SEARCH_BACKEND_CONFIG] +SEARCH_BACKEND_HOST_NAME = "sonic" +SEARCH_BACKEND_PASSWORD = "SecretPassword" +""", + ) + + result = run_archivebox_cmd( + ["version"], + ) + + assert result.returncode == 0, result.stderr + assert "Extra inputs are not permitted" not in result.stderr + + +class TestConfigCLI: + """Test the CLI interface for config command.""" + + def test_cli_help(self, tmp_path, initialized_archive): + """Test that --help works for config command.""" + + result = run_archivebox_cmd( + ["config", "--help"], + ) + + assert result.returncode == 0 + assert "--get" in result.stdout + assert "--set" in result.stdout diff --git a/archivebox/tests/test_cli_crawl.py b/archivebox/tests/test_cli_crawl.py new file mode 100644 index 0000000000..a4ba957f54 --- /dev/null +++ b/archivebox/tests/test_cli_crawl.py @@ -0,0 +1,472 @@ +""" +Tests for archivebox crawl CLI command. 
+ +Tests cover: +- crawl create (with URLs, from stdin, pass-through) +- crawl list (with filters) +- crawl update +- crawl delete +""" + +import json + +import pytest + +from archivebox.core.models import Snapshot +from archivebox.crawls.models import Crawl +from archivebox.tests.conftest import ( + cli_env, + create_test_url, + parse_jsonl_output, + run_archivebox_cmd, + run_queued_crawls, +) +from archivebox.tests.test_orm_helpers import use_archivebox_db + +pytestmark = pytest.mark.django_db(transaction=True) + + +class TestCrawlCreate: + """Tests for `archivebox crawl create`.""" + + def test_create_from_url_args(self, initialized_archive): + """Create crawl from URL arguments.""" + url = create_test_url() + + _cmd_result = run_archivebox_cmd( + ["crawl", "create", url], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout, stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0, f"Command failed: {stderr}" + assert "Created crawl" in stderr + + # Check JSONL output + records = parse_jsonl_output(stdout) + assert len(records) == 1 + assert records[0]["type"] == "Crawl" + assert url in records[0]["urls"] + + def test_create_from_stdin_urls(self, initialized_archive): + """Create crawl from stdin URLs (one per line).""" + urls = [create_test_url() for _ in range(3)] + stdin = "\n".join(urls) + + _cmd_result = run_archivebox_cmd( + ["crawl", "create"], + stdin=stdin, + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout, stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0, f"Command failed: {stderr}" + + records = parse_jsonl_output(stdout) + assert len(records) == 1 + crawl = records[0] + assert crawl["type"] == "Crawl" + # All URLs should be in the crawl + for url in urls: + assert url in crawl["urls"] + + def test_create_with_depth(self, initialized_archive): + """Create crawl with --depth flag.""" 
+ url = create_test_url() + + _cmd_result = run_archivebox_cmd( + ["crawl", "create", "--depth=2", url], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout, _stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0 + records = parse_jsonl_output(stdout) + assert records[0]["max_depth"] == 2 + + def test_create_with_tag(self, initialized_archive): + """Create crawl with --tag flag.""" + url = create_test_url() + + _cmd_result = run_archivebox_cmd( + ["crawl", "create", "--tag=test-tag", url], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout, _stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0 + records = parse_jsonl_output(stdout) + assert "test-tag" in records[0].get("tags_str", "") + + def test_create_pass_through_other_types(self, initialized_archive): + """Pass-through records of other types unchanged.""" + tag_record = {"type": "Tag", "id": "fake-tag-id", "name": "test"} + url = create_test_url() + stdin = json.dumps(tag_record) + "\n" + json.dumps({"url": url}) + + _cmd_result = run_archivebox_cmd( + ["crawl", "create"], + stdin=stdin, + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout, _stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0 + records = parse_jsonl_output(stdout) + + # Should have both the passed-through Tag and the new Crawl + types = [r.get("type") for r in records] + assert "Tag" in types + assert "Crawl" in types + + def test_create_pass_through_existing_crawl(self, initialized_archive): + """Existing Crawl records (with id) are passed through.""" + # First create a crawl + url = create_test_url() + _cmd_result = run_archivebox_cmd(["crawl", "create", url], cwd=initialized_archive, default_cli_env=True, disable_extractors=True) + stdout1, _, _ = _cmd_result.stdout, _cmd_result.stderr, 
_cmd_result.returncode + crawl = parse_jsonl_output(stdout1)[0] + + # Now pipe it back - should pass through + _cmd_result = run_archivebox_cmd( + ["crawl", "create"], + stdin=json.dumps(crawl), + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout2, _stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0 + records = parse_jsonl_output(stdout2) + assert len(records) == 1 + assert records[0]["id"] == crawl["id"] + + +class TestCrawlList: + """Tests for `archivebox crawl list`.""" + + def test_list_empty(self, initialized_archive): + """List with no crawls returns empty.""" + _cmd_result = run_archivebox_cmd( + ["crawl", "list"], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + _stdout, stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0 + assert "Listed 0 crawls" in stderr + + def test_list_returns_created(self, initialized_archive): + """List returns previously created crawls.""" + url = create_test_url() + run_archivebox_cmd(["crawl", "create", url], cwd=initialized_archive, default_cli_env=True, disable_extractors=True) + + _cmd_result = run_archivebox_cmd( + ["crawl", "list"], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout, _stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0 + records = parse_jsonl_output(stdout) + assert len(records) >= 1 + assert any(url in r.get("urls", "") for r in records) + + def test_list_filter_by_status(self, initialized_archive): + """Filter crawls by status.""" + url = create_test_url() + run_archivebox_cmd(["crawl", "create", url], cwd=initialized_archive, default_cli_env=True, disable_extractors=True) + + _cmd_result = run_archivebox_cmd( + ["crawl", "list", "--status=queued"], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout, _stderr, code 
= _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0 + records = parse_jsonl_output(stdout) + for r in records: + assert r["status"] == "queued" + + def test_list_with_limit(self, initialized_archive): + """Limit number of results.""" + # Create multiple crawls + for _ in range(3): + run_archivebox_cmd( + ["crawl", "create", create_test_url()], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + + _cmd_result = run_archivebox_cmd( + ["crawl", "list", "--limit=2"], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout, _stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0 + records = parse_jsonl_output(stdout) + assert len(records) == 2 + + +class TestCrawlUpdate: + """Tests for `archivebox crawl update`.""" + + def test_update_status(self, initialized_archive): + """Update crawl status.""" + # Create a crawl + url = create_test_url() + _cmd_result = run_archivebox_cmd(["crawl", "create", url], cwd=initialized_archive, default_cli_env=True, disable_extractors=True) + stdout1, _, _ = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + crawl = parse_jsonl_output(stdout1)[0] + + # Update it + _cmd_result = run_archivebox_cmd( + ["crawl", "update", "--status=started"], + stdin=json.dumps(crawl), + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout2, stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0 + assert "Updated 1 crawls" in stderr + + records = parse_jsonl_output(stdout2) + assert records[0]["status"] == "started" + + +class TestCrawlDelete: + """Tests for `archivebox crawl delete`.""" + + def test_delete_requires_yes(self, initialized_archive): + """Delete requires --yes flag.""" + url = create_test_url() + _cmd_result = run_archivebox_cmd(["crawl", "create", url], cwd=initialized_archive, default_cli_env=True, 
disable_extractors=True) + stdout1, _, _ = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + crawl = parse_jsonl_output(stdout1)[0] + + _cmd_result = run_archivebox_cmd( + ["crawl", "delete"], + stdin=json.dumps(crawl), + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + _stdout, stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 1 + assert "--yes" in stderr + + def test_delete_with_yes(self, initialized_archive): + """Delete with --yes flag works.""" + url = create_test_url() + _cmd_result = run_archivebox_cmd(["crawl", "create", url], cwd=initialized_archive, default_cli_env=True, disable_extractors=True) + stdout1, _, _ = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + crawl = parse_jsonl_output(stdout1)[0] + + _cmd_result = run_archivebox_cmd( + ["crawl", "delete", "--yes"], + stdin=json.dumps(crawl), + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + _stdout, stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0 + assert "Deleted 1 crawls" in stderr + + def test_delete_dry_run(self, initialized_archive): + """Dry run shows what would be deleted.""" + url = create_test_url() + _cmd_result = run_archivebox_cmd(["crawl", "create", url], cwd=initialized_archive, default_cli_env=True, disable_extractors=True) + stdout1, _, _ = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + crawl = parse_jsonl_output(stdout1)[0] + + _cmd_result = run_archivebox_cmd( + ["crawl", "delete", "--dry-run"], + stdin=json.dumps(crawl), + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + _stdout, stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0 + assert "Would delete" in stderr + assert "dry run" in stderr.lower() + + +def test_crawl_creates_crawl_object(initialized_archive): + """Test that crawl command 
creates a Crawl object.""" + env = cli_env(disable_extractors=True) + + run_archivebox_cmd( + ["crawl", "create", "https://example.com"], + cwd=initialized_archive, + env=env, + check=True, + ) + + with use_archivebox_db(initialized_archive): + crawl = Crawl.objects.order_by("-created_at").first() + + assert crawl is not None, "Crawl object should be created" + + +def test_crawl_depth_sets_max_depth_in_crawl(initialized_archive): + """Test that --depth option sets max_depth in the Crawl object.""" + env = cli_env(disable_extractors=True) + + run_archivebox_cmd( + ["crawl", "create", "--depth=2", "https://example.com"], + cwd=initialized_archive, + env=env, + check=True, + ) + + with use_archivebox_db(initialized_archive): + crawl = Crawl.objects.order_by("-created_at").first() + + assert crawl is not None + assert crawl.max_depth == 2, "Crawl max_depth should match --depth=2" + + +def test_crawl_creates_snapshot_for_url(initialized_archive): + """Test that crawl creates a Snapshot for the input URL.""" + env = cli_env(disable_extractors=True) + + run_archivebox_cmd( + ["crawl", "create", "https://example.com"], + cwd=initialized_archive, + env=env, + check=True, + ) + run_queued_crawls(initialized_archive, env) + + with use_archivebox_db(initialized_archive): + snapshot = Snapshot.objects.filter(url="https://example.com").first() + + assert snapshot is not None, "Snapshot should be created for input URL" + + +def test_crawl_links_snapshot_to_crawl(initialized_archive): + """Test that Snapshot is linked to Crawl via crawl_id.""" + env = cli_env(disable_extractors=True) + + run_archivebox_cmd( + ["crawl", "create", "https://example.com"], + cwd=initialized_archive, + env=env, + check=True, + ) + run_queued_crawls(initialized_archive, env) + + with use_archivebox_db(initialized_archive): + crawl = Crawl.objects.order_by("-created_at").first() + assert crawl is not None + snapshot = Snapshot.objects.filter(url="https://example.com").first() + + assert snapshot is not 
None + assert snapshot.crawl_id == crawl.id, "Snapshot should be linked to Crawl" + + +def test_crawl_multiple_urls_creates_multiple_snapshots(initialized_archive): + """Test that crawling multiple URLs creates multiple snapshots.""" + env = cli_env(disable_extractors=True) + + run_archivebox_cmd( + [ + "crawl", + "create", + "https://example.com", + "https://iana.org", + ], + cwd=initialized_archive, + env=env, + check=True, + ) + run_queued_crawls(initialized_archive, env) + + with use_archivebox_db(initialized_archive): + urls = list(Snapshot.objects.order_by("url").values_list("url", flat=True)) + + assert "https://example.com" in urls + assert "https://iana.org" in urls + + +def test_crawl_from_file_creates_snapshot(initialized_archive): + """Test that crawl can create snapshots from a file of URLs.""" + env = cli_env(disable_extractors=True) + + # Write URLs to a file + urls_file = initialized_archive / "urls.txt" + urls_file.write_text("https://example.com\n") + + run_archivebox_cmd( + ["crawl", "create", str(urls_file)], + cwd=initialized_archive, + env=env, + check=True, + ) + run_queued_crawls(initialized_archive, env) + + with use_archivebox_db(initialized_archive): + snapshot = Snapshot.objects.first() + + # Should create at least one snapshot (the source file or the URL) + assert snapshot is not None, "Should create at least one snapshot" + + +def test_crawl_persists_input_urls_on_crawl(initialized_archive): + """Test that crawl input URLs are stored on the Crawl record.""" + env = cli_env(disable_extractors=True) + + run_archivebox_cmd( + ["crawl", "create", "https://example.com"], + cwd=initialized_archive, + env=env, + check=True, + ) + + with use_archivebox_db(initialized_archive): + crawl = Crawl.objects.order_by("-created_at").first() + + assert crawl is not None, "Crawl should be created for crawl input" + assert "https://example.com" in crawl.urls, "Crawl should persist input URLs" + + +class TestCrawlCLI: + """Test the CLI interface for crawl 
command.""" + + def test_cli_help(self, tmp_path, initialized_archive): + """Test that --help works for crawl command.""" + + result = run_archivebox_cmd( + ["crawl", "--help"], + ) + + assert result.returncode == 0 + assert "create" in result.stdout diff --git a/archivebox/tests/test_cli_extract.py b/archivebox/tests/test_cli_extract.py new file mode 100644 index 0000000000..33c11a2093 --- /dev/null +++ b/archivebox/tests/test_cli_extract.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python3 +"""Tests for archivebox extract command.""" + +import pytest + +from archivebox.core.models import ArchiveResult, Snapshot +from archivebox.tests.conftest import cli_env, find_snapshot_dir, parse_jsonl_output, run_archivebox_cmd + +from archivebox.tests.test_orm_helpers import use_archivebox_db + +pytestmark = pytest.mark.django_db(transaction=True) + + +def test_extract_runs_on_existing_snapshots(initialized_archive): + """Extract runs a requested plugin for an existing snapshot.""" + env = cli_env(PLUGINS="wget,title") + + create_result = run_archivebox_cmd( + ["snapshot", "create", "https://example.com"], + cwd=initialized_archive, + env=env, + check=True, + ) + snapshot = next(record for record in parse_jsonl_output(create_result.stdout) if record.get("type") == "Snapshot") + snapshot_id = snapshot["id"] + + result = run_archivebox_cmd( + ["extract", "--plugins=wget,title", snapshot_id], + cwd=initialized_archive, + env=env, + timeout=90, + ) + + assert result.returncode == 0, result.stderr or result.stdout + + records = parse_jsonl_output(result.stdout) + result_records = { + record["plugin"]: record + for record in records + if record.get("type") == "ArchiveResult" and record.get("snapshot_id") == snapshot_id and record.get("plugin") in {"wget", "title"} + } + assert set(result_records) == {"wget", "title"}, records + assert result_records["title"]["status"] == ArchiveResult.StatusChoices.SUCCEEDED + assert result_records["title"]["output_str"] == "Example Domain" + assert 
result_records["wget"]["status"] == ArchiveResult.StatusChoices.SUCCEEDED + assert result_records["wget"]["output_str"] == "wget/example.com/index.html" + + with use_archivebox_db(initialized_archive): + archiveresults = {row.plugin: row for row in ArchiveResult.objects.filter(snapshot_id=snapshot_id, plugin__in=("wget", "title"))} + + snapshot_dir = find_snapshot_dir(initialized_archive, snapshot_id) + assert snapshot_dir is not None + title_path = snapshot_dir / "title" / "title.txt" + wget_path = snapshot_dir / "wget" / "example.com" / "index.html" + warc_files = list((snapshot_dir / "wget" / "warc").glob("*.warc.gz")) + assert title_path.is_file() + assert wget_path.is_file() + assert warc_files + assert title_path.read_text(encoding="utf-8").strip() == "Example Domain" + assert "Example Domain" in wget_path.read_text(encoding="utf-8") + assert archiveresults["title"].status == ArchiveResult.StatusChoices.SUCCEEDED + assert archiveresults["title"].output_str == "Example Domain" + assert archiveresults["title"].output_files["title.txt"]["size"] == title_path.stat().st_size + assert archiveresults["wget"].status == ArchiveResult.StatusChoices.SUCCEEDED + assert archiveresults["wget"].output_str == "wget/example.com/index.html" + assert archiveresults["wget"].output_files["example.com/index.html"]["size"] == wget_path.stat().st_size + + +def test_extract_preserves_snapshot_count(initialized_archive): + """Extract queues work without creating duplicate snapshots.""" + env = cli_env(PLUGINS="wget,title") + + create_result = run_archivebox_cmd( + ["snapshot", "create", "https://example.com"], + cwd=initialized_archive, + env=env, + check=True, + ) + snapshot = next(record for record in parse_jsonl_output(create_result.stdout) if record.get("type") == "Snapshot") + + with use_archivebox_db(initialized_archive): + count_before = Snapshot.objects.count() + + result = run_archivebox_cmd( + ["extract", "--plugins=wget,title", snapshot["id"]], + cwd=initialized_archive, + 
env=env, + timeout=90, + ) + assert result.returncode == 0, result.stderr or result.stdout + + with use_archivebox_db(initialized_archive): + count_after = Snapshot.objects.count() + archiveresults = {row.plugin: row for row in ArchiveResult.objects.filter(snapshot_id=snapshot["id"], plugin__in=("wget", "title"))} + + assert count_after == count_before + records = parse_jsonl_output(result.stdout) + result_records = { + record["plugin"]: record + for record in records + if record.get("type") == "ArchiveResult" + and record.get("snapshot_id") == snapshot["id"] + and record.get("plugin") in {"wget", "title"} + } + assert set(result_records) == {"wget", "title"}, records + assert result_records["title"]["status"] == ArchiveResult.StatusChoices.SUCCEEDED + assert result_records["title"]["output_str"] == "Example Domain" + assert result_records["wget"]["status"] == ArchiveResult.StatusChoices.SUCCEEDED + assert result_records["wget"]["output_str"] == "wget/example.com/index.html" + snapshot_dir = find_snapshot_dir(initialized_archive, snapshot["id"]) + assert snapshot_dir is not None + title_path = snapshot_dir / "title" / "title.txt" + wget_path = snapshot_dir / "wget" / "example.com" / "index.html" + warc_files = list((snapshot_dir / "wget" / "warc").glob("*.warc.gz")) + assert title_path.is_file() + assert wget_path.is_file() + assert warc_files + assert title_path.read_text(encoding="utf-8").strip() == "Example Domain" + assert "Example Domain" in wget_path.read_text(encoding="utf-8") + assert archiveresults["title"].status == ArchiveResult.StatusChoices.SUCCEEDED + assert archiveresults["title"].output_str == "Example Domain" + assert archiveresults["title"].output_files["title.txt"]["size"] == title_path.stat().st_size + assert archiveresults["wget"].status == ArchiveResult.StatusChoices.SUCCEEDED + assert archiveresults["wget"].output_str == "wget/example.com/index.html" + assert archiveresults["wget"].output_files["example.com/index.html"]["size"] == 
wget_path.stat().st_size diff --git a/archivebox/tests/test_cli_extract_input.py b/archivebox/tests/test_cli_extract_input.py new file mode 100644 index 0000000000..1ceb4df17c --- /dev/null +++ b/archivebox/tests/test_cli_extract_input.py @@ -0,0 +1,395 @@ +"""Tests for archivebox extract input handling and pipelines.""" + +import subprocess +import json + +import pytest + +from archivebox.core.models import ArchiveResult, Snapshot +from archivebox.tests.conftest import cli_env, find_snapshot_dir, parse_jsonl_output, run_archivebox_cmd + +from archivebox.tests.test_orm_helpers import use_archivebox_db + +pytestmark = pytest.mark.django_db(transaction=True) + + +def create_extract_snapshot(initialized_archive, env, url="https://example.com"): + run_archivebox_cmd( + ["snapshot", "create", url], + cwd=initialized_archive, + env=env, + check=True, + ) + + +def test_extract_runs_on_snapshot_id(initialized_archive): + """Test that extract command accepts a snapshot ID.""" + env = cli_env(PLUGINS="wget,title") + create_extract_snapshot(initialized_archive, env) + + with use_archivebox_db(initialized_archive): + snapshot_id = Snapshot.objects.values_list("id", flat=True).first() + + # Run extract on the snapshot + result = run_archivebox_cmd( + ["extract", "--plugins=wget,title", str(snapshot_id)], + cwd=initialized_archive, + env=env, + timeout=90, + ) + + assert result.returncode == 0, result.stderr or result.stdout + records = parse_jsonl_output(result.stdout) + result_records = { + record["plugin"]: record + for record in records + if record.get("type") == "ArchiveResult" + and record.get("snapshot_id") == str(snapshot_id) + and record.get("plugin") in {"wget", "title"} + } + assert set(result_records) == {"wget", "title"}, records + assert result_records["title"]["status"] == ArchiveResult.StatusChoices.SUCCEEDED + assert result_records["title"]["output_str"] == "Example Domain" + assert result_records["wget"]["status"] == ArchiveResult.StatusChoices.SUCCEEDED + 
assert result_records["wget"]["output_str"] == "wget/example.com/index.html" + with use_archivebox_db(initialized_archive): + archiveresults = {row.plugin: row for row in ArchiveResult.objects.filter(snapshot_id=snapshot_id, plugin__in=("wget", "title"))} + snapshot_dir = find_snapshot_dir(initialized_archive, str(snapshot_id)) + assert snapshot_dir is not None + title_path = snapshot_dir / "title" / "title.txt" + wget_path = snapshot_dir / "wget" / "example.com" / "index.html" + assert title_path.is_file() + assert wget_path.is_file() + assert title_path.read_text(encoding="utf-8").strip() == "Example Domain" + assert "Example Domain" in wget_path.read_text(encoding="utf-8") + assert archiveresults["title"].status == ArchiveResult.StatusChoices.SUCCEEDED + assert archiveresults["title"].output_str == "Example Domain" + assert archiveresults["title"].output_files["title.txt"]["size"] == title_path.stat().st_size + assert archiveresults["wget"].status == ArchiveResult.StatusChoices.SUCCEEDED + assert archiveresults["wget"].output_str == "wget/example.com/index.html" + assert archiveresults["wget"].output_files["example.com/index.html"]["size"] == wget_path.stat().st_size + + +def test_extract_with_enabled_extractor_creates_archiveresult(initialized_archive): + """Test that extract creates ArchiveResult when extractor is enabled.""" + env = cli_env(PLUGINS="wget,title") + create_extract_snapshot(initialized_archive, env) + + with use_archivebox_db(initialized_archive): + snapshot_id = Snapshot.objects.values_list("id", flat=True).first() + + # Run extract with title extractor enabled + env = env.copy() + result = run_archivebox_cmd( + ["extract", "--plugins=wget,title", str(snapshot_id)], + cwd=initialized_archive, + env=env, + timeout=90, + ) + + assert result.returncode == 0, result.stderr or result.stdout + records = parse_jsonl_output(result.stdout) + result_records = { + record["plugin"]: record + for record in records + if record.get("type") == "ArchiveResult" 
+ and record.get("snapshot_id") == str(snapshot_id) + and record.get("plugin") in {"wget", "title"} + } + assert set(result_records) == {"wget", "title"}, records + assert result_records["title"]["status"] == ArchiveResult.StatusChoices.SUCCEEDED + assert result_records["title"]["output_str"] == "Example Domain" + assert result_records["wget"]["status"] == ArchiveResult.StatusChoices.SUCCEEDED + assert result_records["wget"]["output_str"] == "wget/example.com/index.html" + with use_archivebox_db(initialized_archive): + archiveresults = {row.plugin: row for row in ArchiveResult.objects.filter(snapshot_id=snapshot_id, plugin__in=("wget", "title"))} + snapshot_dir = find_snapshot_dir(initialized_archive, str(snapshot_id)) + assert snapshot_dir is not None + title_path = snapshot_dir / "title" / "title.txt" + wget_path = snapshot_dir / "wget" / "example.com" / "index.html" + assert title_path.is_file() + assert wget_path.is_file() + assert title_path.read_text(encoding="utf-8").strip() == "Example Domain" + assert "Example Domain" in wget_path.read_text(encoding="utf-8") + assert archiveresults["title"].status == ArchiveResult.StatusChoices.SUCCEEDED + assert archiveresults["title"].output_str == "Example Domain" + assert archiveresults["title"].output_files["title.txt"]["size"] == title_path.stat().st_size + assert archiveresults["wget"].status == ArchiveResult.StatusChoices.SUCCEEDED + assert archiveresults["wget"].output_str == "wget/example.com/index.html" + assert archiveresults["wget"].output_files["example.com/index.html"]["size"] == wget_path.stat().st_size + + +def test_extract_plugin_option_accepted(initialized_archive): + """Test that --plugin option is accepted.""" + env = cli_env(PLUGINS="wget,title") + create_extract_snapshot(initialized_archive, env) + + with use_archivebox_db(initialized_archive): + snapshot_id = Snapshot.objects.values_list("id", flat=True).first() + + result = run_archivebox_cmd( + ["extract", "--plugins=wget,title", 
str(snapshot_id)], + cwd=initialized_archive, + env=env, + timeout=90, + ) + + assert result.returncode == 0, result.stderr or result.stdout + records = parse_jsonl_output(result.stdout) + result_records = { + record["plugin"]: record + for record in records + if record.get("type") == "ArchiveResult" + and record.get("snapshot_id") == str(snapshot_id) + and record.get("plugin") in {"wget", "title"} + } + assert set(result_records) == {"wget", "title"}, records + assert result_records["title"]["status"] == ArchiveResult.StatusChoices.SUCCEEDED + assert result_records["title"]["output_str"] == "Example Domain" + assert result_records["wget"]["status"] == ArchiveResult.StatusChoices.SUCCEEDED + assert result_records["wget"]["output_str"] == "wget/example.com/index.html" + with use_archivebox_db(initialized_archive): + archiveresults = {row.plugin: row for row in ArchiveResult.objects.filter(snapshot_id=snapshot_id, plugin__in=("wget", "title"))} + snapshot_dir = find_snapshot_dir(initialized_archive, str(snapshot_id)) + assert snapshot_dir is not None + title_path = snapshot_dir / "title" / "title.txt" + wget_path = snapshot_dir / "wget" / "example.com" / "index.html" + assert title_path.is_file() + assert wget_path.is_file() + assert title_path.read_text(encoding="utf-8").strip() == "Example Domain" + assert "Example Domain" in wget_path.read_text(encoding="utf-8") + assert archiveresults["title"].status == ArchiveResult.StatusChoices.SUCCEEDED + assert archiveresults["title"].output_str == "Example Domain" + assert archiveresults["title"].output_files["title.txt"]["size"] == title_path.stat().st_size + assert archiveresults["wget"].status == ArchiveResult.StatusChoices.SUCCEEDED + assert archiveresults["wget"].output_str == "wget/example.com/index.html" + assert archiveresults["wget"].output_files["example.com/index.html"]["size"] == wget_path.stat().st_size + + +def test_extract_stdin_snapshot_id(initialized_archive): + """Test that extract reads snapshot IDs from 
stdin.""" + env = cli_env(PLUGINS="wget,title") + create_extract_snapshot(initialized_archive, env) + + with use_archivebox_db(initialized_archive): + snapshot_id = Snapshot.objects.values_list("id", flat=True).first() + + result = run_archivebox_cmd( + ["extract", "--plugins=wget,title"], + cwd=initialized_archive, + input=f"{snapshot_id}\n", + env=env, + timeout=90, + ) + + assert result.returncode == 0, result.stderr or result.stdout + records = parse_jsonl_output(result.stdout) + result_records = { + record["plugin"]: record + for record in records + if record.get("type") == "ArchiveResult" + and record.get("snapshot_id") == str(snapshot_id) + and record.get("plugin") in {"wget", "title"} + } + assert set(result_records) == {"wget", "title"}, records + assert result_records["title"]["status"] == ArchiveResult.StatusChoices.SUCCEEDED + assert result_records["title"]["output_str"] == "Example Domain" + assert result_records["wget"]["status"] == ArchiveResult.StatusChoices.SUCCEEDED + assert result_records["wget"]["output_str"] == "wget/example.com/index.html" + with use_archivebox_db(initialized_archive): + archiveresults = {row.plugin: row for row in ArchiveResult.objects.filter(snapshot_id=snapshot_id, plugin__in=("wget", "title"))} + snapshot_dir = find_snapshot_dir(initialized_archive, str(snapshot_id)) + assert snapshot_dir is not None + title_path = snapshot_dir / "title" / "title.txt" + wget_path = snapshot_dir / "wget" / "example.com" / "index.html" + assert title_path.is_file() + assert wget_path.is_file() + assert title_path.read_text(encoding="utf-8").strip() == "Example Domain" + assert "Example Domain" in wget_path.read_text(encoding="utf-8") + assert archiveresults["title"].status == ArchiveResult.StatusChoices.SUCCEEDED + assert archiveresults["title"].output_str == "Example Domain" + assert archiveresults["title"].output_files["title.txt"]["size"] == title_path.stat().st_size + assert archiveresults["wget"].status == 
ArchiveResult.StatusChoices.SUCCEEDED + assert archiveresults["wget"].output_str == "wget/example.com/index.html" + assert archiveresults["wget"].output_files["example.com/index.html"]["size"] == wget_path.stat().st_size + + +def test_extract_stdin_jsonl_input(initialized_archive): + """Test that extract reads JSONL records from stdin.""" + env = cli_env(PLUGINS="wget,title") + create_extract_snapshot(initialized_archive, env) + + with use_archivebox_db(initialized_archive): + snapshot_id = Snapshot.objects.values_list("id", flat=True).first() + + jsonl_input = json.dumps({"type": "Snapshot", "id": str(snapshot_id)}) + "\n" + + result = run_archivebox_cmd( + ["extract", "--plugins=wget,title"], + cwd=initialized_archive, + input=jsonl_input, + env=env, + timeout=90, + ) + + assert result.returncode == 0, result.stderr or result.stdout + records = parse_jsonl_output(result.stdout) + result_records = { + record["plugin"]: record + for record in records + if record.get("type") == "ArchiveResult" + and record.get("snapshot_id") == str(snapshot_id) + and record.get("plugin") in {"wget", "title"} + } + assert set(result_records) == {"wget", "title"}, records + assert result_records["title"]["status"] == ArchiveResult.StatusChoices.SUCCEEDED + assert result_records["title"]["output_str"] == "Example Domain" + assert result_records["wget"]["status"] == ArchiveResult.StatusChoices.SUCCEEDED + assert result_records["wget"]["output_str"] == "wget/example.com/index.html" + with use_archivebox_db(initialized_archive): + archiveresults = {row.plugin: row for row in ArchiveResult.objects.filter(snapshot_id=snapshot_id, plugin__in=("wget", "title"))} + snapshot_dir = find_snapshot_dir(initialized_archive, str(snapshot_id)) + assert snapshot_dir is not None + title_path = snapshot_dir / "title" / "title.txt" + wget_path = snapshot_dir / "wget" / "example.com" / "index.html" + assert title_path.is_file() + assert wget_path.is_file() + assert 
title_path.read_text(encoding="utf-8").strip() == "Example Domain" + assert "Example Domain" in wget_path.read_text(encoding="utf-8") + assert archiveresults["title"].status == ArchiveResult.StatusChoices.SUCCEEDED + assert archiveresults["title"].output_str == "Example Domain" + assert archiveresults["title"].output_files["title.txt"]["size"] == title_path.stat().st_size + assert archiveresults["wget"].status == ArchiveResult.StatusChoices.SUCCEEDED + assert archiveresults["wget"].output_str == "wget/example.com/index.html" + assert archiveresults["wget"].output_files["example.com/index.html"]["size"] == wget_path.stat().st_size + + +def test_extract_pipeline_from_snapshot(initialized_archive): + """Test piping snapshot output to extract.""" + env = cli_env(PLUGINS="wget,title") + + result = subprocess.run( + ["bash", "-lc", "set -o pipefail; archivebox snapshot create https://example.com | archivebox extract --plugins=wget,title"], + cwd=initialized_archive, + capture_output=True, + text=True, + env=env, + timeout=90, + ) + assert result.returncode == 0, result.stderr or result.stdout + + with use_archivebox_db(initialized_archive): + snapshot = Snapshot.objects.filter(url="https://example.com").first() + + assert snapshot is not None, "Snapshot should be created by pipeline" + records = parse_jsonl_output(result.stdout) + result_records = { + record["plugin"]: record + for record in records + if record.get("type") == "ArchiveResult" + and record.get("snapshot_id") == str(snapshot.id) + and record.get("plugin") in {"wget", "title"} + } + assert set(result_records) == {"wget", "title"}, records + assert result_records["title"]["status"] == ArchiveResult.StatusChoices.SUCCEEDED + assert result_records["title"]["output_str"] == "Example Domain" + assert result_records["wget"]["status"] == ArchiveResult.StatusChoices.SUCCEEDED + assert result_records["wget"]["output_str"] == "wget/example.com/index.html" + with use_archivebox_db(initialized_archive): + archiveresults 
= {row.plugin: row for row in ArchiveResult.objects.filter(snapshot_id=snapshot.id, plugin__in=("wget", "title"))} + snapshot_dir = find_snapshot_dir(initialized_archive, str(snapshot.id)) + assert snapshot_dir is not None + title_path = snapshot_dir / "title" / "title.txt" + wget_path = snapshot_dir / "wget" / "example.com" / "index.html" + assert title_path.is_file() + assert wget_path.is_file() + assert title_path.read_text(encoding="utf-8").strip() == "Example Domain" + assert "Example Domain" in wget_path.read_text(encoding="utf-8") + assert archiveresults["title"].status == ArchiveResult.StatusChoices.SUCCEEDED + assert archiveresults["title"].output_str == "Example Domain" + assert archiveresults["title"].output_files["title.txt"]["size"] == title_path.stat().st_size + assert archiveresults["wget"].status == ArchiveResult.StatusChoices.SUCCEEDED + assert archiveresults["wget"].output_str == "wget/example.com/index.html" + assert archiveresults["wget"].output_files["example.com/index.html"]["size"] == wget_path.stat().st_size + + +def test_extract_multiple_snapshots(initialized_archive): + """Test extracting from multiple snapshots.""" + env = cli_env(PLUGINS="wget,title") + + create_extract_snapshot(initialized_archive, env, "https://example.com") + create_extract_snapshot(initialized_archive, env, "https://example.org") + + with use_archivebox_db(initialized_archive): + snapshot_ids = list(Snapshot.objects.values_list("id", flat=True)) + + assert len(snapshot_ids) >= 2, "Should have at least 2 snapshots" + + # Extract from all snapshots + ids_input = "\n".join(str(snapshot_id) for snapshot_id in snapshot_ids) + "\n" + result = run_archivebox_cmd( + ["extract", "--plugins=wget,title"], + cwd=initialized_archive, + input=ids_input, + env=env, + timeout=90, + ) + assert result.returncode == 0, result.stderr + + with use_archivebox_db(initialized_archive): + count = Snapshot.objects.count() + result_rows = list(ArchiveResult.objects.filter(plugin__in=("wget", 
"title")).values_list("snapshot_id", "plugin", "status")) + + assert count >= 2, "Both snapshots should still exist after extraction" + assert len(result_rows) == len(snapshot_ids) * 2 + assert {(snapshot_id, plugin) for snapshot_id, plugin, _status in result_rows} == { + (snapshot_id, plugin) for snapshot_id in snapshot_ids for plugin in ("wget", "title") + } + assert all(status == ArchiveResult.StatusChoices.SUCCEEDED for _snapshot_id, _plugin, status in result_rows) + for snapshot_id in snapshot_ids: + with use_archivebox_db(initialized_archive): + snapshot = Snapshot.objects.get(id=snapshot_id) + archiveresults = { + row.plugin: row for row in ArchiveResult.objects.filter(snapshot_id=snapshot_id, plugin__in=("wget", "title")) + } + snapshot_dir = find_snapshot_dir(initialized_archive, str(snapshot_id)) + assert snapshot_dir is not None + title_path = snapshot_dir / "title" / "title.txt" + domain = snapshot.url.split("://", 1)[1].rstrip("/") + wget_path = snapshot_dir / "wget" / domain / "index.html" + assert title_path.is_file() + assert wget_path.is_file() + assert title_path.read_text(encoding="utf-8").strip() == archiveresults["title"].output_str + assert " 100 + assert "archivebox" in combined.lower() + + +def test_help_in_initialized_dir(initialized_archive): + """Test help command in initialized data directory.""" + result = run_archivebox_cmd(["help"]) + + assert result.returncode == 0 + combined = result.stdout + result.stderr + assert "init" in combined + assert "add" in combined diff --git a/archivebox/tests/test_cli_init.py b/archivebox/tests/test_cli_init.py new file mode 100644 index 0000000000..9f21e87419 --- /dev/null +++ b/archivebox/tests/test_cli_init.py @@ -0,0 +1,269 @@ +#!/usr/bin/env python3 +""" +Comprehensive tests for archivebox init command. +Verify init creates correct database schema, filesystem structure, and config. 
+""" + +import pytest +from django.utils import timezone +from django.db import connections +from django.db.migrations.recorder import MigrationRecorder + +from archivebox.config.common import get_config +from archivebox.core.models import ArchiveResult, Snapshot +from archivebox.crawls.models import Crawl +from archivebox.machine.models import Machine +from archivebox.tests.conftest import run_queued_crawls, run_archivebox_cmd, cli_env + +from archivebox.tests.test_orm_helpers import use_archivebox_db + +pytestmark = pytest.mark.django_db(transaction=True) + + +DIR_PERMISSIONS = get_config().OUTPUT_PERMISSIONS.replace("6", "7").replace("4", "5") + + +def test_init_creates_database_file(tmp_path): + """Test that init creates index.sqlite3 database file.""" + result = run_archivebox_cmd(["init"]) + + assert result.returncode == 0 + db_path = tmp_path / "index.sqlite3" + assert db_path.exists() + assert db_path.is_file() + + +def test_init_creates_archive_directory(tmp_path): + """Test that init creates archive directory.""" + run_archivebox_cmd(["init"]) + + archive_dir = tmp_path / "archive" + assert archive_dir.exists() + assert archive_dir.is_dir() + + +def test_init_uses_cwd_archive_and_users_dirs(tmp_path): + """Test that init creates archive/users storage roots under cwd.""" + + result = run_archivebox_cmd(["init"]) + + assert result.returncode == 0 + assert (tmp_path / "archive").is_dir() + assert (tmp_path / "archive" / "users").is_dir() + + +def test_init_creates_sources_directory(tmp_path): + """Test that init creates sources directory.""" + run_archivebox_cmd(["init"]) + + sources_dir = tmp_path / "sources" + assert sources_dir.exists() + assert sources_dir.is_dir() + + +def test_init_creates_logs_directory(tmp_path): + """Test that init creates logs directory.""" + run_archivebox_cmd(["init"]) + + logs_dir = tmp_path / "logs" + assert logs_dir.exists() + assert logs_dir.is_dir() + + +def test_init_creates_config_file(tmp_path): + """Test that init 
creates ArchiveBox.conf config file.""" + run_archivebox_cmd(["init"]) + + config_file = tmp_path / "ArchiveBox.conf" + assert config_file.exists() + assert config_file.is_file() + + +def test_init_runs_migrations(tmp_path): + """Test that init runs Django migrations and creates core tables.""" + run_archivebox_cmd(["init"]) + + with use_archivebox_db(tmp_path): + migration_count = MigrationRecorder.Migration.objects.count() + + assert migration_count > 0 + + +def test_init_creates_core_snapshot_table(tmp_path): + """Test that init creates core_snapshot table.""" + run_archivebox_cmd(["init"]) + + assert Snapshot._meta.db_table == "core_snapshot" + with use_archivebox_db(tmp_path): + assert Snapshot.objects.count() == 0 + + +def test_init_creates_crawls_crawl_table(tmp_path): + """Test that init creates crawls_crawl table.""" + run_archivebox_cmd(["init"]) + + assert Crawl._meta.db_table == "crawls_crawl" + with use_archivebox_db(tmp_path): + assert Crawl.objects.count() == 0 + + +def test_init_creates_core_archiveresult_table(tmp_path): + """Test that init creates core_archiveresult table.""" + run_archivebox_cmd(["init"]) + + assert ArchiveResult._meta.db_table == "core_archiveresult" + with use_archivebox_db(tmp_path): + assert ArchiveResult.objects.count() == 0 + + +def test_init_sets_correct_file_permissions(tmp_path): + """Test that init sets correct permissions on created files.""" + run_archivebox_cmd(["init"]) + + # Check database permissions + db_path = tmp_path / "index.sqlite3" + assert oct(db_path.stat().st_mode)[-3:] in (get_config().OUTPUT_PERMISSIONS, DIR_PERMISSIONS) + + # Check directory permissions + archive_dir = tmp_path / "archive" + assert oct(archive_dir.stat().st_mode)[-3:] in (get_config().OUTPUT_PERMISSIONS, DIR_PERMISSIONS) + + +def test_init_is_idempotent(tmp_path): + """Test that running init multiple times is safe (idempotent).""" + + # First init + result1 = run_archivebox_cmd(["init"]) + assert result1.returncode == 0 + assert 
"Initializing a new ArchiveBox" in result1.stdout + + # Second init should update, not fail + result2 = run_archivebox_cmd(["init"]) + assert result2.returncode == 0 + assert "updating existing ArchiveBox" in result2.stdout or "up-to-date" in result2.stdout.lower() + + # Database should still be valid + with use_archivebox_db(tmp_path): + count = MigrationRecorder.Migration.objects.count() + assert count > 0 + + +def test_init_refuses_database_migrated_by_newer_code(tmp_path): + """A downgraded ArchiveBox build must fail before serving a newer DB schema.""" + result = run_archivebox_cmd(["init"]) + assert result.returncode == 0 + + with use_archivebox_db(tmp_path): + MigrationRecorder.Migration.objects.create(app="crawls", name="9999_future_test", applied=timezone.now()) + connections["default"].commit() + + result = run_archivebox_cmd(["init"]) + assert result.returncode == 3 + assert "migrated by a newer version of ArchiveBox" in result.stderr + assert "crawls.9999_future_test" in result.stderr + assert "archivebox manage migrate crawls " in result.stderr + + +def test_init_recovers_from_pre_squash_dev_history(tmp_path): + """Pre-squash dev DBs (rows for migrations now absorbed by ``replaces=``) + must NOT trip the newer-DB guard โ€” every historical squash would otherwise + brick beta-tester collections that pre-date the squash commit.""" + result = run_archivebox_cmd(["init"]) + assert result.returncode == 0 + + # Sampling โ€” one name per affected app, all listed in the ``replaces=`` + # declarations of the current squash anchors. If any of these get treated + # as missing-from-code, dev DBs that ran the historical chain pre-squash + # would refuse to start. 
+ historical_pre_squash_rows = [ + ("api", "0002_alter_apitoken_options"), + ("api", "0009_rename_created_apitoken_created_at_and_more"), + ("core", "0023_alter_archiveresult_options_archiveresult_abid_and_more"), + ("core", "0074_alter_snapshot_downloaded_at"), + ("core", "0075_crawl"), + ("machine", "0002_alter_machine_stats_installedbinary"), + ("machine", "0004_alter_installedbinary_abspath_and_more"), + ] + with use_archivebox_db(tmp_path): + for app, name in historical_pre_squash_rows: + MigrationRecorder.Migration.objects.create(app=app, name=name, applied=timezone.now()) + connections["default"].commit() + + result = run_archivebox_cmd(["init"]) + assert result.returncode == 0, f"init refused to recover pre-squash dev DB.\nstdout={result.stdout}\nstderr={result.stderr}" + assert "migrated by a newer version of ArchiveBox" not in result.stderr + + +def test_init_with_existing_data_preserves_snapshots(initialized_archive): + """Test that re-running init preserves existing snapshot data.""" + env = cli_env(disable_extractors=True) + + # Add a snapshot + run_archivebox_cmd( + ["add", "--index-only", "--depth=0", "https://example.com"], + cwd=initialized_archive, + env=env, + ) + run_queued_crawls(initialized_archive, env, timeout=300) + + # Check snapshot was created + with use_archivebox_db(initialized_archive): + count_before = Snapshot.objects.count() + assert count_before == 1 + + # Run init again + result = run_archivebox_cmd(["init"], cwd=initialized_archive) + assert result.returncode == 0 + + # Snapshot should still exist + with use_archivebox_db(initialized_archive): + count_after = Snapshot.objects.count() + assert count_after == count_before + + +def test_init_quick_flag_skips_checks(tmp_path): + """Test that init --quick runs faster by skipping some checks.""" + + result = run_archivebox_cmd(["init", "--quick"]) + + assert result.returncode == 0 + # Database should still be created + db_path = tmp_path / "index.sqlite3" + assert db_path.exists() + + 
+def test_init_creates_machine_table(tmp_path): + """Test that init creates the machine_machine table.""" + run_archivebox_cmd(["init"]) + + assert Machine._meta.db_table == "machine_machine" + with use_archivebox_db(tmp_path): + Machine.objects.count() + + +def test_init_output_shows_collection_info(tmp_path): + """Test that init output shows helpful collection information.""" + result = run_archivebox_cmd(["init"]) + + output = result.stdout + # Should show some helpful info about the collection + assert "ArchiveBox" in output or "collection" in output.lower() or "Initializing" in output + + +def test_init_ignores_unrecognized_archive_directories(initialized_archive): + """Test that init upgrades existing dirs without choking on extra folders.""" + env = cli_env(disable_extractors=True) + run_archivebox_cmd( + ["add", "--index-only", "--depth=0", "https://example.com"], + env=env, + check=True, + ) + run_queued_crawls(initialized_archive, env) + (initialized_archive / "archive" / "some_random_folder").mkdir(parents=True, exist_ok=True) + + result = run_archivebox_cmd( + ["init"], + env=env, + ) + + assert result.returncode == 0, result.stdout + result.stderr diff --git a/archivebox/tests/test_cli_install.py b/archivebox/tests/test_cli_install.py new file mode 100644 index 0000000000..945ac07457 --- /dev/null +++ b/archivebox/tests/test_cli_install.py @@ -0,0 +1,176 @@ +#!/usr/bin/env python3 +""" +Comprehensive tests for archivebox install command. +Verify install detects and records binary dependencies in DB. 
+""" + +import os +from pathlib import Path + +from archivebox.tests.conftest import run_archivebox_cmd + +import pytest + +from archivebox.core.models import Snapshot +from archivebox.crawls.models import Crawl +from archivebox.machine.models import Binary +from archivebox.tests.test_orm_helpers import use_archivebox_db + +pytestmark = pytest.mark.django_db(transaction=True) + + +def test_install_runs_successfully(initialized_archive): + """Test that install command runs without error.""" + result = run_archivebox_cmd( + ["install", "--dry-run"], + timeout=60, + ) + + assert result.returncode == 0, result.stderr or result.stdout + assert "Dry run - would detect ArchiveBox dependencies" in result.stdout + + +def test_install_creates_binary_records_in_db(initialized_archive): + """Test that install --dry-run does not create Binary records in database.""" + + result = run_archivebox_cmd( + ["install", "--dry-run"], + timeout=60, + ) + assert result.returncode == 0, result.stderr or result.stdout + + with use_archivebox_db(initialized_archive): + assert Binary.objects.count() == 0 + + +def test_install_dry_run_does_not_install(initialized_archive): + """Test that --dry-run doesn't actually install anything.""" + + result = run_archivebox_cmd( + ["install", "--dry-run"], + timeout=60, + ) + + assert result.returncode == 0, result.stderr or result.stdout + assert result.stdout.strip() == "Dry run - would detect ArchiveBox dependencies and run the abx-dl install flow" + + +def test_install_detects_system_binaries(initialized_archive): + """Test that install detects existing system binaries.""" + + result = run_archivebox_cmd( + ["install", "--dry-run"], + timeout=60, + ) + + assert result.returncode == 0, result.stderr or result.stdout + assert "ArchiveBox dependencies" in result.stdout + + +def test_install_shows_binary_status(initialized_archive): + """Test that install shows status of binaries.""" + + result = run_archivebox_cmd( + ["install", "--dry-run"], + 
timeout=60, + ) + + assert result.returncode == 0, result.stderr or result.stdout + assert result.stdout.strip() == "Dry run - would detect ArchiveBox dependencies and run the abx-dl install flow" + + +def test_install_dry_run_prints_dry_run_message(initialized_archive): + """Test that install --dry-run clearly reports that no changes will be made.""" + result = run_archivebox_cmd( + ["install", "--dry-run"], + timeout=60, + ) + + assert result.returncode == 0 + assert "dry run" in result.stdout.lower() + + +def test_install_help_lists_dry_run_flag(tmp_path): + """Test that install --help documents the dry-run option.""" + result = run_archivebox_cmd( + ["install", "--help"], + ) + + assert result.returncode == 0 + assert "--dry-run" in result.stdout or "-d" in result.stdout + + +def test_install_invalid_option_fails(tmp_path): + """Test that invalid install options fail cleanly.""" + result = run_archivebox_cmd( + ["install", "--invalid-option"], + ) + + assert result.returncode != 0 + + +def test_install_from_empty_dir_initializes_collection(tmp_path): + """Test that install bootstraps an empty dir before performing work.""" + env = os.environ.copy() + tmp_short = Path("/tmp") / f"abx-install-empty-{tmp_path.name}" + tmp_short.mkdir(parents=True, exist_ok=True) + env.update( + { + "TMP_DIR": str(tmp_short), + "ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS": "true", + }, + ) + + result = run_archivebox_cmd( + ["install", "git"], + cwd=tmp_path, + timeout=120, + env=env, + ) + + output = result.stdout + result.stderr + assert result.returncode == 0, output + assert "Initializing a new ArchiveBox" in output + assert "Installing specific binaries: git" in output + assert (tmp_path / "ArchiveBox.conf").is_file() + assert (tmp_path / "index.sqlite3").is_file() + + with use_archivebox_db(tmp_path): + assert Snapshot.objects.count() == 0 + assert Crawl.objects.count() == 0 + assert Binary.objects.filter(status="installed", name="git").count() == 1 + + +def 
test_install_updates_binary_table(initialized_archive): + """Test that install completes and only mutates dependency state.""" + env = os.environ.copy() + tmp_short = Path("/tmp") / f"abx-install-{initialized_archive.name}" + tmp_short.mkdir(parents=True, exist_ok=True) + env.update( + { + "TMP_DIR": str(tmp_short), + "ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS": "true", + }, + ) + + result = run_archivebox_cmd( + ["install", "git"], + timeout=120, + env=env, + ) + + output = result.stdout + result.stderr + assert result.returncode == 0, output + + with use_archivebox_db(initialized_archive): + binary_counts = { + status: Binary.objects.filter(status=status).count() for status in Binary.objects.values_list("status", flat=True).distinct() + } + snapshot_count = Snapshot.objects.count() + sealed_crawls = Crawl.objects.filter(status="sealed").count() + installed_git = Binary.objects.filter(status="installed", name="git").count() + + assert sealed_crawls == 0 + assert snapshot_count == 0 + assert binary_counts.get("installed", 0) > 0 + assert installed_git == 1 diff --git a/archivebox/tests/test_cli_list.py b/archivebox/tests/test_cli_list.py new file mode 100644 index 0000000000..bb791cefa4 --- /dev/null +++ b/archivebox/tests/test_cli_list.py @@ -0,0 +1,457 @@ +#!/usr/bin/env python3 +""" +Tests for archivebox list command. +Verify list emits snapshot JSONL and applies the documented filters. 
+""" + +import json +import sys + +import pytest +from django.db import connection +from django.utils import timezone + +from archivebox.core.models import Snapshot +from archivebox.tests.conftest import create_test_url, parse_jsonl_output, run_archivebox_cmd, run_queued_crawls, cli_env + +from archivebox.tests.test_orm_helpers import use_archivebox_db + +pytestmark = pytest.mark.django_db(transaction=True) + + +class CountingStdout: + encoding = "utf-8" + + def __init__(self): + self.rows = 0 + self._pending = "" + + def isatty(self): + return False + + def write(self, text): + self._pending += text + lines = self._pending.split("\n") + self._pending = lines.pop() + self.rows += sum(1 for line in lines if line.startswith("{")) + return len(text) + + def flush(self): + return None + + +def test_list_limit_zero_streams_one_million_snapshots_without_materializing(admin_user, monkeypatch): + """Regression: archivebox list --limit=0 must stream unbounded result sets.""" + from archivebox.cli.archivebox_snapshot import list_snapshots + from archivebox.crawls.models import Crawl + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by=admin_user, + status=Crawl.StatusChoices.SEALED, + retry_at=None, + ) + now = timezone.now().isoformat() + with connection.cursor() as cursor: + cursor.execute( + """ + WITH RECURSIVE seq(n) AS ( + SELECT 1 + UNION ALL + SELECT n + 1 FROM seq WHERE n < 1000000 + ) + INSERT INTO core_snapshot ( + id, + url, + timestamp, + title, + bookmarked_at, + created_at, + modified_at, + downloaded_at, + fs_version, + crawl_id, + config, + current_step, + depth, + notes, + num_uses_failed, + num_uses_succeeded, + retry_at, + status, + delete_at, + output_size, + parent_snapshot_id + ) + SELECT + lower(hex(randomblob(16))), + 'https://example.com/page-' || n, + printf('9%031d', n), + '', + %s, + %s, + %s, + NULL, + '0.9.0', + %s, + '{}', + 0, + 0, + '', + 0, + 0, + NULL, + 'sealed', + NULL, + 0, + NULL + FROM seq + """, + [now, now, 
now, str(crawl.id).replace("-", "")], + ) + + stdout = CountingStdout() + monkeypatch.setattr(sys, "stdout", stdout) + + assert list_snapshots(limit=0) == 0 + assert stdout.rows == 1000000 + + +def test_list_outputs_existing_snapshots_as_jsonl(initialized_archive): + """Test that list prints one JSON object per stored snapshot.""" + env = cli_env(disable_extractors=True) + for url in ["https://example.com", "https://iana.org"]: + run_archivebox_cmd( + ["add", "--index-only", "--depth=0", url], + env=env, + check=True, + ) + run_queued_crawls(initialized_archive, env) + + result = run_archivebox_cmd( + ["list"], + timeout=30, + ) + + rows = parse_jsonl_output(result.stdout) + urls = {row["url"] for row in rows} + + assert result.returncode == 0, result.stderr + assert "https://example.com" in urls + assert "https://iana.org" in urls + + +def test_list_filters_by_url_icontains(initialized_archive): + """Test that list --url__icontains returns only matching snapshots.""" + env = cli_env(disable_extractors=True) + for url in ["https://example.com", "https://iana.org"]: + run_archivebox_cmd( + ["add", "--index-only", "--depth=0", url], + env=env, + check=True, + ) + run_queued_crawls(initialized_archive, env) + + result = run_archivebox_cmd( + ["list", "--url__icontains", "example.com"], + timeout=30, + ) + + rows = parse_jsonl_output(result.stdout) + assert result.returncode == 0, result.stderr + assert len(rows) == 1 + assert rows[0]["url"] == "https://example.com" + + +def test_list_filters_by_crawl_id_and_limit(initialized_archive): + """Test that crawl-id and limit filters constrain the result set.""" + env = cli_env(disable_extractors=True) + for url in ["https://example.com", "https://iana.org"]: + run_archivebox_cmd( + ["add", "--index-only", "--depth=0", url], + env=env, + check=True, + ) + run_queued_crawls(initialized_archive, env) + + with use_archivebox_db(initialized_archive): + crawl_id = str(Snapshot.objects.values_list("crawl_id", 
flat=True).get(url="https://example.com")) + + result = run_archivebox_cmd( + ["list", "--crawl-id", crawl_id, "--limit", "1"], + timeout=30, + ) + + rows = parse_jsonl_output(result.stdout) + assert result.returncode == 0, result.stderr + assert len(rows) == 1 + assert rows[0]["crawl_id"].replace("-", "") == crawl_id.replace("-", "") + assert rows[0]["url"] == "https://example.com" + + +def test_list_filters_by_status(initialized_archive): + """Test that list can filter using the current snapshot status.""" + env = cli_env(disable_extractors=True) + run_archivebox_cmd( + ["add", "--index-only", "--depth=0", "https://example.com"], + env=env, + check=True, + ) + run_queued_crawls(initialized_archive, env) + + with use_archivebox_db(initialized_archive): + status = Snapshot.objects.values_list("status", flat=True).get() + + result = run_archivebox_cmd( + ["list", "--status", status], + timeout=30, + ) + + rows = parse_jsonl_output(result.stdout) + assert result.returncode == 0, result.stderr + assert len(rows) == 1 + assert rows[0]["status"] == status + + +def test_list_help_lists_filter_options(initialized_archive): + """Test that list --help documents the supported filter flags.""" + + result = run_archivebox_cmd( + ["list", "--help"], + timeout=30, + ) + + assert result.returncode == 0 + assert "--url__icontains" in result.stdout + assert "--crawl-id" in result.stdout + assert "--limit" in result.stdout + assert "--search" in result.stdout + assert "--json" in result.stdout + assert "--html" in result.stdout + assert "--with-headers" in result.stdout + + +def test_list_allows_sort_with_limit(initialized_archive): + """Test that list can sort and then apply limit without queryset slicing errors.""" + env = cli_env(disable_extractors=True) + for url in ["https://example.com", "https://iana.org", "https://example.net"]: + run_archivebox_cmd( + ["add", "--index-only", "--depth=0", url], + env=env, + check=True, + ) + run_queued_crawls(initialized_archive, env) + + 
result = run_archivebox_cmd( + ["list", "--limit", "2", "--sort", "-created_at"], + timeout=30, + ) + + rows = parse_jsonl_output(result.stdout) + assert result.returncode == 0, result.stderr + assert len(rows) == 2 + + +def test_snapshot_list_search_meta(initialized_archive): + """snapshot list should support metadata search mode.""" + url = create_test_url(domain="meta-search-example.com") + run_archivebox_cmd(["snapshot", "create", url], cwd=initialized_archive, default_cli_env=True, disable_extractors=True) + + _cmd_result = run_archivebox_cmd( + ["snapshot", "list", "--search=meta", "meta-search-example.com"], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout, stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0, f"Command failed: {stderr}" + records = parse_jsonl_output(stdout) + assert len(records) == 1 + assert "meta-search-example.com" in records[0]["url"] + + +def test_list_search_meta_matches_metadata(initialized_archive): + """top-level list --search=meta should apply metadata search to the queryset.""" + url = create_test_url(domain="top-level-meta-search-example.com") + run_archivebox_cmd(["snapshot", "create", url], cwd=initialized_archive, default_cli_env=True, disable_extractors=True) + + _cmd_result = run_archivebox_cmd( + ["list", "--search=meta", "top-level-meta-search-example.com"], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout, stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0, f"Command failed: {stderr}" + records = parse_jsonl_output(stdout) + assert len(records) == 1 + assert "top-level-meta-search-example.com" in records[0]["url"] + + +def test_search_command_finds_snapshots(initialized_archive): + run_archivebox_cmd( + ["snapshot", "create", "https://example.com"], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + + _cmd_result 
= run_archivebox_cmd(["search", "example"], cwd=initialized_archive, default_cli_env=True, disable_extractors=True) + stdout, stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0, stderr + assert "example" in stdout + + +def test_search_command_returns_no_results_for_missing_term(initialized_archive): + run_archivebox_cmd( + ["snapshot", "create", "https://example.com"], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + + _cmd_result = run_archivebox_cmd( + ["search", "nonexistentterm12345"], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + _stdout, _stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code in [0, 1] + + +def test_search_command_on_empty_archive(initialized_archive): + _cmd_result = run_archivebox_cmd(["search", "anything"], cwd=initialized_archive, default_cli_env=True, disable_extractors=True) + _stdout, _stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code in [0, 1] + + +def test_search_command_outputs_matching_snapshots_as_jsonl(initialized_archive): + run_archivebox_cmd( + ["snapshot", "create", "https://example.com"], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + + _cmd_result = run_archivebox_cmd(["search"], cwd=initialized_archive, default_cli_env=True, disable_extractors=True) + stdout, stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0, stderr + records = parse_jsonl_output(stdout) + assert any("example.com" in row.get("url", "") for row in records) + + +def test_search_command_json_outputs_matching_snapshots(initialized_archive): + run_archivebox_cmd( + ["snapshot", "create", "https://example.com"], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + + result = run_archivebox_cmd(["search", "--json"], cwd=initialized_archive, 
default_cli_env=True, disable_extractors=True) + + assert result.returncode == 0, result.stderr + payload = json.loads(result.stdout) + assert any("example.com" in row.get("url", "") for row in payload) + + +def test_search_command_json_with_headers_wraps_links_payload(initialized_archive): + run_archivebox_cmd( + ["snapshot", "create", "https://example.com"], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + + result = run_archivebox_cmd( + ["search", "--json", "--with-headers"], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + + assert result.returncode == 0, result.stderr + payload = json.loads(result.stdout) + assert "links" in payload + assert any("example.com" in row.get("url", "") for row in payload["links"]) + + +def test_search_command_html_outputs_markup(initialized_archive): + run_archivebox_cmd( + ["snapshot", "create", "https://example.com"], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + + result = run_archivebox_cmd(["search", "--html"], cwd=initialized_archive, default_cli_env=True, disable_extractors=True) + + assert result.returncode == 0, result.stderr + assert "<" in result.stdout + assert "example.com" in result.stdout + + +def test_search_command_csv_outputs_requested_column(initialized_archive): + run_archivebox_cmd( + ["snapshot", "create", "https://example.com"], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + + _cmd_result = run_archivebox_cmd( + ["search", "--csv", "url", "--with-headers"], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout, stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0, stderr + assert "url" in stdout + assert "example.com" in stdout + + +def test_search_command_with_headers_requires_structured_output_format(initialized_archive): + _cmd_result = run_archivebox_cmd(["search", 
"--with-headers"], cwd=initialized_archive, default_cli_env=True, disable_extractors=True) + _stdout, stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code != 0 + assert "requires" in stderr.lower() + assert "json" in stderr.lower() + + +def test_search_command_sort_option_runs_successfully(initialized_archive): + for url in ["https://iana.org", "https://example.com"]: + run_archivebox_cmd(["snapshot", "create", url], cwd=initialized_archive, default_cli_env=True, disable_extractors=True) + + _cmd_result = run_archivebox_cmd( + ["search", "--csv", "url", "--sort=url"], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout, stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0, stderr + assert "example.com" in stdout or "iana.org" in stdout + + +def test_search_command_help_lists_supported_filters(initialized_archive): + _cmd_result = run_archivebox_cmd(["search", "--help"], cwd=initialized_archive, default_cli_env=True, disable_extractors=True) + stdout, _stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0 + assert "--url__icontains" in stdout + assert "--crawl-id" in stdout + assert "--status" in stdout + assert "--sort" in stdout + assert "--json" in stdout + assert "--html" in stdout diff --git a/archivebox/tests/test_cli_machine.py b/archivebox/tests/test_cli_machine.py new file mode 100644 index 0000000000..602ea12db2 --- /dev/null +++ b/archivebox/tests/test_cli_machine.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 +""" +Tests for archivebox machine command. + +TODO: expand beyond command discovery into list/filter behavior. 
+""" + +from archivebox.tests.conftest import run_archivebox_cmd + + +def test_machine_help_runs_successfully(tmp_path): + """The machine command should be registered and expose help.""" + + result = run_archivebox_cmd(["machine", "--help"]) + + assert result.returncode == 0 + assert "machine" in result.stdout.lower() + assert "list" in result.stdout diff --git a/archivebox/tests/test_cli_manage.py b/archivebox/tests/test_cli_manage.py new file mode 100644 index 0000000000..4ac0e6e8f5 --- /dev/null +++ b/archivebox/tests/test_cli_manage.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 +""" +Tests for archivebox manage command. +Verify manage command runs Django management commands. +""" + +from archivebox.tests.conftest import run_archivebox_cmd + + +def test_manage_help_works(initialized_archive): + """Test that manage help command works.""" + + result = run_archivebox_cmd( + ["manage", "help"], + timeout=30, + ) + + assert result.returncode == 0 + assert len(result.stdout) > 100 + + +def test_manage_showmigrations_works(initialized_archive): + """Test that manage showmigrations works.""" + + result = run_archivebox_cmd( + ["manage", "showmigrations"], + timeout=30, + ) + + assert result.returncode == 0 + # Should show migration status + assert "core" in result.stdout or "[" in result.stdout + + +def test_manage_dbshell_command_exists(initialized_archive): + """Test that manage dbshell command is recognized.""" + + result = run_archivebox_cmd( + ["manage", "help", "dbshell"], + timeout=30, + ) + + # Should show help for dbshell + assert result.returncode == 0 + assert "dbshell" in result.stdout or "database" in result.stdout.lower() + + +def test_manage_check_works(initialized_archive): + """Test that manage check works.""" + + result = run_archivebox_cmd( + ["manage", "check"], + timeout=30, + ) + + assert result.returncode == 0, result.stderr or result.stdout + assert "System check identified no issues" in result.stdout diff --git a/archivebox/tests/test_cli_mcp.py 
b/archivebox/tests/test_cli_mcp.py new file mode 100644 index 0000000000..82b2c97bda --- /dev/null +++ b/archivebox/tests/test_cli_mcp.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 +""" +Tests for archivebox mcp command. + +TODO: expand beyond command discovery into JSON-RPC stdio behavior. +""" + +from archivebox.tests.conftest import run_archivebox_cmd + + +def test_mcp_help_runs_successfully(tmp_path): + """The mcp command should be registered and expose help.""" + + result = run_archivebox_cmd(["mcp", "--help"]) + + assert result.returncode == 0 + assert "mcp" in result.stdout.lower() diff --git a/archivebox/tests/test_cli_persona.py b/archivebox/tests/test_cli_persona.py new file mode 100644 index 0000000000..6f45cc543c --- /dev/null +++ b/archivebox/tests/test_cli_persona.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 +""" +Tests for archivebox persona command. + +TODO: expand beyond command discovery into create/list/update/delete behavior. +""" + +from archivebox.tests.conftest import run_archivebox_cmd + + +def test_persona_help_runs_successfully(tmp_path): + """The persona command should be registered and expose help.""" + + result = run_archivebox_cmd(["persona", "--help"]) + + assert result.returncode == 0 + assert "persona" in result.stdout.lower() + assert "list" in result.stdout diff --git a/archivebox/tests/test_cli_piping.py b/archivebox/tests/test_cli_piping.py new file mode 100644 index 0000000000..4c85765135 --- /dev/null +++ b/archivebox/tests/test_cli_piping.py @@ -0,0 +1,430 @@ +""" +Tests for JSONL piping contracts and `archivebox run`. 
+ +This file covers both: +- low-level JSONL/stdin parsing behavior that makes CLI piping work +- subprocess integration for the supported records `archivebox run` consumes +""" + +import sys +import uuid +from io import StringIO + +import pytest + +from archivebox.core.models import Snapshot +from archivebox.machine.models import Binary +from archivebox.tests.conftest import ( + assert_jsonl_only, + create_test_url, + parse_jsonl_output, + run_archivebox_cmd, +) +from archivebox.tests.test_orm_helpers import use_archivebox_db + +pytestmark = pytest.mark.django_db(transaction=True) + + +PIPE_TEST_ENV = { + "PLUGINS": "favicon", + "SAVE_FAVICON": "True", + "USE_COLOR": "False", + "SHOW_PROGRESS": "False", +} + + +class MockTTYStringIO(StringIO): + def __init__(self, initial_value: str = "", *, is_tty: bool): + super().__init__(initial_value) + self._is_tty = is_tty + + def isatty(self) -> bool: + return self._is_tty + + +def test_parse_line_accepts_supported_piping_inputs(): + """The JSONL parser should normalize the input forms CLI pipes accept.""" + from archivebox.misc.jsonl import TYPE_CRAWL, TYPE_SNAPSHOT, parse_line + + assert parse_line("") is None + assert parse_line(" ") is None + assert parse_line("# comment") is None + assert parse_line("not-a-url") is None + assert parse_line("ftp://example.com") is None + + plain_url = parse_line("https://example.com") + assert plain_url == {"type": TYPE_SNAPSHOT, "url": "https://example.com"} + + assert parse_line("file:///tmp/example.txt") is None + + snapshot_json = parse_line('{"type":"Snapshot","url":"https://example.com","tags":"tag1,tag2"}') + assert snapshot_json is not None + assert snapshot_json["type"] == TYPE_SNAPSHOT + assert snapshot_json["tags"] == "tag1,tag2" + + crawl_json = parse_line('{"type":"Crawl","id":"abc123","urls":"https://example.com","max_depth":1}') + assert crawl_json is not None + assert crawl_json["type"] == TYPE_CRAWL + assert crawl_json["id"] == "abc123" + assert crawl_json["max_depth"] 
== 1 + + snapshot_id = "01234567-89ab-cdef-0123-456789abcdef" + parsed_id = parse_line(snapshot_id) + assert parsed_id == {"type": TYPE_SNAPSHOT, "id": snapshot_id} + + compact_snapshot_id = "0123456789abcdef0123456789abcdef" + compact_parsed_id = parse_line(compact_snapshot_id) + assert compact_parsed_id == {"type": TYPE_SNAPSHOT, "id": compact_snapshot_id} + + +def test_read_args_or_stdin_handles_args_stdin_and_mixed_jsonl(tmp_path): + """Piping helpers should consume args, structured JSONL, and pass-through records.""" + from archivebox.misc.jsonl import TYPE_CRAWL, read_args_or_stdin + + records = list(read_args_or_stdin(("https://example1.com", "https://example2.com"))) + assert [record["url"] for record in records] == ["https://example1.com", "https://example2.com"] + + local_file = tmp_path / "urls.txt" + local_file.write_text("https://from-file-arg.example\n") + assert list(read_args_or_stdin((str(local_file),))) == [] + + stdin_records = list( + read_args_or_stdin( + (), + stream=MockTTYStringIO( + "https://plain-url.com\n" + '{"type":"Snapshot","url":"https://jsonl-url.com","tags":"test"}\n' + '{"type":"Tag","id":"tag-1","name":"example"}\n' + "01234567-89ab-cdef-0123-456789abcdef\n" + "not valid json\n", + is_tty=False, + ), + ), + ) + assert len(stdin_records) == 4 + assert stdin_records[0]["url"] == "https://plain-url.com" + assert stdin_records[1]["url"] == "https://jsonl-url.com" + assert stdin_records[1]["tags"] == "test" + assert stdin_records[2]["type"] == "Tag" + assert stdin_records[2]["name"] == "example" + assert stdin_records[3]["id"] == "01234567-89ab-cdef-0123-456789abcdef" + + crawl_records = list( + read_args_or_stdin( + (), + stream=MockTTYStringIO( + '{"type":"Crawl","id":"crawl-1","urls":"https://example.com\\nhttps://foo.com"}\n', + is_tty=False, + ), + ), + ) + assert len(crawl_records) == 1 + assert crawl_records[0]["type"] == TYPE_CRAWL + assert crawl_records[0]["id"] == "crawl-1" + + tty_records = list(read_args_or_stdin((), 
stream=MockTTYStringIO("https://example.com", is_tty=True))) + assert tty_records == [] + + +def test_collect_urls_from_plugins_reads_only_parser_outputs(tmp_path): + """Parser extractor `urls.jsonl` outputs should be discoverable for recursive piping.""" + from archivebox.plugins.hooks import collect_urls_from_plugins + + (tmp_path / "wget").mkdir() + (tmp_path / "wget" / "urls.jsonl").write_text( + '{"url":"https://wget-link-1.com"}\n{"url":"https://wget-link-2.com"}\n', + encoding="utf-8", + ) + (tmp_path / "parse_html_urls").mkdir() + (tmp_path / "parse_html_urls" / "urls.jsonl").write_text( + '{"url":"https://html-link-1.com"}\n{"url":"https://html-link-2.com","title":"HTML Link 2"}\n', + encoding="utf-8", + ) + (tmp_path / "screenshot").mkdir() + + urls = collect_urls_from_plugins(tmp_path) + assert len(urls) == 4 + assert {url["plugin"] for url in urls} == {"wget", "parse_html_urls"} + titled = [url for url in urls if url.get("title") == "HTML Link 2"] + assert len(titled) == 1 + assert titled[0]["url"] == "https://html-link-2.com" + + assert collect_urls_from_plugins(tmp_path / "nonexistent") == [] + + +def test_collect_urls_from_plugins_trims_markdown_suffixes(tmp_path): + from archivebox.plugins.hooks import collect_urls_from_plugins + + (tmp_path / "parse_html_urls").mkdir() + (tmp_path / "parse_html_urls" / "urls.jsonl").write_text( + '{"url":"https://docs.sweeting.me/s/youtube-favorites)**"}\n', + encoding="utf-8", + ) + + urls = collect_urls_from_plugins(tmp_path) + assert len(urls) == 1 + assert urls[0]["url"] == "https://docs.sweeting.me/s/youtube-favorites" + + +def test_collect_urls_from_plugins_trims_trailing_punctuation(tmp_path): + from archivebox.plugins.hooks import collect_urls_from_plugins + + (tmp_path / "parse_html_urls").mkdir() + (tmp_path / "parse_html_urls" / "urls.jsonl").write_text( + ('{"url":"https://github.com/ArchiveBox/ArchiveBox."}\n{"url":"https://github.com/abc?abc#234234?."}\n'), + encoding="utf-8", + ) + + urls = 
collect_urls_from_plugins(tmp_path) + assert [url["url"] for url in urls] == [ + "https://github.com/ArchiveBox/ArchiveBox", + "https://github.com/abc?abc#234234", + ] + + +def test_crawl_create_stdout_pipes_into_run(initialized_archive): + """`archivebox crawl create | archivebox run` should queue and materialize snapshots.""" + url = create_test_url() + + _cmd_result = run_archivebox_cmd( + ["crawl", "create", url], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + create_stdout, create_stderr, create_code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + assert create_code == 0, create_stderr + assert_jsonl_only(create_stdout) + + crawl = next(record for record in parse_jsonl_output(create_stdout) if record.get("type") == "Crawl") + + _cmd_result = run_archivebox_cmd( + ["run"], + stdin=create_stdout, + cwd=initialized_archive, + timeout=120, + env=PIPE_TEST_ENV, + default_cli_env=True, + disable_extractors=True, + ) + run_stdout, run_stderr, run_code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + assert run_code == 0, run_stderr + assert_jsonl_only(run_stdout) + + run_records = parse_jsonl_output(run_stdout) + assert any(record.get("type") == "Crawl" and record.get("id") == crawl["id"] for record in run_records) + + with use_archivebox_db(initialized_archive): + snapshot_count = Snapshot.objects.filter(crawl_id=uuid.UUID(crawl["id"])).count() + assert isinstance(snapshot_count, int) + assert snapshot_count >= 1 + + +def test_snapshot_list_stdout_pipes_into_run(initialized_archive): + """`archivebox snapshot list | archivebox run` should requeue listed snapshots.""" + url = create_test_url() + + _cmd_result = run_archivebox_cmd( + ["snapshot", "create", url], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + create_stdout, create_stderr, create_code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + assert create_code == 0, create_stderr + 
snapshot = next(record for record in parse_jsonl_output(create_stdout) if record.get("type") == "Snapshot") + + _cmd_result = run_archivebox_cmd( + ["snapshot", "list", "--status=queued", f"--url__icontains={snapshot['id']}"], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + list_stdout, list_stderr, list_code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + if list_code != 0 or not parse_jsonl_output(list_stdout): + _cmd_result = run_archivebox_cmd( + ["snapshot", "list", f"--url__icontains={url}"], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + list_stdout, list_stderr, list_code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + assert list_code == 0, list_stderr + assert_jsonl_only(list_stdout) + + _cmd_result = run_archivebox_cmd( + ["run"], + stdin=list_stdout, + cwd=initialized_archive, + timeout=120, + env=PIPE_TEST_ENV, + default_cli_env=True, + disable_extractors=True, + ) + run_stdout, run_stderr, run_code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + assert run_code == 0, run_stderr + assert_jsonl_only(run_stdout) + + run_records = parse_jsonl_output(run_stdout) + assert any(record.get("type") == "Snapshot" and record.get("id") == snapshot["id"] for record in run_records) + + with use_archivebox_db(initialized_archive): + snapshot_status = Snapshot.objects.values_list("status", flat=True).get(pk=uuid.UUID(snapshot["id"])) + assert snapshot_status == "sealed" + + +def test_archiveresult_list_stdout_pipes_into_run(initialized_archive): + """`archivebox archiveresult list | archivebox run` should preserve clean JSONL stdout.""" + url = create_test_url() + + _cmd_result = run_archivebox_cmd( + ["snapshot", "create", url], + cwd=initialized_archive, + env=PIPE_TEST_ENV, + default_cli_env=True, + disable_extractors=True, + ) + snapshot_stdout, snapshot_stderr, snapshot_code = _cmd_result.stdout, _cmd_result.stderr, 
_cmd_result.returncode + assert snapshot_code == 0, snapshot_stderr + + _cmd_result = run_archivebox_cmd( + ["archiveresult", "create", "--plugin=favicon"], + stdin=snapshot_stdout, + cwd=initialized_archive, + env=PIPE_TEST_ENV, + default_cli_env=True, + disable_extractors=True, + ) + ar_create_stdout, ar_create_stderr, ar_create_code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + assert ar_create_code == 0, ar_create_stderr + + run_archivebox_cmd( + ["run"], + stdin=ar_create_stdout, + cwd=initialized_archive, + timeout=120, + env=PIPE_TEST_ENV, + default_cli_env=True, + disable_extractors=True, + ) + + _cmd_result = run_archivebox_cmd( + ["archiveresult", "list", "--plugin=favicon"], + cwd=initialized_archive, + env=PIPE_TEST_ENV, + default_cli_env=True, + disable_extractors=True, + ) + list_stdout, list_stderr, list_code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + assert list_code == 0, list_stderr + assert_jsonl_only(list_stdout) + listed_records = parse_jsonl_output(list_stdout) + archiveresult = next(record for record in listed_records if record.get("type") == "ArchiveResult") + + _cmd_result = run_archivebox_cmd( + ["run"], + stdin=list_stdout, + cwd=initialized_archive, + timeout=120, + env=PIPE_TEST_ENV, + default_cli_env=True, + disable_extractors=True, + ) + run_stdout, run_stderr, run_code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + assert run_code == 0, run_stderr + assert_jsonl_only(run_stdout) + + run_records = parse_jsonl_output(run_stdout) + assert any(record.get("type") == "ArchiveResult" and record.get("id") == archiveresult["id"] for record in run_records) + + +def test_binary_create_stdout_pipes_into_run(initialized_archive): + """`archivebox binary create | archivebox run` should queue the binary record for processing.""" + _cmd_result = run_archivebox_cmd( + ["binary", "create", "--name=python3", f"--abspath={sys.executable}", "--version=test"], + cwd=initialized_archive, + 
default_cli_env=True, + disable_extractors=True, + ) + create_stdout, create_stderr, create_code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + assert create_code == 0, create_stderr + assert_jsonl_only(create_stdout) + + binary = next(record for record in parse_jsonl_output(create_stdout) if record.get("type") in {"BinaryRequest", "Binary"}) + + _cmd_result = run_archivebox_cmd( + ["run"], + stdin=create_stdout, + cwd=initialized_archive, + timeout=120, + default_cli_env=True, + disable_extractors=True, + ) + run_stdout, run_stderr, run_code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + assert run_code == 0, run_stderr + assert_jsonl_only(run_stdout) + + run_records = parse_jsonl_output(run_stdout) + assert any(record.get("type") in {"BinaryRequest", "Binary"} and record.get("id") == binary["id"] for record in run_records) + + with use_archivebox_db(initialized_archive): + status = Binary.objects.values_list("status", flat=True).get(pk=uuid.UUID(binary["id"])) + assert status in {"queued", "installed"} + + +def test_multi_stage_pipeline_into_run(initialized_archive): + """`crawl create | snapshot create | archiveresult create | run` should preserve JSONL and finish work.""" + url = create_test_url() + + _cmd_result = run_archivebox_cmd( + ["crawl", "create", url], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + crawl_stdout, crawl_stderr, crawl_code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + assert crawl_code == 0, crawl_stderr + assert_jsonl_only(crawl_stdout) + + _cmd_result = run_archivebox_cmd( + ["snapshot", "create"], + stdin=crawl_stdout, + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + snapshot_stdout, snapshot_stderr, snapshot_code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + assert snapshot_code == 0, snapshot_stderr + assert_jsonl_only(snapshot_stdout) + + _cmd_result = run_archivebox_cmd( + 
["archiveresult", "create", "--plugin=favicon"], + stdin=snapshot_stdout, + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + archiveresult_stdout, archiveresult_stderr, archiveresult_code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + assert archiveresult_code == 0, archiveresult_stderr + assert_jsonl_only(archiveresult_stdout) + + _cmd_result = run_archivebox_cmd( + ["run"], + stdin=archiveresult_stdout, + cwd=initialized_archive, + timeout=120, + env=PIPE_TEST_ENV, + default_cli_env=True, + disable_extractors=True, + ) + run_stdout, run_stderr, run_code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + assert run_code == 0, run_stderr + assert_jsonl_only(run_stdout) + + run_records = parse_jsonl_output(run_stdout) + snapshot = next(record for record in run_records if record.get("type") == "Snapshot") + assert any(record.get("type") == "ArchiveResult" for record in run_records) + + with use_archivebox_db(initialized_archive): + snapshot_status = Snapshot.objects.values_list("status", flat=True).get(pk=uuid.UUID(snapshot["id"])) + assert snapshot_status == "sealed" diff --git a/archivebox/tests/test_cli_pluginmap.py b/archivebox/tests/test_cli_pluginmap.py new file mode 100644 index 0000000000..d7266ec3be --- /dev/null +++ b/archivebox/tests/test_cli_pluginmap.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 +""" +Tests for archivebox pluginmap command. + +TODO: expand beyond command discovery into quiet/event output behavior. 
+""" + +from archivebox.tests.conftest import run_archivebox_cmd + + +def test_pluginmap_help_runs_successfully(tmp_path): + """The pluginmap command should be registered and expose help.""" + + result = run_archivebox_cmd(["pluginmap", "--help"]) + + assert result.returncode == 0 + assert "pluginmap" in result.stdout.lower() + assert "--event" in result.stdout diff --git a/archivebox/tests/test_cli_process.py b/archivebox/tests/test_cli_process.py new file mode 100644 index 0000000000..1968fc50da --- /dev/null +++ b/archivebox/tests/test_cli_process.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 +""" +Tests for archivebox process command. + +TODO: expand beyond command discovery into list/filter behavior. +""" + +from archivebox.tests.conftest import run_archivebox_cmd + + +def test_process_help_runs_successfully(tmp_path): + """The process command should be registered and expose help.""" + + result = run_archivebox_cmd(["process", "--help"]) + + assert result.returncode == 0 + assert "process" in result.stdout.lower() + assert "list" in result.stdout diff --git a/archivebox/tests/test_cli_remove.py b/archivebox/tests/test_cli_remove.py new file mode 100644 index 0000000000..db02a3f6c3 --- /dev/null +++ b/archivebox/tests/test_cli_remove.py @@ -0,0 +1,247 @@ +#!/usr/bin/env python3 +""" +Comprehensive tests for archivebox remove command. +Verify remove deletes snapshots from DB and filesystem. 
+""" + +import json +from pathlib import Path + +from archivebox.tests.conftest import find_snapshot_dir, run_archivebox_cmd, run_queued_crawls, cli_env + + +def _snapshot_rows(data_dir: Path, env: dict) -> list[dict]: + script = """ +import json +from archivebox.core.models import Snapshot +print(json.dumps([ + {"id": str(snapshot.id), "url": snapshot.url} + for snapshot in Snapshot.objects.order_by("url") +])) +""" + result = run_archivebox_cmd( + ["manage", "shell", "-c", script], + cwd=data_dir, + env=env, + timeout=30, + check=True, + ) + return json.loads(result.stdout.strip().splitlines()[-1]) + + +def test_remove_deletes_snapshot_from_db(initialized_archive): + """Test that remove command deletes snapshot from database.""" + env = cli_env(disable_extractors=True) + + # Add a snapshot + run_archivebox_cmd( + ["add", "--index-only", "--depth=0", "https://example.com"], + env=env, + ) + run_queued_crawls(initialized_archive, env) + + rows = _snapshot_rows(initialized_archive, env) + assert len(rows) == 1 + snapshot_id = rows[0]["id"] + snapshot_dir = find_snapshot_dir(initialized_archive, snapshot_id) + assert snapshot_dir is not None, f"Snapshot output directory not found for {snapshot_id}" + + # Remove it + run_archivebox_cmd( + ["remove", "https://example.com", "--yes"], + env=env, + ) + + assert len(_snapshot_rows(initialized_archive, env)) == 0 + assert not snapshot_dir.exists() + + +def test_remove_deletes_archive_directory(initialized_archive): + """Test that remove --yes removes the current snapshot output directory.""" + env = cli_env(disable_extractors=True) + + # Add a snapshot + run_archivebox_cmd( + ["add", "--index-only", "--depth=0", "https://example.com"], + env=env, + ) + run_queued_crawls(initialized_archive, env) + + rows = _snapshot_rows(initialized_archive, env) + assert len(rows) == 1 + snapshot_id = rows[0]["id"] + + snapshot_dir = find_snapshot_dir(initialized_archive, snapshot_id) + assert snapshot_dir is not None, f"Snapshot output 
directory not found for {snapshot_id}" + + run_archivebox_cmd( + ["remove", "https://example.com", "--yes"], + env=env, + ) + + assert not snapshot_dir.exists() + + +def test_remove_yes_flag_skips_confirmation(initialized_archive): + """Test that --yes flag skips confirmation prompt.""" + env = cli_env(disable_extractors=True) + + run_archivebox_cmd( + ["add", "--index-only", "--depth=0", "https://example.com"], + env=env, + ) + run_queued_crawls(initialized_archive, env) + + # Remove with --yes should complete without interaction + result = run_archivebox_cmd( + ["remove", "https://example.com", "--yes"], + env=env, + timeout=30, + ) + + assert result.returncode == 0 + output = result.stdout + result.stderr + assert "Index now contains 0 links." in output + + +def test_remove_without_yes_prompts_and_keeps_snapshot(initialized_archive): + """Test that omitting --yes prompts for confirmation and keeps data when declined.""" + env = cli_env(disable_extractors=True) + + run_archivebox_cmd( + ["add", "--index-only", "--depth=0", "https://example.com"], + env=env, + check=True, + ) + run_queued_crawls(initialized_archive, env) + + rows = _snapshot_rows(initialized_archive, env) + assert len(rows) == 1 + snapshot_dir = find_snapshot_dir(initialized_archive, rows[0]["id"]) + assert snapshot_dir is not None + + result = run_archivebox_cmd( + ["remove", "https://example.com"], + input="n\n", + env=env, + timeout=30, + ) + + output = result.stdout + result.stderr + assert result.returncode == 0 + assert "Do you want to proceed" in output or "y/[n]" in output + assert len(_snapshot_rows(initialized_archive, env)) == 1 + assert snapshot_dir.exists() + + +def test_remove_multiple_snapshots(initialized_archive): + """Test removing multiple snapshots at once.""" + env = cli_env(disable_extractors=True) + + # Add multiple snapshots + for url in ["https://example.com", "https://example.org"]: + run_archivebox_cmd( + ["add", "--index-only", "--depth=0", url], + env=env, + ) + 
run_queued_crawls(initialized_archive, env) + + assert len(_snapshot_rows(initialized_archive, env)) == 2 + + # Remove both + run_archivebox_cmd( + ["remove", "https://example.com", "https://example.org", "--yes"], + env=env, + ) + + assert len(_snapshot_rows(initialized_archive, env)) == 0 + + +def test_remove_with_regex_filter_deletes_all_matches(initialized_archive): + """Test regex filters remove every matching snapshot.""" + env = cli_env(disable_extractors=True) + + for url in ["https://example.com", "https://iana.org"]: + run_archivebox_cmd( + ["add", "--index-only", "--depth=0", url], + env=env, + check=True, + ) + run_queued_crawls(initialized_archive, env) + + result = run_archivebox_cmd( + ["remove", "--filter-type=regex", ".*", "--yes"], + env=env, + check=True, + ) + + output = result.stdout + result.stderr + assert len(_snapshot_rows(initialized_archive, env)) == 0 + assert "Removed" in output or "Found" in output + + +def test_remove_nonexistent_url_fails_gracefully(initialized_archive): + """Test that removing non-existent URL fails gracefully.""" + env = cli_env(disable_extractors=True) + + result = run_archivebox_cmd( + ["remove", "https://nonexistent-url-12345.com", "--yes"], + env=env, + ) + + # Should fail or show error + stdout_text = result.stdout.lower() + assert result.returncode != 0 or "not found" in stdout_text or "no matches" in stdout_text + + +def test_remove_reports_remaining_link_count_correctly(initialized_archive): + """Test remove reports the remaining snapshot count after deletion.""" + env = cli_env(disable_extractors=True) + + for url in ["https://example.com", "https://example.org"]: + run_archivebox_cmd( + ["add", "--index-only", "--depth=0", url], + env=env, + check=True, + ) + run_queued_crawls(initialized_archive, env) + + result = run_archivebox_cmd( + ["remove", "https://example.org", "--yes"], + env=env, + check=True, + ) + + output = result.stdout + result.stderr + assert "Removed 1 out of 2 links" in output + assert 
"Index now contains 1 links." in output + + +def test_remove_after_flag(initialized_archive): + """Test remove --after flag removes snapshots after date.""" + env = cli_env(disable_extractors=True) + + run_archivebox_cmd( + ["add", "--index-only", "--depth=0", "https://example.com"], + env=env, + check=True, + ) + run_queued_crawls(initialized_archive, env) + + rows = _snapshot_rows(initialized_archive, env) + assert len(rows) == 1 + snapshot_dir = find_snapshot_dir(initialized_archive, rows[0]["id"]) + assert snapshot_dir is not None, f"Snapshot output directory not found for {rows[0]['id']}" + + result = run_archivebox_cmd( + ["remove", "--after=1577836800", "--yes"], + env=env, + timeout=30, + check=True, + ) + + output = result.stdout + result.stderr + assert "Removed 1 out of 1 links" in output + assert "Index now contains 0 links." in output + assert len(_snapshot_rows(initialized_archive, env)) == 0 + assert not snapshot_dir.exists() diff --git a/archivebox/tests/test_cli_run.py b/archivebox/tests/test_cli_run.py new file mode 100644 index 0000000000..99928adf6b --- /dev/null +++ b/archivebox/tests/test_cli_run.py @@ -0,0 +1,2561 @@ +""" +Tests for archivebox run CLI command. 
+ +Tests cover: +- run with stdin JSONL (Crawl, Snapshot, ArchiveResult) +- create-or-update behavior (records with/without id) +- pass-through output (for chaining) +""" + +import json +import os +import signal +import subprocess +import sys +import time + +import pytest + +from archivebox.tests.conftest import ( + cleanup_process_group, + cli_env, + run_archivebox_cmd, + parse_jsonl_output, + create_test_url, + create_test_crawl_json, + create_test_snapshot_json, + pid_is_alive, + wait_for_pid_to_disappear, +) + +RUN_TEST_ENV = { + "PLUGINS": "favicon", + "SAVE_FAVICON": "True", +} + + +def _install_real_chrome_for_test(data_dir, env, *, isolation): + env["CHROME_ISOLATION"] = isolation + env["CHROME_HEADLESS"] = "true" + env["CHROME_SANDBOX"] = "false" + install_process = run_archivebox_cmd( + ["install", "chrome"], + cwd=data_dir, + env=env, + timeout=600, + ) + assert install_process.returncode == 0, install_process.stderr or install_process.stdout + + +@pytest.mark.django_db(transaction=True) +@pytest.mark.timeout(90) +def test_cli_run_signal_cleans_background_hook_process_group(initialized_archive): + + plugins_root = initialized_archive / "runtime_plugins" + plugin_dir = plugins_root / "cancel_group" + plugin_dir.mkdir(parents=True) + daemon_hook = plugin_dir / "on_CrawlSetup__10_daemon.daemon.bg.sh" + foreground_hook = plugin_dir / "on_CrawlSetup__20_foreground.sh" + daemon_hook.write_text( + "\n".join( + [ + "#!/usr/bin/env bash", + "set -euo pipefail", + 'test_dir="${LEAK_TEST_DIR:?}"', + "sleep 600 &", + 'echo $$ > "$test_dir/daemon.pid"', + 'echo $! 
> "$test_dir/daemon-child.pid"', + 'echo ready > "$test_dir/daemon.ready"', + "trap 'echo cleaned > \"$test_dir/daemon.cleaned\"; exit 0' TERM INT", + "wait", + "", + ], + ), + ) + foreground_hook.write_text( + "\n".join( + [ + "#!/usr/bin/env bash", + "set -euo pipefail", + 'test_dir="${LEAK_TEST_DIR:?}"', + 'echo $$ > "$test_dir/foreground.pid"', + 'echo ready > "$test_dir/foreground.ready"', + "trap 'echo cleaned > \"$test_dir/foreground.cleaned\"; exit 0' TERM INT", + "while true; do sleep 1; done", + "", + ], + ), + ) + daemon_hook.chmod(0o755) + foreground_hook.chmod(0o755) + + leak_test_dir = initialized_archive / "leak-check" + leak_test_dir.mkdir() + env = os.environ.copy() + env.update( + { + "ABX_PLUGINS_DIR": str(plugins_root), + "LEAK_TEST_DIR": str(leak_test_dir), + "PLUGINS": "cancel_group", + "TIMEOUT": "30", + "USE_COLOR": "false", + "SHOW_PROGRESS": "false", + }, + ) + + _cmd_result = run_archivebox_cmd( + ["crawl", "create", "https://example.com"], + cwd=initialized_archive, + env=env, + timeout=60, + ) + stdout, stderr, returncode = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + assert returncode == 0, stderr or stdout + crawl_records = [json.loads(line) for line in stdout.splitlines() if line.strip().startswith("{")] + crawl_id = next(record["id"] for record in crawl_records if record.get("type") == "Crawl") + + daemon_pid: int | None = None + daemon_child_pid: int | None = None + foreground_pid: int | None = None + run_process = run_archivebox_cmd( + ["run", f"--crawl-id={crawl_id}"], + cwd=initialized_archive, + env=env, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + start_new_session=True, + wait=False, + ) + try: + deadline = time.time() + 20 + while time.time() < deadline: + if (leak_test_dir / "daemon.ready").exists() and (leak_test_dir / "foreground.ready").exists(): + break + if run_process.poll() is not None: + output = run_process.communicate(timeout=1)[0] + raise AssertionError(f"archivebox run exited 
before hooks were ready:\n{output}") + time.sleep(0.05) + assert (leak_test_dir / "daemon.ready").exists() + assert (leak_test_dir / "foreground.ready").exists() + + daemon_pid = int((leak_test_dir / "daemon.pid").read_text().strip()) + daemon_child_pid = int((leak_test_dir / "daemon-child.pid").read_text().strip()) + foreground_pid = int((leak_test_dir / "foreground.pid").read_text().strip()) + assert pid_is_alive(daemon_pid) + assert pid_is_alive(daemon_child_pid) + assert pid_is_alive(foreground_pid) + + run_process.send_signal(signal.SIGTERM) + output = run_process.communicate(timeout=20)[0] + assert "Runner error" not in output + + wait_for_pid_to_disappear(daemon_pid, timeout=5) + wait_for_pid_to_disappear(daemon_child_pid, timeout=5) + wait_for_pid_to_disappear(foreground_pid, timeout=5) + assert (leak_test_dir / "daemon.cleaned").read_text().strip() == "cleaned" + assert (leak_test_dir / "foreground.cleaned").read_text().strip() == "cleaned" + finally: + if run_process.poll() is None: + try: + os.killpg(run_process.pid, signal.SIGKILL) + except ProcessLookupError: + pass + run_process.communicate(timeout=5) + cleanup_process_group(daemon_pid, daemon_child_pid) + cleanup_process_group(foreground_pid) + + +class TestRunWithCrawl: + """Tests for `archivebox run` with Crawl input.""" + + def test_run_with_new_crawl(self, initialized_archive): + """Run creates and processes a new Crawl (no id).""" + crawl_record = create_test_crawl_json() + + _cmd_result = run_archivebox_cmd( + ["run"], + stdin=json.dumps(crawl_record), + cwd=initialized_archive, + timeout=120, + env=RUN_TEST_ENV, + default_cli_env=True, + disable_extractors=True, + ) + stdout, stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0, f"Command failed: {stderr}" + + # Should output the created Crawl + records = parse_jsonl_output(stdout) + crawl_records = [r for r in records if r.get("type") == "Crawl"] + assert len(crawl_records) >= 1 + assert 
crawl_records[0].get("id") # Should have an id now + + def test_run_with_existing_crawl(self, initialized_archive): + """Run re-queues an existing Crawl (with id).""" + url = create_test_url() + + # First create a crawl + _cmd_result = run_archivebox_cmd( + ["crawl", "create", url], + cwd=initialized_archive, + env=RUN_TEST_ENV, + default_cli_env=True, + disable_extractors=True, + ) + stdout1, _, _ = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + crawl = parse_jsonl_output(stdout1)[0] + + # Run with the existing crawl + _cmd_result = run_archivebox_cmd( + ["run"], + stdin=json.dumps(crawl), + cwd=initialized_archive, + timeout=120, + env=RUN_TEST_ENV, + default_cli_env=True, + disable_extractors=True, + ) + stdout2, _stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0 + records = parse_jsonl_output(stdout2) + assert len(records) >= 1 + + +class TestRunWithSnapshot: + """Tests for `archivebox run` with Snapshot input.""" + + def test_run_with_new_snapshot(self, initialized_archive): + """Run creates and processes a new Snapshot (no id, just url).""" + snapshot_record = create_test_snapshot_json() + + _cmd_result = run_archivebox_cmd( + ["run"], + stdin=json.dumps(snapshot_record), + cwd=initialized_archive, + timeout=120, + env=RUN_TEST_ENV, + default_cli_env=True, + disable_extractors=True, + ) + stdout, stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0, f"Command failed: {stderr}" + + records = parse_jsonl_output(stdout) + snapshot_records = [r for r in records if r.get("type") == "Snapshot"] + assert len(snapshot_records) >= 1 + assert snapshot_records[0].get("id") + + def test_run_with_existing_snapshot(self, initialized_archive): + """Run re-queues an existing Snapshot (with id).""" + url = create_test_url() + + # First create a snapshot + _cmd_result = run_archivebox_cmd( + ["snapshot", "create", url], + cwd=initialized_archive, + env=RUN_TEST_ENV, 
+ default_cli_env=True, + disable_extractors=True, + ) + stdout1, _, _ = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + snapshot = parse_jsonl_output(stdout1)[0] + + # Run with the existing snapshot + _cmd_result = run_archivebox_cmd( + ["run"], + stdin=json.dumps(snapshot), + cwd=initialized_archive, + timeout=120, + env=RUN_TEST_ENV, + default_cli_env=True, + disable_extractors=True, + ) + stdout2, _stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0 + records = parse_jsonl_output(stdout2) + assert len(records) >= 1 + + def test_run_with_plain_url(self, initialized_archive): + """Run accepts plain URL records (no type field).""" + url = create_test_url() + url_record = {"url": url} + + _cmd_result = run_archivebox_cmd( + ["run"], + stdin=json.dumps(url_record), + cwd=initialized_archive, + timeout=120, + env=RUN_TEST_ENV, + default_cli_env=True, + disable_extractors=True, + ) + stdout, _stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0 + records = parse_jsonl_output(stdout) + assert len(records) >= 1 + + +class TestRunWithArchiveResult: + """Tests for `archivebox run` with ArchiveResult input.""" + + def test_run_requeues_failed_archiveresult(self, initialized_archive): + """Run re-queues a failed ArchiveResult.""" + url = create_test_url() + + # Create snapshot and archive result + _cmd_result = run_archivebox_cmd( + ["snapshot", "create", url], + cwd=initialized_archive, + env=RUN_TEST_ENV, + default_cli_env=True, + disable_extractors=True, + ) + stdout1, _, _ = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + snapshot = parse_jsonl_output(stdout1)[0] + + _cmd_result = run_archivebox_cmd( + ["archiveresult", "create", "--plugin=favicon"], + stdin=json.dumps(snapshot), + cwd=initialized_archive, + env=RUN_TEST_ENV, + default_cli_env=True, + disable_extractors=True, + ) + stdout2, _, _ = _cmd_result.stdout, _cmd_result.stderr, 
_cmd_result.returncode + ar = next(r for r in parse_jsonl_output(stdout2) if r.get("type") == "ArchiveResult") + + # Update to failed + ar["status"] = "failed" + run_archivebox_cmd( + ["archiveresult", "update", "--status=failed"], + stdin=json.dumps(ar), + cwd=initialized_archive, + env=RUN_TEST_ENV, + default_cli_env=True, + disable_extractors=True, + ) + + # Now run should re-queue it + _cmd_result = run_archivebox_cmd( + ["run"], + stdin=json.dumps(ar), + cwd=initialized_archive, + timeout=120, + env=RUN_TEST_ENV, + default_cli_env=True, + disable_extractors=True, + ) + stdout3, _stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0 + records = parse_jsonl_output(stdout3) + ar_records = [r for r in records if r.get("type") == "ArchiveResult"] + assert len(ar_records) >= 1 + + +@pytest.mark.django_db(transaction=True) +class TestRunRecovery: + def test_run_maintenance_logs_unfinished_crawl_repair(self, initialized_archive): + from datetime import timedelta + + from django.utils import timezone + + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.tests.test_orm_helpers import use_archivebox_db + + old = timezone.now() - timedelta(hours=13) + with use_archivebox_db(initialized_archive): + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.STARTED, + retry_at=None, + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.QUEUED, + retry_at=None, + ) + Crawl.objects.filter(id=crawl.id).update(modified_at=old, retry_at=None) + Snapshot.objects.filter(id=snapshot.id).update(modified_at=old, retry_at=None) + crawl_id = crawl.id + snapshot_id = snapshot.id + + _cmd_result = run_archivebox_cmd( + ["run", "--maintenance-only"], + cwd=initialized_archive, 
+ timeout=90, + env=RUN_TEST_ENV, + default_cli_env=True, + disable_extractors=True, + ) + stdout, stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0, stdout + stderr + assert "Repairing" in stderr + assert "Resuming 1 Crawl(s) with pending URLs ready to archive" in stderr + assert "interrupted before" in stderr + assert "remaining URLs" in stderr + + with use_archivebox_db(initialized_archive): + crawl = Crawl.objects.get(id=crawl_id) + snapshot = Snapshot.objects.get(id=snapshot_id) + assert crawl.status == Crawl.StatusChoices.STARTED + assert crawl.retry_at is not None + assert snapshot.status == Snapshot.StatusChoices.QUEUED + assert snapshot.retry_at is not None + + +class TestRunPassThrough: + """Tests for pass-through behavior in `archivebox run`.""" + + def test_run_passes_through_unknown_types(self, initialized_archive): + """Run passes through records with unknown types.""" + unknown_record = {"type": "Unknown", "id": "fake-id", "data": "test"} + + _cmd_result = run_archivebox_cmd( + ["run"], + stdin=json.dumps(unknown_record), + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout, _stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0 + records = parse_jsonl_output(stdout) + unknown_records = [r for r in records if r.get("type") == "Unknown"] + assert len(unknown_records) == 1 + assert unknown_records[0]["data"] == "test" + + def test_run_outputs_all_processed_records(self, initialized_archive): + """Run outputs all processed records for chaining.""" + url = create_test_url() + crawl_record = create_test_crawl_json(urls=[url]) + + _cmd_result = run_archivebox_cmd( + ["run"], + stdin=json.dumps(crawl_record), + cwd=initialized_archive, + timeout=120, + env=RUN_TEST_ENV, + default_cli_env=True, + disable_extractors=True, + ) + stdout, _stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code 
== 0 + records = parse_jsonl_output(stdout) + # Should have at least the Crawl in output + assert len(records) >= 1 + + +class TestRunMixedInput: + """Tests for `archivebox run` with mixed record types.""" + + def test_run_handles_mixed_types(self, initialized_archive): + """Run handles mixed Crawl/Snapshot/ArchiveResult input.""" + crawl = create_test_crawl_json() + snapshot = create_test_snapshot_json() + unknown = {"type": "Tag", "id": "fake", "name": "test"} + + stdin = "\n".join( + [ + json.dumps(crawl), + json.dumps(snapshot), + json.dumps(unknown), + ], + ) + + _cmd_result = run_archivebox_cmd( + ["run"], + stdin=stdin, + cwd=initialized_archive, + timeout=120, + env=RUN_TEST_ENV, + default_cli_env=True, + disable_extractors=True, + ) + stdout, _stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0 + records = parse_jsonl_output(stdout) + + types = {r.get("type") for r in records} + # Should have processed Crawl and Snapshot, passed through Tag + assert "Crawl" in types or "Snapshot" in types or "Tag" in types + + +class TestRunEmpty: + """Tests for `archivebox run` edge cases.""" + + def test_run_empty_stdin(self, initialized_archive): + """Run with empty stdin returns success.""" + _cmd_result = run_archivebox_cmd( + ["run"], + stdin="", + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + _stdout, _stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0 + + def test_run_no_records_to_process(self, initialized_archive): + """Run with only pass-through records shows message.""" + unknown = {"type": "Unknown", "id": "fake"} + + _cmd_result = run_archivebox_cmd( + ["run"], + stdin=json.dumps(unknown), + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + _stdout, stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0 + assert "No records to process" in stderr + + +class 
TestRunDaemonMode: + @pytest.mark.parametrize("stdin_kind", ["malformed", "valid-snapshot"]) + def test_run_daemon_ignores_piped_stdin_and_starts_real_runner(self, initialized_archive, db, stdin_kind): + from archivebox.machine.models import Process + from archivebox.core.models import Snapshot + from archivebox.tests.test_orm_helpers import use_archivebox_db + + snapshot_url = None + if stdin_kind == "valid-snapshot": + snapshot_url = create_test_url() + piped_stdin = json.dumps(create_test_snapshot_json(url=snapshot_url)) + "\n" + else: + piped_stdin = "{this is not jsonl}\n" + + env = cli_env() + proc = run_archivebox_cmd( + ["run", "--daemon"], + cwd=initialized_archive, + env=env, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + start_new_session=True, + wait=False, + ) + assert proc.stdin is not None + assert proc.stdout is not None + assert proc.stderr is not None + + try: + proc.stdin.write(piped_stdin) + proc.stdin.close() + + deadline = time.monotonic() + 20 + started = False + while time.monotonic() < deadline: + if proc.poll() is not None: + stdout = proc.stdout.read() + stderr = proc.stderr.read() + pytest.fail(f"daemon exited before starting runner: code={proc.returncode}\nstdout={stdout}\nstderr={stderr}") + with use_archivebox_db(initialized_archive): + started = Process.objects.filter( + process_type=Process.TypeChoices.ORCHESTRATOR, + status=Process.StatusChoices.RUNNING, + pid=proc.pid, + ).exists() + if started: + break + time.sleep(0.25) + + assert started is True + if snapshot_url is not None: + with use_archivebox_db(initialized_archive): + assert not Snapshot.objects.filter(url=snapshot_url).exists() + finally: + if proc.poll() is None: + os.killpg(proc.pid, signal.SIGTERM) + try: + proc.wait(timeout=15) + except subprocess.TimeoutExpired: + os.killpg(proc.pid, signal.SIGKILL) + proc.wait(timeout=5) + + stdout = proc.stdout.read() + stderr = proc.stderr.read() + assert proc.returncode == 143, stdout + stderr + 
assert "No records to process" not in stderr + + def test_run_daemon_takeover_has_single_active_runner_gate(self, initialized_archive, db): + from archivebox.machine.models import Process + from archivebox.core.takeover_util import RUNNER_ACTIVE_WORKER_TYPE + from archivebox.tests.test_orm_helpers import use_archivebox_db + + env = cli_env() + + def active_runners(): + with use_archivebox_db(initialized_archive): + return [ + proc + for proc in Process.objects.filter( + process_type=Process.TypeChoices.ORCHESTRATOR, + worker_type=RUNNER_ACTIVE_WORKER_TYPE, + status=Process.StatusChoices.RUNNING, + pwd=str(initialized_archive), + ) + if proc.is_running + ] + + def wait_for_stable_single_active(*, timeout: float, stable_seconds: float = 1.0, exclude_pid: int | None = None): + deadline = time.monotonic() + timeout + stable_pid = None + stable_since = None + while time.monotonic() < deadline: + active = active_runners() + assert len(active) <= 1 + if len(active) == 1 and active[0].pid != exclude_pid: + pid = active[0].pid + if pid != stable_pid: + stable_pid = pid + stable_since = time.monotonic() + elif stable_since is not None and time.monotonic() - stable_since >= stable_seconds: + return pid + else: + stable_pid = None + stable_since = None + time.sleep(0.25) + return None + + procs = [ + run_archivebox_cmd( + ["run", "--daemon"], + cwd=initialized_archive, + env=env, + stdin=subprocess.DEVNULL, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + start_new_session=True, + wait=False, + ) + for _ in range(2) + ] + try: + active_pid = wait_for_stable_single_active(timeout=30) + assert active_pid is not None + + os.killpg(active_pid, signal.SIGKILL) + replacement = run_archivebox_cmd( + ["run", "--daemon"], + cwd=initialized_archive, + env=env, + stdin=subprocess.DEVNULL, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + start_new_session=True, + wait=False, + ) + procs.append(replacement) + recovered_pid = wait_for_stable_single_active(timeout=30, 
exclude_pid=active_pid) + assert recovered_pid is not None + finally: + for proc in procs: + if proc.poll() is None: + os.killpg(proc.pid, signal.SIGTERM) + for proc in procs: + try: + proc.wait(timeout=15) + except subprocess.TimeoutExpired: + os.killpg(proc.pid, signal.SIGKILL) + proc.wait(timeout=5) + + +@pytest.mark.django_db +class TestRecoverOrchestratorState: + def test_recover_orchestrator_state_unlocks_started_crawl_with_pending_snapshot(self): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.core.recovery_util import recover_orchestrator_state + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.STARTED, + retry_at=None, + ) + Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.QUEUED, + retry_at=None, + ) + + recovered = recover_orchestrator_state() + + crawl.refresh_from_db() + assert recovered["crawls_started_with_due_snapshots"] == 1 + assert crawl.status == Crawl.StatusChoices.STARTED + assert crawl.retry_at is not None + + def test_recover_orchestrator_state_unlocks_started_crawl_with_finished_snapshots_for_runner(self): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.core.recovery_util import recover_orchestrator_state + from archivebox.services.runner import run_due_crawl + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.STARTED, + retry_at=None, + ) + Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.SEALED, + retry_at=None, + ) + + recovered = recover_orchestrator_state() + + crawl.refresh_from_db() + assert 
"sealed_crawls" not in recovered + assert recovered["crawls_started_without_active_snapshots"] == 1 + assert crawl.status == Crawl.StatusChoices.STARTED + assert crawl.retry_at is not None + + assert run_due_crawl(crawl, lock_seconds=60) is True + crawl.refresh_from_db() + + assert crawl.status == Crawl.StatusChoices.SEALED + assert crawl.retry_at is None + + def test_recover_orchestrator_state_repairs_retry_at_status_invariants(self): + from django.utils import timezone + + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.core.recovery_util import recover_orchestrator_state + + user_id = get_or_create_system_user_pk() + queued_crawl = Crawl.objects.create( + urls="https://example.com/queued-crawl", + created_by_id=user_id, + status=Crawl.StatusChoices.QUEUED, + retry_at=None, + ) + sealed_crawl = Crawl.objects.create( + urls="https://example.com/sealed-crawl", + created_by_id=user_id, + status=Crawl.StatusChoices.SEALED, + retry_at=timezone.now(), + ) + queued_snapshot = Snapshot.objects.create( + url="https://example.com/queued-snapshot", + crawl=queued_crawl, + status=Snapshot.StatusChoices.QUEUED, + retry_at=None, + ) + sealed_snapshot = Snapshot.objects.create( + url="https://example.com/sealed-snapshot", + crawl=sealed_crawl, + status=Snapshot.StatusChoices.SEALED, + retry_at=timezone.now(), + ) + + recovered = recover_orchestrator_state() + + queued_crawl.refresh_from_db() + sealed_crawl.refresh_from_db() + queued_snapshot.refresh_from_db() + sealed_snapshot.refresh_from_db() + + assert recovered["crawls_queued_without_retry_at"] == 1 + assert recovered["snapshots_queued_without_retry_at"] == 1 + assert queued_crawl.status == Crawl.StatusChoices.QUEUED + assert queued_crawl.retry_at is not None + assert sealed_crawl.status == Crawl.StatusChoices.SEALED + assert sealed_crawl.retry_at is not None + assert queued_snapshot.status == 
Snapshot.StatusChoices.QUEUED + assert queued_snapshot.retry_at is not None + assert sealed_snapshot.status == Snapshot.StatusChoices.SEALED + assert sealed_snapshot.retry_at is not None + + def test_recover_orchestrator_state_requeues_backoff_archiveresults(self): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import ArchiveResult, Snapshot + from archivebox.core.recovery_util import recover_orchestrator_state + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.SEALED, + retry_at=None, + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.SEALED, + retry_at=None, + ) + result = ArchiveResult.objects.create( + snapshot=snapshot, + plugin="search_backend_sqlite", + hook_name="on_Snapshot__90_index_sqlite", + status=ArchiveResult.StatusChoices.BACKOFF, + ) + + recovered = recover_orchestrator_state() + + result.refresh_from_db() + snapshot.refresh_from_db() + crawl.refresh_from_db() + + assert recovered["archiveresults_backoff"] == 1 + assert result.status == ArchiveResult.StatusChoices.QUEUED + assert snapshot.status == Snapshot.StatusChoices.SEALED + assert snapshot.retry_at is not None + assert crawl.status == Crawl.StatusChoices.SEALED + + def test_recover_orchestrator_state_leaves_due_queued_snapshot_for_runner_even_with_final_results(self): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import ArchiveResult, Snapshot + from archivebox.core.recovery_util import recover_orchestrator_state + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.QUEUED, + retry_at=None, + ) + snapshot = Snapshot.objects.create( + 
url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.QUEUED, + retry_at=None, + ) + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="title", + hook_name="on_Snapshot__01_title", + status=ArchiveResult.StatusChoices.SUCCEEDED, + ) + + recovered = recover_orchestrator_state() + + snapshot.refresh_from_db() + crawl.refresh_from_db() + + assert "sealed_queued_snapshots" not in recovered + assert "sealed_queued_crawls" not in recovered + assert snapshot.status == Snapshot.StatusChoices.QUEUED + assert snapshot.retry_at is not None + assert snapshot.downloaded_at is None + assert crawl.status == Crawl.StatusChoices.QUEUED + assert crawl.retry_at is not None + + def test_recover_orchestrator_state_leaves_stale_queued_final_rows_for_runner(self): + from datetime import timedelta + + from django.utils import timezone + + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import ArchiveResult, Snapshot + from archivebox.core.recovery_util import recover_orchestrator_state + from archivebox.services.runner import run_due_crawl, run_due_snapshot + + old = timezone.now() - timedelta(hours=13) + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.QUEUED, + retry_at=old, + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.QUEUED, + retry_at=old, + ) + result = ArchiveResult.objects.create( + snapshot=snapshot, + plugin="title", + hook_name="on_Snapshot__01_title", + status=ArchiveResult.StatusChoices.SUCCEEDED, + ) + Crawl.objects.filter(pk=crawl.pk).update(modified_at=old) + Snapshot.objects.filter(pk=snapshot.pk).update(modified_at=old) + ArchiveResult.objects.filter(pk=result.pk).update(modified_at=old) + + recovered = recover_orchestrator_state() + + snapshot.refresh_from_db() + crawl.refresh_from_db() 
+ + assert "sealed_queued_snapshots" not in recovered + assert "sealed_queued_crawls" not in recovered + assert snapshot.status == Snapshot.StatusChoices.QUEUED + assert snapshot.retry_at == old + assert snapshot.downloaded_at is None + assert crawl.status == Crawl.StatusChoices.QUEUED + assert crawl.retry_at == old + + assert run_due_snapshot(snapshot, lock_seconds=60) is True + snapshot.refresh_from_db() + crawl.refresh_from_db() + + assert snapshot.status == Snapshot.StatusChoices.SEALED + assert snapshot.retry_at is None + assert crawl.status == Crawl.StatusChoices.QUEUED + assert crawl.retry_at == old + + assert run_due_crawl(crawl, lock_seconds=60) is True + crawl.refresh_from_db() + + assert crawl.status == Crawl.StatusChoices.SEALED + assert crawl.retry_at is None + + def test_run_due_snapshot_seals_queued_snapshot_with_final_results(self): + from django.utils import timezone + + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import ArchiveResult, Snapshot + from archivebox.services.runner import run_due_snapshot + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.STARTED, + retry_at=timezone.now(), + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.QUEUED, + retry_at=timezone.now(), + ) + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="title", + hook_name="on_Snapshot__01_title", + status=ArchiveResult.StatusChoices.SUCCEEDED, + ) + + assert run_due_snapshot(snapshot, lock_seconds=60) is True + + snapshot.refresh_from_db() + assert snapshot.status == Snapshot.StatusChoices.SEALED + assert snapshot.retry_at is None + assert snapshot.downloaded_at is None + + def test_create_pending_archiveresults_uses_canonical_hook_names(self): + from django.utils import timezone + + from 
archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import ArchiveResult, Snapshot + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.STARTED, + retry_at=timezone.now(), + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.QUEUED, + retry_at=timezone.now(), + ) + + snapshot.create_pending_archiveresults() + + hook_names = list(ArchiveResult.objects.filter(snapshot=snapshot).values_list("hook_name", flat=True)) + assert hook_names + assert all(not hook_name.endswith((".py", ".js", ".sh")) for hook_name in hook_names) + + def test_run_due_snapshot_pauses_child_when_parent_is_paused(self): + from django.utils import timezone + + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import ArchiveResult, Snapshot + from archivebox.services.runner import run_due_snapshot + from archivebox.workers.models import RETRY_AT_MAX + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.PAUSED, + retry_at=RETRY_AT_MAX, + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.QUEUED, + retry_at=timezone.now(), + ) + result = ArchiveResult.objects.create( + snapshot=snapshot, + plugin="title", + hook_name="on_Snapshot__01_title", + status=ArchiveResult.StatusChoices.QUEUED, + ) + + assert run_due_snapshot(snapshot, lock_seconds=60) is True + + snapshot.refresh_from_db() + result.refresh_from_db() + assert snapshot.status == Snapshot.StatusChoices.PAUSED + assert snapshot.retry_at == RETRY_AT_MAX + assert result.status == ArchiveResult.StatusChoices.PAUSED + assert snapshot.archiveresult_set.count() == 1 + + 
def test_parent_status_transitions_schedule_children_to_follow_parent_status(self): + from django.utils import timezone + + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import ArchiveResult, Snapshot + from archivebox.services.runner import run_due_snapshot + from archivebox.workers.models import RETRY_AT_MAX + + paused_crawl = Crawl.objects.create( + urls="https://example.com/paused", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.STARTED, + retry_at=timezone.now(), + ) + paused_child = Snapshot.objects.create( + url="https://example.com/paused", + crawl=paused_crawl, + status=Snapshot.StatusChoices.STARTED, + retry_at=timezone.now(), + ) + paused_result = ArchiveResult.objects.create( + snapshot=paused_child, + plugin="title", + hook_name="on_Snapshot__01_title", + status=ArchiveResult.StatusChoices.QUEUED, + ) + paused_crawl.pause() + + sealed_crawl = Crawl.objects.create( + urls="https://example.com/sealed", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.STARTED, + retry_at=timezone.now(), + ) + sealed_child = Snapshot.objects.create( + url="https://example.com/sealed", + crawl=sealed_crawl, + status=Snapshot.StatusChoices.PAUSED, + retry_at=RETRY_AT_MAX, + ) + sealed_started_child = Snapshot.objects.create( + url="https://example.com/sealed-started", + crawl=sealed_crawl, + status=Snapshot.StatusChoices.STARTED, + retry_at=timezone.now(), + ) + sealed_crawl.cancel() + + paused_child.refresh_from_db() + sealed_child.refresh_from_db() + sealed_started_child.refresh_from_db() + assert paused_child.status == Snapshot.StatusChoices.STARTED + assert paused_child.retry_at is not None + assert paused_child.retry_at <= timezone.now() + assert sealed_child.status == Snapshot.StatusChoices.PAUSED + assert sealed_child.retry_at is not None + assert sealed_child.retry_at <= timezone.now() + assert 
sealed_started_child.status == Snapshot.StatusChoices.STARTED + assert sealed_started_child.retry_at is not None + assert sealed_started_child.retry_at <= timezone.now() + + assert run_due_snapshot(paused_child, lock_seconds=60) is True + paused_child.refresh_from_db() + paused_result.refresh_from_db() + assert paused_child.status == Snapshot.StatusChoices.PAUSED + assert paused_child.retry_at == RETRY_AT_MAX + assert paused_result.status == ArchiveResult.StatusChoices.PAUSED + + assert run_due_snapshot(sealed_child, lock_seconds=60) is True + sealed_child.refresh_from_db() + assert sealed_child.status == Snapshot.StatusChoices.SEALED + assert sealed_child.retry_at is None + + assert run_due_snapshot(sealed_started_child, lock_seconds=60) is True + sealed_started_child.refresh_from_db() + assert sealed_started_child.status == Snapshot.StatusChoices.SEALED + assert sealed_started_child.retry_at is None + + def test_recover_orchestrator_state_leaves_due_active_crawl_for_runner(self): + from datetime import timedelta + + from django.utils import timezone + + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.recovery_util import recover_orchestrator_state + + old = timezone.now() - timedelta(hours=13) + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.QUEUED, + retry_at=old, + ) + Crawl.objects.filter(id=crawl.id).update(modified_at=old, retry_at=old) + + recovered = recover_orchestrator_state() + + crawl.refresh_from_db() + assert "stale_active_crawls_unlocked" not in recovered + assert crawl.status == Crawl.StatusChoices.QUEUED + assert crawl.retry_at == old + + def test_recover_orchestrator_state_unlocks_started_snapshot_without_running_result(self): + from datetime import timedelta + + from django.utils import timezone + + from archivebox.base_models.models import 
get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.core.recovery_util import recover_orchestrator_state + + future = timezone.now() + timedelta(seconds=45) + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.SEALED, + retry_at=None, + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.STARTED, + retry_at=future, + ) + + recovered = recover_orchestrator_state() + + snapshot.refresh_from_db() + crawl.refresh_from_db() + + assert recovered["snapshots_started_without_running_results"] == 1 + assert "snapshots_active_under_sealed_crawls" not in recovered + assert snapshot.status == Snapshot.StatusChoices.STARTED + assert snapshot.retry_at is not None + assert snapshot.retry_at < future + assert crawl.status == Crawl.StatusChoices.SEALED + assert crawl.retry_at is None + + def test_recover_orchestrator_state_unlocks_future_started_crawl_and_snapshot_after_owner_dies(self): + from datetime import timedelta + + from django.utils import timezone + + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.core.recovery_util import recover_orchestrator_state + + future = timezone.now() + timedelta(seconds=45) + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.STARTED, + retry_at=future, + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.STARTED, + retry_at=future, + ) + + recovered = recover_orchestrator_state() + + crawl.refresh_from_db() + snapshot.refresh_from_db() + + assert recovered["snapshots_started_without_running_results"] == 1 + assert 
recovered["crawls_started_with_due_snapshots"] == 1 + assert crawl.status == Crawl.StatusChoices.STARTED + assert snapshot.status == Snapshot.StatusChoices.STARTED + assert crawl.retry_at is not None + assert snapshot.retry_at is not None + assert crawl.retry_at < future + assert snapshot.retry_at < future + + def test_recover_orchestrator_state_preserves_future_started_snapshot_with_live_result_process(self): + from datetime import timedelta + + from django.utils import timezone + + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import ArchiveResult, Snapshot + from archivebox.machine.models import Machine, NetworkInterface, Process + from archivebox.core.recovery_util import recover_orchestrator_state + + worker = subprocess.Popen( + [sys.executable, "-c", "import time; time.sleep(60)"], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + text=True, + start_new_session=True, + ) + try: + future = timezone.now() + timedelta(seconds=45) + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.STARTED, + retry_at=future, + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.STARTED, + retry_at=future, + ) + process = Process.objects.create( + machine=Machine.current(refresh=True), + iface=NetworkInterface.current(refresh=True), + process_type=Process.TypeChoices.HOOK, + worker_type="archiveresult", + pwd=str(snapshot.output_dir / "title"), + cmd=[], + status=Process.StatusChoices.RUNNING, + retry_at=None, + pid=worker.pid, + started_at=timezone.now(), + timeout=120, + ) + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="title", + hook_name="on_Snapshot__01_title", + status=ArchiveResult.StatusChoices.STARTED, + process=process, + ) + + recovered = recover_orchestrator_state() + + snapshot.refresh_from_db() 
+ assert recovered["snapshots_started_without_running_results"] == 0 + assert snapshot.status == Snapshot.StatusChoices.STARTED + assert snapshot.retry_at == future + finally: + if worker.poll() is None: + os.killpg(worker.pid, signal.SIGTERM) + try: + worker.wait(timeout=5) + except subprocess.TimeoutExpired: + os.killpg(worker.pid, signal.SIGKILL) + worker.wait(timeout=5) + + def test_recover_orchestrator_state_does_not_resume_paused_rows_with_max_retry_at(self): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.core.recovery_util import recover_orchestrator_state + from archivebox.workers.models import RETRY_AT_MAX + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.PAUSED, + retry_at=RETRY_AT_MAX, + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.PAUSED, + retry_at=RETRY_AT_MAX, + ) + + recovered = recover_orchestrator_state() + + crawl.refresh_from_db() + snapshot.refresh_from_db() + + assert recovered["crawls_started_with_due_snapshots"] == 0 + assert recovered["crawls_started_waiting_on_future_snapshots"] == 0 + assert recovered["crawls_started_without_active_snapshots"] == 0 + assert recovered["snapshots_started_without_running_results"] == 0 + assert crawl.status == Crawl.StatusChoices.PAUSED + assert snapshot.status == Snapshot.StatusChoices.PAUSED + assert crawl.retry_at == RETRY_AT_MAX + assert snapshot.retry_at == RETRY_AT_MAX + + def test_recover_orchestrator_state_does_not_wake_sealed_snapshot_maintenance_rows(self): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import ArchiveResult, Snapshot + from archivebox.core.recovery_util import recover_orchestrator_state + + 
crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.SEALED, + retry_at=None, + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.SEALED, + retry_at=None, + ) + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="singlefile", + hook_name="on_Snapshot__50_singlefile.py", + status=ArchiveResult.StatusChoices.QUEUED, + ) + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="search_backend_sonic", + hook_name="on_Snapshot__91_index_sonic", + status=ArchiveResult.StatusChoices.QUEUED, + ) + + recovered = recover_orchestrator_state() + + snapshot.refresh_from_db() + crawl.refresh_from_db() + + assert "snapshots_sealed_with_queued_results" not in recovered + assert snapshot.status == Snapshot.StatusChoices.SEALED + assert snapshot.retry_at is None + assert crawl.status == Crawl.StatusChoices.SEALED + assert crawl.retry_at is None + + def test_run_due_snapshot_finalizes_completed_upload_result_left_queued(self): + from django.utils import timezone + + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import ArchiveResult, Snapshot + from archivebox.services.runner import run_due_snapshot + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.SEALED, + retry_at=None, + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.SEALED, + retry_at=timezone.now(), + ) + result = ArchiveResult.objects.create( + snapshot=snapshot, + plugin="dom", + hook_name="on_Snapshot__archivebox_browser_extension_upload", + status=ArchiveResult.StatusChoices.QUEUED, + output_str="output.html", + output_files={"output.html": {"extension": "html", "mimetype": "text/html", "size": 42}}, + 
output_size=42, + ) + + assert run_due_snapshot(snapshot, lock_seconds=60) is True + + result.refresh_from_db() + snapshot.refresh_from_db() + assert result.status == ArchiveResult.StatusChoices.SUCCEEDED + assert snapshot.retry_at is None + + @pytest.mark.django_db(transaction=True) + def test_run_due_snapshot_runs_queued_plugin_after_fs_migration(self): + from django.utils import timezone + + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import ArchiveResult, Snapshot + from archivebox.services.runner import run_due_snapshot + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.SEALED, + retry_at=None, + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.SEALED, + retry_at=timezone.now(), + ) + Snapshot.objects.filter(pk=snapshot.pk).update(fs_version="0.9.0") + snapshot.refresh_from_db() + snapshot.output_dir.mkdir(parents=True, exist_ok=True) + title_dir = snapshot.output_dir / "title" + title_dir.mkdir(parents=True, exist_ok=True) + (title_dir / "title.txt").write_text("Example Domain\n", encoding="utf-8") + result = ArchiveResult.objects.create( + snapshot=snapshot, + plugin="search_backend_sqlite", + hook_name="on_Snapshot__90_index_sqlite", + status=ArchiveResult.StatusChoices.QUEUED, + ) + + assert run_due_snapshot(snapshot, lock_seconds=60) is True + + snapshot.refresh_from_db() + result.refresh_from_db() + assert snapshot.fs_version == Snapshot._fs_current_version() + assert result.status in ArchiveResult.FINAL_STATES + assert result.status != ArchiveResult.StatusChoices.QUEUED + assert result.start_ts is not None + assert result.end_ts is not None + + @pytest.mark.django_db(transaction=True) + def test_run_due_snapshot_fails_obsolete_queued_hook_name(self): + from django.utils import timezone + + from 
archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import ArchiveResult, Snapshot + from archivebox.services.runner import run_due_snapshot + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.SEALED, + retry_at=None, + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.SEALED, + retry_at=timezone.now(), + ) + result = ArchiveResult.objects.create( + snapshot=snapshot, + plugin="singlefile", + hook_name="on_Snapshot__50_singlefile.py", + status=ArchiveResult.StatusChoices.QUEUED, + ) + + assert run_due_snapshot(snapshot, lock_seconds=60) is True + + result.refresh_from_db() + snapshot.refresh_from_db() + assert result.status == ArchiveResult.StatusChoices.FAILED + assert snapshot.retry_at is None + + @pytest.mark.django_db(transaction=True) + @pytest.mark.timeout(300) + @pytest.mark.parametrize("chrome_isolation", ["crawl", "snapshot"]) + def test_resume_queued_chrome_wait_reruns_background_prerequisites( + self, + initialized_archive, + recursive_test_site, + chrome_isolation, + ): + from archivebox.core.models import ArchiveResult + from archivebox.tests.test_orm_helpers import use_archivebox_db + + env = cli_env(disable_extractors=True) + env.update( + { + "SAVE_TITLE": "false", + "TIMEOUT": "60", + "CHROME_TIMEOUT": "30", + }, + ) + _install_real_chrome_for_test(initialized_archive, env, isolation=chrome_isolation) + + add_process = run_archivebox_cmd( + [ + "add", + "--depth=0", + "--plugins=chrome", + recursive_test_site["root_url"], + ], + cwd=initialized_archive, + env=env, + timeout=600, + ) + assert add_process.returncode == 0, add_process.stderr or add_process.stdout + + list_process = run_archivebox_cmd( + ["archiveresult", "list", "--plugin=chrome"], + cwd=initialized_archive, + env=env, + timeout=60, + ) + assert 
list_process.returncode == 0, list_process.stderr or list_process.stdout + chrome_results = parse_jsonl_output(list_process.stdout) + wait_record = next(record for record in chrome_results if record["hook_name"] == "on_Snapshot__11_chrome_wait") + snapshot_id = wait_record["snapshot_id"] + + with use_archivebox_db(initialized_archive): + tab_result = ArchiveResult.objects.get( + snapshot_id=snapshot_id, + plugin="chrome", + hook_name="on_Snapshot__10_chrome_tab.daemon.bg", + ) + first_tab_process_id = tab_result.process_id + assert first_tab_process_id is not None + + update_process = run_archivebox_cmd( + ["archiveresult", "update", "--status=queued"], + stdin=json.dumps(wait_record) + "\n", + cwd=initialized_archive, + env=env, + timeout=60, + ) + assert update_process.returncode == 0, update_process.stderr or update_process.stdout + + run_process = run_archivebox_cmd( + ["run"], + cwd=initialized_archive, + env=env, + stdin=subprocess.PIPE, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + wait=False, + start_new_session=True, + ) + assert run_process.stdin is not None + run_process.stdin.write(update_process.stdout) + run_process.stdin.close() + + resumed_full_plugin = False + try: + deadline = time.time() + 90 + last_wait_status = None + last_tab_process_id = None + while time.time() < deadline: + with use_archivebox_db(initialized_archive): + wait_result = ArchiveResult.objects.get( + snapshot_id=snapshot_id, + plugin="chrome", + hook_name="on_Snapshot__11_chrome_wait", + ) + tab_result = ArchiveResult.objects.get( + snapshot_id=snapshot_id, + plugin="chrome", + hook_name="on_Snapshot__10_chrome_tab.daemon.bg", + ) + last_wait_status = wait_result.status + last_tab_process_id = tab_result.process_id + if wait_result.status == ArchiveResult.StatusChoices.SUCCEEDED and tab_result.process_id != first_tab_process_id: + resumed_full_plugin = True + break + if run_process.poll() is not None: + break + time.sleep(0.5) + + if resumed_full_plugin: + try: + 
run_process.wait(timeout=30) + except subprocess.TimeoutExpired: + cleanup_process_group(run_process.pid) + run_process.wait(timeout=10) + finally: + if run_process.poll() is None: + cleanup_process_group(run_process.pid) + run_process.wait(timeout=10) + + assert run_process.returncode == 0 + assert last_wait_status == ArchiveResult.StatusChoices.SUCCEEDED + assert last_tab_process_id is not None + assert last_tab_process_id != first_tab_process_id + + def test_recover_orchestrator_state_ignores_sealed_downloaded_snapshot_without_results(self): + from django.utils import timezone + + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.core.recovery_util import recover_orchestrator_state + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.SEALED, + retry_at=None, + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.SEALED, + downloaded_at=timezone.now(), + retry_at=None, + ) + + recovered = recover_orchestrator_state() + + snapshot.refresh_from_db() + crawl.refresh_from_db() + + assert recovered["snapshots_started_without_running_results"] == 0 + assert snapshot.status == Snapshot.StatusChoices.SEALED + assert snapshot.retry_at is None + assert crawl.status == Crawl.StatusChoices.SEALED + assert crawl.retry_at is None + + def test_recover_orchestrator_state_unlocks_started_snapshot_with_final_results_for_runner(self): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import ArchiveResult, Snapshot + from archivebox.core.recovery_util import recover_orchestrator_state + from archivebox.services.runner import run_due_snapshot + + crawl = Crawl.objects.create( + urls="https://example.com", + 
created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.STARTED, + retry_at=None, + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.STARTED, + retry_at=None, + ) + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="title", + hook_name="on_Snapshot__01_title", + status=ArchiveResult.StatusChoices.SUCCEEDED, + ) + + recovered = recover_orchestrator_state() + + snapshot.refresh_from_db() + assert "sealed_snapshots" not in recovered + assert recovered["snapshots_started_without_running_results"] == 1 + assert snapshot.status == Snapshot.StatusChoices.STARTED + assert snapshot.retry_at is not None + + assert run_due_snapshot(snapshot, lock_seconds=60) is True + snapshot.refresh_from_db() + + assert snapshot.status == Snapshot.StatusChoices.SEALED + assert snapshot.retry_at is None + + +@pytest.mark.django_db +class TestRunDueCrawlState: + def test_idle_maintenance_repairs_archive_result_delete_at(self): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import ArchiveResult, Snapshot + from archivebox.services.runner import run_pending_crawls + + crawl = Crawl.objects.create( + urls="https://example.com/retention-repair", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.SEALED, + retry_at=None, + config={"DELETE_AFTER": "2h"}, + ) + snapshot = Snapshot.objects.create( + url="https://example.com/retention-repair", + crawl=crawl, + status=Snapshot.StatusChoices.SEALED, + retry_at=None, + ) + result = ArchiveResult.objects.create( + snapshot=snapshot, + plugin="search_backend_sqlite", + hook_name="on_Snapshot__90_index_sqlite.py", + status=ArchiveResult.StatusChoices.SUCCEEDED, + ) + + # ArchiveResult saves are the plugin-event hot path. 
They intentionally + # do not resolve parent Snapshot/Crawl config on every write; the real + # runner's idle maintenance pass owns missing delete_at repair. + assert result.delete_at is None + assert run_pending_crawls(daemon=False, maintenance_only=True) == 0 + + result.refresh_from_db() + assert result.delete_at is not None + + def test_maintenance_only_runner_does_not_start_regular_queued_crawls(self): + from django.utils import timezone + + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.services.runner import run_pending_crawls + + now = timezone.now() + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.QUEUED, + retry_at=now, + ) + + assert run_pending_crawls(daemon=False, maintenance_only=True) == 0 + + crawl.refresh_from_db() + assert crawl.status == Crawl.StatusChoices.QUEUED + assert crawl.retry_at == now + assert crawl.snapshot_set.count() == 0 + + def test_snapshot_start_writes_short_future_lease(self): + from django.utils import timezone + + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.STARTED, + retry_at=timezone.now(), + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.QUEUED, + retry_at=timezone.now(), + ) + + snapshot.sm.tick() + snapshot.refresh_from_db() + + assert snapshot.status == Snapshot.StatusChoices.STARTED + assert snapshot.retry_at is not None + assert snapshot.retry_at > timezone.now() + + def test_abandoned_started_snapshot_results_are_reset_locally_for_resume(self): + from django.utils import timezone + + from archivebox.base_models.models 
import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import ArchiveResult, Snapshot + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.STARTED, + retry_at=timezone.now(), + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.STARTED, + retry_at=timezone.now(), + ) + abandoned = ArchiveResult.objects.create( + snapshot=snapshot, + plugin="title", + hook_name="on_Snapshot__01_title", + status=ArchiveResult.StatusChoices.STARTED, + output_str="partial output should be cleared", + output_files={"partial.txt": {"size": 12}}, + output_size=12, + start_ts=timezone.now(), + ) + queued = ArchiveResult.objects.create( + snapshot=snapshot, + plugin="wget", + hook_name="on_Snapshot__40_wget", + status=ArchiveResult.StatusChoices.QUEUED, + ) + finished = ArchiveResult.objects.create( + snapshot=snapshot, + plugin="favicon", + hook_name="on_Snapshot__01_favicon", + status=ArchiveResult.StatusChoices.SUCCEEDED, + output_str="keep me", + output_files={"favicon.ico": {"size": 1}}, + output_size=1, + ) + + snapshot.reset_abandoned_results() + + abandoned.refresh_from_db() + queued.refresh_from_db() + finished.refresh_from_db() + + assert abandoned.status == ArchiveResult.StatusChoices.QUEUED + assert abandoned.output_str == "" + assert abandoned.output_files == {} + assert abandoned.output_size == 0 + assert queued.status == ArchiveResult.StatusChoices.QUEUED + assert finished.status == ArchiveResult.StatusChoices.SUCCEEDED + assert finished.output_str == "keep me" + assert finished.output_files == {"favicon.ico": {"size": 1}} + + def test_due_started_snapshot_with_live_child_extends_lease_without_reset(self): + import os + from datetime import datetime + + import psutil + from django.utils import timezone + + from archivebox.base_models.models import 
get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import ArchiveResult, Snapshot + from archivebox.machine.models import Machine, NetworkInterface, Process + from archivebox.services.runner import run_due_snapshot + + now = timezone.now() + os_proc = psutil.Process(os.getpid()) + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.STARTED, + retry_at=now, + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.STARTED, + retry_at=now, + ) + process = Process.objects.create( + machine=Machine.current(), + iface=NetworkInterface.current(), + process_type=Process.TypeChoices.HOOK, + status=Process.StatusChoices.RUNNING, + pid=os.getpid(), + started_at=datetime.fromtimestamp(os_proc.create_time(), tz=timezone.get_current_timezone()), + cmd=os_proc.cmdline(), + pwd=str(snapshot.output_dir / "title"), + ) + result = ArchiveResult.objects.create( + snapshot=snapshot, + process=process, + plugin="title", + hook_name="on_Snapshot__01_title", + status=ArchiveResult.StatusChoices.STARTED, + output_str="live work should not be reset", + output_files={"partial.txt": {"size": 12}}, + output_size=12, + ) + + assert run_due_snapshot(snapshot, lock_seconds=60) is True + + snapshot.refresh_from_db() + result.refresh_from_db() + assert snapshot.status == Snapshot.StatusChoices.STARTED + assert snapshot.retry_at is not None + assert snapshot.retry_at > now + assert result.status == ArchiveResult.StatusChoices.STARTED + assert result.output_str == "live work should not be reset" + assert result.output_files == {"partial.txt": {"size": 12}} + assert result.output_size == 12 + + def test_run_due_crawl_seals_finished_started_crawl(self): + from django.utils import timezone + + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl 
+ from archivebox.core.models import Snapshot + from archivebox.services.runner import run_due_crawl + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.STARTED, + retry_at=timezone.now(), + ) + Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.SEALED, + retry_at=None, + ) + + assert run_due_crawl(crawl, lock_seconds=10) is True + + crawl.refresh_from_db() + assert crawl.status == Crawl.StatusChoices.SEALED + assert crawl.retry_at is None + + def test_run_due_crawl_preserves_next_future_snapshot_retry(self): + from datetime import timedelta + + from django.utils import timezone + + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.services.runner import run_due_crawl + + future = timezone.now() + timedelta(hours=1) + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.STARTED, + retry_at=timezone.now(), + ) + Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.QUEUED, + retry_at=future, + ) + + assert run_due_crawl(crawl, lock_seconds=10) is True + + crawl.refresh_from_db() + assert crawl.status == Crawl.StatusChoices.STARTED + assert crawl.retry_at == future + + def test_run_due_crawl_preserves_next_future_started_snapshot_lease(self): + from datetime import timedelta + + from django.utils import timezone + + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.services.runner import run_due_crawl + + future = timezone.now() + timedelta(minutes=5) + crawl = Crawl.objects.create( + urls="https://example.com", + 
created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.STARTED, + retry_at=timezone.now(), + ) + Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.STARTED, + retry_at=future, + ) + + assert run_due_crawl(crawl, lock_seconds=10) is True + + crawl.refresh_from_db() + assert crawl.status == Crawl.StatusChoices.STARTED + assert crawl.retry_at == future + + def test_run_due_crawl_unlocks_null_retry_queued_snapshot(self): + from django.utils import timezone + + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.services.runner import run_due_crawl + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.STARTED, + retry_at=timezone.now(), + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.QUEUED, + retry_at=None, + ) + + assert run_due_crawl(crawl, lock_seconds=10) is True + + crawl.refresh_from_db() + snapshot.refresh_from_db() + assert crawl.status == Crawl.StatusChoices.STARTED + assert crawl.retry_at is not None + assert snapshot.retry_at is not None + + +@pytest.mark.django_db +class TestRecoverOrchestratorStateRedFailureModes: + def test_recovery_does_not_seal_queued_snapshot_waiting_for_future_retry_even_with_final_results(self): + from datetime import timedelta + + from django.utils import timezone + + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import ArchiveResult, Snapshot + from archivebox.core.recovery_util import recover_orchestrator_state + + future = timezone.now() + timedelta(days=1) + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + 
status=Crawl.StatusChoices.STARTED, + retry_at=future, + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.QUEUED, + retry_at=future, + ) + ArchiveResult.objects.create( + snapshot=snapshot, + plugin="title", + hook_name="on_Snapshot__01_title", + status=ArchiveResult.StatusChoices.SUCCEEDED, + ) + + recover_orchestrator_state() + + snapshot.refresh_from_db() + assert snapshot.status == Snapshot.StatusChoices.QUEUED + assert snapshot.retry_at == future + + def test_recovery_does_not_seal_queued_crawl_waiting_for_future_retry_even_with_finished_snapshots(self): + from datetime import timedelta + + from django.utils import timezone + + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.core.recovery_util import recover_orchestrator_state + + future = timezone.now() + timedelta(days=1) + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.QUEUED, + retry_at=future, + ) + Snapshot.objects.create(url="https://example.com", crawl=crawl, status=Snapshot.StatusChoices.SEALED, retry_at=None) + + recover_orchestrator_state() + + crawl.refresh_from_db() + assert crawl.status == Crawl.StatusChoices.QUEUED + assert crawl.retry_at == future + + def test_recovery_unlocks_started_parent_to_future_retry_child_not_now(self): + from datetime import timedelta + + from django.utils import timezone + + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.core.recovery_util import recover_orchestrator_state + + future = timezone.now() + timedelta(days=1) + crawl = Crawl.objects.create( + urls="https://www.mathjax.org/", + created_by_id=get_or_create_system_user_pk(), + 
status=Crawl.StatusChoices.STARTED, + retry_at=None, + ) + Snapshot.objects.create(url="https://www.mathjax.org/", crawl=crawl, status=Snapshot.StatusChoices.QUEUED, retry_at=future) + + recover_orchestrator_state() + + crawl.refresh_from_db() + assert crawl.status == Crawl.StatusChoices.STARTED + assert crawl.retry_at == future + + def test_recovery_requeues_started_archiveresult_without_process(self): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import ArchiveResult, Snapshot + from archivebox.core.recovery_util import recover_orchestrator_state + + crawl = Crawl.objects.create( + urls="https://www.mathjax.org/", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.STARTED, + retry_at=None, + ) + snapshot = Snapshot.objects.create( + url="https://www.mathjax.org/", + crawl=crawl, + status=Snapshot.StatusChoices.STARTED, + retry_at=None, + ) + result = ArchiveResult.objects.create( + snapshot=snapshot, + plugin="title", + hook_name="on_Snapshot__01_title", + status=ArchiveResult.StatusChoices.STARTED, + ) + + recover_orchestrator_state() + + result.refresh_from_db() + assert result.status == ArchiveResult.StatusChoices.QUEUED + + def test_recovery_requeues_started_archiveresult_with_exited_process(self): + from django.utils import timezone + + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import ArchiveResult, Snapshot + from archivebox.machine.models import Machine, NetworkInterface, Process + from archivebox.core.recovery_util import recover_orchestrator_state + + crawl = Crawl.objects.create( + urls="https://revealjs.com/", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.STARTED, + retry_at=None, + ) + snapshot = Snapshot.objects.create(url="https://revealjs.com/", crawl=crawl, 
status=Snapshot.StatusChoices.STARTED, retry_at=None) + process = Process.objects.create( + machine=Machine.current(refresh=True), + iface=NetworkInterface.current(refresh=True), + process_type=Process.TypeChoices.HOOK, + worker_type="archiveresult", + pwd=str(snapshot.output_dir / "title"), + cmd=["python", "--version"], + status=Process.StatusChoices.EXITED, + retry_at=None, + exit_code=0, + ended_at=timezone.now(), + ) + result = ArchiveResult.objects.create( + snapshot=snapshot, + plugin="title", + hook_name="on_Snapshot__01_title", + status=ArchiveResult.StatusChoices.STARTED, + process=process, + ) + + recover_orchestrator_state() + + result.refresh_from_db() + assert result.status == ArchiveResult.StatusChoices.QUEUED + + def test_recovery_requeues_sealed_snapshot_started_result_with_exited_process_result_too(self): + from django.utils import timezone + + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import ArchiveResult, Snapshot + from archivebox.machine.models import Machine, NetworkInterface, Process + from archivebox.core.recovery_util import recover_orchestrator_state + + crawl = Crawl.objects.create( + urls="https://pdfobject.com/pdf/sample-3pp.pdf", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.SEALED, + retry_at=None, + ) + snapshot = Snapshot.objects.create( + url="https://pdfobject.com/pdf/sample-3pp.pdf", + crawl=crawl, + status=Snapshot.StatusChoices.SEALED, + retry_at=None, + ) + process = Process.objects.create( + machine=Machine.current(refresh=True), + iface=NetworkInterface.current(refresh=True), + process_type=Process.TypeChoices.HOOK, + worker_type="archiveresult", + pwd=str(snapshot.output_dir / "pdf"), + cmd=["python", "--version"], + status=Process.StatusChoices.EXITED, + retry_at=None, + exit_code=0, + ended_at=timezone.now(), + ) + result = ArchiveResult.objects.create( + snapshot=snapshot, + 
plugin="pdf", + hook_name="on_Snapshot__50_pdf", + status=ArchiveResult.StatusChoices.STARTED, + process=process, + ) + + recover_orchestrator_state() + + snapshot.refresh_from_db() + result.refresh_from_db() + assert snapshot.status == Snapshot.StatusChoices.SEALED + assert snapshot.retry_at is not None + assert result.status == ArchiveResult.StatusChoices.QUEUED + + def test_recovery_requeues_started_snapshot_result_before_unlocking_snapshot(self): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import ArchiveResult, Snapshot + from archivebox.core.recovery_util import recover_orchestrator_state + + crawl = Crawl.objects.create( + urls="https://mermaid-js.github.io/mermaid/", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.STARTED, + retry_at=None, + ) + snapshot = Snapshot.objects.create( + url="https://mermaid-js.github.io/mermaid/", + crawl=crawl, + status=Snapshot.StatusChoices.STARTED, + retry_at=None, + ) + result = ArchiveResult.objects.create( + snapshot=snapshot, + plugin="title", + hook_name="on_Snapshot__01_title", + status=ArchiveResult.StatusChoices.STARTED, + ) + + recover_orchestrator_state() + + snapshot.refresh_from_db() + result.refresh_from_db() + assert result.status == ArchiveResult.StatusChoices.QUEUED + assert snapshot.retry_at is not None + + def test_crawl_runner_load_run_state_does_not_return_future_retry_snapshots(self): + from datetime import timedelta + + from django.utils import timezone + + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.services.runner import CrawlRunner + + future = timezone.now() + timedelta(days=1) + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.STARTED, + 
retry_at=future, + ) + Snapshot.objects.create(url="https://example.com", crawl=crawl, status=Snapshot.StatusChoices.QUEUED, retry_at=future) + + runner = CrawlRunner(crawl, selected_plugins=[]) + + assert runner.load_run_state() == [] + + def test_crawl_runner_finalize_run_state_preserves_next_future_snapshot_retry(self): + from datetime import timedelta + + from django.utils import timezone + + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.services.runner import CrawlRunner + + future = timezone.now() + timedelta(days=1) + crawl = Crawl.objects.create( + urls="https://blog.sweeting.me", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.STARTED, + retry_at=None, + ) + Snapshot.objects.create(url="https://blog.sweeting.me", crawl=crawl, status=Snapshot.StatusChoices.QUEUED, retry_at=future) + + runner = CrawlRunner(crawl, selected_plugins=[]) + runner.finalize_run_state() + + crawl.refresh_from_db() + assert crawl.status == Crawl.StatusChoices.STARTED + assert crawl.retry_at == future + + def test_due_started_crawl_yields_to_due_child_snapshot(self): + from django.utils import timezone + + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.services.runner import run_due_crawl + + now = timezone.now() + crawl = Crawl.objects.create( + urls="https://blog.sweeting.me", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.STARTED, + retry_at=now, + ) + snapshot = Snapshot.objects.create( + url="https://blog.sweeting.me", + crawl=crawl, + status=Snapshot.StatusChoices.STARTED, + retry_at=now, + ) + + assert run_due_crawl(crawl, lock_seconds=10) is True + + crawl.refresh_from_db() + snapshot.refresh_from_db() + assert crawl.status == Crawl.StatusChoices.STARTED + 
assert crawl.retry_at is not None + assert crawl.retry_at > timezone.now() + assert snapshot.status == Snapshot.StatusChoices.STARTED + assert snapshot.retry_at == now + + def test_crawl_cancel_schedules_children_for_per_snapshot_sealing(self): + from datetime import timedelta + + from django.utils import timezone + + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import ArchiveResult, Snapshot + from archivebox.services.runner import run_due_crawl, run_due_snapshot + + now = timezone.now() + past = now - timedelta(minutes=5) + future = now + timedelta(days=1) + crawl = Crawl.objects.create( + urls="https://blog.sweeting.me", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.STARTED, + retry_at=now, + ) + queued = Snapshot.objects.create( + url="https://blog.sweeting.me/queued", + crawl=crawl, + status=Snapshot.StatusChoices.QUEUED, + retry_at=future, + ) + started = Snapshot.objects.create( + url="https://blog.sweeting.me/started", + crawl=crawl, + status=Snapshot.StatusChoices.STARTED, + retry_at=future, + ) + paused = Snapshot.objects.create( + url="https://blog.sweeting.me/paused", + crawl=crawl, + status=Snapshot.StatusChoices.PAUSED, + retry_at=future, + ) + already_due = Snapshot.objects.create( + url="https://blog.sweeting.me/already-due", + crawl=crawl, + status=Snapshot.StatusChoices.QUEUED, + retry_at=past, + ) + maintenance = Snapshot.objects.create( + url="https://blog.sweeting.me/maintenance", + crawl=crawl, + status=Snapshot.StatusChoices.SEALED, + retry_at=future, + ) + ArchiveResult.objects.create( + snapshot=maintenance, + plugin="search_backend_sqlite", + hook_name="on_Snapshot__90_index_sqlite", + status=ArchiveResult.StatusChoices.QUEUED, + ) + + crawl.cancel() + + crawl.refresh_from_db() + queued.refresh_from_db() + started.refresh_from_db() + paused.refresh_from_db() + maintenance.refresh_from_db() + assert 
crawl.status == Crawl.StatusChoices.SEALED + assert crawl.retry_at is not None + assert crawl.retry_at <= timezone.now() + for snapshot in (queued, started, paused, already_due): + assert snapshot.status != Snapshot.StatusChoices.SEALED + assert snapshot.retry_at is not None + assert snapshot.retry_at <= timezone.now() + assert run_due_snapshot(snapshot, lock_seconds=60) is True + snapshot.refresh_from_db() + assert snapshot.status == Snapshot.StatusChoices.SEALED + assert snapshot.retry_at is None + assert maintenance.status == Snapshot.StatusChoices.SEALED + assert maintenance.retry_at == future + assert run_due_crawl(crawl, lock_seconds=60) is True + crawl.refresh_from_db() + assert crawl.retry_at is None + + def test_crawl_cancel_reschedules_children_when_parent_was_already_sealed(self): + from datetime import timedelta + + from django.utils import timezone + + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + + future = timezone.now() + timedelta(days=1) + crawl = Crawl.objects.create( + urls="https://blog.sweeting.me", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.SEALED, + retry_at=None, + ) + snapshot = Snapshot.objects.create( + url="https://blog.sweeting.me/old-cancel", + crawl=crawl, + status=Snapshot.StatusChoices.STARTED, + retry_at=future, + ) + + crawl.cancel() + + crawl.refresh_from_db() + snapshot.refresh_from_db() + assert crawl.status == Crawl.StatusChoices.SEALED + assert crawl.retry_at is None + assert snapshot.status == Snapshot.StatusChoices.STARTED + assert snapshot.retry_at is not None + assert snapshot.retry_at <= timezone.now() + + snapshot.status = Snapshot.StatusChoices.SEALED + snapshot.retry_at = None + snapshot.save(update_fields=["status", "retry_at", "modified_at"]) + crawl.refresh_from_db() + sealed_modified_at = crawl.modified_at + + crawl.cancel() + + crawl.refresh_from_db() + assert 
crawl.modified_at == sealed_modified_at + + def test_run_due_crawl_stale_started_object_cannot_resurrect_cancelled_crawl(self): + from django.utils import timezone + + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.services.runner import run_due_crawl + + now = timezone.now() + crawl = Crawl.objects.create( + urls="https://blog.sweeting.me", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.STARTED, + retry_at=now, + ) + Snapshot.objects.create( + url="https://blog.sweeting.me/queued", + crawl=crawl, + status=Snapshot.StatusChoices.QUEUED, + retry_at=now, + ) + stale_crawl = Crawl.objects.get(pk=crawl.pk) + + crawl.cancel() + assert run_due_crawl(stale_crawl, lock_seconds=60) is True + + crawl.refresh_from_db() + assert crawl.status == Crawl.StatusChoices.SEALED + assert crawl.retry_at is None + + def test_snapshot_seal_uses_retry_at_ownership_not_modified_at(self): + from datetime import timedelta + + from django.utils import timezone + + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + + now = timezone.now() + crawl = Crawl.objects.create( + urls="https://blog.sweeting.me", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.SEALED, + retry_at=None, + ) + snapshot = Snapshot.objects.create( + url="https://blog.sweeting.me/owned-seal", + crawl=crawl, + status=Snapshot.StatusChoices.QUEUED, + retry_at=now, + ) + + assert Snapshot.claim_for_worker(snapshot, lock_seconds=60) is True + Snapshot.objects.filter(pk=snapshot.pk).update( + downloaded_at=now, + modified_at=now + timedelta(seconds=1), + ) + + snapshot.sm.seal() + snapshot.refresh_from_db() + assert snapshot.status == Snapshot.StatusChoices.SEALED + assert snapshot.retry_at is None + assert snapshot.downloaded_at 
== now + + def test_recovery_reschedules_stale_due_crawl_even_with_unrelated_process_path_containing_crawl_id(self): + from datetime import timedelta + + from django.utils import timezone + + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.machine.models import Machine, NetworkInterface, Process + from archivebox.core.recovery_util import recover_orchestrator_state + + old = timezone.now() - timedelta(hours=13) + crawl = Crawl.objects.create( + urls="https://github.com/nodeca/pica", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.QUEUED, + retry_at=old, + ) + Crawl.objects.filter(id=crawl.id).update(modified_at=old, retry_at=old) + Process.objects.create( + machine=Machine.current(refresh=True), + iface=NetworkInterface.current(refresh=True), + process_type=Process.TypeChoices.HOOK, + worker_type="archiveresult", + pwd=f"/tmp/not-an-archivebox-child/{crawl.id}/title", + cmd=["python", "--version"], + status=Process.StatusChoices.EXITED, + retry_at=None, + exit_code=0, + ended_at=timezone.now(), + ) + + recovered = recover_orchestrator_state() + + crawl.refresh_from_db() + assert "stale_active_crawls_unlocked" not in recovered + assert crawl.status == Crawl.StatusChoices.QUEUED + assert crawl.retry_at == old + + def test_recovery_does_not_crash_on_invalid_utf8_process_logs(self, tmp_path): + from datetime import timedelta + + from django.utils import timezone + + from archivebox.machine.models import Machine, NetworkInterface, Process + from archivebox.core.recovery_util import recover_orchestrator_state + + runtime_dir = tmp_path / "https_example_com" / ".hooks" / "on_Snapshot__01_title.py" + runtime_dir.mkdir(parents=True) + (runtime_dir / "stdout.log").write_bytes(b"\\xff\\xfe\\xfa") + process = Process.objects.create( + machine=Machine.current(refresh=True), + iface=NetworkInterface.current(refresh=True), + process_type=Process.TypeChoices.HOOK, + 
worker_type="archiveresult", + pwd=str(tmp_path / "https_example_com"), + cmd=["on_Snapshot__01_title.py"], + status=Process.StatusChoices.RUNNING, + retry_at=None, + pid=999999, + started_at=timezone.now() - timedelta(hours=1), + timeout=1, + ) + + recover_orchestrator_state() + + process.refresh_from_db() + assert process.status == Process.StatusChoices.EXITED diff --git a/archivebox/tests/test_cli_schedule.py b/archivebox/tests/test_cli_schedule.py new file mode 100644 index 0000000000..81bae29e42 --- /dev/null +++ b/archivebox/tests/test_cli_schedule.py @@ -0,0 +1,205 @@ +#!/usr/bin/env python3 +"""CLI-specific tests for archivebox schedule.""" + +from archivebox.tests.conftest import run_archivebox_cmd + +import pytest + +from archivebox.crawls.models import Crawl, CrawlSchedule +from archivebox.tests.test_orm_helpers import use_archivebox_db +from .conftest import ( + cli_env, + get_counts, + get_free_port, + init_archive, + make_latest_schedule_due, + start_archivebox_server, + stop_server, + wait_for_http, + wait_for_snapshot_capture, +) + +pytestmark = pytest.mark.django_db(transaction=True) + + +def test_schedule_run_all_enqueues_scheduled_crawl(initialized_archive): + + env = cli_env(disable_extractors=True) + run_archivebox_cmd( + ["schedule", "--every=daily", "--depth=0", "https://example.com"], + check=True, + ) + + result = run_archivebox_cmd( + ["schedule", "--run-all"], + env=env, + ) + + assert result.returncode == 0 + assert "Enqueued 1 scheduled crawl" in result.stdout + + with use_archivebox_db(initialized_archive): + crawl_count = Crawl.objects.count() + queued_count = Crawl.objects.filter(status="queued").count() + + assert crawl_count >= 2 + assert queued_count >= 1 + + +def test_schedule_without_import_path_creates_maintenance_schedule(initialized_archive): + + result = run_archivebox_cmd( + ["schedule", "--every=day"], + ) + + assert result.returncode == 0 + assert "Created scheduled maintenance update" in result.stdout + + with 
use_archivebox_db(initialized_archive): + row = Crawl.objects.order_by("-created_at").values_list("urls", "status").first() + + assert row == ("archivebox://update", "sealed") + + +def test_schedule_creates_enabled_db_schedule(initialized_archive): + + result = run_archivebox_cmd( + ["schedule", "--every=daily", "--depth=1", "https://example.com/feed.xml"], + ) + + assert result.returncode == 0 + + with use_archivebox_db(initialized_archive): + schedule_row = CrawlSchedule.objects.order_by("-created_at").values_list("schedule", "is_enabled", "label").first() + crawl = Crawl.objects.order_by("-created_at").first() + + assert schedule_row == ("daily", True, "Scheduled import: https://example.com/feed.xml") + assert crawl is not None + assert crawl.urls == "https://example.com/feed.xml" + assert crawl.status == "sealed" + assert crawl.max_depth == 1 + + +def test_schedule_show_lists_enabled_schedules(initialized_archive): + + run_archivebox_cmd( + ["schedule", "--every=weekly", "https://example.com/feed.xml"], + check=True, + ) + + result = run_archivebox_cmd( + ["schedule", "--show"], + ) + + assert result.returncode == 0 + assert "Active scheduled crawls" in result.stdout + assert "https://example.com/feed.xml" in result.stdout + assert "weekly" in result.stdout + + +def test_schedule_clear_disables_existing_schedules(initialized_archive): + + run_archivebox_cmd( + ["schedule", "--every=daily", "https://example.com/feed.xml"], + check=True, + ) + + result = run_archivebox_cmd( + ["schedule", "--clear"], + ) + + assert result.returncode == 0 + assert "Disabled 1 scheduled crawl" in result.stdout + + with use_archivebox_db(initialized_archive): + disabled_count = CrawlSchedule.objects.filter(is_enabled=False).count() + enabled_count = CrawlSchedule.objects.filter(is_enabled=True).count() + + assert disabled_count == 1 + assert enabled_count == 0 + + +def test_schedule_every_requires_valid_period(initialized_archive): + + result = run_archivebox_cmd( + ["schedule", 
"--every=invalid_period", "https://example.com/feed.xml"], + ) + + assert result.returncode != 0 + assert "Invalid schedule" in result.stderr or "Invalid schedule" in result.stdout + + +def test_schedule_help_lists_schedule_options(initialized_archive): + + result = run_archivebox_cmd( + ["schedule", "--help"], + ) + + assert result.returncode == 0 + assert "--every" in result.stdout + assert "--show" in result.stdout + assert "--clear" in result.stdout + assert "--run-all" in result.stdout + + +@pytest.mark.timeout(180) +def test_schedule_due_crawl_runs_over_server_and_saves_real_content(tmp_path, recursive_test_site): + init_archive(tmp_path) + + port = get_free_port() + env = cli_env(port=port, server=True) + + schedule_result = run_archivebox_cmd( + ["schedule", "--every=daily", "--depth=0", recursive_test_site["root_url"]], + cwd=tmp_path, + env=env, + timeout=60, + ) + assert schedule_result.returncode == 0, schedule_result.stderr + assert "Created scheduled crawl" in schedule_result.stdout + + make_latest_schedule_due(tmp_path) + + try: + start_archivebox_server(tmp_path, env=env, port=port) + wait_for_http(port, host=f"web.archivebox.localhost:{port}") + captured_text = wait_for_snapshot_capture(tmp_path, recursive_test_site["root_url"], timeout=180) + assert "Root" in captured_text + assert "About" in captured_text + finally: + stop_server(tmp_path) + + +@pytest.mark.timeout(180) +def test_add_remains_one_shot_when_schedule_is_due(tmp_path, recursive_test_site): + init_archive(tmp_path) + + port = get_free_port() + env = cli_env(port=port, server=True) + scheduled_url = recursive_test_site["root_url"] + one_shot_url = recursive_test_site["child_urls"][0] + + schedule_result = run_archivebox_cmd( + ["schedule", "--every=daily", "--depth=0", scheduled_url], + cwd=tmp_path, + env=env, + timeout=60, + ) + assert schedule_result.returncode == 0, schedule_result.stderr + + make_latest_schedule_due(tmp_path) + + add_result = run_archivebox_cmd( + ["add", 
def test_cli_search_status_filters_snapshot_status_column(tmp_path, initialized_archive):
    """`archivebox search --status` filters on each snapshot's status value.

    Three snapshots are created, two of them are moved to ``paused`` and
    ``sealed`` via ``snapshot update``, and the search for ``sealed`` must
    return exactly the sealed snapshot. Searching for ``unarchived`` must fail
    with an "Invalid snapshot status" error.
    """
    env = cli_env(disable_extractors=True)

    def run_cli(args, **extra):
        # Every CLI call in this test shares the same environment and timeout.
        return run_archivebox_cmd(args, env=env, timeout=30, **extra)

    seed_urls = (
        "https://example.com/search-status-queued",
        "https://example.com/search-status-paused",
        "https://example.com/search-status-sealed",
    )
    for seed_url in seed_urls:
        created = run_cli(["snapshot", "create", seed_url])
        assert created.returncode == 0, created.stderr

    # Pipe the JSONL of each matching snapshot into `snapshot update` to change
    # its status; the "queued" snapshot is intentionally left untouched.
    status_by_needle = {
        "search-status-paused": "paused",
        "search-status-sealed": "sealed",
    }
    for needle, status in status_by_needle.items():
        listed = run_cli(["snapshot", "list", "--url__icontains", needle])
        assert listed.returncode == 0, listed.stderr
        updated = run_cli(["snapshot", "update", "--status", status], input=listed.stdout)
        assert updated.returncode == 0, updated.stderr

    sealed_search = run_cli(["search", "--status", "sealed", "search-status"])
    assert sealed_search.returncode == 0, sealed_search.stderr

    # Only lines that look like JSON objects are records; skip any other output.
    rows = []
    for line in sealed_search.stdout.splitlines():
        if line.strip().startswith("{"):
            rows.append(json.loads(line))
    assert [row["status"] for row in rows] == ["sealed"]
    assert [row["url"] for row in rows] == ["https://example.com/search-status-sealed"]

    rejected = run_cli(["search", "--status", "unarchived", "search-status"])
    assert rejected.returncode != 0
    assert "Invalid snapshot status" in rejected.stderr
def test_server_auth_secret_and_cookie_settings_are_restart_stable(tmp_path, monkeypatch):
    """Admin sessions must survive `archivebox server` restarts for a collection.

    The Django settings are dumped from two separate interpreter processes:
    first with BASE_URL supplied through the environment, then with BASE_URL
    supplied only through the persisted collection config. Both dumps must be
    identical, and the session settings must have the expected values.
    """
    from archivebox.config.collection import write_config_file

    # One-liner executed by each child interpreter: set up Django, then print
    # the secret key and session cookie settings, one value per line.
    settings_probe = ";".join(
        (
            "import os",
            "os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')",
            "import django",
            "django.setup()",
            "from django.conf import settings",
            "print(settings.SECRET_KEY)",
            "print(settings.SESSION_ENGINE)",
            "print(settings.SESSION_COOKIE_NAME)",
            "print(settings.SESSION_COOKIE_DOMAIN)",
            "print(settings.SESSION_COOKIE_SECURE)",
            "print(settings.SESSION_EXPIRE_AT_BROWSER_CLOSE)",
        ),
    )

    def dump_settings():
        # check=True makes a crashing child fail the test immediately.
        return subprocess.run(
            [sys.executable, "-c", settings_probe],
            capture_output=True,
            text=True,
            check=True,
        )

    (tmp_path / ".archivebox_id").write_text("testcoll")
    monkeypatch.setenv("BASE_URL", "http://archivebox.localhost:9292")

    first = dump_settings()
    first_lines = first.stdout.strip().splitlines()
    assert first_lines[0], first.stderr

    # Simulate the next `archivebox server` process, reading only persisted
    # collection config. If SECRET_KEY falls back to the random default_factory
    # here, Django will reject existing signed session cookies after restart.
    monkeypatch.delenv("BASE_URL", raising=False)
    write_config_file({"BASE_URL": "http://archivebox.localhost:9292"})
    second = dump_settings()

    assert second.stdout.strip().splitlines() == first_lines

    session_engine, cookie_name, *cookie_flags = first_lines[1:]
    assert session_engine == "django.contrib.sessions.backends.db"
    assert cookie_name.startswith("archivebox_sessionid_")
    assert cookie_flags == ["None", "False", "False"]
def test_server_help_lists_runtime_options(initialized_archive):
    """Test that server help exposes the current runtime options."""

    help_result = run_archivebox_cmd(["server", "--help"], timeout=10)

    assert help_result.returncode == 0

    # Both runtime flags must be advertised in the help text.
    help_text = help_result.stdout
    for flag in ("--daemonize", "--reload"):
        assert flag in help_text
def test_server_daemon_restarts_runner_killed_by_signal(archivebox_daemon_server):
    """A runner worker terminated with SIGTERM must come back under a new pid.

    After the restart, the daphne worker must still be reported as RUNNING.
    """
    server = archivebox_daemon_server(SEARCH_BACKEND_ENGINE="sqlite")
    state = server.wait_for_workers(("worker_daphne", "worker_runner"))
    original_pid = state["worker_runner"]["pid"]

    os.kill(original_pid, signal.SIGTERM)

    # Poll the worker state for up to 30 seconds, waiting for a RUNNING runner
    # whose pid differs from the one that was just signalled.
    give_up_at = time.time() + 30
    restarted = False
    while time.time() < give_up_at:
        state = server.worker_state()
        runner = state.get("worker_runner", {})
        current_pid = runner.get("pid")
        if runner.get("statename") == "RUNNING" and current_pid and current_pid != original_pid:
            restarted = True
            break
        time.sleep(0.5)

    if not restarted:
        raise AssertionError(f"worker_runner did not restart after SIGTERM: {state}")

    assert state["worker_daphne"]["statename"] == "RUNNING", state
test_live_server_machine_search_engine_update_reaches_subsequent_snapshot_runtime(archivebox_daemon_server): + server = archivebox_daemon_server(SEARCH_BACKEND_ENGINE="ripgrep") + server.wait_for_workers(("worker_daphne", "worker_runner")) + + setup_result = subprocess.run( + [ + sys.executable, + "-c", + ( + "import django;" + "django.setup();" + "from archivebox.base_models.models import get_or_create_system_user_pk;" + "from archivebox.crawls.models import Crawl;" + "from archivebox.core.models import Snapshot;" + "from archivebox.machine.models import Machine;" + "machine = Machine.current(refresh=True);" + "machine.config = {**dict(machine.config or {}), 'SEARCH_BACKEND_ENGINE': 'sqlite'};" + "machine.save(update_fields=['config', 'modified_at']);" + "crawl = Crawl.objects.create(" + "urls='https://example.com/live-machine-search-config'," + "created_by_id=get_or_create_system_user_pk()," + "config={}," + ");" + "snapshot = Snapshot.objects.create(" + "url='https://example.com/live-machine-search-config'," + "crawl=crawl," + ");" + "print(snapshot.id)" + ), + ], + cwd=server.data_dir, + env=server.env, + capture_output=True, + text=True, + timeout=30, + ) + assert setup_result.returncode == 0, setup_result.stderr or setup_result.stdout + snapshot_id = setup_result.stdout.strip().splitlines()[-1] + + result = subprocess.run( + [ + sys.executable, + "-c", + ( + "import django,json;" + "django.setup();" + "from archivebox.core.models import Snapshot;" + "from archivebox.config.common import get_config;" + f"snapshot = Snapshot.objects.select_related('crawl').get(id='{snapshot_id}');" + "runtime = get_config(snapshot=snapshot).for_crawl_runtime(" + "crawl=snapshot.crawl," + "snapshot=snapshot," + "extra_context={'snapshot_id': str(snapshot.id)}," + ");" + "print(json.dumps({" + "'sqlite_enabled': runtime.get('SEARCH_BACKEND_SQLITE_ENABLED')," + "'engine_in_runtime': 'SEARCH_BACKEND_ENGINE' in runtime," + "}))" + ), + ], + cwd=server.data_dir, + env=server.env, + 
def test_sonic_worker_is_disabled_when_sonic_disabled(tmp_path):
    """No Sonic worker is produced when SEARCH_BACKEND_SONIC_ENABLED is False."""
    from archivebox.workers.supervisord_util import get_sonic_supervisord_worker_from_plugin

    # An otherwise complete Sonic configuration; only the enabled flag is off.
    disabled_config = SimpleNamespace(
        DATA_DIR=str(tmp_path),
        SEARCH_BACKEND_SONIC_ENABLED=False,
        SEARCH_BACKEND_SONIC_HOST_NAME="127.0.0.1",
        SEARCH_BACKEND_SONIC_PORT=get_free_port(),
        SEARCH_BACKEND_SONIC_PASSWORD="SecretPassword",
        SONIC_BINARY="sonic",
    )

    assert get_sonic_supervisord_worker_from_plugin(disabled_config) is None
def test_supervisord_sync_does_not_start_duplicate_sonic_listener(initialized_archive, db):
    """Syncing supervisord must not report a Sonic worker RUNNING on a taken port.

    The test binds its own listener to a free localhost port, builds a Sonic
    worker definition pointing at that same port, and syncs it into
    supervisord. The resulting ``worker_sonic`` state must not be RUNNING.
    """
    from abx_plugins.plugins.search_backend_sonic.daemon import get_sonic_supervisord_worker
    from archivebox.tests.test_orm_helpers import use_archivebox_db
    from archivebox.workers.supervisord_util import (
        get_or_create_supervisord_process,
        get_worker,
        stop_existing_supervisord_process,
        sync_supervisord_workers,
    )

    listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    # Tracks whether supervisord startup was attempted, so teardown only stops
    # it on the paths where the original test would have done so.
    supervisord_attempted = False
    try:
        # All setup lives inside the try so the socket is closed even when
        # binding, worker construction, or the worker assertion fails.
        listener.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        listener.bind(("127.0.0.1", 0))  # port 0: let the OS pick a free port
        listener.listen()
        sonic_port = listener.getsockname()[1]

        worker = get_sonic_supervisord_worker(
            SimpleNamespace(
                DATA_DIR=str(initialized_archive),
                SEARCH_BACKEND_ENGINE="sonic",
                SEARCH_BACKEND_SONIC_HOST_NAME="127.0.0.1",
                SEARCH_BACKEND_SONIC_PORT=sonic_port,
                SEARCH_BACKEND_SONIC_PASSWORD="SecretPassword",
                SONIC_BINARY="sonic",
            ),
        )
        assert worker is not None

        supervisord_attempted = True
        with use_archivebox_db(initialized_archive):
            supervisor = get_or_create_supervisord_process(daemonize=False)
            state = sync_supervisord_workers(supervisor, [(worker, False)], prune=True)
            sonic_state = state["worker_sonic"]
            assert sonic_state["statename"] != "RUNNING", sonic_state
            assert get_worker(supervisor, "worker_sonic")["statename"] != "RUNNING"
    finally:
        # Same teardown order as before: release the port, then stop supervisord.
        listener.close()
        if supervisord_attempted:
            with use_archivebox_db(initialized_archive):
                stop_existing_supervisord_process()
_index in range(2): + proc = run_archivebox_cmd( + ["run", "--daemon"], + cwd=initialized_archive, + env=env, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + start_new_session=True, + wait=False, + ) + procs.append(proc) + started_at = datetime.fromtimestamp(psutil.Process(proc.pid).create_time(), tz=timezone.get_current_timezone()) + with use_archivebox_db(initialized_archive): + Process.objects.create( + machine=Machine.current(), + process_type=Process.TypeChoices.SUPERVISORD, + worker_type="supervisord", + pwd=str(CONSTANTS.DATA_DIR), + cmd=[], + pid=proc.pid, + started_at=started_at, + status=Process.StatusChoices.RUNNING, + ) + + with use_archivebox_db(initialized_archive): + from archivebox.workers.supervisord_util import stop_existing_supervisord_process + + stop_existing_supervisord_process() + + for proc in procs: + proc.wait(timeout=10) + with use_archivebox_db(initialized_archive): + assert not Process.objects.filter( + process_type=Process.TypeChoices.SUPERVISORD, + status=Process.StatusChoices.RUNNING, + pwd=str(CONSTANTS.DATA_DIR), + ).exists() + finally: + for proc in procs: + if proc.poll() is None: + os.killpg(proc.pid, signal.SIGKILL) + + +@pytest.mark.timeout(300) +@pytest.mark.parametrize( + ("stop_signal", "expected_notice"), + [ + (signal.SIGHUP, "Got SIGHUP"), + (signal.SIGINT, "Got SIGINT"), + (signal.SIGTERM, "Got SIGTERM"), + (signal.SIGKILL, None), + ], +) +def test_live_server_signal_exit_and_resume_uses_existing_supervisor_state(initialized_archive, stop_signal, expected_notice): + + env = cli_env(live=True) + port = get_free_port() + server = None + resumed = None + try: + server = start_archivebox_server(initialized_archive, port=port, log_name=f"server-{stop_signal.name}.log", env=env) + server_log = server.log_path + + os.kill(server.pid, stop_signal) + try: + server.wait(timeout=20 if stop_signal != signal.SIGKILL else 5) + except subprocess.TimeoutExpired: + os.kill(server.pid, signal.SIGKILL) + server.wait(timeout=5) 
@pytest.mark.timeout(180)
def test_live_daemonized_server_keeps_supervisord_owned_by_archivebox_parent(initialized_archive):
    """A daemonized server owns supervisord, and killing it tears everything down.

    Starts `archivebox server --daemonize`, locates the server process, its
    supervisord child, and supervisord's watchdog child, then SIGKILLs the
    server and expects both it and supervisord to disappear, leaving no
    processes for the data dir.
    """
    env = cli_env(live=True)
    port = get_free_port()
    host_port = f"127.0.0.1:{port}"

    def is_server(_proc, command):
        # Pad the command with spaces so "server" only matches as a whole word.
        return "archivebox" in command and " server " in f" {command} " and host_port in command

    try:
        daemonize = run_archivebox_cmd(
            ["server", "--daemonize", host_port],
            cwd=initialized_archive,
            env=env,
            timeout=90,
        )
        assert daemonize.returncode == 0, daemonize.stderr or daemonize.stdout
        wait_for_port_open("127.0.0.1", port, timeout=30)

        server_process = wait_for_process(is_server)

        def is_supervisord(proc, command):
            return proc.ppid() == server_process.pid and "supervisord" in command

        supervisord = wait_for_process(is_supervisord)

        def is_watchdog(proc, command):
            return proc.ppid() == supervisord.pid and "supervisord_watchdog" in command

        wait_for_process(is_watchdog)

        os.kill(server_process.pid, signal.SIGKILL)
        wait_for_pid_to_disappear(server_process.pid, timeout=10)
        wait_for_pid_to_disappear(supervisord.pid, timeout=20)
        assert_no_processes_for_data_dir(initialized_archive, timeout=12)
    finally:
        kill_processes_for_data_dir(initialized_archive)
        assert_no_processes_for_data_dir(initialized_archive, timeout=12)
or second_stdout + + stop_archivebox_process(first, signal.SIGTERM) + first = None + assert second.poll() is None, "stopping one DATA_DIR server must not stop another DATA_DIR server" + + first_resumed = start_archivebox_server( + first_data_dir, + port=first_port, + log_name="server-first-data-dir-resumed.log", + env=cli_env(live=True), + ) + assert second.poll() is None, "restarting one DATA_DIR server must not take over another DATA_DIR supervisor" + finally: + for proc in (first, first_resumed, second): + if proc is not None and proc.poll() is None: + stop_archivebox_process(proc, signal.SIGTERM) + kill_processes_for_data_dir(first_data_dir) + kill_processes_for_data_dir(second_data_dir) + assert_no_processes_for_data_dir(first_data_dir, timeout=12) + assert_no_processes_for_data_dir(second_data_dir, timeout=12) diff --git a/archivebox/tests/test_cli_shell.py b/archivebox/tests/test_cli_shell.py new file mode 100644 index 0000000000..a3160c95e8 --- /dev/null +++ b/archivebox/tests/test_cli_shell.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 +""" +Tests for archivebox shell command. +Verify shell command starts Django shell (basic smoke tests only). 
def test_shell_command_exists(initialized_archive):
    """Test that shell command is recognized."""

    help_result = run_archivebox_cmd(["shell", "--help"], timeout=10)

    assert help_result.returncode == 0, help_result.stderr or help_result.stdout

    # The help output must contain a usage line and name the subcommand.
    help_text = help_result.stdout
    for expected in ("usage:", "shell"):
        assert expected in help_text
def test_create_from_crawl_jsonl(self, initialized_archive):
    """Create snapshots from Crawl JSONL input."""
    url = create_test_url()

    # First create a crawl. Check the setup step explicitly so a failure here
    # is reported as such, rather than as an IndexError on the empty output.
    crawl_result = run_archivebox_cmd(
        ["crawl", "create", url],
        cwd=initialized_archive,
        default_cli_env=True,
        disable_extractors=True,
    )
    assert crawl_result.returncode == 0, f"Command failed: {crawl_result.stderr}"
    crawl_records = parse_jsonl_output(crawl_result.stdout)
    assert crawl_records, "crawl create did not emit any JSONL records"
    crawl = crawl_records[0]

    # Pipe crawl to snapshot create
    snapshot_result = run_archivebox_cmd(
        ["snapshot", "create"],
        stdin=json.dumps(crawl),
        cwd=initialized_archive,
        default_cli_env=True,
        disable_extractors=True,
    )
    assert snapshot_result.returncode == 0, f"Command failed: {snapshot_result.stderr}"

    records = parse_jsonl_output(snapshot_result.stdout)
    # Should have the Crawl passed through and the Snapshot created
    types = [r.get("type") for r in records]
    assert "Crawl" in types
    assert "Snapshot" in types

    snapshot = next(r for r in records if r["type"] == "Snapshot")
    assert snapshot["url"] == url
def test_create_multiple_urls(self, initialized_archive):
    """Create snapshots from multiple URLs."""
    requested_urls = [create_test_url() for _ in range(3)]

    result = run_archivebox_cmd(
        ["snapshot", "create", *requested_urls],
        cwd=initialized_archive,
        default_cli_env=True,
        disable_extractors=True,
    )
    assert result.returncode == 0

    records = parse_jsonl_output(result.stdout)
    assert len(records) == 3

    # Every requested URL must appear among the emitted records.
    created_urls = {record["url"] for record in records}
    assert all(requested in created_urls for requested in requested_urls)
def test_list_filter_by_status(self, initialized_archive):
    """Filter snapshots by status."""
    url = create_test_url()
    created = run_archivebox_cmd(
        ["snapshot", "create", url],
        cwd=initialized_archive,
        default_cli_env=True,
        disable_extractors=True,
    )
    # The filter check below means nothing if the snapshot was never created.
    assert created.returncode == 0, f"Command failed: {created.stderr}"

    listed = run_archivebox_cmd(
        ["snapshot", "list", "--status=queued"],
        cwd=initialized_archive,
        default_cli_env=True,
        disable_extractors=True,
    )
    assert listed.returncode == 0, f"Command failed: {listed.stderr}"

    records = parse_jsonl_output(listed.stdout)
    # Guard against a vacuous pass: with no records the per-record check below
    # would succeed without verifying anything.
    # NOTE(review): assumes a freshly created snapshot has status "queued" — confirm.
    assert records, "expected at least one queued snapshot to be listed"
    for record in records:
        assert record["status"] == "queued"
def test_update_status(self, initialized_archive):
    """Update snapshot status."""
    # Options shared by both CLI invocations in this test.
    common = dict(cwd=initialized_archive, default_cli_env=True, disable_extractors=True)

    created = run_archivebox_cmd(["snapshot", "create", create_test_url()], **common)
    snapshot = parse_jsonl_output(created.stdout)[0]

    # Feed the created snapshot record back in on stdin to update it.
    updated = run_archivebox_cmd(
        ["snapshot", "update", "--status=started"],
        stdin=json.dumps(snapshot),
        **common,
    )

    assert updated.returncode == 0
    assert "Updated 1 snapshots" in updated.stderr

    first_record = parse_jsonl_output(updated.stdout)[0]
    assert first_record["status"] == "started"
_cmd_result = run_archivebox_cmd( + ["snapshot", "update", "--tag=new-tag"], + stdin=json.dumps(snapshot), + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + _stdout2, stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0 + assert "Updated 1 snapshots" in stderr + + +class TestSnapshotDelete: + """Tests for `archivebox snapshot delete`.""" + + def test_delete_requires_yes(self, initialized_archive): + """Delete requires --yes flag.""" + url = create_test_url() + _cmd_result = run_archivebox_cmd( + ["snapshot", "create", url], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout1, _, _ = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + snapshot = parse_jsonl_output(stdout1)[0] + + _cmd_result = run_archivebox_cmd( + ["snapshot", "delete"], + stdin=json.dumps(snapshot), + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + _stdout, stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 1 + assert "--yes" in stderr + + def test_delete_with_yes(self, initialized_archive): + """Delete with --yes flag works.""" + url = create_test_url() + _cmd_result = run_archivebox_cmd( + ["snapshot", "create", url], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout1, _, _ = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + snapshot = parse_jsonl_output(stdout1)[0] + + _cmd_result = run_archivebox_cmd( + ["snapshot", "delete", "--yes"], + stdin=json.dumps(snapshot), + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + _stdout, stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0 + assert "Deleted 1 snapshots" in stderr + + def test_delete_dry_run(self, initialized_archive): + """Dry run shows what would be deleted.""" + url = create_test_url() + 
_cmd_result = run_archivebox_cmd( + ["snapshot", "create", url], + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + stdout1, _, _ = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + snapshot = parse_jsonl_output(stdout1)[0] + + _cmd_result = run_archivebox_cmd( + ["snapshot", "delete", "--dry-run"], + stdin=json.dumps(snapshot), + cwd=initialized_archive, + default_cli_env=True, + disable_extractors=True, + ) + _stdout, stderr, code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert code == 0 + assert "Would delete" in stderr + + +def test_snapshot_creates_snapshot_with_correct_url(tmp_path, initialized_archive): + """Test that snapshot stores the exact URL in the database.""" + env = cli_env(disable_extractors=True) + + run_archivebox_cmd( + ["snapshot", "create", "https://example.com"], + cwd=tmp_path, + env=env, + ) + + with use_archivebox_db(tmp_path): + snapshot = Snapshot.objects.select_related("crawl__created_by").get(url="https://example.com") + username = snapshot.crawl.created_by.username + + # Verify the crawl tree contains a relative symlink to the user-scoped snapshot output. 
+ snapshots_root = tmp_path / "archive" / "users" / username / "snapshots" + crawl_root = tmp_path / "archive" / "users" / username / "crawls" + symlinks = [p for p in crawl_root.rglob("*") if p.is_symlink() and p.resolve().is_dir() and p.resolve().is_relative_to(snapshots_root)] + assert symlinks, "Snapshot symlink should exist under crawl dir" + link_path = symlinks[0] + + assert link_path.is_symlink(), "Snapshot symlink should exist under crawl dir" + link_target = os.readlink(link_path) + assert not os.path.isabs(link_target), "Symlink should be relative" + + +def test_snapshot_multiple_urls_creates_multiple_records(tmp_path, initialized_archive): + """Test that multiple URLs each get their own snapshot record.""" + env = cli_env(disable_extractors=True) + + run_archivebox_cmd( + [ + "snapshot", + "create", + "https://example.com", + "https://iana.org", + ], + cwd=tmp_path, + env=env, + ) + + with use_archivebox_db(tmp_path): + urls = list(Snapshot.objects.order_by("url").values_list("url", flat=True)) + + assert "https://example.com" in urls + assert "https://iana.org" in urls + assert len(urls) >= 2 + + +def test_snapshot_tag_creates_tag_and_links_to_snapshot(tmp_path, initialized_archive): + """Test that --tag creates tag record and links it to the snapshot.""" + env = cli_env(disable_extractors=True) + + run_archivebox_cmd( + [ + "snapshot", + "create", + "--tag=mytesttag", + "https://example.com", + ], + cwd=tmp_path, + env=env, + ) + + with use_archivebox_db(tmp_path): + tag = Tag.objects.filter(name="mytesttag").first() + assert tag is not None, "Tag 'mytesttag' should exist in core_tag" + snapshot = Snapshot.objects.filter(url="https://example.com").first() + assert snapshot is not None + assert snapshot.tags.filter(pk=tag.pk).exists(), "Tag should be linked to snapshot via core_snapshot_tags" + + +def test_snapshot_jsonl_output_has_correct_structure(tmp_path, initialized_archive): + """Test that JSONL output contains required fields with correct 
types.""" + env = cli_env(disable_extractors=True) + + # Pass URL as argument instead of stdin for more reliable behavior + result = run_archivebox_cmd( + ["snapshot", "create", "https://example.com"], + cwd=tmp_path, + env=env, + ) + + # Parse JSONL output lines + records = Process.parse_records_from_text(result.stdout) + snapshot_records = [r for r in records if r.get("type") == "Snapshot"] + + assert len(snapshot_records) >= 1, "Should output at least one Snapshot JSONL record" + + record = snapshot_records[0] + assert record.get("type") == "Snapshot" + assert "id" in record, "Snapshot record should have 'id' field" + assert "url" in record, "Snapshot record should have 'url' field" + assert record["url"] == "https://example.com" + + +def test_snapshot_with_tag_stores_tag_name(tmp_path, initialized_archive): + """Test that the tag name is stored when provided via the --tag option.""" + env = cli_env(disable_extractors=True) + + # Use command line args instead of stdin + run_archivebox_cmd( + ["snapshot", "create", "--tag=customtag", "https://example.com"], + cwd=tmp_path, + env=env, + ) + + with use_archivebox_db(tmp_path): + tag = Tag.objects.filter(name="customtag").first() + + assert tag is not None + assert tag.name == "customtag" + + +def test_snapshot_with_depth_sets_snapshot_depth(tmp_path, initialized_archive): + """Test that --depth sets snapshot depth when creating snapshots.""" + env = cli_env(disable_extractors=True) + + run_archivebox_cmd( + [ + "snapshot", + "create", + "--depth=1", + "https://example.com", + ], + cwd=tmp_path, + env=env, + ) + + with use_archivebox_db(tmp_path): + snapshot = Snapshot.objects.order_by("-created_at").first() + + assert snapshot is not None, "Snapshot should be created when depth is provided" + assert snapshot.depth == 1, "Snapshot depth should match --depth value" + + +def test_snapshot_allows_duplicate_urls_across_crawls(tmp_path, initialized_archive): + """Snapshot create auto-creates a crawl per run; same URL can appear
multiple times.""" + env = cli_env(disable_extractors=True) + + # Add same URL twice + run_archivebox_cmd( + ["snapshot", "create", "https://example.com"], + cwd=tmp_path, + env=env, + ) + run_archivebox_cmd( + ["snapshot", "create", "https://example.com"], + cwd=tmp_path, + env=env, + ) + + with use_archivebox_db(tmp_path): + count = Snapshot.objects.filter(url="https://example.com").count() + + assert count == 2, "Same URL should create separate snapshots across different crawls" diff --git a/archivebox/tests/test_cli_status.py b/archivebox/tests/test_cli_status.py new file mode 100644 index 0000000000..7859f23f1c --- /dev/null +++ b/archivebox/tests/test_cli_status.py @@ -0,0 +1,182 @@ +#!/usr/bin/env python3 +""" +Comprehensive tests for archivebox status command. +Verify status reports accurate collection state from DB and filesystem. +""" + +import pytest + +from archivebox.core.models import Snapshot +from archivebox.tests.conftest import find_snapshot_dir, run_archivebox_cmd, cli_env + +from archivebox.tests.test_orm_helpers import use_archivebox_db + +pytestmark = pytest.mark.django_db(transaction=True) + + +def _create_snapshot_rows(initialized_archive, env, *urls): + result = run_archivebox_cmd( + ["snapshot", "create", *urls], + cwd=initialized_archive, + env=env, + check=True, + ) + return result + + +def test_status_runs_successfully(initialized_archive): + """Test that status command runs without error.""" + result = run_archivebox_cmd(["status"], cwd=initialized_archive) + + assert result.returncode == 0 + assert len(result.stdout) > 100 + + +def test_status_shows_zero_snapshots_in_empty_archive(initialized_archive): + """Test status shows 0 snapshots in empty archive.""" + result = run_archivebox_cmd(["status"], cwd=initialized_archive) + + output = result.stdout + # Should indicate empty/zero state + assert "0" in output + + +def test_status_shows_correct_snapshot_count(initialized_archive): + """Test that status shows accurate snapshot count from 
DB.""" + env = cli_env(disable_extractors=True) + + _create_snapshot_rows(initialized_archive, env, "https://example.com", "https://example.org", "https://example.net") + + result = run_archivebox_cmd(["status"], cwd=initialized_archive) + + # Verify DB has 3 snapshots + with use_archivebox_db(initialized_archive): + db_count = Snapshot.objects.count() + + assert db_count == 3 + # Status output should show 3 + assert "3" in result.stdout + + +def test_status_shows_archived_count(initialized_archive): + """Test status distinguishes archived vs unarchived snapshots.""" + env = cli_env(disable_extractors=True) + + _create_snapshot_rows(initialized_archive, env, "https://example.com") + + result = run_archivebox_cmd(["status"], cwd=initialized_archive) + + # Should show archived/unarchived categories + assert "archived" in result.stdout.lower() or "queued" in result.stdout.lower() + + +def test_status_shows_archive_directory_size(initialized_archive): + """Test status reports archive directory size.""" + result = run_archivebox_cmd(["status"], cwd=initialized_archive) + + output = result.stdout + # Should show size info + assert "Size" in output or "size" in output + + +def test_status_counts_archive_directories(initialized_archive): + """Test status counts directories in archive/ folder.""" + env = cli_env(disable_extractors=True) + + _create_snapshot_rows(initialized_archive, env, "https://example.com") + + result = run_archivebox_cmd(["status"], cwd=initialized_archive) + + # Should show directory count + assert "present" in result.stdout.lower() or "directories" in result.stdout + + +def test_status_detects_orphaned_directories(initialized_archive): + """Test status detects directories not in DB (orphaned).""" + env = cli_env(disable_extractors=True) + + _create_snapshot_rows(initialized_archive, env, "https://example.com") + + # Create an orphaned directory + (initialized_archive / "archive" / "fake_orphaned_dir").mkdir(parents=True, exist_ok=True) + + result = 
run_archivebox_cmd(["status"], cwd=initialized_archive) + + # Should mention orphaned dirs + assert "orphan" in result.stdout.lower() or "1" in result.stdout + + +def test_status_counts_new_snapshot_output_dirs_as_archived(initialized_archive): + """Test status reads archived/present counts from the current snapshot output layout.""" + env = cli_env(disable_extractors=True) + env = env.copy() + env["ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS"] = "true" + + _create_snapshot_rows(initialized_archive, env, "https://example.com") + + with use_archivebox_db(initialized_archive): + snapshot_id = Snapshot.objects.values_list("id", flat=True).get(url="https://example.com") + + snapshot_dir = find_snapshot_dir(initialized_archive, str(snapshot_id)) + assert snapshot_dir is not None, f"Snapshot output directory not found for {snapshot_id}" + title_dir = snapshot_dir / "title" + title_dir.mkdir(parents=True, exist_ok=True) + (title_dir / "title.txt").write_text("Example Domain") + + result = run_archivebox_cmd(["status"], cwd=initialized_archive, env=env) + + assert result.returncode == 0, result.stdout + result.stderr + assert "archived: 1" in result.stdout + assert "present: 1" in result.stdout + + +def test_status_shows_user_info(initialized_archive): + """Test status shows user/login information.""" + result = run_archivebox_cmd(["status"], cwd=initialized_archive) + + output = result.stdout + # Should show user section + assert "user" in output.lower() or "login" in output.lower() + + +def test_status_reads_from_db_not_filesystem(initialized_archive): + """Test that status uses DB as source of truth, not filesystem.""" + env = cli_env(disable_extractors=True) + + _create_snapshot_rows(initialized_archive, env, "https://example.com") + + # Verify DB has snapshot + with use_archivebox_db(initialized_archive): + db_count = Snapshot.objects.count() + + assert db_count == 1 + + # Status should reflect DB count + result = run_archivebox_cmd(["status"], cwd=initialized_archive) + assert 
"1" in result.stdout + + +def test_status_shows_index_file_info(initialized_archive): + """Test status shows index file information.""" + result = run_archivebox_cmd(["status"], cwd=initialized_archive) + + # Should mention index + assert "index" in result.stdout.lower() or "Index" in result.stdout + + +def test_status_help_lists_available_options(initialized_archive): + """Test that status --help works and documents the command.""" + result = run_archivebox_cmd( + ["status", "--help"], + cwd=initialized_archive, + ) + + assert result.returncode == 0 + assert "status" in result.stdout.lower() or "statistic" in result.stdout.lower() + + +def test_status_shows_data_directory_path(initialized_archive): + """Test that status reports which collection directory it is inspecting.""" + result = run_archivebox_cmd(["status"], cwd=initialized_archive) + + assert "archive" in result.stdout.lower() or str(initialized_archive) in result.stdout diff --git a/archivebox/tests/test_cli_tag.py b/archivebox/tests/test_cli_tag.py new file mode 100644 index 0000000000..2af1f78d2c --- /dev/null +++ b/archivebox/tests/test_cli_tag.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 +""" +Tests for archivebox tag command. + +TODO: expand beyond command discovery into create/list/update/delete behavior. +""" + +from archivebox.tests.conftest import run_archivebox_cmd + + +def test_tag_help_runs_successfully(tmp_path): + """The tag command should be registered and expose help.""" + + result = run_archivebox_cmd(["tag", "--help"]) + + assert result.returncode == 0 + assert "tag" in result.stdout.lower() + assert "list" in result.stdout diff --git a/archivebox/tests/test_cli_update.py b/archivebox/tests/test_cli_update.py new file mode 100644 index 0000000000..de625aaa47 --- /dev/null +++ b/archivebox/tests/test_cli_update.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python3 +""" +Comprehensive tests for archivebox update command. +Verify update drains old dirs, reconciles DB, and queues snapshots. 
+""" + +import pytest + +from archivebox.core.models import Snapshot +from archivebox.tests.conftest import run_queued_crawls, run_archivebox_cmd, cli_env + +from archivebox.tests.test_orm_helpers import use_archivebox_db + +pytestmark = pytest.mark.django_db(transaction=True) + + +def test_update_runs_successfully_on_empty_archive(initialized_archive): + """Test that update runs without error on empty archive.""" + result = run_archivebox_cmd( + ["update"], + timeout=120, + ) + output = result.stdout + result.stderr + + assert result.returncode == 0, output + assert "Phase 1: Draining old archive/ directories" in output + assert "Phase 2: Processing all database snapshots" in output + assert "Updated DB rows: 0" in output + assert "Sealed crawls: 0" in output + + with use_archivebox_db(initialized_archive): + assert Snapshot.objects.count() == 0 + + +def test_update_reconciles_existing_snapshots(initialized_archive): + """Test that update command reconciles existing snapshots.""" + env = cli_env(disable_extractors=True) + + # Add a snapshot (index-only for faster test) + run_archivebox_cmd( + ["add", "--index-only", "--depth=0", "https://example.com"], + env=env, + ) + run_queued_crawls(initialized_archive, env) + + # Run update - should reconcile and queue + result = run_archivebox_cmd( + ["update"], + env=env, + timeout=120, + ) + + output = result.stdout + result.stderr + assert result.returncode == 0, output + assert "Phase 2: Processing all database snapshots" in output + assert "Updated DB rows:" in output + + with use_archivebox_db(initialized_archive): + assert Snapshot.objects.filter(url="https://example.com", status="sealed").count() == 1 + + +def test_update_specific_snapshot_by_filter(initialized_archive): + """Test updating specific snapshot using filter.""" + env = cli_env(disable_extractors=True) + + # Add multiple snapshots + run_archivebox_cmd( + ["add", "--index-only", "--depth=0", "https://example.com"], + env=env, + timeout=90, + ) + 
run_archivebox_cmd( + ["add", "--index-only", "--depth=0", "https://example.org"], + env=env, + timeout=90, + ) + run_queued_crawls(initialized_archive, env) + + # Update with filter pattern (uses filter_patterns argument) + result = run_archivebox_cmd( + ["update", "--filter-type=substring", "example.com"], + env=env, + timeout=120, + ) + + output = result.stdout + result.stderr + assert result.returncode == 0, output + assert "Processing filtered snapshots from database" in output + assert "Found 1 matching snapshots" in output + + with use_archivebox_db(initialized_archive): + assert Snapshot.objects.filter(url="https://example.com", status="sealed").count() == 1 + assert Snapshot.objects.filter(url="https://example.org", status="sealed").count() == 1 + + +def test_update_preserves_snapshot_count(initialized_archive): + """Test that update doesn't change snapshot count.""" + env = cli_env(disable_extractors=True) + + # Add snapshots + run_archivebox_cmd( + ["add", "--index-only", "--depth=0", "https://example.com"], + env=env, + timeout=90, + ) + run_queued_crawls(initialized_archive, env) + + # Count before update + with use_archivebox_db(initialized_archive): + count_before = Snapshot.objects.count() + + assert count_before == 1 + + # Run update (should reconcile + queue, not create new snapshots) + run_archivebox_cmd( + ["update"], + env=env, + timeout=120, + check=True, + ) + + # Count after update + with use_archivebox_db(initialized_archive): + count_after = Snapshot.objects.count() + + # Snapshot count should remain the same + assert count_after == count_before + + +def test_update_seals_migrated_snapshots(initialized_archive): + """Test that full update reconciles migrated snapshots without re-queuing them.""" + env = cli_env(disable_extractors=True) + + run_archivebox_cmd( + ["add", "--index-only", "--depth=0", "https://example.com"], + env=env, + timeout=90, + ) + run_queued_crawls(initialized_archive, env) + + # Run update + result = 
run_archivebox_cmd( + ["update"], + env=env, + timeout=120, + ) + + output = result.stdout + result.stderr + assert result.returncode == 0, output + assert "No queued/interrupted crawl work found" in output + + # Check that snapshot remains archived instead of being queued for a full re-crawl. + with use_archivebox_db(initialized_archive): + status = Snapshot.objects.values_list("status", flat=True).get() + + assert status == "sealed" diff --git a/archivebox/tests/test_cli_update_reindex_snapshots.py b/archivebox/tests/test_cli_update_reindex_snapshots.py new file mode 100644 index 0000000000..411e9be2ff --- /dev/null +++ b/archivebox/tests/test_cli_update_reindex_snapshots.py @@ -0,0 +1,396 @@ +import json +import os +from datetime import datetime, timedelta +from archivebox.tests.conftest import run_archivebox_cmd, cli_env + +import pytest +from django.utils import timezone + +from archivebox.core.models import Snapshot +from archivebox.tests.test_orm_helpers import use_archivebox_db + +pytestmark = pytest.mark.django_db(transaction=True) + + +def test_update_imports_orphaned_snapshots(tmp_path, initialized_archive): + """Test that archivebox update imports real legacy archive directories.""" + env = cli_env(disable_extractors=True) + legacy_timestamp = "1710000000" + legacy_dir = tmp_path / "archive" / legacy_timestamp + legacy_dir.mkdir(parents=True, exist_ok=True) + (legacy_dir / "singlefile.html").write_text("example") + (legacy_dir / "index.json").write_text( + json.dumps( + { + "url": "https://example.com", + "timestamp": legacy_timestamp, + "title": "Example Domain", + "fs_version": "0.8.0", + "archive_results": [], + }, + ), + ) + + # Run the migration phase only; default update also runs queued crawl work. 
+ update_process = run_archivebox_cmd( + ["update", "--migrate-only"], + env=env, + timeout=60, + ) + assert update_process.returncode == 0, update_process.stderr + + with use_archivebox_db(tmp_path): + row = Snapshot.objects.values_list("url", "fs_version").get() + + assert row == ("https://example.com", Snapshot._fs_current_version()) + assert legacy_dir.is_symlink() + + migrated_dir = legacy_dir.resolve() + assert migrated_dir.exists() + assert (migrated_dir / "index.jsonl").exists() + assert (migrated_dir / "singlefile.html").exists() + + +@pytest.mark.django_db(transaction=True) +def test_reindex_snapshots_resets_existing_search_results_and_reruns_requested_plugins(): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.cli.archivebox_update import reindex_snapshots + from archivebox.core.models import ArchiveResult, Snapshot + from archivebox.crawls.models import Crawl + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.SEALED, + ) + result = ArchiveResult.objects.create( + snapshot=snapshot, + plugin="search_backend_sqlite", + hook_name="on_Snapshot__90_index_sqlite", + status=ArchiveResult.StatusChoices.SUCCEEDED, + output_str="old index hit", + output_json={"indexed": True}, + output_files={"search.sqlite3": {"size": 123}}, + output_size=123, + ) + output_dir = snapshot.output_dir + (output_dir / "title").mkdir(parents=True, exist_ok=True) + (output_dir / "title" / "title.txt").write_text("Example Domain") + (output_dir / "dom").mkdir(parents=True, exist_ok=True) + (output_dir / "dom" / "output.html").write_text("Example searchable text") + + original_engine = os.environ.get("SEARCH_BACKEND_ENGINE") + os.environ["SEARCH_BACKEND_ENGINE"] = "sqlite" + try: + stats = reindex_snapshots( + Snapshot.objects.filter(id=snapshot.id), + 
search_plugins=["search_backend_sqlite"], + batch_size=10, + ) + finally: + if original_engine is None: + os.environ.pop("SEARCH_BACKEND_ENGINE", None) + else: + os.environ["SEARCH_BACKEND_ENGINE"] = original_engine + + result.refresh_from_db() + + assert stats["processed"] == 1 + assert stats["queued"] == 1 + assert stats["reindexed"] == 0 + assert result.status == ArchiveResult.StatusChoices.QUEUED + assert result.output_str == "" + assert result.output_json is None + + +@pytest.mark.django_db +def test_build_filtered_snapshots_queryset_respects_resume_cutoff(): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.cli.archivebox_update import _build_filtered_snapshots_queryset + from archivebox.core.models import Snapshot + from archivebox.crawls.models import Crawl + + crawl = Crawl.objects.create( + urls="https://example.com\nhttps://example.org\nhttps://example.net", + created_by_id=get_or_create_system_user_pk(), + ) + base = timezone.make_aware(datetime(2026, 3, 23, 12, 0, 0)) + older = Snapshot.objects.create( + url="https://example.net", + crawl=crawl, + bookmarked_at=base - timedelta(hours=2), + ) + middle = Snapshot.objects.create( + url="https://example.org", + crawl=crawl, + bookmarked_at=base - timedelta(hours=1), + ) + newer = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + bookmarked_at=base, + ) + + snapshots = list( + _build_filtered_snapshots_queryset( + filter_patterns=(), + filter_type="exact", + before=None, + after=None, + resume=middle.timestamp, + ).values_list("id", flat=True), + ) + + assert str(newer.id) not in {str(snapshot_id) for snapshot_id in snapshots} + assert set(map(str, snapshots)) == {str(middle.id), str(older.id)} + + +@pytest.mark.django_db +def test_build_filtered_snapshots_queryset_accepts_list_style_filters(): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.cli.archivebox_update import _build_filtered_snapshots_queryset 
+ from archivebox.core.models import Snapshot, Tag + from archivebox.crawls.models import Crawl + + crawl = Crawl.objects.create( + urls="https://example.com\nhttps://example.org", + created_by_id=get_or_create_system_user_pk(), + ) + tagged = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + title="Example Domain", + status=Snapshot.StatusChoices.SEALED, + ) + Snapshot.objects.create( + url="https://example.org", + crawl=crawl, + title="Other Example", + status=Snapshot.StatusChoices.QUEUED, + ) + tagged.tags.add(Tag.objects.create(name="keep")) + + snapshots = list( + _build_filtered_snapshots_queryset( + filter_patterns=(), + filter_type="exact", + status=Snapshot.StatusChoices.SEALED, + url__icontains="example", + tag="keep", + crawl_id=str(crawl.id), + limit=1, + sort="url", + ).values_list("id", flat=True), + ) + + assert snapshots == [tagged.id] + + +@pytest.mark.django_db +def test_reconcile_with_index_json_tolerates_null_title(tmp_path): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.core.models import Snapshot + from archivebox.crawls.models import Crawl + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + title="Example Domain", + status=Snapshot.StatusChoices.SEALED, + ) + output_dir = snapshot.output_dir + output_dir.mkdir(parents=True, exist_ok=True) + (output_dir / "index.json").write_text( + json.dumps( + { + "url": snapshot.url, + "timestamp": snapshot.timestamp, + "title": None, + "archive_results": [], + }, + ), + ) + + snapshot.reconcile_with_index_json() + snapshot.refresh_from_db() + + assert snapshot.title == "Example Domain" + + +@pytest.mark.django_db +def test_reconcile_with_index_json_imports_legacy_archive_results_and_process(tmp_path): + from archivebox.base_models.models import get_or_create_system_user_pk + from 
archivebox.core.models import ArchiveResult, Snapshot + from archivebox.crawls.models import Crawl + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.SEALED, + ) + output_dir = snapshot.output_dir + output_dir.mkdir(parents=True, exist_ok=True) + (output_dir / "index.json").write_text( + json.dumps( + { + "url": snapshot.url, + "timestamp": snapshot.timestamp, + "title": "Example Domain", + "archive_results": [ + { + "plugin": "screenshot", + "status": "succeeded", + "output": "screenshot.png", + "output_files": {"screenshot.png": {"size": 3}}, + "output_size": 3, + "start_ts": "2024-01-01T00:00:00+00:00", + "end_ts": "2024-01-01T00:00:01+00:00", + "cmd": ["screenshot", snapshot.url], + "pwd": str(output_dir / "screenshot"), + }, + ], + }, + ), + ) + + snapshot.reconcile_with_index_json() + + result = ArchiveResult.objects.get(snapshot=snapshot, plugin="screenshot") + assert result.status == ArchiveResult.StatusChoices.SUCCEEDED + assert result.output_str == "screenshot.png" + assert result.output_files == {"screenshot.png": {"extension": "png", "mimetype": "image/png", "size": 3}} + assert result.process is not None + assert result.cmd == ["screenshot", snapshot.url] + assert result.pwd == str(output_dir / "screenshot") + assert (output_dir / "index.json").exists() is False + jsonl_text = (output_dir / "index.jsonl").read_text() + assert '"type": "ArchiveResult"' in jsonl_text + assert '"type": "Process"' in jsonl_text + + +@pytest.mark.django_db +def test_reconcile_with_index_json_merges_retried_archive_results(tmp_path): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.core.models import ArchiveResult, Snapshot + from archivebox.crawls.models import Crawl + + crawl = Crawl.objects.create( + urls="https://example.com", + 
created_by_id=get_or_create_system_user_pk(), + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.SEALED, + ) + output_dir = snapshot.output_dir + output_dir.mkdir(parents=True, exist_ok=True) + (output_dir / "index.json").write_text( + json.dumps( + { + "url": snapshot.url, + "timestamp": snapshot.timestamp, + "title": "Example Domain", + "archive_results": [ + { + "plugin": "dom", + "hook_name": "on_Snapshot__12_dom.js", + "status": "failed", + "output": "first attempt failed", + "start_ts": "2024-01-01T00:00:00+00:00", + "end_ts": "2024-01-01T00:00:01+00:00", + }, + { + "plugin": "dom", + "hook_name": "on_Snapshot__12_dom.js", + "status": "succeeded", + "output": "dom/output.html", + "output_files": {"output.html": {"size": 42}}, + "output_size": 42, + "start_ts": "2024-01-01T00:01:00+00:00", + "end_ts": "2024-01-01T00:01:01+00:00", + }, + ], + }, + ), + ) + + snapshot.reconcile_with_index_json() + + result = ArchiveResult.objects.get(snapshot=snapshot, plugin="dom", hook_name="on_Snapshot__12_dom.js") + assert ArchiveResult.objects.filter(snapshot=snapshot, plugin="dom", hook_name="on_Snapshot__12_dom.js").count() == 1 + assert result.status == ArchiveResult.StatusChoices.SUCCEEDED + assert result.output_str == "dom/output.html" + assert result.output_size == 42 + + +@pytest.mark.django_db +def test_reconcile_with_index_json_trusts_legacy_archive_results(tmp_path): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.core.models import ArchiveResult, Snapshot + from archivebox.crawls.models import Crawl + + crawl = Crawl.objects.create( + urls="https://example.com/page", + created_by_id=get_or_create_system_user_pk(), + ) + snapshot = Snapshot.objects.create( + url="https://example.com/page", + crawl=crawl, + status=Snapshot.StatusChoices.SEALED, + ) + output_dir = snapshot.output_dir + output_dir.mkdir(parents=True, exist_ok=True) + (output_dir / 
"screenshot.png").write_bytes(b"png") + (output_dir / "output.html").write_text("root wget output") + (output_dir / "example.com").mkdir() + (output_dir / "example.com" / "page.html").write_text("mirror output") + (output_dir / "cdn.example.com").mkdir() + (output_dir / "cdn.example.com" / "asset.js").write_text("console.log('asset')") + (output_dir / "index.json").write_text( + json.dumps( + { + "url": snapshot.url, + "timestamp": snapshot.timestamp, + "title": "Example Domain", + "archive_results": [ + { + "plugin": "screenshot", + "status": "succeded", + "output": "screenshot.png", + "start_ts": "2024-01-01T00:00:00+00:00", + "end_ts": "2024-01-01T00:00:01+00:00", + }, + { + "plugin": "wget", + "status": "succeeded", + "output": "example.com/page.html", + "start_ts": "2024-01-01T00:00:02+00:00", + "end_ts": "2024-01-01T00:00:03+00:00", + }, + ], + }, + ), + ) + + snapshot.reconcile_with_index_json() + + results = {result.plugin: result for result in ArchiveResult.objects.filter(snapshot=snapshot)} + assert set(results) == {"screenshot", "wget"} + assert results["screenshot"].status == ArchiveResult.StatusChoices.SUCCEEDED + assert results["screenshot"].output_str == "screenshot.png" + assert results["screenshot"].output_files == {} + assert results["wget"].output_str == "example.com/page.html" + assert results["wget"].output_files == {} diff --git a/archivebox/tests/test_cli_version.py b/archivebox/tests/test_cli_version.py new file mode 100644 index 0000000000..33abaa12b0 --- /dev/null +++ b/archivebox/tests/test_cli_version.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python3 +""" +Tests for archivebox version command. +Verify version output and system information reporting. 
+""" + +import os +import re +import tempfile +from pathlib import Path +from archivebox.config.paths import tmp_dir_socket_path_is_short_enough +from archivebox.tests.conftest import run_archivebox_cmd + + +def _make_deep_collection_dir(tmp_path: Path) -> Path: + deep_dir = tmp_path / "deep-collection" + for idx in range(6): + deep_dir /= f"segment-{idx}-1234567890abcdef" + deep_dir.mkdir(parents=True) + return deep_dir + + +def _extract_location_path(output: str, key: str) -> Path: + for line in output.splitlines(): + if key not in line: + continue + columns = [column for column in re.split(r"\s{2,}", line.strip()) if column] + if len(columns) >= 5 and columns[1] == key: + return Path(os.path.expanduser(columns[-1])) + raise AssertionError(f"Did not find a {key} location line in output:\n{output}") + + +def test_version_quiet_outputs_version_number(tmp_path): + """Test that version --quiet outputs just the version number.""" + result = run_archivebox_cmd(["version", "--quiet"]) + + assert result.returncode == 0 + version = result.stdout.strip() + assert version + # Version should be semver-ish format (e.g., 0.8.0) + parts = version.split(".") + assert len(parts) >= 2 + + +def test_version_flag_outputs_version_number(tmp_path): + """Test that top-level --version reports the package version.""" + result = run_archivebox_cmd(["--version"]) + + assert result.returncode == 0 + version = result.stdout.strip() + assert version + assert len(version.split(".")) >= 2 + + +def test_version_shows_system_info_in_initialized_dir(tmp_path, initialized_archive): + """Test that version shows system metadata in initialized directory.""" + result = run_archivebox_cmd(["version"]) + + output = result.stdout + assert "ArchiveBox" in output + # Should show system info + assert any(x in output for x in ["ARCH=", "OS=", "PYTHON="]) + + +def test_version_shows_binaries_after_init(tmp_path, initialized_archive): + """Test that version shows binary dependencies in initialized directory.""" 
+ result = run_archivebox_cmd(["version"]) + + output = result.stdout + # Should show binary section + assert "Binary" in output or "Dependencies" in output + + +def test_version_shows_data_locations(tmp_path, initialized_archive): + """Test that version shows data directory locations.""" + result = run_archivebox_cmd(["version"]) + + output = result.stdout + # Should show paths + assert any(x in output for x in ["Data", "Code", "location"]) + + +def test_version_in_uninitialized_dir_still_works(tmp_path): + """Test that version command works even without initialized data dir.""" + empty_dir = tmp_path / "empty" + empty_dir.mkdir() + + result = run_archivebox_cmd(["version", "--quiet"], cwd=empty_dir) + + # Should still output version + assert result.returncode == 0 + assert len(result.stdout.strip()) > 0 + + +def test_version_auto_selects_short_tmp_dir_for_deep_collection_path(tmp_path): + """Test the real CLI init/version flow auto-selects a short TMP_DIR outside deep collections.""" + data_dir = _make_deep_collection_dir(tmp_path) + default_tmp_dir = data_dir / "tmp" + extra_env = {"ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS": "true"} + + with tempfile.TemporaryDirectory(prefix="abx-home-") as home_tmp: + home_dir = Path(home_tmp) + env = { + "HOME": str(home_dir), + "USE_COLOR": "False", + "SHOW_PROGRESS": "False", + **extra_env, + } + + init_result = run_archivebox_cmd(["init", "--quick"], cwd=data_dir, env=env, timeout=180) + assert init_result.returncode == 0, init_result.stdout + init_result.stderr + + version_result = run_archivebox_cmd(["version"], cwd=data_dir, env=env, timeout=180) + output = version_result.stdout + version_result.stderr + + assert version_result.returncode == 0, output + assert "ArchiveBox" in output + assert "TMP_DIR" in output + assert "Error with configured TMP_DIR" not in output + + reported_tmp_dir = _extract_location_path(output, "TMP_DIR") + if not reported_tmp_dir.is_absolute(): + reported_tmp_dir = (data_dir / 
reported_tmp_dir).resolve() + + assert reported_tmp_dir.exists() + assert not reported_tmp_dir.is_relative_to(default_tmp_dir) + assert tmp_dir_socket_path_is_short_enough(reported_tmp_dir) + + +def test_version_help_lists_quiet_flag(tmp_path): + """Test that version --help documents the quiet output mode.""" + result = run_archivebox_cmd(["version", "--help"]) + + assert result.returncode == 0 + assert "--quiet" in result.stdout or "-q" in result.stdout + + +def test_version_invalid_option_fails(tmp_path): + """Test that invalid version options fail cleanly.""" + result = run_archivebox_cmd(["version", "--invalid-option"]) + + assert result.returncode != 0 diff --git a/archivebox/tests/test_config_DELETE_AFTER.py b/archivebox/tests/test_config_DELETE_AFTER.py new file mode 100644 index 0000000000..d01997da88 --- /dev/null +++ b/archivebox/tests/test_config_DELETE_AFTER.py @@ -0,0 +1,261 @@ +import json +from pathlib import Path + +import pytest +from django.contrib.auth import get_user_model +from django.urls import reverse + +from archivebox.tests.conftest import run_archivebox_cmd, run_queued_crawls, cli_env + + +pytestmark = pytest.mark.django_db(transaction=True) + +ADMIN_HOST = "admin.archivebox.localhost:8000" +API_HOST = "api.archivebox.localhost:8000" + + +def test_delete_after_real_cli_and_orchestrator_paths_cover_all_retained_models(tmp_path): + env = cli_env(disable_extractors=True) + _cmd_result = run_archivebox_cmd(["init", "--quick"], cwd=tmp_path, timeout=90) + stdout, stderr, returncode = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + assert returncode == 0, stderr + + run_env = { + **env, + "DELETE_AFTER": "1hr", + "USE_COLOR": "False", + "SHOW_PROGRESS": "False", + } + url = "https://example.com/delete-after-cli" + _cmd_result = run_archivebox_cmd( + ["add", "--index-only", "--depth=0", url], + cwd=tmp_path, + timeout=120, + env=run_env, + ) + stdout, stderr, returncode = _cmd_result.stdout, _cmd_result.stderr, 
_cmd_result.returncode + assert returncode == 0, f"archivebox add failed:\nSTDOUT:\n{stdout}\nSTDERR:\n{stderr}" + run_queued_crawls(tmp_path, run_env) + + lookup_script = f""" +import json +from archivebox.core.models import Snapshot +snapshot = Snapshot.objects.select_related("crawl").get(url={url!r}) +print(json.dumps({{ + "crawl_id": str(snapshot.crawl.id), + "crawl_delete_at": bool(snapshot.crawl.delete_at), + "snapshot_id": str(snapshot.id), + "snapshot_delete_at": bool(snapshot.delete_at), +}})) +""" + _cmd_result = run_archivebox_cmd( + ["manage", "shell", "-c", lookup_script], + cwd=tmp_path, + timeout=90, + env=run_env, + ) + stdout, stderr, returncode = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + assert returncode == 0, f"retention lookup failed:\nSTDOUT:\n{stdout}\nSTDERR:\n{stderr}" + created = json.loads(stdout.strip().splitlines()[-1]) + assert created["crawl_delete_at"] + assert created["snapshot_delete_at"] + + setup_script = f""" +import json +from pathlib import Path +from datetime import timedelta +from django.utils import timezone +from archivebox.core.models import ArchiveResult, Snapshot +from archivebox.machine.models import Machine, NetworkInterface, Process + +snapshot = Snapshot.objects.select_related("crawl").get(id="{created["snapshot_id"]}") +crawl = snapshot.crawl +if snapshot.status == snapshot.StatusChoices.QUEUED: + snapshot.sm.tick() + snapshot.refresh_from_db() +if snapshot.status == snapshot.StatusChoices.STARTED: + snapshot.sm.seal() + snapshot.refresh_from_db() +crawl.refresh_from_db() +if crawl.status == crawl.StatusChoices.QUEUED: + crawl.sm.tick() + crawl.refresh_from_db() +if crawl.status == crawl.StatusChoices.STARTED: + crawl.sm.seal() + crawl.refresh_from_db() +if snapshot.status != snapshot.StatusChoices.SEALED or crawl.status != crawl.StatusChoices.SEALED: + raise RuntimeError(f"expected sealed snapshot/crawl, got {{snapshot.status}}/{{crawl.status}}") + +Path(crawl.output_dir).mkdir(parents=True, 
exist_ok=True) +Path(snapshot.output_dir).mkdir(parents=True, exist_ok=True) +(Path(crawl.output_dir) / "crawl-retention.txt").write_text("crawl") +(Path(snapshot.output_dir) / "snapshot-retention.txt").write_text("snapshot") + +result = ArchiveResult.objects.create( + snapshot=snapshot, + plugin="title", + hook_name="on_Snapshot__54_title.py", + status=ArchiveResult.StatusChoices.SUCCEEDED, +) +Path(result.output_dir).mkdir(parents=True, exist_ok=True) +(Path(result.output_dir) / "title.txt").write_text("Example") + +machine = Machine.current() +iface = NetworkInterface.objects.filter(machine=machine).first() +if iface is None: + iface = NetworkInterface.objects.create( + machine=machine, + mac_address="00:00:00:00:00:00", + ip_public="203.0.113.10", + ip_local="127.0.0.1", + dns_server="1.1.1.1", + hostname=machine.hostname, + iface="lo", + isp="Test ISP", + city="Test City", + region="Test Region", + country="Test Country", + ) +process = Process.objects.create( + machine=machine, + iface=iface, + process_type=Process.TypeChoices.HOOK, + pwd=str(result.output_dir), + cmd=["echo", "ok"], + env={{"DELETE_AFTER": "1hr"}}, + status=Process.StatusChoices.EXITED, +) + +due_at = timezone.now() - timedelta(hours=1) +ArchiveResult.objects.filter(pk=result.pk).update(delete_at=due_at) +Snapshot.objects.filter(pk=snapshot.pk).update(delete_at=due_at) +Crawl = type(crawl) +Crawl.objects.filter(pk=crawl.pk).update(delete_at=due_at) +Process.objects.filter(pk=process.pk).update(delete_at=due_at) + +print(json.dumps({{ + "crawl_id": str(crawl.id), + "snapshot_id": str(snapshot.id), + "archiveresult_id": str(result.id), + "process_id": str(process.id), + "crawl_dir": str(crawl.output_dir), + "snapshot_dir": str(snapshot.output_dir), + "archiveresult_dir": str(result.output_dir), +}})) +""" + _cmd_result = run_archivebox_cmd( + ["manage", "shell", "-c", setup_script], + cwd=tmp_path, + timeout=90, + env=run_env, + ) + stdout, stderr, returncode = _cmd_result.stdout, 
_cmd_result.stderr, _cmd_result.returncode + assert returncode == 0, f"retention setup failed:\nSTDOUT:\n{stdout}\nSTDERR:\n{stderr}" + retained = json.loads(stdout.strip().splitlines()[-1]) + assert retained["crawl_id"] == created["crawl_id"] + + _cmd_result = run_archivebox_cmd(["run", "--crawl-id", retained["crawl_id"]], cwd=tmp_path, timeout=120, env=run_env) + stdout, stderr, returncode = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + assert returncode == 0, f"archivebox run failed:\nSTDOUT:\n{stdout}\nSTDERR:\n{stderr}" + + remaining_script = f""" +import json +from archivebox.crawls.models import Crawl +from archivebox.core.models import ArchiveResult, Snapshot +from archivebox.machine.models import Process +print(json.dumps({{ + "crawl": Crawl.objects.filter(id="{retained["crawl_id"]}").count(), + "snapshot": Snapshot.objects.filter(id="{retained["snapshot_id"]}").count(), + "archiveresult": ArchiveResult.objects.filter(id="{retained["archiveresult_id"]}").count(), + "process": Process.objects.filter(id="{retained["process_id"]}").count(), +}})) +""" + _cmd_result = run_archivebox_cmd( + ["manage", "shell", "-c", remaining_script], + cwd=tmp_path, + timeout=90, + env=run_env, + ) + stdout, stderr, returncode = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + assert returncode == 0, f"retention remaining lookup failed:\nSTDOUT:\n{stdout}\nSTDERR:\n{stderr}" + remaining = json.loads(stdout.strip().splitlines()[-1]) + assert remaining == {"crawl": 0, "snapshot": 0, "archiveresult": 0, "process": 0} + assert not Path(retained["crawl_dir"]).exists() + assert not Path(retained["snapshot_dir"]).exists() + assert not Path(retained["archiveresult_dir"]).exists() + + +def test_delete_after_real_add_page_and_rest_create_paths(client): + User = get_user_model() + admin_user = User.objects.create_superuser( + username="retentionadmin", + email="retentionadmin@test.com", + password="testpassword", + ) + + client.force_login(admin_user) + 
response = client.get(reverse("add"), HTTP_HOST=ADMIN_HOST) + assert response.status_code == 200 + assert 'name="delete_after"' in response.content.decode() + + response = client.post( + reverse("add"), + data={ + "url": "https://example.com/delete-after-ui", + "tag": "retention-ui", + "depth": "0", + "max_urls": "1", + "crawl_max_size": "0", + "crawl_timeout": "60", + "snapshot_max_size": "0", + "delete_after": "2h", + "crawl_max_concurrent_snapshots": "1", + "url_filters_allowlist": "", + "url_filters_denylist": "", + "notes": "delete-after ui test", + "schedule": "", + "persona": "Default", + "permissions": "public", + "index_only": "on", + "config": "{}", + }, + HTTP_HOST=ADMIN_HOST, + ) + assert response.status_code == 302, response.context["form"].errors if response.context else response.content.decode() + + from archivebox.crawls.models import Crawl + + ui_crawl = Crawl.objects.get(urls__contains="https://example.com/delete-after-ui") + assert ui_crawl.config["DELETE_AFTER"] == "2h" + assert ui_crawl.delete_at is not None + from archivebox.services.runner import run_due_crawl + + assert run_due_crawl(ui_crawl, lock_seconds=10) + ui_snapshot = ui_crawl.snapshot_set.get(url="https://example.com/delete-after-ui") + assert ui_snapshot.delete_at is not None + + from archivebox.api.auth import get_or_create_api_token + + api_token = get_or_create_api_token(admin_user) + assert api_token is not None + response = client.post( + "/api/v1/crawls/crawls", + data=json.dumps( + { + "urls": ["https://example.com/delete-after-rest"], + "max_depth": 0, + "max_urls": 1, + "config": {"DELETE_AFTER": "3h"}, + }, + ), + content_type="application/json", + HTTP_HOST=API_HOST, + HTTP_X_ARCHIVEBOX_API_KEY=api_token.token, + ) + assert response.status_code == 200 + rest_crawl = Crawl.objects.get(urls__contains="https://example.com/delete-after-rest") + assert rest_crawl.config["DELETE_AFTER"] == "3h" + assert rest_crawl.delete_at is not None + assert run_due_crawl(rest_crawl, 
lock_seconds=10) + rest_snapshot = rest_crawl.snapshot_set.get(url="https://example.com/delete-after-rest") + assert rest_snapshot.delete_at is not None diff --git a/archivebox/tests/test_config_MAX_limits.py b/archivebox/tests/test_config_MAX_limits.py new file mode 100644 index 0000000000..b3e4416348 --- /dev/null +++ b/archivebox/tests/test_config_MAX_limits.py @@ -0,0 +1,295 @@ +"""Tests for MAX/SIZE crawl limit config behavior.""" + +import asyncio +import json +from pathlib import Path + +import pytest + +from archivebox.core.models import Snapshot +from archivebox.crawls.models import Crawl +from archivebox.tests.conftest import cli_env, run_archivebox_cmd +from archivebox.tests.test_orm_helpers import use_archivebox_db + +pytestmark = pytest.mark.django_db(transaction=True) + + +def test_create_snapshots_from_urls_respects_max_urls(admin_user): + crawl = Crawl.objects.create( + urls="\n".join( + [ + "https://example.com/root", + "https://example.com/about", + "https://example.com/contact", + ], + ), + config={"CRAWL_MAX_URLS": 2}, + created_by=admin_user, + ) + + created = crawl.create_snapshots_from_urls() + + assert [snapshot.url for snapshot in created] == [ + "https://example.com/root", + "https://example.com/about", + ] + assert crawl.snapshot_set.count() == 2 + assert crawl.remaining_snapshot_capacity() == 0 + assert crawl.limit_stop_reason() == "crawl_max_urls" + assert crawl.add_url({"url": "https://example.com/extra", "depth": 1}) is False + + +def test_crawl_stop_reason_keeps_specific_limit_reason_over_lifecycle_fallback(admin_user): + crawl = Crawl.objects.create( + urls="\n".join( + [ + "https://example.com/root", + "https://example.com/about", + ], + ), + config={"CRAWL_MAX_URLS": 1}, + status=Crawl.StatusChoices.SEALED, + retry_at=None, + created_by=admin_user, + ) + Snapshot.objects.create( + url="https://example.com/root", + crawl=crawl, + status=Snapshot.StatusChoices.SEALED, + timestamp="1700000000.011", + ) + + assert crawl.stop_reason() 
== "crawl_max_urls" + + +def test_enqueue_discovered_snapshots_refreshes_crawl_limits(tmp_path): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.services.runner import CrawlRunner + + crawl = Crawl.objects.create( + urls="https://example.com", + max_depth=0, + config={"CRAWL_MAX_URLS": 5}, + created_by_id=get_or_create_system_user_pk(), + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.SEALED, + depth=0, + ) + parser_dir = Path(snapshot.output_dir) / "parse_html_urls" + parser_dir.mkdir(parents=True, exist_ok=True) + (parser_dir / "urls.jsonl").write_text( + "\n".join( + [ + json.dumps({"type": "Snapshot", "url": "https://example.com/child-a", "depth": 1}), + json.dumps({"type": "Snapshot", "url": "https://example.com/child-b", "depth": 1}), + "", + ], + ), + ) + + runner = CrawlRunner(crawl) + Crawl.objects.filter(id=crawl.id).update(max_depth=1) + payload = runner.load_snapshot_payload(str(snapshot.id)) + + asyncio.run(runner.enqueue_discovered_snapshots_from_outputs(payload)) + + child_snapshots = list(crawl.snapshot_set.filter(depth=1).order_by("url").values_list("url", "status")) + assert child_snapshots == [ + ("https://example.com/child-a", Snapshot.StatusChoices.QUEUED), + ("https://example.com/child-b", Snapshot.StatusChoices.QUEUED), + ] + + +def test_run_snapshot_seals_descendant_when_crawl_max_size_is_reached(tmp_path): + from abx_dl.events import CrawlStartEvent, SnapshotEvent + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.services.runner import CrawlRunner + + crawl = Crawl.objects.create( + urls="https://example.com", + config={ + "LIB_DIR": str(tmp_path / "lib"), + "PLUGINS": "__archivebox_test_no_plugins__", + 
"CHROME_BINARY": "", + "CRAWL_MAX_SIZE": 16, + }, + created_by_id=get_or_create_system_user_pk(), + ) + root = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + depth=0, + status=Snapshot.StatusChoices.SEALED, + ) + child = Snapshot.objects.create( + url="https://example.com/child", + crawl=crawl, + depth=1, + parent_snapshot=root, + status=Snapshot.StatusChoices.QUEUED, + ) + state_dir = Path(crawl.output_dir) / ".abx-dl" + state_dir.mkdir(parents=True, exist_ok=True) + (state_dir / "limits.json").write_text( + json.dumps( + { + "admitted_snapshot_ids": [str(root.id), str(child.id)], + "counted_event_ids": ["proc-1"], + "total_size": 32, + "stop_reason": "crawl_max_size", + }, + ), + encoding="utf-8", + ) + + runner = CrawlRunner(crawl) + runner.load_run_state() + + async def run_child_from_crawl_start() -> list[SnapshotEvent]: + async def on_crawl_start(_event: CrawlStartEvent) -> None: + await runner.run_snapshot(str(child.id)) + + runner.bus.on(CrawlStartEvent, on_crawl_start) + crawl_start = runner.bus.emit( + CrawlStartEvent( + url=root.url, + snapshot_id=str(root.id), + output_dir=str(crawl.output_dir), + event_timeout=30, + event_handler_timeout=30, + ), + ) + await crawl_start.now() + await crawl_start.event_results_list() + await runner.bus.wait_until_idle() + return await runner.bus.filter(SnapshotEvent, child_of=crawl_start, past=True) + + snapshot_events = asyncio.run(run_child_from_crawl_start()) + + child.refresh_from_db() + assert child.status == Snapshot.StatusChoices.SEALED + assert child.retry_at is None + assert snapshot_events == [] + + +def test_seal_snapshot_cancels_queued_descendants_after_crawl_max_size(): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.services.snapshot_service import SnapshotService + from abx_dl.events import SnapshotCompletedEvent + from abx_dl.orchestrator import 
create_bus + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + config={"CRAWL_MAX_SIZE": 16}, + ) + root = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.STARTED, + ) + child = Snapshot.objects.create( + url="https://example.com/child", + crawl=crawl, + depth=1, + parent_snapshot_id=root.id, + status=Snapshot.StatusChoices.QUEUED, + ) + + state_dir = Path(crawl.output_dir) / ".abx-dl" + state_dir.mkdir(parents=True, exist_ok=True) + (state_dir / "limits.json").write_text( + json.dumps( + { + "admitted_snapshot_ids": [str(root.id), str(child.id)], + "counted_event_ids": ["proc-1"], + "total_size": 32, + "stop_reason": "crawl_max_size", + }, + ), + encoding="utf-8", + ) + + bus = create_bus(name=f"test_snapshot_limit_cancel_{str(crawl.id).replace('-', '_')}") + service = SnapshotService(bus, crawl_id=str(crawl.id), schedule_snapshot=lambda snapshot_id: None) + try: + + async def emit_event() -> None: + await service.on_SnapshotCompletedEvent( + SnapshotCompletedEvent( + url=root.url, + snapshot_id=str(root.id), + output_dir=str(root.output_dir), + ), + ) + + asyncio.run(emit_event()) + finally: + asyncio.run(bus.wait_until_idle()) + asyncio.run(bus.destroy()) + + root.refresh_from_db() + child.refresh_from_db() + assert root.status == Snapshot.StatusChoices.SEALED + assert child.status == Snapshot.StatusChoices.SEALED + assert child.retry_at is None + + +def test_recursive_crawl_respects_max_urls(tmp_path, initialized_archive, recursive_test_site): + """Test that recursive discovery stops creating snapshots at max_urls.""" + env = cli_env(disable_extractors=True) + + env = env.copy() + env.update( + { + "URL_ALLOWLIST": r"127\.0\.0\.1[:/].*", + "SAVE_WGET": "true", + "USE_CHROME": "false", + "USE_COLOR": "false", + "SHOW_PROGRESS": "false", + }, + ) + + result = run_archivebox_cmd( + [ + "add", + "--depth=2", + "--max-urls=4", + 
"--plugins=wget,parse_html_urls", + recursive_test_site["root_url"], + ], + env=env, + timeout=120, + ) + stdout, stderr = result.stdout, result.stderr + + if stderr: + print(f"\n=== STDERR ===\n{stderr}\n=== END STDERR ===\n") + if stdout: + print(f"\n=== STDOUT (last 2000 chars) ===\n{stdout[-2000:]}\n=== END STDOUT ===\n") + + assert result.returncode == 0, result.stderr + + with use_archivebox_db(tmp_path): + crawl_obj = Crawl.objects.order_by("-created_at").first() + crawl = (crawl_obj.max_depth, crawl_obj.config["CRAWL_MAX_URLS"]) if crawl_obj else None + snapshot_rows = list(Snapshot.objects.order_by("depth", "url").values_list("url", "depth", "parent_snapshot_id")) + depth_counts = { + depth: Snapshot.objects.filter(depth=depth).count() for depth in set(Snapshot.objects.values_list("depth", flat=True)) + } + + assert crawl == (2, 4) + assert len(snapshot_rows) == 4 + assert depth_counts.get(0, 0) == 1 + assert depth_counts.get(1, 0) == 3 + assert depth_counts.get(2, 0) == 0 + assert set(recursive_test_site["child_urls"]).issubset({url for url, depth, _parent in snapshot_rows if depth == 1}) diff --git a/archivebox/tests/test_config_ONLY_NEW.py b/archivebox/tests/test_config_ONLY_NEW.py new file mode 100644 index 0000000000..6df7b31209 --- /dev/null +++ b/archivebox/tests/test_config_ONLY_NEW.py @@ -0,0 +1,87 @@ +"""Tests for ONLY_NEW crawl config behavior.""" + +import pytest + +from archivebox.core.models import Snapshot +from archivebox.crawls.models import Crawl + +pytestmark = pytest.mark.django_db + + +def test_create_snapshots_from_urls_respects_only_new_exact_url_matches(admin_user): + existing_crawl = Crawl.objects.create(urls="https://example.com/existing", created_by=admin_user) + Snapshot.objects.create( + url="https://example.com/existing", + crawl=existing_crawl, + timestamp="1700000000.001", + ) + crawl = Crawl.objects.create( + urls="\n".join( + [ + "https://example.com/existing", + "https://example.com/existing/", + 
"https://example.com/fresh", + ], + ), + config={"ONLY_NEW": True}, + created_by=admin_user, + ) + + created = crawl.create_snapshots_from_urls() + + assert [snapshot.url for snapshot in created] == [ + "https://example.com/existing/", + "https://example.com/fresh", + ] + assert Snapshot.objects.filter(url="https://example.com/existing").count() == 1 + + +def test_create_snapshots_from_urls_allows_existing_exact_url_when_only_new_false(admin_user): + existing_crawl = Crawl.objects.create(urls="https://example.com/existing", created_by=admin_user) + Snapshot.objects.create( + url="https://example.com/existing", + crawl=existing_crawl, + timestamp="1700000000.002", + ) + crawl = Crawl.objects.create( + urls="https://example.com/existing", + config={"ONLY_NEW": False}, + created_by=admin_user, + ) + + created = crawl.create_snapshots_from_urls() + + assert [snapshot.url for snapshot in created] == ["https://example.com/existing"] + assert Snapshot.objects.filter(url="https://example.com/existing").count() == 2 + + +def test_create_discovered_snapshots_respects_only_new_exact_url_matches(admin_user): + existing_crawl = Crawl.objects.create(urls="https://example.com/existing", created_by=admin_user) + Snapshot.objects.create( + url="https://example.com/existing", + crawl=existing_crawl, + timestamp="1700000000.003", + ) + crawl = Crawl.objects.create( + urls="https://example.com/root", + max_depth=1, + config={"ONLY_NEW": True}, + created_by=admin_user, + ) + parent = crawl.create_snapshots_from_urls()[0] + + created = crawl.create_discovered_snapshots( + parent, + [ + {"url": "https://example.com/existing"}, + {"url": "https://example.com/existing/"}, + {"url": "https://example.com/fresh"}, + ], + depth=1, + ) + + assert [snapshot.url for snapshot in created] == [ + "https://example.com/existing/", + "https://example.com/fresh", + ] + assert Snapshot.objects.filter(url="https://example.com/existing").count() == 1 diff --git a/archivebox/tests/test_config_SAVE_TITLE.py 
b/archivebox/tests/test_config_SAVE_TITLE.py new file mode 100644 index 0000000000..37e0d7d89f --- /dev/null +++ b/archivebox/tests/test_config_SAVE_TITLE.py @@ -0,0 +1,88 @@ +import time +from pathlib import Path + +from archivebox.tests.conftest import run_archivebox_cmd, cli_env + +import pytest + +from archivebox.core.models import Snapshot +from archivebox.tests.test_orm_helpers import use_archivebox_db +from .conftest import _find_cached_chrome, _find_system_browser + +pytestmark = pytest.mark.django_db(transaction=True) + + +def _install_chrome(tmp_path, env): + env["CHROME_ISOLATION"] = "snapshot" + env["LIB_DIR"] = str(tmp_path / "lib") + install_process = run_archivebox_cmd( + ["install", "chrome"], + cwd=tmp_path, + env=env, + timeout=600, + ) + assert install_process.returncode == 0, install_process.stderr or install_process.stdout + cached_browser = _find_cached_chrome(Path(env["LIB_DIR"])) + if cached_browser is not None: + env["CHROME_BINARY"] = str(cached_browser) + return + system_browser = _find_system_browser() + if system_browser: + env["CHROME_BINARY"] = str(system_browser) + return + raise AssertionError(install_process.stderr or install_process.stdout) + + +def _wait_for_snapshot_title(data_dir, *, timeout=60): + deadline = time.time() + timeout + title = None + while time.time() < deadline: + with use_archivebox_db(data_dir): + title = Snapshot.objects.get().resolved_title + if title: + return title + time.sleep(0.5) + return title + + +def test_title_is_extracted(tmp_path, initialized_archive, recursive_test_site): + """Test that title is extracted from the page.""" + env = cli_env(disable_extractors=True) + env.update({"SAVE_TITLE": "true"}) + _install_chrome(tmp_path, env) + add_process = run_archivebox_cmd( + ["add", "--plugins=chrome,wget,title", recursive_test_site["root_url"]], + cwd=tmp_path, + env=env, + ) + assert add_process.returncode == 0, add_process.stderr or add_process.stdout + + title = _wait_for_snapshot_title(tmp_path) + 
assert title is not None + assert "Root" in title + + +def test_title_is_listed_by_search_alias(tmp_path, initialized_archive, recursive_test_site): + """ + https://github.com/ArchiveBox/ArchiveBox/issues/330 + Unencoded content should not be rendered as it facilitates xss injections + and breaks the layout. + """ + env = cli_env(disable_extractors=True) + env.update({"SAVE_TITLE": "true"}) + _install_chrome(tmp_path, env) + add_process = run_archivebox_cmd( + ["add", "--plugins=chrome,wget,title", recursive_test_site["root_url"]], + cwd=tmp_path, + env=env, + ) + assert add_process.returncode == 0, add_process.stderr or add_process.stdout + list_process = run_archivebox_cmd( + ["search"], + cwd=tmp_path, + env=env, + ) + assert list_process.returncode == 0, list_process.stderr or list_process.stdout + + output = list_process.stdout + assert recursive_test_site["root_url"] in output diff --git a/archivebox/tests/test_config_URL_filters.py b/archivebox/tests/test_config_URL_filters.py new file mode 100644 index 0000000000..38d8964afe --- /dev/null +++ b/archivebox/tests/test_config_URL_filters.py @@ -0,0 +1,60 @@ +"""Tests for URL_ALLOWLIST and URL_DENYLIST behavior.""" + +import pytest + +from archivebox.crawls.models import Crawl + +pytestmark = pytest.mark.django_db + + +def test_create_snapshots_from_urls_respects_url_allowlist_and_denylist(admin_user): + crawl = Crawl.objects.create( + urls="\n".join( + [ + "https://example.com/root", + "https://static.example.com/app.js", + "https://other.test/page", + ], + ), + created_by=admin_user, + config={ + "URL_ALLOWLIST": "example.com", + "URL_DENYLIST": "static.example.com", + }, + ) + + created = crawl.create_snapshots_from_urls() + + assert [snapshot.url for snapshot in created] == ["https://example.com/root"] + + +def test_url_filter_regex_lists_preserve_commas_and_split_on_newlines_only(admin_user): + crawl = Crawl.objects.create( + urls="\n".join( + [ + "https://example.com/root", + 
"https://example.com/path,with,commas", + "https://other.test/page", + ], + ), + created_by=admin_user, + config={ + "URL_ALLOWLIST": r"^https://example\.com/(root|path,with,commas)$" + "\n" + r"^https://other\.test/page$", + "URL_DENYLIST": r"^https://example\.com/path,with,commas$", + }, + ) + + assert crawl.get_url_allowlist(use_effective_config=False) == [ + r"^https://example\.com/(root|path,with,commas)$", + r"^https://other\.test/page$", + ] + assert crawl.get_url_denylist(use_effective_config=False) == [ + r"^https://example\.com/path,with,commas$", + ] + + created = crawl.create_snapshots_from_urls() + + assert [snapshot.url for snapshot in created] == [ + "https://example.com/root", + "https://other.test/page", + ] diff --git a/archivebox/tests/test_core_config.py b/archivebox/tests/test_core_config.py new file mode 100644 index 0000000000..6b6c94388f --- /dev/null +++ b/archivebox/tests/test_core_config.py @@ -0,0 +1,5 @@ +from archivebox.config import CONSTANTS + + +def test_sonic_dir_is_allowed_inside_data_dir(): + assert "sonic" in CONSTANTS.ALLOWED_IN_DATA_DIR diff --git a/archivebox/tests/test_crawl_pause.py b/archivebox/tests/test_crawl_pause.py new file mode 100644 index 0000000000..8c7b2e3f37 --- /dev/null +++ b/archivebox/tests/test_crawl_pause.py @@ -0,0 +1,12 @@ +from django.utils import timezone + +from archivebox.workers.models import RETRY_AT_MAX + + +def test_retry_at_max_is_safe_for_admin_timezone_localization(): + with timezone.override("Pacific/Kiritimati"): + assert timezone.localtime(RETRY_AT_MAX).year == 9999 + + +# test_crawl_pause_resume_api_survives_server_restart_and_processes_after_resume moved to test_api_v1_crawls_crawl_crawl_id.py. +# test_update_index_only_runs_paused_search_rows_and_resume_later_runs_crawl moved to test_api_v1_crawls_crawl_crawl_id.py. 
diff --git a/archivebox/tests/test_crawl_runner.py b/archivebox/tests/test_crawl_runner.py new file mode 100644 index 0000000000..3d5504aee9 --- /dev/null +++ b/archivebox/tests/test_crawl_runner.py @@ -0,0 +1,1149 @@ +import asyncio +import sys +from pathlib import Path + +import pytest +from asgiref.sync import sync_to_async + + +pytestmark = pytest.mark.django_db + + +@pytest.mark.django_db(transaction=True) +def test_cancelled_crawl_projection_emits_abort_event_from_runner_bus(): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.services.runner import CrawlRunner + from abx_dl.events import CrawlAbortEvent, CrawlEvent + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.STARTED, + ) + runner = CrawlRunner(crawl) + + async def run() -> CrawlAbortEvent | None: + abort_event_holder: dict[str, CrawlAbortEvent | None] = {"event": None} + + async def on_CrawlEvent(event: CrawlEvent) -> None: + watcher = asyncio.create_task(runner.watch_for_cancelled_crawl(event, poll_interval=0.01)) + await asyncio.sleep(0.02) + await sync_to_async(Crawl.objects.filter(id=crawl.id).update, thread_sensitive=True)( + status=Crawl.StatusChoices.SEALED, + retry_at=None, + ) + abort_event = await runner.bus.find(CrawlAbortEvent, child_of=event, past=True, future=1.0) + abort_event_holder["event"] = abort_event if isinstance(abort_event, CrawlAbortEvent) else None + await watcher + + runner.bus.on(CrawlEvent, on_CrawlEvent) + await runner.bus.emit( + CrawlEvent( + url=snapshot.url, + snapshot_id=str(snapshot.id), + output_dir=str(crawl.output_dir), + ), + ).now() + await runner.bus.wait_until_idle() + return abort_event_holder["event"] + + abort_event = asyncio.run(run()) + + assert 
abort_event is not None + + +@pytest.mark.django_db(transaction=True) +@pytest.mark.django_db(transaction=True) +def test_snapshot_payload_uses_crawl_chrome_dirs_by_default(): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.personas.models import Persona + from archivebox.services.runner import CrawlRunner + + persona = Persona(name="RuntimePersona") + persona.save() + crawl = Crawl( + urls="https://example.com", + persona_id=persona.id, + created_by_id=get_or_create_system_user_pk(), + ) + crawl.save() + snapshot = Snapshot(url="https://example.com", crawl=crawl) + snapshot.save() + other_snapshot = Snapshot(url="https://example.org", crawl=crawl) + other_snapshot.save() + + runner = CrawlRunner(crawl) + runner.load_run_state() + crawl_downloads_sentinel = persona.runtime_downloads_dir_for_crawl(crawl) / "keep.txt" + crawl_downloads_sentinel.write_text("keep") + payload = runner.load_snapshot_payload(str(snapshot.id)) + other_payload = runner.load_snapshot_payload(str(other_snapshot.id)) + config = payload["config"] + other_config = other_payload["config"] + + assert Path(config["CHROME_USER_DATA_DIR"]).is_relative_to(crawl.output_dir) + assert Path(config["CHROME_DOWNLOADS_DIR"]).is_relative_to(crawl.output_dir) + assert Path(config["CHROME_USER_DATA_DIR"]).name == "chrome_profile" + assert Path(config["CHROME_DOWNLOADS_DIR"]).name == "chrome_downloads" + assert Path(config["CHROME_USER_DATA_DIR"]) == Path(other_config["CHROME_USER_DATA_DIR"]) + assert Path(config["CHROME_DOWNLOADS_DIR"]) == Path(other_config["CHROME_DOWNLOADS_DIR"]) + assert crawl_downloads_sentinel.read_text() == "keep" + assert config["ACTIVE_PERSONA"] == "RuntimePersona" + assert Path(config["CRAWL_DIR"]) == crawl.output_dir + assert Path(config["SNAP_DIR"]) == snapshot.output_dir + + +@pytest.mark.django_db(transaction=True) +def 
test_snapshot_payload_uses_snapshot_chrome_dirs_when_snapshot_isolated(): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.personas.models import Persona + from archivebox.services.runner import CrawlRunner + + persona = Persona(name="SnapshotRuntimePersona") + persona.save() + crawl = Crawl( + urls="https://example.com\nhttps://example.org", + persona_id=persona.id, + created_by_id=get_or_create_system_user_pk(), + config={"CHROME_ISOLATION": "snapshot"}, + ) + crawl.save() + snapshot = Snapshot(url="https://example.com", crawl=crawl) + snapshot.save() + other_snapshot = Snapshot(url="https://example.org", crawl=crawl) + other_snapshot.save() + + runner = CrawlRunner(crawl) + runner.load_run_state() + payload = runner.load_snapshot_payload(str(snapshot.id)) + other_payload = runner.load_snapshot_payload(str(other_snapshot.id)) + config = payload["config"] + other_config = other_payload["config"] + + assert Path(config["CHROME_USER_DATA_DIR"]).is_relative_to(snapshot.output_dir) + assert Path(config["CHROME_DOWNLOADS_DIR"]).is_relative_to(snapshot.output_dir) + assert Path(other_config["CHROME_USER_DATA_DIR"]).is_relative_to(other_snapshot.output_dir) + assert Path(other_config["CHROME_DOWNLOADS_DIR"]).is_relative_to(other_snapshot.output_dir) + assert Path(config["CHROME_USER_DATA_DIR"]) != Path(other_config["CHROME_USER_DATA_DIR"]) + assert Path(config["CHROME_DOWNLOADS_DIR"]) != Path(other_config["CHROME_DOWNLOADS_DIR"]) + assert Path(config["CRAWL_DIR"]) == crawl.output_dir + assert Path(config["SNAP_DIR"]) == snapshot.output_dir + + +def test_ensure_background_runner_skips_under_pytest_guard(): + from archivebox.services.runner import ensure_background_runner + + assert ensure_background_runner() is False + + +@pytest.mark.django_db(transaction=True) +def 
test_ensure_background_runner_skips_with_real_running_orchestrator_record(): + import os + from datetime import datetime + + import psutil + from archivebox.machine.models import Machine, Process + from archivebox.services.runner import ensure_background_runner + from django.utils import timezone + + os_proc = psutil.Process(os.getpid()) + process = Process.objects.create( + machine=Machine.current(), + process_type=Process.TypeChoices.ORCHESTRATOR, + status=Process.StatusChoices.RUNNING, + pid=os.getpid(), + started_at=datetime.fromtimestamp(os_proc.create_time(), tz=timezone.get_current_timezone()), + ) + + assert ensure_background_runner(allow_under_pytest=True) is False + process.refresh_from_db() + assert process.status == Process.StatusChoices.RUNNING + + +@pytest.mark.django_db(transaction=True) +def test_ensure_background_runner_does_not_spawn_runner_without_supervisord(): + from archivebox.services.runner import ensure_background_runner + from archivebox.workers.supervisord_util import get_existing_supervisord_process, stop_existing_supervisord_process + + stop_existing_supervisord_process() + assert get_existing_supervisord_process(quiet=True) is None + + assert ensure_background_runner(allow_under_pytest=True) is False + assert get_existing_supervisord_process(quiet=True) is None + + +def test_runner_task_context_clears_inherited_abxbus_handler_context(tmp_path): + from abx_dl.events import CrawlEvent, MachineEvent + from abx_dl.orchestrator import create_bus + from abxbus.event_bus import in_handler_context + from archivebox.services import runner as runner_module + + bus = create_bus(name="test_runner_task_context_clears_inherited_abxbus_handler_context") + observations = [] + + async def emit_from_runner_task(): + observations.append(("in_handler_context", in_handler_context())) + machine_event = bus.emit(MachineEvent(config={"TIMEOUT": "30"}, config_type="user")) + await machine_event.now() + observations.append(("machine_event_path", 
bool(machine_event.event_path))) + + async def on_crawl(event): + assert in_handler_context() is True + task = asyncio.create_task(emit_from_runner_task(), context=runner_module._runner_task_context()) + await task + + bus.on(CrawlEvent, on_crawl) + + async def run_test(): + try: + await bus.emit( + CrawlEvent( + url="https://example.com", + snapshot_id="snapshot-1", + output_dir=str(tmp_path), + ), + ).now() + await bus.wait_until_idle() + finally: + await bus.destroy() + + asyncio.run(run_test()) + + assert observations == [ + ("in_handler_context", False), + ("machine_event_path", True), + ] + + +@pytest.mark.django_db(transaction=True) +def test_snapshot_started_state_keeps_retry_at_lease(): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from django.utils import timezone + + before = timezone.now() + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.STARTED, + retry_at=before, + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.QUEUED, + retry_at=before, + ) + + assert snapshot.tick_claimed(lock_seconds=60) is True + + snapshot.refresh_from_db() + assert snapshot.status == Snapshot.StatusChoices.STARTED + assert snapshot.retry_at is not None + assert snapshot.retry_at > before + + +@pytest.mark.django_db(transaction=True) +def test_crawl_start_event_keeps_retry_at_lease(): + from abx_dl.events import CrawlStartEvent + from abx_dl.orchestrator import create_bus + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.services.crawl_service import CrawlService + from django.utils import timezone + + before = timezone.now() + crawl = Crawl.objects.create( + 
urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.QUEUED, + retry_at=before, + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.QUEUED, + retry_at=before, + ) + bus = create_bus(name="test_crawl_start_event_keeps_retry_at_lease") + CrawlService(bus, crawl_id=str(crawl.id)) + + async def run_event(): + try: + await bus.emit( + CrawlStartEvent( + url=snapshot.url, + snapshot_id=str(snapshot.id), + output_dir=str(crawl.output_dir), + ), + ).now() + await bus.wait_until_idle() + finally: + await bus.destroy(clear=False) + + asyncio.run(run_event()) + + crawl.refresh_from_db() + assert crawl.status == Crawl.StatusChoices.STARTED + assert crawl.retry_at is not None + assert crawl.retry_at > before + + +@pytest.mark.django_db(transaction=True) +def test_crawl_start_event_does_not_reschedule_sealed_parent_until_explicit_requeue(): + from abx_dl.events import CrawlStartEvent + from abx_dl.orchestrator import create_bus + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.services.crawl_service import CrawlService + from django.utils import timezone + + before = timezone.now() + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.SEALED, + retry_at=before, + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.SEALED, + retry_at=before, + ) + crawl_output_dir = str(crawl.output_dir) + + async def emit_start(name: str) -> None: + bus = create_bus(name=name) + try: + CrawlService(bus, crawl_id=str(crawl.id)) + emitted = bus.emit( + CrawlStartEvent( + url=snapshot.url, + snapshot_id=str(snapshot.id), + output_dir=crawl_output_dir, + ), + ) + await emitted.now() + await 
emitted.event_results_list() + await bus.wait_until_idle() + finally: + await bus.destroy(clear=False) + + asyncio.run(emit_start("test_crawl_start_event_sealed_parent_noop")) + + crawl.refresh_from_db() + snapshot.refresh_from_db() + # CrawlStartEvent on a sealed parent is a no-op — neither the parent nor + # its sealed child gets resurrected by the handler. + assert crawl.status == Crawl.StatusChoices.SEALED + assert crawl.retry_at == before + assert snapshot.status == Snapshot.StatusChoices.SEALED + assert snapshot.retry_at == before + + # The orchestrator's seal-cleanup pass picks the sealed row up via + # retry_at, runs cleanup, and clears retry_at — that's how the + # ``retry_at != None`` invariant the handler intentionally preserves + # eventually drains to ``None``. + from archivebox.services.runner import run_due_crawl + + assert run_due_crawl(crawl, lock_seconds=10) is True + crawl.refresh_from_db() + assert crawl.status == Crawl.StatusChoices.SEALED + assert crawl.retry_at is None + + crawl.update_and_requeue(status=Crawl.StatusChoices.QUEUED, retry_at=timezone.now()) + crawl.refresh_from_db() + assert crawl.status == Crawl.StatusChoices.QUEUED + + asyncio.run(emit_start("test_crawl_start_event_after_explicit_requeue")) + + crawl.refresh_from_db() + assert crawl.status == Crawl.StatusChoices.STARTED + assert crawl.retry_at is not None + assert crawl.retry_at > before + + +@pytest.mark.django_db(transaction=True) +def test_snapshot_queue_selection_is_retry_at_only_for_sealed_maintenance(): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from django.utils import timezone + + now = timezone.now() + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.SEALED, + retry_at=None, + ) + snapshot = Snapshot.objects.create( + url="https://example.com", +
crawl=crawl, + status=Snapshot.StatusChoices.SEALED, + retry_at=now, + ) + + assert Snapshot.get_queue().filter(id=snapshot.id).exists() + + +@pytest.mark.django_db(transaction=True) +def test_machine_service_persists_only_derived_config_events(tmp_path, hermetic_lib_dir): + from abx_dl.events import MachineEvent + from abx_dl.orchestrator import create_bus + from archivebox.machine.models import Machine + from archivebox.services.machine_service import MachineService + + machine = Machine.current() + machine.config = {} + machine.save(update_fields=["config"]) + wget_binary = hermetic_lib_dir / "bin" / "wget" + wget_binary.write_text("#!/bin/sh\n") + wget_binary.chmod(0o755) + + async def run_test(): + bus = create_bus(name="test_machine_service_persists_only_derived_config_events") + try: + MachineService(bus) + user_event = bus.emit( + MachineEvent( + config={ + "CHROME_ISOLATION": "snapshot", + "CHROME_USER_DATA_DIR": "/tmp/stale-profile", + }, + config_type="user", + ), + ) + await user_event.now() + await user_event.event_results_list() + derived_event = bus.emit( + MachineEvent( + config={ + "WGET_BINARY": str(wget_binary), + "ABX_INSTALL_CACHE": {"wget": "2026-03-24T00:00:00+00:00"}, + "ABX_UV_CACHE": "/tmp/uv-cache", + "ABX_PNPM_CACHE": "/tmp/pnpm-cache", + "CHROME_USER_DATA_DIR": "/tmp/stale-derived-profile", + }, + config_type="derived", + ), + ) + await derived_event.now() + await derived_event.event_results_list() + unset_event = bus.emit( + MachineEvent( + method="unset", + key="config/WGET_BINARY", + config_type="derived", + ), + ) + await unset_event.now() + await unset_event.event_results_list() + await bus.wait_until_idle() + finally: + await bus.destroy() + + asyncio.run(run_test()) + + machine.refresh_from_db() + # User events are dropped (handler ignores non-derived). 
At the event + # projector ``machine_service.py`` strips anything that isn't a binary + # path / install cache — that's the security boundary that stops plugins + # from rewriting arbitrary user config via events. So CHROME_USER_DATA_DIR + # from the derived payload is dropped; WGET_BINARY made it in (inside + # LIB_DIR) then the unset removed it; ABX_INSTALL_CACHE survives. + assert machine.config == { + "ABX_INSTALL_CACHE": {"wget": "2026-03-24T00:00:00+00:00"}, + "ABX_PNPM_CACHE": "/tmp/pnpm-cache", + "ABX_UV_CACHE": "/tmp/uv-cache", + } + + +@pytest.mark.django_db(transaction=True) +def test_load_run_state_uses_real_lib_dir_for_machine_binary_config(tmp_path, hermetic_lib_dir): + import archivebox.machine.models as machine_models + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.config.common import get_config + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.machine.models import Machine + from archivebox.services.runner import CrawlRunner + + resolved_lib_dir = get_config(include_machine=False).LIB_DIR + assert resolved_lib_dir == hermetic_lib_dir, f"LIB_DIR override not applied: {resolved_lib_dir!r} != {hermetic_lib_dir!r}" + + wget_binary = resolved_lib_dir / "bin" / "wget" + wget_binary.write_text("#!/bin/sh\n", encoding="utf-8") + wget_binary.chmod(0o755) + external_binary = tmp_path / "external" / "yt-dlp" + external_binary.parent.mkdir(parents=True) + external_binary.write_text("#!/bin/sh\n", encoding="utf-8") + external_binary.chmod(0o755) + + machine = Machine.current() + machine.config = { + "WGET_BINARY": str(wget_binary), + "YTDLP_BINARY": str(external_binary), + "ABX_INSTALL_CACHE": {"wget": "2026-03-24T00:00:00+00:00"}, + "CHROME_ISOLATION": "snapshot", + "CHROME_USER_DATA_DIR": "/tmp/stale-profile", + } + machine.save(update_fields=["config"]) + machine_models._CURRENT_MACHINE = machine + + crawl = Crawl.objects.create( +
urls="https://example.com", + config={ + "PLUGINS": "__archivebox_test_no_plugins__", + "CHROME_BINARY": "", + }, + created_by_id=get_or_create_system_user_pk(), + ) + + runner = CrawlRunner(crawl) + snapshot_ids = runner.load_run_state() + + # ``derived_config`` is Machine.config sanitized against LIB_DIR. Binary + # paths outside LIB_DIR drop out (YTDLP_BINARY → ``/tmp/...``); the + # ArchiveBox.conf mirror values (CHROME_ISOLATION, CHROME_USER_DATA_DIR, + # ABX_INSTALL_CACHE) survive so plugin hooks see the same runtime cache + # the user/runner persisted. + assert runner.derived_config == { + "WGET_BINARY": str(wget_binary), + "ABX_INSTALL_CACHE": {"wget": "2026-03-24T00:00:00+00:00"}, + "CHROME_ISOLATION": "snapshot", + "CHROME_USER_DATA_DIR": "/tmp/stale-profile", + } + assert runner.base_config["LIB_DIR"] == resolved_lib_dir + assert runner.base_config["CHROME_KEEPALIVE"] is False + assert runner.selected_plugins == ["__archivebox_test_no_plugins__"] + assert Snapshot.objects.filter(id__in=snapshot_ids, crawl=crawl, url="https://example.com").count() == 1 + + +@pytest.mark.django_db(transaction=True) +def test_crawl_runner_empty_plugin_selection_emits_lifecycle_and_seals_crawl(tmp_path): + from abx_dl.events import CrawlCleanupEvent, CrawlCompletedEvent, CrawlEvent, CrawlSetupEvent, CrawlStartEvent, MachineEvent + from abx_dl.events import SnapshotCompletedEvent, SnapshotEvent + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.services.runner import CrawlRunner + + crawl = Crawl.objects.create( + urls="https://example.com", + config={ + "LIB_DIR": str(tmp_path / "lib"), + "PLUGINS": "__archivebox_test_no_plugins__", + "CHROME_BINARY": "", + }, + created_by_id=get_or_create_system_user_pk(), + ) + runner = CrawlRunner(crawl) + + seen_events = { + CrawlEvent: [], + CrawlSetupEvent: [], + CrawlStartEvent: [], + SnapshotEvent: [],
+ SnapshotCompletedEvent: [], + CrawlCleanupEvent: [], + CrawlCompletedEvent: [], + MachineEvent: [], + } + for event_type, events in seen_events.items(): + runner.bus.on(event_type, lambda event, events=events: events.append(event)) + + asyncio.run(runner.run()) + + crawl_events = seen_events[CrawlEvent] + setup_events = seen_events[CrawlSetupEvent] + start_events = seen_events[CrawlStartEvent] + snapshot_events = seen_events[SnapshotEvent] + snapshot_completed_events = seen_events[SnapshotCompletedEvent] + cleanup_events = seen_events[CrawlCleanupEvent] + completed_events = seen_events[CrawlCompletedEvent] + machine_events = seen_events[MachineEvent] + + assert len(crawl_events) == 1 + assert len(setup_events) == 1 + assert len(start_events) == 1 + assert len(snapshot_events) == 1 + assert len(snapshot_completed_events) == 1 + assert len(cleanup_events) == 1 + assert len(completed_events) == 1 + assert runner.bus.event_is_child_of(setup_events[0], crawl_events[0]) + assert runner.bus.event_is_child_of(start_events[0], crawl_events[0]) + assert runner.bus.event_is_child_of(cleanup_events[0], crawl_events[0]) + assert runner.bus.event_is_child_of(completed_events[0], crawl_events[0]) + assert runner.bus.event_is_child_of(snapshot_events[0], start_events[0]) + assert runner.bus.event_is_child_of(snapshot_completed_events[0], snapshot_events[0]) + assert any(event.config_type == "user" for event in machine_events) + + crawl.refresh_from_db() + snapshot = Snapshot.objects.get(crawl=crawl) + assert crawl.status == Crawl.StatusChoices.SEALED + assert crawl.retry_at is None + assert snapshot.status == Snapshot.StatusChoices.SEALED + assert snapshot.retry_at is None + assert snapshot.archiveresult_set.count() == 0 + + +@pytest.mark.django_db(transaction=True) +def test_crawl_runner_resolves_persona_and_crawl_config_for_each_live_snapshot(): + from abx_dl.events import SnapshotCompletedEvent + from archivebox.base_models.models import get_or_create_system_user_pk + from 
archivebox.crawls.models import Crawl + from archivebox.core.models import ArchiveResult, Snapshot + from archivebox.machine.models import Process + from archivebox.personas.models import Persona + from archivebox.services.runner import CrawlRunner + + persona = Persona.objects.create( + name="RuntimeConfig", + config={ + "FAVICON_PROVIDER": "https://example.com/persona-first.ico", + "FAVICON_TIMEOUT": 10, + }, + ) + persona.ensure_dirs() + crawl = Crawl.objects.create( + urls="\n".join( + [ + "https://www.python.org/", + "https://www.djangoproject.com/", + "https://www.wikipedia.org/", + ], + ), + config={ + "PLUGINS": "favicon", + "CRAWL_MAX_CONCURRENT_SNAPSHOTS": 1, + }, + persona_id=persona.id, + created_by_id=get_or_create_system_user_pk(), + ) + runner = CrawlRunner(crawl) + completed_snapshot_ids: list[str] = [] + + async def update_config_between_snapshots(event: SnapshotCompletedEvent) -> None: + if event.snapshot_id in completed_snapshot_ids: + return + completed_snapshot_ids.append(event.snapshot_id) + if len(completed_snapshot_ids) == 1: + persona.config = { + **(persona.config or {}), + "FAVICON_PROVIDER": "https://example.com/persona-second.ico", + } + await persona.asave(update_fields=["config"]) + elif len(completed_snapshot_ids) == 2: + fresh_crawl = await Crawl.objects.aget(id=crawl.id) + fresh_crawl.config = { + **(fresh_crawl.config or {}), + "FAVICON_PROVIDER": "https://example.com/crawl-third.ico", + } + await fresh_crawl.asave(update_fields=["config"]) + + runner.bus.on(SnapshotCompletedEvent, update_config_between_snapshots) + + asyncio.run(runner.run()) + + favicon_processes = [ + process + for process in Process.objects.filter(process_type=Process.TypeChoices.HOOK).order_by("started_at") + if process.cmd and "on_Snapshot__11_favicon.finite.bg.py" in str(process.cmd[0]) + ] + providers = [process.env.get("FAVICON_PROVIDER") for process in favicon_processes] + + crawl.refresh_from_db() + assert crawl.status == Crawl.StatusChoices.SEALED + 
assert Snapshot.objects.filter(crawl=crawl, status=Snapshot.StatusChoices.SEALED).count() == 3 + assert ( + ArchiveResult.objects.filter(snapshot__crawl=crawl, plugin="favicon").exclude(status=ArchiveResult.StatusChoices.FAILED).count() + == 3 + ) + assert providers == [ + "https://example.com/persona-first.ico", + "https://example.com/persona-second.ico", + "https://example.com/crawl-third.ico", + ] + + +@pytest.mark.django_db(transaction=True) +@pytest.mark.django_db(transaction=True) +def test_run_pending_crawls_processes_queued_crawl_before_missing_binary_backlog(tmp_path): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.machine.models import Binary, Machine + from archivebox.services.runner import run_pending_crawls + from django.utils import timezone + + crawl = Crawl.objects.create( + urls="https://example.com", + config={ + "LIB_DIR": str(tmp_path / "lib"), + "PLUGINS": "__archivebox_test_no_plugins__", + "CHROME_BINARY": "", + }, + status=Crawl.StatusChoices.QUEUED, + retry_at=timezone.now(), + created_by_id=get_or_create_system_user_pk(), + ) + binary = Binary.objects.create( + machine=Machine.current(), + name=str(tmp_path / "missing-node"), + status=Binary.StatusChoices.QUEUED, + retry_at=timezone.now(), + binproviders="env,apt", + overrides={"apt": {"install_args": ["nodejs"]}}, + ) + + result = run_pending_crawls(daemon=False) + + crawl.refresh_from_db() + binary.refresh_from_db() + assert result == 0 + assert crawl.status == Crawl.StatusChoices.SEALED + assert crawl.retry_at is None + assert Snapshot.objects.filter(crawl=crawl, status=Snapshot.StatusChoices.SEALED).count() == 1 + assert binary.status == Binary.StatusChoices.QUEUED + assert binary.retry_at is None + + +@pytest.mark.django_db(transaction=True) +def test_sealed_crawl_does_not_create_discovered_snapshots(): + from archivebox.base_models.models import 
get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.SEALED, + retry_at=None, + max_depth=3, + ) + root = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.SEALED, + retry_at=None, + ) + + assert crawl.create_snapshots_from_urls() == [] + assert crawl.create_discovered_snapshot(root, url="https://example.com/child", depth=1) is None + assert crawl.snapshot_set.count() == 1 + + +# test_create_crawl_api_queues_crawl_without_spawning_runner moved to test_api_v1_crawls_crawls.py. + + +def test_wait_for_snapshot_tasks_surfaces_already_failed_task(): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.services import runner as runner_module + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + ) + crawl_runner = runner_module.CrawlRunner(crawl) + + async def run_test(): + task = asyncio.get_running_loop().create_future() + task.set_exception(RuntimeError("snapshot failed")) + crawl_runner.snapshot_tasks["snap-1"] = task + with pytest.raises(RuntimeError, match="snapshot failed"): + await crawl_runner.wait_for_snapshot_tasks() + + asyncio.run(run_test()) + + +def test_wait_for_snapshot_tasks_returns_after_completed_tasks_are_pruned(): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.services import runner as runner_module + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + ) + crawl_runner = runner_module.CrawlRunner(crawl) + + async def finish_snapshot() -> None: + await asyncio.sleep(0) + + async def run_test(): + task = 
asyncio.create_task(finish_snapshot()) + crawl_runner.snapshot_tasks["snap-1"] = task + await asyncio.wait_for(crawl_runner.wait_for_snapshot_tasks(), timeout=0.5) + assert crawl_runner.snapshot_tasks == {} + + asyncio.run(run_test()) + + +def test_abx_process_service_background_process_finishes_after_process_exit(tmp_path): + from abx_dl.events import ProcessCompletedEvent, ProcessEvent + from abx_dl.orchestrator import create_bus + from abx_dl.services.process_service import ProcessService + + bus = create_bus(name="test_abx_process_service_background_process_finishes_after_process_exit") + ProcessService(bus, emit_jsonl=False, interactive_tty=False) + emitted_events = [] + + async def collect_completed(event): + emitted_events.append(event) + + bus.on(ProcessCompletedEvent, collect_completed) + + plugin_output_dir = tmp_path / "chrome" + plugin_output_dir.mkdir() + + async def run_test(): + try: + event = ProcessEvent( + plugin_name="chrome", + hook_name="on_CrawlSetup__90_chrome_launch.daemon.bg", + hook_path=sys.executable, + hook_args=["-c", "print('daemon output')"], + env={}, + output_dir=str(plugin_output_dir), + timeout=60, + is_background=True, + url="https://example.org/", + process_type="hook", + worker_type="hook", + ) + await asyncio.wait_for(bus.emit(event).now(), timeout=0.5) + completed = await bus.find(ProcessCompletedEvent, past=True, future=5.0) + assert isinstance(completed, ProcessCompletedEvent) + await completed.event_results_list() + assert completed.status == "succeeded" + assert completed.stdout.strip() == "daemon output" + assert completed.output_dir == str(plugin_output_dir) + assert bus.event_is_child_of(completed, event) + finally: + await bus.destroy() + + asyncio.run(run_test()) + + assert not list(plugin_output_dir.glob("on_CrawlSetup__90_chrome_launch.daemon.bg.*.pid")) + assert any(isinstance(event, ProcessCompletedEvent) for event in emitted_events) + + +def 
test_run_pending_crawls_disables_missing_absolute_binary_backlog(tmp_path): + from archivebox.machine.models import Binary, Machine + from archivebox.services import runner as runner_module + + missing_binary = tmp_path / "missing-node" + binary = Binary.objects.create( + machine=Machine.current(), + name=str(missing_binary), + status=Binary.StatusChoices.QUEUED, + retry_at=runner_module.timezone.now(), + binproviders="env,apt", + overrides={"apt": {"install_args": ["nodejs"]}}, + ) + + result = runner_module.run_pending_crawls(daemon=False) + + binary.refresh_from_db() + assert result == 0 + assert binary.status == Binary.StatusChoices.QUEUED + assert binary.retry_at is None + + +@pytest.mark.django_db(transaction=True) +def test_crawl_completed_event_requeues_active_snapshots(): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.services.crawl_service import CrawlService + from abx_dl.events import CrawlCompletedEvent + from abx_dl.orchestrator import create_bus + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.STARTED, + retry_at=None, + ) + Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.STARTED, + retry_at=None, + ) + + bus = create_bus(name=f"test_crawl_completed_active_snapshots_{str(crawl.id).replace('-', '_')}") + service = CrawlService(bus, crawl_id=str(crawl.id)) + assert service is not None + + async def emit_completed() -> None: + try: + event = CrawlCompletedEvent( + url="https://example.com", + snapshot_id="", + output_dir=str(crawl.output_dir), + ) + emitted = bus.emit(event) + await emitted.wait() + await emitted.event_results_list() + await bus.wait_until_idle() + finally: + await bus.destroy() + + asyncio.run(emit_completed()) + + crawl.refresh_from_db() + assert crawl.status 
== Crawl.StatusChoices.STARTED + assert crawl.retry_at is not None + + +@pytest.mark.django_db(transaction=True) +def test_crawl_start_event_does_not_resurrect_cancelled_crawl(): + from django.utils import timezone + + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.services.crawl_service import CrawlService + from abx_dl.events import CrawlStartEvent + from abx_dl.orchestrator import create_bus + + now = timezone.now() + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.SEALED, + retry_at=now, + ) + + bus = create_bus(name=f"test_crawl_start_cancelled_{str(crawl.id).replace('-', '_')}") + service = CrawlService(bus, crawl_id=str(crawl.id)) + assert service is not None + + async def emit_start() -> None: + try: + event = CrawlStartEvent( + url="https://example.com", + snapshot_id="", + output_dir=str(crawl.output_dir), + ) + emitted = bus.emit(event) + await emitted.wait() + await emitted.event_results_list() + await bus.wait_until_idle() + finally: + await bus.destroy() + + asyncio.run(emit_start()) + + crawl.refresh_from_db() + assert crawl.status == Crawl.StatusChoices.SEALED + assert crawl.retry_at == now + + +@pytest.mark.django_db(transaction=True) +def test_crawl_cleanup_event_requeues_unfinished_crawl(): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.services.crawl_service import CrawlService + from abx_dl.events import CrawlCleanupEvent + from abx_dl.orchestrator import create_bus + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.STARTED, + retry_at=None, + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + 
status=Snapshot.StatusChoices.QUEUED, + retry_at=None, + ) + + bus = create_bus(name=f"test_crawl_cleanup_requeues_unfinished_{str(crawl.id).replace('-', '_')}") + service = CrawlService(bus, crawl_id=str(crawl.id)) + assert service is not None + + async def emit_cleanup() -> None: + try: + event = CrawlCleanupEvent( + url="https://example.com", + snapshot_id=str(snapshot.id), + output_dir=str(crawl.output_dir), + ) + emitted = bus.emit(event) + await emitted.wait() + await emitted.event_results_list() + await bus.wait_until_idle() + finally: + await bus.destroy() + + asyncio.run(emit_cleanup()) + + crawl.refresh_from_db() + assert crawl.status == Crawl.StatusChoices.STARTED + assert crawl.retry_at is not None + + +@pytest.mark.django_db(transaction=True) +def test_crawl_completed_event_seals_finished_crawl(): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.services.crawl_service import CrawlService + from abx_dl.events import CrawlCompletedEvent + from abx_dl.orchestrator import create_bus + from django.utils import timezone + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.STARTED, + retry_at=timezone.now(), + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.SEALED, + retry_at=None, + ) + + bus = create_bus(name=f"test_crawl_completed_finished_crawl_{str(crawl.id).replace('-', '_')}") + service = CrawlService(bus, crawl_id=str(crawl.id)) + assert service is not None + + async def emit_cleanup() -> None: + try: + event = CrawlCompletedEvent( + url="https://example.com", + snapshot_id=str(snapshot.id), + output_dir=str(crawl.output_dir), + ) + emitted = bus.emit(event) + await emitted.wait() + await emitted.event_results_list() + await bus.wait_until_idle() + finally: + await 
bus.destroy() + + asyncio.run(emit_cleanup()) + + crawl.refresh_from_db() + assert crawl.status == Crawl.StatusChoices.SEALED + assert crawl.retry_at is None + + +@pytest.mark.django_db(transaction=True) +def test_snapshot_completed_event_defers_finished_crawl_seal(): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.services.snapshot_service import SnapshotService + from abx_dl.events import SnapshotCompletedEvent + from abx_dl.orchestrator import create_bus + from django.utils import timezone + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.STARTED, + retry_at=timezone.now(), + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.STARTED, + retry_at=None, + ) + + bus = create_bus(name=f"test_snapshot_completed_finished_crawl_{str(crawl.id).replace('-', '_')}") + service = SnapshotService(bus, crawl_id=str(crawl.id), schedule_snapshot=lambda snapshot_id: asyncio.sleep(0)) + try: + + async def emit_completed() -> None: + await service.on_SnapshotCompletedEvent( + SnapshotCompletedEvent( + url="https://example.com", + snapshot_id=str(snapshot.id), + output_dir=str(snapshot.output_dir), + ), + ) + + asyncio.run(emit_completed()) + finally: + asyncio.run(bus.destroy()) + + snapshot.refresh_from_db() + crawl.refresh_from_db() + assert snapshot.status == Snapshot.StatusChoices.SEALED + assert crawl.status == Crawl.StatusChoices.STARTED + assert crawl.retry_at is not None + + +@pytest.mark.django_db(transaction=True) +def test_snapshot_completed_event_bus_defers_finished_crawl_seal(): + from archivebox.base_models.models import get_or_create_system_user_pk + from archivebox.crawls.models import Crawl + from archivebox.core.models import Snapshot + from archivebox.services.snapshot_service 
import SnapshotService + from abx_dl.events import SnapshotCompletedEvent + from abx_dl.orchestrator import create_bus + from django.utils import timezone + + crawl = Crawl.objects.create( + urls="https://example.com", + created_by_id=get_or_create_system_user_pk(), + status=Crawl.StatusChoices.STARTED, + retry_at=timezone.now(), + ) + snapshot = Snapshot.objects.create( + url="https://example.com", + crawl=crawl, + status=Snapshot.StatusChoices.STARTED, + retry_at=None, + ) + + bus = create_bus(name=f"test_snapshot_completed_bus_finished_crawl_{str(crawl.id).replace('-', '_')}") + service = SnapshotService(bus, crawl_id=str(crawl.id), schedule_snapshot=lambda snapshot_id: asyncio.sleep(0)) + assert service is not None + try: + + async def emit_completed() -> None: + emitted = bus.emit( + SnapshotCompletedEvent( + url="https://example.com", + snapshot_id=str(snapshot.id), + output_dir=str(snapshot.output_dir), + ), + ) + await emitted.wait() + await emitted.event_results_list() + + asyncio.run(emit_completed()) + finally: + asyncio.run(bus.wait_until_idle()) + asyncio.run(bus.destroy()) + + snapshot.refresh_from_db() + crawl.refresh_from_db() + assert snapshot.status == Snapshot.StatusChoices.SEALED + assert crawl.status == Crawl.StatusChoices.STARTED + assert crawl.retry_at is not None diff --git a/archivebox/tests/test_crawl_service.py b/archivebox/tests/test_crawl_service.py new file mode 100644 index 0000000000..ada4801d7c --- /dev/null +++ b/archivebox/tests/test_crawl_service.py @@ -0,0 +1,115 @@ +from pathlib import Path + +import pytest + +from archivebox.core.models import ArchiveResult, Snapshot +from archivebox.crawls.models import Crawl +from archivebox.tests.conftest import run_archivebox_cmd +from archivebox.tests.test_orm_helpers import use_archivebox_db +from .conftest import cli_env, get_free_port, init_archive + +pytestmark = pytest.mark.django_db(transaction=True) + + +def _crawl_state(cwd: Path, crawl_id: str) -> dict[str, object]: + with 
use_archivebox_db(cwd): + crawl = Crawl.objects.select_related("created_by").get(id=crawl_id) + snapshots = list( + Snapshot.objects.filter(crawl=crawl) + .order_by("depth", "url") + .values("id", "url", "depth", "status", "parent_snapshot_id", "downloaded_at"), + ) + results = list( + ArchiveResult.objects.filter(snapshot__crawl=crawl) + .order_by("snapshot__url", "plugin", "hook_name") + .values("snapshot__url", "plugin", "hook_name", "status", "output_files", "output_size"), + ) + return { + "status": crawl.status, + "retry_at": crawl.retry_at, + "urls": crawl.urls, + "config": crawl.config or {}, + "output_dir": Path(crawl.output_dir), + "snapshots": snapshots, + "results": results, + } + + +@pytest.mark.timeout(240) +def test_crawl_service_run_processes_queued_crawl_and_applies_crawl_config(tmp_path, recursive_test_site): + init_archive(tmp_path) + + port = get_free_port() + env = cli_env( + port=port, + PLUGINS="wget,parse_html_urls", + SAVE_WGET="True", + SAVE_FAVICON="False", + SAVE_TITLE="False", + ) + root_url = recursive_test_site["root_url"] + about_url = recursive_test_site["child_urls"][0] + contact_url = recursive_test_site["child_urls"][2] + + _cmd_result = run_archivebox_cmd( + [ + "add", + "--bg", + "--depth=0", + "--max-urls=20", + "--plugins=wget,parse_html_urls", + "--tag=crawl-service-e2e", + "--url-denylist=/contact$", + root_url, + about_url, + contact_url, + ], + cwd=tmp_path, + env=env, + timeout=120, + ) + add_stdout, add_stderr, add_code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + assert add_code == 0, f"archivebox add --bg failed with code {add_code}\nSTDOUT:\n{add_stdout}\nSTDERR:\n{add_stderr}" + + with use_archivebox_db(tmp_path): + latest_crawl_id = Crawl.objects.order_by("-created_at").values_list("id", flat=True).first() + assert latest_crawl_id is not None + crawl_id = str(latest_crawl_id) + queued_state = _crawl_state(tmp_path, crawl_id) + assert queued_state["status"] == Crawl.StatusChoices.QUEUED + 
assert queued_state["retry_at"] is not None + assert queued_state["config"]["PLUGINS"] == "wget,parse_html_urls" + assert queued_state["config"]["URL_DENYLIST"] == "/contact$" + assert queued_state["snapshots"] == [] + + _cmd_result = run_archivebox_cmd( + ["run", "--crawl-id", crawl_id], + cwd=tmp_path, + env=env, + timeout=240, + ) + run_stdout, run_stderr, run_code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + assert run_code == 0, f"archivebox run --crawl-id failed with code {run_code}\nSTDOUT:\n{run_stdout}\nSTDERR:\n{run_stderr}" + + state = _crawl_state(tmp_path, crawl_id) + snapshots = state["snapshots"] + results = state["results"] + snapshotted_urls = {row["url"] for row in snapshots} + + assert state["status"] == Crawl.StatusChoices.SEALED + assert state["retry_at"] is None + assert snapshotted_urls == {root_url, about_url} + assert contact_url not in snapshotted_urls + assert {row["depth"] for row in snapshots} == {0} + assert all(row["status"] == Snapshot.StatusChoices.SEALED for row in snapshots) + assert all(row["downloaded_at"] is not None for row in snapshots) + assert all("/contact" not in row["url"] for row in snapshots) + assert all(row["parent_snapshot_id"] is None for row in snapshots) + + result_statuses = {(row["plugin"], row["status"]) for row in results} + assert ("wget", ArchiveResult.StatusChoices.SUCCEEDED) in result_statuses + assert any(row["plugin"].endswith("parse_html_urls") and row["status"] == ArchiveResult.StatusChoices.SUCCEEDED for row in results) + assert any(row["plugin"] == "wget" and row["output_size"] > 0 for row in results) + + assert list((tmp_path / "archive/users/system/snapshots").rglob("wget/**/*.html")) + assert list((tmp_path / "archive/users/system/snapshots").rglob("parse_html_urls/**/urls.jsonl")) diff --git a/archivebox/tests/test_frozen_crawl_config.py b/archivebox/tests/test_frozen_crawl_config.py new file mode 100644 index 0000000000..e901df75a8 --- /dev/null +++ 
b/archivebox/tests/test_frozen_crawl_config.py @@ -0,0 +1,344 @@ +import time +from types import SimpleNamespace + +import pytest +from django.utils import timezone + +pytestmark = pytest.mark.django_db(transaction=True) + + +SENSITIVE_SECRET = "raw-twocaptcha-secret-for-frozen-crawl-test" +UPDATED_SECRET = "updated-secret-that-must-not-affect-old-crawl" + + +@pytest.fixture +def archivebox_db(initialized_archive): + from archivebox.tests.test_orm_helpers import use_archivebox_db + + with use_archivebox_db(initialized_archive): + yield initialized_archive + + +def _user(username="frozen-config-admin"): + from django.contrib.auth import get_user_model + + return get_user_model().objects.create_superuser( + username=username, + email=f"{username}@example.com", + password="testpassword", + ) + + +def _persona(user, *, name="Frozen Persona", secret=SENSITIVE_SECRET, user_agent="Frozen UA"): + from archivebox.personas.models import Persona + + persona = Persona.objects.create( + name=name, + created_by=user, + config={ + "PERMISSIONS": "private", + "USER_AGENT": user_agent, + "TWOCAPTCHA_API_KEY": secret, + "DELETE_AFTER": "2h", + }, + ) + persona.ensure_dirs() + return persona + + +def test_crawl_save_freezes_full_raw_persona_config_and_redacts_public_serialization(archivebox_db): + from archivebox.config.common import SENSITIVE_CONFIG_VALUE_REDACTED, get_config + from archivebox.crawls.models import Crawl + + user = _user() + persona = _persona(user) + + crawl = Crawl.objects.create( + urls="https://example.com/frozen", + persona=persona, + created_by=user, + config={"CRAWL_MAX_CONCURRENT_SNAPSHOTS": 3}, + status=Crawl.StatusChoices.QUEUED, + retry_at=timezone.now(), + ) + + assert "TIMEOUT" in crawl.config + assert "CHECK_SSL_VALIDITY" in crawl.config + assert crawl.config["USER_AGENT"] == "Frozen UA" + assert "TWOCAPTCHA_API_KEY" not in crawl.config + assert crawl.config["CRAWL_MAX_CONCURRENT_SNAPSHOTS"] == 3 + assert "ACTIVE_PERSONA" not in crawl.config + assert 
"DEFAULT_PERSONA" not in crawl.config + assert "CRAWL_DIR" not in crawl.config + assert "SNAP_DIR" not in crawl.config + assert "DEBUG" not in crawl.config + assert "SECRET_KEY" not in crawl.config + assert "PUBLIC_ADD_VIEW" not in crawl.config + assert "DATABASE_NAME" not in crawl.config + + persona.config["USER_AGENT"] = "Mutated UA" + persona.config["TWOCAPTCHA_API_KEY"] = UPDATED_SECRET + persona.save(update_fields=["config"]) + + runtime_config = get_config(crawl=crawl) + assert runtime_config.USER_AGENT == "Frozen UA" + assert runtime_config.TWOCAPTCHA_API_KEY == UPDATED_SECRET + redacted_runtime_config = get_config(crawl=crawl, redact_sensitive=True) + assert redacted_runtime_config.USER_AGENT == "Frozen UA" + assert redacted_runtime_config.TWOCAPTCHA_API_KEY == SENSITIVE_CONFIG_VALUE_REDACTED + execution_config = runtime_config.for_crawl() + assert execution_config["DEBUG"] is False + assert "CRAWL_DIR" not in execution_config + assert "SNAP_DIR" not in execution_config + assert "SECRET_KEY" not in execution_config + assert "PUBLIC_ADD_VIEW" not in execution_config + assert "DATABASE_NAME" not in execution_config + + public_json = crawl.to_json() + assert "TWOCAPTCHA_API_KEY" not in public_json["config"] + assert SENSITIVE_SECRET not in str(public_json) + + +def test_snapshot_config_overlays_frozen_crawl_without_re_reading_persona(archivebox_db): + from archivebox.config.common import get_config + from archivebox.core.models import Snapshot + from archivebox.crawls.models import Crawl + + user = _user("frozen-config-snapshot-admin") + persona = _persona(user, name="Frozen Snapshot Persona", user_agent="Crawl UA") + crawl = Crawl.objects.create(urls="https://example.com/root", persona=persona, created_by=user, config={"TIMEOUT": 11}) + snapshot = Snapshot.objects.create( + url="https://example.com/root", + crawl=crawl, + config={"TIMEOUT": 22, "ANTHROPIC_API_KEY": "snapshot-secret"}, + ) + + persona.config["TIMEOUT"] = 99 + 
persona.save(update_fields=["config"]) + + runtime_config = get_config(crawl=crawl, snapshot=snapshot) + assert runtime_config.USER_AGENT == "Crawl UA" + assert runtime_config.TIMEOUT == 22 + assert runtime_config.ANTHROPIC_API_KEY == "snapshot-secret" + assert snapshot.config == { + "TIMEOUT": 22, + "ANTHROPIC_API_KEY": "snapshot-secret", + "PERMISSIONS": "private", + } + + +def test_config_scopes_are_derived_from_section_and_field_metadata(): + from archivebox.config.common import ArchiveBoxConfig + + assert ArchiveBoxConfig.scope_for_key("TIMEOUT") == "crawl_frozen" + assert ArchiveBoxConfig.scope_for_key("DEBUG") == "crawl_execution" + assert ArchiveBoxConfig.scope_for_key("DEFAULT_PERSONA") == "crawl_execution" + assert ArchiveBoxConfig.scope_for_key("WGET_ENABLED") == "crawl_execution" + assert ArchiveBoxConfig.scope_for_key("SEARCH_BACKEND_SQLITE_ENABLED") == "crawl_execution" + assert ArchiveBoxConfig.scope_for_key("SEARCH_BACKEND_ENGINE") == "crawl_execution" + assert ArchiveBoxConfig.scope_for_key("WGET_WARC_ENABLED") == "crawl_frozen" + assert ArchiveBoxConfig.scope_for_key("SECRET_KEY") == "server" + assert ArchiveBoxConfig.scope_for_key("DATABASE_NAME") == "server" + + +def test_search_backend_engine_derives_default_backend_enabled_without_entering_hook_env(): + from archivebox.config.common import ArchiveBoxConfig + + default_runtime_config = ArchiveBoxConfig().for_crawl_runtime(extra_context={"snapshot_id": "default-runtime-config"}) + assert default_runtime_config["SEARCH_BACKEND_RIPGREP_ENABLED"] is True + assert default_runtime_config["SEARCH_BACKEND_SQLITE_ENABLED"] is False + assert default_runtime_config["SEARCH_BACKEND_SONIC_ENABLED"] is True + + sqlite_runtime_config = ArchiveBoxConfig(SEARCH_BACKEND_ENGINE="sqlite").for_crawl_runtime( + extra_context={"snapshot_id": "sqlite-runtime-config"}, + ) + assert sqlite_runtime_config["SEARCH_BACKEND_SQLITE_ENABLED"] is True + + config = ArchiveBoxConfig( + SEARCH_BACKEND_ENGINE="ripgrep", + 
SEARCH_BACKEND_SQLITE_ENABLED=False, + SEARCH_BACKEND_SONIC_ENABLED=True, + SECRET_KEY="server-secret", + DATABASE_NAME="server-db.sqlite3", + ) + + runtime_config = config.for_crawl_runtime(extra_context={"snapshot_id": "runtime-config"}) + + assert "SEARCH_BACKEND_ENGINE" not in runtime_config + assert runtime_config["SEARCH_BACKEND_RIPGREP_ENABLED"] is True + assert runtime_config["SEARCH_BACKEND_SQLITE_ENABLED"] is False + assert runtime_config["SEARCH_BACKEND_SONIC_ENABLED"] is True + assert "SECRET_KEY" not in runtime_config + assert "DATABASE_NAME" not in runtime_config + + +def test_plugin_selection_enabled_keys_are_derived_from_plugins_not_frozen_or_env_overridden(archivebox_db, monkeypatch): + from archivebox.config.common import get_config + from archivebox.crawls.models import Crawl + from archivebox.plugins.discovery import get_plugin_special_config + + monkeypatch.setenv("ARCHIVEDOTORG_ENABLED", "False") + user = _user("frozen-config-enabled-admin") + persona = _persona(user, name="Enabled Persona") + + env_default_crawl = Crawl( + urls="https://example.com/env-enabled", + persona=persona, + created_by=user, + config={}, + status=Crawl.StatusChoices.QUEUED, + retry_at=timezone.now(), + ) + env_default_crawl.save() + env_default_config = get_config(crawl=env_default_crawl, include_machine=False) + assert env_default_config.ARCHIVEDOTORG_ENABLED is False + + crawl = Crawl( + urls="https://example.com/enabled", + persona=persona, + created_by=user, + config={"PLUGINS": "archivedotorg", "ARCHIVEDOTORG_ENABLED": True, "DEFAULT_PERSONA": "Other"}, + status=Crawl.StatusChoices.QUEUED, + retry_at=timezone.now(), + ) + crawl.save() + + assert crawl.config["PLUGINS"] == "archivedotorg" + assert "ARCHIVEDOTORG_ENABLED" not in crawl.config + assert "DEFAULT_PERSONA" not in crawl.config + runtime_config = get_config(crawl=crawl, include_machine=False) + assert runtime_config.ARCHIVEDOTORG_ENABLED is True + assert runtime_config.WGET_ENABLED is False + assert 
get_plugin_special_config("archivedotorg", runtime_config)["enabled"] is True + assert get_plugin_special_config("wget", runtime_config)["enabled"] is False + + monkeypatch.delenv("ARCHIVEDOTORG_ENABLED") + + Crawl.objects.filter(id=crawl.id).update( + config={ + **crawl.config, + "ARCHIVEDOTORG_ENABLED": False, + "YTDLP_ENABLED": True, + "WGET_ENABLED": True, + }, + ) + crawl.refresh_from_db() + stale_runtime_config = get_config(crawl=crawl, include_machine=False) + assert stale_runtime_config.ARCHIVEDOTORG_ENABLED is True + assert stale_runtime_config.YTDLP_ENABLED is False + assert stale_runtime_config.WGET_ENABLED is False + + +def test_crawl_config_projections_stay_under_hot_path_budget(): + from archivebox.config.common import ArchiveBoxConfig + + config = ArchiveBoxConfig(TIMEOUT=12, USER_AGENT="Perf UA", CHROME_BINARY="perf-chrome") + persona = SimpleNamespace( + config={"USER_AGENT": "Persona UA"}, + get_derived_config=lambda: { + "ACTIVE_PERSONA": "Perf Persona", + "USER_AGENT": "Persona UA", + "CHROME_USER_DATA_DIR": "/tmp/persona/chrome", + }, + ) + crawl = SimpleNamespace(config={"TIMEOUT": 13, "CHROME_BINARY": "crawl-chrome"}) + snapshot = SimpleNamespace(config={"TIMEOUT": 14, "CHROME_BINARY": "snapshot-chrome"}) + runtime_kwargs = { + "crawl": crawl, + "snapshot": snapshot, + "persona": persona, + "crawl_output_dir": "/tmp/archivebox/crawls/perf", + "snapshot_output_dir": "/tmp/archivebox/crawls/perf/snapshots/example", + "extra_context": {"snapshot_id": "perf"}, + } + + methods = { + "for_crawl": config.for_crawl, + "for_crawl_frozen": lambda: config.for_crawl_frozen(persona=persona), + "for_crawl_runtime": lambda: config.for_crawl_runtime(**runtime_kwargs), + } + iterations = 250 + max_average_seconds = 0.020 + + for name, method in methods.items(): + method() + started_at = time.perf_counter() + for _ in range(iterations): + method() + average_seconds = (time.perf_counter() - started_at) / iterations + assert average_seconds < 
max_average_seconds, f"{name} averaged {average_seconds * 1000:.3f}ms" + + +# test_api_create_and_cli_add_store_full_frozen_config moved to test_api_v1_workflow_frozen_crawl_config_sources.py. + + +def test_schedule_enqueue_refreezes_using_current_template_persona_defaults(archivebox_db): + from archivebox.crawls.models import Crawl, CrawlSchedule + + user = _user("frozen-config-schedule-admin") + persona = _persona(user, name="Frozen Schedule Persona", user_agent="Initial schedule UA") + template = Crawl.objects.create( + urls="https://example.com/scheduled", + persona=persona, + created_by=user, + config={"TIMEOUT": 55, "SECRET_KEY": "template-secret-must-not-freeze", "PUBLIC_ADD_VIEW": True}, + status=Crawl.StatusChoices.PAUSED, + ) + schedule = CrawlSchedule.objects.create( + template=template, + schedule="daily", + created_by=user, + config={"TIMEOUT": 55, "SECRET_KEY": "schedule-secret-must-not-freeze", "PUBLIC_ADD_VIEW": True}, + ) + + assert schedule.config["TIMEOUT"] == 55 + assert "SECRET_KEY" in schedule.config + + persona.config["USER_AGENT"] = "Current schedule UA" + persona.config["TWOCAPTCHA_API_KEY"] = UPDATED_SECRET + persona.save(update_fields=["config"]) + + child = schedule.enqueue() + assert child.config["TIMEOUT"] == 55 + assert child.config["USER_AGENT"] == "Current schedule UA" + assert "TWOCAPTCHA_API_KEY" not in child.config + assert "SECRET_KEY" not in child.config + assert "PUBLIC_ADD_VIEW" not in child.config + assert template.config["USER_AGENT"] == "Initial schedule UA" + assert "TWOCAPTCHA_API_KEY" not in template.config + + +def test_crawl_config_backfill_migration_uses_frozen_config_helper(archivebox_db): + import importlib + + from django.apps import apps + from django.db import connection + + from archivebox.crawls.models import Crawl + + migration = importlib.import_module("archivebox.crawls.migrations.0018_freeze_crawl_config_snapshots") + user = _user("frozen-config-migration-admin") + persona = _persona(user, name="Migration 
Persona", user_agent="Migration UA") + crawl = Crawl( + urls="https://example.com/migration", + persona=persona, + created_by=user, + config={"TIMEOUT": 44, "CHROME_BINARY": "migration-chrome"}, + status=Crawl.StatusChoices.QUEUED, + retry_at=timezone.now(), + ) + crawl.save() + + Crawl.objects.filter(id=crawl.id).update(config={"TIMEOUT": 44, "CHROME_BINARY": "migration-chrome"}) + migration.freeze_existing_crawl_configs(apps, SimpleNamespace(connection=connection)) + + crawl.refresh_from_db() + assert crawl.config["TIMEOUT"] == 44 + assert "CHROME_BINARY" not in crawl.config + assert crawl.config["USER_AGENT"] == "Migration UA" + assert "TWOCAPTCHA_API_KEY" not in crawl.config + assert "ACTIVE_PERSONA" not in crawl.config + assert "DEFAULT_PERSONA" not in crawl.config + assert "CRAWL_DIR" not in crawl.config + assert "SNAP_DIR" not in crawl.config + assert "SECRET_KEY" not in crawl.config diff --git a/archivebox/tests/test_hooks.py b/archivebox/tests/test_hooks.py new file mode 100755 index 0000000000..ee614fc807 --- /dev/null +++ b/archivebox/tests/test_hooks.py @@ -0,0 +1,825 @@ +#!/usr/bin/env python3 +""" +Unit tests for the ArchiveBox hook architecture. + +Tests hook discovery, execution, JSONL parsing, background hook detection, +binary lookup, and required_binaries XYZ_BINARY passthrough handling. 
+ +Run with: + sudo -u testuser bash -c 'source .venv/bin/activate && python -m pytest archivebox/tests/test_hooks.py -v' +""" + +import json +import os +import shutil +import subprocess +import sys +import textwrap +from pathlib import Path + +import pytest +import rich_click as click + +# Set up Django before importing any Django-dependent modules +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "archivebox.settings") + +REPO_ROOT = Path(__file__).resolve().parents[2] +WORKSPACE_ROOT = REPO_ROOT.parent +RESULT_PREFIX = "__ARCHIVEBOX_TEST_RESULT__=" + + +def create_test_plugin_structure(plugins_dir: Path) -> None: + """Create a minimal plugin tree for hook discovery tests.""" + plugins_dir.mkdir() + + wget_dir = plugins_dir / "wget" + wget_dir.mkdir() + (wget_dir / "on_Snapshot__50_wget.py").write_text("# test hook") + + chrome_dir = plugins_dir / "chrome" + chrome_dir.mkdir(exist_ok=True) + (chrome_dir / "on_Snapshot__20_chrome_tab.daemon.bg.js").write_text("// background hook") + + consolelog_dir = plugins_dir / "consolelog" + consolelog_dir.mkdir() + (consolelog_dir / "on_Snapshot__21_consolelog.daemon.bg.js").write_text("// background hook") + + +def run_plugin_discovery_subprocess(tmp_path: Path, plugins_dir: Path, script: str): + env = os.environ.copy() + data_dir = tmp_path / "data" + data_dir.mkdir() + cwd_plugins_dir = data_dir / "custom_plugins" + if plugins_dir != cwd_plugins_dir: + shutil.copytree(plugins_dir, cwd_plugins_dir) + existing_pythonpath = [ + str(Path(entry).expanduser().resolve(strict=False)) + for entry in env.get("PYTHONPATH", "").split(os.pathsep) + if entry and Path(entry).expanduser().is_absolute() + ] + env["PYTHONPATH"] = os.pathsep.join(dict.fromkeys([str(REPO_ROOT), *existing_pythonpath])) + subprocess_script = "\n".join( + [ + "import json", + f"RESULT_PREFIX = {RESULT_PREFIX!r}", + "", + "def emit(value):", + " print(RESULT_PREFIX + json.dumps(value))", + "", + textwrap.dedent(script), + ], + ) + + result = subprocess.run( + [ + 
sys.executable, + "-c", + subprocess_script, + ], + cwd=data_dir, + env=env, + capture_output=True, + text=True, + timeout=30, + ) + assert result.returncode == 0, result.stderr + + for line in reversed(result.stdout.splitlines()): + if line.startswith(RESULT_PREFIX): + return json.loads(line.removeprefix(RESULT_PREFIX)) + + raise AssertionError(f"Subprocess did not emit a result line.\nstdout:\n{result.stdout}\nstderr:\n{result.stderr}") + + +def test_cli_env_does_not_emit_relative_pythonpath_entries(): + from archivebox.tests.conftest import cli_env + + old_pythonpath = os.environ.get("PYTHONPATH") + try: + os.environ["PYTHONPATH"] = os.pathsep.join(["../abxpkg", "../abx-plugins", "../abx-dl"]) + + env = cli_env() + finally: + if old_pythonpath is None: + os.environ.pop("PYTHONPATH", None) + else: + os.environ["PYTHONPATH"] = old_pythonpath + + pythonpath_entries = env["PYTHONPATH"].split(os.pathsep) + for repo_name in ("abxpkg", "abx-plugins", "abx-dl"): + repo_path = next(path for path in (WORKSPACE_ROOT / repo_name, REPO_ROOT / repo_name) if path.exists()) + assert str(repo_path.resolve(strict=False)) in pythonpath_entries + assert all(Path(entry).is_absolute() for entry in pythonpath_entries) + assert not any(entry.startswith("..") for entry in pythonpath_entries) + + +class TestBackgroundHookDetection: + """Test that background hooks are detected by .bg. 
suffix.""" + + def test_bg_js_suffix_detected(self): + """Hooks with .bg.js suffix should be detected as background.""" + from archivebox.plugins.hooks import is_background_hook + + assert is_background_hook("on_Snapshot__21_consolelog.daemon.bg.js") + + def test_bg_py_suffix_detected(self): + """Hooks with .bg.py suffix should be detected as background.""" + from archivebox.plugins.hooks import is_background_hook + + assert is_background_hook("on_Snapshot__24_responses.finite.bg.py") + + def test_bg_sh_suffix_detected(self): + """Hooks with .bg.sh suffix should be detected as background.""" + from archivebox.plugins.hooks import is_background_hook + + assert is_background_hook("on_Snapshot__23_ssl.daemon.bg.sh") + + def test_legacy_background_suffix_detected(self): + """Hooks with __background in stem should be detected (backwards compat).""" + from archivebox.plugins.hooks import is_background_hook + + assert is_background_hook("on_Snapshot__21_consolelog__background.js") + + def test_foreground_hook_not_detected(self): + """Hooks without .bg. or __background should NOT be detected as background.""" + from archivebox.plugins.hooks import is_background_hook + + assert not is_background_hook("on_Snapshot__11_favicon.js") + + def test_foreground_py_hook_not_detected(self): + """Python hooks without .bg. 
should NOT be detected as background.""" + from archivebox.plugins.hooks import is_background_hook + + assert not is_background_hook("on_Snapshot__50_wget.py") + + +class TestJSONLParsing: + """Test JSONL parsing in run_hook() output processing.""" + + def test_parse_clean_jsonl(self): + """Clean JSONL format should be parsed correctly.""" + stdout = '{"type": "ArchiveResult", "status": "succeeded", "output_str": "Done"}' + from archivebox.machine.models import Process + + records = Process.parse_records_from_text(stdout) + + assert len(records) == 1 + assert records[0]["type"] == "ArchiveResult" + assert records[0]["status"] == "succeeded" + assert records[0]["output_str"] == "Done" + + def test_parse_multiple_jsonl_records(self): + """Multiple JSONL records should all be parsed.""" + stdout = """{"type": "ArchiveResult", "status": "succeeded", "output_str": "Done"} +{"type": "Binary", "name": "wget", "abspath": "/usr/bin/wget"}""" + from archivebox.machine.models import Process + + records = Process.parse_records_from_text(stdout) + + assert len(records) == 2 + assert records[0]["type"] == "ArchiveResult" + assert records[1]["type"] == "Binary" + + def test_parse_jsonl_with_log_output(self): + """JSONL should be extracted from mixed stdout with log lines.""" + stdout = """Starting hook execution... 
+Processing URL: https://example.com +{"type": "ArchiveResult", "status": "succeeded", "output_str": "Downloaded"} +Hook completed successfully""" + from archivebox.machine.models import Process + + records = Process.parse_records_from_text(stdout) + + assert len(records) == 1 + assert records[0]["status"] == "succeeded" + + def test_ignore_invalid_json(self): + """Invalid JSON should be silently ignored.""" + stdout = """{"type": "ArchiveResult", "status": "succeeded"} +{invalid json here} +not json at all +{"type": "BinaryRequest", "name": "wget"}""" + from archivebox.machine.models import Process + + records = Process.parse_records_from_text(stdout) + + assert len(records) == 2 + + def test_json_without_type_ignored(self): + """JSON objects without 'type' field should be ignored.""" + stdout = """{"status": "succeeded", "output_str": "Done"} +{"type": "ArchiveResult", "status": "succeeded"}""" + from archivebox.machine.models import Process + + records = Process.parse_records_from_text(stdout) + + assert len(records) == 1 + assert records[0]["type"] == "ArchiveResult" + + +class TestRequiredBinaryConfigHandling: + """Test that required_binaries keep configured XYZ_BINARY values intact.""" + + def test_binary_env_var_absolute_path_handling(self): + """Absolute binary paths should pass through unchanged.""" + configured_binary = "/custom/path/to/wget2" + binary_name = configured_binary + + assert binary_name == "/custom/path/to/wget2" + + def test_binary_env_var_name_only_handling(self): + """Binary command names should pass through unchanged.""" + configured_binary = "wget2" + binary_name = configured_binary + + assert binary_name == "wget2" + + def test_binary_env_var_empty_default(self): + """Empty configured binary values should keep the schema default.""" + configured_binary = "" + if configured_binary: + binary_name = configured_binary + else: + binary_name = "wget" + + assert binary_name == "wget" + + +class TestHookDiscovery: + """Test hook discovery 
functions.""" + + def test_discover_hooks_by_event(self, tmp_path): + """discover_hooks() should find all hooks for an event.""" + plugins_dir = tmp_path / "plugins" + create_test_plugin_structure(plugins_dir) + + hooks = [] + for ext in ("sh", "py", "js"): + pattern = f"*/on_Snapshot__*.{ext}" + hooks.extend(plugins_dir.glob(pattern)) + + hooks = sorted(set(hooks), key=lambda p: p.name) + + assert len(hooks) == 3 + hook_names = [h.name for h in hooks] + assert "on_Snapshot__20_chrome_tab.daemon.bg.js" in hook_names + assert "on_Snapshot__21_consolelog.daemon.bg.js" in hook_names + assert "on_Snapshot__50_wget.py" in hook_names + + def test_discover_hooks_sorted_by_name(self, tmp_path): + """Hooks should be sorted by filename (numeric prefix ordering).""" + plugins_dir = tmp_path / "plugins" + create_test_plugin_structure(plugins_dir) + + hooks = [] + for ext in ("sh", "py", "js"): + pattern = f"*/on_Snapshot__*.{ext}" + hooks.extend(plugins_dir.glob(pattern)) + + hooks = sorted(set(hooks), key=lambda p: p.name) + + # Check numeric ordering + assert hooks[0].name == "on_Snapshot__20_chrome_tab.daemon.bg.js" + assert hooks[1].name == "on_Snapshot__21_consolelog.daemon.bg.js" + assert hooks[2].name == "on_Snapshot__50_wget.py" + + def test_normalize_hook_event_name_accepts_event_classes(self): + """Hook discovery should normalize bus event class names to hook families.""" + from archivebox.plugins import hooks as hooks_module + + assert hooks_module.normalize_hook_event_name("InstallEvent") == "Install" + assert hooks_module.normalize_hook_event_name("BinaryRequestEvent") == "BinaryRequest" + assert hooks_module.normalize_hook_event_name("CrawlSetupEvent") == "CrawlSetup" + assert hooks_module.normalize_hook_event_name("SnapshotEvent") == "Snapshot" + + def test_normalize_hook_event_name_strips_event_suffix_for_lifecycle_events(self): + """Lifecycle event names should normalize via simple suffix stripping.""" + from archivebox.plugins import hooks as hooks_module + + 
assert hooks_module.normalize_hook_event_name("BinaryEvent") == "Binary" + assert hooks_module.normalize_hook_event_name("CrawlEvent") == "Crawl" + assert hooks_module.normalize_hook_event_name("SnapshotCleanupEvent") == "SnapshotCleanup" + assert hooks_module.normalize_hook_event_name("CrawlCleanupEvent") == "CrawlCleanup" + + def test_discover_hooks_skips_plugins_with_disabled_required_dependencies(self, tmp_path): + """Plugins whose required_plugins are disabled should not run.""" + plugins_dir = tmp_path / "plugins" + create_test_plugin_structure(plugins_dir) + + chrome_dir = plugins_dir / "chrome" + chrome_dir.mkdir(exist_ok=True) + (chrome_dir / "config.json").write_text( + json.dumps( + { + "type": "object", + "required_plugins": [], + "properties": { + "CHROME_ENABLED": { + "type": "boolean", + "default": True, + "x-aliases": ["USE_CHROME"], + }, + }, + }, + ), + ) + (chrome_dir / "on_Snapshot__20_chrome.js").write_text("// chrome hook") + + accessibility_dir = plugins_dir / "accessibility" + accessibility_dir.mkdir(exist_ok=True) + (accessibility_dir / "config.json").write_text( + json.dumps( + { + "type": "object", + "required_plugins": ["chrome"], + "properties": { + "ACCESSIBILITY_ENABLED": { + "type": "boolean", + "default": True, + }, + }, + }, + ), + ) + (accessibility_dir / "on_Snapshot__10_accessibility.js").write_text("// accessibility hook") + + wget_dir = plugins_dir / "wget" + (wget_dir / "config.json").write_text( + json.dumps( + { + "type": "object", + "required_plugins": [], + "properties": { + "WGET_ENABLED": { + "type": "boolean", + "default": True, + "x-aliases": ["SAVE_WGET"], + }, + }, + }, + ), + ) + + hook_names = run_plugin_discovery_subprocess( + tmp_path, + plugins_dir, + """ + from archivebox.plugins import hooks as hooks_module + + hooks = hooks_module.discover_hooks("Snapshot", config={"CHROME_ENABLED": False, "WGET_ENABLED": True}) + emit([hook.parent.name for hook in hooks]) + """, + ) + assert "wget" in hook_names + assert 
"chrome" not in hook_names + assert "accessibility" not in hook_names + + def test_get_plugins_includes_config_only_plugin_dirs(self, tmp_path): + """get_plugins() should include config-only plugins with standardized metadata.""" + plugins_dir = tmp_path / "plugins" + create_test_plugin_structure(plugins_dir) + + helper_dir = plugins_dir / "helper" + helper_dir.mkdir() + (helper_dir / "config.json").write_text('{"type": "object", "properties": {}}') + + plugins = run_plugin_discovery_subprocess( + tmp_path, + plugins_dir, + """ + from archivebox.plugins import hooks as hooks_module + + from archivebox.plugins.discovery import get_plugins + get_plugins.cache_clear() + emit(get_plugins()) + """, + ) + assert "helper" in plugins + + def test_discover_binary_hooks_returns_empty(self, tmp_path): + """Binary provider hooks are owned by abxpkg, not ArchiveBox plugin discovery.""" + plugins_dir = tmp_path / "plugins" + create_test_plugin_structure(plugins_dir) + + hook_names = run_plugin_discovery_subprocess( + tmp_path, + plugins_dir, + """ + from archivebox.plugins import hooks as hooks_module + + from archivebox.plugins.discovery import get_plugins + get_plugins.cache_clear() + hooks = hooks_module.discover_hooks("BinaryRequest", filter_disabled=False) + emit([hook.name for hook in hooks]) + """, + ) + assert hook_names == [] + + def test_discover_hooks_accepts_event_class_names(self, tmp_path): + """discover_hooks should accept CrawlSetupEvent / SnapshotEvent class names.""" + plugins_dir = tmp_path / "plugins" + create_test_plugin_structure(plugins_dir) + chrome_dir = plugins_dir / "chrome" + (chrome_dir / "on_CrawlSetup__90_chrome_launch.daemon.bg.js").write_text("// crawl hook") + + hook_names = run_plugin_discovery_subprocess( + tmp_path, + plugins_dir, + """ + from archivebox.plugins import hooks as hooks_module + + from archivebox.plugins.discovery import get_plugins + get_plugins.cache_clear() + crawl_setup_hooks = hooks_module.discover_hooks("CrawlSetupEvent", 
filter_disabled=False) + snapshot_hooks = hooks_module.discover_hooks("SnapshotEvent", filter_disabled=False) + emit({ + "crawl_setup": [hook.name for hook in crawl_setup_hooks], + "snapshot": [hook.name for hook in snapshot_hooks], + }) + """, + ) + assert "on_CrawlSetup__90_chrome_launch.daemon.bg.js" in hook_names["crawl_setup"] + assert "on_Snapshot__50_wget.py" in hook_names["snapshot"] + + def test_discover_hooks_returns_empty_for_non_hook_lifecycle_events(self, tmp_path): + """Lifecycle events without a hook family should return no hooks.""" + plugins_dir = tmp_path / "plugins" + create_test_plugin_structure(plugins_dir) + + hooks = run_plugin_discovery_subprocess( + tmp_path, + plugins_dir, + """ + from archivebox.plugins import hooks as hooks_module + + from archivebox.plugins.discovery import get_plugins + get_plugins.cache_clear() + emit({ + "binary": [hook.name for hook in hooks_module.discover_hooks("BinaryEvent", filter_disabled=False)], + "crawl_cleanup": [ + hook.name for hook in hooks_module.discover_hooks("CrawlCleanupEvent", filter_disabled=False) + ], + }) + """, + ) + assert hooks["binary"] == [] + assert hooks["crawl_cleanup"] == [] + + +class TestGetExtractorName: + """Test get_extractor_name() function.""" + + def test_strip_numeric_prefix(self): + """Numeric prefix should be stripped from extractor name.""" + + # Inline implementation of get_extractor_name + def get_extractor_name(extractor: str) -> str: + parts = extractor.split("_", 1) + if len(parts) == 2 and parts[0].isdigit(): + return parts[1] + return extractor + + assert get_extractor_name("10_title") == "title" + assert get_extractor_name("26_readability") == "readability" + assert get_extractor_name("50_parse_html_urls") == "parse_html_urls" + + def test_no_prefix_unchanged(self): + """Extractor without numeric prefix should be unchanged.""" + + def get_extractor_name(extractor: str) -> str: + parts = extractor.split("_", 1) + if len(parts) == 2 and parts[0].isdigit(): + return 
parts[1] + return extractor + + assert get_extractor_name("title") == "title" + assert get_extractor_name("readability") == "readability" + + +class TestHookExecution: + """Test hook execution with real subprocesses.""" + + def test_python_hook_execution(self, tmp_path): + """Python hook should execute and output JSONL.""" + hook_path = tmp_path / "test_hook.py" + hook_path.write_text("""#!/usr/bin/env python3 +import json +print(json.dumps({"type": "ArchiveResult", "status": "succeeded", "output_str": "Test passed"})) +""") + + result = subprocess.run( + [sys.executable, str(hook_path)], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + from archivebox.machine.models import Process + + records = Process.parse_records_from_text(result.stdout) + assert records + assert records[0]["type"] == "ArchiveResult" + assert records[0]["status"] == "succeeded" + + def test_js_hook_execution(self, tmp_path): + """JavaScript hook should execute and output JSONL.""" + assert shutil.which("node") is not None, "Node.js not available" + + hook_path = tmp_path / "test_hook.js" + hook_path.write_text("""#!/usr/bin/env node +console.log(JSON.stringify({type: 'ArchiveResult', status: 'succeeded', output_str: 'JS test'})); +""") + + result = subprocess.run( + ["node", str(hook_path)], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + from archivebox.machine.models import Process + + records = Process.parse_records_from_text(result.stdout) + assert records + assert records[0]["type"] == "ArchiveResult" + assert records[0]["status"] == "succeeded" + + def test_hook_receives_cli_args(self, tmp_path): + """Hook should receive CLI arguments.""" + hook_path = tmp_path / "test_hook.py" + hook_path.write_text("""#!/usr/bin/env python3 +import sys +import json +# Simple arg parsing +args = {} +for arg in sys.argv[1:]: + if arg.startswith('--') and '=' in arg: + key, val = arg[2:].split('=', 1) + 
args[key.replace('-', '_')] = val +print(json.dumps({"type": "ArchiveResult", "status": "succeeded", "url": args.get("url", "")})) +""") + + result = subprocess.run( + [sys.executable, str(hook_path), "--url=https://example.com"], + cwd=tmp_path, + capture_output=True, + text=True, + ) + + assert result.returncode == 0 + from archivebox.machine.models import Process + + records = Process.parse_records_from_text(result.stdout) + assert records + assert records[0]["url"] == "https://example.com" + + +class TestDependencyRecordOutput: + """Test dependency record output format compliance.""" + + def test_dependency_record_outputs_binary(self): + """Dependency resolution should output Binary JSONL when binary is found.""" + hook_output = json.dumps( + { + "type": "Binary", + "name": "wget", + "abspath": "/usr/bin/wget", + "version": "1.21.3", + "sha256": None, + "binprovider": "apt", + }, + ) + + from archivebox.machine.models import Process + + data = Process.parse_records_from_text(hook_output)[0] + assert data["type"] == "Binary" + assert data["name"] == "wget" + assert data["abspath"].startswith("/") + + def test_dependency_record_outputs_binary_jsonl(self): + """Dependency resolution should output Binary JSONL.""" + hook_output = json.dumps( + { + "type": "Binary", + "name": "wget", + "abspath": "/usr/bin/wget", + "version": "1.21.3", + "binprovider": "env", + }, + ) + + from archivebox.machine.models import Process + + data = Process.parse_records_from_text(hook_output)[0] + assert data["type"] == "Binary" + assert data["name"] == "wget" + assert data["abspath"] == "/usr/bin/wget" + + +class TestSnapshotHookOutput: + """Test snapshot hook output format compliance.""" + + def test_snapshot_hook_basic_output(self): + """Snapshot hook should output clean ArchiveResult JSONL.""" + hook_output = json.dumps( + { + "type": "ArchiveResult", + "status": "succeeded", + "output_str": "Downloaded 5 files", + }, + ) + + from archivebox.machine.models import Process + + data = 
Process.parse_records_from_text(hook_output)[0] + assert data["type"] == "ArchiveResult" + assert data["status"] == "succeeded" + assert "output_str" in data + + def test_snapshot_hook_with_cmd(self): + """Snapshot hook should include cmd for binary FK lookup.""" + hook_output = json.dumps( + { + "type": "ArchiveResult", + "status": "succeeded", + "output_str": "Archived with wget", + "cmd": ["/usr/bin/wget", "-p", "-k", "https://example.com"], + }, + ) + + from archivebox.machine.models import Process + + data = Process.parse_records_from_text(hook_output)[0] + assert data["type"] == "ArchiveResult" + assert isinstance(data["cmd"], list) + assert data["cmd"][0] == "/usr/bin/wget" + + def test_snapshot_hook_with_output_json(self): + """Snapshot hook can include structured metadata in output_json.""" + hook_output = json.dumps( + { + "type": "ArchiveResult", + "status": "succeeded", + "output_str": "Got headers", + "output_json": { + "content-type": "text/html", + "server": "nginx", + "status-code": 200, + }, + }, + ) + + from archivebox.machine.models import Process + + data = Process.parse_records_from_text(hook_output)[0] + assert data["type"] == "ArchiveResult" + assert isinstance(data["output_json"], dict) + assert data["output_json"]["status-code"] == 200 + + def test_snapshot_hook_skipped_status(self): + """Snapshot hook should support skipped status.""" + hook_output = json.dumps( + { + "type": "ArchiveResult", + "status": "skipped", + "output_str": "SAVE_WGET=False", + }, + ) + + from archivebox.machine.models import Process + + data = Process.parse_records_from_text(hook_output)[0] + assert data["status"] == "skipped" + + def test_snapshot_hook_failed_status(self): + """Snapshot hook should support failed status.""" + hook_output = json.dumps( + { + "type": "ArchiveResult", + "status": "failed", + "output_str": "404 Not Found", + }, + ) + + from archivebox.machine.models import Process + + data = Process.parse_records_from_text(hook_output)[0] + assert 
data["status"] == "failed" + + +class TestPluginMetadata: + """Test that plugin metadata is added to JSONL records.""" + + def test_plugin_name_added(self): + """run_hook() should add plugin name to records.""" + # Simulate what run_hook() does + script = Path("/abx_plugins/plugins/wget/on_Snapshot__50_wget.py") + plugin_name = script.parent.name + + record = {"type": "ArchiveResult", "status": "succeeded"} + record["plugin"] = plugin_name + record["plugin_hook"] = str(script) + + assert record["plugin"] == "wget" + assert "on_Snapshot__50_wget.py" in record["plugin_hook"] + + +@pytest.mark.django_db(transaction=True) +def test_run_hook_exports_singular_node_modules_dir_with_colon_node_path(tmp_path): + """Hook subprocesses must get a real NODE_MODULES_DIR even when NODE_PATH has multiple entries.""" + from archivebox.plugins.hooks import run_hook + + lib_dir = tmp_path / "lib" + node_modules_dir = lib_dir / "pnpm" / "packages" / "chrome" / "node_modules" + configured_node_path = os.pathsep.join( + [ + "/home/archivebox/.pnpm/packages/chrome/node_modules", + "/usr/lib/node_modules", + str(node_modules_dir), + "/usr/share/archivebox/lib/pnpm/packages/chrome/node_modules", + ], + ) + + plugin_dir = tmp_path / "plugins" / "envprobe" + plugin_dir.mkdir(parents=True) + hook_path = plugin_dir / "on_Snapshot__99_envprobe.py" + hook_path.write_text( + """#!/usr/bin/env python3 +import json +import os + +print(json.dumps({ + "NODE_PATH": os.environ.get("NODE_PATH"), + "NODE_MODULES_DIR": os.environ.get("NODE_MODULES_DIR"), + "NODE_MODULE_DIR": os.environ.get("NODE_MODULE_DIR"), +})) +""", + encoding="utf-8", + ) + hook_path.chmod(0o755) + + output_dir = tmp_path / "archive" / "users" / "system" / "snapshots" / "20260513" / "example.com" / "test" / "envprobe" + process = run_hook( + hook_path, + output_dir, + config={ + "LIB_DIR": str(lib_dir), + "NODE_PATH": configured_node_path, + }, + timeout=10, + ) + process.refresh_from_db() + + assert process.exit_code == 0, 
process.stderr + payload = json.loads(process.stdout.strip()) + assert payload["NODE_MODULES_DIR"] == str(node_modules_dir) + assert payload["NODE_MODULE_DIR"] == str(node_modules_dir) + assert payload["NODE_PATH"].split(os.pathsep) == configured_node_path.split(os.pathsep) + assert process.env["NODE_MODULES_DIR"] == str(node_modules_dir) + + +@pytest.mark.django_db(transaction=True) +def test_run_hook_executes_python_hooks_through_script_shebang(tmp_path): + """Python hooks must use their abxpkg script header instead of sys.executable.""" + from archivebox.plugins.hooks import run_hook + + plugin_dir = tmp_path / "plugins" / "shebangprobe" + plugin_dir.mkdir(parents=True) + hook_path = plugin_dir / "on_Snapshot__99_shebangprobe.py" + hook_path.write_text( + """#!/usr/bin/env -S abxpkg run --script python3 +# /// script +# requires-python = ">=3.12" +# /// +import json +import os +import rich_click + +print(json.dumps({ + "ABXPKG_FAST_SCRIPT": os.environ.get("ABXPKG_FAST_SCRIPT"), + "RICH_CLICK_FILE": rich_click.__file__, +})) +""", + encoding="utf-8", + ) + hook_path.chmod(0o755) + + output_dir = tmp_path / "archive" / "users" / "system" / "snapshots" / "20260603" / "example.com" / "test" / "shebangprobe" + process = run_hook( + hook_path, + output_dir, + config={ + "LIB_DIR": str(tmp_path / "lib"), + }, + timeout=10, + ) + process.refresh_from_db() + + assert process.cmd[0] == str(hook_path) + assert process.exit_code == 0, process.stderr + payload = json.loads(process.stdout.strip()) + assert payload["ABXPKG_FAST_SCRIPT"] == "1" + assert Path(payload["RICH_CLICK_FILE"]).resolve() == Path(click.__file__).resolve() diff --git a/archivebox/tests/test_machine_models.py b/archivebox/tests/test_machine_models.py new file mode 100644 index 0000000000..9a8c24d785 --- /dev/null +++ b/archivebox/tests/test_machine_models.py @@ -0,0 +1,969 @@ +""" +Unit tests for machine module models: Machine, NetworkInterface, Binary, Process. + +Tests cover: +1. 
Machine model creation and current() method +2. NetworkInterface model and network detection +3. Binary model lifecycle and state machine +4. Process model lifecycle, hierarchy, and state machine +5. JSONL serialization/deserialization +6. Manager methods +7. Process tracking methods (replacing pid_utils) +""" + +import os +import subprocess +import sys +from datetime import timedelta +from pathlib import Path +from typing import cast + +import pytest +from django.db import transaction +from django.utils import timezone + +from archivebox.machine.models import ( + BinaryManager, + Machine, + NetworkInterface, + Binary, + Process, + BinaryMachine, + ProcessMachine, + MACHINE_RECHECK_INTERVAL, + PID_REUSE_WINDOW, + PROCESS_TIMEOUT_GRACE, +) + +pytestmark = pytest.mark.django_db + + +def _reset_machine_model_caches(): + import archivebox.machine.models as models + + models._CURRENT_MACHINE = None + models._CURRENT_INTERFACE = None + models._CURRENT_PROCESS = None + models._CURRENT_BINARIES = {} + + +@pytest.fixture(autouse=True) +def reset_machine_model_caches(): + _reset_machine_model_caches() + yield + _reset_machine_model_caches() + + +@pytest.fixture +def machine(): + return Machine.current() + + +@pytest.fixture +def binary(machine): + return Binary.objects.create( + machine=machine, + name="test-binary", + binproviders="env", + ) + + +@pytest.fixture +def process(machine): + return Process.objects.create( + machine=machine, + cmd=["echo", "test"], + pwd="/tmp", + ) + + +@pytest.fixture +def cleanup_paths(): + paths: list[Path] = [] + yield paths + for path in reversed(paths): + path.unlink(missing_ok=True) + + +class TestMachineModel: + """Test the Machine model.""" + + def test_machine_config_save_heals_json_encoded_string_values(self, machine): + machine.config = { + "EXTRA_CONTEXT": 'prefix "inner" suffix', + "USER_AGENT": '"ArchiveBox \\"Quoted\\" Agent"', + } + machine.save(update_fields=["config"]) + + machine.refresh_from_db() + + assert 
machine.config["EXTRA_CONTEXT"] == 'prefix "inner" suffix' + assert machine.config["USER_AGENT"] == 'ArchiveBox "Quoted" Agent' + + def test_machine_current_creates_machine(self): + """Machine.current() should create a machine if none exists.""" + machine = Machine.current() + + assert machine is not None + assert machine.id is not None + assert machine.guid is not None + assert machine.hostname == os.uname().nodename + assert machine.os_family in ["linux", "darwin", "windows", "freebsd"] + + def test_machine_current_returns_cached(self): + """Machine.current() should return cached machine within recheck interval.""" + machine1 = Machine.current() + machine2 = Machine.current() + + assert machine1.id == machine2.id + + def test_machine_current_refreshes_after_interval(self): + """Machine.current() should refresh after recheck interval.""" + import archivebox.machine.models as models + + machine1 = Machine.current() + + # Manually expire the cache by modifying modified_at + machine1.modified_at = timezone.now() - timedelta(seconds=MACHINE_RECHECK_INTERVAL + 1) + machine1.save() + models._CURRENT_MACHINE = machine1 + + machine2 = Machine.current() + + # Should have fetched/updated the machine (same GUID) + assert machine1.guid == machine2.guid + + def test_machine_current_recreates_stale_cached_row(self): + """Machine.current() should recreate the cached machine if the row was deleted.""" + import archivebox.machine.models as models + + machine1 = Machine.current() + machine1_id = machine1.id + machine1_guid = machine1.guid + + machine1.delete() + models._CURRENT_MACHINE = machine1 + + machine2 = Machine.current() + + assert machine1_id != machine2.id + assert machine1_guid == machine2.guid + + def test_machine_from_jsonl_update(self, cleanup_paths): + """Machine.from_json() should update machine config.""" + from archivebox.config.constants import CONSTANTS + + Machine.current() # Ensure machine exists + wget_path = CONSTANTS.DEFAULT_LIB_DIR / "wget" + 
wget_path.parent.mkdir(parents=True, exist_ok=True) + wget_path.write_text("#!/bin/sh\n") + cleanup_paths.append(wget_path) + record = { + "config": { + "WGET_BINARY": str(wget_path), + }, + } + + result = Machine.from_json(record) + + assert result is not None + assert result.config.get("WGET_BINARY") == str(wget_path) + + def test_machine_from_jsonl_drops_invalid_binary_paths_keeps_mirror(self, cleanup_paths): + """Machine.from_json() drops invalid binary paths but mirrors other keys. + + ``Machine.config`` mirrors ``ArchiveBox.conf`` (non-binary user config + keys live alongside derived binary state), so non-binary keys in the + import survive. Only ``_BINARY`` paths get validated/dropped on import. + """ + from archivebox.config.constants import CONSTANTS + + Machine.current() # Ensure machine exists + wget_path = CONSTANTS.DEFAULT_LIB_DIR / "wget" + wget_path.parent.mkdir(parents=True, exist_ok=True) + wget_path.write_text("#!/bin/sh\n") + cleanup_paths.append(wget_path) + record = { + "config": { + "WGET_BINARY": str(wget_path), + "CHROMIUM_VERSION": "123.4.5", + "YTDLP_BINARY": "/tmp/archivebox-test-missing-yt-dlp", + }, + } + + result = Machine.from_json(record) + + assert result is not None + assert result.config.get("WGET_BINARY") == str(wget_path) + assert result.config.get("CHROMIUM_VERSION") == "123.4.5" + assert "YTDLP_BINARY" not in result.config + + def test_machine_from_jsonl_invalid(self): + """Machine.from_json() should return None for invalid records.""" + result = Machine.from_json({"invalid": "record"}) + assert result is None + + def test_machine_current_drops_invalid_binary_paths_keeps_mirror(self, cleanup_paths): + """Machine.current() mirrors ArchiveBox.conf, only drops invalid binaries. + + ``Machine.config`` is the file ↔ DB mirror of ``ArchiveBox.conf``, so + non-binary keys (``CHROME_ISOLATION``, ``CHROMIUM_VERSION``, etc.) are + preserved on read.
Only ``_BINARY`` paths get validated against + ``LIB_DIR`` and dropped when stale/missing. + """ + import archivebox.machine.models as models + from archivebox.config.constants import CONSTANTS + + active_lib_dir = CONSTANTS.DEFAULT_LIB_DIR + active_lib_dir.mkdir(parents=True, exist_ok=True) + chrome_path = active_lib_dir / "chromium" + node_path = active_lib_dir / "node" + chrome_path.write_text("#!/bin/sh\n") + node_path.write_text("#!/bin/sh\n") + external_path = Path("/tmp/archivebox-test-external-node") + external_path.touch() + cleanup_paths.extend([chrome_path, node_path, external_path]) + machine = Machine.current() + machine.config = { + "CHROME_BINARY": str(chrome_path), + "NODE_BINARY": str(node_path), + "ABX_INSTALL_CACHE": {"wget": "2026-03-24T00:00:00+00:00"}, + "CHROME_ISOLATION": "snapshot", + "CHROME_USER_DATA_DIR": "/tmp/profile", + "CHROMIUM_VERSION": "123.4.5", + "YTDLP_BINARY": str(external_path), + "WGET_BINARY": "/tmp/archivebox-test-missing-wget", + } + machine.save(update_fields=["config"]) + models._CURRENT_MACHINE = machine + + refreshed = Machine.current(refresh=True) + + # Valid binary paths inside LIB_DIR survive. + assert refreshed.config.get("CHROME_BINARY") == str(chrome_path) + assert refreshed.config.get("NODE_BINARY") == str(node_path) + # Non-binary mirror keys survive — they belong to ArchiveBox.conf. + assert refreshed.config.get("ABX_INSTALL_CACHE") == {"wget": "2026-03-24T00:00:00+00:00"} + assert refreshed.config.get("CHROME_ISOLATION") == "snapshot" + assert refreshed.config.get("CHROME_USER_DATA_DIR") == "/tmp/profile" + assert refreshed.config.get("CHROMIUM_VERSION") == "123.4.5" + # Stale binary paths get dropped: YTDLP_BINARY outside LIB_DIR, + # WGET_BINARY path doesn't exist.
+ assert "YTDLP_BINARY" not in refreshed.config + assert "WGET_BINARY" not in refreshed.config + + def test_get_config_auto_applies_current_machine_config(self, cleanup_paths): + """get_config() applies the full Machine.config mirror as scope overrides. + + ``Machine.config`` mirrors ``ArchiveBox.conf``, so non-binary user keys + like ``CHROME_ISOLATION`` flow through into the merged ``get_config()`` + result alongside validated binary paths. + """ + import archivebox.machine.models as models + from archivebox.config.common import get_config + + lib_dir = get_config(include_machine=False).LIB_DIR + chrome_path = lib_dir / "chromium" + chrome_path.parent.mkdir(parents=True, exist_ok=True) + chrome_path.write_text("#!/bin/sh\n") + cleanup_paths.append(chrome_path) + machine = Machine.current() + machine.config = { + "CHROME_BINARY": str(chrome_path), + "ABX_INSTALL_CACHE": {"chrome": "2026-03-24T00:00:00+00:00"}, + "CHROME_ISOLATION": "snapshot", + } + machine.save(update_fields=["config"]) + models._CURRENT_MACHINE = machine + + config = get_config() + + assert config.CHROME_BINARY == str(chrome_path) + assert config.CHROME_ISOLATION == "snapshot" + + def test_machine_manager_current(self): + """Machine.objects.current() should return current machine.""" + machine = Machine.current() + assert machine is not None + assert machine.id == Machine.current().id + + +class TestNetworkInterfaceModel: + """Test the NetworkInterface model.""" + + def test_networkinterface_current_creates_interface(self): + """NetworkInterface.current() should create an interface if none exists.""" + interface = NetworkInterface.current() + + assert interface is not None + assert interface.id is not None + assert interface.machine is not None + assert interface.ip_local is not None + + def test_networkinterface_current_returns_cached(self): + """NetworkInterface.current() should return cached interface within recheck interval.""" + interface1 = NetworkInterface.current() + interface2 = 
NetworkInterface.current() + + assert interface1.id == interface2.id + + def test_networkinterface_manager_current(self): + """NetworkInterface.objects.current() should return current interface.""" + interface = NetworkInterface.current() + assert interface is not None + + +class TestBinaryModel: + """Test the Binary model.""" + + @pytest.fixture(autouse=True) + def setup_machine(self, machine): + self.machine = machine + + def test_binary_creation(self): + """Binary should be created with default values.""" + binary = Binary.objects.create( + machine=self.machine, + name="wget", + binproviders="apt,brew,env", + ) + + assert binary.id is not None + assert binary.name == "wget" + assert binary.status == Binary.StatusChoices.QUEUED + assert not binary.is_valid + + def test_binary_is_valid(self): + """Binary.is_valid should be True for installed binaries with a resolved path.""" + binary = Binary.objects.create( + machine=self.machine, + name="wget", + abspath="/usr/bin/wget", + version="1.21", + status=Binary.StatusChoices.INSTALLED, + ) + + assert binary.is_valid + + def test_binary_manager_get_valid_binary(self): + """BinaryManager.get_valid_binary() should find valid binaries.""" + # Create invalid binary (no abspath) + Binary.objects.create(machine=self.machine, name="wget") + + # Create valid binary + Binary.objects.create( + machine=self.machine, + name="wget", + abspath="/usr/bin/wget", + version="1.21", + status=Binary.StatusChoices.INSTALLED, + ) + + result = cast(BinaryManager, Binary.objects).get_valid_binary("wget") + + assert result is not None + assert result.abspath == "/usr/bin/wget" + + def test_binary_update_and_requeue(self): + """Binary.update_and_requeue() should update fields and save.""" + binary = Binary.objects.create(machine=self.machine, name="test") + old_modified = binary.modified_at + + binary.update_and_requeue( + status=Binary.StatusChoices.QUEUED, + retry_at=timezone.now() + timedelta(seconds=60), + ) + + binary.refresh_from_db() + 
assert binary.status == Binary.StatusChoices.QUEUED + assert binary.modified_at > old_modified + + def test_binary_from_json_preserves_provider_overrides(self): + """Binary.from_json() should persist provider overrides unchanged.""" + overrides = { + "apt": {"install_args": ["chromium"]}, + "pnpm": {"install_args": "puppeteer"}, + "custom": {"install": "bash -lc 'echo ok'"}, + } + + binary = Binary.from_json( + { + "name": "chrome", + "binproviders": "apt,pnpm,custom", + "overrides": overrides, + }, + ) + + assert binary is not None + assert binary.overrides == overrides + + def test_binary_from_json_canonicalizes_path_like_names(self): + """Binary.from_json() should store command names, not path cache values.""" + binary = Binary.from_json( + { + "name": "/tmp/old-lib/pip/venv/bin/trafilatura", + "binproviders": "env,pip", + "overrides": {"pip": {"install_args": ["trafilatura"]}}, + }, + ) + + assert binary is not None + assert binary.name == "trafilatura" + + def test_binary_from_json_does_not_coerce_legacy_override_shapes(self): + """Binary.from_json() should no longer translate legacy non-dict provider overrides.""" + overrides = { + "apt": ["chromium"], + "pnpm": "puppeteer", + } + + binary = Binary.from_json( + { + "name": "chrome", + "binproviders": "apt,pnpm", + "overrides": overrides, + }, + ) + + assert binary is not None + assert binary.overrides == overrides + + def test_binary_from_json_preserves_readability_package_metadata(self): + """Binary.from_json() should preserve readability's pnpm package metadata.""" + binary = Binary.from_json( + { + "name": "readability-extractor", + "binproviders": "env,pnpm", + "overrides": { + "pnpm": { + "install_args": ["readability-extractor"], + }, + }, + }, + ) + + assert binary is not None + assert binary.overrides == { + "pnpm": { + "install_args": ["readability-extractor"], + }, + } + + @pytest.mark.django_db(transaction=True) + def test_binary_lib_bin_symlink_waits_for_outer_transaction_commit(self, tmp_path): + 
"""Binary DB projection writes can be direct, but convenience symlinks must run after commit.""" + source = tmp_path / "provider" / "bin" / "abx-test-binary" + source.parent.mkdir(parents=True) + source.write_text("#!/bin/sh\nexit 0\n") + source.chmod(0o755) + lib_bin_dir = tmp_path / "lib" / "bin" + symlink = lib_bin_dir / "abx-test-binary" + + with transaction.atomic(): + binary = Binary.objects.create( + machine=self.machine, + name="abx-test-binary", + abspath=str(source), + version="1.0.0", + status=Binary.StatusChoices.INSTALLED, + ) + binary.symlink_to_lib_bin_after_commit(lib_bin_dir) + assert not symlink.exists() + + assert symlink.is_symlink() + assert symlink.resolve() == source + + +class TestBinaryStateMachine: + """Test the BinaryMachine state machine.""" + + @pytest.fixture(autouse=True) + def setup_binary(self, binary): + self.binary = binary + + def test_binary_state_machine_initial_state(self): + """BinaryMachine should start in queued state.""" + sm = BinaryMachine(self.binary) + assert sm.current_state_value == Binary.StatusChoices.QUEUED + + def test_binary_state_machine_can_start(self): + """BinaryMachine.can_start() should check name and binproviders.""" + sm = BinaryMachine(self.binary) + assert sm.can_install() + + self.binary.binproviders = "" + self.binary.save() + sm = BinaryMachine(self.binary) + assert not sm.can_install() + + +class TestProcessModel: + """Test the Process model.""" + + @pytest.fixture(autouse=True) + def setup_machine(self, machine): + self.machine = machine + + def test_process_creation(self): + """Process should be created with default values.""" + process = Process.objects.create( + machine=self.machine, + cmd=["echo", "hello"], + pwd="/tmp", + ) + + assert process.id is not None + assert process.cmd == ["echo", "hello"] + assert process.status == Process.StatusChoices.QUEUED + assert process.pid is None + assert process.exit_code is None + + def test_process_to_jsonl(self): + """Process.to_json() should serialize 
correctly.""" + process = Process.objects.create( + machine=self.machine, + cmd=["echo", "hello"], + pwd="/tmp", + timeout=60, + ) + json_data = process.to_json() + + assert json_data["type"] == "Process" + assert json_data["cmd"] == ["echo", "hello"] + assert json_data["pwd"] == "/tmp" + assert json_data["timeout"] == 60 + + def test_process_update_and_requeue(self): + """Process.update_and_requeue() should update fields and save.""" + process = Process.objects.create(machine=self.machine, cmd=["test"]) + + process.update_and_requeue( + status=Process.StatusChoices.RUNNING, + pid=12345, + started_at=timezone.now(), + ) + + process.refresh_from_db() + assert process.status == Process.StatusChoices.RUNNING + assert process.pid == 12345 + assert process.started_at is not None + + +class TestProcessCurrent: + """Test Process.current() method.""" + + def test_process_current_creates_record(self): + """Process.current() should create a Process for current PID.""" + proc = Process.current() + + assert proc is not None + assert proc.pid == os.getpid() + assert proc.status == Process.StatusChoices.RUNNING + assert proc.machine is not None + assert proc.iface is not None + assert proc.iface.machine_id == proc.machine_id + assert proc.started_at is not None + + def test_process_current_caches(self): + """Process.current() should cache the result.""" + proc1 = Process.current() + proc2 = Process.current() + + assert proc1.id == proc2.id + + def test_process_detect_type_runner(self): + """_detect_process_type should detect the background runner command.""" + old_argv = sys.argv + try: + sys.argv = ["archivebox", "run", "--daemon"] + result = Process._detect_process_type() + assert result == Process.TypeChoices.ORCHESTRATOR + finally: + sys.argv = old_argv + + def test_process_detect_type_runner_watch(self): + """runner_watch should be classified as a worker, not the orchestrator itself.""" + old_argv = sys.argv + try: + sys.argv = ["archivebox", "manage", "runner_watch", 
"--bind-url=http://127.0.0.1:8000"] + result = Process._detect_process_type() + assert result == Process.TypeChoices.WORKER + finally: + sys.argv = old_argv + + def test_process_detect_type_cli(self): + """_detect_process_type should detect CLI commands.""" + old_argv = sys.argv + try: + sys.argv = ["archivebox", "add", "http://example.com"] + result = Process._detect_process_type() + assert result == Process.TypeChoices.ADD + finally: + sys.argv = old_argv + + def test_process_detect_type_binary(self): + """_detect_process_type should detect non-ArchiveBox subprocesses as binary processes.""" + old_argv = sys.argv + try: + sys.argv = ["/usr/bin/wget", "https://example.com"] + result = Process._detect_process_type() + assert result == Process.TypeChoices.BINARY + finally: + sys.argv = old_argv + + def test_process_proc_allows_interpreter_wrapped_script(self, tmp_path): + """Process.proc should accept a script recorded in DB when wrapped by an interpreter in psutil.""" + import psutil + + script = tmp_path / "on_CrawlSetup__90_chrome_launch.daemon.bg.py" + script.write_text("import time\ntime.sleep(30)\n", encoding="utf-8") + process = subprocess.Popen( + [sys.executable, str(script), "--url=https://example.com/"], + stdin=subprocess.DEVNULL, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + + def cleanup_process(): + if process.poll() is None: + process.terminate() + try: + process.wait(timeout=5) + except subprocess.TimeoutExpired: + process.kill() + process.wait(timeout=5) + + try: + os_proc = psutil.Process(process.pid) + proc = Process.objects.create( + machine=Machine.current(), + cmd=[str(script), "--url=https://example.com/"], + pid=process.pid, + status=Process.StatusChoices.RUNNING, + started_at=timezone.datetime.fromtimestamp(os_proc.create_time(), tz=timezone.get_current_timezone()), + ) + + resolved_proc = proc.proc + assert resolved_proc is not None + assert resolved_proc.pid == process.pid + finally: + cleanup_process() + + +class 
TestProcessHierarchy: + """Test Process parent/child relationships.""" + + @pytest.fixture(autouse=True) + def setup_machine(self, machine): + self.machine = machine + + def test_process_parent_child(self): + """Process should track parent/child relationships.""" + parent = Process.objects.create( + machine=self.machine, + process_type=Process.TypeChoices.CLI, + status=Process.StatusChoices.RUNNING, + pid=1, + started_at=timezone.now(), + ) + + child = Process.objects.create( + machine=self.machine, + parent=parent, + process_type=Process.TypeChoices.WORKER, + status=Process.StatusChoices.RUNNING, + pid=2, + started_at=timezone.now(), + ) + + assert child.parent == parent + assert child in parent.children.all() + + def test_process_root(self): + """Process.root should return the root of the hierarchy.""" + root = Process.objects.create( + machine=self.machine, + process_type=Process.TypeChoices.CLI, + status=Process.StatusChoices.RUNNING, + started_at=timezone.now(), + ) + child = Process.objects.create( + machine=self.machine, + parent=root, + status=Process.StatusChoices.RUNNING, + started_at=timezone.now(), + ) + grandchild = Process.objects.create( + machine=self.machine, + parent=child, + status=Process.StatusChoices.RUNNING, + started_at=timezone.now(), + ) + + assert grandchild.root == root + assert child.root == root + assert root.root == root + + def test_process_depth(self): + """Process.depth should return depth in tree.""" + root = Process.objects.create( + machine=self.machine, + status=Process.StatusChoices.RUNNING, + started_at=timezone.now(), + ) + child = Process.objects.create( + machine=self.machine, + parent=root, + status=Process.StatusChoices.RUNNING, + started_at=timezone.now(), + ) + + assert root.depth == 0 + assert child.depth == 1 + + +class TestProcessLifecycle: + """Test Process lifecycle methods.""" + + @pytest.fixture(autouse=True) + def setup_machine(self, machine): + self.machine = machine + + def 
test_process_is_running_current_pid(self): + """is_running should be True for current PID.""" + import psutil + from datetime import datetime + + proc_start = datetime.fromtimestamp(psutil.Process(os.getpid()).create_time(), tz=timezone.get_current_timezone()) + proc = Process.objects.create( + machine=self.machine, + status=Process.StatusChoices.RUNNING, + pid=os.getpid(), + started_at=proc_start, + ) + + assert proc.is_running + + def test_process_is_running_fake_pid(self): + """is_running should be False for non-existent PID.""" + proc = Process.objects.create( + machine=self.machine, + status=Process.StatusChoices.RUNNING, + pid=999999, + started_at=timezone.now(), + ) + + assert not proc.is_running + + def test_process_poll_detects_exit(self): + """poll() should detect exited process.""" + proc = Process.objects.create( + machine=self.machine, + status=Process.StatusChoices.RUNNING, + pid=999999, + started_at=timezone.now(), + ) + + exit_code = proc.poll() + + assert exit_code is not None + proc.refresh_from_db() + assert proc.status == Process.StatusChoices.EXITED + + def test_process_poll_normalizes_negative_exit_code(self): + """poll() should normalize -1 exit codes to 137.""" + proc = Process.objects.create( + machine=self.machine, + status=Process.StatusChoices.EXITED, + pid=999999, + exit_code=-1, + started_at=timezone.now(), + ) + + exit_code = proc.poll() + + assert exit_code == 137 + proc.refresh_from_db() + assert proc.exit_code == 137 + + def test_process_terminate_dead_process(self): + """terminate() should handle already-dead process.""" + proc = Process.objects.create( + machine=self.machine, + status=Process.StatusChoices.RUNNING, + pid=999999, + started_at=timezone.now(), + ) + + result = proc.terminate() + + assert not result + proc.refresh_from_db() + assert proc.status == Process.StatusChoices.EXITED + + +class TestProcessClassMethods: + """Test Process class methods for querying.""" + + @pytest.fixture(autouse=True) + def 
setup_machine(self, machine): + self.machine = machine + + def test_get_running(self): + """get_running should return running processes.""" + proc = Process.objects.create( + machine=self.machine, + process_type=Process.TypeChoices.HOOK, + status=Process.StatusChoices.RUNNING, + pid=99999, + started_at=timezone.now(), + ) + + running = Process.get_running(process_type=Process.TypeChoices.HOOK) + + assert proc in running + + def test_get_running_count(self): + """get_running_count should count running processes.""" + for i in range(3): + Process.objects.create( + machine=self.machine, + process_type=Process.TypeChoices.HOOK, + status=Process.StatusChoices.RUNNING, + pid=99900 + i, + started_at=timezone.now(), + ) + + count = Process.get_running_count(process_type=Process.TypeChoices.HOOK) + assert count >= 3 + + def test_cleanup_stale_running(self): + """cleanup_stale_running should mark stale processes as exited.""" + stale = Process.objects.create( + machine=self.machine, + status=Process.StatusChoices.RUNNING, + pid=999999, + started_at=timezone.now() - PID_REUSE_WINDOW - timedelta(hours=1), + ) + + cleaned = Process.cleanup_stale_running() + + assert cleaned >= 1 + stale.refresh_from_db() + assert stale.status == Process.StatusChoices.EXITED + + def test_cleanup_stale_running_marks_timed_out_rows_exited(self): + """cleanup_stale_running should retire RUNNING rows that exceed timeout + grace.""" + stale = Process.objects.create( + machine=self.machine, + status=Process.StatusChoices.RUNNING, + pid=999998, + timeout=5, + started_at=timezone.now() - PROCESS_TIMEOUT_GRACE - timedelta(seconds=10), + ) + + cleaned = Process.cleanup_stale_running() + + assert cleaned >= 1 + stale.refresh_from_db() + assert stale.status == Process.StatusChoices.EXITED + + def test_cleanup_stale_running_marks_timed_out_live_hooks_exited(self): + """Timed-out live hook rows should be retired in the DB without trying to kill the process.""" + stale = Process.objects.create( + 
machine=self.machine, + process_type=Process.TypeChoices.HOOK, + status=Process.StatusChoices.RUNNING, + pid=os.getpid(), + timeout=5, + started_at=timezone.now() - PROCESS_TIMEOUT_GRACE - timedelta(seconds=10), + ) + + cleaned = Process.cleanup_stale_running() + + assert cleaned >= 1 + stale.refresh_from_db() + assert stale.status == Process.StatusChoices.EXITED + + def test_cleanup_orphaned_workers_marks_dead_root_children_exited(self): + """cleanup_orphaned_workers should retire rows whose CLI/orchestrator root is gone.""" + import psutil + from datetime import datetime + + started_at = datetime.fromtimestamp(psutil.Process(os.getpid()).create_time(), tz=timezone.get_current_timezone()) + parent = Process.objects.create( + machine=self.machine, + process_type=Process.TypeChoices.CLI, + status=Process.StatusChoices.RUNNING, + pid=999997, + started_at=timezone.now() - timedelta(minutes=5), + ) + child = Process.objects.create( + machine=self.machine, + parent=parent, + process_type=Process.TypeChoices.HOOK, + status=Process.StatusChoices.RUNNING, + pid=os.getpid(), + started_at=started_at, + ) + + cleaned = Process.cleanup_orphaned_workers() + + assert cleaned == 1 + child.refresh_from_db() + assert child.status == Process.StatusChoices.EXITED + + def test_cleanup_orphaned_workers_marks_non_running_children_exited(self): + """cleanup_orphaned_workers should retire child rows whose OS process is already gone.""" + child = Process.objects.create( + machine=self.machine, + process_type=Process.TypeChoices.HOOK, + status=Process.StatusChoices.RUNNING, + pid=999997, + started_at=timezone.now() - timedelta(minutes=5), + ) + + cleaned = Process.cleanup_orphaned_workers() + + assert cleaned == 1 + child.refresh_from_db() + assert child.status == Process.StatusChoices.EXITED + assert child.ended_at is not None + assert child.exit_code == 143 + + +class TestProcessStateMachine: + """Test the ProcessMachine state machine.""" + + @pytest.fixture(autouse=True) + def 
setup_process(self, process): + self.process = process + + def test_process_state_machine_initial_state(self): + """ProcessMachine should start in queued state.""" + sm = ProcessMachine(self.process) + assert sm.current_state_value == Process.StatusChoices.QUEUED + + def test_process_state_machine_can_start(self): + """ProcessMachine.can_start() should check cmd and machine.""" + sm = ProcessMachine(self.process) + assert sm.can_start() + + self.process.cmd = [] + self.process.save() + sm = ProcessMachine(self.process) + assert not sm.can_start() + + def test_process_state_machine_is_exited(self): + """ProcessMachine.is_exited() should check exit_code.""" + sm = ProcessMachine(self.process) + assert not sm.is_exited() + + self.process.exit_code = 0 + self.process.save() + sm = ProcessMachine(self.process) + assert sm.is_exited() + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/archivebox/tests/test_machine_service.py b/archivebox/tests/test_machine_service.py new file mode 100644 index 0000000000..9c64e758ec --- /dev/null +++ b/archivebox/tests/test_machine_service.py @@ -0,0 +1,165 @@ +import os +import shutil +import textwrap +from pathlib import Path + +import pytest + +from archivebox.machine.models import Binary, Machine, Process +from archivebox.tests.conftest import run_archivebox_cmd +from archivebox.tests.test_orm_helpers import use_archivebox_db + +pytestmark = pytest.mark.django_db(transaction=True) + + +def _link_real_tool(bin_dir: Path, name: str) -> Path: + bin_dir.mkdir(parents=True, exist_ok=True) + source_path = shutil.which(name) + assert source_path, f"{name} must be installed for this integration test" + link = bin_dir / name + link.unlink(missing_ok=True) + link.symlink_to(source_path) + return link + + +def _write_tool_shim(bin_dir: Path, name: str, version: str) -> Path: + bin_dir.mkdir(parents=True, exist_ok=True) + shim = bin_dir / name + shim.write_text(f"#!/bin/sh\nprintf '%s\\n' '{name} {version}'\n", 
encoding="utf-8") + shim.chmod(0o755) + return shim + + +def _runtime_env(data_dir: Path, bin_dir: Path) -> dict[str, str]: + path_entries = [ + str(bin_dir), + str(data_dir / "lib" / "env" / "bin"), + os.environ.get("PATH", ""), + ] + return { + "LIB_DIR": str(data_dir / "lib"), + "ABXPKG_LIB_DIR": str(data_dir / "lib"), + "LITEPARSE_ENABLED": "True", + "TIMEOUT": "180", + "ABXPKG_INSTALL_TIMEOUT": "180", + "PATH": os.pathsep.join(entry for entry in path_entries if entry), + } + + +def test_install_persists_machine_binary_config_and_recovers_stale_path(initialized_archive, tmp_path): + bootstrap_bin_dir = tmp_path / "realbin" + provider_bin_dir = initialized_archive / "lib" / "env" / "bin" + _link_real_tool(bootstrap_bin_dir, "uv") + _write_tool_shim(provider_bin_dir, "lit", "2.5.9") + _link_real_tool(provider_bin_dir, "node") + + _cmd_result = run_archivebox_cmd( + ["install", "--binproviders=env", "liteparse"], + cwd=initialized_archive, + timeout=240, + env=_runtime_env(initialized_archive, bootstrap_bin_dir), + ) + stdout, stderr, returncode = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + + assert returncode == 0, stdout + stderr + assert "liteparse" in stdout + + with use_archivebox_db(initialized_archive): + liteparse_binary = Binary.objects.get(name="lit") + machine = Machine.objects.get(pk=liteparse_binary.machine_id) + machine.config = {} + machine.save(update_fields=["config"]) + machine_id = machine.id + binaries = list(Binary.objects.filter(status=Binary.StatusChoices.INSTALLED).order_by("name")) + process = Process.objects.filter(process_type=Process.TypeChoices.BINARY).latest("created_at") + + installed_liteparse_path = Path(liteparse_binary.abspath) + assert installed_liteparse_path.exists() + assert installed_liteparse_path.is_relative_to(initialized_archive / "lib") + assert binaries + assert process.status == Process.StatusChoices.EXITED + assert process.exit_code == 0 + + external_tool = Path(shutil.which("node") or "") + 
assert external_tool.exists() + machine_event_script = textwrap.dedent( + f""" + import asyncio + + from abx_dl.events import MachineEvent + from abx_dl.orchestrator import create_bus + from archivebox.services.machine_service import MachineService + + async def main(): + bus = create_bus(name="machine_service_e2e") + try: + MachineService(bus) + await bus.emit(MachineEvent(config={{ + "LITEPARSE_BINARY": "/tmp/user-config-must-not-persist", + "CHROME_USER_DATA_DIR": "/tmp/profile", + }}, config_type="user")).now() + await bus.emit(MachineEvent(config={{ + "LITEPARSE_BINARY": {str(installed_liteparse_path)!r}, + "NODE_BINARY": {str(external_tool)!r}, + "ABX_INSTALL_CACHE": {{"lit": "cached"}}, + "ABX_UV_CACHE": "/tmp/uv-cache", + "CHROME_USER_DATA_DIR": "/tmp/derived-profile", + }}, config_type="derived")).now() + await bus.emit(MachineEvent(method="unset", key="config/LITEPARSE_BINARY", config_type="derived")).now() + await bus.emit(MachineEvent( + method="update", + key="config/LITEPARSE_BINARY", + value={str(installed_liteparse_path)!r}, + config_type="derived", + )).now() + await bus.wait_until_idle() + finally: + await bus.destroy() + + asyncio.run(main()) + print("MACHINE_SERVICE_E2E_DONE") + """, + ) + _cmd_result = run_archivebox_cmd( + ["shell", "-c", machine_event_script], + cwd=initialized_archive, + timeout=60, + env=_runtime_env(initialized_archive, bootstrap_bin_dir), + ) + shell_stdout, shell_stderr, shell_code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + assert shell_code == 0, shell_stdout + shell_stderr + assert "MACHINE_SERVICE_E2E_DONE" in shell_stdout + + with use_archivebox_db(initialized_archive): + machine = Machine.objects.get(pk=machine_id) + + assert machine.config["LITEPARSE_BINARY"] == str(installed_liteparse_path) + assert machine.config["LITEPARSE_BINARY"] != "/tmp/user-config-must-not-persist" + assert machine.config["ABX_INSTALL_CACHE"] == {"lit": "cached"} + assert machine.config["ABX_UV_CACHE"] == 
"/tmp/uv-cache" + + _cmd_result = run_archivebox_cmd( + ["version"], + cwd=initialized_archive, + timeout=60, + env=_runtime_env(initialized_archive, bootstrap_bin_dir), + ) + version_stdout, version_stderr, version_code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + assert version_code == 0, version_stderr + assert "lit" in version_stdout + + installed_liteparse_path.unlink() + + _cmd_result = run_archivebox_cmd( + ["version"], + cwd=initialized_archive, + timeout=60, + env=_runtime_env(initialized_archive, bootstrap_bin_dir), + ) + cleanup_stdout, cleanup_stderr, cleanup_code = _cmd_result.stdout, _cmd_result.stderr, _cmd_result.returncode + assert cleanup_code == 0, cleanup_stdout + cleanup_stderr + + with use_archivebox_db(initialized_archive): + cleaned_machine_config = Machine.objects.get(pk=machine_id).config or {} + + assert "LITEPARSE_BINARY" not in cleaned_machine_config diff --git a/archivebox/tests/test_migrations_04_to_09.py b/archivebox/tests/test_migrations_04_to_09.py new file mode 100644 index 0000000000..cada377463 --- /dev/null +++ b/archivebox/tests/test_migrations_04_to_09.py @@ -0,0 +1,189 @@ +#!/usr/bin/env python3 +""" +Migration tests from 0.4.x to 0.9.x. 
+ +0.4.x was the first Django-powered version with a simpler schema: +- No Tag model (tags stored as comma-separated string in Snapshot) +- No ArchiveResult model (results stored in JSON files) +""" + +import sqlite3 + +import pytest + +from .migrations_helpers import ( + SCHEMA_0_4, + create_data_dir_structure, + run_archivebox_migration_cmd, + seed_0_4_data, + verify_snapshot_count, + verify_snapshot_urls, + verify_tag_count, +) + + +@pytest.fixture +def archive_04(tmp_path): + """Create a temporary directory with 0.4.x schema and data.""" + db_path = tmp_path / "index.sqlite3" + + # Create directory structure + create_data_dir_structure(tmp_path) + + # Create database with 0.4.x schema + conn = sqlite3.connect(str(db_path)) + conn.executescript(SCHEMA_0_4) + conn.close() + + # Seed with test data + original_data = seed_0_4_data(db_path) + + return tmp_path, db_path, original_data + + +def test_migration_preserves_snapshot_count(archive_04): + """Migration should preserve all snapshots from 0.4.x.""" + work_dir, db_path, original_data = archive_04 + expected_count = len(original_data["snapshots"]) + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + ok, msg = verify_snapshot_count(db_path, expected_count) + assert ok, msg + + +def test_migration_preserves_snapshot_urls(archive_04): + """Migration should preserve all snapshot URLs from 0.4.x.""" + work_dir, db_path, original_data = archive_04 + expected_urls = [s["url"] for s in original_data["snapshots"]] + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + ok, msg = verify_snapshot_urls(db_path, expected_urls) + assert ok, msg + + +def test_migration_converts_string_tags_to_model(archive_04): + """Migration should convert comma-separated tags to Tag model instances.""" + work_dir, db_path, original_data = archive_04 + + result = 
run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + # Collect unique tags from original data + original_tags = set() + for tags_str in original_data["tags_str"]: + if tags_str: + for tag in tags_str.split(","): + original_tags.add(tag.strip()) + + # Tags should have been created + ok, msg = verify_tag_count(db_path, len(original_tags)) + assert ok, msg + + +def test_migration_preserves_snapshot_titles(archive_04): + """Migration should preserve all snapshot titles.""" + work_dir, db_path, original_data = archive_04 + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + cursor.execute("SELECT url, title FROM core_snapshot") + actual = {row[0]: row[1] for row in cursor.fetchall()} + conn.close() + + for snapshot in original_data["snapshots"]: + assert actual.get(snapshot["url"]) == snapshot["title"], f"Title mismatch for {snapshot['url']}" + + +def test_status_works_after_migration(archive_04): + """Status command should work after migration.""" + work_dir, _db_path, _original_data = archive_04 + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + result = run_archivebox_migration_cmd(work_dir, ["status"]) + assert result.returncode == 0, f"Status failed after migration: {result.stderr}" + + +def test_list_works_after_migration(archive_04): + """List command should work and show ALL migrated snapshots.""" + work_dir, _db_path, original_data = archive_04 + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + result = run_archivebox_migration_cmd(work_dir, ["list"]) + assert result.returncode == 0, f"List failed after migration: {result.stderr}" + + # Verify ALL 
snapshots appear in output + output = result.stdout + result.stderr + for snapshot in original_data["snapshots"]: + url_fragment = snapshot["url"][:30] + assert url_fragment in output, f"Snapshot {snapshot['url']} not found in list output" + + +def test_add_works_after_migration(archive_04): + """Adding new URLs should work after migration from 0.4.x.""" + work_dir, db_path, _original_data = archive_04 + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + # Try to add a new URL after migration + result = run_archivebox_migration_cmd(work_dir, ["add", "--index-only", "https://example.com/new-page"], timeout=45) + assert result.returncode == 0, f"Add failed after migration: {result.stderr}" + result = run_archivebox_migration_cmd(work_dir, ["run"], timeout=90) + assert result.returncode == 0, f"Run failed after migration: {result.stderr}" + + # Verify add queued the new crawl after migration. + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + cursor.execute("SELECT COUNT(*) FROM crawls_crawl WHERE urls LIKE '%example.com/new-page%'") + count = cursor.fetchone()[0] + conn.close() + + assert count == 1, "New crawl was not created after migration" + + +def test_new_schema_elements_created(archive_04): + """Migration should create new 0.9.x schema elements.""" + work_dir, db_path, _original_data = archive_04 + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + cursor.execute("SELECT name FROM sqlite_master WHERE type='table'") + tables = {row[0] for row in cursor.fetchall()} + conn.close() + + # New tables should exist + assert "crawls_crawl" in tables, "crawls_crawl table not created" + assert "core_tag" in tables, "core_tag table not created" + assert "core_archiveresult" in tables, "core_archiveresult table not 
created" + + +def test_snapshots_have_new_fields(archive_04): + """Migrated snapshots should have new 0.9.x fields.""" + work_dir, db_path, _original_data = archive_04 + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + cursor.execute("PRAGMA table_info(core_snapshot)") + columns = {row[1] for row in cursor.fetchall()} + conn.close() + + required_columns = {"status", "depth", "created_at", "modified_at"} + for col in required_columns: + assert col in columns, f"Snapshot missing new column: {col}" diff --git a/archivebox/tests/test_migrations_07_to_09.py b/archivebox/tests/test_migrations_07_to_09.py new file mode 100644 index 0000000000..f3afa70f4d --- /dev/null +++ b/archivebox/tests/test_migrations_07_to_09.py @@ -0,0 +1,425 @@ +#!/usr/bin/env python3 +""" +Migration tests from 0.7.x to 0.9.x. + +0.7.x schema includes: +- Tag model with ManyToMany to Snapshot +- ArchiveResult model with ForeignKey to Snapshot +- AutoField primary keys +""" + +import sqlite3 + +import pytest + +from .migrations_helpers import ( + SCHEMA_0_7, + create_data_dir_structure, + run_archivebox_migration_cmd, + seed_0_7_data, + verify_all_snapshots_in_output, + verify_archiveresult_count, + verify_foreign_keys, + verify_snapshot_count, + verify_snapshot_titles, + verify_snapshot_urls, + verify_tag_count, +) + + +@pytest.fixture +def archive_07(tmp_path): + """Create a temporary directory with 0.7.x schema and data.""" + db_path = tmp_path / "index.sqlite3" + + # Create directory structure + create_data_dir_structure(tmp_path) + + # Create database with 0.7.x schema + conn = sqlite3.connect(str(db_path)) + conn.executescript(SCHEMA_0_7) + conn.close() + + # Seed with test data + original_data = seed_0_7_data(db_path) + + return tmp_path, db_path, original_data + + +def test_migration_preserves_snapshot_count(archive_07): + """Migration 
should preserve all snapshots.""" + work_dir, db_path, original_data = archive_07 + expected_count = len(original_data["snapshots"]) + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + ok, msg = verify_snapshot_count(db_path, expected_count) + assert ok, msg + + +def test_migration_preserves_snapshot_urls(archive_07): + """Migration should preserve all snapshot URLs.""" + work_dir, db_path, original_data = archive_07 + expected_urls = [s["url"] for s in original_data["snapshots"]] + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + ok, msg = verify_snapshot_urls(db_path, expected_urls) + assert ok, msg + + +def test_migration_preserves_snapshot_titles(archive_07): + """Migration should preserve all snapshot titles.""" + work_dir, db_path, original_data = archive_07 + expected_titles = {s["url"]: s["title"] for s in original_data["snapshots"]} + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + ok, msg = verify_snapshot_titles(db_path, expected_titles) + assert ok, msg + + +def test_migration_preserves_tags(archive_07): + """Migration should preserve all tags.""" + work_dir, db_path, original_data = archive_07 + expected_count = len(original_data["tags"]) + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + ok, msg = verify_tag_count(db_path, expected_count) + assert ok, msg + + +def test_migration_preserves_archiveresults(archive_07): + """Migration should preserve ArchiveResult rows and link each one to a Process.""" + work_dir, db_path, original_data = archive_07 + expected_count = len(original_data["archiveresults"]) + expected_counts = {} + for result in original_data["archiveresults"]: + key = 
(result["extractor"], result["status"]) + expected_counts[key] = expected_counts.get(key, 0) + 1 + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + ok, msg = verify_archiveresult_count(db_path, expected_count) + assert ok, msg + + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + cursor.execute("SELECT plugin, status, COUNT(*) FROM core_archiveresult GROUP BY plugin, status") + migrated_counts = {(plugin, status): count for plugin, status, count in cursor.fetchall()} + cursor.execute("SELECT COUNT(*) FROM core_archiveresult WHERE process_id IS NULL") + missing_process_count = cursor.fetchone()[0] + cursor.execute("SELECT COUNT(*) FROM machine_process") + process_count = cursor.fetchone()[0] + conn.close() + + assert migrated_counts == expected_counts + assert missing_process_count == 0 + assert process_count == expected_count + + +def test_migration_preserves_foreign_keys(archive_07): + """Migration should maintain foreign key relationships.""" + work_dir, db_path, _original_data = archive_07 + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + ok, msg = verify_foreign_keys(db_path) + assert ok, msg + + +def test_migration_preserves_legacy_timestamp_meanings(archive_07): + """0.7.x timestamp is bookmark identity; added is row creation; updated is downloaded.""" + work_dir, db_path, original_data = archive_07 + snapshot = original_data["snapshots"][0] + legacy_bookmark_ts = "1609459200.123456" + legacy_added = "2024-08-28 09:40:00" + legacy_updated = "2024-08-29 10:41:00" + + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + cursor.execute( + """ + UPDATE core_snapshot + SET timestamp = ?, added = ?, updated = ? + WHERE id = ? 
+ """, + (legacy_bookmark_ts, legacy_added, legacy_updated, snapshot["id"]), + ) + conn.commit() + conn.close() + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + cursor.execute( + "SELECT timestamp, bookmarked_at, created_at, modified_at, downloaded_at FROM core_snapshot WHERE id = ?", + (snapshot["id"],), + ) + timestamp, bookmarked_at, created_at, modified_at, downloaded_at = cursor.fetchone() + conn.close() + + assert timestamp == legacy_bookmark_ts + assert bookmarked_at.startswith("2021-01-01"), bookmarked_at + assert created_at.startswith("2024-08-28"), created_at + assert modified_at.startswith("2024-08-29"), modified_at + assert downloaded_at.startswith("2024-08-29"), downloaded_at + + +def test_update_saves_migrated_snapshots_without_foreign_key_errors(archive_07): + """Migrated 0.7.x snapshots should be writable through the current ORM.""" + work_dir, _db_path, _original_data = archive_07 + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + result = run_archivebox_migration_cmd(work_dir, ["update"], timeout=60) + output = result.stdout + result.stderr + assert result.returncode == 0, f"Update failed after migration: {result.stderr}" + assert "FOREIGN KEY constraint failed" not in output + assert "Skipping snapshot" not in output + + +def test_status_works_after_migration(archive_07): + """Status command should work after migration.""" + work_dir, _db_path, _original_data = archive_07 + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + result = run_archivebox_migration_cmd(work_dir, ["status"]) + assert result.returncode == 0, f"Status failed after migration: {result.stderr}" + + +def 
test_search_works_after_migration(archive_07): + """Search command should find ALL migrated snapshots.""" + work_dir, _db_path, original_data = archive_07 + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + result = run_archivebox_migration_cmd(work_dir, ["search"]) + assert result.returncode == 0, f"Search failed after migration: {result.stderr}" + + # Verify ALL snapshots appear in output + output = result.stdout + result.stderr + ok, msg = verify_all_snapshots_in_output(output, original_data["snapshots"]) + assert ok, msg + + +def test_list_works_after_migration(archive_07): + """List command should work and show ALL migrated data.""" + work_dir, _db_path, original_data = archive_07 + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + result = run_archivebox_migration_cmd(work_dir, ["snapshot", "list"]) + assert result.returncode == 0, f"List failed after migration: {result.stderr}" + + # Verify ALL snapshots appear in output + output = result.stdout + result.stderr + ok, msg = verify_all_snapshots_in_output(output, original_data["snapshots"]) + assert ok, msg + + +def test_new_schema_elements_created_after_migration(archive_07): + """Migration should create new 0.9.x schema elements (crawls_crawl, etc.).""" + work_dir, db_path, _original_data = archive_07 + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + + # Check that new tables exist + cursor.execute("SELECT name FROM sqlite_master WHERE type='table'") + tables = {row[0] for row in cursor.fetchall()} + conn.close() + + # 0.9.x should have crawls_crawl table + assert "crawls_crawl" in tables, "crawls_crawl table not created during migration" + + +def 
test_snapshots_have_new_fields_after_migration(archive_07): + """Migrated snapshots should have new 0.9.x fields (status, depth, etc.).""" + work_dir, db_path, _original_data = archive_07 + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + + # Check snapshot table has new columns + cursor.execute("PRAGMA table_info(core_snapshot)") + columns = {row[1] for row in cursor.fetchall()} + conn.close() + + # 0.9.x snapshots should have status, depth, created_at, modified_at + required_new_columns = {"status", "depth", "created_at", "modified_at"} + for col in required_new_columns: + assert col in columns, f"Snapshot missing new column: {col}" + + +def test_add_works_after_migration(archive_07): + """Adding new URLs should work after migration from 0.7.x.""" + work_dir, db_path, _original_data = archive_07 + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + # Verify that init created the crawls_crawl table before proceeding + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='crawls_crawl'") + table_exists = cursor.fetchone() is not None + conn.close() + assert table_exists, f"Init failed to create crawls_crawl table. 
Init stderr: {result.stderr[-500:]}" + + # Try to add a new URL after migration (use --index-only for speed) + result = run_archivebox_migration_cmd(work_dir, ["add", "--index-only", "https://example.com/new-page"], timeout=45) + assert result.returncode == 0, f"Add failed after migration: {result.stderr}" + + # Verify a Crawl was created for the new URL + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + cursor.execute("SELECT COUNT(*) FROM crawls_crawl") + crawl_count = cursor.fetchone()[0] + conn.close() + + assert crawl_count >= 1, f"No Crawl created when adding URL. Add stderr: {result.stderr[-500:]}" + + +def test_archiveresult_status_preserved_after_migration(archive_07): + """Migration should preserve archive result status values.""" + work_dir, db_path, _original_data = archive_07 + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + + # Get status counts + cursor.execute("SELECT status, COUNT(*) FROM core_archiveresult GROUP BY status") + status_counts = dict(cursor.fetchall()) + conn.close() + + # Original data has known status distribution: succeeded, failed, skipped + assert "succeeded" in status_counts, "Should have succeeded results" + assert "failed" in status_counts, "Should have failed results" + assert "skipped" in status_counts, "Should have skipped results" + + +def test_version_works_after_migration(archive_07): + """Version command should work after migration.""" + work_dir, _db_path, _original_data = archive_07 + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + result = run_archivebox_migration_cmd(work_dir, ["version"]) + assert result.returncode == 0, f"Version failed after migration: {result.stderr}" + + # Should show version info + output = result.stdout + result.stderr + assert 
"ArchiveBox" in output or "version" in output.lower(), f"Version output missing expected content: {output[:500]}" + + +def test_help_works_after_migration(archive_07): + """Help command should work after migration.""" + work_dir, _db_path, _original_data = archive_07 + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + result = run_archivebox_migration_cmd(work_dir, ["help"]) + assert result.returncode == 0, f"Help failed after migration: {result.stderr}" + + # Should show available commands + output = result.stdout + result.stderr + assert "add" in output.lower() and "status" in output.lower(), f"Help output missing expected commands: {output[:500]}" + + +def test_no_duplicate_snapshots_after_migration(archive_07): + """Migration should not create duplicate snapshots.""" + work_dir, db_path, _original_data = archive_07 + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + # Check for duplicate URLs + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + cursor.execute(""" + SELECT url, COUNT(*) as cnt FROM core_snapshot + GROUP BY url HAVING cnt > 1 + """) + duplicates = cursor.fetchall() + conn.close() + + assert len(duplicates) == 0, f"Found duplicate URLs: {duplicates}" + + +def test_no_orphaned_archiveresults_after_migration(archive_07): + """Migration should not leave orphaned ArchiveResults.""" + work_dir, db_path, _original_data = archive_07 + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + ok, msg = verify_foreign_keys(db_path) + assert ok, msg + + +def test_timestamps_preserved_after_migration(archive_07): + """Migration should preserve original timestamps.""" + work_dir, db_path, original_data = archive_07 + original_timestamps = {s["url"]: s["timestamp"] for s in 
original_data["snapshots"]} + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + cursor.execute("SELECT url, timestamp FROM core_snapshot") + migrated_timestamps = {row[0]: row[1] for row in cursor.fetchall()} + conn.close() + + for url, original_ts in original_timestamps.items(): + assert migrated_timestamps.get(url) == original_ts, f"Timestamp changed for {url}: {original_ts} -> {migrated_timestamps.get(url)}" + + +def test_tag_associations_preserved_after_migration(archive_07): + """Migration should preserve snapshot-tag associations.""" + work_dir, db_path, _original_data = archive_07 + + # Count tag associations before migration + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + cursor.execute("SELECT COUNT(*) FROM core_snapshot_tags") + original_count = cursor.fetchone()[0] + conn.close() + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + # Count tag associations after migration + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + cursor.execute("SELECT COUNT(*) FROM core_snapshot_tags") + migrated_count = cursor.fetchone()[0] + conn.close() + + assert migrated_count == original_count, f"Tag associations changed: {original_count} -> {migrated_count}" diff --git a/archivebox/tests/test_migrations_08_to_09.py b/archivebox/tests/test_migrations_08_to_09.py new file mode 100644 index 0000000000..9d156bdae8 --- /dev/null +++ b/archivebox/tests/test_migrations_08_to_09.py @@ -0,0 +1,988 @@ +#!/usr/bin/env python3 +""" +Migration tests from 0.8.x to 0.9.x. + +0.8.x introduced: +- Crawl model for grouping URLs +- Seed model (removed in 0.9.x) +- UUID primary keys for Snapshot +- Status fields for state machine +- New fields like depth, retry_at, etc. 
+""" + +import sqlite3 +import json +import uuid + +import pytest + +from .migrations_helpers import ( + SCHEMA_0_7, + SCHEMA_0_8, + seed_0_8_data, + seed_0_7_data, + run_archivebox_migration_cmd, + create_data_dir_structure, + verify_snapshot_count, + verify_snapshot_urls, + verify_snapshot_titles, + verify_tag_count, + verify_archiveresult_count, + verify_foreign_keys, + verify_all_snapshots_in_output, + verify_crawl_count, + verify_process_migration, +) + + +@pytest.fixture +def migration_08_data(tmp_path): + """Create a temporary directory with 0.8.x schema and data.""" + work_dir = tmp_path + db_path = work_dir / "index.sqlite3" + + create_data_dir_structure(work_dir) + + conn = sqlite3.connect(str(db_path)) + conn.executescript(SCHEMA_0_8) + conn.close() + + original_data = seed_0_8_data(db_path) + return work_dir, db_path, original_data + + +def test_migration_preserves_snapshot_count(migration_08_data): + """Migration should preserve all snapshots from 0.8.x.""" + work_dir, db_path, original_data = migration_08_data + expected_count = len(original_data["snapshots"]) + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + ok, msg = verify_snapshot_count(db_path, expected_count) + assert ok, msg + + +def test_migration_preserves_snapshot_urls(migration_08_data): + """Migration should preserve all snapshot URLs from 0.8.x.""" + work_dir, db_path, original_data = migration_08_data + expected_urls = [s["url"] for s in original_data["snapshots"]] + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + ok, msg = verify_snapshot_urls(db_path, expected_urls) + assert ok, msg + + +def test_migration_preserves_crawls(migration_08_data): + """Migration should preserve all Crawl records and create default crawl if needed.""" + work_dir, db_path, original_data = migration_08_data + result = 
run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + # Count snapshots with NULL crawl_id in original data + snapshots_without_crawl = sum(1 for s in original_data["snapshots"] if s["crawl_id"] is None) + + # Expected count: original crawls + 1 default crawl if any snapshots had NULL crawl_id + expected_count = len(original_data["crawls"]) + if snapshots_without_crawl > 0: + expected_count += 1 # Migration 0024 creates a default crawl + + ok, msg = verify_crawl_count(db_path, expected_count) + assert ok, msg + + +def test_migration_preserves_snapshot_crawl_links(migration_08_data): + """Migration should preserve snapshot-to-crawl relationships and assign default crawl to orphans.""" + work_dir, db_path, original_data = migration_08_data + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + + # Check EVERY snapshot has a crawl_id after migration + for snapshot in original_data["snapshots"]: + cursor.execute("SELECT crawl_id FROM core_snapshot WHERE url = ?", (snapshot["url"],)) + row = cursor.fetchone() + assert row is not None, f"Snapshot {snapshot['url']} not found after migration" + + if snapshot["crawl_id"] is not None: + # Snapshots that had a crawl should keep it + assert row[0] == snapshot["crawl_id"], f"Crawl ID changed for {snapshot['url']}: expected {snapshot['crawl_id']}, got {row[0]}" + else: + # Snapshots without a crawl should now have one (the default crawl) + assert row[0] is not None, f"Snapshot {snapshot['url']} should have been assigned to default crawl but has NULL" + + conn.close() + + +def test_migration_preserves_tags(migration_08_data): + """Migration should preserve all tags.""" + work_dir, db_path, original_data = migration_08_data + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert 
result.returncode == 0, f"Init failed: {result.stderr}" + + ok, msg = verify_tag_count(db_path, len(original_data["tags"])) + assert ok, msg + + +def test_migration_preserves_archiveresults(migration_08_data): + """Migration should preserve ArchiveResult rows and link each one to a Process.""" + work_dir, db_path, original_data = migration_08_data + expected_count = len(original_data["archiveresults"]) + expected_counts = {} + for result in original_data["archiveresults"]: + status = "succeeded" if result["status"] == "success" else result["status"] + key = (result["extractor"], status) + expected_counts[key] = expected_counts.get(key, 0) + 1 + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + ok, msg = verify_archiveresult_count(db_path, expected_count) + assert ok, msg + + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + cursor.execute("SELECT plugin, status, COUNT(*) FROM core_archiveresult GROUP BY plugin, status") + migrated_counts = {(plugin, status): count for plugin, status, count in cursor.fetchall()} + cursor.execute("SELECT COUNT(*) FROM core_archiveresult WHERE process_id IS NULL") + missing_process_count = cursor.fetchone()[0] + cursor.execute("SELECT COUNT(*) FROM machine_process") + process_count = cursor.fetchone()[0] + conn.close() + + assert migrated_counts == expected_counts + assert missing_process_count == 0 + assert process_count == expected_count + + +def test_migration_preserves_archiveresult_status(migration_08_data): + """Migration should preserve archive result status values.""" + work_dir, db_path, original_data = migration_08_data + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + + # Get status counts + cursor.execute("SELECT status, COUNT(*) FROM core_archiveresult GROUP BY 
status") + status_counts = dict(cursor.fetchall()) + conn.close() + + # Original data has known status distribution: succeeded, failed, skipped + assert "succeeded" in status_counts, "Should have succeeded results" + assert "failed" in status_counts, "Should have failed results" + assert "skipped" in status_counts, "Should have skipped results" + + +def test_status_works_after_migration(migration_08_data): + """Status command should work after migration.""" + work_dir, db_path, original_data = migration_08_data + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + result = run_archivebox_migration_cmd(work_dir, ["status"]) + assert result.returncode == 0, f"Status failed after migration: {result.stderr}" + + +def test_list_works_after_migration(migration_08_data): + """List command should work and show ALL migrated data.""" + work_dir, db_path, original_data = migration_08_data + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + result = run_archivebox_migration_cmd(work_dir, ["snapshot", "list"]) + assert result.returncode == 0, f"List failed after migration: {result.stderr}" + + # Verify ALL snapshots appear in output + output = result.stdout + result.stderr + ok, msg = verify_all_snapshots_in_output(output, original_data["snapshots"]) + assert ok, msg + + +def test_search_works_after_migration(migration_08_data): + """Search command should find ALL migrated snapshots.""" + work_dir, db_path, original_data = migration_08_data + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + result = run_archivebox_migration_cmd(work_dir, ["search"]) + assert result.returncode == 0, f"Search failed after migration: {result.stderr}" + + # Verify ALL snapshots appear in output + output = result.stdout + result.stderr + 
ok, msg = verify_all_snapshots_in_output(output, original_data["snapshots"]) + assert ok, msg + + +def test_migration_preserves_snapshot_titles(migration_08_data): + """Migration should preserve all snapshot titles.""" + work_dir, db_path, original_data = migration_08_data + expected_titles = {s["url"]: s["title"] for s in original_data["snapshots"]} + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + ok, msg = verify_snapshot_titles(db_path, expected_titles) + assert ok, msg + + +def test_migration_preserves_foreign_keys(migration_08_data): + """Migration should maintain foreign key relationships.""" + work_dir, db_path, original_data = migration_08_data + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + ok, msg = verify_foreign_keys(db_path) + assert ok, msg + + +def test_migration_preserves_08_timestamp_meanings(migration_08_data): + """0.8.x already has separated timestamp/bookmarked_at/created_at/downloaded_at fields.""" + work_dir, db_path, original_data = migration_08_data + snapshot = original_data["snapshots"][0] + legacy_timestamp = "1609459200.123456" + bookmarked_at = "2021-01-01 00:00:00" + created_at = "2024-08-28 09:40:00" + modified_at = "2024-08-29 10:41:00" + downloaded_at = "2024-08-30 11:42:00" + + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + cursor.execute( + """ + UPDATE core_snapshot + SET timestamp = ?, bookmarked_at = ?, created_at = ?, modified_at = ?, downloaded_at = ? + WHERE id = ? 
+ """, + (legacy_timestamp, bookmarked_at, created_at, modified_at, downloaded_at, snapshot["id"]), + ) + conn.commit() + conn.close() + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + cursor.execute( + "SELECT timestamp, bookmarked_at, created_at, modified_at, downloaded_at FROM core_snapshot WHERE id = ?", + (snapshot["id"],), + ) + migrated = cursor.fetchone() + conn.close() + + assert migrated[0] == legacy_timestamp + assert migrated[1].startswith("2021-01-01"), migrated[1] + assert migrated[2].startswith("2024-08-28"), migrated[2] + assert migrated[3].startswith("2024-08-29"), migrated[3] + assert migrated[4].startswith("2024-08-30"), migrated[4] + + +def test_hyphenated_crawl_ids_are_normalized_before_snapshot_saves(migration_08_data): + """0.8.x crawl UUIDs with dashes should migrate to Django's SQLite UUID format.""" + work_dir, db_path, original_data = migration_08_data + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + for crawl in original_data["crawls"]: + hyphenated = str(uuid.UUID(hex=crawl["id"])) + cursor.execute("UPDATE crawls_crawl SET id = ? WHERE id = ?", (hyphenated, crawl["id"])) + cursor.execute("UPDATE core_snapshot SET crawl_id = ? 
WHERE crawl_id = ?", (hyphenated, crawl["id"])) + crawl["id"] = hyphenated + conn.commit() + conn.close() + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + cursor.execute("SELECT COUNT(*) FROM crawls_crawl WHERE id LIKE '%-%'") + hyphenated_crawls = cursor.fetchone()[0] + cursor.execute("SELECT COUNT(*) FROM core_snapshot WHERE crawl_id LIKE '%-%'") + hyphenated_snapshot_refs = cursor.fetchone()[0] + conn.close() + + assert hyphenated_crawls == 0 + assert hyphenated_snapshot_refs == 0 + + result = run_archivebox_migration_cmd(work_dir, ["update"], timeout=60) + output = result.stdout + result.stderr + assert result.returncode == 0, f"Update failed after migration: {result.stderr}" + assert "FOREIGN KEY constraint failed" not in output + + +def test_migration_removes_seed_id_column(migration_08_data): + """Migration should remove seed_id column from archivebox.crawls.crawl.""" + work_dir, db_path, original_data = migration_08_data + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + cursor.execute("PRAGMA table_info(crawls_crawl)") + columns = [row[1] for row in cursor.fetchall()] + conn.close() + + assert "seed_id" not in columns, f"seed_id column should have been removed by migration. 
Columns: {columns}" + + +def test_migration_removes_seed_table(migration_08_data): + """Migration should remove crawls_seed table.""" + work_dir, db_path, original_data = migration_08_data + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='crawls_seed'") + table_exists = cursor.fetchone() is not None + conn.close() + + assert not table_exists, "crawls_seed table should have been removed by migration" + + +def test_add_works_after_migration(migration_08_data): + """Adding new URLs should work after migration from 0.8.x.""" + work_dir, db_path, original_data = migration_08_data + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + # Check that init actually ran and applied migrations + assert "Applying" in result.stdout + result.stderr, ( + f"Init did not apply migrations. stdout: {result.stdout[:500]}, stderr: {result.stderr[:500]}" + ) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + # Count existing crawls + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + cursor.execute("SELECT COUNT(*) FROM crawls_crawl") + initial_crawl_count = cursor.fetchone()[0] + conn.close() + + # Try to add a new URL after migration (use --index-only for speed) + result = run_archivebox_migration_cmd(work_dir, ["add", "--index-only", "https://example.com/new-page"], timeout=45) + assert result.returncode == 0, f"Add failed after migration: {result.stderr}" + + # Verify a new Crawl was created + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + cursor.execute("SELECT COUNT(*) FROM crawls_crawl") + new_crawl_count = cursor.fetchone()[0] + conn.close() + + assert new_crawl_count > initial_crawl_count, f"No new Crawl created when adding URL. 
Add stderr: {result.stderr[-500:]}" + + +def test_version_works_after_migration(migration_08_data): + """Version command should work after migration.""" + work_dir, db_path, original_data = migration_08_data + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + result = run_archivebox_migration_cmd(work_dir, ["version"]) + assert result.returncode == 0, f"Version failed after migration: {result.stderr}" + + # Should show version info + output = result.stdout + result.stderr + assert "ArchiveBox" in output or "version" in output.lower(), f"Version output missing expected content: {output[:500]}" + + +def test_migration_creates_process_records(migration_08_data): + """Migration should create Process records for all ArchiveResults.""" + work_dir, db_path, original_data = migration_08_data + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + # Verify Process records created + expected_count = len(original_data["archiveresults"]) + ok, msg = verify_process_migration(db_path, expected_count) + assert ok, msg + + +def test_migration_creates_binary_records(migration_08_data): + """Migration should create and link Binary/NetworkInterface records from migrated Process data.""" + work_dir, db_path, original_data = migration_08_data + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + + # Check Binary records exist + cursor.execute("SELECT COUNT(*) FROM machine_binary") + binary_count = cursor.fetchone()[0] + + # Should have at least one binary per unique extractor + extractors = {ar["extractor"] for ar in original_data["archiveresults"]} + assert binary_count >= len(extractors), f"Expected at least {len(extractors)} Binaries, got {binary_count}" + + 
cursor.execute(""" + SELECT COUNT(*) + FROM machine_process + WHERE cmd != '[]' AND binary_id IS NULL + """) + missing_binary_count = cursor.fetchone()[0] + assert missing_binary_count == 0 + + cursor.execute(""" + SELECT p.cmd, b.name, b.abspath + FROM machine_process p + JOIN machine_binary b ON p.binary_id = b.id + WHERE p.cmd != '[]' + """) + rows = cursor.fetchall() + assert rows + for cmd_raw, binary_name, binary_abspath in rows: + cmd = json.loads(cmd_raw) + assert binary_name == cmd[0] + assert binary_abspath == cmd[0] + + cursor.execute("SELECT COUNT(*) FROM machine_process WHERE iface_id IS NULL") + missing_iface_count = cursor.fetchone()[0] + assert missing_iface_count == 0 + + conn.close() + + +def test_migration_preserves_cmd_data(migration_08_data): + """Migration should preserve cmd data in Process.cmd field.""" + work_dir, db_path, original_data = migration_08_data + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + + # Check that Process records have cmd arrays + cursor.execute("SELECT cmd FROM machine_process WHERE cmd != '[]'") + cmd_records = cursor.fetchall() + + # All Processes should have non-empty cmd (test data has json.dumps([extractor, '--version'])) + expected_count = len(original_data["archiveresults"]) + assert len(cmd_records) == expected_count, f"Expected {expected_count} Processes with cmd, got {len(cmd_records)}" + + conn.close() + + +def test_no_duplicate_snapshots_after_migration(migration_08_data): + """Migration should not create duplicate snapshots.""" + work_dir, db_path, original_data = migration_08_data + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + # Check for duplicate URLs + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + cursor.execute(""" + SELECT url, COUNT(*) 
as cnt FROM core_snapshot + GROUP BY url HAVING cnt > 1 + """) + duplicates = cursor.fetchall() + conn.close() + + assert len(duplicates) == 0, f"Found duplicate URLs: {duplicates}" + + +def test_no_orphaned_archiveresults_after_migration(migration_08_data): + """Migration should not leave orphaned ArchiveResults.""" + work_dir, db_path, original_data = migration_08_data + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + ok, msg = verify_foreign_keys(db_path) + assert ok, msg + + +def test_timestamps_preserved_after_migration(migration_08_data): + """Migration should preserve original timestamps.""" + work_dir, db_path, original_data = migration_08_data + original_timestamps = {s["url"]: s["timestamp"] for s in original_data["snapshots"]} + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + cursor.execute("SELECT url, timestamp FROM core_snapshot") + migrated_timestamps = {row[0]: row[1] for row in cursor.fetchall()} + conn.close() + + for url, original_ts in original_timestamps.items(): + assert migrated_timestamps.get(url) == original_ts, f"Timestamp changed for {url}: {original_ts} -> {migrated_timestamps.get(url)}" + + +def test_crawl_data_preserved_after_migration(migration_08_data): + """Migration should preserve crawl metadata (urls, label, status).""" + work_dir, db_path, original_data = migration_08_data + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + + # Check each crawl's data is preserved + for crawl in original_data["crawls"]: + cursor.execute("SELECT urls, label FROM crawls_crawl WHERE id = ?", (crawl["id"],)) + row = cursor.fetchone() + assert row is not 
None, f"Crawl {crawl['id']} not found after migration" + assert row[0] == crawl["urls"], f"URLs mismatch for crawl {crawl['id']}" + assert row[1] == crawl["label"], f"Label mismatch for crawl {crawl['id']}" + + conn.close() + + +def test_tag_associations_preserved_after_migration(migration_08_data): + """Migration should preserve snapshot-tag associations.""" + work_dir, db_path, original_data = migration_08_data + # Count tag associations before migration + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + cursor.execute("SELECT COUNT(*) FROM core_snapshot_tags") + original_count = cursor.fetchone()[0] + conn.close() + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=45) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + # Count tag associations after migration + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + cursor.execute("SELECT COUNT(*) FROM core_snapshot_tags") + migrated_count = cursor.fetchone()[0] + conn.close() + + assert migrated_count == original_count, f"Tag associations changed: {original_count} -> {migrated_count}" + + +def test_update_migrates_db_snapshot_when_legacy_index_missing(tmp_path): + """A legacy folder with no index file should still migrate if its timestamp exists in DB.""" + work_dir = tmp_path + db_path = work_dir / "index.sqlite3" + create_data_dir_structure(work_dir) + conn = sqlite3.connect(str(db_path)) + conn.executescript(SCHEMA_0_7) + conn.close() + original_data = seed_0_7_data(db_path) + snapshot = original_data["snapshots"][0] + + snapshot_dir = work_dir / "archive" / snapshot["timestamp"] + snapshot_dir.mkdir(parents=True, exist_ok=True) + (snapshot_dir / "screenshot.png").write_text("existing-db-snapshot") + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=60) + assert result.returncode == 0, f"Init failed: {result.stderr}" + result = run_archivebox_migration_cmd(work_dir, ["update"], timeout=120) + assert result.returncode == 0, f"Update 
failed: {result.stderr}" + + migrated_files = list((work_dir / "archive" / "users").glob("*/snapshots/*/*/*/screenshot.png")) + assert len(migrated_files) == 1 + assert migrated_files[0].read_text() == "existing-db-snapshot" + assert not (work_dir / "invalid").exists() + + +def test_update_recovers_orphan_with_corrupt_index_from_archive_org_url(tmp_path): + """A corrupt legacy index can be imported when archive.org.txt has the original URL.""" + work_dir = tmp_path + db_path = work_dir / "index.sqlite3" + create_data_dir_structure(work_dir) + conn = sqlite3.connect(str(db_path)) + conn.executescript(SCHEMA_0_7) + conn.close() + seed_0_7_data(db_path) + + timestamp = "1339747993" + original_url = "http://www.wired.com/wiredenterprise/2012/01/seamicro-and-google/all/1" + snapshot_dir = work_dir / "archive" / timestamp + snapshot_dir.mkdir(parents=True, exist_ok=True) + (snapshot_dir / "index.json").write_text("") + (snapshot_dir / "archive.org.txt").write_text(f"https://web.archive.org/web/20170531210128/{original_url}\n") + (snapshot_dir / "output.pdf").write_text("orphan-output") + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=60) + assert result.returncode == 0, f"Init failed: {result.stderr}" + result = run_archivebox_migration_cmd(work_dir, ["update"], timeout=120) + assert result.returncode == 0, f"Update failed: {result.stderr}" + + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + cursor.execute("SELECT url, timestamp FROM core_snapshot WHERE timestamp = ?", (timestamp,)) + row = cursor.fetchone() + conn.close() + + assert row == (original_url, timestamp) + migrated_files = list((work_dir / "archive" / "users").glob("*/snapshots/*/*/*/output.pdf")) + assert len(migrated_files) == 1 + assert migrated_files[0].read_text() == "orphan-output" + assert not (work_dir / "invalid").exists() + + +def test_update_preserves_legacy_folder_timestamp_over_index_float_variant(tmp_path): + """Legacy folder timestamp is the on-disk identity 
even if index.json has a .0 variant.""" + work_dir = tmp_path + db_path = work_dir / "index.sqlite3" + create_data_dir_structure(work_dir) + conn = sqlite3.connect(str(db_path)) + conn.executescript(SCHEMA_0_7) + conn.close() + seed_0_7_data(db_path) + + timestamp = "1508259732" + url = "https://example.com/folder-timestamp" + snapshot_dir = work_dir / "archive" / timestamp + snapshot_dir.mkdir(parents=True, exist_ok=True) + (snapshot_dir / "index.json").write_text( + json.dumps( + { + "url": url, + "timestamp": "1508259732.0", + "title": "Folder Timestamp", + }, + ), + ) + (snapshot_dir / "output.html").write_text("folder timestamp output") + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=60) + assert result.returncode == 0, f"Init failed: {result.stderr}" + result = run_archivebox_migration_cmd(work_dir, ["update"], timeout=120) + assert result.returncode == 0, f"Update failed: {result.stderr}" + + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + cursor.execute("SELECT timestamp FROM core_snapshot WHERE url = ?", (url,)) + row = cursor.fetchone() + conn.close() + + assert row == (timestamp,) + assert (work_dir / "archive" / timestamp).is_symlink() + assert not (work_dir / "archive" / f"{timestamp}.0").exists() + assert not (work_dir / "invalid").exists() + + +def test_update_preserves_distinct_legacy_dirs_with_integer_and_float_timestamps(tmp_path): + """Sibling legacy dirs like 1508259732 and 1508259732.0 must not fuzzy-merge.""" + work_dir = tmp_path + db_path = work_dir / "index.sqlite3" + create_data_dir_structure(work_dir) + conn = sqlite3.connect(str(db_path)) + conn.executescript(SCHEMA_0_7) + conn.close() + seed_0_7_data(db_path) + + url = "https://example.com/duplicate-timestamp" + for timestamp, payload in [("1508259732.0", "float-dir"), ("1508259732", "int-dir")]: + snapshot_dir = work_dir / "archive" / timestamp + snapshot_dir.mkdir(parents=True, exist_ok=True) + (snapshot_dir / "index.json").write_text( + 
json.dumps( + { + "url": url, + "timestamp": timestamp, + "title": payload, + }, + ), + ) + (snapshot_dir / f"{payload}.txt").write_text(payload) + + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=60) + assert result.returncode == 0, f"Init failed: {result.stderr}" + result = run_archivebox_migration_cmd(work_dir, ["update"], timeout=120) + assert result.returncode == 0, f"Update failed: {result.stderr}" + + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + cursor.execute("SELECT timestamp FROM core_snapshot WHERE url = ? ORDER BY timestamp", (url,)) + rows = cursor.fetchall() + conn.close() + + assert rows == [("1508259732",), ("1508259732.0",)] + assert (work_dir / "archive" / "1508259732").is_symlink() + assert (work_dir / "archive" / "1508259732.0").is_symlink() + assert not (work_dir / "invalid").exists() + + +def test_archiveresult_files_preserved_after_migration(tmp_path): + """ + Test that ArchiveResult output files are reorganized into new structure. + + This test verifies that: + 1. Migration preserves ArchiveResult data in Process/Binary records + 2. Running `archivebox update` reorganizes files into new structure + 3. New structure: archive/users/username/snapshots/YYYYMMDD/example.com/snap-uuid-here/output.ext + 4. All files are moved (no data loss) + 5. Old archive/timestamp/ directories are cleaned up + """ + work_dir = tmp_path + db_path = work_dir / "index.sqlite3" + create_data_dir_structure(work_dir) + conn = sqlite3.connect(str(db_path)) + conn.executescript(SCHEMA_0_7) + conn.close() + original_data = seed_0_7_data(db_path) + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + for i, snapshot in enumerate(original_data["snapshots"]): + legacy_timestamp = "1609459200.123456" if i == 0 else str(1704110400 + (i * 86400)) + cursor.execute( + "UPDATE core_snapshot SET timestamp = ? WHERE id = ?", + (legacy_timestamp, snapshot["id"]), + ) + cursor.execute( + "UPDATE core_archiveresult SET pwd = ? 
WHERE snapshot_id = ?", + (f"/data/archive/{legacy_timestamp}", snapshot["id"]), + ) + snapshot["timestamp"] = legacy_timestamp + conn.commit() + conn.close() + + sample_files = [ + "favicon.ico", + "screenshot.png", + "singlefile.html", + "headers.json", + ] + for snapshot in original_data["snapshots"]: + snapshot_dir = work_dir / "archive" / snapshot["timestamp"] + snapshot_dir.mkdir(parents=True, exist_ok=True) + (snapshot_dir / "index.json").write_text( + json.dumps( + { + "url": snapshot["url"], + "timestamp": snapshot["timestamp"], + "title": snapshot["title"], + }, + ), + ) + for sample_file in sample_files: + (snapshot_dir / sample_file).write_text(f"{snapshot['url']}::{sample_file}") + + # Count archive directories and files BEFORE migration + archive_dir = work_dir / "archive" + dirs_before = [d for d in archive_dir.glob("*") if d.name.replace(".", "").isdigit()] if archive_dir.exists() else [] + dirs_before_count = len([d for d in dirs_before if d.is_dir()]) + + # Count total files in all archive directories + files_before = [] + for d in dirs_before: + if d.is_dir(): + files_before.extend([f for f in d.rglob("*") if f.is_file()]) + files_before_count = len(files_before) + generated_metadata_names = {"index.html", "index.json", "index.jsonl"} + generated_search_backends = {"search_backend_sqlite", "search_backend_sonic"} + + def is_generated_file(path) -> bool: + return path.name in generated_metadata_names or any(part in generated_search_backends for part in path.parts) + + original_payloads = sorted(path.read_text() for path in files_before if not is_generated_file(path)) + + # Sample some specific files to check they're preserved + sample_paths_before = {} + for d in dirs_before: + if d.is_dir(): + for sample_file in sample_files: + matching = list(d.glob(sample_file)) + if matching: + sample_paths_before[f"{d.name}/{sample_file}"] = matching[0] + + print(f"\n[*] Archive directories before migration: {dirs_before_count}") + print(f"[*] Total files 
before migration: {files_before_count}") + print(f"[*] Sample files found: {len(sample_paths_before)}") + + # Run init to trigger migration + result = run_archivebox_migration_cmd(work_dir, ["init"], timeout=60) + assert result.returncode == 0, f"Init (migration) failed: {result.stderr}" + + # Count archive directories and files AFTER migration + dirs_after = [d for d in archive_dir.glob("*") if d.name.replace(".", "").isdigit()] if archive_dir.exists() else [] + dirs_after_count = len([d for d in dirs_after if d.is_dir()]) + + files_after = [] + for d in dirs_after: + if d.is_dir(): + files_after.extend([f for f in d.rglob("*") if f.is_file()]) + files_after_count = len(files_after) + + # Verify sample files still exist + sample_paths_after = {} + for d in dirs_after: + if d.is_dir(): + for sample_file in sample_files: + matching = list(d.glob(sample_file)) + if matching: + sample_paths_after[f"{d.name}/{sample_file}"] = matching[0] + + print(f"[*] Archive directories after migration: {dirs_after_count}") + print(f"[*] Total files after migration: {files_after_count}") + print(f"[*] Sample files found: {len(sample_paths_after)}") + + # Verify files still in old structure after migration (not moved yet) + assert dirs_before_count == dirs_after_count, f"Archive directories lost during migration: {dirs_before_count} -> {dirs_after_count}" + assert files_before_count == files_after_count, f"Files lost during migration: {files_before_count} -> {files_after_count}" + + # Run update to trigger filesystem reorganization + print("\n[*] Running archivebox update to reorganize filesystem...") + result = run_archivebox_migration_cmd(work_dir, ["update"], timeout=120) + assert result.returncode == 0, f"Update failed: {result.stderr}" + + # Check new filesystem structure + # New structure: archive/users/username/snapshots/YYYYMMDD/example.com/snap-uuid-here/output.ext + users_dir = work_dir / "archive" / "users" + snapshots_base = None + + if users_dir.exists(): + # Find the 
snapshots directory + for user_dir in users_dir.iterdir(): + if user_dir.is_dir(): + user_snapshots = user_dir / "snapshots" + if user_snapshots.exists(): + snapshots_base = user_snapshots + break + + print(f"[*] New structure base: {snapshots_base}") + + # Count files in new structure + # Structure: archive/users/{username}/snapshots/YYYYMMDD/{domain}/{uuid}/files... + files_new_structure = [] + new_sample_files = {} + + if snapshots_base and snapshots_base.exists(): + for date_dir in snapshots_base.iterdir(): + if date_dir.is_dir(): + for domain_dir in date_dir.iterdir(): + if domain_dir.is_dir(): + for snap_dir in domain_dir.iterdir(): + if snap_dir.is_dir(): + # Files are directly in snap-uuid/ directory (no plugin subdirs) + for f in snap_dir.rglob("*"): + if f.is_file(): + files_new_structure.append(f) + # Track sample files + if f.name in sample_files: + new_sample_files[f"{snap_dir.name}/{f.name}"] = f + + files_new_count = len(files_new_structure) + print(f"[*] Files in new structure: {files_new_count}") + print(f"[*] Sample files in new structure: {len(new_sample_files)}") + + migrated_2021_files = list(users_dir.glob("*/snapshots/20210101/*/*/favicon.ico")) + assert len(migrated_2021_files) > 0, "Legacy snapshot should be bucketed by normalized bookmarked_at, not created_at/import time" + + crawl_snapshot_links = list(users_dir.glob("*/crawls/*/*/*/snapshots/*/*")) + crawl_snapshot_symlinks = [path for path in crawl_snapshot_links if path.is_symlink()] + crawl_dirs = list(users_dir.glob("*/crawls/*/*/*")) + print(f"[*] Crawl snapshot symlinks: {len(crawl_snapshot_symlinks)}") + + # Check old structure (should be gone or empty) + old_archive_dir = work_dir / "archive" + old_files_remaining = [] + unmigrated_dirs = [] + if old_archive_dir.exists(): + for d in old_archive_dir.glob("*"): + # Only count REAL directories, not symlinks (symlinks are the migrated ones) + if d.is_dir(follow_symlinks=False) and d.name.replace(".", "").isdigit(): + # This is a 
timestamp directory (old structure) + files_in_dir = [f for f in d.rglob("*") if f.is_file()] + if files_in_dir: + unmigrated_dirs.append((d.name, len(files_in_dir))) + old_files_remaining.extend(files_in_dir) + + old_files_count = len(old_files_remaining) + print(f"[*] Files remaining in old structure: {old_files_count}") + if unmigrated_dirs: + print(f"[*] Unmigrated directories: {unmigrated_dirs}") + + # CRITICAL: Verify files were moved to new structure + assert files_new_count > 0, "No files found in new structure after update" + + assert len(crawl_snapshot_symlinks) > 0, "No crawl snapshot symlinks created for migrated snapshots" + + assert not any((crawl_dir / "index.jsonl").exists() for crawl_dir in crawl_dirs), ( + "Migrated crawl dirs should match normal 0.9 crawl dirs and not add crawl index.jsonl files" + ) + + # CRITICAL: Verify old structure is cleaned up + assert old_files_count == 0, f"Old structure not cleaned up: {old_files_count} files still in archive/timestamp/ directories" + + # CRITICAL: Verify all original payload files were moved. The 0.9 lazy + # maintenance pass also writes fresh index.jsonl/index.html metadata from + # the hydrated DB row, so raw file counts are allowed to increase; compare + # the legacy payload contents after excluding those generated metadata + # files to keep the no-data-loss assertion strict. 
+ migrated_payloads = sorted(path.read_text() for path in [*files_new_structure, *old_files_remaining] if not is_generated_file(path)) + assert original_payloads == migrated_payloads, "Legacy payload files changed or were lost during reorganization" + assert files_new_count >= files_before_count, "New 0.9 metadata should not replace legacy payload files" + + # CRITICAL: Verify sample files exist in new structure + assert len(new_sample_files) > 0, "Sample files not found in new structure" + + # Verify new path format + for path_key, file_path in new_sample_files.items(): + # Path should contain: snapshots/YYYYMMDD/domain/snap-uuid/plugin/file + path_parts = file_path.parts + assert "snapshots" in path_parts, f"New path should contain 'snapshots': {file_path}" + assert "users" in path_parts, f"New path should contain 'users': {file_path}" + print(f" โœ“ {path_key} โ†’ {file_path.relative_to(work_dir)}") + + # Verify Process and Binary records were created + conn = sqlite3.connect(str(db_path)) + cursor = conn.cursor() + + cursor.execute("SELECT COUNT(*) FROM core_archiveresult") + archiveresult_count = cursor.fetchone()[0] + + original_plugins = sorted({row["extractor"] for row in original_data["archiveresults"]}) + cursor.execute( + f"SELECT COUNT(*) FROM core_archiveresult WHERE plugin IN ({','.join('?' for _ in original_plugins)})", + original_plugins, + ) + legacy_archiveresult_count = cursor.fetchone()[0] + + cursor.execute("SELECT COUNT(*) FROM machine_process") + process_count = cursor.fetchone()[0] + + cursor.execute("SELECT COUNT(*) FROM machine_binary") + binary_count = cursor.fetchone()[0] + + cursor.execute("SELECT COUNT(*) FROM core_archiveresult WHERE process_id IS NOT NULL") + linked_count = cursor.fetchone()[0] + + cursor.execute( + f"SELECT COUNT(*) FROM core_archiveresult WHERE plugin IN ({','.join('?' 
for _ in original_plugins)}) AND process_id IS NOT NULL", + original_plugins, + ) + legacy_linked_count = cursor.fetchone()[0] + + conn.close() + + print(f"[*] ArchiveResults: {archiveresult_count}") + print(f"[*] Process records created: {process_count}") + print(f"[*] Binary records created: {binary_count}") + print(f"[*] ArchiveResults linked to Process: {linked_count}") + + # Verify data migration happened correctly. A full `archivebox update` may + # add new maintenance ArchiveResults (e.g. search index backfills), so keep + # the strict preservation assertion scoped to the legacy extractor plugins + # that came from the old DB rows. + assert archiveresult_count >= len(original_data["archiveresults"]), "Full update should not delete ArchiveResult rows" + assert legacy_archiveresult_count == len(original_data["archiveresults"]), ( + f"Expected {len(original_data['archiveresults'])} migrated legacy ArchiveResults, got {legacy_archiveresult_count}" + ) + + # Each legacy ArchiveResult should create one linked Process record. The + # command/worker rows created by `archivebox update` itself can increase the + # total process count, but they must not replace or orphan migrated process + # metadata. 
+ assert process_count >= len(original_data["archiveresults"]), ( + f"Expected at least {len(original_data['archiveresults'])} Process records, got {process_count}" + ) + + assert binary_count == 5, f"Expected 5 unique Binary records, got {binary_count}" + + # ALL legacy ArchiveResults should be linked to Process records + assert linked_count >= len(original_data["archiveresults"]), "Full update should not unlink migrated ArchiveResult processes" + assert legacy_linked_count == len(original_data["archiveresults"]), ( + f"Expected all {len(original_data['archiveresults'])} legacy ArchiveResults linked to Process, got {legacy_linked_count}" + ) diff --git a/archivebox/tests/test_migrations_fresh.py b/archivebox/tests/test_migrations_fresh.py new file mode 100644 index 0000000000..d3cacddaa9 --- /dev/null +++ b/archivebox/tests/test_migrations_fresh.py @@ -0,0 +1,185 @@ +#!/usr/bin/env python3 +""" +Fresh install tests for ArchiveBox. + +Tests that fresh installations work correctly with the current schema. 
+""" + +import pytest +from django.db.migrations.recorder import MigrationRecorder + +from archivebox.core.models import ArchiveResult, Snapshot, Tag +from archivebox.crawls.models import Crawl +from archivebox.tests.test_orm_helpers import use_archivebox_db +from archivebox.tests.conftest import run_queued_crawls, cli_env + +from .migrations_helpers import run_archivebox_migration_cmd + +pytestmark = pytest.mark.django_db(transaction=True) + + +def test_init_creates_database(tmp_path): + """Fresh init should create database and directories.""" + result = run_archivebox_migration_cmd(tmp_path, ["init"]) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + # Verify database was created + assert (tmp_path / "index.sqlite3").exists(), "Database not created" + # Verify archive directory exists + assert (tmp_path / "archive").is_dir(), "Archive dir not created" + + +def test_status_after_init(tmp_path): + """Status command should work after init.""" + result = run_archivebox_migration_cmd(tmp_path, ["init"]) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + result = run_archivebox_migration_cmd(tmp_path, ["status"]) + assert result.returncode == 0, f"Status failed: {result.stderr}" + + +def test_add_url_after_init(tmp_path): + """Should be able to add URLs after init with --index-only.""" + env = cli_env(disable_extractors=True) + result = run_archivebox_migration_cmd(tmp_path, ["init"]) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + # Add a URL with --index-only for speed + result = run_archivebox_migration_cmd(tmp_path, ["add", "--index-only", "https://example.com"]) + assert result.returncode == 0, f"Add command failed: {result.stderr}" + run_queued_crawls(tmp_path, env) + + with use_archivebox_db(tmp_path): + assert Crawl.objects.count() >= 1, "No Crawl was created" + assert Snapshot.objects.count() >= 1, "No Snapshot was created" + + +def test_list_after_add(tmp_path): + """List command should show added 
snapshots.""" + env = cli_env(disable_extractors=True) + result = run_archivebox_migration_cmd(tmp_path, ["init"]) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + result = run_archivebox_migration_cmd(tmp_path, ["add", "--index-only", "https://example.com"]) + assert result.returncode == 0, f"Add failed: {result.stderr}" + run_queued_crawls(tmp_path, env) + + result = run_archivebox_migration_cmd(tmp_path, ["list"]) + assert result.returncode == 0, f"List failed: {result.stderr}" + + # Verify the URL appears in output + output = result.stdout + result.stderr + assert "example.com" in output, f"Added URL not in list output: {output[:500]}" + + +def test_migrations_table_populated(tmp_path): + """Django migrations table should be populated after init.""" + result = run_archivebox_migration_cmd(tmp_path, ["init"]) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + with use_archivebox_db(tmp_path): + count = MigrationRecorder.Migration.objects.count() + + # Should have many migrations applied + assert count > 10, f"Expected >10 migrations, got {count}" + + +def test_core_migrations_applied(tmp_path): + """Core app migrations should be applied.""" + result = run_archivebox_migration_cmd(tmp_path, ["init"]) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + with use_archivebox_db(tmp_path): + migrations = list( + MigrationRecorder.Migration.objects.filter(app="core").order_by("name").values_list("name", flat=True), + ) + + assert "0001_initial" in migrations + + +def test_snapshot_table_has_required_columns(tmp_path): + """Snapshot table should have all required columns.""" + result = run_archivebox_migration_cmd(tmp_path, ["init"]) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + columns = {field.column for field in Snapshot._meta.local_fields} + + required = {"id", "url", "timestamp", "title", "status", "created_at", "modified_at"} + for col in required: + assert col in columns, f"Missing 
column: {col}" + + +def test_archiveresult_table_has_required_columns(tmp_path): + """ArchiveResult table should have all required columns.""" + result = run_archivebox_migration_cmd(tmp_path, ["init"]) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + columns = {field.column for field in ArchiveResult._meta.local_fields} + + required = {"id", "snapshot_id", "plugin", "status", "created_at", "modified_at"} + for col in required: + assert col in columns, f"Missing column: {col}" + + +def test_tag_table_has_required_columns(tmp_path): + """Tag table should have all required columns.""" + result = run_archivebox_migration_cmd(tmp_path, ["init"]) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + columns = {field.column for field in Tag._meta.local_fields} + + required = {"id", "name"} + for col in required: + assert col in columns, f"Missing column: {col}" + + +def test_crawl_table_has_required_columns(tmp_path): + """Crawl table should have all required columns.""" + result = run_archivebox_migration_cmd(tmp_path, ["init"]) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + columns = {field.column for field in Crawl._meta.local_fields} + + required = {"id", "urls", "status", "created_at", "created_by_id"} + for col in required: + assert col in columns, f"Missing column: {col}" + + # seed_id should NOT exist (removed in 0.9.x) + assert "seed_id" not in columns, "seed_id column should not exist in 0.9.x" + + +def test_add_urls_separately(tmp_path): + """Should be able to add multiple URLs one at a time.""" + env = cli_env(disable_extractors=True) + result = run_archivebox_migration_cmd(tmp_path, ["init"]) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + # Add URLs one at a time + result = run_archivebox_migration_cmd(tmp_path, ["add", "--index-only", "https://example.com"]) + assert result.returncode == 0, f"Add 1 failed: {result.stderr}" + + result = run_archivebox_migration_cmd(tmp_path, 
["add", "--index-only", "https://example.org"]) + assert result.returncode == 0, f"Add 2 failed: {result.stderr}" + run_queued_crawls(tmp_path, env) + + with use_archivebox_db(tmp_path): + snapshot_count = Snapshot.objects.count() + crawl_count = Crawl.objects.count() + assert snapshot_count == 2, f"Expected 2 snapshots, got {snapshot_count}" + assert crawl_count == 2, f"Expected 2 Crawls, got {crawl_count}" + + +def test_snapshots_linked_to_crawls(tmp_path): + """Each snapshot should be linked to a crawl.""" + env = cli_env(disable_extractors=True) + result = run_archivebox_migration_cmd(tmp_path, ["init"]) + assert result.returncode == 0, f"Init failed: {result.stderr}" + + result = run_archivebox_migration_cmd(tmp_path, ["add", "--index-only", "https://example.com"]) + assert result.returncode == 0, f"Add failed: {result.stderr}" + run_queued_crawls(tmp_path, env) + + with use_archivebox_db(tmp_path): + row = Snapshot.objects.filter(url="https://example.com").values_list("crawl_id", flat=True).first() + assert row is not None, "Snapshot not found" + assert row is not None, "Snapshot should have a crawl_id" diff --git a/archivebox/tests/test_misc_checks.py b/archivebox/tests/test_misc_checks.py new file mode 100644 index 0000000000..ecf7e47c20 --- /dev/null +++ b/archivebox/tests/test_misc_checks.py @@ -0,0 +1,57 @@ +import os +import signal + +import pytest + +from archivebox.core.shutdown_util import foreground_shutdown_signals +from archivebox.core.shutdown_util import raise_if_shutdown_requested +from archivebox.misc.checks import _migration_interrupt_message +from archivebox.misc.checks import _exit_on_migration_interrupt + + +def test_migration_interrupt_message_prints_resume_command_and_atomic_safety(): + message = _migration_interrupt_message() + + assert "Migration interrupted." 
in message + assert "Database migrations are atomic" in message + assert "no data loss has occurred" in message + assert "archivebox init" in message + + +def test_migration_interrupt_message_before_apply_says_no_changes_applied(): + message = _migration_interrupt_message(before_apply=True) + + assert "cancelled before any changes were applied" in message + assert "archivebox init" in message + + +def test_migration_interrupt_handler_exits_for_sigint_and_sigterm(monkeypatch): + def fake_exit(code): + raise SystemExit(code) + + monkeypatch.setattr("archivebox.misc.checks.os._exit", fake_exit) + + for sig in (signal.SIGINT, signal.SIGTERM): + previous_handler = signal.getsignal(sig) + try: + with _exit_on_migration_interrupt(): + assert signal.getsignal(sig) != previous_handler + os.kill(os.getpid(), sig) + except SystemExit as err: + assert err.code == 130 + else: + raise AssertionError(f"{sig.name} should exit during migration auto-apply") + assert signal.getsignal(sig) == previous_handler + + +def test_nested_foreground_signal_state_propagates_to_outer_context(): + with foreground_shutdown_signals(first_signal_message=None) as outer_state: + try: + with foreground_shutdown_signals(first_signal_message=None): + os.kill(os.getpid(), signal.SIGTERM) + except KeyboardInterrupt: + pass + + assert outer_state.signal_name == "SIGTERM" + with pytest.raises(KeyboardInterrupt): + raise_if_shutdown_requested() diff --git a/archivebox/tests/test_misc_monkey_patches.py b/archivebox/tests/test_misc_monkey_patches.py new file mode 100644 index 0000000000..3bd162080f --- /dev/null +++ b/archivebox/tests/test_misc_monkey_patches.py @@ -0,0 +1,21 @@ +import datetime +import io + + +def test_daphne_access_log_redacts_sensitive_query_params(): + from archivebox.misc.monkey_patches import ModifiedAccessLogGenerator + + stream = io.StringIO() + logger = ModifiedAccessLogGenerator(stream) + + logger.write_entry( + host="127.0.0.1:54321", + date=datetime.datetime(2026, 5, 29, 12, 0, 0), 
+ request="GET /api/v1/crawls/crawl/a1000000-0000-0000-0000-00000003cea2?api_key=d837c273f6e8f4950e706ebd67d95889&limit=1", + status=200, + ) + + output = stream.getvalue() + assert "api_key=[REDACTED]" in output + assert "d837c273f6e8f4950e706ebd67d95889" not in output + assert "limit=1" in output diff --git a/archivebox/tests/test_opencode_agent.py b/archivebox/tests/test_opencode_agent.py new file mode 100644 index 0000000000..8cc613b9f1 --- /dev/null +++ b/archivebox/tests/test_opencode_agent.py @@ -0,0 +1,272 @@ +import os +import socket +import subprocess +from pathlib import Path +from types import SimpleNamespace +from urllib.parse import quote + +import psutil +import pytest + +from archivebox.tests.conftest import ADMIN_TEST_HOST, run_archivebox_cmd + + +pytestmark = pytest.mark.django_db(transaction=True) + + +def _free_port() -> int: + with socket.socket() as sock: + sock.bind(("127.0.0.1", 0)) + return int(sock.getsockname()[1]) + + +def _reset_runtime_config() -> None: + from archivebox.config import common + from archivebox.config.configset import _INI_CACHE + from archivebox.machine.models import Machine + + _INI_CACHE.clear() + for value in vars(common).values(): + cache_clear = getattr(value, "cache_clear", None) + if cache_clear is not None: + cache_clear() + Machine.current(refresh=True) + + +def _set_archivebox_config(data_dir: Path, *values: str, env: dict[str, str] | None = None) -> None: + os.chdir(data_dir) + result = run_archivebox_cmd( + ["config", "--set", *values], + cwd=data_dir, + env=env, + timeout=120, + ) + assert result.returncode == 0, result.stderr or result.stdout + _reset_runtime_config() + + +@pytest.fixture +def opencode_archive_config(initialized_archive): + port = _free_port() + state_dir = initialized_archive / "opencode" + env = os.environ.copy() + env.update( + { + "ABXPKG_INSTALL_TIMEOUT": "900", + "ABXPKG_MIN_RELEASE_AGE": "0", + "ABX_RUNTIME": "archivebox", + "ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS": "true", + 
"OPENCODE_ENABLED": "True", + "OPENCODE_HOST": "127.0.0.1", + "OPENCODE_PORT": str(port), + "OPENCODE_WORKDIR": str(initialized_archive), + "OPENCODE_STATE_DIR": str(state_dir), + "OPENCODE_TIMEOUT": "60", + }, + ) + _set_archivebox_config( + initialized_archive, + "OPENCODE_ENABLED=True", + "OPENCODE_HOST=127.0.0.1", + f"OPENCODE_PORT={port}", + f"OPENCODE_WORKDIR={initialized_archive}", + f"OPENCODE_STATE_DIR={state_dir}", + "OPENCODE_TIMEOUT=60", + env=env, + ) + return SimpleNamespace(data_dir=initialized_archive, port=port, state_dir=state_dir, env=env) + + +@pytest.fixture +def live_opencode(opencode_archive_config): + from abx_plugins.plugins.opencode import views + + install = run_archivebox_cmd( + ["install", "opencode", "--binproviders=env,pnpm"], + cwd=opencode_archive_config.data_dir, + env=opencode_archive_config.env, + timeout=1200, + ) + assert install.returncode == 0, install.stderr or install.stdout + _reset_runtime_config() + + config = views._machine_config() + settings = views._settings(config) + settings["archivebox_base_url"] = "http://admin.archivebox.localhost:8000" + settings["archivebox_admin_url"] = "http://admin.archivebox.localhost:8000/admin" + settings["archivebox_api_url"] = "http://admin.archivebox.localhost:8000/api/" + binary, binary_env = views._resolve_binary(settings["binary"], settings["config"]) + version = subprocess.run( + [binary, "--version"], + env={**os.environ, **binary_env}, + text=True, + capture_output=True, + timeout=120, + ) + assert version.returncode == 0, version.stderr or version.stdout + ok, error = views._ensure_opencode(settings) + assert ok, error + + process = views._PROCESS + assert process is not None + try: + yield SimpleNamespace(config=opencode_archive_config, settings=settings, process=process) + finally: + if views._PROCESS and views._PROCESS.poll() is None: + views._PROCESS.terminate() + try: + views._PROCESS.wait(timeout=10) + except Exception: + views._PROCESS.kill() + views._PROCESS = None + + 
+def test_opencode_disabled_route_does_not_start_server(client, initialized_archive): + from archivebox.machine.models import Machine + from abx_plugins.plugins.opencode import views + + os.chdir(initialized_archive) + Machine.from_json({"config": {"OPENCODE_ENABLED": False}}) + _reset_runtime_config() + assert views._machine_config()["OPENCODE_ENABLED"] is False + + response = client.get("/admin/agent", HTTP_HOST=ADMIN_TEST_HOST) + + assert response.status_code == 404 + assert views._PROCESS is None or views._PROCESS.poll() is not None + + +def test_opencode_agent_requires_superuser_when_enabled(client, db, django_user_model, live_opencode): + response = client.get("/admin/agent", HTTP_HOST=ADMIN_TEST_HOST) + assert response.status_code == 302 + assert "/admin/login/" in response.headers["Location"] + + user = django_user_model.objects.create_user(username="regular", password="testpassword") + client.force_login(user) + response = client.get("/admin/agent", HTTP_HOST=ADMIN_TEST_HOST) + assert response.status_code == 403 + + +def test_opencode_proxy_blocks_cross_origin_mutation(admin_client, db, live_opencode): + response = admin_client.post( + "/admin/agent/opencode/session", + data=b"{}", + content_type="application/json", + HTTP_HOST=ADMIN_TEST_HOST, + HTTP_ORIGIN="https://evil.example", + ) + + assert response.status_code == 403 + + +def test_opencode_proxy_blocks_cross_site_fetch_metadata(admin_client, db, live_opencode): + response = admin_client.post( + "/admin/agent/opencode/session", + data=b"{}", + content_type="application/json", + HTTP_HOST=ADMIN_TEST_HOST, + HTTP_SEC_FETCH_SITE="cross-site", + ) + + assert response.status_code == 403 + + +def test_opencode_agent_superuser_gets_admin_wrapper(admin_client, live_opencode): + from abx_plugins.plugins.opencode import views + + response = admin_client.get("/admin/agent", HTTP_HOST=ADMIN_TEST_HOST) + + assert response.status_code == 200 + assert f'