diff --git a/.github/workflows/appdir-jekyll-update.yml b/.github/workflows/appdir-jekyll-update.yml deleted file mode 100644 index 902ea98..0000000 --- a/.github/workflows/appdir-jekyll-update.yml +++ /dev/null @@ -1,25 +0,0 @@ -name: "📦 Publish (app user manual)" - -on: - workflow_dispatch: - - push: - branches: ['main'] - paths: ['documentation/clamsapp.md'] - -jobs: - call-jekyll-build-deploy: - name: "🤙 Call AppDir Jekyll build-deploy workflow" - runs-on: ubuntu-latest - steps: - - uses: actions/github-script@v6 - with: - github-token: ${{ secrets.GH_CLAMSBOT_TOKEN }} - script: | - const result = await github.rest.repos.createDispatchEvent({ - owner: 'clamsproject', - repo: 'apps', - event_type: 'appdir-update', - client_payload: {} - }) - console.log(result); diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml index e9f12f3..68bee90 100644 --- a/.github/workflows/codecov.yml +++ b/.github/workflows/codecov.yml @@ -12,7 +12,7 @@ on: jobs: test-and-codecov: name: "🤙 Call SDK test workflow" - uses: clamsproject/.github/.github/workflows/sdk-codecov.yml@main + uses: clamsproject/.github/.github/workflows/sdk-codecov-pyproj.yml@main secrets: CC_REPO_UPLOAD_TOKEN: ${{ secrets.CODECOV_UPLOAD_TOKEN_CLAMS_PYTHON }} diff --git a/.github/workflows/container.yml b/.github/workflows/container.yml index 2296743..ad925a7 100644 --- a/.github/workflows/container.yml +++ b/.github/workflows/container.yml @@ -59,7 +59,7 @@ jobs: echo "slug=${slug}" >> $GITHUB_OUTPUT - name: "🛍️ Checkout repository" - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: ref: ${{ inputs.ref || inputs.version }} diff --git a/.github/workflows/containers.yml b/.github/workflows/containers.yml index 9241041..4cdbca9 100644 --- a/.github/workflows/containers.yml +++ b/.github/workflows/containers.yml @@ -39,7 +39,7 @@ jobs: - name: "⏱️ Wait up to 20 minutes for the new clams-python is deployed on PyPI" uses: nev7n/wait_for_response@v1 with: - url: 
"https://pypi.org/project/clams-python/${{ needs.set-version.outputs.version }}/" + url: "https://pypi.org/pypi/clams-python/${{ needs.set-version.outputs.version }}/json" responseCode: 200 timeout: 1200000 interval: 5000 diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 07868a8..28d21a4 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -36,7 +36,7 @@ jobs: name: "📦 Build and upload to PyPI" needs: check-pypi if: needs.check-pypi.outputs.exists == 'false' - uses: clamsproject/.github/.github/workflows/sdk-publish.yml@main + uses: clamsproject/.github/.github/workflows/sdk-publish-pyproj.yml@main secrets: inherit publish-docs: @@ -48,7 +48,4 @@ jobs: source_repo: clamsproject/clams-python source_ref: ${{ needs.check-pypi.outputs.version }} project_name: clams-python - build_command: 'python3 build-tools/docs.py --output-dir docs' - docs_output_dir: 'docs' - python_version: '3.11' secrets: inherit diff --git a/.gitignore b/.gitignore index 2582f0a..2618793 100644 --- a/.gitignore +++ b/.gitignore @@ -4,7 +4,6 @@ # build time temp files VERSION* -clams/ver # linux .*.sw? @@ -67,6 +66,7 @@ hs_err_pid* # virtual machine crash logs, see http://www.java.com/en/download/he build/ dist/ *.egg-info +clams_python-*/ coverage.xml # shared folders @@ -84,3 +84,5 @@ tags # sphinx docs-test/ +documentation/appmetadata.jsonschema +documentation/whatsnew.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..e8939f1 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,91 @@ +# Contributing to clams-python + +## Prerequisites + +- Python 3.10+ +- `gh` CLI (for changelog generation) + +## Setup + +```bash +pip install -e ".[dev]" +``` + +Unlike the old `setup.py`-based workflow, an editable install +(`pip install -e .`) is now required before running tests or building +docs. 
The package uses `importlib.metadata` for version resolution at +runtime, which only works when the package is registered in the +environment. You can no longer run `pytest` or `pytype` directly +against the source tree without installing first. If you want to avoid +pulling in all dependencies, `pip install -e . --no-deps` is sufficient +to register the package metadata. + +## Local Development + +All build tasks are handled by scripts in `build-tools/`. Each script +is self-contained and installs its own dependencies as needed. + +| Task | Command | +|------|---------| +| Build (sdist + wheel) | `python build-tools/build.py` | +| Run tests | `python build-tools/test.py` | +| Build docs | `python build-tools/docs.py` | +| Clean artifacts | `python build-tools/clean.py` | +| Publish | `python build-tools/publish.py` | + +All scripts support `--help` for full usage details. + +### Build + +```bash +python build-tools/build.py +``` + +Produces sdist and wheel in `dist/`. + +### Test + +```bash +python build-tools/test.py +``` + +Runs pytest with coverage. Use `--skip-install` if you already have the +package installed in editable mode. + +### Documentation + +```bash +python build-tools/docs.py +``` + +Builds Sphinx HTML docs into `docs-test/` (override with `--output-dir`). +The `--build-ver` flag is accepted for CI compatibility but has no effect +— clams-python uses unversioned documentation. + +### Versioning + +Versions are derived automatically from git tags via `setuptools-scm`. +There is no `VERSION` file to manage. At runtime, the version is +accessed through `importlib.metadata`: + +```python +from clams.ver import __version__ +``` + +For a dev install without a matching tag, `setuptools-scm` generates a +version like `1.4.1.dev20+gaf551a4e4.d20260325`. + +## Migration from Makefile + +The old `Makefile` and `setup.py` have been removed. 
If you are +accustomed to the old workflow, here is a mapping: + +| Old command | New equivalent | +|-------------|----------------| +| `make package` / `python setup.py sdist` | `python build-tools/build.py` | +| `make develop` / `python setup.py develop` | `pip install -e ".[dev]"` | +| `make test` | `python build-tools/test.py` | +| `make doc` | `python build-tools/docs.py` | +| `make version` / `make devversion` | Automatic via `setuptools-scm` (tag-based) | +| `make clean` | `python build-tools/clean.py` | +| `make publish` | `python build-tools/publish.py` | diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 44e8bd8..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,3 +0,0 @@ -include ./requirements.txt -include ./VERSION - diff --git a/Makefile b/Makefile deleted file mode 100644 index 5e53823..0000000 --- a/Makefile +++ /dev/null @@ -1,102 +0,0 @@ -# check for dependencies -SHELL := /bin/bash -deps = curl jq git python3 -check_deps := $(foreach dep,$(deps), $(if $(shell which $(dep)),some string,$(error "No $(dep) in PATH!"))) - -# constants -packagename = clams -generatedcode = $(packagename)/ver -distname = $(packagename)_python -artifact = build/lib/$(packagename) -buildcaches = build/bdist* $(distname).egg-info __pycache__ -testcaches = .hypothesis .pytest_cache .pytype coverage.xml htmlcov .coverage - -.PHONY: all -.PHONY: clean -.PHONY: test -.PHONY: develop -.PHONY: publish -.PHONY: docs -.PHONY: package -.PHONY: devversion - -all: version test build - -develop: devversion package test - python3 setup.py develop --uninstall - python3 setup.py develop - -publish: distclean version package test - test `git branch --show-current` = "master" - @git tag `cat VERSION` - @git push origin `cat VERSION` - -$(generatedcode): VERSION - # this will generate the version subpackage inside clams package - python3 setup.py --help 2>/dev/null || echo "Ignore setuptools import error for now" - ls $(generatedcode)* - -# generating jsonschema depends on 
mmif-python and pydantic -docs: - @echo "WARNING: The 'docs' target is deprecated and will be removed." - @echo "The 'docs' directory is no longer used. Documentation is now hosted in the central CLAMS documentation hub." - @echo "Use 'make doc' for local builds." - @echo "Nothing is done." - -doc: VERSION - python3 build-tools/docs.py - -package: VERSION - pip install --upgrade -r requirements.dev - python3 setup.py sdist - -build: $(artifact) -$(artifact): - python3 setup.py build - -# invoking `test` without a VERSION file will generated a dev version - this ensures `make test` runs unmanned -test: devversion $(generatedcode) - pip install --upgrade -r requirements.dev - pip install -r requirements.txt - pytype --config .pytype.cfg $(packagename) - python3 -m pytest --cov=$(packagename) --cov-report=xml - -# helper functions -e := -space := $(e) $(e) -## handling version numbers -macro = $(word 1,$(subst .,$(space),$(1))) -micro = $(word 2,$(subst .,$(space),$(1))) -patch = $(word 3,$(subst .,$(space),$(1))) -increase_patch = $(call macro,$(1)).$(call micro,$(1)).$$(($(call patch,$(1))+1)) -## handling versioning for dev version -add_dev = $(call macro,$(1)).$(call micro,$(1)).$(call patch,$(1)).dev1 -split_dev = $(word 2,$(subst .dev,$(space),$(1))) -increase_dev = $(call macro,$(1)).$(call micro,$(1)).$(call patch,$(1)).dev$$(($(call split_dev,$(1))+1)) - -devversion: VERSION.dev VERSION; cat VERSION -version: VERSION; cat VERSION - -VERSION.dev: devver := $(shell curl --silent "https://api.github.com/repos/clamsproject/clams-python/git/refs/tags" | grep '"ref":' | sed -E 's/.+refs\/tags\/([0-9.]+)",/\1/g' | sort | tail -n 1) -VERSION.dev: - @if [ -z "$(devver)" ]; then if [ -e VERSION ] ; then cp VERSION{"",".dev"}; else echo "0.0.0.dev1" > VERSION.dev ; fi \ - else if [[ "$(devver)" == *.dev* ]]; then echo $(call increase_dev,$(devver)); else echo $(call add_dev,$(call increase_patch, $(devver))); fi > VERSION.dev; \ - fi - -VERSION: version := $(shell git 
SCRIPT_DIR = Path(__file__).parent


def run_command(command, cwd=None, check=True):
    """
    Echo a command to stdout, execute it, and hand back the result.

    :param command: argument vector (program followed by its arguments);
        it is passed to ``subprocess.run`` as a list, so no shell is used
    :param cwd: working directory for the child process, or None to
        inherit the current one
    :param check: when true, a non-zero exit status terminates this
        script with that same status
    :returns: the ``subprocess.CompletedProcess`` of the finished command
    """
    rendered = " ".join(str(part) for part in command)
    print(f"Running: {rendered}")
    completed = subprocess.run(command, cwd=cwd)
    exit_code = completed.returncode
    if exit_code == 0 or not check:
        return completed
    print(f"Error: Command failed with exit code {exit_code}")
    sys.exit(exit_code)


def main():
    """
    Install the package (editable, with the ``dev`` extra) together with
    ``build``, then run ``python -m build`` from the project root.
    """
    argparse.ArgumentParser(
        description="Build the clams-python package."
    ).parse_args()

    project_root = SCRIPT_DIR.parent
    pip_install = [
        sys.executable, "-m", "pip",
        "install", "-e", ".[dev]", "build",
    ]
    build_dists = [sys.executable, "-m", "build"]

    # Each step is announced with its banner and then executed; a failing
    # step exits the script through run_command (check defaults to True).
    for banner, command in (
        ("--- Installing dependencies ---", pip_install),
        ("\n--- Building sdist + wheel ---", build_dists),
    ):
        print(banner)
        run_command(command, cwd=project_root)

    print("\nBuild complete. Output in: dist/")


if __name__ == "__main__":
    main()
SCRIPT_DIR = Path(__file__).parent
PROJECT_ROOT = SCRIPT_DIR.parent

# Directories to remove (glob patterns, relative to the project root)
CLEAN_DIRS = [
    "build", "dist", "*.egg-info", "clams_python-*",
    ".pytest_cache", ".pytype", ".hypothesis",
    "tests/.hypothesis", "htmlcov",
    "docs-test",
]

# Files to remove (plain names, relative to the project root)
CLEAN_FILES = [
    "coverage.xml", ".coverage",
]

# Glob patterns for recursive removal
CLEAN_GLOBS = [
    "**/__pycache__",
]


def _remove_dir(path: Path) -> bool:
    """Delete a directory, or the link itself when it is a symlink to one."""
    if not path.is_dir():
        # Not a directory, or already gone because a parent was removed.
        return False
    if path.is_symlink():
        # shutil.rmtree refuses to operate on symbolic links; drop the
        # link only and leave whatever it points at untouched.
        path.unlink()
    else:
        shutil.rmtree(path)
    return True


def clean(root: Path):
    """
    Remove build artifacts, caches and generated files below ``root``.

    Directory patterns from ``CLEAN_DIRS`` are handled first, then the
    files named in ``CLEAN_FILES``, then the recursive ``CLEAN_GLOBS``
    patterns. A summary of what was removed is printed at the end.

    :param root: project root directory to clean
    """
    removed = []

    def remove_matching_dirs(patterns):
        for pattern in patterns:
            # Materialise the matches before deleting anything so the
            # tree is never mutated while pathlib is still walking it.
            for p in sorted(root.glob(pattern)):
                if _remove_dir(p):
                    removed.append(str(p.relative_to(root)))

    remove_matching_dirs(CLEAN_DIRS)

    for name in CLEAN_FILES:
        p = root / name
        if p.is_file():
            p.unlink()
            removed.append(str(p.relative_to(root)))

    remove_matching_dirs(CLEAN_GLOBS)

    if removed:
        print(f"Removed {len(removed)} items:")
        for item in sorted(removed):
            print(f"  {item}")
    else:
        print("Nothing to clean.")


def main():
    """Command-line entry point: clean the project this script lives in."""
    parser = argparse.ArgumentParser(
        description="Clean build artifacts and caches."
    )
    parser.parse_args()
    clean(PROJECT_ROOT)


if __name__ == "__main__":
    main()
def run_command(command, cwd=None, check=True):
    """
    Print and run a command given as an argument list (no shell).

    :param command: program and arguments as a sequence
    :param cwd: working directory for the child process, or None
    :param check: when true, exit this script with the command's exit
        status if that status is non-zero
    :returns: the ``subprocess.CompletedProcess`` of the command
    """
    print(f"Running: {' '.join(str(c) for c in command)}")
    result = subprocess.run(command, cwd=cwd)
    if check and result.returncode != 0:
        print(f"Error: Command failed with exit code {result.returncode}")
        sys.exit(result.returncode)
    return result


def build_docs_local(source_dir: Path, output_dir: Path):
    """
    Builds documentation for the provided source directory.

    The package is first installed in editable mode with its ``docs``
    extra. If that installation fails, the build still proceeds as long
    as Sphinx is importable by the running interpreter.

    :param source_dir: Path to the source directory containing the project.
    :param output_dir: Path to the output directory for built documentation.
    :returns: ``output_dir``
    """
    # Local import: only this function needs it.
    import importlib.util

    print("--- Building clams-python documentation ---")

    # Install package with docs dependencies in editable mode.
    print("\n--- Step 1: Installing package with docs dependencies ---")
    install = run_command(
        [sys.executable, "-m", "pip", "install", "-e", ".[docs]"],
        cwd=source_dir,
        check=False,  # a failed install is tolerated, see below
    )
    if install.returncode != 0:
        print(
            "Warning: 'pip install -e .[docs]' failed "
            f"(exit code {install.returncode})."
        )
        # Sphinx is launched below as a module of this interpreter, so
        # probe for the module here rather than for a `sphinx-build`
        # executable that may belong to a different environment.
        if importlib.util.find_spec("sphinx") is None:
            print("Error: 'sphinx-build' not found and installation failed.")
            sys.exit(1)
        print("Assuming dependencies are already installed...")

    # Build the documentation using Sphinx.
    print("\n--- Step 2: Building Sphinx documentation ---")
    docs_source_dir = source_dir / "documentation"

    sphinx_command = [
        sys.executable, "-m", "sphinx.cmd.build",
        str(docs_source_dir),
        str(output_dir),
        "-b", "html",  # build html
        "-a",  # write all files (rebuild everything)
        "-E",  # don't use a saved environment, reread all files
    ]
    run_command(sphinx_command)

    print(f"\nDocumentation build complete. Output in: {output_dir}")
    return output_dir


def main():
    """
    Command-line entry point.

    The current working directory is used as the project source
    directory; ``--output-dir`` is created (including missing parents)
    before the build starts.
    """
    parser = argparse.ArgumentParser(
        description="Build documentation for the clams-python project."
    )
    parser.add_argument(
        "--build-ver",
        metavar="VERSION",
        default=None,
        help="Accepted for CLI compatibility with other SDK repos. "
             "Ignored by this script (clams-python uses "
             "unversioned documentation)."
    )
    parser.add_argument(
        "--output-dir",
        metavar="DIR",
        default="docs-test",
        help="The directory for documentation output "
             "(default: docs-test)."
    )
    args = parser.parse_args()

    output_dir = Path(args.output_dir)
    # parents=True so that a nested path such as `out/html` also works.
    output_dir.mkdir(parents=True, exist_ok=True)
    build_docs_local(Path.cwd(), output_dir)


if __name__ == "__main__":
    main()
SCRIPT_DIR = Path(__file__).parent
TESTPYPI_URL = "https://test.pypi.org/legacy/"


def run_command(command, cwd=None, capture=False, check=False):
    """
    Print and run a command given as an argument list (no shell).

    :param command: program and arguments as a sequence
    :param cwd: working directory for the child process, or None
    :param capture: when true, stdout/stderr are captured as text and
        available on the returned object
    :param check: when true, exit this script with the command's exit
        status if that status is non-zero
    :returns: the ``subprocess.CompletedProcess`` of the command
    :raises OSError: if the program cannot be launched at all
    """
    print(f"Running: {' '.join(str(c) for c in command)}")
    result = subprocess.run(
        command,
        cwd=cwd,
        capture_output=capture,
        text=capture,
    )
    if check and result.returncode != 0:
        print(
            f"Error: Command failed with exit code "
            f"{result.returncode}"
        )
        sys.exit(result.returncode)
    return result


def check_gh_available():
    """
    Check if gh CLI is installed and authenticated.

    :returns: True only when ``gh auth status`` runs and exits with 0;
        False when it fails or when ``gh`` cannot be launched
    """
    try:
        result = run_command(
            ["gh", "auth", "status"],
            capture=True,
        )
    except OSError:
        # `gh` is not installed (or not executable): report
        # "unavailable" instead of crashing with a traceback.
        return False
    return result.returncode == 0


def generate_changelog(repo=None):
    """
    Generate CHANGELOG.md from merged release PRs.

    :param repo: GitHub repo in owner/name format.
                 If None, uses the repo from the current git remote.
    :returns: True when CHANGELOG.md was written, False otherwise
    """
    project_root = SCRIPT_DIR.parent

    repo_args = ["--repo", repo] if repo else []

    # Query merged PRs; release PRs are selected by title further down.
    try:
        result = run_command(
            ["gh", "pr", "list",
             "-L", "1000",
             "-s", "merged",
             "--json", "number,title,body,mergedAt"]
            + repo_args,
            cwd=project_root,
            capture=True,
        )
    except OSError as err:
        print(f"Error querying PRs: {err}")
        return False
    if result.returncode != 0:
        print(f"Error querying PRs: {result.stderr}")
        return False

    try:
        prs = json.loads(result.stdout)
    except json.JSONDecodeError as err:
        print(f"Error parsing PR list: {err}")
        return False

    # Filter to release PRs (title starts with "releas", any case)
    release_prs = [
        pr for pr in prs
        if pr["title"].lower().startswith("releas")
    ]

    if not release_prs:
        print("No release PRs found.")
        return False

    # Sort by merge date (newest first)
    release_prs.sort(
        key=lambda pr: pr["mergedAt"], reverse=True
    )

    # Format changelog
    lines = []
    for pr in release_prs:
        merged_date = pr["mergedAt"][:10]  # YYYY-MM-DD
        lines.append(f"\n## {pr['title']} ({merged_date})")
        if pr["body"]:
            lines.append(pr["body"])
        lines.append("")

    changelog_path = project_root / "CHANGELOG.md"
    # Explicit UTF-8: PR text may contain characters that the platform's
    # default encoding cannot represent.
    changelog_path.write_text("\n".join(lines), encoding="utf-8")
    print(
        f"CHANGELOG.md written with {len(release_prs)} entries."
    )
    return True


def upload_to_pypi(testpypi=False):
    """
    Upload the source tarball(s) in dist/ to PyPI via twine.

    Auth via env vars: TWINE_USERNAME (default: __token__),
    TWINE_PASSWORD (required; the upload is skipped with a warning
    when it is not set).

    :param testpypi: upload to TestPyPI instead of PyPI
    """
    project_root = SCRIPT_DIR.parent
    dist_dir = project_root / "dist"

    # NOTE(review): only *.tar.gz files are uploaded; wheels that may
    # also be present in dist/ are left out — confirm this is intended.
    tarballs = list(dist_dir.glob("*.tar.gz"))
    if not tarballs:
        print("No tarball found in dist/. Run build.py first.")
        sys.exit(1)

    if not os.environ.get("TWINE_PASSWORD"):
        print(
            "Warning: TWINE_PASSWORD not set. "
            "Skipping PyPI upload."
        )
        print(
            "Set TWINE_PASSWORD to your PyPI API token "
            "to enable upload."
        )
        return

    # Set default username for token auth
    if not os.environ.get("TWINE_USERNAME"):
        os.environ["TWINE_USERNAME"] = "__token__"

    # Install twine; stop here if that fails rather than letting the
    # upload step fail later with a less obvious error.
    run_command(
        [sys.executable, "-m", "pip", "install", "twine"],
        cwd=project_root,
        check=True,
    )

    # Build upload command
    cmd = [sys.executable, "-m", "twine", "upload"]
    if testpypi:
        cmd.extend(["--repository-url", TESTPYPI_URL])
    cmd.extend(str(t) for t in tarballs)

    run_command(cmd, cwd=project_root, check=True)


def main():
    """
    Command-line entry point: optionally generate CHANGELOG.md, then
    optionally upload dist/ to PyPI (or TestPyPI).
    """
    parser = argparse.ArgumentParser(
        description="Publish: generate CHANGELOG and upload to PyPI."
    )
    parser.add_argument(
        "--repo",
        default=None,
        help="GitHub repo in owner/name format "
             "(default: infer from git remote)."
    )
    parser.add_argument(
        "--skip-changelog",
        action="store_true",
        help="Skip changelog generation."
    )
    parser.add_argument(
        "--skip-upload",
        action="store_true",
        help="Skip PyPI upload (changelog only)."
    )
    parser.add_argument(
        "--testpypi",
        action="store_true",
        help="Upload to TestPyPI instead of PyPI."
    )
    args = parser.parse_args()

    # Changelog
    if args.skip_changelog:
        print("Skipping changelog generation.")
    elif not check_gh_available():
        print(
            "Warning: gh CLI not available or not authenticated. "
            "Skipping changelog generation."
        )
    elif not generate_changelog(repo=args.repo):
        sys.exit(1)

    # Upload
    if args.skip_upload:
        print("Skipping PyPI upload.")
    else:
        upload_to_pypi(testpypi=args.testpypi)


if __name__ == "__main__":
    main()
SCRIPT_DIR = Path(__file__).parent


def run_command(command, cwd=None, check=True):
    """
    Announce a command on stdout, run it, and return its result.

    :param command: program and arguments as a sequence (run without a
        shell)
    :param cwd: directory to run the command in, or None
    :param check: when true, a non-zero exit status ends this script
        with the same status
    :returns: the ``subprocess.CompletedProcess`` of the command
    """
    shown = " ".join(map(str, command))
    print(f"Running: {shown}")
    outcome = subprocess.run(command, cwd=cwd)
    status = outcome.returncode
    if check and status:
        print(
            f"Error: Command failed with exit code "
            f"{status}"
        )
        sys.exit(status)
    return outcome


def main():
    """
    Install the package with its ``test`` extra (unless
    ``--skip-install`` is given), then run pytype followed by pytest
    with coverage, all from the project root.
    """
    parser = argparse.ArgumentParser(
        description="Run tests for the clams-python package."
    )
    parser.add_argument(
        "--skip-install",
        action="store_true",
        help="Skip pip install step "
             "(useful if already installed)."
    )
    options = parser.parse_args()

    project_root = SCRIPT_DIR.parent

    # Ordered (banner, command) pairs; run_command aborts the script on
    # the first step that fails.
    steps = []
    if not options.skip_install:
        steps.append((
            "--- Installing package with test dependencies ---",
            [sys.executable, "-m", "pip",
             "install", "-e", ".[test]"],
        ))
    steps.append((
        "\n--- Running pytype ---",
        ["pytype", "--config", ".pytype.cfg", "clams"],
    ))
    steps.append((
        "\n--- Running pytest ---",
        [sys.executable, "-m", "pytest",
         "--cov=clams", "--cov-report=xml"],
    ))

    for banner, command in steps:
        print(banner)
        run_command(command, cwd=project_root)

    print("\nAll tests passed.")


if __name__ == "__main__":
    main()
@staticmethod
def kv_param(value) -> Dict[str, str]:
    """
    Helper function to convert a delimiter-separated string into a
    single-entry dictionary. The first occurrence of
    ``map_param_kv_delimiter`` is used as the key-value delimiter, so the
    delimiter cannot appear in keys.

    :param value: a ``KEY<delimiter>VALUE`` string
    :type value: str
    :returns: a single-entry dict parsed from the input
    :rtype: Dict[str, str]
    :raises ValueError: if ``value`` does not contain the delimiter
    """
    k, sep, v = value.partition(map_param_kv_delimiter)
    if not sep:
        # Still a ValueError (as the bare tuple-unpacking failure was),
        # but with a message that names the offending input.
        raise ValueError(
            f"The map parameter value {value!r} must be formatted as "
            f"KEY{map_param_kv_delimiter}VALUE."
        )
    if map_param_kv_delimiter in v:
        warnings.warn(
            f"The map parameter value {value!r} contains "
            f"multiple '{map_param_kv_delimiter}' characters. "
            f"Only the first one is used as the delimiter "
            f"(key={k!r}, value={v!r}). "
            f"Colons are not allowed in map parameter keys."
        )
    return {k: v}
\n\n" + "Also, the ``VALUE`` part of the user input is always expected and handled as a string. If a " + "developer wants to do more text processing on the passed value to accept more complex data types " "or structures (e.g., map from a string to a list of strings), it is up to the developer. However, " "any additional form requirements should be precisely described in the ``description`` field for " "users. \n\n" @@ -438,6 +440,18 @@ def add_input(self, at_type: Union[str, vocabulary.ThingTypesBase], required: bo return new def add_input_oneof(self, *inputs: Union[str, Input, vocabulary.ThingTypesBase]): + """ + Helper method to add a ``oneOf`` (disjunctive) group to + the ``input`` list. When a single type is given, it is + added as a regular (conjunctive) input. When multiple + types are given, they are wrapped in a nested list to + indicate that exactly one of them is required. + + :param inputs: one or more input types (as URI strings, + :class:`Input` objects, or vocabulary types) + :raises ValueError: if any input in a ``oneOf`` group is + optional, or if a duplicate input is detected + """ newinputs = [] if len(inputs) == 1: if isinstance(inputs[0], Input): @@ -520,6 +534,13 @@ def add_more(self, key: str, value: str): raise ValueError("Key and value should not be empty!") def jsonify(self, pretty=False): + """ + Serialize the app metadata to a JSON string. 
ENVELOPE_KEY = 'parameters'
MMIF_KEY = 'mmif'


class EnvelopeError(ValueError):
    """
    Signals a malformed envelope: the body carries a top-level
    ``"parameters"`` key but is otherwise unusable. Deriving from
    ``ValueError`` keeps existing ``except ValueError`` handlers working
    while letting callers single out envelope problems.
    """


def normalize_params(params: dict) -> Dict[str, List[str]]:
    """
    Convert JSON-native parameter values into lists of strings.

    A list becomes the list of its stringified elements, a dict becomes
    one ``key<delimiter>value`` string per entry (joined with
    ``map_param_kv_delimiter``), and anything else becomes a one-element
    list holding its string form.

    :param params: parameter dict with JSON-native values
    :returns: dict with the same keys where every value is a list of
        strings
    :rtype: Dict[str, List[str]]
    """
    def as_strings(value) -> List[str]:
        if isinstance(value, list):
            return [str(item) for item in value]
        if isinstance(value, dict):
            return [
                f"{key}{map_param_kv_delimiter}{val}"
                for key, val in value.items()
            ]
        return [str(value)]

    return {name: as_strings(value) for name, value in params.items()}


def is_envelope(body: dict) -> bool:
    """
    Tell whether a parsed JSON body is an envelope, i.e. a dict with a
    top-level ``"parameters"`` key.

    :param body: parsed JSON value
    :returns: True if the body appears to be an envelope
    :rtype: bool
    """
    if not isinstance(body, dict):
        return False
    return ENVELOPE_KEY in body


def unwrap_envelope(body: dict) -> Tuple[str, Dict[str, List[str]]]:
    """
    Split an envelope into its MMIF payload and normalized parameters.

    :param body: parsed JSON dict with ``"parameters"`` and ``"mmif"``
    :returns: tuple of (mmif_json_string, normalized_params)
    :rtype: Tuple[str, Dict[str, List[str]]]
    :raises EnvelopeError: if ``"parameters"`` is not a dict or the
        ``"mmif"`` key is missing
    """
    raw_params = body.get(ENVELOPE_KEY)
    if not isinstance(raw_params, dict):
        found = type(raw_params).__name__
        raise EnvelopeError(
            f'"{ENVELOPE_KEY}" must be a JSON object, '
            f'got {found}'
        )
    if MMIF_KEY not in body:
        raise EnvelopeError(
            f'Envelope is missing required "{MMIF_KEY}" key'
        )
    serialized = json.dumps(body[MMIF_KEY])
    return serialized, normalize_params(raw_params)


def unwrap_if_envelope(data, runtime_params):
    """
    Unwrap ``data`` when it is (or decodes to) an envelope.

    For an envelope, the inner MMIF is returned as a JSON string
    together with the envelope parameters overlaid by
    ``runtime_params``, so explicitly-passed parameters win on key
    collision. Anything else, including a string that is not valid
    JSON, is handed back untouched along with ``runtime_params``.

    :param data: raw input -- ``bytes``, ``str``, or ``dict``
    :param runtime_params: explicitly-passed parameters that override
        envelope parameters on key collision
    :returns: tuple of (mmif_or_original_data, effective_params)
    :raises EnvelopeError: if ``data`` is a malformed envelope
    """
    passthrough = (data, runtime_params)

    if isinstance(data, bytes):
        candidate = data.decode('utf-8')
    else:
        candidate = data

    if isinstance(candidate, str):
        try:
            candidate = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError
            return passthrough

    if not is_envelope(candidate):
        return passthrough

    inner_mmif, envelope_params = unwrap_envelope(candidate)
    return inner_mmif, {**envelope_params, **runtime_params}
+ + :param data: raw input -- ``bytes``, ``str``, or ``dict`` + :param runtime_params: explicitly-passed parameters that override + envelope parameters on key collision + :returns: tuple of (mmif_or_original_data, effective_params) + :raises EnvelopeError: if ``data`` is a malformed envelope + """ + raw = data.decode('utf-8') if isinstance(data, bytes) else data + body = raw + if isinstance(raw, str): + try: + body = json.loads(raw) + except (json.JSONDecodeError, ValueError): + return data, runtime_params + if is_envelope(body): + inner_mmif, envelope_params = unwrap_envelope(body) + return inner_mmif, {**envelope_params, **runtime_params} + return data, runtime_params + + +def create_envelope( + mmif: Union[str, dict, Mmif], + parameters: Optional[dict] = None, +) -> str: + """ + Create a JSON envelope string wrapping MMIF and parameters. + + :param mmif: MMIF as a string, dict, or + :class:`~mmif.serialize.mmif.Mmif` object + :param parameters: parameter dict with JSON-native values + :returns: JSON string of the envelope + :rtype: str + """ + if isinstance(mmif, Mmif): + mmif_obj = json.loads(mmif.serialize()) + elif isinstance(mmif, str): + mmif_obj = json.loads(mmif) + else: + mmif_obj = mmif + envelope = { + ENVELOPE_KEY: parameters if parameters is not None else {}, + MMIF_KEY: mmif_obj, + } + return json.dumps(envelope) + + +# -- CLI interface --------------------------------------------------- + +def describe_argparser(): + """ + :returns: tuple of (one-line help, detailed description) + """ + oneliner = ( + 'create a JSON envelope wrapping MMIF and runtime parameters' + ) + detailed = ( + 'Reads a JSON parameter file and an MMIF file (or stdin), ' + 'combines them into a JSON envelope, and writes the result ' + 'to stdout. The envelope can be POSTed directly to a CLAMS ' + 'app HTTP endpoint.' 
+ ) + return oneliner, detailed + + +def prep_argparser(**kwargs): + """ + :returns: argparse.ArgumentParser for the envelop subcommand + """ + parser = argparse.ArgumentParser( + description=describe_argparser()[1], + formatter_class=argparse.RawDescriptionHelpFormatter, + **kwargs, + ) + parser.add_argument( + 'PARAMS_FILE', + type=argparse.FileType('r'), + help='Path to a JSON file containing runtime parameters.', + ) + parser.add_argument( + 'MMIF_FILE', + nargs='?', + type=argparse.FileType('r'), + default=None if sys.stdin.isatty() else sys.stdin, + help=( + 'Path to the input MMIF file, or stdin if omitted. ' + 'Use "-" to explicitly read from stdin.' + ), + ) + return parser + + +def main(args): + """ + CLI entry point. Reads params JSON and MMIF, writes envelope + JSON to stdout. + """ + if args.MMIF_FILE is None: + print( + 'Error: no MMIF input provided ' + '(pass a file path or pipe to stdin)', + file=sys.stderr, + ) + sys.exit(1) + params = json.load(args.PARAMS_FILE) + mmif_str = args.MMIF_FILE.read() + print(create_envelope(mmif_str, parameters=params)) diff --git a/clams/restify/__init__.py b/clams/restify/__init__.py index 811ee4a..5f99045 100644 --- a/clams/restify/__init__.py +++ b/clams/restify/__init__.py @@ -1,9 +1,11 @@ +import json + import jsonschema from flask import Flask, request, Response from flask_restful import Resource, Api -from mmif import Mmif from clams.app import ClamsApp +from clams.envelop import EnvelopeError class Restifier(object): @@ -194,13 +196,28 @@ def post(self) -> Response: # this will catch duplicate arguments with different values into a list under the key raw_params = request.args.to_dict(flat=False) try: - _ = Mmif(raw_data) - except jsonschema.exceptions.ValidationError as e: - return Response(response="Invalid input data. 
See below for validation error.\n\n" + str(e), status=500, mimetype='text/plain') - try: - return self.json_to_response(self.cla.annotate(raw_data, **raw_params)) + return self.json_to_response( + self.cla.annotate(raw_data, **raw_params)) + except (jsonschema.exceptions.ValidationError, + json.JSONDecodeError, EnvelopeError) as e: + # jsonschema's str(e) dumps the entire MMIF schema; use its + # concise .message instead so envelope and MMIF input + # errors share the same compact payload format. + detail = ( + e.message + if isinstance(e, jsonschema.exceptions.ValidationError) + else str(e)) + return Response( + response="Invalid input data. " + "See below for validation error.\n\n" + + detail, + status=500, mimetype='text/plain') except Exception: self.cla.logger.exception("Error in annotation") - return self.json_to_response(self.cla.record_error(raw_data, **raw_params).serialize(pretty=True), status=500) + return self.json_to_response( + self.cla.record_error( + raw_data, **raw_params + ).serialize(pretty=True), + status=500) put = post diff --git a/clams/ver/__init__.py b/clams/ver/__init__.py new file mode 100644 index 0000000..38a483d --- /dev/null +++ b/clams/ver/__init__.py @@ -0,0 +1,2 @@ +from importlib.metadata import version +__version__ = version("clams-python") diff --git a/container/generate_containers_yml.py b/container/generate_containers_yml.py index 51ab9cd..607f900 100644 --- a/container/generate_containers_yml.py +++ b/container/generate_containers_yml.py @@ -63,7 +63,7 @@ def generate_workflow(container_files, dependencies, output_file): f.write(" - name: \"⏱️ Wait up to 20 minutes for the new clams-python is deployed on PyPI\"\n") f.write(" uses: nev7n/wait_for_response@v1\n") f.write(" with:\n") - f.write(" url: \"https://pypi.org/project/clams-python/${{ needs.set-version.outputs.version }}/\"\n") + f.write(" url: \"https://pypi.org/pypi/clams-python/${{ needs.set-version.outputs.version }}/json\"\n") f.write(" responseCode: 200\n") 
f.write(" timeout: 1200000\n") f.write(" interval: 5000\n") diff --git a/documentation/appmetadata.rst b/documentation/appmetadata.rst index 537c90c..d0ba20a 100644 --- a/documentation/appmetadata.rst +++ b/documentation/appmetadata.rst @@ -13,7 +13,7 @@ Format A CLAMS App Metadata should be able to be serialized into a JSON string. -Input/Output type specification +Input/Output Type Specification =============================== Essentially, all CLAMS apps are designed to take one MMIF file as input and produce another MMIF file as output. In this @@ -29,7 +29,7 @@ how that information should be formatted in terms of the App Metadata syntax, co additional information about submission. Visit the `CLAMS app directory `_ to see how the app metadata is rendered. -Annotation types in MMIF +Annotation Types in MMIF ------------------------ As described in the `MMIF documentation `_, MMIF files can contain annotations of various types. @@ -56,8 +56,8 @@ needs to add additional information to the type definition, they can do so by ad definition in action. In such a case, the app developer is expected to provide the explanation of the extended type in the app metadata. See below for the syntax of I/O specification in the app metadata. -Syntax for I/O specification in App Metadata --------------------------------------------- +Syntax for I/O Specification in App Metadata +--------------------------------------------- In the App Metadata, the input and output types are specified as lists of objects. Each object in the list should have the following fields: @@ -69,7 +69,7 @@ the following fields: defaults to ``true``. Not applicable for output types. 
-Simple case - using types as defined in the vocabularies +Simple Case - Using Types as Defined in the Vocabularies ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In the simplest case, where a developer merely re-uses an annotation type definition and pre-defined properties, an @@ -195,14 +195,14 @@ Note that in the actual output MMIF, more properties can be stored in the ``Time specification in the app metadata is a subset of the properties to be produced that are useful for type checking in the downstream apps, as well as for human readers to understand the output. -Extended case - adding custom properties to the type definition +Extended Case - Adding Custom Properties to the Type Definition ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ When the type definition is extended on the fly, developers are expected to provide the extended specification in the form of key-value pairs in the ``properties`` field. The grammar of the JSON object does not change, but developers are expected to provide a verbose description of the type extension in the ``description`` field. -Runtime parameter specification +Runtime Parameter Specification =============================== CLAMS apps designed to be run as HTTP servers, preferably as `stateless `_. @@ -219,7 +219,7 @@ can be specified as ``multivalued=True`` to accept multiple values as a list. Fo parameter value parsing works, please refer to the App Metadata json scheme (in the `below <#clams-app-runtime-parameter>`_ section). -Syntax for parameter specification in App Metadata +Syntax for Parameter Specification in App Metadata -------------------------------------------------- Metadata Schema @@ -228,9 +228,11 @@ Metadata Schema The schema for app metadata is as follows. (You can also download the schema in `JSON Schema `_ format from `here `_.) -.. jsonschema:: appmetadata.jsonschema +.. 
jsonschema:: appmetadata.jsonschema :lift_description: True :lift_title: True :lift_definitions: True + :auto_target: True + :auto_reference: True diff --git a/documentation/autodoc/clams.appmetadata.rst b/documentation/autodoc/clams.appmetadata.rst index 1849208..bd94d78 100644 --- a/documentation/autodoc/clams.appmetadata.rst +++ b/documentation/autodoc/clams.appmetadata.rst @@ -8,12 +8,12 @@ Package providing classes for representing metadata of CLAMS apps. .. autoclass:: clams.appmetadata.Input :members: - :inherited-members: + :inherited-members: BaseModel .. autoclass:: clams.appmetadata.Output :members: - :inherited-members: + :inherited-members: BaseModel .. autoclass:: clams.appmetadata.RuntimeParameter :members: - :inherited-members: + :inherited-members: BaseModel diff --git a/documentation/clamsapp.md b/documentation/clamsapp.md deleted file mode 100644 index 5ec697f..0000000 --- a/documentation/clamsapp.md +++ /dev/null @@ -1,275 +0,0 @@ -## Using CLAMS App - -This document provides general instructions for installing and using CLAMS Apps. -App developers may provide additional information specific to their app, -hence it's advised also to look up the app website (or code repository) to get the additional information. - -### Requirements - -Generally, a CLAMS App requires - -- To run the app in a container (as an HTTP server), container management software such as `docker` or `podman`. This is the recommended way to use CLAMS Apps. - - (the CLAMS team is using `docker` for development and testing, hence the instructions are based on `docker` commands.) -- To run the app locally, Python3 with the `clams-python` module installed. Python 3.8 or higher is required. -- To invoke and execute analysis, HTTP client utility (such as `curl`). - -For Python dependencies, usually CLAMS Apps come with `requirements.txt` files that list up the Python library. -However, there could be other non-Python software/library that are required by the app. 
- -### Installation - -CLAMS Apps available on the CLAMS App Directory. Currently, all CLAMS Apps are open-source projects and are distributed as - -1. source code downloadable from code repository -1. pre-built container image - -Please visit [the app-directory](https://apps.clams.ai) to see which apps are available and where you can download them. - -In most cases, you can "install" a CLAMS App by either - -1. downloading pre-built container image directly (quick-and-easy way) -1. downloading source code from the app code repository and manually building a container image (more flexible way if you want to modify the app, or have to build for a specific HW) - -#### Download prebuilt image - -This is the quickest (and recommended) way to get started with a CLAMS App. -CLAMS apps in the App Directory come with public prebuilt container images, available in a container registry. - -``` bash -docker pull -``` - -The image name can be found on the App Directory entry of the app. - -#### Build an image from source code - -Alternatively, you can build a container image from the source code. -This is useful when you want to modify the app itself, or when you want to change the image building process to adjust to your hardware environment (e.g., specific compute engine), or additional software dependencies (e.g. [MMIF plugins](https://clams.ai/mmif-python/latest/plugins.html)). -To download the source code, you can either use `git clone` command or download a zip file from the source code repository. -The source code repository address can be found on the App Directory entry of the app. - -From the locally downloaded project directory, run the following in your terminal to build an image from the included container specification file. - -(Assuming you are using `docker` as your container manager) - -```bash -$ docker build . 
-f Containerfile -t -``` - -### Running CLAMS App - -CLAMS Apps are primarily designed to run as an HTTP server, but some apps written based on `clams-python` SDK additionally provide CLI equivalent to the HTTP requests. -In this session, we will first cover the usage of CLAMS apps as an HTTP server, and then cover the (optional) CLI. - -#### Starting the HTTP server as a container - -Once the image is built (by `docker build`) or downloaded (by `docker pull`), to create and start a container, run: - -```bash -$ docker run -v /path/to/data/directory:/data -p :5000 -``` - -where `/path/to/data/directory` is the local location of your media files or MMIF objects and `PORT` is the *host* port number you want your container to be listening to. -The HTTP inside the container will be listening to 5000 by default, so the second part of the `-p` argument is always `5000`. -Usually any number above 1024 is fine for the host port number, and you can use the same 5000 number for the host port number. - -The mount point for the data directory inside the container can be any path, and we used `/data` just as an example. -However, it is very important to understand that the file location in the input MMIF file must be a valid and available path inside the container (see below for more details). - -> **Note** -> If you are using a Mac, on recent versions of macOS, port 5000 is used by Airplay Receiver by default. So you may need to use a different port number, or turn off the Airplay Receiver in the System Preferences to release 5000. -> For more information on *safe* port numbers, see [IANA Port Number Registry](https://www.iana.org/assignments/service-names-port-numbers/service-names-port-numbers.xhtml) or [Wikipedia](https://en.wikipedia.org/wiki/List_of_TCP_and_UDP_port_numbers). - -> **Note** -> Another note for users of recent Macs with Apple Silicon (M1, M2, etc) CPU: you might see the following error message when you run the container image. 
-> ``` -> The requested image's platform (linux/amd64) does not match the detected host platform (linux/arm64/v8) and no specific platform was requested -> ``` -> This is because the image you are trying to run is built for Intel/AMD x64 CPUs. To force the container to run on an emulation layer, you can add `--platform linux/amd64` option to the `docker run` command. - -Additionally, you can mount a directory to `/cache/` inside the container to persist the cache data between container runs. -This is particularly handy when the app you are using downloads a fairly large pretrained model file on the first run, and you want to keep it for the next run. - -Unlike the data directory, the cache directory is not required to be mounted, but if you want to persist the cache data, you can mount a local directory to `/cache/` inside the container (fixed path). - -```bash -docker run -v /path/to/data/directory:/data -v /path/to/cache/directory:/cache -p :5000 -``` - -> **Note** -> One might be tempted bind-mount their entire local cache directory (usually `~/.cache` in Linux systems) to re-use locally downloaded model files, across different apps. -> However, doing so will expose all the cached data, not just model files, to the container. -> This can include sensitive information such as browser cache, authentication tokens, etc, hence will pose a great security risk. -> It is recommended to create a separate directory to use as a cache directory for CLAMS containers. - - -### Invoking the app server - -#### To get app metadata - -Once the app is running as an HTTP server, visit the server address ([localhost:5000](http://localhost:5000), or the remote host name if running on a remote computer) to get the app metadata. -App metadata is also available at the App Directory entry of the app if the app is published on the App Directory. -App metadata contains important information about the app that we will use in the following sections. 
- -#### To process input media - -To actually run the app and process input media through computational analysis, simply send a POST request to the app with a MMIF input as the request body. - -MMIF input files can be obtained from outputs of other CLAMS apps, or you can create an *empty* MMIF only with source media locations using `clams source` command. See the help message for a more detailed instructions. -(Make sure you have installed [`clams-python` package](https://pypi.org/project/clams-python/) version from PyPI.) - -```bash -$ pip install clams-python -$ clams source --help -``` - -For example; by running - -```bash -$ clams source audio:/data/audio/some-audio-file.mp3 -``` - -You will get - -```json -{ - "metadata": { - "mmif": "http://mmif.clams.ai/X.Y.Z" - }, - "documents": [ - { - "@type": "http://mmif.clams.ai/vocabulary/AudioDocument/v1", - "properties": { - "mime": "audio", - "id": "d1", - "location": "file:///data/audio/some-audio-file.mp3" - } - } - ], - "views": [] -} -``` - -If an app requires just `Document` inputs (see `input` section of the app metadata), an empty MMIF with required media file locations will suffice. -The location has to be a URL or an absolute path, and it is important to ensure that it exists. -Especially when running the app in a container, and the document location is specified as a file system path, the file must be available inside the container. -In the above, we bind-mounted `/path/to/data/directory` (host) to `/data` (container). -That is why we used `/data/audio/some-audio-file.mp3` as the location when generating this MMIF input. -So in this example, the file `/path/to/data/directory/audio/some-audio-file.mp3` must exist on the host side, so that inside the container, it can be accessed as `/data/audio/some-audio-file.mp3`. - -Some apps only works with input MMIF that already contains some annotations of specific types. To run such apps, you need to run different apps in a sequence. 
- -(TODO: added CLAMS workflow documentation link here.) - -When an input MMIF is ready, you can send it to the app server. -Here's an example of how to use the `curl` command, and store the response in a file `output.mmif`. - -```bash -$ clams source audio:/data/audio/some-audio-file.mp3 > input.mmif -$ curl -H "Accept: application/json" -X POST -d@input.mmif -s http://localhost:5000 > output.mmif - -# or using a bash pipeline -$ clams source audio:/data/audio/some-audio-file.mp3 | curl -X POST -d@- -s http://localhost:5000 > output.mmif -``` - -Windows PowerShell users may encounter an `Invoke-WebRequest` exception when attempting to send an input file with `curl`. -This can be resolved for the duration of the current session by using the command `remove-item alias:curl` before proceeding to use `curl`. - -#### Configuring the app - -Running as an HTTP server, CLAMS Apps are stateless, but can be configured for each HTTP request by providing configuration parameters as [query string](https://en.wikipedia.org/wiki/Query_string). - -For example, appending `?pretty=True` to the URL will result in a JSON output with indentation for better readability. - -> **Note** -> When you're using `curl` from a shell session, you need to escape the `?` or `&` characters with `\` to prevent the shell from interpreting it as a special character. - -Different apps have different configurability. For configuration parameters of an app, please refer to `parameter` section of the app metadata. In addition to app-specific parameters, all apps support universal parameters (e.g., `pretty` for formatted output). Check the app metadata for the complete and up-to-date list. - -### Using CLAMS App as a CLI program - -First and foremost, not all CLAMS Apps support command line interface (CLI). -At the minimum, a CLAMS app is required to support HTTP interfaces described in the previous section. 
-If any of the following instructions do not work for an app, it is likely that the app does not support CLI. - -#### Python entry points - -Apps written on `clams-python` SDK have three python entry points by default: `app.py`, `metadata.py`, and `cli.py`. - -#### `app.py`: Running app as a local HTTP server - -`app.py` is the main entry point for running the app as an HTTP server. -To run the app as a local HTTP server without containerization, you can run the following command from the source code directory. - -```bash -$ python app.py -``` - -* By default, the app will be listening to port 5000, but you can change the port number by passing `--port ` option. -* Be default, the app will be running in *debugging* mode, but you can change it to *production* mode by passing `--production` option to support larger traffic volume. -* As you might have noticed, the default `CMD` in the prebuilt containers is `python app.py --production --port 5000`. - -##### Environment variables for production mode - -When running in production mode, the following environment variables can be used to configure the app server: - -| Variable | Description | Default | -|----------|-------------|---------| -| `CLAMS_GUNICORN_WORKERS` | Number of gunicorn worker processes | Auto-calculated based on CPU cores and GPU memory | -| `CLAMS_LOGLEVEL` | Logging verbosity level (`debug`, `info`, `warning`, `error`) | `warning` | - -By default, the number of workers is calculated as `(CPU cores × 2) + 1`. For GPU-based apps, see [GPU Memory Management](gpu-apps.md) for details on automatic worker scaling and VRAM management. - -#### `metadata.py`: Getting app metadata - -Running `metadata.py` will print out the app metadata in JSON format. - -#### `cli.py`: Running as a CLI program - -`cli.py` is completely optional for app developers, and unlike the other two above that are guaranteed to be available, `cli.py` may not be available for some apps. 
-When running an app as a HTTP app, the input MMIF must be passed as POST request's body, and the output MMIF will be returned as the response body. -To mimic this behavior in a CLI, `cli.py` has two positional arguments; - -``` bash -$ python cli.py # will read INPUT_MMIF file, process it, and write the result to OUTPUT_MMIF file -``` - -`` and `` are file paths to the input and output MMIF files, respectively. -Following the common unix CLI practice, you can use `-` to represent STDIN and/or STDOUT - -``` bash -# will read from STDIN, process it, and write the result to STDOUT -$ python cli.py - - - -# or equivalently -$ python cli.py - -# read from a file, write to STDOUT -$ python cli.py input.mmif - - -# or equivalently -$ python cli.py input.mmif - -# read from STDIN, write to a file -$ cat input.mmif | python cli.py - output.mmif -``` - -As with the HTTP server, you can pass configuration parameters to the CLI program. -All parameter names are the same as the HTTP query parameters, but you need to use `--` prefix to indicate that it is a parameter. - -``` bash -$ python cli.py --pretty True input.mmif output.mmif -``` - -Finally, when running the app as a container, you can override the default `CMD` (`app.py`) by passing a `cli.py` command to the `docker run` command. - -``` bash -$ cat input.mmif | docker run -i -v /path/to/data/directory:/data python cli.py -``` - -Note that `input.mmif` is in the host machine, and the container is reading it from the STDIN. -You can also pass the input MMIF file as a volume to the container. -However, when you do so, you need to make sure that the file path in the MMIF is correctly set to the file path in the container. - -> **Note** -> Here, make sure to pass [`-i` option to the `docker run`](https://docs.docker.com/reference/cli/docker/container/run/#interactive) command to make host's STDIN work properly with the container. 
diff --git a/documentation/clamsapp.rst b/documentation/clamsapp.rst new file mode 100644 index 0000000..b1af2db --- /dev/null +++ b/documentation/clamsapp.rst @@ -0,0 +1,460 @@ +.. _clamsapp: + +Using CLAMS App +=============== + +This document provides general instructions for installing and using CLAMS Apps. +App developers may provide additional information specific to their app, +hence it's advised also to look up the app website (or code repository) to get the additional information. + +.. _clamsapp-requirements: + +Requirements +------------ + +Generally, a CLAMS App requires + +- To run the app in a container (as an HTTP server), container management software such as ``docker`` or ``podman``. This is the recommended way to use CLAMS Apps. + + - (the CLAMS team is using ``docker`` for development and testing, hence the instructions are based on ``docker`` commands.) + +- To run the app locally, Python3 with the ``clams-python`` module installed. Python 3.8 or higher is required. +- To invoke and execute analysis, HTTP client utility (such as ``curl``). + +For Python dependencies, usually CLAMS Apps come with ``requirements.txt`` files that list the required Python libraries. +However, there could be other non-Python software/library that are required by the app. + +.. _clamsapp-installation: + +Installation +------------ + +CLAMS Apps are available on the CLAMS App Directory. Currently, all CLAMS Apps are open-source projects and are distributed as + +#. source code downloadable from code repository +#. pre-built container image + +Please visit `the app-directory `_ to see which apps are available and where you can download them. + +In most cases, you can "install" a CLAMS App by either + +#. downloading pre-built container image directly (quick-and-easy way) +#.
downloading source code from the app code repository and manually building a container image (more flexible way if you want to modify the app, or have to build for a specific HW) + +Download prebuilt image +^^^^^^^^^^^^^^^^^^^^^^^ + +This is the quickest (and recommended) way to get started with a CLAMS App. +CLAMS apps in the App Directory come with public prebuilt container images, available in a container registry. + +.. code-block:: bash + + docker pull + +The image name can be found on the App Directory entry of the app. + +Build an image from source code +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Alternatively, you can build a container image from the source code. +This is useful when you want to modify the app itself, or when you want to change the image building process to adjust to your hardware environment (e.g., specific compute engine), or additional software dependencies (e.g. `MMIF plugins `_). +To download the source code, you can either use ``git clone`` command or download a zip file from the source code repository. +The source code repository address can be found on the App Directory entry of the app. + +From the locally downloaded project directory, run the following in your terminal to build an image from the included container specification file. + +(Assuming you are using ``docker`` as your container manager) + +.. code-block:: bash + + $ docker build . -f Containerfile -t + +.. _clamsapp-running: + +Running CLAMS App +----------------- + +CLAMS Apps are primarily designed to run as an HTTP server, but some apps written based on ``clams-python`` SDK additionally provide CLI equivalent to the HTTP requests. +In this section, we will first cover the usage of CLAMS apps as an HTTP server, and then cover the (optional) CLI. + +Starting the HTTP server as a container +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Once the image is built (by ``docker build``) or downloaded (by ``docker pull``), to create and start a container, run: + +..
code-block:: bash + + $ docker run -v /path/to/data/directory:/data -p :5000 + +where ``/path/to/data/directory`` is the local location of your media files or MMIF objects and ``PORT`` is the *host* port number you want your container to be listening to. +The HTTP server inside the container listens on port 5000 by default, so the second part of the ``-p`` argument is always ``5000``. +Usually any number above 1024 is fine for the host port number, and you can use the same 5000 number for the host port number. + +The mount point for the data directory inside the container can be any path, and we used ``/data`` just as an example. +However, it is very important to understand that the file location in the input MMIF file must be a valid and available path inside the container (see below for more details). + +.. note:: + + If you are using a Mac, on recent versions of macOS, port 5000 is used by Airplay Receiver by default. So you may need to use a different port number, or turn off the Airplay Receiver in the System Preferences to release 5000. + For more information on *safe* port numbers, see `IANA Port Number Registry `_ or `Wikipedia `_. + +.. note:: + + Another note for users of recent Macs with Apple Silicon (M1, M2, etc) CPU: you might see the following error message when you run the container image. + + :: + + The requested image's platform (linux/amd64) does not match the detected host platform (linux/arm64/v8) and no specific platform was requested + + This is because the image you are trying to run is built for Intel/AMD x64 CPUs. To force the container to run on an emulation layer, you can add ``--platform linux/amd64`` option to the ``docker run`` command. + +Additionally, you can mount a directory to ``/cache/`` inside the container to persist the cache data between container runs. +This is particularly handy when the app you are using downloads a fairly large pretrained model file on the first run, and you want to keep it for the next run.
+ +Unlike the data directory, the cache directory is not required to be mounted, but if you want to persist the cache data, you can mount a local directory to ``/cache/`` inside the container (fixed path). + +.. code-block:: bash + + docker run -v /path/to/data/directory:/data -v /path/to/cache/directory:/cache -p :5000 + +.. note:: + + One might be tempted to bind-mount their entire local cache directory (usually ``~/.cache`` in Linux systems) to re-use locally downloaded model files, across different apps. + However, doing so will expose all the cached data, not just model files, to the container. + This can include sensitive information such as browser cache, authentication tokens, etc, hence will pose a great security risk. + It is recommended to create a separate directory to use as a cache directory for CLAMS containers. + +.. _clamsapp-invoking: + +Invoking the app server +----------------------- + +To get app metadata +^^^^^^^^^^^^^^^^^^^ + +Once the app is running as an HTTP server, visit the server address (`localhost:5000 `_, or the remote host name if running on a remote computer) to get the app metadata. +App metadata is also available at the App Directory entry of the app if the app is published on the App Directory. +App metadata contains important information about the app that we will use in the following sections. + +To process input media +^^^^^^^^^^^^^^^^^^^^^^ + +To actually run the app and process input media through computational analysis, simply send a POST request to the app with a MMIF input as the request body. + +MMIF input files can be obtained from outputs of other CLAMS apps, or you can create an *empty* MMIF only with source media locations using ``clams source`` command. See the help message for more detailed instructions. +(Make sure you have installed `clams-python package `_ version from PyPI.) + +.. code-block:: bash + + $ pip install clams-python + $ clams source --help + +For example, by running + +..
code-block:: bash + + $ clams source audio:/data/audio/some-audio-file.mp3 + +You will get + +.. code-block:: json + + { + "metadata": { + "mmif": "http://mmif.clams.ai/X.Y.Z" + }, + "documents": [ + { + "@type": "http://mmif.clams.ai/vocabulary/AudioDocument/v1", + "properties": { + "mime": "audio", + "id": "d1", + "location": "file:///data/audio/some-audio-file.mp3" + } + } + ], + "views": [] + } + +If an app requires just ``Document`` inputs (see ``input`` section of the app metadata), an empty MMIF with required media file locations will suffice. +The location has to be a URL or an absolute path, and it is important to ensure that it exists. +Especially when running the app in a container, and the document location is specified as a file system path, the file must be available inside the container. +In the above, we bind-mounted ``/path/to/data/directory`` (host) to ``/data`` (container). +That is why we used ``/data/audio/some-audio-file.mp3`` as the location when generating this MMIF input. +So in this example, the file ``/path/to/data/directory/audio/some-audio-file.mp3`` must exist on the host side, so that inside the container, it can be accessed as ``/data/audio/some-audio-file.mp3``. + +Some apps only work with input MMIF that already contains some annotations of specific types. To run such apps, you need to run different apps in a sequence. + +(TODO: add CLAMS workflow documentation link here.) + +When an input MMIF is ready, you can send it to the app server. +Here's an example of how to use the ``curl`` command, and store the response in a file ``output.mmif``. + +..
code-block:: bash + + $ clams source audio:/data/audio/some-audio-file.mp3 > input.mmif + $ curl -H "Accept: application/json" -X POST -d@input.mmif -s http://localhost:5000 > output.mmif + + # or using a bash pipeline + $ clams source audio:/data/audio/some-audio-file.mp3 | curl -X POST -d@- -s http://localhost:5000 > output.mmif + +Windows PowerShell users may encounter an ``Invoke-WebRequest`` exception when attempting to send an input file with ``curl``. +This can be resolved for the duration of the current session by using the command ``remove-item alias:curl`` before proceeding to use ``curl``. + +.. _clamsapp-configuring: + +Configuring the app +^^^^^^^^^^^^^^^^^^^ + +CLAMS Apps are stateless, but can be configured per-request via runtime parameters. +Different apps have different configurability. +For configuration parameters of an app, please refer to the ``parameter`` section of the app metadata. +In addition to app-specific parameters, all apps support universal parameters (e.g., ``pretty`` for formatted output). +Check the app metadata for the complete and up-to-date list. + +For detailed documentation of parameter types including map-type and multivalued +parameters, see :ref:`runtime-params-detailed`. + +There are three ways to pass runtime parameters, depending on your execution mode. + +Via query string (HTTP) +""""""""""""""""""""""" + +When running an app as an HTTP server, you can pass simple parameters as +`query strings `_ appended to the request URL. + +For example, appending ``?pretty=True`` to the URL will result in a JSON output with indentation for better readability. + +.. code-block:: bash + + $ curl -X POST -d@input.mmif "http://app-server:5000\?pretty=True" + +For multivalued parameters, repeat the parameter name: + +.. code-block:: bash + + $ curl -X POST -d@input.mmif "http://app-server:5000\?labels=PERSON\&labels=ORG" + +.. 
note:: + + When you're using ``curl`` from a shell session, you need to escape the ``?`` or ``&`` characters with ``\`` to prevent the shell from interpreting it as a special character. + +Via CLI flags +""""""""""""" + +When running an app via ``cli.py`` (see :ref:`clamsapp-cli` below), parameters +are passed as ``--``-prefixed flags. + +.. code-block:: bash + + $ python cli.py --pretty True input.mmif output.mmif + +For multivalued parameters, list the values after the flag, then use a +``--`` separator before the positional arguments: + +.. code-block:: bash + + $ python cli.py --labels PERSON ORG -- input.mmif output.mmif + +.. warning:: + + A multivalued flag greedily consumes every following token, so without + the ``--`` separator the positional ``INPUT_MMIF`` (and ``OUTPUT_MMIF``) + would be swallowed as additional ``--labels`` values, and the command + would fail with a missing-argument error. Always place ``--`` between the + last value of a multivalued flag and the positional file arguments. + +.. _clamsapp-envelope: + +Via JSON envelope +""""""""""""""""" + +For complex or lengthy parameter values (e.g., long prompts for LLM-based apps, +large map parameters), query strings and CLI flags can be impractical. +The JSON envelope format wraps both the MMIF input and the parameters in a +single JSON object. +It works the same way in both execution modes: the input (the POST body in +HTTP mode, or the ``INPUT_MMIF`` argument read from stdin or a positional +file path in CLI mode) can be either pure MMIF or a MMIF wrapped in a +parameter envelope, and the app detects and unwraps the envelope +automatically (see :ref:`clamsapp-cli`). +The envelope looks like this: + +.. code-block:: json + + { + "parameters": { + "prompt": "A very long prompt that would not fit in a query string...", + "labelMap": {"B": "bars", "S": "slate"}, + "temperature": 0.7, + "pretty": true + }, + "mmif": { + "metadata": { "mmif": "..." }, + "documents": [ "..." 
], + "views": [] + } + } + +The output is always raw MMIF, regardless of input format, so downstream +pipeline steps are unaffected. + +You can still combine the envelope with query string parameters. +When the same parameter appears in both, the **query string takes priority**, +which allows quick overrides without editing the parameter file: + +.. code-block:: bash + + $ curl -X POST -d@envelope.json "http://app-server:5000\?temperature=0.3" + +The ``clams envelop`` CLI tool helps construct envelope JSON from a parameter +file and an MMIF file: + +.. note:: + + The subcommand is spelled ``envelop`` (no trailing ``e``) -- it is the + *verb* "to envelop" (to wrap/enclose), consistent with the other + action-named subcommands (``describe``, ``rewind``, ``summarize``). The + *noun* "envelope" still refers to the JSON object it produces. The + missing ``e`` is intentional, not a typo. + +.. code-block:: bash + + # from files + $ clams envelop params.json input.mmif > envelope.json + + # pipe-friendly: read MMIF from stdin + $ cat input.mmif | clams envelop params.json | curl -d@- http://app-server:5000 + + # chain apps: first app uses query string, second uses envelope + $ cat input.mmif \ + | curl -d@- -s "http://app1:5000\?simple_param=value" \ + | clams envelop params.json \ + | curl -d@- -s http://app2:5000 > output.mmif + +For programmatic use in Python: + +.. code-block:: python + + from clams import create_envelope + + body = create_envelope(mmif_obj, parameters={"prompt": "...", "labelMap": {"B": "bars"}}) + requests.post("http://app-server:5000", data=body) + +.. _clamsapp-cli: + +Using CLAMS App as a CLI program +--------------------------------- + +First and foremost, not all CLAMS Apps support command line interface (CLI). +At the minimum, a CLAMS app is required to support HTTP interfaces described in the previous section. +If any of the following instructions do not work for an app, it is likely that the app does not support CLI. 
+ +Python entry points +^^^^^^^^^^^^^^^^^^^ + +Apps written on ``clams-python`` SDK have three python entry points by default: ``app.py``, ``metadata.py``, and ``cli.py``. + +``app.py``: Running app as a local HTTP server +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``app.py`` is the main entry point for running the app as an HTTP server. +To run the app as a local HTTP server without containerization, you can run the following command from the source code directory. + +.. code-block:: bash + + $ python app.py + +* By default, the app will be listening to port 5000, but you can change the port number by passing ``--port `` option. +* By default, the app will be running in *debugging* mode, but you can change it to *production* mode by passing ``--production`` option to support larger traffic volume. +* As you might have noticed, the default ``CMD`` in the prebuilt containers is ``python app.py --production --port 5000``. + +Environment variables for production mode +"""""""""""""""""""""""""""""""""""""""""" + +When running in production mode, the following environment variables can be used to configure the app server: + +.. list-table:: + :header-rows: 1 + :widths: 30 50 20 + + * - Variable + - Description + - Default + * - ``CLAMS_GUNICORN_WORKERS`` + - Number of gunicorn worker processes + - Auto-calculated based on CPU cores and GPU memory + * - ``CLAMS_LOGLEVEL`` + - Logging verbosity level (``debug``, ``info``, ``warning``, ``error``) + - ``warning`` + +By default, the number of workers is calculated as ``(CPU cores x 2) + 1``. For GPU-based apps, see `GPU Memory Management `_ for details on automatic worker scaling and VRAM management. + +``metadata.py``: Getting app metadata +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Running ``metadata.py`` will print out the app metadata in JSON format.
+ +``cli.py``: Running as a CLI program +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``cli.py`` is completely optional for app developers, and unlike the other two above that are guaranteed to be available, ``cli.py`` may not be available for some apps. +When running an app as a HTTP app, the input MMIF must be passed as POST request's body, and the output MMIF will be returned as the response body. +To mimic this behavior in a CLI, ``cli.py`` has two positional arguments; + +.. code-block:: bash + + $ python cli.py # will read INPUT_MMIF file, process it, and write the result to OUTPUT_MMIF file + +```` and ```` are file paths to the input and output MMIF files, respectively. +Following the common unix CLI practice, you can use ``-`` to represent STDIN and/or STDOUT + +.. code-block:: bash + + # will read from STDIN, process it, and write the result to STDOUT + $ python cli.py - - + + # or equivalently + $ python cli.py + + # read from a file, write to STDOUT + $ python cli.py input.mmif - + + # or equivalently + $ python cli.py input.mmif + + # read from STDIN, write to a file + $ cat input.mmif | python cli.py - output.mmif + +As with the HTTP server, you can pass configuration parameters to the CLI program. +All parameter names are the same as the HTTP query parameters, but you need to use ``--`` prefix to indicate that it is a parameter. + +.. code-block:: bash + + $ python cli.py --pretty True input.mmif output.mmif + +.. note:: + + For complex parameters that are difficult to express as CLI flags (e.g., long + prompts, large map parameters), consider using the JSON envelope approach + instead. You can pipe ``clams envelop`` output into ``cli.py``'s stdin:: + + cat input.mmif | clams envelop params.json | python cli.py + + See :ref:`clamsapp-envelope` for details. + +Finally, when running the app as a container, you can override the default ``CMD`` (``app.py``) by passing a ``cli.py`` command to the ``docker run`` command. + +.. 
code-block:: bash + + $ cat input.mmif | docker run -i -v /path/to/data/directory:/data python cli.py + +Note that ``input.mmif`` is in the host machine, and the container is reading it from the STDIN. +You can also pass the input MMIF file as a volume to the container. +However, when you do so, you need to make sure that the file path in the MMIF is correctly set to the file path in the container. + +.. note:: + + Here, make sure to pass the ``-i`` option to the ``docker run`` command + (see `docker run --interactive `_) + to make host's STDIN work properly with the container. diff --git a/documentation/cli.rst b/documentation/cli.rst index 6e189bd..c15f535 100644 --- a/documentation/cli.rst +++ b/documentation/cli.rst @@ -3,13 +3,12 @@ ``clams`` shell command ======================= -``clams-python`` comes with a command line interface (CLI) that allows you to +``clams-python`` comes with a command line interface (CLI) for creating a new CLAMS app from a template (``develop`` subcommand). -#. create a new CLAMS app from a template -#. create a new MMIF file with selected source documents and an empty view +The CLI is installed as the ``clams`` shell command. For backward compatibility, it also exposes all ``mmif`` subcommands (``source``, ``rewind``, ``describe``, ``summarize``). See the `mmif-python CLI documentation `_ for details on those commands. -The CLI is installed as ``clams`` shell command. To see the available commands, run +To see the available commands, run -.. code-block:: bash +.. code-block:: bash clams --help diff --git a/documentation/conf.py b/documentation/conf.py index e48cc17..e056cd2 100644 --- a/documentation/conf.py +++ b/documentation/conf.py @@ -11,11 +11,14 @@ # documentation root, use os.path.abspath to make it absolute, like shown here. 
import datetime -from pathlib import Path -import shutil -import sys import inspect +import json import os +import re +import shutil +import subprocess +import sys +from pathlib import Path import mmif @@ -30,9 +33,10 @@ copyright = f'{datetime.date.today().year}, Brandeis LLC' author = 'Brandeis LLC' try: - version = open(proj_root_dir / 'VERSION').read().strip() -except FileNotFoundError: - print("WARNING: VERSION file not found, using 'dev' as version.") + from importlib.metadata import version as _get_version + version = _get_version('clams-python') +except Exception: + print("WARNING: could not read package version, using 'dev'.") version = 'dev' root_doc = 'index' @@ -134,48 +138,50 @@ def linkcode_resolve(domain, info): def generate_whatsnew_rst(app): - changelog_path = proj_root_dir / 'CHANGELOG.md' - output_path = proj_root_dir / 'documentation' / 'whatsnew.md' - if not changelog_path.exists(): - print(f"WARNING: CHANGELOG.md not found at {changelog_path}") - with open(output_path, 'w') as f: - f.write("") - return + """ + Generate whatsnew.md by fetching the latest release PR body + from GitHub via ``gh pr list``. - import re + Falls back gracefully if ``gh`` is unavailable (local builds). 
+ """ + output_path = (proj_root_dir / 'documentation' + / 'whatsnew.md') + repo = f'clamsproject/{project}' - content = [] - found_version = False - version_header_re = re.compile(r'^## releasing\s+([^\s]+)\s*(\(.*\))?') - - print(f"DEBUG: Looking for version '{version}' in CHANGELOG.md") - - with open(changelog_path, 'r') as f: - lines = f.readlines() - - for line in lines: - match = version_header_re.match(line) - if match: - header_version = match.group(1) - if header_version == version: - found_version = True - # We don't include the header line itself in the content we want to wrap - continue - elif found_version: - break - - if found_version: - content.append(line) + try: + result = subprocess.run( + ['gh', 'pr', 'list', + '-s', 'merged', '-B', 'main', + '-L', '100', + '--json', 'title,body', + '--repo', repo], + capture_output=True, text=True, timeout=15, + ) + if result.returncode != 0: + raise RuntimeError(result.stderr) + + prs = json.loads(result.stdout) + pr = next( + (p for p in prs + if p['title'].startswith('releasing ')), + None, + ) + if pr is None: + raise RuntimeError("No release PR found") + title = pr['title'] + body = pr.get('body', '') - if not found_version: - print(f"NOTE: No changelog entry found for version {version}") with open(output_path, 'w') as f: - f.write("") - else: - # Dump matched markdown content directly to whatsnew.md + f.write(f"## {title}\n\n") + f.write(f"(Full changelog: " + f"[CHANGELOG.md]" + f"({blob_base_url}/main/CHANGELOG.md))\n\n") + if body: + f.write(body) + + except Exception as e: with open(output_path, 'w') as f: - f.write(f"## What's New in {version}\n\n(Full changelog available in the [CHANGELOG.md]({blob_base_url}/main/CHANGELOG.md))\n") - f.writelines(content) + f.write("") def generate_jsonschema(app): @@ -192,8 +198,10 @@ def generate_jsonschema(app): def update_target_spec(app): target_vers_csv = Path(__file__).parent / 'target-versions.csv' - with open(proj_root_dir / "VERSION", 'r') as version_f: - 
version = version_f.read().strip() + version = _get_version('clams-python') + # Skip dev/dummy versions to avoid dirtying the git-tracked CSV + if 'dev' in version or not re.match(r'^\d+\.\d+\.\d+$', version): + return mmifver = mmif.__version__ specver = mmif.__specver__ with open(target_vers_csv) as in_f, open(f'{target_vers_csv}.new', 'w') as out_f: diff --git a/documentation/index.rst b/documentation/index.rst index 109647d..9f6db33 100644 --- a/documentation/index.rst +++ b/documentation/index.rst @@ -1,5 +1,5 @@ -Welcome to CLAMS Python SDK documentation! -========================================== +CLAMS Python SDK +================ .. mdinclude:: ../README.md @@ -11,7 +11,7 @@ Welcome to CLAMS Python SDK documentation! .. toctree:: :maxdepth: 2 - :caption: Contents + :caption: For CLAMS App Developers introduction input-output @@ -28,9 +28,14 @@ Welcome to CLAMS Python SDK documentation! modules -Indices and tables -================== +.. toctree:: + :maxdepth: 2 + :caption: For CLAMS App Users + + clamsapp + +Indices +======= * :ref:`genindex` * :ref:`modindex` -* :ref:`search` diff --git a/documentation/introduction.rst b/documentation/introduction.rst index fe3c9f6..ce907e8 100644 --- a/documentation/introduction.rst +++ b/documentation/introduction.rst @@ -1,6 +1,6 @@ .. _introduction: -Getting started +Getting Started =============== Overview diff --git a/documentation/runtime-params.rst b/documentation/runtime-params.rst index 1c29032..4d3bf93 100644 --- a/documentation/runtime-params.rst +++ b/documentation/runtime-params.rst @@ -41,17 +41,164 @@ As HTTP Server When running as a HTTP server, a CLAMS app should be stateless (or always set to default states), and all the state should be "configured" by the client for each request, via the runtime configuration parameters we described above if necessary. -For HTTP interface, users can enter configuration values via -`query strings `_ as part of the -request URL. 
For example, if the user wants to use the above app as a server -with the `labels` parameter only set to ``PERSON`` and ``ORG``, then the user +For HTTP interface, users can enter configuration values via +`query strings `_ as part of the +request URL, or via a JSON envelope in the POST body (see :ref:`clamsapp-configuring` +for user-facing details on all three parameter-passing methods). +For example, if the user wants to use the above app as a server +with the `labels` parameter only set to ``PERSON`` and ``ORG``, then the user can send a ``POST`` request to the server with the following URL: .. code-block:: bash http://app-server:5000?labels=PERSON&labels=ORG -Note that for this example to work, the parameter must be specified as +Note that for this example to work, the parameter must be specified as ``multivalued=True`` in the app metadata, so that the SDK can collect multiple values for the same parameter name in a single python list and pass to the -``annotate()`` method. Otherwise, only the *first* value will be passed. +``annotate()`` method. Otherwise, only the *first* value will be passed. + +.. _runtime-params-detailed: + +Parameter Types +--------------- + +Each runtime parameter has a ``type`` that determines how user-provided string +values are cast into Python objects before reaching your ``_annotate()`` method. +The supported types are: ``string``, ``integer``, ``number``, ``boolean``, and +``map``. + +Primitive types +^^^^^^^^^^^^^^^ + +``string`` + Values are passed through as-is (no casting). + + .. code-block:: python + + metadata.add_parameter(name='outputFormat', type='string', + default='json', + description='Output format.') + +``integer`` + Values are cast to Python ``int`` via ``int(value)``. + + .. code-block:: python + + metadata.add_parameter(name='minFrameCount', type='integer', + default=5, + description='Minimum number of frames.') + +``number`` + Values are cast to Python ``float`` via ``float(value)``. + + .. 
code-block:: python + + metadata.add_parameter(name='threshold', type='number', + default=0.5, + description='Confidence threshold.') + +``boolean`` + Values are cast to Python ``bool``. The following string values are + recognized as ``False``: ``False``, ``false``, ``F``, ``f``, ``0``. + Everything else is treated as ``True``. Boolean parameters always have + ``multivalued=False`` (enforced by the SDK). + + .. code-block:: python + + metadata.add_parameter(name='pretty', type='boolean', + default=False, + description='Pretty-print JSON output.') + +Multivalued parameters +^^^^^^^^^^^^^^^^^^^^^^ + +When a parameter can accept more than one value, set ``multivalued=True``. +The SDK will always pass a **list** to ``_annotate()``, even when the user +provides only a single value. + +.. code-block:: python + + metadata.add_parameter(name='labels', type='string', + multivalued=True, + default=['PERSON', 'ORG'], + description='Annotation labels to use.') + +To pass multiple values: + +- via query string: repeat the parameter name + + .. code-block:: bash + + http://app-server:5000?labels=PERSON&labels=ORG + +- via CLI: list values after the flag + + .. code-block:: bash + + python cli.py --labels PERSON ORG + +.. note:: + + ``boolean`` parameters always force ``multivalued=False``. + +Map type parameters +^^^^^^^^^^^^^^^^^^^ + +Map parameters allow users to pass key-value pairs that arrive in +``_annotate()`` as a Python ``dict``. Declaring ``type='map'`` automatically +forces ``multivalued=True``. + +.. code-block:: python + + metadata.add_parameter(name='labelMap', type='map', + default=['B:bars', 'S:slate'], + description='Mapping from source to target labels.') + +Each value uses a colon (``:``) as the key-value delimiter:: + + KEY:VALUE + +**Colons are not allowed in keys.** The first colon in the string is always used +as the delimiter. If colons appear in the value portion (after the first colon), +the SDK will emit a warning. 
When the same key is passed more than once, the +last value wins. + +To pass map values: + +- via query string: repeat the parameter name with each ``KEY:VALUE`` pair + + .. code-block:: bash + + http://app-server:5000?labelMap=B:bars&labelMap=S:slate + +- via CLI: list pairs after the flag + + .. code-block:: bash + + python cli.py --labelMap B:bars S:slate + +Inside ``_annotate()``, the parameter arrives as:: + + {'B': 'bars', 'S': 'slate'} + +Default values must be a list of colon-separated strings:: + + default=['key1:value1', 'key2:value2'] + +For more complex value structures (e.g., comma-separated lists within values), +the app developer is responsible for further parsing and should document the +expected format in the parameter's ``description`` field. + +.. _runtime-params-envelope-note: + +Note on JSON envelope input +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Users may also pass parameters via a JSON envelope in the POST body +(see :ref:`clamsapp-configuring` for user-facing documentation). +App developers do **not** need to handle this case specially. +The SDK normalizes envelope parameters to the same ``Dict[str, List[str]]`` +format as query strings before they reach ``_annotate()``, so all type casting, +default filling, and view signing work identically regardless of how parameters +were provided. 
diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..1b3fc4e --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,52 @@ +[build-system] +requires = ["setuptools>=61.0", "setuptools-scm>=8.0"] +build-backend = "setuptools.build_meta" + +[tool.setuptools_scm] + +[project] +name = "clams-python" +dynamic = ["version"] +description = "A collection of APIs to develop CLAMS app for python" +readme = "README.md" +license = "Apache-2.0" +requires-python = ">=3.10" +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Framework :: Flask", + "Framework :: Pytest", + "Intended Audience :: Developers", + "Programming Language :: Python :: 3 :: Only", +] +dependencies = [ + "mmif-python==1.4.0", + "Flask>=2", + "Flask-RESTful>=0.3.9", + "gunicorn>=20", + "lapps>=0.0.2", + "pydantic>=2", + "jsonschema>=3", +] + +[project.scripts] +clams = "clams:cli" + +[project.urls] +homepage = "https://clams.ai" +source = "https://github.com/clamsproject/clams-python" + +[project.optional-dependencies] +dev = ["pytype", "pytest", "pytest-cov", "pillow", "setuptools"] +docs = ["sphinx>=7.0,<8.0", "furo", "m2r2", "sphinx-jsonschema"] +test = ["pytype", "pytest", "pytest-cov", "pillow"] + +[tool.setuptools.packages.find] +where = ["."] +include = ["clams*"] + +[tool.setuptools.package-data] +clams = ["develop/templates/**/*", "develop/templates/**/.*"] + +[tool.pytest.ini_options] +testpaths = ["tests"] +addopts = "--cov=clams --cov-report=xml" diff --git a/requirements.dev b/requirements.dev deleted file mode 100644 index e5d9f9d..0000000 --- a/requirements.dev +++ /dev/null @@ -1,12 +0,0 @@ -pytype -pytest -pytest-cov -twine -sphinx -sphinx-rtd-theme -sphinx-jsonschema -sphinx-autobuild -autodoc -m2r2 -pillow -setuptools diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 6cd63d0..0000000 --- a/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -mmif-python==1.3.1 - -Flask>=2 -Flask-RESTful>=0.3.9 -gunicorn>=20 -lapps>=0.0.2 
-pydantic>=2 -jsonschema>=3 diff --git a/setup.py b/setup.py deleted file mode 100644 index b96a0ea..0000000 --- a/setup.py +++ /dev/null @@ -1,58 +0,0 @@ -#! /usr/bin/env python3 -import os -from os import path -import shutil - -name = "clams-python" -cmdclass = {} - -with open("VERSION", 'r') as version_f: - version = version_f.read().strip() - -with open('README.md') as readme: - long_desc = readme.read() - -with open('requirements.txt') as requirements: - requires = requirements.readlines() - -ver_pack_dir = path.join('clams', 'ver') -shutil.rmtree(ver_pack_dir, ignore_errors=True) -os.makedirs(ver_pack_dir, exist_ok=True) -init_mod = open(path.join(ver_pack_dir, '__init__.py'), 'w') -init_mod.write(f'__version__ = "{version}"') -init_mod.close() - -import setuptools - -setuptools.setup( - name=name, - version=version, - author="Brandeis Lab for Linguistics and Computation", - author_email="admin@clams.ai", - description="A collection of APIs to develop CLAMS app for python", - long_description=long_desc, - long_description_content_type="text/markdown", - url="https://clams.ai", - license="Apache-2.0", - classifiers=[ - 'Development Status :: 5 - Production/Stable', - 'Framework :: Flask', - 'Framework :: Pytest', - 'Intended Audience :: Developers', - 'Programming Language :: Python :: 3 :: Only', - ], - cmdclass=cmdclass, - # this is for *building*, building (build, bdist_*) doesn't get along with MANIFEST.in - # so using this param explicitly is much safer implementation - package_data={ - 'clams': ['develop/templates/**/*', 'develop/templates/**/.*'] - }, - install_requires=requires, - python_requires='>=3.10', - packages=setuptools.find_packages(), - entry_points={ - 'console_scripts': [ - 'clams = clams.__init__:cli', - ], - }, -) diff --git a/tests/test_clamsapp.py b/tests/test_clamsapp.py index b108461..1abcbac 100644 --- a/tests/test_clamsapp.py +++ b/tests/test_clamsapp.py @@ -73,9 +73,8 @@ def _annotate(self, mmif, **kwargs): 
new_view.new_contain(AnnotationTypes.TimeFrame, **{"producer": "dummy-producer"}) ann = new_view.new_annotation(AnnotationTypes.TimeFrame, 'a1', start=10, end=99) ann.add_property("f1", "hello_world") - d1 = DocumentTypes.VideoDocument - d2 = DocumentTypes.from_str(f'{str(d1)[:-1]}99') - if mmif.get_documents_by_type(d2): + # forcing a version mismatch warning for testing "warning view" generation in Restifier + if mmif.get_documents_by_type(DocumentTypes.VideoDocument_v1): new_view.new_annotation(AnnotationTypes.TimePoint, 'tp1') if 'raise_error' in kwargs and kwargs['raise_error']: raise ValueError @@ -596,7 +595,23 @@ def test_cast(self): with warnings.catch_warnings(): warnings.simplefilter("error") caster.cast(params) - + + def test_kv_param_simple(self): + result = ParameterCaster.kv_param('key:value') + self.assertEqual(result, {'key': 'value'}) + # no warning for a single colon + with warnings.catch_warnings(): + warnings.simplefilter("error") + ParameterCaster.kv_param('key:value') + + def test_kv_param_colon_in_value_warns(self): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + result = ParameterCaster.kv_param('a:b:c') + self.assertEqual(result, {'a': 'b:c'}) + self.assertEqual(len(w), 1) + self.assertIn('multiple', str(w[0].message).lower()) + if __name__ == '__main__': unittest.main() diff --git a/tests/test_envelope.py b/tests/test_envelope.py new file mode 100644 index 0000000..b295475 --- /dev/null +++ b/tests/test_envelope.py @@ -0,0 +1,455 @@ +import io +import json +import tempfile +import unittest +from contextlib import redirect_stdout +from pathlib import Path +from typing import Union + +from mmif import AnnotationTypes, Mmif + +import clams +import clams.app +from clams.appmetadata import AppMetadata +from clams.envelop import ( + EnvelopeError, create_envelope, is_envelope, + main as envelope_cli_main, normalize_params, prep_argparser, + unwrap_envelope, unwrap_if_envelope, +) +from tests.test_clamsapp 
import ExampleInputMMIF + + +class EnvelopeTestApp(clams.app.ClamsApp): + """ + Minimal test app that exercises the parameter pipeline without + relying on mmif APIs that have shifted between versions. + """ + app_version = 'envelope-test' + + def _appmetadata(self) -> Union[dict, AppMetadata]: + # the metadata.py fallback in tests/ is used by other tests; + # we don't need extra params here, sign_view records anything + # in _RAW_PARAMS_KEY anyway. + pass + + def _annotate(self, mmif, **kwargs): + if type(mmif) is not Mmif: + mmif = Mmif(mmif, validate=False) + new_view = mmif.new_view() + self.sign_view(new_view, kwargs) + new_view.new_contain(AnnotationTypes.TimeFrame, + producer='envelope-test') + if kwargs.get('raise_error'): + raise ValueError('boom') + return mmif + + +class TestNormalizeParams(unittest.TestCase): + + def test_string_scalar(self): + result = normalize_params({'prompt': 'hello'}) + self.assertEqual(result, {'prompt': ['hello']}) + + def test_int_scalar(self): + result = normalize_params({'count': 5}) + self.assertEqual(result, {'count': ['5']}) + + def test_float_scalar(self): + result = normalize_params({'temperature': 0.7}) + self.assertEqual(result, {'temperature': ['0.7']}) + + def test_bool_true(self): + result = normalize_params({'pretty': True}) + self.assertEqual(result, {'pretty': ['True']}) + + def test_bool_false(self): + result = normalize_params({'pretty': False}) + self.assertEqual(result, {'pretty': ['False']}) + + def test_array_of_strings(self): + result = normalize_params({'labels': ['slate', 'chyron']}) + self.assertEqual(result, {'labels': ['slate', 'chyron']}) + + def test_array_of_numbers(self): + result = normalize_params({'ids': [1, 2, 3]}) + self.assertEqual(result, {'ids': ['1', '2', '3']}) + + def test_object(self): + result = normalize_params( + {'labelMap': {'B': 'bars', 'S': 'slate'}} + ) + self.assertEqual( + result, + {'labelMap': ['B:bars', 'S:slate']}, + ) + + def test_mixed(self): + result = 
normalize_params({ + 'prompt': 'describe this', + 'temperature': 0.7, + 'pretty': True, + 'labels': ['a', 'b'], + 'labelMap': {'X': 'y'}, + }) + self.assertEqual(result['prompt'], ['describe this']) + self.assertEqual(result['temperature'], ['0.7']) + self.assertEqual(result['pretty'], ['True']) + self.assertEqual(result['labels'], ['a', 'b']) + self.assertEqual(result['labelMap'], ['X:y']) + + def test_empty(self): + self.assertEqual(normalize_params({}), {}) + + +class TestEnvelopeDetection(unittest.TestCase): + + def test_is_envelope_true(self): + self.assertTrue(is_envelope({'parameters': {}, 'mmif': {}})) + + def test_is_envelope_false(self): + self.assertFalse(is_envelope({'metadata': {}, 'views': []})) + + def test_is_envelope_non_dict(self): + self.assertFalse(is_envelope('not a dict')) + self.assertFalse(is_envelope(None)) + self.assertFalse(is_envelope([])) + + def test_unwrap_missing_mmif(self): + with self.assertRaises(EnvelopeError) as ctx: + unwrap_envelope({'parameters': {}}) + self.assertIn('mmif', str(ctx.exception).lower()) + + def test_unwrap_non_dict_parameters(self): + with self.assertRaises(EnvelopeError) as ctx: + unwrap_envelope({'parameters': 'bad', 'mmif': {}}) + self.assertIn('object', str(ctx.exception).lower()) + + +class TestEnvelopeCreation(unittest.TestCase): + + def setUp(self): + self.mmif_str = ExampleInputMMIF.get_mmif() + self.mmif_obj = Mmif(self.mmif_str) + + def test_from_string(self): + result = json.loads( + create_envelope(self.mmif_str, {'pretty': True}) + ) + self.assertIn('parameters', result) + self.assertIn('mmif', result) + self.assertEqual(result['parameters']['pretty'], True) + + def test_from_mmif_object(self): + result = json.loads( + create_envelope(self.mmif_obj, {'pretty': True}) + ) + self.assertIn('parameters', result) + self.assertIn('mmif', result) + + def test_from_dict(self): + mmif_dict = json.loads(self.mmif_str) + result = json.loads( + create_envelope(mmif_dict, {'pretty': True}) + ) + 
self.assertEqual(result['mmif'], mmif_dict) + self.assertEqual(result['parameters']['pretty'], True) + + def test_no_params(self): + result = json.loads(create_envelope(self.mmif_str)) + self.assertEqual(result['parameters'], {}) + self.assertIn('mmif', result) + + def test_roundtrip(self): + params = {'prompt': 'describe', 'labels': ['a', 'b']} + envelope_str = create_envelope(self.mmif_str, params) + body = json.loads(envelope_str) + self.assertTrue(is_envelope(body)) + mmif_str, normalized = unwrap_envelope(body) + # MMIF should be valid + Mmif(mmif_str) + self.assertEqual(normalized['prompt'], ['describe']) + self.assertEqual(normalized['labels'], ['a', 'b']) + + +class TestUnwrapIfEnvelope(unittest.TestCase): + """ + Direct tests for the shared helper used by every entry point. + """ + + def setUp(self): + self.mmif_str = ExampleInputMMIF.get_mmif() + + def test_str_envelope(self): + env = create_envelope(self.mmif_str, {'prompt': 'x'}) + data, params = unwrap_if_envelope(env, {}) + Mmif(data) # inner MMIF extracted and valid + self.assertEqual(params, {'prompt': ['x']}) + + def test_bytes_envelope(self): + env = create_envelope( + self.mmif_str, {'prompt': 'x'}).encode('utf-8') + data, params = unwrap_if_envelope(env, {}) + Mmif(data) + self.assertEqual(params, {'prompt': ['x']}) + + def test_dict_envelope(self): + env = json.loads(create_envelope(self.mmif_str, {'prompt': 'x'})) + data, params = unwrap_if_envelope(env, {}) + Mmif(data) + self.assertEqual(params, {'prompt': ['x']}) + + def test_explicit_params_win(self): + env = create_envelope(self.mmif_str, {'prompt': 'env'}) + data, params = unwrap_if_envelope(env, {'prompt': ['cli']}) + self.assertEqual(params['prompt'], ['cli']) + + def test_non_envelope_str_passthrough(self): + data, params = unwrap_if_envelope(self.mmif_str, {'a': ['1']}) + self.assertEqual(data, self.mmif_str) + self.assertEqual(params, {'a': ['1']}) + + def test_valid_json_non_dict_passthrough(self): + data, params = 
unwrap_if_envelope('123', {}) + self.assertEqual(data, '123') + self.assertEqual(params, {}) + + +class TestRestifierEnvelope(unittest.TestCase): + + def setUp(self): + self.client = clams.Restifier(EnvelopeTestApp()).test_client() + self.mmif_str = ExampleInputMMIF.get_mmif() + + def test_post_envelope(self): + envelope_str = create_envelope( + self.mmif_str, {'pretty': True} + ) + res = self.client.post('/', data=envelope_str) + self.assertEqual(res.status_code, 200) + Mmif(res.get_data(as_text=True)) + + def test_put_envelope(self): + envelope_str = create_envelope( + self.mmif_str, {'pretty': True} + ) + res = self.client.put('/', data=envelope_str) + self.assertEqual(res.status_code, 200) + Mmif(res.get_data(as_text=True)) + + PREFIX = "Invalid input data. See below for validation error." + + def test_envelope_missing_mmif(self): + bad = json.dumps({'parameters': {'pretty': True}}) + res = self.client.post('/', data=bad) + self.assertEqual(res.status_code, 500) + self.assertEqual(res.mimetype, 'text/plain') + body = res.get_data(as_text=True) + # EnvelopeError 500 uses the same payload format as the + # MMIF-validation 500 + self.assertTrue(body.startswith(self.PREFIX)) + self.assertIn('mmif', body) + + def test_envelope_invalid_mmif(self): + bad = json.dumps({ + 'parameters': {}, + 'mmif': {'not': 'valid mmif'}, + }) + res = self.client.post('/', data=bad) + self.assertEqual(res.status_code, 500) + self.assertEqual(res.mimetype, 'text/plain') + body = res.get_data(as_text=True) + # trimmed: concise jsonschema message, NOT the full schema dump + self.assertTrue(body.startswith(self.PREFIX)) + self.assertNotIn('$schema', body) + self.assertLess(len(body), 500) + + def test_raw_mmif_still_works(self): + res = self.client.post('/', data=self.mmif_str) + self.assertEqual(res.status_code, 200) + Mmif(res.get_data(as_text=True)) + + def test_invalid_json(self): + res = self.client.post('/', data='this is not json') + self.assertEqual(res.status_code, 500) + 
self.assertEqual(res.mimetype, 'text/plain') + + def test_envelope_app_error_returns_error_view(self): + # app raises during _annotate with envelope input: the error + # must come back as an error-view MMIF (application/json), + # NOT a text/plain input error, and set_error_view must have + # unwrapped the envelope to record the params. + envelope_str = create_envelope( + self.mmif_str, {'prompt': 'p', 'raise_error': True} + ) + res = self.client.post('/', data=envelope_str) + self.assertEqual(res.status_code, 500) + self.assertEqual(res.mimetype, 'application/json') + out = Mmif(res.get_data(as_text=True)) + err_meta = json.loads(list(out.views)[-1].metadata.serialize()) + self.assertIn('error', err_meta) + self.assertEqual( + err_meta.get('parameters', {}).get('prompt'), 'p') + + +class TestEnvelopeReproducibility(unittest.TestCase): + """ + End-to-end tests verifying the issue's key claim: envelope parameters + are recorded in view metadata via ``sign_view``, providing + transparent and reproducible app configuration. + """ + + def setUp(self): + self.client = clams.Restifier(EnvelopeTestApp()).test_client() + self.mmif_str = ExampleInputMMIF.get_mmif() + + def _get_view_params(self, response_text): + out = Mmif(response_text) + # signing view is the last view added by EnvelopeTestApp + signed = list(out.views)[-1] + return json.loads(signed.metadata.serialize()).get( + 'parameters', {}) + + def test_envelope_param_recorded_in_view_metadata(self): + envelope_str = create_envelope( + self.mmif_str, + {'prompt': 'describe this scene'}, + ) + res = self.client.post('/', data=envelope_str) + self.assertEqual(res.status_code, 200) + params = self._get_view_params(res.get_data(as_text=True)) + self.assertEqual(params.get('prompt'), 'describe this scene') + + def test_long_prompt_roundtrip(self): + # The original motivation: prompts of any length should pass + # through transparently. URL length limits don't apply since + # the envelope rides in the POST body. 
+ long_prompt = 'word ' * 500 + envelope_str = create_envelope( + self.mmif_str, {'prompt': long_prompt}, + ) + res = self.client.post('/', data=envelope_str) + self.assertEqual(res.status_code, 200) + params = self._get_view_params(res.get_data(as_text=True)) + self.assertEqual(params.get('prompt'), long_prompt) + + def test_query_string_overrides_in_view_metadata(self): + envelope_str = create_envelope( + self.mmif_str, {'prompt': 'envelope value'}, + ) + res = self.client.post( + '/', data=envelope_str, + query_string={'prompt': 'query value'}, + ) + self.assertEqual(res.status_code, 200) + params = self._get_view_params(res.get_data(as_text=True)) + self.assertEqual(params.get('prompt'), 'query value') + + +class TestEnvelopeCLI(unittest.TestCase): + + def setUp(self): + self.tmpdir = tempfile.TemporaryDirectory() + tmp = Path(self.tmpdir.name) + self.params_path = tmp / 'params.json' + self.mmif_path = tmp / 'input.mmif' + self.params_path.write_text(json.dumps( + {'prompt': 'hello', 'temperature': 0.5} + )) + self.mmif_path.write_text(ExampleInputMMIF.get_mmif()) + + def tearDown(self): + self.tmpdir.cleanup() + + def _run_cli(self, argv): + parser = prep_argparser() + args = parser.parse_args(argv) + buf = io.StringIO() + with redirect_stdout(buf): + envelope_cli_main(args) + return buf.getvalue() + + def test_cli_with_files(self): + output = self._run_cli([ + str(self.params_path), str(self.mmif_path) + ]) + body = json.loads(output) + self.assertTrue(is_envelope(body)) + self.assertEqual(body['parameters']['prompt'], 'hello') + self.assertEqual(body['parameters']['temperature'], 0.5) + # extracted MMIF should still be valid + mmif_str, _ = unwrap_envelope(body) + Mmif(mmif_str) + + def test_cli_envelope_can_be_consumed_by_restifier(self): + # The envelope produced by the CLI should be directly POSTable. 
+ output = self._run_cli([ + str(self.params_path), str(self.mmif_path) + ]) + client = clams.Restifier(EnvelopeTestApp()).test_client() + res = client.post('/', data=output) + self.assertEqual(res.status_code, 200) + Mmif(res.get_data(as_text=True)) + + +class TestEnvelopeNonHTTPEntrypoints(unittest.TestCase): + """ + Envelope handling lives in ClamsApp.annotate(), so it must work + when annotate() is called directly -- i.e. the path used by the + cli.py entry point and any programmatic use -- not just over HTTP. + """ + + def setUp(self): + self.app = EnvelopeTestApp() + self.mmif_str = ExampleInputMMIF.get_mmif() + + def _signed_params(self, out_mmif_str): + out = Mmif(out_mmif_str) + signed = list(out.views)[-1] + return json.loads(signed.metadata.serialize()).get( + 'parameters', {}) + + def test_annotate_accepts_envelope_string(self): + # exactly what cli.py does: clamsapp.annotate(in_data, **params) + envelope_str = create_envelope( + self.mmif_str, {'prompt': 'from cli'} + ) + out = self.app.annotate(envelope_str) + params = self._signed_params(out) + self.assertEqual(params.get('prompt'), 'from cli') + + def test_annotate_raw_mmif_still_works(self): + # raw MMIF (no envelope) passes through; app adds one view + out = Mmif(self.app.annotate(self.mmif_str)) + self.assertEqual(len(list(out.views)), 1) + + def test_explicit_params_override_envelope(self): + envelope_str = create_envelope( + self.mmif_str, {'prompt': 'envelope value'} + ) + # CLI flags / kwargs arrive as lists, like argparse produces + out = self.app.annotate(envelope_str, prompt=['kwarg value']) + params = self._signed_params(out) + self.assertEqual(params.get('prompt'), 'kwarg value') + + def test_malformed_envelope_raises_envelope_error(self): + bad = json.dumps({'parameters': {'p': 1}}) # missing "mmif" + with self.assertRaises(EnvelopeError): + self.app.annotate(bad) + + +class TestEnvelopePythonAPI(unittest.TestCase): + + def test_create_envelope_at_package_root(self): + # `from clams 
import create_envelope` exposes the fn at root. + self.assertTrue(callable(clams.create_envelope)) + result = json.loads( + clams.create_envelope( + ExampleInputMMIF.get_mmif(), {'pretty': True} + ) + ) + self.assertTrue(is_envelope(result)) + self.assertEqual(result['parameters']['pretty'], True) + + +if __name__ == '__main__': + unittest.main()