diff --git a/.github/workflows/build-manifests.yml b/.github/workflows/build-manifests.yml index 8e4fdf3..d3cb4d3 100644 --- a/.github/workflows/build-manifests.yml +++ b/.github/workflows/build-manifests.yml @@ -1,4 +1,4 @@ -name: Fetch releases from S3 +name: Build and publish releases artifacts on: push: @@ -33,10 +33,10 @@ jobs: with: persist-credentials: false - - name: Set up Python 3.11 + - name: Set up Python 3.12 uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: - python-version: "3.11" + python-version: "3.12" - name: Install dependencies run: | @@ -44,19 +44,17 @@ jobs: cd utils pip install -r requirements.txt - - name: Build releases.json and latest.dbb + - name: Build releases.json and latest.ddb run: | cd utils - python3 fetch-releases-from-s3.py - python3 simple-registry-manifest.py + python3 fetch_releases_from_stac.py - name: Copy output to publish directory run: | mkdir publish + cp 404.html publish/ cp utils/releases.json publish/ - cp utils/registry-manifest.json publish/ cp utils/latest.ddb publish/ - cp utils/latest.ddb publish/latest.dbb cp overture_releases.yaml publish/ - name: Setup Pages diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..37795b6 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,39 @@ +name: Test utils + +on: + push: + branches: main + pull_request: + +permissions: + contents: read + +concurrency: + group: "test-${{ github.ref }}" + cancel-in-progress: true + +jobs: + test: + name: Run unit tests + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + + - name: Set up Python 3.12 + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: "3.12" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + cd utils + pip install -r requirements-test.txt + + 
- name: Run tests + run: | + cd utils + python -m pytest tests/ -v diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..41c33c8 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ + +__pycache__/ +*.pyc diff --git a/404.html b/404.html new file mode 100644 index 0000000..bb19ca2 --- /dev/null +++ b/404.html @@ -0,0 +1,99 @@ + + + + + + Moved | Overture Maps Data + + + + + + + +

This resource has moved

+
+ +

+ The file you requested is no longer published here. Release discovery is + now handled by the + Overture STAC catalog + (SpatioTemporal Asset Catalog). +

+

+ See the data repo README + for details. Redirecting in 5 seconds… +

+ + + + + diff --git a/README.md b/README.md index 68b70d8..483c20b 100644 --- a/README.md +++ b/README.md @@ -49,3 +49,9 @@ for each theme with the following enhancements: Data Release Feedback --- We are very interested in feedback on the Overture data. Please use the [Discussion](https://github.com/OvertureMaps/data/discussions) section of this repo to comment. Tagging it with the release and relevant theme name (Places, Transportation) will help direct your ideas. Please include as much detail as possible. The associated Task Force will carefully review each submission and offer feedback where required. + +Release Discovery +--- +Use the [Overture STAC catalog](https://stac.overturemaps.org/catalog.json) (SpatioTemporal Asset Catalog) for authoritative release discovery. + +> **Deprecated:** `overture_releases.yaml`, `releases.json`, and `registry-manifest.json` (previously at `labs.overturemaps.org/data/`) are no longer maintained and will be removed in a future release. Use the STAC catalog instead. diff --git a/overture_releases.yaml b/overture_releases.yaml index 05da21c..fcca45c 100644 --- a/overture_releases.yaml +++ b/overture_releases.yaml @@ -1,3 +1,6 @@ +# DEPRECATED: This file is no longer maintained and will be removed in a future release. 
+# Use the Overture STAC catalog for authoritative release discovery: +# https://stac.overturemaps.org/catalog.json - schema: "1.16.0" release: "2026-04-15.0" - schema: "1.16.0" diff --git a/utils/fetch-releases-from-s3.py b/utils/fetch-releases-from-s3.py deleted file mode 100644 index 56d3ee2..0000000 --- a/utils/fetch-releases-from-s3.py +++ /dev/null @@ -1,89 +0,0 @@ -import duckdb, json -from obstore.store import S3Store - -store = S3Store("overturemaps-us-west-2", region="us-west-2", skip_signature=True) - -releases = store.list_with_delimiter("release/") - -output = {} - -for idx, release in enumerate(sorted(releases.get("common_prefixes"), reverse=True)): - path = release.split("/")[1] - if idx == 0: - output["latest"] = path - output["releases"] = [] - output["releases"].append(path) - - print(f" - {path}") - -with open("releases.json", "w") as output_file: - output_file.write(json.dumps(output, indent=4)) - -conn = duckdb.connect("latest.ddb") - -conn.sql( - f""" -INSTALL spatial; -LOAD spatial; - -CREATE OR REPLACE VIEW address AS ( - SELECT * FROM read_parquet('s3://overturemaps-us-west-2/release/{output.get("latest")}/theme=addresses/type=address/*.parquet') -); - -CREATE OR REPLACE VIEW bathymetry AS ( - SELECT * FROM read_parquet('s3://overturemaps-us-west-2/release/{output.get("latest")}/theme=base/type=bathymetry/*.parquet') -); - -CREATE OR REPLACE VIEW building AS ( - SELECT * FROM read_parquet('s3://overturemaps-us-west-2/release/{output.get("latest")}/theme=buildings/type=building/*.parquet') -); - -CREATE OR REPLACE VIEW building_part AS ( - SELECT * FROM read_parquet('s3://overturemaps-us-west-2/release/{output.get("latest")}/theme=buildings/type=building_part/*.parquet') -); - -CREATE OR REPLACE VIEW connector AS ( - SELECT * FROM read_parquet('s3://overturemaps-us-west-2/release/{output.get("latest")}/theme=transportation/type=connector/*.parquet') -); - -CREATE OR REPLACE VIEW division AS ( - SELECT * FROM 
read_parquet('s3://overturemaps-us-west-2/release/{output.get("latest")}/theme=divisions/type=division/*.parquet') -); - -CREATE OR REPLACE VIEW division_area AS ( - SELECT * FROM read_parquet('s3://overturemaps-us-west-2/release/{output.get("latest")}/theme=divisions/type=division_area/*.parquet') -); - -CREATE OR REPLACE VIEW division_boundary AS ( - SELECT * FROM read_parquet('s3://overturemaps-us-west-2/release/{output.get("latest")}/theme=divisions/type=division_boundary/*.parquet') -); - -CREATE OR REPLACE VIEW infrastructure AS ( - SELECT * FROM read_parquet('s3://overturemaps-us-west-2/release/{output.get("latest")}/theme=base/type=infrastructure/*.parquet') -); - -CREATE OR REPLACE VIEW land AS ( - SELECT * FROM read_parquet('s3://overturemaps-us-west-2/release/{output.get("latest")}/theme=base/type=land/*.parquet') -); - -CREATE OR REPLACE VIEW land_cover AS ( - SELECT * FROM read_parquet('s3://overturemaps-us-west-2/release/{output.get("latest")}/theme=base/type=land_cover/*.parquet') -); - -CREATE OR REPLACE VIEW land_use AS ( - SELECT * FROM read_parquet('s3://overturemaps-us-west-2/release/{output.get("latest")}/theme=base/type=land_use/*.parquet') -); - -CREATE OR REPLACE VIEW place AS ( - SELECT * FROM read_parquet('s3://overturemaps-us-west-2/release/{output.get("latest")}/theme=places/type=place/*.parquet') -); - -CREATE OR REPLACE VIEW segment AS ( - SELECT * FROM read_parquet('s3://overturemaps-us-west-2/release/{output.get("latest")}/theme=transportation/type=segment/*.parquet') -); - -CREATE OR REPLACE VIEW water AS ( - SELECT * FROM read_parquet('s3://overturemaps-us-west-2/release/{output.get("latest")}/theme=base/type=water/*.parquet') -); -""" -) diff --git a/utils/fetch_releases_from_stac.py b/utils/fetch_releases_from_stac.py new file mode 100644 index 0000000..a7cf5a0 --- /dev/null +++ b/utils/fetch_releases_from_stac.py @@ -0,0 +1,87 @@ +import json +import urllib.request +from urllib.parse import urlparse + +import duckdb + 
STAC_CATALOG = "https://stac.overturemaps.org/catalog.json"
S3_BASE = "s3://overturemaps-us-west-2/release"
_USER_AGENT = "overturemaps-data/1.0"

# One entry per DuckDB view: (view name, theme partition, type partition).
VIEWS = [
    ("address", "addresses", "address"),
    ("bathymetry", "base", "bathymetry"),
    ("building", "buildings", "building"),
    ("building_part", "buildings", "building_part"),
    ("connector", "transportation", "connector"),
    ("division", "divisions", "division"),
    ("division_area", "divisions", "division_area"),
    ("division_boundary", "divisions", "division_boundary"),
    ("infrastructure", "base", "infrastructure"),
    ("land", "base", "land"),
    ("land_cover", "base", "land_cover"),
    ("land_use", "base", "land_use"),
    ("place", "places", "place"),
    ("segment", "transportation", "segment"),
    ("water", "base", "water"),
]


def fetch_catalog(url: str, timeout: int = 30) -> dict:
    """Request *url* and return the response body parsed as JSON.

    Args:
        url: Location of the catalog document.
        timeout: Value passed to ``urlopen`` as its ``timeout`` argument.

    Returns:
        The decoded JSON document.

    Network and JSON decoding errors are not caught here; they propagate
    to the caller.
    """
    req = urllib.request.Request(url, headers={"User-Agent": _USER_AGENT})
    with urllib.request.urlopen(req, timeout=timeout) as response:
        return json.loads(response.read())


def _release_id_from_href(href: str) -> str:
    """Return the first meaningful path component of *href*.

    Empty components and ``.`` are ignored, so ``./X/catalog.json``,
    ``X/catalog.json`` and ``https://host/X/catalog.json`` all yield ``X``.

    Raises:
        ValueError: If *href* has no usable path component (for example
            ``""`` or ``"./"``).
    """
    parts = [p for p in urlparse(href).path.split("/") if p and p != "."]
    if not parts:
        # A bare IndexError here would hide which link was malformed.
        raise ValueError(f"cannot derive a release id from href {href!r}")
    return parts[0]


def parse_releases(catalog: dict) -> dict:
    """Extract the release listing from a parsed catalog document.

    Args:
        catalog: Mapping with a ``"latest"`` entry and a ``"links"`` list.
            Only links whose ``rel`` is ``"child"`` are treated as releases.

    Returns:
        ``{"latest": <catalog["latest"]>, "releases": [...]}`` with the
        release ids sorted in descending order.

    Raises:
        KeyError: If ``"latest"`` or ``"links"`` is missing, or a child
            link has no ``"href"``.
        ValueError: If a child link's href yields no release id.
    """
    releases = sorted(
        (
            _release_id_from_href(link["href"])
            for link in catalog["links"]
            # .get(): a link without "rel" is simply not a child link.
            if link.get("rel") == "child"
        ),
        reverse=True,
    )
    return {"latest": catalog["latest"], "releases": releases}


def build_views_sql(latest: str, s3_base: str = S3_BASE) -> str:
    """Build the SQL script that defines one view per entry in ``VIEWS``.

    Args:
        latest: Release id placed in every parquet path.
        s3_base: Prefix placed before the release id.

    Returns:
        ``INSTALL spatial;`` and ``LOAD spatial;`` followed by one
        ``CREATE OR REPLACE VIEW`` statement per view, separated by blank
        lines.
    """
    stmts = ["INSTALL spatial;", "LOAD spatial;"]
    for view_name, theme, type_ in VIEWS:
        path = f"{s3_base}/{latest}/theme={theme}/type={type_}/*.parquet"
        # `latest` comes from a remote document; double any single quote so
        # the value cannot terminate the SQL string literal early.
        literal = path.replace("'", "''")
        stmts.append(
            f"CREATE OR REPLACE VIEW {view_name} AS (\n"
            f"    SELECT * FROM read_parquet('{literal}')\n);"
        )
    return "\n\n".join(stmts)


def create_duckdb_views(db_path: str, latest: str, s3_base: str = S3_BASE) -> None:
    """Open the DuckDB database at *db_path* and run the view script.

    The connection is closed even when running the script raises.

    Args:
        db_path: Path handed to ``duckdb.connect``.
        latest: Release id forwarded to ``build_views_sql``.
        s3_base: Path prefix forwarded to ``build_views_sql``.
    """
    conn = duckdb.connect(db_path)
    try:
        conn.sql(build_views_sql(latest, s3_base))
    finally:
        conn.close()


def main():
    """Fetch the catalog, write ``releases.json`` and build ``latest.ddb``.

    Both output files are written relative to the current working
    directory.
    """
    catalog = fetch_catalog(STAC_CATALOG)
    output = parse_releases(catalog)

    for release in output["releases"]:
        print(f" - {release}")

    # Explicit encoding keeps the published file independent of the
    # runner's locale.
    with open("releases.json", "w", encoding="utf-8") as f:
        json.dump(output, f, indent=4)

    create_duckdb_views("latest.ddb", output["latest"])


if __name__ == "__main__":
    main()


# ---------------------------------------------------------------------------
# Remaining patch hunks that share these source lines, preserved verbatim.
# ---------------------------------------------------------------------------
# diff --git a/utils/requirements-test.txt b/utils/requirements-test.txt
# new file mode 100644
# index 0000000..13f6026
# --- /dev/null
# +++ b/utils/requirements-test.txt
# @@ -0,0 +1,2 @@
# +-r requirements.txt
# +pytest>=8.0.0
# diff --git a/utils/requirements.txt b/utils/requirements.txt
# index 2718dd1..348ac20 100644
# --- a/utils/requirements.txt
# +++ b/utils/requirements.txt
# @@ -1,3 +1 @@
# -obstore>=0.7.0
#  duckdb==1.3.2
# -pyarrow>=20.0.0
# diff --git a/utils/simple-registry-manifest.py b/utils/simple-registry-manifest.py
# deleted file mode 100644
# index 381134f..0000000
# --- a/utils/simple-registry-manifest.py
# +++ /dev/null
# @@ -1,142 +0,0 @@
# -import json
# -import logging
# -
# -import pyarrow.dataset as ds
# -import pyarrow.fs as fs
# -
# -# Set up logging
# -logging.basicConfig()
# -logger = logging.getLogger("registry-manifest")
# -logger.setLevel(logging.INFO)
# -
# -
# -def create_registry_manifest():
# -    """
# -    Simple script to read all parquet files in s3://overturemaps-us-west-2/registry/
# -    and create a JSON manifest with min/max IDs and file paths.
- """ - - # Initialize S3 filesystem (following the pattern from registry-manifest.py) - filesystem = fs.S3FileSystem(anonymous=True, region="us-west-2") - - # Registry path (without s3:// prefix for filesystem operations) - registry_path = "overturemaps-us-west-2/registry" - - logger.info(f"Scanning registry path: {registry_path}") - - # Get all files in the registry directory using proper FileSelector - registry_selector = fs.FileSelector(registry_path, recursive=True) - all_files = filesystem.get_file_info(registry_selector) - - # Filter for parquet files only - parquet_files = [ - f - for f in all_files - if f.path.endswith(".parquet") and f.type == fs.FileType.File - ] - - logger.info(f"Found {len(parquet_files)} parquet files in registry") - - # Simple arrays for bounds and files - bounds = [] - files = [] - - # Process each parquet file - for file_info in parquet_files: - logger.info(f"Processing: {file_info.path}") - - try: - # Create dataset for this single file - file_dataset = ds.dataset( - file_info.path, filesystem=filesystem, format="parquet" - ) - - # Get fragments - fragments = list(file_dataset.get_fragments()) - - if not fragments: - logger.warning(f"No fragments found for {file_info.path}") - continue - - # Use first fragment to get schema - fragment = fragments[0] - schema = fragment.metadata.schema.to_arrow_schema() - - # Check if 'id' column exists - has_id_column = "id" in [field.name for field in schema] - - if has_id_column: - # Since ID field is always sorted, we only need first and last row group - min_id = None - max_id = None - - # Find the ID column index - id_column_index = next( - i for i, field in enumerate(schema) if field.name == "id" - ) - - # Get min from first row group and max from last row group - first_fragment = fragments[0] - last_fragment = fragments[-1] - - # Get min from first row group of first fragment - first_metadata = first_fragment.metadata - if first_metadata.num_row_groups > 0: - first_row_group = 
first_metadata.row_group(0) - first_id_column = first_row_group.column(id_column_index) - - if ( - first_id_column.statistics - and first_id_column.statistics.has_min_max - ): - min_id = first_id_column.statistics.min - if isinstance(min_id, bytes): - min_id = min_id.decode("utf-8") - - # Get max from last row group of last fragment - last_metadata = last_fragment.metadata - if last_metadata.num_row_groups > 0: - last_row_group = last_metadata.row_group( - last_metadata.num_row_groups - 1 - ) - last_id_column = last_row_group.column(id_column_index) - - if ( - last_id_column.statistics - and last_id_column.statistics.has_min_max - ): - max_id = last_id_column.statistics.max - if isinstance(max_id, bytes): - max_id = max_id.decode("utf-8") - - filename = file_info.path.replace( - "overturemaps-us-west-2/registry/", "" - ) - bounds.append([min_id, max_id]) - files.append(filename) - - logger.info( - f"Successfully processed {file_info.path}: {min_id} - {max_id}" - ) - else: - logger.warning(f"No 'id' column found in {file_info.path}") - - except Exception as e: - logger.error(f"Error processing {file_info.path}: {e}") - - # Create simple manifest - manifest = {"bounds": bounds, "files": files} - - # Write manifest to JSON file (compact format for smallest size) - output_file = "registry-manifest.json" - with open(output_file, "w") as f: - json.dump(manifest, f, separators=(",", ":")) - - print(f"Registry manifest written to {output_file}") - print(f"Total files processed: {len(manifest['files'])}") - - return manifest - - -if __name__ == "__main__": - manifest = create_registry_manifest() diff --git a/utils/tests/__init__.py b/utils/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/utils/tests/test_fetch_releases_from_stac.py b/utils/tests/test_fetch_releases_from_stac.py new file mode 100644 index 0000000..e482282 --- /dev/null +++ b/utils/tests/test_fetch_releases_from_stac.py @@ -0,0 +1,199 @@ +import json +import sys +import os +from 
unittest.mock import MagicMock, patch + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +from fetch_releases_from_stac import ( + VIEWS, + build_views_sql, + create_duckdb_views, + fetch_catalog, + parse_releases, +) + +SAMPLE_CATALOG = { + "type": "Catalog", + "id": "Overture Releases", + "stac_version": "1.1.0", + "description": "All Overture Releases", + "links": [ + {"rel": "root", "href": "./catalog.json", "type": "application/json"}, + { + "rel": "child", + "href": "./2026-05-20.0/catalog.json", + "type": "application/json", + "latest": True, + }, + { + "rel": "child", + "href": "./2026-04-15.0/catalog.json", + "type": "application/json", + }, + ], + "latest": "2026-05-20.0", +} + + +class TestFetchCatalog: + def test_returns_parsed_json(self): + mock_response = MagicMock() + mock_response.__enter__ = lambda s: s + mock_response.__exit__ = MagicMock(return_value=False) + mock_response.read.return_value = json.dumps(SAMPLE_CATALOG).encode() + + with patch("urllib.request.urlopen", return_value=mock_response): + result = fetch_catalog("https://stac.overturemaps.org/catalog.json") + + assert result == SAMPLE_CATALOG + + def test_uses_provided_url(self): + mock_response = MagicMock() + mock_response.__enter__ = lambda s: s + mock_response.__exit__ = MagicMock(return_value=False) + mock_response.read.return_value = json.dumps(SAMPLE_CATALOG).encode() + + with patch("urllib.request.urlopen", return_value=mock_response) as mock_open: + fetch_catalog("https://custom.example.com/catalog.json") + req = mock_open.call_args[0][0] + assert req.full_url == "https://custom.example.com/catalog.json" + + def test_applies_timeout(self): + mock_response = MagicMock() + mock_response.__enter__ = lambda s: s + mock_response.__exit__ = MagicMock(return_value=False) + mock_response.read.return_value = json.dumps(SAMPLE_CATALOG).encode() + + with patch("urllib.request.urlopen", return_value=mock_response) as mock_open: + 
fetch_catalog("https://stac.overturemaps.org/catalog.json", timeout=10) + assert mock_open.call_args[1]["timeout"] == 10 + + +class TestParseReleases: + def test_extracts_latest(self): + result = parse_releases(SAMPLE_CATALOG) + assert result["latest"] == "2026-05-20.0" + + def test_extracts_child_releases(self): + result = parse_releases(SAMPLE_CATALOG) + assert "2026-05-20.0" in result["releases"] + assert "2026-04-15.0" in result["releases"] + + def test_excludes_root_link(self): + result = parse_releases(SAMPLE_CATALOG) + # root link href is "./catalog.json" — split("/")[1] would be "catalog.json" + # but more importantly rel="root" should be excluded + assert "catalog.json" not in result["releases"] + assert len(result["releases"]) == 2 + + def test_releases_sorted_descending(self): + result = parse_releases(SAMPLE_CATALOG) + assert result["releases"] == sorted(result["releases"], reverse=True) + + def test_returns_dict_with_expected_keys(self): + result = parse_releases(SAMPLE_CATALOG) + assert set(result.keys()) == {"latest", "releases"} + + def test_empty_links(self): + catalog = {**SAMPLE_CATALOG, "links": [], "latest": "2026-05-20.0"} + result = parse_releases(catalog) + assert result["latest"] == "2026-05-20.0" + assert result["releases"] == [] + + def test_single_release(self): + catalog = { + **SAMPLE_CATALOG, + "links": [ + {"rel": "child", "href": "./2026-05-20.0/catalog.json"}, + ], + "latest": "2026-05-20.0", + } + result = parse_releases(catalog) + assert result["releases"] == ["2026-05-20.0"] + + def test_absolute_href_parsed_correctly(self): + catalog = { + **SAMPLE_CATALOG, + "links": [ + { + "rel": "child", + "href": "https://stac.overturemaps.org/2026-05-20.0/catalog.json", + }, + ], + "latest": "2026-05-20.0", + } + result = parse_releases(catalog) + assert result["releases"] == ["2026-05-20.0"] + + def test_relative_href_without_dotslash_parsed_correctly(self): + catalog = { + **SAMPLE_CATALOG, + "links": [ + {"rel": "child", "href": 
"2026-05-20.0/catalog.json"}, + ], + "latest": "2026-05-20.0", + } + result = parse_releases(catalog) + assert result["releases"] == ["2026-05-20.0"] + + + def test_contains_install_spatial(self): + sql = build_views_sql("2026-05-20.0") + assert "INSTALL spatial" in sql + + def test_contains_load_spatial(self): + sql = build_views_sql("2026-05-20.0") + assert "LOAD spatial" in sql + + def test_all_views_present(self): + sql = build_views_sql("2026-05-20.0") + for view_name, _, _ in VIEWS: + assert f"CREATE OR REPLACE VIEW {view_name}" in sql + + def test_latest_release_in_paths(self): + release = "2026-05-20.0" + sql = build_views_sql(release) + assert release in sql + + def test_custom_s3_base(self): + sql = build_views_sql("2026-05-20.0", s3_base="s3://my-bucket/release") + assert "s3://my-bucket/release" in sql + + def test_correct_theme_type_mapping(self): + sql = build_views_sql("2026-05-20.0") + assert "theme=addresses/type=address" in sql + assert "theme=buildings/type=building_part" in sql + assert "theme=transportation/type=segment" in sql + assert "theme=divisions/type=division_boundary" in sql + + def test_view_count_matches_views_constant(self): + sql = build_views_sql("2026-05-20.0") + count = sql.count("CREATE OR REPLACE VIEW") + assert count == len(VIEWS) + + +class TestCreateDuckdbViews: + def test_creates_all_views(self): + mock_conn = MagicMock() + with patch("duckdb.connect", return_value=mock_conn): + create_duckdb_views(":memory:", "2026-05-20.0") + mock_conn.sql.assert_called_once() + sql_arg = mock_conn.sql.call_args[0][0] + for view_name, _, _ in VIEWS: + assert f"CREATE OR REPLACE VIEW {view_name}" in sql_arg + + def test_views_reference_correct_release(self): + release = "2026-05-20.0" + mock_conn = MagicMock() + with patch("duckdb.connect", return_value=mock_conn): + create_duckdb_views(":memory:", release) + sql_arg = mock_conn.sql.call_args[0][0] + assert release in sql_arg + + def test_closes_connection(self): + mock_conn = MagicMock() 
+ with patch("duckdb.connect", return_value=mock_conn): + create_duckdb_views(":memory:", "2026-05-20.0") + mock_conn.close.assert_called_once() +