Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 106 additions & 0 deletions fix/rewrite_sta_ssp_ids.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
"""
Fix ScheduledStopPoint IDs to match SIRI feed format.

Transforms IDs from: IT:ITH1:ScheduledStopPoint:it-22021-7010-51-32073:
to: IT:ITH10:ScheduledStopPoint:7010:51:32073

This is needed so that NeTEx and SIRI feeds reference the same stops.

It is a mystery to me why this cannot be fixed at the source.
"""

import dataclasses
import logging
import re
from collections.abc import Generator
from pathlib import Path
from typing import Any

from domain.netex.model import (
PassengerStopAssignment,
Route,
RoutePoint,
RoutePointRef,
ScheduledStopPoint,
ScheduledStopPointRef,
ServiceJourneyPattern,
ServiceLink,
TimingLink,
)
from domain.netex.services.recursive_attributes import recursive_attributes
from storage.mdbx.core.implementation import MdbxStorage
from utils.aux_logging import log_all, prepare_logger

_PATTERN = re.compile(r'^.*:ScheduledStopPoint:it-22021-(.+):$')


def _new_id(old_id: str) -> str | None:
m = _PATTERN.match(old_id)
if m:
return 'IT:ITH10:ScheduledStopPoint:' + m.group(1).replace('-', ':')
return None


# Object types that may transitively contain ScheduledStopPointRef or RoutePointRef.
# Every stored object of each listed type is scanned by _iter_updated_objects;
# types that are not listed here are never visited by that scan.
_REF_BEARING_TYPES = [
    ServiceJourneyPattern,
    ServiceLink,
    TimingLink,
    PassengerStopAssignment,
    Route,
    RoutePoint,
]


def _iter_updated_objects(
    db: MdbxStorage, txn: Any,
) -> Generator[Any, None, None]:
    """Yield stored objects whose stop-point references were rewritten.

    For each type in ``_REF_BEARING_TYPES`` every object read through
    ``db.iter_only_objects`` is walked with ``recursive_attributes``. Each
    ``ScheduledStopPointRef`` / ``RoutePointRef`` found has its ``ref``
    replaced in place when ``_new_id`` recognises the legacy format.

    An object is yielded once, after its whole walk, and only if at least
    one reference actually changed.

    NOTE(review): only these two reference classes are handled here; review
    feedback on this change mentions further reference kinds (TimingPointRef
    with nameOfRefClass="ScheduledStopPoint", ObjectRef in NoticeAssignment)
    that may also need rewriting -- TODO confirm.
    """
    rewritable = (ScheduledStopPointRef, RoutePointRef)
    for object_type in _REF_BEARING_TYPES:
        for candidate in db.iter_only_objects(txn, object_type):
            touched = False
            for attribute, _ in recursive_attributes(candidate, []):
                if not isinstance(attribute, rewritable):
                    continue
                replacement = _new_id(attribute.ref)
                if replacement is None:
                    # Reference is not in the legacy format; keep it as is.
                    continue
                attribute.ref = replacement
                touched = True
            if touched:
                yield candidate


def _iter_renamed_ssps(
    db: MdbxStorage, txn: Any,
) -> Generator[ScheduledStopPoint, None, None]:
    """Yield a re-identified copy of every legacy ScheduledStopPoint.

    Stop points whose id is not in the legacy format are skipped. The stored
    object is not modified: ``dataclasses.replace`` builds a new instance
    that differs only in ``id``.
    """
    for stop_point in db.iter_only_objects(txn, ScheduledStopPoint):
        renamed = _new_id(stop_point.id)
        if renamed is None:
            continue
        yield dataclasses.replace(stop_point, id=renamed)


def fix_ssp_ids(database: Path) -> None:
    """Rewrite legacy ScheduledStopPoint ids in *database*, in place.

    Opens the storage writable and, inside a single read-write transaction,
    queues for insertion:

    1. every object yielded by ``_iter_updated_objects`` (objects whose
       stop-point references were rewritten), then
    2. every re-identified ScheduledStopPoint yielded by
       ``_iter_renamed_ssps``,

    and finally commits the transaction.

    Known limitation: the ScheduledStopPoint objects stored under their old
    ids are not removed (see the TODO and review note below).

    Args:
        database: Path of the storage file to modify.
    """
    with MdbxStorage(database, readonly=False) as db:
        with db.env.rw_transaction() as txn:
            # TODO: delete the old ScheduledStopPoint objects (no delete API available yet)
            #
            # NOTE(review): feedback recorded on this change.
            # Deleting in place would require, in this order:
            #   1. renaming the id of the object itself;
            #   2. updating all internal references -- at least
            #      ScheduledStopPointRef, TimingPointRef with
            #      nameOfRefClass="ScheduledStopPoint", and ObjectRef
            #      (NoticeAssignment); rewriting the object should cause
            #      the referencing to be updated;
            #   3. deleting the old relationship between the objects;
            #   4. deleting the key that holds the old object.
            # The project has so far avoided such invasive in-place
            # operations. Transformations normally go from database to
            # database (NeTEx to NeTEx): apply the transformations, write
            # the output into a second database, and copy_map everything
            # that remains stable, as in the EPIP conversion:
            # https://github.com/MMTIS/badger/blob/binary_relation_serializer/conv/epip_db_to_db.py#L181
            # The effect is that referential relationships are only ever
            # created, never updated.
            # In-place fixes work well at attribute level (for example
            # projecting all coordinates from a national grid to WGS84),
            # where the existing key is simply overwritten. Changing an id
            # changes the key, so the old entry is not overwritten here.
            db.insert_any_object_on_queue(txn, _iter_updated_objects(db, txn))
            db.insert_any_object_on_queue(txn, _iter_renamed_ssps(db, txn))
            txn.commit()


def main(source_database_file: str) -> None:
    """Entry point: fix the ScheduledStopPoint ids of the given database file.

    Args:
        source_database_file: File-system path of the database, as a string;
            it is converted to a ``Path`` and handed to ``fix_ssp_ids``.
    """
    database_path = Path(source_database_file)
    fix_ssp_ids(database_path)


# Command-line entry: parse the arguments, set up logging, run the fix.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Fix ScheduledStopPoint IDs to SIRI format")
    parser.add_argument("source", type=str, help="mdbx file to fix in-place")
    parser.add_argument("--log_file", type=str, required=False, help="log file path")
    args = parser.parse_args()
    prepare_logger(logging.INFO, args.log_file)
    try:
        main(args.source)
    except Exception as e:
        # Top-level boundary: record the failure, then let it propagate so
        # the process still terminates with the original exception.
        log_all(logging.ERROR, f"{e}")
        raise
10 changes: 10 additions & 0 deletions test/rewrite_sta_ssp_ids.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import unittest

from fix.rewrite_sta_ssp_ids import main

class FixSSPTestCase(unittest.TestCase):
    """Smoke test: run the id fix against a local database file."""

    def test(self):
        # No assertions: the test passes as long as main() does not raise.
        # NOTE(review): depends on a "sta.lmdb" file being present in the
        # working directory -- confirm how this fixture is provided.
        database_file = "sta.lmdb"
        main(database_file)


if __name__ == '__main__':
    unittest.main()