From b0ba321613531a594ed70aecde67decf8b8f4e9f Mon Sep 17 00:00:00 2001 From: Adam Nowak Date: Tue, 24 Mar 2026 20:57:10 +0000 Subject: [PATCH 1/3] fix(archive): unique shard zips, drop tgz, output under data/archives - Name per-shard zips from path relative to data_path to avoid pdbs.h5 collisions - Remove redundant tar.gz wrapper around zip shards - Write shard and final merged zips to {data_path}/archives/{dataset}/ - Open H5 paths from index keys directly; pass data_path for naming - Add unit tests for _shard_zip_name Made-with: Cursor --- tests/test_archive.py | 26 +++++++++++++++++++++ toolbox/scripts/archive.py | 47 ++++++++++++++++++++++++++------------ 2 files changed, 59 insertions(+), 14 deletions(-) create mode 100644 tests/test_archive.py diff --git a/tests/test_archive.py b/tests/test_archive.py new file mode 100644 index 0000000..735da6e --- /dev/null +++ b/tests/test_archive.py @@ -0,0 +1,26 @@ +from toolbox.scripts.archive import _shard_zip_name + + +def test_shard_zip_name_distinct_batches_same_basename(): + data = "/data" + a = f"{data}/structures/PDB/subset_/fourth_7/0/pdbs.h5" + b = f"{data}/structures/PDB/subset_/fourth_7/1/pdbs.h5" + assert _shard_zip_name(a, data) != _shard_zip_name(b, data) + assert _shard_zip_name(a, data).endswith(".zip") + assert "0" in _shard_zip_name(a, data) or "pdbs" in _shard_zip_name(a, data) + + +def test_shard_zip_name_distinct_trees_same_batch_id(): + data = "/data" + a = f"{data}/structures/PDB/subset_/fourth_7/0/pdbs.h5" + b = f"{data}/structures/PDB/subset_/other_slug/0/pdbs.h5" + assert _shard_zip_name(a, data) != _shard_zip_name(b, data) + + +def test_shard_zip_name_long_path_uses_hash(): + data = "/data" + long_mid = "x" * 300 + h5 = f"{data}/structures/{long_mid}/0/pdbs.h5" + name = _shard_zip_name(h5, data) + assert name.endswith(".zip") + assert len(name) < 80 diff --git a/toolbox/scripts/archive.py b/toolbox/scripts/archive.py index 0c4ca30..004db29 100644 --- a/toolbox/scripts/archive.py +++ 
b/toolbox/scripts/archive.py @@ -1,6 +1,5 @@ -from toolbox.models.manage_dataset.index.handle_index import read_index import datetime -import os +import hashlib import zipfile from pathlib import Path @@ -8,14 +7,29 @@ from tqdm import tqdm from tqdm.contrib.logging import logging_redirect_tqdm +from toolbox.models.manage_dataset.index.handle_index import read_index from toolbox.models.manage_dataset.utils import read_all_pdbs_from_h5 from toolbox.utlis.logging import logger -def process_h5_file(h5_file, dataset_path, output_dir): - full_h5_file_path = Path(dataset_path) / h5_file - prots = read_all_pdbs_from_h5(full_h5_file_path) - archive_name = os.path.basename(h5_file).replace(".h5", ".zip") +def _shard_zip_name(h5_file: str, data_path_str: str) -> str: + """Stable unique .zip basename per H5 shard (avoids .../N/pdbs.h5 → pdbs.zip collisions).""" + h5_posix = Path(h5_file).as_posix() + root = Path(data_path_str).as_posix().rstrip("/") + if root and h5_posix.startswith(root + "/"): + rel = h5_posix.removeprefix(root + "/") + else: + rel = h5_posix + stem = rel.removesuffix(".h5").replace("/", "__") + if len(stem) > 200: + stem = hashlib.sha256(rel.encode()).hexdigest()[:24] + return f"{stem}.zip" + + +def process_h5_file(h5_file, data_path_str, output_dir): + h5_path = Path(h5_file) + prots = read_all_pdbs_from_h5(str(h5_path)) + archive_name = _shard_zip_name(h5_file, data_path_str) archive_path = Path(output_dir) / archive_name with zipfile.ZipFile(archive_path, "w") as zipf: @@ -23,30 +37,35 @@ def process_h5_file(h5_file, dataset_path, output_dir): code = p.removesuffix(".pdb") zipf.writestr(f"{code}.pdb", pdb_file_content) - os.system(f"tar -czf {str(archive_path)}.tgz {str(archive_path)}") - return str(archive_path) def create_archive(structures_dataset: "StructuresDataset"): + data_path = structures_dataset.config.data_path dataset_path = structures_dataset.dataset_path() - proteins_index = read_index(Path(dataset_path) / "dataset_reversed.idx", 
structures_dataset.config.data_path) - output_dir = Path(dataset_path) / "archives" - output_dir.mkdir(exist_ok=True) + proteins_index = read_index( + Path(dataset_path) / "dataset_reversed.idx", data_path + ) + output_dir = Path(data_path) / "archives" / structures_dataset.dataset_dir_name() + output_dir.mkdir(parents=True, exist_ok=True) client = structures_dataset._client futures = [] for h5_file in proteins_index.keys(): - future = client.submit(process_h5_file, h5_file, dataset_path, output_dir) + future = client.submit( + process_h5_file, h5_file, data_path, output_dir + ) futures.append(future) n = len(futures) logger.info("Building combined PDB archive from %s H5 shard(s)", n) current_time = datetime.datetime.now().strftime("%Y%m%d%H%M%S") - final_archive_name = f"archive_pdb_{structures_dataset.dataset_dir_name()}_{current_time}.zip" - final_archive_path = Path.cwd() / final_archive_name + final_archive_name = ( + f"archive_pdb_{structures_dataset.dataset_dir_name()}_{current_time}.zip" + ) + final_archive_path = output_dir / final_archive_name with zipfile.ZipFile(final_archive_path, "w") as final_zip: with logging_redirect_tqdm(): From e1bd5a54a2e3f7263dd5ab191ad570f9c94283a8 Mon Sep 17 00:00:00 2001 From: Adam Nowak Date: Fri, 3 Apr 2026 01:20:49 +0200 Subject: [PATCH 2/3] Remove the mega-zip, only per shard zip --- fridata.py | 3 ++- toolbox/scripts/archive.py | 25 ++++++------------------- 2 files changed, 8 insertions(+), 20 deletions(-) diff --git a/fridata.py b/fridata.py index d270736..504021e 100644 --- a/fridata.py +++ b/fridata.py @@ -225,7 +225,8 @@ def create_parser(): add_common_arguments(verify_chains_parser) create_archive_parser = subparsers.add_parser( - "create_archive", help="Create PDB compressed archive" + "create_archive", + help="Write one PDB .zip per H5 shard under data_path/archives/{dataset}/", ) add_common_arguments(create_archive_parser) diff --git a/toolbox/scripts/archive.py b/toolbox/scripts/archive.py index 004db29..7a5774a 100644
--- a/toolbox/scripts/archive.py +++ b/toolbox/scripts/archive.py @@ -1,4 +1,3 @@ -import datetime import hashlib import zipfile from pathlib import Path @@ -59,22 +58,10 @@ def create_archive(structures_dataset: "StructuresDataset"): futures.append(future) n = len(futures) - logger.info("Building combined PDB archive from %s H5 shard(s)", n) + logger.info("Writing %s PDB shard zip(s) under %s", n, output_dir) - current_time = datetime.datetime.now().strftime("%Y%m%d%H%M%S") - final_archive_name = ( - f"archive_pdb_{structures_dataset.dataset_dir_name()}_{current_time}.zip" - ) - final_archive_path = output_dir / final_archive_name - - with zipfile.ZipFile(final_archive_path, "w") as final_zip: - with logging_redirect_tqdm(): - with tqdm(total=n, desc="H5 shards → final zip", unit="h5") as pbar: - i = 0 - for fut in as_completed(futures): - archive_path = fut.result() - with open(archive_path, "rb") as f: - archive_data = f.read() - final_zip.writestr(f"{i}.zip", archive_data) - i += 1 - pbar.update(1) + with logging_redirect_tqdm(): + with tqdm(total=n, desc="H5 shards → zip", unit="h5") as pbar: + for fut in as_completed(futures): + fut.result() + pbar.update(1) From a4089f8725a2c9da9c7076d9b13b0be02de2ce5c Mon Sep 17 00:00:00 2001 From: Adam Nowak Date: Fri, 3 Apr 2026 13:24:40 +0200 Subject: [PATCH 3/3] Add newline to embedding time report --- toolbox/models/embedding/embedding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/toolbox/models/embedding/embedding.py b/toolbox/models/embedding/embedding.py index cde4812..f24af07 100644 --- a/toolbox/models/embedding/embedding.py +++ b/toolbox/models/embedding/embedding.py @@ -75,7 +75,7 @@ def run(self): create_index(self.embeddings_index_path, present_embeddings, self.structures_dataset.config.data_path) end = time.time() - logger.info(f"Total time: {format_time(end - start)}") + logger.info(f"Total time: {format_time(end - start)}\n") def missing_ids_to_fasta(self, missing_ids: List[str]) -> 
Dict[str, str]: