From b0ba321613531a594ed70aecde67decf8b8f4e9f Mon Sep 17 00:00:00 2001 From: Adam Nowak Date: Tue, 24 Mar 2026 20:57:10 +0000 Subject: [PATCH 1/3] fix(archive): unique shard zips, drop tgz, output under data/archives - Name per-shard zips from path relative to data_path to avoid pdbs.h5 collisions - Remove redundant tar.gz wrapper around zip shards - Write shard and final merged zips to {data_path}/archives/{dataset}/ - Open H5 paths from index keys directly; pass data_path for naming - Add unit tests for _shard_zip_name Made-with: Cursor --- tests/test_archive.py | 26 +++++++++++++++++++++ toolbox/scripts/archive.py | 47 ++++++++++++++++++++++++++------------ 2 files changed, 59 insertions(+), 14 deletions(-) create mode 100644 tests/test_archive.py diff --git a/tests/test_archive.py b/tests/test_archive.py new file mode 100644 index 0000000..735da6e --- /dev/null +++ b/tests/test_archive.py @@ -0,0 +1,26 @@ +from toolbox.scripts.archive import _shard_zip_name + + +def test_shard_zip_name_distinct_batches_same_basename(): + data = "/data" + a = f"{data}/structures/PDB/subset_/fourth_7/0/pdbs.h5" + b = f"{data}/structures/PDB/subset_/fourth_7/1/pdbs.h5" + assert _shard_zip_name(a, data) != _shard_zip_name(b, data) + assert _shard_zip_name(a, data).endswith(".zip") + assert "0" in _shard_zip_name(a, data) or "pdbs" in _shard_zip_name(a, data) + + +def test_shard_zip_name_distinct_trees_same_batch_id(): + data = "/data" + a = f"{data}/structures/PDB/subset_/fourth_7/0/pdbs.h5" + b = f"{data}/structures/PDB/subset_/other_slug/0/pdbs.h5" + assert _shard_zip_name(a, data) != _shard_zip_name(b, data) + + +def test_shard_zip_name_long_path_uses_hash(): + data = "/data" + long_mid = "x" * 300 + h5 = f"{data}/structures/{long_mid}/0/pdbs.h5" + name = _shard_zip_name(h5, data) + assert name.endswith(".zip") + assert len(name) < 80 diff --git a/toolbox/scripts/archive.py b/toolbox/scripts/archive.py index 0c4ca30..004db29 100644 --- a/toolbox/scripts/archive.py +++ 
b/toolbox/scripts/archive.py @@ -1,6 +1,5 @@ -from toolbox.models.manage_dataset.index.handle_index import read_index import datetime -import os +import hashlib import zipfile from pathlib import Path @@ -8,14 +7,29 @@ from tqdm import tqdm from tqdm.contrib.logging import logging_redirect_tqdm +from toolbox.models.manage_dataset.index.handle_index import read_index from toolbox.models.manage_dataset.utils import read_all_pdbs_from_h5 from toolbox.utlis.logging import logger -def process_h5_file(h5_file, dataset_path, output_dir): - full_h5_file_path = Path(dataset_path) / h5_file - prots = read_all_pdbs_from_h5(full_h5_file_path) - archive_name = os.path.basename(h5_file).replace(".h5", ".zip") +def _shard_zip_name(h5_file: str, data_path_str: str) -> str: + """Stable unique .zip basename per H5 shard (avoids .../N/pdbs.h5 → pdbs.zip collisions).""" + h5_posix = Path(h5_file).as_posix() + root = Path(data_path_str).as_posix().rstrip("/") + if root and h5_posix.startswith(root + "/"): + rel = h5_posix.removeprefix(root + "/") + else: + rel = h5_posix + stem = rel.removesuffix(".h5").replace("/", "__") + if len(stem) > 200: + stem = hashlib.sha256(rel.encode()).hexdigest()[:24] + return f"{stem}.zip" + + +def process_h5_file(h5_file, data_path_str, output_dir): + h5_path = Path(h5_file) + prots = read_all_pdbs_from_h5(str(h5_path)) + archive_name = _shard_zip_name(h5_file, data_path_str) archive_path = Path(output_dir) / archive_name with zipfile.ZipFile(archive_path, "w") as zipf: @@ -23,30 +37,35 @@ def process_h5_file(h5_file, dataset_path, output_dir): code = p.removesuffix(".pdb") zipf.writestr(f"{code}.pdb", pdb_file_content) - os.system(f"tar -czf {str(archive_path)}.tgz {str(archive_path)}") - return str(archive_path) def create_archive(structures_dataset: "StructuresDataset"): + data_path = structures_dataset.config.data_path dataset_path = structures_dataset.dataset_path() - proteins_index = read_index(Path(dataset_path) / "dataset_reversed.idx", 
structures_dataset.config.data_path) - output_dir = Path(dataset_path) / "archives" - output_dir.mkdir(exist_ok=True) + proteins_index = read_index( + Path(dataset_path) / "dataset_reversed.idx", data_path + ) + output_dir = Path(data_path) / "archives" / structures_dataset.dataset_dir_name() + output_dir.mkdir(parents=True, exist_ok=True) client = structures_dataset._client futures = [] for h5_file in proteins_index.keys(): - future = client.submit(process_h5_file, h5_file, dataset_path, output_dir) + future = client.submit( + process_h5_file, h5_file, data_path, output_dir + ) futures.append(future) n = len(futures) logger.info("Building combined PDB archive from %s H5 shard(s)", n) current_time = datetime.datetime.now().strftime("%Y%m%d%H%M%S") - final_archive_name = f"archive_pdb_{structures_dataset.dataset_dir_name()}_{current_time}.zip" - final_archive_path = Path.cwd() / final_archive_name + final_archive_name = ( + f"archive_pdb_{structures_dataset.dataset_dir_name()}_{current_time}.zip" + ) + final_archive_path = output_dir / final_archive_name with zipfile.ZipFile(final_archive_path, "w") as final_zip: with logging_redirect_tqdm(): From e1bd5a54a2e3f7263dd5ab191ad570f9c94283a8 Mon Sep 17 00:00:00 2001 From: Adam Nowak Date: Fri, 3 Apr 2026 01:20:49 +0200 Subject: [PATCH 2/3] Remove the mega-zip, only per shard zip --- fridata.py | 3 ++- toolbox/scripts/archive.py | 25 ++++++------------------- 2 files changed, 8 insertions(+), 20 deletions(-) diff --git a/fridata.py b/fridata.py index d270736..504021e 100644 --- a/fridata.py +++ b/fridata.py @@ -225,7 +225,8 @@ def create_parser(): add_common_arguments(verify_chains_parser) create_archive_parser = subparsers.add_parser( - "create_archive", help="Create PDB compressed archive" + "create_archive", + help="Write one PDB .zip per H5 shard under data_path/archives/{dataset}/", ) add_common_arguments(create_archive_parser) diff --git a/toolbox/scripts/archive.py b/toolbox/scripts/archive.py index 004db29..7a5774a 100644
--- a/toolbox/scripts/archive.py +++ b/toolbox/scripts/archive.py @@ -1,4 +1,3 @@ -import datetime import hashlib import zipfile from pathlib import Path @@ -59,22 +58,10 @@ def create_archive(structures_dataset: "StructuresDataset"): futures.append(future) n = len(futures) - logger.info("Building combined PDB archive from %s H5 shard(s)", n) + logger.info("Writing %s PDB shard zip(s) under %s", n, output_dir) - current_time = datetime.datetime.now().strftime("%Y%m%d%H%M%S") - final_archive_name = ( - f"archive_pdb_{structures_dataset.dataset_dir_name()}_{current_time}.zip" - ) - final_archive_path = output_dir / final_archive_name - - with zipfile.ZipFile(final_archive_path, "w") as final_zip: - with logging_redirect_tqdm(): - with tqdm(total=n, desc="H5 shards → final zip", unit="h5") as pbar: - i = 0 - for fut in as_completed(futures): - archive_path = fut.result() - with open(archive_path, "rb") as f: - archive_data = f.read() - final_zip.writestr(f"{i}.zip", archive_data) - i += 1 - pbar.update(1) + with logging_redirect_tqdm(): + with tqdm(total=n, desc="H5 shards → zip", unit="h5") as pbar: + for fut in as_completed(futures): + fut.result() + pbar.update(1) From a4089f8725a2c9da9c7076d9b13b0be02de2ce5c Mon Sep 17 00:00:00 2001 From: Adam Nowak Date: Fri, 3 Apr 2026 13:24:40 +0200 Subject: [PATCH 3/3] Add newline to embedding time report --- toolbox/models/embedding/embedding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/toolbox/models/embedding/embedding.py b/toolbox/models/embedding/embedding.py index cde4812..f24af07 100644 --- a/toolbox/models/embedding/embedding.py +++ b/toolbox/models/embedding/embedding.py @@ -75,7 +75,7 @@ def run(self): create_index(self.embeddings_index_path, present_embeddings, self.structures_dataset.config.data_path) end = time.time() - logger.info(f"Total time: {format_time(end - start)}") + logger.info(f"Total time: {format_time(end - start)}\n") def missing_ids_to_fasta(self, missing_ids: List[str]) -> 
Dict[str, str]: