From 1fa8f852a06c9badeeb768049c26996ae72b74e1 Mon Sep 17 00:00:00 2001 From: Tim Treis Date: Sun, 18 May 2025 16:41:51 +0100 Subject: [PATCH 01/13] added helper func --- src/spatialdata/_core/validation.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/spatialdata/_core/validation.py b/src/spatialdata/_core/validation.py index d10ba3572..eb5eda36c 100644 --- a/src/spatialdata/_core/validation.py +++ b/src/spatialdata/_core/validation.py @@ -379,5 +379,8 @@ def __exit__( return False # Exceptions were collected that we want to raise as a combined validation error. if self._collector.errors: - raise ValidationError(title=self._message, errors=self._collector.errors) + raise ValidationError( + title=self._message + "\nTo fix, run `spatialdata.utils.sanitize_table(adata)`.", + errors=self._collector.errors + ) return True From d5cd64fa2b418b25798239c49c5e1d337374c16e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 18 May 2025 15:43:00 +0000 Subject: [PATCH 02/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/spatialdata/_core/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/spatialdata/_core/validation.py b/src/spatialdata/_core/validation.py index eb5eda36c..8c038a4ae 100644 --- a/src/spatialdata/_core/validation.py +++ b/src/spatialdata/_core/validation.py @@ -381,6 +381,6 @@ def __exit__( if self._collector.errors: raise ValidationError( title=self._message + "\nTo fix, run `spatialdata.utils.sanitize_table(adata)`.", - errors=self._collector.errors + errors=self._collector.errors, ) return True From 8c3b792e91e57d032c8c5aecb9dd64a4d4af0878 Mon Sep 17 00:00:00 2001 From: Tim Treis Date: Sun, 18 May 2025 16:48:00 +0100 Subject: [PATCH 03/13] tests --- src/spatialdata/_utils.py | 2 +- tests/utils/test_sanitize.py | 212 +++++++++++++++++++++++++++++++++++ 2 files changed, 213 insertions(+), 1 deletion(-) create mode 100644 tests/utils/test_sanitize.py diff --git a/src/spatialdata/_utils.py b/src/spatialdata/_utils.py index 61f5a52c7..d64ab59e1 100644 --- a/src/spatialdata/_utils.py +++ b/src/spatialdata/_utils.py @@ -345,4 +345,4 @@ def _check_match_length_channels_c_dim( f"The number of channel names `{len(c_coords)}` does not match the length of dimension 'c'" f" with length {c_length}." ) - return c_coords + return c_coords \ No newline at end of file diff --git a/tests/utils/test_sanitize.py b/tests/utils/test_sanitize.py new file mode 100644 index 000000000..b567cc53d --- /dev/null +++ b/tests/utils/test_sanitize.py @@ -0,0 +1,212 @@ +from __future__ import annotations + +import numpy as np +import pandas as pd +import pytest +from anndata import AnnData + +from spatialdata import SpatialData +from spatialdata._utils import sanitize_name, sanitize_table + + +@pytest.fixture +def invalid_table() -> AnnData: + """AnnData with invalid obs column names to test basic sanitization.""" + return AnnData( + obs=pd.DataFrame( + { + "@invalid#": [1, 2], + "valid_name": [3, 4], + "__private": [5, 6], + } + ) + ) + + +@pytest.fixture +def invalid_table_with_index() -> AnnData: + """AnnData with a name requiring whitespace→underscore and a dataframe index column.""" + return AnnData( + obs=pd.DataFrame( + { + "invalid name": [1, 2], + "_index": [3, 4], + } + ) + ) + + +@pytest.fixture +def sdata_sanitized_tables(invalid_table, invalid_table_with_index) -> SpatialData: + """SpatialData built from sanitized copies of the invalid tables.""" + table1 = invalid_table.copy() + table2 = invalid_table_with_index.copy() + sanitize_table(table1) + sanitize_table(table2) + return SpatialData(tables={"table1": table1, "table2": table2}) + + +# ----------------------------------------------------------------------------- +# sanitize_name tests +# ----------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "raw,expected", + [ + ("valid_name", "valid_name"), + ("valid-name", "valid-name"), + ("valid.name", "valid.name"), + ("invalid@name", "invalid_name"), + ("invalid#name", "invalid_name"), + ("invalid name", "invalid_name"), + ("", "unnamed"), + (".", "unnamed"), + ("..", "unnamed"), + ("__private", "private"), + ], +) +def test_sanitize_name_strips_special_chars(raw, expected): + assert sanitize_name(raw) == expected + + +@pytest.mark.parametrize( + "raw,is_df_col,expected", + [ + ("_index", True, "index"), + ("_index", False, "index"), + ("valid@column", True, "valid_column"), + ("__private", True, "private"), + ], +) +def test_sanitize_name_dataframe_column(raw, is_df_col, expected): + assert sanitize_name(raw, is_dataframe_column=is_df_col) == expected + + +# ----------------------------------------------------------------------------- +# sanitize_table basic behaviors +# ----------------------------------------------------------------------------- + + +def test_sanitize_table_basic_columns(invalid_table, invalid_table_with_index): + ad1 = sanitize_table(invalid_table, inplace=False) + assert isinstance(ad1, AnnData) + assert list(ad1.obs.columns) == ["invalid_", "valid_name", "private"] + + ad2 = sanitize_table(invalid_table_with_index, inplace=False) + assert list(ad2.obs.columns) == ["invalid_name", "index"] + + # original fixture remains unchanged + assert list(invalid_table.obs.columns) == ["@invalid#", "valid_name", "__private"] + + +def test_sanitize_table_inplace_copy(invalid_table): + ad = invalid_table.copy() + sanitize_table(ad) # inplace=True is now default + assert list(ad.obs.columns) == ["invalid_", "valid_name", "private"] + + +def test_sanitize_table_case_insensitive_collisions(): + obs = pd.DataFrame( + { + "Column1": [1, 2], + "column1": [3, 4], + "COLUMN1": [5, 6], + } + ) + ad = AnnData(obs=obs) + sanitized = sanitize_table(ad, inplace=False) + cols = list(sanitized.obs.columns) + assert sorted(cols) == sorted(["Column1", "column1_1", "COLUMN1_2"]) + + +def test_sanitize_table_whitespace_collision(): + """Ensure 'a b' → 'a_b' doesn't collide silently with existing 'a_b'.""" + obs = pd.DataFrame({"a b": [1], "a_b": [2]}) + ad = AnnData(obs=obs) + sanitized = sanitize_table(ad, inplace=False) + cols = list(sanitized.obs.columns) + assert "a_b" in cols + assert "a_b_1" in cols + + +# ----------------------------------------------------------------------------- +# sanitize_table attribute‐specific tests +# ----------------------------------------------------------------------------- + + +def test_sanitize_table_obs_and_obs_columns(): + ad = AnnData(obs=pd.DataFrame({"@col": [1, 2]})) + sanitized = sanitize_table(ad, inplace=False) + assert list(sanitized.obs.columns) == ["col"] + + +def test_sanitize_table_obsm_and_obsp(): + ad = AnnData(obs=pd.DataFrame({"@col": [1, 2]})) + ad.obsm["@col"] = np.array([[1, 2], [3, 4]]) + ad.obsp["bad name"] = np.array([[1, 2], [3, 4]]) + sanitized = sanitize_table(ad, inplace=False) + assert list(sanitized.obsm.keys()) == ["col"] + assert list(sanitized.obsp.keys()) == ["bad_name"] + + +def test_sanitize_table_varm_and_varp(): + ad = AnnData(obs=pd.DataFrame({"x": [1, 2]}), var=pd.DataFrame(index=["v1", "v2"])) + ad.varm["__priv"] = np.array([[1, 2], [3, 4]]) + ad.varp["_index"] = np.array([[1, 2], [3, 4]]) + sanitized = sanitize_table(ad, inplace=False) + assert list(sanitized.varm.keys()) == ["priv"] + assert list(sanitized.varp.keys()) == ["index"] + + +def test_sanitize_table_uns_and_layers(): + ad = AnnData(obs=pd.DataFrame({"x": [1, 2]}), var=pd.DataFrame(index=["v1", "v2"])) + ad.uns["bad@key"] = "val" + ad.layers["bad#layer"] = np.array([[0, 1], [1, 0]]) + sanitized = sanitize_table(ad, inplace=False) + assert list(sanitized.uns.keys()) == ["bad_key"] + assert list(sanitized.layers.keys()) == ["bad_layer"] + + +def test_sanitize_table_empty_returns_empty(): + ad = AnnData() + sanitized = sanitize_table(ad, inplace=False) + assert isinstance(sanitized, AnnData) + assert sanitized.obs.empty + assert sanitized.var.empty + + +def test_sanitize_table_preserves_underlying_data(): + ad = AnnData(obs=pd.DataFrame({"@invalid#": [1, 2], "valid": [3, 4]})) + ad.obsm["@invalid#"] = np.array([[1, 2], [3, 4]]) + ad.uns["invalid@key"] = "value" + sanitized = sanitize_table(ad, inplace=False) + assert sanitized.obs["invalid_"].tolist() == [1, 2] + assert sanitized.obs["valid"].tolist() == [3, 4] + assert np.array_equal(sanitized.obsm["invalid_"], np.array([[1, 2], [3, 4]])) + assert sanitized.uns["invalid_key"] == "value" + + +# ----------------------------------------------------------------------------- +# SpatialData integration +# ----------------------------------------------------------------------------- + + +def test_sanitize_table_in_spatialdata_sanitized_fixture(sdata_sanitized_tables): + t1 = sdata_sanitized_tables.tables["table1"] + t2 = sdata_sanitized_tables.tables["table2"] + assert list(t1.obs.columns) == ["invalid_", "valid_name", "private"] + assert list(t2.obs.columns) == ["invalid_name", "index"] + + +def test_spatialdata_retains_other_elements(full_sdata, sdata_sanitized_tables): + # Add another sanitized table into an existing full_sdata + tbl = AnnData(obs=pd.DataFrame({"@foo#": [1, 2], "bar": [3, 4]})) + sanitize_table(tbl) + full_sdata.tables["new_table"] = tbl + + # Verify columns and presence of other SpatialData attributes + assert list(full_sdata.tables["new_table"].obs.columns) == ["foo_", "bar"] + assert "image2d" in full_sdata.images + assert "labels2d" in full_sdata.labels + assert "points_0" in full_sdata.points From fd2ef3411c427e8d6ccc892f5f88c8b4ca60a7fe Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 18 May 2025 15:49:55 +0000 Subject: [PATCH 04/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/spatialdata/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/spatialdata/_utils.py b/src/spatialdata/_utils.py index d64ab59e1..61f5a52c7 100644 --- a/src/spatialdata/_utils.py +++ b/src/spatialdata/_utils.py @@ -345,4 +345,4 @@ def _check_match_length_channels_c_dim( f"The number of channel names `{len(c_coords)}` does not match the length of dimension 'c'" f" with length {c_length}." ) - return c_coords \ No newline at end of file + return c_coords From e95cebbdb58c530f8949ee59a83da67ed7c3d1b0 Mon Sep 17 00:00:00 2001 From: Tim Treis Date: Sun, 18 May 2025 16:50:29 +0100 Subject: [PATCH 05/13] fix --- src/spatialdata/_utils.py | 132 +++++++++++++++++++++++++++++++++++++- 1 file changed, 131 insertions(+), 1 deletion(-) diff --git a/src/spatialdata/_utils.py b/src/spatialdata/_utils.py index d64ab59e1..0f11248b6 100644 --- a/src/spatialdata/_utils.py +++ b/src/spatialdata/_utils.py @@ -345,4 +345,134 @@ def _check_match_length_channels_c_dim( f"The number of channel names `{len(c_coords)}` does not match the length of dimension 'c'" f" with length {c_length}." ) - return c_coords \ No newline at end of file + return c_coords + + +def sanitize_name(name: str, is_dataframe_column: bool = False) -> str: + """ + Sanitize a name to comply with SpatialData naming rules. + This function converts invalid names into valid ones by: + 1. Converting to string if not already + 2. Removing invalid characters + 3. Handling special cases like "__" prefix + 4. Ensuring the name is not empty + 5. Handling special cases for dataframe columns + Parameters + ---------- + name + The name to sanitize + is_dataframe_column + Whether this name is for a dataframe column (additional restrictions apply) + Returns + ------- + A sanitized version of the name that complies with SpatialData naming rules. + Examples + -------- + >>> sanitize_name("my@invalid#name") + 'my_invalid_name' + >>> sanitize_name("__private") + 'private' + >>> sanitize_name("_index", is_dataframe_column=True) + 'index' + """ + # Convert to string if not already + name = str(name) + + # Handle empty string case + if not name: + return "unnamed" + + # Handle special cases + if name == "." or name == "..": + return "unnamed" + + # Remove "__" prefix if present + if name.startswith("__"): + name = name[2:] + + # Replace invalid characters with underscore + # Keep only alphanumeric, underscore, dot, and hyphen + sanitized = "" + for char in name: + if char.isalnum() or char in "_-.": + sanitized += char + else: + sanitized += "_" + + # Remove leading underscores but keep trailing ones + sanitized = sanitized.lstrip("_") + + # Ensure we don't end up with an empty string after sanitization + if not sanitized: + return "unnamed" + + return sanitized + + +def sanitize_table(data: AnnData, inplace: bool = True) -> AnnData | None: + """ + Sanitize all keys in an AnnData table to comply with SpatialData naming rules. + This function sanitizes all keys in obs, var, obsm, obsp, varm, varp, uns, and layers + while maintaining case-insensitive uniqueness. It can either modify the table in-place + or return a new sanitized copy. + Parameters + ---------- + data + The AnnData table to sanitize + inplace + Whether to modify the table in-place or return a new copy + Returns + ------- + If inplace is False, returns a new AnnData object with sanitized keys. + If inplace is True, returns None as the original object is modified. + Examples + -------- + >>> import anndata as ad + >>> adata = ad.AnnData(obs=pd.DataFrame({"@invalid#": [1, 2]})) + >>> # Create a new sanitized copy + >>> sanitized = sanitize_table(adata) + >>> print(sanitized.obs.columns) + Index(['invalid_'], dtype='object') + >>> # Or modify in-place + >>> sanitize_table(adata, inplace=True) + >>> print(adata.obs.columns) + Index(['invalid_'], dtype='object') + """ + import copy + from collections import defaultdict + + # Create a deep copy if not modifying in-place + sanitized = data if inplace else copy.deepcopy(data) + + # Track used names to maintain case-insensitive uniqueness + used_names: dict[str, set[str]] = defaultdict(set) + + def get_unique_name(name: str, attr: str, is_dataframe_column: bool = False) -> str: + base_name = sanitize_name(name, is_dataframe_column) + normalized_base = base_name.lower() + + # If this exact name is already used, add a number + if normalized_base in {n.lower() for n in used_names[attr]}: + counter = 1 + while f"{base_name}_{counter}".lower() in {n.lower() for n in used_names[attr]}: + counter += 1 + base_name = f"{base_name}_{counter}" + + used_names[attr].add(base_name) + return base_name + + # Handle obs and var (dataframe columns) + for attr in ("obs", "var"): + df = getattr(sanitized, attr) + new_columns = {old: get_unique_name(old, attr, is_dataframe_column=True) for old in df.columns} + df.rename(columns=new_columns, inplace=True) + + # Handle other attributes + for attr in ("obsm", "obsp", "varm", "varp", "uns", "layers"): + d = getattr(sanitized, attr) + new_keys = {old: get_unique_name(old, attr) for old in d} + # Create new dictionary with sanitized keys + new_dict = {new_keys[old]: value for old, value in d.items()} + setattr(sanitized, attr, new_dict) + + return None if inplace else sanitized \ No newline at end of file From 6ce065dd7d79155b53a773d191ec0ddf6c32a662 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 18 May 2025 15:52:06 +0000 Subject: [PATCH 06/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/spatialdata/_utils.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/spatialdata/_utils.py b/src/spatialdata/_utils.py index 0f11248b6..38b246254 100644 --- a/src/spatialdata/_utils.py +++ b/src/spatialdata/_utils.py @@ -357,15 +357,18 @@ def sanitize_name(name: str, is_dataframe_column: bool = False) -> str: 3. Handling special cases like "__" prefix 4. Ensuring the name is not empty 5. Handling special cases for dataframe columns + Parameters ---------- name The name to sanitize is_dataframe_column Whether this name is for a dataframe column (additional restrictions apply) + Returns ------- A sanitized version of the name that complies with SpatialData naming rules. + Examples -------- >>> sanitize_name("my@invalid#name") @@ -415,16 +418,19 @@ def sanitize_table(data: AnnData, inplace: bool = True) -> AnnData | None: This function sanitizes all keys in obs, var, obsm, obsp, varm, varp, uns, and layers while maintaining case-insensitive uniqueness. It can either modify the table in-place or return a new sanitized copy. + Parameters ---------- data The AnnData table to sanitize inplace Whether to modify the table in-place or return a new copy + Returns ------- If inplace is False, returns a new AnnData object with sanitized keys. If inplace is True, returns None as the original object is modified. + Examples -------- >>> import anndata as ad @@ -475,4 +481,4 @@ def get_unique_name(name: str, attr: str, is_dataframe_column: bool = False) -> new_dict = {new_keys[old]: value for old, value in d.items()} setattr(sanitized, attr, new_dict) - return None if inplace else sanitized \ No newline at end of file + return None if inplace else sanitized From f25165a56cf68912e15c9f94f8eb63b3fbdeaebd Mon Sep 17 00:00:00 2001 From: Tim Treis Date: Sun, 18 May 2025 17:07:05 +0100 Subject: [PATCH 07/13] mypy --- src/spatialdata/_utils.py | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/src/spatialdata/_utils.py b/src/spatialdata/_utils.py index 38b246254..239fffed1 100644 --- a/src/spatialdata/_utils.py +++ b/src/spatialdata/_utils.py @@ -351,6 +351,7 @@ def _check_match_length_channels_c_dim( def sanitize_name(name: str, is_dataframe_column: bool = False) -> str: """ Sanitize a name to comply with SpatialData naming rules. + This function converts invalid names into valid ones by: 1. Converting to string if not already 2. Removing invalid characters @@ -386,35 +387,25 @@ def sanitize_name(name: str, is_dataframe_column: bool = False) -> str: return "unnamed" # Handle special cases - if name == "." or name == "..": + if name in {".", ".."}: return "unnamed" # Remove "__" prefix if present if name.startswith("__"): name = name[2:] - # Replace invalid characters with underscore - # Keep only alphanumeric, underscore, dot, and hyphen - sanitized = "" - for char in name: - if char.isalnum() or char in "_-.": - sanitized += char - else: - sanitized += "_" - + sanitized = "".join(char if char.isalnum() or char in "_-." else "_" for char in name) # Remove leading underscores but keep trailing ones sanitized = sanitized.lstrip("_") # Ensure we don't end up with an empty string after sanitization - if not sanitized: - return "unnamed" - - return sanitized + return sanitized or "unnamed" def sanitize_table(data: AnnData, inplace: bool = True) -> AnnData | None: """ Sanitize all keys in an AnnData table to comply with SpatialData naming rules. + This function sanitizes all keys in obs, var, obsm, obsp, varm, varp, uns, and layers while maintaining case-insensitive uniqueness. It can either modify the table in-place or return a new sanitized copy. From 9f221857e11668c734533c9a26351c703771a81f Mon Sep 17 00:00:00 2001 From: Tim Treis Date: Sun, 18 May 2025 17:19:49 +0100 Subject: [PATCH 08/13] Update src/spatialdata/_utils.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/spatialdata/_utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/spatialdata/_utils.py b/src/spatialdata/_utils.py index 239fffed1..487ffdf36 100644 --- a/src/spatialdata/_utils.py +++ b/src/spatialdata/_utils.py @@ -443,19 +443,21 @@ def sanitize_table(data: AnnData, inplace: bool = True) -> AnnData | None: # Track used names to maintain case-insensitive uniqueness used_names: dict[str, set[str]] = defaultdict(set) + used_names_lower: dict[str, set[str]] = defaultdict(set) def get_unique_name(name: str, attr: str, is_dataframe_column: bool = False) -> str: base_name = sanitize_name(name, is_dataframe_column) normalized_base = base_name.lower() # If this exact name is already used, add a number - if normalized_base in {n.lower() for n in used_names[attr]}: + if normalized_base in used_names_lower[attr]: counter = 1 - while f"{base_name}_{counter}".lower() in {n.lower() for n in used_names[attr]}: + while f"{base_name}_{counter}".lower() in used_names_lower[attr]: counter += 1 base_name = f"{base_name}_{counter}" used_names[attr].add(base_name) + used_names_lower[attr].add(base_name.lower()) return base_name # Handle obs and var (dataframe columns) From 8221d5d66c9e61713fec9a5e8a524e4b7b2cce9a Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Sun, 25 May 2025 19:01:26 -0400 Subject: [PATCH 09/13] fix sanitize edge case; add to docs --- docs/api/operations.md | 2 + src/spatialdata/__init__.py | 3 + src/spatialdata/_core/_utils.py | 141 +++++++++++++++++++++++++++++ src/spatialdata/_utils.py | 129 -------------------------- src/spatialdata/models/__init__.py | 1 + tests/utils/test_sanitize.py | 31 ++++--- 6 files changed, 164 insertions(+), 143 deletions(-) diff --git a/docs/api/operations.md b/docs/api/operations.md index 937b8dbca..3eb2a5a6c 100644 --- a/docs/api/operations.md +++ b/docs/api/operations.md @@ -29,4 +29,6 @@ Operations on `SpatialData` objects. .. autofunction:: are_extents_equal .. autofunction:: deepcopy .. autofunction:: get_pyramid_levels +.. autofunction:: sanitize_name +.. autofunction:: sanitize_table ``` diff --git a/src/spatialdata/__init__.py b/src/spatialdata/__init__.py index 9ddfea32d..0b68391ad 100644 --- a/src/spatialdata/__init__.py +++ b/src/spatialdata/__init__.py @@ -53,10 +53,13 @@ "relabel_sequential", "map_raster", "deepcopy", + "sanitize_table", + "sanitize_name", ] from spatialdata import dataloader, datasets, models, transformations from spatialdata._core._deepcopy import deepcopy +from spatialdata._core._utils import sanitize_name, sanitize_table from spatialdata._core.centroids import get_centroids from spatialdata._core.concatenate import concatenate from spatialdata._core.data_extent import are_extents_equal, get_extent diff --git a/src/spatialdata/_core/_utils.py b/src/spatialdata/_core/_utils.py index dd27e9c8d..12e794374 100644 --- a/src/spatialdata/_core/_utils.py +++ b/src/spatialdata/_core/_utils.py @@ -1,5 +1,9 @@ +from __future__ import annotations + from collections.abc import Iterable +from anndata import AnnData + from spatialdata._core.spatialdata import SpatialData @@ -25,3 +29,140 @@ def _find_common_table_keys(sdatas: Iterable[SpatialData]) -> set[str]: common_keys.intersection_update(sdata.tables.keys()) return common_keys + + +def sanitize_name(name: str, is_dataframe_column: bool = False) -> str: + """ + Sanitize a name to comply with SpatialData naming rules. + + This function converts invalid names into valid ones by: + 1. Converting to string if not already + 2. Removing invalid characters + 3. Handling special cases like "__" prefix + 4. Ensuring the name is not empty + 5. Handling special cases for dataframe columns + + See a discussion on the naming rules, and how to avoid naming collisions, here: + https://github.com/scverse/spatialdata/discussions/707 + + Parameters + ---------- + name + The name to sanitize + is_dataframe_column + Whether this name is for a dataframe column (additional restrictions apply) + + Returns + ------- + A sanitized version of the name that complies with SpatialData naming rules. If a + santized name cannoted be generated, it returns "unnamed". + + Examples + -------- + >>> sanitize_name("my@invalid#name") + 'my_invalid_name' + >>> sanitize_name("__private") + 'private' + >>> sanitize_name("_index", is_dataframe_column=True) + 'index' + """ + # Convert to string if not already + name = str(name) + + # Handle empty string case + if not name: + return "unnamed" + + # Handle special cases + if name in {".", ".."}: + return "unnamed" + + sanitized = "".join(char if char.isalnum() or char in "_-." else "_" for char in name) + + # remove double underscores if found as a prefix + while sanitized.startswith("__"): + sanitized = sanitized[1:] + + if is_dataframe_column and sanitized == "_index": + return "index" + + # Ensure we don't end up with an empty string after sanitization + return sanitized or "unnamed" + + +def sanitize_table(data: AnnData, inplace: bool = True) -> AnnData | None: + """ + Sanitize all keys in an AnnData table to comply with SpatialData naming rules. + + This function sanitizes all keys in obs, var, obsm, obsp, varm, varp, uns, and layers + while maintaining case-insensitive uniqueness. It can either modify the table in-place + or return a new sanitized copy. + + See a discussion on the naming rules here: + https://github.com/scverse/spatialdata/discussions/707 + + Parameters + ---------- + data + The AnnData table to sanitize + inplace + Whether to modify the table in-place or return a new copy + + Returns + ------- + If inplace is False, returns a new AnnData object with sanitized keys. + If inplace is True, returns None as the original object is modified. + + Examples + -------- + >>> import anndata as ad + >>> adata = ad.AnnData(obs=pd.DataFrame({"@invalid#": [1, 2]})) + >>> # Create a new sanitized copy + >>> sanitized = sanitize_table(adata) + >>> print(sanitized.obs.columns) + Index(['invalid_'], dtype='object') + >>> # Or modify in-place + >>> sanitize_table(adata, inplace=True) + >>> print(adata.obs.columns) + Index(['invalid_'], dtype='object') + """ + import copy + from collections import defaultdict + + # Create a deep copy if not modifying in-place + sanitized = data if inplace else copy.deepcopy(data) + + # Track used names to maintain case-insensitive uniqueness + used_names: dict[str, set[str]] = defaultdict(set) + used_names_lower: dict[str, set[str]] = defaultdict(set) + + def get_unique_name(name: str, attr: str, is_dataframe_column: bool = False) -> str: + base_name = sanitize_name(name, is_dataframe_column) + normalized_base = base_name.lower() + + # If this exact name is already used, add a number + if normalized_base in used_names_lower[attr]: + counter = 1 + while f"{base_name}_{counter}".lower() in used_names_lower[attr]: + counter += 1 + base_name = f"{base_name}_{counter}" + + used_names[attr].add(base_name) + used_names_lower[attr].add(base_name.lower()) + return base_name + + # Handle obs and var (dataframe columns) + for attr in ("obs", "var"): + df = getattr(sanitized, attr) + new_columns = {old: get_unique_name(old, attr, is_dataframe_column=True) for old in df.columns} + df.rename(columns=new_columns, inplace=True) + + # Handle other attributes + for attr in ("obsm", "obsp", "varm", "varp", "uns", "layers"): + d = getattr(sanitized, attr) + new_keys = {old: get_unique_name(old, attr) for old in d} + # Create new dictionary with sanitized keys + new_dict = {new_keys[old]: value for old, value in d.items()} + setattr(sanitized, attr, new_dict) + + return None if inplace else sanitized diff --git a/src/spatialdata/_utils.py b/src/spatialdata/_utils.py index 487ffdf36..61f5a52c7 100644 --- a/src/spatialdata/_utils.py +++ b/src/spatialdata/_utils.py @@ -346,132 +346,3 @@ def _check_match_length_channels_c_dim( f" with length {c_length}." ) return c_coords - - -def sanitize_name(name: str, is_dataframe_column: bool = False) -> str: - """ - Sanitize a name to comply with SpatialData naming rules. - - This function converts invalid names into valid ones by: - 1. Converting to string if not already - 2. Removing invalid characters - 3. Handling special cases like "__" prefix - 4. Ensuring the name is not empty - 5. Handling special cases for dataframe columns - - Parameters - ---------- - name - The name to sanitize - is_dataframe_column - Whether this name is for a dataframe column (additional restrictions apply) - - Returns - ------- - A sanitized version of the name that complies with SpatialData naming rules. - - Examples - -------- - >>> sanitize_name("my@invalid#name") - 'my_invalid_name' - >>> sanitize_name("__private") - 'private' - >>> sanitize_name("_index", is_dataframe_column=True) - 'index' - """ - # Convert to string if not already - name = str(name) - - # Handle empty string case - if not name: - return "unnamed" - - # Handle special cases - if name in {".", ".."}: - return "unnamed" - - # Remove "__" prefix if present - if name.startswith("__"): - name = name[2:] - - sanitized = "".join(char if char.isalnum() or char in "_-." else "_" for char in name) - # Remove leading underscores but keep trailing ones - sanitized = sanitized.lstrip("_") - - # Ensure we don't end up with an empty string after sanitization - return sanitized or "unnamed" - - -def sanitize_table(data: AnnData, inplace: bool = True) -> AnnData | None: - """ - Sanitize all keys in an AnnData table to comply with SpatialData naming rules. - - This function sanitizes all keys in obs, var, obsm, obsp, varm, varp, uns, and layers - while maintaining case-insensitive uniqueness. It can either modify the table in-place - or return a new sanitized copy. - - Parameters - ---------- - data - The AnnData table to sanitize - inplace - Whether to modify the table in-place or return a new copy - - Returns - ------- - If inplace is False, returns a new AnnData object with sanitized keys. - If inplace is True, returns None as the original object is modified. - - Examples - -------- - >>> import anndata as ad - >>> adata = ad.AnnData(obs=pd.DataFrame({"@invalid#": [1, 2]})) - >>> # Create a new sanitized copy - >>> sanitized = sanitize_table(adata) - >>> print(sanitized.obs.columns) - Index(['invalid_'], dtype='object') - >>> # Or modify in-place - >>> sanitize_table(adata, inplace=True) - >>> print(adata.obs.columns) - Index(['invalid_'], dtype='object') - """ - import copy - from collections import defaultdict - - # Create a deep copy if not modifying in-place - sanitized = data if inplace else copy.deepcopy(data) - - # Track used names to maintain case-insensitive uniqueness - used_names: dict[str, set[str]] = defaultdict(set) - used_names_lower: dict[str, set[str]] = defaultdict(set) - - def get_unique_name(name: str, attr: str, is_dataframe_column: bool = False) -> str: - base_name = sanitize_name(name, is_dataframe_column) - normalized_base = base_name.lower() - - # If this exact name is already used, add a number - if normalized_base in used_names_lower[attr]: - counter = 1 - while f"{base_name}_{counter}".lower() in used_names_lower[attr]: - counter += 1 - base_name = f"{base_name}_{counter}" - - used_names[attr].add(base_name) - used_names_lower[attr].add(base_name.lower()) - return base_name - - # Handle obs and var (dataframe columns) - for attr in ("obs", "var"): - df = getattr(sanitized, attr) - new_columns = {old: get_unique_name(old, attr, is_dataframe_column=True) for old in df.columns} - df.rename(columns=new_columns, inplace=True) - - # Handle other attributes - for attr in ("obsm", "obsp", "varm", "varp", "uns", "layers"): - d = getattr(sanitized, attr) - new_keys = {old: get_unique_name(old, attr) for old in d} - # Create new dictionary with sanitized keys - new_dict = {new_keys[old]: value for old, value in d.items()} - setattr(sanitized, attr, new_dict) - - return None if inplace else sanitized diff --git a/src/spatialdata/models/__init__.py b/src/spatialdata/models/__init__.py index 3c86fa0ec..d040514ac 100644 --- a/src/spatialdata/models/__init__.py +++ b/src/spatialdata/models/__init__.py @@ -55,4 +55,5 @@ "set_channel_names", "force_2d", "RasterSchema", + "sani", ] diff --git a/tests/utils/test_sanitize.py b/tests/utils/test_sanitize.py index b567cc53d..23c374cae 100644 --- a/tests/utils/test_sanitize.py +++ b/tests/utils/test_sanitize.py @@ -6,7 +6,7 @@ from anndata import AnnData from spatialdata import SpatialData -from spatialdata._utils import sanitize_name, sanitize_table +from spatialdata._core._utils import sanitize_name, sanitize_table @pytest.fixture @@ -63,7 +63,10 @@ def sdata_sanitized_tables(invalid_table, invalid_table_with_index) -> SpatialDa ("", "unnamed"), (".", "unnamed"), ("..", "unnamed"), - ("__private", "private"), + ("__", "_"), + ("___", "_"), + ("____#@$@", "_"), + ("__private", "_private"), ], ) def test_sanitize_name_strips_special_chars(raw, expected): @@ -74,9 +77,9 @@ def test_sanitize_name_strips_special_chars(raw, expected): "raw,is_df_col,expected", [ ("_index", True, "index"), - ("_index", False, "index"), + ("_index", False, "_index"), ("valid@column", True, "valid_column"), - ("__private", True, "private"), + ("__private", True, "_private"), ], ) def test_sanitize_name_dataframe_column(raw, is_df_col, expected): @@ -91,7 +94,7 @@ def test_sanitize_name_dataframe_column(raw, is_df_col, expected): def test_sanitize_table_basic_columns(invalid_table, invalid_table_with_index): ad1 = sanitize_table(invalid_table, inplace=False) assert isinstance(ad1, AnnData) - assert list(ad1.obs.columns) == ["invalid_", "valid_name", "private"] + assert list(ad1.obs.columns) == ["_invalid_", "valid_name", "_private"] ad2 = sanitize_table(invalid_table_with_index, inplace=False) assert list(ad2.obs.columns) == ["invalid_name", "index"] @@ -103,7 +106,7 @@ def test_sanitize_table_basic_columns(invalid_table, invalid_table_with_index): def test_sanitize_table_inplace_copy(invalid_table): ad = invalid_table.copy() sanitize_table(ad) # inplace=True is now default - assert list(ad.obs.columns) == ["invalid_", "valid_name", "private"] + assert list(ad.obs.columns) == ["_invalid_", "valid_name", "_private"] def test_sanitize_table_case_insensitive_collisions(): @@ -138,7 +141,7 @@ def test_sanitize_table_whitespace_collision(): def test_sanitize_table_obs_and_obs_columns(): ad = AnnData(obs=pd.DataFrame({"@col": [1, 2]})) sanitized = sanitize_table(ad, inplace=False) - assert list(sanitized.obs.columns) == ["col"] + assert list(sanitized.obs.columns) == ["_col"] def test_sanitize_table_obsm_and_obsp(): @@ -146,7 +149,7 @@ def test_sanitize_table_obsm_and_obsp(): ad.obsm["@col"] = np.array([[1, 2], [3, 4]]) ad.obsp["bad name"] = np.array([[1, 2], [3, 4]]) sanitized = sanitize_table(ad, inplace=False) - assert list(sanitized.obsm.keys()) == ["col"] + assert list(sanitized.obsm.keys()) == ["_col"] assert list(sanitized.obsp.keys()) == ["bad_name"] @@ -155,8 +158,8 @@ def test_sanitize_table_varm_and_varp(): ad.varm["__priv"] = np.array([[1, 2], [3, 4]]) ad.varp["_index"] = np.array([[1, 2], [3, 4]]) sanitized = sanitize_table(ad, inplace=False) - assert list(sanitized.varm.keys()) == ["priv"] - assert list(sanitized.varp.keys()) == ["index"] + assert list(sanitized.varm.keys()) == ["_priv"] + assert list(sanitized.varp.keys()) == ["_index"] def test_sanitize_table_uns_and_layers(): @@ -181,9 +184,9 @@ def test_sanitize_table_preserves_underlying_data(): ad.obsm["@invalid#"] = np.array([[1, 2], [3, 4]]) ad.uns["invalid@key"] = "value" sanitized = sanitize_table(ad, inplace=False) - assert sanitized.obs["invalid_"].tolist() == [1, 2] + assert sanitized.obs["_invalid_"].tolist() == [1, 2] assert sanitized.obs["valid"].tolist() == [3, 4] - assert np.array_equal(sanitized.obsm["invalid_"], np.array([[1, 2], [3, 4]])) + assert np.array_equal(sanitized.obsm["_invalid_"], np.array([[1, 2], [3, 4]])) assert sanitized.uns["invalid_key"] == "value" @@ -195,7 +198,7 @@ def test_sanitize_table_preserves_underlying_data(): def test_sanitize_table_in_spatialdata_sanitized_fixture(sdata_sanitized_tables): t1 = sdata_sanitized_tables.tables["table1"] t2 = sdata_sanitized_tables.tables["table2"] - assert list(t1.obs.columns) == ["invalid_", "valid_name", "private"] + assert list(t1.obs.columns) == ["_invalid_", "valid_name", "_private"] assert list(t2.obs.columns) == ["invalid_name", "index"] @@ -206,7 +209,7 @@ def test_spatialdata_retains_other_elements(full_sdata, sdata_sanitized_tables): full_sdata.tables["new_table"] = tbl # Verify columns and presence of other SpatialData attributes - assert list(full_sdata.tables["new_table"].obs.columns) == ["foo_", "bar"] + assert list(full_sdata.tables["new_table"].obs.columns) == ["_foo_", "bar"] assert "image2d" in full_sdata.images assert "labels2d" in full_sdata.labels assert "points_0" in full_sdata.points From 1ec773b541e26bc0a819147afd4bfffe8ad493b1 Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Tue, 27 May 2025 17:18:24 +0200 Subject: [PATCH 10/13] Apply suggestions from code review --- src/spatialdata/_core/_utils.py | 1 - src/spatialdata/models/__init__.py | 1 - tests/utils/test_sanitize.py | 17 +---------------- 3 files changed, 1 insertion(+), 18 deletions(-) diff --git a/src/spatialdata/_core/_utils.py b/src/spatialdata/_core/_utils.py index 12e794374..ce1203438 100644 --- a/src/spatialdata/_core/_utils.py +++ b/src/spatialdata/_core/_utils.py @@ -133,7 +133,6 @@ def sanitize_table(data: AnnData, inplace: bool = True) -> AnnData | None: sanitized = data if inplace else copy.deepcopy(data) # Track used names to maintain case-insensitive uniqueness - used_names: dict[str, set[str]] = defaultdict(set) used_names_lower: dict[str, set[str]] = defaultdict(set) def get_unique_name(name: str, attr: str, is_dataframe_column: bool = False) -> str: diff --git a/src/spatialdata/models/__init__.py b/src/spatialdata/models/__init__.py index d040514ac..3c86fa0ec 100644 --- a/src/spatialdata/models/__init__.py +++ b/src/spatialdata/models/__init__.py @@ -55,5 +55,4 @@ "set_channel_names", "force_2d", "RasterSchema", - "sani", ] diff --git a/tests/utils/test_sanitize.py b/tests/utils/test_sanitize.py index 23c374cae..fb7cd2024 100644 --- a/tests/utils/test_sanitize.py +++ b/tests/utils/test_sanitize.py @@ -34,18 +34,6 @@ def invalid_table_with_index() -> AnnData: } ) ) - - -@pytest.fixture -def sdata_sanitized_tables(invalid_table, invalid_table_with_index) -> SpatialData: - """SpatialData built from sanitized copies of the invalid tables.""" - table1 = invalid_table.copy() - table2 = invalid_table_with_index.copy() - sanitize_table(table1) - sanitize_table(table2) - return SpatialData(tables={"table1": table1, "table2": table2}) - - # ----------------------------------------------------------------------------- # sanitize_name tests # ----------------------------------------------------------------------------- @@ -202,7 +190,7 @@ def test_sanitize_table_in_spatialdata_sanitized_fixture(sdata_sanitized_tables) assert list(t2.obs.columns) == ["invalid_name", "index"] -def test_spatialdata_retains_other_elements(full_sdata, sdata_sanitized_tables): +def test_spatialdata_retains_other_elements(full_sdata): # Add another sanitized table into an existing full_sdata tbl = AnnData(obs=pd.DataFrame({"@foo#": [1, 2], "bar": [3, 4]})) sanitize_table(tbl) @@ -210,6 +198,3 @@ def test_spatialdata_retains_other_elements(full_sdata, sdata_sanitized_tables): # Verify columns and presence of other SpatialData attributes assert list(full_sdata.tables["new_table"].obs.columns) == ["_foo_", "bar"] - assert "image2d" in full_sdata.images - assert "labels2d" in full_sdata.labels - assert "points_0" in full_sdata.points From 38bb5546ff929ddc1deb73fab77f7e31c875f8fd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 27 May 2025 15:18:39 +0000 Subject: [PATCH 11/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/utils/test_sanitize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/utils/test_sanitize.py b/tests/utils/test_sanitize.py index fb7cd2024..85953ca25 100644 --- a/tests/utils/test_sanitize.py +++ b/tests/utils/test_sanitize.py @@ -5,7 +5,6 @@ import pytest from anndata import AnnData -from spatialdata import SpatialData from spatialdata._core._utils import sanitize_name, sanitize_table @@ -34,6 +33,8 @@ def invalid_table_with_index() -> AnnData: } ) ) + + # ----------------------------------------------------------------------------- # sanitize_name tests # ----------------------------------------------------------------------------- From ad1c573120e6b4481c052360e60a0e4bccdac65e Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Tue, 27 May 2025 17:20:16 +0200 Subject: [PATCH 12/13] move sanite_tables into test directly --- tests/utils/test_sanitize.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/utils/test_sanitize.py b/tests/utils/test_sanitize.py index fb7cd2024..b4999a48a 100644 --- a/tests/utils/test_sanitize.py +++ b/tests/utils/test_sanitize.py @@ -34,6 +34,8 @@ def invalid_table_with_index() -> AnnData: } ) ) + + # ----------------------------------------------------------------------------- # sanitize_name tests # ----------------------------------------------------------------------------- @@ -183,7 +185,13 @@ def test_sanitize_table_preserves_underlying_data(): # ----------------------------------------------------------------------------- -def test_sanitize_table_in_spatialdata_sanitized_fixture(sdata_sanitized_tables): +def test_sanitize_table_in_spatialdata_sanitized_fixture(invalid_table, invalid_table_with_index): + table1 = invalid_table.copy() + table2 = invalid_table_with_index.copy() + sanitize_table(table1) + sanitize_table(table2) + sdata_sanitized_tables = SpatialData(tables={"table1": table1, "table2": table2}) + t1 = sdata_sanitized_tables.tables["table1"] t2 = sdata_sanitized_tables.tables["table2"] assert list(t1.obs.columns) == ["_invalid_", "valid_name", "_private"] From b994960e0fb21856114785d8cdeb8977a92ce809 Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Tue, 27 May 2025 17:25:23 +0200 Subject: [PATCH 13/13] remove unnecessary used_names --- src/spatialdata/_core/_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/spatialdata/_core/_utils.py b/src/spatialdata/_core/_utils.py index ce1203438..a55815655 100644 --- a/src/spatialdata/_core/_utils.py +++ b/src/spatialdata/_core/_utils.py @@ -146,7 +146,6 @@ def get_unique_name(name: str, attr: str, is_dataframe_column: bool = False) -> counter += 1 base_name = f"{base_name}_{counter}" - used_names[attr].add(base_name) used_names_lower[attr].add(base_name.lower()) return base_name