From c458ad4a72015c552c87c2653c9c8190ef8a88c5 Mon Sep 17 00:00:00 2001 From: Vincent Gao Date: Wed, 24 Jun 2026 09:23:27 +0200 Subject: [PATCH] fix: ensure slugify is idempotent with replacement rules User replacements can break slugify(slugify(x)) == slugify(x) in two ways: 1. Direct self-reference: old appears in new (e.g. a -> aa), causing compound growth on re-invocation. 2. Indirect self-reference: new contains non-word characters that, after slugification, become old (e.g. dash -> dollar-x-dollar, where dollar chars become dashes, creating dash-x-dash which contains dash, triggering pass-1 replacement in the next call). Fix: - Skip replacement rules in both passes when old-in-new (direct) or old appears after slugifying new (indirect). - Run the disallowed-char pattern + dedup + strip after both replacement passes so non-word characters from replacements do not break idempotence. Non-cyclic replacements (like pipe->or, percent->percent) are unaffected. --- slugify/slugify.py | 42 ++++++++++++++++++++++++++++++++++-------- test.py | 23 +++++++++++++++++++++++ 2 files changed, 57 insertions(+), 8 deletions(-) diff --git a/slugify/slugify.py b/slugify/slugify.py index 9b5f27f..61e62b5 100644 --- a/slugify/slugify.py +++ b/slugify/slugify.py @@ -105,10 +105,32 @@ def slugify( :return (str): """ - # user-specific replacements + # determine the disallowed-char pattern early so both replacement + # passes can clean up non-word characters they introduce. + if allow_unicode: + _pattern = regex_pattern or DISALLOWED_UNICODE_CHARS_PATTERN + else: + _pattern = regex_pattern or DISALLOWED_CHARS_PATTERN + + # user-specific replacements (pass 1 -- before normalization) if replacements: for old, new in replacements: + if old: + # Skip replacements that form a self-referential cycle. + # Direct: old appears in `new` (e.g. 'a' → 'aa'). + # Indirect: slugifying `new` recreates `old` (e.g. '-' + # → '$x$', because '$' → '-' produces '-x-' which contains + # '-'); such rules would grow every time slugify is called + # again on its own output. + _cleaned = re.sub(_pattern, DEFAULT_SEPARATOR, new) + _cleaned = DUPLICATE_DASH_PATTERN.sub(DEFAULT_SEPARATOR, _cleaned) + if old in new or old in _cleaned: + continue text = text.replace(old, new) + # Clean up any non-word characters introduced by pass-1 replacements + # so they don't interfere with subsequent normalization / pass-2 passes. + text = re.sub(_pattern, DEFAULT_SEPARATOR, text) + text = DUPLICATE_DASH_PATTERN.sub(DEFAULT_SEPARATOR, text).strip(DEFAULT_SEPARATOR) # ensure text is unicode if not isinstance(text, str): @@ -163,12 +185,7 @@ def slugify( text = NUMBERS_PATTERN.sub('', text) # replace all other unwanted characters - if allow_unicode: - pattern = regex_pattern or DISALLOWED_UNICODE_CHARS_PATTERN - else: - pattern = regex_pattern or DISALLOWED_CHARS_PATTERN - - text = re.sub(pattern, DEFAULT_SEPARATOR, text) + text = re.sub(_pattern, DEFAULT_SEPARATOR, text) # remove redundant text = DUPLICATE_DASH_PATTERN.sub(DEFAULT_SEPARATOR, text).strip(DEFAULT_SEPARATOR) @@ -182,10 +199,19 @@ def slugify( words = [w for w in text.split(DEFAULT_SEPARATOR) if w not in stopwords] text = DEFAULT_SEPARATOR.join(words) - # finalize user-specific replacements + # finalize user-specific replacements (pass 2) if replacements: for old, new in replacements: + if old: + _cleaned = re.sub(_pattern, DEFAULT_SEPARATOR, new) + _cleaned = DUPLICATE_DASH_PATTERN.sub(DEFAULT_SEPARATOR, _cleaned) + if old in new or old in _cleaned: + continue text = text.replace(old, new) + # Clean up any non-word characters introduced by pass-2 replacements + # so that slugify(slugify(x)) == slugify(x). + text = re.sub(_pattern, DEFAULT_SEPARATOR, text) + text = DUPLICATE_DASH_PATTERN.sub(DEFAULT_SEPARATOR, text).strip(DEFAULT_SEPARATOR) # smart truncate if requested if max_length > 0: diff --git a/test.py b/test.py index fcec4b6..689be8f 100644 --- a/test.py +++ b/test.py @@ -237,6 +237,29 @@ def test_replacements_german_umlaut_custom(self): r = slugify(txt, replacements=[['Ü', 'UE'], ['ü', 'ue']]) self.assertEqual(r, "ueber-ueber-german-umlaut") + def test_replacements_idempotent(self): + """slugify(slugify(x)) == slugify(x) with replacements.""" + # Non-word chars in replacement 'new' values + self.assertEqual( + slugify(slugify('a.b', replacements=[['-', '$x$']]), + replacements=[['-', '$x$']]), + slugify('a.b', replacements=[['-', '$x$']])) + # Self-referential replacement (direct: old in new) + self.assertEqual( + slugify(slugify('a', replacements=[['a', 'aa']]), + replacements=[['a', 'aa']]), + slugify('a', replacements=[['a', 'aa']])) + # Self-referential through slugification (indirect) + self.assertEqual( + slugify(slugify('hello world', replacements=[['-', '$iqt']]), + replacements=[['-', '$iqt']]), + slugify('hello world', replacements=[['-', '$iqt']])) + # Non-cyclic replacements still work + self.assertEqual( + slugify(slugify('10 | 20 %', replacements=[['|', 'or'], ['%', 'percent']]), + replacements=[['|', 'or'], ['%', 'percent']]), + "10-or-20-percent") + def test_pre_translation(self): self.assertEqual(PRE_TRANSLATIONS, [('Ю', 'U'), ('Щ', 'Sch'), ('У', 'Y'), ('Х', 'H'), ('Я', 'Ya'), ('Ё', 'E'), ('ё', 'e'), ('я', 'ya'), ('х', 'h'), ('у', 'y'), ('щ', 'sch'), ('ю', 'u'), ('Ü', 'Ue'), ('Ö', 'Oe'), ('Ä', 'Ae'), ('ä', 'ae'), ('ö', 'oe'), ('ü', 'ue'), ('Ϋ́', 'Y'), ('Ϋ', 'Y'), ('Ύ', 'Y'), ('Υ', 'Y'), ('Χ', 'Ch'), ('χ', 'ch'), ('Ξ', 'X'), ('ϒ', 'Y'), ('υ', 'y'), ('ύ', 'y'), ('ϋ', 'y'), ('ΰ', 'y')])