diff --git a/HISTORY.rst b/HISTORY.rst index cc4f794..cf6dcc9 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -3,6 +3,22 @@ Release History --------------- +1.1.6 (2026-xx-xx) +++++++++++++++++++ + +**Updates** + +- Improve code quality with more Ruff linters + +**Fixes** + +- Fixes `#80 <https://github.com/dfop02/html4docx/issues/80>`_: Update logging to use module-level logger. | `dfop02 <https://github.com/dfop02>`_ + +**New Features** + +- None + + 1.1.5 (2026-04-17) ++++++++++++++++++ diff --git a/README.md b/README.md index cd71747..4667a3e 100644 --- a/README.md +++ b/README.md @@ -275,6 +275,43 @@ document.save('your_file_name.docx') You can find all available metadata attributes [here](https://python-docx.readthedocs.io/en/latest/dev/analysis/features/coreprops.html). +#### Logging + +html4docx uses Python's standard `logging` module with named, hierarchical loggers (for example, `html4docx.h4d`). The library never logs directly to the root logger and installs a `NullHandler` by default, so it remains silent unless your application configures logging. + +**Silence all html4docx logs:** + +```python +import logging + +logging.getLogger("html4docx").setLevel(logging.ERROR) # suppresses WARNING and below +``` + +**Enable debug logging:** + +```python +import logging + +logging.basicConfig(level=logging.DEBUG) +logging.getLogger("html4docx").setLevel(logging.DEBUG) +``` + +**Django / framework `LOGGING` dict** — add an entry for the `html4docx` parent and it applies to all sub-loggers: + +```python +LOGGING = { + "version": 1, + "loggers": { + "html4docx": { + "level": "ERROR", # suppresses WARNING and below + "propagate": False, + }, + }, +} +``` + +> **Note:** Unrecognised CSS properties (e.g. `letter-spacing`, `margin`, `padding`) are intentionally logged at `DEBUG` level because they are expected skips for any real-world HTML, not problems. Only genuinely unexpected situations (missing styles, unsupported colour formats, etc.) are logged at `WARNING`. 
+ ### Why My goal in forking and fixing/updating this package was to complete my current task at work, which involves converting HTML to DOCX. The original package lacked a few features and had some bugs, preventing me from completing the task. Instead of creating a new package from scratch, I preferred to update this one. @@ -295,6 +332,7 @@ My goal in forking and fixing/updating this package was to complete my current t - Fixed bug on styles parsing when style contains multiple colon. | [Dfop02](https://github.com/dfop02) - Fixed highlighting a single word | [Lynuxen](https://github.com/Lynuxen) - Fix color parsing failing due to invalid colors, falling back to black. | [dfop02](https://github.com/dfop02) from [Issue](https://github.com/dfop02/html4docx/issues/53) +- Fix logging noise: replace root-logger calls with named module loggers so consumers can silence or configure html4docx output independently. | [dfop02](https://github.com/dfop02) from [Issue](https://github.com/dfop02/html4docx/issues/80) **New Features** - Add Witdh/Height style to images | [maifeeulasad](https://github.com/maifeeulasad) from [PR](https://github.com/pqzx/html2docx/pull/29) diff --git a/html4docx/__init__.py b/html4docx/__init__.py index b50f8e5..3e66cbf 100644 --- a/html4docx/__init__.py +++ b/html4docx/__init__.py @@ -1 +1,4 @@ +import logging from .h4d import HtmlToDocx + +logging.getLogger(__name__).addHandler(logging.NullHandler()) diff --git a/html4docx/colors.py b/html4docx/colors.py index 5cc31b0..f30662a 100644 --- a/html4docx/colors.py +++ b/html4docx/colors.py @@ -1,5 +1,6 @@ from enum import Enum + # Reference colors from W3 # https://www.w3.org/wiki/CSS/Properties/color/keywords class Color(Enum): diff --git a/html4docx/constants.py b/html4docx/constants.py index de06b3a..eff13cb 100644 --- a/html4docx/constants.py +++ b/html4docx/constants.py @@ -1,4 +1,5 @@ import re + # values in inches from docx.enum.text import WD_UNDERLINE diff --git a/html4docx/h4d.py 
b/html4docx/h4d.py index 6b1ea6a..9722735 100644 --- a/html4docx/h4d.py +++ b/html4docx/h4d.py @@ -2,10 +2,9 @@ import logging import os import re -from functools import lru_cache +from functools import cache from html.parser import HTMLParser from io import BytesIO -from typing import Any, Dict import docx from bs4 import BeautifulSoup @@ -19,6 +18,7 @@ from html4docx import constants, utils from html4docx.metadata import Metadata +logger = logging.getLogger(__name__) class HtmlToDocx(HTMLParser): """ @@ -65,7 +65,7 @@ def set_initial_attrs(self, document=None): self.pending_important_styles = None @property - def metadata(self) -> Dict[str, Any]: + def metadata(self) -> Metadata: if not hasattr(self, "_metadata"): self._metadata = Metadata(self.doc) return self._metadata @@ -160,7 +160,7 @@ def apply_style_to_paragraph(self, paragraph, style_name): return True except KeyError: # Style doesn't exist in document - print(f"Warning: Style '{style_name}' not found in document. Using default.") + logger.warning(f"Style '{style_name}' not found in document. Using default.") return False def apply_style_to_run(self, style_name): @@ -178,12 +178,14 @@ def apply_style_to_run(self, style_name): self.run.style = style_name return True except KeyError: - print(f"Warning: Character style '{style_name}' not found in document.") + logger.warning(f"Character style '{style_name}' not found in document.") return False except ValueError as e: if "need type CHARACTER" in str(e): - print(f"Warning: '{style_name}' is a paragraph style, not a character style.") - print("For inline elements like , please create a character style in Word.") + logger.warning( + f"'{style_name}' is a paragraph style, not a character style. " + "For inline elements like , please create a character style in Word." 
+ ) return False def parse_inline_styles(self, style_string): @@ -310,14 +312,14 @@ def set_cell_borders(self, cell, styles): def parse_border_style(value: str) -> str: """Parses border styles to match word standart""" - return constants.BORDER_STYLES[value] if value in constants.BORDER_STYLES.keys() else "none" + return constants.BORDER_STYLES.get(value, "none") def check_unit_keywords(value: str) -> str: """Convert medium, thin, thick keywords to numeric values (px)""" lower_val = value.lower() return keywords.get(lower_val, value) - @lru_cache(maxsize=None) + @cache def border_unit_converter(unit_value: str): """Convert multiple units to pt that is used on Word table cell border""" unit_value = utils.remove_important_from_style(unit_value) @@ -481,12 +483,14 @@ def apply_styles_to_run(self, run, style, isCustom=False): run.style = style return except KeyError: - print(f"Warning: Character style '{style}' not found in document.") + logger.warning(f"Character style '{style}' not found in document.") return except ValueError as e: if "need type CHARACTER" in str(e): - print(f"Warning: '{style}' is a paragraph style, not a character style.") - print("For inline elements like , please create a character style in Word.") + logger.warning( + f"'{style}' is a paragraph style, not a character style. " + "For inline elements like , please create a character style in Word." 
+ ) if not style or not hasattr(run, "font"): return @@ -528,7 +532,7 @@ def apply_styles_to_run(self, run, style, isCustom=False): param_name = style_name.replace("-", "_") handler(run=run, **{param_name: style_value}) else: - logging.warning(f"Warning: Unrecognized style '{style_name}', will be skipped.") + logger.debug(f"Unrecognized style '{style_name}', will be skipped.") def apply_styles_to_paragraph(self, paragraph, style, isCustom=False): if isCustom: @@ -536,7 +540,7 @@ def apply_styles_to_paragraph(self, paragraph, style, isCustom=False): paragraph.style = style return except KeyError: - print(f"Warning: Style '{style}' not found in document. Using default.") + logger.warning(f"Style '{style}' not found in document. Using default.") return if not style or not hasattr(paragraph, "paragraph_format"): @@ -548,7 +552,7 @@ def apply_styles_to_paragraph(self, paragraph, style, isCustom=False): elif style_name in constants.PARAGRAPH_RUN_STYLES: handler = getattr(self, constants.PARAGRAPH_RUN_STYLES[style_name]) else: - logging.warning(f"Warning: Unrecognized paragraph style '{style_name}', will be skipped.") + logger.debug(f"Unrecognized paragraph style '{style_name}', will be skipped.") continue handler(paragraph=paragraph, style_name=style_name, value=style_value, all_styles=style) @@ -596,10 +600,9 @@ def _apply_margins_paragraph(self, **kwargs): margin_left = all_styles.get("margin-left") margin_right = all_styles.get("margin-right") - if margin_left and margin_right: - if "auto" in margin_left and "auto" in margin_right: - paragraph.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER - return + if margin_left and margin_right and "auto" in margin_left and "auto" in margin_right: + paragraph.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER + return if style_name == "margin-left" and margin_left and "auto" not in margin_left: paragraph.paragraph_format.left_indent = utils.unit_converter(margin_left) @@ -709,7 +712,7 @@ def 
_apply_font_size_to_run(self, **kwargs): run.font.size = converted_size except (ValueError, TypeError) as e: - logging.warning(f"Warning: Could not parse font-size '{font_size}': {e}") + logger.warning(f"Warning: Could not parse font-size '{font_size}': {e}") def _apply_font_family_paragraph(self, **kwargs): paragraph = kwargs["paragraph"] @@ -758,7 +761,7 @@ def _apply_font_family_to_run(self, **kwargs): break except (AttributeError, Exception) as e: - logging.warning(f"Warning: Could not apply font-family '{font_family}': {e}") + logger.warning(f"Warning: Could not apply font-family '{font_family}': {e}") def _apply_color_paragraph(self, **kwargs): paragraph = kwargs["paragraph"] @@ -785,7 +788,7 @@ def _apply_color_to_run(self, **kwargs): colors = utils.parse_color(color_value) run.font.color.rgb = RGBColor(*colors) except (ValueError, AttributeError) as e: - logging.warning(f"Could not apply color '{color_value}': {e}") + logger.warning(f"Could not apply color '{color_value}': {e}") def _apply_text_transform_paragraph(self, **kwargs): paragraph = kwargs["paragraph"] @@ -823,10 +826,10 @@ def _apply_text_transform_to_run(self, **kwargs): # No transformation needed pass elif text_transform in ("full-width", "math-auto", "full-size-kana"): - logging.warning(f"Warning: Unsupported text transform '{text_transform}'") + logger.warning(f"Warning: Unsupported text transform '{text_transform}'") except (AttributeError, Exception) as e: - logging.warning(f"Warning: Could not apply text-transform '{text_transform}': {e}") + logger.warning(f"Warning: Could not apply text-transform '{text_transform}': {e}") def _apply_text_decoration_paragraph(self, **kwargs): paragraph = kwargs["paragraph"] @@ -920,7 +923,7 @@ def _apply_text_decoration_line_to_run(self, **kwargs): run.font.underline = False run.font.strike = False else: - logging.warning(f"Warning: Unsupported text decoration '{text_decoration_line}'") + logger.warning(f"Warning: Unsupported text decoration 
'{text_decoration_line}'") def _apply_text_decoration_style_to_run(self, **kwargs): run = kwargs["run"] @@ -942,7 +945,7 @@ def _apply_text_decoration_style_to_run(self, **kwargs): try: run.font.underline = constants.FONT_UNDERLINE_STYLES[text_decoration_style] except KeyError: - logging.warning(f"Warning: Style not recognized'{text_decoration_style}', defaulting to single line.") + logger.warning(f"Warning: Style not recognized'{text_decoration_style}', defaulting to single line.") # Mark that we applied a text-decoration style by adding text-decoration-line to span_styles paragraph_id = id(self.paragraph) @@ -973,7 +976,7 @@ def _apply_background_color_paragraph(self, **kwargs): if background_color in ("inherit", "initial"): return elif background_color in ("transparent", "none"): - logging.warning(f"Warning: Unsupported background color '{background_color}'") + logger.warning(f"Warning: Unsupported background color '{background_color}'") return try: @@ -993,7 +996,7 @@ def _apply_background_color_paragraph(self, **kwargs): ) except Exception as e: - logging.warning(f"Could not apply background-color to paragraph: {e}") + logger.warning(f"Could not apply background-color to paragraph: {e}") def _apply_background_color_to_run(self, **kwargs): run = kwargs["run"] @@ -1002,7 +1005,7 @@ def _apply_background_color_to_run(self, **kwargs): if background_color in ("inherit", "initial"): return elif background_color in ("transparent", "none"): - logging.warning(f"Warning: Unsupported background color '{background_color}'") + logger.warning(f"Warning: Unsupported background color '{background_color}'") return color_hex = utils.parse_color(background_color, return_hex=True) @@ -1024,7 +1027,7 @@ def _apply_background_color_to_run(self, **kwargs): r_pr.append(shd) except Exception as e: - logging.warning(f"Could not apply background-color to run: {e}") + logger.warning(f"Could not apply background-color to run: {e}") def add_text_align_or_margin_to(self, obj, style): 
"""Styles that can be applied on multiple objects""" @@ -1082,7 +1085,7 @@ def add_styles_to_table_cell(self, styles, doc_cell, cell_row): doc_cell.vertical_alignment = WD_ALIGN_VERTICAL.BOTTOM # Set borders - if any("border" in style for style in styles.keys()): + if any("border" in style for style in styles): self.set_cell_borders(doc_cell, styles) self.add_text_align_or_margin_to(doc_cell.paragraphs[0], styles) @@ -1221,10 +1224,10 @@ def handle_img(self, current_attrs): if not image: if utils.is_url(src): - self.doc.add_paragraph("" % src) + self.doc.add_paragraph(f"") else: # avoid exposing filepaths in document - self.doc.add_paragraph("" % utils.get_filename_from_url(src)) + self.doc.add_paragraph(f"") """ #adding style @@ -1283,7 +1286,7 @@ def handle_table(self, current_attrs): cell_html = self.get_cell_html(col) if col.name == "th": - cell_html = "%s" % cell_html + cell_html = f"{cell_html}" # Get _Cell object from table based on cell_row and cell_col docx_cell = self.table.cell(current_row, current_col) @@ -1510,7 +1513,7 @@ def handle_starttag(self, tag, attrs): if custom_style: valid_style = utils.check_style_exists(self.doc, custom_style) if not valid_style: - logging.warning(f"Warning: Custom style '{custom_style}' not found in document, Ignoring style.") + logger.warning(f"Warning: Custom style '{custom_style}' not found in document, Ignoring style.") custom_style = None if tag in ["p", "pre"]: @@ -1618,7 +1621,7 @@ def handle_endtag(self, tag): self.pending_important_styles = None if self.skip: - if not tag == self.skip_tag: + if tag != self.skip_tag: return if self.instances_to_skip > 0: @@ -1698,7 +1701,7 @@ def handle_data(self, data): if tag == "div" and "style" in attrs: div_style = utils.parse_dict_string(attrs["style"]) - for span_style_name in span_style.keys(): + for span_style_name in span_style: if span_style_name in div_style: del div_style[span_style_name] @@ -1863,7 +1866,7 @@ def add_html_to_cell(self, html: str, cell: 
docx.table._Cell) -> None: self.doc.add_paragraph("") def parse_html_file(self, filename_html: str, filename_docx, encoding: str = "utf-8") -> None: - with open(filename_html, "r", encoding=encoding) as infile: + with open(filename_html, encoding=encoding) as infile: html = infile.read() self.set_initial_attrs() diff --git a/html4docx/metadata.py b/html4docx/metadata.py index a709f9e..78a794c 100644 --- a/html4docx/metadata.py +++ b/html4docx/metadata.py @@ -1,8 +1,12 @@ import json +import logging from datetime import datetime + from docx import Document -class Metadata(): +logger = logging.getLogger(__name__) + +class Metadata: """Handle docx document metadata""" def __init__(self, document: Document): self.document = document @@ -28,18 +32,18 @@ def set_metadata(self, **kwargs) -> None: try: value = int(value) except ValueError: - print(f'Invalid revision number "{value}". Must be an integer. Skipping...') + logger.warning(f'Invalid revision number "{value}". Must be an integer. Skipping...') continue elif key in ['last_printed', 'modified', 'created']: try: value = datetime.fromisoformat(value) except ValueError: - print(f'Invalid datetime string on property "{key}", must be in ISO format. Skipping...') + logger.warning(f'Invalid datetime string on property "{key}", must be in ISO format. Skipping...') continue setattr(core_props, key, value) else: - print(f'Property "{key}" not found in core properties. Skipping...') + logger.warning(f'Property "{key}" not found in core properties. 
Skipping...') def get_metadata(self, print_result: bool = False): """ diff --git a/html4docx/utils.py b/html4docx/utils.py index 9633e27..8fd2d66 100644 --- a/html4docx/utils.py +++ b/html4docx/utils.py @@ -13,6 +13,7 @@ from html4docx import constants from html4docx.colors import Color +logger = logging.getLogger(__name__) class ImageAlignment(Enum): LEFT = 1 @@ -43,7 +44,7 @@ def rgb_to_hex(rgb: str): def adapt_font_size(size: str): - if size in constants.FONT_SIZES_NAMED.keys(): + if size in constants.FONT_SIZES_NAMED: return constants.FONT_SIZES_NAMED[size] return size @@ -154,7 +155,7 @@ def unit_converter(unit_value: str, target_unit: str = "pt"): if unit in conversion_to_pt: value_in_pt = conversion_to_pt[unit] else: - print(f"Warning: unsupported unit {unit}, return None instead.") + logger.warning(f"Unsupported CSS unit '{unit}', returning None.") return None # Clamp the value to MAX_INDENT (in points) @@ -216,7 +217,7 @@ def parse_color(original_color: str, return_hex: bool = False): color = re.sub(r"[^0-9,]", "", color) colors = [int(x) for x in color.split(",")] colors = colors[:3] # remove opacity because it's not supported by python-docx - logging.warning("RGBA color is not supported by python-docx. Opacity will be ignored.") + logger.warning("RGBA color is not supported by python-docx. Opacity will be ignored.") elif "rgb" in color: color = re.sub(r"[^0-9,]", "", color) colors = [int(x) for x in color.split(",")] @@ -230,10 +231,10 @@ def parse_color(original_color: str, return_hex: bool = False): colors = Color[color].value else: colors = [0, 0, 0] # Default to black for unexpected colors - logging.warning(f"Could not parse color '{original_color}': Invalid color value. Fallback to black.") + logger.warning(f"Could not parse color '{original_color}': Invalid color value. Fallback to black.") except Exception: colors = [0, 0, 0] # Default to black for errors - logging.warning(f"Could not parse color '{original_color}': Invalid color value. 
Fallback to black.") + logger.warning(f"Could not parse color '{original_color}': Invalid color value. Fallback to black.") return rgb_to_hex(colors) if return_hex else colors @@ -373,10 +374,10 @@ def parse_text_decoration(text_decoration): result["color"] = token elif token in ("blink", "overline"): result["line_style"] = None - logging.warning("Blink or overline not supported.") + logger.warning("Blink or overline not supported.") if result["line_type"] == "line-through" and result["color"] is not None: - logging.warning( + logger.warning( f"Word does not support colored strike-through. Color '{result['color']}' will be ignored for line-through." ) return result diff --git a/pyproject.toml b/pyproject.toml index ee4f1bc..577b5ca 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,10 +8,25 @@ exclude = [ ] [tool.ruff.lint] +select = [ + # pycodestyle + "E", + # Pyflakes + "F", + # pyupgrade + "UP", + # flake8-bugbear + "B", + # flake8-simplify + "SIM", + # isort + "I", +] ignore = [ - "W292", # no newline at end of file - "E261", # at least two spaces before inline comment - "E302", # expected 2 blank lines - "E305", # expected 2 blank lines after class or function definition + "W292", # no newline at end of file + "E261", # at least two spaces before inline comment + "E302", # expected 2 blank lines + "E305", # expected 2 blank lines after class or function definition + "E501", # Line is too long. 
+ "B905", # Checks for `zip` calls without an explicit `strict` parameter ] - diff --git a/tests/test_h4d.py b/tests/test_h4d.py index b4ac2bb..e3ad55e 100644 --- a/tests/test_h4d.py +++ b/tests/test_h4d.py @@ -27,7 +27,7 @@ def clean_up_docx(): @staticmethod def get_html_from_file(filename: str): file_path = Path(f"{test_dir}/assets/htmls") / Path(filename) - with open(file_path, "r") as f: + with open(file_path) as f: html = f.read() return html @@ -215,7 +215,7 @@ def test_wrong_argument_type_raises_error(self): assert isinstance(e, ValueError) assert "First argument needs to be a " in str(e) else: - assert False, "Error not raised as expected" + raise AssertionError("Error not raised as expected") try: self.parser.add_html_to_document(self.text1, self.text1) @@ -224,7 +224,7 @@ def test_wrong_argument_type_raises_error(self): assert "Second argument" in str(e) assert "" in str(e) else: - assert False, "Error not raised as expected" + raise AssertionError("Error not raised as expected") def test_add_html_to_cells_method(self): self.document.add_heading("Test: add_html_to_cells method", level=1) @@ -492,7 +492,7 @@ def test_font_size(self): document = self.parser.parse_html_string(font_size_html_example) font_sizes = [str(p.runs[0].font.size) for p in document.paragraphs] - assert ["76200", "355600", "914400", "431800", "None", "762000", "177800", "203200", "69850", "120650"] == font_sizes + assert font_sizes == ["76200", "355600", "914400", "431800", "None", "762000", "177800", "203200", "69850", "120650"] def test_font_size_paragraph(self): font_size_html_example = ( @@ -513,7 +513,7 @@ def test_font_size_paragraph(self): document = self.parser.parse_html_string(font_size_html_example) font_sizes = [str(p.runs[0].font.size) for p in document.paragraphs] - assert ["76200", "355600", "914400", "431800", "None", "762000", "177800", "203200", "69850", "120650"] == font_sizes + assert font_sizes == ["76200", "355600", "914400", "431800", "None", "762000", "177800", 
"203200", "69850", "120650"] def test_font_weight_paragraph(self): self.document.add_heading("Test: font weight on

", level=1) @@ -1459,10 +1459,7 @@ def test_table_cell_background_color(self): # Get the background color (shading) if it exists shading = tcPr.find(qn("w:shd")) - if shading is not None: - background_color = shading.get(qn("w:fill"), "").upper() # Ensure uppercase and no # - else: - background_color = "None" + background_color = shading.get(qn("w:fill"), "").upper() if shading is not None else "None" # Get expected background color for the current cell expected_color = expected_background_colors[cell_idx].upper() @@ -1521,11 +1518,8 @@ def test_table_cell_dimensions(self): # Validate dimensions for each cell for row_idx, row in enumerate(document.tables[0].rows): for cell_idx, cell in enumerate(row.cells): - # Get the table cell element and properties - docx_cell = document.tables[0].cell(row_idx, cell_idx) - # Convert width from EMUs to px - cell_width_px = round((docx_cell.width / 914400) * 96, 2) # 1 EMU = 1/914400 inch, 1 inch = 96px + cell_width_px = round((cell.width / 914400) * 96, 2) # 1 EMU = 1/914400 inch, 1 inch = 96px # Get expected width and convert it to points using unit_converter expected_width = expected_dimensions[row_idx][cell_idx]["width"] expected_width_px = unit_converter(expected_width, "px") @@ -2228,10 +2222,16 @@ def test_code_and_pre_tag_overrides(self): ) def test_custom_style_not_found_warning(self): - """Warn when tag_style_override names a style that does not exist in the document.""" + """Warn when tag_style_override names a style that does not exist in the document. + + Records must be emitted on the named 'html4docx.h4d' logger so that consumers + can silence or configure them via their logging config without affecting the + root logger (see GitHub issue #80). + """ doc = Document() parser = HtmlToDocx(tag_style_overrides={"h1": "NonExistentStyle"}) - with self.assertLogs(level=logging.WARNING) as cm: + # Use the named logger so the assertion is tied to 'html4docx.h4d', not the root logger. 
+ with self.assertLogs("html4docx.h4d", level=logging.WARNING) as cm: parser.add_html_to_document("

Heading

", doc) self.assertIn( @@ -2773,6 +2773,94 @@ def test_invalid_rowspan_and_colspan(self): self.assertEqual(table.cell(0, 0).text.strip(), "Test 1") + def test_logger_is_named_html4docx_h4d(self): + """The module-level logger must be named 'html4docx.h4d' so consumers can + target it in their logging config (e.g. Django LOGGING dict). Issue #80.""" + import html4docx.h4d as h4d_module + + self.assertEqual(h4d_module.logger.name, "html4docx.h4d") + + def test_package_logger_has_null_handler(self): + """The package-level 'html4docx' logger carries a NullHandler so the library + is silent by default. Child loggers (html4docx.h4d, etc.) inherit it via + propagation — no NullHandler is needed on each module logger. Issue #80.""" + + package_logger = logging.getLogger("html4docx") + handler_types = [type(h) for h in package_logger.handlers] + self.assertIn( + logging.NullHandler, + handler_types, + "html4docx package logger should have a NullHandler attached", + ) + + def test_unrecognized_css_style_emits_debug_not_warning(self): + """CSS properties that html4docx does not support (e.g. 'letter-spacing', 'padding') + must be logged at DEBUG level, not WARNING. They are expected skips for any + real-world HTML and should not pollute production logs. Issue #80.""" + doc = Document() + parser = HtmlToDocx() + # 'letter-spacing' and 'padding' are unrecognised paragraph styles -> DEBUG + # 'margin-left' is recognised -> no log at all + html = '

text

' + + with self.assertLogs("html4docx.h4d", level=logging.DEBUG) as cm: + parser.add_html_to_document(html, doc) + + # Must have produced at least one DEBUG record for each skipped property + debug_records = [r for r in cm.output if r.startswith("DEBUG")] + self.assertTrue( + any("letter-spacing" in r for r in debug_records), + f"Expected a DEBUG record mentioning 'letter-spacing'; got: {cm.output}", + ) + self.assertTrue( + any("padding" in r for r in debug_records), + f"Expected a DEBUG record mentioning 'padding'; got: {cm.output}", + ) + # Must not have produced any WARNING records for these routine skips + warning_records = [r for r in cm.output if r.startswith("WARNING")] + self.assertEqual( + warning_records, + [], + "Unknown CSS properties must not generate WARNING log records; " + f"got: {warning_records}", + ) + + def test_warning_logger_is_silenceable_via_named_logger(self): + """Consumers must be able to suppress html4docx warnings by configuring the + 'html4docx' logger without touching the root logger. Issue #80.""" + + named_logger = logging.getLogger("html4docx.h4d") + original_level = named_logger.level + try: + # Raise the effective level to ERROR — warnings should now be invisible. + named_logger.setLevel(logging.ERROR) + doc = Document() + parser = HtmlToDocx(tag_style_overrides={"h1": "NonExistentStyle"}) + + # assertLogs would raise if no records are emitted at the requested level, + # so we use a manual handler to verify silence instead. + captured = [] + + class _Capture(logging.Handler): + def emit(self, record): + captured.append(record) + + handler = _Capture(level=logging.WARNING) + named_logger.addHandler(handler) + try: + parser.add_html_to_document("

Heading

", doc) + finally: + named_logger.removeHandler(handler) + + self.assertEqual( + captured, + [], + "Setting html4docx.h4d to ERROR should suppress WARNING records; " + f"got: {[r.getMessage() for r in captured]}", + ) + finally: + named_logger.setLevel(original_level) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 94a5e28..e4b9ed3 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -1,9 +1,13 @@ -import pytest +import logging +from datetime import datetime from io import BytesIO + +import pytest from docx import Document -from html4docx.metadata import Metadata + from html4docx import HtmlToDocx -from datetime import datetime +from html4docx.metadata import Metadata + @pytest.fixture def empty_doc(): @@ -20,27 +24,32 @@ def test_set_and_get_standard_metadata(metadata_obj): assert props["title"] == "The Robert Success" assert props["revision"] == 3 -def test_invalid_revision_type(metadata_obj, capsys): - metadata_obj.set_metadata(revision="not_a_number") - captured = capsys.readouterr() - assert "Invalid revision number" in captured.out +def test_invalid_revision_type(metadata_obj, caplog): + """Invalid revision must emit a WARNING on the 'html4docx.metadata' logger, not print to stdout.""" + with caplog.at_level(logging.WARNING, logger="html4docx.metadata"): + metadata_obj.set_metadata(revision="not_a_number") + assert any("Invalid revision number" in r.message for r in caplog.records) -def test_invalid_datetime_string(metadata_obj, capsys): - metadata_obj.set_metadata(modified="2025-18-99T10:00:00") - captured = capsys.readouterr() - assert "Invalid datetime string" in captured.out +def test_invalid_datetime_string(metadata_obj, caplog): + """Invalid ISO datetime must emit a WARNING on the 'html4docx.metadata' logger, not print to stdout.""" + with caplog.at_level(logging.WARNING, logger="html4docx.metadata"): + metadata_obj.set_metadata(modified="2025-18-99T10:00:00") + assert 
any("Invalid datetime string" in r.message for r in caplog.records) def test_valid_datetime_string(metadata_obj): metadata_obj.set_metadata(modified="2025-07-18T10:00:00") props = metadata_obj.get_metadata() assert isinstance(props["modified"], datetime) -def test_unrecognized_property(metadata_obj, capsys): - metadata_obj.set_metadata(nonexistent="something") - captured = capsys.readouterr() - assert 'Property "nonexistent" not found' in captured.out + +def test_unrecognized_property(metadata_obj, caplog): + """Unrecognized core property must emit a WARNING on the 'html4docx.metadata' logger, not print to stdout.""" + with caplog.at_level(logging.WARNING, logger="html4docx.metadata"): + metadata_obj.set_metadata(nonexistent="something") + assert any('Property "nonexistent" not found' in r.message for r in caplog.records) def test_print_metadata(capsys, metadata_obj): + """get_metadata(print_result=True) should still print to stdout — this is intentional output.""" metadata_obj.set_metadata(author="Test Author") metadata_obj.get_metadata(print_result=True) captured = capsys.readouterr() @@ -74,3 +83,46 @@ def test_metadata_integration_with_html4docx(empty_doc): assert reloaded_props.author == "Jane" assert isinstance(reloaded_props.created, datetime) assert reloaded_props.created.isoformat().startswith("2025-07-18T09:30") + +def test_metadata_logger_is_named(): + """The Metadata module must expose a named logger 'html4docx.metadata' so consumers + can target it in their logging config. Issue #80.""" + import html4docx.metadata as meta_module + assert meta_module.logger.name == "html4docx.metadata" + +def test_package_logger_has_null_handler(): + """The package-level 'html4docx' logger carries a NullHandler so the library + is silent by default. Child loggers inherit it via propagation — no NullHandler + is needed on each module logger. 
Issue #80.""" + package_logger = logging.getLogger("html4docx") + handler_types = [type(h) for h in package_logger.handlers] + assert logging.NullHandler in handler_types, ( + "html4docx package logger should have a NullHandler attached" + ) + +def test_metadata_warnings_are_silenceable(metadata_obj): + """Consumers must be able to suppress html4docx.metadata warnings by raising its level + without touching the root logger. Issue #80.""" + named_logger = logging.getLogger("html4docx.metadata") + original_level = named_logger.level + try: + named_logger.setLevel(logging.ERROR) + captured = [] + + class _Capture(logging.Handler): + def emit(self, record): + captured.append(record) + + handler = _Capture(level=logging.WARNING) + named_logger.addHandler(handler) + try: + metadata_obj.set_metadata(revision="bad", nonexistent="x") + finally: + named_logger.removeHandler(handler) + + assert captured == [], ( + "Setting html4docx.metadata to ERROR should suppress WARNING records; " + f"got: {[r.getMessage() for r in captured]}" + ) + finally: + named_logger.setLevel(original_level)