Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,22 @@
Release History
---------------

1.1.6 (2026-xx-xx)
++++++++++++++++++

**Updates**

- Improve code quality with more Ruff linters

**Fixes**

- Fixes `#80 <https://github.com/dfop02/html4docx/issues/80>`_: Update logging to use module-level logger. | `dfop02 <https://github.com/dfop02>`_

**New Features**

- None


1.1.5 (2026-04-17)
++++++++++++++++++

Expand Down
38 changes: 38 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,43 @@ document.save('your_file_name.docx')

You can find all available metadata attributes [here](https://python-docx.readthedocs.io/en/latest/dev/analysis/features/coreprops.html).

#### Logging

html4docx uses Python's standard `logging` module with named, hierarchical loggers (for example, `html4docx.h4d`). The library never logs directly to the root logger and installs a `NullHandler` by default, so it remains silent unless your application configures logging.

**Silence html4docx warnings (only `ERROR` and above are emitted):**

```python
import logging

logging.getLogger("html4docx").setLevel(logging.ERROR)  # suppresses WARNING and below
```

**Enable debug logging:**

```python
import logging

logging.basicConfig(level=logging.DEBUG)
logging.getLogger("html4docx").setLevel(logging.DEBUG)
```

**Django / framework `LOGGING` dict** — add an entry for the `html4docx` parent and it applies to all sub-loggers:

```python
LOGGING = {
"version": 1,
"loggers": {
"html4docx": {
"level": "ERROR", # suppresses WARNING and below
"propagate": False,
},
},
}
```

> **Note:** Unrecognised CSS properties (e.g. `letter-spacing`, `margin`, `padding`) are intentionally logged at `DEBUG` level because they are expected skips for any real-world HTML, not problems. Only genuinely unexpected situations (missing styles, unsupported colour formats, etc.) are logged at `WARNING`.

### Why

My goal in forking and fixing/updating this package was to complete my current task at work, which involves converting HTML to DOCX. The original package lacked a few features and had some bugs, preventing me from completing the task. Instead of creating a new package from scratch, I preferred to update this one.
Expand All @@ -295,6 +332,7 @@ My goal in forking and fixing/updating this package was to complete my current t
- Fixed bug in style parsing when a style contains multiple colons. | [Dfop02](https://github.com/dfop02)
- Fixed highlighting a single word | [Lynuxen](https://github.com/Lynuxen)
- Fix color parsing failing due to invalid colors, falling back to black. | [dfop02](https://github.com/dfop02) from [Issue](https://github.com/dfop02/html4docx/issues/53)
- Fix logging noise: replace root-logger calls with named module loggers so consumers can silence or configure html4docx output independently. | [dfop02](https://github.com/dfop02) from [Issue](https://github.com/dfop02/html4docx/issues/80)

**New Features**
- Add Width/Height style to images | [maifeeulasad](https://github.com/maifeeulasad) from [PR](https://github.com/pqzx/html2docx/pull/29)
Expand Down
3 changes: 3 additions & 0 deletions html4docx/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
import logging
from .h4d import HtmlToDocx

logging.getLogger(__name__).addHandler(logging.NullHandler())
1 change: 1 addition & 0 deletions html4docx/colors.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from enum import Enum


# Reference colors from W3
# https://www.w3.org/wiki/CSS/Properties/color/keywords
class Color(Enum):
Expand Down
1 change: 1 addition & 0 deletions html4docx/constants.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import re

# values in inches
from docx.enum.text import WD_UNDERLINE

Expand Down
79 changes: 41 additions & 38 deletions html4docx/h4d.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,9 @@
import logging
import os
import re
from functools import lru_cache
from functools import cache
from html.parser import HTMLParser
from io import BytesIO
from typing import Any, Dict

import docx
from bs4 import BeautifulSoup
Expand All @@ -19,6 +18,7 @@
from html4docx import constants, utils
from html4docx.metadata import Metadata

logger = logging.getLogger(__name__)

class HtmlToDocx(HTMLParser):
"""
Expand Down Expand Up @@ -65,7 +65,7 @@ def set_initial_attrs(self, document=None):
self.pending_important_styles = None

@property
def metadata(self) -> Dict[str, Any]:
def metadata(self) -> Metadata:
if not hasattr(self, "_metadata"):
self._metadata = Metadata(self.doc)
return self._metadata
Expand Down Expand Up @@ -160,7 +160,7 @@ def apply_style_to_paragraph(self, paragraph, style_name):
return True
except KeyError:
# Style doesn't exist in document
print(f"Warning: Style '{style_name}' not found in document. Using default.")
logger.warning(f"Style '{style_name}' not found in document. Using default.")
return False

def apply_style_to_run(self, style_name):
Expand All @@ -178,12 +178,14 @@ def apply_style_to_run(self, style_name):
self.run.style = style_name
return True
except KeyError:
print(f"Warning: Character style '{style_name}' not found in document.")
logger.warning(f"Character style '{style_name}' not found in document.")
return False
except ValueError as e:
if "need type CHARACTER" in str(e):
print(f"Warning: '{style_name}' is a paragraph style, not a character style.")
print("For inline elements like <code>, please create a character style in Word.")
logger.warning(
f"'{style_name}' is a paragraph style, not a character style. "
"For inline elements like <code>, please create a character style in Word."
)
return False

def parse_inline_styles(self, style_string):
Expand Down Expand Up @@ -310,14 +312,14 @@ def set_cell_borders(self, cell, styles):

def parse_border_style(value: str) -> str:
"""Parses border styles to match word standart"""
return constants.BORDER_STYLES[value] if value in constants.BORDER_STYLES.keys() else "none"
return constants.BORDER_STYLES.get(value, "none")

def check_unit_keywords(value: str) -> str:
"""Convert medium, thin, thick keywords to numeric values (px)"""
lower_val = value.lower()
return keywords.get(lower_val, value)

@lru_cache(maxsize=None)
@cache
def border_unit_converter(unit_value: str):
"""Convert multiple units to pt that is used on Word table cell border"""
unit_value = utils.remove_important_from_style(unit_value)
Expand Down Expand Up @@ -481,12 +483,14 @@ def apply_styles_to_run(self, run, style, isCustom=False):
run.style = style
return
except KeyError:
print(f"Warning: Character style '{style}' not found in document.")
logger.warning(f"Character style '{style}' not found in document.")
return
except ValueError as e:
if "need type CHARACTER" in str(e):
print(f"Warning: '{style}' is a paragraph style, not a character style.")
print("For inline elements like <code>, please create a character style in Word.")
logger.warning(
f"'{style}' is a paragraph style, not a character style. "
"For inline elements like <code>, please create a character style in Word."
)

if not style or not hasattr(run, "font"):
return
Expand Down Expand Up @@ -528,15 +532,15 @@ def apply_styles_to_run(self, run, style, isCustom=False):
param_name = style_name.replace("-", "_")
handler(run=run, **{param_name: style_value})
else:
logging.warning(f"Warning: Unrecognized style '{style_name}', will be skipped.")
logger.debug(f"Unrecognized style '{style_name}', will be skipped.")

def apply_styles_to_paragraph(self, paragraph, style, isCustom=False):
if isCustom:
try:
paragraph.style = style
return
except KeyError:
print(f"Warning: Style '{style}' not found in document. Using default.")
logger.warning(f"Style '{style}' not found in document. Using default.")
return

if not style or not hasattr(paragraph, "paragraph_format"):
Expand All @@ -548,7 +552,7 @@ def apply_styles_to_paragraph(self, paragraph, style, isCustom=False):
elif style_name in constants.PARAGRAPH_RUN_STYLES:
handler = getattr(self, constants.PARAGRAPH_RUN_STYLES[style_name])
else:
logging.warning(f"Warning: Unrecognized paragraph style '{style_name}', will be skipped.")
logger.debug(f"Unrecognized paragraph style '{style_name}', will be skipped.")
continue

handler(paragraph=paragraph, style_name=style_name, value=style_value, all_styles=style)
Expand Down Expand Up @@ -596,10 +600,9 @@ def _apply_margins_paragraph(self, **kwargs):
margin_left = all_styles.get("margin-left")
margin_right = all_styles.get("margin-right")

if margin_left and margin_right:
if "auto" in margin_left and "auto" in margin_right:
paragraph.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER
return
if margin_left and margin_right and "auto" in margin_left and "auto" in margin_right:
paragraph.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER
return

if style_name == "margin-left" and margin_left and "auto" not in margin_left:
paragraph.paragraph_format.left_indent = utils.unit_converter(margin_left)
Expand Down Expand Up @@ -709,7 +712,7 @@ def _apply_font_size_to_run(self, **kwargs):
run.font.size = converted_size

except (ValueError, TypeError) as e:
logging.warning(f"Warning: Could not parse font-size '{font_size}': {e}")
logger.warning(f"Warning: Could not parse font-size '{font_size}': {e}")

def _apply_font_family_paragraph(self, **kwargs):
paragraph = kwargs["paragraph"]
Expand Down Expand Up @@ -758,7 +761,7 @@ def _apply_font_family_to_run(self, **kwargs):
break

except (AttributeError, Exception) as e:
logging.warning(f"Warning: Could not apply font-family '{font_family}': {e}")
logger.warning(f"Warning: Could not apply font-family '{font_family}': {e}")

def _apply_color_paragraph(self, **kwargs):
paragraph = kwargs["paragraph"]
Expand All @@ -785,7 +788,7 @@ def _apply_color_to_run(self, **kwargs):
colors = utils.parse_color(color_value)
run.font.color.rgb = RGBColor(*colors)
except (ValueError, AttributeError) as e:
logging.warning(f"Could not apply color '{color_value}': {e}")
logger.warning(f"Could not apply color '{color_value}': {e}")

def _apply_text_transform_paragraph(self, **kwargs):
paragraph = kwargs["paragraph"]
Expand Down Expand Up @@ -823,10 +826,10 @@ def _apply_text_transform_to_run(self, **kwargs):
# No transformation needed
pass
elif text_transform in ("full-width", "math-auto", "full-size-kana"):
logging.warning(f"Warning: Unsupported text transform '{text_transform}'")
logger.warning(f"Warning: Unsupported text transform '{text_transform}'")

except (AttributeError, Exception) as e:
logging.warning(f"Warning: Could not apply text-transform '{text_transform}': {e}")
logger.warning(f"Warning: Could not apply text-transform '{text_transform}': {e}")

def _apply_text_decoration_paragraph(self, **kwargs):
paragraph = kwargs["paragraph"]
Expand Down Expand Up @@ -920,7 +923,7 @@ def _apply_text_decoration_line_to_run(self, **kwargs):
run.font.underline = False
run.font.strike = False
else:
logging.warning(f"Warning: Unsupported text decoration '{text_decoration_line}'")
logger.warning(f"Warning: Unsupported text decoration '{text_decoration_line}'")

def _apply_text_decoration_style_to_run(self, **kwargs):
run = kwargs["run"]
Expand All @@ -942,7 +945,7 @@ def _apply_text_decoration_style_to_run(self, **kwargs):
try:
run.font.underline = constants.FONT_UNDERLINE_STYLES[text_decoration_style]
except KeyError:
logging.warning(f"Warning: Style not recognized'{text_decoration_style}', defaulting to single line.")
logger.warning(f"Warning: Style not recognized'{text_decoration_style}', defaulting to single line.")

# Mark that we applied a text-decoration style by adding text-decoration-line to span_styles
paragraph_id = id(self.paragraph)
Expand Down Expand Up @@ -973,7 +976,7 @@ def _apply_background_color_paragraph(self, **kwargs):
if background_color in ("inherit", "initial"):
return
elif background_color in ("transparent", "none"):
logging.warning(f"Warning: Unsupported background color '{background_color}'")
logger.warning(f"Warning: Unsupported background color '{background_color}'")
return

try:
Expand All @@ -993,7 +996,7 @@ def _apply_background_color_paragraph(self, **kwargs):
)

except Exception as e:
logging.warning(f"Could not apply background-color to paragraph: {e}")
logger.warning(f"Could not apply background-color to paragraph: {e}")

def _apply_background_color_to_run(self, **kwargs):
run = kwargs["run"]
Expand All @@ -1002,7 +1005,7 @@ def _apply_background_color_to_run(self, **kwargs):
if background_color in ("inherit", "initial"):
return
elif background_color in ("transparent", "none"):
logging.warning(f"Warning: Unsupported background color '{background_color}'")
logger.warning(f"Warning: Unsupported background color '{background_color}'")
return

color_hex = utils.parse_color(background_color, return_hex=True)
Expand All @@ -1024,7 +1027,7 @@ def _apply_background_color_to_run(self, **kwargs):
r_pr.append(shd)

except Exception as e:
logging.warning(f"Could not apply background-color to run: {e}")
logger.warning(f"Could not apply background-color to run: {e}")

def add_text_align_or_margin_to(self, obj, style):
"""Styles that can be applied on multiple objects"""
Expand Down Expand Up @@ -1082,7 +1085,7 @@ def add_styles_to_table_cell(self, styles, doc_cell, cell_row):
doc_cell.vertical_alignment = WD_ALIGN_VERTICAL.BOTTOM

# Set borders
if any("border" in style for style in styles.keys()):
if any("border" in style for style in styles):
self.set_cell_borders(doc_cell, styles)

self.add_text_align_or_margin_to(doc_cell.paragraphs[0], styles)
Expand Down Expand Up @@ -1221,10 +1224,10 @@ def handle_img(self, current_attrs):

if not image:
if utils.is_url(src):
self.doc.add_paragraph("<image: %s>" % src)
self.doc.add_paragraph(f"<image: {src}>")
else:
# avoid exposing filepaths in document
self.doc.add_paragraph("<image: %s>" % utils.get_filename_from_url(src))
self.doc.add_paragraph(f"<image: {utils.get_filename_from_url(src)}>")

"""
#adding style
Expand Down Expand Up @@ -1283,7 +1286,7 @@ def handle_table(self, current_attrs):

cell_html = self.get_cell_html(col)
if col.name == "th":
cell_html = "<b>%s</b>" % cell_html
cell_html = f"<b>{cell_html}</b>"

# Get _Cell object from table based on cell_row and cell_col
docx_cell = self.table.cell(current_row, current_col)
Expand Down Expand Up @@ -1510,7 +1513,7 @@ def handle_starttag(self, tag, attrs):
if custom_style:
valid_style = utils.check_style_exists(self.doc, custom_style)
if not valid_style:
logging.warning(f"Warning: Custom style '{custom_style}' not found in document, Ignoring style.")
logger.warning(f"Warning: Custom style '{custom_style}' not found in document, Ignoring style.")
custom_style = None

if tag in ["p", "pre"]:
Expand Down Expand Up @@ -1618,7 +1621,7 @@ def handle_endtag(self, tag):
self.pending_important_styles = None

if self.skip:
if not tag == self.skip_tag:
if tag != self.skip_tag:
return

if self.instances_to_skip > 0:
Expand Down Expand Up @@ -1698,7 +1701,7 @@ def handle_data(self, data):
if tag == "div" and "style" in attrs:
div_style = utils.parse_dict_string(attrs["style"])

for span_style_name in span_style.keys():
for span_style_name in span_style:
if span_style_name in div_style:
del div_style[span_style_name]

Expand Down Expand Up @@ -1863,7 +1866,7 @@ def add_html_to_cell(self, html: str, cell: docx.table._Cell) -> None:
self.doc.add_paragraph("")

def parse_html_file(self, filename_html: str, filename_docx, encoding: str = "utf-8") -> None:
with open(filename_html, "r", encoding=encoding) as infile:
with open(filename_html, encoding=encoding) as infile:
html = infile.read()

self.set_initial_attrs()
Expand Down
Loading
Loading