diff --git a/.gitignore b/.gitignore index 50ba3dd..891dec0 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ *.swp *.pyc .idea/ +.eggs/ build/ dist/ mf2py.egg-info/ diff --git a/mf2py/dom_helpers.py b/mf2py/dom_helpers.py index de361cf..62273c3 100644 --- a/mf2py/dom_helpers.py +++ b/mf2py/dom_helpers.py @@ -1,14 +1,46 @@ import sys import bs4 +import copy if sys.version < '3': + from urlparse import urljoin text_type = unicode binary_type = str else: + from urllib.parse import urljoin text_type = str binary_type = bytes basestring = str +def get_textContent(el, replace_img=False, base_url=''): + """ Get the text content of an element, replacing images by alt or src + """ + + # copy el to avoid making direct changes + el_copy = copy.copy(el) + + # drop all