Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
*.swp
*.pyc
.idea/
.eggs/
build/
dist/
mf2py.egg-info/
Expand Down
32 changes: 32 additions & 0 deletions mf2py/dom_helpers.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,46 @@
import sys
import bs4
import copy

# Python 2/3 compatibility shim: expose one set of names (urljoin, text_type,
# binary_type, basestring) regardless of the running interpreter.
#
# Compare the numeric major version rather than the version *string*:
# sys.version < '3' is a lexicographic comparison, so a major version of 10 or
# more ('10.0' < '3' is True) would wrongly select the Python 2 branch.
if sys.version_info[0] < 3:
    from urlparse import urljoin
    text_type = unicode
    binary_type = str
else:
    from urllib.parse import urljoin
    text_type = str
    binary_type = bytes
    # Python 3 has no basestring builtin; alias it so isinstance checks
    # written against the Python 2 name keep working.
    basestring = str

def get_textContent(el, replace_img=False, base_url=''):
    """Return the stripped text of an element, ignoring style/script content.

    The work is done on a copy of ``el`` to avoid making direct changes to it.

    Args:
      el: element to read; it must support ``find_all`` and ``get_text``
        (presumably a bs4 Tag -- confirm against callers).
      replace_img: when true, every <img> is replaced by its ``alt`` text;
        if there is no ``alt``, by its ``src`` joined with ``base_url`` and
        padded with one space on each side; if there is neither, by ''.
      base_url: base passed to ``urljoin`` for ``src`` values.

    Returns:
      The result of ``get_text()`` on the edited copy, stripped of leading
      and trailing whitespace.
    """
    scratch = copy.copy(el)

    # <style> and <script> bodies are not part of the text content
    for unwanted in scratch.find_all(['style', 'script']):
        unwanted.decompose()

    if replace_img:
        for image in scratch.find_all('img'):
            alt_text = image.get('alt')
            if alt_text is not None:
                # alt wins and is used verbatim (no padding, no URL join)
                substitute = alt_text
            else:
                source = image.get('src')
                if source is None:
                    substitute = ''
                else:
                    substitute = ' ' + urljoin(base_url, source) + ' '

            image.replace_with(substitute)

    return scratch.get_text().strip()

def get_attr(el, attr, check_name=None):
"""Get the attribute of an element if it exists and is not empty.
Expand Down
6 changes: 3 additions & 3 deletions mf2py/implied_properties.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from __future__ import unicode_literals, print_function
from . import mf2_classes
from .dom_helpers import get_attr, get_children
from .dom_helpers import get_attr, get_children, get_textContent
import sys

if sys.version < '3':
Expand All @@ -9,7 +9,7 @@
from urllib.parse import urljoin


def name(el):
def name(el, base_url=''):
"""Find an implied name property

Args:
Expand Down Expand Up @@ -60,7 +60,7 @@ def non_empty(val):
return [prop_value]

# use text if all else fails
return [el.get_text().strip()]
return [get_textContent(el, replace_img=True, base_url=base_url)]


def photo(el, base_url=''):
Expand Down
72 changes: 33 additions & 39 deletions mf2py/parse_property.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""functions to parse the properties of elements"""
from __future__ import unicode_literals, print_function

from .dom_helpers import get_attr, get_children
from .dom_helpers import get_attr, get_children, get_textContent
import sys
import re

Expand Down Expand Up @@ -37,42 +37,36 @@ def get_vcp_children(el):
and ('value' in c['class'] or 'value-title' in c['class'])]


def text(el):
def text(el, base_url=''):
"""Process p-* properties"""

# handle value-class-pattern
value_els = get_vcp_children(el)
if value_els:
return ''.join(get_vcp_value(el) for el in value_els)

prop_value = get_attr(el, "title", check_name="abbr")
if prop_value is not None:
return prop_value
prop_value = get_attr(el, "title", check_name=("abbr", "link"))
if prop_value is None:
prop_value = get_attr(el, "value", check_name=("data", "input"))
if prop_value is None:
prop_value = get_attr(el, "alt", check_name=("img", "area"))
if prop_value is None:
prop_value = get_textContent(el, replace_img=True, base_url=base_url)

prop_value = get_attr(el, "value", check_name=("data", "input"))
if prop_value is not None:
return prop_value

prop_value = get_attr(el, "alt", check_name=("img", "area"))
if prop_value is not None:
return prop_value

# see if get_text() replaces img with alts
# strip here?
return el.get_text()
return prop_value


def url(el, base_url=''):
"""Process u-* properties"""
prop_value = get_attr(el, "href", check_name=("a", "area", "link"))
if prop_value is not None:
return urljoin(base_url, prop_value) # make urls absolute

prop_value = get_attr(el, "src", check_name=("img", "audio", "video",
"source"))
if prop_value is not None:
return urljoin(base_url, prop_value)
prop_value = get_attr(el, "href", check_name=("a", "area", "link"))
if prop_value is None:
prop_value = get_attr(el, "src", check_name=("img", "audio", "video", "source"))
if prop_value is None:
prop_value = get_attr(el, "poster", check_name="video")
if prop_value is None:
prop_value = get_attr(el, "data", check_name="object")

prop_value = get_attr(el, "data", check_name="object")
if prop_value is not None:
return urljoin(base_url, prop_value)

Expand All @@ -82,15 +76,12 @@ def url(el, base_url=''):
for el in value_els))

prop_value = get_attr(el, "title", check_name="abbr")
if prop_value is not None:
return prop_value

prop_value = get_attr(el, "value", check_name=("data", "input"))
if prop_value is not None:
return prop_value
if prop_value is None:
prop_value = get_attr(el, "value", check_name=("data", "input"))
if prop_value is None:
prop_value = get_textContent(el)

# strip here?
return el.get_text()
return prop_value


def datetime(el, default_date=None):
Expand Down Expand Up @@ -181,10 +172,13 @@ def try_normalize(dtstr, match=None):

return try_normalize(date_time_value), date_part

prop_value = get_attr(el, "datetime", check_name=("time", "ins", "del"))\
or get_attr(el, "title", check_name="abbr")\
or get_attr(el, "value", check_name=("data", "input"))\
or el.get_text() # strip here?
prop_value = get_attr(el, "datetime", check_name=("time", "ins", "del"))
if prop_value is None:
prop_value = get_attr(el, "title", check_name="abbr")
if prop_value is None:
prop_value = get_attr(el, "value", check_name=("data", "input"))
if prop_value is None:
prop_value = get_textContent(el)

# if this is just a time, augment with default date
match = re.match(TIME_RE + '$', prop_value)
Expand All @@ -198,9 +192,9 @@ def try_normalize(dtstr, match=None):
match and match.group('date'),)


def embedded(el):
def embedded(el, base_url=''):
"""Process e-* properties"""
return {
'html': el.decode_contents(), # secret bs4 method to get innerHTML
'value': el.get_text() # strip here?
'html': el.decode_contents().strip(), # secret bs4 method to get innerHTML
'value': get_textContent(el, replace_img=True, base_url=base_url)
}
46 changes: 28 additions & 18 deletions mf2py/parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals, print_function
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, FeatureNotFound
from bs4.element import Tag
from mf2py import backcompat, mf2_classes, implied_properties, parse_property
from mf2py import temp_fixes
Expand Down Expand Up @@ -63,7 +63,7 @@ class Parser(object):

dict_class = dict

def __init__(self, doc=None, url=None, html_parser=None):
def __init__(self, doc=None, url=None, html_parser='html5lib'):
self.__url__ = None
self.__doc__ = None
self.__parsed__ = self.dict_class([
Expand All @@ -72,28 +72,36 @@ def __init__(self, doc=None, url=None, html_parser=None):
('rel-urls', self.dict_class()),
])

if doc is not None:
self.__doc__ = doc
if isinstance(doc, BeautifulSoup) or isinstance(doc, Tag):
self.__doc__ = doc
else:
self.__doc__ = BeautifulSoup(doc, features=html_parser)

if url is not None:
self.__url__ = url

if self.__doc__ is None:
if doc is None:
data = requests.get(self.__url__, headers={
'User-Agent': self.useragent,
})

# check for charater encodings and use 'correct' data
# update to final URL after redirects
self.__url__ = data.url

# HACK: check for character encodings and use 'correct' data
if 'charset' in data.headers.get('content-type', ''):
self.__doc__ = BeautifulSoup(data.text,
features=html_parser)
doc = data.text
else:
self.__doc__ = BeautifulSoup(data.content,
features=html_parser)
doc = data.content

if doc is not None:
self.__doc__ = doc
if isinstance(doc, BeautifulSoup) or isinstance(doc, Tag):
self.__doc__ = doc
else:
try:
# try the user-given html parser or default html5lib
self.__doc__ = BeautifulSoup(doc, features=html_parser)
except FeatureNotFound:
# maybe raise a warning?
# else switch to default use
self.__doc__ = BeautifulSoup(doc)


# check for <base> tag
if self.__doc__:
Expand Down Expand Up @@ -150,10 +158,11 @@ def handle_microformat(root_class_names, el, value_property=None,
simple_value = properties[value_property][0]

# if some properties not already found find in implied ways

if "name" not in properties:
properties["name"] = [text_type(prop)
for prop
in implied_properties.name(el)]
in implied_properties.name(el, base_url=self.__url__)]
if "photo" not in properties:
x = implied_properties.photo(el, base_url=self.__url__)
if x is not None:
Expand Down Expand Up @@ -216,7 +225,8 @@ def parse_props(el):

# if value has not been parsed then parse it
if p_value is None:
p_value = text_type(parse_property.text(el).strip())
p_value = text_type(parse_property.text(el, base_url=self.__url__))


if root_class_names:
prop_value.append(handle_microformat(
Expand Down Expand Up @@ -272,7 +282,7 @@ def parse_props(el):

# if value has not been parsed then parse it
if e_value is None:
e_value = parse_property.embedded(el)
e_value = parse_property.embedded(el, base_url=self.__url__)

if root_class_names:
prop_value.append(handle_microformat(
Expand Down
2 changes: 1 addition & 1 deletion mf2py/version.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Define the version number. This class is exec'd by setup.py to read
# the value without loading mf2py (loading mf2py is bad if its dependencies
# haven't been installed yet, which is common during setup)
__version__ = '1.0.5'
__version__ = '1.0.6'
6 changes: 3 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
html5lib==0.999
html5lib==1.0.1
nose==1.3.0
mock==1.3.0
requests==2.0.1
BeautifulSoup4==4.3.2
requests==2.18.4
BeautifulSoup4==4.6.0
Flask==0.10.1
gunicorn==19.1.1
-e .
6 changes: 3 additions & 3 deletions test/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,11 +224,11 @@ def test_embedded_parsing():
result = parse_fixture("embedded.html")
assert_equal(
result["items"][0]["properties"]["content"][0]["html"],
'\n <p>Blah blah blah blah blah.</p>\n' +
' <p>Blah.</p>\n <p>Blah blah blah.</p>\n ')
'<p>Blah blah blah blah blah.</p>\n' +
' <p>Blah.</p>\n <p>Blah blah blah.</p>')
assert_equal(
result["items"][0]["properties"]["content"][0]["value"],
'\n Blah blah blah blah blah.\n Blah.\n Blah blah blah.\n ')
'Blah blah blah blah blah.\n Blah.\n Blah blah blah.')


def test_backcompat():
Expand Down