Merge pull request #115 from johnklee/Issue99

Fix #99 - Hide the exception raised during "a href" normalization: added a handle_failures parameter that defaults to "discard" for bad URLs.
Yuri Baburov committed via GitHub, 5 years ago
commit a4ac1c7704
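
For context, a minimal usage sketch of the new parameter; the sample HTML, URL, and variable names below are illustrative and not part of the change itself.

from readability import Document

# A page containing a malformed href of the kind that used to raise during
# link absolutization (see the guarded call in the diff below).
html = '<html><body><a href="http://[http://broken">bad link</a><p>Some article text.</p></body></html>'

# New default: bad hrefs are discarded quietly while the summary is built.
doc = Document(html, url="http://example.com/article")
print(doc.summary())

# Passing None forwards lxml's default behaviour, so the underlying error
# would propagate instead of being hidden.
strict = Document(html, url="http://example.com/article", handle_failures=None)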

@@ -86,7 +86,8 @@ class Document:
     """Class to build a etree document out of html."""
 
     def __init__(self, input, positive_keywords=None, negative_keywords=None,
-                 url=None, min_text_length=25, retry_length=250, xpath=False):
+                 url=None, min_text_length=25, retry_length=250, xpath=False,
+                 handle_failures='discard'):
         """Generate the document
 
         :param input: string of the html content.
@@ -97,6 +98,8 @@ class Document:
         :param xpath: If set to True, adds x="..." attribute to each HTML node,
             containing xpath path pointing to original document path (allows to
             reconstruct selected summary in original document).
+        :param handle_failures: Parameter passed to `lxml` for handling failure during exception.
+            Support options = ["discard", "ignore", None]
 
         Examples:
             positive_keywords=["news-item", "block"]
@@ -122,6 +125,7 @@ class Document:
         self.min_text_length = min_text_length
         self.retry_length = retry_length
         self.xpath = xpath
+        self.handle_failures = handle_failures
 
     def _html(self, force=False):
         if force or self.html is None:
@@ -141,13 +145,13 @@ class Document:
             # trying to guard against bad links like <a href="http://[http://...">
             try:
                 # such support is added in lxml 3.3.0
-                doc.make_links_absolute(base_href, resolve_base_href=True, handle_failures='discard')
+                doc.make_links_absolute(base_href, resolve_base_href=True, handle_failures=self.handle_failures)
             except TypeError: #make_links_absolute() got an unexpected keyword argument 'handle_failures'
                 # then we have lxml < 3.3.0
                 # please upgrade to lxml >= 3.3.0 if you're failing here!
-                doc.make_links_absolute(base_href, resolve_base_href=True)
+                doc.make_links_absolute(base_href, resolve_base_href=True, handle_failures=self.handle_failures)
         else:
-            doc.resolve_base_href()
+            doc.resolve_base_href(handle_failures=self.handle_failures)
         return doc
 
     def content(self):
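
The accepted values listed in the docstring above ("discard", "ignore", None) are forwarded straight to lxml. Below is a small standalone sketch of what they do, with made-up markup; it assumes lxml >= 3.3.0, as the fallback branch above notes.

import lxml.html

fragment = '<div><a href="http://[http://broken">bad</a><a href="/ok">good</a></div>'
base = "http://example.com/"

for mode in ("discard", "ignore"):
    doc = lxml.html.fromstring(fragment)
    # "discard" drops the href that cannot be resolved against the base URL,
    # "ignore" leaves it untouched; with None (lxml's default) the underlying
    # error from URL parsing propagates to the caller.
    doc.make_links_absolute(base, resolve_base_href=True, handle_failures=mode)
    print(mode, lxml.html.tostring(doc))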
