|
|
|
@ -1,4 +1,5 @@
|
|
|
|
|
#!/usr/bin/env python
|
|
|
|
|
from __future__ import print_function
|
|
|
|
|
import logging
|
|
|
|
|
import re
|
|
|
|
|
import sys
|
|
|
|
@ -19,6 +20,8 @@ from .htmls import shorten_title
|
|
|
|
|
|
|
|
|
|
log = logging.getLogger()
|
|
|
|
|
|
|
|
|
|
if sys.version_info[0] == 2:
|
|
|
|
|
str = unicode
|
|
|
|
|
|
|
|
|
|
REGEXES = {
|
|
|
|
|
'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I),
|
|
|
|
@ -80,11 +83,12 @@ regexp_type = type(re.compile('hello, world'))
|
|
|
|
|
def compile_pattern(elements):
|
|
|
|
|
if not elements:
|
|
|
|
|
return None
|
|
|
|
|
if isinstance(elements, regexp_type):
|
|
|
|
|
elif isinstance(elements, regexp_type):
|
|
|
|
|
return elements
|
|
|
|
|
if isinstance(elements, basestring):
|
|
|
|
|
else:
|
|
|
|
|
# assume string or string like object
|
|
|
|
|
elements = elements.split(',')
|
|
|
|
|
return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)
|
|
|
|
|
return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)
|
|
|
|
|
|
|
|
|
|
class Document:
|
|
|
|
|
"""Class to build a etree document out of html."""
|
|
|
|
@ -194,9 +198,13 @@ class Document:
|
|
|
|
|
continue
|
|
|
|
|
else:
|
|
|
|
|
return cleaned_article
|
|
|
|
|
except StandardError, e:
|
|
|
|
|
except Exception as e:
|
|
|
|
|
log.exception('error getting summary: ')
|
|
|
|
|
raise Unparseable(str(e)), None, sys.exc_info()[2]
|
|
|
|
|
if sys.version_info[0] == 2:
|
|
|
|
|
from .compat.two import raise_with_traceback
|
|
|
|
|
else:
|
|
|
|
|
from .compat.three import raise_with_traceback
|
|
|
|
|
raise_with_traceback(Unparseable, sys.exc_info()[2], str(e))
|
|
|
|
|
|
|
|
|
|
def get_article(self, candidates, best_candidate, html_partial=False):
|
|
|
|
|
# Now that we have the top candidate, look through its siblings for
|
|
|
|
@ -389,7 +397,7 @@ class Document:
|
|
|
|
|
# This results in incorrect results in case there is an <img>
|
|
|
|
|
# buried within an <a> for example
|
|
|
|
|
if not REGEXES['divToPElementsRe'].search(
|
|
|
|
|
unicode(''.join(map(tostring, list(elem))))):
|
|
|
|
|
str(''.join(map(str, map(tostring, list(elem)))))):
|
|
|
|
|
#self.debug("Altering %s to p" % (describe(elem)))
|
|
|
|
|
elem.tag = "p"
|
|
|
|
|
#print "Fixed element "+describe(elem)
|
|
|
|
@ -612,18 +620,18 @@ def main():
|
|
|
|
|
|
|
|
|
|
file = None
|
|
|
|
|
if options.url:
|
|
|
|
|
import urllib
|
|
|
|
|
file = urllib.urlopen(options.url)
|
|
|
|
|
import urllib.request, urllib.parse, urllib.error
|
|
|
|
|
file = urllib.request.urlopen(options.url)
|
|
|
|
|
else:
|
|
|
|
|
file = open(args[0], 'rt')
|
|
|
|
|
enc = sys.__stdout__.encoding or 'utf-8' # XXX: this hack could not always work, better to set PYTHONIOENCODING
|
|
|
|
|
try:
|
|
|
|
|
print Document(file.read(),
|
|
|
|
|
print(Document(file.read(),
|
|
|
|
|
debug=options.verbose,
|
|
|
|
|
url=options.url,
|
|
|
|
|
positive_keywords = options.positive_keywords,
|
|
|
|
|
negative_keywords = options.negative_keywords,
|
|
|
|
|
).summary().encode(enc, 'replace')
|
|
|
|
|
).summary().encode(enc, 'replace'))
|
|
|
|
|
finally:
|
|
|
|
|
file.close()
|
|
|
|
|
|
|
|
|
|