|
|
|
@ -4,6 +4,7 @@ from collections import defaultdict
|
|
|
|
|
from htmls import build_doc, get_body, get_title, shorten_title
|
|
|
|
|
from lxml.etree import tostring, tounicode
|
|
|
|
|
from lxml.html import fragment_fromstring, document_fromstring
|
|
|
|
|
from lxml.html import builder as B
|
|
|
|
|
import logging
|
|
|
|
|
import re
|
|
|
|
|
import sys
|
|
|
|
@ -139,7 +140,8 @@ class Document:
|
|
|
|
|
# Things like preambles, content split by ads that we removed, etc.
|
|
|
|
|
|
|
|
|
|
sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
|
|
|
|
|
output = document_fromstring('<div/>')
|
|
|
|
|
body = B.BODY()
|
|
|
|
|
html = B.HTML(body)
|
|
|
|
|
best_elem = best_candidate['elem']
|
|
|
|
|
for sibling in best_elem.getparent().getchildren():
|
|
|
|
|
#if isinstance(sibling, NavigableString): continue#in lxml there is no concept of simple text
|
|
|
|
@ -161,10 +163,11 @@ class Document:
|
|
|
|
|
append = True
|
|
|
|
|
|
|
|
|
|
if append:
|
|
|
|
|
output.append(sibling)
|
|
|
|
|
#if output is not None:
|
|
|
|
|
# output.append(best_elem)
|
|
|
|
|
return output
|
|
|
|
|
body.append(sibling)
|
|
|
|
|
|
|
|
|
|
#if body is not None:
|
|
|
|
|
# body.append(best_elem)
|
|
|
|
|
return html
|
|
|
|
|
|
|
|
|
|
def select_best_candidate(self, candidates):
|
|
|
|
|
sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
|
|
|
|
|