Convert tabs to spaces; put article in body

0.3.0.dev
Jerry Charumilind 13 years ago
parent 01247903b8
commit ac517834e6

@@ -4,6 +4,7 @@ from collections import defaultdict
from htmls import build_doc, get_body, get_title, shorten_title
from lxml.etree import tostring, tounicode
from lxml.html import fragment_fromstring, document_fromstring
from lxml.html import builder as B
import logging
import re
import sys
@@ -139,7 +140,8 @@ class Document:
# Things like preambles, content split by ads that we removed, etc.
sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
output = document_fromstring('<div/>')
body = B.BODY()
html = B.HTML(body)
best_elem = best_candidate['elem']
for sibling in best_elem.getparent().getchildren():
#if isinstance(sibling, NavigableString): continue#in lxml there is no concept of simple text
@@ -161,10 +163,11 @@ class Document:
append = True
if append:
output.append(sibling)
#if output is not None:
# output.append(best_elem)
return output
body.append(sibling)
#if body is not None:
# body.append(best_elem)
return html
def select_best_candidate(self, candidates):
sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)

Loading…
Cancel
Save