Implement duplicate page detection

This detects duplicate pages so that the same page is not appended more than
once to a multi-page article. It adds a simple unit test and regenerates the
nytimes regression test with the new, more correct result. Previously, page 2
was included a second time after page 5.

Conflicts:

	src/readability_lxml/readability.py
0.3.0.dev
Jerry Charumilind 13 years ago committed by Richard Harding
parent c931a80ba8
commit eefb8e1125

@ -24,6 +24,7 @@ logging.basicConfig(level=logging.INFO)
log = logging.getLogger()
PAGE_CLASS = 'article-page'
REGEXES = {
'unlikelyCandidatesRe': re.compile(
('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|'
@ -408,8 +409,10 @@ def get_raw_article(candidates, best_candidate, enclose_with_html_tag=True):
sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
if enclose_with_html_tag:
output = document_fromstring('<div/>')
output.getchildren()[0].attrib['id'] = 'page'
else:
output = fragment_fromstring('<div/>')
output.attrib['id'] = 'page'
best_elem = best_candidate['elem']
if best_elem.getparent() is not None:
for sibling in best_elem.getparent().getchildren():
@ -476,7 +479,7 @@ def get_article(doc, options, enclose_with_html_tag=True):
if best_candidate:
confidence = best_candidate['content_score']
article = get_raw_article(candidates, best_candidate,
enclose_with_html_tag=enclose_with_html_tag)
enclose_with_html_tag=enclose_with_html_tag)
else:
if ruthless:
log.debug("ruthless removal did not work. ")
@ -825,34 +828,76 @@ def find_next_page_url(parsed_urls, url, elem):
return None
def append_next_page(parsed_urls, page_url, doc, options):
log.debug(str((parsed_urls, page_url, doc, options)))
log.debug('appending next page: %s' % page_url)
def page_id(i):
    """Return the element id for the page at zero-based index ``i``.

    Ids are one-based, so index 0 yields ``'page-1'``.
    """
    page_number = i + 1
    return 'page-%d' % page_number
def make_page_elem(page_index, elem):
    """Mark ``elem`` in place as the container for one article page.

    Sets the element's ``id`` attribute to the value ``page_id`` returns for
    ``page_index`` and its ``class`` attribute to ``PAGE_CLASS``. Nothing is
    returned; the element is modified through its ``attrib`` mapping.
    """
    # Assign in the same order as before: id first, then class.
    for attr_name, attr_value in (('id', page_id(page_index)),
                                  ('class', PAGE_CLASS)):
        elem.attrib[attr_name] = attr_value
def first_paragraph(elem):
    """Return the first ``<p>`` descendant of ``elem``, or ``None``.

    ``elem`` must provide an ``xpath`` method; the query ``.//p`` is run
    against it and the number of matches is logged at debug level.
    """
    found = elem.xpath('.//p')
    logging.debug('len(paragraphs) is %d' % len(found))
    if len(found) == 0:
        return None
    return found[0]
def is_suspected_duplicate(doc, page_doc):
    """Return True if ``page_doc`` looks like a page already present in ``doc``.

    The heuristic compares the text of the first paragraph of ``page_doc``
    with the text of the first paragraph of every element in ``doc`` whose
    ``class`` attribute contains ``PAGE_CLASS``. An exact string match on any
    of them is treated as a duplicate.

    :param doc: the article document pages are being appended to; it must
        support ``xpath`` with variable substitution.
    :param page_doc: the candidate page element about to be appended.
    :returns: ``True`` on a first-paragraph match, ``False`` otherwise. A
        candidate page with no paragraph at all is never considered a
        duplicate.
    """
    page_p = first_paragraph(page_doc)
    if page_p is None:
        # Nothing to compare against, so we cannot claim it is a duplicate.
        return False

    # The candidate's text does not change while scanning existing pages, so
    # evaluate it once instead of on every iteration.
    page_text = page_p.xpath('string()')

    # NOTE(review): contains() is a substring test, so any class value that
    # merely includes PAGE_CLASS also matches - confirm that is acceptable.
    pages = doc.xpath('//*[contains(@class, $name)]', name=PAGE_CLASS)
    for existing_page in pages:
        existing_page_p = first_paragraph(existing_page)
        if existing_page_p is None:
            # Existing pages without a paragraph cannot match.
            continue
        if existing_page_p.xpath('string()') == page_text:
            return True
    return False
def append_next_page(parsed_urls, page_index, page_url, doc, options):
logging.debug('appending next page: %s' % page_url)
fetcher = options['urlfetch']
html = fetcher.urlread(page_url)
orig_page_doc = parse(html, page_url)
next_page_url = find_next_page_url(parsed_urls, page_url, orig_page_doc)
page_article = get_article(orig_page_doc, options)
log.debug('Appending ' + str(page_article))
if page_article.html:
page_doc = fragment_fromstring(page_article.html)
# page_doc is a singular element containing the page article elements. We
# want to add its children to the main article document to which we are
# appending a page.
if doc.tag == 'html':
children = doc.getchildren()
if children[0].tag == 'head':
for elem in page_doc:
doc.getchildren()[1].append(elem)
make_page_elem(page_index, page_doc)
if not is_suspected_duplicate(doc, page_doc):
# page_doc is a singular element containing the page article elements. We
# want to add its children to the main article document to which we are
# appending a page.
if doc.tag == 'html':
children = doc.getchildren()
if children[0].tag == 'head':
for elem in page_doc:
doc.getchildren()[1].append(elem)
else:
for elem in page_doc:
doc.getchildren()[0].append(elem)
else:
for elem in page_doc:
doc.getchildren()[0].append(elem)
else:
for elem in page_doc:
doc.append(elem)
if next_page_url is not None:
append_next_page(parsed_urls, next_page_url, doc, options)
doc.append(elem)
doc.append(page_doc)
if next_page_url is not None:
append_next_page(
parsed_urls,
page_index + 1,
next_page_url,
doc,
options
)
def parse(input, url):
raw_doc = build_doc(input)
@ -916,6 +961,30 @@ class Document:
:param enclose_with_html_tag: Bool do you want a full <html> document
or just the <div> html partial.
def summary(self):
doc = self._html(True)
parsed_urls = set()
url = self.options['url']
if url is not None:
parsed_urls.add(url)
next_page_url = find_next_page_url(parsed_urls, url, doc)
page_0 = get_article(doc, self.options)
page_0_doc = fragment_fromstring(page_0.html)
page_index = 0
make_page_elem(page_index, page_0_doc)
article_doc = B.DIV(page_0_doc)
article_doc.attrib['id'] = 'article'
if next_page_url is not None:
append_next_page(
parsed_urls,
page_index + 1,
next_page_url,
article_doc,
self.options
)
return Summary(page_0.confidence, tostring(article_doc))
"""
summary = self._summary(enclose_with_html_tag=enclose_with_html_tag)
# For this call return the raw Summary object.
@ -944,9 +1013,38 @@ class Document:
# check the current doc for a next page if requested
if self.options.get('multipage', False):
next_page_link = find_next_page_url(parsed_urls, url, doc)
if next_page_link is not None:
append_next_page(parsed_urls, next_page_link, doc, self.options)
next_page_url = find_next_page_url(parsed_urls, url, doc)
page_0 = get_article(doc, self.options)
page_0_doc = fragment_fromstring(page_0.html)
page_index = 0
make_page_elem(page_index, page_0_doc)
if enclose_with_html_tag:
output = document_fromstring('<div/>')
output.getchildren()[0].attrib['id'] = 'article'
output.getchildren()[0].append(page_0_doc)
else:
output = fragment_fromstring('<div/>')
output.attrib['id'] = 'article'
output.append(page_0_doc)
if next_page_url is not None:
append_next_page(
parsed_urls,
page_index + 1,
next_page_url,
output,
self.options
)
return Summary(tostring(output),
page_0.confidence,
short_title=shorten_title(output),
title=get_title(output))
return get_article(doc, self.options,
enclose_with_html_tag=enclose_with_html_tag)

@ -0,0 +1,48 @@
<div id="article">
<div id="page-1" class="article-page">
<p>
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla et
laoreet ligula. Nulla facilisi. Morbi condimentum molestie enim in
fermentum. Phasellus sit amet vehicula turpis. Sed eu dolor tortor,
et accumsan purus. Aliquam velit nisl, facilisis quis suscipit in,
porttitor at lorem. Ut adipiscing suscipit augue, id interdum arcu
ultricies et. Etiam risus sapien, suscipit et ultricies vel,
suscipit posuere velit. Proin est orci, sollicitudin at luctus
feugiat, consectetur a justo. Etiam nec sem vel massa consectetur
vulputate non interdum est. Donec sem dui, ultricies a adipiscing
eu, placerat sed sem.
</p>
<p>
Nunc lacinia varius justo, at lacinia felis ultricies vel. Proin
vestibulum vehicula eleifend. Ut vitae risus eros. Pellentesque
habitant morbi tristique senectus et netus et malesuada fames ac
turpis egestas. In hac habitasse platea dictumst. Vivamus magna
libero, blandit vitae hendrerit porta, dapibus eget eros. Nunc
turpis felis, facilisis eu vestibulum sed, porta a ipsum. Vivamus
est velit, molestie sed molestie quis, tincidunt a diam. Quisque et
neque a ante fermentum tempus in at nunc. Nunc sit amet egestas
nisi.
</p>
</div>
<div id="page-2" class="article-page">
<p>
Proin in lacus dolor, sit amet molestie quam. Morbi nisi turpis,
pharetra at consequat tristique, convallis nec turpis. Vestibulum
sit amet magna vitae sem bibendum tincidunt. Maecenas quis tortor
eget velit mollis tempor vel a nisl. Vivamus posuere tristique
ante, cursus rhoncus tortor malesuada eu. Praesent faucibus viverra
orci ac porttitor. Maecenas dui purus, aliquam sed aliquam nec,
dignissim vitae libero. Nunc at mauris et ante accumsan
pellentesque. In placerat pretium suscipit. Phasellus tellus est,
venenatis eu consectetur non, vehicula vel metus. Curabitur
venenatis sem fringilla ante elementum eget faucibus nulla tempus.
Aenean convallis sapien et dolor lobortis interdum. Phasellus odio
risus, sagittis ut elementum ut, porttitor non libero. Integer
fringilla magna quis augue dapibus malesuada. Nulla consectetur
nisi mi. Suspendisse faucibus lobortis ornare. Nunc venenatis
tortor in urna pulvinar pulvinar. Sed et mi nec justo hendrerit
cursus ac nec mauris. Morbi et ante a lorem iaculis rutrum vitae eu
massa.
</p>
</div>
</div>

@ -0,0 +1,25 @@
<div id="page-1" class="article-page">
<p>
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla et
laoreet ligula. Nulla facilisi. Morbi condimentum molestie enim in
fermentum. Phasellus sit amet vehicula turpis. Sed eu dolor tortor,
et accumsan purus. Aliquam velit nisl, facilisis quis suscipit in,
porttitor at lorem. Ut adipiscing suscipit augue, id interdum arcu
ultricies et. Etiam risus sapien, suscipit et ultricies vel,
suscipit posuere velit. Proin est orci, sollicitudin at luctus
feugiat, consectetur a justo. Etiam nec sem vel massa consectetur
vulputate non interdum est. Donec sem dui, ultricies a adipiscing
eu, placerat sed sem.
</p>
<p>
Nunc lacinia varius justo, at lacinia felis ultricies vel. Proin
vestibulum vehicula eleifend. Ut vitae risus eros. Pellentesque
habitant morbi tristique senectus et netus et malesuada fames ac
turpis egestas. In hac habitasse platea dictumst. Vivamus magna
libero, blandit vitae hendrerit porta, dapibus eget eros. Nunc
turpis felis, facilisis eu vestibulum sed, porta a ipsum. Vivamus
est velit, molestie sed molestie quis, tincidunt a diam. Quisque et
neque a ante fermentum tempus in at nunc. Nunc sit amet egestas
nisi.
</p>
</div>

@ -0,0 +1,20 @@
<div id="page-3" class="article-page">
<p>
Nunc non blandit velit. Maecenas suscipit sem sed velit tristique
facilisis. Quisque condimentum, nisi vitae dictum euismod, diam risus
vehicula nibh, in scelerisque lorem risus et risus. Aliquam erat
volutpat. Pellentesque habitant morbi tristique senectus et netus et
malesuada fames ac turpis egestas. Donec blandit venenatis feugiat. Ut
quis turpis ac urna consectetur sagittis. Vestibulum aliquet eros et
orci placerat vitae tempus tellus pretium. Quisque rutrum sapien quis
nibh facilisis quis posuere ipsum elementum. In ac pretium justo. Sed
egestas luctus mollis. Donec rutrum leo a turpis facilisis commodo. Nam
quis quam eget mi malesuada scelerisque. Pellentesque semper
condimentum sagittis. Nam lobortis, tortor ut placerat viverra, ante
felis vehicula sem, blandit ultricies purus urna eget elit.
Pellentesque habitant morbi tristique senectus et netus et malesuada
fames ac turpis egestas. Sed vel nulla sollicitudin dolor adipiscing
dapibus aliquam vitae leo. Phasellus at turpis tempus lectus
pellentesque faucibus.
</p>
</div>

@ -1,4 +1,4 @@
<div id="article"><div class="articleSpanImage"><img src="http://graphics8.nytimes.com/images/2011/07/10/magazine/10bad_span/10bad_span-articleLarge.jpg" alt="" border="0"/><p class="credit">Robert Yager for The New York Times</p>
<div id="article"><div id="page-1" class="article-page"><div class="articleSpanImage"><img src="http://graphics8.nytimes.com/images/2011/07/10/magazine/10bad_span/10bad_span-articleLarge.jpg" alt="" border="0"/><p class="credit">Robert Yager for The New York Times</p>
<p class="caption"><strong/>Gilligan on the set with the actors Bryan Cranston and Aaron Paul. </p>
</div>
<div class="articleBody">
@ -39,7 +39,7 @@ Gilligan has the nerve to provide his own hopeful answer. &#8220;Breaking Bad&#8
<p>David Segal is a reporter for the Business section of The New York Times. His most recent article in the magazine was about the New York Cosmos. Editor: Adam Sternbergh (a.sternbergh-MagGroup@nytimes.com)</p> </div>
</nyt_author_id><nyt_correction_bottom><p class="articleCorrection">
</p>
</nyt_correction_bottom><nyt_update_bottom/></div> <div class="articleBody">
</nyt_correction_bottom><nyt_update_bottom/></div> </div><div id="page-2" class="article-page"><div class="articleBody">
@ -62,7 +62,7 @@ After graduating, Gilligan won a screenplay contest in 1989, and one of the judg
<p>David Segal is a reporter for the Business section of The New York Times. His most recent article in the magazine was about the New York Cosmos. Editor: Adam Sternbergh (a.sternbergh-MagGroup@nytimes.com)</p> </div>
</nyt_author_id><nyt_correction_bottom><p class="articleCorrection">
</p>
</nyt_correction_bottom><nyt_update_bottom/></div> <div class="articleBody">
</nyt_correction_bottom><nyt_update_bottom/></div> </div><div id="page-3" class="article-page"><div class="articleBody">
@ -83,7 +83,7 @@ That is something new. The depravities of leading men in TV dramas traditionally
<p>David Segal is a reporter for the Business section of The New York Times. His most recent article in the magazine was about the New York Cosmos. Editor: Adam Sternbergh (a.sternbergh-MagGroup@nytimes.com)</p> </div>
</nyt_author_id><nyt_correction_bottom><p class="articleCorrection">
</p>
</nyt_correction_bottom><nyt_update_bottom/></div> <div class="articleBody">
</nyt_correction_bottom><nyt_update_bottom/></div> </div><div id="page-4" class="article-page"><div class="articleBody">
@ -109,7 +109,7 @@ This, it turns out, is an abbreviated version of a process that Gilligan goes th
<p>David Segal is a reporter for the Business section of The New York Times. His most recent article in the magazine was about the New York Cosmos. Editor: Adam Sternbergh (a.sternbergh-MagGroup@nytimes.com)</p> </div>
</nyt_author_id><nyt_correction_bottom><p class="articleCorrection">
</p>
</nyt_correction_bottom><nyt_update_bottom/></div> <div class="articleBody">
</nyt_correction_bottom><nyt_update_bottom/></div> </div><div id="page-5" class="article-page"><div class="articleBody">
@ -131,27 +131,4 @@ Driving to the set after lunch one day, he told me that Walter White had started
<p>David Segal is a reporter for the Business section of The New York Times. His most recent article in the magazine was about the New York Cosmos. Editor: Adam Sternbergh (a.sternbergh-MagGroup@nytimes.com)</p> </div>
</nyt_author_id><nyt_correction_bottom><p class="articleCorrection">
</p>
</nyt_correction_bottom><nyt_update_bottom/></div> <div class="articleBody">
<nyt_text><nyt_correction_top/><p><font size="-1">(Page 2 of 5)</font></p><p/><p/><p>
Which might make Gilligan TV&#8217;s first true red-state auteur. His characters lead middle-American lives in a middle-American place, and they are beset with middle-American problems. They speak like middle Americans too, and they inhabit a realm of moral ambiguities that&#8217;s overseen by a man with both a wicked sense of humor and a highly refined sense of right and wrong. </p>
</nyt_text></div>
<div class="articleBody">
<p>
&#8220;If there&#8217;s a larger lesson to &#8216;Breaking Bad,&#8217; it&#8217;s that actions have consequences,&#8221; Gilligan said during lunch one day in his trailer. &#8220;If religion is a reaction of man, and nothing more, it seems to me that it represents a human desire for wrongdoers to be punished. I hate the idea of Idi Amin living in Saudi Arabia for the last 25 years of his life. That galls me to no end.&#8221; </p><p>
He paused for a moment and speared a few tater tots in a white plastic-foam tray perched on his lap. </p><p>
&#8220;I feel some sort of need for biblical atonement, or justice, or something,&#8221; he said between chews. &#8220;I like to believe there is some comeuppance, that karma kicks in at some point, even if it takes years or decades to happen,&#8221; he went on. &#8220;My girlfriend says this great thing that&#8217;s become my philosophy as well. &#8216;I want to believe there&#8217;s a heaven. But I can&#8217;t not believe there&#8217;s a hell.&#8217; &#8221; </p><p>
&#8216;Breaking Bad&#8221; was born out of a conversation in 2004 between Gilligan and a friend named Thomas Schnauz, who is now a writer on the show. Schnauz had just read a story about a man cooking meth in an apartment complex, which had sickened kids in apartments above. Saddam Hussein&#8217;s putative mobile chemical-weapons labs came up in the conversation, too. </p><p>
&#8220;Neither of us were working,&#8221; Schnauz says, &#8220;and we were like two 70-year-old men who like to complain about the world. And somehow we spun off into the idea of driving around in a mobile lab, cooking meth. It was a joke and not something I would have ever thought about again. But a couple days later Vince called back and said: &#8216;Remember we were talking about that mobile lab and meth? Do you mind if I run with that?&#8217; &#8221; </p><p>
A show about a very smart middle-aged guy who hadn&#8217;t quite achieved his dreams had a faintly autobiographical whiff for Gilligan at the time. He grew up in Farmville, Va., a town of roughly 6,000 people, not far from Appomattox, the site of the South&#8217;s surrender in the Civil War. His father was an insurance claims adjuster, and his mother was a grade-school teacher who had a brief career as a wing walker. &#8220;Vince was an acolyte in the Catholic Church,&#8221; Gail Gilligan says, though she notes that he also played Dungeons and Dragons. &#8220;There was certainly a lot of evil in that game, but it never seemed to affect him adversely.&#8221; </p><p>
Gilligan earned a partial scholarship to attend New York University&#8217;s film program, where his instructors included Jesse Kornbluth, who remembers a polite kid who was so good at drawing bent, violent characters that Kornbluth initially pegged him as the &#8220;go postal&#8221; type. &#8220;In the end, he turned us all into his audience,&#8221; Kornbluth said to me. &#8220;We were all just mesmerized. Attendance was unnaturally high on days when he was reading his scenes.&#8221; </p><p>
After graduating, Gilligan won a screenplay contest in 1989, and one of the judges, a producer named Mark Johnson (now an executive producer on &#8220;Breaking Bad&#8221;), helped him find an agent and sell scripts to Hollywood. Two of them, &#8220;Home Fries,&#8221; starring Drew Barrymore, and &#8220;Wilder Napalm,&#8221; starring Debra Winger and Dennis Quaid, were turned into films. It was a promising start. Gilligan bought a house outside Richmond, assuming that he would keep lobbing movie scripts to Los Angeles, which would keep lobbing money back. That did not happen. By 1994, the money dried up and he lost his writer&#8217;s guild health insurance. That year, his agent got Gilligan a meeting with Chris Carter, the creator of &#8220;The X-Files.&#8221; </p><nyt_author_id><div class="authorIdentification">
<p>David Segal is a reporter for the Business section of The New York Times. His most recent article in the magazine was about the New York Cosmos. Editor: Adam Sternbergh (a.sternbergh-MagGroup@nytimes.com)</p> </div>
</nyt_author_id><nyt_correction_bottom><p class="articleCorrection">
</p>
</nyt_correction_bottom><nyt_update_bottom/></div> </div>
</nyt_correction_bottom><nyt_update_bottom/></div> </div></div>

@ -1,5 +1,5 @@
test_description: multi-page article from nytimes
notes: multi-page not yet implemented
notes: wrongly includes author identification from each page
url: http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html
url_map:
http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=2&_r=1: nytimes-001-orig-2.html

Loading…
Cancel
Save