diff --git a/src/readability_lxml/readability.py b/src/readability_lxml/readability.py index 0689aea..ebdfdfb 100755 --- a/src/readability_lxml/readability.py +++ b/src/readability_lxml/readability.py @@ -24,6 +24,7 @@ logging.basicConfig(level=logging.INFO) log = logging.getLogger() +PAGE_CLASS = 'article-page' REGEXES = { 'unlikelyCandidatesRe': re.compile( ('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|' @@ -408,8 +409,10 @@ def get_raw_article(candidates, best_candidate, enclose_with_html_tag=True): sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2]) if enclose_with_html_tag: output = document_fromstring('
') + output.getchildren()[0].attrib['id'] = 'page' else: output = fragment_fromstring('
') + output.attrib['id'] = 'page' best_elem = best_candidate['elem'] if best_elem.getparent() is not None: for sibling in best_elem.getparent().getchildren(): @@ -476,7 +479,7 @@ def get_article(doc, options, enclose_with_html_tag=True): if best_candidate: confidence = best_candidate['content_score'] article = get_raw_article(candidates, best_candidate, - enclose_with_html_tag=enclose_with_html_tag) + enclose_with_html_tag=enclose_with_html_tag) else: if ruthless: log.debug("ruthless removal did not work. ") @@ -825,34 +828,76 @@ def find_next_page_url(parsed_urls, url, elem): return None -def append_next_page(parsed_urls, page_url, doc, options): - log.debug(str((parsed_urls, page_url, doc, options))) - log.debug('appending next page: %s' % page_url) +def page_id(i): + return 'page-%d' % (i + 1) + + +def make_page_elem(page_index, elem): + elem.attrib['id'] = page_id(page_index) + elem.attrib['class'] = PAGE_CLASS + + +def first_paragraph(elem): + paragraphs = elem.xpath('.//p') + logging.debug('len(paragraphs) is %d' % len(paragraphs)) + if len(paragraphs) > 0: + return paragraphs[0] + else: + return None + + +def is_suspected_duplicate(doc, page_doc): + page_p = first_paragraph(page_doc) + if page_p is None: + return False + pages = doc.xpath('//*[contains(@class, $name)]', name = PAGE_CLASS) + for existing_page in pages: + existing_page_p = first_paragraph(existing_page) + if existing_page_p is not None: + page_p_content = page_p.xpath('string()') + existing_page_p_content = existing_page_p.xpath('string()') + if page_p.xpath('string()') == existing_page_p.xpath('string()'): + return True + return False + + +def append_next_page(parsed_urls, page_index, page_url, doc, options): + logging.debug('appending next page: %s' % page_url) fetcher = options['urlfetch'] html = fetcher.urlread(page_url) orig_page_doc = parse(html, page_url) next_page_url = find_next_page_url(parsed_urls, page_url, orig_page_doc) page_article = get_article(orig_page_doc, options) log.debug('Appending ' + str(page_article)) + if page_article.html: page_doc = fragment_fromstring(page_article.html) - # page_doc is a singular element containing the page article elements. We - # want to add its children to the main article document to which we are - # appending a page. - if doc.tag == 'html': - children = doc.getchildren() - if children[0].tag == 'head': - for elem in page_doc: - doc.getchildren()[1].append(elem) + make_page_elem(page_index, page_doc) + + if not is_suspected_duplicate(doc, page_doc): + # page_doc is a singular element containing the page article elements. We + # want to add its children to the main article document to which we are + # appending a page. + if doc.tag == 'html': + children = doc.getchildren() + if children[0].tag == 'head': + for elem in page_doc: + doc.getchildren()[1].append(elem) + else: + for elem in page_doc: + doc.getchildren()[0].append(elem) else: for elem in page_doc: - doc.getchildren()[0].append(elem) - else: - for elem in page_doc: - doc.append(elem) - if next_page_url is not None: - append_next_page(parsed_urls, next_page_url, doc, options) - + doc.append(elem) + doc.append(page_doc) + if next_page_url is not None: + append_next_page( + parsed_urls, + page_index + 1, + next_page_url, + doc, + options + ) def parse(input, url): raw_doc = build_doc(input) @@ -916,6 +961,30 @@ class Document: :param enclose_with_html_tag: Bool do you want a full document or just the
html partial. + def summary(self): + doc = self._html(True) + parsed_urls = set() + url = self.options['url'] + if url is not None: + parsed_urls.add(url) + next_page_url = find_next_page_url(parsed_urls, url, doc) + page_0 = get_article(doc, self.options) + page_0_doc = fragment_fromstring(page_0.html) + page_index = 0 + make_page_elem(page_index, page_0_doc) + article_doc = B.DIV(page_0_doc) + article_doc.attrib['id'] = 'article' + if next_page_url is not None: + append_next_page( + parsed_urls, + page_index + 1, + next_page_url, + article_doc, + self.options + ) + return Summary(page_0.confidence, tostring(article_doc)) + + """ summary = self._summary(enclose_with_html_tag=enclose_with_html_tag) # For this call return the raw Summary object. @@ -944,9 +1013,38 @@ class Document: # check the current doc for a next page if requested if self.options.get('multipage', False): - next_page_link = find_next_page_url(parsed_urls, url, doc) - if next_page_link is not None: - append_next_page(parsed_urls, next_page_link, doc, self.options) + next_page_url = find_next_page_url(parsed_urls, url, doc) + + page_0 = get_article(doc, self.options) + page_0_doc = fragment_fromstring(page_0.html) + page_index = 0 + make_page_elem(page_index, page_0_doc) + + if enclose_with_html_tag: + output = document_fromstring('
') + output.getchildren()[0].attrib['id'] = 'article' + output.getchildren()[0].append(page_0_doc) + else: + output = fragment_fromstring('
') + output.attrib['id'] = 'article' + output.append(page_0_doc) + + if next_page_url is not None: + append_next_page( + parsed_urls, + page_index + 1, + next_page_url, + output, + self.options + ) + return Summary(tostring(output), + page_0.confidence, + short_title=shorten_title(output), + title=get_title(output)) return get_article(doc, self.options, enclose_with_html_tag=enclose_with_html_tag) + + + + diff --git a/src/tests/regression_test_data/duplicate-page-article.html b/src/tests/regression_test_data/duplicate-page-article.html new file mode 100644 index 0000000..9cd4b85 --- /dev/null +++ b/src/tests/regression_test_data/duplicate-page-article.html @@ -0,0 +1,48 @@ +
+
+

+ Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla et + laoreet ligula. Nulla facilisi. Morbi condimentum molestie enim in + fermentum. Phasellus sit amet vehicula turpis. Sed eu dolor tortor, + et accumsan purus. Aliquam velit nisl, facilisis quis suscipit in, + porttitor at lorem. Ut adipiscing suscipit augue, id interdum arcu + ultricies et. Etiam risus sapien, suscipit et ultricies vel, + suscipit posuere velit. Proin est orci, sollicitudin at luctus + feugiat, consectetur a justo. Etiam nec sem vel massa consectetur + vulputate non interdum est. Donec sem dui, ultricies a adipiscing + eu, placerat sed sem. +

+

+ Nunc lacinia varius justo, at lacinia felis ultricies vel. Proin + vestibulum vehicula eleifend. Ut vitae risus eros. Pellentesque + habitant morbi tristique senectus et netus et malesuada fames ac + turpis egestas. In hac habitasse platea dictumst. Vivamus magna + libero, blandit vitae hendrerit porta, dapibus eget eros. Nunc + turpis felis, facilisis eu vestibulum sed, porta a ipsum. Vivamus + est velit, molestie sed molestie quis, tincidunt a diam. Quisque et + neque a ante fermentum tempus in at nunc. Nunc sit amet egestas + nisi. +

+
+
+

+ Proin in lacus dolor, sit amet molestie quam. Morbi nisi turpis, + pharetra at consequat tristique, convallis nec turpis. Vestibulum + sit amet magna vitae sem bibendum tincidunt. Maecenas quis tortor + eget velit mollis tempor vel a nisl. Vivamus posuere tristique + ante, cursus rhoncus tortor malesuada eu. Praesent faucibus viverra + orci ac porttitor. Maecenas dui purus, aliquam sed aliquam nec, + dignissim vitae libero. Nunc at mauris et ante accumsan + pellentesque. In placerat pretium suscipit. Phasellus tellus est, + venenatis eu consectetur non, vehicula vel metus. Curabitur + venenatis sem fringilla ante elementum eget faucibus nulla tempus. + Aenean convallis sapien et dolor lobortis interdum. Phasellus odio + risus, sagittis ut elementum ut, porttitor non libero. Integer + fringilla magna quis augue dapibus malesuada. Nulla consectetur + nisi mi. Suspendisse faucibus lobortis ornare. Nunc venenatis + tortor in urna pulvinar pulvinar. Sed et mi nec justo hendrerit + cursus ac nec mauris. Morbi et ante a lorem iaculis rutrum vitae eu + massa. +

+
+
diff --git a/src/tests/regression_test_data/duplicate-page-duplicate.html b/src/tests/regression_test_data/duplicate-page-duplicate.html new file mode 100644 index 0000000..08e8bbe --- /dev/null +++ b/src/tests/regression_test_data/duplicate-page-duplicate.html @@ -0,0 +1,25 @@ +
+

+ Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla et + laoreet ligula. Nulla facilisi. Morbi condimentum molestie enim in + fermentum. Phasellus sit amet vehicula turpis. Sed eu dolor tortor, + et accumsan purus. Aliquam velit nisl, facilisis quis suscipit in, + porttitor at lorem. Ut adipiscing suscipit augue, id interdum arcu + ultricies et. Etiam risus sapien, suscipit et ultricies vel, + suscipit posuere velit. Proin est orci, sollicitudin at luctus + feugiat, consectetur a justo. Etiam nec sem vel massa consectetur + vulputate non interdum est. Donec sem dui, ultricies a adipiscing + eu, placerat sed sem. +

+

+ Nunc lacinia varius justo, at lacinia felis ultricies vel. Proin + vestibulum vehicula eleifend. Ut vitae risus eros. Pellentesque + habitant morbi tristique senectus et netus et malesuada fames ac + turpis egestas. In hac habitasse platea dictumst. Vivamus magna + libero, blandit vitae hendrerit porta, dapibus eget eros. Nunc + turpis felis, facilisis eu vestibulum sed, porta a ipsum. Vivamus + est velit, molestie sed molestie quis, tincidunt a diam. Quisque et + neque a ante fermentum tempus in at nunc. Nunc sit amet egestas + nisi. +

+
diff --git a/src/tests/regression_test_data/duplicate-page-unique.html b/src/tests/regression_test_data/duplicate-page-unique.html new file mode 100644 index 0000000..b92dc1d --- /dev/null +++ b/src/tests/regression_test_data/duplicate-page-unique.html @@ -0,0 +1,20 @@ +
+

+ Nunc non blandit velit. Maecenas suscipit sem sed velit tristique + facilisis. Quisque condimentum, nisi vitae dictum euismod, diam risus + vehicula nibh, in scelerisque lorem risus et risus. Aliquam erat + volutpat. Pellentesque habitant morbi tristique senectus et netus et + malesuada fames ac turpis egestas. Donec blandit venenatis feugiat. Ut + quis turpis ac urna consectetur sagittis. Vestibulum aliquet eros et + orci placerat vitae tempus tellus pretium. Quisque rutrum sapien quis + nibh facilisis quis posuere ipsum elementum. In ac pretium justo. Sed + egestas luctus mollis. Donec rutrum leo a turpis facilisis commodo. Nam + quis quam eget mi malesuada scelerisque. Pellentesque semper + condimentum sagittis. Nam lobortis, tortor ut placerat viverra, ante + felis vehicula sem, blandit ultricies purus urna eget elit. + Pellentesque habitant morbi tristique senectus et netus et malesuada + fames ac turpis egestas. Sed vel nulla sollicitudin dolor adipiscing + dapibus aliquam vitae leo. Phasellus at turpis tempus lectus + pellentesque faucibus. +

+
diff --git a/src/tests/regression_test_data/nytimes-001-rdbl.html b/src/tests/regression_test_data/nytimes-001-rdbl.html index 538bf94..c37d684 100644 --- a/src/tests/regression_test_data/nytimes-001-rdbl.html +++ b/src/tests/regression_test_data/nytimes-001-rdbl.html @@ -1,4 +1,4 @@ -

Robert Yager for The New York Times

+

Robert Yager for The New York Times

Gilligan on the set with the actors Bryan Cranston and Aaron Paul.

@@ -39,7 +39,7 @@ Gilligan has the nerve to provide his own hopeful answer. “Breaking Bad

David Segal is a reporter for the Business section of The New York Times. His most recent article in the magazine was about the New York Cosmos. Editor: Adam Sternbergh (a.sternbergh-MagGroup@nytimes.com)

-
+
@@ -62,7 +62,7 @@ After graduating, Gilligan won a screenplay contest in 1989, and one of the judg

David Segal is a reporter for the Business section of The New York Times. His most recent article in the magazine was about the New York Cosmos. Editor: Adam Sternbergh (a.sternbergh-MagGroup@nytimes.com)

-
+
@@ -83,7 +83,7 @@ That is something new. The depravities of leading men in TV dramas traditionally

David Segal is a reporter for the Business section of The New York Times. His most recent article in the magazine was about the New York Cosmos. Editor: Adam Sternbergh (a.sternbergh-MagGroup@nytimes.com)

-
+
@@ -109,7 +109,7 @@ This, it turns out, is an abbreviated version of a process that Gilligan goes th

David Segal is a reporter for the Business section of The New York Times. His most recent article in the magazine was about the New York Cosmos. Editor: Adam Sternbergh (a.sternbergh-MagGroup@nytimes.com)

-
+
@@ -131,27 +131,4 @@ Driving to the set after lunch one day, he told me that Walter White had started

David Segal is a reporter for the Business section of The New York Times. His most recent article in the magazine was about the New York Cosmos. Editor: Adam Sternbergh (a.sternbergh-MagGroup@nytimes.com)

-
- - - - - -

(Page 2 of 5)

-Which might make Gilligan TV’s first true red-state auteur. His characters lead middle-American lives in a middle-American place, and they are beset with middle-American problems. They speak like middle Americans too, and they inhabit a realm of moral ambiguities that’s overseen by a man with both a wicked sense of humor and a highly refined sense of right and wrong.

-
- -
-

-“If there’s a larger lesson to ‘Breaking Bad,’ it’s that actions have consequences,” Gilligan said during lunch one day in his trailer. “If religion is a reaction of man, and nothing more, it seems to me that it represents a human desire for wrongdoers to be punished. I hate the idea of Idi Amin living in Saudi Arabia for the last 25 years of his life. That galls me to no end.”

-He paused for a moment and speared a few tater tots in a white plastic-foam tray perched on his lap.

-“I feel some sort of need for biblical atonement, or justice, or something,” he said between chews. “I like to believe there is some comeuppance, that karma kicks in at some point, even if it takes years or decades to happen,” he went on. “My girlfriend says this great thing that’s become my philosophy as well. ‘I want to believe there’s a heaven. But I can’t not believe there’s a hell.’ ”

-‘Breaking Bad” was born out of a conversation in 2004 between Gilligan and a friend named Thomas Schnauz, who is now a writer on the show. Schnauz had just read a story about a man cooking meth in an apartment complex, which had sickened kids in apartments above. Saddam Hussein’s putative mobile chemical-weapons labs came up in the conversation, too.

-“Neither of us were working,” Schnauz says, “and we were like two 70-year-old men who like to complain about the world. And somehow we spun off into the idea of driving around in a mobile lab, cooking meth. It was a joke and not something I would have ever thought about again. But a couple days later Vince called back and said: ‘Remember we were talking about that mobile lab and meth? Do you mind if I run with that?’ ”

-A show about a very smart middle-aged guy who hadn’t quite achieved his dreams had a faintly autobiographical whiff for Gilligan at the time. He grew up in Farmville, Va., a town of roughly 6,000 people, not far from Appomattox, the site of the South’s surrender in the Civil War. His father was an insurance claims adjuster, and his mother was a grade-school teacher who had a brief career as a wing walker. “Vince was an acolyte in the Catholic Church,” Gail Gilligan says, though she notes that he also played Dungeons and Dragons. “There was certainly a lot of evil in that game, but it never seemed to affect him adversely.”

-Gilligan earned a partial scholarship to attend New York University’s film program, where his instructors included Jesse Kornbluth, who remembers a polite kid who was so good at drawing bent, violent characters that Kornbluth initially pegged him as the “go postal” type. “In the end, he turned us all into his audience,” Kornbluth said to me. “We were all just mesmerized. Attendance was unnaturally high on days when he was reading his scenes.”

-After graduating, Gilligan won a screenplay contest in 1989, and one of the judges, a producer named Mark Johnson (now an executive producer on “Breaking Bad”), helped him find an agent and sell scripts to Hollywood. Two of them, “Home Fries,” starring Drew Barrymore, and “Wilder Napalm,” starring Debra Winger and Dennis Quaid, were turned into films. It was a promising start. Gilligan bought a house outside Richmond, assuming that he would keep lobbing movie scripts to Los Angeles, which would keep lobbing money back. That did not happen. By 1994, the money dried up and he lost his writer’s guild health insurance. That year, his agent got Gilligan a meeting with Chris Carter, the creator of “The X-Files.”

-

David Segal is a reporter for the Business section of The New York Times. His most recent article in the magazine was about the New York Cosmos. Editor: Adam Sternbergh (a.sternbergh-MagGroup@nytimes.com)

-

-

-
\ No newline at end of file +
\ No newline at end of file diff --git a/src/tests/regression_test_data/nytimes-001.yaml b/src/tests/regression_test_data/nytimes-001.yaml index bddc6e0..5bb19a2 100644 --- a/src/tests/regression_test_data/nytimes-001.yaml +++ b/src/tests/regression_test_data/nytimes-001.yaml @@ -1,5 +1,5 @@ test_description: multi-page article from nytimes -notes: multi-page not yet implemented +notes: wrongly includes author identification from each page url: http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html url_map: http://www.nytimes.com/2011/07/10/magazine/the-dark-art-of-breaking-bad.html?pagewanted=2&_r=1: nytimes-001-orig-2.html