From 2c5ba594dd25ba0abb39e521625573beaabd5392 Mon Sep 17 00:00:00 2001 From: Nicolas Perriault Date: Wed, 6 May 2015 15:54:58 +0200 Subject: [PATCH] Refs #209 - Increase score for elements containing large amount of text. --- Readability.js | 6 +- test/test-pages/ars-1/expected.html | 55 +- .../sfgate-1/expected-metadata.json | 6 + test/test-pages/sfgate-1/expected.html | 292 + test/test-pages/sfgate-1/source.html | 9373 +++++++++++++++++ test/test-readability.js | 2 +- 6 files changed, 9711 insertions(+), 23 deletions(-) create mode 100644 test/test-pages/sfgate-1/expected-metadata.json create mode 100644 test/test-pages/sfgate-1/expected.html create mode 100644 test/test-pages/sfgate-1/source.html diff --git a/Readability.js b/Readability.js index f47fe36..4166e24 100644 --- a/Readability.js +++ b/Readability.js @@ -110,7 +110,7 @@ Readability.prototype = { unlikelyCandidates: /banner|combx|comment|community|disqus|extra|foot|header|menu|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i, okMaybeItsACandidate: /and|article|body|column|main|shadow/i, positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i, - negative: /hidden|banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i, + negative: /hidden|banner|combx|comment|com-|contact|control-?bar|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i, extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i, byline: /byline|author|dateline|writtenby/i, replaceFonts: /<(\/?)font[^>]*>/gi, @@ -722,6 +722,10 @@ Readability.prototype = { // For every 100 characters in this paragraph, add another point. Up to 3 points. contentScore += Math.min(Math.floor(innerText.length / 100), 3); + if (elementToScore.tagName !== "section" && innerText.length > 300) { + contentScore *= 1.5; + } + // Initialize and score ancestors. this._forEachNode(ancestors, function(ancestor, level) { if (!ancestor.tagName) diff --git a/test/test-pages/ars-1/expected.html b/test/test-pages/ars-1/expected.html index abc1fc9..cc95641 100644 --- a/test/test-pages/ars-1/expected.html +++ b/test/test-pages/ars-1/expected.html @@ -1,15 +1,18 @@
-
-
-
-
-

A flaw in the wildly popular online game Minecraft makes it easy for just about anyone to crash the server hosting the game, according to a computer programmer who has released proof-of-concept code that exploits the vulnerability.

-

"I thought a lot before writing this post," Pakistan-based developer Ammar Askar wrote in a blog post published Thursday, 21 months, he said, after privately reporting the bug to Minecraft developer Mojang. "On the one hand I don't want to expose thousands of servers to a major vulnerability, yet on the other hand Mojang has failed to act on it."

-

The bug resides in the networking internals of the Minecraft protocol. It allows the contents of inventory slots to be exchanged, so that, among other things, items in players' hotbars are displayed automatically after logging in. Minecraft items can also store arbitrary metadata in a file format known as Named Binary Tag (NBT), which allows complex data structures to be kept in hierarchical nests. Askar has released proof-of-concept attack code he said exploits the vulnerability to crash any server hosting the game. Here's how it works.

-
-

The vulnerability stems from the fact that the client is allowed to send the server information about certain slots. This, coupled with the NBT format’s nesting allows us to craft a packet that is incredibly complex for the server to deserialize but trivial for us to generate.

-

In my case, I chose to create lists within lists, down to five levels. This is a json representation of what it looks like.

-
rekt: {
+    
+
+
+
+
+
+
+

A flaw in the wildly popular online game Minecraft makes it easy for just about anyone to crash the server hosting the game, according to a computer programmer who has released proof-of-concept code that exploits the vulnerability.

+

"I thought a lot before writing this post," Pakistan-based developer Ammar Askar wrote in a blog post published Thursday, 21 months, he said, after privately reporting the bug to Minecraft developer Mojang. "On the one hand I don't want to expose thousands of servers to a major vulnerability, yet on the other hand Mojang has failed to act on it."

+

The bug resides in the networking internals of the Minecraft protocol. It allows the contents of inventory slots to be exchanged, so that, among other things, items in players' hotbars are displayed automatically after logging in. Minecraft items can also store arbitrary metadata in a file format known as Named Binary Tag (NBT), which allows complex data structures to be kept in hierarchical nests. Askar has released proof-of-concept attack code he said exploits the vulnerability to crash any server hosting the game. Here's how it works.

+
+

The vulnerability stems from the fact that the client is allowed to send the server information about certain slots. This, coupled with the NBT format’s nesting allows us to craft a packet that is incredibly complex for the server to deserialize but trivial for us to generate.

+

In my case, I chose to create lists within lists, down to five levels. This is a json representation of what it looks like.

+
rekt: {
     list: [
         list: [
             list: [
@@ -35,14 +38,24 @@
     ]
     ...
 }
-

The root of the object, rekt, contains 300 lists. Each list has a list with 10 sublists, and each of those sublists has 10 of their own, up until 5 levels of recursion. That’s a total of 10^5 * 300 = 30,000,000 lists.

-

And this isn’t even the theoretical maximum for this attack. Just the nbt data for this payload is 26.6 megabytes. But luckily Minecraft implements a way to compress large packets, lucky us! zlib shrinks down our evil data to a mere 39 kilobytes.

-

Note: in previous versions of Minecraft, there was no protocol wide compression for big packets. Previously, NBT was sent compressed with gzip and prefixed with a signed short of its length, which reduced our maximum payload size to 2^15 - 1. Now that the length is a varint capable of storing integers up to 2^28, our potential for attack has increased significantly.

-

When the server will decompress our data, it’ll have 27 megs in a buffer somewhere in memory, but that isn’t the bit that’ll kill it. When it attempts to parse it into NBT, it’ll create java representations of the objects meaning suddenly, the sever is having to create several million java objects including ArrayLists. This runs the server out of memory and causes tremendous CPU load.

-

This vulnerability exists on almost all previous and current Minecraft versions as of 1.8.3, the packets used as attack vectors are the 0x08: Block Placement Packet and 0x10: Creative Inventory Action.

-

The fix for this vulnerability isn’t exactly that hard, the client should never really send a data structure as complex as NBT of arbitrary size and if it must, some form of recursion and size limits should be implemented.

-

These were the fixes that I recommended to Mojang 2 years ago.

-
-

Ars is asking Mojang for comment and will update this post if company officials respond.

-
+

The root of the object, rekt, contains 300 lists. Each list has a list with 10 sublists, and each of those sublists has 10 of their own, up until 5 levels of recursion. That’s a total of 10^5 * 300 = 30,000,000 lists.

+

And this isn’t even the theoretical maximum for this attack. Just the nbt data for this payload is 26.6 megabytes. But luckily Minecraft implements a way to compress large packets, lucky us! zlib shrinks down our evil data to a mere 39 kilobytes.

+

Note: in previous versions of Minecraft, there was no protocol wide compression for big packets. Previously, NBT was sent compressed with gzip and prefixed with a signed short of its length, which reduced our maximum payload size to 2^15 - 1. Now that the length is a varint capable of storing integers up to 2^28, our potential for attack has increased significantly.

+

When the server will decompress our data, it’ll have 27 megs in a buffer somewhere in memory, but that isn’t the bit that’ll kill it. When it attempts to parse it into NBT, it’ll create java representations of the objects meaning suddenly, the sever is having to create several million java objects including ArrayLists. This runs the server out of memory and causes tremendous CPU load.

+

This vulnerability exists on almost all previous and current Minecraft versions as of 1.8.3, the packets used as attack vectors are the 0x08: Block Placement Packet and 0x10: Creative Inventory Action.

+

The fix for this vulnerability isn’t exactly that hard, the client should never really send a data structure as complex as NBT of arbitrary size and if it must, some form of recursion and size limits should be implemented.

+

These were the fixes that I recommended to Mojang 2 years ago.

+
+

Ars is asking Mojang for comment and will update this post if company officials respond.

+
+

Expand full story

+ +
+ + +
+
\ No newline at end of file diff --git a/test/test-pages/sfgate-1/expected-metadata.json b/test/test-pages/sfgate-1/expected-metadata.json new file mode 100644 index 0000000..e2d8140 --- /dev/null +++ b/test/test-pages/sfgate-1/expected-metadata.json @@ -0,0 +1,6 @@ +{ + "title": "Nob Hill one-bedroom sells for $2.3 million", + "byline": "By Emily Landes", + "excerpt": "In early March, we told you about a one-bedroom in the prestigious Comstock in Nob Hill, which came to market at $2.495 million, making it the most expensive one-bedroom on the market at the time. The northwest facing unit—with amazing panoramic views of Twin Peaks, the Golden Gate Bridge, the", + "readerable": true +} diff --git a/test/test-pages/sfgate-1/expected.html b/test/test-pages/sfgate-1/expected.html new file mode 100644 index 0000000..abd8f98 --- /dev/null +++ b/test/test-pages/sfgate-1/expected.html @@ -0,0 +1,292 @@ +
+
+ +

In early March, we told you about a one-bedroom in the prestigious Comstock in Nob Hill, which came to market at $2.495 million, making it the most expensive one-bedroom on the market at the time. The northwest facing unit—with amazing panoramic views of Twin Peaks, the Golden Gate Bridge, the North Bay, Alcatraz and Coit Tower in almost every room—has now sold for $2.3 million, or 8% under the asking price.

+

At just over 1,800 square feet, the sale works out to about $1,250 a square foot for the completely gutted and remodeled unit. That price is also about $700K higher than when the unit last sold at the end of 2006. Of course, it was also in a very different state at the time, with old carpeting; a closed-off, very outdated kitchen; and two bedrooms. (Before and afters are in the gallery above.)

+

Almost all the walls came down during the remodel and the carpeting was changed out for hardwoods, not to mention the addition of a completely new open concept kitchen and modern bathrooms. Also, the second bedroom was changed into a den/office with a glass wall separating it from the rest of the large entertaining space. Even the windows were updated to take better advantage of the unbelievable views.

+

By the way, if you’re looking for the same great views with a (slightly) smaller price tag, a 1,200-square-foot one-bedroom just came to market in the same building. You’ll get less indoor space but what appears to be a larger balcony to take in those incredible San Francisco sights, all for the bargain price of $1.75 million.

+

Emily Landes is a writer and editor who is obsessed with all things real estate.

+
+
\ No newline at end of file diff --git a/test/test-pages/sfgate-1/source.html b/test/test-pages/sfgate-1/source.html new file mode 100644 index 0000000..a52ed37 --- /dev/null +++ b/test/test-pages/sfgate-1/source.html @@ -0,0 +1,9373 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Nob Hill one-bedroom sells for $2.3 million - On The Block + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + CmfThirdPartyHeader - SFGate + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+ + +
+
+
+
+ +
+ + + + + +
+
+ +
+
+
+ + +
+
+ + + + + +
+ + +
+ + +
+
+
+ +
+
+
+
+ +
+
+
+
+
+ +
+
+
+
+
+ +
+
+ + + +
+
+
+
+

Nob Hill one-bedroom sells for $2.3 million

+
+ + +
+ + +
+
+

+ +

+ +

+ +

In early March, we told you about a one-bedroom in the prestigious Comstock in Nob Hill, which came to market at $2.495 million, making it the most expensive one-bedroom on the market at the time. The northwest facing unit—with amazing panoramic views of Twin Peaks, the Golden Gate Bridge, the North Bay, Alcatraz and Coit Tower in almost every room—has now sold for $2.3 million, or 8% under the asking price.

+

At just over 1,800 square feet, the sale works out to about $1,250 a square foot for the completely gutted and remodeled unit. That price is also about $700K higher than when the unit last sold at the end of 2006. Of course, it was also in a very different state at the time, with old carpeting; a closed-off, very outdated kitchen; and two bedrooms. (Before and afters are in the gallery above.)

+

Almost all the walls came down during the remodel and the carpeting was changed out for hardwoods, not to mention the addition of a completely new open concept kitchen and modern bathrooms. Also, the second bedroom was changed into a den/office with a glass wall separating it from the rest of the large entertaining space. Even the windows were updated to take better advantage of the unbelievable views.

+

By the way, if you’re looking for the same great views with a (slightly) smaller price tag, a 1,200-square-foot one-bedroom just came to market in the same building. You’ll get less indoor space but what appears to be a larger balcony to take in those incredible San Francisco sights, all for the bargain price of $1.75 million.

+

Emily Landes is a writer and editor who is obsessed with all things real estate.

+

 

+
+ + +
+
+
+
+ Emily Landes
+ +
+
+
+
+
+
+ +
+
+ + +
+
+ + + + +You Might Also Like + +
+ +
+
+
+
+ +
+ + +
+
+
+
+
+ +
+ + +
+
+
+
+
+ +
+ + +
+
+
+
+
+ +
+ + +
+
+
+
+
+
+ +
+
+ +
+
+ + + + + + +
+
+
+ +
+
+
+
+
+
+ + +
+
+
+ +
+ + + +
+ +
+
+
+
+
+ +
+
+
+ Guest +
+
+
+
+
+ +
+
+ + +
+
+
+
+ Follow +
+ +
+
+
+ +
+ +
+
+
+
+
+ + + +
+
+
+ + +
+ +
+
+ + + +
+
+ +
+ + + + + + + + + +
+ + + + + + + + +
+ + + + + + + + + + + + + + + + + +
Quantcast
+ + + diff --git a/test/test-readability.js b/test/test-readability.js index c92d475..a38ca5c 100644 --- a/test/test-readability.js +++ b/test/test-readability.js @@ -18,7 +18,7 @@ function reformatError(err) { function runTestsWithItems(label, beforeFn, expectedContent, expectedMetadata) { describe(label, function() { - this.timeout(5000); + this.timeout(10000); var result;