diff --git a/test/bootstrap.js b/test/bootstrap.js index baa45c1..050cf73 100644 --- a/test/bootstrap.js +++ b/test/bootstrap.js @@ -12,13 +12,31 @@ function readJSON(path) { var testPageRoot = path.join(__dirname, "test-pages"); -exports.getTestPages = function() { - return fs.readdirSync(testPageRoot).map(function(dir) { +exports.getExtractionTestPages = function() { + var testRootFolder = path.join(testPageRoot, "extraction"); + return fs.readdirSync(testRootFolder).filter(function(entry) { + return fs.statSync(path.join(testRootFolder, entry)).isDirectory(); + }).map(function(dir) { return { dir: dir, - source: readFile(path.join(testPageRoot, dir, "source.html")), - expectedContent: readFile(path.join(testPageRoot, dir, "expected.html")), - expectedMetadata: readJSON(path.join(testPageRoot, dir, "expected-metadata.json")), + source: readFile(path.join(testRootFolder, dir, "source.html")), + expectedContent: readFile(path.join(testRootFolder, dir, "expected.html")), + expectedMetadata: readJSON(path.join(testRootFolder, dir, "expected-metadata.json")), }; }); }; + +exports.getDetectionTestPages = function() { + var testRootFolder = path.join(testPageRoot, "detection"); + var readableFilesRoot = path.join(testRootFolder, "readerable"); + var readable = fs.readdirSync(readableFilesRoot).map(function(file) { + var source = readFile(path.join(readableFilesRoot, file)); + return {file: file, source: source, readerable: true}; + }); + var nonReadableFilesRoot = path.join(testRootFolder, "non-readerable"); + var nonReadable = fs.readdirSync(nonReadableFilesRoot).map(function(file) { + var source = readFile(path.join(nonReadableFilesRoot, file)); + return {file: file, source: source, readerable: false}; + }); + return readable.concat(nonReadable); +}; diff --git a/test/generate-testcase.js b/test/generate-testcase.js index 72f9e87..9d9cf4a 100644 --- a/test/generate-testcase.js +++ b/test/generate-testcase.js @@ -20,7 +20,7 @@ if (process.argv.length < 3) { var slug = process.argv[2]; var url = process.argv[3]; // Could be undefined, we'll warn if it is if that is an issue. -var destRoot = path.join(__dirname, "test-pages", slug); +var destRoot = path.join(__dirname, "test-pages", "extraction", slug); fs.mkdir(destRoot, function(err) { if (err) { diff --git a/test/test-pages/detection/non-readerable/single-image.html b/test/test-pages/detection/non-readerable/single-image.html new file mode 100644 index 0000000..1cad704 --- /dev/null +++ b/test/test-pages/detection/non-readerable/single-image.html @@ -0,0 +1,19 @@ + + + + I shouldn't be detected as readerable + + +
+
+
+
+
+ Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +
+
+
+
+
+ + diff --git a/test/test-pages/detection/readerable/craiglist.html b/test/test-pages/detection/readerable/craiglist.html new file mode 100644 index 0000000..5fb3d45 --- /dev/null +++ b/test/test-pages/detection/readerable/craiglist.html @@ -0,0 +1,317 @@ + + + + + apartments4rent.ro : CAROL villa apartment top floor with terrace + + + + + + + + + + + + + + + + + +
+ +
+ +
+
+
+ + reply + +
+ +

Posted: + +

+ + + + + +
+ +

+ + RON2800 / 1br - 50m2 - apartments4rent.ro : CAROL villa apartment top floor with terrace (downtown) +

+ +
+
+
< + + + > +
+ + + +
+
+

1BR / 1Ba 50m2 furnished apartment + available mar 23 +
street parking +
dogs are OK - wooof +

+

+
+
+
Welcome to RomPromo Plus Accommodation! +
In the heart of Bucharest AMZEI historic center, on Magheru Bld the main + blvd of the city, at the entrance of the Carol park, locations of our apartments, + but also of restaurants, night-clubs, casinos, 24/7 supermarket, .......,studios, + apartments 2 rooms, villa for short, medium or long terms rentals. +
More details at http://apartments4rent.ro +
+
VILLA CAROL ...... UP TO 3 PERSONS............. http://apartments4rent.ro/villacarol.php +
Apartment studio with a large terrace at the TOP floor of a new villa, +
at the entrance of the attractive Carol park, +
ALLphotos on this site are from this Apartment Villa ... +
For 1 month, Euro650 for 1 or 2 persons, +
Internet WiFi, linen, towels weekly provided with cleaning AND charges + INCLUDED. +
+
STUDIO AMZEI +
For 1 or 2 persons. +
http://apartments4rent.ro/studio_garsoniera_amzei.php +
+
2 ROOMS AMZEI +
Photos ............ http://apartments4rent.ro/2_rooms_amzei.php +
2 Rooms AMZEI beside the French embassy is a place of choice UP TO 4 PERSONS. +
+
STUDIO MAGHERU +
VIDEO: http://apartments4rent.ro/studio_garsoniera_magheru.php +
This suite is UP TO 3 PERSONS. +
+
VILLA ROMANA +
Villa close Romana square, UP TO 10 PERSONS, +
VIDEOS and PHOTOS at: http://apartments4rent.ro/villaromana.php +
+
Apply to us ...
+
    +
  • do NOT contact me with unsolicited services or offers
  • +
+
+

post id: 4943647026

+

posted: + +

+

updated: + +

+

email to friend +

+

best of [?] + +

+
+
+
+
+
+ +
+ + + + + + + \ No newline at end of file diff --git a/test/test-pages/detection/readerable/theverge.html b/test/test-pages/detection/readerable/theverge.html new file mode 100644 index 0000000..0c808df --- /dev/null +++ b/test/test-pages/detection/readerable/theverge.html @@ -0,0 +1,2946 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + All six Star Wars movies are coming to iTunes, Google Play, and other + video services | The Verge + + + + + + + +
+
+
+ +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+
+ +
+ +
+ + +

+ All six Star Wars movies are coming to iTunes, Google Play, and other video services +

+ + 147 + +
+
+
+ +
+
+ +
+
+ + + +
+

The Star Wars movies are coming to smartphones. All six movies — + yes, even The Phantom Menace, unfortunately — will be launched + on digital video services such as iTunes, Google Play, Amazon Instant Video, + and Xbox Video around the world on April 10th, Disney and Lucasfilm announced today. + The launch will allow fans to buy Digital HD versions of the movies individually, + or get them all at once as part of the Star Wars Digital Movie Collection.

+

The six movies each come with bonus features, including documentaries, + interviews with production staff, deleted scenes, and closer looks at the + films' models and sets. Some digital retailers are also offering extra + incentives to buy the movies from their marketplace — get the entire collection  + from Xbox Videoand you'll earn a digital R2-D2 to accompany your creepy, + blank-faced Xbox Live avatar, a pinball table for the free-to-play Pinball FX 2, + and access to an Xbox-only featurette.

+
+
+ +
+
+ + +

Buy the collection on Xbox Video to earn an imaginary R2-D2 +

+

A number of digital retailers have yet to specify the price for the collection, + but the whole bundle is available for $89 on the Google Play store, with + individual movies going for $19.99. There were rumors last year that + Disney was planning to re-release the original Star Wars movies on + Blu-ray without George Lucas' CGI spot-welding, but viewers don't appear + to be able to choose which version of the movie they want to watch with + the upcoming Digital HD editions. Purchasers of the new versions will have + to cope with unconvincing Dewbacks, Han having the gall to step on Jabba's + tail, and the disconcerting sight of a moody Hayden Christensen as Vader's + ghost at the Endor feast. At least they'll be in high resolution.

+
+
+
+Verge Video: Star Wars- Episode VII adds new cast and Millenium Falcon + + +
+
+
+ +
+ +
+
+
+

The best of Verge Video

+ +
+ + +
+
+
+ +
+
+
+
+
+ + + + + + + + + + + + + + +
+
+
+
+ +
+
+
+ + + +
+ + +
+
+
+
+ +
+
+ +
+
+ +
+ +
Back to top ^ + + +
+ + +
+
+
X + + +
+
+
Log In + Sign Up + +
+ + + + + + + + + + + + + + +

If you currently have a username with "@" in it, please email support@voxmedia.com.

+
+ +
+ +
+
+
+

+
+ forgot? + +
+
+ forgot? + +
+
+
+ + + + + +
+
+
+
+
Log In + Sign Up + +
+ + + + + + + + + + + + + + +
+ +
+ +
+
+
+

+ +
+ + +
+
+ + +
+
+ + +
+ +
+ +
+
+
+
+

Forgot password?

+ +

We'll email you a reset link.

+

If you signed up using a 3rd party account like Facebook or Twitter, please + loginwith it instead.

+
+ +
+ +
+
+
+
+

Forgot username?

+ +

We'll email it to you.

+

If you signed up using a 3rd party account like Facebook or Twitter, please + loginwith it instead.

+
+ +
+ +
+
+
+
+

Forgot password?

+ +

+

+

If you signed up using a 3rd party account like Facebook or Twitter, please + loginwith it instead.

Try another email? + +
+
+

Forgot username?

+ +

+

+

If you signed up using a 3rd party account like Facebook or Twitter, please + loginwith it instead.

Try another email? + +
+
+

Almost done,

+ +

+
+
+
By becoming a registered user, you are also agreeing to our Terms and confirming that + you have read our Privacy Policy.
+
+ +
+
+
+
+ Spinner.vc97ec6e +

Authenticating

+ +
+
+ +

Great!

+ +

Choose an available username to complete sign up.

+
+ +
+ + +
+
+ +
+
+
+
+ +

In order to provide our users with a better overall experience, we ask + for more information from Facebook when using it to login so that we can + learn more about our audience and provide you with the best possible experience. + We do not store specific user data and the sharing of it is not required + to login with Facebook.

+
+ + + + + + + + +
+ +
+
+
+
+ +
+ + + +
+ + + + +
+ +
+ + + + + + + \ No newline at end of file diff --git a/test/test-pages/001/expected-metadata.json b/test/test-pages/extraction/001/expected-metadata.json similarity index 60% rename from test/test-pages/001/expected-metadata.json rename to test/test-pages/extraction/001/expected-metadata.json index ee91098..ed9c5f7 100644 --- a/test/test-pages/001/expected-metadata.json +++ b/test/test-pages/extraction/001/expected-metadata.json @@ -1,6 +1,5 @@ { "title": "Get your Frontend JavaScript Code Covered | Code", "byline": "Nicolas Perriault —", - "excerpt": "Nicolas Perriault's homepage.", - "readerable": true + "excerpt": "Nicolas Perriault's homepage." } diff --git a/test/test-pages/001/expected.html b/test/test-pages/extraction/001/expected.html similarity index 100% rename from test/test-pages/001/expected.html rename to test/test-pages/extraction/001/expected.html diff --git a/test/test-pages/001/source.html b/test/test-pages/extraction/001/source.html similarity index 100% rename from test/test-pages/001/source.html rename to test/test-pages/extraction/001/source.html diff --git a/test/test-pages/002/expected-metadata.json b/test/test-pages/extraction/002/expected-metadata.json similarity index 80% rename from test/test-pages/002/expected-metadata.json rename to test/test-pages/extraction/002/expected-metadata.json index 9b020e4..608544a 100644 --- a/test/test-pages/002/expected-metadata.json +++ b/test/test-pages/extraction/002/expected-metadata.json @@ -1,6 +1,5 @@ { "title": "This API is so Fetching! ✩ Mozilla Hacks – the Web developer blog", "byline": "Nikhil Marathe", - "excerpt": "For more than a decade the Web has used XMLHttpRequest (XHR) to achieve asynchronous requests in JavaScript. While very useful, XHR is not a very ...", - "readerable": true + "excerpt": "For more than a decade the Web has used XMLHttpRequest (XHR) to achieve asynchronous requests in JavaScript. While very useful, XHR is not a very ..." } diff --git a/test/test-pages/002/expected.html b/test/test-pages/extraction/002/expected.html similarity index 100% rename from test/test-pages/002/expected.html rename to test/test-pages/extraction/002/expected.html diff --git a/test/test-pages/002/source.html b/test/test-pages/extraction/002/source.html similarity index 100% rename from test/test-pages/002/source.html rename to test/test-pages/extraction/002/source.html diff --git a/test/test-pages/extraction/about-com/expected-metadata.json b/test/test-pages/extraction/about-com/expected-metadata.json new file mode 100644 index 0000000..78a0f16 --- /dev/null +++ b/test/test-pages/extraction/about-com/expected-metadata.json @@ -0,0 +1,5 @@ +{ + "title": "Must-Have Accessories for Every Room", + "byline": "By Valerie Ott", + "excerpt": "Transform a room that feels unfinished into an inviting, interesting, and polished space by using this checklist of must-have accessories." +} diff --git a/test/test-pages/extraction/about-com/expected.html b/test/test-pages/extraction/about-com/expected.html new file mode 100644 index 0000000..b1d26ad --- /dev/null +++ b/test/test-pages/extraction/about-com/expected.html @@ -0,0 +1,66 @@ +
+
+

Have you ever decorated a room and felt it was missing something, but + you couldn't put your finger on what?  Do your rooms feel dull despite + your best efforts to make them inviting?  This list of must-have home + accessories can serve as your checklist as you work on decorating a room, + ensuring that you won't end up with a space that feels unfinished.  

+ + + +
+
\ No newline at end of file diff --git a/test/test-pages/extraction/about-com/source.html b/test/test-pages/extraction/about-com/source.html new file mode 100644 index 0000000..940a829 --- /dev/null +++ b/test/test-pages/extraction/about-com/source.html @@ -0,0 +1,1697 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Must-Have Accessories for Every Room + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+
+ + +
+
+
+
+ +
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+ +
+

What Every Room Needs

+ +
+
+
+ +
+
+
+ +
+
+
+ +
+
+
+
+
+
+ + +
+
+
+
+
+
+ Bedroom - Jodi Jacobson/E+/Getty Images +
+
+
Jodi Jacobson/E+/Getty Images + +
+
+
+
+
+ +

Have you ever decorated a room and felt it was missing something, but + you couldn't put your finger on what?  Do your rooms feel dull despite + your best efforts to make them inviting?  This list of must-have home + accessories can serve as your checklist as you work on decorating a room, + ensuring that you won't end up with a space that feels unfinished.  

+
    +
  • Color--All-white or all-beige rooms can feel sterile and lifeless. +  Monochromatic schemes are fine, just be sure to add some pops of + color with accessories.
  • +
+
+ +
    +
  • Something living--Houseplants do wonders for a space.  No green thumb? +  No problem!  Try out some trendy succulents, which are very hardy, or an easy-to-grow + houseplant.  Trees add wonderful height and life to a room as well. +  Alternatively, faux plants can look very realistic. Fresh-cut flowers + are another avenue, and can last up to two weeks if you care for them properly. +  A small vase of flowers on the kitchen counter or coffee table, or + a houseplant or two by a sunny window instantly makes a place feel more + homey.  Try it, you'll see.
  • +
  • Something black--I don't know where I first heard this piece of advice, + but it has stuck with me and I have tried it myself with a lot of success. +  If your room feels like it's missing something, try adding something + black.  Whether it's a piece of accent furniture, a set of picture + frames, or a chalkboard-painted sign, you'll see improvement in the room + by adding black, which seems to ground the space and bring order to + disparate pieces.
  • +
  • Texture--Bringing in texture through baskets, throws and rugs adds a layer + of interest and polish to a room.  If your sofa has great pillows, but still seems uninviting, a throw + can make a difference.  Likewise, layering a throw on the end of a + bed can tie everything together.  Woven baskets for firewood, toys, + extra pillows or books are another great way to add texture to a space, + not to mention restore order to a cluttered room.
  • +
+
+
    +
  • Books--I personally cannot imagine a room without books in it.  Whether + there are coffee table tomes in the living room, cookbooks in the + kitchen, or a basket of children's books in the family room, it's + easy to envision how books can live happily in any room.  You needn't + feel as though you have to be an avid reader, nor should you feel + like you have to have whole bookcases filled with books.  Books can simply + be used as beautiful objects in a display, and they can add height + to other items you might want to showcase.
  • +
  • Personality--If your room feels like a furniture showroom, chances are + you haven't added in enough of your own personality.  Your home should + be a reflection of who lives there, so try to highlight your family's hobbies, + interests, or travels through accessories that reflect those stories. +  Instead of your usual haunts, why not shop off the beaten track + at places like Etsy, garage sales, or flea markets?  Items that lend a + sense of history, as well as tell your own meaningful story, make + great conversation pieces and infuse a room with life.  Photographs + are only one way to do this.  Framed memorabilia, objects acquired + on trips, or collections are other ways to infuse your room with personality.
  • +
+
+
+ +
+ +
+
+
+
+ +
+Home Accessories Essentials +
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
© 2015 About.com — All rights reserved. + +
+
+ +
+
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/test/test-pages/base-url/expected-metadata.json b/test/test-pages/extraction/base-url/expected-metadata.json similarity index 87% rename from test/test-pages/base-url/expected-metadata.json rename to test/test-pages/extraction/base-url/expected-metadata.json index 1feb5ae..b9f22e6 100644 --- a/test/test-pages/base-url/expected-metadata.json +++ b/test/test-pages/extraction/base-url/expected-metadata.json @@ -1,6 +1,5 @@ { "title": "Base URL test", "byline": null, - "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,\n quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo\n consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse\n cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non\n proident, sunt in culpa qui officia deserunt mollit anim id est laborum.", - "readerable": true + "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,\n quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo\n consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse\n cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non\n proident, sunt in culpa qui officia deserunt mollit anim id est laborum." } diff --git a/test/test-pages/base-url/expected.html b/test/test-pages/extraction/base-url/expected.html similarity index 100% rename from test/test-pages/base-url/expected.html rename to test/test-pages/extraction/base-url/expected.html diff --git a/test/test-pages/base-url/source.html b/test/test-pages/extraction/base-url/source.html similarity index 100% rename from test/test-pages/base-url/source.html rename to test/test-pages/extraction/base-url/source.html diff --git a/test/test-pages/basic-tags-cleaning/expected-metadata.json b/test/test-pages/extraction/basic-tags-cleaning/expected-metadata.json similarity index 81% rename from test/test-pages/basic-tags-cleaning/expected-metadata.json rename to test/test-pages/extraction/basic-tags-cleaning/expected-metadata.json index baa8c73..4fd25ab 100644 --- a/test/test-pages/basic-tags-cleaning/expected-metadata.json +++ b/test/test-pages/extraction/basic-tags-cleaning/expected-metadata.json @@ -1,6 +1,5 @@ { "title": "Basic tag cleaning test", "byline": null, - "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua.", - "readerable": true + "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua." } diff --git a/test/test-pages/basic-tags-cleaning/expected.html b/test/test-pages/extraction/basic-tags-cleaning/expected.html similarity index 100% rename from test/test-pages/basic-tags-cleaning/expected.html rename to test/test-pages/extraction/basic-tags-cleaning/expected.html diff --git a/test/test-pages/basic-tags-cleaning/source.html b/test/test-pages/extraction/basic-tags-cleaning/source.html similarity index 100% rename from test/test-pages/basic-tags-cleaning/source.html rename to test/test-pages/extraction/basic-tags-cleaning/source.html diff --git a/test/test-pages/comment-inside-script-parsing/expected-metadata.json b/test/test-pages/extraction/comment-inside-script-parsing/expected-metadata.json similarity index 81% rename from test/test-pages/comment-inside-script-parsing/expected-metadata.json rename to test/test-pages/extraction/comment-inside-script-parsing/expected-metadata.json index d3857ba..c0520af 100644 --- a/test/test-pages/comment-inside-script-parsing/expected-metadata.json +++ b/test/test-pages/extraction/comment-inside-script-parsing/expected-metadata.json @@ -1,6 +1,5 @@ { "title": "Test script parsing", "byline": null, - "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua.", - "readerable": true + "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua." } diff --git a/test/test-pages/comment-inside-script-parsing/expected.html b/test/test-pages/extraction/comment-inside-script-parsing/expected.html similarity index 100% rename from test/test-pages/comment-inside-script-parsing/expected.html rename to test/test-pages/extraction/comment-inside-script-parsing/expected.html diff --git a/test/test-pages/comment-inside-script-parsing/source.html b/test/test-pages/extraction/comment-inside-script-parsing/source.html similarity index 100% rename from test/test-pages/comment-inside-script-parsing/source.html rename to test/test-pages/extraction/comment-inside-script-parsing/source.html diff --git a/test/test-pages/extraction/craiglist/expected-metadata.json b/test/test-pages/extraction/craiglist/expected-metadata.json new file mode 100644 index 0000000..a68297f --- /dev/null +++ b/test/test-pages/extraction/craiglist/expected-metadata.json @@ -0,0 +1,5 @@ +{ + "title": "CAROL villa apartment top floor with terrace", + "byline": null, + "excerpt": "Welcome to RomPromo Plus Accommodation! In the heart of Bucharest AMZEI historic center, on Magheru Bld the main blvd of the city, at the entrance of the Carol park, locations of our apartments, but..." +} diff --git a/test/test-pages/extraction/craiglist/expected.html b/test/test-pages/extraction/craiglist/expected.html new file mode 100644 index 0000000..ae3cee0 --- /dev/null +++ b/test/test-pages/extraction/craiglist/expected.html @@ -0,0 +1,77 @@ +
+
+
+

< + + + > +

+ +
+
+

1BR / 1Ba 50m2 furnished apartment + available mar 23 +
street parking + +
dogs are OK - wooof + +

+
+
Welcome to RomPromo Plus Accommodation! +
In the heart of Bucharest AMZEI historic center, on Magheru Bld the main + blvd of the city, at the entrance of the Carol park, locations of our apartments, + but also of restaurants, night-clubs, casinos, 24/7 supermarket, .......,studios, + apartments 2 rooms, villa for short, medium or long terms rentals. +
More details at http://apartments4rent.ro +

VILLA CAROL ...... UP TO 3 PERSONS............. http://apartments4rent.ro/villacarol.php

+
Apartment studio with a large terrace at the TOP floor of a new villa, +
at the entrance of the attractive Carol park, +
ALLphotos on this site are from this Apartment Villa ... +
For 1 month, Euro650 for 1 or 2 persons, +
Internet WiFi, linen, towels weekly provided with cleaning AND charges + INCLUDED. +

STUDIO AMZEI

+
For 1 or 2 persons. +
http://apartments4rent.ro/studio_garsoniera_amzei.php +

2 ROOMS AMZEI

+
Photos ............ http://apartments4rent.ro/2_rooms_amzei.php +
2 Rooms AMZEI beside the French embassy is a place of choice UP TO 4 PERSONS. +

STUDIO MAGHERU

+
VIDEO: http://apartments4rent.ro/studio_garsoniera_magheru.php +
This suite is UP TO 3 PERSONS. +

VILLA ROMANA

+
Villa close Romana square, UP TO 10 PERSONS, +
VIDEOS and PHOTOS at: http://apartments4rent.ro/villaromana.php +

Apply to us ...

+
+ +
+

post id: 4943647026

+

posted: + +

+

updated: + +

+

email to friend + +

+

best of [?] + +

+
+
+
\ No newline at end of file diff --git a/test/test-pages/extraction/craiglist/source.html b/test/test-pages/extraction/craiglist/source.html new file mode 100644 index 0000000..5fb3d45 --- /dev/null +++ b/test/test-pages/extraction/craiglist/source.html @@ -0,0 +1,317 @@ + + + + + apartments4rent.ro : CAROL villa apartment top floor with terrace + + + + + + + + + + + + + + + + + +
+ +
+ +
+
+
+ + reply + +
+ +

Posted: + +

+ + + + + +
+ +

+ + RON2800 / 1br - 50m2 - apartments4rent.ro : CAROL villa apartment top floor with terrace (downtown) +

+ +
+
+
< + + + > +
+ + + +
+
+

1BR / 1Ba 50m2 furnished apartment + available mar 23 +
street parking +
dogs are OK - wooof +

+

+
+
+
Welcome to RomPromo Plus Accommodation! +
In the heart of Bucharest AMZEI historic center, on Magheru Bld the main + blvd of the city, at the entrance of the Carol park, locations of our apartments, + but also of restaurants, night-clubs, casinos, 24/7 supermarket, .......,studios, + apartments 2 rooms, villa for short, medium or long terms rentals. +
More details at http://apartments4rent.ro +
+
VILLA CAROL ...... UP TO 3 PERSONS............. http://apartments4rent.ro/villacarol.php +
Apartment studio with a large terrace at the TOP floor of a new villa, +
at the entrance of the attractive Carol park, +
ALLphotos on this site are from this Apartment Villa ... +
For 1 month, Euro650 for 1 or 2 persons, +
Internet WiFi, linen, towels weekly provided with cleaning AND charges + INCLUDED. +
+
STUDIO AMZEI +
For 1 or 2 persons. +
http://apartments4rent.ro/studio_garsoniera_amzei.php +
+
2 ROOMS AMZEI +
Photos ............ http://apartments4rent.ro/2_rooms_amzei.php +
2 Rooms AMZEI beside the French embassy is a place of choice UP TO 4 PERSONS. +
+
STUDIO MAGHERU +
VIDEO: http://apartments4rent.ro/studio_garsoniera_magheru.php +
This suite is UP TO 3 PERSONS. +
+
VILLA ROMANA +
Villa close Romana square, UP TO 10 PERSONS, +
VIDEOS and PHOTOS at: http://apartments4rent.ro/villaromana.php +
+
Apply to us ...
+
    +
  • do NOT contact me with unsolicited services or offers
  • +
+
+

post id: 4943647026

+

posted: + +

+

updated: + +

+

email to friend +

+

best of [?] + +

+
+
+
+
+
+ +
+ + + + + + + \ No newline at end of file diff --git a/test/test-pages/embedded-videos/expected-metadata.json b/test/test-pages/extraction/embedded-videos/expected-metadata.json similarity index 87% rename from test/test-pages/embedded-videos/expected-metadata.json rename to test/test-pages/extraction/embedded-videos/expected-metadata.json index 4e8dcb9..8663ff4 100644 --- a/test/test-pages/embedded-videos/expected-metadata.json +++ b/test/test-pages/extraction/embedded-videos/expected-metadata.json @@ -1,6 +1,5 @@ { "title": "Embedded videos test", "byline": null, - "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,\n quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo\n consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse\n cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non\n proident, sunt in culpa qui officia deserunt mollit anim id est laborum.", - "readerable": true + "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,\n quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo\n consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse\n cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non\n proident, sunt in culpa qui officia deserunt mollit anim id est laborum." } diff --git a/test/test-pages/embedded-videos/expected.html b/test/test-pages/extraction/embedded-videos/expected.html similarity index 100% rename from test/test-pages/embedded-videos/expected.html rename to test/test-pages/extraction/embedded-videos/expected.html diff --git a/test/test-pages/embedded-videos/source.html b/test/test-pages/extraction/embedded-videos/source.html similarity index 100% rename from test/test-pages/embedded-videos/source.html rename to test/test-pages/extraction/embedded-videos/source.html diff --git a/test/test-pages/herald-sun-1/expected-metadata.json b/test/test-pages/extraction/herald-sun-1/expected-metadata.json similarity index 83% rename from test/test-pages/herald-sun-1/expected-metadata.json rename to test/test-pages/extraction/herald-sun-1/expected-metadata.json index 6c8b59b..07a5e91 100644 --- a/test/test-pages/herald-sun-1/expected-metadata.json +++ b/test/test-pages/extraction/herald-sun-1/expected-metadata.json @@ -1,6 +1,5 @@ { "title": "Angry media won’t buckle over new surveillance laws\n\t\t\t\t\t\t| Herald Sun", "byline": "JOE HILDEBRAND", - "excerpt": "A HIGH-powered federal government team has been doing the rounds of media organisations in the past few days in an attempt to allay concerns about the impact of new surveillance legislation on press freedom. It failed.", - "readerable": true + "excerpt": "A HIGH-powered federal government team has been doing the rounds of media organisations in the past few days in an attempt to allay concerns about the impact of new surveillance legislation on press freedom. It failed." } diff --git a/test/test-pages/herald-sun-1/expected.html b/test/test-pages/extraction/herald-sun-1/expected.html similarity index 100% rename from test/test-pages/herald-sun-1/expected.html rename to test/test-pages/extraction/herald-sun-1/expected.html diff --git a/test/test-pages/herald-sun-1/source.html b/test/test-pages/extraction/herald-sun-1/source.html similarity index 100% rename from test/test-pages/herald-sun-1/source.html rename to test/test-pages/extraction/herald-sun-1/source.html diff --git a/test/test-pages/keep-images/expected-metadata.json b/test/test-pages/extraction/keep-images/expected-metadata.json similarity index 86% rename from test/test-pages/keep-images/expected-metadata.json rename to test/test-pages/extraction/keep-images/expected-metadata.json index a62d0ab..1570581 100644 --- a/test/test-pages/keep-images/expected-metadata.json +++ b/test/test-pages/extraction/keep-images/expected-metadata.json @@ -1,6 +1,5 @@ { "title": "Inside the Deep Web Drug Lab — Backchannel — Medium", "byline": "Joseph Cox", - "excerpt": "Welcome to DoctorX’s Barcelona lab, where the drugs you bought online are tested for safety and purity. No questions ask…", - "readerable": true + "excerpt": "Welcome to DoctorX’s Barcelona lab, where the drugs you bought online are tested for safety and purity. No questions ask…" } diff --git a/test/test-pages/keep-images/expected.html b/test/test-pages/extraction/keep-images/expected.html similarity index 100% rename from test/test-pages/keep-images/expected.html rename to test/test-pages/extraction/keep-images/expected.html diff --git a/test/test-pages/keep-images/source.html b/test/test-pages/extraction/keep-images/source.html similarity index 100% rename from test/test-pages/keep-images/source.html rename to test/test-pages/extraction/keep-images/source.html diff --git a/test/test-pages/lifehacker-post-comment-load/expected-metadata.json b/test/test-pages/extraction/lifehacker-post-comment-load/expected-metadata.json similarity index 82% rename from test/test-pages/lifehacker-post-comment-load/expected-metadata.json rename to test/test-pages/extraction/lifehacker-post-comment-load/expected-metadata.json index ead5dca..88e9ff2 100644 --- a/test/test-pages/lifehacker-post-comment-load/expected-metadata.json +++ b/test/test-pages/extraction/lifehacker-post-comment-load/expected-metadata.json @@ -1,6 +1,5 @@ { "title": "How to Program Your Mind to Stop Buying Crap You Don’t Need", "byline": "Patrick Allan", - "excerpt": "We all buy things from time to time that we don't really need. It's okay to appeal to your wants every once in a while, as long as you're in control. If you struggle with clutter, impulse buys, and buyer's remorse, here's how to put your mind in the right place before you even set foot in a store.", - "readerable": true + "excerpt": "We all buy things from time to time that we don't really need. It's okay to appeal to your wants every once in a while, as long as you're in control. If you struggle with clutter, impulse buys, and buyer's remorse, here's how to put your mind in the right place before you even set foot in a store." } diff --git a/test/test-pages/lifehacker-post-comment-load/expected.html b/test/test-pages/extraction/lifehacker-post-comment-load/expected.html similarity index 100% rename from test/test-pages/lifehacker-post-comment-load/expected.html rename to test/test-pages/extraction/lifehacker-post-comment-load/expected.html diff --git a/test/test-pages/lifehacker-post-comment-load/source.html b/test/test-pages/extraction/lifehacker-post-comment-load/source.html similarity index 100% rename from test/test-pages/lifehacker-post-comment-load/source.html rename to test/test-pages/extraction/lifehacker-post-comment-load/source.html diff --git a/test/test-pages/lifehacker-working/expected-metadata.json b/test/test-pages/extraction/lifehacker-working/expected-metadata.json similarity index 82% rename from test/test-pages/lifehacker-working/expected-metadata.json rename to test/test-pages/extraction/lifehacker-working/expected-metadata.json index ead5dca..88e9ff2 100644 --- a/test/test-pages/lifehacker-working/expected-metadata.json +++ b/test/test-pages/extraction/lifehacker-working/expected-metadata.json @@ -1,6 +1,5 @@ { "title": "How to Program Your Mind to Stop Buying Crap You Don’t Need", "byline": "Patrick Allan", - "excerpt": "We all buy things from time to time that we don't really need. It's okay to appeal to your wants every once in a while, as long as you're in control. If you struggle with clutter, impulse buys, and buyer's remorse, here's how to put your mind in the right place before you even set foot in a store.", - "readerable": true + "excerpt": "We all buy things from time to time that we don't really need. It's okay to appeal to your wants every once in a while, as long as you're in control. If you struggle with clutter, impulse buys, and buyer's remorse, here's how to put your mind in the right place before you even set foot in a store." } diff --git a/test/test-pages/lifehacker-working/expected.html b/test/test-pages/extraction/lifehacker-working/expected.html similarity index 100% rename from test/test-pages/lifehacker-working/expected.html rename to test/test-pages/extraction/lifehacker-working/expected.html diff --git a/test/test-pages/lifehacker-working/source.html b/test/test-pages/extraction/lifehacker-working/source.html similarity index 100% rename from test/test-pages/lifehacker-working/source.html rename to test/test-pages/extraction/lifehacker-working/source.html diff --git a/test/test-pages/medium-1/expected-metadata.json b/test/test-pages/extraction/medium-1/expected-metadata.json similarity index 84% rename from test/test-pages/medium-1/expected-metadata.json rename to test/test-pages/extraction/medium-1/expected-metadata.json index 232a067..3bc073b 100644 --- a/test/test-pages/medium-1/expected-metadata.json +++ b/test/test-pages/extraction/medium-1/expected-metadata.json @@ -1,6 +1,5 @@ { "title": "Better Student Journalism — Medium", "byline": "Pippin Lee", - "excerpt": "We pushed out the first version of the Open Journalism site in January. Here’s what we’ve learned about student journali…", - "readerable": true + "excerpt": "We pushed out the first version of the Open Journalism site in January. Here’s what we’ve learned about student journali…" } diff --git a/test/test-pages/medium-1/expected.html b/test/test-pages/extraction/medium-1/expected.html similarity index 100% rename from test/test-pages/medium-1/expected.html rename to test/test-pages/extraction/medium-1/expected.html diff --git a/test/test-pages/medium-1/source.html b/test/test-pages/extraction/medium-1/source.html similarity index 100% rename from test/test-pages/medium-1/source.html rename to test/test-pages/extraction/medium-1/source.html diff --git a/test/test-pages/medium-2/expected-metadata.json b/test/test-pages/extraction/medium-2/expected-metadata.json similarity index 85% rename from test/test-pages/medium-2/expected-metadata.json rename to test/test-pages/extraction/medium-2/expected-metadata.json index dade1f4..27bbb9e 100644 --- a/test/test-pages/medium-2/expected-metadata.json +++ b/test/test-pages/extraction/medium-2/expected-metadata.json @@ -1,6 +1,5 @@ { "title": "On Behalf of “Literally” — Medium", "byline": "Courtney Kirchoff", - "excerpt": "In defense of the word “literally” and why you or someone you know should stop misusing the word, lest they drive us fig…", - "readerable": true + "excerpt": "In defense of the word “literally” and why you or someone you know should stop misusing the word, lest they drive us fig…" } diff --git a/test/test-pages/medium-2/expected.html b/test/test-pages/extraction/medium-2/expected.html similarity index 100% rename from test/test-pages/medium-2/expected.html rename to test/test-pages/extraction/medium-2/expected.html diff --git a/test/test-pages/medium-2/source.html b/test/test-pages/extraction/medium-2/source.html similarity index 100% rename from test/test-pages/medium-2/source.html rename to test/test-pages/extraction/medium-2/source.html diff --git a/test/test-pages/missing-paragraphs/expected-metadata.json b/test/test-pages/extraction/missing-paragraphs/expected-metadata.json similarity index 95% rename from test/test-pages/missing-paragraphs/expected-metadata.json rename to test/test-pages/extraction/missing-paragraphs/expected-metadata.json index 80c8500..5471b7a 100644 --- a/test/test-pages/missing-paragraphs/expected-metadata.json +++ b/test/test-pages/extraction/missing-paragraphs/expected-metadata.json @@ -1,6 +1,5 @@ { "title": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy\n eirmod tempor invidunt", "byline": "Henri Sivonen", - "excerpt": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy\n eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam\n voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet\n clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit\n amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam\n nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat,\n sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum.\n Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor\n sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed\n diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat,\n sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum.\n Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor\n sit amet.", - "readerable": true + "excerpt": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy\n eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam\n voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet\n clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit\n amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam\n nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat,\n sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum.\n Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor\n sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed\n diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat,\n sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum.\n Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor\n sit amet." } diff --git a/test/test-pages/missing-paragraphs/expected.html b/test/test-pages/extraction/missing-paragraphs/expected.html similarity index 100% rename from test/test-pages/missing-paragraphs/expected.html rename to test/test-pages/extraction/missing-paragraphs/expected.html diff --git a/test/test-pages/missing-paragraphs/source.html b/test/test-pages/extraction/missing-paragraphs/source.html similarity index 100% rename from test/test-pages/missing-paragraphs/source.html rename to test/test-pages/extraction/missing-paragraphs/source.html diff --git a/test/test-pages/normalize-spaces/expected-metadata.json b/test/test-pages/extraction/normalize-spaces/expected-metadata.json similarity index 85% rename from test/test-pages/normalize-spaces/expected-metadata.json rename to test/test-pages/extraction/normalize-spaces/expected-metadata.json index 78039d8..7300185 100644 --- a/test/test-pages/normalize-spaces/expected-metadata.json +++ b/test/test-pages/extraction/normalize-spaces/expected-metadata.json @@ -1,6 +1,5 @@ { "title": "Normalize space test", "byline": null, - "excerpt": "Lorem\n ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n\ttab here\n incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,\n quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo\n consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse\n cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non\n proident, sunt in culpa qui officia deserunt mollit anim id est laborum.", - "readerable": true + "excerpt": "Lorem\n ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n\ttab here\n incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,\n quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo\n consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse\n cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non\n proident, sunt in culpa qui officia deserunt mollit anim id est laborum." } diff --git a/test/test-pages/normalize-spaces/expected.html b/test/test-pages/extraction/normalize-spaces/expected.html similarity index 100% rename from test/test-pages/normalize-spaces/expected.html rename to test/test-pages/extraction/normalize-spaces/expected.html diff --git a/test/test-pages/normalize-spaces/source.html b/test/test-pages/extraction/normalize-spaces/source.html similarity index 100% rename from test/test-pages/normalize-spaces/source.html rename to test/test-pages/extraction/normalize-spaces/source.html diff --git a/test/test-pages/remove-extra-brs/expected-metadata.json b/test/test-pages/extraction/remove-extra-brs/expected-metadata.json similarity index 81% rename from test/test-pages/remove-extra-brs/expected-metadata.json rename to test/test-pages/extraction/remove-extra-brs/expected-metadata.json index 25e04b4..3eb3ebb 100644 --- a/test/test-pages/remove-extra-brs/expected-metadata.json +++ b/test/test-pages/extraction/remove-extra-brs/expected-metadata.json @@ -1,6 +1,5 @@ { "title": "Remove trailing brs test", "byline": null, - "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua.", - "readerable": true + "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua." } diff --git a/test/test-pages/remove-extra-brs/expected.html b/test/test-pages/extraction/remove-extra-brs/expected.html similarity index 100% rename from test/test-pages/remove-extra-brs/expected.html rename to test/test-pages/extraction/remove-extra-brs/expected.html diff --git a/test/test-pages/remove-extra-brs/source.html b/test/test-pages/extraction/remove-extra-brs/source.html similarity index 100% rename from test/test-pages/remove-extra-brs/source.html rename to test/test-pages/extraction/remove-extra-brs/source.html diff --git a/test/test-pages/remove-extra-paragraphs/expected-metadata.json b/test/test-pages/extraction/remove-extra-paragraphs/expected-metadata.json similarity index 81% rename from test/test-pages/remove-extra-paragraphs/expected-metadata.json rename to test/test-pages/extraction/remove-extra-paragraphs/expected-metadata.json index 8e841af..662b7a2 100644 --- a/test/test-pages/remove-extra-paragraphs/expected-metadata.json +++ b/test/test-pages/extraction/remove-extra-paragraphs/expected-metadata.json @@ -1,6 +1,5 @@ { "title": "Replace font tags test", "byline": null, - "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua.", - "readerable": true + "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua." } diff --git a/test/test-pages/remove-extra-paragraphs/expected.html b/test/test-pages/extraction/remove-extra-paragraphs/expected.html similarity index 100% rename from test/test-pages/remove-extra-paragraphs/expected.html rename to test/test-pages/extraction/remove-extra-paragraphs/expected.html diff --git a/test/test-pages/remove-extra-paragraphs/source.html b/test/test-pages/extraction/remove-extra-paragraphs/source.html similarity index 100% rename from test/test-pages/remove-extra-paragraphs/source.html rename to test/test-pages/extraction/remove-extra-paragraphs/source.html diff --git a/test/test-pages/remove-script-tags/expected-metadata.json b/test/test-pages/extraction/remove-script-tags/expected-metadata.json similarity index 81% rename from test/test-pages/remove-script-tags/expected-metadata.json rename to test/test-pages/extraction/remove-script-tags/expected-metadata.json index 2ef0848..707383d 100644 --- a/test/test-pages/remove-script-tags/expected-metadata.json +++ b/test/test-pages/extraction/remove-script-tags/expected-metadata.json @@ -1,6 +1,5 @@ { "title": "Remove script tags test", "byline": null, - "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua.", - "readerable": true + "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua." } diff --git a/test/test-pages/remove-script-tags/expected.html b/test/test-pages/extraction/remove-script-tags/expected.html similarity index 100% rename from test/test-pages/remove-script-tags/expected.html rename to test/test-pages/extraction/remove-script-tags/expected.html diff --git a/test/test-pages/remove-script-tags/source.html b/test/test-pages/extraction/remove-script-tags/source.html similarity index 100% rename from test/test-pages/remove-script-tags/source.html rename to test/test-pages/extraction/remove-script-tags/source.html diff --git a/test/test-pages/reordering-paragraphs/expected-metadata.json b/test/test-pages/extraction/reordering-paragraphs/expected-metadata.json similarity index 94% rename from test/test-pages/reordering-paragraphs/expected-metadata.json rename to test/test-pages/extraction/reordering-paragraphs/expected-metadata.json index ca7653a..1c13f8c 100644 --- a/test/test-pages/reordering-paragraphs/expected-metadata.json +++ b/test/test-pages/extraction/reordering-paragraphs/expected-metadata.json @@ -1,6 +1,5 @@ { "title": "", "byline": null, - "excerpt": "Regarding item# 11111, under sufficiently extreme conditions, quarks may\n become deconfined and exist as free particles. In the course of asymptotic\n freedom, the strong interaction becomes weaker at higher temperatures.\n Eventually, color confinement would be lost and an extremely hot plasma\n of freely moving quarks and gluons would be formed. This theoretical phase\n of matter is called quark-gluon plasma.[81] The exact conditions needed\n to give rise to this state are unknown and have been the subject of a great\n deal of speculation and experimentation.", - "readerable": true + "excerpt": "Regarding item# 11111, under sufficiently extreme conditions, quarks may\n become deconfined and exist as free particles. In the course of asymptotic\n freedom, the strong interaction becomes weaker at higher temperatures.\n Eventually, color confinement would be lost and an extremely hot plasma\n of freely moving quarks and gluons would be formed. This theoretical phase\n of matter is called quark-gluon plasma.[81] The exact conditions needed\n to give rise to this state are unknown and have been the subject of a great\n deal of speculation and experimentation." } diff --git a/test/test-pages/reordering-paragraphs/expected.html b/test/test-pages/extraction/reordering-paragraphs/expected.html similarity index 100% rename from test/test-pages/reordering-paragraphs/expected.html rename to test/test-pages/extraction/reordering-paragraphs/expected.html diff --git a/test/test-pages/reordering-paragraphs/source.html b/test/test-pages/extraction/reordering-paragraphs/source.html similarity index 100% rename from test/test-pages/reordering-paragraphs/source.html rename to test/test-pages/extraction/reordering-paragraphs/source.html diff --git a/test/test-pages/replace-brs/expected-metadata.json b/test/test-pages/extraction/replace-brs/expected-metadata.json similarity index 51% rename from test/test-pages/replace-brs/expected-metadata.json rename to test/test-pages/extraction/replace-brs/expected-metadata.json index 9654255..1da4929 100644 --- a/test/test-pages/replace-brs/expected-metadata.json +++ b/test/test-pages/extraction/replace-brs/expected-metadata.json @@ -1,6 +1,5 @@ { "title": "Replace brs test", "byline": null, - "excerpt": "Lorem ipsum", - "readerable": true + "excerpt": "Lorem ipsum" } diff --git a/test/test-pages/replace-brs/expected.html b/test/test-pages/extraction/replace-brs/expected.html similarity index 100% rename from test/test-pages/replace-brs/expected.html rename to test/test-pages/extraction/replace-brs/expected.html diff --git a/test/test-pages/replace-brs/source.html b/test/test-pages/extraction/replace-brs/source.html similarity index 100% rename from test/test-pages/replace-brs/source.html rename to test/test-pages/extraction/replace-brs/source.html diff --git a/test/test-pages/replace-font-tags/expected-metadata.json b/test/test-pages/extraction/replace-font-tags/expected-metadata.json similarity index 87% rename from test/test-pages/replace-font-tags/expected-metadata.json rename to test/test-pages/extraction/replace-font-tags/expected-metadata.json index 036066a..501704f 100644 --- a/test/test-pages/replace-font-tags/expected-metadata.json +++ b/test/test-pages/extraction/replace-font-tags/expected-metadata.json @@ -1,6 +1,5 @@ { "title": "Replace font tags test", "byline": null, - "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,\n quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo\n consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse\n cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non\n proident, sunt in culpa qui officia deserunt mollit anim id est laborum.", - "readerable": true + "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,\n quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo\n consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse\n cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non\n proident, sunt in culpa qui officia deserunt mollit anim id est laborum." } diff --git a/test/test-pages/replace-font-tags/expected.html b/test/test-pages/extraction/replace-font-tags/expected.html similarity index 100% rename from test/test-pages/replace-font-tags/expected.html rename to test/test-pages/extraction/replace-font-tags/expected.html diff --git a/test/test-pages/replace-font-tags/source.html b/test/test-pages/extraction/replace-font-tags/source.html similarity index 100% rename from test/test-pages/replace-font-tags/source.html rename to test/test-pages/extraction/replace-font-tags/source.html diff --git a/test/test-pages/salon-1/expected-metadata.json b/test/test-pages/extraction/salon-1/expected-metadata.json similarity index 71% rename from test/test-pages/salon-1/expected-metadata.json rename to test/test-pages/extraction/salon-1/expected-metadata.json index 02cb664..74c9e04 100644 --- a/test/test-pages/salon-1/expected-metadata.json +++ b/test/test-pages/extraction/salon-1/expected-metadata.json @@ -1,6 +1,5 @@ { "title": "The sharing economy is a lie: Uber, Ayn Rand and the truth about tech\n and libertarians", "byline": "Joanna Rothkopf", - "excerpt": "Disruptive companies talk a good game about sharing. Uber's really just an under-regulated company making riches", - "readerable": true + "excerpt": "Disruptive companies talk a good game about sharing. Uber's really just an under-regulated company making riches" } diff --git a/test/test-pages/salon-1/expected.html b/test/test-pages/extraction/salon-1/expected.html similarity index 100% rename from test/test-pages/salon-1/expected.html rename to test/test-pages/extraction/salon-1/expected.html diff --git a/test/test-pages/salon-1/source.html b/test/test-pages/extraction/salon-1/source.html similarity index 100% rename from test/test-pages/salon-1/source.html rename to test/test-pages/extraction/salon-1/source.html diff --git a/test/test-pages/social-buttons/expected-metadata.json b/test/test-pages/extraction/social-buttons/expected-metadata.json similarity index 87% rename from test/test-pages/social-buttons/expected-metadata.json rename to test/test-pages/extraction/social-buttons/expected-metadata.json index 1a738af..913d8a9 100644 --- a/test/test-pages/social-buttons/expected-metadata.json +++ b/test/test-pages/extraction/social-buttons/expected-metadata.json @@ -1,6 +1,5 @@ { "title": "Share buttons removal test", "byline": null, - "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,\n quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo\n consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse\n cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non\n proident, sunt in culpa qui officia deserunt mollit anim id est laborum.", - "readerable": true + "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,\n quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo\n consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse\n cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non\n proident, sunt in culpa qui officia deserunt mollit anim id est laborum." } diff --git a/test/test-pages/social-buttons/expected.html b/test/test-pages/extraction/social-buttons/expected.html similarity index 100% rename from test/test-pages/social-buttons/expected.html rename to test/test-pages/extraction/social-buttons/expected.html diff --git a/test/test-pages/social-buttons/source.html b/test/test-pages/extraction/social-buttons/source.html similarity index 100% rename from test/test-pages/social-buttons/source.html rename to test/test-pages/extraction/social-buttons/source.html diff --git a/test/test-pages/style-tags-removal/expected-metadata.json b/test/test-pages/extraction/style-tags-removal/expected-metadata.json similarity index 87% rename from test/test-pages/style-tags-removal/expected-metadata.json rename to test/test-pages/extraction/style-tags-removal/expected-metadata.json index 401629a..35b0908 100644 --- a/test/test-pages/style-tags-removal/expected-metadata.json +++ b/test/test-pages/extraction/style-tags-removal/expected-metadata.json @@ -1,6 +1,5 @@ { "title": "Style tags removal", "byline": null, - "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,\n quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo\n consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse\n cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non\n proident, sunt in culpa qui officia deserunt mollit anim id est laborum.", - "readerable": true + "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,\n quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo\n consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse\n cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non\n proident, sunt in culpa qui officia deserunt mollit anim id est laborum." } diff --git a/test/test-pages/style-tags-removal/expected.html b/test/test-pages/extraction/style-tags-removal/expected.html similarity index 100% rename from test/test-pages/style-tags-removal/expected.html rename to test/test-pages/extraction/style-tags-removal/expected.html diff --git a/test/test-pages/style-tags-removal/source.html b/test/test-pages/extraction/style-tags-removal/source.html similarity index 100% rename from test/test-pages/style-tags-removal/source.html rename to test/test-pages/extraction/style-tags-removal/source.html diff --git a/test/test-pages/svg-parsing/expected-metadata.json b/test/test-pages/extraction/svg-parsing/expected-metadata.json similarity index 92% rename from test/test-pages/svg-parsing/expected-metadata.json rename to test/test-pages/extraction/svg-parsing/expected-metadata.json index c8cbac6..1845399 100644 --- a/test/test-pages/svg-parsing/expected-metadata.json +++ b/test/test-pages/extraction/svg-parsing/expected-metadata.json @@ -1,6 +1,5 @@ { "title": "SVG parsing", "byline": null, - "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\ntempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,\nquis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo\nconsequat. Duis aute irure dolor in reprehenderit in voluptate velit esse\ncillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non\nproident, sunt in culpa qui officia deserunt mollit anim id est laborum.", - "readerable": true + "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\ntempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,\nquis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo\nconsequat. Duis aute irure dolor in reprehenderit in voluptate velit esse\ncillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non\nproident, sunt in culpa qui officia deserunt mollit anim id est laborum." } diff --git a/test/test-pages/svg-parsing/expected.html b/test/test-pages/extraction/svg-parsing/expected.html similarity index 100% rename from test/test-pages/svg-parsing/expected.html rename to test/test-pages/extraction/svg-parsing/expected.html diff --git a/test/test-pages/svg-parsing/source.html b/test/test-pages/extraction/svg-parsing/source.html similarity index 100% rename from test/test-pages/svg-parsing/source.html rename to test/test-pages/extraction/svg-parsing/source.html diff --git a/test/test-pages/tmz-1/expected-metadata.json b/test/test-pages/extraction/tmz-1/expected-metadata.json similarity index 77% rename from test/test-pages/tmz-1/expected-metadata.json rename to test/test-pages/extraction/tmz-1/expected-metadata.json index 39a351a..2587175 100644 --- a/test/test-pages/tmz-1/expected-metadata.json +++ b/test/test-pages/extraction/tmz-1/expected-metadata.json @@ -1,6 +1,5 @@ { "title": "Lupita Nyong'o's $150K Pearl Oscar Dress -- STOLEN!!!", "byline": null, - "excerpt": "Lupita Nyong'o's now-famous Oscar dress -- adorned in pearls -- was stolen right out of her hotel room ... TMZ has learned. Law enforcement sources tell…", - "readerable": true + "excerpt": "Lupita Nyong'o's now-famous Oscar dress -- adorned in pearls -- was stolen right out of her hotel room ... TMZ has learned. Law enforcement sources tell…" } diff --git a/test/test-pages/tmz-1/expected.html b/test/test-pages/extraction/tmz-1/expected.html similarity index 100% rename from test/test-pages/tmz-1/expected.html rename to test/test-pages/extraction/tmz-1/expected.html diff --git a/test/test-pages/tmz-1/source.html b/test/test-pages/extraction/tmz-1/source.html similarity index 100% rename from test/test-pages/tmz-1/source.html rename to test/test-pages/extraction/tmz-1/source.html diff --git a/test/test-pages/wapo-1/expected-metadata.json b/test/test-pages/extraction/wapo-1/expected-metadata.json similarity index 86% rename from test/test-pages/wapo-1/expected-metadata.json rename to test/test-pages/extraction/wapo-1/expected-metadata.json index 5195394..15d53eb 100644 --- a/test/test-pages/wapo-1/expected-metadata.json +++ b/test/test-pages/extraction/wapo-1/expected-metadata.json @@ -1,6 +1,5 @@ { "title": "Attack stokes instability fears in North Africa", "byline": "By Erin Cunningham", - "excerpt": "The assault on Tunisia’s most renowned museum, in which gunmen killed at least 19 people, could heighten tensions in a nation that has become deeply divided between pro- and anti-Islamist factions.", - "readerable": true + "excerpt": "The assault on Tunisia’s most renowned museum, in which gunmen killed at least 19 people, could heighten tensions in a nation that has become deeply divided between pro- and anti-Islamist factions." } diff --git a/test/test-pages/wapo-1/expected.html b/test/test-pages/extraction/wapo-1/expected.html similarity index 100% rename from test/test-pages/wapo-1/expected.html rename to test/test-pages/extraction/wapo-1/expected.html diff --git a/test/test-pages/wapo-1/source.html b/test/test-pages/extraction/wapo-1/source.html similarity index 100% rename from test/test-pages/wapo-1/source.html rename to test/test-pages/extraction/wapo-1/source.html diff --git a/test/test-pages/wapo-2/expected-metadata.json b/test/test-pages/extraction/wapo-2/expected-metadata.json similarity index 88% rename from test/test-pages/wapo-2/expected-metadata.json rename to test/test-pages/extraction/wapo-2/expected-metadata.json index bae11fb..2888e9f 100644 --- a/test/test-pages/wapo-2/expected-metadata.json +++ b/test/test-pages/extraction/wapo-2/expected-metadata.json @@ -1,6 +1,5 @@ { "title": "Where do strained U.S.-Israeli relations go after Netanyahu’s victory?", "byline": "By Steven Mufson", - "excerpt": "Few foreign leaders have so brazenly stood up to President Obama and the relationship could face its next test this month.", - "readerable": true + "excerpt": "Few foreign leaders have so brazenly stood up to President Obama and the relationship could face its next test this month." } diff --git a/test/test-pages/wapo-2/expected.html b/test/test-pages/extraction/wapo-2/expected.html similarity index 100% rename from test/test-pages/wapo-2/expected.html rename to test/test-pages/extraction/wapo-2/expected.html diff --git a/test/test-pages/wapo-2/source.html b/test/test-pages/extraction/wapo-2/source.html similarity index 100% rename from test/test-pages/wapo-2/source.html rename to test/test-pages/extraction/wapo-2/source.html diff --git a/test/test-pages/webmd-1/expected-metadata.json b/test/test-pages/extraction/webmd-1/expected-metadata.json similarity index 92% rename from test/test-pages/webmd-1/expected-metadata.json rename to test/test-pages/extraction/webmd-1/expected-metadata.json index 4ceddcf..3fe5b00 100644 --- a/test/test-pages/webmd-1/expected-metadata.json +++ b/test/test-pages/extraction/webmd-1/expected-metadata.json @@ -1,6 +1,5 @@ { "title": "Babies Who Eat Peanuts Early May Avoid Allergy", "byline": "By Brenda Goodman, MA\n WebMD Health News", - "excerpt": "Life-threatening peanut allergies have mysteriously been on the rise in the past decade, with little hope for a cure. But a groundbreaking new study may offer a way to stem that rise, while another may offer some hope for those who are already allergic.", - "readerable": true + "excerpt": "Life-threatening peanut allergies have mysteriously been on the rise in the past decade, with little hope for a cure. But a groundbreaking new study may offer a way to stem that rise, while another may offer some hope for those who are already allergic." } diff --git a/test/test-pages/webmd-1/expected.html b/test/test-pages/extraction/webmd-1/expected.html similarity index 100% rename from test/test-pages/webmd-1/expected.html rename to test/test-pages/extraction/webmd-1/expected.html diff --git a/test/test-pages/webmd-1/source.html b/test/test-pages/extraction/webmd-1/source.html similarity index 100% rename from test/test-pages/webmd-1/source.html rename to test/test-pages/extraction/webmd-1/source.html diff --git a/test/test-readability.js b/test/test-readability.js index d658d76..baf4062 100644 --- a/test/test-readability.js +++ b/test/test-readability.js @@ -1,5 +1,6 @@ var prettyPrint = require("html").prettyPrint; var jsdom = require("jsdom").jsdom; +var serializeDocument = require("jsdom").serializeDocument; var chai = require("chai"); chai.config.includeStack = true; var expect = chai.expect; @@ -8,8 +9,6 @@ var readability = require("../index"); var Readability = readability.Readability; var JSDOMParser = readability.JSDOMParser; -var testPages = require("./bootstrap").getTestPages(); - function runTestsWithItems(label, beforeFn, expectedContent, expectedMetadata) { describe(label, function() { var result; @@ -38,7 +37,7 @@ function runTestsWithItems(label, beforeFn, expectedContent, expectedMetadata) { }); it("should probably be readerable", function() { - expect(expectedMetadata.readerable).eql(result.readerable); + expect(result.readerable).eql(true); }); }); } @@ -86,24 +85,26 @@ describe("Readability API", function() { }); }); -describe("Test pages", function() { - testPages.forEach(function(testPage) { +var uri = { + spec: "http://fakehost/test/page.html", + host: "fakehost", + prePath: "http://fakehost", + scheme: "http", + pathBase: "http://fakehost/test/" +}; + +var jsdomOptions = { + features: { + FetchExternalResources: false, + ProcessExternalResources: false + } +}; + +describe("Extraction", function() { + require("./bootstrap").getExtractionTestPages().forEach(function(testPage) { describe(testPage.dir, function() { - var uri = { - spec: "http://fakehost/test/page.html", - host: "fakehost", - prePath: "http://fakehost", - scheme: "http", - pathBase: "http://fakehost/test/" - }; - runTestsWithItems("jsdom", function() { - var doc = jsdom(testPage.source, { - features: { - FetchExternalResources: false, - ProcessExternalResources: false - } - }); + var doc = jsdom(testPage.source, jsdomOptions); removeCommentNodesRecursively(doc); var readability = new Readability(uri, doc); var readerable = readability.isProbablyReaderable(); @@ -123,3 +124,21 @@ describe("Test pages", function() { }); }); }); + +describe("Detection", function() { + require("./bootstrap").getDetectionTestPages().forEach(function(testPage) { + describe(testPage.file, function() { + var readerable; + + before(function() { + var readability = new Readability(uri, jsdom(testPage.source, jsdomOptions)); + readerable = readability.isProbablyReaderable(); + }); + + it("should be detected as " + (testPage.readerable ? "readerable" : "non-readerable"), + function() { + expect(readerable).eql(testPage.readerable); + }); + }); + }); +});