feat: find top candidate function
parent
e2600231ac
commit
9da7a6f2a9
@ -0,0 +1,113 @@
|
||||
import { NON_TOP_CANDIDATE_TAGS_RE } from '../constants'
|
||||
import { getScore } from './index'
|
||||
import {
|
||||
linkDensity,
|
||||
textLength
|
||||
} from '../dom/index'
|
||||
|
||||
/**
 * After scores have been calculated, loop through every element carrying a
 * `score` attribute and pick the one with the highest score, then let
 * mergeSiblings() gather any adjacent content around it.
 *
 * @param {Function} $ - cheerio-style selector function for the document.
 * @returns {Object} A `$`-wrapped selection: the merged top candidate, or,
 *   when no element scored above zero, the body (falling back to the first
 *   element in the document when there is no body).
 */
export default function findTopCandidate($) {
  let candidate
  let topScore = 0

  $('*[score]').each((index, node) => {
    // Ignore tags like BR, HR, etc
    if (NON_TOP_CANDIDATE_TAGS_RE.test(node.tagName)) {
      return
    }

    const score = getScore($(node))

    // Strictly greater: on a tie the earliest element in document order wins,
    // and elements whose score is not above zero are never selected.
    if (score > topScore) {
      topScore = score
      candidate = node
    }
  })

  // If we don't have a candidate, return the body or whatever the first
  // element is. A selection object is always truthy, even when it matched
  // nothing, so emptiness has to be checked through `.length`.
  if (!candidate) {
    const $body = $('body')
    return $body.length ? $body : $('*').first()
  }

  candidate = mergeSiblings(candidate, topScore, $)

  return $(candidate)
}
|
||||
|
||||
/**
 * Now that we have a top candidate, look through its siblings to see if any
 * of them are decently scored. If they are, they may be split parts of the
 * content (like two divs, a preamble and a body). Example:
 * http://articles.latimes.com/2009/oct/14/business/fi-bigtvs14
 *
 * Qualifying siblings are moved, together with the candidate, into a freshly
 * created wrapper `<div>`.
 *
 * @param {Object} candidate - The raw (unwrapped) top candidate node.
 * @param {number} topScore - Score of the top candidate.
 * @param {Function} $ - cheerio-style selector function for the document.
 * @returns {Object} `candidate` itself when it has no parent; otherwise the
 *   `$`-wrapped wrapper div holding the candidate and its accepted siblings.
 */
export function mergeSiblings(candidate, topScore, $) {
  const $parent = $(candidate).parent()
  if (!$parent.length) {
    return candidate
  }

  const siblingScoreThreshold = Math.max(10, topScore * 0.2)
  const wrappingDiv = $('<div></div>')
  const candidateClass = $(candidate).attr('class')

  $parent.children().each((index, child) => {
    // Ignore tags like BR, HR, etc
    if (NON_TOP_CANDIDATE_TAGS_RE.test(child.tagName)) {
      return
    }

    const $child = $(child)
    const childScore = getScore($child)

    // Siblings without a (non-zero) score are never merged.
    if (!childScore) {
      return
    }

    if (child === candidate) {
      wrappingDiv.append(child)
      return
    }

    // TODO extract to scoreLinkDensity()
    let contentBonus = 0
    const density = linkDensity($child)

    // If sibling has a very low link density, give it a small bonus
    if (density < 0.05) {
      contentBonus += 20
    }

    // If sibling has a high link density, give it a penalty
    if (density >= 0.5) {
      contentBonus -= 20
    }

    // If sibling node has the same class as candidate, give it a bonus.
    // NOTE(review): two nodes with no class attribute also compare equal
    // here (undefined === undefined) - confirm that is intended.
    if ($child.attr('class') === candidateClass) {
      contentBonus += topScore * 0.2
    }

    const newScore = childScore + contentBonus

    if (newScore >= siblingScoreThreshold) {
      wrappingDiv.append(child)
      return
    }

    // Paragraphs below the threshold get a second chance based on their text.
    // NOTE(review): assumes tagName is reported in lowercase - confirm.
    if (child.tagName !== 'p') {
      return
    }

    const childContent = $child.text()
    const childContentLength = textLength(childContent)

    // Long paragraph with few links, or a short link-free paragraph that
    // contains what looks like a complete sentence.
    const isLongLowLinkText = childContentLength > 80 && density < 0.25
    const isShortSentence = childContentLength <= 80 && density === 0 &&
      hasSentenceEnd(childContent)

    if (isLongLowLinkText || isShortSentence) {
      wrappingDiv.append(child)
    }
  })

  return wrappingDiv
}
|
||||
|
||||
// TODO Extract into util - AP
// Given a string, return true if it appears to have an ending sentence
// within it, false otherwise. A sentence end is a literal period followed
// by either a space or the end of the string.
//
// A regex literal is used on purpose: inside a string passed to
// `new RegExp`, '\.' collapses to '.', which matches any character.
const SENTENCE_END_RE = /\.( |$)/

function hasSentenceEnd(text) {
  return SENTENCE_END_RE.test(text)
}
|
@ -0,0 +1,60 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
import fs from 'fs'
|
||||
|
||||
import HTML from './fixtures/html'
|
||||
|
||||
import {
|
||||
getScore,
|
||||
findTopCandidate,
|
||||
scoreContent
|
||||
} from './index'
|
||||
|
||||
// Specs for findTopCandidate($): each case loads a fixture, runs the
// candidate search, and checks either the score or the shape of the result.
describe('findTopCandidate($)', () => {
  // Parse a fixture's markup and return its top candidate.
  const topCandidateOf = (markup) => findTopCandidate(cheerio.load(markup))

  it("finds the top candidate from simple case", () => {
    const best = topCandidateOf(HTML.findDom1)

    assert.equal(getScore(best), 100)
  })

  it("finds the top candidate from a nested case", () => {
    const best = topCandidateOf(HTML.findDom2)

    // The result is wrapped in a div, so the score is read
    // from its first child.
    const firstChild = best.children().first()
    assert.equal(getScore(firstChild), 50)
  })

  it("ignores tags like BR", () => {
    const best = topCandidateOf(HTML.findDom3)

    assert.equal(getScore(best), 50)
  })

  it("returns BODY if no candidates found", () => {
    const best = topCandidateOf(HTML.topBody)

    assert.equal(best.get(0).tagName, 'body')
  })

  it("appends a sibling with a good enough score", () => {
    // Strip HTML comments from the fixture before parsing.
    const rawPage = fs.readFileSync('../fixtures/latimes.html', 'utf-8')
    const markup = rawPage.replace(/<!--[\s\S]*?-->/g, '')

    const $ = scoreContent(cheerio.load(markup))
    const best = findTopCandidate($)

    assert.equal($(best).text().length, 3652)
  })
})
|
||||
|
Loading…
Reference in New Issue