simple logic in place for brsToPs

pull/1/head
Adam Pash 8 years ago
parent d70b9f6709
commit 95d02dadd1

@ -0,0 +1,87 @@
// ## NOTES:
// Another good candidate for refactoring/optimizing.
// Very imperative code, I don't love it. - AP
//
// Given a cheerio object, convert each run of consecutive <br /> tags
// into a single empty <p /> tag. The document is modified in place.
//
// Two <br /> tags count as consecutive when nothing but whitespace-only
// text or comments sits between them; any other text or element ends
// the run.
//
// :param $: A cheerio object
// :param minConsecutive: Integer, the minimum number of consecutive
// <br /> tags that must exist for them to be converted to a <p />
// tag. Values below 1 are treated as 1. Defaults to 2.
// :returns: The same cheerio object that was passed in.
export default function brsToPs($, minConsecutive = 2) {
  const threshold = Math.max(1, minConsecutive)
  // $('br') is a snapshot, so BRs we already consumed as part of an
  // earlier run are still visited; remember them so they are skipped.
  const handled = new Set()

  $('br').each((index, element) => {
    if (handled.has(element)) return

    // Collect this BR plus every BR that directly follows it.
    const run = [element]
    let following = nextBrInRun(element)
    while (following) {
      run.push(following)
      following = nextBrInRun(following)
    }
    run.forEach(br => handled.add(br))

    // Runs shorter than the threshold are left untouched.
    if (run.length < threshold) return

    // Drop every BR but the last, which becomes the empty paragraph.
    const last = run.pop()
    run.forEach(br => $(br).remove())
    $(last).replaceWith('<p />')
  })

  return $
}

// Return the <br /> node that immediately follows `br`, ignoring comments
// and whitespace-only text nodes, or null if something else comes first.
function nextBrInRun(br) {
  let sibling = br.next
  while (
    sibling &&
    (sibling.type === 'comment' ||
      (sibling.type === 'text' && sibling.data.trim() === ''))
  ) {
    sibling = sibling.next
  }
  return sibling && sibling.name === 'br' ? sibling : null
}
// def _brs_to_paragraphs(self, doc, min_consecutive=2):
// print "_brs_to_paragraphs: convert consecutive brs to p tags"
// brs = doc.xpath('.//br')
//
// # Loop through all of our break tags, looking for consecutive
// # <br />s with no content in between them. If found, replace them
// # with a single P tag.
// for br in brs:
// # Generate a list of all the breaks in a row, with no text in
// # between them.
// joined_brs = []
// cur_br = br
// while True:
// joined_brs.append(cur_br)
//
// if cur_br.tail:
// break
//
// next = cur_br.getnext()
// next_is_br = next is not None and next.tag.lower() == 'br'
//
// if next_is_br:
// cur_br = next
// else:
// break
//
// if len(joined_brs) < min_consecutive:
// continue
//
// last_br = joined_brs[-1]
//
// # Now loop through following siblings, until we hit a block
// # tag or the end, and append them to this P if they are not a
// # block tag that is not a BR.
// self._paragraphize(last_br)
//
// # Drop every break that we no longer need because of the P.
// # The first BR has been turned into a P tag.
// for joined_br in joined_brs:
// if joined_br is not last_br:
// joined_br.drop_tag()
//
// # If we had any new p tags that are already inside a P tag, resolve
// # those by paragraphizing them, which will append their block level
// # contents.
// for fix_count in xrange(1000):
// # Find the first p that contains another p, and paragraphize it.
// # We do this in a loop because we're modifying the dom as we go.
// try:
// parent_p = doc.xpath('//p[./p][1]')[0]
// self._paragraphize(parent_p)
// except IndexError:
// break
// else:
// # We exhausted our loop, which means we've looped too many times
// # such that it's unreasonable. Log a warning.
// logger.warning("Bailing on p parent fix due to crazy "
// "looping for url %s" % self.resource.url)

@ -0,0 +1,43 @@
import assert from 'assert'
import cheerio from 'cheerio'
import { clean } from './test-helpers'
import HTML from './fixtures/html'
import {
brsToPs
} from './index'
// Tests for brsToPs, driven by the before/after pairs in the HTML fixtures.
describe('Generic Extractor Utils', () => {
  describe('brsToPs(node)', () => {
    it("does nothing when no BRs present", () => {
      const $ = cheerio.load(HTML.positiveId)
      assert.equal(brsToPs($).html(), HTML.positiveId)
    })

    it("does nothing when a single BR is present", () => {
      const $ = cheerio.load(HTML.singleBr.before)
      assert.equal(brsToPs($).html(), HTML.singleBr.after)
    })

    it("converts double BR tags to an empty P tag", () => {
      const $ = cheerio.load(HTML.doubleBrs.before)
      const result = brsToPs($).html()
      // clean() normalizes whitespace so formatting differences don't matter.
      assert.equal(clean(result), clean(HTML.doubleBrs.after))
    })

    it("converts several BR tags to an empty P tag", () => {
      const $ = cheerio.load(HTML.severalBrs.before)
      const result = brsToPs($).html()
      assert.equal(clean(result), clean(HTML.severalBrs.after))
    })

    // Marked pending rather than left with its assertion commented out:
    // a test without assertions always passes and hides the missing
    // behavior. Drop `.skip` once brsToPs handles BRs inside a P.
    it.skip("converts BR tags in a P tag into a P containing inline children", () => {
      const $ = cheerio.load(HTML.brsInP.before)
      const result = brsToPs($).html()
      assert.equal(clean(result), clean(HTML.brsInP.after))
    })
  })
})

@ -94,6 +94,73 @@ const HTML = {
</div>
`,
},
// brsToPs fixtures: each entry pairs the input markup (`before`) with the
// markup expected after conversion (`after`).

// A lone <br>: `after` is identical to `before`, i.e. nothing is converted.
singleBr: {
before: `
<div class="article adbox">
<br>
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<br>
<p>Ooo good one</p>
</div>
`,
},
// Two consecutive <br /> tags become one empty <p>.
doubleBrs: {
before: `
<div class="article adbox">
<br />
<br />
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<p></p>
<p>Ooo good one</p>
</div>
`,
},
// Five consecutive <br /> tags still become just one empty <p>.
severalBrs: {
before: `
<div class="article adbox">
<br />
<br />
<br />
<br />
<br />
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<p></p>
<p>Ooo good one</p>
</div>
`,
},
// Two <br /> tags inside a <p>, between two pieces of text: the expected
// result splits the text into two separate paragraphs.
brsInP: {
before: `
<p>
Here is some text
<br />
<br />
Here is more text
</p>
`,
after: `
<p>
Here is some text
</p>
<p>
Here is more text
</p>
`,
},
}
export default HTML

@ -1,2 +1,3 @@
// Entry point for these utils: each module's default export is
// re-exported here under a named export.
export { default as getWeight } from './get-weight'
export { default as stripUnlikelyCandidates } from './strip-unlikely-candidates'
export { default as brsToPs } from './brs-to-ps'

@ -6,7 +6,7 @@ import {
// ## NOTES:
// This is a working first pass, but if/when we start optimizing
// this is a good candidate. - AP
export default function stripUnlikelyCandidates($) {
// Loop through the provided document and remove any non-link nodes

@ -1,3 +1,3 @@
// Normalize a string of markup so tests can compare it without being
// sensitive to formatting.
//
// Leading/trailing whitespace is trimmed, line breaks are removed, and
// every remaining run of whitespace is collapsed to a single space (so
// words inside text content stay separated).
//
// :param string: The string to normalize.
// :returns: The normalized string.
export function clean(string) {
  return string
    .trim()
    .replace(/\r?\n|\r/g, '')
    .replace(/\s+/g, ' ')
}

Loading…
Cancel
Save