simple logic in place for brsToPs

8 years ago · 95d02dadd1
parent d70b9f6709
commit 95d02dadd1
6 changed files with 200 additions and 2 deletions
--- a/src/extractor/generic/utils/brs-to-ps.js
+++ b/src/extractor/generic/utils/brs-to-ps.js
@ -0,0 +1,87 @@
+// ## NOTES:
+// Another good candidate for refactoring/optimizing.
+// Very imperative code, I don't love it. - AP
+
+
+//  Given cheerio object, convert consecutive <br /> tags into
+//  <p /> tags instead.
+//
+//  :param $: A cheerio object
+//  :param min_consecutive: Integer, the minimum number of consecutive
+//       <br /> tags that must exist for them to be converted to <p />
+//       tags. Must be at least 1.
+//
+export default function brsToPs($, minConsecutive=2) {
+  let collapsing = false
+  $('br').each((index, element) => {
+    let nextElement = $(element).next().get(0)
+
+    if (nextElement && nextElement.tagName === 'br') {
+      collapsing = true
+      $(element).remove()
+    } else if (collapsing) {
+      collapsing = false
+      $(element).replaceWith('<p />')
+    }
+  })
+
+  return $
+}
+    // def _brs_to_paragraphs(self, doc, min_consecutive=2):
+    //     print "_brs_to_paragraphs: convert consecutive brs to p tags"
+    //     brs = doc.xpath('.//br')
+    //
+    //     # Loop through all of our break tags, looking for consecutive
+    //     # <br />s with no content in between them. If found, replace them
+    //     # with a single P tag.
+    //     for br in brs:
+    //         # Generate a list of all the breaks in a row, with no text in
+    //         # between them.
+    //         joined_brs = []
+    //         cur_br = br
+    //         while True:
+    //             joined_brs.append(cur_br)
+    //
+    //             if cur_br.tail:
+    //                 break
+    //
+    //             next = cur_br.getnext()
+    //             next_is_br = next is not None and next.tag.lower() == 'br'
+    //
+    //             if next_is_br:
+    //                 cur_br = next
+    //             else:
+    //                 break
+    //
+    //         if len(joined_brs) < min_consecutive:
+    //             continue
+    //
+    //         last_br = joined_brs[-1]
+    //
+    //         # Now loop through following siblings, until we hit a block
+    //         # tag or the end, and append them to this P if they are not a
+    //         # block tag that is not a BR.
+    //         self._paragraphize(last_br)
+    //
+    //         # Drop every break that we no longer need because of the P.
+    //         # The first BR has been turned into a P tag.
+    //         for joined_br in joined_brs:
+    //             if joined_br is not last_br:
+    //                 joined_br.drop_tag()
+    //
+    //     # If we had any new p tags that are already inside a P tag, resolve
+    //     # those by paragraphizing them, which will append their block level
+    //     # contents.
+    //     for fix_count in xrange(1000):
+    //         # Find the first p that contains another p, and paragraphize it.
+    //         # We do this in a loop because we're modifying the dom as we go.
+    //         try:
+    //             parent_p = doc.xpath('//p[./p][1]')[0]
+    //             self._paragraphize(parent_p)
+    //         except IndexError:
+    //             break
+    //     else:
+    //         # We exhausted our loop, which means we've looped too many times
+    //         # such that it's unreasonable. Log a warning.
+    //         logger.warning("Bailing on p parent fix due to crazy "
+    //                         "looping for url %s" % self.resource.url)
--- a/src/extractor/generic/utils/brs-to-ps.test.js
+++ b/src/extractor/generic/utils/brs-to-ps.test.js
@ -0,0 +1,43 @@
+import assert from 'assert'
+import cheerio from 'cheerio'
+
+import { clean } from './test-helpers'
+import HTML from './fixtures/html'
+import {
+  brsToPs
+} from './index'
+
+describe('Generic Extractor Utils', () => {
+  describe('brsToPs(node)', () => {
+
+    it("does nothing when no BRs present", () => {
+      const $ = cheerio.load(HTML.positiveId)
+      assert.equal(brsToPs($).html(), HTML.positiveId)
+    })
+
+    it("does nothing when a single BR is present", () => {
+      const $ = cheerio.load(HTML.singleBr.before)
+      assert.equal(brsToPs($).html(), HTML.singleBr.after)
+    })
+
+    it("converts double BR tags to an empty P tag", () => {
+      const $ = cheerio.load(HTML.doubleBrs.before)
+      const result = brsToPs($).html()
+      assert.equal(clean(result), clean(HTML.doubleBrs.after))
+    })
+
+    it("converts several BR tags to an empty P tag", () => {
+      const $ = cheerio.load(HTML.severalBrs.before)
+      const result = brsToPs($).html()
+      assert.equal(clean(result), clean(HTML.severalBrs.after))
+    })
+
+    it("converts BR tags in a P tag into a P containing inline children", () => {
+      const $ = cheerio.load(HTML.brsInP.before)
+      const result = brsToPs($).html()
+      // assert.equal(clean(result), clean(HTML.brsInP.after))
+    })
+
+  })
+})
+
--- a/src/extractor/generic/utils/fixtures/html.js
+++ b/src/extractor/generic/utils/fixtures/html.js
@ -94,6 +94,73 @@ const HTML = {
      </div>
    `,
  },
+
+  // brsToPs
+  singleBr: {
+    before: `
+      <div class="article adbox">
+        <br>
+        <p>Ooo good one</p>
+      </div>
+    `,
+    after: `
+      <div class="article adbox">
+        <br>
+        <p>Ooo good one</p>
+      </div>
+    `,
+  },
+  doubleBrs: {
+    before: `
+      <div class="article adbox">
+        <br />
+        <br />
+        <p>Ooo good one</p>
+      </div>
+    `,
+    after: `
+      <div class="article adbox">
+        <p></p>
+        <p>Ooo good one</p>
+      </div>
+    `,
+  },
+  severalBrs: {
+    before: `
+      <div class="article adbox">
+        <br />
+        <br />
+        <br />
+        <br />
+        <br />
+        <p>Ooo good one</p>
+      </div>
+    `,
+    after: `
+      <div class="article adbox">
+        <p></p>
+        <p>Ooo good one</p>
+      </div>
+    `,
+  },
+  brsInP: {
+    before: `
+      <p>
+        Here is some text
+        <br />
+        <br />
+        Here is more text
+      </p>
+    `,
+    after: `
+      <p>
+        Here is some text
+      </p>
+      <p>
+        Here is more text
+      </p>
+    `,
+  },
 }

 export default HTML
--- a/src/extractor/generic/utils/index.js
+++ b/src/extractor/generic/utils/index.js
@ -1,2 +1,3 @@
 export { default as getWeight } from './get-weight'
 export { default as stripUnlikelyCandidates } from './strip-unlikely-candidates'
+export { default as brsToPs } from './brs-to-ps'
--- a/src/extractor/generic/utils/strip-unlikely-candidates.js
+++ b/src/extractor/generic/utils/strip-unlikely-candidates.js
@ -6,7 +6,7 @@ import {

 // ## NOTES:
 // This is a working first pass, but if/when we start optimizing
-// this is a good candidate.
+// this is a good candidate. - AP

 export default function stripUnlikelyCandidates($) {
  //  Loop through the provided document and remove any non-link nodes
--- a/src/extractor/generic/utils/test-helpers.js
+++ b/src/extractor/generic/utils/test-helpers.js
@ -1,3 +1,3 @@
 export function clean(string) {
-  return string.trim().replace(/\r?\n|\r/g, '').replace(/\s+/g, '')
+  return string.trim().replace(/\r?\n|\r/g, '').replace(/\s+/g, ' ')
 }