unicode, dammit!

pull/1/head
gfxmonk 14 years ago
parent a224c5b759
commit 87ad057706

@ -1,6 +1,6 @@
#!/usr/bin/env python #!/usr/bin/env python
from BeautifulSoup import NavigableString from BeautifulSoup import NavigableString
from page_parser import parse, get_title, get_body from page_parser import parse, get_title, get_body, Unparseable
import logging import logging
import re import re
@ -29,7 +29,7 @@ class Document:
RETRY_LENGTH = 250 RETRY_LENGTH = 250
def __init__(self, input, **options): def __init__(self, input, **options):
self.input = input self.input = inpuunicodear
self.options = defaultdict(lambda: None) self.options = defaultdict(lambda: None)
for k, v in options.items(): for k, v in options.items():
self.options[k] = v self.options[k] = v
@ -206,7 +206,7 @@ class Document:
for elem in self.html.findAll(): for elem in self.html.findAll():
if elem.name.lower() == "div": if elem.name.lower() == "div":
# transform <div>s that do not contain other block elements into <p>s # transform <div>s that do not contain other block elements into <p>s
if REGEXES['divToPElementsRe'].search(''.join(map(str, elem.contents))): if REGEXES['divToPElementsRe'].search(''.join(map(unicode, elem.contents))):
self.debug("Altering div(#%s.%s) to p" % (elem.get('id', ''), elem.get('class', ''))) self.debug("Altering div(#%s.%s) to p" % (elem.get('id', ''), elem.get('class', '')))
elem.name = "p" elem.name = "p"

Loading…
Cancel
Save