Text summarizer based on `sumy`. Uses `ftfy` to keep sanity in unicode

as it appears on www, and `dominate` to generate html without fuss.
master
Harshad Sharma 7 years ago
parent 2ed6489594
commit 4b6b7f9ffb

@ -0,0 +1,66 @@
# coding=utf-8
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
import dominate
from ftfy import fix_text
from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from dominate.tags import *
from qutescript import userscript
LANGUAGE = "english"
SENTENCES_COUNT = 10
def generate_html(sentences, title_text):
doc = dominate.document(title='Summary')
with doc.head:
style("""\
body {
background-color: #F9F8F1;
color: #2C232A;
font-family: sans-serif;
font-size: 2.6em;
margin: 3em 1em;
}
""")
with doc:
div(id='header').add(h1(title_text))
with div():
attr(cls='body')
for sentence in sentences:
p(sentence)
return doc
@userscript
def summarize_text(request):
if request.html:
parser = HtmlParser.from_file(file_path=request.html,
url=request.url,
tokenizer=Tokenizer(LANGUAGE))
else:
parser = PlaintextParser.from_file(file_path=request.html,
tokenizer=Tokenizer(LANGUAGE))
stemmer = Stemmer(LANGUAGE)
summarizer = Summarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)
sentences = [fix_text(str(s)) for s in summarizer(parser.document, SENTENCES_COUNT)]
html = generate_html(sentences, fix_text(request.title)).render()
request.send_html(html)
if __name__ == "__main__":
summarize_text()

@ -0,0 +1,3 @@
dominate
ftfy
sumy
Loading…
Cancel
Save