add a counter bucket to processed, move some stuff out of main, clean up

mail, create bayesian updater
master
Carlo Strub 7 years ago
parent ca967c197a
commit 0afac8f4c8

@ -0,0 +1,100 @@
/*
Part of this code is borrowed from github.com/jbrukh/bayesian, published under a BSD 3-Clause License.
*/
package main
import (
"math"
"strconv"
"github.com/boltdb/bolt"
)
// classificationPriors returns the prior probabilities for the good and junk
// classes.
//
// The priors are each class's share of the combined number of keys stored in
// the "Good" and "Junk" buckets nested inside the "Wordlists" bucket, so g and
// j sum to 1.
//
// When no estimate can be formed -- the read transaction fails, the buckets
// are missing, or both word lists are empty -- uniform priors (0.5, 0.5) are
// returned instead of panicking on a nil bucket or producing NaN from 0/0.
func classificationPriors(db *bolt.DB) (g, j float64) {
	var goodN, junkN int

	err := db.View(func(tx *bolt.Tx) error {
		// tx.Bucket and Bucket.Bucket return nil for buckets that do not
		// exist, so every lookup is guarded before it is used.
		wordlists := tx.Bucket([]byte("Wordlists"))
		if wordlists == nil {
			return nil
		}
		if good := wordlists.Bucket([]byte("Good")); good != nil {
			goodN = good.Stats().KeyN
		}
		if junk := wordlists.Bucket([]byte("Junk")); junk != nil {
			junkN = junk.Stats().KeyN
		}
		return nil
	})

	// The signature carries no error result, so a failed transaction is
	// treated like an untrained classifier: neither class is favoured.
	total := float64(goodN) + float64(junkN)
	if err != nil || total == 0 {
		return 0.5, 0.5
	}

	return float64(goodN) / total, float64(junkN) / total
}
// classificationWordProb returns P(W|C) for both classes -- the probability
// of seeing word in a good mail (g) and in a junk mail (j).
//
// Each probability is the word's counter in the respective "Wordlists" bucket
// ("Good" or "Junk") divided by the number of mails of that class. The junk
// total is the "Junk" entry of the Processed/Counters bucket; the good total
// is the key count of the Processed/Mails bucket minus that junk total.
//
// A probability is reported as 0 when its bucket or counter is missing or
// cannot be parsed, or when no mail of that class has been counted yet.
func classificationWordProb(db *bolt.DB, word string) (g, j float64) {
	// count reads a decimal counter stored under key. Absent buckets, absent
	// keys and malformed values deliberately count as zero: a word that was
	// never learned simply has no occurrences.
	count := func(b *bolt.Bucket, key string) float64 {
		if b == nil {
			return 0
		}
		n, err := strconv.ParseFloat(string(b.Get([]byte(key))), 64)
		if err != nil {
			return 0
		}
		return n
	}

	// The signature has no error result. If the read transaction cannot be
	// started the closure never runs and both probabilities stay 0.
	_ = db.View(func(tx *bolt.Tx) error {
		var goodWords, junkWords *bolt.Bucket
		if wordlists := tx.Bucket([]byte("Wordlists")); wordlists != nil {
			goodWords = wordlists.Bucket([]byte("Good"))
			junkWords = wordlists.Bucket([]byte("Junk"))
		}
		goodHits := count(goodWords, word)
		junkHits := count(junkWords, word)

		var junkMails, allMails float64
		if processed := tx.Bucket([]byte("Processed")); processed != nil {
			junkMails = count(processed.Bucket([]byte("Counters")), "Junk")
			if mails := processed.Bucket([]byte("Mails")); mails != nil {
				allMails = float64(mails.Stats().KeyN)
			}
		}

		// Assign to the named results g and j directly. The local totals use
		// distinct names so they cannot shadow the results, and the
		// denominators are checked to avoid Inf/NaN from dividing by zero.
		if goodMails := allMails - junkMails; goodMails > 0 {
			g = goodHits / goodMails
		}
		if junkMails > 0 {
			j = junkHits / junkMails
		}
		return nil
	})

	return g, j
}
// LogScores computes log-likelihood style scores for classifying a mail's
// word list as good or junk.
//
// Each score starts at the natural log of the class prior (from
// classificationPriors) and adds, for every entry of wordlist, the natural
// log of that word's class probability (from classificationWordProb).
// Summing logs instead of multiplying probabilities avoids floating point
// underflow on long word lists. Because logs of probabilities are at most 0,
// the scores are usually negative; a probability of 0 contributes
// math.Log(0), i.e. -Inf.
//
// It returns the good score, the junk score, and junk, which is true when
// the junk score is at least as large as the good score (ties count as junk).
func LogScores(db *bolt.DB, wordlist []string) (scoreG, scoreJ float64, junk bool) {
	// Seed both scores with the log priors.
	pGood, pJunk := classificationPriors(db)
	scoreG, scoreJ = math.Log(pGood), math.Log(pJunk)

	// Accumulate the log of each word's conditional probability.
	for i := range wordlist {
		wGood, wJunk := classificationWordProb(db, wordlist[i])
		scoreG += math.Log(wGood)
		scoreJ += math.Log(wJunk)
	}

	// The junk score wins when it is the maximum, including on a tie.
	junk = scoreJ >= scoreG

	return scoreG, scoreJ, junk
}

@ -1,12 +1,15 @@
package main
import (
"log"
"github.com/boltdb/bolt"
)
// openDB creates and opens a new database and its respective buckets (if required)
func openDB(maildir string) (db *bolt.DB, err error) {
log.Println("loading database")
// Open the sisyphus.db data file inside the Maildir directory.
// It will be created if it doesn't exist.
db, err = bolt.Open(maildir+"/sisyphus.db", 0600, nil)
@ -26,6 +29,32 @@ func openDB(maildir string) (db *bolt.DB, err error) {
return db, err
}
// Create DB bucket for Mails inside bucket Processed
err = db.Update(func(tx *bolt.Tx) error {
b := tx.Bucket([]byte("Processed"))
_, err := b.CreateBucketIfNotExists([]byte("Mails"))
if err != nil {
return err
}
return nil
})
if err != nil {
return db, err
}
// Create DB bucket for Counters inside bucket Processed
err = db.Update(func(tx *bolt.Tx) error {
b := tx.Bucket([]byte("Processed"))
_, err := b.CreateBucketIfNotExists([]byte("Counters"))
if err != nil {
return err
}
return nil
})
if err != nil {
return db, err
}
// Create DB bucket for word lists
err = db.Update(func(tx *bolt.Tx) error {
_, err := tx.CreateBucketIfNotExists([]byte("Wordlists"))
@ -61,5 +90,6 @@ func openDB(maildir string) (db *bolt.DB, err error) {
return nil
})
log.Println("database loaded")
return db, err
}

@ -3,8 +3,10 @@ package main
import (
"bufio"
"errors"
"log"
"math"
"mime/quotedprintable"
"os"
"regexp"
"strings"
@ -19,9 +21,21 @@ type Mail struct {
Junk bool
}
// CreateDirs creates the directories used below maildir -- ".Junk/cur", "new"
// and "cur" -- together with any missing parents, using mode 0700.
// Directories that already exist are left untouched.
//
// The function has no error result: a directory that cannot be created is
// logged, and the remaining directories are still attempted.
func CreateDirs(maildir string) {
	log.Println("create missing directories")

	for _, sub := range []string{"/.Junk/cur", "/new", "/cur"} {
		if err := os.MkdirAll(maildir+sub, 0700); err != nil {
			log.Println("cannot create directory:", err)
		}
	}
}
// Index loads all mail keys from the Maildir directory for processing.
func Index(d string) (m []*Mail, err error) {
log.Println("loading mails")
dirs := []string{d, d + "/.Junk"}
for _, dir := range dirs {
j, err := maildir.Dir(dir).Keys()
@ -38,9 +52,46 @@ func Index(d string) (m []*Mail, err error) {
}
}
log.Println("mails loaded")
return m, nil
}
// Load fetches the message identified by m.Key from the Maildir d (or from
// its ".Junk" sub-directory when m.Junk is set) and stores its subject and
// body on m.
//
// The subject is the "Subject" header. The body is read through a
// quoted-printable decoder line by line, and the lines are joined with single
// spaces.
//
// It returns the error from the message lookup, or an error if m already
// carries a subject or a body. The body check runs only after the subject has
// been stored.
func (m *Mail) Load(d string) error {
	dir := d
	if m.Junk {
		dir += "/.Junk"
	}

	msg, err := maildir.Dir(dir).Message(m.Key)
	if err != nil {
		return err
	}

	// Subject: never overwrite one that is already present.
	if m.Subject != nil {
		return errors.New("there is already a subject")
	}
	subj := msg.Header.Get("Subject")
	m.Subject = &subj

	// Body: collect the decoded lines.
	// NOTE(review): scanner.Err() is not inspected, so a scan that stops
	// early leaves only the lines read so far -- confirm this is intended.
	var lines []string
	scanner := bufio.NewScanner(quotedprintable.NewReader(msg.Body))
	for scanner.Scan() {
		lines = append(lines, scanner.Text())
	}
	text := strings.Join(lines, " ")

	if m.Body != nil {
		return errors.New("there is already a body")
	}
	m.Body = &text

	return nil
}
func trimStringFromBase64(s string) string {
if idx := strings.Index(s, "Content-Transfer-Encoding: base64"); idx != -1 {
return s[:idx-1]
@ -48,64 +99,46 @@ func trimStringFromBase64(s string) string {
return s
}
func cleanString(i string) (s string, err error) {
func cleanString(i string) (s string) {
s = trimStringFromBase64(i)
s = sanitize.Accents(s)
s = sanitize.Accents(i)
s = sanitize.HTML(s)
s = strings.ToLower(s)
s = strings.Replace(s, "boundary=", " ", -1)
s = strings.Replace(s, "charset", " ", -1)
s = strings.Replace(s, "content-transfer-encoding", " ", -1)
s = strings.Replace(s, "content-type", " ", -1)
s = strings.Replace(s, "image/jpeg", " ", -1)
s = strings.Replace(s, "multipart/alternative", " ", -1)
s = strings.Replace(s, "multipart/related", " ", -1)
s = strings.Replace(s, "name=", " ", -1)
s = strings.Replace(s, "nextpart", " ", -1)
s = strings.Replace(s, "quoted-printable", " ", -1)
s = strings.Replace(s, "text/html", " ", -1)
s = strings.Replace(s, "text/plain", " ", -1)
s = strings.Replace(s, "this email must be viewed in html mode", " ", -1)
s = strings.Replace(s, "this is a multi-part message in mime format", " ", -1)
s = strings.Replace(s, "windows-1251", " ", -1)
s = strings.Replace(s, "windows-1252", " ", -1)
s = strings.Replace(s, "!", " ", -1)
s = strings.Replace(s, "#", " ", -1)
s = strings.Replace(s, "$", " ", -1)
s = strings.Replace(s, "%", " ", -1)
s = strings.Replace(s, "&", " ", -1)
s = strings.Replace(s, "'", "", -1)
s = strings.Replace(s, "(", " ", -1)
s = strings.Replace(s, ")", " ", -1)
s = strings.Replace(s, "*", " ", -1)
s = strings.Replace(s, "+", " ", -1)
s = strings.Replace(s, ",", " ", -1)
s = strings.Replace(s, ". ", " ", -1)
s = strings.Replace(s, "<", " ", -1)
s = strings.Replace(s, "=", " ", -1)
s = strings.Replace(s, ">", " ", -1)
s = strings.Replace(s, "?", " ", -1)
s = strings.Replace(s, "@", " ", -1)
s = strings.Replace(s, "[", " ", -1)
s = strings.Replace(s, "\"", " ", -1)
s = strings.Replace(s, "\\", " ", -1)
s = strings.Replace(s, "\n", " ", -1)
s = strings.Replace(s, "\t", " ", -1)
s = strings.Replace(s, "]", " ", -1)
s = strings.Replace(s, "^", " ", -1)
s = strings.Replace(s, "_", " ", -1)
s = strings.Replace(s, "{", " ", -1)
s = strings.Replace(s, "|", " ", -1)
s = strings.Replace(s, "}", " ", -1)
bad := []string{
"boundary=", "charset", "content-transfer-encoding",
"content-type", "image/jpeg", "multipart/alternative",
"multipart/related", "name=", "nextpart", "quoted-printable",
"text/html", "text/plain", "this email must be viewed in html mode",
"this is a multi-part message in mime format",
"windows-1251", "windows-1252", "!", "#", "$", "%", "&", "'",
"(", ")", "*", "+", ",", ". ", "<", "=", ">", "?", "@", "[",
"\"", "\\", "\n", "\t", "]", "^", "_", "{", "|", "}",
}
for _, b := range bad {
s = strings.Replace(s, b, " ", -1)
}
for i := 0; i < 10; i++ {
s = strings.Replace(s, " ", " ", -1)
}
return s, nil
return s
}
// Clean normalises the mail's subject and body in place. Each field that is
// set is passed through trimStringFromBase64 and then cleanString, and the
// field is pointed at the result; unset (nil) fields stay nil. It always
// returns nil.
func (m *Mail) Clean() error {
	// scrub returns a pointer to the cleaned copy of *text, or nil for nil.
	scrub := func(text *string) *string {
		if text == nil {
			return nil
		}
		cleaned := cleanString(trimStringFromBase64(*text))
		return &cleaned
	}

	m.Subject = scrub(m.Subject)
	m.Body = scrub(m.Body)

	return nil
}
// wordlist takes a string of space separated text and returns a list of unique
@ -151,26 +184,6 @@ func wordlist(s string) (l []string, err error) {
return l, nil
}
// Clean cleans the mail's subject and body
func (m *Mail) Clean() error {
if m.Subject != nil {
s, err := cleanString(*m.Subject)
if err != nil {
return err
}
m.Subject = &s
}
if m.Body != nil {
b, err := cleanString(*m.Body)
if err != nil {
return err
}
m.Body = &b
}
return nil
}
// Wordlists prepares the mail's subject and body for training
func (m *Mail) Wordlists() (subject, body []string, err error) {
if m.Subject != nil {
@ -190,44 +203,14 @@ func (m *Mail) Wordlists() (subject, body []string, err error) {
return subject, body, nil
}
// Load reads a mail's subject and body
func (m *Mail) Load(d string) error {
if m.Junk {
d = d + "/.Junk"
}
message, err := maildir.Dir(d).Message(m.Key)
if err != nil {
return err
}
// get Subject
if m.Subject != nil {
return errors.New("there is already a subject")
}
subject := message.Header.Get("Subject")
m.Subject = &subject
// get Body
bQ := quotedprintable.NewReader(message.Body)
var b []string
bScanner := bufio.NewScanner(bQ)
for bScanner.Scan() {
raw := bScanner.Text()
b = append(b, raw)
}
body := strings.Join(b, " ")
if m.Body != nil {
return errors.New("there is already a body")
}
m.Body = &body
// Classify analyses the mail and decides whether it is Junk or Good
func (m *Mail) Classify() error {
return nil
}
// Classify identifies whether a mail is junk and then learns its words for the
// respective category
func (m *Mail) Classify() error {
// Learn adds the words to the respective list and unlearns on the other, if
// the mail has been moved from there.
func (m *Mail) Learn() error {
return nil
}

@ -262,7 +262,7 @@ var _ = Describe("Mail", func() {
Ω(err).ShouldNot(HaveOccurred())
subjectOutput := "wear glasses your eyes are headed for serious trouble"
bodyOutput := "--c2389532b48d1db204cfca8189242aeb : ; : 8bit --c2389532b48d1db204cfca8189242aeb : ; : 8bit snc .container width: 420px; .container .columns .container .column margin: 0; .container .fourteen.columns .container .fifteen.columns .container .sixteen.columns .container .one-third.column .container .two-thirds.column width: 420px; / self clearing goodness / .container:after content: 0020 ; display: block; height: 0; clear: both; visibility: hidden; .clearfix:before .clearfix:after .row:before .row:after content: 0020; display: block; overflow: hidden; visibility: hidden; width: 0; height: 0; .row:after .clearfix:after clear: both; .row .clearfix zoom: 1; .clear clear: both; display: block; overflow: hidden; visibility: hidden; width: 0; height: 0; if you wear glasses contacts or even if you think your vision can be improved you need to know about this.. in the link below youll discover 1 weird trick that will drastically improve your vision gt; 1 trick to improve your vision today to your success 1 place ville marie 39th floor montreal quebec h3b4m7 canada email marketing by unsu bscribe westcott railway station served the village of westcott buckinghamshire near baron ferdinand de rothschilds estate at manor it was built by the duke of buckingham in 1871 as part of a short horse-drawn tramway that met the aylesbury and buckingham railway at quainton the next year it was converted for passenger use extended to brill railway station and renamed the brill tramway the poor quality locomotives running on the built and line were very slow initially limited to 5 miles per hour 8 km/h the line was taken over by the metropolitan railway in 1899 and transferred to public ownership in 1933 westcott station became part of the london underground despite being over 40 miles 60 km from central london until the closure of the line in 1935 the station building and its associated house pictured are the only significant buildings from the brill tramway to survive other 
than the junction station at quainton full article.. --c2389532b48d1db204cfca8189242aeb-- "
bodyOutput := "--c2389532b48d1db204cfca8189242aeb : ; : 8bit --c2389532b48d1db204cfca8189242aeb : ; : 8bit snc .container width: 420px; .container .columns .container .column margin: 0; .container .fourteen.columns .container .fifteen.columns .container .sixteen.columns .container .one-third.column .container .two-thirds.column width: 420px; / self clearing goodness / .container:after content: 0020 ; display: block; height: 0; clear: both; visibility: hidden; .clearfix:before .clearfix:after .row:before .row:after content: 0020 ; display: block; overflow: hidden; visibility: hidden; width: 0; height: 0; .row:after .clearfix:after clear: both; .row .clearfix zoom: 1; .clear clear: both; display: block; overflow: hidden; visibility: hidden; width: 0; height: 0; if you wear glasses contacts or even if you think your vision can be improved you need to know about this.. in the link below you ll discover 1 weird trick that will drastically improve your vision gt; 1 trick to improve your vision today to your success 1 place ville marie 39th floor montreal quebec h3b4m7 canada email marketing by unsu bscribe westcott railway station served the village of westcott buckinghamshire near baron ferdinand de rothschild s estate at manor it was built by the duke of buckingham in 1871 as part of a short horse-drawn tramway that met the aylesbury and buckingham railway at quainton the next year it was converted for passenger use extended to brill railway station and renamed the brill tramway the poor quality locomotives running on the built and line were very slow initially limited to 5 miles per hour 8 km/h the line was taken over by the metropolitan railway in 1899 and transferred to public ownership in 1933 westcott station became part of the london underground despite being over 40 miles 60 km from central london until the closure of the line in 1935 the station building and its associated house pictured are the only significant buildings from the brill tramway to survive other 
than the junction station at quainton full article.. --c2389532b48d1db204cfca8189242aeb-- "
Ω(m).Should(Equal(
s.Mail{
Key: "1488226337.M327833P8269.mail.carlostrub.ch,S=6960,W=7161:2,Sa",
@ -287,7 +287,7 @@ var _ = Describe("Mail", func() {
Ω(err).ShouldNot(HaveOccurred())
subjectOutput := "always in good form with our viagra super active."
bodyOutput := " body .maintable height:100 important; width:100 important; margin:0; padding:0; img a img border:0; outline:none; text-decoration:none; .imagefix display:block; table td border-collapse:collapse; mso-table-lspace:0pt; mso-table-rspace:0pt; p margin:0; padding:0; margin-bottom:0; .readmsgbody width:100 ; .externalclass width:100 ; .externalclass .externalclass p .externalclass span .externalclass font .externalclass td .externalclass div line-height:100 ; img -ms-interpolation-mode: bicubic; body table td p a li blockquote -ms-text-size-adjust:100 ; -webkit-text-size-adjust:100 ; 96 \u00a0 if you cant read this email please view it online http://6url.ru/lhcj \u00a0 most popular products and special deals limited time offer hola the leading online store presents pharmaceuticals with delivery service in europe the united states and canada you can buy anti-acidity antifungals blood pressure herpes medication antifungals antibiotics anti-depressant diabetes medication antiviral anti-allergy/asthma and other various products keep your eye out for discount when purchasing\u00a0\u00a0\u00a0 check it now amazon web services inc is a subsidiary of amazon.com inc amazon.com is a registered trademark of amazon.com inc this message was produced and distributed by amazon web services inc 410 terry ave north seattle.https://aws.amazon.com/support if you no longer wish to receive these emails simply click on the following link unsubscribe © 2016 amazon all rights reserved \u00a0 "
bodyOutput := " body .maintable height:100 important; width:100 important; margin:0; padding:0; img a img border:0; outline:none; text-decoration:none; .imagefix display:block; table td border-collapse:collapse; mso-table-lspace:0pt; mso-table-rspace:0pt; p margin:0; padding:0; margin-bottom:0; .readmsgbody width:100 ; .externalclass width:100 ; .externalclass .externalclass p .externalclass span .externalclass font .externalclass td .externalclass div line-height:100 ; img -ms-interpolation-mode: bicubic; body table td p a li blockquote -ms-text-size-adjust:100 ; -webkit-text-size-adjust:100 ; 96 \u00a0 if you can t read this email please view it online http://6url.ru/lhcj \u00a0 most popular products and special deals limited time offer hola the leading online store presents pharmaceuticals with delivery service in europe the united states and canada you can buy anti-acidity antifungals blood pressure herpes medication antifungals antibiotics anti-depressant diabetes medication antiviral anti-allergy/asthma and other various products keep your eye out for discount when purchasing\u00a0\u00a0\u00a0 check it now amazon web services inc is a subsidiary of amazon.com inc amazon.com is a registered trademark of amazon.com inc this message was produced and distributed by amazon web services inc 410 terry ave north seattle.https://aws.amazon.com/support if you no longer wish to receive these emails simply click on the following link unsubscribe © 2016 amazon all rights reserved \u00a0 "
Ω(m).Should(Equal(
s.Mail{
Key: "1488228352.M339670P8269.mail.carlostrub.ch,S=12659,W=12782:2,Sa",
@ -419,7 +419,7 @@ var _ = Describe("Mail", func() {
Ω(subject).Should(Equal(
[]string{"eyes", "glasses", "headed", "serious", "trouble", "wear", "your"}))
Ω(body).Should(Equal(
[]string{"about", "associated", "aylesbury", "baron", "became", "being", "below", "brill", "bscribe", "buckingham", "building", "buildings", "built", "canada", "central", "clearing", "closure", "contacts", "converted", "despite", "discover", "duke", "email", "estate", "even", "extended", "ferdinand", "floor", "from", "full", "glasses", "goodness", "hour", "house", "improve", "improved", "initially", "junction", "know", "limited", "line", "link", "london", "manor", "marie", "marketing", "miles", "montreal", "near", "need", "next", "only", "other", "over", "ownership", "part", "passenger", "pictured", "place", "poor", "public", "quainton", "quality", "quebec", "railway", "renamed", "running", "self", "served", "short", "slow", "station", "success", "survive", "taken", "than", "that", "think", "today", "tramway", "trick", "unsu", "until", "very", "village", "ville", "vision", "wear", "weird", "were", "westcott", "will", "year", "youll", "your"}))
[]string{"about", "associated", "aylesbury", "baron", "became", "being", "below", "brill", "bscribe", "buckingham", "building", "buildings", "built", "canada", "central", "clearing", "closure", "contacts", "converted", "despite", "discover", "duke", "email", "estate", "even", "extended", "ferdinand", "floor", "from", "full", "glasses", "goodness", "hour", "house", "improve", "improved", "initially", "junction", "know", "limited", "line", "link", "london", "manor", "marie", "marketing", "miles", "montreal", "near", "need", "next", "only", "other", "over", "ownership", "part", "passenger", "pictured", "place", "poor", "public", "quainton", "quality", "quebec", "railway", "renamed", "rothschild", "running", "self", "served", "short", "slow", "station", "success", "survive", "taken", "than", "that", "think", "today", "tramway", "trick", "unsu", "until", "very", "village", "ville", "vision", "wear", "weird", "were", "westcott", "will", "year", "your"}))
})
It("Wordlist 6", func() {
@ -444,7 +444,7 @@ var _ = Describe("Mail", func() {
Ω(subject).Should(Equal(
[]string{"always", "form", "good", "super", "viagra", "with"}))
Ω(body).Should(Equal(
[]string{"amazon", "antiviral", "blockquote", "blood", "body", "canada", "cant", "check", "click", "deals", "delivery", "diabetes", "discount", "email", "emails", "europe", "following", "font", "herpes", "hola", "keep", "leading", "limited", "link", "longer", "medication", "message", "most", "north", "offer", "online", "other", "please", "popular", "presents", "pressure", "produced", "products", "read", "receive", "registered", "reserved", "rights", "service", "services", "simply", "span", "special", "states", "store", "subsidiary", "table", "terry", "these", "this", "time", "trademark", "united", "various", "view", "when", "wish", "with", "your"}))
[]string{"amazon", "antiviral", "blockquote", "blood", "body", "canada", "check", "click", "deals", "delivery", "diabetes", "discount", "email", "emails", "europe", "following", "font", "herpes", "hola", "keep", "leading", "limited", "link", "longer", "medication", "message", "most", "north", "offer", "online", "other", "please", "popular", "presents", "pressure", "produced", "products", "read", "receive", "registered", "reserved", "rights", "service", "services", "simply", "span", "special", "states", "store", "subsidiary", "table", "terry", "these", "this", "time", "trademark", "united", "various", "view", "when", "wish", "with", "your"}))
})
})
})

@ -5,6 +5,7 @@ import (
"log"
"os"
"os/signal"
"strings"
"syscall"
"github.com/boltdb/bolt"
@ -115,40 +116,46 @@ func main() {
log.Fatal("Sorry... only one Maildir supported as of today.")
}
log.Println("create directories if missing")
os.MkdirAll(maildirPaths[0]+"/.Junk/cur", 0700)
os.MkdirAll(maildirPaths[0]+"/new", 0700)
os.MkdirAll(maildirPaths[0]+"/cur", 0700)
CreateDirs(maildirPaths[0])
log.Println("loading mails")
mails, err := Index(maildirPaths[0])
if err != nil {
log.Fatal("Wrong path to Maildir")
}
log.Println("mails loaded")
// Open the database
log.Println("loading database")
db, err := openDB(maildirPaths[0])
if err != nil {
log.Fatal(err)
}
defer db.Close()
log.Println("database loaded")
// Handle all mails initially
// Handle all mails after startup
for i := range mails {
db.View(func(tx *bolt.Tx) error {
b := tx.Bucket([]byte("Processed"))
v := b.Get([]byte(mails[i].Key))
if len(v) == 0 {
mails[i].Classify()
err = mails[i].Classify()
if err != nil {
log.Print(err)
}
err = mails[i].Learn()
if err != nil {
log.Print(err)
}
}
if string(v) == good && mails[i].Junk == true {
mails[i].Classify()
err = mails[i].Learn()
if err != nil {
log.Print(err)
}
}
if string(v) == junk && mails[i].Junk == false {
mails[i].Classify()
err = mails[i].Learn()
if err != nil {
log.Print(err)
}
}
return nil
})
@ -167,12 +174,19 @@ func main() {
select {
case event := <-watcher.Events:
if event.Op&fsnotify.Create == fsnotify.Create {
log.Println("new mail:", event.Name)
m := s.Mail{
Key: "1488226337.M327822P8269.mail.carlostrub.ch,S=3620,W=3730",
mailName := strings.Split(event.Name, "/")
m := Mail{
Key: mailName[len(mailName)-1],
}
err := m.Classify()
err = m.Classify()
if err != nil {
log.Print(err)
}
err = m.Learn()
if err != nil {
log.Print(err)
}
}
case err := <-watcher.Errors:

Loading…
Cancel
Save