Implement learning and classifying using Bayes' rule and HyperLogLog

data structures -- still a long way to go, though.
master
Carlo Strub 7 years ago
parent 0497999a15
commit b5b3792efe

@ -2,6 +2,9 @@ package sisyphus
import (
"errors"
"log"
"os"
"strconv"
"github.com/boltdb/bolt"
"github.com/gonum/stat"
@ -109,6 +112,38 @@ func classificationWord(db *bolt.DB, word string) (g float64, err error) {
return g, nil
}
// Classify analyses a new mail (a mail that arrived in the "new" directory),
// decides whether it is junk and -- if so -- moves it to the Junk folder. If
// it is not junk, the mail is left untouched so it can be handled by the mail
// client.
//
// The mail is cleaned first, then its word list is handed to Junk together
// with db. m.Junk is only modified when the verdict is junk. Errors from
// Clean, Junk and the rename are returned unchanged.
func (m *Mail) Classify(db *bolt.DB) error {
	if err := m.Clean(); err != nil {
		return err
	}

	junk, err := Junk(db, m.Wordlist())
	if err != nil {
		return err
	}

	// Report the verdict that was just computed. m.Junk has not been updated
	// at this point, so logging it would print the stale field value.
	log.Print("Classified " + m.Key + " as Junk=" + strconv.FormatBool(junk))

	if !junk {
		return nil
	}

	// Move mail around if junk. The flag is set before the rename, so it
	// stays set even when the move fails.
	m.Junk = true

	// Both paths are relative to the current working directory.
	if err := os.Rename("./new/"+m.Key, "./.Junk/cur/"+m.Key); err != nil {
		return err
	}
	log.Print("Moved " + m.Key + " from new to Junk folder")

	return nil
}
// Junk returns true if the wordlist is classified as a junk mail using Bayes'
// rule.
func Junk(db *bolt.DB, wordlist []string) (bool, error) {

@ -52,7 +52,6 @@ func openDB(m Maildir) (db *bolt.DB, err error) {
return err
})
log.Println("database loaded")
return db, err
}
@ -66,16 +65,19 @@ func LoadDatabases(d []Maildir) (databases map[Maildir]*bolt.DB, err error) {
}
}
log.Println("all databases loaded")
return databases, nil
}
// CloseDatabases closes all databases from a given slice of Maildirs
func CloseDatabases(databases map[Maildir]*bolt.DB) {
for _, val := range databases {
for key, val := range databases {
err := val.Close()
if err != nil {
log.Println(err)
}
log.Println("database " + string(key) + "/sisyphus.db closed")
}
return

@ -1,21 +1,15 @@
[
{"linter":"gas","severity":"warning","path":"bayesian.go","line":18,"col":0,"message":"Errors unhandled.,LOW,HIGH"},
{"linter":"gas","severity":"warning","path":"bayesian.go","line":38,"col":0,"message":"Errors unhandled.,LOW,HIGH"},
{"linter":"gas","severity":"warning","path":"bayesian.go","line":42,"col":0,"message":"Errors unhandled.,LOW,HIGH"},
{"linter":"gas","severity":"warning","path":"bayesian.go","line":45,"col":0,"message":"Errors unhandled.,LOW,HIGH"},
{"linter":"gas","severity":"warning","path":"bayesian.go","line":50,"col":0,"message":"Errors unhandled.,LOW,HIGH"},
{"linter":"gas","severity":"warning","path":"daemon.go","line":45,"col":0,"message":"Subprocess launching with variable.,HIGH,HIGH"},
{"linter":"gas","severity":"warning","path":"daemon.go","line":115,"col":0,"message":"Subprocess launching with variable.,HIGH,HIGH"},
{"linter":"gas","severity":"warning","path":"daemon.go","line":122,"col":0,"message":"Subprocess launching with variable.,HIGH,HIGH"},
{"linter":"gocyclo","severity":"warning","path":"mail.go","line":232,"col":0,"message":"cyclomatic complexity 16 of function (*Mail).Classify() is high (\u003e 10)"},
{"linter":"gas","severity":"warning","path":"mail.go","line":168,"col":0,"message":"Errors unhandled.,LOW,HIGH"},
{"linter":"dupl","severity":"warning","path":"mail_test.go","line":135,"col":0,"message":"duplicate of mail_test.go:160-183"},
{"linter":"dupl","severity":"warning","path":"mail_test.go","line":160,"col":0,"message":"duplicate of mail_test.go:185-208"},
{"linter":"dupl","severity":"warning","path":"mail_test.go","line":185,"col":0,"message":"duplicate of mail_test.go:210-233"},
{"linter":"dupl","severity":"warning","path":"mail_test.go","line":210,"col":0,"message":"duplicate of mail_test.go:235-258"},
{"linter":"dupl","severity":"warning","path":"mail_test.go","line":235,"col":0,"message":"duplicate of mail_test.go:260-283"},
{"linter":"dupl","severity":"warning","path":"mail_test.go","line":260,"col":0,"message":"duplicate of mail_test.go:135-158"},
{"linter":"errcheck","severity":"warning","path":"bayesian.go","line":18,"col":9,"message":"error return value not checked (db.View(func(tx *bolt.Tx) error {)"},
{"linter":"errcheck","severity":"warning","path":"bayesian.go","line":38,"col":9,"message":"error return value not checked (db.View(func(tx *bolt.Tx) error {)"},
{"linter":"errcheck","severity":"warning","path":"daemon.go","line":26,"col":18,"message":"error return value not checked (defer file.Close())"},
{"linter":"errcheck","severity":"warning","path":"mail.go","line":275,"col":11,"message":"error return value not checked (db.Update(func(tx *bolt.Tx) error {)"}
{"linter":"errcheck","severity":"warning","path":"learn.go","line":37,"col":14,"message":"error return value not checked (bucket.Put([]byte(val), word.Marshal()))"},
{"linter":"errcheck","severity":"warning","path":"learn.go","line":62,"col":8,"message":"error return value not checked (p.Put([]byte(key), counter.Marshal()))"}
]

@ -0,0 +1,83 @@
package sisyphus
import (
"log"
"github.com/boltdb/bolt"
"github.com/retailnext/hllpp"
)
// Learn adds the mail key to the HyperLogLog++ sketch of every word in the
// mail's word list, and to the sketch counting processed mails.
//
// Words go into the "Good" or "Junk" sub-bucket of "Wordlists", and the
// counter is "ProcessedGood" or "ProcessedJunk" in "Statistics", depending on
// m.Junk. All writes happen in a single bolt transaction, so either the whole
// mail is learned or nothing is changed.
//
// It returns the error from Clean, bolt.ErrBucketNotFound if one of the
// buckets is missing, or any error from decoding or storing a sketch.
func (m *Mail) Learn(db *bolt.DB) error {
	log.Println("learn mail " + m.Key)

	if err := m.Clean(); err != nil {
		return err
	}

	list := m.Wordlist()

	wordKey, statKey := "Good", "ProcessedGood"
	if m.Junk {
		wordKey, statKey = "Junk", "ProcessedJunk"
	}

	return db.Update(func(tx *bolt.Tx) error {
		// Bucket lookups return nil for unknown names; fail with an error
		// instead of dereferencing a nil bucket.
		lists := tx.Bucket([]byte("Wordlists"))
		if lists == nil {
			return bolt.ErrBucketNotFound
		}
		words := lists.Bucket([]byte(wordKey))
		if words == nil {
			return bolt.ErrBucketNotFound
		}

		// Learn words
		for _, w := range list {
			if err := learnKey(words, w, m.Key); err != nil {
				return err
			}
		}

		// Update the statistics counter
		stats := tx.Bucket([]byte("Statistics"))
		if stats == nil {
			return bolt.ErrBucketNotFound
		}
		return learnKey(stats, statKey, m.Key)
	})
}

// learnKey loads the sketch stored under key in b (or starts a new one when
// nothing is stored yet), adds member to it and writes it back.
func learnKey(b *bolt.Bucket, key, member string) error {
	var sketch *hllpp.HLLPP

	if raw := b.Get([]byte(key)); len(raw) == 0 {
		sketch = hllpp.New()
	} else {
		// bolt values must not be modified by the caller. Decode from a
		// private copy in case the sketch keeps using the slice it was given
		// -- NOTE(review): confirm against hllpp.Unmarshal.
		var err error
		sketch, err = hllpp.Unmarshal(append([]byte(nil), raw...))
		if err != nil {
			return err
		}
	}

	sketch.Add([]byte(member))

	return b.Put([]byte(key), sketch.Marshal())
}

@ -8,10 +8,8 @@ import (
"mime/quotedprintable"
"os"
"regexp"
"strconv"
"strings"
"github.com/boltdb/bolt"
"github.com/kennygrant/sanitize"
"github.com/luksen/maildir"
)
@ -221,44 +219,6 @@ func (m *Mail) Wordlist() (w []string) {
return w
}
// Classify analyses a new mail (a mail that arrived in the "new" directory),
// decides whether it is junk and -- if so -- moves it to the Junk folder. If
// it is not junk, the mail is left untouched so it can be handled by the mail
// client.
//
// The mail is cleaned first, then its word list is handed to Junk together
// with db. m.Junk is only modified when the verdict is junk. Errors from
// Clean, Junk and the rename are returned unchanged.
func (m *Mail) Classify(db *bolt.DB) error {
	if err := m.Clean(); err != nil {
		return err
	}

	junk, err := Junk(db, m.Wordlist())
	if err != nil {
		return err
	}

	// Report the verdict that was just computed. m.Junk has not been updated
	// at this point, so logging it would print the stale field value.
	log.Print("Classified " + m.Key + " as Junk=" + strconv.FormatBool(junk))

	if !junk {
		return nil
	}

	// Move mail around if junk. The flag is set before the rename, so it
	// stays set even when the move fails.
	m.Junk = true

	// Both paths are relative to the current working directory.
	if err := os.Rename("./new/"+m.Key, "./.Junk/cur/"+m.Key); err != nil {
		return err
	}
	log.Print("Moved " + m.Key + " from new to Junk folder")

	return nil
}
// Learn adds the words to the respective list and unlearns on the other, if
// the mail has been moved from there.
//
// NOTE(review): the body below is a placeholder -- it ignores db, touches no
// word list and always returns nil, so the behaviour described above is not
// implemented in this version.
func (m *Mail) Learn(db *bolt.DB) error {
return nil
}
// LoadMails creates missing directories and then loads all mails from a given
// slice of Maildirs
func LoadMails(d []Maildir) (mails map[Maildir][]*Mail, err error) {

@ -5,11 +5,9 @@ import (
"log"
"os"
"os/signal"
"strings"
"syscall"
"github.com/carlostrub/sisyphus"
"github.com/fsnotify/fsnotify"
"github.com/urfave/cli"
)
@ -131,87 +129,67 @@ func main() {
defer sisyphus.CloseDatabases(dbs)
// Learn at startup
// for i := range mails {
// db.View(func(tx *bolt.Tx) error {
// b := tx.Bucket([]byte("Processed"))
// bMails := b.Bucket([]byte("Mails"))
// v := bMails.Get([]byte(mails[i].Key))
// if len(v) == 0 {
// err = mails[i].Classify(db)
// if err != nil {
// log.Print(err)
// }
// err = mails[i].Learn(db)
// if err != nil {
// log.Print(err)
// }
// }
// if string(v) == sisyphus.Good && mails[i].Junk == true {
// err = mails[i].Learn(db)
// if err != nil {
// log.Print(err)
// }
// }
// if string(v) == sisyphus.Junk && mails[i].Junk == false {
// err = mails[i].Learn(db)
// if err != nil {
// log.Print(err)
// }
// }
// return nil
// })
// }
// Classify on arrival
watcher, err := fsnotify.NewWatcher()
if err != nil {
log.Fatal(err)
}
defer watcher.Close()
done := make(chan bool)
go func() {
for {
select {
case event := <-watcher.Events:
if event.Op&fsnotify.Create == fsnotify.Create {
mailName := strings.Split(event.Name, "/")
m := sisyphus.Mail{
Key: mailName[len(mailName)-1],
}
if mailName[len(mailName)-2] == "new" {
err = m.Classify(db)
if err != nil {
log.Print(err)
}
} else {
err = m.Learn(db)
if err != nil {
log.Print(err)
}
}
}
case err := <-watcher.Errors:
log.Println("error:", err)
for _, d := range maildirs {
db := dbs[d]
m := mails[d]
for _, val := range m {
err := val.Learn(db)
if err != nil {
log.Fatal(err)
}
}
}()
err = watcher.Add(maildirPaths[0] + "/cur")
if err != nil {
log.Fatal(err)
}
err = watcher.Add(maildirPaths[0] + "/new")
if err != nil {
log.Fatal(err)
}
err = watcher.Add(maildirPaths[0] + "/.Junk/cur")
if err != nil {
log.Fatal(err)
}
<-done
// // Classify on arrival
// watcher, err := fsnotify.NewWatcher()
// if err != nil {
// log.Fatal(err)
// }
// defer watcher.Close()
//
// done := make(chan bool)
// go func() {
// for {
// select {
// case event := <-watcher.Events:
// if event.Op&fsnotify.Create == fsnotify.Create {
// mailName := strings.Split(event.Name, "/")
// m := sisyphus.Mail{
// Key: mailName[len(mailName)-1],
// }
//
// if mailName[len(mailName)-2] == "new" {
// err = m.Classify(db)
// if err != nil {
// log.Print(err)
// }
// } else {
// err = m.Learn(db)
// if err != nil {
// log.Print(err)
// }
// }
//
// }
// case err := <-watcher.Errors:
// log.Println("error:", err)
// }
// }
// }()
//
// err = watcher.Add(maildirPaths[0] + "/cur")
// if err != nil {
// log.Fatal(err)
// }
// err = watcher.Add(maildirPaths[0] + "/new")
// if err != nil {
// log.Fatal(err)
// }
// err = watcher.Add(maildirPaths[0] + "/.Junk/cur")
// if err != nil {
// log.Fatal(err)
// }
// <-done
},
},
{

Loading…
Cancel
Save