diff --git a/bayesian.go b/classify.go
similarity index 79%
rename from bayesian.go
rename to classify.go
index af6e05f..087acc4 100644
--- a/bayesian.go
+++ b/classify.go
@@ -2,6 +2,9 @@ package sisyphus
 
 import (
 	"errors"
+	"log"
+	"os"
+	"strconv"
 
 	"github.com/boltdb/bolt"
 	"github.com/gonum/stat"
@@ -109,6 +112,40 @@ func classificationWord(db *bolt.DB, word string) (g float64, err error) {
 	return g, nil
 }
 
+// Classify cleans the mail, extracts its word list and asks Junk whether the
+// mail is junk. A junk mail is flagged via m.Junk and moved from ./new to
+// ./.Junk/cur (relative to the working directory); any other mail is left
+// untouched so it can be handled by the mail client. The first error from
+// cleaning, classifying or moving the mail is returned.
+func (m *Mail) Classify(db *bolt.DB) error {
+
+	err := m.Clean()
+	if err != nil {
+		return err
+	}
+
+	list := m.Wordlist()
+	junk, err := Junk(db, list)
+	if err != nil {
+		return err
+	}
+
+	// Log the fresh verdict: m.Junk is only updated further down.
+	log.Print("Classified " + m.Key + " as Junk=" + strconv.FormatBool(junk))
+
+	// Move mail around if junk.
+	if junk {
+		m.Junk = junk
+		err := os.Rename("./new/"+m.Key, "./.Junk/cur/"+m.Key)
+		if err != nil {
+			return err
+		}
+		log.Print("Moved " + m.Key + " from new to Junk folder")
+	}
+
+	return nil
+}
+
 // Junk returns true if the wordlist is classified as a junk mail using Bayes'
 // rule.
func Junk(db *bolt.DB, wordlist []string) (bool, error) { diff --git a/database.go b/database.go index 317f2b4..858c9d0 100644 --- a/database.go +++ b/database.go @@ -52,7 +52,6 @@ func openDB(m Maildir) (db *bolt.DB, err error) { return err }) - log.Println("database loaded") return db, err } @@ -66,16 +65,19 @@ func LoadDatabases(d []Maildir) (databases map[Maildir]*bolt.DB, err error) { } } + log.Println("all databases loaded") + return databases, nil } // CloseDatabases closes all databases from a given slice of Maildirs func CloseDatabases(databases map[Maildir]*bolt.DB) { - for _, val := range databases { + for key, val := range databases { err := val.Close() if err != nil { log.Println(err) } + log.Println("database " + string(key) + "/sisyphus.db closed") } return diff --git a/gometalinter.out b/gometalinter.out index 9834ee0..5a9cb4d 100644 --- a/gometalinter.out +++ b/gometalinter.out @@ -1,21 +1,15 @@ [ - {"linter":"gas","severity":"warning","path":"bayesian.go","line":18,"col":0,"message":"Errors unhandled.,LOW,HIGH"}, - {"linter":"gas","severity":"warning","path":"bayesian.go","line":38,"col":0,"message":"Errors unhandled.,LOW,HIGH"}, - {"linter":"gas","severity":"warning","path":"bayesian.go","line":42,"col":0,"message":"Errors unhandled.,LOW,HIGH"}, - {"linter":"gas","severity":"warning","path":"bayesian.go","line":45,"col":0,"message":"Errors unhandled.,LOW,HIGH"}, - {"linter":"gas","severity":"warning","path":"bayesian.go","line":50,"col":0,"message":"Errors unhandled.,LOW,HIGH"}, {"linter":"gas","severity":"warning","path":"daemon.go","line":45,"col":0,"message":"Subprocess launching with variable.,HIGH,HIGH"}, {"linter":"gas","severity":"warning","path":"daemon.go","line":115,"col":0,"message":"Subprocess launching with variable.,HIGH,HIGH"}, {"linter":"gas","severity":"warning","path":"daemon.go","line":122,"col":0,"message":"Subprocess launching with variable.,HIGH,HIGH"}, - 
{"linter":"gocyclo","severity":"warning","path":"mail.go","line":232,"col":0,"message":"cyclomatic complexity 16 of function (*Mail).Classify() is high (\u003e 10)"}, + {"linter":"gas","severity":"warning","path":"mail.go","line":168,"col":0,"message":"Errors unhandled.,LOW,HIGH"}, {"linter":"dupl","severity":"warning","path":"mail_test.go","line":135,"col":0,"message":"duplicate of mail_test.go:160-183"}, {"linter":"dupl","severity":"warning","path":"mail_test.go","line":160,"col":0,"message":"duplicate of mail_test.go:185-208"}, {"linter":"dupl","severity":"warning","path":"mail_test.go","line":185,"col":0,"message":"duplicate of mail_test.go:210-233"}, {"linter":"dupl","severity":"warning","path":"mail_test.go","line":210,"col":0,"message":"duplicate of mail_test.go:235-258"}, {"linter":"dupl","severity":"warning","path":"mail_test.go","line":235,"col":0,"message":"duplicate of mail_test.go:260-283"}, {"linter":"dupl","severity":"warning","path":"mail_test.go","line":260,"col":0,"message":"duplicate of mail_test.go:135-158"}, - {"linter":"errcheck","severity":"warning","path":"bayesian.go","line":18,"col":9,"message":"error return value not checked (db.View(func(tx *bolt.Tx) error {)"}, - {"linter":"errcheck","severity":"warning","path":"bayesian.go","line":38,"col":9,"message":"error return value not checked (db.View(func(tx *bolt.Tx) error {)"}, {"linter":"errcheck","severity":"warning","path":"daemon.go","line":26,"col":18,"message":"error return value not checked (defer file.Close())"}, - {"linter":"errcheck","severity":"warning","path":"mail.go","line":275,"col":11,"message":"error return value not checked (db.Update(func(tx *bolt.Tx) error {)"} + {"linter":"errcheck","severity":"warning","path":"learn.go","line":37,"col":14,"message":"error return value not checked (bucket.Put([]byte(val), word.Marshal()))"}, + {"linter":"errcheck","severity":"warning","path":"learn.go","line":62,"col":8,"message":"error return value not checked (p.Put([]byte(key), 
counter.Marshal()))"}
 ]
diff --git a/learn.go b/learn.go
new file mode 100644
index 0000000..15ddf50
--- /dev/null
+++ b/learn.go
@@ -0,0 +1,79 @@
+package sisyphus
+
+import (
+	"errors"
+	"log"
+
+	"github.com/boltdb/bolt"
+	"github.com/retailnext/hllpp"
+)
+
+// Learn records the mail under its class: "Junk" if m.Junk is set, "Good"
+// otherwise. For every word of the cleaned mail, the mail key is added to the
+// word's HyperLogLog++ sketch in the Wordlists bucket of that class, and once
+// to the ProcessedGood/ProcessedJunk sketch in the Statistics bucket. All
+// writes share one transaction, so a mail is learned completely or not at all
+// and the database is committed once per mail instead of once per word.
+func (m *Mail) Learn(db *bolt.DB) error {
+
+	log.Println("learn mail " + m.Key)
+
+	err := m.Clean()
+	if err != nil {
+		return err
+	}
+
+	list := m.Wordlist()
+
+	wordKey, counterKey := "Good", "ProcessedGood"
+	if m.Junk {
+		wordKey, counterKey = "Junk", "ProcessedJunk"
+	}
+
+	return db.Update(func(tx *bolt.Tx) error {
+		// A missing bucket is reported instead of dereferencing nil.
+		wordlists := tx.Bucket([]byte("Wordlists"))
+		if wordlists == nil {
+			return errors.New("bucket Wordlists not found")
+		}
+		words := wordlists.Bucket([]byte(wordKey))
+		if words == nil {
+			return errors.New("bucket Wordlists/" + wordKey + " not found")
+		}
+		statistics := tx.Bucket([]byte("Statistics"))
+		if statistics == nil {
+			return errors.New("bucket Statistics not found")
+		}
+
+		// Learn words
+		for _, val := range list {
+			if err := addToSketch(words, val, m.Key); err != nil {
+				return err
+			}
+		}
+
+		// Update the statistics counter
+		return addToSketch(statistics, counterKey, m.Key)
+	})
+}
+
+// addToSketch adds mailKey to the HyperLogLog++ sketch stored under name in b,
+// starting from a fresh sketch when the entry is missing or empty.
+func addToSketch(b *bolt.Bucket, name, mailKey string) error {
+	raw := b.Get([]byte(name))
+
+	var sketch *hllpp.HLLPP
+	if len(raw) == 0 {
+		sketch = hllpp.New()
+	} else {
+		var err error
+		sketch, err = hllpp.Unmarshal(raw)
+		if err != nil {
+			return err
+		}
+	}
+
+	sketch.Add([]byte(mailKey))
+
+	return b.Put([]byte(name), sketch.Marshal())
+}
diff --git a/mail.go b/mail.go
index 8751dac..5e65611 100644
--- a/mail.go
+++ b/mail.go
@@ -8,10 +8,8 @@ import (
 	"mime/quotedprintable"
 	"os"
 	"regexp"
-	"strconv"
 	"strings"
 
-	"github.com/boltdb/bolt"
 	"github.com/kennygrant/sanitize"
 	"github.com/luksen/maildir"
 )
@@ -221,44 +219,6 @@ func (m *Mail) Wordlist() (w []string) {
 	return w
 }
 
-// Classify analyses a new mail (a mail that arrived in the "new" directory),
-// decides 
whether it is junk and -- if so -- moves it to the Junk folder. If -// it is not junk, the mail is untouched so it can be handled by the mail -// client. -func (m *Mail) Classify(db *bolt.DB) error { - - err := m.Clean() - if err != nil { - return err - } - - list := m.Wordlist() - junk, err := Junk(db, list) - if err != nil { - return err - } - - log.Print("Classified " + m.Key + " as Junk=" + strconv.FormatBool(m.Junk)) - - // Move mail around if junk. - if junk { - m.Junk = junk - err := os.Rename("./new/"+m.Key, "./.Junk/cur/"+m.Key) - if err != nil { - return err - } - log.Print("Moved " + m.Key + " from new to Junk folder") - } - - return nil -} - -// Learn adds the words to the respective list and unlearns on the other, if -// the mail has been moved from there. -func (m *Mail) Learn(db *bolt.DB) error { - return nil -} - // LoadMails creates missing directories and then loads all mails from a given // slice of Maildirs func LoadMails(d []Maildir) (mails map[Maildir][]*Mail, err error) { diff --git a/sisyphus/sisyphus.go b/sisyphus/sisyphus.go index b96b2db..41f02b2 100644 --- a/sisyphus/sisyphus.go +++ b/sisyphus/sisyphus.go @@ -5,11 +5,9 @@ import ( "log" "os" "os/signal" - "strings" "syscall" "github.com/carlostrub/sisyphus" - "github.com/fsnotify/fsnotify" "github.com/urfave/cli" ) @@ -131,87 +129,67 @@ func main() { defer sisyphus.CloseDatabases(dbs) // Learn at startup - // for i := range mails { - // db.View(func(tx *bolt.Tx) error { - // b := tx.Bucket([]byte("Processed")) - // bMails := b.Bucket([]byte("Mails")) - // v := bMails.Get([]byte(mails[i].Key)) - // if len(v) == 0 { - // err = mails[i].Classify(db) - // if err != nil { - // log.Print(err) - // } - // err = mails[i].Learn(db) - // if err != nil { - // log.Print(err) - // } - // } - // if string(v) == sisyphus.Good && mails[i].Junk == true { - // err = mails[i].Learn(db) - // if err != nil { - // log.Print(err) - // } - // } - // if string(v) == sisyphus.Junk && mails[i].Junk == false { - // 
err = mails[i].Learn(db) - // if err != nil { - // log.Print(err) - // } - // } - // return nil - // }) - // } - - // Classify on arrival - watcher, err := fsnotify.NewWatcher() - if err != nil { - log.Fatal(err) - } - defer watcher.Close() - - done := make(chan bool) - go func() { - for { - select { - case event := <-watcher.Events: - if event.Op&fsnotify.Create == fsnotify.Create { - mailName := strings.Split(event.Name, "/") - m := sisyphus.Mail{ - Key: mailName[len(mailName)-1], - } - - if mailName[len(mailName)-2] == "new" { - err = m.Classify(db) - if err != nil { - log.Print(err) - } - } else { - err = m.Learn(db) - if err != nil { - log.Print(err) - } - } - - } - case err := <-watcher.Errors: - log.Println("error:", err) + for _, d := range maildirs { + db := dbs[d] + m := mails[d] + for _, val := range m { + err := val.Learn(db) + if err != nil { + log.Fatal(err) } } - }() - - err = watcher.Add(maildirPaths[0] + "/cur") - if err != nil { - log.Fatal(err) - } - err = watcher.Add(maildirPaths[0] + "/new") - if err != nil { - log.Fatal(err) - } - err = watcher.Add(maildirPaths[0] + "/.Junk/cur") - if err != nil { - log.Fatal(err) } - <-done + + // // Classify on arrival + // watcher, err := fsnotify.NewWatcher() + // if err != nil { + // log.Fatal(err) + // } + // defer watcher.Close() + // + // done := make(chan bool) + // go func() { + // for { + // select { + // case event := <-watcher.Events: + // if event.Op&fsnotify.Create == fsnotify.Create { + // mailName := strings.Split(event.Name, "/") + // m := sisyphus.Mail{ + // Key: mailName[len(mailName)-1], + // } + // + // if mailName[len(mailName)-2] == "new" { + // err = m.Classify(db) + // if err != nil { + // log.Print(err) + // } + // } else { + // err = m.Learn(db) + // if err != nil { + // log.Print(err) + // } + // } + // + // } + // case err := <-watcher.Errors: + // log.Println("error:", err) + // } + // } + // }() + // + // err = watcher.Add(maildirPaths[0] + "/cur") + // if err != nil { + // 
log.Fatal(err) + // } + // err = watcher.Add(maildirPaths[0] + "/new") + // if err != nil { + // log.Fatal(err) + // } + // err = watcher.Add(maildirPaths[0] + "/.Junk/cur") + // if err != nil { + // log.Fatal(err) + // } + // <-done }, }, {