diff --git a/.gitignore b/.gitignore index a8c4d2b..0b3ffb5 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,4 @@ _testmain.go *.test *.prof sisyphus +sisyphus.db diff --git a/database.go b/database.go index 6c0f9e8..d1d9838 100644 --- a/database.go +++ b/database.go @@ -51,7 +51,7 @@ func openDB(maildir string) (db *bolt.DB, err error) { return db, err } - // Create DB bucket for Junk inside bucket Wordlists + // Create DB bucket for Good inside bucket Wordlists err = db.Update(func(tx *bolt.Tx) error { b := tx.Bucket([]byte("Wordlists")) _, err := b.CreateBucketIfNotExists([]byte("Good")) diff --git a/glide.lock b/glide.lock index a8a8193..6017645 100644 --- a/glide.lock +++ b/glide.lock @@ -1,58 +1,16 @@ -hash: 1d7fb4b49ab9cebe25e4d605b7256849d2fb5550372e53dcf1a8dc030d32fce0 -updated: 2017-03-11T20:05:25.966344527Z +hash: ad57db461a149fadda777f61bafa73ed65c46ef4f18e9f86b7e282070ea587e2 +updated: 2017-03-15T21:05:02.415303554Z imports: - name: github.com/boltdb/bolt version: 583e8937c61f1af6513608ccc75c97b6abdf4ff9 -- name: github.com/jbrukh/bayesian - version: bf3f261f9a9c61145c60d47665b0518cc32c774f - name: github.com/kennygrant/sanitize version: 6a0bfdde8629a3a3a7418a7eae45c54154692514 - name: github.com/luksen/maildir version: 5297d9c3091c7d4891c9d4f6fa743d500c038d6f - name: github.com/urfave/cli version: 9e5b04886c4bfee2ceba1465b8121057355c4e53 -- name: golang.org/x/net - version: a6577fac2d73be281a500b310739095313165611 - subpackages: - - html - - html/atom -- name: golang.org/x/sys - version: 99f16d856c9836c42d24e7ab64ea72916925fa97 - subpackages: - - unix testImports: - name: github.com/onsi/ginkgo - version: ab07225d112dc7a93c289ac5b2e12735c2c46035 - subpackages: - - config - - internal/codelocation - - internal/containernode - - internal/failer - - internal/leafnodes - - internal/remote - - internal/spec - - internal/specrunner - - internal/suite - - internal/testingtproxy - - internal/writer - - reporters - - reporters/stenographer - - reporters/stenographer/support/go-colorable - - reporters/stenographer/support/go-isatty - - types + version: 5ca121185e255e5041d7727d77992618545a93d2 - name: github.com/onsi/gomega version: 1de7ab2df9105aa5c15c4d7e14a8a514e3cb8d4b - subpackages: - - format - - internal/assertion - - internal/asyncassertion - - internal/oraclematcher - - internal/testingtsupport - - matchers - - matchers/support/goraph/bipartitegraph - - matchers/support/goraph/edge - - matchers/support/goraph/node - - matchers/support/goraph/util - - types -- name: gopkg.in/yaml.v2 - version: a3f3340b5840cee44f372bddb5880fcbc419b46a diff --git a/glide.yaml b/glide.yaml index 102b46c..2489bd6 100644 --- a/glide.yaml +++ b/glide.yaml @@ -1,6 +1,5 @@ package: github.com/carlostrub/sisyphus import: -- package: github.com/jbrukh/bayesian - package: github.com/kennygrant/sanitize - package: github.com/luksen/maildir - package: github.com/boltdb/bolt diff --git a/mail.go b/mail.go index 02ff4ed..36d0f64 100644 --- a/mail.go +++ b/mail.go @@ -3,6 +3,7 @@ package main import ( "bufio" "errors" + "math" "mime/quotedprintable" "regexp" "strings" @@ -19,26 +20,20 @@ type Mail struct { } // Index loads all mail keys from the Maildir directory for processing. -func Index(d string) (m []*Mail, err error) { +func Index(d string, junk bool) (m []*Mail, err error) { - g, err := maildir.Dir(d).Keys() - if err != nil { - return m, err + if junk { + j, err := maildir.Dir(d + "/.Junk").Keys() + } else { + j, err := maildir.Dir(d).Keys() } - for _, val := range g { - var new Mail - new.Key = val - m = append(m, &new) - } - - j, err := maildir.Dir(d + "/.Junk").Keys() if err != nil { return m, err } for _, val := range j { var new Mail new.Key = val - new.Junk = true + new.Junk = junk m = append(m, &new) } @@ -63,7 +58,6 @@ func cleanString(i string) (s string, err error) { s = strings.Replace(s, "charset", " ", -1) s = strings.Replace(s, "content-transfer-encoding", " ", -1) s = strings.Replace(s, "content-type", " ", -1) - s = strings.Replace(s, "cp-850", " ", -1) s = strings.Replace(s, "image/jpeg", " ", -1) s = strings.Replace(s, "multipart/alternative", " ", -1) s = strings.Replace(s, "multipart/related", " ", -1) @@ -119,24 +113,32 @@ func wordlist(s string) (l []string, err error) { list := make(map[string]int) raw := strings.Split(s, " ") + var clean []string - for _, i := range raw { + for _, w := range raw { // no long or too short words - length := len(i) + length := len(w) if length < 4 || length > 10 { continue } // no numbers, special characters, etc. -- only words - match, _ := regexp.MatchString("(^[a-z]+$)", i) + match, _ := regexp.MatchString("(^[a-z]+$)", w) if !match { continue } else { - list[i]++ + clean = append(clean, w) } } + // only the first 200 words count + maxWords := int(math.Min(200, float64(len(clean)))) + for i := 0; i < maxWords; i++ { + w := clean[i] + list[w]++ + } + for word, count := range list { if count > 10 { continue diff --git a/mail_test.go b/mail_test.go index f961332..7c74113 100644 --- a/mail_test.go +++ b/mail_test.go @@ -162,7 +162,7 @@ var _ = Describe("Mail", func() { Ω(err).ShouldNot(HaveOccurred()) subjectOutput := "hello" - bodyOutput := " ------ 000 0032 01d2912f.05324bc6 : ; : dear cs we are looking for employees working remotely my name is kari i am the personnel manager of a large international company most of the work you can do from home that is at a distance salary is 2000- 5300 if you are interested in this offer please visit our site best regards ------ 000 0032 01d2912f.05324bc6 : ; : dear cs we are looking for employees working remotely. my name is kari i am the personnel manager of a large international company. most of the work you can do from home that is at a distance. salary is 2000- 5300. if you are interested in this offer please visit our site best regards ------ 000 0032 01d2912f.05324bc6-- " + bodyOutput := " ------ 000 0032 01d2912f.05324bc6 : ; cp-850 : dear cs we are looking for employees working remotely my name is kari i am the personnel manager of a large international company most of the work you can do from home that is at a distance salary is 2000- 5300 if you are interested in this offer please visit our site best regards ------ 000 0032 01d2912f.05324bc6 : ; cp-850 : dear cs we are looking for employees working remotely. my name is kari i am the personnel manager of a large international company. most of the work you can do from home that is at a distance. salary is 2000- 5300. if you are interested in this offer please visit our site best regards ------ 000 0032 01d2912f.05324bc6-- " Ω(m).Should(Equal( s.Mail{ Key: "1488226337.M327822P8269.mail.carlostrub.ch,S=3620,W=3730", diff --git a/main.go b/main.go index c531206..04db441 100644 --- a/main.go +++ b/main.go @@ -8,20 +8,16 @@ import ( "os/signal" "syscall" + "github.com/boltdb/bolt" "github.com/urfave/cli" ) -var ( -// Processed is a map of e-mail IDs and the value set to true if Junk -// Processed map[string]bool +const ( + good = "0" + junk = "1" ) func main() { - // Get working directory - wd, err := os.Getwd() - if err != nil { - panic(err) - } // Define App app := cli.NewApp() @@ -43,9 +39,7 @@ func main() { }, } - maildirPaths := cli.StringSlice([]string{ - wd + "/Maildir", - }) + maildirPaths := cli.StringSlice([]string{}) var pidfile *string pidfile = new(string) @@ -81,14 +75,16 @@ func main() { fmt.Print(` + ███████╗██╗███████╗██╗ ██╗██████╗ ██╗ ██╗██╗ ██╗███████╗ ██╔════╝██║██╔════╝╚██╗ ██╔╝██╔══██╗██║ ██║██║ ██║██╔════╝ ███████╗██║███████╗ ╚████╔╝ ██████╔╝███████║██║ ██║███████╗ ╚════██║██║╚════██║ ╚██╔╝ ██╔═══╝ ██╔══██║██║ ██║╚════██║ ███████║██║███████║ ██║ ██║ ██║ ██║╚██████╔╝███████║ ╚══════╝╚═╝╚══════╝ ╚═╝ ╚═╝ ╚═╝ ╚═╝ ╚═════╝ ╚══════╝ - - `) + + +`) // Make arrangement to remove PID file upon receiving the SIGTERM from kill command ch := make(chan os.Signal, 1) signal.Notify(ch, os.Interrupt, os.Kill, syscall.SIGTERM) @@ -108,25 +104,67 @@ func main() { os.Exit(0) }() - // var maildir []string - // if maildir == nil { - // return errors.New("no maildir selected") - // } - // - // // Load the Maildir - // mails, err := Index(maildirPaths[0]) - // if err != nil { - // return cli.NewExitError(err, 66) - // } - // - // fmt.Println(mails) - // - // // Open the database - // db, err := openDB(maildirPaths[0]) - // if err != nil { - // return cli.NewExitError(err, 66) - // } - // defer db.Close() + + // Load the Maildir + if len(maildirPaths) < 1 { + log.Fatal("No Maildir set.") + } + if len(maildirPaths) > 1 { + log.Fatal("Sorry... only one Maildir supported as of today.") + } + + log.Println("loading mails") + mailsGood, err := Index(maildirPaths[0], false) + if err != nil { + log.Fatal("Wrong path to Maildir") + } + log.Println("good mails loaded") + os.MkdirAll(maildirPaths[0]+"/.Junk/cur", 0700) + mailsJunk, err := Index(maildirPaths[0], true) + if err != nil { + log.Fatal("Wrong path to Maildir") + } + log.Println("junk mails loaded") + + // Open the database + log.Println("loading database") + db, err := openDB(maildirPaths[0]) + if err != nil { + log.Fatal(err) + } + defer db.Close() + log.Println("database loaded") + + // Check for unprocessed mail + var unprocessedJunk, unprocessedGood []string + for i := range mailsGood { + db.View(func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("Processed")) + v := b.Get([]byte(mails[i].Key)) + if len(v) == 0 { + unprocessedGood = append(unprocessedGood, mails[i].Key) + } + if string(v) == junk { + unprocessedGood = append(unprocessedGood, mails[i].Key) + } + return nil + }) + } + for i := range mailsJunk { + db.View(func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("Processed")) + v := b.Get([]byte(mails[i].Key)) + if len(v) == 0 { + unprocessedJunk = append(unprocessedJunk, mails[i].Key) + } + if string(v) == good { + unprocessedJunk = append(unprocessedJunk, mails[i].Key) + } + return nil + }) + } + + // Classify and learn unprocessed mail mux := http.NewServeMux() log.Fatalln(http.ListenAndServe(":8080", mux))