From f76bfbd6ebaa22957304ca201c48195add07cbd5 Mon Sep 17 00:00:00 2001 From: Carlo Strub Date: Sat, 13 May 2017 21:22:23 +0000 Subject: [PATCH] add classify test and fix other tests --- classify.go | 48 ++++++++++++++---------- classify_test.go | 96 ++++++++++++++++++++++++++++++++++++++++++++++++ learn_test.go | 8 ++-- mail_test.go | 67 +++++++++++++++++---------------- 4 files changed, 164 insertions(+), 55 deletions(-) create mode 100644 classify_test.go diff --git a/classify.go b/classify.go index 964f819..55c018a 100644 --- a/classify.go +++ b/classify.go @@ -42,13 +42,13 @@ func classificationPrior(db *bolt.DB) (g float64, err error) { func classificationLikelihood(db *bolt.DB, word string) (g, j float64, err error) { err = db.View(func(tx *bolt.Tx) error { - var gN, jN uint64 + var gN, jN, gTotal, jTotal uint64 b := tx.Bucket([]byte("Wordlists")) good := b.Bucket([]byte("Good")) gWordRaw := good.Get([]byte(word)) - if len(gWordRaw) != 0 { + if len(gWordRaw) > 0 { gWordHLL, err := hllpp.Unmarshal(gWordRaw) if err != nil { return err @@ -57,7 +57,7 @@ func classificationLikelihood(db *bolt.DB, word string) (g, j float64, err error } junk := b.Bucket([]byte("Junk")) jWordRaw := junk.Get([]byte(word)) - if len(jWordRaw) != 0 { + if len(jWordRaw) > 0 { jWordHLL, err := hllpp.Unmarshal(jWordRaw) if err != nil { return err @@ -66,22 +66,28 @@ func classificationLikelihood(db *bolt.DB, word string) (g, j float64, err error } p := tx.Bucket([]byte("Statistics")) - gHLL, err := hllpp.Unmarshal(p.Get([]byte("ProcessedGood"))) - if err != nil { - return err + gRaw := p.Get([]byte("ProcessedGood")) + if len(gRaw) > 0 { + gHLL, err := hllpp.Unmarshal(gRaw) + if err != nil { + return err + } + gTotal = gHLL.Count() } - jHLL, err := hllpp.Unmarshal(p.Get([]byte("ProcessedJunk"))) - if err != nil { - return err + jRaw := p.Get([]byte("ProcessedJunk")) + if len(jRaw) > 0 { + jHLL, err := hllpp.Unmarshal(jRaw) + if err != nil { + return err + } + jTotal = jHLL.Count() } - gTotal := gHLL.Count() if gTotal == 0 { - return errors.New("no good mails have been classified so far") + return errors.New("no good mails have yet been classified") } - jTotal := jHLL.Count() if jTotal == 0 { - return errors.New("no junk mails have been classified so far") + return errors.New("no junk mails have yet been classified") } g = float64(gN) / float64(gTotal) @@ -128,7 +134,7 @@ func (m *Mail) Classify(db *bolt.DB) error { return err } - junk, err := Junk(db, list) + junk, _, err := Junk(db, list) if err != nil { return err } @@ -149,21 +155,23 @@ func (m *Mail) Classify(db *bolt.DB) error { } // Junk returns true if the wordlist is classified as a junk mail using Bayes' -// rule. -func Junk(db *bolt.DB, wordlist []string) (bool, error) { +// rule. If required, it also returns the calculated probability of being junk, +// but this is typically not needed. +func Junk(db *bolt.DB, wordlist []string) (junk bool, prob float64, err error) { var probabilities []float64 for _, val := range wordlist { p, err := classificationWord(db, val) if err != nil { - return false, err + return false, prob, err } probabilities = append(probabilities, p) } - if stat.HarmonicMean(probabilities, nil) < 0.5 { - return true, nil + prob = stat.HarmonicMean(probabilities, nil) + if prob < 0.5 { + return true, (1 - prob), nil } - return false, nil + return false, (1 - prob), nil } diff --git a/classify_test.go b/classify_test.go new file mode 100644 index 0000000..444007c --- /dev/null +++ b/classify_test.go @@ -0,0 +1,96 @@ +package sisyphus_test + +import ( + "math" + "os" + + . "github.com/carlostrub/sisyphus" + + . "github.com/onsi/ginkgo" + . "github.com/onsi/gomega" +) + +var _ = Describe("Classify", func() { + Context("Classify a new mail", func() { + BeforeEach(func() { + // check whether there exists a DB file + _, oserr := os.Stat("test/Maildir/sisyphus.db") + Ω(os.IsNotExist(oserr)).Should(BeTrue()) + + // Load db + dbs, err = LoadDatabases([]Maildir{"test/Maildir"}) + Ω(err).ShouldNot(HaveOccurred()) + + m = new(Mail) + + // Load junk mail + m = &Mail{ + Key: "1488226337.M327833P8269.mail.carlostrub.ch,S=6960,W=7161:2,Sa", + Junk: true, + } + + err = m.Load("test/Maildir") + Ω(err).ShouldNot(HaveOccurred()) + + m.Learn(dbs["test/Maildir"]) + + // Load good mail + m = &Mail{ + Key: "1488230510.M141612P8565.mail.carlostrub.ch,S=5978,W=6119", + } + + err = m.Load("test/Maildir") + Ω(err).ShouldNot(HaveOccurred()) + + m.Learn(dbs["test/Maildir"]) + }) + AfterEach(func() { + // Cleanup + CloseDatabases(dbs) + + err = os.Remove("test/Maildir/sisyphus.db") + Ω(err).ShouldNot(HaveOccurred()) + }) + + It("Classify one word from the mail that was learned before", func() { + + answer, prob, err := Junk(dbs["test/Maildir"], []string{"london"}) + + Ω(err).ShouldNot(HaveOccurred()) + Ω(prob).Should(Equal(1.0)) + Ω(answer).Should(BeTrue()) + + }) + + It("Classify one word from the mail that was learned before", func() { + + answer, prob, err := Junk(dbs["test/Maildir"], []string{"localbase"}) + + Ω(err).ShouldNot(HaveOccurred()) + Ω(prob).Should(Equal(0.0)) + Ω(answer).Should(BeFalse()) + + }) + + It("Classify one word from the mail that was never learned", func() { + + answer, prob, err := Junk(dbs["test/Maildir"], []string{"abcdefg"}) + + Ω(err).ShouldNot(HaveOccurred()) + Ω(math.IsNaN(prob)).Should(BeTrue()) + Ω(answer).Should(BeFalse()) + + }) + + It("Classify one word from the mail that was learned in good and junk", func() { + + m.Learn(dbs["test/Maildir"]) + answer, prob, err := Junk(dbs["test/Maildir"], []string{"than"}) + + Ω(err).ShouldNot(HaveOccurred()) + Ω(prob).Should(Equal(0.7795275590551181)) + Ω(answer).Should(BeTrue()) + + }) + }) +}) diff --git a/learn_test.go b/learn_test.go index b25bbc5..4521a15 100644 --- a/learn_test.go +++ b/learn_test.go @@ -11,9 +11,11 @@ import ( . "github.com/onsi/gomega" ) -var m *Mail -var dbs map[Maildir]*bolt.DB -var err error +var ( + m *Mail + dbs map[Maildir]*bolt.DB + err error +) var _ = Describe("Learn", func() { Context("Learn a new mail", func() { diff --git a/mail_test.go b/mail_test.go index 44bc6d1..7cca802 100644 --- a/mail_test.go +++ b/mail_test.go @@ -234,10 +234,8 @@ var _ = Describe("Mail", func() { It("More Junk", func() { m := s.Mail{ - Key: "1488226337.M327833P8269.mail.carlostrub.ch,S=6960,W=7161:2,Sa", - Subject: nil, - Body: nil, - Junk: true, + Key: "1488226337.M327833P8269.mail.carlostrub.ch,S=6960,W=7161:2,Sa", + Junk: true, } err := m.Load("test/Maildir") @@ -259,10 +257,8 @@ var _ = Describe("Mail", func() { It("More Junk", func() { m := s.Mail{ - Key: "1488228352.M339670P8269.mail.carlostrub.ch,S=12659,W=12782:2,Sa", - Subject: nil, - Body: nil, - Junk: true, + Key: "1488228352.M339670P8269.mail.carlostrub.ch,S=12659,W=12782:2,Sa", + Junk: true, } err := m.Load("test/Maildir") @@ -284,10 +280,8 @@ var _ = Describe("Mail", func() { It("Wordlist 1", func() { m := s.Mail{ - Key: "1488181583.M633084P4781.mail.carlostrub.ch,S=708375,W=720014:2,a", - Subject: nil, - Body: nil, - Junk: true, + Key: "1488181583.M633084P4781.mail.carlostrub.ch,S=708375,W=720014:2,a", + Junk: true, } err := m.Load("test/Maildir") @@ -306,10 +300,8 @@ var _ = Describe("Mail", func() { It("Wordlist 2", func() { m := s.Mail{ - Key: "1488226337.M327822P8269.mail.carlostrub.ch,S=3620,W=3730:2,Sa", - Subject: nil, - Body: nil, - Junk: true, + Key: "1488226337.M327822P8269.mail.carlostrub.ch,S=3620,W=3730:2,Sa", + Junk: true, } err := m.Load("test/Maildir") @@ -328,10 +320,8 @@ var _ = Describe("Mail", func() { It("Wordlist 3", func() { m := s.Mail{ - Key: "1488226337.M327824P8269.mail.carlostrub.ch,S=8044,W=8167:2,Sa", - Subject: nil, - Body: nil, - Junk: true, + Key: "1488226337.M327824P8269.mail.carlostrub.ch,S=8044,W=8167:2,Sa", + Junk: true, } err := m.Load("test/Maildir") @@ -350,10 +340,8 @@ var _ = Describe("Mail", func() { It("Wordlist 4", func() { m := s.Mail{ - Key: "1488226337.M327825P8269.mail.carlostrub.ch,S=802286,W=812785:2,Sa", - Subject: nil, - Body: nil, - Junk: true, + Key: "1488226337.M327825P8269.mail.carlostrub.ch,S=802286,W=812785:2,Sa", + Junk: true, } err := m.Load("test/Maildir") @@ -372,10 +360,8 @@ var _ = Describe("Mail", func() { It("Wordlist 5", func() { m := s.Mail{ - Key: "1488226337.M327833P8269.mail.carlostrub.ch,S=6960,W=7161:2,Sa", - Subject: nil, - Body: nil, - Junk: true, + Key: "1488226337.M327833P8269.mail.carlostrub.ch,S=6960,W=7161:2,Sa", + Junk: true, } err := m.Load("test/Maildir") @@ -394,10 +380,8 @@ var _ = Describe("Mail", func() { It("Wordlist 6", func() { m := s.Mail{ - Key: "1488228352.M339670P8269.mail.carlostrub.ch,S=12659,W=12782:2,Sa", - Subject: nil, - Body: nil, - Junk: true, + Key: "1488228352.M339670P8269.mail.carlostrub.ch,S=12659,W=12782:2,Sa", + Junk: true, } err := m.Load("test/Maildir") @@ -413,5 +397,24 @@ var _ = Describe("Mail", func() { Ω(list).Should(Equal( []string{"always", "amazon", "antiviral", "blockquote", "blood", "body", "canada", "check", "click", "deals", "delivery", "diabetes", "discount", "email", "emails", "europe", "following", "font", "form", "good", "herpes", "hola", "keep", "leading", "limited", "link", "longer", "medication", "message", "most", "north", "offer", "online", "other", "please", "popular", "presents", "pressure", "produced", "products", "read", "receive", "registered", "reserved", "rights", "service", "services", "simply", "span", "special", "states", "store", "subsidiary", "super", "table", "terry", "these", "this", "time", "trademark", "united", "various", "viagra", "view", "when", "wish", "with", "your"})) }) + + It("Wordlist 7", func() { + m := s.Mail{ + Key: "1488230510.M141612P8565.mail.carlostrub.ch,S=5978,W=6119", + } + + err := m.Load("test/Maildir") + Ω(err).ShouldNot(HaveOccurred()) + + err = m.Clean() + Ω(err).ShouldNot(HaveOccurred()) + + list, err := m.Wordlist() + Ω(err).ShouldNot(HaveOccurred()) + sort.Strings(list) + + Ω(list).Should(Equal( + []string{"amending", "both", "build", "builds", "clang", "convert", "danfe", "depends", "drop", "explicit", "fine", "install", "instead", "library", "localbase", "manually", "port", "powerpc", "prefer", "rather", "shared", "static", "than", "their", "uses", "utilize", "with", "xorg"})) + }) }) })