add classify test and fix other tests

master
Carlo Strub 7 years ago
parent a569e3e624
commit f76bfbd6eb

@ -42,13 +42,13 @@ func classificationPrior(db *bolt.DB) (g float64, err error) {
func classificationLikelihood(db *bolt.DB, word string) (g, j float64, err error) {
err = db.View(func(tx *bolt.Tx) error {
var gN, jN uint64
var gN, jN, gTotal, jTotal uint64
b := tx.Bucket([]byte("Wordlists"))
good := b.Bucket([]byte("Good"))
gWordRaw := good.Get([]byte(word))
if len(gWordRaw) != 0 {
if len(gWordRaw) > 0 {
gWordHLL, err := hllpp.Unmarshal(gWordRaw)
if err != nil {
return err
@ -57,7 +57,7 @@ func classificationLikelihood(db *bolt.DB, word string) (g, j float64, err error
}
junk := b.Bucket([]byte("Junk"))
jWordRaw := junk.Get([]byte(word))
if len(jWordRaw) != 0 {
if len(jWordRaw) > 0 {
jWordHLL, err := hllpp.Unmarshal(jWordRaw)
if err != nil {
return err
@ -66,22 +66,28 @@ func classificationLikelihood(db *bolt.DB, word string) (g, j float64, err error
}
p := tx.Bucket([]byte("Statistics"))
gHLL, err := hllpp.Unmarshal(p.Get([]byte("ProcessedGood")))
if err != nil {
return err
gRaw := p.Get([]byte("ProcessedGood"))
if len(gRaw) > 0 {
gHLL, err := hllpp.Unmarshal(gRaw)
if err != nil {
return err
}
gTotal = gHLL.Count()
}
jHLL, err := hllpp.Unmarshal(p.Get([]byte("ProcessedJunk")))
if err != nil {
return err
jRaw := p.Get([]byte("ProcessedJunk"))
if len(jRaw) > 0 {
jHLL, err := hllpp.Unmarshal(jRaw)
if err != nil {
return err
}
jTotal = jHLL.Count()
}
gTotal := gHLL.Count()
if gTotal == 0 {
return errors.New("no good mails have been classified so far")
return errors.New("no good mails have yet been classified")
}
jTotal := jHLL.Count()
if jTotal == 0 {
return errors.New("no junk mails have been classified so far")
return errors.New("no junk mails have yet been classified")
}
g = float64(gN) / float64(gTotal)
@ -128,7 +134,7 @@ func (m *Mail) Classify(db *bolt.DB) error {
return err
}
junk, err := Junk(db, list)
junk, _, err := Junk(db, list)
if err != nil {
return err
}
@ -149,21 +155,23 @@ func (m *Mail) Classify(db *bolt.DB) error {
}
// Junk returns true if the wordlist is classified as a junk mail using Bayes'
// rule.
func Junk(db *bolt.DB, wordlist []string) (bool, error) {
// rule. If required, it also returns the calculated probability of being junk,
// but this is typically not needed.
func Junk(db *bolt.DB, wordlist []string) (junk bool, prob float64, err error) {
var probabilities []float64
for _, val := range wordlist {
p, err := classificationWord(db, val)
if err != nil {
return false, err
return false, prob, err
}
probabilities = append(probabilities, p)
}
if stat.HarmonicMean(probabilities, nil) < 0.5 {
return true, nil
prob = stat.HarmonicMean(probabilities, nil)
if prob < 0.5 {
return true, (1 - prob), nil
}
return false, nil
return false, (1 - prob), nil
}

@ -0,0 +1,96 @@
package sisyphus_test
import (
"math"
"os"
. "github.com/carlostrub/sisyphus"
. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
)
// Classify spec: exercises Junk against a freshly created database that has
// learned exactly one junk mail and one good mail from test/Maildir.
//
// m, dbs and err are package-level variables declared in another test file of
// this package.
var _ = Describe("Classify", func() {
	Context("Classify a new mail", func() {
		BeforeEach(func() {
			// The database file must not exist yet, so every spec starts
			// from an empty database.
			_, oserr := os.Stat("test/Maildir/sisyphus.db")
			Ω(os.IsNotExist(oserr)).Should(BeTrue())

			// Load db
			dbs, err = LoadDatabases([]Maildir{"test/Maildir"})
			Ω(err).ShouldNot(HaveOccurred())

			// Load junk mail and learn it.
			m = &Mail{
				Key:  "1488226337.M327833P8269.mail.carlostrub.ch,S=6960,W=7161:2,Sa",
				Junk: true,
			}
			err = m.Load("test/Maildir")
			Ω(err).ShouldNot(HaveOccurred())
			// NOTE(review): any result of Learn is discarded here; if Learn
			// returns an error it should be asserted — confirm its signature.
			m.Learn(dbs["test/Maildir"])

			// Load good mail (Junk left at its zero value) and learn it.
			m = &Mail{
				Key: "1488230510.M141612P8565.mail.carlostrub.ch,S=5978,W=6119",
			}
			err = m.Load("test/Maildir")
			Ω(err).ShouldNot(HaveOccurred())
			m.Learn(dbs["test/Maildir"])
		})

		AfterEach(func() {
			// Cleanup: close the databases and delete the file so the
			// existence check in BeforeEach holds for the next spec.
			CloseDatabases(dbs)
			err = os.Remove("test/Maildir/sisyphus.db")
			Ω(err).ShouldNot(HaveOccurred())
		})

		// Presumably "london" occurs only in the learned junk mail — verify
		// against the fixture.
		It("Classify one word that was learned before as junk", func() {
			answer, prob, err := Junk(dbs["test/Maildir"], []string{"london"})
			Ω(err).ShouldNot(HaveOccurred())
			Ω(prob).Should(Equal(1.0))
			Ω(answer).Should(BeTrue())
		})

		// "localbase" is expected to classify as good with probability 0.
		It("Classify one word that was learned before as good", func() {
			answer, prob, err := Junk(dbs["test/Maildir"], []string{"localbase"})
			Ω(err).ShouldNot(HaveOccurred())
			Ω(prob).Should(Equal(0.0))
			Ω(answer).Should(BeFalse())
		})

		// An unknown word is expected to yield NaN as probability and a
		// non-junk verdict.
		It("Classify one word from the mail that was never learned", func() {
			answer, prob, err := Junk(dbs["test/Maildir"], []string{"abcdefg"})
			Ω(err).ShouldNot(HaveOccurred())
			Ω(math.IsNaN(prob)).Should(BeTrue())
			Ω(answer).Should(BeFalse())
		})

		It("Classify one word from the mail that was learned in good and junk", func() {
			// m still points at the good mail assigned last in BeforeEach;
			// it is learned a second time before classifying.
			m.Learn(dbs["test/Maildir"])
			answer, prob, err := Junk(dbs["test/Maildir"], []string{"than"})
			Ω(err).ShouldNot(HaveOccurred())
			// Compare with a tolerance: the value is a computed float and
			// bit-exact equality is not guaranteed on every architecture.
			Ω(prob).Should(BeNumerically("~", 0.7795275590551181, 1e-12))
			Ω(answer).Should(BeTrue())
		})
	})
})

@ -11,9 +11,11 @@ import (
. "github.com/onsi/gomega"
)
var m *Mail
var dbs map[Maildir]*bolt.DB
var err error
var (
m *Mail
dbs map[Maildir]*bolt.DB
err error
)
var _ = Describe("Learn", func() {
Context("Learn a new mail", func() {

@ -234,10 +234,8 @@ var _ = Describe("Mail", func() {
It("More Junk", func() {
m := s.Mail{
Key: "1488226337.M327833P8269.mail.carlostrub.ch,S=6960,W=7161:2,Sa",
Subject: nil,
Body: nil,
Junk: true,
Key: "1488226337.M327833P8269.mail.carlostrub.ch,S=6960,W=7161:2,Sa",
Junk: true,
}
err := m.Load("test/Maildir")
@ -259,10 +257,8 @@ var _ = Describe("Mail", func() {
It("More Junk", func() {
m := s.Mail{
Key: "1488228352.M339670P8269.mail.carlostrub.ch,S=12659,W=12782:2,Sa",
Subject: nil,
Body: nil,
Junk: true,
Key: "1488228352.M339670P8269.mail.carlostrub.ch,S=12659,W=12782:2,Sa",
Junk: true,
}
err := m.Load("test/Maildir")
@ -284,10 +280,8 @@ var _ = Describe("Mail", func() {
It("Wordlist 1", func() {
m := s.Mail{
Key: "1488181583.M633084P4781.mail.carlostrub.ch,S=708375,W=720014:2,a",
Subject: nil,
Body: nil,
Junk: true,
Key: "1488181583.M633084P4781.mail.carlostrub.ch,S=708375,W=720014:2,a",
Junk: true,
}
err := m.Load("test/Maildir")
@ -306,10 +300,8 @@ var _ = Describe("Mail", func() {
It("Wordlist 2", func() {
m := s.Mail{
Key: "1488226337.M327822P8269.mail.carlostrub.ch,S=3620,W=3730:2,Sa",
Subject: nil,
Body: nil,
Junk: true,
Key: "1488226337.M327822P8269.mail.carlostrub.ch,S=3620,W=3730:2,Sa",
Junk: true,
}
err := m.Load("test/Maildir")
@ -328,10 +320,8 @@ var _ = Describe("Mail", func() {
It("Wordlist 3", func() {
m := s.Mail{
Key: "1488226337.M327824P8269.mail.carlostrub.ch,S=8044,W=8167:2,Sa",
Subject: nil,
Body: nil,
Junk: true,
Key: "1488226337.M327824P8269.mail.carlostrub.ch,S=8044,W=8167:2,Sa",
Junk: true,
}
err := m.Load("test/Maildir")
@ -350,10 +340,8 @@ var _ = Describe("Mail", func() {
It("Wordlist 4", func() {
m := s.Mail{
Key: "1488226337.M327825P8269.mail.carlostrub.ch,S=802286,W=812785:2,Sa",
Subject: nil,
Body: nil,
Junk: true,
Key: "1488226337.M327825P8269.mail.carlostrub.ch,S=802286,W=812785:2,Sa",
Junk: true,
}
err := m.Load("test/Maildir")
@ -372,10 +360,8 @@ var _ = Describe("Mail", func() {
It("Wordlist 5", func() {
m := s.Mail{
Key: "1488226337.M327833P8269.mail.carlostrub.ch,S=6960,W=7161:2,Sa",
Subject: nil,
Body: nil,
Junk: true,
Key: "1488226337.M327833P8269.mail.carlostrub.ch,S=6960,W=7161:2,Sa",
Junk: true,
}
err := m.Load("test/Maildir")
@ -394,10 +380,8 @@ var _ = Describe("Mail", func() {
It("Wordlist 6", func() {
m := s.Mail{
Key: "1488228352.M339670P8269.mail.carlostrub.ch,S=12659,W=12782:2,Sa",
Subject: nil,
Body: nil,
Junk: true,
Key: "1488228352.M339670P8269.mail.carlostrub.ch,S=12659,W=12782:2,Sa",
Junk: true,
}
err := m.Load("test/Maildir")
@ -413,5 +397,24 @@ var _ = Describe("Mail", func() {
Ω(list).Should(Equal(
[]string{"always", "amazon", "antiviral", "blockquote", "blood", "body", "canada", "check", "click", "deals", "delivery", "diabetes", "discount", "email", "emails", "europe", "following", "font", "form", "good", "herpes", "hola", "keep", "leading", "limited", "link", "longer", "medication", "message", "most", "north", "offer", "online", "other", "please", "popular", "presents", "pressure", "produced", "products", "read", "receive", "registered", "reserved", "rights", "service", "services", "simply", "span", "special", "states", "store", "subsidiary", "super", "table", "terry", "these", "this", "time", "trademark", "united", "various", "viagra", "view", "when", "wish", "with", "your"}))
})
It("Wordlist 7", func() {
	// This mail carries no Junk flag; the spec only checks the extracted,
	// sorted word list after loading and cleaning.
	mail := s.Mail{
		Key: "1488230510.M141612P8565.mail.carlostrub.ch,S=5978,W=6119",
	}

	Ω(mail.Load("test/Maildir")).ShouldNot(HaveOccurred())
	Ω(mail.Clean()).ShouldNot(HaveOccurred())

	words, err := mail.Wordlist()
	Ω(err).ShouldNot(HaveOccurred())

	// Sort so the comparison does not depend on the order Wordlist returns.
	sort.Strings(words)
	expected := []string{"amending", "both", "build", "builds", "clang", "convert", "danfe", "depends", "drop", "explicit", "fine", "install", "instead", "library", "localbase", "manually", "port", "powerpc", "prefer", "rather", "shared", "static", "than", "their", "uses", "utilize", "with", "xorg"}
	Ω(words).Should(Equal(expected))
})
})
})

Loading…
Cancel
Save