mirror of https://github.com/carlostrub/sisyphus
many new things...
parent
237fe45ced
commit
0497999a15
@ -1,100 +1,130 @@
|
|||||||
/*
|
|
||||||
Part of this code is borrowed from github.com/jbrukh/bayesian published under a BSD3CLAUSE License
|
|
||||||
*/
|
|
||||||
|
|
||||||
package sisyphus
|
package sisyphus
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"math"
|
"errors"
|
||||||
"strconv"
|
|
||||||
|
|
||||||
"github.com/boltdb/bolt"
|
"github.com/boltdb/bolt"
|
||||||
|
"github.com/gonum/stat"
|
||||||
|
"github.com/retailnext/hllpp"
|
||||||
)
|
)
|
||||||
|
|
||||||
// classificationPriors returns the prior probabilities for good and junk
|
// classificationPrior returns the prior probabilities for good and junk
|
||||||
// classes.
|
// classes.
|
||||||
func classificationPriors(db *bolt.DB) (g, j float64) {
|
func classificationPrior(db *bolt.DB) (g float64, err error) {
|
||||||
|
|
||||||
db.View(func(tx *bolt.Tx) error {
|
err = db.View(func(tx *bolt.Tx) error {
|
||||||
b := tx.Bucket([]byte("Wordlists"))
|
b := tx.Bucket([]byte("Wordlists"))
|
||||||
|
|
||||||
good := b.Bucket([]byte("Good"))
|
good := b.Bucket([]byte("Good"))
|
||||||
gN := float64(good.Stats().KeyN)
|
gN := float64(good.Stats().KeyN)
|
||||||
|
|
||||||
junk := b.Bucket([]byte("Junk"))
|
junk := b.Bucket([]byte("Junk"))
|
||||||
jN := float64(junk.Stats().KeyN)
|
jN := float64(junk.Stats().KeyN)
|
||||||
|
|
||||||
|
// division by zero means there are no learned mails so far
|
||||||
|
if (gN + jN) == 0 {
|
||||||
|
return errors.New("no mails have been classified so far")
|
||||||
|
}
|
||||||
|
|
||||||
g = gN / (gN + jN)
|
g = gN / (gN + jN)
|
||||||
j = jN / (gN + jN)
|
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
})
|
})
|
||||||
|
|
||||||
return
|
return g, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// classificationWordProb returns P(W|C_j) -- the probability of seeing
|
// classificationLikelihood returns P(W|C_j) -- the probability of seeing a
|
||||||
// a particular word W in a document of this class.
|
// particular word W in a document of this class.
|
||||||
func classificationWordProb(db *bolt.DB, word string) (g, j float64) {
|
func classificationLikelihood(db *bolt.DB, word string) (g, j float64, err error) {
|
||||||
|
|
||||||
|
err = db.View(func(tx *bolt.Tx) error {
|
||||||
|
var gN, jN uint64
|
||||||
|
|
||||||
db.View(func(tx *bolt.Tx) error {
|
|
||||||
b := tx.Bucket([]byte("Wordlists"))
|
b := tx.Bucket([]byte("Wordlists"))
|
||||||
|
|
||||||
good := b.Bucket([]byte("Good"))
|
good := b.Bucket([]byte("Good"))
|
||||||
gNString := string(good.Get([]byte(word)))
|
gWordRaw := good.Get([]byte(word))
|
||||||
gN, _ := strconv.ParseFloat(gNString, 64)
|
if len(gWordRaw) != 0 {
|
||||||
|
gWordHLL, err := hllpp.Unmarshal(gWordRaw)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
gN = gWordHLL.Count()
|
||||||
|
}
|
||||||
junk := b.Bucket([]byte("Junk"))
|
junk := b.Bucket([]byte("Junk"))
|
||||||
jNString := string(junk.Get([]byte(word)))
|
jWordRaw := junk.Get([]byte(word))
|
||||||
jN, _ := strconv.ParseFloat(jNString, 64)
|
if len(jWordRaw) != 0 {
|
||||||
|
jWordHLL, err := hllpp.Unmarshal(jWordRaw)
|
||||||
p := tx.Bucket([]byte("Processed"))
|
if err != nil {
|
||||||
counters := p.Bucket([]byte("Counters"))
|
return err
|
||||||
jString := string(counters.Get([]byte("Junk")))
|
}
|
||||||
j, _ = strconv.ParseFloat(jString, 64)
|
jN = jWordHLL.Count()
|
||||||
mails := p.Bucket([]byte("Mails"))
|
}
|
||||||
pN := mails.Stats().KeyN
|
|
||||||
|
p := tx.Bucket([]byte("Statistics"))
|
||||||
g = gN / (float64(pN) - j)
|
gHLL, err := hllpp.Unmarshal(p.Get([]byte("ProcessedGood")))
|
||||||
j = jN / j
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
jHLL, err := hllpp.Unmarshal(p.Get([]byte("ProcessedJunk")))
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
gTotal := gHLL.Count()
|
||||||
|
if gTotal == 0 {
|
||||||
|
return errors.New("no good mails have been classified so far")
|
||||||
|
}
|
||||||
|
jTotal := jHLL.Count()
|
||||||
|
if jTotal == 0 {
|
||||||
|
return errors.New("no junk mails have been classified so far")
|
||||||
|
}
|
||||||
|
|
||||||
|
g = float64(gN) / float64(gTotal)
|
||||||
|
j = float64(jN) / float64(jTotal)
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
})
|
})
|
||||||
|
|
||||||
return g, j
|
return g, j, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// classificationWord produces the conditional probability of a word belonging
|
||||||
|
// to good or junk using the classic Bayes' rule.
|
||||||
|
func classificationWord(db *bolt.DB, word string) (g float64, err error) {
|
||||||
|
|
||||||
|
priorG, err := classificationPrior(db)
|
||||||
|
if err != nil {
|
||||||
|
return g, err
|
||||||
|
}
|
||||||
|
|
||||||
|
likelihoodG, likelihoodJ, err := classificationLikelihood(db, word)
|
||||||
|
if err != nil {
|
||||||
|
return g, err
|
||||||
|
}
|
||||||
|
|
||||||
|
g = (likelihoodG * priorG) / (likelihoodG*priorG + likelihoodJ*(1-priorG))
|
||||||
|
|
||||||
|
return g, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// LogScores produces "log-likelihood"-like scores that can
|
// Junk returns true if the wordlist is classified as a junk mail using Bayes'
|
||||||
// be used to classify documents into classes.
|
// rule.
|
||||||
//
|
func Junk(db *bolt.DB, wordlist []string) (bool, error) {
|
||||||
// The value of the score is proportional to the likelihood,
|
var probabilities []float64
|
||||||
// as determined by the classifier, that the given document
|
|
||||||
// belongs to the given class. This is true even when scores
|
for _, val := range wordlist {
|
||||||
// returned are negative, which they will be (since we are
|
p, err := classificationWord(db, val)
|
||||||
// taking logs of probabilities).
|
if err != nil {
|
||||||
//
|
return false, err
|
||||||
// The index j of the score corresponds to the class given
|
}
|
||||||
// by c.Classes[j].
|
probabilities = append(probabilities, p)
|
||||||
//
|
|
||||||
// Additionally returned are "inx" and "strict" values. The
|
|
||||||
// inx corresponds to the maximum score in the array. If more
|
|
||||||
// than one of the scores holds the maximum values, then
|
|
||||||
// strict is false.
|
|
||||||
//
|
|
||||||
// Unlike c.Probabilities(), this function is not prone to
|
|
||||||
// floating point underflow and is relatively safe to use.
|
|
||||||
func LogScores(db *bolt.DB, wordlist []string) (scoreG, scoreJ float64, junk bool) {
|
|
||||||
|
|
||||||
priorG, priorJ := classificationPriors(db)
|
|
||||||
|
|
||||||
// calculate the scores
|
|
||||||
scoreG = math.Log(priorG)
|
|
||||||
scoreJ = math.Log(priorJ)
|
|
||||||
for _, word := range wordlist {
|
|
||||||
gP, jP := classificationWordProb(db, word)
|
|
||||||
scoreG += math.Log(gP)
|
|
||||||
scoreJ += math.Log(jP)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if scoreJ == math.Max(scoreG, scoreJ) {
|
if stat.HarmonicMean(probabilities, nil) < 0.5 {
|
||||||
junk = true
|
return true, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
return scoreG, scoreJ, junk
|
return false, nil
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,21 @@
|
|||||||
|
[
|
||||||
|
{"linter":"gas","severity":"warning","path":"bayesian.go","line":18,"col":0,"message":"Errors unhandled.,LOW,HIGH"},
|
||||||
|
{"linter":"gas","severity":"warning","path":"bayesian.go","line":38,"col":0,"message":"Errors unhandled.,LOW,HIGH"},
|
||||||
|
{"linter":"gas","severity":"warning","path":"bayesian.go","line":42,"col":0,"message":"Errors unhandled.,LOW,HIGH"},
|
||||||
|
{"linter":"gas","severity":"warning","path":"bayesian.go","line":45,"col":0,"message":"Errors unhandled.,LOW,HIGH"},
|
||||||
|
{"linter":"gas","severity":"warning","path":"bayesian.go","line":50,"col":0,"message":"Errors unhandled.,LOW,HIGH"},
|
||||||
|
{"linter":"gas","severity":"warning","path":"daemon.go","line":45,"col":0,"message":"Subprocess launching with variable.,HIGH,HIGH"},
|
||||||
|
{"linter":"gas","severity":"warning","path":"daemon.go","line":115,"col":0,"message":"Subprocess launching with variable.,HIGH,HIGH"},
|
||||||
|
{"linter":"gas","severity":"warning","path":"daemon.go","line":122,"col":0,"message":"Subprocess launching with variable.,HIGH,HIGH"},
|
||||||
|
{"linter":"gocyclo","severity":"warning","path":"mail.go","line":232,"col":0,"message":"cyclomatic complexity 16 of function (*Mail).Classify() is high (\u003e 10)"},
|
||||||
|
{"linter":"dupl","severity":"warning","path":"mail_test.go","line":135,"col":0,"message":"duplicate of mail_test.go:160-183"},
|
||||||
|
{"linter":"dupl","severity":"warning","path":"mail_test.go","line":160,"col":0,"message":"duplicate of mail_test.go:185-208"},
|
||||||
|
{"linter":"dupl","severity":"warning","path":"mail_test.go","line":185,"col":0,"message":"duplicate of mail_test.go:210-233"},
|
||||||
|
{"linter":"dupl","severity":"warning","path":"mail_test.go","line":210,"col":0,"message":"duplicate of mail_test.go:235-258"},
|
||||||
|
{"linter":"dupl","severity":"warning","path":"mail_test.go","line":235,"col":0,"message":"duplicate of mail_test.go:260-283"},
|
||||||
|
{"linter":"dupl","severity":"warning","path":"mail_test.go","line":260,"col":0,"message":"duplicate of mail_test.go:135-158"},
|
||||||
|
{"linter":"errcheck","severity":"warning","path":"bayesian.go","line":18,"col":9,"message":"error return value not checked (db.View(func(tx *bolt.Tx) error {)"},
|
||||||
|
{"linter":"errcheck","severity":"warning","path":"bayesian.go","line":38,"col":9,"message":"error return value not checked (db.View(func(tx *bolt.Tx) error {)"},
|
||||||
|
{"linter":"errcheck","severity":"warning","path":"daemon.go","line":26,"col":18,"message":"error return value not checked (defer file.Close())"},
|
||||||
|
{"linter":"errcheck","severity":"warning","path":"mail.go","line":275,"col":11,"message":"error return value not checked (db.Update(func(tx *bolt.Tx) error {)"}
|
||||||
|
]
|
Loading…
Reference in New Issue