You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
sisyphus/bayesian.go

101 lines
2.6 KiB
Go

/*
Part of this code is borrowed from github.com/jbrukh/bayesian, published under a BSD 3-Clause license.
*/
package sisyphus
import (
"math"
"strconv"
"github.com/boltdb/bolt"
)
// classificationPriors computes the prior probabilities of the good (g) and
// junk (j) classes from the number of keys stored in the "Good" and "Junk"
// sub-buckets of the "Wordlists" bucket: each prior is that bucket's key
// count divided by the combined key count of both buckets.
//
// When both buckets are empty the division is 0/0 and both results are NaN.
// The buckets are dereferenced without a nil check, and the error returned
// by db.View is discarded.
func classificationPriors(db *bolt.DB) (g, j float64) {
	readPriors := func(tx *bolt.Tx) error {
		wordlists := tx.Bucket([]byte("Wordlists"))

		goodKeys := float64(wordlists.Bucket([]byte("Good")).Stats().KeyN)
		junkKeys := float64(wordlists.Bucket([]byte("Junk")).Stats().KeyN)

		total := goodKeys + junkKeys
		g, j = goodKeys/total, junkKeys/total
		return nil
	}

	_ = db.View(readPriors)
	return g, j
}
// classificationWordProb returns P(W|C_j) -- the probability of seeing
// a particular word W in a document of this class.
//
// g is the value stored for word in Wordlists/Good divided by the number of
// keys in Processed/Mails minus the Processed/Counters "Junk" value; j is the
// value stored for word in Wordlists/Junk divided by that same "Junk" counter
// (presumably the number of mails learned as junk -- confirm against the
// code that writes the counter).
//
// A word or counter that is absent or not parseable as a float is treated as
// 0. Buckets are dereferenced without a nil check, and the error returned by
// db.View is discarded because the signature has no error result.
func classificationWordProb(db *bolt.DB, word string) (g, j float64) {
	_ = db.View(func(tx *bolt.Tx) error {
		wordlists := tx.Bucket([]byte("Wordlists"))
		key := []byte(word)

		// Get returns nil for a missing key; ParseFloat then fails on the
		// empty string and yields 0, which is the count we want for a word
		// that was never seen in that class.
		good := wordlists.Bucket([]byte("Good"))
		goodCount, _ := strconv.ParseFloat(string(good.Get(key)), 64)

		junk := wordlists.Bucket([]byte("Junk"))
		junkCount, _ := strconv.ParseFloat(string(junk.Get(key)), 64)

		processed := tx.Bucket([]byte("Processed"))
		counters := processed.Bucket([]byte("Counters"))

		// Deliberately NOT named j: declaring `j` with := inside this closure
		// would shadow the named result, and the junk probability computed
		// below would be thrown away (the function would always return j == 0).
		junkMails, _ := strconv.ParseFloat(string(counters.Get([]byte("Junk"))), 64)

		mails := processed.Bucket([]byte("Mails"))
		totalMails := float64(mails.Stats().KeyN)

		g = goodCount / (totalMails - junkMails)
		j = junkCount / junkMails
		return nil
	})

	return g, j
}
// LogScores produces "log-likelihood"-like scores for the good and junk
// classes, given the words of a document.
//
// Each score starts as the natural log of the class prior returned by
// classificationPriors and then has the log of the per-word probability
// returned by classificationWordProb added to it for every entry in
// wordlist. Summing logs instead of multiplying probabilities avoids
// floating point underflow; the scores are negative whenever the
// underlying probabilities are below 1.
//
// scoreG and scoreJ are the accumulated scores for the good and the junk
// class. junk reports whether scoreJ is greater than or equal to scoreG,
// so a tie is classified as junk. A probability of 0 contributes -Inf to
// the corresponding score.
func LogScores(db *bolt.DB, wordlist []string) (scoreG, scoreJ float64, junk bool) {
	// Seed both scores with the log of the class priors.
	goodPrior, junkPrior := classificationPriors(db)
	scoreG, scoreJ = math.Log(goodPrior), math.Log(junkPrior)

	// Add the log-probability of every word under each class.
	for i := range wordlist {
		goodProb, junkProb := classificationWordProb(db, wordlist[i])
		scoreG += math.Log(goodProb)
		scoreJ += math.Log(junkProb)
	}

	// Junk wins ties; a NaN on either side compares false and yields good.
	junk = scoreJ >= scoreG
	return scoreG, scoreJ, junk
}