more tests and better sanitization

master
Carlo Strub 7 years ago
parent 5d367a0455
commit d0c7d79203

13
glide.lock generated

@ -1,10 +1,17 @@
hash: 18c4dbfd645c08604c86e8f380c30401998faf75714ecc3d891f272f992780b2
updated: 2017-02-28T21:30:19.789294511Z
hash: ec8efbbfd183cdfa97087fa1ff3e6b3a1b7eb77eb129d66473ffbc8c97db2238
updated: 2017-03-02T19:09:31.474686494Z
imports:
- name: github.com/jbrukh/bayesian
version: bf3f261f9a9c61145c60d47665b0518cc32c774f
- name: github.com/kennygrant/sanitize
version: 6a0bfdde8629a3a3a7418a7eae45c54154692514
- name: github.com/luksen/maildir
version: 5297d9c3091c7d4891c9d4f6fa743d500c038d6f
- name: golang.org/x/net
version: 906cda9512f77671ab44f8c8563b13a8e707b230
subpackages:
- html
- html/atom
testImports:
- name: github.com/onsi/ginkgo
version: bb93381d543b0e5725244abe752214a110791d01
@ -40,7 +47,7 @@ testImports:
- matchers/support/goraph/util
- types
- name: golang.org/x/sys
version: 21f2569f6feb83b68a25c98c1b20eca5d4e1e6ae
version: 76cc09b634294339fa19ec41b5f2a0b3932cea8b
subpackages:
- unix
- name: gopkg.in/yaml.v2

@ -2,6 +2,7 @@ package: github.com/carlostrub/sisyphus
import:
- package: github.com/jbrukh/bayesian
- package: github.com/luksen/maildir
- package: github.com/kennygrant/sanitize
testImport:
- package: github.com/onsi/ginkgo
- package: github.com/onsi/gomega

@ -11,6 +11,7 @@ import (
"strings"
"github.com/jbrukh/bayesian"
"github.com/kennygrant/sanitize"
"github.com/luksen/maildir"
)
@ -70,8 +71,19 @@ func (m *Mail) Learn() (c Classifiers, err error) {
return
}
// trimStringFromBase64 cuts s off at the first occurrence of the
// "Content-Transfer-Encoding: base64" header and returns what comes before it.
//
// The single byte immediately preceding the header is dropped as well
// (presumably the line break that separates it from the previous line —
// confirm against the mail fixtures). If the header does not occur, s is
// returned unchanged. If the header is at the very start of s there is nothing
// to keep, so the empty string is returned; slicing s[:idx-1] unguarded in
// that case would panic with an out-of-range bound.
func trimStringFromBase64(s string) string {
	idx := strings.Index(s, "Content-Transfer-Encoding: base64")
	switch {
	case idx < 0:
		// No base64 section: nothing to trim.
		return s
	case idx == 0:
		// Header is the first thing in s; idx-1 would be a negative bound.
		return ""
	default:
		return s[:idx-1]
	}
}
func cleanString(i string) (s string, err error) {
s = strings.ToLower(i)
s = trimStringFromBase64(i)
s = sanitize.Accents(s)
s = sanitize.HTML(s)
s = strings.ToLower(s)
s = strings.Replace(s, "!", " ", -1)
s = strings.Replace(s, "#", " ", -1)
s = strings.Replace(s, "$", " ", -1)
@ -103,6 +115,20 @@ func cleanString(i string) (s string, err error) {
s = strings.Replace(s, "|", " ", -1)
s = strings.Replace(s, "}", " ", -1)
s = strings.Replace(s, "this is a multi part message in mime format", " ", -1)
s = strings.Replace(s, "nextpart", " ", -1)
s = strings.Replace(s, "content type", " ", -1)
s = strings.Replace(s, "text plain", " ", -1)
s = strings.Replace(s, "charset", " ", -1)
s = strings.Replace(s, "content transfer encoding", " ", -1)
s = strings.Replace(s, "quoted printable", " ", -1)
s = strings.Replace(s, "text html", " ", -1)
s = strings.Replace(s, "cp 850", " ", -1)
for i := 0; i < 10; i++ {
s = strings.Replace(s, " ", " ", -1)
}
return s, nil
}

File diff suppressed because one or more lines are too long
Loading…
Cancel
Save