more cleaning

master
Carlo Strub 7 years ago
parent 4216909c6a
commit 007a70edb4

@ -70,8 +70,59 @@ func (m *Mail) Learn() (c Classifiers, err error) {
return
}
// cleanString normalizes a piece of mail text: it lower-cases i and then
// replaces each of the following characters with a single space:
//
//	! # $ % & ( ) * + , - . / : ; < = > @ [ \ ] ^ _ { | } " newline tab
//
// Every other character (letters, digits, spaces, apostrophes, question
// marks, ...) is passed through unchanged, so the result has the same number
// of characters as the lower-cased input.
//
// The returned error is always nil; it is kept in the signature so existing
// callers that check it keep compiling.
func cleanString(i string) (s string, err error) {
	// One strings.Map pass replaces the former chain of thirty
	// strings.Replace calls, each of which rescanned and copied the string.
	s = strings.Map(func(r rune) rune {
		switch r {
		case '!', '#', '$', '%', '&', '(', ')', '*', '+', ',', '-', '.', '/',
			':', ';', '<', '=', '>', '@', '[', '\\', ']', '^', '_', '{', '|',
			'}', '"', '\n', '\t':
			return ' '
		}
		return r
	}, strings.ToLower(i))
	return s, nil
}
// Clean prepares the mail's subject and body for training.
//
// Each of Subject and Body that is non-nil is passed through cleanString and
// the field is re-pointed at the cleaned copy; nil fields are left alone.
// The first error reported by cleanString is returned as-is.
func (m *Mail) Clean() error {
	for _, field := range []**string{&m.Subject, &m.Body} {
		if *field == nil {
			continue
		}
		cleaned, err := cleanString(**field)
		if err != nil {
			return err
		}
		// cleaned is declared inside the loop body, so every field gets
		// its own backing variable.
		*field = &cleaned
	}
	return nil
}
@ -96,8 +147,7 @@ func (m *Mail) Load(d string) error {
bScanner := bufio.NewScanner(bQ)
for bScanner.Scan() {
raw := bScanner.Text()
clean := strings.Replace(raw, "\\", "hallo", -1)
b = append(b, clean)
b = append(b, raw)
}
body := strings.Join(b, " ")

@ -60,6 +60,10 @@ var _ = Describe("Main", func() {
},
}))
})
It("Fail if Maildir does not exist", func() {
	// Only the error matters here; the other value returned by Index is
	// discarded.
	_, indexErr := Index("test/DOESNOTEXIST")
	Ω(indexErr).Should(HaveOccurred())
})
})
Context("Mail", func() {
@ -84,5 +88,53 @@ var _ = Describe("Main", func() {
Junk: true,
}))
})
It("Fail if Subject has content already", func() {
	// Load must report an error when the Mail's Subject is already set.
	// Body is left at its nil zero value.
	preset := "test"
	mail := Mail{
		Key:     "1488226337.M327822P8269.mail.carlostrub.ch,S=3620,W=3730",
		Subject: &preset,
		Junk:    true,
	}
	Ω(mail.Load("test/Maildir" + "/.Junk")).Should(HaveOccurred())
})
It("Fail if Body has content already", func() {
	// Load must report an error when the Mail's Body is already set.
	// Subject is left at its nil zero value.
	preset := "test"
	mail := Mail{
		Key:  "1488226337.M327822P8269.mail.carlostrub.ch,S=3620,W=3730",
		Body: &preset,
		Junk: true,
	}
	Ω(mail.Load("test/Maildir" + "/.Junk")).Should(HaveOccurred())
})
It("Clean mail content", func() {
// Fixture: a short subject and a multi-part (text/plain + text/html) mail
// body, both handed to Clean through pointers.
subject := "Hello"
body := "This is a multi-part message in MIME format. ------=_NextPart_000_0032_01D2912F.05324BC6 Content-Type: text/plain; \tcharset=\"cp-850\" Content-Transfer-Encoding: quoted-printable Dear cs, We are looking for employees working remotely. My name is Kari, I am the personnel manager of a large International company. Most of the work you can do from home, that is, at a distance. Salary is $2000-$5300. If you are interested in this offer, please visit Our Site Best regards! ------=_NextPart_000_0032_01D2912F.05324BC6 Content-Type: text/html; \tcharset=\"cp-850\" Content-Transfer-Encoding: quoted-printable <html xmlns:v=\"urn:schemas-microsoft-com:vml\" xmlns:o=\"urn:schemas-microsoft-com:office:office\" xmlns:w=\"urn:schemas-microsoft-com:office:word\" xmlns:m=\"http://schemas.microsoft.com/office/2004/12/omml\" xmlns=\"http://www.w3.org/TR/REC-html40\"><head><META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html; charset=us-ascii\"><meta name=Generator content=\"Microsoft Word 14 (filtered medium)\"><style><!-- /* Font Definitions */ @font-face \t{font-family:Calibri; \tpanose-1:2 15 5 2 2 2 4 3 2 4;} /* Style Definitions */ p.MsoNormal, li.MsoNormal, div.MsoNormal \t{margin:0in; \tmargin-bottom:.0001pt; \tfont-size:11.0pt; \tfont-family:\"Calibri\",\"sans-serif\";} a:link, span.MsoHyperlink \t{mso-style-priority:99; \tcolor:blue; \ttext-decoration:underline;} a:visited, span.MsoHyperlinkFollowed \t{mso-style-priority:99; \tcolor:purple; \ttext-decoration:underline;} span.EmailStyle17 \t{mso-style-type:personal-compose; \tfont-family:\"Calibri\",\"sans-serif\"; \tcolor:windowtext;} .MsoChpDefault \t{mso-style-type:export-only; \tfont-family:\"Calibri\",\"sans-serif\";} @page WordSection1 \t{size:8.5in 11.0in; \tmargin:1.0in 1.0in 1.0in 1.0in;} div.WordSection1 \t{page:WordSection1;} --></style><!--[if gte mso 9]><xml> <o:shapedefaults v:ext=\"edit\" spidmax=\"1026\" /> </xml><![endif]--><!--[if gte mso 9]><xml> <o:shapelayout v:ext=\"edit\"> <o:idmap 
v:ext=\"edit\" data=\"1\" /> </o:shapelayout></xml><![endif]--></head><body lang=EN-US link=blue vlink=purple><div class=WordSection1><p class=MsoNormal>Dear cs,<br> <br> We are looking for employees working remotely.<br> <br> My name is Kari, I am the personnel manager of a large International company.<br> Most of the work you can do from home, that is, at a distance.<br> <b>Salary is $2000-$5300.</b><br> <br> If you are interested in this offer, please visit <a href=\"http://www.xn-----6kcabdfroa7c7a2as1an7a2j.xn--p1ai/components/com_contact/views/categories/tmpl/5f9506d3f8.html\"><b>Our Site</b></a><br> <br> Best regards!<br><o:p></o:p></p></div></body></html> ------=_NextPart_000_0032_01D2912F.05324BC6-- "
m := Mail{
Key: "1488226337.M327822P8269.mail.carlostrub.ch,S=3620,W=3730",
Subject: &subject,
Body: &body,
Junk: true,
}
// Clean must succeed; it replaces the Subject and Body pointers of m.
err := m.Clean()
Ω(err).ShouldNot(HaveOccurred())
// Expected subject: the input subject in lower case.
subjectOutput := "hello"
// NOTE(review): this expected body appears to be the input body verbatim
// (upper-case letters and punctuation intact), not a lower-cased,
// punctuation-stripped form. Confirm whether it should hold the cleaned
// text; as written, the equality below presumably cannot hold.
bodyOutput := "This is a multi-part message in MIME format. ------=_NextPart_000_0032_01D2912F.05324BC6 Content-Type: text/plain; \tcharset=\"cp-850\" Content-Transfer-Encoding: quoted-printable Dear cs, We are looking for employees working remotely. My name is Kari, I am the personnel manager of a large International company. Most of the work you can do from home, that is, at a distance. Salary is $2000-$5300. If you are interested in this offer, please visit Our Site Best regards! ------=_NextPart_000_0032_01D2912F.05324BC6 Content-Type: text/html; \tcharset=\"cp-850\" Content-Transfer-Encoding: quoted-printable <html xmlns:v=\"urn:schemas-microsoft-com:vml\" xmlns:o=\"urn:schemas-microsoft-com:office:office\" xmlns:w=\"urn:schemas-microsoft-com:office:word\" xmlns:m=\"http://schemas.microsoft.com/office/2004/12/omml\" xmlns=\"http://www.w3.org/TR/REC-html40\"><head><META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html; charset=us-ascii\"><meta name=Generator content=\"Microsoft Word 14 (filtered medium)\"><style><!-- /* Font Definitions */ @font-face \t{font-family:Calibri; \tpanose-1:2 15 5 2 2 2 4 3 2 4;} /* Style Definitions */ p.MsoNormal, li.MsoNormal, div.MsoNormal \t{margin:0in; \tmargin-bottom:.0001pt; \tfont-size:11.0pt; \tfont-family:\"Calibri\",\"sans-serif\";} a:link, span.MsoHyperlink \t{mso-style-priority:99; \tcolor:blue; \ttext-decoration:underline;} a:visited, span.MsoHyperlinkFollowed \t{mso-style-priority:99; \tcolor:purple; \ttext-decoration:underline;} span.EmailStyle17 \t{mso-style-type:personal-compose; \tfont-family:\"Calibri\",\"sans-serif\"; \tcolor:windowtext;} .MsoChpDefault \t{mso-style-type:export-only; \tfont-family:\"Calibri\",\"sans-serif\";} @page WordSection1 \t{size:8.5in 11.0in; \tmargin:1.0in 1.0in 1.0in 1.0in;} div.WordSection1 \t{page:WordSection1;} --></style><!--[if gte mso 9]><xml> <o:shapedefaults v:ext=\"edit\" spidmax=\"1026\" /> </xml><![endif]--><!--[if gte mso 9]><xml> <o:shapelayout v:ext=\"edit\"> <o:idmap 
v:ext=\"edit\" data=\"1\" /> </o:shapelayout></xml><![endif]--></head><body lang=EN-US link=blue vlink=purple><div class=WordSection1><p class=MsoNormal>Dear cs,<br> <br> We are looking for employees working remotely.<br> <br> My name is Kari, I am the personnel manager of a large International company.<br> Most of the work you can do from home, that is, at a distance.<br> <b>Salary is $2000-$5300.</b><br> <br> If you are interested in this offer, please visit <a href=\"http://www.xn-----6kcabdfroa7c7a2as1an7a2j.xn--p1ai/components/com_contact/views/categories/tmpl/5f9506d3f8.html\"><b>Our Site</b></a><br> <br> Best regards!<br><o:p></o:p></p></div></body></html> ------=_NextPart_000_0032_01D2912F.05324BC6-- "
// Compare the whole Mail value against the expected one.
Ω(m).Should(Equal(
Mail{
Key: "1488226337.M327822P8269.mail.carlostrub.ch,S=3620,W=3730",
Subject: &subjectOutput,
Body: &bodyOutput,
Junk: true,
}))
})
})
})

Loading…
Cancel
Save