mirror of https://github.com/miguelmota/cointop
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
572 lines
16 KiB
Go
572 lines
16 KiB
Go
/*
Package soup implements a simple web scraper for Go,
keeping it as similar as possible to BeautifulSoup.
*/
|
|
|
|
package soup
|
|
|
|
import (
|
|
"bytes"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"io/ioutil"
|
|
"net/http"
|
|
"net/http/httputil"
|
|
"net/url"
|
|
netURL "net/url"
|
|
"regexp"
|
|
"strings"
|
|
|
|
"golang.org/x/net/html"
|
|
"golang.org/x/net/html/charset"
|
|
)
|
|
|
|
// ErrorType enumerates the categories of failure that soup can report.
type ErrorType int

// The values below are numbered sequentially from zero by iota, so the
// declaration order determines each constant's numeric value.
const (
	// ErrUnableToParse is reported when the HTML could not be parsed.
	ErrUnableToParse ErrorType = iota
	// ErrElementNotFound is reported when the requested element was not found.
	ErrElementNotFound
	// ErrNoNextSibling is reported when a node has no next sibling.
	ErrNoNextSibling
	// ErrNoPreviousSibling is reported when a node has no previous sibling.
	ErrNoPreviousSibling
	// ErrNoNextElementSibling is reported when a node has no next element sibling.
	ErrNoNextElementSibling
	// ErrNoPreviousElementSibling is reported when a node has no previous element sibling.
	ErrNoPreviousElementSibling
	// ErrCreatingGetRequest is reported when the GET request could not be created.
	ErrCreatingGetRequest
	// ErrInGetRequest is reported when performing the GET request failed.
	ErrInGetRequest
	// ErrCreatingPostRequest is reported when the POST request could not be created.
	ErrCreatingPostRequest
	// ErrMarshallingPostRequest is reported when the body of a POST request could not be serialized.
	ErrMarshallingPostRequest
	// ErrReadingResponse is reported when the response body could not be read.
	ErrReadingResponse
)
|
|
|
|
// Error allows easier introspection on the type of error returned.
|
|
// If you know you have a Error, you can compare the Type to one of the exported types
|
|
// from this package to see what kind of error it is, then further inspect the Error() method
|
|
// to see if it has more specific details for you, like in the case of a ErrElementNotFound
|
|
// type of error.
|
|
type Error struct {
|
|
Type ErrorType
|
|
msg string
|
|
}
|
|
|
|
func (se Error) Error() string {
|
|
return se.msg
|
|
}
|
|
|
|
func newError(t ErrorType, msg string) Error {
|
|
return Error{Type: t, msg: msg}
|
|
}
|
|
|
|
// Root is a structure containing a pointer to an html node, the node value, and an error variable to return an error if one occurred
|
|
type Root struct {
|
|
Pointer *html.Node
|
|
NodeValue string
|
|
Error error
|
|
}
|
|
|
|
// Package-level state shared by every request helper in this package.
var (
	// defaultClient is the HTTP client used by Get, Post and PostForm, for
	// callers that do not supply their own client.
	defaultClient = new(http.Client)

	// debug selects between panicking (true) and returning errors (false).
	debug bool

	// Headers contains all HTTP headers to send.
	Headers = map[string]string{}

	// Cookies contains all HTTP cookies to send.
	Cookies = map[string]string{}
)
|
|
|
|
// SetDebug sets the debug status
|
|
// Setting this to true causes the panics to be thrown and logged onto the console.
|
|
// Setting this to false causes the errors to be saved in the Error field in the returned struct.
|
|
func SetDebug(d bool) {
|
|
debug = d
|
|
}
|
|
|
|
// Header sets a new HTTP header
|
|
func Header(n string, v string) {
|
|
Headers[n] = v
|
|
}
|
|
|
|
// Cookie sets a cookie for http requests
|
|
func Cookie(n string, v string) {
|
|
Cookies[n] = v
|
|
}
|
|
|
|
// GetWithClient returns the HTML returned by the url using a provided HTTP client
|
|
func GetWithClient(url string, client *http.Client) (string, error) {
|
|
req, err := http.NewRequest("GET", url, nil)
|
|
if err != nil {
|
|
if debug {
|
|
panic("Couldn't create GET request to " + url)
|
|
}
|
|
return "", newError(ErrCreatingGetRequest, "error creating get request to "+url)
|
|
}
|
|
|
|
setHeadersAndCookies(req)
|
|
|
|
// Perform request
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
if debug {
|
|
panic("Couldn't perform GET request to " + url)
|
|
}
|
|
return "", newError(ErrInGetRequest, "couldn't perform GET request to "+url)
|
|
}
|
|
defer resp.Body.Close()
|
|
utf8Body, err := charset.NewReader(resp.Body, resp.Header.Get("Content-Type"))
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
bytes, err := ioutil.ReadAll(utf8Body)
|
|
if err != nil {
|
|
if debug {
|
|
panic("Unable to read the response body")
|
|
}
|
|
return "", newError(ErrReadingResponse, "unable to read the response body")
|
|
}
|
|
return string(bytes), nil
|
|
}
|
|
|
|
// setHeadersAndCookies helps build a request
|
|
func setHeadersAndCookies(req *http.Request) {
|
|
// Set headers
|
|
for hName, hValue := range Headers {
|
|
req.Header.Set(hName, hValue)
|
|
}
|
|
// Set cookies
|
|
for cName, cValue := range Cookies {
|
|
req.AddCookie(&http.Cookie{
|
|
Name: cName,
|
|
Value: cValue,
|
|
})
|
|
}
|
|
}
|
|
|
|
// getBodyReader serializes the body for a network request. See the test file for examples
|
|
func getBodyReader(rawBody interface{}) (io.Reader, error) {
|
|
var bodyReader io.Reader
|
|
|
|
if rawBody != nil {
|
|
switch body := rawBody.(type) {
|
|
case map[string]string:
|
|
jsonBody, err := json.Marshal(body)
|
|
if err != nil {
|
|
if debug {
|
|
panic("Unable to read the response body")
|
|
}
|
|
return nil, newError(ErrMarshallingPostRequest, "couldn't serialize map of strings to JSON.")
|
|
}
|
|
bodyReader = bytes.NewBuffer(jsonBody)
|
|
case netURL.Values:
|
|
bodyReader = strings.NewReader(body.Encode())
|
|
case []byte: //expects JSON format
|
|
bodyReader = bytes.NewBuffer(body)
|
|
case string: //expects JSON format
|
|
bodyReader = strings.NewReader(body)
|
|
default:
|
|
return nil, newError(ErrMarshallingPostRequest, fmt.Sprintf("Cannot handle body type %T", rawBody))
|
|
}
|
|
}
|
|
|
|
return bodyReader, nil
|
|
}
|
|
|
|
// PostWithClient returns the HTML returned by the url using a provided HTTP client
|
|
// The type of the body must conform to one of the types listed in func getBodyReader()
|
|
func PostWithClient(url string, bodyType string, body interface{}, client *http.Client) (string, error) {
|
|
bodyReader, err := getBodyReader(body)
|
|
if err != nil {
|
|
return "todo:", err
|
|
}
|
|
|
|
req, err := http.NewRequest("POST", url, bodyReader)
|
|
Header("Content-Type", bodyType)
|
|
setHeadersAndCookies(req)
|
|
|
|
if debug {
|
|
// Save a copy of this request for debugging.
|
|
requestDump, err := httputil.DumpRequest(req, true)
|
|
if err != nil {
|
|
fmt.Println(err)
|
|
}
|
|
fmt.Println(string(requestDump))
|
|
}
|
|
|
|
// Perform request
|
|
resp, err := client.Do(req)
|
|
|
|
if err != nil {
|
|
if debug {
|
|
panic("Couldn't perform POST request to " + url)
|
|
}
|
|
return "", newError(ErrCreatingPostRequest, "couldn't perform POST request to"+url)
|
|
}
|
|
defer resp.Body.Close()
|
|
bytes, err := ioutil.ReadAll(resp.Body)
|
|
if err != nil {
|
|
if debug {
|
|
panic("Unable to read the response body")
|
|
}
|
|
return "", newError(ErrReadingResponse, "unable to read the response body")
|
|
}
|
|
return string(bytes), nil
|
|
}
|
|
|
|
// Post returns the HTML returned by the url as a string using the default HTTP client
|
|
func Post(url string, bodyType string, body interface{}) (string, error) {
|
|
return PostWithClient(url, bodyType, body, defaultClient)
|
|
}
|
|
|
|
// PostForm is a convenience method for POST requests that
|
|
func PostForm(url string, data url.Values) (string, error) {
|
|
return PostWithClient(url, "application/x-www-form-urlencoded", data, defaultClient)
|
|
}
|
|
|
|
// Get returns the HTML returned by the url as a string using the default HTTP client
|
|
func Get(url string) (string, error) {
|
|
return GetWithClient(url, defaultClient)
|
|
}
|
|
|
|
// HTMLParse parses the HTML returning a start pointer to the DOM
|
|
func HTMLParse(s string) Root {
|
|
r, err := html.Parse(strings.NewReader(s))
|
|
if err != nil {
|
|
if debug {
|
|
panic("Unable to parse the HTML")
|
|
}
|
|
return Root{Error: newError(ErrUnableToParse, "unable to parse the HTML")}
|
|
}
|
|
for r.Type != html.ElementNode {
|
|
switch r.Type {
|
|
case html.DocumentNode:
|
|
r = r.FirstChild
|
|
case html.DoctypeNode:
|
|
r = r.NextSibling
|
|
case html.CommentNode:
|
|
r = r.NextSibling
|
|
}
|
|
}
|
|
return Root{Pointer: r, NodeValue: r.Data}
|
|
}
|
|
|
|
// Find finds the first occurrence of the given tag name,
|
|
// with or without attribute key and value specified,
|
|
// and returns a struct with a pointer to it
|
|
func (r Root) Find(args ...string) Root {
|
|
temp, ok := findOnce(r.Pointer, args, false, false)
|
|
if ok == false {
|
|
if debug {
|
|
panic("Element `" + args[0] + "` with attributes `" + strings.Join(args[1:], " ") + "` not found")
|
|
}
|
|
return Root{Error: newError(ErrElementNotFound, fmt.Sprintf("element `%s` with attributes `%s` not found", args[0], strings.Join(args[1:], " ")))}
|
|
}
|
|
return Root{Pointer: temp, NodeValue: temp.Data}
|
|
}
|
|
|
|
// FindAll finds all occurrences of the given tag name,
|
|
// with or without key and value specified,
|
|
// and returns an array of structs, each having
|
|
// the respective pointers
|
|
func (r Root) FindAll(args ...string) []Root {
|
|
temp := findAllofem(r.Pointer, args, false)
|
|
if len(temp) == 0 {
|
|
if debug {
|
|
panic("Element `" + args[0] + "` with attributes `" + strings.Join(args[1:], " ") + "` not found")
|
|
}
|
|
return []Root{}
|
|
}
|
|
pointers := make([]Root, 0, len(temp))
|
|
for i := 0; i < len(temp); i++ {
|
|
pointers = append(pointers, Root{Pointer: temp[i], NodeValue: temp[i].Data})
|
|
}
|
|
return pointers
|
|
}
|
|
|
|
// FindStrict finds the first occurrence of the given tag name
|
|
// only if all the values of the provided attribute are an exact match
|
|
func (r Root) FindStrict(args ...string) Root {
|
|
temp, ok := findOnce(r.Pointer, args, false, true)
|
|
if ok == false {
|
|
if debug {
|
|
panic("Element `" + args[0] + "` with attributes `" + strings.Join(args[1:], " ") + "` not found")
|
|
}
|
|
return Root{nil, "", newError(ErrElementNotFound, fmt.Sprintf("element `%s` with attributes `%s` not found", args[0], strings.Join(args[1:], " ")))}
|
|
}
|
|
return Root{Pointer: temp, NodeValue: temp.Data}
|
|
}
|
|
|
|
// FindAllStrict finds all occurrences of the given tag name
|
|
// only if all the values of the provided attribute are an exact match
|
|
func (r Root) FindAllStrict(args ...string) []Root {
|
|
temp := findAllofem(r.Pointer, args, true)
|
|
if len(temp) == 0 {
|
|
if debug {
|
|
panic("Element `" + args[0] + "` with attributes `" + strings.Join(args[1:], " ") + "` not found")
|
|
}
|
|
return []Root{}
|
|
}
|
|
pointers := make([]Root, 0, len(temp))
|
|
for i := 0; i < len(temp); i++ {
|
|
pointers = append(pointers, Root{Pointer: temp[i], NodeValue: temp[i].Data})
|
|
}
|
|
return pointers
|
|
}
|
|
|
|
// FindNextSibling finds the next sibling of the pointer in the DOM
|
|
// returning a struct with a pointer to it
|
|
func (r Root) FindNextSibling() Root {
|
|
nextSibling := r.Pointer.NextSibling
|
|
if nextSibling == nil {
|
|
if debug {
|
|
panic("No next sibling found")
|
|
}
|
|
return Root{Error: newError(ErrNoNextSibling, "no next sibling found")}
|
|
}
|
|
return Root{Pointer: nextSibling, NodeValue: nextSibling.Data}
|
|
}
|
|
|
|
// FindPrevSibling finds the previous sibling of the pointer in the DOM
|
|
// returning a struct with a pointer to it
|
|
func (r Root) FindPrevSibling() Root {
|
|
prevSibling := r.Pointer.PrevSibling
|
|
if prevSibling == nil {
|
|
if debug {
|
|
panic("No previous sibling found")
|
|
}
|
|
|
|
return Root{Error: newError(ErrNoPreviousSibling, "no previous sibling found")}
|
|
}
|
|
return Root{Pointer: prevSibling, NodeValue: prevSibling.Data}
|
|
}
|
|
|
|
// FindNextElementSibling finds the next element sibling of the pointer in the DOM
|
|
// returning a struct with a pointer to it
|
|
func (r Root) FindNextElementSibling() Root {
|
|
nextSibling := r.Pointer.NextSibling
|
|
if nextSibling == nil {
|
|
if debug {
|
|
panic("No next element sibling found")
|
|
}
|
|
return Root{Error: newError(ErrNoNextElementSibling, "no next element sibling found")}
|
|
}
|
|
if nextSibling.Type == html.ElementNode {
|
|
return Root{Pointer: nextSibling, NodeValue: nextSibling.Data}
|
|
}
|
|
p := Root{Pointer: nextSibling, NodeValue: nextSibling.Data}
|
|
return p.FindNextElementSibling()
|
|
}
|
|
|
|
// FindPrevElementSibling finds the previous element sibling of the pointer in the DOM
|
|
// returning a struct with a pointer to it
|
|
func (r Root) FindPrevElementSibling() Root {
|
|
prevSibling := r.Pointer.PrevSibling
|
|
if prevSibling == nil {
|
|
if debug {
|
|
panic("No previous element sibling found")
|
|
}
|
|
return Root{Error: newError(ErrNoPreviousElementSibling, "no previous element sibling found")}
|
|
}
|
|
if prevSibling.Type == html.ElementNode {
|
|
return Root{Pointer: prevSibling, NodeValue: prevSibling.Data}
|
|
}
|
|
p := Root{Pointer: prevSibling, NodeValue: prevSibling.Data}
|
|
return p.FindPrevElementSibling()
|
|
}
|
|
|
|
// Children retuns all direct children of this DOME element.
|
|
func (r Root) Children() []Root {
|
|
child := r.Pointer.FirstChild
|
|
var children []Root
|
|
for child != nil {
|
|
children = append(children, Root{Pointer: child, NodeValue: child.Data})
|
|
child = child.NextSibling
|
|
}
|
|
return children
|
|
}
|
|
|
|
// Attrs returns a map containing all attributes
|
|
func (r Root) Attrs() map[string]string {
|
|
if r.Pointer.Type != html.ElementNode {
|
|
if debug {
|
|
panic("Not an ElementNode")
|
|
}
|
|
return nil
|
|
}
|
|
if len(r.Pointer.Attr) == 0 {
|
|
return nil
|
|
}
|
|
return getKeyValue(r.Pointer.Attr)
|
|
}
|
|
|
|
// Text returns the string inside a non-nested element
|
|
func (r Root) Text() string {
|
|
k := r.Pointer.FirstChild
|
|
checkNode:
|
|
if k != nil && k.Type != html.TextNode {
|
|
k = k.NextSibling
|
|
if k == nil {
|
|
if debug {
|
|
panic("No text node found")
|
|
}
|
|
return ""
|
|
}
|
|
goto checkNode
|
|
}
|
|
if k != nil {
|
|
r, _ := regexp.Compile(`^\s+$`)
|
|
if ok := r.MatchString(k.Data); ok {
|
|
k = k.NextSibling
|
|
if k == nil {
|
|
if debug {
|
|
panic("No text node found")
|
|
}
|
|
return ""
|
|
}
|
|
goto checkNode
|
|
}
|
|
return k.Data
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// HTML returns the HTML code for the specific element
|
|
func (r Root) HTML() string {
|
|
var buf bytes.Buffer
|
|
if err := html.Render(&buf, r.Pointer); err != nil {
|
|
return ""
|
|
}
|
|
return buf.String()
|
|
}
|
|
|
|
// FullText returns the string inside even a nested element
|
|
func (r Root) FullText() string {
|
|
var buf bytes.Buffer
|
|
|
|
var f func(*html.Node)
|
|
f = func(n *html.Node) {
|
|
if n == nil {
|
|
return
|
|
}
|
|
if n.Type == html.TextNode {
|
|
buf.WriteString(n.Data)
|
|
}
|
|
if n.Type == html.ElementNode {
|
|
f(n.FirstChild)
|
|
}
|
|
if n.NextSibling != nil {
|
|
f(n.NextSibling)
|
|
}
|
|
}
|
|
|
|
f(r.Pointer.FirstChild)
|
|
|
|
return buf.String()
|
|
}
|
|
|
|
func matchElementName(n *html.Node, name string) bool {
|
|
return name == "" || name == n.Data
|
|
}
|
|
|
|
// Using depth first search to find the first occurrence and return
|
|
func findOnce(n *html.Node, args []string, uni bool, strict bool) (*html.Node, bool) {
|
|
if uni == true {
|
|
if n.Type == html.ElementNode && matchElementName(n, args[0]) {
|
|
if len(args) > 1 && len(args) < 4 {
|
|
for i := 0; i < len(n.Attr); i++ {
|
|
attr := n.Attr[i]
|
|
searchAttrName := args[1]
|
|
searchAttrVal := args[2]
|
|
if (strict && attributeAndValueEquals(attr, searchAttrName, searchAttrVal)) ||
|
|
(!strict && attributeContainsValue(attr, searchAttrName, searchAttrVal)) {
|
|
return n, true
|
|
}
|
|
}
|
|
} else if len(args) == 1 {
|
|
return n, true
|
|
}
|
|
}
|
|
}
|
|
uni = true
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
p, q := findOnce(c, args, true, strict)
|
|
if q != false {
|
|
return p, q
|
|
}
|
|
}
|
|
return nil, false
|
|
}
|
|
|
|
// Using depth first search to find all occurrences and return
|
|
func findAllofem(n *html.Node, args []string, strict bool) []*html.Node {
|
|
var nodeLinks = make([]*html.Node, 0, 10)
|
|
var f func(*html.Node, []string, bool)
|
|
f = func(n *html.Node, args []string, uni bool) {
|
|
if uni == true {
|
|
if n.Type == html.ElementNode && matchElementName(n, args[0]) {
|
|
if len(args) > 1 && len(args) < 4 {
|
|
for i := 0; i < len(n.Attr); i++ {
|
|
attr := n.Attr[i]
|
|
searchAttrName := args[1]
|
|
searchAttrVal := args[2]
|
|
if (strict && attributeAndValueEquals(attr, searchAttrName, searchAttrVal)) ||
|
|
(!strict && attributeContainsValue(attr, searchAttrName, searchAttrVal)) {
|
|
nodeLinks = append(nodeLinks, n)
|
|
}
|
|
}
|
|
} else if len(args) == 1 {
|
|
nodeLinks = append(nodeLinks, n)
|
|
}
|
|
}
|
|
}
|
|
uni = true
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
f(c, args, true)
|
|
}
|
|
}
|
|
f(n, args, false)
|
|
return nodeLinks
|
|
}
|
|
|
|
// attributeAndValueEquals reports when the html.Attribute attr has the same attribute name and value as from
|
|
// provided arguments
|
|
func attributeAndValueEquals(attr html.Attribute, attribute, value string) bool {
|
|
return attr.Key == attribute && attr.Val == value
|
|
}
|
|
|
|
// attributeContainsValue reports when the html.Attribute attr has the same attribute name as from provided
|
|
// attribute argument and compares if it has the same value in its values parameter
|
|
func attributeContainsValue(attr html.Attribute, attribute, value string) bool {
|
|
if attr.Key == attribute {
|
|
for _, attrVal := range strings.Fields(attr.Val) {
|
|
if attrVal == value {
|
|
return true
|
|
}
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// Returns a key pair value (like a dictionary) for each attribute
|
|
func getKeyValue(attributes []html.Attribute) map[string]string {
|
|
var keyvalues = make(map[string]string)
|
|
for i := 0; i < len(attributes); i++ {
|
|
_, exists := keyvalues[attributes[i].Key]
|
|
if exists == false {
|
|
keyvalues[attributes[i].Key] = attributes[i].Val
|
|
}
|
|
}
|
|
return keyvalues
|
|
}
|