Skip to content

Commit

Permalink
Merge pull request #33 from vosmith/multipart
Browse files Browse the repository at this point in the history
initial work on multipart requests, issue #8
  • Loading branch information
asciimoo authored Oct 18, 2017
2 parents 56cd1d4 + e8ae638 commit 30e15f6
Show file tree
Hide file tree
Showing 3 changed files with 134 additions and 8 deletions.
74 changes: 66 additions & 8 deletions colly.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package colly

import (
"bytes"
"crypto/rand"
"errors"
"fmt"
"io"
Expand Down Expand Up @@ -153,22 +154,32 @@ func (c *Collector) Init() {
// request to the URL specified in parameter.
// Visit also calls the previously provided callbacks
func (c *Collector) Visit(URL string) error {
return c.scrape(URL, "GET", 1, nil, nil)
// Six-argument form of scrape: no request body, a nil ctx (scrape then
// creates a new Context) and a nil hdr (scrape then sets the User-Agent
// header from c.UserAgent itself).
return c.scrape(URL, "GET", 1, nil, nil, nil)
}

// Post starts a collector job by creating a POST request.
// requestData is converted into the request body by createFormReader.
// Post also calls the previously provided callbacks.
func (c *Collector) Post(URL string, requestData map[string]string) error {
return c.scrape(URL, "POST", 1, createFormReader(requestData), nil)
// nil ctx: scrape creates a new Context. nil hdr: scrape sets the User-Agent
// and, because the method is POST, adds the
// application/x-www-form-urlencoded Content-Type.
return c.scrape(URL, "POST", 1, createFormReader(requestData), nil, nil)
}

// PostRaw starts a collector job by creating a POST request with raw binary data.
// PostRaw also calls the previously provided callbacks.
func (c *Collector) PostRaw(URL string, requestData []byte) error {
return c.scrape(URL, "POST", 1, bytes.NewReader(requestData), nil)
// NOTE(review): with a nil hdr, scrape adds Content-Type
// application/x-www-form-urlencoded to every POST, so this raw body is
// labelled as a URL-encoded form as well - confirm that this is intended.
return c.scrape(URL, "POST", 1, bytes.NewReader(requestData), nil, nil)
}

func (c *Collector) scrape(u, method string, depth int, requestData io.Reader, ctx *Context) error {
// PostMultipart starts a collector job by sending a multipart/form-data POST
// request. Every entry of requestData becomes one part: the key is the form
// field name and the value is the part's raw content.
// PostMultipart also calls the previously provided callbacks.
func (c *Collector) PostMultipart(URL string, requestData map[string][]byte) error {
	boundary := randomBoundary()
	// scrape uses a non-nil header set as-is instead of filling in its
	// defaults, so the User-Agent has to be supplied here as well.
	headers := http.Header{
		"Content-Type": {"multipart/form-data; boundary=" + boundary},
		"User-Agent":   {c.UserAgent},
	}
	body := createMultipartReader(boundary, requestData)
	return c.scrape(URL, "POST", 1, body, nil, headers)
}

func (c *Collector) scrape(u, method string, depth int, requestData io.Reader, ctx *Context, hdr http.Header) error {
c.wg.Add(1)
defer c.wg.Done()
if err := c.requestCheck(u, depth); err != nil {
Expand All @@ -188,7 +199,14 @@ func (c *Collector) scrape(u, method string, depth int, requestData io.Reader, c
if err != nil {
return err
}
req.Header.Set("User-Agent", c.UserAgent)
if hdr == nil {
req.Header.Set("User-Agent", c.UserAgent)
if method == "POST" {
req.Header.Add("Content-Type", "application/x-www-form-urlencoded")
}
} else {
req.Header = hdr
}
if ctx == nil {
ctx = NewContext()
}
Expand Down Expand Up @@ -475,21 +493,32 @@ func (r *Request) AbsoluteURL(u string) string {
// request and preserves the Context of the previous request.
// Visit also calls the previously provided callbacks
func (r *Request) Visit(URL string) error {
return r.collector.scrape(r.AbsoluteURL(URL), "GET", r.Depth+1, nil, r.Ctx)
// URL goes through r.AbsoluteURL first, the depth is one more than this
// request's, and r.Ctx is handed on so the new request reuses this
// request's Context. The nil hdr lets scrape set the User-Agent itself.
return r.collector.scrape(r.AbsoluteURL(URL), "GET", r.Depth+1, nil, r.Ctx, nil)
}

// Post continues a collector job by creating a POST request and preserves the Context
// of the previous request. requestData is converted into the request body by
// createFormReader.
// Post also calls the previously provided callbacks.
func (r *Request) Post(URL string, requestData map[string]string) error {
return r.collector.scrape(r.AbsoluteURL(URL), "POST", r.Depth+1, createFormReader(requestData), r.Ctx)
// The depth is one more than this request's. The nil hdr makes scrape set
// the User-Agent and the application/x-www-form-urlencoded Content-Type.
return r.collector.scrape(r.AbsoluteURL(URL), "POST", r.Depth+1, createFormReader(requestData), r.Ctx, nil)
}

// PostRaw continues a collector job by creating a POST request with raw binary data.
// PostRaw preserves the Context of the previous request
// and calls the previously provided callbacks.
func (r *Request) PostRaw(URL string, requestData []byte) error {
return r.collector.scrape(r.AbsoluteURL(URL), "POST", r.Depth+1, bytes.NewReader(requestData), r.Ctx)
// NOTE(review): with a nil hdr, scrape adds Content-Type
// application/x-www-form-urlencoded to every POST, so this raw body is
// labelled as a URL-encoded form as well - confirm that this is intended.
return r.collector.scrape(r.AbsoluteURL(URL), "POST", r.Depth+1, bytes.NewReader(requestData), r.Ctx, nil)
}

// PostMultipart continues a collector job by sending a multipart/form-data
// POST request. Every entry of requestData becomes one part: the key is the
// form field name and the value is the part's raw content.
// PostMultipart passes on the Context of the previous request
// and calls the previously provided callbacks.
func (r *Request) PostMultipart(URL string, requestData map[string][]byte) error {
	boundary := randomBoundary()
	// scrape uses a non-nil header set as-is instead of filling in its
	// defaults, so the User-Agent has to be supplied here as well.
	headers := http.Header{
		"Content-Type": {"multipart/form-data; boundary=" + boundary},
		"User-Agent":   {r.collector.UserAgent},
	}
	target := r.AbsoluteURL(URL)
	body := createMultipartReader(boundary, requestData)
	return r.collector.scrape(target, "POST", r.Depth+1, body, r.Ctx, headers)
}

// UnmarshalBinary decodes Context value to nil
Expand Down Expand Up @@ -531,6 +560,35 @@ func createFormReader(data map[string]string) io.Reader {
return strings.NewReader(form.Encode())
}

// createMultipartReader encodes data as a multipart/form-data request body
// delimited by boundary and returns a reader over the encoded bytes.
//
// Each map entry becomes one part: the key is sent as the form field name
// and the value is written verbatim as the part's content. Parts appear in
// map iteration order, which Go leaves unspecified. With an empty or nil map
// the body consists of the closing delimiter only.
//
// Lines are terminated with CRLF and the field name is sent as a quoted
// string, as the multipart syntax requires. The request-level Content-Type
// header (which carries the boundary) belongs on the HTTP request, so it is
// not repeated inside the body, and no per-part Content-Length is written.
func createMultipartReader(boundary string, data map[string][]byte) io.Reader {
	const crlf = "\r\n"
	// Keep a field name from ending its quoted string early or from
	// smuggling extra header lines into the part.
	nameEscaper := strings.NewReplacer("\\", "\\\\", `"`, "\\\"", "\r", "%0D", "\n", "%0A")
	dashBoundary := "--" + boundary

	var buffer bytes.Buffer
	for name, content := range data {
		buffer.WriteString(dashBoundary + crlf)
		buffer.WriteString(`Content-Disposition: form-data; name="` + nameEscaper.Replace(name) + `"` + crlf)
		// An empty line separates the part headers from the part content.
		buffer.WriteString(crlf)
		buffer.Write(content)
		buffer.WriteString(crlf)
	}
	// The closing delimiter is the boundary followed by two extra dashes.
	buffer.WriteString(dashBoundary + "--" + crlf)
	return &buffer
}

// randomBoundary returns a new multipart boundary: 30 bytes read from
// crypto/rand, rendered as 60 lowercase hexadecimal characters.
// It panics when the random source cannot deliver the bytes.
//
// The implementation was borrowed from
// github.com/golang/go/mime/multipart/writer.go#randomBoundary
func randomBoundary() string {
	const boundaryBytes = 30
	raw := make([]byte, boundaryBytes)
	if _, err := io.ReadFull(rand.Reader, raw); err != nil {
		panic(err)
	}
	return fmt.Sprintf("%x", raw)
}

func (r *Response) fixCharset() {
contentType := strings.ToLower(r.Headers.Get("Content-Type"))
if strings.Index(contentType, "charset") == -1 {
Expand Down
Binary file added examples/multipart/asciimoo.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
68 changes: 68 additions & 0 deletions examples/multipart/multipart.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
package main

import (
"fmt"
"io/ioutil"
"net/http"
"os"
"time"

"github.com/asciimoo/colly"
)

// generateFormData builds the form fields posted by this example: three
// short text fields plus the contents of asciimoo.jpg, read from the current
// working directory, under the "file" key.
//
// If the image cannot be read, the error is reported on standard error and
// the "file" field is left empty, so the remaining fields are still returned.
func generateFormData() map[string][]byte {
	// ReadFile opens, reads and closes the file in a single call.
	imgData, err := ioutil.ReadFile("asciimoo.jpg")
	if err != nil {
		fmt.Fprintln(os.Stderr, "reading asciimoo.jpg:", err)
		imgData = nil
	}

	return map[string][]byte{
		"firstname": []byte("one"),
		"lastname":  []byte("two"),
		"email":     []byte("onetwo@example.com"),
		"file":      imgData,
	}
}

// setupServer starts an HTTP server on :8080 in a background goroutine and
// returns immediately. Its single handler parses every request as a
// multipart form and answers 200 with a small success page, or 500 when the
// form cannot be parsed.
func setupServer() {
	var handler http.HandlerFunc = func(w http.ResponseWriter, r *http.Request) {
		fmt.Println("received request")
		// maxMemory of 10000000 bytes: file parts beyond that total are
		// spooled to temporary files instead of being held in memory.
		if err := r.ParseMultipartForm(10000000); err != nil {
			fmt.Println("server: Error:", err)
			w.WriteHeader(http.StatusInternalServerError)
			w.Write([]byte("<html><body>Internal Server Error</body></html>"))
			return
		}
		w.WriteHeader(http.StatusOK)
		fmt.Println("server: OK")
		w.Write([]byte("<html><body>Success</body></html>"))
	}

	go func() {
		// ListenAndServe blocks while serving and always returns a non-nil
		// error, for example when the port is already in use. Report it
		// instead of letting the example fail without any explanation.
		if err := http.ListenAndServe(":8080", handler); err != nil {
			fmt.Println("server: listen failed:", err)
		}
	}()
}

// main runs the multipart example: it starts the local test server, posts
// the form data to it, and posts again from every HTML response it receives.
func main() {
	// Start a single route http server to post an image to.
	setupServer()

	c := colly.NewCollector()
	// The same URL is posted to repeatedly; presumably revisits must be
	// allowed for that, and MaxDepth bounds the chain of follow-up posts.
	c.AllowURLRevisit = true
	c.MaxDepth = 5

	// For every response with an html element, print its text, wait a
	// second and post the form again from that response's request.
	c.OnHTML("html", func(e *colly.HTMLElement) {
		fmt.Println(e.Text)
		time.Sleep(1 * time.Second)
		if err := e.Request.PostMultipart("http://localhost:8080/", generateFormData()); err != nil {
			fmt.Println("post failed:", err)
		}
	})

	// Before making a request print "Posting asciimoo.jpg to ..."
	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Posting asciimoo.jpg to", r.URL.String())
	})

	// Start scraping
	if err := c.PostMultipart("http://localhost:8080/", generateFormData()); err != nil {
		fmt.Println("post failed:", err)
	}
	c.Wait()
}

0 comments on commit 30e15f6

Please sign in to comment.