hakrawler.go
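// Package-level summary (added for readability, describing what the code below does):
// hakrawler reads URLs from stdin, crawls each one with a colly collector
// configured by the command-line flags, and writes every discovered href,
// script src and form action to stdout.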
package main

import (
	"bufio"
	"crypto/tls"
	"encoding/json"
	"errors"
	"flag"
	"fmt"
	"log"
	"net/http"
	"net/url"
	"os"
	"regexp"
	"strings"
	"sync"
	"time"

	"github.com/gocolly/colly/v2"
)

type Result struct {
	Source string
	URL    string
	Where  string
}

// custom request headers parsed from the -h flag
var headers map[string]string

// Thread safe map
var sm sync.Map
func main() {
	inside := flag.Bool("i", false, "Only crawl inside path")
	threads := flag.Int("t", 8, "Number of threads to utilise.")
	depth := flag.Int("d", 2, "Depth to crawl.")
	maxSize := flag.Int("size", -1, "Page size limit, in KB.")
	insecure := flag.Bool("insecure", false, "Disable TLS verification.")
	subsInScope := flag.Bool("subs", false, "Include subdomains for crawling.")
	showJson := flag.Bool("json", false, "Output as JSON.")
	showSource := flag.Bool("s", false, "Show the source of the URL based on where it was found, e.g. href, form, script, etc.")
	showWhere := flag.Bool("w", false, "Show at which link the URL is found.")
	rawHeaders := flag.String("h", "", "Custom headers separated by two semi-colons. E.g. -h \"Cookie: foo=bar;;Referer: http://example.com/\"")
	unique := flag.Bool("u", false, "Show only unique URLs.")
	proxy := flag.String("proxy", "", "Proxy URL. E.g. -proxy http://127.0.0.1:8080")
	timeout := flag.Int("timeout", -1, "Maximum time to crawl each URL from stdin, in seconds.")
	disableRedirects := flag.Bool("dr", false, "Disable following HTTP redirects.")
	flag.Parse()

	if *proxy != "" {
		os.Setenv("PROXY", *proxy)
	}
	proxyURL, _ := url.Parse(os.Getenv("PROXY"))

	// Convert the headers input to a usable map (or die trying)
	err := parseHeaders(*rawHeaders)
	if err != nil {
		fmt.Fprintln(os.Stderr, "Error parsing headers:", err)
		os.Exit(1)
	}

	// Check for stdin input
	stat, _ := os.Stdin.Stat()
	if (stat.Mode() & os.ModeCharDevice) != 0 {
		fmt.Fprintln(os.Stderr, "No urls detected. Hint: cat urls.txt | hakrawler")
		os.Exit(1)
	}
	results := make(chan string, *threads)

	go func() {
		// get each line of stdin, push it to the work channel
		s := bufio.NewScanner(os.Stdin)
		for s.Scan() {
			url := s.Text()
			hostname, err := extractHostname(url)
			if err != nil {
				log.Println("Error parsing URL:", err)
				continue
			}

			allowed_domains := []string{hostname}
			// if "Host" header is set, append it to allowed domains
			if headers != nil {
				if val, ok := headers["Host"]; ok {
					allowed_domains = append(allowed_domains, val)
				}
			}

			// Instantiate default collector
			c := colly.NewCollector(
				// default user agent header
				colly.UserAgent("Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0"),
				// set custom headers
				colly.Headers(headers),
				// limit crawling to the domain of the specified URL
				colly.AllowedDomains(allowed_domains...),
				// set MaxDepth to the specified depth
				colly.MaxDepth(*depth),
				// specify Async for threading
				colly.Async(true),
			)

			// set a page size limit
			if *maxSize != -1 {
				c.MaxBodySize = *maxSize * 1024
			}
			// if -subs is present, drop the AllowedDomains restriction and instead use a URL
			// regex that matches the target hostname preceded by "//" (the host itself) or "." (a subdomain)
			if *subsInScope {
				c.AllowedDomains = nil
				c.URLFilters = []*regexp.Regexp{regexp.MustCompile(".*(\\.|\\/\\/)" + strings.ReplaceAll(hostname, ".", "\\.") + "((#|\\/|\\?).*)?")}
			}
			// If `-dr` flag provided, do not follow HTTP redirects.
			if *disableRedirects {
				c.SetRedirectHandler(func(req *http.Request, via []*http.Request) error {
					return http.ErrUseLastResponse
				})
			}

			// Set parallelism
			c.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: *threads})

			// Print every href found, and visit it
			c.OnHTML("a[href]", func(e *colly.HTMLElement) {
				link := e.Attr("href")
				abs_link := e.Request.AbsoluteURL(link)
				if strings.Contains(abs_link, url) || !*inside {
					printResult(link, "href", *showSource, *showWhere, *showJson, results, e)
					e.Request.Visit(link)
				}
			})

			// find and print all the JavaScript files
			c.OnHTML("script[src]", func(e *colly.HTMLElement) {
				printResult(e.Attr("src"), "script", *showSource, *showWhere, *showJson, results, e)
			})

			// find and print all the form action URLs
			c.OnHTML("form[action]", func(e *colly.HTMLElement) {
				printResult(e.Attr("action"), "form", *showSource, *showWhere, *showJson, results, e)
			})

			// add the custom headers
			if headers != nil {
				c.OnRequest(func(r *colly.Request) {
					for header, value := range headers {
						r.Headers.Set(header, value)
					}
				})
			}

			if *proxy != "" {
				// Skip TLS verification for proxy, if -insecure specified
				c.WithTransport(&http.Transport{
					Proxy:           http.ProxyURL(proxyURL),
					TLSClientConfig: &tls.Config{InsecureSkipVerify: *insecure},
				})
			} else {
				// Skip TLS verification if -insecure flag is present
				c.WithTransport(&http.Transport{
					TLSClientConfig: &tls.Config{InsecureSkipVerify: *insecure},
				})
			}
			if *timeout == -1 {
				// Start scraping
				c.Visit(url)
				// Wait until threads are finished
				c.Wait()
			} else {
				finished := make(chan int, 1)

				go func() {
					// Start scraping
					c.Visit(url)
					// Wait until threads are finished
					c.Wait()
					finished <- 0
				}()

				select {
				case <-finished:
					// the crawling finished before the timeout
					close(finished)
					continue
				case <-time.After(time.Duration(*timeout) * time.Second):
					// timeout reached
					log.Println("[timeout] " + url)
					continue
				}
			}
		}
		if err := s.Err(); err != nil {
			fmt.Fprintln(os.Stderr, "reading standard input:", err)
		}
		close(results)
	}()
	w := bufio.NewWriter(os.Stdout)
	defer w.Flush()

	// When -u is set, drain the results channel and print only first-seen URLs;
	// the loop below then finds the channel closed and drained, and exits immediately.
	if *unique {
		for res := range results {
			if isUnique(res) {
				fmt.Fprintln(w, res)
			}
		}
	}
	// Without -u, print every result as it arrives.
	for res := range results {
		fmt.Fprintln(w, res)
	}
}
// parseHeaders does validation of headers input and saves it to a formatted map.
func parseHeaders(rawHeaders string) error {
	if rawHeaders != "" {
		if !strings.Contains(rawHeaders, ":") {
			return errors.New("headers flag not formatted properly (no colon to separate header and value)")
		}

		headers = make(map[string]string)
		rawHeaders := strings.Split(rawHeaders, ";;")
		for _, header := range rawHeaders {
			var parts []string
			if strings.Contains(header, ": ") {
				parts = strings.SplitN(header, ": ", 2)
			} else if strings.Contains(header, ":") {
				parts = strings.SplitN(header, ":", 2)
			} else {
				continue
			}
			headers[strings.TrimSpace(parts[0])] = strings.TrimSpace(parts[1])
		}
	}
	return nil
}

// extractHostname() extracts the hostname from a URL and returns it
func extractHostname(urlString string) (string, error) {
	u, err := url.Parse(urlString)
	if err != nil || !u.IsAbs() {
		return "", errors.New("Input must be a valid absolute URL")
	}
	return u.Hostname(), nil
}
// printResult constructs output lines and sends them to the results chan
func printResult(link string, sourceName string, showSource bool, showWhere bool, showJson bool, results chan string, e *colly.HTMLElement) {
	result := e.Request.AbsoluteURL(link)
	whereURL := e.Request.URL.String()
	if result != "" {
		if showJson {
			where := ""
			if showWhere {
				where = whereURL
			}
			bytes, _ := json.Marshal(Result{
				Source: sourceName,
				URL:    result,
				Where:  where,
			})
			result = string(bytes)
		} else if showSource {
			result = "[" + sourceName + "] " + result
		}

		if showWhere && !showJson {
			result = "[" + whereURL + "] " + result
		}

		// If a timeout occurs before the goroutines are finished, recover from the panic
		// that may occur when writing to the closed results channel
		defer func() {
			if err := recover(); err != nil {
				return
			}
		}()
		results <- result
	}
}
// returns whether the supplied url is unique or not
func isUnique(url string) bool {
	_, present := sm.Load(url)
	if present {
		return false
	}
	sm.Store(url, true)
	return true
}