-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit e515a3c
Showing
6 changed files
with
329 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
on: | ||
release: | ||
types: [created] | ||
|
||
jobs: | ||
releases-matrix: | ||
name: Release Go Binary | ||
runs-on: ubuntu-latest | ||
strategy: | ||
matrix: | ||
goos: [linux, windows, darwin] | ||
goarch: ["386", amd64, arm64] | ||
exclude: | ||
- goarch: "386" | ||
goos: darwin | ||
- goarch: arm64 | ||
goos: windows | ||
steps: | ||
- uses: actions/checkout@v3 | ||
- uses: wangyoucao577/go-release-action@v1 | ||
with: | ||
github_token: ${{ secrets.GITHUB_TOKEN }} | ||
goos: ${{ matrix.goos }} | ||
goarch: ${{ matrix.goarch }} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
MIT License | ||
|
||
Copyright (c) 2023 Mohammed Diaa | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
# waybackrobots | ||
Enumerate old versions of robots.txt paths using Wayback Machine for content discovery. | ||
|
||
## Usage | ||
Pass a list of domains/URLs through stdin and waybackrobots will write the full URLs to stdout. | ||
```sh | ||
$ cat targets.txt | waybackrobots | ||
|
||
Enumerating https://google.com/robots.txt versions... 100% |███████████████████████████████████████████| (50/50, 10 it/s) | ||
|
||
https://google.com/analytics/reporting/ | ||
https://google.com/ebooks?*q=related:* | ||
https://google.com/compare/*/apply* | ||
... | ||
``` | ||
|
||
## Command-line options | ||
|
||
| Option | Description | Default | | ||
|----------|----------------------------------------------------------------|---------| | ||
| -limit | Limit the number of crawled snapshots. Use -1 for unlimited. | 50 | | ||
| -recent | Use the most recent snapshots without evenly distributing them | false | | ||
|
||
## Snapshot Distribution | ||
By default, `waybackrobots` evenly distributes the snapshots it analyzes across the file's history when a limit is set. This is done to diversify the results and get a broader view of the `robots.txt` file over time. | ||
|
||
For example, if you set the limit to 5 and there are 10 snapshots, waybackrobots will analyze every other snapshot starting from the latest one. This means it will analyze the first, third, fifth, seventh, and ninth most recent snapshots. | ||
|
||
This default behavior can be changed with the `-recent` option, which tells `waybackrobots` to use only the most recent snapshots. | ||
|
||
```sh | ||
$ echo google.com | waybackrobots | wc | ||
422 422 13973 | ||
$ echo google.com | waybackrobots -recent | wc | ||
277 277 9100 | ||
``` | ||
|
||
## Installation | ||
### Binary | ||
Check out the [latest release](https://github.com/mhmdiaa/waybackrobots/releases/latest). | ||
|
||
### Go install | ||
``` | ||
go install github.com/mhmdiaa/waybackrobots@latest | ||
``` | ||
|
||
## References | ||
- This tool is an improved and updated version of [waybackrobots.py](https://gist.github.com/mhmdiaa/2742c5e147d49a804b408bfed3d32d07). | ||
- If you need a more customizable tool for working with Wayback Machine data, check out [chronos](https://github.com/mhmdiaa/chronos). |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
module github.com/mhmdiaa/waybackrobots | ||
|
||
go 1.19 | ||
|
||
require github.com/schollz/progressbar/v3 v3.13.1 | ||
|
||
require ( | ||
github.com/mattn/go-runewidth v0.0.14 // indirect | ||
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect | ||
github.com/rivo/uniseg v0.4.4 // indirect | ||
golang.org/x/sys v0.7.0 // indirect | ||
golang.org/x/term v0.7.0 // indirect | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= | ||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= | ||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= | ||
github.com/k0kubun/go-ansi v0.0.0-20180517002512-3bf9e2903213/go.mod h1:vNUNkEQ1e29fT/6vq2aBdFsgNPmy8qMdSay1npru+Sw= | ||
github.com/mattn/go-isatty v0.0.17/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= | ||
github.com/mattn/go-runewidth v0.0.14 h1:+xnbZSEeDbOIg5/mE6JF0w6n9duR1l3/WmbinWVwUuU= | ||
github.com/mattn/go-runewidth v0.0.14/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= | ||
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ= | ||
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw= | ||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= | ||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= | ||
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= | ||
github.com/rivo/uniseg v0.4.4 h1:8TfxU8dW6PdqD27gjM8MVNuicgxIjxpm4K7x4jp8sis= | ||
github.com/rivo/uniseg v0.4.4/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= | ||
github.com/schollz/progressbar/v3 v3.13.1 h1:o8rySDYiQ59Mwzy2FELeHY5ZARXZTVJC7iHD6PEFUiE= | ||
github.com/schollz/progressbar/v3 v3.13.1/go.mod h1:xvrbki8kfT1fzWzBT/UZd9L6GA+jdL7HAgq2RFnO6fQ= | ||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= | ||
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= | ||
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= | ||
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= | ||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= | ||
golang.org/x/sys v0.7.0 h1:3jlCCIQZPdOYu1h8BkNvLz8Kgwtae2cagcG/VamtZRU= | ||
golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= | ||
golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U= | ||
golang.org/x/term v0.7.0 h1:BEvjmm5fURWqcfbSKTdpkDXYBrUS1c0m8agp14W48vQ= | ||
golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,196 @@ | ||
package main | ||
|
||
import ( | ||
"bufio" | ||
"encoding/json" | ||
"flag" | ||
"fmt" | ||
"io/ioutil" | ||
"net/http" | ||
"net/url" | ||
"os" | ||
"strconv" | ||
"strings" | ||
"sync" | ||
|
||
"github.com/schollz/progressbar/v3" | ||
) | ||
|
||
func main() { | ||
versionsLimit := flag.Int("limit", 50, "limit the number crawled snapshots. Use -1 for unlimited") | ||
recent := flag.Bool("recent", false, "use the most recent snapshots without evenly distributing them") | ||
flag.Parse() | ||
|
||
scanner := bufio.NewScanner(os.Stdin) | ||
for scanner.Scan() { | ||
url, err := cleanURL(scanner.Text()) | ||
if err != nil { | ||
continue | ||
} | ||
|
||
versions, err := GetRobotsTxtVersions(url, *versionsLimit, *recent) | ||
if err != nil { | ||
fmt.Fprintf(os.Stderr, "Error getting versions: %v\n", err) | ||
os.Exit(1) | ||
} | ||
|
||
numThreads := 10 | ||
jobCh := make(chan string, numThreads) | ||
pathCh := make(chan []string) | ||
|
||
progressbarMessage := fmt.Sprintf("Enumerating %s/robots.txt versions...", url) | ||
bar := progressbar.Default(int64(len(versions)), progressbarMessage) | ||
|
||
var wg sync.WaitGroup | ||
wg.Add(numThreads) | ||
|
||
for i := 0; i < numThreads; i++ { | ||
go func() { | ||
defer wg.Done() | ||
for version := range jobCh { | ||
GetRobotsTxtPaths(version, url, pathCh, bar) | ||
} | ||
}() | ||
} | ||
|
||
go func() { | ||
for _, version := range versions { | ||
jobCh <- version | ||
} | ||
close(jobCh) | ||
}() | ||
|
||
go func() { | ||
wg.Wait() | ||
close(pathCh) | ||
}() | ||
|
||
allPaths := make(map[string]bool) | ||
for pathsBatch := range pathCh { | ||
for _, path := range pathsBatch { | ||
allPaths[path] = true | ||
} | ||
} | ||
|
||
for path := range allPaths { | ||
fmt.Println(path) | ||
} | ||
} | ||
|
||
if err := scanner.Err(); err != nil { | ||
fmt.Fprintf(os.Stderr, "Error reading URLs from stdin: %v\n", err) | ||
os.Exit(1) | ||
} | ||
} | ||
|
||
// GetRobotsTxtVersions asks the Wayback Machine CDX API for the archived
// snapshots of url's robots.txt and returns the timestamps of the snapshots
// that should be crawled.
//
// url is the scheme://host base of the target. limit caps how many snapshots
// are returned; -1 means no cap. When recent is true (and limit is not -1)
// the API itself is asked for only the last limit rows and they are returned
// as-is. Otherwise, if more than limit snapshots exist, limit of them are
// picked at evenly spaced positions, starting from the last row of the
// response and walking backwards.
//
// An error is returned if the request fails, the API answers with a non-200
// status, or the response body is not the expected JSON array of rows.
func GetRobotsTxtVersions(url string, limit int, recent bool) ([]string, error) {
	requestURL := fmt.Sprintf("https://web.archive.org/cdx/search/cdx?url=%s/robots.txt&output=json&fl=timestamp&filter=statuscode:200&collapse=digest", url)
	if limit != -1 && recent {
		// A negative limit makes the CDX API return the last N rows.
		requestURL += "&limit=-" + strconv.Itoa(limit)
	}

	res, err := http.Get(requestURL)
	if err != nil {
		return nil, err
	}
	defer res.Body.Close()

	if res.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("unexpected CDX API status: %s", res.Status)
	}

	raw, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return nil, err
	}

	var versions [][]string
	if err := json.Unmarshal(raw, &versions); err != nil {
		return nil, err
	}
	if len(versions) == 0 {
		return []string{}, nil
	}

	// The first row of the JSON output is the column header, not a snapshot.
	versions = versions[1:]
	length := len(versions)

	if recent || limit == -1 || length <= limit {
		selectedVersions := make([]string, 0, length)
		for _, version := range versions {
			selectedVersions = append(selectedVersions, version...)
		}
		return selectedVersions, nil
	}

	// A zero or otherwise non-positive limit selects nothing. Returning here
	// also keeps the spacing arithmetic below free of a division by zero.
	if limit <= 0 {
		return []string{}, nil
	}

	// Spread limit picks across the whole history. i*length/limit is strictly
	// increasing (length > limit) and always below length, so every index is
	// distinct and within [0, length-1].
	selectedVersions := make([]string, 0, limit)
	for i := 0; i < limit; i++ {
		index := length - 1 - i*length/limit
		selectedVersions = append(selectedVersions, versions[index]...)
	}
	return selectedVersions, nil
}
|
||
func GetRobotsTxtPaths(version string, url string, pathCh chan []string, bar *progressbar.ProgressBar) { | ||
requestURL := fmt.Sprintf("https://web.archive.org/web/%sif_/%s/robots.txt", version, url) | ||
res, err := http.Get(requestURL) | ||
bar.Add(1) | ||
if err != nil || res.StatusCode != 200 { | ||
return | ||
} | ||
|
||
outputURLs := make([]string, 0) | ||
|
||
scanner := bufio.NewScanner(res.Body) | ||
for scanner.Scan() { | ||
line := strings.TrimSpace(scanner.Text()) | ||
if strings.HasPrefix(line, "Disallow:") || strings.HasPrefix(line, "Allow:") { | ||
fields := strings.Fields(line) | ||
if len(fields) < 2 { | ||
continue | ||
} | ||
path := strings.TrimSpace(fields[1]) | ||
if path != "" { | ||
fullURL, err := mergeURLPath(url, path) | ||
if err != nil { | ||
continue | ||
} | ||
outputURLs = append(outputURLs, fullURL) | ||
} | ||
} | ||
} | ||
|
||
if err := scanner.Err(); err != nil { | ||
return | ||
} | ||
pathCh <- outputURLs | ||
} | ||
|
||
func mergeURLPath(baseURL, path string) (string, error) { | ||
host, err := cleanURL(baseURL) | ||
if err != nil { | ||
return "", err | ||
} | ||
|
||
if !strings.HasPrefix(path, "/") { | ||
path = "/" + path | ||
} | ||
|
||
url := fmt.Sprintf(host + path) | ||
return url, nil | ||
} | ||
|
||
// cleanURL reduces baseURL to its "scheme://host" base, dropping any path,
// query, and fragment. Input without a scheme (for example "example.com",
// "example.com/path" or "example.com:8080") is treated as an https host.
//
// It returns an error when the input is empty, cannot be parsed, or has no
// host component.
func cleanURL(baseURL string) (string, error) {
	raw := strings.TrimSpace(baseURL)
	if raw == "" {
		return "", fmt.Errorf("empty URL")
	}

	// Without "://" the input is a bare host (optionally with port and path).
	// Prefixing the scheme before parsing lets url.Parse separate the host
	// from the rest; parsing "host:port" directly would read "host" as the
	// scheme.
	if !strings.Contains(raw, "://") {
		raw = "https://" + strings.TrimLeft(raw, "/")
	}

	u, err := url.Parse(raw)
	if err != nil {
		return "", err
	}
	if u.Host == "" {
		return "", fmt.Errorf("no host in URL %q", baseURL)
	}

	return fmt.Sprintf("%s://%s", u.Scheme, u.Host), nil
}