Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
mhmdiaa committed Apr 11, 2023
0 parents commit e515a3c
Show file tree
Hide file tree
Showing 6 changed files with 329 additions and 0 deletions.
24 changes: 24 additions & 0 deletions .github/workflows/release.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
on:
release:
types: [created]

jobs:
releases-matrix:
name: Release Go Binary
runs-on: ubuntu-latest
strategy:
matrix:
goos: [linux, windows, darwin]
goarch: ["386", amd64, arm64]
exclude:
- goarch: "386"
goos: darwin
- goarch: arm64
goos: windows
steps:
- uses: actions/checkout@v3
- uses: wangyoucao577/go-release-action@v1
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
goos: ${{ matrix.goos }}
goarch: ${{ matrix.goarch }}
21 changes: 21 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2023 Mohammed Diaa

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
49 changes: 49 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# waybackrobots
Enumerate paths from old versions of a site's robots.txt using the Wayback Machine, for content discovery.

## Usage
Pass a list of domains/URLs through stdin and waybackrobots will write the full URLs to stdout.
```sh
$ cat targets.txt | waybackrobots

Enumerating https://google.com/robots.txt versions... 100% |███████████████████████████████████████████| (50/50, 10 it/s)

https://google.com/analytics/reporting/
https://google.com/ebooks?*q=related:*
https://google.com/compare/*/apply*
...
```

## Command-line options

| Option | Description | Default |
|----------|----------------------------------------------------------------|---------|
| -limit | Limit the number of crawled snapshots. Use -1 for unlimited. | 50 |
| -recent | Use the most recent snapshots without evenly distributing them | false |

## Snapshot Distribution
By default, `waybackrobots` evenly distributes the snapshots it analyzes across the file's history when a limit is set. This is done to diversify the results and get a broader view of the `robots.txt` file over time.

For example, if you set the limit to 5 and there are 10 snapshots, waybackrobots will analyze every other snapshot starting from the latest one. This means it will analyze the first, third, fifth, seventh, and ninth most recent snapshots.

This default behavior can be changed with the `-recent` option, which tells `waybackrobots` to use only the most recent snapshots.

```sh
$ echo google.com | waybackrobots | wc
422 422 13973
$ echo google.com | waybackrobots -recent | wc
277 277 9100
```

## Installation
### Binary
Check out the [latest release](https://github.com/mhmdiaa/waybackrobots/releases/latest).

### Go install
```
go install github.com/mhmdiaa/waybackrobots@latest
```

## References
- This tool is an improved and updated version of [waybackrobots.py](https://gist.github.com/mhmdiaa/2742c5e147d49a804b408bfed3d32d07).
- If you need a more customizable tool for working with Wayback Machine data, check out [chronos](https://github.com/mhmdiaa/chronos).
13 changes: 13 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
module github.com/mhmdiaa/waybackrobots

go 1.19

require github.com/schollz/progressbar/v3 v3.13.1

require (
github.com/mattn/go-runewidth v0.0.14 // indirect
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect
github.com/rivo/uniseg v0.4.4 // indirect
golang.org/x/sys v0.7.0 // indirect
golang.org/x/term v0.7.0 // indirect
)
26 changes: 26 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/k0kubun/go-ansi v0.0.0-20180517002512-3bf9e2903213/go.mod h1:vNUNkEQ1e29fT/6vq2aBdFsgNPmy8qMdSay1npru+Sw=
github.com/mattn/go-isatty v0.0.17/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
github.com/mattn/go-runewidth v0.0.14 h1:+xnbZSEeDbOIg5/mE6JF0w6n9duR1l3/WmbinWVwUuU=
github.com/mattn/go-runewidth v0.0.14/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ=
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/rivo/uniseg v0.4.4 h1:8TfxU8dW6PdqD27gjM8MVNuicgxIjxpm4K7x4jp8sis=
github.com/rivo/uniseg v0.4.4/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
github.com/schollz/progressbar/v3 v3.13.1 h1:o8rySDYiQ59Mwzy2FELeHY5ZARXZTVJC7iHD6PEFUiE=
github.com/schollz/progressbar/v3 v3.13.1/go.mod h1:xvrbki8kfT1fzWzBT/UZd9L6GA+jdL7HAgq2RFnO6fQ=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.7.0 h1:3jlCCIQZPdOYu1h8BkNvLz8Kgwtae2cagcG/VamtZRU=
golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U=
golang.org/x/term v0.7.0 h1:BEvjmm5fURWqcfbSKTdpkDXYBrUS1c0m8agp14W48vQ=
golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY=
196 changes: 196 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
package main

import (
"bufio"
"encoding/json"
"flag"
"fmt"
"io/ioutil"
"net/http"
"net/url"
"os"
"strconv"
"strings"
"sync"

"github.com/schollz/progressbar/v3"
)

// main reads targets (domains or URLs) from stdin, one per line. For every
// target it looks up archived robots.txt snapshots, fetches them with a pool
// of workers, and prints the de-duplicated URLs built from their
// Allow/Disallow rules to stdout.
//
// Flags:
//
//	-limit   maximum number of snapshots to crawl per target (-1 = unlimited)
//	-recent  take the most recent snapshots instead of spreading the
//	         selection across the file's history
func main() {
	versionsLimit := flag.Int("limit", 50, "limit the number crawled snapshots. Use -1 for unlimited")
	recent := flag.Bool("recent", false, "use the most recent snapshots without evenly distributing them")
	flag.Parse()

	scanner := bufio.NewScanner(os.Stdin)
	for scanner.Scan() {
		// Blank lines (e.g. a trailing newline in the targets file) are not
		// targets; without this check they would be queried as "https://".
		line := strings.TrimSpace(scanner.Text())
		if line == "" {
			continue
		}

		target, err := cleanURL(line)
		if err != nil {
			// Tell the user which input was dropped instead of skipping it silently.
			fmt.Fprintf(os.Stderr, "Skipping invalid target %q: %v\n", line, err)
			continue
		}

		versions, err := GetRobotsTxtVersions(target, *versionsLimit, *recent)
		if err != nil {
			fmt.Fprintf(os.Stderr, "Error getting versions: %v\n", err)
			os.Exit(1)
		}
		if len(versions) == 0 {
			// Nothing archived for this target: no need for workers or a progress bar.
			continue
		}

		const numThreads = 10
		jobCh := make(chan string, numThreads)
		pathCh := make(chan []string)

		progressbarMessage := fmt.Sprintf("Enumerating %s/robots.txt versions...", target)
		bar := progressbar.Default(int64(len(versions)), progressbarMessage)

		var wg sync.WaitGroup
		wg.Add(numThreads)

		// Workers: each one drains jobCh and sends its results on pathCh.
		for i := 0; i < numThreads; i++ {
			go func() {
				defer wg.Done()
				for version := range jobCh {
					GetRobotsTxtPaths(version, target, pathCh, bar)
				}
			}()
		}

		// Producer: feed every snapshot timestamp to the workers, then signal
		// that no more jobs are coming.
		go func() {
			for _, version := range versions {
				jobCh <- version
			}
			close(jobCh)
		}()

		// Closer: once every worker has returned, end the collection loop below.
		go func() {
			wg.Wait()
			close(pathCh)
		}()

		// Collect into a set so a path seen in several snapshots is printed once.
		allPaths := make(map[string]struct{})
		for pathsBatch := range pathCh {
			for _, path := range pathsBatch {
				allPaths[path] = struct{}{}
			}
		}

		for path := range allPaths {
			fmt.Println(path)
		}
	}

	if err := scanner.Err(); err != nil {
		fmt.Fprintf(os.Stderr, "Error reading URLs from stdin: %v\n", err)
		os.Exit(1)
	}
}

// GetRobotsTxtVersions asks the Wayback Machine CDX API for the timestamps of
// the archived copies of <url>/robots.txt that were captured with status 200,
// collapsed by content digest.
//
// Parameters:
//   - url:    base URL of the target, e.g. "https://example.com".
//   - limit:  maximum number of timestamps to return; -1 means no limit.
//   - recent: when true (and limit != -1) the API itself is asked for only the
//     last `limit` captures and they are returned as-is. When false, and more
//     than `limit` captures exist, `limit` captures are picked at evenly spaced
//     positions starting from the last row of the response and walking
//     backwards.
//
// It returns the selected timestamps, or an error if the request fails, the
// API answers with a non-200 status, or the body is not the expected JSON.
func GetRobotsTxtVersions(url string, limit int, recent bool) ([]string, error) {
	requestURL := fmt.Sprintf("https://web.archive.org/cdx/search/cdx?url=%s/robots.txt&output=json&fl=timestamp&filter=statuscode:200&collapse=digest", url)
	if limit != -1 && recent {
		// A negative limit makes the CDX API return the last N results.
		requestURL += "&limit=-" + strconv.Itoa(limit)
	}

	res, err := http.Get(requestURL)
	if err != nil {
		return nil, err
	}
	defer res.Body.Close()

	// Fail with a clear message instead of a confusing JSON error when the
	// API answers with an error page.
	if res.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("unexpected CDX API response status: %s", res.Status)
	}

	raw, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return nil, err
	}

	var versions [][]string
	if err := json.Unmarshal(raw, &versions); err != nil {
		return nil, err
	}
	if len(versions) == 0 {
		return []string{}, nil
	}

	// The first row is the column header ("timestamp"), not a capture.
	versions = versions[1:]
	length := len(versions)

	if recent || limit == -1 || length <= limit {
		selectedVersions := make([]string, 0, length)
		for _, version := range versions {
			selectedVersions = append(selectedVersions, version...)
		}
		return selectedVersions, nil
	}

	// Pick `limit` evenly spaced rows, starting at the last one. Because
	// i < limit, i*length/limit is always < length, so the index can never
	// leave the slice; the division only happens when limit > 0.
	selectedVersions := make([]string, 0)
	for i := 0; i < limit; i++ {
		index := length - 1 - (i*length)/limit
		selectedVersions = append(selectedVersions, versions[index]...)
	}
	return selectedVersions, nil
}

// GetRobotsTxtPaths downloads the archived robots.txt of url captured at the
// given Wayback Machine timestamp and sends, as one batch on pathCh, the full
// URLs built from every Allow/Disallow rule it contains.
//
// Parameters:
//   - version: snapshot timestamp as returned by the CDX API.
//   - url:     base URL of the target, e.g. "https://example.com".
//   - pathCh:  channel that receives the batch of URLs from this snapshot.
//   - bar:     progress bar, advanced by one for every call.
//
// Nothing is sent when the request fails, the status is not 200, or the body
// cannot be read to the end.
func GetRobotsTxtPaths(version string, url string, pathCh chan []string, bar *progressbar.ProgressBar) {
	requestURL := fmt.Sprintf("https://web.archive.org/web/%sif_/%s/robots.txt", version, url)
	res, err := http.Get(requestURL)
	// Count the snapshot as processed whether or not the fetch succeeded.
	bar.Add(1)
	if err != nil {
		return
	}
	// Always release the connection, including on the non-200 path below.
	defer res.Body.Close()

	if res.StatusCode != http.StatusOK {
		return
	}

	outputURLs := make([]string, 0)

	scanner := bufio.NewScanner(res.Body)
	for scanner.Scan() {
		// Split "<directive>: <value>" on the first colon so that rules
		// written without a space after the colon are still recognised.
		name, value, found := strings.Cut(scanner.Text(), ":")
		if !found {
			continue
		}

		// Directive names are matched case-insensitively.
		name = strings.TrimSpace(name)
		if !strings.EqualFold(name, "Disallow") && !strings.EqualFold(name, "Allow") {
			continue
		}

		// The path is the first whitespace-delimited token; anything after it
		// (such as a trailing comment) is ignored. An empty rule or one that
		// only carries a comment has no path.
		fields := strings.Fields(value)
		if len(fields) == 0 || strings.HasPrefix(fields[0], "#") {
			continue
		}

		fullURL, err := mergeURLPath(url, fields[0])
		if err != nil {
			continue
		}
		outputURLs = append(outputURLs, fullURL)
	}

	if err := scanner.Err(); err != nil {
		return
	}
	pathCh <- outputURLs
}

// mergeURLPath joins the scheme and host of baseURL with path and returns the
// resulting URL. A leading "/" is added to path when it is missing. The path
// is appended verbatim, so characters such as "%" and "*" are preserved.
//
// It returns an error when baseURL cannot be reduced to scheme://host.
func mergeURLPath(baseURL, path string) (string, error) {
	host, err := cleanURL(baseURL)
	if err != nil {
		return "", err
	}

	if !strings.HasPrefix(path, "/") {
		path = "/" + path
	}

	// Plain concatenation: the path is data, not a format string, so "%"
	// sequences in it must not be interpreted by fmt.
	return host + path, nil
}

// cleanURL reduces a domain or URL to "scheme://host" (the host keeps its
// port, if any). Surrounding whitespace is ignored. Input without a scheme,
// such as "example.com", "example.com/path" or "example.com:8080", is treated
// as an https URL.
//
// It returns an error when the input cannot be parsed or contains no host.
func cleanURL(baseURL string) (string, error) {
	raw := strings.TrimSpace(baseURL)

	u, err := url.Parse(raw)
	if err != nil || u.Host == "" {
		// Without "scheme://" the parser puts the host into the path (or reads
		// "host:port" as a scheme). Parse again with an explicit scheme so the
		// host is extracted properly.
		u, err = url.Parse("https://" + strings.TrimPrefix(raw, "//"))
		if err != nil {
			return "", err
		}
	}

	if u.Host == "" {
		return "", fmt.Errorf("no host found in %q", baseURL)
	}
	if u.Scheme == "" {
		// Scheme-relative input such as "//example.com".
		u.Scheme = "https"
	}

	return u.Scheme + "://" + u.Host, nil
}

0 comments on commit e515a3c

Please sign in to comment.