Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make CDP scraper less detectable #1361

Closed
wants to merge 3 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions pkg/scraper/url.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
"github.com/chromedp/cdproto/cdp"
"github.com/chromedp/cdproto/dom"
"github.com/chromedp/cdproto/network"
"github.com/chromedp/cdproto/page"
"github.com/chromedp/chromedp"
jsoniter "github.com/json-iterator/go"
"golang.org/x/net/html/charset"
Expand All @@ -30,6 +31,42 @@ import (
// Timing constants for the scraper, grouped in a single declaration.
const (
	// scrapeGetTimeout is a fixed duration of 60 seconds.
	// NOTE(review): presumably the upper bound for fetching a page; confirm against callers.
	scrapeGetTimeout = 60 * time.Second

	// scrapeDefaultSleep is a fixed duration of 2 seconds.
	// NOTE(review): presumably the fallback wait used when no sleep is configured; confirm against callers.
	scrapeDefaultSleep = 2 * time.Second
)

// bypassScript is JavaScript source, wrapped in an immediately-invoked
// function, that overrides browser properties commonly probed to detect
// headless Chrome:
//
//   - navigator.webdriver reports false
//   - navigator.plugins reports a non-empty array
//   - navigator.languages reports ['en-US', 'en']
//   - window.chrome is defined with an empty runtime object
//   - navigator.permissions.query answers 'notifications' queries with
//     Notification.permission and delegates every other query to the
//     browser's original implementation
//
// The original query method is bound to navigator.permissions before it is
// stored. Calling a detached native method without its receiver fails with
// "Illegal invocation", which would break every non-notification permission
// query on pages that run this script.
//
// see: https://intoli.com/blog/not-possible-to-block-chrome-headless/
const bypassScript = `(function(w, n, wn) {
// Pass the Webdriver Test.
Object.defineProperty(n, 'webdriver', {
get: () => false,
});

// Pass the Plugins Length Test.
// Overwrite the plugins property to use a custom getter.
Object.defineProperty(n, 'plugins', {
// This just needs to have length > 0 for the current test,
// but we could mock the plugins too if necessary.
get: () => [1, 2, 3, 4, 5],
});

// Pass the Languages Test.
// Overwrite the plugins property to use a custom getter.
Object.defineProperty(n, 'languages', {
get: () => ['en-US', 'en'],
});

// Pass the Chrome Test.
// We can mock this in as much depth as we need for the test.
w.chrome = {
runtime: {},
};

// Pass the Permissions Test.
const originalQuery = wn.permissions.query.bind(wn.permissions);
return wn.permissions.query = (parameters) => (
parameters.name === 'notifications' ?
Promise.resolve({ state: Notification.permission }) :
originalQuery(parameters)
);
})(window, navigator, window.navigator);`

func loadURL(url string, scraperConfig config, globalConfig GlobalConfig) (io.Reader, error) {
driverOptions := scraperConfig.DriverOptions
if driverOptions != nil && driverOptions.UseCDP {
Expand Down Expand Up @@ -172,6 +209,14 @@ func urlFromCDP(url string, driverOptions scraperDriverOptions, globalConfig Glo
setCDPCookies(driverOptions),
printCDPCookies(driverOptions, "Cookies found"),
network.SetExtraHTTPHeaders(network.Headers(headers)),
chromedp.ActionFunc(func(ctx context.Context) error {
var err error
_, err = page.AddScriptToEvaluateOnNewDocument(bypassScript).Do(ctx)
if err != nil {
return err
}
return nil
}),
chromedp.Navigate(url),
chromedp.Sleep(sleepDuration),
setCDPClicks(driverOptions),
Expand Down