Skip to content

Commit

Permalink
Merge branch 'betterProxy'
Browse files Browse the repository at this point in the history
  • Loading branch information
gosom committed Nov 2, 2024
2 parents e79a2b6 + 0d8e5f6 commit b504102
Show file tree
Hide file tree
Showing 8 changed files with 354 additions and 51 deletions.
5 changes: 3 additions & 2 deletions adapters/fetchers/jshttp/jshttp.go
Original file line number Diff line number Diff line change
Expand Up @@ -160,8 +160,9 @@ func newBrowser(pw *playwright.Playwright, headless, disableImages bool, rotator

next := rotator.Next()

srv := "socks5://" + next
username, password := rotator.GetCredentials()
srv := next.URL
username := next.Username
password := next.Password

return &playwright.Proxy{
Server: srv,
Expand Down
52 changes: 28 additions & 24 deletions adapters/proxy/proxy.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,59 +6,63 @@ import (
"net/url"
"sync"
"sync/atomic"

"github.com/gosom/scrapemate"
)

type Rotator struct {
proxies []string
username string
password string
current uint32
cache sync.Map
proxies []scrapemate.Proxy
current uint32
cache sync.Map
}

func New(proxies []string, username, password string) *Rotator {
func New(proxies []string) *Rotator {
if len(proxies) == 0 {
panic("no proxies provided")
}

return &Rotator{
proxies: proxies,
username: username,
password: password,
current: 0,
plist := make([]scrapemate.Proxy, len(proxies))

for i := range proxies {
p, err := scrapemate.NewProxy(proxies[i])
if err != nil {
panic(err)
}

plist[i] = p
}
}

//nolint:gocritic // no need to change the signature
func (pr *Rotator) GetCredentials() (string, string) {
return pr.username, pr.password
return &Rotator{
proxies: plist,
current: 0,
}
}

func (pr *Rotator) Next() string {
current := atomic.AddUint32(&pr.current, 1)
func (pr *Rotator) Next() scrapemate.Proxy {
current := atomic.AddUint32(&pr.current, 1) - 1

return pr.proxies[current%uint32(len(pr.proxies))] //nolint:gosec // no overflow here
}

func (pr *Rotator) RoundTrip(req *http.Request) (*http.Response, error) {
proxyAddr := pr.Next()
next := pr.Next()

transport, ok := pr.cache.Load(proxyAddr)
transport, ok := pr.cache.Load(next.URL)
if !ok {
proxyURL, err := url.Parse("socks5://" + proxyAddr)
proxyURL, err := url.Parse(next.URL)
if err != nil {
return nil, fmt.Errorf("error parsing proxy URL for %s: %v", proxyAddr, err)
return nil, fmt.Errorf("error parsing proxy URL for %s: %v", next.URL, err)
}

if pr.username != "" && pr.password != "" {
proxyURL.User = url.UserPassword(pr.username, pr.password)
if next.Username != "" && next.Password != "" {
proxyURL.User = url.UserPassword(next.Username, next.Password)
}

transport = &http.Transport{
Proxy: http.ProxyURL(proxyURL),
}

pr.cache.Store(proxyAddr, transport)
pr.cache.Store(next.URL, transport)
}

return transport.(*http.Transport).RoundTrip(req)
Expand Down
165 changes: 165 additions & 0 deletions adapters/proxy/proxy_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
package proxy //nolint:testpackage // need access to internal fields for testing

import (
"net/http"
"net/http/httptest"
"sync"
"testing"

"github.com/stretchr/testify/require"
)

// TestNewRotator exercises the Rotator constructor: it must keep the given
// proxies in order, and panic when the list is empty or contains an entry
// with an unsupported scheme.
func TestNewRotator(t *testing.T) {
	t.Run("creates rotator with valid proxies", func(t *testing.T) {
		want := []string{
			"socks5://proxy1.example.com:1080",
			"http://proxy2.example.com:8080",
			"https://proxy3.example.com:8080",
		}

		rotator := New(want)
		require.NotNil(t, rotator)
		require.Len(t, rotator.proxies, 3)

		// The stored proxies must mirror the input, position by position.
		for i, u := range want {
			require.Equal(t, u, rotator.proxies[i].URL)
		}
	})

	t.Run("panics with empty proxy list", func(t *testing.T) {
		construct := func() {
			New([]string{})
		}

		require.Panics(t, construct)
	})

	t.Run("panics with invalid proxy URL", func(t *testing.T) {
		construct := func() {
			New([]string{"invalid://proxy"})
		}

		require.Panics(t, construct)
	})
}

// TestRotatorNext checks that successive Next calls walk the proxy list in
// input order, beginning with the first entry, and wrap back to the start
// once the list is exhausted.
func TestRotatorNext(t *testing.T) {
	urls := []string{
		"socks5://proxy1.example.com:1080",
		"http://proxy2.example.com:8080",
		"socks5://proxy3.example.com:1080",
		"https://proxy4.example.com:8080",
	}

	rotator := New(urls)

	t.Run("rotates through proxies in order", func(t *testing.T) {
		// One full pass over the list plus one extra call to confirm the
		// wrap-around to the first proxy.
		for call := 0; call <= len(urls); call++ {
			got := rotator.Next()
			require.Equal(t, urls[call%len(urls)], got.URL)
		}
	})
}

func TestRotatorRoundTrip(t *testing.T) {
testServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
w.WriteHeader(http.StatusOK)
}))
defer testServer.Close()

t.Run("creates and caches transport", func(t *testing.T) {
proxies := []string{
"http://proxy1.example.com:8080",
"http://proxy2.example.com:8080",
}
r := New(proxies)

req, err := http.NewRequest("GET", testServer.URL, http.NoBody)
require.NoError(t, err)

_, err = r.RoundTrip(req) //nolint:bodyclose // this is a test
require.Error(t, err)

transport, ok := r.cache.Load("http://proxy1.example.com:8080")
require.True(t, ok)
require.NotNil(t, transport)
})

t.Run("uses credentials when provided", func(t *testing.T) {
proxies := []string{
"http://proxy.example.com:8080",
}
r := New(proxies)
r.proxies[0].Username = "user"
r.proxies[0].Password = "pass"

req, err := http.NewRequest("GET", testServer.URL, http.NoBody)
require.NoError(t, err)

_, err = r.RoundTrip(req) //nolint:bodyclose // this is a test
require.Error(t, err)

transport, ok := r.cache.Load("http://proxy.example.com:8080")
require.True(t, ok)
require.NotNil(t, transport)
})

t.Run("handles invalid proxy URL", func(t *testing.T) {
proxies := []string{
"http://proxy.example.com:8080",
}
r := New(proxies)
r.proxies[0].URL = ":\\invalid"

req, err := http.NewRequest("GET", testServer.URL, http.NoBody)
require.NoError(t, err)

_, err = r.RoundTrip(req) //nolint:bodyclose // this is a test
require.Error(t, err)
require.Contains(t, err.Error(), "error parsing proxy URL")
})
}

// TestRotatorConcurrency calls Next from many goroutines at once and checks
// that every configured proxy, and nothing else, is handed out.
func TestRotatorConcurrency(t *testing.T) {
	urls := []string{
		"socks5://proxy1.example.com:1080",
		"http://proxy2.example.com:8080",
	}

	rotator := New(urls)

	t.Run("handles concurrent access", func(t *testing.T) {
		const workers = 100

		var (
			wg   sync.WaitGroup
			mu   sync.Mutex // guards seen
			seen = make(map[string]bool)
		)

		wg.Add(workers)

		for i := 0; i < workers; i++ {
			go func() {
				defer wg.Done()

				picked := rotator.Next().URL

				mu.Lock()
				defer mu.Unlock()

				seen[picked] = true
			}()
		}

		wg.Wait()

		require.Len(t, seen, 2)

		for _, u := range urls {
			require.True(t, seen[u])
		}
	})
}
57 changes: 57 additions & 0 deletions proxy.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
package scrapemate

import (
"fmt"
"net/url"
"strings"
)

// Proxy describes a single upstream proxy endpoint, with the address and the
// credentials kept in separate fields.
type Proxy struct {
	// URL is the proxy address in "scheme://host[:port]" form, without any
	// userinfo, path or query.
	URL string
	// Username is the user name taken from the proxy URL's userinfo; it is
	// empty when the URL carried none.
	Username string
	// Password is the password taken from the proxy URL's userinfo; it is
	// empty when the URL carried none.
	Password string
}

// NewProxy parses u into a Proxy.
//
// u may be a full URL ("http://user:pass@host:8080") or a bare address
// ("host:1080"); when u contains no "://" the socks5 scheme is assumed.
// Only the socks5, http and https schemes are accepted. Credentials embedded
// in u are moved into Username and Password, and URL is rebuilt from the
// scheme and host alone.
//
// An error is returned when u cannot be parsed, uses an unsupported scheme,
// or has no host part (for example "" or "socks5://").
func NewProxy(u string) (Proxy, error) {
	if !strings.Contains(u, "://") {
		u = "socks5://" + u
	}

	pu, err := url.Parse(u)
	if err != nil {
		return Proxy{}, fmt.Errorf("parsing proxy URL: %w", err)
	}

	scheme := strings.ToLower(pu.Scheme)

	switch scheme {
	case "socks5", "http", "https":
		// supported
	default:
		return Proxy{}, fmt.Errorf("invalid proxy type: %s", scheme)
	}

	// url.Parse accepts inputs such as "socks5://" that have no authority at
	// all; without this check they would yield a Proxy with no address.
	if pu.Host == "" {
		return Proxy{}, fmt.Errorf("missing host in %s proxy URL", scheme)
	}

	var username, password string
	if pu.User != nil {
		username = pu.User.Username()
		// The "is set" flag is irrelevant here: an unset password is
		// represented by the empty string.
		password, _ = pu.User.Password()
	}

	return Proxy{
		URL:      scheme + "://" + pu.Host,
		Username: username,
		Password: password,
	}, nil
}
Loading

0 comments on commit b504102

Please sign in to comment.