Custom httpx proxy and user_agent for Scraper #220

Open · wants to merge 7 commits into main
23 changes: 23 additions & 0 deletions readme.md
@@ -27,6 +27,29 @@
pip install twitter-api-client -U
```

## New
Scraper now supports custom httpx proxy settings and a custom user agent (a user-agent sketch follows the proxy example below).

```bash
pip install "httpx[socks]"
```

```python
from twitter.scraper import Scraper

# map both http and https traffic to the same SOCKS5 proxy
proxy_url = "socks5://username:password@host:port"
httpx_proxies = {"http://": proxy_url, "https://": proxy_url}

# resume a session using cookies (JSON file) and route requests through the proxy
scraper = Scraper(cookies='twitter.cookies', httpx_proxies=httpx_proxies)

# or use the scraper as usual, without a proxy
scraper = Scraper(cookies='twitter.cookies')
```
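
The constructor also accepts a `user_agent` parameter to override the randomly chosen default; a minimal sketch (the UA string below is only an example):

```python
from twitter.scraper import Scraper

# pass a fixed user agent instead of a randomly chosen one
custom_ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
scraper = Scraper(cookies='twitter.cookies', user_agent=custom_ua)
```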


### Automation

![](assets/account.gif)
31 changes: 19 additions & 12 deletions twitter/scraper.py
@@ -32,7 +32,10 @@


class Scraper:
def __init__(self, email: str = None, username: str = None, password: str = None, session: Client = None, **kwargs):
def __init__(self, email: str = None, username: str = None, password: str = None, session: Client = None, httpx_proxies: dict = {}, user_agent: str = None, **kwargs):
self.httpx_proxies = httpx_proxies
self.user_agent = user_agent

self.save = kwargs.get('save', True)
self.debug = kwargs.get('debug', 0)
self.pbar = kwargs.get('pbar', True)
@@ -267,8 +270,12 @@ async def process(fns: Generator) -> list:
'max_keepalive_connections': kwargs.pop('max_keepalive_connections', None),
'keepalive_expiry': kwargs.pop('keepalive_expiry', 5.0),
}
headers = {'user-agent': random.choice(USER_AGENTS)}
async with AsyncClient(limits=Limits(**limits), headers=headers, http2=True, verify=False, timeout=60, follow_redirects=True) as client:
if self.user_agent is not None:
headers = {'user-agent': self.user_agent}
else:
headers = {'user-agent': random.choice(USER_AGENTS)}

async with AsyncClient(proxies=self.httpx_proxies, limits=Limits(**limits), headers=headers, http2=True, verify=False, timeout=60, follow_redirects=True) as client:
return await tqdm_asyncio.gather(*(fn(client=client) for fn in fns), desc='Downloading Media')

def download(urls: list[tuple], out: str) -> Generator:
@@ -358,7 +365,7 @@ async def process():
offsets = utc or ["-1200", "-1100", "-1000", "-0900", "-0800", "-0700", "-0600", "-0500", "-0400", "-0300",
"-0200", "-0100", "+0000", "+0100", "+0200", "+0300", "+0400", "+0500", "+0600", "+0700",
"+0800", "+0900", "+1000", "+1100", "+1200", "+1300", "+1400"]
async with AsyncClient(headers=get_headers(self.session)) as client:
async with AsyncClient(proxies=self.httpx_proxies, headers=get_headers(self.session)) as client:
tasks = (get_trends(client, o, url) for o in offsets)
if self.pbar:
return await tqdm_asyncio.gather(*tasks, desc='Getting trends')
@@ -516,7 +523,7 @@ async def process():
limits = Limits(max_connections=100, max_keepalive_connections=10)
headers = self.session.headers if self.guest else get_headers(self.session)
cookies = self.session.cookies
async with AsyncClient(limits=limits, headers=headers, cookies=cookies, timeout=20) as c:
async with AsyncClient(proxies=self.httpx_proxies, limits=limits, headers=headers, cookies=cookies, timeout=20) as c:
tasks = (get(c, key) for key in keys)
if self.pbar:
return await tqdm_asyncio.gather(*tasks, desc='Downloading chat data')
@@ -533,7 +540,7 @@ async def process(data: list[dict]) -> list:
limits = Limits(max_connections=100, max_keepalive_connections=10)
headers = self.session.headers if self.guest else get_headers(self.session)
cookies = self.session.cookies
async with AsyncClient(limits=limits, headers=headers, cookies=cookies, timeout=20) as c:
async with AsyncClient(proxies=self.httpx_proxies, limits=limits, headers=headers, cookies=cookies, timeout=20) as c:
tasks = []
for d in data:
tasks.extend([get(c, chunk, d['rest_id']) for chunk in d['chunks']])
@@ -564,7 +571,7 @@ async def process():
limits = Limits(max_connections=100, max_keepalive_connections=10)
headers = self.session.headers if self.guest else get_headers(self.session)
cookies = self.session.cookies
async with AsyncClient(limits=limits, headers=headers, cookies=cookies, timeout=20) as c:
async with AsyncClient(proxies=self.httpx_proxies, limits=limits, headers=headers, cookies=cookies, timeout=20) as c:
return await asyncio.gather(*(get(c, key) for key in keys))

return asyncio.run(process())
@@ -609,7 +616,7 @@ async def _query(self, client: AsyncClient, operation: tuple, **kwargs) -> Response:
async def _process(self, operation: tuple, queries: list[dict], **kwargs):
headers = self.session.headers if self.guest else get_headers(self.session)
cookies = self.session.cookies
async with AsyncClient(limits=Limits(max_connections=MAX_ENDPOINT_LIMIT), headers=headers, cookies=cookies, timeout=20) as c:
async with AsyncClient(proxies=self.httpx_proxies, limits=Limits(max_connections=MAX_ENDPOINT_LIMIT), headers=headers, cookies=cookies, timeout=20) as c:
tasks = (self._paginate(c, operation, **q, **kwargs) for q in queries)
if self.pbar:
return await tqdm_asyncio.gather(*tasks, desc=operation[-1])
@@ -739,7 +746,7 @@ async def get(c: AsyncClient, space: dict) -> list[dict]:
return r.json()

limits = Limits(max_connections=100)
async with AsyncClient(headers=client.headers, limits=limits, timeout=30) as c:
async with AsyncClient(proxies=self.httpx_proxies, headers=client.headers, limits=limits, timeout=30) as c:
tasks = (get(c, _id) for _id in spaces)
if self.pbar:
return await tqdm_asyncio.gather(*tasks, desc='Getting live transcripts')
@@ -838,7 +845,7 @@ async def poll_space(client: AsyncClient, space: dict) -> dict | None:
async def process(spaces: list[dict]):
limits = Limits(max_connections=100)
headers, cookies = self.session.headers, self.session.cookies
async with AsyncClient(limits=limits, headers=headers, cookies=cookies, timeout=20) as c:
async with AsyncClient(proxies=self.httpx_proxies, limits=limits, headers=headers, cookies=cookies, timeout=20) as c:
return await asyncio.gather(*(poll_space(c, space) for space in spaces))

spaces = self.spaces(rooms=rooms)
@@ -875,13 +882,13 @@ def _validate_session(self, *args, **kwargs):

# try validating cookies dict
if isinstance(cookies, dict) and all(cookies.get(c) for c in {'ct0', 'auth_token'}):
_session = Client(cookies=cookies, follow_redirects=True)
_session = Client(proxies=self.httpx_proxies, cookies=cookies, follow_redirects=True)
_session.headers.update(get_headers(_session))
return _session

# try validating cookies from file
if isinstance(cookies, str):
_session = Client(cookies=orjson.loads(Path(cookies).read_bytes()), follow_redirects=True)
_session = Client(proxies=self.httpx_proxies, cookies=orjson.loads(Path(cookies).read_bytes()), follow_redirects=True)
_session.headers.update(get_headers(_session))
return _session
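
To confirm the proxy mapping is honored end to end, here is a small, self-contained check, independent of this repo, that builds an httpx Client the same way _validate_session does after this change (the SOCKS URL and credentials are placeholders, and it assumes an httpx version that still accepts the proxies argument, as this PR does):

```python
import httpx  # requires httpx[socks] for socks5:// URLs

# placeholder proxy; substitute a real SOCKS5 endpoint and credentials
proxy_url = "socks5://username:password@host:port"
httpx_proxies = {"http://": proxy_url, "https://": proxy_url}

# mirror how _validate_session constructs the session with proxies applied
with httpx.Client(proxies=httpx_proxies, follow_redirects=True) as session:
    # any IP-echo service works; httpbin.org/ip is just an example
    print(session.get("https://httpbin.org/ip").json())
```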
