V3.1.3 #41

Merged · 8 commits · Oct 22, 2022
109 changes: 107 additions & 2 deletions README.md
@@ -31,7 +31,14 @@
<li><a href="#profileArgument">Function Arguments</a></li>
<li><a href="#profileOutput">Keys of the output data</a></li>
</ul>
<li><a href="#keyword">Scraping tweets using keywords</a>
<li><a href="#keywordAPI">Scraping tweets using query with API</a>
<ul>
<li><a href="#keywordAPI">In JSON Format</a></li>
<li><a href="#scrape_keyword_with_apiArgs">Function Argument</a></li>
<li><a href="#scrape_keyword_with_apiKeys">Keys of the output.</a></li>
</ul>
</li>
<li><a href="#keyword">Scraping tweets using keywords with browser automation</a>
<ul>
<li><a href="#keywordJson">In JSON format</a></li>
<li><a href="#keywordCSV">In CSV format</a></li>
@@ -349,8 +356,105 @@ Output:
</div>
<br>
<hr>
<h3 id="keyword">To scrap tweets using keywords:</h3>
<h3 id="keywordAPI">To scrape tweets using keywords with API:</h3>
<div>

```python
from twitter_scraper_selenium import scrape_keyword_with_api

query = "#gaming"
tweets_count = 10
output_filename = "gaming_hashtag_data"
scrape_keyword_with_api(query=query, tweets_count=tweets_count, output_filename=output_filename)

```
Output:
```js
{
"1583821467732480001": {
"tweet_url" : "https://twitter.com/yakubblackbeard/status/1583821467732480001",
"tweet_detail":{
...
},
"user_detail":{
...
}
}
}
```
</div>
<br>
<div id="scrape_keyword_with_apiArgs">
<p><code>scrape_keyword_with_api()</code> arguments:</p>

<table>
<thead>
<tr>
<td>Argument</td>
<td>Argument Type</td>
<td>Description</td>
</tr>
</thead>
<tbody>
<tr>
<td>query</td>
<td>String</td>
<td>Query to search. An advanced-search query can be built <a href="https://developer.twitter.com/apitools/query">here</a>.</td>
</tr>
<tr>
<td>tweets_count</td>
<td>Integer</td>
<td>Number of tweets to scrape.</td>
</tr>
<tr>
<td>output_filename</td>
<td>String</td>
<td>What should be the filename where output is stored?</td>
</tr>
<tr>
<td>output_dir</td>
<td>String</td>
<td>In which directory should the output file be saved?</td>
</tr>
</tbody>
</table>

</div>
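<div>
<p>As a quick sketch, the arguments above can be combined with the optional <code>proxy</code> parameter from the function signature (the query, proxy credentials, and paths below are placeholders):</p>

```python
from twitter_scraper_selenium import scrape_keyword_with_api

# Save 50 tweets matching an advanced-search query to ./data/python_tweets.json,
# routing requests through an authenticated proxy (placeholder credentials).
scrape_keyword_with_api(
    query="#python since:2022-08-01 until:2022-10-01",
    tweets_count=50,
    output_filename="python_tweets",
    output_dir="./data",
    proxy="username:password@host:8080",
)
```
</div>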
<hr>
<br>
<div>
<p id="scrape_keyword_with_apiKeys">Keys of the output:</p>
<table>
<thead>
<tr>
<td>Key</td>
<td>Type</td>
<td>Description</td>
</tr>
</thead>
<tbody>
<tr>
<td>tweet_url</td>
<td>String</td>
<td>URL of the tweet.</td>
</tr>
<tr>
<td>tweet_detail</td>
<td>Dictionary</td>
<td>A dictionary containing the data about the tweet. All fields available inside it can be checked <a href="https://developer.twitter.com/en/docs/twitter-api/v1/data-dictionary/object-model/tweet">here</a>.</td>
</tr>
<tr>
<td>user_detail</td>
<td>Dictionary</td>
<td>A dictionary containing the data about the tweet's author. All fields available inside it can be checked <a href="https://developer.twitter.com/en/docs/twitter-api/v1/data-dictionary/object-model/user">here</a>.</td>
</tr>
</tbody>
</table>
</div>
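<div>
<p>When <code>output_filename</code> is omitted, the scraped data is returned instead of saved. A minimal sketch of consuming it (the nested field names follow Twitter's v1.1 tweet/user object models linked above):</p>

```python
from twitter_scraper_selenium import scrape_keyword_with_api

tweets = scrape_keyword_with_api(query="#gaming", tweets_count=10)
for tweet_id, post in tweets.items():
    print(tweet_id, post["tweet_url"])
    # screen_name is a standard v1.1 user-object field
    print(post["user_detail"]["screen_name"])
```
</div>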
<br>
<br>
<hr>
<h3 id="keyword">To scrape tweets using keywords with browser automation:</h3>
<div>
<p id="keywordJson">In JSON format:</p>

@@ -531,6 +635,7 @@ Output:
<hr>
<div id="keywordOutput">
<p>Keys of the output</p>

<table>
<thead>
<tr>
2 changes: 1 addition & 1 deletion setup.py
@@ -6,7 +6,7 @@

setuptools.setup(
name="twitter_scraper_selenium",
version="3.0.3",
version="3.1.3",
author="Sajid Shaikh",
author_email="shaikhsajid3732@gmail.com",
description="Python package to scrap twitter's front-end easily with selenium",
5 changes: 3 additions & 2 deletions twitter_scraper_selenium/__init__.py
@@ -5,8 +5,9 @@
from .keyword import scrap_keyword
from .profile import scrap_profile
from .topic import scrap_topic

from .keyword_api import scrape_keyword_with_api
#__all__ = ["Initializer",
# "Utilities", "Finder",
# "Scraping_utilities","scrap_profile","scrap_keyword"]
__all__ = ["scrap_profile", "scrap_keyword", "scrap_topic"]
__all__ = ["scrap_profile", "scrap_keyword",
"scrap_topic", "scrape_keyword_with_api"]
2 changes: 1 addition & 1 deletion twitter_scraper_selenium/element_finder.py
@@ -202,4 +202,4 @@ def find_profile_image_link(tweet) -> Union[str, None]:
try:
return tweet.find_element(By.CSS_SELECTOR, 'img[alt][draggable="true"]').get_attribute('src')
except Exception as ex:
logger.warning("Find Profile Image Link : {}".format(ex))
logger.warning("Error at find_profile_image_link : {}".format(ex))
121 changes: 121 additions & 0 deletions twitter_scraper_selenium/keyword_api.py
@@ -0,0 +1,121 @@
#!/usr/bin/env python3

from typing import Union
from .scraping_utilities import Scraping_utilities
import logging
from urllib.parse import quote
import os
import json

logger = logging.getLogger(__name__)
format = logging.Formatter(
"%(asctime)s - %(name)s - %(levelname)s - %(message)s")
ch = logging.StreamHandler()
ch.setFormatter(format)
logger.addHandler(ch)


class Keywords_api:
def __init__(self, query: str, proxy: Union[str, None],
tweets_count: int) -> None:
self.query = query
self.proxy = proxy
self.tweets_count = tweets_count
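        # The authorization key below is the publicly known bearer token used
        # by Twitter's web client; the guest token is fetched at runtime.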
self.x_guest_key = ''
self.authorization_key = \
'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
self.posts_data = {}

def parse_tweets(self, tweets, users):
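        # tweets is keyed by tweet ID and users by user ID; join each tweet
        # with its author via user_id_str.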
for key in tweets.keys():
user = users[tweets[key]['user_id_str']]
tweet = tweets[key]
self.posts_data[key] = {
"tweet_url": "https://twitter.com/{}/status/{}".format(user['screen_name'], key),
"tweet_detail": tweet,
"user_detail": user
}

def find_cursor(self, timeline):
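        # The pagination cursor lives under a "replaceEntry" instruction in
        # some responses and under the last "addEntries" entry in others.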
cursor = None
try:
cursor = timeline["instructions"][-1]["replaceEntry"]["entry"]["content"]["operation"]["cursor"]["value"]
except KeyError:
cursor = timeline["instructions"][0]["addEntries"]["entries"][-1]["content"]["operation"]["cursor"]["value"]
return cursor

def scrap(self):
try:
self.x_guest_key \
= Scraping_utilities.find_x_guest_token(self.authorization_key)
logger.setLevel(logging.INFO)
headers = Scraping_utilities.build_keyword_headers(
self.x_guest_key, self.authorization_key, quote(self.query))
cursor = None
retry = 5
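            # Page through results until enough tweets are collected, the
            # cursor repeatedly disappears, or a request fails.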
while len(self.posts_data) < self.tweets_count:
params = Scraping_utilities.build_params(self.query, cursor)
response = Scraping_utilities.make_http_request(
'https://twitter.com/i/api/2/search/adaptive.json', params,
headers, self.proxy)
if response:
tweets = response['globalObjects']['tweets']
users = response['globalObjects']['users']
timeline = response['timeline']
self.parse_tweets(tweets, users)
cursor = self.find_cursor(timeline)
                    # With no cursor the next page can't be requested;
                    # allow a few retries, then stop.
                    if cursor is None:
                        retry -= 1
                        if retry <= 0:
                            logger.info("Can't find more tweets!")
                            break
                    logger.info('Number of Tweets scraped : {}'.format(
                        len(self.posts_data)))
else:
logger.warning('Failed to make request!')
break
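            # A page can return more tweets than requested; trim the result
            # down to exactly tweets_count entries.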
data = dict(list(self.posts_data.items())
[0:int(self.tweets_count)])
return data
except Exception as ex:
logger.warning('Error at scrap : {}'.format(ex))


def scrape_keyword_with_api(query: str, proxy: Union[str, None] = None,
tweets_count: int = 10,
output_filename: Union[str, None] = None,
output_dir: Union[str, None] = os.getcwd()):
"""Function to scrape tweets from Twitter API using provided query.

Args:
query (str): query to search.
proxy (Union[str, None], optional): Optional parameter, if user wants to use proxy for scraping. If the proxy is authenticated proxy then the proxy format is username:password@host:port. Defaults to None.. Defaults to None.
tweets_count (int, optional): Number of Tweets to scrape. Defaults to 10.
output_filename (Union[str, None], optional): Name of the output JSON file. Defaults to None.
output_dir (Union[str, None], optional): Directory where to save the file. Defaults to os.getcwd().

Returns:
(dict | none): None if data was saved, else JSON String.
"""
keyword_scraper = Keywords_api(query, proxy, tweets_count)
data = keyword_scraper.scrap()
if output_filename:
path = os.path.join(output_dir, "{}.json".format(output_filename))
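        # If the file already exists, read it first so new tweets are merged
        # with previously saved data instead of overwriting it.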
mode = 'a'
if os.path.exists(path):
mode = 'r'
with open(path, mode, encoding='utf-8') as file:
if mode == 'r':
try:
file_content = file.read()
content = json.loads(file_content)
except json.decoder.JSONDecodeError:
logger.warning('Invalid JSON Detected!')
content = {}
file.close()
data.update(content)
with open(path, 'w', encoding='utf-8') as file_in_write_mode:
json.dump(data, file_in_write_mode)
logger.info('Data was saved to {}'.format(path))
else:
return data
102 changes: 102 additions & 0 deletions twitter_scraper_selenium/scraping_utilities.py
@@ -3,6 +3,8 @@
from typing import Union
from urllib.parse import quote
import logging
import requests
from fake_headers import Headers

logger = logging.getLogger(__name__)
format = logging.Formatter(
@@ -89,3 +91,103 @@ def url_generator(keyword: str, since: Union[int, None] = None, until: Union[str
query = keyword + " " + word
base_url = base_url + quote(query) + "&src=typed_query&f=live"
return base_url

@staticmethod
def make_http_request(URL, params, headers, proxy=None):
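        # Return the parsed JSON body on HTTP 200; any error or non-200
        # response yields None. Requests go through the proxy when given.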
try:
response = None
if proxy:
proxy_dict = {
"http": "http://{}".format(proxy),
"https": "http://{}".format(proxy)
}
response = requests.get(URL, params=params, headers=headers,
proxies=proxy_dict)
else:
response = requests.get(URL, params=params, headers=headers)
if response and response.status_code == 200:
return response.json()
except Exception as ex:
logger.warning("Error at make_http_request: {}".format(ex))

@staticmethod
def build_params(query, cursor=None):
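        # These parameters mirror what Twitter's web client sends to the
        # adaptive search endpoint; the cursor is added for later pages.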
params = {
'include_profile_interstitial_type': '1',
'include_blocking': '1',
'include_blocked_by': '1',
'include_followed_by': '1',
'include_want_retweets': '1',
'include_mute_edge': '1',
'include_can_dm': '1',
'include_can_media_tag': '1',
'include_ext_has_nft_avatar': '1',
'skip_status': '1',
'cards_platform': 'Web-12',
'include_cards': '1',
'include_ext_alt_text': 'true',
'include_ext_limited_action_results': 'false',
'include_quote_count': 'true',
'include_reply_count': '1',
'tweet_mode': 'extended',
'include_ext_collab_control': 'true',
'include_entities': 'true',
'include_user_entities': 'true',
'include_ext_media_color': 'true',
'include_ext_media_availability': 'true',
'include_ext_sensitive_media_warning': 'true',
'include_ext_trusted_friends_metadata': 'true',
'send_error_codes': 'true',
'simple_quoted_tweet': 'true',
'q': query,
'vertical': 'trends',
'count': '20',
'query_source': 'trend_click',
'pc': '1',
'spelling_corrections': '1',
'include_ext_edit_control': 'true',
'ext': 'mediaStats,highlightedLabel,hasNftAvatar,voiceInfo,enrichments,superFollowMetadata,unmentionInfo,editControl,collab_control,vibe',
}
if cursor:
params['cursor'] = cursor
return params

@staticmethod
def build_keyword_headers(x_guest_token, authorization_key, query):
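        # Headers imitate a real browser session; fake_headers supplies a
        # plausible User-Agent string.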
headers = {
'authority': 'twitter.com',
'accept': '*/*',
'accept-language': 'en-US,en;q=0.9',
'authorization': authorization_key,
'referer': "https://twitter.com/search?q={}".format(query),
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'user-agent': Headers().generate()['User-Agent'],
'x-guest-token': x_guest_token,
'x-twitter-active-user': 'yes',
'x-twitter-client-language': 'en',
}
return headers

@staticmethod
def find_x_guest_token(authorization_key, proxy=None):
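        # A guest token authorizes unauthenticated API calls; Twitter issues
        # one when guest/activate.json is POSTed with the bearer token.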
try:
headers = {
'authorization': authorization_key,
}
response = None
if proxy:
proxy_dict = {
"http": "http://{}".format(proxy),
"https": "http://{}".format(proxy),
}
response = requests.post(
'https://api.twitter.com/1.1/guest/activate.json', headers=headers, proxies=proxy_dict)
return response.json()['guest_token']

response = requests.post(
'https://api.twitter.com/1.1/guest/activate.json', headers=headers)
return response.json()['guest_token']
except Exception as ex:
logger.warning("Error at find_x_guest_token: {}".format(ex))