V3.1.3 #41

Merged · 8 commits · Oct 22, 2022
109 changes: 107 additions & 2 deletions README.md
@@ -31,7 +31,14 @@
<li><a href="#profileArgument">Function Arguments</a></li>
<li><a href="#profileOutput">Keys of the output data</a></li>
</ul>
<li><a href="#keyword">Scraping tweets using keywords</a>
<li><a href="#keywordAPI">Scraping tweets using query with API</a>
<ul>
<li><a href="#keywordAPI">In JSON Format</a></li>
<li><a href="#scrape_keyword_with_apiArgs">Function Argument</a></li>
<li><a href="#scrape_keyword_with_apiKeys">Keys of the output.</a></li>
</ul>
</li>
<li><a href="#keyword">Scraping tweets using keywords with browser automation</a>
<ul>
<li><a href="#keywordJson">In JSON format</a></li>
<li><a href="#keywordCSV">In CSV format</a></li>
@@ -349,8 +356,105 @@ Output:
</div>
<br>
<hr>
<h3 id="keyword">To scrap tweets using keywords:</h3>
<h3 id="keywordAPI">To scrape tweets using keywords with API:</h3>
<div>

```python
from twitter_scraper_selenium import scrape_keyword_with_api

query = "#gaming"
tweets_count = 10
output_filename = "gaming_hashtag_data"
scrape_keyword_with_api(query=query, tweets_count=tweets_count, output_filename=output_filename)

```
Output:
```js
{
"1583821467732480001": {
"tweet_url" : "https://twitter.com/yakubblackbeard/status/1583821467732480001",
"tweet_detail":{
...
},
"user_detail":{
...
}
}
}
```
</div>
<br>
<div id="scrape_keyword_with_apiArgs">
<p><code>scrape_keyword_with_api()</code> arguments:</p>

<table>
<thead>
<tr>
<td>Argument</td>
<td>Argument Type</td>
<td>Description</td>
</tr>
</thead>
<tbody>
<tr>
<td>query</td>
<td>String</td>
<td>Query to search. An advanced-search query can be built <a href="https://developer.twitter.com/apitools/query">here</a>.</td>
</tr>
<tr>
<td>tweets_count</td>
<td>Integer</td>
<td>Number of tweets to scrape.</td>
</tr>
<tr>
<td>output_filename</td>
<td>String</td>
<td>What should be the filename where output is stored?</td>
</tr>
<tr>
<td>output_dir</td>
<td>String</td>
<td>In which directory should the output file be saved?</td>
</tr>
</tbody>
</table>

</div>
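<div>
<p>As a quick sketch, the arguments above can be combined with the optional <code>proxy</code> parameter from the function signature (the query, proxy credentials, and paths below are placeholders):</p>

```python
from twitter_scraper_selenium import scrape_keyword_with_api

# Save 50 tweets matching an advanced-search query to ./data/python_tweets.json,
# routing requests through an authenticated proxy (placeholder credentials).
scrape_keyword_with_api(
    query="#python since:2022-08-01 until:2022-10-01",
    tweets_count=50,
    output_filename="python_tweets",
    output_dir="./data",
    proxy="username:password@host:8080",
)
```
</div>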
<hr>
<br>
<div>
<p id="scrape_keyword_with_apiKeys">Keys of the output:</p>
<table>
<thead>
<tr>
<td>Key</td>
<td>Type</td>
<td>Description</td>
</tr>
</thead>
<tbody>
<tr>
<td>tweet_url</td>
<td>String</td>
<td>URL of the tweet.</td>
</tr>
<tr>
<td>tweet_detail</td>
<td>Dictionary</td>
<td>A dictionary containing the data about the tweet. All fields available inside it can be checked <a href="https://developer.twitter.com/en/docs/twitter-api/v1/data-dictionary/object-model/tweet">here</a>.</td>
</tr>
<tr>
<td>user_detail</td>
<td>Dictionary</td>
<td>A dictionary containing the data about the tweet's author. All fields available inside it can be checked <a href="https://developer.twitter.com/en/docs/twitter-api/v1/data-dictionary/object-model/user">here</a>.</td>
</tr>
</tbody>
</table>
</div>
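<div>
<p>When <code>output_filename</code> is omitted, the scraped data is returned instead of saved. A minimal sketch of consuming it (the nested field names follow Twitter's v1.1 tweet/user object models linked above):</p>

```python
from twitter_scraper_selenium import scrape_keyword_with_api

tweets = scrape_keyword_with_api(query="#gaming", tweets_count=10)
for tweet_id, post in tweets.items():
    print(tweet_id, post["tweet_url"])
    # screen_name is a standard v1.1 user-object field
    print(post["user_detail"]["screen_name"])
```
</div>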
<br>
<br>
<hr>
<h3 id="keyword">To scrape tweets using keywords with browser automation:</h3>
<div>
<p id="keywordJson">In JSON format:</p>

@@ -531,6 +635,7 @@ Output:
<hr>
<div id="keywordOutput">
<p>Keys of the output</p>

<table>
<thead>
<tr>
2 changes: 1 addition & 1 deletion setup.py
@@ -6,7 +6,7 @@

setuptools.setup(
name="twitter_scraper_selenium",
version="3.0.3",
version="3.1.3",
author="Sajid Shaikh",
author_email="shaikhsajid3732@gmail.com",
description="Python package to scrap twitter's front-end easily with selenium",
5 changes: 3 additions & 2 deletions twitter_scraper_selenium/__init__.py
@@ -5,8 +5,9 @@
from .keyword import scrap_keyword
from .profile import scrap_profile
from .topic import scrap_topic

from .keyword_api import scrape_keyword_with_api
#__all__ = ["Initializer",
# "Utilities", "Finder",
# "Scraping_utilities","scrap_profile","scrap_keyword"]
__all__ = ["scrap_profile", "scrap_keyword", "scrap_topic"]
__all__ = ["scrap_profile", "scrap_keyword",
"scrap_topic", "scrape_keyword_with_api"]
2 changes: 1 addition & 1 deletion twitter_scraper_selenium/element_finder.py
@@ -202,4 +202,4 @@ def find_profile_image_link(tweet) -> Union[str, None]:
try:
return tweet.find_element(By.CSS_SELECTOR, 'img[alt][draggable="true"]').get_attribute('src')
except Exception as ex:
logger.warning("Find Profile Image Link : {}".format(ex))
logger.warning("Error at find_profile_image_link : {}".format(ex))
121 changes: 121 additions & 0 deletions twitter_scraper_selenium/keyword_api.py
@@ -0,0 +1,121 @@
#!/usr/bin/env python3

from typing import Union
from .scraping_utilities import Scraping_utilities
import logging
from urllib.parse import quote
import os
import json

logger = logging.getLogger(__name__)
format = logging.Formatter(
"%(asctime)s - %(name)s - %(levelname)s - %(message)s")
ch = logging.StreamHandler()
ch.setFormatter(format)
logger.addHandler(ch)


class Keywords_api:
def __init__(self, query: str, proxy: Union[str, None],
tweets_count: int) -> None:
self.query = query
self.proxy = proxy
self.tweets_count = tweets_count
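        # The authorization key below is the publicly known bearer token used
        # by Twitter's web client; the guest token is fetched at runtime.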
self.x_guest_key = ''
self.authorization_key = \
'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
self.posts_data = {}

def parse_tweets(self, tweets, users):
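        # tweets is keyed by tweet ID and users by user ID; join each tweet
        # with its author via user_id_str.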
for key in tweets.keys():
user = users[tweets[key]['user_id_str']]
tweet = tweets[key]
self.posts_data[key] = {
"tweet_url": "https://twitter.com/{}/status/{}".format(user['screen_name'], key),
"tweet_detail": tweet,
"user_detail": user
}

def find_cursor(self, timeline):
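        # The pagination cursor lives under a "replaceEntry" instruction in
        # some responses and under the last "addEntries" entry in others.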
cursor = None
try:
cursor = timeline["instructions"][-1]["replaceEntry"]["entry"]["content"]["operation"]["cursor"]["value"]
except KeyError:
cursor = timeline["instructions"][0]["addEntries"]["entries"][-1]["content"]["operation"]["cursor"]["value"]
return cursor

def scrap(self):
try:
self.x_guest_key \
= Scraping_utilities.find_x_guest_token(self.authorization_key)
logger.setLevel(logging.INFO)
headers = Scraping_utilities.build_keyword_headers(
self.x_guest_key, self.authorization_key, quote(self.query))
cursor = None
retry = 5
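            # Page through results until enough tweets are collected, the
            # cursor repeatedly disappears, or a request fails.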
while len(self.posts_data) < self.tweets_count:
params = Scraping_utilities.build_params(self.query, cursor)
response = Scraping_utilities.make_http_request(
'https://twitter.com/i/api/2/search/adaptive.json', params,
headers, self.proxy)
if response:
tweets = response['globalObjects']['tweets']
users = response['globalObjects']['users']
timeline = response['timeline']
self.parse_tweets(tweets, users)
cursor = self.find_cursor(timeline)
                    # With no cursor the next page can't be requested;
                    # allow a few retries, then stop.
                    if cursor is None:
                        retry -= 1
                        if retry <= 0:
                            logger.info("Can't find more tweets!")
                            break
                    logger.info('Number of Tweets scraped : {}'.format(
                        len(self.posts_data)))
else:
logger.warning('Failed to make request!')
break
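            # A page can return more tweets than requested; trim the result
            # down to exactly tweets_count entries.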
data = dict(list(self.posts_data.items())
[0:int(self.tweets_count)])
return data
except Exception as ex:
logger.warning('Error at scrap : {}'.format(ex))


def scrape_keyword_with_api(query: str, proxy: Union[str, None] = None,
tweets_count: int = 10,
output_filename: Union[str, None] = None,
output_dir: Union[str, None] = os.getcwd()):
"""Function to scrape tweets from Twitter API using provided query.

Args:
query (str): query to search.
proxy (Union[str, None], optional): Optional parameter, if user wants to use proxy for scraping. If the proxy is authenticated proxy then the proxy format is username:password@host:port. Defaults to None.. Defaults to None.
tweets_count (int, optional): Number of Tweets to scrape. Defaults to 10.
output_filename (Union[str, None], optional): Name of the output JSON file. Defaults to None.
output_dir (Union[str, None], optional): Directory where to save the file. Defaults to os.getcwd().

Returns:
(dict | none): None if data was saved, else JSON String.
"""
keyword_scraper = Keywords_api(query, proxy, tweets_count)
data = keyword_scraper.scrap()
if output_filename:
path = os.path.join(output_dir, "{}.json".format(output_filename))
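        # If the file already exists, read it first so new tweets are merged
        # with previously saved data instead of overwriting it.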
mode = 'a'
if os.path.exists(path):
mode = 'r'
with open(path, mode, encoding='utf-8') as file:
if mode == 'r':
try:
file_content = file.read()
content = json.loads(file_content)
except json.decoder.JSONDecodeError:
logger.warning('Invalid JSON Detected!')
content = {}
file.close()
data.update(content)
with open(path, 'w', encoding='utf-8') as file_in_write_mode:
json.dump(data, file_in_write_mode)
logger.info('Data was saved to {}'.format(path))
else:
return data
102 changes: 102 additions & 0 deletions twitter_scraper_selenium/scraping_utilities.py
@@ -3,6 +3,8 @@
from typing import Union
from urllib.parse import quote
import logging
import requests
from fake_headers import Headers

logger = logging.getLogger(__name__)
format = logging.Formatter(
@@ -89,3 +91,103 @@ def url_generator(keyword: str, since: Union[int, None] = None, until: Union[str
query = keyword + " " + word
base_url = base_url + quote(query) + "&src=typed_query&f=live"
return base_url

@staticmethod
def make_http_request(URL, params, headers, proxy=None):
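        # Return the parsed JSON body on HTTP 200; any error or non-200
        # response yields None. Requests go through the proxy when given.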
try:
response = None
if proxy:
proxy_dict = {
"http": "http://{}".format(proxy),
"https": "http://{}".format(proxy)
}
response = requests.get(URL, params=params, headers=headers,
proxies=proxy_dict)
else:
response = requests.get(URL, params=params, headers=headers)
if response and response.status_code == 200:
return response.json()
except Exception as ex:
logger.warning("Error at make_http_request: {}".format(ex))

@staticmethod
def build_params(query, cursor=None):
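        # These parameters mirror what Twitter's web client sends to the
        # adaptive search endpoint; the cursor is added for later pages.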
params = {
'include_profile_interstitial_type': '1',
'include_blocking': '1',
'include_blocked_by': '1',
'include_followed_by': '1',
'include_want_retweets': '1',
'include_mute_edge': '1',
'include_can_dm': '1',
'include_can_media_tag': '1',
'include_ext_has_nft_avatar': '1',
'skip_status': '1',
'cards_platform': 'Web-12',
'include_cards': '1',
'include_ext_alt_text': 'true',
'include_ext_limited_action_results': 'false',
'include_quote_count': 'true',
'include_reply_count': '1',
'tweet_mode': 'extended',
'include_ext_collab_control': 'true',
'include_entities': 'true',
'include_user_entities': 'true',
'include_ext_media_color': 'true',
'include_ext_media_availability': 'true',
'include_ext_sensitive_media_warning': 'true',
'include_ext_trusted_friends_metadata': 'true',
'send_error_codes': 'true',
'simple_quoted_tweet': 'true',
'q': query,
'vertical': 'trends',
'count': '20',
'query_source': 'trend_click',
'pc': '1',
'spelling_corrections': '1',
'include_ext_edit_control': 'true',
'ext': 'mediaStats,highlightedLabel,hasNftAvatar,voiceInfo,enrichments,superFollowMetadata,unmentionInfo,editControl,collab_control,vibe',
}
if cursor:
params['cursor'] = cursor
return params

@staticmethod
def build_keyword_headers(x_guest_token, authorization_key, query):
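        # Headers imitate a real browser session; fake_headers supplies a
        # plausible User-Agent string.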
headers = {
'authority': 'twitter.com',
'accept': '*/*',
'accept-language': 'en-US,en;q=0.9',
'authorization': authorization_key,
'referer': "https://twitter.com/search?q={}".format(query),
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'user-agent': Headers().generate()['User-Agent'],
'x-guest-token': x_guest_token,
'x-twitter-active-user': 'yes',
'x-twitter-client-language': 'en',
}
return headers

@staticmethod
def find_x_guest_token(authorization_key, proxy=None):
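        # A guest token authorizes unauthenticated API calls; Twitter issues
        # one when guest/activate.json is POSTed with the bearer token.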
try:
headers = {
'authorization': authorization_key,
}
response = None
if proxy:
proxy_dict = {
"http": "http://{}".format(proxy),
"https": "http://{}".format(proxy),
}
response = requests.post(
'https://api.twitter.com/1.1/guest/activate.json', headers=headers, proxies=proxy_dict)
return response.json()['guest_token']

response = requests.post(
'https://api.twitter.com/1.1/guest/activate.json', headers=headers)
return response.json()['guest_token']
except Exception as ex:
logger.warning("Error at find_x_guest_token: {}".format(ex))