Merge pull request #26 from MLH-Fellowship/staging

Staging
MLH-Fellowship · Oct 12, 2020 · d3eac80 · d3eac80
2 parents 760a1b3 + 5c56bc8
commit d3eac80
Show file tree

Hide file tree

Showing 76 changed files with 18,593 additions and 7 deletions.
diff --git a/.github/workflows/main_twitstat.yml b/.github/workflows/main_twitstat.yml
@@ -0,0 +1,34 @@
+# Docs for the Azure Web Apps Deploy action: https://github.com/Azure/webapps-deploy
+# More GitHub Actions for Azure: https://github.com/Azure/actions
+#
+# Builds the Python app and deploys it to the `production` slot of the
+# `twitstat` Azure Web App whenever a commit is pushed to `main`.
+
+name: Build and deploy Python app to Azure Web App - twitstat
+
+on:
+  push:
+    branches:
+      - main
+
+jobs:
+  build-and-deploy:
+    runs-on: ubuntu-latest
+
+    steps:
+    # NOTE(review): `@master` follows a moving ref rather than a fixed release;
+    # consider pinning to a tagged version.
+    - uses: actions/checkout@master
+
+    - name: Set up Python version
+      uses: actions/setup-python@v1
+      with:
+        python-version: '3.8'
+
+    # Keep platform-version in step with the python-version selected above.
+    - name: Build using AppService-Build
+      uses: azure/appservice-build@v2
+      with:
+        platform: python
+        platform-version: '3.8'
+
+    # The publish profile is read from a repository secret.
+    - name: 'Deploy to Azure Web App'
+      uses: azure/webapps-deploy@v2
+      with:
+        app-name: 'twitstat'
+        slot-name: 'production'
+        publish-profile: ${{ secrets.AzureAppService_PublishProfile_f35043737dd4487e9131f39d0ff4e657 }}
diff --git a/.github/workflows/staging_twitstat(staging).yml b/.github/workflows/staging_twitstat(staging).yml
@@ -0,0 +1,44 @@
+# Docs for the Azure Web Apps Deploy action: https://github.com/Azure/webapps-deploy
+# More GitHub Actions for Azure: https://github.com/Azure/actions
+#
+# Builds the Python app and deploys it to the `staging` slot of the
+# `twitstat` Azure Web App whenever a commit is pushed to `staging`.
+
+name: Build and deploy Python app to Azure Web App - twitstat(staging)
+
+on:
+  push:
+    branches:
+      - staging
+
+jobs:
+  build-and-deploy:
+    runs-on: ubuntu-latest
+
+    steps:
+    # NOTE(review): `@master` follows a moving ref rather than a fixed release;
+    # consider pinning to a tagged version.
+    - uses: actions/checkout@master
+
+    - name: Set up Python version
+      uses: actions/setup-python@v1
+      with:
+        python-version: '3.8'
+
+    # Keep platform-version in step with the python-version selected above.
+    - name: Build using AppService-Build
+      uses: azure/appservice-build@v2
+      with:
+        platform: python
+        platform-version: '3.8'
+
+    # NOTE(review): each `run` step appears to start its own shell, so the
+    # `source venv/bin/activate` below likely does not carry over to the
+    # "Install dependencies" step, which would then install outside the venv.
+    # Confirm, and merge the two steps if the venv is meant to be used.
+    - name: Setting up Virtual Environment
+      run: |
+        python3 -m venv venv
+        source venv/bin/activate
+        
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements/production.txt
+
+    # The publish profile is read from a repository secret.
+    - name: 'Deploy to Azure Web App'
+      uses: azure/webapps-deploy@v2
+      with:
+        app-name: 'twitstat'
+        slot-name: 'staging'
+        publish-profile: ${{ secrets.AzureAppService_PublishProfile_6d60a0be539140669ccdea72aea4def0 }}
diff --git a/.idea/misc.xml b/.idea/misc.xml
diff --git a/.idea/twitstat.iml b/.idea/twitstat.iml
diff --git a/README.rst b/README.rst
@@ -146,6 +146,11 @@ Contribution Guidelines
 
 * Please read our `Code of Conduct <./CODE_OF_CONDUCT.md>`__.
 
+Contributors
+-------------
+
+Made with :heart: by `Aditya Raman <https://github.com/ramanaditya>`_ and `Garima Singh <https://github.com/grimmmyshini>`_!
+
 License
 --------
 

diff --git a/app.py b/app.py
@@ -1,8 +1,42 @@
-from flask import Flask
+import sys
 
-app = Flask(__name__)
+import pandas as pd
+from flask import Flask, jsonify
 
+from apps.process_data import ProcessData, Results
+from apps.tweets import Twitter
+from loguru import logger
+from flask import render_template
 
-@app.route("/")
-def hello_world():
-    return "Hello, World!"
+# Add a file sink for loguru: records go to logs/twitstat.log, rotated at 50 MB.
+# NOTE(review): `colorize=True` on a file sink presumably writes colour escape
+# codes into the log file, and `diagnose=True` includes variable values in
+# tracebacks -- confirm both are wanted outside local development.
+logger.add(
+    "logs/twitstat.log",
+    colorize=True,
+    format="<green>{time}</green> <level>{message}</level>",
+    rotation="50 MB",
+    backtrace=True,
+    diagnose=True,
+)
+# Flask application: static files come from ./static with an empty URL prefix
+# (static_url_path=''), and templates are loaded from ./templates.
+app = Flask(__name__,
+            static_url_path='',
+            static_folder='./static',
+            template_folder='./templates')
+
+
+@app.route("/", methods=["GET"])
+def analyze_tweets():
+    """Render ``index.html`` for GET requests to ``/``.
+
+    The trend-fetching, clustering and sentiment pipeline below is commented
+    out, so this view currently performs no tweet analysis and only returns
+    the rendered template.
+    """
+    # twitter = Twitter()
+    # top_trends = twitter.get_top_trends()
+    # logger.info(f"Top trends now are {top_trends}")
+    # trending_tweets = twitter.get_trending_tweets(top_trends[0]["name"])
+    # df = pd.DataFrame(trending_tweets)
+    # esp = 1.29
+    # df, clusters_count = ProcessData().cluster(esp, df)
+    # res, clusters_count = Results(df).get_result()
+    # logger.info(f"Clusters: {clusters_count}")
+    # result = {}
+    # for ind, row in res.iterrows():
+    #     result[ind] = dict(row)
+    # response = dict()
+    # response["cluserts_count"] = clusters_count.to_json()
+    # response["result"] = result
+    return render_template('index.html')
diff --git a/apps/clean_data.py b/apps/clean_data.py
@@ -0,0 +1,40 @@
+import re
+import string
+
+
+class CleanData:
+    """Helpers that strip emoji, links and punctuation from tweet text."""
+
+    # Compiled once when the class is created rather than on every call.
+    _EMOJI_PATTERN = re.compile(
+        pattern="["
+        "\U0001F600-\U0001F64F"  # emoticons
+        "\U0001F300-\U0001F5FF"  # symbols & pictographs
+        "\U0001F680-\U0001F6FF"  # transport & map symbols
+        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
+        "\U00002500-\U00002BEF"  # box drawing, shapes, dingbats, arrows
+        "\U00002702-\U000027B0"  # dingbats
+        # NOTE: very wide range -- it also spans CJK, Hangul and most other
+        # characters between U+24C2 and U+1F251, so such text is removed too.
+        "\U000024C2-\U0001F251"
+        "\U0001f926-\U0001f937"
+        "\U00010000-\U0010ffff"  # every supplementary-plane code point
+        "\u2640-\u2642"
+        "\u2600-\u2B55"
+        "\u200d"  # zero-width joiner
+        "\u23cf"
+        "\u23e9"
+        "\u231a"
+        "\ufe0f"  # variation selector-16
+        "\u3030"
+        "]+",
+        flags=re.UNICODE,
+    )
+
+    # Deletes ASCII punctuation plus newline and tab characters in one pass.
+    _STRIP_TABLE = str.maketrans("", "", string.punctuation + "\n\t")
+
+    def de_emojify(self, tweet):
+        """Return *tweet* with every character matched by the emoji pattern removed."""
+        return self._EMOJI_PATTERN.sub(r"", tweet)
+
+    def remove_punctuation(self, tweet):
+        """Return *tweet* lower-cased, without links, newlines, tabs or punctuation.
+
+        Newlines and tabs are deleted outright (not replaced by a space), so
+        words separated only by them are joined.
+        """
+        # Drop links first, while whitespace still marks where each one ends,
+        # and keep the result: re.sub returns a new string.
+        tweet = re.sub(r"http\S+", "", tweet)
+        return tweet.lower().translate(self._STRIP_TABLE)
diff --git a/apps/process_data.py b/apps/process_data.py
@@ -0,0 +1,74 @@
+import logging
+
+import nltk
+from sklearn.cluster import DBSCAN
+from sklearn.feature_extraction.text import TfidfVectorizer
+from textblob import TextBlob
+
+from apps.clean_data import CleanData
+
+from nltk import word_tokenize, sent_tokenize
+
+# Importing the tokenizer functions succeeds even when the "punkt" data is
+# absent (the LookupError only surfaces when they are called), so probe for
+# the data explicitly and download it when it is missing.
+try:
+    nltk.data.find("tokenizers/punkt")
+except LookupError:
+    nltk.download("punkt")
+
+
+class ProcessData:
+    """Tokenise, sentiment-score and cluster tweet text."""
+
+    def __init__(self):
+        """Create the Porter stemmer and the text cleaner used by the other methods."""
+        self.porter_stemmer = nltk.PorterStemmer()
+        self.clean_data = CleanData()
+
+    def tokenize(self, tweet):
+        """Split *tweet* into word tokens and return their stems as a list.
+
+        Passed as the custom tokenizer for the TF-IDF vectorizer in ``cluster``.
+        """
+        stem = self.porter_stemmer.stem
+        return [stem(token) for token in nltk.word_tokenize(tweet)]
+
+    def analyse_sentiment(self, tweet):
+        """Classify *tweet* as "positive", "neutral" or "negative" by TextBlob polarity."""
+        polarity = TextBlob(tweet).sentiment.polarity
+        if polarity > 0:
+            return "positive"
+        if polarity == 0:
+            return "neutral"
+        return "negative"
+
+    def cluster(self, esp, df):
+        """Clean the ``tweets`` column of *df* and cluster it with DBSCAN.
+
+        *esp* is passed to DBSCAN as its first positional argument. *df* is
+        modified in place: it gains a ``tweet_clean`` column (punctuation and
+        emoji removed) and a ``clusters`` column holding the DBSCAN labels.
+
+        Returns the same *df* together with the number of distinct labels.
+        """
+        cleaner = self.clean_data
+        df["tweet_clean"] = (
+            df["tweets"].apply(cleaner.remove_punctuation).apply(cleaner.de_emojify)
+        )
+
+        tfidf = TfidfVectorizer(
+            tokenizer=self.tokenize, stop_words="english", min_df=1
+        )
+        features = tfidf.fit_transform(df.loc[:, "tweet_clean"])
+
+        model = DBSCAN(esp, min_samples=20)
+        model.fit(features)
+        df["clusters"] = model.labels_
+
+        unique_clusters = df.clusters.nunique()
+        logging.info(f"Number of unique clusters generated: {unique_clusters}")
+
+        return df, unique_clusters
+
+
+class Results:
+    """Summarise a clustered tweet DataFrame, one row per cluster."""
+
+    def __init__(self, df):
+        """Store *df* and count how many rows carry each ``clusters`` label.
+
+        *df* must already have a ``clusters`` column.
+        """
+        self.df = df
+        self.clusters_count = df.clusters.value_counts()
+
+    def get_result(self):
+        """Return ``(df_results, clusters_count)``.
+
+        ``df_results`` has one row per cluster label, built from the
+        column-wise maximum within each cluster, plus a ``sentiment`` column
+        computed from that row's ``tweet_clean`` text.
+        """
+        df_results = self.df.groupby(["clusters"]).max().reset_index()
+        # One analyser for the whole column rather than a fresh ProcessData
+        # (stemmer + cleaner) for every row.
+        analyser = ProcessData()
+        df_results["sentiment"] = df_results["tweet_clean"].apply(
+            analyser.analyse_sentiment
+        )
+        return df_results, self.clusters_count
diff --git a/apps/tweets.py b/apps/tweets.py
@@ -57,6 +57,9 @@ def get_trending_tweets(self, find_word):
                 tweets_list.append(tweets)
 
                 tweet_counter += 1
+
+                if tweet_counter > 1000:
+                    break
 
         return tweets_list
 

diff --git a/frontend/css/bootstrap.min.css b/frontend/css/bootstrap.min.css