Merge pull request #20 from MLH-Fellowship/celery
Built the JSON output to be integrated with the HTML
grimmmyshini authored Oct 12, 2020
2 parents 9197387 + 1e4eeb3 commit fccd9f2
Showing 5 changed files with 156 additions and 4 deletions.
39 changes: 35 additions & 4 deletions app.py
@@ -1,8 +1,39 @@
-from flask import Flask
+import sys
 
+import pandas as pd
+from flask import Flask, jsonify
+
+from apps.process_data import ProcessData, Results
+from apps.tweets import Twitter
+from loguru import logger
+
+logger.add(
+    "logs/twitstat.log",
+    colorize=True,
+    format="<green>{time}</green> <level>{message}</level>",
+    rotation="50 MB",
+    backtrace=True,
+    diagnose=True,
+)
 app = Flask(__name__)
 
 
-@app.route("/")
-def hello_world():
-    return "Hello, World!"
+@app.route("/", methods=["GET"])
+def analyze_tweets():
+    logger.info("Analyze tweets initiated")
+    twitter = Twitter()
+    top_trends = twitter.get_top_trends()
+    logger.info(f"Top trends now are {top_trends}")
+    trending_tweets = twitter.get_trending_tweets(top_trends[0]["name"])
+    df = pd.DataFrame(trending_tweets)
+    eps = 1.29
+    df, clusters_count = ProcessData().cluster(eps, df)
+    res, clusters_count = Results(df).get_result()
+    logger.info(f"Clusters: {clusters_count}")
+    result = {}
+    for ind, row in res.iterrows():
+        result[ind] = dict(row)
+    response = dict()
+    response["clusters_count"] = clusters_count.to_json()
+    response["result"] = result
+    return jsonify(response)
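
For the HTML integration mentioned in the commit message, the payload shape can be read off the handler above. A rough sketch as a Python literal, with invented values (the real `result` keys are the DataFrame row indices, which `jsonify` renders as strings):

# Illustrative only -- the shape mirrors analyze_tweets() above; values are made up.
example_response = {
    # value_counts() serialized via .to_json(), so this field is a JSON *string*
    "clusters_count": '{"0":412,"-1":305,"1":283}',
    "result": {
        "0": {"tweets": "...", "tweet_clean": "...", "clusters": 0, "sentiment": "positive"},
        "1": {"tweets": "...", "tweet_clean": "...", "clusters": -1, "sentiment": "negative"},
    },
}

One consequence for the front end: `clusters_count` is double-encoded (a JSON string inside the JSON body), so it needs a second parse of its own after the response body is decoded.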
40 changes: 40 additions & 0 deletions apps/clean_data.py
@@ -0,0 +1,40 @@
import re
import string


class CleanData:
    def de_emojify(self, tweet):
        """Remove emoticons from given text"""
        regex_pattern = re.compile(
            pattern="["
            "\U0001F600-\U0001F64F"  # emoticons
            "\U0001F300-\U0001F5FF"  # symbols & pictographs
            "\U0001F680-\U0001F6FF"  # transport & map symbols
            "\U0001F1E0-\U0001F1FF"  # flags (iOS)
            "\U00002500-\U00002BEF"  # chinese char
            "\U00002702-\U000027B0"  # dingbats
            "\U00002702-\U000027B0"
            "\U000024C2-\U0001F251"
            "\U0001f926-\U0001f937"
            "\U00010000-\U0010ffff"
            "\u2640-\u2642"
            "\u2600-\u2B55"
            "\u200d"
            "\u23cf"
            "\u23e9"
            "\u231a"
            "\ufe0f"  # variation selector
            "\u3030"
            "]+",
            flags=re.UNICODE,
        )
        return regex_pattern.sub(r"", tweet)

    def remove_punctuation(self, tweet):
        """Remove links and other punctuation from text"""
        tweet = tweet.replace("\n", "")
        tweet = tweet.replace("\t", "")
        tweet = re.sub(r"http\S+", "", tweet)  # removes links

        translator = str.maketrans("", "", string.punctuation)
        return tweet.lower().translate(translator)
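
A quick usage sketch for CleanData, applying the two methods in the same order the pipeline does (punctuation and links first, then emoji); the sample tweet is invented:

from apps.clean_data import CleanData

cleaner = CleanData()
raw = "Check this out!! 🔥 #trending https://t.co/xyz"

text = cleaner.remove_punctuation(raw)  # drops the link, lowercases, strips punctuation
text = cleaner.de_emojify(text)         # strips emoji code points
print(text)  # roughly: "check this out  trending"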
74 changes: 74 additions & 0 deletions apps/process_data.py
@@ -0,0 +1,74 @@
import logging

import nltk
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob

from apps.clean_data import CleanData

try:
    from nltk import word_tokenize, sent_tokenize
except ImportError:
    nltk.download("punkt")
    from nltk import word_tokenize, sent_tokenize


class ProcessData:
    def __init__(self):
        self.porter_stemmer = nltk.PorterStemmer()
        self.clean_data = CleanData()

    def tokenize(self, tweet):
        """Stems and tokenizes input text; used as the custom tokenizer in tf-idf vectorization"""
        tokens = nltk.word_tokenize(tweet)
        stems = []
        for item in tokens:
            stems.append(self.porter_stemmer.stem(item))
        return stems

    def analyse_sentiment(self, tweet):
        """Analyses the sentiment of the given tweet"""
        analysis = TextBlob(tweet)
        sentiment = analysis.sentiment.polarity
        if sentiment > 0:
            return "positive"
        elif sentiment == 0:
            return "neutral"
        else:
            return "negative"

    def cluster(self, eps, df):
        """Clusters data using DBSCAN with a specified eps value"""
        df["tweet_clean"] = df["tweets"].apply(
            lambda y: self.clean_data.remove_punctuation(y)
        )
        df["tweet_clean"] = df["tweet_clean"].apply(
            lambda y: self.clean_data.de_emojify(y)
        )

        vectorizer = TfidfVectorizer(
            tokenizer=self.tokenize, stop_words="english", min_df=1
        )
        x = vectorizer.fit_transform(df.loc[:, "tweet_clean"])

        db = DBSCAN(eps, min_samples=20).fit(x)

        df["clusters"] = db.labels_
        logging.info(f"Number of unique clusters generated: {df.clusters.nunique()}")

        return df, df.clusters.nunique()


class Results:
    def __init__(self, df):
        """Initialize final results of the analysis"""
        self.df = df
        self.clusters_count = df.clusters.value_counts()

    def get_result(self):
        df_results = self.df.groupby(["clusters"]).max().reset_index()
        df_results["sentiment"] = df_results["tweet_clean"].apply(
            lambda y: ProcessData().analyse_sentiment(y)
        )
        return df_results, self.clusters_count
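
A minimal end-to-end sketch of how ProcessData and Results compose, run on a toy corpus instead of live tweets. The `tweets` column name and eps value of 1.29 mirror app.py; because min_samples=20 is hard-coded in cluster(), each toy group repeats 25 times so DBSCAN can actually form dense clusters:

import pandas as pd

from apps.process_data import ProcessData, Results

# Toy stand-in for twitter.get_trending_tweets(...) output.
df = pd.DataFrame({"tweets": ["good vibes only 🎉"] * 25 + ["worst day ever"] * 25})

df, n_clusters = ProcessData().cluster(1.29, df)  # adds tweet_clean and clusters columns
res, counts = Results(df).get_result()            # one representative row per cluster + sentiment

print(n_clusters)        # number of distinct labels, noise (-1) included
print(counts.to_dict())  # tweet count per cluster label
print(res[["clusters", "sentiment"]])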
2 changes: 2 additions & 0 deletions apps/tweets.py
@@ -57,6 +57,8 @@ def get_trending_tweets(self, find_word):
             tweets_list.append(tweets)
 
             tweet_counter += 1
+            if tweet_counter > 1000:
+                break
 
         return tweets_list

5 changes: 5 additions & 0 deletions requirements/base.txt
@@ -12,6 +12,11 @@ Jinja2==2.11.2 # https://github.com/pallets/jinja
 MarkupSafe==1.1.1 # https://github.com/pallets/markupsafe
 Werkzeug==1.0.1 # https://github.com/pallets/werkzeug
 
+# Celery
+# ------------------------------------------------------------------------------
+celery==5.0.0 # https://github.com/celery/celery
+redis==3.5.3 # https://github.com/redis/redis
+
 # Others
 # ------------------------------------------------------------------------------
 tweepy==3.9.0 # https://github.com/tweepy/tweepy
