-
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #20 from MLH-Fellowship/celery
Built the json output to be integrated with the HTML
- Loading branch information
Showing
5 changed files
with
156 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,39 @@ | ||
from flask import Flask | ||
import sys | ||
|
||
import pandas as pd | ||
from flask import Flask, jsonify | ||
|
||
from apps.process_data import ProcessData, Results | ||
from apps.tweets import Twitter | ||
from loguru import logger | ||
|
||
# Configure a loguru file sink: timestamped, colorized messages written to
# logs/twitstat.log, rotated once the file reaches 50 MB. backtrace/diagnose
# enable extended, annotated tracebacks on logged exceptions.
logger.add(
    "logs/twitstat.log",
    colorize=True,
    format="<green>{time}</green> <level>{message}</level>",
    rotation="50 MB",
    backtrace=True,
    diagnose=True,
)
# Flask application instance; routes are registered against this object.
app = Flask(__name__)
|
||
|
||
@app.route("/")
def hello_world():
    """Placeholder root endpoint returning a static greeting."""
    greeting = "Hello, World!"
    return greeting
@app.route("/", methods=["GET"])
def analyze_tweets():
    """Fetch tweets for the current top trend, cluster them, and return JSON.

    Pipeline: pull top trends from Twitter, fetch tweets for the first trend,
    cluster them with DBSCAN via ProcessData, then summarize per-cluster
    representatives and sentiment via Results.

    Returns:
        A JSON response with keys:
        - "clusters_count": JSON-serialized Series of cluster sizes.
        - "result": mapping of row index -> representative row dict.
    """
    logger.info("Analyze tweets initiated")
    twitter = Twitter()
    top_trends = twitter.get_top_trends()
    logger.info(f"Top trends now are {top_trends}")
    # Analyze only the single hottest trend.
    trending_tweets = twitter.get_trending_tweets(top_trends[0]["name"])
    df = pd.DataFrame(trending_tweets)
    eps = 1.29  # DBSCAN neighborhood radius; empirically chosen constant.
    # cluster() also returns a cluster count, but the authoritative per-cluster
    # sizes come from Results below, so that value is deliberately discarded.
    df, _ = ProcessData().cluster(eps, df)
    res, clusters_count = Results(df).get_result()
    logger.info(f"Clusters: {clusters_count}")
    result = {ind: dict(row) for ind, row in res.iterrows()}
    response = {
        # Bug fix: the key was misspelled "cluserts_count" in the original.
        "clusters_count": clusters_count.to_json(),
        "result": result,
    }
    return jsonify(response)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
import re | ||
import string | ||
|
||
|
||
class CleanData:
    """Helpers for normalizing raw tweet text before vectorization."""

    # Compiled once at class-definition time (was rebuilt on every call).
    # Matches runs of emoji / pictograph / symbol codepoints; the ranges are
    # preserved verbatim from the original (including the duplicated
    # \U00002702-\U000027B0 range, which is harmless inside a character class).
    _EMOJI_PATTERN = re.compile(
        pattern="["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002500-\U00002BEF"  # chinese char
        "\U00002702-\U000027B0"
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "\U0001f926-\U0001f937"
        "\U00010000-\U0010ffff"
        "\u2640-\u2642"
        "\u2600-\u2B55"
        "\u200d"
        "\u23cf"
        "\u23e9"
        "\u231a"
        "\ufe0f"  # dingbats
        "\u3030"
        "]+",
        flags=re.UNICODE,
    )

    def de_emojify(self, tweet):
        """Remove emoticons/emoji from the given text.

        Args:
            tweet: Raw tweet text.

        Returns:
            The text with all matched emoji codepoints stripped.
        """
        return self._EMOJI_PATTERN.sub(r"", tweet)

    def remove_punctuation(self, tweet):
        """Remove links, newlines/tabs, and punctuation; lowercase the text.

        Args:
            tweet: Raw tweet text.

        Returns:
            Lowercased text with URLs and punctuation removed.
        """
        tweet = tweet.replace("\n", "")
        tweet = tweet.replace("\t", "")
        # Bug fix: the original discarded re.sub's return value (strings are
        # immutable), so links were never actually removed.
        tweet = re.sub(r"http\S+", "", tweet)  # removes links

        translator = str.maketrans("", "", string.punctuation)
        return tweet.lower().translate(translator)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
import logging | ||
|
||
import nltk | ||
from sklearn.cluster import DBSCAN | ||
from sklearn.feature_extraction.text import TfidfVectorizer | ||
from textblob import TextBlob | ||
|
||
from apps.clean_data import CleanData | ||
|
||
try: | ||
from nltk import word_tokenize, sent_tokenize | ||
except ImportError: | ||
nltk.download("punkt") | ||
from nltk import word_tokenize, sent_tokenize | ||
|
||
|
||
class ProcessData:
    """Tweet-processing pipeline: stem/tokenize, score sentiment, cluster."""

    def __init__(self):
        # Shared across all processing steps for this instance.
        self.porter_stemmer = nltk.PorterStemmer()
        self.clean_data = CleanData()

    def tokenize(self, tweet):
        """Tokenize *tweet* and stem every token.

        Used as the custom tokenizer for tf-idf vectorization.
        """
        stem = self.porter_stemmer.stem
        return [stem(token) for token in nltk.word_tokenize(tweet)]

    def analyse_sentiment(self, tweet):
        """Classify the tweet's polarity as positive / neutral / negative."""
        polarity = TextBlob(tweet).sentiment.polarity
        if polarity == 0:
            return "neutral"
        return "positive" if polarity > 0 else "negative"

    def cluster(self, esp, df):
        """Cluster tweets in *df* using DBSCAN with neighborhood radius *esp*.

        Args:
            esp: DBSCAN eps value (parameter name kept for compatibility;
                it is a misspelling of "eps").
            df: DataFrame with a "tweets" column of raw tweet strings.

        Returns:
            Tuple of (df with added "tweet_clean" and "clusters" columns,
            number of unique cluster labels).
        """
        cleaner = self.clean_data
        df["tweet_clean"] = df["tweets"].apply(
            lambda text: cleaner.remove_punctuation(text)
        )
        df["tweet_clean"] = df["tweet_clean"].apply(
            lambda text: cleaner.de_emojify(text)
        )

        vectorizer = TfidfVectorizer(
            tokenizer=self.tokenize, stop_words="english", min_df=1
        )
        features = vectorizer.fit_transform(df.loc[:, "tweet_clean"])

        model = DBSCAN(esp, min_samples=20).fit(features)

        df["clusters"] = model.labels_
        logging.info(f"Number of unique clusters generated: {df.clusters.nunique()}")

        return df, df.clusters.nunique()
|
||
|
||
class Results:
    """Summarizes clustering output: per-cluster representatives and sizes."""

    def __init__(self, df):
        """Store the clustered frame and precompute cluster sizes.

        Args:
            df: DataFrame with "clusters" and "tweet_clean" columns.
        """
        self.df = df
        # Series mapping cluster label -> number of rows in that cluster.
        self.clusters_count = df.clusters.value_counts()

    def get_result(self):
        """Build per-cluster representative rows with a sentiment column.

        Returns:
            Tuple of (representatives DataFrame, cluster-size Series).
        """
        representatives = self.df.groupby(["clusters"]).max().reset_index()
        representatives["sentiment"] = representatives["tweet_clean"].apply(
            lambda text: ProcessData().analyse_sentiment(text)
        )
        return representatives, self.clusters_count
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters