analysis_utils.py
# coding: utf-8
from datasketch import MinHash, MinHashLSH
from datetime import datetime
from difflib import SequenceMatcher
from matplotlib import font_manager, pyplot as plt
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression, ElasticNetCV, LogisticRegressionCV
from sklearn.metrics import make_scorer, auc, roc_auc_score, roc_curve
import time
from read_write_utils import load_joined_df
from utils import *
pos_emojis = "(\:‑\)|\:\)|\:-\]|\:\]|\:-3|\:->|\:>|8-\)|\:-\}|\:\}|\:o\)|\:c\)|\:^\)|=\]|=\)|\:‑D|" \
"\:D|8‑D|x‑D|xD|X‑D|XD|=D|=3|B^D|\:-\)\)|\:'‑\)|\:'\)|\xF0\x9F\x98\x81|\xF0\x9F\x98\x82|" \
"\xF0\x9F\x98\x83|\xF0\x9F\x98\x84|\xF0\x9F\x98\x85|\xF0\x9F\x98\x86|\xF0\x9F\x98\x89|" \
"\xF0\x9F\x98\x8A|\xF0\x9F\x98\x8B|\xF0\x9F\x98\x8C|\xF0\x9F\x98\x8D|\xF0\x9F\x98\x8F|" \
"\xF0\x9F\x91\x8D|\xF0\x9F\x91\x8F|\xF0\x9F\x91\x90|\xF0\x9F\x98\x80|\xF0\x9F\x98\x87|" \
"\xF0\x9F\x91\x8C|\xE2\x9D\xA4|\xE2\x99\xA5)"
neg_emojis = "(\:‑\(|\:\(|\:‑c|\:c|\:‑<|\:<|\:‑\[|\:\[|\:-\|\||>\:\[|\:{|\:@|>\:\(|\:'‑\(|\:'\(|D‑'\:|D\:<|" \
"D\:|D;|D=|\xF0\x9F\x98\x90|\xF0\x9F\x98\x91|\xF0\x9F\x98\x95|\xF0\x9F\x98\x9F|\xF0\x9F\x98\xA6|" \
"\xF0\x9F\x98\xA7|\xF0\x9F\x91\x8E|\xF0\x9F\x98\x9E|\xF0\x9F\x98\xA0|\xF0\x9F\x98\xA1|" \
"\xF0\x9F\x98\xA2|\xF0\x9F\x98\xA3|\xF0\x9F\x98\xA4|\xF0\x9F\x98\xA5e|\xF0\x9F\x98\xA8|" \
"\xF0\x9F\x98\xA9|\xF0\x9F\x98\xAA|\xF0\x9F\x98\xAB|\xF0\x9F\x98\xAD|\xF0\x9F\x98\xB0|" \
"\xF0\x9F\x98\xB1|\xF0\x9F\x98\xB2)"
def similar(a, b):
"""
    Ratio of similarity between two strings, computed with difflib's SequenceMatcher.
    :param a: first string (cast to str)
    :param b: second string (cast to str)
    :return: float similarity ratio in [0, 1]
"""
return SequenceMatcher(None, str(a), str(b)).ratio()
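# Rough illustration with hypothetical strings: similar("cold beer", "cold beers") is close to 1.0,
# while two unrelated strings score much lower; the sentiment analysis below treats pairs with a
# ratio >= 0.7 as near-duplicates.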
def remove_similar_tweets(df, text_col="text", lang_col="lang_x", max_jaccard_similarity=0.5):
    """
    Use locality-sensitive hashing (MinHash LSH) to efficiently remove tweets that are similar to
    other tweets (which might be auto-generated tweets or retweets).
    English tweets only.
    """
t0 = time.time()
df["tweet_clean"] = np.vectorize(TweetsAnalysis.preprocess_tweet)(df[text_col], df[lang_col])
tweets = [t.split(" ") for t in df["tweet_clean"]]
t1 = time.time()
print t1 - t0, "cleaned tweets"
    lsh = MinHashLSH(threshold=max_jaccard_similarity, num_perm=64)  # threshold on Jaccard similarity
idx_selected = {}
df_indices = df.index.values.tolist()
for idx, tweet in zip(df_indices, tweets):
s = MinHash(num_perm=64)
for word in tweet:
s.update(word.encode('utf8'))
# only add if the tweet is not similar to existing ones
if len(lsh.query(s)) == 0:
lsh.insert(idx, s)
idx_selected[idx] = True
t2 = time.time()
print t2 - t1, "created lsh"
# only select the first tweet in a group of similar tweets
df['select'] = pd.Series([idx_selected.get(idx, False) for idx in df_indices], index=df_indices)
print df["select"].value_counts()
t3 = time.time()
print t3-t2, "selected df"
return df[df["select"]]
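# Hypothetical usage, assuming `df` comes from load_joined_df and has "text" and "lang_x" columns:
#     df_deduped = remove_similar_tweets(df, text_col="text", lang_col="lang_x",
#                                        max_jaccard_similarity=0.5)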
def get_most_used_words(df, text_col='text'):
    """Count word occurrences over all tweets in df[text_col]; returns a Series, most used words first."""
    text_list = df[text_col].values.tolist()
    text_list = [str(x) for x in text_list]  # cast to str so nan values do not break the split below
# split strings into words
word_list = []
for x in text_list:
word_list += x.split(' ')
word_list = pd.Series(word_list)
return word_list.value_counts()
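# Hypothetical usage: print the ten most frequent tokens in the raw tweet text.
#     word_counts = get_most_used_words(df, text_col="text")
#     print word_counts.head(10)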
class TweetsAnalysis:
    def __init__(self, begin_time, end_time, root_path='/home/paul/databeers'):
        """
        :param begin_time: datetime.datetime, begin time for the data
        :param end_time: datetime.datetime, end time for the data
        :param root_path: path to the databeers directory (machine dependent, Paul's Linux or Mac)
        """
        self.beg_time = begin_time
        self.end_time = end_time
        self.root_path = root_path
def kein_bier_vor_vier(self):
"""
Analyze the German saying "Kein Bier vor 4" (No beer before 4 (p.m.)):
Plot (relative) number of German vs non-German beer tweets dependent on the hour.
:param begin_time:
:param end_time:
:param root_path:
:return:
"""
time_df = load_joined_df(self.beg_time, self.end_time,
columns=['utc_offset', 'created_at', 'lang'], root_path=self.root_path)
print len(time_df)
time_df = time_df[time_df['utc_offset'] != ""]
time_df = add_local_time_col(time_df)
time_df['is_german'] = time_df['lang_x'] == 'de'
german_tweets = sum(time_df['is_german'])
non_ger_tweets = len(time_df) - german_tweets
tweets_abs_hour = (time_df.groupby(
[time_df.local_time.dt.hour, 'is_german'])
.count()['created_at']
.unstack()
)
tweets_rel_hour = pd.DataFrame()
tweets_rel_hour["German"] = 100 * tweets_abs_hour[True] / german_tweets
tweets_rel_hour["Other"] = 100 * tweets_abs_hour[False] / non_ger_tweets
ax = tweets_rel_hour.plot(figsize=(16, 9))
ax.set_xlabel("Hour")
ax.set_ylabel("Share of tweets [%]")
ax.set_xticks(range(0, 24, 2))
return tweets_rel_hour
def brewer_analysis(self):
"""
"Give a man a beer and waste an hour, teach a man to brew and waste a lifetime"
Quantitative analysis of the previous statement, i.e. check if people who tweet about beer a lot are brewers.
:param begin_time: datetime.datetime, begin time for data
:param end_time: datetime.datetime, begin time for data
:param root_path: to the databeers directory(dependent on machine, Paul's Linux or Mac)
:return:
"""
descr_df = load_joined_df(self.beg_time, self.end_time, columns=['text', 'description', 'id_str'],
root_path=self.root_path)
brewer_pattern = u"(?i)brewing|brewer|cervejaria|cervecería|brasserie|brauerei"
descr_df['brewer'] = descr_df['description'].str.contains(brewer_pattern)
avg_tweets = descr_df.groupby(['brewer', 'id_str_y']).count()['id_str_x'].reset_index().groupby(
'brewer').mean()['id_str_x']
ax = avg_tweets.plot(kind='bar')
ax.set_ylabel("Average Number of Tweets")
return avg_tweets
def awd_analysis(self):
"""
Analyze when people in different countries start to drink, in the UK after work drinks (awd) are popular,
whereas in southern europe everything starts later (and also ends later).
:return:
"""
time_df = load_joined_df(self.beg_time, self.end_time,
columns=['utc_offset', 'created_at', 'lang', 'country'], root_path=self.root_path)
time_df = time_df[time_df['utc_offset'] != ""]
time_df = add_local_time_col(time_df, col_out_name='loc_time')
time_df = add_estimated_country(time_df, col_out_name='est_country')
total_tweets_per_country = time_df.groupby('est_country').count()['created_at'].reset_index()
total_tweets_per_country.columns = ['est_country', 'total_tweets']
thirsty_countries = total_tweets_per_country.sort_values(
'total_tweets', ascending=False)['est_country'].values[:9]
awd_tweets_per_country = time_df[time_df.loc_time.dt.weekday <= 3].groupby(
[time_df.loc_time.dt.hour, 'est_country']).count()['created_at'].reset_index()
awd_tweets_per_country.columns = ['hour', 'est_country', 'n_tweets']
joined_df = awd_tweets_per_country.merge(total_tweets_per_country, on='est_country')
joined_df['share_tweets'] = 100 * joined_df['n_tweets'] / joined_df['total_tweets']
plt_df = joined_df.groupby(['hour', 'est_country']).max()['share_tweets'].unstack()
ax = plt_df[[x for x in thirsty_countries if x != '']].plot()
ax.set_ylabel("% After Work Beers / All Beer")
return plt_df
def socializing_analysis(self):
"""
analyze how the number of beer tweets correlates with friends/followers count
:return:
"""
df = load_joined_df(self.beg_time, self.end_time, root_path=self.root_path)[['id_str_y',
u'default_profile',
u'description',
u'favourites_count',
u'followers_count',
u'friends_count']]
cast_cols = ["favourites_count", "followers_count", "friends_count"]
df[cast_cols] = df[cast_cols].apply(pd.to_numeric)
df = df.groupby(["id_str_y"], as_index=False).agg({"description": "count",
"favourites_count": "max",
"followers_count": "max",
"friends_count": "max"}).rename(
columns={"description": "n_tweets"})
plt_df = df.groupby(["n_tweets"]).agg({"favourites_count": "mean",
"followers_count": "mean",
"friends_count": "mean",
"id_str_y": "count"})
ax = plt_df[cast_cols].plot(xlim=(1, 10))
ax.set_ylabel("Average Number")
return plt_df
@staticmethod
def preprocess_tweet(raw_tweet, language_code, remove_stopwords=True, return_string=True):
"""
preprocess tweet for count vectorizer of the text, this means removing stopwords
:param raw_tweet:
:param language_code:
:param remove_stopwords: Boolean if stopwords should be removed
:param return_string: Boolean if True returns a string, else a list of words.
:return: string (if return string) else list
"""
# Remove non-letters
letters_only = re.sub("[^a-zA-Z#]", " ", raw_tweet)
# Convert to lower case, split into individual words
words = letters_only.lower().split()
if remove_stopwords:
language_code_dict = {"es": "spanish", "pt": "portuguese", "fr": "french", "de": "german", "it": "italian",
"nl": "dutch", "tr": "turkish", "ru": "russian"}
# Use set since it is hash based and hence faster
stops = set(stopwords.words(language_code_dict.get(language_code, "english"))).union({"xd"})
# Remove stop words
meaningful_words = [w for w in words if w not in stops]
else:
meaningful_words = words
if return_string:
return " ".join(meaningful_words)
else:
return meaningful_words
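    # Rough illustration (hypothetical tweet): TweetsAnalysis.preprocess_tweet("The best #beer in the world!", "en")
    # keeps letters and hashtags, lowercases, drops English stopwords such as "the" and "in",
    # and returns something like "best #beer world".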
def sentiment_analysis(self, n_words=1000, plot_top_n=10):
"""
The idea is to get which words have a postive and a negative connotation in terms of beers.
For this we vectorize the text and learn a logistic regression (a LR is chosen for good interpretability)
To get a sparse ouput we use L1 regularisation.
As a ground truth we take for positive tweets, pos emojis and neg emojis for neg tweets.
We do this for english tweets only
:return:
"""
df = load_joined_df(self.beg_time, self.end_time, root_path=self.root_path,
columns=["text", "id_str", "lang"])
df = df[df["lang_x"] == "en"]
df["pos"] = df['text'].str.contains(pos_emojis)
df["neg"] = df['text'].str.contains(neg_emojis)
# filter the df, first get tweets with emojis only, then remove retweets and some auto-generated tweets
filtered_df = df[np.vectorize(np.logical_xor)(df["pos"], df["neg"])]
filtered_df = filtered_df[~filtered_df["text"].str.upper().str.contains("RT @")]
        # some users often post very similar tweets; remove all but one of these.
        # since it is computationally too expensive to compare all tweets with each other, we only
        # compare each tweet with its neighbour (within the same user) and repeat this step 3 times
for _ in range(3):
filtered_df['text_lead'] = filtered_df.groupby(['user_id'])['text'].shift(1)
filtered_df["sim"] = np.vectorize(similar)(filtered_df["text"], filtered_df["text_lead"])
filtered_df = filtered_df[filtered_df["sim"] < 0.7]
        # another problem is auto-generated tweets that are not completely identical but still
        # similar (e.g. tweets generated by an app). To capture these we do something similar as
        # above, but this time comparing neighbours after sorting by text (and by reversed text)
for _ in range(2):
filtered_df['text_lead'] = filtered_df.sort_values(['text'])['text'].shift(1)
filtered_df["sim"] = np.vectorize(similar)(filtered_df["text"], filtered_df["text_lead"])
filtered_df = filtered_df[filtered_df["sim"] < 0.7]
filtered_df["text_reversed"] = filtered_df["text"].map(lambda x: x[::-1])
for _ in range(2):
filtered_df['text_lead'] = filtered_df.sort_values(['text_reversed'])['text'].shift(1)
filtered_df["sim"] = np.vectorize(similar)(filtered_df["text"], filtered_df["text_lead"])
filtered_df = filtered_df[filtered_df["sim"] < 0.7]
filtered_df["cleaned_text"] = np.vectorize(self.preprocess_tweet)(filtered_df['text'], filtered_df['lang_x'])
vectorizer = CountVectorizer(analyzer="word",
tokenizer=None,
preprocessor=None,
stop_words=None,
max_features=n_words)
msk = np.random.rand(len(filtered_df)) < 0.9
train_data_features = vectorizer.fit_transform(filtered_df["cleaned_text"][msk])
test_data_features = vectorizer.transform(filtered_df["cleaned_text"][~msk])
df_uq = pd.DataFrame(train_data_features.toarray())
df_uq["label"] = pd.Series(filtered_df[msk]["pos"].map({True: 1, False: 0}).values)
df_uq = df_uq.drop_duplicates()
# choose L1 regularisation in order to get a sparse solution
lr_model = LogisticRegressionCV(penalty='l1', solver="liblinear", scoring=make_scorer(roc_auc_score))
lr_model.fit(df_uq[range(n_words)].values, df_uq["label"].values)
roc_data = roc_curve(pd.to_numeric(filtered_df[msk]["pos"]),
[x[1] for x in lr_model.predict_proba(train_data_features.toarray())])
print "auc on train set", auc(roc_data[0], roc_data[1])
roc_data = roc_curve(pd.to_numeric(filtered_df[~msk]["pos"]),
[x[1] for x in lr_model.predict_proba(test_data_features.toarray())])
print "auc on test set", auc(roc_data[0], roc_data[1])
coefs = sorted(zip(lr_model.coef_[0], vectorizer.get_feature_names()), key=lambda x: x[0])
fig, axes = plt.subplots(ncols=2)
y = range(plot_top_n)
axes[0].barh(y, [x[0] for x in coefs[-plot_top_n:]], align='center', color='green', zorder=10)
        axes[0].set(title='Words with Positive Connotation')
negs = coefs[:plot_top_n]
negs.reverse()
axes[1].barh(y, [abs(x[0]) for x in negs], align='center', color='red', zorder=10)
        axes[1].set(title='Words with Negative Connotation')
axes[0].set_ylim((-1, plot_top_n))
axes[1].set_ylim((-1, plot_top_n))
        x_lim = max(abs(coefs[0][0]), coefs[-1][0]) * 1.01  # largest absolute coefficient on either side
axes[0].set_xlim((0, x_lim))
axes[1].set_xlim((0, x_lim))
axes[0].invert_xaxis()
axes[0].set(yticks=y, yticklabels=[x[1] for x in coefs[-plot_top_n:]])
axes[0].yaxis.tick_right()
axes[1].set(yticks=y, yticklabels=[x[1] for x in negs])
for ax in axes.flat:
ax.margins(0.03)
ax.grid(True)
fig.tight_layout()
fig.subplots_adjust(wspace=0.59)
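    # Hypothetical usage (dates are placeholders): prints the AUC on the train and test set and
    # draws a bar chart of the words with the strongest positive / negative coefficients.
    #     TweetsAnalysis(datetime(2017, 3, 1), datetime(2017, 3, 31)).sentiment_analysis(n_words=1000)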
def emoji_analysis(self):
"""
easy version of sentiment analysis only analysing the emojis used in the tweet
:return: two pandas Series with the emojis as keys and the number of occurences as values.
"""
df = load_joined_df(self.beg_time, self.end_time, root_path=self.root_path, columns=['text', 'id_str'])
pos = df['text'].str.extract(pos_emojis).value_counts()
pos["pos"] = sum(pos)
neg = df['text'].str.extract(neg_emojis).value_counts()
neg["neg"] = sum(neg)
        # Unfortunately it is not easy to plot this here, since matplotlib has problems displaying
        # the emojis. However, on a Linux machine there is a solution that is described in
        # http://stackoverflow.com/questions/33190985/substitute-node-labels-with-emoji-using-networkx-in-python/
        # 33204587#33204587 - the package that is used there is only available on Linux.
        # Hence we only return the data here and add a separate plotting function.
return pos, neg
def plot_emoji_analysis(pos, neg, plot_top_n):
"""
plotting function for TweetsAnalysis.emoji_analysis
:param pos: pd.Series
:param neg: pd.Series
:param plot_top_n: int
:return:
"""
fig, axes = plt.subplots(ncols=2)
y = range(plot_top_n)
axes[0].barh(y, pos.values[:plot_top_n], align='center', color='green', zorder=10)
axes[0].set(title='Positive Emojis {}'.format(pos["pos"]))
axes[1].barh(y, neg.values[:plot_top_n], align='center', color='red', zorder=10)
axes[1].set(title='Negative Emojis {}'.format(neg["neg"]))
axes[0].set_ylim((-1, plot_top_n))
axes[1].set_ylim((-1, plot_top_n))
    x_lim = max(pos.drop("pos").max(), neg.drop("neg").max()) * 1.01  # ignore the appended totals
axes[0].set_xlim((0, x_lim))
axes[1].set_xlim((0, x_lim))
axes[0].invert_xaxis()
axes[0].set(yticks=y, yticklabels=list(pos.index[:plot_top_n]))
for label in axes[0].get_yticklabels():
label.set_fontproperties(font_manager.FontProperties(family="Symbolia"))
axes[0].yaxis.tick_right()
axes[1].set(yticks=y, yticklabels=list(neg.index[:plot_top_n]))
for label in axes[1].get_yticklabels():
label.set_fontproperties(font_manager.FontProperties(family="Symbolia"))
for ax in axes.flat:
ax.margins(0.03)
ax.grid(True)
fig.tight_layout()
fig.subplots_adjust(wspace=0.19)
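

if __name__ == "__main__":
    # Hypothetical end-to-end run: the date range is a placeholder and the joined tweet data is
    # assumed to already exist under root_path (see read_write_utils.load_joined_df).
    analysis = TweetsAnalysis(datetime(2017, 3, 1), datetime(2017, 3, 31),
                              root_path='/home/paul/databeers')
    analysis.kein_bier_vor_vier()
    pos_counts, neg_counts = analysis.emoji_analysis()
    plot_emoji_analysis(pos_counts, neg_counts, plot_top_n=10)
    plt.show()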