diff --git a/fleiss-kappa-navigation.ipynb b/fleiss-kappa-navigation.ipynb
new file mode 100644
index 0000000..1cfc5e5
--- /dev/null
+++ b/fleiss-kappa-navigation.ipynb
@@ -0,0 +1,459 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Fleiss' Kappa Explorer\n",
+    "Measuring inter-rater agreement between annotators"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### First round of agreement measures\n",
+    "We ran this round to check whether the labels we settled on were practical to apply."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from statsmodels.stats import inter_rater as irr\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "\n",
+    "data = pd.read_csv(\"fleisspilot1.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "transform = data[[\"rater_1\", \"rater_2\"]].copy()  # copy so the mapping below avoids chained-assignment warnings\n",
+    "conversions = {\"News\": 0, \"Politicians\": 1, \"Activists\": 2, \"Black Twitter\": 3, \"Celebrities\": 4,\n",
+    "               \"Organizations\": 5, \"Professional\": 6, \"Other\": 7, \"NaN\": 8}\n",
+    "\n",
+    "for i in transform.columns:\n",
+    "    transform[i] = transform[i].map(conversions)  # map category labels to integer codes\n",
+    "\n",
+    "dats, cats = irr.aggregate_raters(transform, n_cat=None)\n",
+    "print(irr.fleiss_kappa(dats, method='fleiss'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fleiss2 = pd.read_csv(\"fleisspilot2.csv\")\n",
+    "transform2 = fleiss2[[\"rater_1\", \"rater_2\"]].copy()  # keep only the rating columns\n",
+    "\n",
+    "for i in transform2.columns:\n",
+    "    transform2[i] = transform2[i].map(conversions)\n",
+    "\n",
+    "dats2, cats2 = irr.aggregate_raters(transform2, n_cat=None)\n",
+    "print(irr.fleiss_kappa(dats2, method='fleiss'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.8314840499306517"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Mean kappa across the two pilot files\n",
+    "np.mean([0.8571428571428571, 0.8058252427184465])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Second round of agreement measures\n",
+    "We ran this round while labeling all 65 communities, to double-check that our annotations could be trusted."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "from sklearn.metrics import ConfusionMatrixDisplay"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Fleiss Maximizer Function**\n",
+    "\n",
+    "Each row carries four candidate labels (rater_11, rater_12, rater_21, rater_22). The helper below picks, per row, the first agreeing (rater_1x, rater_2x) combination, falling back to (rater_12, rater_22) when none agree, so the kappa computed downstream is the highest achievable over the candidate pairs."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_highest_fleiss(data):\n",
+    "    anno1 = []\n",
+    "    anno2 = []\n",
+    "    for i in range(len(data)):\n",
+    "        if data.loc[i, \"rater_11\"] == data.loc[i, \"rater_21\"]:\n",
+    "            anno1.append(data.loc[i, \"rater_11\"])\n",
+    "            anno2.append(data.loc[i, \"rater_21\"])\n",
+    "        elif data.loc[i, \"rater_11\"] == data.loc[i, \"rater_22\"]:\n",
+    "            anno1.append(data.loc[i, \"rater_11\"])\n",
+    "            anno2.append(data.loc[i, \"rater_22\"])\n",
+    "        elif data.loc[i, \"rater_12\"] == data.loc[i, \"rater_21\"]:\n",
+    "            anno1.append(data.loc[i, \"rater_12\"])\n",
+    "            anno2.append(data.loc[i, \"rater_21\"])\n",
+    "        else:  # no pair agrees (or rater_12 == rater_22); either way, fall back to rater_12 and rater_22\n",
+    "            anno1.append(data.loc[i, \"rater_12\"])\n",
+    "            anno2.append(data.loc[i, \"rater_22\"])\n",
+    "\n",
+    "    return pd.DataFrame(list(zip(anno1, anno2)), columns=[\"anno1\", \"anno2\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**['T3_C52', 'T4_C40']** Agreement"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = pd.read_csv(\"./new_fleiss1.csv\").fillna(\"None\")  # the confusion matrix threw an error when NaNs were present\n",
+    "\n",
+    "conversions = {\"Established Media\": 0, \"Politicians\": 1, \"Activists\": 2, \"Black Twitter\": 3, \"Celebrities\": 4,\n",
+    "               \"Organizations\": 5, \"Professional (individual)\": 6, \"Professional\": 7, \"Other\": 8, \"None\": 9}\n",
+    "\n",
+    "# kappa for the (rater_11, rater_21) pairing\n",
+    "sub = data.loc[:, [\"rater_11\", \"rater_21\"]]\n",
+    "for i in sub.columns:\n",
+    "    sub[i] = sub[i].map(conversions)\n",
+    "\n",
+    "dats, cats = irr.aggregate_raters(sub, n_cat=None)\n",
+    "print(irr.fleiss_kappa(dats, method='fleiss'))\n",
+    "\n",
+    "# kappa for the (rater_11, rater_22) pairing\n",
+    "sub = data.loc[:, [\"rater_11\", \"rater_22\"]]\n",
+    "for i in sub.columns:\n",
+    "    sub[i] = sub[i].map(conversions)\n",
+    "\n",
+    "dats, cats = irr.aggregate_raters(sub, n_cat=None)\n",
+    "print(irr.fleiss_kappa(dats, method='fleiss'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sub = get_highest_fleiss(data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%matplotlib inline\n",
+    "disp = ConfusionMatrixDisplay.from_predictions(sub.anno1, sub.anno2, labels=list(conversions.keys()), xticks_rotation=\"vertical\")\n",
+    "disp.ax_.set_xlabel(\"Ann3's annotations\")\n",
+    "disp.ax_.set_ylabel(\"Ann4's annotations\")\n",
+    "disp.ax_.set_title(\"Annotations of Highest Fleiss' Kappa for ['T3_C52', 'T4_C40']\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Calculate Fleiss' kappa on the selected pairs\n",
+    "for i in sub.columns:\n",
+    "    sub[i] = sub[i].map(conversions)\n",
+    "dats, cats = irr.aggregate_raters(sub, n_cat=None)\n",
+    "print(irr.fleiss_kappa(dats, method='fleiss'))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**['T0_C5', 'T1_C0', 'T2_C1', 'T3_C1', 'T4_C0', 'T5_C1']** Agreement"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = pd.read_csv(\"./new_fleiss2.csv\").fillna(\"None\")\n",
+    "sub = get_highest_fleiss(data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%matplotlib inline\n",
+    "disp = ConfusionMatrixDisplay.from_predictions(sub.anno1, sub.anno2, labels=list(conversions.keys()), xticks_rotation=\"vertical\")\n",
+    "disp.ax_.set_xlabel(\"Ann4's annotations\")\n",
+    "disp.ax_.set_ylabel(\"Ann1's annotations\")\n",
+    "disp.ax_.set_title(\"Annotations of Highest Fleiss' Kappa for ['T0_C5', 'T1_C0', 'T2_C1', 'T3_C1', 'T4_C0', 'T5_C1']\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for i in sub.columns:\n",
+    "    sub[i] = sub[i].map(conversions)\n",
+    "dats, cats = irr.aggregate_raters(sub, n_cat=None)\n",
+    "print(irr.fleiss_kappa(dats, method='fleiss'))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**['T3_C13', 'T4_C8', 'T5_C9', 'T6_C9']** Agreement"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = pd.read_csv(\"./new_fleiss3.csv\").fillna(\"None\")\n",
+    "sub = get_highest_fleiss(data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%matplotlib inline\n",
+    "disp = ConfusionMatrixDisplay.from_predictions(sub.anno1, sub.anno2, labels=list(conversions.keys()), xticks_rotation=\"vertical\")\n",
+    "disp.ax_.set_xlabel(\"Ann1's annotations\")\n",
+    "disp.ax_.set_ylabel(\"Ann2's annotations\")\n",
+    "disp.ax_.set_title(\"Annotations of Highest Fleiss' Kappa for ['T3_C13', 'T4_C8', 'T5_C9', 'T6_C9']\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for i in sub.columns:\n",
+    "    sub[i] = sub[i].map(conversions)\n",
+    "dats, cats = irr.aggregate_raters(sub, n_cat=None)\n",
+    "print(irr.fleiss_kappa(dats, method='fleiss'))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**['T2_C34', 'T3_C63', 'T4_C43']** Agreement"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = pd.read_csv(\"./new_fleiss4.csv\").fillna(\"None\")\n",
+    "sub = get_highest_fleiss(data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%matplotlib inline\n",
+    "disp = ConfusionMatrixDisplay.from_predictions(sub.anno1, sub.anno2, labels=list(conversions.keys()), xticks_rotation=\"vertical\")\n",
+    "disp.ax_.set_xlabel(\"Ann3's annotations\")\n",
+    "disp.ax_.set_ylabel(\"Ann2's annotations\")\n",
+    "disp.ax_.set_title(\"Annotations of Highest Fleiss' Kappa for ['T2_C34', 'T3_C63', 'T4_C43']\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for i in sub.columns:\n",
+    "    sub[i] = sub[i].map(conversions)\n",
+    "dats, cats = irr.aggregate_raters(sub, n_cat=None)\n",
+    "print(irr.fleiss_kappa(dats, method='fleiss'))"
+   ]
+  },
"**['T4_C9', 'T5_C4', 'T6_C8']** Agreement" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = pd.read_csv(\"./new_fleiss5.csv\").fillna(\"None\")\n", + "anno1 = []\n", + "anno2 = []\n", + "sub = get_highest_fleiss()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "disp = ConfusionMatrixDisplay.from_predictions (sub.anno1, sub.anno2, labels=list(conversions.keys()), xticks_rotation=\"vertical\")\n", + "disp.ax_.set_xlabel (\"Ann2's annotations\")\n", + "disp.ax_.set_ylabel (\"Ann4's annotations\")\n", + "disp.ax_.set_title (\"Annotations of Highest Fleiss' Kappa for ['T4_C9', 'T5_C4', 'T6_C8']\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for i in sub.columns:\n", + " sub[i] = sub[i].map(conversions)\n", + "dats, cats = irr.aggregate_raters(sub, n_cat=None)\n", + "print(irr.fleiss_kappa(dats, method='fleiss')) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**['T0_C41', 'T1_C34']** Agreement" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = pd.read_csv(\"./new_fleiss6.csv\").fillna(\"None\")\n", + "anno1 = []\n", + "anno2 = []\n", + "sub = get_highest_fleiss()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "disp = ConfusionMatrixDisplay.from_predictions (sub.anno1, sub.anno2, labels=list(conversions.keys()), xticks_rotation=\"vertical\")\n", + "disp.ax_.set_xlabel (\"Ann1's annotations\")\n", + "disp.ax_.set_ylabel (\"Ann3's annotations\")\n", + "disp.ax_.set_title (\"Annotations of Highest Fleiss' Kappa for ['T0_C41', 'T1_C34']\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for i in sub.columns:\n", + " sub[i] = sub[i].map(conversions)\n", + "dats, cats = irr.aggregate_raters(sub, n_cat=None)\n", + "print(irr.fleiss_kappa(dats, method='fleiss')) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Calculate the mean\n", + "np.mean([0.6707105719237435, 0.845213849287169, 0.6122448979591837, 0.8716216216216217, 0.8535645472061657, 0.9339130434782608])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/mask-fill-test.py b/mask-fill-test.py new file mode 100644 index 0000000..a3f82f1 --- /dev/null +++ b/mask-fill-test.py @@ -0,0 +1,106 @@ +import python_utils +import json +import csv +import re +import pandas +import nameutils +from nameutils import MaskedNER, find_word_indices, string_match +import argparse +from argparse import ArgumentParser + +def catch_key_error(dic, index): + try: + return dic[index] + except KeyError: + return False + +def main(): + + parser = argparse.ArgumentParser() + parser.add_argument("--twts", + help="choose which tweet file to use, include suffix. 
JSONL only.", + type=str, + required=True) + + args = parser.parse_args() + + with open("word_changes_sample_1k.tsv") as fd: #random sample of 1k words + #with open("word_changes.tsv") as fd: + word_changes = list(csv.reader(fd, delimiter=";", quotechar='"')) + + word_changes = word_changes[1:] #rid first row + + wordchange_start = {} + wordchange_end = {} + + for row in word_changes: + wordchange_start.update({row[0]: row[2]}) #period 1 + wordchange_end.update({row[0]: row[3]}) #period 2 + + mner = MaskedNER(census = {}) + info = [] + i = 1 + + print("Opening file") + with open(f"{args.twts}") as reader: + print("File opened") + for entry in reader: + if i%100 == 0: + print(i) + entry = json.loads(entry) #parse string as dictionary + + if entry['is_retweet']: #only unique tweets! + i = i+1 + continue + + else: + tokens = list(set(entry['normalized_tweet'].lower().split(' '))) #unique words in the current tweet + + for tk in tokens: #for every word in the tweet (tokens) + if entry['fasttext_langid'] == 'en' and entry['langid_langid'] == 'en' and entry['temp_bin'] == catch_key_error(wordchange_start, tk) and bool(string_match(entry['normalized_tweet'].lower(), {f'{tk}'})): #if the time of the tweet is the same as the time in the dictionary + #tweets_at_start[tk].append(entry['normalized_tweet'].lower()) #append to list with purpose unknown for now + top1k_start = mner.prob_name(mner.preprocess_data(entries=[entry['normalized_tweet'].lower()], span_sets=[string_match(entry['normalized_tweet'].lower(), {f'{tk}'})])[0][0], f'{tk}', True, 1000) + top10_start = mner.prob_name(mner.preprocess_data(entries=[entry['normalized_tweet'].lower()], span_sets=[string_match(entry['normalized_tweet'].lower(), {f'{tk}'})])[0][0], f'{tk}', True, 10) #unsure how to get index from this output alone + + try: + idx = list(top1k_start[1].keys()).index(tk) + 1 + except: + idx = -1 + + info.append({"word": tk, + "bin": entry['temp_bin'], + "tweet": entry['normalized_tweet'], + "top_n": top10_start[1], + "prob_word": top1k_start[0], + "word_rank": idx}) #index + 1 + + elif entry['fasttext_langid'] == 'en' and entry['langid_langid'] == 'en' and entry['temp_bin'] == catch_key_error(wordchange_end, tk) and bool(string_match(entry['normalized_tweet'].lower(), {f'{tk}'})): #if the time bins are the same + #tweets_at_end[tk].append(entry['normalized_tweet'].lower()) + top1k_end = mner.prob_name(mner.preprocess_data(entries=[entry['normalized_tweet'].lower()], span_sets=[string_match(entry['normalized_tweet'].lower(), {f'{tk}'})])[0][0], f'{tk}', True, 1000) + top10_end = mner.prob_name(mner.preprocess_data(entries=[entry['normalized_tweet'].lower()], span_sets=[string_match(entry['normalized_tweet'].lower(), {f'{tk}'})])[0][0], f'{tk}', True, 10) #unsure how to get index from this output alone + + try: + idx = list(top1k_end[1].keys()).index(tk) + 1 + except: + idx = -1 + + info.append({"word": tk, + "bin": entry['temp_bin'], + "tweet": entry['normalized_tweet'], + "top_n": top10_end[1], + "prob_word": top1k_end[0], + "word_rank": idx}) #index + 1 + + else: + continue + + i = i+1 + + + ##----SAVE FILE----## + with open(f"output-{args.twts}", 'w') as fin: + fin.write('\n'.join(map(json.dumps, info))) + + +if __name__ == "__main__": + main() \ No newline at end of file