diff --git a/Copy_of_arekit_ss.ipynb b/Copy_of_arekit_ss.ipynb new file mode 100644 index 0000000..6b7ef7d --- /dev/null +++ b/Copy_of_arekit_ss.ipynb @@ -0,0 +1,548 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "authorship_tag": "ABX9TyM75S4caHyPV3cxcjsqrIjW", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "" + ] + }, + { + "cell_type": "code", + "source": [ + "!pip install git+https://github.com/nicolay-r/arekit-ss.git@0.25.0" + ], + "metadata": { + "id": "pDjk8iivQRgr" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!python -m arekit_ss.download_data" + ], + "metadata": { + "id": "tXP4dLoxSwHi" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Sentiment Analysis examples section\n", + "\n", + "1. Scenario with the BERT-based formatter for the `RuSentRel` collection, with texts translated into English (`en`) using the Google Translate API." + ], + "metadata": { + "id": "ebkYuqXIHtwk" + } + }, + { + "cell_type": "code", + "source": [ + "!python -m arekit_ss.sample --writer csv --source rusentrel --sampler nn --dest_lang en --docs_limit 5" + ], + "metadata": { + "id": "YfuraM8nLWXL" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!python -m arekit_ss.sample --writer csv --source rusentrel --sampler bert --dest_lang en --docs_limit 5" + ], + "metadata": { + "id": "yYmPY419TKXL" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "2. Prompting scenario for further application of the sampled data in LLMs.\n", + "You can compose your own prompt:\n", + "1. 
`text` -- original text of the sentence\n", + "2. `s_ind`, `t_ind` -- word indices in the original text\n", + "3. `label` -- sentiment label of the text." + ], + "metadata": { + "id": "t_vMoU9mKhfb" + } + }, + { + "cell_type": "code", + "source": [ + "!python -m arekit_ss.sample --writer csv --source rusentrel --sampler prompt \\\n", + " --prompt \"For text: '{text}', the attitude between '{s_val}' and '{t_val}' is: '{label_val}'\" \\\n", + " --dest_lang en --docs_limit 1" + ], + "metadata": { + "id": "srhpk7TrIEbz", + "outputId": "00101ba2-754c-4d87-d5cb-68bf3dfaba53", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Loading RuSentRel Collection: 100%|████████████████████████████████████████████| 1253/1253 [00:01<00:00, 1040.72opins/s]\n", + "sample [DataType.Train]: 1550it [00:29, 51.88it/s, docs_seen=1, doc_now=1] \n", + "sample [DataType.Test]: 582it [00:09, 60.77it/s, docs_seen=1, doc_now=46] \n", + "INFO:arekit_ss:Done: _out/rusentrel-prompt-tpc50-en-l1 [csv]\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [], + "metadata": { + "id": "CglgUCmsIAge" + } + }, + { + "cell_type": "markdown", + "source": [ + "Now we can take a look at the results:" + ], + "metadata": { + "id": "s7Yf2z-xmrp_" + } + }, + { + "cell_type": "code", + "source": [ + "!pip install pandas" + ], + "metadata": { + "id": "I_fpFmYIrn9J", + "outputId": "78d513b7-be9c-48f5-eb40-cee85331e5ae", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (2.2.2)\n", + "Requirement already satisfied: numpy>=1.22.4 in /usr/local/lib/python3.10/dist-packages (from pandas) (1.26.4)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in 
/usr/local/lib/python3.10/dist-packages (from pandas) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2024.2)\n", + "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas) (2024.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "df = pd.read_csv(\"_out/rusentrel-prompt-tpc50-en-l1-train.csv\", sep='\\t')\n", + "df.head()" + ], + "metadata": { + "id": "WiPl5RgIlTfd", + "outputId": "1d832a9d-2883-444e-9f21-8d6be3594c31", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 258 + } + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " id doc_id label_uint label_str \\\n", + "0 0 1 2 NegativeTo \n", + "1 1 1 2 NegativeTo \n", + "2 2 1 2 NegativeTo \n", + "3 3 1 2 NegativeTo \n", + "4 4 1 2 NegativeTo \n", + "\n", + " text_a s_ind t_ind \\\n", + "0 For text: 'Outdoor Unknovn According to state... 7 9 \n", + "1 For text: 'Outdoor Unknovn Why is it that in ... 15 9 \n", + "2 For text: 'Outdoor Unknovn Why is it that in ... 11 15 \n", + "3 For text: 'Outdoor Unknovn The concern of USA... 9 5 \n", + "4 For text: 'Outdoor Unknovn As an example Ali-K... 5 9 \n", + "\n", + " opinion_id linkage_id entities \\\n", + "0 0 0 1,3,5,7,9,11 \n", + "1 1 0 1,3,5,7,9,11,13,15,17 \n", + "2 2 0 1,3,5,7,9,11,13,15,17 \n", + "3 3 0 1,3,5,7,9,11,13,15,17 \n", + "4 4 0 0,1,3,5,7,9 \n", + "\n", + " entity_types \n", + "0 PER,PER,ORG,LOC,ORG,LOC \n", + "1 PER,PER,LOC,LOC,ORG,LOC,GEOPOLIT,LOC,PER \n", + "2 PER,PER,LOC,LOC,ORG,LOC,GEOPOLIT,LOC,PER \n", + "3 PER,PER,LOC,PER,LOC,ORG,GEOPOLIT,GEOPOLIT,LOC \n", + "4 PER,PER,PER,LOC,LOC,GEOPOLIT " + ], + "text/html": [ + "\n", + "
\n", + " | id | \n", + "doc_id | \n", + "label_uint | \n", + "label_str | \n", + "text_a | \n", + "s_ind | \n", + "t_ind | \n", + "opinion_id | \n", + "linkage_id | \n", + "entities | \n", + "entity_types | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "0 | \n", + "1 | \n", + "2 | \n", + "NegativeTo | \n", + "For text: 'Outdoor Unknovn According to state... | \n", + "7 | \n", + "9 | \n", + "0 | \n", + "0 | \n", + "1,3,5,7,9,11 | \n", + "PER,PER,ORG,LOC,ORG,LOC | \n", + "
1 | \n", + "1 | \n", + "1 | \n", + "2 | \n", + "NegativeTo | \n", + "For text: 'Outdoor Unknovn Why is it that in ... | \n", + "15 | \n", + "9 | \n", + "1 | \n", + "0 | \n", + "1,3,5,7,9,11,13,15,17 | \n", + "PER,PER,LOC,LOC,ORG,LOC,GEOPOLIT,LOC,PER | \n", + "
2 | \n", + "2 | \n", + "1 | \n", + "2 | \n", + "NegativeTo | \n", + "For text: 'Outdoor Unknovn Why is it that in ... | \n", + "11 | \n", + "15 | \n", + "2 | \n", + "0 | \n", + "1,3,5,7,9,11,13,15,17 | \n", + "PER,PER,LOC,LOC,ORG,LOC,GEOPOLIT,LOC,PER | \n", + "
3 | \n", + "3 | \n", + "1 | \n", + "2 | \n", + "NegativeTo | \n", + "For text: 'Outdoor Unknovn The concern of USA... | \n", + "9 | \n", + "5 | \n", + "3 | \n", + "0 | \n", + "1,3,5,7,9,11,13,15,17 | \n", + "PER,PER,LOC,PER,LOC,ORG,GEOPOLIT,GEOPOLIT,LOC | \n", + "
4 | \n", + "4 | \n", + "1 | \n", + "2 | \n", + "NegativeTo | \n", + "For text: 'Outdoor Unknovn As an example Ali-K... | \n", + "5 | \n", + "9 | \n", + "4 | \n", + "0 | \n", + "0,1,3,5,7,9 | \n", + "PER,PER,PER,LOC,LOC,GEOPOLIT | \n", + "
diff --git a/dependencies.txt b/dependencies.txt index fbaff47..6c79f02 100644 --- a/dependencies.txt +++ b/dependencies.txt @@ -1,3 +1,3 @@ -arekit @ git+https://github.com/nicolay-r/AREkit@0.25.0-rc +arekit==0.25.0 googletrans==3.1.0a0 -requests>=2.28.1 \ No newline at end of file +requests>=2.28.1