diff --git a/Copy_of_arekit_ss.ipynb b/Copy_of_arekit_ss.ipynb new file mode 100644 index 0000000..6b7ef7d --- /dev/null +++ b/Copy_of_arekit_ss.ipynb @@ -0,0 +1,548 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "authorship_tag": "ABX9TyM75S4caHyPV3cxcjsqrIjW", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "source": [ + "!pip install git+https://github.com/nicolay-r/arekit-ss.git@0.25.0" + ], + "metadata": { + "id": "pDjk8iivQRgr" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!python -m arekit_ss.download_data" + ], + "metadata": { + "id": "tXP4dLoxSwHi" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Sentiment Analysis examples section\n", + "\n", + "1. Scenario with the BERT-based formatter for the `RuSentRel` collection, with texts translated into English (`en`) using the Google Translate API." + ], + "metadata": { + "id": "ebkYuqXIHtwk" + } + }, + { + "cell_type": "code", + "source": [ + "!python -m arekit_ss.sample --writer csv --source rusentrel --sampler nn --dest_lang en --docs_limit 5" + ], + "metadata": { + "id": "YfuraM8nLWXL" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!python -m arekit_ss.sample --writer csv --source rusentrel --sampler bert --dest_lang en --docs_limit 5" + ], + "metadata": { + "id": "yYmPY419TKXL" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "2. Prompting scenario for further application of the sampled data in LLMs.\n", + "You can compose your own prompt using the following parameters:\n", + "1. 
`text` -- original text of the sentence\n", + "2. `s_ind`, `t_ind` -- word indices in the original text\n", + "3. `label` -- sentiment label of the text." + ], + "metadata": { + "id": "t_vMoU9mKhfb" + } + }, + { + "cell_type": "code", + "source": [ + "!python -m arekit_ss.sample --writer csv --source rusentrel --sampler prompt \\\n", + " --prompt \"For text: '{text}', the attitude between '{s_val}' and '{t_val}' is: '{label_val}'\" \\\n", + " --dest_lang en --docs_limit 1" + ], + "metadata": { + "id": "srhpk7TrIEbz", + "outputId": "00101ba2-754c-4d87-d5cb-68bf3dfaba53", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Loading RuSentRel Collection: 100%|████████████████████████████████████████████| 1253/1253 [00:01<00:00, 1040.72opins/s]\n", + "sample [DataType.Train]: 1550it [00:29, 51.88it/s, docs_seen=1, doc_now=1] \n", + "sample [DataType.Test]: 582it [00:09, 60.77it/s, docs_seen=1, doc_now=46] \n", + "INFO:arekit_ss:Done: _out/rusentrel-prompt-tpc50-en-l1 [csv]\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [], + "metadata": { + "id": "CglgUCmsIAge" + } + }, + { + "cell_type": "markdown", + "source": [ + "Now we can take a look at the results:" + ], + "metadata": { + "id": "s7Yf2z-xmrp_" + } + }, + { + "cell_type": "code", + "source": [ + "!pip install pandas" + ], + "metadata": { + "id": "I_fpFmYIrn9J", + "outputId": "78d513b7-be9c-48f5-eb40-cee85331e5ae", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (2.2.2)\n", + "Requirement already satisfied: numpy>=1.22.4 in /usr/local/lib/python3.10/dist-packages (from pandas) (1.26.4)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in 
/usr/local/lib/python3.10/dist-packages (from pandas) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2024.2)\n", + "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas) (2024.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "df = pd.read_csv(\"_out/rusentrel-prompt-tpc50-en-l1-train.csv\", sep='\\t')\n", + "df.head()" + ], + "metadata": { + "id": "WiPl5RgIlTfd", + "outputId": "1d832a9d-2883-444e-9f21-8d6be3594c31", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 258 + } + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " id doc_id label_uint label_str \\\n", + "0 0 1 2 NegativeTo \n", + "1 1 1 2 NegativeTo \n", + "2 2 1 2 NegativeTo \n", + "3 3 1 2 NegativeTo \n", + "4 4 1 2 NegativeTo \n", + "\n", + " text_a s_ind t_ind \\\n", + "0 For text: 'Outdoor Unknovn According to state... 7 9 \n", + "1 For text: 'Outdoor Unknovn Why is it that in ... 15 9 \n", + "2 For text: 'Outdoor Unknovn Why is it that in ... 11 15 \n", + "3 For text: 'Outdoor Unknovn The concern of USA... 9 5 \n", + "4 For text: 'Outdoor Unknovn As an example Ali-K... 5 9 \n", + "\n", + " opinion_id linkage_id entities \\\n", + "0 0 0 1,3,5,7,9,11 \n", + "1 1 0 1,3,5,7,9,11,13,15,17 \n", + "2 2 0 1,3,5,7,9,11,13,15,17 \n", + "3 3 0 1,3,5,7,9,11,13,15,17 \n", + "4 4 0 0,1,3,5,7,9 \n", + "\n", + " entity_types \n", + "0 PER,PER,ORG,LOC,ORG,LOC \n", + "1 PER,PER,LOC,LOC,ORG,LOC,GEOPOLIT,LOC,PER \n", + "2 PER,PER,LOC,LOC,ORG,LOC,GEOPOLIT,LOC,PER \n", + "3 PER,PER,LOC,PER,LOC,ORG,GEOPOLIT,GEOPOLIT,LOC \n", + "4 PER,PER,PER,LOC,LOC,GEOPOLIT " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddoc_idlabel_uintlabel_strtext_as_indt_indopinion_idlinkage_identitiesentity_types
0012NegativeToFor text: 'Outdoor Unknovn According to state...79001,3,5,7,9,11PER,PER,ORG,LOC,ORG,LOC
1112NegativeToFor text: 'Outdoor Unknovn Why is it that in ...159101,3,5,7,9,11,13,15,17PER,PER,LOC,LOC,ORG,LOC,GEOPOLIT,LOC,PER
2212NegativeToFor text: 'Outdoor Unknovn Why is it that in ...1115201,3,5,7,9,11,13,15,17PER,PER,LOC,LOC,ORG,LOC,GEOPOLIT,LOC,PER
3312NegativeToFor text: 'Outdoor Unknovn The concern of USA...95301,3,5,7,9,11,13,15,17PER,PER,LOC,PER,LOC,ORG,GEOPOLIT,GEOPOLIT,LOC
4412NegativeToFor text: 'Outdoor Unknovn As an example Ali-K...59400,1,3,5,7,9PER,PER,PER,LOC,LOC,GEOPOLIT
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df", + "summary": "{\n \"name\": \"df\",\n \"rows\": 1550,\n \"fields\": [\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 447,\n \"min\": 0,\n \"max\": 1549,\n \"num_unique_values\": 1550,\n \"samples\": [\n 1519,\n 1441,\n 351\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"doc_id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 1,\n \"max\": 1,\n \"num_unique_values\": 1,\n \"samples\": [\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label_uint\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 2,\n \"num_unique_values\": 3,\n \"samples\": [\n 2\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label_str\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"NegativeTo\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text_a\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1486,\n \"samples\": [\n \"For text: 'Outdoor Unknovn Vladimir-Putin immediately dismissed seven high-ranking security officials: among them - the southern transport prosecutor, the head of Office-of-the-Federal-Penitentiary-Service for the city Moscow , and also deputy head of Ministry-of-Internal-Affairs Republic-of-Crimea .', the attitude between 'Unknovn' and 'Republic of Crimea' is: 'no-label'\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"s_ind\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 4,\n \"min\": 0,\n \"max\": 19,\n \"num_unique_values\": 17,\n \"samples\": [\n 7\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"t_ind\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 
4,\n \"min\": 0,\n \"max\": 19,\n \"num_unique_values\": 17,\n \"samples\": [\n 9\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"opinion_id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 447,\n \"min\": 0,\n \"max\": 1549,\n \"num_unique_values\": 1550,\n \"samples\": [\n 1519\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"linkage_id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 0,\n \"num_unique_values\": 1,\n \"samples\": [\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"entities\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 18,\n \"samples\": [\n \"1,3,5,7,9,11\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"entity_types\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 44,\n \"samples\": [\n \"PER,PER,ORG,LOC\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 11 + } + ] + } + ] +} \ No newline at end of file diff --git a/README.md b/README.md index d10847d..6030054 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,11 @@ ## arekit-ss 0.25.0 ![](https://img.shields.io/badge/Python-3.9-brightgreen.svg) -![](https://img.shields.io/badge/AREkit-0.24.0-orange.svg) +![](https://img.shields.io/badge/AREkit-0.25.0-orange.svg) [![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nicolay-r/arekit-ss/blob/master/arekit_ss.ipynb) +### [📜 List of bound sources](https://github.com/nicolay-r/AREkit/wiki/Binded-Sources) +

diff --git a/dependencies.txt b/dependencies.txt index fbaff47..6c79f02 100644 --- a/dependencies.txt +++ b/dependencies.txt @@ -1,3 +1,3 @@ -arekit @ git+https://github.com/nicolay-r/AREkit@0.25.0-rc +arekit==0.25.0 googletrans==3.1.0a0 -requests>=2.28.1 \ No newline at end of file +requests>=2.28.1