From 1184801607930fbff484c592a8ca6fb0b9dcf5e1 Mon Sep 17 00:00:00 2001 From: Francisco Castillo Date: Fri, 18 Nov 2022 16:18:58 -0800 Subject: [PATCH 1/4] Add dataset example --- examples/dataset.ipynb | 473 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 473 insertions(+) create mode 100644 examples/dataset.ipynb diff --git a/examples/dataset.ipynb b/examples/dataset.ipynb new file mode 100644 index 0000000000..3e1654232a --- /dev/null +++ b/examples/dataset.ipynb @@ -0,0 +1,473 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0364599c", + "metadata": {}, + "source": [ + "# Phoenix Dataset Object\n", + "\n", + "This small tutorial is to demonstrate how we can use the 🔥🐦 Phoenix `Dataset` object. \n", + "\n", + "This object currently is composed of a dataframe and a schema. Data can be consumed from:\n", + "* Pandas DataFrame directly\n", + "* From local files: csv & hdf5" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "28f8890a", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from phoenix.datasets import Dataset, Schema, EmbeddingColumnNames" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "02c7d1e2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
prediction_tsreviewer_agereviewer_genderproduct_categorylanguagetexttext_vectorlabelpred_label
01.650092e+0921femaleapparelenglishPoor quality of fabric and ridiculously tight ...[-7.05169961e-02 6.64003372e-01 3.35792184e-...negativenegative
11.650092e+0929malekitchenenglishLove these glasses, thought they'd be everyday...[-2.44109239e-03 -5.40627480e-01 3.17134917e-...positivepositive
21.650093e+0926femalesportsenglishThese are disgusting, it tastes like you are \"...[ 4.04878825e-01 8.23539615e-01 3.83339435e-...negativenegative
31.650093e+0926maleotherenglishMy husband has a pair of TaoTronics so I decid...[ 0.01881652 0.53441304 0.4907303 -0.024163...neutralneutral
41.650093e+0937malehome_improvementenglishThreads too deep. Engages on tank, but gasket ...[-0.25348073 0.31603432 0.35810202 -0.246728...negativenegative
\n", + "
" + ], + "text/plain": [ + " prediction_ts reviewer_age reviewer_gender product_category language \\\n", + "0 1.650092e+09 21 female apparel english \n", + "1 1.650092e+09 29 male kitchen english \n", + "2 1.650093e+09 26 female sports english \n", + "3 1.650093e+09 26 male other english \n", + "4 1.650093e+09 37 male home_improvement english \n", + "\n", + " text \\\n", + "0 Poor quality of fabric and ridiculously tight ... \n", + "1 Love these glasses, thought they'd be everyday... \n", + "2 These are disgusting, it tastes like you are \"... \n", + "3 My husband has a pair of TaoTronics so I decid... \n", + "4 Threads too deep. Engages on tank, but gasket ... \n", + "\n", + " text_vector label pred_label \n", + "0 [-7.05169961e-02 6.64003372e-01 3.35792184e-... negative negative \n", + "1 [-2.44109239e-03 -5.40627480e-01 3.17134917e-... positive positive \n", + "2 [ 4.04878825e-01 8.23539615e-01 3.83339435e-... negative negative \n", + "3 [ 0.01881652 0.53441304 0.4907303 -0.024163... neutral neutral \n", + "4 [-0.25348073 0.31603432 0.35810202 -0.246728... 
negative negative " + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_filename = \"NLP_sentiment_classification_language_drift\"\n", + "\n", + "df1 = pd.read_csv(f\"./fixtures/{test_filename}.csv\")\n", + "df1.head()" + ] + }, + { + "cell_type": "markdown", + "id": "cc50b386", + "metadata": {}, + "source": [ + "Define the schema same as you would in our SDK" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "ae788250", + "metadata": {}, + "outputs": [], + "source": [ + "features = [\n", + " 'reviewer_age',\n", + " 'reviewer_gender',\n", + " 'product_category',\n", + " 'language',\n", + "]\n", + "\n", + "embedding_features = {\n", + " \"embedding_feature\": EmbeddingColumnNames(\n", + " vector_column_name=\"text_vector\", # Will be name of embedding feature in the app\n", + " data_column_name=\"text\",\n", + " ),\n", + "}\n", + "\n", + "# Define a Schema() object for Arize to pick up data from the correct columns for logging\n", + "schema = Schema(\n", + " prediction_id_column_name=\"prediction_id\",\n", + " timestamp_column_name=\"prediction_ts\",\n", + " prediction_label_column_name=\"pred_label\",\n", + " actual_label_column_name=\"label\",\n", + " feature_column_names=features,\n", + " embedding_feature_column_names=embedding_features\n", + ")\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "d35bed3c", + "metadata": {}, + "source": [ + "You are ready to define a `Dataset`" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "af7450a5", + "metadata": {}, + "outputs": [], + "source": [ + "# Defined directly from dataframe\n", + "dataset1 = Dataset(df1,schema)\n", + "dataset2 = Dataset.from_dataframe(df1, schema)\n", + "# Defined from csv\n", + "dataset3 = Dataset.from_csv(f\"./fixtures/{test_filename}.csv\", schema=schema)\n", + "# Defined from hdf5\n", + "dataset4 = Dataset.from_hdf(f\"./fixtures/{test_filename}.hdf5\", schema=schema, 
key=\"training\")" + ] + }, + { + "cell_type": "markdown", + "id": "4ce754b1", + "metadata": {}, + "source": [ + "The following is an issue we need to investigate. All four datasets compare as equal, which at first glance seems fine. However, when loading a CSV file, the embeddings are read as strings (an issue has been filed to fix this). Hence the following condition should not be `True`." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "8ca50f81", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset1==dataset2==dataset3==dataset4" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "7a9f0f11", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
timestampreviewer_agereviewer_genderproduct_categorylanguagetexttext_vectoractual_labelpred_label
01.650092e+0921femaleapparelenglishPoor quality of fabric and ridiculously tight ...[-7.05169961e-02 6.64003372e-01 3.35792184e-...negativenegative
11.650092e+0929malekitchenenglishLove these glasses, thought they'd be everyday...[-2.44109239e-03 -5.40627480e-01 3.17134917e-...positivepositive
21.650093e+0926femalesportsenglishThese are disgusting, it tastes like you are \"...[ 4.04878825e-01 8.23539615e-01 3.83339435e-...negativenegative
31.650093e+0926maleotherenglishMy husband has a pair of TaoTronics so I decid...[ 0.01881652 0.53441304 0.4907303 -0.024163...neutralneutral
41.650093e+0937malehome_improvementenglishThreads too deep. Engages on tank, but gasket ...[-0.25348073 0.31603432 0.35810202 -0.246728...negativenegative
\n", + "
" + ], + "text/plain": [ + " timestamp reviewer_age reviewer_gender product_category language \\\n", + "0 1.650092e+09 21 female apparel english \n", + "1 1.650092e+09 29 male kitchen english \n", + "2 1.650093e+09 26 female sports english \n", + "3 1.650093e+09 26 male other english \n", + "4 1.650093e+09 37 male home_improvement english \n", + "\n", + " text \\\n", + "0 Poor quality of fabric and ridiculously tight ... \n", + "1 Love these glasses, thought they'd be everyday... \n", + "2 These are disgusting, it tastes like you are \"... \n", + "3 My husband has a pair of TaoTronics so I decid... \n", + "4 Threads too deep. Engages on tank, but gasket ... \n", + "\n", + " text_vector actual_label pred_label \n", + "0 [-7.05169961e-02 6.64003372e-01 3.35792184e-... negative negative \n", + "1 [-2.44109239e-03 -5.40627480e-01 3.17134917e-... positive positive \n", + "2 [ 4.04878825e-01 8.23539615e-01 3.83339435e-... negative negative \n", + "3 [ 0.01881652 0.53441304 0.4907303 -0.024163... neutral neutral \n", + "4 [-0.25348073 0.31603432 0.35810202 -0.246728... 
negative negative " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2 = df1.copy()\n", + "df2.rename(\n", + " columns={\n", + " \"prediction_ts\":\"timestamp\",\n", + " \"label\":\"actual_label\"\n", + " },\n", + " inplace=True\n", + ")\n", + "df2.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "a5ec5454", + "metadata": {}, + "outputs": [], + "source": [ + "# Define a Schema() object for Arize to pick up data from the correct columns for logging\n", + "schema = Schema(\n", + " prediction_id_column_name=\"prediction_id\",\n", + " timestamp_column_name=\"timestamp\",\n", + " prediction_label_column_name=\"pred_label\",\n", + " actual_label_column_name=\"actual_label\",\n", + " feature_column_names=features,\n", + " embedding_feature_column_names=embedding_features\n", + ")\n", + "dataset5 = Dataset(df1,schema)" + ] + }, + { + "cell_type": "markdown", + "id": "f74d293f", + "metadata": {}, + "source": [ + "This is another issue. In this case we have different dataframes with different schemas. However, the `Dataset` objects still compare as equal, which is unexpected. (Note: `dataset5` above is constructed from `df1` rather than the renamed `df2`; confirm whether that was intended.)"
+ ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "8128eda1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset1==dataset5" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 8c5ccacff152e90ef782a43d28e93d3be5f66dac Mon Sep 17 00:00:00 2001 From: Francisco Castillo Date: Fri, 18 Nov 2022 16:19:30 -0800 Subject: [PATCH 2/4] Add reading from hdf5 --- phoenix/datasets/dataset.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/phoenix/datasets/dataset.py b/phoenix/datasets/dataset.py index 85bee5618a..88d2290f66 100644 --- a/phoenix/datasets/dataset.py +++ b/phoenix/datasets/dataset.py @@ -1,7 +1,7 @@ from dataclasses import dataclass from typing import Optional -from pandas import DataFrame, Series, read_csv +from pandas import DataFrame, Series, read_csv, read_hdf from .types import Schema @@ -66,6 +66,10 @@ def from_dataframe(cls, dataframe: DataFrame, schema: Schema): def from_csv(cls, filepath: str, schema: Schema): return cls(read_csv(filepath), schema) + @classmethod + def from_hdf(cls, filepath: str, schema: Schema, key: Optional[str] = None): + return cls(read_hdf(filepath, key), schema) + @staticmethod def _parse_dataframe(dataframe: DataFrame, schema: Schema) -> DataFrame: schema_cols = [ From bf30fc1c548f6bbe5afcc31c9fa25e2a7b00214a Mon Sep 17 00:00:00 2001 From: Francisco Castillo Date: Fri, 18 Nov 2022 16:44:39 -0800 Subject: [PATCH 3/4] Update gitignore --- .gitignore | 1 + 1 file 
changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 13d5149251..d87dea1080 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ node_modules dist *__pycache__* **/.ipynb_checkpoints/ +examples/fixtures/* \ No newline at end of file From 2a9d523911bc6ed61a48e1ab5571b0c476bae218 Mon Sep 17 00:00:00 2001 From: Francisco Castillo Date: Fri, 18 Nov 2022 17:47:21 -0800 Subject: [PATCH 4/4] UMAP drift update --- examples/umap_drift.ipynb | 3170 ++++++++++++++++++++++++++++++++++++- phoenix/umap/umap.py | 2 +- 2 files changed, 3163 insertions(+), 9 deletions(-) diff --git a/examples/umap_drift.ipynb b/examples/umap_drift.ipynb index 7e229a6e54..d84cb4e6eb 100644 --- a/examples/umap_drift.ipynb +++ b/examples/umap_drift.ipynb @@ -1,28 +1,3183 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Phoenix Embeddings\n", + "\n", + "This small tutorial goes over creating Phoenix's `Dataset` objects and using them to obtain a UMAP pointcloud using the `UMAPWidget`" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "from arize_toolbox.widgets.umapWidget import UMAPWidget\n" + "from phoenix.widgets import UMAPWidget\n", + "from phoenix.datasets import Dataset, Schema, EmbeddingColumnNames\n", + "from phoenix.umap import CalculateUMAP" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "widget = UMAPWidget()\n", - "widget.show()\n" + "test_filename = \"NLP_sentiment_classification_language_drift\"\n", + "\n", + "features = [\n", + " 'reviewer_age',\n", + " 'reviewer_gender',\n", + " 'product_category',\n", + " 'language',\n", + "]\n", + "\n", + "embedding_features = {\n", + " \"embedding_feature\": EmbeddingColumnNames(\n", + " vector_column_name=\"text_vector\", # Will be name of embedding feature in the app\n", + " data_column_name=\"text\",\n", + " ),\n", + "}\n", + "\n", + "# 
Define a Schema() object for Arize to pick up data from the correct columns for logging\n", + "schema = Schema(\n", + " prediction_id_column_name=\"prediction_id\",\n", + " timestamp_column_name=\"prediction_ts\",\n", + " prediction_label_column_name=\"pred_label\",\n", + " actual_label_column_name=\"label\",\n", + " feature_column_names=features,\n", + " embedding_feature_column_names=embedding_features\n", + ")\n", + "\n", + "train_ds = Dataset.from_hdf(f\"./fixtures/{test_filename}.hdf5\", schema=schema, key=\"training\")\n", + "prod_ds = Dataset.from_hdf(f\"./fixtures/{test_filename}.hdf5\", schema=schema, key=\"production\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "OMP: Info #271: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.\n" + ] + } + ], + "source": [ + "# Obtain the point cloud\n", + "pc = CalculateUMAP(prod_ds, train_ds, \"embedding_feature\")" ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + "
\n", + "
\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "widget = UMAPWidget(pc.to_json())\n", + "widget.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3.10.3 ('notebook')", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -36,9 +3191,8 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.3" + "version": "3.8.13" }, - "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "f8476af36f82f278d0224c299b3c84ccca1fb7344702eda80d935d8f2c34d234" diff --git a/phoenix/umap/umap.py b/phoenix/umap/umap.py index 27d319ef73..ab989e2031 100644 --- a/phoenix/umap/umap.py +++ b/phoenix/umap/umap.py @@ -73,7 +73,7 @@ def CalculateUMAP( primary_dataset: Dataset, reference_dataset: Dataset, embedding_feature: str, - n_components: Optional[int] = 2, + n_components: Optional[int] = 3, n_neighbors: Optional[int] = 15, min_dist: Optional[int] = 0.1, ) -> PointCloud: