updated adhoc prepare featurestore

rdelubi · Dec 14, 2020 · 9e62abf · 9e62abf
1 parent 4d6a663
commit 9e62abf
Show file tree

Hide file tree

Showing 3 changed files with 133 additions and 67 deletions.
diff --git a/06_prepare/99_Prepare_Dataset_BERT_Scikit_AdHoc_FeatureStore.ipynb b/06_prepare/99_Prepare_Dataset_BERT_Scikit_AdHoc_FeatureStore.ipynb
@@ -273,11 +273,11 @@
     "\n",
     "# Use the InputExample class from BERT's run_classifier code to create examples from the data\n",
     "inputs = df.apply(lambda x: Input(\n",
-    "                    label = x[LABEL_COLUMN],\n",
-    "                    text = x[REVIEW_BODY_COLUMN],\n",
-    "                    review_id = x[REVIEW_ID_COLUMN],\n",
-    "                    date = timestamp\n",
-    "                                ),\n",
+    "                                label = x[LABEL_COLUMN],\n",
+    "                                text = x[REVIEW_BODY_COLUMN],\n",
+    "                                review_id = x[REVIEW_ID_COLUMN],\n",
+    "                                date = timestamp\n",
+    "                            ),\n",
     "                  axis = 1)\n"
    ]
   },
@@ -855,14 +855,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 44,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Running SELECT embedding FROM \"reviews-feature-group-14-06-16-36-1607926596\"\n"
+      "Running SELECT embedding FROM \"reviews-feature-group-14-06-16-36-1607926596\" LIMIT 1\n"
      ]
     },
     {
@@ -903,7 +903,7 @@
        "0  b'\\n\\xad\\x02\\nS\\n\\x0bsegment_ids\\x12D\\x1aB\\n@\\..."
       ]
      },
-     "execution_count": 35,
+     "execution_count": 44,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -914,7 +914,7 @@
     "reviews_table = reviews_query.table_name\n",
     "# sagemaker_featurestore.reviews-feature-group-14-06-16-36\n",
     "\n",
-    "query_string = 'SELECT embedding FROM \"'+reviews_table+'\"'\n",
+    "query_string = 'SELECT embedding FROM \"'+reviews_table+'\" LIMIT 1'\n",
     "print('Running ' + query_string)\n",
     "\n",
     "# run Athena query. The output is loaded to a Pandas dataframe.\n",
@@ -928,76 +928,93 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # Prepare query results for training.\n",
+    "# query_execution = reviews_query.get_query_execution()\n",
+    "# query_result = 's3://'+bucket+'/'+prefix+'/query_results/'+query_execution['QueryExecution']['QueryExecutionId']+'.csv'\n",
+    "# print(query_result)\n",
+    "\n",
+    "# # Select useful columns for training with target column as the first.\n",
+    "# dataset = dataset[[\"embedding\"]]\n",
+    "# dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# TODO: Check whether to_csv is OK\n",
+    "## Possibly change to something else (might be OK for now, as we have only one column)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "file_name = './data-tfrecord-featurestore/reviews-embeddings.tfrecord'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Write to csv in S3 without headers and index column.\n",
+    "# dataset.to_csv(file_name, header=False, index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from io import TextIOWrapper\n",
+    "\n",
+    "with open(file_name, mode=\"wb\") as fd:\n",
+    "  dataset.to_csv(TextIOWrapper(fd), header=False, index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#s3.upload_file('reviews-embeddings.csv', bucket, prefix+'/training_input/reviews_embeddings.csv')\n",
+    "#dataset_uri_prefix = 's3://'+bucket+'/'+prefix+'/training_input/';"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 55,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "s3://sagemaker-us-east-1-806570384721/reviews_feature_store/query_results/f9dd9e76-272e-4fad-accd-b7cf5ade83b5.csv\n"
+      "b'\\n\\xad\\x02\\nS\\n\\x0bsegment_ids\\x12D\\x1aB\\n@\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\nn\\n\\tinput_ids\\x12a\\x1a_\\n]e\\x834\\xf2\\x07\\xc0\\x1e\\xcd\\x0f\\xea\\x0f\\xc1J\\xc3\\x13\\xf2\\x07\\xce\\x0f\\x95\\x08\\x90\\x10\\xed\\x07\\xa0\\x08\\x9f&\\xa47\\xf8F\\xdc\\x10\\xd9\\x0f\\xf4\\x07\\x95\\x08\\xb4\\x11\\xe7\\x0f\\xbf\\x1f\\xd3\\x0f\\xdbx\\xfb>\\xce\\x0f\\x8f\\x8e\\x01f\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\nR\\n\\ninput_mask\\x12D\\x1aB\\n@\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\n\\x12\\n\\tlabel_ids\\x12\\x05\\x1a\\x03\\n\\x01\\x00'\n"
      ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>embedding</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>b'\\n\\xad\\x02\\nS\\n\\x0bsegment_ids\\x12D\\x1aB\\n@\\...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                           embedding\n",
-       "0  b'\\n\\xad\\x02\\nS\\n\\x0bsegment_ids\\x12D\\x1aB\\n@\\..."
-      ]
-     },
-     "execution_count": 37,
-     "metadata": {},
-     "output_type": "execute_result"
     }
    ],
    "source": [
-    "# Prepare query results for training.\n",
-    "query_execution = reviews_query.get_query_execution()\n",
-    "query_result = 's3://'+bucket+'/'+prefix+'/query_results/'+query_execution['QueryExecution']['QueryExecutionId']+'.csv'\n",
-    "print(query_result)\n",
-    "\n",
-    "# Select useful columns for training with target column as the first.\n",
-    "dataset = dataset[[\"embedding\"]]\n",
-    "\n",
-    "# Write to csv in S3 without headers and index column.\n",
-    "dataset.to_csv('reviews-embeddings.csv', header=False, index=False)\n",
-    "s3.upload_file('reviews-embeddings.csv', bucket, prefix+'/training_input/reviews_embeddings.csv')\n",
-    "dataset_uri_prefix = 's3://'+bucket+'/'+prefix+'/training_input/';\n",
-    "\n",
-    "dataset"
+    "!head $file_name"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -1007,6 +1024,53 @@
     "Now it's time to launch a Training job to fit our model."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tf_record_dataset = tf.data.TFRecordDataset(file_name)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<TFRecordDatasetV2 shapes: (), types: tf.string>\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(tf_record_dataset)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "code",
    "execution_count": null,

diff --git a/06_prepare/data-tfrecord-featurestore/reviews-embeddings.tfrecord b/06_prepare/data-tfrecord-featurestore/reviews-embeddings.tfrecord
@@ -0,0 +1 @@
+b'\n\xad\x02\nS\n\x0bsegment_ids\x12D\x1aB\n@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\nn\n\tinput_ids\x12a\x1a_\n]e\x834\xf2\x07\xc0\x1e\xcd\x0f\xea\x0f\xc1J\xc3\x13\xf2\x07\xce\x0f\x95\x08\x90\x10\xed\x07\xa0\x08\x9f&\xa47\xf8F\xdc\x10\xd9\x0f\xf4\x07\x95\x08\xb4\x11\xe7\x0f\xbf\x1f\xd3\x0f\xdbx\xfb>\xce\x0f\x8f\x8e\x01f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\nR\n\ninput_mask\x12D\x1aB\n@\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\n\x12\n\tlabel_ids\x12\x05\x1a\x03\n\x01\x00'
diff --git a/07_train/data-tfrecord-featurestore/reviews-embeddings.tfrecord b/07_train/data-tfrecord-featurestore/reviews-embeddings.tfrecord
@@ -0,0 +1 @@
+\n\xad\x02\nS\n\x0bsegment_ids\x12D\x1aB\n@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\nn\n\tinput_ids\x12a\x1a_\n]e\x834\xf2\x07\xc0\x1e\xcd\x0f\xea\x0f\xc1J\xc3\x13\xf2\x07\xce\x0f\x95\x08\x90\x10\xed\x07\xa0\x08\x9f&\xa47\xf8F\xdc\x10\xd9\x0f\xf4\x07\x95\x08\xb4\x11\xe7\x0f\xbf\x1f\xd3\x0f\xdbx\xfb>\xce\x0f\x8f\x8e\x01f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\nR\n\ninput_mask\x12D\x1aB\n@\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\n\x12\n\tlabel_ids\x12\x05\x1a\x03\n\x01\x00
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		b'\n\xad\x02\nS\n\x0bsegment_ids\x12D\x1aB\n@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\nn\n\tinput_ids\x12a\x1a_\n]e\x834\xf2\x07\xc0\x1e\xcd\x0f\xea\x0f\xc1J\xc3\x13\xf2\x07\xce\x0f\x95\x08\x90\x10\xed\x07\xa0\x08\x9f&\xa47\xf8F\xdc\x10\xd9\x0f\xf4\x07\x95\x08\xb4\x11\xe7\x0f\xbf\x1f\xd3\x0f\xdbx\xfb>\xce\x0f\x8f\x8e\x01f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\nR\n\ninput_mask\x12D\x1aB\n@\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\n\x12\n\tlabel_ids\x12\x05\x1a\x03\n\x01\x00'
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		\n\xad\x02\nS\n\x0bsegment_ids\x12D\x1aB\n@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\nn\n\tinput_ids\x12a\x1a_\n]e\x834\xf2\x07\xc0\x1e\xcd\x0f\xea\x0f\xc1J\xc3\x13\xf2\x07\xce\x0f\x95\x08\x90\x10\xed\x07\xa0\x08\x9f&\xa47\xf8F\xdc\x10\xd9\x0f\xf4\x07\x95\x08\xb4\x11\xe7\x0f\xbf\x1f\xd3\x0f\xdbx\xfb>\xce\x0f\x8f\x8e\x01f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\nR\n\ninput_mask\x12D\x1aB\n@\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\n\x12\n\tlabel_ids\x12\x05\x1a\x03\n\x01\x00