Skip to content

Commit

Permalink
updated adhoc prepare featurestore
Browse files Browse the repository at this point in the history
  • Loading branch information
antje committed Dec 14, 2020
1 parent 4d6a663 commit 9e62abf
Show file tree
Hide file tree
Showing 3 changed files with 133 additions and 67 deletions.
198 changes: 131 additions & 67 deletions 06_prepare/99_Prepare_Dataset_BERT_Scikit_AdHoc_FeatureStore.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -273,11 +273,11 @@
"\n",
"# Use the InputExample class from BERT's run_classifier code to create examples from the data\n",
"inputs = df.apply(lambda x: Input(\n",
" label = x[LABEL_COLUMN],\n",
" text = x[REVIEW_BODY_COLUMN],\n",
" review_id = x[REVIEW_ID_COLUMN],\n",
" date = timestamp\n",
" ),\n",
" label = x[LABEL_COLUMN],\n",
" text = x[REVIEW_BODY_COLUMN],\n",
" review_id = x[REVIEW_ID_COLUMN],\n",
" date = timestamp\n",
" ),\n",
" axis = 1)\n"
]
},
Expand Down Expand Up @@ -855,14 +855,14 @@
},
{
"cell_type": "code",
"execution_count": 35,
"execution_count": 44,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Running SELECT embedding FROM \"reviews-feature-group-14-06-16-36-1607926596\"\n"
"Running SELECT embedding FROM \"reviews-feature-group-14-06-16-36-1607926596\" LIMIT 1\n"
]
},
{
Expand Down Expand Up @@ -903,7 +903,7 @@
"0 b'\\n\\xad\\x02\\nS\\n\\x0bsegment_ids\\x12D\\x1aB\\n@\\..."
]
},
"execution_count": 35,
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -914,7 +914,7 @@
"reviews_table = reviews_query.table_name\n",
"# sagemaker_featurestore.reviews-feature-group-14-06-16-36\n",
"\n",
"query_string = 'SELECT embedding FROM \"'+reviews_table+'\"'\n",
"query_string = 'SELECT embedding FROM \"'+reviews_table+'\" LIMIT 1'\n",
"print('Running ' + query_string)\n",
"\n",
"# run Athena query. The output is loaded to a Pandas dataframe.\n",
Expand All @@ -928,76 +928,93 @@
},
{
"cell_type": "code",
"execution_count": 37,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# # Prepare query results for training.\n",
"# query_execution = reviews_query.get_query_execution()\n",
"# query_result = 's3://'+bucket+'/'+prefix+'/query_results/'+query_execution['QueryExecution']['QueryExecutionId']+'.csv'\n",
"# print(query_result)\n",
"\n",
"# # Select useful columns for training with target column as the first.\n",
"# dataset = dataset[[\"embedding\"]]\n",
"# dataset"
]
},
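{
"cell_type": "markdown",
"metadata": {},
"source": [
"# TODO: Check whether `to_csv` is OK here\n",
"## Possibly change to something else (might be OK for now, as we only have one column)\n",
"\n",
"Note: `to_csv` writes the textual repr of the embedding bytes (see the `head` output below), which may not be readable as a TFRecord file - verify before training."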
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"file_name = './data-tfrecord-featurestore/reviews-embeddings.tfrecord'"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"# Write to a local file (path in file_name) without headers and index column.\n",
"# dataset.to_csv(file_name, header=False, index=False)"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"from io import TextIOWrapper\n",
"\n",
"with open(file_name, mode=\"wb\") as fd:\n",
" dataset.to_csv(TextIOWrapper(fd), header=False, index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#s3.upload_file('reviews-embeddings.csv', bucket, prefix+'/training_input/reviews_embeddings.csv')\n",
"#dataset_uri_prefix = 's3://'+bucket+'/'+prefix+'/training_input/';"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"s3://sagemaker-us-east-1-806570384721/reviews_feature_store/query_results/f9dd9e76-272e-4fad-accd-b7cf5ade83b5.csv\n"
"b'\\n\\xad\\x02\\nS\\n\\x0bsegment_ids\\x12D\\x1aB\\n@\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\nn\\n\\tinput_ids\\x12a\\x1a_\\n]e\\x834\\xf2\\x07\\xc0\\x1e\\xcd\\x0f\\xea\\x0f\\xc1J\\xc3\\x13\\xf2\\x07\\xce\\x0f\\x95\\x08\\x90\\x10\\xed\\x07\\xa0\\x08\\x9f&\\xa47\\xf8F\\xdc\\x10\\xd9\\x0f\\xf4\\x07\\x95\\x08\\xb4\\x11\\xe7\\x0f\\xbf\\x1f\\xd3\\x0f\\xdbx\\xfb>\\xce\\x0f\\x8f\\x8e\\x01f\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\nR\\n\\ninput_mask\\x12D\\x1aB\\n@\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x01\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\n\\x12\\n\\tlabel_ids\\x12\\x05\\x1a\\x03\\n\\x01\\x00'\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>embedding</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>b'\\n\\xad\\x02\\nS\\n\\x0bsegment_ids\\x12D\\x1aB\\n@\\...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" embedding\n",
"0 b'\\n\\xad\\x02\\nS\\n\\x0bsegment_ids\\x12D\\x1aB\\n@\\..."
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Prepare query results for training.\n",
"query_execution = reviews_query.get_query_execution()\n",
"query_result = 's3://'+bucket+'/'+prefix+'/query_results/'+query_execution['QueryExecution']['QueryExecutionId']+'.csv'\n",
"print(query_result)\n",
"\n",
"# Select useful columns for training with target column as the first.\n",
"dataset = dataset[[\"embedding\"]]\n",
"\n",
"# Write to csv in S3 without headers and index column.\n",
"dataset.to_csv('reviews-embeddings.csv', header=False, index=False)\n",
"s3.upload_file('reviews-embeddings.csv', bucket, prefix+'/training_input/reviews_embeddings.csv')\n",
"dataset_uri_prefix = 's3://'+bucket+'/'+prefix+'/training_input/';\n",
"\n",
"dataset"
"!head $file_name"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -1007,6 +1024,53 @@
"Now it's time to launch a Training job to fit our model."
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"tf_record_dataset = tf.data.TFRecordDataset(file_name)\n"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<TFRecordDatasetV2 shapes: (), types: tf.string>\n"
]
}
],
"source": [
"print(tf_record_dataset)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
b'\n\xad\x02\nS\n\x0bsegment_ids\x12D\x1aB\n@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\nn\n\tinput_ids\x12a\x1a_\n]e\x834\xf2\x07\xc0\x1e\xcd\x0f\xea\x0f\xc1J\xc3\x13\xf2\x07\xce\x0f\x95\x08\x90\x10\xed\x07\xa0\x08\x9f&\xa47\xf8F\xdc\x10\xd9\x0f\xf4\x07\x95\x08\xb4\x11\xe7\x0f\xbf\x1f\xd3\x0f\xdbx\xfb>\xce\x0f\x8f\x8e\x01f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\nR\n\ninput_mask\x12D\x1aB\n@\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\n\x12\n\tlabel_ids\x12\x05\x1a\x03\n\x01\x00'
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
\n\xad\x02\nS\n\x0bsegment_ids\x12D\x1aB\n@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\nn\n\tinput_ids\x12a\x1a_\n]e\x834\xf2\x07\xc0\x1e\xcd\x0f\xea\x0f\xc1J\xc3\x13\xf2\x07\xce\x0f\x95\x08\x90\x10\xed\x07\xa0\x08\x9f&\xa47\xf8F\xdc\x10\xd9\x0f\xf4\x07\x95\x08\xb4\x11\xe7\x0f\xbf\x1f\xd3\x0f\xdbx\xfb>\xce\x0f\x8f\x8e\x01f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\nR\n\ninput_mask\x12D\x1aB\n@\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\n\x12\n\tlabel_ids\x12\x05\x1a\x03\n\x01\x00

0 comments on commit 9e62abf

Please sign in to comment.