Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

docs: simplify data downloading and add mlflow to uplift modelling #1659

Merged
merged 3 commits into from
Sep 22, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,9 @@
}
},
"source": [
"### Download dataset and Upload to lakehouse"
"### Download dataset and Upload to lakehouse\n",
"\n",
"**Please add a lakehouse to the notebook before running it.**"
]
},
{
Expand All @@ -158,36 +160,21 @@
"source": [
"if not IS_CUSTOM_DATA:\n",
" # Download demo data files into lakehouse if not exist\n",
" import os, requests\n",
"\n",
" remote_url = \"https://synapseaisolutionsa.blob.core.windows.net/public/Book-Recommendation-Dataset\"\n",
" file_list = [\"Books.csv\", \"Ratings.csv\", \"Users.csv\"]\n",
"\n",
" # For this demo, we first check if the dataset files are already prepared in the default lakehouse. If not, we'll download the dataset.\n",
" import os\n",
" import requests\n",
" download_path = f\"/lakehouse/default/{DATA_FOLDER}/raw\"\n",
"\n",
" if not os.path.exists(\"/lakehouse/default\"):\n",
" # ask user to add a lakehouse if no default lakehouse added to the notebook.\n",
" # a new notebook will not link to any lakehouse by default.\n",
" raise FileNotFoundError(\n",
" \"Default lakehouse not found, please add a lakehouse for the notebook.\"\n",
" )\n",
" else:\n",
" # check if the needed files are already in the lakehouse, try to download if not.\n",
" # raise an error if downloading failed.\n",
" os.makedirs(f\"/lakehouse/default/{DATA_FOLDER}/raw/\", exist_ok=True)\n",
" for fname in file_list:\n",
" if not os.path.exists(f\"/lakehouse/default/{DATA_FOLDER}/raw/{fname}\"):\n",
" try:\n",
" r = requests.get(f\"{remote_url}/{fname}\", timeout=30)\n",
" with open(\n",
" f\"/lakehouse/default/{DATA_FOLDER}/raw/{fname}\", \"wb\"\n",
" ) as f:\n",
" f.write(r.content)\n",
" print(f\"Downloaded {fname} into {DATA_FOLDER}/raw/.\")\n",
" except Exception as e:\n",
" print(f\"Failed on downloading {fname}, error message: {e}\")\n",
" else:\n",
" print(f\"{fname} already exists in {DATA_FOLDER}/raw/.\")"
" raise FileNotFoundError(\"Default lakehouse not found, please add a lakehouse.\")\n",
" os.makedirs(download_path, exist_ok=True)\n",
" for fname in file_list:\n",
" if not os.path.exists(f\"{download_path}/{fname}\"):\n",
" r = requests.get(f\"{remote_url}/{fname}\", timeout=30)\n",
" with open(f\"{download_path}/{fname}\", \"wb\") as f:\n",
" f.write(r.content)\n",
" print(\"Downloaded demo data files into lakehouse.\")"
]
},
{
Expand Down Expand Up @@ -1419,7 +1406,9 @@
"outputs": [],
"source": [
"# code for saving userRecs into lakehouse\n",
"userRecs.write.mode(\"overwrite\").parquet(f\"{DATA_FOLDER}/predictions/userRecs\")"
"userRecs.write.format(\"delta\").mode(\"overwrite\").save(\n",
" f\"{DATA_FOLDER}/predictions/userRecs\"\n",
")"
]
},
{
Expand Down
44 changes: 15 additions & 29 deletions notebooks/community/aisamples/AIsample - Fraud Detection.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,9 @@
}
},
"source": [
"### Download dataset and Upload to lakehouse"
"### Download dataset and Upload to lakehouse\n",
"\n",
"**Please add a lakehouse to the notebook before running it.**"
]
},
{
Expand All @@ -144,36 +146,20 @@
"source": [
"if not IS_CUSTOM_DATA:\n",
" # Download demo data files into lakehouse if not exist\n",
" remote_url = \"https://synapseaisolutionsa.blob.core.windows.net/public/Credit_Card_Fraud_Detection\"\n",
" file_list = [\"creditcard.csv\"]\n",
" import os, requests\n",
"\n",
" # For this demo, we first check if the dataset files are already prepared in the default lakehouse. If not, we'll download the dataset.\n",
" import os\n",
" import requests\n",
" remote_url = \"https://synapseaisolutionsa.blob.core.windows.net/public/Credit_Card_Fraud_Detection\"\n",
" fname = \"creditcard.csv\"\n",
" download_path = f\"/lakehouse/default/{DATA_FOLDER}/raw\"\n",
"\n",
" if not os.path.exists(\"/lakehouse/default\"):\n",
" # ask user to add a lakehouse if no default lakehouse added to the notebook.\n",
" # a new notebook will not link to any lakehouse by default.\n",
" raise FileNotFoundError(\n",
" \"Default lakehouse not found, please add a lakehouse for the notebook.\"\n",
" )\n",
" else:\n",
" # check if the needed files are already in the lakehouse, try to download if not.\n",
" # raise an error if downloading failed.\n",
" os.makedirs(f\"/lakehouse/default/{DATA_FOLDER}/raw/\", exist_ok=True)\n",
" for fname in file_list:\n",
" if not os.path.exists(f\"/lakehouse/default/{DATA_FOLDER}/raw/{fname}\"):\n",
" try:\n",
" r = requests.get(f\"{remote_url}/{fname}\", timeout=30)\n",
" with open(\n",
" f\"/lakehouse/default/{DATA_FOLDER}/raw/{fname}\", \"wb\"\n",
" ) as f:\n",
" f.write(r.content)\n",
" print(f\"Downloaded {fname} into {DATA_FOLDER}/raw/.\")\n",
" except Exception as e:\n",
" print(f\"Failed on downloading {fname}, error message: {e}\")\n",
" else:\n",
" print(f\"{fname} already exists in {DATA_FOLDER}/raw/.\")"
" raise FileNotFoundError(\"Default lakehouse not found, please add a lakehouse.\")\n",
" os.makedirs(download_path, exist_ok=True)\n",
" if not os.path.exists(f\"{download_path}/{fname}\"):\n",
" r = requests.get(f\"{remote_url}/{fname}\", timeout=30)\n",
" with open(f\"{download_path}/{fname}\", \"wb\") as f:\n",
" f.write(r.content)\n",
" print(\"Downloaded demo data files into lakehouse.\")"
]
},
{
Expand Down Expand Up @@ -972,7 +958,7 @@
"outputs": [],
"source": [
"# code for saving predictions into lakehouse\n",
"batch_predictions.write.mode(\"overwrite\").parquet(\n",
"batch_predictions.write.format(\"delta\").mode(\"overwrite\").save(\n",
" f\"{DATA_FOLDER}/predictions/batch_predictions\"\n",
")"
]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@
},
"outputs": [],
"source": [
"IS_CUSTOMER_DATA = False # if True, dataset has to be uploaded manually by user\n",
"IS_CUSTOM_DATA = False # if True, dataset has to be uploaded manually by user\n",
"DATA_FOLDER = \"Files/title-genre-classification\"\n",
"DATA_FILE = \"blbooksgenre.csv\"\n",
"\n",
Expand Down Expand Up @@ -199,7 +199,9 @@
}
},
"source": [
"### Download dataset and upload to lakehouse"
"### Download dataset and upload to lakehouse\n",
"\n",
"**Please add a lakehouse to the notebook before running it.**"
]
},
{
Expand All @@ -221,38 +223,22 @@
},
"outputs": [],
"source": [
"if not IS_CUSTOMER_DATA:\n",
"if not IS_CUSTOM_DATA:\n",
" # Download demo data files into lakehouse if not exist\n",
" remote_url = \"https://synapseaisolutionsa.blob.core.windows.net/public/Title_Genre_Classification\"\n",
" file_list = [\"blbooksgenre.csv\"]\n",
" import os, requests\n",
"\n",
" # For this demo, we first check if the dataset files are already prepared in the default lakehouse. If not, we'll download the dataset.\n",
" import os\n",
" import requests\n",
" remote_url = \"https://synapseaisolutionsa.blob.core.windows.net/public/Title_Genre_Classification\"\n",
" fname = \"blbooksgenre.csv\"\n",
" download_path = f\"/lakehouse/default/{DATA_FOLDER}/raw\"\n",
"\n",
" if not os.path.exists(\"/lakehouse/default\"):\n",
" # ask user to add a lakehouse if no default lakehouse added to the notebook.\n",
" # a new notebook will not link to any lakehouse by default.\n",
" raise FileNotFoundError(\n",
" \"Default lakehouse not found, please add a lakehouse for the notebook.\"\n",
" )\n",
" else:\n",
" # check if the needed files are already in the lakehouse, try to download if not.\n",
" # raise an error if downloading failed.\n",
" os.makedirs(f\"/lakehouse/default/{DATA_FOLDER}/raw/\", exist_ok=True)\n",
" for fname in file_list:\n",
" if not os.path.exists(f\"/lakehouse/default/{DATA_FOLDER}/raw/{fname}\"):\n",
" try:\n",
" r = requests.get(f\"{remote_url}/{fname}\", timeout=30)\n",
" with open(\n",
" f\"/lakehouse/default/{DATA_FOLDER}/raw/{fname}\", \"wb\"\n",
" ) as f:\n",
" f.write(r.content)\n",
" print(f\"Downloaded {fname} into {DATA_FOLDER}/raw/.\")\n",
" except Exception as e:\n",
" print(f\"Failed on downloading {fname}, error message: {e}\")\n",
" else:\n",
" print(f\"{fname} already exists in {DATA_FOLDER}/raw/.\")"
" raise FileNotFoundError(\"Default lakehouse not found, please add a lakehouse.\")\n",
" os.makedirs(download_path, exist_ok=True)\n",
" if not os.path.exists(f\"{download_path}/{fname}\"):\n",
" r = requests.get(f\"{remote_url}/{fname}\", timeout=30)\n",
" with open(f\"{download_path}/{fname}\", \"wb\") as f:\n",
" f.write(r.content)\n",
" print(\"Downloaded demo data files into lakehouse.\")"
]
},
{
Expand Down
Loading