Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added nemo/NeMo-Data-Designer/.DS_Store
Binary file not shown.
Binary file added nemo/NeMo-Data-Designer/advanced/.DS_Store
Binary file not shown.

Large diffs are not rendered by default.

1,176 changes: 590 additions & 586 deletions nemo/NeMo-Data-Designer/advanced/forms/w2-dataset.ipynb

Large diffs are not rendered by default.

Binary file not shown.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

1,794 changes: 962 additions & 832 deletions nemo/NeMo-Data-Designer/advanced/healthcare-datasets/clinical-trials.ipynb

Large diffs are not rendered by default.

Large diffs are not rendered by default.

1,548 changes: 817 additions & 731 deletions nemo/NeMo-Data-Designer/advanced/healthcare-datasets/insurance-claims.ipynb

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
"cells": [
{
"cell_type": "markdown",
"id": "a9883b84",
"metadata": {},
"source": [
"# 🧑‍⚕️ NeMo Data Designer: Realistic Patient Data & Physician Notes"
Expand Down Expand Up @@ -43,13 +42,17 @@
"metadata": {},
"outputs": [],
"source": [
"from nemo_microservices import NeMoMicroservices\n",
"from nemo_microservices.beta.data_designer import (\n",
"from nemo_microservices.data_designer.essentials import (\n",
" NeMoDataDesignerClient,\n",
" DataDesignerConfigBuilder,\n",
" DataDesignerClient,\n",
")\n",
"from nemo_microservices.beta.data_designer.config import columns as C\n",
"from nemo_microservices.beta.data_designer.config import params as P"
" LLMTextColumnConfig,\n",
" PersonSamplerParams,\n",
" SamplerColumnConfig,\n",
" SamplerType,\n",
" ModelConfig,\n",
" InferenceParameters,\n",
" UUIDSamplerParams,\n",
")"
]
},
{
Expand All @@ -68,7 +71,7 @@
"metadata": {},
"outputs": [],
"source": [
"data_designer_client = DataDesignerClient(client=NeMoMicroservices(base_url=\"http://localhost:8080\"))"
"data_designer_client = NeMoDataDesignerClient(base_url=\"http://localhost:8080\")"
]
},
{
Expand Down Expand Up @@ -105,16 +108,15 @@
"source": [
"config_builder = DataDesignerConfigBuilder(\n",
" model_configs=[\n",
" P.ModelConfig(\n",
" ModelConfig(\n",
" alias=model_alias,\n",
" provider=\"nvidiabuild\",\n",
" model=model_id,\n",
" inference_parameters=P.InferenceParameters(\n",
" inference_parameters=InferenceParameters(\n",
" max_tokens=1024,\n",
" temperature=0.6,\n",
" top_p=0.95,\n",
" ),\n",
" is_reasoner=True\n",
" ),\n",
" ]\n",
")"
Expand Down Expand Up @@ -166,14 +168,27 @@
"metadata": {},
"outputs": [],
"source": [
"# We use with_replacement=False, so our max num_records is 853.\n",
"seed_reference = data_designer_client.upload_seed_dataset(df_seed, repo_id=\"advanced/healthcare-datasets\", datastore_settings={\"endpoint\": \"http://localhost:3000/v1/hf\"})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"config_builder.with_seed_dataset(seed_reference)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"config_builder.with_seed_dataset(\n",
" repo_id=\"advanced/healthcare-datasets\",\n",
" filename=\"symptom_to_diagnosis.csv\",\n",
" dataset_path=\"./data/symptom_to_diagnosis.csv\",\n",
" sampling_strategy=\"shuffle\", # \"ordered\"\n",
" with_replacement=True,\n",
" datastore={\"endpoint\": \"http://localhost:3000/v1/hf\"}\n",
" dataset_reference=seed_reference,\n",
" sampling_strategy=\"shuffle\"\n",
")"
]
},
Expand All @@ -184,7 +199,21 @@
"outputs": [],
"source": [
"# Create a couple random person samplers.\n",
"config_builder.with_person_samplers({\"patient_sampler\": {}, \"doctor_sampler\": {}})"
"config_builder.add_column(\n",
" SamplerColumnConfig(\n",
" name=\"patient_sampler\",\n",
" sampler_type=SamplerType.PERSON,\n",
" params=PersonSamplerParams(),\n",
" )\n",
")\n",
"\n",
"config_builder.add_column(\n",
" SamplerColumnConfig(\n",
" name=\"doctor_sampler\",\n",
" sampler_type=SamplerType.PERSON,\n",
" params=PersonSamplerParams(),\n",
" )\n",
")\n"
]
},
{
Expand All @@ -208,47 +237,48 @@
"outputs": [],
"source": [
"config_builder.add_column(\n",
" name=\"patient_id\",\n",
" type=\"uuid\",\n",
" params={\"prefix\": \"PT-\", \"short_form\": True, \"uppercase\": True},\n",
" SamplerColumnConfig(\n",
" name=\"patient_id\",\n",
" sampler_type=SamplerType.UUID,\n",
" params=UUIDSamplerParams(prefix=\"PT-\", short_form=True, uppercase=True),\n",
" )\n",
")\n",
"\n",
"config_builder.add_column(\n",
" name=\"first_name\",\n",
" type=\"expression\",\n",
" column_type=\"expression\",\n",
" expr=\"{{patient_sampler.first_name}}\"\n",
")\n",
"\n",
"config_builder.add_column(\n",
" name=\"last_name\",\n",
" type=\"expression\",\n",
" column_type=\"expression\",\n",
" expr=\"{{patient_sampler.last_name}}\"\n",
")\n",
"\n",
"\n",
"config_builder.add_column(\n",
" name=\"dob\",\n",
" type=\"expression\",\n",
" column_type=\"expression\",\n",
" expr=\"{{patient_sampler.birth_date}}\"\n",
")\n",
"\n",
"\n",
"config_builder.add_column(\n",
" name=\"patient_email\",\n",
" type=\"expression\",\n",
" column_type=\"expression\",\n",
" expr=\"{{patient_sampler.email_address}}\"\n",
")\n",
"\n",
"\n",
"config_builder.add_column(\n",
" name=\"symptom_onset_date\",\n",
" type=\"datetime\",\n",
" column_type=\"sampler\",\n",
" sampler_type=\"datetime\",\n",
" params={\"start\": \"2024-01-01\", \"end\": \"2024-12-31\"},\n",
")\n",
"\n",
"config_builder.add_column(\n",
" name=\"date_of_visit\",\n",
" type=\"timedelta\",\n",
" column_type=\"sampler\",\n",
" sampler_type=\"timedelta\",\n",
" params={\n",
" \"dt_min\": 1,\n",
" \"dt_max\": 30,\n",
Expand All @@ -258,7 +288,7 @@
"\n",
"config_builder.add_column(\n",
" name=\"physician\",\n",
" type=\"expression\",\n",
" column_type=\"expression\",\n",
" expr=\"Dr. {{doctor_sampler.first_name}} {{doctor_sampler.last_name}}\",\n",
")"
]
Expand All @@ -284,12 +314,11 @@
"metadata": {},
"outputs": [],
"source": [
"# Note we have access to the seed data fields.\n",
"config_builder.add_column(\n",
" name=\"physician_notes\",\n",
" type=\"llm-text\",\n",
" model_alias=model_alias,\n",
" prompt=\"\"\"\\\n",
" LLMTextColumnConfig(\n",
" name=\"physician_notes\",\n",
" model_alias=model_alias,\n",
" prompt=\"\"\"\\\n",
"<context>\n",
"You are a primary-care physician who just had an appointment with {{first_name}} {{last_name}},\n",
"who has been struggling with symptoms from {{diagnosis}} since {{symptom_onset_date}}.\n",
Expand All @@ -307,7 +336,8 @@
"Format the notes as a busy doctor might.\n",
"</task>\n",
"\"\"\"\n",
" )"
" )\n",
")"
]
},
{
Expand All @@ -325,7 +355,7 @@
"metadata": {},
"outputs": [],
"source": [
"preview = data_designer_client.preview(config_builder, verbose_logging=True)"
"preview = data_designer_client.preview(config_builder)"
]
},
{
Expand Down Expand Up @@ -391,7 +421,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "sdg_venv",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
Expand All @@ -405,9 +435,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.11"
"version": "3.12.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}
Loading