Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
Himidiri committed Aug 7, 2023
0 parents commit 65d4173
Show file tree
Hide file tree
Showing 10 changed files with 32,942 additions and 0 deletions.
8 changes: 8 additions & 0 deletions .idea/.gitignore

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/inspectionProfiles/profiles_settings.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions .idea/modules.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 10 additions & 0 deletions .idea/waveApp.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Binary file added __pycache__/app.cpython-311.pyc
Binary file not shown.
337 changes: 337 additions & 0 deletions app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,337 @@
import pandas as pd
from h2o_wave import main, app, Q, ui, data
from sklearn.ensemble import RandomForestClassifier

# Global variables to store the data
df = None
test_df = None
train = None
test = None


@app('/waveApp')
async def serve(q: Q):
global df, test_df, train, test

# Initialization and setting up the UI
if not q.client.initialized:
setup_ui(q)
# Handling user actions when uploading the dataset
elif q.args.upload:
upload_dataset(q)
# Previewing the dataset
elif q.args.preview:
preview_dataset(q)
# Handling user actions after uploading the dataset
elif q.args.user_files:
file_path = await q.site.download(q.args.user_files[0], '.')
df = load_data(file_path)
print(df.columns)
print('Preview Dataset')
print(df.head(10))
preview_dataset(q)
# Plotting data distribution for specific columns
elif q.args.plot:
plot_view(q)
# Preprocessing the data
elif q.args.preprocess:
preprocessing_data(q)
# Training the machine learning model
elif q.args.train:
train_model(q)
else:
upload_dataset(q)
await q.page.save()


# Function to load data from CSV file
def load_data(file_path):
data = pd.read_csv(file_path)
return data


# Function to load the test dataset from CSV file
def test_data():
tdata = pd.read_csv('test.csv')
return tdata


# Function to set up the initial UI components
def setup_ui(q):
q.page["header"] = ui.header_card(
box="1 1 12 1",
title='Wave App',
subtitle='H2O.ai Technical Assignment',
)

q.page["tabs"] = ui.tab_card(
box="1 2 9 1",
items=[
ui.tab(name="upload", label="Upload Dataset"),
ui.tab(name="preview", label="Preview Dataset"),
ui.tab(name="plot", label="Plot View"),
ui.tab(name="preprocess", label="Preprocess Data"),
ui.tab(name="train", label="Train and Predict Model"),
],
)

q.page['footer'] = ui.footer_card(
box='1 10 12 1',
caption='**Developed by Himidiri Himakanika**',
)

q.client.initialized = True


# Function to create a markdown table from DataFrame
def make_markdown_row(values):
return f"| {' | '.join([str(x) for x in values])} |"


def make_markdown_table(fields, rows):
return '\n'.join([
make_markdown_row(fields),
make_markdown_row(['---'] * len(fields)),
'\n'.join([make_markdown_row(row) for row in rows]),
])


# Function to create a markdown table for data preview
def make_preprocess_table(df):
markdown_table = make_markdown_table(fields=df.columns.tolist(), rows=df.head(10).values.tolist())
return markdown_table


# Function to handle dataset upload
def upload_dataset(q):
del q.page['preview_dataset']
del q.page['plot_view']
del q.page['education_plot']
del q.page['occupation_plot']
del q.page['age_plot']
del q.page['target_vs_age_plot']
del q.page['target_vs_hours_plot']
del q.page['preprocessing_data']
del q.page['train_model']
q.page['upload_dataset'] = ui.form_card(
box='1 3 12 7',
items=[
ui.text_l('**UPLOAD DATASET**'),
ui.file_upload(name='user_files', label='Upload census_data.csv', multiple=False),
],
)


# Function to display a preview of the dataset
def preview_dataset(q):
del q.page['upload_dataset']
del q.page['plot_view']
del q.page['education_plot']
del q.page['occupation_plot']
del q.page['age_plot']
del q.page['target_vs_age_plot']
del q.page['target_vs_hours_plot']
del q.page['preprocessing_data']
del q.page['train_model']
if df is None:
q.page['preview_dataset'] = ui.form_card(
box='1 3 12 7',
items=[
ui.text_l('**PREVIEW DATASET**'),
ui.text('Please upload the dataset first.'),
],
)
return
q.page['preview_dataset'] = ui.form_card(
box='1 3 12 7',
items=[
ui.text_l('**PREVIEW DATASET**'),
ui.text(make_markdown_table(fields=df.columns.tolist(), rows=df.head(5000).values.tolist()))
],
)


# Function to plot data distribution for specific columns
def plot_view(q):
del q.page['upload_dataset']
del q.page['preview_dataset']
del q.page['plot_view']
del q.page['preprocessing_data']
del q.page['train_model']
if df is None:
q.page['plot_view'] = ui.form_card(
box='1 3 12 7',
items=[
ui.text_l('**PLOT VIEW**'),
ui.text('Please upload the dataset first.'),
],
)
return

# Data distribution for education, occupation, and age
education_distribution = df.groupby([' education', ' sex']).size().reset_index(name='count')
occupation_distribution = df.groupby([' occupation', ' sex']).size().reset_index(name='count')
age_distribution = df.groupby(['age', ' sex']).size().reset_index(name='count')
target_vs_age = df.groupby(['age', ' amount']).size().reset_index(name='count')
target_vs_hours = df.groupby([' hour-per-week', ' amount']).size().reset_index(name='count')

q.page['education_plot'] = ui.plot_card(
box='1 3 4 3',
title='Education Distribution',
data=data(fields=['education', 'sex', 'count'], rows=education_distribution.values.tolist()),
plot=ui.plot([
ui.mark(type='interval',
x='=education', y='=count',
color='=sex')
])
)

q.page['occupation_plot'] = ui.plot_card(
box='5 3 4 3',
title='Occupation Distribution',
data=data(fields=['occupation', 'sex', 'count'], rows=occupation_distribution.values.tolist()),
plot=ui.plot([
ui.mark(type='interval',
x='=occupation', y='=count',
color='=sex')
])
)

q.page['age_plot'] = ui.plot_card(
box='9 3 4 3',
title='Age Distribution',
data=data(fields=['age', 'sex', 'count'], rows=age_distribution.values.tolist()),
plot=ui.plot([ui.mark(type='line', x='=age', y='=count', color='=sex')]),
)

q.page['target_vs_age_plot'] = ui.plot_card(
box='3 6 4 3',
title='Amount vs Age',
data=data(fields=['age', 'amount', 'count'], rows=target_vs_age.values.tolist()),
plot=ui.plot(
[ui.mark(type='point', x='=age', y='=count', size='=count', color='=amount', shape='circle')]),
)

q.page['target_vs_hours_plot'] = ui.plot_card(
box='7 6 4 3',
title='Amount vs Hours Per Week',
data=data(fields=['hour-per-week', 'amount', 'count'], rows=target_vs_hours.values.tolist()),
plot=ui.plot([ui.mark(type='point', x='=hour-per-week', y='=amount', size='=count', color='=amount',
shape='circle')]),
)


# Function to preprocess the data
def preprocess(df):
df = df.replace('?', pd.NA).dropna()
df = pd.get_dummies(df, columns=[' workclass', ' education', ' marital-status', ' occupation', ' relationship',
' race', ' sex', ' native-country'])
df[' amount'] = df[' amount'].map({' <=50K': 0, ' >50K': 1})
df.drop(' fnlwgt', axis=1, inplace=True)
return df


# Function to handle data preprocessing
def preprocessing_data(q):
del q.page['upload_dataset']
del q.page['preview_dataset']
del q.page['plot_view']
del q.page['education_plot']
del q.page['occupation_plot']
del q.page['age_plot']
del q.page['target_vs_age_plot']
del q.page['target_vs_hours_plot']
del q.page['preprocessing_data']
del q.page['train_model']
global df, test_df, train, test

if df is None:
q.page['preprocessing_data'] = ui.form_card(
box='1 3 12 7',
items=[
ui.text_l('**PREPROCESS DATA**'),
ui.text('Please upload the dataset first.'),
],
)
return
test_df = test_data()
train = preprocess(df)
test = preprocess(test_df)
train_data_table = make_preprocess_table(train)
print('Preprocess Uploaded Dataset')
print(train.head(10))
print('Preprocess Test Dataset')
print(test.head(10))

q.page['preprocessing_data'] = ui.form_card(
box='1 3 12 7',
items=[
ui.text_l('**PREPROCESSED DATA**'),
ui.text(train_data_table),
],
)


# Function to train model
def random_forest_training(train, test):
y = train[' amount']
train = train.drop([' amount'], axis=1)
X = train.values
clf = RandomForestClassifier(max_depth=5, random_state=0)
clf.fit(X, y)
return clf


# Function to get information about the trained model
def get_model_info(clf):
model_info = {
'Feature Importances': clf.feature_importances_,
'Number of Estimators': clf.n_estimators,
'Max Depth': clf.max_depth,
'Number of Classes': clf.n_classes_,
}
return model_info


# Function to train the machine learning model
def train_model(q):
del q.page['upload_dataset']
del q.page['preview_dataset']
del q.page['plot_view']
del q.page['education_plot']
del q.page['occupation_plot']
del q.page['age_plot']
del q.page['target_vs_age_plot']
del q.page['target_vs_hours_plot']
del q.page['preprocessing_data']
del q.page['train_model']
global df, train, test

if df is not None:
q.page['train_model'] = ui.form_card(box='1 12 5 2', items=[ui.progress('Running...')])
model = random_forest_training(train, test)
model_info = get_model_info(model)

# Message to indicate successful model training
message = 'Model Trained Successfully'
field_names = list(model_info.keys())
rows = [list(model_info.values())]

markdown_table = make_markdown_table(fields=field_names, rows=rows)

q.page['train_model'] = ui.form_card(
box='1 3 12 7',
items=[
ui.text_l('**TRAIN AND PREDICT MODEL**'),
ui.message_bar('info', message),
ui.text(markdown_table),
],
)
else:
q.page['train_model'] = ui.form_card(
box='1 3 12 7',
items=[
ui.text_l('**TRAIN AND PREDICT MODEL**'),
ui.text('Please preprocess data before training the model'),
],
)
Loading

0 comments on commit 65d4173

Please sign in to comment.