diff --git a/streamlit/__pycache__/functions.cpython-311.pyc b/streamlit/__pycache__/functions.cpython-311.pyc new file mode 100644 index 0000000..04cf9e5 Binary files /dev/null and b/streamlit/__pycache__/functions.cpython-311.pyc differ diff --git a/streamlit/__pycache__/main_analysis.cpython-311.pyc b/streamlit/__pycache__/main_analysis.cpython-311.pyc new file mode 100644 index 0000000..064e3cd Binary files /dev/null and b/streamlit/__pycache__/main_analysis.cpython-311.pyc differ diff --git a/streamlit/functions.py b/streamlit/functions.py index 9af1d9a..2925f55 100644 --- a/streamlit/functions.py +++ b/streamlit/functions.py @@ -11,9 +11,9 @@ data = pd.read_csv('df2020.csv') df2018 = pd.read_csv('df2018.csv') -full_data2018 = pd.read_csv('../Data/survey_results_public_2018.csv') -full_data2019=pd.read_csv('../Data/survey_results_public_2019.csv') -full_df2020 = pd.read_csv('../Data/survey_results_public_2020.csv') +full_data2018 = pd.read_csv('survey_results_sample_2018.csv') +full_data2019=pd.read_csv('survey_results_sample_2019.csv') +full_df2020 = pd.read_csv('survey_results_sample_2020.csv') df2019 = pd.read_csv('df2019.csv') df2020 = data[(data['SalaryUSD'] < 200000)] diff --git a/streamlit/home.py b/streamlit/home.py index ac242bc..6490fae 100644 --- a/streamlit/home.py +++ b/streamlit/home.py @@ -8,10 +8,8 @@ from scipy.stats import norm from scipy import stats import random -from scipy.stats import norm import functions as ff import main_analysis as main -# import streamlit_app as sp ####################################### # DATA LOADING @@ -19,13 +17,17 @@ st.set_page_config(layout='wide') +# Loading data files df = pd.read_csv('df2020.csv') df2018 = pd.read_csv('df2018.csv') -full_data2018 = pd.read_csv('../Data/survey_results_public_2018.csv') -full_data2019=pd.read_csv('../Data/survey_results_public_2019.csv') -full_df2020 = pd.read_csv('../Data/survey_results_public_2020.csv') +full_data2018 = pd.read_csv('survey_results_sample_2018.csv') +full_data2019 = pd.read_csv('survey_results_sample_2019.csv') +full_df2020 = pd.read_csv('survey_results_sample_2020.csv') df2019 = pd.read_csv('df2019.csv') -df2020 = df[(df['SalaryUSD'] < 200000)] + +# Filter the 2020 dataframe +df2020 = df[df['SalaryUSD'] < 200000] + # Load CSS file def local_css(file_name): with open(file_name) as f: @@ -37,24 +39,28 @@ def local_css(file_name): # DATA PREPARATION FOR VISUALISATION ####################################### +# Dropping unnamed columns that might be present df2018 = df2018.drop(df2018.columns[0], axis=1) df2019 = df2019.drop(df2019.columns[0], axis=1) -full_data2018 = full_data2018.rename(columns={"Hobby": "Hobbyist", - "RaceEthnicity": "Ethnicity", - "YearsCoding": "YearsCode", - "YearsCodingProf": "YearsCodePro", - "JobSatisfaction": "JobSat", - "FormalEducation": "EdLevel", - "OperatingSystem": "OpSys"}) - -df_ai = full_data2018[['AIDangerous','AIInteresting','AIResponsible','AIFuture']] +# Renaming columns for consistency +full_data2018 = full_data2018.rename(columns={ + "Hobby": "Hobbyist", + "RaceEthnicity": "Ethnicity", + "YearsCoding": "YearsCode", + "YearsCodingProf": "YearsCodePro", + "JobSatisfaction": "JobSat", + "FormalEducation": "EdLevel", + "OperatingSystem": "OpSys" +}) + +# Data cleaning and transformation +df_ai = full_data2018[['AIDangerous', 'AIInteresting', 'AIResponsible', 'AIFuture']] df2018['Gender'] = df2018['Gender'].replace({"Male": "Man", "Female": "Woman"}) -# for highest paying ds full_data2018.rename(columns={'ConvertedSalary': 'SalaryUSD'}, inplace=True) -# Strip leading and trailing whitespace from all columns +# Strip leading and trailing whitespace from all columns in df_ai df_ai = df_ai.applymap(lambda x: x.strip() if isinstance(x, str) else x) # Mapping for shorter versions @@ -74,145 +80,121 @@ def local_css(file_name): df_ai.replace(short_mapping, inplace=True) # Function to create value count plots for each column -def plot_value_counts(column_name, ax): - colors = ['skyblue','yellow'] - df_ai[column_name].value_counts().plot(kind='bar', color=random.choice(colors), ax=ax) - ax.set_title(f'Value Counts for {column_name}') - ax.set_xlabel('Response') - ax.set_ylabel('Count') - ax.tick_params(axis='x', rotation=45) - st.plotly_chart() +def plot_value_counts(column_name): + colors = ['skyblue', 'yellow'] + fig = px.bar(df_ai[column_name].value_counts().reset_index(), x='index', y=column_name, color_discrete_sequence=[random.choice(colors)]) + fig.update_layout(title=f'Value Counts for {column_name}', xaxis_title='Response', yaxis_title='Count') + st.plotly_chart(fig) ######################################################################### + +# Sidebar for year selection year = st.sidebar.selectbox('Select Year', ['2018', '2019', '2020']) + if year == '2018': - main.main_analysis(df2018) - # age bar plot - visual, analysis = st.columns((3,1)) + main.main_analysis(df2018) + + visual, analysis = st.columns((3, 1)) with visual: - st.title("Highest Paying Countries for Data Scientists") - ff.heighest_paying(full_data2018) + st.title("Highest Paying Countries for Data Scientists") + ff.heighest_paying(full_data2018) with analysis: highest_paying_ds_text = """ -