1
+ import pandas as pd
2
+
3
+
4
def _percent_over_50k(frame):
    """Return the percentage of rows in *frame* whose salary is '>50K'.

    The result is rounded to one decimal place. Raises ZeroDivisionError
    when *frame* has no rows, because the share is then undefined.
    """
    rich_rows = frame[frame['salary'] == '>50K']
    return round(rich_rows.shape[0] / frame.shape[0] * 100, 1)


def calculate_demographic_data(print_data=True, data_path='adult.data.csv'):
    """Compute summary statistics from a census-style CSV file.

    Args:
        print_data: When true, print every computed statistic to stdout.
        data_path: Path of the CSV file to analyse. Defaults to
            'adult.data.csv' so existing callers keep working. The file
            must provide the columns 'race', 'sex', 'age', 'education',
            'salary', 'hours-per-week', 'native-country' and 'occupation'.

    Returns:
        A dict with the keys 'race_count', 'average_age_men',
        'percentage_bachelors', 'higher_education_rich',
        'lower_education_rich', 'min_work_hours', 'rich_percentage',
        'highest_earning_country', 'highest_earning_country_percentage'
        and 'top_IN_occupation'. 'race_count' is a pandas Series indexed by
        race name; percentages are rounded to one decimal place.

    Raises:
        ZeroDivisionError: If the file has no rows, or if either education
            group is empty.
    """
    # Read data from file
    df = pd.read_csv(data_path)

    # How many of each race are represented in this dataset?
    # A pandas Series with race names as the index labels.
    race_count = df['race'].value_counts()

    # What is the average age of men?
    men = df[df['sex'] == 'Male']
    average_age_men = round(men['age'].mean(), 1)

    # What is the percentage of people who have a Bachelor's degree?
    bachelors = df[df['education'] == 'Bachelors']
    percentage_bachelors = round(bachelors.shape[0] / df.shape[0] * 100, 1)

    # Split people by advanced education (`Bachelors`, `Masters`, or
    # `Doctorate`). The mask is built once and negated for the other group.
    has_advanced_degree = df['education'].isin(
        ['Bachelors', 'Masters', 'Doctorate']
    )
    higher_education = df[has_advanced_degree]
    lower_education = df[~has_advanced_degree]

    # Percentage of each education group with salary >50K
    higher_education_rich = _percent_over_50k(higher_education)
    lower_education_rich = _percent_over_50k(lower_education)

    # What is the minimum number of hours a person works per week?
    min_work_hours = df['hours-per-week'].min()

    # What percentage of the people who work the minimum number of hours
    # per week have a salary of >50K?
    num_min_workers = df[df['hours-per-week'] == min_work_hours]
    rich_percentage = _percent_over_50k(num_min_workers)

    # What country has the highest percentage of people that earn >50K?
    # Countries with no >50K earners are absent from the numerator, so the
    # aligned division yields NaN for them; idxmax()/max() skip NaN.
    num_high_salary_per_country = (
        df[df['salary'] == ">50K"]['native-country'].value_counts()
    )
    num_country = df['native-country'].value_counts()
    percentage = num_high_salary_per_country / num_country * 100

    highest_earning_country = percentage.idxmax()
    highest_earning_country_percentage = round(percentage.max(), 1)

    # Identify the most popular occupation for those who earn >50K in India.
    high_salary_india_occ = df[
        (df['salary'] == ">50K") & (df['native-country'] == "India")
    ]['occupation'].value_counts()

    top_IN_occupation = high_salary_india_occ.idxmax()

    # DO NOT MODIFY BELOW THIS LINE

    if print_data:
        print("Number of each race:\n ", race_count)
        print("Average age of men:", average_age_men)
        print(f"Percentage with Bachelors degrees: { percentage_bachelors } %")
        print(f"Percentage with higher education that earn >50K: { higher_education_rich } %")
        print(f"Percentage without higher education that earn >50K: { lower_education_rich } %")
        print(f"Min work time: { min_work_hours } hours/week")
        print(f"Percentage of rich among those who work fewest hours: { rich_percentage } %")
        print("Country with highest percentage of rich:", highest_earning_country)
        print(f"Highest percentage of rich people in country: { highest_earning_country_percentage } %")
        print("Top occupations in India:", top_IN_occupation)

    return {
        'race_count': race_count,
        'average_age_men': average_age_men,
        'percentage_bachelors': percentage_bachelors,
        'higher_education_rich': higher_education_rich,
        'lower_education_rich': lower_education_rich,
        'min_work_hours': min_work_hours,
        'rich_percentage': rich_percentage,
        'highest_earning_country': highest_earning_country,
        'highest_earning_country_percentage':
            highest_earning_country_percentage,
        'top_IN_occupation': top_IN_occupation
    }
0 commit comments