-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_viz.R
113 lines (97 loc) · 3.57 KB
/
data_viz.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#Library and loading----
library(tidyverse)
library(ggplot2)
# Read the aggregated Broward CSV (only the string "NA" is treated as missing)
# and store the categorical code columns as factors.
df <- read_csv(file = "data/broward_agg.csv", na = "NA") |>
  mutate(
    race = as.factor(race),
    sex = as.factor(sex),
    charge_degree = as.factor(charge_degree)
  )
# Bar chart of row counts per aggregated charge label, restricted to labels
# with at least 20 rows and ordered by count.
df |>
  count(aggregated) |>
  filter(n >= 20) |>
  mutate(aggregated = fct_reorder(aggregated, n)) |>
  ggplot(aes(x = aggregated, y = n)) +
  geom_col() +
  labs(
    title = "Distribution of Charges Using New Label Aggregates",
    x = "Charge Type",
    y = "Count"
  ) +
  # Rotate the x-axis labels so long charge names do not overlap.
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))
#visualizing the dataset----
# Bar chart of the number of rows for race codes 1 and 2.
df |>
  filter(race %in% 1:2) |>
  count(race) |>
  ggplot(aes(x = race, y = n)) +
  geom_col()
# Bar chart of row counts per drug type among drug-related charges.
df |>
  filter(aggregated == "Drug-Related") |>
  count(drug_type) |>
  ggplot(aes(x = drug_type, y = n)) +
  geom_col()
# Dodged bar chart of COMPAS decile score counts for the three selected drug
# types among drug-related charges.
# Other drug-related rows are excluded by the filter, including NA drug types
# (the original note attributes those to trafficking of an unspecified drug).
# Follow-up idea: run a goodness-of-fit test for uniformity across deciles.
df |>
  filter(
    aggregated == "Drug-Related",
    drug_type %in% c("Cannabis/Marijuana", "Cocaine", "Controlled Substance")
  ) |>
  # `.drop = FALSE` only zero-fills factor levels, so both count variables are
  # converted to factors first; otherwise drug/score combinations with no rows
  # are missing from the result instead of appearing with n = 0.
  # NOTE(review): assumes compas_decile_score only takes the values 1-10;
  # anything else would become NA here - confirm against the data.
  mutate(
    drug_type = factor(drug_type),
    compas_decile_score = factor(compas_decile_score, levels = 1:10)
  ) |>
  count(drug_type, compas_decile_score, .drop = FALSE) |>
  ggplot(aes(x = compas_decile_score, y = n, fill = drug_type)) +
  geom_col(position = "dodge") +
  labs(
    title = "Distribution of Drug Usage",
    x = "COMPAS Decile Score",
    y = "Count",
    fill = "Drug Type"
  ) +
  # Always show all ten deciles on the axis, in order.
  scale_x_discrete(limits = as.character(1:10))
# Dodged bar chart comparing race codes 1 and 2 across drug types for
# drug-related charges with a known drug type.
df |>
  filter(aggregated == "Drug-Related", !is.na(drug_type)) |>
  # race is a factor, so `.drop = FALSE` keeps zero-count race levels for each
  # drug type; the filter afterwards keeps only race codes 1 and 2.
  count(drug_type, race, .drop = FALSE) |>
  filter(race %in% 1:2) |>
  ggplot(aes(x = drug_type, y = n, fill = race)) +
  geom_col(position = "dodge") +
  labs(
    title = "Bar Plot Comparing Race and Drug Type",
    x = "Drug Type",
    y = "Count",
    fill = "Race"
  ) +
  # Legend labels follow the script's existing mapping: 1 = White, 2 = Black.
  scale_fill_manual(
    name = "Race",
    labels = c("White", "Black"),
    values = c("#999999", "#000000")
  ) +
  theme(
    # Centre the plot title. Setting `title` does not do this because the
    # default theme gives `plot.title` its own hjust, which wins over the
    # inherited value.
    plot.title = element_text(hjust = 0.5),
    # Rotate the x-axis labels so long drug names do not overlap.
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)
  )
# Dodged bar chart comparing race codes 1 and 2 across COMPAS decile scores
# for the three selected drug types.
df |>
  filter(
    # Restrict to drug-related charges so the data matches the plot title and
    # the other drug-type plots in this script.
    # NOTE(review): this is a no-op if drug_type is only ever set on
    # drug-related rows - confirm against the data.
    aggregated == "Drug-Related",
    race %in% 1:2,
    drug_type %in% c("Cannabis/Marijuana", "Cocaine", "Controlled Substance")
  ) |>
  count(race, compas_decile_score) |>
  ggplot(aes(x = compas_decile_score, y = n, fill = race)) +
  geom_col(position = "dodge") +
  labs(
    title = "Bar Plot Comparing Race and Decile Score for Drug-Related Offenses",
    x = "COMPAS Decile Score",
    y = "Count",
    fill = "Race"
  ) +
  # Legend labels follow the script's existing mapping: 1 = White, 2 = Black.
  scale_fill_manual(
    name = "Race",
    labels = c("White", "Black"),
    values = c("#999999", "#000000")
  ) +
  # Label the axis with all ten deciles.
  scale_x_discrete(limits = factor(1:10))
#compas guess and correct
# Scatter plot of priors_count against compas_guess for race codes 1 and 2,
# coloured by race. Points are jittered slightly and semi-transparent so that
# overlapping observations remain visible.
df |>
  filter(race %in% 1:2) |>
  ggplot(aes(x = priors_count, y = compas_guess, color = race)) +
  geom_jitter(width = 0.05, height = 0.05, alpha = 0.5)
# Scatter plot of priors_count against compas_correct for race codes 1 and 2,
# coloured by race. Points are jittered slightly and semi-transparent so that
# overlapping observations remain visible.
df |>
  filter(race %in% 1:2) |>
  ggplot(aes(x = priors_count, y = compas_correct, color = race)) +
  geom_jitter(width = 0.05, height = 0.05, alpha = 0.5)
# Comparing the two graphs, the compas_correct plot has more blue points along
# the bottom, showing that Black people are more likely to be misguessed.
# FOCUS: I want to find a relationship between priors/age and another variable.
# Count table of drug_interaction by race (rows with a missing
# drug_interaction removed), shown for race codes 1 and 2 only.
# Observation from the original analysis: not much difference between them.
df |>
  filter(!is.na(drug_interaction)) |>
  group_by(drug_interaction) |>
  # race is a factor, so `.drop = FALSE` keeps zero-count race levels.
  count(race, .drop = FALSE) |>
  filter(race %in% c(1, 2))