-
Notifications
You must be signed in to change notification settings - Fork 0
/
biGram_alt_lieu.R
111 lines (81 loc) · 4 KB
/
biGram_alt_lieu.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
library(tm)
library(wordcloud)
library(RColorBrewer)
library(readxl)
library(ggplot2)
library(ggwordcloud)
# Load the survey answers from the Excel workbook.
my_data <- read_xlsx("raw_data/Excel_bigram.xlsx")

# Sanity checks: preview the first rows and list the distinct locations.
print(head(my_data))
print(unique(my_data$Lieu))

# Build the corpus of answers given in Ahouli.
# `which()` is used instead of a bare logical mask: `Lieu == "Ahouli"` is NA
# for rows without a location, and indexing with NA inserts NA documents into
# the corpus. Missing answers are skipped as well, since they hold no text.
my_corpus <- Corpus(VectorSource(
  my_data$Données[which(my_data$Lieu == "Ahouli" & !is.na(my_data$Données))]
))

# Normalise the text: lower-case, strip digits and punctuation.
my_corpus <- tm_map(my_corpus, content_transformer(tolower))
my_corpus <- tm_map(my_corpus, removeNumbers)
my_corpus <- tm_map(my_corpus, removePunctuation)

# NOTE(review): these patterns contain upper-case letters and punctuation but
# are applied after lower-casing and punctuation removal, so they presumably
# never match -- confirm the intent before relying on this step.
my_corpus <- tm_map(
  my_corpus,
  removeWords,
  c("d’emploi Médecin", "publique Bus", "d’emploi La", "féminines tapis", "NA NA")
)

# Collapse the runs of blanks left behind by the removals above.
my_corpus <- tm_map(my_corpus, stripWhitespace)
# Build the bigrams (pairs of consecutive words) of one document.
#
# text:    character string holding one document; it is split on runs of
#          whitespace.
# returns: character vector of "word1 word2" bigrams in reading order, or
#          character(0) when the document has fewer than two words.
createBigrams <- function(text) {
  words <- unlist(strsplit(text, "\\s+"))
  # Drop missing tokens and the empty token produced by leading whitespace.
  words <- words[!is.na(words) & nzchar(words)]
  n_words <- length(words)
  # With fewer than two words `1:(n - 1)` would count backwards and fabricate
  # bigrams such as "word NA" or "NA NA", so return early instead.
  if (n_words < 2L) {
    return(character(0))
  }
  # Pair every word except the last with its successor.
  paste(words[-n_words], words[-1L], sep = " ")
}
# Split each Ahouli document into bigrams, then pool them into one vector.
my_bigrams <- lapply(my_corpus, createBigrams)
all_bigrams <- unlist(my_bigrams)

# Tabulate how many times each distinct bigram occurs.
bigram_freq <- table(all_bigrams)

# Frequency table as a data frame, one row per distinct bigram.
bigram_freq_df <- data.frame(
  bigram = names(bigram_freq),
  freq = as.numeric(bigram_freq)
)

# Keep only bigrams seen more than twice, most frequent first.
bigram_freq_df <- bigram_freq_df[bigram_freq_df$freq > 2, ]
bigram_freq_df <- bigram_freq_df[
  order(bigram_freq_df$freq, decreasing = TRUE),
]

# Draw the Ahouli word cloud (at most 75 bigrams, largest placed first).
bigram_alt_Ahouli <- wordcloud(
  words = bigram_freq_df$bigram,
  freq = bigram_freq_df$freq,
  scale = c(1, 0.35),
  max.words = 75,
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)
# Build the corpus of answers given in Mibladen.
# `which()` is used instead of a bare logical mask: `Lieu == "Mibladen"` is NA
# for rows without a location, and indexing with NA inserts NA documents into
# the corpus. Missing answers are skipped as well, since they hold no text.
my_corpus2 <- Corpus(VectorSource(
  my_data$Données[which(my_data$Lieu == "Mibladen" & !is.na(my_data$Données))]
))

# Normalise the text: lower-case, strip digits and punctuation.
my_corpus2 <- tm_map(my_corpus2, content_transformer(tolower))
my_corpus2 <- tm_map(my_corpus2, removeNumbers)
my_corpus2 <- tm_map(my_corpus2, removePunctuation)

# NOTE(review): these patterns contain upper-case letters and punctuation but
# are applied after lower-casing and punctuation removal, so they presumably
# never match -- confirm the intent before relying on this step.
my_corpus2 <- tm_map(
  my_corpus2,
  removeWords,
  c("d’emploi Médecin", "publique Bus", "d’emploi La", "féminines tapis")
)

# Collapse the runs of blanks left behind by the removals above.
my_corpus2 <- tm_map(my_corpus2, stripWhitespace)
# Build the bigrams (pairs of consecutive words) of one document.
#
# text:    character string holding one document; it is split on runs of
#          whitespace.
# returns: character vector of "word1 word2" bigrams in reading order, or
#          character(0) when the document has fewer than two words.
createBigrams2 <- function(text) {
  words <- unlist(strsplit(text, "\\s+"))
  # Drop missing tokens and the empty token produced by leading whitespace.
  words <- words[!is.na(words) & nzchar(words)]
  n_words <- length(words)
  # With fewer than two words `1:(n - 1)` would count backwards and fabricate
  # bigrams such as "word NA" or "NA NA", so return early instead.
  if (n_words < 2L) {
    return(character(0))
  }
  # Pair every word except the last with its successor.
  paste(words[-n_words], words[-1L], sep = " ")
}
# Split each Mibladen document into bigrams, then pool them into one vector.
my_bigrams2 <- lapply(my_corpus2, createBigrams2)
all_bigrams2 <- unlist(my_bigrams2)

# Tabulate how many times each distinct bigram occurs.
bigram_freq2 <- table(all_bigrams2)

# Frequency table as a data frame, one row per distinct bigram.
bigram_freq_df2 <- data.frame(
  bigram = names(bigram_freq2),
  freq = as.numeric(bigram_freq2)
)

# Keep only bigrams seen more than twice, most frequent first.
bigram_freq_df2 <- bigram_freq_df2[bigram_freq_df2$freq > 2, ]
bigram_freq_df2 <- bigram_freq_df2[
  order(bigram_freq_df2$freq, decreasing = TRUE),
]

# Draw the Mibladen word cloud (at most 75 bigrams, largest placed first).
bigram_alt_Mibladen <- wordcloud(
  words = bigram_freq_df2$bigram,
  freq = bigram_freq_df2$freq,
  scale = c(1, 0.35),
  max.words = 75,
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)
# Tag each frequency table with its location and stack the two tables.
bigram_freq_df$lieu <- "Ahouli"
bigram_freq_df2$lieu <- "Mibladen"
df <- rbind(bigram_freq_df, bigram_freq_df2)

# Drop a hand-picked list of unwanted bigrams.
# NOTE(review): presumably artefacts pairing the end of one answer with the
# start of the next (plus the "NA NA" placeholder) -- confirm with the author.
df <- df[
  !(df$bigram %in% c(
    "NA NA",
    "agricoles coopératives",
    "agricoles ouverture",
    "agricoles promouvoir",
    "féminines plantation",
    "mine terre",
    "usines coopératives",
    "agricoles projets",
    "féminines ouverture",
    "mine promouvoir",
    "agricole promouvoir",
    "tourisme coopératives",
    "agriculture promouvoir",
    "agriculture ouverture",
    "agriculture coopératives"
  )),
]

# One word cloud per location; text size and colour both encode frequency.
ggplot(df, aes(label = bigram, size = freq, color = freq)) +
  geom_text_wordcloud() +
  facet_wrap(~lieu) +
  scale_color_gradient(low = "darkgrey", high = "darkgreen") +
  clessnverse::theme_clean_light(base_size = 15) +
  labs(title = "Mots les plus fréquents dans les réponses \nsur la question des alternatives économiques \nselon le lieu de résidence\n") +
  scale_size_area(max_size = 13) +
  # Theme tweaks come after the complete theme so they are not overwritten.
  theme(
    plot.title = element_text(hjust = 0.5, size = 30),
    strip.text = element_text(size = 25)
  )

# Save the most recently built plot (ggsave's default) as a 12 x 8 PNG.
ggsave("graphs/Bigram/alt_eco_lieu.png", width = 12, height = 8)