# 5_Text_als_Daten.R
# Load the packages used below (udpipe is needed for the lemmatisation step)
library(friends)
library(dplyr)
library(tidytext)
library(quanteda)
library(quanteda.textplots)
library(udpipe)

# Set the working directory and load the Harry Potter text data
setwd("C:/Users/Drecker/Documents/Lehre")
load("./Daten/harry_data.Rda")
# Baseline pipeline: corpus -> tokens -> document-feature matrix (dfm)
harry_data %>%
  corpus() %>%
  tokens() %>%
  dfm(verbose = FALSE)
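# (Added sketch, not part of the original script: inspect the most frequent
#  features of the uncleaned dfm with quanteda::topfeatures(); without any
#  cleaning the top features are typically punctuation marks and stopwords.)
harry_data %>%
  corpus() %>%
  tokens() %>%
  dfm(verbose = FALSE) %>%
  topfeatures(n = 20)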
# Tokenise and drop punctuation, numbers and symbols
harry_data %>%
  corpus() %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE)
# Additionally remove English stopwords and build unigrams
harry_data %>%
  corpus() %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE) %>%
  tokens_remove(stopwords("english")) %>%
  tokens_ngrams(n = 1)
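# (Added sketch: the same pipeline with n = 2 yields bigrams instead of
#  unigrams; tokens_ngrams() joins adjacent tokens with "_" by default.)
harry_data %>%
  corpus() %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE) %>%
  tokens_remove(stopwords("english")) %>%
  tokens_ngrams(n = 2)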
# Comparison word cloud per book, without any cleaning
harry_data %>%
  corpus() %>%
  tokens() %>%
  dfm(verbose = FALSE) %>%
  dfm_group(groups = title) %>%
  quanteda.textplots::textplot_wordcloud(comparison = TRUE, max_words = 300, labelsize = 1)
# Comparison word cloud after removing punctuation, numbers and symbols
harry_data %>%
  corpus() %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE) %>%
  dfm(verbose = FALSE) %>%
  dfm_group(groups = title) %>%
  quanteda.textplots::textplot_wordcloud(comparison = TRUE, max_words = 300, labelsize = 1)
# Comparison word cloud after additionally removing English stopwords
harry_data %>%
  corpus() %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE) %>%
  tokens_remove(stopwords("english")) %>%
  tokens_ngrams(n = 1) %>%
  dfm(verbose = FALSE) %>%
  dfm_group(groups = title) %>%
  quanteda.textplots::textplot_wordcloud(comparison = TRUE, max_words = 300, labelsize = 1)
# Comparison word cloud after additionally stemming the tokens
harry_data %>%
  corpus() %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE) %>%
  tokens_remove(c(stopwords("english"), "c")) %>%
  tokens_wordstem(language = "english") %>%
  tokens_ngrams(n = 1) %>%
  dfm(verbose = FALSE) %>%
  dfm_group(groups = title) %>%
  quanteda.textplots::textplot_wordcloud(comparison = TRUE, max_words = 300, labelsize = 1)
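# (Added sketch: char_wordstem() shows what the stemmer does to individual
#  word forms, e.g. plural and -ing forms collapse to the same stem.)
char_wordstem(c("wizards", "wizard", "believing", "believed"), language = "english")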
# Lemmatisation with udpipe: download and load the English model, annotate the
# corpus, and keep only non-punctuation tokens that have a lemma
harry_corpus <- harry_data %>% corpus()
lemma_en <- udpipe_download_model(language = "english")
lemma_en <- udpipe_load_model(file = lemma_en$file_model)
lemma_en <- udpipe(harry_corpus, lemma_en, parallel.cores = 8)
lemma_en <- lemma_en %>% filter(upos != "PUNCT", !is.na(lemma))
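# (Added sketch: inspect the udpipe annotation; the token/lemma pairs below
#  are what tokens_replace() uses in the next step to map inflected forms to
#  their lemmas. Assumes lemma_en from above.)
lemma_en %>%
  select(token, lemma, upos) %>%
  head(10)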
# Tokenise as before and replace each token by its lemma
harry_token <- harry_corpus %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE) %>%
  tokens_remove(c(stopwords("english"), "c")) %>%
  tokens_ngrams(n = 1)
harry_token <- tokens_replace(harry_token, pattern = lemma_en$token, replacement = lemma_en$lemma)
# Group the lemmatised dfm by book and draw the comparison word cloud
harry_token %>%
  dfm(verbose = FALSE) %>%
  dfm_group(groups = title)
harry_dfm <- harry_token %>%
  dfm(verbose = FALSE) %>%
  dfm_group(groups = title)
harry_dfm %>%
  quanteda.textplots::textplot_wordcloud(comparison = TRUE, max_words = 300, labelsize = 1)
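# (Added sketch: the most frequent lemmas overall in the grouped dfm; assumes
#  harry_dfm from the step above.)
topfeatures(harry_dfm, n = 15)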
# Comparison cloud based on tf-idf weights, built with reshape2 + wordcloud
library(reshape2)
library(wordcloud)
harry_token %>%
  dfm(verbose = FALSE) %>%
  dfm_group(groups = title) %>%
  dfm_tfidf() %>%
  tidy() %>%
  acast(term ~ document, value.var = "count", fill = 0) %>%
  comparison.cloud(title.size = 1, random.order = FALSE, max.words = 300)