-
Notifications
You must be signed in to change notification settings - Fork 0
/
prep.R
65 lines (56 loc) · 2.16 KB
/
prep.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# Reduce raw ingredient text to (roughly) the bare ingredient name.
#
# The steps run strictly in the order written; later patterns rely on what
# earlier ones have already removed (e.g. dashes become spaces before the
# numbers on either side of them are stripped).
#
# x: character vector of ingredient strings.
# Returns a character vector of the same length with outer whitespace trimmed.
clean <- function(x) {
  # footnote marker: drop the asterisk and everything that follows it
  txt <- str_remove_all(x, "\\*.*")

  # leading annotation words
  txt <- str_remove(txt, "^(about|additional|accompaniments?:)\\s*")

  # leading "a " / "an "
  txt <- str_remove(txt, "^an?\\s+")

  # dashes, quotes and a few punctuation marks turn into spaces
  txt <- str_replace_all(txt, "[-—–‑\"\':%;]", " ")

  # parenthesised and bracketed asides
  txt <- str_remove_all(txt, "\\(.*?\\)\\s*")
  txt <- str_remove_all(txt, "\\[.*?\\]\\s*")

  # ranges such as "1/2 to 3/4" or "2 or 3"
  txt <- str_remove_all(
    txt,
    "\\d+(/\\d+)?(\\.\\d+)?\\s*(or|to)\\s*\\d+(/\\d+)?(\\.\\d+)?\\s?"
  )

  # any remaining numbers (integers, fractions, decimals)
  txt <- str_remove_all(txt, "\\d+(/\\d+)?(\\.\\d+)?\\s*")

  # a slash left at the very start
  txt <- str_remove_all(txt, "^/")

  # one leading unit / size word, optionally plural or abbreviated with "."
  txt <- str_remove(
    txt,
    "^\\s*(cup|tablespoon|teaspoon|gram|ounce|pound|large|recipe|medium|small|stick|inch|quart|pint|can|gallon|bag|oz|qt|lb|g|tbsp|tsp)s?\\.? "
  )

  # commas stranded at the start by the removals above
  txt <- str_remove(txt, "^(,\\s*)+")

  str_trim(txt)
}
#' Normalise ingredient names and optionally build an id-by-ingredient matrix.
#'
#' Processing steps, in order:
#' 1. Rows whose `base` is `NA` are flagged in a new logical `marker` column
#'    and get `clean()`-ed text from the `ingredients` column as their `base`.
#' 2. Every `base` is transliterated with the "latin-ascii" transform and
#'    passed through `clean()`; rows whose `base` ends up empty or `NA` are
#'    dropped.
#' 3. Doubled names of the form "x x" are collapsed to "x".
#' 4. A name ending in "s" is replaced by the name without that "s" whenever
#'    the shorter form also occurs in `base`.
#'
#' @param ingredients A data frame (tibble or base `data.frame`) with a `base`
#'   column. An `ingredients` column is read only where `base` is `NA`; an
#'   `id` column is needed only when `to_mat = TRUE`.
#' @param min_num Matrix output only: keep names that occur under at least
#'   this many distinct `id` values.
#' @param to_mat If `TRUE` (default) return a matrix, otherwise the cleaned
#'   data frame.
#' @return With `to_mat = FALSE`, `ingredients` with the cleaned `base` and the
#'   added `marker` column. With `to_mat = TRUE`, a matrix with one row per
#'   remaining `id` (the `id` column itself is dropped) and one column per
#'   kept name, holding 1 where the pair occurs and 0 otherwise.
prep <- function(ingredients, min_num = 1, to_mat = TRUE) {
  stopifnot(is.data.frame(ingredients))
  if (!"base" %in% names(ingredients)) {
    stop("`ingredients` must contain a `base` column", call. = FALSE)
  }

  # replace nas with regex cleaned version of the raw text
  nas <- is.na(ingredients$base)
  ingredients$marker <- nas
  if (any(nas)) {
    if (!"ingredients" %in% names(ingredients)) {
      stop(
        "`ingredients` must contain an `ingredients` column to fill NA bases",
        call. = FALSE
      )
    }
    ingredients$base[nas] <- clean(ingredients$ingredients[nas])
  }

  # Transliterate and clean every base. Rows filled in above are therefore
  # cleaned a second time; this is kept on purpose because clean() strips only
  # one leading unit per pass, so the second pass can change the result.
  ingredients <- ingredients %>%
    mutate(
      base = clean(stringi::stri_trans_general(base, id = "latin-ascii"))
    ) %>%
    filter(str_length(base) > 0)

  # remove doubles ("x x" -> "x"); str_match() yields NA where nothing matches.
  # Plain vector replacement is used so this works for base data.frames too
  # (single-column `[` on a data.frame drops to a vector, unlike a tibble).
  halves <- str_match(ingredients$base, "^(.+) \\1$")[, 2]
  doubled <- !is.na(halves)
  ingredients$base[doubled] <- halves[doubled]

  # combine plurals and singulars: map "xs" -> "x" only when "x" itself occurs
  bases <- unique(ingredients$base)
  stems <- str_match(bases, "^(.*)s$")[, 2]
  has_singular <- !is.na(stems) & stems %in% bases
  hit <- match(ingredients$base, bases[has_singular])
  plural_rows <- !is.na(hit)
  ingredients$base[plural_rows] <- stems[has_singular][hit[plural_rows]]

  if (!to_mat) {
    return(ingredients)
  }

  ingredients %>%
    distinct(id, base) %>%
    add_count(base) %>% # n = number of distinct ids using this base
    filter(n >= min_num) %>%
    mutate(n = 1) %>% # reuse n as the cell value for present pairs
    pivot_wider(names_from = "base", values_from = n, values_fill = 0) %>%
    select(-id) %>%
    as.matrix()
}