-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathcreateDictionary.R
46 lines (45 loc) · 1.22 KB
/
createDictionary.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# Positional command-line arguments, as they are used further down in this
# script:
#   args[1]  output file handed to fwrite()
#   args[2]  first input table handed to fread()
#   args[3]  second input table handed to fread()
#   args[4]  thread count, converted with as.integer() for fread()/fwrite()
args <- commandArgs(trailingOnly = TRUE)
# dplyr and tidyr supply the pipe and the reshaping verbs used by the helper
# functions; data.table supplies fread()/fwrite(). The load order is left
# untouched because it decides which package masks which.
library(dplyr)
library(tidyr)
library(data.table)
# Expand rows whose key field holds several keys joined with "; ".
#
# df: two-column data frame. The first column is renamed to `key`, the
#     second to `value`.
# Returns: the table with one row per individual key. After unnesting, the
#     two columns are swapped by position, so the final column order follows
#     wherever unnest() placed `key`.
fixCollapsed <- function(df) {
  colnames(df) <- c("key", "value")
  expanded <- df %>%
    # strsplit() rejects non-character input, so coerce first: a key column
    # made only of numbers is parsed as integer by fread() and would
    # otherwise abort the whole run with "non-character argument".
    mutate(key = strsplit(as.character(key), "; ")) %>%
    unnest(key)
  expanded[, c(2, 1)]
}
# Collapse rows that share a key into a single row per key.
#
# All values of a key are first joined with "; ", then split on "; " again,
# de-duplicated (first occurrence wins) and re-joined with "; ".
#
# df: table with a `key` and a `value` column.
# Returns: the summarised table, one row per key, columns `key` and `value`.
fixDuplicated <- function(df) {
  merged <- df %>%
    group_by(key) %>%
    summarise(value = paste(value, collapse = "; "))
  parts <- lapply(strsplit(merged$value, "; "), unique)
  # vapply() guarantees a character vector. sapply() returns an empty list
  # when there are no rows, which would turn `value` into a list column.
  merged$value <- vapply(parts, paste, character(1), collapse = "; ")
  merged
}
# Drop every row whose key begins with a hyphen.
#
# df: table with a `key` column.
# Returns: `df` restricted to the rows whose key does not match "^-".
#     Rows with a missing key are kept, because grepl() reports NA as
#     "no match".
removeUnknown <- function(df) {
  keep <- !grepl("^-", df$key)
  df[keep, ]
}
# ---- Script body ----
# Reads the two input tables (no header row), cleans each one with
# fixCollapsed() -> fixDuplicated() -> removeUnknown(), and writes both
# results to one tab-separated output file: the first table creates the
# file, the second is appended to it.
#   args[1] output file, args[2] first input, args[3] second input,
#   args[4] number of threads for fread()/fwrite().

# Fail early with a readable message instead of an opaque fread() error.
if (length(args) < 4L) {
  stop("expected 4 arguments: <output> <input1> <input2> <threads>",
       call. = FALSE)
}
nThreads <- suppressWarnings(as.integer(args[4]))
if (is.na(nThreads)) {
  stop("thread count must be an integer, got: ", args[4], call. = FALSE)
}

# Read one input table, run the cleaning pipeline and write the result.
#   input:   path of the table to read
#   output:  path of the file to write
#   nThread: thread count forwarded to fread() and fwrite()
#   append:  FALSE writes a fresh file, TRUE appends to an existing one
writeCleanDictionary <- function(input, output, nThread, append = FALSE) {
  # `header` is spelled out; the former `head =` only worked through
  # partial argument matching.
  raw <- fread(input, stringsAsFactors = FALSE,
               header = FALSE, nThread = nThread)
  cleaned <- as.data.frame(raw) %>%
    fixCollapsed() %>%
    fixDuplicated() %>%
    removeUnknown()
  fwrite(cleaned, file = output, sep = "\t",
         append = append, nThread = nThread)
  invisible(output)
}

writeCleanDictionary(args[2], args[1], nThreads)
writeCleanDictionary(args[3], args[1], nThreads, append = TRUE)