-
Notifications
You must be signed in to change notification settings - Fork 0
/
tokenize.nf
executable file
·109 lines (90 loc) · 3.83 KB
/
tokenize.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env nextflow
/*
vim: syntax=groovy
-*- mode: groovy;-*-
*/
// Start-up banner written to the Nextflow log.
def bannerRule = "----------------------------------"
log.info bannerRule
log.info "Tokenisation Pipeline using ucto"
log.info bannerRule

// Default values for the optional pipeline parameters.
// The virtualenv default is taken from the VIRTUAL_ENV environment variable
// when it is set, and is the empty string otherwise.
def environment = System.getenv()
params.virtualenv = environment.getOrDefault('VIRTUAL_ENV', "")
params.extension = "txt"
params.outputdir = "tokenized_output"
params.sentenceperline = false
params.inputclass = "current"
params.outputclass = "current"
// Show the usage text and stop with exit status 2 when --help is present or
// when one of the mandatory parameters (inputdir, language) was not supplied.
def helpRequested = params.containsKey('help')
def mandatoryMissing = ['inputdir', 'language'].any { !params.containsKey(it) }
if (helpRequested || mandatoryMissing) {
    def usageLines = [
        "Usage:",
        " tokenize.nf",
        "",
        "Mandatory parameters:",
        " --inputdir DIRECTORY Path to the corpus directory",
        " --language STR The language to tokenise for (eng,nld,spa,por,ita,fra,deu,tur,rus,generic)",
        "",
        "Optional parameters:",
        " --extension EXTENSION Extension of input documents (default: txt, suggestion: folia.xml)",
        " --inputformat STR Set to 'text' or 'folia', automatically determined from extension if possible",
        " --virtualenv PATH Path to Python Virtual Environment to load (usually path to LaMachine)",
        " --sentenceperline Indicates that the input (plain text only) is already in a one sentence per line format, skips sentence detection (default: false)",
        " --outputdir DIRECTORY Output directory (FoLiA documents)",
        " --inputclass CLASS Set the FoLiA text class to use as input (default: current)",
        " --outputclass CLASS Set the FoLiA text class to use as output (default: current)"
    ]
    for (usageLine in usageLines) {
        log.info usageLine
    }
    exit 2
}
// Infer the input format from the extension: anything mentioning "xml" or
// "folia" is treated as FoLiA, everything else as plain text.
// NOTE(review): relies on Nextflow keeping a command-line --inputformat value
// over this in-script assignment — confirm.
def extensionLooksLikeFolia = ['xml', 'folia'].any { params.extension.find(it) != null }
params.inputformat = extensionLooksLikeFolia ? "folia" : "text"

// Recursively collect every file with the requested extension below the input
// directory, skipping files whose base name is "trace"
// (presumably Nextflow's own trace output — verify).
inputdocuments = Channel
    .fromPath(params.inputdir + "/**." + params.extension)
    .filter { it.baseName != "trace" }
if (params.inputformat == "folia") {
// Tokenise FoLiA XML documents with ucto (-F: FoLiA input, -X: FoLiA output).
// Each input document yields "<baseName>.tok.folia.xml", which is copied to
// params.outputdir and emitted on the tokoutput channel.
// NOTE(review): for an input named doc.folia.xml, baseName is presumably
// "doc.folia", giving doc.folia.tok.folia.xml — confirm this naming is intended.
process tokenize_folia2folia {
    publishDir params.outputdir, mode: 'copy', overwrite: true

    input:
    file inputdocument from inputdocuments
    val language from params.language
    val inputclass from params.inputclass
    val outputclass from params.outputclass
    val virtualenv from params.virtualenv

    output:
    file "${inputdocument.baseName}.tok.folia.xml" into tokoutput

    script:
    """
    # Nounset is switched off while the optional virtual environment is activated.
    set +u
    if [ -n "${virtualenv}" ]; then
        # Quoted so that a virtualenv path containing whitespace is not word-split.
        source "${virtualenv}/bin/activate"
    fi
    set -u

    ID="${inputdocument.baseName}"
    ucto -L "${language}" -X --id "\$ID" --inputclass "${inputclass}" --outputclass "${outputclass}" -F "${inputdocument}" "${inputdocument.baseName}.tok.folia.xml"
    """
}
} else {
//assume text
// Tokenise plain-text documents with ucto and write FoLiA XML (-X).
// Each input document yields "<baseName>.tok.folia.xml", which is copied to
// params.outputdir and emitted on the tokoutput channel. When sentenceperline
// is true, ucto is additionally given -n.
process tokenize_text2folia {
    publishDir params.outputdir, mode: 'copy', overwrite: true

    input:
    file inputdocument from inputdocuments
    val language from params.language
    val sentenceperline from params.sentenceperline
    val virtualenv from params.virtualenv
    val outputclass from params.outputclass

    output:
    file "${inputdocument.baseName}.tok.folia.xml" into tokoutput

    script:
    """
    # Nounset is switched off while the optional virtual environment is activated.
    set +u
    if [ -n "${virtualenv}" ]; then
        # Quoted so that a virtualenv path containing whitespace is not word-split.
        source "${virtualenv}/bin/activate"
    fi
    set -u

    # Extra ucto flags; deliberately expanded unquoted below so that an empty
    # value contributes no argument at all.
    opts=""
    if [[ "${sentenceperline}" == "true" ]]; then
        opts="\$opts -n"
    fi

    ID="${inputdocument.baseName}"
    # The document ID is quoted (as in tokenize_folia2folia) so that a base name
    # containing whitespace is passed to ucto as a single argument.
    ucto -L "${language}" \$opts -X --id "\$ID" --outputclass "${outputclass}" "${inputdocument}" "${inputdocument.baseName}.tok.folia.xml"
    """
}
}
// Print one confirmation line per tokenised document, pointing at its
// location under the output directory.
tokoutput.subscribe { tokenizedDocument ->
    println "Tokenizer output document written to ${params.outputdir}/${tokenizedDocument.name}"
}