make_pairs.py
import argparse
import json
import re
# Journal IDs of Lang-8 entries used in the TEC-JL dataset; these journals are excluded below.
tecjl = [95652, 177778, 715422, 777469, 1065395, 664972, 627170, 597509, 136756, 427746,
953578, 983195, 1073665, 276920, 967431, 564422, 945421, 260887, 779547, 831645,
222711, 451442, 878669, 231348, 981621, 304944, 1035925, 425207, 889352, 913259,
228825, 63072, 431643, 899396, 965385, 124527, 757132, 1124968, 1070799, 298089,
383389, 724131, 716216, 1089491, 290840, 1044037, 873489, 308344, 207910, 66816,
157453, 941330, 984824, 1083067, 431419, 1111492, 114421, 50210, 156002, 804052,
379494, 648453, 1090244, 113166, 419427, 274190, 942221, 520924, 1068032, 948147,
89675, 604184, 806131, 344050, 1090763, 1031417, 568978, 438727, 121095, 1011743,
47733, 1109303, 140570, 123616, 490766, 863941, 400410, 1119717, 676673, 303119,
270650, 591056, 1039775, 745208, 711075, 1052873, 444706, 936281, 802941, 173586,
124581, 59635, 360410, 504768, 516054, 960459, 723381, 89917, 371210, 683934,
97822, 379286, 442117, 78882, 912027, 988396, 817387, 390945, 748278, 645871,
1060258, 27459, 820331, 351016, 172105, 920991, 144289, 898654, 428448, 1108849,
848794, 1060658, 518676, 933870, 1104123, 142372, 735249, 920268, 906241]
# Journal IDs used in a separate fluency corpus (per the variable name); these are also skipped below.
fluency_corpus = [555525, 585226, 622606, 953358, 602640, 652305, 188437, 912918, 378906, 571930,
1041438, 845344, 841761, 351783, 167464, 358440, 1036333, 534573, 524845, 19506,
1011255, 128056, 290874, 203837, 318014, 852544, 629313, 947778, 151104, 701511,
687177, 939081, 625230, 270927, 953940, 947801, 993885, 139361, 973922, 806906,
765547, 1056369, 889970, 575092, 1088119, 833655, 710267, 669823, 986752, 693380,
836744, 744585, 140431, 759956, 759960, 311973, 47271, 535719, 259240, 130738,
813234, 1031353, 307390, 943295, 143039, 205504, 435393, 1093322, 1087181, 179919,
670419, 774868, 553689, 463079, 204522, 385772, 63214, 844528, 220914, 133879,
139513, 1000185, 158459, 851194, 644351, 805120, 224000, 793858, 720134, 603399,
561415, 475401, 769801, 155405, 246035, 767251, 270613, 131347, 513301, 218906,
482074, 673564, 615709, 191268, 830757, 519973, 63271, 642855, 859960, 99647,
90434, 162632, 176458, 76107, 1017163, 952653, 350539, 891220, 613730, 983906,
222565, 203122, 703346, 811892, 529782, 1084279, 320887, 1028473, 763770, 503679,
13186, 262534, 226702, 554895, 130449, 223122, 694163, 689042, 8088, 922011,
969629, 817055, 337321, 273324, 484787, 689079, 620472, 966583, 1010108, 283070,
36800, 981442, 613828, 67016, 521160, 939472, 279508, 1119701, 140767, 852448,
521190, 162792, 680427, 210417, 281588, 136696, 865274, 930812, 862207]
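
# Note (addition, not in the original script): the journal-ID checks in main() scan these
# lists linearly; converting them to sets once keeps the behaviour identical while making
# each membership test O(1).
tecjl = set(tecjl)
fluency_corpus = set(fluency_corpus)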

def parse_args():
    parser = argparse.ArgumentParser(description="Create Japanese sentence pairs from the Lang-8 corpus, excluding the journals used in TEC-JL.")
    parser.add_argument("-i", "--input", dest="input_path", metavar="<path to the input file>", required=True, help="Path to the Lang-8 corpus (for example, ./lang-8-20111007-2.0/lang-8-20111007-L1-v2.dat).")
    parser.add_argument("-o", "--output", dest="output_path", metavar="<path to the output file>", required=True, help="Path to the output file.")
    args = parser.parse_args()
    return args

def main():
    args = parse_args()
    with open(args.input_path, "r", encoding="utf8") as input_file, open(args.output_path, "w", encoding="utf8") as output_file:
        for line in input_file:
            try:
                line = json.loads(line)
                journal_id = int(line[0])
                sentence_id = int(line[1])
                l2 = line[2]  # language being learned
                l1 = line[3]  # writer's native language
                srcs = line[4]  # ["learner_sentence1", "learner_sentence2", ...]
                trgs = line[5]  # [["correction1_to_sentence1", "correction2_to_sentence1", ...], ["correction1_to_sentence2", ...], ...]
                if "Japanese" not in l2:
                    continue
                if journal_id in tecjl:
                    continue
                if journal_id in fluency_corpus:
                    continue
                outputs = make_pairs(srcs, trgs)
                for output in outputs:
                    print(output, file=output_file)
            except (json.JSONDecodeError, ValueError, TypeError, IndexError):
                # Skip lines that are not well-formed Lang-8 records.
                continue

def make_pairs(srcs, trgs):
    outputs = []
    for i, src in enumerate(srcs):
        src = src.replace("\t", " ")
        src = remove_tags(src)
        if len(trgs[i]) == 0:
            # No correction: pair the learner sentence with itself.
            output = src + "\t" + src
            outputs.append(output)
        else:
            # One pair per correction of this sentence.
            for trg in trgs[i]:
                trg = trg.replace("\t", " ")
                trg = remove_tags(trg)
                output = src + "\t" + trg
                outputs.append(output)
    return outputs
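
# Illustrative example (the sentences are made up, not taken from the corpus): given
#   srcs = ["今日は学校に行た。"]
#   trgs = [["今日は学校に行った。", "今日、学校へ行った。"]]
# make_pairs(srcs, trgs) returns two tab-separated pairs, one per correction:
#   "今日は学校に行た。\t今日は学校に行った。"
#   "今日は学校に行た。\t今日、学校へ行った。"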

def remove_tags(sent):
    # Strip Lang-8 colour/bold markup and delete struck-out ([sline]...[/sline]) spans.
    for tag in ("f-red", "f-blue", "f-bold", "赤", "青"):
        sent = sent.replace("[" + tag + "]", "")
        sent = sent.replace("[/" + tag + "]", "")
    sent = re.sub(r"\[sline\](.*)\[/sline\]", "", sent)
    return sent
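
# Illustrative example (made-up input): remove_tags("[f-red]明日[/f-red]は[sline]雨[/sline]晴れです。")
# returns "明日は晴れです。", i.e. the colour tags are stripped and the struck-out span is deleted.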
if __name__ == "__main__":
main()
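
# Example invocation (the corpus path follows the --input help text; the output file name is
# an illustrative assumption):
#   python make_pairs.py -i ./lang-8-20111007-2.0/lang-8-20111007-L1-v2.dat -o ./pairs.tsv
# Each output line is a tab-separated "learner_sentence<TAB>correction" pair; sentences with
# no correction are paired with themselves.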