forked from koskenni/pytwolc
-
Notifications
You must be signed in to change notification settings - Fork 0
/
words2zerofilled.py
166 lines (147 loc) · 5.69 KB
/
words2zerofilled.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
"""Aligns morphs that occur in the example words by inserting zero symbols
© Kimmo Koskenniemi, 2017-2018. This is free software under the GPL 3 license.
"""
import argparse
# Command-line interface of the script.  The parsed result is kept in the
# module-level name ``args`` which the processing steps below read.
argparser = argparse.ArgumentParser(
    "python3 words2zerofilled.py",
    description="Aligns a set of word forms with morph boundaries")
# argparse only honours ``default`` on a positional argument when that
# argument may be omitted, hence nargs="?" on both file names.  Invocations
# that pass both names explicitly are parsed exactly as before.
argparser.add_argument(
    "input",
    nargs="?",
    default="ksk-seg-examp.csv",
    help="morpheme names and segmented example words as a CSV file")
argparser.add_argument(
    "output",
    nargs="?",
    default="ksk-alig-examp.csv",
    help="example words plus zero-filled aligned forms as a CSV file")
argparser.add_argument(
    "-s", "--morph-separator",
    default=".",
    help="separator between morphs in the word form")
argparser.add_argument(
    "-d", "--csv-delimiter",
    default=",",
    help="delimiter between the fields")
argparser.add_argument(
    "-n", "--name-separator",
    default=" ",
    help="separator between morpheme names in the morpheme list")
# NOTE(review): nothing in this file reads args.zero_symbol; the zero symbol
# used when stripping zeros in STEP 3 is the literal "Ø" -- confirm whether
# this option is meant to be forwarded to the aligner.
argparser.add_argument(
    "-z", "--zero-symbol",
    default="Ø",
    help="symbol to be inserted in word forms to align them")
argparser.add_argument(
    "-v", "--verbosity", default=0, type=int,
    help="level of diagnostic and debugging output")
args = argparser.parse_args()
import re
import csv
import collections
from orderedset import OrderedSet
# STEP 1:
# Read in the segmented words and collect the allomorphs of each morpheme

# morphs_of_morpheme[morpheme_name] == OrderedSet of its allomorphs, in the
# order in which they are first met in the input.
morphs_of_morpheme = {}

# All accepted example words in input order.  Each word is represented as a
# list of (morpheme, morph) pairs.
seg_example_list = []

# Names of the stem morphemes, i.e. the first morpheme name of each word.
stem_name_set = set()

# newline="" is what the csv module asks for when it is given a file object,
# and the with-block closes the file even if a row raises (e.g. a missing
# MORPHEMES or MORPHS column).
with open(args.input, newline="") as csvfile:
    # The delimiter is passed to this reader only, instead of being patched
    # onto the csv.excel dialect class shared by the whole process.
    reader = csv.DictReader(csvfile, delimiter=args.csv_delimiter)
    # i counts data rows (the header row is not counted), starting from 1.
    for i, row in enumerate(reader, start=1):
        morphemes_field = row["MORPHEMES"].strip()
        morpheme_list = morphemes_field.split(args.name_separator)
        morph_list = row["MORPHS"].strip().split(args.morph_separator)
        if args.verbosity >= 25:
            print(row["MORPHEMES"])
            print(morpheme_list)
            print(row["MORPHS"])
            print(morph_list)
        if len(morpheme_list) != len(morph_list):
            print("** line", i, ":", row["MORPHEMES"],
                  "is incompatible with", row["MORPHS"])
            continue
        if not morphemes_field:
            # str.split(sep) never returns an empty list ("".split(" ") is
            # [""]), so a row without morpheme names has to be detected on
            # the field itself; otherwise a morpheme named "" would be
            # registered as a stem.
            continue
        stem_name_set.add(morpheme_list[0])
        name_morph_pair_lst = list(zip(morpheme_list, morph_list))
        seg_example_list.append(name_morph_pair_lst)
        for morpheme, morph in name_morph_pair_lst:
            if morpheme not in morphs_of_morpheme:
                morphs_of_morpheme[morpheme] = OrderedSet()
            # NOTE(review): the allomorph is stored stripped here while the
            # pair kept in seg_example_list holds the unstripped morph --
            # confirm that morphs never carry inner padding.
            morphs_of_morpheme[morpheme].add(morph.strip())
print("-- STEP 1 COMPLETED (seg_example_list, stem_name_set, morphs_of_morpheme done)--")
# STEP 2:
# Align the allomorphs of every morpheme with each other
from multialign import aligner

# alignments[morpheme name] == the sequence of aligned symbols returned by
# aligner() for that morpheme.  STEP 3 reads position i of every symbol in
# the sequence to rebuild the i-th zero-filled allomorph.
alignments = {}

show_alignment = args.verbosity >= 20
for morpheme_name in sorted(morphs_of_morpheme):
    allomorphs = list(morphs_of_morpheme[morpheme_name])
    if show_alignment:
        print("words:", allomorphs)
    # Second argument of aligner(): 1 when the morpheme has more than ten
    # allomorphs, otherwise 2.
    # NOTE(review): presumably a bound on inserted zeros -- confirm in
    # multialign.
    if len(allomorphs) > 10:
        zero_arg = 1
    else:
        zero_arg = 2
    symbol_seq = aligner(allomorphs, zero_arg, morpheme_name)
    if show_alignment:
        print("aligned_sym_seq:", symbol_seq)
    alignments[morpheme_name] = symbol_seq
print("-- STEP 2 COMPLETED (alignments done) --")
# STEP 3:
# Derive the zero-filled morphs from the sequences of aligned symbols

# aligned_morphs[morpheme][original morph] == zero-filled morph
aligned_morphs = {}

for morpheme, aligned_sym_seq in alignments.items():
    if args.verbosity >= 25:
        print("aligned_sym_seq:", aligned_sym_seq)
    if not aligned_sym_seq:
        # Nothing was aligned for this morpheme: map the empty morph to
        # the empty zero-filled form.
        aligned_morphs[morpheme] = {"": ""}
        continue
    morph_table = aligned_morphs.setdefault(morpheme,
                                            collections.OrderedDict())
    # The first aligned symbol determines how many allomorphs are rebuilt;
    # the k-th allomorph is spelled by the k-th character of every symbol.
    morph_count = len(aligned_sym_seq[0])
    zero_filled_forms = [
        "".join(symbol[k] for symbol in aligned_sym_seq)
        for k in range(morph_count)
    ]
    for zero_filled in zero_filled_forms:
        # Dropping the zero symbols and blanks gives back the morph as it
        # was written in the input; that string is the lookup key.
        plain_morph = re.sub(r"[Ø ]+", r"", zero_filled)
        morph_table[plain_morph] = zero_filled
if args.verbosity >= 20:
    print("aligned_morphs", aligned_morphs)
print("-- STEP 3 COMPLETED (aligned_morphs done) --")
# STEP 4:
# Write the example word forms plus their zero-filled morphs

# forms_of_morphs[first morph of a word] == set of strings, each being the
# space-joined morpheme names that follow the first morpheme in some example.
# It is only collected here; nothing later in this file reads it.
forms_of_morphs = {}

# The with-block guarantees that the output is flushed and the file closed,
# also when a lookup below raises half-way through the examples.
with open(args.output, "w", newline="") as out_file:
    writer = csv.DictWriter(out_file,
                            ["MORPHEMES", "MORPHS", "ZEROFILLED"],
                            delimiter=args.csv_delimiter)
    writer.writeheader()
    for seg_example in seg_example_list:
        if args.verbosity >= 20:
            print("seg_example:", seg_example)
        morpheme_lst = [morpheme for morpheme, morph in seg_example]
        morph_lst = [morph for morpheme, morph in seg_example]
        # A morph without an entry in the alignment table yields an empty
        # zero-filled form rather than an error.
        zero_filled_morph_lst = [aligned_morphs[morpheme].get(morph, "")
                                 for (morpheme, morph) in seg_example]
        # NOTE(review): morpheme names are always joined with a blank here,
        # whatever args.name_separator was used for reading -- confirm that
        # the consumers of this file expect that.
        writer.writerow({
            "MORPHEMES": " ".join(morpheme_lst),
            "MORPHS": args.morph_separator.join(morph_lst),
            "ZEROFILLED": args.morph_separator.join(zero_filled_morph_lst),
        })
        forms_of_morphs.setdefault(morph_lst[0], set()).add(
            " ".join(morpheme_lst[1:]))
print("-- STEP 4 COMPLETED (zero-filled morphs and the CSV file done) --")