NW: Writer class implemented
This commit is contained in:
parent
1298a45d0f
commit
916269e710
78
wani.py
78
wani.py
|
@ -779,6 +779,84 @@ def load_corpus(filename):
|
||||||
|
|
||||||
return list(words.values())
|
return list(words.values())
|
||||||
|
|
||||||
|
class Writer:
    """Serialize collocation matches as ", "-separated text rows.

    The column layout is controlled by three flags read from ``args``:

    * ``lemma_only``  -- emit only the lemma for each component word,
      instead of token id, word form, lemma and msd.
    * ``without_rep`` -- omit the (not yet implemented) representative-form
      columns, both per word and for the whole collocation.
    * ``group``       -- emit one row per collocation id and append a
      frequency column.

    Output goes to the file named by ``args.output``, or to standard output
    when that is ``None``.
    """

    def __init__(self, args):
        """Store the output options taken from the parsed arguments."""
        self.group = args.group
        self.lemma_only = args.lemma_only
        self.without_rep = args.without_rep
        self.output_file = args.output

    def header(self):
        """Return the list of column names for the header row."""
        per_word = ["Lemma"]
        if not self.lemma_only:
            per_word = ["Token_ID", "Word_form"] + per_word + ["Msd"]
        if not self.without_rep:
            per_word.append("Representative_form")

        # Internal sanity check: header() and length() must stay in sync,
        # because rows are padded using length().
        assert len(per_word) == self.length()

        cols = ["Structure_ID"]
        cols.extend(
            "C{}_{}".format(i + 1, name)
            for i in range(MAX_NUM_COMPONENTS)
            for name in per_word
        )
        cols.append("Collocation_ID")

        if not self.without_rep:
            cols.append("Joint_representative_form")
        if self.group:
            cols.append("Frequency")

        return cols

    def length(self):
        """Return the number of columns emitted for one component word."""
        return 1 + 3 * int(not self.lemma_only) + int(not self.without_rep)

    def from_word(self, word):
        """Return the cells describing one component word.

        ``word`` is an object exposing ``lemma`` (and ``id``, ``text``,
        ``msd`` unless ``lemma_only`` is set), or ``None`` when the
        component has no matched word.  The result always has exactly
        ``self.length()`` elements so that columns stay aligned.
        """
        if word is None:
            # One empty cell per column; a bare "" would add no cells at all.
            return [""] * self.length()

        cols = [word.lemma]
        if not self.lemma_only:
            cols = [word.id, word.text] + cols + [word.msd]
        if not self.without_rep:
            # Placeholder cell: representative form is not yet implemented.
            cols.append("")

        return cols

    def write_out_worker(self, file_handler, matches, structures, colocation_ids):
        """Write the header and one row per match to ``file_handler``.

        ``matches`` is indexed by structure id and yields
        ``(match, reason, collocation_id)`` triples; each ``match`` is looked
        up by the 1-based component index rendered as a string.
        ``colocation_ids`` must provide ``to_id``, ``is_written``, ``num``
        and ``set_written``.  When grouping is enabled, only the first row
        of every collocation id is written, followed by the value of
        ``colocation_ids.num(cid)`` in the frequency column.
        """
        file_handler.write(", ".join(self.header()) + "\n")

        # Width of the per-component section; must match header().
        words_width = MAX_NUM_COMPONENTS * self.length()

        for s in structures:
            for m, _reason, cid in matches[s.id]:
                to_write = []

                for position, _comp in enumerate(s.components, start=1):
                    key = str(position)
                    word = m[key] if key in m else None
                    to_write.extend(self.from_word(word))

                # Pad with empty cells so every row is equally wide.
                to_write.extend([""] * (words_width - len(to_write)))
                to_write = [s.id] + to_write + [colocation_ids.to_id(cid)]

                if not self.without_rep:
                    # Placeholder: joint representative form is not yet
                    # implemented.
                    to_write.append("")

                if self.group:
                    if colocation_ids.is_written(cid):
                        continue
                    to_write.append(colocation_ids.num(cid))
                    colocation_ids.set_written(cid)

                # Cells may be non-strings (e.g. counts), so stringify them.
                file_handler.write(", ".join(map(str, to_write)) + "\n")

            file_handler.flush()

    def write_out(self, matches, structures, colocation_ids):
        """Write all matches to the configured output file or to stdout."""
        if self.output_file is None:
            self.write_out_worker(sys.stdout, matches, structures, colocation_ids)
            return

        # The context manager closes the file even if the worker raises.
        with open(self.output_file, "w") as fp:
            self.write_out_worker(fp, matches, structures, colocation_ids)
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
import time
|
import time
|
||||||
|
|
Loading…
Reference in New Issue
Block a user