class Writer:
    """Write collocation matches as rows of comma-separated columns.

    Every row has the same layout as the list returned by ``header()``:
    a structure id, ``MAX_NUM_COMPONENTS`` groups of per-word columns, a
    collocation id and, depending on the options, a joint representative
    form and a frequency.

    Options are read from ``args``:
        group: emit one row per collocation id and append its frequency.
        lemma_only: emit only the lemma for every component.
        without_rep: omit the (not yet implemented) representative forms.
        output: path of the output file, or ``None`` to use ``sys.stdout``.
    """

    def __init__(self, args):
        self.group = args.group
        self.lemma_only = args.lemma_only
        self.without_rep = args.without_rep
        self.output_file = args.output

    def header(self):
        """Return the list of column names for the header line."""
        word_cols = ["Lemma"]
        if not self.lemma_only:
            word_cols = ["Token_ID", "Word_form"] + word_cols + ["Msd"]

        if not self.without_rep:
            word_cols.append("Representative_form")

        # Internal invariant: length() must describe exactly these columns.
        assert len(word_cols) == self.length()

        cols = ["Structure_ID"]
        cols.extend(
            "C{}_{}".format(i + 1, name)
            for i in range(MAX_NUM_COMPONENTS)
            for name in word_cols
        )
        cols.append("Collocation_ID")

        if not self.without_rep:
            cols.append("Joint_representative_form")
        if self.group:
            cols.append("Frequency")

        return cols

    def length(self):
        """Return the number of columns emitted for a single component."""
        return 1 + 3 * int(not self.lemma_only) + int(not self.without_rep)

    def from_word(self, word):
        """Return the column values for one component.

        ``word`` is an object with ``lemma`` (and, unless ``lemma_only`` is
        set, ``id``, ``text`` and ``msd``) attributes, or ``None`` when the
        component has no match.  The result always has ``length()`` items;
        a missing word yields that many empty strings.
        """
        if word is None:
            # A list of blanks, not `"" * n` (which is just the empty string
            # and would contribute no columns at all).
            return [""] * self.length()

        cols = [word.lemma]
        if not self.lemma_only:
            cols = [word.id, word.text] + cols + [word.msd]
        if not self.without_rep:
            # Representative form is not yet implemented; keep its column.
            cols.append("")

        return cols

    def write_out_worker(self, file_handler, matches, structures, colocation_ids):
        """Write the header and one line per match to ``file_handler``.

        ``matches`` is indexed by structure id and yields
        ``(match, reason, collocation_id)`` triples, where ``match`` is
        indexed by the 1-based component number as a string.  When
        ``group`` is set, only the first match of every collocation id is
        written, followed by ``colocation_ids.num(cid)``.
        """
        file_handler.write(", ".join(self.header()) + "\n")

        # Width of the per-component part of a row; must agree with header().
        components_width = MAX_NUM_COMPONENTS * self.length()

        for s in structures:
            for m, _reason, cid in matches[s.id]:
                to_write = []

                for position in range(1, len(s.components) + 1):
                    key = str(position)
                    word = m[key] if key in m else None
                    to_write.extend(self.from_word(word))

                # Pad so every row has the same number of component columns.
                to_write.extend([""] * (components_width - len(to_write)))
                to_write = [s.id] + to_write + [colocation_ids.to_id(cid)]

                if not self.without_rep:
                    # Joint representative form is not yet implemented.
                    to_write.append("")

                if self.group:
                    if colocation_ids.is_written(cid):
                        continue
                    to_write.append(colocation_ids.num(cid))
                    colocation_ids.set_written(cid)

                # Ids and frequencies may not be strings; join() needs str.
                file_handler.write(", ".join(map(str, to_write)) + "\n")

            file_handler.flush()

    def write_out(self, matches, structures, colocation_ids):
        """Write all matches to ``output_file``, or to stdout if it is None."""
        if self.output_file is None:
            self.write_out_worker(sys.stdout, matches, structures, colocation_ids)
            return

        # The context manager closes the file even if the worker raises.
        with open(self.output_file, "w") as fp:
            self.write_out_worker(fp, matches, structures, colocation_ids)