diff --git a/wani.py b/wani.py index 68952a3..5d4de6c 100644 --- a/wani.py +++ b/wani.py @@ -786,6 +786,7 @@ class Writer: self.lemma_only = args.lemma_only self.without_rep = args.without_rep self.output_file = args.output + self.multiple_output = args.multiple_output def header(self): cols = ["Lemma"] @@ -821,43 +822,66 @@ class Writer: return cols - def write_out_worker(self, file_handler, matches, structures, colocation_ids): + def write_header(self, file_handler): file_handler.write(", ".join(self.header()) + "\n") - for s in structures: - ms = matches[s.id] + def write_out_worker(self, file_handler, matches, structure_id, components, colocation_ids): + for m, reason, cid in matches: + to_write = [] - for m, reason, cid in ms: - to_write = [] + for idx, comp in enumerate(components): + idx = str(idx + 1) + word = m[idx] if idx in m else None + to_write.extend(self.from_word(word)) - for idx, comp in enumerate(s.components): - idx = str(idx + 1) - word = m[idx] if idx in m else None - to_write.extend(self.from_word(word)) + # make them equal size + to_write.extend([""] * (MAX_NUM_COMPONENTS * self.length() - len(to_write))) + to_write = [structure_id] + to_write + [colocation_ids.to_id(cid)] - # make them equal size - to_write.extend([""] * (MAX_NUM_COMPONENTS * self.length() - len(to_write))) - to_write = [s.id] + to_write + [colocation_ids.to_id(cid)] + if not self.without_rep: + to_write.append("") # not yet implemented... - if not self.without_rep: - to_write.append("") # not yet implemented... + if self.group: + if colocation_ids.is_written(cid): + continue + else: + to_write.append(colocation_ids.num(cid)) + colocation_ids.set_written(cid) - if self.group: - if colocation_ids.is_written(cid): - continue - else: - to_write.append(colocation_ids.num(cid)) - colocation_ids.set_written(cid) - - file_handler.write(", ".join(to_write) + "\n") + file_handler.write(", ".join(to_write) + "\n") file_handler.flush() def write_out(self, matches, structures, colocation_ids): - fp = sys.stdout if self.output_file is None else open(self.output_file, "w") - self.write_out_worker(fp, matches, structures, colocation_ids) - if self.output_file is not None: - fp.close() + def fp_close(fp_): + if fp_ != sys.stdout: + fp_.close() + + def fp_open(snum=None): + if self.output_file is None: + return sys.stdout + elif snum is None: + return open(self.output_file, "w") + else: + return open("{}.{}".format(self.output_file, snum), "w") + + if not self.multiple_output: + fp = fp_open() + self.write_header(fp) + + for s in structures: + if self.multiple_output: + fp=fp_open(s.id) + self.write_header(fp) + + sid_matches = matches[s.id] + self.write_out_worker(fp, sid_matches, s.id, s.components, colocation_ids) + + if self.multiple_output: + fp_close(fp) + + if not self.multiple_output: + fp_close(fp) class ColocationIds: @@ -939,6 +963,7 @@ if __name__ == '__main__': parser.add_argument('--without-rep', help='Will not write representtaions in output', action='store_true') parser.add_argument('--group', help='Group collocations with same collocation ID', action='store_true') parser.add_argument('--verbose', help='Enable verbose output to stderr', choices=["warning", "info", "debug"], default="info") + parser.add_argument('--multiple-output', help='Generate one output for each syntactic structure', action='store_true') args = parser.parse_args() logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())