diff --git a/wani.py b/wani.py index 0cee15b..9ffced7 100644 --- a/wani.py +++ b/wani.py @@ -388,6 +388,7 @@ class Component: for feature in representation: f = ComponentRepresentation.new(dict(feature.attrib)) + print(f) if type(f) is None: logging.warning("Unknown representation in component {}, skipping...".format(self.idx), file=sys.stderr) @@ -812,49 +813,48 @@ def load_tei_file(filename, skip_id_check, do_msd_translate, pc_tag, status): return list(words.values()) class Writer: - def __init__(self, args): - self.group = args.group - self.lemma_only = args.lemma_only - self.without_rep = args.without_rep - self.output_file = args.output - self.multiple_output = args.multiple_output + @staticmethod + def make_output_writer(args): + return Writer(False, args.output, args.multiple_output, int(args.sort_by), args.sort_reversed) + + @staticmethod + def make_all_writer(args): + return Writer(True, args.all, False, -1, False) - self.sort_by = int(args.sort_by) - self.sort_order = args.sort_reversed + def __init__(self, all, filename, multiple_output, sort_by, sort_reversed): + self.all = all + self.output_file = filename + self.multiple_output = multiple_output + + self.sort_by = sort_by + self.sort_order = sort_reversed def header(self): cols = ["Lemma"] - if not self.lemma_only: + if self.all: cols = ["Token_ID", "Word_form"] + cols + ["Msd"] - - if not self.without_rep: + else: cols.append("Representative_form") assert(len(cols) == self.length()) cols = ["C{}_{}".format(i + 1, thd) for i in range(MAX_NUM_COMPONENTS) for thd in cols] - cols = ["Structure_ID"] + cols + ["Collocation_ID"] + cols = ["Structure_ID"] + cols + ["Colocation_ID"] - if not self.without_rep: - cols.append("Joint_representative_form") - if self.group: - cols.append("Frequency") + if not self.all: + cols += ["Joint_representative_form", "Frequency"] return cols def length(self): - return 1 + 3 * int(not self.lemma_only) + int(not self.without_rep) + return 4 if self.all else 2 def from_word(self, word): if word is None: return "" * self.length() + elif self.all: + return [word.id, word.text, word.lemma, word.msd] else: - cols = [word.lemma] - if not self.lemma_only: - cols = [word.id, word.text] + cols + [word.msd] - if not self.without_rep: - cols += [""] #not yet implemented... - - return cols + return [word.lemma, "REP?"] def sorted_rows(self, rows): if self.sort_by < 0 or len(rows) < 2: @@ -889,10 +889,7 @@ class Writer: to_write.extend([""] * (MAX_NUM_COMPONENTS * self.length() - len(to_write))) to_write = [structure_id] + to_write + [colocation_ids.to_id(cid)] - if not self.without_rep: - to_write.append("") # not yet implemented... - - if self.group: + if not self.all: if colocation_ids.should_write(cid): to_write.append(colocation_ids.num(cid)) colocation_ids.set_written(cid) @@ -941,7 +938,7 @@ class Writer: class ColocationIds: def __init__(self): self.data = {} - self.min_frequency = args.group + self.min_frequency = args.min_freq def add_match(self, key): if key in self.data: @@ -992,7 +989,6 @@ def match_file(words, structures): def main(input_file, structures_file, args): - writer = Writer(args) structures = build_structures(structures_file) for s in structures: logging.debug(str(s)) @@ -1039,7 +1035,9 @@ def main(input_file, structures_file, args): else: matches = colocation_ids.merge_matches(matches, new_matches) - writer.write_out(matches, structures, colocation_ids) + if args.all: + Writer.make_all_writer(args).write_out(matches, structures, colocation_ids) + Writer.make_output_writer(args).write_out(matches, structures, colocation_ids) logging.debug([(k, len(v)) for k, v in matches.items()]) logging.debug(sum(len(v) for _, v in matches.items())) @@ -1049,12 +1047,11 @@ if __name__ == '__main__': parser.add_argument('structures', help='Structures definitions in xml file') parser.add_argument('input', help='input xml file in `ssj500k form`, can list more than one', nargs='+') parser.add_argument('--output', help='Output file (if none given, then output to stdout)') + parser.add_argument('--all', help='Additional output file, writes more data') parser.add_argument('--no-msd-translate', help='MSDs are translated from slovene to english by default', action='store_true') parser.add_argument('--skip-id-check', help='Skips checks for ids of and , if they are in correct format', action='store_true') - parser.add_argument('--lemma-only', help='Will not write word ids, forms and msds in output', action='store_true') - parser.add_argument('--without-rep', help='Will not write representtaions in output', action='store_true') - parser.add_argument('--group', help='Group collocations with same collocation ID', type=int, default=0, const=1, nargs='?') + parser.add_argument('--min_freq', help='Minimal frequency in output', type=int, default=0, const=1, nargs='?') parser.add_argument('--verbose', help='Enable verbose output to stderr', choices=["warning", "info", "debug"], default="info", const="info", nargs='?') parser.add_argument('--count-files', help="Count files: more verbose output", action='store_true') parser.add_argument('--multiple-output', help='Generate one output for each syntactic structure', action='store_true') @@ -1072,3 +1069,4 @@ if __name__ == '__main__': start = time.time() main(args.input, args.structures, args) logging.info("TIME: {}".format(time.time() - start)) +