From de6c73980ee8e7165ba6159451cdc7b73cc2d895 Mon Sep 17 00:00:00 2001 From: Ozbolt Menegatti Date: Tue, 19 Feb 2019 14:57:48 +0100 Subject: [PATCH] adding min-frequency option --- wani.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/wani.py b/wani.py index 4219b08..6de5a76 100644 --- a/wani.py +++ b/wani.py @@ -856,7 +856,7 @@ class Writer: return rows if len(rows[0]) <= self.sort_by: - print("Cannot sort by column #{}: Not enough columns!".format(len(rows[0])), file=sys.stderr) + logging.warning("Cannot sort by column #{}: Not enough columns!".format(len(rows[0]))) return rows try: @@ -888,17 +888,18 @@ class Writer: to_write.append("") # not yet implemented... if self.group: - if colocation_ids.is_written(cid): - continue - else: + if colocation_ids.should_write(cid): to_write.append(colocation_ids.num(cid)) colocation_ids.set_written(cid) + else: + continue rows.append(to_write) - rows = self.sorted_rows(rows) - file_handler.write("\n".join([", ".join(row) for row in rows]) + "\n") - file_handler.flush() + if len(rows) > 0: + rows = self.sorted_rows(rows) + file_handler.write("\n".join([", ".join(row) for row in rows]) + "\n") + file_handler.flush() def write_out(self, matches, structures, colocation_ids): def fp_close(fp_): @@ -923,8 +924,7 @@ class Writer: self.write_header(fp) sid_matches = matches[s.id] - if len(sid_matches) > 0: - self.write_out_worker(fp, sid_matches, s.id, s.components, colocation_ids) + self.write_out_worker(fp, sid_matches, s.id, s.components, colocation_ids) if self.multiple_output: fp_close(fp) @@ -936,6 +936,7 @@ class Writer: class ColocationIds: def __init__(self): self.data = {} + self.min_frequency = args.group def add_match(self, key): if key in self.data: @@ -946,8 +947,8 @@ class ColocationIds: def get(self, key, n): return self.data[key][n] - def is_written(self, key): - return self.get(key, 2) + def should_write(self, key): + return self.get(key, 1) >= self.min_frequency and not self.get(key, 2) def num(self, key): return str(self.get(key, 1)) @@ -1048,8 +1049,8 @@ if __name__ == '__main__': parser.add_argument('--skip-id-check', help='Skips checks for ids of and , if they are in correct format', action='store_true') parser.add_argument('--lemma-only', help='Will not write word ids, forms and msds in output', action='store_true') parser.add_argument('--without-rep', help='Will not write representtaions in output', action='store_true') - parser.add_argument('--group', help='Group collocations with same collocation ID', action='store_true') - parser.add_argument('--verbose', help='Enable verbose output to stderr', choices=["warning", "info", "debug"], default="info") + parser.add_argument('--group', help='Group collocations with same collocation ID', type=int, default=0, const=1, nargs='?') + parser.add_argument('--verbose', help='Enable verbose output to stderr', choices=["warning", "info", "debug"], default="info", const="info", nargs='?') parser.add_argument('--count-files', help="Count files: more verbose output", action='store_true') parser.add_argument('--multiple-output', help='Generate one output for each syntactic structure', action='store_true')