def sorted_rows(self, rows):
    """Return ``rows`` ordered by the column selected with ``self.sort_by``.

    Each row is a list of cells; the callers in this file later join the
    cells with ``", "``, so cells are expected to be strings.

    * A negative ``self.sort_by`` disables sorting, and fewer than two rows
      need no sorting; in both cases ``rows`` is returned unchanged.
    * If any row is too short to contain the requested column, a message
      is printed to stderr and ``rows`` is returned unchanged.
    * If every cell in the column parses as an integer, rows are ordered
      numerically; otherwise they are ordered case-insensitively as text.

    The sort is stable, so rows with equal keys keep their relative order.
    """
    column = self.sort_by

    # Sorting is switched off, or there is nothing to reorder.
    if column < 0 or len(rows) < 2:
        return rows

    # Check every row (not only the first) so that a short row further
    # down cannot raise IndexError while the keys are being computed.
    if any(len(row) <= column for row in rows):
        # Report the requested column index, not the width of the row.
        print("Cannot sort by column #{}: Not enough columns!".format(column), file=sys.stderr)
        return rows

    try:
        # Parse the whole column up front: probing only the first row would
        # let a later non-numeric cell raise ValueError in the middle of
        # the sort.
        keyed = [(int(row[column]), row) for row in rows]
    except ValueError:
        # At least one cell is not an integer: compare as lower-cased text.
        return sorted(rows, key=lambda row: row[column].lower())

    keyed.sort(key=lambda pair: pair[0])
    return [row for _, row in keyed]
parser.add_argument('--multiple-output', help='Generate one output for each syntactic structure', action='store_true')
# Index of the output column to sort rows by; the default of -1 leaves the
# rows in the order in which they were produced.
parser.add_argument('--sort-by', help="Sort by this column (index)", type=int, default=-1)
parser.add_argument('--pc-tag', help='Tag for separators, usually pc or c', default="pc")
parser.add_argument('--parallel', help='Run in multiple processes, should speed things up')