From eb86a6bb1c3e8b44ab747eb8986b77fc7f6ba374 Mon Sep 17 00:00:00 2001 From: Luka Date: Mon, 20 Jul 2020 10:51:09 +0200 Subject: [PATCH] Added collocation_sentence_map_dest --- src/collocation_sentence_mapper.py | 11 ++++++++++ src/wani.py | 2 ++ src/writer.py | 32 ++++++++++++++++++++++-------- 3 files changed, 37 insertions(+), 8 deletions(-) create mode 100644 src/collocation_sentence_mapper.py diff --git a/src/collocation_sentence_mapper.py b/src/collocation_sentence_mapper.py new file mode 100644 index 0000000..0da59e6 --- /dev/null +++ b/src/collocation_sentence_mapper.py @@ -0,0 +1,11 @@ + +class CollocationSentenceMapper: + def __init__(self, output_dir): + self.output = open(output_dir, "w") + self.output.write(f'Collocation_id\tSentence_id\n') + + def close(self): + self.output.close() + + def add_map(self, collocation_id, sentence_id): + self.output.write(f'{collocation_id}\t{sentence_id}\n') diff --git a/src/wani.py b/src/wani.py index cde8dc5..de35eab 100644 --- a/src/wani.py +++ b/src/wani.py @@ -151,6 +151,8 @@ if __name__ == '__main__': parser.add_argument('--db', help="Database file to use (instead of memory)", default=None) + parser.add_argument('--collocation_sentence_map_dest', + help="Destination to folder where collocation-sentence mapper (mappers in case of multiple-output).", default=None) parser.add_argument('--new-db', help="Writes over database file, if there exists one", action='store_true') diff --git a/src/writer.py b/src/writer.py index bda8c23..4f665aa 100644 --- a/src/writer.py +++ b/src/writer.py @@ -1,8 +1,13 @@ -import logging +import logging +import os + from progress_bar import progress from formatter import OutFormatter, OutNoStatFormatter, AllFormatter, StatsFormatter +from collocation_sentence_mapper import CollocationSentenceMapper + + class Writer: @staticmethod def other_params(args): @@ -11,23 +16,25 @@ class Writer: @staticmethod def make_output_writer(args, num_components, colocation_ids, word_renderer): params = Writer.other_params(args) - return Writer(args.out, num_components, OutFormatter(colocation_ids, word_renderer), params) + return Writer(args.out, num_components, OutFormatter(colocation_ids, word_renderer), args.collocation_sentence_map_dest, params) @staticmethod def make_output_no_stat_writer(args, num_components, colocation_ids, word_renderer): params = Writer.other_params(args) - return Writer(args.out_no_stat, num_components, OutNoStatFormatter(colocation_ids, word_renderer), params) + return Writer(args.out_no_stat, num_components, OutNoStatFormatter(colocation_ids, word_renderer), args.collocation_sentence_map_dest, params) @staticmethod def make_all_writer(args, num_components, colocation_ids, word_renderer): - return Writer(args.all, num_components, AllFormatter(colocation_ids, word_renderer), None) + return Writer(args.all, num_components, AllFormatter(colocation_ids, word_renderer), args.collocation_sentence_map_dest, None) @staticmethod def make_stats_writer(args, num_components, colocation_ids, word_renderer): params = Writer.other_params(args) - return Writer(args.stats, num_components, StatsFormatter(colocation_ids, word_renderer), params) + return Writer(args.stats, num_components, StatsFormatter(colocation_ids, word_renderer), args.collocation_sentence_map_dest, params) - def __init__(self, file_out, num_components, formatter, params): + def __init__(self, file_out, num_components, formatter, collocation_sentence_map_dest, params): + # TODO FIX THIS + self.collocation_sentence_map_dest = collocation_sentence_map_dest if params is None: self.multiple_output = False self.sort_by = -1 @@ -73,7 +80,7 @@ class Writer: def write_header(self, file_handler): file_handler.write(",".join(self.header()) + "\n") - def write_out_worker(self, file_handler, structure, colocation_ids): + def write_out_worker(self, file_handler, structure, colocation_ids, col_sent_map): rows = [] components = structure.components for match in progress(colocation_ids.get_matches_for(structure), "Writing matches: {}".format(structure.id)): @@ -84,6 +91,11 @@ class Writer: variable_word_order = self.find_variable_word_order(match.matches) + if col_sent_map is not None: + # TODO find better way to get sentence_id + for words in match.matches: + col_sent_map.add_map(match.match_id, '.'.join(words['1'].id.split('.')[:-1])) + for words in match.matches: to_write = [] @@ -128,14 +140,18 @@ class Writer: if not self.multiple_output: fp = fp_open() self.write_header(fp) + col_sent_map = CollocationSentenceMapper(os.path.join(self.collocation_sentence_map_dest, 'mapper.txt')) \ + if self.collocation_sentence_map_dest is not None else None for s in progress(structures, "writing:{}".format(self.formatter)): if self.multiple_output: fp = fp_open(s.id) self.write_header(fp) + col_sent_map = CollocationSentenceMapper(os.path.join(self.collocation_sentence_map_dest, f'{s.id}_mapper.txt')) \ + if self.collocation_sentence_map_dest is not None else None self.formatter.set_structure(s) - self.write_out_worker(fp, s, colocation_ids) + self.write_out_worker(fp, s, colocation_ids, col_sent_map) if self.multiple_output: fp_close(fp)