diff --git a/wani.py b/wani.py index 432f417..affd965 100644 --- a/wani.py +++ b/wani.py @@ -279,7 +279,8 @@ class WordFormAgreementCR(WordFormMsdCR): def match(self, word_msd): existing = [(w.msd, w.text) for w in self.words] - for candidate_msd, candidate_text in self.word_renderer.available_words(self.lemma, existing): + lemma_available_words = self.word_renderer.available_words(self.lemma, existing) + for candidate_msd, candidate_text in lemma_available_words: if self.msd[0] != candidate_msd[0]: continue @@ -299,7 +300,8 @@ class WordFormAgreementCR(WordFormMsdCR): t1 = msd1[0] # if not in msd, some strange msd was tries, skipping... if agr_case not in TAGSET[t1]: - logging.warning("Cannot do agreement: {} for msd {} not found!".format(agr_case, msd1)) + logging.warning("Cannot do agreement: {} for msd {} not found!" + .format(agr_case, msd1)) return False v1 = TAGSET[t1].index(agr_case) @@ -312,7 +314,8 @@ class WordFormAgreementCR(WordFormMsdCR): # REPEAT (not DRY!) t2 = msd2[0] if agr_case not in TAGSET[t2]: - logging.warning("Cannot do agreement: {} for msd {} not found!".format(agr_case, msd2)) + logging.warning("Cannot do agreement: {} for msd {} not found!" + .format(agr_case, msd2)) return False v2 = TAGSET[t2].index(agr_case) if v2 + 1 >= len(msd2): @@ -707,7 +710,8 @@ class SyntacticStructure: assert(system.get('type') == 'JOS') components, dependencies, definitions = list(system) - deps = [ (dep.get('from'), dep.get('to'), dep.get('label'), dep.get('order')) for dep in dependencies ] + deps = [(dep.get('from'), dep.get('to'), dep.get('label'), dep.get('order')) + for dep in dependencies] comps = { comp.get('cid'): dict(comp.items()) for comp in components } restrs, forms = {}, {} @@ -724,7 +728,8 @@ class SyntacticStructure: elif el.tag.startswith("representation"): st.add_representation(n, el, forms) else: - raise NotImplementedError("Unknown definition: {} in structure {}".format(el.tag, st.id)) + raise NotImplementedError("Unknown definition: {} in structure {}" + .format(el.tag, st.id)) fake_root_component = Component({'cid': '#', 'type': 'other'}) st.components = fake_root_component.find_next(deps, comps, restrs, forms) @@ -1023,7 +1028,8 @@ def load_tei_file(filename, skip_id_check, do_msd_translate, pc_tag, status): class Writer: @staticmethod def make_output_writer(args): - return Writer(False, args.output, args.multiple_output, int(args.sort_by), args.sort_reversed) + return Writer(False, args.output, args.multiple_output, + int(args.sort_by), args.sort_reversed) @staticmethod def make_all_writer(args): @@ -1202,7 +1208,8 @@ class ColocationIds: components_dict = {structure.id: structure for structure in structures} idx = 1 for _1, sm in tqdm(self.data.items()): - ComponentRendition.set_representations(sm, components_dict[sm.structure_id], word_renderer) + ComponentRendition.set_representations( + sm, components_dict[sm.structure_id], word_renderer) idx += 1 def determine_colocation_dispersions(self): @@ -1257,7 +1264,8 @@ def main(input_file, structures_file, args): del cmd[pidx] def func(n): - cmdn = [sys.executable] + cmd + [args.input[n], "--match-to-file", "{}/{}.p".format(tmpdirname, n)] + cmdn = [sys.executable] + cmd + [args.input[n], + "--match-to-file", "{}/{}.p".format(tmpdirname, n)] subprocess.check_call(cmdn) return n @@ -1296,24 +1304,43 @@ def main(input_file, structures_file, args): Writer.make_all_writer(args).write_out(structures, colocation_ids) if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Extract structures from a parsed corpus.') - parser.add_argument('structures', help='Structures definitions in xml file') - parser.add_argument('input', help='input xml file in `ssj500k form`, can list more than one', nargs='+') - parser.add_argument('--output', help='Output file (if none given, then output to stdout)') - parser.add_argument('--all', help='Additional output file, writes more data') + parser = argparse.ArgumentParser( + description='Extract structures from a parsed corpus.') + parser.add_argument('structures', + help='Structures definitions in xml file') + parser.add_argument('input', + help='input xml file in `ssj500k form`, can list more than one', nargs='+') + parser.add_argument('--output', + help='Output file (if none given, then output to stdout)') + parser.add_argument('--all', + help='Additional output file, writes more data') - parser.add_argument('--no-msd-translate', help='MSDs are translated from slovene to english by default', action='store_true') - parser.add_argument('--skip-id-check', help='Skips checks for ids of and , if they are in correct format', action='store_true') - parser.add_argument('--min_freq', help='Minimal frequency in output', type=int, default=0, const=1, nargs='?') - parser.add_argument('--verbose', help='Enable verbose output to stderr', choices=["warning", "info", "debug"], default="info", const="info", nargs='?') - parser.add_argument('--count-files', help="Count files: more verbose output", action='store_true') - parser.add_argument('--multiple-output', help='Generate one output for each syntactic structure', action='store_true') + parser.add_argument('--no-msd-translate', + help='MSDs are translated from slovene to english by default', + action='store_true') + parser.add_argument('--skip-id-check', + help='Skips checks for ids of and , if they are in correct format', + action='store_true') + parser.add_argument('--min_freq', help='Minimal frequency in output', + type=int, default=0, const=1, nargs='?') + parser.add_argument('--verbose', help='Enable verbose output to stderr', + choices=["warning", "info", "debug"], default="info", + const="info", nargs='?') + parser.add_argument('--count-files', + help="Count files: more verbose output", action='store_true') + parser.add_argument('--multiple-output', + help='Generate one output for each syntactic structure', + action='store_true') - parser.add_argument('--sort-by', help="Sort by a this column (index)", type=int, default=-1) - parser.add_argument('--sort-reversed', help="Sort in reversed ored", action='store_true') + parser.add_argument('--sort-by', + help="Sort by a this column (index)", type=int, default=-1) + parser.add_argument('--sort-reversed', + help="Sort in reversed ored", action='store_true') - parser.add_argument('--pc-tag', help='Tag for separators, usually pc or c', default="pc") - parser.add_argument('--parallel', help='Run in multiple processes, should speed things up') + parser.add_argument('--pc-tag', + help='Tag for separators, usually pc or c', default="pc") + parser.add_argument('--parallel', + help='Run in multiple processes, should speed things up') parser.add_argument('--match-to-file', help='Do not use!') args = parser.parse_args() @@ -1322,5 +1349,3 @@ if __name__ == '__main__': start = time.time() main(args.input, args.structures, args) logging.info("TIME: {}".format(time.time() - start)) - -# 2876, 2945 type