diff --git a/wani.py b/wani.py index 7060d20..68952a3 100644 --- a/wani.py +++ b/wani.py @@ -738,7 +738,7 @@ def is_root_id(id_): return len(id_.split('.')) == 3 -def load_corpus(filename, check_ids): +def load_corpus(filename, skip_id_check): with open(filename, 'r') as fp: xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1) xmlstring = xmlstring.replace(' xml:', ' ') @@ -763,7 +763,7 @@ def load_corpus(filename, check_ids): lfrom, dest = l.get('target').replace('#', '').split() if lfrom in words: - if check_ids and is_root_id(lfrom): + if not skip_id_check and is_root_id(lfrom): logging.error("NOO: ", lfrom) sys.exit(1) @@ -817,7 +817,7 @@ class Writer: if not self.lemma_only: cols = [word.id, word.text] + cols + [word.msd] if not self.without_rep: - cols += "" #not yet implemented... + cols += [""] #not yet implemented... return cols @@ -836,7 +836,7 @@ class Writer: to_write.extend(self.from_word(word)) # make them equal size - to_write.extend([""] * (MAX_NUM_COMPONENTS * 5 - len(to_write))) + to_write.extend([""] * (MAX_NUM_COMPONENTS * self.length() - len(to_write))) to_write = [s.id] + to_write + [colocation_ids.to_id(cid)] if not self.without_rep: @@ -896,7 +896,7 @@ def main(input_file, structures_file, args): logging.debug(str(s)) logging.info("LOADING TEXT...") - words = load_corpus(input_file, args.check_ids) + words = load_corpus(input_file, args.skip_id_check) # useful for faster debugging... # import pickle @@ -934,8 +934,8 @@ if __name__ == '__main__': parser.add_argument('structures', help='Structures definitions in xml file') parser.add_argument('--output', help='Output file (if none given, then output to stdout)') - parser.add_argument('--check-ids', help='Checks ids of and to be in correct structure', action='store_true') - parser.add_argument('--lemma_only', help='Will not write word ids, forms and msds in output', action='store_true') + parser.add_argument('--skip-id-check', help='Skips checks for ids of and , if they are in correct format', action='store_true') + parser.add_argument('--lemma-only', help='Will not write word ids, forms and msds in output', action='store_true') parser.add_argument('--without-rep', help='Will not write representtaions in output', action='store_true') parser.add_argument('--group', help='Group collocations with same collocation ID', action='store_true') parser.add_argument('--verbose', help='Enable verbose output to stderr', choices=["warning", "info", "debug"], default="info")