diff --git a/wani.py b/wani.py index 41f72d1..7060d20 100644 --- a/wani.py +++ b/wani.py @@ -738,7 +738,7 @@ def is_root_id(id_): return len(id_.split('.')) == 3 -def load_corpus(filename): +def load_corpus(filename, check_ids): with open(filename, 'r') as fp: xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1) xmlstring = xmlstring.replace(' xml:', ' ') @@ -763,7 +763,7 @@ def load_corpus(filename): lfrom, dest = l.get('target').replace('#', '').split() if lfrom in words: - if is_root_id(lfrom): - logging.error("NOO: ", lfrom) + if check_ids and is_root_id(lfrom): + logging.error("NOO: %s", lfrom) sys.exit(1) @@ -896,7 +896,7 @@ def main(input_file, structures_file, args): logging.debug(str(s)) logging.info("LOADING TEXT...") - words = load_corpus(input_file) + words = load_corpus(input_file, args.check_ids) # useful for faster debugging... # import pickle @@ -934,6 +934,7 @@ if __name__ == '__main__': parser.add_argument('structures', help='Structures definitions in xml file') parser.add_argument('--output', help='Output file (if none given, then output to stdout)') + parser.add_argument('--check-ids', help='Abort when a link source id found among the loaded words is a root id (three dot-separated parts)', action='store_true') parser.add_argument('--lemma_only', help='Will not write word ids, forms and msds in output', action='store_true') parser.add_argument('--without-rep', help='Will not write representtaions in output', action='store_true') parser.add_argument('--group', help='Group collocations with same collocation ID', action='store_true')