The root id check is now skipped by default.

This commit is contained in:
Ozbolt Menegatti 2019-02-06 15:33:33 +01:00
parent 27a60c439b
commit 5f7b5f969c

View File

@ -738,7 +738,7 @@ def is_root_id(id_):
return len(id_.split('.')) == 3 return len(id_.split('.')) == 3
def load_corpus(filename): def load_corpus(filename, check_ids):
with open(filename, 'r') as fp: with open(filename, 'r') as fp:
xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1) xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1)
xmlstring = xmlstring.replace(' xml:', ' ') xmlstring = xmlstring.replace(' xml:', ' ')
@ -763,7 +763,7 @@ def load_corpus(filename):
lfrom, dest = l.get('target').replace('#', '').split() lfrom, dest = l.get('target').replace('#', '').split()
if lfrom in words: if lfrom in words:
if is_root_id(lfrom): if check_ids and is_root_id(lfrom):
logging.error("NOO: ", lfrom) logging.error("NOO: ", lfrom)
sys.exit(1) sys.exit(1)
@ -896,7 +896,7 @@ def main(input_file, structures_file, args):
logging.debug(str(s)) logging.debug(str(s))
logging.info("LOADING TEXT...") logging.info("LOADING TEXT...")
words = load_corpus(input_file) words = load_corpus(input_file, args.check_ids)
# useful for faster debugging... # useful for faster debugging...
# import pickle # import pickle
@ -934,6 +934,7 @@ if __name__ == '__main__':
parser.add_argument('structures', help='Structures definitions in xml file') parser.add_argument('structures', help='Structures definitions in xml file')
parser.add_argument('--output', help='Output file (if none given, then output to stdout)') parser.add_argument('--output', help='Output file (if none given, then output to stdout)')
parser.add_argument('--check-ids', help='Checks ids of <w> and <pc> to be in correct structure', action='store_true')
parser.add_argument('--lemma_only', help='Will not write word ids, forms and msds in output', action='store_true') parser.add_argument('--lemma_only', help='Will not write word ids, forms and msds in output', action='store_true')
parser.add_argument('--without-rep', help='Will not write representtaions in output', action='store_true') parser.add_argument('--without-rep', help='Will not write representtaions in output', action='store_true')
parser.add_argument('--group', help='Group collocations with same collocation ID', action='store_true') parser.add_argument('--group', help='Group collocations with same collocation ID', action='store_true')