Multiple input files support

This commit is contained in:
Ozbolt Menegatti 2019-02-09 13:25:26 +01:00
parent b4e73e2d60
commit 518fe5e113

22
wani.py
View File

@ -738,7 +738,20 @@ def is_root_id(id_):
return len(id_.split('.')) == 3
def load_corpus(filename, skip_id_check):
def load_corpus(args):
    """Load every input file named in ``args`` into one combined list.

    Reads ``args.input`` (an iterable of file names) and
    ``args.skip_id_check``, then passes each file name, the flag and a
    shared list to ``load_tei_file``.

    Returns the shared list once all files have been processed.
    """
    paths, skip_check = args.input, args.skip_id_check
    words = []
    for path in paths:
        # The accumulator is handed over as the third argument; the return
        # value of load_tei_file is not used here.
        load_tei_file(path, skip_check, words)
    return words
def load_tei_file(filename, skip_id_check, previous_words):
logging.info("LOADING FILE: {}".format(filename))
with open(filename, 'r') as fp:
xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1)
xmlstring = xmlstring.replace(' xml:', ' ')
@ -778,7 +791,7 @@ def load_corpus(filename, skip_id_check):
# strange errors, just skip...
pass
return list(words.values())
previous_words.extend(words.values())
class Writer:
def __init__(self, args):
@ -919,8 +932,7 @@ def main(input_file, structures_file, args):
for s in structures:
logging.debug(str(s))
logging.info("LOADING TEXT...")
words = load_corpus(input_file, args.skip_id_check)
words = load_corpus(args)
# useful for faster debugging...
# import pickle
@ -954,8 +966,8 @@ def main(input_file, structures_file, args):
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Extract structures from a parsed corpus.')
parser.add_argument('input', help='input xml file in `ssj500k form`')
parser.add_argument('structures', help='Structures definitions in xml file')
parser.add_argument('input', help='input xml file in `ssj500k form`, can list more than one', nargs='+')
parser.add_argument('--output', help='Output file (if none given, then output to stdout)')
parser.add_argument('--skip-id-check', help='Skips checks for ids of <w> and <pc>, if they are in correct format', action='store_true')