diff --git a/wani.py b/wani.py index 5d4de6c..bb87325 100644 --- a/wani.py +++ b/wani.py @@ -738,7 +738,20 @@ def is_root_id(id_): return len(id_.split('.')) == 3 -def load_corpus(filename, skip_id_check): +def load_corpus(args): + filenames = args.input + skip_id_check = args.skip_id_check + result = [] + + for fname in filenames: + load_tei_file(fname, skip_id_check, result) + + return result + + +def load_tei_file(filename, skip_id_check, previous_words): + logging.info("LOADING FILE: {}".format(filename)) + with open(filename, 'r') as fp: xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1) xmlstring = xmlstring.replace(' xml:', ' ') @@ -778,7 +791,7 @@ def load_corpus(filename, skip_id_check): # strange errors, just skip... pass - return list(words.values()) + previous_words.extend(words.values()) class Writer: def __init__(self, args): @@ -919,8 +932,7 @@ def main(input_file, structures_file, args): for s in structures: logging.debug(str(s)) - logging.info("LOADING TEXT...") - words = load_corpus(input_file, args.skip_id_check) + words = load_corpus(args) # useful for faster debugging... # import pickle @@ -954,8 +966,8 @@ def main(input_file, structures_file, args): if __name__ == '__main__': parser = argparse.ArgumentParser(description='Extract structures from a parsed corpus.') - parser.add_argument('input', help='input xml file in `ssj500k form`') parser.add_argument('structures', help='Structures definitions in xml file') + parser.add_argument('input', help='input xml file in `ssj500k form`, can list more than one', nargs='+') parser.add_argument('--output', help='Output file (if none given, then output to stdout)') parser.add_argument('--skip-id-check', help='Skips checks for ids of and , if they are in correct format', action='store_true')