Multiple input files support
This commit is contained in:
parent
b4e73e2d60
commit
518fe5e113
22
wani.py
22
wani.py
|
@ -738,7 +738,20 @@ def is_root_id(id_):
|
|||
return len(id_.split('.')) == 3
|
||||
|
||||
|
||||
def load_corpus(filename, skip_id_check):
|
||||
def load_corpus(args):
    """Load all input files named in ``args`` into one combined list.

    Reads ``args.input`` (an iterable of file names) and
    ``args.skip_id_check``, then calls ``load_tei_file`` once per file in
    order, passing the same list each time so that the loader can append
    to it.

    Returns the shared list after every file has been processed.
    """
    paths, skip_check = args.input, args.skip_id_check
    corpus = []

    # Each call receives the accumulated list from the previous files.
    for path in paths:
        load_tei_file(path, skip_check, corpus)

    return corpus
|
||||
|
||||
|
||||
def load_tei_file(filename, skip_id_check, previous_words):
|
||||
logging.info("LOADING FILE: {}".format(filename))
|
||||
|
||||
with open(filename, 'r') as fp:
|
||||
xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1)
|
||||
xmlstring = xmlstring.replace(' xml:', ' ')
|
||||
|
@ -778,7 +791,7 @@ def load_corpus(filename, skip_id_check):
|
|||
# strange errors, just skip...
|
||||
pass
|
||||
|
||||
return list(words.values())
|
||||
previous_words.extend(words.values())
|
||||
|
||||
class Writer:
|
||||
def __init__(self, args):
|
||||
|
@ -919,8 +932,7 @@ def main(input_file, structures_file, args):
|
|||
for s in structures:
|
||||
logging.debug(str(s))
|
||||
|
||||
logging.info("LOADING TEXT...")
|
||||
words = load_corpus(input_file, args.skip_id_check)
|
||||
words = load_corpus(args)
|
||||
|
||||
# useful for faster debugging...
|
||||
# import pickle
|
||||
|
@ -954,8 +966,8 @@ def main(input_file, structures_file, args):
|
|||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description='Extract structures from a parsed corpus.')
|
||||
parser.add_argument('input', help='input xml file in `ssj500k form`')
|
||||
parser.add_argument('structures', help='Structures definitions in xml file')
|
||||
parser.add_argument('input', help='input xml file in `ssj500k form`, can list more than one', nargs='+')
|
||||
parser.add_argument('--output', help='Output file (if none given, then output to stdout)')
|
||||
|
||||
parser.add_argument('--skip-id-check', help='Skips checks for ids of <w> and <pc>, if they are in correct format', action='store_true')
|
||||
|
|
Loading…
Reference in New Issue
Block a user