Multiple input files support
This commit is contained in:
parent
b4e73e2d60
commit
518fe5e113
22
wani.py
22
wani.py
|
@ -738,7 +738,20 @@ def is_root_id(id_):
|
||||||
return len(id_.split('.')) == 3
|
return len(id_.split('.')) == 3
|
||||||
|
|
||||||
|
|
||||||
def load_corpus(filename, skip_id_check):
|
def load_corpus(args):
|
||||||
|
filenames = args.input
|
||||||
|
skip_id_check = args.skip_id_check
|
||||||
|
result = []
|
||||||
|
|
||||||
|
for fname in filenames:
|
||||||
|
load_tei_file(fname, skip_id_check, result)
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def load_tei_file(filename, skip_id_check, previous_words):
|
||||||
|
logging.info("LOADING FILE: {}".format(filename))
|
||||||
|
|
||||||
with open(filename, 'r') as fp:
|
with open(filename, 'r') as fp:
|
||||||
xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1)
|
xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1)
|
||||||
xmlstring = xmlstring.replace(' xml:', ' ')
|
xmlstring = xmlstring.replace(' xml:', ' ')
|
||||||
|
@ -778,7 +791,7 @@ def load_corpus(filename, skip_id_check):
|
||||||
# strange errors, just skip...
|
# strange errors, just skip...
|
||||||
pass
|
pass
|
||||||
|
|
||||||
return list(words.values())
|
previous_words.extend(words.values())
|
||||||
|
|
||||||
class Writer:
|
class Writer:
|
||||||
def __init__(self, args):
|
def __init__(self, args):
|
||||||
|
@ -919,8 +932,7 @@ def main(input_file, structures_file, args):
|
||||||
for s in structures:
|
for s in structures:
|
||||||
logging.debug(str(s))
|
logging.debug(str(s))
|
||||||
|
|
||||||
logging.info("LOADING TEXT...")
|
words = load_corpus(args)
|
||||||
words = load_corpus(input_file, args.skip_id_check)
|
|
||||||
|
|
||||||
# useful for faster debugging...
|
# useful for faster debugging...
|
||||||
# import pickle
|
# import pickle
|
||||||
|
@ -954,8 +966,8 @@ def main(input_file, structures_file, args):
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
parser = argparse.ArgumentParser(description='Extract structures from a parsed corpus.')
|
parser = argparse.ArgumentParser(description='Extract structures from a parsed corpus.')
|
||||||
parser.add_argument('input', help='input xml file in `ssj500k form`')
|
|
||||||
parser.add_argument('structures', help='Structures definitions in xml file')
|
parser.add_argument('structures', help='Structures definitions in xml file')
|
||||||
|
parser.add_argument('input', help='input xml file in `ssj500k form`, can list more than one', nargs='+')
|
||||||
parser.add_argument('--output', help='Output file (if none given, then output to stdout)')
|
parser.add_argument('--output', help='Output file (if none given, then output to stdout)')
|
||||||
|
|
||||||
parser.add_argument('--skip-id-check', help='Skips checks for ids of <w> and <pc>, if they are in correct format', action='store_true')
|
parser.add_argument('--skip-id-check', help='Skips checks for ids of <w> and <pc>, if they are in correct format', action='store_true')
|
||||||
|
|
Loading…
Reference in New Issue
Block a user