load_files now returns a generator of sentences, not a generator over whole files.
This makes it much slower, but more adaptable for huge files.
parent a8183cf507
commit 0d8aeb2282
src/wani.py (83 changed lines)
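For context on the pattern: the commit replaces a `yield` of one fully parsed file with `yield from` over a per-sentence generator, so callers receive one word list per sentence and peak memory no longer scales with file size. A minimal sketch of that delegation, where parse_sentences is a hypothetical stand-in for the real TEI parsing in the diff below:

    # Sketch only: parse_sentences is a hypothetical stand-in for the
    # TEI parsing; a "sentence" here is just a list of tokens.
    def parse_sentences(filename):
        with open(filename, 'r') as fp:
            for line in fp:
                yield line.split()

    def load_files(filenames):
        for fname in filenames:
            # 'yield from' flattens the inner generator: the caller sees
            # one sentence at a time instead of one whole file at a time
            yield from parse_sentences(fname)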
@@ -35,57 +35,60 @@ def load_files(args):
             status = " :: {} / {}".format(n, len(filenames))
         else:
             status = ""
-        yield load_tei_file(fname, skip_id_check, do_msd_translate, args.pc_tag, status)
+        yield from file_sentence_generator(fname, skip_id_check, do_msd_translate, args.pc_tag, status)
 
 
-def load_tei_file(filename, skip_id_check, do_msd_translate, pc_tag, status):
-    logging.info("LOADING FILE: {}{}".format(filename, status))
+def load_xml(filename, status):
+    logging.info("LOADING XML: {}{}".format(filename, status))
 
     with open(filename, 'r') as fp:
-        xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1)
-    xmlstring = xmlstring.replace(' xml:', ' ')
-    et = ElementTree.XML(xmlstring)
+        content = fp.read()
 
-    words = {}
-    for w in et.iter("w"):
-        words[w.get('id')] = Word(w, do_msd_translate)
-    for pc in et.iter(pc_tag):
-        words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)
+    xmlstring = re.sub(' xmlns="[^"]+"', '', content, count=1)
+    xmlstring = xmlstring.replace(' xml:', ' ')
+    return ElementTree.XML(xmlstring)
 
-    for l in et.iter("link"):
-        if 'dep' in l.keys():
-            ana = l.get('afun')
-            lfrom = l.get('from')
-            dest = l.get('dep')
-        else:
-            ana = l.get('ana')
-            if ana[:4] != 'syn:': # dont bother...
-                continue
-            ana = ana[4:]
-            lfrom, dest = l.get('target').replace('#', '').split()
 
-        if lfrom in words:
-            if not skip_id_check and is_root_id(lfrom):
-                logging.error("NOO: {}".format(lfrom))
-                sys.exit(1)
+def file_sentence_generator(filename, skip_id_check, do_msd_translate, pc_tag, status):
+    et = load_xml(filename, status)
+    for sentence in et.iter('s'):
+        words = {}
+        for w in sentence.iter("w"):
+            words[w.get('id')] = Word(w, do_msd_translate)
+        for pc in sentence.iter(pc_tag):
+            words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)
 
-            if dest in words:
-                next_word = words[dest]
-                words[lfrom].add_link(ana, next_word)
-            else:
-                logging.error("Unknown id: {}".format(dest))
-                sys.exit(1)
+        for l in sentence.iter("link"):
+            if 'dep' in l.keys():
+                ana = l.get('afun')
+                lfrom = l.get('from')
+                dest = l.get('dep')
+            else:
+                ana = l.get('ana')
+                if ana[:4] != 'syn:': # dont bother...
+                    continue
+                ana = ana[4:]
+                lfrom, dest = l.get('target').replace('#', '').split()
 
-        else:
-            # strange errors, just skip...
-            pass
+            if lfrom in words:
+                if not skip_id_check and is_root_id(lfrom):
+                    logging.error("NOO: {}".format(lfrom))
+                    sys.exit(1)
 
-    return list(words.values())
+                if dest in words:
+                    next_word = words[dest]
+                    words[lfrom].add_link(ana, next_word)
+                else:
+                    logging.error("Unknown id: {}".format(dest))
+                    sys.exit(1)
+
+            else:
+                # strange errors, just skip...
+                pass
+
+        yield list(words.values())
 
 
 def match_file(words, structures):
     matches = {s: [] for s in structures}
 
-    for s in tqdm(structures):
+    for s in structures:
         for w in words:
             mhere = s.match(w)
             for match in mhere:
@@ -136,7 +139,7 @@ def main(structures_file, args):
         word_stats.add_words(words)
 
     else:
-        for words in load_files(args):
+        for words in tqdm(load_files(args)):
             matches = match_file(words, structures)
             # just save to temporary file, used for children of a parallel process
             # MUST NOT have more than one file
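A side note on the second hunk: tqdm now wraps the load_files generator rather than the structures loop, so the progress bar advances once per yielded sentence. Because a generator has no len(), tqdm falls back to a plain counter unless a total is supplied. A small sketch of that behavior; the total= argument is an illustrative option, not part of this commit:

    from tqdm import tqdm

    def sentences():
        # stand-in generator; load_files looks the same to tqdm
        yield from range(1000)

    # generators have no len(), so tqdm shows a bare counter
    for words in tqdm(sentences()):
        pass

    # supplying an estimate restores the percentage display
    for words in tqdm(sentences(), total=1000):
        pass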