load_files now returns a generator of sentences, not a generator of the whole file

This makes it much slower, but better suited to huge files.
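To illustrate the idea (a simplified, self-contained sketch; the sentences helper and the inline XML below are made up for demonstration and are not this project's API), a per-sentence generator yields one word list per <s> element instead of materialising the whole document at once:

    import xml.etree.ElementTree as ElementTree

    # Simplified illustration: yield one list of words per <s> element,
    # so only a single sentence is kept in memory at a time.
    def sentences(xmlstring):
        et = ElementTree.XML(xmlstring)
        for sentence in et.iter('s'):
            yield [w.text for w in sentence.iter('w')]

    demo = "<doc><s><w>an</w><w>example</w></s><s><w>another</w></s></doc>"
    for words in sentences(demo):
        print(words)  # ['an', 'example'], then ['another']

Callers then loop over sentences the same way main() does in the second hunk below, with for words in tqdm(load_files(args)).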
Ozbolt Menegatti 2019-06-15 22:30:43 +02:00
parent a8183cf507
commit 0d8aeb2282


@@ -35,57 +35,60 @@ def load_files(args):
             status = " :: {} / {}".format(n, len(filenames))
         else:
             status = ""
-        yield load_tei_file(fname, skip_id_check, do_msd_translate, args.pc_tag, status)
+        yield from file_sentence_generator(fname, skip_id_check, do_msd_translate, args.pc_tag, status)
 
-def load_tei_file(filename, skip_id_check, do_msd_translate, pc_tag, status):
-    logging.info("LOADING FILE: {}{}".format(filename, status))
+
+def load_xml(filename, status):
+    logging.info("LOADING XML: {}{}".format(filename, status))
     with open(filename, 'r') as fp:
-        xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1)
-    xmlstring = xmlstring.replace(' xml:', ' ')
-    et = ElementTree.XML(xmlstring)
+        content = fp.read()
 
-    words = {}
-    for w in et.iter("w"):
-        words[w.get('id')] = Word(w, do_msd_translate)
-    for pc in et.iter(pc_tag):
-        words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)
+    xmlstring = re.sub(' xmlns="[^"]+"', '', content, count=1)
+    xmlstring = xmlstring.replace(' xml:', ' ')
+    return ElementTree.XML(xmlstring)
 
-    for l in et.iter("link"):
-        if 'dep' in l.keys():
-            ana = l.get('afun')
-            lfrom = l.get('from')
-            dest = l.get('dep')
-        else:
-            ana = l.get('ana')
-            if ana[:4] != 'syn:': # dont bother...
-                continue
-            ana = ana[4:]
-            lfrom, dest = l.get('target').replace('#', '').split()
 
-        if lfrom in words:
-            if not skip_id_check and is_root_id(lfrom):
-                logging.error("NOO: {}".format(lfrom))
-                sys.exit(1)
+def file_sentence_generator(filename, skip_id_check, do_msd_translate, pc_tag, status):
+    et = load_xml(filename, status)
+    for sentence in et.iter('s'):
+        words = {}
+        for w in sentence.iter("w"):
+            words[w.get('id')] = Word(w, do_msd_translate)
+        for pc in sentence.iter(pc_tag):
+            words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)
 
-            if dest in words:
-                next_word = words[dest]
-                words[lfrom].add_link(ana, next_word)
-            else:
-                logging.error("Unknown id: {}".format(dest))
-                sys.exit(1)
+        for l in sentence.iter("link"):
+            if 'dep' in l.keys():
+                ana = l.get('afun')
+                lfrom = l.get('from')
+                dest = l.get('dep')
+            else:
+                ana = l.get('ana')
+                if ana[:4] != 'syn:': # dont bother...
+                    continue
+                ana = ana[4:]
+                lfrom, dest = l.get('target').replace('#', '').split()
 
-        else:
-            # strange errors, just skip...
-            pass
+            if lfrom in words:
+                if not skip_id_check and is_root_id(lfrom):
+                    logging.error("NOO: {}".format(lfrom))
+                    sys.exit(1)
 
-    return list(words.values())
+                if dest in words:
+                    next_word = words[dest]
+                    words[lfrom].add_link(ana, next_word)
+                else:
+                    logging.error("Unknown id: {}".format(dest))
+                    sys.exit(1)
+            else:
+                # strange errors, just skip...
+                pass
+
+        yield list(words.values())
 
 
 def match_file(words, structures):
     matches = {s: [] for s in structures}
-    for s in tqdm(structures):
+    for s in structures:
         for w in words:
             mhere = s.match(w)
             for match in mhere:
@@ -136,7 +139,7 @@ def main(structures_file, args):
            word_stats.add_words(words)
    else:
-        for words in load_files(args):
+        for words in tqdm(load_files(args)):
            matches = match_file(words, structures)
            # just save to temporary file, used for children of a parallel process
            # MUST NOT have more than one file