From 0d8aeb2282b0d85c9357e462731d40099106a752 Mon Sep 17 00:00:00 2001
From: Ozbolt Menegatti
Date: Sat, 15 Jun 2019 22:30:43 +0200
Subject: [PATCH] load_files now returns a generator of sentences, not a
 generator of the whole file

This makes it much slower, but more adaptable for huge files.
---
 src/wani.py | 83 +++++++++++++++++++++++++++--------------------------
 1 file changed, 43 insertions(+), 40 deletions(-)

diff --git a/src/wani.py b/src/wani.py
index 3fb3c37..bf732eb 100644
--- a/src/wani.py
+++ b/src/wani.py
@@ -35,57 +35,60 @@ def load_files(args):
             status = " :: {} / {}".format(n, len(filenames))
         else:
             status = ""
-        yield load_tei_file(fname, skip_id_check, do_msd_translate, args.pc_tag, status)
-
-
-def load_tei_file(filename, skip_id_check, do_msd_translate, pc_tag, status):
-    logging.info("LOADING FILE: {}{}".format(filename, status))
+        yield from file_sentence_generator(fname, skip_id_check, do_msd_translate, args.pc_tag, status)
+
+
+def load_xml(filename, status):
+    logging.info("LOADING XML: {}{}".format(filename, status))
     with open(filename, 'r') as fp:
-        xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1)
-        xmlstring = xmlstring.replace(' xml:', ' ')
-        et = ElementTree.XML(xmlstring)
+        content = fp.read()
 
-    words = {}
-    for w in et.iter("w"):
-        words[w.get('id')] = Word(w, do_msd_translate)
-    for pc in et.iter(pc_tag):
-        words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)
+    xmlstring = re.sub(' xmlns="[^"]+"', '', content, count=1)
+    xmlstring = xmlstring.replace(' xml:', ' ')
+    return ElementTree.XML(xmlstring)
 
-    for l in et.iter("link"):
-        if 'dep' in l.keys():
-            ana = l.get('afun')
-            lfrom = l.get('from')
-            dest = l.get('dep')
-        else:
-            ana = l.get('ana')
-            if ana[:4] != 'syn:':  # dont bother...
-                continue
-            ana = ana[4:]
-            lfrom, dest = l.get('target').replace('#', '').split()
+
+def file_sentence_generator(filename, skip_id_check, do_msd_translate, pc_tag, status):
+    et = load_xml(filename, status)
+    for sentence in et.iter('s'):
+        words = {}
+        for w in sentence.iter("w"):
+            words[w.get('id')] = Word(w, do_msd_translate)
+        for pc in sentence.iter(pc_tag):
+            words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)
 
-        if lfrom in words:
-            if not skip_id_check and is_root_id(lfrom):
-                logging.error("NOO: {}".format(lfrom))
-                sys.exit(1)
-
-            if dest in words:
-                next_word = words[dest]
-                words[lfrom].add_link(ana, next_word)
+        for l in sentence.iter("link"):
+            if 'dep' in l.keys():
+                ana = l.get('afun')
+                lfrom = l.get('from')
+                dest = l.get('dep')
             else:
-                logging.error("Unknown id: {}".format(dest))
-                sys.exit(1)
+                ana = l.get('ana')
+                if ana[:4] != 'syn:':  # dont bother...
+                    continue
+                ana = ana[4:]
+                lfrom, dest = l.get('target').replace('#', '').split()
 
-        else:
-            # strange errors, just skip...
-            pass
+            if lfrom in words:
+                if not skip_id_check and is_root_id(lfrom):
+                    logging.error("NOO: {}".format(lfrom))
+                    sys.exit(1)
 
-    return list(words.values())
+                if dest in words:
+                    next_word = words[dest]
+                    words[lfrom].add_link(ana, next_word)
+                else:
+                    logging.error("Unknown id: {}".format(dest))
+                    sys.exit(1)
+
+            else:
+                # strange errors, just skip...
+                pass
+
+        yield list(words.values())
 
 
 def match_file(words, structures):
     matches = {s: [] for s in structures}
 
-    for s in tqdm(structures):
+    for s in structures:
         for w in words:
             mhere = s.match(w)
             for match in mhere:
@@ -136,7 +139,7 @@ def main(structures_file, args):
                 word_stats.add_words(words)
 
         else:
-            for words in load_files(args):
+            for words in tqdm(load_files(args)):
                 matches = match_file(words, structures)
 
                 # just save to temporary file, used for children of a parallel process
                 # MUST NOT have more than one file
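
Note (not part of the patch): the shape of this change is a generator of generators flattened with `yield from`. A minimal, self-contained sketch of the pattern; all names below are illustrative stand-ins, not this repo's API:

    # Sketch of the control flow introduced above: load_files still looks like
    # a flat stream to its callers, but each file now yields sentence by sentence.
    def file_sentence_generator(filename):
        for i in range(3):                      # stand-in for: for sentence in et.iter('s')
            yield ["%s-w%d" % (filename, i)]    # stand-in for: yield list(words.values())

    def load_files(filenames):
        for fname in filenames:
            # 'yield from' splices each per-file generator into the outer one,
            # so consumers see one continuous stream of sentences
            yield from file_sentence_generator(fname)

    for words in load_files(["a.xml", "b.xml"]):
        print(words)   # one sentence's worth of words at a time

This is also why the tqdm call moves in the second hunk: match_file is now called once per sentence, so a progress bar inside it (over structures) would restart constantly, while one around load_files reports overall progress.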
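One caveat: load_xml still calls fp.read() and parses each document in full, so peak memory per file is unchanged; the win is that downstream consumers hold only one sentence's Word objects at a time. If whole-file parsing itself ever becomes the bottleneck, xml.etree.ElementTree.iterparse can stream <s> subtrees without materializing the whole tree. A hedged sketch, assuming the same element names as the patched code and ignoring its namespace stripping; untested against this repo's inputs:

    from xml.etree import ElementTree

    def stream_sentences(filename):
        # iterparse fires an 'end' event once an element's subtree is complete
        for event, elem in ElementTree.iterparse(filename, events=("end",)):
            tag = elem.tag.rsplit('}', 1)[-1]   # drop any '{namespace}' prefix
            if tag == 's':
                yield elem        # a fully built <s> element, ready for .iter("w")
                elem.clear()      # free the finished subtree so memory stays bounded

Clearing each finished <s> keeps memory roughly proportional to sentence size rather than file size, at the cost of the slower pass the commit message already accepts.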