load_files now returns a generator of sentences, not a generator of the whole file

This makes it much slower, but more adaptable for huge files.
2019-06-15 22:30:43 +02:00 · 2019-06-15 22:30:43 +02:00 · 0d8aeb2282
commit 0d8aeb2282
parent a8183cf507
1 changed files with 43 additions and 40 deletions
--- a/src/wani.py
+++ b/src/wani.py
@ -35,24 +35,27 @@ def load_files(args):
            status = " :: {} / {}".format(n, len(filenames))
        else:
            status = ""
-        yield load_tei_file(fname, skip_id_check, do_msd_translate, args.pc_tag, status)
-
-
-def load_tei_file(filename, skip_id_check, do_msd_translate, pc_tag, status):
-    logging.info("LOADING FILE: {}{}".format(filename, status))
+        yield from file_sentence_generator(fname, skip_id_check, do_msd_translate, args.pc_tag, status)

+def load_xml(filename, status):
+    logging.info("LOADING XML: {}{}".format(filename, status))
    with open(filename, 'r') as fp:
-        xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1)
-        xmlstring = xmlstring.replace(' xml:', ' ')
-        et = ElementTree.XML(xmlstring)
+        content = fp.read()

+    xmlstring = re.sub(' xmlns="[^"]+"', '', content, count=1)
+    xmlstring = xmlstring.replace(' xml:', ' ')
+    return ElementTree.XML(xmlstring)
+
+def file_sentence_generator(filename, skip_id_check, do_msd_translate, pc_tag, status):
+    et = load_xml(filename, status)
+    for sentence in et.iter('s'):
        words = {}
-    for w in et.iter("w"):
+        for w in sentence.iter("w"):
            words[w.get('id')] = Word(w, do_msd_translate)
-    for pc in et.iter(pc_tag):
+        for pc in sentence.iter(pc_tag):
            words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)

-    for l in et.iter("link"):
+        for l in sentence.iter("link"):
            if 'dep' in l.keys():
                ana = l.get('afun')
                lfrom = l.get('from')
@ -80,12 +83,12 @@ def load_tei_file(filename, skip_id_check, do_msd_translate, pc_tag, status):
                # strange errors, just skip...
                pass

-    return list(words.values())
+        yield list(words.values())

 def match_file(words, structures):
    matches = {s: [] for s in structures}

-    for s in tqdm(structures):
+    for s in structures:
        for w in words:
            mhere = s.match(w)
            for match in mhere:
@ -136,7 +139,7 @@ def main(structures_file, args):
                    word_stats.add_words(words)

    else:
-        for words in load_files(args):
+        for words in tqdm(load_files(args)):
            matches = match_file(words, structures)
            # just save to temporary file, used for children of a parallel process
            # MUST NOT have more than one file