From 0d8aeb2282b0d85c9357e462731d40099106a752 Mon Sep 17 00:00:00 2001
From: Ozbolt Menegatti
Date: Sat, 15 Jun 2019 22:30:43 +0200
Subject: [PATCH] load_files now returns a generator of sentences, not a
 generator of the whole file

This makes it much slower, but more adaptable for huge files.
---
 src/wani.py | 83 +++++++++++++++++++++++++++--------------------------
 1 file changed, 43 insertions(+), 40 deletions(-)

diff --git a/src/wani.py b/src/wani.py
index 3fb3c37..bf732eb 100644
--- a/src/wani.py
+++ b/src/wani.py
@@ -35,57 +35,60 @@ def load_files(args):
             status = " :: {} / {}".format(n, len(filenames))
         else:
             status = ""
-        yield load_tei_file(fname, skip_id_check, do_msd_translate, args.pc_tag, status)
-
-
-def load_tei_file(filename, skip_id_check, do_msd_translate, pc_tag, status):
-    logging.info("LOADING FILE: {}{}".format(filename, status))
+        yield from file_sentence_generator(fname, skip_id_check, do_msd_translate, args.pc_tag, status)
+
+
+def load_xml(filename, status):
+    logging.info("LOADING XML: {}{}".format(filename, status))
     with open(filename, 'r') as fp:
-        xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1)
-        xmlstring = xmlstring.replace(' xml:', ' ')
-        et = ElementTree.XML(xmlstring)
+        content = fp.read()
 
-    words = {}
-    for w in et.iter("w"):
-        words[w.get('id')] = Word(w, do_msd_translate)
-    for pc in et.iter(pc_tag):
-        words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)
+    xmlstring = re.sub(' xmlns="[^"]+"', '', content, count=1)
+    xmlstring = xmlstring.replace(' xml:', ' ')
+    return ElementTree.XML(xmlstring)
 
-    for l in et.iter("link"):
-        if 'dep' in l.keys():
-            ana = l.get('afun')
-            lfrom = l.get('from')
-            dest = l.get('dep')
-        else:
-            ana = l.get('ana')
-            if ana[:4] != 'syn:':  # dont bother...
-                continue
-            ana = ana[4:]
-            lfrom, dest = l.get('target').replace('#', '').split()
+
+def file_sentence_generator(filename, skip_id_check, do_msd_translate, pc_tag, status):
+    et = load_xml(filename, status)
+    for sentence in et.iter('s'):
+        words = {}
+        for w in sentence.iter("w"):
+            words[w.get('id')] = Word(w, do_msd_translate)
+        for pc in sentence.iter(pc_tag):
+            words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)
 
-        if lfrom in words:
-            if not skip_id_check and is_root_id(lfrom):
-                logging.error("NOO: {}".format(lfrom))
-                sys.exit(1)
-
-            if dest in words:
-                next_word = words[dest]
-                words[lfrom].add_link(ana, next_word)
+        for l in sentence.iter("link"):
+            if 'dep' in l.keys():
+                ana = l.get('afun')
+                lfrom = l.get('from')
+                dest = l.get('dep')
             else:
-                logging.error("Unknown id: {}".format(dest))
-                sys.exit(1)
+                ana = l.get('ana')
+                if ana[:4] != 'syn:':  # dont bother...
+                    continue
+                ana = ana[4:]
+                lfrom, dest = l.get('target').replace('#', '').split()
 
-        else:
-            # strange errors, just skip...
-            pass
+            if lfrom in words:
+                if not skip_id_check and is_root_id(lfrom):
+                    logging.error("NOO: {}".format(lfrom))
+                    sys.exit(1)
 
-    return list(words.values())
+                if dest in words:
+                    next_word = words[dest]
+                    words[lfrom].add_link(ana, next_word)
+                else:
+                    logging.error("Unknown id: {}".format(dest))
+                    sys.exit(1)
+
+            else:
+                # strange errors, just skip...
+                pass
+
+        yield list(words.values())
 
 
 def match_file(words, structures):
     matches = {s: [] for s in structures}
 
-    for s in tqdm(structures):
+    for s in structures:
         for w in words:
             mhere = s.match(w)
             for match in mhere:
@@ -136,7 +139,7 @@ def main(structures_file, args):
                 word_stats.add_words(words)
 
         else:
-            for words in load_files(args):
+            for words in tqdm(load_files(args)):
                 matches = match_file(words, structures)
 
                 # just save to temporary file, used for children of a parallel process
                 # MUST NOT have more than one file
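
Note (not part of the patch): the shape of this change is a generator of generators flattened with `yield from`. A minimal, self-contained sketch of the pattern; all names below are illustrative stand-ins, not this repo's API:

    # Sketch of the control flow introduced above: load_files still looks like
    # a flat stream to its callers, but each file now yields sentence by sentence.
    def file_sentence_generator(filename):
        for i in range(3):                      # stand-in for: for sentence in et.iter('s')
            yield ["%s-w%d" % (filename, i)]    # stand-in for: yield list(words.values())

    def load_files(filenames):
        for fname in filenames:
            # 'yield from' splices each per-file generator into the outer one,
            # so consumers see one continuous stream of sentences
            yield from file_sentence_generator(fname)

    for words in load_files(["a.xml", "b.xml"]):
        print(words)   # one sentence's worth of words at a time

This is also why the tqdm call moves in the second hunk: match_file is now called once per sentence, so a progress bar inside it (over structures) would restart constantly, while one around load_files reports overall progress.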
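One caveat: load_xml still calls fp.read() and parses each document in full, so peak memory per file is unchanged; the win is that downstream consumers hold only one sentence's Word objects at a time. If whole-file parsing itself ever becomes the bottleneck, xml.etree.ElementTree.iterparse can stream <s> subtrees without materializing the whole tree. A hedged sketch, assuming the same element names as the patched code and ignoring its namespace stripping; untested against this repo's inputs:

    from xml.etree import ElementTree

    def stream_sentences(filename):
        # iterparse fires an 'end' event once an element's subtree is complete
        for event, elem in ElementTree.iterparse(filename, events=("end",)):
            tag = elem.tag.rsplit('}', 1)[-1]   # drop any '{namespace}' prefix
            if tag == 's':
                yield elem        # a fully built <s> element, ready for .iter("w")
                elem.clear()      # free the finished subtree so memory stays bounded

Clearing each finished <s> keeps memory roughly proportional to sentence size rather than file size, at the cost of the slower pass the commit message already accepts.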