Move loader to its own module
This commit is contained in:
parent
51cf3e7064
commit
3552f14b81
72
src/loader.py
Normal file
72
src/loader.py
Normal file
|
@@ -0,0 +1,72 @@
|
|||
from xml.etree import ElementTree
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
from word import Word
|
||||
|
||||
|
||||
def is_root_id(id_):
    """Return True when *id_* is a root-level id, i.e. it has exactly
    three dot-separated components (two ``.`` separators)."""
    return id_.count('.') == 2
|
||||
|
||||
|
||||
def load_files(args):
    """Yield one sentence iterator per file listed in ``args.input``.

    Reads ``args.input`` (XML filenames), ``args.skip_id_check``,
    ``args.no_msd_translate`` and ``args.pc_tag``; parsing is delegated
    to ``load_xml`` and word extraction to ``file_sentence_generator``.
    """
    translate_msd = not args.no_msd_translate

    for path in args.input:
        tree = load_xml(path)
        yield file_sentence_generator(tree, args.skip_id_check, translate_msd, args.pc_tag)
|
||||
|
||||
|
||||
def load_xml(filename):
    """Read *filename* and return the parsed XML root element.

    The first default-namespace declaration is stripped so lookups can use
    plain tag names, and the ``xml:`` attribute prefix is removed.
    """
    logging.info("LOADING XML: {}".format(filename))
    with open(filename, 'r') as handle:
        text = handle.read()

    # Strip the first xmlns declaration, then the ``xml:`` attribute prefix,
    # before handing the document to the parser.
    return ElementTree.XML(
        re.sub(' xmlns="[^"]+"', '', text, count=1).replace(' xml:', ' ')
    )
|
||||
|
||||
|
||||
def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
    """Collect every word of the document *et* and resolve its links.

    Walks each ``<s>`` (sentence) element, builds a ``Word`` for every
    ``<w>`` token and every *pc_tag* punctuation element, then wires up
    ``<link>`` dependency edges between them.  Returns all words as a list.

    :param et: root element of the loaded XML document
    :param skip_id_check: when True, do not abort on links out of root ids
    :param do_msd_translate: forwarded to the ``Word`` constructors
    :param pc_tag: element tag used for punctuation tokens
    """
    words = {}
    for sentence in tqdm(list(et.iter('s')), desc="load-text"):
        for token in sentence.iter("w"):
            words[token.get('id')] = Word(token, do_msd_translate)
        for punct in sentence.iter(pc_tag):
            words[punct.get('id')] = Word.pc_word(punct, do_msd_translate)

        for link in sentence.iter("link"):
            if 'dep' in link.keys():
                # Endpoints given directly as 'from'/'dep' attributes.
                relation = link.get('afun')
                source_id = link.get('from')
                target_id = link.get('dep')
            else:
                # Relation carried in 'ana' with a "syn:" prefix; endpoints
                # packed into 'target' as "#id1 #id2".
                relation = link.get('ana')
                if relation[:4] != 'syn:':  # dont bother...
                    continue
                relation = relation[4:]
                source_id, target_id = link.get('target').replace('#', '').split()

            if source_id not in words:
                # strange errors, just skip...
                continue
            if not skip_id_check and is_root_id(source_id):
                logging.error("NOO: {}".format(source_id))
                sys.exit(1)
            if target_id not in words:
                logging.error("Unknown id: {}".format(target_id))
                sys.exit(1)
            words[source_id].add_link(relation, words[target_id])

    return list(words.values())
|
68
src/wani.py
68
src/wani.py
|
@@ -19,73 +19,9 @@ from syntactic_structure import build_structures
|
|||
from match_store import MatchStore
|
||||
from word_stats import WordStats
|
||||
from writer import Writer
|
||||
from loader import load_files
|
||||
|
||||
|
||||
def is_root_id(id_):
    """True when splitting *id_* on '.' produces exactly three parts."""
    parts = id_.split('.')
    return len(parts) == 3
|
||||
|
||||
|
||||
def load_files(args):
    """Yield sentences from every input file, chunked by ``args.chunk_size``.

    Reads ``args.input`` (XML filenames), ``args.skip_id_check``,
    ``args.no_msd_translate``, ``args.pc_tag`` and ``args.chunk_size``;
    parsing is delegated to ``load_xml`` and word extraction to
    ``file_sentence_generator``, whose batches are flattened via
    ``yield from``.
    """
    filenames = args.input
    skip_id_check = args.skip_id_check
    do_msd_translate = not args.no_msd_translate

    # Plain iteration: the previous ``enumerate`` bound an index that was
    # never used.
    for fname in filenames:
        et = load_xml(fname)
        yield from file_sentence_generator(et, skip_id_check, do_msd_translate,
                                           args.pc_tag, args.chunk_size)
|
||||
|
||||
|
||||
def load_xml(filename):
    """Read *filename* and return the parsed XML root element.

    The first default-namespace declaration is stripped so lookups can use
    plain tag names, and the ``xml:`` attribute prefix is removed.
    """
    # Fixed: the message previously began with a stray "\r" (a leftover from
    # progress-line overwriting) that garbled log output.
    logging.info("LOADING XML: {}".format(filename))
    with open(filename, 'r') as fp:
        content = fp.read()

    # Drop the first default xmlns declaration, then strip the ``xml:``
    # attribute prefix, before handing the document to the parser.
    xmlstring = re.sub(' xmlns="[^"]+"', '', content, count=1)
    xmlstring = xmlstring.replace(' xml:', ' ')
    return ElementTree.XML(xmlstring)
|
||||
|
||||
|
||||
def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag, chunk_size):
    """Yield batches of Word objects extracted from the parsed corpus *et*.

    Walks every <s> (sentence) element, builds a Word for each <w> token and
    each *pc_tag* punctuation element, then resolves <link> dependency edges
    between them.  Once more than *chunk_size* words have accumulated (and
    chunk_size > 0) the current batch is yielded and the buffer reset; a
    final batch (possibly empty) is always yielded at the end.

    :param et: root element of the loaded XML document
    :param skip_id_check: when True, do not abort on links out of root ids
    :param do_msd_translate: forwarded to the Word constructors
    :param pc_tag: element tag used for punctuation tokens
    :param chunk_size: max words per yielded batch; <= 0 disables chunking
    """
    words = {}
    for sentence in et.iter('s'):
        for w in sentence.iter("w"):
            words[w.get('id')] = Word(w, do_msd_translate)
        for pc in sentence.iter(pc_tag):
            words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)

        for l in sentence.iter("link"):
            if 'dep' in l.keys():
                # Endpoints given directly as 'from'/'dep' attributes.
                ana = l.get('afun')
                lfrom = l.get('from')
                dest = l.get('dep')
            else:
                # Relation carried in 'ana' with a "syn:" prefix; endpoints
                # packed into 'target' as "#id1 #id2".
                ana = l.get('ana')
                if ana[:4] != 'syn:': # dont bother...
                    continue
                ana = ana[4:]
                lfrom, dest = l.get('target').replace('#', '').split()

            if lfrom in words:
                # A link *from* a root-level id indicates malformed input.
                if not skip_id_check and is_root_id(lfrom):
                    logging.error("NOO: {}".format(lfrom))
                    sys.exit(1)

                if dest in words:
                    next_word = words[dest]
                    words[lfrom].add_link(ana, next_word)
                else:
                    logging.error("Unknown id: {}".format(dest))
                    sys.exit(1)

            else:
                # strange errors, just skip...
                pass

        # Emit a batch once the buffer grows past chunk_size (<= 0 disables
        # chunking entirely).  Chunks break only at sentence boundaries, so
        # links are always resolved within the same batch.
        if chunk_size > 0 and len(words) > chunk_size:
            yield list(words.values())
            words = {}

    yield list(words.values())
|
||||
|
||||
def match_file(words, structures):
|
||||
matches = {s: [] for s in structures}
|
||||
|
@@ -109,8 +45,6 @@ def main(args):
|
|||
match_store = MatchStore(args)
|
||||
word_stats = WordStats(lemma_msds)
|
||||
|
||||
args.chunk_size = 50000
|
||||
|
||||
if args.parallel:
|
||||
num_parallel = int(args.parallel)
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user