Reset whitespace (glue) at paragraph boundaries instead of sentence boundaries; the progress bar now also advances per paragraph instead of per sentence.

2021-01-26 14:57:42 +01:00 · 2021-01-26 14:57:42 +01:00 · f1366548b6
commit f1366548b6
parent 552f2e4bd0
1 changed files with 61 additions and 64 deletions
--- a/luscenje_struktur/loader.py
+++ b/luscenje_struktur/loader.py
@ -197,76 +197,73 @@ def file_sentence_generator(et, args):
    previous_pc = False
    words = {}
-    sentences = list(et.iter('s'))
+    paragraphs = list(et.iter('p'))
-    for sentence in progress(sentences, "load-text"):
+    for paragraph in progress(paragraphs, "load-text"):
        # create fake root word
        words[sentence.get('id')] = Word.fake_root_word(sentence.get('id'))
        last_word_id = None
        previous_glue = ''
        sentences = list(paragraph.iter('s'))
        for sentence in sentences:
            # create fake root word
            words[sentence.get('id')] = Word.fake_root_word(sentence.get('id'))
            last_word_id = None
-        if args.new_tei:
+            if args.new_tei:
-            for w in sentence.iter():
+                for w in sentence.iter():
-                if w.tag == 'w':
+                    if w.tag == 'w':
-                    words[w.get('id')] = Word.from_xml(w, do_msd_translate)
+                        words[w.get('id')] = Word.from_xml(w, do_msd_translate)
-                    if use_punctuations:
+                        if use_punctuations:
-                        previous_glue = '' if 'join' in w.attrib and w.get('join') == 'right' else ' '
+                            previous_glue = '' if 'join' in w.attrib and w.get('join') == 'right' else ' '
-                elif w.tag == pc_tag:
+                    elif w.tag == pc_tag:
-                    words[w.get('id')] = Word.pc_word(w, do_msd_translate)
+                        words[w.get('id')] = Word.pc_word(w, do_msd_translate)
-                    if use_punctuations:
+                        if use_punctuations:
-                        words[w.get('id')].previous_glue = previous_glue
+                            words[w.get('id')].previous_glue = previous_glue
-                        words[w.get('id')].glue = '' if 'join' in w.attrib and w.get('join') == 'right' else ' '
+                            words[w.get('id')].glue = '' if 'join' in w.attrib and w.get('join') == 'right' else ' '
-                        previous_glue = '' if 'join' in w.attrib and w.get('join') == 'right' else ' '
+                            previous_glue = '' if 'join' in w.attrib and w.get('join') == 'right' else ' '
        else:
            for w in sentence.iter():
                if w.tag == 'w':
                    words[w.get('id')] = Word.from_xml(w, do_msd_translate)
                    if use_punctuations:
                        previous_glue = ''
                        last_word_id = None
                elif w.tag == pc_tag:
                    words[w.get('id')] = Word.pc_word(w, do_msd_translate)
                    if use_punctuations:
                        last_word_id = w.get('id')
                        words[w.get('id')].previous_glue = previous_glue
                        previous_glue = ''
                elif use_punctuations and w.tag == 'c':
                    # always save previous glue
                    previous_glue = w.text
                    if last_word_id:
                        words[last_word_id].glue += w.text
        # for w in sentence.iter("w"):
        #     words[w.get('id')] = Word.from_xml(w, do_msd_translate)
        # for pc in sentence.iter(pc_tag):
        #     words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)
        for l in sentence.iter("link"):
            if 'dep' in l.keys():
                ana = l.get('afun')
                lfrom = l.get('from')
                dest = l.get('dep')
            else:
-                ana = l.get('ana')
+                for w in sentence.iter():
-                if ana[:8] != 'jos-syn:': # dont bother...
+                    if w.tag == 'w':
-                    continue
+                        words[w.get('id')] = Word.from_xml(w, do_msd_translate)
-                ana = ana[8:]
+                        if use_punctuations:
-                lfrom, dest = l.get('target').replace('#', '').split()
+                            previous_glue = ''
                            last_word_id = None
                    elif w.tag == pc_tag:
                        words[w.get('id')] = Word.pc_word(w, do_msd_translate)
                        if use_punctuations:
                            last_word_id = w.get('id')
                            words[w.get('id')].previous_glue = previous_glue
                            previous_glue = ''
                    elif use_punctuations and w.tag == 'c':
                        # always save previous glue
                        previous_glue = w.text
                        if last_word_id:
                            words[last_word_id].glue += w.text
-            if lfrom in words:
+            for l in sentence.iter("link"):
-                if not skip_id_check and is_root_id(lfrom):
+                if 'dep' in l.keys():
-                    logging.error("Id {} is not fine, you might want to try with tag --skip-id-check".format(lfrom))
+                    ana = l.get('afun')
-                    sys.exit(1)
+                    lfrom = l.get('from')
-
+                    dest = l.get('dep')
                if dest in words:
                    next_word = words[dest]
                    words[lfrom].add_link(ana, next_word)
                else:
-                    logging.error("Unknown id: {}".format(dest))
+                    ana = l.get('ana')
-                    sys.exit(1)
+                    if ana[:8] != 'jos-syn:': # dont bother...
                        continue
                    ana = ana[8:]
                    lfrom, dest = l.get('target').replace('#', '').split()
-            else:
+                if lfrom in words:
-                # strange errors, just skip...
+                    if not skip_id_check and is_root_id(lfrom):
-                pass
+                        logging.error("Id {} is not fine, you might want to try with tag --skip-id-check".format(lfrom))
                        sys.exit(1)
                    if dest in words:
                        next_word = words[dest]
                        words[lfrom].add_link(ana, next_word)
                    else:
                        logging.error("Unknown id: {}".format(dest))
                        sys.exit(1)
                else:
                    # strange errors, just skip...
                    pass
    return list(words.values())