diff --git a/luscenje_struktur/loader.py b/luscenje_struktur/loader.py index 08d2ec0..e77ff88 100644 --- a/luscenje_struktur/loader.py +++ b/luscenje_struktur/loader.py @@ -197,76 +197,73 @@ def file_sentence_generator(et, args): previous_pc = False words = {} - sentences = list(et.iter('s')) - for sentence in progress(sentences, "load-text"): - # create fake root word - words[sentence.get('id')] = Word.fake_root_word(sentence.get('id')) - last_word_id = None + paragraphs = list(et.iter('p')) + for paragraph in progress(paragraphs, "load-text"): previous_glue = '' + sentences = list(paragraph.iter('s')) + for sentence in sentences: + # create fake root word + words[sentence.get('id')] = Word.fake_root_word(sentence.get('id')) + last_word_id = None - if args.new_tei: - for w in sentence.iter(): - if w.tag == 'w': - words[w.get('id')] = Word.from_xml(w, do_msd_translate) - if use_punctuations: - previous_glue = '' if 'join' in w.attrib and w.get('join') == 'right' else ' ' - elif w.tag == pc_tag: - words[w.get('id')] = Word.pc_word(w, do_msd_translate) - if use_punctuations: - words[w.get('id')].previous_glue = previous_glue - words[w.get('id')].glue = '' if 'join' in w.attrib and w.get('join') == 'right' else ' ' - previous_glue = '' if 'join' in w.attrib and w.get('join') == 'right' else ' ' - else: - for w in sentence.iter(): - if w.tag == 'w': - words[w.get('id')] = Word.from_xml(w, do_msd_translate) - if use_punctuations: - previous_glue = '' - last_word_id = None - elif w.tag == pc_tag: - words[w.get('id')] = Word.pc_word(w, do_msd_translate) - if use_punctuations: - last_word_id = w.get('id') - words[w.get('id')].previous_glue = previous_glue - previous_glue = '' - elif use_punctuations and w.tag == 'c': - # always save previous glue - previous_glue = w.text - if last_word_id: - words[last_word_id].glue += w.text - - # for w in sentence.iter("w"): - # words[w.get('id')] = Word.from_xml(w, do_msd_translate) - # for pc in sentence.iter(pc_tag): - # words[pc.get('id')] = Word.pc_word(pc, do_msd_translate) - - for l in sentence.iter("link"): - if 'dep' in l.keys(): - ana = l.get('afun') - lfrom = l.get('from') - dest = l.get('dep') + if args.new_tei: + for w in sentence.iter(): + if w.tag == 'w': + words[w.get('id')] = Word.from_xml(w, do_msd_translate) + if use_punctuations: + previous_glue = '' if 'join' in w.attrib and w.get('join') == 'right' else ' ' + elif w.tag == pc_tag: + words[w.get('id')] = Word.pc_word(w, do_msd_translate) + if use_punctuations: + words[w.get('id')].previous_glue = previous_glue + words[w.get('id')].glue = '' if 'join' in w.attrib and w.get('join') == 'right' else ' ' + previous_glue = '' if 'join' in w.attrib and w.get('join') == 'right' else ' ' else: - ana = l.get('ana') - if ana[:8] != 'jos-syn:': # dont bother... - continue - ana = ana[8:] - lfrom, dest = l.get('target').replace('#', '').split() - - if lfrom in words: - if not skip_id_check and is_root_id(lfrom): - logging.error("Id {} is not fine, you might want to try with tag --skip-id-check".format(lfrom)) - sys.exit(1) - - if dest in words: - next_word = words[dest] - words[lfrom].add_link(ana, next_word) + for w in sentence.iter(): + if w.tag == 'w': + words[w.get('id')] = Word.from_xml(w, do_msd_translate) + if use_punctuations: + previous_glue = '' + last_word_id = None + elif w.tag == pc_tag: + words[w.get('id')] = Word.pc_word(w, do_msd_translate) + if use_punctuations: + last_word_id = w.get('id') + words[w.get('id')].previous_glue = previous_glue + previous_glue = '' + elif use_punctuations and w.tag == 'c': + # always save previous glue + previous_glue = w.text + if last_word_id: + words[last_word_id].glue += w.text + + for l in sentence.iter("link"): + if 'dep' in l.keys(): + ana = l.get('afun') + lfrom = l.get('from') + dest = l.get('dep') else: - logging.error("Unknown id: {}".format(dest)) - sys.exit(1) + ana = l.get('ana') + if ana[:8] != 'jos-syn:': # dont bother... + continue + ana = ana[8:] + lfrom, dest = l.get('target').replace('#', '').split() + + if lfrom in words: + if not skip_id_check and is_root_id(lfrom): + logging.error("Id {} is not fine, you might want to try with tag --skip-id-check".format(lfrom)) + sys.exit(1) + + if dest in words: + next_word = words[dest] + words[lfrom].add_link(ana, next_word) + else: + logging.error("Unknown id: {}".format(dest)) + sys.exit(1) - else: - # strange errors, just skip... - pass + else: + # strange errors, just skip... + pass return list(words.values())