diff --git a/luscenje_struktur/loader.py b/luscenje_struktur/loader.py index f0cea89..a310fe9 100644 --- a/luscenje_struktur/loader.py +++ b/luscenje_struktur/loader.py @@ -204,23 +204,36 @@ def file_sentence_generator(et, args): words[sentence.get('id')] = Word.fake_root_word(sentence.get('id')) last_word_id = None - for w in sentence.iter(): - if w.tag == 'w': - words[w.get('id')] = Word.from_xml(w, do_msd_translate) - if use_punctuations: - previous_glue = '' - last_word_id = None - elif w.tag == pc_tag: - words[w.get('id')] = Word.pc_word(w, do_msd_translate) - if use_punctuations: - last_word_id = w.get('id') - words[w.get('id')].previous_glue = previous_glue - previous_glue = '' - elif use_punctuations and w.tag == 'c': - # always save previous glue - previous_glue = w.text - if last_word_id: - words[last_word_id].glue += w.text + if args.new_tei: + for w in sentence.iter(): + if w.tag == 'w': + words[w.get('id')] = Word.from_xml(w, do_msd_translate) + if use_punctuations: + previous_glue = '' if 'join' in w.attrib and w.get('join') == 'right' else ' ' + elif w.tag == pc_tag: + words[w.get('id')] = Word.pc_word(w, do_msd_translate) + if use_punctuations: + words[w.get('id')].previous_glue = previous_glue + words[w.get('id')].glue = '' if 'join' in w.attrib and w.get('join') == 'right' else ' ' + previous_glue = '' if 'join' in w.attrib and w.get('join') == 'right' else ' ' + else: + for w in sentence.iter(): + if w.tag == 'w': + words[w.get('id')] = Word.from_xml(w, do_msd_translate) + if use_punctuations: + previous_glue = '' + last_word_id = None + elif w.tag == pc_tag: + words[w.get('id')] = Word.pc_word(w, do_msd_translate) + if use_punctuations: + last_word_id = w.get('id') + words[w.get('id')].previous_glue = previous_glue + previous_glue = '' + elif use_punctuations and w.tag == 'c': + # always save previous glue + previous_glue = w.text + if last_word_id: + words[last_word_id].glue += w.text # for w in sentence.iter("w"): # words[w.get('id')] = Word.from_xml(w, do_msd_translate) diff --git a/luscenje_struktur/syntactic_structure.py b/luscenje_struktur/syntactic_structure.py index 79d863c..e888f6d 100644 --- a/luscenje_struktur/syntactic_structure.py +++ b/luscenje_struktur/syntactic_structure.py @@ -115,6 +115,8 @@ def build_structures(args): structures = [] for structure in et.iter('syntactic_structure'): + if structure.attrib['type'] == 'single': + continue to_append = SyntacticStructure.from_xml(structure, no_stats) if to_append is None: continue diff --git a/wani.py b/wani.py index 09b4f6b..3136b21 100644 --- a/wani.py +++ b/wani.py @@ -160,6 +160,9 @@ if __name__ == '__main__': parser.add_argument('--fixed-restriction-order', help='If used, words have to be in the same order as components.', action='store_true') + parser.add_argument('--new-tei', + help='Attribute to be used, when using new version of tei. (default=False)', + action='store_true') args = parser.parse_args() logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())