luscenje_struktur/src/loader.py

70 lines
2.0 KiB
Python
Raw Normal View History

2019-06-17 13:38:55 +00:00
from xml.etree import ElementTree
import logging
import re
import sys
2019-06-17 15:30:51 +00:00
from progress_bar import progress
2019-06-17 13:38:55 +00:00
from word import Word
def is_root_id(id_):
    """Return True when *id_* consists of exactly three dot-separated parts."""
    # Three parts means exactly two '.' separators.
    return id_.count('.') == 2
def load_files(args):
    """Lazily load every input file named in *args*.

    Reads ``args.input`` (iterable of file names), ``args.skip_id_check``,
    ``args.no_msd_translate`` and ``args.pc_tag``.

    Yields, for each file in order, the list of words returned by
    ``file_sentence_generator`` for that file's parsed XML tree.
    """
    paths = args.input
    no_id_check = args.skip_id_check
    translate_msd = not args.no_msd_translate

    for path in paths:
        yield file_sentence_generator(
            load_xml(path), no_id_check, translate_msd, args.pc_tag
        )
def load_xml(filename):
    """Parse the XML file *filename* and return its root element.

    Before parsing, the first default namespace declaration
    (``xmlns="..."``) is removed and every `` xml:`` attribute prefix is
    dropped, so elements and attributes can be looked up by their plain
    names (e.g. ``s`` and ``id`` instead of namespaced forms).

    The file is decoded as UTF-8 explicitly rather than with the
    platform's default encoding, so non-ASCII text loads the same way on
    every system.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
        xml.etree.ElementTree.ParseError: if the content is not well-formed.
    """
    with open(filename, 'r', encoding='utf-8') as fp:
        content = fp.read()

    # Only the first (root-level) default namespace declaration is stripped.
    xmlstring = re.sub(r' xmlns="[^"]+"', '', content, count=1)
    # Turn e.g. ` xml:id="..."` into ` id="..."`.
    xmlstring = xmlstring.replace(' xml:', ' ')
    return ElementTree.XML(xmlstring)
def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
    """Build the words of one parsed file and connect them via its links.

    Despite the name, this returns a list rather than being a generator.

    Args:
        et: root element of the parsed XML; its ``s`` descendants are
            processed one by one.
        skip_id_check: when false, a link whose source id has exactly
            three dot-separated parts is treated as fatal.
        do_msd_translate: forwarded unchanged to the ``Word`` constructors.
        pc_tag: tag name of the elements turned into words via
            ``Word.pc_word``.

    Returns:
        A list of all ``Word`` objects created for the file.

    Exits the process with status 1 (after logging an error) when the id
    check fails or when a link points at an id that has no word.
    """
    words_by_id = {}
    all_sentences = list(et.iter('s'))

    for sent in progress(all_sentences, "load-text", infile=True):
        for w_elem in sent.iter("w"):
            words_by_id[w_elem.get('id')] = Word(w_elem, do_msd_translate)
        for pc_elem in sent.iter(pc_tag):
            words_by_id[pc_elem.get('id')] = Word.pc_word(pc_elem, do_msd_translate)

        for link in sent.iter("link"):
            if 'dep' in link.attrib:
                # Link described by separate afun/from/dep attributes.
                label = link.get('afun')
                src_id = link.get('from')
                dst_id = link.get('dep')
            else:
                # Link described by ana="syn:<label>" and target="#from #to";
                # anything that is not a "syn:" link is ignored.
                label = link.get('ana')
                if label[:4] != 'syn:':
                    continue
                label = label[4:]
                src_id, dst_id = link.get('target').replace('#', '').split()

            if src_id not in words_by_id:
                # Unknown source ids are skipped silently.
                continue

            if not skip_id_check and is_root_id(src_id):
                logging.error("NOO: {}".format(src_id))
                sys.exit(1)

            if dst_id not in words_by_id:
                logging.error("Unknown id: {}".format(dst_id))
                sys.exit(1)

            words_by_id[src_id].add_link(label, words_by_id[dst_id])

    return list(words_by_id.values())