72 lines
2.1 KiB
Python
72 lines
2.1 KiB
Python
from xml.etree import ElementTree
|
|
import logging
|
|
import re
|
|
import sys
|
|
|
|
from tqdm import tqdm
|
|
|
|
from word import Word
|
|
|
|
|
|
def is_root_id(id_):
    """Return True when *id_* has exactly three dot-separated components."""
    # Three components are separated by exactly two dots.
    return id_.count('.') == 2
|
|
|
|
|
|
def load_files(args):
    """Lazily load every input file named in *args*.

    Args:
        args: Options object providing ``input`` (iterable of file names),
            ``skip_id_check``, ``no_msd_translate`` and ``pc_tag``.

    Yields:
        For each file name, in order, the result of
        ``file_sentence_generator`` applied to the parsed XML tree.
    """
    paths = args.input
    # Options shared by all files: (skip_id_check, do_msd_translate).
    shared_options = (args.skip_id_check, not args.no_msd_translate)

    for path in paths:
        yield file_sentence_generator(load_xml(path), *shared_options, args.pc_tag)
|
|
|
|
|
|
def load_xml(filename):
    """Read an XML file and parse it with namespace markers stripped.

    The first default-namespace declaration (`` xmlns="..."``) is removed and
    every `` xml:`` attribute prefix is dropped, so that elements and
    attributes can be looked up by bare name (``id`` instead of ``xml:id``).
    Both substitutions are plain text replacements on the whole document.

    Args:
        filename: Path of the XML file to load.

    Returns:
        The root element of the parsed document.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
        xml.etree.ElementTree.ParseError: If the content is not well-formed.
    """
    logging.info("LOADING XML: {}".format(filename))

    # Decode explicitly instead of relying on the platform's locale encoding,
    # which is not UTF-8 everywhere and would garble non-ASCII text.
    # NOTE(review): assumes the input files are UTF-8 encoded - confirm.
    with open(filename, 'r', encoding='utf-8') as fp:
        content = fp.read()

    # Drop only the first default-namespace declaration so tags are unqualified.
    xmlstring = re.sub(' xmlns="[^"]+"', '', content, count=1)
    # Turn attributes such as ``xml:id`` into plain ``id``.
    xmlstring = xmlstring.replace(' xml:', ' ')
    return ElementTree.XML(xmlstring)
|
|
|
|
|
|
def _parse_link(link):
    """Extract ``(label, source_id, target_id)`` from a ``link`` element.

    Two attribute layouts are recognised:

    * a ``dep`` attribute is present: the label comes from ``afun``, the
      source id from ``from`` and the target id from ``dep``;
    * otherwise: ``ana`` must start with ``syn:`` (the prefix is removed to
      form the label) and ``target`` holds the two ids separated by
      whitespace, each optionally prefixed with ``#``.

    Returns None for links of the second layout whose ``ana`` attribute is
    missing or does not start with ``syn:``.
    """
    if 'dep' in link.keys():
        return link.get('afun'), link.get('from'), link.get('dep')

    ana = link.get('ana')
    # ``get`` yields None for a missing attribute; treat that like any other
    # non-"syn:" link instead of failing on the prefix check.
    if ana is None or not ana.startswith('syn:'):
        return None

    source_id, target_id = link.get('target').replace('#', '').split()
    return ana[4:], source_id, target_id


def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
    """Build the words of a parsed document and connect them through links.

    Despite its name this function is not a generator: it processes every
    ``s`` element of the tree and returns a list.

    Args:
        et: Root element of the parsed XML document.
        skip_id_check: When false, a link whose source id is a known word and
            also satisfies ``is_root_id`` aborts the program.
        do_msd_translate: Passed through to the ``Word`` constructors.
        pc_tag: Tag name of the punctuation elements to load as words.

    Returns:
        A list of all ``Word`` objects created, with links added between them.

    Raises:
        SystemExit: On a failed id check or when a link points at an id that
            was not seen among the words loaded so far.
    """
    words = {}
    # Materialised so the progress bar knows the total number of sentences.
    sentences = list(et.iter('s'))

    for sentence in tqdm(sentences, desc="load-text"):
        for w in sentence.iter("w"):
            words[w.get('id')] = Word(w, do_msd_translate)
        for pc in sentence.iter(pc_tag):
            words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)

        for link in sentence.iter("link"):
            parsed = _parse_link(link)
            if parsed is None:
                # Not a syntactic link; nothing to record.
                continue
            ana, lfrom, dest = parsed

            if lfrom not in words:
                # Deliberately best-effort: links from unknown ids are skipped.
                continue

            if not skip_id_check and is_root_id(lfrom):
                logging.error("NOO: {}".format(lfrom))
                sys.exit(1)

            if dest not in words:
                logging.error("Unknown id: {}".format(dest))
                sys.exit(1)

            words[lfrom].add_link(ana, words[dest])

    return list(words.values())