# 70 lines | 2.0 KiB | Python
from xml.etree import ElementTree
 | 
						|
import logging
 | 
						|
import re
 | 
						|
import sys
 | 
						|
 | 
						|
from progress_bar import progress
 | 
						|
from word import Word
 | 
						|
 | 
						|
 | 
						|
def is_root_id(id_):
    """Return True when ``id_`` has exactly three ``.``-separated parts.

    >>> is_root_id("doc.1.2")
    True
    >>> is_root_id("doc.1.2.3")
    False
    """
    return len(id_.split('.')) == 3
 | 
						|
 | 
						|
 | 
						|
def load_files(args):
    """Lazily load each input file and yield its parsed words.

    Args:
        args: Object exposing ``input`` (iterable of file names),
            ``skip_id_check``, ``no_msd_translate`` and ``pc_tag``.

    Yields:
        For every file name in ``args.input``, in order, the value returned
        by ``file_sentence_generator`` for that file's parsed XML.
    """
    filenames = args.input
    skip_id_check = args.skip_id_check
    # The command-line flag is negative; the loader wants the positive form.
    do_msd_translate = not args.no_msd_translate

    for fname in filenames:
        et = load_xml(fname)
        yield file_sentence_generator(
            et, skip_id_check, do_msd_translate, args.pc_tag)
 | 
						|
 | 
						|
 | 
						|
def load_xml(filename):
    """Read an XML file and return its root element with namespaces removed.

    The first default-namespace declaration (`` xmlns="..."``) is dropped and
    every `` xml:`` prefix is replaced by a single space, so that elements
    and attributes can be addressed by bare names (``s``, ``id``) instead of
    namespace-qualified ones.

    NOTE: both rewrites are purely textual and are applied to the whole
    document, so a literal `` xml:`` occurring in text content is rewritten
    as well.

    Args:
        filename: Path of the XML file to read.

    Returns:
        The root element produced by ``ElementTree.XML``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
        xml.etree.ElementTree.ParseError: If the rewritten text is not
            well-formed XML.
    """
    # Decode explicitly as UTF-8 (the XML default) rather than relying on the
    # platform's locale encoding, which would mis-read non-ASCII text on
    # systems whose default is not UTF-8.
    with open(filename, 'r', encoding='utf-8') as fp:
        content = fp.read()

    # Only the first xmlns declaration is removed (count=1).
    xmlstring = re.sub(r' xmlns="[^"]+"', '', content, count=1)
    # e.g. ` xml:id="x"` becomes ` id="x"`.
    xmlstring = xmlstring.replace(' xml:', ' ')
    return ElementTree.XML(xmlstring)
 | 
						|
 | 
						|
 | 
						|
def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
    """Build ``Word`` objects for a parsed document and connect them by links.

    Despite the name, this is a plain function that returns a list, not a
    generator.

    For every ``s`` element under ``et``, each ``w`` element and each
    ``pc_tag`` element is turned into a ``Word`` keyed by its ``id``
    attribute. Each ``link`` element in the sentence is then read in one of
    two forms:

    * with a ``dep`` attribute: label from ``afun``, source from ``from``,
      destination from ``dep``;
    * otherwise: ``ana`` must start with ``syn:`` (the prefix is stripped to
      get the label) and ``target`` must hold two whitespace-separated ids,
      source then destination, with any ``#`` characters removed.

    Args:
        et: Root element to search for ``s`` elements.
        skip_id_check: When false, a link whose source id satisfies
            ``is_root_id`` is treated as a fatal error.
        do_msd_translate: Passed through to ``Word`` / ``Word.pc_word``.
        pc_tag: Tag name of the punctuation elements to load.

    Returns:
        A list of all ``Word`` objects created, across all sentences.

    Raises:
        SystemExit: Via ``sys.exit(1)`` after logging, when a known source id
            fails the root-id check or a link's destination id is unknown.
    """
    words = {}
    sentences = list(et.iter('s'))
    for sentence in progress(sentences, "load-text", infile=True):
        for w in sentence.iter("w"):
            words[w.get('id')] = Word(w, do_msd_translate)
        for pc in sentence.iter(pc_tag):
            words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)

        for link in sentence.iter("link"):
            if 'dep' in link.keys():
                ana = link.get('afun')
                lfrom = link.get('from')
                dest = link.get('dep')
            else:
                ana = link.get('ana')
                # Only syntactic links are of interest. A link without any
                # `ana` attribute is skipped the same way instead of
                # crashing on slicing None.
                if ana is None or ana[:4] != 'syn:':
                    continue
                ana = ana[4:]
                lfrom, dest = link.get('target').replace('#', '').split()

            if lfrom not in words:
                # Source id was never loaded as a word; skip the link.
                continue

            if not skip_id_check and is_root_id(lfrom):
                logging.error("NOO: {}".format(lfrom))
                sys.exit(1)

            if dest not in words:
                logging.error("Unknown id: {}".format(dest))
                sys.exit(1)

            words[lfrom].add_link(ana, words[dest])

    return list(words.values())