124 lines
		
	
	
		
			3.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			124 lines
		
	
	
		
			3.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from xml.etree import ElementTree
 | |
| import logging
 | |
| import re
 | |
| import sys
 | |
| import gzip
 | |
| import pathlib
 | |
| 
 | |
| from progress_bar import progress
 | |
| from word import Word
 | |
| 
 | |
| 
 | |
def is_root_id(id_):
    """Return True when *id_* consists of exactly three dot-separated parts."""
    # Three dot-separated parts means exactly two separating dots.
    dot_count = id_.count('.')
    return dot_count == 2
 | |
| 
 | |
| 
 | |
def load_files(args):
    """Yield the loaded words of every input file, one file per iteration.

    Args:
        args: Parsed command-line options. The attributes read here are
            ``input`` (sequence of file names), ``skip_id_check``,
            ``no_msd_translate`` and, for ``.xml`` inputs only, ``pc_tag``.

    Yields:
        For each file, the list of words produced by
        ``file_sentence_generator`` (``.xml``) or ``load_gz`` (``.gz``).

    Raises:
        NotImplementedError: If a file has any other extension.
    """
    filenames = args.input
    skip_id_check = args.skip_id_check
    do_msd_translate = not args.no_msd_translate
    total = len(filenames)

    # Count from 1 so the progress line runs "1/N" .. "N/N" instead of
    # starting at "0/N" and never reaching the total.
    for position, fname in enumerate(filenames, start=1):
        print("FILE ", fname, "{}/{}".format(position, total))
        extension = pathlib.Path(fname).suffix

        if extension == ".xml":
            et = load_xml(fname)
            yield file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag)
        elif extension == ".gz":
            yield load_gz(fname)
        else:
            raise NotImplementedError("Unknown file extension: {}".format(extension))
 | |
| 
 | |
| 
 | |
def _finish_gz_sentence(result, words, links, sentence_id):
    """Attach the collected links of one sentence and append its words to *result*."""
    for link_src, link_dest, link_type in links:
        if link_src not in words or link_dest not in words:
            logging.warning("Bad link in sentence: %s", sentence_id)
            continue
        words[link_src].add_link(link_type, words[link_dest])
    result.extend(words.values())


def load_gz(filename):
    """Load words from a gzip-compressed, tab/comma-separated token file.

    Every non-empty line is expected to split into seven fields:
    sentence id, word id, text, msd, lemma, link source and link type.
    A line whose word id is ``"1"`` starts a new sentence. A sentence that
    contains a line with the wrong number of fields is dropped entirely.

    Args:
        filename: Path of the ``.gz`` file to read.

    Returns:
        A list with the ``Word`` objects of all well-formed sentences, with
        the links of each sentence already attached via ``add_link``.
    """
    result = []
    words = {}
    links = []
    bad_sentence = False
    sentence_id = None

    with gzip.open(filename, 'r') as fp:
        for line in progress(fp, 'load-gz'):
            line_str = line.decode('utf8').strip()
            if not line_str:
                # A blank line carries no token; indexing its fields would fail.
                continue

            # Commas act as separators too; a run of three separators means
            # the field between them was a literal comma, so restore it.
            line_fixed = line_str.replace(',', '\t').replace('\t\t\t', '\t,\t')
            fields = line_fixed.split("\t")

            # Sentence boundary. Also taken when the previous sentence was bad
            # and collected no words, so the flag cannot leak into this one.
            if len(fields) > 1 and fields[1] == "1" and (words or bad_sentence):
                if not bad_sentence:
                    _finish_gz_sentence(result, words, links, sentence_id)
                bad_sentence = False
                links = []
                words = {}

            if len(fields) != 7:
                # Malformed line: discard the sentence and skip the line
                # instead of reusing values left over from an earlier line.
                bad_sentence = True
                continue

            sid, wid, text, msd, lemma, link_src, link_type = fields
            sentence_id = sid
            full_id = "{}.{}".format(sid, wid)

            words[wid] = Word(lemma, msd, full_id, text, True)
            if link_src != '0':
                links.append((link_src, wid, link_type))

    # Flush the last sentence of the file.
    if not bad_sentence:
        _finish_gz_sentence(result, words, links, sentence_id)
    return result
 | |
| 
 | |
def load_xml(filename):
    """Parse an XML file into an element tree with namespaces stripped.

    The first default-namespace declaration (``xmlns="..."``) is removed and
    every `` xml:`` attribute prefix is dropped (so ``xml:id`` becomes
    ``id``), which lets callers look up tags and attributes by plain name.

    Args:
        filename: Path of the XML file to read.

    Returns:
        The root element produced by ``ElementTree.XML``.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding (the .gz loader decodes UTF-8 as well).
    with open(filename, 'r', encoding='utf-8') as fp:
        content = fp.read()

    xmlstring = re.sub(r' xmlns="[^"]+"', '', content, count=1)
    xmlstring = xmlstring.replace(' xml:', ' ')
    return ElementTree.XML(xmlstring)
 | |
| 
 | |
| 
 | |
def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
    """Build linked ``Word`` objects for every sentence of a parsed XML tree.

    Args:
        et: Root element whose ``s`` descendants are the sentences.
        skip_id_check: When false, a link whose source id is a root id (see
            ``is_root_id``) aborts the program.
        do_msd_translate: Passed through to ``Word.from_xml`` and
            ``Word.pc_word``.
        pc_tag: Tag name of the elements turned into words via
            ``Word.pc_word``.

    Returns:
        A list of all words collected from the tree, keyed internally by
        their ``id`` attribute.

    Note:
        Calls ``sys.exit(1)`` on a root-id link source (unless
        *skip_id_check*) and on a link destination that is not a known word.
    """
    words = {}
    # Materialised up front before being handed to the progress wrapper.
    sentences = list(et.iter('s'))
    for sentence in progress(sentences, "load-text"):
        for w in sentence.iter("w"):
            words[w.get('id')] = Word.from_xml(w, do_msd_translate)
        for pc in sentence.iter(pc_tag):
            words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)

        for link in sentence.iter("link"):
            if 'dep' in link.keys():
                # Link given as explicit from/dep/afun attributes.
                ana = link.get('afun')
                lfrom = link.get('from')
                dest = link.get('dep')
            else:
                # Link given as ana="syn:<type>" target="#from #dest".
                ana = link.get('ana')
                # Skip links without an 'ana' attribute (previously a
                # TypeError) and those that are not syntactic links.
                if ana is None or not ana.startswith('syn:'):
                    continue
                ana = ana[4:]
                lfrom, dest = link.get('target').replace('#', '').split()

            if lfrom not in words:
                # Source id is not a known word; such links are ignored.
                continue

            if not skip_id_check and is_root_id(lfrom):
                logging.error("NOO: {}".format(lfrom))
                sys.exit(1)

            if dest not in words:
                logging.error("Unknown id: {}".format(dest))
                sys.exit(1)

            words[lfrom].add_link(ana, words[dest])

    return list(words.values())
 |