Loader to its own module

This commit is contained in:
Ozbolt Menegatti 2019-06-17 15:38:55 +02:00
parent 51cf3e7064
commit 3552f14b81
2 changed files with 73 additions and 67 deletions

72
src/loader.py Normal file
View File

@ -0,0 +1,72 @@
from xml.etree import ElementTree
import logging
import re
import sys
from tqdm import tqdm
from word import Word
def is_root_id(id_):
    """Return True when ``id_`` has exactly three dot-separated components."""
    # Two separators <=> three components.
    return id_.count('.') == 2
def load_files(args):
    """Lazily load each input file and yield its words, one item per file.

    Reads ``args.input`` (iterable of file names), ``args.skip_id_check``,
    ``args.no_msd_translate`` and ``args.pc_tag``.  Every file is parsed with
    ``load_xml`` and passed to ``file_sentence_generator``; the value that
    call returns is yielded as a single item.
    """
    paths = args.input
    id_check_disabled = args.skip_id_check
    translate_msd = not args.no_msd_translate

    for path in paths:
        tree = load_xml(path)
        yield file_sentence_generator(
            tree, id_check_disabled, translate_msd, args.pc_tag)
def load_xml(filename):
    """Read an XML file and parse it with namespace markers stripped.

    Before parsing, the first default-namespace declaration
    (`` xmlns="..."``) is removed and every `` xml:`` attribute prefix is
    dropped, so tags and attributes can be addressed by their bare names
    (e.g. ``s`` and ``id``).

    Args:
        filename: path of the XML file to load.

    Returns:
        The root ``xml.etree.ElementTree.Element`` of the parsed document.

    Raises:
        OSError: the file cannot be opened.
        UnicodeDecodeError: the file is not valid UTF-8.
        xml.etree.ElementTree.ParseError: the content is not well-formed XML.
    """
    logging.info("LOADING XML: %s", filename)

    # Decode explicitly as UTF-8 instead of relying on the platform's
    # locale-dependent default, which breaks non-ASCII text on some systems.
    with open(filename, 'r', encoding='utf-8') as fp:
        content = fp.read()

    # Only the first xmlns declaration is removed (count=1).
    xmlstring = re.sub(r' xmlns="[^"]+"', '', content, count=1)
    xmlstring = xmlstring.replace(' xml:', ' ')
    return ElementTree.XML(xmlstring)
def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
    """Create a Word for every token in *et* and connect them via links.

    For each ``s`` element, every ``w`` element becomes ``Word(w, ...)`` and
    every ``pc_tag`` element becomes ``Word.pc_word(pc, ...)``, keyed by the
    element's ``id`` attribute.  Each ``link`` element of the sentence is
    then turned into an ``add_link`` call on the source word.

    Two link notations are accepted:
      * ``dep``/``from``/``afun`` attributes, used as-is;
      * ``ana="syn:<label>"`` with ``target="#<from> #<dest>"``; links whose
        ``ana`` is missing or does not start with ``syn:`` are ignored.

    Args:
        et: parsed XML element whose ``s`` descendants are processed.
        skip_id_check: when false, a link whose source id satisfies
            ``is_root_id`` terminates the program.
        do_msd_translate: forwarded to ``Word`` / ``Word.pc_word``.
        pc_tag: tag name of the elements converted with ``Word.pc_word``.

    Returns:
        A list with the values of the id -> Word mapping built for the
        whole document.

    Raises:
        SystemExit: (status 1) on a rejected source id or when a link's
            destination id is unknown while its source id is known.
    """
    words = {}

    # Materialised up front, presumably so the progress bar knows the total.
    sentences = list(et.iter('s'))
    for sentence in tqdm(sentences, desc="load-text"):
        for w in sentence.iter("w"):
            words[w.get('id')] = Word(w, do_msd_translate)
        for pc in sentence.iter(pc_tag):
            words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)

        for link in sentence.iter("link"):
            if 'dep' in link.keys():
                ana = link.get('afun')
                lfrom = link.get('from')
                dest = link.get('dep')
            else:
                ana = link.get('ana')
                # Element.get returns None for a missing attribute; treat
                # that like any other non-"syn:" link instead of crashing.
                if ana is None or not ana.startswith('syn:'):
                    continue
                ana = ana[4:]
                lfrom, dest = link.get('target').replace('#', '').split()

            if lfrom not in words:
                # strange errors, just skip...
                continue

            if not skip_id_check and is_root_id(lfrom):
                logging.error("NOO: %s", lfrom)
                sys.exit(1)

            if dest not in words:
                logging.error("Unknown id: %s", dest)
                sys.exit(1)

            words[lfrom].add_link(ana, words[dest])

    return list(words.values())

View File

@ -19,73 +19,9 @@ from syntactic_structure import build_structures
from match_store import MatchStore
from word_stats import WordStats
from writer import Writer
from loader import load_files
def is_root_id(id_):
    """Return True when ``id_`` splits on '.' into exactly three parts."""
    parts = id_.split('.')
    return 3 == len(parts)
def load_files(args):
    """Lazily load each input file and yield the word chunks it produces.

    Reads ``args.input`` (iterable of file names), ``args.skip_id_check``,
    ``args.no_msd_translate``, ``args.pc_tag`` and ``args.chunk_size``.
    Every file is parsed with ``load_xml``; each item produced by
    ``file_sentence_generator`` for that file is re-yielded in order.
    """
    filenames = args.input
    skip_id_check = args.skip_id_check
    do_msd_translate = not args.no_msd_translate

    # The file index was never used, so iterate the names directly.
    for fname in filenames:
        et = load_xml(fname)
        yield from file_sentence_generator(
            et, skip_id_check, do_msd_translate, args.pc_tag, args.chunk_size)
def load_xml(filename):
    """Read an XML file and parse it with namespace markers stripped.

    Before parsing, the first default-namespace declaration
    (`` xmlns="..."``) is removed and every `` xml:`` attribute prefix is
    dropped, so tags and attributes can be addressed by their bare names.

    Args:
        filename: path of the XML file to load.

    Returns:
        The root ``xml.etree.ElementTree.Element`` of the parsed document.

    Raises:
        OSError: the file cannot be opened.
        UnicodeDecodeError: the file is not valid UTF-8.
        xml.etree.ElementTree.ParseError: the content is not well-formed XML.
    """
    logging.info("\rLOADING XML: %s", filename)

    # Decode explicitly as UTF-8 instead of relying on the platform's
    # locale-dependent default, which breaks non-ASCII text on some systems.
    with open(filename, 'r', encoding='utf-8') as fp:
        content = fp.read()

    # Only the first xmlns declaration is removed (count=1).
    xmlstring = re.sub(r' xmlns="[^"]+"', '', content, count=1)
    xmlstring = xmlstring.replace(' xml:', ' ')
    return ElementTree.XML(xmlstring)
def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag, chunk_size):
    """Yield lists of linked Word objects built from *et*, in chunks.

    For each ``s`` element, every ``w`` element becomes ``Word(w, ...)`` and
    every ``pc_tag`` element becomes ``Word.pc_word(pc, ...)``, keyed by the
    element's ``id`` attribute.  Each ``link`` element of the sentence is
    then turned into an ``add_link`` call on the source word.

    Two link notations are accepted:
      * ``dep``/``from``/``afun`` attributes, used as-is;
      * ``ana="syn:<label>"`` with ``target="#<from> #<dest>"``; links whose
        ``ana`` is missing or does not start with ``syn:`` are ignored.

    Args:
        et: parsed XML element whose ``s`` descendants are processed.
        skip_id_check: when false, a link whose source id satisfies
            ``is_root_id`` terminates the program.
        do_msd_translate: forwarded to ``Word`` / ``Word.pc_word``.
        pc_tag: tag name of the elements converted with ``Word.pc_word``.
        chunk_size: when positive, the accumulated words are yielded and the
            accumulator reset as soon as it holds more than this many words
            at the end of a sentence; otherwise everything is yielded once.

    Yields:
        Lists of Word objects.  A final (possibly empty) list is always
        yielded after the last sentence.

    Raises:
        SystemExit: (status 1) on a rejected source id or when a link's
            destination id is unknown while its source id is known.
    """
    words = {}

    for sentence in et.iter('s'):
        for w in sentence.iter("w"):
            words[w.get('id')] = Word(w, do_msd_translate)
        for pc in sentence.iter(pc_tag):
            words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)

        for link in sentence.iter("link"):
            if 'dep' in link.keys():
                ana = link.get('afun')
                lfrom = link.get('from')
                dest = link.get('dep')
            else:
                ana = link.get('ana')
                # Element.get returns None for a missing attribute; treat
                # that like any other non-"syn:" link instead of crashing.
                if ana is None or not ana.startswith('syn:'):
                    continue
                ana = ana[4:]
                lfrom, dest = link.get('target').replace('#', '').split()

            if lfrom not in words:
                # strange errors, just skip...
                continue

            if not skip_id_check and is_root_id(lfrom):
                logging.error("NOO: %s", lfrom)
                sys.exit(1)

            if dest not in words:
                logging.error("Unknown id: %s", dest)
                sys.exit(1)

            words[lfrom].add_link(ana, words[dest])

        # Chunks are only cut on sentence boundaries, so all links of a
        # sentence are resolved before its words are handed out.
        if chunk_size > 0 and len(words) > chunk_size:
            yield list(words.values())
            words = {}

    yield list(words.values())
def match_file(words, structures):
matches = {s: [] for s in structures}
@ -109,8 +45,6 @@ def main(args):
match_store = MatchStore(args)
word_stats = WordStats(lemma_msds)
args.chunk_size = 50000
if args.parallel:
num_parallel = int(args.parallel)