Move loader to its own module
This commit is contained in:
parent
51cf3e7064
commit
3552f14b81
72
src/loader.py
Normal file
72
src/loader.py
Normal file
|
@@ -0,0 +1,72 @@
|
|||
from xml.etree import ElementTree
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
from word import Word
|
||||
|
||||
|
||||
def is_root_id(id_):
    """Return True when *id_* is a root-level id, i.e. it has exactly
    three dot-separated components (two ``.`` separators)."""
    return id_.count('.') == 2
|
||||
|
||||
|
||||
def load_files(args):
    """Yield one sentence iterator per file listed in ``args.input``.

    Reads ``args.input`` (XML filenames), ``args.skip_id_check``,
    ``args.no_msd_translate`` and ``args.pc_tag``; parsing is delegated
    to ``load_xml`` and word extraction to ``file_sentence_generator``.
    """
    translate_msd = not args.no_msd_translate

    for path in args.input:
        tree = load_xml(path)
        yield file_sentence_generator(tree, args.skip_id_check, translate_msd, args.pc_tag)
|
||||
|
||||
|
||||
def load_xml(filename):
    """Read *filename* and return the parsed XML root element.

    The first default-namespace declaration is stripped so lookups can use
    plain tag names, and the ``xml:`` attribute prefix is removed.
    """
    logging.info("LOADING XML: {}".format(filename))
    with open(filename, 'r') as handle:
        text = handle.read()

    # Strip the first xmlns declaration, then the ``xml:`` attribute prefix,
    # before handing the document to the parser.
    return ElementTree.XML(
        re.sub(' xmlns="[^"]+"', '', text, count=1).replace(' xml:', ' ')
    )
|
||||
|
||||
|
||||
def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
    """Collect every word of the document *et* and resolve its links.

    Walks each ``<s>`` (sentence) element, builds a ``Word`` for every
    ``<w>`` token and every *pc_tag* punctuation element, then wires up
    ``<link>`` dependency edges between them.  Returns all words as a list.

    :param et: root element of the loaded XML document
    :param skip_id_check: when True, do not abort on links out of root ids
    :param do_msd_translate: forwarded to the ``Word`` constructors
    :param pc_tag: element tag used for punctuation tokens
    """
    words = {}
    for sentence in tqdm(list(et.iter('s')), desc="load-text"):
        for token in sentence.iter("w"):
            words[token.get('id')] = Word(token, do_msd_translate)
        for punct in sentence.iter(pc_tag):
            words[punct.get('id')] = Word.pc_word(punct, do_msd_translate)

        for link in sentence.iter("link"):
            if 'dep' in link.keys():
                # Endpoints given directly as 'from'/'dep' attributes.
                relation = link.get('afun')
                source_id = link.get('from')
                target_id = link.get('dep')
            else:
                # Relation carried in 'ana' with a "syn:" prefix; endpoints
                # packed into 'target' as "#id1 #id2".
                relation = link.get('ana')
                if relation[:4] != 'syn:':  # dont bother...
                    continue
                relation = relation[4:]
                source_id, target_id = link.get('target').replace('#', '').split()

            if source_id not in words:
                # strange errors, just skip...
                continue
            if not skip_id_check and is_root_id(source_id):
                logging.error("NOO: {}".format(source_id))
                sys.exit(1)
            if target_id not in words:
                logging.error("Unknown id: {}".format(target_id))
                sys.exit(1)
            words[source_id].add_link(relation, words[target_id])

    return list(words.values())
|
68
src/wani.py
68
src/wani.py
|
@@ -19,73 +19,9 @@ from syntactic_structure import build_structures
|
|||
from match_store import MatchStore
|
||||
from word_stats import WordStats
|
||||
from writer import Writer
|
||||
from loader import load_files
|
||||
|
||||
|
||||
def is_root_id(id_):
    """True when splitting *id_* on '.' produces exactly three parts."""
    parts = id_.split('.')
    return len(parts) == 3
|
||||
|
||||
|
||||
def load_files(args):
    """Yield sentences from every input file, chunked by ``args.chunk_size``.

    Reads ``args.input`` (XML filenames), ``args.skip_id_check``,
    ``args.no_msd_translate``, ``args.pc_tag`` and ``args.chunk_size``;
    parsing is delegated to ``load_xml`` and word extraction to
    ``file_sentence_generator``, whose batches are flattened via
    ``yield from``.
    """
    filenames = args.input
    skip_id_check = args.skip_id_check
    do_msd_translate = not args.no_msd_translate

    # Plain iteration: the previous ``enumerate`` bound an index that was
    # never used.
    for fname in filenames:
        et = load_xml(fname)
        yield from file_sentence_generator(et, skip_id_check, do_msd_translate,
                                           args.pc_tag, args.chunk_size)
|
||||
|
||||
|
||||
def load_xml(filename):
    """Read *filename* and return the parsed XML root element.

    The first default-namespace declaration is stripped so lookups can use
    plain tag names, and the ``xml:`` attribute prefix is removed.
    """
    # Fixed: the message previously began with a stray "\r" (a leftover from
    # progress-line overwriting) that garbled log output.
    logging.info("LOADING XML: {}".format(filename))
    with open(filename, 'r') as fp:
        content = fp.read()

    # Drop the first default xmlns declaration, then strip the ``xml:``
    # attribute prefix, before handing the document to the parser.
    xmlstring = re.sub(' xmlns="[^"]+"', '', content, count=1)
    xmlstring = xmlstring.replace(' xml:', ' ')
    return ElementTree.XML(xmlstring)
|
||||
|
||||
|
||||
def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag, chunk_size):
    """Yield batches of Word objects extracted from the parsed corpus *et*.

    Walks every <s> (sentence) element, builds a Word for each <w> token and
    each *pc_tag* punctuation element, then resolves <link> dependency edges
    between them.  Once more than *chunk_size* words have accumulated (and
    chunk_size > 0) the current batch is yielded and the buffer reset; a
    final batch (possibly empty) is always yielded at the end.

    :param et: root element of the loaded XML document
    :param skip_id_check: when True, do not abort on links out of root ids
    :param do_msd_translate: forwarded to the Word constructors
    :param pc_tag: element tag used for punctuation tokens
    :param chunk_size: max words per yielded batch; <= 0 disables chunking
    """
    words = {}
    for sentence in et.iter('s'):
        for w in sentence.iter("w"):
            words[w.get('id')] = Word(w, do_msd_translate)
        for pc in sentence.iter(pc_tag):
            words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)

        for l in sentence.iter("link"):
            if 'dep' in l.keys():
                # Endpoints given directly as 'from'/'dep' attributes.
                ana = l.get('afun')
                lfrom = l.get('from')
                dest = l.get('dep')
            else:
                # Relation carried in 'ana' with a "syn:" prefix; endpoints
                # packed into 'target' as "#id1 #id2".
                ana = l.get('ana')
                if ana[:4] != 'syn:': # dont bother...
                    continue
                ana = ana[4:]
                lfrom, dest = l.get('target').replace('#', '').split()

            if lfrom in words:
                # A link *from* a root-level id indicates malformed input.
                if not skip_id_check and is_root_id(lfrom):
                    logging.error("NOO: {}".format(lfrom))
                    sys.exit(1)

                if dest in words:
                    next_word = words[dest]
                    words[lfrom].add_link(ana, next_word)
                else:
                    logging.error("Unknown id: {}".format(dest))
                    sys.exit(1)

            else:
                # strange errors, just skip...
                pass

        # Emit a batch once the buffer grows past chunk_size (<= 0 disables
        # chunking entirely).  Chunks break only at sentence boundaries, so
        # links are always resolved within the same batch.
        if chunk_size > 0 and len(words) > chunk_size:
            yield list(words.values())
            words = {}

    yield list(words.values())
|
||||
|
||||
def match_file(words, structures):
|
||||
matches = {s: [] for s in structures}
|
||||
|
@@ -109,8 +45,6 @@ def main(args):
|
|||
match_store = MatchStore(args)
|
||||
word_stats = WordStats(lemma_msds)
|
||||
|
||||
args.chunk_size = 50000
|
||||
|
||||
if args.parallel:
|
||||
num_parallel = int(args.parallel)
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user