Loader to its own module
This commit is contained in:
parent
51cf3e7064
commit
3552f14b81
72
src/loader.py
Normal file
72
src/loader.py
Normal file
|
@@ -0,0 +1,72 @@
|
||||||
|
from xml.etree import ElementTree
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
from word import Word
|
||||||
|
|
||||||
|
|
||||||
|
def is_root_id(id_):
    """Return True when *id_* is a root-level id.

    A root-level id is one made of exactly three dot-separated
    components, i.e. it contains exactly two dots.
    """
    return id_.count('.') == 2
|
||||||
|
|
||||||
|
|
||||||
|
def load_files(args):
    """Parse every file named in ``args.input`` and yield, per file,
    the list of Word objects extracted from it.

    Reads the relevant switches off *args* once up front:
    ``skip_id_check``, ``no_msd_translate`` (inverted) and ``pc_tag``.
    """
    translate_msd = not args.no_msd_translate
    for filename in args.input:
        tree = load_xml(filename)
        yield file_sentence_generator(
            tree, args.skip_id_check, translate_msd, args.pc_tag)
|
||||||
|
|
||||||
|
|
||||||
|
def load_xml(filename):
    """Read an XML file and return its parsed root element.

    The first default ``xmlns="..."`` declaration and every ``xml:``
    attribute prefix are stripped from the raw text before parsing, so
    downstream code can address tags and attributes by their plain
    names (``id`` instead of ``xml:id`` / ``{ns}tag``).
    """
    # Lazy %-style args: the message is only formatted if INFO is enabled.
    logging.info("LOADING XML: %s", filename)
    # Explicit encoding: XML corpora are UTF-8; without this the read
    # depends on the platform's locale default and can mis-decode.
    with open(filename, 'r', encoding='utf-8') as fp:
        content = fp.read()

    # Remove only the first (default) namespace declaration ...
    xmlstring = re.sub(' xmlns="[^"]+"', '', content, count=1)
    # ... and drop the "xml:" prefix on attributes everywhere.
    xmlstring = xmlstring.replace(' xml:', ' ')
    return ElementTree.XML(xmlstring)
|
||||||
|
|
||||||
|
|
||||||
|
def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
    """Collect every token in the parsed document *et* and attach links.

    For each sentence (<s>) every <w> element becomes a ``Word`` and
    every punctuation element (tag name given by *pc_tag*) becomes a
    punctuation ``Word``.  <link> elements are then resolved and the
    named relation is attached to the source word.  Returns all
    collected words as one list.

    et: root element produced by ``load_xml``.
    skip_id_check: when True, do not abort on links originating from a
        root-level id (see ``is_root_id``).
    do_msd_translate: forwarded to ``Word`` — whether to translate MSD
        tags (semantics live in the Word class, not visible here).
    pc_tag: element tag used for punctuation tokens.

    NOTE: exits the whole process via ``sys.exit(1)`` on a failed id
    check or an unknown link destination.
    """
    words = {}
    # Materialize the sentences so tqdm can display a total/progress bar.
    sentences = list(et.iter('s'))
    for sentence in tqdm(sentences, desc="load-text"):
        # Index every word token by its id attribute.
        for w in sentence.iter("w"):
            words[w.get('id')] = Word(w, do_msd_translate)
        # Punctuation tokens use the alternate Word constructor.
        for pc in sentence.iter(pc_tag):
            words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)

        for l in sentence.iter("link"):
            if 'dep' in l.keys():
                # Link endpoints given directly via from/dep attributes;
                # relation name is in @afun.
                ana = l.get('afun')
                lfrom = l.get('from')
                dest = l.get('dep')
            else:
                # Otherwise the relation is encoded as "syn:<rel>" in @ana
                # and both endpoints live in @target as "#id #id".
                ana = l.get('ana')
                if ana[:4] != 'syn:':  # dont bother...
                    continue
                ana = ana[4:]
                lfrom, dest = l.get('target').replace('#', '').split()

            if lfrom in words:
                # A link must not originate from a root-level id.
                if not skip_id_check and is_root_id(lfrom):
                    logging.error("NOO: {}".format(lfrom))
                    sys.exit(1)

                if dest in words:
                    next_word = words[dest]
                    words[lfrom].add_link(ana, next_word)
                else:
                    # Destination id never seen as a token: fatal.
                    logging.error("Unknown id: {}".format(dest))
                    sys.exit(1)

            else:
                # strange errors, just skip...
                pass

    return list(words.values())
|
68
src/wani.py
68
src/wani.py
|
@@ -19,73 +19,9 @@ from syntactic_structure import build_structures
|
||||||
from match_store import MatchStore
|
from match_store import MatchStore
|
||||||
from word_stats import WordStats
|
from word_stats import WordStats
|
||||||
from writer import Writer
|
from writer import Writer
|
||||||
|
from loader import load_files
|
||||||
|
|
||||||
|
|
||||||
def is_root_id(id_):
    # True when id_ has exactly three dot-separated parts, i.e. it is a
    # root-level (document.paragraph.sentence-style) id — TODO confirm
    # the exact id scheme against the corpus documentation.
    return len(id_.split('.')) == 3
|
|
||||||
|
|
||||||
|
|
||||||
def load_files(args):
    """Parse each file in args.input and yield its words in chunks.

    Delegates per-file work to file_sentence_generator, which yields
    lists of at most ~args.chunk_size words; ``yield from`` flattens
    those chunks into this generator's output.
    """
    filenames = args.input
    skip_id_check = args.skip_id_check
    # Flag is a negative option; invert it once here.
    do_msd_translate = not args.no_msd_translate

    # NOTE(review): n is never used — enumerate() could be dropped.
    for n, fname in enumerate(filenames):
        et = load_xml(fname)
        yield from file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag, args.chunk_size)
|
|
||||||
|
|
||||||
|
|
||||||
def load_xml(filename):
    """Read an XML file and return its parsed root element.

    Strips the first default ``xmlns="..."`` declaration and every
    ``xml:`` attribute prefix from the raw text before parsing, so
    later code can use plain tag/attribute names.
    """
    # "\r" rewinds the cursor so successive messages overwrite one line.
    logging.info("\rLOADING XML: {}".format(filename))
    # NOTE(review): no explicit encoding — relies on the platform default.
    with open(filename, 'r') as fp:
        content = fp.read()

    # Remove only the first (default) namespace declaration ...
    xmlstring = re.sub(' xmlns="[^"]+"', '', content, count=1)
    # ... and drop the "xml:" attribute prefix everywhere.
    xmlstring = xmlstring.replace(' xml:', ' ')
    return ElementTree.XML(xmlstring)
|
|
||||||
|
|
||||||
|
|
||||||
def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag, chunk_size):
    """Yield the document's words in chunks of roughly *chunk_size*.

    For each sentence (<s>) every <w> element becomes a ``Word`` and
    every *pc_tag* element a punctuation ``Word``; <link> elements are
    resolved and attached to their source words.  Whenever the working
    dict exceeds *chunk_size* entries (and chunk_size > 0) the current
    batch is yielded and the dict reset; a final (possibly empty) batch
    is always yielded at the end.

    NOTE: exits the whole process via ``sys.exit(1)`` on a failed id
    check or an unknown link destination.
    """
    words = {}
    for sentence in et.iter('s'):
        # Index every word token by its id attribute.
        for w in sentence.iter("w"):
            words[w.get('id')] = Word(w, do_msd_translate)
        # Punctuation tokens use the alternate Word constructor.
        for pc in sentence.iter(pc_tag):
            words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)

        for l in sentence.iter("link"):
            if 'dep' in l.keys():
                # Endpoints given directly via from/dep; relation in @afun.
                ana = l.get('afun')
                lfrom = l.get('from')
                dest = l.get('dep')
            else:
                # Relation encoded as "syn:<rel>" in @ana; endpoints live
                # in @target as "#id #id".
                ana = l.get('ana')
                if ana[:4] != 'syn:':  # dont bother...
                    continue
                ana = ana[4:]
                lfrom, dest = l.get('target').replace('#', '').split()

            if lfrom in words:
                # A link must not originate from a root-level id.
                if not skip_id_check and is_root_id(lfrom):
                    logging.error("NOO: {}".format(lfrom))
                    sys.exit(1)

                if dest in words:
                    next_word = words[dest]
                    words[lfrom].add_link(ana, next_word)
                else:
                    # Destination id never seen as a token: fatal.
                    logging.error("Unknown id: {}".format(dest))
                    sys.exit(1)

            else:
                # strange errors, just skip...
                pass

        # Chunk boundary check happens per sentence, so a chunk can
        # exceed chunk_size by up to one sentence's worth of words.
        if chunk_size > 0 and len(words) > chunk_size:
            yield list(words.values())
            words = {}

    # Flush whatever is left (yields [] for an empty/fully-flushed tail).
    yield list(words.values())
|
|
||||||
|
|
||||||
def match_file(words, structures):
|
def match_file(words, structures):
|
||||||
matches = {s: [] for s in structures}
|
matches = {s: [] for s in structures}
|
||||||
|
@@ -109,8 +45,6 @@ def main(args):
|
||||||
match_store = MatchStore(args)
|
match_store = MatchStore(args)
|
||||||
word_stats = WordStats(lemma_msds)
|
word_stats = WordStats(lemma_msds)
|
||||||
|
|
||||||
args.chunk_size = 50000
|
|
||||||
|
|
||||||
if args.parallel:
|
if args.parallel:
|
||||||
num_parallel = int(args.parallel)
|
num_parallel = int(args.parallel)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user