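"""Extract syntactic structures from a parsed corpus in ssj500k form.

Pipeline, as implemented below: input XML is parsed into Word objects with
dependency links, every syntactic structure definition is matched against
every word, and the collected matches plus statistics are written out via
Writer. With --parallel N, one child process is spawned per input file and
the per-file results are merged from pickled temporary files.
"""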
from xml.etree import ElementTree
import re
import sys
import logging
import argparse
import pickle
import time
import subprocess
import concurrent.futures
import tempfile

# tqdm is optional: fall back to a pass-through iterator when it is not installed
try:
    from tqdm import tqdm
except ImportError:
    tqdm = lambda x: x

from word import Word
from syntactic_structure import build_structures
from match_store import MatchStore
from word_stats import WordStats
from writer import Writer


def is_root_id(id_):
    """Ids with exactly three dot-separated parts are treated as sentence (root) ids."""
    return len(id_.split('.')) == 3


def load_files(args):
    filenames = args.input
    skip_id_check = args.skip_id_check
    do_msd_translate = not args.no_msd_translate

    for fname in filenames:
        et = load_xml(fname)
        yield from file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag, args.chunk_size)


def load_xml(filename):
    logging.info("LOADING XML: {}".format(filename))
    with open(filename, 'r') as fp:
        content = fp.read()

    # Strip the default namespace declaration and "xml:" attribute prefixes so
    # that plain tag and attribute names can be used with ElementTree below.
    xmlstring = re.sub(' xmlns="[^"]+"', '', content, count=1)
    xmlstring = xmlstring.replace(' xml:', ' ')
    return ElementTree.XML(xmlstring)


def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag, chunk_size):
    words = {}
    for sentence in et.iter('s'):
        # Collect word (<w>) and punctuation (<pc> by default) tokens by id.
        for w in sentence.iter("w"):
            words[w.get('id')] = Word(w, do_msd_translate)
        for pc in sentence.iter(pc_tag):
            words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)

        # Wire up dependency links; two encodings of <link> are handled.
        for l in sentence.iter("link"):
            if 'dep' in l.keys():
                # <link afun="..." from="..." dep="..."/>
                ana = l.get('afun')
                lfrom = l.get('from')
                dest = l.get('dep')
            else:
                # <link ana="syn:..." target="#from #dest"/>
                ana = l.get('ana')
                if ana[:4] != 'syn:':  # don't bother with non-syntactic links
                    continue
                ana = ana[4:]
                lfrom, dest = l.get('target').replace('#', '').split()

            if lfrom in words:
                if not skip_id_check and is_root_id(lfrom):
                    logging.error("Root id used as a link source: {}".format(lfrom))
                    sys.exit(1)

                if dest in words:
                    next_word = words[dest]
                    words[lfrom].add_link(ana, next_word)
                else:
                    logging.error("Unknown id: {}".format(dest))
                    sys.exit(1)
            else:
                # source id not among collected words: strange errors, just skip
                pass

        # Yield a chunk once enough words have accumulated (0 disables chunking).
        if chunk_size > 0 and len(words) > chunk_size:
            yield list(words.values())
            words = {}

    yield list(words.values())
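
# A minimal sketch of the sentence markup this generator consumes (element and
# attribute names as read above; real corpora carry more detail):
#
#   <s>
#     <w id="...">...</w>
#     <pc id="...">...</pc>
#     <link afun="..." from="..." dep="..."/>
#     <!-- or: --> <link ana="syn:..." target="#from #dest"/>
#   </s>

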
def match_file(words, structures):
    matches = {s: [] for s in structures}

    for s in structures:
        for w in words:
            for match in s.match(w):
                # A colocation is identified by the structure id followed by
                # the matched (component index, lemma) pairs, sorted by index.
                colocation_id = [(idx, word.lemma) for idx, word in match.items()]
                colocation_id = tuple([s.id] + sorted(colocation_id, key=lambda x: x[0]))

                matches[s].append((match, colocation_id))

    return matches
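
# Illustrative colocation id, per the construction above (hypothetical values):
#   ('S1', (1, 'lemma_a'), (2, 'lemma_b'))

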
def main(args):
    structures, lemma_msds, max_num_components = build_structures(args)

    match_store = MatchStore(args)
    word_stats = WordStats(lemma_msds)

    # Stream sentences in chunks of roughly this many words to bound memory use.
    args.chunk_size = 50000

    if args.parallel:
        num_parallel = int(args.parallel)

        # Temporary directory holding the pickled results of child processes.
        with tempfile.TemporaryDirectory() as tmpdirname:
            # Build the child command line: copy argv (so the original is not
            # mutated), drop the input files and the "--parallel N" pair; each
            # child then gets exactly one input file and a --match-to-file target.
            cmd = list(sys.argv)
            for inpt in args.input:
                if inpt in cmd:
                    cmd.remove(inpt)

            pidx = cmd.index('--parallel')
            del cmd[pidx]
            del cmd[pidx]

            def func(n):
                cmdn = [sys.executable] + cmd + [args.input[n],
                                                 "--match-to-file", "{}/{}.p".format(tmpdirname, n)]
                subprocess.check_call(cmdn)
                return n
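
            # Example child command line built by func (hypothetical paths):
            #   /usr/bin/python3 <this_script>.py structures.xml --out out.csv \
            #       corpus2.xml --match-to-file /tmp/tmpabc123/2.p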
            # A ThreadPoolExecutor is enough to run the subprocesses in parallel:
            # the real work happens in the child processes, not in these threads.
            with concurrent.futures.ThreadPoolExecutor(max_workers=num_parallel) as executor:
                # executor.map waits for the children and yields their indices
                for id_input in executor.map(func, range(len(args.input))):
                    with open("{}/{}.p".format(tmpdirname, id_input), "rb") as fp:
                        words, matches = pickle.load(fp)

                    match_store.add_matches(matches)
                    word_stats.add_words(words)

    else:
        for words in tqdm(load_files(args)):
            matches = match_file(words, structures)

            # Child of a parallel run: dump the results to the temporary file
            # and stop. In this mode exactly one input file must be given.
            if args.match_to_file is not None:
                with open(args.match_to_file, "wb") as fp:
                    pickle.dump((words, matches), fp)
                return
            else:
                match_store.add_matches(matches)
                word_stats.add_words(words)

    # Build word renders for each lemma/msd pair.
    word_stats.generate_renders()
    match_store.determine_colocation_dispersions()

    # Figure out representations (only needed when these outputs are requested).
    if args.out or args.out_no_stat:
        match_store.set_representations(word_stats)

    Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)
    Writer.make_output_no_stat_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)
    Writer.make_all_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)
    Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Extract structures from a parsed corpus.')
    parser.add_argument('structures',
                        help='Structure definitions in an xml file')
    parser.add_argument('input',
                        help='Input xml file(s) in ssj500k form; more than one may be listed', nargs='+')
    parser.add_argument('--out',
                        help='Classic output file')
    parser.add_argument('--out-no-stat',
                        help='Output file, but without statistical columns')
    parser.add_argument('--all',
                        help='Additional output file, writes more data')
    parser.add_argument('--stats',
                        help='Output file for statistics')

    parser.add_argument('--no-msd-translate',
                        help='MSDs are translated from Slovene to English by default; this disables that',
                        action='store_true')
    parser.add_argument('--skip-id-check',
                        help='Skip checking that ids of <w> and <pc> are in the correct format',
                        action='store_true')
    parser.add_argument('--min_freq', help='Minimal frequency in output',
                        type=int, default=0, const=1, nargs='?')
    parser.add_argument('--verbose', help='Enable verbose output to stderr',
                        choices=["warning", "info", "debug"], default="info",
                        const="info", nargs='?')
    parser.add_argument('--count-files',
                        help='Count files: more verbose output', action='store_true')
    parser.add_argument('--multiple-output',
                        help='Generate one output for each syntactic structure',
                        action='store_true')

    parser.add_argument('--sort-by',
                        help='Sort by this column (index)', type=int, default=-1)
    parser.add_argument('--sort-reversed',
                        help='Sort in reversed order', action='store_true')

    parser.add_argument('--pc-tag',
                        help='Tag for separators, usually pc or c', default='pc')
    parser.add_argument('--parallel',
                        help='Run in multiple processes, should speed things up')
    parser.add_argument('--match-to-file', help='Internal, used by --parallel children. Do not use!')
    parser.add_argument('--pickled-structures', help='Do not use!', action='store_true')

    args = parser.parse_args()
    logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())

    start = time.time()
    main(args)
    logging.info("TIME: {}".format(time.time() - start))