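"""Extract syntactic structures from a parsed corpus in ssj500k form.

Pipeline, as implemented below: input XML is parsed into Word objects with
dependency links, every syntactic structure definition is matched against
every word, and the collected matches plus statistics are written out via
Writer. With --parallel N, one child process is spawned per input file and
the per-file results are merged from pickled temporary files.
"""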
from xml.etree import ElementTree
import re
import sys
import logging
import argparse
import pickle
import time
import subprocess
import concurrent.futures
import tempfile

# tqdm is optional: fall back to a pass-through iterator when it is not installed
try:
    from tqdm import tqdm
except ImportError:
    tqdm = lambda x: x

from word import Word
from syntactic_structure import build_structures
from match_store import MatchStore
from word_stats import WordStats
from writer import Writer


def is_root_id(id_):
    """Ids with exactly three dot-separated parts are treated as sentence (root) ids."""
    return len(id_.split('.')) == 3


def load_files(args):
    filenames = args.input
    skip_id_check = args.skip_id_check
    do_msd_translate = not args.no_msd_translate

    for fname in filenames:
        et = load_xml(fname)
        yield from file_sentence_generator(et, skip_id_check, do_msd_translate, args.pc_tag, args.chunk_size)


def load_xml(filename):
    logging.info("LOADING XML: {}".format(filename))
    with open(filename, 'r') as fp:
        content = fp.read()

    # Strip the default namespace declaration and "xml:" attribute prefixes so
    # that plain tag and attribute names can be used with ElementTree below.
    xmlstring = re.sub(' xmlns="[^"]+"', '', content, count=1)
    xmlstring = xmlstring.replace(' xml:', ' ')
    return ElementTree.XML(xmlstring)


def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag, chunk_size):
    words = {}
    for sentence in et.iter('s'):
        # Collect word (<w>) and punctuation (<pc> by default) tokens by id.
        for w in sentence.iter("w"):
            words[w.get('id')] = Word(w, do_msd_translate)
        for pc in sentence.iter(pc_tag):
            words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)

        # Wire up dependency links; two encodings of <link> are handled.
        for l in sentence.iter("link"):
            if 'dep' in l.keys():
                # <link afun="..." from="..." dep="..."/>
                ana = l.get('afun')
                lfrom = l.get('from')
                dest = l.get('dep')
            else:
                # <link ana="syn:..." target="#from #dest"/>
                ana = l.get('ana')
                if ana[:4] != 'syn:':  # don't bother with non-syntactic links
                    continue
                ana = ana[4:]
                lfrom, dest = l.get('target').replace('#', '').split()

            if lfrom in words:
                if not skip_id_check and is_root_id(lfrom):
                    logging.error("Root id used as a link source: {}".format(lfrom))
                    sys.exit(1)

                if dest in words:
                    next_word = words[dest]
                    words[lfrom].add_link(ana, next_word)
                else:
                    logging.error("Unknown id: {}".format(dest))
                    sys.exit(1)
            else:
                # source id not among collected words: strange errors, just skip
                pass

        # Yield a chunk once enough words have accumulated (0 disables chunking).
        if chunk_size > 0 and len(words) > chunk_size:
            yield list(words.values())
            words = {}

    yield list(words.values())
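
# A minimal sketch of the sentence markup this generator consumes (element and
# attribute names as read above; real corpora carry more detail):
#
#   <s>
#     <w id="...">...</w>
#     <pc id="...">...</pc>
#     <link afun="..." from="..." dep="..."/>
#     <!-- or: --> <link ana="syn:..." target="#from #dest"/>
#   </s>

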
def match_file(words, structures):
    matches = {s: [] for s in structures}

    for s in structures:
        for w in words:
            for match in s.match(w):
                # A colocation is identified by the structure id followed by
                # the matched (component index, lemma) pairs, sorted by index.
                colocation_id = [(idx, word.lemma) for idx, word in match.items()]
                colocation_id = tuple([s.id] + sorted(colocation_id, key=lambda x: x[0]))

                matches[s].append((match, colocation_id))

    return matches
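
# Illustrative colocation id, per the construction above (hypothetical values):
#   ('S1', (1, 'lemma_a'), (2, 'lemma_b'))

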
def main(args):
    structures, lemma_msds, max_num_components = build_structures(args)

    match_store = MatchStore(args)
    word_stats = WordStats(lemma_msds)

    # Stream sentences in chunks of roughly this many words to bound memory use.
    args.chunk_size = 50000

    if args.parallel:
        num_parallel = int(args.parallel)

        # Temporary directory holding the pickled results of child processes.
        with tempfile.TemporaryDirectory() as tmpdirname:
            # Build the child command line: copy argv (so the original is not
            # mutated), drop the input files and the "--parallel N" pair; each
            # child then gets exactly one input file and a --match-to-file target.
            cmd = list(sys.argv)
            for inpt in args.input:
                if inpt in cmd:
                    cmd.remove(inpt)

            pidx = cmd.index('--parallel')
            del cmd[pidx]
            del cmd[pidx]

            def func(n):
                cmdn = [sys.executable] + cmd + [args.input[n],
                                                 "--match-to-file", "{}/{}.p".format(tmpdirname, n)]
                subprocess.check_call(cmdn)
                return n
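
            # Example child command line built by func (hypothetical paths):
            #   /usr/bin/python3 <this_script>.py structures.xml --out out.csv \
            #       corpus2.xml --match-to-file /tmp/tmpabc123/2.p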
            # A ThreadPoolExecutor is enough to run the subprocesses in parallel:
            # the real work happens in the child processes, not in these threads.
            with concurrent.futures.ThreadPoolExecutor(max_workers=num_parallel) as executor:
                # executor.map waits for the children and yields their indices
                for id_input in executor.map(func, range(len(args.input))):
                    with open("{}/{}.p".format(tmpdirname, id_input), "rb") as fp:
                        words, matches = pickle.load(fp)

                    match_store.add_matches(matches)
                    word_stats.add_words(words)

    else:
        for words in tqdm(load_files(args)):
            matches = match_file(words, structures)

            # Child of a parallel run: dump the results to the temporary file
            # and stop. In this mode exactly one input file must be given.
            if args.match_to_file is not None:
                with open(args.match_to_file, "wb") as fp:
                    pickle.dump((words, matches), fp)
                return
            else:
                match_store.add_matches(matches)
                word_stats.add_words(words)

    # Build word renders for each lemma/msd pair.
    word_stats.generate_renders()
    match_store.determine_colocation_dispersions()

    # Figure out representations (only needed when these outputs are requested).
    if args.out or args.out_no_stat:
        match_store.set_representations(word_stats)

    Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)
    Writer.make_output_no_stat_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)
    Writer.make_all_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)
    Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out(
        structures, match_store)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Extract structures from a parsed corpus.')
    parser.add_argument('structures',
                        help='Structure definitions in an xml file')
    parser.add_argument('input',
                        help='Input xml file(s) in ssj500k form; more than one may be listed', nargs='+')
    parser.add_argument('--out',
                        help='Classic output file')
    parser.add_argument('--out-no-stat',
                        help='Output file, but without statistical columns')
    parser.add_argument('--all',
                        help='Additional output file, writes more data')
    parser.add_argument('--stats',
                        help='Output file for statistics')

    parser.add_argument('--no-msd-translate',
                        help='MSDs are translated from Slovene to English by default; this disables that',
                        action='store_true')
    parser.add_argument('--skip-id-check',
                        help='Skip checking that ids of <w> and <pc> are in the correct format',
                        action='store_true')
    parser.add_argument('--min_freq', help='Minimal frequency in output',
                        type=int, default=0, const=1, nargs='?')
    parser.add_argument('--verbose', help='Enable verbose output to stderr',
                        choices=["warning", "info", "debug"], default="info",
                        const="info", nargs='?')
    parser.add_argument('--count-files',
                        help='Count files: more verbose output', action='store_true')
    parser.add_argument('--multiple-output',
                        help='Generate one output for each syntactic structure',
                        action='store_true')

    parser.add_argument('--sort-by',
                        help='Sort by this column (index)', type=int, default=-1)
    parser.add_argument('--sort-reversed',
                        help='Sort in reversed order', action='store_true')

    parser.add_argument('--pc-tag',
                        help='Tag for separators, usually pc or c', default='pc')
    parser.add_argument('--parallel',
                        help='Run in multiple processes, should speed things up')
    parser.add_argument('--match-to-file', help='Internal, used by --parallel children. Do not use!')
    parser.add_argument('--pickled-structures', help='Do not use!', action='store_true')

    args = parser.parse_args()
    logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())

    start = time.time()
    main(args)
    logging.info("TIME: {}".format(time.time() - start))