luscenje_struktur/wani.py
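
# wani.py: extract syntactic structures (collocation candidates) from parsed
# corpus files in ssj500k/TEI form, as described by an XML file of structure
# definitions. See the argparse setup at the bottom for the command-line interface.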

from xml.etree import ElementTree
import re
from enum import Enum
from collections import defaultdict
import sys
import logging
import argparse
import pickle
import time
import subprocess
import concurrent.futures
import tempfile
from msd_translate import MSD_TRANSLATE
MAX_NUM_COMPONENTS = 5
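
# Single-letter codes used at each position of an MSD (morphosyntactic
# description) tag, keyed by the English name of the category or feature value.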
CODES = {
    # part-of-speech categories
    "Noun": "N", "Verb": "V", "Adjective": "A", "Adverb": "R",
    "Pronoun": "P", "Numeral": "M", "Preposition": "S", "Conjunction": "C",
    "Particle": "Q", "Interjection": "I", "Abbreviation": "Y", "Residual": "X",

    # feature values (type, gender, number, case, aspect, degree, ...)
    "common": "c", "proper": "p",
    "masculine": "m", "feminine": "f", "neuter": "n",
    "singular": "s", "dual": "d", "plural": "p",
    "nominative": "n", "genitive": "g", "dative": "d",
    "accusative": "a", "locative": "l", "instrumental": "i",
    "no": "n", "yes": "y",
    "main": "m", "auxiliary": "a",
    "perfective": "e", "progressive": "p", "biaspectual": "b",
    "infinitive": "n", "supine": "u", "participle": "p",
    "present": "r", "future": "f", "conditional": "c", "imperative": "m",
    "first": "1", "second": "2", "third": "3",
    "general": "g", "possessive": "s",
    "positive": "p", "comparative": "c", "superlative": "s",
    "personal": "p", "demonstrative": "d", "relative": "r", "reflexive": "x",
    "interrogative": "q", "indefinite": "i", "negative": "z",
    "bound": "b", "digit": "d", "roman": "r", "letter": "l",
    "cardinal": "c", "ordinal": "o", "pronominal": "p", "special": "s",
    "coordinating": "c", "subordinating": "s",
    "foreign": "f", "typo": "t", "program": "p",
}
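
# For each part-of-speech code, the order of attributes that make up the rest
# of the MSD tag; CATEGORY_BASES gives the corresponding "match anything"
# pattern for every position.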
TAGSET = {
    "N": ['type', 'gender', 'number', 'case', 'animate'],
    "V": ['type', 'aspect', 'vform', 'person', 'number', 'gender', 'negative'],
    "A": ['type', 'degree', 'gender', 'number', 'case', 'definiteness'],
    "R": ['type', 'degree'],
    "P": ['type', 'person', 'gender', 'number', 'case', 'owner_number', 'owned_gender', 'clitic'],
    "M": ['form', 'type', 'gender', 'number', 'case', 'definiteness'],
    "S": ['case'],
    "C": ['type'],
    "Q": [],
    "I": [],
    "Y": [],
    "X": ['type']
}

CATEGORY_BASES = {
    "N": ['.'] * 5,
    "V": ['.'] * 7,
    "A": ['.'] * 6,
    "R": ['.'] * 2,
    "P": ['.'] * 6,
    "M": ['.'] * 6,
    "S": ['.'] * 1,
    "C": ['.'] * 1,
    "Q": [],
    "I": [],
    "Y": [],
    "X": ['.'] * 1
}

class RestrictionType(Enum):
    Morphology = 0
    Lexis = 1
    MatchAll = 2

class Rendition(Enum):
    Lemma = 0
    WordForm = 1
    Unknown = 2

class Order(Enum):
    FromTo = 0
    ToFrom = 1
    Any = 2

    @staticmethod
    def new(order):
        if order is not None:
            if order == "to-from":
                return Order.ToFrom
            elif order == "from-to":
                return Order.FromTo
            else:
                raise NotImplementedError("What kind of ordering is: {}".format(order))
        else:
            return Order.Any

    def match(self, from_w, to_w):
        if self is Order.Any:
            return True

        fi = from_w.int_id
        ti = to_w.int_id

        if self is Order.FromTo:
            return fi < ti
        elif self is Order.ToFrom:
            return ti < fi
        else:
            raise NotImplementedError("Should not be here: Order match")

class ComponentRendition:
    def __init__(self, rendition=Rendition.Unknown):
        self.word_form = {}
        self.rendition = rendition

    def render(self, word):
        if self.rendition == Rendition.Lemma:
            return word.lemma
        elif self.rendition == Rendition.WordForm:
            return word.text
        elif self.rendition == Rendition.Unknown:
            return None
        else:
            raise RuntimeError("Unknown rendition: {}".format(self.rendition))

    def __str__(self):
        return str(self.rendition)

# don't know...
class StructureSelection(Enum):
    All = 0
    Frequency = 1

class ComponentRepresentation:
    def new(s):
        if 'rendition' in s:
            if s['rendition'] == "lemma":
                return ComponentRendition(Rendition.Lemma)
            elif s['rendition'] == "word_form":
                return ComponentRendition(Rendition.WordForm)
            else:
                raise NotImplementedError("Rendition: {}".format(s))
        elif 'selection' in s:
            if s['selection'] == "frequency":
                return StructureSelection.Frequency
            elif s['selection'] == "all":
                return StructureSelection.All
            else:
                return {s['selection']: s['value']}
        else:
            return None

class ComponentStatus(Enum):
    Optional = 0
    Required = 1
    Forbidden = 2

    def __str__(self):
        if self == ComponentStatus.Optional:
            return "?"
        elif self == ComponentStatus.Required:
            return "!"
        else:  # Forbidden
            return "X"

def get_level(restriction):
    for feature in restriction:
        if "level" in feature.keys():
            lvl = feature.get("level")
        else:
            continue

        raise RuntimeError("Unreachable!")
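
# Build, from a <restriction type="morphology"> element, both a human-readable
# per-position pattern over the MSD tag and a matcher function that tests a
# word's MSD against it.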
def build_morphology_regex(restriction):
    restr_dict = {}
    for feature in restriction:
        feature_dict = dict(feature.items())

        match_type = True
        if "filter" in feature_dict:
            assert(feature_dict['filter'] == "negative")
            match_type = False
            del feature_dict['filter']

        assert(len(feature_dict) == 1)
        key, value = next(iter(feature_dict.items()))
        restr_dict[key] = (value, match_type)

    assert('POS' in restr_dict)
    category = restr_dict['POS'][0].capitalize()
    cat_code = CODES[category]
    rgx = [cat_code] + CATEGORY_BASES[cat_code]

    del restr_dict['POS']

    min_msd_length = 1

    for attribute, (value, typ) in restr_dict.items():
        index = TAGSET[cat_code].index(attribute.lower())
        assert(index >= 0)

        if '|' in value:
            match = "".join(CODES[val] for val in value.split('|'))
        else:
            match = CODES[value]

        match = "[{}{}]".format("" if typ else "^", match)
        rgx[index + 1] = match

        if typ:
            min_msd_length = max(index + 1, min_msd_length)

    def matcher(text):
        if len(text) <= min_msd_length:
            return False
        for c, r in zip(text, rgx):
            if not re.match(r, c):
                return False
        return True

    return " ".join(rgx), matcher

def build_lexis_regex(restriction):
    restr_dict = {}
    for feature in restriction:
        restr_dict.update(feature.items())

    assert("lemma" in restr_dict)
    match_list = restr_dict['lemma'].split('|')
    return match_list, lambda text: text in match_list
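
# A single restriction on a component: either a morphological pattern over the
# MSD, a lexical list of allowed lemmas, or "match everything".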
class Restriction:
    def __init__(self, restriction_tag):
        if restriction_tag is None:
            self.type = RestrictionType.MatchAll
            self.matcher = None
            self.present = None
            return

        restriction_type = restriction_tag.get('type')
        if restriction_type == "morphology":
            self.type = RestrictionType.Morphology
            self.present, self.matcher = build_morphology_regex(list(restriction_tag))
        elif restriction_type == "lexis":
            self.type = RestrictionType.Lexis
            self.present, self.matcher = build_lexis_regex(list(restriction_tag))
        else:
            raise NotImplementedError()

    def match(self, word):
        if self.type == RestrictionType.Morphology:
            match_to = word.msd
        elif self.type == RestrictionType.Lexis:
            match_to = word.lemma
        elif self.type == RestrictionType.MatchAll:
            return True
        else:
            raise RuntimeError("Unreachable!")

        return self.matcher(match_to)

    def __str__(self):
        return "({:s} {})".format(str(self.type).split('.')[1], self.present)

    def __repr__(self):
        return str(self)
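
# One component (slot) of a syntactic structure: its status (required, optional
# or forbidden), restriction, rendition and outgoing dependency links. match()
# recursively matches a word and everything reachable through those links.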
class Component:
    def __init__(self, info):
        idx = info['cid']
        name = info['name'] if 'name' in info else None

        if 'status' not in info:
            status = ComponentStatus.Required
        elif info['status'] == 'forbidden':
            status = ComponentStatus.Forbidden
        elif info['status'] == 'obligatory':
            status = ComponentStatus.Required
        elif info['status'] == 'optional':
            status = ComponentStatus.Optional
        else:
            raise NotImplementedError("strange status: {}".format(info['status']))

        self.status = status
        self.name = name
        self.idx = idx
        self.restriction = None
        self.next_element = []
        self.rendition = ComponentRendition()
        self.selection = {}
        self.iter_ctr = 0

    def render_word(self, word):
        return self.rendition.render(word)

    def add_next(self, next_component, link_label, order):
        self.next_element.append((next_component, link_label, Order.new(order)))

    def set_restriction(self, restrictions_tag):
        if restrictions_tag is None:
            self.restriction = Restriction(None)
        elif restrictions_tag.tag == "restriction":
            self.restriction = Restriction(restrictions_tag)
        elif restrictions_tag.tag == "restriction_or":
            self.restriction = [Restriction(el) for el in restrictions_tag]
        else:
            raise RuntimeError("Unreachable")

    def set_representation(self, representation):
        cr = None
        if representation is not None:
            self.representation = []
            for feature in representation:
                f = ComponentRepresentation.new(dict(feature.attrib))

                if f is None:
                    logging.warning("Unknown representation in component {}, skipping...".format(self.idx))
                    continue

                if type(f) is StructureSelection:
                    assert(cr is None)
                    cr = f
                elif type(f) is ComponentRendition:
                    self.rendition = f
                elif type(f) is dict:
                    self.selection.update(f)
                else:
                    raise RuntimeError("Unreachable: {}".format(f))

        return cr
    def find_next(self, deps, comps, restrs, reprs):
        representation = StructureSelection.All
        to_ret = []
        for d in deps:
            if d[0] == self.idx:
                _, idx, dep_label, order = d

                next_component = Component(comps[idx])
                next_component.set_restriction(restrs[idx])
                r1 = next_component.set_representation(reprs[idx])
                to_ret.append(next_component)

                self.add_next(next_component, dep_label, order)
                others, r2 = next_component.find_next(deps, comps, restrs, reprs)
                to_ret.extend(others)

                if StructureSelection.Frequency in (r1, r2):
                    representation = StructureSelection.Frequency

        return to_ret, representation

    def name_str(self):
        return "_" if self.name is None else self.name

    def __str__(self):
        n = self.name_str()
        return "{:s}) {:7s}:{} [{}] :{}".format(
            self.idx, n, self.status, self.restriction, self.rendition)

    def tree(self):
        el = []
        for next, link, order in self.next_element:
            s = "{:3} -- {:5} --> {:3}".format(self.idx, link, next.idx)
            if order != Order.Any:
                s += " " + str(order)[6:]
            el.append(s)
            el.extend(next.tree())
        return el

    def __repr__(self):
        return str(self)

    def match(self, word):
        m1 = self._match_self(word)
        if m1 is None:
            return None

        mn = self._match_next(word)
        if mn is None:
            return None

        to_ret = [m1]
        for cmatch in mn:
            # if good match but nothing to add, just continue
            if len(cmatch) == 0:
                continue

            # if more than one match found for a particular component
            elif len(cmatch) > 1:
                logging.debug("MULTIPLE: {}, {}".format(self.idx, cmatch))

                # if more than one match in multiple components, NOPE!
                if len(to_ret) > 1:
                    logging.warning("Strange multiple match: {}".format(
                        str([w.id for w in cmatch[0].values()])))

                    for tr in to_ret:
                        tr.update(cmatch[0])
                    continue

                # we have found more than one match, so from now on there is
                # more than one element in to_ret
                to_ret = [{**dict(to_ret[0]), **m} for m in cmatch]

            else:
                for tr in to_ret:
                    tr.update(cmatch[0])

        logging.debug("MA: {}".format(str(to_ret)))
        return to_ret

    def _match_self(self, word):
        matched = None

        # matching
        if type(self.restriction) is list:
            for restr in self.restriction:
                matched = restr.match(word)
                if matched:  # match either
                    break
        else:
            matched = self.restriction.match(word)

        logging.debug("SELF MATCH({}): {} -> {}".format(self.idx, word.text, matched))

        if not matched:
            return None
        else:
            return {self.idx: word}

    def _match_next(self, word):
        # matches for every component linked from this component
        to_ret = []

        # need to get all links that match
        for next, link, order in self.next_element:
            next_links = word.get_links(link)
            logging.debug("FIND LINKS FOR: {} -> {}: #{}".format(self.idx, next.idx, len(next_links)))
            to_ret.append([])

            # good flag
            good = next.status != ComponentStatus.Required

            for next_word in next_links:
                logging.debug("link: {}: {} -> {}".format(link, word.id, next_word.id))
                if not order.match(word, next_word):
                    continue

                match = next.match(next_word)
                if match is not None:
                    # special treatment for forbidden components
                    if next.status == ComponentStatus.Forbidden:
                        good = False
                        break
                    else:
                        assert(type(match) is list)
                        to_ret[-1].extend(match)
                        good = True

            # if none matched, nothing found!
            if not good:
                logging.debug("BAD")
                return None

        return to_ret
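
# A full syntactic structure definition parsed from the structures XML: its
# components, agreement constraints between pairs of components, and fixed-form
# ("selection") requirements. match() returns candidate matches for a word
# together with an "OK"/"Agreement"/"Form" verdict.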
class SyntacticStructure:
    def __init__(self):
        self.id = None
        self.lbs = None
        self.agreements = []
        self.components = []
        self.selection = StructureSelection.All

    @staticmethod
    def from_xml(xml):
        st = SyntacticStructure()
        st.id = xml.get('id')
        st.lbs = xml.get('LBS')

        assert(len(list(xml)) == 1)
        system = next(iter(xml))

        assert(system.get('type') == 'JOS')
        components, dependencies, definitions = list(system)

        deps = [(dep.get('from'), dep.get('to'), dep.get('label'), dep.get('order'))
                for dep in dependencies]
        comps = {comp.get('cid'): dict(comp.items()) for comp in components}

        restrs, forms = {}, {}

        for comp in definitions:
            n = comp.get('cid')
            restrs[n] = None
            forms[n] = None

            for el in comp:
                if el.tag.startswith("restriction"):
                    assert(restrs[n] is None)
                    restrs[n] = el
                elif el.tag.startswith("representation"):
                    st.add_representation(n, el, forms)
                else:
                    raise NotImplementedError("definition??")

        fake_root_component = Component({'cid': '#', 'type': 'other'})
        st.components, st.selection = fake_root_component.find_next(deps, comps, restrs, forms)
        return st
    def add_representation(self, n, el, forms):
        if el.tag == "representation":
            els = [el]
        elif el.tag == "representation_and":
            els = list(el)
        else:
            raise NotImplementedError("Unknown representation tag: {}".format(el.tag))

        for el in els:
            if el.get('basic') == 'form':
                assert(forms[n] is None)
                forms[n] = el
            elif el.get('basic') == "agreement":
                self.add_agreement(n, el)
            else:
                logging.warning("Strange representation (basic={}) in structure {}. Skipping"
                                .format(el.get('basic'), self.id))
                continue

    def add_agreement(self, n, el):
        assert(el.get('head')[:4] == 'cid_')

        n1 = n
        n2 = el.get('head')[4:]
        agreement_str = next(iter(el)).get('agreement')

        self.agreements.append({
            'n1': n1,
            'n2': n2,
            'match': agreement_str.split('|')})

    def __str__(self):
        comp_str = "\n".join(str(comp) for comp in self.components)
        agrs = "\n".join("({} -[{}]- {}) ".format(
            a['n1'], "|".join(a['match']), a['n2']) for a in self.agreements)
        links_str = "\n".join(self.components[0].tree())
        return "{} LBS {}\nCOMPONENTS\n{}\nAGREEMENTS\n{}\nLINKS\n{}\n{}".format(
            self.id, self.lbs, comp_str, agrs, links_str, "-" * 40)
    def get_component(self, idx):
        for c in self.components:
            if c.idx == idx:
                return c
        raise RuntimeError("Unknown component id: {}".format(idx))

    def check_agreements(self, match):
        for agr in self.agreements:
            w1 = match[agr['n1']]
            w2 = match[agr['n2']]

            for agr_case in agr['match']:
                t1 = w1.msd[0]
                v1 = TAGSET[t1].index(agr_case)
                assert(v1 >= 0)
                # if none specified: infinitive, always agrees
                if v1 + 1 >= len(w1.msd):
                    continue
                # first character is the uppercase category, not in TAGSET
                m1 = w1.msd[v1 + 1]

                # REPEAT (not DRY!)
                t2 = w2.msd[0]
                v2 = TAGSET[t2].index(agr_case)
                assert(v2 >= 0)
                if v2 + 1 >= len(w2.msd):
                    continue
                m2 = w2.msd[v2 + 1]

                # match!
                if '-' not in [m1, m2] and m1 != m2:
                    return False

        return True

    def check_form(self, match):
        for midx, w in match.items():
            c = self.get_component(midx)
            for key, value in c.selection.items():
                t = w.msd[0]
                v = TAGSET[t].index(key.lower())
                f1 = w.msd[v + 1]
                f2 = CODES[value]

                if '-' not in [f1, f2] and f1 != f2:
                    return False

        return True

    def match(self, word):
        matches = self.components[0].match(word)
        if matches is None:
            return []

        to_ret = []
        for m in matches:
            if not self.check_agreements(m):
                bad = "Agreement"
            elif not self.check_form(m):
                bad = "Form"
            else:
                bad = "OK"

            to_ret.append((m, bad))

        return to_ret

def build_structures(filename):
    structures = []
    with open(filename, 'r') as fp:
        et = ElementTree.XML(fp.read())
        for structure in et.iter('syntactic_structure'):
            to_append = SyntacticStructure.from_xml(structure)
            if to_append is None:
                continue
            structures.append(to_append)

    return structures

def get_msd(comp):
    d = dict(comp.items())
    if 'msd' in d:
        return d['msd']
    elif 'ana' in d:
        return d['ana'][4:]
    else:
        logging.error(d)
        raise NotImplementedError("MSD?")
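
# A single token from the corpus: lemma, (possibly translated) MSD, id, surface
# form and its outgoing syntactic links to other words.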
class Word:
    def __init__(self, xml, do_msd_translate):
        self.lemma = xml.get('lemma')
        self.msd = MSD_TRANSLATE[get_msd(xml)] if do_msd_translate else get_msd(xml)
        self.id = xml.get('id')
        self.text = xml.text
        self.links = defaultdict(list)

        last_num = self.id.split('.')[-1]
        if last_num[0] not in '0123456789':
            last_num = last_num[1:]
        self.int_id = int(last_num)

        assert(None not in (self.id, self.lemma, self.msd))

    @staticmethod
    def pcWord(pc, do_msd_translate):
        pc.set('lemma', pc.text)
        pc.set('msd', "N" if do_msd_translate else "U")
        return Word(pc, do_msd_translate)

    def add_link(self, link, to):
        self.links[link].append(to)

    def get_links(self, link):
        if link not in self.links and "|" in link:
            for l in link.split('|'):
                self.links[link].extend(self.links[l])

        return self.links[link]

def is_root_id(id_):
    return len(id_.split('.')) == 3

def load_files(args):
    filenames = args.input
    skip_id_check = args.skip_id_check
    do_msd_translate = not args.no_msd_translate

    for n, fname in enumerate(filenames):
        if args.count_files:
            status = " :: {} / {}".format(n, len(filenames))
        else:
            status = ""
        yield load_tei_file(fname, skip_id_check, do_msd_translate, args.pc_tag, status)
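
# Load one TEI file: strip the XML namespace, build Word objects for <w> and
# punctuation elements, then wire up syntactic dependency links between them.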
def load_tei_file(filename, skip_id_check, do_msd_translate, pc_tag, status):
    logging.info("LOADING FILE: {}{}".format(filename, status))

    with open(filename, 'r') as fp:
        xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1)
        xmlstring = xmlstring.replace(' xml:', ' ')
        et = ElementTree.XML(xmlstring)

    words = {}
    for w in et.iter("w"):
        words[w.get('id')] = Word(w, do_msd_translate)
    for pc in et.iter(pc_tag):
        words[pc.get('id')] = Word.pcWord(pc, do_msd_translate)

    for l in et.iter("link"):
        if 'dep' in l.keys():
            ana = l.get('afun')
            lfrom = l.get('from')
            dest = l.get('dep')
        else:
            ana = l.get('ana')
            if ana[:4] != 'syn:':  # don't bother...
                continue
            ana = ana[4:]
            lfrom, dest = l.get('target').replace('#', '').split()

        if lfrom in words:
            if not skip_id_check and is_root_id(lfrom):
                logging.error("NOO: {}".format(lfrom))
                sys.exit(1)

            if dest in words:
                next_word = words[dest]
                words[lfrom].add_link(ana, next_word)
            else:
                logging.error("Unknown id: {}".format(dest))
                sys.exit(1)
        else:
            # strange errors, just skip...
            pass

    return list(words.values())
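
# Writes matches out as comma-separated rows: per-component columns (up to
# MAX_NUM_COMPONENTS components), a structure id and a collocation id,
# optionally grouped by collocation with a frequency column.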
class Writer:
    def __init__(self, args):
        self.group = args.group
        self.lemma_only = args.lemma_only
        self.without_rep = args.without_rep
        self.output_file = args.output
        self.multiple_output = args.multiple_output

    def header(self):
        cols = ["Lemma"]
        if not self.lemma_only:
            cols = ["Token_ID", "Word_form"] + cols + ["Msd"]
        if not self.without_rep:
            cols.append("Representative_form")

        assert(len(cols) == self.length())
        cols = ["C{}_{}".format(i + 1, thd) for i in range(MAX_NUM_COMPONENTS) for thd in cols]
        cols = ["Structure_ID"] + cols + ["Collocation_ID"]

        if not self.without_rep:
            cols.append("Joint_representative_form")
        if self.group:
            cols.append("Frequency")
        return cols

    def length(self):
        return 1 + 3 * int(not self.lemma_only) + int(not self.without_rep)

    def from_word(self, word):
        if word is None:
            return [""] * self.length()
        else:
            cols = [word.lemma]
            if not self.lemma_only:
                cols = [word.id, word.text] + cols + [word.msd]
            if not self.without_rep:
                cols += [""]  # not yet implemented...
            return cols
    def write_header(self, file_handler):
        file_handler.write(", ".join(self.header()) + "\n")

    def write_out_worker(self, file_handler, matches, structure_id, components, colocation_ids):
        for m, reason, cid in matches:
            to_write = []

            for idx, comp in enumerate(components):
                idx = str(idx + 1)
                word = m[idx] if idx in m else None
                to_write.extend(self.from_word(word))

            # make them equal size
            to_write.extend([""] * (MAX_NUM_COMPONENTS * self.length() - len(to_write)))
            to_write = [structure_id] + to_write + [colocation_ids.to_id(cid)]

            if not self.without_rep:
                to_write.append("")  # not yet implemented...

            if self.group:
                if colocation_ids.is_written(cid):
                    continue
                else:
                    to_write.append(colocation_ids.num(cid))
                    colocation_ids.set_written(cid)

            file_handler.write(", ".join(to_write) + "\n")
            file_handler.flush()

    def write_out(self, matches, structures, colocation_ids):
        def fp_close(fp_):
            if fp_ != sys.stdout:
                fp_.close()

        def fp_open(snum=None):
            if self.output_file is None:
                return sys.stdout
            elif snum is None:
                return open(self.output_file, "w")
            else:
                return open("{}.{}".format(self.output_file, snum), "w")

        if not self.multiple_output:
            fp = fp_open()
            self.write_header(fp)

        for s in structures:
            if self.multiple_output:
                fp = fp_open(s.id)
                self.write_header(fp)

            sid_matches = matches[s.id]
            self.write_out_worker(fp, sid_matches, s.id, s.components, colocation_ids)

            if self.multiple_output:
                fp_close(fp)

        if not self.multiple_output:
            fp_close(fp)
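
# Assigns a stable numeric id to every distinct collocation (structure id plus
# sorted component lemmas), counts how often it was seen and remembers whether
# it has already been written out.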
class ColocationIds:
    def __init__(self):
        self.data = {}

    def add_match(self, key):
        if key in self.data:
            self.data[key][1] += 1
        else:
            self.data[key] = [str(len(self.data) + 1), 1, False]

    def get(self, key, n):
        return self.data[key][n]

    def is_written(self, key):
        return self.get(key, 2)

    def num(self, key):
        return str(self.get(key, 1))

    def to_id(self, key):
        return self.get(key, 0)

    def set_written(self, key):
        self.data[key][2] = True

    def merge_matches(self, matches, new_matches):
        for _id, nms in new_matches.items():
            for nm in nms:
                matches[_id].append(nm)
                self.add_match(nm[2])
        return matches
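
# Match every word of one file against every structure; the collocation id is
# the structure id plus the (component index, lemma) pairs of the match,
# sorted by component index.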
def match_file(words, structures):
    matches = {s.id: [] for s in structures}

    for idx, s in enumerate(structures):
        logging.info("{}/{}: {:7s}".format(idx, len(structures), s.id))
        for w in words:
            mhere = s.match(w)
            logging.debug(" GOT: {}".format(len(mhere)))
            for match, reason in mhere:
                colocation_id = [(idx, w.lemma) for idx, w in match.items()]
                colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x: x[0]))
                colocation_id = tuple(colocation_id)

                matches[s.id].append((match, reason, colocation_id))

    return matches
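
# With --parallel N the script re-runs itself once per input file in a
# subprocess (dropping the --parallel flag and adding --match-to-file), then
# unpickles and merges the per-file matches; otherwise the files are processed
# sequentially in this process.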
def main(input_file, structures_file, args):
    writer = Writer(args)
    structures = build_structures(structures_file)
    for s in structures:
        logging.debug(str(s))

    colocation_ids = ColocationIds()
    matches = {s.id: [] for s in structures}

    if args.parallel:
        num_parallel = int(args.parallel)

        # make temporary directory to hold temporary files
        with tempfile.TemporaryDirectory() as tmpdirname:
            cmd = sys.argv
            for inpt in args.input:
                if inpt in cmd:
                    cmd.remove(inpt)

            # remove "--parallel X"
            pidx = cmd.index('--parallel')
            del cmd[pidx]
            del cmd[pidx]

            def func(n):
                cmdn = [sys.executable] + cmd + [args.input[n], "--match-to-file", "{}/{}.p".format(tmpdirname, n)]
                subprocess.check_call(cmdn)
                return n

            # use ThreadPoolExecutor to run subprocesses in parallel using py threads
            with concurrent.futures.ThreadPoolExecutor(max_workers=num_parallel) as executor:
                # fancy interface to wait for threads to finish
                for id_input in executor.map(func, [i for i, _ in enumerate(args.input)]):
                    with open("{}/{}.p".format(tmpdirname, id_input), "rb") as fp:
                        new_matches = pickle.load(fp)
                        matches = colocation_ids.merge_matches(matches, new_matches)
    else:
        for words in load_files(args):
            new_matches = match_file(words, structures)

            # just save to temporary file, used for children of a parallel process
            if args.match_to_file is not None:
                with open(args.match_to_file, "wb") as fp:
                    pickle.dump(new_matches, fp)
                    return
            else:
                matches = colocation_ids.merge_matches(matches, new_matches)

    writer.write_out(matches, structures, colocation_ids)

    logging.debug([(k, len(v)) for k, v in matches.items()])
    logging.debug(sum(len(v) for _, v in matches.items()))

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Extract structures from a parsed corpus.')
    parser.add_argument('structures', help='Structures definitions in xml file')
    parser.add_argument('input', help='input xml file in `ssj500k form`, can list more than one', nargs='+')
    parser.add_argument('--output', help='Output file (if none given, then output to stdout)')
    parser.add_argument('--no-msd-translate', help='MSDs are translated from slovene to english by default', action='store_true')
    parser.add_argument('--skip-id-check', help='Skips checks for ids of <w> and <pc>, if they are in correct format', action='store_true')
    parser.add_argument('--lemma-only', help='Will not write word ids, forms and msds in output', action='store_true')
    parser.add_argument('--without-rep', help='Will not write representations in output', action='store_true')
    parser.add_argument('--group', help='Group collocations with same collocation ID', action='store_true')
    parser.add_argument('--verbose', help='Enable verbose output to stderr', choices=["warning", "info", "debug"], default="info")
    parser.add_argument('--count-files', help="Count files: more verbose output", action='store_true')
    parser.add_argument('--multiple-output', help='Generate one output for each syntactic structure', action='store_true')
    parser.add_argument('--pc-tag', help='Tag for separators, usually pc or c', default="pc")
    parser.add_argument('--parallel', help='Run in multiple processes, should speed things up')
    parser.add_argument('--match-to-file', help='Do not use!')
    args = parser.parse_args()

    logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())

    start = time.time()
    main(args.input, args.structures, args)
    logging.info("TIME: {}".format(time.time() - start))