You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
luscenje_struktur/wani.py

1063 lines
32 KiB

from xml.etree import ElementTree
import re
from enum import Enum
from collections import defaultdict
import sys
import logging
import argparse
import pickle
import time
import subprocess
import concurrent.futures
import tempfile
from msd_translate import MSD_TRANSLATE
# Number of per-component column groups in every output row; shorter
# structures are padded with empty cells up to this many components.
MAX_NUM_COMPONENTS = 5

# Spelled-out category / attribute-value names -> the one-character code
# that stands for them inside an MSD tag.  Several names share a code; which
# one is meant depends on the position within the MSD (see TAGSET).
CODES = {
    # categories (first character of an MSD)
    "Noun": "N",
    "Verb": "V",
    "Adjective": "A",
    "Adverb": "R",
    "Pronoun": "P",
    "Numeral": "M",
    "Preposition": "S",
    "Conjunction": "C",
    "Particle": "Q",
    "Interjection": "I",
    "Abbreviation": "Y",
    "Residual": "X",

    # attribute values
    "common": "c",
    "proper": "p",

    "masculine": "m",
    "feminine": "f",
    "neuter": "n",

    "singular": "s",
    "dual": "d",
    "plural": "p",

    "nominative": "n",
    "genitive": "g",
    "dative": "d",
    "accusative": "a",
    "locative": "l",
    "instrumental": "i",

    "no": "n",
    "yes": "y",

    "main": "m",
    "auxiliary": "a",

    "perfective": "e",
    "progressive": "p",
    "biaspectual": "b",

    "infinitive": "n",
    "supine": "u",
    "participle": "p",
    "present": "r",
    "future": "f",
    "conditional": "c",
    "imperative": "m",

    "first": "1",
    "second": "2",
    "third": "3",

    "general": "g",
    "possessive": "s",

    "positive": "p",
    "comparative": "c",
    "superlative": "s",

    "personal": "p",
    "demonstrative": "d",
    "relative": "r",
    "reflexive": "x",
    "interrogative": "q",
    "indefinite": "i",
    "negative": "z",
    "bound": "b",

    "digit": "d",
    "roman": "r",
    "letter": "l",

    "cardinal": "c",
    "ordinal": "o",
    "pronominal": "p",
    "special": "s",

    "coordinating": "c",
    "subordinating": "s",

    "foreign": "f",
    "typo": "t",
    "program": "p",
}
# For every category code: the attribute names in the order in which their
# values appear in an MSD, directly after the category character.
TAGSET = {
    "N": ['type', 'gender', 'number', 'case', 'animate'],
    "V": ['type', 'aspect', 'vform', 'person', 'number', 'gender', 'negative'],
    "A": ['type', 'degree', 'gender', 'number', 'case', 'definiteness'],
    "R": ['type', 'degree'],
    "P": ['type', 'person', 'gender', 'number', 'case', 'owner_number', 'owned_gender', 'clitic'],
    "M": ['form', 'type', 'gender', 'number', 'case', 'definiteness'],
    "S": ['case'],
    "C": ['type'],
    "Q": [],
    "I": [],
    "Y": [],
    "X": ['type'],
}

# For every category code: one "match anything" regex per attribute position.
# Derived from TAGSET so the two tables cannot disagree; the hand-written
# table had only 6 slots for "P" although TAGSET lists 8 pronoun attributes,
# which made restrictions on the last two positions index out of range.
CATEGORY_BASES = {
    code: ['.'] * len(attributes) for code, attributes in TAGSET.items()
}
class RestrictionType(Enum):
    """Kind of test a Restriction applies to a word."""
    Morphology = 0  # Restriction.match tests the word's MSD
    Lexis = 1  # Restriction.match tests the word's lemma
    MatchAll = 2  # Restriction.match accepts every word
class Rendition(Enum):
    """What ComponentRendition.render produces for a component."""
    Lemma = 0  # lemma of the first word
    WordForm = 1  # text of the first word
    Lexis = 2  # the fixed string stored alongside the rendition
    Unknown = 3  # initial state; render returns None
class StructureSelection(Enum):
    """Parsed value of a representation feature's ``selection`` attribute."""
    All = 0  # selection="all"
    Msd = 1  # selection="msd"
class Order(Enum):
    """Required relative position of the two words joined by a dependency."""
    FromTo = 0
    ToFrom = 1
    Any = 2

    @staticmethod
    def new(order):
        """Build an Order from a dependency's ``order`` attribute.

        ``None`` (attribute absent) means no constraint.  Any string other
        than "from-to" / "to-from" raises NotImplementedError.
        """
        if order is None:
            return Order.Any
        if order == "to-from":
            return Order.ToFrom
        if order == "from-to":
            return Order.FromTo
        raise NotImplementedError("What kind of ordering is: {}".format(order))

    def match(self, from_w, to_w):
        """Return True when the two words' ``int_id`` values satisfy this order."""
        if self is Order.Any:
            return True

        source_pos, target_pos = from_w.int_id, to_w.int_id
        if self is Order.FromTo:
            return source_pos < target_pos
        if self is Order.ToFrom:
            return target_pos < source_pos
        raise NotImplementedError("Should not be here: Order match")
class ComponentRendition:
    """How one component is rendered, assembled from representation features.

    ``rendition`` starts as Rendition.Unknown and ``more`` as None; each may
    be set exactly once (a second attempt trips an assertion).
    """

    def __init__(self):
        self.more = None
        self.rendition = Rendition.Unknown

    def _set_rendition(self, r):
        """Record the rendition kind; only allowed while it is still Unknown."""
        assert(self.rendition is Rendition.Unknown)
        self.rendition = r

    def _set_more(self, m):
        """Record the extra payload (lexis string or StructureSelection) once."""
        assert(self.more is None and m is not None)
        self.more = m

    def add_feature(self, feature):
        """Apply one feature, given as a mapping of attribute name to value.

        A ``rendition`` entry is handled in preference to a ``selection``
        entry; features carrying neither are ignored.  Unknown values raise
        NotImplementedError.
        """
        # NOTE(review): ``in`` and ``[...]`` below are mapping operations.
        # Component.set_representation appears to pass ElementTree elements,
        # for which ``in`` tests child membership rather than attributes, so
        # neither branch would ever be taken -- confirm what callers hand in.
        if 'rendition' in feature:
            kind = feature['rendition']
            if kind == "lemma":
                self._set_rendition(Rendition.Lemma)
            elif kind == "word_form":
                self._set_rendition(Rendition.WordForm)
            elif kind == "lexis":
                self._set_rendition(Rendition.Lexis)
                self._set_more(feature['string'])
            else:
                raise NotImplementedError("Representation rendition: {}".format(feature))
            return None

        if 'selection' in feature:
            chosen = feature['selection']
            if chosen == "msd":
                self._set_more(StructureSelection.Msd)
            elif chosen == "all":
                self._set_more(StructureSelection.All)
            else:
                raise NotImplementedError("Representation selection: {}".format(feature))
            return None

        return None

    def render(self, words):
        """Return the text for ``words`` according to the stored rendition.

        Only the first element of ``words`` is consulted.  Returns None while
        the rendition is still Unknown.
        """
        current = self.rendition
        if current == Rendition.Lemma:
            return words[0].lemma
        if current == Rendition.Lexis:
            return self.more
        if current == Rendition.Unknown:
            return None
        if current == Rendition.WordForm:
            # check more!
            return words[0].text
        raise RuntimeError("Unknown rendition: {}".format(self.rendition))

    def __str__(self):
        return str(self.rendition)
class ComponentStatus(Enum):
    """Whether a component must, may, or must not be present in a match."""
    Optional = 0
    Required = 1
    Forbidden = 2

    def __str__(self):
        """One-character marker: "?" optional, "!" required, "X" forbidden."""
        if self is ComponentStatus.Optional:
            return "?"
        if self is ComponentStatus.Required:
            return "!"
        # only Forbidden is left
        return "X"
def get_level(restriction):
    """Return the ``level`` attribute of the first feature that has one.

    ``restriction`` is an iterable of features offering ``keys()`` and
    ``get()`` (ElementTree elements or plain dicts).

    Previously the value was looked up and then discarded, so every call
    ended in the RuntimeError below.

    Raises:
        RuntimeError: no feature carries a ``level`` attribute.
    """
    for feature in restriction:
        if "level" in feature.keys():
            return feature.get("level")

    raise RuntimeError("Unreachable!")
def build_morphology_regex(restriction):
    """Compile a morphology restriction into a per-character MSD matcher.

    Each feature in ``restriction`` must carry exactly one attribute besides
    an optional ``filter="negative"``; one of the features must be ``POS``.

    Returns:
        A pair ``(description, matcher)``: the per-position regexes joined by
        spaces, and a function taking an MSD string and returning a bool.
    """
    # attribute name -> (requested value, True for positive / False for negated)
    wanted = {}
    for feature in restriction:
        attrs = dict(feature.items())

        positive = "filter" not in attrs
        if not positive:
            assert(attrs['filter'] == "negative")
            del attrs['filter']

        assert(len(attrs) == 1)
        name, value = next(iter(attrs.items()))
        wanted[name] = (value, positive)

    assert('POS' in wanted)
    pos_name, _ = wanted.pop('POS')
    cat_code = CODES[pos_name.capitalize()]
    # position 0 is the category itself, then one slot per attribute
    rgx = [cat_code] + CATEGORY_BASES[cat_code]

    min_msd_length = 1
    for attribute, (value, positive) in wanted.items():
        index = TAGSET[cat_code].index(attribute.lower())
        assert(index >= 0)

        # "a|b" lists alternatives; a single value is the one-element case
        codes = "".join(CODES[val] for val in value.split('|'))
        rgx[index + 1] = "[{}{}]".format("" if positive else "^", codes)

        # only positive constraints require the position to actually exist
        if positive:
            min_msd_length = max(index + 1, min_msd_length)

    def matcher(text):
        # NOTE(review): min_msd_length starts at 1, so a one-character MSD is
        # rejected even when only POS is restricted -- confirm this is wanted.
        if len(text) <= min_msd_length:
            return False
        # zip stops at the shorter side: missing trailing positions pass
        return all(re.match(r, c) for c, r in zip(text, rgx))

    return " ".join(rgx), matcher
def build_lexis_regex(restriction):
    """Compile a lexis restriction into a lemma membership test.

    The attributes of all features are merged (later features win); the
    merged ``lemma`` value is a ``|``-separated list of accepted lemmas.

    Returns:
        A pair ``(lemmas, matcher)``: the list of accepted lemmas and a
        function telling whether a given string is one of them.
    """
    merged = {}
    for feature in restriction:
        for key, value in feature.items():
            merged[key] = value

    assert("lemma" in merged)
    match_list = merged['lemma'].split('|')

    def matcher(text):
        return text in match_list

    return match_list, matcher
class Restriction:
    """A single test a word must pass to fill a component.

    Attributes:
        type: the RestrictionType.
        matcher: function applied to the word's MSD or lemma (None for MatchAll).
        present: printable description produced by the regex builder.
    """

    def __init__(self, restriction_tag):
        """Build from a ``restriction`` element, or from None to match all.

        Raises:
            NotImplementedError: the element's ``type`` is neither
                "morphology" nor "lexis".
        """
        if restriction_tag is None:
            self.type = RestrictionType.MatchAll
            self.matcher = None
            self.present = None
            return

        kind = restriction_tag.get('type')
        if kind == "morphology":
            builder, self.type = build_morphology_regex, RestrictionType.Morphology
        elif kind == "lexis":
            builder, self.type = build_lexis_regex, RestrictionType.Lexis
        else:
            raise NotImplementedError()

        # children of the tag are the individual features
        self.present, self.matcher = builder(list(restriction_tag))

    def match(self, word):
        """Return whether ``word`` satisfies this restriction."""
        if self.type == RestrictionType.MatchAll:
            return True

        if self.type == RestrictionType.Morphology:
            subject = word.msd
        elif self.type == RestrictionType.Lexis:
            subject = word.lemma
        else:
            raise RuntimeError("Unreachable!")

        return self.matcher(subject)

    def __str__(self):
        type_name = str(self.type).split('.')[1]
        return "({:s} {})".format(type_name, self.present)

    def __repr__(self):
        return str(self)
class Component:
    """One node in the tree of components that makes up a structure.

    A component knows its own restriction(s) and the outgoing links
    (child component, dependency label, Order) it expects a matching word
    to have.
    """

    def __init__(self, info):
        """Create a component from its attribute dict.

        ``info`` must contain ``cid``; ``name`` and ``status`` are optional.
        A missing status means Required; an unrecognised one raises
        NotImplementedError.
        """
        self.idx = info['cid']
        self.name = info['name'] if 'name' in info else None

        if 'status' not in info:
            self.status = ComponentStatus.Required
        else:
            known = {
                'forbidden': ComponentStatus.Forbidden,
                'obligatory': ComponentStatus.Required,
                'optional': ComponentStatus.Optional,
            }
            if info['status'] not in known:
                raise NotImplementedError("strange status: {}".format(info['status']))
            self.status = known[info['status']]

        self.restriction = None
        self.next_element = []
        self.representation = ComponentRendition()
        self.selection = {}
        self.iter_ctr = 0

    def render_word(self, word):
        """Render ``word`` through this component's ComponentRendition."""
        return self.representation.render(word)

    def add_next(self, next_component, link_label, order):
        """Register an outgoing link; ``order`` is the raw attribute string or None."""
        link = (next_component, link_label, Order.new(order))
        self.next_element.append(link)

    def set_restriction(self, restrictions_tag):
        """Set the restriction from a ``restriction`` / ``restriction_or`` tag.

        None yields a match-all restriction; ``restriction_or`` yields a list
        of alternatives, any of which may match.
        """
        if restrictions_tag is None:
            self.restriction = Restriction(None)
            return

        tag = restrictions_tag.tag
        if tag == "restriction":
            self.restriction = Restriction(restrictions_tag)
        elif tag == "restriction_or":
            self.restriction = [Restriction(el) for el in restrictions_tag]
        else:
            raise RuntimeError("Unreachable")

    def set_representation(self, representation):
        """Feed every feature in ``representation`` to the rendition."""
        if len(representation) > 0:
            for feature in representation:
                self.representation.add_feature(feature)

    def find_next(self, deps, comps, restrs, reprs):
        """Recursively build the subtree hanging off this component.

        ``deps`` holds ``(from, to, label, order)`` tuples; the other three
        arguments are dicts keyed by component id.  Returns every descendant
        created, depth first.
        """
        created = []
        for d in deps:
            if d[0] != self.idx:
                continue

            _, idx, dep_label, order = d

            child = Component(comps[idx])
            child.set_restriction(restrs[idx])
            child.set_representation(reprs[idx])
            created.append(child)

            self.add_next(child, dep_label, order)
            created.extend(child.find_next(deps, comps, restrs, reprs))

        return created

    def name_str(self):
        """Component name, or "_" when it has none."""
        if self.name is None:
            return "_"
        return self.name

    def __str__(self):
        return "{:s}) {:7s}:{} [{}] :{}".format(
            self.idx, self.name_str(), self.status, self.restriction, self.representation)

    def tree(self):
        """Describe all links below this component, one string per link."""
        lines = []
        for child, link, order in self.next_element:
            line = "{:3} -- {:5} --> {:3}".format(self.idx, link, child.idx)
            if order != Order.Any:
                # drop the "Order." prefix of the enum's string form
                line += " " + str(order)[6:]
            lines.append(line)
            lines.extend(child.tree())
        return lines

    def __repr__(self):
        return str(self)

    def match(self, word):
        """Match this component and its whole subtree, starting at ``word``.

        Returns None when there is no match, otherwise a list of dicts
        mapping component id to the matched word.
        """
        own = self._match_self(word)
        if own is None:
            return None

        per_link = self._match_next(word)
        if per_link is None:
            return None

        to_ret = [own]
        for cmatch in per_link:
            # good match but nothing to add
            if len(cmatch) == 0:
                continue

            # exactly one match for this link: merge it into every result
            if len(cmatch) == 1:
                for tr in to_ret:
                    tr.update(cmatch[0])
                continue

            # more than one match found for this particular link
            logging.debug("MULTIPLE: {}, {}".format(self.idx, cmatch))

            # multiple matches on more than one link are not expanded:
            # only the first alternative is kept
            if len(to_ret) > 1:
                logging.warning("Strange multiple match: {}".format(
                    str([w.id for w in cmatch[0].values()])))
                for tr in to_ret:
                    tr.update(cmatch[0])
                continue

            # fan out: one result per alternative
            to_ret = [{**dict(to_ret[0]), **m} for m in cmatch]

        logging.debug("MA: {}".format(str(to_ret)))
        return to_ret

    def _match_self(self, word):
        """Return ``{idx: word}`` if the word passes the restriction, else None."""
        if type(self.restriction) is list:
            matched = None
            for restr in self.restriction:
                matched = restr.match(word)
                if matched:  # any alternative is enough
                    break
        else:
            matched = self.restriction.match(word)

        logging.debug("SELF MATCH({}: {} -> {}".format(self.idx, word.text, matched))

        if matched:
            return {self.idx: word}
        return None

    def _match_next(self, word):
        """Match every outgoing link against the words linked from ``word``.

        Returns one list of matches per link (in link order), or None as
        soon as a required link has no match or a forbidden one has any.
        """
        to_ret = []

        for child, link, order in self.next_element:
            candidates = word.get_links(link)
            logging.debug("FIND LINKS FOR: {} -> {}: #{}".format(self.idx, child.idx, len(candidates)))
            found = []

            # optional and forbidden links are fine with zero matches
            good = child.status != ComponentStatus.Required
            for next_word in candidates:
                logging.debug("link: {}: {} -> {}".format(link, word.id, next_word.id))
                if not order.match(word, next_word):
                    continue

                match = child.match(next_word)
                if match is None:
                    continue

                # special treatment for forbidden: a single hit spoils it
                if child.status == ComponentStatus.Forbidden:
                    good = False
                    break

                assert(type(match) is list)
                found.extend(match)
                good = True

            # if none matched, nothing found!
            if not good:
                logging.debug("BAD")
                return None

            to_ret.append(found)

        return to_ret
class SyntacticStructure:
    """A syntactic structure: its component tree plus agreement constraints."""

    def __init__(self):
        self.id = None
        self.lbs = None
        self.agreements = []
        self.components = []
        self.selection = StructureSelection.All

    @staticmethod
    def from_xml(xml):
        """Build a structure from a ``syntactic_structure`` element.

        The element must have exactly one child (a system of type "JOS")
        whose three children are the components, dependencies and
        definitions, in that order.
        """
        st = SyntacticStructure()
        st.id = xml.get('id')
        st.lbs = xml.get('LBS')

        assert(len(list(xml)) == 1)
        system = next(iter(xml))

        assert(system.get('type') == 'JOS')
        components, dependencies, definitions = list(system)

        deps = [
            (dep.get('from'), dep.get('to'), dep.get('label'), dep.get('order'))
            for dep in dependencies
        ]
        comps = {comp.get('cid'): dict(comp.items()) for comp in components}

        # per component id: its restriction element and representation features
        restrs = {}
        forms = {}
        for definition in definitions:
            n = definition.get('cid')
            restrs[n] = None
            forms[n] = []

            for el in definition:
                if el.tag.startswith("restriction"):
                    assert(restrs[n] is None)
                    restrs[n] = el
                elif el.tag.startswith("representation"):
                    st.add_representation(n, el, forms)
                else:
                    raise NotImplementedError("Unknown definition: {} in structure {}".format(el.tag, st.id))

        # dependencies start from the artificial component '#'
        fake_root_component = Component({'cid': '#', 'type': 'other'})
        st.components = fake_root_component.find_next(deps, comps, restrs, forms)
        return st

    def add_representation(self, n, rep_el, forms):
        """Sort the features of a representation element for component ``n``.

        Rendition features and non-agreement selections are appended to
        ``forms[n]``; agreement selections become agreement constraints.
        """
        if rep_el.tag == "representation_and":
            rep_el = rep_el[0]
            logging.warning("Only using first reprentation in representation_and in structure {}".format(self.id))

        assert(rep_el.tag == "representation")
        for el in rep_el:
            assert(el.tag == "feature")
            selection = el.attrib.get('selection')

            if 'rendition' in el.attrib or selection not in (None, "agreement"):
                forms[n].append(el)
            elif selection is not None:
                # only selection="agreement" reaches this branch
                self.add_agreement(n, el)
            else:
                logging.warning("Strange representation feature in structure {}. Skipping"
                    .format(self.id))

    def add_agreement(self, n, el):
        """Record that component ``n`` must agree with the feature's head.

        ``head`` has the form ``cid_<id>``; ``msd`` is a ``+``-separated list
        of attribute names that have to agree.
        """
        head = el.get('head')
        assert(head[:4] == 'cid_')

        agreement_str = el.get('msd')
        assert(agreement_str is not None)

        self.agreements.append({
            'n1': n,
            'n2': head[4:],
            'match': agreement_str.split('+')})

    def __str__(self):
        comp_str = "\n".join(str(comp) for comp in self.components)
        agrs = "\n".join(
            "({} -[{}]- {}) ".format(a['n1'], "|".join(a['match']), a['n2'])
            for a in self.agreements)
        links_str = "\n".join(self.components[0].tree())

        return "{} LBS {}\nCOMPONENTS\n{}\nAGREEMENTS\n{}\nLINKS\n{}\n{}".format(
            self.id, self.lbs, comp_str, agrs, links_str, "-" * 40)

    def get_component(self, idx):
        """Return the component with id ``idx``; RuntimeError if unknown."""
        for c in self.components:
            if c.idx == idx:
                return c
        raise RuntimeError("Unknown component id: {}".format(idx))

    @staticmethod
    def _agreement_value(word, attribute):
        """Character of ``word.msd`` encoding ``attribute``, or None if the MSD is too short."""
        position = TAGSET[word.msd[0]].index(attribute)
        assert(position >= 0)
        # first character is the category, which is not listed in TAGSET
        if position + 1 >= len(word.msd):
            return None
        return word.msd[position + 1]

    def check_agreements(self, match):
        """Return False if any recorded agreement is violated by ``match``."""
        for agr in self.agreements:
            w1 = match[agr['n1']]
            w2 = match[agr['n2']]

            for agr_case in agr['match']:
                # if none specified (e.g. infinitive): always agrees
                m1 = self._agreement_value(w1, agr_case)
                if m1 is None:
                    continue

                m2 = self._agreement_value(w2, agr_case)
                if m2 is None:
                    continue

                # '-' on either side counts as agreeing with anything
                if '-' not in [m1, m2] and m1 != m2:
                    return False

        return True

    def check_form(self, match):
        """Return False if a matched word contradicts its component's ``selection``."""
        for midx, w in match.items():
            component = self.get_component(midx)
            for key, value in component.selection.items():
                position = TAGSET[w.msd[0]].index(key.lower())
                actual = w.msd[position + 1]
                expected = CODES[value]

                if '-' not in [actual, expected] and actual != expected:
                    return False

        return True

    def match(self, word):
        """Match the structure with ``word`` filling its first component.

        Returns a list of ``(match, verdict)`` pairs where verdict is
        "Agreement", "Form" or "OK"; the list is empty if nothing matched.
        """
        matches = self.components[0].match(word)
        if matches is None:
            return []

        to_ret = []
        for m in matches:
            if not self.check_agreements(m):
                verdict = "Agreement"
            elif not self.check_form(m):
                verdict = "Form"
            else:
                verdict = "OK"
            to_ret.append((m, verdict))

        return to_ret
def build_structures(filename):
    """Load every ``syntactic_structure`` element of an XML file.

    The file is handed to the XML parser directly, so it is decoded
    according to its own XML declaration rather than the locale's default
    text encoding.

    Returns:
        A list of SyntacticStructure objects in document order.
    """
    root = ElementTree.parse(filename).getroot()

    structures = []
    for structure in root.iter('syntactic_structure'):
        to_append = SyntacticStructure.from_xml(structure)
        if to_append is None:
            continue
        structures.append(to_append)
    return structures
def get_msd(comp):
    """Return the MSD stored on an element (or any object with ``items()``).

    The ``msd`` attribute is used when present; otherwise the ``ana``
    attribute with its first four characters (a prefix) removed.

    Raises:
        NotImplementedError: neither attribute is present.
    """
    d = dict(comp.items())
    if 'msd' in d:
        return d['msd']
    if 'ana' in d:
        return d['ana'][4:]

    # logging calls take no ``file`` argument; passing one raised TypeError
    # and hid the NotImplementedError below
    logging.error("No MSD found in attributes: %s", d)
    raise NotImplementedError("MSD?")
class Word:
    """A token of the corpus together with its outgoing dependency links."""

    def __init__(self, xml, do_msd_translate):
        """Read lemma, MSD, id and text from a token element.

        With ``do_msd_translate`` the raw MSD is mapped through
        MSD_TRANSLATE.  ``int_id`` is the integer in the last dot-separated
        part of the id, after dropping one leading non-digit character.
        """
        self.lemma = xml.get('lemma')

        raw_msd = get_msd(xml)
        self.msd = MSD_TRANSLATE[raw_msd] if do_msd_translate else raw_msd

        self.id = xml.get('id')
        self.text = xml.text
        self.links = defaultdict(list)

        tail = self.id.split('.')[-1]
        if tail[0] not in '0123456789':
            tail = tail[1:]
        self.int_id = int(tail)

        assert(None not in (self.id, self.lemma, self.msd))

    @staticmethod
    def pcWord(pc, do_msd_translate):
        """Build a Word from a separator element.

        The element is modified in place: its text becomes its lemma and it
        is given the MSD "N" (when translating) or "U".
        """
        pc.set('lemma', pc.text)
        pc.set('msd', "N" if do_msd_translate else "U")
        return Word(pc, do_msd_translate)

    def add_link(self, link, to):
        """Record a link labelled ``link`` from this word to ``to``."""
        self.links[link].append(to)

    def get_links(self, link):
        """Return the words linked under ``link``.

        A label containing ``|`` stands for several labels; on first use
        their targets are concatenated and stored under the combined label.
        """
        links = self.links
        if "|" in link and link not in links:
            combined = links[link]
            for label in link.split('|'):
                combined.extend(links[label])

        return links[link]
def is_root_id(id_):
    """Return True when ``id_`` consists of exactly three dot-separated parts."""
    # three parts <=> two separators
    return id_.count('.') == 2
def load_files(args):
    """Lazily load every input file named in ``args.input``.

    Yields the word list of one file at a time.  With ``args.count_files``
    the log line of each file gets a " :: <index> / <total>" suffix, where
    the index counts from zero.
    """
    filenames = args.input
    skip_id_check = args.skip_id_check
    do_msd_translate = not args.no_msd_translate

    for n, fname in enumerate(filenames):
        status = " :: {} / {}".format(n, len(filenames)) if args.count_files else ""
        yield load_tei_file(fname, skip_id_check, do_msd_translate, args.pc_tag, status)
def load_tei_file(filename, skip_id_check, do_msd_translate, pc_tag, status):
    """Parse one TEI file into a list of linked Word objects.

    Args:
        filename: path of the XML file.
        skip_id_check: when false, a link starting at an id with exactly
            three dot-separated parts aborts the program.
        do_msd_translate: passed on to Word.
        pc_tag: tag name of separator elements.
        status: suffix appended to the "LOADING FILE" log line.

    Returns:
        All words and separators of the file; each word holds its outgoing
        links.  Links whose source id is unknown are skipped; an unknown
        destination id aborts the program with exit status 1.
    """
    logging.info("LOADING FILE: {}{}".format(filename, status))

    # Work on bytes so that decoding follows the file's own XML declaration
    # instead of the locale's default text encoding.
    with open(filename, 'rb') as fp:
        raw = fp.read()
    # drop the default namespace and the "xml:" attribute prefix so that
    # elements and attributes can be addressed by their bare names
    raw = re.sub(rb' xmlns="[^"]+"', b'', raw, count=1)
    raw = raw.replace(b' xml:', b' ')
    et = ElementTree.XML(raw)

    words = {}
    for w in et.iter("w"):
        words[w.get('id')] = Word(w, do_msd_translate)
    for pc in et.iter(pc_tag):
        words[pc.get('id')] = Word.pcWord(pc, do_msd_translate)

    for l in et.iter("link"):
        if 'dep' in l.keys():
            ana = l.get('afun')
            lfrom = l.get('from')
            dest = l.get('dep')
        else:
            ana = l.get('ana')
            # only syntactic links are of interest; a link without ``ana``
            # is skipped too instead of failing on None
            if ana is None or ana[:4] != 'syn:':
                continue
            ana = ana[4:]
            lfrom, dest = l.get('target').replace('#', '').split()

        if lfrom not in words:
            # strange errors, just skip...
            continue

        if not skip_id_check and is_root_id(lfrom):
            # the id needs a placeholder, otherwise logging cannot format it
            logging.error("NOO: %s", lfrom)
            sys.exit(1)

        if dest not in words:
            logging.error("Unknown id: {}".format(dest))
            sys.exit(1)

        words[lfrom].add_link(ana, words[dest])

    return list(words.values())
class Writer:
    """Writes matches as comma-separated rows, to a file or to stdout.

    Two flavours exist: the "all" writer emits one row per match with
    token id, word form, lemma and MSD per component; the regular writer
    emits one row per colocation with lemma and representative form per
    component, followed by a joint form and the frequency.
    """

    @staticmethod
    def make_output_writer(args):
        """Regular writer configured from the command-line arguments."""
        return Writer(False, args.output, args.multiple_output, int(args.sort_by), args.sort_reversed)

    @staticmethod
    def make_all_writer(args):
        """"All" writer targeting ``args.all``; single file, unsorted."""
        return Writer(True, args.all, False, -1, False)

    def __init__(self, all, filename, multiple_output, sort_by, sort_reversed):
        """
        Args:
            all: True for the detailed per-match flavour.
            filename: output path, or None for stdout.
            multiple_output: write one file per structure, named
                ``<filename>.<structure id>``.
            sort_by: index of the column to sort by; negative disables sorting.
            sort_reversed: sort in descending order.
        """
        self.all = all
        self.output_file = filename
        self.multiple_output = multiple_output
        self.sort_by = sort_by
        self.sort_order = sort_reversed

    def header(self):
        """Return the list of column names for this flavour."""
        cols = ["Lemma"]
        if self.all:
            cols = ["Token_ID", "Word_form"] + cols + ["Msd"]
        else:
            cols.append("Representative_form")

        assert(len(cols) == self.length())

        cols = ["C{}_{}".format(i + 1, thd) for i in range(MAX_NUM_COMPONENTS) for thd in cols]
        cols = ["Structure_ID"] + cols + ["Colocation_ID"]

        if not self.all:
            cols += ["Joint_representative_form", "Frequency"]

        return cols

    def length(self):
        """Number of columns written per component."""
        return 4 if self.all else 2

    def from_word(self, word):
        """Cells for one component; empty cells when ``word`` is None."""
        if word is None:
            return [""] * self.length()
        elif self.all:
            return [word.id, word.text, word.lemma, word.msd]
        else:
            return [word.lemma, "REP?"]

    def sorted_rows(self, rows):
        """Return ``rows`` sorted by the configured column.

        The column is sorted numerically when every row holds an integer
        there, otherwise case-insensitively as text.  Rows are returned
        unchanged when sorting is disabled or the column does not exist.
        """
        if self.sort_by < 0 or len(rows) < 2:
            return rows

        if len(rows[0]) <= self.sort_by:
            # report the column that was asked for, not the row length
            logging.warning("Cannot sort by column #{}: Not enough columns!".format(self.sort_by))
            return rows

        # Inspect all rows: deciding from the first row alone made the sort
        # itself fail when a later row held a non-numeric value.
        try:
            for row in rows:
                int(row[self.sort_by])
        except ValueError:
            key = lambda row: row[self.sort_by].lower()
        else:
            key = lambda row: int(row[self.sort_by])

        return sorted(rows, key=key, reverse=self.sort_order)

    def write_header(self, file_handler):
        """Write the header line."""
        file_handler.write(", ".join(self.header()) + "\n")

    def write_out_worker(self, file_handler, structure_id, components, colocation_ids):
        """Write all rows belonging to one structure.

        Components are looked up in each match under the keys "1", "2", ...
        up to ``len(components)``; rows are padded to MAX_NUM_COMPONENTS
        components.
        """
        rows = []

        for cid, m, reason, freq in colocation_ids.get_matches_for(structure_id, not self.all):
            to_write = []
            representative_parts = []

            for position in range(1, len(components) + 1):
                cells = self.from_word(m.get(str(position)))
                to_write.extend(cells)
                representative_parts.append(cells[-1])

            # make them equal size
            to_write.extend([""] * (MAX_NUM_COMPONENTS * self.length() - len(to_write)))
            to_write = [structure_id] + to_write + [cid]

            if not self.all:
                # missing components contribute empty parts; collapse the
                # resulting runs of spaces
                representation = re.sub(' +', ' ', " ".join(representative_parts))
                to_write.append(representation.strip())
                to_write.append(str(freq))

            rows.append(to_write)

        if len(rows) > 0:
            rows = self.sorted_rows(rows)
            file_handler.write("\n".join([", ".join(row) for row in rows]) + "\n")
            file_handler.flush()

    def write_out(self, structures, colocation_ids):
        """Write header and rows for every structure.

        Output files are closed even when writing fails; stdout is never
        closed.
        """
        def fp_close(fp_):
            if fp_ != sys.stdout:
                fp_.close()

        def fp_open(snum=None):
            if self.output_file is None:
                return sys.stdout
            elif snum is None:
                return open(self.output_file, "w")
            else:
                return open("{}.{}".format(self.output_file, snum), "w")

        if self.multiple_output:
            for s in structures:
                fp = fp_open(s.id)
                try:
                    self.write_header(fp)
                    self.write_out_worker(fp, s.id, s.components, colocation_ids)
                finally:
                    fp_close(fp)
        else:
            fp = fp_open()
            try:
                self.write_header(fp)
                for s in structures:
                    self.write_out_worker(fp, s.id, s.components, colocation_ids)
            finally:
                fp_close(fp)
class ColocationIds:
    """Collects matches grouped by colocation key and numbers the groups.

    ``data`` maps a colocation key to ``(id, matches, structure_id)`` where
    ``id`` is a running number (as a string, starting at "1") and
    ``matches`` is a list of ``(words, reason)`` pairs.
    """

    def __init__(self, min_frequency=None):
        """
        Args:
            min_frequency: value to store as ``min_frequency``.  When left
                out, the ``min_freq`` of a module-level ``args`` object is
                used if one exists (script usage), otherwise 0, so the
                class no longer fails with NameError when used as a library.
        """
        self.data = {}
        if min_frequency is None:
            cli_args = globals().get('args')
            min_frequency = getattr(cli_args, 'min_freq', 0)
        # NOTE(review): stored but not consulted by any method of this
        # class -- confirm where the frequency filter is meant to apply.
        self.min_frequency = min_frequency

    def _add_match(self, key, sid, match):
        """Append ``match`` to the group of ``key``, creating the group if new."""
        if key in self.data:
            self.data[key][1].append(match)
        else:
            self.data[key] = (str(len(self.data) + 1), [match], sid)

    def get(self, key, n):
        """Return field ``n`` (0 id, 1 matches, 2 structure id) of a group."""
        return self.data[key][n]

    def num(self, key):
        """Number of matches in the group, as a string."""
        return str(len(self.get(key, 1)))

    def to_id(self, key):
        """Running id of the group."""
        return self.get(key, 0)

    def add_matches(self, matches):
        """Add matches given as ``{structure_id: [(words, reason, key), ...]}``."""
        for sid, nms in matches.items():
            for words, reason, key in nms:
                self._add_match(key, sid, (words, reason))

    def get_matches_for(self, structure_id, group):
        """Yield ``(id, words, reason, frequency)`` for one structure.

        With ``group`` only the first match of every colocation is yielded;
        ``frequency`` is always the size of the whole group.
        """
        for cid, cid_matches, sid in self.data.values():
            if sid != structure_id:
                continue

            for words, reason in cid_matches:
                yield (cid, words, reason, len(cid_matches))
                if group:
                    break
def match_file(words, structures):
    """Try every structure on every word and collect the matches.

    Returns:
        ``{structure_id: [(match, reason, colocation_id), ...]}`` where
        ``colocation_id`` is a tuple of the structure id followed by
        ``(component id, lemma)`` pairs sorted by component id.
    """
    matches = {s.id: [] for s in structures}

    for position, structure in enumerate(structures):
        logging.info("{}/{}: {:7s}".format(position, len(structures), structure.id))
        bucket = matches[structure.id]

        for word in words:
            found = structure.match(word)
            logging.debug(" GOT: {}".format(len(found)))

            for match, reason in found:
                lemma_pairs = sorted(
                    ((cid, matched.lemma) for cid, matched in match.items()),
                    key=lambda pair: pair[0])
                colocation_id = tuple([structure.id] + lemma_pairs)
                bucket.append((match, reason, colocation_id))

    return matches
def _strip_parallel_option(argv):
    """Return a copy of ``argv`` without ``--parallel N`` / ``--parallel=N``."""
    kept = []
    drop_value = False
    for arg in argv:
        if drop_value:
            # the value following a bare "--parallel"
            drop_value = False
        elif arg == '--parallel':
            drop_value = True
        elif not arg.startswith('--parallel='):
            kept.append(arg)
    return kept


def main(input_file, structures_file, args):
    """Match all structures against all input files and write the results.

    With ``args.parallel`` every input file is processed by a child
    interpreter running this same script with ``--match-to-file``; the
    children's pickled results are merged here.  When ``args.match_to_file``
    is set (i.e. this is such a child) the matches of the first input file
    are pickled to that path and nothing else is written.

    Args:
        input_file: unused; the inputs are taken from ``args.input``.
        structures_file: path of the structure definitions.
        args: parsed command-line arguments.
    """
    structures = build_structures(structures_file)
    for s in structures:
        logging.debug(str(s))

    colocation_ids = ColocationIds()
    # matches of the most recently processed file, for the final debug output
    matches = {}

    if args.parallel:
        num_parallel = int(args.parallel)

        # make temporary directory to hold temporary files
        with tempfile.TemporaryDirectory() as tmpdirname:
            # work on a copy: the child command line must not alter sys.argv
            cmd = list(sys.argv)
            for inpt in args.input:
                if inpt in cmd:
                    cmd.remove(inpt)
            cmd = _strip_parallel_option(cmd)

            def func(n):
                cmdn = [sys.executable] + cmd + [args.input[n], "--match-to-file", "{}/{}.p".format(tmpdirname, n)]
                subprocess.check_call(cmdn)
                return n

            # threads only wait for the child processes, which do the work
            with concurrent.futures.ThreadPoolExecutor(max_workers=num_parallel) as executor:
                # map yields the indices in input order as children finish
                for id_input in executor.map(func, range(len(args.input))):
                    # the pickle was written by our own child into a private
                    # temporary directory; never point this at foreign data
                    with open("{}/{}.p".format(tmpdirname, id_input), "rb") as fp:
                        matches = pickle.load(fp)

                    colocation_ids.add_matches(matches)

    else:
        for words in load_files(args):
            matches = match_file(words, structures)
            # just save to temporary file, used for children of a parallel process
            if args.match_to_file is not None:
                with open(args.match_to_file, "wb") as fp:
                    pickle.dump(matches, fp)
                return

            colocation_ids.add_matches(matches)

    if args.all:
        Writer.make_all_writer(args).write_out(structures, colocation_ids)
    Writer.make_output_writer(args).write_out(structures, colocation_ids)

    logging.debug([(k, len(v)) for k, v in matches.items()])
    logging.debug(sum(len(v) for _, v in matches.items()))
if __name__ == '__main__':
    # Command-line entry point.  ``args`` stays a module-level name because
    # other parts of this module read it.
    parser = argparse.ArgumentParser(description='Extract structures from a parsed corpus.')
    parser.add_argument('structures',
                        help='Structures definitions in xml file')
    parser.add_argument('input',
                        help='input xml file in `ssj500k form`, can list more than one', nargs='+')
    parser.add_argument('--output',
                        help='Output file (if none given, then output to stdout)')
    parser.add_argument('--all',
                        help='Additional output file, writes more data')

    parser.add_argument('--no-msd-translate',
                        help='MSDs are translated from slovene to english by default',
                        action='store_true')
    parser.add_argument('--skip-id-check',
                        help='Skips checks for ids of <w> and <pc>, if they are in correct format',
                        action='store_true')
    parser.add_argument('--min_freq',
                        help='Minimal frequency in output',
                        type=int, default=0, const=1, nargs='?')
    parser.add_argument('--verbose',
                        help='Enable verbose output to stderr',
                        choices=["warning", "info", "debug"], default="info", const="info", nargs='?')
    parser.add_argument('--count-files',
                        help="Count files: more verbose output", action='store_true')
    parser.add_argument('--multiple-output',
                        help='Generate one output for each syntactic structure',
                        action='store_true')

    parser.add_argument('--sort-by',
                        help="Sort by this column (index)", type=int, default=-1)
    parser.add_argument('--sort-reversed',
                        help="Sort in reversed order", action='store_true')

    parser.add_argument('--pc-tag',
                        help='Tag for separators, usually pc or c', default="pc")
    # type=int lets argparse reject a non-numeric worker count up front
    parser.add_argument('--parallel',
                        help='Run in multiple processes, should speed things up', type=int)
    parser.add_argument('--match-to-file', help='Do not use!')

    args = parser.parse_args()
    logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())

    start = time.time()
    main(args.input, args.structures, args)
    logging.info("TIME: {}".format(time.time() - start))