# luscenje_struktur/wani.py


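# Extract syntactic structures (collocation candidates) from a parsed corpus.
# Given structure definitions in an XML file and one or more corpora in
# ssj500k TEI form, the script matches each structure against the
# dependency-parsed words of every input file and writes the matches out as CSV.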
from xml.etree import ElementTree
import re
from enum import Enum
from collections import defaultdict, namedtuple, Counter
import sys
import logging
import argparse
import pickle
import time
import subprocess
import concurrent.futures
import tempfile
from msd_translate import MSD_TRANSLATE
try:
    from tqdm import tqdm
except ImportError:
    # tqdm only provides a progress bar; fall back to a no-op wrapper
    tqdm = lambda x: x
MAX_NUM_COMPONENTS = 5
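
# Mappings between attribute values of the JOS tagset and their one-letter MSD
# codes, plus the attribute order per category (TAGSET) and the "match
# anything" base patterns (CATEGORY_BASES) used when building morphology regexes.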
CODES = {
    "Noun": "N",
    "Verb": "V",
    "Adjective": "A",
    "Adverb": "R",
    "Pronoun": "P",
    "Numeral": "M",
    "Preposition": "S",
    "Conjunction": "C",
    "Particle": "Q",
    "Interjection": "I",
    "Abbreviation": "Y",
    "Residual": "X",

    "common": "c",
    "proper": "p",
    "masculine": "m",
    "feminine": "f",
    "neuter": "n",
    "singular": "s",
    "dual": "d",
    "plural": "p",
    "nominative": "n",
    "genitive": "g",
    "dative": "d",
    "accusative": "a",
    "locative": "l",
    "instrumental": "i",
    "no": "n",
    "yes": "y",
    "main": "m",
    "auxiliary": "a",
    "perfective": "e",
    "progressive": "p",
    "biaspectual": "b",
    "infinitive": "n",
    "supine": "u",
    "participle": "p",
    "present": "r",
    "future": "f",
    "conditional": "c",
    "imperative": "m",
    "first": "1",
    "second": "2",
    "third": "3",
    "general": "g",
    "possessive": "s",
    "positive": "p",
    "comparative": "c",
    "superlative": "s",
    "personal": "p",
    "demonstrative": "d",
    "relative": "r",
    "reflexive": "x",
    "interrogative": "q",
    "indefinite": "i",
    "negative": "z",
    "bound": "b",
    "digit": "d",
    "roman": "r",
    "letter": "l",
    "cardinal": "c",
    "ordinal": "o",
    "pronominal": "p",
    "special": "s",
    "coordinating": "c",
    "subordinating": "s",
    "foreign": "f",
    "typo": "t",
    "program": "p",
}
TAGSET = {
    "N": ['type', 'gender', 'number', 'case', 'animate'],
    "V": ['type', 'aspect', 'vform', 'person', 'number', 'gender', 'negative'],
    "A": ['type', 'degree', 'gender', 'number', 'case', 'definiteness'],
    "R": ['type', 'degree'],
    "P": ['type', 'person', 'gender', 'number', 'case', 'owner_number', 'owned_gender', 'clitic'],
    "M": ['form', 'type', 'gender', 'number', 'case', 'definiteness'],
    "S": ['case'],
    "C": ['type'],
    "Q": [],
    "I": [],
    "Y": [],
    "X": ['type'],
}

CATEGORY_BASES = {
    "N": ['.'] * 5,
    "V": ['.'] * 7,
    "A": ['.'] * 6,
    "R": ['.'] * 2,
    "P": ['.'] * 6,
    "M": ['.'] * 6,
    "S": ['.'] * 1,
    "C": ['.'] * 1,
    "Q": [],
    "I": [],
    "Y": [],
    "X": ['.'] * 1,
}
class ComponentType(Enum):
    Other = 0
    Core = 2
    Core2w = 3


class RestrictionType(Enum):
    Morphology = 0
    Lexis = 1
    MatchAll = 2
class Order(Enum):
    FromTo = 0
    ToFrom = 1
    Any = 2

    @staticmethod
    def new(order):
        if order is None:
            return Order.Any
        elif order == "to-from":
            return Order.ToFrom
        elif order == "from-to":
            return Order.FromTo
        else:
            raise NotImplementedError("What kind of ordering is: {}".format(order))

    def match(self, from_w, to_w):
        if self is Order.Any:
            return True

        fi = from_w.int_id
        ti = to_w.int_id

        if self is Order.FromTo:
            return fi < ti
        elif self is Order.ToFrom:
            return ti < fi
        else:
            raise NotImplementedError("Should not be here: Order match")
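
# ComponentRepresentation and its subclasses decide how a matched component is
# rendered in the output: by its lemma, by a fixed lexis string, or by one or
# all of its word forms (optionally filtered by MSD or by agreement with
# another component).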
class ComponentRepresentation:
    def __init__(self, data, word_renderer):
        self.data = data
        self.word_renderer = word_renderer

        self.words = []
        self.rendition_text = None
        self.agreement = []

    def get_agreement(self):
        return []

    def add_word(self, word):
        self.words.append(word)

    def render(self):
        if self.rendition_text is None:
            self.rendition_text = self._render()

    def _render(self):
        raise NotImplementedError("Not implemented for class: {}".format(type(self)))


class LemmaCR(ComponentRepresentation):
    def _render(self):
        return self.words[0].lemma if len(self.words) > 0 else None


class LexisCR(ComponentRepresentation):
    def _render(self):
        return self.data['lexis']


class WordFormAllCR(ComponentRepresentation):
    def _render(self):
        if len(self.words) == 0:
            return None
        else:
            forms = [w.text.lower() for w in self.words]
            return "/".join(set(forms))
class WordFormAnyCR(ComponentRepresentation):
    def _render(self):
        # remember the most frequent text of each (msd, lemma) pair;
        # most_common() is walked in reverse so that the most frequent entry
        # is written last and therefore wins
        text_forms = {}
        msd_lemma_txt_triplets = Counter([(w.msd, w.lemma, w.text) for w in self.words])
        for (msd, lemma, text), _n in reversed(msd_lemma_txt_triplets.most_common()):
            text_forms[(msd, lemma)] = text

        words_counter = []
        for word in self.words:
            words_counter.append((word.msd, word.lemma))
        sorted_words = sorted(set(words_counter), key=lambda x: -words_counter.count(x))

        # pick the most frequent (msd, lemma) that satisfies all agreements
        for word_msd, word_lemma in sorted_words:
            for agr in self.agreement:
                if not agr.match(word_msd):
                    break
            else:
                for agr in self.agreement:
                    agr.confirm_match()

                if word_lemma is None:
                    return None
                else:
                    return text_forms[(word_msd, word_lemma)]
class WordFormMsdCR(WordFormAnyCR):
    def __init__(self, *args):
        super().__init__(*args)
        self.lemma = None
        self.msd = None

    def check_msd(self, word_msd):
        if 'msd' not in self.data:
            return True

        selectors = self.data['msd']
        for key, value in selectors.items():
            t = word_msd[0]
            v = TAGSET[t].index(key.lower())
            f1 = word_msd[v + 1]
            f2 = CODES[value]

            if '-' not in [f1, f2] and f1 != f2:
                return False

        return True

    def add_word(self, word):
        if self.lemma is None:
            self.lemma = word.lemma
            self.msd = word.msd

        if self.check_msd(word.msd):
            super().add_word(word)

    def _render(self):
        # append a lemma-only backup word, so that rendering can fall back to
        # the lemma's own msd if no real word form matched
        msd = self.word_renderer.get_lemma_msd(self.lemma, self.msd)
        WordLemma = namedtuple('WordLemmaOnly', 'msd most_frequent_text lemma text')
        backup_word = WordLemma(msd=msd, most_frequent_text=lambda *x: None, lemma=None, text=None)

        self.words.append(backup_word)
        return super()._render()
class WordFormAgreementCR(WordFormMsdCR):
    def __init__(self, data, word_renderer):
        super().__init__(data, word_renderer)
        self.rendition_candidate = None

    def get_agreement(self):
        return self.data['other']

    def match(self, word_msd):
        existing = [(w.msd, w.text) for w in self.words]
        for candidate_msd, candidate_text in self.word_renderer.available_words(self.lemma, existing):
            if self.msd[0] != candidate_msd[0]:
                continue

            if WordFormAgreementCR.check_agreement(word_msd, candidate_msd, self.data['agreement']):
                if self.check_msd(candidate_msd):
                    self.rendition_candidate = candidate_text
                    return True

        return False

    def confirm_match(self):
        self.rendition_text = self.rendition_candidate

    @staticmethod
    def check_agreement(msd1, msd2, agreements):
        for agr_case in agreements:
            t1 = msd1[0]
            # if the category is not in the tagset, a strange msd was tried; skip it
            if agr_case not in TAGSET[t1]:
                logging.warning("Cannot do agreement: {} for msd {} not found!".format(agr_case, msd1))
                return False

            v1 = TAGSET[t1].index(agr_case)
            # if the attribute is not specified (e.g. for an infinitive), it always agrees
            if v1 + 1 >= len(msd1):
                continue
            # the first character is the uppercase category, which is not in TAGSET
            m1 = msd1[v1 + 1]

            # repeat for the second msd (not DRY!)
            t2 = msd2[0]
            if agr_case not in TAGSET[t2]:
                logging.warning("Cannot do agreement: {} for msd {} not found!".format(agr_case, msd2))
                return False

            v2 = TAGSET[t2].index(agr_case)
            if v2 + 1 >= len(msd2):
                continue
            m2 = msd2[v2 + 1]

            # values must match unless one of them is unspecified ('-')
            if '-' not in [m1, m2] and m1 != m2:
                return False

        return True

    def render(self):
        # rendition_text is set in confirm_match, once agreement is resolved
        pass
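
# ComponentRendition reads the <representation> features of a structure
# definition and picks which ComponentRepresentation subclass to instantiate
# for each component.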
class ComponentRendition:
    def __init__(self):
        self.more = {}
        self.representation_factory = ComponentRepresentation

    def add_feature(self, feature):
        if 'rendition' in feature:
            if feature['rendition'] == "lemma":
                self.representation_factory = LemmaCR
            elif feature['rendition'] == "word_form":
                # the default; a later 'selection' feature may change it
                self.representation_factory = WordFormAnyCR
            elif feature['rendition'] == "lexis":
                self.representation_factory = LexisCR
                self.more['lexis'] = feature['string']
            else:
                raise NotImplementedError("Representation rendition: {}".format(feature))

        elif 'selection' in feature:
            if feature['selection'] == "msd":
                # the factory may already be set to agreement; keep it in that case
                if self.representation_factory != WordFormAgreementCR:
                    self.representation_factory = WordFormMsdCR
                self.more['msd'] = {k: v for k, v in feature.items() if k != 'selection'}
            elif feature['selection'] == "all":
                self.representation_factory = WordFormAllCR
            elif feature['selection'] == 'agreement':
                assert feature['head'][:4] == 'cid_'
                assert feature['msd'] is not None
                self.representation_factory = WordFormAgreementCR
                self.more['agreement'] = feature['msd'].split('+')
                self.more['other'] = feature['head'][4:]
            else:
                raise NotImplementedError("Representation selection: {}".format(feature))

        else:
            return None

    def cr_instance(self, word_renderer):
        return self.representation_factory(self.more, word_renderer)

    @staticmethod
    def set_representations(matches, structure, word_renderer):
        representations = {}
        for c in structure.components:
            representations[c.idx] = []
            for rep in c.representation:
                representations[c.idx].append(rep.cr_instance(word_renderer))

        # wire up agreements: each agreement must point at a component with
        # exactly one representation
        for cid, reps in representations.items():
            for rep in reps:
                for agr in rep.get_agreement():
                    if len(representations[agr]) != 1:
                        n = len(representations[agr])
                        raise NotImplementedError(
                            "Structure {}: component {} has agreement with component {},"
                            " however there are {} (!= 1) representations of component {}!"
                            .format(structure.id, cid, agr, n, agr))
                    representations[agr][0].agreement.append(rep)

        # first pass: hand every matched word to its representations;
        # agreements are only checked later, at render time
        for words in matches.matches:
            for w_id, w in words.items():
                component = structure.get_component(w_id)
                component_representations = representations[component.idx]
                for representation in component_representations:
                    representation.add_word(w)

        for cid, reps in representations.items():
            for rep in reps:
                rep.render()

        for cid, reps in representations.items():
            reps = [rep.rendition_text for rep in reps]
            if len(reps) == 0:
                pass
            elif all(r is None for r in reps):
                matches.representations[cid] = None
            else:
                matches.representations[cid] = " ".join(("" if r is None else r) for r in reps)
class ComponentStatus(Enum):
    Optional = 0
    Required = 1
    Forbidden = 2
def get_level(restriction):
    # return the 'level' attribute of the first feature that defines one
    for feature in restriction:
        if "level" in feature.keys():
            return feature.get("level")

    raise RuntimeError("Unreachable!")
def determine_ppb(rgx):
    # rank how strong a content word ("polnopomenska beseda") this restriction
    # matches: lower values are preferred when picking core components
    if rgx[0] in ("A", "N", "R"):
        return 0
    elif rgx[0] == "V":
        if 'a' in rgx[1]:
            return 3
        elif 'm' in rgx[1]:
            return 1
        else:
            return 2
    else:
        return 4
def build_morphology_regex(restriction):
    restr_dict = {}
    for feature in restriction:
        feature_dict = dict(feature.items())

        match_type = True
        if "filter" in feature_dict:
            assert feature_dict['filter'] == "negative"
            match_type = False
            del feature_dict['filter']

        assert len(feature_dict) == 1
        key, value = next(iter(feature_dict.items()))
        restr_dict[key] = (value, match_type)

    assert 'POS' in restr_dict
    category = restr_dict['POS'][0].capitalize()
    cat_code = CODES[category]
    rgx = [cat_code] + CATEGORY_BASES[cat_code]

    del restr_dict['POS']

    min_msd_length = 1
    for attribute, (value, typ) in restr_dict.items():
        index = TAGSET[cat_code].index(attribute.lower())
        assert index >= 0

        if '|' in value:
            match = "".join(CODES[val] for val in value.split('|'))
        else:
            match = CODES[value]

        match = "[{}{}]".format("" if typ else "^", match)
        rgx[index + 1] = match

        if typ:
            min_msd_length = max(index + 1, min_msd_length)

    re_objects = [re.compile(r) for r in rgx]

    def matcher(text):
        if len(text) <= min_msd_length:
            return False
        for c, r in zip(text, re_objects):
            if not r.match(c):
                return False
        return True

    return rgx, matcher
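
# For example, a (hypothetical) restriction with features POS="noun" and
# case="genitive" yields rgx ['N', '.', '.', '.', '[g]', '.'] and a matcher
# that accepts MSDs such as 'Ncmsg' but rejects 'Ncmsa'.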
def build_lexis_regex(restriction):
    restr_dict = {}
    for feature in restriction:
        restr_dict.update(feature.items())

    assert "lemma" in restr_dict
    match_list = restr_dict['lemma'].split('|')
    return match_list, lambda text: text in match_list
class Restriction:
    def __init__(self, restriction_tag):
        # ppb: "polnopomenska beseda" (content word) rank, 0 (strongest) to 4
        self.ppb = 4

        if restriction_tag is None:
            self.type = RestrictionType.MatchAll
            self.matcher = None
            self.present = None
            return

        restriction_type = restriction_tag.get('type')
        if restriction_type == "morphology":
            self.type = RestrictionType.Morphology
            present, self.matcher = build_morphology_regex(list(restriction_tag))
            self.present = " ".join(present)
            self.ppb = determine_ppb(present)
        elif restriction_type == "lexis":
            self.type = RestrictionType.Lexis
            self.present, self.matcher = build_lexis_regex(list(restriction_tag))
        else:
            raise NotImplementedError()

    def match(self, word):
        if self.type == RestrictionType.Morphology:
            match_to = word.msd
        elif self.type == RestrictionType.Lexis:
            match_to = word.lemma
        elif self.type == RestrictionType.MatchAll:
            return True
        else:
            raise RuntimeError("Unreachable!")

        return self.matcher(match_to)
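
# A Component is one node of a structure definition: restrictions decide which
# words it can match, next_element lists its outgoing dependency links, and
# representation says how a match should be rendered in the output.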
class Component:
    def __init__(self, info):
        idx = info['cid']
        name = info['name'] if 'name' in info else None
        typ = ComponentType.Core if info['type'] == "core" else ComponentType.Other

        if 'status' not in info:
            status = ComponentStatus.Required
        elif info['status'] == 'forbidden':
            status = ComponentStatus.Forbidden
        elif info['status'] == 'obligatory':
            status = ComponentStatus.Required
        elif info['status'] == 'optional':
            status = ComponentStatus.Optional
        else:
            raise NotImplementedError("strange status: {}".format(info['status']))

        self.status = status
        self.name = name
        self.idx = idx
        self.restrictions = []
        self.next_element = []
        self.representation = []
        self.selection = {}
        self.type = typ

        self.iter_ctr = 0

    def add_next(self, next_component, link_label, order):
        self.next_element.append((next_component, link_label, Order.new(order)))

    def set_restriction(self, restrictions_tag):
        if restrictions_tag is None:
            self.restrictions = [Restriction(None)]
        elif restrictions_tag.tag == "restriction":
            self.restrictions = [Restriction(restrictions_tag)]
        elif restrictions_tag.tag == "restriction_or":
            self.restrictions = [Restriction(el) for el in restrictions_tag]
        else:
            raise RuntimeError("Unreachable")

    def set_representation(self, representation):
        for rep in representation:
            crend = ComponentRendition()
            for feature in rep:
                crend.add_feature(feature.attrib)
            self.representation.append(crend)

    def find_next(self, deps, comps, restrs, reprs):
        to_ret = []
        for d in deps:
            if d[0] == self.idx:
                _, idx, dep_label, order = d

                next_component = Component(comps[idx])
                next_component.set_restriction(restrs[idx])
                next_component.set_representation(reprs[idx])
                to_ret.append(next_component)

                self.add_next(next_component, dep_label, order)
                others = next_component.find_next(deps, comps, restrs, reprs)
                to_ret.extend(others)

        return to_ret

    def name_str(self):
        return "_" if self.name is None else self.name
    def match(self, word):
        m1 = self._match_self(word)
        if m1 is None:
            return None

        mn = self._match_next(word)
        if mn is None:
            return None

        to_ret = [m1]
        for cmatch in mn:
            # the link matched, but there is nothing to add
            if len(cmatch) == 0:
                continue

            # more than one match found for this particular component
            elif len(cmatch) > 1:
                # if we already have multiple matches in multiple components, NOPE!
                if len(to_ret) > 1:
                    logging.warning("Strange multiple match: {}".format(
                        str([w.id for w in cmatch[0].values()])))

                    for tr in to_ret:
                        tr.update(cmatch[0])
                    continue

                # we have found more than one match, so to_ret grows to
                # more than one element
                to_ret = [{**dict(to_ret[0]), **m} for m in cmatch]

            else:
                for tr in to_ret:
                    tr.update(cmatch[0])

        return to_ret

    def _match_self(self, word):
        # restrictions are OR-ed: matching any one of them is enough
        for restr in self.restrictions:
            if restr.match(word):
                return {self.idx: word}

    def _match_next(self, word):
        # find matches for every component linked from this one
        to_ret = []

        # need to get all links that match
        for next, link, order in self.next_element:
            next_links = word.get_links(link)
            to_ret.append([])

            # a Required component starts out "not good" until it matches
            good = next.status != ComponentStatus.Required

            for next_word in next_links:
                if not order.match(word, next_word):
                    continue

                match = next.match(next_word)
                if match is not None:
                    # special treatment for forbidden components
                    if next.status == ComponentStatus.Forbidden:
                        good = False
                        break
                    else:
                        assert type(match) is list
                        to_ret[-1].extend(match)
                        good = True

            # if none matched, nothing found!
            if not good:
                return None

        return to_ret
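
# A SyntacticStructure is a whole structure definition: its components are
# built from the JOS dependency/definition lists, starting at the fake root
# component '#'.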
class SyntacticStructure:
    def __init__(self):
        self.id = None
        self.lbs = None
        self.components = []

    @staticmethod
    def from_xml(xml):
        st = SyntacticStructure()
        st.id = xml.get('id')
        st.lbs = xml.get('LBS')

        assert len(list(xml)) == 1
        system = next(iter(xml))

        assert system.get('type') == 'JOS'
        components, dependencies, definitions = list(system)

        deps = [(dep.get('from'), dep.get('to'), dep.get('label'), dep.get('order'))
                for dep in dependencies]
        comps = {comp.get('cid'): dict(comp.items()) for comp in components}

        restrs, forms = {}, {}
        for comp in definitions:
            n = comp.get('cid')
            restrs[n] = None
            forms[n] = []

            for el in comp:
                if el.tag.startswith("restriction"):
                    assert restrs[n] is None
                    restrs[n] = el
                elif el.tag.startswith("representation"):
                    st.add_representation(n, el, forms)
                else:
                    raise NotImplementedError(
                        "Unknown definition: {} in structure {}".format(el.tag, st.id))

        fake_root_component = Component({'cid': '#', 'type': 'other'})
        st.components = fake_root_component.find_next(deps, comps, restrs, forms)

        st.determine_core2w()
        return st
    def determine_core2w(self):
        ppb_components = []
        for c in self.components:
            if c.type != ComponentType.Core:
                continue

            ppb = 4
            for r in c.restrictions:
                ppb = min(r.ppb, ppb)

            ppb_components.append((c, ppb))

        ppb_components = sorted(ppb_components, key=lambda c: c[1])
        if len(ppb_components) > 2 and ppb_components[1][1] == ppb_components[2][1]:
            raise RuntimeError(
                "Cannot determine 2 'jedrna polnopomenska beseda' (core content words)"
                " for {}".format(self.id))

        for c, _ in ppb_components[:2]:
            c.type = ComponentType.Core2w
    def add_representation(self, n, rep_el, forms):
        assert rep_el.tag == "representation"
        to_add = []
        for el in rep_el:
            assert el.tag == "feature"
            if 'rendition' in el.attrib or 'selection' in el.attrib:
                to_add.append(el)
            else:
                logging.warning("Strange representation feature in structure {}. Skipping"
                                .format(self.id))
                continue
        forms[n].append(to_add)

    def get_component(self, idx):
        for c in self.components:
            if c.idx == idx:
                return c
        raise RuntimeError("Unknown component id: {}".format(idx))

    def match(self, word):
        matches = self.components[0].match(word)
        return [] if matches is None else matches
def load_structures(filename):
    with open(filename, 'r') as fp:
        et = ElementTree.XML(fp.read())
    return build_structures(et), get_lemma_features(et)


def build_structures(et):
    structures = []
    for structure in et.iter('syntactic_structure'):
        to_append = SyntacticStructure.from_xml(structure)
        if to_append is None:
            continue
        structures.append(to_append)
    return structures


def get_lemma_features(et):
    lf = et.find('lemma_features')
    if lf is None:
        return {}

    result = {}
    for pos in lf.iter('POS'):
        rgx_list, _ = build_morphology_regex(pos)
        rgx_str = ""
        for position in rgx_list:
            if position == ".":
                rgx_str += " "
            elif len(position) == 1:
                rgx_str += position
            elif len(position) == 3 and position[0] == "[" and position[2] == "]":
                rgx_str += position[1]
            else:
                raise RuntimeError("Strange rgx for lemma_feature...")

        assert rgx_str[0].isupper()
        result[rgx_str[0]] = rgx_str.strip().replace(' ', '-')

    return result
def get_msd(comp):
    d = dict(comp.items())
    if 'msd' in d:
        return d['msd']
    elif 'ana' in d:
        return d['ana'][4:]
    else:
        logging.error(str(d))
        raise NotImplementedError("MSD?")


def lemma_only_word(msd):
    if msd is None:
        return None
    else:
        WordLemma = namedtuple('WordLemmaOnly', 'msd most_frequent_text lemma text')
        return WordLemma(msd=msd, most_frequent_text=lambda *x: None, lemma=None, text=None)
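
# A Word is a single <w> (or punctuation <pc>) token with its lemma, MSD and
# outgoing dependency links.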
class Word:
    def __init__(self, xml, do_msd_translate):
        self.lemma = xml.get('lemma')
        self.msd = MSD_TRANSLATE[get_msd(xml)] if do_msd_translate else get_msd(xml)
        self.id = xml.get('id')
        self.text = xml.text
        self.links = defaultdict(list)

        # numeric token index within the sentence, used for word-order checks
        last_num = self.id.split('.')[-1]
        if last_num[0] not in '0123456789':
            last_num = last_num[1:]
        self.int_id = int(last_num)

        assert None not in (self.id, self.lemma, self.msd)

    @staticmethod
    def pcWord(pc, do_msd_translate):
        pc.set('lemma', pc.text)
        pc.set('msd', "N" if do_msd_translate else "U")
        return Word(pc, do_msd_translate)

    def add_link(self, link, to):
        self.links[link].append(to)

    def get_links(self, link):
        if link not in self.links and "|" in link:
            for l in link.split('|'):
                self.links[link].extend(self.links[l])

        return self.links[link]

    def most_frequent_text(self, word_renderer):
        return word_renderer.render(self.lemma, self.msd)
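
# WordMsdRenderer indexes the corpus by lemma and MSD so the representation
# classes above can look up the most frequent surface form of a (lemma, msd)
# pair, as well as a generalized "common" MSD for each lemma.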
class WordMsdRenderer:
    def __init__(self, lemma_features):
        self.all_words = []
        self.rendered_words = {}
        self.frequent_words = {}
        self.num_words = defaultdict(int)
        self.lemma_msd = {}
        self.lemma_features = lemma_features
        self.memoized_msd_merges = {}

    def add_words(self, words):
        self.all_words.extend(words)

    def num_all_words(self):
        return len(self.all_words)

    def generate_renders(self):
        data = defaultdict(lambda: defaultdict(list))
        for w in self.all_words:
            data[w.lemma][w.msd].append(w.text)

        for lemma, ld in data.items():
            self.rendered_words[lemma] = {}
            freq_words = defaultdict(int)
            common_msd = "*" * 10

            for msd, texts in ld.items():
                self.num_words[(lemma, msd[0])] += len(texts)

                # the most frequent text of this (lemma, msd) is its render
                rep = max(set(texts), key=texts.count)
                self.rendered_words[lemma][msd] = (rep, len(texts))

                for txt in texts:
                    freq_words[(msd, txt)] += 1

                common_msd = self.merge_msd(common_msd, msd)

            self.lemma_msd[lemma] = common_msd

            self.frequent_words[lemma] = []
            for (msd, txt), n in sorted(freq_words.items(), key=lambda x: -x[1]):
                self.frequent_words[lemma].append((msd, txt, n))

        # fill unspecified positions of the common msd from the per-category
        # lemma_features given in the structures file
        lf = self.lemma_features
        for lemma in self.lemma_msd.keys():
            cmsd = self.lemma_msd[lemma]
            if cmsd[0] in lf:
                self.lemma_msd[lemma] = "".join(
                    l1 if l1 != "-" else l2 for l1, l2 in zip(lf[cmsd[0]], cmsd)
                )
    def merge_msd(self, common_msd, new_msd):
        key = (common_msd, new_msd)
        if key in self.memoized_msd_merges:
            return self.memoized_msd_merges[key]

        def merge_letter(l1, l2):
            if l1 == "*":
                return l2
            elif l1 != l2:
                return "-"
            else:
                return l1

        value = "".join(merge_letter(l1, l2) for l1, l2 in zip(common_msd, new_msd))
        self.memoized_msd_merges[key] = value
        return value
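
    # For example (hypothetical MSDs): merging the initial "*" * 10 with
    # 'Ncmsg' gives 'Ncmsg', and merging that with 'Ncfpg' gives 'Nc--g';
    # positions where the observed forms disagree are generalized to '-'.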
    def render(self, lemma, msd):
        if lemma in self.rendered_words:
            if msd in self.rendered_words[lemma]:
                return self.rendered_words[lemma][msd][0]

    def available_words(self, lemma, existing_texts):
        counted_texts = Counter(existing_texts)
        for (msd, text), _n in counted_texts.most_common():
            yield (msd, text)

        if lemma in self.frequent_words:
            for msd, text, _ in self.frequent_words[lemma]:
                if (msd, text) not in counted_texts:
                    yield (msd, text)

    def get_lemma_msd(self, lemma, word_msd):
        # the lemma must be present, since we collect every lemma
        lemma_msd = self.lemma_msd[lemma]

        if lemma_msd[0] == '-':
            if word_msd[0] in self.lemma_features:
                return self.lemma_features[word_msd[0]]
            else:
                return '-'
        else:
            return lemma_msd
def is_root_id(id_):
    return len(id_.split('.')) == 3


def load_files(args):
    filenames = args.input
    skip_id_check = args.skip_id_check
    do_msd_translate = not args.no_msd_translate

    for n, fname in enumerate(filenames):
        if args.count_files:
            status = " :: {} / {}".format(n, len(filenames))
        else:
            status = ""
        yield load_tei_file(fname, skip_id_check, do_msd_translate, args.pc_tag, status)
def load_tei_file(filename, skip_id_check, do_msd_translate, pc_tag, status):
    logging.info("LOADING FILE: {}{}".format(filename, status))

    with open(filename, 'r') as fp:
        xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1)
    xmlstring = xmlstring.replace(' xml:', ' ')
    et = ElementTree.XML(xmlstring)

    words = {}
    for w in et.iter("w"):
        words[w.get('id')] = Word(w, do_msd_translate)
    for pc in et.iter(pc_tag):
        words[pc.get('id')] = Word.pcWord(pc, do_msd_translate)

    for l in et.iter("link"):
        if 'dep' in l.keys():
            ana = l.get('afun')
            lfrom = l.get('from')
            dest = l.get('dep')
        else:
            ana = l.get('ana')
            if ana[:4] != 'syn:':  # don't bother with non-syntactic links
                continue
            ana = ana[4:]
            lfrom, dest = l.get('target').replace('#', '').split()

        if lfrom in words:
            if not skip_id_check and is_root_id(lfrom):
                logging.error("NOO: {}".format(lfrom))
                sys.exit(1)

            if dest in words:
                next_word = words[dest]
                words[lfrom].add_link(ana, next_word)
            else:
                logging.error("Unknown id: {}".format(dest))
                sys.exit(1)
        else:
            # strange errors, just skip them...
            pass

    return list(words.values())
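
# Writer serializes the matches to CSV: the "all" writer dumps every token of
# every match, while the regular output writer groups matches by colocation id
# and adds representative forms and frequencies.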
class Writer:
    @staticmethod
    def make_output_writer(args):
        return Writer(False, args.output, args.multiple_output, int(args.sort_by), args.sort_reversed)

    @staticmethod
    def make_all_writer(args):
        return Writer(True, args.all, False, -1, False)

    def __init__(self, all, filename, multiple_output, sort_by, sort_reversed):
        self.all = all
        self.output_file = filename
        self.multiple_output = multiple_output
        self.sort_by = sort_by
        self.sort_order = sort_reversed

    def header(self):
        cols = ["Lemma"]
        if self.all:
            cols = ["Token_ID", "Word_form"] + cols + ["Msd"]
        else:
            cols.extend(["Representative_form", "RF_scenario"])

        assert len(cols) == self.length()
        cols = ["C{}_{}".format(i + 1, thd) for i in range(MAX_NUM_COMPONENTS) for thd in cols]
        cols = ["Structure_ID"] + cols + ["Colocation_ID"]

        if not self.all:
            cols += ["Joint_representative_form", "Frequency"]
        return cols

    def length(self):
        return 4 if self.all else 3

    def from_word(self, word, representation, rep_exists):
        if word is None:
            return [""] * self.length()
        elif self.all:
            return [word.id, word.text, word.lemma, word.msd]
        elif not rep_exists:
            return [word.lemma, "", ""]
        elif representation is None:
            return [word.lemma, word.lemma, "lemma_fallback"]
        else:
            return [word.lemma, representation, "ok"]

    def sorted_rows(self, rows):
        if self.sort_by < 0 or len(rows) < 2:
            return rows

        if len(rows[0]) <= self.sort_by:
            logging.warning("Cannot sort by column #{}: Not enough columns!".format(self.sort_by))
            return rows

        try:
            # sort numerically if the column is numeric, otherwise case-insensitively
            int(rows[0][self.sort_by])
            key = lambda row: int(row[self.sort_by])
        except ValueError:
            key = lambda row: row[self.sort_by].lower()

        return sorted(rows, key=key, reverse=self.sort_order)
    def write_header(self, file_handler):
        file_handler.write(", ".join(self.header()) + "\n")

    def write_out_worker(self, file_handler, structure_id, components, colocation_ids):
        rows = []
        for cid, m, freq, rprsnt in colocation_ids.get_matches_for(structure_id, not self.all):
            to_write = []
            representation = ""

            for idx, _comp in enumerate(components):
                idx = str(idx + 1)
                word = m[idx] if idx in m else None
                rep_exists = idx in rprsnt
                rep = rprsnt[idx] if rep_exists else None
                to_write.extend(self.from_word(word, rep, rep_exists))
                representation += " " + to_write[-2]

            # pad the row so that all rows are of equal size
            to_write.extend([""] * (MAX_NUM_COMPONENTS * self.length() - len(to_write)))
            to_write = [structure_id] + to_write + [cid]

            if not self.all:
                representation = re.sub(' +', ' ', representation)
                to_write.append(representation.strip())
                to_write.append(str(freq))

            rows.append(to_write)

        if len(rows) > 0:
            rows = self.sorted_rows(rows)
            file_handler.write("\n".join([", ".join(row) for row in rows]) + "\n")
            file_handler.flush()

    def write_out(self, structures, colocation_ids):
        def fp_close(fp_):
            if fp_ != sys.stdout:
                fp_.close()

        def fp_open(snum=None):
            if self.output_file is None:
                return sys.stdout
            elif snum is None:
                return open(self.output_file, "w")
            else:
                return open("{}.{}".format(self.output_file, snum), "w")

        if not self.multiple_output:
            fp = fp_open()
            self.write_header(fp)

        for s in structures:
            if self.multiple_output:
                fp = fp_open(s.id)
                self.write_header(fp)

            self.write_out_worker(fp, s.id, s.components, colocation_ids)

            if self.multiple_output:
                fp_close(fp)

        if not self.multiple_output:
            fp_close(fp)
class StructureMatch:
    def __init__(self, match_id, structure_id):
        self.match_id = match_id
        self.structure_id = structure_id

        self.matches = []
        self.representations = {}

    def distinct_matches(self):
        dm = set()
        keys = list(self.matches[0].keys())
        for words in self.matches:
            dm.add(" ".join(words[k].text for k in keys))
        return len(dm)

    def append(self, match):
        self.matches.append(match)

    def __len__(self):
        return len(self.matches)
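
# ColocationIds groups matches that share the same (structure id, component
# lemmas) key and assigns each group a numeric colocation id in order of first
# appearance.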
class ColocationIds:
    def __init__(self, min_frequency=0):
        self.data = {}
        # passed in explicitly instead of reading the global args, so the
        # class also works when this module is imported
        self.min_frequency = min_frequency
        self.dispersions = {}

    def _add_match(self, key, sid, match):
        if key not in self.data:
            self.data[key] = StructureMatch(str(len(self.data) + 1), sid)
        self.data[key].append(match)

    def get(self, key, n):
        return self.data[key][n]

    def add_matches(self, matches):
        for sid, nms in matches.items():
            for nm in nms:
                self._add_match(nm[1], sid, nm[0])

    def get_matches_for(self, structure_id, group):
        for _cid_tup, sm in self.data.items():
            if sm.structure_id != structure_id:
                continue

            for words in sm.matches:
                yield (sm.match_id, words, len(sm), sm.representations)
                if group:
                    break

    def set_representations(self, structures, word_renderer):
        components_dict = {structure.id: structure for structure in structures}
        for _1, sm in tqdm(self.data.items()):
            ComponentRendition.set_representations(sm, components_dict[sm.structure_id], word_renderer)

    def determine_colocation_dispersions(self):
        dispersions = defaultdict(int)
        for (structure_id, *word_tups) in self.data.keys():
            for component_id, lemma in word_tups:
                dispersions[(structure_id, component_id, lemma)] += 1
        self.dispersions = dict(dispersions)
def match_file(words, structures):
    matches = {s.id: [] for s in structures}

    for s in tqdm(structures):
        for w in words:
            mhere = s.match(w)
            for match in mhere:
                colocation_id = [(idx, w.lemma) for idx, w in match.items()]
                colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x: x[0]))
                colocation_id = tuple(colocation_id)

                matches[s.id].append((match, colocation_id))

    return matches
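
# main() either processes the input files sequentially or, with --parallel N,
# re-invokes this script once per input file in a subprocess; each child
# pickles its (words, matches) to a temporary file via --match-to-file and the
# parent merges the results.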
def main(input_file, structures_file, args):
    structures, lemma_msds = load_structures(structures_file)
    colocation_ids = ColocationIds(min_frequency=args.min_freq)
    word_renderer = WordMsdRenderer(lemma_msds)
    if args.parallel:
        num_parallel = int(args.parallel)

        # make a temporary directory to hold the children's match files
        with tempfile.TemporaryDirectory() as tmpdirname:
            # rebuild our own command line, minus the input files and the
            # "--parallel X" flag (work on a copy so sys.argv stays intact)
            cmd = list(sys.argv)
            for inpt in args.input:
                if inpt in cmd:
                    cmd.remove(inpt)

            pidx = cmd.index('--parallel')
            del cmd[pidx]
            del cmd[pidx]

            def func(n):
                cmdn = [sys.executable] + cmd + [
                    args.input[n], "--match-to-file", "{}/{}.p".format(tmpdirname, n)]
                subprocess.check_call(cmdn)
                return n

            # use a ThreadPoolExecutor to run the subprocesses in parallel
            with concurrent.futures.ThreadPoolExecutor(max_workers=num_parallel) as executor:
                # fancy interface to wait for the threads to finish
                for id_input in executor.map(func, [i for i, _ in enumerate(args.input)]):
                    with open("{}/{}.p".format(tmpdirname, id_input), "rb") as fp:
                        words, matches = pickle.load(fp)

                    colocation_ids.add_matches(matches)
                    word_renderer.add_words(words)
    else:
        for words in load_files(args):
            matches = match_file(words, structures)

            # --match-to-file: just save the matches to a temporary file and
            # stop; used for the children of a parallel run, which therefore
            # MUST NOT receive more than one input file
            if args.match_to_file is not None:
                with open(args.match_to_file, "wb") as fp:
                    pickle.dump((words, matches), fp)
                    return
            else:
                colocation_ids.add_matches(matches)
                word_renderer.add_words(words)

    # get the word renders for lemma/msd
    word_renderer.generate_renders()
    colocation_ids.determine_colocation_dispersions()

    if args.output:
        # figure out the representations!
        colocation_ids.set_representations(structures, word_renderer)
        Writer.make_output_writer(args).write_out(structures, colocation_ids)
    if args.all:
        Writer.make_all_writer(args).write_out(structures, colocation_ids)
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Extract structures from a parsed corpus.')
    parser.add_argument('structures', help='Structure definitions in an xml file')
    parser.add_argument('input', help='Input xml file(s) in ssj500k form; more than one may be listed', nargs='+')
    parser.add_argument('--output', help='Output file (if none is given, output goes to stdout)')
    parser.add_argument('--all', help='Additional output file that writes more data')
    parser.add_argument('--no-msd-translate', help='MSDs are translated from Slovene to English by default', action='store_true')
    parser.add_argument('--skip-id-check', help='Skip checking that the ids of <w> and <pc> are in the correct format', action='store_true')
    parser.add_argument('--min_freq', help='Minimal frequency in the output', type=int, default=0, const=1, nargs='?')
    parser.add_argument('--verbose', help='Enable verbose output to stderr', choices=["warning", "info", "debug"], default="info", const="info", nargs='?')
    parser.add_argument('--count-files', help='Count files: more verbose output', action='store_true')
    parser.add_argument('--multiple-output', help='Generate one output for each syntactic structure', action='store_true')
    parser.add_argument('--sort-by', help='Sort by this column (index)', type=int, default=-1)
    parser.add_argument('--sort-reversed', help='Sort in reversed order', action='store_true')
    parser.add_argument('--pc-tag', help='Tag for separators, usually pc or c', default="pc")
    parser.add_argument('--parallel', help='Run in multiple processes; should speed things up')
    parser.add_argument('--match-to-file', help='For internal use only (set by --parallel)!')
    args = parser.parse_args()

    logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())

    start = time.time()
    main(args.input, args.structures, args)
    logging.info("TIME: {}".format(time.time() - start))
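
    # Example invocation (hypothetical file names):
    #   python wani.py structures.xml corpus1.xml corpus2.xml --output out.csv --parallel 2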