luscenje_struktur/wani.py

1327 lines
42 KiB
Python
Raw Normal View History

2018-10-29 10:29:51 +00:00
from xml.etree import ElementTree
import re
from enum import Enum
from collections import defaultdict, namedtuple, Counter
2019-01-08 20:13:36 +00:00
import sys
2019-01-19 21:42:51 +00:00
import logging
2019-02-06 14:28:39 +00:00
import argparse
2019-02-09 12:40:57 +00:00
import pickle
2019-02-06 14:29:37 +00:00
import time
import subprocess
import concurrent.futures
import tempfile
2018-10-29 10:29:51 +00:00
from msd_translate import MSD_TRANSLATE
2019-06-03 07:47:36 +00:00
try:
from tqdm import tqdm
except ImportError:
tqdm = lambda x: x
2018-10-29 10:29:51 +00:00
# NOTE(review): presumably the upper bound on the number of components a
# single structure may have; it is not referenced in this part of the
# file -- confirm against its users before changing it.
MAX_NUM_COMPONENTS = 5
2018-10-29 10:29:51 +00:00
# Maps the spelled-out names used in structure definitions to the
# single-character codes that appear in msd strings.
# The capitalised names are word categories; their codes are the keys of
# TAGSET.  The lower-case names are attribute values.
# The mapping is not injective (e.g. 'proper', 'plural' and 'participle' all
# map to 'p'), so a code is only meaningful together with the msd position
# it is compared against.
CODES = {
    # word categories
    "Noun": "N",
    "Verb": "V",
    "Adjective": "A",
    "Adverb": "R",
    "Pronoun": "P",
    "Numeral": "M",
    "Preposition": "S",
    "Conjunction": "C",
    "Particle": "Q",
    "Interjection": "I",
    "Abbreviation": "Y",
    "Residual": "X",
    # attribute values
    'common': 'c',
    'proper': 'p',
    'masculine': 'm',
    'feminine': 'f',
    'neuter': 'n',
    "singular": "s",
    "dual": "d",
    "plural": "p",
    "nominative": "n",
    "genitive": "g",
    "dative": "d",
    "accusative": "a",
    "locative": "l",
    "instrumental": "i",
    "no": "n",
    "yes": "y",
    "main": "m",
    "auxiliary": "a",
    "perfective": "e",
    "progressive": "p",
    "biaspectual": "b",
    "infinitive": "n",
    "supine": "u",
    "participle": "p",
    "present": "r",
    "future": "f",
    "conditional": "c",
    "imperative": "m",
    "first": "1",
    "second": "2",
    "third": "3",
    "general": "g",
    "possessive": "s",
    "positive": "p",
    "comparative": "c",
    "superlative": "s",
    "personal": "p",
    "demonstrative": "d",
    "relative": "r",
    "reflexive": "x",
    "interrogative": "q",
    "indefinite": "i",
    "negative": "z",
    "bound": "b",
    "digit": "d",
    "roman": "r",
    "letter": "l",
    "cardinal": "c",
    "ordinal": "o",
    "pronominal": "p",
    "special": "s",
    "coordinating": "c",
    "subordinating": "s",
    "foreign": "f",
    "typo": "t",
    "program": "p",
}
# For every category code, the ordered list of attribute names of that
# category.  Code in this file reads attribute number i (0-based position in
# the list) from character i + 1 of an msd string; character 0 of the msd is
# the category code itself.
TAGSET = {
    "N": ['type', 'gender', 'number', 'case', 'animate'],
    "V": ['type', 'aspect', 'vform', 'person', 'number', 'gender', 'negative'],
    "A": ['type', 'degree', 'gender', 'number', 'case', 'definiteness'],
    "R": ['type', 'degree'],
    "P": ['type', 'person', 'gender', 'number', 'case', 'owner_number', 'owned_gender', 'clitic'],
    "M": ['form', 'type', 'gender', 'number', 'case', 'definiteness'],
    "S": ['case'],
    "C": ['type'],
    # categories without attributes
    "Q": [],
    "I": [],
    "Y": [],
    "X": ['type']
}
# Default per-category regex parts: one "match any character" entry ('.') for
# every attribute slot of the category.  build_morphology_regex prepends the
# category code and then overwrites entry TAGSET[code].index(attribute) + 1,
# so every list must have exactly as many entries as the category has
# attributes in TAGSET.  "P" used to have only 6 entries although pronouns
# have 8 attributes, which made restrictions on 'owned_gender' or 'clitic'
# fail with an IndexError.
CATEGORY_BASES = {
    "N": ['.'] * 5,
    "V": ['.'] * 7,
    "A": ['.'] * 6,
    "R": ['.'] * 2,
    "P": ['.'] * 8,
    "M": ['.'] * 6,
    "S": ['.'] * 1,
    "C": ['.'] * 1,
    "Q": [],
    "I": [],
    "Y": [],
    "X": ['.'] * 1,
}
class ComponentType(Enum):
    """Kind of a structure component."""
    # any component whose definition does not say type "core"
    Other = 0
    # component whose definition says type "core"
    Core = 2
    # NOTE(review): presumably assigned by SyntacticStructure.determine_core2w,
    # whose body is not fully visible here -- confirm.
    Core2w = 3
2018-10-29 10:29:51 +00:00
class RestrictionType(Enum):
    """What a Restriction compares a word against."""
    # the matcher is applied to the word's msd
    Morphology = 0
    # the matcher is applied to the word's lemma
    Lexis = 1
    # no matcher at all, every word is accepted
    MatchAll = 2
2018-10-29 10:29:51 +00:00
2019-02-04 10:01:30 +00:00
class Order(Enum):
    """Required relative position of the two words joined by a link."""
    FromTo = 0
    ToFrom = 1
    Any = 2

    @staticmethod
    def new(order):
        """Build an Order from the textual value of a dependency.

        ``None`` means no ordering constraint; "from-to" and "to-from" select
        the respective member.  Any other value raises NotImplementedError.
        """
        if order is None:
            return Order.Any
        if order == "to-from":
            return Order.ToFrom
        if order == "from-to":
            return Order.FromTo
        raise NotImplementedError("What kind of ordering is: {}".format(order))

    def match(self, from_w, to_w):
        """Tell whether the two words appear in the order this member demands.

        The words are compared by their ``int_id`` attribute; ``Order.Any``
        accepts every pair without looking at the words.
        """
        if self is Order.Any:
            return True

        from_position = from_w.int_id
        to_position = to_w.int_id

        if self is Order.FromTo:
            return from_position < to_position
        if self is Order.ToFrom:
            return to_position < from_position
        raise NotImplementedError("Should not be here: Order match")
class ComponentRepresentation:
    """Base class for the textual representation of one component.

    Words are collected with add_word(); render() then stores the text
    produced by the subclass hook _render() in ``rendition_text``.
    """

    def __init__(self, data, word_renderer):
        self.word_renderer = word_renderer
        self.data = data
        # representations of other components that must agree with this one
        self.agreement = []
        self.words = []
        # stays None until a rendition has been produced
        self.rendition_text = None

    def get_agreement(self):
        """Return the component ids this representation agrees with (none here)."""
        return []

    def add_word(self, word):
        """Remember a word that was matched for this component."""
        self.words.append(word)

    def render(self):
        """Compute the rendition unless one is already stored."""
        if self.rendition_text is not None:
            return
        self.rendition_text = self._render()

    def _render(self):
        """Produce the rendition text; every concrete subclass overrides this."""
        raise NotImplementedError("Not implemented for class: {}".format(type(self)))
class LemmaCR(ComponentRepresentation):
    """Represents a component by the lemma of the first collected word."""

    def _render(self):
        # no collected words -> nothing to render
        if len(self.words) > 0:
            return self.words[0].lemma
        return None
class LexisCR(ComponentRepresentation):
    """Represents a component by the fixed text stored under data['lexis']."""

    def _render(self):
        # the collected words play no role for this representation
        fixed_text = self.data['lexis']
        return fixed_text
class WordFormAllCR(ComponentRepresentation):
    """Represents a component by all distinct word forms that were collected."""

    def _render(self):
        """Return the distinct lower-cased forms joined with '/', or None.

        The forms are sorted: iterating a set of strings directly gives an
        order that can differ between interpreter runs (string hash
        randomization), which made the output irreproducible.
        """
        if not self.words:
            return None

        distinct_forms = {w.text.lower() for w in self.words}
        return "/".join(sorted(distinct_forms))
class WordFormAnyCR(ComponentRepresentation):
    """Represents a component by the text of its most frequent word form."""

    def _render(self):
        """Return the text for the most frequent (msd, lemma) pair that agrees.

        Pairs are tried from the most to the least frequent.  The first pair
        whose msd is accepted by every representation in ``self.agreement``
        wins: those representations get confirm_match() called and the most
        frequent text seen for the pair is returned (None when the pair's
        lemma is None).  When no pair is accepted, None is returned.
        """
        # Most frequent text per (msd, lemma): walk from the least to the most
        # common triplet so that the most common one is written last.
        text_forms = {}
        triplet_counts = Counter((w.msd, w.lemma, w.text) for w in self.words)
        for (msd, lemma, text), _count in reversed(triplet_counts.most_common()):
            text_forms[(msd, lemma)] = text

        # Counter replaces the former list.count() call inside the sort key
        # (quadratic in the number of words) and orders equally frequent
        # pairs by first occurrence instead of by set iteration order.
        pair_counts = Counter((w.msd, w.lemma) for w in self.words)
        for (word_msd, word_lemma), _count in pair_counts.most_common():
            if not all(agr.match(word_msd) for agr in self.agreement):
                continue

            for agr in self.agreement:
                agr.confirm_match()

            if word_lemma is None:
                return None
            return text_forms[(word_msd, word_lemma)]

        return None
class WordFormMsdCR(WordFormAnyCR):
    """Word-form representation that only collects words passing an msd filter.

    The filter is the optional mapping stored under data['msd']
    (attribute name -> value name).
    """

    def __init__(self, *args):
        super().__init__(*args)
        # lemma and msd of the first word ever offered to add_word()
        self.lemma = None
        self.msd = None

    def check_msd(self, word_msd):
        """Tell whether ``word_msd`` satisfies every selector in data['msd'].

        A '-' on either side of a comparison counts as satisfied.  Without an
        'msd' entry in the data every msd is accepted.
        """
        if 'msd' not in self.data:
            return True

        for key, value in self.data['msd'].items():
            category = word_msd[0]
            position = TAGSET[category].index(key.lower())
            actual = word_msd[position + 1]
            wanted = CODES[value]

            if actual != wanted and '-' not in (actual, wanted):
                return False

        return True

    def add_word(self, word):
        """Collect ``word`` if its msd passes check_msd().

        The first word offered fixes ``self.lemma`` and ``self.msd`` whether
        or not it passes the check.
        """
        if self.lemma is None:
            self.lemma = word.lemma
            self.msd = word.msd

        if self.check_msd(word.msd):
            super().add_word(word)

    def _render(self):
        """Render like WordFormAnyCR after appending a lemma-less backup word.

        The backup word carries the msd returned by
        word_renderer.get_lemma_msd() and has neither lemma nor text.
        """
        msd = self.word_renderer.get_lemma_msd(self.lemma, self.msd)
        field_names = 'msd most_frequent_text lemma text'
        WordLemma = namedtuple('WordLemmaOnly', field_names)
        backup_word = WordLemma(
            msd=msd,
            most_frequent_text=lambda *x: None,
            lemma=None,
            text=None)
        self.words.append(backup_word)

        return super()._render()
2019-06-02 10:53:16 +00:00
class WordFormAgreementCR(WordFormMsdCR):
    """Word-form representation whose form must agree with another component.

    data['other'] holds the id of the component to agree with and
    data['agreement'] the names of the attributes that have to agree.  The
    rendition is not computed by render(); it is chosen through match() and
    fixed by confirm_match().
    """

    def __init__(self, data, word_renderer):
        super().__init__(data, word_renderer)
        # text of the candidate accepted by the latest successful match()
        self.rendition_candidate = None

    def get_agreement(self):
        """Return the ids of the components this representation agrees with.

        The single id is wrapped in a list, like the base class result:
        callers iterate over the return value, and iterating the bare string
        would split an id of more than one character into its characters.
        """
        return [self.data['other']]

    def match(self, word_msd):
        """Look for a form of this component's lemma that agrees with ``word_msd``.

        Candidates come from word_renderer.available_words().  A candidate is
        accepted when it has the same category as this component's msd,
        agrees with ``word_msd`` on all attributes in data['agreement'] and
        passes check_msd().  The first accepted candidate's text is stored in
        ``rendition_candidate`` and True is returned; otherwise False.
        """
        existing = [(w.msd, w.text) for w in self.words]

        for candidate_msd, candidate_text in self.word_renderer.available_words(self.lemma, existing):
            if self.msd[0] != candidate_msd[0]:
                continue

            if WordFormAgreementCR.check_agreement(word_msd, candidate_msd, self.data['agreement']):
                if self.check_msd(candidate_msd):
                    self.rendition_candidate = candidate_text
                    return True

        return False

    def confirm_match(self):
        """Make the candidate found by the last match() the final rendition."""
        self.rendition_text = self.rendition_candidate

    @staticmethod
    def check_agreement(msd1, msd2, agreements):
        """Tell whether two msds agree on every attribute named in ``agreements``.

        For each attribute the msds are inspected in order (msd1, then msd2):
        an attribute unknown to an msd's category logs a warning and makes
        the check fail; an msd too short to contain the attribute makes the
        attribute count as agreeing.  A '-' value on either side agrees with
        anything.
        """
        for agr_case in agreements:
            values = []
            for msd in (msd1, msd2):
                attributes = TAGSET[msd[0]]
                # if not in the tagset, some strange msd was tried
                if agr_case not in attributes:
                    logging.warning("Cannot do agreement: {} for msd {} not found!".format(agr_case, msd))
                    return False

                # + 1 because position 0 of the msd is the category code
                position = attributes.index(agr_case) + 1
                # attribute not specified (e.g. infinitive): always agrees
                if position >= len(msd):
                    break
                values.append(msd[position])
            else:
                # both values are available, compare them
                m1, m2 = values
                if '-' not in values and m1 != m2:
                    return False

        return True

    def render(self):
        """Do nothing: the rendition is set by confirm_match()."""
        pass
2019-01-19 21:42:51 +00:00
class ComponentRendition:
    """Chooses the representation class for one representation definition.

    Features are fed in with add_feature(); cr_instance() then creates the
    chosen representation object.
    """

    def __init__(self):
        # extra data handed to the representation as its ``data``
        self.more = {}
        self.representation_factory = ComponentRepresentation

    def add_feature(self, feature):
        """Update the chosen class and its data from one feature mapping.

        Features with neither a 'rendition' nor a 'selection' key are ignored.
        Unknown values of either key raise NotImplementedError.
        """
        if 'rendition' in feature:
            rendition = feature['rendition']
            if rendition == "lemma":
                self.representation_factory = LemmaCR
            elif rendition == "word_form":
                # only the default, a later selection feature may change it
                self.representation_factory = WordFormAnyCR
            elif rendition == "lexis":
                self.representation_factory = LexisCR
                self.more['lexis'] = feature['string']
            else:
                raise NotImplementedError("Representation rendition: {}".format(feature))

        elif 'selection' in feature:
            selection = feature['selection']
            if selection == "msd":
                # an agreement chosen earlier must not be replaced
                if self.representation_factory != WordFormAgreementCR:
                    self.representation_factory = WordFormMsdCR
                self.more['msd'] = {k: v for k, v in feature.items() if k != 'selection'}
            elif selection == "all":
                self.representation_factory = WordFormAllCR
            elif selection == 'agreement':
                assert(feature['head'][:4] == 'cid_')
                assert(feature['msd'] is not None)
                self.representation_factory = WordFormAgreementCR
                self.more['agreement'] = feature['msd'].split('+')
                # component id without the 'cid_' prefix
                self.more['other'] = feature['head'][4:]
            else:
                raise NotImplementedError("Representation selection: {}".format(feature))

        else:
            return None

    def cr_instance(self, word_renderer):
        """Create the chosen representation with the collected data."""
        return self.representation_factory(self.more, word_renderer)

    @staticmethod
    def set_representations(matches, structure, word_renderer):
        """Render every component of ``structure`` into matches.representations.

        Raises NotImplementedError when a representation agrees with a
        component that does not have exactly one representation.
        """
        # one fresh representation object per definition of every component
        representations = {
            c.idx: [rep.cr_instance(word_renderer) for rep in c.representation]
            for c in structure.components
        }

        # register every agreeing representation with the one it agrees with
        for cid, reps in representations.items():
            for rep in reps:
                for agr in rep.get_agreement():
                    targets = representations[agr]
                    if len(targets) != 1:
                        n = len(targets)
                        raise NotImplementedError(
                            "Structure {}: ".format(structure.id) +
                            "component {} has agreement".format(cid) +
                            " with component {}".format(agr) +
                            ", however there are {} (!= 1) representations".format(n) +
                            " of component {}!".format(agr))

                    targets[0].agreement.append(rep)

        # first pass: hand every matched word to its component's representations
        for words in matches.matches:
            for w_id, w in words.items():
                component = structure.get_component(w_id)
                for representation in representations[component.idx]:
                    representation.add_word(w)

        for reps in representations.values():
            for rep in reps:
                rep.render()

        # components without representations get no entry at all
        for cid, reps in representations.items():
            texts = [rep.rendition_text for rep in reps]
            if len(texts) == 0:
                continue

            if all(text is None for text in texts):
                matches.representations[cid] = None
            else:
                matches.representations[cid] = " ".join(
                    ("" if text is None else text) for text in texts)
2019-01-19 21:42:51 +00:00
class ComponentStatus(Enum):
    """Whether a component has to take part in a match."""
    # chosen for status 'optional'
    Optional = 0
    # chosen for status 'obligatory' and when no status is given
    Required = 1
    # chosen for status 'forbidden'; a match of such a component rejects
    # the match of the component that links to it
    Forbidden = 2
2018-10-29 10:29:51 +00:00
def get_level(restriction):
    """Return the "level" value of the first feature that carries one.

    ``restriction`` is an iterable of mapping-like features (objects with
    ``keys()`` and ``get()``).  Previously the value was stored in an unused
    local and the function could only end in the RuntimeError below.

    Raises:
        RuntimeError: when no feature has a "level" entry.
    """
    for feature in restriction:
        if "level" in feature.keys():
            return feature.get("level")

    raise RuntimeError("Unreachable!")
def determine_ppb(rgx):
    """Rank a morphology restriction from 0 to 4 by its category.

    ``rgx[0]`` is the category code and ``rgx[1]`` the pattern for the first
    attribute.  Categories A, N and R rank 0.  Verbs rank 3 when the pattern
    contains 'a', otherwise 1 when it contains 'm', otherwise 2.  Everything
    else ranks 4.
    """
    category = rgx[0]

    if category == "V":
        verb_type_pattern = rgx[1]
        if 'a' in verb_type_pattern:
            return 3
        if 'm' in verb_type_pattern:
            return 1
        return 2

    return 0 if category in ("A", "N", "R") else 4
2018-10-29 10:29:51 +00:00
def build_morphology_regex(restriction):
    """Turn the features of a morphology restriction into an msd matcher.

    Every feature (an object with ``items()``) must hold exactly one
    attribute/value pair, optionally accompanied by ``filter="negative"``,
    which negates the value.  A 'POS' feature is mandatory.

    Returns a pair ``(rgx, matcher)``: ``rgx`` is the list of per-character
    patterns (category code first) and ``matcher(text)`` tells whether an msd
    satisfies them.
    """
    restr_dict = {}
    for feature in restriction:
        attributes = dict(feature.items())

        positive = "filter" not in attributes
        if not positive:
            assert(attributes['filter'] == "negative")
            del attributes['filter']

        assert(len(attributes) == 1)
        (key, value), = attributes.items()
        restr_dict[key] = (value, positive)

    assert('POS' in restr_dict)
    category = restr_dict['POS'][0].capitalize()
    cat_code = CODES[category]
    rgx = [cat_code] + CATEGORY_BASES[cat_code]
    del restr_dict['POS']

    min_msd_length = 1
    for attribute, (value, typ) in restr_dict.items():
        index = TAGSET[cat_code].index(attribute.lower())

        # alternatives separated by '|' become one character class
        codes = "".join(CODES[val] for val in value.split('|'))
        negation = "" if typ else "^"
        # + 1: position 0 of the msd holds the category code
        rgx[index + 1] = "[{}{}]".format(negation, codes)

        # only positive restrictions require the character to be present
        if typ:
            min_msd_length = max(index + 1, min_msd_length)

    re_objects = [re.compile(r) for r in rgx]

    def matcher(text):
        if len(text) <= min_msd_length:
            return False
        return all(r.match(c) for c, r in zip(text, re_objects))

    return rgx, matcher
2018-10-29 10:29:51 +00:00
def build_lexis_regex(restriction):
    """Turn the features of a lexis restriction into a lemma matcher.

    The attribute pairs of all features are merged (later features win); the
    merged 'lemma' value is a '|'-separated list of accepted lemmas.

    Returns a pair ``(lemmas, matcher)`` where ``matcher(text)`` tells
    whether ``text`` is one of the accepted lemmas.
    """
    merged = {}
    for feature in restriction:
        for key, value in feature.items():
            merged[key] = value

    assert("lemma" in merged)
    accepted_lemmas = merged['lemma'].split('|')

    def matcher(text):
        return text in accepted_lemmas

    return accepted_lemmas, matcher
2018-10-29 10:29:51 +00:00
class Restriction:
    """One condition a word has to meet to fill a component."""

    def __init__(self, restriction_tag):
        """Build the restriction from its XML element.

        ``None`` creates a restriction that accepts every word.  Otherwise
        the element's 'type' attribute must be "morphology" or "lexis";
        anything else raises NotImplementedError.
        """
        # content-word rank ("polnopomenska beseda"), 0-4
        self.ppb = 4

        if restriction_tag is None:
            self.type = RestrictionType.MatchAll
            self.matcher = None
            self.present = None
            return

        restriction_type = restriction_tag.get('type')
        features = list(restriction_tag)

        if restriction_type == "morphology":
            self.type = RestrictionType.Morphology
            regex_parts, self.matcher = build_morphology_regex(features)
            self.present = " ".join(regex_parts)
            self.ppb = determine_ppb(regex_parts)
        elif restriction_type == "lexis":
            self.type = RestrictionType.Lexis
            self.present, self.matcher = build_lexis_regex(features)
        else:
            raise NotImplementedError()

    def match(self, word):
        """Tell whether ``word`` satisfies this restriction.

        Morphology restrictions test the word's msd, lexis restrictions its
        lemma.
        """
        if self.type == RestrictionType.MatchAll:
            return True
        if self.type == RestrictionType.Morphology:
            return self.matcher(word.msd)
        if self.type == RestrictionType.Lexis:
            return self.matcher(word.lemma)
        raise RuntimeError("Unreachable!")
2018-10-29 10:29:51 +00:00
class Component:
    """One node of a syntactic structure, linked to its dependent components."""

    def __init__(self, info):
        """Create the component from the attribute mapping of its definition.

        'cid' and 'type' are mandatory, 'name' and 'status' optional.  A
        status other than 'forbidden', 'obligatory' or 'optional' raises
        NotImplementedError.
        """
        idx = info['cid']
        name = info.get('name')

        if info['type'] == "core":
            typ = ComponentType.Core
        else:
            typ = ComponentType.Other

        # a component without an explicit status is required
        if 'status' not in info:
            status = ComponentStatus.Required
        else:
            status_name = info['status']
            if status_name == 'forbidden':
                status = ComponentStatus.Forbidden
            elif status_name == 'obligatory':
                status = ComponentStatus.Required
            elif status_name == 'optional':
                status = ComponentStatus.Optional
            else:
                raise NotImplementedError("strange status: {}".format(info['status']))

        self.idx = idx
        self.name = name
        self.type = typ
        self.status = status
        self.restrictions = []
        # (component, link label, Order) triples of the dependent components
        self.next_element = []
        self.representation = []
        self.selection = {}
        self.iter_ctr = 0

    def add_next(self, next_component, link_label, order):
        """Attach a dependent component reached over ``link_label``."""
        link = (next_component, link_label, Order.new(order))
        self.next_element.append(link)

    def set_restriction(self, restrictions_tag):
        """Set the restrictions from a restriction / restriction_or element.

        ``None`` installs a single restriction that accepts every word.
        """
        if restrictions_tag is None:
            self.restrictions = [Restriction(None)]
            return

        tag = restrictions_tag.tag
        if tag == "restriction":
            self.restrictions = [Restriction(restrictions_tag)]
        elif tag == "restriction_or":
            # one alternative per child element
            self.restrictions = [Restriction(el) for el in restrictions_tag]
        else:
            raise RuntimeError("Unreachable")

    def set_representation(self, representation):
        """Append one ComponentRendition per representation element."""
        for rep in representation:
            crend = ComponentRendition()
            for feature in rep:
                crend.add_feature(feature.attrib)
            self.representation.append(crend)

    def find_next(self, deps, comps, restrs, reprs):
        """Build, recursively, all components depending on this one.

        ``deps`` holds (from, to, label, order) tuples; ``comps``, ``restrs``
        and ``reprs`` are looked up by component id.  Returns the created
        components, each one followed by its own dependents.
        """
        to_ret = []
        for d in deps:
            if d[0] != self.idx:
                continue

            _, idx, dep_label, order = d

            next_component = Component(comps[idx])
            next_component.set_restriction(restrs[idx])
            next_component.set_representation(reprs[idx])
            to_ret.append(next_component)

            self.add_next(next_component, dep_label, order)
            to_ret.extend(next_component.find_next(deps, comps, restrs, reprs))

        return to_ret

    def name_str(self):
        """Return the name, or "_" for an unnamed component."""
        if self.name is None:
            return "_"
        return self.name

    def match(self, word):
        """Match ``word`` against this component and its dependents.

        Returns None when the word or a dependent fails, otherwise a list of
        mappings from component id to matched word.
        """
        own_match = self._match_self(word)
        if own_match is None:
            return None

        linked_matches = self._match_next(word)
        if linked_matches is None:
            return None

        results = [own_match]
        for cmatch in linked_matches:
            # good match but nothing to add
            if len(cmatch) == 0:
                continue

            # exactly one match for the linked component: add it everywhere
            if len(cmatch) == 1:
                for partial in results:
                    partial.update(cmatch[0])
                continue

            # more than one match in more than one linked component is not
            # supported: warn and keep only the first of them
            if len(results) > 1:
                logging.warning("Strange multiple match: {}".format(
                    str([w.id for w in cmatch[0].values()])))

                for partial in results:
                    partial.update(cmatch[0])
                continue

            # several matches for this linked component => one result per match
            results = [{**dict(results[0]), **m} for m in cmatch]

        return results

    def _match_self(self, word):
        """Return {idx: word} if any restriction accepts the word, else None."""
        for restr in self.restrictions:
            # the restrictions are alternatives, one hit is enough
            if restr.match(word):
                return {self.idx: word}
        return None

    def _match_next(self, word):
        """Collect the matches of every dependent component.

        Returns a list with one list of matches per dependent, or None when a
        required dependent matched nothing or a forbidden one matched.
        """
        collected = []

        for child, link, order in self.next_element:
            candidates = word.get_links(link)
            found = []
            collected.append(found)

            # optional and forbidden dependents are fine without any match
            good = child.status != ComponentStatus.Required
            for next_word in candidates:
                if not order.match(word, next_word):
                    continue

                match = child.match(next_word)
                if match is None:
                    continue

                # a forbidden dependent that matches spoils everything
                if child.status == ComponentStatus.Forbidden:
                    good = False
                    break

                assert(type(match) is list)
                found.extend(match)
                good = True

            # if none matched, nothing found!
            if not good:
                return None

        return collected
2018-10-29 10:29:51 +00:00
class SyntacticStructure:
def __init__(self):
self.id = None
self.lbs = None
2019-01-19 21:42:51 +00:00
self.components = []
2018-10-29 10:29:51 +00:00
@staticmethod
def from_xml(xml):
    """Build a SyntacticStructure from its XML element.

    The element must have exactly one child of type 'JOS', which in turn
    holds three children: components, dependencies and definitions.  A
    definition child that is neither a restriction nor a representation
    raises NotImplementedError.
    """
    st = SyntacticStructure()
    st.id = xml.get('id')
    st.lbs = xml.get('LBS')

    systems = list(xml)
    assert(len(systems) == 1)
    system = systems[0]
    assert(system.get('type') == 'JOS')

    components, dependencies, definitions = list(system)

    deps = []
    for dep in dependencies:
        deps.append((dep.get('from'), dep.get('to'), dep.get('label'), dep.get('order')))

    comps = {}
    for comp in components:
        comps[comp.get('cid')] = dict(comp.items())

    restrs = {}
    forms = {}
    for definition in definitions:
        cid = definition.get('cid')
        restrs[cid] = None
        forms[cid] = []

        for el in definition:
            if el.tag.startswith("restriction"):
                # at most one restriction element per component
                assert(restrs[cid] is None)
                restrs[cid] = el
            elif el.tag.startswith("representation"):
                st.add_representation(cid, el, forms)
            else:
                raise NotImplementedError("Unknown definition: {} in structure {}".format(el.tag, st.id))

    # artificial root with id '#'; the structure's components are
    # everything reachable from it through the dependencies
    fake_root_component = Component({'cid': '#', 'type': 'other'})
    st.components = fake_root_component.find_next(deps, comps, restrs, forms)

    st.determine_core2w()
    return st
def determine_core2w(self):
ppb_components = []
for c in self.components:
if c.type != ComponentType.Core:
continue
ppb = 4
for r in c.restrictions:
ppb = min(r.ppb, ppb)