2018-10-29 10:29:51 +00:00
|
|
|
from xml.etree import ElementTree
|
|
|
|
import re
|
|
|
|
from enum import Enum
|
|
|
|
from collections import defaultdict
|
|
|
|
|
|
|
|
from msd_translate import MSD_TRANSLATE
|
|
|
|
|
|
|
|
|
2018-10-29 11:16:42 +00:00
|
|
|
# XML file with the syntactic structure definitions; main() hands it to
# build_structures().
STRUKTURE = 'Kolokacije_strukture_09_new-system.xml'

# XML corpus that is searched for matches; main() hands it to load_corpus().
STAVKI = 'k2.xml'
|
|
|
|
|
|
|
|
# Maps the long names used in structure definitions to single-character codes.
# build_morphology_regex() looks up both the capitalized POS category and the
# individual attribute values in this one table.
CODES = {
    # category names (capitalized)
    "Noun": "N", "Verb": "V", "Adjective": "A", "Adverb": "R",
    "Pronoun": "P", "Numeral": "M", "Preposition": "S", "Conjunction": "C",
    "Particle": "Q", "Interjection": "I", "Abbreviation": "Y", "Residual": "X",

    # attribute values (lower case)
    "common": "c", "proper": "p",
    "masculine": "m", "feminine": "f", "neuter": "n",
    "singular": "s", "dual": "d", "plural": "p",
    "nominative": "n", "genitive": "g", "dative": "d",
    "accusative": "a", "locative": "l", "instrumental": "i",
    "no": "n", "yes": "y",
    "main": "m", "auxiliary": "a",
    "perfective": "e", "progressive": "p", "biaspectual": "b",
    "infinitive": "n", "supine": "u", "participle": "p", "present": "r",
    "future": "f", "conditional": "c", "imperative": "m",
    "first": "1", "second": "2", "third": "3",
    "general": "g", "possessive": "s",
    "positive": "p", "comparative": "c", "superlative": "s",
    "personal": "p", "demonstrative": "d", "relative": "r", "reflexive": "x",
    "interrogative": "q", "indefinite": "i", "negative": "z",
    "bound": "b",
    "digit": "d", "roman": "r", "letter": "l",
    "cardinal": "c", "ordinal": "o", "pronominal": "p", "special": "s",
    "coordinating": "c", "subordinating": "s",
    "foreign": "f", "typo": "t", "program": "p",
}
|
|
|
|
|
|
|
|
# Attribute names per category code, in positional order. An attribute at list
# index i is written to slot i + 1 of the regex assembled by
# build_morphology_regex() (slot 0 holds the category code itself).
TAGSET = {
    "N": ["type", "gender", "number", "case", "animate"],
    "V": ["type", "aspect", "vform", "person", "number", "gender", "negative"],
    "A": ["type", "degree", "gender", "number", "case", "definiteness"],
    "R": ["type", "degree"],
    "P": [
        "type", "person", "gender", "number", "case",
        "owner_number", "owned_gender", "clitic",
    ],
    "M": ["form", "type", "gender", "number", "case", "definiteness"],
    "S": ["case"],
    "C": ["type"],
    "Q": [],
    "I": [],
    "Y": [],
    "X": ["type"],
}
|
|
|
|
|
|
|
|
# Default regex fragment for every attribute position of a category, used by
# build_morphology_regex() for positions that a restriction leaves
# unconstrained: '.' consumes exactly one character, '.?' at most one.
#
# Every fragment must consume at most one character. The first pronoun slot
# used to be '. ' (with a trailing space), which demanded a literal space and
# so could not match unless the restriction overwrote that slot.
CATEGORY_BASES = {
    "N": ['.', '.', '.', '.', '.?'],
    "V": ['.', '.', '.', '.', '.?', '.?', '.?'],
    "A": ['.', '.', '.', '.', '.', '.?'],
    "R": ['.', '.?'],
    "P": ['.', '.', '.', '.', '.', '.', '.?', '.?'],
    "M": ['.', '.', '.', '.?', '.?', '.?'],
    "S": ['.'],
    "C": ['.'],
    "Q": [],
    "I": [],
    "Y": [],
    "X": ['.?'],
}
|
|
|
|
|
|
|
|
|
|
|
|
class RestrictionType(Enum):
    """Kind of a restriction, taken from the restriction tag's `type` attribute."""

    # "morphology": the restriction is matched against a word's MSD
    Morphology = 0
    # "lexis": the restriction is matched against a word's lemma
    Lexis = 1
|
|
|
|
|
|
|
|
|
|
|
|
class ComponentLevel(Enum):
    """Level of a component, taken from a feature's `level` attribute."""

    # level="lemma"
    Lemma = 0
    # level="word_form"
    WordForm = 1
|
|
|
|
|
|
|
|
|
|
|
|
def get_level(restriction):
    """Return the ComponentLevel declared by the features of a restriction.

    The features are scanned in order; the first one whose `level` attribute
    is "lemma" or "word_form" decides the result. Features without a `level`
    attribute, or with any other value, are skipped.

    Raises:
        RuntimeError: when no feature declares a recognised level.
    """
    for feature in restriction:
        # .get() yields None when the attribute is absent, which matches
        # neither of the recognised values below.
        declared = feature.get("level")
        if declared == "lemma":
            return ComponentLevel.Lemma
        if declared == "word_form":
            return ComponentLevel.WordForm

    raise RuntimeError("Unreachable!")
|
|
|
|
|
|
|
|
|
|
|
|
def build_morphology_regex(restriction):
    """Compile a regex over MSD strings from the features of a restriction.

    All attributes of all features are merged into one mapping (a later
    feature overrides an earlier one on the same attribute name). The `POS`
    attribute selects the category; its code becomes the first character of
    the pattern and CATEGORY_BASES supplies the default fragment for every
    following position. Each remaining attribute (except `level`) replaces
    the fragment at its TAGSET position with the code of its value, or with a
    character class when the value lists alternatives separated by '|'.

    Raises:
        AssertionError: when no feature carries a `POS` attribute.
        KeyError: when `level` is missing, or a category/value has no code.
        ValueError: when an attribute is not in the category's TAGSET entry.
    """
    attributes = {}
    for feature in restriction:
        for attr_name, attr_value in feature.items():
            attributes[attr_name] = attr_value

    assert 'POS' in attributes
    cat_code = CODES[attributes['POS'].capitalize()]
    # slot 0 is the category code, slots 1.. are the per-attribute fragments
    slots = [cat_code]
    slots.extend(CATEGORY_BASES[cat_code])

    # neither of these is a positional attribute
    attributes.pop('POS')
    del attributes['level']

    for attribute, value in attributes.items():
        # list.index raises ValueError for an attribute the category lacks
        position = TAGSET[cat_code].index(attribute.lower()) + 1

        alternatives = value.split('|')
        if len(alternatives) > 1:
            fragment = '[' + "".join(CODES[alt] for alt in alternatives) + ']'
        else:
            fragment = CODES[value]

        slots[position] = fragment

    return re.compile("".join(slots))
|
|
|
|
|
|
|
|
|
|
|
|
def build_lexis_regex(restriction):
    """Compile the `lemma` attribute of a restriction's features as a regex.

    The attributes of all features are merged (a later feature overrides an
    earlier one on the same name) and the resulting `lemma` value is used
    verbatim as the pattern.

    Raises:
        KeyError: when no feature carries a `lemma` attribute.
    """
    merged = {}
    for feature in restriction:
        for attr_name, attr_value in feature.items():
            merged[attr_name] = attr_value

    pattern = merged['lemma']
    return re.compile(pattern)
|
|
|
|
|
|
|
|
|
2018-10-29 11:16:42 +00:00
|
|
|
class Restriction:
    """A single restriction: a compiled regex plus the word field it applies to.

    Attributes:
        type: RestrictionType selecting which word field is matched.
        matcher: compiled regular expression built from the tag's features.
    """

    def __init__(self, restriction_tag):
        """Build the restriction from a restriction XML element.

        Raises:
            NotImplementedError: when the element's `type` attribute is
                neither "morphology" nor "lexis".
        """
        restriction_type = restriction_tag.get('type')
        # list(element) yields the child elements; Element.getchildren() was
        # removed in Python 3.9 and must not be used any more.
        if restriction_type == "morphology":
            self.type = RestrictionType.Morphology
            self.matcher = build_morphology_regex(list(restriction_tag))
        elif restriction_type == "lexis":
            self.type = RestrictionType.Lexis
            self.matcher = build_lexis_regex(list(restriction_tag))
        else:
            raise NotImplementedError()

    def match(self, word):
        """Match the regex against the word's MSD or lemma, depending on type.

        Returns the result of `re.Pattern.match`, i.e. a match object or None;
        the pattern is anchored at the start of the string only.
        """
        if self.type == RestrictionType.Morphology:
            match_to = word.msd
        elif self.type == RestrictionType.Lexis:
            match_to = word.lemma
        else:
            raise RuntimeError("Unreachable!")

        return self.matcher.match(match_to)

    def __str__(self):
        # the member name is the part after the dot in "RestrictionType.X"
        return "({:s} {})".format(self.type.name, self.matcher)

    def __repr__(self):
        return str(self)
|
|
|
|
|
|
|
|
|
2018-10-29 10:29:51 +00:00
|
|
|
class Component:
    """One node in the tree of a syntactic structure.

    Attributes:
        name: display name ("" when None was given).
        idx: component id, compared against the `from` field of dependencies.
        restriction: None, one Restriction, or a list of alternatives.
        next_element: list of (child Component, link label) pairs.
        level: ComponentLevel deciding which word field word_to_str() reports.
        iter_ctr: cursor used only by the legacy __next__ protocol.
    """

    def __init__(self, name, idx):
        assert idx is not None

        self.name = name if name is not None else ""  # for printing...
        self.idx = idx
        self.restriction = None
        self.next_element = []
        self.level = None

        self.iter_ctr = 0

    def word_to_str(self, word):
        """Return (lemma, msd) or (text, msd) for `word`, depending on level.

        Raises:
            RuntimeError: when the level has not been set.
        """
        if self.level == ComponentLevel.Lemma:
            return word.lemma, word.msd
        elif self.level == ComponentLevel.WordForm:
            return word.text, word.msd
        else:
            raise RuntimeError("Unreachable")

    def __iter__(self):
        """Iterate over the (child, link label) pairs.

        A fresh list iterator is returned on every call so that nested or
        interleaved loops over the same component do not disturb each other
        (previously the cursor lived on the instance and an inner loop
        exhausted the outer one).
        """
        # keep the legacy cursor in its documented start state for code that
        # still drives the component through next(component)
        self.iter_ctr = 0
        return iter(self.next_element)

    def __next__(self):
        """Legacy cursor-based access; prefer iterating the component."""
        if self.iter_ctr < len(self.next_element):
            to_ret = self.next_element[self.iter_ctr]
            self.iter_ctr += 1
            return to_ret
        else:
            raise StopIteration

    def add_next(self, next_component, link_label):
        """Append a child component reachable through `link_label`."""
        self.next_element.append((next_component, link_label))

    def set_restriction(self, restrictions_tag):
        """Set restriction and level from a restriction(_or) XML element.

        Raises:
            RuntimeError: when the tag is neither `restriction` nor
                `restriction_or`.
            AssertionError: when the alternatives of a `restriction_or`
                declare different levels.
        """
        if restrictions_tag.tag == "restriction":
            self.restriction = Restriction(restrictions_tag)
            self.level = get_level(restrictions_tag)

        elif restrictions_tag.tag == "restriction_or":
            self.restriction = [Restriction(el) for el in restrictions_tag]
            self.level = get_level(restrictions_tag[0])

            # same level for every restriction for now and only or available
            levels = [get_level(el) for el in restrictions_tag]
            assert len(set(levels)) == 1

        else:
            raise RuntimeError("Unreachable")

    def find_next(self, deps, comps, restrs):
        """Recursively attach every component that depends on this one.

        Args:
            deps: iterable of (from id, to id, label) triples.
            comps: mapping of component id to name.
            restrs: mapping of component id to its restriction XML element.
        """
        for dep in deps:
            if dep[0] != self.idx:
                continue

            _, idx, dep_label = dep

            next_component = Component(comps[idx], idx)
            next_component.set_restriction(restrs[idx])

            self.add_next(next_component, dep_label)
            next_component.find_next(deps, comps, restrs)

    def __str__(self):
        el = "({:10} {})".format(self.name, str(self.restriction))
        for child, link in self:
            el += "\n{:10} -- {:10} --> {}".format(self.name, link, str(child))
        return el

    def __repr__(self):
        return str(self)

    def match(self, word):
        """Match `word` and, through its links, the whole subtree below.

        Returns:
            A list with word_to_str() of this word followed by the results of
            the matched children, or None when this component's restriction
            does not match or some child has no matching linked word.
        """
        # matching: with a list of alternatives the first hit wins
        if isinstance(self.restriction, list):
            matched = None
            for restr in self.restriction:
                matched = restr.match(word)
                if matched is not None:
                    break
        else:
            matched = self.restriction.match(word)

        if not matched:
            return None

        # recurse to next
        to_ret = [self.word_to_str(word)]

        for child, link in self:
            # need to get all links that match
            for next_word in word.get_links(link):
                child_match = child.match(next_word)
                # if matches, take the first one and stop looking
                if child_match is not None:
                    to_ret.extend(child_match)
                    break
            else:
                # if none matched, nothing found!
                return None

        return to_ret
|
|
|
|
|
|
|
|
|
|
|
|
class SyntacticStructure:
    """A syntactic structure: an id, an LBS label and a tree of components.

    Attributes:
        root_component: Component at which matching starts.
        id: value of the XML element's `id` attribute.
        lbs: value of the XML element's `LBS` attribute.
    """

    def __init__(self):
        self.root_component = Component("", 'root')
        self.id = None
        self.lbs = None

    @staticmethod
    def from_xml(xml):
        """Build a structure from a `syntactic_structure` XML element.

        The element must have exactly two children (components, system) and
        the system exactly two (dependencies, restrictions); otherwise the
        unpacking raises ValueError.

        Raises:
            AssertionError: when the system's `type` attribute is not 'JOS'.
        """
        st = SyntacticStructure()
        st.id = xml.get('id')
        st.lbs = xml.get('LBS')

        # list(element) yields the child elements; Element.getchildren() was
        # removed in Python 3.9.
        components, system = list(xml)
        dependencies, restrictions = list(system)

        assert system.get('type') == 'JOS'

        deps = [(dep.get('from'), dep.get('to'), dep.get('label')) for dep in dependencies]
        comps = {comp.get('cid'): comp.get('name') for comp in components}
        # the first child of each entry is the restriction(_or) element
        restrs = {r.get('cid'): r[0] for r in restrictions}

        st.root_component.find_next(deps, comps, restrs)
        # the artificial 'root' node is dropped; its first child becomes root
        st.root_component = list(st.root_component)[0][0]  # get first next

        return st

    def __str__(self):
        arrow = "root -- modra --> "
        return "{} LBS {}\n------\n{}{}".format(self.id, self.lbs, arrow, str(self.root_component))

    def match(self, word):
        """Match `word` against the component tree; see Component.match."""
        return self.root_component.match(word)
|
|
|
|
|
|
|
|
|
|
|
|
def build_structures(filename):
    """Parse `filename` and return a SyntacticStructure per definition in it.

    Every `syntactic_structure` element found anywhere in the document is
    converted with SyntacticStructure.from_xml, in document order.

    The file is handed to the XML parser directly rather than being read in
    text mode first: the parser then decodes the bytes according to the XML
    declaration instead of the platform's default text encoding.
    """
    root = ElementTree.parse(filename).getroot()
    return [
        SyntacticStructure.from_xml(structure)
        for structure in root.iter('syntactic_structure')
    ]
|
|
|
|
|
|
|
|
|
|
|
|
class Word:
    """A corpus token together with its outgoing, labelled links.

    Attributes:
        lemma, id: the XML element's `lemma` and `id` attributes.
        msd: the element's `msd` attribute looked up in MSD_TRANSLATE.
        text: the element's text content.
        links: defaultdict mapping a link label to the list of target words.
    """

    def __init__(self, xml):
        self.id = xml.get('id')
        self.lemma = xml.get('lemma')
        self.text = xml.text
        self.msd = MSD_TRANSLATE[xml.get('msd')]
        self.links = defaultdict(list)

        required = (self.id, self.lemma, self.msd)
        assert None not in required

    def add_link(self, link, to):
        """Record that this word links to `to` under the label `link`."""
        targets = self.links[link]
        targets.append(to)

    def get_links(self, link):
        """Return the words linked under `link`.

        A label containing '|' names alternatives: on first use the targets of
        all the alternatives are concatenated, in order, and stored under the
        combined label, so later calls return that stored list.
        """
        if "|" in link and link not in self.links:
            combined = self.links[link]
            for part in link.split('|'):
                combined.extend(self.links[part])

        return self.links[link]
|
|
|
|
|
|
|
|
|
|
|
|
def load_corpus(filename):
    """Load the words of an XML corpus and connect them through its links.

    The first default-namespace declaration and every ` xml:` attribute prefix
    are stripped from the text before parsing, so elements and attributes can
    be addressed by their plain names. One Word is created per `w` element;
    every `link` element whose `from` and `dep` both name known words adds a
    link labelled with its `afun` attribute.

    Returns:
        A list of all Word objects.

    Raises:
        AssertionError: when a link lacks `dep`, `from` or `afun`, or when a
            known word id ends in '.0'.
    """
    # NOTE(review): the file is decoded with the platform default encoding;
    # confirm the corpus encoding before pinning one here.
    with open(filename, 'r') as fp:
        xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1)
        xmlstring = xmlstring.replace(' xml:', ' ')
        et = ElementTree.XML(xmlstring)

    words = {}
    for w in et.iter("w"):
        words[w.get('id')] = Word(w)

    # Ids of words reached by a 'modra' link from a root. This set was never
    # initialised before, so the first such link raised NameError. It is
    # collected for completeness but not part of the return value.
    root_words = set()

    for link in et.iter("link"):
        assert 'dep' in link.keys() and 'from' in link.keys() and 'afun' in link.keys()

        lfrom = link.get('from')
        if lfrom in words:
            assert not lfrom.endswith('.0')

            next_word_id = link.get('dep')
            if next_word_id in words:
                next_word = words[next_word_id]
                words[lfrom].add_link(link.get('afun'), next_word)

        # catch modra links from root; endswith() is also safe for an empty id
        elif lfrom.endswith('0') and link.get('afun') == 'modra':
            root_words.add(link.get('dep'))
        else:
            # strange errors, just skip...
            pass

    return list(words.values())
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Load the corpus and structures, print the structures and every match.

    Output, in order: each structure, one line per (structure id, match) for
    every word/structure pair that matches, the elapsed time labelled "TIME"
    (measured from after the corpus is loaded), and the number of matches.
    """
    words = load_corpus(STAVKI)

    import time
    started = time.time()

    structures = build_structures(STRUKTURE)
    for structure in structures:
        print(structure)

    num_matches = 0
    for word in words:
        for structure in structures:
            found = structure.match(word)
            if found is None:
                continue
            num_matches += 1
            print(structure.id, found)

    print("TIME", time.time() - started)
    print(num_matches)
|
2018-10-29 10:29:51 +00:00
|
|
|
|
|
|
|
|
|
|
|
# Run the matcher only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|