First commit

This commit is contained in:
Ozbolt Menegatti 2018-10-29 11:29:51 +01:00
parent 4604ac1878
commit 74a1e4834b
2 changed files with 2275 additions and 0 deletions

1903
msd_translate.py Normal file

File diff suppressed because it is too large Load Diff

372
wani.py Normal file
View File

@ -0,0 +1,372 @@
from xml.etree import ElementTree
import re
from enum import Enum
from collections import defaultdict
from msd_translate import MSD_TRANSLATE
# Path of the XML file with syntactic-structure definitions; main() passes it
# to build_structures().
STRUKTURE = "Kolokacije_strukture_08_new-system.xml"
# Path of the corpus XML file; main() passes it to load_corpus().
STAVKI = "k2.xml"
# Maps spelled-out names used in restriction definitions to the one-character
# codes used inside MSD tags.  build_morphology_regex() looks up both the
# capitalised part-of-speech name and every attribute value in this table.
CODES = {
    # Part-of-speech categories (looked up after str.capitalize()).
    "Noun": "N",
    "Verb": "V",
    "Adjective": "A",
    "Adverb": "R",
    "Pronoun": "P",
    "Numeral": "M",
    "Preposition": "S",
    "Conjunction": "C",
    "Particle": "Q",
    "Interjection": "I",
    "Abbreviation": "Y",
    "Residual": "X",

    # Attribute values.
    "common": "c",
    "proper": "p",

    "masculine": "m",
    "feminine": "f",
    "neuter": "n",

    "singular": "s",
    "dual": "d",
    "plural": "p",

    "nominative": "n",
    "genitive": "g",
    "dative": "d",
    "accusative": "a",
    "locative": "l",
    "instrumental": "i",

    "no": "n",
    "yes": "y",

    "main": "m",
    "auxiliary": "a",

    "perfective": "e",
    "progressive": "p",
    "biaspectual": "b",

    "infinitive": "n",
    "supine": "u",
    "participle": "p",
    "present": "r",
    "future": "f",
    "conditional": "c",
    "imperative": "m",

    "first": "1",
    "second": "2",
    "third": "3",

    "general": "g",
    "possessive": "s",

    "positive": "p",
    "comparative": "c",
    "superlative": "s",

    "personal": "p",
    "demonstrative": "d",
    "relative": "r",
    "reflexive": "x",
    "interrogative": "q",
    "indefinite": "i",
    "negative": "z",

    "bound": "b",

    "digit": "d",
    "roman": "r",
    "letter": "l",

    "cardinal": "c",
    "ordinal": "o",
    "pronominal": "p",
    "special": "s",

    "coordinating": "c",
    "subordinating": "s",

    "foreign": "f",
    "typo": "t",
    "program": "p",
}
# For every part-of-speech code: the attribute names in the order in which
# their one-character values follow the category letter.
# build_morphology_regex() uses the position of an attribute in this list to
# pick the regex slot it has to overwrite.
TAGSET = {
    "N": ["type", "gender", "number", "case", "animate"],
    "V": ["type", "aspect", "vform", "person", "number", "gender", "negative"],
    "A": ["type", "degree", "gender", "number", "case", "definiteness"],
    "R": ["type", "degree"],
    "P": [
        "type",
        "person",
        "gender",
        "number",
        "case",
        "owner_number",
        "owned_gender",
        "clitic",
    ],
    "M": ["form", "type", "gender", "number", "case", "definiteness"],
    "S": ["case"],
    "C": ["type"],
    "Q": [],
    "I": [],
    "Y": [],
    "X": ["type"],
}
# Default regex fragment for every attribute slot of a part-of-speech code,
# positionally aligned with the attribute lists in TAGSET.  '.' accepts any
# single character in a slot, '.?' marks a slot that may be absent.
# build_morphology_regex() copies the list for the requested category and
# overwrites the slots that a restriction pins to concrete values.
#
# Every fragment must be exactly '.' or '.?': the first "P" slot used to be
# '. ' (with a trailing space), which made every pronoun pattern that left that
# slot at its default demand a literal space inside the MSD and thus never
# match.
CATEGORY_BASES = {
    "N": ['.', '.', '.', '.', '.?'],
    "V": ['.', '.', '.', '.', '.?', '.?', '.?'],
    "A": ['.', '.', '.', '.', '.', '.?'],
    "R": ['.', '.?'],
    "P": ['.', '.', '.', '.', '.', '.', '.?', '.?'],
    "M": ['.', '.', '.', '.?', '.?', '.?'],
    "S": ['.'],
    "C": ['.'],
    "Q": [],
    "I": [],
    "Y": [],
    "X": ['.?'],
}
class RestrictionType(Enum):
    """Which property of a word a component's restriction regex is matched against."""
    # Component.match() applies the regex to the word's MSD tag.
    Morphology = 0
    # Component.match() applies the regex to the word's lemma.
    Lexis = 1
class ComponentLevel(Enum):
    """Which textual form of a matched word a component reports."""
    # Component.word_to_str() reports the word's lemma.
    Lemma = 0
    # Component.word_to_str() reports the word's surface text.
    WordForm = 1
def get_level(restriction):
    """Return the ComponentLevel declared by a restriction's features.

    ``restriction`` is an iterable of attribute-carrying objects (the children
    of a restriction element).  The first feature whose ``level`` attribute is
    ``"lemma"`` or ``"word_form"`` decides the result; features without a
    ``level`` attribute, or with any other value, are skipped.

    Raises:
        RuntimeError: if no feature declares a recognised level.
    """
    for feature in restriction:
        declared = feature.get("level")
        if declared == "lemma":
            return ComponentLevel.Lemma
        if declared == "word_form":
            return ComponentLevel.WordForm

    raise RuntimeError("Unreachable!")
def build_morphology_regex(restriction):
    """Compile a regex over MSD tags from the features of a morphology restriction.

    The attributes of all features are merged into one mapping.  ``POS`` selects
    the category letter and the default per-slot pattern (CATEGORY_BASES); every
    remaining attribute except ``level`` overwrites the slot that TAGSET assigns
    to it.  A value of the form ``a|b`` becomes a character class of the
    corresponding codes.

    Args:
        restriction: iterable of objects exposing ``items()`` (the feature
            elements of the restriction).

    Returns:
        The compiled pattern; callers apply it with ``match()``.

    Raises:
        ValueError: if no feature carries ``POS`` or an attribute is not part
            of the category's tagset.
        KeyError: if the category or an attribute value has no entry in CODES.
    """
    restr_dict = {}
    for feature in restriction:
        restr_dict.update(feature.items())

    # 'level' is not a morphological attribute; it is merely discarded here, so
    # its absence must not be an error in this function.
    restr_dict.pop('level', None)

    # Explicit check instead of `assert`, which disappears under `python -O`.
    if 'POS' not in restr_dict:
        raise ValueError("morphology restriction has no 'POS' feature")
    category = restr_dict.pop('POS').capitalize()
    cat_code = CODES[category]

    # Slot 0 is the category letter, slots 1.. follow the TAGSET order.
    rgx = [cat_code] + CATEGORY_BASES[cat_code]
    slots = TAGSET[cat_code]

    for attribute, value in restr_dict.items():
        # list.index() raises rather than returning a negative number, so the
        # failure is translated into a message naming the offending attribute.
        try:
            index = slots.index(attribute.lower())
        except ValueError:
            raise ValueError(
                "attribute {!r} is not defined for category {!r}".format(
                    attribute, category)) from None

        if '|' in value:
            match = '[' + "".join(CODES[val] for val in value.split('|')) + ']'
        else:
            match = CODES[value]

        rgx[index + 1] = match

    return re.compile("".join(rgx))
def build_lexis_regex(restriction):
    """Compile the ``lemma`` attribute of a lexis restriction into a regex.

    The attributes of all features in ``restriction`` are merged (later
    features override earlier ones) and the resulting ``lemma`` value is used
    verbatim as the pattern.

    Raises:
        KeyError: if none of the features carries a ``lemma`` attribute.
    """
    merged = {}
    for feature in restriction:
        for key, value in feature.items():
            merged[key] = value

    lemma_pattern = merged['lemma']
    return re.compile(lemma_pattern)
class Component:
    """One element in the chain of components that makes up a syntactic structure.

    A component holds a compiled restriction regex, the kind of restriction
    (RestrictionType), the form in which a matched word is reported
    (ComponentLevel) and optionally a link to the next component together with
    the dependency label that must connect the two matched words.
    """

    def __init__(self, name):
        """Create an unrestricted, unlinked component; ``None`` names become ''."""
        self.name = name if name is not None else ""
        self.restriction_type = None
        self.restriction = None
        # Either None or a (next_component, link_label) tuple.
        self.next_element = None
        self.level = None

    def word_to_str(self, word):
        """Return ``(lemma or text, msd)`` for ``word`` depending on ``self.level``.

        Raises:
            RuntimeError: if no level has been set on this component.
        """
        if self.level == ComponentLevel.Lemma:
            return word.lemma, word.msd
        elif self.level == ComponentLevel.WordForm:
            return word.text, word.msd
        else:
            raise RuntimeError("Unreachable")

    def has_next(self):
        """Return True when a following component has been linked."""
        return self.next_element is not None

    def get_next(self):
        """Return the following component (requires ``has_next()``)."""
        return self.next_element[0]

    def link_label(self):
        """Return the dependency label leading to the following component."""
        return self.next_element[1]

    def set_next(self, next_component, link_label):
        """Link ``next_component`` behind this one via ``link_label``."""
        self.next_element = (next_component, link_label)

    def set_restriction(self, restriction_tag):
        """Configure restriction type, regex and level from a restriction element.

        Raises:
            NotImplementedError: if the element's ``type`` is neither
                ``"morphology"`` nor ``"lexis"``.
        """
        # Element.getchildren() was removed in Python 3.9; iterating the
        # element yields the same child list.
        features = list(restriction_tag)

        restriction_type = restriction_tag.get('type')
        if restriction_type == "morphology":
            self.restriction_type = RestrictionType.Morphology
            self.restriction = build_morphology_regex(features)
        elif restriction_type == "lexis":
            self.restriction_type = RestrictionType.Lexis
            self.restriction = build_lexis_regex(features)
        else:
            raise NotImplementedError()

        self.level = get_level(features)

    def __str__(self):
        """Render this component and, recursively, everything linked behind it."""
        # A component without a restriction (e.g. a fresh one) must still be
        # printable; previously this indexed past the end of ['None'].
        if self.restriction_type is None:
            type_label = "None"
        else:
            type_label = self.restriction_type.name

        el = "(N.{:7s} {:12s} {})".format(self.name, type_label, self.restriction)
        if self.has_next():
            el += " -- {} -->\n{}".format(self.link_label(), str(self.get_next()))
        return el

    def __repr__(self):
        return str(self)

    def match(self, word):
        """Match ``word`` and the rest of the chain along the word's links.

        Returns:
            A list of ``word_to_str()`` tuples, one per component starting with
            this one, taken from the first linked word whose sub-chain matches;
            ``None`` when this component's restriction rejects ``word`` or no
            linked word satisfies the remaining chain.

        Raises:
            RuntimeError: if no restriction type has been set.
        """
        if self.restriction_type == RestrictionType.Morphology:
            match_to = word.msd
        elif self.restriction_type == RestrictionType.Lexis:
            match_to = word.lemma
        else:
            raise RuntimeError("Unreachable!")

        if not self.restriction.match(match_to):
            return None

        to_ret = [self.word_to_str(word)]

        # Last component of the chain: everything has matched.
        if not self.has_next():
            return to_ret

        # Try every word reachable via the required link label and keep the
        # first one that satisfies the rest of the chain.
        next_component = self.get_next()
        for next_word in word.get_links(self.link_label()):
            match = next_component.match(next_word)
            if match is not None:
                to_ret.extend(match)
                return to_ret

        return None
class SyntacticStructure:
    """A chain of restricted components built from one structure definition."""

    def __init__(self):
        # Placeholder head; from_xml() replaces it with the first real component.
        self.root_component = Component('root')
        self.id = None
        self.lbs = None

    @staticmethod
    def from_xml(xml):
        """Build a structure from a ``syntactic_structure`` element.

        The element must have exactly two children (components, system) and the
        system element exactly two children (dependencies, restrictions).
        Starting at the dependency whose ``from`` is ``'root'``, the ``to``
        ids are followed to create one Component per step.

        Raises:
            ValueError: if the child counts differ, the system type is not
                ``'JOS'``, or no dependency starts at ``'root'``.
        """
        st = SyntacticStructure()
        st.id = int(xml.get('id'))
        st.lbs = xml.get('LBS')

        # Element.getchildren() was removed in Python 3.9; list(element)
        # yields the same children.
        components, system = list(xml)
        dependencies, restrictions = list(system)

        # Explicit check instead of `assert`, which disappears under `python -O`.
        if system.get('type') != 'JOS':
            raise ValueError(
                "unsupported system type {!r} in structure {}".format(
                    system.get('type'), st.id))

        deps = {dep.get('from'): (dep.get('to'), dep.get('label')) for dep in dependencies}
        comps = {comp.get('cid'): comp.get('name') for comp in components}
        restrs = {r.get('cid'): r[0] for r in restrictions}

        if 'root' not in deps:
            raise ValueError(
                "structure {} has no dependency starting at 'root'".format(st.id))

        current_component = st.root_component
        idx = 'root'

        while idx in deps:
            idx, dep_label = deps[idx]

            next_component = Component(comps[idx])
            next_component.set_restriction(restrs[idx])

            current_component.set_next(next_component, dep_label)
            current_component = next_component

        # Drop the placeholder so matching starts at the first real component.
        st.root_component = st.root_component.get_next()
        return st

    def __str__(self):
        return "{} LBS {}\n------\n{}".format(self.id, self.lbs, str(self.root_component))

    def match(self, word):
        """Match the component chain starting at ``word``; see Component.match()."""
        return self.root_component.match(word)
def build_structures(filename):
    """Parse ``filename`` and return a SyntacticStructure per definition in it.

    Only ``syntactic_structure`` elements that are direct children of the
    document root are considered.

    The file is handed to the XML parser directly instead of being read in
    text mode first, so the encoding declared in the document is honoured
    rather than the platform's default text encoding.
    """
    root = ElementTree.parse(filename).getroot()
    return [
        SyntacticStructure.from_xml(structure)
        for structure in root.iterfind('syntactic_structure')
    ]
class Word:
    """A corpus token with its lemma, translated MSD, id, text and outgoing links."""

    def __init__(self, xml):
        """Initialise from a ``w`` element carrying ``lemma``, ``msd`` and ``id``.

        Raises:
            ValueError: if one of the three attributes is missing.
            KeyError: if the ``msd`` value has no entry in MSD_TRANSLATE.
        """
        self.lemma = xml.get('lemma')
        self.id = xml.get('id')
        raw_msd = xml.get('msd')

        # Validate before the table lookup (a missing msd used to surface as
        # KeyError(None)) and without `assert`, which `python -O` strips.
        if None in (self.id, self.lemma, raw_msd):
            raise ValueError(
                "word element needs 'id', 'lemma' and 'msd' attributes "
                "(id={!r})".format(self.id))

        self.msd = MSD_TRANSLATE[raw_msd]
        self.text = xml.text
        # link label -> list of target Word objects
        self.links = defaultdict(list)

    def add_link(self, link, to):
        """Register ``to`` as a target reachable from this word via label ``link``."""
        self.links[link].append(to)

    def get_links(self, link):
        """Return the targets linked via ``link`` (an empty list if there are none).

        Uses ``get`` so that a lookup never inserts a new empty entry into the
        defaultdict; indexing would add one for every label that is queried
        but was never linked.
        """
        return self.links.get(link, [])
def load_corpus(filename):
    """Load all words of a corpus XML file and wire up their dependency links.

    The first default-namespace declaration and every `` xml:`` attribute
    prefix are stripped from the raw document so that elements and attributes
    can be addressed by their plain names.  Every ``w`` element becomes a Word;
    every ``link`` element whose ``from`` and ``dep`` ids both refer to known
    words adds a link labelled with its ``afun`` from the former to the latter.

    Returns:
        A list of all Word objects.

    Raises:
        ValueError: if a ``link`` element lacks ``dep``, ``from`` or ``afun``.
    """
    # Work on bytes and let the XML parser apply the encoding declared in the
    # document; text mode would silently use the platform default encoding.
    # The two substitutions only touch ASCII, which is safe for any
    # ASCII-compatible encoding.
    with open(filename, 'rb') as fp:
        raw = fp.read()
    raw = re.sub(rb' xmlns="[^"]+"', b'', raw, count=1)
    raw = raw.replace(b' xml:', b' ')
    et = ElementTree.XML(raw)

    words = {}
    for w in et.iter("w"):
        words[w.get('id')] = Word(w)

    for link in et.iter("link"):
        source_id = link.get('from')
        target_id = link.get('dep')
        label = link.get('afun')
        # Explicit check instead of `assert`, which disappears under `python -O`.
        if None in (source_id, target_id, label):
            raise ValueError("link element needs 'dep', 'from' and 'afun' attributes")

        # Links touching ids that are not words are ignored.
        if source_id in words and target_id in words:
            words[source_id].add_link(label, words[target_id])

    return list(words.values())
def main():
    """Load the corpus and the structure definitions and print the structures.

    NOTE(review): the indentation of this file was lost; ``exit(0)`` is assumed
    to follow the printing loop rather than sit inside it -- confirm against
    the original.  Either way the matching code below it is not reached once
    at least one structure has been printed.
    """
    words = load_corpus(STAVKI)
    import time
    # Start of the timed section (structure building and, if reached, matching).
    t = time.time()
    structures = build_structures(STRUKTURE)
    for s in structures:
        print(s)
    # NOTE(review): early exit, presumably a debugging leftover -- everything
    # after this line is unreachable while it is in place.
    exit(0)
    print(STAVKI)
    num_matches = 0
    # Try every structure on every word; a non-None result is a match.
    for w in words:
        for s in structures:
            m = s.match(w)
            if m is not None:
                num_matches += 1
                print(s.id, m)
    print("TIME", time.time() - t)
    # print(num_matches)
if __name__ == '__main__':
    main()