406 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			406 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from xml.etree import ElementTree
 | |
| import re
 | |
| from enum import Enum
 | |
| from collections import defaultdict
 | |
| 
 | |
| from msd_translate import MSD_TRANSLATE
 | |
| 
 | |
| 
 | |
# Default input files used by main(): STRUKTURE is handed to
# build_structures() and STAVKI to load_corpus().
STRUKTURE = 'Kolokacije_strukture_09_new-system.xml'
STAVKI = 'k2.xml'
 | |
| 
 | |
# Lookup table used when compiling morphology restrictions: it maps the
# spelled-out names found in the structure definitions to the single
# characters used inside a tag.
CODES = {
    # Part-of-speech names (looked up after ``str.capitalize()``) -> the
    # category letter that starts the tag.
    "Noun": "N",
    "Verb": "V",
    "Adjective": "A",
    "Adverb": "R",
    "Pronoun": "P",
    "Numeral": "M",
    "Preposition": "S",
    "Conjunction": "C",
    "Particle": "Q",
    "Interjection": "I",
    "Abbreviation": "Y",
    "Residual": "X",

    # Attribute values -> one-character code.  Several values share a
    # character; they are told apart by the position at which the
    # character is placed in the tag (see TAGSET).
    "common": "c",
    "proper": "p",
    "masculine": "m",
    "feminine": "f",
    "neuter": "n",
    "singular": "s",
    "dual": "d",
    "plural": "p",
    "nominative": "n",
    "genitive": "g",
    "dative": "d",
    "accusative": "a",
    "locative": "l",
    "instrumental": "i",
    "no": "n",
    "yes": "y",
    "main": "m",
    "auxiliary": "a",
    "perfective": "e",
    "progressive": "p",
    "biaspectual": "b",
    "infinitive": "n",
    "supine": "u",
    "participle": "p",
    "present": "r",
    "future": "f",
    "conditional": "c",
    "imperative": "m",
    "first": "1",
    "second": "2",
    "third": "3",
    "general": "g",
    "possessive": "s",
    "positive": "p",
    "comparative": "c",
    "superlative": "s",
    "personal": "p",
    "demonstrative": "d",
    "relative": "r",
    "reflexive": "x",
    "interrogative": "q",
    "indefinite": "i",
    "negative": "z",
    "bound": "b",
    "digit": "d",
    "roman": "r",
    "letter": "l",
    "cardinal": "c",
    "ordinal": "o",
    "pronominal": "p",
    "special": "s",
    "coordinating": "c",
    "subordinating": "s",
    "foreign": "f",
    "typo": "t",
    "program": "p",
}
 | |
| 
 | |
# For every category letter, the attribute names in the order in which
# their codes appear in a tag: the attribute at list index ``i`` is
# character ``i + 1`` of the tag (character 0 is the category letter).
TAGSET = {
    "N": ["type", "gender", "number", "case", "animate"],
    "V": ["type", "aspect", "vform", "person", "number", "gender",
          "negative"],
    "A": ["type", "degree", "gender", "number", "case", "definiteness"],
    "R": ["type", "degree"],
    "P": ["type", "person", "gender", "number", "case", "owner_number",
          "owned_gender", "clitic"],
    "M": ["form", "type", "gender", "number", "case", "definiteness"],
    "S": ["case"],
    "C": ["type"],
    "Q": [],
    "I": [],
    "Y": [],
    "X": ["type"],
}
 | |
| 
 | |
# Default regex fragment for every attribute position of a category, in
# the same order as TAGSET.  '.' stands for a position that must be
# present and '.?' for one that may be missing at the end of the tag.
# build_morphology_regex() copies the list for a category and overwrites
# the positions a restriction constrains.
#
# Every fragment must be exactly '.' or '.?'.  The first pronoun entry
# used to be '. ' (with a trailing space), which compiles to "any
# character followed by a literal space" and therefore could never match
# a tag that contains no spaces.
CATEGORY_BASES = {
    "N": ['.', '.', '.', '.', '.?'],
    "V": ['.', '.', '.', '.', '.?', '.?', '.?'],
    "A": ['.', '.', '.', '.', '.', '.?'],
    "R": ['.', '.?'],
    "P": ['.', '.', '.', '.', '.', '.', '.?', '.?'],
    "M": ['.', '.', '.', '.?', '.?', '.?'],
    "S": ['.'],
    "C": ['.'],
    "Q": [],
    "I": [],
    "Y": [],
    "X": ['.?'],
}
 | |
| 
 | |
| 
 | |
class RestrictionType(Enum):
    """Which property of a word a Restriction is matched against."""

    # Restriction.match() tests the word's ``msd`` attribute.
    Morphology = 0
    # Restriction.match() tests the word's ``lemma`` attribute.
    Lexis = 1
 | |
| 
 | |
| 
 | |
class ComponentLevel(Enum):
    """Which form of a matched word a Component reports."""

    # Component.word_to_str() returns the word's ``lemma``.
    Lemma = 0
    # Component.word_to_str() returns the word's ``text``.
    WordForm = 1
 | |
| 
 | |
| 
 | |
def get_level(restriction):
    """Return the ComponentLevel declared by a restriction's features.

    ``restriction`` is iterated and every item is asked for its
    ``"level"`` attribute via ``.get``.  The first item whose level is
    ``"lemma"`` or ``"word_form"`` decides the result; items without a
    level, or with any other level value, are skipped.

    Raises:
        RuntimeError: if no item declares one of the two known levels.
    """
    for feature in restriction:
        declared = feature.get("level")
        if declared == "lemma":
            return ComponentLevel.Lemma
        if declared == "word_form":
            return ComponentLevel.WordForm

    raise RuntimeError("Unreachable!")
 | |
| 
 | |
| 
 | |
def build_morphology_regex(restriction):
    """Compile a morphology restriction into a regex over positional tags.

    ``restriction`` is an iterable of feature elements; the attributes of
    all of them are merged into one mapping (later features win).  The
    ``POS`` attribute selects the category, ``level`` is ignored here, and
    every remaining attribute constrains one position of the tag.  A value
    containing ``|`` lists alternatives and becomes a character class.

    The pattern starts from ``CATEGORY_BASES`` for the category and is
    meant to be used with ``.match`` (anchored at the start of the tag).

    Returns:
        A compiled ``re`` pattern.

    Raises:
        ValueError: if no feature carries ``POS``, or if an attribute is
            not listed in ``TAGSET`` for the category.
        KeyError: if the POS name or an attribute value is missing from
            ``CODES``.
    """
    restr_dict = {}
    for feature in restriction:
        restr_dict.update(feature.items())

    # Explicit check instead of ``assert`` so that it survives ``python -O``.
    if 'POS' not in restr_dict:
        raise ValueError("morphology restriction has no POS feature")

    cat_code = CODES[restr_dict.pop('POS').capitalize()]
    # 'level' is consumed by get_level(); tolerate restrictions without it.
    restr_dict.pop('level', None)

    attribute_order = TAGSET[cat_code]
    # List concatenation copies, so the shared CATEGORY_BASES entry is
    # never modified.
    rgx = [cat_code] + CATEGORY_BASES[cat_code]

    last_restricted = 0
    for attribute, value in restr_dict.items():
        # list.index raises ValueError for an unknown attribute; +1 skips
        # the leading category letter.
        slot = attribute_order.index(attribute.lower()) + 1
        codes = [CODES[val] for val in value.split('|')]
        if len(codes) == 1:
            rgx[slot] = codes[0]
        else:
            rgx[slot] = '[' + "".join(codes) + ']'
        last_restricted = max(last_restricted, slot)

    # An optional ('.?') position in front of a restricted one would let
    # the restricted character match one or more positions too early, so
    # every position before the last restricted one is made mandatory.
    for slot in range(1, last_restricted):
        if rgx[slot] == '.?':
            rgx[slot] = '.'

    return re.compile("".join(rgx))
 | |
| 
 | |
| 
 | |
def build_lexis_regex(restriction):
    """Compile the ``lemma`` attribute of a lexis restriction as a regex.

    The attributes of all items in ``restriction`` are merged (later items
    override earlier ones) and the resulting ``lemma`` value is passed to
    ``re.compile`` unchanged.

    Raises:
        KeyError: if none of the items has a ``lemma`` attribute.
    """
    merged = {}
    for feature in restriction:
        for key, val in feature.items():
            merged[key] = val

    lemma_pattern = merged['lemma']
    return re.compile(lemma_pattern)
 | |
| 
 | |
| 
 | |
class Restriction:
    """A single matching rule built from a ``restriction`` XML element.

    The element's ``type`` attribute selects the kind of rule:
    ``"morphology"`` compiles the child features with
    build_morphology_regex(), ``"lexis"`` with build_lexis_regex().

    Attributes:
        type: the RestrictionType of this rule.
        matcher: the compiled regular expression.
    """

    def __init__(self, restriction_tag):
        """Build the rule from ``restriction_tag``.

        Raises:
            NotImplementedError: for any ``type`` other than
                ``"morphology"`` or ``"lexis"``.
        """
        restriction_type = restriction_tag.get('type')
        # Element.getchildren() was removed in Python 3.9; list(element)
        # returns the same direct children on every Python 3 version.
        features = list(restriction_tag)

        if restriction_type == "morphology":
            self.type = RestrictionType.Morphology
            self.matcher = build_morphology_regex(features)
        elif restriction_type == "lexis":
            self.type = RestrictionType.Lexis
            self.matcher = build_lexis_regex(features)
        else:
            raise NotImplementedError(
                "unsupported restriction type: {!r}".format(restriction_type))

    def match(self, word):
        """Match ``word`` against this rule.

        Morphology rules test ``word.msd``, lexis rules test
        ``word.lemma``.  Returns the result of ``re.Pattern.match``: a
        match object, or None when the word does not satisfy the rule.
        """
        if self.type == RestrictionType.Morphology:
            match_to = word.msd
        elif self.type == RestrictionType.Lexis:
            match_to = word.lemma
        else:
            raise RuntimeError("Unreachable!")

        return self.matcher.match(match_to)

    def __str__(self):
        # ``.name`` is the member name, i.e. the part after the dot in
        # ``str(self.type)``.
        return "({:s} {})".format(self.type.name, self.matcher)

    def __repr__(self):
        return str(self)
 | |
| 
 | |
| 
 | |
class Component:
    """One element of a structure's chain of components.

    Attributes:
        name: display name (empty string when constructed with None).
        restriction: None until set_restriction() is called; afterwards a
            single Restriction or a list of alternative Restrictions.
        next_element: None for the last component, otherwise a
            ``(next_component, link_label)`` tuple.
        level: the ComponentLevel used when reporting a matched word.
    """

    def __init__(self, name):
        self.name = "" if name is None else name
        self.restriction = None
        self.next_element = None
        self.level = None

    def word_to_str(self, word):
        """Return ``(lemma or text, msd)`` for ``word``, chosen by level."""
        if self.level == ComponentLevel.Lemma:
            shown = word.lemma
        elif self.level == ComponentLevel.WordForm:
            shown = word.text
        else:
            raise RuntimeError("Unreachable")
        return shown, word.msd

    def has_next(self):
        """Tell whether another component follows this one."""
        return self.next_element is not None

    def get_next(self):
        """Return the following component."""
        return self.next_element[0]

    def link_label(self):
        """Return the label of the link leading to the next component."""
        return self.next_element[1]

    def set_next(self, next_component, link_label):
        """Attach ``next_component``, reached through ``link_label``."""
        self.next_element = (next_component, link_label)

    def set_restriction(self, restrictions_tag):
        """Configure restriction and level from an XML element.

        A ``restriction`` element yields a single Restriction; a
        ``restriction_or`` element yields a list with one Restriction per
        child, all of which must declare the same level.
        """
        tag = restrictions_tag.tag

        if tag == "restriction":
            self.restriction = Restriction(restrictions_tag)
            self.level = get_level(restrictions_tag)
            return

        if tag != "restriction_or":
            raise RuntimeError("Unreachable")

        self.restriction = [Restriction(el) for el in restrictions_tag]
        self.level = get_level(restrictions_tag[0])

        # Only "or" is supported for now, and every alternative has to
        # work on the same level.
        levels = [get_level(el) for el in restrictions_tag]
        assert(len(set(levels)) == 1)

    def __str__(self):
        text = "(N.{:7s} {})".format(self.name, str(self.restriction))
        if not self.has_next():
            return text
        return text + "  -- {} -->\n{}".format(self.link_label(), str(self.get_next()))

    def __repr__(self):
        return str(self)

    def match(self, word):
        """Match the chain that starts at this component against ``word``.

        Returns a list with one ``word_to_str`` tuple per component of the
        chain, or None when this component or the rest of the chain does
        not match.  Only the first linked word that completes the chain
        is used.
        """
        # A single restriction is handled like a one-element list of
        # alternatives; the first alternative that matches wins.
        if type(self.restriction) is list:
            alternatives = self.restriction
        else:
            alternatives = [self.restriction]

        matched = None
        for restr in alternatives:
            matched = restr.match(word)
            if matched is not None:
                break

        if not matched:
            return None

        collected = [self.word_to_str(word)]

        # Last component of the chain: everything has matched.
        if not self.has_next():
            return collected

        # Follow every link carrying the expected label and stop at the
        # first one from which the remainder of the chain matches.
        label = self.link_label()
        successor = self.get_next()
        for next_word in word.get_links(label):
            tail = successor.match(next_word)
            if tail is not None:
                collected.extend(tail)
                return collected

        return None
 | |
| 
 | |
| 
 | |
class SyntacticStructure:
    """A chain of Components describing one syntactic structure.

    Attributes:
        root_component: first Component of the chain.
        id: value of the element's ``id`` attribute.
        lbs: value of the element's ``LBS`` attribute.
    """

    def __init__(self):
        # Placeholder head; from_xml() replaces it with the first real
        # component once the chain has been built.
        self.root_component = Component('root')
        self.id = None
        self.lbs = None

    @staticmethod
    def from_xml(xml):
        """Build a structure from a ``syntactic_structure`` element.

        The element must have exactly two children (components and a
        system), and the system exactly two (dependencies and
        restrictions).  The chain is built by following the dependencies
        from ``'root'``; because dependencies are keyed by their ``from``
        attribute, only one dependency per source is kept.

        Raises:
            ValueError: if no dependency starts at ``'root'`` (or, from
                the unpacking, if the child counts differ from two).
        """
        st = SyntacticStructure()
        st.id = xml.get('id')
        st.lbs = xml.get('LBS')

        # Element.getchildren() was removed in Python 3.9; iterating an
        # element yields the same direct children.
        components, system = list(xml)
        dependencies, restrictions = list(system)

        assert(system.get('type') == 'JOS')

        deps = {dep.get('from'): (dep.get('to'), dep.get('label')) for dep in dependencies}
        comps = {comp.get('cid'): comp.get('name') for comp in components}
        # The first child of each entry is the restriction (or
        # restriction_or) element handed to Component.set_restriction().
        restrs = {r.get('cid'): r[0] for r in restrictions}

        current_component = st.root_component
        idx = 'root'

        while idx in deps:
            idx, dep_label = deps[idx]

            next_component = Component(comps[idx])
            next_component.set_restriction(restrs[idx])

            current_component.set_next(next_component, dep_label)
            current_component = next_component

        if not st.root_component.has_next():
            # Without this check the line below fails with an opaque
            # "'NoneType' object is not subscriptable".
            raise ValueError(
                "structure {!r} has no dependency starting at 'root'".format(st.id))

        # Drop the placeholder head.
        st.root_component = st.root_component.get_next()
        return st

    def __str__(self):
        return "{} LBS {}\n------\n{}".format(self.id, self.lbs, str(self.root_component))

    def match(self, word):
        """Match the whole chain starting at ``word``; see Component.match."""
        return self.root_component.match(word)
 | |
| 
 | |
| 
 | |
def build_structures(filename):
    """Parse ``filename`` and return its structures as a list.

    Every ``syntactic_structure`` element found anywhere in the document
    is converted with SyntacticStructure.from_xml(), in document order.

    The file is handed to the XML parser directly rather than being read
    through a text-mode ``open()``: that way the encoding declared in the
    document is used instead of the platform's default text encoding.
    """
    root = ElementTree.parse(filename).getroot()
    return [SyntacticStructure.from_xml(structure)
            for structure in root.iter('syntactic_structure')]
 | |
| 
 | |
| 
 | |
class Word:
    """A corpus token together with its outgoing links.

    Attributes:
        lemma: the element's ``lemma`` attribute.
        msd: the element's ``msd`` attribute looked up in MSD_TRANSLATE.
        id: the element's ``id`` attribute.
        text: the element's text content.
        links: maps a link label to the list of linked Word objects.
    """

    def __init__(self, xml):
        attribute = xml.get

        self.lemma = attribute('lemma')
        self.msd = MSD_TRANSLATE[attribute('msd')]
        self.id = attribute('id')
        self.text = xml.text
        self.links = defaultdict(list)

        required = (self.id, self.lemma, self.msd)
        assert(None not in required)

    def add_link(self, link, to):
        """Record that ``to`` is reachable from this word via ``link``."""
        targets = self.links[link]
        targets.append(to)

    def get_links(self, link):
        """Return the list of words linked via ``link`` (possibly empty)."""
        return self.links[link]
 | |
| 
 | |
| 
 | |
def load_corpus(filename):
    """Load the corpus in ``filename`` and return its words as a list.

    Every ``w`` element becomes a Word (keyed by its ``id``), and every
    ``link`` element whose ``from`` and ``dep`` both name known words adds
    a link labelled with its ``afun`` from the former to the latter.
    Links that refer to unknown ids are ignored.

    The file is read as bytes and parsed as bytes so that the XML parser
    applies the encoding declared in the document; reading it in text
    mode would decode it with the platform's default encoding instead.
    The byte-level substitutions below assume an ASCII-compatible
    encoding such as UTF-8.
    """
    with open(filename, 'rb') as fp:
        raw = fp.read()

    # Drop the first default-namespace declaration and the 'xml:' prefix
    # of attributes so that elements and attributes can be addressed by
    # their plain names.
    raw = re.sub(rb' xmlns="[^"]+"', b'', raw, count=1)
    raw = raw.replace(b' xml:', b' ')
    et = ElementTree.XML(raw)

    words = {}
    for w in et.iter("w"):
        words[w.get('id')] = Word(w)

    required = ('dep', 'from', 'afun')
    for link in et.iter("link"):
        present = link.keys()
        assert(all(name in present for name in required))

        source = words.get(link.get('from'))
        target = words.get(link.get('dep'))
        if source is not None and target is not None:
            source.add_link(link.get('afun'), target)

    return list(words.values())
 | |
| 
 | |
| 
 | |
def main():
    """Match every structure against every corpus word and print results.

    Loads the corpus from STAVKI, then (timed from this point on) builds
    the structures from STRUKTURE, prints each of them, prints the
    structure id and the match for every successful match, and finally
    prints the elapsed time and the number of matches.
    """
    corpus_words = load_corpus(STAVKI)

    import time
    started = time.time()

    structures = build_structures(STRUKTURE)
    for structure in structures:
        print(structure)

    num_matches = 0
    for word in corpus_words:
        for structure in structures:
            found = structure.match(word)
            if found is None:
                continue
            num_matches += 1
            print(structure.id, found)

    print("TIME", time.time() - started)
    print(num_matches)


if __name__ == "__main__":
    main()
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 |