From 90dbbca5d54dacbe8672647532ec933a5bac01e2 Mon Sep 17 00:00:00 2001
From: Ozbolt Menegatti
Date: Sat, 15 Jun 2019 18:55:35 +0200
Subject: [PATCH] HUGE refactor, creating lots of modules, no code changes though!

---
 src/codes_tagset.py | 86 ++
 src/component.py | 166 +++
 src/formatter.py | 190 +++
 src/lemma_features.py | 26 +
 src/match.py | 20 +
 src/match_store.py | 46 +
 msd_translate.py => src/msd_translate.py | 0
 src/order.py | 34 +
 src/representation.py | 167 +++
 src/representation_assigner.py | 82 ++
 src/restriction.py | 133 ++
 src/syntactic_structure.py | 110 ++
 src/wani.py | 217 +++
 src/word.py | 61 +
 src/word_stats.py | 100 ++
 src/writer.py | 137 ++
 wani.py | 1544 ----------------------
 17 files changed, 1575 insertions(+), 1544 deletions(-)
 create mode 100644 src/codes_tagset.py
 create mode 100644 src/component.py
 create mode 100644 src/formatter.py
 create mode 100644 src/lemma_features.py
 create mode 100644 src/match.py
 create mode 100644 src/match_store.py
 rename msd_translate.py => src/msd_translate.py (100%)
 create mode 100644 src/order.py
 create mode 100644 src/representation.py
 create mode 100644 src/representation_assigner.py
 create mode 100644 src/restriction.py
 create mode 100644 src/syntactic_structure.py
 create mode 100644 src/wani.py
 create mode 100644 src/word.py
 create mode 100644 src/word_stats.py
 create mode 100644 src/writer.py
 delete mode 100644 wani.py

diff --git a/src/codes_tagset.py b/src/codes_tagset.py
new file mode 100644
index 0000000..c3f3fb6
--- /dev/null
+++ b/src/codes_tagset.py
@@ -0,0 +1,86 @@
+CODES = {
+    "Noun": "N",
+    "Verb": "V",
+    "Adjective": "A",
+    "Adverb": "R",
+    "Pronoun": "P",
+    "Numeral": "M",
+    "Preposition": "S",
+    "Conjunction": "C",
+    "Particle": "Q",
+    "Interjection": "I",
+    "Abbreviation": "Y",
+    "Residual": "X",
+
+    'common': 'c',
+    'proper': 'p',
+    'masculine': 'm',
+    'feminine': 'f',
+    'neuter': 'n',
+    "singular": "s",
+    "dual": "d",
+    "plural": "p",
+    "nominative": "n",
+    "genitive": "g",
+    "dative": "d",
+    "accusative": "a",
+    "locative": "l",
+    "instrumental": "i",
+    "no": "n",
+    "yes": "y",
+    "main": "m",
+    "auxiliary": "a",
+    "perfective": "e",
+    "progressive": "p",
+    "biaspectual": "b",
+    "infinitive": "n",
+    "supine": "u",
+    "participle": "p",
+    "present": "r",
+    "future": "f",
+    "conditional": "c",
+    "imperative": "m",
+    "first": "1",
+    "second": "2",
+    "third": "3",
+    "general": "g",
+    "possessive": "s",
+    "positive": "p",
+    "comparative": "c",
+    "superlative": "s",
+    "personal": "p",
+    "demonstrative": "d",
+    "relative": "r",
+    "reflexive": "x",
+    "interrogative": "q",
+    "indefinite": "i",
+    "negative": "z",
+    "bound": "b",
+    "digit": "d",
+    "roman": "r",
+    "letter": "l",
+    "cardinal": "c",
+    "ordinal": "o",
+    "pronominal": "p",
+    "special": "s",
+    "coordinating": "c",
+    "subordinating": "s",
+    "foreign": "f",
+    "typo": "t",
+    "program": "p",
+}
+
+TAGSET = {
+    "N": ['type', 'gender', 'number', 'case', 'animate'],
+    "V": ['type', 'aspect', 'vform', 'person', 'number', 'gender', 'negative'],
+    "A": ['type', 'degree', 'gender', 'number', 'case', 'definiteness'],
+    "R": ['type', 'degree'],
+    "P": ['type', 'person', 'gender', 'number', 'case', 'owner_number', 'owned_gender', 'clitic'],
+    "M": ['form', 'type', 'gender', 'number', 'case', 'definiteness'],
+    "S": ['case'],
+    "C": ['type'],
+    "Q": [],
+    "I": [],
+    "Y": [],
+    "X": ['type']
+}
diff --git a/src/component.py b/src/component.py
new file mode 100644
index 0000000..05035ad
--- /dev/null
+++ b/src/component.py
@@
-0,0 +1,166 @@ +from enum import Enum +import logging + +from restriction import Restriction +from order import Order +from representation_assigner import RepresentationAssigner + + +class ComponentStatus(Enum): + Optional = 0 + Required = 1 + Forbidden = 2 + + +class ComponentType(Enum): + Other = 0 + Core = 2 + Core2w = 3 + + +class Component: + def __init__(self, info): + idx = info['cid'] + name = info['name'] if 'name' in info else None + typ = ComponentType.Core if info['type'] == "core" else ComponentType.Other + + if 'status' not in info: + status = ComponentStatus.Required + elif info['status'] == 'forbidden': + status = ComponentStatus.Forbidden + elif info['status'] == 'obligatory': + status = ComponentStatus.Required + elif info['status'] == 'optional': + status = ComponentStatus.Optional + else: + raise NotImplementedError("strange status: {}".format(info['status'])) + + self.status = status + self.name = name + self.idx = idx + self.restrictions = [] + self.next_element = [] + self.representation = [] + self.selection = {} + self.type = typ + + self.iter_ctr = 0 + + def add_next(self, next_component, link_label, order): + self.next_element.append((next_component, link_label, Order.new(order))) + + def set_restriction(self, restrictions_tag): + if restrictions_tag is None: + self.restrictions = [Restriction(None)] + + elif restrictions_tag.tag == "restriction": + self.restrictions = [Restriction(restrictions_tag)] + + elif restrictions_tag.tag == "restriction_or": + self.restrictions = [Restriction(el) for el in restrictions_tag] + + else: + raise RuntimeError("Unreachable") + + def set_representation(self, representation): + for rep in representation: + crend = RepresentationAssigner() + for feature in rep: + crend.add_feature(feature.attrib) + self.representation.append(crend) + + def find_next(self, deps, comps, restrs, reprs): + to_ret = [] + for d in deps: + if d[0] == self.idx: + _, idx, dep_label, order = d + + next_component = Component(comps[idx]) + next_component.set_restriction(restrs[idx]) + next_component.set_representation(reprs[idx]) + to_ret.append(next_component) + + self.add_next(next_component, dep_label, order) + others = next_component.find_next(deps, comps, restrs, reprs) + to_ret.extend(others) + + return to_ret + + def name_str(self): + return "_" if self.name is None else self.name + + def match(self, word): + m1 = self._match_self(word) + if m1 is None: + return None + + mn = self._match_next(word) + if mn is None: + return None + + to_ret = [m1] + for cmatch in mn: + # if good match but nothing to add, just continue + if len(cmatch) == 0: + continue + + # if more than one match found for particular component + elif len(cmatch) > 1: + # if more than one match in multiple components, NOPE! 
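+                # worked example (illustrative, not from the original source):
+                # with to_ret == [{'1': w1}] and cmatch == [{'2': w2}, {'2': w3}],
+                # the single partial match is expanded below into
+                # [{'1': w1, '2': w2}, {'1': w1, '2': w3}]; if to_ret itself has
+                # already branched, only cmatch[0] is merged in, with a warning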
+                if len(to_ret) > 1:
+                    logging.warning("Strange multiple match: {}".format(
+                        str([w.id for w in cmatch[0].values()])))
+
+                    for tr in to_ret:
+                        tr.update(cmatch[0])
+                    continue
+
+                # yeah, so we have found more than one match, =>
+                # more than one element in to_ret
+                to_ret = [{**dict(to_ret[0]), **m} for m in cmatch]
+
+            else:
+                for tr in to_ret:
+                    tr.update(cmatch[0])
+
+        return to_ret
+
+    def _match_self(self, word):
+        # return on the first matching restriction (alternatives come from restriction_or)
+        for restr in self.restrictions:
+            if restr.match(word):
+                return {self.idx: word}
+
+    def _match_next(self, word):
+        # matches for every component in links from this component
+        to_ret = []
+
+        # need to get all links that match
+        for next, link, order in self.next_element:
+            next_links = word.get_links(link)
+            to_ret.append([])
+
+            # a Required component starts out unsatisfied; any hit below sets it good
+            good = next.status != ComponentStatus.Required
+            for next_word in next_links:
+                if not order.match(word, next_word):
+                    continue
+
+                match = next.match(next_word)
+
+                if match is not None:
+                    # special treatment for forbidden
+                    if next.status == ComponentStatus.Forbidden:
+                        good = False
+                        break
+
+                    else:
+                        assert type(match) is list
+                        to_ret[-1].extend(match)
+                        good = True
+
+            # if none matched, nothing found!
+            if not good:
+                return None
+
+        return to_ret
\ No newline at end of file
diff --git a/src/formatter.py b/src/formatter.py
new file mode 100644
index 0000000..c526229
--- /dev/null
+++ b/src/formatter.py
@@ -0,0 +1,190 @@
+from math import log2
+import re
+
+from component import ComponentType
+
+
+class Formatter:
+    def __init__(self, colocation_ids, word_renderer):
+        self.colocation_ids = colocation_ids
+        self.word_renderer = word_renderer
+        self.additional_init()
+
+    def header_repeat(self):
+        raise NotImplementedError("Header repeat formatter not implemented")
+    def header_right(self):
+        raise NotImplementedError("Header right formatter not implemented")
+    def content_repeat(self, words, representations, idx, sidx):
+        raise NotImplementedError("Content repeat formatter not implemented")
+    def content_right(self, freq):
+        raise NotImplementedError("Content right formatter not implemented")
+    def group(self):
+        raise NotImplementedError("Group for formatter not implemented")
+
+    def additional_init(self):
+        pass
+    def length(self):
+        return len(self.header_repeat())
+
+    def set_structure(self, structure):
+        pass
+    def new_match(self, match):
+        pass
+
+
+class OutNoStatFormatter(Formatter):
+    def additional_init(self):
+        self.representation = ""
+
+    def header_repeat(self):
+        return ["Lemma", "Representative_form", "RF_scenario"]
+
+    def header_right(self):
+        return ["Joint_representative_form", "Frequency"]
+
+    def content_repeat(self, words, representations, idx, _sidx):
+        word = words[idx]
+        if idx not in representations:
+            return [word.lemma, "", ""]
+
+        rep = representations[idx]
+        if rep is None:
+            self.representation += " " + word.lemma
+            return [word.lemma, word.lemma, "lemma_fallback"]
+        else:
+            self.representation += " " + rep
+            return [word.lemma, rep, "ok"]
+
+    def content_right(self, freq):
+        rep = re.sub(' +', ' ', self.representation.strip())
+        result = [rep, str(freq)]
+        self.representation = ""
+        return result
+
+    def group(self):
+        return True
+
+class AllFormatter(Formatter):
+    def header_repeat(self):
+        return ["Token_ID", "Word_form", "Lemma", "Msd"]
+
+    def header_right(self):
+        return []
+
+    def content_repeat(self, words, _representations, idx, _sidx):
+        word = words[idx]
+        return [word.id, word.text, word.lemma, word.msd]
+
+    def content_right(self, _freq):
+        return []
+
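+    # descriptive note: AllFormatter does not group, so the writer keeps one
+    # row per concrete occurrence of a match instead of collapsing them into a
+    # single row per colocation id (see the group() check in Writer.write_out_worker)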
+ def group(self): + return False + +class StatsFormatter(Formatter): + def additional_init(self): + self.stats = None + self.jppb = None + self.corew = None + + @staticmethod + def stat_str(num): + return "{:.5f}".format(num) if isinstance(num, float) else str(num) + + def set_structure(self, structure): + jppb = [] + corew = [] + + for component in structure.components: + if component.type == ComponentType.Core2w: + jppb.append(component.idx) + if component.type != ComponentType.Other: + corew.append(component.idx) + + assert(len(jppb) == 2) + self.jppb = tuple(jppb) + self.corew = tuple(corew) + + def new_match(self, match): + self.stats = {"freq": {}} + + for cid in self.corew: + if cid not in match.matches[0]: + freq = 0 + else: + word = match.matches[0][cid] + freq = self.word_renderer.num_words[(word.lemma, word.msd[0])] + + self.stats["freq"][cid] = freq + + fx = self.stats["freq"][self.jppb[0]] + fy = self.stats["freq"][self.jppb[1]] + freq = len(match) + N = self.word_renderer.num_all_words() + + self.stats['d12'] = freq / fx - (fy - freq) / (N - fx) + self.stats['d21'] = freq / fy - (fx - freq) / (N - fy) + + self.stats['df'] = match.distinct_forms() + self.stats['freq_all'] = freq + + def header_repeat(self): + return ["Distribution"] + + def header_right(self): + return ["Delta_p12", "Delta_p21", "LogDice_core", "LogDice_all", "Distinct_forms"] + + def content_repeat(self, words, representations, idx, sidx): + # not a core word + if idx not in self.corew: + return [""] * self.length() + + word = words[idx] + key = (sidx, idx, word.lemma) + distribution = self.colocation_ids.dispersions[key] + return [self.stat_str(distribution)] + + def content_right(self, freq): + fx = self.stats["freq"][self.jppb[0]] + fy = self.stats["freq"][self.jppb[1]] + freq = self.stats['freq_all'] + logdice_core = 14 + log2(2 * freq / (fx + fy)) + + fi = [self.stats["freq"][idx] for idx in self.corew] + fi = [f for f in fi if f > 0] + logdice_all = 14 + log2(len(fi) * freq / sum(fi)) + + return [self.stat_str(x) for x in ( + self.stats["d12"], self.stats["d21"], logdice_core, logdice_all, self.stats['df'] + )] + + def group(self): + return True + +class OutFormatter(Formatter): + def additional_init(self): + self.f1 = OutNoStatFormatter(self.colocation_ids, self.word_renderer) + self.f2 = StatsFormatter(self.colocation_ids, self.word_renderer) + + def header_repeat(self): + return self.f1.header_repeat() + self.f2.header_repeat() + + def header_right(self): + return self.f1.header_right() + self.f2.header_right() + + def content_repeat(self, words, representations, idx, sidx): + cr1 = self.f1.content_repeat(words, representations, idx, sidx) + cr2 = self.f2.content_repeat(words, representations, idx, sidx) + return cr1 + cr2 + + def content_right(self, freq): + return self.f1.content_right(freq) + self.f2.content_right(freq) + + def group(self): + return self.f1.group() and self.f2.group() + + def set_structure(self, structure): + self.f2.set_structure(structure) + + def new_match(self, match): + self.f2.new_match(match) \ No newline at end of file diff --git a/src/lemma_features.py b/src/lemma_features.py new file mode 100644 index 0000000..5650fa4 --- /dev/null +++ b/src/lemma_features.py @@ -0,0 +1,26 @@ +from restriction import MorphologyRegex + + +def get_lemma_features(et): + lf = et.find('lemma_features') + if lf is None: + return {} + + result = {} + for pos in lf.iter('POS'): + rgx_list = MorphologyRegex(pos).rgx + rgx_str = "" + for position in rgx_list: + if position == ".": + rgx_str += " 
" + elif len(position) == 1: + rgx_str += position + elif len(position) == 3 and position[0] == "[" and position[2] == "]": + rgx_str += position[1] + else: + raise RuntimeError("Strange rgx for lemma_feature...") + + assert rgx_str[0].isupper() + result[rgx_str[0]] = rgx_str.strip().replace(' ', '-') + + return result \ No newline at end of file diff --git a/src/match.py b/src/match.py new file mode 100644 index 0000000..b2b11e9 --- /dev/null +++ b/src/match.py @@ -0,0 +1,20 @@ +class StructureMatch: + def __init__(self, match_id, structure): + self.match_id = match_id + self.structure = structure + + self.matches = [] + self.representations = {} + + def distinct_forms(self): + dm = set() + keys = list(self.matches[0].keys()) + for words in self.matches: + dm.add(" ".join(words[k].text for k in keys)) + return len(dm) + + def append(self, match): + self.matches.append(match) + + def __len__(self): + return len(self.matches) \ No newline at end of file diff --git a/src/match_store.py b/src/match_store.py new file mode 100644 index 0000000..35c5e6a --- /dev/null +++ b/src/match_store.py @@ -0,0 +1,46 @@ +from collections import defaultdict + +from match import StructureMatch +from representation_assigner import RepresentationAssigner + +try: + from tqdm import tqdm +except ImportError: + tqdm = lambda x: x + +class MatchStore: + def __init__(self, args): + self.data = {} + self.min_frequency = args.min_freq + self.dispersions = {} + + def _add_match(self, key, structure, match): + if key not in self.data: + self.data[key] = StructureMatch(str(len(self.data) + 1), structure) + self.data[key].append(match) + + def get(self, key, n): + return self.data[key][n] + + def add_matches(self, matches): + for structure, nms in matches.items(): + for nm in nms: + self._add_match(nm[1], structure, nm[0]) + + def get_matches_for(self, structure): + for _cid_tup, sm in self.data.items(): + if sm.structure != structure: + continue + + yield sm + + def set_representations(self, word_renderer): + for _1, sm in tqdm(self.data.items()): + RepresentationAssigner.set_representations(sm, word_renderer) + + def determine_colocation_dispersions(self): + dispersions = defaultdict(int) + for (structure_id, *word_tups) in self.data.keys(): + for component_id, lemma in word_tups: + dispersions[(structure_id, component_id, lemma)] += 1 + self.dispersions = dict(dispersions) diff --git a/msd_translate.py b/src/msd_translate.py similarity index 100% rename from msd_translate.py rename to src/msd_translate.py diff --git a/src/order.py b/src/order.py new file mode 100644 index 0000000..03f55f4 --- /dev/null +++ b/src/order.py @@ -0,0 +1,34 @@ +from enum import Enum + + +class Order(Enum): + FromTo = 0 + ToFrom = 1 + Any = 2 + + @staticmethod + def new(order): + if order is None: + return Order.Any + elif order == "to-from": + return Order.ToFrom + elif order == "from-to": + return Order.FromTo + else: + raise NotImplementedError("What kind of ordering is: {}".format(order)) + + + def match(self, from_w, to_w): + if self is Order.Any: + return True + + fi = from_w.int_id + ti = to_w.int_id + + if self is Order.FromTo: + return fi < ti + elif self is Order.ToFrom: + return ti < fi + else: + raise NotImplementedError("Should not be here: Order match") + diff --git a/src/representation.py b/src/representation.py new file mode 100644 index 0000000..78a6388 --- /dev/null +++ b/src/representation.py @@ -0,0 +1,167 @@ +import logging + +from collections import Counter +from codes_tagset import TAGSET, CODES +from word import 
WordMsdOnly
+
+class ComponentRepresentation:
+    def __init__(self, data, word_renderer):
+        self.data = data
+        self.word_renderer = word_renderer
+
+        self.words = []
+        self.rendition_text = None
+        self.agreement = []
+
+    def get_agreement(self):
+        return []
+
+    def add_word(self, word):
+        self.words.append(word)
+
+    def render(self):
+        if self.rendition_text is None:
+            self.rendition_text = self._render()
+
+    def _render(self):
+        raise NotImplementedError("Not implemented for class: {}".format(type(self)))
+
+class LemmaCR(ComponentRepresentation):
+    def _render(self):
+        return self.words[0].lemma if len(self.words) > 0 else None
+
+class LexisCR(ComponentRepresentation):
+    def _render(self):
+        return self.data['lexis']
+
+class WordFormAllCR(ComponentRepresentation):
+    def _render(self):
+        if len(self.words) == 0:
+            return None
+        else:
+            forms = [w.text.lower() for w in self.words]
+            return "/".join(set(forms))
+
+class WordFormAnyCR(ComponentRepresentation):
+    def _render(self):
+        text_forms = {}
+        msd_lemma_txt_triplets = Counter([(w.msd, w.lemma, w.text) for w in self.words])
+        for (msd, lemma, text), _n in reversed(msd_lemma_txt_triplets.most_common()):
+            text_forms[(msd, lemma)] = text
+
+        words_counter = []
+        for word in self.words:
+            words_counter.append((word.msd, word.lemma))
+        sorted_words = sorted(set(words_counter), key=lambda x: -words_counter.count(x))
+
+        for word_msd, word_lemma in sorted_words:
+            for agr in self.agreement:
+                if not agr.match(word_msd):
+                    break
+            else:
+                for agr in self.agreement:
+                    agr.confirm_match()
+
+                if word_lemma is None:
+                    return None
+                else:
+                    return text_forms[(word_msd, word_lemma)]
+
+class WordFormMsdCR(WordFormAnyCR):
+    def __init__(self, *args):
+        super().__init__(*args)
+        self.lemma = None
+        self.msd = None
+
+    def check_msd(self, word_msd):
+        if 'msd' not in self.data:
+            return True
+        selectors = self.data['msd']
+
+        for key, value in selectors.items():
+            t = word_msd[0]
+            v = TAGSET[t].index(key.lower())
+            f1 = word_msd[v + 1]
+            f2 = CODES[value]
+
+            if '-' not in [f1, f2] and f1 != f2:
+                return False
+
+        return True
+
+    def add_word(self, word):
+        if self.lemma is None:
+            self.lemma = word.lemma
+            self.msd = word.msd
+
+        if self.check_msd(word.msd):
+            super().add_word(word)
+
+    def _render(self):
+        msd = self.word_renderer.get_lemma_msd(self.lemma, self.msd)
+        self.words.append(WordMsdOnly(msd))
+
+        return super()._render()
+
+class WordFormAgreementCR(WordFormMsdCR):
+    def __init__(self, data, word_renderer):
+        super().__init__(data, word_renderer)
+        self.rendition_candidate = None
+
+    def get_agreement(self):
+        return self.data['other']
+
+    def match(self, word_msd):
+        existing = [(w.msd, w.text) for w in self.words]
+
+        lemma_available_words = self.word_renderer.available_words(self.lemma, existing)
+        for candidate_msd, candidate_text in lemma_available_words:
+            if self.msd[0] != candidate_msd[0]:
+                continue
+
+            if WordFormAgreementCR.check_agreement(word_msd, candidate_msd, self.data['agreement']):
+                if self.check_msd(candidate_msd):
+                    self.rendition_candidate = candidate_text
+                    return True
+
+        return False
+
+    def confirm_match(self):
+        self.rendition_text = self.rendition_candidate
+
+    @staticmethod
+    def check_agreement(msd1, msd2, agreements):
+        for agr_case in agreements:
+            t1 = msd1[0]
+            # the agreement category is missing from this msd (some strange msd was tried); skipping...
+            if agr_case not in TAGSET[t1]:
+                logging.warning("Cannot do agreement: {} for msd {} not found!"
+                                .format(agr_case, msd1))
+                return False
+
+            v1 = TAGSET[t1].index(agr_case)
+            # if no value is specified (e.g. nedolocnik, the infinitive), it always agrees
+            if v1 + 1 >= len(msd1):
+                continue
+            # positions are shifted by one: the first msd character is the uppercase POS, which TAGSET does not list
+            m1 = msd1[v1 + 1]
+
+            # repeat the same lookup for the second msd (not DRY!)
+            t2 = msd2[0]
+            if agr_case not in TAGSET[t2]:
+                logging.warning("Cannot do agreement: {} for msd {} not found!"
+                                .format(agr_case, msd2))
+                return False
+            v2 = TAGSET[t2].index(agr_case)
+            if v2 + 1 >= len(msd2):
+                continue
+            m2 = msd2[v2 + 1]
+
+            # match!
+            if '-' not in [m1, m2] and m1 != m2:
+                return False
+
+        return True
+
+    def render(self):
+        pass
diff --git a/src/representation_assigner.py b/src/representation_assigner.py
new file mode 100644
index 0000000..ac4708d
--- /dev/null
+++ b/src/representation_assigner.py
@@ -0,0 +1,82 @@
+from representation import ComponentRepresentation, LemmaCR, LexisCR, WordFormAgreementCR, WordFormAnyCR, WordFormMsdCR, WordFormAllCR
+
+class RepresentationAssigner:
+    def __init__(self):
+        self.more = {}
+        self.representation_factory = ComponentRepresentation
+
+    def add_feature(self, feature):
+        if 'rendition' in feature:
+            if feature['rendition'] == "lemma":
+                self.representation_factory = LemmaCR
+            elif feature['rendition'] == "word_form":
+                # just by default, changes with selection
+                self.representation_factory = WordFormAnyCR
+            elif feature['rendition'] == "lexis":
+                self.representation_factory = LexisCR
+                self.more['lexis'] = feature['string']
+            else:
+                raise NotImplementedError("Representation rendition: {}".format(feature))
+
+        elif 'selection' in feature:
+            if feature['selection'] == "msd":
+                # could already be agreement
+                if self.representation_factory != WordFormAgreementCR:
+                    self.representation_factory = WordFormMsdCR
+                self.more['msd'] = {k: v for k, v in feature.items() if k != 'selection'}
+            elif feature['selection'] == "all":
+                self.representation_factory = WordFormAllCR
+            elif feature['selection'] == 'agreement':
+                assert feature['head'][:4] == 'cid_'
+                assert feature['msd'] is not None
+                self.representation_factory = WordFormAgreementCR
+                self.more['agreement'] = feature['msd'].split('+')
+                self.more['other'] = feature['head'][4:]
+            else:
+                raise NotImplementedError("Representation selection: {}".format(feature))
+
+    def cr_instance(self, word_renderer):
+        return self.representation_factory(self.more, word_renderer)
+
+    @staticmethod
+    def set_representations(match, word_renderer):
+        representations = {}
+        for c in match.structure.components:
+            representations[c.idx] = []
+            for rep in c.representation:
+                representations[c.idx].append(rep.cr_instance(word_renderer))
+
+        for cid, reps in representations.items():
+            for rep in reps:
+                for agr in rep.get_agreement():
+                    if len(representations[agr]) != 1:
+                        n = len(representations[agr])
+                        raise NotImplementedError(
+                            "Structure {}: ".format(match.structure.id) +
+                            "component {} has agreement".format(cid) +
+                            " with component {}".format(agr) +
+                            ", however there are {} (!= 1) representations".format(n) +
+                            " of component {}!".format(agr))
+
+                    representations[agr][0].agreement.append(rep)
+
+        for words in match.matches:
+            # first pass, check everything but agreements
+            for w_id, w in words.items():
+                component = match.structure.get_component(w_id)
+                component_representations = representations[component.idx]
+                for representation in component_representations:
+                    representation.add_word(w)
+
+        for cid, reps in representations.items():
+            for rep in reps:
+                rep.render()
+
+        for cid, reps in representations.items():
+            reps = [rep.rendition_text for rep
in reps]
+            if reps == []:
+                pass
+            elif all(r is None for r in reps):
+                match.representations[cid] = None
+            else:
+                match.representations[cid] = " ".join(("" if r is None else r) for r in reps)
diff --git a/src/restriction.py b/src/restriction.py
new file mode 100644
index 0000000..f521f2f
--- /dev/null
+++ b/src/restriction.py
@@ -0,0 +1,133 @@
+import re
+from enum import Enum
+
+from codes_tagset import CODES, TAGSET
+
+
+class RestrictionType(Enum):
+    Morphology = 0
+    Lexis = 1
+    MatchAll = 2
+
+
+# ppb ("polnopomenska beseda", content word): 0 marks the strongest content
+# words (adjectives, nouns, adverbs), 4 a non-content word; lower values win
+# when the core components of a structure are determined
+def determine_ppb(rgx):
+    if rgx[0] in ("A", "N", "R"):
+        return 0
+    elif rgx[0] == "V":
+        if len(rgx) == 1:
+            return 2
+        elif 'a' in rgx[1]:
+            return 3
+        elif 'm' in rgx[1]:
+            return 1
+        else:
+            return 2
+    else:
+        return 4
+
+class MorphologyRegex:
+    def __init__(self, restriction):
+        self.min_msd_length = 1
+
+        restr_dict = {}
+        for feature in restriction:
+            feature_dict = dict(feature.items())
+
+            match_type = True
+            if "filter" in feature_dict:
+                assert feature_dict['filter'] == "negative"
+                match_type = False
+                del feature_dict['filter']
+
+            assert len(feature_dict) == 1
+            key, value = next(iter(feature_dict.items()))
+            restr_dict[key] = (value, match_type)
+
+        assert 'POS' in restr_dict
+        category = restr_dict['POS'][0].capitalize()
+        cat_code = CODES[category]
+        rgx = [cat_code] + ['.'] * 10
+
+        del restr_dict['POS']
+
+        for attribute, (value, typ) in restr_dict.items():
+            index = TAGSET[cat_code].index(attribute.lower())
+            assert index >= 0
+
+            if '|' in value:
+                match = "".join(CODES[val] for val in value.split('|'))
+            else:
+                match = CODES[value]
+
+            match = "[{}{}]".format("" if typ else "^", match)
+            rgx[index + 1] = match
+
+            if typ:
+                self.min_msd_length = max(index + 1, self.min_msd_length)
+
+        # strip trailing wildcard positions from the rgx
+        for i in reversed(range(len(rgx))):
+            if rgx[i] == '.':
+                rgx = rgx[:-1]
+            else:
+                break
+
+        self.re_objects = [re.compile(r) for r in rgx]
+        self.rgx = rgx
+
+    def __call__(self, text):
+        if len(text) <= self.min_msd_length:
+            return False
+
+        for c, r in zip(text, self.re_objects):
+            if not r.match(c):
+                return False
+        return True
+
+
+class LexisRegex:
+    def __init__(self, restriction):
+        restr_dict = {}
+        for feature in restriction:
+            restr_dict.update(feature.items())
+
+        assert "lemma" in restr_dict
+        self.match_list = restr_dict['lemma'].split('|')
+
+    def __call__(self, text):
+        return text in self.match_list
+
+class Restriction:
+    def __init__(self, restriction_tag):
+        self.ppb = 4  # "polnopomenska beseda" / content-word strength (0-4)
+
+        if restriction_tag is None:
+            self.type = RestrictionType.MatchAll
+            self.matcher = None
+            self.present = None
+            return
+
+        restriction_type = restriction_tag.get('type')
+        if restriction_type == "morphology":
+            self.type = RestrictionType.Morphology
+            self.matcher = MorphologyRegex(list(restriction_tag))
+            self.ppb = determine_ppb(self.matcher.rgx)
+
+        elif restriction_type == "lexis":
+            self.type = RestrictionType.Lexis
+            self.matcher = LexisRegex(list(restriction_tag))
+        else:
+            raise NotImplementedError()
+
+    def match(self, word):
+        if self.type == RestrictionType.Morphology:
+            match_to = word.msd
+        elif self.type == RestrictionType.Lexis:
+            match_to = word.lemma
+        elif self.type == RestrictionType.MatchAll:
+            return True
+        else:
+            raise RuntimeError("Unreachable!")
+
+        return self.matcher(match_to)
+
diff --git a/src/syntactic_structure.py b/src/syntactic_structure.py
new file mode 100644
index 0000000..31aa60a
--- /dev/null
+++ b/src/syntactic_structure.py
@@ -0,0 +1,110 @@
+from xml.etree import ElementTree
+import logging
+
+from component
import Component, ComponentType +from lemma_features import get_lemma_features + +class SyntacticStructure: + def __init__(self): + self.id = None + self.lbs = None + self.components = [] + + @staticmethod + def from_xml(xml): + st = SyntacticStructure() + st.id = xml.get('id') + st.lbs = xml.get('LBS') + + assert len(list(xml)) == 1 + system = next(iter(xml)) + + assert system.get('type') == 'JOS' + components, dependencies, definitions = list(system) + + deps = [(dep.get('from'), dep.get('to'), dep.get('label'), dep.get('order')) + for dep in dependencies] + comps = {comp.get('cid'): dict(comp.items()) for comp in components} + + restrs, forms = {}, {} + + for comp in definitions: + n = comp.get('cid') + restrs[n] = None + forms[n] = [] + + for el in comp: + if el.tag.startswith("restriction"): + assert restrs[n] is None + restrs[n] = el + elif el.tag.startswith("representation"): + st.add_representation(n, el, forms) + else: + raise NotImplementedError("Unknown definition: {} in structure {}" + .format(el.tag, st.id)) + + fake_root_component = Component({'cid': '#', 'type': 'other'}) + st.components = fake_root_component.find_next(deps, comps, restrs, forms) + + st.determine_core2w() + return st + + def determine_core2w(self): + ppb_components = [] + for c in self.components: + if c.type != ComponentType.Core: + continue + + ppb = 4 + for r in c.restrictions: + ppb = min(r.ppb, ppb) + + ppb_components.append((c, ppb)) + + ppb_components = sorted(ppb_components, key=lambda c: c[1]) + if len(ppb_components) > 2 and ppb_components[1][1] == ppb_components[2][1]: + raise RuntimeError("Cannot determine 2 'jedrna polnopomenska beseda' for", self.id) + + for c, _ in ppb_components[:2]: + c.type = ComponentType.Core2w + + def add_representation(self, n, rep_el, forms): + assert rep_el.tag == "representation" + to_add = [] + for el in rep_el: + assert el.tag == "feature" + if 'rendition' in el.attrib or 'selection' in el.attrib: + to_add.append(el) + else: + logging.warning("Strange representation feature in structure {}. 
Skipping"
+                                .format(self.id))
+                continue
+        forms[n].append(to_add)
+
+    def get_component(self, idx):
+        for c in self.components:
+            if c.idx == idx:
+                return c
+        raise RuntimeError("Unknown component id: {}".format(idx))
+
+    def match(self, word):
+        matches = self.components[0].match(word)
+        return [] if matches is None else matches
+
+
+def build_structures(filename):
+    max_num_components = -1
+    with open(filename, 'r') as fp:
+        et = ElementTree.XML(fp.read())
+
+    structures = []
+    for structure in et.iter('syntactic_structure'):
+        to_append = SyntacticStructure.from_xml(structure)
+        if to_append is None:
+            continue
+
+        structures.append(to_append)
+        max_num_components = max(max_num_components, len(to_append.components))
+
+    lemma_features = get_lemma_features(et)
+    return structures, lemma_features, max_num_components
diff --git a/src/wani.py b/src/wani.py
new file mode 100644
index 0000000..3fb3c37
--- /dev/null
+++ b/src/wani.py
@@ -0,0 +1,217 @@
+from xml.etree import ElementTree
+import re
+import sys
+import logging
+import argparse
+import pickle
+import time
+import subprocess
+import concurrent.futures
+import tempfile
+
+try:
+    from tqdm import tqdm
+except ImportError:
+    tqdm = lambda x: x
+
+from word import Word
+from syntactic_structure import build_structures
+from match_store import MatchStore
+from word_stats import WordStats
+from writer import Writer
+
+
+def is_root_id(id_):
+    return len(id_.split('.')) == 3
+
+
+def load_files(args):
+    filenames = args.input
+    skip_id_check = args.skip_id_check
+    do_msd_translate = not args.no_msd_translate
+
+    for n, fname in enumerate(filenames):
+        if args.count_files:
+            status = " :: {} / {}".format(n, len(filenames))
+        else:
+            status = ""
+        yield load_tei_file(fname, skip_id_check, do_msd_translate, args.pc_tag, status)
+
+
+def load_tei_file(filename, skip_id_check, do_msd_translate, pc_tag, status):
+    logging.info("LOADING FILE: {}{}".format(filename, status))
+
+    with open(filename, 'r') as fp:
+        xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1)
+        xmlstring = xmlstring.replace(' xml:', ' ')
+        et = ElementTree.XML(xmlstring)
+
+    words = {}
+    for w in et.iter("w"):
+        words[w.get('id')] = Word(w, do_msd_translate)
+    for pc in et.iter(pc_tag):
+        words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)
+
+    for l in et.iter("link"):
+        if 'dep' in l.keys():
+            ana = l.get('afun')
+            lfrom = l.get('from')
+            dest = l.get('dep')
+        else:
+            ana = l.get('ana')
+            if ana[:4] != 'syn:':  # don't bother with non-syntactic links...
+                continue
+            ana = ana[4:]
+            lfrom, dest = l.get('target').replace('#', '').split()
+
+        if lfrom in words:
+            if not skip_id_check and is_root_id(lfrom):
+                logging.error("NOO: {}".format(lfrom))
+                sys.exit(1)
+
+            if dest in words:
+                next_word = words[dest]
+                words[lfrom].add_link(ana, next_word)
+            else:
+                logging.error("Unknown id: {}".format(dest))
+                sys.exit(1)
+
+        else:
+            # strange errors, just skip...
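+            # (the link's source id is not among the <w>/<pc> ids collected
+            # from this file, so the link is silently ignored)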
+            pass
+
+    return list(words.values())
+
+def match_file(words, structures):
+    matches = {s: [] for s in structures}
+
+    for s in tqdm(structures):
+        for w in words:
+            mhere = s.match(w)
+            for match in mhere:
+                colocation_id = [(idx, w.lemma) for idx, w in match.items()]
+                colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x: x[0]))
+                colocation_id = tuple(colocation_id)
+
+                matches[s].append((match, colocation_id))
+
+    return matches
+
+
+def main(structures_file, args):
+    structures, lemma_msds, max_num_components = build_structures(structures_file)
+
+    match_store = MatchStore(args)
+    word_stats = WordStats(lemma_msds)
+
+    if args.parallel:
+        num_parallel = int(args.parallel)
+
+        # make temporary directory to hold temporary files
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            cmd = sys.argv
+            for inpt in args.input:
+                if inpt in cmd:
+                    cmd.remove(inpt)
+
+            # remove "--parallel X"
+            pidx = cmd.index('--parallel')
+            del cmd[pidx]
+            del cmd[pidx]
+
+            def func(n):
+                cmdn = [sys.executable] + cmd + [args.input[n],
+                                                 "--match-to-file", "{}/{}.p".format(tmpdirname, n)]
+                subprocess.check_call(cmdn)
+                return n
+
+            # use ThreadPoolExecutor to run the subprocesses in parallel via Python threads
+            with concurrent.futures.ThreadPoolExecutor(max_workers=num_parallel) as executor:
+                # fancy interface to wait for threads to finish
+                for id_input in executor.map(func, [i for i, _ in enumerate(args.input)]):
+                    with open("{}/{}.p".format(tmpdirname, id_input), "rb") as fp:
+                        words, matches = pickle.load(fp)
+
+                    match_store.add_matches(matches)
+                    word_stats.add_words(words)
+
+    else:
+        for words in load_files(args):
+            matches = match_file(words, structures)
+            # just save to a temporary file; used by children of a parallel run
+            # (such a child run MUST NOT receive more than one input file)
+            if args.match_to_file is not None:
+                with open(args.match_to_file, "wb") as fp:
+                    pickle.dump((words, matches), fp)
+                return
+            else:
+                match_store.add_matches(matches)
+                word_stats.add_words(words)
+
+    # get word renders for lemma/msd
+    word_stats.generate_renders()
+    match_store.determine_colocation_dispersions()
+
+    # figure out representations!
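+    # representations are rendered only when a writer that prints them was
+    # requested (--out / --out-no-stat); the --all and --stats outputs do not use them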
+    if args.out or args.out_no_stat:
+        match_store.set_representations(word_stats)
+
+    Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
+        structures, match_store)
+    Writer.make_output_no_stat_writer(args, max_num_components, match_store, word_stats).write_out(
+        structures, match_store)
+    Writer.make_all_writer(args, max_num_components, match_store, word_stats).write_out(
+        structures, match_store)
+    Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out(
+        structures, match_store)
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='Extract structures from a parsed corpus.')
+    parser.add_argument('structures',
+                        help='Structures definitions in xml file')
+    parser.add_argument('input',
+                        help='Input XML file in ssj500k form; more than one can be listed', nargs='+')
+    parser.add_argument('--out',
+                        help='Classic output file')
+    parser.add_argument('--out-no-stat',
+                        help='Output file, but without statistical columns')
+    parser.add_argument('--all',
+                        help='Additional output file, writes more data')
+    parser.add_argument('--stats',
+                        help='Output file for statistics')
+
+    parser.add_argument('--no-msd-translate',
+                        help='MSDs are translated from Slovene to English by default',
+                        action='store_true')
+    parser.add_argument('--skip-id-check',
+                        help='Skips checks that ids of <w> and <pc> elements are in the correct format',
+                        action='store_true')
+    parser.add_argument('--min_freq', help='Minimal frequency in output',
+                        type=int, default=0, const=1, nargs='?')
+    parser.add_argument('--verbose', help='Enable verbose output to stderr',
+                        choices=["warning", "info", "debug"], default="info",
+                        const="info", nargs='?')
+    parser.add_argument('--count-files',
+                        help="Count files: more verbose output", action='store_true')
+    parser.add_argument('--multiple-output',
+                        help='Generate one output for each syntactic structure',
+                        action='store_true')
+
+    parser.add_argument('--sort-by',
+                        help="Sort by this column (index)", type=int, default=-1)
+    parser.add_argument('--sort-reversed',
+                        help="Sort in reversed order", action='store_true')
+
+    parser.add_argument('--pc-tag',
+                        help='Tag for separators, usually pc or c', default="pc")
+    parser.add_argument('--parallel',
+                        help='Run in multiple processes, should speed things up')
+    parser.add_argument('--match-to-file', help='Do not use!')
+
+    args = parser.parse_args()
+    logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())
+
+    start = time.time()
+    main(args.structures, args)
+    logging.info("TIME: {}".format(time.time() - start))
diff --git a/src/word.py b/src/word.py
new file mode 100644
index 0000000..569d57c
--- /dev/null
+++ b/src/word.py
@@ -0,0 +1,61 @@
+from collections import defaultdict
+import logging
+
+from msd_translate import MSD_TRANSLATE
+
+
+class WordMsdOnly:
+    def __init__(self, msd):
+        self.msd = msd
+        self.lemma = None
+        self.text = None
+
+    def most_frequent_text(self, _):
+        return None
+
+
+class Word:
+    def __init__(self, xml, do_msd_translate):
+        self.lemma = xml.get('lemma')
+        self.msd = Word.get_msd(xml)
+        self.msd = MSD_TRANSLATE[self.msd] if do_msd_translate else self.msd
+        self.id = xml.get('id')
+        self.text = xml.text
+        self.links = defaultdict(list)
+
+        last_num = self.id.split('.')[-1]
+        if last_num[0] not in '0123456789':
+            last_num = last_num[1:]
+        self.int_id = int(last_num)
+
+        assert None not in (self.id, self.lemma, self.msd)
+
+    @staticmethod
+    def get_msd(comp):
+        d = dict(comp.items())
+        if 'msd' in d:
+            return d['msd']
+        # 'ana' carries the msd behind a 4-character prefix (e.g. "mte:Ncmsn")
+        elif 'ana' in d:
+            return d['ana'][4:]
+        else:
+            logging.error(d)
+            raise NotImplementedError("MSD?")
+
+    @staticmethod
+    def pc_word(pc, do_msd_translate):
+        pc.set('lemma', pc.text)
+        pc.set('msd', "N" if do_msd_translate else "U")
+        return Word(pc, do_msd_translate)
+
+    def add_link(self, link, to):
+        self.links[link].append(to)
+
+    def get_links(self, link):
+        if link not in self.links and "|" in link:
+            for l in link.split('|'):
+                self.links[link].extend(self.links[l])
+
+        return self.links[link]
+
+    def most_frequent_text(self, word_renderer):
+        return word_renderer.render(self.lemma, self.msd)
\ No newline at end of file
diff --git a/src/word_stats.py b/src/word_stats.py
new file mode 100644
index 0000000..ac02b3c
--- /dev/null
+++ b/src/word_stats.py
@@ -0,0 +1,100 @@
+from collections import defaultdict, Counter
+
+class WordStats:
+    def __init__(self, lemma_features):
+        self.all_words = []
+        self.rendered_words = {}
+        self.frequent_words = {}
+        self.num_words = {}
+        self.lemma_msd = {}
+        self.lemma_features = lemma_features
+        self.memoized_msd_merges = {}
+
+    def add_words(self, words):
+        self.all_words.extend(words)
+
+    def num_all_words(self):
+        return len(self.all_words)
+
+    def generate_renders(self):
+        num_words = defaultdict(int)
+        data = defaultdict(lambda: defaultdict(list))
+        for w in self.all_words:
+            data[w.lemma][w.msd].append(w.text)
+
+        for lemma, ld in data.items():
+            self.rendered_words[lemma] = {}
+            freq_words = defaultdict(int)
+            common_msd = "*" * 10
+
+            for msd, texts in ld.items():
+                # TODO: this should be out of generate_renders...
+                num_words[(lemma, msd[0])] += len(texts)
+
+                rep = max(set(texts), key=texts.count)
+                self.rendered_words[lemma][msd] = (rep, len(texts))
+
+                for txt in texts:
+                    freq_words[(msd, txt)] += 1
+
+                common_msd = self.merge_msd(common_msd, msd)
+
+            self.lemma_msd[lemma] = common_msd
+
+            self.frequent_words[lemma] = []
+            for (msd, txt), n in sorted(freq_words.items(), key=lambda x: -x[1]):
+                self.frequent_words[lemma].append((msd, txt, n))
+
+        lf = self.lemma_features
+        for lemma in self.lemma_msd:
+            cmsd = self.lemma_msd[lemma]
+            if cmsd[0] in lf:
+                self.lemma_msd[lemma] = "".join(
+                    l1 if l1 != "-" else l2 for l1, l2 in zip(lf[cmsd[0]], cmsd)
+                )
+
+        self.num_words = dict(num_words)
+
+    def merge_msd(self, common_msd, new_msd):
+        key = (common_msd, new_msd)
+        if key in self.memoized_msd_merges:
+            return self.memoized_msd_merges[key]
+
+        def merge_letter(l1, l2):
+            if l1 == "*":
+                return l2
+            elif l1 != l2:
+                return "-"
+            else:
+                return l1
+
+        value = "".join(merge_letter(l1, l2) for l1, l2 in zip(common_msd, new_msd))
+        self.memoized_msd_merges[key] = value
+        return value
+
+    def render(self, lemma, msd):
+        if lemma in self.rendered_words:
+            if msd in self.rendered_words[lemma]:
+                return self.rendered_words[lemma][msd][0]
+
+    def available_words(self, lemma, existing_texts):
+        counted_texts = Counter(existing_texts)
+        for (msd, text), _n in counted_texts.most_common():
+            yield (msd, text)
+
+        if lemma in self.frequent_words:
+            for msd, text, _ in self.frequent_words[lemma]:
+                if (msd, text) not in counted_texts:
+                    yield (msd, text)
+
+    def get_lemma_msd(self, lemma, word_msd):
+        # the lemma is guaranteed to be here, since we collect every lemma
+        lemma_msd = self.lemma_msd[lemma]
+
+        if lemma_msd[0] == '-':
+            if word_msd[0] in self.lemma_features:
+                return self.lemma_features[word_msd[0]]
+            else:
+                return '-'
+        else:
+            return lemma_msd
\ No newline at end of file
diff --git a/src/writer.py b/src/writer.py
new file mode 100644
index 0000000..a4d5ec1
--- /dev/null
+++ b/src/writer.py
@@ -0,0 +1,137 @@
+import logging
+
+from formatter import OutFormatter, OutNoStatFormatter, AllFormatter, StatsFormatter
+
+class Writer:
+    @staticmethod
+    def other_params(args):
+        return (args.multiple_output, int(args.sort_by), args.sort_reversed)
+
+    @staticmethod
+    def make_output_writer(args, num_components, colocation_ids, word_renderer):
+        params = Writer.other_params(args)
+        return Writer(args.out, num_components, OutFormatter(colocation_ids, word_renderer), params)
+
+    @staticmethod
+    def make_output_no_stat_writer(args, num_components, colocation_ids, word_renderer):
+        params = Writer.other_params(args)
+        return Writer(args.out_no_stat, num_components, OutNoStatFormatter(colocation_ids, word_renderer), params)
+
+    @staticmethod
+    def make_all_writer(args, num_components, colocation_ids, word_renderer):
+        return Writer(args.all, num_components, AllFormatter(colocation_ids, word_renderer), None)
+
+    @staticmethod
+    def make_stats_writer(args, num_components, colocation_ids, word_renderer):
+        params = Writer.other_params(args)
+        return Writer(args.stats, num_components, StatsFormatter(colocation_ids, word_renderer), params)
+
+    def __init__(self, file_out, num_components, formatter, params):
+        if params is None:
+            self.multiple_output = False
+            self.sort_by = -1
+            self.sort_order = None
+        else:
+            self.multiple_output = params[0]
+            self.sort_by = params[1]
+            self.sort_order = params[2]
+
+        self.num_components = num_components
+        self.output_file = file_out
+        self.formatter = formatter
+
+    def header(self):
+        repeating_cols = self.formatter.header_repeat()
+        cols = ["C{}_{}".format(i + 1, thd) for i in range(self.num_components)
+                for thd in repeating_cols]
+
+        cols = ["Structure_ID"] + cols + ["Colocation_ID"]
+        cols += self.formatter.header_right()
+        return cols
+
+    def sorted_rows(self, rows):
+        if self.sort_by < 0 or len(rows) < 2:
+            return rows
+
+        if len(rows[0]) <= self.sort_by:
+            logging.warning("Cannot sort by column #{}: Not enough columns!".format(self.sort_by))
+            return rows
+
+        try:
+            int(rows[0][self.sort_by])
+            def key(row):
+                return int(row[self.sort_by])
+        except ValueError:
+            def key(row):
+                return row[self.sort_by].lower()
+
+        return sorted(rows, key=key, reverse=self.sort_order)
+
+    def write_header(self, file_handler):
+        file_handler.write(", ".join(self.header()) + "\n")
+
+    def write_out_worker(self, file_handler, structure, colocation_ids):
+        rows = []
+        components = structure.components
+
+        for match in colocation_ids.get_matches_for(structure):
+            self.formatter.new_match(match)
+
+            for words in match.matches:
+                to_write = []
+
+                for idx, _comp in enumerate(components):
+                    idx = str(idx + 1)
+                    if idx not in words:
+                        to_write.extend([""] * self.formatter.length())
+                    else:
+                        to_write.extend(self.formatter.content_repeat(words, match.representations, idx, structure.id))
+
+                # make them equal size
+                to_write.extend([""] * (self.num_components * self.formatter.length() - len(to_write)))
+
+                # structure_id and colocation_id
+                to_write = [structure.id] + to_write + [match.match_id]
+
+                # header_right
+                to_write.extend(self.formatter.content_right(len(match)))
+                rows.append(to_write)
+
+                if self.formatter.group():
+                    break
+
+        if rows != []:
+            rows = self.sorted_rows(rows)
+            file_handler.write("\n".join([", ".join(row) for row in rows]) + "\n")
+            file_handler.flush()
+
+    def write_out(self, structures, colocation_ids):
+        if self.output_file is None:
+            return
+
+        def fp_close(fp_):
+            fp_.close()
+
+        def fp_open(snum=None):
+            if snum is None:
+                return open(self.output_file, "w")
+            else:
+                return open("{}.{}".format(self.output_file, snum), "w")
+
+        if not self.multiple_output:
+            fp = fp_open()
+            self.write_header(fp)
+
+        for s in structures:
+            if self.multiple_output:
+                fp = fp_open(s.id)
+                self.write_header(fp)
+
+            self.formatter.set_structure(s)
+            self.write_out_worker(fp, s, colocation_ids)
+
+            if self.multiple_output:
+                fp_close(fp)
+
+        if not self.multiple_output:
+            fp_close(fp)
diff --git a/wani.py b/wani.py
deleted file mode 100644
index 50690cb..0000000
--- a/wani.py
+++ /dev/null
@@ -1,1544 +0,0 @@
-from xml.etree import ElementTree
-import re
-from enum import Enum
-from collections import defaultdict, namedtuple, Counter
-import sys
-import logging
-import argparse
-import pickle
-import time
-import subprocess
-import concurrent.futures
-import tempfile
-from math import log2
-
-from msd_translate import MSD_TRANSLATE
-
-try:
-    from tqdm import tqdm
-except ImportError:
-    tqdm = lambda x: x
-
-
-MAX_NUM_COMPONENTS = -1
-
-CODES = {
-    "Noun": "N",
-    "Verb": "V",
-    "Adjective": "A",
-    "Adverb": "R",
-    "Pronoun": "P",
-    "Numeral": "M",
-    "Preposition": "S",
-    "Conjunction": "C",
-    "Particle": "Q",
-    "Interjection": "I",
-    "Abbreviation": "Y",
-    "Residual": "X",
-
-    'common': 'c',
-    'proper': 'p',
-    'masculine': 'm',
-    'feminine': 'f',
-    'neuter': 'n',
-    "singular": "s",
-    "dual": "d",
-    "plural": "p",
-    "nominative": "n",
-    "genitive": "g",
-    "dative": "d",
-    "accusative": "a",
-    "locative": "l",
-    "instrumental": "i",
-    "no": "n",
-    "yes": "y",
-    "main": "m",
-    "auxiliary": "a",
-    "perfective": "e",
-    "progressive": "p",
-    "biaspectual": "b",
-    "infinitive": "n",
-    "supine": "u",
-    "participle": "p",
-    "present": "r",
-    "future": "f",
-    "conditional": "c",
-    "imperative": "m",
-    "first": "1",
-    "second": "2",
-    "third": "3",
-    "general": "g",
-    "possessive": "s",
-    "positive": "p",
-    "comparative": "c",
-    "superlative": "s",
-    "personal": "p",
-    "demonstrative": "d",
-    "relative": "r",
-    "reflexive": "x",
-    "interrogative": "q",
-    "indefinite": "i",
-    "negative": "z",
-    "bound": "b",
-    "digit": "d",
-    "roman": "r",
-    "letter": "l",
-    "cardinal": "c",
-    "ordinal": "o",
-    "pronominal": "p",
-    "special": "s",
-    "coordinating": "c",
-    "subordinating": "s",
-    "foreign": "f",
-    "typo": "t",
-    "program": "p",
-}
-
-TAGSET = {
-    "N": ['type', 'gender', 'number', 'case', 'animate'],
-    "V": ['type', 'aspect', 'vform', 'person', 'number', 'gender', 'negative'],
-    "A": ['type', 'degree', 'gender', 'number', 'case', 'definiteness'],
-    "R": ['type', 'degree'],
-    "P": ['type', 'person', 'gender', 'number', 'case', 'owner_number', 'owned_gender', 'clitic'],
-    "M": ['form', 'type', 'gender', 'number', 'case', 'definiteness'],
-    "S": ['case'],
-    "C": ['type'],
-    "Q": [],
-    "I": [],
-    "Y": [],
-    "X": ['type']
-}
-
-class ComponentType(Enum):
-    Other = 0
-    Core = 2
-    Core2w = 3
-
-class RestrictionType(Enum):
-    Morphology = 0
-    Lexis = 1
-    MatchAll = 2
-
-class Order(Enum):
-    FromTo = 0
-    ToFrom = 1
-    Any = 2
-
-    @staticmethod
-    def new(order):
-        if order is None:
-            return Order.Any
-        elif order == "to-from":
-            return Order.ToFrom
-        elif order == "from-to":
-            return Order.FromTo
-        else:
-            raise NotImplementedError("What kind of ordering is:
{}".format(order)) - - - def match(self, from_w, to_w): - if self is Order.Any: - return True - - fi = from_w.int_id - ti = to_w.int_id - - if self is Order.FromTo: - return fi < ti - elif self is Order.ToFrom: - return ti < fi - else: - raise NotImplementedError("Should not be here: Order match") - - -class ComponentRepresentation: - def __init__(self, data, word_renderer): - self.data = data - self.word_renderer = word_renderer - - self.words = [] - self.rendition_text = None - self.agreement = [] - - def get_agreement(self): - return [] - - def add_word(self, word): - self.words.append(word) - - def render(self): - if self.rendition_text is None: - self.rendition_text = self._render() - - def _render(self): - raise NotImplementedError("Not implemented for class: {}".format(type(self))) - -class LemmaCR(ComponentRepresentation): - def _render(self): - return self.words[0].lemma if len(self.words) > 0 else None - -class LexisCR(ComponentRepresentation): - def _render(self): - return self.data['lexis'] - -class WordFormAllCR(ComponentRepresentation): - def _render(self): - if len(self.words) == 0: - return None - else: - forms = [w.text.lower() for w in self.words] - return "/".join(set(forms)) - -class WordFormAnyCR(ComponentRepresentation): - def _render(self): - text_forms = {} - msd_lemma_txt_triplets = Counter([(w.msd, w.lemma, w.text) for w in self.words]) - for (msd, lemma, text), _n in reversed(msd_lemma_txt_triplets.most_common()): - text_forms[(msd, lemma)] = text - - words_counter = [] - for word in self.words: - words_counter.append((word.msd, word.lemma)) - sorted_words = sorted(set(words_counter), key=lambda x: -words_counter.count(x)) - - for word_msd, word_lemma in sorted_words: - for agr in self.agreement: - if not agr.match(word_msd): - break - else: - for agr in self.agreement: - agr.confirm_match() - - if word_lemma is None: - return None - else: - return text_forms[(word_msd, word_lemma)] - -class WordFormMsdCR(WordFormAnyCR): - def __init__(self, *args): - super().__init__(*args) - self.lemma = None - self.msd = None - - def check_msd(self, word_msd): - if 'msd' not in self.data: - return True - selectors = self.data['msd'] - - for key, value in selectors.items(): - t = word_msd[0] - v = TAGSET[t].index(key.lower()) - f1 = word_msd[v + 1] - f2 = CODES[value] - - if '-' not in [f1, f2] and f1 != f2: - return False - - return True - - def add_word(self, word): - if self.lemma is None: - self.lemma = word.lemma - self.msd = word.msd - - if self.check_msd(word.msd): - super().add_word(word) - - def _render(self): - msd = self.word_renderer.get_lemma_msd(self.lemma, self.msd) - self.words.append(WordMsdOnly(msd)) - - return super()._render() - -class WordFormAgreementCR(WordFormMsdCR): - def __init__(self, data, word_renderer): - super().__init__(data, word_renderer) - self.rendition_candidate = None - - def get_agreement(self): - return self.data['other'] - - def match(self, word_msd): - existing = [(w.msd, w.text) for w in self.words] - - lemma_available_words = self.word_renderer.available_words(self.lemma, existing) - for candidate_msd, candidate_text in lemma_available_words: - if self.msd[0] != candidate_msd[0]: - continue - - if WordFormAgreementCR.check_agreement(word_msd, candidate_msd, self.data['agreement']): - if self.check_msd(candidate_msd): - self.rendition_candidate = candidate_text - return True - - return False - - def confirm_match(self): - self.rendition_text = self.rendition_candidate - - @staticmethod - def check_agreement(msd1, msd2, agreements): - 
for agr_case in agreements: - t1 = msd1[0] - # if not in msd, some strange msd was tries, skipping... - if agr_case not in TAGSET[t1]: - logging.warning("Cannot do agreement: {} for msd {} not found!" - .format(agr_case, msd1)) - return False - - v1 = TAGSET[t1].index(agr_case) - # if none specified: nedolocnik, always agrees - if v1 + 1 >= len(msd1): - continue - # first is uppercase, not in TAGSET - m1 = msd1[v1 + 1] - - # REPEAT (not DRY!) - t2 = msd2[0] - if agr_case not in TAGSET[t2]: - logging.warning("Cannot do agreement: {} for msd {} not found!" - .format(agr_case, msd2)) - return False - v2 = TAGSET[t2].index(agr_case) - if v2 + 1 >= len(msd2): - continue - m2 = msd2[v2 + 1] - - # match! - if '-' not in [m1, m2] and m1 != m2: - return False - - return True - - def render(self): - pass - - -class ComponentRendition: - def __init__(self): - self.more = {} - self.representation_factory = ComponentRepresentation - - def add_feature(self, feature): - if 'rendition' in feature: - if feature['rendition'] == "lemma": - self.representation_factory = LemmaCR - elif feature['rendition'] == "word_form": - # just by default, changes with selection - self.representation_factory = WordFormAnyCR - elif feature['rendition'] == "lexis": - self.representation_factory = LexisCR - self.more['lexis'] = feature['string'] - else: - raise NotImplementedError("Representation rendition: {}".format(feature)) - - elif 'selection' in feature: - if feature['selection'] == "msd": - # could already be agreement - if self.representation_factory != WordFormAgreementCR: - self.representation_factory = WordFormMsdCR - self.more['msd'] = {k: v for k, v in feature.items() if k != 'selection'} - elif feature['selection'] == "all": - self.representation_factory = WordFormAllCR - elif feature['selection'] == 'agreement': - assert feature['head'][:4] == 'cid_' - assert feature['msd'] is not None - self.representation_factory = WordFormAgreementCR - self.more['agreement'] = feature['msd'].split('+') - self.more['other'] = feature['head'][4:] - else: - raise NotImplementedError("Representation selection: {}".format(feature)) - - def cr_instance(self, word_renderer): - return self.representation_factory(self.more, word_renderer) - - @staticmethod - def set_representations(match, word_renderer): - representations = {} - for c in match.structure.components: - representations[c.idx] = [] - for rep in c.representation: - representations[c.idx].append(rep.cr_instance(word_renderer)) - - for cid, reps in representations.items(): - for rep in reps: - for agr in rep.get_agreement(): - if len(representations[agr]) != 1: - n = len(representations[agr]) - raise NotImplementedError( - "Structure {}: ".format(match.structure.id) + - "component {} has agreement".format(cid) + - " with component {}".format(agr) + - ", however there are {} (!= 1) representations".format(n) + - " of component {}!".format(agr)) - - representations[agr][0].agreement.append(rep) - - for words in match.matches: - # first pass, check everything but agreements - for w_id, w in words.items(): - component = match.structure.get_component(w_id) - component_representations = representations[component.idx] - for representation in component_representations: - representation.add_word(w) - - for cid, reps in representations.items(): - for rep in reps: - rep.render() - - for cid, reps in representations.items(): - reps = [rep.rendition_text for rep in reps] - if reps == []: - pass - elif all(r is None for r in reps): - match.representations[cid] = None - else: - 
match.representations[cid] = " ".join(("" if r is None else r) for r in reps) - - -class ComponentStatus(Enum): - Optional = 0 - Required = 1 - Forbidden = 2 - - -def get_level(restriction): - for feature in restriction: - if "level" in feature.keys(): - lvl = feature.get("level") - else: - continue - - raise RuntimeError("Unreachable!") - - -def determine_ppb(rgx): - if rgx[0] in ("A", "N", "R"): - return 0 - elif rgx[0] == "V": - if len(rgx) == 1: - return 2 - elif 'a' in rgx[1]: - return 3 - elif 'm' in rgx[1]: - return 1 - else: - return 2 - else: - return 4 - -class MorphologyRegex: - def __init__(self, restriction): - self.min_msd_length = 1 - - restr_dict = {} - for feature in restriction: - feature_dict = dict(feature.items()) - - match_type = True - if "filter" in feature_dict: - assert feature_dict['filter'] == "negative" - match_type = False - del feature_dict['filter'] - - assert len(feature_dict) == 1 - key, value = next(iter(feature_dict.items())) - restr_dict[key] = (value, match_type) - - assert 'POS' in restr_dict - category = restr_dict['POS'][0].capitalize() - cat_code = CODES[category] - rgx = [cat_code] + ['.'] * 10 - - del restr_dict['POS'] - - for attribute, (value, typ) in restr_dict.items(): - index = TAGSET[cat_code].index(attribute.lower()) - assert index >= 0 - - if '|' in value: - match = "".join(CODES[val] for val in value.split('|')) - else: - match = CODES[value] - - match = "[{}{}]".format("" if typ else "^", match) - rgx[index + 1] = match - - if typ: - self.min_msd_length = max(index + 1, self.min_msd_length) - - # strip rgx - for i in reversed(range(len(rgx))): - if rgx[i] == '.': - rgx = rgx[:-1] - else: - break - - self.re_objects = [re.compile(r) for r in rgx] - self.rgx = rgx - - def __call__(self, text): - if len(text) <= self.min_msd_length: - return False - - for c, r in zip(text, self.re_objects): - if not r.match(c): - return False - return True - - -class LexisRegex: - def __init__(self, restriction): - restr_dict = {} - for feature in restriction: - restr_dict.update(feature.items()) - - assert "lemma" in restr_dict - self.match_list = restr_dict['lemma'].split('|') - - def __call__(self, text): - return text in self.match_list - - -class Restriction: - def __init__(self, restriction_tag): - self.ppb = 4 # polnopomenska beseda (0-4) - - if restriction_tag is None: - self.type = RestrictionType.MatchAll - self.matcher = None - self.present = None - return - - restriction_type = restriction_tag.get('type') - if restriction_type == "morphology": - self.type = RestrictionType.Morphology - self.matcher = MorphologyRegex(list(restriction_tag)) - self.ppb = determine_ppb(self.matcher.rgx) - - elif restriction_type == "lexis": - self.type = RestrictionType.Lexis - self.matcher = LexisRegex(list(restriction_tag)) - else: - raise NotImplementedError() - - def match(self, word): - if self.type == RestrictionType.Morphology: - match_to = word.msd - elif self.type == RestrictionType.Lexis: - match_to = word.lemma - elif self.type == RestrictionType.MatchAll: - return True - else: - raise RuntimeError("Unreachable!") - - return self.matcher(match_to) - - -class Component: - def __init__(self, info): - idx = info['cid'] - name = info['name'] if 'name' in info else None - typ = ComponentType.Core if info['type'] == "core" else ComponentType.Other - - if 'status' not in info: - status = ComponentStatus.Required - elif info['status'] == 'forbidden': - status = ComponentStatus.Forbidden - elif info['status'] == 'obligatory': - status = ComponentStatus.Required - 
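        # Editorial sketch, not part of the original patch: MorphologyRegex
        # (above) compiles a restriction into a positional pattern over the MSD
        # string. A restriction with POS="verb" and aspect="progressive|biaspectual"
        # yields the pattern list ['V', '.', '[pb]'] (aspect is attribute 1 of
        # TAGSET['V'], hence character 2) and min_msd_length == 2, so that:
        #     matcher("Vmpn")  # -> True  (verb, any type, aspect 'p')
        #     matcher("Vmbn")  # -> True  (aspect 'b')
        #     matcher("Ncms")  # -> False (POS 'N' is not 'V')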
        elif info['status'] == 'optional':
-            status = ComponentStatus.Optional
-        else:
-            raise NotImplementedError("strange status: {}".format(info['status']))
-
-        self.status = status
-        self.name = name
-        self.idx = idx
-        self.restrictions = []
-        self.next_element = []
-        self.representation = []
-        self.selection = {}
-        self.type = typ
-
-        self.iter_ctr = 0
-
-    def add_next(self, next_component, link_label, order):
-        self.next_element.append((next_component, link_label, Order.new(order)))
-
-    def set_restriction(self, restrictions_tag):
-        if restrictions_tag is None:
-            self.restrictions = [Restriction(None)]
-
-        elif restrictions_tag.tag == "restriction":
-            self.restrictions = [Restriction(restrictions_tag)]
-
-        elif restrictions_tag.tag == "restriction_or":
-            self.restrictions = [Restriction(el) for el in restrictions_tag]
-
-        else:
-            raise RuntimeError("Unreachable")
-
-    def set_representation(self, representation):
-        for rep in representation:
-            crend = ComponentRendition()
-            for feature in rep:
-                crend.add_feature(feature.attrib)
-            self.representation.append(crend)
-
-    def find_next(self, deps, comps, restrs, reprs):
-        to_ret = []
-        for d in deps:
-            if d[0] == self.idx:
-                _, idx, dep_label, order = d
-
-                next_component = Component(comps[idx])
-                next_component.set_restriction(restrs[idx])
-                next_component.set_representation(reprs[idx])
-                to_ret.append(next_component)
-
-                self.add_next(next_component, dep_label, order)
-                others = next_component.find_next(deps, comps, restrs, reprs)
-                to_ret.extend(others)
-
-        return to_ret
-
-    def name_str(self):
-        return "_" if self.name is None else self.name
-
-    def match(self, word):
-        m1 = self._match_self(word)
-        if m1 is None:
-            return None
-
-        mn = self._match_next(word)
-        if mn is None:
-            return None
-
-        to_ret = [m1]
-        for cmatch in mn:
-            # a good match, but nothing to add: just continue
-            if len(cmatch) == 0:
-                continue
-
-            # more than one match found for this particular component
-            elif len(cmatch) > 1:
-                # multiple matches in multiple components are not supported:
-                # log a warning and merge only the first match
-                if len(to_ret) > 1:
-                    logging.warning("Strange multiple match: {}".format(
-                        str([w.id for w in cmatch[0].values()])))
-
-                    for tr in to_ret:
-                        tr.update(cmatch[0])
-                    continue
-
-                # a single component with multiple matches:
-                # fan out to_ret, one entry per match
-                to_ret = [{**dict(to_ret[0]), **m} for m in cmatch]
-
-            else:
-                for tr in to_ret:
-                    tr.update(cmatch[0])
-
-        return to_ret
-
-    def _match_self(self, word):
-        # the restrictions are OR-ed: any one of them may match
-        for restr in self.restrictions:
-            if restr.match(word):
-                return {self.idx: word}
-
-    def _match_next(self, word):
-        # collect matches for every component linked from this component
-        to_ret = []
-
-        # need to get all links that match
-        for next, link, order in self.next_element:
-            next_links = word.get_links(link)
-            to_ret.append([])
-
-            # Optional and Forbidden components are satisfied even with no match
-            good = next.status != ComponentStatus.Required
-            for next_word in next_links:
-                if not order.match(word, next_word):
-                    continue
-
-                match = next.match(next_word)
-
-                if match is not None:
-                    # special treatment for forbidden components: one match vetoes all
-                    if next.status == ComponentStatus.Forbidden:
-                        good = False
-                        break
-
-                    else:
-                        assert type(match) is list
-                        to_ret[-1].extend(match)
-                        good = True
-
-            # if none matched, nothing was found!
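            # Editorial sketch, not part of the original patch: the order check
            # above uses Order.match, which compares sentence positions through
            # int_id. With assumed words w1.int_id == 3 and w2.int_id == 5:
            #     Order.FromTo.match(w1, w2)  # -> True  (3 < 5)
            #     Order.ToFrom.match(w1, w2)  # -> False
            #     Order.Any.match(w1, w2)     # -> True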
- if not good: - return None - - return to_ret - - -class SyntacticStructure: - def __init__(self): - self.id = None - self.lbs = None - self.components = [] - - @staticmethod - def from_xml(xml): - st = SyntacticStructure() - st.id = xml.get('id') - st.lbs = xml.get('LBS') - - assert len(list(xml)) == 1 - system = next(iter(xml)) - - assert system.get('type') == 'JOS' - components, dependencies, definitions = list(system) - - deps = [(dep.get('from'), dep.get('to'), dep.get('label'), dep.get('order')) - for dep in dependencies] - comps = {comp.get('cid'): dict(comp.items()) for comp in components} - - restrs, forms = {}, {} - - for comp in definitions: - n = comp.get('cid') - restrs[n] = None - forms[n] = [] - - for el in comp: - if el.tag.startswith("restriction"): - assert restrs[n] is None - restrs[n] = el - elif el.tag.startswith("representation"): - st.add_representation(n, el, forms) - else: - raise NotImplementedError("Unknown definition: {} in structure {}" - .format(el.tag, st.id)) - - fake_root_component = Component({'cid': '#', 'type': 'other'}) - st.components = fake_root_component.find_next(deps, comps, restrs, forms) - - st.determine_core2w() - return st - - def determine_core2w(self): - ppb_components = [] - for c in self.components: - if c.type != ComponentType.Core: - continue - - ppb = 4 - for r in c.restrictions: - ppb = min(r.ppb, ppb) - - ppb_components.append((c, ppb)) - - ppb_components = sorted(ppb_components, key=lambda c: c[1]) - if len(ppb_components) > 2 and ppb_components[1][1] == ppb_components[2][1]: - raise RuntimeError("Cannot determine 2 'jedrna polnopomenska beseda' for", self.id) - - for c, _ in ppb_components[:2]: - c.type = ComponentType.Core2w - - def add_representation(self, n, rep_el, forms): - assert rep_el.tag == "representation" - to_add = [] - for el in rep_el: - assert el.tag == "feature" - if 'rendition' in el.attrib or 'selection' in el.attrib: - to_add.append(el) - else: - logging.warning("Strange representation feature in structure {}. 
Skipping" - .format(self.id)) - continue - forms[n].append(to_add) - - def get_component(self, idx): - for c in self.components: - if c.idx == idx: - return c - raise RuntimeError("Unknown component id: {}".format(idx)) - - def match(self, word): - matches = self.components[0].match(word) - return [] if matches is None else matches - -def load_structures(filename): - with open(filename, 'r') as fp: - et = ElementTree.XML(fp.read()) - - return build_structures(et), get_lemma_features(et) - -def build_structures(et): - global MAX_NUM_COMPONENTS - - structures = [] - for structure in et.iter('syntactic_structure'): - to_append = SyntacticStructure.from_xml(structure) - if to_append is None: - continue - - structures.append(to_append) - MAX_NUM_COMPONENTS = max(MAX_NUM_COMPONENTS, len(to_append.components)) - - return structures - -def get_lemma_features(et): - lf = et.find('lemma_features') - if lf is None: - return {} - - result = {} - for pos in lf.iter('POS'): - rgx_list = MorphologyRegex(pos).rgx - rgx_str = "" - for position in rgx_list: - if position == ".": - rgx_str += " " - elif len(position) == 1: - rgx_str += position - elif len(position) == 3 and position[0] == "[" and position[2] == "]": - rgx_str += position[1] - else: - raise RuntimeError("Strange rgx for lemma_feature...") - - assert rgx_str[0].isupper() - result[rgx_str[0]] = rgx_str.strip().replace(' ', '-') - - return result - -def get_msd(comp): - d = dict(comp.items()) - if 'msd' in d: - return d['msd'] - elif 'ana' in d: - return d['ana'][4:] - else: - logging.error(d, file=sys.stderr) - raise NotImplementedError("MSD?") - - -class WordMsdOnly: - def __init__(self, msd): - self.msd = msd - self.lemma = None - self.text = None - - def most_frequent_text(self, _): - return None - - -class Word: - def __init__(self, xml, do_msd_translate): - self.lemma = xml.get('lemma') - self.msd = MSD_TRANSLATE[get_msd(xml)] if do_msd_translate else get_msd(xml) - self.id = xml.get('id') - self.text = xml.text - self.links = defaultdict(list) - - last_num = self.id.split('.')[-1] - if last_num[0] not in '0123456789': - last_num = last_num[1:] - self.int_id = int(last_num) - - assert None not in (self.id, self.lemma, self.msd) - - @staticmethod - def pc_word(pc, do_msd_translate): - pc.set('lemma', pc.text) - pc.set('msd', "N" if do_msd_translate else "U") - return Word(pc, do_msd_translate) - - def add_link(self, link, to): - self.links[link].append(to) - - def get_links(self, link): - if link not in self.links and "|" in link: - for l in link.split('|'): - self.links[link].extend(self.links[l]) - - return self.links[link] - - def most_frequent_text(self, word_renderer): - return word_renderer.render(self.lemma, self.msd) - -class WordMsdRenderer: - def __init__(self, lemma_features): - self.all_words = [] - self.rendered_words = {} - self.frequent_words = {} - self.num_words = {} - self.lemma_msd = {} - self.lemma_features = lemma_features - self.memoized_msd_merges = {} - - def add_words(self, words): - self.all_words.extend(words) - - def num_all_words(self): - return len(self.all_words) - - def generate_renders(self): - num_words = defaultdict(int) - data = defaultdict(lambda: defaultdict(list)) - for w in self.all_words: - data[w.lemma][w.msd].append(w.text) - - for lemma, ld in data.items(): - self.rendered_words[lemma] = {} - freq_words = defaultdict(int) - common_msd = "*" * 10 - - for msd, texts in ld.items(): - # TODO: this should be out of generate_renders... 
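                # Editorial sketch, not part of the original patch: for every
                # (lemma, msd) pair the most frequent surface form wins, e.g.
                # with assumed forms
                #     texts = ["mizo", "mizo", "mizi"]
                #     max(set(texts), key=texts.count)  # -> "mizo"
                # and merge_msd (below) folds all MSDs of a lemma into one
                # pattern where disagreeing positions become '-', so "Ncfsn"
                # and "Ncfpa" merge to "Ncf--".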
-                num_words[(lemma, msd[0])] += len(texts)
-
-                rep = max(set(texts), key=texts.count)
-                self.rendered_words[lemma][msd] = (rep, len(texts))
-
-                for txt in texts:
-                    freq_words[(msd, txt)] += 1
-
-                common_msd = self.merge_msd(common_msd, msd)
-
-            self.lemma_msd[lemma] = common_msd
-
-            self.frequent_words[lemma] = []
-            for (msd, txt), n in sorted(freq_words.items(), key=lambda x: -x[1]):
-                self.frequent_words[lemma].append((msd, txt, n))
-
-        lf = self.lemma_features
-        for lemma in self.lemma_msd:
-            cmsd = self.lemma_msd[lemma]
-            if cmsd[0] in lf:
-                self.lemma_msd[lemma] = "".join(
-                    l1 if l1 != "-" else l2 for l1, l2 in zip(lf[cmsd[0]], cmsd)
-                )
-
-        self.num_words = dict(num_words)
-
-    def merge_msd(self, common_msd, new_msd):
-        key = (common_msd, new_msd)
-        if key in self.memoized_msd_merges:
-            return self.memoized_msd_merges[key]
-
-        def merge_letter(l1, l2):
-            if l1 == "*":
-                return l2
-            elif l1 != l2:
-                return "-"
-            else:
-                return l1
-
-        value = "".join(merge_letter(l1, l2) for l1, l2 in zip(common_msd, new_msd))
-        self.memoized_msd_merges[key] = value
-        return value
-
-    def render(self, lemma, msd):
-        if lemma in self.rendered_words:
-            if msd in self.rendered_words[lemma]:
-                return self.rendered_words[lemma][msd][0]
-
-    def available_words(self, lemma, existing_texts):
-        counted_texts = Counter(existing_texts)
-        for (msd, text), _n in counted_texts.most_common():
-            yield (msd, text)
-
-        if lemma in self.frequent_words:
-            for msd, text, _ in self.frequent_words[lemma]:
-                if (msd, text) not in counted_texts:
-                    yield (msd, text)
-
-    def get_lemma_msd(self, lemma, word_msd):
-        # the lemma must be present, since we collect every lemma
-        lemma_msd = self.lemma_msd[lemma]
-
-        if lemma_msd[0] == '-':
-            if word_msd[0] in self.lemma_features:
-                return self.lemma_features[word_msd[0]]
-            else:
-                return '-'
-        else:
-            return lemma_msd
-
-def is_root_id(id_):
-    return len(id_.split('.')) == 3
-
-
-def load_files(args):
-    filenames = args.input
-    skip_id_check = args.skip_id_check
-    do_msd_translate = not args.no_msd_translate
-
-    for n, fname in enumerate(filenames):
-        if args.count_files:
-            status = " :: {} / {}".format(n, len(filenames))
-        else:
-            status = ""
-        yield load_tei_file(fname, skip_id_check, do_msd_translate, args.pc_tag, status)
-
-
-def load_tei_file(filename, skip_id_check, do_msd_translate, pc_tag, status):
-    logging.info("LOADING FILE: {}{}".format(filename, status))
-
-    with open(filename, 'r') as fp:
-        xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1)
-        xmlstring = xmlstring.replace(' xml:', ' ')
-        et = ElementTree.XML(xmlstring)
-
-    words = {}
-    for w in et.iter("w"):
-        words[w.get('id')] = Word(w, do_msd_translate)
-    for pc in et.iter(pc_tag):
-        words[pc.get('id')] = Word.pc_word(pc, do_msd_translate)
-
-    for l in et.iter("link"):
-        if 'dep' in l.keys():
-            ana = l.get('afun')
-            lfrom = l.get('from')
-            dest = l.get('dep')
-        else:
-            ana = l.get('ana')
-            if ana[:4] != 'syn:':  # don't bother with non-syntactic links
-                continue
-            ana = ana[4:]
-            lfrom, dest = l.get('target').replace('#', '').split()
-
-        if lfrom in words:
-            if not skip_id_check and is_root_id(lfrom):
-                logging.error("NOO: {}".format(lfrom))
-                sys.exit(1)
-
-            if dest in words:
-                next_word = words[dest]
-                words[lfrom].add_link(ana, next_word)
-            else:
-                logging.error("Unknown id: {}".format(dest))
-                sys.exit(1)
-
-        else:
-            # the link source is not a known word: strange input, just skip it...
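            # Editorial sketch, not part of the original patch: both link
            # notations handled above resolve to the same call. With assumed ids,
            #     <link dep="F1.s1.1.2" from="F1.s1.1.1" afun="dol"/>
            #     <link ana="syn:dol" target="#F1.s1.1.1 #F1.s1.1.2"/>
            # either becomes words["F1.s1.1.1"].add_link("dol", words["F1.s1.1.2"]).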
- pass - - return list(words.values()) - - -class Formatter: - def __init__(self, colocation_ids, word_renderer): - self.colocation_ids = colocation_ids - self.word_renderer = word_renderer - self.additional_init() - - def header_repeat(self): - raise NotImplementedError("Header repeat formatter not implemented") - def header_right(self): - raise NotImplementedError("Header right formatter not implemented") - def content_repeat(self, words, representations, idx, sidx): - raise NotImplementedError("Content repeat formatter not implemented") - def content_right(self, freq): - raise NotImplementedError("Content right formatter not implemented") - def group(self): - raise NotImplementedError("Group for formatter not implemented") - - def additional_init(self): - pass - def length(self): - return len(self.header_repeat()) - - def set_structure(self, structure): - pass - def new_match(self, match): - pass - - -class OutNoStatFormatter(Formatter): - def additional_init(self): - self.representation = "" - - def header_repeat(self): - return ["Lemma", "Representative_form", "RF_scenario"] - - def header_right(self): - return ["Joint_representative_form", "Frequency"] - - def content_repeat(self, words, representations, idx, _sidx): - word = words[idx] - if idx not in representations: - return [word.lemma, "", ""] - - rep = representations[idx] - if rep is None: - self.representation += " " + word.lemma - return [word.lemma, word.lemma, "lemma_fallback"] - else: - self.representation += " " + rep - return [word.lemma, rep, "ok"] - - def content_right(self, freq): - rep = re.sub(' +', ' ', self.representation.strip()) - result = [rep, str(freq)] - self.representation = "" - return result - - def group(self): - return True - -class AllFormatter(Formatter): - def header_repeat(self): - return ["Token_ID", "Word_form", "Lemma", "Msd"] - - def header_right(self): - return [] - - def content_repeat(self, words, _representations, idx, _sidx): - word = words[idx] - return [word.id, word.text, word.lemma, word.msd] - - def content_right(self, _freq): - return [] - - def group(self): - return False - -class StatsFormatter(Formatter): - def additional_init(self): - self.stats = None - self.jppb = None - self.corew = None - - @staticmethod - def stat_str(num): - return "{:.5f}".format(num) if isinstance(num, float) else str(num) - - def set_structure(self, structure): - jppb = [] - corew = [] - - for component in structure.components: - if component.type == ComponentType.Core2w: - jppb.append(component.idx) - if component.type != ComponentType.Other: - corew.append(component.idx) - - assert(len(jppb) == 2) - self.jppb = tuple(jppb) - self.corew = tuple(corew) - - def new_match(self, match): - self.stats = {"freq": {}} - - for cid in self.corew: - if cid not in match.matches[0]: - freq = 0 - else: - word = match.matches[0][cid] - freq = self.word_renderer.num_words[(word.lemma, word.msd[0])] - - self.stats["freq"][cid] = freq - - fx = self.stats["freq"][self.jppb[0]] - fy = self.stats["freq"][self.jppb[1]] - freq = len(match) - N = self.word_renderer.num_all_words() - - self.stats['d12'] = freq / fx - (fy - freq) / (N - fx) - self.stats['d21'] = freq / fy - (fx - freq) / (N - fy) - - self.stats['df'] = match.distinct_forms() - self.stats['freq_all'] = freq - - def header_repeat(self): - return ["Distribution"] - - def header_right(self): - return ["Delta_p12", "Delta_p21", "LogDice_core", "LogDice_all", "Distinct_forms"] - - def content_repeat(self, words, representations, idx, sidx): - # not a core word - if 
idx not in self.corew: - return [""] * self.length() - - word = words[idx] - key = (sidx, idx, word.lemma) - distribution = self.colocation_ids.dispersions[key] - return [self.stat_str(distribution)] - - def content_right(self, freq): - fx = self.stats["freq"][self.jppb[0]] - fy = self.stats["freq"][self.jppb[1]] - freq = self.stats['freq_all'] - logdice_core = 14 + log2(2 * freq / (fx + fy)) - - fi = [self.stats["freq"][idx] for idx in self.corew] - fi = [f for f in fi if f > 0] - logdice_all = 14 + log2(len(fi) * freq / sum(fi)) - - return [self.stat_str(x) for x in ( - self.stats["d12"], self.stats["d21"], logdice_core, logdice_all, self.stats['df'] - )] - - def group(self): - return True - -class OutFormatter(Formatter): - def additional_init(self): - self.f1 = OutNoStatFormatter(self.colocation_ids, self.word_renderer) - self.f2 = StatsFormatter(self.colocation_ids, self.word_renderer) - - def header_repeat(self): - return self.f1.header_repeat() + self.f2.header_repeat() - - def header_right(self): - return self.f1.header_right() + self.f2.header_right() - - def content_repeat(self, words, representations, idx, sidx): - cr1 = self.f1.content_repeat(words, representations, idx, sidx) - cr2 = self.f2.content_repeat(words, representations, idx, sidx) - return cr1 + cr2 - - def content_right(self, freq): - return self.f1.content_right(freq) + self.f2.content_right(freq) - - def group(self): - return self.f1.group() and self.f2.group() - - def set_structure(self, structure): - self.f2.set_structure(structure) - - def new_match(self, match): - self.f2.new_match(match) - - -class Writer: - @staticmethod - def other_params(args): - return (args.multiple_output, int(args.sort_by), args.sort_reversed) - - @staticmethod - def make_output_writer(args, colocation_ids, word_renderer): - params = Writer.other_params(args) - return Writer(args.out, OutFormatter(colocation_ids, word_renderer), params) - - @staticmethod - def make_output_no_stat_writer(args, colocation_ids, word_renderer): - params = Writer.other_params(args) - return Writer(args.out_no_stat, OutNoStatFormatter(colocation_ids, word_renderer), params) - - @staticmethod - def make_all_writer(args, colocation_ids, word_renderer): - return Writer(args.all, AllFormatter(colocation_ids, word_renderer), None) - - @staticmethod - def make_stats_writer(args, colocation_ids, word_renderer): - params = Writer.other_params(args) - return Writer(args.stats, StatsFormatter(colocation_ids, word_renderer), params) - - def __init__(self, file_out, formatter, params): - if params is None: - self.multiple_output = False - self.sort_by = -1 - self.sort_order = None - else: - self.multiple_output = params[0] - self.sort_by = params[1] - self.sort_order = params[2] - - self.output_file = file_out - self.formatter = formatter - - def header(self): - repeating_cols = self.formatter.header_repeat() - cols = ["C{}_{}".format(i + 1, thd) for i in range(MAX_NUM_COMPONENTS) - for thd in repeating_cols] - - cols = ["Structure_ID"] + cols + ["Colocation_ID"] - cols += self.formatter.header_right() - return cols - - def sorted_rows(self, rows): - if self.sort_by < 0 or len(rows) < 2: - return rows - - if len(rows[0]) <= self.sort_by: - logging.warning("Cannot sort by column #{}: Not enough columns!".format(len(rows[0]))) - return rows - - try: - int(rows[0][self.sort_by]) - def key(row): - return int(row[self.sort_by]) - except ValueError: - def key(row): - return row[self.sort_by].lower() - - return sorted(rows, key=key, reverse=self.sort_order) - - def 
write_header(self, file_handler): - file_handler.write(", ".join(self.header()) + "\n") - - def write_out_worker(self, file_handler, structure, colocation_ids): - rows = [] - components = structure.components - - for match in colocation_ids.get_matches_for(structure): - self.formatter.new_match(match) - - for words in match.matches: - to_write = [] - - for idx, _comp in enumerate(components): - idx = str(idx + 1) - if idx not in words: - to_write.extend([""] * self.formatter.length()) - else: - to_write.extend(self.formatter.content_repeat(words, match.representations, idx, structure.id)) - - # make them equal size - to_write.extend([""] * (MAX_NUM_COMPONENTS * self.formatter.length() - len(to_write))) - - # structure_id and colocation_id - to_write = [structure.id] + to_write + [match.match_id] - - # header_right - to_write.extend(self.formatter.content_right(len(match))) - rows.append(to_write) - - if self.formatter.group(): - break - - if rows != []: - rows = self.sorted_rows(rows) - file_handler.write("\n".join([", ".join(row) for row in rows]) + "\n") - file_handler.flush() - - def write_out(self, structures, colocation_ids): - if self.output_file is None: - return - - def fp_close(fp_): - if fp_ != sys.stdout: - fp_.close() - - def fp_open(snum=None): - if snum is None: - return open(self.output_file, "w") - else: - return open("{}.{}".format(self.output_file, snum), "w") - - if not self.multiple_output: - fp = fp_open() - self.write_header(fp) - - for s in structures: - if self.multiple_output: - fp = fp_open(s.id) - self.write_header(fp) - - self.formatter.set_structure(s) - self.write_out_worker(fp, s, colocation_ids) - - if self.multiple_output: - fp_close(fp) - - if not self.multiple_output: - fp_close(fp) - -class StructureMatch: - def __init__(self, match_id, structure): - self.match_id = match_id - self.structure = structure - - self.matches = [] - self.representations = {} - - def distinct_forms(self): - dm = set() - keys = list(self.matches[0].keys()) - for words in self.matches: - dm.add(" ".join(words[k].text for k in keys)) - return len(dm) - - def append(self, match): - self.matches.append(match) - - def __len__(self): - return len(self.matches) - -class ColocationIds: - def __init__(self): - self.data = {} - self.min_frequency = args.min_freq - self.dispersions = {} - - def _add_match(self, key, structure, match): - if key not in self.data: - self.data[key] = StructureMatch(str(len(self.data) + 1), structure) - self.data[key].append(match) - - def get(self, key, n): - return self.data[key][n] - - def add_matches(self, matches): - for structure, nms in matches.items(): - for nm in nms: - self._add_match(nm[1], structure, nm[0]) - - def get_matches_for(self, structure): - for _cid_tup, sm in self.data.items(): - if sm.structure != structure: - continue - - yield sm - - def set_representations(self, word_renderer): - for _1, sm in tqdm(self.data.items()): - ComponentRendition.set_representations(sm, word_renderer) - - def determine_colocation_dispersions(self): - dispersions = defaultdict(int) - for (structure_id, *word_tups) in self.data.keys(): - for component_id, lemma in word_tups: - dispersions[(structure_id, component_id, lemma)] += 1 - self.dispersions = dict(dispersions) - - -def match_file(words, structures): - matches = {s: [] for s in structures} - - for s in tqdm(structures): - for w in words: - mhere = s.match(w) - for match in mhere: - colocation_id = [(idx, w.lemma) for idx, w in match.items()] - colocation_id = [s.id] + list(sorted(colocation_id, 
                                                     key=lambda x: x[0]))
-                colocation_id = tuple(colocation_id)
-
-                matches[s].append((match, colocation_id))
-
-    return matches
-
-
-def main(structures_file, args):
-    structures, lemma_msds = load_structures(structures_file)
-
-    colocation_ids = ColocationIds()
-    word_renderer = WordMsdRenderer(lemma_msds)
-
-    if args.parallel:
-        num_parallel = int(args.parallel)
-
-        # make a temporary directory to hold the workers' files
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            cmd = list(sys.argv)  # copy, so sys.argv itself is left untouched
-            for inpt in args.input:
-                if inpt in cmd:
-                    cmd.remove(inpt)
-
-            # remove "--parallel X"
-            pidx = cmd.index('--parallel')
-            del cmd[pidx]
-            del cmd[pidx]
-
-            def func(n):
-                cmdn = [sys.executable] + cmd + [args.input[n],
-                                                 "--match-to-file", "{}/{}.p".format(tmpdirname, n)]
-                subprocess.check_call(cmdn)
-                return n
-
-            # use ThreadPoolExecutor to run the subprocesses in parallel via Python threads
-            with concurrent.futures.ThreadPoolExecutor(max_workers=num_parallel) as executor:
-                # map() blocks until each worker finishes, yielding results in input order
-                for id_input in executor.map(func, [i for i, _ in enumerate(args.input)]):
-                    with open("{}/{}.p".format(tmpdirname, id_input), "rb") as fp:
-                        words, matches = pickle.load(fp)
-
-                    colocation_ids.add_matches(matches)
-                    word_renderer.add_words(words)
-
-    else:
-        for words in load_files(args):
-            matches = match_file(words, structures)
-            # children of a parallel run just save their matches to a temporary
-            # file; such a run MUST NOT have more than one input file
-            if args.match_to_file is not None:
-                with open(args.match_to_file, "wb") as fp:
-                    pickle.dump((words, matches), fp)
-                    return
-
-            else:
-                colocation_ids.add_matches(matches)
-                word_renderer.add_words(words)
-
-    # get word renders for lemma/msd
-    word_renderer.generate_renders()
-    colocation_ids.determine_colocation_dispersions()
-
-    # figure out representations!
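    # Editorial sketch, not part of the original patch, of the parallel fan-out
    # above (paths illustrative): for `--parallel 2` and inputs a.xml b.xml,
    # each worker re-runs this script on one input and pickles its
    # (words, matches) into the shared temporary directory, roughly:
    #     python wani.py structures.xml a.xml --match-to-file /tmp/tmpXYZ/0.p
    #     python wani.py structures.xml b.xml --match-to-file /tmp/tmpXYZ/1.p
    # The parent then unpickles each file and aggregates the results through
    # add_matches() and add_words().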
-    if args.out or args.out_no_stat:
-        colocation_ids.set_representations(word_renderer)
-
-    Writer.make_output_writer(args, colocation_ids, word_renderer).write_out(
-        structures, colocation_ids)
-    Writer.make_output_no_stat_writer(args, colocation_ids, word_renderer).write_out(
-        structures, colocation_ids)
-    Writer.make_all_writer(args, colocation_ids, word_renderer).write_out(
-        structures, colocation_ids)
-    Writer.make_stats_writer(args, colocation_ids, word_renderer).write_out(
-        structures, colocation_ids)
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        description='Extract structures from a parsed corpus.')
-    parser.add_argument('structures',
-                        help='Structure definitions in an xml file')
-    parser.add_argument('input',
-                        help='Input xml file in `ssj500k form`; more than one can be listed', nargs='+')
-    parser.add_argument('--out',
-                        help='Classic output file')
-    parser.add_argument('--out-no-stat',
-                        help='Output file, but without statistical columns')
-    parser.add_argument('--all',
-                        help='Additional output file, writes more data')
-    parser.add_argument('--stats',
-                        help='Output file for statistics')
-
-    parser.add_argument('--no-msd-translate',
-                        help='MSDs are translated from Slovene to English by default',
-                        action='store_true')
-    parser.add_argument('--skip-id-check',
-                        help='Skips checks that the ids of <w> and <pc> elements are in the correct format',
-                        action='store_true')
-    parser.add_argument('--min_freq', help='Minimal frequency in output',
-                        type=int, default=0, const=1, nargs='?')
-    parser.add_argument('--verbose', help='Enable verbose output to stderr',
-                        choices=["warning", "info", "debug"], default="info",
-                        const="info", nargs='?')
-    parser.add_argument('--count-files',
-                        help="Count files: more verbose output", action='store_true')
-    parser.add_argument('--multiple-output',
-                        help='Generate one output for each syntactic structure',
-                        action='store_true')
-
-    parser.add_argument('--sort-by',
-                        help="Sort by this column (index)", type=int, default=-1)
-    parser.add_argument('--sort-reversed',
-                        help="Sort in reversed order", action='store_true')
-
-    parser.add_argument('--pc-tag',
-                        help='Tag for separators, usually pc or c', default="pc")
-    parser.add_argument('--parallel',
-                        help='Run in multiple processes, should speed things up')
-    parser.add_argument('--match-to-file', help='Do not use!')
-
-    args = parser.parse_args()
-    logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())
-
-    start = time.time()
-    main(args.structures, args)
-    logging.info("TIME: {}".format(time.time() - start))
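For reference, this is how the deleted monolithic script was typically invoked; the file names and flag values below are illustrative, not taken from the patch:

    python wani.py structures.xml corpus1.xml corpus2.xml \
        --out collocations.csv --stats stats.csv \
        --min_freq 5 --sort-by 2 --verbose info

With --multiple-output, one file per syntactic structure is written, suffixed with the structure id (<out>.<structure_id>); with --parallel N, the script re-runs itself once per input file as shown in the fan-out sketch above.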