diff --git a/wani.py b/wani.py
index 80bcd8d..cb6ece8 100644
--- a/wani.py
+++ b/wani.py
@@ -430,6 +430,19 @@ def get_level(restriction):
 
     raise RuntimeError("Unreachable!")
 
+def determine_ppb(rgx):  # Map a morphology pattern to a rank 0-4; determine_core2w picks the lowest ranks.
+    if rgx[0] in ("A", "N", "R"):  # rgx[0] is presumably a category code -- TODO confirm against build_morphology_regex
+        return 0
+    elif rgx[0] == "V":
+        if 'a' in rgx[1]:  # substring test on the second element; 'a' is checked before 'm'
+            return 3
+        elif 'm' in rgx[1]:
+            return 1
+        else:
+            return 2
+    else:
+        return 4  # every other first element gets the highest rank value
+
 def build_morphology_regex(restriction):
     restr_dict = {}
     for feature in restriction:
@@ -494,6 +507,8 @@ def build_lexis_regex(restriction):
 
 class Restriction:
     def __init__(self, restriction_tag):
+        self.ppb = 4  # content-word rank ("polnopomenska beseda"), 0-4; stays 4 unless overridden below
+
         if restriction_tag is None:
             self.type = RestrictionType.MatchAll
             self.matcher = None
@@ -505,6 +520,8 @@ class Restriction:
             self.type = RestrictionType.Morphology
             present, self.matcher = build_morphology_regex(list(restriction_tag))
             self.present = " ".join(present)
+            self.ppb = determine_ppb(present)  # only morphology restrictions compute a rank from their pattern
+
         elif restriction_type == "lexis":
             self.type = RestrictionType.Lexis
             self.present, self.matcher = build_lexis_regex(list(restriction_tag))
@@ -711,7 +728,28 @@ class SyntacticStructure:
 
         fake_root_component = Component({'cid': '#', 'type': 'other'})
         st.components = fake_root_component.find_next(deps, comps, restrs, forms)
+
+        st.determine_core2w()  # runs after st.components has been assigned
         return st
+
+    def determine_core2w(self):  # Retag up to two Core components with the lowest rank as Core2w.
+        ppb_components = []  # (component, rank) pairs, Core components only
+        for c in self.components:
+            if c.type != ComponentType.Core:
+                continue
+
+            ppb = 4  # start at the highest rank value and keep the minimum over the component's restrictions
+            for r in c.restrictions:
+                ppb = min(r.ppb, ppb)
+
+            ppb_components.append((c, ppb))
+
+        ppb_components = sorted(ppb_components, key=lambda c: c[1])  # ascending by rank
+        if len(ppb_components) > 2 and ppb_components[1][1] == ppb_components[2][1]:  # second and third share a rank: choice is ambiguous
+            raise RuntimeError("Cannot determine 2 'jedrna polnopomenska beseda' for", self.id)
+
+        for c, _ in ppb_components[:2]:  # mutates the type of the selected components in place
+            c.type = ComponentType.Core2w
 
     def add_representation(self, n, rep_el, forms):
         assert(rep_el.tag == "representation")