determining jppb (for 2 word statistics)

This commit is contained in:
Ozbolt Menegatti 2019-06-08 11:31:52 +02:00
parent 30a5e80569
commit 3a22cd91c3

38
wani.py
View File

@ -430,6 +430,19 @@ def get_level(restriction):
raise RuntimeError("Unreachable!")
def determine_ppb(rgx):
    """Return the 'polnopomenska beseda' (ppb) rank, 0-4, of a morphology pattern.

    Args:
        rgx: Indexable pattern. ``rgx[0]`` is compared against one-letter
            category codes; ``rgx[1]``, when present, is searched for the
            letters 'a' and 'm'. ``Restriction.__init__`` passes the
            ``present`` value returned by ``build_morphology_regex``.

    Returns:
        int:
            0 -- category "A", "N" or "R"
            1 -- category "V" whose second item contains 'm' (and no 'a')
            2 -- category "V" with any other second item, or none at all
            3 -- category "V" whose second item contains 'a'
            4 -- any other category, or an empty pattern

    NOTE(review): 'a' / 'm' presumably stand for auxiliary / main verb type
    in the tagset -- confirm against the tag definitions.
    """
    # An empty pattern has no category code; give it the same rank as an
    # unlisted category instead of failing on rgx[0].
    if not rgx:
        return 4

    category = rgx[0]
    if category in ("A", "N", "R"):
        return 0
    if category != "V":
        return 4

    # A verb pattern may consist of the category code only; without a second
    # item there is nothing to inspect, so use the generic verb rank rather
    # than raising IndexError on rgx[1].
    if len(rgx) < 2:
        return 2

    verb_type = rgx[1]
    # 'a' is tested first, so a second item containing both letters ranks 3.
    if 'a' in verb_type:
        return 3
    if 'm' in verb_type:
        return 1
    return 2
def build_morphology_regex(restriction):
restr_dict = {}
for feature in restriction:
@ -494,6 +507,8 @@ def build_lexis_regex(restriction):
class Restriction:
def __init__(self, restriction_tag):
self.ppb = 4 # polnopomenska beseda (0-4)
if restriction_tag is None:
self.type = RestrictionType.MatchAll
self.matcher = None
@ -505,6 +520,8 @@ class Restriction:
self.type = RestrictionType.Morphology
present, self.matcher = build_morphology_regex(list(restriction_tag))
self.present = " ".join(present)
self.ppb = determine_ppb(present)
elif restriction_type == "lexis":
self.type = RestrictionType.Lexis
self.present, self.matcher = build_lexis_regex(list(restriction_tag))
@ -711,8 +728,29 @@ class SyntacticStructure:
fake_root_component = Component({'cid': '#', 'type': 'other'})
st.components = fake_root_component.find_next(deps, comps, restrs, forms)
st.determine_core2w()
return st
def determine_core2w(self):
    """Re-tag the two best-ranked core components as ``ComponentType.Core2w``.

    Every component in ``self.components`` whose type is
    ``ComponentType.Core`` is scored with the smallest ``ppb`` value found
    among its restrictions (4 when it has none). The candidates are ordered
    by ascending score and the first two have their ``type`` overwritten
    in place. With fewer than two candidates, all of them are re-tagged.

    Raises:
        RuntimeError: when there are more than two candidates and the second
            and third share the same score, so the pair is ambiguous.
    """
    ranked = []
    for component in self.components:
        if component.type == ComponentType.Core:
            # 4 is the ceiling: it is the score of a component whose
            # restrictions never go below it (or that has no restrictions).
            lowest = min([restriction.ppb for restriction in component.restrictions] + [4])
            ranked.append((component, lowest))

    # list.sort is stable, so equally scored components keep their
    # original relative order.
    ranked.sort(key=lambda pair: pair[1])

    if len(ranked) > 2:
        second_score = ranked[1][1]
        third_score = ranked[2][1]
        if second_score == third_score:
            # NOTE(review): the exception is built with two positional
            # arguments, so the text and self.id are separate entries in
            # exc.args rather than one formatted message.
            raise RuntimeError("Cannot determine 2 'jedrna polnopomenska beseda' for", self.id)

    for component, _score in ranked[:2]:
        component.type = ComponentType.Core2w
def add_representation(self, n, rep_el, forms):
assert(rep_el.tag == "representation")
to_add = []