determining jppb (for 2 word statistics)
This commit is contained in:
parent
30a5e80569
commit
3a22cd91c3
38
wani.py
38
wani.py
|
@ -430,6 +430,19 @@ def get_level(restriction):
|
|||
raise RuntimeError("Unreachable!")
|
||||
|
||||
|
||||
def determine_ppb(rgx):
    """Return the 'ppb' rank (0-4) of a morphology restriction.

    The rank is derived from the first two entries of ``rgx``:

    * first entry is "A", "N" or "R"              -> 0
    * first entry is "V" and second contains 'm'  -> 1
    * first entry is "V", nothing more specific   -> 2
    * first entry is "V" and second contains 'a'  -> 3
    * any other first entry                       -> 4

    For "V", the 'a' test takes precedence over the 'm' test.

    NOTE(review): the letters look like MULTEXT-East style part-of-speech
    and verb-type codes -- confirm against build_morphology_regex.

    Args:
        rgx: indexable sequence whose first entry is the category and whose
            optional second entry is searched for 'a' / 'm'.

    Returns:
        int: rank between 0 and 4.
    """
    category = rgx[0]

    if category in ("A", "N", "R"):
        return 0

    if category != "V":
        return 4

    # A "V" restriction that carries no second entry used to raise
    # IndexError on rgx[1]; treat it like an unspecified verb type.
    if len(rgx) < 2:
        return 2

    subtype = rgx[1]
    if 'a' in subtype:
        return 3
    if 'm' in subtype:
        return 1
    return 2
|
||||
|
||||
def build_morphology_regex(restriction):
|
||||
restr_dict = {}
|
||||
for feature in restriction:
|
||||
|
@ -494,6 +507,8 @@ def build_lexis_regex(restriction):
|
|||
|
||||
class Restriction:
|
||||
def __init__(self, restriction_tag):
|
||||
self.ppb = 4 # polnopomenska beseda (0-4)
|
||||
|
||||
if restriction_tag is None:
|
||||
self.type = RestrictionType.MatchAll
|
||||
self.matcher = None
|
||||
|
@ -505,6 +520,8 @@ class Restriction:
|
|||
self.type = RestrictionType.Morphology
|
||||
present, self.matcher = build_morphology_regex(list(restriction_tag))
|
||||
self.present = " ".join(present)
|
||||
self.ppb = determine_ppb(present)
|
||||
|
||||
elif restriction_type == "lexis":
|
||||
self.type = RestrictionType.Lexis
|
||||
self.present, self.matcher = build_lexis_regex(list(restriction_tag))
|
||||
|
@ -711,8 +728,29 @@ class SyntacticStructure:
|
|||
|
||||
fake_root_component = Component({'cid': '#', 'type': 'other'})
|
||||
st.components = fake_root_component.find_next(deps, comps, restrs, forms)
|
||||
|
||||
st.determine_core2w()
|
||||
return st
|
||||
|
||||
def determine_core2w(self):
    """Mark the two best-ranked core components as ``ComponentType.Core2w``.

    Every component in ``self.components`` whose type is
    ``ComponentType.Core`` is given a rank: the smallest ``ppb`` among its
    restrictions, capped at 4 (which is also the rank of a component
    without restrictions). Components are ordered by ascending rank and the
    first two have their ``type`` overwritten with ``ComponentType.Core2w``.
    If fewer than two core components exist, all of them are re-typed.

    Raises:
        RuntimeError: when more than two core components exist and the
            second and third share the same rank, so the pair is ambiguous.
    """
    ranked = []
    for component in self.components:
        if component.type != ComponentType.Core:
            continue

        # Seed with 4 so that a component without restrictions ranks 4
        # and no restriction can push the rank above 4.
        rank = min([4] + [r.ppb for r in component.restrictions])
        ranked.append((component, rank))

    # list.sort is stable, so equally ranked components keep their order.
    ranked.sort(key=lambda pair: pair[1])

    if len(ranked) > 2 and ranked[1][1] == ranked[2][1]:
        # Build one message string; passing self.id as a second exception
        # argument made the error print as a tuple.
        raise RuntimeError(
            "Cannot determine 2 'jedrna polnopomenska beseda' for {}".format(self.id))

    for component, _ in ranked[:2]:
        component.type = ComponentType.Core2w
|
||||
|
||||
def add_representation(self, n, rep_el, forms):
|
||||
assert(rep_el.tag == "representation")
|
||||
to_add = []
|
||||
|
|
Loading…
Reference in New Issue
Block a user