determining jppb (for 2 word statistics)
This commit is contained in:
parent
30a5e80569
commit
3a22cd91c3
38
wani.py
38
wani.py
|
@ -430,6 +430,19 @@ def get_level(restriction):
|
||||||
raise RuntimeError("Unreachable!")
|
raise RuntimeError("Unreachable!")
|
||||||
|
|
||||||
|
|
||||||
|
def determine_ppb(rgx):
    """Return the ppb rank (0-4) implied by a morphology restriction.

    "ppb" abbreviates the Slovene "polnopomenska beseda" (presumably
    "content word" -- confirm with the domain owners).  Lower ranks are
    preferred when the two core words of a structure are picked.

    Args:
        rgx: sequence of feature strings describing the restriction; the
            first item is the category code ("A", "N", "R", "V", ...) and,
            for verbs, the second item is inspected for the characters
            'a' and 'm' (presumably auxiliary / main verb type -- confirm).

    Returns:
        0 for categories "A", "N" and "R"; for "V": 3 if the second item
        contains 'a', 1 if it contains 'm', otherwise 2 (also when there is
        no second item at all); 4 for every other category and for an
        empty sequence.
    """
    # Nothing to inspect: fall back to the lowest priority, which is also
    # the default a Restriction starts with.
    if not rgx:
        return 4

    category = rgx[0]
    if category in ("A", "N", "R"):
        return 0
    if category != "V":
        return 4

    # A verb restriction may carry only the category; indexing rgx[1]
    # unconditionally would raise IndexError, so treat the verb as having
    # an unspecified type.
    if len(rgx) == 1:
        return 2

    verb_type = rgx[1]
    # Order matters: 'a' is checked before 'm', exactly as before.
    if 'a' in verb_type:
        return 3
    if 'm' in verb_type:
        return 1
    return 2
|
||||||
|
|
||||||
def build_morphology_regex(restriction):
|
def build_morphology_regex(restriction):
|
||||||
restr_dict = {}
|
restr_dict = {}
|
||||||
for feature in restriction:
|
for feature in restriction:
|
||||||
|
@ -494,6 +507,8 @@ def build_lexis_regex(restriction):
|
||||||
|
|
||||||
class Restriction:
|
class Restriction:
|
||||||
def __init__(self, restriction_tag):
|
def __init__(self, restriction_tag):
|
||||||
|
self.ppb = 4 # polnopomenska beseda (0-4)
|
||||||
|
|
||||||
if restriction_tag is None:
|
if restriction_tag is None:
|
||||||
self.type = RestrictionType.MatchAll
|
self.type = RestrictionType.MatchAll
|
||||||
self.matcher = None
|
self.matcher = None
|
||||||
|
@ -505,6 +520,8 @@ class Restriction:
|
||||||
self.type = RestrictionType.Morphology
|
self.type = RestrictionType.Morphology
|
||||||
present, self.matcher = build_morphology_regex(list(restriction_tag))
|
present, self.matcher = build_morphology_regex(list(restriction_tag))
|
||||||
self.present = " ".join(present)
|
self.present = " ".join(present)
|
||||||
|
self.ppb = determine_ppb(present)
|
||||||
|
|
||||||
elif restriction_type == "lexis":
|
elif restriction_type == "lexis":
|
||||||
self.type = RestrictionType.Lexis
|
self.type = RestrictionType.Lexis
|
||||||
self.present, self.matcher = build_lexis_regex(list(restriction_tag))
|
self.present, self.matcher = build_lexis_regex(list(restriction_tag))
|
||||||
|
@ -711,7 +728,28 @@ class SyntacticStructure:
|
||||||
|
|
||||||
fake_root_component = Component({'cid': '#', 'type': 'other'})
|
fake_root_component = Component({'cid': '#', 'type': 'other'})
|
||||||
st.components = fake_root_component.find_next(deps, comps, restrs, forms)
|
st.components = fake_root_component.find_next(deps, comps, restrs, forms)
|
||||||
|
|
||||||
|
st.determine_core2w()
|
||||||
return st
|
return st
|
||||||
|
|
||||||
|
def determine_core2w(self):
    """Promote the two best-ranked core components to ``ComponentType.Core2w``.

    Every component in ``self.components`` whose type is
    ``ComponentType.Core`` is ranked by the smallest ``ppb`` value among
    its restrictions (capped at 4, which is also the rank used when a
    component has no restrictions).  The two components with the lowest
    rank get their ``type`` changed in place.  If fewer than two core
    components exist, only those are changed.

    Raises:
        RuntimeError: when more than two core components exist and the
            second and third best ranks are equal, so the pair cannot be
            chosen unambiguously.
    """
    ranked = []
    for component in self.components:
        if component.type != ComponentType.Core:
            continue

        # Seed with 4 so the rank never exceeds it and an empty
        # restriction list still yields a value.
        rank = min([4] + [r.ppb for r in component.restrictions])
        ranked.append((component, rank))

    # list.sort is stable, so components with equal rank keep their
    # original relative order.
    ranked.sort(key=lambda pair: pair[1])

    if len(ranked) > 2 and ranked[1][1] == ranked[2][1]:
        # Format the id into the message; passing it as a second positional
        # argument made str(error) render as a tuple.
        raise RuntimeError(
            "Cannot determine 2 'jedrna polnopomenska beseda' for {}".format(self.id))

    for component, _ in ranked[:2]:
        component.type = ComponentType.Core2w
|
||||||
|
|
||||||
def add_representation(self, n, rep_el, forms):
|
def add_representation(self, n, rep_el, forms):
|
||||||
assert(rep_el.tag == "representation")
|
assert(rep_el.tag == "representation")
|
||||||
|
|
Loading…
Reference in New Issue
Block a user