Fixes for MSD length matching and <pc> element matching

Also includes some cleanup and fixes the output formatting.
This commit is contained in:
Ozbolt Menegatti 2019-01-25 11:58:40 +01:00
parent cddeb9c4e4
commit 6a221ae8fe
2 changed files with 42 additions and 41 deletions

View File

@ -1900,4 +1900,5 @@ MSD_TRANSLATE = {
"N": "X", "N": "X",
"Nj": "Xf", "Nj": "Xf",
"Nt": "Xt", "Nt": "Xt",
"Np": "Xp"} "Np": "Xp",
"U": "N"}

80
wani.py
View File

@ -8,6 +8,9 @@ import logging
from msd_translate import MSD_TRANSLATE from msd_translate import MSD_TRANSLATE
MAX_NUM_COMPONENTS = 5
STAVKI = sys.argv[1] STAVKI = sys.argv[1]
STRUKTURE = sys.argv[2] STRUKTURE = sys.argv[2]
FILE_OUT = sys.argv[3] FILE_OUT = sys.argv[3]
@ -215,6 +218,7 @@ def build_morphology_regex(restriction):
rgx = [cat_code] + CATEGORY_BASES[cat_code] rgx = [cat_code] + CATEGORY_BASES[cat_code]
del restr_dict['POS'] del restr_dict['POS']
min_msd_length = 1
for attribute, (value, typ) in restr_dict.items(): for attribute, (value, typ) in restr_dict.items():
index = TAGSET[cat_code].index(attribute.lower()) index = TAGSET[cat_code].index(attribute.lower())
@ -228,7 +232,13 @@ def build_morphology_regex(restriction):
match = "[{}{}]".format("" if typ else "^", match) match = "[{}{}]".format("" if typ else "^", match)
rgx[index + 1] = match rgx[index + 1] = match
if typ:
min_msd_length = max(index + 1, min_msd_length)
def matcher(text): def matcher(text):
if len(text) <= min_msd_length:
return False
for c, r in zip(text, rgx): for c, r in zip(text, rgx):
if not re.match(r, c): if not re.match(r, c):
return False return False
@ -443,16 +453,6 @@ class Component:
logging.debug("SELF MATCH({}: {} -> {}".format(self.idx, word.text, matched)) logging.debug("SELF MATCH({}: {} -> {}".format(self.idx, word.text, matched))
# check with status
# if self.status is ComponentStatus.Optional:
# if not matched:
# # nothing to add, but still good...
# return {}
# elif self.status is ComponentStatus.Forbidden:
# # forbiddent is handled at return stage in _match_next
# # just process normally...
# pass
# recurse to next # recurse to next
if not matched: if not matched:
return None return None
@ -465,12 +465,13 @@ class Component:
# need to get all links that match # need to get all links that match
for next, link in self.next_element: for next, link in self.next_element:
logging.debug("FIND LINKS FOR: {} -> {}".format(self.idx, next.idx)) next_links = word.get_links(link)
logging.debug("FIND LINKS FOR: {} -> {}: #{}".format(self.idx, next.idx, len(next_links)))
to_ret.append([]) to_ret.append([])
# good flag # good flag
good = next.status != ComponentStatus.Required good = next.status != ComponentStatus.Required
for next_word in word.get_links(link): for next_word in next_links:
logging.debug("link: {}: {} -> {}".format(link, word.id, next_word.id)) logging.debug("link: {}: {} -> {}".format(link, word.id, next_word.id))
match = next.match(next_word) match = next.match(next_word)
@ -679,6 +680,11 @@ class Word:
assert(None not in (self.id, self.lemma, self.msd)) assert(None not in (self.id, self.lemma, self.msd))
@staticmethod
def pcWord(pc):
pc.set('lemma', pc.text)
return Word(pc)
def add_link(self, link, to): def add_link(self, link, to):
self.links[link].append(to) self.links[link].append(to)
@ -700,14 +706,11 @@ def load_corpus(filename):
xmlstring = xmlstring.replace(' xml:', ' ') xmlstring = xmlstring.replace(' xml:', ' ')
et = ElementTree.XML(xmlstring) et = ElementTree.XML(xmlstring)
root_words = set()
words = {} words = {}
for w in et.iter("w"): for w in et.iter("w"):
words[w.get('id')] = Word(w) words[w.get('id')] = Word(w)
pcs = set()
for pc in et.iter("pc"): for pc in et.iter("pc"):
pcs.add(pc.get('id')) words[pc.get('id')] = Word.pcWord(pc)
for l in et.iter("link"): for l in et.iter("link"):
if 'dep' in l.keys(): if 'dep' in l.keys():
@ -723,29 +726,20 @@ def load_corpus(filename):
if lfrom in words: if lfrom in words:
if is_root_id(lfrom): if is_root_id(lfrom):
logging.error("NOO: ", lfrom, file=sys.stderr) logging.error("NOO: ", lfrom)
sys.exit(1) sys.exit(1)
if dest in words: if dest in words:
next_word = words[dest] next_word = words[dest]
words[lfrom].add_link(ana, next_word) words[lfrom].add_link(ana, next_word)
else:
# catch links from root logging.error("Unknown id: {}".format(dest))
elif is_root_id(lfrom): sys.exit(1)
root_words.add(dest)
# catch links from <pc> :S
elif lfrom in pcs:
logging.warning(str(("link from <pc>: ", lfrom)))
else: else:
# strange errors, just skip... # strange errors, just skip...
pass pass
no_root_words = [w for k, w in words.items() if k in root_words]
missing = root_words - set(w.id for w in no_root_words)
# what should i do with this I forgot :(
return list(words.values()) return list(words.values())
@ -774,15 +768,16 @@ def main():
logging.debug(" GOT: {}".format(len(mhere))) logging.debug(" GOT: {}".format(len(mhere)))
for match, reason in mhere: for match, reason in mhere:
matches[s.id].append((match, reason)) matches[s.id].append((match, reason))
print("") print("")
header = [ header = ["Structure_ID"]
"Structure_ID", "Component_ID", "Token_ID", "Word_form", for i in range(MAX_NUM_COMPONENTS):
"Lemma", "Msd", "Representative_form_1", "Component_ID", header.extend("C{}_{}".format(i + 1, thd) for thd in
"Token_ID", "Word_form", "Lemma", "Msd", "Representative_form_2", ["Token_ID", "Word_form", "Lemma", "Msd", "Representative_form"])
"Collocation_ID", "Joint_representative_form"] header.extend(["Collocation_ID", "Joint_representative_form"])
csv = [", ".join(header)]
csv = [", ".join(header)]
colocation_ids = {} colocation_ids = {}
for s in structures: for s in structures:
@ -790,28 +785,33 @@ def main():
for m, reason in ms: for m, reason in ms:
colocation_id = [s.id] colocation_id = [s.id]
to_print = [s.id] to_print = []
m_sorted = defaultdict(lambda: None, m.items()) m_sorted = defaultdict(lambda: None, m.items())
for idx, comp in enumerate(s.components): for idx, comp in enumerate(s.components):
idx = str(idx + 1) idx = str(idx + 1)
if idx not in m_sorted: if idx not in m_sorted:
to_print.extend([idx, "", "", "", "", ""]) to_print.extend(["", "", "", "", ""])
else: else:
w = m_sorted[idx] w = m_sorted[idx]
# if comp.render_word(m_sorted[idx]) is not None: # if comp.render_word(m_sorted[idx]) is not None:
if True: if True:
to_print.extend([idx, w.id, w.text, w.lemma, w.msd, ""]) to_print.extend([w.id, w.text, w.lemma, w.msd, ""])
colocation_id.append(w.lemma) colocation_id.append(w.lemma)
colocation_id = tuple(colocation_id) colocation_id = tuple(colocation_id)
if colocation_id in colocation_ids: if colocation_id in colocation_ids:
cid = colocation_ids[colocation_id] cid = colocation_ids[colocation_id]
else: else:
cid = len(colocation_ids) cid = len(colocation_ids) + 1
colocation_ids[colocation_id] = cid colocation_ids[colocation_id] = cid
to_print = [s.id] + to_print
length = 1 + MAX_NUM_COMPONENTS * 5
# make them equal size
to_print.extend([""] * (length - len(to_print)))
to_print.extend([str(cid), ""]) to_print.extend([str(cid), ""])
csv.append(", ".join(to_print)) csv.append(", ".join(to_print))