Fixes for MSD length matching and <pc> matching
Also includes some cleanup and fixes to the output formatting
This commit is contained in:
parent
cddeb9c4e4
commit
6a221ae8fe
|
@ -1900,4 +1900,5 @@ MSD_TRANSLATE = {
|
||||||
"N": "X",
|
"N": "X",
|
||||||
"Nj": "Xf",
|
"Nj": "Xf",
|
||||||
"Nt": "Xt",
|
"Nt": "Xt",
|
||||||
"Np": "Xp"}
|
"Np": "Xp",
|
||||||
|
"U": "N"}
|
||||||
|
|
80
wani.py
80
wani.py
|
@ -8,6 +8,9 @@ import logging
|
||||||
from msd_translate import MSD_TRANSLATE
|
from msd_translate import MSD_TRANSLATE
|
||||||
|
|
||||||
|
|
||||||
|
MAX_NUM_COMPONENTS = 5
|
||||||
|
|
||||||
|
|
||||||
STAVKI = sys.argv[1]
|
STAVKI = sys.argv[1]
|
||||||
STRUKTURE = sys.argv[2]
|
STRUKTURE = sys.argv[2]
|
||||||
FILE_OUT = sys.argv[3]
|
FILE_OUT = sys.argv[3]
|
||||||
|
@ -215,6 +218,7 @@ def build_morphology_regex(restriction):
|
||||||
rgx = [cat_code] + CATEGORY_BASES[cat_code]
|
rgx = [cat_code] + CATEGORY_BASES[cat_code]
|
||||||
|
|
||||||
del restr_dict['POS']
|
del restr_dict['POS']
|
||||||
|
min_msd_length = 1
|
||||||
|
|
||||||
for attribute, (value, typ) in restr_dict.items():
|
for attribute, (value, typ) in restr_dict.items():
|
||||||
index = TAGSET[cat_code].index(attribute.lower())
|
index = TAGSET[cat_code].index(attribute.lower())
|
||||||
|
@ -228,7 +232,13 @@ def build_morphology_regex(restriction):
|
||||||
match = "[{}{}]".format("" if typ else "^", match)
|
match = "[{}{}]".format("" if typ else "^", match)
|
||||||
rgx[index + 1] = match
|
rgx[index + 1] = match
|
||||||
|
|
||||||
|
if typ:
|
||||||
|
min_msd_length = max(index + 1, min_msd_length)
|
||||||
|
|
||||||
def matcher(text):
|
def matcher(text):
|
||||||
|
if len(text) <= min_msd_length:
|
||||||
|
return False
|
||||||
|
|
||||||
for c, r in zip(text, rgx):
|
for c, r in zip(text, rgx):
|
||||||
if not re.match(r, c):
|
if not re.match(r, c):
|
||||||
return False
|
return False
|
||||||
|
@ -443,16 +453,6 @@ class Component:
|
||||||
|
|
||||||
logging.debug("SELF MATCH({}: {} -> {}".format(self.idx, word.text, matched))
|
logging.debug("SELF MATCH({}: {} -> {}".format(self.idx, word.text, matched))
|
||||||
|
|
||||||
# check with status
|
|
||||||
# if self.status is ComponentStatus.Optional:
|
|
||||||
# if not matched:
|
|
||||||
# # nothing to add, but still good...
|
|
||||||
# return {}
|
|
||||||
# elif self.status is ComponentStatus.Forbidden:
|
|
||||||
# # forbiddent is handled at return stage in _match_next
|
|
||||||
# # just process normally...
|
|
||||||
# pass
|
|
||||||
|
|
||||||
# recurse to next
|
# recurse to next
|
||||||
if not matched:
|
if not matched:
|
||||||
return None
|
return None
|
||||||
|
@ -465,12 +465,13 @@ class Component:
|
||||||
|
|
||||||
# need to get all links that match
|
# need to get all links that match
|
||||||
for next, link in self.next_element:
|
for next, link in self.next_element:
|
||||||
logging.debug("FIND LINKS FOR: {} -> {}".format(self.idx, next.idx))
|
next_links = word.get_links(link)
|
||||||
|
logging.debug("FIND LINKS FOR: {} -> {}: #{}".format(self.idx, next.idx, len(next_links)))
|
||||||
to_ret.append([])
|
to_ret.append([])
|
||||||
|
|
||||||
# good flag
|
# good flag
|
||||||
good = next.status != ComponentStatus.Required
|
good = next.status != ComponentStatus.Required
|
||||||
for next_word in word.get_links(link):
|
for next_word in next_links:
|
||||||
logging.debug("link: {}: {} -> {}".format(link, word.id, next_word.id))
|
logging.debug("link: {}: {} -> {}".format(link, word.id, next_word.id))
|
||||||
match = next.match(next_word)
|
match = next.match(next_word)
|
||||||
|
|
||||||
|
@ -679,6 +680,11 @@ class Word:
|
||||||
|
|
||||||
assert(None not in (self.id, self.lemma, self.msd))
|
assert(None not in (self.id, self.lemma, self.msd))
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def pcWord(pc):
|
||||||
|
pc.set('lemma', pc.text)
|
||||||
|
return Word(pc)
|
||||||
|
|
||||||
def add_link(self, link, to):
|
def add_link(self, link, to):
|
||||||
self.links[link].append(to)
|
self.links[link].append(to)
|
||||||
|
|
||||||
|
@ -700,14 +706,11 @@ def load_corpus(filename):
|
||||||
xmlstring = xmlstring.replace(' xml:', ' ')
|
xmlstring = xmlstring.replace(' xml:', ' ')
|
||||||
et = ElementTree.XML(xmlstring)
|
et = ElementTree.XML(xmlstring)
|
||||||
|
|
||||||
root_words = set()
|
|
||||||
words = {}
|
words = {}
|
||||||
for w in et.iter("w"):
|
for w in et.iter("w"):
|
||||||
words[w.get('id')] = Word(w)
|
words[w.get('id')] = Word(w)
|
||||||
|
|
||||||
pcs = set()
|
|
||||||
for pc in et.iter("pc"):
|
for pc in et.iter("pc"):
|
||||||
pcs.add(pc.get('id'))
|
words[pc.get('id')] = Word.pcWord(pc)
|
||||||
|
|
||||||
for l in et.iter("link"):
|
for l in et.iter("link"):
|
||||||
if 'dep' in l.keys():
|
if 'dep' in l.keys():
|
||||||
|
@ -723,29 +726,20 @@ def load_corpus(filename):
|
||||||
|
|
||||||
if lfrom in words:
|
if lfrom in words:
|
||||||
if is_root_id(lfrom):
|
if is_root_id(lfrom):
|
||||||
logging.error("NOO: ", lfrom, file=sys.stderr)
|
logging.error("NOO: ", lfrom)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
if dest in words:
|
if dest in words:
|
||||||
next_word = words[dest]
|
next_word = words[dest]
|
||||||
words[lfrom].add_link(ana, next_word)
|
words[lfrom].add_link(ana, next_word)
|
||||||
|
else:
|
||||||
# catch links from root
|
logging.error("Unknown id: {}".format(dest))
|
||||||
elif is_root_id(lfrom):
|
sys.exit(1)
|
||||||
root_words.add(dest)
|
|
||||||
|
|
||||||
# catch links from <pc> :S
|
|
||||||
elif lfrom in pcs:
|
|
||||||
logging.warning(str(("link from <pc>: ", lfrom)))
|
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# strange errors, just skip...
|
# strange errors, just skip...
|
||||||
pass
|
pass
|
||||||
|
|
||||||
no_root_words = [w for k, w in words.items() if k in root_words]
|
|
||||||
missing = root_words - set(w.id for w in no_root_words)
|
|
||||||
# what should i do with this I forgot :(
|
|
||||||
|
|
||||||
return list(words.values())
|
return list(words.values())
|
||||||
|
|
||||||
|
|
||||||
|
@ -774,15 +768,16 @@ def main():
|
||||||
logging.debug(" GOT: {}".format(len(mhere)))
|
logging.debug(" GOT: {}".format(len(mhere)))
|
||||||
for match, reason in mhere:
|
for match, reason in mhere:
|
||||||
matches[s.id].append((match, reason))
|
matches[s.id].append((match, reason))
|
||||||
|
|
||||||
print("")
|
print("")
|
||||||
|
|
||||||
header = [
|
header = ["Structure_ID"]
|
||||||
"Structure_ID", "Component_ID", "Token_ID", "Word_form",
|
for i in range(MAX_NUM_COMPONENTS):
|
||||||
"Lemma", "Msd", "Representative_form_1", "Component_ID",
|
header.extend("C{}_{}".format(i + 1, thd) for thd in
|
||||||
"Token_ID", "Word_form", "Lemma", "Msd", "Representative_form_2",
|
["Token_ID", "Word_form", "Lemma", "Msd", "Representative_form"])
|
||||||
"Collocation_ID", "Joint_representative_form"]
|
header.extend(["Collocation_ID", "Joint_representative_form"])
|
||||||
csv = [", ".join(header)]
|
|
||||||
|
|
||||||
|
csv = [", ".join(header)]
|
||||||
colocation_ids = {}
|
colocation_ids = {}
|
||||||
|
|
||||||
for s in structures:
|
for s in structures:
|
||||||
|
@ -790,28 +785,33 @@ def main():
|
||||||
|
|
||||||
for m, reason in ms:
|
for m, reason in ms:
|
||||||
colocation_id = [s.id]
|
colocation_id = [s.id]
|
||||||
to_print = [s.id]
|
to_print = []
|
||||||
|
|
||||||
m_sorted = defaultdict(lambda: None, m.items())
|
m_sorted = defaultdict(lambda: None, m.items())
|
||||||
for idx, comp in enumerate(s.components):
|
for idx, comp in enumerate(s.components):
|
||||||
idx = str(idx + 1)
|
idx = str(idx + 1)
|
||||||
if idx not in m_sorted:
|
if idx not in m_sorted:
|
||||||
to_print.extend([idx, "", "", "", "", ""])
|
to_print.extend(["", "", "", "", ""])
|
||||||
else:
|
else:
|
||||||
w = m_sorted[idx]
|
w = m_sorted[idx]
|
||||||
# if comp.render_word(m_sorted[idx]) is not None:
|
# if comp.render_word(m_sorted[idx]) is not None:
|
||||||
if True:
|
if True:
|
||||||
to_print.extend([idx, w.id, w.text, w.lemma, w.msd, ""])
|
to_print.extend([w.id, w.text, w.lemma, w.msd, ""])
|
||||||
colocation_id.append(w.lemma)
|
colocation_id.append(w.lemma)
|
||||||
|
|
||||||
colocation_id = tuple(colocation_id)
|
colocation_id = tuple(colocation_id)
|
||||||
if colocation_id in colocation_ids:
|
if colocation_id in colocation_ids:
|
||||||
cid = colocation_ids[colocation_id]
|
cid = colocation_ids[colocation_id]
|
||||||
else:
|
else:
|
||||||
cid = len(colocation_ids)
|
cid = len(colocation_ids) + 1
|
||||||
colocation_ids[colocation_id] = cid
|
colocation_ids[colocation_id] = cid
|
||||||
|
|
||||||
|
to_print = [s.id] + to_print
|
||||||
|
length = 1 + MAX_NUM_COMPONENTS * 5
|
||||||
|
# make them equal size
|
||||||
|
to_print.extend([""] * (length - len(to_print)))
|
||||||
to_print.extend([str(cid), ""])
|
to_print.extend([str(cid), ""])
|
||||||
|
|
||||||
csv.append(", ".join(to_print))
|
csv.append(", ".join(to_print))
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user