Changed output document to reflect most frequent word order
This commit is contained in:
parent
777791ad1e
commit
de3e52c57c
|
@ -34,7 +34,7 @@ class Formatter:
|
||||||
|
|
||||||
class OutNoStatFormatter(Formatter):
|
class OutNoStatFormatter(Formatter):
|
||||||
def additional_init(self):
|
def additional_init(self):
|
||||||
self.representation = ""
|
self.representation = {}
|
||||||
|
|
||||||
def header_repeat(self):
|
def header_repeat(self):
|
||||||
return ["Lemma", "Representative_form", "RF_msd", "RF_scenario"]
|
return ["Lemma", "Representative_form", "RF_msd", "RF_scenario"]
|
||||||
|
@ -49,16 +49,18 @@ class OutNoStatFormatter(Formatter):
|
||||||
|
|
||||||
rep = representations[idx]
|
rep = representations[idx]
|
||||||
if rep is None:
|
if rep is None:
|
||||||
self.representation += " " + word.lemma
|
self.representation[idx] = word.lemma
|
||||||
return [word.lemma, word.lemma, "", "lemma_fallback"]
|
return [word.lemma, word.lemma, "", "lemma_fallback"]
|
||||||
else:
|
else:
|
||||||
self.representation += " " + rep
|
self.representation[idx] = rep
|
||||||
return [word.lemma, rep, word.msd, "ok"]
|
return [word.lemma, rep, word.msd, "ok"]
|
||||||
|
|
||||||
def content_right(self, freq):
|
def content_right(self, freq, best_word_order=None):
|
||||||
rep = re.sub(' +', ' ', self.representation.strip())
|
if best_word_order is None:
|
||||||
|
best_word_order = sorted(self.representation.keys())
|
||||||
|
rep = ' '.join([self.representation[o] for o in best_word_order if o in self.representation])
|
||||||
result = [rep, str(freq)]
|
result = [rep, str(freq)]
|
||||||
self.representation = ""
|
self.representation = {}
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def group(self):
|
def group(self):
|
||||||
|
@ -181,13 +183,13 @@ class OutFormatter(Formatter):
|
||||||
def header_right(self):
|
def header_right(self):
|
||||||
return self.f1.header_right() + self.f2.header_right()
|
return self.f1.header_right() + self.f2.header_right()
|
||||||
|
|
||||||
def content_repeat(self, words, representations, idx, sidx):
|
def content_repeat(self, words, representations, idx, sidx, best_word_order=None):
|
||||||
cr1 = self.f1.content_repeat(words, representations, idx, sidx)
|
cr1 = self.f1.content_repeat(words, representations, idx, sidx)
|
||||||
cr2 = self.f2.content_repeat(words, representations, idx, sidx)
|
cr2 = self.f2.content_repeat(words, representations, idx, sidx)
|
||||||
return cr1 + cr2
|
return cr1 + cr2
|
||||||
|
|
||||||
def content_right(self, freq):
|
def content_right(self, freq, best_word_order=None):
|
||||||
return self.f1.content_right(freq) + self.f2.content_right(freq)
|
return self.f1.content_right(freq, best_word_order) + self.f2.content_right(freq)
|
||||||
|
|
||||||
def group(self):
|
def group(self):
|
||||||
return self.f1.group() and self.f2.group()
|
return self.f1.group() and self.f2.group()
|
||||||
|
|
|
@ -113,6 +113,8 @@ class WordFormMsdCR(WordFormAnyCR):
|
||||||
for key, value in selectors.items():
|
for key, value in selectors.items():
|
||||||
t = word_msd[0]
|
t = word_msd[0]
|
||||||
v = TAGSET[t].index(key.lower())
|
v = TAGSET[t].index(key.lower())
|
||||||
|
if v + 1 >= len(word_msd):
|
||||||
|
return False
|
||||||
f1 = word_msd[v + 1]
|
f1 = word_msd[v + 1]
|
||||||
f2 = CODES[value]
|
f2 = CODES[value]
|
||||||
|
|
||||||
|
|
13
src/wani.py
13
src/wani.py
|
@ -38,6 +38,19 @@ def match_file(words, structures, postprocessor):
|
||||||
colocation_id = tuple(colocation_id)
|
colocation_id = tuple(colocation_id)
|
||||||
|
|
||||||
matches[s].append((match, colocation_id))
|
matches[s].append((match, colocation_id))
|
||||||
|
# for key, val in matches.items():
|
||||||
|
# if key.id == '15':
|
||||||
|
# for el in val:
|
||||||
|
# if el[0]['1'].lemma == 'biti' and el[0]['2'].lemma == 'po' and el[0]['3'].lemma == 'mnenje':
|
||||||
|
# word_id = '.'.join(words[0].id.split('.')[:-1])
|
||||||
|
# print(f"ID: {'.'.join(words[0].id.split('.')[:-1])}")
|
||||||
|
# print(' '.join([w.text for w in words if '.'.join(w.id.split('.')[:-1]) == word_id]))
|
||||||
|
|
||||||
|
# if s.id == '15':
|
||||||
|
# if match['1'].lemma == 'biti' and match['2'].lemma == 'po' and match['3'].lemma == 'mnenje':
|
||||||
|
# word_id = '.'.join(match['1'].id.split('.')[:-1])
|
||||||
|
# print(f"ID: {word_id}")
|
||||||
|
# print(' '.join([w.text for w in words if '.'.join(w.id.split('.')[:-1]) == word_id]))
|
||||||
|
|
||||||
return matches
|
return matches
|
||||||
|
|
||||||
|
|
|
@ -71,21 +71,26 @@ class Writer:
|
||||||
return sorted(rows, key=key, reverse=self.sort_order)
|
return sorted(rows, key=key, reverse=self.sort_order)
|
||||||
|
|
||||||
def write_header(self, file_handler):
|
def write_header(self, file_handler):
|
||||||
file_handler.write(", ".join(self.header()) + "\n")
|
file_handler.write(",".join(self.header()) + "\n")
|
||||||
|
|
||||||
def write_out_worker(self, file_handler, structure, colocation_ids):
|
def write_out_worker(self, file_handler, structure, colocation_ids):
|
||||||
rows = []
|
rows = []
|
||||||
components = structure.components
|
components = structure.components
|
||||||
|
|
||||||
for match in progress(colocation_ids.get_matches_for(structure), "Writing matches: {}".format(structure.id)):
|
for match in progress(colocation_ids.get_matches_for(structure), "Writing matches: {}".format(structure.id)):
|
||||||
if len(match) < self.min_frequency:
|
if len(match) < self.min_frequency:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
self.formatter.new_match(match)
|
self.formatter.new_match(match)
|
||||||
|
|
||||||
|
best_word_order = self.find_best_word_order(match.matches)
|
||||||
|
|
||||||
for words in match.matches:
|
for words in match.matches:
|
||||||
to_write = []
|
to_write = []
|
||||||
|
|
||||||
|
# TODO: instead of enumerating the components below, first iterate over all words in match.matches, compare their
|
||||||
|
# word.int_id values, pick the most frequent order, and append any remaining indices up to len(components)
|
||||||
|
|
||||||
|
|
||||||
for idx, _comp in enumerate(components):
|
for idx, _comp in enumerate(components):
|
||||||
idx = str(idx + 1)
|
idx = str(idx + 1)
|
||||||
if idx not in words:
|
if idx not in words:
|
||||||
|
@ -100,7 +105,7 @@ class Writer:
|
||||||
to_write = [structure.id] + to_write + [match.match_id]
|
to_write = [structure.id] + to_write + [match.match_id]
|
||||||
|
|
||||||
# header_right
|
# header_right
|
||||||
to_write.extend(self.formatter.content_right(len(match)))
|
to_write.extend(self.formatter.content_right(len(match), best_word_order))
|
||||||
rows.append(to_write)
|
rows.append(to_write)
|
||||||
|
|
||||||
if self.formatter.group():
|
if self.formatter.group():
|
||||||
|
@ -108,7 +113,7 @@ class Writer:
|
||||||
|
|
||||||
if rows != []:
|
if rows != []:
|
||||||
rows = self.sorted_rows(rows)
|
rows = self.sorted_rows(rows)
|
||||||
file_handler.write("\n".join([", ".join(row) for row in rows]) + "\n")
|
file_handler.write("\n".join([",".join(row) for row in rows]) + "\n")
|
||||||
file_handler.flush()
|
file_handler.flush()
|
||||||
|
|
||||||
def write_out(self, structures, colocation_ids):
|
def write_out(self, structures, colocation_ids):
|
||||||
|
@ -141,3 +146,11 @@ class Writer:
|
||||||
|
|
||||||
if not self.multiple_output:
|
if not self.multiple_output:
|
||||||
fp_close(fp)
|
fp_close(fp)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def find_best_word_order(matches):
|
||||||
|
orders = {}
|
||||||
|
for words in matches:
|
||||||
|
order = tuple([tup[0] for tup in sorted(words.items(), key=lambda x: x[1].int_id)])
|
||||||
|
orders[order] = orders.get(order, 0) + 1
|
||||||
|
return max(orders, key=orders.get)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user