Patch summary (reviewer-added; this text precedes the first "diff --git" header and is not part of any hunk).

* src/formatter.py: OutNoStatFormatter keeps its representations in a dict keyed by idx instead of
  one concatenated string. content_right() joins them in best_word_order, or in sorted-key order
  when no order is given. OutFormatter.content_right() forwards best_word_order to f1 only.
* src/representation.py: WordFormMsdCR returns False when v + 1 is not a valid index into word_msd,
  before word_msd[v + 1] is read.
* src/wani.py: adds commented-out debugging code only.
* src/writer.py: header and row fields are joined with "," instead of ", ". Adds
  Writer.find_best_word_order(), which returns the most frequent ordering of match keys sorted by
  int_id, and passes that order to formatter.content_right().

NOTE(review): line breaks, blank lines and indentation were reconstructed from the hunk headers'
line counts; confirm them against the repository before applying. The src/wani.py hunk header
implies 13 added lines but only 12 were recoverable, so one empty added line was placed between
the two commented-out blocks as a guess.

diff --git a/src/formatter.py b/src/formatter.py
index 932062f..363ad21 100644
--- a/src/formatter.py
+++ b/src/formatter.py
@@ -34,7 +34,7 @@ class Formatter:
 
 class OutNoStatFormatter(Formatter):
     def additional_init(self):
-        self.representation = ""
+        self.representation = {}
 
     def header_repeat(self):
         return ["Lemma", "Representative_form", "RF_msd", "RF_scenario"]
@@ -49,16 +49,18 @@ class OutNoStatFormatter(Formatter):
 
         rep = representations[idx]
         if rep is None:
-            self.representation += " " + word.lemma
+            self.representation[idx] = word.lemma
             return [word.lemma, word.lemma, "", "lemma_fallback"]
         else:
-            self.representation += " " + rep
+            self.representation[idx] = rep
             return [word.lemma, rep, word.msd, "ok"]
 
-    def content_right(self, freq):
-        rep = re.sub(' +', ' ', self.representation.strip())
+    def content_right(self, freq, best_word_order=None):
+        if best_word_order is None:
+            best_word_order = sorted(self.representation.keys())
+        rep = ' '.join([self.representation[o] for o in best_word_order if o in self.representation])
         result = [rep, str(freq)]
-        self.representation = ""
+        self.representation = {}
         return result
 
     def group(self):
@@ -181,13 +183,13 @@ class OutFormatter(Formatter):
     def header_right(self):
         return self.f1.header_right() + self.f2.header_right()
 
-    def content_repeat(self, words, representations, idx, sidx):
+    def content_repeat(self, words, representations, idx, sidx, best_word_order=None):
         cr1 = self.f1.content_repeat(words, representations, idx, sidx)
         cr2 = self.f2.content_repeat(words, representations, idx, sidx)
         return cr1 + cr2
 
-    def content_right(self, freq):
-        return self.f1.content_right(freq) + self.f2.content_right(freq)
+    def content_right(self, freq, best_word_order=None):
+        return self.f1.content_right(freq, best_word_order) + self.f2.content_right(freq)
 
     def group(self):
         return self.f1.group() and self.f2.group()
diff --git a/src/representation.py b/src/representation.py
index 70d0e66..31bdeff 100644
--- a/src/representation.py
+++ b/src/representation.py
@@ -113,6 +113,8 @@ class WordFormMsdCR(WordFormAnyCR):
         for key, value in selectors.items():
             t = word_msd[0]
             v = TAGSET[t].index(key.lower())
+            if v + 1 >= len(word_msd):
+                return False
             f1 = word_msd[v + 1]
             f2 = CODES[value]
 
diff --git a/src/wani.py b/src/wani.py
index 9eeb9cf..b386697 100644
--- a/src/wani.py
+++ b/src/wani.py
@@ -38,6 +38,19 @@ def match_file(words, structures, postprocessor):
                 colocation_id = tuple(colocation_id)
 
                 matches[s].append((match, colocation_id))
+                # for key, val in matches.items():
+                #     if key.id == '15':
+                #         for el in val:
+                #             if el[0]['1'].lemma == 'biti' and el[0]['2'].lemma == 'po' and el[0]['3'].lemma == 'mnenje':
+                #                 word_id = '.'.join(words[0].id.split('.')[:-1])
+                #                 print(f"ID: {'.'.join(words[0].id.split('.')[:-1])}")
+                #                 print(' '.join([w.text for w in words if '.'.join(w.id.split('.')[:-1]) == word_id]))
+
+                # if s.id == '15':
+                #     if match['1'].lemma == 'biti' and match['2'].lemma == 'po' and match['3'].lemma == 'mnenje':
+                #         word_id = '.'.join(match['1'].id.split('.')[:-1])
+                #         print(f"ID: {word_id}")
+                #         print(' '.join([w.text for w in words if '.'.join(w.id.split('.')[:-1]) == word_id]))
 
     return matches
 
diff --git a/src/writer.py b/src/writer.py
index 8695de4..613cc18 100644
--- a/src/writer.py
+++ b/src/writer.py
@@ -71,21 +71,26 @@ class Writer:
         return sorted(rows, key=key, reverse=self.sort_order)
 
     def write_header(self, file_handler):
-        file_handler.write(", ".join(self.header()) + "\n")
+        file_handler.write(",".join(self.header()) + "\n")
 
     def write_out_worker(self, file_handler, structure, colocation_ids):
         rows = []
         components = structure.components
-
         for match in progress(colocation_ids.get_matches_for(structure), "Writing matches: {}".format(structure.id)):
             if len(match) < self.min_frequency:
                 continue
 
             self.formatter.new_match(match)
 
+            best_word_order = self.find_best_word_order(match.matches)
+
             for words in match.matches:
                 to_write = []
 
+                # TODO instead of enumerate in bottom components first iterate over all words in match.matches, compare
+                # word.int_id and return most popular order and append to it remaining numbers to len(components)
+
+
                 for idx, _comp in enumerate(components):
                     idx = str(idx + 1)
                     if idx not in words:
@@ -100,7 +105,7 @@ class Writer:
                 to_write = [structure.id] + to_write + [match.match_id]
 
                 # header_right
-                to_write.extend(self.formatter.content_right(len(match)))
+                to_write.extend(self.formatter.content_right(len(match), best_word_order))
                 rows.append(to_write)
 
                 if self.formatter.group():
@@ -108,7 +113,7 @@ class Writer:
 
         if rows != []:
             rows = self.sorted_rows(rows)
-            file_handler.write("\n".join([", ".join(row) for row in rows]) + "\n")
+            file_handler.write("\n".join([",".join(row) for row in rows]) + "\n")
             file_handler.flush()
 
     def write_out(self, structures, colocation_ids):
@@ -141,3 +146,11 @@ class Writer:
 
         if not self.multiple_output:
             fp_close(fp)
+
+    @staticmethod
+    def find_best_word_order(matches):
+        orders = {}
+        for words in matches:
+            order = tuple([tup[0] for tup in sorted(words.items(), key=lambda x: x[1].int_id)])
+            orders[order] = orders.get(order, 0) + 1
+        return max(orders, key=orders.get)