From 06435aa3a2237dfe381ffcaa2cdee2e42c27cb4f Mon Sep 17 00:00:00 2001 From: Luka Date: Fri, 9 Oct 2020 15:18:52 +0200 Subject: [PATCH] Added options for "modra" --- luscenje_struktur/component.py | 27 +++++++----------------- luscenje_struktur/loader.py | 10 +++++++-- luscenje_struktur/msd_translate.py | 2 +- luscenje_struktur/restriction.py | 9 +------- luscenje_struktur/syntactic_structure.py | 14 +++++++++--- luscenje_struktur/word.py | 8 ++++++- luscenje_struktur/word_stats.py | 2 ++ luscenje_struktur/writer.py | 27 ++++++++++++++---------- wani.py | 2 ++ 9 files changed, 56 insertions(+), 45 deletions(-) diff --git a/luscenje_struktur/component.py b/luscenje_struktur/component.py index c162278..b5ce25a 100644 --- a/luscenje_struktur/component.py +++ b/luscenje_struktur/component.py @@ -38,7 +38,7 @@ class Component: self.status = status self.name = name self.idx = idx - self.restrictions = [] + self.restrictions = [Restriction(None)] if 'restriction' in info else [] self.next_element = [] self.representation = [] self.selection = {} @@ -104,24 +104,13 @@ class Component: if len(cmatch) == 0: continue - # if more than one match found for particular component - elif len(cmatch) > 1: - # if more than one match in multiple components, NOPE! - if len(to_ret) > 1: - logging.warning("Strange multiple match: {}".format( - str([w.id for w in cmatch[0].values()]))) - - for tr in to_ret: - tr.update(cmatch[0]) - continue - - # yeah, so we have found more than one match, => - # more than one element in to_ret - to_ret = [{**dict(to_ret[0]), **m} for m in cmatch] - - else: - for tr in to_ret: - tr.update(cmatch[0]) + # create new to_ret, to which extend all results + new_to_ret = [] + for tr in to_ret: + # make sure that one word is not used twice in same to_ret + new_to_ret.extend([{**dict(tr), **m} for m in cmatch if any([m_v not in dict(tr).values() for m_v in m.values()])]) + to_ret = new_to_ret + del new_to_ret return to_ret diff --git a/luscenje_struktur/loader.py b/luscenje_struktur/loader.py index 77fdd34..8da969b 100644 --- a/luscenje_struktur/loader.py +++ b/luscenje_struktur/loader.py @@ -102,6 +102,8 @@ def load_csv(filename, compressed): line_split = line_fixed.split("\t") if line_split[1] == "1" and len(words) > 0: + # adding fake word + words['0'] = Word('', '', '0', '', False, True) sentence_end(bad_sentence) bad_sentence = False links = [] @@ -114,9 +116,11 @@ def load_csv(filename, compressed): full_id = "{}.{}".format(sid, wid) words[wid] = Word(lemma, msd, full_id, text, True) - if link_src != '0': - links.append((link_src, wid, link_type)) + # if link_src != '0': + links.append((link_src, wid, link_type)) + # adding fake word + words['0'] = Word('', '', '0', '', False, True) sentence_end(bad_sentence) return result @@ -189,6 +193,8 @@ def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag): words = {} sentences = list(et.iter('s')) for sentence in progress(sentences, "load-text"): + # create fake root word + words[sentence.get('id')] = Word.fake_root_word(sentence.get('id')) for w in sentence.iter("w"): words[w.get('id')] = Word.from_xml(w, do_msd_translate) for pc in sentence.iter(pc_tag): diff --git a/luscenje_struktur/msd_translate.py b/luscenje_struktur/msd_translate.py index 24ab90b..6b526e6 100644 --- a/luscenje_struktur/msd_translate.py +++ b/luscenje_struktur/msd_translate.py @@ -1911,4 +1911,4 @@ MSD_TRANSLATE = { "Ne": "Ne", "Nh": "Nh", "Na": "Na", - "U": "N"} + "U": "Z"} diff --git a/luscenje_struktur/restriction.py b/luscenje_struktur/restriction.py index afd1da4..af4dd68 100644 --- a/luscenje_struktur/restriction.py +++ b/luscenje_struktur/restriction.py @@ -95,17 +95,10 @@ class MorphologyRegex: self.re_objects.append([re.compile(r) for r in rgx]) self.rgxs.append(rgx) self.min_msd_lengths.append(min_msd_length) - - # self.re_objects = [re.compile(r) for r in rgx] - # self.rgx = rgx def __call__(self, text): - # if len(text) <= self.min_msd_length: - # return False - # if len(self.rgxs[0]) > 1 and len(self.rgxs) > 1: - # a = 1 for i, re_object in enumerate(self.re_objects): - if len(text) <= self.min_msd_lengths[i]: + if len(text) < self.min_msd_lengths[i]: continue match = True diff --git a/luscenje_struktur/syntactic_structure.py b/luscenje_struktur/syntactic_structure.py index aa6ff7e..49a5a81 100644 --- a/luscenje_struktur/syntactic_structure.py +++ b/luscenje_struktur/syntactic_structure.py @@ -10,6 +10,7 @@ class SyntacticStructure: self.id = None self.lbs = None self.components = [] + self.fake_root_included = False @staticmethod def from_xml(xml, no_stats): @@ -44,8 +45,14 @@ class SyntacticStructure: raise NotImplementedError("Unknown definition: {} in structure {}" .format(el.tag, st.id)) - fake_root_component = Component({'cid': '#', 'type': 'other'}) - st.components = fake_root_component.find_next(deps, comps, restrs, forms) + fake_root_component = Component({'cid': '#', 'type': 'other', 'restriction': None}) + fake_root_component_children = fake_root_component.find_next(deps, comps, restrs, forms) + # all dep with value modra point to artificial root - fake_root_component + if any([dep[2] == 'modra' for dep in deps]): + st.fake_root_included = True + st.components = [fake_root_component] + fake_root_component_children + else: + st.components = fake_root_component_children if not no_stats: st.determine_core2w() @@ -112,7 +119,8 @@ def build_structures(args): continue structures.append(to_append) - max_num_components = max(max_num_components, len(to_append.components)) + to_append_len = len(to_append.components) if not to_append.fake_root_included else len(to_append.components) - 1 + max_num_components = max(max_num_components, to_append_len) lemma_features = get_lemma_features(et) return structures, lemma_features, max_num_components diff --git a/luscenje_struktur/word.py b/luscenje_struktur/word.py index d168070..bf5c423 100644 --- a/luscenje_struktur/word.py +++ b/luscenje_struktur/word.py @@ -32,13 +32,14 @@ class WordDummy: class Word: - def __init__(self, lemma, msd, wid, text, do_msd_translate): + def __init__(self, lemma, msd, wid, text, do_msd_translate, fake_word=False): self.lemma = lemma self.msd = MSD_TRANSLATE[msd] if do_msd_translate else msd self.id = wid self.idi = None self.text = text self.glue = '' + self.fake_word = fake_word self.links = defaultdict(list) @@ -74,6 +75,11 @@ class Word: pc.set('msd', "N" if do_msd_translate else "U") return Word.from_xml(pc, do_msd_translate) + @staticmethod + def fake_root_word(sentence_id): + wid = sentence_id + return Word('', '', wid, '', False, True) + def add_link(self, link, to): self.links[link].append(to) diff --git a/luscenje_struktur/word_stats.py b/luscenje_struktur/word_stats.py index 7ca87a4..64472e5 100644 --- a/luscenje_struktur/word_stats.py +++ b/luscenje_struktur/word_stats.py @@ -25,6 +25,8 @@ class WordStats: def add_words(self, words): for w in progress(words, "adding-words"): + if w.fake_word: + continue params = {'lemma': w.lemma, 'msd': w.msd, 'text': w.text} res = self.db.execute("""UPDATE UniqWords SET frequency=frequency + 1 WHERE lemma=:lemma AND msd=:msd AND text=:text""", params) diff --git a/luscenje_struktur/writer.py b/luscenje_struktur/writer.py index c6c14b8..7e9a035 100644 --- a/luscenje_struktur/writer.py +++ b/luscenje_struktur/writer.py @@ -16,23 +16,23 @@ class Writer: @staticmethod def make_output_writer(args, num_components, colocation_ids, word_renderer): params = Writer.other_params(args) - return Writer(args.out, num_components, OutFormatter(colocation_ids, word_renderer), args.collocation_sentence_map_dest, params) + return Writer(args.out, num_components, OutFormatter(colocation_ids, word_renderer), args.collocation_sentence_map_dest, params, args.separator) @staticmethod def make_output_no_stat_writer(args, num_components, colocation_ids, word_renderer): params = Writer.other_params(args) - return Writer(args.out_no_stat, num_components, OutNoStatFormatter(colocation_ids, word_renderer), args.collocation_sentence_map_dest, params) + return Writer(args.out_no_stat, num_components, OutNoStatFormatter(colocation_ids, word_renderer), args.collocation_sentence_map_dest, params, args.separator) @staticmethod def make_all_writer(args, num_components, colocation_ids, word_renderer): - return Writer(args.all, num_components, AllFormatter(colocation_ids, word_renderer), args.collocation_sentence_map_dest, None) + return Writer(args.all, num_components, AllFormatter(colocation_ids, word_renderer), args.collocation_sentence_map_dest, None, args.separator) @staticmethod def make_stats_writer(args, num_components, colocation_ids, word_renderer): params = Writer.other_params(args) - return Writer(args.stats, num_components, StatsFormatter(colocation_ids, word_renderer), args.collocation_sentence_map_dest, params) + return Writer(args.stats, num_components, StatsFormatter(colocation_ids, word_renderer), args.collocation_sentence_map_dest, params, args.separator) - def __init__(self, file_out, num_components, formatter, collocation_sentence_map_dest, params): + def __init__(self, file_out, num_components, formatter, collocation_sentence_map_dest, params, separator): # TODO FIX THIS self.collocation_sentence_map_dest = collocation_sentence_map_dest if params is None: @@ -49,6 +49,7 @@ class Writer: self.num_components = num_components self.output_file = file_out self.formatter = formatter + self.separator = separator def header(self): repeating_cols = self.formatter.header_repeat() @@ -78,7 +79,7 @@ class Writer: return sorted(rows, key=key, reverse=self.sort_order) def write_header(self, file_handler): - file_handler.write(",".join(self.header()) + "\n") + file_handler.write(self.separator.join(self.header()) + "\n") def write_out_worker(self, file_handler, structure, colocation_ids, col_sent_map): rows = [] @@ -99,12 +100,16 @@ class Writer: for words in match.matches: to_write = [] - for idx, _comp in enumerate(components): - idx = str(idx + 1) - if idx not in words: + idx = 1 + for _comp in components: + if _comp.idx == '#': + continue + idx_s = str(idx) + idx += 1 + if idx_s not in words: to_write.extend([""] * self.formatter.length()) else: - to_write.extend(self.formatter.content_repeat(words, match.representations, idx, structure.id)) + to_write.extend(self.formatter.content_repeat(words, match.representations, idx_s, structure.id)) # make them equal size to_write.extend([""] * (self.num_components * self.formatter.length() - len(to_write))) @@ -121,7 +126,7 @@ class Writer: if rows != []: rows = self.sorted_rows(rows) - file_handler.write("\n".join([",".join(row) for row in rows]) + "\n") + file_handler.write("\n".join([self.separator.join(row) for row in rows]) + "\n") file_handler.flush() def write_out(self, structures, colocation_ids): diff --git a/wani.py b/wani.py index 021f7a2..1ba4b7a 100644 --- a/wani.py +++ b/wani.py @@ -151,6 +151,8 @@ if __name__ == '__main__': parser.add_argument('--pc-tag', help='Tag for separators, usually pc or c', default="pc") + parser.add_argument('--separator', + help='Separator in output file', default="\t") args = parser.parse_args() logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())