Added options for "modra"

parent 1ea454f63c
commit 06435aa3a2
@@ -38,7 +38,7 @@ class Component:
         self.status = status
         self.name = name
         self.idx = idx
-        self.restrictions = []
+        self.restrictions = [Restriction(None)] if 'restriction' in info else []
         self.next_element = []
         self.representation = []
         self.selection = {}
@@ -104,24 +104,13 @@ class Component:
             if len(cmatch) == 0:
                 continue

-            # if more than one match found for particular component
-            elif len(cmatch) > 1:
-                # if more than one match in multiple components, NOPE!
-                if len(to_ret) > 1:
-                    logging.warning("Strange multiple match: {}".format(
-                        str([w.id for w in cmatch[0].values()])))
-
-                    for tr in to_ret:
-                        tr.update(cmatch[0])
-                    continue
-
-                # yeah, so we have found more than one match, =>
-                # more than one element in to_ret
-                to_ret = [{**dict(to_ret[0]), **m} for m in cmatch]
-
-            else:
-                for tr in to_ret:
-                    tr.update(cmatch[0])
+            # create new to_ret, to which extend all results
+            new_to_ret = []
+            for tr in to_ret:
+                # make sure that one word is not used twice in same to_ret
+                new_to_ret.extend([{**dict(tr), **m} for m in cmatch if any([m_v not in dict(tr).values() for m_v in m.values()])])
+            to_ret = new_to_ret
+            del new_to_ret

         return to_ret

@@ -102,6 +102,8 @@ def load_csv(filename, compressed):
         line_split = line_fixed.split("\t")

         if line_split[1] == "1" and len(words) > 0:
+            # adding fake word
+            words['0'] = Word('', '', '0', '', False, True)
             sentence_end(bad_sentence)
             bad_sentence = False
             links = []
@@ -114,9 +116,11 @@ def load_csv(filename, compressed):
         full_id = "{}.{}".format(sid, wid)

         words[wid] = Word(lemma, msd, full_id, text, True)
-        if link_src != '0':
-            links.append((link_src, wid, link_type))
+        # if link_src != '0':
+        links.append((link_src, wid, link_type))

+    # adding fake word
+    words['0'] = Word('', '', '0', '', False, True)
     sentence_end(bad_sentence)
     return result

@@ -189,6 +193,8 @@ def file_sentence_generator(et, skip_id_check, do_msd_translate, pc_tag):
     words = {}
     sentences = list(et.iter('s'))
     for sentence in progress(sentences, "load-text"):
+        # create fake root word
+        words[sentence.get('id')] = Word.fake_root_word(sentence.get('id'))
         for w in sentence.iter("w"):
             words[w.get('id')] = Word.from_xml(w, do_msd_translate)
         for pc in sentence.iter(pc_tag):
@@ -1911,4 +1911,4 @@ MSD_TRANSLATE = {
     "Ne": "Ne",
     "Nh": "Nh",
     "Na": "Na",
-    "U": "N"}
+    "U": "Z"}
@@ -95,17 +95,10 @@ class MorphologyRegex:
             self.re_objects.append([re.compile(r) for r in rgx])
             self.rgxs.append(rgx)
             self.min_msd_lengths.append(min_msd_length)

-        # self.re_objects = [re.compile(r) for r in rgx]
-        # self.rgx = rgx
-
     def __call__(self, text):
-        # if len(text) <= self.min_msd_length:
-            # return False
-        # if len(self.rgxs[0]) > 1 and len(self.rgxs) > 1:
-            # a = 1
         for i, re_object in enumerate(self.re_objects):
-            if len(text) <= self.min_msd_lengths[i]:
+            if len(text) < self.min_msd_lengths[i]:
                 continue
             match = True

@@ -10,6 +10,7 @@ class SyntacticStructure:
         self.id = None
         self.lbs = None
         self.components = []
+        self.fake_root_included = False

     @staticmethod
     def from_xml(xml, no_stats):
@@ -44,8 +45,14 @@ class SyntacticStructure:
                 raise NotImplementedError("Unknown definition: {} in structure {}"
                                           .format(el.tag, st.id))

-        fake_root_component = Component({'cid': '#', 'type': 'other'})
-        st.components = fake_root_component.find_next(deps, comps, restrs, forms)
+        fake_root_component = Component({'cid': '#', 'type': 'other', 'restriction': None})
+        fake_root_component_children = fake_root_component.find_next(deps, comps, restrs, forms)
+        # all dep with value modra point to artificial root - fake_root_component
+        if any([dep[2] == 'modra' for dep in deps]):
+            st.fake_root_included = True
+            st.components = [fake_root_component] + fake_root_component_children
+        else:
+            st.components = fake_root_component_children

         if not no_stats:
             st.determine_core2w()
@@ -112,7 +119,8 @@ def build_structures(args):
             continue

         structures.append(to_append)
-        max_num_components = max(max_num_components, len(to_append.components))
+        to_append_len = len(to_append.components) if not to_append.fake_root_included else len(to_append.components) - 1
+        max_num_components = max(max_num_components, to_append_len)

     lemma_features = get_lemma_features(et)
     return structures, lemma_features, max_num_components
@@ -32,13 +32,14 @@ class WordDummy:


 class Word:
-    def __init__(self, lemma, msd, wid, text, do_msd_translate):
+    def __init__(self, lemma, msd, wid, text, do_msd_translate, fake_word=False):
         self.lemma = lemma
         self.msd = MSD_TRANSLATE[msd] if do_msd_translate else msd
         self.id = wid
         self.idi = None
         self.text = text
         self.glue = ''
+        self.fake_word = fake_word

         self.links = defaultdict(list)

@@ -74,6 +75,11 @@ class Word:
         pc.set('msd', "N" if do_msd_translate else "U")
         return Word.from_xml(pc, do_msd_translate)

+    @staticmethod
+    def fake_root_word(sentence_id):
+        wid = sentence_id
+        return Word('', '', wid, '', False, True)
+
     def add_link(self, link, to):
         self.links[link].append(to)

@@ -25,6 +25,8 @@ class WordStats:

     def add_words(self, words):
         for w in progress(words, "adding-words"):
+            if w.fake_word:
+                continue
             params = {'lemma': w.lemma, 'msd': w.msd, 'text': w.text}
             res = self.db.execute("""UPDATE UniqWords SET frequency=frequency + 1
                                      WHERE lemma=:lemma AND msd=:msd AND text=:text""", params)
@@ -16,23 +16,23 @@ class Writer:
     @staticmethod
     def make_output_writer(args, num_components, colocation_ids, word_renderer):
         params = Writer.other_params(args)
-        return Writer(args.out, num_components, OutFormatter(colocation_ids, word_renderer), args.collocation_sentence_map_dest, params)
+        return Writer(args.out, num_components, OutFormatter(colocation_ids, word_renderer), args.collocation_sentence_map_dest, params, args.separator)

     @staticmethod
     def make_output_no_stat_writer(args, num_components, colocation_ids, word_renderer):
         params = Writer.other_params(args)
-        return Writer(args.out_no_stat, num_components, OutNoStatFormatter(colocation_ids, word_renderer), args.collocation_sentence_map_dest, params)
+        return Writer(args.out_no_stat, num_components, OutNoStatFormatter(colocation_ids, word_renderer), args.collocation_sentence_map_dest, params, args.separator)

     @staticmethod
     def make_all_writer(args, num_components, colocation_ids, word_renderer):
-        return Writer(args.all, num_components, AllFormatter(colocation_ids, word_renderer), args.collocation_sentence_map_dest, None)
+        return Writer(args.all, num_components, AllFormatter(colocation_ids, word_renderer), args.collocation_sentence_map_dest, None, args.separator)

     @staticmethod
     def make_stats_writer(args, num_components, colocation_ids, word_renderer):
         params = Writer.other_params(args)
-        return Writer(args.stats, num_components, StatsFormatter(colocation_ids, word_renderer), args.collocation_sentence_map_dest, params)
+        return Writer(args.stats, num_components, StatsFormatter(colocation_ids, word_renderer), args.collocation_sentence_map_dest, params, args.separator)

-    def __init__(self, file_out, num_components, formatter, collocation_sentence_map_dest, params):
+    def __init__(self, file_out, num_components, formatter, collocation_sentence_map_dest, params, separator):
         # TODO FIX THIS
         self.collocation_sentence_map_dest = collocation_sentence_map_dest
         if params is None:
@@ -49,6 +49,7 @@ class Writer:
         self.num_components = num_components
         self.output_file = file_out
         self.formatter = formatter
+        self.separator = separator

     def header(self):
         repeating_cols = self.formatter.header_repeat()
@@ -78,7 +79,7 @@ class Writer:
         return sorted(rows, key=key, reverse=self.sort_order)

     def write_header(self, file_handler):
-        file_handler.write(",".join(self.header()) + "\n")
+        file_handler.write(self.separator.join(self.header()) + "\n")

     def write_out_worker(self, file_handler, structure, colocation_ids, col_sent_map):
         rows = []
@@ -99,12 +100,16 @@ class Writer:
             for words in match.matches:
                 to_write = []

-                for idx, _comp in enumerate(components):
-                    idx = str(idx + 1)
-                    if idx not in words:
+                idx = 1
+                for _comp in components:
+                    if _comp.idx == '#':
+                        continue
+                    idx_s = str(idx)
+                    idx += 1
+                    if idx_s not in words:
                         to_write.extend([""] * self.formatter.length())
                     else:
-                        to_write.extend(self.formatter.content_repeat(words, match.representations, idx, structure.id))
+                        to_write.extend(self.formatter.content_repeat(words, match.representations, idx_s, structure.id))

                 # make them equal size
                 to_write.extend([""] * (self.num_components * self.formatter.length() - len(to_write)))
@@ -121,7 +126,7 @@ class Writer:

         if rows != []:
             rows = self.sorted_rows(rows)
-            file_handler.write("\n".join([",".join(row) for row in rows]) + "\n")
+            file_handler.write("\n".join([self.separator.join(row) for row in rows]) + "\n")
             file_handler.flush()

     def write_out(self, structures, colocation_ids):
wani.py
@@ -151,6 +151,8 @@ if __name__ == '__main__':

     parser.add_argument('--pc-tag',
                         help='Tag for separators, usually pc or c', default="pc")
+    parser.add_argument('--separator',
+                        help='Separator in output file', default="\t")

     args = parser.parse_args()
     logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())
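A minimal sketch of what the new --separator option changes, not part of the commit itself: the argument definition is copied from the wani.py diff above, while the invocation and row values are made up for illustration.

import argparse

# same definition as added to wani.py in this commit
parser = argparse.ArgumentParser()
parser.add_argument('--separator',
                    help='Separator in output file', default="\t")

# illustrative invocation: request comma-separated output instead of tabs
args = parser.parse_args(['--separator', ','])

row = ['S1', 'modra', 'beseda']  # made-up row fields
print(args.separator.join(row))  # -> S1,modra,beseda

With the default left untouched, Writer joins header and data rows with tabs, matching the previous hard-coded behavior only when --separator is not given (the old code joined with "," in write_header and write_out_worker).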