Fix endless recursion when JOS/SRL links form a loop

This commit is contained in:
Luka 2020-09-30 11:40:55 +02:00
parent 5cdc963c2d
commit 2551a9c6a8

View File

@ -1220,7 +1220,9 @@ def match_file(words, structures):
possible_jos_links = {'dol', 'del', 'prir', 'vez', 'skup', 'ena', 'dve', 'tri', 'štiri', 'modra'} possible_jos_links = {'dol', 'del', 'prir', 'vez', 'skup', 'ena', 'dve', 'tri', 'štiri', 'modra'}
def find_word_sons(word, deppar_dict, word_id, role): def find_word_sons(word, deppar_dict, word_id, role, parents):
if word.id in parents:
return False
for k, v in word.links.items(): for k, v in word.links.items():
for w in v: for w in v:
# if k in possible_jos_links and w.id == 'ssj1.1.1.t21': # if k in possible_jos_links and w.id == 'ssj1.1.1.t21':
@ -1229,10 +1231,11 @@ def find_word_sons(word, deppar_dict, word_id, role):
if w.id not in deppar_dict: if w.id not in deppar_dict:
deppar_dict[w.id] = {} deppar_dict[w.id] = {}
deppar_dict[w.id][word_id] = role deppar_dict[w.id][word_id] = role
find_word_sons(w, deppar_dict, word_id, role) if not find_word_sons(w, deppar_dict, word_id, role, parents + [word.id]):
return False
# elif k in possible_jos_links: # elif k in possible_jos_links:
# raise Exception('One word in multiple dependency parsetrees') # raise Exception('One word in multiple dependency parsetrees')
return True
# for ignoring punctuations # for ignoring punctuations
def idi_word_generator(sentence): def idi_word_generator(sentence):
@ -1252,7 +1255,7 @@ def extract_sentences(w_collection, w_a_collection, args, input_corpus, input_co
formatted_sentences = {} formatted_sentences = {}
start_time = time.time() start_time = time.time()
sentences_num_limit = 10000 sentences_num_limit = 15000
sentences_in_ram = 0 sentences_in_ram = 0
sentence_glue_numbers = None sentence_glue_numbers = None
@ -1274,6 +1277,13 @@ def extract_sentences(w_collection, w_a_collection, args, input_corpus, input_co
logging.warning( logging.warning(
f"Skipping sentence in original sentence id (sentence_glue)! Annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}") f"Skipping sentence in original sentence id (sentence_glue)! Annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}")
sentence_glue = next(glue_words_gen) sentence_glue = next(glue_words_gen)
sentence_glue_numbers = tuple([int(sid) for sid in sentence_glue[0][2:].split('.')])
# has to be here for when next sentence_glue is selected in while loop
if sentence_glue_numbers is not None and sentence_glue_numbers > sent_id_numbers:
logging.warning(
f"Skipping sentence in annotated sentence id (sent_id)! Annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}")
continue
if sent_id != sentence_glue[0]: if sent_id != sentence_glue[0]:
raise Exception(f"Annotated gigafida and original gigafida not in sync (annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}") raise Exception(f"Annotated gigafida and original gigafida not in sync (annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}")
@ -1319,6 +1329,7 @@ def extract_sentences(w_collection, w_a_collection, args, input_corpus, input_co
deppar_dict = {} deppar_dict = {}
# idi = 0 # idi = 0
incorrect_sentence = False
# create output and form dependency parsetree sons # create output and form dependency parsetree sons
for idi, word in idi_word_generator(sentence): for idi, word in idi_word_generator(sentence):
@ -1355,9 +1366,14 @@ def extract_sentences(w_collection, w_a_collection, args, input_corpus, input_co
# print(sentences_of_interest[sent_id][idi][1]) # print(sentences_of_interest[sent_id][idi][1])
# if sentences_of_interest[sent_id][(word.lemma, word.msd)][1] > idi: # if sentences_of_interest[sent_id][(word.lemma, word.msd)][1] > idi:
# print('HERE') # print('HERE')
find_word_sons(word, deppar_dict, sentence_of_interest[idi][0], sentence_of_interest[idi][1]) if not find_word_sons(word, deppar_dict, sentence_of_interest[idi][0], sentence_of_interest[idi][1], []):
incorrect_sentence = True
# idi += 1 # idi += 1
if incorrect_sentence:
logging.warning(
f"Sentence {sent_id} contains srl connections that loop!")
continue
# print(time.time() - start_time) # print(time.time() - start_time)
for word in sentence: for word in sentence: