From 2551a9c6a89adc1a373964abfac7c7f0b97d22e9 Mon Sep 17 00:00:00 2001 From: Luka Date: Wed, 30 Sep 2020 11:40:55 +0200 Subject: [PATCH] Fixing loop in jos srl --- scripts/create_xml.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/scripts/create_xml.py b/scripts/create_xml.py index c6fe4df..36c10b4 100644 --- a/scripts/create_xml.py +++ b/scripts/create_xml.py @@ -1220,7 +1220,9 @@ def match_file(words, structures): possible_jos_links = {'dol', 'del', 'prir', 'vez', 'skup', 'ena', 'dve', 'tri', 'štiri', 'modra'} -def find_word_sons(word, deppar_dict, word_id, role): +def find_word_sons(word, deppar_dict, word_id, role, parents): + if word.id in parents: + return False for k, v in word.links.items(): for w in v: # if k in possible_jos_links and w.id == 'ssj1.1.1.t21': @@ -1229,10 +1231,11 @@ def find_word_sons(word, deppar_dict, word_id, role): if w.id not in deppar_dict: deppar_dict[w.id] = {} deppar_dict[w.id][word_id] = role - find_word_sons(w, deppar_dict, word_id, role) + if not find_word_sons(w, deppar_dict, word_id, role, parents + [word.id]): + return False # elif k in possible_jos_links: # raise Exception('One word in multiple dependency parsetrees') - + return True # for ignoring punctuations def idi_word_generator(sentence): @@ -1252,7 +1255,7 @@ def extract_sentences(w_collection, w_a_collection, args, input_corpus, input_co formatted_sentences = {} start_time = time.time() - sentences_num_limit = 10000 + sentences_num_limit = 15000 sentences_in_ram = 0 sentence_glue_numbers = None @@ -1274,6 +1277,13 @@ def extract_sentences(w_collection, w_a_collection, args, input_corpus, input_co logging.warning( f"Skipping sentence in original sentence id (sentence_glue)! Annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}") sentence_glue = next(glue_words_gen) + sentence_glue_numbers = tuple([int(sid) for sid in sentence_glue[0][2:].split('.')]) + + # has to be here for when next sentence_glue is selected in while loop + if sentence_glue_numbers is not None and sentence_glue_numbers > sent_id_numbers: + logging.warning( + f"Skipping sentence in annotated sentence id (sent_id)! Annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}") + continue if sent_id != sentence_glue[0]: raise Exception(f"Annotated gigafida and original gigafida not in sync (annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}") @@ -1319,6 +1329,7 @@ def extract_sentences(w_collection, w_a_collection, args, input_corpus, input_co deppar_dict = {} # idi = 0 + incorrect_sentence = False # create output and form dependency parsetree sons for idi, word in idi_word_generator(sentence): @@ -1355,9 +1366,14 @@ def extract_sentences(w_collection, w_a_collection, args, input_corpus, input_co # print(sentences_of_interest[sent_id][idi][1]) # if sentences_of_interest[sent_id][(word.lemma, word.msd)][1] > idi: # print('HERE') - find_word_sons(word, deppar_dict, sentence_of_interest[idi][0], sentence_of_interest[idi][1]) + if not find_word_sons(word, deppar_dict, sentence_of_interest[idi][0], sentence_of_interest[idi][1], []): + incorrect_sentence = True # idi += 1 + if incorrect_sentence: + logging.warning( + f"Sentence {sent_id} contains srl connections that loop!") + continue # print(time.time() - start_time) for word in sentence: