forked from kristjan/cjvt-valency
Fix endless recursion in find_word_sons when JOS/SRL links form a loop
This commit is contained in:
parent
5cdc963c2d
commit
2551a9c6a8
|
@ -1220,7 +1220,9 @@ def match_file(words, structures):
|
|||
possible_jos_links = {'dol', 'del', 'prir', 'vez', 'skup', 'ena', 'dve', 'tri', 'štiri', 'modra'}
|
||||
|
||||
|
||||
def find_word_sons(word, deppar_dict, word_id, role):
|
||||
def find_word_sons(word, deppar_dict, word_id, role, parents):
|
||||
if word.id in parents:
|
||||
return False
|
||||
for k, v in word.links.items():
|
||||
for w in v:
|
||||
# if k in possible_jos_links and w.id == 'ssj1.1.1.t21':
|
||||
|
@ -1229,10 +1231,11 @@ def find_word_sons(word, deppar_dict, word_id, role):
|
|||
if w.id not in deppar_dict:
|
||||
deppar_dict[w.id] = {}
|
||||
deppar_dict[w.id][word_id] = role
|
||||
find_word_sons(w, deppar_dict, word_id, role)
|
||||
if not find_word_sons(w, deppar_dict, word_id, role, parents + [word.id]):
|
||||
return False
|
||||
# elif k in possible_jos_links:
|
||||
# raise Exception('One word in multiple dependency parsetrees')
|
||||
|
||||
return True
|
||||
|
||||
# for ignoring punctuations
|
||||
def idi_word_generator(sentence):
|
||||
|
@ -1252,7 +1255,7 @@ def extract_sentences(w_collection, w_a_collection, args, input_corpus, input_co
|
|||
formatted_sentences = {}
|
||||
start_time = time.time()
|
||||
|
||||
sentences_num_limit = 10000
|
||||
sentences_num_limit = 15000
|
||||
sentences_in_ram = 0
|
||||
sentence_glue_numbers = None
|
||||
|
||||
|
@ -1274,6 +1277,13 @@ def extract_sentences(w_collection, w_a_collection, args, input_corpus, input_co
|
|||
logging.warning(
|
||||
f"Skipping sentence in original sentence id (sentence_glue)! Annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}")
|
||||
sentence_glue = next(glue_words_gen)
|
||||
sentence_glue_numbers = tuple([int(sid) for sid in sentence_glue[0][2:].split('.')])
|
||||
|
||||
# has to be here for when next sentence_glue is selected in while loop
|
||||
if sentence_glue_numbers is not None and sentence_glue_numbers > sent_id_numbers:
|
||||
logging.warning(
|
||||
f"Skipping sentence in annotated sentence id (sent_id)! Annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}")
|
||||
continue
|
||||
|
||||
if sent_id != sentence_glue[0]:
|
||||
raise Exception(f"Annotated gigafida and original gigafida not in sync (annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}")
|
||||
|
@ -1319,6 +1329,7 @@ def extract_sentences(w_collection, w_a_collection, args, input_corpus, input_co
|
|||
deppar_dict = {}
|
||||
|
||||
# idi = 0
|
||||
incorrect_sentence = False
|
||||
|
||||
# create output and form dependency parsetree sons
|
||||
for idi, word in idi_word_generator(sentence):
|
||||
|
@ -1355,9 +1366,14 @@ def extract_sentences(w_collection, w_a_collection, args, input_corpus, input_co
|
|||
# print(sentences_of_interest[sent_id][idi][1])
|
||||
# if sentences_of_interest[sent_id][(word.lemma, word.msd)][1] > idi:
|
||||
# print('HERE')
|
||||
find_word_sons(word, deppar_dict, sentence_of_interest[idi][0], sentence_of_interest[idi][1])
|
||||
if not find_word_sons(word, deppar_dict, sentence_of_interest[idi][0], sentence_of_interest[idi][1], []):
|
||||
incorrect_sentence = True
|
||||
# idi += 1
|
||||
|
||||
if incorrect_sentence:
|
||||
logging.warning(
|
||||
f"Sentence {sent_id} contains srl connections that loop!")
|
||||
continue
|
||||
# print(time.time() - start_time)
|
||||
|
||||
for word in sentence:
|
||||
|
|
Loading…
Reference in New Issue
Block a user