forked from kristjan/cjvt-valency
Fixing loop in JOS SRL connections
This commit is contained in:
parent
5cdc963c2d
commit
2551a9c6a8
|
@ -1220,7 +1220,9 @@ def match_file(words, structures):
|
||||||
possible_jos_links = {'dol', 'del', 'prir', 'vez', 'skup', 'ena', 'dve', 'tri', 'štiri', 'modra'}
|
possible_jos_links = {'dol', 'del', 'prir', 'vez', 'skup', 'ena', 'dve', 'tri', 'štiri', 'modra'}
|
||||||
|
|
||||||
|
|
||||||
def find_word_sons(word, deppar_dict, word_id, role):
|
def find_word_sons(word, deppar_dict, word_id, role, parents):
|
||||||
|
if word.id in parents:
|
||||||
|
return False
|
||||||
for k, v in word.links.items():
|
for k, v in word.links.items():
|
||||||
for w in v:
|
for w in v:
|
||||||
# if k in possible_jos_links and w.id == 'ssj1.1.1.t21':
|
# if k in possible_jos_links and w.id == 'ssj1.1.1.t21':
|
||||||
|
@ -1229,10 +1231,11 @@ def find_word_sons(word, deppar_dict, word_id, role):
|
||||||
if w.id not in deppar_dict:
|
if w.id not in deppar_dict:
|
||||||
deppar_dict[w.id] = {}
|
deppar_dict[w.id] = {}
|
||||||
deppar_dict[w.id][word_id] = role
|
deppar_dict[w.id][word_id] = role
|
||||||
find_word_sons(w, deppar_dict, word_id, role)
|
if not find_word_sons(w, deppar_dict, word_id, role, parents + [word.id]):
|
||||||
|
return False
|
||||||
# elif k in possible_jos_links:
|
# elif k in possible_jos_links:
|
||||||
# raise Exception('One word in multiple dependency parsetrees')
|
# raise Exception('One word in multiple dependency parsetrees')
|
||||||
|
return True
|
||||||
|
|
||||||
# for ignoring punctuations
|
# for ignoring punctuations
|
||||||
def idi_word_generator(sentence):
|
def idi_word_generator(sentence):
|
||||||
|
@ -1252,7 +1255,7 @@ def extract_sentences(w_collection, w_a_collection, args, input_corpus, input_co
|
||||||
formatted_sentences = {}
|
formatted_sentences = {}
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
sentences_num_limit = 10000
|
sentences_num_limit = 15000
|
||||||
sentences_in_ram = 0
|
sentences_in_ram = 0
|
||||||
sentence_glue_numbers = None
|
sentence_glue_numbers = None
|
||||||
|
|
||||||
|
@ -1274,6 +1277,13 @@ def extract_sentences(w_collection, w_a_collection, args, input_corpus, input_co
|
||||||
logging.warning(
|
logging.warning(
|
||||||
f"Skipping sentence in original sentence id (sentence_glue)! Annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}")
|
f"Skipping sentence in original sentence id (sentence_glue)! Annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}")
|
||||||
sentence_glue = next(glue_words_gen)
|
sentence_glue = next(glue_words_gen)
|
||||||
|
sentence_glue_numbers = tuple([int(sid) for sid in sentence_glue[0][2:].split('.')])
|
||||||
|
|
||||||
|
# has to be here for the case when the next sentence_glue is selected in the while loop
|
||||||
|
if sentence_glue_numbers is not None and sentence_glue_numbers > sent_id_numbers:
|
||||||
|
logging.warning(
|
||||||
|
f"Skipping sentence in annotated sentence id (sent_id)! Annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}")
|
||||||
|
continue
|
||||||
|
|
||||||
if sent_id != sentence_glue[0]:
|
if sent_id != sentence_glue[0]:
|
||||||
raise Exception(f"Annotated gigafida and original gigafida not in sync (annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}")
|
raise Exception(f"Annotated gigafida and original gigafida not in sync (annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}")
|
||||||
|
@ -1319,6 +1329,7 @@ def extract_sentences(w_collection, w_a_collection, args, input_corpus, input_co
|
||||||
deppar_dict = {}
|
deppar_dict = {}
|
||||||
|
|
||||||
# idi = 0
|
# idi = 0
|
||||||
|
incorrect_sentence = False
|
||||||
|
|
||||||
# create output and form dependency parsetree sons
|
# create output and form dependency parsetree sons
|
||||||
for idi, word in idi_word_generator(sentence):
|
for idi, word in idi_word_generator(sentence):
|
||||||
|
@ -1355,9 +1366,14 @@ def extract_sentences(w_collection, w_a_collection, args, input_corpus, input_co
|
||||||
# print(sentences_of_interest[sent_id][idi][1])
|
# print(sentences_of_interest[sent_id][idi][1])
|
||||||
# if sentences_of_interest[sent_id][(word.lemma, word.msd)][1] > idi:
|
# if sentences_of_interest[sent_id][(word.lemma, word.msd)][1] > idi:
|
||||||
# print('HERE')
|
# print('HERE')
|
||||||
find_word_sons(word, deppar_dict, sentence_of_interest[idi][0], sentence_of_interest[idi][1])
|
if not find_word_sons(word, deppar_dict, sentence_of_interest[idi][0], sentence_of_interest[idi][1], []):
|
||||||
|
incorrect_sentence = True
|
||||||
# idi += 1
|
# idi += 1
|
||||||
|
|
||||||
|
if incorrect_sentence:
|
||||||
|
logging.warning(
|
||||||
|
f"Sentence {sent_id} contains srl connections that loop!")
|
||||||
|
continue
|
||||||
# print(time.time() - start_time)
|
# print(time.time() - start_time)
|
||||||
|
|
||||||
for word in sentence:
|
for word in sentence:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user