forked from kristjan/cjvt-valency
Added warnings to skip sentences that do not match
This commit is contained in:
parent
ce1fb46b4e
commit
5cdc963c2d
|
@ -223,4 +223,9 @@ docker exec -it ef0a /bin/bash
|
||||||
|
|
||||||
# check if it worked by
|
# check if it worked by
|
||||||
mongo --username <REGULAR USER> --password --authenticationDatabase valdb
|
mongo --username <REGULAR USER> --password --authenticationDatabase valdb
|
||||||
|
|
||||||
|
# make mongodb visible only privately
|
||||||
|
docker stack rm dbstack
|
||||||
|
cd dockerfiles/database/
|
||||||
|
docker-compose up
|
||||||
```
|
```
|
|
@ -350,7 +350,6 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m
|
||||||
pbar.update(1)
|
pbar.update(1)
|
||||||
status_collection_update_list.append(InsertOne({'corpus_type': corpus_type, 'headword_text': headword_text, 'part': 'p1'}))
|
status_collection_update_list.append(InsertOne({'corpus_type': corpus_type, 'headword_text': headword_text, 'part': 'p1'}))
|
||||||
|
|
||||||
# TODO uncomment
|
|
||||||
# if 'GF0010453.1116.1' in sentences_of_interest:
|
# if 'GF0010453.1116.1' in sentences_of_interest:
|
||||||
# a = sentences_of_interest['GF0010453.1116.1']
|
# a = sentences_of_interest['GF0010453.1116.1']
|
||||||
# print('here')
|
# print('here')
|
||||||
|
@ -526,7 +525,6 @@ def create_sentence_output(sentence, headword_id, corpus):
|
||||||
# else:
|
# else:
|
||||||
# word_text = word[0][0]
|
# word_text = word[0][0]
|
||||||
# word_text += word[0][1]
|
# word_text += word[0][1]
|
||||||
# TODO CHANGE THIS TO FIX SPACE LOCATIONS!
|
|
||||||
# word_text = word[0][0] + word[0][1]
|
# word_text = word[0][0] + word[0][1]
|
||||||
if not first_outside_tag:
|
if not first_outside_tag:
|
||||||
if p_attach_to is None:
|
if p_attach_to is None:
|
||||||
|
@ -700,7 +698,6 @@ def obtain_xml_data(collection, w_a_collection, headword_text, RF, mongo, patter
|
||||||
# print('here')
|
# print('here')
|
||||||
|
|
||||||
sr_data = get_SRLcontainer_data(db_sentence, str(hw_idi), sr_data)
|
sr_data = get_SRLcontainer_data(db_sentence, str(hw_idi), sr_data)
|
||||||
# TODO ERASE THIS
|
|
||||||
examples_included_num = 0
|
examples_included_num = 0
|
||||||
|
|
||||||
# sr_data = get_SRLcontainer_data(formatted_sentences[sent_id], hw_idi, sr_data)
|
# sr_data = get_SRLcontainer_data(formatted_sentences[sent_id], hw_idi, sr_data)
|
||||||
|
@ -783,10 +780,6 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
|
||||||
mongo, patterns,
|
mongo, patterns,
|
||||||
pattern_id_max, valency_pattern_id_collection, 'Gigafida 2.0', pattern_examples_limit, headword_patterns_ssj)
|
pattern_id_max, valency_pattern_id_collection, 'Gigafida 2.0', pattern_examples_limit, headword_patterns_ssj)
|
||||||
|
|
||||||
# TODO ERASE THIS!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
|
||||||
if len(headword_patterns_ssj) == 0:
|
|
||||||
continue
|
|
||||||
|
|
||||||
|
|
||||||
wf1 = aliased(WordFormFeature)
|
wf1 = aliased(WordFormFeature)
|
||||||
wf2 = aliased(WordFormFeature)
|
wf2 = aliased(WordFormFeature)
|
||||||
|
@ -1066,7 +1059,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
|
||||||
semanticRole.text = semantic_role
|
semanticRole.text = semantic_role
|
||||||
|
|
||||||
syntactic_structure_dict = {}
|
syntactic_structure_dict = {}
|
||||||
# TODO EXPAND FROM SSJ DATA ONLY + FIX BUG ABOUT SEMANTIC ROLE CONTAINER + EXAMPLES NOT WORKING!!! FIX IDS
|
|
||||||
if 'ssj' in headword_pattern_dict and semantic_role in headword_pattern_dict['ssj']['sr_data']:
|
if 'ssj' in headword_pattern_dict and semantic_role in headword_pattern_dict['ssj']['sr_data']:
|
||||||
for syn_struct_id, syn_struct_dict in headword_pattern_dict['ssj']['sr_data'][semantic_role].items():
|
for syn_struct_id, syn_struct_dict in headword_pattern_dict['ssj']['sr_data'][semantic_role].items():
|
||||||
if syn_struct_id not in syntactic_structure_dict:
|
if syn_struct_id not in syntactic_structure_dict:
|
||||||
|
@ -1121,22 +1114,6 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
|
||||||
lexem.set('sloleks', prep_id)
|
lexem.set('sloleks', prep_id)
|
||||||
lexem.text = l[2]
|
lexem.text = l[2]
|
||||||
|
|
||||||
|
|
||||||
# if 'gf' in headword_pattern_dict and semantic_role in headword_pattern_dict['gf']['sr_data']:
|
|
||||||
# for syn_struct_id, syn_struct_dict in headword_pattern_dict['gf']['sr_data'][semantic_role].items():
|
|
||||||
# syntacticStructure = lxml.SubElement(syntacticStructureList, 'syntacticStructure')
|
|
||||||
# syntacticStructure.set('id', syn_struct_id)
|
|
||||||
# for com_num, com_set in syn_struct_dict.items():
|
|
||||||
# # component = lxml.SubElement(syntacticStructure, 'component')
|
|
||||||
# # component.set('num', com_num)
|
|
||||||
# for lex in com_set:
|
|
||||||
# component = lxml.SubElement(syntacticStructure, 'component')
|
|
||||||
# component.set('num', com_num)
|
|
||||||
# lexem = lxml.SubElement(component, 'lexeme')
|
|
||||||
# lexem.set('sloleks', '')
|
|
||||||
# lexem.text = lex
|
|
||||||
|
|
||||||
|
|
||||||
patternRepresentation = lxml.SubElement(valencyPattern, 'patternRepresentation')
|
patternRepresentation = lxml.SubElement(valencyPattern, 'patternRepresentation')
|
||||||
patternRepresentation.text = patternTranslationText
|
patternRepresentation.text = patternTranslationText
|
||||||
|
|
||||||
|
@ -1156,8 +1133,6 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
|
||||||
encoding='utf-8') as xf:
|
encoding='utf-8') as xf:
|
||||||
xf.write(dictionary, pretty_print=True)
|
xf.write(dictionary, pretty_print=True)
|
||||||
pbar.update(1)
|
pbar.update(1)
|
||||||
# xf.write(entry, pretty_print=True)
|
|
||||||
# tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
|
|
||||||
|
|
||||||
|
|
||||||
def init_db(db):
|
def init_db(db):
|
||||||
|
@ -1225,12 +1200,7 @@ def init_db(db):
|
||||||
def match_file(words, structures):
|
def match_file(words, structures):
|
||||||
matches = []
|
matches = []
|
||||||
|
|
||||||
if words[0].text == 'Ena':
|
|
||||||
a = 0
|
|
||||||
|
|
||||||
for s in structures:
|
for s in structures:
|
||||||
if s.id == '89':
|
|
||||||
a = 1
|
|
||||||
for w in words:
|
for w in words:
|
||||||
mhere = s.match(w)
|
mhere = s.match(w)
|
||||||
for match in mhere:
|
for match in mhere:
|
||||||
|
@ -1252,7 +1222,6 @@ possible_jos_links = {'dol', 'del', 'prir', 'vez', 'skup', 'ena', 'dve', 'tri',
|
||||||
|
|
||||||
def find_word_sons(word, deppar_dict, word_id, role):
|
def find_word_sons(word, deppar_dict, word_id, role):
|
||||||
for k, v in word.links.items():
|
for k, v in word.links.items():
|
||||||
# if k != 'default_factory':
|
|
||||||
for w in v:
|
for w in v:
|
||||||
# if k in possible_jos_links and w.id == 'ssj1.1.1.t21':
|
# if k in possible_jos_links and w.id == 'ssj1.1.1.t21':
|
||||||
# print('here')
|
# print('here')
|
||||||
|
@ -1280,16 +1249,12 @@ def extract_sentences(w_collection, w_a_collection, args, input_corpus, input_co
|
||||||
timeinfo = TimeInfo(len(input_corpus))
|
timeinfo = TimeInfo(len(input_corpus))
|
||||||
|
|
||||||
database = Database(args)
|
database = Database(args)
|
||||||
# match_store = MatchStore(args, database)
|
|
||||||
# word_stats = WordStats(lemma_msds, database)
|
|
||||||
formatted_sentences = {}
|
formatted_sentences = {}
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
# print(time.time() - start_time)
|
|
||||||
|
|
||||||
sentences_num_limit = 10000
|
sentences_num_limit = 10000
|
||||||
sentences_in_ram = 0
|
sentences_in_ram = 0
|
||||||
|
sentence_glue_numbers = None
|
||||||
# is_gf = input_corpus_orig is not None
|
|
||||||
|
|
||||||
is_gf = input_corpus_orig is not None
|
is_gf = input_corpus_orig is not None
|
||||||
if is_gf:
|
if is_gf:
|
||||||
|
@ -1297,13 +1262,24 @@ def extract_sentences(w_collection, w_a_collection, args, input_corpus, input_co
|
||||||
|
|
||||||
for sent_id, sentence, othr_sentence_attributes in load_files(args, database, w_collection, input_corpus):
|
for sent_id, sentence, othr_sentence_attributes in load_files(args, database, w_collection, input_corpus):
|
||||||
if is_gf:
|
if is_gf:
|
||||||
|
# create tuple for comparison with sentence_glue_numbers
|
||||||
|
sent_id_numbers = tuple([int(sid) for sid in sent_id[2:].split('.')])
|
||||||
|
if sentence_glue_numbers is not None and sentence_glue_numbers > sent_id_numbers:
|
||||||
|
logging.warning(
|
||||||
|
f"Skipping sentence in annotated sentence id (sent_id)! Annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}")
|
||||||
|
continue
|
||||||
|
sentence_glue = next(glue_words_gen)
|
||||||
|
sentence_glue_numbers = tuple([int(sid) for sid in sentence_glue[0][2:].split('.')])
|
||||||
|
while sentence_glue_numbers < sent_id_numbers:
|
||||||
|
logging.warning(
|
||||||
|
f"Skipping sentence in original sentence id (sentence_glue)! Annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}")
|
||||||
sentence_glue = next(glue_words_gen)
|
sentence_glue = next(glue_words_gen)
|
||||||
|
|
||||||
if sent_id != sentence_glue[0]:
|
if sent_id != sentence_glue[0]:
|
||||||
raise Exception(f"Annotated gigafida and original gigafida not in sync (annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}")
|
raise Exception(f"Annotated gigafida and original gigafida not in sync (annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}")
|
||||||
if len(sentence_glue[1]) != len(sentence):
|
if len(sentence_glue[1]) != len(sentence):
|
||||||
raise Exception(
|
logging.warning(f"Skipping sentence! Annotated gigafida and original gigafida size is not the same (annotated: {len(sentence)}, original: {len(sentence_glue[1])}")
|
||||||
f"Annotated gigafida and original gigafida size is not the same (annotated: {len(sentence)}, original: {len(sentence_glue[1])}")
|
continue
|
||||||
for w, w_glue in zip(sentence, sentence_glue[1]):
|
for w, w_glue in zip(sentence, sentence_glue[1]):
|
||||||
w.glue = w_glue[2]
|
w.glue = w_glue[2]
|
||||||
if sentence is None:
|
if sentence is None:
|
||||||
|
@ -1644,21 +1620,6 @@ if __name__ == '__main__':
|
||||||
args = arg_parser.parse_args()
|
args = arg_parser.parse_args()
|
||||||
logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())
|
logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())
|
||||||
|
|
||||||
# try:
|
|
||||||
# sys.path.insert(1, args.structure_extraction)
|
|
||||||
# from progress_bar import progress
|
|
||||||
# from word import Word, WordCompressed
|
|
||||||
# from syntactic_structure import build_structures
|
|
||||||
# from match_store import MatchStore
|
|
||||||
# from word_stats import WordStats
|
|
||||||
# from writer import Writer
|
|
||||||
# from loader import load_files, file_sentence_glue_generator
|
|
||||||
# from database import Database
|
|
||||||
# from time_info import TimeInfo
|
|
||||||
# from msd_translate import MSD_TRANSLATE
|
|
||||||
# except:
|
|
||||||
# raise
|
|
||||||
|
|
||||||
start = time.time()
|
start = time.time()
|
||||||
main(args)
|
main(args)
|
||||||
logging.info("TIME: {}".format(time.time() - start))
|
logging.info("TIME: {}".format(time.time() - start))
|
||||||
|
|
Loading…
Reference in New Issue
Block a user