From 5cdc963c2dc3ba021cdbaf018fa59da6c85e93a8 Mon Sep 17 00:00:00 2001 From: Luka Date: Wed, 30 Sep 2020 09:51:41 +0200 Subject: [PATCH] Added warnings to skip sentences that do not match --- README.md | 5 ++++ scripts/create_xml.py | 69 ++++++++++--------------------------------- 2 files changed, 20 insertions(+), 54 deletions(-) diff --git a/README.md b/README.md index 3a51a2a..269b51b 100644 --- a/README.md +++ b/README.md @@ -223,4 +223,9 @@ docker exec -it ef0a /bin/bash # check if it worked by mongo --username --password --authenticationDatabase valdb + +# make mongodb visible only privately +docker stack rm dbstack +cd dockerfiles/database/ +docker-compose up ``` \ No newline at end of file diff --git a/scripts/create_xml.py b/scripts/create_xml.py index 8d25b04..c6fe4df 100644 --- a/scripts/create_xml.py +++ b/scripts/create_xml.py @@ -350,7 +350,6 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m pbar.update(1) status_collection_update_list.append(InsertOne({'corpus_type': corpus_type, 'headword_text': headword_text, 'part': 'p1'})) - # TODO uncomment # if 'GF0010453.1116.1' in sentences_of_interest: # a = sentences_of_interest['GF0010453.1116.1'] # print('here') @@ -526,7 +525,6 @@ def create_sentence_output(sentence, headword_id, corpus): # else: # word_text = word[0][0] # word_text += word[0][1] - # TODO CHANGE THIS TO FIX SPACE LOCATIONS! # word_text = word[0][0] + word[0][1] if not first_outside_tag: if p_attach_to is None: @@ -700,7 +698,6 @@ def obtain_xml_data(collection, w_a_collection, headword_text, RF, mongo, patter # print('here') sr_data = get_SRLcontainer_data(db_sentence, str(hw_idi), sr_data) - # TODO ERASE THIS examples_included_num = 0 # sr_data = get_SRLcontainer_data(formatted_sentences[sent_id], hw_idi, sr_data) @@ -783,10 +780,6 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, mongo, patterns, pattern_id_max, valency_pattern_id_collection, 'Gigafida 2.0', pattern_examples_limit, headword_patterns_ssj) - # TODO ERASE THIS!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - if len(headword_patterns_ssj) == 0: - continue - wf1 = aliased(WordFormFeature) wf2 = aliased(WordFormFeature) @@ -1066,7 +1059,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, semanticRole.text = semantic_role syntactic_structure_dict = {} - # TODO EXPAND FROM SSJ DATA ONLY + FIX BUG ABOUT SEMANTIC ROLE CONTAINER + EXAMPLES NOT WORKING!!! FIX IDS + if 'ssj' in headword_pattern_dict and semantic_role in headword_pattern_dict['ssj']['sr_data']: for syn_struct_id, syn_struct_dict in headword_pattern_dict['ssj']['sr_data'][semantic_role].items(): if syn_struct_id not in syntactic_structure_dict: @@ -1121,22 +1114,6 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, lexem.set('sloleks', prep_id) lexem.text = l[2] - - # if 'gf' in headword_pattern_dict and semantic_role in headword_pattern_dict['gf']['sr_data']: - # for syn_struct_id, syn_struct_dict in headword_pattern_dict['gf']['sr_data'][semantic_role].items(): - # syntacticStructure = lxml.SubElement(syntacticStructureList, 'syntacticStructure') - # syntacticStructure.set('id', syn_struct_id) - # for com_num, com_set in syn_struct_dict.items(): - # # component = lxml.SubElement(syntacticStructure, 'component') - # # component.set('num', com_num) - # for lex in com_set: - # component = lxml.SubElement(syntacticStructure, 'component') - # component.set('num', com_num) - # lexem = lxml.SubElement(component, 'lexeme') - # lexem.set('sloleks', '') - # lexem.text = lex - - patternRepresentation = lxml.SubElement(valencyPattern, 'patternRepresentation') patternRepresentation.text = patternTranslationText @@ -1156,8 +1133,6 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, encoding='utf-8') as xf: xf.write(dictionary, pretty_print=True) pbar.update(1) - # xf.write(entry, pretty_print=True) - # tree.write(output_file_name, encoding='UTF-8', pretty_print=True) def init_db(db): @@ -1225,12 +1200,7 @@ def init_db(db): def match_file(words, structures): matches = [] - if words[0].text == 'Ena': - a = 0 - for s in structures: - if s.id == '89': - a = 1 for w in words: mhere = s.match(w) for match in mhere: @@ -1252,7 +1222,6 @@ possible_jos_links = {'dol', 'del', 'prir', 'vez', 'skup', 'ena', 'dve', 'tri', def find_word_sons(word, deppar_dict, word_id, role): for k, v in word.links.items(): - # if k != 'default_factory': for w in v: # if k in possible_jos_links and w.id == 'ssj1.1.1.t21': # print('here') @@ -1280,16 +1249,12 @@ def extract_sentences(w_collection, w_a_collection, args, input_corpus, input_co timeinfo = TimeInfo(len(input_corpus)) database = Database(args) - # match_store = MatchStore(args, database) - # word_stats = WordStats(lemma_msds, database) formatted_sentences = {} start_time = time.time() - # print(time.time() - start_time) sentences_num_limit = 10000 sentences_in_ram = 0 - - # is_gf = input_corpus_orig is not None + sentence_glue_numbers = None is_gf = input_corpus_orig is not None if is_gf: @@ -1297,13 +1262,24 @@ def extract_sentences(w_collection, w_a_collection, args, input_corpus, input_co for sent_id, sentence, othr_sentence_attributes in load_files(args, database, w_collection, input_corpus): if is_gf: + # create tuple for comparison with sentence_flue_words + sent_id_numbers = tuple([int(sid) for sid in sent_id[2:].split('.')]) + if sentence_glue_numbers is not None and sentence_glue_numbers > sent_id_numbers: + logging.warning( + f"Skipping sentence in annotated sentence id (sent_id)! Annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}") + continue sentence_glue = next(glue_words_gen) + sentence_glue_numbers = tuple([int(sid) for sid in sentence_glue[0][2:].split('.')]) + while sentence_glue_numbers < sent_id_numbers: + logging.warning( + f"Skipping sentence in original sentence id (sentence_glue)! Annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}") + sentence_glue = next(glue_words_gen) if sent_id != sentence_glue[0]: raise Exception(f"Annotated gigafida and original gigafida not in sync (annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}") if len(sentence_glue[1]) != len(sentence): - raise Exception( - f"Annotated gigafida and original gigafida size is not the same (annotated: {len(sentence)}, original: {len(sentence_glue[1])}") + logging.warning(f"Skipping sentence! Annotated gigafida and original gigafida size is not the same (annotated: {len(sentence)}, original: {len(sentence_glue[1])}") + continue for w, w_glue in zip(sentence, sentence_glue[1]): w.glue = w_glue[2] if sentence is None: @@ -1644,21 +1620,6 @@ if __name__ == '__main__': args = arg_parser.parse_args() logging.basicConfig(stream=sys.stderr, level=args.verbose.upper()) - # try: - # sys.path.insert(1, args.structure_extraction) - # from progress_bar import progress - # from word import Word, WordCompressed - # from syntactic_structure import build_structures - # from match_store import MatchStore - # from word_stats import WordStats - # from writer import Writer - # from loader import load_files, file_sentence_glue_generator - # from database import Database - # from time_info import TimeInfo - # from msd_translate import MSD_TRANSLATE - # except: - # raise - start = time.time() main(args) logging.info("TIME: {}".format(time.time() - start))