From 5cdc963c2dc3ba021cdbaf018fa59da6c85e93a8 Mon Sep 17 00:00:00 2001
From: Luka <krsnik.luka92@gmail.com>
Date: Wed, 30 Sep 2020 09:51:41 +0200
Subject: [PATCH] Added warnings to skip sentences that do not match

---
 README.md             |  5 ++++
 scripts/create_xml.py | 69 ++++++++++---------------------------------
 2 files changed, 20 insertions(+), 54 deletions(-)
diff --git a/README.md b/README.md
index 3a51a2a..269b51b 100644
--- a/README.md
+++ b/README.md
@@ -223,4 +223,9 @@ docker exec -it ef0a /bin/bash
 
     # check if it worked by
     mongo --username <REGULAR USER> --password --authenticationDatabase valdb
+
+# make mongodb visible only privately
+docker stack rm dbstack
+cd dockerfiles/database/
+docker-compose up
 ```
\ No newline at end of file
diff --git a/scripts/create_xml.py b/scripts/create_xml.py
index 8d25b04..c6fe4df 100644
--- a/scripts/create_xml.py
+++ b/scripts/create_xml.py
@@ -350,7 +350,6 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m
         pbar.update(1)
         status_collection_update_list.append(InsertOne({'corpus_type': corpus_type, 'headword_text': headword_text, 'part': 'p1'}))
 
-    # TODO uncomment
     # if 'GF0010453.1116.1' in sentences_of_interest:
     #     a = sentences_of_interest['GF0010453.1116.1']
     #     print('here')
@@ -526,7 +525,6 @@ def create_sentence_output(sentence, headword_id, corpus):
         # else:
         #     word_text = word[0][0]
         # word_text += word[0][1]
-        # TODO CHANGE THIS TO FIX SPACE LOCATIONS!
         # word_text = word[0][0] + word[0][1]
         if not first_outside_tag:
             if p_attach_to is None:
@@ -700,7 +698,6 @@ def obtain_xml_data(collection, w_a_collection, headword_text, RF, mongo, patter
                 #     print('here')
 
                 sr_data = get_SRLcontainer_data(db_sentence, str(hw_idi), sr_data)
-                # TODO ERASE THIS
                 examples_included_num = 0
 
                 # sr_data = get_SRLcontainer_data(formatted_sentences[sent_id], hw_idi, sr_data)
@@ -783,10 +780,6 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
                                                                                                 mongo, patterns,
                                                                                                 pattern_id_max, valency_pattern_id_collection, 'Gigafida 2.0', pattern_examples_limit, headword_patterns_ssj)
 
-        # TODO ERASE THIS!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-        if len(headword_patterns_ssj) == 0:
-            continue
-
 
         wf1 = aliased(WordFormFeature)
         wf2 = aliased(WordFormFeature)
@@ -1066,7 +1059,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
                     semanticRole.text = semantic_role
 
                     syntactic_structure_dict = {}
-                    # TODO EXPAND FROM SSJ DATA ONLY + FIX BUG ABOUT SEMANTIC ROLE CONTAINER + EXAMPLES NOT WORKING!!! FIX IDS
+
                     if 'ssj' in headword_pattern_dict and semantic_role in headword_pattern_dict['ssj']['sr_data']:
                         for syn_struct_id, syn_struct_dict in headword_pattern_dict['ssj']['sr_data'][semantic_role].items():
                             if syn_struct_id not in syntactic_structure_dict:
@@ -1121,22 +1114,6 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
                                     lexem.set('sloleks', prep_id)
                                     lexem.text = l[2]
 
-
-                    # if 'gf' in headword_pattern_dict and semantic_role in headword_pattern_dict['gf']['sr_data']:
-                    #     for syn_struct_id, syn_struct_dict in headword_pattern_dict['gf']['sr_data'][semantic_role].items():
-                    #         syntacticStructure = lxml.SubElement(syntacticStructureList, 'syntacticStructure')
-                    #         syntacticStructure.set('id', syn_struct_id)
-                    #         for com_num, com_set in syn_struct_dict.items():
-                    #             # component = lxml.SubElement(syntacticStructure, 'component')
-                    #             # component.set('num', com_num)
-                    #             for lex in com_set:
-                    #                 component = lxml.SubElement(syntacticStructure, 'component')
-                    #                 component.set('num', com_num)
-                    #                 lexem = lxml.SubElement(component, 'lexeme')
-                    #                 lexem.set('sloleks', '')
-                    #                 lexem.text = lex
-
-
                 patternRepresentation = lxml.SubElement(valencyPattern, 'patternRepresentation')
                 patternRepresentation.text = patternTranslationText
 
@@ -1156,8 +1133,6 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
                               encoding='utf-8') as xf:
             xf.write(dictionary, pretty_print=True)
         pbar.update(1)
-            # xf.write(entry, pretty_print=True)
-            # tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
 
 
 def init_db(db):
@@ -1225,12 +1200,7 @@ def init_db(db):
 def match_file(words, structures):
     matches = []
 
-    if words[0].text == 'Ena':
-        a = 0
-
     for s in structures:
-        if s.id == '89':
-            a = 1
         for w in words:
             mhere = s.match(w)
             for match in mhere:
@@ -1252,7 +1222,6 @@ possible_jos_links = {'dol', 'del', 'prir', 'vez', 'skup', 'ena', 'dve', 'tri',
 
 def find_word_sons(word, deppar_dict, word_id, role):
     for k, v in word.links.items():
-        # if k != 'default_factory':
         for w in v:
             # if k in possible_jos_links and w.id == 'ssj1.1.1.t21':
             #     print('here')
@@ -1280,16 +1249,12 @@ def extract_sentences(w_collection, w_a_collection, args, input_corpus, input_co
     timeinfo = TimeInfo(len(input_corpus))
 
     database = Database(args)
-    # match_store = MatchStore(args, database)
-    # word_stats = WordStats(lemma_msds, database)
     formatted_sentences = {}
     start_time = time.time()
-    # print(time.time() - start_time)
 
     sentences_num_limit = 10000
     sentences_in_ram = 0
-
-    # is_gf = input_corpus_orig is not None
+    sentence_glue_numbers = None
 
     is_gf = input_corpus_orig is not None
     if is_gf:
@@ -1297,13 +1262,24 @@ def extract_sentences(w_collection, w_a_collection, args, input_corpus, input_co
 
     for sent_id, sentence, othr_sentence_attributes in load_files(args, database, w_collection, input_corpus):
         if is_gf:
+            # create tuple of ints from sent_id for comparison with sentence_glue_numbers
+            sent_id_numbers = tuple([int(sid) for sid in sent_id[2:].split('.')])
+            if sentence_glue_numbers is not None and sentence_glue_numbers > sent_id_numbers:
+                logging.warning(
+                    f"Skipping sentence in annotated sentence id (sent_id)! Annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}")
+                continue
             sentence_glue = next(glue_words_gen)
+            sentence_glue_numbers = tuple([int(sid) for sid in sentence_glue[0][2:].split('.')])
+            while sentence_glue_numbers < sent_id_numbers:
+                logging.warning(
+                    f"Skipping sentence in original sentence id (sentence_glue)! Annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}")
+                sentence_glue = next(glue_words_gen)
 
             if sent_id != sentence_glue[0]:
                 raise Exception(f"Annotated gigafida and original gigafida not in sync (annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}")
             if len(sentence_glue[1]) != len(sentence):
-                raise Exception(
-                    f"Annotated gigafida and original gigafida size is not the same (annotated: {len(sentence)}, original: {len(sentence_glue[1])}")
+                logging.warning(f"Skipping sentence! Annotated gigafida and original gigafida size is not the same (annotated: {len(sentence)}, original: {len(sentence_glue[1])}")
+                continue
             for w, w_glue in zip(sentence, sentence_glue[1]):
                 w.glue = w_glue[2]
         if sentence is None:
@@ -1644,21 +1620,6 @@ if __name__ == '__main__':
     args = arg_parser.parse_args()
     logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())
 
-    # try:
-    #     sys.path.insert(1, args.structure_extraction)
-    #     from progress_bar import progress
-    #     from word import Word, WordCompressed
-    #     from syntactic_structure import build_structures
-    #     from match_store import MatchStore
-    #     from word_stats import WordStats
-    #     from writer import Writer
-    #     from loader import load_files, file_sentence_glue_generator
-    #     from database import Database
-    #     from time_info import TimeInfo
-    #     from msd_translate import MSD_TRANSLATE
-    # except:
-    #     raise
-
     start = time.time()
     main(args)
     logging.info("TIME: {}".format(time.time() - start))