@ -350,7 +350,6 @@ def get_sentences_of_interest(headword_category, collection, w_collection, RF, m
pbar . update ( 1 )
status_collection_update_list . append ( InsertOne ( { ' corpus_type ' : corpus_type , ' headword_text ' : headword_text , ' part ' : ' p1 ' } ) )
# TODO uncomment
# if 'GF0010453.1116.1' in sentences_of_interest:
# a = sentences_of_interest['GF0010453.1116.1']
# print('here')
@ -526,7 +525,6 @@ def create_sentence_output(sentence, headword_id, corpus):
# else:
# word_text = word[0][0]
# word_text += word[0][1]
# TODO CHANGE THIS TO FIX SPACE LOCATIONS!
# word_text = word[0][0] + word[0][1]
if not first_outside_tag :
if p_attach_to is None :
@ -700,7 +698,6 @@ def obtain_xml_data(collection, w_a_collection, headword_text, RF, mongo, patter
# print('here')
sr_data = get_SRLcontainer_data ( db_sentence , str ( hw_idi ) , sr_data )
# TODO ERASE THIS
examples_included_num = 0
# sr_data = get_SRLcontainer_data(formatted_sentences[sent_id], hw_idi, sr_data)
@ -783,10 +780,6 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
mongo , patterns ,
pattern_id_max , valency_pattern_id_collection , ' Gigafida 2.0 ' , pattern_examples_limit , headword_patterns_ssj )
# TODO ERASE THIS!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
if len ( headword_patterns_ssj ) == 0 :
continue
wf1 = aliased ( WordFormFeature )
wf2 = aliased ( WordFormFeature )
@ -1066,7 +1059,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
semanticRole . text = semantic_role
syntactic_structure_dict = { }
# TODO EXPAND FROM SSJ DATA ONLY + FIX BUG ABOUT SEMANTIC ROLE CONTAINER + EXAMPLES NOT WORKING!!! FIX IDS
if ' ssj ' in headword_pattern_dict and semantic_role in headword_pattern_dict [ ' ssj ' ] [ ' sr_data ' ] :
for syn_struct_id , syn_struct_dict in headword_pattern_dict [ ' ssj ' ] [ ' sr_data ' ] [ semantic_role ] . items ( ) :
if syn_struct_id not in syntactic_structure_dict :
@ -1121,22 +1114,6 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
lexem . set ( ' sloleks ' , prep_id )
lexem . text = l [ 2 ]
# if 'gf' in headword_pattern_dict and semantic_role in headword_pattern_dict['gf']['sr_data']:
# for syn_struct_id, syn_struct_dict in headword_pattern_dict['gf']['sr_data'][semantic_role].items():
# syntacticStructure = lxml.SubElement(syntacticStructureList, 'syntacticStructure')
# syntacticStructure.set('id', syn_struct_id)
# for com_num, com_set in syn_struct_dict.items():
# # component = lxml.SubElement(syntacticStructure, 'component')
# # component.set('num', com_num)
# for lex in com_set:
# component = lxml.SubElement(syntacticStructure, 'component')
# component.set('num', com_num)
# lexem = lxml.SubElement(component, 'lexeme')
# lexem.set('sloleks', '')
# lexem.text = lex
patternRepresentation = lxml . SubElement ( valencyPattern , ' patternRepresentation ' )
patternRepresentation . text = patternTranslationText
@ -1156,8 +1133,6 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
encoding = ' utf-8 ' ) as xf :
xf . write ( dictionary , pretty_print = True )
pbar . update ( 1 )
# xf.write(entry, pretty_print=True)
# tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
def init_db ( db ) :
@ -1225,12 +1200,7 @@ def init_db(db):
def match_file ( words , structures ) :
matches = [ ]
if words [ 0 ] . text == ' Ena ' :
a = 0
for s in structures :
if s . id == ' 89 ' :
a = 1
for w in words :
mhere = s . match ( w )
for match in mhere :
@ -1252,7 +1222,6 @@ possible_jos_links = {'dol', 'del', 'prir', 'vez', 'skup', 'ena', 'dve', 'tri',
def find_word_sons ( word , deppar_dict , word_id , role ) :
for k , v in word . links . items ( ) :
# if k != 'default_factory':
for w in v :
# if k in possible_jos_links and w.id == 'ssj1.1.1.t21':
# print('here')
@ -1280,16 +1249,12 @@ def extract_sentences(w_collection, w_a_collection, args, input_corpus, input_co
timeinfo = TimeInfo ( len ( input_corpus ) )
database = Database ( args )
# match_store = MatchStore(args, database)
# word_stats = WordStats(lemma_msds, database)
formatted_sentences = { }
start_time = time . time ( )
# print(time.time() - start_time)
sentences_num_limit = 10000
sentences_in_ram = 0
# is_gf = input_corpus_orig is not None
sentence_glue_numbers = None
is_gf = input_corpus_orig is not None
if is_gf :
@ -1297,13 +1262,24 @@ def extract_sentences(w_collection, w_a_collection, args, input_corpus, input_co
for sent_id , sentence , othr_sentence_attributes in load_files ( args , database , w_collection , input_corpus ) :
if is_gf :
# create tuple of numeric id parts for comparison with sentence_glue_numbers
sent_id_numbers = tuple ( [ int ( sid ) for sid in sent_id [ 2 : ] . split ( ' . ' ) ] )
if sentence_glue_numbers is not None and sentence_glue_numbers > sent_id_numbers :
logging . warning (
f " Skipping sentence in annotated sentence id (sent_id)! Annotated sent_id = { sent_id } , original sent_id = { sentence_glue [ 0 ] } " )
continue
sentence_glue = next ( glue_words_gen )
sentence_glue_numbers = tuple ( [ int ( sid ) for sid in sentence_glue [ 0 ] [ 2 : ] . split ( ' . ' ) ] )
while sentence_glue_numbers < sent_id_numbers :
logging . warning (
f " Skipping sentence in original sentence id (sentence_glue)! Annotated sent_id = { sent_id } , original sent_id = { sentence_glue [ 0 ] } " )
sentence_glue = next ( glue_words_gen )
if sent_id != sentence_glue [ 0 ] :
raise Exception ( f " Annotated gigafida and original gigafida not in sync (annotated sent_id = { sent_id } , original sent_id = { sentence_glue [ 0 ] } " )
if len ( sentence_glue [ 1 ] ) != len ( sentence ) :
raise Exception (
f " Annotated gigafida and original gigafida size is not the same (annotated: { len ( sentence ) } , original: { len ( sentence_glue [ 1 ] ) } " )
logging . warning ( f " Skipping sentence! Annotated gigafida and original gigafida size is not the same (annotated: { len ( sentence ) } , original: { len ( sentence_glue [ 1 ] ) } " )
continue
for w , w_glue in zip ( sentence , sentence_glue [ 1 ] ) :
w . glue = w_glue [ 2 ]
if sentence is None :
@ -1644,21 +1620,6 @@ if __name__ == '__main__':
args = arg_parser . parse_args ( )
logging . basicConfig ( stream = sys . stderr , level = args . verbose . upper ( ) )
# try:
# sys.path.insert(1, args.structure_extraction)
# from progress_bar import progress
# from word import Word, WordCompressed
# from syntactic_structure import build_structures
# from match_store import MatchStore
# from word_stats import WordStats
# from writer import Writer
# from loader import load_files, file_sentence_glue_generator
# from database import Database
# from time_info import TimeInfo
# from msd_translate import MSD_TRANSLATE
# except:
# raise
start = time . time ( )
main ( args )
logging . info ( " TIME: {} " . format ( time . time ( ) - start ) )