#!/usr/bin/python3
# -*- coding: utf-8 -*-

# imports
from luscenje_struktur import copy
import csv

from luscenje_struktur.progress_bar import progress
from luscenje_struktur.word import Word, WordCompressed
from luscenje_struktur.syntactic_structure import build_structures
from luscenje_struktur.match_store import MatchStore
from luscenje_struktur.word_stats import WordStats
from luscenje_struktur.writer import Writer
from luscenje_struktur.loader import load_files, file_sentence_glue_generator
from luscenje_struktur.database import Database
from luscenje_struktur.time_info import TimeInfo
from luscenje_struktur.msd_translate import MSD_TRANSLATE

# make database-service
import gc
import re
import string
from collections import OrderedDict
import sys
from tqdm import tqdm
import pymongo

# sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/valency')
# sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/cjvt-corpusparser')
from valency.Frame import frames_from_db_entry_headword
from valency.reduce_functions import reduce_functions

import argparse
import os
import shutil
import lxml.etree as lxml
import codecs
import logging
import pickle
import time
from io import StringIO
from lxml import etree
from pathlib import Path
import json

from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import Session, aliased
from sqlalchemy import create_engine
from sqlalchemy import func

from pymongo import MongoClient, UpdateOne, InsertOne

# examples_num = sys.maxsize
# corpus = 'ssj'
# corpus = 'gigafida'

translations = {
    'ACT': 'KDO/KAJ',
    'PAT': 'KOGA/KAJ',
    'RESLT': 'REZULTAT',
    'REC': 'KOMU/ČEMU',
    'TIME': 'KDAJ',
    'MANN': 'KAKO',
    'LOC': 'KJE',
    'MEANS': 'S ČIM',
    'GOAL': 'ČEMU',
    'REG': 'GLEDE NA KOGA/KAJ',
    'DUR': 'KOLIKO ČASA',
    'CAUSE': 'ZAKAJ',
    'COND': 'POD KATERIM POGOJEM',
    'ORIG': 'IZVOR',
    'FREQ': 'KOLIKOKRAT',
    'SOURCE': 'OD KOD',
    'AIM': 'S KAKŠNIM NAMENOM',
    'QUANT': 'ŠTEVILO',
    'EVENT': 'NA DOGODKU',
    'CONTR': 'KLJUB ČEMU',
    'ACMP': 'S KOM/ČIM',
    'RESTR': 'Z OMEJITVIJO',
    'MWPRED': '',
    'MODAL': '',
    'PHRAS': ''
}

CATEGORY_MAP = {
    'noun': 'samostalnik',
    'verb': 'glagol',
    'adjective': 'pridevnik',
    'adverb': 'prislov',
    'pronoun': 'zaimek',
    'numeral': 'števnik',
    'preposition': 'predlog',
    'conjunction': 'veznik',
    'particle': 'členek',
    'interjection': 'medmet',
    'abbreviation': 'okrajšava',
    'residual': 'neuvrščeno'
}

ASPECT_MAP = {
    'perfective': 'dovršni',
    'progressive': 'nedovršni',
    'biaspectual': 'dvovidski'
}

CASE_MAP = {
    'n': 'nominative',
    'g': 'genitive',
    'd': 'dative',
    'a': 'accusative',
    'l': 'locative',
    'i': 'instrumental'
}

ssj_frequency_dict = {}

Lexeme = None
LexemeFeature = None
Feature = None
LexicalUnitLexeme = None
LexicalUnit = None
LexicalUnitType = None
Category = None
Sense = None
Measure = None
LexicalUnitMeasure = None
Corpus = None
Definition = None
WordForm = None
WordFormFeature = None
FormRepresentation = None
FormEncoding = None


def hws_generator(collection, headword_text, RF, mongo):
    cur = collection.find({"headwords": headword_text})
    frames = []
    for ent in cur:
        # pre-process this step for prod TODO
        frames += frames_from_db_entry_headword(ent, headword_text)
    cur.close()

    ret_frames = RF(frames, mongo.db.sensemap)
    for frame in ret_frames:
        frame_json = frame.to_json()
        yield frame_json


def get_sentences_of_interest(headword_category, collection, w_collection, RF, mongo, pbar,
                              status_collection, corpus_type):
    sentences_of_interest = {}
    # sort headwords alphabetically (the result has to be assigned; sorted() does not sort in place)
    headword_category = sorted(headword_category, key=lambda x: x[0])

    # num_sentences in RAM at once
    sentences_num_limit = 15000
    sentences_in_ram = 0

    status_collection_update_list = []
    for headword_id, (headword_text, category_text) in enumerate(headword_category):
        # check whether this headword has already been processed
        if status_collection.count_documents({'corpus_type': corpus_type, 'headword_text': headword_text,
                                              'part': 'p1'}):
            pbar.update(1)
            continue

        headword_patterns_ids = {}
        cur = collection.find({"headwords": headword_text})
        frames = []
        for ent in cur:
            # pre-process this step for prod TODO
            frames += frames_from_db_entry_headword(ent, headword_text)
        cur.close()

        ret_frames = RF(frames, mongo.db.sensemap)
        json_ret = {"frames": []}
        for frame in ret_frames:
            frame_json = frame.to_json()
            json_ret["frames"].append(frame_json)

        # get xml values
        for hws in json_ret.values():
            for hw in hws:
                # generate valency pattern key
                valency_pattern_key = []
                functors = {}
                if len(hw['tids']) != 1:
                    raise Exception('Multiple TIDS')
                for slot in hw['slots']:
                    valency_pattern_key.append(slot['functor'])
                    for tid in slot['tids']:
                        if tid not in functors:
                            functors[tid] = {}
                        functors[tid] = slot['functor']
                valency_pattern_key = tuple(sorted(valency_pattern_key))

                if valency_pattern_key not in headword_patterns_ids:
                    headword_patterns_ids[valency_pattern_key] = []

                for sentence in hw['sentences']:
                    sentence_id = sentence[0][0].rsplit('.', 1)[0]
                    if sentence_id not in sentences_of_interest:
                        sentences_of_interest[sentence_id] = {}

                    # locate the position (idi) of the headword token
                    idi = 0
                    parent_idi = -1
                    for idx, word in sentence:
                        if idx == hw['tids'][0]:
                            parent_idi = idi
                        if word['word']:
                            idi += 1
                    if parent_idi == -1:
                        raise Exception('No parent found!')

                    # map token positions to (headword position, semantic role)
                    idi = 0
                    for idx, word in sentence:
                        if idx in functors:
                            sentences_of_interest[sentence_id][str(idi)] = (str(parent_idi), functors[idx])
                        if word['word']:
                            idi += 1

                    headword_patterns_ids[valency_pattern_key].append(sentence_id)
                    # flush the batch to MongoDB once it grows past the limit
                    if sentences_in_ram >= sentences_num_limit:
                        sentences_in_ram = 0

                        if len(status_collection_update_list) > 0:
                            status_collection.bulk_write(status_collection_update_list)
                        requests = [UpdateOne({'_id': k}, {'$set': v}, upsert=True)
                                    for k, v in sentences_of_interest.items()]
                        result = w_collection.bulk_write(requests)

                        del status_collection_update_list
                        del requests
                        del sentences_of_interest
                        gc.collect()

                        status_collection_update_list = []
                        sentences_of_interest = {}

                    sentences_in_ram += 1

        pbar.update(1)
        status_collection_update_list.append(InsertOne({'corpus_type': corpus_type,
                                                        'headword_text': headword_text,
                                                        'part': 'p1'}))

    # final flush of whatever is still in memory
    if len(status_collection_update_list) > 0:
        status_collection.bulk_write(status_collection_update_list)
    requests = [UpdateOne({'_id': k}, {'$set': v}, upsert=True) for k, v in sentences_of_interest.items()]
    if len(requests) > 0:
        result = w_collection.bulk_write(requests)


def create_sentence_output(sentence, headword_id, corpus, sent_id):
    glue_outside = False
    headword_id = str(headword_id)
    parent_node = etree.Element('corpusExample')
    parent_node.set('corpusName', corpus)
    parent_node.set('exampleId', sent_id)
    cur_node = parent_node

    first_in_tag = True
    first_outside_tag = False
    in_dependency_tree = False
    # TODO use whole sentence!
    idi = 0
    attach_to = None
    p_cur_node = None
    p_attach_to = None
    p_glue_attach_to = None
    previous_word = None
    for word_id in range(len(sentence)):
        # SRL container output
        word = sentence[word_id]

        # sentence output
        if in_dependency_tree:
            if headword_id not in word[2] or in_dependency_tree != word[2][headword_id]:
                attach_to = cur_node
                p_glue_attach_to = cur_node
                cur_node = parent_node
                first_in_tag = True
                in_dependency_tree = False
                first_outside_tag = True

        if headword_id in word[2] and not in_dependency_tree:
            dep_tree = lxml.SubElement(cur_node, 'tree')
            dep_tree.set('role', word[2][headword_id])
            cur_node = dep_tree
            first_in_tag = True
            in_dependency_tree = word[2][headword_id]
            attach_to = None
            if p_glue_attach_to is not None:
                glue_outside = True

        if headword_id == str(idi) and not (len(word[0][0]) == 1 and re.match(r'^[\w]+$', word[0][0]) is None):
            comp = lxml.SubElement(cur_node, 'comp')
            comp.set('role', 'headword')

            if not first_outside_tag:
                if p_attach_to is None:
                    if p_cur_node is not None:
                        p_cur_node.text += previous_word[0][1]
                else:
                    p_attach_to.tail += previous_word[0][1]
            elif p_glue_attach_to is not None:
                if p_glue_attach_to.tail is None:
                    p_glue_attach_to.tail = previous_word[0][1]
                else:
                    p_glue_attach_to.tail += previous_word[0][1]

            word_text = word[0][0]
            comp.text = word_text
            attach_to = comp
            first_in_tag = True
            first_outside_tag = True
            p_cur_node = cur_node
            p_glue_attach_to = comp
            p_attach_to = attach_to
            previous_word = word
            idi += 1
            continue

        if word[1] and in_dependency_tree:
            col_id = -1
            for i, col in enumerate(word[1]):
                if headword_id in col[3]:
                    col_id = i
                    break
            if col_id != -1:
                comp = lxml.SubElement(cur_node, 'comp')
                comp.set('structure_id', word[1][col_id][0])
                comp.set('num', word[1][col_id][1])

                if not first_outside_tag:
                    if p_attach_to is None:
                        if p_cur_node is not None:
                            p_cur_node.text += previous_word[0][1]
                    else:
                        p_attach_to.tail += previous_word[0][1]
                elif p_glue_attach_to is not None:
                    if p_glue_attach_to.tail is None:
                        p_glue_attach_to.tail = previous_word[0][1]
                    else:
                        p_glue_attach_to.tail += previous_word[0][1]

                word_text = word[0][0]
                comp.text = word_text
                attach_to = comp
                first_in_tag = True
                first_outside_tag = True
                # Assuming one collocation per word
                p_cur_node = cur_node
                p_glue_attach_to = comp
                p_attach_to = attach_to
                previous_word = word
                idi += 1
                continue
        # plain text (and glue) outside of the headword / collocation components
        if not first_outside_tag:
            if p_attach_to is None:
                if p_cur_node is not None:
                    p_cur_node.text += previous_word[0][1]
            else:
                p_attach_to.tail += previous_word[0][1]
            word_text = word[0][0]
        else:
            word_text = ''
            if p_attach_to is None:
                if p_cur_node is not None:
                    word_text += previous_word[0][1]
            else:
                word_text += previous_word[0][1]
            if glue_outside:
                p_glue_attach_to.tail = previous_word[0][1]
                word_text = word[0][0]
            else:
                word_text += word[0][0]

        if attach_to is None:
            if cur_node.text is None:
                cur_node.text = word_text
            else:
                cur_node.text += word_text
        else:
            if attach_to.tail is None:
                attach_to.tail = word_text
            else:
                attach_to.tail += word_text

        first_in_tag = False
        first_outside_tag = False
        p_cur_node = cur_node
        p_attach_to = attach_to
        previous_word = word
        p_glue_attach_to = None

        # punctuation does not advance the token position
        if len(word[0][0]) == 1 and re.match(r'^[\w]+$', word[0][0]) is None:
            continue
        idi += 1

    return parent_node


def get_SRLcontainer_data(sentence, word_of_interest_id, summary):
    for word in sentence:
        if word_of_interest_id in word[2]:
            for col in word[1]:
                if word_of_interest_id in col[3]:
                    if word[2][word_of_interest_id] not in summary:
                        summary[word[2][word_of_interest_id]] = {}
                    if col[0] not in summary[word[2][word_of_interest_id]]:
                        summary[word[2][word_of_interest_id]][col[0]] = {}
                    if col[1] not in summary[word[2][word_of_interest_id]][col[0]]:
                        summary[word[2][word_of_interest_id]][col[0]][col[1]] = set()
                    if col[2][0] == 'S':
                        summary[word[2][word_of_interest_id]][col[0]][col[1]].add((word[0][0], col[2][1], word[3]))
    return summary


def valid_valency_pattern(valency_pattern_key):
    occurences = set()
    for v_p in valency_pattern_key:
        if v_p in occurences:
            return False
        occurences.add(v_p)
    return True


def obtain_xml_data(collection, w_a_collection, headword_text, RF, mongo, patterns, pattern_id_max,
                    valency_pattern_id_collection, corpus, examples_num, headword_patterns_ssj):
    cur = collection.find({"headwords": headword_text})
    frames = []
    for ent in cur:
        frames += frames_from_db_entry_headword(ent, headword_text)
    cur.close()
    ret_frames = RF(frames, mongo.db.sensemap)

    json_ret = {"frames": []}
    for frame in ret_frames:
        frame_json = frame.to_json()
        json_ret["frames"].append(frame_json)

    # get xml values
    headword_patterns = {}
    new_patterns = {}
    for hws in json_ret.values():
        for hw in hws:
            # generate valency pattern key
            valency_pattern_key = []
            for slot in hw['slots']:
                valency_pattern_key.append(slot['functor'])

            # sort valency_pattern_key by order provided in translations
            valency_pattern_key_new = []
            for key in translations:
                if key in valency_pattern_key:
                    valency_pattern_key_new.append(key)
            valency_pattern_key = tuple(valency_pattern_key_new)

            if valency_pattern_key not in headword_patterns:
                headword_patterns[valency_pattern_key] = {}
                headword_patterns[valency_pattern_key]['sentence_examples'] = []
                headword_patterns[valency_pattern_key]['sentence_num'] = 0
                headword_patterns[valency_pattern_key]['sr_data'] = {}

            if valency_pattern_key not in patterns and valency_pattern_key not in new_patterns:
                new_patterns[valency_pattern_key] = pattern_id_max
                patterns[valency_pattern_key] = pattern_id_max
                pattern_id_max += 1

            headword_patterns[valency_pattern_key]['id'] = patterns[valency_pattern_key]
            sr_data = headword_patterns[valency_pattern_key]['sr_data']
            tids = set(hw['tids'])
            if valency_pattern_key in headword_patterns_ssj:
                ssj_len = len(headword_patterns_ssj[valency_pattern_key]['sentence_examples'])
            else:
                ssj_len = 0

            for sentence in hw['sentences']:
                # get sentence example
                sent_id = sentence[0][0].rsplit('.', 1)[0]
                try:
                    cur = w_a_collection.find({'_id': sent_id})
                    db_sentence = next(iter(cur))['words']
                    cur.close()
                except StopIteration:
                    continue

                # locate the headword position within the sentence
                idi = 0
                hw_idi = -1
                for word_id, word in sentence:
                    if word_id in tids:
                        hw_idi = idi
                    if word['word']:
                        idi += 1
                if hw_idi == -1:
                    raise Exception('No such headword idi!')

                sr_data = get_SRLcontainer_data(db_sentence, str(hw_idi), sr_data)
                examples_included_num = 0
                if len(headword_patterns[valency_pattern_key]['sentence_examples']) + ssj_len < examples_num \
                        and valid_valency_pattern(valency_pattern_key):
                    examples_included_num += 1
                    sentence_example = create_sentence_output(db_sentence, hw_idi, corpus, sent_id)
                    headword_patterns[valency_pattern_key]['sentence_examples'].append(sentence_example)
                headword_patterns[valency_pattern_key]['sentence_num'] += 1

            headword_patterns[valency_pattern_key]['sr_data'] = sr_data

    # add patterns to db
    new_patterns_query = [InsertOne({'_id': v, 'semantic_roles': list(k)}) for k, v in new_patterns.items()]
    if len(new_patterns_query) > 0:
        result = valency_pattern_id_collection.bulk_write(new_patterns_query)

    # calculate statistics
    semantic_role_stats = {}
    sentence_tot = 0
    pattern_tot = len(headword_patterns)
    for key, val in headword_patterns.items():
        sentence_num = val['sentence_num']
        for sr in key:
            if sr in semantic_role_stats:
                semantic_role_stats[sr]['valency_pattern_num'] += 1
                semantic_role_stats[sr]['valency_sentence_num'] += sentence_num
            else:
                semantic_role_stats[sr] = {}
                semantic_role_stats[sr]['valency_pattern_num'] = 1
                semantic_role_stats[sr]['valency_sentence_num'] = sentence_num
        sentence_tot += sentence_num

    return headword_patterns, semantic_role_stats, sentence_tot, pattern_tot, pattern_id_max


def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj,
              w_a_collection_gigafida, valency_pattern_id_collection, corpus_name, pattern_examples_limit,
              ignore_gigafida, pbar):
    query_general = session.query(Lexeme.id, LexicalUnitLexeme.id, LexicalUnit.id, LexicalUnitMeasure.value,
                                  Lexeme.potential_lexeme, LexicalUnitType.name) \
        .join(Category, Category.id == Lexeme.category_id) \
        .join(LexicalUnitLexeme, LexicalUnitLexeme.lexeme_id == Lexeme.id) \
        .join(LexicalUnit, LexicalUnit.id == LexicalUnitLexeme.lexical_unit_id) \
        .join(LexicalUnitType, LexicalUnitType.id == LexicalUnit.type_id) \
        .join(LexicalUnitMeasure, LexicalUnitMeasure.lexical_unit_id == LexicalUnit.id) \
        .join(Measure, Measure.id == LexicalUnitMeasure.measure_id) \
        .join(Corpus, Corpus.id == LexicalUnitMeasure.corpus_id) \
        .filter(LexicalUnitType.name == 'single_lexeme_unit') \
        .filter(Measure.name == 'frequency') \
        .filter(Corpus.name == 'gigafida') \
        .filter(Corpus.version == '2.0')

    # used to not repeat search queries for prepositions
    preposition_list = {}

    for headword_text, category_text in headword_category:
        cur = valency_pattern_id_collection.find()
        patterns = {tuple(v_p['semantic_roles']): v_p['_id'] for v_p in [a for a in cur]}
        cur.close()
        pattern_id_max = len(patterns) + 1

        # get data
        headword_patterns_ssj, semantic_role_stats_ssj, sentence_tot_ssj, pattern_tot_ssj, pattern_id_max = \
            obtain_xml_data(collection_ssj, w_a_collection_ssj, headword_text, RF, mongo, patterns, pattern_id_max,
                            valency_pattern_id_collection, 'ssj500k 2.2', pattern_examples_limit, {})
        if not ignore_gigafida:
            headword_patterns_gf, semantic_role_stats_gf, sentence_tot_gf, pattern_tot_gf, pattern_id_max = \
                obtain_xml_data(collection_gigafida, w_a_collection_gigafida, headword_text, RF, mongo, patterns,
                                pattern_id_max, valency_pattern_id_collection, 'Gigafida 2.0',
                                pattern_examples_limit, headword_patterns_ssj)

        wf1 = aliased(WordFormFeature)
        wf2 = aliased(WordFormFeature)
        wf3 = aliased(WordFormFeature)
        query_preposition = session.query(FormEncoding.text) \
            .join(FormRepresentation, FormRepresentation.id == FormEncoding.form_representation_id) \
            .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
            .join(Lexeme, Lexeme.id == WordForm.lexeme_id) \
            .join(wf1, wf1.word_form_id == WordForm.id) \
            .join(wf2, wf2.word_form_id == WordForm.id) \
            .join(wf3, wf3.word_form_id == WordForm.id) \
            .filter(Lexeme.lemma == headword_text) \
            .filter(wf1.value == 'singular') \
            .filter(wf2.value == 'third') \
            .filter(wf3.value == 'present')

        pattern_translation_hws = query_preposition.all()
        pattern_translation_3_sin = headword_text
        if len(pattern_translation_hws) == 1:
            pattern_translation_3_sin = pattern_translation_hws[0].text

        qname = etree.QName("http://www.w3.org/2001/XMLSchema-instance", "noNamespaceSchemaLocation")
        dictionary = lxml.Element('dictionary', {qname: 'valency_lexicon.xsd'})

        if headword_text[-1] == '_':
            headword_text_query = headword_text[:-1]
        else:
            headword_text_query = headword_text

        query = query_general.filter(Category.name == category_text) \
            .filter(Lexeme.lemma == headword_text_query) \
            .group_by(Lexeme.id, LexicalUnitLexeme.id, LexicalUnit.id, LexicalUnitMeasure.value,
                      LexicalUnitType.name)
        query_res = query.all()

        if len(query_res) == 1:
            (lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, frequency, _, lexical_unit_type_name) = \
                query_res[0]
            sense_ids = session.query(Sense.id, Sense.potential_sense).filter(
                Sense.lexical_unit_id == lexical_unit_id).all()
            features = session.query(LexemeFeature.value).join(Feature, Feature.id == LexemeFeature.feature_id) \
                .filter(LexemeFeature.lexeme_id == lexeme_id) \
                .filter(Feature.name == 'aspect').all()
        elif len(query_res) > 1:
            # find dummy
            dummy_query = session.query(Lexeme.id, LexicalUnitLexeme.id, LexicalUnit.id, Lexeme.potential_lexeme,
                                        LexicalUnitType.name) \
                .join(Category, Category.id == Lexeme.category_id) \
                .join(LexicalUnitLexeme,
                      LexicalUnitLexeme.lexeme_id == Lexeme.id) \
                .join(LexicalUnit, LexicalUnit.id == LexicalUnitLexeme.lexical_unit_id) \
                .join(LexicalUnitType, LexicalUnitType.id == LexicalUnit.type_id) \
                .filter(LexicalUnitType.name == 'single_lexeme_unit') \
                .filter(Corpus.name == 'gigafida') \
                .filter(Corpus.version == '2.0') \
                .filter(Lexeme.lemma == headword_text_query).all()

            # all lexical_unit_ids equal or at least one dummy
            dummy_exists = False
            final_lexical_unit_id = 0
            final_lexical_unit_lexeme_id = 0
            for r in dummy_query:
                (lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, dummy, lexical_unit_type_name) = r
                if dummy:
                    final_lexical_unit_id = lexical_unit_id
                    final_lexical_unit_lexeme_id = lexical_unit_lexeme_id
                    dummy_exists = True
                    break
            assert dummy_exists

            sense_ids = []
            features_set = set()
            frequency = 0
            for r in query_res:
                (lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, el_frequency, dummy,
                 lexical_unit_type_name) = r
                if dummy:
                    continue
                sense_ids.extend(session.query(Sense.id, Sense.potential_sense).filter(
                    Sense.lexical_unit_id == lexical_unit_id).all())
                features = session.query(LexemeFeature.value) \
                    .join(Feature, Feature.id == LexemeFeature.feature_id) \
                    .filter(LexemeFeature.lexeme_id == lexeme_id) \
                    .filter(Feature.name == 'aspect').all()

                # set features in dictionary
                if not features:
                    for n_feat in features_set:
                        for f in n_feat:
                            features.add(f)
                # compare features
                else:
                    for n_feat in features_set:
                        for f in n_feat:
                            if f not in features:
                                raise Exception('Different features in query_res - might be problematic!')

                frequency += el_frequency

            # check if any actual sense exists; if not, erase all but one
            any_sense_not_dummy = any([not sense[1] for sense in sense_ids])
            if not any_sense_not_dummy:
                sense_ids = sense_ids[-1:]

            lexical_unit_id = final_lexical_unit_id
            lexical_unit_lexeme_id = final_lexical_unit_lexeme_id
        else:
            frequency = None
            lexeme_id = None
            lexical_unit_id = None
            lexical_unit_lexeme_id = None
            lexical_unit_type_name = None
            sense_ids = []
            features = []

        entry = lxml.SubElement(dictionary, 'entry')
        head = lxml.SubElement(entry, 'head')
        headword = lxml.SubElement(head, 'headword')
        lemma = lxml.SubElement(headword, 'lemma')
        lemma.text = headword_text

        lexical_unit = lxml.SubElement(head, 'lexicalUnit')
        if lexical_unit_id is not None:
            lexical_unit.set('id', str(lexical_unit_id))
        if lexical_unit_type_name is not None:
            lexical_unit_type_name = 'single' if lexical_unit_type_name == 'single_lexeme_unit' \
                else lexical_unit_type_name
            lexical_unit.set('type', lexical_unit_type_name)
        lexeme = lxml.SubElement(lexical_unit, 'lexeme')
        if lexical_unit_lexeme_id is not None:
            lexeme.set('lexical_unit_lexeme_id', str(lexical_unit_lexeme_id))
        lexeme.text = headword_text

        grammar = lxml.SubElement(head, 'grammar')
        category = lxml.SubElement(grammar, 'category')
        if args.language == 'sl':
            category.text = CATEGORY_MAP[category_text] if category_text in CATEGORY_MAP else ''
        else:
            category.text = category_text

        ssj_frequency = None
        if len(features) > 0:
            grammarFeature = lxml.SubElement(grammar, 'grammarFeature')
            ssj_frequency = ssj_frequency_dict[(headword_text, features[0].value)] \
                if (headword_text, features[0].value) in ssj_frequency_dict else None
            if args.language == 'sl':
                grammarFeature.set('name', 'vid')
                grammarFeature.text = ASPECT_MAP[features[0].value]
            else:
                grammarFeature.set('name', 'aspect')
                grammarFeature.text = features[0].value

        measureList = lxml.SubElement(head, 'measureList')
        if frequency:
            measure = lxml.SubElement(measureList, 'measure')
            measure.set('type', 'frequency')
            measure.set('source', 'Gigafida 2.0')
            measure.text = str(int(frequency))
        if ssj_frequency is not None:
            measure = lxml.SubElement(measureList, 'measure')
            measure.set('type', 'frequency')
            measure.set('source', 'ssj500k 2.2')
            measure.text = str(int(ssj_frequency))

        body = lxml.SubElement(entry, 'body')
        statisticsContainerList = lxml.SubElement(body, 'statisticsContainerList')

        # combine semantic_role_stats
        semantic_role_stats = {}
        for semanticRole_val, semanticRole_stats in semantic_role_stats_ssj.items():
            semantic_role_stats[semanticRole_val] = {}
            semantic_role_stats[semanticRole_val]['ssj'] = semanticRole_stats
        if not ignore_gigafida:
            for semanticRole_val, semanticRole_stats in semantic_role_stats_gf.items():
                if semanticRole_val not in semantic_role_stats:
                    semantic_role_stats[semanticRole_val] = {}
                semantic_role_stats[semanticRole_val]['gf'] = semanticRole_stats

        for semanticRole_val, semanticRole_stats in semantic_role_stats.items():
            statisticsContainer = lxml.SubElement(statisticsContainerList, 'statisticsContainer')
            semanticRole = lxml.SubElement(statisticsContainer, 'semanticRole')
            semanticRole.text = semanticRole_val
            measureList = lxml.SubElement(statisticsContainer, 'measureList')

            if 'ssj' in semanticRole_stats:
                measure_pattern_ssj = lxml.SubElement(measureList, 'measure')
                measure_pattern_ssj.set('type', 'valency_pattern_ratio')
                measure_pattern_ssj.set('source', 'ssj500k 2.2')
                measure_pattern_ssj.text = '%.4f' % (
                    semantic_role_stats[semanticRole_val]['ssj']['valency_pattern_num'] / pattern_tot_ssj)

                measure_sentence_ssj = lxml.SubElement(measureList, 'measure')
                measure_sentence_ssj.set('type', 'valency_sentence_ratio')
                measure_sentence_ssj.set('source', 'ssj500k 2.2')
                if sentence_tot_ssj == 0:
                    measure_sentence_ssj.text = '%.4f' % (0.0)
                else:
                    measure_sentence_ssj.text = '%.4f' % (
                        semantic_role_stats[semanticRole_val]['ssj']['valency_sentence_num'] / sentence_tot_ssj)

            if 'gf' in semanticRole_stats and not ignore_gigafida:
                measure_pattern_gf = lxml.SubElement(measureList, 'measure')
                measure_pattern_gf.set('type', 'valency_pattern_ratio')
                measure_pattern_gf.set('source', 'Gigafida 2.0')
                measure_pattern_gf.text = '%.4f' % (
                    semantic_role_stats[semanticRole_val]['gf']['valency_pattern_num'] / pattern_tot_gf)

                measure_sentence_gf = lxml.SubElement(measureList, 'measure')
                measure_sentence_gf.set('type', 'valency_sentence_ratio')
                measure_sentence_gf.set('source', 'Gigafida 2.0')
                if sentence_tot_gf == 0:
                    measure_sentence_gf.text = '%.4f' % (0.0)
                else:
                    measure_sentence_gf.text = '%.4f' % (
                        semantic_role_stats[semanticRole_val]['gf']['valency_sentence_num'] / sentence_tot_gf)

        senseList = lxml.SubElement(body, 'senseList')

        # handle cases when headword is not in sloleks
        if len(sense_ids) == 0:
            sense_ids = [-1]

        for sense_id in sense_ids:
            if len(sense_ids) > 1 and sense_id.potential_sense:
                continue
            sense = lxml.SubElement(senseList, 'sense')
            if not sense_id == -1 and not sense_id.potential_sense:
                sense.set('id', str(sense_id.id))

            definitionList = lxml.SubElement(sense, 'definitionList')
            if not sense_id == -1:
                definition_texts = session.query(Definition.description).filter(
                    Definition.sense_id == sense_id.id).all()
            else:
                definition_texts = []
            for definition_text in definition_texts:
                definition = lxml.SubElement(definitionList, 'definition')
                definition.text = definition_text[0]

            valencyPatternList = lxml.SubElement(sense, 'valencyPatternList')
            valencyPatternList.set('system', 'JOS')

            # combine headword_patterns from both corpora
            headword_patterns = {}
            for headword_patterns_val, headword_patterns_stats in headword_patterns_ssj.items():
                headword_patterns[headword_patterns_val] = {}
                headword_patterns[headword_patterns_val]['ssj'] = headword_patterns_stats
            if not ignore_gigafida:
                for headword_patterns_val, headword_patterns_stats in headword_patterns_gf.items():
                    if headword_patterns_val not in headword_patterns:
                        headword_patterns[headword_patterns_val] = {}
                    headword_patterns[headword_patterns_val]['gf'] = headword_patterns_stats

            for headword_pattern, headword_pattern_dict in headword_patterns.items():
                valencyPattern = lxml.SubElement(valencyPatternList, 'valencyPattern')
                valencyPattern.set('id', str(patterns[headword_pattern]))

                measureList_sense = lxml.SubElement(valencyPattern, 'measureList')
                if 'ssj' in headword_pattern_dict:
                    measure_sense = lxml.SubElement(measureList_sense, 'measure')
                    measure_sense.set('type', 'frequency_all')
                    measure_sense.set('source', 'ssj500k 2.2')
                    measure_sense.text = str(headword_pattern_dict['ssj']['sentence_num'])
                if not ignore_gigafida and 'gf' in headword_pattern_dict \
                        and headword_pattern_dict['gf']['sentence_num']:
                    measure_sense = lxml.SubElement(measureList_sense, 'measure')
                    measure_sense.set('type', 'frequency_all')
                    measure_sense.set('source', 'Gigafida 2.0')
                    measure_sense.text = str(headword_pattern_dict['gf']['sentence_num'])

                semanticRoleContainerList = lxml.SubElement(valencyPattern, 'semanticRoleContainerList')

                if 'ACT' in headword_pattern:
                    patternTranslationText = 'KDO/KAJ ' + pattern_translation_3_sin
                else:
                    patternTranslationText = headword_text

                for semantic_role in headword_pattern:
                    if semantic_role != 'ACT':
                        # additional rules
                        patternTranslationText += ' ' + translations[semantic_role]

                    semanticRoleContainer = lxml.SubElement(semanticRoleContainerList, 'semanticRoleContainer')
                    semanticRole = lxml.SubElement(semanticRoleContainer, 'semanticRole')
                    semanticRole.text = semantic_role

                    syntactic_structure_dict = {}
                    if 'ssj' in headword_pattern_dict and semantic_role in headword_pattern_dict['ssj']['sr_data']:
                        for syn_struct_id, syn_struct_dict in \
                                headword_pattern_dict['ssj']['sr_data'][semantic_role].items():
                            if syn_struct_id not in syntactic_structure_dict:
                                syntactic_structure_dict[syn_struct_id] = {}
                            for com_num, com_set in syn_struct_dict.items():
                                if com_num not in syntactic_structure_dict[syn_struct_id]:
                                    syntactic_structure_dict[syn_struct_id][com_num] = set()
                                for lex in com_set:
                                    syntactic_structure_dict[syn_struct_id][com_num].add(lex)
                    if 'gf' in headword_pattern_dict and semantic_role in headword_pattern_dict['gf']['sr_data']:
                        for syn_struct_id, syn_struct_dict in headword_pattern_dict['gf']['sr_data'][semantic_role].items():
                            if syn_struct_id not in syntactic_structure_dict:
                                syntactic_structure_dict[syn_struct_id] = {}
                            for com_num, com_set in syn_struct_dict.items():
                                if com_num not in syntactic_structure_dict[syn_struct_id]:
                                    syntactic_structure_dict[syn_struct_id][com_num] = set()
                                for lex in com_set:
                                    syntactic_structure_dict[syn_struct_id][com_num].add(lex)

                    if len(syntactic_structure_dict) > 0:
                        syntacticStructureList = lxml.SubElement(semanticRoleContainer, 'syntacticStructureList')

                        # iterate over syntactic structures and write them
                        for syn_struct_id, component_dict in syntactic_structure_dict.items():
                            syntacticStructure = lxml.SubElement(syntacticStructureList, 'syntacticStructure')
                            syntacticStructure.set('id', syn_struct_id)
                            dedup_dict = {}
                            for comp_id, lexemes in component_dict.items():
                                for l in lexemes:
                                    if l in preposition_list:
                                        prep_id = preposition_list[l]
                                    else:
                                        query_preposition = session.query(Lexeme.id) \
                                            .join(Category, Category.id == Lexeme.category_id) \
                                            .join(LexemeFeature, LexemeFeature.lexeme_id == Lexeme.id) \
                                            .join(Feature, Feature.id == LexemeFeature.feature_id) \
                                            .filter(Lexeme.lemma == l[2]) \
                                            .filter(Feature.name == 'case') \
                                            .filter(LexemeFeature.value == CASE_MAP[l[1]]) \
                                            .group_by(Lexeme.id)
                                        preposition_ids = query_preposition.all()
                                        if len(preposition_ids) != 1:
                                            prep_id = ''
                                        else:
                                            prep_id = str(preposition_ids[0][0])
                                        preposition_list[l] = prep_id

                                    if comp_id in dedup_dict and prep_id in dedup_dict[comp_id] \
                                            and l[2] in dedup_dict[comp_id][prep_id]:
                                        continue
                                    dedup_dict.setdefault(comp_id, {})[prep_id] = l[2]

                                    component = lxml.SubElement(syntacticStructure, 'component')
                                    component.set('num', comp_id)
                                    lexem = lxml.SubElement(component, 'lexeme')
                                    lexem.set('sloleks', prep_id)
                                    lexem.text = l[2]

                patternRepresentation = lxml.SubElement(valencyPattern, 'patternRepresentation')
                patternRepresentation.text = patternTranslationText

                exampleContainerList = lxml.SubElement(valencyPattern, 'exampleContainerList')
                if 'ssj' in headword_pattern_dict:
                    for sentence_example in headword_pattern_dict['ssj']['sentence_examples']:
                        exampleContainer = lxml.SubElement(exampleContainerList, 'exampleContainer')
                        exampleContainer.append(copy.deepcopy(sentence_example))
                if 'gf' in headword_pattern_dict:
                    for sentence_example in headword_pattern_dict['gf']['sentence_examples']:
                        exampleContainer = lxml.SubElement(exampleContainerList, 'exampleContainer')
                        exampleContainer.append(copy.deepcopy(sentence_example))

        with lxml.xmlfile(os.path.join(args.outdir, 'VS10_' + headword_text + '_' + corpus_name + '.xml'),
                          encoding='utf-8') as xf:
            xf.write(dictionary, pretty_print=True)
        pbar.update(1)


def init_db(db):
    global Lexeme, LexemeFeature, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, \
        Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation, FormEncoding
    [db_user, db_password, db_database, db_host] = db.split(':')
    Base = declarative_base()
    engine = create_engine('postgresql://' + db_user + ':' + db_password + '@' + db_host + '/' + db_database,
                           pool_recycle=14400)
    Base.metadata.reflect(engine)

    class Lexeme(Base):
        __table__ = Base.metadata.tables['jedro_lexeme']

    class LexemeFeature(Base):
        __table__ = Base.metadata.tables['jedro_lexeme_feature']

    class Feature(Base):
        __table__ = Base.metadata.tables['jedro_feature']

    class LexicalUnitLexeme(Base):
        __table__ = Base.metadata.tables['jedro_lexicalunit_lexeme']
    class LexicalUnit(Base):
        __table__ = Base.metadata.tables['jedro_lexicalunit']

    class LexicalUnitType(Base):
        __table__ = Base.metadata.tables['jedro_lexicalunittype']

    class Category(Base):
        __table__ = Base.metadata.tables['jedro_category']

    class Sense(Base):
        __table__ = Base.metadata.tables['jedro_sense']

    class Measure(Base):
        __table__ = Base.metadata.tables['jedro_measure']

    class LexicalUnitMeasure(Base):
        __table__ = Base.metadata.tables['jedro_lexicalunitmeasure']

    class Corpus(Base):
        __table__ = Base.metadata.tables['jedro_corpus']

    class Definition(Base):
        __table__ = Base.metadata.tables['jedro_definition']

    class WordForm(Base):
        __table__ = Base.metadata.tables['jedro_wordform']

    class WordFormFeature(Base):
        __table__ = Base.metadata.tables['jedro_wordform_feature']

    class FormRepresentation(Base):
        __table__ = Base.metadata.tables['jedro_formrepresentation']

    class FormEncoding(Base):
        __table__ = Base.metadata.tables['jedro_formencoding']

    return engine


def match_file(words, structures):
    matches = []
    for s in structures:
        for w in words:
            mhere = s.match(w)
            for match in mhere:
                # save only those with verbs in them
                if not [True for m in match.values() if m.msd[0] == 'V']:
                    continue
                colocation_id = [(idx, w.lemma) for idx, w in match.items()]
                colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x: x[0]))
                colocation_id = tuple(colocation_id)
                matches.append([match, colocation_id])
    return matches


possible_jos_links = {'dol', 'del', 'prir', 'vez', 'skup', 'ena', 'dve', 'tri', 'štiri', 'modra'}


def find_word_sons(word, deppar_dict, word_id, role, parents):
    if word.id in parents:
        return False
    for k, v in word.links.items():
        for w in v:
            if k in possible_jos_links:
                if w.id not in deppar_dict:
                    deppar_dict[w.id] = {}
                deppar_dict[w.id][word_id] = role
                if not find_word_sons(w, deppar_dict, word_id, role, parents + [word.id]):
                    return False
    return True


# for ignoring punctuation
def idi_word_generator(sentence):
    idi = 0
    for word in sentence:
        if len(word.text) == 1 and re.match(r'^[\w]+$', word.text) is None:
            continue
        yield idi, word
        idi += 1


def extract_sentences(w_collection, w_a_collection, args, input_corpus, input_corpus_orig):
    structures, _, max_num_components = build_structures(args)
    timeinfo = TimeInfo(len(input_corpus))
    database = Database(args)

    formatted_sentences = {}
    start_time = time.time()
    sentences_num_limit = 15000
    sentences_in_ram = 0

    sentence_glue_numbers = None
    is_gf = input_corpus_orig is not None
    if is_gf:
        glue_words_gen = file_sentence_glue_generator(input_corpus_orig, args.pc_tag, w_collection)

    for sent_id, sentence, othr_sentence_attributes in load_files(args, database, w_collection, input_corpus):
        if is_gf:
            # create tuple for comparison with sentence_glue_words
            sent_id_numbers = tuple([int(sid) for sid in sent_id[2:].split('.')])
            if sentence_glue_numbers is not None and sentence_glue_numbers > sent_id_numbers:
                logging.warning(
                    f"Skipping sentence in annotated sentence id (sent_id)! "
                    f"Annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}")
                continue

            sentence_glue = next(glue_words_gen)
            sentence_glue_numbers = tuple([int(sid) for sid in sentence_glue[0][2:].split('.')])
            while sentence_glue_numbers < sent_id_numbers:
                logging.warning(
                    f"Skipping sentence in original sentence id (sentence_glue)! "
                    f"Annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}")
                sentence_glue = next(glue_words_gen)
                sentence_glue_numbers = tuple([int(sid) for sid in sentence_glue[0][2:].split('.')])

            # has to be here for when next sentence_glue is selected in while loop
            if sentence_glue_numbers is not None and sentence_glue_numbers > sent_id_numbers:
                logging.warning(
                    f"Skipping sentence in annotated sentence id (sent_id)! "
                    f"Annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}")
                continue

            if sent_id != sentence_glue[0]:
                raise Exception(f"Annotated gigafida and original gigafida not in sync "
                                f"(annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]})")
            if len(sentence_glue[1]) != len(sentence):
                logging.warning(f"Skipping sentence! Annotated gigafida and original gigafida size is not the same "
                                f"(annotated: {len(sentence)}, original: {len(sentence_glue[1])})")
                continue
            for w, w_glue in zip(sentence, sentence_glue[1]):
                w.glue = w_glue[2]

        if sentence is None:
            timeinfo.add_measurement(-1)
            continue

        matches = match_file(sentence, structures)

        for match in matches:
            match_idis = []
            for key, word in match[0].items():
                match_idis.append(word.idi)
            match.append(match_idis)

        collocations = {}
        for match in matches:
            for key, word in match[0].items():
                if word.id not in collocations:
                    collocations[word.id] = []
                collocations[word.id].append((match[1][0], key, word.msd[:2], match[2]))

        formatted_sentence = []
        deppar_dict = {}
        incorrect_sentence = False
        # create output and form dependency parsetree sons
        for idi, word in idi_word_generator(sentence):
            idi = str(idi)
            sentence_of_interest = othr_sentence_attributes
            if idi in othr_sentence_attributes:
                if word.id not in deppar_dict:
                    deppar_dict[word.id] = {}
                deppar_dict[word.id][sentence_of_interest[idi][0]] = sentence_of_interest[idi][1]
                if not find_word_sons(word, deppar_dict, sentence_of_interest[idi][0],
                                      sentence_of_interest[idi][1], []):
                    incorrect_sentence = True

        if incorrect_sentence:
            logging.warning(
f"Sentence {sent_id} contains srl connections that loop!") continue # print(time.time() - start_time) for word in sentence: if word.id in collocations: col = collocations[word.id] else: col = [] if word.id in deppar_dict: dp = deppar_dict[word.id] else: dp = {} formatted_sentence.append(((word.text, word.glue), col, dp, word.lemma)) # create_sentence_output(formatted_sentence, 4) formatted_sentences[sent_id] = formatted_sentence if sentences_in_ram >= sentences_num_limit: sentences_in_ram = 0 requests = [UpdateOne({'_id': k}, {'$set': {'words': v}}, upsert=True) for k, v in formatted_sentences.items()] result = w_a_collection.bulk_write(requests) formatted_sentences = {} sentences_in_ram += 1 # print(time.time() - start_time) requests = [UpdateOne({'_id': k}, {'$set': {'words': v}}, upsert=True) for k, v in formatted_sentences.items()] if len(requests) > 0: result = w_a_collection.bulk_write(requests) # force a bit of garbage collection # del sentence # del sent_id # del matches # gc.collect() print(time.time() - start_time) # return formatted_sentences # # timeinfo.add_measurement(time.time() - start_time) # # timeinfo.info() # # if no output files, just exit # if all([x == None for x in [args.out, args.out_no_stat, args.all, args.stats]]): # return # # # get word renders for lemma/msd # word_stats.generate_renders() # match_store.determine_colocation_dispersions() # # # figure out representations! # if args.out or args.out_no_stat: # match_store.set_representations(word_stats, structures) # # Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out( # structures, match_store) # Writer.make_output_no_stat_writer(args, max_num_components, match_store, word_stats).write_out( # structures, match_store) # Writer.make_all_writer(args, max_num_components, match_store, word_stats).write_out( # structures, match_store) # Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out( # structures, match_store) def get_headword_category(collection): """ Returns :return: List of tuples with all headwords in mongodb and their categories. 
""" headwords = sorted(collection.distinct("headwords")[1:]) if args.headwords: with open(args.headwords, 'w') as f: for item in headwords: f.write("%s\n" % item) headword_category = [(headword, 'verb') if headword[-1] != '_' else (headword, 'adjective') for headword in headwords] return headword_category def read_ssj500k_frequencies(path): with open(path, 'r') as f: reader = csv.reader(f, delimiter='\t') next(reader) for line in reader: ssj_frequency_dict[(line[1], line[-1])] = line[2] def main(args): # with Path('data/wordlist.json').open("r") as fp: # sskj_wordlist = json.load(fp) # # wordlist = set(sskj_wordlist['wordlist']) # wordlist = set(sskj_wordlist['wordlist']) print('beginning chunk') start_time = time.time() # user:user:valdb:127.0.0.1 [db_user, db_password, db_database, db_host] = args.mongo_db.split(':') mongo = MongoClient(username=db_user, password=db_password, authSource=db_database) db = mongo.valdb collection_ssj = db['ssj'] collection_gigafida = db['gigafida'] db2 = mongo.extvaldb # write collection w_collection_ssj = db2['ssj'] w_collection_gigafida = db2['gigafida'] w_a_collection_ssj = db2['ssj' + '_all'] w_a_collection_gigafida = db2['gigafida' + '_all'] status_collection = db2['status'] valency_pattern_id_collection = db2['valency_pattern_ids'] RF = reduce_functions["reduce_0"]["f"] # get all headwords from database # headword_category = get_headword_category(collection_ssj) with open(args.headwords, 'r') as read: headword_category = [(line[:-1], 'verb') for line in read.readlines()] assert args.language == 'en' or args.language == 'sl' shutil.rmtree(args.outdir, True) os.mkdir(args.outdir) engine = init_db(args.sloleks_db) # input_file = codecs.open(args.infile, 'r') # # input_file = [] # next(input_file) # category_map = {'samostalnik':'noun', 'glagol':'verb', 'pridevnik':'adjective', 'prislov':'adverb', 'števnik':'numeral', 'zaimek':'pronoun', 'medmet':'interjection', 'veznik':'conjunction'} session = Session(engine) # cur = collection.find({}) # # a = [] # cur_len = 0 # # num_empty_sent = 0 # for ent in cur: # cur_len += 1 # # s = frames_from_db_entry(ent) # # if not s: # # num_empty_sent += 1 # a += frames_from_db_entry(ent) print(time.time() - start_time) # print(num_empty_sent) print('get_sentences_of_interest') start_time = time.time() # sentences_of_interest = get_sentences_of_interest(headword_category, collection, w_collection, RF, mongo) # sentences_of_interest_stored = args.p1_processed if not args.p1_processed: with tqdm(total=len(headword_category)) as pbar: get_sentences_of_interest(headword_category, collection_ssj, w_collection_ssj, RF, mongo, pbar, status_collection, 'ssj') if not args.ignore_gigafida: with tqdm(total=len(headword_category)) as pbar: get_sentences_of_interest(headword_category, collection_gigafida, w_collection_gigafida, RF, mongo, pbar, status_collection, 'gigafida') # sentences_of_interest = OrderedDict(sorted(sentences_of_interest.items())) print(time.time() - start_time) # num_sentences = 0 # for el in all_sentences: # if el not in sentences_of_interest: # num_sentences += 1 # # print(num_sentences) # print(len(all_sentences)) print('extract_sentences') start_time = time.time() # formatted_sentences_stored = args.p2_processed if not args.p2_processed: gf_anno_paths = list(os.walk(args.input_gigafida_annotated)) gf_anno_paths = [os.path.join(p_t[0], f_n) for p_t in gf_anno_paths for f_n in p_t[2]] gf_orig_paths = list(os.walk(args.input_gigafida_original)) gf_orig_paths = sorted([os.path.join(p_t[0], f_n) for p_t in 
                                gf_orig_paths for f_n in p_t[2] if f_n[:2] == 'GF'])

        extract_sentences(w_collection_ssj, w_a_collection_ssj, args, args.input_sloleks, None)
        if not args.ignore_gigafida:
            extract_sentences(w_collection_gigafida, w_a_collection_gigafida, args, gf_anno_paths, gf_orig_paths)
    print(time.time() - start_time)

    print('write_xml')
    start_time = time.time()
    if args.ssj500k_frequencies is not None:
        read_ssj500k_frequencies(args.ssj500k_frequencies)
    with tqdm(total=len(headword_category)) as pbar:
        write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj,
                  w_a_collection_gigafida, valency_pattern_id_collection, args.corpus_name,
                  args.pattern_examples_limit, args.ignore_gigafida, pbar)
    print(time.time() - start_time)

    session.close()


if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser(description='Export and validate collocation data from DDD database.')
    arg_parser.add_argument('--sloleks_db', type=str, help='Sloleks (PostgreSQL) database credentials')
    arg_parser.add_argument('--mongo_db', type=str, help='MongoDB database credentials')
    arg_parser.add_argument('--schema', type=str, help='XML schema')
    arg_parser.add_argument('--infile', type=str, help='Input file')
    arg_parser.add_argument('--outdir', type=str, help='Output directory')
    arg_parser.add_argument('--headwords', type=str, default=None,
                            help='Path to file where headwords will be saved.')
    arg_parser.add_argument('--language', type=str, help='Language of certain attributes')
    arg_parser.add_argument('--corpus_name', type=str, help='Name of corpus to be written in outputs.')
    arg_parser.add_argument('--pattern_examples_limit', type=int, default=10, help='Max number of examples.')
    arg_parser.add_argument('--ignore_gigafida', action='store_true',
                            help='If set, ignore Gigafida in the output.')
    arg_parser.add_argument('--p1_processed', action='store_true',
                            help='Skip first part (obtaining sentences of interest) when they are already in DB.')
    arg_parser.add_argument('--p2_processed', action='store_true',
                            help='Skip second part (obtaining formatted sentences) when they are already in DB.')
    arg_parser.add_argument('--structures', help='Structure definitions in an xml file')
    arg_parser.add_argument('--input_sloleks', nargs='*',
                            help='Input file (gz or xml currently). If none, then just the database is loaded')
    arg_parser.add_argument('--input_gigafida_annotated',
                            help='Input file (gz or xml currently). If none, then just the database is loaded')
    arg_parser.add_argument('--input_gigafida_original',
                            help='Input file (gz or xml currently). If none, then just the database is loaded')
    arg_parser.add_argument('--out', help='Classic output file')
    arg_parser.add_argument('--out-no-stat', help='Output file, but without statistical columns')
    arg_parser.add_argument('--all', help='Additional output file, writes more data')
    arg_parser.add_argument('--stats', help='Output file for statistics')
    arg_parser.add_argument('--no-msd-translate', action='store_true',
                            help='MSDs are translated from Slovene to English by default')
    arg_parser.add_argument('--skip-id-check', action='store_true',
                            help='Skip checks that element ids are in the correct format')
    arg_parser.add_argument('--min_freq', help='Minimal frequency in output', type=int, default=0, const=1,
                            nargs='?')
    arg_parser.add_argument('--verbose', help='Enable verbose output to stderr',
                            choices=["warning", "info", "debug"], default="info", const="info", nargs='?')
    arg_parser.add_argument('--count-files', help="Count files: more verbose output", action='store_true')
    arg_parser.add_argument('--multiple-output', help='Generate one output for each syntactic structure',
                            action='store_true')
    arg_parser.add_argument('--sort-by', help="Sort by this column (index)", type=int, default=-1)
    arg_parser.add_argument('--sort-reversed', help="Sort in reversed order", action='store_true')
    arg_parser.add_argument('--db', help="Database file to use (instead of memory)", default=None)
    arg_parser.add_argument('--new-db', help="Overwrite the database file if one already exists",
                            action='store_true')
    arg_parser.add_argument('--pc-tag', help='Tag for separators, usually pc or c', default="pc")
    arg_parser.add_argument('--ssj500k-frequencies', help='Path to a TSV file with ssj500k frequencies',
                            default=None)
    args = arg_parser.parse_args()

    logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())
    start = time.time()
    main(args)
    logging.info("TIME: {}".format(time.time() - start))
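
# A minimal invocation sketch: the flags below mirror the argparse definitions above,
# but the script file name and every path/credential value are placeholders that have
# to be adapted to the local setup.
#
#   python3 this_script.py \
#       --mongo_db user:password:valdb:127.0.0.1 \
#       --sloleks_db user:password:sloleks_db:127.0.0.1 \
#       --structures structures.xml \
#       --input_sloleks data/ssj500k/*.xml \
#       --input_gigafida_annotated data/gf_annotated/ \
#       --input_gigafida_original data/gf_original/ \
#       --headwords data/headwords.txt \
#       --outdir data/output/ \
#       --corpus_name gigafida \
#       --language sl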