#!/usr/bin/python3

# imports from luscenje_struktur
from luscenje_struktur.progress_bar import progress
from luscenje_struktur.word import Word, WordCompressed
from luscenje_struktur.syntactic_structure import build_structures
from luscenje_struktur.match_store import MatchStore
from luscenje_struktur.word_stats import WordStats
from luscenje_struktur.writer import Writer
from luscenje_struktur.loader import load_files, file_sentence_glue_generator
from luscenje_struktur.database import Database
from luscenje_struktur.time_info import TimeInfo
from luscenje_struktur.msd_translate import MSD_TRANSLATE

# make database-service
import gc
import re
import string
from collections import OrderedDict
import sys
from tqdm import tqdm
import pymongo
# import tqdm as tqdm
# sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/valency')
# sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/cjvt-corpusparser')
from valency.Frame import frames_from_db_entry
from valency.reduce_functions import reduce_functions

import argparse
import os
import shutil
import lxml.etree as lxml
import codecs
import logging
import pickle
import time
from io import StringIO
from lxml import etree
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import Session, aliased
from sqlalchemy import create_engine
from sqlalchemy import func
from pymongo import MongoClient, UpdateOne, InsertOne

# examples_num = sys.maxsize
# corpus = 'ssj'

translations = {
    'ACT': 'KDO/KAJ',
    'PAT': 'KOGA/KAJ',
    'RESLT': 'REZULTAT',
    'REC': 'KOMU/ČEMU',
    'TIME': 'KDAJ',
    'MANN': 'KAKO',
    'LOC': 'KJE',
    'MEANS': 'S ČIM',
    'GOAL': 'ČEMU',
    'REG': 'GLEDE NA KOGA/KAJ',
    'DUR': 'KOLIKO ČASA',
    'CAUSE': 'ZAKAJ',
    'COND': 'POD KATERIM POGOJEM',
    'ORIG': 'IZVOR',
    'FREQ': 'KOLIKOKRAT',
    'SOURCE': 'OD KOD',
    'AIM': 'S KAKŠNIM NAMENOM',
    'QUANT': 'ŠTEVILO',
    'EVENT': 'NA DOGODKU',
    'CONTR': 'KLJUB ČEMU',
    'ACMP': 'S KOM/ČIM',
    'RESTR': 'Z OMEJITVIJO',
    'MWPRED': '',
    'MODAL': '',
    'PHRAS': ''
}

CATEGORY_MAP = {
    'noun': 'samostalnik',
    'verb': 'glagol',
    'adjective': 'pridevnik',
    'adverb': 'prislov',
    'pronoun': 'zaimek',
    'numeral': 'števnik',
    'preposition': 'predlog',
    'conjunction': 'veznik',
    'particle': 'členek',
    'interjection': 'medmet',
    'abbreviation': 'okrajšava',
    'residual': 'neuvrščeno'
}

ASPECT_MAP = {
    'perfective': 'dovršni',
    'progressive': 'nedovršni',
    'biaspectual': 'dvovidski'
}

CASE_MAP = {
    'n': 'nominative',
    'g': 'genitive',
    'd': 'dative',
    'a': 'accusative',
    'l': 'locative',
    'i': 'instrumental'
}

# ORM classes; created at runtime in init_db()
Lexeme = None
LexemeFeature = None
SyntacticStructure = None
StructureComponent = None
Feature = None
LexicalUnitLexeme = None
LexicalUnit = None
LexicalUnitType = None
Category = None
Sense = None
Measure = None
LexicalUnitMeasure = None
Corpus = None
Definition = None
WordForm = None
WordFormFeature = None
FormRepresentation = None

# corpus = 'gigafida'
from pathlib import Path
import json


def hws_generator(collection, headword_text, RF, mongo):
    cur = collection.find({"headwords": headword_text})
    frames = []
    for ent in cur:
        frames += frames_from_db_entry(ent)  # pre-process this step for prod TODO
    cur.close()
    # filter by relevant hw
    frames = [x for x in frames if x.hw == headword_text]
    ret_frames = RF(frames, mongo.db.sensemap)
    for frame in ret_frames:
        frame_json = frame.to_json()
        yield frame_json


def get_sentences_of_interest(headword_category, collection,
w_collection, RF, mongo, pbar): sentences_of_interest = {} # all_sentences = set() sorted(headword_category, key=lambda x: x[0]) # num_sentences in RAM at once sentences_num_limit = 15000 sentences_in_ram = 0 # part = 0 # start_time = time.time() # first_sentence = True # section_included = False # last_processed_hw = 'pomeniti' # last_processed_hw = 'iti' # last_processed_hw = 'aktivirati' # last_processed_hw = 'aktivirati' # already_processed = False for headword_id, (headword_text, category_text) in enumerate(headword_category): # print(headword_text) # if already_processed: # if headword_text != last_processed_hw: # continue # else: # already_processed = False # for headword_text, category_text in headword_category[15:20]: # headword_text = 'zadovoljen' # category_text = 'adjective' headword_patterns_ids = {} # print('tu1!') cur = collection.find({"headwords": headword_text}) # print('tu2!') frames = [] for ent in cur: frames += frames_from_db_entry(ent) # pre-process this step for prod TODO cur.close() # print('tu3!') # filter by relevant hw frames = [x for x in frames if x.hw == headword_text] # if headword_text == 'brati': # print('here') # if headword_text == 'prevajati': # print('here') ret_frames = RF(frames, mongo.db.sensemap) json_ret = {"frames": []} # print('tu4!') for frame in ret_frames: frame_json = frame.to_json() json_ret["frames"].append(frame_json) # print('tu5!') # get xml values for hws in json_ret.values(): for hw in hws: # print(hw['hw']) # if hw['hw'] == 'pomeniti': # print('aaa') # generate valency pattern key valency_pattern_key = [] functors = {} if len(hw['tids']) != 1: raise Exception('Multiple TIDS') for slot in hw['slots']: valency_pattern_key.append(slot['functor']) for tid in slot['tids']: if tid not in functors: functors[tid] = {} functors[tid] = slot['functor'] valency_pattern_key = tuple(sorted(valency_pattern_key)) if valency_pattern_key not in headword_patterns_ids: headword_patterns_ids[valency_pattern_key] = [] for sentence in hw['sentences']: # all_sentences.add(sentence[0][0]) # if len(headword_patterns_ids[valency_pattern_key]) < examples_num: # if section_included: # if not sentences_in_ram > sentences_num_limit: # sentences_in_ram += 1 # continue # else: # first_sentence = True sentence_id = sentence[0][0].rsplit('.', 1)[0] # print(sentence_id) if sentence_id not in sentences_of_interest: sentences_of_interest[sentence_id] = {} idi = 0 parent_idi = -1 # print('t1') for idx, word in sentence: if idx == hw['tids'][0]: parent_idi = idi if word['word']: idi += 1 # print('t2') if parent_idi == -1: raise Exception('No parent found!') idi = 0 # if len(sentence) > 500: # print(len(sentence)) for idx, word in sentence: if idx in functors: # sentences_of_interest[sentence_id][(word['lemma'], MSD_TRANSLATE[word['msd']])] = functors[idx] # sentences_of_interest[sentence_id][(word['lemma'], MSD_TRANSLATE[word['msd']])] = (functors[idx], idi) # sentences_of_interest[sentence_id][idi] = (functors[idx], (word['lemma'], MSD_TRANSLATE[word['msd']])) sentences_of_interest[sentence_id][str(idi)] = (str(parent_idi), functors[idx]) if word['word']: # if sentence_id == 'ssj37.216.892': # print(idi) # print(word['text']) idi += 1 # print('t3') headword_patterns_ids[valency_pattern_key].append(sentence_id) # check if this is first sentence # if first_sentence: # one_element = next(iter(sentences_of_interest.items())) # section_included = w_collection.count_documents({'_id': one_element[0], # list(one_element[1].keys())[0]: list(one_element[1].values())[0]}) == 1 # 
first_sentence = False if sentences_in_ram >= sentences_num_limit: # print('print1:') # print(time.time() - start_time) start_time = time.time() # !!!!!!!!!!!!!!!!!!!!!!print('Part %d finalized') # print('Sentences in ram:') # print(sentences_in_ram) sentences_in_ram = 0 # [InsertOne({'y': 1}), DeleteOne({'x': 1}), # ... ReplaceOne({'w': 1}, {'z': 1}, upsert=True)] # requests = [{'_id': k, 'connections': v} for k, v in sentences_of_interest.items()] # if 'GF0010453.1116.1' in sentences_of_interest: # print('here') requests = [UpdateOne({'_id': k}, {'$set': v}, upsert=True) for k, v in sentences_of_interest.items()] # print('print2:') # print(time.time() - start_time) # start_time = time.time() result = w_collection.bulk_write(requests) # print('print3:') # print(time.time() - start_time) # start_time = time.time() del requests del sentences_of_interest gc.collect() # print('print4:') # print(time.time() - start_time) # start_time = time.time() # print(part) # print('HEADWORD') # print(headword_text) # pbar.update(1) # part += 1 # # w_collection.bulk_write( # array.map((val) = > # ({ # updateOne: { # filter: {_id: val, uniqueid: 1001, atype: 1, ftype: 6}, # update: { # $set: {epoch: 1548484978658, actionbyuserid: 110, title: 'Good Morning To All'}}, # upsert: true # } # }) # ) # }) # sentences_of_interest = {{'_id': k, 'connections': v} for k, v in sentences_of_interest.items()} # w_collection.update_many({'_id': {'$exists': False}}, sentences_of_interest, upsert=True) # try: # w_collection.insert_many(sentences_of_interest, ordered=False) # except pymongo.errors.BulkWriteError as e: # print(e.details['writeErrors']) sentences_of_interest = {} # first_sentence = True sentences_in_ram += 1 pbar.update(1) # TODO uncomment # if 'GF0010453.1116.1' in sentences_of_interest: # a = sentences_of_interest['GF0010453.1116.1'] # print('here') requests = [UpdateOne({'_id': k}, {'$set': v}, upsert=True) for k, v in sentences_of_interest.items()] result = w_collection.bulk_write(requests) # sentences_of_interest = [{'_id': k, 'connections': v} for k, v in sentences_of_interest.items()] # try: # w_collection.insert_many(sentences_of_interest, ordered=False) # except pymongo.errors.BulkWriteError as e: # print(e.details['writeErrors']) # sentences_of_interest = {} # # else: # # print('aaa') # return sentences_of_interest def create_sentence_output(sentence, headword_id, corpus): glue_outside = False headword_id = str(headword_id) parent_node = etree.Element('corpusExample') parent_node.set('corpusName', corpus) # parent_node.text = 'AAA' # parent_node.prefix = 'BBB' # parent_node.tail = 'CCC' cur_node = parent_node # formatted_sentence = '' first_in_tag = True first_outside_tag = False in_dependency_tree = False # TODO use whole sentence! 
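    # The loop below walks the pre-formatted sentence and fills the <corpusExample>
    # element. Each item of `sentence` is expected to look like
    #   ((text, glue), collocations, {headword_idi: semantic_role}, lemma)
    # (these tuples are produced by extract_sentences and stored in the *_all
    # collections). Words inside the headword's dependency subtree are wrapped in a
    # <tree role="..."> element, the headword itself and collocation members get a
    # <comp> element, and spacing ("glue") is carried over through element text/tail.
    # Illustrative shape only (not taken verbatim from real output):
    #   <corpusExample corpusName="ssj500k 2.2">... <tree role="PAT">
    #       <comp structure_id="1" num="2">besedo</comp></tree> ...</corpusExample>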
# for idi, word in enumerate(sentence): # def idi_word_generator(sentence): # idi = 0 # for word in sentence: # if len(word.text) == 1 and re.match('^[\w]+$', word.text) is None: # continue # yield idi, word # idi += 1 idi = 0 attach_to = None p_cur_node = None p_attach_to = None p_glue_attach_to = None previous_word = None # if sentence[0][0][0] == 'Tako': # print('here') # for idi, word in idi_word_generator(sentence): for word_id in range(len(sentence)): # is_ending_tree = False # SRL container output word = sentence[word_id] # sentence output if in_dependency_tree: if headword_id not in word[2] or in_dependency_tree != word[2][headword_id]: attach_to = cur_node # is_ending_tree = True p_glue_attach_to = cur_node cur_node = parent_node if not first_in_tag: # formatted_sentence += '\n' first_in_tag = True # formatted_sentence += '' in_dependency_tree = False first_outside_tag = True if headword_id in word[2] and not in_dependency_tree: dep_tree = lxml.SubElement(cur_node, 'tree') dep_tree.set('role', word[2][headword_id]) cur_node = dep_tree if not first_in_tag: # formatted_sentence += '\n' first_in_tag = True # formatted_sentence += ''.format(word[2][headword_id]) in_dependency_tree = word[2][headword_id] attach_to = None if p_glue_attach_to is not None: glue_outside = True if headword_id == str(idi) and not (len(word[0][0]) == 1 and re.match('^[\w]+$', word[0][0]) is None): # if headword_id == idi: comp = lxml.SubElement(cur_node, 'comp') comp.set('role', 'headword') if not first_outside_tag: if p_attach_to is None: if p_cur_node is not None: p_cur_node.text += previous_word[0][1] else: p_attach_to.tail += previous_word[0][1] elif p_glue_attach_to is not None: if p_glue_attach_to.tail is None: p_glue_attach_to.tail = previous_word[0][1] else: p_glue_attach_to.tail += previous_word[0][1] # elif p_attach_to is not None: # if p_attach_to.tail is None: # p_attach_to.tail = previous_word[0][1] # else: # p_attach_to.tail += previous_word[0][1] word_text = word[0][0] comp.text = word_text attach_to = comp if not first_in_tag: # formatted_sentence += '\n' first_in_tag = True first_outside_tag = True p_cur_node = cur_node p_glue_attach_to = comp p_attach_to = attach_to previous_word = word # formatted_sentence += '{}'.format(word[0][0]) idi += 1 continue if word[1] and in_dependency_tree: col_id = -1 for i, col in enumerate(word[1]): if headword_id in col[3]: col_id = i break if col_id != -1: comp = lxml.SubElement(cur_node, 'comp') comp.set('structure_id', word[1][col_id][0]) comp.set('num', word[1][col_id][1]) if not first_outside_tag: if p_attach_to is None: if p_cur_node is not None: p_cur_node.text += previous_word[0][1] else: p_attach_to.tail += previous_word[0][1] elif p_glue_attach_to is not None: if p_glue_attach_to.tail is None: p_glue_attach_to.tail = previous_word[0][1] else: p_glue_attach_to.tail += previous_word[0][1] # elif p_attach_to is not None: # if p_attach_to.tail is None: # p_attach_to.tail = previous_word[0][1] # else: # p_attach_to.tail += previous_word[0][1] word_text = word[0][0] comp.text = word_text attach_to = comp if not first_in_tag: # formatted_sentence += '\n' first_in_tag = True first_outside_tag = True # Assuming one collocation per word # formatted_sentence += '{}'.format(word[1][0][0], word[1][0][1], word[0][0]) p_cur_node = cur_node p_glue_attach_to = comp p_attach_to = attach_to previous_word = word idi += 1 continue # collocation # if not first_in_new_row: # # formatted_sentence += ' ' # word_text = ' ' + word[0][0] # else: # word_text = word[0][0] # if 
first_in_tag and previous_word: # word_text = previous_word[0][1] + word[0][0] # else: # word_text = word[0][0] # word_text += word[0][1] # TODO CHANGE THIS TO FIX SPACE LOCATIONS! # word_text = word[0][0] + word[0][1] if not first_outside_tag: if p_attach_to is None: if p_cur_node is not None: p_cur_node.text += previous_word[0][1] else: p_attach_to.tail += previous_word[0][1] word_text = word[0][0] else: word_text = '' if p_attach_to is None: if p_cur_node is not None: word_text += previous_word[0][1] else: word_text += previous_word[0][1] if glue_outside: p_glue_attach_to.tail = previous_word[0][1] word_text = word[0][0] else: word_text += word[0][0] if attach_to is None: if cur_node.text is None: cur_node.text = word_text else: cur_node.text += word_text else: if attach_to.tail is None: attach_to.tail = word_text else: attach_to.tail += word_text # attach_to.tail +=word[0][0] # formatted_sentence += word[0][0] first_in_tag = False first_outside_tag = False p_cur_node = cur_node p_attach_to = attach_to previous_word = word p_glue_attach_to = None if len(word[0][0]) == 1 and re.match('^[\w]+$', word[0][0]) is None: continue idi += 1 return parent_node def get_SRLcontainer_data(sentence, word_of_interest_id, summary): for word in sentence: if word_of_interest_id in word[2]: for col in word[1]: if word_of_interest_id in col[3]: if word[2][word_of_interest_id] not in summary: summary[word[2][word_of_interest_id]] = {} if col[0] not in summary[word[2][word_of_interest_id]]: summary[word[2][word_of_interest_id]][col[0]] = {} # word_of_interest_included = word_of_interest_id in col[3] if col[1] not in summary[word[2][word_of_interest_id]][col[0]]: summary[word[2][word_of_interest_id]][col[0]][col[1]] = set() if col[2][0] == 'S': summary[word[2][word_of_interest_id]][col[0]][col[1]].add((word[0][0], col[2][1], word[3])) return summary def valid_valency_pattern(valency_pattern_key): occurences = set() for v_p in valency_pattern_key: if v_p in occurences: return False occurences.add(v_p) return True def obtain_xml_data(collection, w_a_collection, headword_text, RF, mongo, patterns, pattern_id_max, valency_pattern_id_collection, corpus, examples_num, headword_patterns_ssj): cur = collection.find({"headwords": headword_text}) frames = [] for ent in cur: frames += frames_from_db_entry(ent) # pre-process this step for prod TODO cur.close() # filter by relevant hw frames = [x for x in frames if x.hw == headword_text] ret_frames = RF(frames, mongo.db.sensemap) json_ret = {"frames": []} for frame in ret_frames: frame_json = frame.to_json() json_ret["frames"].append(frame_json) # get xml values headword_patterns = {} new_patterns = {} for hws in json_ret.values(): for hw in hws: # generate valency pattern key valency_pattern_key = [] for slot in hw['slots']: valency_pattern_key.append(slot['functor']) # sort valency_pattern_key by order provided in translations valency_pattern_key_new = [] for key in translations: if key in valency_pattern_key: valency_pattern_key_new.append(key) valency_pattern_key = tuple(valency_pattern_key_new) if valency_pattern_key not in headword_patterns: headword_patterns[valency_pattern_key] = {} headword_patterns[valency_pattern_key]['sentence_examples'] = [] headword_patterns[valency_pattern_key]['sentence_num'] = 0 headword_patterns[valency_pattern_key]['sr_data'] = {} if valency_pattern_key not in patterns and valency_pattern_key not in new_patterns: new_patterns[valency_pattern_key] = pattern_id_max patterns[valency_pattern_key] = pattern_id_max pattern_id_max += 1 
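                # `patterns` is a shared registry mapping a valency-pattern key
                # (a tuple of functors, ordered as in `translations`) to a numeric id.
                # Newly seen patterns are collected in `new_patterns` and persisted to
                # valency_pattern_id_collection further below, so the same pattern keeps
                # the same id across headwords and corpora.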
headword_patterns[valency_pattern_key]['id'] = patterns[valency_pattern_key] sr_data = headword_patterns[valency_pattern_key]['sr_data'] tids = set(hw['tids']) if valency_pattern_key in headword_patterns_ssj: ssj_len = len(headword_patterns_ssj[valency_pattern_key]['sentence_examples']) else: ssj_len = 0 for sentence in hw['sentences']: # sentences_of_interest.append(sentence[0]) # get sentence example # sentence_example = [] sent_id = sentence[0][0].rsplit('.', 1)[0] try: db_sentence = next(iter(w_a_collection.find({'_id': sent_id})))['words'] except StopIteration: continue # if valency_pattern_key == ('ACT', 'PAT'): # print('am') # idi = 0 idi = 0 hw_idi = -1 for word_id, word in sentence: if word_id in tids: hw_idi = idi if word['word']: idi += 1 if hw_idi == -1: raise Exception('No such headword idi!') # for idi, word in idi_word_generator(sentence): # print('here') # for word_id, word_dict in sentence: # # TODO Modify sentence! # # if formatted_sentences[sent_id] # sentence_example.append(word_dict['text']) # if word_dict['word']: # idi += 1 # if sent_id == 'ssj134.880.3375': # print('here') # if sent_id == 'ssj38.227.917': # print('here') # if sent_id == 'GF0004627.1913.1': # print('here') # print(sent_id) # print([a for a in w_a_collection.find()]) # if valency_pattern_key == ('ACT', 'PAT'): # print('here') sr_data = get_SRLcontainer_data(db_sentence, str(hw_idi), sr_data) # TODO ERASE THIS examples_included_num = 0 # sr_data = get_SRLcontainer_data(formatted_sentences[sent_id], hw_idi, sr_data) if len(headword_patterns[valency_pattern_key]['sentence_examples']) + ssj_len < examples_num and valid_valency_pattern(valency_pattern_key): examples_included_num += 1 sentence_example = create_sentence_output(db_sentence, hw_idi, corpus) # sentence_example = create_sentence_output(formatted_sentences[sent_id], hw_idi) # sentence_example = ''.join(sentence_example) # headword_patterns[valency_pattern_key]['sentence_examples'].append(sentence_example) headword_patterns[valency_pattern_key]['sentence_examples'].append(sentence_example) headword_patterns[valency_pattern_key]['sentence_num'] += 1 headword_patterns[valency_pattern_key]['sr_data'] = sr_data # add patterns to db new_patterns_query = [InsertOne({'_id': v, 'semantic_roles': list(k)}) for k, v in new_patterns.items()] if len(new_patterns_query) > 0: result = valency_pattern_id_collection.bulk_write(new_patterns_query) # calculate statistics semantic_role_stats = {} sentence_tot = 0 pattern_tot = len(headword_patterns) for key, val in headword_patterns.items(): sentence_num = val['sentence_num'] for sr in key: if sr in semantic_role_stats: semantic_role_stats[sr]['valency_pattern_num'] += 1 semantic_role_stats[sr]['valency_sentence_num'] += sentence_num else: semantic_role_stats[sr] = {} semantic_role_stats[sr]['valency_pattern_num'] = 1 semantic_role_stats[sr]['valency_sentence_num'] = sentence_num sentence_tot += sentence_num return headword_patterns, semantic_role_stats, sentence_tot, pattern_tot, pattern_id_max def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, corpus_name, pattern_examples_limit, ignore_gigafida, pbar): query_general = session.query(Lexeme.id, LexicalUnitLexeme.id, LexicalUnit.id, LexicalUnitMeasure.value, Lexeme.dummy, LexicalUnitType.name) \ .join(Category, Category.id == Lexeme.category_id) \ .join(LexicalUnitLexeme, LexicalUnitLexeme.lexeme_id == Lexeme.id) \ .join(LexicalUnit, LexicalUnit.id == 
LexicalUnitLexeme.lexical_unit_id) \ .join(LexicalUnitType, LexicalUnitType.id == LexicalUnit.type_id) \ .join(LexicalUnitMeasure, LexicalUnitMeasure.lexical_unit_id == LexicalUnit.id) \ .join(Measure, Measure.id == LexicalUnitMeasure.measure_id) \ .join(Corpus, Corpus.id == LexicalUnitMeasure.corpus_id) \ .filter(LexicalUnitType.name == 'single_lexeme_unit') \ .filter(Measure.name == 'frequency') \ .filter(Corpus.name == 'gigafida') \ .filter(Corpus.version == '2.0') # valency_pattern_id_collection.find() # used to not repeat search queries for prepositions preposition_list = {} for headword_text, category_text in headword_category: # with lxml.xmlfile('data/output.xml', encoding='utf-8') as xf: # a = [a for a in valency_pattern_id_collection.find()] patterns = {tuple(v_p['semantic_roles']): v_p['_id'] for v_p in [a for a in valency_pattern_id_collection.find()]} # patterns = {} pattern_id_max = len(patterns) + 1 # pattern_examples_limit = 4 # get data headword_patterns_ssj, semantic_role_stats_ssj, sentence_tot_ssj, pattern_tot_ssj, pattern_id_max = obtain_xml_data(collection_ssj, w_a_collection_ssj, headword_text, RF, mongo, patterns, pattern_id_max, valency_pattern_id_collection, 'ssj500k 2.2', pattern_examples_limit, {}) if not ignore_gigafida: headword_patterns_gf, semantic_role_stats_gf, sentence_tot_gf, pattern_tot_gf, pattern_id_max = obtain_xml_data(collection_gigafida, w_a_collection_gigafida, headword_text, RF, mongo, patterns, pattern_id_max, valency_pattern_id_collection, 'Gigafida 2.0', pattern_examples_limit, headword_patterns_ssj) # TODO ERASE THIS!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! if len(headword_patterns_ssj) == 0: continue wf1 = aliased(WordFormFeature) wf2 = aliased(WordFormFeature) wf3 = aliased(WordFormFeature) query_preposition = session.query(FormRepresentation.form) \ .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \ .join(Lexeme, Lexeme.id == WordForm.lexeme_id) \ .join(wf1, wf1.word_form_id == WordForm.id) \ .join(wf2, wf2.word_form_id == WordForm.id) \ .join(wf3, wf3.word_form_id == WordForm.id) \ .filter(Lexeme.lemma == headword_text) \ .filter(wf1.value == 'singular') \ .filter(wf2.value == 'third') \ .filter(wf3.value == 'present') pattern_translation_hws = query_preposition.all() pattern_translation_3_sin = headword_text if len(pattern_translation_hws) == 1: pattern_translation_3_sin = pattern_translation_hws[0].form qname = etree.QName("http://www.w3.org/2001/XMLSchema-instance", "noNamespaceSchemaLocation") dictionary = lxml.Element('dictionary', {qname: 'valency_lexicon.xsd'}) if headword_text[-1] == '_': headword_text_query = headword_text[:-1] else: headword_text_query = headword_text query = query_general.filter(Category.name == category_text) \ .filter(Lexeme.lemma == headword_text_query) \ .group_by(Lexeme.id, LexicalUnitLexeme.id, LexicalUnit.id, LexicalUnitMeasure.value, LexicalUnitType.name) # res = query.one_or_none() query_res = query.all() # query2 = session.query(Lexeme.id) \ # .join(Category, Category.id == Lexeme.category_id) \ # .join(LexicalUnitLexeme, LexicalUnitLexeme.lexeme_id == Lexeme.id) \ # .join(LexicalUnit, LexicalUnit.id == LexicalUnitLexeme.lexical_unit_id) \ # .join(LexicalUnitType, LexicalUnitType.id == LexicalUnit.type_id) \ # .join(LexicalUnitMeasure, LexicalUnitMeasure.lexical_unit_id == LexicalUnit.id) \ # .join(Measure, Measure.id == LexicalUnitMeasure.measure_id) \ # .join(Corpus, Corpus.id == LexicalUnitMeasure.corpus_id) \ # 
.join(LexemeFeature, LexemeFeature.lexeme_id == Lexeme.id) \ # .join(Feature, Feature.id == LexemeFeature.feature_id) \ # .filter(LexicalUnitType.name == 'single_lexeme_unit') \ # .filter(Measure.name == 'frequency') \ # .filter(Category.name == 'preposition') \ # .filter(Lexeme.lemma == 'za') \ # .filter(Feature.name == 'case') \ # .filter(LexemeFeature.value == 'instrumental') \ # .group_by(Lexeme.id) # query2 = session.query(Lexeme.id) \ # .join(Category, Category.id == Lexeme.category_id) \ # .join(LexemeFeature, LexemeFeature.lexeme_id == Lexeme.id) \ # .join(Feature, Feature.id == LexemeFeature.feature_id) \ # .filter(Lexeme.lemma == 'za') \ # .filter(Feature.name == 'case') \ # .filter(LexemeFeature.value == 'instrumental') \ # .group_by(Lexeme.id) # # a = query2.all() if len(query_res) == 1: (lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, frequency, _, lexical_unit_type_name) = \ query_res[0] elif len(query_res) > 1: # all lexical_unit_ids equal or at least one dummy final_lexical_unit_id = 0 final_lexical_unit_lexeme_id = 0 for r in query_res: (lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, frequency, dummy, lexical_unit_type_name) = r if dummy: final_lexical_unit_id = lexical_unit_id final_lexical_unit_lexeme_id = lexical_unit_lexeme_id break lexical_unit_id = final_lexical_unit_id lexical_unit_lexeme_id = final_lexical_unit_lexeme_id else: frequency = 0 lexeme_id = 0 lexical_unit_id = 0 lexical_unit_lexeme_id = 0 lexical_unit_type_name = '' sense_ids = session.query(Sense.id, Sense.dummy).filter(Sense.lexical_unit_id == lexical_unit_id).all() features = session.query(LexemeFeature.value).join(Feature, Feature.id == LexemeFeature.feature_id) \ .filter(LexemeFeature.lexeme_id == lexeme_id) \ .filter(Feature.name == 'aspect').all() entry = lxml.SubElement(dictionary, 'entry') head = lxml.SubElement(entry, 'head') headword = lxml.SubElement(head, 'headword') lemma = lxml.SubElement(headword, 'lemma') lemma.text = headword_text lexical_unit = lxml.SubElement(head, 'lexicalUnit') lexical_unit.set('id', str(lexical_unit_id)) lexical_unit_type_name = 'single' if lexical_unit_type_name == 'single_lexeme_unit' else lexical_unit_type_name lexical_unit.set('type', lexical_unit_type_name) lexeme = lxml.SubElement(lexical_unit, 'lexeme') lexeme.set('lexical_unit_lexeme_id', str(lexical_unit_lexeme_id)) lexeme.text = headword_text grammar = lxml.SubElement(head, 'grammar') category = lxml.SubElement(grammar, 'category') if args.language == 'sl': category.text = CATEGORY_MAP[category_text] if category_text in CATEGORY_MAP else '' else: category.text = category_text grammarFeature = lxml.SubElement(grammar, 'grammarFeature') if args.language == 'sl': grammarFeature.set('name', 'vid') grammarFeature.text = ASPECT_MAP[features[0].value] if len(features) > 0 and features[ 0].value in ASPECT_MAP else '' else: grammarFeature.set('name', 'aspect') grammarFeature.text = features[0].value if len(features) > 0 else '' measureList = lxml.SubElement(head, 'measureList') measure = lxml.SubElement(measureList, 'measure') measure.set('type', 'frequency') # TODO Modify this! 
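        # The frequency value written below comes from the Sloleks query above
        # (query_general), which is restricted to Measure.name == 'frequency' on
        # Corpus 'gigafida', version 2.0; hence the hard-coded source attribute.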
measure.set('source', 'Gigafida 2.0') # measure.set('source', 'ssj500k') measure.text = str(int(frequency)) body = lxml.SubElement(entry, 'body') statisticsContainerList = lxml.SubElement(body, 'statisticsContainerList') # combine semantic_role_stats semantic_role_stats = {} for semanticRole_val, semanticRole_stats in semantic_role_stats_ssj.items(): semantic_role_stats[semanticRole_val] = {} semantic_role_stats[semanticRole_val]['ssj'] = semanticRole_stats if not ignore_gigafida: for semanticRole_val, semanticRole_stats in semantic_role_stats_gf.items(): if semanticRole_val not in semantic_role_stats: semantic_role_stats[semanticRole_val] = {} semantic_role_stats[semanticRole_val]['gf'] = semanticRole_stats for semanticRole_val, semanticRole_stats in semantic_role_stats.items(): statisticsContainer = lxml.SubElement(statisticsContainerList, 'statisticsContainer') semanticRole = lxml.SubElement(statisticsContainer, 'semanticRole') semanticRole.text = semanticRole_val measureList = lxml.SubElement(statisticsContainer, 'measureList') if 'ssj' in semanticRole_stats: measure_pattern_ssj = lxml.SubElement(measureList, 'measure') measure_pattern_ssj.set('type', 'valency_pattern_ratio') measure_pattern_ssj.set('source', 'ssj500k 2.2') measure_pattern_ssj.text = '%.4f' % ( semantic_role_stats[semanticRole_val]['ssj']['valency_pattern_num'] / pattern_tot_ssj) measure_sentence_ssj = lxml.SubElement(measureList, 'measure') measure_sentence_ssj.set('type', 'valency_sentence_ratio') measure_sentence_ssj.set('source', 'ssj500k 2.2') if sentence_tot_ssj == 0: measure_sentence_ssj.text = '%.4f' % (0.0) # print(headword_text) # print(semanticRole_val) # print(semantic_role_stats[semanticRole_val]['gf']['valency_sentence_num']) else: measure_sentence_ssj.text = '%.4f' % ( semantic_role_stats[semanticRole_val]['ssj']['valency_sentence_num'] / sentence_tot_ssj) # measure_sentence_ssj.text = '%.2f' % ( # semantic_role_stats[semanticRole_val]['ssj']['valency_sentence_num'] / sentence_tot_ssj) if 'gf' in semanticRole_stats and not ignore_gigafida: measure_pattern_gf = lxml.SubElement(measureList, 'measure') measure_pattern_gf.set('type', 'valency_pattern_ratio') measure_pattern_gf.set('source', 'Gigafida 2.0') measure_pattern_gf.text = '%.4f' % ( semantic_role_stats[semanticRole_val]['gf']['valency_pattern_num'] / pattern_tot_gf) measure_sentence_gf = lxml.SubElement(measureList, 'measure') measure_sentence_gf.set('type', 'valency_sentence_ratio') measure_sentence_gf.set('source', 'Gigafida 2.0') if sentence_tot_gf == 0: measure_sentence_gf.text = '%.4f' % (0.0) # print(headword_text) # print(semanticRole_val) # print(semantic_role_stats[semanticRole_val]['gf']['valency_sentence_num']) else: measure_sentence_gf.text = '%.4f' % ( semantic_role_stats[semanticRole_val]['gf']['valency_sentence_num'] / sentence_tot_gf) senseList = lxml.SubElement(body, 'senseList') for sense_id in sense_ids: if len(sense_ids) > 1 and sense_id.dummy: continue sense = lxml.SubElement(senseList, 'sense') if not sense_id.dummy: sense.set('id', str(sense_id.id)) definitionList = lxml.SubElement(sense, 'definitionList') definition_texts = session.query(Definition.description).filter( Definition.sense_id == sense_id.id).all() for definition_text in definition_texts: definition = lxml.SubElement(definitionList, 'definition') definition.text = definition_text[0] syntactic_structures = session.query(SyntacticStructure.id, SyntacticStructure.name, StructureComponent.id, StructureComponent.name).join( LexicalUnit, 
LexicalUnit.syntactic_structure_id == SyntacticStructure.id) \ .join(StructureComponent, StructureComponent.syntactic_structure_id == SyntacticStructure.id) \ .filter(LexicalUnit.id == sense_id.id) # .join(LexicalUnitLexeme, LexicalUnitLexeme.structure_component_id == StructureComponent.id) \ # syntactic_structures2 = session.query(SyntacticStructure.id, SyntacticStructure.name).join(SyntacticStructure, SyntacticStructure.id == LexicalUnit.syntactic_structure_id) \ # .filter(SyntacticStructure.id == sense_id) syntactic_structuresr = syntactic_structures.all() # syntactic_structures2r = syntactic_structures2.all() valencyPatternList = lxml.SubElement(sense, 'valencyPatternList') valencyPatternList.set('system', 'JOS') # combine semantic_role_stats ################################## headword_patterns = {} for headword_patterns_val, headword_patterns_stats in headword_patterns_ssj.items(): headword_patterns[headword_patterns_val] = {} headword_patterns[headword_patterns_val]['ssj'] = headword_patterns_stats if not ignore_gigafida: for headword_patterns_val, headword_patterns_stats in headword_patterns_gf.items(): if headword_patterns_val not in headword_patterns: headword_patterns[headword_patterns_val] = {} headword_patterns[headword_patterns_val]['gf'] = headword_patterns_stats ################################################################# for headword_pattern, headword_pattern_dict in headword_patterns.items(): valencyPattern = lxml.SubElement(valencyPatternList, 'valencyPattern') valencyPattern.set('id', str(patterns[headword_pattern])) measureList_sense = lxml.SubElement(valencyPattern, 'measureList') if 'ssj' in headword_pattern_dict: measure_sense = lxml.SubElement(measureList_sense, 'measure') measure_sense.set('type', 'frequency_all') measure_sense.set('source', 'ssj500k 2.2') measure_sense.text = str(headword_pattern_dict['ssj']['sentence_num']) if not ignore_gigafida and 'gf' in headword_pattern_dict and headword_pattern_dict['gf']['sentence_num']: measure_sense = lxml.SubElement(measureList_sense, 'measure') measure_sense.set('type', 'frequency_all') measure_sense.set('source', 'Gigafida 2.0') measure_sense.text = str(headword_pattern_dict['gf']['sentence_num']) semanticRoleContainerList = lxml.SubElement(valencyPattern, 'semanticRoleContainerList') # patternId = lxml.SubElement(semanticRoles, 'patternId') # patternId.text = str(patterns[headword_pattern]) if 'ACT' in headword_pattern: patternTranslationText = 'KDO/KAJ ' + pattern_translation_3_sin else: patternTranslationText = headword_text for semantic_role in headword_pattern: if semantic_role != 'ACT': # additional rules # if semantic_role == 'RESLT': # pass # else: # patternTranslationText += ' ' + translations[semantic_role] patternTranslationText += ' ' + translations[semantic_role] semanticRoleContainer = lxml.SubElement(semanticRoleContainerList, 'semanticRoleContainer') semanticRole = lxml.SubElement(semanticRoleContainer, 'semanticRole') semanticRole.text = semantic_role syntactic_structure_dict = {} # TODO EXPAND FROM SSJ DATA ONLY + FIX BUG ABOUT SEMANTIC ROLE CONTAINER + EXAMPLES NOT WORKING!!! 
FIX IDS if 'ssj' in headword_pattern_dict and semantic_role in headword_pattern_dict['ssj']['sr_data']: for syn_struct_id, syn_struct_dict in headword_pattern_dict['ssj']['sr_data'][semantic_role].items(): if syn_struct_id not in syntactic_structure_dict: syntactic_structure_dict[syn_struct_id] = {} for com_num, com_set in syn_struct_dict.items(): if com_num not in syntactic_structure_dict[syn_struct_id]: syntactic_structure_dict[syn_struct_id][com_num] = set() for lex in com_set: syntactic_structure_dict[syn_struct_id][com_num].add(lex) if 'gf' in headword_pattern_dict and semantic_role in headword_pattern_dict['gf']['sr_data']: for syn_struct_id, syn_struct_dict in headword_pattern_dict['gf']['sr_data'][semantic_role].items(): if syn_struct_id not in syntactic_structure_dict: syntactic_structure_dict[syn_struct_id] = {} for com_num, com_set in syn_struct_dict.items(): if com_num not in syntactic_structure_dict[syn_struct_id]: syntactic_structure_dict[syn_struct_id][com_num] = set() for lex in com_set: syntactic_structure_dict[syn_struct_id][com_num].add(lex) if len(syntactic_structure_dict) > 0: syntacticStructureList = lxml.SubElement(semanticRoleContainer, 'syntacticStructureList') # iterate over syntactic structures and write them for syn_struct_id, component_dict in syntactic_structure_dict.items(): syntacticStructure = lxml.SubElement(syntacticStructureList, 'syntacticStructure') syntacticStructure.set('id', syn_struct_id) for comp_id, lexemes in component_dict.items(): for l in lexemes: component = lxml.SubElement(syntacticStructure, 'component') component.set('num', comp_id) lexem = lxml.SubElement(component, 'lexeme') if l in preposition_list: prep_id = preposition_list[l] else: query_preposition = session.query(Lexeme.id) \ .join(Category, Category.id == Lexeme.category_id) \ .join(LexemeFeature, LexemeFeature.lexeme_id == Lexeme.id) \ .join(Feature, Feature.id == LexemeFeature.feature_id) \ .filter(Lexeme.lemma == l[2]) \ .filter(Feature.name == 'case') \ .filter(LexemeFeature.value == CASE_MAP[l[1]]) \ .group_by(Lexeme.id) preposition_ids = query_preposition.all() if len(preposition_ids) != 1: prep_id = '' else: prep_id = str(preposition_ids[0][0]) preposition_list[l] = prep_id lexem.set('sloleks', prep_id) lexem.text = l[2] # if 'gf' in headword_pattern_dict and semantic_role in headword_pattern_dict['gf']['sr_data']: # for syn_struct_id, syn_struct_dict in headword_pattern_dict['gf']['sr_data'][semantic_role].items(): # syntacticStructure = lxml.SubElement(syntacticStructureList, 'syntacticStructure') # syntacticStructure.set('id', syn_struct_id) # for com_num, com_set in syn_struct_dict.items(): # # component = lxml.SubElement(syntacticStructure, 'component') # # component.set('num', com_num) # for lex in com_set: # component = lxml.SubElement(syntacticStructure, 'component') # component.set('num', com_num) # lexem = lxml.SubElement(component, 'lexeme') # lexem.set('sloleks', '') # lexem.text = lex patternRepresentation = lxml.SubElement(valencyPattern, 'patternRepresentation') patternRepresentation.text = patternTranslationText exampleContainerList = lxml.SubElement(valencyPattern, 'exampleContainerList') if 'ssj' in headword_pattern_dict: for sentence_example in headword_pattern_dict['ssj']['sentence_examples']: exampleContainer = lxml.SubElement(exampleContainerList, 'exampleContainer') # corpusExample = lxml.SubElement(exampleContainer, 'corpusExample') exampleContainer.append(sentence_example) if 'gf' in headword_pattern_dict: for sentence_example in 
headword_pattern_dict['gf']['sentence_examples']: exampleContainer = lxml.SubElement(exampleContainerList, 'exampleContainer') # corpusExample = lxml.SubElement(exampleContainer, 'corpusExample') exampleContainer.append(sentence_example) with lxml.xmlfile(os.path.join(args.outdir, 'VS10_' + headword_text + '_' + corpus_name + '.xml'), encoding='utf-8') as xf: xf.write(dictionary, pretty_print=True) pbar.update(1) # xf.write(entry, pretty_print=True) # tree.write(output_file_name, encoding='UTF-8', pretty_print=True) def init_db(db): global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation [db_user, db_password, db_database, db_host] = db.split(':') Base = declarative_base() engine = create_engine('postgresql://' + db_user + ':' + db_password + '@' + db_host + '/' + db_database, pool_recycle=14400) Base.metadata.reflect(engine) class Lexeme(Base): __table__ = Base.metadata.tables['jedro_lexeme'] class LexemeFeature(Base): __table__ = Base.metadata.tables['jedro_lexeme_feature'] class SyntacticStructure(Base): __table__ = Base.metadata.tables['jedro_syntacticstructure'] class StructureComponent(Base): __table__ = Base.metadata.tables['jedro_structurecomponent'] class Feature(Base): __table__ = Base.metadata.tables['jedro_feature'] class LexicalUnitLexeme(Base): __table__ = Base.metadata.tables['jedro_lexicalunit_lexeme'] class LexicalUnit(Base): __table__ = Base.metadata.tables['jedro_lexicalunit'] class LexicalUnitType(Base): __table__ = Base.metadata.tables['jedro_lexicalunittype'] class Category(Base): __table__ = Base.metadata.tables['jedro_category'] class Sense(Base): __table__ = Base.metadata.tables['jedro_sense'] class Measure(Base): __table__ = Base.metadata.tables['jedro_measure'] class LexicalUnitMeasure(Base): __table__ = Base.metadata.tables['jedro_lexicalunitmeasure'] class Corpus(Base): __table__ = Base.metadata.tables['jedro_corpus'] class Definition(Base): __table__ = Base.metadata.tables['jedro_definition'] class WordForm(Base): __table__ = Base.metadata.tables['jedro_wordform'] class WordFormFeature(Base): __table__ = Base.metadata.tables['jedro_wordform_feature'] class FormRepresentation(Base): __table__ = Base.metadata.tables['jedro_formrepresentation'] return engine def match_file(words, structures): matches = [] if words[0].text == 'Ena': a = 0 for s in structures: if s.id == '89': a = 1 for w in words: mhere = s.match(w) for match in mhere: # save only those with verbs in them if not [True for m in match.values() if m.msd[0] == 'V']: continue colocation_id = [(idx, w.lemma) for idx, w in match.items()] colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x: x[0])) colocation_id = tuple(colocation_id) matches.append([match, colocation_id]) return matches possible_jos_links = {'dol', 'del', 'prir', 'vez', 'skup', 'ena', 'dve', 'tri', 'štiri', 'modra'} def find_word_sons(word, deppar_dict, word_id, role): for k, v in word.links.items(): # if k != 'default_factory': for w in v: # if k in possible_jos_links and w.id == 'ssj1.1.1.t21': # print('here') if k in possible_jos_links: if w.id not in deppar_dict: deppar_dict[w.id] = {} deppar_dict[w.id][word_id] = role find_word_sons(w, deppar_dict, word_id, role) # elif k in possible_jos_links: # raise Exception('One word in multiple dependency parsetrees') # for ignoring punctuations def idi_word_generator(sentence): idi = 0 for word in 
sentence: if len(word.text) == 1 and re.match('^[\w]+$', word.text) is None: continue yield idi, word idi += 1 def extract_sentences(w_collection, w_a_collection, args, input_corpus, input_corpus_orig): structures, _, max_num_components = build_structures(args) timeinfo = TimeInfo(len(input_corpus)) database = Database(args) # match_store = MatchStore(args, database) # word_stats = WordStats(lemma_msds, database) formatted_sentences = {} start_time = time.time() # print(time.time() - start_time) sentences_num_limit = 10000 sentences_in_ram = 0 # is_gf = input_corpus_orig is not None is_gf = input_corpus_orig is not None if is_gf: glue_words_gen = file_sentence_glue_generator(input_corpus_orig, args.pc_tag, w_collection) for sent_id, sentence, othr_sentence_attributes in load_files(args, database, w_collection, input_corpus): if is_gf: sentence_glue = next(glue_words_gen) if sent_id != sentence_glue[0]: raise Exception(f"Annotated gigafida and original gigafida not in sync (annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}") if len(sentence_glue[1]) != len(sentence): raise Exception( f"Annotated gigafida and original gigafida size is not the same (annotated: {len(sentence)}, original: {len(sentence_glue[1])}") for w, w_glue in zip(sentence, sentence_glue[1]): w.glue = w_glue[2] if sentence is None: timeinfo.add_measurement(-1) continue # start_time = time.time() # print(time.time() - start_time) matches = match_file(sentence, structures) # if sent_id == 'ssj134.880.3375': # print('here') # print(time.time() - start_time) # match_store.add_matches(matches) # word_stats.add_words(words) # database.commit() # find unimportant collocations # extract_possible_headwords = set(v[0] for v in othr_sentence_attributes.values()) for match in matches: match_idis = [] for key, word in match[0].items(): match_idis.append(word.idi) match.append(match_idis) collocations = {} for match in matches: for key, word in match[0].items(): # if word.id == '' if word.id not in collocations: collocations[word.id] = [] collocations[word.id].append((match[1][0], key, word.msd[:2], match[2])) # print(time.time() - start_time) formatted_sentence = [] deppar_dict = {} # idi = 0 # create output and form dependency parsetree sons for idi, word in idi_word_generator(sentence): # if word.text == 'Mumel': # print('here') # if word.text == 'Poleg': # print('here') # if word.text == 'Luka': # print('here') idi = str(idi) # a = sent_id in sentences_of_interest # b = (word.lemma, word.msd) in sentences_of_interest[sent_id] # if word.msd == 'X': # continue # if len(word.text) == 1 and word.text in string.punctuation + '': # a = re.match('^[\w]+$', word.text) is not None # if len(word.text) == 1 and re.match('^[\w]+$', word.text) is None: # continue # if sent_id in sentences_of_interest and (word.lemma, word.msd) in sentences_of_interest[sent_id]: # if sent_id in sentences_of_interest and idi in sentences_of_interest[sent_id]: # cur_count = w_collection.count_documents({'_id': sent_id}) # if w_collection.count_documents({'_id': sent_id}) > 0: sentence_of_interest = othr_sentence_attributes # is_count = cur.count() > 0 if idi in othr_sentence_attributes: if word.id not in deppar_dict: deppar_dict[word.id] = {} deppar_dict[word.id][sentence_of_interest[idi][0]] = sentence_of_interest[idi][1] # deppar_dict[word.id] = {idi: sentences_of_interest[sent_id][idi]} # if idi != sentences_of_interest[sent_id][(word.lemma, word.msd)][1]: # if (word.lemma, word.msd) != sentences_of_interest[sent_id][idi][1]: # 
print((word.lemma, word.msd)) # print(sentences_of_interest[sent_id][idi][1]) # if sentences_of_interest[sent_id][(word.lemma, word.msd)][1] > idi: # print('HERE') find_word_sons(word, deppar_dict, sentence_of_interest[idi][0], sentence_of_interest[idi][1]) # idi += 1 # print(time.time() - start_time) for word in sentence: if word.id in collocations: col = collocations[word.id] else: col = [] if word.id in deppar_dict: dp = deppar_dict[word.id] else: dp = {} formatted_sentence.append(((word.text, word.glue), col, dp, word.lemma)) # create_sentence_output(formatted_sentence, 4) formatted_sentences[sent_id] = formatted_sentence if sentences_in_ram >= sentences_num_limit: sentences_in_ram = 0 requests = [UpdateOne({'_id': k}, {'$set': {'words': v}}, upsert=True) for k, v in formatted_sentences.items()] result = w_a_collection.bulk_write(requests) formatted_sentences = {} sentences_in_ram += 1 # print(time.time() - start_time) requests = [UpdateOne({'_id': k}, {'$set': {'words': v}}, upsert=True) for k, v in formatted_sentences.items()] result = w_a_collection.bulk_write(requests) # force a bit of garbage collection # del sentence # del sent_id # del matches # gc.collect() print(time.time() - start_time) # return formatted_sentences # # timeinfo.add_measurement(time.time() - start_time) # # timeinfo.info() # # if no output files, just exit # if all([x == None for x in [args.out, args.out_no_stat, args.all, args.stats]]): # return # # # get word renders for lemma/msd # word_stats.generate_renders() # match_store.determine_colocation_dispersions() # # # figure out representations! # if args.out or args.out_no_stat: # match_store.set_representations(word_stats, structures) # # Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out( # structures, match_store) # Writer.make_output_no_stat_writer(args, max_num_components, match_store, word_stats).write_out( # structures, match_store) # Writer.make_all_writer(args, max_num_components, match_store, word_stats).write_out( # structures, match_store) # Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out( # structures, match_store) def get_headword_category(collection): """ Returns :return: List of tuples with all headwords in mongodb and their categories. 
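    Headwords ending in '_' are treated as adjectives, all others as verbs
    (see the list comprehension below).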
""" headwords = sorted(collection.distinct("headwords")[1:]) if args.headwords: with open(args.headwords, 'w') as f: for item in headwords: f.write("%s\n" % item) headword_category = [(headword, 'verb') if headword[-1] != '_' else (headword, 'adjective') for headword in headwords] return headword_category def main(args): # with Path('data/wordlist.json').open("r") as fp: # sskj_wordlist = json.load(fp) # # wordlist = set(sskj_wordlist['wordlist']) # wordlist = set(sskj_wordlist['wordlist']) print('beginning chunk') start_time = time.time() # user:user:valdb:127.0.0.1 mongo = MongoClient(username='user', password='user', authSource='valdb') db = mongo.valdb collection_ssj = db['ssj'] collection_gigafida = db['gigafida'] db2 = mongo.extvaldb # write collection w_collection_ssj = db2['ssj'] w_collection_gigafida = db2['gigafida'] w_a_collection_ssj = db2['ssj' + '_all'] w_a_collection_gigafida = db2['gigafida' + '_all'] valency_pattern_id_collection = db2['valency_pattern_ids'] RF = reduce_functions["reduce_0"]["f"] # get all headwords from database # headword_category = get_headword_category(collection_ssj) with open(args.headwords, 'r') as read: headword_category = [(line[:-1], 'verb') for line in read.readlines()] assert args.language == 'en' or args.language == 'sl' shutil.rmtree(args.outdir, True) os.mkdir(args.outdir) engine = init_db(args.sloleks_db) # input_file = codecs.open(args.infile, 'r') # # input_file = [] # next(input_file) # category_map = {'samostalnik':'noun', 'glagol':'verb', 'pridevnik':'adjective', 'prislov':'adverb', 'števnik':'numeral', 'zaimek':'pronoun', 'medmet':'interjection', 'veznik':'conjunction'} session = Session(engine) # cur = collection.find({}) # # a = [] # cur_len = 0 # # num_empty_sent = 0 # for ent in cur: # cur_len += 1 # # s = frames_from_db_entry(ent) # # if not s: # # num_empty_sent += 1 # a += frames_from_db_entry(ent) print(time.time() - start_time) # print(num_empty_sent) print('get_sentences_of_interest') start_time = time.time() # sentences_of_interest = get_sentences_of_interest(headword_category, collection, w_collection, RF, mongo) # sentences_of_interest_stored = args.p1_processed if not args.p1_processed: with tqdm(total=len(headword_category)) as pbar: get_sentences_of_interest(headword_category, collection_ssj, w_collection_ssj, RF, mongo, pbar) if not args.ignore_gigafida: with tqdm(total=len(headword_category)) as pbar: get_sentences_of_interest(headword_category, collection_gigafida, w_collection_gigafida, RF, mongo, pbar) # sentences_of_interest = OrderedDict(sorted(sentences_of_interest.items())) print(time.time() - start_time) # num_sentences = 0 # for el in all_sentences: # if el not in sentences_of_interest: # num_sentences += 1 # # print(num_sentences) # print(len(all_sentences)) print('extract_sentences') start_time = time.time() # formatted_sentences_stored = args.p2_processed if not args.p2_processed: gf_anno_paths = list(os.walk(args.input_gigafida_annotated)) gf_anno_paths = [os.path.join(p_t[0], f_n) for p_t in gf_anno_paths for f_n in p_t[2]] gf_orig_paths = list(os.walk(args.input_gigafida_original)) gf_orig_paths = sorted([os.path.join(p_t[0], f_n) for p_t in gf_orig_paths for f_n in p_t[2] if f_n[:2] == 'GF']) extract_sentences(w_collection_ssj, w_a_collection_ssj, args, args.input_sloleks, None) if not args.ignore_gigafida: extract_sentences(w_collection_gigafida, w_a_collection_gigafida, args, gf_anno_paths, gf_orig_paths) print(time.time() - start_time) print('write_xml') start_time = time.time() # print('aa ' + 3) 
    with tqdm(total=len(headword_category)) as pbar:
        write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session,
                  w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection,
                  args.corpus_name, args.pattern_examples_limit, args.ignore_gigafida, pbar)
    print(time.time() - start_time)

    # input_file.close()
    session.close()


if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser(description='Export and validate collocation data from DDD database.')
    arg_parser.add_argument('--sloleks_db', type=str, help='Database credentials')
    arg_parser.add_argument('--schema', type=str, help='XML schema')
    arg_parser.add_argument('--infile', type=str, help='Input file')
    arg_parser.add_argument('--outdir', type=str, help='Output directory')
    arg_parser.add_argument('--headwords', type=str, default=None,
                            help='Path to the file where headwords will be saved.')
    arg_parser.add_argument('--language', type=str, help='Language of certain attributes')
    arg_parser.add_argument('--structure_extraction', type=str,
                            help='Path to project (https://gitea.cjvt.si/ozbolt/luscenje_struktur)')
    arg_parser.add_argument('--corpus_name', type=str, help='Name of the corpus to be written in the output.')
    arg_parser.add_argument('--pattern_examples_limit', type=int, default=10,
                            help='Maximum number of examples per valency pattern.')
    arg_parser.add_argument('--ignore_gigafida', action='store_true',
                            help='If set, ignore Gigafida in the output.')
    arg_parser.add_argument('--p1_processed',
                            help='Skip the first part (obtaining sentences of interest) when they are already in the DB.',
                            action='store_true')
    arg_parser.add_argument('--p2_processed',
                            help='Skip the second part (obtaining formatted sentences) when they are already in the DB.',
                            action='store_true')
    arg_parser.add_argument('--structures', help='Structure definitions in an XML file')
    arg_parser.add_argument('--input_sloleks',
                            help='Input file(s) (gz or xml currently). If none, only the database is loaded.',
                            nargs='*')
    arg_parser.add_argument('--input_gigafida_annotated',
                            help='Input file (gz or xml currently). If none, only the database is loaded.')
    arg_parser.add_argument('--input_gigafida_original',
                            help='Input file (gz or xml currently). If none, only the database is loaded.')
    arg_parser.add_argument('--out', help='Classic output file')
    arg_parser.add_argument('--out-no-stat', help='Output file, but without statistical columns')
    arg_parser.add_argument('--all', help='Additional output file that writes more data')
    arg_parser.add_argument('--stats', help='Output file for statistics')
    arg_parser.add_argument('--no-msd-translate',
                            help='MSDs are translated from Slovene to English by default',
                            action='store_true')
    arg_parser.add_argument('--skip-id-check',
                            help='Skips checks for ids of sentences and paragraphs, if they are in the correct format',
                            action='store_true')
    arg_parser.add_argument('--min_freq', help='Minimal frequency in output', type=int, default=0, const=1, nargs='?')
    arg_parser.add_argument('--verbose', help='Enable verbose output to stderr',
                            choices=["warning", "info", "debug"], default="info", const="info", nargs='?')
    arg_parser.add_argument('--count-files', help="Count files: more verbose output", action='store_true')
    arg_parser.add_argument('--multiple-output', help='Generate one output for each syntactic structure',
                            action='store_true')
    arg_parser.add_argument('--sort-by', help="Sort by this column (index)", type=int, default=-1)
    arg_parser.add_argument('--sort-reversed', help="Sort in reversed order", action='store_true')
    arg_parser.add_argument('--db', help="Database file to use (instead of memory)", default=None)
    arg_parser.add_argument('--new-db', help="Overwrites the database file if one already exists", action='store_true')
    arg_parser.add_argument('--pc-tag', help='Tag for separators, usually pc or c', default="pc")

    args = arg_parser.parse_args()
    logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())

    # try:
    #     sys.path.insert(1, args.structure_extraction)
    #     from progress_bar import progress
    #     from word import Word, WordCompressed
    #     from syntactic_structure import build_structures
    #     from match_store import MatchStore
    #     from word_stats import WordStats
    #     from writer import Writer
    #     from loader import load_files, file_sentence_glue_generator
    #     from database import Database
    #     from time_info import TimeInfo
    #     from msd_translate import MSD_TRANSLATE
    # except:
    #     raise

    start = time.time()
    main(args)
    logging.info("TIME: {}".format(time.time() - start))
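
# Illustrative invocation. The script name, paths, and credentials below are
# placeholders (not part of the original project); only the flags themselves are
# taken from the argument parser above:
#
#   python3 extract_valency.py \
#       --sloleks_db user:password:sloleks_db:localhost \
#       --headwords data/headwords.txt \
#       --language sl \
#       --outdir data/xml_out \
#       --corpus_name ssj500k \
#       --structures data/structures.xml \
#       --input_sloleks data/ssj500k.xml \
#       --input_gigafida_annotated data/gf_annotated/ \
#       --input_gigafida_original data/gf_orig/ \
#       --pattern_examples_limit 10 \
#       --ignore_gigafida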