#!/usr/bin/python3
# imports from luscenje_struktur
from luscenje_struktur.progress_bar import progress
from luscenje_struktur.word import Word, WordCompressed
from luscenje_struktur.syntactic_structure import build_structures
from luscenje_struktur.match_store import MatchStore
from luscenje_struktur.word_stats import WordStats
from luscenje_struktur.writer import Writer
from luscenje_struktur.loader import load_files, file_sentence_glue_generator
from luscenje_struktur.database import Database
from luscenje_struktur.time_info import TimeInfo
from luscenje_struktur.msd_translate import MSD_TRANSLATE

# make database-service
import gc
import re
import string
from collections import OrderedDict
import sys
from tqdm import tqdm
import pymongo
# import tqdm as tqdm

# sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/valency')
# sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/cjvt-corpusparser')
from valency.Frame import frames_from_db_entry_headword
from valency.reduce_functions import reduce_functions

import argparse
import os
import shutil
import lxml.etree as lxml
import codecs
import logging
import pickle
import time
from io import StringIO
from lxml import etree
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import Session, aliased
from sqlalchemy import create_engine
from sqlalchemy import func
from pymongo import MongoClient, UpdateOne, InsertOne

# examples_num = sys.maxsize
# corpus = 'ssj'
translations = {
    'ACT': 'KDO/KAJ',
    'PAT': 'KOGA/KAJ',
    'RESLT': 'REZULTAT',
    'REC': 'KOMU/ČEMU',
    'TIME': 'KDAJ',
    'MANN': 'KAKO',
    'LOC': 'KJE',
    'MEANS': 'S ČIM',
    'GOAL': 'ČEMU',
    'REG': 'GLEDE NA KOGA/KAJ',
    'DUR': 'KOLIKO ČASA',
    'CAUSE': 'ZAKAJ',
    'COND': 'POD KATERIM POGOJEM',
    'ORIG': 'IZVOR',
    'FREQ': 'KOLIKOKRAT',
    'SOURCE': 'OD KOD',
    'AIM': 'S KAKŠNIM NAMENOM',
    'QUANT': 'ŠTEVILO',
    'EVENT': 'NA DOGODKU',
    'CONTR': 'KLJUB ČEMU',
    'ACMP': 'S KOM/ČIM',
    'RESTR': 'Z OMEJITVIJO',
    'MWPRED': '',
    'MODAL': '',
    'PHRAS': ''
}

CATEGORY_MAP = {
    'noun': 'samostalnik',
    'verb': 'glagol',
    'adjective': 'pridevnik',
    'adverb': 'prislov',
    'pronoun': 'zaimek',
    'numeral': 'števnik',
    'preposition': 'predlog',
    'conjunction': 'veznik',
    'particle': 'členek',
    'interjection': 'medmet',
    'abbreviation': 'okrajšava',
    'residual': 'neuvrščeno'
}

ASPECT_MAP = {
    'perfective': 'dovršni',
    'progressive': 'nedovršni',
    'biaspectual': 'dvovidski'
}

CASE_MAP = {
    'n': 'nominative',
    'g': 'genitive',
    'd': 'dative',
    'a': 'accusative',
    'l': 'locative',
    'i': 'instrumental'
}

Lexeme = None
LexemeFeature = None
SyntacticStructure = None
StructureComponent = None
Feature = None
LexicalUnitLexeme = None
LexicalUnit = None
LexicalUnitType = None
Category = None
Sense = None
Measure = None
LexicalUnitMeasure = None
Corpus = None
Definition = None
WordForm = None
WordFormFeature = None
FormRepresentation = None

# corpus = 'gigafida'
from pathlib import Path
import json
def hws_generator(collection, headword_text, RF, mongo):
    cur = collection.find({"headwords": headword_text})
    # print('tu2!')
    frames = []
    for ent in cur:
        frames += frames_from_db_entry_headword(ent, headword_text)  # pre-process this step for prod TODO
    cur.close()
    # if headword_text == 'brati':
    #     print('here')
    # if headword_text == 'prevajati':
    #     print('here')
    ret_frames = RF(frames, mongo.db.sensemap)
    # print('tu4!')
    for frame in ret_frames:
        frame_json = frame.to_json()
        yield frame_json
def get_sentences_of_interest(headword_category, collection, w_collection, RF, mongo, pbar, status_collection, corpus_type):
    sentences_of_interest = {}
    # all_sentences = set()
    sorted(headword_category, key=lambda x: x[0])
    # num_sentences in RAM at once
    sentences_num_limit = 15000
    sentences_in_ram = 0
    # part = 0
    # start_time = time.time()
    # first_sentence = True
    # section_included = False
    # last_processed_hw = 'pomeniti'
    # last_processed_hw = 'iti'
    # last_processed_hw = 'aktivirati'
    # last_processed_hw = 'aktivirati'
    status_collection_update_list = []
    # already_processed = False
    for headword_id, (headword_text, category_text) in enumerate(headword_category):
        # check whether element has been processed
        if status_collection.count_documents({'corpus_type': corpus_type, 'headword_text': headword_text, 'part': 'p1'}):
            pbar.update(1)
            continue
        # print(headword_text)
        # if already_processed:
        #     if headword_text != last_processed_hw:
        #         continue
        #     else:
        #         already_processed = False
        # for headword_text, category_text in headword_category[15:20]:
        # headword_text = 'zadovoljen'
        # category_text = 'adjective'
        headword_patterns_ids = {}
        # print('tu1!')
        cur = collection.find({"headwords": headword_text})
        # print('tu2!')
        frames = []
        for ent in cur:
            frames += frames_from_db_entry_headword(ent, headword_text)  # pre-process this step for prod TODO
        cur.close()
        # if headword_text == 'brati':
        #     print('here')
        # if headword_text == 'prevajati':
        #     print('here')
        ret_frames = RF(frames, mongo.db.sensemap)
        json_ret = {"frames": []}
        # print('tu4!')
        for frame in ret_frames:
            frame_json = frame.to_json()
            json_ret["frames"].append(frame_json)
        # print('tu5!')

        # get xml values
        for hws in json_ret.values():
            for hw in hws:
                # print(hw['hw'])
                # if hw['hw'] == 'pomeniti':
                #     print('aaa')
                # generate valency pattern key
                valency_pattern_key = []
                functors = {}
                if len(hw['tids']) != 1:
                    raise Exception('Multiple TIDS')
                for slot in hw['slots']:
                    valency_pattern_key.append(slot['functor'])
                    for tid in slot['tids']:
                        if tid not in functors:
                            functors[tid] = {}
                        functors[tid] = slot['functor']
                valency_pattern_key = tuple(sorted(valency_pattern_key))
                if valency_pattern_key not in headword_patterns_ids:
                    headword_patterns_ids[valency_pattern_key] = []
                for sentence in hw['sentences']:
                    # all_sentences.add(sentence[0][0])
                    # if len(headword_patterns_ids[valency_pattern_key]) < examples_num:
                    # if section_included:
                    #     if not sentences_in_ram > sentences_num_limit:
                    #         sentences_in_ram += 1
                    #         continue
                    #     else:
                    #         first_sentence = True
                    sentence_id = sentence[0][0].rsplit('.', 1)[0]
                    # print(sentence_id)
                    if sentence_id not in sentences_of_interest:
                        sentences_of_interest[sentence_id] = {}
                    idi = 0
                    parent_idi = -1
                    # print('t1')
                    for idx, word in sentence:
                        if idx == hw['tids'][0]:
                            parent_idi = idi
                        if word['word']:
                            idi += 1
                    # print('t2')
                    if parent_idi == -1:
                        raise Exception('No parent found!')
                    idi = 0
                    # if len(sentence) > 500:
                    #     print(len(sentence))
                    for idx, word in sentence:
                        if idx in functors:
                            # sentences_of_interest[sentence_id][(word['lemma'], MSD_TRANSLATE[word['msd']])] = functors[idx]
                            # sentences_of_interest[sentence_id][(word['lemma'], MSD_TRANSLATE[word['msd']])] = (functors[idx], idi)
                            # sentences_of_interest[sentence_id][idi] = (functors[idx], (word['lemma'], MSD_TRANSLATE[word['msd']]))
                            sentences_of_interest[sentence_id][str(idi)] = (str(parent_idi), functors[idx])
                        if word['word']:
                            # if sentence_id == 'ssj37.216.892':
                            #     print(idi)
                            #     print(word['text'])
                            idi += 1
                    # print('t3')
                    headword_patterns_ids[valency_pattern_key].append(sentence_id)

        # check if this is first sentence
        # if first_sentence:
        #     one_element = next(iter(sentences_of_interest.items()))
        #     section_included = w_collection.count_documents({'_id': one_element[0],
        #         list(one_element[1].keys())[0]: list(one_element[1].values())[0]}) == 1
        #     first_sentence = False
        if sentences_in_ram >= sentences_num_limit:
            # print('print1:')
            # print(time.time() - start_time)
            start_time = time.time()
            # print('Part %d finalized')
            # print('Sentences in ram:')
            # print(sentences_in_ram)
            sentences_in_ram = 0
            # [InsertOne({'y': 1}), DeleteOne({'x': 1}),
            # ... ReplaceOne({'w': 1}, {'z': 1}, upsert=True)]
            # requests = [{'_id': k, 'connections': v} for k, v in sentences_of_interest.items()]
            # if 'GF0010453.1116.1' in sentences_of_interest:
            #     print('here')
            if len(status_collection_update_list) > 0:
                status_collection.bulk_write(status_collection_update_list)
            requests = [UpdateOne({'_id': k}, {'$set': v}, upsert=True) for k, v in sentences_of_interest.items()]
            # print('print2:')
            # print(time.time() - start_time)
            # start_time = time.time()
            result = w_collection.bulk_write(requests)
            # print('print3:')
            # print(time.time() - start_time)
            # start_time = time.time()
            del status_collection_update_list
            del requests
            del sentences_of_interest
            gc.collect()
            # print('print4:')
            # print(time.time() - start_time)
            # start_time = time.time()
            # print(part)
            # print('HEADWORD')
            # print(headword_text)
            # pbar.update(1)
            # part += 1
            #
            # w_collection.bulk_write(
            #     array.map((val) = >
            #     ({
            #         updateOne: {
            #             filter: {_id: val, uniqueid: 1001, atype: 1, ftype: 6},
            #             update: {
            #                 $set: {epoch: 1548484978658, actionbyuserid: 110, title: 'Good Morning To All'}},
            #             upsert: true
            #         }
            #     })
            #     )
            # })
            # sentences_of_interest = {{'_id': k, 'connections': v} for k, v in sentences_of_interest.items()}
            # w_collection.update_many({'_id': {'$exists': False}}, sentences_of_interest, upsert=True)
            # try:
            #     w_collection.insert_many(sentences_of_interest, ordered=False)
            # except pymongo.errors.BulkWriteError as e:
            #     print(e.details['writeErrors'])
            status_collection_update_list = []
            sentences_of_interest = {}
            # first_sentence = True
        sentences_in_ram += 1
        pbar.update(1)
        status_collection_update_list.append(InsertOne({'corpus_type': corpus_type, 'headword_text': headword_text, 'part': 'p1'}))

    # if 'GF0010453.1116.1' in sentences_of_interest:
    #     a = sentences_of_interest['GF0010453.1116.1']
    #     print('here')
    if len(status_collection_update_list) > 0:
        status_collection.bulk_write(status_collection_update_list)
    requests = [UpdateOne({'_id': k}, {'$set': v}, upsert=True) for k, v in sentences_of_interest.items()]
    if len(requests) > 0:
        result = w_collection.bulk_write(requests)
    # sentences_of_interest = [{'_id': k, 'connections': v} for k, v in sentences_of_interest.items()]
    # try:
    #     w_collection.insert_many(sentences_of_interest, ordered=False)
    # except pymongo.errors.BulkWriteError as e:
    #     print(e.details['writeErrors'])
    # sentences_of_interest = {}
    # # else:
    # #     print('aaa')
    # return sentences_of_interest

def create_sentence_output ( sentence , headword_id , corpus ) :
glue_outside = False
headword_id = str ( headword_id )
parent_node = etree . Element ( ' corpusExample ' )
parent_node . set ( ' corpusName ' , corpus )
# parent_node.text = 'AAA'
# parent_node.prefix = 'BBB'
# parent_node.tail = 'CCC'
cur_node = parent_node
# formatted_sentence = ''
first_in_tag = True
first_outside_tag = False
in_dependency_tree = False
# TODO use whole sentence!
# for idi, word in enumerate(sentence):
# def idi_word_generator(sentence):
# idi = 0
# for word in sentence:
# if len(word.text) == 1 and re.match('^[\w]+$', word.text) is None:
# continue
# yield idi, word
# idi += 1
idi = 0
attach_to = None
p_cur_node = None
p_attach_to = None
p_glue_attach_to = None
previous_word = None
# if sentence[0][0][0] == 'Tako':
# print('here')
# for idi, word in idi_word_generator(sentence):
for word_id in range ( len ( sentence ) ) :
# is_ending_tree = False
# SRL container output
word = sentence [ word_id ]
# sentence output
if in_dependency_tree :
if headword_id not in word [ 2 ] or in_dependency_tree != word [ 2 ] [ headword_id ] :
attach_to = cur_node
# is_ending_tree = True
p_glue_attach_to = cur_node
cur_node = parent_node
if not first_in_tag :
# formatted_sentence += '\n'
first_in_tag = True
# formatted_sentence += '</tree>'
in_dependency_tree = False
first_outside_tag = True
if headword_id in word [ 2 ] and not in_dependency_tree :
dep_tree = lxml . SubElement ( cur_node , ' tree ' )
dep_tree . set ( ' role ' , word [ 2 ] [ headword_id ] )
cur_node = dep_tree
if not first_in_tag :
# formatted_sentence += '\n'
first_in_tag = True
# formatted_sentence += '<tree role="{}">'.format(word[2][headword_id])
in_dependency_tree = word [ 2 ] [ headword_id ]
attach_to = None
if p_glue_attach_to is not None :
glue_outside = True
if headword_id == str ( idi ) and not ( len ( word [ 0 ] [ 0 ] ) == 1 and re . match ( ' ^[ \ w]+$ ' , word [ 0 ] [ 0 ] ) is None ) :
# if headword_id == idi:
comp = lxml . SubElement ( cur_node , ' comp ' )
comp . set ( ' role ' , ' headword ' )
if not first_outside_tag :
if p_attach_to is None :
if p_cur_node is not None :
p_cur_node . text + = previous_word [ 0 ] [ 1 ]
else :
p_attach_to . tail + = previous_word [ 0 ] [ 1 ]
elif p_glue_attach_to is not None :
if p_glue_attach_to . tail is None :
p_glue_attach_to . tail = previous_word [ 0 ] [ 1 ]
else :
p_glue_attach_to . tail + = previous_word [ 0 ] [ 1 ]
# elif p_attach_to is not None:
# if p_attach_to.tail is None:
# p_attach_to.tail = previous_word[0][1]
# else:
# p_attach_to.tail += previous_word[0][1]
word_text = word [ 0 ] [ 0 ]
comp . text = word_text
attach_to = comp
if not first_in_tag :
# formatted_sentence += '\n'
first_in_tag = True
first_outside_tag = True
p_cur_node = cur_node
p_glue_attach_to = comp
p_attach_to = attach_to
previous_word = word
# formatted_sentence += '<comp structure_id="headword">{}</comp>'.format(word[0][0])
idi + = 1
continue
if word [ 1 ] and in_dependency_tree :
col_id = - 1
for i , col in enumerate ( word [ 1 ] ) :
if headword_id in col [ 3 ] :
col_id = i
break
if col_id != - 1 :
comp = lxml . SubElement ( cur_node , ' comp ' )
comp . set ( ' structure_id ' , word [ 1 ] [ col_id ] [ 0 ] )
comp . set ( ' num ' , word [ 1 ] [ col_id ] [ 1 ] )
if not first_outside_tag :
if p_attach_to is None :
if p_cur_node is not None :
p_cur_node . text + = previous_word [ 0 ] [ 1 ]
else :
p_attach_to . tail + = previous_word [ 0 ] [ 1 ]
elif p_glue_attach_to is not None :
if p_glue_attach_to . tail is None :
p_glue_attach_to . tail = previous_word [ 0 ] [ 1 ]
else :
p_glue_attach_to . tail + = previous_word [ 0 ] [ 1 ]
# elif p_attach_to is not None:
# if p_attach_to.tail is None:
# p_attach_to.tail = previous_word[0][1]
# else:
# p_attach_to.tail += previous_word[0][1]
word_text = word [ 0 ] [ 0 ]
comp . text = word_text
attach_to = comp
if not first_in_tag :
# formatted_sentence += '\n'
first_in_tag = True
first_outside_tag = True
# Assuming one collocation per word
# formatted_sentence += '<comp structure_id="{}" num="{}">{}</comp>'.format(word[1][0][0], word[1][0][1], word[0][0])
p_cur_node = cur_node
p_glue_attach_to = comp
p_attach_to = attach_to
previous_word = word
idi + = 1
continue
# collocation
# if not first_in_new_row:
# # formatted_sentence += ' '
# word_text = ' ' + word[0][0]
# else:
# word_text = word[0][0]
# if first_in_tag and previous_word:
# word_text = previous_word[0][1] + word[0][0]
# else:
# word_text = word[0][0]
# word_text += word[0][1]
# word_text = word[0][0] + word[0][1]
if not first_outside_tag :
if p_attach_to is None :
if p_cur_node is not None :
p_cur_node . text + = previous_word [ 0 ] [ 1 ]
else :
p_attach_to . tail + = previous_word [ 0 ] [ 1 ]
word_text = word [ 0 ] [ 0 ]
else :
word_text = ' '
if p_attach_to is None :
if p_cur_node is not None :
word_text + = previous_word [ 0 ] [ 1 ]
else :
word_text + = previous_word [ 0 ] [ 1 ]
if glue_outside :
p_glue_attach_to . tail = previous_word [ 0 ] [ 1 ]
word_text = word [ 0 ] [ 0 ]
else :
word_text + = word [ 0 ] [ 0 ]
if attach_to is None :
if cur_node . text is None :
cur_node . text = word_text
else :
cur_node . text + = word_text
else :
if attach_to . tail is None :
attach_to . tail = word_text
else :
attach_to . tail + = word_text
# attach_to.tail +=word[0][0]
# formatted_sentence += word[0][0]
first_in_tag = False
first_outside_tag = False
p_cur_node = cur_node
p_attach_to = attach_to
previous_word = word
p_glue_attach_to = None
if len ( word [ 0 ] [ 0 ] ) == 1 and re . match ( ' ^[ \ w]+$ ' , word [ 0 ] [ 0 ] ) is None :
continue
idi + = 1
return parent_node
def get_SRLcontainer_data(sentence, word_of_interest_id, summary):
    for word in sentence:
        if word_of_interest_id in word[2]:
            for col in word[1]:
                if word_of_interest_id in col[3]:
                    if word[2][word_of_interest_id] not in summary:
                        summary[word[2][word_of_interest_id]] = {}
                    if col[0] not in summary[word[2][word_of_interest_id]]:
                        summary[word[2][word_of_interest_id]][col[0]] = {}
                    # word_of_interest_included = word_of_interest_id in col[3]
                    if col[1] not in summary[word[2][word_of_interest_id]][col[0]]:
                        summary[word[2][word_of_interest_id]][col[0]][col[1]] = set()
                    if col[2][0] == 'S':
                        summary[word[2][word_of_interest_id]][col[0]][col[1]].add((word[0][0], col[2][1], word[3]))
    return summary


def valid_valency_pattern(valency_pattern_key):
    occurences = set()
    for v_p in valency_pattern_key:
        if v_p in occurences:
            return False
        occurences.add(v_p)
    return True
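
# Illustration only (not used by the pipeline): a pattern counts as valid when no
# semantic role repeats, e.g. valid_valency_pattern(('ACT', 'PAT')) -> True,
# valid_valency_pattern(('ACT', 'ACT', 'PAT')) -> False.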
def obtain_xml_data(collection, w_a_collection, headword_text, RF, mongo, patterns, pattern_id_max, valency_pattern_id_collection, corpus, examples_num, headword_patterns_ssj):
    cur = collection.find({"headwords": headword_text})
    frames = []
    for ent in cur:
        frames += frames_from_db_entry_headword(ent, headword_text)
    cur.close()
    ret_frames = RF(frames, mongo.db.sensemap)
    json_ret = {"frames": []}
    for frame in ret_frames:
        frame_json = frame.to_json()
        json_ret["frames"].append(frame_json)

    # get xml values
    headword_patterns = {}
    new_patterns = {}
    for hws in json_ret.values():
        for hw in hws:
            # generate valency pattern key
            valency_pattern_key = []
            for slot in hw['slots']:
                valency_pattern_key.append(slot['functor'])
            # sort valency_pattern_key by order provided in translations
            valency_pattern_key_new = []
            for key in translations:
                if key in valency_pattern_key:
                    valency_pattern_key_new.append(key)
            valency_pattern_key = tuple(valency_pattern_key_new)
            if valency_pattern_key not in headword_patterns:
                headword_patterns[valency_pattern_key] = {}
                headword_patterns[valency_pattern_key]['sentence_examples'] = []
                headword_patterns[valency_pattern_key]['sentence_num'] = 0
                headword_patterns[valency_pattern_key]['sr_data'] = {}
            if valency_pattern_key not in patterns and valency_pattern_key not in new_patterns:
                new_patterns[valency_pattern_key] = pattern_id_max
                patterns[valency_pattern_key] = pattern_id_max
                pattern_id_max += 1
            headword_patterns[valency_pattern_key]['id'] = patterns[valency_pattern_key]
            sr_data = headword_patterns[valency_pattern_key]['sr_data']
            tids = set(hw['tids'])
            if valency_pattern_key in headword_patterns_ssj:
                ssj_len = len(headword_patterns_ssj[valency_pattern_key]['sentence_examples'])
            else:
                ssj_len = 0
            for sentence in hw['sentences']:
                # sentences_of_interest.append(sentence[0])
                # get sentence example
                # sentence_example = []
                sent_id = sentence[0][0].rsplit('.', 1)[0]
                try:
                    db_sentence = next(iter(w_a_collection.find({'_id': sent_id})))['words']
                except StopIteration:
                    continue
                # if valency_pattern_key == ('ACT', 'PAT'):
                #     print('am')
                # idi = 0
                idi = 0
                hw_idi = -1
                for word_id, word in sentence:
                    if word_id in tids:
                        hw_idi = idi
                    if word['word']:
                        idi += 1
                if hw_idi == -1:
                    raise Exception('No such headword idi!')
                # for idi, word in idi_word_generator(sentence):
                #     print('here')
                # for word_id, word_dict in sentence:
                #     # TODO Modify sentence!
                #     # if formatted_sentences[sent_id]
                #     sentence_example.append(word_dict['text'])
                #     if word_dict['word']:
                #         idi += 1
                # if sent_id == 'ssj134.880.3375':
                #     print('here')
                # if sent_id == 'ssj38.227.917':
                #     print('here')
                # if sent_id == 'GF0004627.1913.1':
                #     print('here')
                # print(sent_id)
                # print([a for a in w_a_collection.find()])
                # if valency_pattern_key == ('ACT', 'PAT'):
                #     print('here')
                sr_data = get_SRLcontainer_data(db_sentence, str(hw_idi), sr_data)
                examples_included_num = 0
                # sr_data = get_SRLcontainer_data(formatted_sentences[sent_id], hw_idi, sr_data)
                if len(headword_patterns[valency_pattern_key]['sentence_examples']) + ssj_len < examples_num and valid_valency_pattern(valency_pattern_key):
                    examples_included_num += 1
                    sentence_example = create_sentence_output(db_sentence, hw_idi, corpus)
                    # sentence_example = create_sentence_output(formatted_sentences[sent_id], hw_idi)
                    # sentence_example = ''.join(sentence_example)
                    headword_patterns[valency_pattern_key]['sentence_examples'].append(sentence_example)
                headword_patterns[valency_pattern_key]['sentence_num'] += 1
                headword_patterns[valency_pattern_key]['sr_data'] = sr_data

    # add patterns to db
    new_patterns_query = [InsertOne({'_id': v, 'semantic_roles': list(k)}) for k, v in new_patterns.items()]
    if len(new_patterns_query) > 0:
        result = valency_pattern_id_collection.bulk_write(new_patterns_query)

    # calculate statistics
    semantic_role_stats = {}
    sentence_tot = 0
    pattern_tot = len(headword_patterns)
    for key, val in headword_patterns.items():
        sentence_num = val['sentence_num']
        for sr in key:
            if sr in semantic_role_stats:
                semantic_role_stats[sr]['valency_pattern_num'] += 1
                semantic_role_stats[sr]['valency_sentence_num'] += sentence_num
            else:
                semantic_role_stats[sr] = {}
                semantic_role_stats[sr]['valency_pattern_num'] = 1
                semantic_role_stats[sr]['valency_sentence_num'] = sentence_num
        sentence_tot += sentence_num
    return headword_patterns, semantic_role_stats, sentence_tot, pattern_tot, pattern_id_max

def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, corpus_name, pattern_examples_limit, ignore_gigafida, pbar):
query_general = session . query ( Lexeme . id , LexicalUnitLexeme . id , LexicalUnit . id , LexicalUnitMeasure . value ,
Lexeme . dummy , LexicalUnitType . name ) \
. join ( Category , Category . id == Lexeme . category_id ) \
. join ( LexicalUnitLexeme , LexicalUnitLexeme . lexeme_id == Lexeme . id ) \
. join ( LexicalUnit , LexicalUnit . id == LexicalUnitLexeme . lexical_unit_id ) \
. join ( LexicalUnitType , LexicalUnitType . id == LexicalUnit . type_id ) \
. join ( LexicalUnitMeasure , LexicalUnitMeasure . lexical_unit_id == LexicalUnit . id ) \
. join ( Measure , Measure . id == LexicalUnitMeasure . measure_id ) \
. join ( Corpus , Corpus . id == LexicalUnitMeasure . corpus_id ) \
. filter ( LexicalUnitType . name == ' single_lexeme_unit ' ) \
. filter ( Measure . name == ' frequency ' ) \
. filter ( Corpus . name == ' gigafida ' ) \
. filter ( Corpus . version == ' 2.0 ' )
# valency_pattern_id_collection.find()
# used to not repeat search queries for prepositions
preposition_list = { }
for headword_text , category_text in headword_category :
# with lxml.xmlfile('data/output.xml', encoding='utf-8') as xf:
# a = [a for a in valency_pattern_id_collection.find()]
patterns = { tuple ( v_p [ ' semantic_roles ' ] ) : v_p [ ' _id ' ] for v_p in [ a for a in valency_pattern_id_collection . find ( ) ] }
# patterns = {}
pattern_id_max = len ( patterns ) + 1
# pattern_examples_limit = 4
# get data
headword_patterns_ssj , semantic_role_stats_ssj , sentence_tot_ssj , pattern_tot_ssj , pattern_id_max = obtain_xml_data ( collection_ssj , w_a_collection_ssj ,
headword_text , RF , mongo , patterns , pattern_id_max , valency_pattern_id_collection , ' ssj500k 2.2 ' , pattern_examples_limit ,
{ } )
if not ignore_gigafida :
headword_patterns_gf , semantic_role_stats_gf , sentence_tot_gf , pattern_tot_gf , pattern_id_max = obtain_xml_data ( collection_gigafida ,
w_a_collection_gigafida ,
headword_text , RF ,
mongo , patterns ,
pattern_id_max , valency_pattern_id_collection , ' Gigafida 2.0 ' , pattern_examples_limit , headword_patterns_ssj )
wf1 = aliased ( WordFormFeature )
wf2 = aliased ( WordFormFeature )
wf3 = aliased ( WordFormFeature )
query_preposition = session . query ( FormRepresentation . form ) \
. join ( WordForm , WordForm . id == FormRepresentation . word_form_id ) \
. join ( Lexeme , Lexeme . id == WordForm . lexeme_id ) \
. join ( wf1 , wf1 . word_form_id == WordForm . id ) \
. join ( wf2 , wf2 . word_form_id == WordForm . id ) \
. join ( wf3 , wf3 . word_form_id == WordForm . id ) \
. filter ( Lexeme . lemma == headword_text ) \
. filter ( wf1 . value == ' singular ' ) \
. filter ( wf2 . value == ' third ' ) \
. filter ( wf3 . value == ' present ' )
pattern_translation_hws = query_preposition . all ( )
pattern_translation_3_sin = headword_text
if len ( pattern_translation_hws ) == 1 :
pattern_translation_3_sin = pattern_translation_hws [ 0 ] . form
qname = etree . QName ( " http://www.w3.org/2001/XMLSchema-instance " , " noNamespaceSchemaLocation " )
dictionary = lxml . Element ( ' dictionary ' , { qname : ' valency_lexicon.xsd ' } )
if headword_text [ - 1 ] == ' _ ' :
headword_text_query = headword_text [ : - 1 ]
else :
headword_text_query = headword_text
query = query_general . filter ( Category . name == category_text ) \
. filter ( Lexeme . lemma == headword_text_query ) \
. group_by ( Lexeme . id , LexicalUnitLexeme . id , LexicalUnit . id , LexicalUnitMeasure . value ,
LexicalUnitType . name )
# res = query.one_or_none()
query_res = query . all ( )
# query2 = session.query(Lexeme.id) \
# .join(Category, Category.id == Lexeme.category_id) \
# .join(LexicalUnitLexeme, LexicalUnitLexeme.lexeme_id == Lexeme.id) \
# .join(LexicalUnit, LexicalUnit.id == LexicalUnitLexeme.lexical_unit_id) \
# .join(LexicalUnitType, LexicalUnitType.id == LexicalUnit.type_id) \
# .join(LexicalUnitMeasure, LexicalUnitMeasure.lexical_unit_id == LexicalUnit.id) \
# .join(Measure, Measure.id == LexicalUnitMeasure.measure_id) \
# .join(Corpus, Corpus.id == LexicalUnitMeasure.corpus_id) \
# .join(LexemeFeature, LexemeFeature.lexeme_id == Lexeme.id) \
# .join(Feature, Feature.id == LexemeFeature.feature_id) \
# .filter(LexicalUnitType.name == 'single_lexeme_unit') \
# .filter(Measure.name == 'frequency') \
# .filter(Category.name == 'preposition') \
# .filter(Lexeme.lemma == 'za') \
# .filter(Feature.name == 'case') \
# .filter(LexemeFeature.value == 'instrumental') \
# .group_by(Lexeme.id)
# query2 = session.query(Lexeme.id) \
# .join(Category, Category.id == Lexeme.category_id) \
# .join(LexemeFeature, LexemeFeature.lexeme_id == Lexeme.id) \
# .join(Feature, Feature.id == LexemeFeature.feature_id) \
# .filter(Lexeme.lemma == 'za') \
# .filter(Feature.name == 'case') \
# .filter(LexemeFeature.value == 'instrumental') \
# .group_by(Lexeme.id)
#
# a = query2.all()
if len ( query_res ) == 1 :
( lexeme_id , lexical_unit_lexeme_id , lexical_unit_id , frequency , _ , lexical_unit_type_name ) = \
query_res [ 0 ]
elif len ( query_res ) > 1 :
# all lexical_unit_ids equal or at least one dummy
final_lexical_unit_id = 0
final_lexical_unit_lexeme_id = 0
for r in query_res :
( lexeme_id , lexical_unit_lexeme_id , lexical_unit_id , frequency , dummy ,
lexical_unit_type_name ) = r
if dummy :
final_lexical_unit_id = lexical_unit_id
final_lexical_unit_lexeme_id = lexical_unit_lexeme_id
break
lexical_unit_id = final_lexical_unit_id
lexical_unit_lexeme_id = final_lexical_unit_lexeme_id
else :
frequency = 0
lexeme_id = 0
lexical_unit_id = 0
lexical_unit_lexeme_id = 0
lexical_unit_type_name = ' '
sense_ids = session . query ( Sense . id , Sense . dummy ) . filter ( Sense . lexical_unit_id == lexical_unit_id ) . all ( )
features = session . query ( LexemeFeature . value ) . join ( Feature , Feature . id == LexemeFeature . feature_id ) \
. filter ( LexemeFeature . lexeme_id == lexeme_id ) \
. filter ( Feature . name == ' aspect ' ) . all ( )
entry = lxml . SubElement ( dictionary , ' entry ' )
head = lxml . SubElement ( entry , ' head ' )
headword = lxml . SubElement ( head , ' headword ' )
lemma = lxml . SubElement ( headword , ' lemma ' )
lemma . text = headword_text
lexical_unit = lxml . SubElement ( head , ' lexicalUnit ' )
lexical_unit . set ( ' id ' , str ( lexical_unit_id ) )
lexical_unit_type_name = ' single ' if lexical_unit_type_name == ' single_lexeme_unit ' else lexical_unit_type_name
lexical_unit . set ( ' type ' , lexical_unit_type_name )
lexeme = lxml . SubElement ( lexical_unit , ' lexeme ' )
lexeme . set ( ' lexical_unit_lexeme_id ' , str ( lexical_unit_lexeme_id ) )
lexeme . text = headword_text
grammar = lxml . SubElement ( head , ' grammar ' )
category = lxml . SubElement ( grammar , ' category ' )
if args . language == ' sl ' :
category . text = CATEGORY_MAP [ category_text ] if category_text in CATEGORY_MAP else ' '
else :
category . text = category_text
grammarFeature = lxml . SubElement ( grammar , ' grammarFeature ' )
if args . language == ' sl ' :
grammarFeature . set ( ' name ' , ' vid ' )
grammarFeature . text = ASPECT_MAP [ features [ 0 ] . value ] if len ( features ) > 0 and features [
0 ] . value in ASPECT_MAP else ' '
else :
grammarFeature . set ( ' name ' , ' aspect ' )
grammarFeature . text = features [ 0 ] . value if len ( features ) > 0 else ' '
measureList = lxml . SubElement ( head , ' measureList ' )
measure = lxml . SubElement ( measureList , ' measure ' )
measure . set ( ' type ' , ' frequency ' )
# TODO Modify this!
measure . set ( ' source ' , ' Gigafida 2.0 ' )
# measure.set('source', 'ssj500k')
measure . text = str ( int ( frequency ) )
body = lxml . SubElement ( entry , ' body ' )
statisticsContainerList = lxml . SubElement ( body , ' statisticsContainerList ' )
# combine semantic_role_stats
semantic_role_stats = { }
for semanticRole_val , semanticRole_stats in semantic_role_stats_ssj . items ( ) :
semantic_role_stats [ semanticRole_val ] = { }
semantic_role_stats [ semanticRole_val ] [ ' ssj ' ] = semanticRole_stats
if not ignore_gigafida :
for semanticRole_val , semanticRole_stats in semantic_role_stats_gf . items ( ) :
if semanticRole_val not in semantic_role_stats :
semantic_role_stats [ semanticRole_val ] = { }
semantic_role_stats [ semanticRole_val ] [ ' gf ' ] = semanticRole_stats
for semanticRole_val , semanticRole_stats in semantic_role_stats . items ( ) :
statisticsContainer = lxml . SubElement ( statisticsContainerList , ' statisticsContainer ' )
semanticRole = lxml . SubElement ( statisticsContainer , ' semanticRole ' )
semanticRole . text = semanticRole_val
measureList = lxml . SubElement ( statisticsContainer , ' measureList ' )
if ' ssj ' in semanticRole_stats :
measure_pattern_ssj = lxml . SubElement ( measureList , ' measure ' )
measure_pattern_ssj . set ( ' type ' , ' valency_pattern_ratio ' )
measure_pattern_ssj . set ( ' source ' , ' ssj500k 2.2 ' )
measure_pattern_ssj . text = ' %.4f ' % (
semantic_role_stats [ semanticRole_val ] [ ' ssj ' ] [ ' valency_pattern_num ' ] / pattern_tot_ssj )
measure_sentence_ssj = lxml . SubElement ( measureList , ' measure ' )
measure_sentence_ssj . set ( ' type ' , ' valency_sentence_ratio ' )
measure_sentence_ssj . set ( ' source ' , ' ssj500k 2.2 ' )
if sentence_tot_ssj == 0 :
measure_sentence_ssj . text = ' %.4f ' % ( 0.0 )
# print(headword_text)
# print(semanticRole_val)
# print(semantic_role_stats[semanticRole_val]['gf']['valency_sentence_num'])
else :
measure_sentence_ssj . text = ' %.4f ' % (
semantic_role_stats [ semanticRole_val ] [ ' ssj ' ] [ ' valency_sentence_num ' ] / sentence_tot_ssj )
# measure_sentence_ssj.text = '%.2f' % (
# semantic_role_stats[semanticRole_val]['ssj']['valency_sentence_num'] / sentence_tot_ssj)
if ' gf ' in semanticRole_stats and not ignore_gigafida :
measure_pattern_gf = lxml . SubElement ( measureList , ' measure ' )
measure_pattern_gf . set ( ' type ' , ' valency_pattern_ratio ' )
measure_pattern_gf . set ( ' source ' , ' Gigafida 2.0 ' )
measure_pattern_gf . text = ' %.4f ' % (
semantic_role_stats [ semanticRole_val ] [ ' gf ' ] [ ' valency_pattern_num ' ] / pattern_tot_gf )
measure_sentence_gf = lxml . SubElement ( measureList , ' measure ' )
measure_sentence_gf . set ( ' type ' , ' valency_sentence_ratio ' )
measure_sentence_gf . set ( ' source ' , ' Gigafida 2.0 ' )
if sentence_tot_gf == 0 :
measure_sentence_gf . text = ' %.4f ' % ( 0.0 )
# print(headword_text)
# print(semanticRole_val)
# print(semantic_role_stats[semanticRole_val]['gf']['valency_sentence_num'])
else :
measure_sentence_gf . text = ' %.4f ' % (
semantic_role_stats [ semanticRole_val ] [ ' gf ' ] [ ' valency_sentence_num ' ] / sentence_tot_gf )
senseList = lxml . SubElement ( body , ' senseList ' )
for sense_id in sense_ids :
if len ( sense_ids ) > 1 and sense_id . dummy :
continue
sense = lxml . SubElement ( senseList , ' sense ' )
if not sense_id . dummy :
sense . set ( ' id ' , str ( sense_id . id ) )
definitionList = lxml . SubElement ( sense , ' definitionList ' )
definition_texts = session . query ( Definition . description ) . filter (
Definition . sense_id == sense_id . id ) . all ( )
for definition_text in definition_texts :
definition = lxml . SubElement ( definitionList , ' definition ' )
definition . text = definition_text [ 0 ]
syntactic_structures = session . query ( SyntacticStructure . id , SyntacticStructure . name ,
StructureComponent . id , StructureComponent . name ) . join (
LexicalUnit , LexicalUnit . syntactic_structure_id == SyntacticStructure . id ) \
. join ( StructureComponent , StructureComponent . syntactic_structure_id == SyntacticStructure . id ) \
. filter ( LexicalUnit . id == sense_id . id )
# .join(LexicalUnitLexeme, LexicalUnitLexeme.structure_component_id == StructureComponent.id) \
# syntactic_structures2 = session.query(SyntacticStructure.id, SyntacticStructure.name).join(SyntacticStructure, SyntacticStructure.id == LexicalUnit.syntactic_structure_id) \
# .filter(SyntacticStructure.id == sense_id)
syntactic_structuresr = syntactic_structures . all ( )
# syntactic_structures2r = syntactic_structures2.all()
valencyPatternList = lxml . SubElement ( sense , ' valencyPatternList ' )
valencyPatternList . set ( ' system ' , ' JOS ' )
# combine semantic_role_stats ##################################
headword_patterns = { }
for headword_patterns_val , headword_patterns_stats in headword_patterns_ssj . items ( ) :
headword_patterns [ headword_patterns_val ] = { }
headword_patterns [ headword_patterns_val ] [ ' ssj ' ] = headword_patterns_stats
if not ignore_gigafida :
for headword_patterns_val , headword_patterns_stats in headword_patterns_gf . items ( ) :
if headword_patterns_val not in headword_patterns :
headword_patterns [ headword_patterns_val ] = { }
headword_patterns [ headword_patterns_val ] [ ' gf ' ] = headword_patterns_stats
#################################################################
for headword_pattern , headword_pattern_dict in headword_patterns . items ( ) :
valencyPattern = lxml . SubElement ( valencyPatternList , ' valencyPattern ' )
valencyPattern . set ( ' id ' , str ( patterns [ headword_pattern ] ) )
measureList_sense = lxml . SubElement ( valencyPattern , ' measureList ' )
if ' ssj ' in headword_pattern_dict :
measure_sense = lxml . SubElement ( measureList_sense , ' measure ' )
measure_sense . set ( ' type ' , ' frequency_all ' )
measure_sense . set ( ' source ' , ' ssj500k 2.2 ' )
measure_sense . text = str ( headword_pattern_dict [ ' ssj ' ] [ ' sentence_num ' ] )
if not ignore_gigafida and ' gf ' in headword_pattern_dict and headword_pattern_dict [ ' gf ' ] [ ' sentence_num ' ] :
measure_sense = lxml . SubElement ( measureList_sense , ' measure ' )
measure_sense . set ( ' type ' , ' frequency_all ' )
measure_sense . set ( ' source ' , ' Gigafida 2.0 ' )
measure_sense . text = str ( headword_pattern_dict [ ' gf ' ] [ ' sentence_num ' ] )
semanticRoleContainerList = lxml . SubElement ( valencyPattern , ' semanticRoleContainerList ' )
# patternId = lxml.SubElement(semanticRoles, 'patternId')
# patternId.text = str(patterns[headword_pattern])
if ' ACT ' in headword_pattern :
patternTranslationText = ' KDO/KAJ ' + pattern_translation_3_sin
else :
patternTranslationText = headword_text
for semantic_role in headword_pattern :
if semantic_role != ' ACT ' :
# additional rules
# if semantic_role == 'RESLT':
# pass
# else:
# patternTranslationText += ' ' + translations[semantic_role]
patternTranslationText + = ' ' + translations [ semantic_role ]
semanticRoleContainer = lxml . SubElement ( semanticRoleContainerList , ' semanticRoleContainer ' )
semanticRole = lxml . SubElement ( semanticRoleContainer , ' semanticRole ' )
semanticRole . text = semantic_role
syntactic_structure_dict = { }
if ' ssj ' in headword_pattern_dict and semantic_role in headword_pattern_dict [ ' ssj ' ] [ ' sr_data ' ] :
for syn_struct_id , syn_struct_dict in headword_pattern_dict [ ' ssj ' ] [ ' sr_data ' ] [ semantic_role ] . items ( ) :
if syn_struct_id not in syntactic_structure_dict :
syntactic_structure_dict [ syn_struct_id ] = { }
for com_num , com_set in syn_struct_dict . items ( ) :
if com_num not in syntactic_structure_dict [ syn_struct_id ] :
syntactic_structure_dict [ syn_struct_id ] [ com_num ] = set ( )
for lex in com_set :
syntactic_structure_dict [ syn_struct_id ] [ com_num ] . add ( lex )
if ' gf ' in headword_pattern_dict and semantic_role in headword_pattern_dict [ ' gf ' ] [ ' sr_data ' ] :
for syn_struct_id , syn_struct_dict in headword_pattern_dict [ ' gf ' ] [ ' sr_data ' ] [ semantic_role ] . items ( ) :
if syn_struct_id not in syntactic_structure_dict :
syntactic_structure_dict [ syn_struct_id ] = { }
for com_num , com_set in syn_struct_dict . items ( ) :
if com_num not in syntactic_structure_dict [ syn_struct_id ] :
syntactic_structure_dict [ syn_struct_id ] [ com_num ] = set ( )
for lex in com_set :
syntactic_structure_dict [ syn_struct_id ] [ com_num ] . add ( lex )
if len ( syntactic_structure_dict ) > 0 :
syntacticStructureList = lxml . SubElement ( semanticRoleContainer , ' syntacticStructureList ' )
# iterate over syntactic structures and write them
for syn_struct_id , component_dict in syntactic_structure_dict . items ( ) :
syntacticStructure = lxml . SubElement ( syntacticStructureList , ' syntacticStructure ' )
syntacticStructure . set ( ' id ' , syn_struct_id )
for comp_id , lexemes in component_dict . items ( ) :
for l in lexemes :
component = lxml . SubElement ( syntacticStructure , ' component ' )
component . set ( ' num ' , comp_id )
lexem = lxml . SubElement ( component , ' lexeme ' )
if l in preposition_list :
prep_id = preposition_list [ l ]
else :
query_preposition = session . query ( Lexeme . id ) \
. join ( Category , Category . id == Lexeme . category_id ) \
. join ( LexemeFeature , LexemeFeature . lexeme_id == Lexeme . id ) \
. join ( Feature , Feature . id == LexemeFeature . feature_id ) \
. filter ( Lexeme . lemma == l [ 2 ] ) \
. filter ( Feature . name == ' case ' ) \
. filter ( LexemeFeature . value == CASE_MAP [ l [ 1 ] ] ) \
. group_by ( Lexeme . id )
preposition_ids = query_preposition . all ( )
if len ( preposition_ids ) != 1 :
prep_id = ' '
else :
prep_id = str ( preposition_ids [ 0 ] [ 0 ] )
preposition_list [ l ] = prep_id
lexem . set ( ' sloleks ' , prep_id )
lexem . text = l [ 2 ]
patternRepresentation = lxml . SubElement ( valencyPattern , ' patternRepresentation ' )
patternRepresentation . text = patternTranslationText
exampleContainerList = lxml . SubElement ( valencyPattern , ' exampleContainerList ' )
if ' ssj ' in headword_pattern_dict :
for sentence_example in headword_pattern_dict [ ' ssj ' ] [ ' sentence_examples ' ] :
exampleContainer = lxml . SubElement ( exampleContainerList , ' exampleContainer ' )
# corpusExample = lxml.SubElement(exampleContainer, 'corpusExample')
exampleContainer . append ( sentence_example )
if ' gf ' in headword_pattern_dict :
for sentence_example in headword_pattern_dict [ ' gf ' ] [ ' sentence_examples ' ] :
exampleContainer = lxml . SubElement ( exampleContainerList , ' exampleContainer ' )
# corpusExample = lxml.SubElement(exampleContainer, 'corpusExample')
exampleContainer . append ( sentence_example )
with lxml . xmlfile ( os . path . join ( args . outdir , ' VS10_ ' + headword_text + ' _ ' + corpus_name + ' .xml ' ) ,
encoding = ' utf-8 ' ) as xf :
xf . write ( dictionary , pretty_print = True )
pbar . update ( 1 )
def init_db(db):
    global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
    [db_user, db_password, db_database, db_host] = db.split(':')
    Base = declarative_base()
    engine = create_engine('postgresql://' + db_user + ':' + db_password + '@' + db_host + '/' + db_database,
                           pool_recycle=14400)
    Base.metadata.reflect(engine)

    class Lexeme(Base):
        __table__ = Base.metadata.tables['jedro_lexeme']

    class LexemeFeature(Base):
        __table__ = Base.metadata.tables['jedro_lexeme_feature']

    class SyntacticStructure(Base):
        __table__ = Base.metadata.tables['jedro_syntacticstructure']

    class StructureComponent(Base):
        __table__ = Base.metadata.tables['jedro_structurecomponent']

    class Feature(Base):
        __table__ = Base.metadata.tables['jedro_feature']

    class LexicalUnitLexeme(Base):
        __table__ = Base.metadata.tables['jedro_lexicalunit_lexeme']

    class LexicalUnit(Base):
        __table__ = Base.metadata.tables['jedro_lexicalunit']

    class LexicalUnitType(Base):
        __table__ = Base.metadata.tables['jedro_lexicalunittype']

    class Category(Base):
        __table__ = Base.metadata.tables['jedro_category']

    class Sense(Base):
        __table__ = Base.metadata.tables['jedro_sense']

    class Measure(Base):
        __table__ = Base.metadata.tables['jedro_measure']

    class LexicalUnitMeasure(Base):
        __table__ = Base.metadata.tables['jedro_lexicalunitmeasure']

    class Corpus(Base):
        __table__ = Base.metadata.tables['jedro_corpus']

    class Definition(Base):
        __table__ = Base.metadata.tables['jedro_definition']

    class WordForm(Base):
        __table__ = Base.metadata.tables['jedro_wordform']

    class WordFormFeature(Base):
        __table__ = Base.metadata.tables['jedro_wordform_feature']

    class FormRepresentation(Base):
        __table__ = Base.metadata.tables['jedro_formrepresentation']

    return engine
def match_file(words, structures):
    matches = []
    for s in structures:
        for w in words:
            mhere = s.match(w)
            for match in mhere:
                # save only those with verbs in them
                if not [True for m in match.values() if m.msd[0] == 'V']:
                    continue
                colocation_id = [(idx, w.lemma) for idx, w in match.items()]
                colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x: x[0]))
                colocation_id = tuple(colocation_id)
                matches.append([match, colocation_id])
    return matches
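
# Shape note, inferred from the code above (for readability only): each entry in
# `matches` is [match, colocation_id], where colocation_id looks roughly like
# (structure_id, (component_idx, lemma), (component_idx, lemma), ...),
# i.e. the structure id followed by the matched components sorted by component index.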
possible_jos_links = {'dol', 'del', 'prir', 'vez', 'skup', 'ena', 'dve', 'tri', 'štiri', 'modra'}


def find_word_sons(word, deppar_dict, word_id, role):
    for k, v in word.links.items():
        for w in v:
            # if k in possible_jos_links and w.id == 'ssj1.1.1.t21':
            #     print('here')
            if k in possible_jos_links:
                if w.id not in deppar_dict:
                    deppar_dict[w.id] = {}
                deppar_dict[w.id][word_id] = role
                find_word_sons(w, deppar_dict, word_id, role)
            # elif k in possible_jos_links:
            #     raise Exception('One word in multiple dependency parsetrees')
# for ignoring punctuations
def idi_word_generator(sentence):
    idi = 0
    for word in sentence:
        if len(word.text) == 1 and re.match(r'^[\w]+$', word.text) is None:
            continue
        yield idi, word
        idi += 1
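
# Intended behaviour, illustrative only: for the tokens ['Pes', ',', 'laja', '.'] the
# generator yields the words 'Pes' and 'laja' with indices 0 and 1; single-character
# non-word tokens such as ',' and '.' are skipped and do not advance the running index idi.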
def extract_sentences ( w_collection , w_a_collection , args , input_corpus , input_corpus_orig ) :
structures , _ , max_num_components = build_structures ( args )
timeinfo = TimeInfo ( len ( input_corpus ) )
database = Database ( args )
formatted_sentences = { }
start_time = time . time ( )
sentences_num_limit = 10000
sentences_in_ram = 0
sentence_glue_numbers = None
is_gf = input_corpus_orig is not None
if is_gf :
glue_words_gen = file_sentence_glue_generator ( input_corpus_orig , args . pc_tag , w_collection )
for sent_id , sentence , othr_sentence_attributes in load_files ( args , database , w_collection , input_corpus ) :
if is_gf :
# create tuple for comparison with sentence_flue_words
sent_id_numbers = tuple ( [ int ( sid ) for sid in sent_id [ 2 : ] . split ( ' . ' ) ] )
if sentence_glue_numbers is not None and sentence_glue_numbers > sent_id_numbers :
logging . warning (
f " Skipping sentence in annotated sentence id (sent_id)! Annotated sent_id = { sent_id } , original sent_id = { sentence_glue [ 0 ] } " )
continue
sentence_glue = next ( glue_words_gen )
sentence_glue_numbers = tuple ( [ int ( sid ) for sid in sentence_glue [ 0 ] [ 2 : ] . split ( ' . ' ) ] )
while sentence_glue_numbers < sent_id_numbers :
logging . warning (
f " Skipping sentence in original sentence id (sentence_glue)! Annotated sent_id = { sent_id } , original sent_id = { sentence_glue [ 0 ] } " )
sentence_glue = next ( glue_words_gen )
if sent_id != sentence_glue [ 0 ] :
raise Exception ( f " Annotated gigafida and original gigafida not in sync (annotated sent_id = { sent_id } , original sent_id = { sentence_glue [ 0 ] } " )
if len ( sentence_glue [ 1 ] ) != len ( sentence ) :
logging . warning ( f " Skipping sentence! Annotated gigafida and original gigafida size is not the same (annotated: { len ( sentence ) } , original: { len ( sentence_glue [ 1 ] ) } " )
continue
for w , w_glue in zip ( sentence , sentence_glue [ 1 ] ) :
w . glue = w_glue [ 2 ]
if sentence is None :
timeinfo . add_measurement ( - 1 )
continue
# start_time = time.time()
# print(time.time() - start_time)
matches = match_file ( sentence , structures )
# if sent_id == 'ssj134.880.3375':
# print('here')
# print(time.time() - start_time)
# match_store.add_matches(matches)
# word_stats.add_words(words)
# database.commit()
# find unimportant collocations
# extract_possible_headwords = set(v[0] for v in othr_sentence_attributes.values())
for match in matches :
match_idis = [ ]
for key , word in match [ 0 ] . items ( ) :
match_idis . append ( word . idi )
match . append ( match_idis )
collocations = { }
for match in matches :
for key , word in match [ 0 ] . items ( ) :
# if word.id == ''
if word . id not in collocations :
collocations [ word . id ] = [ ]
collocations [ word . id ] . append ( ( match [ 1 ] [ 0 ] , key , word . msd [ : 2 ] , match [ 2 ] ) )
# print(time.time() - start_time)
formatted_sentence = [ ]
deppar_dict = { }
# idi = 0
# create output and form dependency parsetree sons
for idi , word in idi_word_generator ( sentence ) :
# if word.text == 'Mumel':
# print('here')
# if word.text == 'Poleg':
# print('here')
# if word.text == 'Luka':
# print('here')
idi = str ( idi )
# a = sent_id in sentences_of_interest
# b = (word.lemma, word.msd) in sentences_of_interest[sent_id]
# if word.msd == 'X':
# continue
# if len(word.text) == 1 and word.text in string.punctuation + '':
# a = re.match('^[\w]+$', word.text) is not None
# if len(word.text) == 1 and re.match('^[\w]+$', word.text) is None:
# continue
# if sent_id in sentences_of_interest and (word.lemma, word.msd) in sentences_of_interest[sent_id]:
# if sent_id in sentences_of_interest and idi in sentences_of_interest[sent_id]:
# cur_count = w_collection.count_documents({'_id': sent_id})
# if w_collection.count_documents({'_id': sent_id}) > 0:
sentence_of_interest = othr_sentence_attributes
# is_count = cur.count() > 0
if idi in othr_sentence_attributes :
if word . id not in deppar_dict :
deppar_dict [ word . id ] = { }
deppar_dict [ word . id ] [ sentence_of_interest [ idi ] [ 0 ] ] = sentence_of_interest [ idi ] [ 1 ]
# deppar_dict[word.id] = {idi: sentences_of_interest[sent_id][idi]}
# if idi != sentences_of_interest[sent_id][(word.lemma, word.msd)][1]:
# if (word.lemma, word.msd) != sentences_of_interest[sent_id][idi][1]:
# print((word.lemma, word.msd))
# print(sentences_of_interest[sent_id][idi][1])
# if sentences_of_interest[sent_id][(word.lemma, word.msd)][1] > idi:
# print('HERE')
find_word_sons ( word , deppar_dict , sentence_of_interest [ idi ] [ 0 ] , sentence_of_interest [ idi ] [ 1 ] )
# idi += 1
# print(time.time() - start_time)
for word in sentence :
if word . id in collocations :
col = collocations [ word . id ]
else :
col = [ ]
if word . id in deppar_dict :
dp = deppar_dict [ word . id ]
else :
dp = { }
formatted_sentence . append ( ( ( word . text , word . glue ) , col , dp , word . lemma ) )
# create_sentence_output(formatted_sentence, 4)
formatted_sentences [ sent_id ] = formatted_sentence
if sentences_in_ram > = sentences_num_limit :
sentences_in_ram = 0
requests = [ UpdateOne ( { ' _id ' : k } , { ' $set ' : { ' words ' : v } } , upsert = True ) for k , v in formatted_sentences . items ( ) ]
result = w_a_collection . bulk_write ( requests )
formatted_sentences = { }
sentences_in_ram + = 1
# print(time.time() - start_time)
requests = [ UpdateOne ( { ' _id ' : k } , { ' $set ' : { ' words ' : v } } , upsert = True ) for k , v in formatted_sentences . items ( ) ]
result = w_a_collection . bulk_write ( requests )
# force a bit of garbage collection
# del sentence
# del sent_id
# del matches
# gc.collect()
print ( time . time ( ) - start_time )
# return formatted_sentences
# # timeinfo.add_measurement(time.time() - start_time)
# # timeinfo.info()
# # if no output files, just exit
# if all([x == None for x in [args.out, args.out_no_stat, args.all, args.stats]]):
# return
#
# # get word renders for lemma/msd
# word_stats.generate_renders()
# match_store.determine_colocation_dispersions()
#
# # figure out representations!
# if args.out or args.out_no_stat:
# match_store.set_representations(word_stats, structures)
#
# Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
# structures, match_store)
# Writer.make_output_no_stat_writer(args, max_num_components, match_store, word_stats).write_out(
# structures, match_store)
# Writer.make_all_writer(args, max_num_components, match_store, word_stats).write_out(
# structures, match_store)
# Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out(
# structures, match_store)
def get_headword_category(collection):
    """
    :return: List of tuples with all headwords in mongodb and their categories.
    """
    headwords = sorted(collection.distinct("headwords")[1:])
    if args.headwords:
        with open(args.headwords, 'w') as f:
            for item in headwords:
                f.write("%s\n" % item)
    headword_category = [(headword, 'verb') if headword[-1] != '_' else (headword, 'adjective') for headword in
                         headwords]
    return headword_category
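
# Note, inferred from the list comprehension above: headwords ending in '_' are treated
# as adjectives, everything else as verbs, e.g. 'brati' -> ('brati', 'verb') and
# 'zadovoljen_' -> ('zadovoljen_', 'adjective').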
def main(args):
    # with Path('data/wordlist.json').open("r") as fp:
    #     sskj_wordlist = json.load(fp)
    # # wordlist = set(sskj_wordlist['wordlist'])
    # wordlist = set(sskj_wordlist['wordlist'])
    print('beginning chunk')
    start_time = time.time()

    # user:user:valdb:127.0.0.1
    [db_user, db_password, db_database, db_host] = args.mongo_db.split(':')
    mongo = MongoClient(username=db_user, password=db_password, authSource=db_database)
    db = mongo.valdb
    collection_ssj = db['ssj']
    collection_gigafida = db['gigafida']
    db2 = mongo.extvaldb
    # write collection
    w_collection_ssj = db2['ssj']
    w_collection_gigafida = db2['gigafida']
    w_a_collection_ssj = db2['ssj' + '_all']
    w_a_collection_gigafida = db2['gigafida' + '_all']
    status_collection = db2['status']
    valency_pattern_id_collection = db2['valency_pattern_ids']

    RF = reduce_functions["reduce_0"]["f"]

    # get all headwords from database
    # headword_category = get_headword_category(collection_ssj)
    with open(args.headwords, 'r') as read:
        headword_category = [(line[:-1], 'verb') for line in read.readlines()]

    assert args.language == 'en' or args.language == 'sl'
    shutil.rmtree(args.outdir, True)
    os.mkdir(args.outdir)
    engine = init_db(args.sloleks_db)
    # input_file = codecs.open(args.infile, 'r')
    # # input_file = []
    # next(input_file)
    # category_map = {'samostalnik':'noun', 'glagol':'verb', 'pridevnik':'adjective', 'prislov':'adverb', 'števnik':'numeral', 'zaimek':'pronoun', 'medmet':'interjection', 'veznik':'conjunction'}
    session = Session(engine)

    # cur = collection.find({})
    #
    # a = []
    # cur_len = 0
    # # num_empty_sent = 0
    # for ent in cur:
    #     cur_len += 1
    #     # s = frames_from_db_entry(ent)
    #     # if not s:
    #     #     num_empty_sent += 1
    #     a += frames_from_db_entry(ent)
    print(time.time() - start_time)
    # print(num_empty_sent)

    print('get_sentences_of_interest')
    start_time = time.time()
    # sentences_of_interest = get_sentences_of_interest(headword_category, collection, w_collection, RF, mongo)
    # sentences_of_interest_stored = args.p1_processed
    if not args.p1_processed:
        with tqdm(total=len(headword_category)) as pbar:
            get_sentences_of_interest(headword_category, collection_ssj, w_collection_ssj, RF, mongo, pbar, status_collection, 'ssj')
        if not args.ignore_gigafida:
            with tqdm(total=len(headword_category)) as pbar:
                get_sentences_of_interest(headword_category, collection_gigafida, w_collection_gigafida, RF, mongo, pbar, status_collection, 'gigafida')
    # sentences_of_interest = OrderedDict(sorted(sentences_of_interest.items()))
    print(time.time() - start_time)

    # num_sentences = 0
    # for el in all_sentences:
    #     if el not in sentences_of_interest:
    #         num_sentences += 1
    #
    # print(num_sentences)
    # print(len(all_sentences))
    print('extract_sentences')
    start_time = time.time()
    # formatted_sentences_stored = args.p2_processed
    if not args.p2_processed:
        gf_anno_paths = list(os.walk(args.input_gigafida_annotated))
        gf_anno_paths = [os.path.join(p_t[0], f_n) for p_t in gf_anno_paths for f_n in p_t[2]]
        gf_orig_paths = list(os.walk(args.input_gigafida_original))
        gf_orig_paths = sorted([os.path.join(p_t[0], f_n) for p_t in gf_orig_paths for f_n in p_t[2] if f_n[:2] == 'GF'])
        extract_sentences(w_collection_ssj, w_a_collection_ssj, args, args.input_sloleks, None)
        if not args.ignore_gigafida:
            extract_sentences(w_collection_gigafida, w_a_collection_gigafida, args, gf_anno_paths, gf_orig_paths)
    print(time.time() - start_time)

    print('write_xml')
    start_time = time.time()
    # print('aa ' + 3)
    with tqdm(total=len(headword_category)) as pbar:
        write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, args.corpus_name, args.pattern_examples_limit, args.ignore_gigafida, pbar)
    print(time.time() - start_time)

    # input_file.close()
    session.close()

if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser(description='Export and validate collocation data from DDD database.')
    arg_parser.add_argument('--sloleks_db', type=str, help='Database credentials')
    arg_parser.add_argument('--mongo_db', type=str, help='Database credentials')
    arg_parser.add_argument('--schema', type=str, help='XML schema')
    arg_parser.add_argument('--infile', type=str, help='Input file')
    arg_parser.add_argument('--outdir', type=str, help='Output directory')
    arg_parser.add_argument('--headwords', type=str, default=None, help='Path to file where headwords will be saved.')
    arg_parser.add_argument('--language', type=str, help='Language of certain attributes')
    arg_parser.add_argument('--corpus_name', type=str, help='Name of corpus to be written in outputs.')
    arg_parser.add_argument('--pattern_examples_limit', type=int, default=10, help='Max number of examples.')
    arg_parser.add_argument('--ignore_gigafida', action='store_true', help='If set, ignore gigafida in output.')
    arg_parser.add_argument('--p1_processed',
                            help='Skip first part (obtaining sentences of interest) when they are already in DB.',
                            action='store_true')
    arg_parser.add_argument('--p2_processed',
                            help='Skip second part (obtaining formatted sentences) when they are already in DB.',
                            action='store_true')
    arg_parser.add_argument('--structures',
                            help='Structures definitions in xml file')
    arg_parser.add_argument('--input_sloleks',
                            help='input file (gz or xml currently). If none, then just database is loaded', nargs='*')
    arg_parser.add_argument('--input_gigafida_annotated',
                            help='input file (gz or xml currently). If none, then just database is loaded')
    arg_parser.add_argument('--input_gigafida_original',
                            help='input file (gz or xml currently). If none, then just database is loaded')
    arg_parser.add_argument('--out',
                            help='Classic output file')
    arg_parser.add_argument('--out-no-stat',
                            help='Output file, but without statistical columns')
    arg_parser.add_argument('--all',
                            help='Additional output file, writes more data')
    arg_parser.add_argument('--stats',
                            help='Output file for statistics')
    arg_parser.add_argument('--no-msd-translate',
                            help='MSDs are translated from slovene to english by default',
                            action='store_true')
    arg_parser.add_argument('--skip-id-check',
                            help='Skips checks for ids of <w> and <pc>, if they are in correct format',
                            action='store_true')
    arg_parser.add_argument('--min_freq', help='Minimal frequency in output',
                            type=int, default=0, const=1, nargs='?')
    arg_parser.add_argument('--verbose', help='Enable verbose output to stderr',
                            choices=["warning", "info", "debug"], default="info",
                            const="info", nargs='?')
    arg_parser.add_argument('--count-files',
                            help="Count files: more verbose output", action='store_true')
    arg_parser.add_argument('--multiple-output',
                            help='Generate one output for each syntactic structure',
                            action='store_true')
    arg_parser.add_argument('--sort-by',
                            help="Sort by this column (index)", type=int, default=-1)
    arg_parser.add_argument('--sort-reversed',
                            help="Sort in reversed order", action='store_true')
    arg_parser.add_argument('--db',
                            help="Database file to use (instead of memory)", default=None)
    arg_parser.add_argument('--new-db',
                            help="Writes over database file, if there exists one", action='store_true')
    arg_parser.add_argument('--pc-tag',
                            help='Tag for separators, usually pc or c', default="pc")

    args = arg_parser.parse_args()
    logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())
    start = time.time()
    main(args)
    logging.info("TIME: {}".format(time.time() - start))
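
# Example invocation (illustrative only; the script name, paths and credentials below are
# placeholders, and the credential format user:password:database:host mirrors how
# --mongo_db and --sloleks_db are split above):
# python3 create_xml.py \
#     --mongo_db user:user:valdb:127.0.0.1 \
#     --sloleks_db user:password:sloleks_db:localhost \
#     --headwords data/headwords.txt \
#     --structures data/structures.xml \
#     --input_sloleks data/ssj500k.xml \
#     --outdir data/output \
#     --language sl \
#     --corpus_name ssj500k \
#     --pattern_examples_limit 10 \
#     --ignore_gigafida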