#!/usr/bin/python3
# -*- coding: utf-8 -*-
# imports from luscenje_struktur
import copy
import csv
from luscenje_struktur.progress_bar import progress
from luscenje_struktur.word import Word, WordCompressed
from luscenje_struktur.syntactic_structure import build_structures
from luscenje_struktur.match_store import MatchStore
from luscenje_struktur.word_stats import WordStats
from luscenje_struktur.writer import Writer
from luscenje_struktur.loader import load_files, file_sentence_glue_generator
from luscenje_struktur.database import Database
from luscenje_struktur.time_info import TimeInfo
from luscenje_struktur.msd_translate import MSD_TRANSLATE
# make database-service
import gc
import re
import string
from collections import OrderedDict
import sys
from tqdm import tqdm
import pymongo
from valency.Frame import frames_from_db_entry_headword
from valency.reduce_functions import reduce_functions
import argparse
import os
import shutil
import lxml.etree as lxml
import codecs
import logging
import pickle
import time
from io import StringIO
from lxml import etree
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import Session, aliased
from sqlalchemy import create_engine
from sqlalchemy import func
from pymongo import MongoClient, UpdateOne, InsertOne
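# Illustrative invocation of this script; every path, credential and value below is a
# placeholder, not a canonical configuration:
#
#   python3 create_xml.py \
#       --sloleks_db user:password:sloleks_db:127.0.0.1 \
#       --mongo_db user:user:valdb:127.0.0.1 \
#       --structures structures.xml \
#       --input_sloleks data/ssj500k.xml \
#       --headwords data/headwords.txt \
#       --outdir data/xml_out \
#       --language sl \
#       --corpus_name ssj500k \
#       --ignore_gigafida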
# examples_num = sys.maxsize
# corpus = 'ssj'
translations = {
'ACT': 'KDO/KAJ',
'PAT': 'KOGA/KAJ',
'RESLT': 'REZULTAT',
'REC': 'KOMU/ČEMU',
'TIME': 'KDAJ',
'MANN': 'KAKO',
'LOC': 'KJE',
'MEANS': 'S ČIM',
'GOAL': 'ČEMU',
'REG': 'GLEDE NA KOGA/KAJ',
'DUR': 'KOLIKO ČASA',
'CAUSE': 'ZAKAJ',
'COND': 'POD KATERIM POGOJEM',
'ORIG': 'IZVOR',
'FREQ': 'KOLIKOKRAT',
'SOURCE': 'OD KOD',
'AIM': 'S KAKŠNIM NAMENOM',
'QUANT': 'ŠTEVILO',
'EVENT': 'NA DOGODKU',
'CONTR': 'KLJUB ČEMU',
'ACMP': 'S KOM/ČIM',
'RESTR': 'Z OMEJITVIJO',
'MWPRED': '',
'MODAL': '',
'PHRAS': ''
}
CATEGORY_MAP = {
'noun': 'samostalnik',
'verb': 'glagol',
'adjective': 'pridevnik',
'adverb': 'prislov',
'pronoun': 'zaimek',
'numeral': 'števnik',
'preposition': 'predlog',
'conjunction': 'veznik',
'particle': 'členek',
'interjection': 'medmet',
'abbreviation': 'okrajšava',
'residual': 'neuvrščeno'
}
ASPECT_MAP = {
'perfective': 'dovršni',
'progressive': 'nedovršni',
'biaspectual': 'dvovidski'
}
CASE_MAP = {
'n': 'nominative',
'g': 'genitive',
'd': 'dative',
'a': 'accusative',
'l': 'locative',
'i': 'instrumental'
}
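# Minimal sketch of how the lookup tables above are used further down (the lookups are
# real, the trailing comments are only a summary):
#
#   CATEGORY_MAP['verb']      -> 'glagol'      # <category> text when --language sl
#   ASPECT_MAP['perfective']  -> 'dovršni'     # <grammarFeature name="vid"> text
#   CASE_MAP['a']             -> 'accusative'  # Sloleks 'case' feature filter for prepositions
#   translations['ACT']       -> 'KDO/KAJ'     # building <patternRepresentation> strings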
ssj_frequency_dict = {}
Lexeme = None
LexemeFeature = None
Feature = None
LexicalUnitLexeme = None
LexicalUnit = None
LexicalUnitType = None
Category = None
Sense = None
Measure = None
LexicalUnitMeasure = None
Corpus = None
Definition = None
WordForm = None
WordFormFeature = None
FormRepresentation = None
FormEncoding = None
# corpus = 'gigafida'
from pathlib import Path
import json
def hws_generator(collection, headword_text, RF, mongo):
    """Yield reduced frame dicts for all database entries containing the given headword."""
    cur = collection.find({"headwords": headword_text})
    frames = []
    for ent in cur:
        frames += frames_from_db_entry_headword(ent, headword_text)  # pre-process this step for prod TODO
    cur.close()
    ret_frames = RF(frames, mongo.db.sensemap)
    for frame in ret_frames:
        frame_json = frame.to_json()
        yield frame_json
def get_sentences_of_interest(headword_category, collection, w_collection, RF, mongo, pbar, status_collection, corpus_type):
sentences_of_interest = {}
# all_sentences = set()
    headword_category = sorted(headword_category, key=lambda x: x[0])
# num_sentences in RAM at once
sentences_num_limit = 15000
sentences_in_ram = 0
# part = 0
# start_time = time.time()
# first_sentence = True
# section_included = False
# last_processed_hw = 'pomeniti'
# last_processed_hw = 'iti'
# last_processed_hw = 'aktivirati'
# last_processed_hw = 'aktivirati'
status_collection_update_list = []
# already_processed = False
for headword_id, (headword_text, category_text) in enumerate(headword_category):
# check whether element has been processed
if status_collection.count_documents({'corpus_type': corpus_type, 'headword_text': headword_text, 'part': 'p1'}):
pbar.update(1)
continue
# print(headword_text)
# if already_processed:
# if headword_text != last_processed_hw:
# continue
# else:
# already_processed = False
# for headword_text, category_text in headword_category[15:20]:
# headword_text = 'zadovoljen'
# category_text = 'adjective'
headword_patterns_ids = {}
# print('tu1!')
cur = collection.find({"headwords": headword_text})
# cur = collection.find({"headwords": headword_text}, no_cursor_timeout=True)
# print('tu2!')
frames = []
for ent in cur:
frames += frames_from_db_entry_headword(ent, headword_text) # pre-process this step for prod TODO
cur.close()
# if headword_text == 'brati':
# print('here')
# if headword_text == 'prevajati':
# print('here')
ret_frames = RF(frames, mongo.db.sensemap)
json_ret = {"frames": []}
# print('tu4!')
for frame in ret_frames:
frame_json = frame.to_json()
json_ret["frames"].append(frame_json)
# print('tu5!')
# get xml values
for hws in json_ret.values():
for hw in hws:
# print(hw['hw'])
# if hw['hw'] == 'pomeniti':
# print('aaa')
# generate valency pattern key
valency_pattern_key = []
functors = {}
if len(hw['tids']) != 1:
raise Exception('Multiple TIDS')
for slot in hw['slots']:
valency_pattern_key.append(slot['functor'])
for tid in slot['tids']:
if tid not in functors:
functors[tid] = {}
functors[tid] = slot['functor']
valency_pattern_key = tuple(sorted(valency_pattern_key))
if valency_pattern_key not in headword_patterns_ids:
headword_patterns_ids[valency_pattern_key] = []
for sentence in hw['sentences']:
# all_sentences.add(sentence[0][0])
# if len(headword_patterns_ids[valency_pattern_key]) < examples_num:
# if section_included:
# if not sentences_in_ram > sentences_num_limit:
# sentences_in_ram += 1
# continue
# else:
# first_sentence = True
sentence_id = sentence[0][0].rsplit('.', 1)[0]
# print(sentence_id)
if sentence_id not in sentences_of_interest:
sentences_of_interest[sentence_id] = {}
idi = 0
parent_idi = -1
# print('t1')
for idx, word in sentence:
if idx == hw['tids'][0]:
parent_idi = idi
if word['word']:
idi += 1
# print('t2')
if parent_idi == -1:
raise Exception('No parent found!')
idi = 0
# if len(sentence) > 500:
# print(len(sentence))
for idx, word in sentence:
if idx in functors:
# sentences_of_interest[sentence_id][(word['lemma'], MSD_TRANSLATE[word['msd']])] = functors[idx]
# sentences_of_interest[sentence_id][(word['lemma'], MSD_TRANSLATE[word['msd']])] = (functors[idx], idi)
# sentences_of_interest[sentence_id][idi] = (functors[idx], (word['lemma'], MSD_TRANSLATE[word['msd']]))
sentences_of_interest[sentence_id][str(idi)] = (str(parent_idi), functors[idx])
if word['word']:
# if sentence_id == 'ssj37.216.892':
# print(idi)
# print(word['text'])
idi += 1
# print('t3')
headword_patterns_ids[valency_pattern_key].append(sentence_id)
# check if this is first sentence
# if first_sentence:
# one_element = next(iter(sentences_of_interest.items()))
# section_included = w_collection.count_documents({'_id': one_element[0],
# list(one_element[1].keys())[0]: list(one_element[1].values())[0]}) == 1
# first_sentence = False
if sentences_in_ram >= sentences_num_limit:
# print('print1:')
# print(time.time() - start_time)
start_time = time.time()
# !!!!!!!!!!!!!!!!!!!!!!print('Part %d finalized')
# print('Sentences in ram:')
# print(sentences_in_ram)
sentences_in_ram = 0
# [InsertOne({'y': 1}), DeleteOne({'x': 1}),
# ... ReplaceOne({'w': 1}, {'z': 1}, upsert=True)]
# requests = [{'_id': k, 'connections': v} for k, v in sentences_of_interest.items()]
# if 'GF0010453.1116.1' in sentences_of_interest:
# print('here')
if len(status_collection_update_list) > 0:
status_collection.bulk_write(status_collection_update_list)
requests = [UpdateOne({'_id': k}, {'$set': v}, upsert=True) for k, v in sentences_of_interest.items()]
# print('print2:')
# print(time.time() - start_time)
# start_time = time.time()
result = w_collection.bulk_write(requests)
# print('print3:')
# print(time.time() - start_time)
# start_time = time.time()
del status_collection_update_list
del requests
del sentences_of_interest
gc.collect()
# print('print4:')
# print(time.time() - start_time)
# start_time = time.time()
# print(part)
# print('HEADWORD')
# print(headword_text)
# pbar.update(1)
# part += 1
status_collection_update_list = []
sentences_of_interest = {}
# first_sentence = True
sentences_in_ram += 1
pbar.update(1)
status_collection_update_list.append(InsertOne({'corpus_type': corpus_type, 'headword_text': headword_text, 'part': 'p1'}))
# if 'GF0010453.1116.1' in sentences_of_interest:
# a = sentences_of_interest['GF0010453.1116.1']
# print('here')
if len(status_collection_update_list) > 0:
status_collection.bulk_write(status_collection_update_list)
requests = [UpdateOne({'_id': k}, {'$set': v}, upsert=True) for k, v in sentences_of_interest.items()]
if len(requests) > 0:
result = w_collection.bulk_write(requests)
# sentences_of_interest = [{'_id': k, 'connections': v} for k, v in sentences_of_interest.items()]
# try:
# w_collection.insert_many(sentences_of_interest, ordered=False)
# except pymongo.errors.BulkWriteError as e:
# print(e.details['writeErrors'])
# sentences_of_interest = {}
# # else:
# # print('aaa')
# return sentences_of_interest
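# Sketch of a document as it ends up in the write collection (w_collection) after the
# bulk upserts above. Keys other than '_id' are word indices within the sentence, values
# are (headword index, semantic role) pairs (stored as arrays by MongoDB); the id and
# values are purely illustrative:
#
#   {
#       '_id': 'ssj37.216.892',
#       '3': ['1', 'ACT'],
#       '7': ['1', 'PAT'],
#   }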
def create_sentence_output(sentence, headword_id, corpus, sent_id):
glue_outside = False
headword_id = str(headword_id)
parent_node = etree.Element('corpusExample')
parent_node.set('corpusName', corpus)
parent_node.set('exampleId', sent_id)
# parent_node.text = 'AAA'
# parent_node.prefix = 'BBB'
# parent_node.tail = 'CCC'
cur_node = parent_node
# formatted_sentence = ''
first_in_tag = True
first_outside_tag = False
in_dependency_tree = False
# TODO use whole sentence!
# for idi, word in enumerate(sentence):
# def idi_word_generator(sentence):
# idi = 0
# for word in sentence:
# if len(word.text) == 1 and re.match('^[\w]+$', word.text) is None:
# continue
# yield idi, word
# idi += 1
idi = 0
attach_to = None
p_cur_node = None
p_attach_to = None
p_glue_attach_to = None
previous_word = None
# if sentence[0][0][0] == 'Tako':
# print('here')
# for idi, word in idi_word_generator(sentence):
for word_id in range(len(sentence)):
# is_ending_tree = False
# SRL container output
word = sentence[word_id]
# sentence output
if in_dependency_tree:
if headword_id not in word[2] or in_dependency_tree != word[2][headword_id]:
attach_to = cur_node
# is_ending_tree = True
p_glue_attach_to = cur_node
cur_node = parent_node
if not first_in_tag:
# formatted_sentence += '\n'
first_in_tag = True
# formatted_sentence += '</tree>'
in_dependency_tree = False
first_outside_tag = True
if headword_id in word[2] and not in_dependency_tree:
dep_tree = lxml.SubElement(cur_node, 'tree')
dep_tree.set('role', word[2][headword_id])
cur_node = dep_tree
if not first_in_tag:
# formatted_sentence += '\n'
first_in_tag = True
# formatted_sentence += '<tree role="{}">'.format(word[2][headword_id])
in_dependency_tree = word[2][headword_id]
attach_to = None
if p_glue_attach_to is not None:
glue_outside = True
if headword_id == str(idi) and not (len(word[0][0]) == 1 and re.match('^[\w]+$', word[0][0]) is None):
# if headword_id == idi:
comp = lxml.SubElement(cur_node, 'comp')
comp.set('role', 'headword')
if not first_outside_tag:
if p_attach_to is None:
if p_cur_node is not None:
p_cur_node.text += previous_word[0][1]
else:
p_attach_to.tail += previous_word[0][1]
elif p_glue_attach_to is not None:
if p_glue_attach_to.tail is None:
p_glue_attach_to.tail = previous_word[0][1]
else:
p_glue_attach_to.tail += previous_word[0][1]
# elif p_attach_to is not None:
# if p_attach_to.tail is None:
# p_attach_to.tail = previous_word[0][1]
# else:
# p_attach_to.tail += previous_word[0][1]
word_text = word[0][0]
comp.text = word_text
attach_to = comp
if not first_in_tag:
# formatted_sentence += '\n'
first_in_tag = True
first_outside_tag = True
p_cur_node = cur_node
p_glue_attach_to = comp
p_attach_to = attach_to
previous_word = word
# formatted_sentence += '<comp structure_id="headword">{}</comp>'.format(word[0][0])
idi += 1
continue
if word[1] and in_dependency_tree:
col_id = -1
for i, col in enumerate(word[1]):
if headword_id in col[3]:
col_id = i
break
if col_id != -1:
comp = lxml.SubElement(cur_node, 'comp')
comp.set('structure_id', word[1][col_id][0])
comp.set('num', word[1][col_id][1])
if not first_outside_tag:
if p_attach_to is None:
if p_cur_node is not None:
p_cur_node.text += previous_word[0][1]
else:
p_attach_to.tail += previous_word[0][1]
elif p_glue_attach_to is not None:
if p_glue_attach_to.tail is None:
p_glue_attach_to.tail = previous_word[0][1]
else:
p_glue_attach_to.tail += previous_word[0][1]
# elif p_attach_to is not None:
# if p_attach_to.tail is None:
# p_attach_to.tail = previous_word[0][1]
# else:
# p_attach_to.tail += previous_word[0][1]
word_text = word[0][0]
comp.text = word_text
attach_to = comp
if not first_in_tag:
# formatted_sentence += '\n'
first_in_tag = True
first_outside_tag = True
# Assuming one collocation per word
# formatted_sentence += '<comp structure_id="{}" num="{}">{}</comp>'.format(word[1][0][0], word[1][0][1], word[0][0])
p_cur_node = cur_node
p_glue_attach_to = comp
p_attach_to = attach_to
previous_word = word
idi += 1
continue
# collocation
# if not first_in_new_row:
# # formatted_sentence += ' '
# word_text = ' ' + word[0][0]
# else:
# word_text = word[0][0]
# if first_in_tag and previous_word:
# word_text = previous_word[0][1] + word[0][0]
# else:
# word_text = word[0][0]
# word_text += word[0][1]
# word_text = word[0][0] + word[0][1]
if not first_outside_tag:
if p_attach_to is None:
if p_cur_node is not None:
p_cur_node.text += previous_word[0][1]
else:
p_attach_to.tail += previous_word[0][1]
word_text = word[0][0]
else:
word_text = ''
if p_attach_to is None:
if p_cur_node is not None:
word_text += previous_word[0][1]
else:
word_text += previous_word[0][1]
if glue_outside:
p_glue_attach_to.tail = previous_word[0][1]
word_text = word[0][0]
else:
word_text += word[0][0]
if attach_to is None:
if cur_node.text is None:
cur_node.text = word_text
else:
cur_node.text += word_text
else:
if attach_to.tail is None:
attach_to.tail = word_text
else:
attach_to.tail += word_text
# attach_to.tail +=word[0][0]
# formatted_sentence += word[0][0]
first_in_tag = False
first_outside_tag = False
p_cur_node = cur_node
p_attach_to = attach_to
previous_word = word
p_glue_attach_to = None
if len(word[0][0]) == 1 and re.match('^[\w]+$', word[0][0]) is None:
continue
idi += 1
return parent_node
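# Rough shape of the <corpusExample> element built above; the actual nesting depends on
# the SRL trees and collocation structures in the sentence, so the fragment below is
# only illustrative:
#
#   <corpusExample corpusName="ssj500k 2.2" exampleId="ssj37.216.892">Besedilo pred tem
#       <tree role="PAT"><comp structure_id="1" num="2">knjigo</comp></tree>
#       <comp role="headword">bere</comp> naprej.</corpusExample>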
def get_SRLcontainer_data(sentence, word_of_interest_id, summary):
for word in sentence:
if word_of_interest_id in word[2]:
for col in word[1]:
if word_of_interest_id in col[3]:
if word[2][word_of_interest_id] not in summary:
summary[word[2][word_of_interest_id]] = {}
if col[0] not in summary[word[2][word_of_interest_id]]:
summary[word[2][word_of_interest_id]][col[0]] = {}
# word_of_interest_included = word_of_interest_id in col[3]
if col[1] not in summary[word[2][word_of_interest_id]][col[0]]:
summary[word[2][word_of_interest_id]][col[0]][col[1]] = set()
if col[2][0] == 'S':
summary[word[2][word_of_interest_id]][col[0]][col[1]].add((word[0][0], col[2][1], word[3]))
return summary
def valid_valency_pattern(valency_pattern_key):
    occurrences = set()
    for v_p in valency_pattern_key:
        if v_p in occurrences:
            return False
        occurrences.add(v_p)
    return True
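# A pattern is kept only when no semantic role repeats, e.g.:
#
#   valid_valency_pattern(('ACT', 'PAT'))         -> True
#   valid_valency_pattern(('ACT', 'ACT', 'PAT'))  -> False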
def obtain_xml_data(collection, w_a_collection, headword_text, RF, mongo, patterns, pattern_id_max, valency_pattern_id_collection, corpus, examples_num, headword_patterns_ssj):
cur = collection.find({"headwords": headword_text})
frames = []
for ent in cur:
frames += frames_from_db_entry_headword(ent, headword_text)
cur.close()
ret_frames = RF(frames, mongo.db.sensemap)
json_ret = {"frames": []}
for frame in ret_frames:
frame_json = frame.to_json()
json_ret["frames"].append(frame_json)
# get xml values
headword_patterns = {}
new_patterns = {}
for hws in json_ret.values():
for hw in hws:
# generate valency pattern key
valency_pattern_key = []
for slot in hw['slots']:
valency_pattern_key.append(slot['functor'])
# sort valency_pattern_key by order provided in translations
valency_pattern_key_new = []
for key in translations:
if key in valency_pattern_key:
valency_pattern_key_new.append(key)
valency_pattern_key = tuple(valency_pattern_key_new)
if valency_pattern_key not in headword_patterns:
headword_patterns[valency_pattern_key] = {}
headword_patterns[valency_pattern_key]['sentence_examples'] = []
headword_patterns[valency_pattern_key]['sentence_num'] = 0
headword_patterns[valency_pattern_key]['sr_data'] = {}
if valency_pattern_key not in patterns and valency_pattern_key not in new_patterns:
new_patterns[valency_pattern_key] = pattern_id_max
patterns[valency_pattern_key] = pattern_id_max
pattern_id_max += 1
headword_patterns[valency_pattern_key]['id'] = patterns[valency_pattern_key]
sr_data = headword_patterns[valency_pattern_key]['sr_data']
tids = set(hw['tids'])
if valency_pattern_key in headword_patterns_ssj:
ssj_len = len(headword_patterns_ssj[valency_pattern_key]['sentence_examples'])
else:
ssj_len = 0
for sentence in hw['sentences']:
# sentences_of_interest.append(sentence[0])
# get sentence example
# sentence_example = []
sent_id = sentence[0][0].rsplit('.', 1)[0]
try:
cur = w_a_collection.find({'_id': sent_id})
db_sentence = next(iter(cur))['words']
cur.close()
except StopIteration:
continue
# if valency_pattern_key == ('ACT', 'PAT'):
# print('am')
# idi = 0
idi = 0
hw_idi = -1
for word_id, word in sentence:
if word_id in tids:
hw_idi = idi
if word['word']:
idi += 1
if hw_idi == -1:
raise Exception('No such headword idi!')
# for idi, word in idi_word_generator(sentence):
# print('here')
# for word_id, word_dict in sentence:
# # TODO Modify sentence!
# # if formatted_sentences[sent_id]
# sentence_example.append(word_dict['text'])
# if word_dict['word']:
# idi += 1
# if sent_id == 'ssj134.880.3375':
# print('here')
# if sent_id == 'ssj38.227.917':
# print('here')
# if sent_id == 'GF0004627.1913.1':
# print('here')
# print(sent_id)
# print([a for a in w_a_collection.find()])
# if valency_pattern_key == ('ACT', 'PAT'):
# print('here')
sr_data = get_SRLcontainer_data(db_sentence, str(hw_idi), sr_data)
examples_included_num = 0
# sr_data = get_SRLcontainer_data(formatted_sentences[sent_id], hw_idi, sr_data)
if len(headword_patterns[valency_pattern_key]['sentence_examples']) + ssj_len < examples_num and valid_valency_pattern(valency_pattern_key):
examples_included_num += 1
sentence_example = create_sentence_output(db_sentence, hw_idi, corpus, sent_id)
# sentence_example = create_sentence_output(formatted_sentences[sent_id], hw_idi)
# sentence_example = ''.join(sentence_example)
# headword_patterns[valency_pattern_key]['sentence_examples'].append(sentence_example)
headword_patterns[valency_pattern_key]['sentence_examples'].append(sentence_example)
headword_patterns[valency_pattern_key]['sentence_num'] += 1
headword_patterns[valency_pattern_key]['sr_data'] = sr_data
# add patterns to db
new_patterns_query = [InsertOne({'_id': v, 'semantic_roles': list(k)}) for k, v in new_patterns.items()]
if len(new_patterns_query) > 0:
result = valency_pattern_id_collection.bulk_write(new_patterns_query)
# calculate statistics
semantic_role_stats = {}
sentence_tot = 0
pattern_tot = len(headword_patterns)
for key, val in headword_patterns.items():
sentence_num = val['sentence_num']
for sr in key:
if sr in semantic_role_stats:
semantic_role_stats[sr]['valency_pattern_num'] += 1
semantic_role_stats[sr]['valency_sentence_num'] += sentence_num
else:
semantic_role_stats[sr] = {}
semantic_role_stats[sr]['valency_pattern_num'] = 1
semantic_role_stats[sr]['valency_sentence_num'] = sentence_num
sentence_tot += sentence_num
return headword_patterns, semantic_role_stats, sentence_tot, pattern_tot, pattern_id_max
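# Sketch of the per-headword structures returned above (all values illustrative):
#
#   headword_patterns[('ACT', 'PAT')] = {
#       'id': 12,                                      # shared pattern id from valency_pattern_ids
#       'sentence_num': 57,                            # all matching sentences in this corpus
#       'sentence_examples': [<corpusExample>, ...],   # lxml elements, capped by examples_num
#       'sr_data': {'PAT': {<structure id>: {<component num>: {('na', 'a', 'na'), ...}}}},
#   }
#   semantic_role_stats['PAT'] = {'valency_pattern_num': 3, 'valency_sentence_num': 57}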
def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, corpus_name, pattern_examples_limit, ignore_gigafida, pbar):
query_general = session.query(Lexeme.id, LexicalUnitLexeme.id, LexicalUnit.id, LexicalUnitMeasure.value,
Lexeme.potential_lexeme, LexicalUnitType.name) \
.join(Category, Category.id == Lexeme.category_id) \
.join(LexicalUnitLexeme, LexicalUnitLexeme.lexeme_id == Lexeme.id) \
.join(LexicalUnit, LexicalUnit.id == LexicalUnitLexeme.lexical_unit_id) \
.join(LexicalUnitType, LexicalUnitType.id == LexicalUnit.type_id) \
.join(LexicalUnitMeasure, LexicalUnitMeasure.lexical_unit_id == LexicalUnit.id) \
.join(Measure, Measure.id == LexicalUnitMeasure.measure_id) \
.join(Corpus, Corpus.id == LexicalUnitMeasure.corpus_id) \
.filter(LexicalUnitType.name == 'single_lexeme_unit') \
.filter(Measure.name == 'frequency') \
.filter(Corpus.name == 'gigafida') \
.filter(Corpus.version == '2.0')
# valency_pattern_id_collection.find()
# used to not repeat search queries for prepositions
preposition_list = {}
for headword_text, category_text in headword_category:
# with lxml.xmlfile('data/output.xml', encoding='utf-8') as xf:
# a = [a for a in valency_pattern_id_collection.find()]
cur = valency_pattern_id_collection.find()
patterns = {tuple(v_p['semantic_roles']): v_p['_id'] for v_p in [a for a in cur]}
cur.close()
# patterns = {}
pattern_id_max = len(patterns) + 1
# pattern_examples_limit = 4
# get data
headword_patterns_ssj, semantic_role_stats_ssj, sentence_tot_ssj, pattern_tot_ssj, pattern_id_max = obtain_xml_data(collection_ssj, w_a_collection_ssj,
headword_text, RF, mongo, patterns, pattern_id_max, valency_pattern_id_collection, 'ssj500k 2.2', pattern_examples_limit,
{})
if not ignore_gigafida:
headword_patterns_gf, semantic_role_stats_gf, sentence_tot_gf, pattern_tot_gf, pattern_id_max = obtain_xml_data(collection_gigafida,
w_a_collection_gigafida,
headword_text, RF,
mongo, patterns,
pattern_id_max, valency_pattern_id_collection, 'Gigafida 2.0', pattern_examples_limit, headword_patterns_ssj)
wf1 = aliased(WordFormFeature)
wf2 = aliased(WordFormFeature)
wf3 = aliased(WordFormFeature)
query_preposition = session.query(FormEncoding.text) \
.join(FormRepresentation, FormRepresentation.id == FormEncoding.form_representation_id) \
.join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
.join(Lexeme, Lexeme.id == WordForm.lexeme_id) \
.join(wf1, wf1.word_form_id == WordForm.id) \
.join(wf2, wf2.word_form_id == WordForm.id) \
.join(wf3, wf3.word_form_id == WordForm.id) \
.filter(Lexeme.lemma == headword_text) \
.filter(wf1.value == 'singular') \
.filter(wf2.value == 'third') \
.filter(wf3.value == 'present')
pattern_translation_hws = query_preposition.all()
pattern_translation_3_sin = headword_text
if len(pattern_translation_hws) == 1:
pattern_translation_3_sin = pattern_translation_hws[0].text
qname = etree.QName("http://www.w3.org/2001/XMLSchema-instance", "noNamespaceSchemaLocation")
dictionary = lxml.Element('dictionary', {qname: 'valency_lexicon.xsd'})
if headword_text[-1] == '_':
headword_text_query = headword_text[:-1]
else:
headword_text_query = headword_text
query = query_general.filter(Category.name == category_text) \
.filter(Lexeme.lemma == headword_text_query) \
.group_by(Lexeme.id, LexicalUnitLexeme.id, LexicalUnit.id, LexicalUnitMeasure.value,
LexicalUnitType.name)
# res = query.one_or_none()
query_res = query.all()
if len(query_res) == 1:
(lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, frequency, _, lexical_unit_type_name) = \
query_res[0]
sense_ids = session.query(Sense.id, Sense.potential_sense).filter(
Sense.lexical_unit_id == lexical_unit_id).all()
features = session.query(LexemeFeature.value).join(Feature, Feature.id == LexemeFeature.feature_id) \
.filter(LexemeFeature.lexeme_id == lexeme_id) \
.filter(Feature.name == 'aspect').all()
elif len(query_res) > 1:
# find dummy
dummy_query = session.query(Lexeme.id, LexicalUnitLexeme.id, LexicalUnit.id,
Lexeme.potential_lexeme, LexicalUnitType.name) \
.join(Category, Category.id == Lexeme.category_id) \
.join(LexicalUnitLexeme, LexicalUnitLexeme.lexeme_id == Lexeme.id) \
.join(LexicalUnit, LexicalUnit.id == LexicalUnitLexeme.lexical_unit_id) \
.join(LexicalUnitType, LexicalUnitType.id == LexicalUnit.type_id) \
.filter(LexicalUnitType.name == 'single_lexeme_unit') \
.filter(Corpus.name == 'gigafida') \
.filter(Corpus.version == '2.0') \
.filter(Lexeme.lemma == headword_text_query).all()
# all lexical_unit_ids equal or at least one dummy
dummy_exists = False
final_lexical_unit_id = 0
final_lexical_unit_lexeme_id = 0
for r in dummy_query:
(lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, dummy,
lexical_unit_type_name) = r
if dummy:
final_lexical_unit_id = lexical_unit_id
final_lexical_unit_lexeme_id = lexical_unit_lexeme_id
dummy_exists = True
break
assert dummy_exists
sense_ids = []
features_set = set()
frequency = 0
for r in query_res:
(lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, el_frequency, dummy,
lexical_unit_type_name) = r
if dummy:
continue
sense_ids.extend(session.query(Sense.id, Sense.potential_sense).filter(
Sense.lexical_unit_id == lexical_unit_id).all())
features = session.query(LexemeFeature.value).join(Feature, Feature.id == LexemeFeature.feature_id) \
.filter(LexemeFeature.lexeme_id == lexeme_id) \
.filter(Feature.name == 'aspect').all()
# set features in dictionary
if not features:
for n_feat in features_set:
for f in n_feat:
features.add(f)
# compare features
else:
for n_feat in features_set:
for f in n_feat:
if f not in features:
raise Exception('Different features in query_res - might be problematic!')
frequency += el_frequency
# check if any actual sense exists if not erase all but one
any_sense_not_dummy = any([not sense[1] for sense in sense_ids])
if not any_sense_not_dummy:
sense_ids = sense_ids[-1:]
lexical_unit_id = final_lexical_unit_id
lexical_unit_lexeme_id = final_lexical_unit_lexeme_id
# sense_ids = session.query(Sense.id, Sense.potential_sense).filter(
# Sense.lexical_unit_id == lexical_unit_id).all()
# features = session.query(LexemeFeature.value).join(Feature, Feature.id == LexemeFeature.feature_id) \
# .filter(LexemeFeature.lexeme_id == lexeme_id) \
# .filter(Feature.name == 'aspect').all()
else:
frequency = None
lexeme_id = None
lexical_unit_id = None
lexical_unit_lexeme_id = None
lexical_unit_type_name = None
sense_ids = []
features = []
entry = lxml.SubElement(dictionary, 'entry')
head = lxml.SubElement(entry, 'head')
headword = lxml.SubElement(head, 'headword')
lemma = lxml.SubElement(headword, 'lemma')
lemma.text = headword_text
lexical_unit = lxml.SubElement(head, 'lexicalUnit')
if lexical_unit_id is not None:
lexical_unit.set('id', str(lexical_unit_id))
if lexical_unit_type_name is not None:
lexical_unit_type_name = 'single' if lexical_unit_type_name == 'single_lexeme_unit' else lexical_unit_type_name
lexical_unit.set('type', lexical_unit_type_name)
lexeme = lxml.SubElement(lexical_unit, 'lexeme')
if lexical_unit_lexeme_id is not None:
lexeme.set('lexical_unit_lexeme_id', str(lexical_unit_lexeme_id))
lexeme.text = headword_text
grammar = lxml.SubElement(head, 'grammar')
category = lxml.SubElement(grammar, 'category')
if args.language == 'sl':
category.text = CATEGORY_MAP[category_text] if category_text in CATEGORY_MAP else ''
else:
category.text = category_text
ssj_frequency = None
if len(features) > 0:
grammarFeature = lxml.SubElement(grammar, 'grammarFeature')
ssj_frequency = ssj_frequency_dict[(headword_text, features[0].value)] if (headword_text, features[0].value) in ssj_frequency_dict else None
if args.language == 'sl':
grammarFeature.set('name', 'vid')
grammarFeature.text = ASPECT_MAP[features[0].value]
else:
grammarFeature.set('name', 'aspect')
grammarFeature.text = features[0].value
measureList = lxml.SubElement(head, 'measureList')
if frequency:
measure = lxml.SubElement(measureList, 'measure')
measure.set('type', 'frequency')
measure.set('source', 'Gigafida 2.0')
# measure.set('source', 'ssj500k')
measure.text = str(int(frequency))
if ssj_frequency is not None:
measure = lxml.SubElement(measureList, 'measure')
measure.set('type', 'frequency')
measure.set('source', 'ssj500k 2.2')
measure.text = str(int(ssj_frequency))
body = lxml.SubElement(entry, 'body')
statisticsContainerList = lxml.SubElement(body, 'statisticsContainerList')
# combine semantic_role_stats
semantic_role_stats = {}
for semanticRole_val, semanticRole_stats in semantic_role_stats_ssj.items():
semantic_role_stats[semanticRole_val] = {}
semantic_role_stats[semanticRole_val]['ssj'] = semanticRole_stats
if not ignore_gigafida:
for semanticRole_val, semanticRole_stats in semantic_role_stats_gf.items():
if semanticRole_val not in semantic_role_stats:
semantic_role_stats[semanticRole_val] = {}
semantic_role_stats[semanticRole_val]['gf'] = semanticRole_stats
for semanticRole_val, semanticRole_stats in semantic_role_stats.items():
statisticsContainer = lxml.SubElement(statisticsContainerList, 'statisticsContainer')
semanticRole = lxml.SubElement(statisticsContainer, 'semanticRole')
semanticRole.text = semanticRole_val
measureList = lxml.SubElement(statisticsContainer, 'measureList')
if 'ssj' in semanticRole_stats:
measure_pattern_ssj = lxml.SubElement(measureList, 'measure')
measure_pattern_ssj.set('type', 'valency_pattern_ratio')
measure_pattern_ssj.set('source', 'ssj500k 2.2')
measure_pattern_ssj.text = '%.4f' % (
semantic_role_stats[semanticRole_val]['ssj']['valency_pattern_num'] / pattern_tot_ssj)
measure_sentence_ssj = lxml.SubElement(measureList, 'measure')
measure_sentence_ssj.set('type', 'valency_sentence_ratio')
measure_sentence_ssj.set('source', 'ssj500k 2.2')
if sentence_tot_ssj == 0:
measure_sentence_ssj.text = '%.4f' % (0.0)
# print(headword_text)
# print(semanticRole_val)
# print(semantic_role_stats[semanticRole_val]['gf']['valency_sentence_num'])
else:
measure_sentence_ssj.text = '%.4f' % (
semantic_role_stats[semanticRole_val]['ssj']['valency_sentence_num'] / sentence_tot_ssj)
# measure_sentence_ssj.text = '%.2f' % (
# semantic_role_stats[semanticRole_val]['ssj']['valency_sentence_num'] / sentence_tot_ssj)
if 'gf' in semanticRole_stats and not ignore_gigafida:
measure_pattern_gf = lxml.SubElement(measureList, 'measure')
measure_pattern_gf.set('type', 'valency_pattern_ratio')
measure_pattern_gf.set('source', 'Gigafida 2.0')
measure_pattern_gf.text = '%.4f' % (
semantic_role_stats[semanticRole_val]['gf']['valency_pattern_num'] / pattern_tot_gf)
measure_sentence_gf = lxml.SubElement(measureList, 'measure')
measure_sentence_gf.set('type', 'valency_sentence_ratio')
measure_sentence_gf.set('source', 'Gigafida 2.0')
if sentence_tot_gf == 0:
measure_sentence_gf.text = '%.4f' % (0.0)
# print(headword_text)
# print(semanticRole_val)
# print(semantic_role_stats[semanticRole_val]['gf']['valency_sentence_num'])
else:
measure_sentence_gf.text = '%.4f' % (
semantic_role_stats[semanticRole_val]['gf']['valency_sentence_num'] / sentence_tot_gf)
senseList = lxml.SubElement(body, 'senseList')
# handle cases when headword is not in sloleks
if len(sense_ids) == 0:
sense_ids = [-1]
for sense_id in sense_ids:
if len(sense_ids) > 1 and sense_id.potential_sense:
continue
sense = lxml.SubElement(senseList, 'sense')
if not sense_id == -1 and not sense_id.potential_sense:
sense.set('id', str(sense_id.id))
definitionList = lxml.SubElement(sense, 'definitionList')
if not sense_id == -1:
definition_texts = session.query(Definition.description).filter(
Definition.sense_id == sense_id.id).all()
else:
definition_texts = []
for definition_text in definition_texts:
definition = lxml.SubElement(definitionList, 'definition')
definition.text = definition_text[0]
valencyPatternList = lxml.SubElement(sense, 'valencyPatternList')
valencyPatternList.set('system', 'JOS')
# combine semantic_role_stats ##################################
headword_patterns = {}
for headword_patterns_val, headword_patterns_stats in headword_patterns_ssj.items():
headword_patterns[headword_patterns_val] = {}
headword_patterns[headword_patterns_val]['ssj'] = headword_patterns_stats
if not ignore_gigafida:
for headword_patterns_val, headword_patterns_stats in headword_patterns_gf.items():
if headword_patterns_val not in headword_patterns:
headword_patterns[headword_patterns_val] = {}
headword_patterns[headword_patterns_val]['gf'] = headword_patterns_stats
#################################################################
for headword_pattern, headword_pattern_dict in headword_patterns.items():
valencyPattern = lxml.SubElement(valencyPatternList, 'valencyPattern')
valencyPattern.set('id', str(patterns[headword_pattern]))
measureList_sense = lxml.SubElement(valencyPattern, 'measureList')
if 'ssj' in headword_pattern_dict:
measure_sense = lxml.SubElement(measureList_sense, 'measure')
measure_sense.set('type', 'frequency_all')
measure_sense.set('source', 'ssj500k 2.2')
measure_sense.text = str(headword_pattern_dict['ssj']['sentence_num'])
if not ignore_gigafida and 'gf' in headword_pattern_dict and headword_pattern_dict['gf']['sentence_num']:
measure_sense = lxml.SubElement(measureList_sense, 'measure')
measure_sense.set('type', 'frequency_all')
measure_sense.set('source', 'Gigafida 2.0')
measure_sense.text = str(headword_pattern_dict['gf']['sentence_num'])
semanticRoleContainerList = lxml.SubElement(valencyPattern, 'semanticRoleContainerList')
# patternId = lxml.SubElement(semanticRoles, 'patternId')
# patternId.text = str(patterns[headword_pattern])
if 'ACT' in headword_pattern:
patternTranslationText = 'KDO/KAJ ' + pattern_translation_3_sin
else:
patternTranslationText = headword_text
for semantic_role in headword_pattern:
if semantic_role != 'ACT':
# additional rules
# if semantic_role == 'RESLT':
# pass
# else:
# patternTranslationText += ' ' + translations[semantic_role]
patternTranslationText += ' ' + translations[semantic_role]
semanticRoleContainer = lxml.SubElement(semanticRoleContainerList, 'semanticRoleContainer')
semanticRole = lxml.SubElement(semanticRoleContainer, 'semanticRole')
semanticRole.text = semantic_role
syntactic_structure_dict = {}
if 'ssj' in headword_pattern_dict and semantic_role in headword_pattern_dict['ssj']['sr_data']:
for syn_struct_id, syn_struct_dict in headword_pattern_dict['ssj']['sr_data'][semantic_role].items():
if syn_struct_id not in syntactic_structure_dict:
syntactic_structure_dict[syn_struct_id] = {}
for com_num, com_set in syn_struct_dict.items():
if com_num not in syntactic_structure_dict[syn_struct_id]:
syntactic_structure_dict[syn_struct_id][com_num] = set()
for lex in com_set:
syntactic_structure_dict[syn_struct_id][com_num].add(lex)
if 'gf' in headword_pattern_dict and semantic_role in headword_pattern_dict['gf']['sr_data']:
for syn_struct_id, syn_struct_dict in headword_pattern_dict['gf']['sr_data'][semantic_role].items():
if syn_struct_id not in syntactic_structure_dict:
syntactic_structure_dict[syn_struct_id] = {}
for com_num, com_set in syn_struct_dict.items():
if com_num not in syntactic_structure_dict[syn_struct_id]:
syntactic_structure_dict[syn_struct_id][com_num] = set()
for lex in com_set:
syntactic_structure_dict[syn_struct_id][com_num].add(lex)
if len(syntactic_structure_dict) > 0:
syntacticStructureList = lxml.SubElement(semanticRoleContainer, 'syntacticStructureList')
# iterate over syntactic structures and write them
for syn_struct_id, component_dict in syntactic_structure_dict.items():
syntacticStructure = lxml.SubElement(syntacticStructureList, 'syntacticStructure')
syntacticStructure.set('id', syn_struct_id)
dedup_dict = {}
for comp_id, lexemes in component_dict.items():
for l in lexemes:
if l in preposition_list:
prep_id = preposition_list[l]
else:
query_preposition = session.query(Lexeme.id) \
.join(Category, Category.id == Lexeme.category_id) \
.join(LexemeFeature, LexemeFeature.lexeme_id == Lexeme.id) \
.join(Feature, Feature.id == LexemeFeature.feature_id) \
.filter(Lexeme.lemma == l[2]) \
.filter(Feature.name == 'case') \
.filter(LexemeFeature.value == CASE_MAP[l[1]]) \
.group_by(Lexeme.id)
preposition_ids = query_preposition.all()
if len(preposition_ids) != 1:
prep_id = ''
else:
prep_id = str(preposition_ids[0][0])
preposition_list[l] = prep_id
if comp_id in dedup_dict and prep_id in dedup_dict[comp_id] and l[2] in dedup_dict[comp_id][prep_id]:
continue
dedup_dict.setdefault(comp_id, {})[prep_id] = l[2]
component = lxml.SubElement(syntacticStructure, 'component')
component.set('num', comp_id)
lexem = lxml.SubElement(component, 'lexeme')
lexem.set('sloleks', prep_id)
lexem.text = l[2]
patternRepresentation = lxml.SubElement(valencyPattern, 'patternRepresentation')
patternRepresentation.text = patternTranslationText
exampleContainerList = lxml.SubElement(valencyPattern, 'exampleContainerList')
if 'ssj' in headword_pattern_dict:
for sentence_example in headword_pattern_dict['ssj']['sentence_examples']:
exampleContainer = lxml.SubElement(exampleContainerList, 'exampleContainer')
# corpusExample = lxml.SubElement(exampleContainer, 'corpusExample')
exampleContainer.append(copy.deepcopy(sentence_example))
if 'gf' in headword_pattern_dict:
for sentence_example in headword_pattern_dict['gf']['sentence_examples']:
exampleContainer = lxml.SubElement(exampleContainerList, 'exampleContainer')
# corpusExample = lxml.SubElement(exampleContainer, 'corpusExample')
exampleContainer.append(copy.deepcopy(sentence_example))
with lxml.xmlfile(os.path.join(args.outdir, 'VS10_' + headword_text + '_' + corpus_name + '.xml'),
encoding='utf-8') as xf:
xf.write(dictionary, pretty_print=True)
pbar.update(1)
def init_db(db):
global Lexeme, LexemeFeature, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation, FormEncoding
[db_user, db_password, db_database, db_host] = db.split(':')
Base = declarative_base()
engine = create_engine('postgresql://' + db_user + ':' + db_password + '@' + db_host + '/' + db_database,
pool_recycle=14400)
Base.metadata.reflect(engine)
class Lexeme(Base):
__table__ = Base.metadata.tables['jedro_lexeme']
class LexemeFeature(Base):
__table__ = Base.metadata.tables['jedro_lexeme_feature']
class Feature(Base):
__table__ = Base.metadata.tables['jedro_feature']
class LexicalUnitLexeme(Base):
__table__ = Base.metadata.tables['jedro_lexicalunit_lexeme']
class LexicalUnit(Base):
__table__ = Base.metadata.tables['jedro_lexicalunit']
class LexicalUnitType(Base):
__table__ = Base.metadata.tables['jedro_lexicalunittype']
class Category(Base):
__table__ = Base.metadata.tables['jedro_category']
class Sense(Base):
__table__ = Base.metadata.tables['jedro_sense']
class Measure(Base):
__table__ = Base.metadata.tables['jedro_measure']
class LexicalUnitMeasure(Base):
__table__ = Base.metadata.tables['jedro_lexicalunitmeasure']
class Corpus(Base):
__table__ = Base.metadata.tables['jedro_corpus']
class Definition(Base):
__table__ = Base.metadata.tables['jedro_definition']
class WordForm(Base):
__table__ = Base.metadata.tables['jedro_wordform']
class WordFormFeature(Base):
__table__ = Base.metadata.tables['jedro_wordform_feature']
class FormRepresentation(Base):
__table__ = Base.metadata.tables['jedro_formrepresentation']
class FormEncoding(Base):
__table__ = Base.metadata.tables['jedro_formencoding']
return engine
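# init_db expects the Sloleks credentials packed into a single colon-separated string in
# the order user:password:database:host, e.g. (placeholder values):
#
#   engine = init_db('sloleks_user:sloleks_pass:sloleks_db:127.0.0.1')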
def match_file(words, structures):
    matches = []
    for s in structures:
        for w in words:
            mhere = s.match(w)
            for match in mhere:
                # save only those with verbs in them
                if not [True for m in match.values() if m.msd[0] == 'V']:
                    continue
                collocation_id = [(idx, w.lemma) for idx, w in match.items()]
                collocation_id = [s.id] + list(sorted(collocation_id, key=lambda x: x[0]))
                collocation_id = tuple(collocation_id)
                matches.append([match, collocation_id])
    return matches
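# Each match is stored together with a hashable collocation id of the form
# (structure_id, (component_idx, lemma), ...), e.g. (illustrative values):
#
#   (structure_id, (component_idx, 'brati'), (component_idx, 'knjiga'))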
possible_jos_links = {'dol', 'del', 'prir', 'vez', 'skup', 'ena', 'dve', 'tri', 'štiri', 'modra'}
def find_word_sons(word, deppar_dict, word_id, role, parents):
if word.id in parents:
return False
for k, v in word.links.items():
for w in v:
# if k in possible_jos_links and w.id == 'ssj1.1.1.t21':
# print('here')
if k in possible_jos_links:
if w.id not in deppar_dict:
deppar_dict[w.id] = {}
deppar_dict[w.id][word_id] = role
if not find_word_sons(w, deppar_dict, word_id, role, parents + [word.id]):
return False
# elif k in possible_jos_links:
# raise Exception('One word in multiple dependency parsetrees')
return True
# generator of (index, word) pairs that skips single-character punctuation tokens
def idi_word_generator(sentence):
    idi = 0
    for word in sentence:
        if len(word.text) == 1 and re.match(r'^[\w]+$', word.text) is None:
            continue
        yield idi, word
        idi += 1
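# Example of the punctuation-skipping indexing above (tokens shown by their text only):
#
#   ['Janez', ',', 'bere', '!']  ->  yields (0, <Janez>), (1, <bere>)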
def extract_sentences(w_collection, w_a_collection, args, input_corpus, input_corpus_orig):
structures, _, max_num_components = build_structures(args)
timeinfo = TimeInfo(len(input_corpus))
database = Database(args)
formatted_sentences = {}
start_time = time.time()
sentences_num_limit = 15000
sentences_in_ram = 0
sentence_glue_numbers = None
is_gf = input_corpus_orig is not None
if is_gf:
glue_words_gen = file_sentence_glue_generator(input_corpus_orig, args.pc_tag, w_collection)
for sent_id, sentence, othr_sentence_attributes in load_files(args, database, w_collection, input_corpus):
if is_gf:
# create tuple for comparison with sentence_flue_words
sent_id_numbers = tuple([int(sid) for sid in sent_id[2:].split('.')])
if sentence_glue_numbers is not None and sentence_glue_numbers > sent_id_numbers:
logging.warning(
f"Skipping sentence in annotated sentence id (sent_id)! Annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}")
continue
sentence_glue = next(glue_words_gen)
sentence_glue_numbers = tuple([int(sid) for sid in sentence_glue[0][2:].split('.')])
while sentence_glue_numbers < sent_id_numbers:
logging.warning(
f"Skipping sentence in original sentence id (sentence_glue)! Annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}")
sentence_glue = next(glue_words_gen)
sentence_glue_numbers = tuple([int(sid) for sid in sentence_glue[0][2:].split('.')])
# has to be here for when next sentence_glue is selected in while loop
if sentence_glue_numbers is not None and sentence_glue_numbers > sent_id_numbers:
logging.warning(
f"Skipping sentence in annotated sentence id (sent_id)! Annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}")
continue
if sent_id != sentence_glue[0]:
raise Exception(f"Annotated gigafida and original gigafida not in sync (annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}")
if len(sentence_glue[1]) != len(sentence):
logging.warning(f"Skipping sentence! Annotated gigafida and original gigafida size is not the same (annotated: {len(sentence)}, original: {len(sentence_glue[1])}")
continue
for w, w_glue in zip(sentence, sentence_glue[1]):
w.glue = w_glue[2]
if sentence is None:
timeinfo.add_measurement(-1)
continue
# start_time = time.time()
# print(time.time() - start_time)
matches = match_file(sentence, structures)
# if sent_id == 'ssj134.880.3375':
# print('here')
# print(time.time() - start_time)
# match_store.add_matches(matches)
# word_stats.add_words(words)
# database.commit()
# find unimportant collocations
# extract_possible_headwords = set(v[0] for v in othr_sentence_attributes.values())
for match in matches:
match_idis = []
for key, word in match[0].items():
match_idis.append(word.idi)
match.append(match_idis)
collocations = {}
for match in matches:
for key, word in match[0].items():
# if word.id == ''
if word.id not in collocations:
collocations[word.id] = []
collocations[word.id].append((match[1][0], key, word.msd[:2], match[2]))
# print(time.time() - start_time)
formatted_sentence = []
deppar_dict = {}
# idi = 0
incorrect_sentence = False
# create output and form dependency parsetree sons
for idi, word in idi_word_generator(sentence):
# if word.text == 'Mumel':
# print('here')
# if word.text == 'Poleg':
# print('here')
# if word.text == 'Luka':
# print('here')
idi = str(idi)
# a = sent_id in sentences_of_interest
# b = (word.lemma, word.msd) in sentences_of_interest[sent_id]
# if word.msd == 'X':
# continue
# if len(word.text) == 1 and word.text in string.punctuation + '':
# a = re.match('^[\w]+$', word.text) is not None
# if len(word.text) == 1 and re.match('^[\w]+$', word.text) is None:
# continue
# if sent_id in sentences_of_interest and (word.lemma, word.msd) in sentences_of_interest[sent_id]:
# if sent_id in sentences_of_interest and idi in sentences_of_interest[sent_id]:
# cur_count = w_collection.count_documents({'_id': sent_id})
# if w_collection.count_documents({'_id': sent_id}) > 0:
sentence_of_interest = othr_sentence_attributes
# is_count = cur.count() > 0
if idi in othr_sentence_attributes:
if word.id not in deppar_dict:
deppar_dict[word.id] = {}
deppar_dict[word.id][sentence_of_interest[idi][0]] = sentence_of_interest[idi][1]
# deppar_dict[word.id] = {idi: sentences_of_interest[sent_id][idi]}
# if idi != sentences_of_interest[sent_id][(word.lemma, word.msd)][1]:
# if (word.lemma, word.msd) != sentences_of_interest[sent_id][idi][1]:
# print((word.lemma, word.msd))
# print(sentences_of_interest[sent_id][idi][1])
# if sentences_of_interest[sent_id][(word.lemma, word.msd)][1] > idi:
# print('HERE')
if not find_word_sons(word, deppar_dict, sentence_of_interest[idi][0], sentence_of_interest[idi][1], []):
incorrect_sentence = True
# idi += 1
if incorrect_sentence:
logging.warning(
f"Sentence {sent_id} contains srl connections that loop!")
continue
# print(time.time() - start_time)
for word in sentence:
if word.id in collocations:
col = collocations[word.id]
else:
col = []
if word.id in deppar_dict:
dp = deppar_dict[word.id]
else:
dp = {}
formatted_sentence.append(((word.text, word.glue), col, dp, word.lemma))
# create_sentence_output(formatted_sentence, 4)
formatted_sentences[sent_id] = formatted_sentence
if sentences_in_ram >= sentences_num_limit:
sentences_in_ram = 0
requests = [UpdateOne({'_id': k}, {'$set': {'words': v}}, upsert=True) for k, v in formatted_sentences.items()]
result = w_a_collection.bulk_write(requests)
formatted_sentences = {}
sentences_in_ram += 1
# print(time.time() - start_time)
requests = [UpdateOne({'_id': k}, {'$set': {'words': v}}, upsert=True) for k, v in formatted_sentences.items()]
if len(requests) > 0:
result = w_a_collection.bulk_write(requests)
# force a bit of garbage collection
# del sentence
# del sent_id
# del matches
# gc.collect()
print(time.time() - start_time)
# return formatted_sentences
# # timeinfo.add_measurement(time.time() - start_time)
# # timeinfo.info()
# # if no output files, just exit
# if all([x == None for x in [args.out, args.out_no_stat, args.all, args.stats]]):
# return
#
# # get word renders for lemma/msd
# word_stats.generate_renders()
# match_store.determine_colocation_dispersions()
#
# # figure out representations!
# if args.out or args.out_no_stat:
# match_store.set_representations(word_stats, structures)
#
# Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
# structures, match_store)
# Writer.make_output_no_stat_writer(args, max_num_components, match_store, word_stats).write_out(
# structures, match_store)
# Writer.make_all_writer(args, max_num_components, match_store, word_stats).write_out(
# structures, match_store)
# Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out(
# structures, match_store)
def get_headword_category(collection):
"""
Returns
:return:
List of tuples with all headwords in mongodb and their categories.
"""
headwords = sorted(collection.distinct("headwords")[1:])
if args.headwords:
with open(args.headwords, 'w') as f:
for item in headwords:
f.write("%s\n" % item)
headword_category = [(headword, 'verb') if headword[-1] != '_' else (headword, 'adjective') for headword in
headwords]
return headword_category
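# Headword naming convention used above: a trailing underscore marks an adjective entry,
# every other headword is treated as a verb, e.g. (illustrative entries):
#
#   brati          -> ('brati', 'verb')
#   zadovoljen_    -> ('zadovoljen_', 'adjective')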
def read_ssj500k_frequencies(path):
with open(path, 'r') as f:
reader = csv.reader(f, delimiter='\t')
next(reader)
for line in reader:
ssj_frequency_dict[(line[1], line[-1])] = line[2]
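# Assumed layout of the ssj500k frequency file: tab-separated with a header row, where
# column 1 holds the headword, column 2 the frequency and the last column the aspect
# value (column meanings are inferred from how the values are used, not from a spec):
#
#   ...<TAB>biti<TAB>123456<TAB>...<TAB>biaspectual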
def main(args):
# with Path('data/wordlist.json').open("r") as fp:
# sskj_wordlist = json.load(fp)
# # wordlist = set(sskj_wordlist['wordlist'])
# wordlist = set(sskj_wordlist['wordlist'])
print('beginning chunk')
start_time = time.time()
# user:user:valdb:127.0.0.1
[db_user, db_password, db_database, db_host] = args.mongo_db.split(':')
    mongo = MongoClient(db_host, username=db_user, password=db_password, authSource=db_database)
db = mongo.valdb
collection_ssj = db['ssj']
collection_gigafida = db['gigafida']
db2 = mongo.extvaldb
# write collection
w_collection_ssj = db2['ssj']
w_collection_gigafida = db2['gigafida']
w_a_collection_ssj = db2['ssj' + '_all']
w_a_collection_gigafida = db2['gigafida' + '_all']
status_collection = db2['status']
valency_pattern_id_collection = db2['valency_pattern_ids']
RF = reduce_functions["reduce_0"]["f"]
# get all headwords from database
# headword_category = get_headword_category(collection_ssj)
with open(args.headwords, 'r') as read:
headword_category = [(line[:-1], 'verb') for line in read.readlines()]
assert args.language == 'en' or args.language == 'sl'
shutil.rmtree(args.outdir, True)
os.mkdir(args.outdir)
engine = init_db(args.sloleks_db)
# input_file = codecs.open(args.infile, 'r')
# # input_file = []
# next(input_file)
# category_map = {'samostalnik':'noun', 'glagol':'verb', 'pridevnik':'adjective', 'prislov':'adverb', 'števnik':'numeral', 'zaimek':'pronoun', 'medmet':'interjection', 'veznik':'conjunction'}
session = Session(engine)
# cur = collection.find({})
#
# a = []
# cur_len = 0
# # num_empty_sent = 0
# for ent in cur:
# cur_len += 1
# # s = frames_from_db_entry(ent)
# # if not s:
# # num_empty_sent += 1
# a += frames_from_db_entry(ent)
print(time.time() - start_time)
# print(num_empty_sent)
print('get_sentences_of_interest')
start_time = time.time()
# sentences_of_interest = get_sentences_of_interest(headword_category, collection, w_collection, RF, mongo)
# sentences_of_interest_stored = args.p1_processed
if not args.p1_processed:
with tqdm(total=len(headword_category)) as pbar:
get_sentences_of_interest(headword_category, collection_ssj, w_collection_ssj, RF, mongo, pbar, status_collection, 'ssj')
if not args.ignore_gigafida:
with tqdm(total=len(headword_category)) as pbar:
get_sentences_of_interest(headword_category, collection_gigafida, w_collection_gigafida, RF, mongo, pbar, status_collection, 'gigafida')
# sentences_of_interest = OrderedDict(sorted(sentences_of_interest.items()))
print(time.time() - start_time)
# num_sentences = 0
# for el in all_sentences:
# if el not in sentences_of_interest:
# num_sentences += 1
#
# print(num_sentences)
# print(len(all_sentences))
print('extract_sentences')
start_time = time.time()
# formatted_sentences_stored = args.p2_processed
if not args.p2_processed:
gf_anno_paths = list(os.walk(args.input_gigafida_annotated))
gf_anno_paths = [os.path.join(p_t[0], f_n) for p_t in gf_anno_paths for f_n in p_t[2]]
gf_orig_paths = list(os.walk(args.input_gigafida_original))
gf_orig_paths = sorted([os.path.join(p_t[0], f_n) for p_t in gf_orig_paths for f_n in p_t[2] if f_n[:2] == 'GF'])
extract_sentences(w_collection_ssj, w_a_collection_ssj, args, args.input_sloleks, None)
if not args.ignore_gigafida:
extract_sentences(w_collection_gigafida, w_a_collection_gigafida, args, gf_anno_paths, gf_orig_paths)
print(time.time() - start_time)
print('write_xml')
start_time = time.time()
if args.ssj500k_frequencies is not None:
read_ssj500k_frequencies(args.ssj500k_frequencies)
with tqdm(total=len(headword_category)) as pbar:
write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, args.corpus_name, args.pattern_examples_limit, args.ignore_gigafida, pbar)
print(time.time() - start_time)
# input_file.close()
session.close()
if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser(description='Create valency lexicon XML entries from the valency database, Sloleks and corpus data.')
arg_parser.add_argument('--sloleks_db', type=str, help='Database credentials')
arg_parser.add_argument('--mongo_db', type=str, help='Database credentials')
arg_parser.add_argument('--schema', type=str, help='XML schema')
arg_parser.add_argument('--infile', type=str, help='Input file')
arg_parser.add_argument('--outdir', type=str, help='Output directory')
    arg_parser.add_argument('--headwords', type=str, default=None, help='Path to the file with headwords (one per line); also used to save extracted headwords.')
    arg_parser.add_argument('--language', type=str, help="Language of localized output attributes ('sl' or 'en').")
    arg_parser.add_argument('--corpus_name', type=str, help='Corpus name to be written in the output files.')
    arg_parser.add_argument('--pattern_examples_limit', type=int, default=10, help='Maximum number of sentence examples per valency pattern.')
    arg_parser.add_argument('--ignore_gigafida', action='store_true', help='If set, ignore Gigafida data in the output.')
arg_parser.add_argument('--p1_processed',
help='Skip first part (obtaining sentences of interest) when they are already in DB.',
action='store_true')
arg_parser.add_argument('--p2_processed',
help='Skip second part (obtaining formatted sentences) when they are already in DB.',
action='store_true')
arg_parser.add_argument('--structures',
help='Structures definitions in xml file')
    arg_parser.add_argument('--input_sloleks',
                            help='Input file(s) (gz or xml). If omitted, only the database is loaded.', nargs='*')
    arg_parser.add_argument('--input_gigafida_annotated',
                            help='Input file (gz or xml). If omitted, only the database is loaded.')
    arg_parser.add_argument('--input_gigafida_original',
                            help='Input file (gz or xml). If omitted, only the database is loaded.')
arg_parser.add_argument('--out',
help='Classic output file')
arg_parser.add_argument('--out-no-stat',
help='Output file, but without statistical columns')
arg_parser.add_argument('--all',
help='Additional output file, writes more data')
arg_parser.add_argument('--stats',
help='Output file for statistics')
arg_parser.add_argument('--no-msd-translate',
                            help='MSDs are translated from Slovene to English by default',
action='store_true')
arg_parser.add_argument('--skip-id-check',
help='Skips checks for ids of <w> and <pc>, if they are in correct format',
action='store_true')
arg_parser.add_argument('--min_freq', help='Minimal frequency in output',
type=int, default=0, const=1, nargs='?')
arg_parser.add_argument('--verbose', help='Enable verbose output to stderr',
choices=["warning", "info", "debug"], default="info",
const="info", nargs='?')
arg_parser.add_argument('--count-files',
help="Count files: more verbose output", action='store_true')
arg_parser.add_argument('--multiple-output',
help='Generate one output for each syntactic structure',
action='store_true')
    arg_parser.add_argument('--sort-by',
                            help="Sort by this column (index)", type=int, default=-1)
    arg_parser.add_argument('--sort-reversed',
                            help="Sort in reversed order", action='store_true')
arg_parser.add_argument('--db',
help="Database file to use (instead of memory)", default=None)
arg_parser.add_argument('--new-db',
help="Writes over database file, if there exists one", action='store_true')
arg_parser.add_argument('--pc-tag',
help='Tag for separators, usually pc or c', default="pc")
    arg_parser.add_argument('--ssj500k-frequencies',
                            help='Path to a tab-separated file with ssj500k headword frequencies', default=None)
args = arg_parser.parse_args()
logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())
start = time.time()
main(args)
logging.info("TIME: {}".format(time.time() - start))