cjvt-valency/scripts/create_xml.py


#!/usr/bin/python3
# make database-service
import gc
import re
import string
from collections import OrderedDict
import sys
from tqdm import tqdm
import pymongo
# import tqdm as tqdm
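# NOTE: developer-specific absolute paths; adjust these to your local checkouts of
# cjvt_valency and cjvt-corpusparser before running.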
sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/valency')
sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/cjvt-corpusparser')
from valency.Frame import frames_from_db_entry
from valency.reduce_functions import reduce_functions
import argparse
import os
import shutil
import lxml.etree as lxml
import codecs
import logging
import pickle
import time
from io import StringIO
from lxml import etree
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import Session, aliased
from sqlalchemy import create_engine
from sqlalchemy import func
from pymongo import MongoClient, UpdateOne, InsertOne
# examples_num = sys.maxsize
# corpus = 'ssj'
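# Semantic-role (functor) labels mapped to the Slovene phrases used in pattern representations.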
translations = {
'ACT': 'KDO/KAJ',
'PAT': 'KOGA/KAJ',
'RESLT': 'REZULTAT',
'REC': 'KOMU/ČEMU',
'TIME': 'KDAJ',
'MANN': 'KAKO',
'LOC': 'KJE',
'MEANS': 'S ČIM',
'GOAL': 'ČEMU',
'REG': 'GLEDE NA KOGA/KAJ',
'DUR': 'KOLIKO ČASA',
'CAUSE': 'ZAKAJ',
'COND': 'POD KATERIM POGOJEM',
'ORIG': 'IZVOR',
'FREQ': 'KOLIKOKRAT',
'SOURCE': 'OD KOD',
'AIM': 'S KAKŠNIM NAMENOM',
'QUANT': 'ŠTEVILO',
'EVENT': 'NA DOGODKU',
'CONTR': 'KLJUB ČEMU',
'ACMP': 'S KOM/ČIM',
'RESTR': 'Z OMEJITVIJO',
'MWPRED': '',
'MODAL': '',
'PHRAS': ''
}
CATEGORY_MAP = {
'noun': 'samostalnik',
'verb': 'glagol',
'adjective': 'pridevnik',
'adverb': 'prislov',
'pronoun': 'zaimek',
'numeral': 'števnik',
'preposition': 'predlog',
'conjunction': 'veznik',
'particle': 'členek',
'interjection': 'medmet',
'abbreviation': 'okrajšava',
'residual': 'neuvrščeno'
}
ASPECT_MAP = {
'perfective': 'dovršni',
'progressive': 'nedovršni',
'biaspectual': 'dvovidski'
}
CASE_MAP = {
'n': 'nominative',
'g': 'genitive',
'd': 'dative',
'a': 'accusative',
'l': 'locative',
'i': 'instrumental'
}
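# SQLAlchemy ORM classes for the Sloleks database; placeholders here, bound to the
# reflected tables in init_db().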
Lexeme = None
LexemeFeature = None
SyntacticStructure = None
StructureComponent = None
Feature = None
LexicalUnitLexeme = None
LexicalUnit = None
LexicalUnitType = None
Category = None
Sense = None
Measure = None
LexicalUnitMeasure = None
Corpus = None
Definition = None
WordForm = None
WordFormFeature = None
FormRepresentation = None
# corpus = 'gigafida'
from pathlib import Path
import json
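# Generator: fetches all DB entries containing the given headword, builds valency frames,
# reduces them with RF (a reduce function from valency.reduce_functions) and yields each
# resulting frame as JSON.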
def hws_generator(collection, headword_text, RF, mongo):
cur = collection.find({"headwords": headword_text})
# print('tu2!')
frames = []
for ent in cur:
frames += frames_from_db_entry(ent) # pre-process this step for prod TODO
cur.close()
# print('tu3!')
# filter by relevant hw
frames = [x for x in frames if x.hw == headword_text]
# if headword_text == 'brati':
# print('here')
# if headword_text == 'prevajati':
# print('here')
ret_frames = RF(frames, mongo.db.sensemap)
# print('tu4!')
for frame in ret_frames:
frame_json = frame.to_json()
yield frame_json
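# For each (headword, category) pair, collects the sentences its valency frames occur in and
# stores, per sentence id, a mapping token-index -> (headword token index, functor).
# The mappings are bulk-upserted into w_collection in chunks (sentences_num_limit) to bound RAM usage.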
def get_sentences_of_interest(headword_category, collection, w_collection, RF, mongo, pbar):
sentences_of_interest = {}
# all_sentences = set()
headword_category = sorted(headword_category, key=lambda x: x[0])
# num_sentences in RAM at once
sentences_num_limit = 10000
sentences_in_ram = 0
part = 0
start_time = time.time()
# first_sentence = True
# section_included = False
# last_processed_hw = 'pomeniti'
# last_processed_hw = 'iti'
# last_processed_hw = 'aktivirati'
last_processed_hw = 'aktivirati'
already_processed = False
for headword_id, (headword_text, category_text) in enumerate(headword_category):
# print(headword_text)
if already_processed:
if headword_text != last_processed_hw:
continue
else:
already_processed = False
# for headword_text, category_text in headword_category[15:20]:
# headword_text = 'zadovoljen'
# category_text = 'adjective'
headword_patterns_ids = {}
# print('tu1!')
cur = collection.find({"headwords": headword_text})
# print('tu2!')
frames = []
for ent in cur:
frames += frames_from_db_entry(ent) # pre-process this step for prod TODO
cur.close()
# print('tu3!')
# filter by relevant hw
frames = [x for x in frames if x.hw == headword_text]
# if headword_text == 'brati':
# print('here')
# if headword_text == 'prevajati':
# print('here')
ret_frames = RF(frames, mongo.db.sensemap)
json_ret = {"frames": []}
# print('tu4!')
for frame in ret_frames:
frame_json = frame.to_json()
json_ret["frames"].append(frame_json)
# print('tu5!')
# get xml values
for hws in json_ret.values():
for hw in hws:
# print(hw['hw'])
# if hw['hw'] == 'pomeniti':
# print('aaa')
# generate valency pattern key
valency_pattern_key = []
functors = {}
if len(hw['tids']) != 1:
raise Exception('Multiple TIDS')
for slot in hw['slots']:
valency_pattern_key.append(slot['functor'])
for tid in slot['tids']:
if tid not in functors:
functors[tid] = {}
functors[tid] = slot['functor']
valency_pattern_key = tuple(sorted(valency_pattern_key))
if valency_pattern_key not in headword_patterns_ids:
headword_patterns_ids[valency_pattern_key] = []
for sentence in hw['sentences']:
# all_sentences.add(sentence[0][0])
# if len(headword_patterns_ids[valency_pattern_key]) < examples_num:
# if section_included:
# if not sentences_in_ram > sentences_num_limit:
# sentences_in_ram += 1
# continue
# else:
# first_sentence = True
sentence_id = sentence[0][0].rsplit('.', 1)[0]
# print(sentence_id)
if sentence_id not in sentences_of_interest:
sentences_of_interest[sentence_id] = {}
idi = 0
parent_idi = -1
# print('t1')
for idx, word in sentence:
if idx == hw['tids'][0]:
parent_idi = idi
if word['word']:
idi += 1
# print('t2')
if parent_idi == -1:
raise Exception('No parent found!')
idi = 0
# if len(sentence) > 500:
# print(len(sentence))
for idx, word in sentence:
if idx in functors:
# sentences_of_interest[sentence_id][(word['lemma'], MSD_TRANSLATE[word['msd']])] = functors[idx]
# sentences_of_interest[sentence_id][(word['lemma'], MSD_TRANSLATE[word['msd']])] = (functors[idx], idi)
# sentences_of_interest[sentence_id][idi] = (functors[idx], (word['lemma'], MSD_TRANSLATE[word['msd']]))
sentences_of_interest[sentence_id][str(idi)] = (str(parent_idi), functors[idx])
if word['word']:
# if sentence_id == 'ssj37.216.892':
# print(idi)
# print(word['text'])
idi += 1
# print('t3')
headword_patterns_ids[valency_pattern_key].append(sentence_id)
# check if this is first sentence
# if first_sentence:
# one_element = next(iter(sentences_of_interest.items()))
# section_included = w_collection.count_documents({'_id': one_element[0],
# list(one_element[1].keys())[0]: list(one_element[1].values())[0]}) == 1
# first_sentence = False
if sentences_in_ram >= sentences_num_limit:
# print('print1:')
# print(time.time() - start_time)
start_time = time.time()
# !!!!!!!!!!!!!!!!!!!!!!print('Part %d finalized')
# print('Sentences in ram:')
# print(sentences_in_ram)
sentences_in_ram = 0
# [InsertOne({'y': 1}), DeleteOne({'x': 1}),
# ... ReplaceOne({'w': 1}, {'z': 1}, upsert=True)]
# requests = [{'_id': k, 'connections': v} for k, v in sentences_of_interest.items()]
# if 'GF0010453.1116.1' in sentences_of_interest:
# print('here')
requests = [UpdateOne({'_id': k}, {'$set': v}, upsert=True) for k, v in sentences_of_interest.items()]
# print('print2:')
# print(time.time() - start_time)
# start_time = time.time()
result = w_collection.bulk_write(requests)
# print('print3:')
# print(time.time() - start_time)
# start_time = time.time()
del requests
del sentences_of_interest
gc.collect()
# print('print4:')
# print(time.time() - start_time)
# start_time = time.time()
# print(part)
# print('HEADWORD')
# print(headword_text)
# pbar.update(1)
part += 1
#
# w_collection.bulk_write(
# array.map((val) = >
# ({
# updateOne: {
# filter: {_id: val, uniqueid: 1001, atype: 1, ftype: 6},
# update: {
# $set: {epoch: 1548484978658, actionbyuserid: 110, title: 'Good Morning To All'}},
# upsert: true
# }
# })
# )
# })
# sentences_of_interest = {{'_id': k, 'connections': v} for k, v in sentences_of_interest.items()}
# w_collection.update_many({'_id': {'$exists': False}}, sentences_of_interest, upsert=True)
# try:
# w_collection.insert_many(sentences_of_interest, ordered=False)
# except pymongo.errors.BulkWriteError as e:
# print(e.details['writeErrors'])
sentences_of_interest = {}
# first_sentence = True
sentences_in_ram += 1
pbar.update(1)
# TODO uncomment
# if 'GF0010453.1116.1' in sentences_of_interest:
# a = sentences_of_interest['GF0010453.1116.1']
# print('here')
requests = [UpdateOne({'_id': k}, {'$set': v}, upsert=True) for k, v in sentences_of_interest.items()]
if len(requests) > 0:
result = w_collection.bulk_write(requests)
# sentences_of_interest = [{'_id': k, 'connections': v} for k, v in sentences_of_interest.items()]
# try:
# w_collection.insert_many(sentences_of_interest, ordered=False)
# except pymongo.errors.BulkWriteError as e:
# print(e.details['writeErrors'])
# sentences_of_interest = {}
# # else:
# # print('aaa')
# return sentences_of_interest
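# Builds a <corpusExample> XML element for one sentence. Tokens that belong to the headword's
# dependency subtree are wrapped in <tree role="..."> elements, the headword and collocation
# components become <comp> elements, and the original inter-token spacing ("glue") is kept
# in the element text/tail attributes.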
def create_sentence_output(sentence, headword_id, corpus):
glue_outside = False
headword_id = str(headword_id)
parent_node = etree.Element('corpusExample')
parent_node.set('corpusName', corpus)
# parent_node.text = 'AAA'
# parent_node.prefix = 'BBB'
# parent_node.tail = 'CCC'
cur_node = parent_node
# formatted_sentence = ''
first_in_tag = True
first_outside_tag = False
in_dependency_tree = False
# TODO use whole sentence!
# for idi, word in enumerate(sentence):
# def idi_word_generator(sentence):
# idi = 0
# for word in sentence:
# if len(word.text) == 1 and re.match('^[\w]+$', word.text) is None:
# continue
# yield idi, word
# idi += 1
idi = 0
attach_to = None
p_cur_node = None
p_attach_to = None
p_glue_attach_to = None
previous_word = None
# if sentence[0][0][0] == 'Tako':
# print('here')
# for idi, word in idi_word_generator(sentence):
for word_id in range(len(sentence)):
# is_ending_tree = False
# SRL container output
word = sentence[word_id]
# sentence output
if in_dependency_tree:
if headword_id not in word[2] or in_dependency_tree != word[2][headword_id]:
attach_to = cur_node
# is_ending_tree = True
p_glue_attach_to = cur_node
cur_node = parent_node
if not first_in_tag:
# formatted_sentence += '\n'
first_in_tag = True
# formatted_sentence += '</tree>'
in_dependency_tree = False
first_outside_tag = True
if headword_id in word[2] and not in_dependency_tree:
dep_tree = lxml.SubElement(cur_node, 'tree')
dep_tree.set('role', word[2][headword_id])
cur_node = dep_tree
if not first_in_tag:
# formatted_sentence += '\n'
first_in_tag = True
# formatted_sentence += '<tree role="{}">'.format(word[2][headword_id])
in_dependency_tree = word[2][headword_id]
attach_to = None
if p_glue_attach_to is not None:
glue_outside = True
if headword_id == str(idi) and not (len(word[0][0]) == 1 and re.match(r'^[\w]+$', word[0][0]) is None):
# if headword_id == idi:
comp = lxml.SubElement(cur_node, 'comp')
comp.set('role', 'headword')
if not first_outside_tag:
if p_attach_to is None:
if p_cur_node is not None:
p_cur_node.text += previous_word[0][1]
else:
p_attach_to.tail += previous_word[0][1]
elif p_glue_attach_to is not None:
if p_glue_attach_to.tail is None:
p_glue_attach_to.tail = previous_word[0][1]
else:
p_glue_attach_to.tail += previous_word[0][1]
# elif p_attach_to is not None:
# if p_attach_to.tail is None:
# p_attach_to.tail = previous_word[0][1]
# else:
# p_attach_to.tail += previous_word[0][1]
word_text = word[0][0]
comp.text = word_text
attach_to = comp
if not first_in_tag:
# formatted_sentence += '\n'
first_in_tag = True
first_outside_tag = True
p_cur_node = cur_node
p_glue_attach_to = comp
p_attach_to = attach_to
previous_word = word
# formatted_sentence += '<comp structure_id="headword">{}</comp>'.format(word[0][0])
idi += 1
continue
if word[1] and in_dependency_tree:
col_id = -1
for i, col in enumerate(word[1]):
if headword_id in col[3]:
col_id = i
break
if col_id != -1:
comp = lxml.SubElement(cur_node, 'comp')
comp.set('structure_id', word[1][col_id][0])
comp.set('num', word[1][col_id][1])
if not first_outside_tag:
if p_attach_to is None:
if p_cur_node is not None:
p_cur_node.text += previous_word[0][1]
else:
p_attach_to.tail += previous_word[0][1]
elif p_glue_attach_to is not None:
if p_glue_attach_to.tail is None:
p_glue_attach_to.tail = previous_word[0][1]
else:
p_glue_attach_to.tail += previous_word[0][1]
# elif p_attach_to is not None:
# if p_attach_to.tail is None:
# p_attach_to.tail = previous_word[0][1]
# else:
# p_attach_to.tail += previous_word[0][1]
word_text = word[0][0]
comp.text = word_text
attach_to = comp
if not first_in_tag:
# formatted_sentence += '\n'
first_in_tag = True
first_outside_tag = True
# Assuming one collocation per word
# formatted_sentence += '<comp structure_id="{}" num="{}">{}</comp>'.format(word[1][0][0], word[1][0][1], word[0][0])
p_cur_node = cur_node
p_glue_attach_to = comp
p_attach_to = attach_to
previous_word = word
idi += 1
continue
# collocation
# if not first_in_new_row:
# # formatted_sentence += ' '
# word_text = ' ' + word[0][0]
# else:
# word_text = word[0][0]
# if first_in_tag and previous_word:
# word_text = previous_word[0][1] + word[0][0]
# else:
# word_text = word[0][0]
# word_text += word[0][1]
# TODO CHANGE THIS TO FIX SPACE LOCATIONS!
# word_text = word[0][0] + word[0][1]
if not first_outside_tag:
if p_attach_to is None:
if p_cur_node is not None:
p_cur_node.text += previous_word[0][1]
else:
p_attach_to.tail += previous_word[0][1]
word_text = word[0][0]
else:
word_text = ''
if p_attach_to is None:
if p_cur_node is not None:
word_text += previous_word[0][1]
else:
word_text += previous_word[0][1]
if glue_outside:
p_glue_attach_to.tail = previous_word[0][1]
word_text = word[0][0]
else:
word_text += word[0][0]
if attach_to is None:
if cur_node.text is None:
cur_node.text = word_text
else:
cur_node.text += word_text
else:
if attach_to.tail is None:
attach_to.tail = word_text
else:
attach_to.tail += word_text
# attach_to.tail +=word[0][0]
# formatted_sentence += word[0][0]
first_in_tag = False
first_outside_tag = False
p_cur_node = cur_node
p_attach_to = attach_to
previous_word = word
p_glue_attach_to = None
if len(word[0][0]) == 1 and re.match(r'^[\w]+$', word[0][0]) is None:
continue
idi += 1
return parent_node
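# Aggregates syntactic-structure data for one word of interest: for every word that carries an
# SRL role relative to it, records (word form, case letter, lemma) of components whose MSD starts
# with 'S' under role -> structure id -> component number, extending the given summary dict.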
def get_SRLcontainer_data(sentence, word_of_interest_id, summary):
for word in sentence:
if word_of_interest_id in word[2]:
for col in word[1]:
if word_of_interest_id in col[3]:
if word[2][word_of_interest_id] not in summary:
summary[word[2][word_of_interest_id]] = {}
if col[0] not in summary[word[2][word_of_interest_id]]:
summary[word[2][word_of_interest_id]][col[0]] = {}
# word_of_interest_included = word_of_interest_id in col[3]
if col[1] not in summary[word[2][word_of_interest_id]][col[0]]:
summary[word[2][word_of_interest_id]][col[0]][col[1]] = set()
if col[2][0] == 'S':
summary[word[2][word_of_interest_id]][col[0]][col[1]].add((word[0][0], col[2][1], word[3]))
return summary
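# A valency pattern is valid when no semantic role occurs in it more than once.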
def valid_valency_pattern(valency_pattern_key):
occurences = set()
for v_p in valency_pattern_key:
if v_p in occurences:
return False
occurences.add(v_p)
return True
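# For one headword, groups its valency frames by pattern (tuple of functors in the order given by
# `translations`), assigns persistent pattern ids (new ones are written to
# valency_pattern_id_collection), collects up to examples_num example sentences per pattern as XML
# elements together with SRL-container data, and returns the per-pattern data, per-semantic-role
# statistics, totals and the updated pattern id counter.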
def obtain_xml_data(collection, w_a_collection, headword_text, RF, mongo, patterns, pattern_id_max, valency_pattern_id_collection, corpus, examples_num, headword_patterns_ssj):
cur = collection.find({"headwords": headword_text})
frames = []
for ent in cur:
frames += frames_from_db_entry(ent) # pre-process this step for prod TODO
cur.close()
# filter by relevant hw
frames = [x for x in frames if x.hw == headword_text]
ret_frames = RF(frames, mongo.db.sensemap)
json_ret = {"frames": []}
for frame in ret_frames:
frame_json = frame.to_json()
json_ret["frames"].append(frame_json)
# get xml values
headword_patterns = {}
new_patterns = {}
for hws in json_ret.values():
for hw in hws:
# generate valency pattern key
valency_pattern_key = []
for slot in hw['slots']:
valency_pattern_key.append(slot['functor'])
# sort valency_pattern_key by order provided in translations
valency_pattern_key_new = []
for key in translations:
if key in valency_pattern_key:
valency_pattern_key_new.append(key)
valency_pattern_key = tuple(valency_pattern_key_new)
if valency_pattern_key not in headword_patterns:
headword_patterns[valency_pattern_key] = {}
headword_patterns[valency_pattern_key]['sentence_examples'] = []
headword_patterns[valency_pattern_key]['sentence_num'] = 0
headword_patterns[valency_pattern_key]['sr_data'] = {}
if valency_pattern_key not in patterns and valency_pattern_key not in new_patterns:
new_patterns[valency_pattern_key] = pattern_id_max
patterns[valency_pattern_key] = pattern_id_max
pattern_id_max += 1
headword_patterns[valency_pattern_key]['id'] = patterns[valency_pattern_key]
sr_data = headword_patterns[valency_pattern_key]['sr_data']
tids = set(hw['tids'])
if valency_pattern_key in headword_patterns_ssj:
ssj_len = len(headword_patterns_ssj[valency_pattern_key]['sentence_examples'])
else:
ssj_len = 0
for sentence in hw['sentences']:
# sentences_of_interest.append(sentence[0])
# get sentence example
# sentence_example = []
sent_id = sentence[0][0].rsplit('.', 1)[0]
try:
db_sentence = next(iter(w_a_collection.find({'_id': sent_id})))['words']
except StopIteration:
continue
# if valency_pattern_key == ('ACT', 'PAT'):
# print('am')
# idi = 0
idi = 0
hw_idi = -1
for word_id, word in sentence:
if word_id in tids:
hw_idi = idi
if word['word']:
idi += 1
if hw_idi == -1:
raise Exception('No such headword idi!')
# for idi, word in idi_word_generator(sentence):
# print('here')
# for word_id, word_dict in sentence:
# # TODO Modify sentence!
# # if formatted_sentences[sent_id]
# sentence_example.append(word_dict['text'])
# if word_dict['word']:
# idi += 1
# if sent_id == 'ssj134.880.3375':
# print('here')
# if sent_id == 'ssj38.227.917':
# print('here')
# if sent_id == 'GF0004627.1913.1':
# print('here')
# print(sent_id)
# print([a for a in w_a_collection.find()])
# if valency_pattern_key == ('ACT', 'PAT'):
# print('here')
sr_data = get_SRLcontainer_data(db_sentence, str(hw_idi), sr_data)
# TODO ERASE THIS
examples_included_num = 0
# sr_data = get_SRLcontainer_data(formatted_sentences[sent_id], hw_idi, sr_data)
if len(headword_patterns[valency_pattern_key]['sentence_examples']) + ssj_len < examples_num and valid_valency_pattern(valency_pattern_key):
examples_included_num += 1
sentence_example = create_sentence_output(db_sentence, hw_idi, corpus)
# sentence_example = create_sentence_output(formatted_sentences[sent_id], hw_idi)
# sentence_example = ''.join(sentence_example)
# headword_patterns[valency_pattern_key]['sentence_examples'].append(sentence_example)
headword_patterns[valency_pattern_key]['sentence_examples'].append(sentence_example)
headword_patterns[valency_pattern_key]['sentence_num'] += 1
headword_patterns[valency_pattern_key]['sr_data'] = sr_data
# add patterns to db
new_patterns_query = [InsertOne({'_id': v, 'semantic_roles': list(k)}) for k, v in new_patterns.items()]
if len(new_patterns_query) > 0:
result = valency_pattern_id_collection.bulk_write(new_patterns_query)
# calculate statistics
semantic_role_stats = {}
sentence_tot = 0
pattern_tot = len(headword_patterns)
for key, val in headword_patterns.items():
sentence_num = val['sentence_num']
for sr in key:
if sr in semantic_role_stats:
semantic_role_stats[sr]['valency_pattern_num'] += 1
semantic_role_stats[sr]['valency_sentence_num'] += sentence_num
else:
semantic_role_stats[sr] = {}
semantic_role_stats[sr]['valency_pattern_num'] = 1
semantic_role_stats[sr]['valency_sentence_num'] = sentence_num
sentence_tot += sentence_num
return headword_patterns, semantic_role_stats, sentence_tot, pattern_tot, pattern_id_max
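# Main XML export: for every headword, gathers pattern data from ssj500k (and optionally Gigafida),
# looks up the lexical unit, frequency, aspect, senses and definitions in the Sloleks database,
# and writes one dictionary XML file per headword into args.outdir.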
def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, corpus_name, pattern_examples_limit, ignore_gigafida):
query_general = session.query(Lexeme.id, LexicalUnitLexeme.id, LexicalUnit.id, LexicalUnitMeasure.value,
Lexeme.dummy, LexicalUnitType.name) \
.join(Category, Category.id == Lexeme.category_id) \
.join(LexicalUnitLexeme, LexicalUnitLexeme.lexeme_id == Lexeme.id) \
.join(LexicalUnit, LexicalUnit.id == LexicalUnitLexeme.lexical_unit_id) \
.join(LexicalUnitType, LexicalUnitType.id == LexicalUnit.type_id) \
.join(LexicalUnitMeasure, LexicalUnitMeasure.lexical_unit_id == LexicalUnit.id) \
.join(Measure, Measure.id == LexicalUnitMeasure.measure_id) \
.join(Corpus, Corpus.id == LexicalUnitMeasure.corpus_id) \
.filter(LexicalUnitType.name == 'single_lexeme_unit') \
.filter(Measure.name == 'frequency') \
.filter(Corpus.name == 'gigafida') \
.filter(Corpus.version == '2.0')
# valency_pattern_id_collection.find()
# used to not repeat search queries for prepositions
preposition_list = {}
for headword_text, category_text in headword_category:
# with lxml.xmlfile('data/output.xml', encoding='utf-8') as xf:
# a = [a for a in valency_pattern_id_collection.find()]
patterns = {tuple(v_p['semantic_roles']): v_p['_id'] for v_p in [a for a in valency_pattern_id_collection.find()]}
# patterns = {}
pattern_id_max = len(patterns) + 1
# pattern_examples_limit = 4
# get data
headword_patterns_ssj, semantic_role_stats_ssj, sentence_tot_ssj, pattern_tot_ssj, pattern_id_max = obtain_xml_data(collection_ssj, w_a_collection_ssj,
headword_text, RF, mongo, patterns, pattern_id_max, valency_pattern_id_collection, 'ssj500k 2.2', pattern_examples_limit,
{})
if not ignore_gigafida:
headword_patterns_gf, semantic_role_stats_gf, sentence_tot_gf, pattern_tot_gf, pattern_id_max = obtain_xml_data(collection_gigafida,
w_a_collection_gigafida,
headword_text, RF,
mongo, patterns,
pattern_id_max, valency_pattern_id_collection, 'Gigafida 2.0', pattern_examples_limit, headword_patterns_ssj)
# TODO ERASE THIS!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
if len(headword_patterns_ssj) == 0:
continue
wf1 = aliased(WordFormFeature)
wf2 = aliased(WordFormFeature)
wf3 = aliased(WordFormFeature)
query_preposition = session.query(FormRepresentation.form) \
.join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
.join(Lexeme, Lexeme.id == WordForm.lexeme_id) \
.join(wf1, wf1.word_form_id == WordForm.id) \
.join(wf2, wf2.word_form_id == WordForm.id) \
.join(wf3, wf3.word_form_id == WordForm.id) \
.filter(Lexeme.lemma == headword_text) \
.filter(wf1.value == 'singular') \
.filter(wf2.value == 'third') \
.filter(wf3.value == 'present')
pattern_translation_hws = query_preposition.all()
pattern_translation_3_sin = headword_text
if len(pattern_translation_hws) == 1:
pattern_translation_3_sin = pattern_translation_hws[0].form
qname = etree.QName("http://www.w3.org/2001/XMLSchema-instance", "noNamespaceSchemaLocation")
dictionary = lxml.Element('dictionary', {qname: 'valency_lexicon.xsd'})
if headword_text[-1] == '_':
headword_text_query = headword_text[:-1]
else:
headword_text_query = headword_text
query = query_general.filter(Category.name == category_text) \
.filter(Lexeme.lemma == headword_text_query) \
.group_by(Lexeme.id, LexicalUnitLexeme.id, LexicalUnit.id, LexicalUnitMeasure.value,
LexicalUnitType.name)
# res = query.one_or_none()
query_res = query.all()
# query2 = session.query(Lexeme.id) \
# .join(Category, Category.id == Lexeme.category_id) \
# .join(LexicalUnitLexeme, LexicalUnitLexeme.lexeme_id == Lexeme.id) \
# .join(LexicalUnit, LexicalUnit.id == LexicalUnitLexeme.lexical_unit_id) \
# .join(LexicalUnitType, LexicalUnitType.id == LexicalUnit.type_id) \
# .join(LexicalUnitMeasure, LexicalUnitMeasure.lexical_unit_id == LexicalUnit.id) \
# .join(Measure, Measure.id == LexicalUnitMeasure.measure_id) \
# .join(Corpus, Corpus.id == LexicalUnitMeasure.corpus_id) \
# .join(LexemeFeature, LexemeFeature.lexeme_id == Lexeme.id) \
# .join(Feature, Feature.id == LexemeFeature.feature_id) \
# .filter(LexicalUnitType.name == 'single_lexeme_unit') \
# .filter(Measure.name == 'frequency') \
# .filter(Category.name == 'preposition') \
# .filter(Lexeme.lemma == 'za') \
# .filter(Feature.name == 'case') \
# .filter(LexemeFeature.value == 'instrumental') \
# .group_by(Lexeme.id)
# query2 = session.query(Lexeme.id) \
# .join(Category, Category.id == Lexeme.category_id) \
# .join(LexemeFeature, LexemeFeature.lexeme_id == Lexeme.id) \
# .join(Feature, Feature.id == LexemeFeature.feature_id) \
# .filter(Lexeme.lemma == 'za') \
# .filter(Feature.name == 'case') \
# .filter(LexemeFeature.value == 'instrumental') \
# .group_by(Lexeme.id)
#
# a = query2.all()
if len(query_res) == 1:
(lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, frequency, _, lexical_unit_type_name) = \
query_res[0]
elif len(query_res) > 1:
# all lexical_unit_ids equal or at least one dummy
final_lexical_unit_id = 0
final_lexical_unit_lexeme_id = 0
for r in query_res:
(lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, frequency, dummy,
lexical_unit_type_name) = r
if dummy:
final_lexical_unit_id = lexical_unit_id
final_lexical_unit_lexeme_id = lexical_unit_lexeme_id
break
lexical_unit_id = final_lexical_unit_id
lexical_unit_lexeme_id = final_lexical_unit_lexeme_id
else:
frequency = 0
lexeme_id = 0
lexical_unit_id = 0
lexical_unit_lexeme_id = 0
lexical_unit_type_name = ''
sense_ids = session.query(Sense.id, Sense.dummy).filter(Sense.lexical_unit_id == lexical_unit_id).all()
features = session.query(LexemeFeature.value).join(Feature, Feature.id == LexemeFeature.feature_id) \
.filter(LexemeFeature.lexeme_id == lexeme_id) \
.filter(Feature.name == 'aspect').all()
entry = lxml.SubElement(dictionary, 'entry')
head = lxml.SubElement(entry, 'head')
headword = lxml.SubElement(head, 'headword')
lemma = lxml.SubElement(headword, 'lemma')
lemma.text = headword_text
lexical_unit = lxml.SubElement(head, 'lexicalUnit')
lexical_unit.set('id', str(lexical_unit_id))
lexical_unit_type_name = 'single' if lexical_unit_type_name == 'single_lexeme_unit' else lexical_unit_type_name
lexical_unit.set('type', lexical_unit_type_name)
lexeme = lxml.SubElement(lexical_unit, 'lexeme')
lexeme.set('lexical_unit_lexeme_id', str(lexical_unit_lexeme_id))
lexeme.text = headword_text
grammar = lxml.SubElement(head, 'grammar')
category = lxml.SubElement(grammar, 'category')
if args.language == 'sl':
category.text = CATEGORY_MAP[category_text] if category_text in CATEGORY_MAP else ''
else:
category.text = category_text
grammarFeature = lxml.SubElement(grammar, 'grammarFeature')
if args.language == 'sl':
grammarFeature.set('name', 'vid')
grammarFeature.text = ASPECT_MAP[features[0].value] if len(features) > 0 and features[
0].value in ASPECT_MAP else ''
else:
grammarFeature.set('name', 'aspect')
grammarFeature.text = features[0].value if len(features) > 0 else ''
measureList = lxml.SubElement(head, 'measureList')
measure = lxml.SubElement(measureList, 'measure')
measure.set('type', 'frequency')
# TODO Modify this!
measure.set('source', 'Gigafida 2.0')
# measure.set('source', 'ssj500k')
measure.text = str(int(frequency))
body = lxml.SubElement(entry, 'body')
statisticsContainerList = lxml.SubElement(body, 'statisticsContainerList')
# combine semantic_role_stats
semantic_role_stats = {}
for semanticRole_val, semanticRole_stats in semantic_role_stats_ssj.items():
semantic_role_stats[semanticRole_val] = {}
semantic_role_stats[semanticRole_val]['ssj'] = semanticRole_stats
if not ignore_gigafida:
for semanticRole_val, semanticRole_stats in semantic_role_stats_gf.items():
if semanticRole_val not in semantic_role_stats:
semantic_role_stats[semanticRole_val] = {}
semantic_role_stats[semanticRole_val]['gf'] = semanticRole_stats
for semanticRole_val, semanticRole_stats in semantic_role_stats.items():
statisticsContainer = lxml.SubElement(statisticsContainerList, 'statisticsContainer')
semanticRole = lxml.SubElement(statisticsContainer, 'semanticRole')
semanticRole.text = semanticRole_val
measureList = lxml.SubElement(statisticsContainer, 'measureList')
if 'ssj' in semanticRole_stats:
measure_pattern_ssj = lxml.SubElement(measureList, 'measure')
measure_pattern_ssj.set('type', 'valency_pattern_ratio')
measure_pattern_ssj.set('source', 'ssj500k 2.2')
measure_pattern_ssj.text = '%.4f' % (
semantic_role_stats[semanticRole_val]['ssj']['valency_pattern_num'] / pattern_tot_ssj)
measure_sentence_ssj = lxml.SubElement(measureList, 'measure')
measure_sentence_ssj.set('type', 'valency_sentence_ratio')
measure_sentence_ssj.set('source', 'ssj500k 2.2')
if sentence_tot_ssj == 0:
measure_sentence_ssj.text = '%.4f' % (0.0)
# print(headword_text)
# print(semanticRole_val)
# print(semantic_role_stats[semanticRole_val]['gf']['valency_sentence_num'])
else:
measure_sentence_ssj.text = '%.4f' % (
semantic_role_stats[semanticRole_val]['ssj']['valency_sentence_num'] / sentence_tot_ssj)
# measure_sentence_ssj.text = '%.2f' % (
# semantic_role_stats[semanticRole_val]['ssj']['valency_sentence_num'] / sentence_tot_ssj)
if 'gf' in semanticRole_stats and not ignore_gigafida:
measure_pattern_gf = lxml.SubElement(measureList, 'measure')
measure_pattern_gf.set('type', 'valency_pattern_ratio')
measure_pattern_gf.set('source', 'Gigafida 2.0')
measure_pattern_gf.text = '%.4f' % (
semantic_role_stats[semanticRole_val]['gf']['valency_pattern_num'] / pattern_tot_gf)
measure_sentence_gf = lxml.SubElement(measureList, 'measure')
measure_sentence_gf.set('type', 'valency_sentence_ratio')
measure_sentence_gf.set('source', 'Gigafida 2.0')
if sentence_tot_gf == 0:
measure_sentence_gf.text = '%.4f' % (0.0)
# print(headword_text)
# print(semanticRole_val)
# print(semantic_role_stats[semanticRole_val]['gf']['valency_sentence_num'])
else:
measure_sentence_gf.text = '%.4f' % (
semantic_role_stats[semanticRole_val]['gf']['valency_sentence_num'] / sentence_tot_gf)
senseList = lxml.SubElement(body, 'senseList')
for sense_id in sense_ids:
if len(sense_ids) > 1 and sense_id.dummy:
continue
sense = lxml.SubElement(senseList, 'sense')
if not sense_id.dummy:
sense.set('id', str(sense_id.id))
definitionList = lxml.SubElement(sense, 'definitionList')
definition_texts = session.query(Definition.description).filter(
Definition.sense_id == sense_id.id).all()
for definition_text in definition_texts:
definition = lxml.SubElement(definitionList, 'definition')
definition.text = definition_text[0]
syntactic_structures = session.query(SyntacticStructure.id, SyntacticStructure.name,
StructureComponent.id, StructureComponent.name).join(
LexicalUnit, LexicalUnit.syntactic_structure_id == SyntacticStructure.id) \
.join(StructureComponent, StructureComponent.syntactic_structure_id == SyntacticStructure.id) \
.filter(LexicalUnit.id == sense_id.id)
# .join(LexicalUnitLexeme, LexicalUnitLexeme.structure_component_id == StructureComponent.id) \
# syntactic_structures2 = session.query(SyntacticStructure.id, SyntacticStructure.name).join(SyntacticStructure, SyntacticStructure.id == LexicalUnit.syntactic_structure_id) \
# .filter(SyntacticStructure.id == sense_id)
syntactic_structuresr = syntactic_structures.all()
# syntactic_structures2r = syntactic_structures2.all()
valencyPatternList = lxml.SubElement(sense, 'valencyPatternList')
valencyPatternList.set('system', 'JOS')
# combine semantic_role_stats ##################################
headword_patterns = {}
for headword_patterns_val, headword_patterns_stats in headword_patterns_ssj.items():
headword_patterns[headword_patterns_val] = {}
headword_patterns[headword_patterns_val]['ssj'] = headword_patterns_stats
if not ignore_gigafida:
for headword_patterns_val, headword_patterns_stats in headword_patterns_gf.items():
if headword_patterns_val not in headword_patterns:
headword_patterns[headword_patterns_val] = {}
headword_patterns[headword_patterns_val]['gf'] = headword_patterns_stats
#################################################################
for headword_pattern, headword_pattern_dict in headword_patterns.items():
valencyPattern = lxml.SubElement(valencyPatternList, 'valencyPattern')
valencyPattern.set('id', str(patterns[headword_pattern]))
measureList_sense = lxml.SubElement(valencyPattern, 'measureList')
if 'ssj' in headword_pattern_dict:
measure_sense = lxml.SubElement(measureList_sense, 'measure')
measure_sense.set('type', 'frequency_all')
measure_sense.set('source', 'ssj500k 2.2')
measure_sense.text = str(headword_pattern_dict['ssj']['sentence_num'])
if not ignore_gigafida and 'gf' in headword_pattern_dict and headword_pattern_dict['gf']['sentence_num']:
measure_sense = lxml.SubElement(measureList_sense, 'measure')
measure_sense.set('type', 'frequency_all')
measure_sense.set('source', 'Gigafida 2.0')
measure_sense.text = str(headword_pattern_dict['gf']['sentence_num'])
semanticRoleContainerList = lxml.SubElement(valencyPattern, 'semanticRoleContainerList')
# patternId = lxml.SubElement(semanticRoles, 'patternId')
# patternId.text = str(patterns[headword_pattern])
if 'ACT' in headword_pattern:
patternTranslationText = 'KDO/KAJ ' + pattern_translation_3_sin
else:
patternTranslationText = headword_text
for semantic_role in headword_pattern:
if semantic_role != 'ACT':
# additional rules
# if semantic_role == 'RESLT':
# pass
# else:
# patternTranslationText += ' ' + translations[semantic_role]
patternTranslationText += ' ' + translations[semantic_role]
semanticRoleContainer = lxml.SubElement(semanticRoleContainerList, 'semanticRoleContainer')
semanticRole = lxml.SubElement(semanticRoleContainer, 'semanticRole')
semanticRole.text = semantic_role
syntactic_structure_dict = {}
# TODO EXPAND FROM SSJ DATA ONLY + FIX BUG ABOUT SEMANTIC ROLE CONTAINER + EXAMPLES NOT WORKING!!! FIX IDS
if 'ssj' in headword_pattern_dict and semantic_role in headword_pattern_dict['ssj']['sr_data']:
for syn_struct_id, syn_struct_dict in headword_pattern_dict['ssj']['sr_data'][semantic_role].items():
if syn_struct_id not in syntactic_structure_dict:
syntactic_structure_dict[syn_struct_id] = {}
for com_num, com_set in syn_struct_dict.items():
if com_num not in syntactic_structure_dict[syn_struct_id]:
syntactic_structure_dict[syn_struct_id][com_num] = set()
for lex in com_set:
syntactic_structure_dict[syn_struct_id][com_num].add(lex)
if 'gf' in headword_pattern_dict and semantic_role in headword_pattern_dict['gf']['sr_data']:
for syn_struct_id, syn_struct_dict in headword_pattern_dict['gf']['sr_data'][semantic_role].items():
if syn_struct_id not in syntactic_structure_dict:
syntactic_structure_dict[syn_struct_id] = {}
for com_num, com_set in syn_struct_dict.items():
if com_num not in syntactic_structure_dict[syn_struct_id]:
syntactic_structure_dict[syn_struct_id][com_num] = set()
for lex in com_set:
syntactic_structure_dict[syn_struct_id][com_num].add(lex)
if len(syntactic_structure_dict) > 0:
syntacticStructureList = lxml.SubElement(semanticRoleContainer, 'syntacticStructureList')
# iterate over syntactic structures and write them
for syn_struct_id, component_dict in syntactic_structure_dict.items():
syntacticStructure = lxml.SubElement(syntacticStructureList, 'syntacticStructure')
syntacticStructure.set('id', syn_struct_id)
for comp_id, lexemes in component_dict.items():
for l in lexemes:
component = lxml.SubElement(syntacticStructure, 'component')
component.set('num', comp_id)
lexem = lxml.SubElement(component, 'lexeme')
if l in preposition_list:
prep_id = preposition_list[l]
else:
query_preposition = session.query(Lexeme.id) \
.join(Category, Category.id == Lexeme.category_id) \
.join(LexemeFeature, LexemeFeature.lexeme_id == Lexeme.id) \
.join(Feature, Feature.id == LexemeFeature.feature_id) \
.filter(Lexeme.lemma == l[2]) \
.filter(Feature.name == 'case') \
.filter(LexemeFeature.value == CASE_MAP[l[1]]) \
.group_by(Lexeme.id)
preposition_ids = query_preposition.all()
if len(preposition_ids) != 1:
prep_id = ''
else:
prep_id = str(preposition_ids[0][0])
preposition_list[l] = prep_id
lexem.set('sloleks', prep_id)
lexem.text = l[2]
# if 'gf' in headword_pattern_dict and semantic_role in headword_pattern_dict['gf']['sr_data']:
# for syn_struct_id, syn_struct_dict in headword_pattern_dict['gf']['sr_data'][semantic_role].items():
# syntacticStructure = lxml.SubElement(syntacticStructureList, 'syntacticStructure')
# syntacticStructure.set('id', syn_struct_id)
# for com_num, com_set in syn_struct_dict.items():
# # component = lxml.SubElement(syntacticStructure, 'component')
# # component.set('num', com_num)
# for lex in com_set:
# component = lxml.SubElement(syntacticStructure, 'component')
# component.set('num', com_num)
# lexem = lxml.SubElement(component, 'lexeme')
# lexem.set('sloleks', '')
# lexem.text = lex
patternRepresentation = lxml.SubElement(valencyPattern, 'patternRepresentation')
patternRepresentation.text = patternTranslationText
exampleContainerList = lxml.SubElement(valencyPattern, 'exampleContainerList')
if 'ssj' in headword_pattern_dict:
for sentence_example in headword_pattern_dict['ssj']['sentence_examples']:
exampleContainer = lxml.SubElement(exampleContainerList, 'exampleContainer')
# corpusExample = lxml.SubElement(exampleContainer, 'corpusExample')
exampleContainer.append(sentence_example)
if 'gf' in headword_pattern_dict:
for sentence_example in headword_pattern_dict['gf']['sentence_examples']:
exampleContainer = lxml.SubElement(exampleContainerList, 'exampleContainer')
# corpusExample = lxml.SubElement(exampleContainer, 'corpusExample')
exampleContainer.append(sentence_example)
with lxml.xmlfile(os.path.join(args.outdir, 'VS10_' + headword_text + '_' + corpus_name + '.xml'),
encoding='utf-8') as xf:
xf.write(dictionary, pretty_print=True)
# xf.write(entry, pretty_print=True)
# tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
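# Reflects the Sloleks PostgreSQL schema with SQLAlchemy and binds the module-level ORM
# placeholders (Lexeme, Feature, ...) to the reflected tables; returns the engine.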
def init_db(db):
global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
[db_user, db_password, db_database, db_host] = db.split(':')
Base = declarative_base()
engine = create_engine('postgresql://' + db_user + ':' + db_password + '@' + db_host + '/' + db_database,
pool_recycle=14400)
Base.metadata.reflect(engine)
class Lexeme(Base):
__table__ = Base.metadata.tables['jedro_lexeme']
class LexemeFeature(Base):
__table__ = Base.metadata.tables['jedro_lexeme_feature']
class SyntacticStructure(Base):
__table__ = Base.metadata.tables['jedro_syntacticstructure']
class StructureComponent(Base):
__table__ = Base.metadata.tables['jedro_structurecomponent']
class Feature(Base):
__table__ = Base.metadata.tables['jedro_feature']
class LexicalUnitLexeme(Base):
__table__ = Base.metadata.tables['jedro_lexicalunit_lexeme']
class LexicalUnit(Base):
__table__ = Base.metadata.tables['jedro_lexicalunit']
class LexicalUnitType(Base):
__table__ = Base.metadata.tables['jedro_lexicalunittype']
class Category(Base):
__table__ = Base.metadata.tables['jedro_category']
class Sense(Base):
__table__ = Base.metadata.tables['jedro_sense']
class Measure(Base):
__table__ = Base.metadata.tables['jedro_measure']
class LexicalUnitMeasure(Base):
__table__ = Base.metadata.tables['jedro_lexicalunitmeasure']
class Corpus(Base):
__table__ = Base.metadata.tables['jedro_corpus']
class Definition(Base):
__table__ = Base.metadata.tables['jedro_definition']
class WordForm(Base):
__table__ = Base.metadata.tables['jedro_wordform']
class WordFormFeature(Base):
__table__ = Base.metadata.tables['jedro_wordform_feature']
class FormRepresentation(Base):
__table__ = Base.metadata.tables['jedro_formrepresentation']
return engine
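# Matches all syntactic structures against the words of one sentence, keeps only matches that
# contain a verb, and returns [match, collocation_id] pairs, where collocation_id is the structure
# id followed by the sorted (component index, lemma) pairs.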
def match_file(words, structures):
matches = []
for s in structures:
for w in words:
mhere = s.match(w)
for match in mhere:
# save only those with verbs in them
if not [True for m in match.values() if m.msd[0] == 'V']:
continue
colocation_id = [(idx, w.lemma) for idx, w in match.items()]
colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x: x[0]))
colocation_id = tuple(colocation_id)
matches.append([match, colocation_id])
return matches
possible_jos_links = {'dol', 'del', 'prir', 'vez', 'skup', 'ena', 'dve', 'tri', 'štiri', 'modra'}
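# Recursively follows JOS dependency links from a word and records, for every descendant word id,
# the mapping headword-token-index -> semantic role in deppar_dict.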
def find_word_sons(word, deppar_dict, word_id, role):
for k, v in word.links.items():
# if k != 'default_factory':
for w in v:
# if k in possible_jos_links and w.id == 'ssj1.1.1.t21':
# print('here')
if k in possible_jos_links:
if w.id not in deppar_dict:
deppar_dict[w.id] = {}
deppar_dict[w.id][word_id] = role
find_word_sons(w, deppar_dict, word_id, role)
# elif k in possible_jos_links:
# raise Exception('One word in multiple dependency parsetrees')
# for ignoring punctuations
def idi_word_generator(sentence):
idi = 0
for word in sentence:
if len(word.text) == 1 and re.match(r'^[\w]+$', word.text) is None:
continue
yield idi, word
idi += 1
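# Second pass: loads corpus files, matches syntactic structures, combines the collocation matches
# with the dependency/semantic-role info provided by load_files (from w_collection) into a
# per-sentence list of ((text, glue), collocations, roles, lemma) tuples, and bulk-upserts these
# into w_a_collection in chunks. For Gigafida the original corpus is read in parallel to recover
# token glue (spacing).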
def extract_sentences(w_collection, w_a_collection, args, input_corpus, input_corpus_orig):
structures, _, max_num_components = build_structures(args)
timeinfo = TimeInfo(len(input_corpus))
database = Database(args)
# match_store = MatchStore(args, database)
# word_stats = WordStats(lemma_msds, database)
formatted_sentences = {}
start_time = time.time()
# print(time.time() - start_time)
sentences_num_limit = 10000
sentences_in_ram = 0
# is_gf = input_corpus_orig is not None
is_gf = input_corpus_orig is not None
if is_gf:
glue_words_gen = file_sentence_glue_generator(input_corpus_orig, args.pc_tag, w_collection)
for sent_id, sentence, othr_sentence_attributes in load_files(args, database, w_collection, input_corpus):
if is_gf:
sentence_glue = next(glue_words_gen)
if sent_id != sentence_glue[0]:
raise Exception(f"Annotated gigafida and original gigafida not in sync (annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]})")
if len(sentence_glue[1]) != len(sentence):
raise Exception(
f"Annotated gigafida and original gigafida size is not the same (annotated: {len(sentence)}, original: {len(sentence_glue[1])})")
for w, w_glue in zip(sentence, sentence_glue[1]):
w.glue = w_glue[2]
if sentence is None:
timeinfo.add_measurement(-1)
continue
# start_time = time.time()
# print(time.time() - start_time)
matches = match_file(sentence, structures)
# if sent_id == 'ssj134.880.3375':
# print('here')
# print(time.time() - start_time)
# match_store.add_matches(matches)
# word_stats.add_words(words)
# database.commit()
# find unimportant collocations
# extract_possible_headwords = set(v[0] for v in othr_sentence_attributes.values())
for match in matches:
match_idis = []
for key, word in match[0].items():
match_idis.append(word.idi)
match.append(match_idis)
collocations = {}
for match in matches:
for key, word in match[0].items():
# if word.id == ''
if word.id not in collocations:
collocations[word.id] = []
collocations[word.id].append((match[1][0], key, word.msd[:2], match[2]))
# print(time.time() - start_time)
formatted_sentence = []
deppar_dict = {}
# idi = 0
# create output and form dependency parsetree sons
for idi, word in idi_word_generator(sentence):
# if word.text == 'Mumel':
# print('here')
# if word.text == 'Poleg':
# print('here')
# if word.text == 'Luka':
# print('here')
idi = str(idi)
# a = sent_id in sentences_of_interest
# b = (word.lemma, word.msd) in sentences_of_interest[sent_id]
# if word.msd == 'X':
# continue
# if len(word.text) == 1 and word.text in string.punctuation + '':
# a = re.match('^[\w]+$', word.text) is not None
# if len(word.text) == 1 and re.match('^[\w]+$', word.text) is None:
# continue
# if sent_id in sentences_of_interest and (word.lemma, word.msd) in sentences_of_interest[sent_id]:
# if sent_id in sentences_of_interest and idi in sentences_of_interest[sent_id]:
# cur_count = w_collection.count_documents({'_id': sent_id})
# if w_collection.count_documents({'_id': sent_id}) > 0:
sentence_of_interest = othr_sentence_attributes
# is_count = cur.count() > 0
if idi in othr_sentence_attributes:
if word.id not in deppar_dict:
deppar_dict[word.id] = {}
deppar_dict[word.id][sentence_of_interest[idi][0]] = sentence_of_interest[idi][1]
# deppar_dict[word.id] = {idi: sentences_of_interest[sent_id][idi]}
# if idi != sentences_of_interest[sent_id][(word.lemma, word.msd)][1]:
# if (word.lemma, word.msd) != sentences_of_interest[sent_id][idi][1]:
# print((word.lemma, word.msd))
# print(sentences_of_interest[sent_id][idi][1])
# if sentences_of_interest[sent_id][(word.lemma, word.msd)][1] > idi:
# print('HERE')
find_word_sons(word, deppar_dict, sentence_of_interest[idi][0], sentence_of_interest[idi][1])
# idi += 1
# print(time.time() - start_time)
for word in sentence:
if word.id in collocations:
col = collocations[word.id]
else:
col = []
if word.id in deppar_dict:
dp = deppar_dict[word.id]
else:
dp = {}
formatted_sentence.append(((word.text, word.glue), col, dp, word.lemma))
# create_sentence_output(formatted_sentence, 4)
formatted_sentences[sent_id] = formatted_sentence
if sentences_in_ram >= sentences_num_limit:
sentences_in_ram = 0
requests = [UpdateOne({'_id': k}, {'$set': {'words': v}}, upsert=True) for k, v in formatted_sentences.items()]
result = w_a_collection.bulk_write(requests)
formatted_sentences = {}
sentences_in_ram += 1
# print(time.time() - start_time)
requests = [UpdateOne({'_id': k}, {'$set': {'words': v}}, upsert=True) for k, v in formatted_sentences.items()]
if len(requests) > 0:
result = w_a_collection.bulk_write(requests)
# force a bit of garbage collection
# del sentence
# del sent_id
# del matches
# gc.collect()
print(time.time() - start_time)
# return formatted_sentences
# # timeinfo.add_measurement(time.time() - start_time)
# # timeinfo.info()
# # if no output files, just exit
# if all([x == None for x in [args.out, args.out_no_stat, args.all, args.stats]]):
# return
#
# # get word renders for lemma/msd
# word_stats.generate_renders()
# match_store.determine_colocation_dispersions()
#
# # figure out representations!
# if args.out or args.out_no_stat:
# match_store.set_representations(word_stats, structures)
#
# Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
# structures, match_store)
# Writer.make_output_no_stat_writer(args, max_num_components, match_store, word_stats).write_out(
# structures, match_store)
# Writer.make_all_writer(args, max_num_components, match_store, word_stats).write_out(
# structures, match_store)
# Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out(
# structures, match_store)
def get_headword_category(collection):
"""
Returns a list of (headword, category) tuples for all headwords in the MongoDB collection;
if args.headwords is given, the headword list is also written to that file.
"""
headwords = sorted(collection.distinct("headwords")[1:])
if args.headwords:
with open(args.headwords, 'w') as f:
for item in headwords:
f.write("%s\n" % item)
headword_category = [(headword, 'verb') if headword[-1] != '_' else (headword, 'adjective') for headword in
headwords]
return headword_category
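# Pipeline entry point: connects to MongoDB and Sloleks, optionally runs the two preprocessing
# passes (get_sentences_of_interest, extract_sentences) and finally writes the per-headword XML
# files via write_xml.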
def main(args):
# with Path('data/wordlist.json').open("r") as fp:
# sskj_wordlist = json.load(fp)
# # wordlist = set(sskj_wordlist['wordlist'])
# wordlist = set(sskj_wordlist['wordlist'])
print('beginning chunk')
start_time = time.time()
# user:user:valdb:127.0.0.1
mongo = MongoClient(username='user', password='user', authSource='valdb')
db = mongo.valdb
collection_ssj = db['ssj']
collection_gigafida = db['gigafida']
db2 = mongo.extvaldb
# write collection
w_collection_ssj = db2['ssj']
w_collection_gigafida = db2['gigafida']
w_a_collection_ssj = db2['ssj' + '_all']
w_a_collection_gigafida = db2['gigafida' + '_all']
valency_pattern_id_collection = db2['valency_pattern_ids']
RF = reduce_functions["reduce_0"]["f"]
# get all headwords from database
# headword_category = get_headword_category(collection_ssj)
with open(args.headwords, 'r') as read:
headword_category = [(line[:-1], 'verb') for line in read.readlines()]
assert args.language == 'en' or args.language == 'sl'
shutil.rmtree(args.outdir, True)
os.mkdir(args.outdir)
engine = init_db(args.sloleks_db)
# input_file = codecs.open(args.infile, 'r')
# # input_file = []
# next(input_file)
# category_map = {'samostalnik':'noun', 'glagol':'verb', 'pridevnik':'adjective', 'prislov':'adverb', 'števnik':'numeral', 'zaimek':'pronoun', 'medmet':'interjection', 'veznik':'conjunction'}
session = Session(engine)
# cur = collection.find({})
#
# a = []
# cur_len = 0
# # num_empty_sent = 0
# for ent in cur:
# cur_len += 1
# # s = frames_from_db_entry(ent)
# # if not s:
# # num_empty_sent += 1
# a += frames_from_db_entry(ent)
print(time.time() - start_time)
# print(num_empty_sent)
print('get_sentences_of_interest')
start_time = time.time()
# sentences_of_interest = get_sentences_of_interest(headword_category, collection, w_collection, RF, mongo)
# sentences_of_interest_stored = args.p1_processed
if not args.p1_processed:
with tqdm(total=len(headword_category)) as pbar:
get_sentences_of_interest(headword_category, collection_ssj, w_collection_ssj, RF, mongo, pbar)
if not args.ignore_gigafida:
with tqdm(total=len(headword_category)) as pbar:
get_sentences_of_interest(headword_category, collection_gigafida, w_collection_gigafida, RF, mongo, pbar)
# sentences_of_interest = OrderedDict(sorted(sentences_of_interest.items()))
print(time.time() - start_time)
# num_sentences = 0
# for el in all_sentences:
# if el not in sentences_of_interest:
# num_sentences += 1
#
# print(num_sentences)
# print(len(all_sentences))
print('extract_sentences')
start_time = time.time()
# formatted_sentences_stored = args.p2_processed
if not args.p2_processed:
gf_anno_paths = list(os.walk(args.input_gigafida_annotated))
gf_anno_paths = [os.path.join(p_t[0], f_n) for p_t in gf_anno_paths for f_n in p_t[2]]
gf_orig_paths = list(os.walk(args.input_gigafida_original))
gf_orig_paths = sorted([os.path.join(p_t[0], f_n) for p_t in gf_orig_paths for f_n in p_t[2] if f_n[:2] == 'GF'])
extract_sentences(w_collection_ssj, w_a_collection_ssj, args, args.input_sloleks, None)
if not args.ignore_gigafida:
extract_sentences(w_collection_gigafida, w_a_collection_gigafida, args, gf_anno_paths, gf_orig_paths)
print(time.time() - start_time)
print('write_xml')
start_time = time.time()
# print('aa ' + 3)
write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, args.corpus_name, args.pattern_examples_limit, args.ignore_gigafida)
print(time.time() - start_time)
# input_file.close()
session.close()
if __name__ == '__main__':
arg_parser = argparse.ArgumentParser(description='Create per-headword valency lexicon XML entries from the valency MongoDB and the Sloleks database.')
arg_parser.add_argument('--sloleks_db', type=str, help='Database credentials')
arg_parser.add_argument('--schema', type=str, help='XML schema')
arg_parser.add_argument('--infile', type=str, help='Input file')
arg_parser.add_argument('--outdir', type=str, help='Output directory')
arg_parser.add_argument('--headwords', type=str, default=None, help='Path to the file with headwords (one per line); read as input in main() and written by get_headword_category().')
arg_parser.add_argument('--language', type=str, help='Language of certain attributes')
arg_parser.add_argument('--structure_extraction', type=str, help='Path to project (https://gitea.cjvt.si/ozbolt/luscenje_struktur)')
arg_parser.add_argument('--corpus_name', type=str, help='Name of corpus to be written in outputs.')
arg_parser.add_argument('--pattern_examples_limit', type=int, default=10, help='Max number of examples.')
arg_parser.add_argument('--ignore_gigafida', action='store_true', help='If set, ignore Gigafida in the output.')
arg_parser.add_argument('--p1_processed',
help='Skip first part (obtaining sentences of interest) when they are already in DB.',
action='store_true')
arg_parser.add_argument('--p2_processed',
help='Skip second part (obtaining formatted sentences) when they are already in DB.',
action='store_true')
arg_parser.add_argument('--structures',
help='Structures definitions in xml file')
arg_parser.add_argument('--input_sloleks',
help='Input file(s), currently gz or xml. If omitted, only the database is loaded', nargs='*')
arg_parser.add_argument('--input_gigafida_annotated',
help='Directory with the annotated Gigafida files (gz or xml). If omitted, only the database is loaded')
arg_parser.add_argument('--input_gigafida_original',
help='Directory with the original Gigafida files (gz or xml). If omitted, only the database is loaded')
arg_parser.add_argument('--out',
help='Classic output file')
arg_parser.add_argument('--out-no-stat',
help='Output file, but without statistical columns')
arg_parser.add_argument('--all',
help='Additional output file, writes more data')
arg_parser.add_argument('--stats',
help='Output file for statistics')
arg_parser.add_argument('--no-msd-translate',
help='MSDs are translated from Slovene to English by default; this flag disables the translation',
action='store_true')
arg_parser.add_argument('--skip-id-check',
help='Skips checks for ids of <w> and <pc>, if they are in correct format',
action='store_true')
arg_parser.add_argument('--min_freq', help='Minimal frequency in output',
type=int, default=0, const=1, nargs='?')
arg_parser.add_argument('--verbose', help='Enable verbose output to stderr',
choices=["warning", "info", "debug"], default="info",
const="info", nargs='?')
arg_parser.add_argument('--count-files',
help="Count files: more verbose output", action='store_true')
arg_parser.add_argument('--multiple-output',
help='Generate one output for each syntactic structure',
action='store_true')
arg_parser.add_argument('--sort-by',
help="Sort by this column (index)", type=int, default=-1)
arg_parser.add_argument('--sort-reversed',
help="Sort in reversed order", action='store_true')
arg_parser.add_argument('--db',
help="Database file to use (instead of memory)", default=None)
arg_parser.add_argument('--new-db',
help="Writes over database file, if there exists one", action='store_true')
arg_parser.add_argument('--pc-tag',
help='Tag for separators, usually pc or c', default="pc")
args = arg_parser.parse_args()
logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())
try:
sys.path.insert(1, args.structure_extraction)
from progress_bar import progress
from word import Word, WordCompressed
from syntactic_structure import build_structures
from match_store import MatchStore
from word_stats import WordStats
from writer import Writer
from loader import load_files, file_sentence_glue_generator
from database import Database
from time_info import TimeInfo
from msd_translate import MSD_TRANSLATE
except ImportError:
logging.error('Could not import structure-extraction modules from %s', args.structure_extraction)
raise
start = time.time()
main(args)
logging.info("TIME: {}".format(time.time() - start))