#!/usr/bin/python3
# -*- coding: utf-8 -*-

# imports from luscenje_struktur
from luscenje_struktur.progress_bar import progress
from luscenje_struktur.word import Word, WordCompressed
from luscenje_struktur.syntactic_structure import build_structures
from luscenje_struktur.match_store import MatchStore
from luscenje_struktur.word_stats import WordStats
from luscenje_struktur.writer import Writer
from luscenje_struktur.loader import load_files, file_sentence_glue_generator
from luscenje_struktur.database import Database
from luscenje_struktur.time_info import TimeInfo
from luscenje_struktur.msd_translate import MSD_TRANSLATE

# make database-service
import gc
import re
import string
from collections import OrderedDict
import sys
from tqdm import tqdm

import pymongo
# import tqdm as tqdm

# sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/valency')
# sys.path.insert(1, '/home/luka/Development/srl/cjvt_valency/src/pkg/cjvt-corpusparser')
from valency.Frame import frames_from_db_entry_headword
from valency.reduce_functions import reduce_functions

import argparse
import os
import shutil
import lxml.etree as lxml
import codecs

import logging
import pickle
import time

from io import StringIO
from lxml import etree

from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import Session, aliased
from sqlalchemy import create_engine
from sqlalchemy import func

from pymongo import MongoClient, UpdateOne, InsertOne

# examples_num = sys.maxsize
# corpus = 'ssj'

translations = {
    'ACT': 'KDO/KAJ',
    'PAT': 'KOGA/KAJ',
    'RESLT': 'REZULTAT',
    'REC': 'KOMU/ČEMU',
    'TIME': 'KDAJ',
    'MANN': 'KAKO',
    'LOC': 'KJE',
    'MEANS': 'S ČIM',
    'GOAL': 'ČEMU',
    'REG': 'GLEDE NA KOGA/KAJ',
    'DUR': 'KOLIKO ČASA',
    'CAUSE': 'ZAKAJ',
    'COND': 'POD KATERIM POGOJEM',
    'ORIG': 'IZVOR',
    'FREQ': 'KOLIKOKRAT',
    'SOURCE': 'OD KOD',
    'AIM': 'S KAKŠNIM NAMENOM',
    'QUANT': 'ŠTEVILO',
    'EVENT': 'NA DOGODKU',
    'CONTR': 'KLJUB ČEMU',
    'ACMP': 'S KOM/ČIM',
    'RESTR': 'Z OMEJITVIJO',
    'MWPRED': '',
    'MODAL': '',
    'PHRAS': ''
}

CATEGORY_MAP = {
    'noun': 'samostalnik',
    'verb': 'glagol',
    'adjective': 'pridevnik',
    'adverb': 'prislov',
    'pronoun': 'zaimek',
    'numeral': 'števnik',
    'preposition': 'predlog',
    'conjunction': 'veznik',
    'particle': 'členek',
    'interjection': 'medmet',
    'abbreviation': 'okrajšava',
    'residual': 'neuvrščeno'
}

ASPECT_MAP = {
    'perfective': 'dovršni',
    'progressive': 'nedovršni',
    'biaspectual': 'dvovidski'
}

CASE_MAP = {
    'n': 'nominative',
    'g': 'genitive',
    'd': 'dative',
    'a': 'accusative',
    'l': 'locative',
    'i': 'instrumental'
}

Lexeme = None
LexemeFeature = None
SyntacticStructure = None
StructureComponent = None
Feature = None
LexicalUnitLexeme = None
LexicalUnit = None
LexicalUnitType = None
Category = None
Sense = None
Measure = None
LexicalUnitMeasure = None
Corpus = None
Definition = None
WordForm = None
WordFormFeature = None
FormRepresentation = None


# corpus = 'gigafida'
from pathlib import Path
import json

def hws_generator(collection, headword_text, RF, mongo):
    cur = collection.find({"headwords": headword_text})
    # print('tu2!')
    frames = []
    for ent in cur:
        frames += frames_from_db_entry_headword(ent, headword_text)  # pre-process this step for prod TODO
    cur.close()

    # if headword_text == 'brati':
    #     print('here')
    # if headword_text == 'prevajati':
    #     print('here')

    ret_frames = RF(frames, mongo.db.sensemap)
    # print('tu4!')
    for frame in ret_frames:
        frame_json = frame.to_json()
        yield frame_json

def get_sentences_of_interest(headword_category, collection, w_collection, RF, mongo, pbar, status_collection, corpus_type):
|
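    # Phase 1 ("p1"): for every (headword, category) pair, collect the sentences in which the
    # headword occurs, map each participating token index to its (headword index, semantic role)
    # pair, and bulk-upsert the result into w_collection in chunks of sentences_num_limit.
    # status_collection records finished headwords so an interrupted run can be resumed.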
|
sentences_of_interest = {}
|
|
# all_sentences = set()
|
|
    headword_category = sorted(headword_category, key=lambda x: x[0])
|
|
# num_sentences in RAM at once
|
|
sentences_num_limit = 15000
|
|
sentences_in_ram = 0
|
|
# part = 0
|
|
# start_time = time.time()
|
|
# first_sentence = True
|
|
# section_included = False
|
|
# last_processed_hw = 'pomeniti'
|
|
# last_processed_hw = 'iti'
|
|
# last_processed_hw = 'aktivirati'
|
|
# last_processed_hw = 'aktivirati'
|
|
|
|
status_collection_update_list = []
|
|
|
|
# already_processed = False
|
|
for headword_id, (headword_text, category_text) in enumerate(headword_category):
|
|
# check whether element has been processed
|
|
if status_collection.count_documents({'corpus_type': corpus_type, 'headword_text': headword_text, 'part': 'p1'}):
|
|
pbar.update(1)
|
|
continue
|
|
# print(headword_text)
|
|
# if already_processed:
|
|
# if headword_text != last_processed_hw:
|
|
# continue
|
|
# else:
|
|
# already_processed = False
|
|
# for headword_text, category_text in headword_category[15:20]:
|
|
# headword_text = 'zadovoljen'
|
|
# category_text = 'adjective'
|
|
headword_patterns_ids = {}
|
|
# print('tu1!')
|
|
cur = collection.find({"headwords": headword_text})
|
|
# print('tu2!')
|
|
frames = []
|
|
for ent in cur:
|
|
frames += frames_from_db_entry_headword(ent, headword_text) # pre-process this step for prod TODO
|
|
cur.close()
|
|
|
|
# if headword_text == 'brati':
|
|
# print('here')
|
|
# if headword_text == 'prevajati':
|
|
# print('here')
|
|
|
|
ret_frames = RF(frames, mongo.db.sensemap)
|
|
json_ret = {"frames": []}
|
|
# print('tu4!')
|
|
for frame in ret_frames:
|
|
frame_json = frame.to_json()
|
|
json_ret["frames"].append(frame_json)
|
|
# print('tu5!')
|
|
# get xml values
|
|
|
|
|
|
for hws in json_ret.values():
|
|
for hw in hws:
|
|
|
|
# print(hw['hw'])
|
|
# if hw['hw'] == 'pomeniti':
|
|
# print('aaa')
|
|
# generate valency pattern key
|
|
valency_pattern_key = []
|
|
functors = {}
|
|
if len(hw['tids']) != 1:
|
|
raise Exception('Multiple TIDS')
|
|
for slot in hw['slots']:
|
|
valency_pattern_key.append(slot['functor'])
|
|
for tid in slot['tids']:
|
|
if tid not in functors:
|
|
functors[tid] = {}
|
|
functors[tid] = slot['functor']
|
|
valency_pattern_key = tuple(sorted(valency_pattern_key))
|
|
if valency_pattern_key not in headword_patterns_ids:
|
|
headword_patterns_ids[valency_pattern_key] = []
|
|
|
|
for sentence in hw['sentences']:
|
|
# all_sentences.add(sentence[0][0])
|
|
# if len(headword_patterns_ids[valency_pattern_key]) < examples_num:
|
|
# if section_included:
|
|
# if not sentences_in_ram > sentences_num_limit:
|
|
# sentences_in_ram += 1
|
|
# continue
|
|
# else:
|
|
# first_sentence = True
|
|
|
|
sentence_id = sentence[0][0].rsplit('.', 1)[0]
|
|
# print(sentence_id)
|
|
if sentence_id not in sentences_of_interest:
|
|
sentences_of_interest[sentence_id] = {}
|
|
idi = 0
|
|
parent_idi = -1
|
|
# print('t1')
|
|
for idx, word in sentence:
|
|
if idx == hw['tids'][0]:
|
|
parent_idi = idi
|
|
if word['word']:
|
|
idi += 1
|
|
# print('t2')
|
|
if parent_idi == -1:
|
|
raise Exception('No parent found!')
|
|
idi = 0
|
|
# if len(sentence) > 500:
|
|
# print(len(sentence))
|
|
for idx, word in sentence:
|
|
if idx in functors:
|
|
# sentences_of_interest[sentence_id][(word['lemma'], MSD_TRANSLATE[word['msd']])] = functors[idx]
|
|
# sentences_of_interest[sentence_id][(word['lemma'], MSD_TRANSLATE[word['msd']])] = (functors[idx], idi)
|
|
# sentences_of_interest[sentence_id][idi] = (functors[idx], (word['lemma'], MSD_TRANSLATE[word['msd']]))
|
|
sentences_of_interest[sentence_id][str(idi)] = (str(parent_idi), functors[idx])
|
|
if word['word']:
|
|
# if sentence_id == 'ssj37.216.892':
|
|
# print(idi)
|
|
# print(word['text'])
|
|
idi += 1
|
|
# print('t3')
|
|
headword_patterns_ids[valency_pattern_key].append(sentence_id)
|
|
|
|
# check if this is first sentence
|
|
# if first_sentence:
|
|
# one_element = next(iter(sentences_of_interest.items()))
|
|
# section_included = w_collection.count_documents({'_id': one_element[0],
|
|
# list(one_element[1].keys())[0]: list(one_element[1].values())[0]}) == 1
|
|
# first_sentence = False
|
|
if sentences_in_ram >= sentences_num_limit:
|
|
# print('print1:')
|
|
# print(time.time() - start_time)
|
|
start_time = time.time()
|
|
# !!!!!!!!!!!!!!!!!!!!!!print('Part %d finalized')
|
|
# print('Sentences in ram:')
|
|
# print(sentences_in_ram)
|
|
sentences_in_ram = 0
|
|
|
|
# [InsertOne({'y': 1}), DeleteOne({'x': 1}),
|
|
# ... ReplaceOne({'w': 1}, {'z': 1}, upsert=True)]
|
|
|
|
# requests = [{'_id': k, 'connections': v} for k, v in sentences_of_interest.items()]
|
|
# if 'GF0010453.1116.1' in sentences_of_interest:
|
|
# print('here')
|
|
if len(status_collection_update_list) > 0:
|
|
status_collection.bulk_write(status_collection_update_list)
|
|
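            # Each upserted document maps token indices (as strings) to (headword index, functor)
            # pairs; shape sketch with illustrative ids:
            #   {'_id': 'ssj1.1.1', '0': ('2', 'ACT'), '5': ('2', 'PAT')}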
requests = [UpdateOne({'_id': k}, {'$set': v}, upsert=True) for k, v in sentences_of_interest.items()]
|
|
# print('print2:')
|
|
# print(time.time() - start_time)
|
|
# start_time = time.time()
|
|
result = w_collection.bulk_write(requests)
|
|
|
|
# print('print3:')
|
|
# print(time.time() - start_time)
|
|
# start_time = time.time()
|
|
del status_collection_update_list
|
|
del requests
|
|
del sentences_of_interest
|
|
gc.collect()
|
|
|
|
# print('print4:')
|
|
# print(time.time() - start_time)
|
|
# start_time = time.time()
|
|
|
|
# print(part)
|
|
# print('HEADWORD')
|
|
# print(headword_text)
|
|
# pbar.update(1)
|
|
# part += 1
|
|
#
|
|
# w_collection.bulk_write(
|
|
# array.map((val) = >
|
|
# ({
|
|
# updateOne: {
|
|
# filter: {_id: val, uniqueid: 1001, atype: 1, ftype: 6},
|
|
# update: {
|
|
# $set: {epoch: 1548484978658, actionbyuserid: 110, title: 'Good Morning To All'}},
|
|
# upsert: true
|
|
# }
|
|
# })
|
|
# )
|
|
# })
|
|
|
|
# sentences_of_interest = {{'_id': k, 'connections': v} for k, v in sentences_of_interest.items()}
|
|
# w_collection.update_many({'_id': {'$exists': False}}, sentences_of_interest, upsert=True)
|
|
# try:
|
|
# w_collection.insert_many(sentences_of_interest, ordered=False)
|
|
# except pymongo.errors.BulkWriteError as e:
|
|
# print(e.details['writeErrors'])
|
|
status_collection_update_list = []
|
|
sentences_of_interest = {}
|
|
|
|
# first_sentence = True
|
|
|
|
sentences_in_ram += 1
|
|
pbar.update(1)
|
|
status_collection_update_list.append(InsertOne({'corpus_type': corpus_type, 'headword_text': headword_text, 'part': 'p1'}))
|
|
|
|
# if 'GF0010453.1116.1' in sentences_of_interest:
|
|
# a = sentences_of_interest['GF0010453.1116.1']
|
|
# print('here')
|
|
if len(status_collection_update_list) > 0:
|
|
status_collection.bulk_write(status_collection_update_list)
|
|
requests = [UpdateOne({'_id': k}, {'$set': v}, upsert=True) for k, v in sentences_of_interest.items()]
|
|
|
|
if len(requests) > 0:
|
|
result = w_collection.bulk_write(requests)
|
|
|
|
# sentences_of_interest = [{'_id': k, 'connections': v} for k, v in sentences_of_interest.items()]
|
|
# try:
|
|
# w_collection.insert_many(sentences_of_interest, ordered=False)
|
|
# except pymongo.errors.BulkWriteError as e:
|
|
# print(e.details['writeErrors'])
|
|
# sentences_of_interest = {}
|
|
# # else:
|
|
# # print('aaa')
|
|
# return sentences_of_interest
|
|
|
|
|
|
def create_sentence_output(sentence, headword_id, corpus):
|
|
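    # Builds a <corpusExample> element for one sentence. Each item of `sentence` is expected to be
    # ((text, glue), collocations, dependency_roles, lemma) as produced by extract_sentences();
    # words inside the headword's dependency tree are wrapped in <tree role="..."> and
    # collocation/headword components in <comp> elements.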
glue_outside = False
|
|
headword_id = str(headword_id)
|
|
parent_node = etree.Element('corpusExample')
|
|
parent_node.set('corpusName', corpus)
|
|
# parent_node.text = 'AAA'
|
|
# parent_node.prefix = 'BBB'
|
|
# parent_node.tail = 'CCC'
|
|
cur_node = parent_node
|
|
# formatted_sentence = ''
|
|
first_in_tag = True
|
|
first_outside_tag = False
|
|
in_dependency_tree = False
|
|
# TODO use whole sentence!
|
|
# for idi, word in enumerate(sentence):
|
|
|
|
# def idi_word_generator(sentence):
|
|
# idi = 0
|
|
# for word in sentence:
|
|
# if len(word.text) == 1 and re.match('^[\w]+$', word.text) is None:
|
|
# continue
|
|
# yield idi, word
|
|
# idi += 1
|
|
|
|
idi = 0
|
|
attach_to = None
|
|
p_cur_node = None
|
|
p_attach_to = None
|
|
p_glue_attach_to = None
|
|
previous_word = None
|
|
# if sentence[0][0][0] == 'Tako':
|
|
# print('here')
|
|
# for idi, word in idi_word_generator(sentence):
|
|
for word_id in range(len(sentence)):
|
|
# is_ending_tree = False
|
|
# SRL container output
|
|
word = sentence[word_id]
|
|
|
|
# sentence output
|
|
if in_dependency_tree:
|
|
if headword_id not in word[2] or in_dependency_tree != word[2][headword_id]:
|
|
attach_to = cur_node
|
|
# is_ending_tree = True
|
|
p_glue_attach_to = cur_node
|
|
cur_node = parent_node
|
|
if not first_in_tag:
|
|
# formatted_sentence += '\n'
|
|
first_in_tag = True
|
|
# formatted_sentence += '</tree>'
|
|
in_dependency_tree = False
|
|
first_outside_tag = True
|
|
|
|
if headword_id in word[2] and not in_dependency_tree:
|
|
dep_tree = lxml.SubElement(cur_node, 'tree')
|
|
dep_tree.set('role', word[2][headword_id])
|
|
cur_node = dep_tree
|
|
if not first_in_tag:
|
|
# formatted_sentence += '\n'
|
|
first_in_tag = True
|
|
# formatted_sentence += '<tree role="{}">'.format(word[2][headword_id])
|
|
in_dependency_tree = word[2][headword_id]
|
|
attach_to = None
|
|
if p_glue_attach_to is not None:
|
|
glue_outside = True
|
|
|
|
        if headword_id == str(idi) and not (len(word[0][0]) == 1 and re.match(r'^[\w]+$', word[0][0]) is None):
|
|
# if headword_id == idi:
|
|
comp = lxml.SubElement(cur_node, 'comp')
|
|
comp.set('role', 'headword')
|
|
|
|
if not first_outside_tag:
|
|
if p_attach_to is None:
|
|
if p_cur_node is not None:
|
|
p_cur_node.text += previous_word[0][1]
|
|
else:
|
|
p_attach_to.tail += previous_word[0][1]
|
|
elif p_glue_attach_to is not None:
|
|
if p_glue_attach_to.tail is None:
|
|
p_glue_attach_to.tail = previous_word[0][1]
|
|
else:
|
|
p_glue_attach_to.tail += previous_word[0][1]
|
|
# elif p_attach_to is not None:
|
|
# if p_attach_to.tail is None:
|
|
# p_attach_to.tail = previous_word[0][1]
|
|
# else:
|
|
# p_attach_to.tail += previous_word[0][1]
|
|
word_text = word[0][0]
|
|
comp.text = word_text
|
|
attach_to = comp
|
|
if not first_in_tag:
|
|
# formatted_sentence += '\n'
|
|
first_in_tag = True
|
|
first_outside_tag = True
|
|
p_cur_node = cur_node
|
|
p_glue_attach_to = comp
|
|
p_attach_to = attach_to
|
|
previous_word = word
|
|
# formatted_sentence += '<comp structure_id="headword">{}</comp>'.format(word[0][0])
|
|
idi += 1
|
|
continue
|
|
if word[1] and in_dependency_tree:
|
|
col_id = -1
|
|
for i, col in enumerate(word[1]):
|
|
if headword_id in col[3]:
|
|
col_id = i
|
|
break
|
|
|
|
if col_id != -1:
|
|
comp = lxml.SubElement(cur_node, 'comp')
|
|
comp.set('structure_id', word[1][col_id][0])
|
|
comp.set('num', word[1][col_id][1])
|
|
|
|
if not first_outside_tag:
|
|
if p_attach_to is None:
|
|
if p_cur_node is not None:
|
|
p_cur_node.text += previous_word[0][1]
|
|
else:
|
|
p_attach_to.tail += previous_word[0][1]
|
|
elif p_glue_attach_to is not None:
|
|
if p_glue_attach_to.tail is None:
|
|
p_glue_attach_to.tail = previous_word[0][1]
|
|
else:
|
|
p_glue_attach_to.tail += previous_word[0][1]
|
|
# elif p_attach_to is not None:
|
|
# if p_attach_to.tail is None:
|
|
# p_attach_to.tail = previous_word[0][1]
|
|
# else:
|
|
# p_attach_to.tail += previous_word[0][1]
|
|
word_text = word[0][0]
|
|
comp.text = word_text
|
|
attach_to = comp
|
|
if not first_in_tag:
|
|
# formatted_sentence += '\n'
|
|
first_in_tag = True
|
|
first_outside_tag = True
|
|
# Assuming one collocation per word
|
|
# formatted_sentence += '<comp structure_id="{}" num="{}">{}</comp>'.format(word[1][0][0], word[1][0][1], word[0][0])
|
|
p_cur_node = cur_node
|
|
p_glue_attach_to = comp
|
|
p_attach_to = attach_to
|
|
previous_word = word
|
|
idi += 1
|
|
continue
|
|
# collocation
|
|
# if not first_in_new_row:
|
|
# # formatted_sentence += ' '
|
|
# word_text = ' ' + word[0][0]
|
|
# else:
|
|
# word_text = word[0][0]
|
|
# if first_in_tag and previous_word:
|
|
# word_text = previous_word[0][1] + word[0][0]
|
|
# else:
|
|
# word_text = word[0][0]
|
|
# word_text += word[0][1]
|
|
# word_text = word[0][0] + word[0][1]
|
|
if not first_outside_tag:
|
|
if p_attach_to is None:
|
|
if p_cur_node is not None:
|
|
p_cur_node.text += previous_word[0][1]
|
|
else:
|
|
p_attach_to.tail += previous_word[0][1]
|
|
word_text = word[0][0]
|
|
else:
|
|
word_text = ''
|
|
if p_attach_to is None:
|
|
if p_cur_node is not None:
|
|
word_text += previous_word[0][1]
|
|
else:
|
|
word_text += previous_word[0][1]
|
|
if glue_outside:
|
|
p_glue_attach_to.tail = previous_word[0][1]
|
|
word_text = word[0][0]
|
|
else:
|
|
word_text += word[0][0]
|
|
|
|
|
|
if attach_to is None:
|
|
if cur_node.text is None:
|
|
cur_node.text = word_text
|
|
else:
|
|
cur_node.text += word_text
|
|
else:
|
|
if attach_to.tail is None:
|
|
attach_to.tail = word_text
|
|
else:
|
|
attach_to.tail += word_text
|
|
# attach_to.tail +=word[0][0]
|
|
# formatted_sentence += word[0][0]
|
|
first_in_tag = False
|
|
first_outside_tag = False
|
|
|
|
p_cur_node = cur_node
|
|
p_attach_to = attach_to
|
|
previous_word = word
|
|
|
|
p_glue_attach_to = None
|
|
|
|
        if len(word[0][0]) == 1 and re.match(r'^[\w]+$', word[0][0]) is None:
|
|
continue
|
|
idi += 1
|
|
|
|
return parent_node
|
|
|
|
|
|
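# Accumulates, per semantic role, which syntactic structures and components realise it:
# summary[role][structure_id][component_num] is a set of (word_text, case_letter, lemma) triples,
# collected only for components whose MSD starts with 'S'.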
def get_SRLcontainer_data(sentence, word_of_interest_id, summary):
    for word in sentence:
        if word_of_interest_id in word[2]:
            for col in word[1]:
                if word_of_interest_id in col[3]:
                    if word[2][word_of_interest_id] not in summary:
                        summary[word[2][word_of_interest_id]] = {}
                    if col[0] not in summary[word[2][word_of_interest_id]]:
                        summary[word[2][word_of_interest_id]][col[0]] = {}
                    # word_of_interest_included = word_of_interest_id in col[3]
                    if col[1] not in summary[word[2][word_of_interest_id]][col[0]]:
                        summary[word[2][word_of_interest_id]][col[0]][col[1]] = set()
                    if col[2][0] == 'S':
                        summary[word[2][word_of_interest_id]][col[0]][col[1]].add((word[0][0], col[2][1], word[3]))

    return summary
|
|
|
|
|
|
def valid_valency_pattern(valency_pattern_key):
    occurences = set()
    for v_p in valency_pattern_key:
        if v_p in occurences:
            return False
        occurences.add(v_p)
    return True
|
|
|
|
|
|
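# Collects valency patterns for a single headword from the given corpus collection. Pattern keys
# are tuples of semantic roles ordered by their position in `translations`, example sentences are
# capped at `examples_num` (a budget shared with the ssj examples already gathered), and newly
# seen patterns get ids registered in valency_pattern_id_collection.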
def obtain_xml_data(collection, w_a_collection, headword_text, RF, mongo, patterns, pattern_id_max, valency_pattern_id_collection, corpus, examples_num, headword_patterns_ssj):
|
|
cur = collection.find({"headwords": headword_text})
|
|
frames = []
|
|
for ent in cur:
|
|
frames += frames_from_db_entry_headword(ent, headword_text)
|
|
cur.close()
|
|
|
|
ret_frames = RF(frames, mongo.db.sensemap)
|
|
json_ret = {"frames": []}
|
|
for frame in ret_frames:
|
|
frame_json = frame.to_json()
|
|
json_ret["frames"].append(frame_json)
|
|
|
|
# get xml values
|
|
headword_patterns = {}
|
|
new_patterns = {}
|
|
|
|
for hws in json_ret.values():
|
|
|
|
for hw in hws:
|
|
# generate valency pattern key
|
|
|
|
valency_pattern_key = []
|
|
for slot in hw['slots']:
|
|
valency_pattern_key.append(slot['functor'])
|
|
# sort valency_pattern_key by order provided in translations
|
|
valency_pattern_key_new = []
|
|
for key in translations:
|
|
if key in valency_pattern_key:
|
|
valency_pattern_key_new.append(key)
|
|
valency_pattern_key = tuple(valency_pattern_key_new)
|
|
|
|
if valency_pattern_key not in headword_patterns:
|
|
headword_patterns[valency_pattern_key] = {}
|
|
headword_patterns[valency_pattern_key]['sentence_examples'] = []
|
|
headword_patterns[valency_pattern_key]['sentence_num'] = 0
|
|
headword_patterns[valency_pattern_key]['sr_data'] = {}
|
|
if valency_pattern_key not in patterns and valency_pattern_key not in new_patterns:
|
|
new_patterns[valency_pattern_key] = pattern_id_max
|
|
patterns[valency_pattern_key] = pattern_id_max
|
|
pattern_id_max += 1
|
|
headword_patterns[valency_pattern_key]['id'] = patterns[valency_pattern_key]
|
|
|
|
sr_data = headword_patterns[valency_pattern_key]['sr_data']
|
|
tids = set(hw['tids'])
|
|
|
|
if valency_pattern_key in headword_patterns_ssj:
|
|
ssj_len = len(headword_patterns_ssj[valency_pattern_key]['sentence_examples'])
|
|
else:
|
|
ssj_len = 0
|
|
|
|
for sentence in hw['sentences']:
|
|
# sentences_of_interest.append(sentence[0])
|
|
# get sentence example
|
|
# sentence_example = []
|
|
sent_id = sentence[0][0].rsplit('.', 1)[0]
|
|
|
|
try:
|
|
db_sentence = next(iter(w_a_collection.find({'_id': sent_id})))['words']
|
|
except StopIteration:
|
|
continue
|
|
|
|
# if valency_pattern_key == ('ACT', 'PAT'):
|
|
# print('am')
|
|
|
|
# idi = 0
|
|
idi = 0
|
|
hw_idi = -1
|
|
for word_id, word in sentence:
|
|
if word_id in tids:
|
|
hw_idi = idi
|
|
if word['word']:
|
|
idi += 1
|
|
if hw_idi == -1:
|
|
raise Exception('No such headword idi!')
|
|
# for idi, word in idi_word_generator(sentence):
|
|
# print('here')
|
|
# for word_id, word_dict in sentence:
|
|
# # TODO Modify sentence!
|
|
# # if formatted_sentences[sent_id]
|
|
# sentence_example.append(word_dict['text'])
|
|
# if word_dict['word']:
|
|
# idi += 1
|
|
# if sent_id == 'ssj134.880.3375':
|
|
# print('here')
|
|
# if sent_id == 'ssj38.227.917':
|
|
# print('here')
|
|
# if sent_id == 'GF0004627.1913.1':
|
|
# print('here')
|
|
# print(sent_id)
|
|
# print([a for a in w_a_collection.find()])
|
|
|
|
# if valency_pattern_key == ('ACT', 'PAT'):
|
|
# print('here')
|
|
|
|
sr_data = get_SRLcontainer_data(db_sentence, str(hw_idi), sr_data)
|
|
examples_included_num = 0
|
|
|
|
# sr_data = get_SRLcontainer_data(formatted_sentences[sent_id], hw_idi, sr_data)
|
|
if len(headword_patterns[valency_pattern_key]['sentence_examples']) + ssj_len < examples_num and valid_valency_pattern(valency_pattern_key):
|
|
examples_included_num += 1
|
|
sentence_example = create_sentence_output(db_sentence, hw_idi, corpus)
|
|
# sentence_example = create_sentence_output(formatted_sentences[sent_id], hw_idi)
|
|
|
|
# sentence_example = ''.join(sentence_example)
|
|
# headword_patterns[valency_pattern_key]['sentence_examples'].append(sentence_example)
|
|
headword_patterns[valency_pattern_key]['sentence_examples'].append(sentence_example)
|
|
headword_patterns[valency_pattern_key]['sentence_num'] += 1
|
|
|
|
headword_patterns[valency_pattern_key]['sr_data'] = sr_data
|
|
|
|
# add patterns to db
|
|
new_patterns_query = [InsertOne({'_id': v, 'semantic_roles': list(k)}) for k, v in new_patterns.items()]
|
|
if len(new_patterns_query) > 0:
|
|
result = valency_pattern_id_collection.bulk_write(new_patterns_query)
|
|
|
|
|
|
# calculate statistics
|
|
semantic_role_stats = {}
|
|
sentence_tot = 0
|
|
pattern_tot = len(headword_patterns)
|
|
for key, val in headword_patterns.items():
|
|
sentence_num = val['sentence_num']
|
|
for sr in key:
|
|
if sr in semantic_role_stats:
|
|
semantic_role_stats[sr]['valency_pattern_num'] += 1
|
|
semantic_role_stats[sr]['valency_sentence_num'] += sentence_num
|
|
else:
|
|
semantic_role_stats[sr] = {}
|
|
semantic_role_stats[sr]['valency_pattern_num'] = 1
|
|
semantic_role_stats[sr]['valency_sentence_num'] = sentence_num
|
|
sentence_tot += sentence_num
|
|
|
|
return headword_patterns, semantic_role_stats, sentence_tot, pattern_tot, pattern_id_max
|
|
|
|
|
|
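# For each headword: look up lexeme, frequency, senses and definitions in the Sloleks database,
# merge the ssj500k and (optionally) Gigafida pattern data, and write one
# VS10_<headword>_<corpus_name>.xml file per headword into args.outdir.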
def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, corpus_name, pattern_examples_limit, ignore_gigafida, pbar):
|
|
query_general = session.query(Lexeme.id, LexicalUnitLexeme.id, LexicalUnit.id, LexicalUnitMeasure.value,
|
|
Lexeme.dummy, LexicalUnitType.name) \
|
|
.join(Category, Category.id == Lexeme.category_id) \
|
|
.join(LexicalUnitLexeme, LexicalUnitLexeme.lexeme_id == Lexeme.id) \
|
|
.join(LexicalUnit, LexicalUnit.id == LexicalUnitLexeme.lexical_unit_id) \
|
|
.join(LexicalUnitType, LexicalUnitType.id == LexicalUnit.type_id) \
|
|
.join(LexicalUnitMeasure, LexicalUnitMeasure.lexical_unit_id == LexicalUnit.id) \
|
|
.join(Measure, Measure.id == LexicalUnitMeasure.measure_id) \
|
|
.join(Corpus, Corpus.id == LexicalUnitMeasure.corpus_id) \
|
|
.filter(LexicalUnitType.name == 'single_lexeme_unit') \
|
|
.filter(Measure.name == 'frequency') \
|
|
.filter(Corpus.name == 'gigafida') \
|
|
.filter(Corpus.version == '2.0')
|
|
|
|
# valency_pattern_id_collection.find()
|
|
|
|
|
|
# used to not repeat search queries for prepositions
|
|
preposition_list = {}
|
|
for headword_text, category_text in headword_category:
|
|
# with lxml.xmlfile('data/output.xml', encoding='utf-8') as xf:
|
|
|
|
# a = [a for a in valency_pattern_id_collection.find()]
|
|
patterns = {tuple(v_p['semantic_roles']): v_p['_id'] for v_p in [a for a in valency_pattern_id_collection.find()]}
|
|
# patterns = {}
|
|
pattern_id_max = len(patterns) + 1
|
|
|
|
# pattern_examples_limit = 4
|
|
|
|
# get data
|
|
headword_patterns_ssj, semantic_role_stats_ssj, sentence_tot_ssj, pattern_tot_ssj, pattern_id_max = obtain_xml_data(collection_ssj, w_a_collection_ssj,
|
|
headword_text, RF, mongo, patterns, pattern_id_max, valency_pattern_id_collection, 'ssj500k 2.2', pattern_examples_limit,
|
|
{})
|
|
|
|
if not ignore_gigafida:
|
|
headword_patterns_gf, semantic_role_stats_gf, sentence_tot_gf, pattern_tot_gf, pattern_id_max = obtain_xml_data(collection_gigafida,
|
|
w_a_collection_gigafida,
|
|
headword_text, RF,
|
|
mongo, patterns,
|
|
pattern_id_max, valency_pattern_id_collection, 'Gigafida 2.0', pattern_examples_limit, headword_patterns_ssj)
|
|
|
|
|
|
wf1 = aliased(WordFormFeature)
|
|
wf2 = aliased(WordFormFeature)
|
|
wf3 = aliased(WordFormFeature)
|
|
query_preposition = session.query(FormRepresentation.form) \
|
|
.join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
|
|
.join(Lexeme, Lexeme.id == WordForm.lexeme_id) \
|
|
.join(wf1, wf1.word_form_id == WordForm.id) \
|
|
.join(wf2, wf2.word_form_id == WordForm.id) \
|
|
.join(wf3, wf3.word_form_id == WordForm.id) \
|
|
.filter(Lexeme.lemma == headword_text) \
|
|
.filter(wf1.value == 'singular') \
|
|
.filter(wf2.value == 'third') \
|
|
.filter(wf3.value == 'present')
|
|
pattern_translation_hws = query_preposition.all()
|
|
|
|
pattern_translation_3_sin = headword_text
|
|
if len(pattern_translation_hws) == 1:
|
|
pattern_translation_3_sin = pattern_translation_hws[0].form
|
|
|
|
qname = etree.QName("http://www.w3.org/2001/XMLSchema-instance", "noNamespaceSchemaLocation")
|
|
dictionary = lxml.Element('dictionary', {qname: 'valency_lexicon.xsd'})
|
|
|
|
|
|
|
|
if headword_text[-1] == '_':
|
|
headword_text_query = headword_text[:-1]
|
|
else:
|
|
headword_text_query = headword_text
|
|
query = query_general.filter(Category.name == category_text) \
|
|
.filter(Lexeme.lemma == headword_text_query) \
|
|
.group_by(Lexeme.id, LexicalUnitLexeme.id, LexicalUnit.id, LexicalUnitMeasure.value,
|
|
LexicalUnitType.name)
|
|
|
|
# res = query.one_or_none()
|
|
query_res = query.all()
|
|
|
|
# query2 = session.query(Lexeme.id) \
|
|
# .join(Category, Category.id == Lexeme.category_id) \
|
|
# .join(LexicalUnitLexeme, LexicalUnitLexeme.lexeme_id == Lexeme.id) \
|
|
# .join(LexicalUnit, LexicalUnit.id == LexicalUnitLexeme.lexical_unit_id) \
|
|
# .join(LexicalUnitType, LexicalUnitType.id == LexicalUnit.type_id) \
|
|
# .join(LexicalUnitMeasure, LexicalUnitMeasure.lexical_unit_id == LexicalUnit.id) \
|
|
# .join(Measure, Measure.id == LexicalUnitMeasure.measure_id) \
|
|
# .join(Corpus, Corpus.id == LexicalUnitMeasure.corpus_id) \
|
|
# .join(LexemeFeature, LexemeFeature.lexeme_id == Lexeme.id) \
|
|
# .join(Feature, Feature.id == LexemeFeature.feature_id) \
|
|
# .filter(LexicalUnitType.name == 'single_lexeme_unit') \
|
|
# .filter(Measure.name == 'frequency') \
|
|
# .filter(Category.name == 'preposition') \
|
|
# .filter(Lexeme.lemma == 'za') \
|
|
# .filter(Feature.name == 'case') \
|
|
# .filter(LexemeFeature.value == 'instrumental') \
|
|
# .group_by(Lexeme.id)
|
|
|
|
# query2 = session.query(Lexeme.id) \
|
|
# .join(Category, Category.id == Lexeme.category_id) \
|
|
# .join(LexemeFeature, LexemeFeature.lexeme_id == Lexeme.id) \
|
|
# .join(Feature, Feature.id == LexemeFeature.feature_id) \
|
|
# .filter(Lexeme.lemma == 'za') \
|
|
# .filter(Feature.name == 'case') \
|
|
# .filter(LexemeFeature.value == 'instrumental') \
|
|
# .group_by(Lexeme.id)
|
|
#
|
|
# a = query2.all()
|
|
|
|
if len(query_res) == 1:
|
|
(lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, frequency, _, lexical_unit_type_name) = \
|
|
query_res[0]
|
|
|
|
elif len(query_res) > 1:
|
|
# all lexical_unit_ids equal or at least one dummy
|
|
final_lexical_unit_id = 0
|
|
final_lexical_unit_lexeme_id = 0
|
|
for r in query_res:
|
|
(lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, frequency, dummy,
|
|
lexical_unit_type_name) = r
|
|
if dummy:
|
|
final_lexical_unit_id = lexical_unit_id
|
|
final_lexical_unit_lexeme_id = lexical_unit_lexeme_id
|
|
break
|
|
lexical_unit_id = final_lexical_unit_id
|
|
lexical_unit_lexeme_id = final_lexical_unit_lexeme_id
|
|
else:
|
|
frequency = 0
|
|
lexeme_id = 0
|
|
lexical_unit_id = 0
|
|
lexical_unit_lexeme_id = 0
|
|
lexical_unit_type_name = ''
|
|
|
|
sense_ids = session.query(Sense.id, Sense.dummy).filter(Sense.lexical_unit_id == lexical_unit_id).all()
|
|
features = session.query(LexemeFeature.value).join(Feature, Feature.id == LexemeFeature.feature_id) \
|
|
.filter(LexemeFeature.lexeme_id == lexeme_id) \
|
|
.filter(Feature.name == 'aspect').all()
|
|
|
|
entry = lxml.SubElement(dictionary, 'entry')
|
|
|
|
head = lxml.SubElement(entry, 'head')
|
|
|
|
headword = lxml.SubElement(head, 'headword')
|
|
lemma = lxml.SubElement(headword, 'lemma')
|
|
lemma.text = headword_text
|
|
|
|
lexical_unit = lxml.SubElement(head, 'lexicalUnit')
|
|
lexical_unit.set('id', str(lexical_unit_id))
|
|
lexical_unit_type_name = 'single' if lexical_unit_type_name == 'single_lexeme_unit' else lexical_unit_type_name
|
|
lexical_unit.set('type', lexical_unit_type_name)
|
|
lexeme = lxml.SubElement(lexical_unit, 'lexeme')
|
|
lexeme.set('lexical_unit_lexeme_id', str(lexical_unit_lexeme_id))
|
|
lexeme.text = headword_text
|
|
|
|
grammar = lxml.SubElement(head, 'grammar')
|
|
category = lxml.SubElement(grammar, 'category')
|
|
if args.language == 'sl':
|
|
category.text = CATEGORY_MAP[category_text] if category_text in CATEGORY_MAP else ''
|
|
else:
|
|
category.text = category_text
|
|
grammarFeature = lxml.SubElement(grammar, 'grammarFeature')
|
|
if args.language == 'sl':
|
|
grammarFeature.set('name', 'vid')
|
|
grammarFeature.text = ASPECT_MAP[features[0].value] if len(features) > 0 and features[
|
|
0].value in ASPECT_MAP else ''
|
|
else:
|
|
grammarFeature.set('name', 'aspect')
|
|
grammarFeature.text = features[0].value if len(features) > 0 else ''
|
|
|
|
measureList = lxml.SubElement(head, 'measureList')
|
|
measure = lxml.SubElement(measureList, 'measure')
|
|
measure.set('type', 'frequency')
|
|
# TODO Modify this!
|
|
measure.set('source', 'Gigafida 2.0')
|
|
# measure.set('source', 'ssj500k')
|
|
measure.text = str(int(frequency))
|
|
|
|
|
|
|
|
|
|
|
|
body = lxml.SubElement(entry, 'body')
|
|
statisticsContainerList = lxml.SubElement(body, 'statisticsContainerList')
|
|
|
|
# combine semantic_role_stats
|
|
semantic_role_stats = {}
|
|
for semanticRole_val, semanticRole_stats in semantic_role_stats_ssj.items():
|
|
semantic_role_stats[semanticRole_val] = {}
|
|
semantic_role_stats[semanticRole_val]['ssj'] = semanticRole_stats
|
|
|
|
if not ignore_gigafida:
|
|
for semanticRole_val, semanticRole_stats in semantic_role_stats_gf.items():
|
|
if semanticRole_val not in semantic_role_stats:
|
|
semantic_role_stats[semanticRole_val] = {}
|
|
semantic_role_stats[semanticRole_val]['gf'] = semanticRole_stats
|
|
|
|
for semanticRole_val, semanticRole_stats in semantic_role_stats.items():
|
|
statisticsContainer = lxml.SubElement(statisticsContainerList, 'statisticsContainer')
|
|
semanticRole = lxml.SubElement(statisticsContainer, 'semanticRole')
|
|
semanticRole.text = semanticRole_val
|
|
measureList = lxml.SubElement(statisticsContainer, 'measureList')
|
|
if 'ssj' in semanticRole_stats:
|
|
measure_pattern_ssj = lxml.SubElement(measureList, 'measure')
|
|
measure_pattern_ssj.set('type', 'valency_pattern_ratio')
|
|
measure_pattern_ssj.set('source', 'ssj500k 2.2')
|
|
measure_pattern_ssj.text = '%.4f' % (
|
|
semantic_role_stats[semanticRole_val]['ssj']['valency_pattern_num'] / pattern_tot_ssj)
|
|
measure_sentence_ssj = lxml.SubElement(measureList, 'measure')
|
|
measure_sentence_ssj.set('type', 'valency_sentence_ratio')
|
|
measure_sentence_ssj.set('source', 'ssj500k 2.2')
|
|
|
|
if sentence_tot_ssj == 0:
|
|
measure_sentence_ssj.text = '%.4f' % (0.0)
|
|
# print(headword_text)
|
|
# print(semanticRole_val)
|
|
# print(semantic_role_stats[semanticRole_val]['gf']['valency_sentence_num'])
|
|
else:
|
|
measure_sentence_ssj.text = '%.4f' % (
|
|
semantic_role_stats[semanticRole_val]['ssj']['valency_sentence_num'] / sentence_tot_ssj)
|
|
|
|
# measure_sentence_ssj.text = '%.2f' % (
|
|
# semantic_role_stats[semanticRole_val]['ssj']['valency_sentence_num'] / sentence_tot_ssj)
|
|
if 'gf' in semanticRole_stats and not ignore_gigafida:
|
|
measure_pattern_gf = lxml.SubElement(measureList, 'measure')
|
|
measure_pattern_gf.set('type', 'valency_pattern_ratio')
|
|
measure_pattern_gf.set('source', 'Gigafida 2.0')
|
|
measure_pattern_gf.text = '%.4f' % (
|
|
semantic_role_stats[semanticRole_val]['gf']['valency_pattern_num'] / pattern_tot_gf)
|
|
measure_sentence_gf = lxml.SubElement(measureList, 'measure')
|
|
measure_sentence_gf.set('type', 'valency_sentence_ratio')
|
|
measure_sentence_gf.set('source', 'Gigafida 2.0')
|
|
if sentence_tot_gf == 0:
|
|
measure_sentence_gf.text = '%.4f' % (0.0)
|
|
# print(headword_text)
|
|
# print(semanticRole_val)
|
|
# print(semantic_role_stats[semanticRole_val]['gf']['valency_sentence_num'])
|
|
else:
|
|
measure_sentence_gf.text = '%.4f' % (
|
|
semantic_role_stats[semanticRole_val]['gf']['valency_sentence_num'] / sentence_tot_gf)
|
|
|
|
senseList = lxml.SubElement(body, 'senseList')
|
|
for sense_id in sense_ids:
|
|
if len(sense_ids) > 1 and sense_id.dummy:
|
|
continue
|
|
|
|
sense = lxml.SubElement(senseList, 'sense')
|
|
if not sense_id.dummy:
|
|
sense.set('id', str(sense_id.id))
|
|
|
|
definitionList = lxml.SubElement(sense, 'definitionList')
|
|
|
|
definition_texts = session.query(Definition.description).filter(
|
|
Definition.sense_id == sense_id.id).all()
|
|
|
|
for definition_text in definition_texts:
|
|
definition = lxml.SubElement(definitionList, 'definition')
|
|
definition.text = definition_text[0]
|
|
|
|
syntactic_structures = session.query(SyntacticStructure.id, SyntacticStructure.name,
|
|
StructureComponent.id, StructureComponent.name).join(
|
|
LexicalUnit, LexicalUnit.syntactic_structure_id == SyntacticStructure.id) \
|
|
.join(StructureComponent, StructureComponent.syntactic_structure_id == SyntacticStructure.id) \
|
|
.filter(LexicalUnit.id == sense_id.id)
|
|
|
|
# .join(LexicalUnitLexeme, LexicalUnitLexeme.structure_component_id == StructureComponent.id) \
|
|
|
|
# syntactic_structures2 = session.query(SyntacticStructure.id, SyntacticStructure.name).join(SyntacticStructure, SyntacticStructure.id == LexicalUnit.syntactic_structure_id) \
|
|
# .filter(SyntacticStructure.id == sense_id)
|
|
|
|
syntactic_structuresr = syntactic_structures.all()
|
|
# syntactic_structures2r = syntactic_structures2.all()
|
|
|
|
valencyPatternList = lxml.SubElement(sense, 'valencyPatternList')
|
|
valencyPatternList.set('system', 'JOS')
|
|
|
|
# combine semantic_role_stats ##################################
|
|
headword_patterns = {}
|
|
for headword_patterns_val, headword_patterns_stats in headword_patterns_ssj.items():
|
|
headword_patterns[headword_patterns_val] = {}
|
|
headword_patterns[headword_patterns_val]['ssj'] = headword_patterns_stats
|
|
|
|
if not ignore_gigafida:
|
|
for headword_patterns_val, headword_patterns_stats in headword_patterns_gf.items():
|
|
if headword_patterns_val not in headword_patterns:
|
|
headword_patterns[headword_patterns_val] = {}
|
|
headword_patterns[headword_patterns_val]['gf'] = headword_patterns_stats
|
|
#################################################################
|
|
for headword_pattern, headword_pattern_dict in headword_patterns.items():
|
|
valencyPattern = lxml.SubElement(valencyPatternList, 'valencyPattern')
|
|
valencyPattern.set('id', str(patterns[headword_pattern]))
|
|
measureList_sense = lxml.SubElement(valencyPattern, 'measureList')
|
|
if 'ssj' in headword_pattern_dict:
|
|
measure_sense = lxml.SubElement(measureList_sense, 'measure')
|
|
measure_sense.set('type', 'frequency_all')
|
|
measure_sense.set('source', 'ssj500k 2.2')
|
|
measure_sense.text = str(headword_pattern_dict['ssj']['sentence_num'])
|
|
if not ignore_gigafida and 'gf' in headword_pattern_dict and headword_pattern_dict['gf']['sentence_num']:
|
|
measure_sense = lxml.SubElement(measureList_sense, 'measure')
|
|
measure_sense.set('type', 'frequency_all')
|
|
measure_sense.set('source', 'Gigafida 2.0')
|
|
measure_sense.text = str(headword_pattern_dict['gf']['sentence_num'])
|
|
semanticRoleContainerList = lxml.SubElement(valencyPattern, 'semanticRoleContainerList')
|
|
# patternId = lxml.SubElement(semanticRoles, 'patternId')
|
|
# patternId.text = str(patterns[headword_pattern])
|
|
|
|
if 'ACT' in headword_pattern:
|
|
patternTranslationText = 'KDO/KAJ ' + pattern_translation_3_sin
|
|
else:
|
|
patternTranslationText = headword_text
|
|
for semantic_role in headword_pattern:
|
|
if semantic_role != 'ACT':
|
|
# additional rules
|
|
# if semantic_role == 'RESLT':
|
|
# pass
|
|
# else:
|
|
# patternTranslationText += ' ' + translations[semantic_role]
|
|
patternTranslationText += ' ' + translations[semantic_role]
|
|
semanticRoleContainer = lxml.SubElement(semanticRoleContainerList, 'semanticRoleContainer')
|
|
semanticRole = lxml.SubElement(semanticRoleContainer, 'semanticRole')
|
|
semanticRole.text = semantic_role
|
|
|
|
syntactic_structure_dict = {}
|
|
|
|
if 'ssj' in headword_pattern_dict and semantic_role in headword_pattern_dict['ssj']['sr_data']:
|
|
for syn_struct_id, syn_struct_dict in headword_pattern_dict['ssj']['sr_data'][semantic_role].items():
|
|
if syn_struct_id not in syntactic_structure_dict:
|
|
syntactic_structure_dict[syn_struct_id] = {}
|
|
for com_num, com_set in syn_struct_dict.items():
|
|
if com_num not in syntactic_structure_dict[syn_struct_id]:
|
|
syntactic_structure_dict[syn_struct_id][com_num] = set()
|
|
for lex in com_set:
|
|
syntactic_structure_dict[syn_struct_id][com_num].add(lex)
|
|
|
|
if 'gf' in headword_pattern_dict and semantic_role in headword_pattern_dict['gf']['sr_data']:
|
|
for syn_struct_id, syn_struct_dict in headword_pattern_dict['gf']['sr_data'][semantic_role].items():
|
|
if syn_struct_id not in syntactic_structure_dict:
|
|
syntactic_structure_dict[syn_struct_id] = {}
|
|
for com_num, com_set in syn_struct_dict.items():
|
|
if com_num not in syntactic_structure_dict[syn_struct_id]:
|
|
syntactic_structure_dict[syn_struct_id][com_num] = set()
|
|
for lex in com_set:
|
|
syntactic_structure_dict[syn_struct_id][com_num].add(lex)
|
|
|
|
if len(syntactic_structure_dict) > 0:
|
|
syntacticStructureList = lxml.SubElement(semanticRoleContainer, 'syntacticStructureList')
|
|
# iterate over syntactic structures and write them
|
|
for syn_struct_id, component_dict in syntactic_structure_dict.items():
|
|
syntacticStructure = lxml.SubElement(syntacticStructureList, 'syntacticStructure')
|
|
syntacticStructure.set('id', syn_struct_id)
|
|
for comp_id, lexemes in component_dict.items():
|
|
for l in lexemes:
|
|
component = lxml.SubElement(syntacticStructure, 'component')
|
|
component.set('num', comp_id)
|
|
lexem = lxml.SubElement(component, 'lexeme')
|
|
|
|
if l in preposition_list:
|
|
prep_id = preposition_list[l]
|
|
else:
|
|
query_preposition = session.query(Lexeme.id) \
|
|
.join(Category, Category.id == Lexeme.category_id) \
|
|
.join(LexemeFeature, LexemeFeature.lexeme_id == Lexeme.id) \
|
|
.join(Feature, Feature.id == LexemeFeature.feature_id) \
|
|
.filter(Lexeme.lemma == l[2]) \
|
|
.filter(Feature.name == 'case') \
|
|
.filter(LexemeFeature.value == CASE_MAP[l[1]]) \
|
|
.group_by(Lexeme.id)
|
|
preposition_ids = query_preposition.all()
|
|
if len(preposition_ids) != 1:
|
|
prep_id = ''
|
|
else:
|
|
prep_id = str(preposition_ids[0][0])
|
|
preposition_list[l] = prep_id
|
|
|
|
|
|
lexem.set('sloleks', prep_id)
|
|
lexem.text = l[2]
|
|
|
|
patternRepresentation = lxml.SubElement(valencyPattern, 'patternRepresentation')
|
|
patternRepresentation.text = patternTranslationText
|
|
|
|
exampleContainerList = lxml.SubElement(valencyPattern, 'exampleContainerList')
|
|
if 'ssj' in headword_pattern_dict:
|
|
for sentence_example in headword_pattern_dict['ssj']['sentence_examples']:
|
|
exampleContainer = lxml.SubElement(exampleContainerList, 'exampleContainer')
|
|
# corpusExample = lxml.SubElement(exampleContainer, 'corpusExample')
|
|
exampleContainer.append(sentence_example)
|
|
|
|
if 'gf' in headword_pattern_dict:
|
|
for sentence_example in headword_pattern_dict['gf']['sentence_examples']:
|
|
exampleContainer = lxml.SubElement(exampleContainerList, 'exampleContainer')
|
|
# corpusExample = lxml.SubElement(exampleContainer, 'corpusExample')
|
|
exampleContainer.append(sentence_example)
|
|
with lxml.xmlfile(os.path.join(args.outdir, 'VS10_' + headword_text + '_' + corpus_name + '.xml'),
|
|
encoding='utf-8') as xf:
|
|
xf.write(dictionary, pretty_print=True)
|
|
pbar.update(1)
|
|
|
|
|
|
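# Reflects the existing Sloleks PostgreSQL schema and binds the module-level table classes.
# The `db` argument is a colon-separated credential string, e.g. (placeholder values):
#   user:password:database:host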
def init_db(db):
|
|
global Lexeme, LexemeFeature, SyntacticStructure, StructureComponent, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
|
|
[db_user, db_password, db_database, db_host] = db.split(':')
|
|
Base = declarative_base()
|
|
engine = create_engine('postgresql://' + db_user + ':' + db_password + '@' + db_host + '/' + db_database,
|
|
pool_recycle=14400)
|
|
Base.metadata.reflect(engine)
|
|
|
|
class Lexeme(Base):
|
|
__table__ = Base.metadata.tables['jedro_lexeme']
|
|
|
|
class LexemeFeature(Base):
|
|
__table__ = Base.metadata.tables['jedro_lexeme_feature']
|
|
|
|
class SyntacticStructure(Base):
|
|
__table__ = Base.metadata.tables['jedro_syntacticstructure']
|
|
|
|
class StructureComponent(Base):
|
|
__table__ = Base.metadata.tables['jedro_structurecomponent']
|
|
|
|
class Feature(Base):
|
|
__table__ = Base.metadata.tables['jedro_feature']
|
|
|
|
class LexicalUnitLexeme(Base):
|
|
__table__ = Base.metadata.tables['jedro_lexicalunit_lexeme']
|
|
|
|
class LexicalUnit(Base):
|
|
__table__ = Base.metadata.tables['jedro_lexicalunit']
|
|
|
|
class LexicalUnitType(Base):
|
|
__table__ = Base.metadata.tables['jedro_lexicalunittype']
|
|
|
|
class Category(Base):
|
|
__table__ = Base.metadata.tables['jedro_category']
|
|
|
|
class Sense(Base):
|
|
__table__ = Base.metadata.tables['jedro_sense']
|
|
|
|
class Measure(Base):
|
|
__table__ = Base.metadata.tables['jedro_measure']
|
|
|
|
class LexicalUnitMeasure(Base):
|
|
__table__ = Base.metadata.tables['jedro_lexicalunitmeasure']
|
|
|
|
class Corpus(Base):
|
|
__table__ = Base.metadata.tables['jedro_corpus']
|
|
|
|
class Definition(Base):
|
|
__table__ = Base.metadata.tables['jedro_definition']
|
|
|
|
class WordForm(Base):
|
|
__table__ = Base.metadata.tables['jedro_wordform']
|
|
|
|
class WordFormFeature(Base):
|
|
__table__ = Base.metadata.tables['jedro_wordform_feature']
|
|
|
|
class FormRepresentation(Base):
|
|
__table__ = Base.metadata.tables['jedro_formrepresentation']
|
|
|
|
return engine
|
|
|
|
|
|
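# Runs every syntactic structure matcher over the sentence and keeps only matches that contain
# a verb (MSD starting with 'V'); each result is [match, colocation_id], where colocation_id is
# the structure id followed by the sorted (component, lemma) pairs.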
def match_file(words, structures):
    matches = []

    for s in structures:
        for w in words:
            mhere = s.match(w)
            for match in mhere:
                # save only those with verbs in them
                if not [True for m in match.values() if m.msd[0] == 'V']:
                    continue

                colocation_id = [(idx, w.lemma) for idx, w in match.items()]
                colocation_id = [s.id] + list(sorted(colocation_id, key=lambda x: x[0]))
                colocation_id = tuple(colocation_id)

                matches.append([match, colocation_id])

    return matches
|
|
|
|
|
|
possible_jos_links = {'dol', 'del', 'prir', 'vez', 'skup', 'ena', 'dve', 'tri', 'štiri', 'modra'}
|
|
|
|
|
|
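# Recursively tags all dependency-tree descendants of `word` (reached over the allowed jos links)
# with the headword id and semantic role. Returns False if a cycle is detected, i.e. a word that
# is already among its own ancestors is revisited.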
def find_word_sons(word, deppar_dict, word_id, role, parents):
    if word.id in parents:
        return False
    for k, v in word.links.items():
        for w in v:
            # if k in possible_jos_links and w.id == 'ssj1.1.1.t21':
            #     print('here')
            if k in possible_jos_links:
                if w.id not in deppar_dict:
                    deppar_dict[w.id] = {}
                deppar_dict[w.id][word_id] = role
                if not find_word_sons(w, deppar_dict, word_id, role, parents + [word.id]):
                    return False
            # elif k in possible_jos_links:
            #     raise Exception('One word in multiple dependency parsetrees')
    return True
|
|
|
|
# for ignoring punctuation
def idi_word_generator(sentence):
    idi = 0
    for word in sentence:
        if len(word.text) == 1 and re.match(r'^[\w]+$', word.text) is None:
            continue
        yield idi, word
        idi += 1
|
|
|
|
|
|
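# Phase 2 ("p2"): re-parse the corpus files, match collocation structures, attach SRL roles from
# w_collection, and store one formatted sentence per document in w_a_collection; shape sketch:
#   {'_id': sent_id, 'words': [((text, glue), collocations, dep_roles, lemma), ...]}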
def extract_sentences(w_collection, w_a_collection, args, input_corpus, input_corpus_orig):
|
|
structures, _, max_num_components = build_structures(args)
|
|
timeinfo = TimeInfo(len(input_corpus))
|
|
|
|
database = Database(args)
|
|
formatted_sentences = {}
|
|
start_time = time.time()
|
|
|
|
sentences_num_limit = 15000
|
|
sentences_in_ram = 0
|
|
sentence_glue_numbers = None
|
|
|
|
is_gf = input_corpus_orig is not None
|
|
if is_gf:
|
|
glue_words_gen = file_sentence_glue_generator(input_corpus_orig, args.pc_tag, w_collection)
|
|
|
|
for sent_id, sentence, othr_sentence_attributes in load_files(args, database, w_collection, input_corpus):
|
|
if is_gf:
|
|
            # create tuple for comparison with sentence_glue_numbers
|
|
sent_id_numbers = tuple([int(sid) for sid in sent_id[2:].split('.')])
|
|
if sentence_glue_numbers is not None and sentence_glue_numbers > sent_id_numbers:
|
|
logging.warning(
|
|
f"Skipping sentence in annotated sentence id (sent_id)! Annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}")
|
|
continue
|
|
sentence_glue = next(glue_words_gen)
|
|
sentence_glue_numbers = tuple([int(sid) for sid in sentence_glue[0][2:].split('.')])
|
|
while sentence_glue_numbers < sent_id_numbers:
|
|
logging.warning(
|
|
f"Skipping sentence in original sentence id (sentence_glue)! Annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}")
|
|
sentence_glue = next(glue_words_gen)
|
|
sentence_glue_numbers = tuple([int(sid) for sid in sentence_glue[0][2:].split('.')])
|
|
|
|
# has to be here for when next sentence_glue is selected in while loop
|
|
if sentence_glue_numbers is not None and sentence_glue_numbers > sent_id_numbers:
|
|
logging.warning(
|
|
f"Skipping sentence in annotated sentence id (sent_id)! Annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]}")
|
|
continue
|
|
|
|
if sent_id != sentence_glue[0]:
|
|
                raise Exception(f"Annotated gigafida and original gigafida not in sync (annotated sent_id = {sent_id}, original sent_id = {sentence_glue[0]})")
|
|
if len(sentence_glue[1]) != len(sentence):
|
|
                logging.warning(f"Skipping sentence! Annotated gigafida and original gigafida size is not the same (annotated: {len(sentence)}, original: {len(sentence_glue[1])})")
|
|
continue
|
|
for w, w_glue in zip(sentence, sentence_glue[1]):
|
|
w.glue = w_glue[2]
|
|
if sentence is None:
|
|
timeinfo.add_measurement(-1)
|
|
continue
|
|
|
|
# start_time = time.time()
|
|
# print(time.time() - start_time)
|
|
matches = match_file(sentence, structures)
|
|
# if sent_id == 'ssj134.880.3375':
|
|
# print('here')
|
|
# print(time.time() - start_time)
|
|
# match_store.add_matches(matches)
|
|
# word_stats.add_words(words)
|
|
# database.commit()
|
|
|
|
# find unimportant collocations
|
|
# extract_possible_headwords = set(v[0] for v in othr_sentence_attributes.values())
|
|
for match in matches:
|
|
match_idis = []
|
|
for key, word in match[0].items():
|
|
match_idis.append(word.idi)
|
|
match.append(match_idis)
|
|
|
|
|
|
|
|
collocations = {}
|
|
for match in matches:
|
|
for key, word in match[0].items():
|
|
# if word.id == ''
|
|
if word.id not in collocations:
|
|
collocations[word.id] = []
|
|
collocations[word.id].append((match[1][0], key, word.msd[:2], match[2]))
|
|
|
|
# print(time.time() - start_time)
|
|
formatted_sentence = []
|
|
deppar_dict = {}
|
|
|
|
# idi = 0
|
|
incorrect_sentence = False
|
|
|
|
# create output and form dependency parsetree sons
|
|
for idi, word in idi_word_generator(sentence):
|
|
# if word.text == 'Mumel':
|
|
# print('here')
|
|
# if word.text == 'Poleg':
|
|
# print('here')
|
|
# if word.text == 'Luka':
|
|
# print('here')
|
|
idi = str(idi)
|
|
# a = sent_id in sentences_of_interest
|
|
# b = (word.lemma, word.msd) in sentences_of_interest[sent_id]
|
|
# if word.msd == 'X':
|
|
# continue
|
|
# if len(word.text) == 1 and word.text in string.punctuation + '':
|
|
# a = re.match('^[\w]+$', word.text) is not None
|
|
# if len(word.text) == 1 and re.match('^[\w]+$', word.text) is None:
|
|
# continue
|
|
# if sent_id in sentences_of_interest and (word.lemma, word.msd) in sentences_of_interest[sent_id]:
|
|
# if sent_id in sentences_of_interest and idi in sentences_of_interest[sent_id]:
|
|
# cur_count = w_collection.count_documents({'_id': sent_id})
|
|
# if w_collection.count_documents({'_id': sent_id}) > 0:
|
|
sentence_of_interest = othr_sentence_attributes
|
|
# is_count = cur.count() > 0
|
|
if idi in othr_sentence_attributes:
|
|
if word.id not in deppar_dict:
|
|
deppar_dict[word.id] = {}
|
|
deppar_dict[word.id][sentence_of_interest[idi][0]] = sentence_of_interest[idi][1]
|
|
# deppar_dict[word.id] = {idi: sentences_of_interest[sent_id][idi]}
|
|
|
|
# if idi != sentences_of_interest[sent_id][(word.lemma, word.msd)][1]:
|
|
# if (word.lemma, word.msd) != sentences_of_interest[sent_id][idi][1]:
|
|
# print((word.lemma, word.msd))
|
|
# print(sentences_of_interest[sent_id][idi][1])
|
|
# if sentences_of_interest[sent_id][(word.lemma, word.msd)][1] > idi:
|
|
# print('HERE')
|
|
if not find_word_sons(word, deppar_dict, sentence_of_interest[idi][0], sentence_of_interest[idi][1], []):
|
|
incorrect_sentence = True
|
|
# idi += 1
|
|
|
|
if incorrect_sentence:
|
|
logging.warning(
|
|
f"Sentence {sent_id} contains srl connections that loop!")
|
|
continue
|
|
# print(time.time() - start_time)
|
|
|
|
for word in sentence:
|
|
if word.id in collocations:
|
|
col = collocations[word.id]
|
|
else:
|
|
col = []
|
|
|
|
if word.id in deppar_dict:
|
|
dp = deppar_dict[word.id]
|
|
else:
|
|
dp = {}
|
|
|
|
formatted_sentence.append(((word.text, word.glue), col, dp, word.lemma))
|
|
|
|
# create_sentence_output(formatted_sentence, 4)
|
|
formatted_sentences[sent_id] = formatted_sentence
|
|
|
|
if sentences_in_ram >= sentences_num_limit:
|
|
sentences_in_ram = 0
|
|
|
|
requests = [UpdateOne({'_id': k}, {'$set': {'words': v}}, upsert=True) for k, v in formatted_sentences.items()]
|
|
|
|
result = w_a_collection.bulk_write(requests)
|
|
|
|
formatted_sentences = {}
|
|
sentences_in_ram += 1
|
|
# print(time.time() - start_time)
|
|
|
|
requests = [UpdateOne({'_id': k}, {'$set': {'words': v}}, upsert=True) for k, v in formatted_sentences.items()]
|
|
|
|
result = w_a_collection.bulk_write(requests)
|
|
|
|
# force a bit of garbage collection
|
|
# del sentence
|
|
# del sent_id
|
|
# del matches
|
|
# gc.collect()
|
|
|
|
print(time.time() - start_time)
|
|
# return formatted_sentences
|
|
|
|
# # timeinfo.add_measurement(time.time() - start_time)
|
|
# # timeinfo.info()
|
|
# # if no output files, just exit
|
|
# if all([x == None for x in [args.out, args.out_no_stat, args.all, args.stats]]):
|
|
# return
|
|
#
|
|
# # get word renders for lemma/msd
|
|
# word_stats.generate_renders()
|
|
# match_store.determine_colocation_dispersions()
|
|
#
|
|
# # figure out representations!
|
|
# if args.out or args.out_no_stat:
|
|
# match_store.set_representations(word_stats, structures)
|
|
#
|
|
# Writer.make_output_writer(args, max_num_components, match_store, word_stats).write_out(
|
|
# structures, match_store)
|
|
# Writer.make_output_no_stat_writer(args, max_num_components, match_store, word_stats).write_out(
|
|
# structures, match_store)
|
|
# Writer.make_all_writer(args, max_num_components, match_store, word_stats).write_out(
|
|
# structures, match_store)
|
|
# Writer.make_stats_writer(args, max_num_components, match_store, word_stats).write_out(
|
|
# structures, match_store)
|
|
|
|
|
|
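# Headwords ending in '_' are treated as adjectives, all others as verbs; optionally dumps the
# headword list to args.headwords.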
def get_headword_category(collection):
    """
    :return: List of tuples with all headwords in mongodb and their categories.
    """
    headwords = sorted(collection.distinct("headwords")[1:])
    if args.headwords:
        with open(args.headwords, 'w') as f:
            for item in headwords:
                f.write("%s\n" % item)
    headword_category = [(headword, 'verb') if headword[-1] != '_' else (headword, 'adjective') for headword in
                         headwords]
    return headword_category
|
|
|
|
|
|
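# Pipeline overview: main() runs three phases over the MongoDB valency databases:
#   1. get_sentences_of_interest()  -> per-sentence SRL role maps          (skipped with --p1_processed)
#   2. extract_sentences()          -> formatted sentences + collocations  (skipped with --p2_processed)
#   3. write_xml()                  -> one valency lexicon XML file per headword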
def main(args):
|
|
# with Path('data/wordlist.json').open("r") as fp:
|
|
# sskj_wordlist = json.load(fp)
|
|
# # wordlist = set(sskj_wordlist['wordlist'])
|
|
# wordlist = set(sskj_wordlist['wordlist'])
|
|
print('beginning chunk')
|
|
start_time = time.time()
|
|
# user:user:valdb:127.0.0.1
|
|
|
|
[db_user, db_password, db_database, db_host] = args.mongo_db.split(':')
|
|
|
|
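    # NOTE: db_host parsed from --mongo_db is not passed to MongoClient here; the client connects
    # to the driver default (localhost:27017).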
mongo = MongoClient(username=db_user, password=db_password, authSource=db_database)
|
|
|
|
db = mongo.valdb
|
|
collection_ssj = db['ssj']
|
|
collection_gigafida = db['gigafida']
|
|
|
|
db2 = mongo.extvaldb
|
|
# write collection
|
|
w_collection_ssj = db2['ssj']
|
|
w_collection_gigafida = db2['gigafida']
|
|
w_a_collection_ssj = db2['ssj' + '_all']
|
|
w_a_collection_gigafida = db2['gigafida' + '_all']
|
|
status_collection = db2['status']
|
|
|
|
valency_pattern_id_collection = db2['valency_pattern_ids']
|
|
|
|
RF = reduce_functions["reduce_0"]["f"]
|
|
|
|
# get all headwords from database
|
|
# headword_category = get_headword_category(collection_ssj)
|
|
with open(args.headwords, 'r') as read:
|
|
headword_category = [(line[:-1], 'verb') for line in read.readlines()]
|
|
|
|
assert args.language == 'en' or args.language == 'sl'
|
|
|
|
|
|
shutil.rmtree(args.outdir, True)
|
|
os.mkdir(args.outdir)
|
|
|
|
engine = init_db(args.sloleks_db)
|
|
|
|
|
|
# input_file = codecs.open(args.infile, 'r')
|
|
# # input_file = []
|
|
# next(input_file)
|
|
|
|
# category_map = {'samostalnik':'noun', 'glagol':'verb', 'pridevnik':'adjective', 'prislov':'adverb', 'števnik':'numeral', 'zaimek':'pronoun', 'medmet':'interjection', 'veznik':'conjunction'}
|
|
|
|
session = Session(engine)
|
|
|
|
|
|
# cur = collection.find({})
|
|
#
|
|
# a = []
|
|
# cur_len = 0
|
|
# # num_empty_sent = 0
|
|
# for ent in cur:
|
|
# cur_len += 1
|
|
# # s = frames_from_db_entry(ent)
|
|
# # if not s:
|
|
# # num_empty_sent += 1
|
|
# a += frames_from_db_entry(ent)
|
|
print(time.time() - start_time)
|
|
# print(num_empty_sent)
|
|
|
|
print('get_sentences_of_interest')
|
|
start_time = time.time()
|
|
# sentences_of_interest = get_sentences_of_interest(headword_category, collection, w_collection, RF, mongo)
|
|
# sentences_of_interest_stored = args.p1_processed
|
|
if not args.p1_processed:
|
|
with tqdm(total=len(headword_category)) as pbar:
|
|
get_sentences_of_interest(headword_category, collection_ssj, w_collection_ssj, RF, mongo, pbar, status_collection, 'ssj')
|
|
if not args.ignore_gigafida:
|
|
with tqdm(total=len(headword_category)) as pbar:
|
|
get_sentences_of_interest(headword_category, collection_gigafida, w_collection_gigafida, RF, mongo, pbar, status_collection, 'gigafida')
|
|
# sentences_of_interest = OrderedDict(sorted(sentences_of_interest.items()))
|
|
print(time.time() - start_time)
|
|
# num_sentences = 0
|
|
# for el in all_sentences:
|
|
# if el not in sentences_of_interest:
|
|
# num_sentences += 1
|
|
#
|
|
# print(num_sentences)
|
|
# print(len(all_sentences))
|
|
|
|
print('extract_sentences')
|
|
start_time = time.time()
|
|
# formatted_sentences_stored = args.p2_processed
|
|
if not args.p2_processed:
|
|
gf_anno_paths = list(os.walk(args.input_gigafida_annotated))
|
|
gf_anno_paths = [os.path.join(p_t[0], f_n) for p_t in gf_anno_paths for f_n in p_t[2]]
|
|
|
|
gf_orig_paths = list(os.walk(args.input_gigafida_original))
|
|
gf_orig_paths = sorted([os.path.join(p_t[0], f_n) for p_t in gf_orig_paths for f_n in p_t[2] if f_n[:2] == 'GF'])
|
|
|
|
extract_sentences(w_collection_ssj, w_a_collection_ssj, args, args.input_sloleks, None)
|
|
if not args.ignore_gigafida:
|
|
extract_sentences(w_collection_gigafida, w_a_collection_gigafida, args, gf_anno_paths, gf_orig_paths)
|
|
print(time.time() - start_time)
|
|
|
|
print('write_xml')
|
|
start_time = time.time()
|
|
# print('aa ' + 3)
|
|
with tqdm(total=len(headword_category)) as pbar:
|
|
write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, args.corpus_name, args.pattern_examples_limit, args.ignore_gigafida, pbar)
|
|
print(time.time() - start_time)
|
|
# input_file.close()
|
|
session.close()
|
|
|
|
if __name__ == '__main__':
|
|
arg_parser = argparse.ArgumentParser(description='Export and validate collocation data from DDD database.')
|
|
arg_parser.add_argument('--sloleks_db', type=str, help='Database credentials')
|
|
arg_parser.add_argument('--mongo_db', type=str, help='Database credentials')
|
|
arg_parser.add_argument('--schema', type=str, help='XML schema')
|
|
arg_parser.add_argument('--infile', type=str, help='Input file')
|
|
arg_parser.add_argument('--outdir', type=str, help='Output directory')
|
|
arg_parser.add_argument('--headwords', type=str, default=None, help='Path to file, where headwords will be saved.')
|
|
arg_parser.add_argument('--language', type=str, help='Language of certain attributes')
|
|
arg_parser.add_argument('--corpus_name', type=str, help='Name of corpus to be written in outputs.')
|
|
arg_parser.add_argument('--pattern_examples_limit', type=int, default=10, help='Max number of examples.')
|
|
arg_parser.add_argument('--ignore_gigafida', action='store_true', help='If tagged ignore gigafida in output.')
|
|
|
|
arg_parser.add_argument('--p1_processed',
|
|
help='Skip first part (obtaining sentences of interest) when they are already in DB.',
|
|
action='store_true')
|
|
arg_parser.add_argument('--p2_processed',
|
|
help='Skip second part (obtaining formatted sentences) when they are already in DB.',
|
|
action='store_true')
|
|
|
|
arg_parser.add_argument('--structures',
|
|
help='Structures definitions in xml file')
|
|
arg_parser.add_argument('--input_sloleks',
|
|
                            help='Input file (gz or xml currently). If none, only the database is loaded', nargs='*')
|
|
arg_parser.add_argument('--input_gigafida_annotated',
|
|
                            help='Input file (gz or xml currently). If none, only the database is loaded')
|
|
arg_parser.add_argument('--input_gigafida_original',
|
|
                            help='Input file (gz or xml currently). If none, only the database is loaded')
|
|
arg_parser.add_argument('--out',
|
|
help='Classic output file')
|
|
arg_parser.add_argument('--out-no-stat',
|
|
help='Output file, but without statistical columns')
|
|
arg_parser.add_argument('--all',
|
|
help='Additional output file, writes more data')
|
|
arg_parser.add_argument('--stats',
|
|
help='Output file for statistics')
|
|
|
|
arg_parser.add_argument('--no-msd-translate',
|
|
                            help='MSDs are translated from Slovene to English by default',
|
|
action='store_true')
|
|
arg_parser.add_argument('--skip-id-check',
|
|
                            help='Skip checks that ids of <w> and <pc> are in the correct format',
|
|
action='store_true')
|
|
arg_parser.add_argument('--min_freq', help='Minimal frequency in output',
|
|
type=int, default=0, const=1, nargs='?')
|
|
arg_parser.add_argument('--verbose', help='Enable verbose output to stderr',
|
|
choices=["warning", "info", "debug"], default="info",
|
|
const="info", nargs='?')
|
|
arg_parser.add_argument('--count-files',
|
|
help="Count files: more verbose output", action='store_true')
|
|
arg_parser.add_argument('--multiple-output',
|
|
help='Generate one output for each syntactic structure',
|
|
action='store_true')
|
|
|
|
arg_parser.add_argument('--sort-by',
|
|
                            help="Sort by this column (index)", type=int, default=-1)
|
|
arg_parser.add_argument('--sort-reversed',
|
|
                            help="Sort in reversed order", action='store_true')
|
|
|
|
arg_parser.add_argument('--db',
|
|
help="Database file to use (instead of memory)", default=None)
|
|
arg_parser.add_argument('--new-db',
|
|
help="Writes over database file, if there exists one", action='store_true')
|
|
|
|
arg_parser.add_argument('--pc-tag',
|
|
help='Tag for separators, usually pc or c', default="pc")
|
|
|
|
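    # Example invocation (script name, paths and credentials are placeholders; only the flags are
    # defined in this file):
    #   python3 create_xml.py --mongo_db user:password:valdb:127.0.0.1 \
    #       --sloleks_db user:password:sloleks_db:127.0.0.1 \
    #       --headwords data/headwords.txt --outdir data/xml_out \
    #       --structures structures.xml --corpus_name gigafida \
    #       --input_gigafida_annotated data/gf_json --input_gigafida_original data/gf_orig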
args = arg_parser.parse_args()
|
|
logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())
|
|
|
|
start = time.time()
|
|
main(args)
|
|
logging.info("TIME: {}".format(time.time() - start))
|
|
|
|
|