|
|
|
@ -3,6 +3,7 @@
|
|
|
|
|
|
|
|
|
|
#imports from luscenje_struktur
|
|
|
|
|
import copy
|
|
|
|
|
import csv
|
|
|
|
|
|
|
|
|
|
from luscenje_struktur.progress_bar import progress
|
|
|
|
|
from luscenje_struktur.word import Word, WordCompressed
|
|
|
|
@ -114,6 +115,8 @@ CASE_MAP = {
|
|
|
|
|
'i': 'instrumental'
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ssj_frequency_dict = {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Lexeme = None
|
|
|
|
|
LexemeFeature = None
|
|
|
|
@ -130,6 +133,7 @@ Definition = None
|
|
|
|
|
WordForm = None
|
|
|
|
|
WordFormFeature = None
|
|
|
|
|
FormRepresentation = None
|
|
|
|
|
FormEncoding = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# corpus = 'gigafida'
|
|
|
|
@ -745,7 +749,7 @@ def obtain_xml_data(collection, w_a_collection, headword_text, RF, mongo, patter
|
|
|
|
|
|
|
|
|
|
def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, corpus_name, pattern_examples_limit, ignore_gigafida, pbar):
|
|
|
|
|
query_general = session.query(Lexeme.id, LexicalUnitLexeme.id, LexicalUnit.id, LexicalUnitMeasure.value,
|
|
|
|
|
Lexeme.dummy, LexicalUnitType.name) \
|
|
|
|
|
Lexeme.potential_lexeme, LexicalUnitType.name) \
|
|
|
|
|
.join(Category, Category.id == Lexeme.category_id) \
|
|
|
|
|
.join(LexicalUnitLexeme, LexicalUnitLexeme.lexeme_id == Lexeme.id) \
|
|
|
|
|
.join(LexicalUnit, LexicalUnit.id == LexicalUnitLexeme.lexical_unit_id) \
|
|
|
|
@ -791,7 +795,8 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
|
|
|
|
|
wf1 = aliased(WordFormFeature)
|
|
|
|
|
wf2 = aliased(WordFormFeature)
|
|
|
|
|
wf3 = aliased(WordFormFeature)
|
|
|
|
|
query_preposition = session.query(FormRepresentation.form) \
|
|
|
|
|
query_preposition = session.query(FormEncoding.text) \
|
|
|
|
|
.join(FormRepresentation, FormRepresentation.id == FormEncoding.form_representation_id) \
|
|
|
|
|
.join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
|
|
|
|
|
.join(Lexeme, Lexeme.id == WordForm.lexeme_id) \
|
|
|
|
|
.join(wf1, wf1.word_form_id == WordForm.id) \
|
|
|
|
@ -805,7 +810,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
|
|
|
|
|
|
|
|
|
|
pattern_translation_3_sin = headword_text
|
|
|
|
|
if len(pattern_translation_hws) == 1:
|
|
|
|
|
pattern_translation_3_sin = pattern_translation_hws[0].form
|
|
|
|
|
pattern_translation_3_sin = pattern_translation_hws[0].text
|
|
|
|
|
|
|
|
|
|
qname = etree.QName("http://www.w3.org/2001/XMLSchema-instance", "noNamespaceSchemaLocation")
|
|
|
|
|
dictionary = lxml.Element('dictionary', {qname: 'valency_lexicon.xsd'})
|
|
|
|
@ -816,6 +821,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
|
|
|
|
|
headword_text_query = headword_text[:-1]
|
|
|
|
|
else:
|
|
|
|
|
headword_text_query = headword_text
|
|
|
|
|
|
|
|
|
|
query = query_general.filter(Category.name == category_text) \
|
|
|
|
|
.filter(Lexeme.lemma == headword_text_query) \
|
|
|
|
|
.group_by(Lexeme.id, LexicalUnitLexeme.id, LexicalUnit.id, LexicalUnitMeasure.value,
|
|
|
|
@ -827,31 +833,91 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
|
|
|
|
|
if len(query_res) == 1:
|
|
|
|
|
(lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, frequency, _, lexical_unit_type_name) = \
|
|
|
|
|
query_res[0]
|
|
|
|
|
sense_ids = session.query(Sense.id, Sense.potential_sense).filter(
|
|
|
|
|
Sense.lexical_unit_id == lexical_unit_id).all()
|
|
|
|
|
features = session.query(LexemeFeature.value).join(Feature, Feature.id == LexemeFeature.feature_id) \
|
|
|
|
|
.filter(LexemeFeature.lexeme_id == lexeme_id) \
|
|
|
|
|
.filter(Feature.name == 'aspect').all()
|
|
|
|
|
|
|
|
|
|
elif len(query_res) > 1:
|
|
|
|
|
# find dummy
|
|
|
|
|
dummy_query = session.query(Lexeme.id, LexicalUnitLexeme.id, LexicalUnit.id,
|
|
|
|
|
Lexeme.potential_lexeme, LexicalUnitType.name) \
|
|
|
|
|
.join(Category, Category.id == Lexeme.category_id) \
|
|
|
|
|
.join(LexicalUnitLexeme, LexicalUnitLexeme.lexeme_id == Lexeme.id) \
|
|
|
|
|
.join(LexicalUnit, LexicalUnit.id == LexicalUnitLexeme.lexical_unit_id) \
|
|
|
|
|
.join(LexicalUnitType, LexicalUnitType.id == LexicalUnit.type_id) \
|
|
|
|
|
.filter(LexicalUnitType.name == 'single_lexeme_unit') \
|
|
|
|
|
.filter(Corpus.name == 'gigafida') \
|
|
|
|
|
.filter(Corpus.version == '2.0') \
|
|
|
|
|
.filter(Lexeme.lemma == headword_text_query).all()
|
|
|
|
|
|
|
|
|
|
# all lexical_unit_ids equal or at least one dummy
|
|
|
|
|
dummy_exists = False
|
|
|
|
|
final_lexical_unit_id = 0
|
|
|
|
|
final_lexical_unit_lexeme_id = 0
|
|
|
|
|
for r in query_res:
|
|
|
|
|
(lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, frequency, dummy,
|
|
|
|
|
for r in dummy_query:
|
|
|
|
|
(lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, dummy,
|
|
|
|
|
lexical_unit_type_name) = r
|
|
|
|
|
if dummy:
|
|
|
|
|
final_lexical_unit_id = lexical_unit_id
|
|
|
|
|
final_lexical_unit_lexeme_id = lexical_unit_lexeme_id
|
|
|
|
|
dummy_exists = True
|
|
|
|
|
break
|
|
|
|
|
assert dummy_exists
|
|
|
|
|
|
|
|
|
|
sense_ids = []
|
|
|
|
|
features_set = set()
|
|
|
|
|
frequency = 0
|
|
|
|
|
for r in query_res:
|
|
|
|
|
(lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, el_frequency, dummy,
|
|
|
|
|
lexical_unit_type_name) = r
|
|
|
|
|
if dummy:
|
|
|
|
|
continue
|
|
|
|
|
sense_ids.extend(session.query(Sense.id, Sense.potential_sense).filter(
|
|
|
|
|
Sense.lexical_unit_id == lexical_unit_id).all())
|
|
|
|
|
features = session.query(LexemeFeature.value).join(Feature, Feature.id == LexemeFeature.feature_id) \
|
|
|
|
|
.filter(LexemeFeature.lexeme_id == lexeme_id) \
|
|
|
|
|
.filter(Feature.name == 'aspect').all()
|
|
|
|
|
|
|
|
|
|
# set features in dictionary
|
|
|
|
|
if not features:
|
|
|
|
|
for n_feat in features_set:
|
|
|
|
|
for f in n_feat:
|
|
|
|
|
features.add(f)
|
|
|
|
|
# compare features
|
|
|
|
|
else:
|
|
|
|
|
for n_feat in features_set:
|
|
|
|
|
for f in n_feat:
|
|
|
|
|
if f not in features:
|
|
|
|
|
raise Exception('Different features in query_res - might be problematic!')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
frequency += el_frequency
|
|
|
|
|
|
|
|
|
|
# check if any actual sense exists if not erase all but one
|
|
|
|
|
any_sense_not_dummy = any([not sense[1] for sense in sense_ids])
|
|
|
|
|
if not any_sense_not_dummy:
|
|
|
|
|
sense_ids = sense_ids[-1:]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
lexical_unit_id = final_lexical_unit_id
|
|
|
|
|
lexical_unit_lexeme_id = final_lexical_unit_lexeme_id
|
|
|
|
|
# sense_ids = session.query(Sense.id, Sense.potential_sense).filter(
|
|
|
|
|
# Sense.lexical_unit_id == lexical_unit_id).all()
|
|
|
|
|
# features = session.query(LexemeFeature.value).join(Feature, Feature.id == LexemeFeature.feature_id) \
|
|
|
|
|
# .filter(LexemeFeature.lexeme_id == lexeme_id) \
|
|
|
|
|
# .filter(Feature.name == 'aspect').all()
|
|
|
|
|
else:
|
|
|
|
|
frequency = 0
|
|
|
|
|
frequency = None
|
|
|
|
|
lexeme_id = 0
|
|
|
|
|
lexical_unit_id = 0
|
|
|
|
|
lexical_unit_lexeme_id = 0
|
|
|
|
|
lexical_unit_type_name = ''
|
|
|
|
|
sense_ids = []
|
|
|
|
|
features = []
|
|
|
|
|
|
|
|
|
|
sense_ids = session.query(Sense.id, Sense.dummy).filter(Sense.lexical_unit_id == lexical_unit_id).all()
|
|
|
|
|
features = session.query(LexemeFeature.value).join(Feature, Feature.id == LexemeFeature.feature_id) \
|
|
|
|
|
.filter(LexemeFeature.lexeme_id == lexeme_id) \
|
|
|
|
|
.filter(Feature.name == 'aspect').all()
|
|
|
|
|
|
|
|
|
|
entry = lxml.SubElement(dictionary, 'entry')
|
|
|
|
|
|
|
|
|
@ -875,22 +941,32 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
|
|
|
|
|
category.text = CATEGORY_MAP[category_text] if category_text in CATEGORY_MAP else ''
|
|
|
|
|
else:
|
|
|
|
|
category.text = category_text
|
|
|
|
|
grammarFeature = lxml.SubElement(grammar, 'grammarFeature')
|
|
|
|
|
if args.language == 'sl':
|
|
|
|
|
grammarFeature.set('name', 'vid')
|
|
|
|
|
grammarFeature.text = ASPECT_MAP[features[0].value] if len(features) > 0 and features[
|
|
|
|
|
0].value in ASPECT_MAP else ''
|
|
|
|
|
else:
|
|
|
|
|
grammarFeature.set('name', 'aspect')
|
|
|
|
|
grammarFeature.text = features[0].value if len(features) > 0 else ''
|
|
|
|
|
ssj_frequency = None
|
|
|
|
|
if len(features) > 0:
|
|
|
|
|
grammarFeature = lxml.SubElement(grammar, 'grammarFeature')
|
|
|
|
|
ssj_frequency = ssj_frequency_dict[(headword_text, features[0].value)] if (headword_text, features[0].value) in ssj_frequency_dict else None
|
|
|
|
|
if args.language == 'sl':
|
|
|
|
|
grammarFeature.set('name', 'vid')
|
|
|
|
|
if len(features) > 1:
|
|
|
|
|
print(features)
|
|
|
|
|
grammarFeature.text = ASPECT_MAP[features[0].value]
|
|
|
|
|
else:
|
|
|
|
|
grammarFeature.set('name', 'aspect')
|
|
|
|
|
grammarFeature.text = features[0].value
|
|
|
|
|
|
|
|
|
|
measureList = lxml.SubElement(head, 'measureList')
|
|
|
|
|
measure = lxml.SubElement(measureList, 'measure')
|
|
|
|
|
measure.set('type', 'frequency')
|
|
|
|
|
# TODO Modify this!
|
|
|
|
|
measure.set('source', 'Gigafida 2.0')
|
|
|
|
|
# measure.set('source', 'ssj500k')
|
|
|
|
|
measure.text = str(int(frequency))
|
|
|
|
|
if frequency:
|
|
|
|
|
measure = lxml.SubElement(measureList, 'measure')
|
|
|
|
|
measure.set('type', 'frequency')
|
|
|
|
|
measure.set('source', 'Gigafida 2.0')
|
|
|
|
|
# measure.set('source', 'ssj500k')
|
|
|
|
|
measure.text = str(int(frequency))
|
|
|
|
|
|
|
|
|
|
if ssj_frequency is not None:
|
|
|
|
|
measure = lxml.SubElement(measureList, 'measure')
|
|
|
|
|
measure.set('type', 'frequency')
|
|
|
|
|
measure.set('source', 'ssj500k 2.2')
|
|
|
|
|
measure.text = str(int(ssj_frequency))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -956,18 +1032,26 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
|
|
|
|
|
semantic_role_stats[semanticRole_val]['gf']['valency_sentence_num'] / sentence_tot_gf)
|
|
|
|
|
|
|
|
|
|
senseList = lxml.SubElement(body, 'senseList')
|
|
|
|
|
|
|
|
|
|
# handle cases when headword is not in sloleks
|
|
|
|
|
if len(sense_ids) == 0:
|
|
|
|
|
sense_ids = [-1]
|
|
|
|
|
|
|
|
|
|
for sense_id in sense_ids:
|
|
|
|
|
if len(sense_ids) > 1 and sense_id.dummy:
|
|
|
|
|
if len(sense_ids) > 1 and sense_id.potential_sense:
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
sense = lxml.SubElement(senseList, 'sense')
|
|
|
|
|
if not sense_id.dummy:
|
|
|
|
|
if not sense_id == -1 and not sense_id.potential_sense:
|
|
|
|
|
sense.set('id', str(sense_id.id))
|
|
|
|
|
|
|
|
|
|
definitionList = lxml.SubElement(sense, 'definitionList')
|
|
|
|
|
|
|
|
|
|
definition_texts = session.query(Definition.description).filter(
|
|
|
|
|
Definition.sense_id == sense_id.id).all()
|
|
|
|
|
if not sense_id == -1:
|
|
|
|
|
definition_texts = session.query(Definition.description).filter(
|
|
|
|
|
Definition.sense_id == sense_id.id).all()
|
|
|
|
|
else:
|
|
|
|
|
definition_texts = []
|
|
|
|
|
|
|
|
|
|
for definition_text in definition_texts:
|
|
|
|
|
definition = lxml.SubElement(definitionList, 'definition')
|
|
|
|
@ -1050,11 +1134,9 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
|
|
|
|
|
for syn_struct_id, component_dict in syntactic_structure_dict.items():
|
|
|
|
|
syntacticStructure = lxml.SubElement(syntacticStructureList, 'syntacticStructure')
|
|
|
|
|
syntacticStructure.set('id', syn_struct_id)
|
|
|
|
|
dedup_dict = {}
|
|
|
|
|
for comp_id, lexemes in component_dict.items():
|
|
|
|
|
for l in lexemes:
|
|
|
|
|
component = lxml.SubElement(syntacticStructure, 'component')
|
|
|
|
|
component.set('num', comp_id)
|
|
|
|
|
lexem = lxml.SubElement(component, 'lexeme')
|
|
|
|
|
|
|
|
|
|
if l in preposition_list:
|
|
|
|
|
prep_id = preposition_list[l]
|
|
|
|
@ -1074,6 +1156,13 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
|
|
|
|
|
prep_id = str(preposition_ids[0][0])
|
|
|
|
|
preposition_list[l] = prep_id
|
|
|
|
|
|
|
|
|
|
if comp_id in dedup_dict and prep_id in dedup_dict[comp_id] and l[2] in dedup_dict[comp_id][prep_id]:
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
dedup_dict.setdefault(comp_id, {})[prep_id] = l[2]
|
|
|
|
|
component = lxml.SubElement(syntacticStructure, 'component')
|
|
|
|
|
component.set('num', comp_id)
|
|
|
|
|
lexem = lxml.SubElement(component, 'lexeme')
|
|
|
|
|
|
|
|
|
|
lexem.set('sloleks', prep_id)
|
|
|
|
|
lexem.text = l[2]
|
|
|
|
@ -1100,7 +1189,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def init_db(db):
|
|
|
|
|
global Lexeme, LexemeFeature, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
|
|
|
|
|
global Lexeme, LexemeFeature, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation, FormEncoding
|
|
|
|
|
[db_user, db_password, db_database, db_host] = db.split(':')
|
|
|
|
|
Base = declarative_base()
|
|
|
|
|
engine = create_engine('postgresql://' + db_user + ':' + db_password + '@' + db_host + '/' + db_database,
|
|
|
|
@ -1152,6 +1241,9 @@ def init_db(db):
|
|
|
|
|
class FormRepresentation(Base):
|
|
|
|
|
__table__ = Base.metadata.tables['jedro_formrepresentation']
|
|
|
|
|
|
|
|
|
|
class FormEncoding(Base):
|
|
|
|
|
__table__ = Base.metadata.tables['jedro_formencoding']
|
|
|
|
|
|
|
|
|
|
return engine
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -1415,6 +1507,14 @@ def get_headword_category(collection):
|
|
|
|
|
return headword_category
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def read_ssj500k_frequencies(path, frequency_dict=None):
    """Load ssj500k frequencies from a tab-separated file into a lookup table.

    The first row of the file is treated as a header and skipped. For every
    remaining non-empty row, the value of the third column is stored (as the
    string read from the file, without numeric conversion) under the key
    ``(second column, last column)``.

    Args:
        path: Path of the tab-separated file to read.
        frequency_dict: Optional mapping to fill. When omitted, the
            module-level ``ssj_frequency_dict`` is updated, which is what
            existing callers rely on.

    Returns:
        None. The target mapping is updated in place.
    """
    target = ssj_frequency_dict if frequency_dict is None else frequency_dict

    # Explicit UTF-8 keeps the result independent of the platform locale;
    # newline='' is the mode the csv module documents for reader input.
    with open(path, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        # Skip the header row; the default avoids StopIteration on an empty file.
        next(reader, None)
        for line in reader:
            if not line:
                # csv.reader yields an empty list for blank lines.
                continue
            target[(line[1], line[-1])] = line[2]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main(args):
|
|
|
|
|
# with Path('data/wordlist.json').open("r") as fp:
|
|
|
|
|
# sskj_wordlist = json.load(fp)
|
|
|
|
@ -1518,7 +1618,9 @@ def main(args):
|
|
|
|
|
|
|
|
|
|
print('write_xml')
|
|
|
|
|
start_time = time.time()
|
|
|
|
|
# print('aa ' + 3)
|
|
|
|
|
if args.ssj500k_frequencies is not None:
|
|
|
|
|
read_ssj500k_frequencies(args.ssj500k_frequencies)
|
|
|
|
|
|
|
|
|
|
with tqdm(total=len(headword_category)) as pbar:
|
|
|
|
|
write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, args.corpus_name, args.pattern_examples_limit, args.ignore_gigafida, pbar)
|
|
|
|
|
print(time.time() - start_time)
|
|
|
|
@ -1592,6 +1694,9 @@ if __name__ == '__main__':
|
|
|
|
|
arg_parser.add_argument('--pc-tag',
|
|
|
|
|
help='Tag for separators, usually pc or c', default="pc")
|
|
|
|
|
|
|
|
|
|
# Optional ssj500k frequency table; when given, main() loads it before writing the XML.
arg_parser.add_argument('--ssj500k-frequencies',
                        help='Path to a tab-separated file with ssj500k frequencies '
                             '(header row; lemma in the second column, frequency in the '
                             'third column, aspect in the last column)',
                        default=None)
|
|
|
|
|
|
|
|
|
|
args = arg_parser.parse_args()
|
|
|
|
|
logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())
|
|
|
|
|
|
|
|
|
|