A couple of fixes to write_xml in create_xml.py + created the form_csv.py script

Luka 2020-12-08 08:01:17 +01:00
parent c18aaff11f
commit 75b015dcda
3 changed files with 207 additions and 34 deletions

create_xml.py

@@ -3,6 +3,7 @@
 #imports from luscenje_struktur
 import copy
+import csv
 from luscenje_struktur.progress_bar import progress
 from luscenje_struktur.word import Word, WordCompressed
@@ -114,6 +115,8 @@ CASE_MAP = {
     'i': 'instrumental'
 }
+ssj_frequency_dict = {}
+
 Lexeme = None
 LexemeFeature = None
@@ -130,6 +133,7 @@ Definition = None
 WordForm = None
 WordFormFeature = None
 FormRepresentation = None
+FormEncoding = None
 # corpus = 'gigafida'
@@ -745,7 +749,7 @@ def obtain_xml_data(collection, w_a_collection, headword_text, RF, mongo, patter
 def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, corpus_name, pattern_examples_limit, ignore_gigafida, pbar):
     query_general = session.query(Lexeme.id, LexicalUnitLexeme.id, LexicalUnit.id, LexicalUnitMeasure.value,
-                                  Lexeme.dummy, LexicalUnitType.name) \
+                                  Lexeme.potential_lexeme, LexicalUnitType.name) \
         .join(Category, Category.id == Lexeme.category_id) \
         .join(LexicalUnitLexeme, LexicalUnitLexeme.lexeme_id == Lexeme.id) \
         .join(LexicalUnit, LexicalUnit.id == LexicalUnitLexeme.lexical_unit_id) \
@@ -791,7 +795,8 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
     wf1 = aliased(WordFormFeature)
     wf2 = aliased(WordFormFeature)
     wf3 = aliased(WordFormFeature)
-    query_preposition = session.query(FormRepresentation.form) \
+    query_preposition = session.query(FormEncoding.text) \
+        .join(FormRepresentation, FormRepresentation.id == FormEncoding.form_representation_id) \
         .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
         .join(Lexeme, Lexeme.id == WordForm.lexeme_id) \
         .join(wf1, wf1.word_form_id == WordForm.id) \
@@ -805,7 +810,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
     pattern_translation_3_sin = headword_text
     if len(pattern_translation_hws) == 1:
-        pattern_translation_3_sin = pattern_translation_hws[0].form
+        pattern_translation_3_sin = pattern_translation_hws[0].text
 
     qname = etree.QName("http://www.w3.org/2001/XMLSchema-instance", "noNamespaceSchemaLocation")
     dictionary = lxml.Element('dictionary', {qname: 'valency_lexicon.xsd'})
@@ -816,6 +821,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
             headword_text_query = headword_text[:-1]
         else:
             headword_text_query = headword_text
+
         query = query_general.filter(Category.name == category_text) \
             .filter(Lexeme.lemma == headword_text_query) \
             .group_by(Lexeme.id, LexicalUnitLexeme.id, LexicalUnit.id, LexicalUnitMeasure.value,
@@ -827,31 +833,91 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
         if len(query_res) == 1:
             (lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, frequency, _, lexical_unit_type_name) = \
                 query_res[0]
+            sense_ids = session.query(Sense.id, Sense.potential_sense).filter(
+                Sense.lexical_unit_id == lexical_unit_id).all()
+            features = session.query(LexemeFeature.value).join(Feature, Feature.id == LexemeFeature.feature_id) \
+                .filter(LexemeFeature.lexeme_id == lexeme_id) \
+                .filter(Feature.name == 'aspect').all()
         elif len(query_res) > 1:
+            # find dummy
+            dummy_query = session.query(Lexeme.id, LexicalUnitLexeme.id, LexicalUnit.id,
+                                        Lexeme.potential_lexeme, LexicalUnitType.name) \
+                .join(Category, Category.id == Lexeme.category_id) \
+                .join(LexicalUnitLexeme, LexicalUnitLexeme.lexeme_id == Lexeme.id) \
+                .join(LexicalUnit, LexicalUnit.id == LexicalUnitLexeme.lexical_unit_id) \
+                .join(LexicalUnitType, LexicalUnitType.id == LexicalUnit.type_id) \
+                .filter(LexicalUnitType.name == 'single_lexeme_unit') \
+                .filter(Corpus.name == 'gigafida') \
+                .filter(Corpus.version == '2.0') \
+                .filter(Lexeme.lemma == headword_text_query).all()
             # all lexical_unit_ids equal or at least one dummy
+            dummy_exists = False
             final_lexical_unit_id = 0
             final_lexical_unit_lexeme_id = 0
-            for r in query_res:
-                (lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, frequency, dummy,
+            for r in dummy_query:
+                (lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, dummy,
                  lexical_unit_type_name) = r
                 if dummy:
                     final_lexical_unit_id = lexical_unit_id
                     final_lexical_unit_lexeme_id = lexical_unit_lexeme_id
+                    dummy_exists = True
                     break
+            assert dummy_exists
+            sense_ids = []
+            features_set = set()
+            frequency = 0
+            for r in query_res:
+                (lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, el_frequency, dummy,
+                 lexical_unit_type_name) = r
+                if dummy:
+                    continue
+                sense_ids.extend(session.query(Sense.id, Sense.potential_sense).filter(
+                    Sense.lexical_unit_id == lexical_unit_id).all())
+                features = session.query(LexemeFeature.value).join(Feature, Feature.id == LexemeFeature.feature_id) \
+                    .filter(LexemeFeature.lexeme_id == lexeme_id) \
+                    .filter(Feature.name == 'aspect').all()
+                # set features in dictionary
+                if not features:
+                    for n_feat in features_set:
+                        for f in n_feat:
+                            features.add(f)
+                # compare features
+                else:
+                    for n_feat in features_set:
+                        for f in n_feat:
+                            if f not in features:
+                                raise Exception('Different features in query_res - might be problematic!')
+                frequency += el_frequency
+            # check if any actual sense exists if not erase all but one
+            any_sense_not_dummy = any([not sense[1] for sense in sense_ids])
+            if not any_sense_not_dummy:
+                sense_ids = sense_ids[-1:]
             lexical_unit_id = final_lexical_unit_id
             lexical_unit_lexeme_id = final_lexical_unit_lexeme_id
+            # sense_ids = session.query(Sense.id, Sense.potential_sense).filter(
+            # Sense.lexical_unit_id == lexical_unit_id).all()
+            # features = session.query(LexemeFeature.value).join(Feature, Feature.id == LexemeFeature.feature_id) \
+            # .filter(LexemeFeature.lexeme_id == lexeme_id) \
+            # .filter(Feature.name == 'aspect').all()
         else:
-            frequency = 0
+            frequency = None
             lexeme_id = 0
             lexical_unit_id = 0
             lexical_unit_lexeme_id = 0
             lexical_unit_type_name = ''
+            sense_ids = []
+            features = []
-        sense_ids = session.query(Sense.id, Sense.dummy).filter(Sense.lexical_unit_id == lexical_unit_id).all()
-        features = session.query(LexemeFeature.value).join(Feature, Feature.id == LexemeFeature.feature_id) \
-            .filter(LexemeFeature.lexeme_id == lexeme_id) \
-            .filter(Feature.name == 'aspect').all()
 
         entry = lxml.SubElement(dictionary, 'entry')
@@ -875,22 +941,32 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
             category.text = CATEGORY_MAP[category_text] if category_text in CATEGORY_MAP else ''
         else:
             category.text = category_text
-        grammarFeature = lxml.SubElement(grammar, 'grammarFeature')
-        if args.language == 'sl':
-            grammarFeature.set('name', 'vid')
-            grammarFeature.text = ASPECT_MAP[features[0].value] if len(features) > 0 and features[
-                0].value in ASPECT_MAP else ''
-        else:
-            grammarFeature.set('name', 'aspect')
-            grammarFeature.text = features[0].value if len(features) > 0 else ''
+        ssj_frequency = None
+        if len(features) > 0:
+            grammarFeature = lxml.SubElement(grammar, 'grammarFeature')
+            ssj_frequency = ssj_frequency_dict[(headword_text, features[0].value)] if (headword_text, features[0].value) in ssj_frequency_dict else None
+            if args.language == 'sl':
+                grammarFeature.set('name', 'vid')
+                if len(features) > 1:
+                    print(features)
+                grammarFeature.text = ASPECT_MAP[features[0].value]
+            else:
+                grammarFeature.set('name', 'aspect')
+                grammarFeature.text = features[0].value
 
         measureList = lxml.SubElement(head, 'measureList')
-        measure = lxml.SubElement(measureList, 'measure')
-        measure.set('type', 'frequency')
-        # TODO Modify this!
-        measure.set('source', 'Gigafida 2.0')
-        # measure.set('source', 'ssj500k')
-        measure.text = str(int(frequency))
+        if frequency:
+            measure = lxml.SubElement(measureList, 'measure')
+            measure.set('type', 'frequency')
+            measure.set('source', 'Gigafida 2.0')
+            # measure.set('source', 'ssj500k')
+            measure.text = str(int(frequency))
+        if ssj_frequency is not None:
+            measure = lxml.SubElement(measureList, 'measure')
+            measure.set('type', 'frequency')
+            measure.set('source', 'ssj500k 2.2')
+            measure.text = str(int(ssj_frequency))
@@ -956,18 +1032,26 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
                     semantic_role_stats[semanticRole_val]['gf']['valency_sentence_num'] / sentence_tot_gf)
 
         senseList = lxml.SubElement(body, 'senseList')
+        # handle cases when headword is not in sloleks
+        if len(sense_ids) == 0:
+            sense_ids = [-1]
         for sense_id in sense_ids:
-            if len(sense_ids) > 1 and sense_id.dummy:
+            if len(sense_ids) > 1 and sense_id.potential_sense:
                 continue
             sense = lxml.SubElement(senseList, 'sense')
-            if not sense_id.dummy:
+            if not sense_id == -1 and not sense_id.potential_sense:
                 sense.set('id', str(sense_id.id))
             definitionList = lxml.SubElement(sense, 'definitionList')
-            definition_texts = session.query(Definition.description).filter(
-                Definition.sense_id == sense_id.id).all()
+            if not sense_id == -1:
+                definition_texts = session.query(Definition.description).filter(
+                    Definition.sense_id == sense_id.id).all()
+            else:
+                definition_texts = []
             for definition_text in definition_texts:
                 definition = lxml.SubElement(definitionList, 'definition')
@@ -1050,11 +1134,9 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
             for syn_struct_id, component_dict in syntactic_structure_dict.items():
                 syntacticStructure = lxml.SubElement(syntacticStructureList, 'syntacticStructure')
                 syntacticStructure.set('id', syn_struct_id)
+                dedup_dict = {}
                 for comp_id, lexemes in component_dict.items():
                     for l in lexemes:
-                        component = lxml.SubElement(syntacticStructure, 'component')
-                        component.set('num', comp_id)
-                        lexem = lxml.SubElement(component, 'lexeme')
                         if l in preposition_list:
                             prep_id = preposition_list[l]
@@ -1074,6 +1156,13 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
                             prep_id = str(preposition_ids[0][0])
                             preposition_list[l] = prep_id
+                        if comp_id in dedup_dict and prep_id in dedup_dict[comp_id] and l[2] in dedup_dict[comp_id][prep_id]:
+                            continue
+                        dedup_dict.setdefault(comp_id, {})[prep_id] = l[2]
+
+                        component = lxml.SubElement(syntacticStructure, 'component')
+                        component.set('num', comp_id)
+                        lexem = lxml.SubElement(component, 'lexeme')
                         lexem.set('sloleks', prep_id)
                         lexem.text = l[2]
@@ -1100,7 +1189,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
 def init_db(db):
-    global Lexeme, LexemeFeature, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
+    global Lexeme, LexemeFeature, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation, FormEncoding
     [db_user, db_password, db_database, db_host] = db.split(':')
     Base = declarative_base()
     engine = create_engine('postgresql://' + db_user + ':' + db_password + '@' + db_host + '/' + db_database,
@@ -1152,6 +1241,9 @@ def init_db(db):
     class FormRepresentation(Base):
         __table__ = Base.metadata.tables['jedro_formrepresentation']
 
+    class FormEncoding(Base):
+        __table__ = Base.metadata.tables['jedro_formencoding']
+
     return engine
@@ -1415,6 +1507,14 @@ def get_headword_category(collection):
     return headword_category
 
+
+def read_ssj500k_frequencies(path):
+    with open(path, 'r') as f:
+        reader = csv.reader(f, delimiter='\t')
+        next(reader)
+        for line in reader:
+            ssj_frequency_dict[(line[1], line[-1])] = line[2]
+
 def main(args):
     # with Path('data/wordlist.json').open("r") as fp:
     # sskj_wordlist = json.load(fp)
@@ -1518,7 +1618,9 @@ def main(args):
     print('write_xml')
     start_time = time.time()
-    # print('aa ' + 3)
+    if args.ssj500k_frequencies is not None:
+        read_ssj500k_frequencies(args.ssj500k_frequencies)
+
     with tqdm(total=len(headword_category)) as pbar:
         write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, args.corpus_name, args.pattern_examples_limit, args.ignore_gigafida, pbar)
     print(time.time() - start_time)
@@ -1592,6 +1694,9 @@ if __name__ == '__main__':
     arg_parser.add_argument('--pc-tag',
                             help='Tag for separators, usually pc or c', default="pc")
+    arg_parser.add_argument('--ssj500k-frequencies',
+                            help='Path to a TSV file with ssj500k frequencies', default=None)
+
     args = arg_parser.parse_args()
     logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())
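Note on the new --ssj500k-frequencies option: read_ssj500k_frequencies only uses three positions of each row, so the sketch below shows one plausible layout of the frequencies TSV. The header text, the column order beyond the used positions, and the example values are hypothetical; only line[1] (lemma), line[2] (frequency) and line[-1] (aspect) follow from the code above and from the (headword_text, features[0].value) lookup in write_xml.

# Hypothetical frequencies TSV accepted by read_ssj500k_frequencies; values are examples only.
import csv

rows = [
    ['id', 'lemma', 'freq', 'aspect'],       # header row, skipped by next(reader)
    ['1', 'delati', '4321', 'progressive'],
    ['2', 'narediti', '1234', 'perfective'],
]
with open('ssj500k_frequencies.tsv', 'w', newline='') as f:
    csv.writer(f, delimiter='\t').writerows(rows)

# After read_ssj500k_frequencies('ssj500k_frequencies.tsv'):
#   ssj_frequency_dict[('delati', 'progressive')] == '4321'
# which matches the ssj_frequency_dict[(headword_text, features[0].value)] lookup in write_xml.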

scripts/form_csv.py Normal file

@@ -0,0 +1,68 @@
+import argparse
+import csv
+import os
+
+from lxml import etree
+
+
+def write_general_statistics(path, out_list):
+    if len(out_list) == 0:
+        return
+    with open(path, 'w') as csvfile:
+        writer = csv.writer(csvfile, delimiter='\t',
+                            quotechar='"')
+        writer.writerow(['semanticRole', 'valency_pattern_ratio', 'valency_sentence_ratio'])
+        for line in out_list:
+            writer.writerow(line)
+
+
+def main(args):
+    for file in sorted(os.listdir(args.input)):
+        path = os.path.join(args.input, file)
+        tree = etree.parse(path)
+        gf_output = []
+        ssj_output = []
+        head = next(tree.iter('head'))
+        headword = head.find('headword').find('lemma').text
+        #for div in root.iterfind('.//div'):
+        for elem in tree.iter('statisticsContainer'):
+            # for element in tree.iterfind('statisticsContainer'):
+            # for element in tree.find('statisticsContainer'):
+            semRole = elem.find('semanticRole').text
+            gf_pattern = None
+            gf_sentence = None
+            ssj_pattern = None
+            ssj_sentence = None
+            measure = elem.find('measureList')
+            for el in measure:
+                if el.attrib['type'] == 'valency_pattern_ratio' and el.attrib['source'] == 'Gigafida 2.0':
+                    gf_pattern = el.text
+                if el.attrib['type'] == 'valency_sentence_ratio' and el.attrib['source'] == 'Gigafida 2.0':
+                    gf_sentence = el.text
+                if el.attrib['type'] == 'valency_pattern_ratio' and el.attrib['source'] == 'ssj500k 2.2':
+                    ssj_pattern = el.text
+                if el.attrib['type'] == 'valency_sentence_ratio' and el.attrib['source'] == 'ssj500k 2.2':
+                    ssj_sentence = el.text
+            if gf_pattern is not None and gf_sentence is not None:
+                gf_output.append([semRole, gf_pattern, gf_sentence])
+            if ssj_pattern is not None and ssj_sentence is not None:
+                ssj_output.append([semRole, ssj_pattern, ssj_sentence])
+
+        for elem in tree.iter('valencyPattern'):
+            valency_pattern_id = elem.attrib['id']
+            measure = None
+            for measure_el in elem.find('measure'):
+                if measure_el.attrib['source'] == 'Gigafida 2.0':
+                    measure = measure_el.text
+
+        write_general_statistics(os.path.join(args.output, headword + '_gf_stats.tsv'), gf_output)
+        write_general_statistics(os.path.join(args.output, headword + '_ssj_stats.tsv'), ssj_output)
+
+
+if __name__ == '__main__':
+    arg_parser = argparse.ArgumentParser(description='Extract per-headword semantic role statistics from valency lexicon XML files into TSV files.')
+    arg_parser.add_argument('--input', type=str, help='Input directory')
+    arg_parser.add_argument('--output', type=str, help='Output directory')
+    args = arg_parser.parse_args()
+    main(args)
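A minimal usage sketch for the new script, assuming the input directory holds the valency lexicon XML files written by create_xml.py; the directory names below are made up, only the --input/--output options and the *_gf_stats.tsv / *_ssj_stats.tsv output naming are taken from the code above.

# Hypothetical invocation of scripts/form_csv.py; paths are examples only.
import subprocess

subprocess.run(
    ['python', 'scripts/form_csv.py',
     '--input', 'out/valency_xml',      # directory of valency lexicon XML entries
     '--output', 'out/valency_stats'],  # receives <headword>_gf_stats.tsv and <headword>_ssj_stats.tsv
    check=True)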

@@ -1 +1 @@
-Subproject commit 01adf47b9b63b43f86bff52429792b0de2327ddd
+Subproject commit 92b3ac4ea3a73b93c25b363b5b9cb096d4d011cd