A couple of fixes to write_xml in create_xml.py + created form_csv.py script

This commit is contained in:
Luka 2020-12-08 08:01:17 +01:00
parent c18aaff11f
commit 75b015dcda
3 changed files with 207 additions and 34 deletions


@ -3,6 +3,7 @@
#imports from luscenje_struktur
import copy
import csv
from luscenje_struktur.progress_bar import progress
from luscenje_struktur.word import Word, WordCompressed
@ -114,6 +115,8 @@ CASE_MAP = {
'i': 'instrumental'
}
ssj_frequency_dict = {}
Lexeme = None
LexemeFeature = None
@ -130,6 +133,7 @@ Definition = None
WordForm = None
WordFormFeature = None
FormRepresentation = None
FormEncoding = None
# corpus = 'gigafida'
@ -745,7 +749,7 @@ def obtain_xml_data(collection, w_a_collection, headword_text, RF, mongo, patter
def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, corpus_name, pattern_examples_limit, ignore_gigafida, pbar):
query_general = session.query(Lexeme.id, LexicalUnitLexeme.id, LexicalUnit.id, LexicalUnitMeasure.value,
Lexeme.dummy, LexicalUnitType.name) \
Lexeme.potential_lexeme, LexicalUnitType.name) \
.join(Category, Category.id == Lexeme.category_id) \
.join(LexicalUnitLexeme, LexicalUnitLexeme.lexeme_id == Lexeme.id) \
.join(LexicalUnit, LexicalUnit.id == LexicalUnitLexeme.lexical_unit_id) \
@ -791,7 +795,8 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
wf1 = aliased(WordFormFeature)
wf2 = aliased(WordFormFeature)
wf3 = aliased(WordFormFeature)
query_preposition = session.query(FormRepresentation.form) \
query_preposition = session.query(FormEncoding.text) \
.join(FormRepresentation, FormRepresentation.id == FormEncoding.form_representation_id) \
.join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
.join(Lexeme, Lexeme.id == WordForm.lexeme_id) \
.join(wf1, wf1.word_form_id == WordForm.id) \
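This is the core schema change of the commit: the surface string now lives on FormEncoding.text instead of FormRepresentation.form, which costs one extra join. A hedged raw-SQL sketch of the new chain, for orientation only; jedro_formencoding and jedro_formrepresentation are the table names confirmed by init_db below, the remaining table names are placeholders:

# rough SQL equivalent of query_preposition (placeholder table names marked)
QUERY_PREPOSITION_SQL = """
SELECT fe.text
FROM jedro_formencoding fe
JOIN jedro_formrepresentation fr ON fr.id = fe.form_representation_id
JOIN word_form wf ON wf.id = fr.word_form_id  -- placeholder table name
JOIN lexeme l ON l.id = wf.lexeme_id          -- placeholder table name
"""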
@ -805,7 +810,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
pattern_translation_3_sin = headword_text
if len(pattern_translation_hws) == 1:
pattern_translation_3_sin = pattern_translation_hws[0].form
pattern_translation_3_sin = pattern_translation_hws[0].text
qname = etree.QName("http://www.w3.org/2001/XMLSchema-instance", "noNamespaceSchemaLocation")
dictionary = lxml.Element('dictionary', {qname: 'valency_lexicon.xsd'})
@ -816,6 +821,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
headword_text_query = headword_text[:-1]
else:
headword_text_query = headword_text
query = query_general.filter(Category.name == category_text) \
.filter(Lexeme.lemma == headword_text_query) \
.group_by(Lexeme.id, LexicalUnitLexeme.id, LexicalUnit.id, LexicalUnitMeasure.value,
@ -827,31 +833,91 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
if len(query_res) == 1:
(lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, frequency, _, lexical_unit_type_name) = \
query_res[0]
sense_ids = session.query(Sense.id, Sense.potential_sense).filter(
Sense.lexical_unit_id == lexical_unit_id).all()
features = session.query(LexemeFeature.value).join(Feature, Feature.id == LexemeFeature.feature_id) \
.filter(LexemeFeature.lexeme_id == lexeme_id) \
.filter(Feature.name == 'aspect').all()
elif len(query_res) > 1:
# find dummy
dummy_query = session.query(Lexeme.id, LexicalUnitLexeme.id, LexicalUnit.id,
Lexeme.potential_lexeme, LexicalUnitType.name) \
.join(Category, Category.id == Lexeme.category_id) \
.join(LexicalUnitLexeme, LexicalUnitLexeme.lexeme_id == Lexeme.id) \
.join(LexicalUnit, LexicalUnit.id == LexicalUnitLexeme.lexical_unit_id) \
.join(LexicalUnitType, LexicalUnitType.id == LexicalUnit.type_id) \
.filter(LexicalUnitType.name == 'single_lexeme_unit') \
.filter(Corpus.name == 'gigafida') \
.filter(Corpus.version == '2.0') \
.filter(Lexeme.lemma == headword_text_query).all()
# either all lexical_unit_ids are equal or at least one dummy lexeme exists
dummy_exists = False
final_lexical_unit_id = 0
final_lexical_unit_lexeme_id = 0
for r in query_res:
(lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, frequency, dummy,
for r in dummy_query:
(lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, dummy,
lexical_unit_type_name) = r
if dummy:
final_lexical_unit_id = lexical_unit_id
final_lexical_unit_lexeme_id = lexical_unit_lexeme_id
dummy_exists = True
break
assert dummy_exists
sense_ids = []
features_set = set()
frequency = 0
for r in query_res:
(lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, el_frequency, dummy,
lexical_unit_type_name) = r
if dummy:
continue
sense_ids.extend(session.query(Sense.id, Sense.potential_sense).filter(
Sense.lexical_unit_id == lexical_unit_id).all())
features = session.query(LexemeFeature.value).join(Feature, Feature.id == LexemeFeature.feature_id) \
.filter(LexemeFeature.lexeme_id == lexeme_id) \
.filter(Feature.name == 'aspect').all()
# this lexeme has no aspect features: fall back to those collected so far
# (features is a list returned by .all(), so rebuild it rather than calling .add())
if not features:
features = [f for n_feat in features_set for f in n_feat]
# otherwise check that the features agree across lexemes and remember them
else:
for n_feat in features_set:
for f in n_feat:
if f not in features:
raise Exception('Different features in query_res - might be problematic!')
features_set.add(tuple(features))
frequency += el_frequency
# check if any actual sense exists; if not, keep only the last (dummy) one
any_sense_not_dummy = any(not sense[1] for sense in sense_ids)
if not any_sense_not_dummy:
sense_ids = sense_ids[-1:]
lexical_unit_id = final_lexical_unit_id
lexical_unit_lexeme_id = final_lexical_unit_lexeme_id
# sense_ids = session.query(Sense.id, Sense.potential_sense).filter(
# Sense.lexical_unit_id == lexical_unit_id).all()
# features = session.query(LexemeFeature.value).join(Feature, Feature.id == LexemeFeature.feature_id) \
# .filter(LexemeFeature.lexeme_id == lexeme_id) \
# .filter(Feature.name == 'aspect').all()
else:
frequency = 0
frequency = None
lexeme_id = 0
lexical_unit_id = 0
lexical_unit_lexeme_id = 0
lexical_unit_type_name = ''
sense_ids = []
features = []
sense_ids = session.query(Sense.id, Sense.dummy).filter(Sense.lexical_unit_id == lexical_unit_id).all()
features = session.query(LexemeFeature.value).join(Feature, Feature.id == LexemeFeature.feature_id) \
.filter(LexemeFeature.lexeme_id == lexeme_id) \
.filter(Feature.name == 'aspect').all()
entry = lxml.SubElement(dictionary, 'entry')
@ -875,22 +941,32 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
category.text = CATEGORY_MAP[category_text] if category_text in CATEGORY_MAP else ''
else:
category.text = category_text
grammarFeature = lxml.SubElement(grammar, 'grammarFeature')
if args.language == 'sl':
grammarFeature.set('name', 'vid')
grammarFeature.text = ASPECT_MAP[features[0].value] if len(features) > 0 and features[
0].value in ASPECT_MAP else ''
else:
grammarFeature.set('name', 'aspect')
grammarFeature.text = features[0].value if len(features) > 0 else ''
ssj_frequency = None
if len(features) > 0:
grammarFeature = lxml.SubElement(grammar, 'grammarFeature')
ssj_frequency = ssj_frequency_dict.get((headword_text, features[0].value))
if args.language == 'sl':
grammarFeature.set('name', 'vid')
if len(features) > 1:
print(features)
grammarFeature.text = ASPECT_MAP[features[0].value]
else:
grammarFeature.set('name', 'aspect')
grammarFeature.text = features[0].value
measureList = lxml.SubElement(head, 'measureList')
measure = lxml.SubElement(measureList, 'measure')
measure.set('type', 'frequency')
# TODO Modify this!
measure.set('source', 'Gigafida 2.0')
# measure.set('source', 'ssj500k')
measure.text = str(int(frequency))
if frequency:
measure = lxml.SubElement(measureList, 'measure')
measure.set('type', 'frequency')
measure.set('source', 'Gigafida 2.0')
# measure.set('source', 'ssj500k')
measure.text = str(int(frequency))
if ssj_frequency is not None:
measure = lxml.SubElement(measureList, 'measure')
measure.set('type', 'frequency')
measure.set('source', 'ssj500k 2.2')
measure.text = str(int(ssj_frequency))
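For illustration, with an aspect feature present and both frequencies available, the emitted fragment looks roughly like this (the Slovene feature value is left as a placeholder, numbers invented):

<grammarFeature name="vid">...</grammarFeature>
<measureList>
  <measure type="frequency" source="Gigafida 2.0">1234</measure>
  <measure type="frequency" source="ssj500k 2.2">56</measure>
</measureList>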
@ -956,18 +1032,26 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
semantic_role_stats[semanticRole_val]['gf']['valency_sentence_num'] / sentence_tot_gf)
senseList = lxml.SubElement(body, 'senseList')
# handle cases where the headword is not in Sloleks
if len(sense_ids) == 0:
sense_ids = [-1]
for sense_id in sense_ids:
if len(sense_ids) > 1 and sense_id.dummy:
if len(sense_ids) > 1 and sense_id.potential_sense:
continue
sense = lxml.SubElement(senseList, 'sense')
if not sense_id.dummy:
if sense_id != -1 and not sense_id.potential_sense:
sense.set('id', str(sense_id.id))
definitionList = lxml.SubElement(sense, 'definitionList')
definition_texts = session.query(Definition.description).filter(
Definition.sense_id == sense_id.id).all()
if sense_id != -1:
definition_texts = session.query(Definition.description).filter(
Definition.sense_id == sense_id.id).all()
else:
definition_texts = []
for definition_text in definition_texts:
definition = lxml.SubElement(definitionList, 'definition')
@ -1050,11 +1134,9 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
for syn_struct_id, component_dict in syntactic_structure_dict.items():
syntacticStructure = lxml.SubElement(syntacticStructureList, 'syntacticStructure')
syntacticStructure.set('id', syn_struct_id)
dedup_dict = {}
for comp_id, lexemes in component_dict.items():
for l in lexemes:
component = lxml.SubElement(syntacticStructure, 'component')
component.set('num', comp_id)
lexem = lxml.SubElement(component, 'lexeme')
if l in preposition_list:
prep_id = preposition_list[l]
@ -1074,6 +1156,13 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
prep_id = str(preposition_ids[0][0])
preposition_list[l] = prep_id
# track seen lexeme texts per (component num, sloleks id) in a set, so the
# membership test is exact rather than a substring check on a single string
if l[2] in dedup_dict.setdefault(comp_id, {}).setdefault(prep_id, set()):
continue
dedup_dict[comp_id][prep_id].add(l[2])
component = lxml.SubElement(syntacticStructure, 'component')
component.set('num', comp_id)
lexem = lxml.SubElement(component, 'lexeme')
lexem.set('sloleks', prep_id)
lexem.text = l[2]
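dedup_dict keeps one nested map per syntactic structure so that each (component num, sloleks id, lexeme text) triple emits a single component element. A minimal standalone sketch of the pattern, with illustrative names and values:

def seen_before(dedup, comp_id, prep_id, text):
    # one set of lexeme texts per (component num, preposition id) pair
    bucket = dedup.setdefault(comp_id, {}).setdefault(prep_id, set())
    if text in bucket:
        return True
    bucket.add(text)
    return False

dedup = {}
assert not seen_before(dedup, '2', '153', 'na')   # first occurrence is kept
assert seen_before(dedup, '2', '153', 'na')       # duplicate is skipped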
@ -1100,7 +1189,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
def init_db(db):
global Lexeme, LexemeFeature, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
global Lexeme, LexemeFeature, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation, FormEncoding
[db_user, db_password, db_database, db_host] = db.split(':')
Base = declarative_base()
engine = create_engine('postgresql://' + db_user + ':' + db_password + '@' + db_host + '/' + db_database,
@ -1152,6 +1241,9 @@ def init_db(db):
class FormRepresentation(Base):
__table__ = Base.metadata.tables['jedro_formrepresentation']
class FormEncoding(Base):
__table__ = Base.metadata.tables['jedro_formencoding']
return engine
@ -1415,6 +1507,14 @@ def get_headword_category(collection):
return headword_category
def read_ssj500k_frequencies(path):
with open(path, 'r') as f:
reader = csv.reader(f, delimiter='\t')
next(reader)
for line in reader:
ssj_frequency_dict[(line[1], line[-1])] = line[2]
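Judging by the indices used, the TSV is expected to carry the lemma in column 1, the frequency in column 2 and the aspect in the last column, after one header row that is skipped. A hedged usage sketch (file name, lemma and aspect value are made up); note the stored values stay strings until str(int(...)) in write_xml:

read_ssj500k_frequencies('ssj500k-frequencies.tsv')
freq = ssj_frequency_dict.get(('delati', 'perfective'))  # hypothetical key -> e.g. '1234'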
def main(args):
# with Path('data/wordlist.json').open("r") as fp:
# sskj_wordlist = json.load(fp)
@ -1518,7 +1618,9 @@ def main(args):
print('write_xml')
start_time = time.time()
if args.ssj500k_frequencies is not None:
read_ssj500k_frequencies(args.ssj500k_frequencies)
with tqdm(total=len(headword_category)) as pbar:
write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, args.corpus_name, args.pattern_examples_limit, args.ignore_gigafida, pbar)
print(time.time() - start_time)
@ -1592,6 +1694,9 @@ if __name__ == '__main__':
arg_parser.add_argument('--pc-tag',
help='Tag for separators, usually pc or c', default="pc")
arg_parser.add_argument('--ssj500k-frequencies',
help='Path to a TSV file with ssj500k frequencies', default=None)
args = arg_parser.parse_args()
logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())

scripts/form_csv.py Normal file

@ -0,0 +1,68 @@
import argparse
import csv
import os
from lxml import etree
def write_general_statistics(path, out_list):
if len(out_list) == 0:
return
# newline='' is required by the csv module to avoid blank lines on Windows
with open(path, 'w', newline='') as csvfile:
writer = csv.writer(csvfile, delimiter='\t', quotechar='"')
writer.writerow(['semanticRole', 'valency_pattern_ratio', 'valency_sentence_ratio'])
for line in out_list:
writer.writerow(line)
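A sample of the TSV this produces (role names and ratios invented for illustration):

semanticRole	valency_pattern_ratio	valency_sentence_ratio
ACT	0.42	0.38
PAT	0.21	0.19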
def main(args):
for file in sorted(os.listdir(args.input)):
path = os.path.join(args.input, file)
tree = etree.parse(path)
gf_output = []
ssj_output = []
head = next(tree.iter('head'))
headword = head.find('headword').find('lemma').text
for elem in tree.iter('statisticsContainer'):
semRole = elem.find('semanticRole').text
gf_pattern = None
gf_sentence = None
ssj_pattern = None
ssj_sentence = None
# findall returns an empty list when measureList is missing, avoiding a crash
for el in elem.findall('measureList/measure'):
if el.attrib['type'] == 'valency_pattern_ratio' and el.attrib['source'] == 'Gigafida 2.0':
gf_pattern = el.text
if el.attrib['type'] == 'valency_sentence_ratio' and el.attrib['source'] == 'Gigafida 2.0':
gf_sentence = el.text
if el.attrib['type'] == 'valency_pattern_ratio' and el.attrib['source'] == 'ssj500k 2.2':
ssj_pattern = el.text
if el.attrib['type'] == 'valency_sentence_ratio' and el.attrib['source'] == 'ssj500k 2.2':
ssj_sentence = el.text
if gf_pattern is not None and gf_sentence is not None:
gf_output.append([semRole, gf_pattern, gf_sentence])
if ssj_pattern is not None and ssj_sentence is not None:
ssj_output.append([semRole, ssj_pattern, ssj_sentence])
# pattern-level frequencies are collected here but not written out yet
for elem in tree.iter('valencyPattern'):
valency_pattern_id = elem.attrib['id']
measure = None
for measure_el in elem.findall('.//measure'):
if measure_el.get('source') == 'Gigafida 2.0':
measure = measure_el.text
write_general_statistics(os.path.join(args.output, headword + '_gf_stats.tsv'), gf_output)
write_general_statistics(os.path.join(args.output, headword + '_ssj_stats.tsv'), ssj_output)
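The loop above assumes entries shaped roughly like this hedged sketch; only element and attribute names that appear in the code are used, the values are invented:

<statisticsContainer>
  <semanticRole>ACT</semanticRole>
  <measureList>
    <measure type="valency_pattern_ratio" source="Gigafida 2.0">0.42</measure>
    <measure type="valency_sentence_ratio" source="Gigafida 2.0">0.38</measure>
    <measure type="valency_pattern_ratio" source="ssj500k 2.2">0.40</measure>
    <measure type="valency_sentence_ratio" source="ssj500k 2.2">0.35</measure>
  </measureList>
</statisticsContainer>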
if __name__ == '__main__':
arg_parser = argparse.ArgumentParser(description='Extract semantic role statistics from valency lexicon XML files into per-headword TSV files.')
arg_parser.add_argument('--input', type=str, help='Input directory')
arg_parser.add_argument('--output', type=str, help='Output directory')
args = arg_parser.parse_args()
main(args)

@ -1 +1 @@
Subproject commit 01adf47b9b63b43f86bff52429792b0de2327ddd
Subproject commit 92b3ac4ea3a73b93c25b363b5b9cb096d4d011cd