forked from kristjan/cjvt-valency

A couple of fixes to write_xml in create_xml.py + created form_csv.py script

This commit is contained in:
parent c18aaff11f
commit 75b015dcda

@@ -3,6 +3,7 @@
 #imports from luscenje_struktur
 import copy
+import csv
 
 from luscenje_struktur.progress_bar import progress
 from luscenje_struktur.word import Word, WordCompressed
 
@@ -114,6 +115,8 @@ CASE_MAP = {
     'i': 'instrumental'
 }
 
+ssj_frequency_dict = {}
+
 
 Lexeme = None
 LexemeFeature = None
@@ -130,6 +133,7 @@ Definition = None
 WordForm = None
 WordFormFeature = None
 FormRepresentation = None
+FormEncoding = None
 
 
 # corpus = 'gigafida'
@@ -745,7 +749,7 @@ def obtain_xml_data(collection, w_a_collection, headword_text, RF, mongo, patter
 
 def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, corpus_name, pattern_examples_limit, ignore_gigafida, pbar):
     query_general = session.query(Lexeme.id, LexicalUnitLexeme.id, LexicalUnit.id, LexicalUnitMeasure.value,
-                                  Lexeme.dummy, LexicalUnitType.name) \
+                                  Lexeme.potential_lexeme, LexicalUnitType.name) \
         .join(Category, Category.id == Lexeme.category_id) \
         .join(LexicalUnitLexeme, LexicalUnitLexeme.lexeme_id == Lexeme.id) \
         .join(LexicalUnit, LexicalUnit.id == LexicalUnitLexeme.lexical_unit_id) \
@@ -791,7 +795,8 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
     wf1 = aliased(WordFormFeature)
     wf2 = aliased(WordFormFeature)
     wf3 = aliased(WordFormFeature)
-    query_preposition = session.query(FormRepresentation.form) \
+    query_preposition = session.query(FormEncoding.text) \
+        .join(FormRepresentation, FormRepresentation.id == FormEncoding.form_representation_id) \
        .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
        .join(Lexeme, Lexeme.id == WordForm.lexeme_id) \
        .join(wf1, wf1.word_form_id == WordForm.id) \
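
A note on the reshaped preposition query: the surface form now lives on FormEncoding rather than FormRepresentation, so the lookup roots at FormEncoding.text and joins upward through FormRepresentation and WordForm to Lexeme. A minimal standalone sketch of the same chain, assuming the ORM classes reflected in init_db and passing them in explicitly:

    from sqlalchemy.orm import aliased

    def build_preposition_query(session, FormEncoding, FormRepresentation,
                                WordForm, Lexeme, WordFormFeature):
        # Root at the encoding text and walk up the schema, as the new code does.
        wf1 = aliased(WordFormFeature)
        return session.query(FormEncoding.text) \
            .join(FormRepresentation,
                  FormRepresentation.id == FormEncoding.form_representation_id) \
            .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
            .join(Lexeme, Lexeme.id == WordForm.lexeme_id) \
            .join(wf1, wf1.word_form_id == WordForm.id)
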
@@ -805,7 +810,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
 
     pattern_translation_3_sin = headword_text
     if len(pattern_translation_hws) == 1:
-        pattern_translation_3_sin = pattern_translation_hws[0].form
+        pattern_translation_3_sin = pattern_translation_hws[0].text
 
     qname = etree.QName("http://www.w3.org/2001/XMLSchema-instance", "noNamespaceSchemaLocation")
     dictionary = lxml.Element('dictionary', {qname: 'valency_lexicon.xsd'})
@@ -816,6 +821,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
         headword_text_query = headword_text[:-1]
     else:
         headword_text_query = headword_text
 
     query = query_general.filter(Category.name == category_text) \
+        .filter(Lexeme.lemma == headword_text_query) \
         .group_by(Lexeme.id, LexicalUnitLexeme.id, LexicalUnit.id, LexicalUnitMeasure.value,
@@ -827,31 +833,91 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
     if len(query_res) == 1:
         (lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, frequency, _, lexical_unit_type_name) = \
             query_res[0]
+        sense_ids = session.query(Sense.id, Sense.potential_sense).filter(
+            Sense.lexical_unit_id == lexical_unit_id).all()
+        features = session.query(LexemeFeature.value).join(Feature, Feature.id == LexemeFeature.feature_id) \
+            .filter(LexemeFeature.lexeme_id == lexeme_id) \
+            .filter(Feature.name == 'aspect').all()
 
     elif len(query_res) > 1:
         # find dummy
+        dummy_query = session.query(Lexeme.id, LexicalUnitLexeme.id, LexicalUnit.id,
+                                    Lexeme.potential_lexeme, LexicalUnitType.name) \
+            .join(Category, Category.id == Lexeme.category_id) \
+            .join(LexicalUnitLexeme, LexicalUnitLexeme.lexeme_id == Lexeme.id) \
+            .join(LexicalUnit, LexicalUnit.id == LexicalUnitLexeme.lexical_unit_id) \
+            .join(LexicalUnitType, LexicalUnitType.id == LexicalUnit.type_id) \
+            .filter(LexicalUnitType.name == 'single_lexeme_unit') \
+            .filter(Corpus.name == 'gigafida') \
+            .filter(Corpus.version == '2.0') \
+            .filter(Lexeme.lemma == headword_text_query).all()
+
         # all lexical_unit_ids equal or at least one dummy
         dummy_exists = False
         final_lexical_unit_id = 0
         final_lexical_unit_lexeme_id = 0
-        for r in query_res:
-            (lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, frequency, dummy,
+        for r in dummy_query:
+            (lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, dummy,
              lexical_unit_type_name) = r
             if dummy:
                 final_lexical_unit_id = lexical_unit_id
                 final_lexical_unit_lexeme_id = lexical_unit_lexeme_id
                 dummy_exists = True
                 break
         assert dummy_exists
 
+        sense_ids = []
+        features_set = set()
+        frequency = 0
+        for r in query_res:
+            (lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, el_frequency, dummy,
+             lexical_unit_type_name) = r
+            if dummy:
+                continue
+            sense_ids.extend(session.query(Sense.id, Sense.potential_sense).filter(
+                Sense.lexical_unit_id == lexical_unit_id).all())
+            features = session.query(LexemeFeature.value).join(Feature, Feature.id == LexemeFeature.feature_id) \
+                .filter(LexemeFeature.lexeme_id == lexeme_id) \
+                .filter(Feature.name == 'aspect').all()
+
+            # set features in dictionary
+            if not features:
+                for n_feat in features_set:
+                    for f in n_feat:
+                        features.add(f)
+            # compare features
+            else:
+                for n_feat in features_set:
+                    for f in n_feat:
+                        if f not in features:
+                            raise Exception('Different features in query_res - might be problematic!')
+
+            frequency += el_frequency
+
+        # check if any actual sense exists if not erase all but one
+        any_sense_not_dummy = any([not sense[1] for sense in sense_ids])
+        if not any_sense_not_dummy:
+            sense_ids = sense_ids[-1:]
+
         lexical_unit_id = final_lexical_unit_id
         lexical_unit_lexeme_id = final_lexical_unit_lexeme_id
+        # sense_ids = session.query(Sense.id, Sense.potential_sense).filter(
+        #     Sense.lexical_unit_id == lexical_unit_id).all()
+        # features = session.query(LexemeFeature.value).join(Feature, Feature.id == LexemeFeature.feature_id) \
+        #     .filter(LexemeFeature.lexeme_id == lexeme_id) \
+        #     .filter(Feature.name == 'aspect').all()
     else:
-        frequency = 0
+        frequency = None
         lexeme_id = 0
         lexical_unit_id = 0
         lexical_unit_lexeme_id = 0
         lexical_unit_type_name = ''
+        sense_ids = []
+        features = []
 
-    sense_ids = session.query(Sense.id, Sense.dummy).filter(Sense.lexical_unit_id == lexical_unit_id).all()
-    features = session.query(LexemeFeature.value).join(Feature, Feature.id == LexemeFeature.feature_id) \
-        .filter(LexemeFeature.lexeme_id == lexeme_id) \
-        .filter(Feature.name == 'aspect').all()
-
     entry = lxml.SubElement(dictionary, 'entry')
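
The multi-row branch above first locates the "dummy" (potential) lexeme to serve as the canonical unit, then folds the senses, aspect features, and frequencies of the remaining rows into it. The same control flow reduced to plain tuples, with hypothetical data, to show the intent:

    def merge_duplicates(rows):
        # rows mimic query_res: (lexeme_id, lul_id, lu_id, frequency, is_dummy)
        canonical = next(r for r in rows if r[4])  # dummy row must exist (assert dummy_exists)
        total_frequency = 0
        for r in rows:
            if r[4]:
                continue  # the dummy itself contributes no frequency
            total_frequency += r[3]
        return canonical[2], total_frequency  # canonical lexical_unit_id, summed frequency

    print(merge_duplicates([(1, 10, 100, 5, False),
                            (2, 11, 101, 7, False),
                            (3, 12, 102, 0, True)]))  # -> (102, 12)
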
@@ -875,22 +941,32 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
         category.text = CATEGORY_MAP[category_text] if category_text in CATEGORY_MAP else ''
     else:
         category.text = category_text
-    grammarFeature = lxml.SubElement(grammar, 'grammarFeature')
-    if args.language == 'sl':
-        grammarFeature.set('name', 'vid')
-        grammarFeature.text = ASPECT_MAP[features[0].value] if len(features) > 0 and features[
-            0].value in ASPECT_MAP else ''
-    else:
-        grammarFeature.set('name', 'aspect')
-        grammarFeature.text = features[0].value if len(features) > 0 else ''
+    ssj_frequency = None
+    if len(features) > 0:
+        grammarFeature = lxml.SubElement(grammar, 'grammarFeature')
+        ssj_frequency = ssj_frequency_dict[(headword_text, features[0].value)] if (headword_text, features[0].value) in ssj_frequency_dict else None
+        if args.language == 'sl':
+            grammarFeature.set('name', 'vid')
+            if len(features) > 1:
+                print(features)
+            grammarFeature.text = ASPECT_MAP[features[0].value]
+        else:
+            grammarFeature.set('name', 'aspect')
+            grammarFeature.text = features[0].value
 
     measureList = lxml.SubElement(head, 'measureList')
-    measure = lxml.SubElement(measureList, 'measure')
-    measure.set('type', 'frequency')
-    # TODO Modify this!
-    measure.set('source', 'Gigafida 2.0')
-    # measure.set('source', 'ssj500k')
-    measure.text = str(int(frequency))
+    if frequency:
+        measure = lxml.SubElement(measureList, 'measure')
+        measure.set('type', 'frequency')
+        measure.set('source', 'Gigafida 2.0')
+        # measure.set('source', 'ssj500k')
+        measure.text = str(int(frequency))
+
+    if ssj_frequency is not None:
+        measure = lxml.SubElement(measureList, 'measure')
+        measure.set('type', 'frequency')
+        measure.set('source', 'ssj500k 2.2')
+        measure.text = str(int(ssj_frequency))
 
 
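
With the guards above, an entry's measureList carries a Gigafida 2.0 measure only when a corpus frequency was found, plus an ssj500k 2.2 measure when the (headword, aspect) pair is present in ssj_frequency_dict. A self-contained sketch of the resulting element construction; the frequencies are made up, and the lxml alias mirrors the script's use of lxml.etree (an assumption):

    from lxml import etree as lxml  # assumed alias for lxml.etree

    head = lxml.Element('head')
    measureList = lxml.SubElement(head, 'measureList')
    for source, freq in (('Gigafida 2.0', 1234), ('ssj500k 2.2', 56)):  # made-up counts
        if freq is None:
            continue  # mirror the commit: emit a measure only when a frequency exists
        measure = lxml.SubElement(measureList, 'measure')
        measure.set('type', 'frequency')
        measure.set('source', source)
        measure.text = str(int(freq))
    print(lxml.tostring(head, pretty_print=True).decode())
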
@@ -956,18 +1032,26 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
                 semantic_role_stats[semanticRole_val]['gf']['valency_sentence_num'] / sentence_tot_gf)
 
     senseList = lxml.SubElement(body, 'senseList')
 
+    # handle cases when headword is not in sloleks
+    if len(sense_ids) == 0:
+        sense_ids = [-1]
+
     for sense_id in sense_ids:
-        if len(sense_ids) > 1 and sense_id.dummy:
+        if len(sense_ids) > 1 and sense_id.potential_sense:
             continue
 
         sense = lxml.SubElement(senseList, 'sense')
-        if not sense_id.dummy:
+        if not sense_id == -1 and not sense_id.potential_sense:
             sense.set('id', str(sense_id.id))
 
         definitionList = lxml.SubElement(sense, 'definitionList')
 
-        definition_texts = session.query(Definition.description).filter(
-            Definition.sense_id == sense_id.id).all()
+        if not sense_id == -1:
+            definition_texts = session.query(Definition.description).filter(
+                Definition.sense_id == sense_id.id).all()
+        else:
+            definition_texts = []
 
         for definition_text in definition_texts:
             definition = lxml.SubElement(definitionList, 'definition')
@@ -1050,11 +1134,9 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
         for syn_struct_id, component_dict in syntactic_structure_dict.items():
             syntacticStructure = lxml.SubElement(syntacticStructureList, 'syntacticStructure')
             syntacticStructure.set('id', syn_struct_id)
+            dedup_dict = {}
             for comp_id, lexemes in component_dict.items():
                 for l in lexemes:
-                    component = lxml.SubElement(syntacticStructure, 'component')
-                    component.set('num', comp_id)
-                    lexem = lxml.SubElement(component, 'lexeme')
 
                     if l in preposition_list:
                         prep_id = preposition_list[l]
@@ -1074,6 +1156,13 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
                         prep_id = str(preposition_ids[0][0])
                         preposition_list[l] = prep_id
 
+                    if comp_id in dedup_dict and prep_id in dedup_dict[comp_id] and l[2] in dedup_dict[comp_id][prep_id]:
+                        continue
+
+                    dedup_dict.setdefault(comp_id, {})[prep_id] = l[2]
+                    component = lxml.SubElement(syntacticStructure, 'component')
+                    component.set('num', comp_id)
+                    lexem = lxml.SubElement(component, 'lexeme')
 
                     lexem.set('sloleks', prep_id)
                     lexem.text = l[2]
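
The added dedup_dict guard stops a (component number, preposition id, surface form) triple from being emitted twice within one syntacticStructure. Note that the committed version stores a single string per (comp_id, prep_id) and tests membership with the in operator, which on strings is a substring check; a set-based variant of the same bookkeeping, written against plain tuples with hypothetical values, looks like this:

    def dedup_components(triples):
        # triples: (comp_id, prep_id, form), as built from comp_id, prep_id and l[2] above
        seen = {}
        kept = []
        for comp_id, prep_id, form in triples:
            forms = seen.setdefault(comp_id, {}).setdefault(prep_id, set())
            if form in forms:
                continue  # already emitted for this component/preposition pair
            forms.add(form)
            kept.append((comp_id, prep_id, form))
        return kept

    print(dedup_components([('1', '42', 'na'), ('1', '42', 'na'), ('1', '7', 'ob')]))
    # -> [('1', '42', 'na'), ('1', '7', 'ob')]
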
@@ -1100,7 +1189,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
 
 
 def init_db(db):
-    global Lexeme, LexemeFeature, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
+    global Lexeme, LexemeFeature, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation, FormEncoding
     [db_user, db_password, db_database, db_host] = db.split(':')
     Base = declarative_base()
     engine = create_engine('postgresql://' + db_user + ':' + db_password + '@' + db_host + '/' + db_database,
@@ -1152,6 +1241,9 @@ def init_db(db):
     class FormRepresentation(Base):
         __table__ = Base.metadata.tables['jedro_formrepresentation']
 
+    class FormEncoding(Base):
+        __table__ = Base.metadata.tables['jedro_formencoding']
+
     return engine
 
 
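
FormEncoding is bound the same way as every other model in init_db: a declarative class whose __table__ points at the reflected jedro_* table. A reduced sketch of the pattern, assuming the metadata is reflected from the live database the way init_db appears to do it (the reflect call is an assumption, since that part of init_db is not shown in this diff):

    from sqlalchemy import create_engine
    from sqlalchemy.orm import declarative_base

    def bind_form_encoding(db_url):
        # Reflect the existing schema, then attach a declarative class to one table.
        Base = declarative_base()
        engine = create_engine(db_url)
        Base.metadata.reflect(engine)

        class FormEncoding(Base):
            __table__ = Base.metadata.tables['jedro_formencoding']

        return FormEncoding
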
@@ -1415,6 +1507,14 @@ def get_headword_category(collection):
     return headword_category
 
 
+def read_ssj500k_frequencies(path):
+    with open(path, 'r') as f:
+        reader = csv.reader(f, delimiter='\t')
+        next(reader)
+        for line in reader:
+            ssj_frequency_dict[(line[1], line[-1])] = line[2]
+
+
 def main(args):
     # with Path('data/wordlist.json').open("r") as fp:
     #     sskj_wordlist = json.load(fp)
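
read_ssj500k_frequencies fills the module-level ssj_frequency_dict keyed by (lemma, aspect), which the grammarFeature branch above then consults. In isolation, assuming a tab-separated file with a header row where column 1 holds the lemma, column 2 the frequency, and the last column the aspect (the column roles are inferred from the indices used; the file name and lookup key are hypothetical):

    import csv

    ssj_frequency_dict = {}

    def read_ssj500k_frequencies(path):
        with open(path, 'r') as f:
            reader = csv.reader(f, delimiter='\t')
            next(reader)  # skip the header row
            for line in reader:
                # key: (lemma, aspect); value: the raw frequency string
                ssj_frequency_dict[(line[1], line[-1])] = line[2]

    read_ssj500k_frequencies('ssj500k-frequencies.tsv')  # hypothetical file
    print(ssj_frequency_dict.get(('govoriti', 'nedovršni')))  # hypothetical key
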
@@ -1518,7 +1618,9 @@ def main(args):
 
     print('write_xml')
     start_time = time.time()
-    # print('aa ' + 3)
+    if args.ssj500k_frequencies is not None:
+        read_ssj500k_frequencies(args.ssj500k_frequencies)
+
     with tqdm(total=len(headword_category)) as pbar:
         write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj, w_a_collection_gigafida, valency_pattern_id_collection, args.corpus_name, args.pattern_examples_limit, args.ignore_gigafida, pbar)
     print(time.time() - start_time)
@@ -1592,6 +1694,9 @@ if __name__ == '__main__':
     arg_parser.add_argument('--pc-tag',
                             help='Tag for separators, usually pc or c', default="pc")
 
+    arg_parser.add_argument('--ssj500k-frequencies',
+                            help='Path to the ssj500k frequencies TSV file', default=None)
+
     args = arg_parser.parse_args()
     logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())
 

scripts/form_csv.py (new file, 68 lines)

@@ -0,0 +1,68 @@
+import argparse
+import csv
+import os
+
+from lxml import etree
+
+
+def write_general_statistics(path, out_list):
+    if len(out_list) == 0:
+        return
+    with open(path, 'w') as csvfile:
+        writer = csv.writer(csvfile, delimiter='\t',
+                            quotechar='"')
+        writer.writerow(['semanticRole', 'valency_pattern_ratio', 'valency_sentence_ratio'])
+        for line in out_list:
+            writer.writerow(line)
+
+
+def main(args):
+    for file in sorted(os.listdir(args.input)):
+        path = os.path.join(args.input, file)
+        tree = etree.parse(path)
+        gf_output = []
+        ssj_output = []
+        head = next(tree.iter('head'))
+        headword = head.find('headword').find('lemma').text
+        #for div in root.iterfind('.//div'):
+        for elem in tree.iter('statisticsContainer'):
+        # for element in tree.iterfind('statisticsContainer'):
+        # for element in tree.find('statisticsContainer'):
+            semRole = elem.find('semanticRole').text
+            gf_pattern = None
+            gf_sentence = None
+            ssj_pattern = None
+            ssj_sentence = None
+            measure = elem.find('measureList')
+            for el in measure:
+                if el.attrib['type'] == 'valency_pattern_ratio' and el.attrib['source'] == 'Gigafida 2.0':
+                    gf_pattern = el.text
+                if el.attrib['type'] == 'valency_sentence_ratio' and el.attrib['source'] == 'Gigafida 2.0':
+                    gf_sentence = el.text
+                if el.attrib['type'] == 'valency_pattern_ratio' and el.attrib['source'] == 'ssj500k 2.2':
+                    ssj_pattern = el.text
+                if el.attrib['type'] == 'valency_sentence_ratio' and el.attrib['source'] == 'ssj500k 2.2':
+                    ssj_sentence = el.text
+            if gf_pattern is not None and gf_sentence is not None:
+                gf_output.append([semRole, gf_pattern, gf_sentence])
+            if ssj_pattern is not None and ssj_sentence is not None:
+                ssj_output.append([semRole, ssj_pattern, ssj_sentence])
+
+        for elem in tree.iter('valencyPattern'):
+            valency_pattern_id = elem.attrib['id']
+            measure = None
+            for measure_el in elem.find('measure'):
+                if measure_el.attrib['source'] == 'Gigafida 2.0':
+                    measure = measure_el.text
+
+        write_general_statistics(os.path.join(args.output, headword + '_gf_stats.tsv'), gf_output)
+        write_general_statistics(os.path.join(args.output, headword + '_ssj_stats.tsv'), ssj_output)
+
+
+if __name__ == '__main__':
+    arg_parser = argparse.ArgumentParser(description='Extract per-semantic-role statistics from valency XML files into TSV files.')
+    arg_parser.add_argument('--input', type=str, help='Input directory')
+    arg_parser.add_argument('--output', type=str, help='Output directory')
+
+    args = arg_parser.parse_args()
+
+    main(args)
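
form_csv.py reads each valency XML file from --input, collects the per-semantic-role ratio measures for Gigafida 2.0 and ssj500k 2.2, and writes one TSV per corpus and headword into --output. A small sketch exercising the writer on its own; the rows are made up, and the import assumes the scripts directory is on sys.path:

    from form_csv import write_general_statistics  # assumption: scripts/ is importable

    rows = [['ACT', '0.62', '0.58'],   # hypothetical ratios
            ['PAT', '0.38', '0.42']]
    write_general_statistics('govoriti_gf_stats.tsv', rows)
    # -> a tab-separated file: a header line
    #    (semanticRole, valency_pattern_ratio, valency_sentence_ratio)
    #    followed by one row per semantic role
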
@@ -1 +1 @@
-Subproject commit 01adf47b9b63b43f86bff52429792b0de2327ddd
+Subproject commit 92b3ac4ea3a73b93c25b363b5b9cb096d4d011cd