From 75b015dcda75037d9a752563a2ab53192f8a1b31 Mon Sep 17 00:00:00 2001
From: Luka
Date: Tue, 8 Dec 2020 08:01:17 +0100
Subject: [PATCH] A couple of fixes to write_xml in create_xml.py + Created
 form_csv.py script

---
 scripts/create_xml.py     | 171 ++++++++++++++++++++++++++++++--------
 scripts/form_csv.py       |  68 +++++++++++++++
 src/pkg/cjvt-corpusparser |   2 +-
 3 files changed, 207 insertions(+), 34 deletions(-)
 create mode 100644 scripts/form_csv.py

diff --git a/scripts/create_xml.py b/scripts/create_xml.py
index 4ab2e69..be0f4ac 100644
--- a/scripts/create_xml.py
+++ b/scripts/create_xml.py
@@ -3,6 +3,7 @@
 
 #imports
 from luscenje_struktur import copy
+import csv
 from luscenje_struktur.progress_bar import progress
 from luscenje_struktur.word import Word, WordCompressed
@@ -114,6 +115,8 @@ CASE_MAP = {
     'i': 'instrumental'
 }
 
+ssj_frequency_dict = {}
+
 Lexeme = None
 LexemeFeature = None
@@ -130,6 +133,7 @@ Definition = None
 WordForm = None
 WordFormFeature = None
 FormRepresentation = None
+FormEncoding = None
 
 
 # corpus = 'gigafida'
@@ -745,7 +749,7 @@ def obtain_xml_data(collection, w_a_collection, headword_text, RF, mongo, patter
 def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj,
               w_a_collection_gigafida, valency_pattern_id_collection, corpus_name, pattern_examples_limit, ignore_gigafida, pbar):
     query_general = session.query(Lexeme.id, LexicalUnitLexeme.id, LexicalUnit.id, LexicalUnitMeasure.value,
-                                  Lexeme.dummy, LexicalUnitType.name) \
+                                  Lexeme.potential_lexeme, LexicalUnitType.name) \
         .join(Category, Category.id == Lexeme.category_id) \
         .join(LexicalUnitLexeme, LexicalUnitLexeme.lexeme_id == Lexeme.id) \
         .join(LexicalUnit, LexicalUnit.id == LexicalUnitLexeme.lexical_unit_id) \
@@ -791,7 +795,8 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
     wf1 = aliased(WordFormFeature)
     wf2 = aliased(WordFormFeature)
     wf3 = aliased(WordFormFeature)
-    query_preposition = session.query(FormRepresentation.form) \
+    query_preposition = session.query(FormEncoding.text) \
+        .join(FormRepresentation, FormRepresentation.id == FormEncoding.form_representation_id) \
         .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
         .join(Lexeme, Lexeme.id == WordForm.lexeme_id) \
         .join(wf1, wf1.word_form_id == WordForm.id) \
@@ -805,7 +810,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
     pattern_translation_3_sin = headword_text
     if len(pattern_translation_hws) == 1:
-        pattern_translation_3_sin = pattern_translation_hws[0].form
+        pattern_translation_3_sin = pattern_translation_hws[0].text
 
     qname = etree.QName("http://www.w3.org/2001/XMLSchema-instance", "noNamespaceSchemaLocation")
     dictionary = lxml.Element('dictionary', {qname: 'valency_lexicon.xsd'})
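The last two hunks above follow a Sloleks schema difference: the surface string is now read from FormEncoding.text instead of FormRepresentation.form, at the cost of one extra join (FormEncoding.form_representation_id -> FormRepresentation.id -> WordForm.id). A self-contained sketch of the assumed relationship, runnable against an in-memory SQLite database; the jedro_formrepresentation and jedro_formencoding table names and the join columns come from this patch, while jedro_wordform and all data values are assumptions for illustration only:

    from sqlalchemy import Column, ForeignKey, Integer, String, create_engine
    from sqlalchemy.orm import Session, declarative_base

    Base = declarative_base()

    class WordForm(Base):                      # table name assumed by analogy
        __tablename__ = 'jedro_wordform'
        id = Column(Integer, primary_key=True)
        lexeme_id = Column(Integer)

    class FormRepresentation(Base):            # table name taken from init_db below
        __tablename__ = 'jedro_formrepresentation'
        id = Column(Integer, primary_key=True)
        word_form_id = Column(Integer, ForeignKey('jedro_wordform.id'))

    class FormEncoding(Base):                  # table name taken from init_db below
        __tablename__ = 'jedro_formencoding'
        id = Column(Integer, primary_key=True)
        form_representation_id = Column(Integer, ForeignKey('jedro_formrepresentation.id'))
        text = Column(String)

    engine = create_engine('sqlite://')
    Base.metadata.create_all(engine)
    with Session(engine) as session:
        session.add_all([WordForm(id=1, lexeme_id=7),
                         FormRepresentation(id=1, word_form_id=1),
                         FormEncoding(id=1, form_representation_id=1, text='na')])
        # Same join chain as the query_preposition change above.
        forms = session.query(FormEncoding.text) \
            .join(FormRepresentation, FormRepresentation.id == FormEncoding.form_representation_id) \
            .join(WordForm, WordForm.id == FormRepresentation.word_form_id) \
            .filter(WordForm.lexeme_id == 7).all()
        print(forms)  # [('na',)]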
@@ -816,6 +821,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
             headword_text_query = headword_text[:-1]
         else:
             headword_text_query = headword_text
+
         query = query_general.filter(Category.name == category_text) \
             .filter(Lexeme.lemma == headword_text_query) \
             .group_by(Lexeme.id, LexicalUnitLexeme.id, LexicalUnit.id, LexicalUnitMeasure.value,
@@ -827,31 +833,91 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
         if len(query_res) == 1:
             (lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, frequency, _, lexical_unit_type_name) = \
                 query_res[0]
+            sense_ids = session.query(Sense.id, Sense.potential_sense).filter(
+                Sense.lexical_unit_id == lexical_unit_id).all()
+            features = session.query(LexemeFeature.value).join(Feature, Feature.id == LexemeFeature.feature_id) \
+                .filter(LexemeFeature.lexeme_id == lexeme_id) \
+                .filter(Feature.name == 'aspect').all()
         elif len(query_res) > 1:
+            # find dummy
+            dummy_query = session.query(Lexeme.id, LexicalUnitLexeme.id, LexicalUnit.id,
+                                        Lexeme.potential_lexeme, LexicalUnitType.name) \
+                .join(Category, Category.id == Lexeme.category_id) \
+                .join(LexicalUnitLexeme, LexicalUnitLexeme.lexeme_id == Lexeme.id) \
+                .join(LexicalUnit, LexicalUnit.id == LexicalUnitLexeme.lexical_unit_id) \
+                .join(LexicalUnitType, LexicalUnitType.id == LexicalUnit.type_id) \
+                .filter(LexicalUnitType.name == 'single_lexeme_unit') \
+                .filter(Corpus.name == 'gigafida') \
+                .filter(Corpus.version == '2.0') \
+                .filter(Lexeme.lemma == headword_text_query).all()
+
+            # all lexical_unit_ids equal or at least one dummy
+            dummy_exists = False
             final_lexical_unit_id = 0
             final_lexical_unit_lexeme_id = 0
-            for r in query_res:
-                (lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, frequency, dummy,
+            for r in dummy_query:
+                (lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, dummy,
                  lexical_unit_type_name) = r
                 if dummy:
                     final_lexical_unit_id = lexical_unit_id
                     final_lexical_unit_lexeme_id = lexical_unit_lexeme_id
+                    dummy_exists = True
                     break
+            assert dummy_exists
+
+            sense_ids = []
+            features_set = set()
+            frequency = 0
+            for r in query_res:
+                (lexeme_id, lexical_unit_lexeme_id, lexical_unit_id, el_frequency, dummy,
+                 lexical_unit_type_name) = r
+                if dummy:
+                    continue
+                sense_ids.extend(session.query(Sense.id, Sense.potential_sense).filter(
+                    Sense.lexical_unit_id == lexical_unit_id).all())
+                features = session.query(LexemeFeature.value).join(Feature, Feature.id == LexemeFeature.feature_id) \
+                    .filter(LexemeFeature.lexeme_id == lexeme_id) \
+                    .filter(Feature.name == 'aspect').all()
+
+                # set features in dictionary
+                if not features:
+                    for n_feat in features_set:
+                        for f in n_feat:
+                            features.add(f)
+                # compare features
+                else:
+                    for n_feat in features_set:
+                        for f in n_feat:
+                            if f not in features:
+                                raise Exception('Different features in query_res - might be problematic!')
+
+
+
+                frequency += el_frequency
+
+            # check if any actual sense exists if not erase all but one
+            any_sense_not_dummy = any([not sense[1] for sense in sense_ids])
+            if not any_sense_not_dummy:
+                sense_ids = sense_ids[-1:]
+
+
             lexical_unit_id = final_lexical_unit_id
             lexical_unit_lexeme_id = final_lexical_unit_lexeme_id
+            # sense_ids = session.query(Sense.id, Sense.potential_sense).filter(
+            #     Sense.lexical_unit_id == lexical_unit_id).all()
+            # features = session.query(LexemeFeature.value).join(Feature, Feature.id == LexemeFeature.feature_id) \
+            #     .filter(LexemeFeature.lexeme_id == lexeme_id) \
+            #     .filter(Feature.name == 'aspect').all()
         else:
-            frequency = 0
+            frequency = None
             lexeme_id = 0
             lexical_unit_id = 0
             lexical_unit_lexeme_id = 0
             lexical_unit_type_name = ''
+            sense_ids = []
+            features = []
 
-        sense_ids = session.query(Sense.id, Sense.dummy).filter(Sense.lexical_unit_id == lexical_unit_id).all()
-        features = session.query(LexemeFeature.value).join(Feature, Feature.id == LexemeFeature.feature_id) \
-            .filter(LexemeFeature.lexeme_id == lexeme_id) \
-            .filter(Feature.name == 'aspect').all()
 
         entry = lxml.SubElement(dictionary, 'entry')
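The large hunk above changes how a headword that maps to more than one Sloleks lexeme is resolved: the row flagged as potential_lexeme ("dummy") decides which lexical unit the entry is attached to, senses are collected only from the non-dummy rows, and their Gigafida frequencies are summed. A simplified, self-contained sketch of that selection over plain tuples; the row shapes mirror dummy_query and query_res, and all values are invented:

    # Row shapes: dummy_query -> (lexeme_id, lu_lexeme_id, lu_id, is_dummy, lu_type)
    #             query_res   -> (lexeme_id, lu_lexeme_id, lu_id, frequency, is_dummy, lu_type)
    def resolve_lexical_unit(dummy_rows, rows):
        target = next(r for r in dummy_rows if r[3])        # the 'dummy' lexeme wins
        frequency = sum(r[3] for r in rows if not r[4])     # sum only the real lexemes
        return target[2], frequency

    unit_id, freq = resolve_lexical_unit(
        [(1, 10, 100, False, 'single_lexeme_unit'), (2, 20, 200, True, 'single_lexeme_unit')],
        [(1, 10, 100, 42.0, False, 'single_lexeme_unit'), (2, 20, 200, 0.0, True, 'single_lexeme_unit')])
    print(unit_id, freq)  # -> 200 42.0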
@@ -875,22 +941,32 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
             category.text = CATEGORY_MAP[category_text] if category_text in CATEGORY_MAP else ''
         else:
             category.text = category_text
-        grammarFeature = lxml.SubElement(grammar, 'grammarFeature')
-        if args.language == 'sl':
-            grammarFeature.set('name', 'vid')
-            grammarFeature.text = ASPECT_MAP[features[0].value] if len(features) > 0 and features[
-                0].value in ASPECT_MAP else ''
-        else:
-            grammarFeature.set('name', 'aspect')
-            grammarFeature.text = features[0].value if len(features) > 0 else ''
+        ssj_frequency = None
+        if len(features) > 0:
+            grammarFeature = lxml.SubElement(grammar, 'grammarFeature')
+            ssj_frequency = ssj_frequency_dict[(headword_text, features[0].value)] if (headword_text, features[0].value) in ssj_frequency_dict else None
+            if args.language == 'sl':
+                grammarFeature.set('name', 'vid')
+                if len(features) > 1:
+                    print(features)
+                grammarFeature.text = ASPECT_MAP[features[0].value]
+            else:
+                grammarFeature.set('name', 'aspect')
+                grammarFeature.text = features[0].value
         measureList = lxml.SubElement(head, 'measureList')
-        measure = lxml.SubElement(measureList, 'measure')
-        measure.set('type', 'frequency')
-        # TODO Modify this!
-        measure.set('source', 'Gigafida 2.0')
-        # measure.set('source', 'ssj500k')
-        measure.text = str(int(frequency))
+        if frequency:
+            measure = lxml.SubElement(measureList, 'measure')
+            measure.set('type', 'frequency')
+            measure.set('source', 'Gigafida 2.0')
+            # measure.set('source', 'ssj500k')
+            measure.text = str(int(frequency))
+
+        if ssj_frequency is not None:
+            measure = lxml.SubElement(measureList, 'measure')
+            measure.set('type', 'frequency')
+            measure.set('source', 'ssj500k 2.2')
+            measure.text = str(int(ssj_frequency))
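With this hunk the frequency measures become conditional: the Gigafida 2.0 measure is only written when a frequency was found, and a second measure sourced from ssj500k 2.2 is added when the (headword, aspect) pair was loaded into ssj_frequency_dict. The conditional-expression lookup is equivalent to a plain dict.get, roughly as below (stand-in values, and plain strings in place of the SQLAlchemy feature rows):

    ssj_frequency_dict = {('narediti', 'perfective'): '457'}   # illustrative content only
    headword_text, features = 'narediti', ['perfective']       # features[0].value in the real code
    ssj_frequency = ssj_frequency_dict.get((headword_text, features[0])) if features else None
    print(ssj_frequency)  # '457'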
@@ -956,18 +1032,26 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
                                                   semantic_role_stats[semanticRole_val]['gf']['valency_sentence_num'] / sentence_tot_gf)
 
         senseList = lxml.SubElement(body, 'senseList')
+
+        # handle cases when headword is not in sloleks
+        if len(sense_ids) == 0:
+            sense_ids = [-1]
+
         for sense_id in sense_ids:
-            if len(sense_ids) > 1 and sense_id.dummy:
+            if len(sense_ids) > 1 and sense_id.potential_sense:
                 continue
             sense = lxml.SubElement(senseList, 'sense')
-            if not sense_id.dummy:
+            if not sense_id == -1 and not sense_id.potential_sense:
                 sense.set('id', str(sense_id.id))
             definitionList = lxml.SubElement(sense, 'definitionList')
-            definition_texts = session.query(Definition.description).filter(
-                Definition.sense_id == sense_id.id).all()
+            if not sense_id == -1:
+                definition_texts = session.query(Definition.description).filter(
+                    Definition.sense_id == sense_id.id).all()
+            else:
+                definition_texts = []
 
             for definition_text in definition_texts:
                 definition = lxml.SubElement(definitionList, 'definition')
@@ -1050,11 +1134,9 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
             for syn_struct_id, component_dict in syntactic_structure_dict.items():
                 syntacticStructure = lxml.SubElement(syntacticStructureList, 'syntacticStructure')
                 syntacticStructure.set('id', syn_struct_id)
+                dedup_dict = {}
                 for comp_id, lexemes in component_dict.items():
                     for l in lexemes:
-                        component = lxml.SubElement(syntacticStructure, 'component')
-                        component.set('num', comp_id)
-                        lexem = lxml.SubElement(component, 'lexeme')
                         if l in preposition_list:
                             prep_id = preposition_list[l]
@@ -1074,6 +1156,13 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
                             prep_id = str(preposition_ids[0][0])
                             preposition_list[l] = prep_id
 
+                        if comp_id in dedup_dict and prep_id in dedup_dict[comp_id] and l[2] in dedup_dict[comp_id][prep_id]:
+                            continue
+
+                        dedup_dict.setdefault(comp_id, {})[prep_id] = l[2]
+                        component = lxml.SubElement(syntacticStructure, 'component')
+                        component.set('num', comp_id)
+                        lexem = lxml.SubElement(component, 'lexeme')
                         lexem.set('sloleks', prep_id)
                         lexem.text = l[2]
@@ -1100,7 +1189,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
 
 
 def init_db(db):
-    global Lexeme, LexemeFeature, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation
+    global Lexeme, LexemeFeature, Feature, LexicalUnitLexeme, LexicalUnit, LexicalUnitType, Category, Sense, Measure, LexicalUnitMeasure, Corpus, Definition, WordForm, WordFormFeature, FormRepresentation, FormEncoding
     [db_user, db_password, db_database, db_host] = db.split(':')
     Base = declarative_base()
     engine = create_engine('postgresql://' + db_user + ':' + db_password + '@' + db_host + '/' + db_database,
@@ -1152,6 +1241,9 @@ def init_db(db):
     class FormRepresentation(Base):
         __table__ = Base.metadata.tables['jedro_formrepresentation']
 
+    class FormEncoding(Base):
+        __table__ = Base.metadata.tables['jedro_formencoding']
+
     return engine
 
 
@@ -1415,6 +1507,14 @@ def get_headword_category(collection):
     return headword_category
 
 
+def read_ssj500k_frequencies(path):
+    with open(path, 'r') as f:
+        reader = csv.reader(f, delimiter='\t')
+        next(reader)
+        for line in reader:
+            ssj_frequency_dict[(line[1], line[-1])] = line[2]
+
+
 def main(args):
     # with Path('data/wordlist.json').open("r") as fp:
     #     sskj_wordlist = json.load(fp)
@@ -1518,7 +1618,9 @@ def main(args):
     print('write_xml')
     start_time = time.time()
-    # print('aa ' + 3)
+    if args.ssj500k_frequencies is not None:
+        read_ssj500k_frequencies(args.ssj500k_frequencies)
+
     with tqdm(total=len(headword_category)) as pbar:
         write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo, session, w_a_collection_ssj,
                   w_a_collection_gigafida, valency_pattern_id_collection, args.corpus_name, args.pattern_examples_limit, args.ignore_gigafida, pbar)
     print(time.time() - start_time)
@@ -1592,6 +1694,9 @@ if __name__ == '__main__':
     arg_parser.add_argument('--pc-tag',
                             help='Tag for separators, usually pc or c', default="pc")
 
+    arg_parser.add_argument('--ssj500k-frequencies',
+                            help='Path to a TSV file with ssj500k 2.2 headword frequencies', default=None)
+
     args = arg_parser.parse_args()
 
     logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())
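read_ssj500k_frequencies and the new --ssj500k-frequencies option imply the expected input layout: a tab-separated file with a header row, the lemma in the second column, its frequency in the third, and the aspect value in the last column; nothing else is read. A minimal runnable sketch with invented file content and invented column names:

    import csv, io

    # Hypothetical TSV; only positions 1 (lemma), 2 (frequency) and -1 (aspect) matter.
    sample = 'id\tlemma\tfrequency\taspect\n1\tnarediti\t457\tperfective\n'
    ssj_frequency_dict = {}
    reader = csv.reader(io.StringIO(sample), delimiter='\t')
    next(reader)                     # skip the header row, as read_ssj500k_frequencies does
    for line in reader:
        ssj_frequency_dict[(line[1], line[-1])] = line[2]
    print(ssj_frequency_dict)        # {('narediti', 'perfective'): '457'}

Note that the frequency stays a string here; create_xml.py only converts it with str(int(ssj_frequency)) when it writes the measure element.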
diff --git a/scripts/form_csv.py b/scripts/form_csv.py
new file mode 100644
index 0000000..3b2fa6c
--- /dev/null
+++ b/scripts/form_csv.py
@@ -0,0 +1,68 @@
+import argparse
+import csv
+import os
+
+from lxml import etree
+
+
+def write_general_statistics(path, out_list):
+    if len(out_list) == 0:
+        return
+    with open(path, 'w') as csvfile:
+        writer = csv.writer(csvfile, delimiter='\t',
+                            quotechar='"')
+        writer.writerow(['semanticRole', 'valency_pattern_ratio', 'valency_sentence_ratio'])
+        for line in out_list:
+            writer.writerow(line)
+
+
+def main(args):
+    for file in sorted(os.listdir(args.input)):
+        path = os.path.join(args.input, file)
+        tree = etree.parse(path)
+        gf_output = []
+        ssj_output = []
+        head = next(tree.iter('head'))
+        headword = head.find('headword').find('lemma').text
+        #for div in root.iterfind('.//div'):
+        for elem in tree.iter('statisticsContainer'):
+        # for element in tree.iterfind('statisticsContainer'):
+        # for element in tree.find('statisticsContainer'):
+            semRole = elem.find('semanticRole').text
+            gf_pattern = None
+            gf_sentence = None
+            ssj_pattern = None
+            ssj_sentence = None
+            measure = elem.find('measureList')
+            for el in measure:
+                if el.attrib['type'] == 'valency_pattern_ratio' and el.attrib['source'] == 'Gigafida 2.0':
+                    gf_pattern = el.text
+                if el.attrib['type'] == 'valency_sentence_ratio' and el.attrib['source'] == 'Gigafida 2.0':
+                    gf_sentence = el.text
+                if el.attrib['type'] == 'valency_pattern_ratio' and el.attrib['source'] == 'ssj500k 2.2':
+                    ssj_pattern = el.text
+                if el.attrib['type'] == 'valency_sentence_ratio' and el.attrib['source'] == 'ssj500k 2.2':
+                    ssj_sentence = el.text
+            if gf_pattern is not None and gf_sentence is not None:
+                gf_output.append([semRole, gf_pattern, gf_sentence])
+            if ssj_pattern is not None and ssj_sentence is not None:
+                ssj_output.append([semRole, ssj_pattern, ssj_sentence])
+
+        for elem in tree.iter('valencyPattern'):
+            valency_pattern_id = elem.attrib['id']
+            measure = None
+            for measure_el in elem.find('measure'):
+                if measure_el.attrib['source'] == 'Gigafida 2.0':
+                    measure = measure_el.text
+
+        write_general_statistics(os.path.join(args.output, headword + '_gf_stats.tsv'), gf_output)
+        write_general_statistics(os.path.join(args.output, headword + '_ssj_stats.tsv'), ssj_output)
+
+if __name__ == '__main__':
+    arg_parser = argparse.ArgumentParser(description='Extract per-headword semantic-role statistics from valency lexicon XML files into TSV files.')
+    arg_parser.add_argument('--input', type=str, help='Input directory')
+    arg_parser.add_argument('--output', type=str, help='Output directory')
+
+    args = arg_parser.parse_args()
+
+    main(args)
diff --git a/src/pkg/cjvt-corpusparser b/src/pkg/cjvt-corpusparser
index 01adf47..92b3ac4 160000
--- a/src/pkg/cjvt-corpusparser
+++ b/src/pkg/cjvt-corpusparser
@@ -1 +1 @@
-Subproject commit 01adf47b9b63b43f86bff52429792b0de2327ddd
+Subproject commit 92b3ac4ea3a73b93c25b363b5b9cb096d4d011cd
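form_csv.py walks a directory of entry XML files produced by create_xml.py and, per headword, writes the semanticRole ratios into two TSV files, one for Gigafida 2.0 and one for ssj500k 2.2. A typical invocation, with hypothetical directory names:

    # --input: directory of entry XML files; --output: directory for the TSV files
    python scripts/form_csv.py --input entries_xml/ --output stats_tsv/

For a headword such as 'narediti' this produces narediti_gf_stats.tsv and narediti_ssj_stats.tsv, each with the columns semanticRole, valency_pattern_ratio and valency_sentence_ratio; a file is only written when the corresponding measures were present in the XML.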