diff --git a/scripts/create_xml.py b/scripts/create_xml.py
index be0f4ac..4ab72a7 100644
--- a/scripts/create_xml.py
+++ b/scripts/create_xml.py
@@ -1181,7 +1181,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
             for sentence_example in headword_pattern_dict['gf']['sentence_examples']:
                 exampleContainer = lxml.SubElement(exampleContainerList, 'exampleContainer')
                 # corpusExample = lxml.SubElement(exampleContainer, 'corpusExample')
-                exampleContainer.append(sentence_example)
+                exampleContainer.append(copy.deepcopy(sentence_example))
 
     with lxml.xmlfile(os.path.join(args.outdir, 'VS10_' + headword_text + '_' + corpus_name + '.xml'), encoding='utf-8') as xf:
         xf.write(dictionary, pretty_print=True)
diff --git a/scripts/form_csv.py b/scripts/form_csv.py
index 3b2fa6c..7c42d75 100644
--- a/scripts/form_csv.py
+++ b/scripts/form_csv.py
@@ -2,7 +2,7 @@ import argparse
 import csv
 import os
 
-from lxml import etree
+from lxml import etree, objectify, html
 
 
 def write_general_statistics(path, out_list):
@@ -11,7 +11,18 @@ def write_general_statistics(path, out_list):
     with open(path, 'w') as csvfile:
         writer = csv.writer(csvfile, delimiter='\t',
                             quotechar='"')
-        writer.writerow(['semanticRole', 'valency_pattern_ratio', 'valency_sentence_ratio'])
+        writer.writerow(['Semantic role', 'Valency pattern ratio', 'Valency sentence ratio'])
+        for line in out_list:
+            writer.writerow(line)
+
+
+def write_statistics(path, out_list):
+    if len(out_list) == 0:
+        return
+    with open(path, 'w') as csvfile:
+        writer = csv.writer(csvfile, delimiter='\t',
+                            quotechar='"')
+        writer.writerow(['Valency pattern id', 'Frequency all GF', 'Semantic role', 'Pattern representation', 'Corpus example'])
         for line in out_list:
             writer.writerow(line)
 
@@ -48,15 +59,51 @@ def main(args):
             if ssj_pattern is not None and ssj_sentence is not None:
                 ssj_output.append([semRole, ssj_pattern, ssj_sentence])
 
+        print(file)
+
+        analyze_output = []
         for elem in tree.iter('valencyPattern'):
             valency_pattern_id = elem.attrib['id']
-            measure = None
-            for measure_el in elem.find('measure'):
+
+            # get frequency
+            measure = ''
+            for measure_el in elem.find('measureList').findall('measure'):
                 if measure_el.attrib['source'] == 'Gigafida 2.0':
                     measure = measure_el.text
+            # get semantic roles
+            semantic_roles_list = []
+            for semantic_role_con in elem.find('semanticRoleContainerList').findall('semanticRoleContainer'):
+                semantic_roles_list.append(semantic_role_con.find('semanticRole').text)
+            semantic_roles = '_'.join(semantic_roles_list)
+
+            # pattern representation
+            pattern_representation = elem.find('patternRepresentation').text
+
+            # corpus example
+            if elem.find('exampleContainerList') is not None and elem.find('exampleContainerList').find('exampleContainer') is not None and elem.find('exampleContainerList').find('exampleContainer').find('corpusExample') is not None:
+                corpus_example_text = html.tostring(elem.find('exampleContainerList').find('exampleContainer').find('corpusExample'), encoding='unicode')
+            else:
+                continue
+
+            # post-processing: strip namespace declarations (xmlns:xsi=...) from the serialised example
+            root = etree.fromstring(corpus_example_text)
+
+            # Remove namespace prefixes
+            for el in root.iter():
+                el.tag = etree.QName(el).localname
+            # Remove unused namespace declarations
+            etree.cleanup_namespaces(root)
+
+            corpus_example = etree.tostring(root, encoding='unicode')
+
+            print(f"Valency pattern {valency_pattern_id}")
+
+            analyze_output.append([valency_pattern_id, measure, semantic_roles, pattern_representation, corpus_example])
+
         write_general_statistics(os.path.join(args.output, headword + '_gf_stats.tsv'), gf_output)
         write_general_statistics(os.path.join(args.output, headword + '_ssj_stats.tsv'), ssj_output)
+        write_statistics(os.path.join(args.output, headword + '_patterns.tsv'), analyze_output)
 
 
 if __name__ == '__main__':
     arg_parser = argparse.ArgumentParser(description='Export and validate collocation data from DDD database.')
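
Note on the two lxml behaviours this patch leans on, as a minimal standalone sketch (not part of the diff; the element names and sample strings are made up for illustration). First, Element.append() moves a node that already has a parent, so without copy.deepcopy each sentence_example would be pulled out of its source tree mid-iteration; this assumes create_xml.py already imports copy, which the diff does not add. Second, the namespace clean-up in form_csv.py is the common lxml idiom: rewrite each tag to its local name, then let cleanup_namespaces() drop the now-unused declarations.

    import copy

    from lxml import etree

    # 1) append() would MOVE the <s> element out of its original tree;
    #    deepcopy keeps the source intact for later iterations.
    source = etree.fromstring('<sentenceExamples><s>Prvi primer.</s></sentenceExamples>')
    container = etree.Element('exampleContainer')
    container.append(copy.deepcopy(source.find('s')))
    assert source.find('s') is not None  # would fail without deepcopy

    # 2) strip namespace prefixes: rewrite every tag to its local name,
    #    then remove the declarations that are no longer referenced.
    example = etree.fromstring(
        '<x:corpusExample xmlns:x="http://example.org/ns">besedilo</x:corpusExample>')
    for el in example.iter():
        el.tag = etree.QName(el).localname
    etree.cleanup_namespaces(example)
    print(etree.tostring(example, encoding='unicode'))
    # -> <corpusExample>besedilo</corpusExample>

A round-trip through serialisation, as form_csv.py does with html.tostring() followed by etree.fromstring(), is not strictly required for this idiom; retagging works on the element in place as well.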