"""Extract valency statistics and pattern data from XML lexicon files into TSV files.

For every XML file in the input directory, emits up to three TSVs named after
the entry's headword: Gigafida statistics, ssj500k statistics, and per-pattern rows.
"""

import argparse
import csv
import os

# NOTE(review): `objectify` is imported but never used in this file — kept to
# preserve the original import surface.
from lxml import etree, html, objectify


def write_general_statistics(path, out_list):
    """Write per-semantic-role ratio statistics to a tab-separated file.

    Each row of `out_list` is [semantic_role, pattern_ratio, sentence_ratio].
    Does nothing (creates no file) when `out_list` is empty.
    """
    if not out_list:
        return
    # newline='' is required by the csv module; without it Windows gets blank rows.
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t', quotechar='"')
        writer.writerow(['Semantic role', 'Valency pattern ratio', 'Valency sentence ratio'])
        writer.writerows(out_list)


def write_statistics(path, out_list):
    """Write one row per valency pattern to a tab-separated file.

    Each row of `out_list` is
    [pattern_id, frequency, semantic_roles, representation, corpus_example].
    Does nothing (creates no file) when `out_list` is empty.
    """
    if not out_list:
        return
    # newline='' is required by the csv module; without it Windows gets blank rows.
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t', quotechar='"')
        writer.writerow(['Valency pattern id', 'Frequency all GF', 'Semantic role',
                         'Pattern representation', 'Corpus example'])
        writer.writerows(out_list)


def _strip_namespaces(xml_text):
    """Re-serialize an XML fragment with namespace prefixes and declarations removed.

    Ugly postprocessing to get rid of xmlns:xsi=... noise in serialized examples.
    """
    root = etree.fromstring(xml_text)
    # getiterator() is deprecated in lxml; iter() is the supported equivalent.
    for node in root.iter():
        node.tag = etree.QName(node).localname
    etree.cleanup_namespaces(root)
    return etree.tostring(root, encoding='unicode')


def main(args):
    """Process every file in args.input and write the per-headword TSVs to args.output.

    NOTE(review): assumes every directory entry is a parseable XML lexicon file
    with at least one <head>/<headword>/<lemma> — confirm against the data layout.
    """
    for file_name in sorted(os.listdir(args.input)):
        path = os.path.join(args.input, file_name)
        tree = etree.parse(path)

        head = next(tree.iter('head'))
        headword = head.find('headword').find('lemma').text

        # --- general per-semantic-role ratio statistics -----------------------
        gf_output = []
        ssj_output = []
        for container in tree.iter('statisticsContainer'):
            sem_role = container.find('semanticRole').text
            gf_pattern = gf_sentence = ssj_pattern = ssj_sentence = None
            for measure_el in container.find('measureList'):
                m_type = measure_el.attrib['type']
                source = measure_el.attrib['source']
                if source == 'Gigafida 2.0':
                    if m_type == 'valency_pattern_ratio':
                        gf_pattern = measure_el.text
                    elif m_type == 'valency_sentence_ratio':
                        gf_sentence = measure_el.text
                elif source == 'ssj500k 2.2':
                    if m_type == 'valency_pattern_ratio':
                        ssj_pattern = measure_el.text
                    elif m_type == 'valency_sentence_ratio':
                        ssj_sentence = measure_el.text
            # Only emit a row when both ratios for that corpus are present.
            if gf_pattern is not None and gf_sentence is not None:
                gf_output.append([sem_role, gf_pattern, gf_sentence])
            if ssj_pattern is not None and ssj_sentence is not None:
                ssj_output.append([sem_role, ssj_pattern, ssj_sentence])
        print(file_name)

        # --- per-valency-pattern rows -----------------------------------------
        analyze_output = []
        for pattern in tree.iter('valencyPattern'):
            valency_pattern_id = pattern.attrib['id']

            # Frequency in Gigafida 2.0; stays '' when no such measure exists.
            measure = ''
            for measure_el in pattern.find('measureList').findall('measure'):
                if measure_el.attrib['source'] == 'Gigafida 2.0':
                    measure = measure_el.text

            roles = [
                role_container.find('semanticRole').text
                for role_container in pattern.find('semanticRoleContainerList')
                                             .findall('semanticRoleContainer')
            ]
            semantic_roles = '_'.join(roles)

            pattern_representation = pattern.find('patternRepresentation').text

            # Patterns without a corpus example are skipped entirely (as before).
            corpus_example_el = pattern.find(
                'exampleContainerList/exampleContainer/corpusExample')
            if corpus_example_el is None:
                continue
            corpus_example = _strip_namespaces(
                html.tostring(corpus_example_el, encoding='unicode'))

            print(f"Valency pattern {valency_pattern_id}")
            analyze_output.append([valency_pattern_id, measure, semantic_roles,
                                   pattern_representation, corpus_example])

        write_general_statistics(
            os.path.join(args.output, headword + '_gf_stats.tsv'), gf_output)
        write_general_statistics(
            os.path.join(args.output, headword + '_ssj_stats.tsv'), ssj_output)
        write_statistics(
            os.path.join(args.output, headword + '_patterns.tsv'), analyze_output)


if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser(
        description='Export and validate collocation data from DDD database.')
    arg_parser.add_argument('--input', type=str, help='Input directory')
    arg_parser.add_argument('--output', type=str, help='Output directory')
    args = arg_parser.parse_args()
    main(args)