You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
cjvt-valency/scripts/form_csv.py

118 lines
4.9 KiB

import argparse
import csv
import os
from lxml import etree, objectify, html
def write_general_statistics(path, out_list):
    """Write per-semantic-role ratio rows to a tab-separated file.

    No file is created when ``out_list`` is empty.

    Args:
        path: Destination file path; an existing file is overwritten.
        out_list: Rows matching the header columns (semantic role,
            valency pattern ratio, valency sentence ratio).
    """
    if not out_list:
        return
    # newline='' leaves line endings to the csv module (otherwise Windows
    # output ends up with '\r\r\n'); explicit UTF-8 keeps non-ASCII values
    # independent of the locale's default encoding.
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t', quotechar='"')
        writer.writerow(['Semantic role', 'Valency pattern ratio', 'Valency sentence ratio'])
        writer.writerows(out_list)
def write_statistics(path, out_list):
    """Write valency-pattern rows to a tab-separated file.

    No file is created when ``out_list`` is empty.

    Args:
        path: Destination file path; an existing file is overwritten.
        out_list: Rows matching the header columns (pattern id, frequency,
            semantic roles, pattern representation, corpus example).
    """
    if not out_list:
        return
    # newline='' leaves line endings to the csv module (otherwise Windows
    # output ends up with '\r\r\n'); explicit UTF-8 keeps non-ASCII values
    # independent of the locale's default encoding.
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t', quotechar='"')
        writer.writerow(['Valency pattern id', 'Frequency all GF', 'Semantic role', 'Pattern representation', 'Corpus example'])
        writer.writerows(out_list)
_GIGAFIDA = 'Gigafida 2.0'
_SSJ500K = 'ssj500k 2.2'


def _collect_role_statistics(tree):
    """Return ``(gigafida_rows, ssj_rows)`` of per-role ratio rows from *tree*."""
    gf_rows = []
    ssj_rows = []
    for container in tree.iter('statisticsContainer'):
        sem_role = container.find('semanticRole').text
        # Index the measures by (type, source); a later duplicate overrides
        # an earlier one. .get() tolerates children lacking an attribute.
        ratios = {}
        for measure in container.find('measureList'):
            ratios[(measure.get('type'), measure.get('source'))] = measure.text
        for source, rows in ((_GIGAFIDA, gf_rows), (_SSJ500K, ssj_rows)):
            pattern_ratio = ratios.get(('valency_pattern_ratio', source))
            sentence_ratio = ratios.get(('valency_sentence_ratio', source))
            # A row is emitted only when both ratios exist for the source.
            if pattern_ratio is not None and sentence_ratio is not None:
                rows.append([sem_role, pattern_ratio, sentence_ratio])
    return gf_rows, ssj_rows


def _first_corpus_example(pattern):
    """Return the corpusExample of the first example container, or None."""
    node = pattern
    for tag in ('exampleContainerList', 'exampleContainer', 'corpusExample'):
        node = node.find(tag)
        if node is None:
            return None
    return node


def _serialize_without_namespaces(element):
    """Serialize *element* to XML text with namespace prefixes/declarations removed."""
    # Work on a re-parsed copy so the source tree is left untouched and the
    # inherited xmlns:... declarations can be dropped from the output.
    root = etree.fromstring(html.tostring(element, encoding='unicode'))
    for node in root.iter():
        node.tag = etree.QName(node).localname
    etree.cleanup_namespaces(root)
    return etree.tostring(root, encoding='unicode')


def _collect_patterns(tree):
    """Return one output row per valencyPattern in *tree* that has a corpus example."""
    rows = []
    for pattern in tree.iter('valencyPattern'):
        pattern_id = pattern.attrib['id']

        # Frequency: text of the last Gigafida measure, '' when there is none.
        frequency = ''
        for measure in pattern.find('measureList').findall('measure'):
            if measure.get('source') == _GIGAFIDA:
                frequency = measure.text

        role_containers = pattern.find('semanticRoleContainerList').findall('semanticRoleContainer')
        semantic_roles = '_'.join(
            container.find('semanticRole').text for container in role_containers
        )

        pattern_representation = pattern.find('patternRepresentation').text

        # Patterns without a corpus example are left out of the output.
        example = _first_corpus_example(pattern)
        if example is None:
            continue
        corpus_example = _serialize_without_namespaces(example)

        print(f"Valency pattern {pattern_id}")
        rows.append([pattern_id, frequency, semantic_roles, pattern_representation, corpus_example])
    return rows


def main(args):
    """Convert every XML file in ``args.input`` into TSV files in ``args.output``.

    For each input file three files named after the headword lemma may be
    written: ``<lemma>_gf_stats.tsv``, ``<lemma>_ssj_stats.tsv`` and
    ``<lemma>_patterns.tsv``. A file is only written when it has rows.

    Args:
        args: Namespace with ``input`` and ``output`` directory paths.
    """
    # Create the target directory up front so the writers do not fail on it.
    os.makedirs(args.output, exist_ok=True)

    for file in sorted(os.listdir(args.input)):
        path = os.path.join(args.input, file)
        # Sub-directories cannot be parsed as XML; skip them.
        if not os.path.isfile(path):
            continue
        tree = etree.parse(path)

        head = next(tree.iter('head'))
        headword = head.find('headword').find('lemma').text

        gf_output, ssj_output = _collect_role_statistics(tree)

        print(file)

        analyze_output = _collect_patterns(tree)

        write_general_statistics(os.path.join(args.output, headword + '_gf_stats.tsv'), gf_output)
        write_general_statistics(os.path.join(args.output, headword + '_ssj_stats.tsv'), ssj_output)
        write_statistics(os.path.join(args.output, headword + '_patterns.tsv'), analyze_output)
if __name__ == '__main__':
    # Both directories are required: an omitted --input would be None, and
    # os.listdir(None) lists the current working directory instead of failing.
    arg_parser = argparse.ArgumentParser(
        description='Export valency statistics and patterns from XML files to TSV files.')
    arg_parser.add_argument('--input', type=str, required=True, help='Input directory')
    arg_parser.add_argument('--output', type=str, required=True, help='Output directory')
    args = arg_parser.parse_args()

    main(args)