forked from kristjan/cjvt-valency
Added Gigafida fix for multiple senses
This commit is contained in:
parent 75b015dcda
commit 69c3521e4b
@@ -1181,7 +1181,7 @@ def write_xml(headword_category, collection_ssj, collection_gigafida, RF, mongo,
                 for sentence_example in headword_pattern_dict['gf']['sentence_examples']:
                     exampleContainer = lxml.SubElement(exampleContainerList, 'exampleContainer')
                     # corpusExample = lxml.SubElement(exampleContainer, 'corpusExample')
-                    exampleContainer.append(sentence_example)
+                    exampleContainer.append(copy.deepcopy(sentence_example))
     with lxml.xmlfile(os.path.join(args.outdir, 'VS10_' + headword_text + '_' + corpus_name + '.xml'),
                       encoding='utf-8') as xf:
         xf.write(dictionary, pretty_print=True)
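
Why this one-line change fixes the multiple-senses bug: lxml elements keep exactly one parent, so append() on a node that is already attached moves it out of its previous container. When the same Gigafida sentence example is shared by several senses, as the commit title suggests, each exampleContainer steals the node from the previous one and only the last sense keeps its example; copy.deepcopy gives every container its own node. A minimal standalone demonstration of the behaviour:

    from copy import deepcopy
    from lxml import etree

    sent = etree.fromstring('<corpusExample>Primer povedi.</corpusExample>')
    first = etree.Element('exampleContainer')
    second = etree.Element('exampleContainer')

    first.append(sent)              # sent is now a child of `first`
    second.append(sent)             # lxml moves it, so `first` loses the element
    print(len(first), len(second))  # 0 1

    first.append(deepcopy(sent))    # a deep copy leaves `second` untouched
    print(len(first), len(second))  # 1 1
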
@@ -2,7 +2,7 @@ import argparse
 import csv
 import os
 
-from lxml import etree
+from lxml import etree, objectify, html
 
 
 def write_general_statistics(path, out_list):
@@ -11,7 +11,18 @@ def write_general_statistics(path, out_list):
     with open(path, 'w') as csvfile:
         writer = csv.writer(csvfile, delimiter='\t',
                             quotechar='"')
-        writer.writerow(['semanticRole', 'valency_pattern_ratio', 'valency_sentence_ratio'])
+        writer.writerow(['Semantic role', 'Valency pattern ratio', 'Valency sentence ratio'])
+        for line in out_list:
+            writer.writerow(line)
+
+
+def write_statistics(path, out_list):
+    if len(out_list) == 0:
+        return
+    with open(path, 'w') as csvfile:
+        writer = csv.writer(csvfile, delimiter='\t',
+                            quotechar='"')
+        writer.writerow(['Valency pattern id', 'Frequency all GF', 'Semantic role', 'Pattern representation', 'Corpus example'])
         for line in out_list:
             writer.writerow(line)
 
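
For reference, the new write_statistics emits a five-column, tab-separated file. A hypothetical call, with the headword and row values invented for illustration:

    rows = [
        ['vp-1', '42', 'ACT_PAT', '[kdo] dela [kaj]', '<corpusExample>Primer.</corpusExample>'],
    ]
    write_statistics('delati_patterns.tsv', rows)
    # delati_patterns.tsv then contains, tab-separated:
    #   Valency pattern id  Frequency all GF  Semantic role  Pattern representation  Corpus example
    #   vp-1                42                ACT_PAT        [kdo] dela [kaj]        <corpusExample>Primer.</corpusExample>
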
@@ -48,15 +59,53 @@ def main(args):
             if ssj_pattern is not None and ssj_sentence is not None:
                 ssj_output.append([semRole, ssj_pattern, ssj_sentence])
 
+        print(file)
+
+        analyze_output = []
         for elem in tree.iter('valencyPattern'):
             valency_pattern_id = elem.attrib['id']
-            measure = None
-            for measure_el in elem.find('measure'):
+            # get frequency
+            measure = ''
+            for measure_el in elem.find('measureList').findall('measure'):
                 if measure_el.attrib['source'] == 'Gigafida 2.0':
                     measure = measure_el.text
 
+            # get semantic roles
+            semantic_roles_list = []
+            for semantic_rol_con in elem.find('semanticRoleContainerList').findall('semanticRoleContainer'):
+                semantic_roles_list.append(semantic_rol_con.find('semanticRole').text)
+            semantic_roles = '_'.join(semantic_roles_list)
+
+            # pattern representation
+            pattern_representation = elem.find('patternRepresentation').text
+
+            # corpus example
+            if elem.find('exampleContainerList') is not None and elem.find('exampleContainerList').find('exampleContainer') is not None and elem.find('exampleContainerList').find('exampleContainer').find('corpusExample') is not None:
+                corpus_example_text = html.tostring(elem.find('exampleContainerList').find('exampleContainer').find('corpusExample'), encoding='unicode')
+            else:
+                continue
+
+            # ugly postprocessing to remove xmlns:xsi=... duh..
+            root = etree.fromstring(corpus_example_text)
+
+            # Remove namespace prefixes
+            for elem in root.getiterator():
+                elem.tag = etree.QName(elem).localname
+            # Remove unused namespace declarations
+            etree.cleanup_namespaces(root)
+
+            corpus_example = etree.tostring(root, encoding='unicode')
+
+            print(f"Valency pattern {valency_pattern_id}")
+
+            analyze_output.append([valency_pattern_id, measure, semantic_roles, pattern_representation, corpus_example])
+
     write_general_statistics(os.path.join(args.output, headword + '_gf_stats.tsv'), gf_output)
     write_general_statistics(os.path.join(args.output, headword + '_ssj_stats.tsv'), ssj_output)
+    write_statistics(os.path.join(args.output, headword + '_patterns.tsv'), analyze_output)
 
 
 if __name__ == '__main__':
     arg_parser = argparse.ArgumentParser(description='Export and validate collocation data from DDD database.')
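
The corpus-example postprocessing above (safe lookup, then stripping the stray xmlns:xsi declaration) reads as one standalone helper. A minimal sketch under two assumptions: the helper name is invented, find() is given a child path (which returns None if any step is missing and condenses the three-step is-not-None chain), and iter() stands in for the deprecated getiterator():

    from lxml import etree

    def clean_corpus_example(valency_pattern_el):
        example = valency_pattern_el.find('exampleContainerList/exampleContainer/corpusExample')
        if example is None:
            return None
        # re-parse a detached copy so the edits do not touch the source tree
        root = etree.fromstring(etree.tostring(example))
        for el in root.iter():
            el.tag = etree.QName(el).localname  # drop namespace prefixes from tags
        etree.cleanup_namespaces(root)          # drop now-unused xmlns declarations
        return etree.tostring(root, encoding='unicode')
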