A couple of fixes on write_xml in create_xml.py + Created form_csv.py script
This commit is contained in:
@@ -0,0 +1,68 @@
|
||||
import argparse
|
||||
import csv
|
||||
import os
|
||||
|
||||
from lxml import etree
|
||||
|
||||
|
||||
def write_general_statistics(path, out_list):
    """Write semantic-role statistics rows to a tab-separated file.

    A header row is written first, followed by one line per entry of
    ``out_list``. When ``out_list`` is empty nothing is written and no
    file is created.

    Args:
        path: Destination file path; an existing file is overwritten.
        out_list: Sequence of rows, each of the form
            ``[semanticRole, valency_pattern_ratio, valency_sentence_ratio]``.
    """
    if not out_list:
        return
    # newline='' lets the csv module control line endings itself (otherwise
    # Windows output ends up with '\r\r\n'); the explicit encoding keeps
    # non-ASCII text independent of the platform locale.
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t', quotechar='"')
        writer.writerow(['semanticRole', 'valency_pattern_ratio', 'valency_sentence_ratio'])
        writer.writerows(out_list)
|
||||
|
||||
|
||||
def main(args):
    """Export per-headword valency statistics from XML files to TSV files.

    Every regular file in ``args.input`` is parsed as XML. For each
    ``statisticsContainer`` element the semantic role is read together with
    the ``valency_pattern_ratio`` and ``valency_sentence_ratio`` measures of
    its ``measureList``, separately for the 'Gigafida 2.0' and 'ssj500k 2.2'
    sources. A row is emitted for a source only when both ratios are
    present. The rows are written to ``<headword>_gf_stats.tsv`` and
    ``<headword>_ssj_stats.tsv`` inside ``args.output``.

    Args:
        args: Namespace with ``input`` (directory containing the XML files)
            and ``output`` (directory receiving the TSV files) attributes.
    """
    # Maps the ``source`` attribute of a measure to the output it feeds.
    sources = {'Gigafida 2.0': 'gf', 'ssj500k 2.2': 'ssj'}
    ratio_types = ('valency_pattern_ratio', 'valency_sentence_ratio')

    for filename in sorted(os.listdir(args.input)):
        path = os.path.join(args.input, filename)
        if not os.path.isfile(path):
            # Sub-directories cannot be parsed as XML documents.
            continue
        tree = etree.parse(path)
        output = {'gf': [], 'ssj': []}
        head = next(tree.iter('head'))
        headword = head.find('headword').find('lemma').text

        for elem in tree.iter('statisticsContainer'):
            sem_role = elem.find('semanticRole').text
            # ratios[corpus][ratio type] -> text of the matching measure;
            # a later matching measure overrides an earlier one.
            ratios = {corpus: {} for corpus in output}
            for el in elem.find('measureList'):
                corpus = sources.get(el.get('source'))
                ratio_type = el.get('type')
                if corpus is not None and ratio_type in ratio_types:
                    ratios[corpus][ratio_type] = el.text

            for corpus, found in ratios.items():
                pattern = found.get('valency_pattern_ratio')
                sentence = found.get('valency_sentence_ratio')
                # Only complete rows (both ratios available) are exported.
                if pattern is not None and sentence is not None:
                    output[corpus].append([sem_role, pattern, sentence])

        write_general_statistics(os.path.join(args.output, headword + '_gf_stats.tsv'), output['gf'])
        write_general_statistics(os.path.join(args.output, headword + '_ssj_stats.tsv'), output['ssj'])
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point. Both directories are mandatory: an omitted
    # option would reach main() as None, which os.listdir() treats as the
    # current directory and os.path.join() rejects with a TypeError.
    arg_parser = argparse.ArgumentParser(
        description='Export valency statistics from XML files to TSV files.')
    arg_parser.add_argument('--input', type=str, required=True, help='Input directory')
    arg_parser.add_argument('--output', type=str, required=True, help='Output directory')

    args = arg_parser.parse_args()

    main(args)
|
||||
Reference in New Issue
Block a user