From 530b6efe48bc090110d72a20d78b618b443c5cb3 Mon Sep 17 00:00:00 2001
From: Luka
Date: Tue, 29 Jun 2021 09:54:10 +0200
Subject: [PATCH] Added solar2svala and tag_selection from Solar

---
 .gitignore       |   2 +
 solar2svala.py   | 238 +++++++++++++++++++++++++++++++++++++++++++++++
 tag_selection.py |  73 +++++++++++++++
 3 files changed, 313 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 solar2svala.py
 create mode 100644 tag_selection.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..dddcc6c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.idea/
+data/
diff --git a/solar2svala.py b/solar2svala.py
new file mode 100644
index 0000000..d2a74f5
--- /dev/null
+++ b/solar2svala.py
@@ -0,0 +1,238 @@
+import argparse
+import copy
+import logging
+import os
+import shutil
+import time
+from xml.etree import ElementTree
+import json
+
+logging.basicConfig(level=logging.INFO)
+
+
+def add_token(ind, el, source, target, edges):
+    source_id = "s" + ind
+    source.append({"id": source_id, "text": el.text + " "})
+    target_id = "t" + ind
+    target.append({"id": target_id, "text": el.text + " "})
+    edge_id = "e-" + source_id + "-" + target_id
+    edges[edge_id] = {"id": edge_id, "ids": [source_id, target_id], "labels": [], "manual": False}
+
+
+def add_errors(i, error, source, target, edges):
+    source_edge_ids = []
+    target_edge_ids = []
+    podtip = error.attrib['podtip'] if 'podtip' in error.attrib else ''
+
+    label = error.attrib['tip'] + '/' + podtip + '/' + error.attrib['kat']
+
+    labels = [label]
+
+    word_combination_L1 = ''
+    word_combination_L2 = None
+    word_combination_L3 = None
+    word_combination_L4 = None
+    word_combination_L5 = None
+
+    label_L2 = ''
+    label_L3 = ''
+    label_L4 = ''
+    label_L5 = ''
+
+    has_error = False
+
+    # solar5.7
+    for el in error:
+        if el.tag.startswith('w') or el.tag.startswith('pc'):
+            ind = str(i)
+
+            source_id = "s" + ind
+            source.append({"id": source_id, "text": el.text + " "})
+            source_edge_ids.append(source_id)
+            i += 1
+
+        elif el.tag.startswith('p'):
+            for p_el in el:
+                if p_el.tag.startswith('w') or p_el.tag.startswith('pc'):
+                    ind = str(i)
+
+                    target_id = "t" + ind
+                    target.append({"id": target_id, "text": p_el.text + " "})
+                    target_edge_ids.append(target_id)
+                    word_combination_L1 += p_el.text + " "
+                    i += 1
+
+        elif el.tag.startswith('u2'):
+            word_combination_L2 = ''
+            podtip = el.attrib['podtip'] if 'podtip' in el.attrib else ''
+            label_L2 = el.attrib['tip'] + '/' + podtip + '/' + el.attrib['kat']
+            for el_l2 in el:
+                if el_l2.tag.startswith('w') or el_l2.tag.startswith('pc'):
+                    ind = str(i)
+
+                    source_id = "s" + ind
+                    source.append({"id": source_id, "text": el_l2.text + " "})
+                    source_edge_ids.append(source_id)
+                    i += 1
+
+                elif el_l2.tag.startswith('p'):
+                    for p_el_l2 in el_l2:
+                        if p_el_l2.tag.startswith('w') or p_el_l2.tag.startswith('pc'):
+                            word_combination_L2 += p_el_l2.text + " "
+
+
+                elif el_l2.tag.startswith('u3'):
+                    word_combination_L3 = ''
+                    podtip = el_l2.attrib['podtip'] if 'podtip' in el_l2.attrib else ''
+                    label_L3 = el_l2.attrib['tip'] + '/' + podtip + '/' + el_l2.attrib['kat']
+                    for el_l3 in el_l2:
+                        if el_l3.tag.startswith('w') or el_l3.tag.startswith('pc'):
+                            ind = str(i)
+
+                            source_id = "s" + ind
+                            source.append({"id": source_id, "text": el_l3.text + " "})
+                            source_edge_ids.append(source_id)
+                            i += 1
+
+                        elif el_l3.tag.startswith('p'):
+                            for p_el_l3 in el_l3:
+                                if p_el_l3.tag.startswith('w') or p_el_l3.tag.startswith('pc'):
+                                    word_combination_L3 += p_el_l3.text + " "
+
+                        elif el_l3.tag.startswith('u4'):
+                            word_combination_L4 = ''
+                            podtip = el_l3.attrib['podtip'] if 'podtip' in el_l3.attrib else ''
+                            label_L4 = el_l3.attrib['tip'] + '/' + podtip + '/' + el_l3.attrib['kat']
+                            for el_l4 in el_l3:
+                                if el_l4.tag.startswith('w') or el_l4.tag.startswith('pc'):
+                                    ind = str(i)
+
+                                    source_id = "s" + ind
+                                    source.append({"id": source_id, "text": el_l4.text + " "})
+                                    source_edge_ids.append(source_id)
+                                    i += 1
+
+                                elif el_l4.tag.startswith('p'):
+                                    for p_el_l4 in el_l4:
+                                        if p_el_l4.tag.startswith('w') or p_el_l4.tag.startswith('pc'):
+                                            word_combination_L4 += p_el_l4.text + " "
+
+                                elif el_l4.tag.startswith('u5'):
+                                    word_combination_L5 = ''
+                                    podtip = el_l4.attrib['podtip'] if 'podtip' in el_l4.attrib else ''
+                                    label_L5 = el_l4.attrib['tip'] + '/' + podtip + '/' + el_l4.attrib['kat']
+                                    for el_l5 in el_l4:
+                                        if el_l5.tag.startswith('w') or el_l5.tag.startswith('pc'):
+                                            ind = str(i)
+
+                                            source_id = "s" + ind
+                                            source.append({"id": source_id, "text": el_l5.text + " "})
+                                            source_edge_ids.append(source_id)
+                                            i += 1
+
+                                        elif el_l5.tag.startswith('p'):
+                                            for p_el_l5 in el_l5:
+                                                if p_el_l5.tag.startswith('w') or p_el_l5.tag.startswith('pc'):
+                                                    word_combination_L5 += p_el_l5.text + " "
+            for p_el in el:
+                if p_el.tag.startswith('w') or p_el.tag.startswith('pc'):
+                    ind = str(i)
+
+                    target_id = "t" + ind
+                    target.append({"id": target_id, "text": p_el.text + " "})
+                    target_edge_ids.append(target_id)
+                    i += 1
+
+    if word_combination_L1 == word_combination_L2 and word_combination_L2 is not None:
+        labels.append(label_L2)
+        if word_combination_L1 == word_combination_L3 and word_combination_L3 is not None:
+            labels.append(label_L3)
+            if word_combination_L1 == word_combination_L4 and word_combination_L4 is not None:
+                labels.append(label_L4)
+                if word_combination_L1 == word_combination_L5 and word_combination_L5 is not None:
+                    labels.append(label_L5)
+                elif word_combination_L5 is not None:
+                    has_error = True
+            elif word_combination_L4 is not None:
+                has_error = True
+        elif word_combination_L3 is not None:
+            has_error = True
+    elif word_combination_L2 is not None:
+        has_error = True
+    edge_ids = sorted(source_edge_ids) + sorted(target_edge_ids)
+    edge_id = "e-" + "-".join(edge_ids)
+    edges[edge_id] = {"id": edge_id, "ids": edge_ids, "labels": labels, "manual": True}
+
+    return i, has_error
+
+
+def process_file(et, args):
+    if os.path.exists(args.output_folder):
+        shutil.rmtree(args.output_folder)
+    if os.path.exists(args.error_folder):
+        shutil.rmtree(args.error_folder)
+    os.mkdir(args.output_folder)
+    os.mkdir(args.error_folder)
+    for div in et.iter('div'):
+        bibl = div.find('bibl')
+        file_name = bibl.get('n')
+        file_name = file_name.replace('/', '_')
+        output_folder_loc = os.path.join(args.output_folder, file_name)
+        error_folder_loc = os.path.join(args.error_folder, file_name)
+
+        paragraphs = div.findall('p')
+        for paragraph in paragraphs:
+            sentences = paragraph.findall('s')
+            i = 1
+            source = []
+            target = []
+            edges = {}
+            paragraph_error = False
+            for sentence in sentences:
+                for el in sentence:
+                    if el.tag.startswith('w'):
+                        add_token(str(i), el, source, target, edges)
+                        i += 1
+                    elif el.tag.startswith('pc'):
+                        add_token(str(i), el, source, target, edges)
+                        i += 1
+                    elif el.tag.startswith('u'):
+                        i, has_error = add_errors(i, el, source, target, edges)
+                        if has_error:
+                            paragraph_error = True
+
+            dictionary = {"source": source, "target": target, "edges": edges}
+
+            if not paragraph_error:
+                if not os.path.exists(output_folder_loc):
+                    os.mkdir(output_folder_loc)
+                with open(os.path.join(output_folder_loc, paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + '.json'), 'w') as wf:
+                    json.dump(dictionary, wf, ensure_ascii=False, indent="")
+            else:
+                if not os.path.exists(error_folder_loc):
+                    os.mkdir(error_folder_loc)
+                with open(os.path.join(error_folder_loc, paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + '.json'), 'w') as wf:
+                    json.dump(dictionary, wf, ensure_ascii=False, indent="")
+
+
+def main(args):
+    with open(args.input_file, 'r') as fp:
+        logging.info(args.input_file)
+        et = ElementTree.XML(fp.read())
+        process_file(et, args)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='Read already processed xmls, erase entries without examples and limit gigafida examples to 1 per entry.')
+    parser.add_argument('--input_file', default='data/Solar2.0/solar2.xml',
+                        help='input file in (gz or xml currently). If none, then just database is loaded')
+    parser.add_argument('--output_folder', default='data/solar.svala',
+                        help='input file in (gz or xml currently). If none, then just database is loaded')
+    parser.add_argument('--error_folder', default='data/solar.svala.error',
+                        help='input file in (gz or xml currently). If none, then just database is loaded')
+    args = parser.parse_args()
+
+    start = time.time()
+    main(args)
+    logging.info("TIME: {}".format(time.time() - start))
diff --git a/tag_selection.py b/tag_selection.py
new file mode 100644
index 0000000..93feec6
--- /dev/null
+++ b/tag_selection.py
@@ -0,0 +1,73 @@
+import argparse
+import copy
+import logging
+import os
+import time
+from xml.etree import ElementTree
+
+logging.basicConfig(level=logging.INFO)
+
+
+def process_file(et):
+    errors = {}
+    L1_num = 0
+    L2_num = 0
+    L3_num = 0
+    L4_num = 0
+    L5_num = 0
+    for div in et.iter('div'):
+        bibl = div.find('bibl')
+        file_name = bibl.get('n')
+        paragraphs = div.findall('p')
+        for paragraph in paragraphs:
+            sentences = paragraph.findall('s')
+            for sentence in sentences:
+                sent_id = sentence.get('{http://www.w3.org/XML/1998/namespace}id')
+                errorsL1 = sentence.findall('u1')
+                for errorL1 in errorsL1:
+                    errors.setdefault((errorL1.get('kat'), errorL1.get('tip'), errorL1.get('podtip')), []).append([file_name, sent_id])
+                    errorsL2 = errorL1.findall('u2')
+                    L1_num += 1
+                    for errorL2 in errorsL2:
+                        errors.setdefault((errorL2.get('kat'), errorL2.get('tip'), errorL2.get('podtip')), []).append([file_name, sent_id])
+                        errorsL3 = errorL2.findall('u3')
+                        L2_num += 1
+                        for errorL3 in errorsL3:
+                            errors.setdefault((errorL3.get('kat'), errorL3.get('tip'), errorL3.get('podtip')), []).append([file_name, sent_id])
+                            errorsL4 = errorL3.findall('u4')
+                            L3_num += 1
+                            for errorL4 in errorsL4:
+                                errors.setdefault((errorL4.get('kat'), errorL4.get('tip'), errorL4.get('podtip')), []).append([file_name, sent_id])
+                                errorsL5 = errorL4.findall('u5')
+                                L4_num += 1
+                                for errorL5 in errorsL5:
+                                    errors.setdefault((errorL5.get('kat'), errorL5.get('tip'), errorL5.get('podtip')), []).append([file_name, sent_id])
+                                    L5_num += 1
+    print(f'L1: {L1_num}|L2: {L2_num}|L3: {L3_num}|L4: {L4_num}|L5: {L5_num}|')
+    text = ''
+    for k, v in errors.items():
+        for el in v:
+            text += f'{k[0]}\t{k[1]}\t{k[2]}\t{el[0]}\t{el[1]}\n'
+
+    return text
+
+
+def main(args):
+    with open(args.input_file, 'r') as fp, open(args.output_file, 'w') as wf:
+        logging.info(args.input_file)
+        et = ElementTree.XML(fp.read())
+        wf.write(process_file(et))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='Read already processed xmls, erase entries without examples and limit gigafida examples to 1 per entry.')
+    parser.add_argument('--input_file', default='data/Solar2.0/solar2.xml',
+                        help='input file in (gz or xml currently). If none, then just database is loaded')
+    parser.add_argument('--output_file', default='data/tags.tsv',
+                        help='input file in (gz or xml currently). If none, then just database is loaded')
+    args = parser.parse_args()
+
+    start = time.time()
+    main(args)
+    logging.info("TIME: {}".format(time.time() - start))