Added solar2svala and tag_selection from Solar
This commit is contained in:
commit
530b6efe48
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
.idea/
|
||||||
|
data/
|
238
solar2svala.py
Normal file
238
solar2svala.py
Normal file
|
@ -0,0 +1,238 @@
|
||||||
|
import argparse
|
||||||
|
import copy
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import time
|
||||||
|
from xml.etree import ElementTree
|
||||||
|
import json
|
||||||
|
|
||||||
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
|
||||||
|
|
||||||
|
def add_token(ind, el, source, target, edges):
    """Register an unchanged token on both sides and link the two copies.

    Args:
        ind: Token index as a string; it is appended to the "s"/"t" prefixes
            to form the source and target ids.
        el: Element whose ``text`` is the token; a trailing space is added.
        source: List that receives the source-side token dict.
        target: List that receives the target-side token dict.
        edges: Dict that receives the edge joining both tokens, keyed by the
            edge id.

    The edge is stored without labels and with ``manual`` set to False.
    """
    token_text = el.text + " "
    src_key, tgt_key = "s" + ind, "t" + ind

    source.append({"id": src_key, "text": token_text})
    target.append({"id": tgt_key, "text": token_text})

    link_key = "e-" + src_key + "-" + tgt_key
    edges[link_key] = {
        "id": link_key,
        "ids": [src_key, tgt_key],
        "labels": [],
        "manual": False,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def add_errors(i, error, source, target, edges):
    """Turn one top-level error element into tokens plus a single labelled edge.

    Tokens (``w``/``pc`` children) found directly in *error* and in its nested
    ``u2`` .. ``u5`` descendants are appended to *source*. Tokens inside the
    ``p`` child of *error* itself are appended to *target*. The ``p`` children
    of nested levels are NOT appended anywhere; their text is only collected
    so it can be compared with the text of the top-level ``p``.

    Args:
        i: Next free token index (int); it is incremented for every token
            appended to *source* or *target*.
        error: Error element; its ``tip`` and ``kat`` attributes are required,
            ``podtip`` is optional.
        source: List receiving ``{"id": "s<i>", "text": ...}`` dicts.
        target: List receiving ``{"id": "t<i>", "text": ...}`` dicts.
        edges: Dict receiving one edge that joins every id created here.

    Returns:
        Tuple ``(i, has_error)``: the updated index and a flag that is True
        when a nested level exists whose ``p`` text differs from the text of
        the top-level ``p`` (checked level by level, stopping at the first
        mismatch).
    """
    source_edge_ids = []
    target_edge_ids = []
    podtip = error.attrib['podtip'] if 'podtip' in error.attrib else ''

    # Label format is "tip/podtip/kat"; podtip is empty when the attribute is absent.
    label = error.attrib['tip'] + '/' + podtip + '/' + error.attrib['kat']

    labels = [label]

    # Concatenated text of the `p` child at each level. None means that the
    # level was never encountered, '' that it exists but has no tokens yet.
    word_combination_L1 = ''
    word_combination_L2 = None
    word_combination_L3 = None
    word_combination_L4 = None
    word_combination_L5 = None

    label_L2 = ''
    label_L3 = ''
    label_L4 = ''
    label_L5 = ''

    has_error = False

    # solar5.7
    # NOTE(review): the marker above is original; presumably it names the
    # corpus/schema version this traversal was written for — TODO confirm.
    for el in error:
        # Direct tokens of the error element go to the source side.
        if el.tag.startswith('w') or el.tag.startswith('pc'):
            ind = str(i)

            source_id = "s" + ind
            source.append({"id": source_id, "text": el.text + " "})
            source_edge_ids.append(source_id)
            i += 1

        # Tokens under the top-level `p` child go to the target side.
        # 'pc' tags were already consumed by the branch above, so this branch
        # only sees other tags starting with 'p'.
        elif el.tag.startswith('p'):
            for p_el in el:
                if p_el.tag.startswith('w') or p_el.tag.startswith('pc'):
                    ind = str(i)

                    target_id = "t" + ind
                    target.append({"id": target_id, "text": p_el.text + " "})
                    target_edge_ids.append(target_id)
                    word_combination_L1 += p_el.text + " "
                    i += 1

        # Second nesting level; levels 3-5 below repeat the same pattern.
        elif el.tag.startswith('u2'):
            word_combination_L2 = ''
            podtip = el.attrib['podtip'] if 'podtip' in el.attrib else ''
            label_L2 = el.attrib['tip'] + '/' + podtip + '/' + el.attrib['kat']
            for el_l2 in el:
                if el_l2.tag.startswith('w') or el_l2.tag.startswith('pc'):
                    ind = str(i)

                    source_id = "s" + ind
                    source.append({"id": source_id, "text": el_l2.text + " "})
                    source_edge_ids.append(source_id)
                    i += 1

                # Nested `p` text is only recorded for comparison, never
                # appended to target.
                elif el_l2.tag.startswith('p'):
                    for p_el_l2 in el_l2:
                        if p_el_l2.tag.startswith('w') or p_el_l2.tag.startswith('pc'):
                            word_combination_L2 += p_el_l2.text + " "


                elif el_l2.tag.startswith('u3'):
                    word_combination_L3 = ''
                    podtip = el_l2.attrib['podtip'] if 'podtip' in el_l2.attrib else ''
                    label_L3 = el_l2.attrib['tip'] + '/' + podtip + '/' + el_l2.attrib['kat']
                    for el_l3 in el_l2:
                        if el_l3.tag.startswith('w') or el_l3.tag.startswith('pc'):
                            ind = str(i)

                            source_id = "s" + ind
                            source.append({"id": source_id, "text": el_l3.text + " "})
                            source_edge_ids.append(source_id)
                            i += 1

                        elif el_l3.tag.startswith('p'):
                            for p_el_l3 in el_l3:
                                if p_el_l3.tag.startswith('w') or p_el_l3.tag.startswith('pc'):
                                    word_combination_L3 += p_el_l3.text + " "

                        elif el_l3.tag.startswith('u4'):
                            word_combination_L4 = ''
                            podtip = el_l3.attrib['podtip'] if 'podtip' in el_l3.attrib else ''
                            label_L4 = el_l3.attrib['tip'] + '/' + podtip + '/' + el_l3.attrib['kat']
                            for el_l4 in el_l3:
                                if el_l4.tag.startswith('w') or el_l4.tag.startswith('pc'):
                                    ind = str(i)

                                    source_id = "s" + ind
                                    source.append({"id": source_id, "text": el_l4.text + " "})
                                    source_edge_ids.append(source_id)
                                    i += 1

                                elif el_l4.tag.startswith('p'):
                                    for p_el_l4 in el_l4:
                                        if p_el_l4.tag.startswith('w') or p_el_l4.tag.startswith('pc'):
                                            word_combination_L4 += p_el_l4.text + " "

                                elif el_l4.tag.startswith('u5'):
                                    word_combination_L5 = ''
                                    podtip = el_l4.attrib['podtip'] if 'podtip' in el_l4.attrib else ''
                                    label_L5 = el_l4.attrib['tip'] + '/' + podtip + '/' + el_l4.attrib['kat']
                                    for el_l5 in el_l4:
                                        if el_l5.tag.startswith('w') or el_l5.tag.startswith('pc'):
                                            ind = str(i)

                                            source_id = "s" + ind
                                            source.append({"id": source_id, "text": el_l5.text + " "})
                                            source_edge_ids.append(source_id)
                                            i += 1

                                        elif el_l5.tag.startswith('p'):
                                            for p_el_l5 in el_l5:
                                                if p_el_l5.tag.startswith('w') or p_el_l5.tag.startswith('pc'):
                                                    word_combination_L5 += p_el_l5.text + " "
            # NOTE(review): the text this block was recovered from had lost all
            # indentation, so the nesting depth of this loop is an assumption
            # (placed here as the last statement of the `u2` branch) — confirm
            # against the original file. At this depth `el` is the `u2` element
            # and its direct w/pc children, already appended to source above,
            # are appended to target as well.
            for p_el in el:
                if p_el.tag.startswith('w') or p_el.tag.startswith('pc'):
                    ind = str(i)

                    target_id = "t" + ind
                    target.append({"id": target_id, "text": p_el.text + " "})
                    target_edge_ids.append(target_id)
                    i += 1

    # Walk the levels from 2 to 5. A nested label is kept only while that
    # level's `p` text equals the top-level `p` text. At the first level that
    # does not match, the paragraph is flagged if that level exists at all
    # (its text is not None); deeper levels are then not examined.
    if word_combination_L1 == word_combination_L2 and word_combination_L2 is not None:
        labels.append(label_L2)
        if word_combination_L1 == word_combination_L3 and word_combination_L3 is not None:
            labels.append(label_L3)
            if word_combination_L1 == word_combination_L4 and word_combination_L4 is not None:
                labels.append(label_L4)
                if word_combination_L1 == word_combination_L5 and word_combination_L5 is not None:
                    labels.append(label_L5)
                elif word_combination_L5 is not None:
                    has_error = True
            elif word_combination_L4 is not None:
                has_error = True
        elif word_combination_L3 is not None:
            has_error = True
    elif word_combination_L2 is not None:
        has_error = True
    # The ids are strings, so sorted() orders them lexicographically
    # ("s10" sorts before "s2"); source ids always precede target ids.
    edge_ids = sorted(source_edge_ids) + sorted(target_edge_ids)
    edge_id = "e-" + "-".join(edge_ids)
    edges[edge_id] = {"id": edge_id, "ids": edge_ids, "labels": labels, "manual": True}

    return i, has_error
|
||||||
|
|
||||||
|
|
||||||
|
def process_file(et, args):
    """Write one svala JSON file per paragraph of every ``div`` found in *et*.

    Both output trees are deleted and recreated first, so files from earlier
    runs never survive. For each ``div`` the ``n`` attribute of its ``bibl``
    child (with ``/`` replaced by ``_``) names a sub-folder. Each ``p`` child
    of the ``div`` is converted to a ``{"source", "target", "edges"}`` dict
    and stored as ``<xml:id of the paragraph>.json``. Paragraphs for which
    ``add_errors`` reported a conflict go below ``args.error_folder``, all
    others below ``args.output_folder``.

    Args:
        et: Root element of the parsed corpus.
        args: Namespace providing ``output_folder`` and ``error_folder``.

    Raises:
        AttributeError: If a ``div`` has no ``bibl`` child or the ``bibl``
            has no ``n`` attribute.
        KeyError: If a paragraph has no ``xml:id`` attribute.
    """
    xml_id = '{http://www.w3.org/XML/1998/namespace}id'

    folders = (args.output_folder, args.error_folder)
    for folder in folders:
        if os.path.exists(folder):
            shutil.rmtree(folder)
    for folder in folders:
        # makedirs also creates missing parent folders, which os.mkdir refuses.
        os.makedirs(folder)

    for div in et.iter('div'):
        bibl = div.find('bibl')
        file_name = bibl.get('n').replace('/', '_')
        output_folder_loc = os.path.join(args.output_folder, file_name)
        error_folder_loc = os.path.join(args.error_folder, file_name)

        for paragraph in div.findall('p'):
            # Token numbering restarts at 1 in every paragraph.
            i = 1
            source = []
            target = []
            edges = {}
            paragraph_error = False
            for sentence in paragraph.findall('s'):
                for el in sentence:
                    # Plain words and punctuation are copied unchanged to both sides.
                    if el.tag.startswith(('w', 'pc')):
                        add_token(str(i), el, source, target, edges)
                        i += 1
                    elif el.tag.startswith('u'):
                        i, has_error = add_errors(i, el, source, target, edges)
                        if has_error:
                            paragraph_error = True

            dictionary = {"source": source, "target": target, "edges": edges}

            folder_loc = error_folder_loc if paragraph_error else output_folder_loc
            os.makedirs(folder_loc, exist_ok=True)
            json_path = os.path.join(folder_loc, paragraph.attrib[xml_id] + '.json')
            # ensure_ascii=False emits raw non-ASCII characters, so the file
            # encoding must be pinned rather than left to the platform locale.
            with open(json_path, 'w', encoding='utf-8') as wf:
                json.dump(dictionary, wf, ensure_ascii=False, indent="")
|
||||||
|
|
||||||
|
|
||||||
|
def main(args):
    """Parse the XML file named by ``args.input_file`` and convert it.

    Args:
        args: Namespace providing ``input_file`` plus the folders that
            ``process_file`` reads from it.
    """
    logging.info(args.input_file)
    # The parser reads the file itself, so the encoding declared in the
    # document is honoured instead of the platform's locale default that a
    # text-mode open() would apply.
    et = ElementTree.parse(args.input_file).getroot()
    process_file(et, args)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Command-line entry point: parse the options, run the conversion and log
    # the elapsed wall-clock time in seconds.
    parser = argparse.ArgumentParser(
        description='Convert a Solar XML corpus into svala JSON files, one file per paragraph.')
    parser.add_argument('--input_file', default='data/Solar2.0/solar2.xml',
                        help='Path of the Solar XML file to convert.')
    parser.add_argument('--output_folder', default='data/solar.svala',
                        help='Folder for paragraphs converted without conflicts. '
                             'It is deleted and recreated on every run.')
    parser.add_argument('--error_folder', default='data/solar.svala.error',
                        help='Folder for paragraphs in which a nested correction differs from the '
                             'final correction. It is deleted and recreated on every run.')
    args = parser.parse_args()

    start = time.time()
    main(args)
    logging.info("TIME: {}".format(time.time() - start))
|
73
tag_selection.py
Normal file
73
tag_selection.py
Normal file
|
@ -0,0 +1,73 @@
|
||||||
|
import argparse
|
||||||
|
import copy
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
from xml.etree import ElementTree
|
||||||
|
|
||||||
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
|
||||||
|
|
||||||
|
def process_file(et):
    """Collect every error annotation in *et* as tab-separated rows.

    For each ``div`` the document name is the ``n`` attribute of its ``bibl``
    child. Inside every ``p``/``s`` of the ``div``, each ``u1`` element and
    the ``u2`` .. ``u5`` elements nested directly below one another are
    recorded under the key ``(kat, tip, podtip)``. The number of elements
    seen per level is printed to standard output.

    Args:
        et: Root element of the parsed corpus.

    Returns:
        One line per recorded element, grouped by key in first-seen order:
        ``kat<TAB>tip<TAB>podtip<TAB>document name<TAB>sentence xml:id``.
        Missing attributes are rendered as the string ``None``.
    """
    xml_id = '{http://www.w3.org/XML/1998/namespace}id'
    deepest_level = 5
    occurrences = {}
    per_level = [0] * deepest_level

    def visit(node, level, doc_name, sentence_id):
        # Pre-order walk: record the node itself, then descend into the
        # children tagged exactly one level deeper (u2 under u1, and so on).
        key = (node.get('kat'), node.get('tip'), node.get('podtip'))
        occurrences.setdefault(key, []).append([doc_name, sentence_id])
        per_level[level - 1] += 1
        if level < deepest_level:
            for child in node.findall('u' + str(level + 1)):
                visit(child, level + 1, doc_name, sentence_id)

    for div in et.iter('div'):
        doc_name = div.find('bibl').get('n')
        for paragraph in div.findall('p'):
            for sentence in paragraph.findall('s'):
                sentence_id = sentence.get(xml_id)
                for top_error in sentence.findall('u1'):
                    visit(top_error, 1, doc_name, sentence_id)

    print(f'L1: {per_level[0]}|L2: {per_level[1]}|L3: {per_level[2]}|L4: {per_level[3]}|L5: {per_level[4]}|')

    return ''.join(
        f'{key[0]}\t{key[1]}\t{key[2]}\t{hit[0]}\t{hit[1]}\n'
        for key, hits in occurrences.items()
        for hit in hits
    )
|
||||||
|
|
||||||
|
|
||||||
|
def main(args):
    """Parse ``args.input_file`` and write its error listing to ``args.output_file``.

    Args:
        args: Namespace providing ``input_file`` and ``output_file``.
    """
    logging.info(args.input_file)
    # The parser reads the file itself, so the encoding declared in the
    # document is honoured instead of the platform's locale default.
    et = ElementTree.parse(args.input_file).getroot()
    text = process_file(et)
    # The output is opened only after parsing and processing succeeded, so a
    # failure does not leave a truncated, empty file behind. The encoding is
    # pinned so non-ASCII attribute values are written the same everywhere.
    with open(args.output_file, 'w', encoding='utf-8') as wf:
        wf.write(text)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Command-line entry point: parse the options, build the listing and log
    # the elapsed wall-clock time in seconds.
    parser = argparse.ArgumentParser(
        description='List every error annotation (kat, tip, podtip) of a Solar XML corpus '
                    'together with the document and sentence it occurs in.')
    parser.add_argument('--input_file', default='data/Solar2.0/solar2.xml',
                        help='Path of the Solar XML file to read.')
    parser.add_argument('--output_file', default='data/tags.tsv',
                        help='Path of the tab-separated file to write '
                             '(kat, tip, podtip, document name, sentence id).')
    args = parser.parse_args()

    start = time.time()
    main(args)
    logging.info("TIME: {}".format(time.time() - start))
|
Loading…
Reference in New Issue
Block a user