svala-scripts/svala2tei.py
2023-08-17 08:44:59 +02:00

275 lines
13 KiB
Python
Executable File

import argparse
import json
import logging
import os
import pickle
import shutil
import time
import conllu
import classla
import copy
from lxml import etree
from src.annotate.annotate import annotate
from src.create_tei import construct_sentence_from_list, \
construct_paragraph_from_list, TeiDocument, build_tei_etrees, build_links, build_complete_tei, convert_bibl
from src.read.read_and_merge import tokenize
from src.write.write import write_tei
logging.basicConfig(level=logging.INFO)
def add_edges(source_id, target_id, svala_data, edges, source_token_id, target_token_id):
edge_id = "e-" + source_id + "-" + target_id
labels = svala_data['edges'][edge_id]['labels']
edges.append({'source_ids': [source_token_id], 'target_ids': [target_token_id], 'labels': labels})
def add_token(svala_i, source_i, target_i, el, source, target, edges, svala_data, sentence_string_id):
source_id = "s" + svala_i
target_id = "t" + svala_i
edge_id = "e-" + source_id + "-" + target_id
labels = svala_data['edges'][edge_id]['labels']
sentence_string_id_split = sentence_string_id.split('.')
source_token_id = f'{sentence_string_id_split[0]}s.{".".join(sentence_string_id_split[1:])}.{source_i}'
target_token_id = f'{sentence_string_id_split[0]}t.{".".join(sentence_string_id_split[1:])}.{target_i}'
token_tag = 'w' if el.tag.startswith('w') else 'pc'
lemma = el.attrib['lemma'] if token_tag == 'w' else el.text
source.append({'token': el.text, 'tag': token_tag, 'ana': el.attrib['ana'], 'lemma': lemma, 'id': source_token_id, 'space_after': False, 'svala_id': source_id})
target.append({'token': el.text, 'tag': token_tag, 'ana': el.attrib['ana'], 'lemma': lemma, 'id': target_token_id, 'space_after': False, 'svala_id': target_id})
edges.append({'source_ids': [source_token_id], 'target_ids': [target_token_id], 'labels': labels})
def add_errors(i, error, source, target, edges):
source_edge_ids = []
target_edge_ids = []
podtip = error.attrib['podtip'] if 'podtip' in error.attrib else ''
label = error.attrib['tip'] + '/' + podtip + '/' + error.attrib['kat']
labels = [label]
word_combination_L1 = ''
word_combination_L2 = None
word_combination_L3 = None
word_combination_L4 = None
word_combination_L5 = None
label_L2 = ''
label_L3 = ''
label_L4 = ''
label_L5 = ''
has_error = False
# solar5.7
for el in error:
if el.tag.startswith('w') or el.tag.startswith('pc'):
ind = str(i)
source_id = "s" + ind
source.append({"id": source_id, "text": el.text + " "})
source_edge_ids.append(source_id)
i += 1
elif el.tag.startswith('p'):
for p_el in el:
if p_el.tag.startswith('w') or p_el.tag.startswith('pc'):
ind = str(i)
target_id = "t" + ind
target.append({"id": target_id, "text": p_el.text + " "})
target_edge_ids.append(target_id)
word_combination_L1 += p_el.text + " "
i += 1
elif el.tag.startswith('u2'):
word_combination_L2 = ''
podtip = el.attrib['podtip'] if 'podtip' in el.attrib else ''
label_L2 = el.attrib['tip'] + '/' + podtip + '/' + el.attrib['kat']
for el_l2 in el:
if el_l2.tag.startswith('w') or el_l2.tag.startswith('pc'):
ind = str(i)
source_id = "s" + ind
source.append({"id": source_id, "text": el_l2.text + " "})
source_edge_ids.append(source_id)
i += 1
elif el_l2.tag.startswith('p'):
for p_el_l2 in el_l2:
if p_el_l2.tag.startswith('w') or p_el_l2.tag.startswith('pc'):
word_combination_L2 += p_el_l2.text + " "
elif el_l2.tag.startswith('u3'):
word_combination_L3 = ''
podtip = el_l2.attrib['podtip'] if 'podtip' in el_l2.attrib else ''
label_L3 = el_l2.attrib['tip'] + '/' + podtip + '/' + el_l2.attrib['kat']
for el_l3 in el_l2:
if el_l3.tag.startswith('w') or el_l3.tag.startswith('pc'):
ind = str(i)
source_id = "s" + ind
source.append({"id": source_id, "text": el_l3.text + " "})
source_edge_ids.append(source_id)
i += 1
elif el_l3.tag.startswith('p'):
for p_el_l3 in el_l3:
if p_el_l3.tag.startswith('w') or p_el_l3.tag.startswith('pc'):
word_combination_L3 += p_el_l3.text + " "
elif el_l3.tag.startswith('u4'):
word_combination_L4 = ''
podtip = el_l3.attrib['podtip'] if 'podtip' in el_l3.attrib else ''
label_L4 = el_l3.attrib['tip'] + '/' + podtip + '/' + el_l3.attrib['kat']
for el_l4 in el_l3:
if el_l4.tag.startswith('w') or el_l4.tag.startswith('pc'):
ind = str(i)
source_id = "s" + ind
source.append({"id": source_id, "text": el_l4.text + " "})
source_edge_ids.append(source_id)
i += 1
elif el_l4.tag.startswith('p'):
for p_el_l4 in el_l4:
if p_el_l4.tag.startswith('w') or p_el_l4.tag.startswith('pc'):
word_combination_L4 += p_el_l4.text + " "
elif el_l4.tag.startswith('u5'):
word_combination_L5 = ''
podtip = el_l4.attrib['podtip'] if 'podtip' in el_l4.attrib else ''
label_L5 = el_l4.attrib['tip'] + '/' + podtip + '/' + el_l4.attrib['kat']
for el_l5 in el_l4:
if el_l5.tag.startswith('w') or el_l5.tag.startswith('pc'):
ind = str(i)
source_id = "s" + ind
source.append({"id": source_id, "text": el_l5.text + " "})
source_edge_ids.append(source_id)
i += 1
elif el_l5.tag.startswith('p'):
for p_el_l5 in el_l5:
if p_el_l5.tag.startswith('w') or p_el_l5.tag.startswith('pc'):
word_combination_L5 += p_el_l5.text + " "
# TODO NOT SURE IF THIS SHOULD BE COMMENTED! IF IT IS NOT THERE ARE ERRORS ON 2ND lvl of errors, where some words are duplicated
# for p_el in el:
# if p_el.tag.startswith('w') or p_el.tag.startswith('pc'):
# ind = str(i)
#
# target_id = "t" + ind
# target.append({"id": target_id, "text": p_el.text + " "})
# target_edge_ids.append(target_id)
# i += 1
if word_combination_L1 == word_combination_L2 and word_combination_L2 is not None:
if label_L2 not in labels:
labels.append(label_L2)
else:
print(f"REPEATING LABEL - {label_L2} in {error.attrib['{http://www.w3.org/XML/1998/namespace}id']} - ID {i}")
if word_combination_L1 == word_combination_L3 and word_combination_L3 is not None:
if label_L3 not in labels:
labels.append(label_L3)
else:
print(f"REPEATING LABEL - {label_L3} in {error.attrib['{http://www.w3.org/XML/1998/namespace}id']} - ID {i}")
if word_combination_L1 == word_combination_L4 and word_combination_L4 is not None:
if label_L4 not in labels:
labels.append(label_L4)
else:
print(f"REPEATING LABEL - {label_L4} in {error.attrib['{http://www.w3.org/XML/1998/namespace}id']} - ID {i}")
if word_combination_L1 == word_combination_L5 and word_combination_L5 is not None:
if label_L5 not in labels:
labels.append(label_L5)
else:
print(f"REPEATING LABEL - {label_L5} in {error.attrib['{http://www.w3.org/XML/1998/namespace}id']} - ID {i}")
elif word_combination_L5 is not None:
has_error = True
elif word_combination_L4 is not None:
has_error = True
elif word_combination_L3 is not None:
has_error = True
elif word_combination_L2 is not None:
has_error = True
edge_ids = sorted(source_edge_ids) + sorted(target_edge_ids)
edge_id = "e-" + "-".join(edge_ids)
edges[edge_id] = {"id": edge_id, "ids": edge_ids, "labels": labels, "manual": True}
return i, has_error
def map_svala_solar2(svala_data_part, solar2_paragraph):
svala_data_i = 0
for sentence in solar2_paragraph:
sentence_id = 0
for tok in sentence:
# if svala_data_part[svala_data_i]['text'].strip() != tok['token']:
# if tok['text'] == '§' and svala_data_part[svala_data_i]['token'].strip() == '§§§':
# wierd_sign_count += 1
# if wierd_sign_count < 3:
# continue
# else:
# tok['text'] = '§§§'
# wierd_sign_count = 0
# else:
# raise 'Word mismatch!'
assert svala_data_part[svala_data_i]['text'].strip() == tok['token']
sentence_id += 1
tok['svala_id'] = svala_data_part[svala_data_i]['id']
svala_data_i += 1
def process_file(args):
if os.path.exists(args.results_folder):
shutil.rmtree(args.results_folder)
os.mkdir(args.results_folder)
# READ AND MERGE svala tokenization, solar2 tokenization and obeliks tokenization
tokenized_source_divs, tokenized_target_divs, document_edges = tokenize(args)
# ANNOTATE WITH CLASSLA
annotated_source_divs, annotated_target_divs = annotate(tokenized_source_divs, tokenized_target_divs, args)
# GENERATE TEI AND WRITE OUTPUT
write_tei(annotated_source_divs, annotated_target_divs, document_edges, args)
def main(args):
process_file(args)
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='Read already processed xmls, erase entries without examples and limit gigafida examples to 1 per entry.')
parser.add_argument('--svala_folder', default='data/KOST/svala',
help='input file in (gz or xml currently). If none, then just database is loaded')
parser.add_argument('--results_folder', default='data/KOST/results',
help='input file in (gz or xml currently). If none, then just database is loaded')
parser.add_argument('--raw_text', default='data/KOST/raw',
help='input file in (gz or xml currently). If none, then just database is loaded')
parser.add_argument('--texts_metadata', default='data/KOST/texts_metadata5.csv',
help='KOST metadata location')
parser.add_argument('--authors_metadata', default='data/KOST/authors_metadata5.csv',
help='KOST authors location')
parser.add_argument('--teachers_metadata', default='data/KOST/teachers_metadata.csv',
help='KOST teachers location')
parser.add_argument('--translations', default='data/KOST/translations.csv',
help='KOST Slovenian-English column names translations')
parser.add_argument('--tokenization_interprocessing', default='data/processing.tokenization',
help='input file in (gz or xml currently). If none, then just database is loaded')
parser.add_argument('--overwrite_tokenization', action='store_true', help='input file in (gz or xml currently). If none, then just database is loaded')
parser.add_argument('--annotation_interprocessing', default='data/processing.annotation',
help='input file in (gz or xml currently). If none, then just database is loaded')
parser.add_argument('--overwrite_annotation', action='store_true', help='input file in (gz or xml currently). If none, then just database is loaded')
args = parser.parse_args()
start = time.time()
main(args)
logging.info("TIME: {}".format(time.time() - start))