Fixed some bugs.

This commit is contained in:
Luka 2022-05-30 07:07:11 +02:00
parent ecaaecb459
commit 5ec5cf3a28
2 changed files with 309 additions and 129 deletions

View File

@ -142,14 +142,6 @@ class TeiDocument:
text = etree.SubElement(root, 'text') text = etree.SubElement(root, 'text')
body = etree.SubElement(text, 'body') body = etree.SubElement(text, 'body')
for paras, bibl in self.divs:
div = etree.Element('div')
set_xml_attr(div, 'id', xml_id)
div.append(bibl)
for para in paras:
div.append(para.as_xml())
body.append(div)
encoding_desc = etree.SubElement(tei_header, 'encodingDesc') encoding_desc = etree.SubElement(tei_header, 'encodingDesc')
tags_decl = etree.SubElement(encoding_desc, 'tagsDecl') tags_decl = etree.SubElement(encoding_desc, 'tagsDecl')
namespace = etree.SubElement(tags_decl, 'namespace') namespace = etree.SubElement(tags_decl, 'namespace')
@ -159,6 +151,16 @@ class TeiDocument:
tag_usage = etree.SubElement(namespace, 'tagUsage') tag_usage = etree.SubElement(namespace, 'tagUsage')
tag_usage.set('gi', tag) tag_usage.set('gi', tag)
tag_usage.set('occurs', str(count)) tag_usage.set('occurs', str(count))
for paras, bibl, div_id in self.divs:
div = etree.Element('div')
set_xml_attr(div, 'id', div_id)
div.append(bibl)
for para in paras:
div.append(para.as_xml())
body.append(div)
return root return root
def add_paragraph(self, paragraph): def add_paragraph(self, paragraph):
@ -245,7 +247,7 @@ def build_links(all_edges):
if len(token_edges['source_ids']) > 0: if len(token_edges['source_ids']) > 0:
random_source_id = token_edges['source_ids'][0] random_source_id = token_edges['source_ids'][0]
sentence_id += '.'.join(random_source_id.split('.')[:3]) sentence_id += '.'.join(random_source_id.split('.')[:3])
elif len(token_edges['target_ids']) > 0: if len(token_edges['target_ids']) > 0:
random_target_id = token_edges['target_ids'][0] random_target_id = token_edges['target_ids'][0]
if len(token_edges['source_ids']) > 0: if len(token_edges['source_ids']) > 0:
sentence_id += ' #' sentence_id += ' #'

View File

@ -2,6 +2,7 @@ import argparse
import json import json
import logging import logging
import os import os
import pickle
import shutil import shutil
import time import time
from xml.etree import ElementTree from xml.etree import ElementTree
@ -55,9 +56,9 @@ def create_edges_list(target_ids, links_ids_mapper):
SKIP_IDS = ['solar2284s.1.1.1'] SKIP_IDS = ['solar2284s.1.1.1']
def create_edges(svala_data, source_par, target_par): def create_edges(svala_data, source_par, target_par):
# if source_par and source_par[0]: if source_par and source_par[0]:
# if source_par[0][0]['id'] in SKIP_IDS: if source_par[0][0]['id'] in SKIP_IDS:
# return [] return []
# # print(source_par[0][0]['id']) # # print(source_par[0][0]['id'])
# if source_par[0][0]['id'] == 'solar2440s.5.1.1': # if source_par[0][0]['id'] == 'solar2440s.5.1.1':
# print('pause!') # print('pause!')
@ -132,8 +133,8 @@ def create_edges(svala_data, source_par, target_par):
for active_source_sentence_i, active_source_sentence in enumerate(source_edges): for active_source_sentence_i, active_source_sentence in enumerate(source_edges):
for source_edge in active_source_sentence: for source_edge in active_source_sentence:
print(source_edge) # print(source_edge)
# if 'e-s261-t261' == source_edge: # if 'e-s7-t8' == source_edge:
# print('aaa') # print('aaa')
# if 'e-s253-s254-s255-s256-s257-s258-s259-s260' == source_edge: # if 'e-s253-s254-s255-s256-s257-s258-s259-s260' == source_edge:
# print('aaa') # print('aaa')
@ -145,8 +146,7 @@ def create_edges(svala_data, source_par, target_par):
edges_processed.add(source_edge) edges_processed.add(source_edge)
elif target_edges_set and source_edge in target_edges_set[active_target_sentence_i]: elif target_edges_set and source_edge in target_edges_set[active_target_sentence_i]:
# if 'e-s120-t121' == source_edge:
# print('aaa')
# if 'e-s119-t119' == source_edge: # if 'e-s119-t119' == source_edge:
# print('aaa') # print('aaa')
if source_edge not in edges_processed: if source_edge not in edges_processed:
@ -318,8 +318,8 @@ def create_edges(svala_data, source_par, target_par):
if not source_ok_all: if not source_ok_all:
source_sent_id += 1 source_sent_id += 1
if edge_id == 'e-s590-t590': # if edge_id == 'e-s590-t590':
print(edge_id) # print(edge_id)
target_ok = [el[0] == 's' or el in target_sentence_ids[target_sent_id] for el in ids] if target_sentence_ids else [] target_ok = [el[0] == 's' or el in target_sentence_ids[target_sent_id] for el in ids] if target_sentence_ids else []
target_ok_all = all(target_ok) target_ok_all = all(target_ok)
@ -746,7 +746,7 @@ def create_conllu(interest_list, sentence_string_id):
return conllu_result.serialize() return conllu_result.serialize()
def process_solar2_paragraph(sentences, paragraph, svala_i, svala_data, add_errors_func, nlp, complete_source_conllu, complete_target_conllu): def process_solar2_paragraph(sentences, paragraph, svala_i, svala_data, add_errors_func):
etree_source_sentences = [] etree_source_sentences = []
etree_target_sentences = [] etree_target_sentences = []
@ -755,6 +755,9 @@ def process_solar2_paragraph(sentences, paragraph, svala_i, svala_data, add_erro
par_source = [] par_source = []
par_target = [] par_target = []
source_conllus = []
target_conllus = []
for sentence_id, sentence in enumerate(sentences): for sentence_id, sentence in enumerate(sentences):
source = [] source = []
target = [] target = []
@ -788,34 +791,40 @@ def process_solar2_paragraph(sentences, paragraph, svala_i, svala_data, add_erro
par_target.append(target) par_target.append(target)
# sentence_edges.append(edges) # sentence_edges.append(edges)
source_conllu = ''
if len(source) > 0: if len(source) > 0:
source_conllu = create_conllu(source, sentence_string_id) source_conllu = create_conllu(source, sentence_string_id)
target_conllu = ''
if len(target) > 0: if len(target) > 0:
target_conllu = create_conllu(target, sentence_string_id) target_conllu = create_conllu(target, sentence_string_id)
if len(source) > 0: source_conllus.append(source_conllu)
source_conllu_annotated = nlp(source_conllu).to_conll() target_conllus.append(target_conllu)
if len(target) > 0:
target_conllu_annotated = nlp(target_conllu).to_conll()
if len(source) > 0: # if len(source) > 0:
complete_source_conllu += source_conllu_annotated # source_conllu_annotated = nlp(source_conllu).to_conll()
if len(target) > 0: # if len(target) > 0:
complete_target_conllu += target_conllu_annotated # target_conllu_annotated = nlp(target_conllu).to_conll()
#
if len(source) > 0: # if len(source) > 0:
source_conllu_parsed = conllu.parse(source_conllu_annotated)[0] # complete_source_conllu += source_conllu_annotated
if len(target) > 0: # if len(target) > 0:
target_conllu_parsed = conllu.parse(target_conllu_annotated)[0] # complete_target_conllu += target_conllu_annotated
#
if len(source) > 0: # if len(source) > 0:
etree_source_sentences.append(construct_sentence_from_list(str(sentence_id), source_conllu_parsed, True)) # source_conllu_parsed = conllu.parse(source_conllu_annotated)[0]
if len(target) > 0: # if len(target) > 0:
etree_target_sentences.append(construct_sentence_from_list(str(sentence_id), target_conllu_parsed, False)) # target_conllu_parsed = conllu.parse(target_conllu_annotated)[0]
#
# if len(source) > 0:
# etree_source_sentences.append(construct_sentence_from_list(str(sentence_id), source_conllu_parsed, True))
# if len(target) > 0:
# etree_target_sentences.append(construct_sentence_from_list(str(sentence_id), target_conllu_parsed, False))
sentence_edges = create_edges(svala_data, par_source, par_target) sentence_edges = create_edges(svala_data, par_source, par_target)
return etree_source_sentences, etree_target_sentences, sentence_edges, complete_source_conllu, complete_target_conllu # return etree_source_sentences, etree_target_sentences, sentence_edges, complete_source_conllu, complete_target_conllu
return sentence_edges, source_conllus, target_conllus
def read_raw_text(path): def read_raw_text(path):
@ -904,9 +913,9 @@ def update_ids(pretag, in_list):
el['id'] = f'{pretag}.{el["id"]}' el['id'] = f'{pretag}.{el["id"]}'
def process_obeliks_paragraph(sentences, paragraph, svala_i, svala_data, add_errors_func, nlp, complete_source_conllu, complete_target_conllu, source_raw_text, target_raw_text, nlp_tokenize): def process_obeliks_paragraph(sentences, paragraph, svala_i, svala_data, add_errors_func, source_raw_text, target_raw_text, nlp_tokenize):
etree_source_sentences = [] # etree_source_sentences = []
etree_target_sentences = [] # etree_target_sentences = []
sentence_edges = [] sentence_edges = []
if source_raw_text is not None: if source_raw_text is not None:
@ -924,6 +933,8 @@ def process_obeliks_paragraph(sentences, paragraph, svala_i, svala_data, add_err
par_source = [] par_source = []
par_target = [] par_target = []
sentences_len = len(sentences) sentences_len = len(sentences)
source_conllus = []
target_conllus = []
if source_raw_text is not None: if source_raw_text is not None:
sentences_len = max(sentences_len, len(source_res)) sentences_len = max(sentences_len, len(source_res))
if target_raw_text is not None: if target_raw_text is not None:
@ -982,6 +993,7 @@ def process_obeliks_paragraph(sentences, paragraph, svala_i, svala_data, add_err
source = source_res[sentence_id - 1] source = source_res[sentence_id - 1]
update_ids(f'{sentence_string_id_split[0]}s.{".".join(sentence_string_id_split[1:])}', source) update_ids(f'{sentence_string_id_split[0]}s.{".".join(sentence_string_id_split[1:])}', source)
par_source.append(source) par_source.append(source)
source_conllu = ''
if len(source) > 0: if len(source) > 0:
source_conllu = create_conllu(source, sentence_string_id) source_conllu = create_conllu(source, sentence_string_id)
if target_raw_text is not None and sentence_id - 1 < len(target_res): if target_raw_text is not None and sentence_id - 1 < len(target_res):
@ -994,28 +1006,31 @@ def process_obeliks_paragraph(sentences, paragraph, svala_i, svala_data, add_err
if target_raw_text is None: if target_raw_text is None:
par_target.append(target) par_target.append(target)
target_conllu = ''
if len(target) > 0: if len(target) > 0:
target_conllu = create_conllu(target, sentence_string_id) target_conllu = create_conllu(target, sentence_string_id)
if len(source) > 0: source_conllus.append(source_conllu)
source_conllu_annotated = nlp(source_conllu).to_conll() target_conllus.append(target_conllu)
if len(target) > 0: # if len(source) > 0:
target_conllu_annotated = nlp(target_conllu).to_conll() # source_conllu_annotated = nlp(source_conllu).to_conll()
# if len(target) > 0:
if len(source) > 0: # target_conllu_annotated = nlp(target_conllu).to_conll()
complete_source_conllu += source_conllu_annotated #
if len(target) > 0: # if len(source) > 0:
complete_target_conllu += target_conllu_annotated # complete_source_conllu += source_conllu_annotated
# if len(target) > 0:
if len(source) > 0: # complete_target_conllu += target_conllu_annotated
source_conllu_parsed = conllu.parse(source_conllu_annotated)[0] #
if len(target) > 0: # if len(source) > 0:
target_conllu_parsed = conllu.parse(target_conllu_annotated)[0] # source_conllu_parsed = conllu.parse(source_conllu_annotated)[0]
# if len(target) > 0:
if len(source) > 0: # target_conllu_parsed = conllu.parse(target_conllu_annotated)[0]
etree_source_sentences.append(construct_sentence_from_list(str(sentence_id), source_conllu_parsed, True)) #
if len(target) > 0: # if len(source) > 0:
etree_target_sentences.append(construct_sentence_from_list(str(sentence_id), target_conllu_parsed, False)) # etree_source_sentences.append(construct_sentence_from_list(str(sentence_id), source_conllu_parsed, True))
# if len(target) > 0:
# etree_target_sentences.append(construct_sentence_from_list(str(sentence_id), target_conllu_parsed, False))
# reannotate svala_ids # reannotate svala_ids
if source_raw_text is None: if source_raw_text is None:
@ -1025,24 +1040,28 @@ def process_obeliks_paragraph(sentences, paragraph, svala_i, svala_data, add_err
sentence_edges = create_edges(svala_data, par_source, par_target) sentence_edges = create_edges(svala_data, par_source, par_target)
return etree_source_sentences, etree_target_sentences, sentence_edges, complete_source_conllu, complete_target_conllu return sentence_edges, source_conllus, target_conllus
def process_file(et, args, nlp, nlp_tokenize): def tokenize(args):
if os.path.exists(args.results_folder): if os.path.exists(args.tokenization_interprocessing) and not args.overwrite_tokenization:
shutil.rmtree(args.results_folder) print('READING AND MERGING...')
os.mkdir(args.results_folder) with open(args.tokenization_interprocessing, 'rb') as rp:
etree_source_documents = [] tokenized_source_divs, tokenized_target_divs, document_edges = pickle.load(rp)
etree_target_documents = [] return tokenized_source_divs, tokenized_target_divs, document_edges
etree_source_divs = []
etree_target_divs = []
complete_source_conllu = '' print('TOKENIZING...')
complete_target_conllu = '' with open(args.solar_file, 'r') as fp:
logging.info(args.solar_file)
et = ElementTree.XML(fp.read())
document_edges = [] nlp_tokenize = classla.Pipeline('sl', processors='tokenize', pos_lemma_pretag=True)
filename_encountered = False filename_encountered = False
i = 0 i = 0
folders_count = 5484 folders_count = 5484
tokenized_source_divs = []
tokenized_target_divs = []
document_edges = []
for div in et.iter('div'): for div in et.iter('div'):
bibl = div.find('bibl') bibl = div.find('bibl')
file_name = bibl.get('n') file_name = bibl.get('n')
@ -1050,13 +1069,13 @@ def process_file(et, args, nlp, nlp_tokenize):
print(f'{i*100/folders_count} % : {file_name}') print(f'{i*100/folders_count} % : {file_name}')
i += 1 i += 1
# if file_name == 'S20-PI-slo-2-SG-D-2016_2017-30479-12.txt': # if file_name == 'S20-PI-slo-2-SG-D-2016_2017-30479-12.txt':
# if file_name == 'KUS-G-slo-1-LJ-E-2009_2010-10602': # if file_name == 'KUS-OS-slo-8-KR-R-2010-40088':
if i*100/folders_count > 40: # # if i*100/folders_count > 40:
filename_encountered = True # filename_encountered = True
# if i*100/folders_count > 50: # # if i*100/folders_count > 41:
# filename_encountered = False # # filename_encountered = False
if not filename_encountered: # if not filename_encountered:
continue # continue
svala_path = os.path.join(args.svala_folder, file_name) svala_path = os.path.join(args.svala_folder, file_name)
corrected_svala_path = os.path.join(args.corrected_svala_folder, file_name) corrected_svala_path = os.path.join(args.corrected_svala_folder, file_name)
@ -1074,8 +1093,10 @@ def process_file(et, args, nlp, nlp_tokenize):
svala_dict.update(corrected_svala_dict) svala_dict.update(corrected_svala_dict)
etree_source_paragraphs = [] # etree_source_paragraphs = []
etree_target_paragraphs = [] # etree_target_paragraphs = []
tokenized_source_paragraphs = []
tokenized_target_paragraphs = []
paragraph_edges = [] paragraph_edges = []
paragraphs = div.findall('p') paragraphs = div.findall('p')
@ -1102,46 +1123,85 @@ def process_file(et, args, nlp, nlp_tokenize):
target_raw_text = os.path.join(raw_texts_path, target_filename) if os.path.exists(os.path.join(raw_texts_path, target_filename)) else None target_raw_text = os.path.join(raw_texts_path, target_filename) if os.path.exists(os.path.join(raw_texts_path, target_filename)) else None
if not (source_raw_text or target_raw_text): if not (source_raw_text or target_raw_text):
etree_source_sentences, etree_target_sentences, sentence_edges, complete_source_conllu, complete_target_conllu = process_solar2_paragraph(sentences, paragraph, svala_i, svala_data, add_errors_func, nlp, sentence_edges, tokenized_source_sentences, tokenized_target_sentences = process_solar2_paragraph(sentences, paragraph, svala_i, svala_data, add_errors_func)
complete_source_conllu, complete_target_conllu)
else: else:
etree_source_sentences, etree_target_sentences, sentence_edges, complete_source_conllu, complete_target_conllu = process_obeliks_paragraph(sentences, paragraph, svala_i, sentence_edges, tokenized_source_sentences, tokenized_target_sentences = process_obeliks_paragraph(sentences, paragraph, svala_i,
svala_data, add_errors_func, nlp, complete_source_conllu, complete_target_conllu, source_raw_text, target_raw_text, nlp_tokenize) svala_data, add_errors_func, source_raw_text, target_raw_text, nlp_tokenize)
etree_source_paragraphs.append(construct_paragraph_from_list(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0], paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[1], etree_source_sentences, True)) tokenized_source_paragraphs.append(tokenized_source_sentences)
etree_target_paragraphs.append(construct_paragraph_from_list(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0], paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[1], etree_target_sentences, False)) tokenized_target_paragraphs.append(tokenized_target_sentences)
# etree_source_paragraphs.append(construct_paragraph_from_list(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0], paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[1], etree_source_sentences, True))
# etree_target_paragraphs.append(construct_paragraph_from_list(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0], paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[1], etree_target_sentences, False))
paragraph_edges.append(sentence_edges) paragraph_edges.append(sentence_edges)
etree_bibl = convert_bibl(bibl) # etree_bibl = convert_bibl(bibl)
tokenized_source_divs.append(tokenized_source_paragraphs)
etree_source_divs.append((etree_source_paragraphs, copy.deepcopy(etree_bibl))) tokenized_target_divs.append(tokenized_target_paragraphs)
etree_target_divs.append((etree_target_paragraphs, copy.deepcopy(etree_bibl))) # etree_source_divs.append((etree_source_paragraphs, copy.deepcopy(etree_bibl)))
# etree_target_divs.append((etree_target_paragraphs, copy.deepcopy(etree_bibl)))
document_edges.append(paragraph_edges) document_edges.append(paragraph_edges)
print('APPENDING DOCUMENT...') with open(args.tokenization_interprocessing, 'wb') as wp:
etree_source_documents.append(TeiDocument(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 's', etree_source_divs)) pickle.dump((tokenized_source_divs, tokenized_target_divs, document_edges), wp)
etree_target_documents.append(TeiDocument(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 't', etree_target_divs))
print('BUILDING TEI DOCUMENTS...') return tokenized_source_divs, tokenized_target_divs, document_edges
etree_source = build_tei_etrees(etree_source_documents)
etree_target = build_tei_etrees(etree_target_documents)
def _annotate_divs(nlp, tokenized_divs):
    """Annotate every non-empty sentence in a divs -> paragraphs -> sentences nesting.

    Args:
        nlp: Callable applied to one sentence string; its result must expose
            ``to_conll()`` returning the annotated sentence as a string.
        tokenized_divs: List of divs, each a list of paragraphs, each a list of
            sentence strings. Empty (falsy) sentences are not sent to ``nlp``.

    Returns:
        Tuple ``(annotated_divs, complete_conllu)``: the same nesting holding
        annotated strings ('' for empty sentences), and all annotated strings
        concatenated in document order.
    """
    annotated_divs = []
    # Collected parts are joined once at the end instead of growing a string
    # with ``+=`` for every sentence of the corpus.
    conllu_parts = []
    total = len(tokenized_divs)
    for i, div in enumerate(tokenized_divs):
        # Progress in percent of processed divs.
        print(f'{i * 100 / total}')
        annotated_pars = []
        for par in div:
            annotated_sens = [nlp(sen).to_conll() if sen else '' for sen in par]
            conllu_parts.extend(annotated_sens)
            annotated_pars.append(annotated_sens)
        annotated_divs.append(annotated_pars)
    return annotated_divs, ''.join(conllu_parts)


def annotate(tokenized_source_divs, tokenized_target_divs, args):
    """Annotate tokenized source and target sentences, caching the result on disk.

    If ``args.annotation_interprocessing`` exists and
    ``args.overwrite_annotation`` is false, the pickled result of a previous
    run is returned and nothing is recomputed or written. Otherwise a classla
    pipeline is created, every non-empty sentence is annotated, the
    concatenated output is written to ``source.conllu`` / ``target.conllu`` in
    ``args.results_folder``, and the nested result is pickled to
    ``args.annotation_interprocessing``.

    Args:
        tokenized_source_divs: Divs -> paragraphs -> sentence strings (source side).
        tokenized_target_divs: Same nesting for the target side.
        args: Namespace providing ``annotation_interprocessing``,
            ``overwrite_annotation`` and ``results_folder``.

    Returns:
        Tuple ``(annotated_source_divs, annotated_target_divs)`` with the same
        nesting as the inputs.
    """
    if os.path.exists(args.annotation_interprocessing) and not args.overwrite_annotation:
        print('READING...')
        # NOTE(review): pickle.load can execute arbitrary code; this cache file
        # must only ever come from a trusted earlier run of this script.
        with open(args.annotation_interprocessing, 'rb') as rp:
            annotated_source_divs, annotated_target_divs = pickle.load(rp)
        return annotated_source_divs, annotated_target_divs

    nlp = classla.Pipeline('sl', pos_use_lexicon=True, pos_lemma_pretag=False, tokenize_pretokenized="conllu",
                           type='standard_jos')

    print('ANNOTATING SOURCE...')
    annotated_source_divs, complete_source_conllu = _annotate_divs(nlp, tokenized_source_divs)

    print('ANNOTATING TARGET...')
    annotated_target_divs, complete_target_conllu = _annotate_divs(nlp, tokenized_target_divs)

    print('Writting all but complete')
    # Explicit UTF-8 so the output does not depend on the platform's locale encoding.
    with open(os.path.join(args.results_folder, "source.conllu"), 'w', encoding='utf-8') as sf:
        sf.write(complete_source_conllu)

    with open(os.path.join(args.results_folder, "target.conllu"), 'w', encoding='utf-8') as sf:
        sf.write(complete_target_conllu)

    # Persist the nested result so later runs can skip annotation entirely.
    with open(args.annotation_interprocessing, 'wb') as wp:
        pickle.dump((annotated_source_divs, annotated_target_divs), wp)

    return annotated_source_divs, annotated_target_divs
tf.write(etree.tostring(etree_target[0], pretty_print=True, encoding='utf-8').decode())
def write_tei(annotated_source_divs, annotated_target_divs, document_edges, args):
print('BUILDING LINKS...')
etree_links = build_links(document_edges)
with open(os.path.join(args.results_folder, f"links.xml"), 'w') as tf: with open(os.path.join(args.results_folder, f"links.xml"), 'w') as tf:
tf.write(etree.tostring(etree_links, pretty_print=True, encoding='utf-8').decode()) tf.write(etree.tostring(etree_links, pretty_print=True, encoding='utf-8').decode())
@ -1149,40 +1209,154 @@ def process_file(et, args, nlp, nlp_tokenize):
with open(os.path.join(args.results_folder, f"links.json"), 'w') as jf: with open(os.path.join(args.results_folder, f"links.json"), 'w') as jf:
json.dump(document_edges, jf, ensure_ascii=False, indent=" ") json.dump(document_edges, jf, ensure_ascii=False, indent=" ")
print('WRITTING TEI...')
etree_source_documents = []
etree_target_documents = []
etree_source_divs = []
etree_target_divs = []
with open(args.solar_file, 'r') as fp:
logging.info(args.solar_file)
et = ElementTree.XML(fp.read())
filename_encountered = False
i = 0
folders_count = 5484
div_i = 0
for div in et.iter('div'):
bibl = div.find('bibl')
file_name = bibl.get('n')
file_name = file_name.replace('/', '_')
print(f'{i * 100 / folders_count} % : {file_name}')
i += 1
# if i * 100 / folders_count > 50:
# filename_encountered = True
# if i * 100 / folders_count > 100:
# filename_encountered = False
if file_name == 'KUS-G-slo-1-LJ-E-2009_2010-10540':
# div_i -= 1
continue
if file_name == 'KUS-SI-slo-2-NM-E-2009_2010-20362' or file_name == 'KUS-OS-slo-9-SG-R-2009_2010-40129' or file_name == 'KUS-OS-slo-7-SG-R-2009_2010-40173':
# div_i -= 1
continue
# if not filename_encountered:
# div_i+=1
#
# continue
# svala_path = os.path.join(args.svala_folder, file_name)
# corrected_svala_path = os.path.join(args.corrected_svala_folder, file_name)
# raw_texts_path = os.path.join(args.svala_generated_text_folder, file_name)
# skip files that are not svala annotated (to enable short examples)
# if not os.path.isdir(svala_path):
# continue
# svala_list = [[fname[:-13], fname] if 'problem' in fname else [fname[:-5], fname] for fname in
# os.listdir(svala_path)]
# svala_dict = {e[0]: e[1] for e in svala_list}
# if os.path.exists(corrected_svala_path):
# corrected_svala_list = [[fname[:-13], fname] if 'problem' in fname else [fname[:-5], fname] for fname in
# os.listdir(corrected_svala_path)]
# corrected_svala_dict = {e[0]: e[1] for e in corrected_svala_list}
#
# svala_dict.update(corrected_svala_dict)
etree_source_paragraphs = []
etree_target_paragraphs = []
# paragraph_edges = []
paragraphs = div.findall('p')
par_i = 0
for paragraph in paragraphs:
sentences = paragraph.findall('s')
etree_source_sentences = []
etree_target_sentences = []
for sentence_id, sentence in enumerate(sentences):
# print(f'{div_i} + {par_i} + {sentence_id}')
source_conllu_annotated = annotated_source_divs[div_i][par_i][sentence_id]
target_conllu_annotated = annotated_target_divs[div_i][par_i][sentence_id]
if len(source_conllu_annotated) > 0:
source_conllu_parsed = conllu.parse(source_conllu_annotated)[0]
if len(target_conllu_annotated) > 0:
target_conllu_parsed = conllu.parse(target_conllu_annotated)[0]
if len(source_conllu_annotated) > 0:
etree_source_sentences.append(construct_sentence_from_list(str(sentence_id + 1), source_conllu_parsed, True))
if len(target_conllu_annotated) > 0:
etree_target_sentences.append(construct_sentence_from_list(str(sentence_id + 1), target_conllu_parsed, False))
etree_source_paragraphs.append(construct_paragraph_from_list(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0], paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[1], etree_source_sentences, True))
etree_target_paragraphs.append(construct_paragraph_from_list(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0], paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[1], etree_target_sentences, False))
par_i += 1
etree_bibl = convert_bibl(bibl)
etree_source_divs.append((etree_source_paragraphs, copy.deepcopy(etree_bibl), paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 's'))
etree_target_divs.append((etree_target_paragraphs, copy.deepcopy(etree_bibl), paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 't'))
div_i += 1
print('APPENDING DOCUMENT...')
etree_source_documents.append(
TeiDocument(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 's',
etree_source_divs))
etree_target_documents.append(
TeiDocument(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 't',
etree_target_divs))
print('BUILDING TEI DOCUMENTS...')
etree_source = build_tei_etrees(etree_source_documents)
etree_target = build_tei_etrees(etree_target_documents)
print('Writting all but complete')
with open(os.path.join(args.results_folder, f"source.xml"), 'w') as sf:
sf.write(etree.tostring(etree_source[0], pretty_print=True, encoding='utf-8').decode())
with open(os.path.join(args.results_folder, f"target.xml"), 'w') as tf:
tf.write(etree.tostring(etree_target[0], pretty_print=True, encoding='utf-8').decode())
# TODO STUCKS HERE # TODO STUCKS HERE
print('COMPLETE TREE CREATION...') print('COMPLETE TREE CREATION...')
complete_etree = build_complete_tei(copy.deepcopy(etree_source), copy.deepcopy(etree_target), etree_links) complete_etree = build_complete_tei(copy.deepcopy(etree_source), copy.deepcopy(etree_target), etree_links)
# complete_etree = build_complete_tei(etree_source, etree_target, etree_links)
print('WRITING FILES') print('WRITING COMPLETE TREE')
# with open(os.path.join(args.results_folder, f"source.conllu"), 'w') as sf:
# sf.write(complete_source_conllu)
#
# with open(os.path.join(args.results_folder, f"target.conllu"), 'w') as sf:
# sf.write(complete_target_conllu)
#
# with open(os.path.join(args.results_folder, f"source.xml"), 'w') as sf:
# sf.write(etree.tostring(etree_source[0], pretty_print=True, encoding='utf-8').decode())
#
# with open(os.path.join(args.results_folder, f"target.xml"), 'w') as tf:
# tf.write(etree.tostring(etree_target[0], pretty_print=True, encoding='utf-8').decode())
#
# with open(os.path.join(args.results_folder, f"links.xml"), 'w') as tf:
# tf.write(etree.tostring(etree_links, pretty_print=True, encoding='utf-8').decode())
with open(os.path.join(args.results_folder, f"complete.xml"), 'w') as tf: with open(os.path.join(args.results_folder, f"complete.xml"), 'w') as tf:
tf.write(etree.tostring(complete_etree, pretty_print=True, encoding='utf-8').decode()) tf.write(etree.tostring(complete_etree, pretty_print=True, encoding='utf-8').decode())
def process_file(args):
    """Recreate the results folder and run the three processing stages in order.

    The stages are ``tokenize`` -> ``annotate`` -> ``write_tei``; each receives
    ``args`` plus the output of the previous stage.
    """
    results_dir = args.results_folder
    # Every run starts from an empty results folder.
    if os.path.exists(results_dir):
        shutil.rmtree(results_dir)
    os.mkdir(results_dir)

    # Stage 1: read and merge svala, solar2 and obeliks tokenization.
    source_tokens, target_tokens, edges = tokenize(args)

    # Stage 2: annotate with classla.
    source_annotated, target_annotated = annotate(source_tokens, target_tokens, args)

    # Stage 3: generate TEI and write output.
    write_tei(source_annotated, target_annotated, edges, args)
def main(args):
    """Script entry point: hand the parsed command-line arguments to ``process_file``."""
    process_file(args)
if __name__ == '__main__': if __name__ == '__main__':
@ -1198,8 +1372,12 @@ if __name__ == '__main__':
help='input file in (gz or xml currently). If none, then just database is loaded') help='input file in (gz or xml currently). If none, then just database is loaded')
parser.add_argument('--svala_generated_text_folder', default='data/svala_generated_text.formatted', parser.add_argument('--svala_generated_text_folder', default='data/svala_generated_text.formatted',
help='input file in (gz or xml currently). If none, then just database is loaded') help='input file in (gz or xml currently). If none, then just database is loaded')
parser.add_argument('--raw_conllu_interprocessing', default='data/processing.raw_conllu', parser.add_argument('--tokenization_interprocessing', default='data/processing.tokenization',
help='input file in (gz or xml currently). If none, then just database is loaded') help='input file in (gz or xml currently). If none, then just database is loaded')
parser.add_argument('--overwrite_tokenization', action='store_true', help='input file in (gz or xml currently). If none, then just database is loaded')
parser.add_argument('--annotation_interprocessing', default='data/processing.annotation',
help='input file in (gz or xml currently). If none, then just database is loaded')
parser.add_argument('--overwrite_annotation', action='store_true', help='input file in (gz or xml currently). If none, then just database is loaded')
args = parser.parse_args() args = parser.parse_args()
start = time.time() start = time.time()