You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

126 lines
5.4 KiB

import copy
import json
import os
from lxml import etree
import conllu
from src.create_tei import construct_sentence_from_list, \
construct_paragraph_from_list, TeiDocument, build_tei_etrees, build_links, build_complete_tei, convert_bibl
def write_tei(annotated_source_divs, annotated_target_divs, document_edges, args):
print('BUILDING LINKS...')
etree_links = build_links(document_edges)
with open(os.path.join(args.results_folder, f"links.xml"), 'w') as tf:
tf.write(etree.tostring(etree_links, pretty_print=True, encoding='utf-8').decode())
with open(os.path.join(args.results_folder, f"links.json"), 'w') as jf:
json.dump(document_edges, jf, ensure_ascii=False, indent=" ")
print('WRITTING TEI...')
etree_source_documents = []
etree_target_documents = []
etree_source_divs = []
etree_target_divs = []
# with open(args.solar_file, 'r') as fp:
# logging.info(args.solar_file)
# et = ElementTree.XML(fp.read())
# filename_encountered = False
i = 0
folders_count = 5484
div_i = 0
for div in et.iter('div'):
bibl = div.find('bibl')
file_name = bibl.get('n')
file_name = file_name.replace('/', '_')
print(f'{i * 100 / folders_count} % : {file_name}')
i += 1
# if i * 100 / folders_count > 50:
# filename_encountered = True
# # if file_name == 'KUS-G-slo-4-GO-E-2009-10071':
# # filename_encountered = True
# if i * 100 / folders_count > 51:
# filename_encountered = False
#
# if file_name == 'KUS-G-slo-1-LJ-E-2009_2010-10540':
# # div_i -= 1
# continue
#
# if file_name == 'KUS-SI-slo-2-NM-E-2009_2010-20362' or file_name == 'KUS-OS-slo-9-SG-R-2009_2010-40129' or file_name == 'KUS-OS-slo-7-SG-R-2009_2010-40173':
# # div_i -= 1
# continue
#
# if not filename_encountered:
# div_i+=1
#
# continue
etree_source_paragraphs = []
etree_target_paragraphs = []
# paragraph_edges = []
paragraphs = div.findall('p')
par_i = 0
for paragraph in paragraphs:
etree_source_sentences = []
etree_target_sentences = []
for sentence_id, source_conllu_annotated in enumerate(annotated_source_divs[div_i][par_i]):
if len(source_conllu_annotated) > 0:
source_conllu_parsed = conllu.parse(source_conllu_annotated)[0]
if len(source_conllu_annotated) > 0:
etree_source_sentences.append(construct_sentence_from_list(str(sentence_id + 1), source_conllu_parsed, True))
for sentence_id, target_conllu_annotated in enumerate(annotated_target_divs[div_i][par_i]):
if len(target_conllu_annotated) > 0:
target_conllu_parsed = conllu.parse(target_conllu_annotated)[0]
if len(target_conllu_annotated) > 0:
etree_target_sentences.append(construct_sentence_from_list(str(sentence_id + 1), target_conllu_parsed, False))
etree_source_paragraphs.append(construct_paragraph_from_list(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0], paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[1], etree_source_sentences, True))
etree_target_paragraphs.append(construct_paragraph_from_list(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0], paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[1], etree_target_sentences, False))
par_i += 1
etree_bibl = convert_bibl(bibl)
etree_source_divs.append((etree_source_paragraphs, copy.deepcopy(etree_bibl), paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 's'))
etree_target_divs.append((etree_target_paragraphs, copy.deepcopy(etree_bibl), paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 't'))
div_i += 1
print('APPENDING DOCUMENT...')
etree_source_documents.append(
TeiDocument(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 's',
etree_source_divs, etree_target_divs))
etree_target_documents.append(
TeiDocument(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 't',
etree_target_divs, etree_source_divs))
print('BUILDING TEI DOCUMENTS...')
etree_source = build_tei_etrees(etree_source_documents)
etree_target = build_tei_etrees(etree_target_documents)
print('Writting all but complete')
with open(os.path.join(args.results_folder, f"source.xml"), 'w') as sf:
sf.write(etree.tostring(etree_source[0], pretty_print=True, encoding='utf-8').decode())
with open(os.path.join(args.results_folder, f"target.xml"), 'w') as tf:
tf.write(etree.tostring(etree_target[0], pretty_print=True, encoding='utf-8').decode())
print('COMPLETE TREE CREATION...')
complete_etree = build_complete_tei(copy.deepcopy(etree_source), copy.deepcopy(etree_target), etree_links)
# complete_etree = build_complete_tei(etree_source, etree_target, etree_links)
print('WRITING COMPLETE TREE')
with open(os.path.join(args.results_folder, f"complete.xml"), 'w') as tf:
tf.write(etree.tostring(complete_etree, pretty_print=True, encoding='utf-8').decode())