You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
211 lines
9.6 KiB
211 lines
9.6 KiB
import copy
|
|
import csv
|
|
import json
|
|
import os
|
|
from lxml import etree
|
|
import conllu
|
|
|
|
from src.create_tei import construct_sentence_from_list, \
|
|
construct_paragraph_from_list, TeiDocument, build_tei_etrees, build_links, build_complete_tei, convert_bibl
|
|
|
|
|
|
def form_paragraphs(annotated_source_divs, metadata):
|
|
etree_source_divs = []
|
|
for div_i, div_tuple in enumerate(annotated_source_divs):
|
|
div_name, div = div_tuple
|
|
if div_name[:-1] not in metadata:
|
|
print(div_name[:-1])
|
|
continue
|
|
div_metadata = metadata[div_name[:-1]]
|
|
|
|
etree_source_paragraphs = []
|
|
|
|
for par_i, paragraph_tuple in enumerate(div):
|
|
par_name, paragraph = paragraph_tuple
|
|
etree_source_sentences = []
|
|
|
|
for sentence_id, sentence in enumerate(paragraph):
|
|
if len(sentence) > 0:
|
|
conllu_parsed = conllu.parse(sentence)[0]
|
|
etree_source_sentences.append(
|
|
construct_sentence_from_list(str(sentence_id + 1), conllu_parsed, True))
|
|
|
|
etree_source_paragraphs.append(construct_paragraph_from_list(div_name, par_name, etree_source_sentences))
|
|
|
|
etree_source_divs.append((etree_source_paragraphs, div_name, div_metadata))
|
|
|
|
return etree_source_divs, div_name
|
|
|
|
|
|
def read_metadata(args):
|
|
texts_metadata = []
|
|
with open(args.texts_metadata, 'r') as file:
|
|
csvreader = csv.reader(file, delimiter='|', quotechar='"')
|
|
column_names = []
|
|
for i, row in enumerate(csvreader):
|
|
if i == 0:
|
|
column_names = row
|
|
continue
|
|
else:
|
|
row_dict = {}
|
|
for j, content in enumerate(row):
|
|
row_dict[column_names[j]] = content.strip()
|
|
texts_metadata.append(row_dict)
|
|
|
|
# handle teachers
|
|
teachers_metadata = {}
|
|
with open(args.teachers_metadata, 'r') as file:
|
|
csvreader = csv.reader(file, delimiter='\t', quotechar='"')
|
|
column_names = []
|
|
for i, row in enumerate(csvreader):
|
|
if i == 0:
|
|
column_names = row
|
|
continue
|
|
else:
|
|
row_dict = {}
|
|
for j, content in enumerate(row):
|
|
row_dict[column_names[j]] = content
|
|
row_dict['Ime in priimek'] = row_dict['Ime in priimek'].strip()
|
|
teachers_metadata[row_dict['Ime in priimek']] = row_dict
|
|
|
|
# handle authors
|
|
authors_metadata = {}
|
|
with open(args.authors_metadata, 'r') as file:
|
|
csvreader = csv.reader(file, delimiter='|', quotechar='"')
|
|
column_names = []
|
|
for i, row in enumerate(csvreader):
|
|
if i == 0:
|
|
column_names = row
|
|
continue
|
|
elif i == 1:
|
|
active_column_name = ''
|
|
for j, sub_name in enumerate(row):
|
|
if column_names[j]:
|
|
active_column_name = column_names[j]
|
|
if sub_name:
|
|
column_names[j] = f'{active_column_name} - {sub_name}'
|
|
continue
|
|
elif i == 2:
|
|
continue
|
|
else:
|
|
row_dict = {}
|
|
for j, content in enumerate(row):
|
|
row_dict[column_names[j]] = content.strip()
|
|
row_dict['Ime in priimek'] = row_dict['Ime in priimek'].strip()
|
|
authors_metadata[row_dict['Ime in priimek']] = row_dict
|
|
|
|
translations = {}
|
|
with open(args.translations, 'r') as file:
|
|
csvreader = csv.reader(file, delimiter='\t', quotechar='"')
|
|
for row in csvreader:
|
|
translations[row[0]] = row[1]
|
|
|
|
return texts_metadata, authors_metadata, teachers_metadata, translations
|
|
|
|
|
|
def process_metadata(args):
|
|
texts_metadata, authors_metadata, teachers_metadata, translations = read_metadata(args)
|
|
|
|
metadata = {}
|
|
for document_metadata in texts_metadata:
|
|
document_metadata['Tvorec'] = document_metadata['Tvorec'].strip()
|
|
if document_metadata['Tvorec'] not in authors_metadata:
|
|
if document_metadata['Tvorec']:
|
|
print(document_metadata['Tvorec'])
|
|
continue
|
|
author_metadata = authors_metadata[document_metadata['Tvorec']]
|
|
metadata_el = {}
|
|
for attribute_name_sl, attribute_name_en in translations.items():
|
|
if attribute_name_sl in document_metadata:
|
|
if attribute_name_sl == 'Ocena':
|
|
grade = f'{document_metadata[attribute_name_sl]} od {document_metadata["Najvišja možna ocena"]}' if document_metadata[attribute_name_sl] and document_metadata["Najvišja možna ocena"] else ''
|
|
metadata_el[attribute_name_en] = grade
|
|
elif attribute_name_sl == 'Tvorec':
|
|
metadata_el[attribute_name_en] = author_metadata['Koda tvorca']
|
|
elif attribute_name_sl == 'Učitelj':
|
|
metadata_el[attribute_name_en] = teachers_metadata[document_metadata['Učitelj']]['Koda'] if document_metadata['Učitelj'] in teachers_metadata else None
|
|
else:
|
|
metadata_el[attribute_name_en] = document_metadata[attribute_name_sl]
|
|
elif attribute_name_sl in author_metadata:
|
|
metadata_el[attribute_name_en] = author_metadata[attribute_name_sl]
|
|
elif attribute_name_sl == 'Ime šole, Fakulteta':
|
|
curr_school = []
|
|
if author_metadata["Trenutno šolanje - Ime šole"]:
|
|
curr_school.append(author_metadata["Trenutno šolanje - Ime šole"])
|
|
if author_metadata["Trenutno šolanje - Fakulteta"]:
|
|
curr_school.append(author_metadata["Trenutno šolanje - Fakulteta"])
|
|
metadata_el['Current school'] = ', '.join(curr_school)
|
|
elif attribute_name_sl == 'Stopnja študija':
|
|
metadata_el[attribute_name_en] = author_metadata['Trenutno šolanje - Stopnja študija']
|
|
elif attribute_name_sl == 'Leto študija':
|
|
metadata_el[attribute_name_en] = author_metadata['Trenutno šolanje - Leto študija']
|
|
elif attribute_name_sl == 'Ostali jeziki':
|
|
metadata_el[attribute_name_en] = ','.join([k[16:] for k, v in author_metadata.items() if k[:13] == 'Ostali jeziki' and v == 'ja'])
|
|
elif attribute_name_sl == 'Kje učenje':
|
|
metadata_el[attribute_name_en] = author_metadata['Življenje v Sloveniji pred tem programom - Kje?']
|
|
elif attribute_name_sl == 'Koliko časa učenje?':
|
|
metadata_el[attribute_name_en] = author_metadata['Življenje v Sloveniji pred tem programom - Koliko časa?']
|
|
elif attribute_name_sl == 'Učbeniki':
|
|
metadata_el[attribute_name_en] = author_metadata['Učenje slovenščine pred tem programom - Učbeniki']
|
|
elif attribute_name_sl == 'Kje?':
|
|
metadata_el[attribute_name_en] = author_metadata['Učenje slovenščine pred L+ - Kje?']
|
|
elif attribute_name_sl == 'Koliko časa?':
|
|
metadata_el[attribute_name_en] = author_metadata['Učenje slovenščine pred L+ - Koliko čas?']
|
|
else:
|
|
raise Exception(f'{attribute_name_sl} not found!')
|
|
|
|
metadata[metadata_el['Text ID']] = metadata_el
|
|
|
|
return metadata
|
|
|
|
def write_tei(annotated_source_divs, annotated_target_divs, document_edges, args):
|
|
print('BUILDING LINKS...')
|
|
etree_links = build_links(document_edges)
|
|
|
|
with open(os.path.join(args.results_folder, f"links.xml"), 'w') as tf:
|
|
tf.write(etree.tostring(etree_links, pretty_print=True, encoding='utf-8').decode())
|
|
|
|
with open(os.path.join(args.results_folder, f"links.json"), 'w') as jf:
|
|
json.dump(document_edges, jf, ensure_ascii=False, indent=" ")
|
|
|
|
print('WRITTING TEI...')
|
|
etree_source_documents = []
|
|
etree_target_documents = []
|
|
|
|
print('PREPARING METADATA FOR BIBL...')
|
|
metadata = process_metadata(args)
|
|
|
|
print('WRITING SOURCE FILES...')
|
|
etree_source_divs, source_div_name = form_paragraphs(annotated_source_divs, metadata)
|
|
|
|
print('WRITING TARGET FILES...')
|
|
etree_target_divs, target_div_name = form_paragraphs(annotated_target_divs, metadata)
|
|
|
|
print('APPENDING DOCUMENT...')
|
|
etree_source_documents.append(
|
|
TeiDocument(source_div_name,
|
|
etree_source_divs, etree_target_divs))
|
|
etree_target_documents.append(
|
|
TeiDocument(target_div_name,
|
|
etree_target_divs, etree_source_divs))
|
|
|
|
print('BUILDING TEI DOCUMENTS...')
|
|
etree_source = build_tei_etrees(etree_source_documents)
|
|
etree_target = build_tei_etrees(etree_target_documents)
|
|
|
|
# to reduce RAM usage you may process the following in two steps, firstly write all but complete (by commenting complete tree code), secondly write only complete (by commenting "Writting all but complete" section of code and "deepcopy" function)
|
|
print('Writting all but complete')
|
|
with open(os.path.join(args.results_folder, f"source.xml"), 'w') as sf:
|
|
sf.write(etree.tostring(etree_source[0], pretty_print=True, encoding='utf-8').decode())
|
|
|
|
with open(os.path.join(args.results_folder, f"target.xml"), 'w') as tf:
|
|
tf.write(etree.tostring(etree_target[0], pretty_print=True, encoding='utf-8').decode())
|
|
|
|
print('COMPLETE TREE CREATION...')
|
|
complete_etree = build_complete_tei(copy.deepcopy(etree_source), copy.deepcopy(etree_target), etree_links)
|
|
# complete_etree = build_complete_tei(etree_source, etree_target, etree_links)
|
|
|
|
print('WRITING COMPLETE TREE')
|
|
with open(os.path.join(args.results_folder, f"complete.xml"), 'w') as tf:
|
|
tf.write(etree.tostring(complete_etree, pretty_print=True, encoding='utf-8').decode())
|