diff --git a/src/create_tei.py b/src/create_tei.py
index 686f524..b869dcc 100644
--- a/src/create_tei.py
+++ b/src/create_tei.py
@@ -6,6 +6,36 @@ from conversion_utils.translate_conllu_jos import get_syn_map
 
 from lxml import etree
 
+kost_translations = {
+    "Author": "Author",
+    "Sex": "Sex",
+    "Year of birth": "YearOfBirth",
+    "Country": "Country",
+    "Employment status": "EmploymentStatus",
+    "Completed education": "CompletedEducation",
+    "Current school": "CurrentSchool",
+    "First language": "FirstLang",
+    "Knowledge of other languages": "OtherLang",
+    "Duration of Slovene language learning": "DurSlvLearning",
+    "Experience with Slovene before current program": "ExpSlv",
+    "Language proficiency in Slovene": "ProficSlv",
+    "Life in Slovenija before this current program": "LifeSlovenia",
+    "Location of Slovene language learning": "LocSlvLearning",
+    "Creation date": "CreationDate",
+    "Teacher": "Teacher",
+    "Academic year": "AcademicYear",
+    "Grade": "Grade",
+    "Input type": "InputType",
+    "Program type": "ProgramType",
+    "Program subtype": "ProgramSubtype",
+    "Slovene textbooks used": "SloveneTextbooks",
+    "Study cycle": "StudyCycle",
+    "Study year": "StudyYear",
+    "Task setting": "TaskSetting",
+    "Topic": "Topic",
+    "Instruction": "Instruction"
+}
+
 labels_mapper = {
     "B/GLAG/moči_morati": "B/GLAG/moči-morati",
     "B/MEN/besedna_družina": "B/MEN/besedna-družina",
@@ -231,10 +261,12 @@
             tag_usage.set('gi', tag)
             tag_usage.set('occurs', str(count))
 
-        for (paras, div_id), (_, corresp_div_id) in zip(self.divs, self.corresp_divs):
+        for (paras, div_id, metadata), (_, corresp_div_id, _) in zip(self.divs, self.corresp_divs):
             div = etree.Element('div')
             set_xml_attr(div, 'id', div_id)
             div.set('corresp', f'#{corresp_div_id}')
+            bibl = create_bibl(metadata)
+            div.append(bibl)
             for para in paras:
                 div.append(para.as_xml())
             body.append(div)
@@ -245,6 +277,24 @@
         self.paragraphs.append(paragraph)
 
 
+def create_bibl(metadata):
+    bibl = etree.Element('bibl')
+    bibl.set('n', metadata['Text ID'])
+    for k, v in metadata.items():
+        if k == 'Text ID' or not v:
+            continue
+        note = etree.Element('note')
+        if k not in kost_translations:
+            # print(k)
+            key = ''.join([el.capitalize() for el in k.split()])
+        else:
+            key = kost_translations[k]
+        note.set('ana', f'#{key}')
+        # set_xml_attr(note, 'lang', 'sl')
+        note.text = f'{v}'
+        bibl.append(note)
+    return bibl
+
 def convert_bibl(bibl):
     etree_bibl = etree.Element('bibl')
     # etree_bibl.set('corresp', bibl.get('corresp'))
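Illustration (not part of the patch): a minimal standalone sketch of the <bibl> element that create_bibl() assembles for one text. The sample record, its values and the small lookup dict are invented for demonstration; in the patch the record comes from process_metadata() in src/write/write.py and the lookup is the full kost_translations table.

# sketch only -- invented sample data, same element-building logic as create_bibl()
from lxml import etree

sample_metadata = {
    'Text ID': 'KOST-0001',                  # hypothetical text ID
    'Author': 'A0001',                       # hypothetical author code
    'First language': 'Ukrainian',
    'Topic': 'My first week in Ljubljana',
}
translations = {'Author': 'Author', 'First language': 'FirstLang', 'Topic': 'Topic'}

bibl = etree.Element('bibl')
bibl.set('n', sample_metadata['Text ID'])
for k, v in sample_metadata.items():
    if k == 'Text ID' or not v:
        continue
    note = etree.Element('note')
    # known keys go through the translation table, unknown keys fall back to CamelCase
    key = translations.get(k, ''.join(el.capitalize() for el in k.split()))
    note.set('ana', f'#{key}')
    note.text = f'{v}'
    bibl.append(note)

print(etree.tostring(bibl, pretty_print=True, encoding='unicode'))
# <bibl n="KOST-0001">
#   <note ana="#Author">A0001</note>
#   <note ana="#FirstLang">Ukrainian</note>
#   <note ana="#Topic">My first week in Ljubljana</note>
# </bibl>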
diff --git a/src/read/read_and_merge.py b/src/read/read_and_merge.py
index ddffea4..00f5c3b 100644
--- a/src/read/read_and_merge.py
+++ b/src/read/read_and_merge.py
@@ -316,8 +316,8 @@ def tokenize(args):
         for tokenized_para in tokenized_divs[div_id]:
             paragraph_name, source_res, target_res, edges = tokenized_para
             split_para_name = paragraph_name[:-5].split('-')
-            div_name = '-'.join(split_para_name[:-1])
-            par_name = split_para_name[-1]
+            div_name = '-'.join(split_para_name[:-1]) if len(split_para_name) == 4 else '-'.join(split_para_name)
+            par_name = split_para_name[-1] if len(split_para_name) == 4 else '1'
             assert not par_name.isnumeric() or par_name not in alphabet, Exception('Incorrect paragraph name!')
             if par_name in alphabet:
                 par_name = str(alphabet.index(par_name) + 10)
diff --git a/src/write/write.py b/src/write/write.py
index 91d6458..413ebce 100644
--- a/src/write/write.py
+++ b/src/write/write.py
@@ -1,4 +1,5 @@
 import copy
+import csv
 import json
 import os
 from lxml import etree
@@ -8,10 +9,15 @@ from src.create_tei import construct_sentence_from_list, \
     construct_paragraph_from_list, TeiDocument, build_tei_etrees, build_links, build_complete_tei, convert_bibl
 
 
-def form_paragraphs(annotated_source_divs):
+def form_paragraphs(annotated_source_divs, metadata):
     etree_source_divs = []
     for div_i, div_tuple in enumerate(annotated_source_divs):
         div_name, div = div_tuple
+        if div_name[:-1] not in metadata:
+            print(div_name[:-1] + "!!!!!!!!!!!!!!!!!!")
+            print(div_name[:-1])
+            continue
+        div_metadata = metadata[div_name[:-1]]
 
         # file_name = file_name.replace('/', '_')
         # print(f'{i * 100 / folders_count} % : {file_name}')
@@ -29,10 +35,107 @@
             etree_source_paragraphs.append(construct_paragraph_from_list(div_name, par_name, etree_source_sentences))
 
-        etree_source_divs.append((etree_source_paragraphs, div_name))
+        etree_source_divs.append((etree_source_paragraphs, div_name, div_metadata))
 
     return etree_source_divs, div_name
 
 
+
+def read_metadata(args):
+    texts_metadata = []
+    with open(args.texts_metadata, 'r') as file:
+        csvreader = csv.reader(file, delimiter='\t', quotechar='"')
+        column_names = []
+        for i, row in enumerate(csvreader):
+            if i == 0:
+                column_names = row
+                continue
+            else:
+                row_dict = {}
+                for j, content in enumerate(row):
+                    row_dict[column_names[j]] = content
+                texts_metadata.append(row_dict)
+
+    authors_metadata = {}
+    with open(args.authors_metadata, 'r') as file:
+        csvreader = csv.reader(file, delimiter='\t', quotechar='"')
+        column_names = []
+        for i, row in enumerate(csvreader):
+            if i == 0:
+                column_names = row
+                continue
+            elif i == 1:
+                active_column_name = ''
+                for j, sub_name in enumerate(row):
+                    if column_names[j]:
+                        active_column_name = column_names[j]
+                    if sub_name:
+                        column_names[j] = f'{active_column_name} - {sub_name}'
+                continue
+            elif i == 2:
+                continue
+            else:
+                row_dict = {}
+                for j, content in enumerate(row):
+                    row_dict[column_names[j]] = content
+                row_dict['Ime in priimek'] = row_dict['Ime in priimek'].strip()
+                authors_metadata[row_dict['Ime in priimek']] = row_dict
+
+    translations = {}
+    with open(args.translations, 'r') as file:
+        csvreader = csv.reader(file, delimiter='\t', quotechar='"')
+        for row in csvreader:
+            translations[row[0]] = row[1]
+
+    return texts_metadata, authors_metadata, translations
+
+
+def process_metadata(args):
+    texts_metadata, authors_metadata, translations = read_metadata(args)
+
+    metadata = {}
+    for document_metadata in texts_metadata:
+        document_metadata['Tvorec'] = document_metadata['Tvorec'].strip()
+        if document_metadata['Tvorec'] not in authors_metadata:
+            if document_metadata['Tvorec']:
+                print(document_metadata['Tvorec'])
+            continue
+        author_metadata = authors_metadata[document_metadata['Tvorec']]
+        metadata_el = {}
+        for attribute_name_sl, attribute_name_en in translations.items():
+            if attribute_name_sl in document_metadata:
+                if attribute_name_sl == 'Ocena':
+                    metadata_el[attribute_name_en] = f'{document_metadata[attribute_name_sl]} od {document_metadata["Najvišja možna ocena"]}'
+                elif attribute_name_sl == 'Tvorec':
+                    metadata_el[attribute_name_en] = author_metadata['Koda tvorca']
+                else:
+                    metadata_el[attribute_name_en] = document_metadata[attribute_name_sl]
+            elif attribute_name_sl in author_metadata:
+                metadata_el[attribute_name_en] = author_metadata[attribute_name_sl]
+            elif attribute_name_sl == 'Ime šole, Fakulteta':
+                metadata_el['Current school'] = f'{author_metadata["Trenutno šolanje - Ime šole"]}, {author_metadata["Trenutno šolanje - Fakulteta"]}'
+            elif attribute_name_sl == 'Stopnja študija':
+                metadata_el[attribute_name_en] = author_metadata['Trenutno šolanje - Stopnja študija']
+            elif attribute_name_sl == 'Leto študija':
+                metadata_el[attribute_name_en] = author_metadata['Trenutno šolanje - Leto študija']
+            elif attribute_name_sl == 'Ostali jeziki':
+                metadata_el[attribute_name_en] = ','.join([k[16:] for k, v in author_metadata.items() if k[:13] == 'Ostali jeziki' and v == 'ja'])
+            elif attribute_name_sl == 'Kje učenje':
+                metadata_el[attribute_name_en] = author_metadata['Življenje v Sloveniji pred tem programom - Kje?']
+            elif attribute_name_sl == 'Koliko časa učenje?':
+                metadata_el[attribute_name_en] = author_metadata['Življenje v Sloveniji pred tem programom - Koliko časa?']
+            elif attribute_name_sl == 'Učbeniki':
+                metadata_el[attribute_name_en] = author_metadata['Učenje slovenščine pred tem programom - Učbeniki']
+            elif attribute_name_sl == 'Kje?':
+                metadata_el[attribute_name_en] = author_metadata['Učenje slovenščine pred L+ - Kje?']
+            elif attribute_name_sl == 'Koliko časa?':
+                metadata_el[attribute_name_en] = author_metadata['Učenje slovenščine pred L+ - Koliko čas?']
+            else:
+                raise Exception(f'{attribute_name_sl} not found!')
+
+        metadata[metadata_el['Text ID']] = metadata_el
+
+    return metadata
+
 def write_tei(annotated_source_divs, annotated_target_divs, document_edges, args):
     print('BUILDING LINKS...')
     etree_links = build_links(document_edges)
@@ -47,11 +150,14 @@
     etree_source_documents = []
     etree_target_documents = []
 
+    print('PREPARING METADATA FOR BIBL...')
+    metadata = process_metadata(args)
+
     print('WRITING SOURCE FILES...')
-    etree_source_divs, source_div_name = form_paragraphs(annotated_source_divs)
+    etree_source_divs, source_div_name = form_paragraphs(annotated_source_divs, metadata)
 
     print('WRITING TARGET FILES...')
-    etree_target_divs, target_div_name = form_paragraphs(annotated_target_divs)
+    etree_target_divs, target_div_name = form_paragraphs(annotated_target_divs, metadata)
 
     print('APPENDING DOCUMENT...')
     etree_source_documents.append(
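Illustration (not part of the patch): a minimal sketch of the two-row header merging that read_metadata() performs on the authors sheet, assuming the tab-separated layout implied by the code. The author row and school names below are invented; the merged column names 'Ime in priimek' and 'Trenutno šolanje - ...' are the ones process_metadata() actually looks up.

# sketch only -- invented rows, same header-merging logic as read_metadata()
import csv
import io

authors_tsv = (
    "Ime in priimek\tTrenutno šolanje\t\n"
    "\tIme šole\tFakulteta\n"
    "\t\t\n"                                   # read_metadata() skips this third row
    "Ana Novak\tUL\tFilozofska fakulteta\n"    # invented author record
)

rows = list(csv.reader(io.StringIO(authors_tsv), delimiter='\t', quotechar='"'))
column_names = rows[0]
active_column_name = ''
for j, sub_name in enumerate(rows[1]):
    if column_names[j]:
        active_column_name = column_names[j]
    if sub_name:
        column_names[j] = f'{active_column_name} - {sub_name}'

print(column_names)
# ['Ime in priimek', 'Trenutno šolanje - Ime šole', 'Trenutno šolanje - Fakulteta']

author = dict(zip(column_names, rows[3]))
print(author['Trenutno šolanje - Fakulteta'])   # Filozofska fakulteta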
diff --git a/svala2tei.py b/svala2tei.py
index 0e7e387..e568a65 100644
--- a/svala2tei.py
+++ b/svala2tei.py
@@ -253,6 +253,12 @@ if __name__ == '__main__':
                         help='input file in (gz or xml currently). If none, then just database is loaded')
     parser.add_argument('--raw_text', default='data/KOST/raw',
                         help='input file in (gz or xml currently). If none, then just database is loaded')
+    parser.add_argument('--texts_metadata', default='data/KOST/texts_metadata2.csv',
+                        help='KOST metadata location')
+    parser.add_argument('--authors_metadata', default='data/KOST/authors_metadata2.csv',
+                        help='KOST authors location')
+    parser.add_argument('--translations', default='data/KOST/translations.csv',
+                        help='KOST Slovenian-English column names translations')
     parser.add_argument('--tokenization_interprocessing', default='data/processing.tokenization',
                         help='input file in (gz or xml currently). If none, then just database is loaded')
     parser.add_argument('--overwrite_tokenization', action='store_true', help='input file in (gz or xml currently). If none, then just database is loaded')
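Illustration (not part of the patch): how the three new command-line options are expected to feed the metadata step. The defaults are the ones added to svala2tei.py; actually building the dict requires the KOST metadata files to exist at these paths and the patched src.write.write module to be importable from the repo root.

# sketch only -- mirrors the new argparse options and how they would be consumed
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--texts_metadata', default='data/KOST/texts_metadata2.csv',
                    help='KOST metadata location')
parser.add_argument('--authors_metadata', default='data/KOST/authors_metadata2.csv',
                    help='KOST authors location')
parser.add_argument('--translations', default='data/KOST/translations.csv',
                    help='KOST Slovenian-English column names translations')
args = parser.parse_args([])   # empty argv -> use the defaults above

# With the patched repo on the path and the files in place, the per-text metadata
# dict keyed by 'Text ID' would then be built with:
#     from src.write.write import process_metadata
#     metadata = process_metadata(args)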