Fixed metadata

This commit is contained in:
Luka 2023-01-24 10:47:26 +01:00
parent 34d51b4fda
commit 3ceb706cef
4 changed files with 169 additions and 7 deletions

View File

@ -6,6 +6,36 @@ from conversion_utils.translate_conllu_jos import get_syn_map
from lxml import etree from lxml import etree
# Lookup from English metadata field names to the compact identifiers that
# create_bibl() writes (prefixed with '#') into the `ana` attribute of each
# <note>. Field names missing here get an identifier derived from the name.
kost_translations = {
    "Author": "Author",
    "Sex": "Sex",
    "Year of birth": "YearOfBirth",
    "Country": "Country",
    "Employment status": "EmploymentStatus",
    "Completed education": "CompletedEducation",
    "Current school": "CurrentSchool",
    "First language": "FirstLang",
    "Knowledge of other languages": "OtherLang",
    "Duration of Slovene language learning": "DurSlvLearning",
    "Experience with Slovene before current program": "ExpSlv",
    "Language proficiency in Slovene": "ProficSlv",
    "Life in Slovenija before this current program": "LifeSlovenia",
    "Location of Slovene language learning": "LocSlvLearning",
    "Creation date": "CreationDate",
    "Teacher": "Teacher",
    "Academic year": "AcademicYear",
    "Grade": "Grade",
    "Input type": "InputType",
    "Program type": "ProgramType",
    "Program subtype": "ProgramSubtype",
    "Slovene textbooks used": "SloveneTextbooks",
    "Study cycle": "StudyCycle",
    "Study year": "StudyYear",
    "Task setting": "TaskSetting",
    "Topic": "Topic",
    "Instruction": "Instruction",
}
labels_mapper = { labels_mapper = {
"B/GLAG/moči_morati": "B/GLAG/moči-morati", "B/GLAG/moči_morati": "B/GLAG/moči-morati",
"B/MEN/besedna_družina": "B/MEN/besedna-družina", "B/MEN/besedna_družina": "B/MEN/besedna-družina",
@ -231,10 +261,12 @@ class TeiDocument:
tag_usage.set('gi', tag) tag_usage.set('gi', tag)
tag_usage.set('occurs', str(count)) tag_usage.set('occurs', str(count))
for (paras, div_id), (_, corresp_div_id) in zip(self.divs, self.corresp_divs): for (paras, div_id, metadata), (_, corresp_div_id, _) in zip(self.divs, self.corresp_divs):
div = etree.Element('div') div = etree.Element('div')
set_xml_attr(div, 'id', div_id) set_xml_attr(div, 'id', div_id)
div.set('corresp', f'#{corresp_div_id}') div.set('corresp', f'#{corresp_div_id}')
bibl = create_bibl(metadata)
div.append(bibl)
for para in paras: for para in paras:
div.append(para.as_xml()) div.append(para.as_xml())
body.append(div) body.append(div)
@ -245,6 +277,24 @@ class TeiDocument:
self.paragraphs.append(paragraph) self.paragraphs.append(paragraph)
def create_bibl(metadata):
    """Build a ``bibl`` element carrying one text's metadata.

    The value stored under ``'Text ID'`` becomes the element's ``n``
    attribute. Every other entry with a truthy value is appended as a
    ``note`` child whose ``ana`` attribute is ``'#'`` plus the field's
    identifier and whose text is the value formatted as a string.

    :param metadata: mapping of field name to value; must contain ``'Text ID'``.
    :return: the populated ``bibl`` element.
    :raises KeyError: if ``metadata`` has no ``'Text ID'`` entry.
    """
    bibl = etree.Element('bibl')
    bibl.set('n', metadata['Text ID'])

    for field_name, value in metadata.items():
        # The text ID already lives on the element itself and empty values
        # would only produce empty notes, so both are left out.
        if field_name != 'Text ID' and value:
            if field_name in kost_translations:
                identifier = kost_translations[field_name]
            else:
                # Unknown field: glue its capitalized words together.
                identifier = ''.join(word.capitalize() for word in field_name.split())

            note = etree.Element('note')
            note.set('ana', f'#{identifier}')
            note.text = f'{value}'
            bibl.append(note)

    return bibl
def convert_bibl(bibl): def convert_bibl(bibl):
etree_bibl = etree.Element('bibl') etree_bibl = etree.Element('bibl')
# etree_bibl.set('corresp', bibl.get('corresp')) # etree_bibl.set('corresp', bibl.get('corresp'))

View File

@ -316,8 +316,8 @@ def tokenize(args):
for tokenized_para in tokenized_divs[div_id]: for tokenized_para in tokenized_divs[div_id]:
paragraph_name, source_res, target_res, edges = tokenized_para paragraph_name, source_res, target_res, edges = tokenized_para
split_para_name = paragraph_name[:-5].split('-') split_para_name = paragraph_name[:-5].split('-')
div_name = '-'.join(split_para_name[:-1]) div_name = '-'.join(split_para_name[:-1]) if len(split_para_name) == 4 else '-'.join(split_para_name)
par_name = split_para_name[-1] par_name = split_para_name[-1] if len(split_para_name) == 4 else '1'
assert not par_name.isnumeric() or par_name not in alphabet, Exception('Incorrect paragraph name!') assert not par_name.isnumeric() or par_name not in alphabet, Exception('Incorrect paragraph name!')
if par_name in alphabet: if par_name in alphabet:
par_name = str(alphabet.index(par_name) + 10) par_name = str(alphabet.index(par_name) + 10)

View File

@ -1,4 +1,5 @@
import copy import copy
import csv
import json import json
import os import os
from lxml import etree from lxml import etree
@ -8,10 +9,15 @@ from src.create_tei import construct_sentence_from_list, \
construct_paragraph_from_list, TeiDocument, build_tei_etrees, build_links, build_complete_tei, convert_bibl construct_paragraph_from_list, TeiDocument, build_tei_etrees, build_links, build_complete_tei, convert_bibl
def form_paragraphs(annotated_source_divs): def form_paragraphs(annotated_source_divs, metadata):
etree_source_divs = [] etree_source_divs = []
for div_i, div_tuple in enumerate(annotated_source_divs): for div_i, div_tuple in enumerate(annotated_source_divs):
div_name, div = div_tuple div_name, div = div_tuple
if div_name[:-1] not in metadata:
print(div_name[:-1] + "!!!!!!!!!!!!!!!!!!")
print(div_name[:-1])
continue
div_metadata = metadata[div_name[:-1]]
# file_name = file_name.replace('/', '_') # file_name = file_name.replace('/', '_')
# print(f'{i * 100 / folders_count} % : {file_name}') # print(f'{i * 100 / folders_count} % : {file_name}')
@ -29,10 +35,107 @@ def form_paragraphs(annotated_source_divs):
etree_source_paragraphs.append(construct_paragraph_from_list(div_name, par_name, etree_source_sentences)) etree_source_paragraphs.append(construct_paragraph_from_list(div_name, par_name, etree_source_sentences))
etree_source_divs.append((etree_source_paragraphs, div_name)) etree_source_divs.append((etree_source_paragraphs, div_name, div_metadata))
return etree_source_divs, div_name return etree_source_divs, div_name
def _read_tsv_rows(path):
    """Return all rows of the tab-separated file at *path* as lists of cells."""
    # The encoding is pinned because the files contain non-ASCII column names
    # and values; relying on the locale default makes decoding platform
    # dependent. newline='' is what the csv module requires so that newlines
    # embedded in quoted cells are parsed correctly.
    with open(path, 'r', encoding='utf-8', newline='') as file:
        return list(csv.reader(file, delimiter='\t', quotechar='"'))


def _merge_sub_header(header, sub_header):
    """Combine a header row with the sub-header row beneath it.

    A non-empty header cell stays "active" for the empty cells that follow
    it; every column with a sub-header is renamed to
    ``'<active header> - <sub-header>'``. Columns without a sub-header keep
    their original name.
    """
    merged = list(header)
    active_column_name = ''
    for j, (column_name, sub_name) in enumerate(zip(header, sub_header)):
        if column_name:
            active_column_name = column_name
        if sub_name:
            merged[j] = f'{active_column_name} - {sub_name}'
    return merged


def read_metadata(args):
    """Load the texts, authors and translations tables.

    All three files are tab-separated with ``"`` as the quote character.

    * ``args.texts_metadata``: first row is the header, each later row
      becomes a dict keyed by column name.
    * ``args.authors_metadata``: the first row is the header, the second a
      sub-header that is merged into the column names, the third is skipped
      and the rest are data rows. Rows are keyed by the stripped value of
      the ``'Ime in priimek'`` column.
    * ``args.translations``: each row maps its first cell to its second.

    Completely empty rows are skipped. Cells beyond the last header column
    are ignored instead of raising ``IndexError``.

    :param args: object with ``texts_metadata``, ``authors_metadata`` and
        ``translations`` path attributes.
    :return: tuple ``(texts_metadata, authors_metadata, translations)`` of
        a list of dicts, a dict of dicts and a dict of strings.
    :raises KeyError: if an author row has no ``'Ime in priimek'`` column.
    :raises IndexError: if a non-empty translations row has fewer than two cells.
    """
    text_rows = _read_tsv_rows(args.texts_metadata)
    text_columns = text_rows[0] if text_rows else []
    texts_metadata = [dict(zip(text_columns, row)) for row in text_rows[1:] if row]

    author_rows = _read_tsv_rows(args.authors_metadata)
    author_columns = author_rows[0] if author_rows else []
    if len(author_rows) > 1:
        author_columns = _merge_sub_header(author_columns, author_rows[1])

    authors_metadata = {}
    # Rows 0 and 1 are the headers handled above; row 2 is deliberately skipped.
    for row in author_rows[3:]:
        if not row:
            continue
        row_dict = dict(zip(author_columns, row))
        row_dict['Ime in priimek'] = row_dict['Ime in priimek'].strip()
        authors_metadata[row_dict['Ime in priimek']] = row_dict

    translations = {
        row[0]: row[1]
        for row in _read_tsv_rows(args.translations)
        if row
    }

    return texts_metadata, authors_metadata, translations
# Translation keys whose value is copied unchanged from a single author column
# (author column names are the header/sub-header merged names).
# NOTE(review): 'Kje učenje' / 'Koliko časa učenje?' read the
# 'Življenje v Sloveniji ...' columns while 'Kje?' / 'Koliko časa?' read the
# 'Učenje slovenščine pred L+' columns - this looks like it could be swapped;
# confirm against the authors file.
# NOTE(review): 'Koliko čas?' is spelled differently from the other
# 'Koliko časa?' column - confirm it matches the actual header.
_AUTHOR_COLUMN_BY_ATTRIBUTE = {
    'Stopnja študija': 'Trenutno šolanje - Stopnja študija',
    'Leto študija': 'Trenutno šolanje - Leto študija',
    'Kje učenje': 'Življenje v Sloveniji pred tem programom - Kje?',
    'Koliko časa učenje?': 'Življenje v Sloveniji pred tem programom - Koliko časa?',
    'Učbeniki': 'Učenje slovenščine pred tem programom - Učbeniki',
    'Kje?': 'Učenje slovenščine pred L+ - Kje?',
    'Koliko časa?': 'Učenje slovenščine pred L+ - Koliko čas?',
}


def process_metadata(args):
    """Combine text and author metadata into one record per text.

    For every text whose (stripped) ``'Tvorec'`` value names a known author,
    a dict keyed by the English attribute names from the translations table
    is built. Each Slovene attribute is resolved from the text row first,
    then from the author row, then from a set of specially handled names.
    Texts with an unknown author are skipped; a non-empty unknown author
    name is printed.

    Composite values are built only from their non-empty parts, so a missing
    grade or school no longer yields a value made of separators alone (such
    as ``' od '`` or ``', '``), which downstream code would treat as real
    content.

    :param args: passed through to ``read_metadata``.
    :return: dict mapping each record's ``'Text ID'`` value to that record.
    :raises Exception: if a translation key cannot be resolved.
    :raises KeyError: if a required column or the ``'Text ID'`` attribute
        is missing.
    """
    texts_metadata, authors_metadata, translations = read_metadata(args)

    metadata = {}
    for document_metadata in texts_metadata:
        author_name = document_metadata['Tvorec'].strip()
        document_metadata['Tvorec'] = author_name
        if author_name not in authors_metadata:
            if author_name:
                print(author_name)
            continue
        author_metadata = authors_metadata[author_name]

        metadata_el = {}
        for attribute_name_sl, attribute_name_en in translations.items():
            if attribute_name_sl in document_metadata:
                if attribute_name_sl == 'Ocena':
                    grade = document_metadata[attribute_name_sl]
                    max_grade = document_metadata["Najvišja možna ocena"]
                    # "<grade> od <max>" only makes sense when both exist;
                    # without a grade the value stays empty.
                    if grade and max_grade:
                        metadata_el[attribute_name_en] = f'{grade} od {max_grade}'
                    else:
                        metadata_el[attribute_name_en] = grade
                elif attribute_name_sl == 'Tvorec':
                    # The author's code is stored, not the author's name.
                    metadata_el[attribute_name_en] = author_metadata['Koda tvorca']
                else:
                    metadata_el[attribute_name_en] = document_metadata[attribute_name_sl]
            elif attribute_name_sl in author_metadata:
                metadata_el[attribute_name_en] = author_metadata[attribute_name_sl]
            elif attribute_name_sl == 'Ime šole, Fakulteta':
                school_parts = (
                    author_metadata["Trenutno šolanje - Ime šole"],
                    author_metadata["Trenutno šolanje - Fakulteta"],
                )
                # NOTE(review): the key is hard-coded here instead of using
                # attribute_name_en like every other branch - confirm intent.
                metadata_el['Current school'] = ', '.join(part for part in school_parts if part)
            elif attribute_name_sl == 'Ostali jeziki':
                # Columns are named 'Ostali jeziki - <language>'; slicing off
                # the 16-character prefix leaves the language name.
                metadata_el[attribute_name_en] = ','.join(
                    [k[16:] for k, v in author_metadata.items() if k[:13] == 'Ostali jeziki' and v == 'ja']
                )
            elif attribute_name_sl in _AUTHOR_COLUMN_BY_ATTRIBUTE:
                metadata_el[attribute_name_en] = author_metadata[_AUTHOR_COLUMN_BY_ATTRIBUTE[attribute_name_sl]]
            else:
                raise Exception(f'{attribute_name_sl} not found!')

        metadata[metadata_el['Text ID']] = metadata_el

    return metadata
def write_tei(annotated_source_divs, annotated_target_divs, document_edges, args): def write_tei(annotated_source_divs, annotated_target_divs, document_edges, args):
print('BUILDING LINKS...') print('BUILDING LINKS...')
etree_links = build_links(document_edges) etree_links = build_links(document_edges)
@ -47,11 +150,14 @@ def write_tei(annotated_source_divs, annotated_target_divs, document_edges, args
etree_source_documents = [] etree_source_documents = []
etree_target_documents = [] etree_target_documents = []
print('PREPARING METADATA FOR BIBL...')
metadata = process_metadata(args)
print('WRITING SOURCE FILES...') print('WRITING SOURCE FILES...')
etree_source_divs, source_div_name = form_paragraphs(annotated_source_divs) etree_source_divs, source_div_name = form_paragraphs(annotated_source_divs, metadata)
print('WRITING TARGET FILES...') print('WRITING TARGET FILES...')
etree_target_divs, target_div_name = form_paragraphs(annotated_target_divs) etree_target_divs, target_div_name = form_paragraphs(annotated_target_divs, metadata)
print('APPENDING DOCUMENT...') print('APPENDING DOCUMENT...')
etree_source_documents.append( etree_source_documents.append(

View File

@ -253,6 +253,12 @@ if __name__ == '__main__':
help='input file in (gz or xml currently). If none, then just database is loaded') help='input file in (gz or xml currently). If none, then just database is loaded')
parser.add_argument('--raw_text', default='data/KOST/raw', parser.add_argument('--raw_text', default='data/KOST/raw',
help='input file in (gz or xml currently). If none, then just database is loaded') help='input file in (gz or xml currently). If none, then just database is loaded')
parser.add_argument('--texts_metadata', default='data/KOST/texts_metadata2.csv',
help='KOST metadata location')
parser.add_argument('--authors_metadata', default='data/KOST/authors_metadata2.csv',
help='KOST authors location')
parser.add_argument('--translations', default='data/KOST/translations.csv',
help='KOST Slovenian-English column names translations')
parser.add_argument('--tokenization_interprocessing', default='data/processing.tokenization', parser.add_argument('--tokenization_interprocessing', default='data/processing.tokenization',
help='input file in (gz or xml currently). If none, then just database is loaded') help='input file in (gz or xml currently). If none, then just database is loaded')
parser.add_argument('--overwrite_tokenization', action='store_true', help='input file in (gz or xml currently). If none, then just database is loaded') parser.add_argument('--overwrite_tokenization', action='store_true', help='input file in (gz or xml currently). If none, then just database is loaded')