Fixed metadata
This commit is contained in:
parent
34d51b4fda
commit
3ceb706cef
|
@ -6,6 +6,36 @@ from conversion_utils.translate_conllu_jos import get_syn_map
|
||||||
|
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
|
# Maps a KOST metadata field name to the identifier that create_bibl() puts
# (prefixed with '#') into the `ana` attribute of the matching <note>.
# Field names missing from this table fall back to a CamelCase form of the
# field name inside create_bibl().
kost_translations = {
    "Author": "Author",
    "Sex": "Sex",
    "Year of birth": "YearOfBirth",
    "Country": "Country",
    "Employment status": "EmploymentStatus",
    "Completed education": "CompletedEducation",
    "Current school": "CurrentSchool",
    "First language": "FirstLang",
    "Knowledge of other languages": "OtherLang",
    "Duration of Slovene language learning": "DurSlvLearning",
    "Experience with Slovene before current program": "ExpSlv",
    "Language proficiency in Slovene": "ProficSlv",
    "Life in Slovenija before this current program": "LifeSlovenia",
    "Location of Slovene language learning": "LocSlvLearning",
    "Creation date": "CreationDate",
    "Teacher": "Teacher",
    "Academic year": "AcademicYear",
    "Grade": "Grade",
    "Input type": "InputType",
    "Program type": "ProgramType",
    "Program subtype": "ProgramSubtype",
    "Slovene textbooks used": "SloveneTextbooks",
    "Study cycle": "StudyCycle",
    "Study year": "StudyYear",
    "Task setting": "TaskSetting",
    "Topic": "Topic",
    "Instruction": "Instruction",
}
|
||||||
|
|
||||||
labels_mapper = {
|
labels_mapper = {
|
||||||
"B/GLAG/moči_morati": "B/GLAG/moči-morati",
|
"B/GLAG/moči_morati": "B/GLAG/moči-morati",
|
||||||
"B/MEN/besedna_družina": "B/MEN/besedna-družina",
|
"B/MEN/besedna_družina": "B/MEN/besedna-družina",
|
||||||
|
@ -231,10 +261,12 @@ class TeiDocument:
|
||||||
tag_usage.set('gi', tag)
|
tag_usage.set('gi', tag)
|
||||||
tag_usage.set('occurs', str(count))
|
tag_usage.set('occurs', str(count))
|
||||||
|
|
||||||
for (paras, div_id), (_, corresp_div_id) in zip(self.divs, self.corresp_divs):
|
for (paras, div_id, metadata), (_, corresp_div_id, _) in zip(self.divs, self.corresp_divs):
|
||||||
div = etree.Element('div')
|
div = etree.Element('div')
|
||||||
set_xml_attr(div, 'id', div_id)
|
set_xml_attr(div, 'id', div_id)
|
||||||
div.set('corresp', f'#{corresp_div_id}')
|
div.set('corresp', f'#{corresp_div_id}')
|
||||||
|
bibl = create_bibl(metadata)
|
||||||
|
div.append(bibl)
|
||||||
for para in paras:
|
for para in paras:
|
||||||
div.append(para.as_xml())
|
div.append(para.as_xml())
|
||||||
body.append(div)
|
body.append(div)
|
||||||
|
@ -245,6 +277,24 @@ class TeiDocument:
|
||||||
self.paragraphs.append(paragraph)
|
self.paragraphs.append(paragraph)
|
||||||
|
|
||||||
|
|
||||||
|
def create_bibl(metadata):
    """Build a ``<bibl>`` element describing one text.

    The element's ``n`` attribute is the value stored under ``'Text ID'``.
    Every other entry of *metadata* with a truthy value becomes a ``<note>``
    child whose ``ana`` attribute is ``#<identifier>`` and whose text is the
    formatted value. The identifier comes from ``kost_translations``; field
    names not listed there are turned into one by capitalizing each
    whitespace-separated word and joining the words without spaces.

    :param metadata: mapping of field name to value; must contain 'Text ID'.
    :return: the populated ``bibl`` element.
    :raises KeyError: if *metadata* has no 'Text ID' entry.
    """
    bibl = etree.Element('bibl')
    bibl.set('n', metadata['Text ID'])

    for field_name, field_value in metadata.items():
        # 'Text ID' is already carried by @n; empty values produce no note.
        if field_name != 'Text ID' and field_value:
            if field_name in kost_translations:
                key = kost_translations[field_name]
            else:
                key = ''.join(word.capitalize() for word in field_name.split())

            note = etree.SubElement(bibl, 'note')
            note.set('ana', f'#{key}')
            note.text = f'{field_value}'

    return bibl
|
||||||
|
|
||||||
def convert_bibl(bibl):
|
def convert_bibl(bibl):
|
||||||
etree_bibl = etree.Element('bibl')
|
etree_bibl = etree.Element('bibl')
|
||||||
# etree_bibl.set('corresp', bibl.get('corresp'))
|
# etree_bibl.set('corresp', bibl.get('corresp'))
|
||||||
|
|
|
@ -316,8 +316,8 @@ def tokenize(args):
|
||||||
for tokenized_para in tokenized_divs[div_id]:
|
for tokenized_para in tokenized_divs[div_id]:
|
||||||
paragraph_name, source_res, target_res, edges = tokenized_para
|
paragraph_name, source_res, target_res, edges = tokenized_para
|
||||||
split_para_name = paragraph_name[:-5].split('-')
|
split_para_name = paragraph_name[:-5].split('-')
|
||||||
div_name = '-'.join(split_para_name[:-1])
|
div_name = '-'.join(split_para_name[:-1]) if len(split_para_name) == 4 else '-'.join(split_para_name)
|
||||||
par_name = split_para_name[-1]
|
par_name = split_para_name[-1] if len(split_para_name) == 4 else '1'
|
||||||
assert not par_name.isnumeric() or par_name not in alphabet, Exception('Incorrect paragraph name!')
|
assert not par_name.isnumeric() or par_name not in alphabet, Exception('Incorrect paragraph name!')
|
||||||
if par_name in alphabet:
|
if par_name in alphabet:
|
||||||
par_name = str(alphabet.index(par_name) + 10)
|
par_name = str(alphabet.index(par_name) + 10)
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
import copy
|
import copy
|
||||||
|
import csv
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
@ -8,10 +9,15 @@ from src.create_tei import construct_sentence_from_list, \
|
||||||
construct_paragraph_from_list, TeiDocument, build_tei_etrees, build_links, build_complete_tei, convert_bibl
|
construct_paragraph_from_list, TeiDocument, build_tei_etrees, build_links, build_complete_tei, convert_bibl
|
||||||
|
|
||||||
|
|
||||||
def form_paragraphs(annotated_source_divs):
|
def form_paragraphs(annotated_source_divs, metadata):
|
||||||
etree_source_divs = []
|
etree_source_divs = []
|
||||||
for div_i, div_tuple in enumerate(annotated_source_divs):
|
for div_i, div_tuple in enumerate(annotated_source_divs):
|
||||||
div_name, div = div_tuple
|
div_name, div = div_tuple
|
||||||
|
if div_name[:-1] not in metadata:
|
||||||
|
print(div_name[:-1] + "!!!!!!!!!!!!!!!!!!")
|
||||||
|
print(div_name[:-1])
|
||||||
|
continue
|
||||||
|
div_metadata = metadata[div_name[:-1]]
|
||||||
# file_name = file_name.replace('/', '_')
|
# file_name = file_name.replace('/', '_')
|
||||||
# print(f'{i * 100 / folders_count} % : {file_name}')
|
# print(f'{i * 100 / folders_count} % : {file_name}')
|
||||||
|
|
||||||
|
@ -29,10 +35,107 @@ def form_paragraphs(annotated_source_divs):
|
||||||
|
|
||||||
etree_source_paragraphs.append(construct_paragraph_from_list(div_name, par_name, etree_source_sentences))
|
etree_source_paragraphs.append(construct_paragraph_from_list(div_name, par_name, etree_source_sentences))
|
||||||
|
|
||||||
etree_source_divs.append((etree_source_paragraphs, div_name))
|
etree_source_divs.append((etree_source_paragraphs, div_name, div_metadata))
|
||||||
|
|
||||||
return etree_source_divs, div_name
|
return etree_source_divs, div_name
|
||||||
|
|
||||||
|
|
||||||
|
def _read_tsv_rows(path):
    """Return every row of the tab-separated file at *path* as a list of cells."""
    # The metadata holds Slovene text, so decode explicitly as UTF-8 instead of
    # relying on the platform's locale encoding. newline='' is what the csv
    # module asks for so that it can handle line endings itself.
    with open(path, 'r', encoding='utf-8', newline='') as file:
        return list(csv.reader(file, delimiter='\t', quotechar='"'))


def _row_to_dict(column_names, row):
    """Pair each cell of *row* with the column name at the same position."""
    # Indexing (rather than zip) keeps the IndexError for rows that are wider
    # than the header, so malformed input is not silently truncated.
    return {column_names[j]: content for j, content in enumerate(row)}


def read_metadata(args):
    """Load the text metadata, author metadata and column-name translations.

    All three inputs are tab-separated files with ``"`` as quote character.
    Completely blank lines in the data part of a file are skipped.

    :param args: object with the attributes ``texts_metadata``,
        ``authors_metadata`` and ``translations``, each a file path.
    :return: tuple ``(texts_metadata, authors_metadata, translations)``:

        * ``texts_metadata`` - list with one dict per data row, keyed by the
          names in the file's first row;
        * ``authors_metadata`` - dict keyed by the stripped 'Ime in priimek'
          value of each data row, valued by that row's dict;
        * ``translations`` - dict mapping the first cell of each row to the
          second cell.
    :raises IndexError: if a data row has more cells than the header, or a
        non-blank translations row has fewer than two cells.
    :raises KeyError: if an author row has no 'Ime in priimek' column.
    """
    texts_metadata = []
    text_rows = _read_tsv_rows(args.texts_metadata)
    if text_rows:
        column_names = text_rows[0]
        texts_metadata = [_row_to_dict(column_names, row)
                          for row in text_rows[1:] if row]

    authors_metadata = {}
    author_rows = _read_tsv_rows(args.authors_metadata)
    if author_rows:
        # The author sheet has a two-level header: the first row holds group
        # names (blank cells continue the group to their left), the second row
        # holds sub-names. Both are merged into '<group> - <sub-name>'.
        column_names = author_rows[0]
        if len(author_rows) > 1:
            active_column_name = ''
            for j, sub_name in enumerate(author_rows[1]):
                if column_names[j]:
                    active_column_name = column_names[j]
                if sub_name:
                    column_names[j] = f'{active_column_name} - {sub_name}'

        # The third row is not data and is ignored; records start at the fourth.
        for row in author_rows[3:]:
            if not row:
                continue
            row_dict = _row_to_dict(column_names, row)
            row_dict['Ime in priimek'] = row_dict['Ime in priimek'].strip()
            authors_metadata[row_dict['Ime in priimek']] = row_dict

    translations = {row[0]: row[1]
                    for row in _read_tsv_rows(args.translations) if row}

    return texts_metadata, authors_metadata, translations
|
||||||
|
|
||||||
|
|
||||||
|
def process_metadata(args):
    """Combine text and author metadata into one record per text.

    For every text whose (stripped) 'Tvorec' value is a key of the author
    table, a dict is built with one entry per row of the translations table,
    stored under the translated name. Values are taken from the text row
    first, then from the author row, then from a few specially handled
    author columns. Texts with an unknown creator are skipped; the creator
    name is printed when it is non-empty.

    :param args: passed unchanged to ``read_metadata``.
    :return: dict mapping each record's 'Text ID' value to the record.
    :raises Exception: if a translation key cannot be resolved at all.
    :raises KeyError: if a required column or the 'Text ID' entry is missing.
    """
    # Translation keys answered by exactly one, differently named author column.
    author_column_for = {
        'Stopnja študija': 'Trenutno šolanje - Stopnja študija',
        'Leto študija': 'Trenutno šolanje - Leto študija',
        'Kje učenje': 'Življenje v Sloveniji pred tem programom - Kje?',
        'Koliko časa učenje?': 'Življenje v Sloveniji pred tem programom - Koliko časa?',
        'Učbeniki': 'Učenje slovenščine pred tem programom - Učbeniki',
        'Kje?': 'Učenje slovenščine pred L+ - Kje?',
        'Koliko časa?': 'Učenje slovenščine pred L+ - Koliko čas?',
    }

    texts_metadata, authors_metadata, translations = read_metadata(args)

    metadata = {}
    for document_metadata in texts_metadata:
        creator = document_metadata['Tvorec'].strip()
        document_metadata['Tvorec'] = creator
        if creator not in authors_metadata:
            # Report unknown creators, but stay silent for texts without one.
            if creator:
                print(creator)
            continue
        author_metadata = authors_metadata[creator]

        metadata_el = {}
        for attribute_name_sl, attribute_name_en in translations.items():
            target_name = attribute_name_en
            if attribute_name_sl in document_metadata:
                if attribute_name_sl == 'Ocena':
                    # The grade is reported together with the maximum grade.
                    value = f'{document_metadata[attribute_name_sl]} od {document_metadata["Najvišja možna ocena"]}'
                elif attribute_name_sl == 'Tvorec':
                    # The creator's code is stored instead of the name.
                    value = author_metadata['Koda tvorca']
                else:
                    value = document_metadata[attribute_name_sl]
            elif attribute_name_sl in author_metadata:
                value = author_metadata[attribute_name_sl]
            elif attribute_name_sl in author_column_for:
                value = author_metadata[author_column_for[attribute_name_sl]]
            elif attribute_name_sl == 'Ime šole, Fakulteta':
                # Always stored under this fixed name, whatever the translation says.
                target_name = 'Current school'
                value = f'{author_metadata["Trenutno šolanje - Ime šole"]}, {author_metadata["Trenutno šolanje - Fakulteta"]}'
            elif attribute_name_sl == 'Ostali jeziki':
                # Collect the sub-names of every 'Ostali jeziki - <x>' column
                # answered with 'ja'; 16 skips the prefix plus ' - '.
                value = ','.join(
                    column[16:]
                    for column, answer in author_metadata.items()
                    if column.startswith('Ostali jeziki') and answer == 'ja'
                )
            else:
                raise Exception(f'{attribute_name_sl} not found!')
            metadata_el[target_name] = value

        metadata[metadata_el['Text ID']] = metadata_el

    return metadata
|
||||||
|
|
||||||
def write_tei(annotated_source_divs, annotated_target_divs, document_edges, args):
|
def write_tei(annotated_source_divs, annotated_target_divs, document_edges, args):
|
||||||
print('BUILDING LINKS...')
|
print('BUILDING LINKS...')
|
||||||
etree_links = build_links(document_edges)
|
etree_links = build_links(document_edges)
|
||||||
|
@ -47,11 +150,14 @@ def write_tei(annotated_source_divs, annotated_target_divs, document_edges, args
|
||||||
etree_source_documents = []
|
etree_source_documents = []
|
||||||
etree_target_documents = []
|
etree_target_documents = []
|
||||||
|
|
||||||
|
print('PREPARING METADATA FOR BIBL...')
|
||||||
|
metadata = process_metadata(args)
|
||||||
|
|
||||||
print('WRITING SOURCE FILES...')
|
print('WRITING SOURCE FILES...')
|
||||||
etree_source_divs, source_div_name = form_paragraphs(annotated_source_divs)
|
etree_source_divs, source_div_name = form_paragraphs(annotated_source_divs, metadata)
|
||||||
|
|
||||||
print('WRITING TARGET FILES...')
|
print('WRITING TARGET FILES...')
|
||||||
etree_target_divs, target_div_name = form_paragraphs(annotated_target_divs)
|
etree_target_divs, target_div_name = form_paragraphs(annotated_target_divs, metadata)
|
||||||
|
|
||||||
print('APPENDING DOCUMENT...')
|
print('APPENDING DOCUMENT...')
|
||||||
etree_source_documents.append(
|
etree_source_documents.append(
|
||||||
|
|
|
@ -253,6 +253,12 @@ if __name__ == '__main__':
|
||||||
help='input file in (gz or xml currently). If none, then just database is loaded')
|
help='input file in (gz or xml currently). If none, then just database is loaded')
|
||||||
parser.add_argument('--raw_text', default='data/KOST/raw',
|
parser.add_argument('--raw_text', default='data/KOST/raw',
|
||||||
help='input file in (gz or xml currently). If none, then just database is loaded')
|
help='input file in (gz or xml currently). If none, then just database is loaded')
|
||||||
|
parser.add_argument('--texts_metadata', default='data/KOST/texts_metadata2.csv',
|
||||||
|
help='KOST metadata location')
|
||||||
|
parser.add_argument('--authors_metadata', default='data/KOST/authors_metadata2.csv',
|
||||||
|
help='KOST authors location')
|
||||||
|
parser.add_argument('--translations', default='data/KOST/translations.csv',
|
||||||
|
help='KOST Slovenian-English column names translations')
|
||||||
parser.add_argument('--tokenization_interprocessing', default='data/processing.tokenization',
|
parser.add_argument('--tokenization_interprocessing', default='data/processing.tokenization',
|
||||||
help='input file in (gz or xml currently). If none, then just database is loaded')
|
help='input file in (gz or xml currently). If none, then just database is loaded')
|
||||||
parser.add_argument('--overwrite_tokenization', action='store_true', help='input file in (gz or xml currently). If none, then just database is loaded')
|
parser.add_argument('--overwrite_tokenization', action='store_true', help='input file in (gz or xml currently). If none, then just database is loaded')
|
||||||
|
|
Loading…
Reference in New Issue
Block a user