diff --git a/src/create_tei.py b/src/create_tei.py index b869dcc..166260b 100644 --- a/src/create_tei.py +++ b/src/create_tei.py @@ -330,17 +330,29 @@ def build_complete_tei(etree_source, etree_target, etree_links): text = etree.Element('text') group = etree.Element('group') print('P3') - group.append(list(etree_source[0])[1]) + group.insert(len(group), + list(etree_source[0])[1]) + # group.append(list(etree_source[0])[1]) print('P4') - group.append(list(etree_target[0])[1]) + group.insert(len(group), + list(etree_target[0])[1]) + # group.append(list(etree_target[0])[1]) print('P5') - text.append(group) + text.insert(len(text), + group) + # text.append(group) print('P6') - root.append(tei_header) + root.insert(len(root), + tei_header) + # root.append(tei_header) print('P7') - root.append(text) + # root.append(text) + root.insert(len(root), + text) print('P8') - root.append(etree_links) + # root.append(etree_links) + root.insert(len(root), + etree_links) print('P9') return root @@ -349,34 +361,22 @@ def build_links(all_edges): body = etree.Element('standOff') for document_edges in all_edges: - - - - # if len(document_edges) > 1: - # print('here') - # mine paragraphs for paragraph_edges in document_edges: p = etree.Element('linkGrp') - paragraph_id = '' corresp_source_id = '' corresp_target_id = '' - corresp = [] - # for sentence_edges in paragraph_edges: - # + for token_edges in paragraph_edges: if not corresp_source_id and len(token_edges['source_ids']) > 0: random_source_id = token_edges['source_ids'][0] corresp_source_id = '#' - # corresp_source_id += '.'.join(random_source_id.split('.')[:3]) corresp_source_id += '.'.join(random_source_id.split('.')[:2]) - corresp.append(corresp_source_id) if not corresp_target_id and len(token_edges['target_ids']) > 0: random_target_id = token_edges['target_ids'][0] corresp_target_id = '#' corresp_target_id += '.'.join(random_target_id.split('.')[:2]) - # corresp_target_id += random_target_id.split('.')[0] - corresp.append(corresp_target_id) + link = etree.Element('link') # translate labels labels_list = [] @@ -390,6 +390,11 @@ def build_links(all_edges): link.set('target', ' '.join(['#' + source for source in token_edges['source_ids']] + ['#' + source for source in token_edges['target_ids']])) p.append(link) + corresp = [] + if corresp_source_id: + corresp.append(corresp_source_id) + if corresp_target_id: + corresp.append(corresp_target_id) p.set('type', 'CORR') targFunc = [] if corresp_source_id: diff --git a/src/read/read_and_merge.py b/src/read/read_and_merge.py index 7d8662a..213d00a 100644 --- a/src/read/read_and_merge.py +++ b/src/read/read_and_merge.py @@ -238,7 +238,7 @@ def create_target(svala_data_object, source_tokenized): target_tokenized.append(target_sent_tokenized) target_sent_tokenized = [] curr_sententence += 1 - tok_i = 1 + tok_i = 0 tok_i += 1 target_tokenized.append(target_sent_tokenized) return target_tokenized @@ -256,18 +256,19 @@ def fake_svala_data(source_tokenized): tok_tag = 'w' if 'xpos' not in tok or tok['xpos'] != 'Z' else 'pc' source_svala_id = 's' + str(edge_id) target_svala_id = 't' + str(edge_id) + space_after = not ('misc' in tok and tok['misc'] == 'SpaceAfter=No') source_sent.append({ 'token': tok['text'], 'tag': tok_tag, 'id': tok_id, - 'space_after': 'misc' in tok and tok['misc'] == 'SpaceAfter=No', + 'space_after': space_after, 'svala_id': source_svala_id }) target_sent.append({ 'token': tok['text'], 'tag': tok_tag, 'id': tok_id, - 'space_after': 'misc' in tok and tok['misc'] == 'SpaceAfter=No', + 'space_after': space_after, 'svala_id': target_svala_id }) generated_edges[f'e-{source_svala_id}-{target_svala_id}'] = { diff --git a/src/write/write.py b/src/write/write.py index e4a6070..9a472df 100644 --- a/src/write/write.py +++ b/src/write/write.py @@ -43,7 +43,7 @@ def form_paragraphs(annotated_source_divs, metadata): def read_metadata(args): texts_metadata = [] with open(args.texts_metadata, 'r') as file: - csvreader = csv.reader(file, delimiter='\t', quotechar='"') + csvreader = csv.reader(file, delimiter='|', quotechar='"') column_names = [] for i, row in enumerate(csvreader): if i == 0: @@ -52,7 +52,7 @@ def read_metadata(args): else: row_dict = {} for j, content in enumerate(row): - row_dict[column_names[j]] = content + row_dict[column_names[j]] = content.strip() texts_metadata.append(row_dict) # handle teachers @@ -74,7 +74,7 @@ def read_metadata(args): # handle authors authors_metadata = {} with open(args.authors_metadata, 'r') as file: - csvreader = csv.reader(file, delimiter='\t', quotechar='"') + csvreader = csv.reader(file, delimiter='|', quotechar='"') column_names = [] for i, row in enumerate(csvreader): if i == 0: @@ -93,7 +93,7 @@ def read_metadata(args): else: row_dict = {} for j, content in enumerate(row): - row_dict[column_names[j]] = content + row_dict[column_names[j]] = content.strip() row_dict['Ime in priimek'] = row_dict['Ime in priimek'].strip() authors_metadata[row_dict['Ime in priimek']] = row_dict @@ -121,7 +121,8 @@ def process_metadata(args): for attribute_name_sl, attribute_name_en in translations.items(): if attribute_name_sl in document_metadata: if attribute_name_sl == 'Ocena': - metadata_el[attribute_name_en] = f'{document_metadata[attribute_name_sl]} od {document_metadata["Najvišja možna ocena"]}' + grade = f'{document_metadata[attribute_name_sl]} od {document_metadata["Najvišja možna ocena"]}' if document_metadata[attribute_name_sl] and document_metadata["Najvišja možna ocena"] else '' + metadata_el[attribute_name_en] = grade elif attribute_name_sl == 'Tvorec': metadata_el[attribute_name_en] = author_metadata['Koda tvorca'] elif attribute_name_sl == 'Učitelj': @@ -131,7 +132,12 @@ def process_metadata(args): elif attribute_name_sl in author_metadata: metadata_el[attribute_name_en] = author_metadata[attribute_name_sl] elif attribute_name_sl == 'Ime šole, Fakulteta': - metadata_el['Current school'] = f'{author_metadata["Trenutno šolanje - Ime šole"]}, {author_metadata["Trenutno šolanje - Fakulteta"]}' + curr_school = [] + if author_metadata["Trenutno šolanje - Ime šole"]: + curr_school.append(author_metadata["Trenutno šolanje - Ime šole"]) + if author_metadata["Trenutno šolanje - Fakulteta"]: + curr_school.append(author_metadata["Trenutno šolanje - Fakulteta"]) + metadata_el['Current school'] = ', '.join(curr_school) elif attribute_name_sl == 'Stopnja študija': metadata_el[attribute_name_en] = author_metadata['Trenutno šolanje - Stopnja študija'] elif attribute_name_sl == 'Leto študija': diff --git a/svala2tei.py b/svala2tei.py index 160b071..52aad64 100644 --- a/svala2tei.py +++ b/svala2tei.py @@ -253,9 +253,9 @@ if __name__ == '__main__': help='input file in (gz or xml currently). If none, then just database is loaded') parser.add_argument('--raw_text', default='data/KOST/raw', help='input file in (gz or xml currently). If none, then just database is loaded') - parser.add_argument('--texts_metadata', default='data/KOST/texts_metadata2.csv', + parser.add_argument('--texts_metadata', default='data/KOST/texts_metadata5.csv', help='KOST metadata location') - parser.add_argument('--authors_metadata', default='data/KOST/authors_metadata2.csv', + parser.add_argument('--authors_metadata', default='data/KOST/authors_metadata5.csv', help='KOST authors location') parser.add_argument('--teachers_metadata', default='data/KOST/teachers_metadata.csv', help='KOST teachers location')