From 9216ee9a3bb68ed7bee6ca3db8d19aa7cee65af4 Mon Sep 17 00:00:00 2001
From: Luka
Date: Sat, 25 Feb 2023 10:01:22 +0100
Subject: [PATCH] Multiple fixes for version KOST_0.2

---
 src/read/read_and_merge.py | 102 ++++++++++++++++++++++++++++---------
 src/write/write.py         |  41 +++++++++++----
 svala2tei.py               |   2 +
 3 files changed, 110 insertions(+), 35 deletions(-)

diff --git a/src/read/read_and_merge.py b/src/read/read_and_merge.py
index 00f5c3b..7d8662a 100644
--- a/src/read/read_and_merge.py
+++ b/src/read/read_and_merge.py
@@ -244,6 +244,48 @@ def create_target(svala_data_object, source_tokenized):
     return target_tokenized
 
 
+def fake_svala_data(source_tokenized):
+    source_res, target_res, generated_edges = [], [], {}
+
+    edge_id = 0
+    for sent in source_tokenized:
+        source_sent = []
+        target_sent = []
+        for tok in sent:
+            tok_id = tok['id'][0]
+            tok_tag = 'w' if 'xpos' not in tok or tok['xpos'] != 'Z' else 'pc'
+            source_svala_id = 's' + str(edge_id)
+            target_svala_id = 't' + str(edge_id)
+            source_sent.append({
+                'token': tok['text'],
+                'tag': tok_tag,
+                'id': tok_id,
+                'space_after': 'misc' in tok and tok['misc'] == 'SpaceAfter=No',
+                'svala_id': source_svala_id
+            })
+            target_sent.append({
+                'token': tok['text'],
+                'tag': tok_tag,
+                'id': tok_id,
+                'space_after': 'misc' in tok and tok['misc'] == 'SpaceAfter=No',
+                'svala_id': target_svala_id
+            })
+            generated_edges[f'e-{source_svala_id}-{target_svala_id}'] = {
+                'id': f'e-{source_svala_id}-{target_svala_id}',
+                'ids': [source_svala_id, target_svala_id],
+                'labels': [],
+                'manual': False,
+                'source_ids': [source_svala_id],
+                'target_ids': [target_svala_id]
+            }
+            edge_id += 1
+        source_res.append(source_sent)
+        target_res.append(target_sent)
+
+
+    return source_res, target_res, generated_edges
+
+
 def tokenize(args):
     if os.path.exists(args.tokenization_interprocessing) and not args.overwrite_tokenization:
         print('READING TOKENIZATION...')
@@ -266,42 +308,54 @@ def tokenize(args):
 
     text_filename = ''
 
-    for folder, _, filenames in os.walk(args.svala_folder):
-        filenames = sorted(filenames)
-        for filename_i, filename in enumerate(filenames):
+    all_js_filenames = [sorted(filenames) for folder, _, filenames in os.walk(args.svala_folder)][0]
+
+    for text_folder, _, text_filenames in os.walk(args.raw_text):
+        text_filenames = sorted(text_filenames)
+        for text_filename_i, text_filename in enumerate(text_filenames):
             # if filename_i*100/len(filenames) > 35:
             #     print('here')
             #     continue
-            svala_path = os.path.join(folder, filename)
-            new_text_filename = '-'.join(filename[:-5].split('-')[:3]) + '.txt'
-            if text_filename != new_text_filename:
-                text_filename = new_text_filename
-                text_file = read_raw_text(os.path.join(args.raw_text, text_filename))
-                raw_text, source_tokenized, metadocument = nlp_tokenize.processors['tokenize']._tokenizer.tokenize(
-                    text_file) if text_file else ([], [], [])
-                source_sent_i = 0
-            jf = open(svala_path, encoding='utf-8')
-            print(svala_path)
-            svala_data = json.load(jf)
-            jf.close()
+            text_file = read_raw_text(os.path.join(args.raw_text, text_filename))
+            raw_text, source_tokenized, metadocument = nlp_tokenize.processors['tokenize']._tokenizer.tokenize(
+                text_file) if text_file else ([], [], [])
+            source_sent_i = 0
+
+            filenames = [filename for filename in all_js_filenames if filename.startswith(text_filename[:-4])]
+            # new_text_filename = '-'.join(filename[:-5].split('-')[:3]) + '.txt'
+            if filenames:
+                for filename in filenames:
+                    svala_path = os.path.join(args.svala_folder, filename)
+                    jf = open(svala_path, encoding='utf-8')
+                    print(svala_path)
+                    svala_data = json.load(jf)
+                    jf.close()
 
-            svala_data_object = SvalaData(svala_data)
+                    svala_data_object = SvalaData(svala_data)
 
-            apply_svala_handfixes(svala_data_object)
+                    apply_svala_handfixes(svala_data_object)
 
-            source_sent_i, source_res = map_svala_tokenized(svala_data_object.svala_data['source'], source_tokenized, source_sent_i)
-            # target_res = create_target(svala_data, source_tokenized)
+                    source_sent_i, source_res = map_svala_tokenized(svala_data_object.svala_data['source'], source_tokenized, source_sent_i)
+                    # target_res = create_target(svala_data, source_tokenized)
 
-            target_res = create_target(svala_data_object, source_res)
+                    target_res = create_target(svala_data_object, source_res)
 
-            if text_filename not in tokenized_divs:
-                tokenized_divs[text_filename] = []
+                    if text_filename not in tokenized_divs:
+                        tokenized_divs[text_filename] = []
 
-            tokenized_divs[text_filename].append((filename, source_res, target_res, svala_data_object.svala_data['edges']))
+                    tokenized_divs[text_filename].append((filename, source_res, target_res, svala_data_object.svala_data['edges']))
+
+
+            else:
+                filename = text_filename[:-4] + '.json'
+                source_res, target_res, generated_edges = fake_svala_data(source_tokenized)
+                if text_filename not in tokenized_divs:
+                    tokenized_divs[text_filename] = []
+                tokenized_divs[text_filename].append((filename, source_res, target_res, generated_edges))
 
-            logging.info(f'Tokenizing at {filename_i*100/len(filenames)} %')
+            logging.info(f'Tokenizing at {text_filename_i * 100 / len(text_filenames)} %')
 
     tokenized_source_divs = []
     tokenized_target_divs = []
diff --git a/src/write/write.py b/src/write/write.py
index 413ebce..e4a6070 100644
--- a/src/write/write.py
+++ b/src/write/write.py
@@ -14,7 +14,7 @@ def form_paragraphs(annotated_source_divs, metadata):
     for div_i, div_tuple in enumerate(annotated_source_divs):
         div_name, div = div_tuple
         if div_name[:-1] not in metadata:
-            print(div_name[:-1] + "!!!!!!!!!!!!!!!!!!")
+            # print(div_name[:-1] + "!!!!!!!!!!!!!!!!!!")
             print(div_name[:-1])
             continue
         div_metadata = metadata[div_name[:-1]]
@@ -55,6 +55,23 @@ def read_metadata(args):
                     row_dict[column_names[j]] = content
                 texts_metadata.append(row_dict)
 
+    # handle teachers
+    teachers_metadata = {}
+    with open(args.teachers_metadata, 'r') as file:
+        csvreader = csv.reader(file, delimiter='\t', quotechar='"')
+        column_names = []
+        for i, row in enumerate(csvreader):
+            if i == 0:
+                column_names = row
+                continue
+            else:
+                row_dict = {}
+                for j, content in enumerate(row):
+                    row_dict[column_names[j]] = content
+                row_dict['Ime in priimek'] = row_dict['Ime in priimek'].strip()
+                teachers_metadata[row_dict['Ime in priimek']] = row_dict
+
+    # handle authors
     authors_metadata = {}
     with open(args.authors_metadata, 'r') as file:
         csvreader = csv.reader(file, delimiter='\t', quotechar='"')
@@ -86,11 +103,11 @@ def read_metadata(args):
         for row in csvreader:
            translations[row[0]] = row[1]
 
-    return texts_metadata, authors_metadata, translations
+    return texts_metadata, authors_metadata, teachers_metadata, translations
 
 
 def process_metadata(args):
-    texts_metadata, authors_metadata, translations = read_metadata(args)
+    texts_metadata, authors_metadata, teachers_metadata, translations = read_metadata(args)
 
     metadata = {}
     for document_metadata in texts_metadata:
@@ -107,6 +124,8 @@ def process_metadata(args):
                     metadata_el[attribute_name_en] = f'{document_metadata[attribute_name_sl]} od {document_metadata["Najvišja možna ocena"]}'
                 elif attribute_name_sl == 'Tvorec':
                     metadata_el[attribute_name_en] = author_metadata['Koda tvorca']
+                elif attribute_name_sl == 'Učitelj':
+                    metadata_el[attribute_name_en] = teachers_metadata[document_metadata['Učitelj']]['Koda'] if document_metadata['Učitelj'] in teachers_metadata else None
                 else:
                     metadata_el[attribute_name_en] = document_metadata[attribute_name_sl]
             elif attribute_name_sl in author_metadata:
@@ -171,16 +190,16 @@ def write_tei(annotated_source_divs, annotated_target_divs, document_edges, args
     etree_source = build_tei_etrees(etree_source_documents)
     etree_target = build_tei_etrees(etree_target_documents)
 
-    print('Writting all but complete')
-    with open(os.path.join(args.results_folder, f"source.xml"), 'w') as sf:
-        sf.write(etree.tostring(etree_source[0], pretty_print=True, encoding='utf-8').decode())
-
-    with open(os.path.join(args.results_folder, f"target.xml"), 'w') as tf:
-        tf.write(etree.tostring(etree_target[0], pretty_print=True, encoding='utf-8').decode())
+    # print('Writting all but complete')
+    # with open(os.path.join(args.results_folder, f"source.xml"), 'w') as sf:
+    #     sf.write(etree.tostring(etree_source[0], pretty_print=True, encoding='utf-8').decode())
+    #
+    # with open(os.path.join(args.results_folder, f"target.xml"), 'w') as tf:
+    #     tf.write(etree.tostring(etree_target[0], pretty_print=True, encoding='utf-8').decode())
 
     print('COMPLETE TREE CREATION...')
-    complete_etree = build_complete_tei(copy.deepcopy(etree_source), copy.deepcopy(etree_target), etree_links)
-    # complete_etree = build_complete_tei(etree_source, etree_target, etree_links)
+    # complete_etree = build_complete_tei(copy.deepcopy(etree_source), copy.deepcopy(etree_target), etree_links)
+    complete_etree = build_complete_tei(etree_source, etree_target, etree_links)
 
     print('WRITING COMPLETE TREE')
     with open(os.path.join(args.results_folder, f"complete.xml"), 'w') as tf:
diff --git a/svala2tei.py b/svala2tei.py
index e568a65..160b071 100644
--- a/svala2tei.py
+++ b/svala2tei.py
@@ -257,6 +257,8 @@ if __name__ == '__main__':
                         help='KOST metadata location')
     parser.add_argument('--authors_metadata', default='data/KOST/authors_metadata2.csv',
                         help='KOST authors location')
+    parser.add_argument('--teachers_metadata', default='data/KOST/teachers_metadata.csv',
+                        help='KOST teachers location')
     parser.add_argument('--translations', default='data/KOST/translations.csv',
                         help='KOST Slovenian-English column names translations')
     parser.add_argument('--tokenization_interprocessing', default='data/processing.tokenization',
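Illustration (not part of the patch): for texts with no SVALA annotation file, fake_svala_data() pairs every source token with the same token on the target side through one automatic, label-free edge. The sketch below is a standalone, minimal mirror of just the edge-building part of that function; the input sentence is an invented example, not real corpus data.

    # Standalone sketch of the edge layout produced by fake_svala_data().
    # One inner list per sentence; the token contents do not matter here,
    # only the running counter used to build the svala ids.
    source_tokenized = [['Primer', 'stavka']]

    generated_edges = {}
    edge_id = 0
    for sent in source_tokenized:
        for _tok in sent:
            source_svala_id = 's' + str(edge_id)   # token id on the source side
            target_svala_id = 't' + str(edge_id)   # same token on the target side
            generated_edges[f'e-{source_svala_id}-{target_svala_id}'] = {
                'id': f'e-{source_svala_id}-{target_svala_id}',
                'ids': [source_svala_id, target_svala_id],
                'labels': [],        # no error label: source and target are identical
                'manual': False,     # marks the edge as generated, not hand-annotated
                'source_ids': [source_svala_id],
                'target_ids': [target_svala_id]
            }
            edge_id += 1

    print(generated_edges)
    # {'e-s0-t0': {'id': 'e-s0-t0', 'ids': ['s0', 't0'], 'labels': [], 'manual': False,
    #              'source_ids': ['s0'], 'target_ids': ['t0']},
    #  'e-s1-t1': {'id': 'e-s1-t1', 'ids': ['s1', 't1'], 'labels': [], 'manual': False,
    #              'source_ids': ['s1'], 'target_ids': ['t1']}}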