diff --git a/src/create_tei.py b/src/create_tei.py
index 60abb24..fc4ceda 100644
--- a/src/create_tei.py
+++ b/src/create_tei.py
@@ -198,20 +198,29 @@ def build_tei_etrees(documents):
 
 
 def build_complete_tei(etree_source, etree_target, etree_links):
+    print('P1')
     root = etree.Element('TEI')
     root.set('xmlns', 'http://www.tei-c.org/ns/1.0')
+    print('P2')
     tei_header = etree.Element('teiHeader')
     text = etree.Element('text')
     group = etree.Element('group')
+    print('P3')
     group.append(list(etree_source[0])[1])
+    print('P4')
     group.append(list(etree_target[0])[1])
+    print('P5')
     text.append(group)
+    print('P6')
     root.append(tei_header)
+    print('P7')
     root.append(text)
+    print('P8')
     # standoff = etree.Element('standOff')
     # standoff.append(etree_links)
     # root.append(standoff)
     root.append(etree_links)
+    print('P9')
     return root
 
 
diff --git a/svala2tei.py b/svala2tei.py
index 171ac8e..33f800f 100644
--- a/svala2tei.py
+++ b/svala2tei.py
@@ -38,12 +38,41 @@ def add_edges(source_id, target_id, svala_data, edges, source_token_id, target_t
     labels = svala_data['edges'][edge_id]['labels']
     edges.append({'source_ids': [source_token_id], 'target_ids': [target_token_id], 'labels': labels})
 
+
+def create_edges_list(target_ids, links_ids_mapper):
+    target_edges = []
+    target_edges_set = []
+    for target_sentence in target_ids:
+        target_sentence_edges = []
+        for target_id in target_sentence:
+            target_sentence_edges.extend(links_ids_mapper[target_id])
+        target_edges.append(target_sentence_edges)
+        target_edges_set.append(set(target_sentence_edges))
+
+    return target_edges, target_edges_set
+
+
+SKIP_IDS = ['solar2284s.1.1.1']
+
 def create_edges(svala_data, source_par, target_par):
+    # if source_par and source_par[0]:
+    #     if source_par[0][0]['id'] in SKIP_IDS:
+    #         return []
+    #     # print(source_par[0][0]['id'])
+    #     if source_par[0][0]['id'] == 'solar2440s.5.1.1':
+    #         print('pause!')
+    # if target_par and target_par[0]:
+    #     print(target_par[0][0]['id'])
+    #     if target_par[0][0]['id'] == 'solar364t.5.1.1':
+    #         print('pause!')
     source_mapper = {el['svala_id']: el['id'] for source in source_par for el in source}
     target_mapper = {el['svala_id']: el['id'] for target in target_par for el in target}
 
-    source_ids = [el['svala_id'] for source in source_par for el in source]
-    target_ids = [el['svala_id'] for target in target_par for el in target]
+    # source_ids = [el['svala_id'] for source in source_par for el in source]
+    # target_ids = [el['svala_id'] for target in target_par for el in target]
+
+    source_ids = [[el['svala_id'] for el in source] for source in source_par]
+    target_ids = [[el['svala_id'] for el in target] for target in target_par]
 
     source_sentence_ids = [set([el['svala_id'] for el in source]) for source in source_par]
     target_sentence_ids = [set([el['svala_id'] for el in target]) for target in target_par]
@@ -51,6 +80,11 @@ def create_edges(svala_data, source_par, target_par):
     # create links to ids mapper
     links_ids_mapper = {}
     edges_of_one_type = set()
+
+    # delete empty edge
+    if 'e-' in svala_data['edges']:
+        del (svala_data['edges']['e-'])
+
     for k, v in svala_data['edges'].items():
         has_source = False
         has_target = False
@@ -65,43 +99,207 @@ def create_edges(svala_data, source_par, target_par):
             if el not in links_ids_mapper:
                 links_ids_mapper[el] = []
             links_ids_mapper[el].append(k)
-        if not has_source or not has_target:
+        if not has_source or not has_target or (len(svala_data['source']) == 1 and svala_data['source'][0]['text'] == ' ') \
+                or (len(svala_data['target']) == 1 and svala_data['target'][0]['text'] == ' '):
             edges_of_one_type.add(k)
+    # delete edge with space
+    save_deleted_edges = {}
+    if len(svala_data['source']) == 1 and svala_data['source'][0]['text'] == ' ':
+        for edg in links_ids_mapper[svala_data['source'][0]['id']]:
+            save_deleted_edges[edg] = svala_data['edges'][edg]
+            del (svala_data['edges'][edg])
+        del (links_ids_mapper[svala_data['source'][0]['id']])
+    if len(svala_data['target']) == 1 and svala_data['target'][0]['text'] == ' ':
+        for edg in links_ids_mapper[svala_data['target'][0]['id']]:
+            save_deleted_edges[edg] = svala_data['edges'][edg]
+            del (svala_data['edges'][edg])
+        del (links_ids_mapper[svala_data['target'][0]['id']])
 
     # create edge order
     edges_order = []
     edges_processed = set()
+    # active_source_sentence_i = 0
+    active_target_sentence_i = 0
     s_i = 0
     t_i = 0
-    check_s_i = True
-    while s_i < len(source_ids) or t_i < len(target_ids):
-        # take care of getting ids over desired s_i/t_i
-        if check_s_i and s_i >= len(source_ids):
-            check_s_i = False
-
-        if not check_s_i and t_i >= len(target_ids):
-            check_s_i = True
-
-        if check_s_i:
-            id_of_interest = source_ids[s_i]
-            s_i += 1
-            check_s_i = not check_s_i
-        else:
-            id_of_interest = target_ids[t_i]
-            t_i += 1
-            check_s_i = not check_s_i
-
-        any_addition = False
-        if id_of_interest not in links_ids_mapper:
-            print('NOOOOO')
-        for edge_id in links_ids_mapper[id_of_interest]:
-            if edge_id not in edges_processed and edge_id not in edges_of_one_type:
-                any_addition = True
-                edges_order.append(edge_id)
-                edges_processed.add(edge_id)
-        if not any_addition:
-            check_s_i = not check_s_i
+
+    # create target edges
+    target_edges, target_edges_set = create_edges_list(target_ids, links_ids_mapper)
+    source_edges, source_edges_set = create_edges_list(source_ids, links_ids_mapper)
+
+    last_target_edge = ''
+
+    for active_source_sentence_i, active_source_sentence in enumerate(source_edges):
+        for source_edge in active_source_sentence:
+            print(source_edge)
+            # if 'e-s261-t261' == source_edge:
+            #     print('aaa')
+            # if 'e-s253-s254-s255-s256-s257-s258-s259-s260' == source_edge:
+            #     print('aaa')
+            # if 'e-s252-t252' == source_edge:
+            #     print('aaa')
+            if source_edge in edges_of_one_type:
+                if source_edge not in edges_processed:
+                    edges_order.append(source_edge)
+                    edges_processed.add(source_edge)
+
+            elif target_edges_set and source_edge in target_edges_set[active_target_sentence_i]:
+                # if 'e-s120-t121' == source_edge:
+                #     print('aaa')
+                # if 'e-s119-t119' == source_edge:
+                #     print('aaa')
+                if source_edge not in edges_processed:
+                    edges_order.append(source_edge)
+                    edges_processed.add(source_edge)
+                last_target_edge = source_edge
+            # when source is connected to two targets
+            elif source_edge not in target_edges_set[active_target_sentence_i]:
+                # add missing edges from target
+                while source_edge not in target_edges_set[active_target_sentence_i]:
+                    for target_edge in target_edges[active_target_sentence_i]:
+                        if target_edge in edges_of_one_type:
+                            if target_edge not in edges_processed:
+                                edges_order.append(target_edge)
+                                edges_processed.add(target_edge)
+                            # if 'e-s411-s412-t413' == target_edge:
+                            #     print('nnnno')
+                            last_target_edge = target_edge
+                    active_target_sentence_i += 1
+                    # if active_target_sentence_i >= len(target_edges_set):
+                    #     break
+                if source_edge in target_edges_set[active_target_sentence_i]:
+                    if source_edge not in edges_processed:
+                        edges_order.append(source_edge)
+                        edges_processed.add(source_edge)
+
+            else:
+                raise Exception('Impossible!!!')
+        if not target_edges_set or not target_edges_set[0] or active_target_sentence_i >= len(target_edges):
+            continue
+        # if last_target_edge == 'e-s197-t197':
+        #     print('HERE!')
+        if len(target_edges[active_target_sentence_i]) == 0:
+            active_target_sentence_i += 1
+            continue
+
+        # if last_target_edge == target_edges[active_target_sentence_i][-1] or (len(target_edges[active_target_sentence_i]) > 1 and last_target_edge == target_edges[active_target_sentence_i][-2] and target_edges[active_target_sentence_i][-1] in edges_of_one_type):
+        # (target_edges[active_target_sentence_i][-1] in edges_of_one_type or (target_edges[active_target_sentence_i][-1] not in edges_of_one_type and target_edges[active_target_sentence_i][-1] not in source_edges_set[active_source_sentence_i]))
+        # if last_target_edge == target_edges[active_target_sentence_i][-1] or (len(target_edges[active_target_sentence_i]) > 1 and last_target_edge == target_edges[active_target_sentence_i][-2] and last_target_edge in target_edges_set[active_target_sentence_i]):
+        if last_target_edge == target_edges[active_target_sentence_i][-1] or (len(target_edges[active_target_sentence_i]) > 1 and last_target_edge == target_edges[active_target_sentence_i][-2] and (target_edges[active_target_sentence_i][-1] in edges_of_one_type or (target_edges[active_target_sentence_i][-1] not in edges_of_one_type and target_edges[active_target_sentence_i][-1] in source_edges_set[active_source_sentence_i]))):
+            for target_edge in target_edges[active_target_sentence_i]:
+                if target_edge in edges_of_one_type:
+                    if target_edge not in edges_processed:
+                        edges_order.append(target_edge)
+                        edges_processed.add(target_edge)
+                    # if 'e-s243-t243' == target_edge:
+                    #     print('nnnno')
+                    last_target_edge = target_edge
+            active_target_sentence_i += 1
+            continue
+        target_edge_in_next_source_edge_sentence = False
+        for target_edge in target_edges[active_target_sentence_i]:
+            if active_source_sentence_i + 1 < len(source_edges_set) and target_edge in source_edges_set[active_source_sentence_i + 1]:
+                target_edge_in_next_source_edge_sentence = True
+                break
+        if target_edge_in_next_source_edge_sentence:
+            pass
+        elif not target_edge_in_next_source_edge_sentence:
+            target_edge_in_next_source_edge_sentence = False
+            while not target_edge_in_next_source_edge_sentence:
+                # if active_target_sentence_i >= len(target_edges_set):
+                #     break
+                for target_edge in target_edges[active_target_sentence_i]:
+                    if target_edge in edges_of_one_type:
+                        if target_edge not in edges_processed:
+                            edges_order.append(target_edge)
+                            edges_processed.add(target_edge)
+                        last_target_edge = target_edge
+
+                # if there is no next source sentence
+                if active_source_sentence_i + 1 >= len(source_edges_set):
+                    target_edge_in_next_source_edge_sentence = True
+
+                # if last_target_edge only in target stop regularly
+                if last_target_edge == target_edges[active_target_sentence_i][-1]:
+                    target_edge_in_next_source_edge_sentence = True
+
+                # test if target_edge in next source
+                for target_edge in target_edges[active_target_sentence_i]:
+                    if active_source_sentence_i + 1 < len(source_edges_set) and target_edge in source_edges_set[
+                        active_source_sentence_i + 1]:
+                        target_edge_in_next_source_edge_sentence = True
+                        break
+                active_target_sentence_i += 1
+
+    if not source_edges:
+        for active_target_sentence in target_edges:
+            for target_edge in active_target_sentence:
+                if target_edge not in edges_processed:
+                    edges_order.append(target_edge)
+                    edges_processed.add(target_edge)
+
+
+
+
+
+    # # create edge order
+    # edges_order = []
+    # edges_processed = set()
+    # s_i = 0
+    # t_i = 0
+    # check_s_i = True
+    # while s_i < len(source_ids) or t_i < len(target_ids):
+    #     # take care of getting ids over desired s_i/t_i
+    #     if check_s_i and s_i >= len(source_ids):
+    #         check_s_i = False
+    #
+    #     if not check_s_i and t_i >= len(target_ids):
+    #         check_s_i = True
+    #
+    #     if check_s_i:
+    #         id_of_interest = source_ids[s_i]
+    #         s_i += 1
+    #         check_s_i = not check_s_i
+    #     else:
+    #         id_of_interest = target_ids[t_i]
+    #         t_i += 1
+    #         check_s_i = not check_s_i
+    #
+    #     any_addition = False
+    #     if id_of_interest not in links_ids_mapper:
+    #         print('NOOOOO')
+    #     for edge_id in links_ids_mapper[id_of_interest]:
+    #         if edge_id not in edges_processed:
+    #             if edge_id not in edges_of_one_type:
+    #                 any_addition = True
+    #                 edges_order.append(edge_id)
+    #                 edges_processed.add(edge_id)
+    #     if not any_addition:
+    #         check_s_i = not check_s_i
+
+    # # DEBUG stuff
+    # for edge_order in edges_order:
+    #     if edges_order.count(edge_order) > 1:
+    #         # if edge_order not in a:
+    #         print(f'ERROR {edge_order}')
+    #
+    # for edge_order in edges_order:
+    #     if edge_order not in svala_data['edges']:
+    #         print(f'ERROR {edge_order}')
+    #
+    # for key in svala_data['edges'].keys():
+    #     if key not in edges_order:
+    #         print(f'ERROR {key}')
+    #
+    # a = len(svala_data['edges'])
+    # b = len(edges_order)
+    if len(svala_data['edges']) != len(edges_order):
+        for k, v in save_deleted_edges.items():
+            svala_data['edges'][k] = v
+
+
+
     assert len(svala_data['edges']) == len(edges_order)
 
     sentence_edges = []
     source_sent_id = 0
@@ -120,6 +318,8 @@ def create_edges(svala_data, source_par, target_par):
         if not source_ok_all:
             source_sent_id += 1
 
+        if edge_id == 'e-s590-t590':
+            print(edge_id)
         target_ok = [el[0] == 's' or el in target_sentence_ids[target_sent_id] for el in ids] if target_sentence_ids else []
         target_ok_all = all(target_ok)
 
@@ -131,6 +331,9 @@ def create_edges(svala_data, source_par, target_par):
         edges = []
         edges.append({'source_ids': source_ids, 'target_ids': target_ids, 'labels': labels})
 
+    if edges:
+        sentence_edges.append(edges)
+
     return sentence_edges
 
 def add_token(svala_i, source_i, target_i, el, source, target, edges, svala_data, sentence_string_id):
@@ -612,13 +815,15 @@ def process_solar2_paragraph(sentences, paragraph, svala_i, svala_data, add_erro
 
     sentence_edges = create_edges(svala_data, par_source, par_target)
 
-    return etree_source_sentences, etree_target_sentences, sentence_edges
+    return etree_source_sentences, etree_target_sentences, sentence_edges, complete_source_conllu, complete_target_conllu
 
 
 def read_raw_text(path):
     with open(path, 'r') as rf:
         return rf.read()
 
+HAND_FIXES = {'§§§pisala': ['§', '§', '§', 'pisala'], '§§§poldne': ['§', '§', '§', 'poldne'], '§§§o': ['§', '§', '§', 'o'], '§§§mimi': ['§', '§', '§', 'mimi'], '§§§nil': ['§', '§', '§', 'nil'], '§§§ela': ['§', '§', '§', 'ela'], 'sam§§§': ['sam', '§', '§', '§'], 'globač§§§': ['globač', '§', '§', '§'], 'sin.': ['sin', '.'], '§§§oveduje': ['§', '§', '§', 'oveduje'], 'na§§§': ['na', '§', '§', '§'], '§§§ka§§§': ['§', '§', '§', 'ka', '§', '§', '§'], '§§§e§§§': ['§', '§', '§', 'e', '§', '§', '§'], '§§§': ['§', '§', '§'], 'ljubezni.': ['ljubezni', '.'], '12.': ['12', '.'], '16.': ['16', '.'], 'st.': ['st', '.'], 'S.': ['S', '.'], 'pr.': ['pr', '.'], 'n.': ['n', '.'], '19:30': ['19', ':', '30'], '9.': ['9', '.'], '6:35': ['6', ':', '35'], 'itd.': ['itd', '.'], 'Sv.': ['Sv', '.'], 'npr.': ['npr', '.'], 'sv.': ['sv', '.'], '12:00': ['12', ':', '00'], "sram'vali": ['sram', "'", 'vali'], '18:00': ['18', ':', '00'], 'J.': ['J', '.'], '5:45': ['5', ':', '45'], '17.': ['17', '.'], '9.00h': ['9', '.', '00h'], 'H.': ['H', '.'], '1.': ['1', '.'], '6.': ['6', '.'], '7:10': ['7', ':', '10'], 'g.': ['g', '.'], 'Oz.': ['Oz', '.'], '20:00': ['20', ':', '00'], '17.4.2010': ['17.', '4.', '2010'], 'ga.': ['ga', '.'], 'prof.': ['prof', '.'], '6:45': ['6', ':', '45'], '19.': ['19', '.'], '3.': ['3', '.'], 'tj.': ['tj', '.'], 'Prof.': ['Prof', '.'], '8.': ['8', '.'], '9:18': ['9', ':', '18'], 'ipd.': ['ipd', '.'], '7.': ['7', '.'], 'št.': ['št', '.'], 'oz.': ['oz', '.'], 'R.': ['R', '.'], '13:30': ['13', ':', '30'], '5.': ['5', '.']}
+
 def map_svala_tokenized(svala_data_part, tokenized_paragraph):
     paragraph_res = []
     svala_data_i = 0
@@ -632,15 +837,37 @@ def map_svala_tokenized(svala_data_part, tokenized_paragraph):
                 assert tok['misc'] == 'SpaceAfter=No'
             space_after = not 'misc' in tok
             if svala_data_part[svala_data_i]['text'].strip() != tok['text']:
-                if tok['text'] == '§' and svala_data_part[svala_data_i]['text'].strip() == '§§§':
+                key = svala_data_part[svala_data_i]['text'].strip()
+                if key not in HAND_FIXES:
+                    print(f'key: {key} ; tok[text]: {tok["text"]}')
+                    if key.startswith('§§§') and key.endswith('§§§'):
+                        HAND_FIXES[key] = ['§', '§', '§', key[3:-3], '§', '§', '§']
+                    elif key.startswith('§§§'):
+                        HAND_FIXES[key] = ['§', '§', '§', key[3:]]
+                    elif key.endswith('§§§'):
+                        HAND_FIXES[key] = [key[:-3], '§', '§', '§']
+                    else:
+                        raise Exception('Word mismatch!')
+
+                if tok['text'] == HAND_FIXES[key][wierd_sign_count]:
                     wierd_sign_count += 1
-                    if wierd_sign_count < 3:
+                    if wierd_sign_count < len(HAND_FIXES[key]):
                         continue
                     else:
-                        tok['text'] = '§§§'
+                        tok['text'] = key
                         wierd_sign_count = 0
                 else:
+                    print(f'key: {key} ; tok[text]: {tok["text"]}')
                     raise 'Word mismatch!'
+                # if tok['text'] == '§' and svala_data_part[svala_data_i]['text'].strip() == '§§§':
+                #     wierd_sign_count += 1
+                #     if wierd_sign_count < 3:
+                #         continue
+                #     else:
+                #         tok['text'] = '§§§'
+                #         wierd_sign_count = 0
+                # else:
+                #     raise 'Word mismatch!'
             sentence_id += 1
             sentence_res.append({'token': tok['text'], 'tag': tag, 'id': sentence_id, 'space_after': space_after, 'svala_id': svala_data_part[svala_data_i]['id']})
             svala_data_i += 1
@@ -684,13 +911,13 @@ def process_obeliks_paragraph(sentences, paragraph, svala_i, svala_data, add_err
     sentence_edges = []
     if source_raw_text is not None:
         text = read_raw_text(source_raw_text)
-        raw_text, source_tokenized, metadocument = nlp_tokenize.processors['tokenize']._tokenizer.tokenize(text) if text else [], [], []
+        raw_text, source_tokenized, metadocument = nlp_tokenize.processors['tokenize']._tokenizer.tokenize(text) if text else ([], [], [])
         # source_tokenized = nlp_tokenize()
         source_res = map_svala_tokenized(svala_data['source'], source_tokenized)
 
     if target_raw_text is not None:
         text = read_raw_text(target_raw_text)
-        raw_text, target_tokenized, metadocument = nlp_tokenize.processors['tokenize']._tokenizer.tokenize(text) if text else [], [], []
+        raw_text, target_tokenized, metadocument = nlp_tokenize.processors['tokenize']._tokenizer.tokenize(text) if text else ([], [], [])
         target_res = map_svala_tokenized(svala_data['target'], target_tokenized)
 
     # TODO RETURN IF SOURCE AND TARGET ARE NOT NONE
@@ -798,7 +1025,7 @@ def process_obeliks_paragraph(sentences, paragraph, svala_i, svala_data, add_err
 
     sentence_edges = create_edges(svala_data, par_source, par_target)
 
-    return etree_source_sentences, etree_target_sentences, sentence_edges
+    return etree_source_sentences, etree_target_sentences, sentence_edges, complete_source_conllu, complete_target_conllu
 
 def process_file(et, args, nlp, nlp_tokenize):
     if os.path.exists(args.results_folder):
@@ -823,9 +1050,13 @@ def process_file(et, args, nlp, nlp_tokenize):
        print(f'{i*100/folders_count} % : {file_name}')
        i += 1
        # if file_name == 'S20-PI-slo-2-SG-D-2016_2017-30479-12.txt':
-        #     filename_encountered = True
-        # if not filename_encountered:
-        #     continue
+        # if file_name == 'KUS-G-slo-1-LJ-E-2009_2010-10602':
+        if i*100/folders_count > 40:
+            filename_encountered = True
+        # if i*100/folders_count > 50:
+        #     filename_encountered = False
+        if not filename_encountered:
+            continue
 
        svala_path = os.path.join(args.svala_folder, file_name)
        corrected_svala_path = os.path.join(args.corrected_svala_folder, file_name)
@@ -871,11 +1102,11 @@ def process_file(et, args, nlp, nlp_tokenize):
            target_raw_text = os.path.join(raw_texts_path, target_filename) if os.path.exists(os.path.join(raw_texts_path, target_filename)) else None
 
            if not (source_raw_text or target_raw_text):
-                etree_source_sentences, etree_target_sentences, sentence_edges = process_solar2_paragraph(sentences, paragraph, svala_i, svala_data, add_errors_func, nlp,
+                etree_source_sentences, etree_target_sentences, sentence_edges, complete_source_conllu, complete_target_conllu = process_solar2_paragraph(sentences, paragraph, svala_i, svala_data, add_errors_func, nlp,
                                                                                  complete_source_conllu, complete_target_conllu)
            else:
-                etree_source_sentences, etree_target_sentences, sentence_edges = process_obeliks_paragraph(sentences, paragraph, svala_i,
+                etree_source_sentences, etree_target_sentences, sentence_edges, complete_source_conllu, complete_target_conllu = process_obeliks_paragraph(sentences, paragraph, svala_i,
                                                                                  svala_data, add_errors_func, nlp, complete_source_conllu, complete_target_conllu, source_raw_text, target_raw_text, nlp_tokenize)
 
            etree_source_paragraphs.append(construct_paragraph_from_list(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0],
                                                                           paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[1], etree_source_sentences, True))
@@ -899,10 +1130,7 @@ def process_file(et, args, nlp, nlp_tokenize):
     print('BUILDING LINKS...')
     etree_links = build_links(document_edges)
 
-    print('BUILDING COMPLETE TEI...')
-    complete_etree = build_complete_tei(copy.deepcopy(etree_source), copy.deepcopy(etree_target), etree_links)
-
-    print('WRITING FILES')
+    print('Writing all but complete')
     with open(os.path.join(args.results_folder, f"source.conllu"), 'w') as sf:
         sf.write(complete_source_conllu)
 
@@ -918,11 +1146,34 @@ def process_file(et, args, nlp, nlp_tokenize):
     with open(os.path.join(args.results_folder, f"links.xml"), 'w') as tf:
         tf.write(etree.tostring(etree_links, pretty_print=True, encoding='utf-8').decode())
 
+    with open(os.path.join(args.results_folder, f"links.json"), 'w') as jf:
+        json.dump(document_edges, jf, ensure_ascii=False, indent=" ")
+
+    # TODO: GETS STUCK HERE
+    print('COMPLETE TREE CREATION...')
+    complete_etree = build_complete_tei(copy.deepcopy(etree_source), copy.deepcopy(etree_target), etree_links)
+
+    print('WRITING FILES')
+    # with open(os.path.join(args.results_folder, f"source.conllu"), 'w') as sf:
+    #     sf.write(complete_source_conllu)
+    #
+    # with open(os.path.join(args.results_folder, f"target.conllu"), 'w') as sf:
+    #     sf.write(complete_target_conllu)
+    #
+    # with open(os.path.join(args.results_folder, f"source.xml"), 'w') as sf:
+    #     sf.write(etree.tostring(etree_source[0], pretty_print=True, encoding='utf-8').decode())
+    #
+    # with open(os.path.join(args.results_folder, f"target.xml"), 'w') as tf:
+    #     tf.write(etree.tostring(etree_target[0], pretty_print=True, encoding='utf-8').decode())
+    #
+    # with open(os.path.join(args.results_folder, f"links.xml"), 'w') as tf:
+    #     tf.write(etree.tostring(etree_links, pretty_print=True, encoding='utf-8').decode())
+
     with open(os.path.join(args.results_folder, f"complete.xml"), 'w') as tf:
         tf.write(etree.tostring(complete_etree, pretty_print=True, encoding='utf-8').decode())
 
-    with open(os.path.join(args.results_folder, f"links.json"), 'w') as jf:
-        json.dump(document_edges, jf, ensure_ascii=False, indent=" ")
+    # with open(os.path.join(args.results_folder, f"links.json"), 'w') as jf:
+    #     json.dump(document_edges, jf, ensure_ascii=False, indent=" ")
 
 
 def main(args):
@@ -947,6 +1198,8 @@ if __name__ == '__main__':
                        help='input file in (gz or xml currently). If none, then just database is loaded')
    parser.add_argument('--svala_generated_text_folder', default='data/svala_generated_text.formatted',
                        help='input file in (gz or xml currently). If none, then just database is loaded')
+    parser.add_argument('--raw_conllu_interprocessing', default='data/processing.raw_conllu',
+                        help='input file in (gz or xml currently). If none, then just database is loaded')
    args = parser.parse_args()
 
    start = time.time()
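Editor's note, not part of the patch: the HAND_FIXES change in map_svala_tokenized above aligns one svala token against several tokenizer sub-tokens, falling back to an on-the-fly expansion of '§§§' anonymisation placeholders. A minimal standalone sketch of that idea, with hypothetical names (expand_token, a trimmed-down HAND_FIXES) and none of the svala/tokenizer I/O, assuming only what the patch itself shows:

# Illustrative sketch only; mirrors the HAND_FIXES lookup plus the
# on-the-fly '§§§' placeholder expansion added in map_svala_tokenized.
HAND_FIXES = {'sin.': ['sin', '.'], '19:30': ['19', ':', '30'], '§§§': ['§', '§', '§']}

def expand_token(key):
    # return the sub-tokens a svala token is expected to align with
    if key in HAND_FIXES:
        return HAND_FIXES[key]
    if key.startswith('§§§') and key.endswith('§§§'):
        return ['§', '§', '§', key[3:-3], '§', '§', '§']
    if key.startswith('§§§'):
        return ['§', '§', '§', key[3:]]
    if key.endswith('§§§'):
        return [key[:-3], '§', '§', '§']
    raise ValueError(f'Word mismatch: {key}')

# usage: the caller consumes tokenizer output one sub-token at a time
# until the whole svala token is covered
assert expand_token('§§§pisala') == ['§', '§', '§', 'pisala']
assert expand_token('19:30') == ['19', ':', '30']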