diff --git a/svala2tei.py b/svala2tei.py
index 1315083..171ac8e 100644
--- a/svala2tei.py
+++ b/svala2tei.py
@@ -50,11 +50,24 @@ def create_edges(svala_data, source_par, target_par):

     # create links to ids mapper
     links_ids_mapper = {}
+    edges_of_one_type = set()
     for k, v in svala_data['edges'].items():
+        has_source = False
+        has_target = False
         for el in v['ids']:
+            # create edges of one type
+            if el[0] == 's':
+                has_source = True
+            if el[0] == 't':
+                has_target = True
+
+            # create links_ids_mapper
             if el not in links_ids_mapper:
                 links_ids_mapper[el] = []
             links_ids_mapper[el].append(k)
+        if not has_source or not has_target:
+            edges_of_one_type.add(k)
+
     # create edge order
     edges_order = []

@@ -80,10 +93,10 @@ def create_edges(svala_data, source_par, target_par):
             check_s_i = not check_s_i

         any_addition = False
-        # if id_of_interest not in links_ids_mapper:
-        #     print('NOOOOO')
+        if id_of_interest not in links_ids_mapper:
+            print('NOOOOO')
         for edge_id in links_ids_mapper[id_of_interest]:
-            if edge_id not in edges_processed:
+            if edge_id not in edges_processed and edge_id not in edges_of_one_type:
                 any_addition = True
                 edges_order.append(edge_id)
                 edges_processed.add(edge_id)
@@ -101,13 +114,13 @@ def create_edges(svala_data, source_par, target_par):
         target_ids = [target_mapper[el] for el in svala_data['edges'][edge_id]['ids'] if el in target_mapper]

         ids = svala_data['edges'][edge_id]['ids']
-        source_ok = [el[0] == 't' or el in source_sentence_ids[source_sent_id] for el in ids]
+        source_ok = [el[0] == 't' or el in source_sentence_ids[source_sent_id] for el in ids] if source_sentence_ids else []
         source_ok_all = all(source_ok)

         if not source_ok_all:
             source_sent_id += 1

-        target_ok = [el[0] == 's' or el in target_sentence_ids[target_sent_id] for el in ids]
+        target_ok = [el[0] == 's' or el in target_sentence_ids[target_sent_id] for el in ids] if target_sentence_ids else []
         target_ok_all = all(target_ok)

         if not target_ok_all:
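A minimal standalone sketch (not part of the patch) of the idea behind the new edges_of_one_type set above: an edge whose ids are all source ('s...') or all target ('t...') links nothing across the two texts, so it is excluded when building edges_order. The function name and sample data below are invented for illustration.

def find_one_sided_edges(edges):
    """Return the keys of edges whose ids never mix 's' and 't' prefixes."""
    one_sided = set()
    for key, edge in edges.items():
        has_source = any(el.startswith('s') for el in edge['ids'])
        has_target = any(el.startswith('t') for el in edge['ids'])
        if not has_source or not has_target:
            one_sided.add(key)
    return one_sided


sample_edges = {
    'e-s1-t1': {'ids': ['s1', 't1']},  # regular source-target link
    'e-s2': {'ids': ['s2']},           # deletion: source side only
    'e-t3': {'ids': ['t3']},           # insertion: target side only
}
print(find_one_sided_edges(sample_edges))  # {'e-s2', 'e-t3'}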
@@ -130,19 +143,19 @@ def add_token(svala_i, source_i, target_i, el, source, target, edges, svala_data
     target_token_id = f'{sentence_string_id_split[0]}t.{".".join(sentence_string_id_split[1:])}.{target_i}'
     token_tag = 'w' if el.tag.startswith('w') else 'pc'
     lemma = el.attrib['lemma'] if token_tag == 'w' else el.text
-    source.append({'token': el.text, 'tag': token_tag, 'ana': el.attrib['ana'], 'lemma': lemma, 'id': source_token_id, 'space_after': False})
-    target.append({'token': el.text, 'tag': token_tag, 'ana': el.attrib['ana'], 'lemma': lemma, 'id': target_token_id, 'space_after': False})
+    source.append({'token': el.text, 'tag': token_tag, 'ana': el.attrib['ana'], 'lemma': lemma, 'id': source_token_id, 'space_after': False, 'svala_id': source_id})
+    target.append({'token': el.text, 'tag': token_tag, 'ana': el.attrib['ana'], 'lemma': lemma, 'id': target_token_id, 'space_after': False, 'svala_id': target_id})
     edges.append({'source_ids': [source_token_id], 'target_ids': [target_token_id], 'labels': labels})


-def add_error_token(el, out_list, sentence_string_id, out_list_i, out_list_ids, is_source):
+def add_error_token(el, out_list, sentence_string_id, out_list_i, out_list_ids, is_source, s_t_id):
     sentence_string_id_split = sentence_string_id.split('.')

     source_token_id = f'{sentence_string_id_split[0]}s.{".".join(sentence_string_id_split[1:])}.{out_list_i}' if is_source \
         else f'{sentence_string_id_split[0]}t.{".".join(sentence_string_id_split[1:])}.{out_list_i}'
     token_tag = 'w' if el.tag.startswith('w') else 'pc'
     lemma = el.attrib['lemma'] if token_tag == 'w' else el.text
-    out_list.append({'token': el.text, 'tag': token_tag, 'ana': el.attrib['ana'], 'lemma': lemma, 'id': source_token_id, 'space_after': False})
+    out_list.append({'token': el.text, 'tag': token_tag, 'ana': el.attrib['ana'], 'lemma': lemma, 'id': source_token_id, 'space_after': False, 'svala_id': s_t_id})
     out_list_ids.append(source_token_id)


@@ -169,7 +182,7 @@ def add_errors1_0_1(svala_i, source_i, target_i, error, source, target, svala_da
             source_id = "s" + ind
             source_edge_ids.append(source_id)

-            add_error_token(el, source, sentence_string_id, source_i, source_ids, True)
+            add_error_token(el, source, sentence_string_id, source_i, source_ids, True, source_id)

             source_i += 1
             svala_i += 1
@@ -185,7 +198,7 @@ def add_errors1_0_1(svala_i, source_i, target_i, error, source, target, svala_da
             target_id = "t" + ind
             target_edge_ids.append(target_id)

-            add_error_token(p_el, target, sentence_string_id, target_i, target_ids, False)
+            add_error_token(p_el, target, sentence_string_id, target_i, target_ids, False, target_id)

             target_i += 1
             svala_i += 1
@@ -201,7 +214,7 @@ def add_errors1_0_1(svala_i, source_i, target_i, error, source, target, svala_da
                 source_id = "s" + ind
                 source_edge_ids.append(source_id)

-                add_error_token(el_l2, source, sentence_string_id, source_i, source_ids, True)
+                add_error_token(el_l2, source, sentence_string_id, source_i, source_ids, True, source_id)

                 source_i += 1
                 svala_i += 1
@@ -217,7 +230,7 @@ def add_errors1_0_1(svala_i, source_i, target_i, error, source, target, svala_da
                     source_id = "s" + ind
                     source_edge_ids.append(source_id)

-                    add_error_token(el_l3, source, sentence_string_id, source_i, source_ids, True)
+                    add_error_token(el_l3, source, sentence_string_id, source_i, source_ids, True, source_id)

                     source_i += 1
                     svala_i += 1
@@ -233,7 +246,7 @@ def add_errors1_0_1(svala_i, source_i, target_i, error, source, target, svala_da
                         source_id = "s" + ind
                         source_edge_ids.append(source_id)

-                        add_error_token(el_l4, source, sentence_string_id, source_i, source_ids, True)
+                        add_error_token(el_l4, source, sentence_string_id, source_i, source_ids, True, source_id)

                         source_i += 1
                         svala_i += 1
@@ -248,7 +261,7 @@ def add_errors1_0_1(svala_i, source_i, target_i, error, source, target, svala_da
                             source_id = "s" + ind
                             source_edge_ids.append(source_id)

-                            add_error_token(el_l5, source, sentence_string_id, source_i, source_ids, True)
+                            add_error_token(el_l5, source, sentence_string_id, source_i, source_ids, True, source_id)

                             source_i += 1
                             svala_i += 1
@@ -263,7 +276,7 @@ def add_errors1_0_1(svala_i, source_i, target_i, error, source, target, svala_da
             target_id = "t" + ind
             target_edge_ids.append(target_id)

-            add_error_token(p_el, target, sentence_string_id, target_i, target_ids, False)
+            add_error_token(p_el, target, sentence_string_id, target_i, target_ids, False, target_id)

             target_i += 1
             svala_i += 1
@@ -292,7 +305,7 @@ def add_errors(svala_i, source_i, target_i, error, source, target, svala_data, s
             source_id = "s" + ind
             source_edge_ids.append(source_id)

-            add_error_token(el, source, sentence_string_id, source_i, source_ids, True)
+            add_error_token(el, source, sentence_string_id, source_i, source_ids, True, source_id)

             source_i += 1
             svala_i += 1
@@ -308,7 +321,7 @@ def add_errors(svala_i, source_i, target_i, error, source, target, svala_data, s
             target_id = "t" + ind
             target_edge_ids.append(target_id)

-            add_error_token(p_el, target, sentence_string_id, target_i, target_ids, False)
+            add_error_token(p_el, target, sentence_string_id, target_i, target_ids, False, target_id)

             target_i += 1
             svala_i += 1
@@ -324,7 +337,7 @@ def add_errors(svala_i, source_i, target_i, error, source, target, svala_data, s
                 source_id = "s" + ind
                 source_edge_ids.append(source_id)

-                add_error_token(el_l2, source, sentence_string_id, source_i, source_ids, True)
+                add_error_token(el_l2, source, sentence_string_id, source_i, source_ids, True, source_id)

                 source_i += 1
                 svala_i += 1
@@ -340,7 +353,7 @@ def add_errors(svala_i, source_i, target_i, error, source, target, svala_data, s
                     source_id = "s" + ind
                     source_edge_ids.append(source_id)

-                    add_error_token(el_l3, source, sentence_string_id, source_i, source_ids, True)
+                    add_error_token(el_l3, source, sentence_string_id, source_i, source_ids, True, source_id)

                     source_i += 1
                     svala_i += 1
@@ -356,7 +369,7 @@ def add_errors(svala_i, source_i, target_i, error, source, target, svala_data, s
                         source_id = "s" + ind
                         source_edge_ids.append(source_id)

-                        add_error_token(el_l4, source, sentence_string_id, source_i, source_ids, True)
+                        add_error_token(el_l4, source, sentence_string_id, source_i, source_ids, True, source_id)

                         source_i += 1
                         svala_i += 1
@@ -371,7 +384,7 @@ def add_errors(svala_i, source_i, target_i, error, source, target, svala_data, s
                             source_id = "s" + ind
                             source_edge_ids.append(source_id)

-                            add_error_token(el_l5, source, sentence_string_id, source_i, source_ids, True)
+                            add_error_token(el_l5, source, sentence_string_id, source_i, source_ids, True, source_id)

                             source_i += 1
                             svala_i += 1
@@ -536,6 +549,9 @@ def process_solar2_paragraph(sentences, paragraph, svala_i, svala_data, add_erro

     sentence_edges = []

+    par_source = []
+    par_target = []
+
     for sentence_id, sentence in enumerate(sentences):
         source = []
         target = []
@@ -558,14 +574,17 @@ def process_solar2_paragraph(sentences, paragraph, svala_i, svala_data, add_erro
                 target_i += 1
             elif el.tag.startswith('u'):
                 svala_i, source_i, target_i = add_errors_func(svala_i, source_i, target_i, el, source, target,
-                                                              svala_data, sentence_string_id, edges=edges)
+                                                              svala_data, sentence_string_id)
             elif el.tag.startswith('c'):
                 if len(source) > 0:
                     source[-1]['space_after'] = True
                 if len(target) > 0:
                     target[-1]['space_after'] = True

-        sentence_edges.append(edges)
+        par_source.append(source)
+        par_target.append(target)
+
+        # sentence_edges.append(edges)
         if len(source) > 0:
             source_conllu = create_conllu(source, sentence_string_id)
         if len(target) > 0:
@@ -578,7 +597,8 @@ def process_solar2_paragraph(sentences, paragraph, svala_i, svala_data, add_erro

         if len(source) > 0:
             complete_source_conllu += source_conllu_annotated
-        complete_target_conllu += target_conllu_annotated
+        if len(target) > 0:
+            complete_target_conllu += target_conllu_annotated

         if len(source) > 0:
             source_conllu_parsed = conllu.parse(source_conllu_annotated)[0]
@@ -590,6 +610,8 @@ def process_solar2_paragraph(sentences, paragraph, svala_i, svala_data, add_erro
         if len(target) > 0:
             etree_target_sentences.append(construct_sentence_from_list(str(sentence_id), target_conllu_parsed, False))

+    sentence_edges = create_edges(svala_data, par_source, par_target)
+
     return etree_source_sentences, etree_target_sentences, sentence_edges

@@ -600,21 +622,56 @@ def read_raw_text(path):
 def map_svala_tokenized(svala_data_part, tokenized_paragraph):
     paragraph_res = []
     svala_data_i = 0
+    wierd_sign_count = 0
     for sentence in tokenized_paragraph:
         sentence_res = []
+        sentence_id = 0
         for tok in sentence:
             tag = 'pc' if 'xpos' in tok and tok['xpos'] == 'Z' else 'w'
             if 'misc' in tok:
                 assert tok['misc'] == 'SpaceAfter=No'
             space_after = not 'misc' in tok
             if svala_data_part[svala_data_i]['text'].strip() != tok['text']:
-                raise 'Word mismatch!'
-            sentence_res.append({'token': tok['text'], 'tag': tag, 'id': tok['id'][0], 'space_after': space_after, 'svala_id': svala_data_part[svala_data_i]['id']})
+                if tok['text'] == '§' and svala_data_part[svala_data_i]['text'].strip() == '§§§':
+                    wierd_sign_count += 1
+                    if wierd_sign_count < 3:
+                        continue
+                    else:
+                        tok['text'] = '§§§'
+                        wierd_sign_count = 0
+                else:
+                    raise ValueError('Word mismatch!')
+            sentence_id += 1
+            sentence_res.append({'token': tok['text'], 'tag': tag, 'id': sentence_id, 'space_after': space_after, 'svala_id': svala_data_part[svala_data_i]['id']})
             svala_data_i += 1
         paragraph_res.append(sentence_res)
     return paragraph_res


+def map_svala_solar2(svala_data_part, solar2_paragraph):
+    paragraph_res = []
+    svala_data_i = 0
+    wierd_sign_count = 0
+    for sentence in solar2_paragraph:
+        sentence_res = []
+        sentence_id = 0
+        for tok in sentence:
+            # if svala_data_part[svala_data_i]['text'].strip() != tok['token']:
+            #     if tok['text'] == '§' and svala_data_part[svala_data_i]['token'].strip() == '§§§':
+            #         wierd_sign_count += 1
+            #         if wierd_sign_count < 3:
+            #             continue
+            #         else:
+            #             tok['text'] = '§§§'
+            #             wierd_sign_count = 0
+            #     else:
+            #         raise 'Word mismatch!'
+            assert svala_data_part[svala_data_i]['text'].strip() == tok['token']
+            sentence_id += 1
+            tok['svala_id'] = svala_data_part[svala_data_i]['id']
+            svala_data_i += 1
+
+
 def update_ids(pretag, in_list):
     for el in in_list:
         el['id'] = f'{pretag}.{el["id"]}'
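A standalone sketch (not part of the patch) of the '§§§' workaround in map_svala_tokenized above: the tokenizer can split the paragraph placeholder '§§§' into three separate '§' tokens, so the first two are skipped and the third is rewritten back to '§§§' before being matched against the svala token. The helper name below is an invented illustration.

def merge_paragraph_signs(tokens):
    """Collapse each run of three '§' tokens into a single '§§§' token."""
    merged = []
    sign_count = 0
    for tok in tokens:
        if tok == '§':
            sign_count += 1
            if sign_count < 3:
                continue
            tok = '§§§'
            sign_count = 0
        merged.append(tok)
    return merged


print(merge_paragraph_signs(['To', '§', '§', '§', 'je', 'test']))
# ['To', '§§§', 'je', 'test']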
@@ -627,73 +684,88 @@ def process_obeliks_paragraph(sentences, paragraph, svala_i, svala_data, add_err
     sentence_edges = []
     if source_raw_text is not None:
         text = read_raw_text(source_raw_text)
-        raw_text, source_tokenized, metadocument = nlp_tokenize.processors['tokenize']._tokenizer.tokenize(text)
+        raw_text, source_tokenized, metadocument = nlp_tokenize.processors['tokenize']._tokenizer.tokenize(text) if text else ([], [], [])
         # source_tokenized = nlp_tokenize()
         source_res = map_svala_tokenized(svala_data['source'], source_tokenized)

     if target_raw_text is not None:
         text = read_raw_text(target_raw_text)
-        raw_text, target_tokenized, metadocument = nlp_tokenize.processors['tokenize']._tokenizer.tokenize(text)
+        raw_text, target_tokenized, metadocument = nlp_tokenize.processors['tokenize']._tokenizer.tokenize(text) if text else ([], [], [])
         target_res = map_svala_tokenized(svala_data['target'], target_tokenized)

     # TODO RETURN IF SOURCE AND TARGET ARE NOT NONE
     par_source = []
     par_target = []

-    for sentence_id, sentence in enumerate(sentences):
+    sentences_len = len(sentences)
+    if source_raw_text is not None:
+        sentences_len = max(sentences_len, len(source_res))
+    if target_raw_text is not None:
+        sentences_len = max(sentences_len, len(target_res))
+    for sentence_id in range(sentences_len):
+        # assert sentence_id < len(sentences)
+
+        # sentence_id += 1
+    # for sentence_id, sentence in enumerate(sentences):
         source = []
         target = []
-        sentence_id += 1
         source_i = 1
         target_i = 1
         sentence_string_id = paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + f'.{sentence_id}'
-        if sentence_string_id == 'solar185.2.1':
-            print('HERE!')
         sentence_string_id_split = sentence_string_id.split('.')
-        for el in sentence:
-            if el.tag.startswith('w'):
-                if source_raw_text is None:
-                    add_source(str(svala_i), source_i, sentence_string_id_split, source, el)
-                if target_raw_text is None:
-                    add_target(str(svala_i), target_i, sentence_string_id_split, target, el)
-                # add_edges(source_id, target_id, svala_data, edges, source_token_id, target_token_id)
-                svala_i += 1
-                source_i += 1
-                target_i += 1
-            elif el.tag.startswith('pc'):
-                if source_raw_text is None:
-                    add_source(str(svala_i), source_i, sentence_string_id_split, source, el)
-                if target_raw_text is None:
-                    add_target(str(svala_i), target_i, sentence_string_id_split, target, el)
-                # add_edges(source_id, target_id, svala_data, edges, source_token_id, target_token_id)
+        if sentence_id - 1 < len(sentences):
+            sentence = sentences[sentence_id - 1]
+            for el in sentence:
+                # if source_i == 101:
+                #     print('HMM')
+                if el.tag.startswith('w'):
+                    if source_raw_text is None:
+                        add_source(str(svala_i), source_i, sentence_string_id_split, source, el)
+                    if target_raw_text is None:
+                        add_target(str(svala_i), target_i, sentence_string_id_split, target, el)
+                    # add_edges(source_id, target_id, svala_data, edges, source_token_id, target_token_id)

-                svala_i += 1
-                source_i += 1
-                target_i += 1
-            elif el.tag.startswith('u'):
-                if source_raw_text is None or target_raw_text is None:
-                    svala_i, source_i, target_i = add_errors_source_target_only(svala_i, source_i, target_i, el, source, target, svala_data, sentence_string_id)
-                else:
-                    svala_i, source_i, target_i = add_errors_func(svala_i, source_i, target_i, el, source, target,
-                                                                  svala_data, sentence_string_id)
-            elif el.tag.startswith('c'):
-                if len(source) > 0:
-                    source[-1]['space_after'] = True
-                if len(target) > 0:
-                    target[-1]['space_after'] = True
+                    svala_i += 1
+                    source_i += 1
+                    target_i += 1
+                elif el.tag.startswith('pc'):
+                    if source_raw_text is None:
+                        add_source(str(svala_i), source_i, sentence_string_id_split, source, el)
+                    if target_raw_text is None:
+                        add_target(str(svala_i), target_i, sentence_string_id_split, target, el)
+                    # add_edges(source_id, target_id, svala_data, edges, source_token_id, target_token_id)
+
+                    svala_i += 1
+                    source_i += 1
+                    target_i += 1
+                elif el.tag.startswith('u'):
+                    if source_raw_text is None or target_raw_text is None:
+                        svala_i, source_i, target_i = add_errors_source_target_only(svala_i, source_i, target_i, el, source, target, svala_data, sentence_string_id)
+                    else:
+                        svala_i, source_i, target_i = add_errors_func(svala_i, source_i, target_i, el, source, target,
+                                                                      svala_data, sentence_string_id)
+                elif el.tag.startswith('c'):
+                    if len(source) > 0:
+                        source[-1]['space_after'] = True
+                    if len(target) > 0:
+                        target[-1]['space_after'] = True

         if source_raw_text is not None and sentence_id - 1 < len(source_res):
             source = source_res[sentence_id - 1]
             update_ids(f'{sentence_string_id_split[0]}s.{".".join(sentence_string_id_split[1:])}', source)
+            par_source.append(source)
         if len(source) > 0:
             source_conllu = create_conllu(source, sentence_string_id)

         if target_raw_text is not None and sentence_id - 1 < len(target_res):
             target = target_res[sentence_id - 1]
             update_ids(f'{sentence_string_id_split[0]}t.{".".join(sentence_string_id_split[1:])}', target)
+            par_target.append(target)

-        par_source.append(source)
-        par_target.append(target)
+        if source_raw_text is None:
+            par_source.append(source)
+        if target_raw_text is None:
+            par_target.append(target)

         if len(target) > 0:
             target_conllu = create_conllu(target, sentence_string_id)
@@ -718,6 +790,12 @@ def process_obeliks_paragraph(sentences, paragraph, svala_i, svala_data, add_err
         if len(target) > 0:
             etree_target_sentences.append(construct_sentence_from_list(str(sentence_id), target_conllu_parsed, False))

+    # reannotate svala_ids
+    if source_raw_text is None:
+        map_svala_solar2(svala_data['source'], par_source)
+    if target_raw_text is None:
+        map_svala_solar2(svala_data['target'], par_target)
+
     sentence_edges = create_edges(svala_data, par_source, par_target)

     return etree_source_sentences, etree_target_sentences, sentence_edges
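A standalone sketch (not part of the patch) of the loop-bound idea in process_obeliks_paragraph above: when raw source or target text is retokenized it can yield more sentences than the TEI paragraph, so the loop runs to the largest of the three counts and each list is only indexed while it still has elements. The helper name and the 1-based ids below are illustrative assumptions.

def paired_sentences(sentences, source_res, target_res):
    """Yield (sentence_id, tei_sentence, source_sentence, target_sentence) tuples."""
    n = max(len(sentences), len(source_res), len(target_res))
    for i in range(n):
        tei = sentences[i] if i < len(sentences) else None
        src = source_res[i] if i < len(source_res) else None
        tgt = target_res[i] if i < len(target_res) else None
        yield i + 1, tei, src, tgt


for sid, tei, src, tgt in paired_sentences(['S1'], ['s1', 's2'], ['t1']):
    print(sid, tei, src, tgt)
# 1 S1 s1 t1
# 2 None s2 None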
@@ -735,10 +813,19 @@ def process_file(et, args, nlp, nlp_tokenize):
     complete_target_conllu = ''

     document_edges = []
+    filename_encountered = False
+    i = 0
+    folders_count = 5484
     for div in et.iter('div'):
         bibl = div.find('bibl')
         file_name = bibl.get('n')
         file_name = file_name.replace('/', '_')
+        print(f'{i*100/folders_count} % : {file_name}')
+        i += 1
+        # if file_name == 'S20-PI-slo-2-SG-D-2016_2017-30479-12.txt':
+        #     filename_encountered = True
+        # if not filename_encountered:
+        #     continue

         svala_path = os.path.join(args.svala_folder, file_name)
         corrected_svala_path = os.path.join(args.corrected_svala_folder, file_name)
@@ -750,6 +837,12 @@ def process_file(et, args, nlp, nlp_tokenize):
         svala_list = [[fname[:-13], fname] if 'problem' in fname else [fname[:-5], fname] for fname in os.listdir(svala_path)]
         svala_dict = {e[0]: e[1] for e in svala_list}

+        if os.path.exists(corrected_svala_path):
+            corrected_svala_list = [[fname[:-13], fname] if 'problem' in fname else [fname[:-5], fname] for fname in os.listdir(corrected_svala_path)]
+            corrected_svala_dict = {e[0]: e[1] for e in corrected_svala_list}
+
+            svala_dict.update(corrected_svala_dict)
+
         etree_source_paragraphs = []
         etree_target_paragraphs = []
         paragraph_edges = []
@@ -760,10 +853,12 @@ def process_file(et, args, nlp, nlp_tokenize):
             svala_i = 1

             # read json
+            # if paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] == 'solar5.7':
+            #     print('here')
             svala_file = os.path.join(svala_path, svala_dict[paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id']])
             corrected_svala_file = os.path.join(corrected_svala_path, svala_dict[paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id']])
-            if os.path.exists(corrected_svala_file):
-                print('aaa')
+            # if os.path.exists(corrected_svala_file):
+            #     print('aaa')
             add_errors_func = add_errors if not os.path.exists(corrected_svala_file) else add_errors1_0_1
             jf = open(svala_file) if not os.path.exists(corrected_svala_file) else open(corrected_svala_file)
             svala_data = json.load(jf)
@@ -793,16 +888,21 @@ def process_file(et, args, nlp, nlp_tokenize):
         etree_target_divs.append((etree_target_paragraphs, copy.deepcopy(etree_bibl)))
         document_edges.append(paragraph_edges)

+    print('APPENDING DOCUMENT...')
     etree_source_documents.append(TeiDocument(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 's', etree_source_divs))
     etree_target_documents.append(TeiDocument(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 't', etree_target_divs))

+    print('BUILDING TEI DOCUMENTS...')
     etree_source = build_tei_etrees(etree_source_documents)
     etree_target = build_tei_etrees(etree_target_documents)

+    print('BUILDING LINKS...')
     etree_links = build_links(document_edges)

+    print('BUILDING COMPLETE TEI...')
     complete_etree = build_complete_tei(copy.deepcopy(etree_source), copy.deepcopy(etree_target), etree_links)

+    print('WRITING FILES')
     with open(os.path.join(args.results_folder, f"source.conllu"), 'w') as sf:
         sf.write(complete_source_conllu)

@@ -841,7 +941,7 @@ if __name__ == '__main__':
                         help='input file in (gz or xml currently). If none, then just database is loaded')
     parser.add_argument('--svala_folder', default='data/solar.svala',
                         help='input file in (gz or xml currently). If none, then just database is loaded')
-    parser.add_argument('--corrected_svala_folder', default='data/solar.svala.fixed.1.0.1',
+    parser.add_argument('--corrected_svala_folder', default='data/solar.svala.fixed.1.0.1_2',
                         help='input file in (gz or xml currently). If none, then just database is loaded')
     parser.add_argument('--results_folder', default='data/results/solar3.0',
                         help='input file in (gz or xml currently). If none, then just database is loaded')
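A standalone sketch (not part of the patch) of the lookup merge added in process_file above: paragraph ids are mapped to svala file names, and entries from the corrected folder, when it exists, override the originals via dict.update(). The folder arguments are placeholders, and the svala folder itself is assumed to exist, mirroring the patched code.

import os


def build_svala_dict(folder):
    """Map '<paragraph id>' -> '<file name>' for one folder of svala JSON files."""
    pairs = [[fname[:-13], fname] if 'problem' in fname else [fname[:-5], fname]
             for fname in os.listdir(folder)]
    return {par_id: fname for par_id, fname in pairs}


def merged_svala_dict(svala_folder, corrected_folder):
    svala_dict = build_svala_dict(svala_folder)
    if os.path.exists(corrected_folder):
        # hand-corrected files take precedence over the originals
        svala_dict.update(build_svala_dict(corrected_folder))
    return svala_dict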
diff --git a/svala_formatter/copy_svala_handchecked_files.py b/svala_formatter/copy_svala_handchecked_files.py
index a260af6..746cefe 100644
--- a/svala_formatter/copy_svala_handchecked_files.py
+++ b/svala_formatter/copy_svala_handchecked_files.py
@@ -38,12 +38,23 @@ def compare_files(corrected_file, original_file):
 def main(args):
     # create mapper to corrected files
     corrected_files_mapper = {}
+    filename_encountered = False
     for foldername in os.listdir(args.corrected_folder):
         orig_name = 'KUS' + foldername.split('KUS')[1]
+        # if orig_name == 'KUS-G-slo-4-GO-E-2009-10105':
+        #     filename_encountered = True
+        # if not filename_encountered:
+        #     continue
         corrected_files_mapper[orig_name] = foldername

+    filename_encountered = False
     for foldername in os.listdir(args.original_folder):
+        # if foldername == 'KUS-G-slo-4-GO-E-2009-10105':
+        #     filename_encountered = True
+        # if not filename_encountered:
+        #     continue
         for filename in os.listdir(os.path.join(args.original_folder, foldername)):
+            fixed = False
             of = os.path.join(args.original_folder, foldername, filename)
             copy_filename = filename
             if filename.endswith('_problem.json'):
@@ -55,12 +66,13 @@ def main(args):
             if filename.endswith('_problem.json'):
                 new_filename = filename[:-13] + '_popravljeno.json'
                 if os.path.exists(os.path.join(args.corrected_folder, corrected_files_mapper[foldername], new_filename)):
+                    fixed = True
                     filename = new_filename
             cf = os.path.join(args.corrected_folder, corrected_files_mapper[foldername], filename)
             cor_files = read_json(cf)
             ori_files = read_json(of)
             target, source = compare_files(cor_files, ori_files)
-            if target or source:
+            if target or source or fixed:
                 if not os.path.exists(cpfol):
                     os.mkdir(cpfol)
                 shutil.copyfile(cf, cpf)