diff --git a/svala2tei.py b/svala2tei.py
index 1315083..171ac8e 100644
--- a/svala2tei.py
+++ b/svala2tei.py
@@ -50,11 +50,24 @@ def create_edges(svala_data, source_par, target_par):
 
     # create links to ids mapper
     links_ids_mapper = {}
+    edges_of_one_type = set()
     for k, v in svala_data['edges'].items():
+        has_source = False
+        has_target = False
         for el in v['ids']:
+            # create edges of one type
+            if el[0] == 's':
+                has_source = True
+            if el[0] == 't':
+                has_target = True
+
+            # create links_ids_mapper
             if el not in links_ids_mapper:
                 links_ids_mapper[el] = []
             links_ids_mapper[el].append(k)
+        if not has_source or not has_target:
+            edges_of_one_type.add(k)
+
     # create edge order
     edges_order = []
@@ -80,10 +93,10 @@ def create_edges(svala_data, source_par, target_par):
                 check_s_i = not check_s_i
 
             any_addition = False
-            # if id_of_interest not in links_ids_mapper:
-            #     print('NOOOOO')
+            if id_of_interest not in links_ids_mapper:
+                print('NOOOOO')
             for edge_id in links_ids_mapper[id_of_interest]:
-                if edge_id not in edges_processed:
+                if edge_id not in edges_processed and edge_id not in edges_of_one_type:
                     any_addition = True
                     edges_order.append(edge_id)
                     edges_processed.add(edge_id)
@@ -101,13 +114,13 @@ def create_edges(svala_data, source_par, target_par):
             target_ids = [target_mapper[el] for el in svala_data['edges'][edge_id]['ids'] if el in target_mapper]
             ids = svala_data['edges'][edge_id]['ids']
 
-            source_ok = [el[0] == 't' or el in source_sentence_ids[source_sent_id] for el in ids]
+            source_ok = [el[0] == 't' or el in source_sentence_ids[source_sent_id] for el in ids] if source_sentence_ids else []
 
             source_ok_all = all(source_ok)
 
             if not source_ok_all:
                 source_sent_id += 1
 
-            target_ok = [el[0] == 's' or el in target_sentence_ids[target_sent_id] for el in ids]
+            target_ok = [el[0] == 's' or el in target_sentence_ids[target_sent_id] for el in ids] if target_sentence_ids else []
 
             target_ok_all = all(target_ok)
 
             if not target_ok_all:
@@ -130,19 +143,19 @@ def add_token(svala_i, source_i, target_i, el, source, target, edges, svala_data
     target_token_id = f'{sentence_string_id_split[0]}t.{".".join(sentence_string_id_split[1:])}.{target_i}'
     token_tag = 'w' if el.tag.startswith('w') else 'pc'
     lemma = el.attrib['lemma'] if token_tag == 'w' else el.text
-    source.append({'token': el.text, 'tag': token_tag, 'ana': el.attrib['ana'], 'lemma': lemma, 'id': source_token_id, 'space_after': False})
-    target.append({'token': el.text, 'tag': token_tag, 'ana': el.attrib['ana'], 'lemma': lemma, 'id': target_token_id, 'space_after': False})
+    source.append({'token': el.text, 'tag': token_tag, 'ana': el.attrib['ana'], 'lemma': lemma, 'id': source_token_id, 'space_after': False, 'svala_id': source_id})
+    target.append({'token': el.text, 'tag': token_tag, 'ana': el.attrib['ana'], 'lemma': lemma, 'id': target_token_id, 'space_after': False, 'svala_id': target_id})
     edges.append({'source_ids': [source_token_id], 'target_ids': [target_token_id], 'labels': labels})
 
 
-def add_error_token(el, out_list, sentence_string_id, out_list_i, out_list_ids, is_source):
+def add_error_token(el, out_list, sentence_string_id, out_list_i, out_list_ids, is_source, s_t_id):
     sentence_string_id_split = sentence_string_id.split('.')
 
     source_token_id = f'{sentence_string_id_split[0]}s.{".".join(sentence_string_id_split[1:])}.{out_list_i}' if is_source \
         else f'{sentence_string_id_split[0]}t.{".".join(sentence_string_id_split[1:])}.{out_list_i}'
     token_tag = 'w' if el.tag.startswith('w') else 'pc'
     lemma = el.attrib['lemma'] if token_tag == 'w' else el.text
-    out_list.append({'token': el.text, 'tag': token_tag, 'ana': el.attrib['ana'], 'lemma': lemma, 'id': source_token_id, 'space_after': False})
+    out_list.append({'token': el.text, 'tag': token_tag, 'ana': el.attrib['ana'], 'lemma': lemma, 'id': source_token_id, 'space_after': False, 'svala_id': s_t_id})
     out_list_ids.append(source_token_id)
@@ -169,7 +182,7 @@ def add_errors1_0_1(svala_i, source_i, target_i, error, source, target, svala_da
                 source_id = "s" + ind
                 source_edge_ids.append(source_id)
 
-                add_error_token(el, source, sentence_string_id, source_i, source_ids, True)
+                add_error_token(el, source, sentence_string_id, source_i, source_ids, True, source_id)
 
                 source_i += 1
                 svala_i += 1
@@ -185,7 +198,7 @@ def add_errors1_0_1(svala_i, source_i, target_i, error, source, target, svala_da
                 target_id = "t" + ind
                 target_edge_ids.append(target_id)
 
-                add_error_token(p_el, target, sentence_string_id, target_i, target_ids, False)
+                add_error_token(p_el, target, sentence_string_id, target_i, target_ids, False, target_id)
 
                 target_i += 1
                 svala_i += 1
@@ -201,7 +214,7 @@ def add_errors1_0_1(svala_i, source_i, target_i, error, source, target, svala_da
                 source_id = "s" + ind
                 source_edge_ids.append(source_id)
 
-                add_error_token(el_l2, source, sentence_string_id, source_i, source_ids, True)
+                add_error_token(el_l2, source, sentence_string_id, source_i, source_ids, True, source_id)
 
                 source_i += 1
                 svala_i += 1
@@ -217,7 +230,7 @@ def add_errors1_0_1(svala_i, source_i, target_i, error, source, target, svala_da
                 source_id = "s" + ind
                 source_edge_ids.append(source_id)
 
-                add_error_token(el_l3, source, sentence_string_id, source_i, source_ids, True)
+                add_error_token(el_l3, source, sentence_string_id, source_i, source_ids, True, source_id)
 
                 source_i += 1
                 svala_i += 1
@@ -233,7 +246,7 @@ def add_errors1_0_1(svala_i, source_i, target_i, error, source, target, svala_da
                 source_id = "s" + ind
                 source_edge_ids.append(source_id)
 
-                add_error_token(el_l4, source, sentence_string_id, source_i, source_ids, True)
+                add_error_token(el_l4, source, sentence_string_id, source_i, source_ids, True, source_id)
 
                 source_i += 1
                 svala_i += 1
@@ -248,7 +261,7 @@ def add_errors1_0_1(svala_i, source_i, target_i, error, source, target, svala_da
                 source_id = "s" + ind
                 source_edge_ids.append(source_id)
 
-                add_error_token(el_l5, source, sentence_string_id, source_i, source_ids, True)
+                add_error_token(el_l5, source, sentence_string_id, source_i, source_ids, True, source_id)
 
                 source_i += 1
                 svala_i += 1
@@ -263,7 +276,7 @@ def add_errors1_0_1(svala_i, source_i, target_i, error, source, target, svala_da
                 target_id = "t" + ind
                 target_edge_ids.append(target_id)
 
-                add_error_token(p_el, target, sentence_string_id, target_i, target_ids, False)
+                add_error_token(p_el, target, sentence_string_id, target_i, target_ids, False, target_id)
 
                 target_i += 1
                 svala_i += 1
@@ -292,7 +305,7 @@ def add_errors(svala_i, source_i, target_i, error, source, target, svala_data, s
                 source_id = "s" + ind
                 source_edge_ids.append(source_id)
 
-                add_error_token(el, source, sentence_string_id, source_i, source_ids, True)
+                add_error_token(el, source, sentence_string_id, source_i, source_ids, True, source_id)
 
                 source_i += 1
                 svala_i += 1
@@ -308,7 +321,7 @@ def add_errors(svala_i, source_i, target_i, error, source, target, svala_data, s
                 target_id = "t" + ind
                 target_edge_ids.append(target_id)
 
-                add_error_token(p_el, target, sentence_string_id, target_i, target_ids, False)
+                add_error_token(p_el, target, sentence_string_id, target_i, target_ids, False, target_id)
 
                 target_i += 1
                 svala_i += 1
@@ -324,7 +337,7 @@ def add_errors(svala_i, source_i, target_i, error, source, target, svala_data, s
                 source_id = "s" + ind
                 source_edge_ids.append(source_id)
 
-                add_error_token(el_l2, source, sentence_string_id, source_i, source_ids, True)
+                add_error_token(el_l2, source, sentence_string_id, source_i, source_ids, True, source_id)
 
                 source_i += 1
                 svala_i += 1
@@ -340,7 +353,7 @@ def add_errors(svala_i, source_i, target_i, error, source, target, svala_data, s
                 source_id = "s" + ind
                 source_edge_ids.append(source_id)
 
-                add_error_token(el_l3, source, sentence_string_id, source_i, source_ids, True)
+                add_error_token(el_l3, source, sentence_string_id, source_i, source_ids, True, source_id)
 
                 source_i += 1
                 svala_i += 1
@@ -356,7 +369,7 @@ def add_errors(svala_i, source_i, target_i, error, source, target, svala_data, s
                 source_id = "s" + ind
                 source_edge_ids.append(source_id)
 
-                add_error_token(el_l4, source, sentence_string_id, source_i, source_ids, True)
+                add_error_token(el_l4, source, sentence_string_id, source_i, source_ids, True, source_id)
 
                 source_i += 1
                 svala_i += 1
@@ -371,7 +384,7 @@ def add_errors(svala_i, source_i, target_i, error, source, target, svala_data, s
                 source_id = "s" + ind
                 source_edge_ids.append(source_id)
 
-                add_error_token(el_l5, source, sentence_string_id, source_i, source_ids, True)
+                add_error_token(el_l5, source, sentence_string_id, source_i, source_ids, True, source_id)
 
                 source_i += 1
                 svala_i += 1
@@ -536,6 +549,9 @@ def process_solar2_paragraph(sentences, paragraph, svala_i, svala_data, add_erro
 
     sentence_edges = []
 
+    par_source = []
+    par_target = []
+
     for sentence_id, sentence in enumerate(sentences):
         source = []
         target = []
@@ -558,143 +574,19 @@ def process_solar2_paragraph(sentences, paragraph, svala_i, svala_data, add_erro
                 target_i += 1
             elif el.tag.startswith('u'):
                 svala_i, source_i, target_i = add_errors_func(svala_i, source_i, target_i, el, source, target,
-                                                              svala_data, sentence_string_id, edges=edges)
+                                                              svala_data, sentence_string_id)
             elif el.tag.startswith('c'):
                 if len(source) > 0:
                     source[-1]['space_after'] = True
                 if len(target) > 0:
                     target[-1]['space_after'] = True
 
-        sentence_edges.append(edges)
-        if len(source) > 0:
-            source_conllu = create_conllu(source, sentence_string_id)
-        if len(target) > 0:
-            target_conllu = create_conllu(target, sentence_string_id)
-
-        if len(source) > 0:
-            source_conllu_annotated = nlp(source_conllu).to_conll()
-        if len(target) > 0:
-            target_conllu_annotated = nlp(target_conllu).to_conll()
-
-        if len(source) > 0:
-            complete_source_conllu += source_conllu_annotated
-            complete_target_conllu += target_conllu_annotated
-
-        if len(source) > 0:
-            source_conllu_parsed = conllu.parse(source_conllu_annotated)[0]
-        if len(target) > 0:
-            target_conllu_parsed = conllu.parse(target_conllu_annotated)[0]
-
-        if len(source) > 0:
-            etree_source_sentences.append(construct_sentence_from_list(str(sentence_id), source_conllu_parsed, True))
-        if len(target) > 0:
-            etree_target_sentences.append(construct_sentence_from_list(str(sentence_id), target_conllu_parsed, False))
-
-    return etree_source_sentences, etree_target_sentences, sentence_edges
-
-
-def read_raw_text(path):
-    with open(path, 'r') as rf:
-        return rf.read()
-
-def map_svala_tokenized(svala_data_part, tokenized_paragraph):
-    paragraph_res = []
-    svala_data_i = 0
-    for sentence in tokenized_paragraph:
-        sentence_res = []
-        for tok in sentence:
-            tag = 'pc' if 'xpos' in tok and tok['xpos'] == 'Z' else 'w'
-            if 'misc' in tok:
-                assert tok['misc'] == 'SpaceAfter=No'
-            space_after = not 'misc' in tok
-            if svala_data_part[svala_data_i]['text'].strip() != tok['text']:
-                raise 'Word mismatch!'
-            sentence_res.append({'token': tok['text'], 'tag': tag, 'id': tok['id'][0], 'space_after': space_after, 'svala_id': svala_data_part[svala_data_i]['id']})
-            svala_data_i += 1
-        paragraph_res.append(sentence_res)
-    return paragraph_res
-
-
-def update_ids(pretag, in_list):
-    for el in in_list:
-        el['id'] = f'{pretag}.{el["id"]}'
-
-
-def process_obeliks_paragraph(sentences, paragraph, svala_i, svala_data, add_errors_func, nlp, complete_source_conllu, complete_target_conllu, source_raw_text, target_raw_text, nlp_tokenize):
-    etree_source_sentences = []
-    etree_target_sentences = []
-
-    sentence_edges = []
-    if source_raw_text is not None:
-        text = read_raw_text(source_raw_text)
-        raw_text, source_tokenized, metadocument = nlp_tokenize.processors['tokenize']._tokenizer.tokenize(text)
-        # source_tokenized = nlp_tokenize()
-        source_res = map_svala_tokenized(svala_data['source'], source_tokenized)
-
-    if target_raw_text is not None:
-        text = read_raw_text(target_raw_text)
-        raw_text, target_tokenized, metadocument = nlp_tokenize.processors['tokenize']._tokenizer.tokenize(text)
-        target_res = map_svala_tokenized(svala_data['target'], target_tokenized)
-
-    # TODO RETURN IF SOURCE AND TARGET ARE NOT NONE
-    par_source = []
-    par_target = []
-    for sentence_id, sentence in enumerate(sentences):
-        source = []
-        target = []
-
-        sentence_id += 1
-        source_i = 1
-        target_i = 1
-        sentence_string_id = paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + f'.{sentence_id}'
-        if sentence_string_id == 'solar185.2.1':
-            print('HERE!')
-        sentence_string_id_split = sentence_string_id.split('.')
-        for el in sentence:
-            if el.tag.startswith('w'):
-                if source_raw_text is None:
-                    add_source(str(svala_i), source_i, sentence_string_id_split, source, el)
-                if target_raw_text is None:
-                    add_target(str(svala_i), target_i, sentence_string_id_split, target, el)
-                # add_edges(source_id, target_id, svala_data, edges, source_token_id, target_token_id)
-
-                svala_i += 1
-                source_i += 1
-                target_i += 1
-            elif el.tag.startswith('pc'):
-                if source_raw_text is None:
-                    add_source(str(svala_i), source_i, sentence_string_id_split, source, el)
-                if target_raw_text is None:
-                    add_target(str(svala_i), target_i, sentence_string_id_split, target, el)
-                # add_edges(source_id, target_id, svala_data, edges, source_token_id, target_token_id)
-
-                svala_i += 1
-                source_i += 1
-                target_i += 1
-            elif el.tag.startswith('u'):
-                if source_raw_text is None or target_raw_text is None:
-                    svala_i, source_i, target_i = add_errors_source_target_only(svala_i, source_i, target_i, el, source, target, svala_data, sentence_string_id)
-                else:
-                    svala_i, source_i, target_i = add_errors_func(svala_i, source_i, target_i, el, source, target,
-                                                                  svala_data, sentence_string_id)
-            elif el.tag.startswith('c'):
-                if len(source) > 0:
-                    source[-1]['space_after'] = True
-                if len(target) > 0:
-                    target[-1]['space_after'] = True
-
-        if source_raw_text is not None and sentence_id - 1 < len(source_res):
-            source = source_res[sentence_id - 1]
-            update_ids(f'{sentence_string_id_split[0]}s.{".".join(sentence_string_id_split[1:])}', source)
-        if len(source) > 0:
-            source_conllu = create_conllu(source, sentence_string_id)
-        if target_raw_text is not None and sentence_id - 1 < len(target_res):
-            target = target_res[sentence_id - 1]
-            update_ids(f'{sentence_string_id_split[0]}t.{".".join(sentence_string_id_split[1:])}', target)
-
         par_source.append(source)
         par_target.append(target)
+        # sentence_edges.append(edges)
+        if len(source) > 0:
+            source_conllu = create_conllu(source, sentence_string_id)
         if len(target) > 0:
             target_conllu = create_conllu(target, sentence_string_id)
@@ -722,6 +614,192 @@ def process_obeliks_paragraph(sentences, paragraph, svala_i, svala_data, add_err
 
     return etree_source_sentences, etree_target_sentences, sentence_edges
 
+
+def read_raw_text(path):
+    with open(path, 'r') as rf:
+        return rf.read()
+
+
+def map_svala_tokenized(svala_data_part, tokenized_paragraph):
+    paragraph_res = []
+    svala_data_i = 0
+    weird_sign_count = 0
+    for sentence in tokenized_paragraph:
+        sentence_res = []
+        sentence_id = 0
+        for tok in sentence:
+            tag = 'pc' if 'xpos' in tok and tok['xpos'] == 'Z' else 'w'
+            if 'misc' in tok:
+                assert tok['misc'] == 'SpaceAfter=No'
+            space_after = not 'misc' in tok
+            if svala_data_part[svala_data_i]['text'].strip() != tok['text']:
+                if tok['text'] == '§' and svala_data_part[svala_data_i]['text'].strip() == '§§§':
+                    weird_sign_count += 1
+                    if weird_sign_count < 3:
+                        continue
+                    else:
+                        tok['text'] = '§§§'
+                        weird_sign_count = 0
+                else:
+                    raise ValueError('Word mismatch!')
+            sentence_id += 1
+            sentence_res.append({'token': tok['text'], 'tag': tag, 'id': sentence_id, 'space_after': space_after, 'svala_id': svala_data_part[svala_data_i]['id']})
+            svala_data_i += 1
+        paragraph_res.append(sentence_res)
+    return paragraph_res
+
+
+def map_svala_solar2(svala_data_part, solar2_paragraph):
+    paragraph_res = []
+    svala_data_i = 0
+    weird_sign_count = 0
+    for sentence in solar2_paragraph:
+        sentence_res = []
+        sentence_id = 0
+        for tok in sentence:
+            # if svala_data_part[svala_data_i]['text'].strip() != tok['token']:
+            #     if tok['text'] == '§' and svala_data_part[svala_data_i]['token'].strip() == '§§§':
+            #         weird_sign_count += 1
+            #         if weird_sign_count < 3:
+            #             continue
+            #         else:
+            #             tok['text'] = '§§§'
+            #             weird_sign_count = 0
+            #     else:
+            #         raise 'Word mismatch!'
+            assert svala_data_part[svala_data_i]['text'].strip() == tok['token']
+            sentence_id += 1
+            tok['svala_id'] = svala_data_part[svala_data_i]['id']
+            svala_data_i += 1
+
+
+def update_ids(pretag, in_list):
+    for el in in_list:
+        el['id'] = f'{pretag}.{el["id"]}'
+
+
+def process_obeliks_paragraph(sentences, paragraph, svala_i, svala_data, add_errors_func, nlp, complete_source_conllu, complete_target_conllu, source_raw_text, target_raw_text, nlp_tokenize):
+    etree_source_sentences = []
+    etree_target_sentences = []
+
+    sentence_edges = []
+    if source_raw_text is not None:
+        text = read_raw_text(source_raw_text)
+        raw_text, source_tokenized, metadocument = nlp_tokenize.processors['tokenize']._tokenizer.tokenize(text) if text else ([], [], [])
+        # source_tokenized = nlp_tokenize()
+        source_res = map_svala_tokenized(svala_data['source'], source_tokenized)
+
+    if target_raw_text is not None:
+        text = read_raw_text(target_raw_text)
+        raw_text, target_tokenized, metadocument = nlp_tokenize.processors['tokenize']._tokenizer.tokenize(text) if text else ([], [], [])
+        target_res = map_svala_tokenized(svala_data['target'], target_tokenized)
+
+    # TODO RETURN IF SOURCE AND TARGET ARE NOT NONE
+    par_source = []
+    par_target = []
+    sentences_len = len(sentences)
+    if source_raw_text is not None:
+        sentences_len = max(sentences_len, len(source_res))
+    if target_raw_text is not None:
+        sentences_len = max(sentences_len, len(target_res))
+    for sentence_id in range(sentences_len):
+        # assert sentence_id < len(sentences)
+        # sentence_id += 1
+        # for sentence_id, sentence in enumerate(sentences):
+        source = []
+        target = []
+        sentence_id += 1
+        source_i = 1
+        target_i = 1
+        sentence_string_id = paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + f'.{sentence_id}'
+        sentence_string_id_split = sentence_string_id.split('.')
+
+        if sentence_id - 1 < len(sentences):
+            sentence = sentences[sentence_id - 1]
+            for el in sentence:
+                # if source_i == 101:
+                #     print('HMM')
+                if el.tag.startswith('w'):
+                    if source_raw_text is None:
+                        add_source(str(svala_i), source_i, sentence_string_id_split, source, el)
+                    if target_raw_text is None:
+                        add_target(str(svala_i), target_i, sentence_string_id_split, target, el)
+                    # add_edges(source_id, target_id, svala_data, edges, source_token_id, target_token_id)
+
+                    svala_i += 1
+                    source_i += 1
+                    target_i += 1
+                elif el.tag.startswith('pc'):
+                    if source_raw_text is None:
+                        add_source(str(svala_i), source_i, sentence_string_id_split, source, el)
+                    if target_raw_text is None:
+                        add_target(str(svala_i), target_i, sentence_string_id_split, target, el)
+                    # add_edges(source_id, target_id, svala_data, edges, source_token_id, target_token_id)
+
+                    svala_i += 1
+                    source_i += 1
+                    target_i += 1
+                elif el.tag.startswith('u'):
+                    if source_raw_text is None or target_raw_text is None:
+                        svala_i, source_i, target_i = add_errors_source_target_only(svala_i, source_i, target_i, el, source, target, svala_data, sentence_string_id)
+                    else:
+                        svala_i, source_i, target_i = add_errors_func(svala_i, source_i, target_i, el, source, target,
+                                                                      svala_data, sentence_string_id)
+                elif el.tag.startswith('c'):
+                    if len(source) > 0:
+                        source[-1]['space_after'] = True
+                    if len(target) > 0:
+                        target[-1]['space_after'] = True
+
+        if source_raw_text is not None and sentence_id - 1 < len(source_res):
+            source = source_res[sentence_id - 1]
+            update_ids(f'{sentence_string_id_split[0]}s.{".".join(sentence_string_id_split[1:])}', source)
+            par_source.append(source)
+        if len(source) > 0:
+            source_conllu = create_conllu(source, sentence_string_id)
+        if target_raw_text is not None and sentence_id - 1 < len(target_res):
+            target = target_res[sentence_id - 1]
+            update_ids(f'{sentence_string_id_split[0]}t.{".".join(sentence_string_id_split[1:])}', target)
+            par_target.append(target)
+
+        if source_raw_text is None:
+            par_source.append(source)
+        if target_raw_text is None:
+            par_target.append(target)
+
+        if len(target) > 0:
+            target_conllu = create_conllu(target, sentence_string_id)
+
+        if len(source) > 0:
+            source_conllu_annotated = nlp(source_conllu).to_conll()
+        if len(target) > 0:
+            target_conllu_annotated = nlp(target_conllu).to_conll()
+
+        if len(source) > 0:
+            complete_source_conllu += source_conllu_annotated
+        if len(target) > 0:
+            complete_target_conllu += target_conllu_annotated
+
+        if len(source) > 0:
+            source_conllu_parsed = conllu.parse(source_conllu_annotated)[0]
+        if len(target) > 0:
+            target_conllu_parsed = conllu.parse(target_conllu_annotated)[0]
+
+        if len(source) > 0:
+            etree_source_sentences.append(construct_sentence_from_list(str(sentence_id), source_conllu_parsed, True))
+        if len(target) > 0:
+            etree_target_sentences.append(construct_sentence_from_list(str(sentence_id), target_conllu_parsed, False))
+
+    # reannotate svala_ids
+    if source_raw_text is None:
+        map_svala_solar2(svala_data['source'], par_source)
+    if target_raw_text is None:
+        map_svala_solar2(svala_data['target'], par_target)
+
+    sentence_edges = create_edges(svala_data, par_source, par_target)
+
+    return etree_source_sentences, etree_target_sentences, sentence_edges
+
+
 def process_file(et, args, nlp, nlp_tokenize):
     if os.path.exists(args.results_folder):
         shutil.rmtree(args.results_folder)
@@ -735,10 +813,19 @@ def process_file(et, args, nlp, nlp_tokenize):
     complete_target_conllu = ''
 
     document_edges = []
+    filename_encountered = False
+    i = 0
+    folders_count = 5484
     for div in et.iter('div'):
         bibl = div.find('bibl')
         file_name = bibl.get('n')
         file_name = file_name.replace('/', '_')
+        print(f'{i*100/folders_count} % : {file_name}')
+        i += 1
+        # if file_name == 'S20-PI-slo-2-SG-D-2016_2017-30479-12.txt':
+        #     filename_encountered = True
+        # if not filename_encountered:
+        #     continue
 
         svala_path = os.path.join(args.svala_folder, file_name)
         corrected_svala_path = os.path.join(args.corrected_svala_folder, file_name)
@@ -750,6 +837,12 @@ def process_file(et, args, nlp, nlp_tokenize):
         svala_list = [[fname[:-13], fname] if 'problem' in fname else [fname[:-5], fname] for fname in os.listdir(svala_path)]
         svala_dict = {e[0]: e[1] for e in svala_list}
 
+        if os.path.exists(corrected_svala_path):
+            corrected_svala_list = [[fname[:-13], fname] if 'problem' in fname else [fname[:-5], fname] for fname in os.listdir(corrected_svala_path)]
+            corrected_svala_dict = {e[0]: e[1] for e in corrected_svala_list}
+
+            svala_dict.update(corrected_svala_dict)
+
         etree_source_paragraphs = []
         etree_target_paragraphs = []
         paragraph_edges = []
@@ -760,10 +853,12 @@ def process_file(et, args, nlp, nlp_tokenize):
             svala_i = 1
 
             # read json
+            # if paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] == 'solar5.7':
+            #     print('here')
             svala_file = os.path.join(svala_path, svala_dict[paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id']])
            corrected_svala_file = os.path.join(corrected_svala_path, svala_dict[paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id']])
-            if os.path.exists(corrected_svala_file):
-                print('aaa')
+            # if os.path.exists(corrected_svala_file):
+            #     print('aaa')
             add_errors_func = add_errors if not os.path.exists(corrected_svala_file) else add_errors1_0_1
             jf = open(svala_file) if not os.path.exists(corrected_svala_file) else open(corrected_svala_file)
             svala_data = json.load(jf)
@@ -793,16 +888,21 @@ def process_file(et, args, nlp, nlp_tokenize):
         etree_target_divs.append((etree_target_paragraphs, copy.deepcopy(etree_bibl)))
         document_edges.append(paragraph_edges)
 
+    print('APPENDING DOCUMENT...')
     etree_source_documents.append(TeiDocument(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 's', etree_source_divs))
     etree_target_documents.append(TeiDocument(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 't', etree_target_divs))
 
+    print('BUILDING TEI DOCUMENTS...')
     etree_source = build_tei_etrees(etree_source_documents)
     etree_target = build_tei_etrees(etree_target_documents)
 
+    print('BUILDING LINKS...')
     etree_links = build_links(document_edges)
 
+    print('BUILDING COMPLETE TEI...')
     complete_etree = build_complete_tei(copy.deepcopy(etree_source), copy.deepcopy(etree_target), etree_links)
 
+    print('WRITING FILES')
     with open(os.path.join(args.results_folder, f"source.conllu"), 'w') as sf:
         sf.write(complete_source_conllu)
 
@@ -841,7 +941,7 @@ if __name__ == '__main__':
                         help='input file in (gz or xml currently). If none, then just database is loaded')
     parser.add_argument('--svala_folder', default='data/solar.svala',
                         help='input file in (gz or xml currently). If none, then just database is loaded')
-    parser.add_argument('--corrected_svala_folder', default='data/solar.svala.fixed.1.0.1',
+    parser.add_argument('--corrected_svala_folder', default='data/solar.svala.fixed.1.0.1_2',
                         help='input file in (gz or xml currently). If none, then just database is loaded')
     parser.add_argument('--results_folder', default='data/results/solar3.0',
                         help='input file in (gz or xml currently). If none, then just database is loaded')
diff --git a/svala_formatter/copy_svala_handchecked_files.py b/svala_formatter/copy_svala_handchecked_files.py
index a260af6..746cefe 100644
--- a/svala_formatter/copy_svala_handchecked_files.py
+++ b/svala_formatter/copy_svala_handchecked_files.py
@@ -38,12 +38,23 @@ def compare_files(corrected_file, original_file):
 def main(args):
     # create mapper to corrected files
     corrected_files_mapper = {}
+    filename_encountered = False
    for foldername in os.listdir(args.corrected_folder):
         orig_name = 'KUS' + foldername.split('KUS')[1]
+        # if orig_name == 'KUS-G-slo-4-GO-E-2009-10105':
+        #     filename_encountered = True
+        # if not filename_encountered:
+        #     continue
         corrected_files_mapper[orig_name] = foldername
 
+    filename_encountered = False
     for foldername in os.listdir(args.original_folder):
+        # if foldername == 'KUS-G-slo-4-GO-E-2009-10105':
+        #     filename_encountered = True
+        # if not filename_encountered:
+        #     continue
         for filename in os.listdir(os.path.join(args.original_folder, foldername)):
+            fixed = False
             of = os.path.join(args.original_folder, foldername, filename)
             copy_filename = filename
             if filename.endswith('_problem.json'):
@@ -55,12 +66,13 @@ def main(args):
             if filename.endswith('_problem.json'):
                 new_filename = filename[:-13] + '_popravljeno.json'
                 if os.path.exists(os.path.join(args.corrected_folder, corrected_files_mapper[foldername], new_filename)):
+                    fixed = True
                     filename = new_filename
 
             cf = os.path.join(args.corrected_folder, corrected_files_mapper[foldername], filename)
             cor_files = read_json(cf)
             ori_files = read_json(of)
             target, source = compare_files(cor_files, ori_files)
-            if target or source:
+            if target or source or fixed:
                 if not os.path.exists(cpfol):
                     os.mkdir(cpfol)
                 shutil.copyfile(cf, cpf)
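
Note: the edges_of_one_type bookkeeping introduced in create_edges above assumes SVALA token ids begin with 's' (source side) or 't' (target side) and marks every edge whose ids touch only one side, so it is skipped when the edge order is built. A minimal standalone sketch of that classification, using an invented toy edges dict purely for illustration (it is not taken from the corpus or from this patch):

# Illustrative sketch only: mirrors the edges_of_one_type logic added to create_edges.
# An edge is "one-type" when its ids reference only source ('s...') or only
# target ('t...') tokens, so it cannot link the two sides of the parallel graph.
def one_type_edges(edges):
    one_type = set()
    for edge_id, edge in edges.items():
        has_source = any(el[0] == 's' for el in edge['ids'])
        has_target = any(el[0] == 't' for el in edge['ids'])
        if not has_source or not has_target:
            one_type.add(edge_id)
    return one_type


# Toy input (invented ids, purely for illustration).
edges = {
    'e-s1-t1': {'ids': ['s1', 't1'], 'labels': []},  # links both sides
    'e-s2': {'ids': ['s2'], 'labels': []},           # source-only edge
    'e-t3': {'ids': ['t3'], 'labels': []},           # target-only edge
}
assert one_type_edges(edges) == {'e-s2', 'e-t3'}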