From eb0ea39415bdff4ba99cb8e539044f1256a2bb57 Mon Sep 17 00:00:00 2001
From: Luka
Date: Mon, 12 Dec 2022 10:23:28 +0100
Subject: [PATCH] Updated code for KOST

---
 src/annotate/annotate.py   |  26 +++++---
 src/create_tei.py          | 124 ++++++++++++++++----------------------
 src/read/merge.py          |   4 +-
 src/read/read_and_merge.py |  30 ++++++---
 src/write/write.py         | 110 ++++++++++----------------------
 svala2tei.py               |   2 +-
 6 files changed, 126 insertions(+), 170 deletions(-)

diff --git a/src/annotate/annotate.py b/src/annotate/annotate.py
index 4888215..a5f635b 100644
--- a/src/annotate/annotate.py
+++ b/src/annotate/annotate.py
@@ -5,7 +5,7 @@ import classla
 
 def annotate(tokenized_source_divs, tokenized_target_divs, args):
     if os.path.exists(args.annotation_interprocessing) and not args.overwrite_annotation:
-        print('READING...')
+        print('READING ANNOTATIONS...')
         with open(args.annotation_interprocessing, 'rb') as rp:
             annotated_source_divs, annotated_target_divs = pickle.load(rp)
         return annotated_source_divs, annotated_target_divs
@@ -16,32 +16,38 @@ def annotate(tokenized_source_divs, tokenized_target_divs, args):
     annotated_source_divs = []
     complete_source_conllu = ''
     print('ANNOTATING SOURCE...')
-    for i, div in enumerate(tokenized_source_divs):
+    for i, div_tuple in enumerate(tokenized_source_divs):
         print(f'{str(i*100/len(tokenized_source_divs))}')
+        div_name, div = div_tuple
         annotated_source_pars = []
-        for par in div:
+        for par_tuple in div:
+            par_name, par = par_tuple
             annotated_source_sens = []
             for sen in par:
                 source_conllu_annotated = nlp(sen).to_conll() if sen else ''
                 annotated_source_sens.append(source_conllu_annotated)
                 complete_source_conllu += source_conllu_annotated
-            annotated_source_pars.append(annotated_source_sens)
-        annotated_source_divs.append(annotated_source_pars)
+            annotated_source_pars.append((par_name, annotated_source_sens))
+        annotated_source_divs.append((div_name, annotated_source_pars))
 
     annotated_target_divs = []
     complete_target_conllu = ''
     print('ANNOTATING TARGET...')
-    for i, div in enumerate(tokenized_target_divs):
+    for i, div_tuple in enumerate(tokenized_target_divs):
         print(f'{str(i * 100 / len(tokenized_target_divs))}')
+        div_name, div = div_tuple
         annotated_target_pars = []
-        for par in div:
+        for par_tuple in div:
+            par_name, par = par_tuple
             annotated_target_sens = []
             for sen in par:
-                target_conllu_annotated = nlp(sen).to_conll() if sen else ''
+                # annotate only snippets with more than two newlines, i.e. ones
+                # that contain at least one token line besides the metadata
+                target_conllu_annotated = nlp(sen).to_conll() if sen and sen.count('\n') > 2 else ''
                 annotated_target_sens.append(target_conllu_annotated)
                 complete_target_conllu += target_conllu_annotated
-            annotated_target_pars.append(annotated_target_sens)
-        annotated_target_divs.append(annotated_target_pars)
+            annotated_target_pars.append((par_name, annotated_target_sens))
+        annotated_target_divs.append((div_name, annotated_target_pars))
 
     with open(os.path.join(args.results_folder, f"source.conllu"), 'w') as sf:
         sf.write(complete_source_conllu)
diff --git a/src/create_tei.py b/src/create_tei.py
index 6edbf02..686f524 100644
--- a/src/create_tei.py
+++ b/src/create_tei.py
@@ -176,9 +176,9 @@ class Sentence:
 
 
 class Paragraph:
-    def __init__(self, _id, _doc_id, is_source):
+    def __init__(self, _id, _doc_id):
         self._id = _id if _id is not None else 'no-id'
-        _doc_id += 's' if is_source else 't'
+        # _doc_id += 's' if is_source else 't'
         self._doc_id = _doc_id if _doc_id is not None else ''
         self.sentences = []
 
@@ -231,16 +231,14 @@ class TeiDocument:
                 tag_usage.set('gi', tag)
                 tag_usage.set('occurs', str(count))
 
-        for (paras, bibl, div_id), (_, _, corresp_div_id) in zip(self.divs, self.corresp_divs):
+        for (paras, div_id), (_, corresp_div_id) in zip(self.divs, self.corresp_divs):
             div = etree.Element('div')
             set_xml_attr(div, 'id', div_id)
             div.set('corresp', f'#{corresp_div_id}')
-            div.append(bibl)
             for para in paras:
                 div.append(para.as_xml())
             body.append(div)
-
         return root
 
     def add_paragraph(self, paragraph):
@@ -301,47 +299,56 @@ def build_links(all_edges):
     body = etree.Element('standOff')
 
     for document_edges in all_edges:
+
+
+
+        # one <linkGrp> is built per paragraph (it used to be one per sentence);
+        # its corresp points at the source and target paragraph ids
+
+        # mine paragraphs
         for paragraph_edges in document_edges:
-            for sentence_edges in paragraph_edges:
-                s = etree.Element('linkGrp')
-
-                sentence_id = ''
-                corresp_source_id = ''
-                corresp_target_id = ''
-                corresp = []
-                for token_edges in sentence_edges:
-                    if not corresp_source_id and len(token_edges['source_ids']) > 0:
-                        random_source_id = token_edges['source_ids'][0]
-                        corresp_source_id = '#'
-                        corresp_source_id += '.'.join(random_source_id.split('.')[:3])
-                        corresp.append(corresp_source_id)
-                    if not corresp_target_id and len(token_edges['target_ids']) > 0:
-                        random_target_id = token_edges['target_ids'][0]
-                        corresp_target_id = '#'
-                        corresp_target_id += '.'.join(random_target_id.split('.')[:3])
-                        corresp.append(corresp_target_id)
-                    link = etree.Element('link')
-                    # translate labels
-                    labels_list = []
-                    for label in token_edges['labels']:
-                        if label in labels_mapper:
-                            labels_list.append(labels_mapper[label])
-                        else:
-                            labels_list.append(label)
-                    labels = '|'.join(labels_list) if len(labels_list) > 0 else 'ID'
-                    link.set('type', labels)
-                    link.set('target', ' '.join(['#' + source for source in token_edges['source_ids']] + ['#' + source for source in token_edges['target_ids']]))
-
-                    s.append(link)
-                s.set('type', 'CORR')
-                targFunc = []
-                if corresp_source_id:
-                    targFunc.append('orig')
-                if corresp_target_id:
-                    targFunc.append('reg')
-                s.set('targFunc', f'{" ".join(targFunc)}')
-                s.set('corresp', f'{" ".join(corresp)}')
-                body.append(s)
+            p = etree.Element('linkGrp')
+            paragraph_id = ''
+            corresp_source_id = ''
+            corresp_target_id = ''
+            corresp = []
+            # for sentence_edges in paragraph_edges:
+            #
+            for token_edges in paragraph_edges:
+                if not corresp_source_id and len(token_edges['source_ids']) > 0:
+                    random_source_id = token_edges['source_ids'][0]
+                    corresp_source_id = '#'
+                    # corresp_source_id += '.'.join(random_source_id.split('.')[:3])
+                    corresp_source_id += '.'.join(random_source_id.split('.')[:2])
+                    corresp.append(corresp_source_id)
+                if not corresp_target_id and len(token_edges['target_ids']) > 0:
+                    random_target_id = token_edges['target_ids'][0]
+                    corresp_target_id = '#'
+                    corresp_target_id += '.'.join(random_target_id.split('.')[:2])
+                    # corresp_target_id += random_target_id.split('.')[0]
+                    corresp.append(corresp_target_id)
+                link = etree.Element('link')
+                # translate labels
+                labels_list = []
+                for label in token_edges['labels']:
+                    if label in labels_mapper:
+                        labels_list.append(labels_mapper[label])
+                    else:
+                        labels_list.append(label)
+                labels = '|'.join(labels_list) if len(labels_list) > 0 else 'ID'
+                link.set('type', labels)
+                link.set('target', ' '.join(['#' + source for source in token_edges['source_ids']] + ['#' + source for source in token_edges['target_ids']]))
+
+                p.append(link)
+            p.set('type', 'CORR')
+            targFunc = []
+            if corresp_source_id:
+                targFunc.append('orig')
+            if corresp_target_id:
+                targFunc.append('reg')
+            p.set('targFunc', f'{" ".join(targFunc)}')
+            p.set('corresp', f'{" ".join(corresp)}')
".join(corresp)}') + body.append(p) return body @@ -365,8 +372,8 @@ def is_metaline(line): return False -def construct_paragraph_from_list(doc_id, para_id, etree_source_sentences, source_id): - para = Paragraph(para_id, doc_id, source_id) +def construct_paragraph_from_list(doc_id, para_id, etree_source_sentences): + para = Paragraph(para_id, doc_id) for sentence in etree_source_sentences: para.add_sentence(sentence) @@ -374,29 +381,6 @@ def construct_paragraph_from_list(doc_id, para_id, etree_source_sentences, sourc return para -def construct_paragraph(doc_id, para_id, conllu_lines, is_source): - para = Paragraph(para_id, doc_id, is_source) - - sent_id = None - sent_buffer = [] - - for line in conllu_lines: - if is_metaline(line): - key, val = parse_metaline(line) - if key == 'sent_id': - if len(sent_buffer) > 0: - para.add_sentence(construct_sentence(sent_id, sent_buffer)) - sent_buffer = [] - sent_id = val - elif not line.isspace(): - sent_buffer.append(line) - - if len(sent_buffer) > 0: - para.add_sentence(construct_sentence(sent_id, sent_buffer)) - - return para - - def construct_sentence_from_list(sent_id, object_list, is_source): sentence = Sentence(sent_id) converter = Converter() diff --git a/src/read/merge.py b/src/read/merge.py index 7d4503a..88a4492 100644 --- a/src/read/merge.py +++ b/src/read/merge.py @@ -19,8 +19,8 @@ SKIP_IDS = ['solar2284s.1.1.1'] def create_edges(raw_edges, source_par, target_par): - source_mapper = {el['svala_id']: el['id'] for source in source_par for el in source} - target_mapper = {el['svala_id']: el['id'] for target in target_par for el in target} + source_mapper = {el['svala_id']: source[1] + '.' + str(el['id']) for source in source_par for el in source[0]} + target_mapper = {el['svala_id']: target[1] + '.' + str(el['id']) for target in target_par for el in target[0]} # actually add edges edges = [] diff --git a/src/read/read_and_merge.py b/src/read/read_and_merge.py index 9dfde20..ddffea4 100644 --- a/src/read/read_and_merge.py +++ b/src/read/read_and_merge.py @@ -13,6 +13,7 @@ from src.read.merge import merge, create_conllu, create_edges from src.read.read import read_raw_text, map_svala_tokenized from src.read.svala_data import SvalaData +alphabet = list(map(chr, range(97, 123))) def add_error_token(el, out_list, sentence_string_id, out_list_i, out_list_ids, is_source, s_t_id): sentence_string_id_split = sentence_string_id.split('.') @@ -245,7 +246,7 @@ def create_target(svala_data_object, source_tokenized): def tokenize(args): if os.path.exists(args.tokenization_interprocessing) and not args.overwrite_tokenization: - print('READING AND MERGING...') + print('READING TOKENIZATION...') with open(args.tokenization_interprocessing, 'rb') as rp: tokenized_source_divs, tokenized_target_divs, document_edges = pickle.load(rp) return tokenized_source_divs, tokenized_target_divs, document_edges @@ -314,26 +315,35 @@ def tokenize(args): # par_target = [] for tokenized_para in tokenized_divs[div_id]: paragraph_name, source_res, target_res, edges = tokenized_para + split_para_name = paragraph_name[:-5].split('-') + div_name = '-'.join(split_para_name[:-1]) + par_name = split_para_name[-1] + assert not par_name.isnumeric() or par_name not in alphabet, Exception('Incorrect paragraph name!') + if par_name in alphabet: + par_name = str(alphabet.index(par_name) + 10) + source_paragraphs = [] target_paragraphs = [] sen_source = [] sen_target = [] for sen_i, sen in enumerate(source_res): - source_conllu = create_conllu(sen, f'{paragraph_name[:-5]}.s{str(sen_i + 1)}') 
+                source_sen_name = f'{div_name}s.{par_name}.{str(sen_i + 1)}'
+                source_conllu = create_conllu(sen, source_sen_name)
                 source_paragraphs.append(source_conllu)
-                sen_source.append(sen)
+                sen_source.append((sen, source_sen_name))
             for sen_i, sen in enumerate(target_res):
-                target_conllu = create_conllu(sen, f'{paragraph_name}.t{str(sen_i)}')
+                target_sen_name = f'{div_name}t.{par_name}.{str(sen_i + 1)}'
+                target_conllu = create_conllu(sen, target_sen_name)
                 target_paragraphs.append(target_conllu)
-                sen_target.append(sen)
-            paragraph_edges.append(edges)
-            tokenized_source_paragraphs.append(source_paragraphs)
-            tokenized_target_paragraphs.append(target_paragraphs)
+                sen_target.append((sen, target_sen_name))
+            # paragraph_edges.append(edges)
+            tokenized_source_paragraphs.append((par_name, source_paragraphs))
+            tokenized_target_paragraphs.append((par_name, target_paragraphs))
             paragraph_edges.append(create_edges(edges, sen_source, sen_target))
 
-        tokenized_source_divs.append(tokenized_source_paragraphs)
-        tokenized_target_divs.append(tokenized_target_paragraphs)
+        tokenized_source_divs.append((div_name+'s', tokenized_source_paragraphs))
+        tokenized_target_divs.append((div_name+'t', tokenized_target_paragraphs))
 
         document_edges.append(paragraph_edges)
 
diff --git a/src/write/write.py b/src/write/write.py
index d4ef389..0587c07 100644
--- a/src/write/write.py
+++ b/src/write/write.py
@@ -8,9 +8,34 @@ from src.create_tei import construct_sentence_from_list, \
     construct_paragraph_from_list, TeiDocument, build_tei_etrees, build_links, build_complete_tei, convert_bibl
 
 
+def form_paragraphs(annotated_source_divs):
+    etree_source_divs = []
+    for div_i, div_tuple in enumerate(annotated_source_divs):
+        div_name, div = div_tuple
+        # div is a list of (par_name, sentences) tuples produced by annotate();
+        # sentences are CoNLL-U strings, empty ones are skipped below
+
+        etree_source_paragraphs = []
+
+        for par_i, paragraph_tuple in enumerate(div):
+            par_name, paragraph = paragraph_tuple
+            etree_source_sentences = []
+
+            for sentence_id, sentence in enumerate(paragraph):
+                if len(sentence) > 0:
+                    conllu_parsed = conllu.parse(sentence)[0]
+                    etree_source_sentences.append(
+                        construct_sentence_from_list(str(sentence_id + 1), conllu_parsed, True))
+
+            etree_source_paragraphs.append(construct_paragraph_from_list(div_name, par_name, etree_source_sentences))
+
+        etree_source_divs.append((etree_source_paragraphs, div_name))
+
+    return etree_source_divs, div_name  # div_name of the last div doubles as the document id
+
+
 def write_tei(annotated_source_divs, annotated_target_divs, document_edges, args):
-    print('BUILDING LINKS...')
-    etree_links = build_links(document_edges)
+    # print('BUILDING LINKS...')
+    etree_links = build_links(document_edges)  # still needed below for links.xml
 
     with open(os.path.join(args.results_folder, f"links.xml"), 'w') as tf:
         tf.write(etree.tostring(etree_links, pretty_print=True, encoding='utf-8').decode())
@@ -18,91 +43,22 @@ def write_tei(annotated_source_divs, annotated_target_divs, document_edges, args
     with open(os.path.join(args.results_folder, f"links.json"), 'w') as jf:
         json.dump(document_edges, jf, ensure_ascii=False, indent=" ")
 
-    print('WRITTING TEI...')
     etree_source_documents = []
     etree_target_documents = []
-    etree_source_divs = []
-    etree_target_divs = []
-
-    # with open(args.solar_file, 'r') as fp:
-    #     logging.info(args.solar_file)
-    #     et = ElementTree.XML(fp.read())
-
-    # filename_encountered = False
-    i = 0
-    folders_count = 5484
-
-    div_i = 0
-    for div in et.iter('div'):
-        bibl = div.find('bibl')
-        file_name = bibl.get('n')
-        file_name = file_name.replace('/', '_')
-        print(f'{i * 100 / folders_count} % : {file_name}')
-        i += 1
-
-        # if i * 100 / folders_count > 50:
-        #     filename_encountered = True
-        # # if file_name == 'KUS-G-slo-4-GO-E-2009-10071':
-        # #     filename_encountered = True
-        # if i * 100 / folders_count > 51:
-        #     filename_encountered = False
-        #
-        # if file_name == 'KUS-G-slo-1-LJ-E-2009_2010-10540':
-        #     # div_i -= 1
-        #     continue
-        #
-        # if file_name == 'KUS-SI-slo-2-NM-E-2009_2010-20362' or file_name == 'KUS-OS-slo-9-SG-R-2009_2010-40129' or file_name == 'KUS-OS-slo-7-SG-R-2009_2010-40173':
-        #     # div_i -= 1
-        #     continue
-        #
-        # if not filename_encountered:
-        #     div_i+=1
-        #
-        #     continue
-
-
-        etree_source_paragraphs = []
-        etree_target_paragraphs = []
-        # paragraph_edges = []
-
-        paragraphs = div.findall('p')
-        par_i = 0
-        for paragraph in paragraphs:
-
-            etree_source_sentences = []
-            etree_target_sentences = []
-
-            for sentence_id, source_conllu_annotated in enumerate(annotated_source_divs[div_i][par_i]):
-                if len(source_conllu_annotated) > 0:
-                    source_conllu_parsed = conllu.parse(source_conllu_annotated)[0]
-                if len(source_conllu_annotated) > 0:
-                    etree_source_sentences.append(construct_sentence_from_list(str(sentence_id + 1), source_conllu_parsed, True))
-
-
-            for sentence_id, target_conllu_annotated in enumerate(annotated_target_divs[div_i][par_i]):
-                if len(target_conllu_annotated) > 0:
-                    target_conllu_parsed = conllu.parse(target_conllu_annotated)[0]
-                if len(target_conllu_annotated) > 0:
-                    etree_target_sentences.append(construct_sentence_from_list(str(sentence_id + 1), target_conllu_parsed, False))
-
-            etree_source_paragraphs.append(construct_paragraph_from_list(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0], paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[1], etree_source_sentences, True))
-            etree_target_paragraphs.append(construct_paragraph_from_list(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0], paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[1], etree_target_sentences, False))
-
-            par_i += 1
-        etree_bibl = convert_bibl(bibl)
-        etree_source_divs.append((etree_source_paragraphs, copy.deepcopy(etree_bibl), paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 's'))
-        etree_target_divs.append((etree_target_paragraphs, copy.deepcopy(etree_bibl), paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 't'))
+    print('WRITING SOURCE FILES...')
+    etree_source_divs, source_div_name = form_paragraphs(annotated_source_divs)
 
-        div_i += 1
+    print('WRITING TARGET FILES...')
+    etree_target_divs, target_div_name = form_paragraphs(annotated_target_divs)
 
     print('APPENDING DOCUMENT...')
     etree_source_documents.append(
-        TeiDocument(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 's',
+        TeiDocument(source_div_name,
                     etree_source_divs, etree_target_divs))
     etree_target_documents.append(
-        TeiDocument(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 't',
+        TeiDocument(target_div_name,
                     etree_target_divs, etree_source_divs))
 
     print('BUILDING TEI DOCUMENTS...')
diff --git a/svala2tei.py b/svala2tei.py
index a1111a2..0e7e387 100644
--- a/svala2tei.py
+++ b/svala2tei.py
@@ -249,7 +249,7 @@ if __name__ == '__main__':
                         description='Read already processed xmls, erase entries without examples and limit gigafida examples to 1 per entry.')
     parser.add_argument('--svala_folder', default='data/KOST/svala',
                         help='input file in (gz or xml currently). If none, then just database is loaded')
-    parser.add_argument('--results_folder', default='data/results/solar3.0',
+    parser.add_argument('--results_folder', default='data/KOST/results',
                         help='input file in (gz or xml currently). If none, then just database is loaded')
     parser.add_argument('--raw_text', default='data/KOST/raw',
                         help='input file in (gz or xml currently). If none, then just database is loaded')
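
For orientation, a minimal sketch (not part of the patch) of the name-carrying structure that tokenize() and annotate() now pass around and that form_paragraphs() consumes. All identifiers and the CoNLL-U snippet below are invented placeholders, not KOST data.

# Hypothetical example of the tuple nesting introduced above:
# a div is (div_name, paragraphs), a paragraph is (par_name, sentences),
# and every sentence is a CoNLL-U string ('' when empty).
annotated_source_divs = [
    ('doc-1s', [
        ('1', ['# sent_id = doc-1s.1.1\n# text = ...\n1\t...\n\n', '']),
    ]),
]

for div_name, paragraphs in annotated_source_divs:
    for par_name, sentences in paragraphs:
        for sen_i, sentence in enumerate(sentences, start=1):
            if sentence:
                # non-empty sentences are parsed with conllu.parse(...) and
                # wrapped by construct_sentence_from_list(str(sen_i), ..., True)
                pass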