Updated code for KOST

This commit is contained in:
Luka 2022-12-12 10:23:28 +01:00
parent cc455b2558
commit eb0ea39415
6 changed files with 125 additions and 169 deletions

View File

@ -5,7 +5,7 @@ import classla
def annotate(tokenized_source_divs, tokenized_target_divs, args):
if os.path.exists(args.annotation_interprocessing) and not args.overwrite_annotation:
print('READING...')
print('READING ANNOTATIONS...')
with open(args.annotation_interprocessing, 'rb') as rp:
annotated_source_divs, annotated_target_divs = pickle.load(rp)
return annotated_source_divs, annotated_target_divs
@ -16,32 +16,38 @@ def annotate(tokenized_source_divs, tokenized_target_divs, args):
annotated_source_divs = []
complete_source_conllu = ''
print('ANNOTATING SOURCE...')
for i, div in enumerate(tokenized_source_divs):
for i, div_tuple in enumerate(tokenized_source_divs):
print(f'{str(i*100/len(tokenized_source_divs))}')
div_name, div = div_tuple
annotated_source_pars = []
for par in div:
for par_tuple in div:
par_name, par = par_tuple
annotated_source_sens = []
for sen in par:
source_conllu_annotated = nlp(sen).to_conll() if sen else ''
annotated_source_sens.append(source_conllu_annotated)
complete_source_conllu += source_conllu_annotated
annotated_source_pars.append(annotated_source_sens)
annotated_source_divs.append(annotated_source_pars)
annotated_source_pars.append((par_name, annotated_source_sens))
annotated_source_divs.append((div_name, annotated_source_pars))
annotated_target_divs = []
complete_target_conllu = ''
print('ANNOTATING TARGET...')
for i, div in enumerate(tokenized_target_divs):
for i, div_tuple in enumerate(tokenized_target_divs):
print(f'{str(i * 100 / len(tokenized_target_divs))}')
div_name, div = div_tuple
annotated_target_pars = []
for par in div:
for par_tuple in div:
par_name, par = par_tuple
annotated_target_sens = []
for sen in par:
target_conllu_annotated = nlp(sen).to_conll() if sen else ''
# if sen.count('\n') <= 2:
# print('HERE!!!!')
target_conllu_annotated = nlp(sen).to_conll() if sen and sen.count('\n') > 2 else ''
annotated_target_sens.append(target_conllu_annotated)
complete_target_conllu += target_conllu_annotated
annotated_target_pars.append(annotated_target_sens)
annotated_target_divs.append(annotated_target_pars)
annotated_target_pars.append((par_name, annotated_target_sens))
annotated_target_divs.append((div_name, annotated_target_pars))
with open(os.path.join(args.results_folder, f"source.conllu"), 'w') as sf:
sf.write(complete_source_conllu)

View File

@ -176,9 +176,9 @@ class Sentence:
class Paragraph:
def __init__(self, _id, _doc_id, is_source):
def __init__(self, _id, _doc_id):
self._id = _id if _id is not None else 'no-id'
_doc_id += 's' if is_source else 't'
# _doc_id += 's' if is_source else 't'
self._doc_id = _doc_id if _doc_id is not None else ''
self.sentences = []
@ -231,16 +231,14 @@ class TeiDocument:
tag_usage.set('gi', tag)
tag_usage.set('occurs', str(count))
for (paras, bibl, div_id), (_, _, corresp_div_id) in zip(self.divs, self.corresp_divs):
for (paras, div_id), (_, corresp_div_id) in zip(self.divs, self.corresp_divs):
div = etree.Element('div')
set_xml_attr(div, 'id', div_id)
div.set('corresp', f'#{corresp_div_id}')
div.append(bibl)
for para in paras:
div.append(para.as_xml())
body.append(div)
return root
def add_paragraph(self, paragraph):
@ -301,47 +299,56 @@ def build_links(all_edges):
body = etree.Element('standOff')
for document_edges in all_edges:
# if len(document_edges) > 1:
# print('here')
# mine paragraphs
for paragraph_edges in document_edges:
for sentence_edges in paragraph_edges:
s = etree.Element('linkGrp')
p = etree.Element('linkGrp')
paragraph_id = ''
corresp_source_id = ''
corresp_target_id = ''
corresp = []
# for sentence_edges in paragraph_edges:
#
for token_edges in paragraph_edges:
if not corresp_source_id and len(token_edges['source_ids']) > 0:
random_source_id = token_edges['source_ids'][0]
corresp_source_id = '#'
# corresp_source_id += '.'.join(random_source_id.split('.')[:3])
corresp_source_id += '.'.join(random_source_id.split('.')[:2])
corresp.append(corresp_source_id)
if not corresp_target_id and len(token_edges['target_ids']) > 0:
random_target_id = token_edges['target_ids'][0]
corresp_target_id = '#'
corresp_target_id += '.'.join(random_target_id.split('.')[:2])
# corresp_target_id += random_target_id.split('.')[0]
corresp.append(corresp_target_id)
link = etree.Element('link')
# translate labels
labels_list = []
for label in token_edges['labels']:
if label in labels_mapper:
labels_list.append(labels_mapper[label])
else:
labels_list.append(label)
labels = '|'.join(labels_list) if len(labels_list) > 0 else 'ID'
link.set('type', labels)
link.set('target', ' '.join(['#' + source for source in token_edges['source_ids']] + ['#' + source for source in token_edges['target_ids']]))
sentence_id = ''
corresp_source_id = ''
corresp_target_id = ''
corresp = []
for token_edges in sentence_edges:
if not corresp_source_id and len(token_edges['source_ids']) > 0:
random_source_id = token_edges['source_ids'][0]
corresp_source_id = '#'
corresp_source_id += '.'.join(random_source_id.split('.')[:3])
corresp.append(corresp_source_id)
if not corresp_target_id and len(token_edges['target_ids']) > 0:
random_target_id = token_edges['target_ids'][0]
corresp_target_id = '#'
corresp_target_id += '.'.join(random_target_id.split('.')[:3])
corresp.append(corresp_target_id)
link = etree.Element('link')
# translate labels
labels_list = []
for label in token_edges['labels']:
if label in labels_mapper:
labels_list.append(labels_mapper[label])
else:
labels_list.append(label)
labels = '|'.join(labels_list) if len(labels_list) > 0 else 'ID'
link.set('type', labels)
link.set('target', ' '.join(['#' + source for source in token_edges['source_ids']] + ['#' + source for source in token_edges['target_ids']]))
s.append(link)
s.set('type', 'CORR')
targFunc = []
if corresp_source_id:
targFunc.append('orig')
if corresp_target_id:
targFunc.append('reg')
s.set('targFunc', f'{" ".join(targFunc)}')
s.set('corresp', f'{" ".join(corresp)}')
body.append(s)
p.append(link)
p.set('type', 'CORR')
targFunc = []
if corresp_source_id:
targFunc.append('orig')
if corresp_target_id:
targFunc.append('reg')
p.set('targFunc', f'{" ".join(targFunc)}')
p.set('corresp', f'{" ".join(corresp)}')
body.append(p)
return body
@ -365,8 +372,8 @@ def is_metaline(line):
return False
def construct_paragraph_from_list(doc_id, para_id, etree_source_sentences, source_id):
para = Paragraph(para_id, doc_id, source_id)
def construct_paragraph_from_list(doc_id, para_id, etree_source_sentences):
para = Paragraph(para_id, doc_id)
for sentence in etree_source_sentences:
para.add_sentence(sentence)
@ -374,29 +381,6 @@ def construct_paragraph_from_list(doc_id, para_id, etree_source_sentences, sourc
return para
def construct_paragraph(doc_id, para_id, conllu_lines, is_source):
para = Paragraph(para_id, doc_id, is_source)
sent_id = None
sent_buffer = []
for line in conllu_lines:
if is_metaline(line):
key, val = parse_metaline(line)
if key == 'sent_id':
if len(sent_buffer) > 0:
para.add_sentence(construct_sentence(sent_id, sent_buffer))
sent_buffer = []
sent_id = val
elif not line.isspace():
sent_buffer.append(line)
if len(sent_buffer) > 0:
para.add_sentence(construct_sentence(sent_id, sent_buffer))
return para
def construct_sentence_from_list(sent_id, object_list, is_source):
sentence = Sentence(sent_id)
converter = Converter()

View File

@ -19,8 +19,8 @@ SKIP_IDS = ['solar2284s.1.1.1']
def create_edges(raw_edges, source_par, target_par):
source_mapper = {el['svala_id']: el['id'] for source in source_par for el in source}
target_mapper = {el['svala_id']: el['id'] for target in target_par for el in target}
source_mapper = {el['svala_id']: source[1] + '.' + str(el['id']) for source in source_par for el in source[0]}
target_mapper = {el['svala_id']: target[1] + '.' + str(el['id']) for target in target_par for el in target[0]}
# actually add edges
edges = []

View File

@ -13,6 +13,7 @@ from src.read.merge import merge, create_conllu, create_edges
from src.read.read import read_raw_text, map_svala_tokenized
from src.read.svala_data import SvalaData
alphabet = list(map(chr, range(97, 123)))
def add_error_token(el, out_list, sentence_string_id, out_list_i, out_list_ids, is_source, s_t_id):
sentence_string_id_split = sentence_string_id.split('.')
@ -245,7 +246,7 @@ def create_target(svala_data_object, source_tokenized):
def tokenize(args):
if os.path.exists(args.tokenization_interprocessing) and not args.overwrite_tokenization:
print('READING AND MERGING...')
print('READING TOKENIZATION...')
with open(args.tokenization_interprocessing, 'rb') as rp:
tokenized_source_divs, tokenized_target_divs, document_edges = pickle.load(rp)
return tokenized_source_divs, tokenized_target_divs, document_edges
@ -314,26 +315,35 @@ def tokenize(args):
# par_target = []
for tokenized_para in tokenized_divs[div_id]:
paragraph_name, source_res, target_res, edges = tokenized_para
split_para_name = paragraph_name[:-5].split('-')
div_name = '-'.join(split_para_name[:-1])
par_name = split_para_name[-1]
assert not par_name.isnumeric() or par_name not in alphabet, Exception('Incorrect paragraph name!')
if par_name in alphabet:
par_name = str(alphabet.index(par_name) + 10)
source_paragraphs = []
target_paragraphs = []
sen_source = []
sen_target = []
for sen_i, sen in enumerate(source_res):
source_conllu = create_conllu(sen, f'{paragraph_name[:-5]}.s{str(sen_i + 1)}')
source_sen_name = f'{div_name}s.{par_name}.{str(sen_i + 1)}'
source_conllu = create_conllu(sen, source_sen_name)
source_paragraphs.append(source_conllu)
sen_source.append(sen)
sen_source.append((sen, source_sen_name))
for sen_i, sen in enumerate(target_res):
target_conllu = create_conllu(sen, f'{paragraph_name}.t{str(sen_i)}')
target_sen_name = f'{div_name}t.{par_name}.{str(sen_i + 1)}'
target_conllu = create_conllu(sen, target_sen_name)
target_paragraphs.append(target_conllu)
sen_target.append(sen)
paragraph_edges.append(edges)
tokenized_source_paragraphs.append(source_paragraphs)
tokenized_target_paragraphs.append(target_paragraphs)
sen_target.append((sen, target_sen_name))
# paragraph_edges.append(edges)
tokenized_source_paragraphs.append((par_name, source_paragraphs))
tokenized_target_paragraphs.append((par_name, target_paragraphs))
paragraph_edges.append(create_edges(edges, sen_source, sen_target))
tokenized_source_divs.append(tokenized_source_paragraphs)
tokenized_target_divs.append(tokenized_target_paragraphs)
tokenized_source_divs.append((div_name+'s', tokenized_source_paragraphs))
tokenized_target_divs.append((div_name+'t', tokenized_target_paragraphs))
document_edges.append(paragraph_edges)

View File

@ -8,9 +8,34 @@ from src.create_tei import construct_sentence_from_list, \
construct_paragraph_from_list, TeiDocument, build_tei_etrees, build_links, build_complete_tei, convert_bibl
def form_paragraphs(annotated_source_divs):
    """Convert annotated divs into paragraph objects grouped per div.

    Args:
        annotated_source_divs: iterable of ``(div_name, paragraphs)`` tuples,
            where ``paragraphs`` is an iterable of ``(par_name, sentences)``
            tuples and each sentence is a CoNLL-U string (possibly empty).

    Returns:
        A ``(divs, last_div_name)`` tuple. ``divs`` is a list of
        ``(paragraph_objects, div_name)`` tuples in input order, and
        ``last_div_name`` is the name of the last div processed, or ``None``
        when the input holds no divs.
    """
    etree_source_divs = []
    # Bind the name up front: without this, an empty input left `div_name`
    # unassigned and the final `return` raised UnboundLocalError.
    div_name = None

    for div_name, div in annotated_source_divs:
        etree_source_paragraphs = []

        for par_name, paragraph in div:
            etree_source_sentences = []

            for sentence_id, sentence in enumerate(paragraph):
                # Empty annotations are skipped, but they still consume a
                # position so the remaining sentence ids stay aligned.
                if len(sentence) == 0:
                    continue
                conllu_parsed = conllu.parse(sentence)[0]
                # NOTE(review): `True` is passed as is_source even when this
                # function is called for target divs - confirm that is intended.
                etree_source_sentences.append(
                    construct_sentence_from_list(str(sentence_id + 1), conllu_parsed, True))

            etree_source_paragraphs.append(
                construct_paragraph_from_list(div_name, par_name, etree_source_sentences))

        etree_source_divs.append((etree_source_paragraphs, div_name))

    return etree_source_divs, div_name
def write_tei(annotated_source_divs, annotated_target_divs, document_edges, args):
print('BUILDING LINKS...')
etree_links = build_links(document_edges)
# print('BUILDING LINKS...')
# etree_links = build_links(document_edges)
with open(os.path.join(args.results_folder, f"links.xml"), 'w') as tf:
tf.write(etree.tostring(etree_links, pretty_print=True, encoding='utf-8').decode())
@ -18,91 +43,22 @@ def write_tei(annotated_source_divs, annotated_target_divs, document_edges, args
with open(os.path.join(args.results_folder, f"links.json"), 'w') as jf:
json.dump(document_edges, jf, ensure_ascii=False, indent=" ")
print('WRITTING TEI...')
etree_source_documents = []
etree_target_documents = []
etree_source_divs = []
etree_target_divs = []
# with open(args.solar_file, 'r') as fp:
# logging.info(args.solar_file)
# et = ElementTree.XML(fp.read())
print('WRITING SOURCE FILES...')
etree_source_divs, source_div_name = form_paragraphs(annotated_source_divs)
# filename_encountered = False
i = 0
folders_count = 5484
div_i = 0
for div in et.iter('div'):
bibl = div.find('bibl')
file_name = bibl.get('n')
file_name = file_name.replace('/', '_')
print(f'{i * 100 / folders_count} % : {file_name}')
i += 1
# if i * 100 / folders_count > 50:
# filename_encountered = True
# # if file_name == 'KUS-G-slo-4-GO-E-2009-10071':
# # filename_encountered = True
# if i * 100 / folders_count > 51:
# filename_encountered = False
#
# if file_name == 'KUS-G-slo-1-LJ-E-2009_2010-10540':
# # div_i -= 1
# continue
#
# if file_name == 'KUS-SI-slo-2-NM-E-2009_2010-20362' or file_name == 'KUS-OS-slo-9-SG-R-2009_2010-40129' or file_name == 'KUS-OS-slo-7-SG-R-2009_2010-40173':
# # div_i -= 1
# continue
#
# if not filename_encountered:
# div_i+=1
#
# continue
etree_source_paragraphs = []
etree_target_paragraphs = []
# paragraph_edges = []
paragraphs = div.findall('p')
par_i = 0
for paragraph in paragraphs:
etree_source_sentences = []
etree_target_sentences = []
for sentence_id, source_conllu_annotated in enumerate(annotated_source_divs[div_i][par_i]):
if len(source_conllu_annotated) > 0:
source_conllu_parsed = conllu.parse(source_conllu_annotated)[0]
if len(source_conllu_annotated) > 0:
etree_source_sentences.append(construct_sentence_from_list(str(sentence_id + 1), source_conllu_parsed, True))
for sentence_id, target_conllu_annotated in enumerate(annotated_target_divs[div_i][par_i]):
if len(target_conllu_annotated) > 0:
target_conllu_parsed = conllu.parse(target_conllu_annotated)[0]
if len(target_conllu_annotated) > 0:
etree_target_sentences.append(construct_sentence_from_list(str(sentence_id + 1), target_conllu_parsed, False))
etree_source_paragraphs.append(construct_paragraph_from_list(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0], paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[1], etree_source_sentences, True))
etree_target_paragraphs.append(construct_paragraph_from_list(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0], paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[1], etree_target_sentences, False))
par_i += 1
etree_bibl = convert_bibl(bibl)
etree_source_divs.append((etree_source_paragraphs, copy.deepcopy(etree_bibl), paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 's'))
etree_target_divs.append((etree_target_paragraphs, copy.deepcopy(etree_bibl), paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 't'))
div_i += 1
print('WRITING TARGET FILES...')
etree_target_divs, target_div_name = form_paragraphs(annotated_target_divs)
print('APPENDING DOCUMENT...')
etree_source_documents.append(
TeiDocument(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 's',
TeiDocument(source_div_name,
etree_source_divs, etree_target_divs))
etree_target_documents.append(
TeiDocument(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 't',
TeiDocument(target_div_name,
etree_target_divs, etree_source_divs))
print('BUILDING TEI DOCUMENTS...')

View File

@ -249,7 +249,7 @@ if __name__ == '__main__':
description='Read already processed xmls, erase entries without examples and limit gigafida examples to 1 per entry.')
parser.add_argument('--svala_folder', default='data/KOST/svala',
help='input file in (gz or xml currently). If none, then just database is loaded')
parser.add_argument('--results_folder', default='data/results/solar3.0',
parser.add_argument('--results_folder', default='data/KOST/results',
help='input file in (gz or xml currently). If none, then just database is loaded')
parser.add_argument('--raw_text', default='data/KOST/raw',
help='input file in (gz or xml currently). If none, then just database is loaded')