Updated code for KOST
parent cc455b2558 · commit eb0ea39415
@@ -5,7 +5,7 @@ import classla
 def annotate(tokenized_source_divs, tokenized_target_divs, args):
     if os.path.exists(args.annotation_interprocessing) and not args.overwrite_annotation:
-        print('READING...')
+        print('READING ANNOTATIONS...')
         with open(args.annotation_interprocessing, 'rb') as rp:
             annotated_source_divs, annotated_target_divs = pickle.load(rp)
         return annotated_source_divs, annotated_target_divs
@@ -16,32 +16,38 @@ def annotate(tokenized_source_divs, tokenized_target_divs, args):
     annotated_source_divs = []
     complete_source_conllu = ''
     print('ANNOTATING SOURCE...')
-    for i, div in enumerate(tokenized_source_divs):
+    for i, div_tuple in enumerate(tokenized_source_divs):
         print(f'{str(i*100/len(tokenized_source_divs))}')
+        div_name, div = div_tuple
         annotated_source_pars = []
-        for par in div:
+        for par_tuple in div:
+            par_name, par = par_tuple
             annotated_source_sens = []
             for sen in par:
                 source_conllu_annotated = nlp(sen).to_conll() if sen else ''
                 annotated_source_sens.append(source_conllu_annotated)
                 complete_source_conllu += source_conllu_annotated
-            annotated_source_pars.append(annotated_source_sens)
-        annotated_source_divs.append(annotated_source_pars)
+            annotated_source_pars.append((par_name, annotated_source_sens))
+        annotated_source_divs.append((div_name, annotated_source_pars))

     annotated_target_divs = []
     complete_target_conllu = ''
     print('ANNOTATING TARGET...')
-    for i, div in enumerate(tokenized_target_divs):
+    for i, div_tuple in enumerate(tokenized_target_divs):
         print(f'{str(i * 100 / len(tokenized_target_divs))}')
+        div_name, div = div_tuple
         annotated_target_pars = []
-        for par in div:
+        for par_tuple in div:
+            par_name, par = par_tuple
             annotated_target_sens = []
             for sen in par:
-                target_conllu_annotated = nlp(sen).to_conll() if sen else ''
+                # if sen.count('\n') <= 2:
+                #     print('HERE!!!!')
+                target_conllu_annotated = nlp(sen).to_conll() if sen and sen.count('\n') > 2 else ''
                 annotated_target_sens.append(target_conllu_annotated)
                 complete_target_conllu += target_conllu_annotated
-            annotated_target_pars.append(annotated_target_sens)
-        annotated_target_divs.append(annotated_target_pars)
+            annotated_target_pars.append((par_name, annotated_target_sens))
+        annotated_target_divs.append((div_name, annotated_target_pars))

     with open(os.path.join(args.results_folder, f"source.conllu"), 'w') as sf:
         sf.write(complete_source_conllu)
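A minimal sketch (not part of the diff) of the nesting annotate() now consumes and returns after this change: each div becomes a (div_name, paragraphs) tuple and each paragraph a (par_name, sentences) tuple, so the names survive into later stages. The example values are hypothetical.

# hypothetical shape; the sentence payloads are whatever tokenize() produced per sentence
tokenized_source_divs = [
    ('KUS-0001s',                       # div name, 's' marks the source side
     [('1',                             # paragraph name
       ['<sentence 1 payload>', '<sentence 2 payload>'])]),
]
# annotate() returns the same nesting with CoNLL-U strings at the sentence level:
# [(div_name, [(par_name, [conllu_string, ...]), ...]), ...]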
@@ -176,9 +176,9 @@ class Sentence:


 class Paragraph:
-    def __init__(self, _id, _doc_id, is_source):
+    def __init__(self, _id, _doc_id):
         self._id = _id if _id is not None else 'no-id'
-        _doc_id += 's' if is_source else 't'
+        # _doc_id += 's' if is_source else 't'
         self._doc_id = _doc_id if _doc_id is not None else ''
         self.sentences = []

@@ -231,16 +231,14 @@ class TeiDocument:
             tag_usage.set('gi', tag)
             tag_usage.set('occurs', str(count))

-        for (paras, bibl, div_id), (_, _, corresp_div_id) in zip(self.divs, self.corresp_divs):
+        for (paras, div_id), (_, corresp_div_id) in zip(self.divs, self.corresp_divs):
             div = etree.Element('div')
             set_xml_attr(div, 'id', div_id)
             div.set('corresp', f'#{corresp_div_id}')
-            div.append(bibl)
             for para in paras:
                 div.append(para.as_xml())
             body.append(div)

         return root

     def add_paragraph(self, paragraph):
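For context, a small runnable sketch (not from the diff) of the <div> skeleton this loop now builds, with hypothetical IDs and without the dropped bibl child; set_xml_attr is assumed to set xml:id.

from lxml import etree

div = etree.Element('div')
div.set('{http://www.w3.org/XML/1998/namespace}id', 'KUS-0001s')  # hypothetical div id
div.set('corresp', '#KUS-0001t')                                  # pointer to the paired div
# paragraph elements returned by para.as_xml() are appended below this point
print(etree.tostring(div, pretty_print=True).decode())
# <div xml:id="KUS-0001s" corresp="#KUS-0001t"/>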
@@ -301,47 +299,56 @@ def build_links(all_edges):
     body = etree.Element('standOff')

     for document_edges in all_edges:

         # if len(document_edges) > 1:
         #     print('here')

         # mine paragraphs
         for paragraph_edges in document_edges:
-            for sentence_edges in paragraph_edges:
-                s = etree.Element('linkGrp')
+            p = etree.Element('linkGrp')
+            paragraph_id = ''
+            corresp_source_id = ''
+            corresp_target_id = ''
+            corresp = []
+            # for sentence_edges in paragraph_edges:
+            #
+            for token_edges in paragraph_edges:
+                if not corresp_source_id and len(token_edges['source_ids']) > 0:
+                    random_source_id = token_edges['source_ids'][0]
+                    corresp_source_id = '#'
+                    # corresp_source_id += '.'.join(random_source_id.split('.')[:3])
+                    corresp_source_id += '.'.join(random_source_id.split('.')[:2])
+                    corresp.append(corresp_source_id)
+                if not corresp_target_id and len(token_edges['target_ids']) > 0:
+                    random_target_id = token_edges['target_ids'][0]
+                    corresp_target_id = '#'
+                    corresp_target_id += '.'.join(random_target_id.split('.')[:2])
+                    # corresp_target_id += random_target_id.split('.')[0]
+                    corresp.append(corresp_target_id)
+                link = etree.Element('link')
+                # translate labels
+                labels_list = []
+                for label in token_edges['labels']:
+                    if label in labels_mapper:
+                        labels_list.append(labels_mapper[label])
+                    else:
+                        labels_list.append(label)
+                labels = '|'.join(labels_list) if len(labels_list) > 0 else 'ID'
+                link.set('type', labels)
+                link.set('target', ' '.join(['#' + source for source in token_edges['source_ids']] + ['#' + source for source in token_edges['target_ids']]))

-                sentence_id = ''
-                corresp_source_id = ''
-                corresp_target_id = ''
-                corresp = []
-                for token_edges in sentence_edges:
-                    if not corresp_source_id and len(token_edges['source_ids']) > 0:
-                        random_source_id = token_edges['source_ids'][0]
-                        corresp_source_id = '#'
-                        corresp_source_id += '.'.join(random_source_id.split('.')[:3])
-                        corresp.append(corresp_source_id)
-                    if not corresp_target_id and len(token_edges['target_ids']) > 0:
-                        random_target_id = token_edges['target_ids'][0]
-                        corresp_target_id = '#'
-                        corresp_target_id += '.'.join(random_target_id.split('.')[:3])
-                        corresp.append(corresp_target_id)
-                    link = etree.Element('link')
-                    # translate labels
-                    labels_list = []
-                    for label in token_edges['labels']:
-                        if label in labels_mapper:
-                            labels_list.append(labels_mapper[label])
-                        else:
-                            labels_list.append(label)
-                    labels = '|'.join(labels_list) if len(labels_list) > 0 else 'ID'
-                    link.set('type', labels)
-                    link.set('target', ' '.join(['#' + source for source in token_edges['source_ids']] + ['#' + source for source in token_edges['target_ids']]))

-                    s.append(link)
-                s.set('type', 'CORR')
-                targFunc = []
-                if corresp_source_id:
-                    targFunc.append('orig')
-                if corresp_target_id:
-                    targFunc.append('reg')
-                s.set('targFunc', f'{" ".join(targFunc)}')
-                s.set('corresp', f'{" ".join(corresp)}')
-                body.append(s)
+                p.append(link)
+            p.set('type', 'CORR')
+            targFunc = []
+            if corresp_source_id:
+                targFunc.append('orig')
+            if corresp_target_id:
+                targFunc.append('reg')
+            p.set('targFunc', f'{" ".join(targFunc)}')
+            p.set('corresp', f'{" ".join(corresp)}')
+            body.append(p)
     return body
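A rough sketch (not from the diff) of the paragraph-level <linkGrp> this version of build_links() produces, with hypothetical token IDs; corresp now points at the first two dot-separated segments of the IDs (paragraph level) rather than the first three (sentence level), and the link type falls back to 'ID' when no labels are present.

<linkGrp type="CORR" targFunc="orig reg" corresp="#KUS-0001s.1 #KUS-0001t.1">
  <link type="ID" target="#KUS-0001s.1.1.1 #KUS-0001t.1.1.1"/>
  <link type="ID" target="#KUS-0001s.1.1.2 #KUS-0001t.1.1.2"/>
</linkGrp>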
@@ -365,8 +372,8 @@ def is_metaline(line):
     return False


-def construct_paragraph_from_list(doc_id, para_id, etree_source_sentences, source_id):
-    para = Paragraph(para_id, doc_id, source_id)
+def construct_paragraph_from_list(doc_id, para_id, etree_source_sentences):
+    para = Paragraph(para_id, doc_id)

     for sentence in etree_source_sentences:
         para.add_sentence(sentence)
@@ -374,29 +381,6 @@ def construct_paragraph_from_list(doc_id, para_id, etree_source_sentences, sourc
     return para


-def construct_paragraph(doc_id, para_id, conllu_lines, is_source):
-    para = Paragraph(para_id, doc_id, is_source)
-
-    sent_id = None
-    sent_buffer = []
-
-    for line in conllu_lines:
-        if is_metaline(line):
-            key, val = parse_metaline(line)
-            if key == 'sent_id':
-                if len(sent_buffer) > 0:
-                    para.add_sentence(construct_sentence(sent_id, sent_buffer))
-                    sent_buffer = []
-                sent_id = val
-        elif not line.isspace():
-            sent_buffer.append(line)
-
-    if len(sent_buffer) > 0:
-        para.add_sentence(construct_sentence(sent_id, sent_buffer))
-
-    return para
-
-
 def construct_sentence_from_list(sent_id, object_list, is_source):
     sentence = Sentence(sent_id)
     converter = Converter()
@@ -19,8 +19,8 @@ SKIP_IDS = ['solar2284s.1.1.1']


 def create_edges(raw_edges, source_par, target_par):
-    source_mapper = {el['svala_id']: el['id'] for source in source_par for el in source}
-    target_mapper = {el['svala_id']: el['id'] for target in target_par for el in target}
+    source_mapper = {el['svala_id']: source[1] + '.' + str(el['id']) for source in source_par for el in source[0]}
+    target_mapper = {el['svala_id']: target[1] + '.' + str(el['id']) for target in target_par for el in target[0]}

     # actually add edges
     edges = []
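A minimal sketch (not from the diff) of the new mapper behaviour: with each paragraph element now a (token_list, sentence_name) pair, svala IDs map to fully qualified token IDs. The values are hypothetical.

# hypothetical input: each element of source_par is (token_list, sentence_name)
source_par = [
    ([{'svala_id': 's1', 'id': 1}, {'svala_id': 's2', 'id': 2}], 'KUS-0001s.1.1'),
]
source_mapper = {el['svala_id']: source[1] + '.' + str(el['id'])
                 for source in source_par for el in source[0]}
print(source_mapper)   # {'s1': 'KUS-0001s.1.1.1', 's2': 'KUS-0001s.1.1.2'}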
@@ -13,6 +13,7 @@ from src.read.merge import merge, create_conllu, create_edges
 from src.read.read import read_raw_text, map_svala_tokenized
 from src.read.svala_data import SvalaData

+alphabet = list(map(chr, range(97, 123)))

 def add_error_token(el, out_list, sentence_string_id, out_list_i, out_list_ids, is_source, s_t_id):
     sentence_string_id_split = sentence_string_id.split('.')
@@ -245,7 +246,7 @@ def create_target(svala_data_object, source_tokenized):

 def tokenize(args):
     if os.path.exists(args.tokenization_interprocessing) and not args.overwrite_tokenization:
-        print('READING AND MERGING...')
+        print('READING TOKENIZATION...')
         with open(args.tokenization_interprocessing, 'rb') as rp:
             tokenized_source_divs, tokenized_target_divs, document_edges = pickle.load(rp)
         return tokenized_source_divs, tokenized_target_divs, document_edges
@@ -314,26 +315,35 @@ def tokenize(args):
         # par_target = []
         for tokenized_para in tokenized_divs[div_id]:
             paragraph_name, source_res, target_res, edges = tokenized_para
+            split_para_name = paragraph_name[:-5].split('-')
+            div_name = '-'.join(split_para_name[:-1])
+            par_name = split_para_name[-1]
+            assert not par_name.isnumeric() or par_name not in alphabet, Exception('Incorrect paragraph name!')
+            if par_name in alphabet:
+                par_name = str(alphabet.index(par_name) + 10)

             source_paragraphs = []
             target_paragraphs = []
             sen_source = []
             sen_target = []
             for sen_i, sen in enumerate(source_res):
-                source_conllu = create_conllu(sen, f'{paragraph_name[:-5]}.s{str(sen_i + 1)}')
+                source_sen_name = f'{div_name}s.{par_name}.{str(sen_i + 1)}'
+                source_conllu = create_conllu(sen, source_sen_name)
                 source_paragraphs.append(source_conllu)
-                sen_source.append(sen)
+                sen_source.append((sen, source_sen_name))

             for sen_i, sen in enumerate(target_res):
-                target_conllu = create_conllu(sen, f'{paragraph_name}.t{str(sen_i)}')
+                target_sen_name = f'{div_name}t.{par_name}.{str(sen_i + 1)}'
+                target_conllu = create_conllu(sen, target_sen_name)
                 target_paragraphs.append(target_conllu)
-                sen_target.append(sen)
-            paragraph_edges.append(edges)
-            tokenized_source_paragraphs.append(source_paragraphs)
-            tokenized_target_paragraphs.append(target_paragraphs)
+                sen_target.append((sen, target_sen_name))
+            # paragraph_edges.append(edges)
+            tokenized_source_paragraphs.append((par_name, source_paragraphs))
+            tokenized_target_paragraphs.append((par_name, target_paragraphs))
+            paragraph_edges.append(create_edges(edges, sen_source, sen_target))

-        tokenized_source_divs.append(tokenized_source_paragraphs)
-        tokenized_target_divs.append(tokenized_target_paragraphs)
+        tokenized_source_divs.append((div_name+'s', tokenized_source_paragraphs))
+        tokenized_target_divs.append((div_name+'t', tokenized_target_paragraphs))

         document_edges.append(paragraph_edges)
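A short runnable sketch (not from the diff) of the new sentence naming scheme, {div_name}{s|t}.{par_name}.{sentence_index}, with alphabetic paragraph suffixes remapped to numbers from 10 upward; the file name below is hypothetical and is assumed to end in '.json' (which the [:-5] slice strips).

alphabet = list(map(chr, range(97, 123)))          # 'a'..'z'

paragraph_name = 'KUS-0001-2-b.json'               # hypothetical svala file name
split_para_name = paragraph_name[:-5].split('-')   # strip '.json', split on '-'
div_name = '-'.join(split_para_name[:-1])          # 'KUS-0001-2'
par_name = split_para_name[-1]                     # 'b'
if par_name in alphabet:
    par_name = str(alphabet.index(par_name) + 10)  # 'b' -> '11'

source_sen_name = f'{div_name}s.{par_name}.1'
print(source_sen_name)                             # 'KUS-0001-2s.11.1'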
@@ -8,9 +8,34 @@ from src.create_tei import construct_sentence_from_list, \
     construct_paragraph_from_list, TeiDocument, build_tei_etrees, build_links, build_complete_tei, convert_bibl


+def form_paragraphs(annotated_source_divs):
+    etree_source_divs = []
+    for div_i, div_tuple in enumerate(annotated_source_divs):
+        div_name, div = div_tuple
+        # file_name = file_name.replace('/', '_')
+        # print(f'{i * 100 / folders_count} % : {file_name}')
+
+        etree_source_paragraphs = []
+
+        for par_i, paragraph_tuple in enumerate(div):
+            par_name, paragraph = paragraph_tuple
+            etree_source_sentences = []
+
+            for sentence_id, sentence in enumerate(paragraph):
+                if len(sentence) > 0:
+                    conllu_parsed = conllu.parse(sentence)[0]
+                    etree_source_sentences.append(
+                        construct_sentence_from_list(str(sentence_id + 1), conllu_parsed, True))
+
+            etree_source_paragraphs.append(construct_paragraph_from_list(div_name, par_name, etree_source_sentences))
+
+        etree_source_divs.append((etree_source_paragraphs, div_name))
+
+    return etree_source_divs, div_name
+
+
 def write_tei(annotated_source_divs, annotated_target_divs, document_edges, args):
-    print('BUILDING LINKS...')
-    etree_links = build_links(document_edges)
+    # print('BUILDING LINKS...')
+    # etree_links = build_links(document_edges)

     with open(os.path.join(args.results_folder, f"links.xml"), 'w') as tf:
         tf.write(etree.tostring(etree_links, pretty_print=True, encoding='utf-8').decode())
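A brief sketch (not from the diff) of how the new helper is consumed further down in write_tei(), assuming the (div_name, [(par_name, [conllu, ...]), ...]) structure produced by annotate(); note that the second return value is the name of the last div processed.

etree_source_divs, source_div_name = form_paragraphs(annotated_source_divs)
# etree_source_divs  -> [([<paragraph etree>, ...], div_name), ...]
# source_div_name    -> name of the last div, later passed to TeiDocument(...)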
@@ -18,91 +43,22 @@ def write_tei(annotated_source_divs, annotated_target_divs, document_edges, args
     with open(os.path.join(args.results_folder, f"links.json"), 'w') as jf:
         json.dump(document_edges, jf, ensure_ascii=False, indent=" ")

     print('WRITTING TEI...')
     etree_source_documents = []
     etree_target_documents = []
-    etree_source_divs = []
-    etree_target_divs = []
-
-    # with open(args.solar_file, 'r') as fp:
-    #     logging.info(args.solar_file)
-    #     et = ElementTree.XML(fp.read())
+    print('WRITING SOURCE FILES...')
+    etree_source_divs, source_div_name = form_paragraphs(annotated_source_divs)

-    # filename_encountered = False
-    i = 0
-    folders_count = 5484
-
-    div_i = 0
-    for div in et.iter('div'):
-        bibl = div.find('bibl')
-        file_name = bibl.get('n')
-        file_name = file_name.replace('/', '_')
-        print(f'{i * 100 / folders_count} % : {file_name}')
-        i += 1
-
-        # if i * 100 / folders_count > 50:
-        #     filename_encountered = True
-        # # if file_name == 'KUS-G-slo-4-GO-E-2009-10071':
-        # #     filename_encountered = True
-        # if i * 100 / folders_count > 51:
-        #     filename_encountered = False
-        #
-        # if file_name == 'KUS-G-slo-1-LJ-E-2009_2010-10540':
-        #     # div_i -= 1
-        #     continue
-        #
-        # if file_name == 'KUS-SI-slo-2-NM-E-2009_2010-20362' or file_name == 'KUS-OS-slo-9-SG-R-2009_2010-40129' or file_name == 'KUS-OS-slo-7-SG-R-2009_2010-40173':
-        #     # div_i -= 1
-        #     continue
-        #
-        # if not filename_encountered:
-        #     div_i+=1
-        #
-        #     continue
-
-
-        etree_source_paragraphs = []
-        etree_target_paragraphs = []
-        # paragraph_edges = []
-
-        paragraphs = div.findall('p')
-        par_i = 0
-        for paragraph in paragraphs:
-
-            etree_source_sentences = []
-            etree_target_sentences = []
-
-            for sentence_id, source_conllu_annotated in enumerate(annotated_source_divs[div_i][par_i]):
-                if len(source_conllu_annotated) > 0:
-                    source_conllu_parsed = conllu.parse(source_conllu_annotated)[0]
-                if len(source_conllu_annotated) > 0:
-                    etree_source_sentences.append(construct_sentence_from_list(str(sentence_id + 1), source_conllu_parsed, True))
-
-
-            for sentence_id, target_conllu_annotated in enumerate(annotated_target_divs[div_i][par_i]):
-                if len(target_conllu_annotated) > 0:
-                    target_conllu_parsed = conllu.parse(target_conllu_annotated)[0]
-                if len(target_conllu_annotated) > 0:
-                    etree_target_sentences.append(construct_sentence_from_list(str(sentence_id + 1), target_conllu_parsed, False))
-
-            etree_source_paragraphs.append(construct_paragraph_from_list(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0], paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[1], etree_source_sentences, True))
-            etree_target_paragraphs.append(construct_paragraph_from_list(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0], paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[1], etree_target_sentences, False))
-
-            par_i += 1
-
-        etree_bibl = convert_bibl(bibl)
-        etree_source_divs.append((etree_source_paragraphs, copy.deepcopy(etree_bibl), paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 's'))
-        etree_target_divs.append((etree_target_paragraphs, copy.deepcopy(etree_bibl), paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 't'))
-
-        div_i += 1
+    print('WRITING TARGET FILES...')
+    etree_target_divs, target_div_name = form_paragraphs(annotated_target_divs)

     print('APPENDING DOCUMENT...')
     etree_source_documents.append(
-        TeiDocument(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 's',
+        TeiDocument(source_div_name,
                     etree_source_divs, etree_target_divs))
     etree_target_documents.append(
-        TeiDocument(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 't',
+        TeiDocument(target_div_name,
                     etree_target_divs, etree_source_divs))

     print('BUILDING TEI DOCUMENTS...')
@@ -249,7 +249,7 @@ if __name__ == '__main__':
         description='Read already processed xmls, erase entries without examples and limit gigafida examples to 1 per entry.')
     parser.add_argument('--svala_folder', default='data/KOST/svala',
                         help='input file in (gz or xml currently). If none, then just database is loaded')
-    parser.add_argument('--results_folder', default='data/results/solar3.0',
+    parser.add_argument('--results_folder', default='data/KOST/results',
                         help='input file in (gz or xml currently). If none, then just database is loaded')
     parser.add_argument('--raw_text', default='data/KOST/raw',
                         help='input file in (gz or xml currently). If none, then just database is loaded')