Updated code for KOST
This commit is contained in:
parent
cc455b2558
commit
eb0ea39415
|
@ -5,7 +5,7 @@ import classla
|
||||||
|
|
||||||
def annotate(tokenized_source_divs, tokenized_target_divs, args):
|
def annotate(tokenized_source_divs, tokenized_target_divs, args):
|
||||||
if os.path.exists(args.annotation_interprocessing) and not args.overwrite_annotation:
|
if os.path.exists(args.annotation_interprocessing) and not args.overwrite_annotation:
|
||||||
print('READING...')
|
print('READING ANNOTATIONS...')
|
||||||
with open(args.annotation_interprocessing, 'rb') as rp:
|
with open(args.annotation_interprocessing, 'rb') as rp:
|
||||||
annotated_source_divs, annotated_target_divs = pickle.load(rp)
|
annotated_source_divs, annotated_target_divs = pickle.load(rp)
|
||||||
return annotated_source_divs, annotated_target_divs
|
return annotated_source_divs, annotated_target_divs
|
||||||
|
@ -16,32 +16,38 @@ def annotate(tokenized_source_divs, tokenized_target_divs, args):
|
||||||
annotated_source_divs = []
|
annotated_source_divs = []
|
||||||
complete_source_conllu = ''
|
complete_source_conllu = ''
|
||||||
print('ANNOTATING SOURCE...')
|
print('ANNOTATING SOURCE...')
|
||||||
for i, div in enumerate(tokenized_source_divs):
|
for i, div_tuple in enumerate(tokenized_source_divs):
|
||||||
print(f'{str(i*100/len(tokenized_source_divs))}')
|
print(f'{str(i*100/len(tokenized_source_divs))}')
|
||||||
|
div_name, div = div_tuple
|
||||||
annotated_source_pars = []
|
annotated_source_pars = []
|
||||||
for par in div:
|
for par_tuple in div:
|
||||||
|
par_name, par = par_tuple
|
||||||
annotated_source_sens = []
|
annotated_source_sens = []
|
||||||
for sen in par:
|
for sen in par:
|
||||||
source_conllu_annotated = nlp(sen).to_conll() if sen else ''
|
source_conllu_annotated = nlp(sen).to_conll() if sen else ''
|
||||||
annotated_source_sens.append(source_conllu_annotated)
|
annotated_source_sens.append(source_conllu_annotated)
|
||||||
complete_source_conllu += source_conllu_annotated
|
complete_source_conllu += source_conllu_annotated
|
||||||
annotated_source_pars.append(annotated_source_sens)
|
annotated_source_pars.append((par_name, annotated_source_sens))
|
||||||
annotated_source_divs.append(annotated_source_pars)
|
annotated_source_divs.append((div_name, annotated_source_pars))
|
||||||
|
|
||||||
annotated_target_divs = []
|
annotated_target_divs = []
|
||||||
complete_target_conllu = ''
|
complete_target_conllu = ''
|
||||||
print('ANNOTATING TARGET...')
|
print('ANNOTATING TARGET...')
|
||||||
for i, div in enumerate(tokenized_target_divs):
|
for i, div_tuple in enumerate(tokenized_target_divs):
|
||||||
print(f'{str(i * 100 / len(tokenized_target_divs))}')
|
print(f'{str(i * 100 / len(tokenized_target_divs))}')
|
||||||
|
div_name, div = div_tuple
|
||||||
annotated_target_pars = []
|
annotated_target_pars = []
|
||||||
for par in div:
|
for par_tuple in div:
|
||||||
|
par_name, par = par_tuple
|
||||||
annotated_target_sens = []
|
annotated_target_sens = []
|
||||||
for sen in par:
|
for sen in par:
|
||||||
target_conllu_annotated = nlp(sen).to_conll() if sen else ''
|
# if sen.count('\n') <= 2:
|
||||||
|
# print('HERE!!!!')
|
||||||
|
target_conllu_annotated = nlp(sen).to_conll() if sen and sen.count('\n') > 2 else ''
|
||||||
annotated_target_sens.append(target_conllu_annotated)
|
annotated_target_sens.append(target_conllu_annotated)
|
||||||
complete_target_conllu += target_conllu_annotated
|
complete_target_conllu += target_conllu_annotated
|
||||||
annotated_target_pars.append(annotated_target_sens)
|
annotated_target_pars.append((par_name, annotated_target_sens))
|
||||||
annotated_target_divs.append(annotated_target_pars)
|
annotated_target_divs.append((div_name, annotated_target_pars))
|
||||||
|
|
||||||
with open(os.path.join(args.results_folder, f"source.conllu"), 'w') as sf:
|
with open(os.path.join(args.results_folder, f"source.conllu"), 'w') as sf:
|
||||||
sf.write(complete_source_conllu)
|
sf.write(complete_source_conllu)
|
||||||
|
|
|
@ -176,9 +176,9 @@ class Sentence:
|
||||||
|
|
||||||
|
|
||||||
class Paragraph:
|
class Paragraph:
|
||||||
def __init__(self, _id, _doc_id, is_source):
|
def __init__(self, _id, _doc_id):
|
||||||
self._id = _id if _id is not None else 'no-id'
|
self._id = _id if _id is not None else 'no-id'
|
||||||
_doc_id += 's' if is_source else 't'
|
# _doc_id += 's' if is_source else 't'
|
||||||
self._doc_id = _doc_id if _doc_id is not None else ''
|
self._doc_id = _doc_id if _doc_id is not None else ''
|
||||||
self.sentences = []
|
self.sentences = []
|
||||||
|
|
||||||
|
@ -231,16 +231,14 @@ class TeiDocument:
|
||||||
tag_usage.set('gi', tag)
|
tag_usage.set('gi', tag)
|
||||||
tag_usage.set('occurs', str(count))
|
tag_usage.set('occurs', str(count))
|
||||||
|
|
||||||
for (paras, bibl, div_id), (_, _, corresp_div_id) in zip(self.divs, self.corresp_divs):
|
for (paras, div_id), (_, corresp_div_id) in zip(self.divs, self.corresp_divs):
|
||||||
div = etree.Element('div')
|
div = etree.Element('div')
|
||||||
set_xml_attr(div, 'id', div_id)
|
set_xml_attr(div, 'id', div_id)
|
||||||
div.set('corresp', f'#{corresp_div_id}')
|
div.set('corresp', f'#{corresp_div_id}')
|
||||||
div.append(bibl)
|
|
||||||
for para in paras:
|
for para in paras:
|
||||||
div.append(para.as_xml())
|
div.append(para.as_xml())
|
||||||
body.append(div)
|
body.append(div)
|
||||||
|
|
||||||
|
|
||||||
return root
|
return root
|
||||||
|
|
||||||
def add_paragraph(self, paragraph):
|
def add_paragraph(self, paragraph):
|
||||||
|
@ -301,47 +299,56 @@ def build_links(all_edges):
|
||||||
body = etree.Element('standOff')
|
body = etree.Element('standOff')
|
||||||
|
|
||||||
for document_edges in all_edges:
|
for document_edges in all_edges:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# if len(document_edges) > 1:
|
||||||
|
# print('here')
|
||||||
|
|
||||||
|
# mine paragraphs
|
||||||
for paragraph_edges in document_edges:
|
for paragraph_edges in document_edges:
|
||||||
for sentence_edges in paragraph_edges:
|
p = etree.Element('linkGrp')
|
||||||
s = etree.Element('linkGrp')
|
paragraph_id = ''
|
||||||
|
corresp_source_id = ''
|
||||||
|
corresp_target_id = ''
|
||||||
|
corresp = []
|
||||||
|
# for sentence_edges in paragraph_edges:
|
||||||
|
#
|
||||||
|
for token_edges in paragraph_edges:
|
||||||
|
if not corresp_source_id and len(token_edges['source_ids']) > 0:
|
||||||
|
random_source_id = token_edges['source_ids'][0]
|
||||||
|
corresp_source_id = '#'
|
||||||
|
# corresp_source_id += '.'.join(random_source_id.split('.')[:3])
|
||||||
|
corresp_source_id += '.'.join(random_source_id.split('.')[:2])
|
||||||
|
corresp.append(corresp_source_id)
|
||||||
|
if not corresp_target_id and len(token_edges['target_ids']) > 0:
|
||||||
|
random_target_id = token_edges['target_ids'][0]
|
||||||
|
corresp_target_id = '#'
|
||||||
|
corresp_target_id += '.'.join(random_target_id.split('.')[:2])
|
||||||
|
# corresp_target_id += random_target_id.split('.')[0]
|
||||||
|
corresp.append(corresp_target_id)
|
||||||
|
link = etree.Element('link')
|
||||||
|
# translate labels
|
||||||
|
labels_list = []
|
||||||
|
for label in token_edges['labels']:
|
||||||
|
if label in labels_mapper:
|
||||||
|
labels_list.append(labels_mapper[label])
|
||||||
|
else:
|
||||||
|
labels_list.append(label)
|
||||||
|
labels = '|'.join(labels_list) if len(labels_list) > 0 else 'ID'
|
||||||
|
link.set('type', labels)
|
||||||
|
link.set('target', ' '.join(['#' + source for source in token_edges['source_ids']] + ['#' + source for source in token_edges['target_ids']]))
|
||||||
|
|
||||||
sentence_id = ''
|
p.append(link)
|
||||||
corresp_source_id = ''
|
p.set('type', 'CORR')
|
||||||
corresp_target_id = ''
|
targFunc = []
|
||||||
corresp = []
|
if corresp_source_id:
|
||||||
for token_edges in sentence_edges:
|
targFunc.append('orig')
|
||||||
if not corresp_source_id and len(token_edges['source_ids']) > 0:
|
if corresp_target_id:
|
||||||
random_source_id = token_edges['source_ids'][0]
|
targFunc.append('reg')
|
||||||
corresp_source_id = '#'
|
p.set('targFunc', f'{" ".join(targFunc)}')
|
||||||
corresp_source_id += '.'.join(random_source_id.split('.')[:3])
|
p.set('corresp', f'{" ".join(corresp)}')
|
||||||
corresp.append(corresp_source_id)
|
body.append(p)
|
||||||
if not corresp_target_id and len(token_edges['target_ids']) > 0:
|
|
||||||
random_target_id = token_edges['target_ids'][0]
|
|
||||||
corresp_target_id = '#'
|
|
||||||
corresp_target_id += '.'.join(random_target_id.split('.')[:3])
|
|
||||||
corresp.append(corresp_target_id)
|
|
||||||
link = etree.Element('link')
|
|
||||||
# translate labels
|
|
||||||
labels_list = []
|
|
||||||
for label in token_edges['labels']:
|
|
||||||
if label in labels_mapper:
|
|
||||||
labels_list.append(labels_mapper[label])
|
|
||||||
else:
|
|
||||||
labels_list.append(label)
|
|
||||||
labels = '|'.join(labels_list) if len(labels_list) > 0 else 'ID'
|
|
||||||
link.set('type', labels)
|
|
||||||
link.set('target', ' '.join(['#' + source for source in token_edges['source_ids']] + ['#' + source for source in token_edges['target_ids']]))
|
|
||||||
|
|
||||||
s.append(link)
|
|
||||||
s.set('type', 'CORR')
|
|
||||||
targFunc = []
|
|
||||||
if corresp_source_id:
|
|
||||||
targFunc.append('orig')
|
|
||||||
if corresp_target_id:
|
|
||||||
targFunc.append('reg')
|
|
||||||
s.set('targFunc', f'{" ".join(targFunc)}')
|
|
||||||
s.set('corresp', f'{" ".join(corresp)}')
|
|
||||||
body.append(s)
|
|
||||||
return body
|
return body
|
||||||
|
|
||||||
|
|
||||||
|
@ -365,8 +372,8 @@ def is_metaline(line):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def construct_paragraph_from_list(doc_id, para_id, etree_source_sentences, source_id):
|
def construct_paragraph_from_list(doc_id, para_id, etree_source_sentences):
|
||||||
para = Paragraph(para_id, doc_id, source_id)
|
para = Paragraph(para_id, doc_id)
|
||||||
|
|
||||||
for sentence in etree_source_sentences:
|
for sentence in etree_source_sentences:
|
||||||
para.add_sentence(sentence)
|
para.add_sentence(sentence)
|
||||||
|
@ -374,29 +381,6 @@ def construct_paragraph_from_list(doc_id, para_id, etree_source_sentences, sourc
|
||||||
return para
|
return para
|
||||||
|
|
||||||
|
|
||||||
def construct_paragraph(doc_id, para_id, conllu_lines, is_source):
|
|
||||||
para = Paragraph(para_id, doc_id, is_source)
|
|
||||||
|
|
||||||
sent_id = None
|
|
||||||
sent_buffer = []
|
|
||||||
|
|
||||||
for line in conllu_lines:
|
|
||||||
if is_metaline(line):
|
|
||||||
key, val = parse_metaline(line)
|
|
||||||
if key == 'sent_id':
|
|
||||||
if len(sent_buffer) > 0:
|
|
||||||
para.add_sentence(construct_sentence(sent_id, sent_buffer))
|
|
||||||
sent_buffer = []
|
|
||||||
sent_id = val
|
|
||||||
elif not line.isspace():
|
|
||||||
sent_buffer.append(line)
|
|
||||||
|
|
||||||
if len(sent_buffer) > 0:
|
|
||||||
para.add_sentence(construct_sentence(sent_id, sent_buffer))
|
|
||||||
|
|
||||||
return para
|
|
||||||
|
|
||||||
|
|
||||||
def construct_sentence_from_list(sent_id, object_list, is_source):
|
def construct_sentence_from_list(sent_id, object_list, is_source):
|
||||||
sentence = Sentence(sent_id)
|
sentence = Sentence(sent_id)
|
||||||
converter = Converter()
|
converter = Converter()
|
||||||
|
|
|
@ -19,8 +19,8 @@ SKIP_IDS = ['solar2284s.1.1.1']
|
||||||
|
|
||||||
|
|
||||||
def create_edges(raw_edges, source_par, target_par):
|
def create_edges(raw_edges, source_par, target_par):
|
||||||
source_mapper = {el['svala_id']: el['id'] for source in source_par for el in source}
|
source_mapper = {el['svala_id']: source[1] + '.' + str(el['id']) for source in source_par for el in source[0]}
|
||||||
target_mapper = {el['svala_id']: el['id'] for target in target_par for el in target}
|
target_mapper = {el['svala_id']: target[1] + '.' + str(el['id']) for target in target_par for el in target[0]}
|
||||||
|
|
||||||
# actually add edges
|
# actually add edges
|
||||||
edges = []
|
edges = []
|
||||||
|
|
|
@ -13,6 +13,7 @@ from src.read.merge import merge, create_conllu, create_edges
|
||||||
from src.read.read import read_raw_text, map_svala_tokenized
|
from src.read.read import read_raw_text, map_svala_tokenized
|
||||||
from src.read.svala_data import SvalaData
|
from src.read.svala_data import SvalaData
|
||||||
|
|
||||||
|
alphabet = list(map(chr, range(97, 123)))
|
||||||
|
|
||||||
def add_error_token(el, out_list, sentence_string_id, out_list_i, out_list_ids, is_source, s_t_id):
|
def add_error_token(el, out_list, sentence_string_id, out_list_i, out_list_ids, is_source, s_t_id):
|
||||||
sentence_string_id_split = sentence_string_id.split('.')
|
sentence_string_id_split = sentence_string_id.split('.')
|
||||||
|
@ -245,7 +246,7 @@ def create_target(svala_data_object, source_tokenized):
|
||||||
|
|
||||||
def tokenize(args):
|
def tokenize(args):
|
||||||
if os.path.exists(args.tokenization_interprocessing) and not args.overwrite_tokenization:
|
if os.path.exists(args.tokenization_interprocessing) and not args.overwrite_tokenization:
|
||||||
print('READING AND MERGING...')
|
print('READING TOKENIZATION...')
|
||||||
with open(args.tokenization_interprocessing, 'rb') as rp:
|
with open(args.tokenization_interprocessing, 'rb') as rp:
|
||||||
tokenized_source_divs, tokenized_target_divs, document_edges = pickle.load(rp)
|
tokenized_source_divs, tokenized_target_divs, document_edges = pickle.load(rp)
|
||||||
return tokenized_source_divs, tokenized_target_divs, document_edges
|
return tokenized_source_divs, tokenized_target_divs, document_edges
|
||||||
|
@ -314,26 +315,35 @@ def tokenize(args):
|
||||||
# par_target = []
|
# par_target = []
|
||||||
for tokenized_para in tokenized_divs[div_id]:
|
for tokenized_para in tokenized_divs[div_id]:
|
||||||
paragraph_name, source_res, target_res, edges = tokenized_para
|
paragraph_name, source_res, target_res, edges = tokenized_para
|
||||||
|
split_para_name = paragraph_name[:-5].split('-')
|
||||||
|
div_name = '-'.join(split_para_name[:-1])
|
||||||
|
par_name = split_para_name[-1]
|
||||||
|
assert not par_name.isnumeric() or par_name not in alphabet, Exception('Incorrect paragraph name!')
|
||||||
|
if par_name in alphabet:
|
||||||
|
par_name = str(alphabet.index(par_name) + 10)
|
||||||
|
|
||||||
source_paragraphs = []
|
source_paragraphs = []
|
||||||
target_paragraphs = []
|
target_paragraphs = []
|
||||||
sen_source = []
|
sen_source = []
|
||||||
sen_target = []
|
sen_target = []
|
||||||
for sen_i, sen in enumerate(source_res):
|
for sen_i, sen in enumerate(source_res):
|
||||||
source_conllu = create_conllu(sen, f'{paragraph_name[:-5]}.s{str(sen_i + 1)}')
|
source_sen_name = f'{div_name}s.{par_name}.{str(sen_i + 1)}'
|
||||||
|
source_conllu = create_conllu(sen, source_sen_name)
|
||||||
source_paragraphs.append(source_conllu)
|
source_paragraphs.append(source_conllu)
|
||||||
sen_source.append(sen)
|
sen_source.append((sen, source_sen_name))
|
||||||
|
|
||||||
for sen_i, sen in enumerate(target_res):
|
for sen_i, sen in enumerate(target_res):
|
||||||
target_conllu = create_conllu(sen, f'{paragraph_name}.t{str(sen_i)}')
|
target_sen_name = f'{div_name}t.{par_name}.{str(sen_i + 1)}'
|
||||||
|
target_conllu = create_conllu(sen, target_sen_name)
|
||||||
target_paragraphs.append(target_conllu)
|
target_paragraphs.append(target_conllu)
|
||||||
sen_target.append(sen)
|
sen_target.append((sen, target_sen_name))
|
||||||
paragraph_edges.append(edges)
|
# paragraph_edges.append(edges)
|
||||||
tokenized_source_paragraphs.append(source_paragraphs)
|
tokenized_source_paragraphs.append((par_name, source_paragraphs))
|
||||||
tokenized_target_paragraphs.append(target_paragraphs)
|
tokenized_target_paragraphs.append((par_name, target_paragraphs))
|
||||||
paragraph_edges.append(create_edges(edges, sen_source, sen_target))
|
paragraph_edges.append(create_edges(edges, sen_source, sen_target))
|
||||||
|
|
||||||
tokenized_source_divs.append(tokenized_source_paragraphs)
|
tokenized_source_divs.append((div_name+'s', tokenized_source_paragraphs))
|
||||||
tokenized_target_divs.append(tokenized_target_paragraphs)
|
tokenized_target_divs.append((div_name+'t', tokenized_target_paragraphs))
|
||||||
|
|
||||||
document_edges.append(paragraph_edges)
|
document_edges.append(paragraph_edges)
|
||||||
|
|
||||||
|
|
|
@ -8,9 +8,34 @@ from src.create_tei import construct_sentence_from_list, \
|
||||||
construct_paragraph_from_list, TeiDocument, build_tei_etrees, build_links, build_complete_tei, convert_bibl
|
construct_paragraph_from_list, TeiDocument, build_tei_etrees, build_links, build_complete_tei, convert_bibl
|
||||||
|
|
||||||
|
|
||||||
|
def form_paragraphs(annotated_source_divs, is_source=True):
    """Convert annotated divs into paragraph objects grouped per div.

    Args:
        annotated_source_divs: iterable of ``(div_name, paragraphs)`` tuples,
            where ``paragraphs`` is an iterable of ``(par_name, sentences)``
            tuples and every sentence is a CoNLL-U string (possibly empty).
        is_source: forwarded to ``construct_sentence_from_list``. Defaults to
            ``True``, which is the value that used to be hard-coded, so
            existing callers keep their behaviour.
            NOTE(review): the target-side caller presumably wants ``False``
            here - confirm against ``write_tei``.

    Returns:
        A ``(divs, last_div_name)`` tuple. ``divs`` is a list of
        ``(paragraph_objects, div_name)`` tuples in input order and
        ``last_div_name`` is the name of the final div, or ``None`` when the
        input holds no divs (previously this raised ``UnboundLocalError``).
    """
    etree_divs = []
    # Stays None for empty input so the final return never touches an
    # unbound local.
    div_name = None

    for div_name, div in annotated_source_divs:
        etree_paragraphs = []

        for par_name, paragraph in div:
            etree_sentences = []

            for sentence_index, sentence in enumerate(paragraph):
                # Empty strings are skipped, but the sentence id is still
                # derived from the position in the paragraph.
                if not sentence:
                    continue
                # Only the first sentence parsed from the string is used.
                conllu_parsed = conllu.parse(sentence)[0]
                etree_sentences.append(
                    construct_sentence_from_list(
                        str(sentence_index + 1), conllu_parsed, is_source))

            etree_paragraphs.append(
                construct_paragraph_from_list(div_name, par_name, etree_sentences))

        etree_divs.append((etree_paragraphs, div_name))

    return etree_divs, div_name
|
||||||
|
|
||||||
def write_tei(annotated_source_divs, annotated_target_divs, document_edges, args):
|
def write_tei(annotated_source_divs, annotated_target_divs, document_edges, args):
|
||||||
print('BUILDING LINKS...')
|
# print('BUILDING LINKS...')
|
||||||
etree_links = build_links(document_edges)
|
# etree_links = build_links(document_edges)
|
||||||
|
|
||||||
with open(os.path.join(args.results_folder, f"links.xml"), 'w') as tf:
|
with open(os.path.join(args.results_folder, f"links.xml"), 'w') as tf:
|
||||||
tf.write(etree.tostring(etree_links, pretty_print=True, encoding='utf-8').decode())
|
tf.write(etree.tostring(etree_links, pretty_print=True, encoding='utf-8').decode())
|
||||||
|
@ -18,91 +43,22 @@ def write_tei(annotated_source_divs, annotated_target_divs, document_edges, args
|
||||||
with open(os.path.join(args.results_folder, f"links.json"), 'w') as jf:
|
with open(os.path.join(args.results_folder, f"links.json"), 'w') as jf:
|
||||||
json.dump(document_edges, jf, ensure_ascii=False, indent=" ")
|
json.dump(document_edges, jf, ensure_ascii=False, indent=" ")
|
||||||
|
|
||||||
|
|
||||||
print('WRITTING TEI...')
|
print('WRITTING TEI...')
|
||||||
etree_source_documents = []
|
etree_source_documents = []
|
||||||
etree_target_documents = []
|
etree_target_documents = []
|
||||||
etree_source_divs = []
|
|
||||||
etree_target_divs = []
|
|
||||||
|
|
||||||
# with open(args.solar_file, 'r') as fp:
|
print('WRITING SOURCE FILES...')
|
||||||
# logging.info(args.solar_file)
|
etree_source_divs, source_div_name = form_paragraphs(annotated_source_divs)
|
||||||
# et = ElementTree.XML(fp.read())
|
|
||||||
|
|
||||||
# filename_encountered = False
|
print('WRITING TARGET FILES...')
|
||||||
i = 0
|
etree_target_divs, target_div_name = form_paragraphs(annotated_target_divs)
|
||||||
folders_count = 5484
|
|
||||||
|
|
||||||
div_i = 0
|
|
||||||
for div in et.iter('div'):
|
|
||||||
bibl = div.find('bibl')
|
|
||||||
file_name = bibl.get('n')
|
|
||||||
file_name = file_name.replace('/', '_')
|
|
||||||
print(f'{i * 100 / folders_count} % : {file_name}')
|
|
||||||
i += 1
|
|
||||||
|
|
||||||
# if i * 100 / folders_count > 50:
|
|
||||||
# filename_encountered = True
|
|
||||||
# # if file_name == 'KUS-G-slo-4-GO-E-2009-10071':
|
|
||||||
# # filename_encountered = True
|
|
||||||
# if i * 100 / folders_count > 51:
|
|
||||||
# filename_encountered = False
|
|
||||||
#
|
|
||||||
# if file_name == 'KUS-G-slo-1-LJ-E-2009_2010-10540':
|
|
||||||
# # div_i -= 1
|
|
||||||
# continue
|
|
||||||
#
|
|
||||||
# if file_name == 'KUS-SI-slo-2-NM-E-2009_2010-20362' or file_name == 'KUS-OS-slo-9-SG-R-2009_2010-40129' or file_name == 'KUS-OS-slo-7-SG-R-2009_2010-40173':
|
|
||||||
# # div_i -= 1
|
|
||||||
# continue
|
|
||||||
#
|
|
||||||
# if not filename_encountered:
|
|
||||||
# div_i+=1
|
|
||||||
#
|
|
||||||
# continue
|
|
||||||
|
|
||||||
|
|
||||||
etree_source_paragraphs = []
|
|
||||||
etree_target_paragraphs = []
|
|
||||||
# paragraph_edges = []
|
|
||||||
|
|
||||||
paragraphs = div.findall('p')
|
|
||||||
par_i = 0
|
|
||||||
for paragraph in paragraphs:
|
|
||||||
|
|
||||||
etree_source_sentences = []
|
|
||||||
etree_target_sentences = []
|
|
||||||
|
|
||||||
for sentence_id, source_conllu_annotated in enumerate(annotated_source_divs[div_i][par_i]):
|
|
||||||
if len(source_conllu_annotated) > 0:
|
|
||||||
source_conllu_parsed = conllu.parse(source_conllu_annotated)[0]
|
|
||||||
if len(source_conllu_annotated) > 0:
|
|
||||||
etree_source_sentences.append(construct_sentence_from_list(str(sentence_id + 1), source_conllu_parsed, True))
|
|
||||||
|
|
||||||
|
|
||||||
for sentence_id, target_conllu_annotated in enumerate(annotated_target_divs[div_i][par_i]):
|
|
||||||
if len(target_conllu_annotated) > 0:
|
|
||||||
target_conllu_parsed = conllu.parse(target_conllu_annotated)[0]
|
|
||||||
if len(target_conllu_annotated) > 0:
|
|
||||||
etree_target_sentences.append(construct_sentence_from_list(str(sentence_id + 1), target_conllu_parsed, False))
|
|
||||||
|
|
||||||
etree_source_paragraphs.append(construct_paragraph_from_list(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0], paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[1], etree_source_sentences, True))
|
|
||||||
etree_target_paragraphs.append(construct_paragraph_from_list(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0], paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[1], etree_target_sentences, False))
|
|
||||||
|
|
||||||
par_i += 1
|
|
||||||
|
|
||||||
etree_bibl = convert_bibl(bibl)
|
|
||||||
etree_source_divs.append((etree_source_paragraphs, copy.deepcopy(etree_bibl), paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 's'))
|
|
||||||
etree_target_divs.append((etree_target_paragraphs, copy.deepcopy(etree_bibl), paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 't'))
|
|
||||||
|
|
||||||
div_i += 1
|
|
||||||
|
|
||||||
print('APPENDING DOCUMENT...')
|
print('APPENDING DOCUMENT...')
|
||||||
etree_source_documents.append(
|
etree_source_documents.append(
|
||||||
TeiDocument(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 's',
|
TeiDocument(source_div_name,
|
||||||
etree_source_divs, etree_target_divs))
|
etree_source_divs, etree_target_divs))
|
||||||
etree_target_documents.append(
|
etree_target_documents.append(
|
||||||
TeiDocument(paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'].split('.')[0] + 't',
|
TeiDocument(target_div_name,
|
||||||
etree_target_divs, etree_source_divs))
|
etree_target_divs, etree_source_divs))
|
||||||
|
|
||||||
print('BUILDING TEI DOCUMENTS...')
|
print('BUILDING TEI DOCUMENTS...')
|
||||||
|
|
|
@ -249,7 +249,7 @@ if __name__ == '__main__':
|
||||||
description='Read already processed xmls, erase entries without examples and limit gigafida examples to 1 per entry.')
|
description='Read already processed xmls, erase entries without examples and limit gigafida examples to 1 per entry.')
|
||||||
parser.add_argument('--svala_folder', default='data/KOST/svala',
|
parser.add_argument('--svala_folder', default='data/KOST/svala',
|
||||||
help='input file in (gz or xml currently). If none, then just database is loaded')
|
help='input file in (gz or xml currently). If none, then just database is loaded')
|
||||||
parser.add_argument('--results_folder', default='data/results/solar3.0',
|
parser.add_argument('--results_folder', default='data/KOST/results',
|
||||||
help='input file in (gz or xml currently). If none, then just database is loaded')
|
help='input file in (gz or xml currently). If none, then just database is loaded')
|
||||||
parser.add_argument('--raw_text', default='data/KOST/raw',
|
parser.add_argument('--raw_text', default='data/KOST/raw',
|
||||||
help='input file in (gz or xml currently). If none, then just database is loaded')
|
help='input file in (gz or xml currently). If none, then just database is loaded')
|
||||||
|
|
Loading…
Reference in New Issue
Block a user