You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

56 lines
2.3 KiB

import os
import pickle
import classla
def _annotate_divs(nlp, tokenized_divs):
    """Run ``nlp`` over every sentence of a div -> paragraph -> sentence nesting.

    Args:
        tokenized_divs: Sized sequence of divs; each div is an iterable of
            paragraphs and each paragraph an iterable of sentences.
        nlp: Callable whose result exposes ``to_conll()``.

    Returns:
        A ``(annotated_divs, conllu_text)`` tuple. ``annotated_divs`` mirrors
        the input nesting with each sentence replaced by its ``to_conll()``
        output; ``conllu_text`` is all of those outputs concatenated in
        document order.
    """
    annotated_divs = []
    # Collect the pieces and join once instead of growing a string with `+=`
    # inside the innermost loop.
    conllu_chunks = []
    for i, div in enumerate(tokenized_divs):
        # Progress report: percentage of divs started so far.
        print(f'{i * 100 / len(tokenized_divs)}')
        annotated_pars = []
        for par in div:
            # Falsy (empty) sentences are not sent to the pipeline; they are
            # kept as '' so the output stays positionally aligned with the input.
            annotated_sens = [nlp(sen).to_conll() if sen else '' for sen in par]
            conllu_chunks.extend(annotated_sens)
            annotated_pars.append(annotated_sens)
        annotated_divs.append(annotated_pars)
    return annotated_divs, ''.join(conllu_chunks)


def annotate(tokenized_source_divs, tokenized_target_divs, args):
    """Annotate source and target sentences with classla, caching the result.

    If the file ``args.annotation_interprocessing`` already exists and
    ``args.overwrite_annotation`` is falsy, the pickled result is loaded from
    it and returned without running the pipeline. Otherwise every sentence is
    annotated, the concatenated CoNLL-U output is written to ``source.conllu``
    and ``target.conllu`` inside ``args.results_folder``, and the nested
    result is pickled to ``args.annotation_interprocessing``.

    Args:
        tokenized_source_divs: Sized sequence of divs -> paragraphs ->
            sentences for the source text. The pipeline is configured with
            ``tokenize_pretokenized="conllu"``, so sentences are presumably
            CoNLL-U formatted strings -- TODO confirm against the caller.
        tokenized_target_divs: Same structure for the target text.
        args: Object providing ``annotation_interprocessing`` (cache file
            path), ``overwrite_annotation`` (flag) and ``results_folder``
            (output directory).

    Returns:
        A tuple ``(annotated_source_divs, annotated_target_divs)``; each
        mirrors the nesting of its input with sentences replaced by their
        CoNLL-U annotation ('' for empty sentences).
    """
    if os.path.exists(args.annotation_interprocessing) and not args.overwrite_annotation:
        print('READING...')
        # NOTE(security): pickle.load can execute arbitrary code; this is only
        # safe as long as the cache file is one this program wrote itself.
        with open(args.annotation_interprocessing, 'rb') as rp:
            annotated_source_divs, annotated_target_divs = pickle.load(rp)
        return annotated_source_divs, annotated_target_divs

    # The pipeline is built only on a cache miss, after the early return above.
    nlp = classla.Pipeline('sl', pos_use_lexicon=True, pos_lemma_pretag=False,
                           tokenize_pretokenized="conllu", type='standard_jos')

    print('ANNOTATING SOURCE...')
    annotated_source_divs, complete_source_conllu = _annotate_divs(nlp, tokenized_source_divs)

    print('ANNOTATING TARGET...')
    annotated_target_divs, complete_target_conllu = _annotate_divs(nlp, tokenized_target_divs)

    # Write explicitly as UTF-8 so non-ASCII characters in the annotation do
    # not depend on the platform's default locale encoding.
    with open(os.path.join(args.results_folder, "source.conllu"), 'w', encoding='utf-8') as sf:
        sf.write(complete_source_conllu)
    with open(os.path.join(args.results_folder, "target.conllu"), 'w', encoding='utf-8') as tf:
        tf.write(complete_target_conllu)

    with open(args.annotation_interprocessing, 'wb') as wp:
        pickle.dump((annotated_source_divs, annotated_target_divs), wp)

    return annotated_source_divs, annotated_target_divs