You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
56 lines
2.3 KiB
56 lines
2.3 KiB
import os
|
|
import pickle
|
|
import classla
|
|
|
|
|
|
def _annotate_divs(nlp, tokenized_divs, label):
    """Annotate every sentence in a div -> paragraph -> sentence nesting.

    Args:
        nlp: Pipeline object; ``nlp(sentence).to_conll()`` is called for each
            non-empty sentence and its result is stored as the annotation.
        tokenized_divs: Sized iterable of divs; each div is an iterable of
            paragraphs and each paragraph an iterable of sentences. Falsy
            sentences are not sent to the pipeline and yield ``''``.
        label: Name shown in the progress banner (``'SOURCE'``/``'TARGET'``).

    Returns:
        Tuple ``(annotated_divs, conllu_text)``: the annotations in the same
        nesting as the input, and all of them concatenated in document order.
    """
    print(f'ANNOTATING {label}...')
    total = len(tokenized_divs)
    annotated_divs = []
    # Collected and joined once at the end instead of growing a string with
    # ``+=`` for every sentence.
    conllu_parts = []
    for i, div in enumerate(tokenized_divs):
        # Progress as a percentage of processed divs.
        print(f'{i * 100 / total}')
        annotated_pars = []
        for par in div:
            annotated_sens = [nlp(sen).to_conll() if sen else '' for sen in par]
            conllu_parts.extend(annotated_sens)
            annotated_pars.append(annotated_sens)
        annotated_divs.append(annotated_pars)
    return annotated_divs, ''.join(conllu_parts)


def annotate(tokenized_source_divs, tokenized_target_divs, args):
    """Annotate source and target texts with CLASSLA, caching the result.

    If ``args.annotation_interprocessing`` exists and
    ``args.overwrite_annotation`` is falsy, the pickled annotations are loaded
    from that file and returned without running the pipeline. Otherwise every
    sentence is annotated, the concatenated annotations are written to
    ``source.conllu`` and ``target.conllu`` inside ``args.results_folder``,
    and the nested annotations are pickled to
    ``args.annotation_interprocessing``.

    Args:
        tokenized_source_divs: Source text as divs -> paragraphs -> sentences.
        tokenized_target_divs: Target text in the same nesting.
        args: Object exposing ``annotation_interprocessing`` (cache file
            path), ``overwrite_annotation`` (bool) and ``results_folder``
            (existing directory for the ``.conllu`` files).

    Returns:
        Tuple ``(annotated_source_divs, annotated_target_divs)``; each mirrors
        the nesting of its input, with one annotation string per sentence.
    """
    if os.path.exists(args.annotation_interprocessing) and not args.overwrite_annotation:
        print('READING...')
        # NOTE(security): unpickling can execute arbitrary code; the cache
        # file must only ever be one written by this function.
        with open(args.annotation_interprocessing, 'rb') as rp:
            annotated_source_divs, annotated_target_divs = pickle.load(rp)
        return annotated_source_divs, annotated_target_divs

    nlp = classla.Pipeline('sl', pos_use_lexicon=True, pos_lemma_pretag=False,
                           tokenize_pretokenized="conllu", type='standard_jos')

    annotated_source_divs, complete_source_conllu = _annotate_divs(
        nlp, tokenized_source_divs, 'SOURCE')
    annotated_target_divs, complete_target_conllu = _annotate_divs(
        nlp, tokenized_target_divs, 'TARGET')

    # Explicit UTF-8: the platform default encoding may be unable to represent
    # the text (e.g. cp1252 on Windows), and CoNLL-U files are UTF-8.
    with open(os.path.join(args.results_folder, "source.conllu"), 'w', encoding='utf-8') as sf:
        sf.write(complete_source_conllu)

    with open(os.path.join(args.results_folder, "target.conllu"), 'w', encoding='utf-8') as tf:
        tf.write(complete_target_conllu)

    # Cache the nested annotations so later runs can skip the pipeline.
    with open(args.annotation_interprocessing, 'wb') as wp:
        pickle.dump((annotated_source_divs, annotated_target_divs), wp)

    return annotated_source_divs, annotated_target_divs
|