2022-02-22 09:35:01 +00:00
|
|
|
import argparse
|
|
|
|
import logging
|
|
|
|
import os
|
|
|
|
import shutil
|
|
|
|
import time
|
|
|
|
|
2022-10-31 10:54:53 +00:00
|
|
|
from src.annotate.annotate import annotate
|
|
|
|
from src.read.read_and_merge import tokenize
|
2022-11-06 16:10:58 +00:00
|
|
|
from src.write.write import write_tei
|
2022-02-22 09:35:01 +00:00
|
|
|
|
|
|
|
# Configure the root logger once at import time so progress and timing
# messages from the pipeline (and the final TIME report below) are visible.
logging.basicConfig(level=logging.INFO)
|
|
|
|
|
2022-05-30 05:07:11 +00:00
|
|
|
def process_file(args):
    """Run the full conversion pipeline for the configured corpus.

    Steps:
      1. Recreate ``args.results_folder`` from scratch, so stale output from
         a previous run can never leak into the new results.
      2. Read and merge the svala, solar2 and obeliks tokenizations.
      3. Annotate the merged tokens with CLASSLA.
      4. Generate TEI and write it to the results folder.

    :param args: parsed command-line namespace (see the argparse setup in the
                 ``__main__`` block); every step reads its paths/flags from it.
    """
    # Start from a clean results directory.
    if os.path.exists(args.results_folder):
        shutil.rmtree(args.results_folder)
    # makedirs (vs. mkdir) also creates any missing parent directories, so a
    # fresh checkout with no data/ tree does not crash here.
    os.makedirs(args.results_folder)

    # READ AND MERGE svala tokenization, solar2 tokenization and obeliks tokenization
    tokenized_source_divs, tokenized_target_divs, document_edges = tokenize(args)

    # ANNOTATE WITH CLASSLA
    annotated_source_divs, annotated_target_divs = annotate(tokenized_source_divs, tokenized_target_divs, args)

    # GENERATE TEI AND WRITE OUTPUT
    write_tei(annotated_source_divs, annotated_target_divs, document_edges, args)
|
2022-02-22 09:35:01 +00:00
|
|
|
|
|
|
|
|
|
|
|
def main(args):
    """Program entry point: delegate the entire run to ``process_file``.

    :param args: parsed command-line namespace from the ``__main__`` block.
    """
    process_file(args)
|
2022-02-22 09:35:01 +00:00
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Merges svala data, raw data and metadata into TEI format (useful for corpora like KOST).')
    # Input / output locations.
    parser.add_argument('--svala_folder', default='data/KOST/svala',
                        help='Path to directory that contains svala files.')
    parser.add_argument('--results_folder', default='data/KOST/results',
                        help='Path to results directory.')
    parser.add_argument('--raw_text', default='data/KOST/raw',
                        help='Path to directory that contains raw text files.')
    # Corpus metadata inputs.
    parser.add_argument('--texts_metadata', default='data/KOST/texts_metadata5.csv',
                        help='KOST metadata location')
    parser.add_argument('--authors_metadata', default='data/KOST/authors_metadata5.csv',
                        help='KOST authors location')
    parser.add_argument('--teachers_metadata', default='data/KOST/teachers_metadata.csv',
                        help='KOST teachers location')
    parser.add_argument('--translations', default='data/KOST/translations.csv',
                        help='KOST Slovenian-English column names translations for TEI metadata')
    # Intermediate cache files, plus flags to force regenerating them.
    parser.add_argument('--tokenization_interprocessing', default='data/processing.tokenization',
                        # Help text fixed: "file that containing" -> "file that contains".
                        help='Path to file that contains tokenized data.')
    parser.add_argument('--overwrite_tokenization', action='store_true',
                        help='Force retokenization without having to manually delete tokenization file.')
    parser.add_argument('--annotation_interprocessing', default='data/processing.annotation',
                        help='Path to file that contains annotated data.')
    parser.add_argument('--overwrite_annotation', action='store_true',
                        # Copy-paste fix: this flag concerns the annotation file,
                        # not the tokenization file.
                        help='Force reannotation without having to manually delete annotation file.')
    args = parser.parse_args()

    # Time the whole run and report the duration via the configured logger.
    start = time.time()
    main(args)
    logging.info("TIME: {}".format(time.time() - start))
|