Updated documentation.

This commit is contained in:
Luka 2023-08-17 09:16:15 +02:00
parent 4793c8e4bd
commit 735241d018
6 changed files with 12 additions and 49 deletions

View File

@ -208,7 +208,6 @@ class Sentence:
class Paragraph: class Paragraph:
def __init__(self, _id, _doc_id): def __init__(self, _id, _doc_id):
self._id = _id if _id is not None else 'no-id' self._id = _id if _id is not None else 'no-id'
# _doc_id += 's' if is_source else 't'
self._doc_id = _doc_id if _doc_id is not None else '' self._doc_id = _doc_id if _doc_id is not None else ''
self.sentences = [] self.sentences = []
@ -290,14 +289,12 @@ def create_bibl(metadata):
else: else:
key = kost_translations[k] key = kost_translations[k]
note.set('ana', f'#{key}') note.set('ana', f'#{key}')
# set_xml_attr(note, 'lang', 'sl')
note.text = f'{v}' note.text = f'{v}'
bibl.append(note) bibl.append(note)
return bibl return bibl
def convert_bibl(bibl): def convert_bibl(bibl):
etree_bibl = etree.Element('bibl') etree_bibl = etree.Element('bibl')
# etree_bibl.set('corresp', bibl.get('corresp'))
etree_bibl.set('n', bibl.get('n')) etree_bibl.set('n', bibl.get('n'))
for bibl_el in bibl: for bibl_el in bibl:
etree_bibl_el = etree.Element(bibl_el.tag) etree_bibl_el = etree.Element(bibl_el.tag)
@ -332,25 +329,19 @@ def build_complete_tei(etree_source, etree_target, etree_links):
print('P3') print('P3')
group.insert(len(group), group.insert(len(group),
list(etree_source[0])[1]) list(etree_source[0])[1])
# group.append(list(etree_source[0])[1])
print('P4') print('P4')
group.insert(len(group), group.insert(len(group),
list(etree_target[0])[1]) list(etree_target[0])[1])
# group.append(list(etree_target[0])[1])
print('P5') print('P5')
text.insert(len(text), text.insert(len(text),
group) group)
# text.append(group)
print('P6') print('P6')
root.insert(len(root), root.insert(len(root),
tei_header) tei_header)
# root.append(tei_header)
print('P7') print('P7')
# root.append(text)
root.insert(len(root), root.insert(len(root),
text) text)
print('P8') print('P8')
# root.append(etree_links)
root.insert(len(root), root.insert(len(root),
etree_links) etree_links)
print('P9') print('P9')

View File

@ -5,8 +5,6 @@ from src.read.hand_fixes import HAND_FIXES, apply_obeliks_handfixes, SVALA_HAND_
def read_raw_text(path): def read_raw_text(path):
print(path) print(path)
# if path == "data/KOST/raw/L-1819-110.txt":
# print('here')
try: try:
with open(path, 'r', encoding='utf-8') as rf: with open(path, 'r', encoding='utf-8') as rf:
return rf.read() return rf.read()
@ -56,7 +54,6 @@ def map_svala_tokenized(svala_data_part, tokenized_paragraph, sent_i):
print('HAND_FIXES_MERGE:') print('HAND_FIXES_MERGE:')
print(f", ('{tok['text'][:len(key)]}', '{tok['text'][len(key):]}'): '{tok['text']}'") print(f", ('{tok['text'][:len(key)]}', '{tok['text'][len(key):]}'): '{tok['text']}'")
SVALA_HAND_FIXES_MERGE[(tok['text'][:len(key)], tok['text'][len(key):])] = tok['text'] SVALA_HAND_FIXES_MERGE[(tok['text'][:len(key)], tok['text'][len(key):])] = tok['text']
a = SVALA_HAND_FIXES_MERGE
else: else:
print('HAND_FIXES OLD:') print('HAND_FIXES OLD:')
print(f", '{key}': ['{key[:len(tok['text'])]}', '{key[len(tok['text']):]}']") print(f", '{key}': ['{key[:len(tok['text'])]}', '{key[len(tok['text']):]}']")
@ -65,10 +62,8 @@ def map_svala_tokenized(svala_data_part, tokenized_paragraph, sent_i):
reg = re.findall(r"[\w]+|[^\s\w]", key) reg = re.findall(r"[\w]+|[^\s\w]", key)
print(f", '{key}': {str(reg)}") print(f", '{key}': {str(reg)}")
# HAND_FIXES[key] = [key[:len(tok['text'])], key[len(tok['text']):]]
HAND_FIXES[key] = re.findall(r"[\w]+|[^\s\w]", key) HAND_FIXES[key] = re.findall(r"[\w]+|[^\s\w]", key)
print(f'key: {key} ; tok[text]: {tok["text"]}') print(f'key: {key} ; tok[text]: {tok["text"]}')
# raise ValueError('Word mismatch!')
if tok['text'] == HAND_FIXES[key][wierd_sign_count]: if tok['text'] == HAND_FIXES[key][wierd_sign_count]:
wierd_sign_count += 1 wierd_sign_count += 1

View File

@ -295,29 +295,14 @@ def tokenize(args):
return tokenized_source_divs, tokenized_target_divs, document_edges return tokenized_source_divs, tokenized_target_divs, document_edges
print('TOKENIZING...') print('TOKENIZING...')
# with open(args.solar_file, 'r') as fp:
# logging.info(args.solar_file)
# et = ElementTree.XML(fp.read())
nlp_tokenize = classla.Pipeline('sl', processors='tokenize', pos_lemma_pretag=True) nlp_tokenize = classla.Pipeline('sl', processors='tokenize', pos_lemma_pretag=True)
# filename_encountered = False
i = 0
tokenized_divs = {} tokenized_divs = {}
# tokenized_source_divs = {}
# tokenized_target_divs = {}
document_edges = []
text_filename = ''
all_js_filenames = [sorted(filenames) for folder, _, filenames in os.walk(args.svala_folder)][0] all_js_filenames = [sorted(filenames) for folder, _, filenames in os.walk(args.svala_folder)][0]
for text_folder, _, text_filenames in os.walk(args.raw_text): for text_folder, _, text_filenames in os.walk(args.raw_text):
text_filenames = sorted(text_filenames) text_filenames = sorted(text_filenames)
for text_filename_i, text_filename in enumerate(text_filenames): for text_filename_i, text_filename in enumerate(text_filenames):
# if filename_i*100/len(filenames) > 35:
# print('here')
# continue
text_file = read_raw_text(os.path.join(args.raw_text, text_filename)) text_file = read_raw_text(os.path.join(args.raw_text, text_filename))
raw_text, source_tokenized, metadocument = nlp_tokenize.processors['tokenize']._tokenizer.tokenize( raw_text, source_tokenized, metadocument = nlp_tokenize.processors['tokenize']._tokenizer.tokenize(
text_file) if text_file else ([], [], []) text_file) if text_file else ([], [], [])
@ -338,8 +323,6 @@ def tokenize(args):
apply_svala_handfixes(svala_data_object) apply_svala_handfixes(svala_data_object)
source_sent_i, source_res = map_svala_tokenized(svala_data_object.svala_data['source'], source_tokenized, source_sent_i) source_sent_i, source_res = map_svala_tokenized(svala_data_object.svala_data['source'], source_tokenized, source_sent_i)
# target_res = create_target(svala_data, source_tokenized)
target_res = create_target(svala_data_object, source_res) target_res = create_target(svala_data_object, source_res)
@ -366,8 +349,6 @@ def tokenize(args):
paragraph_edges = [] paragraph_edges = []
tokenized_source_paragraphs = [] tokenized_source_paragraphs = []
tokenized_target_paragraphs = [] tokenized_target_paragraphs = []
# par_source = []
# par_target = []
for tokenized_para in tokenized_divs[div_id]: for tokenized_para in tokenized_divs[div_id]:
paragraph_name, source_res, target_res, edges = tokenized_para paragraph_name, source_res, target_res, edges = tokenized_para
split_para_name = paragraph_name[:-5].split('-') split_para_name = paragraph_name[:-5].split('-')
@ -392,7 +373,6 @@ def tokenize(args):
target_conllu = create_conllu(sen, target_sen_name) target_conllu = create_conllu(sen, target_sen_name)
target_paragraphs.append(target_conllu) target_paragraphs.append(target_conllu)
sen_target.append((sen, target_sen_name)) sen_target.append((sen, target_sen_name))
# paragraph_edges.append(edges)
tokenized_source_paragraphs.append((par_name, source_paragraphs)) tokenized_source_paragraphs.append((par_name, source_paragraphs))
tokenized_target_paragraphs.append((par_name, target_paragraphs)) tokenized_target_paragraphs.append((par_name, target_paragraphs))
paragraph_edges.append(create_edges(edges, sen_source, sen_target)) paragraph_edges.append(create_edges(edges, sen_source, sen_target))

View File

@ -14,12 +14,9 @@ def form_paragraphs(annotated_source_divs, metadata):
for div_i, div_tuple in enumerate(annotated_source_divs): for div_i, div_tuple in enumerate(annotated_source_divs):
div_name, div = div_tuple div_name, div = div_tuple
if div_name[:-1] not in metadata: if div_name[:-1] not in metadata:
# print(div_name[:-1] + "!!!!!!!!!!!!!!!!!!")
print(div_name[:-1]) print(div_name[:-1])
continue continue
div_metadata = metadata[div_name[:-1]] div_metadata = metadata[div_name[:-1]]
# file_name = file_name.replace('/', '_')
# print(f'{i * 100 / folders_count} % : {file_name}')
etree_source_paragraphs = [] etree_source_paragraphs = []

View File

@ -246,13 +246,13 @@ def main(args):
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description='Read already processed xmls, erase entries without examples and limit gigafida examples to 1 per entry.') description='Merges svala data, raw data and metadata into TEI format (useful for corpora like KOST).')
parser.add_argument('--svala_folder', default='data/KOST/svala', parser.add_argument('--svala_folder', default='data/KOST/svala',
help='input file in (gz or xml currently). If none, then just database is loaded') help='Path to directory that contains svala files.')
parser.add_argument('--results_folder', default='data/KOST/results', parser.add_argument('--results_folder', default='data/KOST/results',
help='input file in (gz or xml currently). If none, then just database is loaded') help='Path to results directory.')
parser.add_argument('--raw_text', default='data/KOST/raw', parser.add_argument('--raw_text', default='data/KOST/raw',
help='input file in (gz or xml currently). If none, then just database is loaded') help='Path to directory that contains raw text files.')
parser.add_argument('--texts_metadata', default='data/KOST/texts_metadata5.csv', parser.add_argument('--texts_metadata', default='data/KOST/texts_metadata5.csv',
help='KOST metadata location') help='KOST metadata location')
parser.add_argument('--authors_metadata', default='data/KOST/authors_metadata5.csv', parser.add_argument('--authors_metadata', default='data/KOST/authors_metadata5.csv',
@ -260,13 +260,13 @@ if __name__ == '__main__':
parser.add_argument('--teachers_metadata', default='data/KOST/teachers_metadata.csv', parser.add_argument('--teachers_metadata', default='data/KOST/teachers_metadata.csv',
help='KOST teachers location') help='KOST teachers location')
parser.add_argument('--translations', default='data/KOST/translations.csv', parser.add_argument('--translations', default='data/KOST/translations.csv',
help='KOST Slovenian-English column names translations') help='KOST Slovenian-English column names translations for TEI metadata')
parser.add_argument('--tokenization_interprocessing', default='data/processing.tokenization', parser.add_argument('--tokenization_interprocessing', default='data/processing.tokenization',
help='input file in (gz or xml currently). If none, then just database is loaded') help='Path to file that containing tokenized data.')
parser.add_argument('--overwrite_tokenization', action='store_true', help='input file in (gz or xml currently). If none, then just database is loaded') parser.add_argument('--overwrite_tokenization', action='store_true', help='Force retokenization without having to manually delete tokenization file.')
parser.add_argument('--annotation_interprocessing', default='data/processing.annotation', parser.add_argument('--annotation_interprocessing', default='data/processing.annotation',
help='input file in (gz or xml currently). If none, then just database is loaded') help='Path to file that containing annotated data.')
parser.add_argument('--overwrite_annotation', action='store_true', help='input file in (gz or xml currently). If none, then just database is loaded') parser.add_argument('--overwrite_annotation', action='store_true', help='Force reannotation without having to manually delete tokenization file.')
args = parser.parse_args() args = parser.parse_args()
start = time.time() start = time.time()

View File

@ -47,11 +47,11 @@ def main(args):
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description='Read already processed xmls, erase entries without examples and limit gigafida examples to 1 per entry.') description='Converts raw text into svala format.')
parser.add_argument('--input_folder', default='data/txt/input', parser.add_argument('--input_folder', default='data/txt/input',
help='input file in (gz or xml currently). If none, then just database is loaded') help='Path to folder containing raw texts.')
parser.add_argument('--output_folder', default='data/txt/output', parser.add_argument('--output_folder', default='data/txt/output',
help='input file in (gz or xml currently). If none, then just database is loaded') help='Path to folder that will contain svala formatted texts.')
args = parser.parse_args() args = parser.parse_args()
start = time.time() start = time.time()