From 735241d018bf965817bf54b835bbfb37a2517151 Mon Sep 17 00:00:00 2001
From: Luka
Date: Thu, 17 Aug 2023 09:16:15 +0200
Subject: [PATCH] Removed commented-out code and updated CLI documentation.

---
 src/create_tei.py          |  9 ---------
 src/read/read.py           |  5 -----
 src/read/read_and_merge.py | 20 --------------------
 src/write/write.py         |  3 ---
 svala2tei.py               | 18 +++++++++---------
 txt2svala.py               |  6 +++---
 6 files changed, 12 insertions(+), 49 deletions(-)

diff --git a/src/create_tei.py b/src/create_tei.py
index 166260b..70f3366 100755
--- a/src/create_tei.py
+++ b/src/create_tei.py
@@ -208,7 +208,6 @@ class Sentence:
 class Paragraph:
     def __init__(self, _id, _doc_id):
         self._id = _id if _id is not None else 'no-id'
-        # _doc_id += 's' if is_source else 't'
         self._doc_id = _doc_id if _doc_id is not None else ''
         self.sentences = []
 
@@ -290,14 +289,12 @@ def create_bibl(metadata):
         else:
             key = kost_translations[k]
             note.set('ana', f'#{key}')
-            # set_xml_attr(note, 'lang', 'sl')
             note.text = f'{v}'
             bibl.append(note)
     return bibl
 
 def convert_bibl(bibl):
     etree_bibl = etree.Element('bibl')
-    # etree_bibl.set('corresp', bibl.get('corresp'))
     etree_bibl.set('n', bibl.get('n'))
     for bibl_el in bibl:
         etree_bibl_el = etree.Element(bibl_el.tag)
@@ -332,25 +329,19 @@ def build_complete_tei(etree_source, etree_target, etree_links):
     print('P3')
     group.insert(len(group), list(etree_source[0])[1])
-    # group.append(list(etree_source[0])[1])
 
     print('P4')
     group.insert(len(group), list(etree_target[0])[1])
-    # group.append(list(etree_target[0])[1])
 
     print('P5')
     text.insert(len(text), group)
-    # text.append(group)
 
     print('P6')
     root.insert(len(root), tei_header)
-    # root.append(tei_header)
 
     print('P7')
-    # root.append(text)
     root.insert(len(root), text)
 
     print('P8')
-    # root.append(etree_links)
     root.insert(len(root), etree_links)
 
     print('P9')
diff --git a/src/read/read.py b/src/read/read.py
index 41e6f36..4877403 100755
--- a/src/read/read.py
+++ b/src/read/read.py
@@ -5,8 +5,6 @@ from src.read.hand_fixes import HAND_FIXES, apply_obeliks_handfixes, SVALA_HAND_FIXES_MERGE
 
 def read_raw_text(path):
     print(path)
-    # if path == "data/KOST/raw/L-1819-110.txt":
-    #     print('here')
     try:
         with open(path, 'r', encoding='utf-8') as rf:
             return rf.read()
@@ -56,7 +54,6 @@ def map_svala_tokenized(svala_data_part, tokenized_paragraph, sent_i):
                     print('HAND_FIXES_MERGE:')
                     print(f", ('{tok['text'][:len(key)]}', '{tok['text'][len(key):]}'): '{tok['text']}'")
                     SVALA_HAND_FIXES_MERGE[(tok['text'][:len(key)], tok['text'][len(key):])] = tok['text']
-                    a = SVALA_HAND_FIXES_MERGE
                 else:
                     print('HAND_FIXES OLD:')
                     print(f", '{key}': ['{key[:len(tok['text'])]}', '{key[len(tok['text']):]}']")
@@ -65,10 +62,8 @@ def map_svala_tokenized(svala_data_part, tokenized_paragraph, sent_i):
                         reg = re.findall(r"[\w]+|[^\s\w]", key)
                         print(f", '{key}': {str(reg)}")
 
-                    # HAND_FIXES[key] = [key[:len(tok['text'])], key[len(tok['text']):]]
                     HAND_FIXES[key] = re.findall(r"[\w]+|[^\s\w]", key)
                     print(f'key: {key} ; tok[text]: {tok["text"]}')
-                    # raise ValueError('Word mismatch!')
 
             if tok['text'] == HAND_FIXES[key][wierd_sign_count]:
                 wierd_sign_count += 1
diff --git a/src/read/read_and_merge.py b/src/read/read_and_merge.py
index 213d00a..9bb09ac 100755
--- a/src/read/read_and_merge.py
+++ b/src/read/read_and_merge.py
@@ -295,29 +295,14 @@ def tokenize(args):
         return tokenized_source_divs, tokenized_target_divs, document_edges
 
     print('TOKENIZING...')
-    # with open(args.solar_file, 'r') as fp:
-    #     logging.info(args.solar_file)
-    #     et = ElementTree.XML(fp.read())
     nlp_tokenize = classla.Pipeline('sl', processors='tokenize', pos_lemma_pretag=True)
-    #
     filename_encountered = False
-    i = 0
     tokenized_divs = {}
-    # tokenized_source_divs = {}
-    # tokenized_target_divs = {}
     document_edges = []
-
-    text_filename = ''
     all_js_filenames = [sorted(filenames) for folder, _, filenames in os.walk(args.svala_folder)][0]
 
     for text_folder, _, text_filenames in os.walk(args.raw_text):
         text_filenames = sorted(text_filenames)
         for text_filename_i, text_filename in enumerate(text_filenames):
-            # if filename_i*100/len(filenames) > 35:
-            #     print('here')
-            #     continue
-
             text_file = read_raw_text(os.path.join(args.raw_text, text_filename))
             raw_text, source_tokenized, metadocument = nlp_tokenize.processors['tokenize']._tokenizer.tokenize(
                 text_file) if text_file else ([], [], [])
@@ -338,8 +323,6 @@ def tokenize(args):
                     apply_svala_handfixes(svala_data_object)
 
                     source_sent_i, source_res = map_svala_tokenized(svala_data_object.svala_data['source'], source_tokenized, source_sent_i)
-                    # target_res = create_target(svala_data, source_tokenized)
-
                     target_res = create_target(svala_data_object, source_res)
 
 
@@ -366,8 +349,6 @@ def tokenize(args):
             paragraph_edges = []
             tokenized_source_paragraphs = []
             tokenized_target_paragraphs = []
-            # par_source = []
-            # par_target = []
             for tokenized_para in tokenized_divs[div_id]:
                 paragraph_name, source_res, target_res, edges = tokenized_para
                 split_para_name = paragraph_name[:-5].split('-')
@@ -392,7 +373,6 @@ def tokenize(args):
                     target_conllu = create_conllu(sen, target_sen_name)
                     target_paragraphs.append(target_conllu)
                     sen_target.append((sen, target_sen_name))
-                # paragraph_edges.append(edges)
             tokenized_source_paragraphs.append((par_name, source_paragraphs))
             tokenized_target_paragraphs.append((par_name, target_paragraphs))
             paragraph_edges.append(create_edges(edges, sen_source, sen_target))
diff --git a/src/write/write.py b/src/write/write.py
index 9a472df..0eb4948 100755
--- a/src/write/write.py
+++ b/src/write/write.py
@@ -14,12 +14,9 @@ def form_paragraphs(annotated_source_divs, metadata):
     for div_i, div_tuple in enumerate(annotated_source_divs):
         div_name, div = div_tuple
         if div_name[:-1] not in metadata:
-            # print(div_name[:-1] + "!!!!!!!!!!!!!!!!!!")
            print(div_name[:-1])
            continue
         div_metadata = metadata[div_name[:-1]]
-        # file_name = file_name.replace('/', '_')
-        # print(f'{i * 100 / folders_count} % : {file_name}')
 
 
         etree_source_paragraphs = []
diff --git a/svala2tei.py b/svala2tei.py
index 52aad64..789c1e1 100755
--- a/svala2tei.py
+++ b/svala2tei.py
@@ -246,13 +246,13 @@ def main(args):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(
-        description='Read already processed xmls, erase entries without examples and limit gigafida examples to 1 per entry.')
+        description='Merges svala data, raw data and metadata into TEI format (useful for corpora like KOST).')
     parser.add_argument('--svala_folder', default='data/KOST/svala',
-                        help='input file in (gz or xml currently). If none, then just database is loaded')
+                        help='Path to directory that contains svala files.')
     parser.add_argument('--results_folder', default='data/KOST/results',
-                        help='input file in (gz or xml currently). If none, then just database is loaded')
+                        help='Path to results directory.')
     parser.add_argument('--raw_text', default='data/KOST/raw',
-                        help='input file in (gz or xml currently). If none, then just database is loaded')
+                        help='Path to directory that contains raw text files.')
     parser.add_argument('--texts_metadata', default='data/KOST/texts_metadata5.csv',
                         help='KOST metadata location')
     parser.add_argument('--authors_metadata', default='data/KOST/authors_metadata5.csv',
@@ -260,13 +260,13 @@ if __name__ == '__main__':
     parser.add_argument('--teachers_metadata', default='data/KOST/teachers_metadata.csv',
                         help='KOST teachers location')
     parser.add_argument('--translations', default='data/KOST/translations.csv',
-                        help='KOST Slovenian-English column names translations')
+                        help='KOST Slovenian-English column names translations for TEI metadata')
     parser.add_argument('--tokenization_interprocessing', default='data/processing.tokenization',
-                        help='input file in (gz or xml currently). If none, then just database is loaded')
+                        help='Path to file containing tokenized data.')
-    parser.add_argument('--overwrite_tokenization', action='store_true', help='input file in (gz or xml currently). If none, then just database is loaded')
+    parser.add_argument('--overwrite_tokenization', action='store_true', help='Force retokenization without having to manually delete the tokenization file.')
     parser.add_argument('--annotation_interprocessing', default='data/processing.annotation',
-                        help='input file in (gz or xml currently). If none, then just database is loaded')
+                        help='Path to file containing annotated data.')
-    parser.add_argument('--overwrite_annotation', action='store_true', help='input file in (gz or xml currently). If none, then just database is loaded')
+    parser.add_argument('--overwrite_annotation', action='store_true', help='Force reannotation without having to manually delete the annotation file.')
 
     args = parser.parse_args()
     start = time.time()
diff --git a/txt2svala.py b/txt2svala.py
index 2ffb640..fb04b98 100755
--- a/txt2svala.py
+++ b/txt2svala.py
@@ -47,11 +47,11 @@ def main(args):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(
-        description='Read already processed xmls, erase entries without examples and limit gigafida examples to 1 per entry.')
+        description='Converts raw text into svala format.')
     parser.add_argument('--input_folder', default='data/txt/input',
-                        help='input file in (gz or xml currently). If none, then just database is loaded')
+                        help='Path to folder containing raw texts.')
     parser.add_argument('--output_folder', default='data/txt/output',
-                        help='input file in (gz or xml currently). If none, then just database is loaded')
+                        help='Path to folder that will contain svala-formatted texts.')
 
     args = parser.parse_args()
     start = time.time()
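
With these help texts in place, a typical KOST conversion run, using the
default paths from the argparse definitions above (adjust them to your own
data layout), is:

    python txt2svala.py --input_folder data/txt/input --output_folder data/txt/output
    python svala2tei.py --svala_folder data/KOST/svala --raw_text data/KOST/raw --results_folder data/KOST/results

Both commands use only the arguments documented in this patch; add
--overwrite_tokenization or --overwrite_annotation to rerun the cached
tokenization and annotation stages instead of deleting the processing files
by hand.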