Updated documentation.
parent 4793c8e4bd
commit 735241d018
@@ -208,7 +208,6 @@ class Sentence:

class Paragraph:
    def __init__(self, _id, _doc_id):
        self._id = _id if _id is not None else 'no-id'
        # _doc_id += 's' if is_source else 't'
        self._doc_id = _doc_id if _doc_id is not None else ''
        self.sentences = []
@@ -290,14 +289,12 @@ def create_bibl(metadata):
        else:
            key = kost_translations[k]
        note.set('ana', f'#{key}')
        # set_xml_attr(note, 'lang', 'sl')
        note.text = f'{v}'
        bibl.append(note)
    return bibl


def convert_bibl(bibl):
    etree_bibl = etree.Element('bibl')
    # etree_bibl.set('corresp', bibl.get('corresp'))
    etree_bibl.set('n', bibl.get('n'))
    for bibl_el in bibl:
        etree_bibl_el = etree.Element(bibl_el.tag)
@@ -332,25 +329,19 @@ def build_complete_tei(etree_source, etree_target, etree_links):
    print('P3')
    group.insert(len(group),
                 list(etree_source[0])[1])
    # group.append(list(etree_source[0])[1])
    print('P4')
    group.insert(len(group),
                 list(etree_target[0])[1])
    # group.append(list(etree_target[0])[1])
    print('P5')
    text.insert(len(text),
                group)
    # text.append(group)
    print('P6')
    root.insert(len(root),
                tei_header)
    # root.append(tei_header)
    print('P7')
    # root.append(text)
    root.insert(len(root),
                text)
    print('P8')
    # root.append(etree_links)
    root.insert(len(root),
                etree_links)
    print('P9')
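The hunk above keeps the element.insert(len(element), child) spelling and leaves element.append(child) commented out. In lxml both calls attach the child at the end of the parent, so the two spellings land in the same place; a small self-contained sketch (element names are illustrative):

import lxml.etree as etree

root = etree.Element('TEI')
tei_header = etree.Element('teiHeader')
text = etree.Element('text')

root.insert(len(root), tei_header)      # lands at the end, same as root.append(tei_header)
root.append(text)
print([child.tag for child in root])    # ['teiHeader', 'text']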
@@ -5,8 +5,6 @@ from src.read.hand_fixes import HAND_FIXES, apply_obeliks_handfixes, SVALA_HAND_

def read_raw_text(path):
    print(path)
    # if path == "data/KOST/raw/L-1819-110.txt":
    #     print('here')
    try:
        with open(path, 'r', encoding='utf-8') as rf:
            return rf.read()
@@ -56,7 +54,6 @@ def map_svala_tokenized(svala_data_part, tokenized_paragraph, sent_i):
                print('HAND_FIXES_MERGE:')
                print(f", ('{tok['text'][:len(key)]}', '{tok['text'][len(key):]}'): '{tok['text']}'")
                SVALA_HAND_FIXES_MERGE[(tok['text'][:len(key)], tok['text'][len(key):])] = tok['text']
                a = SVALA_HAND_FIXES_MERGE
            else:
                print('HAND_FIXES OLD:')
                print(f", '{key}': ['{key[:len(tok['text'])]}', '{key[len(tok['text']):]}']")
@@ -65,10 +62,8 @@ def map_svala_tokenized(svala_data_part, tokenized_paragraph, sent_i):
                reg = re.findall(r"[\w]+|[^\s\w]", key)
                print(f", '{key}': {str(reg)}")

                # HAND_FIXES[key] = [key[:len(tok['text'])], key[len(tok['text']):]]
                HAND_FIXES[key] = re.findall(r"[\w]+|[^\s\w]", key)
                print(f'key: {key} ; tok[text]: {tok["text"]}')
                # raise ValueError('Word mismatch!')

        if tok['text'] == HAND_FIXES[key][wierd_sign_count]:
            wierd_sign_count += 1
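The fallback above derives a hand-fix entry by splitting the mismatched key into word and punctuation chunks with re.findall(r"[\w]+|[^\s\w]", key). A quick, self-contained illustration of what that regex yields (the example string is made up):

import re

key = 'npr.pri'                            # made-up mismatched token
print(re.findall(r"[\w]+|[^\s\w]", key))   # ['npr', '.', 'pri']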
@@ -295,29 +295,14 @@ def tokenize(args):
        return tokenized_source_divs, tokenized_target_divs, document_edges

    print('TOKENIZING...')
    # with open(args.solar_file, 'r') as fp:
    #     logging.info(args.solar_file)
    #     et = ElementTree.XML(fp.read())

    nlp_tokenize = classla.Pipeline('sl', processors='tokenize', pos_lemma_pretag=True)
    # filename_encountered = False
    i = 0
    tokenized_divs = {}
    # tokenized_source_divs = {}
    # tokenized_target_divs = {}
    document_edges = []

    text_filename = ''

    all_js_filenames = [sorted(filenames) for folder, _, filenames in os.walk(args.svala_folder)][0]

    for text_folder, _, text_filenames in os.walk(args.raw_text):
        text_filenames = sorted(text_filenames)
        for text_filename_i, text_filename in enumerate(text_filenames):
            # if filename_i*100/len(filenames) > 35:
            #     print('here')
            #     continue

            text_file = read_raw_text(os.path.join(args.raw_text, text_filename))
            raw_text, source_tokenized, metadocument = nlp_tokenize.processors['tokenize']._tokenizer.tokenize(
                text_file) if text_file else ([], [], [])
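The pipeline set up in this hunk is classla's Slovenian tokenizer; the script then reaches into the private processors['tokenize']._tokenizer to get the raw tokenized structure. For context, the public API looks roughly like this (a sketch, assuming the 'sl' models are already downloaded; the document attributes follow the stanza-style interface classla inherits):

import classla

# classla.download('sl')  # one-time model download
nlp = classla.Pipeline('sl', processors='tokenize', pos_lemma_pretag=True)
doc = nlp('To je poskusni stavek. In še en.')
for sentence in doc.sentences:
    print([token.text for token in sentence.tokens])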
@@ -338,8 +323,6 @@ def tokenize(args):
            apply_svala_handfixes(svala_data_object)

            source_sent_i, source_res = map_svala_tokenized(svala_data_object.svala_data['source'], source_tokenized, source_sent_i)
            # target_res = create_target(svala_data, source_tokenized)

            target_res = create_target(svala_data_object, source_res)
@@ -366,8 +349,6 @@ def tokenize(args):
        paragraph_edges = []
        tokenized_source_paragraphs = []
        tokenized_target_paragraphs = []
        # par_source = []
        # par_target = []
        for tokenized_para in tokenized_divs[div_id]:
            paragraph_name, source_res, target_res, edges = tokenized_para
            split_para_name = paragraph_name[:-5].split('-')
@@ -392,7 +373,6 @@ def tokenize(args):
                target_conllu = create_conllu(sen, target_sen_name)
                target_paragraphs.append(target_conllu)
                sen_target.append((sen, target_sen_name))
            # paragraph_edges.append(edges)
            tokenized_source_paragraphs.append((par_name, source_paragraphs))
            tokenized_target_paragraphs.append((par_name, target_paragraphs))
            paragraph_edges.append(create_edges(edges, sen_source, sen_target))
@@ -14,12 +14,9 @@ def form_paragraphs(annotated_source_divs, metadata):
    for div_i, div_tuple in enumerate(annotated_source_divs):
        div_name, div = div_tuple
        if div_name[:-1] not in metadata:
            # print(div_name[:-1] + "!!!!!!!!!!!!!!!!!!")
            print(div_name[:-1])
            continue
        div_metadata = metadata[div_name[:-1]]
        # file_name = file_name.replace('/', '_')
        # print(f'{i * 100 / folders_count} % : {file_name}')

        etree_source_paragraphs = []
svala2tei.py (18 changed lines)
@@ -246,13 +246,13 @@ def main(args):

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
-        description='Read already processed xmls, erase entries without examples and limit gigafida examples to 1 per entry.')
+        description='Merges svala data, raw data and metadata into TEI format (useful for corpora like KOST).')
    parser.add_argument('--svala_folder', default='data/KOST/svala',
-                        help='input file in (gz or xml currently). If none, then just database is loaded')
+                        help='Path to directory that contains svala files.')
    parser.add_argument('--results_folder', default='data/KOST/results',
-                        help='input file in (gz or xml currently). If none, then just database is loaded')
+                        help='Path to results directory.')
    parser.add_argument('--raw_text', default='data/KOST/raw',
-                        help='input file in (gz or xml currently). If none, then just database is loaded')
+                        help='Path to directory that contains raw text files.')
    parser.add_argument('--texts_metadata', default='data/KOST/texts_metadata5.csv',
                        help='KOST metadata location')
    parser.add_argument('--authors_metadata', default='data/KOST/authors_metadata5.csv',
@@ -260,13 +260,13 @@ if __name__ == '__main__':
    parser.add_argument('--teachers_metadata', default='data/KOST/teachers_metadata.csv',
                        help='KOST teachers location')
    parser.add_argument('--translations', default='data/KOST/translations.csv',
-                        help='KOST Slovenian-English column names translations')
+                        help='KOST Slovenian-English column names translations for TEI metadata')
    parser.add_argument('--tokenization_interprocessing', default='data/processing.tokenization',
-                        help='input file in (gz or xml currently). If none, then just database is loaded')
-    parser.add_argument('--overwrite_tokenization', action='store_true', help='input file in (gz or xml currently). If none, then just database is loaded')
+                        help='Path to file that containing tokenized data.')
+    parser.add_argument('--overwrite_tokenization', action='store_true', help='Force retokenization without having to manually delete tokenization file.')
    parser.add_argument('--annotation_interprocessing', default='data/processing.annotation',
-                        help='input file in (gz or xml currently). If none, then just database is loaded')
-    parser.add_argument('--overwrite_annotation', action='store_true', help='input file in (gz or xml currently). If none, then just database is loaded')
+                        help='Path to file that containing annotated data.')
+    parser.add_argument('--overwrite_annotation', action='store_true', help='Force reannotation without having to manually delete tokenization file.')
    args = parser.parse_args()

    start = time.time()
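With the new help strings, each flag is readable straight from --help. A minimal sketch of how two of the documented flags parse (argument names and defaults taken from the hunk above; the list passed to parse_args stands in for a real command line):

import argparse

parser = argparse.ArgumentParser(
    description='Merges svala data, raw data and metadata into TEI format (useful for corpora like KOST).')
parser.add_argument('--svala_folder', default='data/KOST/svala',
                    help='Path to directory that contains svala files.')
parser.add_argument('--overwrite_tokenization', action='store_true',
                    help='Force retokenization without having to manually delete tokenization file.')

args = parser.parse_args(['--overwrite_tokenization'])
print(args.svala_folder, args.overwrite_tokenization)   # data/KOST/svala True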
|
@ -47,11 +47,11 @@ def main(args):
|
|||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Read already processed xmls, erase entries without examples and limit gigafida examples to 1 per entry.')
|
||||
description='Converts raw text into svala format.')
|
||||
parser.add_argument('--input_folder', default='data/txt/input',
|
||||
help='input file in (gz or xml currently). If none, then just database is loaded')
|
||||
help='Path to folder containing raw texts.')
|
||||
parser.add_argument('--output_folder', default='data/txt/output',
|
||||
help='input file in (gz or xml currently). If none, then just database is loaded')
|
||||
help='Path to folder that will contain svala formatted texts.')
|
||||
args = parser.parse_args()
|
||||
|
||||
start = time.time()
|
||||
|
|