Multiple fixes for version KOST_0.2
This commit is contained in:
parent
3ceb706cef
commit
9216ee9a3b
|
@ -244,6 +244,48 @@ def create_target(svala_data_object, source_tokenized):
|
||||||
return target_tokenized
|
return target_tokenized
|
||||||
|
|
||||||
|
|
||||||
|
def fake_svala_data(source_tokenized):
    """Generate identity svala-style data from tokenized text alone.

    Each token is emitted once on the source side and once on the target
    side with the same text, tag and id, and the two copies are linked by a
    generated, unlabeled, non-manual one-to-one edge.

    Args:
        source_tokenized: Iterable of sentences, each an iterable of token
            mappings. Every token must provide ``'text'`` and an indexable
            ``'id'`` (only its first element is used); ``'xpos'`` and
            ``'misc'`` are optional.

    Returns:
        A tuple ``(source_res, target_res, generated_edges)``:
        ``source_res`` and ``target_res`` are lists of sentences (lists of
        token dicts with keys ``token``, ``tag``, ``id``, ``space_after``
        and ``svala_id``), and ``generated_edges`` maps the edge key
        ``'e-s<N>-t<N>'`` to its edge dict.

    Raises:
        KeyError: If a token lacks ``'text'`` or ``'id'``.
    """
    source_res, target_res, generated_edges = [], [], {}

    # A single counter for the whole input keeps the generated svala ids
    # unique across sentences, not just within one sentence.
    edge_id = 0
    for sent in source_tokenized:
        source_sent = []
        target_sent = []
        for tok in sent:
            source_svala_id = 's' + str(edge_id)
            target_svala_id = 't' + str(edge_id)
            edge_key = f'e-{source_svala_id}-{target_svala_id}'

            # Fields shared by the source and the target copy of the token.
            # Tokens whose xpos is 'Z' are tagged 'pc'; everything else
            # (including tokens without xpos) is tagged 'w'.
            # NOTE(review): 'space_after' is True exactly when misc equals
            # 'SpaceAfter=No', which reads inverted relative to its name;
            # behaviour is kept as-is - confirm against the consumers of
            # this flag before changing it.
            shared_fields = {
                'token': tok['text'],
                'tag': 'w' if 'xpos' not in tok or tok['xpos'] != 'Z' else 'pc',
                'id': tok['id'][0],
                'space_after': 'misc' in tok and tok['misc'] == 'SpaceAfter=No',
            }

            # Separate dict objects per side so later mutation of one side
            # cannot leak into the other.
            source_sent.append({**shared_fields, 'svala_id': source_svala_id})
            target_sent.append({**shared_fields, 'svala_id': target_svala_id})

            generated_edges[edge_key] = {
                'id': edge_key,
                'ids': [source_svala_id, target_svala_id],
                'labels': [],
                'manual': False,
                'source_ids': [source_svala_id],
                'target_ids': [target_svala_id],
            }
            edge_id += 1

        source_res.append(source_sent)
        target_res.append(target_sent)

    return source_res, target_res, generated_edges
|
||||||
|
|
||||||
|
|
||||||
def tokenize(args):
|
def tokenize(args):
|
||||||
if os.path.exists(args.tokenization_interprocessing) and not args.overwrite_tokenization:
|
if os.path.exists(args.tokenization_interprocessing) and not args.overwrite_tokenization:
|
||||||
print('READING TOKENIZATION...')
|
print('READING TOKENIZATION...')
|
||||||
|
@ -266,42 +308,54 @@ def tokenize(args):
|
||||||
|
|
||||||
text_filename = ''
|
text_filename = ''
|
||||||
|
|
||||||
for folder, _, filenames in os.walk(args.svala_folder):
|
all_js_filenames = [sorted(filenames) for folder, _, filenames in os.walk(args.svala_folder)][0]
|
||||||
filenames = sorted(filenames)
|
|
||||||
for filename_i, filename in enumerate(filenames):
|
for text_folder, _, text_filenames in os.walk(args.raw_text):
|
||||||
|
text_filenames = sorted(text_filenames)
|
||||||
|
for text_filename_i, text_filename in enumerate(text_filenames):
|
||||||
# if filename_i*100/len(filenames) > 35:
|
# if filename_i*100/len(filenames) > 35:
|
||||||
# print('here')
|
# print('here')
|
||||||
# continue
|
# continue
|
||||||
svala_path = os.path.join(folder, filename)
|
|
||||||
new_text_filename = '-'.join(filename[:-5].split('-')[:3]) + '.txt'
|
|
||||||
if text_filename != new_text_filename:
|
|
||||||
text_filename = new_text_filename
|
|
||||||
text_file = read_raw_text(os.path.join(args.raw_text, text_filename))
|
|
||||||
raw_text, source_tokenized, metadocument = nlp_tokenize.processors['tokenize']._tokenizer.tokenize(
|
|
||||||
text_file) if text_file else ([], [], [])
|
|
||||||
source_sent_i = 0
|
|
||||||
|
|
||||||
jf = open(svala_path, encoding='utf-8')
|
text_file = read_raw_text(os.path.join(args.raw_text, text_filename))
|
||||||
print(svala_path)
|
raw_text, source_tokenized, metadocument = nlp_tokenize.processors['tokenize']._tokenizer.tokenize(
|
||||||
svala_data = json.load(jf)
|
text_file) if text_file else ([], [], [])
|
||||||
jf.close()
|
source_sent_i = 0
|
||||||
|
|
||||||
svala_data_object = SvalaData(svala_data)
|
filenames = [filename for filename in all_js_filenames if filename.startswith(text_filename[:-4])]
|
||||||
|
# new_text_filename = '-'.join(filename[:-5].split('-')[:3]) + '.txt'
|
||||||
|
if filenames:
|
||||||
|
for filename in filenames:
|
||||||
|
svala_path = os.path.join(args.svala_folder, filename)
|
||||||
|
jf = open(svala_path, encoding='utf-8')
|
||||||
|
print(svala_path)
|
||||||
|
svala_data = json.load(jf)
|
||||||
|
jf.close()
|
||||||
|
|
||||||
apply_svala_handfixes(svala_data_object)
|
svala_data_object = SvalaData(svala_data)
|
||||||
|
|
||||||
source_sent_i, source_res = map_svala_tokenized(svala_data_object.svala_data['source'], source_tokenized, source_sent_i)
|
apply_svala_handfixes(svala_data_object)
|
||||||
# target_res = create_target(svala_data, source_tokenized)
|
|
||||||
|
source_sent_i, source_res = map_svala_tokenized(svala_data_object.svala_data['source'], source_tokenized, source_sent_i)
|
||||||
|
# target_res = create_target(svala_data, source_tokenized)
|
||||||
|
|
||||||
|
|
||||||
target_res = create_target(svala_data_object, source_res)
|
target_res = create_target(svala_data_object, source_res)
|
||||||
|
|
||||||
if text_filename not in tokenized_divs:
|
if text_filename not in tokenized_divs:
|
||||||
tokenized_divs[text_filename] = []
|
tokenized_divs[text_filename] = []
|
||||||
|
|
||||||
tokenized_divs[text_filename].append((filename, source_res, target_res, svala_data_object.svala_data['edges']))
|
tokenized_divs[text_filename].append((filename, source_res, target_res, svala_data_object.svala_data['edges']))
|
||||||
|
|
||||||
logging.info(f'Tokenizing at {filename_i*100/len(filenames)} %')
|
|
||||||
|
else:
|
||||||
|
filename = text_filename[:-4] + '.json'
|
||||||
|
source_res, target_res, generated_edges = fake_svala_data(source_tokenized)
|
||||||
|
if text_filename not in tokenized_divs:
|
||||||
|
tokenized_divs[text_filename] = []
|
||||||
|
tokenized_divs[text_filename].append((filename, source_res, target_res, generated_edges))
|
||||||
|
|
||||||
|
logging.info(f'Tokenizing at {text_filename_i * 100 / len(text_filenames)} %')
|
||||||
|
|
||||||
tokenized_source_divs = []
|
tokenized_source_divs = []
|
||||||
tokenized_target_divs = []
|
tokenized_target_divs = []
|
||||||
|
|
|
@ -14,7 +14,7 @@ def form_paragraphs(annotated_source_divs, metadata):
|
||||||
for div_i, div_tuple in enumerate(annotated_source_divs):
|
for div_i, div_tuple in enumerate(annotated_source_divs):
|
||||||
div_name, div = div_tuple
|
div_name, div = div_tuple
|
||||||
if div_name[:-1] not in metadata:
|
if div_name[:-1] not in metadata:
|
||||||
print(div_name[:-1] + "!!!!!!!!!!!!!!!!!!")
|
# print(div_name[:-1] + "!!!!!!!!!!!!!!!!!!")
|
||||||
print(div_name[:-1])
|
print(div_name[:-1])
|
||||||
continue
|
continue
|
||||||
div_metadata = metadata[div_name[:-1]]
|
div_metadata = metadata[div_name[:-1]]
|
||||||
|
@ -55,6 +55,23 @@ def read_metadata(args):
|
||||||
row_dict[column_names[j]] = content
|
row_dict[column_names[j]] = content
|
||||||
texts_metadata.append(row_dict)
|
texts_metadata.append(row_dict)
|
||||||
|
|
||||||
|
# handle teachers
|
||||||
|
teachers_metadata = {}
|
||||||
|
with open(args.teachers_metadata, 'r') as file:
|
||||||
|
csvreader = csv.reader(file, delimiter='\t', quotechar='"')
|
||||||
|
column_names = []
|
||||||
|
for i, row in enumerate(csvreader):
|
||||||
|
if i == 0:
|
||||||
|
column_names = row
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
row_dict = {}
|
||||||
|
for j, content in enumerate(row):
|
||||||
|
row_dict[column_names[j]] = content
|
||||||
|
row_dict['Ime in priimek'] = row_dict['Ime in priimek'].strip()
|
||||||
|
teachers_metadata[row_dict['Ime in priimek']] = row_dict
|
||||||
|
|
||||||
|
# handle authors
|
||||||
authors_metadata = {}
|
authors_metadata = {}
|
||||||
with open(args.authors_metadata, 'r') as file:
|
with open(args.authors_metadata, 'r') as file:
|
||||||
csvreader = csv.reader(file, delimiter='\t', quotechar='"')
|
csvreader = csv.reader(file, delimiter='\t', quotechar='"')
|
||||||
|
@ -86,11 +103,11 @@ def read_metadata(args):
|
||||||
for row in csvreader:
|
for row in csvreader:
|
||||||
translations[row[0]] = row[1]
|
translations[row[0]] = row[1]
|
||||||
|
|
||||||
return texts_metadata, authors_metadata, translations
|
return texts_metadata, authors_metadata, teachers_metadata, translations
|
||||||
|
|
||||||
|
|
||||||
def process_metadata(args):
|
def process_metadata(args):
|
||||||
texts_metadata, authors_metadata, translations = read_metadata(args)
|
texts_metadata, authors_metadata, teachers_metadata, translations = read_metadata(args)
|
||||||
|
|
||||||
metadata = {}
|
metadata = {}
|
||||||
for document_metadata in texts_metadata:
|
for document_metadata in texts_metadata:
|
||||||
|
@ -107,6 +124,8 @@ def process_metadata(args):
|
||||||
metadata_el[attribute_name_en] = f'{document_metadata[attribute_name_sl]} od {document_metadata["Najvišja možna ocena"]}'
|
metadata_el[attribute_name_en] = f'{document_metadata[attribute_name_sl]} od {document_metadata["Najvišja možna ocena"]}'
|
||||||
elif attribute_name_sl == 'Tvorec':
|
elif attribute_name_sl == 'Tvorec':
|
||||||
metadata_el[attribute_name_en] = author_metadata['Koda tvorca']
|
metadata_el[attribute_name_en] = author_metadata['Koda tvorca']
|
||||||
|
elif attribute_name_sl == 'Učitelj':
|
||||||
|
metadata_el[attribute_name_en] = teachers_metadata[document_metadata['Učitelj']]['Koda'] if document_metadata['Učitelj'] in teachers_metadata else None
|
||||||
else:
|
else:
|
||||||
metadata_el[attribute_name_en] = document_metadata[attribute_name_sl]
|
metadata_el[attribute_name_en] = document_metadata[attribute_name_sl]
|
||||||
elif attribute_name_sl in author_metadata:
|
elif attribute_name_sl in author_metadata:
|
||||||
|
@ -171,16 +190,16 @@ def write_tei(annotated_source_divs, annotated_target_divs, document_edges, args
|
||||||
etree_source = build_tei_etrees(etree_source_documents)
|
etree_source = build_tei_etrees(etree_source_documents)
|
||||||
etree_target = build_tei_etrees(etree_target_documents)
|
etree_target = build_tei_etrees(etree_target_documents)
|
||||||
|
|
||||||
print('Writting all but complete')
|
# print('Writting all but complete')
|
||||||
with open(os.path.join(args.results_folder, f"source.xml"), 'w') as sf:
|
# with open(os.path.join(args.results_folder, f"source.xml"), 'w') as sf:
|
||||||
sf.write(etree.tostring(etree_source[0], pretty_print=True, encoding='utf-8').decode())
|
# sf.write(etree.tostring(etree_source[0], pretty_print=True, encoding='utf-8').decode())
|
||||||
|
#
|
||||||
with open(os.path.join(args.results_folder, f"target.xml"), 'w') as tf:
|
# with open(os.path.join(args.results_folder, f"target.xml"), 'w') as tf:
|
||||||
tf.write(etree.tostring(etree_target[0], pretty_print=True, encoding='utf-8').decode())
|
# tf.write(etree.tostring(etree_target[0], pretty_print=True, encoding='utf-8').decode())
|
||||||
|
|
||||||
print('COMPLETE TREE CREATION...')
|
print('COMPLETE TREE CREATION...')
|
||||||
complete_etree = build_complete_tei(copy.deepcopy(etree_source), copy.deepcopy(etree_target), etree_links)
|
# complete_etree = build_complete_tei(copy.deepcopy(etree_source), copy.deepcopy(etree_target), etree_links)
|
||||||
# complete_etree = build_complete_tei(etree_source, etree_target, etree_links)
|
complete_etree = build_complete_tei(etree_source, etree_target, etree_links)
|
||||||
|
|
||||||
print('WRITING COMPLETE TREE')
|
print('WRITING COMPLETE TREE')
|
||||||
with open(os.path.join(args.results_folder, f"complete.xml"), 'w') as tf:
|
with open(os.path.join(args.results_folder, f"complete.xml"), 'w') as tf:
|
||||||
|
|
|
@ -257,6 +257,8 @@ if __name__ == '__main__':
|
||||||
help='KOST metadata location')
|
help='KOST metadata location')
|
||||||
parser.add_argument('--authors_metadata', default='data/KOST/authors_metadata2.csv',
|
parser.add_argument('--authors_metadata', default='data/KOST/authors_metadata2.csv',
|
||||||
help='KOST authors location')
|
help='KOST authors location')
|
||||||
|
parser.add_argument('--teachers_metadata', default='data/KOST/teachers_metadata.csv',
|
||||||
|
help='KOST teachers location')
|
||||||
parser.add_argument('--translations', default='data/KOST/translations.csv',
|
parser.add_argument('--translations', default='data/KOST/translations.csv',
|
||||||
help='KOST Slovenian-English column names translations')
|
help='KOST Slovenian-English column names translations')
|
||||||
parser.add_argument('--tokenization_interprocessing', default='data/processing.tokenization',
|
parser.add_argument('--tokenization_interprocessing', default='data/processing.tokenization',
|
||||||
|
|
Loading…
Reference in New Issue
Block a user