Fixed occurrences of impossible links + Other repairs

This commit is contained in:
Luka 2023-03-02 11:03:40 +01:00
parent 9216ee9a3b
commit 361cc14199
4 changed files with 43 additions and 31 deletions

View File

@ -330,17 +330,29 @@ def build_complete_tei(etree_source, etree_target, etree_links):
text = etree.Element('text') text = etree.Element('text')
group = etree.Element('group') group = etree.Element('group')
print('P3') print('P3')
group.append(list(etree_source[0])[1]) group.insert(len(group),
list(etree_source[0])[1])
# group.append(list(etree_source[0])[1])
print('P4') print('P4')
group.append(list(etree_target[0])[1]) group.insert(len(group),
list(etree_target[0])[1])
# group.append(list(etree_target[0])[1])
print('P5') print('P5')
text.append(group) text.insert(len(text),
group)
# text.append(group)
print('P6') print('P6')
root.append(tei_header) root.insert(len(root),
tei_header)
# root.append(tei_header)
print('P7') print('P7')
root.append(text) # root.append(text)
root.insert(len(root),
text)
print('P8') print('P8')
root.append(etree_links) # root.append(etree_links)
root.insert(len(root),
etree_links)
print('P9') print('P9')
return root return root
@ -349,34 +361,22 @@ def build_links(all_edges):
body = etree.Element('standOff') body = etree.Element('standOff')
for document_edges in all_edges: for document_edges in all_edges:
# if len(document_edges) > 1:
# print('here')
# mine paragraphs # mine paragraphs
for paragraph_edges in document_edges: for paragraph_edges in document_edges:
p = etree.Element('linkGrp') p = etree.Element('linkGrp')
paragraph_id = ''
corresp_source_id = '' corresp_source_id = ''
corresp_target_id = '' corresp_target_id = ''
corresp = []
# for sentence_edges in paragraph_edges:
#
for token_edges in paragraph_edges: for token_edges in paragraph_edges:
if not corresp_source_id and len(token_edges['source_ids']) > 0: if not corresp_source_id and len(token_edges['source_ids']) > 0:
random_source_id = token_edges['source_ids'][0] random_source_id = token_edges['source_ids'][0]
corresp_source_id = '#' corresp_source_id = '#'
# corresp_source_id += '.'.join(random_source_id.split('.')[:3])
corresp_source_id += '.'.join(random_source_id.split('.')[:2]) corresp_source_id += '.'.join(random_source_id.split('.')[:2])
corresp.append(corresp_source_id)
if not corresp_target_id and len(token_edges['target_ids']) > 0: if not corresp_target_id and len(token_edges['target_ids']) > 0:
random_target_id = token_edges['target_ids'][0] random_target_id = token_edges['target_ids'][0]
corresp_target_id = '#' corresp_target_id = '#'
corresp_target_id += '.'.join(random_target_id.split('.')[:2]) corresp_target_id += '.'.join(random_target_id.split('.')[:2])
# corresp_target_id += random_target_id.split('.')[0]
corresp.append(corresp_target_id)
link = etree.Element('link') link = etree.Element('link')
# translate labels # translate labels
labels_list = [] labels_list = []
@ -390,6 +390,11 @@ def build_links(all_edges):
link.set('target', ' '.join(['#' + source for source in token_edges['source_ids']] + ['#' + source for source in token_edges['target_ids']])) link.set('target', ' '.join(['#' + source for source in token_edges['source_ids']] + ['#' + source for source in token_edges['target_ids']]))
p.append(link) p.append(link)
corresp = []
if corresp_source_id:
corresp.append(corresp_source_id)
if corresp_target_id:
corresp.append(corresp_target_id)
p.set('type', 'CORR') p.set('type', 'CORR')
targFunc = [] targFunc = []
if corresp_source_id: if corresp_source_id:

View File

@ -238,7 +238,7 @@ def create_target(svala_data_object, source_tokenized):
target_tokenized.append(target_sent_tokenized) target_tokenized.append(target_sent_tokenized)
target_sent_tokenized = [] target_sent_tokenized = []
curr_sententence += 1 curr_sententence += 1
tok_i = 1 tok_i = 0
tok_i += 1 tok_i += 1
target_tokenized.append(target_sent_tokenized) target_tokenized.append(target_sent_tokenized)
return target_tokenized return target_tokenized
@ -256,18 +256,19 @@ def fake_svala_data(source_tokenized):
tok_tag = 'w' if 'xpos' not in tok or tok['xpos'] != 'Z' else 'pc' tok_tag = 'w' if 'xpos' not in tok or tok['xpos'] != 'Z' else 'pc'
source_svala_id = 's' + str(edge_id) source_svala_id = 's' + str(edge_id)
target_svala_id = 't' + str(edge_id) target_svala_id = 't' + str(edge_id)
space_after = not ('misc' in tok and tok['misc'] == 'SpaceAfter=No')
source_sent.append({ source_sent.append({
'token': tok['text'], 'token': tok['text'],
'tag': tok_tag, 'tag': tok_tag,
'id': tok_id, 'id': tok_id,
'space_after': 'misc' in tok and tok['misc'] == 'SpaceAfter=No', 'space_after': space_after,
'svala_id': source_svala_id 'svala_id': source_svala_id
}) })
target_sent.append({ target_sent.append({
'token': tok['text'], 'token': tok['text'],
'tag': tok_tag, 'tag': tok_tag,
'id': tok_id, 'id': tok_id,
'space_after': 'misc' in tok and tok['misc'] == 'SpaceAfter=No', 'space_after': space_after,
'svala_id': target_svala_id 'svala_id': target_svala_id
}) })
generated_edges[f'e-{source_svala_id}-{target_svala_id}'] = { generated_edges[f'e-{source_svala_id}-{target_svala_id}'] = {

View File

@ -43,7 +43,7 @@ def form_paragraphs(annotated_source_divs, metadata):
def read_metadata(args): def read_metadata(args):
texts_metadata = [] texts_metadata = []
with open(args.texts_metadata, 'r') as file: with open(args.texts_metadata, 'r') as file:
csvreader = csv.reader(file, delimiter='\t', quotechar='"') csvreader = csv.reader(file, delimiter='|', quotechar='"')
column_names = [] column_names = []
for i, row in enumerate(csvreader): for i, row in enumerate(csvreader):
if i == 0: if i == 0:
@ -52,7 +52,7 @@ def read_metadata(args):
else: else:
row_dict = {} row_dict = {}
for j, content in enumerate(row): for j, content in enumerate(row):
row_dict[column_names[j]] = content row_dict[column_names[j]] = content.strip()
texts_metadata.append(row_dict) texts_metadata.append(row_dict)
# handle teachers # handle teachers
@ -74,7 +74,7 @@ def read_metadata(args):
# handle authors # handle authors
authors_metadata = {} authors_metadata = {}
with open(args.authors_metadata, 'r') as file: with open(args.authors_metadata, 'r') as file:
csvreader = csv.reader(file, delimiter='\t', quotechar='"') csvreader = csv.reader(file, delimiter='|', quotechar='"')
column_names = [] column_names = []
for i, row in enumerate(csvreader): for i, row in enumerate(csvreader):
if i == 0: if i == 0:
@ -93,7 +93,7 @@ def read_metadata(args):
else: else:
row_dict = {} row_dict = {}
for j, content in enumerate(row): for j, content in enumerate(row):
row_dict[column_names[j]] = content row_dict[column_names[j]] = content.strip()
row_dict['Ime in priimek'] = row_dict['Ime in priimek'].strip() row_dict['Ime in priimek'] = row_dict['Ime in priimek'].strip()
authors_metadata[row_dict['Ime in priimek']] = row_dict authors_metadata[row_dict['Ime in priimek']] = row_dict
@ -121,7 +121,8 @@ def process_metadata(args):
for attribute_name_sl, attribute_name_en in translations.items(): for attribute_name_sl, attribute_name_en in translations.items():
if attribute_name_sl in document_metadata: if attribute_name_sl in document_metadata:
if attribute_name_sl == 'Ocena': if attribute_name_sl == 'Ocena':
metadata_el[attribute_name_en] = f'{document_metadata[attribute_name_sl]} od {document_metadata["Najvišja možna ocena"]}' grade = f'{document_metadata[attribute_name_sl]} od {document_metadata["Najvišja možna ocena"]}' if document_metadata[attribute_name_sl] and document_metadata["Najvišja možna ocena"] else ''
metadata_el[attribute_name_en] = grade
elif attribute_name_sl == 'Tvorec': elif attribute_name_sl == 'Tvorec':
metadata_el[attribute_name_en] = author_metadata['Koda tvorca'] metadata_el[attribute_name_en] = author_metadata['Koda tvorca']
elif attribute_name_sl == 'Učitelj': elif attribute_name_sl == 'Učitelj':
@ -131,7 +132,12 @@ def process_metadata(args):
elif attribute_name_sl in author_metadata: elif attribute_name_sl in author_metadata:
metadata_el[attribute_name_en] = author_metadata[attribute_name_sl] metadata_el[attribute_name_en] = author_metadata[attribute_name_sl]
elif attribute_name_sl == 'Ime šole, Fakulteta': elif attribute_name_sl == 'Ime šole, Fakulteta':
metadata_el['Current school'] = f'{author_metadata["Trenutno šolanje - Ime šole"]}, {author_metadata["Trenutno šolanje - Fakulteta"]}' curr_school = []
if author_metadata["Trenutno šolanje - Ime šole"]:
curr_school.append(author_metadata["Trenutno šolanje - Ime šole"])
if author_metadata["Trenutno šolanje - Fakulteta"]:
curr_school.append(author_metadata["Trenutno šolanje - Fakulteta"])
metadata_el['Current school'] = ', '.join(curr_school)
elif attribute_name_sl == 'Stopnja študija': elif attribute_name_sl == 'Stopnja študija':
metadata_el[attribute_name_en] = author_metadata['Trenutno šolanje - Stopnja študija'] metadata_el[attribute_name_en] = author_metadata['Trenutno šolanje - Stopnja študija']
elif attribute_name_sl == 'Leto študija': elif attribute_name_sl == 'Leto študija':

View File

@ -253,9 +253,9 @@ if __name__ == '__main__':
help='input file in (gz or xml currently). If none, then just database is loaded') help='input file in (gz or xml currently). If none, then just database is loaded')
parser.add_argument('--raw_text', default='data/KOST/raw', parser.add_argument('--raw_text', default='data/KOST/raw',
help='input file in (gz or xml currently). If none, then just database is loaded') help='input file in (gz or xml currently). If none, then just database is loaded')
parser.add_argument('--texts_metadata', default='data/KOST/texts_metadata2.csv', parser.add_argument('--texts_metadata', default='data/KOST/texts_metadata5.csv',
help='KOST metadata location') help='KOST metadata location')
parser.add_argument('--authors_metadata', default='data/KOST/authors_metadata2.csv', parser.add_argument('--authors_metadata', default='data/KOST/authors_metadata5.csv',
help='KOST authors location') help='KOST authors location')
parser.add_argument('--teachers_metadata', default='data/KOST/teachers_metadata.csv', parser.add_argument('--teachers_metadata', default='data/KOST/teachers_metadata.csv',
help='KOST teachers location') help='KOST teachers location')