Fixed occurrences of impossible links + Other repairs
parent 9216ee9a3b
commit 361cc14199
@@ -330,17 +330,29 @@ def build_complete_tei(etree_source, etree_target, etree_links):
     text = etree.Element('text')
     group = etree.Element('group')
     print('P3')
-    group.append(list(etree_source[0])[1])
+    group.insert(len(group),
+                 list(etree_source[0])[1])
+    # group.append(list(etree_source[0])[1])
     print('P4')
-    group.append(list(etree_target[0])[1])
+    group.insert(len(group),
+                 list(etree_target[0])[1])
+    # group.append(list(etree_target[0])[1])
     print('P5')
-    text.append(group)
+    text.insert(len(text),
+                group)
+    # text.append(group)
     print('P6')
-    root.append(tei_header)
+    root.insert(len(root),
+                tei_header)
+    # root.append(tei_header)
     print('P7')
-    root.append(text)
+    # root.append(text)
+    root.insert(len(root),
+                text)
     print('P8')
-    root.append(etree_links)
+    # root.append(etree_links)
+    root.insert(len(root),
+                etree_links)
     print('P9')
     return root

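Note on the hunk above: in lxml, parent.insert(len(parent), child) places the child at the end of the parent's children, so it behaves the same as parent.append(child). The sketch below only illustrates that equivalence and is not part of the commit.

    # Minimal sketch (not from the commit): insert at index len(parent) == append.
    from lxml import etree

    root = etree.Element('root')
    etree.SubElement(root, 'a')           # first child
    b = etree.Element('b')
    root.insert(len(root), b)             # lands in the same position as root.append(b)
    assert [child.tag for child in root] == ['a', 'b']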
@@ -349,34 +361,22 @@ def build_links(all_edges):
     body = etree.Element('standOff')

     for document_edges in all_edges:


-
-        # if len(document_edges) > 1:
-        # print('here')
-
         # mine paragraphs
         for paragraph_edges in document_edges:
             p = etree.Element('linkGrp')
-            paragraph_id = ''
             corresp_source_id = ''
             corresp_target_id = ''
-            corresp = []
-            # for sentence_edges in paragraph_edges:
-            #
             for token_edges in paragraph_edges:
                 if not corresp_source_id and len(token_edges['source_ids']) > 0:
                     random_source_id = token_edges['source_ids'][0]
                     corresp_source_id = '#'
-                    # corresp_source_id += '.'.join(random_source_id.split('.')[:3])
                     corresp_source_id += '.'.join(random_source_id.split('.')[:2])
-                    corresp.append(corresp_source_id)
                 if not corresp_target_id and len(token_edges['target_ids']) > 0:
                     random_target_id = token_edges['target_ids'][0]
                     corresp_target_id = '#'
                     corresp_target_id += '.'.join(random_target_id.split('.')[:2])
-                    # corresp_target_id += random_target_id.split('.')[0]
-                    corresp.append(corresp_target_id)
                 link = etree.Element('link')
                 # translate labels
                 labels_list = []
@@ -390,6 +390,11 @@ def build_links(all_edges):
                 link.set('target', ' '.join(['#' + source for source in token_edges['source_ids']] + ['#' + source for source in token_edges['target_ids']]))

                 p.append(link)
+            corresp = []
+            if corresp_source_id:
+                corresp.append(corresp_source_id)
+            if corresp_target_id:
+                corresp.append(corresp_target_id)
             p.set('type', 'CORR')
             targFunc = []
             if corresp_source_id:
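The two hunks above move the corresp bookkeeping out of the token loop: a reference is now recorded only after the whole paragraph has been scanned, and only for the side (source or target) that actually produced an id, which appears to be the fix for the impossible links named in the commit title. A minimal sketch of the pattern, with hypothetical ids, follows.

    # Sketch with hypothetical ids (not from the commit): keep only ids that were found.
    corresp_source_id = '#source.12'   # '' when the paragraph has no source tokens
    corresp_target_id = ''             # e.g. the paragraph was deleted in the target

    corresp = []
    if corresp_source_id:
        corresp.append(corresp_source_id)
    if corresp_target_id:
        corresp.append(corresp_target_id)

    assert corresp == ['#source.12']   # no dangling reference for the missing side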
@@ -238,7 +238,7 @@ def create_target(svala_data_object, source_tokenized):
             target_tokenized.append(target_sent_tokenized)
             target_sent_tokenized = []
             curr_sententence += 1
-            tok_i = 1
+            tok_i = 0
         tok_i += 1
     target_tokenized.append(target_sent_tokenized)
     return target_tokenized
@@ -256,18 +256,19 @@ def fake_svala_data(source_tokenized):
             tok_tag = 'w' if 'xpos' not in tok or tok['xpos'] != 'Z' else 'pc'
             source_svala_id = 's' + str(edge_id)
             target_svala_id = 't' + str(edge_id)
+            space_after = not ('misc' in tok and tok['misc'] == 'SpaceAfter=No')
             source_sent.append({
                 'token': tok['text'],
                 'tag': tok_tag,
                 'id': tok_id,
-                'space_after': 'misc' in tok and tok['misc'] == 'SpaceAfter=No',
+                'space_after': space_after,
                 'svala_id': source_svala_id
             })
             target_sent.append({
                 'token': tok['text'],
                 'tag': tok_tag,
                 'id': tok_id,
-                'space_after': 'misc' in tok and tok['misc'] == 'SpaceAfter=No',
+                'space_after': space_after,
                 'svala_id': target_svala_id
             })
             generated_edges[f'e-{source_svala_id}-{target_svala_id}'] = {
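The space_after change above follows the CoNLL-U convention: SpaceAfter=No in the MISC column means the token is not followed by a space, so the stored flag has to be the negation of that test (the old code stored the test itself, which inverted the intended meaning). A small sketch with illustrative token dicts:

    # Sketch (illustrative token dicts): SpaceAfter=No => no space follows the token.
    def space_after(tok):
        return not ('misc' in tok and tok['misc'] == 'SpaceAfter=No')

    assert space_after({'text': '(', 'misc': 'SpaceAfter=No'}) is False
    assert space_after({'text': 'beseda'}) is True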
@@ -43,7 +43,7 @@ def form_paragraphs(annotated_source_divs, metadata):
 def read_metadata(args):
     texts_metadata = []
     with open(args.texts_metadata, 'r') as file:
-        csvreader = csv.reader(file, delimiter='\t', quotechar='"')
+        csvreader = csv.reader(file, delimiter='|', quotechar='"')
         column_names = []
         for i, row in enumerate(csvreader):
             if i == 0:
@@ -52,7 +52,7 @@ def read_metadata(args):
             else:
                 row_dict = {}
                 for j, content in enumerate(row):
-                    row_dict[column_names[j]] = content
+                    row_dict[column_names[j]] = content.strip()
                 texts_metadata.append(row_dict)

     # handle teachers
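The metadata readers above now expect pipe-delimited files and strip whitespace from every cell. A self-contained sketch with a made-up two-column table (not the real KOST metadata) shows the effect:

    # Sketch with a made-up pipe-delimited table (not the real KOST metadata).
    import csv
    import io

    fake_file = io.StringIO('ID | Ocena\nKOST-001 | 8\n')
    csvreader = csv.reader(fake_file, delimiter='|', quotechar='"')
    rows = list(csvreader)
    column_names = [name.strip() for name in rows[0]]
    row_dict = {column_names[j]: content.strip() for j, content in enumerate(rows[1])}
    assert row_dict == {'ID': 'KOST-001', 'Ocena': '8'}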
@@ -74,7 +74,7 @@ def read_metadata(args):
     # handle authors
     authors_metadata = {}
     with open(args.authors_metadata, 'r') as file:
-        csvreader = csv.reader(file, delimiter='\t', quotechar='"')
+        csvreader = csv.reader(file, delimiter='|', quotechar='"')
         column_names = []
         for i, row in enumerate(csvreader):
             if i == 0:
@@ -93,7 +93,7 @@ def read_metadata(args):
             else:
                 row_dict = {}
                 for j, content in enumerate(row):
-                    row_dict[column_names[j]] = content
+                    row_dict[column_names[j]] = content.strip()
                 row_dict['Ime in priimek'] = row_dict['Ime in priimek'].strip()
                 authors_metadata[row_dict['Ime in priimek']] = row_dict

@@ -121,7 +121,8 @@ def process_metadata(args):
         for attribute_name_sl, attribute_name_en in translations.items():
             if attribute_name_sl in document_metadata:
                 if attribute_name_sl == 'Ocena':
-                    metadata_el[attribute_name_en] = f'{document_metadata[attribute_name_sl]} od {document_metadata["Najvišja možna ocena"]}'
+                    grade = f'{document_metadata[attribute_name_sl]} od {document_metadata["Najvišja možna ocena"]}' if document_metadata[attribute_name_sl] and document_metadata["Najvišja možna ocena"] else ''
+                    metadata_el[attribute_name_en] = grade
                 elif attribute_name_sl == 'Tvorec':
                     metadata_el[attribute_name_en] = author_metadata['Koda tvorca']
                 elif attribute_name_sl == 'Učitelj':
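With the change above, the grade string is produced only when both the score and the maximum score are filled in, so documents without a grade no longer end up with a stray ' od ' value. A minimal sketch with made-up metadata:

    # Sketch with made-up metadata: emit the grade only when both parts are present.
    document_metadata = {'Ocena': '', 'Najvišja možna ocena': '10'}
    grade = (f"{document_metadata['Ocena']} od {document_metadata['Najvišja možna ocena']}"
             if document_metadata['Ocena'] and document_metadata['Najvišja možna ocena']
             else '')
    assert grade == ''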
@@ -131,7 +132,12 @@ def process_metadata(args):
             elif attribute_name_sl in author_metadata:
                 metadata_el[attribute_name_en] = author_metadata[attribute_name_sl]
             elif attribute_name_sl == 'Ime šole, Fakulteta':
-                metadata_el['Current school'] = f'{author_metadata["Trenutno šolanje - Ime šole"]}, {author_metadata["Trenutno šolanje - Fakulteta"]}'
+                curr_school = []
+                if author_metadata["Trenutno šolanje - Ime šole"]:
+                    curr_school.append(author_metadata["Trenutno šolanje - Ime šole"])
+                if author_metadata["Trenutno šolanje - Fakulteta"]:
+                    curr_school.append(author_metadata["Trenutno šolanje - Fakulteta"])
+                metadata_el['Current school'] = ', '.join(curr_school)
             elif attribute_name_sl == 'Stopnja študija':
                 metadata_el[attribute_name_en] = author_metadata['Trenutno šolanje - Stopnja študija']
             elif attribute_name_sl == 'Leto študija':
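The same guard is applied to the current-school field above: only the parts that are present are joined, so a missing faculty no longer leaves a dangling ', '. A short sketch with made-up author metadata:

    # Sketch with made-up author metadata: join only the non-empty parts.
    author_metadata = {'Trenutno šolanje - Ime šole': 'Univerza v Ljubljani',
                       'Trenutno šolanje - Fakulteta': ''}
    curr_school = []
    if author_metadata['Trenutno šolanje - Ime šole']:
        curr_school.append(author_metadata['Trenutno šolanje - Ime šole'])
    if author_metadata['Trenutno šolanje - Fakulteta']:
        curr_school.append(author_metadata['Trenutno šolanje - Fakulteta'])
    assert ', '.join(curr_school) == 'Univerza v Ljubljani'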
@@ -253,9 +253,9 @@ if __name__ == '__main__':
                         help='input file in (gz or xml currently). If none, then just database is loaded')
     parser.add_argument('--raw_text', default='data/KOST/raw',
                         help='input file in (gz or xml currently). If none, then just database is loaded')
-    parser.add_argument('--texts_metadata', default='data/KOST/texts_metadata2.csv',
+    parser.add_argument('--texts_metadata', default='data/KOST/texts_metadata5.csv',
                         help='KOST metadata location')
-    parser.add_argument('--authors_metadata', default='data/KOST/authors_metadata2.csv',
+    parser.add_argument('--authors_metadata', default='data/KOST/authors_metadata5.csv',
                         help='KOST authors location')
     parser.add_argument('--teachers_metadata', default='data/KOST/teachers_metadata.csv',
                         help='KOST teachers location')
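The last hunk only bumps the default metadata paths to the *_metadata5.csv files; the stripped-down parser below (a stand-in, not the script's full argument list) shows how those defaults resolve when no flags are passed.

    # Stand-in parser (assumed wiring): with an empty argv the new defaults are used.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--texts_metadata', default='data/KOST/texts_metadata5.csv')
    parser.add_argument('--authors_metadata', default='data/KOST/authors_metadata5.csv')
    parser.add_argument('--teachers_metadata', default='data/KOST/teachers_metadata.csv')
    args = parser.parse_args([])
    assert args.texts_metadata.endswith('texts_metadata5.csv')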