Fixed occurrences of impossible links + other repairs
This commit is contained in:
parent
9216ee9a3b
commit
361cc14199
|
@ -330,17 +330,29 @@ def build_complete_tei(etree_source, etree_target, etree_links):
|
|||
text = etree.Element('text')
|
||||
group = etree.Element('group')
|
||||
print('P3')
|
||||
group.append(list(etree_source[0])[1])
|
||||
group.insert(len(group),
|
||||
list(etree_source[0])[1])
|
||||
# group.append(list(etree_source[0])[1])
|
||||
print('P4')
|
||||
group.append(list(etree_target[0])[1])
|
||||
group.insert(len(group),
|
||||
list(etree_target[0])[1])
|
||||
# group.append(list(etree_target[0])[1])
|
||||
print('P5')
|
||||
text.append(group)
|
||||
text.insert(len(text),
|
||||
group)
|
||||
# text.append(group)
|
||||
print('P6')
|
||||
root.append(tei_header)
|
||||
root.insert(len(root),
|
||||
tei_header)
|
||||
# root.append(tei_header)
|
||||
print('P7')
|
||||
root.append(text)
|
||||
# root.append(text)
|
||||
root.insert(len(root),
|
||||
text)
|
||||
print('P8')
|
||||
root.append(etree_links)
|
||||
# root.append(etree_links)
|
||||
root.insert(len(root),
|
||||
etree_links)
|
||||
print('P9')
|
||||
return root
|
||||
|
||||
|
@ -349,34 +361,22 @@ def build_links(all_edges):
|
|||
body = etree.Element('standOff')
|
||||
|
||||
for document_edges in all_edges:
|
||||
|
||||
|
||||
|
||||
# if len(document_edges) > 1:
|
||||
# print('here')
|
||||
|
||||
# mine paragraphs
|
||||
for paragraph_edges in document_edges:
|
||||
p = etree.Element('linkGrp')
|
||||
paragraph_id = ''
|
||||
corresp_source_id = ''
|
||||
corresp_target_id = ''
|
||||
corresp = []
|
||||
# for sentence_edges in paragraph_edges:
|
||||
#
|
||||
|
||||
for token_edges in paragraph_edges:
|
||||
if not corresp_source_id and len(token_edges['source_ids']) > 0:
|
||||
random_source_id = token_edges['source_ids'][0]
|
||||
corresp_source_id = '#'
|
||||
# corresp_source_id += '.'.join(random_source_id.split('.')[:3])
|
||||
corresp_source_id += '.'.join(random_source_id.split('.')[:2])
|
||||
corresp.append(corresp_source_id)
|
||||
if not corresp_target_id and len(token_edges['target_ids']) > 0:
|
||||
random_target_id = token_edges['target_ids'][0]
|
||||
corresp_target_id = '#'
|
||||
corresp_target_id += '.'.join(random_target_id.split('.')[:2])
|
||||
# corresp_target_id += random_target_id.split('.')[0]
|
||||
corresp.append(corresp_target_id)
|
||||
|
||||
link = etree.Element('link')
|
||||
# translate labels
|
||||
labels_list = []
|
||||
|
@ -390,6 +390,11 @@ def build_links(all_edges):
|
|||
link.set('target', ' '.join(['#' + source for source in token_edges['source_ids']] + ['#' + source for source in token_edges['target_ids']]))
|
||||
|
||||
p.append(link)
|
||||
corresp = []
|
||||
if corresp_source_id:
|
||||
corresp.append(corresp_source_id)
|
||||
if corresp_target_id:
|
||||
corresp.append(corresp_target_id)
|
||||
p.set('type', 'CORR')
|
||||
targFunc = []
|
||||
if corresp_source_id:
|
||||
|
|
|
@ -238,7 +238,7 @@ def create_target(svala_data_object, source_tokenized):
|
|||
target_tokenized.append(target_sent_tokenized)
|
||||
target_sent_tokenized = []
|
||||
curr_sententence += 1
|
||||
tok_i = 1
|
||||
tok_i = 0
|
||||
tok_i += 1
|
||||
target_tokenized.append(target_sent_tokenized)
|
||||
return target_tokenized
|
||||
|
@ -256,18 +256,19 @@ def fake_svala_data(source_tokenized):
|
|||
tok_tag = 'w' if 'xpos' not in tok or tok['xpos'] != 'Z' else 'pc'
|
||||
source_svala_id = 's' + str(edge_id)
|
||||
target_svala_id = 't' + str(edge_id)
|
||||
space_after = not ('misc' in tok and tok['misc'] == 'SpaceAfter=No')
|
||||
source_sent.append({
|
||||
'token': tok['text'],
|
||||
'tag': tok_tag,
|
||||
'id': tok_id,
|
||||
'space_after': 'misc' in tok and tok['misc'] == 'SpaceAfter=No',
|
||||
'space_after': space_after,
|
||||
'svala_id': source_svala_id
|
||||
})
|
||||
target_sent.append({
|
||||
'token': tok['text'],
|
||||
'tag': tok_tag,
|
||||
'id': tok_id,
|
||||
'space_after': 'misc' in tok and tok['misc'] == 'SpaceAfter=No',
|
||||
'space_after': space_after,
|
||||
'svala_id': target_svala_id
|
||||
})
|
||||
generated_edges[f'e-{source_svala_id}-{target_svala_id}'] = {
|
||||
|
|
|
@ -43,7 +43,7 @@ def form_paragraphs(annotated_source_divs, metadata):
|
|||
def read_metadata(args):
|
||||
texts_metadata = []
|
||||
with open(args.texts_metadata, 'r') as file:
|
||||
csvreader = csv.reader(file, delimiter='\t', quotechar='"')
|
||||
csvreader = csv.reader(file, delimiter='|', quotechar='"')
|
||||
column_names = []
|
||||
for i, row in enumerate(csvreader):
|
||||
if i == 0:
|
||||
|
@ -52,7 +52,7 @@ def read_metadata(args):
|
|||
else:
|
||||
row_dict = {}
|
||||
for j, content in enumerate(row):
|
||||
row_dict[column_names[j]] = content
|
||||
row_dict[column_names[j]] = content.strip()
|
||||
texts_metadata.append(row_dict)
|
||||
|
||||
# handle teachers
|
||||
|
@ -74,7 +74,7 @@ def read_metadata(args):
|
|||
# handle authors
|
||||
authors_metadata = {}
|
||||
with open(args.authors_metadata, 'r') as file:
|
||||
csvreader = csv.reader(file, delimiter='\t', quotechar='"')
|
||||
csvreader = csv.reader(file, delimiter='|', quotechar='"')
|
||||
column_names = []
|
||||
for i, row in enumerate(csvreader):
|
||||
if i == 0:
|
||||
|
@ -93,7 +93,7 @@ def read_metadata(args):
|
|||
else:
|
||||
row_dict = {}
|
||||
for j, content in enumerate(row):
|
||||
row_dict[column_names[j]] = content
|
||||
row_dict[column_names[j]] = content.strip()
|
||||
row_dict['Ime in priimek'] = row_dict['Ime in priimek'].strip()
|
||||
authors_metadata[row_dict['Ime in priimek']] = row_dict
|
||||
|
||||
|
@ -121,7 +121,8 @@ def process_metadata(args):
|
|||
for attribute_name_sl, attribute_name_en in translations.items():
|
||||
if attribute_name_sl in document_metadata:
|
||||
if attribute_name_sl == 'Ocena':
|
||||
metadata_el[attribute_name_en] = f'{document_metadata[attribute_name_sl]} od {document_metadata["Najvišja možna ocena"]}'
|
||||
grade = f'{document_metadata[attribute_name_sl]} od {document_metadata["Najvišja možna ocena"]}' if document_metadata[attribute_name_sl] and document_metadata["Najvišja možna ocena"] else ''
|
||||
metadata_el[attribute_name_en] = grade
|
||||
elif attribute_name_sl == 'Tvorec':
|
||||
metadata_el[attribute_name_en] = author_metadata['Koda tvorca']
|
||||
elif attribute_name_sl == 'Učitelj':
|
||||
|
@ -131,7 +132,12 @@ def process_metadata(args):
|
|||
elif attribute_name_sl in author_metadata:
|
||||
metadata_el[attribute_name_en] = author_metadata[attribute_name_sl]
|
||||
elif attribute_name_sl == 'Ime šole, Fakulteta':
|
||||
metadata_el['Current school'] = f'{author_metadata["Trenutno šolanje - Ime šole"]}, {author_metadata["Trenutno šolanje - Fakulteta"]}'
|
||||
curr_school = []
|
||||
if author_metadata["Trenutno šolanje - Ime šole"]:
|
||||
curr_school.append(author_metadata["Trenutno šolanje - Ime šole"])
|
||||
if author_metadata["Trenutno šolanje - Fakulteta"]:
|
||||
curr_school.append(author_metadata["Trenutno šolanje - Fakulteta"])
|
||||
metadata_el['Current school'] = ', '.join(curr_school)
|
||||
elif attribute_name_sl == 'Stopnja študija':
|
||||
metadata_el[attribute_name_en] = author_metadata['Trenutno šolanje - Stopnja študija']
|
||||
elif attribute_name_sl == 'Leto študija':
|
||||
|
|
|
@ -253,9 +253,9 @@ if __name__ == '__main__':
|
|||
help='input file in (gz or xml currently). If none, then just database is loaded')
|
||||
parser.add_argument('--raw_text', default='data/KOST/raw',
|
||||
help='input file in (gz or xml currently). If none, then just database is loaded')
|
||||
parser.add_argument('--texts_metadata', default='data/KOST/texts_metadata2.csv',
|
||||
parser.add_argument('--texts_metadata', default='data/KOST/texts_metadata5.csv',
|
||||
help='KOST metadata location')
|
||||
parser.add_argument('--authors_metadata', default='data/KOST/authors_metadata2.csv',
|
||||
parser.add_argument('--authors_metadata', default='data/KOST/authors_metadata5.csv',
|
||||
help='KOST authors location')
|
||||
parser.add_argument('--teachers_metadata', default='data/KOST/teachers_metadata.csv',
|
||||
help='KOST teachers location')
|
||||
|
|
Loading…
Reference in New Issue
Block a user