Tokenization part adapted for KOST
This commit is contained in: parent bafbb6a48a, commit cc455b2558
src/read/hand_fixes.py (new file, +94 lines)
@@ -0,0 +1,94 @@
from collections import deque

HAND_FIXES = {'§§§pisala': ['§', '§', '§', 'pisala'], '§§§poldne': ['§', '§', '§', 'poldne'], '§§§o': ['§', '§', '§', 'o'], '§§§mimi': ['§', '§', '§', 'mimi'], '§§§nil': ['§', '§', '§', 'nil'], '§§§ela': ['§', '§', '§', 'ela'], 'sam§§§': ['sam', '§', '§', '§'], 'globa觧§': ['globač', '§', '§', '§'], 'sin.': ['sin', '.'], '§§§oveduje': ['§', '§', '§', 'oveduje'], 'na§§§': ['na', '§', '§', '§'], '§§§ka§§§': ['§', '§', '§', 'ka', '§', '§', '§'], '§§§e§§§': ['§', '§', '§', 'e', '§', '§', '§'], '§§§': ['§', '§', '§'], 'ljubezni.': ['ljubezni', '.'], '12.': ['12', '.'], '16.': ['16', '.'], 'st.': ['st', '.'], 'S.': ['S', '.'], 'pr.': ['pr', '.'], 'n.': ['n', '.'], '19:30': ['19', ':', '30'], '9.': ['9', '.'], '6:35': ['6', ':', '35'], 'itd.': ['itd', '.'], 'Sv.': ['Sv', '.'], 'npr.': ['npr', '.'], 'sv.': ['sv', '.'], '12:00': ['12', ':', '00'], "sram'vali": ['sram', "'", 'vali'], '18:00': ['18', ':', '00'], 'J.': ['J', '.'], '5:45': ['5', ':', '45'], '17.': ['17', '.'], '9.00h': ['9', '.', '00h'], 'H.': ['H', '.'], '1.': ['1', '.'], '6.': ['6', '.'], '7:10': ['7', ':', '10'], 'g.': ['g', '.'], 'Oz.': ['Oz', '.'], '20:00': ['20', ':', '00'], '17.4.2010': ['17.', '4.', '2010'], 'ga.': ['ga', '.'], 'prof.': ['prof', '.'], '6:45': ['6', ':', '45'], '19.': ['19', '.'], '3.': ['3', '.'], 'tj.': ['tj', '.'], 'Prof.': ['Prof', '.'], '8.': ['8', '.'], '9:18': ['9', ':', '18'], 'ipd.': ['ipd', '.'], '7.': ['7', '.'], 'št.': ['št', '.'], 'oz.': ['oz', '.'], 'R.': ['R', '.'], '13:30': ['13', ':', '30'], '5.': ['5', '.'], '...': ['.', '.', '.'], 'plavali.': ['plavali', '.'], '[XImeX]': ['[', 'XImeX', ']'], '[XimeX]': ['[', 'XimeX', ']'], 'hipoteze:': ['hipoteze', ':'], 'prehrano?': ['prehrano', '?'], '68-letna': ['68', '-', 'letna'], 'pojma:': ['pojma', ':'], '[XKrajX]': ['[', 'XKrajX', ']'], '3/4': ['3', '/', '4'], 'I-phonea': ['I', '-', 'phonea'], 'kredita:': ['kredita', ':'], '[XFakultetaX]': ['[', 'XFakultetaX', ']'], 'športno-eleganten': ['športno', '-', 'eleganten'], '[XStudijskaSmerX]': ['[', 'XStudijskaSmerX', ']'], '[XNaslovX]': ['[', 'XNaslovX', ']'], '(tudi': ['(', 'tudi'], 'kupujem)': ['kupujem', ')'], '[XPriimekX]': ['[', 'XPriimekX', ']'], '[XPodjetjeX]': ['[', 'XPodjetjeX', ']'], 'Zagreb,': ['Zagreb', ','], 'Budimpešto.': ['Budimpešto', '.'], 'žalost.': ['žalost', '.'], '....': ['.', '.', '.', '.'], '[XStevilkaX]': ['[', 'XStevilkaX', ']'], 'e-naslov': ['e', '-', 'naslov'], '[XEnaslovX]': ['[', 'XEnaslovX', ']'], 'e-pošto': ['e', '-', 'pošto'], '[XDatumX]': ['[', 'XDatumX', ']'], 'eno-sobno': ['eno', '-', 'sobno'], 'lgbtq-prijazna': ['lgbtq', '-', 'prijazna'], 'lgbtq-prijaznega': ['lgbtq', '-', 'prijaznega'], 'Covid-19': ['Covid', '-', '19'], ',,,': [',', ',', ','], 'e-maila': ['e', '-', 'maila'], 'T&d': ['T', '&', 'd'], 'Spider-Man': ['Spider', '-', 'Man'], '12-strani': ['12', '-', 'strani'], 'turbo-folk': ['turbo', '-', 'folk'], 'Cp-čkar': ['Cp', '-', 'čkar'], '46-letnik': ['46', '-', 'letnik'], '40-letna': ['40', '-', 'letna'], '18-19h': ['18', '-', '19h'], '[XSvojilniPridevnikX]': ['[', 'XSvojilniPridevnikX', ']'], 'COVID-19': ['COVID', '-', '19'], '"sims"': ['"', 'sims', '"'], '2021/22': ['2021', '/', '22'], '2020/21': ['2020', '/', '21'], 'leto2021/22': ['leto2021', '/', '22'], 'H&m': ['H', '&', 'm'], 'high-street': ['high', '-', 'street'], 'H&M-u': ['H', '&', 'M-u'], 'H&M': ['H', '&', 'M'], 'srčno-žilnih': ['srčno', '-', 'žilnih'], 'srčno-žilni': ['srčno', '-', 'žilni'], ':))': [':)', ')'], 'You-Tube-ju': ['You', '-', 'Tube-ju'], '37,8%': ['37', ',', '8%'], '23,8%': ['23', ',', '8%'], 
'17,6%': ['17', ',', '6%'], '12,6%': ['12', ',', '6%'], '58,2%': ['58', ',', '2%'], '76,2%': ['76', ',', '2%']}
# , '37,8%': ['37', ',', '8%'], '23,8%': ['23', ',', '8%'], '17,6%': ['17', ',', '6%'], '12,6%': ['12', ',', '6%'], '58,2%': ['58', ',', '2%'], '76,2%': ['76', ',', '2%']
SVALA_HAND_FIXES_MERGE = {('oz', '.'): 'oz.', ('Npr', '.'): 'Npr.', ('npr', '.'): 'npr.', ('1', '.'): '1.', ('2', '.'): '2.', ('3', '.'): '3.', ('m', '.'): 'm.', ('itn', '.'): 'itn.', ('max', '.'): 'max.', ('4', '.'): '4.', ('cca', '.'): 'cca.', ('30', '.'): '30.', ('mlad', '.'): 'mlad.', (':)', ')'): ':))', ('sv', '.'): 'sv.', ('p', '.'): 'p.'}
OBELIKS_HAND_FIXES_MERGE = {'2015.': ['2015', '.']}


def merge_svala_data_elements(svala_data_object, i, mask_len):
    final_text = ''
    involved_sources = []
    involved_targets = []
    involved_edges = []
    for el in svala_data_object.svala_data['source'][i - mask_len + 1:i + 1]:
        # check whether merge won't cause further (unnoticed) issues later
        edges = svala_data_object.links_ids_mapper[el['id']]
        if len(edges) != 1:
            raise ValueError('Incorrect number of edges!')
        edge = svala_data_object.svala_data['edges'][edges[0]]
        # TODO check if or len(edge['labels']) != 0 has to be added
        if len(edge['source_ids']) != 1 or len(edge['target_ids']) != 1:
            raise ValueError('Possible errors - CHECK!')

        final_text += el['text']
        involved_sources.append(edge['source_ids'][0])
        involved_targets.append(edge['target_ids'][0])
        involved_edges.append(edge['id'])

    # erase merged svala elements
    svala_data_object.svala_data['source'][i - mask_len + 1]['text'] = final_text
    svala_data_object.svala_data['source'] = [el for el in svala_data_object.svala_data['source'] if
                                              el['id'] not in involved_sources[1:]]

    for el in svala_data_object.svala_data['target']:
        if el['id'] == involved_targets[0]:
            el['text'] = final_text
            break
    svala_data_object.svala_data['target'] = [el for el in svala_data_object.svala_data['target'] if
                                              el['id'] not in involved_targets[1:]]

    svala_data_object.svala_data['edges'] = {k: v for k, v in svala_data_object.svala_data['edges'].items() if
                                             v['id'] not in involved_edges[1:]}
    i -= len(involved_sources[1:])
    return i


def apply_svala_handfixes(svala_data_object):
    hand_fix_mask = []
    for key in SVALA_HAND_FIXES_MERGE.keys():
        if len(key) not in hand_fix_mask:
            hand_fix_mask.append(len(key))

    remember_length = max(hand_fix_mask)
    q = deque()

    i = 0
    for el in svala_data_object.svala_data['source']:
        q.append(el['text'])
        if len(q) > remember_length:
            q.popleft()
        for mask_len in hand_fix_mask:
            list_q = list(q)
            if len(list_q) - mask_len >= 0:
                key = tuple(list_q[remember_length - mask_len:])
                if key in SVALA_HAND_FIXES_MERGE:
                    i = merge_svala_data_elements(svala_data_object, i, mask_len)
        i += 1


def apply_obeliks_handfixes(tokenized_paragraph):
    for t_i in range(len(tokenized_paragraph)):
        sen = tokenized_paragraph[t_i]
        i = 0
        error = False
        for tok in sen:
            # if tok['text'] == ',,,':
            #     tok['text'] = ','
            if tok['text'] in OBELIKS_HAND_FIXES_MERGE:
                error = True
                break
            i += 1
        if error:
            new_sen = []
            new_id = 1
            for t in sen:
                if t['text'] in OBELIKS_HAND_FIXES_MERGE:
                    for ex_t in OBELIKS_HAND_FIXES_MERGE[t['text']]:
                        new_sen.append({'id': tuple([new_id]), 'text': ex_t})
                        new_id += 1
                else:
                    new_sen.append({'id': tuple([new_id]), 'text': t['text']})
                    new_id += 1
            tokenized_paragraph[t_i] = new_sen
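To illustrate the merge table above: apply_svala_handfixes slides a fixed-length window (a deque) over the source tokens and merges any window that matches a key of SVALA_HAND_FIXES_MERGE. A minimal sketch of just that lookup, on an invented token list (the real function additionally rewrites the svala elements via merge_svala_data_elements):

from collections import deque

from src.read.hand_fixes import SVALA_HAND_FIXES_MERGE

# Invented token stream; ('oz', '.') is one of the merge keys defined above.
toy_tokens = ['To', 'je', 'oz', '.', 'primer']
window = deque(maxlen=2)  # every merge key above is a 2-tuple
for tok in toy_tokens:
    window.append(tok)
    if tuple(window) in SVALA_HAND_FIXES_MERGE:
        print(tuple(window), '->', SVALA_HAND_FIXES_MERGE[tuple(window)])  # ('oz', '.') -> 'oz.'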

src/read/merge.py
@@ -18,248 +18,20 @@ def create_edges_list(target_ids, links_ids_mapper):
SKIP_IDS = ['solar2284s.1.1.1']


def create_edges(svala_data, source_par, target_par):
    if source_par and source_par[0]:
        if source_par[0][0]['id'] in SKIP_IDS:
            return []
        # print(source_par[0][0]['id'])
        # if source_par[0][0]['id'] == 'solar17s.6.3.1':
        #     print('pause!')
    # if target_par and target_par[0]:
    #     print(target_par[0][0]['id'])
    #     if target_par[0][0]['id'] == 'solar2150t.4.1.1':
    #         print('pause!')
def create_edges(raw_edges, source_par, target_par):
    source_mapper = {el['svala_id']: el['id'] for source in source_par for el in source}
    target_mapper = {el['svala_id']: el['id'] for target in target_par for el in target}

    source_ids = [[el['svala_id'] for el in source] for source in source_par]
    target_ids = [[el['svala_id'] for el in target] for target in target_par]

    source_sentence_ids = [set([el['svala_id'] for el in source]) for source in source_par]
    target_sentence_ids = [set([el['svala_id'] for el in target]) for target in target_par]

    # create links to ids mapper
    links_ids_mapper = {}
    edges_of_one_type = set()

    # delete empty edge
    if 'e-' in svala_data['edges']:
        del (svala_data['edges']['e-'])

    for k, v in svala_data['edges'].items():
        has_source = False
        has_target = False
        for el in v['ids']:
            # create edges of one type
            if el[0] == 's':
                has_source = True
            if el[0] == 't':
                has_target = True

            # create links_ids_mapper
            if el not in links_ids_mapper:
                links_ids_mapper[el] = []
            links_ids_mapper[el].append(k)
        if not has_source or not has_target or (len(svala_data['source']) == 1 and svala_data['source'][0]['text'] == ' ') \
                or (len(svala_data['target']) == 1 and svala_data['target'][0]['text'] == ' '):
            edges_of_one_type.add(k)

    # delete edge with space
    save_deleted_edges = {}
    if len(svala_data['source']) == 1 and svala_data['source'][0]['text'] == ' ':
        for edg in links_ids_mapper[svala_data['source'][0]['id']]:
            save_deleted_edges[edg] = svala_data['edges'][edg]
            del (svala_data['edges'][edg])
        del (links_ids_mapper[svala_data['source'][0]['id']])
    if len(svala_data['target']) == 1 and svala_data['target'][0]['text'] == ' ':
        for edg in links_ids_mapper[svala_data['target'][0]['id']]:
            save_deleted_edges[edg] = svala_data['edges'][edg]
            del (svala_data['edges'][edg])
        del (links_ids_mapper[svala_data['target'][0]['id']])

    # create edge order
    edges_order = []
    edges_processed = set()
    active_target_sentence_i = 0

    # create target edges
    target_edges, target_edges_set = create_edges_list(target_ids, links_ids_mapper)
    source_edges, source_edges_set = create_edges_list(source_ids, links_ids_mapper)

    last_target_edge = ''

    for active_source_sentence_i, active_source_sentence in enumerate(source_edges):
        for source_edge in active_source_sentence:
            # print(source_edge)
            # if 'e-s7-t8' == source_edge:
            #     print('aaa')
            if source_edge in edges_of_one_type:
                if source_edge not in edges_processed:
                    edges_order.append(source_edge)
                    edges_processed.add(source_edge)

            elif target_edges_set and source_edge in target_edges_set[active_target_sentence_i]:
                # if 'e-s119-t119' == source_edge:
                #     print('aaa')
                if source_edge not in edges_processed:
                    edges_order.append(source_edge)
                    edges_processed.add(source_edge)
                last_target_edge = source_edge
            # when source is connected to two targets
            elif source_edge not in target_edges_set[active_target_sentence_i]:
                # add missing edges from target
                while source_edge not in target_edges_set[active_target_sentence_i]:
                    for target_edge in target_edges[active_target_sentence_i]:
                        if target_edge in edges_of_one_type:
                            if target_edge not in edges_processed:
                                edges_order.append(target_edge)
                                edges_processed.add(target_edge)
                            last_target_edge = target_edge
                    active_target_sentence_i += 1
                if source_edge in target_edges_set[active_target_sentence_i]:
                    if source_edge not in edges_processed:
                        edges_order.append(source_edge)
                        edges_processed.add(source_edge)

            else:
                raise ValueError('Impossible!!!')
            if not target_edges_set or not target_edges_set[0] or active_target_sentence_i >= len(target_edges):
                continue
            if len(target_edges[active_target_sentence_i]) == 0:
                active_target_sentence_i += 1
                continue

            if last_target_edge == target_edges[active_target_sentence_i][-1] or (len(target_edges[active_target_sentence_i]) > 1 and last_target_edge == target_edges[active_target_sentence_i][-2] and (target_edges[active_target_sentence_i][-1] in edges_of_one_type or (target_edges[active_target_sentence_i][-1] not in edges_of_one_type and target_edges[active_target_sentence_i][-1] in source_edges_set[active_source_sentence_i]))):
                for target_edge in target_edges[active_target_sentence_i]:
                    if target_edge in edges_of_one_type:
                        if target_edge not in edges_processed:
                            edges_order.append(target_edge)
                            edges_processed.add(target_edge)
                        last_target_edge = target_edge
                active_target_sentence_i += 1
                continue
            target_edge_in_next_source_edge_sentence = False
            for target_edge in target_edges[active_target_sentence_i]:
                if active_source_sentence_i + 1 < len(source_edges_set) and target_edge in source_edges_set[active_source_sentence_i + 1]:
                    target_edge_in_next_source_edge_sentence = True
                    break
            if target_edge_in_next_source_edge_sentence:
                pass
            elif not target_edge_in_next_source_edge_sentence:
                target_edge_in_next_source_edge_sentence = False
                while not target_edge_in_next_source_edge_sentence:
                    # if active_target_sentence_i >= len(target_edges_set):
                    #     break
                    for target_edge in target_edges[active_target_sentence_i]:
                        if target_edge in edges_of_one_type:
                            if target_edge not in edges_processed:
                                edges_order.append(target_edge)
                                edges_processed.add(target_edge)
                            last_target_edge = target_edge

                    # if there is no next source sentence
                    if active_source_sentence_i + 1 >= len(source_edges_set):
                        target_edge_in_next_source_edge_sentence = True

                    # if last_target_edge only in target stop regularly
                    if last_target_edge == target_edges[active_target_sentence_i][-1]:
                        target_edge_in_next_source_edge_sentence = True

                    # test if target_edge in next source
                    for target_edge in target_edges[active_target_sentence_i]:
                        if active_source_sentence_i + 1 < len(source_edges_set) and target_edge in source_edges_set[
                                active_source_sentence_i + 1]:
                            target_edge_in_next_source_edge_sentence = True
                            break
                    active_target_sentence_i += 1

    if not source_edges:
        for active_target_sentence in target_edges:
            for target_edge in active_target_sentence:
                if target_edge not in edges_processed:
                    edges_order.append(target_edge)
                    edges_processed.add(target_edge)

    # # DEBUG stuff
    # for edge_order in edges_order:
    #     if edges_order.count(edge_order) > 1:
    #         # if edge_order not in a:
    #         print(f'ERROR {edge_order}')
    #
    # for edge_order in edges_order:
    #     if edge_order not in svala_data['edges']:
    #         print(f'ERROR {edge_order}')
    #
    # for key in svala_data['edges'].keys():
    #     if key not in edges_order:
    #         print(f'ERROR {key}')
    #
    # a = len(svala_data['edges'])
    # b = len(edges_order)
    if len(svala_data['edges']) != len(edges_order):
        for k, v in save_deleted_edges.items():
            svala_data['edges'][k] = v

    assert len(svala_data['edges']) == len(edges_order)

    sentence_edges = []
    source_sent_id = 0
    target_sent_id = 0
    # actually add edges
    edges = []
    for edge_id in edges_order:
        labels = svala_data['edges'][edge_id]['labels']
        source_ids = [source_mapper[el] for el in svala_data['edges'][edge_id]['ids'] if el in source_mapper]
        target_ids = [target_mapper[el] for el in svala_data['edges'][edge_id]['ids'] if el in target_mapper]
        ids = svala_data['edges'][edge_id]['ids']
    for _, edge in raw_edges.items():
        labels = edge['labels']
        source_ids = [source_mapper[el] for el in edge['ids'] if el in source_mapper]
        target_ids = [target_mapper[el] for el in edge['ids'] if el in target_mapper]

        source_ok = [el[0] == 't' or el in source_sentence_ids[source_sent_id] for el in ids] if source_sentence_ids else []
        source_ok_all = all(source_ok)

        if not source_ok_all:
            source_sent_id += 1

        target_ok = [el[0] == 's' or el in target_sentence_ids[target_sent_id] for el in ids] if target_sentence_ids else []
        target_ok_all = all(target_ok)

        if not target_ok_all:
            target_sent_id += 1

        if not source_ok_all or not target_ok_all:
            sentence_edges.append(edges)
            edges = []
        edges.append({'source_ids': source_ids, 'target_ids': target_ids, 'labels': labels})

    if edges:
        sentence_edges.append(edges)

    actual_sentence_edges = []
    passed_sentence = []
    for sent in sentence_edges:
        ha_source = False
        ha_target = False
        for toke in sent:
            if len(toke['target_ids']) > 0:
                ha_target = toke['target_ids'][0]
            if len(toke['source_ids']) > 0:
                ha_source = toke['source_ids'][0]
            if ha_target and ha_source:
                break

        if not ha_target or not ha_source:
            passed_sentence.extend(sent)

        else:
            passed_sentence.extend(sent)
            actual_sentence_edges.append(passed_sentence)
            passed_sentence = []

    if passed_sentence:
        actual_sentence_edges.append(passed_sentence)

    return actual_sentence_edges
    return edges


def update_ids(pretag, in_list):
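As a reading aid for the new create_edges(raw_edges, source_par, target_par) signature: raw_edges is the svala edges dict, and each edge's svala ids are resolved through source_mapper/target_mapper into the dicts that are collected in sentence_edges. A minimal sketch with invented ids and a made-up label:

# Invented ids and label, for illustration only.
source_par = [[{'svala_id': 's1', 'id': 'kost1.s1.t1'}, {'svala_id': 's2', 'id': 'kost1.s1.t2'}]]
target_par = [[{'svala_id': 't1', 'id': 'kost1t.s1.t1'}]]
raw_edges = {'e-s1-s2-t1': {'id': 'e-s1-s2-t1', 'ids': ['s1', 's2', 't1'], 'labels': ['X']}}

source_mapper = {el['svala_id']: el['id'] for source in source_par for el in source}
target_mapper = {el['svala_id']: el['id'] for target in target_par for el in target}
for _, edge in raw_edges.items():
    print({'source_ids': [source_mapper[el] for el in edge['ids'] if el in source_mapper],
           'target_ids': [target_mapper[el] for el in edge['ids'] if el in target_mapper],
           'labels': edge['labels']})
# {'source_ids': ['kost1.s1.t1', 'kost1.s1.t2'], 'target_ids': ['kost1t.s1.t1'], 'labels': ['X']}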

src/read/read.py
@@ -1,13 +1,29 @@
import re

HAND_FIXES = {'§§§pisala': ['§', '§', '§', 'pisala'], '§§§poldne': ['§', '§', '§', 'poldne'], '§§§o': ['§', '§', '§', 'o'], '§§§mimi': ['§', '§', '§', 'mimi'], '§§§nil': ['§', '§', '§', 'nil'], '§§§ela': ['§', '§', '§', 'ela'], 'sam§§§': ['sam', '§', '§', '§'], 'globa觧§': ['globač', '§', '§', '§'], 'sin.': ['sin', '.'], '§§§oveduje': ['§', '§', '§', 'oveduje'], 'na§§§': ['na', '§', '§', '§'], '§§§ka§§§': ['§', '§', '§', 'ka', '§', '§', '§'], '§§§e§§§': ['§', '§', '§', 'e', '§', '§', '§'], '§§§': ['§', '§', '§'], 'ljubezni.': ['ljubezni', '.'], '12.': ['12', '.'], '16.': ['16', '.'], 'st.': ['st', '.'], 'S.': ['S', '.'], 'pr.': ['pr', '.'], 'n.': ['n', '.'], '19:30': ['19', ':', '30'], '9.': ['9', '.'], '6:35': ['6', ':', '35'], 'itd.': ['itd', '.'], 'Sv.': ['Sv', '.'], 'npr.': ['npr', '.'], 'sv.': ['sv', '.'], '12:00': ['12', ':', '00'], "sram'vali": ['sram', "'", 'vali'], '18:00': ['18', ':', '00'], 'J.': ['J', '.'], '5:45': ['5', ':', '45'], '17.': ['17', '.'], '9.00h': ['9', '.', '00h'], 'H.': ['H', '.'], '1.': ['1', '.'], '6.': ['6', '.'], '7:10': ['7', ':', '10'], 'g.': ['g', '.'], 'Oz.': ['Oz', '.'], '20:00': ['20', ':', '00'], '17.4.2010': ['17.', '4.', '2010'], 'ga.': ['ga', '.'], 'prof.': ['prof', '.'], '6:45': ['6', ':', '45'], '19.': ['19', '.'], '3.': ['3', '.'], 'tj.': ['tj', '.'], 'Prof.': ['Prof', '.'], '8.': ['8', '.'], '9:18': ['9', ':', '18'], 'ipd.': ['ipd', '.'], '7.': ['7', '.'], 'št.': ['št', '.'], 'oz.': ['oz', '.'], 'R.': ['R', '.'], '13:30': ['13', ':', '30'], '5.': ['5', '.'], '...': ['.', '.', '.']}
from src.read.hand_fixes import HAND_FIXES, apply_obeliks_handfixes, SVALA_HAND_FIXES_MERGE


def read_raw_text(path):
    with open(path, 'r') as rf:
        return rf.read()
    print(path)
    # if path == "data/KOST/raw/L-1819-110.txt":
    #     print('here')
    try:
        with open(path, 'r', encoding='utf-8') as rf:
            return rf.read()
    except:
        try:
            with open(path, 'r', encoding='utf-16') as rf:
                return rf.read()
        except:
            with open(path, 'r', encoding="windows-1250") as rf:
                return rf.read()
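The nested try/except above simply walks through three candidate encodings. An equivalent loop-based sketch, not part of the patch and with an invented helper name, that stops at the first encoding that decodes:

def read_raw_text_any_encoding(path, encodings=('utf-8', 'utf-16', 'windows-1250')):
    # Same fallback order as read_raw_text above, written as a loop.
    for enc in encodings:
        try:
            with open(path, 'r', encoding=enc) as rf:
                return rf.read()
        except UnicodeError:
            continue
    raise UnicodeError(f'{path}: none of {encodings} could decode the file')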


def map_svala_tokenized(svala_data_part, tokenized_paragraph, sent_i):
    # apply handfixes for obeliks
    apply_obeliks_handfixes(tokenized_paragraph)

    paragraph_res = []
    wierd_sign_count = 0
    svala_data_i = 0

@@ -21,11 +37,14 @@ def map_svala_tokenized(svala_data_part, tokenized_paragraph, sent_i):
        assert tok['misc'] == 'SpaceAfter=No'
        space_after = not 'misc' in tok
        if len(svala_data_part) <= svala_data_i:
            # if sentence does not end add it anyway
            # TODO i error?
            if sentence_res:
                paragraph_res.append(sentence_res)
            return i, paragraph_res
        if svala_data_part[svala_data_i]['text'].strip() != tok['text']:
            key = svala_data_part[svala_data_i]['text'].strip()
        if svala_data_part[svala_data_i]['text'] != tok['text']:
            key = svala_data_part[svala_data_i]['text']
            if key not in HAND_FIXES:
                print(f'key: {key} ; tok[text]: {tok["text"]}')
                if key.startswith('§§§') and key.endswith('§§§'):
                    HAND_FIXES[key] = ['§', '§', '§', key[3:-3], '§', '§', '§']
                elif key.startswith('§§§'):
@@ -33,7 +52,23 @@ def map_svala_tokenized(svala_data_part, tokenized_paragraph, sent_i):
                elif key.endswith('§§§'):
                    HAND_FIXES[key] = [key[:-3], '§', '§', '§']
                else:
                    raise ValueError('Word mismatch!')
                    if len(key) < len(tok['text']):
                        print('HAND_FIXES_MERGE:')
                        print(f", ('{tok['text'][:len(key)]}', '{tok['text'][len(key):]}'): '{tok['text']}'")
                        SVALA_HAND_FIXES_MERGE[(tok['text'][:len(key)], tok['text'][len(key):])] = tok['text']
                        a = SVALA_HAND_FIXES_MERGE
                    else:
                        print('HAND_FIXES OLD:')
                        print(f", '{key}': ['{key[:len(tok['text'])]}', '{key[len(tok['text']):]}']")

                        print('HAND_FIXES NEW:')
                        reg = re.findall(r"[\w]+|[^\s\w]", key)
                        print(f", '{key}': {str(reg)}")

                    # HAND_FIXES[key] = [key[:len(tok['text'])], key[len(tok['text']):]]
                    HAND_FIXES[key] = re.findall(r"[\w]+|[^\s\w]", key)
                    print(f'key: {key} ; tok[text]: {tok["text"]}')
                    # raise ValueError('Word mismatch!')

            if tok['text'] == HAND_FIXES[key][wierd_sign_count]:
                wierd_sign_count += 1
@@ -42,6 +77,10 @@ def map_svala_tokenized(svala_data_part, tokenized_paragraph, sent_i):
            else:
                tok['text'] = key
                wierd_sign_count = 0
        elif key in ['[XKrajX]']:
            tok['text'] = '[XKrajX]'
        elif key in ['[XImeX]']:
            tok['text'] = '[XImeX]'
        else:
            print(f'key: {key} ; tok[text]: {tok["text"]}')
            raise ValueError('Word mismatch!')
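For reference, the regex fallback re.findall(r"[\w]+|[^\s\w]", key) used above splits an unknown key into word runs and single non-space, non-word characters, which matches the shape of the hand-written HAND_FIXES entries, for example:

import re

print(re.findall(r"[\w]+|[^\s\w]", 'ljubezni.'))  # ['ljubezni', '.']
print(re.findall(r"[\w]+|[^\s\w]", '[XImeX]'))    # ['[', 'XImeX', ']']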

src/read/read_and_merge.py
@@ -1,13 +1,17 @@
import json
import logging
import os
import pickle
import queue
import string
from collections import deque

import classla

from src.read.merge import merge
from src.read.hand_fixes import apply_svala_handfixes
from src.read.merge import merge, create_conllu, create_edges
from src.read.read import read_raw_text, map_svala_tokenized

HAND_FIXES = {'§§§pisala': ['§', '§', '§', 'pisala'], '§§§poldne': ['§', '§', '§', 'poldne'], '§§§o': ['§', '§', '§', 'o'], '§§§mimi': ['§', '§', '§', 'mimi'], '§§§nil': ['§', '§', '§', 'nil'], '§§§ela': ['§', '§', '§', 'ela'], 'sam§§§': ['sam', '§', '§', '§'], 'globa觧§': ['globač', '§', '§', '§'], 'sin.': ['sin', '.'], '§§§oveduje': ['§', '§', '§', 'oveduje'], 'na§§§': ['na', '§', '§', '§'], '§§§ka§§§': ['§', '§', '§', 'ka', '§', '§', '§'], '§§§e§§§': ['§', '§', '§', 'e', '§', '§', '§'], '§§§': ['§', '§', '§'], 'ljubezni.': ['ljubezni', '.'], '12.': ['12', '.'], '16.': ['16', '.'], 'st.': ['st', '.'], 'S.': ['S', '.'], 'pr.': ['pr', '.'], 'n.': ['n', '.'], '19:30': ['19', ':', '30'], '9.': ['9', '.'], '6:35': ['6', ':', '35'], 'itd.': ['itd', '.'], 'Sv.': ['Sv', '.'], 'npr.': ['npr', '.'], 'sv.': ['sv', '.'], '12:00': ['12', ':', '00'], "sram'vali": ['sram', "'", 'vali'], '18:00': ['18', ':', '00'], 'J.': ['J', '.'], '5:45': ['5', ':', '45'], '17.': ['17', '.'], '9.00h': ['9', '.', '00h'], 'H.': ['H', '.'], '1.': ['1', '.'], '6.': ['6', '.'], '7:10': ['7', ':', '10'], 'g.': ['g', '.'], 'Oz.': ['Oz', '.'], '20:00': ['20', ':', '00'], '17.4.2010': ['17.', '4.', '2010'], 'ga.': ['ga', '.'], 'prof.': ['prof', '.'], '6:45': ['6', ':', '45'], '19.': ['19', '.'], '3.': ['3', '.'], 'tj.': ['tj', '.'], 'Prof.': ['Prof', '.'], '8.': ['8', '.'], '9:18': ['9', ':', '18'], 'ipd.': ['ipd', '.'], '7.': ['7', '.'], 'št.': ['št', '.'], 'oz.': ['oz', '.'], 'R.': ['R', '.'], '13:30': ['13', ':', '30'], '5.': ['5', '.'], '...': ['.', '.', '.']}
from src.read.svala_data import SvalaData


def add_error_token(el, out_list, sentence_string_id, out_list_i, out_list_ids, is_source, s_t_id):

@@ -129,9 +133,114 @@ def add_errors(svala_i, source_i, target_i, error, source, target, svala_data, s
    return svala_i, source_i, target_i


def create_target(svala_data, source_tokenized):
    for i, el in enumerate(svala_data['target']):
        print(i)
def create_target(svala_data_object, source_tokenized):
    source_tokenized_dict = {}
    for i, sent in enumerate(source_tokenized):
        for tok in sent:
            tok['sent_id'] = i + 1
            source_tokenized_dict[tok['svala_id']] = tok

    links_ids_mapper, edges_of_one_type = svala_data_object.links_ids_mapper, svala_data_object.edges_of_one_type

    curr_sententence = 1
    source_curr_sentence = 1

    target_tokenized = []
    target_sent_tokenized = []
    tok_i = 1

    for i, token in enumerate(svala_data_object.svala_data['target']):
        edge_id = links_ids_mapper[token['id']]
        if len(edge_id) > 1:
            print('Whaat?')
        edge_id = edge_id[0]
        edge = svala_data_object.svala_data['edges'][edge_id]
        source_word_ids = []
        target_word_ids = []
        for word_id in edge['ids']:
            if word_id[0] == 's':
                source_word_ids.append(word_id)
            if word_id[0] == 't':
                target_word_ids.append(word_id)

        token_text = token['text']
        new_sentence = False
        if len(source_word_ids) == 1:
            source_id = source_word_ids[0]
            source_token = source_tokenized_dict[source_id]

            if source_token['sent_id'] != source_curr_sentence:
                source_curr_sentence = source_token['sent_id']
                if source_token['id'] == 1 and len(target_sent_tokenized) > 1:
                    target_tokenized.append(target_sent_tokenized)
                    target_sent_tokenized = []
                    curr_sententence += 1
                    tok_i = 1

            # check if words are equal and update
            if token_text == source_token['token']:
                target_token = {
                    'token': source_token['token'],
                    'tag': source_token['tag'],
                    'id': tok_i,
                    'space_after': source_token['space_after'],
                    'svala_id': token['id'],
                    'sent_id': curr_sententence,
                }
            else:
                # Check for punctuation mismatch.
                if token_text in string.punctuation:
                    tag = 'pc'
                else:
                    tag = 'w'

                target_token = {
                    'token': token_text,
                    'tag': tag,
                    'id': tok_i,
                    'space_after': source_token['space_after'],
                    'svala_id': token['id'],
                    'sent_id': curr_sententence,
                }

        else:
            space_after = True
            if token_text in string.punctuation:
                tag = 'pc'
                if token_text in '!?.,):;]}':
                    if len(target_sent_tokenized) == 0:
                        raise ValueError('Sentence length = 0!')
                    target_sent_tokenized[-1]['space_after'] = False
                    if token_text in '!?.':
                        new_sentence = True

                        # Handle cases like `...`
                        if len(svala_data_object.svala_data['target']) > i + 1 and svala_data_object.svala_data['target'][i+1]['text'] in '.?!':
                            new_sentence = False
                elif token_text in '([{':
                    space_after = False
            else:
                tag = 'w'

            target_token = {
                'token': token_text,
                'tag': tag,
                'id': tok_i,
                'space_after': space_after,
                'svala_id': token['id'],
                'sent_id': curr_sententence,
            }
        target_sent_tokenized.append(target_token)
        if new_sentence:
            target_tokenized.append(target_sent_tokenized)
            target_sent_tokenized = []
            curr_sententence += 1
            tok_i = 1
        tok_i += 1
    target_tokenized.append(target_sent_tokenized)
    return target_tokenized
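Every entry that create_target appends to target_sent_tokenized carries the same keys; a representative token with invented values, for orientation only:

example_target_token = {
    'token': 'primer',   # invented surface form (copied from the source token or taken from the svala target text)
    'tag': 'w',          # 'w' for words, 'pc' for punctuation
    'id': 3,             # invented position of the token within its target sentence
    'space_after': True,
    'svala_id': 't3',    # invented svala target id
    'sent_id': 1,
}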


def tokenize(args):

@@ -149,14 +258,19 @@ def tokenize(args):
    nlp_tokenize = classla.Pipeline('sl', processors='tokenize', pos_lemma_pretag=True)
    # filename_encountered = False
    i = 0
    tokenized_source_divs = []
    tokenized_target_divs = []
    tokenized_divs = {}
    # tokenized_source_divs = {}
    # tokenized_target_divs = {}
    document_edges = []

    text_filename = ''

    for folder, _, filenames in os.walk(args.svala_folder):
        for filename in filenames:
        filenames = sorted(filenames)
        for filename_i, filename in enumerate(filenames):
            # if filename_i*100/len(filenames) > 35:
            #     print('here')
            #     continue
            svala_path = os.path.join(folder, filename)
            new_text_filename = '-'.join(filename[:-5].split('-')[:3]) + '.txt'
            if text_filename != new_text_filename:

@@ -166,81 +280,61 @@ def tokenize(args):
                text_file) if text_file else ([], [], [])
                source_sent_i = 0

                jf = open(svala_path)
                jf = open(svala_path, encoding='utf-8')
                print(svala_path)
                svala_data = json.load(jf)
                jf.close()

                svala_data_object = SvalaData(svala_data)

                target_res = create_target(svala_data, source_tokenized)
                source_sent_i, source_res = map_svala_tokenized(svala_data['source'], source_tokenized, source_sent_i)
                print('aaa')
                apply_svala_handfixes(svala_data_object)

                source_sent_i, source_res = map_svala_tokenized(svala_data_object.svala_data['source'], source_tokenized, source_sent_i)
                # target_res = create_target(svala_data, source_tokenized)


    for div in et.iter('div'):
        bibl = div.find('bibl')
        file_name = bibl.get('n')
        file_name = file_name.replace('/', '_')
        print(f'{i*100/folders_count} % : {file_name}')
        i += 1
        # if file_name == 'S20-PI-slo-2-SG-D-2016_2017-30479-12.txt':
        # if file_name == 'KUS-G-slo-4-GO-E-2009-10017':
        # # # if i*100/folders_count > 40:
        #     filename_encountered = True
        # # # # if i*100/folders_count > 41:
        # # # #     filename_encountered = False
        # if not filename_encountered:
        #     continue
            target_res = create_target(svala_data_object, source_res)

        svala_path = os.path.join(args.svala_folder, file_name)
        corrected_svala_path = os.path.join(args.corrected_svala_folder, file_name)
        raw_texts_path = os.path.join(args.svala_generated_text_folder, file_name)
            if text_filename not in tokenized_divs:
                tokenized_divs[text_filename] = []

        svala_list = [[fname[:-13], fname] if 'problem' in fname else [fname[:-5], fname] for fname in os.listdir(svala_path)] if os.path.isdir(svala_path) else []
        svala_dict = {e[0]: e[1] for e in svala_list}
            tokenized_divs[text_filename].append((filename, source_res, target_res, svala_data_object.svala_data['edges']))

        if os.path.exists(corrected_svala_path):
            corrected_svala_list = [[fname[:-13], fname] if 'problem' in fname else [fname[:-5], fname] for fname in os.listdir(corrected_svala_path)]
            corrected_svala_dict = {e[0]: e[1] for e in corrected_svala_list}
            logging.info(f'Tokenizing at {filename_i*100/len(filenames)} %')

            svala_dict.update(corrected_svala_dict)

        assert len(svala_dict) != 0
    tokenized_source_divs = []
    tokenized_target_divs = []
    document_edges = []

    for div_id in tokenized_divs.keys():
        paragraph_edges = []
        tokenized_source_paragraphs = []
        tokenized_target_paragraphs = []
        paragraph_edges = []
        # par_source = []
        # par_target = []
        for tokenized_para in tokenized_divs[div_id]:
            paragraph_name, source_res, target_res, edges = tokenized_para
            source_paragraphs = []
            target_paragraphs = []
            sen_source = []
            sen_target = []
            for sen_i, sen in enumerate(source_res):
                source_conllu = create_conllu(sen, f'{paragraph_name[:-5]}.s{str(sen_i + 1)}')
                source_paragraphs.append(source_conllu)
                sen_source.append(sen)

        paragraphs = div.findall('p')
        for paragraph in paragraphs:
            sentences = paragraph.findall('s')
            svala_i = 1

            # read json
            # if paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] == 'solar17.6':
            #     print('here')
            svala_file = os.path.join(svala_path, svala_dict[paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id']])
            corrected_svala_file = os.path.join(corrected_svala_path, svala_dict[paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id']])
            add_errors_func = add_errors
            jf = open(svala_file) if not os.path.exists(corrected_svala_file) else open(corrected_svala_file)
            svala_data = json.load(jf)
            jf.close()

            source_filename = svala_dict[paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id']][:-5] + '_source.json'
            target_filename = svala_dict[paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id']][:-5] + '_target.json'

            source_raw_text = os.path.join(raw_texts_path, source_filename) if os.path.exists(os.path.join(raw_texts_path, source_filename)) else None
            target_raw_text = os.path.join(raw_texts_path, target_filename) if os.path.exists(os.path.join(raw_texts_path, target_filename)) else None

            sentence_edges, tokenized_source_sentences, tokenized_target_sentences = merge(sentences, paragraph, svala_i,
                                                                                           svala_data, add_errors_func, source_raw_text, target_raw_text, nlp_tokenize)

            tokenized_source_paragraphs.append(tokenized_source_sentences)
            tokenized_target_paragraphs.append(tokenized_target_sentences)
            paragraph_edges.append(sentence_edges)
            for sen_i, sen in enumerate(target_res):
                target_conllu = create_conllu(sen, f'{paragraph_name}.t{str(sen_i)}')
                target_paragraphs.append(target_conllu)
                sen_target.append(sen)
            paragraph_edges.append(edges)
            tokenized_source_paragraphs.append(source_paragraphs)
            tokenized_target_paragraphs.append(target_paragraphs)
            paragraph_edges.append(create_edges(edges, sen_source, sen_target))

        tokenized_source_divs.append(tokenized_source_paragraphs)
        tokenized_target_divs.append(tokenized_target_paragraphs)

        document_edges.append(paragraph_edges)

    with open(args.tokenization_interprocessing, 'wb') as wp:
src/read/svala_data.py (new file, +48 lines)
@@ -0,0 +1,48 @@
from collections import deque

from src.read.hand_fixes import SVALA_HAND_FIXES_MERGE


class SvalaData():
    def __init__(self, svala_data):
        for el in svala_data['source']:
            el['text'] = el['text'].strip()
            if el['text'] == '':
                print('What?')
        for el in svala_data['target']:
            el['text'] = el['text'].strip()
            if el['text'] == '':
                print('What?')
        self.svala_data = svala_data
        self.links_ids_mapper, self.edges_of_one_type = self.create_ids_mapper(svala_data)

    @staticmethod
    def create_ids_mapper(svala_data):
        # create links to ids mapper
        links_ids_mapper = {}
        edges_of_one_type = set()

        for k, v in svala_data['edges'].items():
            has_source = False
            has_target = False
            v['source_ids'] = []
            v['target_ids'] = []
            for el in v['ids']:
                # create edges of one type
                if el[0] == 's':
                    v['source_ids'].append(el)
                    has_source = True
                if el[0] == 't':
                    v['target_ids'].append(el)
                    has_target = True

                # create links_ids_mapper
                if el not in links_ids_mapper:
                    links_ids_mapper[el] = []
                links_ids_mapper[el].append(k)
            if not has_source or not has_target or (
                    len(svala_data['source']) == 1 and svala_data['source'][0]['text'] == ' ') \
                    or (len(svala_data['target']) == 1 and svala_data['target'][0]['text'] == ' '):
                edges_of_one_type.add(k)

        return links_ids_mapper, edges_of_one_type
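A minimal, invented example of what SvalaData derives from a svala JSON: links_ids_mapper maps every svala token id to the edges it participates in, and edges_of_one_type collects edges that touch only one side of the alignment.

from src.read.svala_data import SvalaData

# Toy svala graph, invented for illustration.
toy_svala = {
    'source': [{'id': 's1', 'text': 'primer'}, {'id': 's2', 'text': 'besedila'}],
    'target': [{'id': 't1', 'text': 'primer'}],
    'edges': {
        'e-s1-t1': {'id': 'e-s1-t1', 'ids': ['s1', 't1'], 'labels': []},
        'e-s2': {'id': 'e-s2', 'ids': ['s2'], 'labels': []},
    },
}
data = SvalaData(toy_svala)
print(data.links_ids_mapper)   # {'s1': ['e-s1-t1'], 't1': ['e-s1-t1'], 's2': ['e-s2']}
print(data.edges_of_one_type)  # {'e-s2'}, a source-only edge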

@@ -15,6 +15,7 @@ from src.annotate.annotate import annotate
from src.create_tei import construct_sentence_from_list, \
    construct_paragraph_from_list, TeiDocument, build_tei_etrees, build_links, build_complete_tei, convert_bibl
from src.read.read_and_merge import tokenize
from src.write.write import write_tei

logging.basicConfig(level=logging.INFO)