Tokenization part adapted for KOST

master
Luka 1 year ago
parent bafbb6a48a
commit cc455b2558

@@ -0,0 +1,94 @@
from collections import deque
HAND_FIXES = {'§§§pisala': ['§', '§', '§', 'pisala'], '§§§poldne': ['§', '§', '§', 'poldne'], '§§§o': ['§', '§', '§', 'o'], '§§§mimi': ['§', '§', '§', 'mimi'], '§§§nil': ['§', '§', '§', 'nil'], '§§§ela': ['§', '§', '§', 'ela'], 'sam§§§': ['sam', '§', '§', '§'], 'globa觧§': ['globač', '§', '§', '§'], 'sin.': ['sin', '.'], '§§§oveduje': ['§', '§', '§', 'oveduje'], 'na§§§': ['na', '§', '§', '§'], '§§§ka§§§': ['§', '§', '§', 'ka', '§', '§', '§'], '§§§e§§§': ['§', '§', '§', 'e', '§', '§', '§'], '§§§': ['§', '§', '§'], 'ljubezni.': ['ljubezni', '.'], '12.': ['12', '.'], '16.': ['16', '.'], 'st.': ['st', '.'], 'S.': ['S', '.'], 'pr.': ['pr', '.'], 'n.': ['n', '.'], '19:30': ['19', ':', '30'], '9.': ['9', '.'], '6:35': ['6', ':', '35'], 'itd.': ['itd', '.'], 'Sv.': ['Sv', '.'], 'npr.': ['npr', '.'], 'sv.': ['sv', '.'], '12:00': ['12', ':', '00'], "sram'vali": ['sram', "'", 'vali'], '18:00': ['18', ':', '00'], 'J.': ['J', '.'], '5:45': ['5', ':', '45'], '17.': ['17', '.'], '9.00h': ['9', '.', '00h'], 'H.': ['H', '.'], '1.': ['1', '.'], '6.': ['6', '.'], '7:10': ['7', ':', '10'], 'g.': ['g', '.'], 'Oz.': ['Oz', '.'], '20:00': ['20', ':', '00'], '17.4.2010': ['17.', '4.', '2010'], 'ga.': ['ga', '.'], 'prof.': ['prof', '.'], '6:45': ['6', ':', '45'], '19.': ['19', '.'], '3.': ['3', '.'], 'tj.': ['tj', '.'], 'Prof.': ['Prof', '.'], '8.': ['8', '.'], '9:18': ['9', ':', '18'], 'ipd.': ['ipd', '.'], '7.': ['7', '.'], 'št.': ['št', '.'], 'oz.': ['oz', '.'], 'R.': ['R', '.'], '13:30': ['13', ':', '30'], '5.': ['5', '.'], '...': ['.', '.', '.'], 'plavali.': ['plavali', '.'], '[XImeX]': ['[', 'XImeX', ']'], '[XimeX]': ['[', 'XimeX', ']'], 'hipoteze:': ['hipoteze', ':'], 'prehrano?': ['prehrano', '?'], '68-letna': ['68', '-', 'letna'], 'pojma:': ['pojma', ':'], '[XKrajX]': ['[', 'XKrajX', ']'], '3/4': ['3', '/', '4'], 'I-phonea': ['I', '-', 'phonea'], 'kredita:': ['kredita', ':'], '[XFakultetaX]': ['[', 'XFakultetaX', ']'], 'športno-eleganten': ['športno', '-', 'eleganten'], '[XStudijskaSmerX]': ['[', 'XStudijskaSmerX', ']'], '[XNaslovX]': ['[', 'XNaslovX', ']'], '(tudi': ['(', 'tudi'], 'kupujem)': ['kupujem', ')'], '[XPriimekX]': ['[', 'XPriimekX', ']'], '[XPodjetjeX]': ['[', 'XPodjetjeX', ']'], 'Zagreb,': ['Zagreb', ','], 'Budimpešto.': ['Budimpešto', '.'], 'žalost.': ['žalost', '.'], '....': ['.', '.', '.', '.'], '[XStevilkaX]': ['[', 'XStevilkaX', ']'], 'e-naslov': ['e', '-', 'naslov'], '[XEnaslovX]': ['[', 'XEnaslovX', ']'], 'e-pošto': ['e', '-', 'pošto'], '[XDatumX]': ['[', 'XDatumX', ']'], 'eno-sobno': ['eno', '-', 'sobno'], 'lgbtq-prijazna': ['lgbtq', '-', 'prijazna'], 'lgbtq-prijaznega': ['lgbtq', '-', 'prijaznega'], 'Covid-19': ['Covid', '-', '19'], ',,,': [',', ',', ','], 'e-maila': ['e', '-', 'maila'], 'T&d': ['T', '&', 'd'], 'Spider-Man': ['Spider', '-', 'Man'], '12-strani': ['12', '-', 'strani'], 'turbo-folk': ['turbo', '-', 'folk'], 'Cp-čkar': ['Cp', '-', 'čkar'], '46-letnik': ['46', '-', 'letnik'], '40-letna': ['40', '-', 'letna'], '18-19h': ['18', '-', '19h'], '[XSvojilniPridevnikX]': ['[', 'XSvojilniPridevnikX', ']'], 'COVID-19': ['COVID', '-', '19'], '"sims"': ['"', 'sims', '"'], '2021/22': ['2021', '/', '22'], '2020/21': ['2020', '/', '21'], 'leto2021/22': ['leto2021', '/', '22'], 'H&m': ['H', '&', 'm'], 'high-street': ['high', '-', 'street'], 'H&M-u': ['H', '&', 'M-u'], 'H&M': ['H', '&', 'M'], 'srčno-žilnih': ['srčno', '-', 'žilnih'], 'srčno-žilni': ['srčno', '-', 'žilni'], ':))': [':)', ')'], 'You-Tube-ju': ['You', '-', 'Tube-ju'], '37,8%': ['37', ',', '8%'], '23,8%': ['23', ',', '8%'], 
'17,6%': ['17', ',', '6%'], '12,6%': ['12', ',', '6%'], '58,2%': ['58', ',', '2%'], '76,2%': ['76', ',', '2%']}
# , '37,8%': ['37', ',', '8%'], '23,8%': ['23', ',', '8%'], '17,6%': ['17', ',', '6%'], '12,6%': ['12', ',', '6%'], '58,2%': ['58', ',', '2%'], '76,2%': ['76', ',', '2%']
SVALA_HAND_FIXES_MERGE = {('oz', '.'): 'oz.', ('Npr', '.'): 'Npr.', ('npr', '.'): 'npr.', ('1', '.'): '1.', ('2', '.'): '2.', ('3', '.'): '3.', ('m', '.'): 'm.', ('itn', '.'): 'itn.', ('max', '.'): 'max.', ('4', '.'): '4.', ('cca', '.'): 'cca.', ('30', '.'): '30.', ('mlad', '.'): 'mlad.', (':)', ')'): ':))', ('sv', '.'): 'sv.', ('p', '.'): 'p.'}
OBELIKS_HAND_FIXES_MERGE = {'2015.': ['2015', '.']}
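# The three tables above cover tokenization mismatches between svala and the classla/obeliks tokenizer:
# HAND_FIXES maps a single svala token to the token sequence the tokenizer produces for it,
# SVALA_HAND_FIXES_MERGE glues a run of svala tokens back into one token, and
# OBELIKS_HAND_FIXES_MERGE re-splits an obeliks token.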
def merge_svala_data_elements(svala_data_object, i, mask_len):
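# Merge the mask_len svala source elements ending at index i into one element: concatenate their text,
# keep the first source/target element, and drop the now-redundant elements and edges.
# Returns the adjusted index into the (shortened) source list.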
final_text = ''
involved_sources = []
involved_targets = []
involved_edges = []
for el in svala_data_object.svala_data['source'][i - mask_len + 1:i + 1]:
# check whether merge won't cause further (unnoticed) issues later
edges = svala_data_object.links_ids_mapper[el['id']]
if len(edges) != 1:
raise ValueError('Incorrect number of edges!')
edge = svala_data_object.svala_data['edges'][edges[0]]
# TODO check if or len(edge['labels']) != 0 has to be added
if len(edge['source_ids']) != 1 or len(edge['target_ids']) != 1:
raise ValueError('Possible errors - CHECK!')
final_text += el['text']
involved_sources.append(edge['source_ids'][0])
involved_targets.append(edge['target_ids'][0])
involved_edges.append(edge['id'])
# erase merged svala elements
svala_data_object.svala_data['source'][i - mask_len + 1]['text'] = final_text
svala_data_object.svala_data['source'] = [el for el in svala_data_object.svala_data['source'] if
el['id'] not in involved_sources[1:]]
for el in svala_data_object.svala_data['target']:
if el['id'] == involved_targets[0]:
el['text'] = final_text
break
svala_data_object.svala_data['target'] = [el for el in svala_data_object.svala_data['target'] if
el['id'] not in involved_targets[1:]]
svala_data_object.svala_data['edges'] = {k: v for k, v in svala_data_object.svala_data['edges'].items() if
v['id'] not in involved_edges[1:]}
i -= len(involved_sources[1:])
return i
def apply_svala_handfixes(svala_data_object):
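# Slide a window (deque) over the source tokens and, whenever the window matches a key in
# SVALA_HAND_FIXES_MERGE (e.g. ('oz', '.')), merge those elements back into a single token.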
hand_fix_mask = []
for key in SVALA_HAND_FIXES_MERGE.keys():
if len(key) not in hand_fix_mask:
hand_fix_mask.append(len(key))
remember_length = max(hand_fix_mask)
q = deque()
i = 0
for el in svala_data_object.svala_data['source']:
q.append(el['text'])
if len(q) > remember_length:
q.popleft()
for mask_len in hand_fix_mask:
list_q = list(q)
if len(list_q) - mask_len >= 0:
key = tuple(list_q[remember_length - mask_len:])
if key in SVALA_HAND_FIXES_MERGE:
i = merge_svala_data_elements(svala_data_object, i, mask_len)
i += 1
def apply_obeliks_handfixes(tokenized_paragraph):
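# Re-split tokens listed in OBELIKS_HAND_FIXES_MERGE (e.g. '2015.' -> '2015' + '.') and
# renumber token ids in any sentence that contained such a token.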
for t_i in range(len(tokenized_paragraph)):
sen = tokenized_paragraph[t_i]
i = 0
error = False
for tok in sen:
# if tok['text'] == ',,,':
# tok['text'] = ','
if tok['text'] in OBELIKS_HAND_FIXES_MERGE:
error = True
break
i += 1
if error:
new_sen = []
new_id = 1
for t in sen:
if t['text'] in OBELIKS_HAND_FIXES_MERGE:
for ex_t in OBELIKS_HAND_FIXES_MERGE[t['text']]:
new_sen.append({'id': tuple([new_id]), 'text': ex_t})
new_id += 1
else:
new_sen.append({'id': tuple([new_id]), 'text': t['text']})
new_id += 1
tokenized_paragraph[t_i] = new_sen

@@ -18,248 +18,20 @@ def create_edges_list(target_ids, links_ids_mapper):
SKIP_IDS = ['solar2284s.1.1.1']
-def create_edges(svala_data, source_par, target_par):
+def create_edges(raw_edges, source_par, target_par):
if source_par and source_par[0]:
if source_par[0][0]['id'] in SKIP_IDS:
return []
# print(source_par[0][0]['id'])
# if source_par[0][0]['id'] == 'solar17s.6.3.1':
# print('pause!')
# if target_par and target_par[0]:
# print(target_par[0][0]['id'])
# if target_par[0][0]['id'] == 'solar2150t.4.1.1':
# print('pause!')
source_mapper = {el['svala_id']: el['id'] for source in source_par for el in source}
target_mapper = {el['svala_id']: el['id'] for target in target_par for el in target}
source_ids = [[el['svala_id'] for el in source] for source in source_par]
target_ids = [[el['svala_id'] for el in target] for target in target_par]
source_sentence_ids = [set([el['svala_id'] for el in source]) for source in source_par]
target_sentence_ids = [set([el['svala_id'] for el in target]) for target in target_par]
# create links to ids mapper
links_ids_mapper = {}
edges_of_one_type = set()
# delete empty edge
if 'e-' in svala_data['edges']:
del (svala_data['edges']['e-'])
for k, v in svala_data['edges'].items():
has_source = False
has_target = False
for el in v['ids']:
# create edges of one type
if el[0] == 's':
has_source = True
if el[0] == 't':
has_target = True
# create links_ids_mapper
if el not in links_ids_mapper:
links_ids_mapper[el] = []
links_ids_mapper[el].append(k)
if not has_source or not has_target or (len(svala_data['source']) == 1 and svala_data['source'][0]['text'] == ' ') \
or (len(svala_data['target']) == 1 and svala_data['target'][0]['text'] == ' '):
edges_of_one_type.add(k)
# delete edge with space
save_deleted_edges = {}
if len(svala_data['source']) == 1 and svala_data['source'][0]['text'] == ' ':
for edg in links_ids_mapper[svala_data['source'][0]['id']]:
save_deleted_edges[edg] = svala_data['edges'][edg]
del (svala_data['edges'][edg])
del (links_ids_mapper[svala_data['source'][0]['id']])
if len(svala_data['target']) == 1 and svala_data['target'][0]['text'] == ' ':
for edg in links_ids_mapper[svala_data['target'][0]['id']]:
save_deleted_edges[edg] = svala_data['edges'][edg]
del (svala_data['edges'][edg])
del (links_ids_mapper[svala_data['target'][0]['id']])
# create edge order
edges_order = []
edges_processed = set()
active_target_sentence_i = 0
# create target edges
target_edges, target_edges_set = create_edges_list(target_ids, links_ids_mapper)
source_edges, source_edges_set = create_edges_list(source_ids, links_ids_mapper)
last_target_edge = ''
for active_source_sentence_i, active_source_sentence in enumerate(source_edges):
for source_edge in active_source_sentence:
# print(source_edge)
# if 'e-s7-t8' == source_edge:
# print('aaa')
if source_edge in edges_of_one_type:
if source_edge not in edges_processed:
edges_order.append(source_edge)
edges_processed.add(source_edge)
elif target_edges_set and source_edge in target_edges_set[active_target_sentence_i]:
# if 'e-s119-t119' == source_edge:
# print('aaa')
if source_edge not in edges_processed:
edges_order.append(source_edge)
edges_processed.add(source_edge)
last_target_edge = source_edge
# when source is connected to two targets
elif source_edge not in target_edges_set[active_target_sentence_i]:
# add missing edges from target
while source_edge not in target_edges_set[active_target_sentence_i]:
for target_edge in target_edges[active_target_sentence_i]:
if target_edge in edges_of_one_type:
if target_edge not in edges_processed:
edges_order.append(target_edge)
edges_processed.add(target_edge)
last_target_edge = target_edge
active_target_sentence_i += 1
if source_edge in target_edges_set[active_target_sentence_i]:
if source_edge not in edges_processed:
edges_order.append(source_edge)
edges_processed.add(source_edge)
else:
raise 'Impossible!!!'
if not target_edges_set or not target_edges_set[0] or active_target_sentence_i >= len(target_edges):
continue
if len(target_edges[active_target_sentence_i]) == 0:
active_target_sentence_i += 1
continue
if last_target_edge == target_edges[active_target_sentence_i][-1] or (len(target_edges[active_target_sentence_i]) > 1 and last_target_edge == target_edges[active_target_sentence_i][-2] and (target_edges[active_target_sentence_i][-1] in edges_of_one_type or (target_edges[active_target_sentence_i][-1] not in edges_of_one_type and target_edges[active_target_sentence_i][-1] in source_edges_set[active_source_sentence_i]))):
for target_edge in target_edges[active_target_sentence_i]:
if target_edge in edges_of_one_type:
if target_edge not in edges_processed:
edges_order.append(target_edge)
edges_processed.add(target_edge)
last_target_edge = target_edge
active_target_sentence_i += 1
continue
target_edge_in_next_source_edge_sentence = False
for target_edge in target_edges[active_target_sentence_i]:
if active_source_sentence_i + 1 < len(source_edges_set) and target_edge in source_edges_set[active_source_sentence_i + 1]:
target_edge_in_next_source_edge_sentence = True
break
if target_edge_in_next_source_edge_sentence:
pass
elif not target_edge_in_next_source_edge_sentence:
target_edge_in_next_source_edge_sentence = False
while not target_edge_in_next_source_edge_sentence:
# if active_target_sentence_i >= len(target_edges_set):
# break
for target_edge in target_edges[active_target_sentence_i]:
if target_edge in edges_of_one_type:
if target_edge not in edges_processed:
edges_order.append(target_edge)
edges_processed.add(target_edge)
last_target_edge = target_edge
# if there is no next source sentence
if active_source_sentence_i + 1 >= len(source_edges_set):
target_edge_in_next_source_edge_sentence = True
# if last_target_edge only in target stop regularly
if last_target_edge == target_edges[active_target_sentence_i][-1]:
target_edge_in_next_source_edge_sentence = True
# test if target_edge in next source
for target_edge in target_edges[active_target_sentence_i]:
if active_source_sentence_i + 1 < len(source_edges_set) and target_edge in source_edges_set[
active_source_sentence_i + 1]:
target_edge_in_next_source_edge_sentence = True
break
active_target_sentence_i += 1
if not source_edges:
for active_target_sentence in target_edges:
for target_edge in active_target_sentence:
if target_edge not in edges_processed:
edges_order.append(target_edge)
edges_processed.add(target_edge)
# # DEBUG stuff
# for edge_order in edges_order:
# if edges_order.count(edge_order) > 1:
# # if edge_order not in a:
# print(f'ERROR {edge_order}')
#
# for edge_order in edges_order:
# if edge_order not in svala_data['edges']:
# print(f'ERROR {edge_order}')
#
# for key in svala_data['edges'].keys():
# if key not in edges_order:
# print(f'ERROR {key}')
#
# a = len(svala_data['edges'])
# b = len(edges_order)
if len(svala_data['edges']) != len(edges_order):
for k, v in save_deleted_edges.items():
svala_data['edges'][k] = v
assert len(svala_data['edges']) == len(edges_order)
sentence_edges = []
source_sent_id = 0
target_sent_id = 0
# actually add edges
edges = []
-for edge_id in edges_order:
-labels = svala_data['edges'][edge_id]['labels']
-source_ids = [source_mapper[el] for el in svala_data['edges'][edge_id]['ids'] if el in source_mapper]
-target_ids = [target_mapper[el] for el in svala_data['edges'][edge_id]['ids'] if el in target_mapper]
+for _, edge in raw_edges.items():
+labels = edge['labels']
+source_ids = [source_mapper[el] for el in edge['ids'] if el in source_mapper]
+target_ids = [target_mapper[el] for el in edge['ids'] if el in target_mapper]
ids = svala_data['edges'][edge_id]['ids']
source_ok = [el[0] == 't' or el in source_sentence_ids[source_sent_id] for el in ids] if source_sentence_ids else []
source_ok_all = all(source_ok)
if not source_ok_all:
source_sent_id += 1
target_ok = [el[0] == 's' or el in target_sentence_ids[target_sent_id] for el in ids] if target_sentence_ids else []
target_ok_all = all(target_ok)
if not target_ok_all:
target_sent_id += 1
if not source_ok_all or not target_ok_all:
sentence_edges.append(edges)
edges = []
edges.append({'source_ids': source_ids, 'target_ids': target_ids, 'labels': labels})
-if edges:
+return edges
sentence_edges.append(edges)
actual_sentence_edges = []
passed_sentence = []
for sent in sentence_edges:
ha_source = False
ha_target = False
for toke in sent:
if len(toke['target_ids']) > 0:
ha_target = toke['target_ids'][0]
if len(toke['source_ids']) > 0:
ha_source = toke['source_ids'][0]
if ha_target and ha_source:
break
if not ha_target or not ha_source:
passed_sentence.extend(sent)
else:
passed_sentence.extend(sent)
actual_sentence_edges.append(passed_sentence)
passed_sentence = []
if passed_sentence:
actual_sentence_edges.append(passed_sentence)
return actual_sentence_edges
def update_ids(pretag, in_list):

@@ -1,13 +1,29 @@
import re
-HAND_FIXES = {'§§§pisala': ['§', '§', '§', 'pisala'], '§§§poldne': ['§', '§', '§', 'poldne'], '§§§o': ['§', '§', '§', 'o'], '§§§mimi': ['§', '§', '§', 'mimi'], '§§§nil': ['§', '§', '§', 'nil'], '§§§ela': ['§', '§', '§', 'ela'], 'sam§§§': ['sam', '§', '§', '§'], 'globa觧§': ['globač', '§', '§', '§'], 'sin.': ['sin', '.'], '§§§oveduje': ['§', '§', '§', 'oveduje'], 'na§§§': ['na', '§', '§', '§'], '§§§ka§§§': ['§', '§', '§', 'ka', '§', '§', '§'], '§§§e§§§': ['§', '§', '§', 'e', '§', '§', '§'], '§§§': ['§', '§', '§'], 'ljubezni.': ['ljubezni', '.'], '12.': ['12', '.'], '16.': ['16', '.'], 'st.': ['st', '.'], 'S.': ['S', '.'], 'pr.': ['pr', '.'], 'n.': ['n', '.'], '19:30': ['19', ':', '30'], '9.': ['9', '.'], '6:35': ['6', ':', '35'], 'itd.': ['itd', '.'], 'Sv.': ['Sv', '.'], 'npr.': ['npr', '.'], 'sv.': ['sv', '.'], '12:00': ['12', ':', '00'], "sram'vali": ['sram', "'", 'vali'], '18:00': ['18', ':', '00'], 'J.': ['J', '.'], '5:45': ['5', ':', '45'], '17.': ['17', '.'], '9.00h': ['9', '.', '00h'], 'H.': ['H', '.'], '1.': ['1', '.'], '6.': ['6', '.'], '7:10': ['7', ':', '10'], 'g.': ['g', '.'], 'Oz.': ['Oz', '.'], '20:00': ['20', ':', '00'], '17.4.2010': ['17.', '4.', '2010'], 'ga.': ['ga', '.'], 'prof.': ['prof', '.'], '6:45': ['6', ':', '45'], '19.': ['19', '.'], '3.': ['3', '.'], 'tj.': ['tj', '.'], 'Prof.': ['Prof', '.'], '8.': ['8', '.'], '9:18': ['9', ':', '18'], 'ipd.': ['ipd', '.'], '7.': ['7', '.'], 'št.': ['št', '.'], 'oz.': ['oz', '.'], 'R.': ['R', '.'], '13:30': ['13', ':', '30'], '5.': ['5', '.'], '...': ['.', '.', '.']}
+from src.read.hand_fixes import HAND_FIXES, apply_obeliks_handfixes, SVALA_HAND_FIXES_MERGE
def read_raw_text(path):
-with open(path, 'r') as rf:
-return rf.read()
+print(path)
+# if path == "data/KOST/raw/L-1819-110.txt":
# print('here')
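# KOST raw files do not share a single encoding, so try UTF-8 first and fall back to
# UTF-16 and then windows-1250.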
try:
with open(path, 'r', encoding='utf-8') as rf:
return rf.read()
except UnicodeError:
try:
with open(path, 'r', encoding='utf-16') as rf:
return rf.read()
except UnicodeError:
with open(path, 'r', encoding="windows-1250") as rf:
return rf.read()
def map_svala_tokenized(svala_data_part, tokenized_paragraph, sent_i):
# apply handfixes for obeliks
apply_obeliks_handfixes(tokenized_paragraph)
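# Align the svala tokens with the classla tokenization, consulting HAND_FIXES (and extending it
# on the fly further down) whenever the two disagree on token boundaries.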
paragraph_res = []
wierd_sign_count = 0
svala_data_i = 0
@@ -21,11 +37,14 @@ def map_svala_tokenized(svala_data_part, tokenized_paragraph, sent_i):
assert tok['misc'] == 'SpaceAfter=No'
space_after = not 'misc' in tok
if len(svala_data_part) <= svala_data_i:
# if sentence does not end add it anyway
# TODO i error?
if sentence_res:
paragraph_res.append(sentence_res)
return i, paragraph_res
-if svala_data_part[svala_data_i]['text'].strip() != tok['text']:
-key = svala_data_part[svala_data_i]['text'].strip()
+if svala_data_part[svala_data_i]['text'] != tok['text']:
+key = svala_data_part[svala_data_i]['text']
if key not in HAND_FIXES:
print(f'key: {key} ; tok[text]: {tok["text"]}')
if key.startswith('§§§') and key.endswith('§§§'):
HAND_FIXES[key] = ['§', '§', '§', key[3:-3], '§', '§', '§']
elif key.startswith('§§§'):
@@ -33,7 +52,23 @@ def map_svala_tokenized(svala_data_part, tokenized_paragraph, sent_i):
elif key.endswith('§§§'):
HAND_FIXES[key] = [key[:-3], '§', '§', '§']
else:
-raise 'Word mismatch!'
+if len(key) < len(tok['text']):
print('HAND_FIXES_MERGE:')
print(f", ('{tok['text'][:len(key)]}', '{tok['text'][len(key):]}'): '{tok['text']}'")
SVALA_HAND_FIXES_MERGE[(tok['text'][:len(key)], tok['text'][len(key):])] = tok['text']
a = SVALA_HAND_FIXES_MERGE
else:
print('HAND_FIXES OLD:')
print(f", '{key}': ['{key[:len(tok['text'])]}', '{key[len(tok['text']):]}']")
print('HAND_FIXES NEW:')
reg = re.findall(r"[\w]+|[^\s\w]", key)
print(f", '{key}': {str(reg)}")
# HAND_FIXES[key] = [key[:len(tok['text'])], key[len(tok['text']):]]
HAND_FIXES[key] = re.findall(r"[\w]+|[^\s\w]", key)
print(f'key: {key} ; tok[text]: {tok["text"]}')
# raise ValueError('Word mismatch!')
if tok['text'] == HAND_FIXES[key][wierd_sign_count]:
wierd_sign_count += 1
@@ -42,6 +77,10 @@ def map_svala_tokenized(svala_data_part, tokenized_paragraph, sent_i):
else:
tok['text'] = key
wierd_sign_count = 0
elif key in ['[XKrajX]']:
tok['text'] = '[XKrajX]'
elif key in ['[XImeX]']:
tok['text'] = '[XImeX]'
else:
print(f'key: {key} ; tok[text]: {tok["text"]}')
raise ValueError('Word mismatch!')

@@ -1,13 +1,17 @@
import json
import logging
import os
import pickle
-import classla
+import queue
import string
from collections import deque
import classla
-from src.read.merge import merge
+from src.read.hand_fixes import apply_svala_handfixes
from src.read.merge import merge, create_conllu, create_edges
from src.read.read import read_raw_text, map_svala_tokenized
from src.read.svala_data import SvalaData
HAND_FIXES = {'§§§pisala': ['§', '§', '§', 'pisala'], '§§§poldne': ['§', '§', '§', 'poldne'], '§§§o': ['§', '§', '§', 'o'], '§§§mimi': ['§', '§', '§', 'mimi'], '§§§nil': ['§', '§', '§', 'nil'], '§§§ela': ['§', '§', '§', 'ela'], 'sam§§§': ['sam', '§', '§', '§'], 'globa觧§': ['globač', '§', '§', '§'], 'sin.': ['sin', '.'], '§§§oveduje': ['§', '§', '§', 'oveduje'], 'na§§§': ['na', '§', '§', '§'], '§§§ka§§§': ['§', '§', '§', 'ka', '§', '§', '§'], '§§§e§§§': ['§', '§', '§', 'e', '§', '§', '§'], '§§§': ['§', '§', '§'], 'ljubezni.': ['ljubezni', '.'], '12.': ['12', '.'], '16.': ['16', '.'], 'st.': ['st', '.'], 'S.': ['S', '.'], 'pr.': ['pr', '.'], 'n.': ['n', '.'], '19:30': ['19', ':', '30'], '9.': ['9', '.'], '6:35': ['6', ':', '35'], 'itd.': ['itd', '.'], 'Sv.': ['Sv', '.'], 'npr.': ['npr', '.'], 'sv.': ['sv', '.'], '12:00': ['12', ':', '00'], "sram'vali": ['sram', "'", 'vali'], '18:00': ['18', ':', '00'], 'J.': ['J', '.'], '5:45': ['5', ':', '45'], '17.': ['17', '.'], '9.00h': ['9', '.', '00h'], 'H.': ['H', '.'], '1.': ['1', '.'], '6.': ['6', '.'], '7:10': ['7', ':', '10'], 'g.': ['g', '.'], 'Oz.': ['Oz', '.'], '20:00': ['20', ':', '00'], '17.4.2010': ['17.', '4.', '2010'], 'ga.': ['ga', '.'], 'prof.': ['prof', '.'], '6:45': ['6', ':', '45'], '19.': ['19', '.'], '3.': ['3', '.'], 'tj.': ['tj', '.'], 'Prof.': ['Prof', '.'], '8.': ['8', '.'], '9:18': ['9', ':', '18'], 'ipd.': ['ipd', '.'], '7.': ['7', '.'], 'št.': ['št', '.'], 'oz.': ['oz', '.'], 'R.': ['R', '.'], '13:30': ['13', ':', '30'], '5.': ['5', '.'], '...': ['.', '.', '.']}
def add_error_token(el, out_list, sentence_string_id, out_list_i, out_list_ids, is_source, s_t_id):
@@ -129,9 +133,114 @@ def add_errors(svala_i, source_i, target_i, error, source, target, svala_data, s
return svala_i, source_i, target_i
-def create_target(svala_data, source_tokenized):
-for i, el in enumerate(svala_data['target']):
-print(i)
+def create_target(svala_data_object, source_tokenized):
+source_tokenized_dict = {}
+for i, sent in enumerate(source_tokenized):
for tok in sent:
tok['sent_id'] = i + 1
source_tokenized_dict[tok['svala_id']] = tok
links_ids_mapper, edges_of_one_type = svala_data_object.links_ids_mapper, svala_data_object.edges_of_one_type
curr_sententence = 1
source_curr_sentence = 1
target_tokenized = []
target_sent_tokenized = []
tok_i = 1
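# Build target sentences token by token: a token aligned 1:1 with a source token inherits the
# source tag and spacing, and sentence breaks follow the source sentence boundaries; unaligned
# tokens are tagged heuristically and terminal punctuation (. ! ?) starts a new sentence.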
for i, token in enumerate(svala_data_object.svala_data['target']):
edge_id = links_ids_mapper[token['id']]
if len(edge_id) > 1:
print('Whaat?')
edge_id = edge_id[0]
edge = svala_data_object.svala_data['edges'][edge_id]
source_word_ids = []
target_word_ids = []
for word_id in edge['ids']:
if word_id[0] == 's':
source_word_ids.append(word_id)
if word_id[0] == 't':
target_word_ids.append(word_id)
token_text = token['text']
new_sentence = False
if len(source_word_ids) == 1:
source_id = source_word_ids[0]
source_token = source_tokenized_dict[source_id]
if source_token['sent_id'] != source_curr_sentence:
source_curr_sentence = source_token['sent_id']
if source_token['id'] == 1 and len(target_sent_tokenized) > 1:
target_tokenized.append(target_sent_tokenized)
target_sent_tokenized = []
curr_sententence += 1
tok_i = 1
# check if words are equal and update
if token_text == source_token['token']:
target_token = {
'token': source_token['token'],
'tag': source_token['tag'],
'id': tok_i,
'space_after': source_token['space_after'],
'svala_id': token['id'],
'sent_id': curr_sententence,
}
else:
# Check for punctuation mismatch.
if token_text in string.punctuation:
tag = 'pc'
else:
tag = 'w'
target_token = {
'token': token_text,
'tag': tag,
'id': tok_i,
'space_after': source_token['space_after'],
'svala_id': token['id'],
'sent_id': curr_sententence,
}
else:
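# No 1:1 source counterpart for this target token: guess the tag from punctuation, fix spacing
# around brackets and sentence-final punctuation, and possibly start a new sentence.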
space_after = True
if token_text in string.punctuation:
tag = 'pc'
if token_text in '!?.,):;]}':
if len(target_sent_tokenized) == 0:
raise ValueError('Sentence length = 0!')
target_sent_tokenized[-1]['space_after'] = False
if token_text in '!?.':
new_sentence = True
# Handle cases like `...`
if len(svala_data_object.svala_data['target']) > i + 1 and svala_data_object.svala_data['target'][i+1]['text'] in '.?!':
new_sentence = False
elif token_text in '([{':
space_after = False
else:
tag = 'w'
target_token = {
'token': token_text,
'tag': tag,
'id': tok_i,
'space_after': space_after,
'svala_id': token['id'],
'sent_id': curr_sententence,
}
target_sent_tokenized.append(target_token)
if new_sentence:
target_tokenized.append(target_sent_tokenized)
target_sent_tokenized = []
curr_sententence += 1
tok_i = 1
tok_i += 1
target_tokenized.append(target_sent_tokenized)
return target_tokenized
def tokenize(args):
@@ -149,14 +258,19 @@
nlp_tokenize = classla.Pipeline('sl', processors='tokenize', pos_lemma_pretag=True)
# filename_encountered = False
i = 0
-tokenized_source_divs = []
-tokenized_target_divs = []
+tokenized_divs = {}
+# tokenized_source_divs = {}
# tokenized_target_divs = {}
document_edges = []
text_filename = ''
for folder, _, filenames in os.walk(args.svala_folder):
-for filename in filenames:
+filenames = sorted(filenames)
for filename_i, filename in enumerate(filenames):
# if filename_i*100/len(filenames) > 35:
# print('here')
# continue
svala_path = os.path.join(folder, filename)
new_text_filename = '-'.join(filename[:-5].split('-')[:3]) + '.txt'
if text_filename != new_text_filename:
@@ -166,81 +280,61 @@
text_file) if text_file else ([], [], [])
source_sent_i = 0
-jf = open(svala_path)
+jf = open(svala_path, encoding='utf-8')
print(svala_path)
svala_data = json.load(jf)
jf.close()
svala_data_object = SvalaData(svala_data)
-target_res = create_target(svala_data, source_tokenized)
+apply_svala_handfixes(svala_data_object)
source_sent_i, source_res = map_svala_tokenized(svala_data['source'], source_tokenized, source_sent_i)
print('aaa')
source_sent_i, source_res = map_svala_tokenized(svala_data_object.svala_data['source'], source_tokenized, source_sent_i)
# target_res = create_target(svala_data, source_tokenized)
for div in et.iter('div'):
bibl = div.find('bibl')
file_name = bibl.get('n')
file_name = file_name.replace('/', '_')
print(f'{i*100/folders_count} % : {file_name}')
i += 1
# if file_name == 'S20-PI-slo-2-SG-D-2016_2017-30479-12.txt':
# if file_name == 'KUS-G-slo-4-GO-E-2009-10017':
# # # if i*100/folders_count > 40:
# filename_encountered = True
# # # # if i*100/folders_count > 41:
# # # # filename_encountered = False
# if not filename_encountered:
# continue
-svala_path = os.path.join(args.svala_folder, file_name)
+target_res = create_target(svala_data_object, source_res)
corrected_svala_path = os.path.join(args.corrected_svala_folder, file_name)
raw_texts_path = os.path.join(args.svala_generated_text_folder, file_name)
-svala_list = [[fname[:-13], fname] if 'problem' in fname else [fname[:-5], fname] for fname in os.listdir(svala_path)] if os.path.isdir(svala_path) else []
-svala_dict = {e[0]: e[1] for e in svala_list}
-if os.path.exists(corrected_svala_path):
+if text_filename not in tokenized_divs:
+tokenized_divs[text_filename] = []
+tokenized_divs[text_filename].append((filename, source_res, target_res, svala_data_object.svala_data['edges']))
corrected_svala_list = [[fname[:-13], fname] if 'problem' in fname else [fname[:-5], fname] for fname in os.listdir(corrected_svala_path)]
corrected_svala_dict = {e[0]: e[1] for e in corrected_svala_list}
-svala_dict.update(corrected_svala_dict)
-assert len(svala_dict) != 0
+logging.info(f'Tokenizing at {filename_i*100/len(filenames)} %')
+tokenized_source_divs = []
tokenized_target_divs = []
document_edges = []
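# Second pass: for each text file (div), build CoNLL-U structures for the source and target
# sentences and collect the corresponding svala edges per paragraph.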
for div_id in tokenized_divs.keys():
paragraph_edges = []
tokenized_source_paragraphs = []
tokenized_target_paragraphs = []
-paragraph_edges = []
+# par_source = []
# par_target = []
-paragraphs = div.findall('p')
-for paragraph in paragraphs:
-sentences = paragraph.findall('s')
-svala_i = 1
+for tokenized_para in tokenized_divs[div_id]:
+paragraph_name, source_res, target_res, edges = tokenized_para
+source_paragraphs = []
+target_paragraphs = []
sen_source = []
-# read json
-# if paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] == 'solar17.6':
-# print('here')
-svala_file = os.path.join(svala_path, svala_dict[paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id']])
-corrected_svala_file = os.path.join(corrected_svala_path, svala_dict[paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id']])
+sen_target = []
+for sen_i, sen in enumerate(source_res):
+source_conllu = create_conllu(sen, f'{paragraph_name[:-5]}.s{str(sen_i + 1)}')
+source_paragraphs.append(source_conllu)
+sen_source.append(sen)
add_errors_func = add_errors
-jf = open(svala_file) if not os.path.exists(corrected_svala_file) else open(corrected_svala_file)
-svala_data = json.load(jf)
-jf.close()
+for sen_i, sen in enumerate(target_res):
+target_conllu = create_conllu(sen, f'{paragraph_name}.t{str(sen_i)}')
+target_paragraphs.append(target_conllu)
sen_target.append(sen)
-source_filename = svala_dict[paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id']][:-5] + '_source.json'
-target_filename = svala_dict[paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id']][:-5] + '_target.json'
+paragraph_edges.append(edges)
+tokenized_source_paragraphs.append(source_paragraphs)
tokenized_target_paragraphs.append(target_paragraphs)
-source_raw_text = os.path.join(raw_texts_path, source_filename) if os.path.exists(os.path.join(raw_texts_path, source_filename)) else None
+paragraph_edges.append(create_edges(edges, sen_source, sen_target))
target_raw_text = os.path.join(raw_texts_path, target_filename) if os.path.exists(os.path.join(raw_texts_path, target_filename)) else None
sentence_edges, tokenized_source_sentences, tokenized_target_sentences = merge(sentences, paragraph, svala_i,
svala_data, add_errors_func, source_raw_text, target_raw_text, nlp_tokenize)
tokenized_source_paragraphs.append(tokenized_source_sentences)
tokenized_target_paragraphs.append(tokenized_target_sentences)
paragraph_edges.append(sentence_edges)
tokenized_source_divs.append(tokenized_source_paragraphs)
tokenized_target_divs.append(tokenized_target_paragraphs)
document_edges.append(paragraph_edges)
with open(args.tokenization_interprocessing, 'wb') as wp:

@@ -0,0 +1,48 @@
from collections import deque
from src.read.hand_fixes import SVALA_HAND_FIXES_MERGE
class SvalaData():
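# Thin wrapper around the raw svala JSON: normalises token text (strips whitespace) and
# precomputes a token-id -> edge-id mapper plus the set of edges that touch only one side.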
def __init__(self, svala_data):
for el in svala_data['source']:
el['text'] = el['text'].strip()
if el['text'] == '':
print('What?')
for el in svala_data['target']:
el['text'] = el['text'].strip()
if el['text'] == '':
print('What?')
self.svala_data = svala_data
self.links_ids_mapper, self.edges_of_one_type = self.create_ids_mapper(svala_data)
@staticmethod
def create_ids_mapper(svala_data):
# create links to ids mapper
links_ids_mapper = {}
edges_of_one_type = set()
for k, v in svala_data['edges'].items():
has_source = False
has_target = False
v['source_ids'] = []
v['target_ids'] = []
for el in v['ids']:
# create edges of one type
if el[0] == 's':
v['source_ids'].append(el)
has_source = True
if el[0] == 't':
v['target_ids'].append(el)
has_target = True
# create links_ids_mapper
if el not in links_ids_mapper:
links_ids_mapper[el] = []
links_ids_mapper[el].append(k)
if not has_source or not has_target or (
len(svala_data['source']) == 1 and svala_data['source'][0]['text'] == ' ') \
or (len(svala_data['target']) == 1 and svala_data['target'][0]['text'] == ' '):
edges_of_one_type.add(k)
return links_ids_mapper, edges_of_one_type

@@ -15,6 +15,7 @@ from src.annotate.annotate import annotate
from src.create_tei import construct_sentence_from_list, \
construct_paragraph_from_list, TeiDocument, build_tei_etrees, build_links, build_complete_tei, convert_bibl
from src.read.read_and_merge import tokenize
from src.write.write import write_tei
logging.basicConfig(level=logging.INFO)
