Tokenization part adapted for KOST

This commit is contained in:
Luka 2022-11-06 16:10:58 +00:00
parent bafbb6a48a
commit cc455b2558
6 changed files with 357 additions and 309 deletions

94
src/read/hand_fixes.py Normal file
View File

@ -0,0 +1,94 @@
from collections import deque
# Hand-written tokenization fixes: maps the text of ONE svala token to the
# ordered list of pieces that token corresponds to on the tokenizer side
# (e.g. 'npr.' -> ['npr', '.']).
# The table is imported by src/read/read.py (map_svala_tokenized), which walks
# the piece list while matching tokenizer output and also ADDS entries to this
# dict at runtime when it meets an unknown mismatch, so the dict is mutable
# shared state, not a constant.
HAND_FIXES = {'§§§pisala': ['§', '§', '§', 'pisala'], '§§§poldne': ['§', '§', '§', 'poldne'], '§§§o': ['§', '§', '§', 'o'], '§§§mimi': ['§', '§', '§', 'mimi'], '§§§nil': ['§', '§', '§', 'nil'], '§§§ela': ['§', '§', '§', 'ela'], 'sam§§§': ['sam', '§', '§', '§'], 'globa觧§': ['globač', '§', '§', '§'], 'sin.': ['sin', '.'], '§§§oveduje': ['§', '§', '§', 'oveduje'], 'na§§§': ['na', '§', '§', '§'], '§§§ka§§§': ['§', '§', '§', 'ka', '§', '§', '§'], '§§§e§§§': ['§', '§', '§', 'e', '§', '§', '§'], '§§§': ['§', '§', '§'], 'ljubezni.': ['ljubezni', '.'], '12.': ['12', '.'], '16.': ['16', '.'], 'st.': ['st', '.'], 'S.': ['S', '.'], 'pr.': ['pr', '.'], 'n.': ['n', '.'], '19:30': ['19', ':', '30'], '9.': ['9', '.'], '6:35': ['6', ':', '35'], 'itd.': ['itd', '.'], 'Sv.': ['Sv', '.'], 'npr.': ['npr', '.'], 'sv.': ['sv', '.'], '12:00': ['12', ':', '00'], "sram'vali": ['sram', "'", 'vali'], '18:00': ['18', ':', '00'], 'J.': ['J', '.'], '5:45': ['5', ':', '45'], '17.': ['17', '.'], '9.00h': ['9', '.', '00h'], 'H.': ['H', '.'], '1.': ['1', '.'], '6.': ['6', '.'], '7:10': ['7', ':', '10'], 'g.': ['g', '.'], 'Oz.': ['Oz', '.'], '20:00': ['20', ':', '00'], '17.4.2010': ['17.', '4.', '2010'], 'ga.': ['ga', '.'], 'prof.': ['prof', '.'], '6:45': ['6', ':', '45'], '19.': ['19', '.'], '3.': ['3', '.'], 'tj.': ['tj', '.'], 'Prof.': ['Prof', '.'], '8.': ['8', '.'], '9:18': ['9', ':', '18'], 'ipd.': ['ipd', '.'], '7.': ['7', '.'], 'št.': ['št', '.'], 'oz.': ['oz', '.'], 'R.': ['R', '.'], '13:30': ['13', ':', '30'], '5.': ['5', '.'], '...': ['.', '.', '.'], 'plavali.': ['plavali', '.'], '[XImeX]': ['[', 'XImeX', ']'], '[XimeX]': ['[', 'XimeX', ']'], 'hipoteze:': ['hipoteze', ':'], 'prehrano?': ['prehrano', '?'], '68-letna': ['68', '-', 'letna'], 'pojma:': ['pojma', ':'], '[XKrajX]': ['[', 'XKrajX', ']'], '3/4': ['3', '/', '4'], 'I-phonea': ['I', '-', 'phonea'], 'kredita:': ['kredita', ':'], '[XFakultetaX]': ['[', 'XFakultetaX', ']'], 'športno-eleganten': ['športno', '-', 'eleganten'], 
'[XStudijskaSmerX]': ['[', 'XStudijskaSmerX', ']'], '[XNaslovX]': ['[', 'XNaslovX', ']'], '(tudi': ['(', 'tudi'], 'kupujem)': ['kupujem', ')'], '[XPriimekX]': ['[', 'XPriimekX', ']'], '[XPodjetjeX]': ['[', 'XPodjetjeX', ']'], 'Zagreb,': ['Zagreb', ','], 'Budimpešto.': ['Budimpešto', '.'], 'žalost.': ['žalost', '.'], '....': ['.', '.', '.', '.'], '[XStevilkaX]': ['[', 'XStevilkaX', ']'], 'e-naslov': ['e', '-', 'naslov'], '[XEnaslovX]': ['[', 'XEnaslovX', ']'], 'e-pošto': ['e', '-', 'pošto'], '[XDatumX]': ['[', 'XDatumX', ']'], 'eno-sobno': ['eno', '-', 'sobno'], 'lgbtq-prijazna': ['lgbtq', '-', 'prijazna'], 'lgbtq-prijaznega': ['lgbtq', '-', 'prijaznega'], 'Covid-19': ['Covid', '-', '19'], ',,,': [',', ',', ','], 'e-maila': ['e', '-', 'maila'], 'T&d': ['T', '&', 'd'], 'Spider-Man': ['Spider', '-', 'Man'], '12-strani': ['12', '-', 'strani'], 'turbo-folk': ['turbo', '-', 'folk'], 'Cp-čkar': ['Cp', '-', 'čkar'], '46-letnik': ['46', '-', 'letnik'], '40-letna': ['40', '-', 'letna'], '18-19h': ['18', '-', '19h'], '[XSvojilniPridevnikX]': ['[', 'XSvojilniPridevnikX', ']'], 'COVID-19': ['COVID', '-', '19'], '"sims"': ['"', 'sims', '"'], '2021/22': ['2021', '/', '22'], '2020/21': ['2020', '/', '21'], 'leto2021/22': ['leto2021', '/', '22'], 'H&m': ['H', '&', 'm'], 'high-street': ['high', '-', 'street'], 'H&M-u': ['H', '&', 'M-u'], 'H&M': ['H', '&', 'M'], 'srčno-žilnih': ['srčno', '-', 'žilnih'], 'srčno-žilni': ['srčno', '-', 'žilni'], ':))': [':)', ')'], 'You-Tube-ju': ['You', '-', 'Tube-ju'], '37,8%': ['37', ',', '8%'], '23,8%': ['23', ',', '8%'], '17,6%': ['17', ',', '6%'], '12,6%': ['12', ',', '6%'], '58,2%': ['58', ',', '2%'], '76,2%': ['76', ',', '2%']}
# NOTE(review): the commented-out entries on the next line are already present
# at the end of HAND_FIXES; this looks like a leftover and can probably go.
# , '37,8%': ['37', ',', '8%'], '23,8%': ['23', ',', '8%'], '17,6%': ['17', ',', '6%'], '12,6%': ['12', ',', '6%'], '58,2%': ['58', ',', '2%'], '76,2%': ['76', ',', '2%']

# Runs of consecutive svala SOURCE token texts (tuple key) that must be fused
# into a single token. apply_svala_handfixes only tests key membership; the
# merged text itself is rebuilt by concatenation in merge_svala_data_elements,
# so the values here are informational. src/read/read.py adds 2-tuples to this
# dict at runtime.
SVALA_HAND_FIXES_MERGE = {('oz', '.'): 'oz.', ('Npr', '.'): 'Npr.', ('npr', '.'): 'npr.', ('1', '.'): '1.', ('2', '.'): '2.', ('3', '.'): '3.', ('m', '.'): 'm.', ('itn', '.'): 'itn.', ('max', '.'): 'max.', ('4', '.'): '4.', ('cca', '.'): 'cca.', ('30', '.'): '30.', ('mlad', '.'): 'mlad.', (':)', ')'): ':))', ('sv', '.'): 'sv.', ('p', '.'): 'p.'}

# Tokenizer-side fixes used by apply_obeliks_handfixes: maps one token text to
# the list of tokens it is replaced with. Despite the `_MERGE` suffix this
# table SPLITS tokens ('2015.' -> '2015', '.').
OBELIKS_HAND_FIXES_MERGE = {'2015.': ['2015', '.']}
def merge_svala_data_elements(svala_data_object, i, mask_len):
    """Fuse ``mask_len`` consecutive source tokens, ending at index ``i``, into one.

    The tokens ``source[i - mask_len + 1 : i + 1]`` are collapsed into the
    first token of that window (its text becomes the concatenation of all
    window texts). The target tokens linked to the window are collapsed the
    same way, and the edges belonging to every window token but the first are
    removed. ``svala_data['source']``, ``['target']`` and ``['edges']`` are
    rebound to new containers; ``links_ids_mapper`` is NOT updated, so it
    keeps entries for the removed token ids.

    Args:
        svala_data_object: Object exposing ``svala_data`` (dict with the
            ``source`` / ``target`` token lists and the ``edges`` dict, each
            edge carrying ``id``, ``source_ids`` and ``target_ids``) and
            ``links_ids_mapper`` (token id -> list of edge keys).
        i: Index in ``svala_data['source']`` of the LAST token of the window.
        mask_len: Number of consecutive source tokens to merge.

    Returns:
        Index of the merged token in the rebuilt source list
        (``i - (mask_len - 1)``).

    Raises:
        ValueError: If the window does not fit inside the source list, if a
            window token is linked to more or fewer than one edge, or if such
            an edge is not a plain 1:1 source/target link.
    """
    source = svala_data_object.svala_data['source']
    start = i - mask_len + 1
    # A negative start would make the slice below wrap around (or come out
    # empty) and the merged text would then be written onto an unrelated token.
    if mask_len < 1 or start < 0 or i >= len(source):
        raise ValueError(
            f'Cannot merge {mask_len} source tokens ending at index {i} '
            f'(source has {len(source)} tokens)')

    edges = svala_data_object.svala_data['edges']
    texts = []
    involved_sources = []
    involved_targets = []
    involved_edges = []
    # Validate the whole window before mutating anything.
    for el in source[start:i + 1]:
        # check whether merge won't cause further (unnoticed) issues later
        edge_keys = svala_data_object.links_ids_mapper[el['id']]
        if len(edge_keys) != 1:
            raise ValueError('Incorrect number of edges!')
        edge = edges[edge_keys[0]]
        # TODO check if or len(edge['labels']) != 0 has to be added
        if len(edge['source_ids']) != 1 or len(edge['target_ids']) != 1:
            raise ValueError('Possible errors - CHECK!')
        texts.append(el['text'])
        involved_sources.append(edge['source_ids'][0])
        involved_targets.append(edge['target_ids'][0])
        involved_edges.append(edge['id'])
    final_text = ''.join(texts)

    # Everything but the first element of each list disappears; build the
    # lookup sets once instead of re-slicing the lists for every element.
    dropped_sources = set(involved_sources[1:])
    dropped_targets = set(involved_targets[1:])
    dropped_edges = set(involved_edges[1:])

    # erase merged svala elements
    source[start]['text'] = final_text
    svala_data_object.svala_data['source'] = [
        el for el in source if el['id'] not in dropped_sources]

    for el in svala_data_object.svala_data['target']:
        if el['id'] == involved_targets[0]:
            el['text'] = final_text
            break
    svala_data_object.svala_data['target'] = [
        el for el in svala_data_object.svala_data['target']
        if el['id'] not in dropped_targets]

    svala_data_object.svala_data['edges'] = {
        k: v for k, v in edges.items() if v['id'] not in dropped_edges}

    # The caller's index must now point at the merged token.
    return i - (len(involved_sources) - 1)
def apply_svala_handfixes(svala_data_object):
    """Merge source-token runs listed in ``SVALA_HAND_FIXES_MERGE``.

    Walks the source tokens once while keeping a sliding window of the most
    recent token texts. Whenever the tail of the window equals a key of
    ``SVALA_HAND_FIXES_MERGE`` the corresponding tokens are fused through
    ``merge_svala_data_elements`` (which rebinds the lists inside
    ``svala_data_object.svala_data``).

    Args:
        svala_data_object: Object exposing ``svala_data`` and
            ``links_ids_mapper`` as required by ``merge_svala_data_elements``.

    Returns:
        None. ``svala_data_object`` is modified in place.
    """
    # Distinct key lengths, longest first so the longest matching run wins.
    mask_lengths = sorted({len(key) for key in SVALA_HAND_FIXES_MERGE}, reverse=True)
    if not mask_lengths:
        # Nothing to merge; the original max() over an empty list would raise.
        return
    remember_length = mask_lengths[0]

    window = deque(maxlen=remember_length)
    # `i` is the index of the current token in the CURRENT source list. It
    # differs from the loop position once a merge has shortened that list; the
    # loop itself keeps iterating the list object it started with.
    i = 0
    for el in svala_data_object.svala_data['source']:
        window.append(el['text'])
        recent = list(window)
        for mask_len in mask_lengths:
            if mask_len > len(recent):
                continue
            # Take the LAST mask_len texts. Slicing from
            # `remember_length - mask_len` is only right once the window is full.
            key = tuple(recent[len(recent) - mask_len:])
            if key in SVALA_HAND_FIXES_MERGE:
                i = merge_svala_data_elements(svala_data_object, i, mask_len)
                # The window still holds the pre-merge texts, so testing
                # further masks against it would merge the wrong tokens.
                break
        i += 1
def apply_obeliks_handfixes(tokenized_paragraph):
    """Split tokenizer tokens listed in ``OBELIKS_HAND_FIXES_MERGE``.

    Every sentence containing at least one token whose text is a key of
    ``OBELIKS_HAND_FIXES_MERGE`` is rebuilt: such tokens are replaced by the
    listed pieces and all tokens of the sentence are renumbered from 1 (ids
    are 1-tuples). Sentences without a match are left untouched.

    Unlike a bare ``{'id', 'text'}`` rebuild, the remaining fields of
    untouched tokens are kept. In particular ``misc`` must survive, because
    ``map_svala_tokenized`` derives ``space_after`` from its presence. For a
    split token every piece but the last is marked ``SpaceAfter=No`` (the
    pieces were one token, so nothing separated them) and the last piece
    inherits the original token's ``misc``, if any.

    Args:
        tokenized_paragraph: List of sentences, each a list of token dicts
            with at least a ``text`` key.

    Returns:
        None. ``tokenized_paragraph`` is modified in place.
    """
    for sen_i, sen in enumerate(tokenized_paragraph):
        if not any(tok['text'] in OBELIKS_HAND_FIXES_MERGE for tok in sen):
            continue

        new_sen = []
        for tok in sen:
            pieces = OBELIKS_HAND_FIXES_MERGE.get(tok['text'])
            if pieces is None:
                # Unaffected token: keep all of its fields, renumber only.
                kept = dict(tok)
                kept['id'] = (len(new_sen) + 1,)
                new_sen.append(kept)
                continue

            last_piece_i = len(pieces) - 1
            for piece_i, piece in enumerate(pieces):
                new_tok = {'id': (len(new_sen) + 1,), 'text': piece}
                if piece_i < last_piece_i:
                    new_tok['misc'] = 'SpaceAfter=No'
                elif 'misc' in tok:
                    new_tok['misc'] = tok['misc']
                new_sen.append(new_tok)
        tokenized_paragraph[sen_i] = new_sen

View File

@ -18,248 +18,20 @@ def create_edges_list(target_ids, links_ids_mapper):
SKIP_IDS = ['solar2284s.1.1.1'] SKIP_IDS = ['solar2284s.1.1.1']
def create_edges(svala_data, source_par, target_par): def create_edges(raw_edges, source_par, target_par):
if source_par and source_par[0]:
if source_par[0][0]['id'] in SKIP_IDS:
return []
# print(source_par[0][0]['id'])
# if source_par[0][0]['id'] == 'solar17s.6.3.1':
# print('pause!')
# if target_par and target_par[0]:
# print(target_par[0][0]['id'])
# if target_par[0][0]['id'] == 'solar2150t.4.1.1':
# print('pause!')
source_mapper = {el['svala_id']: el['id'] for source in source_par for el in source} source_mapper = {el['svala_id']: el['id'] for source in source_par for el in source}
target_mapper = {el['svala_id']: el['id'] for target in target_par for el in target} target_mapper = {el['svala_id']: el['id'] for target in target_par for el in target}
source_ids = [[el['svala_id'] for el in source] for source in source_par]
target_ids = [[el['svala_id'] for el in target] for target in target_par]
source_sentence_ids = [set([el['svala_id'] for el in source]) for source in source_par]
target_sentence_ids = [set([el['svala_id'] for el in target]) for target in target_par]
# create links to ids mapper
links_ids_mapper = {}
edges_of_one_type = set()
# delete empty edge
if 'e-' in svala_data['edges']:
del (svala_data['edges']['e-'])
for k, v in svala_data['edges'].items():
has_source = False
has_target = False
for el in v['ids']:
# create edges of one type
if el[0] == 's':
has_source = True
if el[0] == 't':
has_target = True
# create links_ids_mapper
if el not in links_ids_mapper:
links_ids_mapper[el] = []
links_ids_mapper[el].append(k)
if not has_source or not has_target or (len(svala_data['source']) == 1 and svala_data['source'][0]['text'] == ' ') \
or (len(svala_data['target']) == 1 and svala_data['target'][0]['text'] == ' '):
edges_of_one_type.add(k)
# delete edge with space
save_deleted_edges = {}
if len(svala_data['source']) == 1 and svala_data['source'][0]['text'] == ' ':
for edg in links_ids_mapper[svala_data['source'][0]['id']]:
save_deleted_edges[edg] = svala_data['edges'][edg]
del (svala_data['edges'][edg])
del (links_ids_mapper[svala_data['source'][0]['id']])
if len(svala_data['target']) == 1 and svala_data['target'][0]['text'] == ' ':
for edg in links_ids_mapper[svala_data['target'][0]['id']]:
save_deleted_edges[edg] = svala_data['edges'][edg]
del (svala_data['edges'][edg])
del (links_ids_mapper[svala_data['target'][0]['id']])
# create edge order
edges_order = []
edges_processed = set()
active_target_sentence_i = 0
# create target edges
target_edges, target_edges_set = create_edges_list(target_ids, links_ids_mapper)
source_edges, source_edges_set = create_edges_list(source_ids, links_ids_mapper)
last_target_edge = ''
for active_source_sentence_i, active_source_sentence in enumerate(source_edges):
for source_edge in active_source_sentence:
# print(source_edge)
# if 'e-s7-t8' == source_edge:
# print('aaa')
if source_edge in edges_of_one_type:
if source_edge not in edges_processed:
edges_order.append(source_edge)
edges_processed.add(source_edge)
elif target_edges_set and source_edge in target_edges_set[active_target_sentence_i]:
# if 'e-s119-t119' == source_edge:
# print('aaa')
if source_edge not in edges_processed:
edges_order.append(source_edge)
edges_processed.add(source_edge)
last_target_edge = source_edge
# when source is connected to two targets
elif source_edge not in target_edges_set[active_target_sentence_i]:
# add missing edges from target
while source_edge not in target_edges_set[active_target_sentence_i]:
for target_edge in target_edges[active_target_sentence_i]:
if target_edge in edges_of_one_type:
if target_edge not in edges_processed:
edges_order.append(target_edge)
edges_processed.add(target_edge)
last_target_edge = target_edge
active_target_sentence_i += 1
if source_edge in target_edges_set[active_target_sentence_i]:
if source_edge not in edges_processed:
edges_order.append(source_edge)
edges_processed.add(source_edge)
else:
raise 'Impossible!!!'
if not target_edges_set or not target_edges_set[0] or active_target_sentence_i >= len(target_edges):
continue
if len(target_edges[active_target_sentence_i]) == 0:
active_target_sentence_i += 1
continue
if last_target_edge == target_edges[active_target_sentence_i][-1] or (len(target_edges[active_target_sentence_i]) > 1 and last_target_edge == target_edges[active_target_sentence_i][-2] and (target_edges[active_target_sentence_i][-1] in edges_of_one_type or (target_edges[active_target_sentence_i][-1] not in edges_of_one_type and target_edges[active_target_sentence_i][-1] in source_edges_set[active_source_sentence_i]))):
for target_edge in target_edges[active_target_sentence_i]:
if target_edge in edges_of_one_type:
if target_edge not in edges_processed:
edges_order.append(target_edge)
edges_processed.add(target_edge)
last_target_edge = target_edge
active_target_sentence_i += 1
continue
target_edge_in_next_source_edge_sentence = False
for target_edge in target_edges[active_target_sentence_i]:
if active_source_sentence_i + 1 < len(source_edges_set) and target_edge in source_edges_set[active_source_sentence_i + 1]:
target_edge_in_next_source_edge_sentence = True
break
if target_edge_in_next_source_edge_sentence:
pass
elif not target_edge_in_next_source_edge_sentence:
target_edge_in_next_source_edge_sentence = False
while not target_edge_in_next_source_edge_sentence:
# if active_target_sentence_i >= len(target_edges_set):
# break
for target_edge in target_edges[active_target_sentence_i]:
if target_edge in edges_of_one_type:
if target_edge not in edges_processed:
edges_order.append(target_edge)
edges_processed.add(target_edge)
last_target_edge = target_edge
# if there is no next source sentence
if active_source_sentence_i + 1 >= len(source_edges_set):
target_edge_in_next_source_edge_sentence = True
# if last_target_edge only in target stop regularly
if last_target_edge == target_edges[active_target_sentence_i][-1]:
target_edge_in_next_source_edge_sentence = True
# test if target_edge in next source
for target_edge in target_edges[active_target_sentence_i]:
if active_source_sentence_i + 1 < len(source_edges_set) and target_edge in source_edges_set[
active_source_sentence_i + 1]:
target_edge_in_next_source_edge_sentence = True
break
active_target_sentence_i += 1
if not source_edges:
for active_target_sentence in target_edges:
for target_edge in active_target_sentence:
if target_edge not in edges_processed:
edges_order.append(target_edge)
edges_processed.add(target_edge)
# # DEBUG stuff
# for edge_order in edges_order:
# if edges_order.count(edge_order) > 1:
# # if edge_order not in a:
# print(f'ERROR {edge_order}')
#
# for edge_order in edges_order:
# if edge_order not in svala_data['edges']:
# print(f'ERROR {edge_order}')
#
# for key in svala_data['edges'].keys():
# if key not in edges_order:
# print(f'ERROR {key}')
#
# a = len(svala_data['edges'])
# b = len(edges_order)
if len(svala_data['edges']) != len(edges_order):
for k, v in save_deleted_edges.items():
svala_data['edges'][k] = v
assert len(svala_data['edges']) == len(edges_order)
sentence_edges = []
source_sent_id = 0
target_sent_id = 0
# actually add edges # actually add edges
edges = [] edges = []
for edge_id in edges_order: for _, edge in raw_edges.items():
labels = svala_data['edges'][edge_id]['labels'] labels = edge['labels']
source_ids = [source_mapper[el] for el in svala_data['edges'][edge_id]['ids'] if el in source_mapper] source_ids = [source_mapper[el] for el in edge['ids'] if el in source_mapper]
target_ids = [target_mapper[el] for el in svala_data['edges'][edge_id]['ids'] if el in target_mapper] target_ids = [target_mapper[el] for el in edge['ids'] if el in target_mapper]
ids = svala_data['edges'][edge_id]['ids']
source_ok = [el[0] == 't' or el in source_sentence_ids[source_sent_id] for el in ids] if source_sentence_ids else []
source_ok_all = all(source_ok)
if not source_ok_all:
source_sent_id += 1
target_ok = [el[0] == 's' or el in target_sentence_ids[target_sent_id] for el in ids] if target_sentence_ids else []
target_ok_all = all(target_ok)
if not target_ok_all:
target_sent_id += 1
if not source_ok_all or not target_ok_all:
sentence_edges.append(edges)
edges = []
edges.append({'source_ids': source_ids, 'target_ids': target_ids, 'labels': labels}) edges.append({'source_ids': source_ids, 'target_ids': target_ids, 'labels': labels})
if edges: return edges
sentence_edges.append(edges)
actual_sentence_edges = []
passed_sentence = []
for sent in sentence_edges:
ha_source = False
ha_target = False
for toke in sent:
if len(toke['target_ids']) > 0:
ha_target = toke['target_ids'][0]
if len(toke['source_ids']) > 0:
ha_source = toke['source_ids'][0]
if ha_target and ha_source:
break
if not ha_target or not ha_source:
passed_sentence.extend(sent)
else:
passed_sentence.extend(sent)
actual_sentence_edges.append(passed_sentence)
passed_sentence = []
if passed_sentence:
actual_sentence_edges.append(passed_sentence)
return actual_sentence_edges
def update_ids(pretag, in_list): def update_ids(pretag, in_list):

View File

@ -1,13 +1,29 @@
import re
HAND_FIXES = {'§§§pisala': ['§', '§', '§', 'pisala'], '§§§poldne': ['§', '§', '§', 'poldne'], '§§§o': ['§', '§', '§', 'o'], '§§§mimi': ['§', '§', '§', 'mimi'], '§§§nil': ['§', '§', '§', 'nil'], '§§§ela': ['§', '§', '§', 'ela'], 'sam§§§': ['sam', '§', '§', '§'], 'globa觧§': ['globač', '§', '§', '§'], 'sin.': ['sin', '.'], '§§§oveduje': ['§', '§', '§', 'oveduje'], 'na§§§': ['na', '§', '§', '§'], '§§§ka§§§': ['§', '§', '§', 'ka', '§', '§', '§'], '§§§e§§§': ['§', '§', '§', 'e', '§', '§', '§'], '§§§': ['§', '§', '§'], 'ljubezni.': ['ljubezni', '.'], '12.': ['12', '.'], '16.': ['16', '.'], 'st.': ['st', '.'], 'S.': ['S', '.'], 'pr.': ['pr', '.'], 'n.': ['n', '.'], '19:30': ['19', ':', '30'], '9.': ['9', '.'], '6:35': ['6', ':', '35'], 'itd.': ['itd', '.'], 'Sv.': ['Sv', '.'], 'npr.': ['npr', '.'], 'sv.': ['sv', '.'], '12:00': ['12', ':', '00'], "sram'vali": ['sram', "'", 'vali'], '18:00': ['18', ':', '00'], 'J.': ['J', '.'], '5:45': ['5', ':', '45'], '17.': ['17', '.'], '9.00h': ['9', '.', '00h'], 'H.': ['H', '.'], '1.': ['1', '.'], '6.': ['6', '.'], '7:10': ['7', ':', '10'], 'g.': ['g', '.'], 'Oz.': ['Oz', '.'], '20:00': ['20', ':', '00'], '17.4.2010': ['17.', '4.', '2010'], 'ga.': ['ga', '.'], 'prof.': ['prof', '.'], '6:45': ['6', ':', '45'], '19.': ['19', '.'], '3.': ['3', '.'], 'tj.': ['tj', '.'], 'Prof.': ['Prof', '.'], '8.': ['8', '.'], '9:18': ['9', ':', '18'], 'ipd.': ['ipd', '.'], '7.': ['7', '.'], 'št.': ['št', '.'], 'oz.': ['oz', '.'], 'R.': ['R', '.'], '13:30': ['13', ':', '30'], '5.': ['5', '.'], '...': ['.', '.', '.']} from src.read.hand_fixes import HAND_FIXES, apply_obeliks_handfixes, SVALA_HAND_FIXES_MERGE
def read_raw_text(path): def read_raw_text(path):
with open(path, 'r') as rf: print(path)
# if path == "data/KOST/raw/L-1819-110.txt":
# print('here')
try:
with open(path, 'r', encoding='utf-8') as rf:
return rf.read()
except:
try:
with open(path, 'r', encoding='utf-16') as rf:
return rf.read()
except:
with open(path, 'r', encoding="windows-1250") as rf:
return rf.read() return rf.read()
def map_svala_tokenized(svala_data_part, tokenized_paragraph, sent_i): def map_svala_tokenized(svala_data_part, tokenized_paragraph, sent_i):
# apply handfixes for obeliks
apply_obeliks_handfixes(tokenized_paragraph)
paragraph_res = [] paragraph_res = []
wierd_sign_count = 0 wierd_sign_count = 0
svala_data_i = 0 svala_data_i = 0
@ -21,11 +37,14 @@ def map_svala_tokenized(svala_data_part, tokenized_paragraph, sent_i):
assert tok['misc'] == 'SpaceAfter=No' assert tok['misc'] == 'SpaceAfter=No'
space_after = not 'misc' in tok space_after = not 'misc' in tok
if len(svala_data_part) <= svala_data_i: if len(svala_data_part) <= svala_data_i:
# if sentence does not end add it anyway
# TODO i error?
if sentence_res:
paragraph_res.append(sentence_res)
return i, paragraph_res return i, paragraph_res
if svala_data_part[svala_data_i]['text'].strip() != tok['text']: if svala_data_part[svala_data_i]['text'] != tok['text']:
key = svala_data_part[svala_data_i]['text'].strip() key = svala_data_part[svala_data_i]['text']
if key not in HAND_FIXES: if key not in HAND_FIXES:
print(f'key: {key} ; tok[text]: {tok["text"]}')
if key.startswith('§§§') and key.endswith('§§§'): if key.startswith('§§§') and key.endswith('§§§'):
HAND_FIXES[key] = ['§', '§', '§', key[3:-3], '§', '§', '§'] HAND_FIXES[key] = ['§', '§', '§', key[3:-3], '§', '§', '§']
elif key.startswith('§§§'): elif key.startswith('§§§'):
@ -33,7 +52,23 @@ def map_svala_tokenized(svala_data_part, tokenized_paragraph, sent_i):
elif key.endswith('§§§'): elif key.endswith('§§§'):
HAND_FIXES[key] = [key[:-3], '§', '§', '§'] HAND_FIXES[key] = [key[:-3], '§', '§', '§']
else: else:
raise 'Word mismatch!' if len(key) < len(tok['text']):
print('HAND_FIXES_MERGE:')
print(f", ('{tok['text'][:len(key)]}', '{tok['text'][len(key):]}'): '{tok['text']}'")
SVALA_HAND_FIXES_MERGE[(tok['text'][:len(key)], tok['text'][len(key):])] = tok['text']
a = SVALA_HAND_FIXES_MERGE
else:
print('HAND_FIXES OLD:')
print(f", '{key}': ['{key[:len(tok['text'])]}', '{key[len(tok['text']):]}']")
print('HAND_FIXES NEW:')
reg = re.findall(r"[\w]+|[^\s\w]", key)
print(f", '{key}': {str(reg)}")
# HAND_FIXES[key] = [key[:len(tok['text'])], key[len(tok['text']):]]
HAND_FIXES[key] = re.findall(r"[\w]+|[^\s\w]", key)
print(f'key: {key} ; tok[text]: {tok["text"]}')
# raise ValueError('Word mismatch!')
if tok['text'] == HAND_FIXES[key][wierd_sign_count]: if tok['text'] == HAND_FIXES[key][wierd_sign_count]:
wierd_sign_count += 1 wierd_sign_count += 1
@ -42,6 +77,10 @@ def map_svala_tokenized(svala_data_part, tokenized_paragraph, sent_i):
else: else:
tok['text'] = key tok['text'] = key
wierd_sign_count = 0 wierd_sign_count = 0
elif key in ['[XKrajX]']:
tok['text'] = '[XKrajX]'
elif key in ['[XImeX]']:
tok['text'] = '[XImeX]'
else: else:
print(f'key: {key} ; tok[text]: {tok["text"]}') print(f'key: {key} ; tok[text]: {tok["text"]}')
raise 'Word mismatch!' raise 'Word mismatch!'

View File

@ -1,13 +1,17 @@
import json import json
import logging
import os import os
import pickle import pickle
import queue
import string
from collections import deque
import classla import classla
from src.read.hand_fixes import apply_svala_handfixes
from src.read.merge import merge from src.read.merge import merge, create_conllu, create_edges
from src.read.read import read_raw_text, map_svala_tokenized from src.read.read import read_raw_text, map_svala_tokenized
from src.read.svala_data import SvalaData
HAND_FIXES = {'§§§pisala': ['§', '§', '§', 'pisala'], '§§§poldne': ['§', '§', '§', 'poldne'], '§§§o': ['§', '§', '§', 'o'], '§§§mimi': ['§', '§', '§', 'mimi'], '§§§nil': ['§', '§', '§', 'nil'], '§§§ela': ['§', '§', '§', 'ela'], 'sam§§§': ['sam', '§', '§', '§'], 'globa觧§': ['globač', '§', '§', '§'], 'sin.': ['sin', '.'], '§§§oveduje': ['§', '§', '§', 'oveduje'], 'na§§§': ['na', '§', '§', '§'], '§§§ka§§§': ['§', '§', '§', 'ka', '§', '§', '§'], '§§§e§§§': ['§', '§', '§', 'e', '§', '§', '§'], '§§§': ['§', '§', '§'], 'ljubezni.': ['ljubezni', '.'], '12.': ['12', '.'], '16.': ['16', '.'], 'st.': ['st', '.'], 'S.': ['S', '.'], 'pr.': ['pr', '.'], 'n.': ['n', '.'], '19:30': ['19', ':', '30'], '9.': ['9', '.'], '6:35': ['6', ':', '35'], 'itd.': ['itd', '.'], 'Sv.': ['Sv', '.'], 'npr.': ['npr', '.'], 'sv.': ['sv', '.'], '12:00': ['12', ':', '00'], "sram'vali": ['sram', "'", 'vali'], '18:00': ['18', ':', '00'], 'J.': ['J', '.'], '5:45': ['5', ':', '45'], '17.': ['17', '.'], '9.00h': ['9', '.', '00h'], 'H.': ['H', '.'], '1.': ['1', '.'], '6.': ['6', '.'], '7:10': ['7', ':', '10'], 'g.': ['g', '.'], 'Oz.': ['Oz', '.'], '20:00': ['20', ':', '00'], '17.4.2010': ['17.', '4.', '2010'], 'ga.': ['ga', '.'], 'prof.': ['prof', '.'], '6:45': ['6', ':', '45'], '19.': ['19', '.'], '3.': ['3', '.'], 'tj.': ['tj', '.'], 'Prof.': ['Prof', '.'], '8.': ['8', '.'], '9:18': ['9', ':', '18'], 'ipd.': ['ipd', '.'], '7.': ['7', '.'], 'št.': ['št', '.'], 'oz.': ['oz', '.'], 'R.': ['R', '.'], '13:30': ['13', ':', '30'], '5.': ['5', '.'], '...': ['.', '.', '.']}
def add_error_token(el, out_list, sentence_string_id, out_list_i, out_list_ids, is_source, s_t_id): def add_error_token(el, out_list, sentence_string_id, out_list_i, out_list_ids, is_source, s_t_id):
@ -129,9 +133,114 @@ def add_errors(svala_i, source_i, target_i, error, source, target, svala_data, s
return svala_i, source_i, target_i return svala_i, source_i, target_i
def create_target(svala_data, source_tokenized): def create_target(svala_data_object, source_tokenized):
for i, el in enumerate(svala_data['target']): source_tokenized_dict = {}
print(i) for i, sent in enumerate(source_tokenized):
for tok in sent:
tok['sent_id'] = i + 1
source_tokenized_dict[tok['svala_id']] = tok
links_ids_mapper, edges_of_one_type = svala_data_object.links_ids_mapper, svala_data_object.edges_of_one_type
curr_sententence = 1
source_curr_sentence = 1
target_tokenized = []
target_sent_tokenized = []
tok_i = 1
for i, token in enumerate(svala_data_object.svala_data['target']):
edge_id = links_ids_mapper[token['id']]
if len(edge_id) > 1:
print('Whaat?')
edge_id = edge_id[0]
edge = svala_data_object.svala_data['edges'][edge_id]
source_word_ids = []
target_word_ids = []
for word_id in edge['ids']:
if word_id[0] == 's':
source_word_ids.append(word_id)
if word_id[0] == 't':
target_word_ids.append(word_id)
token_text = token['text']
new_sentence = False
if len(source_word_ids) == 1:
source_id = source_word_ids[0]
source_token = source_tokenized_dict[source_id]
if source_token['sent_id'] != source_curr_sentence:
source_curr_sentence = source_token['sent_id']
if source_token['id'] == 1 and len(target_sent_tokenized) > 1:
target_tokenized.append(target_sent_tokenized)
target_sent_tokenized = []
curr_sententence += 1
tok_i = 1
# check if words are equal and update
if token_text == source_token['token']:
target_token = {
'token': source_token['token'],
'tag': source_token['tag'],
'id': tok_i,
'space_after': source_token['space_after'],
'svala_id': token['id'],
'sent_id': curr_sententence,
}
else:
# Check for punctuation mismatch.
if token_text in string.punctuation:
tag = 'pc'
else:
tag = 'w'
target_token = {
'token': token_text,
'tag': tag,
'id': tok_i,
'space_after': source_token['space_after'],
'svala_id': token['id'],
'sent_id': curr_sententence,
}
else:
space_after = True
if token_text in string.punctuation:
tag = 'pc'
if token_text in '!?.,):;]}':
if len(target_sent_tokenized) == 0:
raise ValueError('Sentence lenght = 0!')
target_sent_tokenized[-1]['space_after'] = False
if token_text in '!?.':
new_sentence = True
# Handle cases like `...`
if len(svala_data_object.svala_data['target']) > i + 1 and svala_data_object.svala_data['target'][i+1]['text'] in '.?!':
new_sentence = False
elif token_text in '([{':
space_after = False
else:
tag = 'w'
target_token = {
'token': token_text,
'tag': tag,
'id': tok_i,
'space_after': space_after,
'svala_id': token['id'],
'sent_id': curr_sententence,
}
target_sent_tokenized.append(target_token)
if new_sentence:
target_tokenized.append(target_sent_tokenized)
target_sent_tokenized = []
curr_sententence += 1
tok_i = 1
tok_i += 1
target_tokenized.append(target_sent_tokenized)
return target_tokenized
def tokenize(args): def tokenize(args):
@ -149,14 +258,19 @@ def tokenize(args):
nlp_tokenize = classla.Pipeline('sl', processors='tokenize', pos_lemma_pretag=True) nlp_tokenize = classla.Pipeline('sl', processors='tokenize', pos_lemma_pretag=True)
# filename_encountered = False # filename_encountered = False
i = 0 i = 0
tokenized_source_divs = [] tokenized_divs = {}
tokenized_target_divs = [] # tokenized_source_divs = {}
# tokenized_target_divs = {}
document_edges = [] document_edges = []
text_filename = '' text_filename = ''
for folder, _, filenames in os.walk(args.svala_folder): for folder, _, filenames in os.walk(args.svala_folder):
for filename in filenames: filenames = sorted(filenames)
for filename_i, filename in enumerate(filenames):
# if filename_i*100/len(filenames) > 35:
# print('here')
# continue
svala_path = os.path.join(folder, filename) svala_path = os.path.join(folder, filename)
new_text_filename = '-'.join(filename[:-5].split('-')[:3]) + '.txt' new_text_filename = '-'.join(filename[:-5].split('-')[:3]) + '.txt'
if text_filename != new_text_filename: if text_filename != new_text_filename:
@ -166,81 +280,61 @@ def tokenize(args):
text_file) if text_file else ([], [], []) text_file) if text_file else ([], [], [])
source_sent_i = 0 source_sent_i = 0
jf = open(svala_path) jf = open(svala_path, encoding='utf-8')
print(svala_path)
svala_data = json.load(jf) svala_data = json.load(jf)
jf.close() jf.close()
svala_data_object = SvalaData(svala_data)
target_res = create_target(svala_data, source_tokenized) apply_svala_handfixes(svala_data_object)
source_sent_i, source_res = map_svala_tokenized(svala_data['source'], source_tokenized, source_sent_i)
print('aaa') source_sent_i, source_res = map_svala_tokenized(svala_data_object.svala_data['source'], source_tokenized, source_sent_i)
# target_res = create_target(svala_data, source_tokenized)
for div in et.iter('div'): target_res = create_target(svala_data_object, source_res)
bibl = div.find('bibl')
file_name = bibl.get('n')
file_name = file_name.replace('/', '_')
print(f'{i*100/folders_count} % : {file_name}')
i += 1
# if file_name == 'S20-PI-slo-2-SG-D-2016_2017-30479-12.txt':
# if file_name == 'KUS-G-slo-4-GO-E-2009-10017':
# # # if i*100/folders_count > 40:
# filename_encountered = True
# # # # if i*100/folders_count > 41:
# # # # filename_encountered = False
# if not filename_encountered:
# continue
svala_path = os.path.join(args.svala_folder, file_name) if text_filename not in tokenized_divs:
corrected_svala_path = os.path.join(args.corrected_svala_folder, file_name) tokenized_divs[text_filename] = []
raw_texts_path = os.path.join(args.svala_generated_text_folder, file_name)
svala_list = [[fname[:-13], fname] if 'problem' in fname else [fname[:-5], fname] for fname in os.listdir(svala_path)] if os.path.isdir(svala_path) else [] tokenized_divs[text_filename].append((filename, source_res, target_res, svala_data_object.svala_data['edges']))
svala_dict = {e[0]: e[1] for e in svala_list}
if os.path.exists(corrected_svala_path): logging.info(f'Tokenizing at {filename_i*100/len(filenames)} %')
corrected_svala_list = [[fname[:-13], fname] if 'problem' in fname else [fname[:-5], fname] for fname in os.listdir(corrected_svala_path)]
corrected_svala_dict = {e[0]: e[1] for e in corrected_svala_list}
svala_dict.update(corrected_svala_dict) tokenized_source_divs = []
tokenized_target_divs = []
assert len(svala_dict) != 0 document_edges = []
for div_id in tokenized_divs.keys():
paragraph_edges = []
tokenized_source_paragraphs = [] tokenized_source_paragraphs = []
tokenized_target_paragraphs = [] tokenized_target_paragraphs = []
paragraph_edges = [] # par_source = []
# par_target = []
for tokenized_para in tokenized_divs[div_id]:
paragraph_name, source_res, target_res, edges = tokenized_para
source_paragraphs = []
target_paragraphs = []
sen_source = []
sen_target = []
for sen_i, sen in enumerate(source_res):
source_conllu = create_conllu(sen, f'{paragraph_name[:-5]}.s{str(sen_i + 1)}')
source_paragraphs.append(source_conllu)
sen_source.append(sen)
paragraphs = div.findall('p') for sen_i, sen in enumerate(target_res):
for paragraph in paragraphs: target_conllu = create_conllu(sen, f'{paragraph_name}.t{str(sen_i)}')
sentences = paragraph.findall('s') target_paragraphs.append(target_conllu)
svala_i = 1 sen_target.append(sen)
paragraph_edges.append(edges)
# read json tokenized_source_paragraphs.append(source_paragraphs)
# if paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] == 'solar17.6': tokenized_target_paragraphs.append(target_paragraphs)
# print('here') paragraph_edges.append(create_edges(edges, sen_source, sen_target))
svala_file = os.path.join(svala_path, svala_dict[paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id']])
corrected_svala_file = os.path.join(corrected_svala_path, svala_dict[paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id']])
add_errors_func = add_errors
jf = open(svala_file) if not os.path.exists(corrected_svala_file) else open(corrected_svala_file)
svala_data = json.load(jf)
jf.close()
source_filename = svala_dict[paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id']][:-5] + '_source.json'
target_filename = svala_dict[paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id']][:-5] + '_target.json'
source_raw_text = os.path.join(raw_texts_path, source_filename) if os.path.exists(os.path.join(raw_texts_path, source_filename)) else None
target_raw_text = os.path.join(raw_texts_path, target_filename) if os.path.exists(os.path.join(raw_texts_path, target_filename)) else None
sentence_edges, tokenized_source_sentences, tokenized_target_sentences = merge(sentences, paragraph, svala_i,
svala_data, add_errors_func, source_raw_text, target_raw_text, nlp_tokenize)
tokenized_source_paragraphs.append(tokenized_source_sentences)
tokenized_target_paragraphs.append(tokenized_target_sentences)
paragraph_edges.append(sentence_edges)
tokenized_source_divs.append(tokenized_source_paragraphs) tokenized_source_divs.append(tokenized_source_paragraphs)
tokenized_target_divs.append(tokenized_target_paragraphs) tokenized_target_divs.append(tokenized_target_paragraphs)
document_edges.append(paragraph_edges) document_edges.append(paragraph_edges)
with open(args.tokenization_interprocessing, 'wb') as wp: with open(args.tokenization_interprocessing, 'wb') as wp:

48
src/read/svala_data.py Normal file
View File

@ -0,0 +1,48 @@
from collections import deque
from src.read.hand_fixes import SVALA_HAND_FIXES_MERGE
class SvalaData:
    """One svala document plus lookup tables derived from its edges.

    Attributes are set in ``__init__``:
        svala_data: The dict passed in (``source`` / ``target`` token lists
            and the ``edges`` dict). It is modified in place: token texts are
            stripped and every edge gains ``source_ids`` / ``target_ids``.
        links_ids_mapper: Token id -> list of keys of the edges that mention
            the token.
        edges_of_one_type: Set of edge keys, see ``create_ids_mapper``.
    """

    def __init__(self, svala_data):
        """Normalize token texts and build the edge lookup tables.

        Args:
            svala_data: Parsed svala JSON with ``source``, ``target`` and
                ``edges`` keys.
        """
        for side in ('source', 'target'):
            for el in svala_data[side]:
                el['text'] = el['text'].strip()
                if el['text'] == '':
                    # Diagnostic only: a token that was nothing but whitespace.
                    print('What?')

        self.svala_data = svala_data
        self.links_ids_mapper, self.edges_of_one_type = self.create_ids_mapper(svala_data)

    @staticmethod
    def create_ids_mapper(svala_data):
        """Index the edges of ``svala_data`` by the token ids they mention.

        Side effect: every edge dict gets ``source_ids`` (its ids starting
        with ``'s'``) and ``target_ids`` (ids starting with ``'t'``).

        Args:
            svala_data: Dict with ``source``, ``target`` and ``edges`` keys.

        Returns:
            Tuple ``(links_ids_mapper, edges_of_one_type)``:
            ``links_ids_mapper`` maps each token id to the list of edge keys
            mentioning it; ``edges_of_one_type`` is the set of edge keys that
            lack a source id or a target id. If the source or the target
            consists of a single blank token, every edge is put in that set.
        """
        def is_lone_blank(tokens):
            # ``__init__`` strips texts before calling this method, so a
            # literal ' ' can never be seen here; test for "blank" instead so
            # both stripped and raw input are recognised.
            return len(tokens) == 1 and tokens[0]['text'].strip() == ''

        lone_blank_side = (is_lone_blank(svala_data['source'])
                           or is_lone_blank(svala_data['target']))

        # create links to ids mapper
        links_ids_mapper = {}
        edges_of_one_type = set()
        for edge_key, edge in svala_data['edges'].items():
            edge['source_ids'] = []
            edge['target_ids'] = []
            for token_id in edge['ids']:
                if token_id[0] == 's':
                    edge['source_ids'].append(token_id)
                if token_id[0] == 't':
                    edge['target_ids'].append(token_id)
                links_ids_mapper.setdefault(token_id, []).append(edge_key)

            one_sided = not edge['source_ids'] or not edge['target_ids']
            if one_sided or lone_blank_side:
                edges_of_one_type.add(edge_key)

        return links_ids_mapper, edges_of_one_type

View File

@ -15,6 +15,7 @@ from src.annotate.annotate import annotate
from src.create_tei import construct_sentence_from_list, \ from src.create_tei import construct_sentence_from_list, \
construct_paragraph_from_list, TeiDocument, build_tei_etrees, build_links, build_complete_tei, convert_bibl construct_paragraph_from_list, TeiDocument, build_tei_etrees, build_links, build_complete_tei, convert_bibl
from src.read.read_and_merge import tokenize from src.read.read_and_merge import tokenize
from src.write.write import write_tei
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)