diff --git a/src/read/hand_fixes.py b/src/read/hand_fixes.py new file mode 100644 index 0000000..f1e86a6 --- /dev/null +++ b/src/read/hand_fixes.py @@ -0,0 +1,94 @@ +from collections import deque + +HAND_FIXES = {'§§§pisala': ['§', '§', '§', 'pisala'], '§§§poldne': ['§', '§', '§', 'poldne'], '§§§o': ['§', '§', '§', 'o'], '§§§mimi': ['§', '§', '§', 'mimi'], '§§§nil': ['§', '§', '§', 'nil'], '§§§ela': ['§', '§', '§', 'ela'], 'sam§§§': ['sam', '§', '§', '§'], 'globa觧§': ['globač', '§', '§', '§'], 'sin.': ['sin', '.'], '§§§oveduje': ['§', '§', '§', 'oveduje'], 'na§§§': ['na', '§', '§', '§'], '§§§ka§§§': ['§', '§', '§', 'ka', '§', '§', '§'], '§§§e§§§': ['§', '§', '§', 'e', '§', '§', '§'], '§§§': ['§', '§', '§'], 'ljubezni.': ['ljubezni', '.'], '12.': ['12', '.'], '16.': ['16', '.'], 'st.': ['st', '.'], 'S.': ['S', '.'], 'pr.': ['pr', '.'], 'n.': ['n', '.'], '19:30': ['19', ':', '30'], '9.': ['9', '.'], '6:35': ['6', ':', '35'], 'itd.': ['itd', '.'], 'Sv.': ['Sv', '.'], 'npr.': ['npr', '.'], 'sv.': ['sv', '.'], '12:00': ['12', ':', '00'], "sram'vali": ['sram', "'", 'vali'], '18:00': ['18', ':', '00'], 'J.': ['J', '.'], '5:45': ['5', ':', '45'], '17.': ['17', '.'], '9.00h': ['9', '.', '00h'], 'H.': ['H', '.'], '1.': ['1', '.'], '6.': ['6', '.'], '7:10': ['7', ':', '10'], 'g.': ['g', '.'], 'Oz.': ['Oz', '.'], '20:00': ['20', ':', '00'], '17.4.2010': ['17.', '4.', '2010'], 'ga.': ['ga', '.'], 'prof.': ['prof', '.'], '6:45': ['6', ':', '45'], '19.': ['19', '.'], '3.': ['3', '.'], 'tj.': ['tj', '.'], 'Prof.': ['Prof', '.'], '8.': ['8', '.'], '9:18': ['9', ':', '18'], 'ipd.': ['ipd', '.'], '7.': ['7', '.'], 'št.': ['št', '.'], 'oz.': ['oz', '.'], 'R.': ['R', '.'], '13:30': ['13', ':', '30'], '5.': ['5', '.'], '...': ['.', '.', '.'], 'plavali.': ['plavali', '.'], '[XImeX]': ['[', 'XImeX', ']'], '[XimeX]': ['[', 'XimeX', ']'], 'hipoteze:': ['hipoteze', ':'], 'prehrano?': ['prehrano', '?'], '68-letna': ['68', '-', 'letna'], 'pojma:': ['pojma', ':'], '[XKrajX]': ['[', 'XKrajX', ']'], '3/4': ['3', '/', '4'], 'I-phonea': ['I', '-', 'phonea'], 'kredita:': ['kredita', ':'], '[XFakultetaX]': ['[', 'XFakultetaX', ']'], 'športno-eleganten': ['športno', '-', 'eleganten'], '[XStudijskaSmerX]': ['[', 'XStudijskaSmerX', ']'], '[XNaslovX]': ['[', 'XNaslovX', ']'], '(tudi': ['(', 'tudi'], 'kupujem)': ['kupujem', ')'], '[XPriimekX]': ['[', 'XPriimekX', ']'], '[XPodjetjeX]': ['[', 'XPodjetjeX', ']'], 'Zagreb,': ['Zagreb', ','], 'Budimpešto.': ['Budimpešto', '.'], 'žalost.': ['žalost', '.'], '....': ['.', '.', '.', '.'], '[XStevilkaX]': ['[', 'XStevilkaX', ']'], 'e-naslov': ['e', '-', 'naslov'], '[XEnaslovX]': ['[', 'XEnaslovX', ']'], 'e-pošto': ['e', '-', 'pošto'], '[XDatumX]': ['[', 'XDatumX', ']'], 'eno-sobno': ['eno', '-', 'sobno'], 'lgbtq-prijazna': ['lgbtq', '-', 'prijazna'], 'lgbtq-prijaznega': ['lgbtq', '-', 'prijaznega'], 'Covid-19': ['Covid', '-', '19'], ',,,': [',', ',', ','], 'e-maila': ['e', '-', 'maila'], 'T&d': ['T', '&', 'd'], 'Spider-Man': ['Spider', '-', 'Man'], '12-strani': ['12', '-', 'strani'], 'turbo-folk': ['turbo', '-', 'folk'], 'Cp-čkar': ['Cp', '-', 'čkar'], '46-letnik': ['46', '-', 'letnik'], '40-letna': ['40', '-', 'letna'], '18-19h': ['18', '-', '19h'], '[XSvojilniPridevnikX]': ['[', 'XSvojilniPridevnikX', ']'], 'COVID-19': ['COVID', '-', '19'], '"sims"': ['"', 'sims', '"'], '2021/22': ['2021', '/', '22'], '2020/21': ['2020', '/', '21'], 'leto2021/22': ['leto2021', '/', '22'], 'H&m': ['H', '&', 'm'], 'high-street': ['high', '-', 'street'], 'H&M-u': ['H', '&', 'M-u'], 'H&M': ['H', '&', 'M'], 
'srčno-žilnih': ['srčno', '-', 'žilnih'], 'srčno-žilni': ['srčno', '-', 'žilni'], ':))': [':)', ')'], 'You-Tube-ju': ['You', '-', 'Tube-ju'], '37,8%': ['37', ',', '8%'], '23,8%': ['23', ',', '8%'], '17,6%': ['17', ',', '6%'], '12,6%': ['12', ',', '6%'], '58,2%': ['58', ',', '2%'], '76,2%': ['76', ',', '2%']} +# , '37,8%': ['37', ',', '8%'], '23,8%': ['23', ',', '8%'], '17,6%': ['17', ',', '6%'], '12,6%': ['12', ',', '6%'], '58,2%': ['58', ',', '2%'], '76,2%': ['76', ',', '2%'] +SVALA_HAND_FIXES_MERGE = {('oz', '.'): 'oz.', ('Npr', '.'): 'Npr.', ('npr', '.'): 'npr.', ('1', '.'): '1.', ('2', '.'): '2.', ('3', '.'): '3.', ('m', '.'): 'm.', ('itn', '.'): 'itn.', ('max', '.'): 'max.', ('4', '.'): '4.', ('cca', '.'): 'cca.', ('30', '.'): '30.', ('mlad', '.'): 'mlad.', (':)', ')'): ':))', ('sv', '.'): 'sv.', ('p', '.'): 'p.'} +OBELIKS_HAND_FIXES_MERGE = {'2015.': ['2015', '.']} + + +def merge_svala_data_elements(svala_data_object, i, mask_len): + final_text = '' + involved_sources = [] + involved_targets = [] + involved_edges = [] + for el in svala_data_object.svala_data['source'][i - mask_len + 1:i + 1]: + # check whether merge won't cause further (unnoticed) issues later + edges = svala_data_object.links_ids_mapper[el['id']] + if len(edges) != 1: + raise ValueError('Incorrect number of edges!') + edge = svala_data_object.svala_data['edges'][edges[0]] + # TODO check if or len(edge['labels']) != 0 has to be added + if len(edge['source_ids']) != 1 or len(edge['target_ids']) != 1: + raise ValueError('Possible errors - CHECK!') + + final_text += el['text'] + involved_sources.append(edge['source_ids'][0]) + + involved_targets.append(edge['target_ids'][0]) + involved_edges.append(edge['id']) + + # erase merged svala elements + svala_data_object.svala_data['source'][i - mask_len + 1]['text'] = final_text + svala_data_object.svala_data['source'] = [el for el in svala_data_object.svala_data['source'] if + el['id'] not in involved_sources[1:]] + + for el in svala_data_object.svala_data['target']: + if el['id'] == involved_targets[0]: + el['text'] = final_text + break + svala_data_object.svala_data['target'] = [el for el in svala_data_object.svala_data['target'] if + el['id'] not in involved_targets[1:]] + + svala_data_object.svala_data['edges'] = {k: v for k, v in svala_data_object.svala_data['edges'].items() if + v['id'] not in involved_edges[1:]} + i -= len(involved_sources[1:]) + return i + + +def apply_svala_handfixes(svala_data_object): + hand_fix_mask = [] + for key in SVALA_HAND_FIXES_MERGE.keys(): + if len(key) not in hand_fix_mask: + hand_fix_mask.append(len(key)) + + remember_length = max(hand_fix_mask) + q = deque() + + i = 0 + for el in svala_data_object.svala_data['source']: + q.append(el['text']) + if len(q) > remember_length: + q.popleft() + for mask_len in hand_fix_mask: + list_q = list(q) + if len(list_q) - mask_len >= 0: + key = tuple(list_q[remember_length - mask_len:]) + if key in SVALA_HAND_FIXES_MERGE: + i = merge_svala_data_elements(svala_data_object, i, mask_len) + i += 1 + + +def apply_obeliks_handfixes(tokenized_paragraph): + for t_i in range(len(tokenized_paragraph)): + sen = tokenized_paragraph[t_i] + i = 0 + error = False + for tok in sen: + # if tok['text'] == ',,,': + # tok['text'] = ',' + if tok['text'] in OBELIKS_HAND_FIXES_MERGE: + error = True + break + i += 1 + if error: + new_sen = [] + new_id = 1 + for t in sen: + if t['text'] in OBELIKS_HAND_FIXES_MERGE: + for ex_t in OBELIKS_HAND_FIXES_MERGE[t['text']]: + new_sen.append({'id': tuple([new_id]), 'text': ex_t}) + 
new_id += 1 + else: + new_sen.append({'id': tuple([new_id]), 'text': t['text']}) + new_id += 1 + tokenized_paragraph[t_i] = new_sen diff --git a/src/read/merge.py b/src/read/merge.py index c2a9184..7d4503a 100644 --- a/src/read/merge.py +++ b/src/read/merge.py @@ -18,248 +18,20 @@ def create_edges_list(target_ids, links_ids_mapper): SKIP_IDS = ['solar2284s.1.1.1'] -def create_edges(svala_data, source_par, target_par): - if source_par and source_par[0]: - if source_par[0][0]['id'] in SKIP_IDS: - return [] - # print(source_par[0][0]['id']) - # if source_par[0][0]['id'] == 'solar17s.6.3.1': - # print('pause!') - # if target_par and target_par[0]: - # print(target_par[0][0]['id']) - # if target_par[0][0]['id'] == 'solar2150t.4.1.1': - # print('pause!') +def create_edges(raw_edges, source_par, target_par): source_mapper = {el['svala_id']: el['id'] for source in source_par for el in source} target_mapper = {el['svala_id']: el['id'] for target in target_par for el in target} - source_ids = [[el['svala_id'] for el in source] for source in source_par] - target_ids = [[el['svala_id'] for el in target] for target in target_par] - - source_sentence_ids = [set([el['svala_id'] for el in source]) for source in source_par] - target_sentence_ids = [set([el['svala_id'] for el in target]) for target in target_par] - - # create links to ids mapper - links_ids_mapper = {} - edges_of_one_type = set() - - # delete empty edge - if 'e-' in svala_data['edges']: - del (svala_data['edges']['e-']) - - for k, v in svala_data['edges'].items(): - has_source = False - has_target = False - for el in v['ids']: - # create edges of one type - if el[0] == 's': - has_source = True - if el[0] == 't': - has_target = True - - # create links_ids_mapper - if el not in links_ids_mapper: - links_ids_mapper[el] = [] - links_ids_mapper[el].append(k) - if not has_source or not has_target or (len(svala_data['source']) == 1 and svala_data['source'][0]['text'] == ' ') \ - or (len(svala_data['target']) == 1 and svala_data['target'][0]['text'] == ' '): - edges_of_one_type.add(k) - - # delete edge with space - save_deleted_edges = {} - if len(svala_data['source']) == 1 and svala_data['source'][0]['text'] == ' ': - for edg in links_ids_mapper[svala_data['source'][0]['id']]: - save_deleted_edges[edg] = svala_data['edges'][edg] - del (svala_data['edges'][edg]) - del (links_ids_mapper[svala_data['source'][0]['id']]) - if len(svala_data['target']) == 1 and svala_data['target'][0]['text'] == ' ': - for edg in links_ids_mapper[svala_data['target'][0]['id']]: - save_deleted_edges[edg] = svala_data['edges'][edg] - del (svala_data['edges'][edg]) - del (links_ids_mapper[svala_data['target'][0]['id']]) - - # create edge order - edges_order = [] - edges_processed = set() - active_target_sentence_i = 0 - - # create target edges - target_edges, target_edges_set = create_edges_list(target_ids, links_ids_mapper) - source_edges, source_edges_set = create_edges_list(source_ids, links_ids_mapper) - - last_target_edge = '' - - for active_source_sentence_i, active_source_sentence in enumerate(source_edges): - for source_edge in active_source_sentence: - # print(source_edge) - # if 'e-s7-t8' == source_edge: - # print('aaa') - if source_edge in edges_of_one_type: - if source_edge not in edges_processed: - edges_order.append(source_edge) - edges_processed.add(source_edge) - - elif target_edges_set and source_edge in target_edges_set[active_target_sentence_i]: - - # if 'e-s119-t119' == source_edge: - # print('aaa') - if source_edge not in edges_processed: - 
edges_order.append(source_edge) - edges_processed.add(source_edge) - last_target_edge = source_edge - # when source is connected to two targets - elif source_edge not in target_edges_set[active_target_sentence_i]: - # add missing edges from target - while source_edge not in target_edges_set[active_target_sentence_i]: - for target_edge in target_edges[active_target_sentence_i]: - if target_edge in edges_of_one_type: - if target_edge not in edges_processed: - edges_order.append(target_edge) - edges_processed.add(target_edge) - last_target_edge = target_edge - active_target_sentence_i += 1 - if source_edge in target_edges_set[active_target_sentence_i]: - if source_edge not in edges_processed: - edges_order.append(source_edge) - edges_processed.add(source_edge) - - else: - raise 'Impossible!!!' - if not target_edges_set or not target_edges_set[0] or active_target_sentence_i >= len(target_edges): - continue - if len(target_edges[active_target_sentence_i]) == 0: - active_target_sentence_i += 1 - continue - - if last_target_edge == target_edges[active_target_sentence_i][-1] or (len(target_edges[active_target_sentence_i]) > 1 and last_target_edge == target_edges[active_target_sentence_i][-2] and (target_edges[active_target_sentence_i][-1] in edges_of_one_type or (target_edges[active_target_sentence_i][-1] not in edges_of_one_type and target_edges[active_target_sentence_i][-1] in source_edges_set[active_source_sentence_i]))): - for target_edge in target_edges[active_target_sentence_i]: - if target_edge in edges_of_one_type: - if target_edge not in edges_processed: - edges_order.append(target_edge) - edges_processed.add(target_edge) - last_target_edge = target_edge - active_target_sentence_i += 1 - continue - target_edge_in_next_source_edge_sentence = False - for target_edge in target_edges[active_target_sentence_i]: - if active_source_sentence_i + 1 < len(source_edges_set) and target_edge in source_edges_set[active_source_sentence_i + 1]: - target_edge_in_next_source_edge_sentence = True - break - if target_edge_in_next_source_edge_sentence: - pass - elif not target_edge_in_next_source_edge_sentence: - target_edge_in_next_source_edge_sentence = False - while not target_edge_in_next_source_edge_sentence: - # if active_target_sentence_i >= len(target_edges_set): - # break - for target_edge in target_edges[active_target_sentence_i]: - if target_edge in edges_of_one_type: - if target_edge not in edges_processed: - edges_order.append(target_edge) - edges_processed.add(target_edge) - last_target_edge = target_edge - - # if there is no next source sentence - if active_source_sentence_i + 1 >= len(source_edges_set): - target_edge_in_next_source_edge_sentence = True - - # if last_target_edge only in target stop regularly - if last_target_edge == target_edges[active_target_sentence_i][-1]: - target_edge_in_next_source_edge_sentence = True - - # test if target_edge in next source - for target_edge in target_edges[active_target_sentence_i]: - if active_source_sentence_i + 1 < len(source_edges_set) and target_edge in source_edges_set[ - active_source_sentence_i + 1]: - target_edge_in_next_source_edge_sentence = True - break - active_target_sentence_i += 1 - - if not source_edges: - for active_target_sentence in target_edges: - for target_edge in active_target_sentence: - if target_edge not in edges_processed: - edges_order.append(target_edge) - edges_processed.add(target_edge) - - # # DEBUG stuff - # for edge_order in edges_order: - # if edges_order.count(edge_order) > 1: - # # if edge_order not in a: - # 
print(f'ERROR {edge_order}') - # - # for edge_order in edges_order: - # if edge_order not in svala_data['edges']: - # print(f'ERROR {edge_order}') - # - # for key in svala_data['edges'].keys(): - # if key not in edges_order: - # print(f'ERROR {key}') - # - # a = len(svala_data['edges']) - # b = len(edges_order) - if len(svala_data['edges']) != len(edges_order): - for k, v in save_deleted_edges.items(): - svala_data['edges'][k] = v - - - assert len(svala_data['edges']) == len(edges_order) - - sentence_edges = [] - source_sent_id = 0 - target_sent_id = 0 # actually add edges edges = [] - for edge_id in edges_order: - labels = svala_data['edges'][edge_id]['labels'] - source_ids = [source_mapper[el] for el in svala_data['edges'][edge_id]['ids'] if el in source_mapper] - target_ids = [target_mapper[el] for el in svala_data['edges'][edge_id]['ids'] if el in target_mapper] - ids = svala_data['edges'][edge_id]['ids'] + for _, edge in raw_edges.items(): + labels = edge['labels'] + source_ids = [source_mapper[el] for el in edge['ids'] if el in source_mapper] + target_ids = [target_mapper[el] for el in edge['ids'] if el in target_mapper] - source_ok = [el[0] == 't' or el in source_sentence_ids[source_sent_id] for el in ids] if source_sentence_ids else [] - source_ok_all = all(source_ok) - - if not source_ok_all: - source_sent_id += 1 - - target_ok = [el[0] == 's' or el in target_sentence_ids[target_sent_id] for el in ids] if target_sentence_ids else [] - target_ok_all = all(target_ok) - - if not target_ok_all: - target_sent_id += 1 - - if not source_ok_all or not target_ok_all: - sentence_edges.append(edges) - edges = [] edges.append({'source_ids': source_ids, 'target_ids': target_ids, 'labels': labels}) - if edges: - sentence_edges.append(edges) - - actual_sentence_edges = [] - passed_sentence = [] - for sent in sentence_edges: - ha_source = False - ha_target = False - for toke in sent: - if len(toke['target_ids']) > 0: - ha_target = toke['target_ids'][0] - if len(toke['source_ids']) > 0: - ha_source = toke['source_ids'][0] - if ha_target and ha_source: - break - - if not ha_target or not ha_source: - passed_sentence.extend(sent) - - else: - passed_sentence.extend(sent) - actual_sentence_edges.append(passed_sentence) - passed_sentence = [] - - if passed_sentence: - actual_sentence_edges.append(passed_sentence) - - return actual_sentence_edges + return edges def update_ids(pretag, in_list): diff --git a/src/read/read.py b/src/read/read.py index 459c665..41e6f36 100644 --- a/src/read/read.py +++ b/src/read/read.py @@ -1,13 +1,29 @@ +import re -HAND_FIXES = {'§§§pisala': ['§', '§', '§', 'pisala'], '§§§poldne': ['§', '§', '§', 'poldne'], '§§§o': ['§', '§', '§', 'o'], '§§§mimi': ['§', '§', '§', 'mimi'], '§§§nil': ['§', '§', '§', 'nil'], '§§§ela': ['§', '§', '§', 'ela'], 'sam§§§': ['sam', '§', '§', '§'], 'globa觧§': ['globač', '§', '§', '§'], 'sin.': ['sin', '.'], '§§§oveduje': ['§', '§', '§', 'oveduje'], 'na§§§': ['na', '§', '§', '§'], '§§§ka§§§': ['§', '§', '§', 'ka', '§', '§', '§'], '§§§e§§§': ['§', '§', '§', 'e', '§', '§', '§'], '§§§': ['§', '§', '§'], 'ljubezni.': ['ljubezni', '.'], '12.': ['12', '.'], '16.': ['16', '.'], 'st.': ['st', '.'], 'S.': ['S', '.'], 'pr.': ['pr', '.'], 'n.': ['n', '.'], '19:30': ['19', ':', '30'], '9.': ['9', '.'], '6:35': ['6', ':', '35'], 'itd.': ['itd', '.'], 'Sv.': ['Sv', '.'], 'npr.': ['npr', '.'], 'sv.': ['sv', '.'], '12:00': ['12', ':', '00'], "sram'vali": ['sram', "'", 'vali'], '18:00': ['18', ':', '00'], 'J.': ['J', '.'], '5:45': ['5', ':', '45'], '17.': ['17', 
'.'], '9.00h': ['9', '.', '00h'], 'H.': ['H', '.'], '1.': ['1', '.'], '6.': ['6', '.'], '7:10': ['7', ':', '10'], 'g.': ['g', '.'], 'Oz.': ['Oz', '.'], '20:00': ['20', ':', '00'], '17.4.2010': ['17.', '4.', '2010'], 'ga.': ['ga', '.'], 'prof.': ['prof', '.'], '6:45': ['6', ':', '45'], '19.': ['19', '.'], '3.': ['3', '.'], 'tj.': ['tj', '.'], 'Prof.': ['Prof', '.'], '8.': ['8', '.'], '9:18': ['9', ':', '18'], 'ipd.': ['ipd', '.'], '7.': ['7', '.'], 'št.': ['št', '.'], 'oz.': ['oz', '.'], 'R.': ['R', '.'], '13:30': ['13', ':', '30'], '5.': ['5', '.'], '...': ['.', '.', '.']} +from src.read.hand_fixes import HAND_FIXES, apply_obeliks_handfixes, SVALA_HAND_FIXES_MERGE def read_raw_text(path): - with open(path, 'r') as rf: - return rf.read() + print(path) + # if path == "data/KOST/raw/L-1819-110.txt": + # print('here') + try: + with open(path, 'r', encoding='utf-8') as rf: + return rf.read() + except: + try: + with open(path, 'r', encoding='utf-16') as rf: + return rf.read() + except: + with open(path, 'r', encoding="windows-1250") as rf: + return rf.read() + def map_svala_tokenized(svala_data_part, tokenized_paragraph, sent_i): + # apply handfixes for obeliks + apply_obeliks_handfixes(tokenized_paragraph) + paragraph_res = [] wierd_sign_count = 0 svala_data_i = 0 @@ -21,11 +37,14 @@ def map_svala_tokenized(svala_data_part, tokenized_paragraph, sent_i): assert tok['misc'] == 'SpaceAfter=No' space_after = not 'misc' in tok if len(svala_data_part) <= svala_data_i: + # if sentence does not end add it anyway + # TODO i error? + if sentence_res: + paragraph_res.append(sentence_res) return i, paragraph_res - if svala_data_part[svala_data_i]['text'].strip() != tok['text']: - key = svala_data_part[svala_data_i]['text'].strip() + if svala_data_part[svala_data_i]['text'] != tok['text']: + key = svala_data_part[svala_data_i]['text'] if key not in HAND_FIXES: - print(f'key: {key} ; tok[text]: {tok["text"]}') if key.startswith('§§§') and key.endswith('§§§'): HAND_FIXES[key] = ['§', '§', '§', key[3:-3], '§', '§', '§'] elif key.startswith('§§§'): @@ -33,7 +52,23 @@ def map_svala_tokenized(svala_data_part, tokenized_paragraph, sent_i): elif key.endswith('§§§'): HAND_FIXES[key] = [key[:-3], '§', '§', '§'] else: - raise 'Word mismatch!' + if len(key) < len(tok['text']): + print('HAND_FIXES_MERGE:') + print(f", ('{tok['text'][:len(key)]}', '{tok['text'][len(key):]}'): '{tok['text']}'") + SVALA_HAND_FIXES_MERGE[(tok['text'][:len(key)], tok['text'][len(key):])] = tok['text'] + a = SVALA_HAND_FIXES_MERGE + else: + print('HAND_FIXES OLD:') + print(f", '{key}': ['{key[:len(tok['text'])]}', '{key[len(tok['text']):]}']") + + print('HAND_FIXES NEW:') + reg = re.findall(r"[\w]+|[^\s\w]", key) + print(f", '{key}': {str(reg)}") + + # HAND_FIXES[key] = [key[:len(tok['text'])], key[len(tok['text']):]] + HAND_FIXES[key] = re.findall(r"[\w]+|[^\s\w]", key) + print(f'key: {key} ; tok[text]: {tok["text"]}') + # raise ValueError('Word mismatch!') if tok['text'] == HAND_FIXES[key][wierd_sign_count]: wierd_sign_count += 1 @@ -42,6 +77,10 @@ def map_svala_tokenized(svala_data_part, tokenized_paragraph, sent_i): else: tok['text'] = key wierd_sign_count = 0 + elif key in ['[XKrajX]']: + tok['text'] = '[XKrajX]' + elif key in ['[XImeX]']: + tok['text'] = '[XImeX]' else: print(f'key: {key} ; tok[text]: {tok["text"]}') raise 'Word mismatch!' 
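
For readers skimming the new hand_fixes.py above: a minimal standalone sketch of the sliding-window merge that apply_svala_handfixes performs. Consecutive source tokens matching a key of SVALA_HAND_FIXES_MERGE (e.g. ('oz', '.')) are collapsed into one token. Names and token shapes here are simplified assumptions; the real function also rewrites the matching target tokens and edges via merge_svala_data_elements.

# Simplified stand-in for SVALA_HAND_FIXES_MERGE.
MERGE_FIXES = {('oz', '.'): 'oz.', ('npr', '.'): 'npr.', (':)', ')'): ':))'}

def merge_tokens(tokens):
    """Collapse consecutive tokens that form a known merge key, longest window first."""
    max_len = max(len(key) for key in MERGE_FIXES)
    merged = []
    for tok in tokens:
        merged.append(tok)
        for size in range(max_len, 1, -1):
            if len(merged) >= size and tuple(merged[-size:]) in MERGE_FIXES:
                fixed = MERGE_FIXES[tuple(merged[-size:])]
                del merged[-size:]
                merged.append(fixed)
                break
    return merged

print(merge_tokens(['Pridem', 'ob', 'petih', 'oz', '.', 'in', ':)', ')']))
# -> ['Pridem', 'ob', 'petih', 'oz.', 'in', ':))']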
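
The inverse operation, apply_obeliks_handfixes, splits tokens that the tokenizer over-merged (e.g. '2015.') and renumbers the sentence. A reduced sketch with the same {'id': ..., 'text': ...} token shape as the patch; everything else is illustrative.

OBELIKS_HAND_FIXES_MERGE = {'2015.': ['2015', '.']}

def split_hand_fixed_tokens(sentence):
    """Return a renumbered copy of `sentence` with known over-merged tokens split up."""
    new_sen = []
    new_id = 1
    for tok in sentence:
        for part in OBELIKS_HAND_FIXES_MERGE.get(tok['text'], [tok['text']]):
            new_sen.append({'id': (new_id,), 'text': part})
            new_id += 1
    return new_sen

sen = [{'id': (1,), 'text': 'Leta'}, {'id': (2,), 'text': '2015.'}]
print(split_hand_fixed_tokens(sen))
# -> [{'id': (1,), 'text': 'Leta'}, {'id': (2,), 'text': '2015'}, {'id': (3,), 'text': '.'}]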
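
read_raw_text in read.py now falls back through several encodings with nested bare excepts. A sketch of the same idea with the handler narrowed to decoding errors only (so genuine I/O failures are not silently retried); the encoding order mirrors the patch, the function name is just for illustration.

def read_raw_text_sketch(path):
    """Return the file contents, trying the encodings used in the patch in order."""
    for encoding in ('utf-8', 'utf-16', 'windows-1250'):
        try:
            with open(path, 'r', encoding=encoding) as rf:
                return rf.read()
        except UnicodeError:
            # Wrong encoding guess; try the next one.
            continue
    raise ValueError(f'none of the tried encodings could decode {path}')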
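
When map_svala_tokenized hits a token mismatch that is not yet in HAND_FIXES, it now proposes a split with re.findall(r"[\w]+|[^\s\w]", key) instead of raising immediately. The snippet below only demonstrates what that pattern produces for a few keys from the dictionary above; it is illustrative, not patch code.

import re

def propose_hand_fix(key):
    # Runs of word characters, or single non-space symbols, as in the new fallback in read.py.
    return re.findall(r"[\w]+|[^\s\w]", key)

for key in ('ljubezni.', '68-letna', '[XImeX]', '19:30'):
    print(key, '->', propose_hand_fix(key))
# ljubezni. -> ['ljubezni', '.']
# 68-letna  -> ['68', '-', 'letna']
# [XImeX]   -> ['[', 'XImeX', ']']
# 19:30     -> ['19', ':', '30']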
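
The rewritten create_edges in merge.py no longer reconstructs edge ordering per sentence; it only translates svala ids into corpus token ids. A toy version with invented ids, assuming the token shape ({'svala_id', 'id'}) and edge shape ({'ids', 'labels'}) visible in the patch:

def create_edges_sketch(raw_edges, source_par, target_par):
    source_mapper = {el['svala_id']: el['id'] for sent in source_par for el in sent}
    target_mapper = {el['svala_id']: el['id'] for sent in target_par for el in sent}
    edges = []
    for edge in raw_edges.values():
        edges.append({
            'source_ids': [i for i in (source_mapper.get(x) for x in edge['ids']) if i],
            'target_ids': [i for i in (target_mapper.get(x) for x in edge['ids']) if i],
            'labels': edge['labels'],
        })
    return edges

source_par = [[{'svala_id': 's1', 'id': 'doc1s.1.1.1'}]]   # ids are made up
target_par = [[{'svala_id': 't1', 'id': 'doc1t.1.1.1'}]]
raw_edges = {'e-s1-t1': {'ids': ['s1', 't1'], 'labels': []}}
print(create_edges_sketch(raw_edges, source_par, target_par))
# [{'source_ids': ['doc1s.1.1.1'], 'target_ids': ['doc1t.1.1.1'], 'labels': []}]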
diff --git a/src/read/read_and_merge.py b/src/read/read_and_merge.py index f7e57bd..9dfde20 100644 --- a/src/read/read_and_merge.py +++ b/src/read/read_and_merge.py @@ -1,13 +1,17 @@ import json +import logging import os import pickle -import classla +import queue +import string +from collections import deque +import classla -from src.read.merge import merge +from src.read.hand_fixes import apply_svala_handfixes +from src.read.merge import merge, create_conllu, create_edges from src.read.read import read_raw_text, map_svala_tokenized - -HAND_FIXES = {'§§§pisala': ['§', '§', '§', 'pisala'], '§§§poldne': ['§', '§', '§', 'poldne'], '§§§o': ['§', '§', '§', 'o'], '§§§mimi': ['§', '§', '§', 'mimi'], '§§§nil': ['§', '§', '§', 'nil'], '§§§ela': ['§', '§', '§', 'ela'], 'sam§§§': ['sam', '§', '§', '§'], 'globa觧§': ['globač', '§', '§', '§'], 'sin.': ['sin', '.'], '§§§oveduje': ['§', '§', '§', 'oveduje'], 'na§§§': ['na', '§', '§', '§'], '§§§ka§§§': ['§', '§', '§', 'ka', '§', '§', '§'], '§§§e§§§': ['§', '§', '§', 'e', '§', '§', '§'], '§§§': ['§', '§', '§'], 'ljubezni.': ['ljubezni', '.'], '12.': ['12', '.'], '16.': ['16', '.'], 'st.': ['st', '.'], 'S.': ['S', '.'], 'pr.': ['pr', '.'], 'n.': ['n', '.'], '19:30': ['19', ':', '30'], '9.': ['9', '.'], '6:35': ['6', ':', '35'], 'itd.': ['itd', '.'], 'Sv.': ['Sv', '.'], 'npr.': ['npr', '.'], 'sv.': ['sv', '.'], '12:00': ['12', ':', '00'], "sram'vali": ['sram', "'", 'vali'], '18:00': ['18', ':', '00'], 'J.': ['J', '.'], '5:45': ['5', ':', '45'], '17.': ['17', '.'], '9.00h': ['9', '.', '00h'], 'H.': ['H', '.'], '1.': ['1', '.'], '6.': ['6', '.'], '7:10': ['7', ':', '10'], 'g.': ['g', '.'], 'Oz.': ['Oz', '.'], '20:00': ['20', ':', '00'], '17.4.2010': ['17.', '4.', '2010'], 'ga.': ['ga', '.'], 'prof.': ['prof', '.'], '6:45': ['6', ':', '45'], '19.': ['19', '.'], '3.': ['3', '.'], 'tj.': ['tj', '.'], 'Prof.': ['Prof', '.'], '8.': ['8', '.'], '9:18': ['9', ':', '18'], 'ipd.': ['ipd', '.'], '7.': ['7', '.'], 'št.': ['št', '.'], 'oz.': ['oz', '.'], 'R.': ['R', '.'], '13:30': ['13', ':', '30'], '5.': ['5', '.'], '...': ['.', '.', '.']} +from src.read.svala_data import SvalaData def add_error_token(el, out_list, sentence_string_id, out_list_i, out_list_ids, is_source, s_t_id): @@ -129,9 +133,114 @@ def add_errors(svala_i, source_i, target_i, error, source, target, svala_data, s return svala_i, source_i, target_i -def create_target(svala_data, source_tokenized): - for i, el in enumerate(svala_data['target']): - print(i) +def create_target(svala_data_object, source_tokenized): + source_tokenized_dict = {} + for i, sent in enumerate(source_tokenized): + for tok in sent: + tok['sent_id'] = i + 1 + source_tokenized_dict[tok['svala_id']] = tok + + + links_ids_mapper, edges_of_one_type = svala_data_object.links_ids_mapper, svala_data_object.edges_of_one_type + + curr_sententence = 1 + source_curr_sentence = 1 + + target_tokenized = [] + target_sent_tokenized = [] + tok_i = 1 + + for i, token in enumerate(svala_data_object.svala_data['target']): + edge_id = links_ids_mapper[token['id']] + if len(edge_id) > 1: + print('Whaat?') + edge_id = edge_id[0] + edge = svala_data_object.svala_data['edges'][edge_id] + source_word_ids = [] + target_word_ids = [] + for word_id in edge['ids']: + if word_id[0] == 's': + source_word_ids.append(word_id) + if word_id[0] == 't': + target_word_ids.append(word_id) + + token_text = token['text'] + new_sentence = False + if len(source_word_ids) == 1: + source_id = source_word_ids[0] + source_token = source_tokenized_dict[source_id] + + if 
source_token['sent_id'] != source_curr_sentence: + source_curr_sentence = source_token['sent_id'] + if source_token['id'] == 1 and len(target_sent_tokenized) > 1: + target_tokenized.append(target_sent_tokenized) + target_sent_tokenized = [] + curr_sententence += 1 + tok_i = 1 + + # check if words are equal and update + if token_text == source_token['token']: + target_token = { + 'token': source_token['token'], + 'tag': source_token['tag'], + 'id': tok_i, + 'space_after': source_token['space_after'], + 'svala_id': token['id'], + 'sent_id': curr_sententence, + } + else: + + # Check for punctuation mismatch. + if token_text in string.punctuation: + tag = 'pc' + else: + tag = 'w' + + target_token = { + 'token': token_text, + 'tag': tag, + 'id': tok_i, + 'space_after': source_token['space_after'], + 'svala_id': token['id'], + 'sent_id': curr_sententence, + } + + else: + space_after = True + if token_text in string.punctuation: + tag = 'pc' + if token_text in '!?.,):;]}': + if len(target_sent_tokenized) == 0: + raise ValueError('Sentence lenght = 0!') + target_sent_tokenized[-1]['space_after'] = False + if token_text in '!?.': + new_sentence = True + + # Handle cases like `...` + if len(svala_data_object.svala_data['target']) > i + 1 and svala_data_object.svala_data['target'][i+1]['text'] in '.?!': + new_sentence = False + elif token_text in '([{': + space_after = False + else: + tag = 'w' + + target_token = { + 'token': token_text, + 'tag': tag, + 'id': tok_i, + 'space_after': space_after, + 'svala_id': token['id'], + 'sent_id': curr_sententence, + } + target_sent_tokenized.append(target_token) + if new_sentence: + target_tokenized.append(target_sent_tokenized) + target_sent_tokenized = [] + curr_sententence += 1 + tok_i = 1 + tok_i += 1 + target_tokenized.append(target_sent_tokenized) + return target_tokenized def tokenize(args): @@ -149,14 +258,19 @@ def tokenize(args): nlp_tokenize = classla.Pipeline('sl', processors='tokenize', pos_lemma_pretag=True) # filename_encountered = False i = 0 - tokenized_source_divs = [] - tokenized_target_divs = [] + tokenized_divs = {} + # tokenized_source_divs = {} + # tokenized_target_divs = {} document_edges = [] text_filename = '' for folder, _, filenames in os.walk(args.svala_folder): - for filename in filenames: + filenames = sorted(filenames) + for filename_i, filename in enumerate(filenames): + # if filename_i*100/len(filenames) > 35: + # print('here') + # continue svala_path = os.path.join(folder, filename) new_text_filename = '-'.join(filename[:-5].split('-')[:3]) + '.txt' if text_filename != new_text_filename: @@ -166,81 +280,61 @@ def tokenize(args): text_file) if text_file else ([], [], []) source_sent_i = 0 - jf = open(svala_path) + jf = open(svala_path, encoding='utf-8') + print(svala_path) svala_data = json.load(jf) jf.close() + svala_data_object = SvalaData(svala_data) - target_res = create_target(svala_data, source_tokenized) - source_sent_i, source_res = map_svala_tokenized(svala_data['source'], source_tokenized, source_sent_i) - print('aaa') + apply_svala_handfixes(svala_data_object) + source_sent_i, source_res = map_svala_tokenized(svala_data_object.svala_data['source'], source_tokenized, source_sent_i) + # target_res = create_target(svala_data, source_tokenized) - for div in et.iter('div'): - bibl = div.find('bibl') - file_name = bibl.get('n') - file_name = file_name.replace('/', '_') - print(f'{i*100/folders_count} % : {file_name}') - i += 1 - # if file_name == 'S20-PI-slo-2-SG-D-2016_2017-30479-12.txt': - # if file_name == 
'KUS-G-slo-4-GO-E-2009-10017': - # # # if i*100/folders_count > 40: - # filename_encountered = True - # # # # if i*100/folders_count > 41: - # # # # filename_encountered = False - # if not filename_encountered: - # continue - svala_path = os.path.join(args.svala_folder, file_name) - corrected_svala_path = os.path.join(args.corrected_svala_folder, file_name) - raw_texts_path = os.path.join(args.svala_generated_text_folder, file_name) + target_res = create_target(svala_data_object, source_res) - svala_list = [[fname[:-13], fname] if 'problem' in fname else [fname[:-5], fname] for fname in os.listdir(svala_path)] if os.path.isdir(svala_path) else [] - svala_dict = {e[0]: e[1] for e in svala_list} + if text_filename not in tokenized_divs: + tokenized_divs[text_filename] = [] - if os.path.exists(corrected_svala_path): - corrected_svala_list = [[fname[:-13], fname] if 'problem' in fname else [fname[:-5], fname] for fname in os.listdir(corrected_svala_path)] - corrected_svala_dict = {e[0]: e[1] for e in corrected_svala_list} + tokenized_divs[text_filename].append((filename, source_res, target_res, svala_data_object.svala_data['edges'])) - svala_dict.update(corrected_svala_dict) + logging.info(f'Tokenizing at {filename_i*100/len(filenames)} %') - assert len(svala_dict) != 0 + tokenized_source_divs = [] + tokenized_target_divs = [] + document_edges = [] + for div_id in tokenized_divs.keys(): + paragraph_edges = [] tokenized_source_paragraphs = [] tokenized_target_paragraphs = [] - paragraph_edges = [] - - paragraphs = div.findall('p') - for paragraph in paragraphs: - sentences = paragraph.findall('s') - svala_i = 1 - - # read json - # if paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] == 'solar17.6': - # print('here') - svala_file = os.path.join(svala_path, svala_dict[paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id']]) - corrected_svala_file = os.path.join(corrected_svala_path, svala_dict[paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id']]) - add_errors_func = add_errors - jf = open(svala_file) if not os.path.exists(corrected_svala_file) else open(corrected_svala_file) - svala_data = json.load(jf) - jf.close() - - source_filename = svala_dict[paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id']][:-5] + '_source.json' - target_filename = svala_dict[paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id']][:-5] + '_target.json' - - source_raw_text = os.path.join(raw_texts_path, source_filename) if os.path.exists(os.path.join(raw_texts_path, source_filename)) else None - target_raw_text = os.path.join(raw_texts_path, target_filename) if os.path.exists(os.path.join(raw_texts_path, target_filename)) else None - - - sentence_edges, tokenized_source_sentences, tokenized_target_sentences = merge(sentences, paragraph, svala_i, - svala_data, add_errors_func, source_raw_text, target_raw_text, nlp_tokenize) - - tokenized_source_paragraphs.append(tokenized_source_sentences) - tokenized_target_paragraphs.append(tokenized_target_sentences) - paragraph_edges.append(sentence_edges) + # par_source = [] + # par_target = [] + for tokenized_para in tokenized_divs[div_id]: + paragraph_name, source_res, target_res, edges = tokenized_para + source_paragraphs = [] + target_paragraphs = [] + sen_source = [] + sen_target = [] + for sen_i, sen in enumerate(source_res): + source_conllu = create_conllu(sen, f'{paragraph_name[:-5]}.s{str(sen_i + 1)}') + source_paragraphs.append(source_conllu) + sen_source.append(sen) + + for sen_i, sen in enumerate(target_res): + target_conllu = 
create_conllu(sen, f'{paragraph_name}.t{str(sen_i)}')
+                target_paragraphs.append(target_conllu)
+                sen_target.append(sen)
+            paragraph_edges.append(edges)
+            tokenized_source_paragraphs.append(source_paragraphs)
+            tokenized_target_paragraphs.append(target_paragraphs)
+            paragraph_edges.append(create_edges(edges, sen_source, sen_target))
         tokenized_source_divs.append(tokenized_source_paragraphs)
         tokenized_target_divs.append(tokenized_target_paragraphs)
+        document_edges.append(paragraph_edges)
 
     with open(args.tokenization_interprocessing, 'wb') as wp:
diff --git a/src/read/svala_data.py b/src/read/svala_data.py
new file mode 100644
index 0000000..5ffd098
--- /dev/null
+++ b/src/read/svala_data.py
@@ -0,0 +1,48 @@
+from collections import deque
+
+from src.read.hand_fixes import SVALA_HAND_FIXES_MERGE
+
+
+class SvalaData():
+    def __init__(self, svala_data):
+        for el in svala_data['source']:
+            el['text'] = el['text'].strip()
+            if el['text'] == '':
+                print('What?')
+        for el in svala_data['target']:
+            el['text'] = el['text'].strip()
+            if el['text'] == '':
+                print('What?')
+        self.svala_data = svala_data
+        self.links_ids_mapper, self.edges_of_one_type = self.create_ids_mapper(svala_data)
+
+    @staticmethod
+    def create_ids_mapper(svala_data):
+        # create links to ids mapper
+        links_ids_mapper = {}
+        edges_of_one_type = set()
+
+        for k, v in svala_data['edges'].items():
+            has_source = False
+            has_target = False
+            v['source_ids'] = []
+            v['target_ids'] = []
+            for el in v['ids']:
+                # create edges of one type
+                if el[0] == 's':
+                    v['source_ids'].append(el)
+                    has_source = True
+                if el[0] == 't':
+                    v['target_ids'].append(el)
+                    has_target = True
+
+                # create links_ids_mapper
+                if el not in links_ids_mapper:
+                    links_ids_mapper[el] = []
+                links_ids_mapper[el].append(k)
+            if not has_source or not has_target or (
+                    len(svala_data['source']) == 1 and svala_data['source'][0]['text'] == ' ') \
+                    or (len(svala_data['target']) == 1 and svala_data['target'][0]['text'] == ' '):
+                edges_of_one_type.add(k)
+
+        return links_ids_mapper, edges_of_one_type
diff --git a/svala2tei.py b/svala2tei.py
index 65bd28a..a1111a2 100644
--- a/svala2tei.py
+++ b/svala2tei.py
@@ -15,6 +15,7 @@ from src.annotate.annotate import annotate
 from src.create_tei import construct_sentence_from_list, \
     construct_paragraph_from_list, TeiDocument, build_tei_etrees, build_links, build_complete_tei, convert_bibl
 from src.read.read_and_merge import tokenize
+from src.write.write import write_tei
 
 logging.basicConfig(level=logging.INFO)
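
For reference, a tiny example of what SvalaData.create_ids_mapper computes: links_ids_mapper maps every svala token id to the edges that touch it, and edges_of_one_type collects edges missing a source or target side (e.g. insertions). The edge data below is invented, and the special case for a single space-only token is omitted.

svala_edges = {
    'e-s1-t1': {'ids': ['s1', 't1'], 'labels': []},
    'e-t2':    {'ids': ['t2'], 'labels': []},   # token only present on the target side
}

links_ids_mapper = {}
edges_of_one_type = set()
for edge_id, edge in svala_edges.items():
    has_source = any(i.startswith('s') for i in edge['ids'])
    has_target = any(i.startswith('t') for i in edge['ids'])
    for i in edge['ids']:
        links_ids_mapper.setdefault(i, []).append(edge_id)
    if not has_source or not has_target:
        edges_of_one_type.add(edge_id)

print(links_ids_mapper)   # {'s1': ['e-s1-t1'], 't1': ['e-s1-t1'], 't2': ['e-t2']}
print(edges_of_one_type)  # {'e-t2'}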
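
The new create_target re-tokenizes the target side and derives sentence breaks from punctuation: break after . ! ? unless the next token is also one of them (so '...' stays in one sentence). A reduced sketch of just that segmentation rule, with plain strings instead of the svala token dicts used in the patch:

def segment_target(tokens):
    """Group target tokens into sentences, breaking after ./!/? unless more follow."""
    sentences, current = [], []
    for i, tok in enumerate(tokens):
        current.append(tok)
        if tok in '.!?' and not (i + 1 < len(tokens) and tokens[i + 1] in '.!?'):
            sentences.append(current)
            current = []
    if current:
        sentences.append(current)
    return sentences

print(segment_target(['Pa', 'kaj', '.', '.', '.', 'Ne', 'vem', '.']))
# -> [['Pa', 'kaj', '.', '.', '.'], ['Ne', 'vem', '.']]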