import re

from src.read.hand_fixes import HAND_FIXES, apply_obeliks_handfixes, SVALA_HAND_FIXES_MERGE


def read_raw_text(path):
    """Read a text file, trying several encodings in turn.

    The file is decoded as UTF-8 first, then as UTF-16 and finally as
    Windows-1250; the first encoding that decodes the whole file wins.

    Args:
        path: Path of the file to read. It is also printed to stdout.

    Returns:
        The complete decoded contents of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeError: If the file cannot be decoded as Windows-1250 either.
    """
    print(path)
    for encoding in ('utf-8', 'utf-16'):
        try:
            with open(path, 'r', encoding=encoding) as rf:
                return rf.read()
        except UnicodeError:
            # Wrong guess: move on to the next candidate encoding. Only
            # decoding failures are retried, so I/O errors and
            # KeyboardInterrupt surface immediately instead of being hidden.
            continue
    # Last resort: errors raised here are propagated to the caller.
    with open(path, 'r', encoding='windows-1250') as rf:
        return rf.read()


def _register_missing_hand_fix(key, tok_text):
    """Record how a svala token without a hand fix maps to tokenizer tokens.

    Called when the svala token text ``key`` differs from the tokenizer token
    text ``tok_text`` and ``key`` has no entry in ``HAND_FIXES`` yet.

    * Keys wrapped in (or starting/ending with) ``§§§`` are split so that
      every ``§`` is its own part.
    * If the tokenizer token is longer than ``key``, a merge suggestion is
      printed and stored in ``SVALA_HAND_FIXES_MERGE``; ``HAND_FIXES`` is left
      untouched in this case.
    * Otherwise ``key`` is split into runs of word characters and single
      non-space, non-word characters, and that split is stored in
      ``HAND_FIXES``.

    The printed lines are formatted as dictionary entries.
    """
    if key.startswith('§§§') and key.endswith('§§§'):
        HAND_FIXES[key] = ['§', '§', '§', key[3:-3], '§', '§', '§']
    elif key.startswith('§§§'):
        HAND_FIXES[key] = ['§', '§', '§', key[3:]]
    elif key.endswith('§§§'):
        HAND_FIXES[key] = [key[:-3], '§', '§', '§']
    else:
        if len(key) < len(tok_text):
            head, tail = tok_text[:len(key)], tok_text[len(key):]
            print('HAND_FIXES_MERGE:')
            print(f", ('{head}', '{tail}'): '{tok_text}'")
            SVALA_HAND_FIXES_MERGE[(head, tail)] = tok_text
        else:
            print('HAND_FIXES OLD:')
            print(f", '{key}': ['{key[:len(tok_text)]}', '{key[len(tok_text):]}']")

            print('HAND_FIXES NEW:')
            parts = re.findall(r"[\w]+|[^\s\w]", key)
            print(f", '{key}': {str(parts)}")

            HAND_FIXES[key] = parts

        # NOTE(review): this diagnostic is emitted for both sub-branches
        # above -- confirm that is the intended placement.
        print(f'key: {key} ; tok[text]: {tok_text}')


def map_svala_tokenized(svala_data_part, tokenized_paragraph, sent_i):
    """Align tokenizer output with svala tokens, sentence by sentence.

    Walks the sentences of ``tokenized_paragraph`` starting at index
    ``sent_i`` and pairs every tokenizer token with the next entry of
    ``svala_data_part``. When the texts differ, ``HAND_FIXES`` describes into
    which consecutive tokenizer tokens the svala token was split; those
    tokens are consumed one by one and emitted as a single token carrying the
    svala text (and the tag/spacing of the last consumed token).

    Args:
        svala_data_part: Indexable sequence of svala tokens, each a mapping
            with ``'text'`` and ``'id'`` keys.
        tokenized_paragraph: Sequence of sentences, each an iterable of token
            mappings with a ``'text'`` key and optional ``'xpos'`` and
            ``'misc'`` keys. It is passed to ``apply_obeliks_handfixes``
            first (presumably modified in place -- verify), and the
            ``'text'`` of merged tokens is overwritten here.
        sent_i: Index of the first sentence to process.

    Returns:
        A tuple ``(index, paragraph_res)``. ``paragraph_res`` is a list of
        sentences, each a list of dicts with the keys ``'token'``, ``'tag'``
        (``'pc'`` when ``xpos`` is ``'Z'``, otherwise ``'w'``), ``'id'``
        (1-based position in the sentence), ``'space_after'`` and
        ``'svala_id'``. If the svala tokens run out while tokenizer tokens
        remain, ``index`` is the index of the sentence being processed at
        that moment; otherwise it is ``sent_i`` unchanged.

    Raises:
        AssertionError: If a token has a ``'misc'`` value other than
            ``'SpaceAfter=No'``.
        ValueError: If a tokenizer token matches neither the svala token nor
            the expected part of its hand fix.
    """
    # apply handfixes for obeliks
    apply_obeliks_handfixes(tokenized_paragraph)

    paragraph_res = []
    # Position inside HAND_FIXES[key] while several tokenizer tokens are
    # being merged back into a single svala token.
    fix_part_i = 0
    svala_data_i = 0
    for i in range(sent_i, len(tokenized_paragraph)):
        sentence_res = []
        sentence_id = 0
        for tok in tokenized_paragraph[i]:
            tag = 'pc' if 'xpos' in tok and tok['xpos'] == 'Z' else 'w'
            if 'misc' in tok:
                assert tok['misc'] == 'SpaceAfter=No'
            space_after = 'misc' not in tok

            if len(svala_data_part) <= svala_data_i:
                # if sentence does not end add it anyway
                # TODO i error?
                if sentence_res:
                    paragraph_res.append(sentence_res)
                return i, paragraph_res

            svala_tok = svala_data_part[svala_data_i]
            if svala_tok['text'] != tok['text']:
                key = svala_tok['text']
                if key not in HAND_FIXES:
                    _register_missing_hand_fix(key, tok['text'])

                # NOTE(review): when only a merge suggestion was recorded
                # above, HAND_FIXES has no entry for key; if HAND_FIXES is a
                # plain dict this lookup raises KeyError -- confirm intended.
                if tok['text'] == HAND_FIXES[key][fix_part_i]:
                    fix_part_i += 1
                    if fix_part_i < len(HAND_FIXES[key]):
                        # More parts of this svala token are still to come;
                        # stay on the same svala token.
                        continue
                    # Last part consumed: emit the merged token.
                    tok['text'] = key
                    fix_part_i = 0
                elif key in ('[XKrajX]', '[XImeX]'):
                    # Placeholder tokens are taken over from svala verbatim.
                    tok['text'] = key
                else:
                    print(f'key: {key} ; tok[text]: {tok["text"]}')
                    raise ValueError('Word mismatch!')

            sentence_id += 1
            sentence_res.append({
                'token': tok['text'],
                'tag': tag,
                'id': sentence_id,
                'space_after': space_after,
                'svala_id': svala_tok['id'],
            })
            svala_data_i += 1
        paragraph_res.append(sentence_res)
    return sent_i, paragraph_res