You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

91 lines
3.9 KiB

import re
from src.read.hand_fixes import HAND_FIXES, apply_obeliks_handfixes, SVALA_HAND_FIXES_MERGE
def read_raw_text(path):
    """Read a text file whose encoding is not known in advance.

    The file is decoded as UTF-8 first, then UTF-16, and finally
    windows-1250. Only decoding failures trigger the next attempt; any other
    error (for example a missing file) propagates immediately instead of
    being retried with a different encoding.

    Args:
        path: Path of the file to read. It is printed before reading.

    Returns:
        The decoded file content as a string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeError: If none of the three encodings can decode the file.
    """
    print(path)
    # The stricter encodings are tried first; windows-1250 is the last resort.
    for encoding in ('utf-8', 'utf-16'):
        try:
            with open(path, 'r', encoding=encoding) as rf:
                return rf.read()
        except UnicodeError:
            # Wrong guess for this file; try the next encoding.
            continue
    with open(path, 'r', encoding='windows-1250') as rf:
        return rf.read()
def map_svala_tokenized(svala_data_part, tokenized_paragraph, sent_i):
    """Align tokenizer output with svala tokens, starting at sentence ``sent_i``.

    Tokens of ``tokenized_paragraph`` are walked in order and compared by
    their ``'text'`` with consecutive entries of ``svala_data_part``. When
    the texts differ, ``HAND_FIXES`` is consulted: it maps one svala token
    text to the list of tokenizer pieces it was split into. The pieces are
    consumed one by one and emitted as a single token carrying the svala
    text. A missing entry is generated on the fly and printed so it can be
    copied into the hand-fix tables.

    Side effects: calls ``apply_obeliks_handfixes`` on
    ``tokenized_paragraph``, may add entries to ``HAND_FIXES`` and
    ``SVALA_HAND_FIXES_MERGE``, and may overwrite ``tok['text']``.

    Args:
        svala_data_part: Indexable sequence of dicts with ``'text'`` and
            ``'id'`` keys.
        tokenized_paragraph: Indexable sequence of sentences; each sentence
            is an iterable of token dicts with a ``'text'`` key and optional
            ``'xpos'`` and ``'misc'`` keys.
        sent_i: Index of the first sentence to process.

    Returns:
        A tuple ``(sentence_index, paragraph_res)``. ``paragraph_res`` is a
        list of sentences, each a list of dicts with the keys ``'token'``,
        ``'tag'``, ``'id'``, ``'space_after'`` and ``'svala_id'``. If
        ``svala_data_part`` runs out before the tokens do, the index of the
        sentence being processed at that moment is returned; otherwise
        ``sent_i`` is returned unchanged.

    Raises:
        ValueError: If a token cannot be reconciled with the svala token.
        AssertionError: If a token has a ``'misc'`` value other than
            ``'SpaceAfter=No'``.
    """
    # apply handfixes for obeliks
    apply_obeliks_handfixes(tokenized_paragraph)

    paragraph_res = []
    # Position inside HAND_FIXES[key] while one svala token is being matched
    # against several consecutive tokenizer tokens.
    wierd_sign_count = 0
    svala_data_i = 0
    for i in range(sent_i, len(tokenized_paragraph)):
        sentence = tokenized_paragraph[i]
        sentence_res = []
        sentence_id = 0
        for tok in sentence:
            tag = 'pc' if 'xpos' in tok and tok['xpos'] == 'Z' else 'w'
            if 'misc' in tok:
                assert tok['misc'] == 'SpaceAfter=No'
            space_after = 'misc' not in tok

            if len(svala_data_part) <= svala_data_i:
                # if sentence does not end add it anyway
                # TODO i error?
                if sentence_res:
                    paragraph_res.append(sentence_res)
                return i, paragraph_res

            key = svala_data_part[svala_data_i]['text']
            if key != tok['text']:
                if key not in HAND_FIXES:
                    if key.startswith('§§§') and key.endswith('§§§'):
                        HAND_FIXES[key] = ['§', '§', '§', key[3:-3], '§', '§', '§']
                    elif key.startswith('§§§'):
                        HAND_FIXES[key] = ['§', '§', '§', key[3:]]
                    elif key.endswith('§§§'):
                        HAND_FIXES[key] = [key[:-3], '§', '§', '§']
                    else:
                        if len(key) < len(tok['text']):
                            # The tokenizer token is longer than the svala
                            # token: record how it would have to be split.
                            head = tok['text'][:len(key)]
                            tail = tok['text'][len(key):]
                            print('HAND_FIXES_MERGE:')
                            print(f", ('{head}', '{tail}'): '{tok['text']}'")
                            SVALA_HAND_FIXES_MERGE[(head, tail)] = tok['text']
                        else:
                            print('HAND_FIXES OLD:')
                            print(f", '{key}': ['{key[:len(tok['text'])]}', '{key[len(tok['text']):]}']")

                            # Split the svala token into word runs and
                            # single non-space, non-word characters.
                            pieces = re.findall(r"[\w]+|[^\s\w]", key)
                            print('HAND_FIXES NEW:')
                            print(f", '{key}': {str(pieces)}")
                            HAND_FIXES[key] = pieces
                        print(f'key: {key} ; tok[text]: {tok["text"]}')

                # NOTE(review): the merge case above does not add `key` to
                # HAND_FIXES, so this lookup raises KeyError for it.
                if tok['text'] == HAND_FIXES[key][wierd_sign_count]:
                    wierd_sign_count += 1
                    if wierd_sign_count < len(HAND_FIXES[key]):
                        # More pieces of this svala token are still expected.
                        continue
                    # Last piece consumed: emit one token with the svala text.
                    tok['text'] = key
                    wierd_sign_count = 0
                elif key in ('[XKrajX]', '[XImeX]'):
                    # These two placeholder texts replace the token text.
                    tok['text'] = key
                else:
                    print(f'key: {key} ; tok[text]: {tok["text"]}')
                    # Raising a plain string is itself a TypeError in
                    # Python 3, so raise a real exception.
                    raise ValueError('Word mismatch!')

            sentence_id += 1
            sentence_res.append({
                'token': tok['text'],
                'tag': tag,
                'id': sentence_id,
                'space_after': space_after,
                'svala_id': svala_data_part[svala_data_i]['id'],
            })
            svala_data_i += 1
        paragraph_res.append(sentence_res)
    return sent_i, paragraph_res