You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

52 lines
3.7 KiB

HAND_FIXES = {'§§§pisala': ['§', '§', '§', 'pisala'], '§§§poldne': ['§', '§', '§', 'poldne'], '§§§o': ['§', '§', '§', 'o'], '§§§mimi': ['§', '§', '§', 'mimi'], '§§§nil': ['§', '§', '§', 'nil'], '§§§ela': ['§', '§', '§', 'ela'], 'sam§§§': ['sam', '§', '§', '§'], 'globa觧§': ['globač', '§', '§', '§'], 'sin.': ['sin', '.'], '§§§oveduje': ['§', '§', '§', 'oveduje'], 'na§§§': ['na', '§', '§', '§'], '§§§ka§§§': ['§', '§', '§', 'ka', '§', '§', '§'], '§§§e§§§': ['§', '§', '§', 'e', '§', '§', '§'], '§§§': ['§', '§', '§'], 'ljubezni.': ['ljubezni', '.'], '12.': ['12', '.'], '16.': ['16', '.'], 'st.': ['st', '.'], 'S.': ['S', '.'], 'pr.': ['pr', '.'], 'n.': ['n', '.'], '19:30': ['19', ':', '30'], '9.': ['9', '.'], '6:35': ['6', ':', '35'], 'itd.': ['itd', '.'], 'Sv.': ['Sv', '.'], 'npr.': ['npr', '.'], 'sv.': ['sv', '.'], '12:00': ['12', ':', '00'], "sram'vali": ['sram', "'", 'vali'], '18:00': ['18', ':', '00'], 'J.': ['J', '.'], '5:45': ['5', ':', '45'], '17.': ['17', '.'], '9.00h': ['9', '.', '00h'], 'H.': ['H', '.'], '1.': ['1', '.'], '6.': ['6', '.'], '7:10': ['7', ':', '10'], 'g.': ['g', '.'], 'Oz.': ['Oz', '.'], '20:00': ['20', ':', '00'], '17.4.2010': ['17.', '4.', '2010'], 'ga.': ['ga', '.'], 'prof.': ['prof', '.'], '6:45': ['6', ':', '45'], '19.': ['19', '.'], '3.': ['3', '.'], 'tj.': ['tj', '.'], 'Prof.': ['Prof', '.'], '8.': ['8', '.'], '9:18': ['9', ':', '18'], 'ipd.': ['ipd', '.'], '7.': ['7', '.'], 'št.': ['št', '.'], 'oz.': ['oz', '.'], 'R.': ['R', '.'], '13:30': ['13', ':', '30'], '5.': ['5', '.'], '...': ['.', '.', '.']}
def read_raw_text(path):
with open(path, 'r') as rf:
return rf.read()
def map_svala_tokenized(svala_data_part, tokenized_paragraph, sent_i):
paragraph_res = []
wierd_sign_count = 0
svala_data_i = 0
for i in range(sent_i, len(tokenized_paragraph)):
sentence = tokenized_paragraph[i]
sentence_res = []
sentence_id = 0
for tok in sentence:
tag = 'pc' if 'xpos' in tok and tok['xpos'] == 'Z' else 'w'
if 'misc' in tok:
assert tok['misc'] == 'SpaceAfter=No'
space_after = not 'misc' in tok
if len(svala_data_part) <= svala_data_i:
return i, paragraph_res
if svala_data_part[svala_data_i]['text'].strip() != tok['text']:
key = svala_data_part[svala_data_i]['text'].strip()
if key not in HAND_FIXES:
print(f'key: {key} ; tok[text]: {tok["text"]}')
if key.startswith('§§§') and key.endswith('§§§'):
HAND_FIXES[key] = ['§', '§', '§', key[3:-3], '§', '§', '§']
elif key.startswith('§§§'):
HAND_FIXES[key] = ['§', '§', '§', key[3:]]
elif key.endswith('§§§'):
HAND_FIXES[key] = [key[:-3], '§', '§', '§']
else:
raise 'Word mismatch!'
if tok['text'] == HAND_FIXES[key][wierd_sign_count]:
wierd_sign_count += 1
if wierd_sign_count < len(HAND_FIXES[key]):
continue
else:
tok['text'] = key
wierd_sign_count = 0
else:
print(f'key: {key} ; tok[text]: {tok["text"]}')
raise 'Word mismatch!'
sentence_id += 1
sentence_res.append({'token': tok['text'], 'tag': tag, 'id': sentence_id, 'space_after': space_after, 'svala_id': svala_data_part[svala_data_i]['id']})
svala_data_i += 1
paragraph_res.append(sentence_res)
return sent_i, paragraph_res