You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
52 lines
3.7 KiB
52 lines
3.7 KiB
|
|
# Manual tokenization fixes: maps a raw Svala-side token string to the exact
# sequence of sub-tokens the tokenizer produces for it — e.g. splitting
# trailing punctuation ('sin.' -> ['sin', '.']), clock times ('19:30' ->
# ['19', ':', '30']) and '§§§' anonymization markers.
# NOTE(review): this dict is mutated at runtime — map_svala_tokenized adds
# new entries for unseen keys that start and/or end with '§§§'.
HAND_FIXES = {'§§§pisala': ['§', '§', '§', 'pisala'], '§§§poldne': ['§', '§', '§', 'poldne'], '§§§o': ['§', '§', '§', 'o'], '§§§mimi': ['§', '§', '§', 'mimi'], '§§§nil': ['§', '§', '§', 'nil'], '§§§ela': ['§', '§', '§', 'ela'], 'sam§§§': ['sam', '§', '§', '§'], 'globa觧§': ['globač', '§', '§', '§'], 'sin.': ['sin', '.'], '§§§oveduje': ['§', '§', '§', 'oveduje'], 'na§§§': ['na', '§', '§', '§'], '§§§ka§§§': ['§', '§', '§', 'ka', '§', '§', '§'], '§§§e§§§': ['§', '§', '§', 'e', '§', '§', '§'], '§§§': ['§', '§', '§'], 'ljubezni.': ['ljubezni', '.'], '12.': ['12', '.'], '16.': ['16', '.'], 'st.': ['st', '.'], 'S.': ['S', '.'], 'pr.': ['pr', '.'], 'n.': ['n', '.'], '19:30': ['19', ':', '30'], '9.': ['9', '.'], '6:35': ['6', ':', '35'], 'itd.': ['itd', '.'], 'Sv.': ['Sv', '.'], 'npr.': ['npr', '.'], 'sv.': ['sv', '.'], '12:00': ['12', ':', '00'], "sram'vali": ['sram', "'", 'vali'], '18:00': ['18', ':', '00'], 'J.': ['J', '.'], '5:45': ['5', ':', '45'], '17.': ['17', '.'], '9.00h': ['9', '.', '00h'], 'H.': ['H', '.'], '1.': ['1', '.'], '6.': ['6', '.'], '7:10': ['7', ':', '10'], 'g.': ['g', '.'], 'Oz.': ['Oz', '.'], '20:00': ['20', ':', '00'], '17.4.2010': ['17.', '4.', '2010'], 'ga.': ['ga', '.'], 'prof.': ['prof', '.'], '6:45': ['6', ':', '45'], '19.': ['19', '.'], '3.': ['3', '.'], 'tj.': ['tj', '.'], 'Prof.': ['Prof', '.'], '8.': ['8', '.'], '9:18': ['9', ':', '18'], 'ipd.': ['ipd', '.'], '7.': ['7', '.'], 'št.': ['št', '.'], 'oz.': ['oz', '.'], 'R.': ['R', '.'], '13:30': ['13', ':', '30'], '5.': ['5', '.'], '...': ['.', '.', '.']}
|
|
|
|
|
|
def read_raw_text(path):
    """Return the entire contents of the text file at *path* as one string.

    :param path: path of the file to read.
    :return: file contents decoded as UTF-8.
    """
    # Explicit encoding: the corpus contains non-ASCII characters (č, š, §),
    # and the platform/locale default encoding would mangle them on e.g.
    # Windows (cp1252).
    with open(path, 'r', encoding='utf-8') as rf:
        return rf.read()
|
|
|
|
|
|
def map_svala_tokenized(svala_data_part, tokenized_paragraph, sent_i):
    """Align Svala edit-graph tokens with a re-tokenized paragraph.

    Walks ``tokenized_paragraph`` sentence by sentence starting at index
    ``sent_i`` and pairs each tokenizer token with the next entry of
    ``svala_data_part``. When the tokenizer split a single Svala token into
    several pieces (times, abbreviations, '§§§' anonymization markers),
    ``HAND_FIXES`` supplies the expected piece sequence and the pieces are
    merged back into the one Svala-side token before being emitted.

    :param svala_data_part: list of dicts, each with 'text' and 'id' keys
        (one per Svala token).
    :param tokenized_paragraph: list of sentences; each sentence is a list
        of token dicts with 'text' and optionally 'xpos' / 'misc' keys.
    :param sent_i: index of the first sentence to process.
    :return: ``(next_sent_i, paragraph)`` — ``paragraph`` is a list of
        sentences, each a list of dicts with keys 'token', 'tag', 'id',
        'space_after' and 'svala_id'. If ``svala_data_part`` is exhausted
        mid-paragraph, returns early with the index of the first
        unconsumed sentence; otherwise returns ``sent_i`` unchanged.
    :raises ValueError: when a token cannot be reconciled with the current
        Svala entry, even via ``HAND_FIXES``.
    """
    paragraph_res = []
    # Number of pieces of the current multi-part HAND_FIXES entry consumed
    # so far (0 when not inside a multi-part token).
    weird_sign_count = 0
    svala_data_i = 0

    for i in range(sent_i, len(tokenized_paragraph)):
        sentence = tokenized_paragraph[i]
        sentence_res = []
        sentence_id = 0

        for tok in sentence:
            # xpos 'Z' marks punctuation ('pc'); everything else is a word ('w').
            tag = 'pc' if tok.get('xpos') == 'Z' else 'w'

            # The only MISC annotation this pipeline expects is SpaceAfter=No.
            if 'misc' in tok:
                assert tok['misc'] == 'SpaceAfter=No'
            space_after = 'misc' not in tok

            # All Svala tokens consumed: report the first unprocessed sentence.
            if len(svala_data_part) <= svala_data_i:
                return i, paragraph_res

            if svala_data_part[svala_data_i]['text'].strip() != tok['text']:
                key = svala_data_part[svala_data_i]['text'].strip()
                if key not in HAND_FIXES:
                    print(f'key: {key} ; tok[text]: {tok["text"]}')
                    # Unseen anonymization patterns ('§§§...') can be derived
                    # on the fly; anything else is a hard mismatch.
                    if key.startswith('§§§') and key.endswith('§§§'):
                        HAND_FIXES[key] = ['§', '§', '§', key[3:-3], '§', '§', '§']
                    elif key.startswith('§§§'):
                        HAND_FIXES[key] = ['§', '§', '§', key[3:]]
                    elif key.endswith('§§§'):
                        HAND_FIXES[key] = [key[:-3], '§', '§', '§']
                    else:
                        # BUG FIX: the original `raise 'Word mismatch!'` raises
                        # a bare string, which is itself a TypeError in Python 3.
                        raise ValueError('Word mismatch!')

                if tok['text'] == HAND_FIXES[key][weird_sign_count]:
                    weird_sign_count += 1
                    if weird_sign_count < len(HAND_FIXES[key]):
                        # More pieces of this multi-part token still to come;
                        # consume this tokenizer token without emitting anything.
                        continue
                    else:
                        # Last piece: emit the merged Svala-side token instead
                        # of the individual pieces.
                        tok['text'] = key
                        weird_sign_count = 0
                else:
                    print(f'key: {key} ; tok[text]: {tok["text"]}')
                    # BUG FIX: same string-raise defect as above.
                    raise ValueError('Word mismatch!')

            sentence_id += 1
            sentence_res.append({'token': tok['text'], 'tag': tag, 'id': sentence_id, 'space_after': space_after, 'svala_id': svala_data_part[svala_data_i]['id']})
            svala_data_i += 1

        paragraph_res.append(sentence_res)

    return sent_i, paragraph_res