|
|
|
import re
|
|
|
|
|
|
|
|
from src.read.hand_fixes import HAND_FIXES, apply_obeliks_handfixes, SVALA_HAND_FIXES_MERGE
|
|
|
|
|
|
|
|
|
|
|
|
def read_raw_text(path):
    """Read the whole text file at *path*, auto-detecting its encoding.

    Tries encodings in order: utf-8, utf-16, and finally windows-1250.
    A decode failure moves on to the next candidate; any error from the
    last attempt (windows-1250) propagates to the caller, as does a
    missing file.

    :param path: path of the text file to read.
    :return: full file contents as a single string.
    """
    print(path)

    # Try the lenient candidates first; only Unicode decode failures
    # trigger a fallback (a bare `except:` here used to hide real I/O
    # errors such as FileNotFoundError behind the encoding cascade).
    for encoding in ('utf-8', 'utf-16'):
        try:
            with open(path, 'r', encoding=encoding) as rf:
                return rf.read()
        except UnicodeError:
            continue

    # Last resort: windows-1250. No catch — its errors must surface.
    with open(path, 'r', encoding='windows-1250') as rf:
        return rf.read()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def map_svala_tokenized(svala_data_part, tokenized_paragraph, sent_i):
    """Align svala tokens with an obeliks-tokenized paragraph.

    Walks the sentences of ``tokenized_paragraph`` starting at index
    ``sent_i`` and matches each token against the next entry of
    ``svala_data_part``. Known tokenization mismatches are reconciled via
    ``HAND_FIXES``; unknown ones are learned on the fly and logged so they
    can be added to the hand-fix tables.

    :param svala_data_part: list of dicts, each with ``'text'`` and ``'id'``.
    :param tokenized_paragraph: list of sentences; each sentence is a list
        of token dicts (``'text'``, optional ``'xpos'`` and ``'misc'``).
    :param sent_i: index of the first sentence to process.
    :return: tuple ``(next_sentence_index, paragraph_res)`` where
        ``paragraph_res`` is a list of sentences, each a list of dicts with
        keys ``'token'``, ``'tag'``, ``'id'``, ``'space_after'``, ``'svala_id'``.
    :raises ValueError: when a token cannot be reconciled with the svala text.
    """
    # apply handfixes for obeliks
    apply_obeliks_handfixes(tokenized_paragraph)

    paragraph_res = []
    wierd_sign_count = 0  # position inside a multi-part HAND_FIXES entry
    svala_data_i = 0
    for i in range(sent_i, len(tokenized_paragraph)):
        sentence = tokenized_paragraph[i]
        sentence_res = []
        sentence_id = 0
        for tok in sentence:
            # 'Z' xpos marks punctuation; everything else is tagged as a word
            tag = 'pc' if 'xpos' in tok and tok['xpos'] == 'Z' else 'w'
            if 'misc' in tok:
                assert tok['misc'] == 'SpaceAfter=No'
            space_after = not 'misc' in tok
            if len(svala_data_part) <= svala_data_i:
                # svala data exhausted mid-paragraph: keep what we have so far
                # if sentence does not end add it anyway
                # TODO i error?
                if sentence_res:
                    paragraph_res.append(sentence_res)
                return i, paragraph_res
            if svala_data_part[svala_data_i]['text'] != tok['text']:
                key = svala_data_part[svala_data_i]['text']
                if key not in HAND_FIXES:
                    # anonymization markers like '§§§word§§§' get split by the
                    # tokenizer into single '§' signs around the inner word
                    if key.startswith('§§§') and key.endswith('§§§'):
                        HAND_FIXES[key] = ['§', '§', '§', key[3:-3], '§', '§', '§']
                    elif key.startswith('§§§'):
                        HAND_FIXES[key] = ['§', '§', '§', key[3:]]
                    elif key.endswith('§§§'):
                        HAND_FIXES[key] = [key[:-3], '§', '§', '§']
                    else:
                        if len(key) < len(tok['text']):
                            # tokenizer merged what svala keeps separate:
                            # record the merge and log a ready-to-paste entry
                            print('HAND_FIXES_MERGE:')
                            print(f", ('{tok['text'][:len(key)]}', '{tok['text'][len(key):]}'): '{tok['text']}'")
                            SVALA_HAND_FIXES_MERGE[(tok['text'][:len(key)], tok['text'][len(key):])] = tok['text']
                        else:
                            # log both the old length-based split and the new
                            # regex-based split for manual inspection
                            print('HAND_FIXES OLD:')
                            print(f", '{key}': ['{key[:len(tok['text'])]}', '{key[len(tok['text']):]}']")

                            print('HAND_FIXES NEW:')
                            reg = re.findall(r"[\w]+|[^\s\w]", key)
                            print(f", '{key}': {str(reg)}")

                        # learn a fix: split the svala text into word /
                        # punctuation pieces
                        HAND_FIXES[key] = re.findall(r"[\w]+|[^\s\w]", key)
                        print(f'key: {key} ; tok[text]: {tok["text"]}')

                if tok['text'] == HAND_FIXES[key][wierd_sign_count]:
                    # consume one piece of the multi-token fix; once every
                    # piece was seen, collapse them back into the svala text
                    wierd_sign_count += 1
                    if wierd_sign_count < len(HAND_FIXES[key]):
                        continue
                    else:
                        tok['text'] = key
                        wierd_sign_count = 0
                elif key in ['[XKrajX]']:
                    tok['text'] = '[XKrajX]'
                elif key in ['[XImeX]']:
                    tok['text'] = '[XImeX]'
                else:
                    print(f'key: {key} ; tok[text]: {tok["text"]}')
                    # was `raise 'Word mismatch!'` — raising a string is
                    # itself a TypeError in Python 3; raise a real exception
                    raise ValueError('Word mismatch!')
            sentence_id += 1
            sentence_res.append({'token': tok['text'], 'tag': tag, 'id': sentence_id, 'space_after': space_after, 'svala_id': svala_data_part[svala_data_i]['id']})
            svala_data_i += 1
        paragraph_res.append(sentence_res)
    # NOTE(review): returns the *original* sent_i here, while the early
    # return above returns the current sentence index — confirm intended
    return sent_i, paragraph_res
|