cjvt-srl-tagging/tools/parser/ozbolt.py

#!/usr/bin/python3

from __future__ import print_function, unicode_literals, division
import sys
import os
import re
import pickle
from pathlib import Path

try:
    from lxml import etree as ElementTree
except ImportError:
    import xml.etree.ElementTree as ElementTree


# attributes
# Names of XML attributes read from token elements. Of these, only
# LEMMA_ATTR is referenced by the code visible in this file.
# NOTE(review): ID_ATTR and ANA_ATTR appear unused here - confirm before removing.
ID_ATTR = "id"
LEMMA_ATTR = "lemma"
ANA_ATTR = "ana"


# tags
# Element names compared against `element.tag` while walking the TEI tree.
# The visible code uses SENTENCE_TAG, WORD_TAG, C_TAG, S_TAG and SEG_TAG.
# NOTE(review): BIBL_TAG, PARAGRAPH_TAG and PC_TAG appear unused here - confirm.
SENTENCE_TAG = 's'
BIBL_TAG = 'bibl'
PARAGRAPH_TAG = 'p'
PC_TAG = 'pc'
WORD_TAG = 'w'
C_TAG = 'c'
S_TAG = 'S'
SEG_TAG = 'seg'


class Sentence:
    """One sentence element flattened into token rows plus its plain text.

    After construction:
      * ``id``    - the sentence identifier passed in by the caller.
      * ``words`` - list of ``[token_id, token, lemma, xpos]`` rows, one per
        ``WORD_TAG`` / ``C_TAG`` element, in document order.
      * ``text``  - the token texts concatenated, with a single space for
        every ``S_TAG`` element encountered.
    """

    def __init__(self, sentence, s_id):
        """Build the rows by visiting every direct child of *sentence*.

        Args:
            sentence: iterable XML element whose children are the tokens.
            s_id: identifier used as the prefix of every token id.
        """
        self.id = s_id
        self.words = []
        self.text = ""

        for word in sentence:
            self.handle_word(word)

    def handle_word(self, word):
        """Process one child element, updating ``self.words`` and ``self.text``.

        ``S_TAG`` adds a space to the text, ``SEG_TAG`` is descended into
        recursively, ``WORD_TAG`` / ``C_TAG`` produce a row, and every other
        tag is ignored.

        Raises:
            AssertionError: if an ``S_TAG`` element carries text or a
                ``WORD_TAG`` element has no lemma attribute.
        """
        # Whitespace marker: contributes only a space to the running text.
        if word.tag == S_TAG:
            assert word.text is None
            self.text += ' '
            return

        # Segment wrapper: treat its children as if they were direct
        # children of the sentence.
        # NOTE(review): original author was unsure this is the right handling.
        if word.tag == SEG_TAG:
            for segword in word:
                self.handle_word(segword)
            return

        # Anything that is not a word or a "c" element is skipped silently.
        # NOTE(review): original author asked whether other tags can occur.
        if word.tag not in (WORD_TAG, C_TAG):
            return

        # 1-based position of this token among the rows emitted so far.
        idx = str(len(self.words) + 1)

        # May be None for an empty element; to_conllu() renders that as '_'.
        token = word.text

        if word.tag == WORD_TAG:
            lemma = word.get(LEMMA_ATTR)
            assert lemma is not None
        else:
            # "c" elements carry no lemma attribute; reuse the surface form.
            lemma = token

        xpos = word.get('msd')
        if word.tag == C_TAG:
            xpos = "Z"
        elif xpos in ("Gp-ppdzn", "Gp-spmzd"):
            # These two msd values are collapsed to "N"
            # (reason not visible in this file).
            xpos = "N"
        elif xpos is None:
            # Diagnostic: report the sentence that has a word without msd.
            print(self.id)

        self.words.append(['F{}.{}'.format(self.id, idx), token, lemma, xpos])

        # An empty element has text None; appending it used to raise
        # TypeError even though the row itself is representable.
        self.text += token if token is not None else ""

    def to_conllu(self):
        """Return one tab-separated line per row, with ``None`` shown as ``_``.

        Only the four stored columns are emitted; no ``# sent_id`` / ``# text``
        comment lines are produced.
        """
        return [
            '\t'.join('_' if data is None else data for data in word)
            for word in self.words
        ]

def convert_file(in_file, out_file):
    """Convert one TEI XML file into a tab-separated token file.

    Every ``SENTENCE_TAG`` element that is a direct child of a ``body/p``
    paragraph becomes a block of token lines followed by one empty line.
    Sentence ids have the form ``<paragraph>.<sentence>``, both 1-based;
    paragraphs are counted even when they contain no sentences.

    Args:
        in_file: path of the XML file to read (anything ``str()`` turns
            into a usable path).
        out_file: path of the text file to write; it is overwritten.

    Raises:
        RuntimeError: if no output lines were produced.
    """
    print("Nalaganje xml: {}".format(in_file))
    # Read raw bytes so the XML parser applies the document's own encoding
    # instead of whatever the locale default happens to be.
    with open(str(in_file), 'rb') as fp:
        raw_xml = fp.read()

    # Drop the first default-namespace declaration and the "xml:" attribute
    # prefix so elements and attributes can be addressed by bare names.
    raw_xml = re.sub(b' xmlns="[^"]+"', b'', raw_xml, count=1)
    raw_xml = raw_xml.replace(b' xml:', b' ')
    xml_tree = ElementTree.XML(raw_xml)

    print("Pretvarjanje TEI -> TSV-U ...")
    lines = []

    for pidx, paragraph in enumerate(xml_tree.iterfind('.//body/p'), start=1):
        sidx = 0
        for child in paragraph:
            if child.tag != SENTENCE_TAG:
                continue

            sidx += 1
            sentence = Sentence(child, "{}.{}".format(pidx, sidx))
            lines.extend(sentence.to_conllu())
            # Empty line separates consecutive sentences.
            lines.append('')

    if not lines:
        raise RuntimeError("Nobenih stavkov najdenih")

    print("Zapisovanje izhodne datoteke: {}".format(out_file))
    text = "".join(line + "\n" for line in lines)
    if sys.version_info < (3, 0):
        # Python 2 file objects take bytes; encode once instead of per line.
        with open(out_file, 'w') as fp:
            fp.write(text.encode('utf-8'))
    else:
        # Match the Python 2 branch: output is always UTF-8, regardless of
        # the platform's default encoding.
        with open(out_file, 'w', encoding='utf-8') as fp:
            fp.write(text)


if __name__ == "__main__":
    # Usage: <script> IN_FOLDER OUT_FOLDER [NUM_PROCESSES]
    #
    # Input:  folder searched recursively for *.xml TEI files
    #         (msds are encoded as msd="Z").
    # Output: folder that receives one <name>.txt per input file.
    #
    # NUM_PROCESSES is still accepted so existing invocations keep working,
    # but it is ignored: files are converted sequentially.
    # NOTE(review): outputs are named after the file name only, so inputs
    # with the same name in different subfolders write to the same file.
    if len(sys.argv) < 3:
        sys.exit(
            "Usage: {} IN_FOLDER OUT_FOLDER [NUM_PROCESSES]".format(sys.argv[0])
        )

    in_folder = sys.argv[1]
    out_folder = sys.argv[2]

    # Create the destination up front so the first write cannot fail on a
    # missing directory.
    out_dir = Path(out_folder)
    out_dir.mkdir(parents=True, exist_ok=True)

    for filename in Path(in_folder).rglob("*.xml"):
        out_file = str(out_dir / (filename.stem + ".txt"))
        convert_file(filename, out_file)
parser.py can read kres and/or ssj500k 2019-02-03 21:54:26 +00:00			`#!/usr/bin/python3`

			`from __future__ import print_function, unicode_literals, division`
			`import sys`
			`import os`
			`import re`
			`import pickle`
			`from pathlib import Path`

			`try:`
			`from lxml import etree as ElementTree`
			`except ImportError:`
			`import xml.etree.ElementTree as ElementTree`


			`# attributes`
			`ID_ATTR = "id"`
			`LEMMA_ATTR = "lemma"`
			`ANA_ATTR = "ana"`


			`# tags`
			`SENTENCE_TAG = 's'`
			`BIBL_TAG = 'bibl'`
			`PARAGRAPH_TAG = 'p'`
			`PC_TAG = 'pc'`
			`WORD_TAG = 'w'`
			`C_TAG = 'c'`
			`S_TAG = 'S'`
			`SEG_TAG = 'seg'`


			`class Sentence:`
			`def __init__(self, sentence, s_id):`
			`self.id = s_id`
			`self.words = []`
			`self.text = ""`

			`for word in sentence:`
			`self.handle_word(word)`

			`def handle_word(self, word):`
			`# handle space after`
			`if word.tag == S_TAG:`
			`assert(word.text is None)`
			`self.text += ' '`
			`return`

			`# ASK am I handling this correctly?`
			`elif word.tag == SEG_TAG:`
			`for segword in word:`
			`self.handle_word(segword)`
			`return`

			`# ASK handle unknown tags (are there others?)`
			`elif word.tag not in (WORD_TAG, C_TAG):`
			`return`

			`# ID`
			`idx = str(len(self.words) + 1)`

			`# TOKEN`
			`token = word.text`

			`# LEMMA`
			`if word.tag == WORD_TAG:`
			`lemma = word.get(LEMMA_ATTR)`
			`assert(lemma is not None)`
			`else:`
			`lemma = token`

			`# XPOS`
			`xpos = word.get('msd')`
			`if word.tag == C_TAG:`
			`xpos = "Z"`
			`elif xpos in ("Gp-ppdzn", "Gp-spmzd"):`
			`xpos = "N"`
			`elif xpos is None:`
			`print(self.id)`

			`# save word entry`
			`self.words.append(['F{}.{}'.format(self.id, idx), token, lemma, xpos])`

			`# save for text`
			`self.text += word.text`


			`def to_conllu(self):`
			`lines = []`
			`# lines.append('# sent_id = ' + self.id)`
			`# CONLLu does not like spaces at the end of # text`
			`# lines.append('# text = ' + self.text.strip())`
			`for word in self.words:`
			`lines.append('\t'.join('_' if data is None else data for data in word))`

			`return lines`

			`def convert_file(in_file, out_file):`
			`print("Nalaganje xml: {}".format(in_file))`
			`with open(str(in_file), 'r') as fp:`
			`xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1)`
			`xmlstring = xmlstring.replace(' xml:', ' ')`
			`xml_tree = ElementTree.XML(xmlstring)`

			`print("Pretvarjanje TEI -> TSV-U ...")`
			`lines = []`

			`for pidx, paragraph in enumerate(xml_tree.iterfind('.//body/p')):`
			`sidx = 1`
			`for sentence in paragraph:`
			`if sentence.tag != SENTENCE_TAG:`
			`continue`

			`sentence = Sentence(sentence, "{}.{}".format(pidx + 1, sidx))`
			`lines.extend(sentence.to_conllu())`
			`lines.append('') # ASK newline between sentences`
			`sidx += 1`

			`if len(lines) == 0:`
			`raise RuntimeError("Nobenih stavkov najdenih")`

			`print("Zapisovanje izhodne datoteke: {}".format(out_file))`
			`with open(out_file, 'w') as fp:`
			`for line in lines:`
			`if sys.version_info < (3, 0):`
			`line = line.encode('utf-8')`
			`print(line, file=fp)`


			`if __name__ == "__main__":`
			`"""`
			`Input: folder of TEI files, msds are encoded as msd="Z"`
			`Ouput: just a folder`
			`"""`

			`in_folder = sys.argv[1]`
			`out_folder = sys.argv[2]`
			`num_processes = int(sys.argv[3])`

			`files = Path(in_folder).rglob("*.xml")`
			`in_out = []`
			`for filename in files:`
			`out_file = out_folder + "/" + filename.name[:-4] + ".txt"`
			`convert_file(filename, out_file)`