# tei_conllu_conversions/tei_to_tsv.py

#!/usr/bin/python2

from __future__ import print_function, unicode_literals, division
import io
import os
import pickle
import re
import sys
from pathlib import Path

try:
    from lxml import etree as ElementTree
except ImportError:
    import xml.etree.ElementTree as ElementTree


# XML attribute names read from the TEI elements
ID_ATTR = 'id'
LEMMA_ATTR = 'lemma'  # lemma of a word element
ANA_ATTR = 'ana'


# XML element names
SENTENCE_TAG = "s"
BIBL_TAG = "bibl"
PARAGRAPH_TAG = "p"
PC_TAG = "pc"
WORD_TAG = "w"
C_TAG = "c"      # emitted with the fixed xpos "Z"
S_TAG = "S"      # text-less marker that contributes a single space
SEG_TAG = "seg"  # container whose children are handled like sentence children


class Sentence:
    """One sentence flattened into token rows plus its surface text.

    Attributes:
        id: sentence identifier given by the caller; prefix of every token id.
        words: list of ``[token_id, token, lemma, xpos]`` rows in document
            order, where ``token_id`` is ``'F<sentence id>.<1-based index>'``.
        text: surface text assembled from the token texts, with one space
            for every space-marker element encountered.
    """

    def __init__(self, sentence, s_id):
        """Build the rows from the child elements of *sentence*.

        Args:
            sentence: iterable of XML elements (the children of a sentence
                element).
            s_id: identifier stored as ``self.id`` and used in token ids.
        """
        self.id = s_id
        self.words = []
        self.text = ""

        for word in sentence:
            self.handle_word(word)

    def handle_word(self, word):
        """Process a single child element of the sentence.

        Space markers append a space to ``self.text``; ``seg`` elements are
        descended into recursively; word and ``c`` elements add a row to
        ``self.words`` and their text to ``self.text``; every other element
        is ignored.

        Args:
            word: an XML element exposing ``tag``, ``text`` and ``get()``.

        Raises:
            AssertionError: if a space marker carries text, or a word element
                has no lemma attribute.
        """
        # handle space after: the marker itself must not carry any text
        if word.tag == S_TAG:
            assert word.text is None, "space marker must not contain text"
            self.text += ' '
            return

        # ASK am I handling this correctly?
        # A segment is only a wrapper: its children count as regular tokens.
        if word.tag == SEG_TAG:
            for segword in word:
                self.handle_word(segword)
            return

        # ASK handle unknown tags (are there others?)
        if word.tag not in (WORD_TAG, C_TAG):
            return

        # ID: 1-based position among the tokens collected so far
        idx = str(len(self.words) + 1)

        # TOKEN: may be None for an element without text
        token = word.text

        # LEMMA: words must declare one, other tokens reuse their own text
        if word.tag == WORD_TAG:
            lemma = word.get(LEMMA_ATTR)
            assert lemma is not None, "word element without a lemma"
        else:
            lemma = token

        # XPOS
        xpos = word.get('msd')
        if word.tag == C_TAG:
            xpos = "Z"
        elif xpos in ("Gp-ppdzn", "Gp-spmzd"):
            xpos = "N"
        elif xpos is None:
            # Diagnostic only: report the sentence that has a word without
            # msd; the row is still emitted with '_' as xpos.
            print(self.id)

        # save word entry
        self.words.append(['F{}.{}'.format(self.id, idx), token, lemma, xpos])

        # save for text; an element without text contributes nothing here
        # (its row shows '_' for the token), instead of raising TypeError.
        if token is not None:
            self.text += token

    def to_conllu(self):
        """Return the rows as tab-joined strings, one string per token.

        ``None`` fields are rendered as ``'_'``.
        """
        lines = []
        # The '# sent_id' and '# text' header lines are currently disabled:
        # lines.append('# sent_id = ' + self.id)
        # CONLLu does not like spaces at the end of # text
        # lines.append('# text = ' + self.text.strip())
        for word in self.words:
            lines.append('\t'.join('_' if data is None else data for data in word))

        return lines

def convert_file(in_file, out_file):
    """Convert one TEI XML file into a tab-separated token file.

    Every ``s`` element that is a direct child of a ``body/p`` element is
    turned into its token rows followed by one empty line. Sentence ids have
    the form ``<paragraph number>.<sentence number>``, both 1-based; only
    ``s`` children advance the sentence number.

    Args:
        in_file: path of the XML file to read (string or path object).
        out_file: path of the file to write; it is always UTF-8 encoded.

    Raises:
        RuntimeError: if the document contains no sentence at all (nothing
            is written in that case).
    """
    print("Nalaganje xml: {}".format(in_file))
    # Read raw bytes so the XML parser decodes the document according to its
    # own encoding declaration rather than the locale's default encoding
    # (lxml also rejects unicode strings that carry such a declaration).
    # Assumes an ASCII-compatible document encoding for the two byte-level
    # substitutions below.
    with open(str(in_file), 'rb') as fp:
        raw_xml = fp.read()

    # Drop the first default-namespace declaration and the 'xml:' attribute
    # prefix so elements and attributes can be addressed by their bare names.
    raw_xml = re.sub(b' xmlns="[^"]+"', b'', raw_xml, count=1)
    raw_xml = raw_xml.replace(b' xml:', b' ')
    xml_tree = ElementTree.XML(raw_xml)

    print("Pretvarjanje TEI -> TSV-U ...")
    lines = []

    for pidx, paragraph in enumerate(xml_tree.iterfind('.//body/p')):
        sidx = 1
        for element in paragraph:
            if element.tag != SENTENCE_TAG:
                continue

            sentence = Sentence(element, "{}.{}".format(pidx + 1, sidx))
            lines.extend(sentence.to_conllu())
            lines.append('')  # ASK newline between sentences
            sidx += 1

    if not lines:
        raise RuntimeError("Nobenih stavkov najdenih")

    print("Zapisovanje izhodne datoteke: {}".format(out_file))
    # io.open gives the same explicit-UTF-8 text file on Python 2 and 3, so
    # no version-specific manual encoding is needed. Every line, including
    # the last one, is terminated by a newline.
    with io.open(out_file, 'w', encoding='utf-8') as fp:
        fp.write('\n'.join(lines) + '\n')


if __name__ == "__main__":
    # Command line: IN_FOLDER OUT_FOLDER [NUM_PROCESSES]
    #
    # Input: folder of TEI files, msds are encoded as msd="Z"
    # Output: folder that receives one .txt file per input .xml file
    if len(sys.argv) < 3:
        sys.exit("Usage: {} IN_FOLDER OUT_FOLDER [NUM_PROCESSES]".format(sys.argv[0]))

    in_folder = sys.argv[1]
    out_folder = sys.argv[2]
    # Still accepted (and validated as an integer) so existing invocations
    # keep working, but the files below are converted one after another.
    num_processes = int(sys.argv[3]) if len(sys.argv) > 3 else 1

    # NOTE: input files are searched recursively but written flat into
    # out_folder, so equally named files from different subfolders overwrite
    # each other's output.
    for filename in sorted(Path(in_folder).rglob("*.xml")):
        out_file = os.path.join(out_folder, filename.stem + ".txt")
        convert_file(filename, out_file)