Just some conllu <-> TEI scripts, should really be done better!

2019-02-01 10:31:47 +01:00 · 2019-02-01 10:31:47 +01:00 · da7e82d042
commit da7e82d042
3 changed files with 310 additions and 0 deletions
--- a/switch_tei_tags.py
+++ b/switch_tei_tags.py
@ -0,0 +1,41 @@
 #!/usr/bin/python
 # -*- coding: utf-8 -*-
 # python switch_tei_tags.py ~/slovnica/nova_slovnica/output/tei_pipeline/tei_tokenised ~/slovnica/nova_slovnica/output/tei_pipeline/tei_for_tagger  
 import sys
 import os
 import shutil
 import codecs
 import lxml.etree as lxml
 # Command-line arguments: the source directory first, then the destination.
 input_directory = sys.argv[1]
 output_directory = sys.argv[2]
 # Recreate the destination from scratch; errors raised while removing the
 # old tree are ignored.
 shutil.rmtree(output_directory, ignore_errors=True)
 os.makedirs(output_directory)
 # TEI namespace URI and the '{uri}' prefix used to build qualified tag names.
 tei_namespace = 'http://www.tei-c.org/ns/1.0'
 tei_namespace_qualifier = '{%s}' % tei_namespace
 def do_file(input_file_name, log=None):
    """Switch the tags of one TEI file and write it to the output directory.

    Files whose name does not end in ``.xml`` are skipped.  For the others,
    every TEI ``c`` element is renamed to ``S`` and its text removed, then
    every TEI ``pc`` element is renamed to ``c``.  The resulting tree is
    written under the same file name in ``output_directory``.

    :param input_file_name: file name relative to ``input_directory``.
    :param log: path of a text file to which the name of each processed
        file is appended.  Defaults to the normalised output directory path
        with ``.log`` appended, i.e. a file beside the output directory.
    """
    if not input_file_name.endswith('.xml'):
        return
    if log is None:
        # NOTE(review): the original code referenced a global ``log`` that
        # was never defined (NameError).  The default location chosen here
        # is an assumption -- confirm where the log is expected to live.
        log = os.path.normpath(output_directory) + '.log'
    with open(log, 'a') as fp:
        fp.write(input_file_name + "\n")
    # A plain path works on Python 2 and 3; the former ``unicode(...)`` call
    # does not exist on Python 3.
    tree = lxml.parse(os.path.join(input_directory, input_file_name))
    root = tree.getroot()
    ns = {'tei': tei_namespace}
    # Rename <c> first: the <pc> elements renamed to <c> below must not be
    # picked up by this query.
    for space in root.xpath('.//tei:c', namespaces=ns):
        space.tag = tei_namespace_qualifier + 'S'
        space.text = None
    for punctuation in root.xpath('.//tei:pc', namespaces=ns):
        punctuation.tag = tei_namespace_qualifier + 'c'
    output_file_name = os.path.join(output_directory, input_file_name)
    tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
 if __name__ == "__main__":
    # ``input_file_names`` was never defined in the original script, so the
    # loop raised NameError.  Process every entry of the input directory, in
    # sorted order for reproducible runs; do_file skips non-XML names.
    input_file_names = sorted(os.listdir(input_directory))
    for fname in input_file_names:
        do_file(fname)
--- a/tei_to_tsv.py
+++ b/tei_to_tsv.py
@ -0,0 +1,144 @@
 #!/usr/bin/python2
 from __future__ import print_function, unicode_literals, division
 import sys
 import os
 import re
 import pickle
 from pathlib import Path
 try:
    from lxml import etree as ElementTree
 except ImportError:
    import xml.etree.ElementTree as ElementTree
 # attributes
 ID_ATTR = "id"
 LEMMA_ATTR = "lemma"
 ANA_ATTR = "ana"
 # tags
 SENTENCE_TAG = 's'
 BIBL_TAG = 'bibl'
 PARAGRAPH_TAG = 'p'
 PC_TAG = 'pc'
 WORD_TAG = 'w'
 C_TAG = 'c'
 S_TAG = 'S'
 SEG_TAG = 'seg'
 class Sentence:
    """Token rows and running text collected from one sentence element."""
    def __init__(self, sentence, s_id):
        """Collect the tokens found among the children of ``sentence``.

        :param sentence: iterable of child elements (an XML element).
        :param s_id: sentence identifier, used as the prefix of token ids.
        """
        self.id = s_id
        # One [token_id, token, lemma, xpos] list per accepted element.
        self.words = []
        # Token texts concatenated, with ' ' for every S element.
        self.text = ""
        for word in sentence:
            self.handle_word(word)
    def handle_word(self, word):
        """Process one child element of the sentence.

        ``S`` elements add a space to the running text, ``seg`` elements are
        descended into, ``w`` and ``c`` elements become token rows, and any
        other element is ignored.

        :param word: the element to process.
        :raises AssertionError: if an ``S`` element has text or a ``w``
            element has no ``lemma`` attribute.
        """
        tag = word.tag
        # handle space after
        if tag == S_TAG:
            assert word.text is None
            self.text += ' '
            return
        # ASK am I handling this correctly?
        if tag == SEG_TAG:
            for segword in word:
                self.handle_word(segword)
            return
        # ASK handle unknown tags (are there others?)
        if tag not in (WORD_TAG, C_TAG):
            return
        # ID: 1-based position among the tokens collected so far
        idx = str(len(self.words) + 1)
        # TOKEN: None when the element is empty
        token = word.text
        # LEMMA: mandatory attribute on words, the token itself otherwise
        if tag == WORD_TAG:
            lemma = word.get(LEMMA_ATTR)
            assert lemma is not None
        else:
            lemma = token
        # XPOS
        xpos = word.get('msd')
        if tag == C_TAG:
            xpos = "Z"
        elif xpos in ("Gp-ppdzn", "Gp-spmzd"):
            # NOTE(review): these two msd values are replaced by "N"; the
            # reason is not visible in this file.
            xpos = "N"
        elif xpos is None:
            # Word without msd: print the sentence id so it can be located.
            print(self.id)
        # save word entry
        self.words.append(['F{}.{}'.format(self.id, idx), token, lemma, xpos])
        # save for text; an empty element has text None, which used to raise
        # TypeError here although to_conllu copes with a missing token
        self.text += token or ''
    def to_conllu(self):
        """Return one tab-separated line per token; ``None`` becomes ``_``."""
        # lines.append('# sent_id = ' + self.id)
        # CONLLu does not like spaces at the end of # text
        # lines.append('# text = ' + self.text.strip())
        return ['\t'.join('_' if data is None else data for data in word)
                for word in self.words]
 def convert_file(in_file, out_file):
    """Convert one TEI file into a tab-separated token file.

    The first default-namespace declaration and every `` xml:`` attribute
    prefix are stripped from the raw document so that plain tag and
    attribute names can be used.  Each ``s`` child of every ``body/p``
    element becomes a block of token lines followed by an empty line.

    :param in_file: path of the TEI file to read.
    :param out_file: path of the text file to write (UTF-8).
    :raises RuntimeError: if no sentence produced any output line.
    """
    # Local import: only this function needs it, and io.open behaves the
    # same on Python 2 and 3.
    import io
    print("Nalaganje xml: {}".format(in_file))
    # Read bytes instead of locale-decoded text: the parser then honours the
    # document's own encoding declaration, and lxml refuses str input that
    # carries such a declaration.  The byte patterns assume an
    # ASCII-compatible encoding such as UTF-8.
    with open(str(in_file), 'rb') as fp:
        xml_bytes = fp.read()
    xml_bytes = re.sub(b' xmlns="[^"]+"', b'', xml_bytes, count=1)
    xml_bytes = xml_bytes.replace(b' xml:', b' ')
    xml_tree = ElementTree.XML(xml_bytes)
    print("Pretvarjanje TEI -> TSV-U ...")
    lines = []
    for pidx, paragraph in enumerate(xml_tree.iterfind('.//body/p')):
        # Sentences are numbered from 1 within each paragraph.
        sidx = 1
        for element in paragraph:
            if element.tag != SENTENCE_TAG:
                continue
            sentence = Sentence(element, "{}.{}".format(pidx + 1, sidx))
            lines.extend(sentence.to_conllu())
            lines.append('') # ASK newline between sentences
            sidx += 1
    if not lines:
        raise RuntimeError("Nobenih stavkov najdenih")
    print("Zapisovanje izhodne datoteke: {}".format(out_file))
    # Always write UTF-8 rather than whatever the locale happens to be.
    with io.open(out_file, 'w', encoding='utf-8') as fp:
        fp.write('\n'.join(lines))
        fp.write('\n')
 if __name__ == "__main__":
    # Usage: tei_to_tsv.py IN_FOLDER OUT_FOLDER [NUM_PROCESSES]
    # Input: folder of TEI files, msds are encoded as msd="Z"
    # Output: folder that receives one .txt file per .xml file found
    in_folder = sys.argv[1]
    out_folder = sys.argv[2]
    # Still accepted for backward compatibility, but nothing uses it, so it
    # is no longer mandatory.
    num_processes = int(sys.argv[3]) if len(sys.argv) > 3 else 1
    if not os.path.isdir(out_folder):
        os.makedirs(out_folder)
    # The search is recursive but the output is flat: files sharing a name
    # in different sub-folders overwrite each other.
    for filename in sorted(Path(in_folder).rglob("*.xml")):
        out_file = os.path.join(out_folder, filename.stem + ".txt")
        convert_file(filename, out_file)
--- a/untagged_tei_to_tsv.py
+++ b/untagged_tei_to_tsv.py
@ -0,0 +1,125 @@
 #!/usr/bin/python2
 from __future__ import print_function, unicode_literals, division
 import sys
 import os
 import re
 import pickle
 from pathlib import Path
 try:
    from lxml import etree as ElementTree
 except ImportError:
    import xml.etree.ElementTree as ElementTree
 # attributes
 ID_ATTR = "id"
 LEMMA_ATTR = "lemma"
 ANA_ATTR = "ana"
 # tags
 SENTENCE_TAG = 's'
 BIBL_TAG = 'bibl'
 PARAGRAPH_TAG = 'p'
 PC_TAG = 'pc'
 WORD_TAG = 'w'
 C_TAG = 'c'
 S_TAG = 'S'
 SEG_TAG = 'seg'
 class Sentence:
    """Token rows and running text collected from one sentence element."""
    def __init__(self, sentence, s_id):
        """Collect the tokens found among the children of ``sentence``.

        :param sentence: iterable of child elements (an XML element).
        :param s_id: sentence identifier, used as the prefix of token ids.
        """
        self.id = s_id
        # One [token_id, token] list per accepted element.
        self.words = []
        # Token texts concatenated, with ' ' for every S element.
        self.text = ""
        for word in sentence:
            self.handle_word(word)
    def handle_word(self, word):
        """Process one child element of the sentence.

        ``S`` elements add a space to the running text, ``seg`` elements are
        descended into, ``w`` and ``c`` elements become token rows, and any
        other element is ignored.

        :param word: the element to process.
        :raises AssertionError: if an ``S`` element has text.
        """
        tag = word.tag
        # handle space after
        if tag == S_TAG:
            assert word.text is None
            self.text += ' '
            return
        # ASK am I handling this correctly?
        if tag == SEG_TAG:
            for segword in word:
                self.handle_word(segword)
            return
        # ASK handle unknown tags (are there others?)
        if tag not in (WORD_TAG, C_TAG):
            return
        # ID: 1-based position among the tokens collected so far
        idx = str(len(self.words) + 1)
        # TOKEN: None when the element is empty
        token = word.text
        # save word entry (no lemma or xpos columns in this variant)
        self.words.append(['F{}.{}'.format(self.id, idx), token])
        # save for text; an empty element has text None, which used to raise
        # TypeError here although to_conllu copes with a missing token
        self.text += token or ''
    def to_conllu(self):
        """Return one tab-separated line per token; ``None`` becomes ``_``."""
        # lines.append('# sent_id = ' + self.id)
        # CONLLu does not like spaces at the end of # text
        # lines.append('# text = ' + self.text.strip())
        return ['\t'.join('_' if data is None else data for data in word)
                for word in self.words]
 def convert_file(in_file, out_file):
    """Convert one TEI file without MSDs into a tab-separated token file.

    The first default-namespace declaration and every `` xml:`` attribute
    prefix are stripped from the raw document so that plain tag and
    attribute names can be used.  Each ``s`` child of every ``text/p``
    element becomes a block of token lines followed by an empty line.

    :param in_file: path of the TEI file to read.
    :param out_file: path of the text file to write (UTF-8).
    :raises RuntimeError: if no sentence produced any output line.
    """
    # Local import: only this function needs it, and io.open behaves the
    # same on Python 2 and 3.
    import io
    print("Nalaganje xml: {}".format(in_file))
    # Read bytes instead of locale-decoded text: the parser then honours the
    # document's own encoding declaration, and lxml refuses str input that
    # carries such a declaration.  The byte patterns assume an
    # ASCII-compatible encoding such as UTF-8.
    with open(str(in_file), 'rb') as fp:
        xml_bytes = fp.read()
    xml_bytes = re.sub(b' xmlns="[^"]+"', b'', xml_bytes, count=1)
    xml_bytes = xml_bytes.replace(b' xml:', b' ')
    xml_tree = ElementTree.XML(xml_bytes)
    # print("Pretvarjanje TEI -> TSV-U ...")
    lines = []
    for pidx, paragraph in enumerate(xml_tree.iterfind('.//text/p')):
        # Sentences are numbered from 1 within each paragraph.
        sidx = 1
        for element in paragraph:
            if element.tag != SENTENCE_TAG:
                continue
            sentence = Sentence(element, "{}.{}".format(pidx + 1, sidx))
            lines.extend(sentence.to_conllu())
            lines.append('') # ASK newline between sentences
            sidx += 1
    if not lines:
        raise RuntimeError("Nobenih stavkov najdenih")
    print("Zapisovanje izhodne datoteke: {}".format(out_file))
    # Always write UTF-8 rather than whatever the locale happens to be.
    with io.open(out_file, 'w', encoding='utf-8') as fp:
        fp.write('\n'.join(lines))
        fp.write('\n')
 if __name__ == "__main__":
    # Converting TEI without MSDs to conllu
    # Usage: untagged_tei_to_tsv.py IN_FOLDER OUT_FOLDER
    in_folder = sys.argv[1]
    out_folder = sys.argv[2]
    # Create the output folder up front so the first write cannot fail on a
    # missing directory.
    if not os.path.isdir(out_folder):
        os.makedirs(out_folder)
    # The search is recursive but the output is flat: files sharing a name
    # in different sub-folders overwrite each other.
    for filename in sorted(Path(in_folder).rglob("*.xml")):
        out_file = os.path.join(out_folder, filename.stem + ".txt")
        convert_file(filename, out_file)