Just some conllu <-> TEI scripts, should really be done better!

2019-02-01 10:31:47 +01:00
commit da7e82d042
3 changed files with 310 additions and 0 deletions
@@ -0,0 +1,41 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+# python switch_tei_tags.py ~/slovnica/nova_slovnica/output/tei_pipeline/tei_tokenised ~/slovnica/nova_slovnica/output/tei_pipeline/tei_for_tagger  
+
+import sys
+import os
+import shutil
+import codecs
+
+import lxml.etree as lxml
+
+input_directory = sys.argv[1]
+output_directory = sys.argv[2]
+
+shutil.rmtree(output_directory,True)
+os.makedirs(output_directory)
+
+tei_namespace = 'http://www.tei-c.org/ns/1.0'
+tei_namespace_qualifier = '{' + tei_namespace + '}'
+
+def do_file(input_file_name):
+    if (input_file_name.endswith('.xml')):
+        with open(log, 'a') as fp:
+             fp.write(input_file_name + "\n")
+
+        tree = lxml.parse(unicode(input_directory + '/' + input_file_name))
+        root = tree.getroot()
+        spaces = root.xpath('.//tei:c', namespaces={'tei':tei_namespace})
+        for space in spaces:
+            space.tag = tei_namespace_qualifier + 'S'
+            space.text = None
+        punctuations = root.xpath('.//tei:pc', namespaces={'tei':tei_namespace})
+        for punctuation in punctuations:
+            punctuation.tag = tei_namespace_qualifier + 'c'
+        output_file_name = output_directory + '/' + input_file_name
+        tree.write(output_file_name, encoding='UTF-8', pretty_print=True)
+
+if __name__ == "__main__":
+    for fname in input_file_names:
+        do_file(fname)
@@ -0,0 +1,144 @@
+#!/usr/bin/python2
+
+from __future__ import print_function, unicode_literals, division
+import sys
+import os
+import re
+import pickle
+from pathlib import Path
+
+try:
+    from lxml import etree as ElementTree
+except ImportError:
+    import xml.etree.ElementTree as ElementTree
+
+
+# XML attribute names looked up on token elements
+ID_ATTR = "id"  # not referenced by the code visible in this script
+LEMMA_ATTR = "lemma"  # read from <w> elements in Sentence.handle_word
+ANA_ATTR = "ana"  # not referenced by the code visible in this script
+
+
+# element tag names (compared against Element.tag after convert_file strips the default namespace)
+SENTENCE_TAG = 's'  # paragraph children with this tag are turned into Sentence objects
+BIBL_TAG = 'bibl'  # not referenced by the code visible in this script
+PARAGRAPH_TAG = 'p'  # not referenced; convert_file hard-codes './/body/p'
+PC_TAG = 'pc'  # not referenced by the code visible in this script
+WORD_TAG = 'w'  # token element that must carry a lemma attribute
+C_TAG = 'c'  # token element emitted with lemma = token text and xpos "Z"
+S_TAG = 'S'  # text-less element that adds one space to Sentence.text
+SEG_TAG = 'seg'  # container whose children are handled recursively
+
+
+class Sentence:
+    def __init__(self, sentence, s_id):
+        self.id = s_id
+        self.words = []
+        self.text = ""
+
+        for word in sentence:
+            self.handle_word(word)
+
+    def handle_word(self, word):
+        # handle space after
+        if word.tag == S_TAG:
+            assert(word.text is None)
+            self.text += ' '
+            return
+
+        # ASK am I handling this correctly?
+        elif word.tag == SEG_TAG:
+            for segword in word:
+                self.handle_word(segword)
+            return
+
+        # ASK handle unknown tags (are there others?)
+        elif word.tag not in (WORD_TAG, C_TAG):
+            return
+
+        # ID
+        idx = str(len(self.words) + 1)
+
+        # TOKEN
+        token = word.text
+
+        # LEMMA
+        if word.tag == WORD_TAG:
+            lemma = word.get(LEMMA_ATTR)
+            assert(lemma is not None)
+        else:
+            lemma = token
+
+        # XPOS
+        xpos = word.get('msd')
+        if word.tag == C_TAG:
+            xpos = "Z"
+        elif xpos in ("Gp-ppdzn", "Gp-spmzd"):
+            xpos = "N"
+        elif xpos is None:
+            print(self.id)
+
+        # save word entry
+        self.words.append(['F{}.{}'.format(self.id, idx), token, lemma, xpos])
+
+        # save for text
+        self.text += word.text
+
+
+    def to_conllu(self):
+        lines = []
+        # lines.append('# sent_id = ' + self.id)
+        # CONLLu does not like spaces at the end of # text
+        # lines.append('# text = ' + self.text.strip())
+        for word in self.words:
+            lines.append('\t'.join('_' if data is None else data for data in word))
+
+        return lines
+
+def convert_file(in_file, out_file):
+    print("Nalaganje xml: {}".format(in_file))
+    with open(str(in_file), 'r') as fp:
+        xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1)
+        xmlstring = xmlstring.replace(' xml:', ' ')
+        xml_tree = ElementTree.XML(xmlstring)
+
+    print("Pretvarjanje TEI -> TSV-U ...")
+    lines = []
+
+    for pidx, paragraph in enumerate(xml_tree.iterfind('.//body/p')):
+        sidx = 1
+        for sentence in paragraph:
+            if sentence.tag != SENTENCE_TAG:
+                continue
+
+            sentence = Sentence(sentence, "{}.{}".format(pidx + 1, sidx))
+            lines.extend(sentence.to_conllu())
+            lines.append('') # ASK newline between sentences
+            sidx += 1
+
+    if len(lines) == 0:
+        raise RuntimeError("Nobenih stavkov najdenih")
+
+    print("Zapisovanje izhodne datoteke: {}".format(out_file))
+    with open(out_file, 'w') as fp:
+        for line in lines:
+            if sys.version_info < (3, 0):
+                line = line.encode('utf-8')
+            print(line, file=fp)
+
+
+if __name__ == "__main__":
+    """
+    Input: folder of TEI files, msds are encoded as msd="Z"
+    Ouput: just a folder
+    """
+
+    in_folder = sys.argv[1]
+    out_folder = sys.argv[2]
+    num_processes = int(sys.argv[3])
+
+    files = Path(in_folder).rglob("*.xml")
+    in_out = []
+    for filename in files:
+        out_file = out_folder + "/" + filename.name[:-4] + ".txt"
+        convert_file(filename, out_file)
@@ -0,0 +1,125 @@
+#!/usr/bin/python2
+
+from __future__ import print_function, unicode_literals, division
+import sys
+import os
+import re
+import pickle
+from pathlib import Path
+
+try:
+    from lxml import etree as ElementTree
+except ImportError:
+    import xml.etree.ElementTree as ElementTree
+
+
+# XML attribute names (none of them is referenced by the code visible in this script)
+ID_ATTR = "id"
+LEMMA_ATTR = "lemma"
+ANA_ATTR = "ana"
+
+
+# element tag names (compared against Element.tag after convert_file strips the default namespace)
+SENTENCE_TAG = 's'  # paragraph children with this tag are turned into Sentence objects
+BIBL_TAG = 'bibl'  # not referenced by the code visible in this script
+PARAGRAPH_TAG = 'p'  # not referenced; convert_file hard-codes './/text/p'
+PC_TAG = 'pc'  # not referenced by the code visible in this script
+WORD_TAG = 'w'  # token element, stored as [token_id, text]
+C_TAG = 'c'  # token element, stored the same way as <w>
+S_TAG = 'S'  # text-less element that adds one space to Sentence.text
+SEG_TAG = 'seg'  # container whose children are handled recursively
+
+
+class Sentence:
+    def __init__(self, sentence, s_id):
+        self.id = s_id
+        self.words = []
+        self.text = ""
+
+        for word in sentence:
+            self.handle_word(word)
+
+    def handle_word(self, word):
+        # handle space after
+        if word.tag == S_TAG:
+            assert(word.text is None)
+            self.text += ' '
+            return
+
+        # ASK am I handling this correctly?
+        elif word.tag == SEG_TAG:
+            for segword in word:
+                self.handle_word(segword)
+            return
+
+        # ASK handle unknown tags (are there others?)
+        elif word.tag not in (WORD_TAG, C_TAG):
+            return
+
+        # ID
+        idx = str(len(self.words) + 1)
+
+        # TOKEN
+        token = word.text
+
+        # save word entry
+        self.words.append(['F{}.{}'.format(self.id, idx), token]) #, lemma, xpos])
+
+        # save for text
+        self.text += word.text
+
+
+    def to_conllu(self):
+        lines = []
+        # lines.append('# sent_id = ' + self.id)
+        # CONLLu does not like spaces at the end of # text
+        # lines.append('# text = ' + self.text.strip())
+        for word in self.words:
+            lines.append('\t'.join('_' if data is None else data for data in word))
+
+        return lines
+
+def convert_file(in_file, out_file):
+    print("Nalaganje xml: {}".format(in_file))
+    with open(str(in_file), 'r') as fp:
+        xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1)
+        xmlstring = xmlstring.replace(' xml:', ' ')
+        xml_tree = ElementTree.XML(xmlstring)
+
+    # print("Pretvarjanje TEI -> TSV-U ...")
+    lines = []
+
+    for pidx, paragraph in enumerate(xml_tree.iterfind('.//text/p')):
+        sidx = 1
+        for sentence in paragraph:
+            if sentence.tag != SENTENCE_TAG:
+                continue
+
+            sentence = Sentence(sentence, "{}.{}".format(pidx + 1, sidx))
+            lines.extend(sentence.to_conllu())
+            lines.append('') # ASK newline between sentences
+            sidx += 1
+
+    if len(lines) == 0:
+        raise RuntimeError("Nobenih stavkov najdenih")
+
+    print("Zapisovanje izhodne datoteke: {}".format(out_file))
+    with open(out_file, 'w') as fp:
+        for line in lines:
+            if sys.version_info < (3, 0):
+                line = line.encode('utf-8')
+            print(line, file=fp)
+
+
+if __name__ == "__main__":
+    """
+    Converting TEI without MSDs to conllu
+    """
+    in_folder = sys.argv[1]
+    out_folder = sys.argv[2]
+
+    files = Path(in_folder).rglob("*.xml")
+    in_out = []
+    for filename in files:
+        out_file = out_folder + "/" + filename.name[:-4] + ".txt"
+        convert_file(filename, out_file)