#!/usr/bin/python
# -*- coding: utf-8 -*-
"""switch_tei_tags.py -- rename TEI space/punctuation tags for the tagger.

Every ``.xml`` file found directly in the input directory is rewritten into
the output directory with two tag changes applied, in this order:

* ``<c>`` elements become empty ``<S/>`` elements;
* ``<pc>`` elements become ``<c>`` elements.

Usage:
    python switch_tei_tags.py INPUT_DIR OUTPUT_DIR [LOG_FILE]

Example:
    python switch_tei_tags.py ~/slovnica/nova_slovnica/output/tei_pipeline/tei_tokenised ~/slovnica/nova_slovnica/output/tei_pipeline/tei_for_tagger
"""

import sys
import os
import shutil
import codecs

tei_namespace = 'http://www.tei-c.org/ns/1.0'
tei_namespace_qualifier = '{' + tei_namespace + '}'

try:
    import lxml.etree as etree
    _WRITE_OPTIONS = {'pretty_print': True}
except ImportError:
    # Same fallback the sibling conversion scripts use.  The standard
    # library serializer has no pretty printing and would emit "ns0:"
    # prefixes unless the TEI namespace is registered as the default one.
    import xml.etree.ElementTree as etree
    etree.register_namespace('', tei_namespace)
    _WRITE_OPTIONS = {}

# Filled in by main(); do_file() falls back to these values when it is
# called with the file name only.
input_directory = None
output_directory = None
log = None


def do_file(input_file_name, input_dir=None, output_dir=None, log_path=None):
    """Convert one TEI file; names not ending in ``.xml`` are skipped.

    Args:
        input_file_name: bare file name, relative to the input directory.
        input_dir: directory to read from (default: module ``input_directory``).
        output_dir: directory to write to (default: module ``output_directory``).
        log_path: file that processed names are appended to (default: module
            ``log``).  Without a log file the name is written to stderr.

    Raises:
        ValueError: if no input or output directory is configured.
    """
    if not input_file_name.endswith('.xml'):
        return

    if input_dir is None:
        input_dir = input_directory
    if output_dir is None:
        output_dir = output_directory
    if log_path is None:
        log_path = log
    if input_dir is None or output_dir is None:
        raise ValueError("input and output directories are not configured")

    if log_path:
        with open(log_path, 'a') as fp:
            fp.write(input_file_name + "\n")
    else:
        sys.stderr.write(input_file_name + "\n")

    tree = etree.parse(os.path.join(input_dir, input_file_name))
    root = tree.getroot()

    # findall() returns a list, so renaming while looping is safe.  The
    # order matters: the old <c> elements must be gone before <pc> takes
    # over the name.
    for space in root.findall('.//' + tei_namespace_qualifier + 'c'):
        space.tag = tei_namespace_qualifier + 'S'
        space.text = None
    for punctuation in root.findall('.//' + tei_namespace_qualifier + 'pc'):
        punctuation.tag = tei_namespace_qualifier + 'c'

    output_file_name = os.path.join(output_dir, input_file_name)
    tree.write(output_file_name, encoding='UTF-8', **_WRITE_OPTIONS)


def main(argv):
    """Recreate the output directory and convert every input file.

    Args:
        argv: ``[program, input_dir, output_dir]`` with an optional fourth
            item naming the log file.
    """
    global input_directory, output_directory, log

    if len(argv) < 3:
        sys.exit("Usage: switch_tei_tags.py INPUT_DIR OUTPUT_DIR [LOG_FILE]")

    input_directory = argv[1]
    output_directory = argv[2]
    log = argv[3] if len(argv) > 3 else None

    # The output directory is always rebuilt from scratch.
    shutil.rmtree(output_directory, True)
    os.makedirs(output_directory)

    for fname in sorted(os.listdir(input_directory)):
        do_file(fname)


if __name__ == "__main__":
    main(sys.argv)
#!/usr/bin/python2
"""tei_to_tsv.py -- convert tagged TEI files to tab-separated token files.

Each sentence becomes a run of ``ID<TAB>TOKEN<TAB>LEMMA<TAB>XPOS`` rows
followed by one empty line.

Usage:
    python tei_to_tsv.py IN_FOLDER OUT_FOLDER [NUM_PROCESSES]
"""

from __future__ import print_function, unicode_literals, division
import io
import sys
import os
import re
import pickle
from pathlib import Path

try:
    from lxml import etree as ElementTree
except ImportError:
    import xml.etree.ElementTree as ElementTree


# attributes
ID_ATTR = "id"
LEMMA_ATTR = "lemma"
ANA_ATTR = "ana"


# tags
SENTENCE_TAG = 's'
BIBL_TAG = 'bibl'
PARAGRAPH_TAG = 'p'
PC_TAG = 'pc'
WORD_TAG = 'w'
C_TAG = 'c'
S_TAG = 'S'
SEG_TAG = 'seg'


class Sentence(object):
    """Token rows and plain text collected from one ``<s>`` element."""

    def __init__(self, sentence, s_id):
        """Walk the children of *sentence* and collect its tokens.

        Args:
            sentence: the ``<s>`` element to read.
            s_id: sentence identifier used as the prefix of every token id.
        """
        self.id = s_id
        self.words = []
        self.text = ""

        for word in sentence:
            self.handle_word(word)

    def handle_word(self, word):
        """Record one child element of the sentence.

        ``<S>`` contributes a space to the text, ``<seg>`` is descended
        into, ``<w>`` and ``<c>`` become token rows and every other tag is
        ignored.

        Raises:
            ValueError: if an ``<S>`` element carries text or a ``<w>``
                element has no ``lemma`` attribute.
        """
        tag = word.tag

        # Space marker: must be empty and only affects the running text.
        if tag == S_TAG:
            if word.text is not None:
                raise ValueError(
                    "sentence {}: <S> element must be empty".format(self.id))
            self.text += ' '
            return

        # NOTE(review): <seg> is flattened, its children are treated as if
        # they sat directly in the sentence -- confirm this is intended.
        if tag == SEG_TAG:
            for segword in word:
                self.handle_word(segword)
            return

        # NOTE(review): all other tags are dropped silently.
        if tag not in (WORD_TAG, C_TAG):
            return

        idx = str(len(self.words) + 1)
        token = word.text

        if tag == WORD_TAG:
            lemma = word.get(LEMMA_ATTR)
            if lemma is None:
                raise ValueError(
                    "sentence {}: <w> element without a lemma".format(self.id))
        else:
            lemma = token

        xpos = word.get('msd')
        if tag == C_TAG:
            xpos = "Z"
        elif xpos in ("Gp-ppdzn", "Gp-spmzd"):
            # NOTE(review): these two MSDs are rewritten to "N"; the reason
            # is not visible in this file.
            xpos = "N"
        elif xpos is None:
            print("Warning: word without msd in sentence {}".format(self.id),
                  file=sys.stderr)

        self.words.append(['F{}.{}'.format(self.id, idx), token, lemma, xpos])

        # An empty element has text None; it adds nothing to the sentence
        # text instead of raising TypeError.
        self.text += token or ''

    def to_conllu(self):
        """Return one tab-separated line per token; None cells become '_'."""
        return [
            '\t'.join('_' if data is None else data for data in word)
            for word in self.words
        ]


def convert_file(in_file, out_file):
    """Convert one TEI file into a UTF-8 tab-separated file.

    Args:
        in_file: path of the TEI file to read.
        out_file: path of the text file to write.

    Raises:
        RuntimeError: if the document yields no output lines.
    """
    print("Nalaganje xml: {}".format(in_file))
    # Work on bytes: the parser then honours the encoding declared in the
    # document, and lxml refuses already-decoded text that still carries an
    # encoding declaration.
    with open(str(in_file), 'rb') as fp:
        xml_bytes = fp.read()
    # Drop the default namespace and the "xml:" attribute prefix so that
    # plain tag names can be used in the lookups below.
    xml_bytes = re.sub(b' xmlns="[^"]+"', b'', xml_bytes, count=1)
    xml_bytes = xml_bytes.replace(b' xml:', b' ')
    xml_tree = ElementTree.XML(xml_bytes)

    print("Pretvarjanje TEI -> TSV-U ...")
    lines = []

    for pidx, paragraph in enumerate(xml_tree.iterfind('.//body/p')):
        sidx = 1
        for sentence in paragraph:
            if sentence.tag != SENTENCE_TAG:
                continue

            sentence = Sentence(sentence, "{}.{}".format(pidx + 1, sidx))
            lines.extend(sentence.to_conllu())
            lines.append('')  # empty line between sentences
            sidx += 1

    if not lines:
        raise RuntimeError("Nobenih stavkov najdenih")

    print("Zapisovanje izhodne datoteke: {}".format(out_file))
    with io.open(str(out_file), 'w', encoding='utf-8') as fp:
        for line in lines:
            fp.write(line + '\n')


def main(argv):
    """Convert every ``*.xml`` file found under the input folder.

    Input: folder of TEI files, msds are encoded as msd="Z"
    Output: just a folder

    Args:
        argv: ``[program, in_folder, out_folder]``.  An optional fourth
            item (number of processes) is accepted for compatibility with
            existing invocations but is not used.
    """
    if len(argv) < 3:
        sys.exit("Usage: tei_to_tsv.py IN_FOLDER OUT_FOLDER [NUM_PROCESSES]")

    in_folder = argv[1]
    out_folder = argv[2]

    if not os.path.isdir(out_folder):
        os.makedirs(out_folder)

    # NOTE(review): files with the same name in different sub-folders are
    # written to the same output file.
    for filename in sorted(Path(in_folder).rglob("*.xml")):
        out_file = os.path.join(out_folder, filename.name[:-4] + ".txt")
        convert_file(filename, out_file)


if __name__ == "__main__":
    main(sys.argv)
#!/usr/bin/python2
"""untagged_tei_to_tsv.py -- convert TEI without MSDs to token files.

Each sentence becomes a run of ``ID<TAB>TOKEN`` rows followed by one empty
line.

Usage:
    python untagged_tei_to_tsv.py IN_FOLDER OUT_FOLDER
"""

from __future__ import print_function, unicode_literals, division
import io
import sys
import os
import re
import pickle
from pathlib import Path

try:
    from lxml import etree as ElementTree
except ImportError:
    import xml.etree.ElementTree as ElementTree


# attributes
ID_ATTR = "id"
LEMMA_ATTR = "lemma"
ANA_ATTR = "ana"


# tags
SENTENCE_TAG = 's'
BIBL_TAG = 'bibl'
PARAGRAPH_TAG = 'p'
PC_TAG = 'pc'
WORD_TAG = 'w'
C_TAG = 'c'
S_TAG = 'S'
SEG_TAG = 'seg'


class Sentence(object):
    """Token rows and plain text collected from one ``<s>`` element."""

    def __init__(self, sentence, s_id):
        """Walk the children of *sentence* and collect its tokens.

        Args:
            sentence: the ``<s>`` element to read.
            s_id: sentence identifier used as the prefix of every token id.
        """
        self.id = s_id
        self.words = []
        self.text = ""

        for word in sentence:
            self.handle_word(word)

    def handle_word(self, word):
        """Record one child element of the sentence.

        ``<S>`` contributes a space to the text, ``<seg>`` is descended
        into, ``<w>`` and ``<c>`` become token rows and every other tag is
        ignored.

        Raises:
            ValueError: if an ``<S>`` element carries text.
        """
        tag = word.tag

        # Space marker: must be empty and only affects the running text.
        if tag == S_TAG:
            if word.text is not None:
                raise ValueError(
                    "sentence {}: <S> element must be empty".format(self.id))
            self.text += ' '
            return

        # NOTE(review): <seg> is flattened, its children are treated as if
        # they sat directly in the sentence -- confirm this is intended.
        if tag == SEG_TAG:
            for segword in word:
                self.handle_word(segword)
            return

        # NOTE(review): all other tags are dropped silently.
        if tag not in (WORD_TAG, C_TAG):
            return

        idx = str(len(self.words) + 1)
        token = word.text

        # Only the id and the surface form are emitted for untagged input.
        self.words.append(['F{}.{}'.format(self.id, idx), token])

        # An empty element has text None; it adds nothing to the sentence
        # text instead of raising TypeError.
        self.text += token or ''

    def to_conllu(self):
        """Return one tab-separated line per token; None cells become '_'."""
        return [
            '\t'.join('_' if data is None else data for data in word)
            for word in self.words
        ]


def convert_file(in_file, out_file):
    """Convert one untagged TEI file into a UTF-8 tab-separated file.

    Args:
        in_file: path of the TEI file to read.
        out_file: path of the text file to write.

    Raises:
        RuntimeError: if the document yields no output lines.
    """
    print("Nalaganje xml: {}".format(in_file))
    # Work on bytes: the parser then honours the encoding declared in the
    # document, and lxml refuses already-decoded text that still carries an
    # encoding declaration.
    with open(str(in_file), 'rb') as fp:
        xml_bytes = fp.read()
    # Drop the default namespace and the "xml:" attribute prefix so that
    # plain tag names can be used in the lookups below.
    xml_bytes = re.sub(b' xmlns="[^"]+"', b'', xml_bytes, count=1)
    xml_bytes = xml_bytes.replace(b' xml:', b' ')
    xml_tree = ElementTree.XML(xml_bytes)

    lines = []

    # Paragraphs are looked up directly under <text> in this script.
    for pidx, paragraph in enumerate(xml_tree.iterfind('.//text/p')):
        sidx = 1
        for sentence in paragraph:
            if sentence.tag != SENTENCE_TAG:
                continue

            sentence = Sentence(sentence, "{}.{}".format(pidx + 1, sidx))
            lines.extend(sentence.to_conllu())
            lines.append('')  # empty line between sentences
            sidx += 1

    if not lines:
        raise RuntimeError("Nobenih stavkov najdenih")

    print("Zapisovanje izhodne datoteke: {}".format(out_file))
    with io.open(str(out_file), 'w', encoding='utf-8') as fp:
        for line in lines:
            fp.write(line + '\n')


def main(argv):
    """Convert every ``*.xml`` file found under the input folder.

    Converting TEI without MSDs to conllu.

    Args:
        argv: ``[program, in_folder, out_folder]``.
    """
    if len(argv) < 3:
        sys.exit("Usage: untagged_tei_to_tsv.py IN_FOLDER OUT_FOLDER")

    in_folder = argv[1]
    out_folder = argv[2]

    if not os.path.isdir(out_folder):
        os.makedirs(out_folder)

    # NOTE(review): files with the same name in different sub-folders are
    # written to the same output file.
    for filename in sorted(Path(in_folder).rglob("*.xml")):
        out_file = os.path.join(out_folder, filename.name[:-4] + ".txt")
        convert_file(filename, out_file)


if __name__ == "__main__":
    main(sys.argv)