#!/usr/bin/python2
"""Convert TEI XML files (without MSDs) into a tab-separated token list.

Usage: script IN_FOLDER OUT_FOLDER

Every ``*.xml`` file found recursively under IN_FOLDER is converted to
``OUT_FOLDER/<name>.txt``. Each output line holds a token id and the token
text separated by a tab; an empty line follows every sentence.
"""
from __future__ import print_function, unicode_literals, division

import io
import os
import pickle
import re
import sys
from pathlib import Path

try:
    from lxml import etree as ElementTree
except ImportError:
    import xml.etree.ElementTree as ElementTree

# attributes
ID_ATTR = "id"
LEMMA_ATTR = "lemma"
ANA_ATTR = "ana"

# tags
SENTENCE_TAG = 's'
BIBL_TAG = 'bibl'
PARAGRAPH_TAG = 'p'
PC_TAG = 'pc'
WORD_TAG = 'w'
C_TAG = 'c'
S_TAG = 'S'
SEG_TAG = 'seg'

# Matches the default namespace declaration. It is a bytes pattern because
# the XML is processed as raw bytes before being handed to the parser.
_XMLNS_RE = re.compile(b' xmlns="[^"]+"')


class Sentence(object):
    """Tokens and surface text collected from one sentence element.

    Attributes:
        id: sentence identifier, used as the prefix of every token id.
        words: list of ``[token_id, token_text]`` pairs in document order;
            ``token_text`` is ``None`` for an element without text.
        text: sentence text rebuilt from the tokens and space markers.
    """

    def __init__(self, sentence, s_id):
        """Collect the tokens of *sentence* (an XML element) under id *s_id*."""
        self.id = s_id
        self.words = []
        self.text = ""
        for word in sentence:
            self.handle_word(word)

    def handle_word(self, word):
        """Process one child element of a sentence.

        * ``S`` elements add a single space to ``self.text``.
        * ``seg`` elements are flattened: their children are processed
          recursively as if they were direct children of the sentence.
        * ``w`` and ``c`` elements become tokens.
        * Every other element is ignored.
        """
        # space marker between tokens
        if word.tag == S_TAG:
            # sanity check: a space marker is expected to carry no text
            assert word.text is None
            self.text += ' '
            return

        # TODO(review): original author was unsure whether flattening
        # segments like this is the right treatment.
        if word.tag == SEG_TAG:
            for segword in word:
                self.handle_word(segword)
            return

        # NOTE(review): unknown tags are dropped silently. PC_TAG is defined
        # above but not accepted here, so <pc> elements are skipped too -
        # confirm this is intended.
        if word.tag not in (WORD_TAG, C_TAG):
            return

        # token ids are 1-based within the sentence
        idx = str(len(self.words) + 1)
        token = word.text
        self.words.append(['F{}.{}'.format(self.id, idx), token])

        # An empty element has text None; it is still recorded as a token
        # (rendered as '_' by to_conllu) but contributes nothing to the
        # sentence text. Concatenating None would raise TypeError.
        if token is not None:
            self.text += token

    def to_conllu(self):
        """Return one tab-separated line per token, with '_' for None fields.

        No ``# sent_id`` / ``# text`` header lines are emitted.
        """
        return [
            '\t'.join('_' if data is None else data for data in word)
            for word in self.words
        ]


def convert_file(in_file, out_file):
    """Convert one TEI XML file to a tab-separated token file.

    Args:
        in_file: path of the XML file to read (string or ``Path``).
        out_file: path of the text file to write; it is written as UTF-8.

    Raises:
        RuntimeError: if no sentence is found under any ``text/p`` element.
    """
    print("Nalaganje xml: {}".format(in_file))

    # Read raw bytes so the parser decodes the document according to its own
    # XML declaration instead of the platform's locale encoding. Passing
    # bytes also avoids lxml's refusal of unicode strings that carry an
    # encoding declaration.
    with open(str(in_file), 'rb') as fp:
        raw = fp.read()

    # Drop the default namespace and the "xml:" attribute prefix so that
    # elements can be addressed by their bare names below.
    raw = _XMLNS_RE.sub(b'', raw, count=1)
    raw = raw.replace(b' xml:', b' ')
    xml_tree = ElementTree.XML(raw)

    lines = []
    for pidx, paragraph in enumerate(xml_tree.iterfind('.//text/p')):
        sidx = 1
        for element in paragraph:
            if element.tag != SENTENCE_TAG:
                continue
            sentence = Sentence(element, "{}.{}".format(pidx + 1, sidx))
            lines.extend(sentence.to_conllu())
            lines.append('')  # empty line separates sentences
            sidx += 1

    if not lines:
        raise RuntimeError("Nobenih stavkov najdenih")

    print("Zapisovanje izhodne datoteke: {}".format(out_file))
    # io.open gives the same explicit UTF-8 output on Python 2 and 3.
    with io.open(out_file, 'w', encoding='utf-8') as fp:
        fp.write('\n'.join(lines) + '\n')


def main(argv=None):
    """Convert every ``*.xml`` file under the input folder.

    Args:
        argv: ``[in_folder, out_folder]``; defaults to ``sys.argv[1:]``.

    Returns:
        Process exit status: 0 on success, 2 when arguments are missing.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        print("Usage: {} IN_FOLDER OUT_FOLDER".format(sys.argv[0]),
              file=sys.stderr)
        return 2
    in_folder, out_folder = args[0], args[1]

    if not os.path.isdir(out_folder):
        os.makedirs(out_folder)

    # NOTE: the search is recursive but the output name uses only the file
    # name, so equally named files in different subfolders overwrite each
    # other in out_folder.
    for filename in Path(in_folder).rglob("*.xml"):
        # strip the 4-character ".xml" extension matched by the glob
        out_file = os.path.join(out_folder, filename.name[:-4] + ".txt")
        convert_file(filename, out_file)
    return 0


if __name__ == "__main__":
    sys.exit(main())