From dcc2935c3c4d7c52858f5841bc483d2984297715 Mon Sep 17 00:00:00 2001 From: voje Date: Thu, 28 Feb 2019 09:40:25 +0100 Subject: [PATCH] some changes on server --- .gitignore | 2 +- dockerfiles/python-java/Dockerfile | 3 +- dockerfiles/python-java/Makefile | 12 ++- parser/tei_to_dict.py | 15 +++ parser/test.py | 151 +++++++++++++++++++++++++++++ 5 files changed, 176 insertions(+), 7 deletions(-) create mode 100644 parser/tei_to_dict.py create mode 100755 parser/test.py diff --git a/.gitignore b/.gitignore index 3c5a087..3b822a4 100644 --- a/.gitignore +++ b/.gitignore @@ -2,5 +2,5 @@ *.pickle *.log -data/kres_out/ +data/kres_out/* data/kres_example/ diff --git a/dockerfiles/python-java/Dockerfile b/dockerfiles/python-java/Dockerfile index d086fc9..e2b3926 100644 --- a/dockerfiles/python-java/Dockerfile +++ b/dockerfiles/python-java/Dockerfile @@ -5,7 +5,8 @@ RUN apt-get install -y \ vim \ default-jdk \ python3 \ -python3-pip +python3-pip \ +sshfs RUN pip3 install lxml pandas sklearn diff --git a/dockerfiles/python-java/Makefile b/dockerfiles/python-java/Makefile index eda996c..74dae84 100644 --- a/dockerfiles/python-java/Makefile +++ b/dockerfiles/python-java/Makefile @@ -5,14 +5,16 @@ all: build run build: docker build . 
-t $(IMAGE_NAME) + run: docker run \ - -it \ - --user $(shell id -u):$(shell id -g) \ + -it \ -v /home/${USER}:/home/${USER} \ + --user $(shell id -u):$(shell id -g) \ -v /etc/passwd:/etc/passwd \ -v /etc/group:/etc/group \ - -v $(shell pwd)/../../:/cjvt-srl-tagging \ + -v $(shell pwd)/../../:/cjvt-srl-tagging \ -w /cjvt-srl-tagging \ - python-java \ - /bin/bash + -v /home/kristjan/kres_mount:/kres_mount:ro \ + python-java \ + /bin/bash diff --git a/parser/tei_to_dict.py b/parser/tei_to_dict.py new file mode 100644 index 0000000..6d05a1c --- /dev/null +++ b/parser/tei_to_dict.py @@ -0,0 +1,15 @@ +#!/usr/bin/python3 + +from lxml import etree + +def tei_to_dict(s_el): + + +if __name__ == "__main__": + with open("/home/kristjan/kres_mount/kres_parsed/tei/F0025751.xml.parsed.xml") as f: + xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1) + xmlstring = xmlstring.replace(' xml:', ' ') + xml_tree = ElementTree.XML(xmlstring) + + + diff --git a/parser/test.py b/parser/test.py new file mode 100755 index 0000000..f1bd71e --- /dev/null +++ b/parser/test.py @@ -0,0 +1,151 @@ +#!/usr/bin/python2 + +from __future__ import print_function, unicode_literals, division +import sys +import os +import re +import pickle +from pathlib import Path + +try: + from lxml import etree as ElementTree +except ImportError: + import xml.etree.ElementTree as ElementTree + + +# attributes +ID_ATTR = "id" +LEMMA_ATTR = "lemma" +ANA_ATTR = "ana" + + +# tags +SENTENCE_TAG = 's' +BIBL_TAG = 'bibl' +PARAGRAPH_TAG = 'p' +PC_TAG = 'pc' +WORD_TAG = 'w' +C_TAG = 'c' +S_TAG = 'S' +SEG_TAG = 'seg' + + +class Sentence: + def __init__(self, sentence, s_id): + self.id = s_id + self.words = [] + self.text = "" + + for word in sentence: + self.handle_word(word) + + def handle_word(self, word): + # handle space after + if word.tag == S_TAG: + assert(word.text is None) + self.text += ' ' + return + + # ASK am I handling this correctly? 
+ elif word.tag == SEG_TAG: + for segword in word: + self.handle_word(segword) + return + + # ASK handle unknown tags (are there others?) + elif word.tag not in (WORD_TAG, C_TAG): + return + + # ID + idx = str(len(self.words) + 1) + + # TOKEN + token = word.text + + # LEMMA + if word.tag == WORD_TAG: + lemma = word.get(LEMMA_ATTR) + assert(lemma is not None) + else: + lemma = token + + # XPOS + xpos = word.get('msd') + if word.tag == C_TAG: + xpos = "Z" + elif xpos in ("Gp-ppdzn", "Gp-spmzd"): + xpos = "N" + elif xpos is None: + print(self.id) + + # save word entry + self.words.append(['F{}.{}'.format(self.id, idx), token, lemma, xpos]) + + # save for text + self.text += word.text + + + def to_conllu(self): + lines = [] + # lines.append('# sent_id = ' + self.id) + # CONLLu does not like spaces at the end of # text + # lines.append('# text = ' + self.text.strip()) + for word in self.words: + lines.append('\t'.join('_' if data is None else data for data in word)) + + return lines + +def convert_file(in_file, out_file): + print("Nalaganje xml: {}".format(in_file)) + with open(str(in_file), 'r') as fp: + uni_str = fp.read().decode("utf-8") + xmlstring = re.sub(' xmlns="[^"]+"', '', uni_str, count=1) + xmlstring = xmlstring.replace(' xml:', ' ') + print(xmlstring[:1000]) + xml_tree = ElementTree.XML(xmlstring) + + print("Pretvarjanje TEI -> TSV-U ...") + lines = [] + + for pidx, paragraph in enumerate(xml_tree.iterfind('.//body/p')): + sidx = 1 + for sentence in paragraph: + if sentence.tag != SENTENCE_TAG: + continue + + sentence = Sentence(sentence, "{}.{}".format(pidx + 1, sidx)) + lines.extend(sentence.to_conllu()) + lines.append('') # ASK newline between sentences + sidx += 1 + + if len(lines) == 0: + raise RuntimeError("Nobenih stavkov najdenih") + + print("Zapisovanje izhodne datoteke: {}".format(out_file)) + with open(out_file, 'w') as fp: + for line in lines: + if sys.version_info < (3, 0): + line = line.encode('utf-8') + print(line, file=fp) + + +if __name__ 
== "__main__": + """ + Input: folder of TEI files, msds are encoded as msd="Z" + Output: just a folder + """ + + infile = "/home/kristjan/kres_mount/kres_parsed/tei/F0025751.xml.parsed.xml" + outfile = "test.out" + convert_file(infile, outfile) + sys.exit() + + in_folder = sys.argv[1] + out_folder = sys.argv[2] + num_processes = int(sys.argv[3]) + + files = Path(in_folder).rglob("*.xml") + in_out = [] + for filename in files: + out_file = out_folder + "/" + filename.name[:-4] + ".txt" + convert_file(filename, out_file)