forked from kristjan/cjvt-srl-tagging
some changes on server
This commit is contained in:
parent
60ac569f40
commit
dcc2935c3c
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -2,5 +2,5 @@
|
|||
*.pickle
|
||||
*.log
|
||||
|
||||
data/kres_out/
|
||||
data/kres_out/*
|
||||
data/kres_example/
|
||||
|
|
|
@ -5,7 +5,8 @@ RUN apt-get install -y \
|
|||
vim \
|
||||
default-jdk \
|
||||
python3 \
|
||||
python3-pip
|
||||
python3-pip \
|
||||
sshfs
|
||||
|
||||
RUN pip3 install lxml pandas sklearn
|
||||
|
||||
|
|
|
@ -5,14 +5,16 @@ all: build run
|
|||
build:
|
||||
docker build . -t $(IMAGE_NAME)
|
||||
|
||||
|
||||
run:
|
||||
docker run \
|
||||
-it \
|
||||
--user $(shell id -u):$(shell id -g) \
|
||||
-it \
|
||||
-v /home/${USER}:/home/${USER} \
|
||||
--user $(shell id -u):$(shell id -g) \
|
||||
-v /etc/passwd:/etc/passwd \
|
||||
-v /etc/group:/etc/group \
|
||||
-v $(shell pwd)/../../:/cjvt-srl-tagging \
|
||||
-v $(shell pwd)/../../:/cjvt-srl-tagging \
|
||||
-w /cjvt-srl-tagging \
|
||||
python-java \
|
||||
/bin/bash
|
||||
-v /home/kristjan/kres_mount:/kres_mount:ro \
|
||||
python-java \
|
||||
/bin/bash
|
||||
|
|
15
parser/tei_to_dict.py
Normal file
15
parser/tei_to_dict.py
Normal file
|
@ -0,0 +1,15 @@
|
|||
#!/usr/bin/python3
|
||||
|
||||
from lxml import etree
|
||||
|
||||
def tei_to_dict(s_el):
    """Convert a TEI sentence element into a dictionary (not implemented).

    The committed definition had no body, which made the whole module a
    syntax error. Until the conversion is written, calling this fails
    loudly instead of silently returning ``None``.

    Args:
        s_el: presumably an lxml ``<s>`` element -- TODO confirm with the
            author; nothing in this file uses it yet.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError("tei_to_dict is not implemented yet")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Local import: this script only imports ``etree`` at module level.
    import re

    # Development entry point: load one hard-coded sample file and parse it.
    # The file is read as bytes so that lxml can honour an XML encoding
    # declaration (it rejects already-decoded strings that carry one).
    with open("/home/kristjan/kres_mount/kres_parsed/tei/F0025751.xml.parsed.xml", "rb") as f:
        # Drop the default namespace so elements can be looked up by their
        # bare tag names.
        xmlstring = re.sub(b' xmlns="[^"]+"', b'', f.read(), count=1)
    # Turn ``xml:id`` style attributes into plain ``id`` attributes.
    xmlstring = xmlstring.replace(b' xml:', b' ')
    xml_tree = etree.XML(xmlstring)
|
||||
|
||||
|
||||
|
151
parser/test.py
Executable file
151
parser/test.py
Executable file
|
@ -0,0 +1,151 @@
|
|||
#!/usr/bin/python2
|
||||
|
||||
from __future__ import print_function, unicode_literals, division
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
import pickle
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
from lxml import etree as ElementTree
|
||||
except ImportError:
|
||||
import xml.etree.ElementTree as ElementTree
|
||||
|
||||
|
||||
# attributes
|
||||
ID_ATTR = "id"
|
||||
LEMMA_ATTR = "lemma"
|
||||
ANA_ATTR = "ana"
|
||||
|
||||
|
||||
# tags
|
||||
SENTENCE_TAG = 's'
|
||||
BIBL_TAG = 'bibl'
|
||||
PARAGRAPH_TAG = 'p'
|
||||
PC_TAG = 'pc'
|
||||
WORD_TAG = 'w'
|
||||
C_TAG = 'c'
|
||||
S_TAG = 'S'
|
||||
SEG_TAG = 'seg'
|
||||
|
||||
|
||||
class Sentence(object):
    """One TEI sentence flattened into CoNLL-U-like token rows.

    Attributes:
        id: sentence identifier passed in by the caller.
        words: list of ``[token_id, token, lemma, xpos]`` rows, one per
            word or punctuation element, in document order.
        text: surface text rebuilt from the tokens and space markers.
    """

    def __init__(self, sentence, s_id):
        """Collect all tokens found among the children of *sentence*.

        Args:
            sentence: iterable of XML child elements of a sentence element.
            s_id: identifier used as prefix of the generated token ids.
        """
        self.id = s_id
        self.words = []
        self.text = ""

        for word in sentence:
            self.handle_word(word)

    def handle_word(self, word):
        """Process one child element and update ``words`` and ``text``.

        Space markers append a blank to ``text``, segments are descended
        into recursively, word and punctuation elements become a row in
        ``words``; any other element is ignored.

        Args:
            word: XML element exposing ``tag``, ``text`` and ``get``.
        """
        tag = word.tag

        # handle space after
        if tag == S_TAG:
            assert word.text is None
            self.text += ' '
            return

        # ASK am I handling this correctly?
        if tag == SEG_TAG:
            for segword in word:
                self.handle_word(segword)
            return

        # ASK handle unknown tags (are there others?)
        if tag not in (WORD_TAG, C_TAG):
            return

        # ID: 1-based position of the token inside the sentence
        idx = str(len(self.words) + 1)

        # TOKEN (None for an empty element; rendered as '_' by to_conllu)
        token = word.text

        # LEMMA: words must carry one, punctuation is its own lemma
        if tag == WORD_TAG:
            lemma = word.get(LEMMA_ATTR)
            assert lemma is not None
        else:
            lemma = token

        # XPOS
        xpos = word.get('msd')
        if tag == C_TAG:
            xpos = "Z"
        elif xpos in ("Gp-ppdzn", "Gp-spmzd"):
            xpos = "N"
        elif xpos is None:
            # Report on stderr so the diagnostic is recognisable and does
            # not mix with the progress messages on stdout.
            print("Missing msd in sentence {}".format(self.id),
                  file=sys.stderr)

        # save word entry
        self.words.append(['F{}.{}'.format(self.id, idx), token, lemma, xpos])

        # save for text; an element without text contributes nothing
        # (concatenating None used to raise TypeError here)
        self.text += token or ""

    def to_conllu(self):
        """Return the sentence as a list of tab-separated lines.

        Returns:
            One string per token with the columns id, token, lemma and
            xpos; missing values are written as ``_``.
        """
        # The '# sent_id' / '# text' comment lines are intentionally not
        # emitted (CoNLL-U does not like trailing spaces in '# text').
        return [
            '\t'.join('_' if data is None else data for data in word)
            for word in self.words
        ]
|
||||
|
||||
def convert_file(in_file, out_file):
    """Convert one TEI XML file into a tab-separated token file.

    Every ``<s>`` child of every ``body/p`` paragraph becomes a block of
    token lines (see ``Sentence.to_conllu``) followed by an empty line.
    Sentence ids have the form ``<paragraph>.<sentence>``, both 1-based.

    Args:
        in_file: path of the TEI XML input file.
        out_file: path of the output file; it is written as UTF-8.

    Raises:
        RuntimeError: if the input contains no sentences.
    """
    # Local import: ``io.open`` gives an encoding-aware text file on both
    # Python 2 and Python 3.
    import io

    print("Nalaganje xml: {}".format(in_file))
    # Read raw bytes: ``str.decode`` does not exist on Python 3, and the XML
    # parser can decode the document itself (lxml refuses already-decoded
    # strings that carry an encoding declaration).
    with open(str(in_file), 'rb') as fp:
        raw = fp.read()
    # Drop the default namespace and the ``xml:`` attribute prefix so that
    # elements and attributes can be addressed by their bare names.
    raw = re.sub(b' xmlns="[^"]+"', b'', raw, count=1)
    raw = raw.replace(b' xml:', b' ')
    # Debug preview of the beginning of the document.
    print(raw[:1000].decode("utf-8", "replace"))
    xml_tree = ElementTree.XML(raw)

    print("Pretvarjanje TEI -> TSV-U ...")
    lines = []

    for pidx, paragraph in enumerate(xml_tree.iterfind('.//body/p'), 1):
        sidx = 0
        for child in paragraph:
            if child.tag != SENTENCE_TAG:
                continue

            sidx += 1
            sentence = Sentence(child, "{}.{}".format(pidx, sidx))
            lines.extend(sentence.to_conllu())
            lines.append('')  # ASK newline between sentences

    if not lines:
        raise RuntimeError("Nobenih stavkov najdenih")

    print("Zapisovanje izhodne datoteke: {}".format(out_file))
    # Write UTF-8 explicitly instead of relying on the locale encoding.
    with io.open(out_file, 'w', encoding='utf-8') as fp:
        for line in lines:
            if isinstance(line, bytes):
                # Python 2 may hand back byte strings for pure-ASCII text.
                line = line.decode('utf-8')
            fp.write(line + '\n')
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Usage: test.py [IN_FOLDER OUT_FOLDER [NUM_PROCESSES]]
    #
    # Input: folder of TEI files, msds are encoded as msd="Z"
    # Output: just a folder
    #
    # Without arguments a single hard-coded sample file is converted to
    # "test.out" (the previous behaviour of this script). Earlier the sample
    # run ended in an unconditional sys.exit(), which made the folder mode
    # below unreachable.
    if len(sys.argv) < 3:
        infile = "/home/kristjan/kres_mount/kres_parsed/tei/F0025751.xml.parsed.xml"
        outfile = "test.out"
        convert_file(infile, outfile)
        sys.exit()

    in_folder = sys.argv[1]
    out_folder = sys.argv[2]
    # Still accepted for command-line compatibility; the conversion below
    # runs sequentially and does not use it.
    num_processes = int(sys.argv[3]) if len(sys.argv) > 3 else 1

    if not os.path.isdir(out_folder):
        os.makedirs(out_folder)

    for filename in Path(in_folder).rglob("*.xml"):
        # "F001.xml" -> "<out_folder>/F001.txt"
        out_file = os.path.join(out_folder, filename.stem + ".txt")
        convert_file(filename, out_file)
|
Loading…
Reference in New Issue
Block a user