Compare commits


No commits in common. "master" and "per-file" have entirely different histories.

16 changed files with 77 additions and 357 deletions

.gitignore vendored
View File

@@ -1,8 +1,5 @@
*.pyc
*.pickle
*.log
nohup.out
data/kres_out/*
data/kres_example/
data/*/*.xml
data/*/*.tsv

View File

@@ -1,22 +1,19 @@
.PHONY: tsv_files srl_tagged_files json_files env clean
.PHONY: tsv_files srl_tagged_files json_files env
all: tools/fillpred_model/model.pickle tsv_files srl_tagged_files json_files
all: json_files
json_files: # srl_tagged_files
json_files: #TODO srl_tagged_files
cd tools; python3 gen_json.py
srl_tagged_files: # tsv_files
srl_tagged_files: tsv_files
# cd tools/srl-20131216; ./scripts/parse_srl_only_mod.sh; cd -
cd tools/srl-20131216; ./tag_all.sh
tsv_files: # tools/fillpred_model/model.pickle
tsv_files: fillpred_model/model.pickle
cd tools; python3 parse_all.py
tools/fillpred_model/model.pickle:
fillpred_model/model.pickle:
cd tools/fillpred_model; $(MAKE)
env:
cd dockerfiles; cd python-java; $(MAKE)
clean:
rm tools/fillpred_model/model.pickle
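
The targets above form a pipeline. As a reading aid, a stage-by-stage invocation might look like the sketch below; the target names come from the Makefile above, while the ordering and comments are my interpretation of it.

``` bash
make env               # build (and enter) the python-java docker image (dockerfiles/python-java)
make tsv_files         # TEI xml -> CoNLL-2009-style tsv via tools/parse_all.py
make srl_tagged_files  # run the srl-20131216 tagger (tools/srl-20131216/tag_all.sh)
make json_files        # collect SRL links into json via tools/gen_json.py
```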

View File

@@ -11,9 +11,7 @@ Check out `./tools/srl-20131216/README.md`.
## Scripts
Check all possible xml tags (that occur after the `<body>` tag).
``` bash
cat F0006347.xml.parsed.xml | grep -A 999999999999 -e '<body>' | grep -o -e '<[^" "]*' | sort | uniq
```
'cat F0006347.xml.parsed.xml | grep -A 999999999999 -e '<body>' | grep -o -e '<[^" "]*' | sort | uniq'
## Tools
* Parser for reading both `SSJ500k 2.1 TEI xml` and `Kres F....xml.parsed.xml` files found in `./tools/parser/parser.py`.
@@ -28,12 +26,6 @@ $ cd ./cjvt-srl-tagging
$ make
```
If you want to run it on a server overnight, you might want to use `nohup`, so you can close the ssh connection without closing the process.
```
$ nohup make &
```
See progress in generated logfile (check git root).
# Makefile
The Makefile follows certain steps:
1. Create a fillpred model.

Binary file not shown.

View File

@@ -7,8 +7,6 @@ default-jdk \
python3 \
python3-pip
RUN apt-get install -y sshfs
RUN pip3 install lxml pandas sklearn
ENV PYTHONIOENCODING UTF-8

View File

@@ -5,16 +5,14 @@ all: build run
build:
docker build . -t $(IMAGE_NAME)
run:
docker run \
-it \
-v /home/${USER}:/home/${USER} \
--user $(shell id -u):$(shell id -g) \
-v /home/${USER}:/home/${USER} \
-v /etc/passwd:/etc/passwd \
-v /etc/group:/etc/group \
-v $(shell pwd)/../../:/cjvt-srl-tagging \
-w /cjvt-srl-tagging \
-v /home/kristjan/kres_mount:/kres_mount:ro \
python-java \
/bin/bash
python-java \
/bin/bash
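
The reworked `run` target adds `--user $(shell id -u):$(shell id -g)` together with the `/etc/passwd` and `/etc/group` mounts, so the container runs as the invoking host user instead of root. A stand-alone equivalent, stripped down to the user-mapping part (the `:ro` suffixes and the single repo mount are my simplifications), would be roughly:

``` bash
docker run -it \
    --user "$(id -u):$(id -g)" \
    -v /etc/passwd:/etc/passwd:ro \
    -v /etc/group:/etc/group:ro \
    -v "$(pwd)":/cjvt-srl-tagging \
    -w /cjvt-srl-tagging \
    python-java /bin/bash
```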

View File

@@ -1,15 +0,0 @@
#!/usr/bin/python3
from lxml import etree
def tei_to_dict(s_el):
if __name__ == "__main__":
with open("/home/kristjan/kres_mount/kres_parsed/tei/F0025751.xml.parsed.xml") as f:
xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1)
xmlstring = xmlstring.replace(' xml:', ' ')
xml_tree = ElementTree.XML(xmlstring)

View File

@@ -1,151 +0,0 @@
#!/usr/bin/python2
from __future__ import print_function, unicode_literals, division
import sys
import os
import re
import pickle
from pathlib import Path
try:
from lxml import etree as ElementTree
except ImportError:
import xml.etree.ElementTree as ElementTree
# attributes
ID_ATTR = "id"
LEMMA_ATTR = "lemma"
ANA_ATTR = "ana"
# tags
SENTENCE_TAG = 's'
BIBL_TAG = 'bibl'
PARAGRAPH_TAG = 'p'
PC_TAG = 'pc'
WORD_TAG = 'w'
C_TAG = 'c'
S_TAG = 'S'
SEG_TAG = 'seg'
class Sentence:
def __init__(self, sentence, s_id):
self.id = s_id
self.words = []
self.text = ""
for word in sentence:
self.handle_word(word)
def handle_word(self, word):
# handle space after
if word.tag == S_TAG:
assert(word.text is None)
self.text += ' '
return
# ASK am I handling this correctly?
elif word.tag == SEG_TAG:
for segword in word:
self.handle_word(segword)
return
# ASK handle unknown tags (are there others?)
elif word.tag not in (WORD_TAG, C_TAG):
return
# ID
idx = str(len(self.words) + 1)
# TOKEN
token = word.text
# LEMMA
if word.tag == WORD_TAG:
lemma = word.get(LEMMA_ATTR)
assert(lemma is not None)
else:
lemma = token
# XPOS
xpos = word.get('msd')
if word.tag == C_TAG:
xpos = "Z"
elif xpos in ("Gp-ppdzn", "Gp-spmzd"):
xpos = "N"
elif xpos is None:
print(self.id)
# save word entry
self.words.append(['F{}.{}'.format(self.id, idx), token, lemma, xpos])
# save for text
self.text += word.text
def to_conllu(self):
lines = []
# lines.append('# sent_id = ' + self.id)
# CONLLu does not like spaces at the end of # text
# lines.append('# text = ' + self.text.strip())
for word in self.words:
lines.append('\t'.join('_' if data is None else data for data in word))
return lines
def convert_file(in_file, out_file):
print("Nalaganje xml: {}".format(in_file))
with open(str(in_file), 'r') as fp:
uni_str = fp.read().decode("utf-8")
xmlstring = re.sub(' xmlns="[^"]+"', '', uni_str, count=1)
xmlstring = xmlstring.replace(' xml:', ' ')
print(xmlstring[:1000])
xml_tree = ElementTree.XML(xmlstring)
print("Pretvarjanje TEI -> TSV-U ...")
lines = []
for pidx, paragraph in enumerate(xml_tree.iterfind('.//body/p')):
sidx = 1
for sentence in paragraph:
if sentence.tag != SENTENCE_TAG:
continue
sentence = Sentence(sentence, "{}.{}".format(pidx + 1, sidx))
lines.extend(sentence.to_conllu())
lines.append('') # ASK newline between sentences
sidx += 1
if len(lines) == 0:
raise RuntimeError("Nobenih stavkov najdenih")
print("Zapisovanje izhodne datoteke: {}".format(out_file))
with open(out_file, 'w') as fp:
for line in lines:
if sys.version_info < (3, 0):
line = line.encode('utf-8')
print(line, file=fp)
if __name__ == "__main__":
"""
Input: folder of TEI files, msds are encoded as msd="Z"
Output: just a folder
"""
infile = "/home/kristjan/kres_mount/kres_parsed/tei/F0025751.xml.parsed.xml"
outfile = "test.out"
convert_file(infile, outfile)
sys.exit()
in_folder = sys.argv[1]
out_folder = sys.argv[2]
num_processes = int(sys.argv[3])
files = Path(in_folder).rglob("*.xml")
in_out = []
for filename in files:
out_file = out_folder + "/" + filename.name[:-4] + ".txt"
convert_file(filename, out_file)

View File

@@ -51,4 +51,4 @@ if __name__ == "__main__":
print(i, df.shape)
print(ndf.head())
ndf.to_pickle(Path(OUTFILE))
ndf.to_pickle(OUTFILE)

View File

@@ -27,6 +27,4 @@ if __name__ == "__main__":
clf_full = DecisionTreeClassifier()
clf_full.fit(X, y)
with open(OUTFILE, "wb") as fp:
pickle.dump(clf_full, fp)
pickle.dump(clf_full, open(OUTFILE, "wb"))

View File

@@ -1,10 +1,11 @@
from pathlib import Path
from parser.parser import Parser
import configparser
import json
import sys
import logging
from multiprocessing import Pool
# defaults
ORIGPATH = Path("../data/kres_example") # we need the IDs
INPATH = Path("../data/kres_example_srl")
OUTPATH = Path("../data/kres_example_json")
# parse config
config = configparser.ConfigParser()
@@ -12,103 +13,45 @@ config.read("tools.cfg")
ORIGPATH = Path(config["tools"]["kres_orig"])
INPATH = Path(config["tools"]["kres_srl"])
OUTPATH = Path(config["tools"]["kres_json"])
DEBUG = config["tools"]["debug"] == "True"
CPU_CORES = int(config["tools"]["cpu_cores"])
LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
LOGFILE.resolve()
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
def get_origfile(filename):
for origfile in ORIGPATH.iterdir():
if filename.name.split('.')[0] == origfile.name.split('.')[0]:
return origfile
raise FileNotFoundError
for origfile in ORIGPATH.iterdir():
if filename.name.split('.')[0] == origfile.name.split('.')[0]:
return origfile
raise FileNotFoundError
def extract_sentences(line_reader):
acc = []
# last char in line is \n, remove it
for line in [x.decode("utf-8")[:-1].split('\t') for x in line_reader]:
if len(line) == 1: # empty line
tmp = acc
acc = []
yield tmp
else:
acc.append(line)
acc = []
for line in [x.decode("utf-8").split('\t') for x in line_reader]:
if line[0] == '\n':
tmp = acc
acc = []
yield tmp
else:
acc.append(line)
def to_sentence(sentence_arr):
return " ".join([token[1] for token in sentence_arr])
def match_sentence_id(sentence, orig_dict):
for k, e in orig_dict.items():
orig_sentence = " ".join(token[2] for token in e["tokens"])
if sentence == orig_sentence:
return k
raise KeyError
def get_dep_rel(token):
logging.debug(token)
for i, field in enumerate(token[14:]):
if field != "_":
return {
"arg": field,
"from": i, # i-th predicate in sentence
"dep": token[0],
}
return None
def handle_file(infile_tpl):
i = infile_tpl[0]
infile = infile_tpl[1]
outfile = (OUTPATH / infile.name).with_suffix(".json")
origfile = get_origfile(infile)
orig_dict = par.parse_tei(origfile)
with infile.open("rb") as fp:
outdata = {}
for sentence_arr in extract_sentences(fp.readlines()):
# tsv dropped sentence ids, match the ID, using original data
sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)
outdata[sid] = []
# find all predicate indices in the sentence
predicates = []
for token in sentence_arr:
if token[12] == "Y":
predicates += [token[0]] # idx
deprel = get_dep_rel(token)
if deprel is not None:
outdata[sid].append(deprel)
# deprel["from"] points to n-th predicate
# replace with predicate's token index
for deprel in outdata[sid]:
deprel["from"] = predicates[deprel["from"]]
if DEBUG:
print(to_sentence(sentence_arr))
print(outdata[sid])
print(sid)
print()
print()
with outfile.open("w") as fp:
json.dump(outdata, fp)
logging.info("SRL relations written to: {}".format(outfile))
def match_sentence_id(string, rd):
str1 = " ".join([token[1] for token in sentence_arr])
for k, e in rd.items():
str2 = " ".join(token[2] for token in dict_entry["tokens"])
if str1 == str2:
return k
raise KeyError
# main
par = Parser()
OUTPATH.mkdir(exist_ok=True)
if __name__ == "__main__":
infiles = list(enumerate([x for x in INPATH.iterdir() if x.is_file()]))
logging.info("Generating JSON SRL files from {} tsv files.".format(len(infiles)))
par = Parser()
with Pool(CPU_CORES) as p:
p.map(handle_file, infiles)
for infile in [x for x in INPATH.iterdir() if x.is_file()]:
origfile = get_origfile(infile)
rd = par.parse_tei(origfile)
logging.info("Finished generating .json files.")
fp = infile.open("rb")
for sentence_arr in extract_sentences(fp.readlines()):
sid = match_sentence_id(sentence_arr, rd)
print(sid)
# OK, we got the sentence id, now generate the predicate map!
outfile = (OUTPATH / infile.name).with_suffix(".json")
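
Since the diff above interleaves the old and new code, here is a consolidated sketch of how the new `handle_file` turns SRL columns into per-sentence links; the column indices (12 for FILLPRED, 0 for the token id, 14+ for APREDs) are read off the diff and should be treated as assumptions about the tsv layout.

``` python
# per-sentence SRL extraction, as done inside the new handle_file
predicates = []   # token ids of predicates, in order of appearance
srl_links = []    # collected {"arg", "from", "dep"} entries

for token in sentence_arr:
    if token[12] == "Y":              # FILLPRED column marks a predicate
        predicates.append(token[0])   # remember its token id
    deprel = get_dep_rel(token)       # first non-"_" APRED field, if any
    if deprel is not None:
        srl_links.append(deprel)

# "from" initially counts the n-th predicate of the sentence;
# rewrite it into that predicate's actual token id
for deprel in srl_links:
    deprel["from"] = predicates[deprel["from"]]
```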

View File

@@ -6,8 +6,10 @@ import re
import sys
import cProfile
import configparser
import logging
from multiprocessing import Pool
# some defaults
INDIR = Path("../data/kres_example")
OUTDIR = Path("../data/kres_example_tsv")
SSJ500K_2_1 = 27829 # number of sentences
par = Parser()
@@ -17,13 +19,6 @@ config = configparser.ConfigParser()
config.read("tools.cfg")
INDIR = Path(config["tools"]["kres_orig"])
OUTDIR = Path(config["tools"]["kres_tsv"])
CPU_CORES = int(config["tools"]["cpu_cores"])
LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
LOGFILE.resolve()
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
"""
print("parsing ssj")
@@ -33,42 +28,22 @@ ssj_dict = par.parse_tei(ssj_file)
print("end parsing ssj")
"""
print("parsing kres")
# kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
OUTDIR.mkdir(exist_ok=True)
infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()]))
logging.info("Parsing kres: {} files.".format(len(infiles)))
for kres_file in [x for x in INDIR.iterdir() if x.is_file()]:
def handle_file(infile):
i = infile[0]
kres_file = infile[1]
outfile = (OUTDIR / kres_file.name).with_suffix(".tsv")
print("Processing file: " + str(kres_file))
res_dict = par.parse_tei(kres_file)
longest_sent = max([len(e["tokens"]) for k, e in res_dict.items()])
print("Longest sentence: ", longest_sent)
kres_out_str = ""
if outfile.is_file():
logging.info("Skipping existing file: {}.".format(str(kres_file)))
return True
for _, sentence in res_dict.items():
kres_out_str += par.to_conll_2009_SRL(sentence, longest_sent)
try:
res_dict = par.parse_tei(kres_file)
kres_out_str = ""
for _, sentence in res_dict.items():
kres_out_str += par.to_conll_2009_SRL(sentence)
except Exception as exc:
logging.info("Failed processing file: {}".format(str(kres_file)))
logging.error(exc)
return False
with outfile.open("wb+") as fp:
with (OUTDIR / kres_file.name).with_suffix(".tsv").open("wb+") as fp:
fp.write(kres_out_str.encode("utf-8"))
logging.info("Processed file ({}/{}): {}".format(i+1, len(infiles), str(kres_file)))
return True
return False
with Pool(CPU_CORES) as p:
p.map(handle_file, infiles)
logging.info("end parsing kres")
fp.close()
print("end parsing kres")
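
Again, the old loop and the new worker are interleaved above; as a reading aid, the new `handle_file` in `parse_all.py` consolidates to roughly the following (names follow the added lines; this is a sketch, not the exact final code):

``` python
def handle_file(infile):
    i, kres_file = infile
    outfile = (OUTDIR / kres_file.name).with_suffix(".tsv")

    # make reruns cheap: skip files that already have output
    if outfile.is_file():
        logging.info("Skipping existing file: {}.".format(kres_file))
        return True

    try:
        res_dict = par.parse_tei(kres_file)
        kres_out_str = ""
        for _, sentence in res_dict.items():
            kres_out_str += par.to_conll_2009_SRL(sentence)
    except Exception as exc:
        # one broken file should not abort the whole run
        logging.info("Failed processing file: {}".format(kres_file))
        logging.error(exc)
        return False

    with outfile.open("wb+") as fp:
        fp.write(kres_out_str.encode("utf-8"))
    logging.info("Processed file ({}/{}): {}".format(i + 1, len(infiles), kres_file))
    return True
```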

View File

@@ -35,11 +35,7 @@ class Msdmap:
def slo_msd_to_eng_long(self, slo_msd):
# old, slow
# return (self.msd_table[self.msd_table["slo_msd"] == slo_msd]["eng_long"]).values[0]
# return self.msd_table.query("slo_msd == '{}'".format(slo_msd))["eng_long"].values[0]
query = self.msd_table.query("slo_msd == '{}'".format(slo_msd))
if query.empty:
return "No-matching-msd-found"
return query["eng_long"].values[0]
return self.msd_table.query("slo_msd == '{}'".format(slo_msd))["eng_long"].values[0]
def slo_msd_to_eng_pos(self, slo_msd):
# first letter in slo_msd == slo_pos

View File

@@ -119,7 +119,7 @@ class Parser:
return res_dict
def to_conll_2009_SRL(self, sentence_entry):
def to_conll_2009_SRL(self, sentence_entry, napreds=9):
def fillpred(tsv_row):
mrow = build_model_row(tsv_row)
@@ -127,6 +127,8 @@ class Parser:
y = self.fillpred_model.predict([x])
return y[0] # bool
apreds_string = '\t'.join(["_" for x in range(napreds)])
# works with kres, with parsed links
out_str = ""
for token in sentence_entry["tokens"]:
@@ -139,7 +141,7 @@
[t_id] +
[form for x in range(7)] +
["0", "0", "modra", "modra", "_", "_"] +
["\n"]
[apreds_string, "\n"]
)
continue
@@ -168,6 +170,7 @@ class Parser:
sentence_entry["links"][t_id][0], # pdeprel
"_", # fillpred
"_", # pred
apreds_string,
"\n",
]
fprd = fillpred(row_list)
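
The new `napreds` argument pads every emitted row with a fixed block of empty APRED columns. A minimal illustration of the padding (the 14-fixed-column CoNLL-2009 layout and the sample values are my assumptions; only `apreds_string` itself comes from the diff):

``` python
napreds = 9  # default from the new to_conll_2009_SRL signature
apreds_string = "\t".join("_" for _ in range(napreds))

# hypothetical punctuation row, shaped like the one built in the diff above
t_id, form = "3", ","
row = "\t".join(
    [t_id] + [form for _ in range(7)]
    + ["0", "0", "modra", "modra", "_", "_"]
    + [apreds_string]
) + "\n"
assert row.count("\t") + 1 == 14 + napreds  # 14 fixed fields + one APRED slot per predicate
```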

View File

@@ -1,8 +1,5 @@
[tools]
kres_orig = /kres_mount/kres_parsed/tei
kres_tsv = ../data/kres_out/1_tsv
kres_srl = ../data/kres_out/2_srl
kres_json = ../data/kres_out/final_json
logfile = ../progress.log
cpu_cores = 5
debug = False
kres_orig = ../data/kres_example
kres_tsv = ../data/kres_example_tsv
kres_srl = ../data/kres_example_srl
kres_json = ../data/kres/example_json

View File

@@ -1,8 +0,0 @@
[tools]
kres_orig = ../data/kres_example
kres_tsv = ../data/kres_out/1_tsv
kres_srl = ../data/kres_out/2_srl
kres_json = ../data/kres_out/final_json
logfile = ../progress.log
cpu_cores = 1
debug = False