Compare commits


No commits in common. "master" and "per-file" have entirely different histories.

16 changed files with 77 additions and 357 deletions

.gitignore vendored
View File

@@ -1,8 +1,5 @@
*.pyc
*.pickle
*.log
nohup.out
data/kres_out/*
data/kres_example/
data/*/*.xml
data/*/*.tsv

View File

@@ -1,22 +1,19 @@
.PHONY: tsv_files srl_tagged_files json_files env clean
.PHONY: tsv_files srl_tagged_files json_files env
all: tools/fillpred_model/model.pickle tsv_files srl_tagged_files json_files
all: json_files
json_files: # srl_tagged_files
json_files: #TODO srl_tagged_files
cd tools; python3 gen_json.py
srl_tagged_files: # tsv_files
srl_tagged_files: tsv_files
# cd tools/srl-20131216; ./scripts/parse_srl_only_mod.sh; cd -
cd tools/srl-20131216; ./tag_all.sh
tsv_files: # tools/fillpred_model/model.pickle
tsv_files: fillpred_model/model.pickle
cd tools; python3 parse_all.py
tools/fillpred_model/model.pickle:
fillpred_model/model.pickle:
cd tools/fillpred_model; $(MAKE)
env:
cd dockerfiles; cd python-java; $(MAKE)
clean:
rm tools/fillpred_model/model.pickle
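
The targets above form a pipeline. As a reading aid, a stage-by-stage invocation might look like the sketch below; the target names come from the Makefile above, while the ordering and comments are my interpretation of it.

``` bash
make env               # build (and enter) the python-java docker image (dockerfiles/python-java)
make tsv_files         # TEI xml -> CoNLL-2009-style tsv via tools/parse_all.py
make srl_tagged_files  # run the srl-20131216 tagger (tools/srl-20131216/tag_all.sh)
make json_files        # collect SRL links into json via tools/gen_json.py
```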

View File

@@ -11,9 +11,7 @@ Check out `./tools/srl-20131216/README.md`.
## Scripts
Check all possible xml tags (that occur after the `<body>` tag).
``` bash
cat F0006347.xml.parsed.xml | grep -A 999999999999 -e '<body>' | grep -o -e '<[^" "]*' | sort | uniq
```
'cat F0006347.xml.parsed.xml | grep -A 999999999999 -e '<body>' | grep -o -e '<[^" "]*' | sort | uniq'
## Tools
* Parser for reading both `SSJ500k 2.1 TEI xml` and `Kres F....xml.parsed.xml` files found in `./tools/parser/parser.py`.
@@ -28,12 +26,6 @@ $ cd ./cjvt-srl-tagging
$ make
```
If you want to run it on a server overnight, you might want to use `nohup`, so you can close the ssh connection without closing the process.
```
$ nohup make &
```
See progress in generated logfile (check git root).
# Makefile
The Makefile follows certain steps:
1. Create a fillpred model.

Binary file not shown.

View File

@@ -7,8 +7,6 @@ default-jdk \
python3 \
python3-pip
RUN apt-get install -y sshfs
RUN pip3 install lxml pandas sklearn
ENV PYTHONIOENCODING UTF-8

View File

@@ -5,16 +5,14 @@ all: build run
build:
docker build . -t $(IMAGE_NAME)
run:
docker run \
-it \
-v /home/${USER}:/home/${USER} \
--user $(shell id -u):$(shell id -g) \
-v /home/${USER}:/home/${USER} \
-v /etc/passwd:/etc/passwd \
-v /etc/group:/etc/group \
-v $(shell pwd)/../../:/cjvt-srl-tagging \
-w /cjvt-srl-tagging \
-v /home/kristjan/kres_mount:/kres_mount:ro \
python-java \
/bin/bash
python-java \
/bin/bash
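
The reworked `run` target adds `--user $(shell id -u):$(shell id -g)` together with the `/etc/passwd` and `/etc/group` mounts, so the container runs as the invoking host user instead of root. A stand-alone equivalent, stripped down to the user-mapping part (the `:ro` suffixes and the single repo mount are my simplifications), would be roughly:

``` bash
docker run -it \
    --user "$(id -u):$(id -g)" \
    -v /etc/passwd:/etc/passwd:ro \
    -v /etc/group:/etc/group:ro \
    -v "$(pwd)":/cjvt-srl-tagging \
    -w /cjvt-srl-tagging \
    python-java /bin/bash
```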

View File

@@ -1,15 +0,0 @@
#!/usr/bin/python3
from lxml import etree
def tei_to_dict(s_el):
if __name__ == "__main__":
with open("/home/kristjan/kres_mount/kres_parsed/tei/F0025751.xml.parsed.xml") as f:
xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1)
xmlstring = xmlstring.replace(' xml:', ' ')
xml_tree = ElementTree.XML(xmlstring)

View File

@@ -1,151 +0,0 @@
#!/usr/bin/python2
from __future__ import print_function, unicode_literals, division
import sys
import os
import re
import pickle
from pathlib import Path
try:
from lxml import etree as ElementTree
except ImportError:
import xml.etree.ElementTree as ElementTree
# attributes
ID_ATTR = "id"
LEMMA_ATTR = "lemma"
ANA_ATTR = "ana"
# tags
SENTENCE_TAG = 's'
BIBL_TAG = 'bibl'
PARAGRAPH_TAG = 'p'
PC_TAG = 'pc'
WORD_TAG = 'w'
C_TAG = 'c'
S_TAG = 'S'
SEG_TAG = 'seg'
class Sentence:
def __init__(self, sentence, s_id):
self.id = s_id
self.words = []
self.text = ""
for word in sentence:
self.handle_word(word)
def handle_word(self, word):
# handle space after
if word.tag == S_TAG:
assert(word.text is None)
self.text += ' '
return
# ASK am I handling this correctly?
elif word.tag == SEG_TAG:
for segword in word:
self.handle_word(segword)
return
# ASK handle unknown tags (are there others?)
elif word.tag not in (WORD_TAG, C_TAG):
return
# ID
idx = str(len(self.words) + 1)
# TOKEN
token = word.text
# LEMMA
if word.tag == WORD_TAG:
lemma = word.get(LEMMA_ATTR)
assert(lemma is not None)
else:
lemma = token
# XPOS
xpos = word.get('msd')
if word.tag == C_TAG:
xpos = "Z"
elif xpos in ("Gp-ppdzn", "Gp-spmzd"):
xpos = "N"
elif xpos is None:
print(self.id)
# save word entry
self.words.append(['F{}.{}'.format(self.id, idx), token, lemma, xpos])
# save for text
self.text += word.text
def to_conllu(self):
lines = []
# lines.append('# sent_id = ' + self.id)
# CONLLu does not like spaces at the end of # text
# lines.append('# text = ' + self.text.strip())
for word in self.words:
lines.append('\t'.join('_' if data is None else data for data in word))
return lines
def convert_file(in_file, out_file):
print("Nalaganje xml: {}".format(in_file))
with open(str(in_file), 'r') as fp:
uni_str = fp.read().decode("utf-8")
xmlstring = re.sub(' xmlns="[^"]+"', '', uni_str, count=1)
xmlstring = xmlstring.replace(' xml:', ' ')
print(xmlstring[:1000])
xml_tree = ElementTree.XML(xmlstring)
print("Pretvarjanje TEI -> TSV-U ...")
lines = []
for pidx, paragraph in enumerate(xml_tree.iterfind('.//body/p')):
sidx = 1
for sentence in paragraph:
if sentence.tag != SENTENCE_TAG:
continue
sentence = Sentence(sentence, "{}.{}".format(pidx + 1, sidx))
lines.extend(sentence.to_conllu())
lines.append('') # ASK newline between sentences
sidx += 1
if len(lines) == 0:
raise RuntimeError("Nobenih stavkov najdenih")
print("Zapisovanje izhodne datoteke: {}".format(out_file))
with open(out_file, 'w') as fp:
for line in lines:
if sys.version_info < (3, 0):
line = line.encode('utf-8')
print(line, file=fp)
if __name__ == "__main__":
"""
Input: folder of TEI files, msds are encoded as msd="Z"
Output: just a folder
"""
infile = "/home/kristjan/kres_mount/kres_parsed/tei/F0025751.xml.parsed.xml"
outfile = "test.out"
convert_file(infile, outfile)
sys.exit()
in_folder = sys.argv[1]
out_folder = sys.argv[2]
num_processes = int(sys.argv[3])
files = Path(in_folder).rglob("*.xml")
in_out = []
for filename in files:
out_file = out_folder + "/" + filename.name[:-4] + ".txt"
convert_file(filename, out_file)

View File

@@ -51,4 +51,4 @@ if __name__ == "__main__":
print(i, df.shape)
print(ndf.head())
ndf.to_pickle(Path(OUTFILE))
ndf.to_pickle(OUTFILE)

View File

@@ -27,6 +27,4 @@ if __name__ == "__main__":
clf_full = DecisionTreeClassifier()
clf_full.fit(X, y)
with open(OUTFILE, "wb") as fp:
pickle.dump(clf_full, fp)
pickle.dump(clf_full, open(OUTFILE, "wb"))

View File

@@ -1,10 +1,11 @@
from pathlib import Path
from parser.parser import Parser
import configparser
import json
import sys
import logging
from multiprocessing import Pool
# defaults
ORIGPATH = Path("../data/kres_example") # we need the IDs
INPATH = Path("../data/kres_example_srl")
OUTPATH = Path("../data/kres_example_json")
# parse config
config = configparser.ConfigParser()
@@ -12,103 +13,45 @@ config.read("tools.cfg")
ORIGPATH = Path(config["tools"]["kres_orig"])
INPATH = Path(config["tools"]["kres_srl"])
OUTPATH = Path(config["tools"]["kres_json"])
DEBUG = config["tools"]["debug"] == "True"
CPU_CORES = int(config["tools"]["cpu_cores"])
LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
LOGFILE.resolve()
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
def get_origfile(filename):
for origfile in ORIGPATH.iterdir():
if filename.name.split('.')[0] == origfile.name.split('.')[0]:
return origfile
raise FileNotFoundError
for origfile in ORIGPATH.iterdir():
if filename.name.split('.')[0] == origfile.name.split('.')[0]:
return origfile
raise FileNotFoundError
def extract_sentences(line_reader):
acc = []
# last char in line is \n, remove it
for line in [x.decode("utf-8")[:-1].split('\t') for x in line_reader]:
if len(line) == 1: # empty line
tmp = acc
acc = []
yield tmp
else:
acc.append(line)
acc = []
for line in [x.decode("utf-8").split('\t') for x in line_reader]:
if line[0] == '\n':
tmp = acc
acc = []
yield tmp
else:
acc.append(line)
def to_sentence(sentence_arr):
return " ".join([token[1] for token in sentence_arr])
def match_sentence_id(sentence, orig_dict):
for k, e in orig_dict.items():
orig_sentence = " ".join(token[2] for token in e["tokens"])
if sentence == orig_sentence:
return k
raise KeyError
def get_dep_rel(token):
logging.debug(token)
for i, field in enumerate(token[14:]):
if field != "_":
return {
"arg": field,
"from": i, # i-th predicate in sentence
"dep": token[0],
}
return None
def handle_file(infile_tpl):
i = infile_tpl[0]
infile = infile_tpl[1]
outfile = (OUTPATH / infile.name).with_suffix(".json")
origfile = get_origfile(infile)
orig_dict = par.parse_tei(origfile)
with infile.open("rb") as fp:
outdata = {}
for sentence_arr in extract_sentences(fp.readlines()):
# tsv dropped sentence ids, match the ID, using original data
sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)
outdata[sid] = []
# find all predicate indices in the sentence
predicates = []
for token in sentence_arr:
if token[12] == "Y":
predicates += [token[0]] # idx
deprel = get_dep_rel(token)
if deprel is not None:
outdata[sid].append(deprel)
# deprel["from"] points to n-th predicate
# replace with predicate's token index
for deprel in outdata[sid]:
deprel["from"] = predicates[deprel["from"]]
if DEBUG:
print(to_sentence(sentence_arr))
print(outdata[sid])
print(sid)
print()
print()
with outfile.open("w") as fp:
json.dump(outdata, fp)
logging.info("SRL relations written to: {}".format(outfile))
def match_sentence_id(string, rd):
str1 = " ".join([token[1] for token in sentence_arr])
for k, e in rd.items():
str2 = " ".join(token[2] for token in dict_entry["tokens"])
if str1 == str2:
return k
raise KeyError
# main
par = Parser()
OUTPATH.mkdir(exist_ok=True)
if __name__ == "__main__":
infiles = list(enumerate([x for x in INPATH.iterdir() if x.is_file()]))
logging.info("Generating JSON SRL files from {} tsv files.".format(len(infiles)))
par = Parser()
with Pool(CPU_CORES) as p:
p.map(handle_file, infiles)
for infile in [x for x in INPATH.iterdir() if x.is_file()]:
origfile = get_origfile(infile)
rd = par.parse_tei(origfile)
logging.info("Finished generating .json files.")
fp = infile.open("rb")
for sentence_arr in extract_sentences(fp.readlines()):
sid = match_sentence_id(sentence_arr, rd)
print(sid)
# OK, we got the sentence id, now generate the predicate map!
outfile = (OUTPATH / infile.name).with_suffix(".json")
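
Since the diff above interleaves the old and new code, here is a consolidated sketch of how the new `handle_file` turns SRL columns into per-sentence links; the column indices (12 for FILLPRED, 0 for the token id, 14+ for APREDs) are read off the diff and should be treated as assumptions about the tsv layout.

``` python
# per-sentence SRL extraction, as done inside the new handle_file
predicates = []   # token ids of predicates, in order of appearance
srl_links = []    # collected {"arg", "from", "dep"} entries

for token in sentence_arr:
    if token[12] == "Y":              # FILLPRED column marks a predicate
        predicates.append(token[0])   # remember its token id
    deprel = get_dep_rel(token)       # first non-"_" APRED field, if any
    if deprel is not None:
        srl_links.append(deprel)

# "from" initially counts the n-th predicate of the sentence;
# rewrite it into that predicate's actual token id
for deprel in srl_links:
    deprel["from"] = predicates[deprel["from"]]
```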

View File

@@ -6,8 +6,10 @@ import re
import sys
import cProfile
import configparser
import logging
from multiprocessing import Pool
# some defaults
INDIR = Path("../data/kres_example")
OUTDIR = Path("../data/kres_example_tsv")
SSJ500K_2_1 = 27829 # number of sentences
par = Parser()
@@ -17,13 +19,6 @@ config = configparser.ConfigParser()
config.read("tools.cfg")
INDIR = Path(config["tools"]["kres_orig"])
OUTDIR = Path(config["tools"]["kres_tsv"])
CPU_CORES = int(config["tools"]["cpu_cores"])
LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
LOGFILE.resolve()
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
"""
print("parsing ssj")
@@ -33,42 +28,22 @@ ssj_dict = par.parse_tei(ssj_file)
print("end parsing ssj")
"""
print("parsing kres")
# kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
OUTDIR.mkdir(exist_ok=True)
infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()]))
logging.info("Parsing kres: {} files.".format(len(infiles)))
for kres_file in [x for x in INDIR.iterdir() if x.is_file()]:
def handle_file(infile):
i = infile[0]
kres_file = infile[1]
outfile = (OUTDIR / kres_file.name).with_suffix(".tsv")
print("Processing file: " + str(kres_file))
res_dict = par.parse_tei(kres_file)
longest_sent = max([len(e["tokens"]) for k, e in res_dict.items()])
print("Longest sentence: ", longest_sent)
kres_out_str = ""
if outfile.is_file():
logging.info("Skipping existing file: {}.".format(str(kres_file)))
return True
for _, sentence in res_dict.items():
kres_out_str += par.to_conll_2009_SRL(sentence, longest_sent)
try:
res_dict = par.parse_tei(kres_file)
kres_out_str = ""
for _, sentence in res_dict.items():
kres_out_str += par.to_conll_2009_SRL(sentence)
except Exception as exc:
logging.info("Failed processing file: {}".format(str(kres_file)))
logging.error(exc)
return False
with outfile.open("wb+") as fp:
with (OUTDIR / kres_file.name).with_suffix(".tsv").open("wb+") as fp:
fp.write(kres_out_str.encode("utf-8"))
logging.info("Processed file ({}/{}): {}".format(i+1, len(infiles), str(kres_file)))
return True
return False
with Pool(CPU_CORES) as p:
p.map(handle_file, infiles)
logging.info("end parsing kres")
fp.close()
print("end parsing kres")
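
Again, the old loop and the new worker are interleaved above; as a reading aid, the new `handle_file` in `parse_all.py` consolidates to roughly the following (names follow the added lines; this is a sketch, not the exact final code):

``` python
def handle_file(infile):
    i, kres_file = infile
    outfile = (OUTDIR / kres_file.name).with_suffix(".tsv")

    # make reruns cheap: skip files that already have output
    if outfile.is_file():
        logging.info("Skipping existing file: {}.".format(kres_file))
        return True

    try:
        res_dict = par.parse_tei(kres_file)
        kres_out_str = ""
        for _, sentence in res_dict.items():
            kres_out_str += par.to_conll_2009_SRL(sentence)
    except Exception as exc:
        # one broken file should not abort the whole run
        logging.info("Failed processing file: {}".format(kres_file))
        logging.error(exc)
        return False

    with outfile.open("wb+") as fp:
        fp.write(kres_out_str.encode("utf-8"))
    logging.info("Processed file ({}/{}): {}".format(i + 1, len(infiles), kres_file))
    return True
```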

View File

@@ -35,11 +35,7 @@ class Msdmap:
def slo_msd_to_eng_long(self, slo_msd):
# old, slow
# return (self.msd_table[self.msd_table["slo_msd"] == slo_msd]["eng_long"]).values[0]
# return self.msd_table.query("slo_msd == '{}'".format(slo_msd))["eng_long"].values[0]
query = self.msd_table.query("slo_msd == '{}'".format(slo_msd))
if query.empty:
return "No-matching-msd-found"
return query["eng_long"].values[0]
return self.msd_table.query("slo_msd == '{}'".format(slo_msd))["eng_long"].values[0]
def slo_msd_to_eng_pos(self, slo_msd):
# first letter in slo_msd == slo_pos

View File

@@ -119,7 +119,7 @@ class Parser:
return res_dict
def to_conll_2009_SRL(self, sentence_entry):
def to_conll_2009_SRL(self, sentence_entry, napreds=9):
def fillpred(tsv_row):
mrow = build_model_row(tsv_row)
@@ -127,6 +127,8 @@ class Parser:
y = self.fillpred_model.predict([x])
return y[0] # bool
apreds_string = '\t'.join(["_" for x in range(napreds)])
# works with kres, with parsed links
out_str = ""
for token in sentence_entry["tokens"]:
@@ -139,7 +141,7 @@
[t_id] +
[form for x in range(7)] +
["0", "0", "modra", "modra", "_", "_"] +
["\n"]
[apreds_string, "\n"]
)
continue
@@ -168,6 +170,7 @@ class Parser:
sentence_entry["links"][t_id][0], # pdeprel
"_", # fillpred
"_", # pred
apreds_string,
"\n",
]
fprd = fillpred(row_list)
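
The new `napreds` argument pads every emitted row with a fixed block of empty APRED columns. A minimal illustration of the padding (the 14-fixed-column CoNLL-2009 layout and the sample values are my assumptions; only `apreds_string` itself comes from the diff):

``` python
napreds = 9  # default from the new to_conll_2009_SRL signature
apreds_string = "\t".join("_" for _ in range(napreds))

# hypothetical punctuation row, shaped like the one built in the diff above
t_id, form = "3", ","
row = "\t".join(
    [t_id] + [form for _ in range(7)]
    + ["0", "0", "modra", "modra", "_", "_"]
    + [apreds_string]
) + "\n"
assert row.count("\t") + 1 == 14 + napreds  # 14 fixed fields + one APRED slot per predicate
```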

View File

@@ -1,8 +1,5 @@
[tools]
kres_orig = /kres_mount/kres_parsed/tei
kres_tsv = ../data/kres_out/1_tsv
kres_srl = ../data/kres_out/2_srl
kres_json = ../data/kres_out/final_json
logfile = ../progress.log
cpu_cores = 5
debug = False
kres_orig = ../data/kres_example
kres_tsv = ../data/kres_example_tsv
kres_srl = ../data/kres_example_srl
kres_json = ../data/kres/example_json

View File

@@ -1,8 +0,0 @@
[tools]
kres_orig = ../data/kres_example
kres_tsv = ../data/kres_out/1_tsv
kres_srl = ../data/kres_out/2_srl
kres_json = ../data/kres_out/final_json
logfile = ../progress.log
cpu_cores = 1
debug = False