diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ecb8742
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+# Unpacked sample corpora (extracted from data/samples.tar.gz)
+data/samples/
+# Python bytecode caches, at any depth
+__pycache__/
+# Metadata directories created by `pip3 install -e`
+*egg-info/
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..51a0afe
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,39 @@
+# Top-level entry points: everything required to create and fill a
+# database and to bring up the backend and the frontend.
+
+# Absolute path of the directory make was started in (the project root).
+# Exported below so dockerfiles/dev-env/Makefile can mount it.
+MAKE_ROOT := $(CURDIR)
+
+### Input data
+# ssj500k comes as a single .xml file,
+# kres is composed of many .xml files.
+# SRL tags for kres were generated into separate .json files
+# (for each kres.xml file there is a kres.json file with srl tags).
+# Values are stored unquoted; recipes quote them where they are used.
+SSJ_FILE := $(MAKE_ROOT)/data/samples/ssj_example/ssj500k-sl.body.sample.xml
+KRES_FOLDER := $(MAKE_ROOT)/data/samples/kres_example
+KRES_SRL_FOLDER := $(MAKE_ROOT)/data/kres_srl
+# Export every variable to recipes and sub-makes.
+export
+
+.PHONY: all dev-env preflight
+
+all:
+	echo "Select an argument"
+
+# prereq (environment): build the dev-env image and open a shell in it
+dev-env:
+	$(MAKE) -C dockerfiles/dev-env
+
+# run these inside dev-env container
+# unpacks the sample corpora shipped in data/samples.tar.gz
+data/samples:
+	tar xzvf data/samples.tar.gz -C data
+
+# installs our python code as packages, then parses the corpora
+# when debugging, run this once, then run python3 ... by hand
+preflight: data/samples
+	pip3 install -e src/pkg/corpusparser/.
+	python3 src/preflight/main_parse.py --kres-folder "$(KRES_FOLDER)" \
+		--ssj-file "$(SSJ_FILE)" --kres-srl-folder "$(KRES_SRL_FOLDER)"
diff --git a/conf/main.conf b/conf/main.conf
deleted file mode 100644
index e69de29..0000000
diff --git a/data/kres_srl b/data/kres_srl
new file mode 120000
index 0000000..f1acfc4
--- /dev/null
+++ b/data/kres_srl
@@ -0,0 +1 @@
+/home/kristjan/kres_srl/final_json/
\ No newline at end of file
diff --git a/data/samples.tar.gz b/data/samples.tar.gz
new file mode 100644
index 0000000..ba89432
Binary files /dev/null and b/data/samples.tar.gz differ
diff --git a/dockerfiles/dev-env/Dockerfile b/dockerfiles/dev-env/Dockerfile
new file mode 100644
index 0000000..3a494d2
--- /dev/null
+++ b/dockerfiles/dev-env/Dockerfile
@@ -0,0 +1,21 @@
+FROM ubuntu:16.04
+
+# Refresh the package index and install in the same layer so a cached,
+# stale index is never reused for a changed package list.
+RUN apt-get update --fix-missing && apt-get install -y \
+    vim \
+    python3 \
+    python3-pip \
+    sshfs \
+    && rm -rf /var/lib/apt/lists/*
+
+# Third-party Python packages. argparse and pathlib are part of the
+# Python 3 standard library and need no install; scikit-learn is the
+# real distribution name behind the deprecated "sklearn" alias.
+RUN pip3 install \
+    lxml \
+    pandas \
+    scikit-learn
+
+# Make Python's stdin/stdout/stderr use UTF-8 regardless of the locale.
+ENV PYTHONIOENCODING=UTF-8
diff --git a/dockerfiles/dev-env/Makefile b/dockerfiles/dev-env/Makefile
new file mode 100644
index 0000000..7f5dd50
--- /dev/null
+++ b/dockerfiles/dev-env/Makefile
@@ -0,0 +1,28 @@
+# Build the development image and open an interactive shell in it.
+# Normally invoked from the top-level Makefile, which exports MAKE_ROOT.
+
+IMAGE_NAME := cjvt-dev-env
+# Project root mounted at /project; falls back to two directories up so
+# this Makefile also works when run directly from dockerfiles/dev-env.
+MAKE_ROOT ?= $(abspath $(CURDIR)/../..)
+
+.PHONY: all build run
+
+all: build run
+
+build:
+	docker build . -t $(IMAGE_NAME)
+
+# Run as the calling user (passwd/group are mounted read-only so the uid
+# and gid resolve to names) with the home directory and project mounted.
+run:
+	docker run \
+		-it \
+		-v "/home/${USER}:/home/${USER}" \
+		--user "$(shell id -u):$(shell id -g)" \
+		-v /etc/passwd:/etc/passwd:ro \
+		-v /etc/group:/etc/group:ro \
+		-v "$(MAKE_ROOT):/project" \
+		-w /project \
+		$(IMAGE_NAME) \
+		/bin/bash
diff --git a/src/pkg/anchor b/src/pkg/anchor
deleted file mode 100644
index e69de29..0000000
diff --git a/src/pkg/corpusparser/corpusparser/Parser.py b/src/pkg/corpusparser/corpusparser/Parser.py
new file mode 100644
index 0000000..dc93554
--- /dev/null
+++ b/src/pkg/corpusparser/corpusparser/Parser.py
@@ -0,0 +1,186 @@
+from corpusparser import Sentence
+from pathlib import Path
+import re
+from lxml import etree
+
+
+class Parser():
+    """Read a corpus file (.xml; kres or ssj500k) and parse its sentences.
+
+    Every sentence becomes a plain python dict with the keys "sid", "text",
+    "tokens" and "jos_links".
+    """
+
+    def __init__(self, corpus, infiles):
+        """Store the corpus type and the paths of its input files.
+
+        Args:
+            corpus: "kres" or "ssj".
+            infiles: list of paths; [kres folder, kres srl folder] for
+                "kres", [ssj500k xml file] for "ssj".
+
+        Raises:
+            ValueError: if corpus is neither "kres" nor "ssj".
+        """
+        if corpus == "kres":
+            self.kres_folder = Path(infiles[0])
+            self.kres_srl_folder = Path(infiles[1])
+        elif corpus == "ssj":
+            self.ssj_file = Path(infiles[0])
+        else:
+            raise ValueError("Argument corpus should be 'ssj' or 'kres'.")
+
+        self.corpus = corpus
+        # Element tags handled as words, as non-word tokens and as spaces.
+        self.W_TAGS = ['w']
+        self.C_TAGS = ['c']
+        # NOTE(review): 'pc' is only ever rendered as a space here, its
+        # own text is dropped - confirm this is intended for ssj500k.
+        self.S_TAGS = ['S', 'pc']
+
+    def parse_jos_links(self, sent_el):
+        """Return the sentence's links using the parser for self.corpus."""
+        if self.corpus == "kres":
+            return self.parse_jos_links_kres(sent_el)
+        return self.parse_jos_links_ssj(sent_el)
+
+    def parse_ssj_target_arg(self, text):
+        """Convert a link's "target" attribute into token numbers.
+
+        Every whitespace separated reference is cut down to the part after
+        its last "."; "t<N>" yields N, anything else yields 0, e.g.
+        "#ssj1.1.1 #ssj1.1.1.t6" -> [0, 6].
+        """
+        ids = [ref.split(".")[-1] for ref in text.split()]
+        return [int(i[1:]) if i.startswith("t") else 0 for i in ids]
+
+    def parse_jos_links_kres(self, sent_el):
+        """Return the links of the first <links> element below sent_el.
+
+        Raises:
+            IOError: if sent_el contains no <links> element.
+        """
+        lgrps = sent_el.findall(".//links")
+        if not lgrps:
+            raise IOError("Can't find links.")
+        return [{
+            "from": int(link.get("from").split(".")[-1]),
+            "afun": link.get("afun"),
+            "to": int(link.get("dep").split(".")[-1]),
+        } for link in lgrps[0]]
+
+    def parse_jos_links_ssj(self, sent_el):
+        """Return the links of the first <linkGrp> element below sent_el.
+
+        Raises:
+            IOError: if sent_el contains no <linkGrp> element.
+        """
+        lgrps = sent_el.findall(".//linkGrp")
+        if not lgrps:
+            raise IOError("Can't find links.")
+        res_links = []
+        for link in lgrps[0]:
+            tar = self.parse_ssj_target_arg(link.get("target"))
+            res_links.append({
+                "from": tar[0],
+                "afun": link.get("ana").split(":")[1],
+                # second reference of the "target" attribute
+                "to": tar[1],
+            })
+        return res_links
+
+    def parse(self):
+        """Parse the configured corpus.
+
+        Returns:
+            dict of sentence id -> sentence dict for "ssj"; None for
+            "kres", which is not implemented yet.
+        """
+        if self.corpus == "kres":
+            print("parse kres: TODO")
+            return None
+        return self.parse_xml_file(self.ssj_file)
+
+    def _read_xml_root(self, filepath):
+        """Return the root element of filepath with namespaces removed."""
+        with filepath.open("rb") as fp:
+            utf8str = fp.read().decode("utf-8")
+        # Drop the default namespace and the "xml:" attribute prefix so
+        # tags and attributes can be addressed by their bare names.
+        utf8str = re.sub('\\sxmlns="[^"]+"', '', utf8str, count=1)
+        utf8str = re.sub(' xml:', ' ', utf8str)
+        return etree.XML(utf8str.encode("utf-8"))
+
+    def _parse_sentence(self, s, sentence_id):
+        """Build the dict describing sentence element s."""
+        text_parts = []
+        tokens = []
+        for el in s.iter():
+            if el.tag in self.W_TAGS:
+                el_id = el.get("id").split(".")[-1]
+                if el_id.startswith("t"):
+                    el_id = el_id[1:]  # ssj W_TAG ids start with t
+                text_parts.append(el.text or "")
+                tokens.append({
+                    "word": True,
+                    "tid": int(el_id),
+                    "text": el.text,
+                    "lemma": el.get("lemma"),
+                    "msd": (el.get("msd") if self.corpus == "kres"
+                            else el.get("ana").split(":")[-1]),
+                })
+            elif el.tag in self.C_TAGS:
+                # only Kres' C_TAGS have ids
+                el_id = (el.get("id") or "none").split(".")[-1]
+                text_parts.append(el.text or "")
+                tokens.append({
+                    "word": False,
+                    "tid": el_id,
+                    "text": el.text,
+                })
+            elif el.tag in self.S_TAGS:
+                # Kres' doesn't contain .text
+                text_parts.append(" ")
+            # everything else (links, link groups) is skipped
+        return {
+            "sid": sentence_id,
+            "text": "".join(text_parts),
+            "tokens": tokens,
+            "jos_links": self.parse_jos_links(s),
+        }
+
+    def parse_xml_file(self, filepath):
+        """Parse every sentence of one corpus xml file.
+
+        Args:
+            filepath: pathlib.Path of the xml file.
+
+        Returns:
+            dict of "<file id>.<paragraph id>.<sentence id>" -> sentence dict.
+
+        Raises:
+            KeyError: if two sentences end up with the same id.
+            IOError: if a sentence has no links.
+        """
+        root = self._read_xml_root(filepath)
+        # in ssj, there are divs, in Kres, there are separate files
+        if self.corpus == "kres":
+            divs = [root]
+        else:
+            divs = root.findall(".//div")
+
+        # TODO: make a generator instead of holding the whole corpus in memory
+        res_dict = {}
+        for div in divs:
+            f_id = div.get("id")
+            for p in div.findall(".//p"):
+                p_id = p.get("id").split(".")[-1]
+                # every sentence of the paragraph is parsed
+                for s in p.findall(".//s"):
+                    s_id = s.get("id").split(".")[-1]
+                    sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
+                    if sentence_id in res_dict:
+                        raise KeyError("duplicated id: {}".format(sentence_id))
+                    res_dict[sentence_id] = self._parse_sentence(
+                        s, sentence_id)
+        return res_dict
diff --git a/src/pkg/corpusparser/corpusparser/Sentence.py b/src/pkg/corpusparser/corpusparser/Sentence.py
new file mode 100644
index 0000000..aba9a4a
--- /dev/null
+++ b/src/pkg/corpusparser/corpusparser/Sentence.py
@@ -0,0 +1,6 @@
+class Sentence():
+    """Placeholder for a parsed sentence; not implemented yet."""
+
+    def __init__(self):
+        # `self` is required: without it Sentence() raises TypeError.
+        print("Sentence __init__(): TODO")
diff --git a/src/pkg/corpusparser/corpusparser/__init__.py b/src/pkg/corpusparser/corpusparser/__init__.py
new file mode 100644
index 0000000..213c88a
--- /dev/null
+++ b/src/pkg/corpusparser/corpusparser/__init__.py
@@ -0,0 +1,2 @@
+from corpusparser.Parser import Parser
+from corpusparser.Sentence import Sentence
\ No newline at end of file
diff --git a/src/pkg/corpusparser/corpusparser/__pycache__/Parser.cpython-35.pyc b/src/pkg/corpusparser/corpusparser/__pycache__/Parser.cpython-35.pyc
new file mode 100644
index 0000000..98e401c
Binary files /dev/null and b/src/pkg/corpusparser/corpusparser/__pycache__/Parser.cpython-35.pyc differ
diff --git a/src/pkg/corpusparser/corpusparser/__pycache__/Sentence.cpython-35.pyc b/src/pkg/corpusparser/corpusparser/__pycache__/Sentence.cpython-35.pyc
new file mode 100644
index 0000000..c0ae9f7
Binary files /dev/null and b/src/pkg/corpusparser/corpusparser/__pycache__/Sentence.cpython-35.pyc differ
diff --git a/src/pkg/corpusparser/corpusparser/__pycache__/__init__.cpython-35.pyc b/src/pkg/corpusparser/corpusparser/__pycache__/__init__.cpython-35.pyc
new file mode 100644
index 0000000..a365c8e
Binary files /dev/null and b/src/pkg/corpusparser/corpusparser/__pycache__/__init__.cpython-35.pyc differ
diff --git a/src/pkg/corpusparser/setup.py b/src/pkg/corpusparser/setup.py
new file mode 100644
index 0000000..da967a7
--- /dev/null
+++ b/src/pkg/corpusparser/setup.py
@@ -0,0 +1,16 @@
+"""Packaging metadata for the corpusparser package."""
+from setuptools import find_packages, setup
+
+setup(
+    name='corpusparser',
+    version='0.0.1',
+    description=u"Parser for kres and ssj500k",
+    author=u"Kristjan Voje",
+    author_email='kristjan.voje@gmail.com',
+    license='MIT',
+    # List the package explicitly so a regular (non -e) install ships it.
+    packages=find_packages(),
+    # Parser.py imports lxml.
+    install_requires=['lxml'],
+    zip_safe=False,
+)
diff --git a/src/preflight/main_parse.py b/src/preflight/main_parse.py
new file mode 100644
index 0000000..d696cfb
--- /dev/null
+++ b/src/preflight/main_parse.py
@@ -0,0 +1,24 @@
+"""Command line entry point: parse the kres and ssj500k corpora."""
+import argparse
+
+from corpusparser import Parser
+
+
+def _read_args():
+    """Return the parsed command line options (all three are mandatory)."""
+    cli = argparse.ArgumentParser(
+        description="Parsing corpora kres and ssj500k.")
+    for flag in ('--kres-folder', '--kres-srl-folder', '--ssj-file'):
+        cli.add_argument(flag, required=True)
+    return cli.parse_args()
+
+
+def main():
+    """Parse the ssj500k file given on the command line."""
+    options = _read_args()
+    # Only ssj is parsed for now; the kres options are accepted but unused.
+    Parser(corpus="ssj", infiles=[options.ssj_file]).parse()
+
+
+if __name__ == "__main__":
+    main()