diff --git a/.gitignore b/.gitignore
index ecb8742..c57e05a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,3 @@
data/samples/
-*/pycache/
+*/__pycache__/
*egg-info/
diff --git a/README.md b/README.md
index a956d10..c35bf79 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# cjvt-vezljivost
+# cjvt-valency
## Components
diff --git a/src/pkg/corpusparser/corpusparser/Parser.py b/src/pkg/corpusparser/corpusparser/Parser.py
index dc93554..9fb0e91 100644
--- a/src/pkg/corpusparser/corpusparser/Parser.py
+++ b/src/pkg/corpusparser/corpusparser/Parser.py
@@ -1,6 +1,7 @@
from corpusparser import Sentence
from pathlib import Path
import re
+import json
from lxml import etree
# Read input file(.xml, .json; kres or ssj500k).
@@ -26,15 +27,8 @@ class Parser():
if self.corpus == "kres":
return self.parse_jos_links_kres(sent_el)
else:
- return self.parse_jos_links_ssj(sent_el)
-
- def parse_ssj_target_arg(self, text):
- # from: 0, to: 6
- #
- # from: 6, to: 7
- #
- lst = [x.split(".")[-1] for x in text.split(" ")]
- return [int(x[1:] if x[0] == "t" else 0) for x in lst]
+ # 'syntax' is the linkgroup we're looking for
+ return self.parse_any_links_ssj(sent_el, "syntax")
def parse_jos_links_kres(self, sent_el):
lgrps = sent_el.findall(".//links")
@@ -49,103 +43,138 @@ class Parser():
}]
return res_links
- def parse_jos_links_ssj(self, sent_el):
+ def parse_ssj_target_arg(self, text):
+ # from: 0, to: 6
+ #
+ # from: 6, to: 7
+ #
+ lst = [x.split(".")[-1] for x in text.split(" ")]
+ return [int(x[1:] if x[0] == "t" else 0) for x in lst]
+
+ def parse_any_links_ssj(self, sent_el, links_type):
lgrps = sent_el.findall(".//linkGrp")
- if len(lgrps) < 1:
- # print(etree.tostring(sent_el))
- raise IOError("Can't find links.")
+ links = [x for x in lgrps if x.get("type") == links_type][0]
res_links = []
- for link in lgrps[0]:
- print(link)
+ for link in links:
tar = self.parse_ssj_target_arg(link.get("target"))
res_links += [{
"from": tar[0],
"afun": link.get("ana").split(":")[1],
- "to": [1],
+ "to": tar[1],
}]
return res_links
+ def parse_srl_links(self, sent_el, xml_file=None):
+ if self.corpus == "kres":
+ return self.parse_srl_links_kres(sent_el, xml_file)
+ else:
+ return self.parse_any_links_ssj(sent_el, "SRL")
+
+ def parse_srl_links_kres(self, sent_el, sent_srl_dict):
+ print(sent_srl_dict)
+        # find the corresponding json file with srl links
+ return "TODO"
+
def parse(self):
if self.corpus == "kres":
- print("parse kres: TODO")
+ for xml_file in self.kres_folder.iterdir():
+ self.parse_xml_file(xml_file)
+ break # TODO dev break
else:
self.parse_xml_file(self.ssj_file)
- def parse_xml_file(self, filepath):
- res_dict = {}
- with filepath.open("rb") as fp:
+ def parse_xml_file(self, xml_file):
+ srl_dict = {}
+ if self.corpus == "kres":
+            # in case of kres, read the SRL links from a separate json file
+ file_id = xml_file.name.split(".")[0]
+ json_file = self.kres_srl_folder / Path(file_id).with_suffix(".srl.json")
+ with json_file.open("r") as fp:
+ srl_dict = json.loads(fp.read())
+
+ with xml_file.open("rb") as fp:
# remove namespaces
bstr = fp.read()
- utf8str = bstr.decode("utf-8")
- utf8str = re.sub('\\sxmlns="[^"]+"', '', utf8str, count=1)
- utf8str = re.sub(' xml:', ' ', utf8str)
-
- root = etree.XML(utf8str.encode("utf-8"))
-
- divs = [] # in ssj, there are divs, in Kres, there are separate files
- if self.corpus == "kres":
- divs = [root]
- else:
- divs = root.findall(".//div")
-
- # parse divs
- for div in divs:
- f_id = div.get("id")
-
- # parse paragraphs
- for p in div.findall(".//p"):
- p_id = p.get("id").split(".")[-1]
-
- # parse sentences
- for s in p.findall(".//s"):
- s_id = s.get("id").split(".")[-1]
- sentence_text = ""
- sentence_tokens = []
-
- # parse tokens
- for el in s.iter():
- if el.tag in self.W_TAGS:
- el_id = el.get("id").split(".")[-1]
- if el_id[0] == 't':
- el_id = el_id[1:] # ssj W_TAG ids start with t
- sentence_text += el.text
- sentence_tokens += [{
- "word": True,
- "tid": int(el_id),
- "text": el.text,
- "lemma": el.get("lemma"),
- "msd": (el.get("msd") if self.corpus == "kres"
- else el.get("ana").split(":")[-1]),
- }]
- elif el.tag in self.C_TAGS:
- # only Kres' C_TAGS have ids
- el_id = el.get("id") or "none"
- el_id = el_id.split(".")[-1]
- sentence_text += el.text
- sentence_tokens += [{
- "word": False,
- "tid": el_id,
- "text": el.text,
- }]
- elif el.tag in self.S_TAGS:
- # Kres' doesn't contain .text
- sentence_text += " "
- else:
- # pass links and linkGroups
- pass
- sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
-
- # make a generator instead of holding the whole corpus in memory
- if sentence_id in res_dict:
- raise KeyError("duplicated id: {}".format(sentence_id))
- res_dict[sentence_id] = {
- "sid": sentence_id,
- "text": sentence_text,
- "tokens": sentence_tokens,
- "jos_links": self.parse_jos_links(s)
- }
-
- print(res_dict[sentence_id])
- break
+ utf8str = bstr.decode("utf-8")
+ utf8str = re.sub('\\sxmlns="[^"]+"', '', utf8str, count=1)
+ utf8str = re.sub(' xml:', ' ', utf8str)
+
+ root = etree.XML(utf8str.encode("utf-8"))
+
+ divs = [] # in ssj, there are divs, in Kres, there are separate files
+ if self.corpus == "kres":
+ divs = [root]
+ else:
+ divs = root.findall(".//div")
+
+ res_dict = [] # TODO: try making an iterator instead
+
+ # parse divs
+ for div in divs:
+ f_id = div.get("id")
+
+ # parse paragraphs
+ for p in div.findall(".//p"):
+ p_id = p.get("id").split(".")[-1]
+
+ # parse sentences
+ for s in p.findall(".//s"):
+ s_id = s.get("id").split(".")[-1]
+ sentence_text = ""
+ sentence_tokens = []
+
+ # parse tokens
+ for el in s.iter():
+ if el.tag in self.W_TAGS:
+ el_id = el.get("id").split(".")[-1]
+ if el_id[0] == 't':
+ el_id = el_id[1:] # ssj W_TAG ids start with t
+ sentence_text += el.text
+ sentence_tokens += [{
+ "word": True,
+ "tid": int(el_id),
+ "text": el.text,
+ "lemma": el.get("lemma"),
+ "msd": (el.get("msd") if self.corpus == "kres"
+ else el.get("ana").split(":")[-1]),
+ }]
+ elif el.tag in self.C_TAGS:
+ # only Kres' C_TAGS have ids
+ el_id = el.get("id") or "none"
+ el_id = el_id.split(".")[-1]
+ sentence_text += el.text
+ sentence_tokens += [{
+ "word": False,
+ "tid": el_id,
+ "text": el.text,
+ }]
+ elif el.tag in self.S_TAGS:
+ # Kres' doesn't contain .text
+ sentence_text += " "
+ else:
+ # pass links and linkGroups
+ pass
+ sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
+
+ # make a generator instead of holding the whole corpus in memory
+ # TODO -- match ids
+ print("---")
+ print(sorted(srl_dict.keys(), key=lambda x: x.split(".")[1])[:100])
+ print(sentence_id)
+ print(srl_dict.get(str(sentence_id)))
+ print("---")
+ if sentence_id in res_dict:
+ raise KeyError("duplicated id: {}".format(sentence_id))
+ res_dict[sentence_id] = {
+ "sid": sentence_id,
+ "text": sentence_text,
+ "tokens": sentence_tokens,
+ "jos_links": self.parse_jos_links(s),
+ "srl_links": self.parse_srl_links(s, srl_dict[sentence_id]),
+ }
+
+ print(res_dict[sentence_id])
+ print("------------------------------------------------- END")
+ return # TODO dev break
return res_dict
diff --git a/src/pkg/corpusparser/corpusparser/__pycache__/Parser.cpython-35.pyc b/src/pkg/corpusparser/corpusparser/__pycache__/Parser.cpython-35.pyc
deleted file mode 100644
index 98e401c..0000000
Binary files a/src/pkg/corpusparser/corpusparser/__pycache__/Parser.cpython-35.pyc and /dev/null differ
diff --git a/src/pkg/corpusparser/corpusparser/__pycache__/Sentence.cpython-35.pyc b/src/pkg/corpusparser/corpusparser/__pycache__/Sentence.cpython-35.pyc
deleted file mode 100644
index c0ae9f7..0000000
Binary files a/src/pkg/corpusparser/corpusparser/__pycache__/Sentence.cpython-35.pyc and /dev/null differ
diff --git a/src/pkg/corpusparser/corpusparser/__pycache__/__init__.cpython-35.pyc b/src/pkg/corpusparser/corpusparser/__pycache__/__init__.cpython-35.pyc
deleted file mode 100644
index a365c8e..0000000
Binary files a/src/pkg/corpusparser/corpusparser/__pycache__/__init__.cpython-35.pyc and /dev/null differ
diff --git a/src/preflight/main_parse.py b/src/preflight/main_parse.py
index d696cfb..35bb2bb 100644
--- a/src/preflight/main_parse.py
+++ b/src/preflight/main_parse.py
@@ -9,8 +9,17 @@ if __name__ == "__main__":
args = parser.parse_args()
# parse ssj
+ """
ssj_parser = Parser(
corpus="ssj",
infiles=[args.ssj_file]
)
ssj_parser.parse()
+ """
+
+ # parse kres
+ kres_parser = Parser(
+ corpus="kres",
+ infiles=[args.kres_folder, args.kres_srl_folder]
+ )
+ kres_parser.parse()