diff --git a/.gitignore b/.gitignore index c57e05a..43fb148 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,3 @@ data/samples/ -*/__pycache__/ *egg-info/ +*.pyc diff --git a/data/kres_srl_ikt b/data/kres_srl_ikt new file mode 120000 index 0000000..465d987 --- /dev/null +++ b/data/kres_srl_ikt @@ -0,0 +1 @@ +/home/voje/work_data/final_json \ No newline at end of file diff --git a/src/pkg/corpusparser/corpusparser/Parser.py b/src/pkg/corpusparser/corpusparser/Parser.py index fb8b408..1c5dd6b 100644 --- a/src/pkg/corpusparser/corpusparser/Parser.py +++ b/src/pkg/corpusparser/corpusparser/Parser.py @@ -75,9 +75,9 @@ class Parser(): if len(sent_srl_links) == 0: print("HI") return [] - print(sent_srl_dict) + print(sent_srl_links) # find the correspointing json file with srl links - return [] + return sent_srl_links def parse(self): if self.corpus == "kres": @@ -112,7 +112,7 @@ class Parser(): else: divs = root.findall(".//div") - res_dict = [] # TODO: try making an iterator instead + res_dict = {} # parse divs for div in divs: @@ -150,7 +150,7 @@ class Parser(): sentence_text += el.text sentence_tokens += [{ "word": False, - "tid": el_id, + "tid": int(el_id), "text": el.text, }] elif el.tag in self.S_TAGS: @@ -166,16 +166,15 @@ class Parser(): raise KeyError("duplicated id: {}".format(sentence_id)) jos_links = self.parse_jos_links(s) srl_links = srl_dict.get(sentence_id) if self.corpus == "kres" else None - srl_links_fixed = self.parse_srl_links(s, srl_links) - print(srl_links) + srl_links_parsed = self.parse_srl_links(s, srl_links) res_dict[sentence_id] = { "sid": sentence_id, "text": sentence_text, "tokens": sentence_tokens, - "jos_links": "BBBB", - "srl_links": "AAAAA", + "jos_links": jos_links, + "srl_links": srl_links_parsed } - + print("------------------------------------------------- END") print(res_dict[sentence_id]) print("------------------------------------------------- END") return # TODO dev break diff --git a/src/pkg/corpusparser/corpusparser/__pycache__/Parser.cpython-35.pyc b/src/pkg/corpusparser/corpusparser/__pycache__/Parser.cpython-35.pyc deleted file mode 100644 index e438b67..0000000 Binary files a/src/pkg/corpusparser/corpusparser/__pycache__/Parser.cpython-35.pyc and /dev/null differ diff --git a/src/pkg/corpusparser/corpusparser/__pycache__/Sentence.cpython-35.pyc b/src/pkg/corpusparser/corpusparser/__pycache__/Sentence.cpython-35.pyc deleted file mode 100644 index c0ae9f7..0000000 Binary files a/src/pkg/corpusparser/corpusparser/__pycache__/Sentence.cpython-35.pyc and /dev/null differ diff --git a/src/pkg/corpusparser/corpusparser/__pycache__/__init__.cpython-35.pyc b/src/pkg/corpusparser/corpusparser/__pycache__/__init__.cpython-35.pyc deleted file mode 100644 index a365c8e..0000000 Binary files a/src/pkg/corpusparser/corpusparser/__pycache__/__init__.cpython-35.pyc and /dev/null differ