working on parser

2019-03-10 22:11:18 +01:00
parent 46f09be1d1
commit 1654548310
16 changed files with 255 additions and 0 deletions
@@ -0,0 +1,151 @@
+from corpusparser import Sentence
+from pathlib import Path
+import re
+from lxml import etree
+
+# Reads an input file (.xml, .json; kres or ssj500k).
+# Creates an iterator that outputs the resulting sentences (Python dict format).
+class Parser():
+    """Parser for the kres and ssj500k corpora (XML input).
+
+    Sentences are returned as plain Python dicts with the keys
+    "sid", "text", "tokens" and "jos_links".
+    """
+
+    def __init__(self, corpus, infiles):
+        """Store the input paths for the selected corpus.
+
+        Args:
+            corpus: Either "kres" or "ssj".
+            infiles: List of input paths. For "kres" the first item is the
+                kres folder and the second the kres SRL folder; for "ssj"
+                the first item is the ssj file.
+
+        Raises:
+            ValueError: If corpus is neither "kres" nor "ssj".
+        """
+        if corpus == "kres":
+            self.kres_folder = Path(infiles[0])
+            self.kres_srl_folder = Path(infiles[1])
+        elif corpus == "ssj":
+            self.ssj_file = Path(infiles[0])
+        else:
+            raise ValueError("Argument corpus should be 'ssj' or 'kres'.")
+
+        self.corpus = corpus
+        # Tag names for words, punctuation and spacing elements.
+        self.W_TAGS = ['w']
+        self.C_TAGS = ['c']
+        self.S_TAGS = ['S', 'pc']
+
+    def parse_jos_links(self, sent_el):
+        """Return the dependency links of a sentence element.
+
+        Dispatches to the kres or ssj variant depending on self.corpus.
+        """
+        if self.corpus == "kres":
+            return self.parse_jos_links_kres(sent_el)
+        return self.parse_jos_links_ssj(sent_el)
+
+    def parse_ssj_target_arg(self, text):
+        """Convert an ssj link "target" attribute into token numbers.
+
+        Examples:
+            from: 0, to: 6
+            <link ana="syn:modra" target="#ssj1.1.3 #ssj1.1.3.t6"/>
+            from: 6, to: 7
+            <link ana="syn:dol" target="#ssj1.1.3.t6 #ssj1.1.3.t7"/>
+
+        Args:
+            text: Whitespace-separated references.
+
+        Returns:
+            One int per reference: the number following "t" in the last
+            dot-separated part, or 0 when that part does not start with "t"
+            (as in the sentence-level reference in the first example).
+        """
+        last_parts = [ref.split(".")[-1] for ref in text.split()]
+        # startswith() is safe on an empty fragment, unlike indexing [0].
+        return [
+            int(part[1:]) if part.startswith("t") else 0
+            for part in last_parts
+        ]
+
+    def parse_jos_links_kres(self, sent_el):
+        """Collect links from the first <links> group of a kres sentence.
+
+        Returns:
+            List of dicts with keys "from", "afun" and "to".
+
+        Raises:
+            IOError: If the sentence contains no <links> element.
+        """
+        lgrps = sent_el.findall(".//links")
+        if not lgrps:
+            raise IOError("Can't find links.")
+        return [
+            {
+                "from": int(link.get("from").split(".")[-1]),
+                "afun": link.get("afun"),
+                "to": int(link.get("dep").split(".")[-1]),
+            }
+            for link in lgrps[0]
+        ]
+
+    def parse_jos_links_ssj(self, sent_el):
+        """Collect links from the first <linkGrp> of an ssj sentence.
+
+        Returns:
+            List of dicts with keys "from", "afun" and "to".
+
+        Raises:
+            IOError: If the sentence contains no <linkGrp> element.
+        """
+        lgrps = sent_el.findall(".//linkGrp")
+        if not lgrps:
+            raise IOError("Can't find links.")
+        res_links = []
+        for link in lgrps[0]:
+            tar = self.parse_ssj_target_arg(link.get("target"))
+            res_links.append({
+                "from": tar[0],
+                "afun": link.get("ana").split(":")[1],
+                # second reference of the target attribute is the link's end
+                "to": tar[1],
+            })
+        return res_links
+
+    def parse(self):
+        """Parse the configured corpus.
+
+        Returns:
+            The dict produced by parse_xml_file for "ssj"; None for "kres",
+            which is not implemented yet.
+        """
+        if self.corpus == "kres":
+            print("parse kres: TODO")
+            return None
+        return self.parse_xml_file(self.ssj_file)
+
+    def _parse_sentence_tokens(self, s):
+        """Return (sentence text, token dicts) for one sentence element."""
+        text_parts = []
+        tokens = []
+        for el in s.iter():
+            if el.tag in self.W_TAGS:
+                el_id = el.get("id").split(".")[-1]
+                if el_id.startswith('t'):
+                    el_id = el_id[1:]  # ssj W_TAG ids start with t
+                text_parts.append(el.text)
+                tokens.append({
+                    "word": True,
+                    "tid": int(el_id),
+                    "text": el.text,
+                    "lemma": el.get("lemma"),
+                    "msd": (el.get("msd") if self.corpus == "kres"
+                            else el.get("ana").split(":")[-1]),
+                })
+            elif el.tag in self.C_TAGS:
+                # only Kres' C_TAGS have ids
+                el_id = (el.get("id") or "none").split(".")[-1]
+                text_parts.append(el.text)
+                tokens.append({
+                    "word": False,
+                    "tid": el_id,
+                    "text": el.text,
+                })
+            elif el.tag in self.S_TAGS:
+                # Kres' <S /> doesn't contain .text
+                text_parts.append(" ")
+            # everything else (links, link groups, ...) is skipped here
+        return "".join(text_parts), tokens
+
+    def parse_xml_file(self, filepath):
+        """Parse one corpus XML file into a dict of sentences.
+
+        Args:
+            filepath: pathlib.Path of the XML file.
+
+        Returns:
+            Dict mapping "<div id>.<paragraph id>.<sentence id>" to a dict
+            with keys "sid", "text", "tokens" and "jos_links".
+
+        Raises:
+            KeyError: If two sentences produce the same id.
+            IOError: If a parsed sentence has no link group.
+        """
+        # Read everything up front so the file is closed before parsing.
+        raw = filepath.read_bytes()
+
+        # Remove the default namespace declaration and the "xml:" attribute
+        # prefix so plain tag and attribute names can be used below.
+        xml_text = raw.decode("utf-8")
+        xml_text = re.sub(r'\sxmlns="[^"]+"', '', xml_text, count=1)
+        xml_text = re.sub(' xml:', ' ', xml_text)
+        root = etree.XML(xml_text.encode("utf-8"))
+
+        # in ssj, there are divs; in Kres, there are separate files
+        divs = [root] if self.corpus == "kres" else root.findall(".//div")
+
+        res_dict = {}
+        for div in divs:
+            f_id = div.get("id")
+
+            for p in div.findall(".//p"):
+                p_id = p.get("id").split(".")[-1]
+
+                for s in p.findall(".//s"):
+                    s_id = s.get("id").split(".")[-1]
+                    sentence_text, sentence_tokens = (
+                        self._parse_sentence_tokens(s))
+                    sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
+
+                    # TODO: make a generator instead of holding the whole
+                    # corpus in memory
+                    if sentence_id in res_dict:
+                        raise KeyError(
+                            "duplicated id: {}".format(sentence_id))
+                    res_dict[sentence_id] = {
+                        "sid": sentence_id,
+                        "text": sentence_text,
+                        "tokens": sentence_tokens,
+                        "jos_links": self.parse_jos_links(s)
+                    }
+
+                    # NOTE(review): the print and break below look like
+                    # work-in-progress debugging aids; because of the break
+                    # only the first sentence of each paragraph is stored.
+                    # Kept as-is: dropping the break would run
+                    # parse_jos_links on every sentence, which raises when a
+                    # sentence has no links -- confirm before removing.
+                    print(res_dict[sentence_id])
+                    break
+        return res_dict
@@ -0,0 +1,3 @@
+class Sentence():
+	"""Placeholder for a parsed sentence; not implemented yet."""
+
+	def __init__(self):
+		# `self` is required, otherwise Sentence() raises TypeError.
+		print("Sentence __init__(): TODO")
@@ -0,0 +1,2 @@
+from corpusparser.Parser import Parser
+from corpusparser.Sentence import Sentence
@@ -0,0 +1,10 @@
+from setuptools import setup
+
+# Distribution metadata for the corpusparser package.
+# NOTE(review): no `packages=` argument is passed; depending on the
+# setuptools version and the directory layout the package modules may not
+# be included in the installation -- confirm.
+setup(
+    name="corpusparser",
+    version="0.0.1",
+    description="Parser for kres and ssj500k",
+    author="Kristjan Voje",
+    author_email="kristjan.voje@gmail.com",
+    license="MIT",
+    zip_safe=False,
+)
@@ -0,0 +1,16 @@
+from corpusparser import Parser 
+import argparse
+
+if __name__ == "__main__":
+	# Command-line entry point: all three paths are mandatory options.
+	arg_parser = argparse.ArgumentParser(
+		description="Parsing corpora kres and ssj500k."
+	)
+	for flag in ('--kres-folder', '--kres-srl-folder', '--ssj-file'):
+		arg_parser.add_argument(flag, required=True)
+	cli_args = arg_parser.parse_args()
+
+	# Only the ssj corpus is parsed here; the kres options are read from the
+	# command line but not used in this script.
+	Parser(corpus="ssj", infiles=[cli_args.ssj_file]).parse()