|
|
@ -1,14 +1,16 @@
|
|
|
|
from corpusparser import Sentence
|
|
|
|
|
|
|
|
from pathlib import Path
|
|
|
|
from pathlib import Path
|
|
|
|
import re
|
|
|
|
import re
|
|
|
|
import json
|
|
|
|
import json
|
|
|
|
from lxml import etree
|
|
|
|
from lxml import etree
|
|
|
|
|
|
|
|
import logging
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
logging.basicConfig(level=logging.INFO)
|
|
|
|
|
|
|
|
|
|
|
|
# Read input file(.xml, .json; kres or ssj500k).
|
|
|
|
# Read input file(.xml, .json; kres or ssj500k).
|
|
|
|
# Create an iterator that outputs resulting sentences (python dict format).
|
|
|
|
# Create an iterator that outputs resulting sentences (python dict format).
|
|
|
|
class Parser():
|
|
|
|
class Parser():
|
|
|
|
|
|
|
|
|
|
|
|
def __init__(self, corpus, infiles):
|
|
|
|
def __init__(self, corpus, infiles, logger=None):
|
|
|
|
|
|
|
|
|
|
|
|
if corpus == "kres":
|
|
|
|
if corpus == "kres":
|
|
|
|
self.kres_folder = Path(infiles[0])
|
|
|
|
self.kres_folder = Path(infiles[0])
|
|
|
@ -22,6 +24,7 @@ class Parser():
|
|
|
|
self.W_TAGS = ['w']
|
|
|
|
self.W_TAGS = ['w']
|
|
|
|
self.C_TAGS = ['c']
|
|
|
|
self.C_TAGS = ['c']
|
|
|
|
self.S_TAGS = ['S', 'pc']
|
|
|
|
self.S_TAGS = ['S', 'pc']
|
|
|
|
|
|
|
|
self.logger = logger or logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
def parse_jos_links(self, sent_el):
|
|
|
|
def parse_jos_links(self, sent_el):
|
|
|
|
if self.corpus == "kres":
|
|
|
|
if self.corpus == "kres":
|
|
|
@ -71,13 +74,15 @@ class Parser():
|
|
|
|
return self.parse_any_links_ssj(sent_el, "SRL")
|
|
|
|
return self.parse_any_links_ssj(sent_el, "SRL")
|
|
|
|
|
|
|
|
|
|
|
|
def parse_srl_links_kres(self, sent_el, sent_srl_links):
|
|
|
|
def parse_srl_links_kres(self, sent_el, sent_srl_links):
|
|
|
|
print("HA")
|
|
|
|
res_links = []
|
|
|
|
if len(sent_srl_links) == 0:
|
|
|
|
for link in sent_srl_links:
|
|
|
|
print("HI")
|
|
|
|
res_links += [{
|
|
|
|
return []
|
|
|
|
"from": int(link["from"]),
|
|
|
|
print(sent_srl_links)
|
|
|
|
"afun": link["arg"],
|
|
|
|
|
|
|
|
"to": int(link["dep"]),
|
|
|
|
|
|
|
|
}]
|
|
|
|
# find the corresponding json file with srl links
|
|
|
|
# find the corresponding json file with srl links
|
|
|
|
return sent_srl_links
|
|
|
|
return res_links
|
|
|
|
|
|
|
|
|
|
|
|
def parse(self):
|
|
|
|
def parse(self):
|
|
|
|
if self.corpus == "kres":
|
|
|
|
if self.corpus == "kres":
|
|
|
@ -166,7 +171,10 @@ class Parser():
|
|
|
|
raise KeyError("duplicated id: {}".format(sentence_id))
|
|
|
|
raise KeyError("duplicated id: {}".format(sentence_id))
|
|
|
|
jos_links = self.parse_jos_links(s)
|
|
|
|
jos_links = self.parse_jos_links(s)
|
|
|
|
srl_links = srl_dict.get(sentence_id) if self.corpus == "kres" else None
|
|
|
|
srl_links = srl_dict.get(sentence_id) if self.corpus == "kres" else None
|
|
|
|
srl_links_parsed = self.parse_srl_links(s, srl_links)
|
|
|
|
if srl_links is None:
|
|
|
|
|
|
|
|
srl_links_parsed = None
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
|
|
|
srl_links_parsed = self.parse_srl_links(s, srl_links)
|
|
|
|
res_dict[sentence_id] = {
|
|
|
|
res_dict[sentence_id] = {
|
|
|
|
"sid": sentence_id,
|
|
|
|
"sid": sentence_id,
|
|
|
|
"text": sentence_text,
|
|
|
|
"text": sentence_text,
|
|
|
@ -174,8 +182,7 @@ class Parser():
|
|
|
|
"jos_links": jos_links,
|
|
|
|
"jos_links": jos_links,
|
|
|
|
"srl_links": srl_links_parsed
|
|
|
|
"srl_links": srl_links_parsed
|
|
|
|
}
|
|
|
|
}
|
|
|
|
print("------------------------------------------------- END")
|
|
|
|
if srl_links is None:
|
|
|
|
print(res_dict[sentence_id])
|
|
|
|
self.logger.info("srl_links missing:{}:{}".format(
|
|
|
|
print("------------------------------------------------- END")
|
|
|
|
sentence_id, res_dict[sentence_id]["text"]))
|
|
|
|
return # TODO dev break
|
|
|
|
|
|
|
|
return res_dict
|
|
|
|
return res_dict
|
|
|
|