Merge branch 'master' of gitea.cjvt.si:kristjan/cjvt-valency

dev
voje 5 years ago
commit 4e8447d930

2
.gitignore vendored

@ -1,3 +1,3 @@
data/samples/ data/samples/
*/__pycache__/
*egg-info/ *egg-info/
*.pyc

@ -11,6 +11,9 @@ MAKE_ROOT = $(shell pwd)
SSJ_FILE = "$(MAKE_ROOT)/data/samples/ssj_example/ssj500k-sl.body.sample.xml" SSJ_FILE = "$(MAKE_ROOT)/data/samples/ssj_example/ssj500k-sl.body.sample.xml"
KRES_FOLDER = "$(MAKE_ROOT)/data/samples/kres_example" KRES_FOLDER = "$(MAKE_ROOT)/data/samples/kres_example"
KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/kres_srl" KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/kres_srl"
OUTPUT = "file"
OUTDIR = "/home/voje/workdir/test_out"
DBADDR = ""
export export
.PHONY: dev-env preflight .PHONY: dev-env preflight
@ -30,5 +33,5 @@ data/samples:
# when debugging, run this once, then run python3 ... by hand # when debugging, run this once, then run python3 ... by hand
preflight: data/samples preflight: data/samples
pip3 install -e src/pkg/corpusparser/. pip3 install -e src/pkg/corpusparser/.
python3 src/preflight/main_parse.py --kres-folder $(KRES_FOLDER) \ python3 src/pkg/corpusparser/corpusparser/main.py --kres-folder $(KRES_FOLDER) \
--ssj-file $(SSJ_FILE) --kres-srl-folder $(KRES_SRL_FOLDER) --ssj-file $(SSJ_FILE) --kres-srl-folder $(KRES_SRL_FOLDER) --output $(OUTPUT) --outdir $(OUTDIR) --dbaddr $(DBADDR)

@ -0,0 +1,6 @@
F0034713.20.1": [{"dep": "7", "arg": "REC", "from": "9"}, {"dep": "10", "arg": "ACT", "from": "9"}, {"dep": "13", "arg": "MWPRED", "from": "12"}, {"dep": "18", "arg": "MANN", "from": "19"}, {"dep": "20", "arg": "LOC", "from": "19"}]
Sentence:
F0034713.20.0
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
Sodobni ali preprosto neosveščeni potrošnik vse prerad zavrže stvar, ki se je malenkostno pokvarila in bi se jo zlahka dalo popraviti.

@ -1 +1 @@
/home/kristjan/kres_srl/final_json/ /home/voje/work_data/final_json

@ -0,0 +1 @@
/home/voje/work_data/final_json

@ -0,0 +1 @@
/home/kristjan/kres_srl/final_json/

@ -1,14 +1,16 @@
from corpusparser import Sentence
from pathlib import Path from pathlib import Path
import re import re
import json import json
from lxml import etree from lxml import etree
import logging
logging.basicConfig(level=logging.INFO)
# Read input file(.xml, .json; kres or ssj500k). # Read input file(.xml, .json; kres or ssj500k).
# Create an iterator that outputs resulting sentences (python dict format). # Create an iterator that outputs resulting sentences (python dict format).
class Parser(): class Parser():
def __init__(self, corpus, infiles): def __init__(self, corpus, infiles, logger=None):
if corpus == "kres": if corpus == "kres":
self.kres_folder = Path(infiles[0]) self.kres_folder = Path(infiles[0])
@ -22,6 +24,11 @@ class Parser():
self.W_TAGS = ['w'] self.W_TAGS = ['w']
self.C_TAGS = ['c'] self.C_TAGS = ['c']
self.S_TAGS = ['S', 'pc'] self.S_TAGS = ['S', 'pc']
self.logger = logger or logging.getLogger(__name__)
self.stats = {
"parsed_count": 0,
"missing_srl": []
}
def parse_jos_links(self, sent_el): def parse_jos_links(self, sent_el):
if self.corpus == "kres": if self.corpus == "kres":
@ -64,37 +71,40 @@ class Parser():
}] }]
return res_links return res_links
def parse_srl_links(self, sent_el, sent_srl_links): def parse_srl_links(self, sent_el, sent_srl_links=None):
if self.corpus == "kres": if self.corpus == "kres":
return self.parse_srl_links_kres(sent_el, sent_srl_links) return self.parse_srl_links_kres(sent_el, sent_srl_links)
else: else:
return self.parse_any_links_ssj(sent_el, "SRL") return self.parse_any_links_ssj(sent_el, "SRL")
def parse_srl_links_kres(self, sent_el, sent_srl_links): def parse_srl_links_kres(self, sent_el, sent_srl_links):
print("HA") res_links = []
if len(sent_srl_links) == 0: for link in sent_srl_links:
print("HI") res_links += [{
return [] "from": int(link["from"]),
print(sent_srl_dict) "afun": link["arg"],
"to": int(link["dep"]),
}]
# find the correspointing json file with srl links # find the correspointing json file with srl links
return [] return res_links
def parse(self): def sentence_generator(self):
# Using generators so we don't copy a whole corpu around in memory.
if self.corpus == "kres": if self.corpus == "kres":
for xml_file in self.kres_folder.iterdir(): for xml_file in self.kres_folder.iterdir():
self.parse_xml_file(xml_file) # self.parse_xml_file(xml_file)
break # TODO dev break yield from self.parse_xml_file(xml_file)
else: else:
self.parse_xml_file(self.ssj_file) yield from self.parse_xml_file(self.ssj_file)
def parse_xml_file(self, xml_file): def parse_xml_file(self, xml_file):
srl_dict = {} srl_from_json = {}
if self.corpus == "kres": if self.corpus == "kres":
# in case of kres, read the SRL links form a separate json file # in case of kres, read the SRL links form a separate json file
file_id = xml_file.name.split(".")[0] file_id = xml_file.name.split(".")[0]
json_file = self.kres_srl_folder / Path(file_id).with_suffix(".srl.json") json_file = self.kres_srl_folder / Path(file_id).with_suffix(".srl.json")
with json_file.open("r") as fp: with json_file.open("r") as fp:
srl_dict = json.loads(fp.read()) srl_from_json = json.loads(fp.read())
with xml_file.open("rb") as fp: with xml_file.open("rb") as fp:
# remove namespaces # remove namespaces
@ -112,7 +122,7 @@ class Parser():
else: else:
divs = root.findall(".//div") divs = root.findall(".//div")
res_dict = [] # TODO: try making an iterator instead res_dict = {}
# parse divs # parse divs
for div in divs: for div in divs:
@ -150,7 +160,7 @@ class Parser():
sentence_text += el.text sentence_text += el.text
sentence_tokens += [{ sentence_tokens += [{
"word": False, "word": False,
"tid": el_id, "tid": (int(el_id) if self.corpus == "kres" else -1),
"text": el.text, "text": el.text,
}] }]
elif el.tag in self.S_TAGS: elif el.tag in self.S_TAGS:
@ -161,22 +171,26 @@ class Parser():
pass pass
sentence_id = "{}.{}.{}".format(f_id, p_id, s_id) sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
# make a generator instead of holding the whole corpus in memory
if sentence_id in res_dict:
raise KeyError("duplicated id: {}".format(sentence_id))
jos_links = self.parse_jos_links(s) jos_links = self.parse_jos_links(s)
srl_links = srl_dict.get(sentence_id) if self.corpus == "kres" else None
srl_links_fixed = self.parse_srl_links(s, srl_links) if self.corpus == "kres":
print(srl_links) srl_links_raw = srl_from_json.get(sentence_id)
res_dict[sentence_id] = { if srl_links_raw is None:
srl_links_parsed = None
self.stats["missing_srl"] += [(sentence_id, sentence_text)]
else:
srl_links_parsed = self.parse_srl_links(s, srl_links_raw)
else:
srl_links_parsed = self.parse_srl_links(s)
if len(srl_links_parsed) == 0:
self.stats["missing_srl"] += [(sentence_id, sentence_text)]
sentence_entry = {
"sid": sentence_id, "sid": sentence_id,
"text": sentence_text, "text": sentence_text,
"tokens": sentence_tokens, "tokens": sentence_tokens,
"jos_links": "BBBB", "jos_links": jos_links,
"srl_links": "AAAAA", "srl_links": srl_links_parsed
} }
self.stats["parsed_count"] += 1
print(res_dict[sentence_id]) yield (xml_file, sentence_entry)
print("------------------------------------------------- END")
return # TODO dev break
return res_dict

@ -0,0 +1,38 @@
# corpusparser
A tool for parsing ssj500k and Kres into a unified .json format.
## Input:
### ssj500k
To parse ssj500k, point to the monolithic `ssj500k-sl.body.xml` file (tested on ssj500k 2.1).
### Kres
To parse Kres, point to folders:
* Kres folder, containing several (around 20K) .xml files (`F00XXXXX.xml.parsed.xml`).
* Kres SRL folder, containing SRL links for the corresponding F00...xml files (`F00XXXXX.srl.json`).
## Internal data format
This is the internal python dict data format. It can be stored to file as `.json` or stored into a database for application usage.
```python
{
'sid': 'F0034713.5.0',
'text': 'Mednarodni denarni sklad je odobril 30 milijard evrov vredno posojilo Grčiji. ',
'tokens': [
{'text': 'Mednarodni', 'lemma': 'mednaroden', 'msd': 'Ppnmeid', 'word': True, 'tid': 1},
{'text': 'denarni', 'lemma': 'denaren', 'msd': 'Ppnmeid', 'word': True, 'tid': 2},
{'text': 'sklad', 'lemma': 'sklad', 'msd': 'Somei', 'word': True, 'tid': 3},
{'text': 'je', 'lemma': 'biti', 'msd': 'Gp-ste-n', 'word': True, 'tid': 4},
{'text': 'odobril', 'lemma': 'odobriti', 'msd': 'Ggdd-em', 'word': True, 'tid': 5},
{'text': '30', 'lemma': '30', 'msd': 'Kag', 'word': True, 'tid': 6},
{'text': 'milijard', 'lemma': 'milijarda', 'msd': 'Sozmr', 'word': True, 'tid': 7}, # ...
]
'jos_links': [
{'to': 1, 'from': 3, 'afun': 'dol'},
{'to': 2, 'from': 3, 'afun': 'dol'},
{'to': 3, 'from': 5, 'afun': 'ena'}, # ...
]
'srl_links': [
{'to': 3, 'from': 5, 'afun': 'ACT'},
{'to': 7, 'from': 5, 'afun': 'PAT'}
]
}
```

@ -1,3 +0,0 @@
class Sentence():
    """Placeholder sentence wrapper; real construction logic is TODO."""

    def __init__(self):
        # bugfix: __init__ previously declared no parameters at all
        # (missing `self`), so Sentence() raised TypeError on instantiation
        print("Sentence __init__(): TODO")

@ -1,2 +1 @@
from corpusparser.Parser import Parser from corpusparser.Parser import Parser
from corpusparser.Sentence import Sentence

@ -0,0 +1,102 @@
from pathlib import Path
from corpusparser import Parser
import argparse
import logging
import json
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
## Main handles command line arguments and writing to files / DB.
def ssj_to_json_file(sentence_generator, outfolder):
    """Collect all ssj500k sentences and write them to a single JSON file
    <outfolder>/ssj500k.json.

    Relies on the parser emitting sentences sequentially; the whole corpus
    is buffered in memory before one dump.

    :param sentence_generator: iterable of (source_file, sentence_dict) tuples
    :param outfolder: destination directory (created if missing)
    """
    outfolder = Path(outfolder)
    outfolder.mkdir(parents=True, exist_ok=True)
    outfile = outfolder / "ssj500k.json"

    # keep only the sentence dict; element 0 is the originating file
    data_buffer = [sdata for _, sdata in sentence_generator]

    # explicit encoding so the output is stable across platforms
    with outfile.open("w", encoding="utf-8") as fp:
        logging.getLogger(__name__).info("Writing to %s", outfile)
        json.dump(data_buffer, fp)
def kres_to_json_files(sentence_generator, outfolder):
    """Write Kres sentences into one JSON file per input .xml file, under
    <outfolder>/kres_json/.

    Relies on the parser yielding sentences grouped by source file: each
    time the source file changes, the previous buffer is flushed.

    :param sentence_generator: iterable of (source_file, sentence_dict) tuples
    :param outfolder: destination parent directory (created if missing)
    """
    log = logging.getLogger(__name__)
    outfolder = Path(outfolder) / "kres_json"
    outfolder.mkdir(parents=True, exist_ok=True)

    def write_buffer_to_file(outfile, outfile_buffer):
        # one output file per input F00XXXXX file
        log.info("Writing file: %s", outfile)
        with outfile.open("w", encoding="utf-8") as fp:
            json.dump(outfile_buffer, fp)

    outfile_buffer = None
    current_outfile = None
    for infile, sdata in sentence_generator:
        # F00XXXXX.xml.parsed.xml -> F00XXXXX.json
        outfile = outfolder / Path(infile.name.split(".")[0]).with_suffix(".json")

        # the parser walks input files sequentially; flush when the file changes
        if current_outfile is None:
            current_outfile = outfile
            outfile_buffer = []
        elif outfile != current_outfile:
            write_buffer_to_file(current_outfile, outfile_buffer)
            current_outfile = outfile
            outfile_buffer = []

        outfile_buffer += [sdata]

    # bugfix: guard the final flush — an empty generator used to crash here
    # with AttributeError ('NoneType' object has no attribute 'open')
    if current_outfile is not None:
        write_buffer_to_file(current_outfile, outfile_buffer)
def to_db():
    """Placeholder for database output; not implemented yet."""
    return "TODO"
if __name__ == "__main__":
    # CLI entry point: parse both corpora and write JSON output.
    # Handling output is situational --- implement it outside of Parser;
    # Parser returns tuples (orig_file, element).
    parser = argparse.ArgumentParser(description="Parsing corpora kres and ssj500k.")
    parser.add_argument('--kres-folder', required=True)
    parser.add_argument('--kres-srl-folder', required=True)
    parser.add_argument('--ssj-file', required=True)
    parser.add_argument('--output', required=False, default=None)
    parser.add_argument('--outdir', required=False, default=None)
    parser.add_argument('--dbaddr', required=False, default=None)
    args = parser.parse_args()

    # parse ssj500k and dump it to a single json file
    logger.info("Parsing ssj500k: {}".format(args.ssj_file))
    ssj_parser = Parser(
        corpus="ssj",
        infiles=[args.ssj_file],
    )
    ssj_to_json_file(ssj_parser.sentence_generator(), args.outdir)

    # parse kres and dump it to one json file per input file
    # bugfix: this log line previously printed args.ssj_file
    logger.info("Parsing Kres: {}".format(args.kres_folder))
    kres_parser = Parser(
        corpus="kres",
        infiles=[args.kres_folder, args.kres_srl_folder],
    )
    kres_to_json_files(kres_parser.sentence_generator(), args.outdir)

    # TODO: parse and save to DB (see to_db(); --dbaddr is accepted but unused)

@ -1,25 +0,0 @@
from corpusparser import Parser
import argparse
if __name__ == "__main__":
    # CLI entry point: only the Kres branch is active; the ssj500k branch
    # is stubbed out in the dead string below.
    arg_parser = argparse.ArgumentParser(description="Parsing corpora kres and ssj500k.")
    arg_parser.add_argument('--kres-folder', required=True)
    arg_parser.add_argument('--kres-srl-folder', required=True)
    arg_parser.add_argument('--ssj-file', required=True)
    args = arg_parser.parse_args()

    # parse ssj (disabled)
    """
    ssj_parser = Parser(
        corpus="ssj",
        infiles=[args.ssj_file]
    )
    ssj_parser.parse()
    """

    # parse kres
    kres_parser = Parser(
        corpus="kres",
        infiles=[args.kres_folder, args.kres_srl_folder]
    )
    kres_parser.parse()
Loading…
Cancel
Save