Merge branch 'master' of gitea.cjvt.si:kristjan/cjvt-valency

Author: voje
Date:   2019-03-14 18:41:02 +01:00
Commit: 4e8447d930
15 changed files with 201 additions and 65 deletions

.gitignore

@@ -1,3 +1,3 @@
 data/samples/
-*/__pycache__/
 *egg-info/
+*.pyc

Makefile

@@ -11,6 +11,9 @@ MAKE_ROOT = $(shell pwd)
 SSJ_FILE = "$(MAKE_ROOT)/data/samples/ssj_example/ssj500k-sl.body.sample.xml"
 KRES_FOLDER = "$(MAKE_ROOT)/data/samples/kres_example"
 KRES_SRL_FOLDER = "$(MAKE_ROOT)/data/kres_srl"
+OUTPUT = "file"
+OUTDIR = "/home/voje/workdir/test_out"
+DBADDR = ""
 
 export
 .PHONY: dev-env preflight
@@ -30,5 +33,5 @@ data/samples:
 # when debugging, run this once, then run python3 ... by hand
 preflight: data/samples
 	pip3 install -e src/pkg/corpusparser/.
-	python3 src/preflight/main_parse.py --kres-folder $(KRES_FOLDER) \
-		--ssj-file $(SSJ_FILE) --kres-srl-folder $(KRES_SRL_FOLDER)
+	python3 src/pkg/corpusparser/corpusparser/main.py --kres-folder $(KRES_FOLDER) \
+		--ssj-file $(SSJ_FILE) --kres-srl-folder $(KRES_SRL_FOLDER) --output $(OUTPUT) --outdir $(OUTDIR) --dbaddr $(DBADDR)

data/debugging/dbg.txt (new file)

@@ -0,0 +1,6 @@
+F0034713.20.1": [{"dep": "7", "arg": "REC", "from": "9"}, {"dep": "10", "arg": "ACT", "from": "9"}, {"dep": "13", "arg": "MWPRED", "from": "12"}, {"dep": "18", "arg": "MANN", "from": "19"}, {"dep": "20", "arg": "LOC", "from": "19"}]
+Sentence:
+F0034713.20.0
+1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
+Sodobni ali preprosto neosveščeni potrošnik vse prerad zavrže stvar, ki se je malenkostno pokvarila in bi se jo zlahka dalo popraviti.
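
The raw SRL entries above use the keys `dep`, `arg` and `from`; the reworked `parse_srl_links_kres` further down in this commit maps each of them into the unified `{from, afun, to}` link format. A minimal standalone sketch of that mapping (illustrative only; the helper name `convert_srl_links` is made up for this example):

```python
# Sketch of the conversion performed by Parser.parse_srl_links_kres:
# raw SRL entries {"dep", "arg", "from"} -> unified links {"from", "afun", "to"}.
raw_links = [{"dep": "7", "arg": "REC", "from": "9"}, {"dep": "10", "arg": "ACT", "from": "9"}]

def convert_srl_links(raw):
    return [
        {"from": int(link["from"]), "afun": link["arg"], "to": int(link["dep"])}
        for link in raw
    ]

print(convert_srl_links(raw_links))
# [{'from': 9, 'afun': 'REC', 'to': 7}, {'from': 9, 'afun': 'ACT', 'to': 10}]
```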

data/kres_srl (symbolic link)

@@ -1 +1 @@
-/home/kristjan/kres_srl/final_json/
+/home/voje/work_data/final_json

data/kres_srl_ikt (new symbolic link)

@@ -0,0 +1 @@
+/home/voje/work_data/final_json

data/kres_srl_t420 (new symbolic link)

@@ -0,0 +1 @@
+/home/kristjan/kres_srl/final_json/

src/pkg/corpusparser/corpusparser/Parser.py

@@ -1,14 +1,16 @@
-from corpusparser import Sentence
 from pathlib import Path
 import re
 import json
 from lxml import etree
+import logging
+
+logging.basicConfig(level=logging.INFO)
 
 # Read input file (.xml, .json; kres or ssj500k).
 # Create an iterator that outputs resulting sentences (python dict format).
 class Parser():
 
-    def __init__(self, corpus, infiles):
+    def __init__(self, corpus, infiles, logger=None):
 
         if corpus == "kres":
             self.kres_folder = Path(infiles[0])
@@ -22,6 +24,11 @@ class Parser():
         self.W_TAGS = ['w']
         self.C_TAGS = ['c']
         self.S_TAGS = ['S', 'pc']
+        self.logger = logger or logging.getLogger(__name__)
+        self.stats = {
+            "parsed_count": 0,
+            "missing_srl": []
+        }
 
     def parse_jos_links(self, sent_el):
         if self.corpus == "kres":
@@ -64,37 +71,40 @@ class Parser():
             }]
         return res_links
 
-    def parse_srl_links(self, sent_el, sent_srl_links):
+    def parse_srl_links(self, sent_el, sent_srl_links=None):
         if self.corpus == "kres":
             return self.parse_srl_links_kres(sent_el, sent_srl_links)
         else:
             return self.parse_any_links_ssj(sent_el, "SRL")
 
     def parse_srl_links_kres(self, sent_el, sent_srl_links):
-        print("HA")
-        if len(sent_srl_links) == 0:
-            print("HI")
-            return []
-        print(sent_srl_dict)
+        res_links = []
+        for link in sent_srl_links:
+            res_links += [{
+                "from": int(link["from"]),
+                "afun": link["arg"],
+                "to": int(link["dep"]),
+            }]
         # find the corresponding json file with srl links
-        return []
+        return res_links
 
-    def parse(self):
+    def sentence_generator(self):
+        # Using generators so we don't copy a whole corpus around in memory.
         if self.corpus == "kres":
             for xml_file in self.kres_folder.iterdir():
-                self.parse_xml_file(xml_file)
-                break  # TODO dev break
+                # self.parse_xml_file(xml_file)
+                yield from self.parse_xml_file(xml_file)
         else:
-            self.parse_xml_file(self.ssj_file)
+            yield from self.parse_xml_file(self.ssj_file)
 
     def parse_xml_file(self, xml_file):
-        srl_dict = {}
+        srl_from_json = {}
         if self.corpus == "kres":
             # in case of kres, read the SRL links from a separate json file
             file_id = xml_file.name.split(".")[0]
            json_file = self.kres_srl_folder / Path(file_id).with_suffix(".srl.json")
            with json_file.open("r") as fp:
-                srl_dict = json.loads(fp.read())
+                srl_from_json = json.loads(fp.read())
 
         with xml_file.open("rb") as fp:
             # remove namespaces
@@ -112,7 +122,7 @@ class Parser():
         else:
             divs = root.findall(".//div")
 
-        res_dict = []  # TODO: try making an iterator instead
+        res_dict = {}
 
         # parse divs
         for div in divs:
@@ -150,7 +160,7 @@ class Parser():
                         sentence_text += el.text
                         sentence_tokens += [{
                             "word": False,
-                            "tid": el_id,
+                            "tid": (int(el_id) if self.corpus == "kres" else -1),
                             "text": el.text,
                         }]
                     elif el.tag in self.S_TAGS:
@@ -161,22 +171,26 @@ class Parser():
                        pass
                    sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
 
-                   # make a generator instead of holding the whole corpus in memory
-                   if sentence_id in res_dict:
-                       raise KeyError("duplicated id: {}".format(sentence_id))
                    jos_links = self.parse_jos_links(s)
 
-                   srl_links = srl_dict.get(sentence_id) if self.corpus == "kres" else None
-                   srl_links_fixed = self.parse_srl_links(s, srl_links)
-                   print(srl_links)
-                   res_dict[sentence_id] = {
+                   if self.corpus == "kres":
+                       srl_links_raw = srl_from_json.get(sentence_id)
+                       if srl_links_raw is None:
+                           srl_links_parsed = None
+                           self.stats["missing_srl"] += [(sentence_id, sentence_text)]
+                       else:
+                           srl_links_parsed = self.parse_srl_links(s, srl_links_raw)
+                   else:
+                       srl_links_parsed = self.parse_srl_links(s)
+                       if len(srl_links_parsed) == 0:
+                           self.stats["missing_srl"] += [(sentence_id, sentence_text)]
+
+                   sentence_entry = {
                        "sid": sentence_id,
                        "text": sentence_text,
                        "tokens": sentence_tokens,
-                       "jos_links": "BBBB",
-                       "srl_links": "AAAAA",
+                       "jos_links": jos_links,
+                       "srl_links": srl_links_parsed
                    }
-                   print(res_dict[sentence_id])
-                   print("------------------------------------------------- END")
-                   return  # TODO dev break
-       return res_dict
+                   self.stats["parsed_count"] += 1
+                   yield (xml_file, sentence_entry)

src/pkg/corpusparser/README.md (new file)

@@ -0,0 +1,38 @@
+# corpusparser
+A tool for parsing ssj500k and Kres into a unified .json format.
+
+## Input:
+
+### ssj500k
+To parse ssj500k, point to the monolithic `ssj500k-sl.body.xml` file (tested on ssj500k 2.1).
+
+### Kres
+To parse Kres, point to two folders:
+* the Kres folder, containing around 20K .xml files (`F00XXXXX.xml.parsed.xml`),
+* the Kres SRL folder, containing SRL links for the corresponding F00...xml files (`F00XXXXX.srl.json`).
+
+## Internal data format
+This is the internal python dict data format. It can be written to file as `.json` or stored into a database for application usage.
+```python
+{
+    'sid': 'F0034713.5.0',
+    'text': 'Mednarodni denarni sklad je odobril 30 milijard evrov vredno posojilo Grčiji. ',
+    'tokens': [
+        {'text': 'Mednarodni', 'lemma': 'mednaroden', 'msd': 'Ppnmeid', 'word': True, 'tid': 1},
+        {'text': 'denarni', 'lemma': 'denaren', 'msd': 'Ppnmeid', 'word': True, 'tid': 2},
+        {'text': 'sklad', 'lemma': 'sklad', 'msd': 'Somei', 'word': True, 'tid': 3},
+        {'text': 'je', 'lemma': 'biti', 'msd': 'Gp-ste-n', 'word': True, 'tid': 4},
+        {'text': 'odobril', 'lemma': 'odobriti', 'msd': 'Ggdd-em', 'word': True, 'tid': 5},
+        {'text': '30', 'lemma': '30', 'msd': 'Kag', 'word': True, 'tid': 6},
+        {'text': 'milijard', 'lemma': 'milijarda', 'msd': 'Sozmr', 'word': True, 'tid': 7},  # ...
+    ],
+    'jos_links': [
+        {'to': 1, 'from': 3, 'afun': 'dol'},
+        {'to': 2, 'from': 3, 'afun': 'dol'},
+        {'to': 3, 'from': 5, 'afun': 'ena'},  # ...
+    ],
+    'srl_links': [
+        {'to': 3, 'from': 5, 'afun': 'ACT'},
+        {'to': 7, 'from': 5, 'afun': 'PAT'}
+    ]
+}
+```
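
A short usage sketch of the generator-based API this commit introduces (the input paths are placeholders taken from the sample Makefile; `Parser`, `sentence_generator()` and `stats` are the actual names from the code):

```python
from corpusparser import Parser

# Placeholder paths; the Makefile points KRES_FOLDER / KRES_SRL_FOLDER at these samples.
kres_parser = Parser(
    corpus="kres",
    infiles=["data/samples/kres_example", "data/kres_srl"],
)

# sentence_generator() yields (source_xml_file, sentence_dict) tuples,
# where sentence_dict follows the format documented above.
for xml_file, sentence in kres_parser.sentence_generator():
    print(xml_file.name, sentence["sid"], len(sentence["tokens"]))

# The parser keeps simple counters while it runs.
print(kres_parser.stats["parsed_count"], len(kres_parser.stats["missing_srl"]))
```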

src/pkg/corpusparser/corpusparser/Sentence.py (deleted)

@@ -1,3 +0,0 @@
-class Sentence():
-    def __init__():
-        print("Sentence __init__(): TODO")

src/pkg/corpusparser/corpusparser/__init__.py

@@ -1,2 +1 @@
 from corpusparser.Parser import Parser
-from corpusparser.Sentence import Sentence

src/pkg/corpusparser/corpusparser/main.py (new file)

@@ -0,0 +1,102 @@
+from pathlib import Path
+from corpusparser import Parser
+import argparse
+import logging
+import json
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+## Main handles command line arguments and writing to files / DB.
+
+def ssj_to_json_file(sentence_generator, outfolder):
+    # this function is based on the fact that files are parsed sequentially
+    outfolder = Path(outfolder)
+    outfolder.mkdir(parents=True, exist_ok=True)
+    outfile = outfolder / "ssj500k.json"
+
+    data_buffer = []
+    for s in sentence_generator:
+        sdata = s[1]
+        data_buffer += [sdata]
+
+    # outfile = Path(outfile)
+    with outfile.open("w") as fp:
+        logger.info("Writing to {}".format(outfile))
+        json.dump(data_buffer, fp)
+
+def kres_to_json_files(sentence_generator, outfolder):
+    outfolder = Path(outfolder) / "kres_json"
+    outfolder.mkdir(parents=True, exist_ok=True)
+
+    def write_buffer_to_file(outfile, outfile_buffer):
+        logger.info("Writing file: {}".format(outfile))
+        with outfile.open("w") as fp:
+            json.dump(outfile_buffer, fp)
+
+    outfile_buffer = None
+    current_outfile = None
+    for s in sentence_generator:
+        infile = s[0]
+        outfile = outfolder / Path(infile.name.split(".")[0]).with_suffix(".json")
+
+        # parser sequentially parses files; when we're done with a file, write it out
+        if current_outfile is None:
+            current_outfile = outfile
+            outfile_buffer = []
+        elif outfile != current_outfile:
+            write_buffer_to_file(current_outfile, outfile_buffer)
+            current_outfile = outfile
+            outfile_buffer = []
+
+        # update buffer
+        sdata = s[1]
+        outfile_buffer += [sdata]
+
+    write_buffer_to_file(current_outfile, outfile_buffer)
+
+def to_db():
+    return "TODO"
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Parsing corpora kres and ssj500k.")
+    parser.add_argument('--kres-folder', required=True)
+    parser.add_argument('--kres-srl-folder', required=True)
+    parser.add_argument('--ssj-file', required=True)
+    parser.add_argument('--output', required=False, default=None)
+    parser.add_argument('--outdir', required=False, default=None)
+    parser.add_argument('--dbaddr', required=False, default=None)
+    args = parser.parse_args()
+
+    # parse ssj
+    logger.info("Parsing ssj500k: {}".format(args.ssj_file))
+    ssj_parser = Parser(
+        corpus="ssj",
+        infiles=[args.ssj_file],
+    )
+    # res = [x[1]["sid"] for x in ssj_parser.sentence_generator()]
+    # logger.info("Parsed {} sentences (ssj500k)".format(len(res)))
+
+    # ssj to json
+    ssj_to_json_file(ssj_parser.sentence_generator(), args.outdir)
+
+    # parse kres
+    logger.info("Parsing Kres: {}".format(args.kres_folder))
+    kres_parser = Parser(
+        corpus="kres",
+        infiles=[args.kres_folder, args.kres_srl_folder],
+    )
+    # res = [x[1]["sid"] for x in kres_parser.sentence_generator()]
+    # logger.info("Parsed {} sentences (kres)".format(len(res)))
+
+    # kres to json
+    kres_to_json_files(kres_parser.sentence_generator(), args.outdir)
+
+## Handling output is situational --- implement it outside of Parser.
+## Parser returns tuples (orig_file, element)
+# 1. parse per-file and output to file (JSON)
+# 2. parse and save to DB
+# TODO
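
For the second output path, `to_db()` is still a stub and the commit does not pick a database. A purely hypothetical sketch, assuming a MongoDB backend via pymongo (neither the dependency nor the `valency`/`sentences` names are established by this commit):

```python
# Hypothetical only: to_db() is a stub in this commit; pymongo, the database
# name "valency" and the collection name "sentences" are assumptions.
from pymongo import MongoClient

def to_db(sentence_generator, dbaddr):
    client = MongoClient(dbaddr)                 # e.g. "mongodb://localhost:27017"
    collection = client["valency"]["sentences"]  # placeholder database/collection
    buffer = []
    for _, sentence in sentence_generator:       # generator yields (xml_file, sentence_dict)
        buffer.append(sentence)
        if len(buffer) >= 1000:                  # insert in batches to keep memory bounded
            collection.insert_many(buffer)
            buffer = []
    if buffer:
        collection.insert_many(buffer)
```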

src/preflight/main_parse.py (deleted)

@@ -1,25 +0,0 @@
-from corpusparser import Parser
-import argparse
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Parsing corpora kres and ssj500k.")
-    parser.add_argument('--kres-folder', required=True)
-    parser.add_argument('--kres-srl-folder', required=True)
-    parser.add_argument('--ssj-file', required=True)
-    args = parser.parse_args()
-
-    # parse ssj
-    """
-    ssj_parser = Parser(
-        corpus="ssj",
-        infiles=[args.ssj_file]
-    )
-    ssj_parser.parse()
-    """
-
-    # parse kres
-    kres_parser = Parser(
-        corpus="kres",
-        infiles=[args.kres_folder, args.kres_srl_folder]
-    )
-    kres_parser.parse()