corpusparser finished python dict representation; TODO .json and DB

This commit is contained in:
voje 2019-03-13 08:59:27 +01:00
parent d1dea2e22e
commit da460f74f1
7 changed files with 67 additions and 19 deletions

6
data/debugging/dbg.txt Normal file
View File

@ -0,0 +1,6 @@
F0034713.20.1": [{"dep": "7", "arg": "REC", "from": "9"}, {"dep": "10", "arg": "ACT", "from": "9"}, {"dep": "13", "arg": "MWPRED", "from": "12"}, {"dep": "18", "arg": "MANN", "from": "19"}, {"dep": "20", "arg": "LOC", "from": "19"}]
Sentence:
F0034713.20.0
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
Sodobni ali preprosto neosveščeni potrošnik vse prerad zavrže stvar, ki se je malenkostno pokvarila in bi se jo zlahka dalo popraviti.

View File

@ -1 +1 @@
/home/kristjan/kres_srl/final_json/ /home/voje/work_data/final_json

1
data/kres_srl_t420 Symbolic link
View File

@ -0,0 +1 @@
/home/kristjan/kres_srl/final_json/

View File

@ -1,14 +1,16 @@
from corpusparser import Sentence
from pathlib import Path from pathlib import Path
import re import re
import json import json
from lxml import etree from lxml import etree
import logging
logging.basicConfig(level=logging.INFO)
# Read input file(.xml, .json; kres or ssj500k). # Read input file(.xml, .json; kres or ssj500k).
# Create an iterator that outputs resulting sentences (python dict format). # Create an iterator that outputs resulting sentences (python dict format).
class Parser(): class Parser():
def __init__(self, corpus, infiles): def __init__(self, corpus, infiles, logger=None):
if corpus == "kres": if corpus == "kres":
self.kres_folder = Path(infiles[0]) self.kres_folder = Path(infiles[0])
@ -22,6 +24,7 @@ class Parser():
self.W_TAGS = ['w'] self.W_TAGS = ['w']
self.C_TAGS = ['c'] self.C_TAGS = ['c']
self.S_TAGS = ['S', 'pc'] self.S_TAGS = ['S', 'pc']
self.logger = logger or logging.getLogger(__name__)
def parse_jos_links(self, sent_el): def parse_jos_links(self, sent_el):
if self.corpus == "kres": if self.corpus == "kres":
@ -71,13 +74,15 @@ class Parser():
return self.parse_any_links_ssj(sent_el, "SRL") return self.parse_any_links_ssj(sent_el, "SRL")
def parse_srl_links_kres(self, sent_el, sent_srl_links): def parse_srl_links_kres(self, sent_el, sent_srl_links):
print("HA") res_links = []
if len(sent_srl_links) == 0: for link in sent_srl_links:
print("HI") res_links += [{
return [] "from": int(link["from"]),
print(sent_srl_links) "afun": link["arg"],
"to": int(link["dep"]),
}]
# find the corresponding json file with srl links # find the corresponding json file with srl links
return sent_srl_links return res_links
def parse(self): def parse(self):
if self.corpus == "kres": if self.corpus == "kres":
@ -166,7 +171,10 @@ class Parser():
raise KeyError("duplicated id: {}".format(sentence_id)) raise KeyError("duplicated id: {}".format(sentence_id))
jos_links = self.parse_jos_links(s) jos_links = self.parse_jos_links(s)
srl_links = srl_dict.get(sentence_id) if self.corpus == "kres" else None srl_links = srl_dict.get(sentence_id) if self.corpus == "kres" else None
srl_links_parsed = self.parse_srl_links(s, srl_links) if srl_links is None:
srl_links_parsed = None
else:
srl_links_parsed = self.parse_srl_links(s, srl_links)
res_dict[sentence_id] = { res_dict[sentence_id] = {
"sid": sentence_id, "sid": sentence_id,
"text": sentence_text, "text": sentence_text,
@ -174,8 +182,7 @@ class Parser():
"jos_links": jos_links, "jos_links": jos_links,
"srl_links": srl_links_parsed "srl_links": srl_links_parsed
} }
print("------------------------------------------------- END") if srl_links is None:
print(res_dict[sentence_id]) self.logger.info("srl_links missing:{}:{}".format(
print("------------------------------------------------- END") sentence_id, res_dict[sentence_id]["text"]))
return # TODO dev break
return res_dict return res_dict

View File

@ -0,0 +1,38 @@
# corpusparser
A tool for parsing ssj500k and Kres into a unified .json format.
## Input:
### ssj500k
To parse ssj500k, point to the monolithic `ssj500k-sl.body.xml` file (tested on ssj500k 2.1).
### Kres
To parse Kres, point to folders:
* Kres folder, containing several (around 20K) .xml files (`F00XXXXX.xml.parsed.xml`).
* Kres SRL folder, containing SRL links for the corresponding F00...xml files (`F00XXXXX.srl.json`).
## Internal data format
This is the internal python dict data format. It can be stored to file as `.json` or stored into a database for application usage.
```python
{
'sid': 'F0034713.5.0',
'text': 'Mednarodni denarni sklad je odobril 30 milijard evrov vredno posojilo Grčiji. ',
'tokens': [
{'text': 'Mednarodni', 'lemma': 'mednaroden', 'msd': 'Ppnmeid', 'word': True, 'tid': 1},
{'text': 'denarni', 'lemma': 'denaren', 'msd': 'Ppnmeid', 'word': True, 'tid': 2},
{'text': 'sklad', 'lemma': 'sklad', 'msd': 'Somei', 'word': True, 'tid': 3},
{'text': 'je', 'lemma': 'biti', 'msd': 'Gp-ste-n', 'word': True, 'tid': 4},
{'text': 'odobril', 'lemma': 'odobriti', 'msd': 'Ggdd-em', 'word': True, 'tid': 5},
{'text': '30', 'lemma': '30', 'msd': 'Kag', 'word': True, 'tid': 6},
{'text': 'milijard', 'lemma': 'milijarda', 'msd': 'Sozmr', 'word': True, 'tid': 7}, # ...
],
'jos_links': [
{'to': 1, 'from': 3, 'afun': 'dol'},
{'to': 2, 'from': 3, 'afun': 'dol'},
{'to': 3, 'from': 5, 'afun': 'ena'}, # ...
],
'srl_links': [
{'to': 3, 'from': 5, 'afun': 'ACT'},
{'to': 7, 'from': 5, 'afun': 'PAT'}
]
}
```

View File

@ -1,3 +0,0 @@
class Sentence():
    """Placeholder for a parsed sentence; not implemented yet."""

    def __init__(self):
        """Print a TODO notice; no state is stored on the instance."""
        # `self` was missing from the signature, so `Sentence()` raised
        # TypeError before this body could ever run.
        print("Sentence __init__(): TODO")

View File

@ -1,2 +1 @@
from corpusparser.Parser import Parser from corpusparser.Parser import Parser
from corpusparser.Sentence import Sentence