corpusparser finished python dict representation; TODO .json and DB

dev
voje 5 years ago
parent d1dea2e22e
commit da460f74f1

@ -0,0 +1,6 @@
F0034713.20.1": [{"dep": "7", "arg": "REC", "from": "9"}, {"dep": "10", "arg": "ACT", "from": "9"}, {"dep": "13", "arg": "MWPRED", "from": "12"}, {"dep": "18", "arg": "MANN", "from": "19"}, {"dep": "20", "arg": "LOC", "from": "19"}]
Sentence:
F0034713.20.0
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
Sodobni ali preprosto neosveščeni potrošnik vse prerad zavrže stvar, ki se je malenkostno pokvarila in bi se jo zlahka dalo popraviti.

@ -1 +1 @@
/home/kristjan/kres_srl/final_json/ /home/voje/work_data/final_json

@ -0,0 +1 @@
/home/kristjan/kres_srl/final_json/

@ -1,14 +1,16 @@
from corpusparser import Sentence
from pathlib import Path from pathlib import Path
import re import re
import json import json
from lxml import etree from lxml import etree
import logging
logging.basicConfig(level=logging.INFO)
# Read input file(.xml, .json; kres or ssj500k). # Read input file(.xml, .json; kres or ssj500k).
# Create an iterator that outputs resulting sentences (python dict format). # Create an iterator that outputs resulting sentences (python dict format).
class Parser(): class Parser():
def __init__(self, corpus, infiles): def __init__(self, corpus, infiles, logger=None):
if corpus == "kres": if corpus == "kres":
self.kres_folder = Path(infiles[0]) self.kres_folder = Path(infiles[0])
@ -22,6 +24,7 @@ class Parser():
self.W_TAGS = ['w'] self.W_TAGS = ['w']
self.C_TAGS = ['c'] self.C_TAGS = ['c']
self.S_TAGS = ['S', 'pc'] self.S_TAGS = ['S', 'pc']
self.logger = logger or logging.getLogger(__name__)
def parse_jos_links(self, sent_el): def parse_jos_links(self, sent_el):
if self.corpus == "kres": if self.corpus == "kres":
@ -71,13 +74,15 @@ class Parser():
return self.parse_any_links_ssj(sent_el, "SRL") return self.parse_any_links_ssj(sent_el, "SRL")
def parse_srl_links_kres(self, sent_el, sent_srl_links): def parse_srl_links_kres(self, sent_el, sent_srl_links):
print("HA") res_links = []
if len(sent_srl_links) == 0: for link in sent_srl_links:
print("HI") res_links += [{
return [] "from": int(link["from"]),
print(sent_srl_links) "afun": link["arg"],
"to": int(link["dep"]),
}]
# find the corresponding json file with srl links # find the corresponding json file with srl links
return sent_srl_links return res_links
def parse(self): def parse(self):
if self.corpus == "kres": if self.corpus == "kres":
@ -166,7 +171,10 @@ class Parser():
raise KeyError("duplicated id: {}".format(sentence_id)) raise KeyError("duplicated id: {}".format(sentence_id))
jos_links = self.parse_jos_links(s) jos_links = self.parse_jos_links(s)
srl_links = srl_dict.get(sentence_id) if self.corpus == "kres" else None srl_links = srl_dict.get(sentence_id) if self.corpus == "kres" else None
srl_links_parsed = self.parse_srl_links(s, srl_links) if srl_links is None:
srl_links_parsed = None
else:
srl_links_parsed = self.parse_srl_links(s, srl_links)
res_dict[sentence_id] = { res_dict[sentence_id] = {
"sid": sentence_id, "sid": sentence_id,
"text": sentence_text, "text": sentence_text,
@ -174,8 +182,7 @@ class Parser():
"jos_links": jos_links, "jos_links": jos_links,
"srl_links": srl_links_parsed "srl_links": srl_links_parsed
} }
print("------------------------------------------------- END") if srl_links is None:
print(res_dict[sentence_id]) self.logger.info("srl_links missing:{}:{}".format(
print("------------------------------------------------- END") sentence_id, res_dict[sentence_id]["text"]))
return # TODO dev break
return res_dict return res_dict

@ -0,0 +1,38 @@
# corpusparser
A tool for parsing ssj500k and Kres into a unified .json format.
## Input:
### ssj500k
To parse ssj500k, point to the monolithic `ssj500k-sl.body.xml` file (tested on ssj500k 2.1).
### Kres
To parse Kres, point to folders:
* Kres folder, containing several (around 20K) .xml files (`F00XXXXX.xml.parsed.xml`).
* Kres SRL folder, containing SRL links for the corresponding F00...xml files (`F00XXXXX.srl.json`).
## Internal data format
This is the internal python dict data format. It can be stored to file as `.json` or stored into a database for application usage.
```python
{
'sid': 'F0034713.5.0',
'text': 'Mednarodni denarni sklad je odobril 30 milijard evrov vredno posojilo Grčiji. ',
'tokens': [
{'text': 'Mednarodni', 'lemma': 'mednaroden', 'msd': 'Ppnmeid', 'word': True, 'tid': 1},
{'text': 'denarni', 'lemma': 'denaren', 'msd': 'Ppnmeid', 'word': True, 'tid': 2},
{'text': 'sklad', 'lemma': 'sklad', 'msd': 'Somei', 'word': True, 'tid': 3},
{'text': 'je', 'lemma': 'biti', 'msd': 'Gp-ste-n', 'word': True, 'tid': 4},
{'text': 'odobril', 'lemma': 'odobriti', 'msd': 'Ggdd-em', 'word': True, 'tid': 5},
{'text': '30', 'lemma': '30', 'msd': 'Kag', 'word': True, 'tid': 6},
{'text': 'milijard', 'lemma': 'milijarda', 'msd': 'Sozmr', 'word': True, 'tid': 7}, # ...
],
'jos_links': [
{'to': 1, 'from': 3, 'afun': 'dol'},
{'to': 2, 'from': 3, 'afun': 'dol'},
{'to': 3, 'from': 5, 'afun': 'ena'}, # ...
],
'srl_links': [
{'to': 3, 'from': 5, 'afun': 'ACT'},
{'to': 7, 'from': 5, 'afun': 'PAT'}
]
}
```

@ -1,3 +0,0 @@
class Sentence():
    """Placeholder for a sentence object; the constructor is not implemented yet."""

    def __init__(self):
        # The original signature omitted `self`, so calling `Sentence()`
        # raised TypeError before the TODO message could be printed.
        print("Sentence __init__(): TODO")

@ -1,2 +1 @@
from corpusparser.Parser import Parser from corpusparser.Parser import Parser
from corpusparser.Sentence import Sentence
Loading…
Cancel
Save