diff --git a/Makefile b/Makefile
index 69ae834..4ace65f 100644
--- a/Makefile
+++ b/Makefile
@@ -6,8 +6,9 @@ json_files: # srl_tagged_files
cd tools; python3 gen_json.py
srl_tagged_files: # tsv_files
- # cd tools/srl-20131216; ./scripts/parse_srl_only_mod.sh; cd -
+# # cd tools/srl-20131216; ./scripts/parse_srl_only_mod.sh; cd -
cd tools/srl-20131216; ./tag_all.sh
+# cd tools/srl-20131216; ./tag_ssj500k2.3.sh
tsv_files: # tools/fillpred_model/model.pickle
cd tools; python3 parse_all.py
diff --git a/README.md b/README.md
index 51252c2..ce7d3e3 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,11 @@
+# Instructions
+To process ssj500k, check out the ssj500k branch.
+The run order is defined in the Makefile; in general it works like this (see the example run below):
+- tools/parse_all.py - creates the mate-tools input file required by the Java-based srl.jar
+- tools/srl-20131216/tag_all.sh - runs SRL tagging on ssj500k
+- tools/gen_json.py - extracts the SRL annotations to JSON
+- tools/gen_tei.py - extracts the SRL annotations to TEI
+
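+A minimal example run, assuming the paths in tools/tools.cfg.ssj500k2.3 and the working
+directories used by the Makefile (a sketch, not a verified end-to-end recipe):
+
+```sh
+cd tools
+python3 parse_all.py                  # TEI -> mate-tools TSV input
+(cd srl-20131216 && ./tag_all.sh)     # SRL tagging (tag_ssj500k2.3.sh reads tools.cfg.ssj500k2.3)
+python3 gen_json.py                   # SRL TSV -> JSON
+python3 gen_tei.py                    # JSON + original TEI -> TEI with SRL <linkGrp>
+```
+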
# cjvt-srl-tagging
We'll be using mate-tools to perform SRL on Kres.
diff --git a/tools/gen_json.py b/tools/gen_json.py
index 1e8f821..fd16547 100644
--- a/tools/gen_json.py
+++ b/tools/gen_json.py
@@ -13,10 +13,10 @@ from multiprocessing import Pool
# parse config
config = configparser.ConfigParser()
-config.read("tools.cfg")
-ORIGPATH = Path(config["tools"]["giga"])
-INPATH = Path(config["tools"]["giga_srl"])
-OUTPATH = Path(config["tools"]["giga_json"])
+config.read("tools.cfg.ssj500k2.3")
+ORIGPATH = Path(config["tools"]["ssj500k_orig_folder"])
+INPATH = Path(config["tools"]["ssj500k_srl"])
+OUTPATH = Path(config["tools"]["ssj500k_json"])
INTERNAL_DATA = Path(config["tools"]["internal_data"])
DEBUG = config["tools"]["debug"] == "True"
CPU_CORES = int(config["tools"]["cpu_cores"])
@@ -143,44 +143,36 @@ def handle_file(whole_input):
print('PAUSE')
-        # look at neighbouring sentences if they are correct
- for i in range(100):
- sentence, sentence_arr = next(gen)
- # orig_sentence = " ".join(token[2] for token in e["tokens"])
- if sentence == orig_val["text"]:
- # if i != 10 and i != 0:
- # print('OK!')
- sid = orig_id
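+        # the SRL output sentences are assumed to line up 1:1 with the originals,
+        # so take the next generated sentence and sanity-check its text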
+ sentence, sentence_arr = next(gen)
+ # orig_sentence = " ".join(token[2] for token in e["tokens"])
+ assert sentence.replace(' ', '') == orig_val['text']
+ # if i != 10 and i != 0:
+ # print('OK!')
+ sid = orig_id
- outdata[sid] = []
+ outdata[sid] = []
- # find all predicate indices in the sentence
- predicates = []
- for token in sentence_arr:
- if token[12] == "Y":
- predicates += [token[0]] # idx
+ # find all predicate indices in the sentence
+ predicates = []
+ for token in sentence_arr:
+ if token[12] == "Y":
+ predicates += [token[0]] # idx
- deprel = get_dep_rel(token)
- if deprel is not None:
- outdata[sid].append(deprel)
+ deprel = get_dep_rel(token)
+ if deprel is not None:
+ outdata[sid].append(deprel)
- # deprel["from"] points to n-th predicate
- # replace with predicate's token index
- for deprel in outdata[sid]:
- deprel["from"] = predicates[deprel["from"]]
+ # deprel["from"] points to n-th predicate
+ # replace with predicate's token index
+ for deprel in outdata[sid]:
+ deprel["from"] = predicates[deprel["from"]]
- if DEBUG:
- print(to_sentence(sentence_arr))
- print(outdata[sid])
- print(sid)
- print()
- print()
- break
- else:
- if i == 99:
- mismatch_sentences += 1
- sid = orig_id
- outdata[sid] = []
- gen = srl_multiple_files_sentences_generator(sentence_id + sentence_i)
+ if DEBUG:
+ print(to_sentence(sentence_arr))
+ print(outdata[sid])
+ print(sid)
+ print()
+ print()
if mismatch_sentences > 0:
if mismatch_sentences / len(orig_dict.items()) < 0.1:
diff --git a/tools/gen_tei.py b/tools/gen_tei.py
new file mode 100644
index 0000000..f62642d
--- /dev/null
+++ b/tools/gen_tei.py
@@ -0,0 +1,47 @@
+# parse config
+import configparser
+import json
+import logging
+import os
+from pathlib import Path
+
+from parser.parser import Parser  # imported as in the other tools/ scripts (run from the tools/ directory)
+
+config = configparser.ConfigParser()
+config.read("tools.cfg.ssj500k2.3")
+ORIGPATH = Path(config["tools"]["ssj500k_orig_folder"])
+JSONPATH = Path(config["tools"]["ssj500k_json"] + '/ssj500k-sl.body.json')
+OUTPATH = Path(config["tools"]["ssj500k_tei"])
+INTERNAL_DATA = Path(config["tools"]["internal_data"])
+DEBUG = config["tools"]["debug"] == "True"
+CPU_CORES = int(config["tools"]["cpu_cores"])
+
+LOGFILE = Path(config["tools"]["logfile"]).absolute()
+LOGFILE.touch(exist_ok=True)
+LOGFILE.resolve()
+
+logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
+
+
+par = Parser()
+OUTPATH.mkdir(exist_ok=True)
+
+jsondata = []
+with open(JSONPATH, 'r') as jf:
+ jsondata = json.load(jf)
+
+logging.info("Generating TEI with annotated SRL.")
+
+def handle_file(file, jsondata):
+    teifile = (ORIGPATH / file)
+    resfile = (OUTPATH / file)  # currently unused; minimize_tei writes a hard-coded output path
+
+    # orig_dict = par.parse_tei(teifile)  # redundant: minimize_tei re-parses the file below
+
+    # origfile = get_origfile()
+    # keep only the sentences present in jsondata and attach the SRL <linkGrp> annotations
+    orig_dict = par.minimize_tei(teifile, jsondata)
+
+origfiles = []
+for subdir, dirs, files in os.walk(ORIGPATH):
+ for file in files:
+ handle_file(file, jsondata)
diff --git a/tools/parse_all.py b/tools/parse_all.py
index 86c3caf..459935a 100644
--- a/tools/parse_all.py
+++ b/tools/parse_all.py
@@ -16,7 +16,8 @@ par = Parser()
# path to data
config = configparser.ConfigParser()
-config.read("tools.cfg")
+# config.read("tools.cfg")
+config.read("tools.cfg.ssj500k2.3")
analysis = ''
if 'kres_orig' in config["tools"]:
analysis = 'kres'
@@ -31,6 +32,14 @@ elif 'giga_orig' in config["tools"]:
OUTDIR = Path(config["tools"]["giga_tsv"])
GIGA_PARTS = int(config["tools"]["giga_parts"])
INTERNAL_DATA = config["tools"]["internal_data"]
+elif 'ssj500k_orig' in config["tools"]:
+ # analysis = 'gigafida'
+ analysis = 'ssj500k'
+ INDIR_SSJ500K_ORIG = Path(config["tools"]["ssj500k"])
+ INDIR_SSJ500K = Path(config["tools"]["ssj500k_orig"])
+ INDIR_JOS = Path(config["tools"]["ssj500k_jos"])
+ OUTDIR = Path(config["tools"]["ssj500k_tsv"])
+ INTERNAL_DATA = config["tools"]["internal_data"]
CPU_CORES = int(config["tools"]["cpu_cores"])
@@ -49,40 +58,40 @@ print("end parsing ssj")
"""
# kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
-OUTDIR.mkdir(exist_ok=True)
+# OUTDIR.mkdir(exist_ok=True)
if analysis == 'kres':
infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()]))
logging.info("Parsing kres: {} files.".format(len(infiles)))
-def handle_file(infile):
- i = infile[0]
- kres_file = infile[1]
- outfile = (OUTDIR / kres_file.name).with_suffix(".tsv")
+
+def handle_ssj500k_file():
+ kres_file = INDIR_SSJ500K_ORIG
+ outfile = OUTDIR
if outfile.is_file():
logging.info("Skipping existing file: {}.".format(str(kres_file)))
return True
- try:
- res_dict = par.parse_tei(kres_file)
- kres_out_str = ""
- for _, sentence in res_dict.items():
- kres_out_str += par.to_conll_2009_SRL(sentence)
- except Exception as exc:
- logging.info("Failed processing file: {}".format(str(kres_file)))
- logging.error(exc)
- return False
+ # try:
+ res_dict = par.parse_tei(kres_file)
+ kres_out_str = ""
+ for _, sentence in res_dict.items():
+ kres_out_str += par.to_conll_2009_SRL(sentence)
+ # except Exception as exc:
+ # logging.info("Failed processing file: {}".format(str(kres_file)))
+ # logging.error(exc)
+ # return False
with outfile.open("wb+") as fp:
fp.write(kres_out_str.encode("utf-8"))
- logging.info("Processed file ({}/{}): {}".format(i+1, len(infiles), str(kres_file)))
+ # logging.info("Processed file ({}/{}): {}".format(i+1, len(infiles), str(kres_file)))
return True
return False
-def giga_orig_generator():
- with open(INDIR_GIGA, 'r') as gof:
+def ssj500k_orig_generator():
+ with open(INDIR_SSJ500K, 'r') as gof:
previous_new_line = False
for l_gof in gof:
if l_gof == '\n':
@@ -104,12 +113,6 @@ def handle_gigafida_file():
# pass
# num_lines = i + 1
# print(num_lines)
- num_lines = 1393184026
- # 1393184026
- # 1393184033
- # return
- num_lines_per_part = num_lines / GIGA_PARTS
- curr_part = 0
gof_generator = giga_orig_generator()
# with open(INDIR_GIGA, 'r') as gof:
with open(INDIR_JOS, 'r') as gjf:
@@ -168,6 +171,70 @@ def handle_gigafida_file():
curr_part += 1
wf.close()
+
+def handle_ssj500k_file2():
+    """
+    Split the big input text file into smaller files, breaking only on empty lines.
+    """
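+    # NOTE: this helper does not appear to be called anywhere, and it still uses
+    # curr_part / num_lines_per_part, which are no longer defined after the cleanup
+    # in handle_gigafida_file above; calling it as-is would raise a NameError.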
+ gof_generator = ssj500k_orig_generator()
+ # with open(INDIR_GIGA, 'r') as gof:
+ with open(INDIR_JOS, 'r') as gjf:
+ sentence = {}
+ sentence['tokens'] = []
+ sentence['links'] = {}
+ if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % 0)):
+ ignore_lines = True
+ wf = False
+ else:
+ wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
+ ignore_lines = False
+ # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
+ for i, l_gjf in enumerate(gjf):
+ l_gof = next(gof_generator)
+ if ignore_lines:
+ if i > num_lines_per_part * curr_part and l_gof == '\n':
+ if not os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 2))):
+ ignore_lines = False
+ # delete last file (probably not whole)
+ os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 1)))
+ if ignore_lines:
+ print(curr_part)
+ curr_part += 1
+ continue
+ else:
+ continue
+ l_gof_split = l_gof.split('\t')
+ l_gjf_split = l_gjf.split('\t')
+
+ # if punctuation
+ if l_gof != '\n':
+ if l_gof_split[1][-1] == 'u':
+ # print(l_gjf_split)
+ sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
+ else:
+ sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
+
+ sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
+
+ # if l_gof == '\n':
+ else:
+ if wf:
+ # print(i)
+ wf.write(par.to_conll_2009_SRL(sentence))
+ sentence['tokens'] = []
+ sentence['links'] = {}
+ # wf.flush()
+ # if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
+ if i > num_lines_per_part * (curr_part + 1):
+ curr_part += 1
+ # if wf doesn't exist (first one)
+ if wf:
+ wf.close()
+ wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
+ curr_part += 1
+ wf.close()
+
+
import time
def handle_giga_file(ran):
"""
@@ -347,31 +414,9 @@ def handle_giga_file_selected_sentences(error_sentences):
# curr_part += 1
wf.close()
-file_indices = set(range(0, 100000))
-with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'rb') as pkl_file:
- file_indices = set(pickle.load(pkl_file))
-with Pool(CPU_CORES) as p:
- if analysis == 'kres':
- p.map(handle_file, infiles)
- elif analysis == 'gigafida':
- handle_gigafida_file()
- elif analysis == 'giga':
- final_range = [0, 100000]
- size_per_proc = (final_range[1] - final_range[0]) / CPU_CORES
- # splits = [int(final_range[0] + size_per_proc) for i in range(CPU_CORES)]
- ranges = []
- ps = None
- for i in range(CPU_CORES):
- s = int(final_range[0] + size_per_proc * i)
- ns = int(final_range[0] + size_per_proc * (i + 1))
- ranges.append([s, ns])
- # ranges = [[0, 1]]
- # p.map(handle_giga_file, ranges)
- # p.map(handle_giga_file, ranges)
- error_sentences = [line.rstrip('\n') for line in open(os.path.join(INTERNAL_DATA, 'sentences_with_less_than_token.txt'))]
- handle_giga_file_selected_sentences(set(error_sentences))
+handle_ssj500k_file()
logging.info("end parsing kres")
diff --git a/tools/parser/parser.py b/tools/parser/parser.py
index 80e353f..f18b9b2 100644
--- a/tools/parser/parser.py
+++ b/tools/parser/parser.py
@@ -1,3 +1,5 @@
+import copy
+
from lxml import etree
import re
from parser.msd.msdmap import Msdmap
@@ -5,6 +7,7 @@ import pickle
from pathlib import Path
from fillpred_model.step1 import build_model_row
import sys
+import xml.etree.ElementTree as ET
class Parser:
# reads a TEI xml file and returns a dictionary:
@@ -29,17 +32,23 @@ class Parser:
def parse_tei(self, filepath):
def parse_links(s_el):
- lgrps = s_el.findall(".//links")
+ sent_id = '#' + s_el.get('id')
+ lgrps = s_el.findall(".//linkGrp")
if len(lgrps) < 1:
raise IOError("Can't find links.")
res_links = {}
- for link in lgrps[0]:
- dep = int(link.get("dep").split(".")[-1])
- res_links[dep] = (
- link.get("afun"),
- dep,
- int(link.get("from").split(".")[-1]),
- )
+ for lgrp in lgrps:
+ if lgrp.get("type") == "JOS-SYN":
+ for link in lgrp:
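+                        # ana carries the syntactic label (only the part after ':' is kept);
+                        # target is "<head-id> <dep-id>", and a head equal to the sentence id
+                        # marks the token as root (head index 0)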
+ jos_type = link.get("ana").split(":")[-1]
+ link_data = link.get("target").split(" ")
+ link_from = int(link_data[1].split('.')[-1][1:])
+ link_to = int(link_data[0].split('.')[-1][1:]) if sent_id != link_data[0] else 0
+ res_links[link_from] = (
+ jos_type,
+ link_from,
+ link_to,
+ )
return res_links
guess_corpus = None # SSJ | KRES
@@ -79,6 +88,11 @@ class Parser:
# parse sentences
for s in p.findall(".//s"):
+                # keep only sentences that have both JOS-SYN and UD-SYN annotations and no SRL yet
+ sent_annot_type_list = [links.get('type') for links in s.findall(".//linkGrp")]
+ if 'JOS-SYN' not in sent_annot_type_list or 'UD-SYN' not in sent_annot_type_list or 'SRL' in sent_annot_type_list:
+ continue
+
s_id = s.get("id").split(".")[-1]
sentence_text = ""
sentence_list = []
@@ -87,21 +101,29 @@ class Parser:
# parse tokens
for el in s.iter():
if el.tag in self.W_TAGS:
- if guess_corpus != "GIGA":
- el_id = el.get("id").split(".")[-1]
- if el_id[0] == 't':
- el_id = el_id[1:] # ssj W_TAG ids start with t
- sentence_text += el.text
- sentence_tokens += [(
- "w",
- int(el_id),
- el.text,
- el.get("lemma"),
- (el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA"
- else el.get("ana").split(":")[-1]),
- )]
- else:
- sentence_list.append(el.text)
+ el_id = el.get("id").split(".")[-1]
+ if el_id[0] == 't':
+ el_id = el_id[1:] # ssj W_TAG ids start with t
+ sentence_text += el.text
+ uPosTag = None
+ uPosFeats = []
+ for msd_el in el.get("msd").split('|'):
+ key, val = msd_el.split('=')
+ if key == 'UPosTag':
+ uPosTag = val
+ else:
+ uPosFeats.append(msd_el)
+ uPosFeats = '|'.join(uPosFeats)
+ sentence_tokens += [(
+ "w",
+ int(el_id),
+ el.text,
+ el.get("lemma"),
+ (el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA"
+ else el.get("ana").split(":")[-1]),
+ uPosTag,
+ uPosFeats
+ )]
elif el.tag in self.C_TAGS:
# only Kres' C_TAGS have ids
if guess_corpus != "GIGA":
@@ -110,33 +132,243 @@ class Parser:
sentence_text += el.text
sentence_tokens += [("c", el_id, el.text,)]
elif el.tag in self.S_TAGS:
- # Kres' doesn't contain .text
- if guess_corpus == "GIGA":
- sentence_list.append(el.text)
- else:
- sentence_text += " "
+ el_id = el.get("id").split(".")[-1]
+ if el_id[0] == 't':
+ el_id = el_id[1:] # ssj W_TAG ids start with t
+ sentence_text += el.text
+ uPosTag = None
+ uPosFeats = []
+ for msd_el in el.get("msd").split('|'):
+ key, val = msd_el.split('=')
+ if key == 'UPosTag':
+ uPosTag = val
+ else:
+ uPosFeats.append(msd_el)
+ uPosFeats = '|'.join(uPosFeats)
+ sentence_tokens += [(
+ "pc",
+ int(el_id),
+ el.text,
+ el.text,
+ (el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA"
+ else el.get("ana").split(":")[-1]),
+ uPosTag,
+ uPosFeats
+ )]
else:
# pass links and linkGroups
pass
- sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
+ sentence_id = s.get("id")
if sentence_id in res_dict:
raise KeyError("duplicated id: {}".format(sentence_id))
- if guess_corpus == "GIGA":
- res_dict[sentence_id] = {
- "sid": sentence_id,
- "text": ' '.join(sentence_list),
- "tokens": None,
- "links": None
- }
- else:
- res_dict[sentence_id] = {
- "sid": sentence_id,
- "text": sentence_text,
- "tokens": sentence_tokens,
- "links": (
- parse_links(s) if guess_corpus == "KRES" else None
- )
- }
+
+ res_dict[sentence_id] = {
+ "sid": sentence_id,
+ "text": sentence_text,
+ "tokens": sentence_tokens,
+ "links": (
+ parse_links(s)
+ )
+ }
+ fp.close()
+ return res_dict
+
+
+ def minimize_tei(self, filepath, jsondata):
+ def set_xml_attr(node, attribute, value):
+ node.attrib['{http://www.w3.org/XML/1998/namespace}' + attribute] = value
+
+ def parse_links(s_el):
+ sent_id = '#' + s_el.get('id')
+ lgrps = s_el.findall(".//linkGrp")
+ if len(lgrps) < 1:
+ raise IOError("Can't find links.")
+ res_links = {}
+ for lgrp in lgrps:
+ if lgrp.get("type") == "JOS-SYN":
+ for link in lgrp:
+ jos_type = link.get("ana").split(":")[-1]
+ link_data = link.get("target").split(" ")
+ link_from = int(link_data[1].split('.')[-1][1:])
+ link_to = int(link_data[0].split('.')[-1][1:]) if sent_id != link_data[0] else 0
+ res_links[link_from] = (
+ jos_type,
+ link_from,
+ link_to,
+ )
+ return res_links
+
+ guess_corpus = None # SSJ | KRES
+ res_dict = {}
+ # with filepath.open("rb") as fp, open("../data/ssj500k2.3/final_tei/res.xml", 'w') as sf:
+ with filepath.open("rb") as fp:
+ used_ssj_documents = set([k.split('.')[0] for k, v in jsondata.items()])
+ used_ssj_paragraphs = set(['.'.join(k.split('.')[:-1]) for k, v in jsondata.items()])
+ used_ssj_sentences = set([k for k, v in jsondata.items()])
+
+ ET.register_namespace("", "http://www.tei-c.org/ns/1.0")
+ tree = ET.parse(fp)
+ root_res = tree.getroot()
+ # root_res = copy.deepcopy(root)
+ ns = '{http://www.w3.org/XML/1998/namespace}'
+ ns2 = '{http://www.tei-c.org/ns/1.0}'
+
+ for doc in list(root_res):
+ doc_id = doc.get(ns + 'id')
+ if doc_id not in used_ssj_documents:
+ root_res.remove(doc)
+ continue
+
+ for par in list(doc):
+ par_id = par.get(ns + 'id')
+ if par_id not in used_ssj_paragraphs:
+ if par.tag != ns2 + 'bibl':
+ doc.remove(par)
+ continue
+
+ for sen in list(par):
+ sen_id = sen.get(ns + 'id')
+ if sen_id not in used_ssj_sentences:
+ par.remove(sen)
+ continue
+
+ linkGrp = ET.Element(f'{ns2}linkGrp')
+
+                        linkGrp.attrib['targFunc'] = 'head argument'
+                        linkGrp.attrib['type'] = 'SRL'
+
+ for srl_el in jsondata[sen_id]:
+ link = ET.Element(f'{ns2}link')
+ link.attrib['ana'] = f'srl:{srl_el["arg"]}'
+ link.attrib['target'] = f'#{sen_id}.t{srl_el["from"]} #{sen_id}.t{srl_el["dep"]}'
+ linkGrp.append(link)
+ sen.append(linkGrp)
+
+
+
+ # sf.write(etree.tostring(tree, pretty_print=True, encoding='utf-8').decode())
+ tree.write("../data/ssj500k2.3/final_tei/res.xml", encoding='utf-8')
+
+ return
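+        # NOTE: everything below this return is unreachable; it is a leftover copy of
+        # parse_tei plus the old write-out code.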
+ divs = [] # in ssj, there are divs, in Kres, there are separate files
+ if "id" in root.keys():
+ # Kres files start with
+ if root.get("id")[0:2] == 'GF':
+ guess_corpus = "GIGA"
+ else:
+ guess_corpus = "KRES"
+ divs = [root]
+ else:
+ guess_corpus = "SSJ"
+ divs = root.findall(".//div")
+
+ # parse divs
+ for div in divs:
+ f_id = div.get("id")
+
+ if guess_corpus == "GIGA":
+ div = div.findall(".//body")[0]
+
+ # parse paragraphs
+ for p in div.findall(".//p"):
+ p_id = p.get("id").split(".")[-1]
+
+ # parse sentences
+ for s in p.findall(".//s"):
+ # test if sentence has jos-syn annotations and doesn't have SRL
+ sent_annot_type_list = [links.get('type') for links in s.findall(".//linkGrp")]
+ if 'JOS-SYN' not in sent_annot_type_list or 'UD-SYN' not in sent_annot_type_list or 'SRL' in sent_annot_type_list:
+ del s
+ continue
+
+ s_id = s.get("id").split(".")[-1]
+ sentence_text = ""
+ sentence_list = []
+ sentence_tokens = []
+
+ # parse tokens
+ for el in s.iter():
+ if el.tag in self.W_TAGS:
+ el_id = el.get("id").split(".")[-1]
+ if el_id[0] == 't':
+ el_id = el_id[1:] # ssj W_TAG ids start with t
+ sentence_text += el.text
+ uPosTag = None
+ uPosFeats = []
+ for msd_el in el.get("msd").split('|'):
+ key, val = msd_el.split('=')
+ if key == 'UPosTag':
+ uPosTag = val
+ else:
+ uPosFeats.append(msd_el)
+ uPosFeats = '|'.join(uPosFeats)
+ sentence_tokens += [(
+ "w",
+ int(el_id),
+ el.text,
+ el.get("lemma"),
+ (el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA"
+ else el.get("ana").split(":")[-1]),
+ uPosTag,
+ uPosFeats
+ )]
+ elif el.tag in self.C_TAGS:
+ # only Kres' C_TAGS have ids
+ if guess_corpus != "GIGA":
+ el_id = el.get("id") or "none"
+ el_id = el_id.split(".")[-1]
+ sentence_text += el.text
+ sentence_tokens += [("c", el_id, el.text,)]
+ elif el.tag in self.S_TAGS:
+ el_id = el.get("id").split(".")[-1]
+ if el_id[0] == 't':
+ el_id = el_id[1:] # ssj W_TAG ids start with t
+ sentence_text += el.text
+ uPosTag = None
+ uPosFeats = []
+ for msd_el in el.get("msd").split('|'):
+ key, val = msd_el.split('=')
+ if key == 'UPosTag':
+ uPosTag = val
+ else:
+ uPosFeats.append(msd_el)
+ uPosFeats = '|'.join(uPosFeats)
+ sentence_tokens += [(
+ "pc",
+ int(el_id),
+ el.text,
+ el.text,
+ (el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA"
+ else el.get("ana").split(":")[-1]),
+ uPosTag,
+ uPosFeats
+ )]
+ else:
+ # pass links and linkGroups
+ pass
+ sentence_id = s.get("id")
+ if sentence_id in res_dict:
+ raise KeyError("duplicated id: {}".format(sentence_id))
+
+ res_dict[sentence_id] = {
+ "sid": sentence_id,
+ "text": sentence_text,
+ "tokens": sentence_tokens,
+ "links": (
+ parse_links(s)
+ )
+ }
+ et = etree.ElementTree(root)
+ et.write("../data/ssj500k2.3/final_tei/res.xml", pretty_print=True, encoding='unicode')
fp.close()
return res_dict
@@ -157,12 +389,8 @@ class Parser:
# handle stop signs
if token[0] != "w":
- out_str += '\t'.join(
- [t_id] +
- [form for x in range(7)] +
- ["0", "0", "modra", "modra", "_", "_"] +
- ["\n"]
- )
+ out_list = [t_id] + [form for x in range(7)] + ["0", "0", "modra", "modra", "_", "_"] + ["\n"]
+ out_str += '\t'.join(map(str, out_list))
continue
pos = self.msdmap.slo_msd_to_eng_pos(token[4])
diff --git a/tools/srl-20131216/tag_ssj500k2.3.sh b/tools/srl-20131216/tag_ssj500k2.3.sh
new file mode 100755
index 0000000..93d86f7
--- /dev/null
+++ b/tools/srl-20131216/tag_ssj500k2.3.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+# parsing tools.cfg values
+IN_FOLDER="../$(sed -n -e 's/^\s*ssj500k_tsv_folder\s*=\s*//p' ../tools.cfg.ssj500k2.3)"
+IN_FOLDER=$IN_FOLDER$1
+echo "input folder: $IN_FOLDER"
+OUT_FOLDER="../$(sed -n -e 's/^\s*ssj500k_srl\s*=\s*//p' ../tools.cfg.ssj500k2.3)"
+echo "output folder: $OUT_FOLDER"
+
+SUFFIX="srl.tsv"
+
+mkdir -p $OUT_FOLDER
+# rm $OUT_FOLDER/*${SUFFIX} &> /dev/null
+
+for infile in $IN_FOLDER/*; do
+ echo "Tagging: ${infile}"
+ base=$(basename $infile | cut -d'.' -f1)
+ outfile=${OUT_FOLDER}/${base}.${SUFFIX}
+
+ # mate-tools tagger
+ ./scripts/parse_srl_only_mod.sh $infile $outfile
+
+ if [ $? -eq 0 ]; then
+ echo "Saved as ${outfile}"
+ else
+ echo "ERR"
+ exit 1
+ fi
+done
+
diff --git a/tools/tools.cfg b/tools/tools.cfg
index fb538df..49f6e52 100644
--- a/tools/tools.cfg
+++ b/tools/tools.cfg
@@ -1,18 +1,13 @@
[tools]
-giga = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_orig
-giga_orig = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.patch0001
-; giga_orig_old = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup
-giga_jos = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.jos.patch0001
-giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf_files_part
-; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP
-; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_files_copy
-; giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP
-giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl
-giga_srl_errors = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl_errors/giga_errors.srl.tsv
-; giga_json = /media/luka/Portable Disk/Datasets/gigafida_jos/final_json_TEMP
-giga_json = /media/luka/Portable Disk/Datasets/gigafida_jos/final_json
-internal_data = /media/luka/Portable Disk/Datasets/gigafida_jos/internal_data
+giga = ../data/gf_example/gf2_orig
+giga_orig = ../data/gf_example/gf2-dedup.patch0001
+giga_jos = ../data/gf_example/gf2-dedup.jos.patch0001
+giga_tsv = ../data/gf_example/gf_files_part
+giga_srl = ../data/gf_example/2_srl
+;giga_srl_errors = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl_errors/giga_errors.srl.tsv
+giga_json = ../data/gf_example/final_json
+internal_data = ../data/gf_example/internal_data
giga_parts = 100000
-logfile = ../progress.log
-cpu_cores = 16
-debug = False
+logfile = ../data/gf_example/progress.log
+cpu_cores = 1
+debug = True
diff --git a/tools/tools.cfg.ssj500k2.3 b/tools/tools.cfg.ssj500k2.3
new file mode 100644
index 0000000..ef7bf5a
--- /dev/null
+++ b/tools/tools.cfg.ssj500k2.3
@@ -0,0 +1,15 @@
+[tools]
+ssj500k = ../data/ssj500k2.3/orig/ssj500k-sl.body.xml
+ssj500k_orig = ../data/ssj500k2.3/orig/ssj500k-sl.body.xml
+ssj500k_orig_folder = ../data/ssj500k2.3/orig
+ssj500k_jos = ../data/ssj500k2.3/orig/ssj500k-sl.body.xml
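+; ssj500k, ssj500k_orig and ssj500k_jos currently all point to the same TEI body file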
+ssj500k_tsv = ../data/ssj500k2.3/tsvs/tsvs.tsv
+ssj500k_tsv_folder = ../data/ssj500k2.3/tsvs
+ssj500k_srl = ../data/ssj500k2.3/srls
+ssj500k_json = ../data/ssj500k2.3/final_json
+ssj500k_tei = ../data/ssj500k2.3/final_tei
+internal_data = ../data/ssj500k2.3/internal_data
+;internal_data = ../data/gf_example/internal_data
+logfile = ../data/ssj500k2.3/progress.log
+cpu_cores = 1
+debug = True