Adapted code to ssj500k and added its branch

This commit is contained in:
Luka 2022-03-14 11:01:53 +01:00
parent c1ecc4cdbc
commit fd20295017
9 changed files with 513 additions and 152 deletions

View File

@ -6,8 +6,9 @@ json_files: # srl_tagged_files
cd tools; python3 gen_json.py
srl_tagged_files: # tsv_files
# cd tools/srl-20131216; ./scripts/parse_srl_only_mod.sh; cd -
# # cd tools/srl-20131216; ./scripts/parse_srl_only_mod.sh; cd -
cd tools/srl-20131216; ./tag_all.sh
# cd tools/srl-20131216; ./tag_ssj500k2.3.sh
tsv_files: # tools/fillpred_model/model.pickle
cd tools; python3 parse_all.py

View File

@ -1,3 +1,11 @@
# Instructions
To mine ssj500k, <b>check out the ssj500k branch</b>.
For the order in which to run things, see the Makefile. Generally it works like this:
- tools/parse_all.py - It creates mate file that is necessary for running Java based srl.jar
- tools/srl-20131216/tag_all.sh - Tags ssj500k
- tools/gen_json.py - Mine SRL to json
- tools/gen_tei.py - Mine SRL to tei
# cjvt-srl-tagging
We'll be using mate-tools to perform SRL on Kres.

View File

@ -13,10 +13,10 @@ from multiprocessing import Pool
# parse config
config = configparser.ConfigParser()
config.read("tools.cfg")
ORIGPATH = Path(config["tools"]["giga"])
INPATH = Path(config["tools"]["giga_srl"])
OUTPATH = Path(config["tools"]["giga_json"])
config.read("tools.cfg.ssj500k2.3")
ORIGPATH = Path(config["tools"]["ssj500k_orig_folder"])
INPATH = Path(config["tools"]["ssj500k_srl"])
OUTPATH = Path(config["tools"]["ssj500k_json"])
INTERNAL_DATA = Path(config["tools"]["internal_data"])
DEBUG = config["tools"]["debug"] == "True"
CPU_CORES = int(config["tools"]["cpu_cores"])
@ -143,44 +143,36 @@ def handle_file(whole_input):
print('PAUSE')
# look at neighbouring sentences if they are correct
for i in range(100):
sentence, sentence_arr = next(gen)
# orig_sentence = " ".join(token[2] for token in e["tokens"])
if sentence == orig_val["text"]:
# if i != 10 and i != 0:
# print('OK!')
sid = orig_id
sentence, sentence_arr = next(gen)
# orig_sentence = " ".join(token[2] for token in e["tokens"])
assert sentence.replace(' ', '') == orig_val['text']
# if i != 10 and i != 0:
# print('OK!')
sid = orig_id
outdata[sid] = []
outdata[sid] = []
# find all predicate indices in the sentence
predicates = []
for token in sentence_arr:
if token[12] == "Y":
predicates += [token[0]] # idx
# find all predicate indices in the sentence
predicates = []
for token in sentence_arr:
if token[12] == "Y":
predicates += [token[0]] # idx
deprel = get_dep_rel(token)
if deprel is not None:
outdata[sid].append(deprel)
deprel = get_dep_rel(token)
if deprel is not None:
outdata[sid].append(deprel)
# deprel["from"] points to n-th predicate
# replace with predicate's token index
for deprel in outdata[sid]:
deprel["from"] = predicates[deprel["from"]]
# deprel["from"] points to n-th predicate
# replace with predicate's token index
for deprel in outdata[sid]:
deprel["from"] = predicates[deprel["from"]]
if DEBUG:
print(to_sentence(sentence_arr))
print(outdata[sid])
print(sid)
print()
print()
break
else:
if i == 99:
mismatch_sentences += 1
sid = orig_id
outdata[sid] = []
gen = srl_multiple_files_sentences_generator(sentence_id + sentence_i)
if DEBUG:
print(to_sentence(sentence_arr))
print(outdata[sid])
print(sid)
print()
print()
if mismatch_sentences > 0:
if mismatch_sentences / len(orig_dict.items()) < 0.1:

47
tools/gen_tei.py Normal file
View File

@ -0,0 +1,47 @@
# parse config
import configparser
import json
import logging
import os
from pathlib import Path
from tools.parser.parser import Parser
config = configparser.ConfigParser()
config.read("tools.cfg.ssj500k2.3")
ORIGPATH = Path(config["tools"]["ssj500k_orig_folder"])
JSONPATH = Path(config["tools"]["ssj500k_json"] + '/ssj500k-sl.body.json')
OUTPATH = Path(config["tools"]["ssj500k_tei"])
INTERNAL_DATA = Path(config["tools"]["internal_data"])
DEBUG = config["tools"]["debug"] == "True"
CPU_CORES = int(config["tools"]["cpu_cores"])
LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
LOGFILE.resolve()
logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)
par = Parser()
OUTPATH.mkdir(exist_ok=True)
jsondata = []
with open(JSONPATH, 'r') as jf:
jsondata = json.load(jf)
logging.info("Generating TEI with annotated SRL.")
def handle_file(file, jsondata):
teifile = (ORIGPATH / file)
resfile = (OUTPATH / file)
orig_dict = par.parse_tei(teifile)
# origfile = get_origfile()
orig_dict = par.minimize_tei(teifile, jsondata)
origfiles = []
for subdir, dirs, files in os.walk(ORIGPATH):
for file in files:
handle_file(file, jsondata)

View File

@ -16,7 +16,8 @@ par = Parser()
# path to data
config = configparser.ConfigParser()
config.read("tools.cfg")
# config.read("tools.cfg")
config.read("tools.cfg.ssj500k2.3")
analysis = ''
if 'kres_orig' in config["tools"]:
analysis = 'kres'
@ -31,6 +32,14 @@ elif 'giga_orig' in config["tools"]:
OUTDIR = Path(config["tools"]["giga_tsv"])
GIGA_PARTS = int(config["tools"]["giga_parts"])
INTERNAL_DATA = config["tools"]["internal_data"]
elif 'ssj500k_orig' in config["tools"]:
# analysis = 'gigafida'
analysis = 'ssj500k'
INDIR_SSJ500K_ORIG = Path(config["tools"]["ssj500k"])
INDIR_SSJ500K = Path(config["tools"]["ssj500k_orig"])
INDIR_JOS = Path(config["tools"]["ssj500k_jos"])
OUTDIR = Path(config["tools"]["ssj500k_tsv"])
INTERNAL_DATA = config["tools"]["internal_data"]
CPU_CORES = int(config["tools"]["cpu_cores"])
@ -49,40 +58,40 @@ print("end parsing ssj")
"""
# kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
OUTDIR.mkdir(exist_ok=True)
# OUTDIR.mkdir(exist_ok=True)
if analysis == 'kres':
infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()]))
logging.info("Parsing kres: {} files.".format(len(infiles)))
def handle_file(infile):
i = infile[0]
kres_file = infile[1]
outfile = (OUTDIR / kres_file.name).with_suffix(".tsv")
def handle_ssj500k_file():
kres_file = INDIR_SSJ500K_ORIG
outfile = OUTDIR
if outfile.is_file():
logging.info("Skipping existing file: {}.".format(str(kres_file)))
return True
try:
res_dict = par.parse_tei(kres_file)
kres_out_str = ""
for _, sentence in res_dict.items():
kres_out_str += par.to_conll_2009_SRL(sentence)
except Exception as exc:
logging.info("Failed processing file: {}".format(str(kres_file)))
logging.error(exc)
return False
# try:
res_dict = par.parse_tei(kres_file)
kres_out_str = ""
for _, sentence in res_dict.items():
kres_out_str += par.to_conll_2009_SRL(sentence)
# except Exception as exc:
# logging.info("Failed processing file: {}".format(str(kres_file)))
# logging.error(exc)
# return False
with outfile.open("wb+") as fp:
fp.write(kres_out_str.encode("utf-8"))
logging.info("Processed file ({}/{}): {}".format(i+1, len(infiles), str(kres_file)))
# logging.info("Processed file ({}/{}): {}".format(i+1, len(infiles), str(kres_file)))
return True
return False
def giga_orig_generator():
with open(INDIR_GIGA, 'r') as gof:
def ssj500k_orig_generator():
with open(INDIR_SSJ500K, 'r') as gof:
previous_new_line = False
for l_gof in gof:
if l_gof == '\n':
@ -104,12 +113,6 @@ def handle_gigafida_file():
# pass
# num_lines = i + 1
# print(num_lines)
num_lines = 1393184026
# 1393184026
# 1393184033
# return
num_lines_per_part = num_lines / GIGA_PARTS
curr_part = 0
gof_generator = giga_orig_generator()
# with open(INDIR_GIGA, 'r') as gof:
with open(INDIR_JOS, 'r') as gjf:
@ -168,6 +171,70 @@ def handle_gigafida_file():
curr_part += 1
wf.close()
def handle_ssj500k_file2():
"""
File that splits big text file into more minor files. Only split on empty lines.
"""
gof_generator = ssj500k_orig_generator()
# with open(INDIR_GIGA, 'r') as gof:
with open(INDIR_JOS, 'r') as gjf:
sentence = {}
sentence['tokens'] = []
sentence['links'] = {}
if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % 0)):
ignore_lines = True
wf = False
else:
wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
ignore_lines = False
# for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)):
for i, l_gjf in enumerate(gjf):
l_gof = next(gof_generator)
if ignore_lines:
if i > num_lines_per_part * curr_part and l_gof == '\n':
if not os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 2))):
ignore_lines = False
# delete last file (probably not whole)
os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 1)))
if ignore_lines:
print(curr_part)
curr_part += 1
continue
else:
continue
l_gof_split = l_gof.split('\t')
l_gjf_split = l_gjf.split('\t')
# if punctuation
if l_gof != '\n':
if l_gof_split[1][-1] == 'u':
# print(l_gjf_split)
sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1]))
else:
sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1]))
sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6])
# if l_gof == '\n':
else:
if wf:
# print(i)
wf.write(par.to_conll_2009_SRL(sentence))
sentence['tokens'] = []
sentence['links'] = {}
# wf.flush()
# if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n':
if i > num_lines_per_part * (curr_part + 1):
curr_part += 1
# if wf doesn't exist (first one)
if wf:
wf.close()
wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a')
curr_part += 1
wf.close()
import time
def handle_giga_file(ran):
"""
@ -347,31 +414,9 @@ def handle_giga_file_selected_sentences(error_sentences):
# curr_part += 1
wf.close()
file_indices = set(range(0, 100000))
with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'rb') as pkl_file:
file_indices = set(pickle.load(pkl_file))
with Pool(CPU_CORES) as p:
if analysis == 'kres':
p.map(handle_file, infiles)
elif analysis == 'gigafida':
handle_gigafida_file()
elif analysis == 'giga':
final_range = [0, 100000]
size_per_proc = (final_range[1] - final_range[0]) / CPU_CORES
# splits = [int(final_range[0] + size_per_proc) for i in range(CPU_CORES)]
ranges = []
ps = None
for i in range(CPU_CORES):
s = int(final_range[0] + size_per_proc * i)
ns = int(final_range[0] + size_per_proc * (i + 1))
ranges.append([s, ns])
# ranges = [[0, 1]]
# p.map(handle_giga_file, ranges)
# p.map(handle_giga_file, ranges)
error_sentences = [line.rstrip('\n') for line in open(os.path.join(INTERNAL_DATA, 'sentences_with_less_than_token.txt'))]
handle_giga_file_selected_sentences(set(error_sentences))
handle_ssj500k_file()
logging.info("end parsing kres")

View File

@ -1,3 +1,5 @@
import copy
from lxml import etree
import re
from parser.msd.msdmap import Msdmap
@ -5,6 +7,7 @@ import pickle
from pathlib import Path
from fillpred_model.step1 import build_model_row
import sys
import xml.etree.ElementTree as ET
class Parser:
# reads a TEI xml file and returns a dictionary:
@ -29,17 +32,23 @@ class Parser:
def parse_tei(self, filepath):
def parse_links(s_el):
lgrps = s_el.findall(".//links")
sent_id = '#' + s_el.get('id')
lgrps = s_el.findall(".//linkGrp")
if len(lgrps) < 1:
raise IOError("Can't find links.")
res_links = {}
for link in lgrps[0]:
dep = int(link.get("dep").split(".")[-1])
res_links[dep] = (
link.get("afun"),
dep,
int(link.get("from").split(".")[-1]),
)
for lgrp in lgrps:
if lgrp.get("type") == "JOS-SYN":
for link in lgrp:
jos_type = link.get("ana").split(":")[-1]
link_data = link.get("target").split(" ")
link_from = int(link_data[1].split('.')[-1][1:])
link_to = int(link_data[0].split('.')[-1][1:]) if sent_id != link_data[0] else 0
res_links[link_from] = (
jos_type,
link_from,
link_to,
)
return res_links
guess_corpus = None # SSJ | KRES
@ -79,6 +88,11 @@ class Parser:
# parse sentences
for s in p.findall(".//s"):
# test if sentence has jos-syn annotations and doesn't have SRL
sent_annot_type_list = [links.get('type') for links in s.findall(".//linkGrp")]
if 'JOS-SYN' not in sent_annot_type_list or 'UD-SYN' not in sent_annot_type_list or 'SRL' in sent_annot_type_list:
continue
s_id = s.get("id").split(".")[-1]
sentence_text = ""
sentence_list = []
@ -87,21 +101,29 @@ class Parser:
# parse tokens
for el in s.iter():
if el.tag in self.W_TAGS:
if guess_corpus != "GIGA":
el_id = el.get("id").split(".")[-1]
if el_id[0] == 't':
el_id = el_id[1:] # ssj W_TAG ids start with t
sentence_text += el.text
sentence_tokens += [(
"w",
int(el_id),
el.text,
el.get("lemma"),
(el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA"
else el.get("ana").split(":")[-1]),
)]
else:
sentence_list.append(el.text)
el_id = el.get("id").split(".")[-1]
if el_id[0] == 't':
el_id = el_id[1:] # ssj W_TAG ids start with t
sentence_text += el.text
uPosTag = None
uPosFeats = []
for msd_el in el.get("msd").split('|'):
key, val = msd_el.split('=')
if key == 'UPosTag':
uPosTag = val
else:
uPosFeats.append(msd_el)
uPosFeats = '|'.join(uPosFeats)
sentence_tokens += [(
"w",
int(el_id),
el.text,
el.get("lemma"),
(el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA"
else el.get("ana").split(":")[-1]),
uPosTag,
uPosFeats
)]
elif el.tag in self.C_TAGS:
# only Kres' C_TAGS have ids
if guess_corpus != "GIGA":
@ -110,33 +132,243 @@ class Parser:
sentence_text += el.text
sentence_tokens += [("c", el_id, el.text,)]
elif el.tag in self.S_TAGS:
# Kres' <S /> doesn't contain .text
if guess_corpus == "GIGA":
sentence_list.append(el.text)
else:
sentence_text += " "
el_id = el.get("id").split(".")[-1]
if el_id[0] == 't':
el_id = el_id[1:] # ssj W_TAG ids start with t
sentence_text += el.text
uPosTag = None
uPosFeats = []
for msd_el in el.get("msd").split('|'):
key, val = msd_el.split('=')
if key == 'UPosTag':
uPosTag = val
else:
uPosFeats.append(msd_el)
uPosFeats = '|'.join(uPosFeats)
sentence_tokens += [(
"pc",
int(el_id),
el.text,
el.text,
(el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA"
else el.get("ana").split(":")[-1]),
uPosTag,
uPosFeats
)]
else:
# pass links and linkGroups
pass
sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
sentence_id = s.get("id")
if sentence_id in res_dict:
raise KeyError("duplicated id: {}".format(sentence_id))
if guess_corpus == "GIGA":
res_dict[sentence_id] = {
"sid": sentence_id,
"text": ' '.join(sentence_list),
"tokens": None,
"links": None
}
else:
res_dict[sentence_id] = {
"sid": sentence_id,
"text": sentence_text,
"tokens": sentence_tokens,
"links": (
parse_links(s) if guess_corpus == "KRES" else None
)
}
res_dict[sentence_id] = {
"sid": sentence_id,
"text": sentence_text,
"tokens": sentence_tokens,
"links": (
parse_links(s)
)
}
fp.close()
return res_dict
def minimize_tei(self, filepath, jsondata):
def set_xml_attr(node, attribute, value):
node.attrib['{http://www.w3.org/XML/1998/namespace}' + attribute] = value
def parse_links(s_el):
sent_id = '#' + s_el.get('id')
lgrps = s_el.findall(".//linkGrp")
if len(lgrps) < 1:
raise IOError("Can't find links.")
res_links = {}
for lgrp in lgrps:
if lgrp.get("type") == "JOS-SYN":
for link in lgrp:
jos_type = link.get("ana").split(":")[-1]
link_data = link.get("target").split(" ")
link_from = int(link_data[1].split('.')[-1][1:])
link_to = int(link_data[0].split('.')[-1][1:]) if sent_id != link_data[0] else 0
res_links[link_from] = (
jos_type,
link_from,
link_to,
)
return res_links
guess_corpus = None # SSJ | KRES
res_dict = {}
# with filepath.open("rb") as fp, open("../data/ssj500k2.3/final_tei/res.xml", 'w') as sf:
with filepath.open("rb") as fp:
used_ssj_documents = set([k.split('.')[0] for k, v in jsondata.items()])
used_ssj_paragraphs = set(['.'.join(k.split('.')[:-1]) for k, v in jsondata.items()])
used_ssj_sentences = set([k for k, v in jsondata.items()])
ET.register_namespace("", "http://www.tei-c.org/ns/1.0")
tree = ET.parse(fp)
root_res = tree.getroot()
# root_res = copy.deepcopy(root)
ns = '{http://www.w3.org/XML/1998/namespace}'
ns2 = '{http://www.tei-c.org/ns/1.0}'
for doc in list(root_res):
doc_id = doc.get(ns + 'id')
if doc_id not in used_ssj_documents:
root_res.remove(doc)
continue
for par in list(doc):
par_id = par.get(ns + 'id')
if par_id not in used_ssj_paragraphs:
if par.tag != ns2 + 'bibl':
doc.remove(par)
continue
for sen in list(par):
sen_id = sen.get(ns + 'id')
if sen_id not in used_ssj_sentences:
par.remove(sen)
continue
linkGrp = ET.Element(f'{ns2}linkGrp')
linkGrp.attrib[f'targFunc'] = 'head argument'
linkGrp.attrib[f'type'] = 'SRL'
for srl_el in jsondata[sen_id]:
link = ET.Element(f'{ns2}link')
link.attrib['ana'] = f'srl:{srl_el["arg"]}'
link.attrib['target'] = f'#{sen_id}.t{srl_el["from"]} #{sen_id}.t{srl_el["dep"]}'
linkGrp.append(link)
sen.append(linkGrp)
# <linkGrp corresp="#ssj1.1.1" targFunc="head argument" type="SRL">
# <link ana="srl:TIME" target="#ssj1.1.1.t6 #ssj1.1.1.t3"/>
# <link ana="srl:QUANT" target="#ssj1.1.1.t6 #ssj1.1.1.t5"/>
# <link ana="srl:TIME" target="#ssj1.1.1.t8 #ssj1.1.1.t11"/>
# <link ana="srl:PAT" target="#ssj1.1.1.t23 #ssj1.1.1.t21"/>
# <link ana="srl:ACT" target="#ssj1.1.1.t23 #ssj1.1.1.t22"/>
# <link ana="srl:RESLT" target="#ssj1.1.1.t18 #ssj1.1.1.t23"/>
# </linkGrp>
# print('aaa')
# sf.write(etree.tostring(tree, pretty_print=True, encoding='utf-8').decode())
tree.write("../data/ssj500k2.3/final_tei/res.xml", encoding='utf-8')
return
divs = [] # in ssj, there are divs, in Kres, there are separate files
if "id" in root.keys():
# Kres files start with <TEI id=...>
if root.get("id")[0:2] == 'GF':
guess_corpus = "GIGA"
else:
guess_corpus = "KRES"
divs = [root]
else:
guess_corpus = "SSJ"
divs = root.findall(".//div")
# parse divs
for div in divs:
f_id = div.get("id")
if guess_corpus == "GIGA":
div = div.findall(".//body")[0]
# parse paragraphs
for p in div.findall(".//p"):
p_id = p.get("id").split(".")[-1]
# parse sentences
for s in p.findall(".//s"):
# test if sentence has jos-syn annotations and doesn't have SRL
sent_annot_type_list = [links.get('type') for links in s.findall(".//linkGrp")]
if 'JOS-SYN' not in sent_annot_type_list or 'UD-SYN' not in sent_annot_type_list or 'SRL' in sent_annot_type_list:
del s
continue
s_id = s.get("id").split(".")[-1]
sentence_text = ""
sentence_list = []
sentence_tokens = []
# parse tokens
for el in s.iter():
if el.tag in self.W_TAGS:
el_id = el.get("id").split(".")[-1]
if el_id[0] == 't':
el_id = el_id[1:] # ssj W_TAG ids start with t
sentence_text += el.text
uPosTag = None
uPosFeats = []
for msd_el in el.get("msd").split('|'):
key, val = msd_el.split('=')
if key == 'UPosTag':
uPosTag = val
else:
uPosFeats.append(msd_el)
uPosFeats = '|'.join(uPosFeats)
sentence_tokens += [(
"w",
int(el_id),
el.text,
el.get("lemma"),
(el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA"
else el.get("ana").split(":")[-1]),
uPosTag,
uPosFeats
)]
elif el.tag in self.C_TAGS:
# only Kres' C_TAGS have ids
if guess_corpus != "GIGA":
el_id = el.get("id") or "none"
el_id = el_id.split(".")[-1]
sentence_text += el.text
sentence_tokens += [("c", el_id, el.text,)]
elif el.tag in self.S_TAGS:
el_id = el.get("id").split(".")[-1]
if el_id[0] == 't':
el_id = el_id[1:] # ssj W_TAG ids start with t
sentence_text += el.text
uPosTag = None
uPosFeats = []
for msd_el in el.get("msd").split('|'):
key, val = msd_el.split('=')
if key == 'UPosTag':
uPosTag = val
else:
uPosFeats.append(msd_el)
uPosFeats = '|'.join(uPosFeats)
sentence_tokens += [(
"pc",
int(el_id),
el.text,
el.text,
(el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA"
else el.get("ana").split(":")[-1]),
uPosTag,
uPosFeats
)]
else:
# pass links and linkGroups
pass
sentence_id = s.get("id")
if sentence_id in res_dict:
raise KeyError("duplicated id: {}".format(sentence_id))
res_dict[sentence_id] = {
"sid": sentence_id,
"text": sentence_text,
"tokens": sentence_tokens,
"links": (
parse_links(s)
)
}
et = etree.ElementTree(root)
et.write("../data/ssj500k2.3/final_tei/res.xml", pretty_print=True, encoding='unicode')
fp.close()
return res_dict
@ -157,12 +389,8 @@ class Parser:
# handle stop signs
if token[0] != "w":
out_str += '\t'.join(
[t_id] +
[form for x in range(7)] +
["0", "0", "modra", "modra", "_", "_"] +
["\n"]
)
out_list = [t_id] + [form for x in range(7)] + ["0", "0", "modra", "modra", "_", "_"] + ["\n"]
out_str += '\t'.join(map(str, out_list))
continue
pos = self.msdmap.slo_msd_to_eng_pos(token[4])

View File

@ -0,0 +1,30 @@
#!/bin/bash
# parsing tools.cfg values
IN_FOLDER="../$(sed -n -e 's/^\s*ssj500k_tsv_folder\s*=\s*//p' ../tools.cfg.ssj500k2.3)"
IN_FOLDER=$IN_FOLDER$1
echo "input folder: $IN_FOLDER"
OUT_FOLDER="../$(sed -n -e 's/^\s*ssj500k_srl\s*=\s*//p' ../tools.cfg.ssj500k2.3)"
echo "output folder: $OUT_FOLDER"
SUFFIX="srl.tsv"
mkdir -p $OUT_FOLDER
# rm $OUT_FOLDER/*${SUFFIX} &> /dev/null
for infile in $IN_FOLDER/*; do
echo "Tagging: ${infile}"
base=$(basename $infile | cut -d'.' -f1)
outfile=${OUT_FOLDER}/${base}.${SUFFIX}
# mate-tools tagger
./scripts/parse_srl_only_mod.sh $infile $outfile
if [ $? -eq 0 ]; then
echo "Saved as ${outfile}"
else
echo "ERR"
exit 1
fi
done

View File

@ -1,18 +1,13 @@
[tools]
giga = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_orig
giga_orig = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.patch0001
; giga_orig_old = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup
giga_jos = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.jos.patch0001
giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf_files_part
; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP
; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_files_copy
; giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP
giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl
giga_srl_errors = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl_errors/giga_errors.srl.tsv
; giga_json = /media/luka/Portable Disk/Datasets/gigafida_jos/final_json_TEMP
giga_json = /media/luka/Portable Disk/Datasets/gigafida_jos/final_json
internal_data = /media/luka/Portable Disk/Datasets/gigafida_jos/internal_data
giga = ../data/gf_example/gf2_orig
giga_orig = ../data/gf_example/gf2-dedup.patch0001
giga_jos = ../data/gf_example/gf2-dedup.jos.patch0001
giga_tsv = ../data/gf_example/gf_files_part
giga_srl = ../data/gf_example/2_srl
;giga_srl_errors = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl_errors/giga_errors.srl.tsv
giga_json = ../data/gf_example/final_json
internal_data = ../data/gf_example/internal_data
giga_parts = 100000
logfile = ../progress.log
cpu_cores = 16
debug = False
logfile = ../data/gf_example/progress.log
cpu_cores = 1
debug = True

View File

@ -0,0 +1,15 @@
[tools]
ssj500k = ../data/ssj500k2.3/orig/ssj500k-sl.body.xml
ssj500k_orig = ../data/ssj500k2.3/orig/ssj500k-sl.body.xml
ssj500k_orig_folder = ../data/ssj500k2.3/orig
ssj500k_jos = ../data/ssj500k2.3/orig/ssj500k-sl.body.xml
ssj500k_tsv = ../data/ssj500k2.3/tsvs/tsvs.tsv
ssj500k_tsv_folder = ../data/ssj500k2.3/tsvs
ssj500k_srl = ../data/ssj500k2.3/srls
ssj500k_json = ../data/ssj500k2.3/final_json
ssj500k_tei = ../data/ssj500k2.3/final_tei
internal_data = ../data/ssj500k2.3/internal_data
;internal_data = ../data/gf_example/internal_data
logfile = ../data/ssj500k2.3/progress.log
cpu_cores = 1
debug = True