From fd20295017396bf7aa815bd97cc173f97248a2b1 Mon Sep 17 00:00:00 2001 From: Luka Date: Mon, 14 Mar 2022 11:01:53 +0100 Subject: [PATCH] Adapted code to ssj500k and added its branch --- Makefile | 3 +- README.md | 8 + tools/gen_json.py | 66 +++--- tools/gen_tei.py | 47 ++++ tools/parse_all.py | 139 +++++++---- tools/parser/parser.py | 330 ++++++++++++++++++++++----- tools/srl-20131216/tag_ssj500k2.3.sh | 30 +++ tools/tools.cfg | 27 +-- tools/tools.cfg.ssj500k2.3 | 15 ++ 9 files changed, 513 insertions(+), 152 deletions(-) create mode 100644 tools/gen_tei.py create mode 100755 tools/srl-20131216/tag_ssj500k2.3.sh create mode 100644 tools/tools.cfg.ssj500k2.3 diff --git a/Makefile b/Makefile index 69ae834..4ace65f 100644 --- a/Makefile +++ b/Makefile @@ -6,8 +6,9 @@ json_files: # srl_tagged_files cd tools; python3 gen_json.py srl_tagged_files: # tsv_files - # cd tools/srl-20131216; ./scripts/parse_srl_only_mod.sh; cd - +# # cd tools/srl-20131216; ./scripts/parse_srl_only_mod.sh; cd - cd tools/srl-20131216; ./tag_all.sh +# cd tools/srl-20131216; ./tag_ssj500k2.3.sh tsv_files: # tools/fillpred_model/model.pickle cd tools; python3 parse_all.py diff --git a/README.md b/README.md index 51252c2..ce7d3e3 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,11 @@ +# Instructions +For mining ssj500k checkout to branch ssj500k. +For running order look at Makefile. Generally it works like this: +- tools/parse_all.py - It creates mate file that is necessary for running Java based srl.jar +- tools/srl-20131216/tag_all.sh - Tags ssj500k +- tools/gen_json.py - Mine SRL to json +- tools/gen_tei.py - Mine SRL to tei + # cjvt-srl-tagging We'll be using mate-tools to perform SRL on Kres. diff --git a/tools/gen_json.py b/tools/gen_json.py index 1e8f821..fd16547 100644 --- a/tools/gen_json.py +++ b/tools/gen_json.py @@ -13,10 +13,10 @@ from multiprocessing import Pool # parse config config = configparser.ConfigParser() -config.read("tools.cfg") -ORIGPATH = Path(config["tools"]["giga"]) -INPATH = Path(config["tools"]["giga_srl"]) -OUTPATH = Path(config["tools"]["giga_json"]) +config.read("tools.cfg.ssj500k2.3") +ORIGPATH = Path(config["tools"]["ssj500k_orig_folder"]) +INPATH = Path(config["tools"]["ssj500k_srl"]) +OUTPATH = Path(config["tools"]["ssj500k_json"]) INTERNAL_DATA = Path(config["tools"]["internal_data"]) DEBUG = config["tools"]["debug"] == "True" CPU_CORES = int(config["tools"]["cpu_cores"]) @@ -143,44 +143,36 @@ def handle_file(whole_input): print('PAUSE') # look at neighbouring sentences if they are correct - for i in range(100): - sentence, sentence_arr = next(gen) - # orig_sentence = " ".join(token[2] for token in e["tokens"]) - if sentence == orig_val["text"]: - # if i != 10 and i != 0: - # print('OK!') - sid = orig_id + sentence, sentence_arr = next(gen) + # orig_sentence = " ".join(token[2] for token in e["tokens"]) + assert sentence.replace(' ', '') == orig_val['text'] + # if i != 10 and i != 0: + # print('OK!') + sid = orig_id - outdata[sid] = [] + outdata[sid] = [] - # find all predicate indices in the sentence - predicates = [] - for token in sentence_arr: - if token[12] == "Y": - predicates += [token[0]] # idx + # find all predicate indices in the sentence + predicates = [] + for token in sentence_arr: + if token[12] == "Y": + predicates += [token[0]] # idx - deprel = get_dep_rel(token) - if deprel is not None: - outdata[sid].append(deprel) + deprel = get_dep_rel(token) + if deprel is not None: + outdata[sid].append(deprel) - # deprel["from"] points to n-th predicate - # replace with 
predicate's token index - for deprel in outdata[sid]: - deprel["from"] = predicates[deprel["from"]] + # deprel["from"] points to n-th predicate + # replace with predicate's token index + for deprel in outdata[sid]: + deprel["from"] = predicates[deprel["from"]] - if DEBUG: - print(to_sentence(sentence_arr)) - print(outdata[sid]) - print(sid) - print() - print() - break - else: - if i == 99: - mismatch_sentences += 1 - sid = orig_id - outdata[sid] = [] - gen = srl_multiple_files_sentences_generator(sentence_id + sentence_i) + if DEBUG: + print(to_sentence(sentence_arr)) + print(outdata[sid]) + print(sid) + print() + print() if mismatch_sentences > 0: if mismatch_sentences / len(orig_dict.items()) < 0.1: diff --git a/tools/gen_tei.py b/tools/gen_tei.py new file mode 100644 index 0000000..f62642d --- /dev/null +++ b/tools/gen_tei.py @@ -0,0 +1,47 @@ +# parse config +import configparser +import json +import logging +import os +from pathlib import Path + +from tools.parser.parser import Parser + +config = configparser.ConfigParser() +config.read("tools.cfg.ssj500k2.3") +ORIGPATH = Path(config["tools"]["ssj500k_orig_folder"]) +JSONPATH = Path(config["tools"]["ssj500k_json"] + '/ssj500k-sl.body.json') +OUTPATH = Path(config["tools"]["ssj500k_tei"]) +INTERNAL_DATA = Path(config["tools"]["internal_data"]) +DEBUG = config["tools"]["debug"] == "True" +CPU_CORES = int(config["tools"]["cpu_cores"]) + +LOGFILE = Path(config["tools"]["logfile"]).absolute() +LOGFILE.touch(exist_ok=True) +LOGFILE.resolve() + +logging.basicConfig(filename=str(LOGFILE), level=logging.INFO) + + +par = Parser() +OUTPATH.mkdir(exist_ok=True) + +jsondata = [] +with open(JSONPATH, 'r') as jf: + jsondata = json.load(jf) + +logging.info("Generating TEI with annotated SRL.") + +def handle_file(file, jsondata): + teifile = (ORIGPATH / file) + resfile = (OUTPATH / file) + + orig_dict = par.parse_tei(teifile) + + # origfile = get_origfile() + orig_dict = par.minimize_tei(teifile, jsondata) + +origfiles = [] +for subdir, dirs, files in os.walk(ORIGPATH): + for file in files: + handle_file(file, jsondata) diff --git a/tools/parse_all.py b/tools/parse_all.py index 86c3caf..459935a 100644 --- a/tools/parse_all.py +++ b/tools/parse_all.py @@ -16,7 +16,8 @@ par = Parser() # path to data config = configparser.ConfigParser() -config.read("tools.cfg") +# config.read("tools.cfg") +config.read("tools.cfg.ssj500k2.3") analysis = '' if 'kres_orig' in config["tools"]: analysis = 'kres' @@ -31,6 +32,14 @@ elif 'giga_orig' in config["tools"]: OUTDIR = Path(config["tools"]["giga_tsv"]) GIGA_PARTS = int(config["tools"]["giga_parts"]) INTERNAL_DATA = config["tools"]["internal_data"] +elif 'ssj500k_orig' in config["tools"]: + # analysis = 'gigafida' + analysis = 'ssj500k' + INDIR_SSJ500K_ORIG = Path(config["tools"]["ssj500k"]) + INDIR_SSJ500K = Path(config["tools"]["ssj500k_orig"]) + INDIR_JOS = Path(config["tools"]["ssj500k_jos"]) + OUTDIR = Path(config["tools"]["ssj500k_tsv"]) + INTERNAL_DATA = config["tools"]["internal_data"] CPU_CORES = int(config["tools"]["cpu_cores"]) @@ -49,40 +58,40 @@ print("end parsing ssj") """ # kres_file = "../data/kres_example/F0019343.xml.parsed.xml" -OUTDIR.mkdir(exist_ok=True) +# OUTDIR.mkdir(exist_ok=True) if analysis == 'kres': infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()])) logging.info("Parsing kres: {} files.".format(len(infiles))) -def handle_file(infile): - i = infile[0] - kres_file = infile[1] - outfile = (OUTDIR / kres_file.name).with_suffix(".tsv") + +def handle_ssj500k_file(): + kres_file 
= INDIR_SSJ500K_ORIG + outfile = OUTDIR if outfile.is_file(): logging.info("Skipping existing file: {}.".format(str(kres_file))) return True - try: - res_dict = par.parse_tei(kres_file) - kres_out_str = "" - for _, sentence in res_dict.items(): - kres_out_str += par.to_conll_2009_SRL(sentence) - except Exception as exc: - logging.info("Failed processing file: {}".format(str(kres_file))) - logging.error(exc) - return False + # try: + res_dict = par.parse_tei(kres_file) + kres_out_str = "" + for _, sentence in res_dict.items(): + kres_out_str += par.to_conll_2009_SRL(sentence) + # except Exception as exc: + # logging.info("Failed processing file: {}".format(str(kres_file))) + # logging.error(exc) + # return False with outfile.open("wb+") as fp: fp.write(kres_out_str.encode("utf-8")) - logging.info("Processed file ({}/{}): {}".format(i+1, len(infiles), str(kres_file))) + # logging.info("Processed file ({}/{}): {}".format(i+1, len(infiles), str(kres_file))) return True return False -def giga_orig_generator(): - with open(INDIR_GIGA, 'r') as gof: +def ssj500k_orig_generator(): + with open(INDIR_SSJ500K, 'r') as gof: previous_new_line = False for l_gof in gof: if l_gof == '\n': @@ -104,12 +113,6 @@ def handle_gigafida_file(): # pass # num_lines = i + 1 # print(num_lines) - num_lines = 1393184026 - # 1393184026 - # 1393184033 - # return - num_lines_per_part = num_lines / GIGA_PARTS - curr_part = 0 gof_generator = giga_orig_generator() # with open(INDIR_GIGA, 'r') as gof: with open(INDIR_JOS, 'r') as gjf: @@ -168,6 +171,70 @@ def handle_gigafida_file(): curr_part += 1 wf.close() + +def handle_ssj500k_file2(): + """ + File that splits big text file into more minor files. Only split on empty lines. + """ + gof_generator = ssj500k_orig_generator() + # with open(INDIR_GIGA, 'r') as gof: + with open(INDIR_JOS, 'r') as gjf: + sentence = {} + sentence['tokens'] = [] + sentence['links'] = {} + if os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % 0)): + ignore_lines = True + wf = False + else: + wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a') + ignore_lines = False + # for i, (l_gof, l_gjf) in enumerate(zip(gof, gjf)): + for i, l_gjf in enumerate(gjf): + l_gof = next(gof_generator) + if ignore_lines: + if i > num_lines_per_part * curr_part and l_gof == '\n': + if not os.path.exists(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 2))): + ignore_lines = False + # delete last file (probably not whole) + os.remove(os.path.join(OUTDIR, 'giga%07d.tsv' % (curr_part + 1))) + if ignore_lines: + print(curr_part) + curr_part += 1 + continue + else: + continue + l_gof_split = l_gof.split('\t') + l_gjf_split = l_gjf.split('\t') + + # if punctuation + if l_gof != '\n': + if l_gof_split[1][-1] == 'u': + # print(l_gjf_split) + sentence['tokens'].append(('c', l_gjf_split[0], l_gjf_split[1])) + else: + sentence['tokens'].append(('w', l_gjf_split[0], l_gof_split[0], l_gof_split[1][:-2], l_gof_split[3][:-1])) + + sentence['links'][l_gjf_split[0]] = (l_gjf_split[7], l_gjf_split[0], l_gjf_split[6]) + + # if l_gof == '\n': + else: + if wf: + # print(i) + wf.write(par.to_conll_2009_SRL(sentence)) + sentence['tokens'] = [] + sentence['links'] = {} + # wf.flush() + # if i > num_lines_per_part * (curr_part + 1) and l_gof == '\n': + if i > num_lines_per_part * (curr_part + 1): + curr_part += 1 + # if wf doesn't exist (first one) + if wf: + wf.close() + wf = open(os.path.join(OUTDIR, 'giga%07d.tsv' % curr_part), 'a') + curr_part += 1 + wf.close() + + import time def handle_giga_file(ran): """ @@ -347,31 +414,9 
@@ def handle_giga_file_selected_sentences(error_sentences): # curr_part += 1 wf.close() -file_indices = set(range(0, 100000)) -with open(os.path.join(INTERNAL_DATA, 'diffs_updated_gigafida.pkl'), 'rb') as pkl_file: - file_indices = set(pickle.load(pkl_file)) -with Pool(CPU_CORES) as p: - if analysis == 'kres': - p.map(handle_file, infiles) - elif analysis == 'gigafida': - handle_gigafida_file() - elif analysis == 'giga': - final_range = [0, 100000] - size_per_proc = (final_range[1] - final_range[0]) / CPU_CORES - # splits = [int(final_range[0] + size_per_proc) for i in range(CPU_CORES)] - ranges = [] - ps = None - for i in range(CPU_CORES): - s = int(final_range[0] + size_per_proc * i) - ns = int(final_range[0] + size_per_proc * (i + 1)) - ranges.append([s, ns]) - # ranges = [[0, 1]] - # p.map(handle_giga_file, ranges) - # p.map(handle_giga_file, ranges) - error_sentences = [line.rstrip('\n') for line in open(os.path.join(INTERNAL_DATA, 'sentences_with_less_than_token.txt'))] - handle_giga_file_selected_sentences(set(error_sentences)) +handle_ssj500k_file() logging.info("end parsing kres") diff --git a/tools/parser/parser.py b/tools/parser/parser.py index 80e353f..f18b9b2 100644 --- a/tools/parser/parser.py +++ b/tools/parser/parser.py @@ -1,3 +1,5 @@ +import copy + from lxml import etree import re from parser.msd.msdmap import Msdmap @@ -5,6 +7,7 @@ import pickle from pathlib import Path from fillpred_model.step1 import build_model_row import sys +import xml.etree.ElementTree as ET class Parser: # reads a TEI xml file and returns a dictionary: @@ -29,17 +32,23 @@ class Parser: def parse_tei(self, filepath): def parse_links(s_el): - lgrps = s_el.findall(".//links") + sent_id = '#' + s_el.get('id') + lgrps = s_el.findall(".//linkGrp") if len(lgrps) < 1: raise IOError("Can't find links.") res_links = {} - for link in lgrps[0]: - dep = int(link.get("dep").split(".")[-1]) - res_links[dep] = ( - link.get("afun"), - dep, - int(link.get("from").split(".")[-1]), - ) + for lgrp in lgrps: + if lgrp.get("type") == "JOS-SYN": + for link in lgrp: + jos_type = link.get("ana").split(":")[-1] + link_data = link.get("target").split(" ") + link_from = int(link_data[1].split('.')[-1][1:]) + link_to = int(link_data[0].split('.')[-1][1:]) if sent_id != link_data[0] else 0 + res_links[link_from] = ( + jos_type, + link_from, + link_to, + ) return res_links guess_corpus = None # SSJ | KRES @@ -79,6 +88,11 @@ class Parser: # parse sentences for s in p.findall(".//s"): + # test if sentence has jos-syn annotations and doesn't have SRL + sent_annot_type_list = [links.get('type') for links in s.findall(".//linkGrp")] + if 'JOS-SYN' not in sent_annot_type_list or 'UD-SYN' not in sent_annot_type_list or 'SRL' in sent_annot_type_list: + continue + s_id = s.get("id").split(".")[-1] sentence_text = "" sentence_list = [] @@ -87,21 +101,29 @@ class Parser: # parse tokens for el in s.iter(): if el.tag in self.W_TAGS: - if guess_corpus != "GIGA": - el_id = el.get("id").split(".")[-1] - if el_id[0] == 't': - el_id = el_id[1:] # ssj W_TAG ids start with t - sentence_text += el.text - sentence_tokens += [( - "w", - int(el_id), - el.text, - el.get("lemma"), - (el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA" - else el.get("ana").split(":")[-1]), - )] - else: - sentence_list.append(el.text) + el_id = el.get("id").split(".")[-1] + if el_id[0] == 't': + el_id = el_id[1:] # ssj W_TAG ids start with t + sentence_text += el.text + uPosTag = None + uPosFeats = [] + for msd_el in el.get("msd").split('|'): + key, val = 
msd_el.split('=') + if key == 'UPosTag': + uPosTag = val + else: + uPosFeats.append(msd_el) + uPosFeats = '|'.join(uPosFeats) + sentence_tokens += [( + "w", + int(el_id), + el.text, + el.get("lemma"), + (el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA" + else el.get("ana").split(":")[-1]), + uPosTag, + uPosFeats + )] elif el.tag in self.C_TAGS: # only Kres' C_TAGS have ids if guess_corpus != "GIGA": @@ -110,33 +132,243 @@ class Parser: sentence_text += el.text sentence_tokens += [("c", el_id, el.text,)] elif el.tag in self.S_TAGS: - # Kres' doesn't contain .text - if guess_corpus == "GIGA": - sentence_list.append(el.text) - else: - sentence_text += " " + el_id = el.get("id").split(".")[-1] + if el_id[0] == 't': + el_id = el_id[1:] # ssj W_TAG ids start with t + sentence_text += el.text + uPosTag = None + uPosFeats = [] + for msd_el in el.get("msd").split('|'): + key, val = msd_el.split('=') + if key == 'UPosTag': + uPosTag = val + else: + uPosFeats.append(msd_el) + uPosFeats = '|'.join(uPosFeats) + sentence_tokens += [( + "pc", + int(el_id), + el.text, + el.text, + (el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA" + else el.get("ana").split(":")[-1]), + uPosTag, + uPosFeats + )] else: # pass links and linkGroups pass - sentence_id = "{}.{}.{}".format(f_id, p_id, s_id) + sentence_id = s.get("id") if sentence_id in res_dict: raise KeyError("duplicated id: {}".format(sentence_id)) - if guess_corpus == "GIGA": - res_dict[sentence_id] = { - "sid": sentence_id, - "text": ' '.join(sentence_list), - "tokens": None, - "links": None - } - else: - res_dict[sentence_id] = { - "sid": sentence_id, - "text": sentence_text, - "tokens": sentence_tokens, - "links": ( - parse_links(s) if guess_corpus == "KRES" else None - ) - } + + res_dict[sentence_id] = { + "sid": sentence_id, + "text": sentence_text, + "tokens": sentence_tokens, + "links": ( + parse_links(s) + ) + } + fp.close() + return res_dict + + + def minimize_tei(self, filepath, jsondata): + def set_xml_attr(node, attribute, value): + node.attrib['{http://www.w3.org/XML/1998/namespace}' + attribute] = value + + def parse_links(s_el): + sent_id = '#' + s_el.get('id') + lgrps = s_el.findall(".//linkGrp") + if len(lgrps) < 1: + raise IOError("Can't find links.") + res_links = {} + for lgrp in lgrps: + if lgrp.get("type") == "JOS-SYN": + for link in lgrp: + jos_type = link.get("ana").split(":")[-1] + link_data = link.get("target").split(" ") + link_from = int(link_data[1].split('.')[-1][1:]) + link_to = int(link_data[0].split('.')[-1][1:]) if sent_id != link_data[0] else 0 + res_links[link_from] = ( + jos_type, + link_from, + link_to, + ) + return res_links + + guess_corpus = None # SSJ | KRES + res_dict = {} + # with filepath.open("rb") as fp, open("../data/ssj500k2.3/final_tei/res.xml", 'w') as sf: + with filepath.open("rb") as fp: + used_ssj_documents = set([k.split('.')[0] for k, v in jsondata.items()]) + used_ssj_paragraphs = set(['.'.join(k.split('.')[:-1]) for k, v in jsondata.items()]) + used_ssj_sentences = set([k for k, v in jsondata.items()]) + + ET.register_namespace("", "http://www.tei-c.org/ns/1.0") + tree = ET.parse(fp) + root_res = tree.getroot() + # root_res = copy.deepcopy(root) + ns = '{http://www.w3.org/XML/1998/namespace}' + ns2 = '{http://www.tei-c.org/ns/1.0}' + + for doc in list(root_res): + doc_id = doc.get(ns + 'id') + if doc_id not in used_ssj_documents: + root_res.remove(doc) + continue + + for par in list(doc): + par_id = par.get(ns + 'id') + if par_id not in used_ssj_paragraphs: + if 
par.tag != ns2 + 'bibl': + doc.remove(par) + continue + + for sen in list(par): + sen_id = sen.get(ns + 'id') + if sen_id not in used_ssj_sentences: + par.remove(sen) + continue + + linkGrp = ET.Element(f'{ns2}linkGrp') + + linkGrp.attrib[f'targFunc'] = 'head argument' + linkGrp.attrib[f'type'] = 'SRL' + + for srl_el in jsondata[sen_id]: + link = ET.Element(f'{ns2}link') + link.attrib['ana'] = f'srl:{srl_el["arg"]}' + link.attrib['target'] = f'#{sen_id}.t{srl_el["from"]} #{sen_id}.t{srl_el["dep"]}' + linkGrp.append(link) + sen.append(linkGrp) + + + # + # + # + # + # + # + # + # + # print('aaa') + + # sf.write(etree.tostring(tree, pretty_print=True, encoding='utf-8').decode()) + tree.write("../data/ssj500k2.3/final_tei/res.xml", encoding='utf-8') + + return + divs = [] # in ssj, there are divs, in Kres, there are separate files + if "id" in root.keys(): + # Kres files start with + if root.get("id")[0:2] == 'GF': + guess_corpus = "GIGA" + else: + guess_corpus = "KRES" + divs = [root] + else: + guess_corpus = "SSJ" + divs = root.findall(".//div") + + # parse divs + for div in divs: + f_id = div.get("id") + + if guess_corpus == "GIGA": + div = div.findall(".//body")[0] + + # parse paragraphs + for p in div.findall(".//p"): + p_id = p.get("id").split(".")[-1] + + # parse sentences + for s in p.findall(".//s"): + # test if sentence has jos-syn annotations and doesn't have SRL + sent_annot_type_list = [links.get('type') for links in s.findall(".//linkGrp")] + if 'JOS-SYN' not in sent_annot_type_list or 'UD-SYN' not in sent_annot_type_list or 'SRL' in sent_annot_type_list: + del s + continue + + s_id = s.get("id").split(".")[-1] + sentence_text = "" + sentence_list = [] + sentence_tokens = [] + + # parse tokens + for el in s.iter(): + if el.tag in self.W_TAGS: + el_id = el.get("id").split(".")[-1] + if el_id[0] == 't': + el_id = el_id[1:] # ssj W_TAG ids start with t + sentence_text += el.text + uPosTag = None + uPosFeats = [] + for msd_el in el.get("msd").split('|'): + key, val = msd_el.split('=') + if key == 'UPosTag': + uPosTag = val + else: + uPosFeats.append(msd_el) + uPosFeats = '|'.join(uPosFeats) + sentence_tokens += [( + "w", + int(el_id), + el.text, + el.get("lemma"), + (el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA" + else el.get("ana").split(":")[-1]), + uPosTag, + uPosFeats + )] + elif el.tag in self.C_TAGS: + # only Kres' C_TAGS have ids + if guess_corpus != "GIGA": + el_id = el.get("id") or "none" + el_id = el_id.split(".")[-1] + sentence_text += el.text + sentence_tokens += [("c", el_id, el.text,)] + elif el.tag in self.S_TAGS: + el_id = el.get("id").split(".")[-1] + if el_id[0] == 't': + el_id = el_id[1:] # ssj W_TAG ids start with t + sentence_text += el.text + uPosTag = None + uPosFeats = [] + for msd_el in el.get("msd").split('|'): + key, val = msd_el.split('=') + if key == 'UPosTag': + uPosTag = val + else: + uPosFeats.append(msd_el) + uPosFeats = '|'.join(uPosFeats) + sentence_tokens += [( + "pc", + int(el_id), + el.text, + el.text, + (el.get("msd") if guess_corpus == "KRES" or guess_corpus == "GIGA" + else el.get("ana").split(":")[-1]), + uPosTag, + uPosFeats + )] + else: + # pass links and linkGroups + pass + sentence_id = s.get("id") + if sentence_id in res_dict: + raise KeyError("duplicated id: {}".format(sentence_id)) + + res_dict[sentence_id] = { + "sid": sentence_id, + "text": sentence_text, + "tokens": sentence_tokens, + "links": ( + parse_links(s) + ) + } + et = etree.ElementTree(root) + et.write("../data/ssj500k2.3/final_tei/res.xml", 
pretty_print=True, encoding='unicode') fp.close() return res_dict @@ -157,12 +389,8 @@ class Parser: # handle stop signs if token[0] != "w": - out_str += '\t'.join( - [t_id] + - [form for x in range(7)] + - ["0", "0", "modra", "modra", "_", "_"] + - ["\n"] - ) + out_list = [t_id] + [form for x in range(7)] + ["0", "0", "modra", "modra", "_", "_"] + ["\n"] + out_str += '\t'.join(map(str, out_list)) continue pos = self.msdmap.slo_msd_to_eng_pos(token[4]) diff --git a/tools/srl-20131216/tag_ssj500k2.3.sh b/tools/srl-20131216/tag_ssj500k2.3.sh new file mode 100755 index 0000000..93d86f7 --- /dev/null +++ b/tools/srl-20131216/tag_ssj500k2.3.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# parsing tools.cfg values +IN_FOLDER="../$(sed -n -e 's/^\s*ssj500k_tsv_folder\s*=\s*//p' ../tools.cfg.ssj500k2.3)" +IN_FOLDER=$IN_FOLDER$1 +echo "input folder: $IN_FOLDER" +OUT_FOLDER="../$(sed -n -e 's/^\s*ssj500k_srl\s*=\s*//p' ../tools.cfg.ssj500k2.3)" +echo "output folder: $OUT_FOLDER" + +SUFFIX="srl.tsv" + +mkdir -p $OUT_FOLDER +# rm $OUT_FOLDER/*${SUFFIX} &> /dev/null + +for infile in $IN_FOLDER/*; do + echo "Tagging: ${infile}" + base=$(basename $infile | cut -d'.' -f1) + outfile=${OUT_FOLDER}/${base}.${SUFFIX} + + # mate-tools tagger + ./scripts/parse_srl_only_mod.sh $infile $outfile + + if [ $? -eq 0 ]; then + echo "Saved as ${outfile}" + else + echo "ERR" + exit 1 + fi +done + diff --git a/tools/tools.cfg b/tools/tools.cfg index fb538df..49f6e52 100644 --- a/tools/tools.cfg +++ b/tools/tools.cfg @@ -1,18 +1,13 @@ [tools] -giga = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_orig -giga_orig = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.patch0001 -; giga_orig_old = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup -giga_jos = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.jos.patch0001 -giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf_files_part -; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP -; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_files_copy -; giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP -giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl -giga_srl_errors = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl_errors/giga_errors.srl.tsv -; giga_json = /media/luka/Portable Disk/Datasets/gigafida_jos/final_json_TEMP -giga_json = /media/luka/Portable Disk/Datasets/gigafida_jos/final_json -internal_data = /media/luka/Portable Disk/Datasets/gigafida_jos/internal_data +giga = ../data/gf_example/gf2_orig +giga_orig = ../data/gf_example/gf2-dedup.patch0001 +giga_jos = ../data/gf_example/gf2-dedup.jos.patch0001 +giga_tsv = ../data/gf_example/gf_files_part +giga_srl = ../data/gf_example/2_srl +;giga_srl_errors = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl_errors/giga_errors.srl.tsv +giga_json = ../data/gf_example/final_json +internal_data = ../data/gf_example/internal_data giga_parts = 100000 -logfile = ../progress.log -cpu_cores = 16 -debug = False +logfile = ../data/gf_example/progress.log +cpu_cores = 1 +debug = True diff --git a/tools/tools.cfg.ssj500k2.3 b/tools/tools.cfg.ssj500k2.3 new file mode 100644 index 0000000..ef7bf5a --- /dev/null +++ b/tools/tools.cfg.ssj500k2.3 @@ -0,0 +1,15 @@ +[tools] +ssj500k = ../data/ssj500k2.3/orig/ssj500k-sl.body.xml +ssj500k_orig = ../data/ssj500k2.3/orig/ssj500k-sl.body.xml +ssj500k_orig_folder = ../data/ssj500k2.3/orig +ssj500k_jos = ../data/ssj500k2.3/orig/ssj500k-sl.body.xml +ssj500k_tsv = ../data/ssj500k2.3/tsvs/tsvs.tsv 
+ssj500k_tsv_folder = ../data/ssj500k2.3/tsvs +ssj500k_srl = ../data/ssj500k2.3/srls +ssj500k_json = ../data/ssj500k2.3/final_json +ssj500k_tei = ../data/ssj500k2.3/final_tei +internal_data = ../data/ssj500k2.3/internal_data +;internal_data = ../data/gf_example/internal_data +logfile = ../data/ssj500k2.3/progress.log +cpu_cores = 1 +debug = True
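
Note (illustrative, not part of the diff): the README section added above lists the run order for the ssj500k branch. Below is a minimal sketch of that pipeline, assuming the data layout from tools.cfg.ssj500k2.3 introduced in this patch and that each script is started from the directory its relative config.read() call expects:

    # 1. ssj500k TEI -> CoNLL-2009-style tsv for mate-tools (parse_all.py reads tools.cfg.ssj500k2.3)
    cd tools && python3 parse_all.py
    # 2. SRL tagging with mate-tools; the Makefile still calls tag_all.sh, so the new script is invoked directly here
    cd srl-20131216 && ./tag_ssj500k2.3.sh && cd ..
    # 3. merge the SRL output back into per-sentence JSON (ssj500k_json)
    python3 gen_json.py
    # 4. write TEI enriched with SRL annotations (ssj500k_tei)
    python3 gen_tei.py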
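
Note (illustrative): the SRL annotation that minimize_tei() appends to each retained <s> element has the following shape; the sentence id, token indices, and the PAT role are placeholder values, as the real ids come from the ssj500k TEI and the roles from the mate-tools output:

    <linkGrp targFunc="head argument" type="SRL">
      <link ana="srl:PAT" target="#ssj1.1.1.t3 #ssj1.1.1.t5"/>
    </linkGrp>

The first target token is the predicate ("from" in the JSON), the second the argument head ("dep"), mirroring the fields produced by gen_json.py.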