forked from kristjan/cjvt-srl-tagging
parent
c1ecc4cdbc
commit
fd20295017
@ -0,0 +1,47 @@
|
||||
# parse config
|
||||
import configparser
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from tools.parser.parser import Parser
|
||||
|
||||
# Load the tool configuration. ConfigParser.read() silently skips files it
# cannot open and returns the list of files it actually parsed, so check the
# result instead of failing later with an opaque KeyError on "tools".
config = configparser.ConfigParser()
if not config.read("tools.cfg.ssj500k2.3"):
    raise FileNotFoundError("tools.cfg.ssj500k2.3")

_tools = config["tools"]

# Paths and switches taken from the [tools] section.
ORIGPATH = Path(_tools["ssj500k_orig_folder"])
JSONPATH = Path(_tools["ssj500k_json"]) / 'ssj500k-sl.body.json'
OUTPATH = Path(_tools["ssj500k_tei"])
INTERNAL_DATA = Path(_tools["internal_data"])
# Only the exact string "True" enables debug mode.
DEBUG = _tools["debug"] == "True"
CPU_CORES = int(_tools["cpu_cores"])

# Make sure the log file exists, then keep its resolved path (the original
# called resolve() but discarded the result).
LOGFILE = Path(_tools["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
LOGFILE = LOGFILE.resolve()

logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)

par = Parser()

# parents=True so a missing intermediate directory does not abort the run.
OUTPATH.mkdir(parents=True, exist_ok=True)

# Read the JSON explicitly as UTF-8 rather than with the platform default
# encoding.
with open(JSONPATH, 'r', encoding='utf-8') as jf:
    jsondata = json.load(jf)

logging.info("Generating TEI with annotated SRL.")
|
||||
|
||||
def handle_file(file, jsondata):
    """Run the TEI parser over a single input file.

    The input path is ``ORIGPATH`` joined with ``file``; the matching output
    path under ``OUTPATH`` is computed as well but is not used yet.

    Args:
        file: Path component joined onto ORIGPATH and OUTPATH.
        jsondata: Data handed unchanged to ``par.minimize_tei``.

    Returns:
        None. The parser results are not returned or stored.
    """
    source_tei = ORIGPATH.joinpath(file)
    # Destination path; computed but not consumed anywhere in this function.
    target_tei = OUTPATH.joinpath(file)

    # NOTE(review): this result is overwritten by the next call. The call is
    # kept because parse_tei may have side effects -- confirm against Parser.
    parsed = par.parse_tei(source_tei)

    parsed = par.minimize_tei(source_tei, jsondata)
|
||||
|
||||
# Not populated anywhere in the visible code; kept so the module-level name
# still exists.
origfiles = []

# os.walk() descends into subdirectories, so pass each file's path relative to
# ORIGPATH rather than its bare name. For files directly inside ORIGPATH the
# relative path is just the file name, exactly as before.
for subdir, _dirs, files in os.walk(ORIGPATH):
    for file in files:
        relative_file = os.path.relpath(os.path.join(subdir, file), ORIGPATH)
        handle_file(relative_file, jsondata)
|
@ -0,0 +1,30 @@
|
||||
#!/bin/bash
# Run the mate-tools SRL tagger over every file in the configured TSV folder.
#
# Optional first argument: a string appended verbatim to the input folder path.
# The config file and the tagger script are addressed with relative paths, so
# they are resolved against the current working directory.

# parsing tools.cfg values
CONFIG="../tools.cfg.ssj500k2.3"

IN_FOLDER="../$(sed -n -e 's/^\s*ssj500k_tsv_folder\s*=\s*//p' "$CONFIG")"
IN_FOLDER="${IN_FOLDER}${1:-}"
echo "input folder: $IN_FOLDER"
OUT_FOLDER="../$(sed -n -e 's/^\s*ssj500k_srl\s*=\s*//p' "$CONFIG")"
echo "output folder: $OUT_FOLDER"

SUFFIX="srl.tsv"

# All expansions are quoted so folder and file names with spaces survive.
mkdir -p "$OUT_FOLDER"
# rm $OUT_FOLDER/*${SUFFIX} &> /dev/null

for infile in "$IN_FOLDER"/*; do
    echo "Tagging: ${infile}"
    # Output name is the input's base name up to its first dot.
    base=$(basename "$infile" | cut -d'.' -f1)
    outfile="${OUT_FOLDER}/${base}.${SUFFIX}"

    # mate-tools tagger; stop at the first file that fails.
    if ./scripts/parse_srl_only_mod.sh "$infile" "$outfile"; then
        echo "Saved as ${outfile}"
    else
        echo "ERR"
        exit 1
    fi
done
|
||||
|
@ -1,18 +1,13 @@
|
||||
[tools]
|
||||
giga = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_orig
|
||||
giga_orig = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.patch0001
|
||||
; giga_orig_old = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup
|
||||
giga_jos = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2-dedup.jos.patch0001
|
||||
giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf_files_part
|
||||
; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP
|
||||
; giga_tsv = /media/luka/Portable Disk/Datasets/gigafida_jos/gf2_files_copy
|
||||
; giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/TEMP
|
||||
giga_srl = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl
|
||||
giga_srl_errors = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl_errors/giga_errors.srl.tsv
|
||||
; giga_json = /media/luka/Portable Disk/Datasets/gigafida_jos/final_json_TEMP
|
||||
giga_json = /media/luka/Portable Disk/Datasets/gigafida_jos/final_json
|
||||
internal_data = /media/luka/Portable Disk/Datasets/gigafida_jos/internal_data
|
||||
giga = ../data/gf_example/gf2_orig
|
||||
giga_orig = ../data/gf_example/gf2-dedup.patch0001
|
||||
giga_jos = ../data/gf_example/gf2-dedup.jos.patch0001
|
||||
giga_tsv = ../data/gf_example/gf_files_part
|
||||
giga_srl = ../data/gf_example/2_srl
|
||||
;giga_srl_errors = /media/luka/Portable Disk/Datasets/gigafida_jos/2_srl_errors/giga_errors.srl.tsv
|
||||
giga_json = ../data/gf_example/final_json
|
||||
internal_data = ../data/gf_example/internal_data
|
||||
giga_parts = 100000
|
||||
logfile = ../progress.log
|
||||
cpu_cores = 16
|
||||
debug = False
|
||||
logfile = ../data/gf_example/progress.log
|
||||
cpu_cores = 1
|
||||
debug = True
|
||||
|
@ -0,0 +1,15 @@
|
||||
[tools]
|
||||
ssj500k = ../data/ssj500k2.3/orig/ssj500k-sl.body.xml
|
||||
ssj500k_orig = ../data/ssj500k2.3/orig/ssj500k-sl.body.xml
|
||||
ssj500k_orig_folder = ../data/ssj500k2.3/orig
|
||||
ssj500k_jos = ../data/ssj500k2.3/orig/ssj500k-sl.body.xml
|
||||
ssj500k_tsv = ../data/ssj500k2.3/tsvs/tsvs.tsv
|
||||
ssj500k_tsv_folder = ../data/ssj500k2.3/tsvs
|
||||
ssj500k_srl = ../data/ssj500k2.3/srls
|
||||
ssj500k_json = ../data/ssj500k2.3/final_json
|
||||
ssj500k_tei = ../data/ssj500k2.3/final_tei
|
||||
internal_data = ../data/ssj500k2.3/internal_data
|
||||
;internal_data = ../data/gf_example/internal_data
|
||||
logfile = ../data/ssj500k2.3/progress.log
|
||||
cpu_cores = 1
|
||||
debug = True
|
Loading…
Reference in new issue