Compare commits
No commits in common. "master" and "per-file" have entirely different histories.
.gitignore (7 changes)
@@ -1,8 +1,5 @@
*.pyc
*.pickle
*.log

nohup.out

data/kres_out/*
data/kres_example/
data/*/*.xml
data/*/*.tsv
Makefile (15 changes)
@@ -1,22 +1,19 @@
.PHONY: tsv_files srl_tagged_files json_files env clean
.PHONY: tsv_files srl_tagged_files json_files env

all: tools/fillpred_model/model.pickle tsv_files srl_tagged_files json_files
all: json_files

json_files: # srl_tagged_files
json_files: #TODO srl_tagged_files
	cd tools; python3 gen_json.py

srl_tagged_files: # tsv_files
srl_tagged_files: tsv_files
	# cd tools/srl-20131216; ./scripts/parse_srl_only_mod.sh; cd -
	cd tools/srl-20131216; ./tag_all.sh

tsv_files: # tools/fillpred_model/model.pickle
tsv_files: fillpred_model/model.pickle
	cd tools; python3 parse_all.py

tools/fillpred_model/model.pickle:
fillpred_model/model.pickle:
	cd tools/fillpred_model; $(MAKE)

env:
	cd dockerfiles; cd python-java; $(MAKE)

clean:
	rm tools/fillpred_model/model.pickle
README.md (10 changes)
@@ -11,9 +11,7 @@ Check out `./tools/srl-20131216/README.md`.

## Scripts
Check all possible xml tags that occur after the <body> tag:
``` bash
cat F0006347.xml.parsed.xml | grep -A 999999999999 -e '<body>' | grep -o -e '<[^" "]*' | sort | uniq
```
'cat F0006347.xml.parsed.xml | grep -A 999999999999 -e '<body>' | grep -o -e '<[^" "]*' | sort | uniq'

## Tools
* Parser for reading both `SSJ500k 2.1 TEI xml` and `Kres F....xml.parsed.xml` files found in `./tools/parser/parser.py`.

@@ -28,12 +26,6 @@ $ cd ./cjvt-srl-tagging
$ make
```

If you want to run it on a server overnight, you might want to use `nohup`, so you can close the ssh connection without closing the process.
```
$ nohup make &
```
See progress in the generated logfile (check the git root).

# Makefile
The Makefile follows certain steps:
1. Create a fillpred model.
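As an aside, the "create a fillpred model" step the README stops at appears, judging by the hunks further down that fit a DecisionTreeClassifier and pickle it, to produce the model.pickle that the parser's fillpred() helper later loads. A minimal, purely illustrative sketch of that step; the feature matrix and labels here are placeholders, not the project's real preprocessing:

```python
# Minimal sketch of the fillpred-model step (illustrative only).
# X and y are dummy values standing in for the project's real features.
import pickle

from sklearn.tree import DecisionTreeClassifier

X = [[0, 1], [1, 0], [1, 1], [0, 0]]  # placeholder feature rows
y = [True, False, True, False]        # placeholder FILLPRED labels

clf_full = DecisionTreeClassifier()
clf_full.fit(X, y)

# The top-level Makefile expects the result at tools/fillpred_model/model.pickle.
with open("model.pickle", "wb") as fp:
    pickle.dump(clf_full, fp)
```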
Binary file not shown.

@@ -7,8 +7,6 @@ default-jdk \
python3 \
python3-pip

RUN apt-get install -y sshfs

RUN pip3 install lxml pandas sklearn

ENV PYTHONIOENCODING UTF-8

@@ -5,16 +5,14 @@ all: build run
build:
	docker build . -t $(IMAGE_NAME)


run:
	docker run \
		-it \
		-v /home/${USER}:/home/${USER} \
		--user $(shell id -u):$(shell id -g) \
		-v /home/${USER}:/home/${USER} \
		-v /etc/passwd:/etc/passwd \
		-v /etc/group:/etc/group \
		-v $(shell pwd)/../../:/cjvt-srl-tagging \
		-w /cjvt-srl-tagging \
		-v /home/kristjan/kres_mount:/kres_mount:ro \
		python-java \
		/bin/bash
		python-java \
		/bin/bash

@@ -1,15 +0,0 @@
#!/usr/bin/python3

from lxml import etree


def tei_to_dict(s_el):


if __name__ == "__main__":
    with open("/home/kristjan/kres_mount/kres_parsed/tei/F0025751.xml.parsed.xml") as f:
        xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1)
        xmlstring = xmlstring.replace(' xml:', ' ')
        xml_tree = ElementTree.XML(xmlstring)
parser/test.py (151 changes)
@@ -1,151 +0,0 @@
#!/usr/bin/python2

from __future__ import print_function, unicode_literals, division
import sys
import os
import re
import pickle
from pathlib import Path

try:
    from lxml import etree as ElementTree
except ImportError:
    import xml.etree.ElementTree as ElementTree


# attributes
ID_ATTR = "id"
LEMMA_ATTR = "lemma"
ANA_ATTR = "ana"


# tags
SENTENCE_TAG = 's'
BIBL_TAG = 'bibl'
PARAGRAPH_TAG = 'p'
PC_TAG = 'pc'
WORD_TAG = 'w'
C_TAG = 'c'
S_TAG = 'S'
SEG_TAG = 'seg'


class Sentence:
    def __init__(self, sentence, s_id):
        self.id = s_id
        self.words = []
        self.text = ""

        for word in sentence:
            self.handle_word(word)

    def handle_word(self, word):
        # handle space after
        if word.tag == S_TAG:
            assert(word.text is None)
            self.text += ' '
            return

        # ASK am I handling this correctly?
        elif word.tag == SEG_TAG:
            for segword in word:
                self.handle_word(segword)
            return

        # ASK handle unknown tags (are there others?)
        elif word.tag not in (WORD_TAG, C_TAG):
            return

        # ID
        idx = str(len(self.words) + 1)

        # TOKEN
        token = word.text

        # LEMMA
        if word.tag == WORD_TAG:
            lemma = word.get(LEMMA_ATTR)
            assert(lemma is not None)
        else:
            lemma = token

        # XPOS
        xpos = word.get('msd')
        if word.tag == C_TAG:
            xpos = "Z"
        elif xpos in ("Gp-ppdzn", "Gp-spmzd"):
            xpos = "N"
        elif xpos is None:
            print(self.id)

        # save word entry
        self.words.append(['F{}.{}'.format(self.id, idx), token, lemma, xpos])

        # save for text
        self.text += word.text


    def to_conllu(self):
        lines = []
        # lines.append('# sent_id = ' + self.id)
        # CONLLu does not like spaces at the end of # text
        # lines.append('# text = ' + self.text.strip())
        for word in self.words:
            lines.append('\t'.join('_' if data is None else data for data in word))

        return lines

def convert_file(in_file, out_file):
    print("Nalaganje xml: {}".format(in_file))  # "Loading xml: {}"
    with open(str(in_file), 'r') as fp:
        uni_str = fp.read().decode("utf-8")
        xmlstring = re.sub(' xmlns="[^"]+"', '', uni_str, count=1)
        xmlstring = xmlstring.replace(' xml:', ' ')
        print(xmlstring[:1000])
        xml_tree = ElementTree.XML(xmlstring)

    print("Pretvarjanje TEI -> TSV-U ...")  # "Converting TEI -> TSV-U ..."
    lines = []

    for pidx, paragraph in enumerate(xml_tree.iterfind('.//body/p')):
        sidx = 1
        for sentence in paragraph:
            if sentence.tag != SENTENCE_TAG:
                continue

            sentence = Sentence(sentence, "{}.{}".format(pidx + 1, sidx))
            lines.extend(sentence.to_conllu())
            lines.append('')  # ASK newline between sentences
            sidx += 1

    if len(lines) == 0:
        raise RuntimeError("Nobenih stavkov najdenih")  # "No sentences found"

    print("Zapisovanje izhodne datoteke: {}".format(out_file))  # "Writing output file: {}"
    with open(out_file, 'w') as fp:
        for line in lines:
            if sys.version_info < (3, 0):
                line = line.encode('utf-8')
            print(line, file=fp)


if __name__ == "__main__":
    """
    Input: folder of TEI files, msds are encoded as msd="Z"
    Output: just a folder
    """

    infile = "/home/kristjan/kres_mount/kres_parsed/tei/F0025751.xml.parsed.xml"
    outfile = "test.out"
    convert_file(infile, outfile)
    sys.exit()

    in_folder = sys.argv[1]
    out_folder = sys.argv[2]
    num_processes = int(sys.argv[3])

    files = Path(in_folder).rglob("*.xml")
    in_out = []
    for filename in files:
        out_file = out_folder + "/" + filename.name[:-4] + ".txt"
        convert_file(filename, out_file)
@@ -51,4 +51,4 @@ if __name__ == "__main__":
        print(i, df.shape)

    print(ndf.head())
    ndf.to_pickle(Path(OUTFILE))
    ndf.to_pickle(OUTFILE)

@@ -27,6 +27,4 @@ if __name__ == "__main__":
    clf_full = DecisionTreeClassifier()
    clf_full.fit(X, y)

    with open(OUTFILE, "wb") as fp:
        pickle.dump(clf_full, fp)

    pickle.dump(clf_full, open(OUTFILE, "wb"))
@@ -1,10 +1,11 @@
from pathlib import Path
from parser.parser import Parser
import configparser
import json
import sys
import logging
from multiprocessing import Pool

# defaults
ORIGPATH = Path("../data/kres_example")  # we need the IDs
INPATH = Path("../data/kres_example_srl")
OUTPATH = Path("../data/kres_example_json")

# parse config
config = configparser.ConfigParser()

@@ -12,103 +13,45 @@ config.read("tools.cfg")
ORIGPATH = Path(config["tools"]["kres_orig"])
INPATH = Path(config["tools"]["kres_srl"])
OUTPATH = Path(config["tools"]["kres_json"])
DEBUG = config["tools"]["debug"] == "True"
CPU_CORES = int(config["tools"]["cpu_cores"])

LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
LOGFILE.resolve()

logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)

def get_origfile(filename):
    for origfile in ORIGPATH.iterdir():
        if filename.name.split('.')[0] == origfile.name.split('.')[0]:
            return origfile
    raise FileNotFoundError
    for origfile in ORIGPATH.iterdir():
        if filename.name.split('.')[0] == origfile.name.split('.')[0]:
            return origfile
    raise FileNotFoundError

def extract_sentences(line_reader):
    acc = []
    # last char in line is \n, remove it
    for line in [x.decode("utf-8")[:-1].split('\t') for x in line_reader]:
        if len(line) == 1:  # empty line
            tmp = acc
            acc = []
            yield tmp
        else:
            acc.append(line)
    acc = []
    for line in [x.decode("utf-8").split('\t') for x in line_reader]:
        if line[0] == '\n':
            tmp = acc
            acc = []
            yield tmp
        else:
            acc.append(line)

def to_sentence(sentence_arr):
    return " ".join([token[1] for token in sentence_arr])

def match_sentence_id(sentence, orig_dict):
    for k, e in orig_dict.items():
        orig_sentence = " ".join(token[2] for token in e["tokens"])
        if sentence == orig_sentence:
            return k
    raise KeyError

def get_dep_rel(token):
    logging.debug(token)
    for i, field in enumerate(token[14:]):
        if field != "_":
            return {
                "arg": field,
                "from": i,  # i-th predicate in sentence
                "dep": token[0],
            }
    return None

def handle_file(infile_tpl):
    i = infile_tpl[0]
    infile = infile_tpl[1]
    outfile = (OUTPATH / infile.name).with_suffix(".json")
    origfile = get_origfile(infile)
    orig_dict = par.parse_tei(origfile)

    with infile.open("rb") as fp:
        outdata = {}
        for sentence_arr in extract_sentences(fp.readlines()):
            # tsv dropped sentence ids, match the ID, using original data
            sid = match_sentence_id(to_sentence(sentence_arr), orig_dict)

            outdata[sid] = []

            # find all predicate indices in the sentence
            predicates = []
            for token in sentence_arr:
                if token[12] == "Y":
                    predicates += [token[0]]  # idx

                deprel = get_dep_rel(token)
                if deprel is not None:
                    outdata[sid].append(deprel)

            # deprel["from"] points to n-th predicate
            # replace with predicate's token index
            for deprel in outdata[sid]:
                deprel["from"] = predicates[deprel["from"]]

            if DEBUG:
                print(to_sentence(sentence_arr))
                print(outdata[sid])
                print(sid)
                print()
                print()

    with outfile.open("w") as fp:
        json.dump(outdata, fp)
    logging.info("SRL relations written to: {}".format(outfile))
def match_sentence_id(string, rd):
    str1 = " ".join([token[1] for token in sentence_arr])
    for k, e in rd.items():
        str2 = " ".join(token[2] for token in dict_entry["tokens"])
        if str1 == str2:
            return k
    raise KeyError


# main
par = Parser()
OUTPATH.mkdir(exist_ok=True)
if __name__ == "__main__":

    infiles = list(enumerate([x for x in INPATH.iterdir() if x.is_file()]))
    logging.info("Generating JSON SRL files from {} tsv files.".format(len(infiles)))
    par = Parser()

    with Pool(CPU_CORES) as p:
        p.map(handle_file, infiles)
    for infile in [x for x in INPATH.iterdir() if x.is_file()]:
        origfile = get_origfile(infile)
        rd = par.parse_tei(origfile)

    logging.info("Finished generating .json files.")
        fp = infile.open("rb")
        for sentence_arr in extract_sentences(fp.readlines()):
            sid = match_sentence_id(sentence_arr, rd)
            print(sid)
            # OK, we got the sentence id, now generate the predicate map!


        outfile = (OUTPATH / infile.name).with_suffix(".json")
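To make the predicate remapping in handle_file above easier to follow, here is a toy, self-contained rerun of that logic under the column layout the code assumes: token id at index 0, FILLPRED at index 12, APRED columns from index 14 on. The rows themselves are invented.

```python
# Toy illustration of the "from: i-th predicate -> predicate token id" remap
# performed in handle_file above. Only the columns the code reads matter.
def get_dep_rel(token):
    for i, field in enumerate(token[14:]):
        if field != "_":
            return {"arg": field, "from": i, "dep": token[0]}
    return None

sentence_arr = [
    # ID, 11 elided columns, FILLPRED, PRED, APRED1, APRED2
    ["1"] + ["_"] * 11 + ["Y", "pred.01", "_", "_"],
    ["2"] + ["_"] * 11 + ["_", "_", "A1", "_"],
    ["3"] + ["_"] * 11 + ["Y", "pred.02", "_", "_"],
    ["4"] + ["_"] * 11 + ["_", "_", "_", "A0"],
]

predicates, deprels = [], []
for token in sentence_arr:
    if token[12] == "Y":
        predicates.append(token[0])
    deprel = get_dep_rel(token)
    if deprel is not None:
        deprels.append(deprel)

# "from" starts as the n-th predicate; replace it with that predicate's token id.
for deprel in deprels:
    deprel["from"] = predicates[deprel["from"]]

print(deprels)
# [{'arg': 'A1', 'from': '1', 'dep': '2'}, {'arg': 'A0', 'from': '3', 'dep': '4'}]
```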

@@ -6,8 +6,10 @@ import re
import sys
import cProfile
import configparser
import logging
from multiprocessing import Pool

# some defaults
INDIR = Path("../data/kres_example")
OUTDIR = Path("../data/kres_example_tsv")

SSJ500K_2_1 = 27829  # number of sentences
par = Parser()

@@ -17,13 +19,6 @@ config = configparser.ConfigParser()
config.read("tools.cfg")
INDIR = Path(config["tools"]["kres_orig"])
OUTDIR = Path(config["tools"]["kres_tsv"])
CPU_CORES = int(config["tools"]["cpu_cores"])

LOGFILE = Path(config["tools"]["logfile"]).absolute()
LOGFILE.touch(exist_ok=True)
LOGFILE.resolve()

logging.basicConfig(filename=str(LOGFILE), level=logging.INFO)

"""
print("parsing ssj")

@@ -33,42 +28,22 @@ ssj_dict = par.parse_tei(ssj_file)
print("end parsing ssj")
"""

print("parsing kres")
# kres_file = "../data/kres_example/F0019343.xml.parsed.xml"
OUTDIR.mkdir(exist_ok=True)

infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()]))
logging.info("Parsing kres: {} files.".format(len(infiles)))
for kres_file in [x for x in INDIR.iterdir() if x.is_file()]:

def handle_file(infile):
    i = infile[0]
    kres_file = infile[1]
    outfile = (OUTDIR / kres_file.name).with_suffix(".tsv")
    print("Processing file: " + str(kres_file))
    res_dict = par.parse_tei(kres_file)
    longest_sent = max([len(e["tokens"]) for k, e in res_dict.items()])
    print("Longest sentence: ", longest_sent)
    kres_out_str = ""

    if outfile.is_file():
        logging.info("Skipping existing file: {}.".format(str(kres_file)))
        return True
    for _, sentence in res_dict.items():
        kres_out_str += par.to_conll_2009_SRL(sentence, longest_sent)

    try:
        res_dict = par.parse_tei(kres_file)
        kres_out_str = ""
        for _, sentence in res_dict.items():
            kres_out_str += par.to_conll_2009_SRL(sentence)
    except Exception as exc:
        logging.info("Failed processing file: {}".format(str(kres_file)))
        logging.error(exc)
        return False


    with outfile.open("wb+") as fp:
    with (OUTDIR / kres_file.name).with_suffix(".tsv").open("wb+") as fp:
        fp.write(kres_out_str.encode("utf-8"))
    logging.info("Processed file ({}/{}): {}".format(i+1, len(infiles), str(kres_file)))
    return True
    return False

with Pool(CPU_CORES) as p:
    p.map(handle_file, infiles)


logging.info("end parsing kres")


    fp.close()
print("end parsing kres")
@@ -35,11 +35,7 @@ class Msdmap:
    def slo_msd_to_eng_long(self, slo_msd):
        # old, slow
        # return (self.msd_table[self.msd_table["slo_msd"] == slo_msd]["eng_long"]).values[0]
        # return self.msd_table.query("slo_msd == '{}'".format(slo_msd))["eng_long"].values[0]
        query = self.msd_table.query("slo_msd == '{}'".format(slo_msd))
        if query.empty:
            return "No-matching-msd-found"
        return query["eng_long"].values[0]
        return self.msd_table.query("slo_msd == '{}'".format(slo_msd))["eng_long"].values[0]

    def slo_msd_to_eng_pos(self, slo_msd):
        # first letter in slo_msd == slo_pos
@@ -119,7 +119,7 @@ class Parser:
        return res_dict


    def to_conll_2009_SRL(self, sentence_entry):
    def to_conll_2009_SRL(self, sentence_entry, napreds=9):

        def fillpred(tsv_row):
            mrow = build_model_row(tsv_row)

@@ -127,6 +127,8 @@ class Parser:
            y = self.fillpred_model.predict([x])
            return y[0]  # bool

        apreds_string = '\t'.join(["_" for x in range(napreds)])

        # works with kres, with parsed links
        out_str = ""
        for token in sentence_entry["tokens"]:

@@ -139,7 +141,7 @@ class Parser:
                [t_id] +
                [form for x in range(7)] +
                ["0", "0", "modra", "modra", "_", "_"] +
                ["\n"]
                [apreds_string, "\n"]
            )
            continue

@@ -168,6 +170,7 @@ class Parser:
                sentence_entry["links"][t_id][0],  # pdeprel
                "_",  # fillpred
                "_",  # pred
                apreds_string,
                "\n",
            ]
            fprd = fillpred(row_list)
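The napreds change above pads every CoNLL-2009-style row with a fixed number of empty APRED columns so that all rows in a file have the same width; one side of the kres-parsing hunk earlier in this diff (presumably tools/parse_all.py, going by the Makefile) computes the longest sentence in a file and passes it in for that purpose. A small sketch with made-up values:

```python
# Sketch of the APRED padding introduced above. `longest_sent` stands in for
# max(len(e["tokens"]) for _, e in res_dict.items()) from the kres-parsing
# script; t_id and form are made-up example values.
longest_sent = 3
apreds_string = '\t'.join(["_" for _ in range(longest_sent)])

t_id, form = "5", ","
row = '\t'.join(
    [t_id] +
    [form for _ in range(7)] +
    ["0", "0", "modra", "modra", "_", "_"] +
    [apreds_string]
) + '\n'
print(row)  # 14 fixed columns followed by 3 "_" APRED placeholders
```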

@@ -1,8 +1,5 @@
[tools]
kres_orig = /kres_mount/kres_parsed/tei
kres_tsv = ../data/kres_out/1_tsv
kres_srl = ../data/kres_out/2_srl
kres_json = ../data/kres_out/final_json
logfile = ../progress.log
cpu_cores = 5
debug = False
kres_orig = ../data/kres_example
kres_tsv = ../data/kres_example_tsv
kres_srl = ../data/kres_example_srl
kres_json = ../data/kres/example_json

@@ -1,8 +0,0 @@
[tools]
kres_orig = ../data/kres_example
kres_tsv = ../data/kres_out/1_tsv
kres_srl = ../data/kres_out/2_srl
kres_json = ../data/kres_out/final_json
logfile = ../progress.log
cpu_cores = 1
debug = False