Compare commits
	
		
			17 Commits
		
	
	
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| a6cee3d459 | |||
| b32bd3e7c6 | |||
| 044fae2001 | |||
| 406e88ade8 | |||
| bf999a965f | |||
| d45b6d9f47 | |||
| a61ec8770a | |||
| ff25acd3c7 | |||
| 3881c74613 | |||
| 17cb0677a7 | |||
| fd0f9794f1 | |||
| 12f3994115 | |||
| dcc2935c3c | |||
| 60ac569f40 | |||
| b4c7ac5427 | |||
| 5c9cf59723 | |||
| 577c8418d2 | 
							
								
								
									
										7
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										7
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @ -1,5 +1,8 @@ | ||||
| *.pyc | ||||
| *.pickle | ||||
| *.log | ||||
| 
 | ||||
| data/*/*.xml | ||||
| data/*/*.tsv | ||||
| nohup.out | ||||
| 
 | ||||
| data/kres_out/* | ||||
| data/kres_example/ | ||||
|  | ||||
							
								
								
									
										15
									
								
								Makefile
									
									
									
									
									
								
							
							
						
						
									
										15
									
								
								Makefile
									
									
									
									
									
								
							| @ -1,19 +1,22 @@ | ||||
| .PHONY: tsv_files srl_tagged_files json_files env | ||||
| .PHONY: tsv_files srl_tagged_files json_files env clean | ||||
| 
 | ||||
| all: json_files | ||||
| all: tools/fillpred_model/model.pickle tsv_files srl_tagged_files json_files | ||||
| 
 | ||||
| json_files: #TODO srl_tagged_files
 | ||||
| json_files: # srl_tagged_files
 | ||||
| 	cd tools; python3 gen_json.py | ||||
| 
 | ||||
| srl_tagged_files: tsv_files | ||||
| srl_tagged_files: # tsv_files
 | ||||
| 	# cd tools/srl-20131216; ./scripts/parse_srl_only_mod.sh; cd - | ||||
| 	cd tools/srl-20131216; ./tag_all.sh | ||||
| 
 | ||||
| tsv_files: fillpred_model/model.pickle | ||||
| tsv_files: # tools/fillpred_model/model.pickle
 | ||||
| 	cd tools; python3 parse_all.py | ||||
| 
 | ||||
| fillpred_model/model.pickle: | ||||
| tools/fillpred_model/model.pickle: | ||||
| 	cd tools/fillpred_model; $(MAKE) | ||||
| 
 | ||||
| env: | ||||
| 	cd dockerfiles; cd python-java; $(MAKE) | ||||
| 
 | ||||
| clean: | ||||
| 	rm tools/fillpred_model/model.pickle | ||||
|  | ||||
							
								
								
									
										10
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										10
									
								
								README.md
									
									
									
									
									
								
							| @ -11,7 +11,9 @@ Check out `./tools/srl-20131216/README.md`. | ||||
| 
 | ||||
| ## Scripts | ||||
| Check all possible XML tags (that occur after the <body> tag).   | ||||
| 'cat F0006347.xml.parsed.xml | grep -A 999999999999 -e '<body>' | grep -o -e '<[^" "]*' | sort | uniq' | ||||
| ``` bash | ||||
| cat F0006347.xml.parsed.xml | grep -A 999999999999 -e '<body>' | grep -o -e '<[^" "]*' | sort | uniq | ||||
| ``` | ||||
| 
 | ||||
| ## Tools | ||||
| * Parser for reading both `SSJ500k 2.1 TEI xml` and `Kres F....xml.parsed.xml` files found in `./tools/parser/parser.py`.   | ||||
| @ -26,6 +28,12 @@ $ cd ./cjvt-srl-tagging | ||||
| $ make | ||||
| ``` | ||||
| 
 | ||||
| If you want to run it on a server overnight, you might want to use `nohup`, so you can close the ssh connection without closing the process.   | ||||
| ``` | ||||
| $ nohup make & | ||||
| ``` | ||||
| See progress in the generated logfile (check the git root).   | ||||
| 
 | ||||
| # Makefile | ||||
| The Makefile follows certain steps: | ||||
| 1. Create a fillpred model. | ||||
|  | ||||
										
											Binary file not shown.
										
									
								
							| @ -7,6 +7,8 @@ default-jdk \ | ||||
| python3 \ | ||||
| python3-pip | ||||
| 
 | ||||
| RUN apt-get install -y sshfs | ||||
| 
 | ||||
| RUN pip3 install lxml pandas sklearn | ||||
| 
 | ||||
| ENV PYTHONIOENCODING UTF-8 | ||||
|  | ||||
| @ -5,14 +5,16 @@ all: build run | ||||
| build: | ||||
| 	docker build . -t $(IMAGE_NAME) | ||||
| 
 | ||||
| 
 | ||||
| run: | ||||
| 	docker run \
 | ||||
|     	-it \
 | ||||
| 	--user $(shell id -u):$(shell id -g) \
 | ||||
| 	-v /home/${USER}:/home/${USER} \
 | ||||
| 	--user $(shell id -u):$(shell id -g) \
 | ||||
| 	-v /etc/passwd:/etc/passwd \
 | ||||
| 	-v /etc/group:/etc/group \
 | ||||
|     	-v $(shell pwd)/../../:/cjvt-srl-tagging \
 | ||||
| 	-w /cjvt-srl-tagging \
 | ||||
|     	python-java \
 | ||||
|     	/bin/bash | ||||
| 	-v /home/kristjan/kres_mount:/kres_mount:ro \
 | ||||
|     python-java \
 | ||||
|     /bin/bash | ||||
|  | ||||
							
								
								
									
										15
									
								
								parser/tei_to_dict.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										15
									
								
								parser/tei_to_dict.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,15 @@ | ||||
| #!/usr/bin/python3 | ||||
| 
 | ||||
| from lxml import etree | ||||
| 
 | ||||
| def tei_to_dict(s_el): | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == "__main__": | ||||
|     with open("/home/kristjan/kres_mount/kres_parsed/tei/F0025751.xml.parsed.xml") as f: | ||||
|         xmlstring = re.sub(' xmlns="[^"]+"', '', fp.read(), count=1) | ||||
|         xmlstring = xmlstring.replace(' xml:', ' ') | ||||
|         xml_tree = ElementTree.XML(xmlstring) | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
							
								
								
									
										151
									
								
								parser/test.py
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										151
									
								
								parser/test.py
									
									
									
									
									
										Executable file
									
								
							| @ -0,0 +1,151 @@ | ||||
| #!/usr/bin/python2 | ||||
| 
 | ||||
| from __future__ import print_function, unicode_literals, division | ||||
| import sys | ||||
| import os | ||||
| import re | ||||
| import pickle | ||||
| from pathlib import Path | ||||
| 
 | ||||
| try: | ||||
|     from lxml import etree as ElementTree | ||||
| except ImportError: | ||||
|     import xml.etree.ElementTree as ElementTree | ||||
| 
 | ||||
| 
 | ||||
| # attributes | ||||
| ID_ATTR = "id" | ||||
| LEMMA_ATTR = "lemma" | ||||
| ANA_ATTR = "ana" | ||||
| 
 | ||||
| 
 | ||||
| # tags | ||||
| SENTENCE_TAG = 's' | ||||
| BIBL_TAG = 'bibl' | ||||
| PARAGRAPH_TAG = 'p' | ||||
| PC_TAG = 'pc' | ||||
| WORD_TAG = 'w' | ||||
| C_TAG = 'c' | ||||
| S_TAG = 'S' | ||||
| SEG_TAG = 'seg' | ||||
| 
 | ||||
| 
 | ||||
| class Sentence: | ||||
|     def __init__(self, sentence, s_id): | ||||
|         self.id = s_id | ||||
|         self.words = [] | ||||
|         self.text = "" | ||||
| 
 | ||||
|         for word in sentence: | ||||
|             self.handle_word(word) | ||||
| 
 | ||||
|     def handle_word(self, word): | ||||
|         # handle space after | ||||
|         if word.tag == S_TAG: | ||||
|             assert(word.text is None) | ||||
|             self.text += ' ' | ||||
|             return | ||||
| 
 | ||||
|         # ASK am I handling this correctly? | ||||
|         elif word.tag == SEG_TAG: | ||||
|             for segword in word: | ||||
|                 self.handle_word(segword) | ||||
|             return | ||||
| 
 | ||||
|         # ASK handle unknown tags (are there others?) | ||||
|         elif word.tag not in (WORD_TAG, C_TAG): | ||||
|             return | ||||
| 
 | ||||
|         # ID | ||||
|         idx = str(len(self.words) + 1) | ||||
| 
 | ||||
|         # TOKEN | ||||
|         token = word.text | ||||
| 
 | ||||
|         # LEMMA | ||||
|         if word.tag == WORD_TAG: | ||||
|             lemma = word.get(LEMMA_ATTR) | ||||
|             assert(lemma is not None) | ||||
|         else: | ||||
|             lemma = token | ||||
| 
 | ||||
|         # XPOS | ||||
|         xpos = word.get('msd') | ||||
|         if word.tag == C_TAG: | ||||
|             xpos = "Z" | ||||
|         elif xpos in ("Gp-ppdzn", "Gp-spmzd"): | ||||
|             xpos = "N" | ||||
|         elif xpos is None: | ||||
|             print(self.id) | ||||
| 
 | ||||
|         # save word entry | ||||
|         self.words.append(['F{}.{}'.format(self.id, idx), token, lemma, xpos]) | ||||
| 
 | ||||
|         # save for text | ||||
|         self.text += word.text | ||||
| 
 | ||||
| 
 | ||||
|     def to_conllu(self): | ||||
|         lines = [] | ||||
|         # lines.append('# sent_id = ' + self.id) | ||||
|         # CONLLu does not like spaces at the end of # text | ||||
|         # lines.append('# text = ' + self.text.strip()) | ||||
|         for word in self.words: | ||||
|             lines.append('\t'.join('_' if data is None else data for data in word)) | ||||
| 
 | ||||
|         return lines | ||||
| 
 | ||||
| def convert_file(in_file, out_file): | ||||
|     print("Nalaganje xml: {}".format(in_file)) | ||||
|     with open(str(in_file), 'r') as fp: | ||||
|         uni_str = fp.read().decode("utf-8") | ||||
|         xmlstring = re.sub(' xmlns="[^"]+"', '', uni_str, count=1) | ||||
|         xmlstring = xmlstring.replace(' xml:', ' ') | ||||
|         print(xmlstring[:1000]) | ||||
|         xml_tree = ElementTree.XML(xmlstring) | ||||
| 
 | ||||
|     print("Pretvarjanje TEI -> TSV-U ...") | ||||
|     lines = [] | ||||
| 
 | ||||
|     for pidx, paragraph in enumerate(xml_tree.iterfind('.//body/p')): | ||||
|         sidx = 1 | ||||
|         for sentence in paragraph: | ||||
|             if sentence.tag != SENTENCE_TAG: | ||||
|                 continue | ||||
| 
 | ||||
|             sentence = Sentence(sentence, "{}.{}".format(pidx + 1, sidx)) | ||||
|             lines.extend(sentence.to_conllu()) | ||||
|             lines.append('') # ASK newline between sentences | ||||
|             sidx += 1 | ||||
| 
 | ||||
|     if len(lines) == 0: | ||||
|         raise RuntimeError("Nobenih stavkov najdenih") | ||||
| 
 | ||||
|     print("Zapisovanje izhodne datoteke: {}".format(out_file)) | ||||
|     with open(out_file, 'w') as fp: | ||||
|         for line in lines: | ||||
|             if sys.version_info < (3, 0): | ||||
|                 line = line.encode('utf-8') | ||||
|             print(line, file=fp) | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == "__main__": | ||||
|     """ | ||||
|     Input: folder of TEI files, msds are encoded as msd="Z" | ||||
|     Ouput: just a folder | ||||
|     """ | ||||
| 	  | ||||
|     infile = "/home/kristjan/kres_mount/kres_parsed/tei/F0025751.xml.parsed.xml" | ||||
|     outfile = "test.out" | ||||
|     convert_file(infile, outfile) | ||||
|     sys.exit() | ||||
| 
 | ||||
|     in_folder = sys.argv[1] | ||||
|     out_folder = sys.argv[2] | ||||
|     num_processes = int(sys.argv[3]) | ||||
| 
 | ||||
|     files = Path(in_folder).rglob("*.xml") | ||||
|     in_out = [] | ||||
|     for filename in files: | ||||
|         out_file = out_folder + "/" + filename.name[:-4] + ".txt" | ||||
|         convert_file(filename, out_file) | ||||
| @ -51,4 +51,4 @@ if __name__ == "__main__": | ||||
| 			print(i, df.shape) | ||||
| 
 | ||||
| 	print(ndf.head()) | ||||
| 	ndf.to_pickle(OUTFILE) | ||||
| 	ndf.to_pickle(Path(OUTFILE)) | ||||
|  | ||||
| @ -27,4 +27,6 @@ if __name__ == "__main__": | ||||
| 	clf_full = DecisionTreeClassifier() | ||||
| 	clf_full.fit(X, y) | ||||
| 
 | ||||
| 	pickle.dump(clf_full, open(OUTFILE, "wb")) | ||||
| 	with open(OUTFILE, "wb") as fp: | ||||
| 		pickle.dump(clf_full, fp) | ||||
| 
 | ||||
|  | ||||
| @ -1,11 +1,10 @@ | ||||
| from pathlib import Path | ||||
| from parser.parser import Parser | ||||
| import configparser | ||||
| 
 | ||||
| # defaults | ||||
| ORIGPATH = Path("../data/kres_example")  # we need the IDs | ||||
| INPATH = Path("../data/kres_example_srl") | ||||
| OUTPATH = Path("../data/kres_example_json") | ||||
| import json | ||||
| import sys | ||||
| import logging | ||||
| from multiprocessing import Pool | ||||
| 
 | ||||
| # parse config | ||||
| config = configparser.ConfigParser() | ||||
| @ -13,45 +12,103 @@ config.read("tools.cfg") | ||||
| ORIGPATH = Path(config["tools"]["kres_orig"]) | ||||
| INPATH = Path(config["tools"]["kres_srl"]) | ||||
| OUTPATH = Path(config["tools"]["kres_json"]) | ||||
| DEBUG = config["tools"]["debug"] == "True" | ||||
| CPU_CORES = int(config["tools"]["cpu_cores"]) | ||||
| 
 | ||||
| LOGFILE = Path(config["tools"]["logfile"]).absolute() | ||||
| LOGFILE.touch(exist_ok=True) | ||||
| LOGFILE.resolve() | ||||
| 
 | ||||
| logging.basicConfig(filename=str(LOGFILE), level=logging.INFO) | ||||
| 
 | ||||
| def get_origfile(filename): | ||||
| 	for origfile in ORIGPATH.iterdir(): | ||||
| 		if filename.name.split('.')[0] == origfile.name.split('.')[0]: | ||||
| 			return origfile | ||||
| 	raise FileNotFoundError | ||||
|     for origfile in ORIGPATH.iterdir(): | ||||
|         if filename.name.split('.')[0] == origfile.name.split('.')[0]: | ||||
|             return origfile | ||||
|     raise FileNotFoundError | ||||
| 
 | ||||
| def extract_sentences(line_reader): | ||||
| 	acc = [] | ||||
| 	for line in [x.decode("utf-8").split('\t') for x in line_reader]: | ||||
| 		if line[0] == '\n': | ||||
| 			tmp = acc | ||||
| 			acc = [] | ||||
| 			yield tmp | ||||
| 		else: | ||||
| 			acc.append(line) | ||||
|     acc = [] | ||||
|     # last char in line is \n, remove it | ||||
|     for line in [x.decode("utf-8")[:-1].split('\t') for x in line_reader]: | ||||
|         if len(line) == 1:  # empty line | ||||
|             tmp = acc | ||||
|             acc = [] | ||||
|             yield tmp | ||||
|         else: | ||||
|             acc.append(line) | ||||
| 
 | ||||
| def match_sentence_id(string, rd): | ||||
| 	str1 = " ".join([token[1] for token in sentence_arr]) | ||||
| 	for k, e in rd.items(): | ||||
| 		str2 = " ".join(token[2] for token in dict_entry["tokens"]) | ||||
| 		if str1 == str2: | ||||
| 			return k | ||||
| 	raise KeyError | ||||
| def to_sentence(sentence_arr): | ||||
|     return " ".join([token[1] for token in sentence_arr]) | ||||
| 
 | ||||
| def match_sentence_id(sentence, orig_dict): | ||||
|     for k, e in orig_dict.items(): | ||||
|         orig_sentence = " ".join(token[2] for token in e["tokens"]) | ||||
|         if sentence == orig_sentence: | ||||
|             return k | ||||
|     raise KeyError | ||||
| 
 | ||||
| def get_dep_rel(token): | ||||
|     logging.debug(token) | ||||
|     for i, field in enumerate(token[14:]): | ||||
|         if field != "_": | ||||
|             return { | ||||
|                 "arg":  field, | ||||
|                 "from": i,  # i-th predicate in sentence | ||||
|                 "dep":  token[0], | ||||
|             } | ||||
|     return None | ||||
| 
 | ||||
| def handle_file(infile_tpl): | ||||
|     i = infile_tpl[0] | ||||
|     infile = infile_tpl[1] | ||||
|     outfile = (OUTPATH / infile.name).with_suffix(".json") | ||||
|     origfile = get_origfile(infile) | ||||
|     orig_dict = par.parse_tei(origfile) | ||||
| 
 | ||||
|     with infile.open("rb") as fp: | ||||
|         outdata = {} | ||||
|         for sentence_arr in extract_sentences(fp.readlines()): | ||||
|             # tsv dropped sentence ids, match the ID, using original data | ||||
|             sid = match_sentence_id(to_sentence(sentence_arr), orig_dict) | ||||
| 
 | ||||
|             outdata[sid] = [] | ||||
| 
 | ||||
|             # find all predicate indices in the sentence | ||||
|             predicates = [] | ||||
|             for token in sentence_arr: | ||||
|                 if token[12] == "Y": | ||||
|                     predicates += [token[0]]  # idx | ||||
| 
 | ||||
|                 deprel = get_dep_rel(token) | ||||
|                 if deprel is not None: | ||||
|                     outdata[sid].append(deprel) | ||||
| 
 | ||||
|             # deprel["from"] points to n-th predicate | ||||
|             # replace with predicate's token index | ||||
|             for deprel in outdata[sid]: | ||||
|                 deprel["from"] = predicates[deprel["from"]] | ||||
| 
 | ||||
|             if DEBUG: | ||||
|                 print(to_sentence(sentence_arr)) | ||||
|                 print(outdata[sid]) | ||||
|                 print(sid) | ||||
|                 print() | ||||
|                 print() | ||||
| 
 | ||||
|     with outfile.open("w") as fp: | ||||
|         json.dump(outdata, fp) | ||||
|         logging.info("SRL relations written to: {}".format(outfile)) | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == "__main__": | ||||
| # main | ||||
| par = Parser() | ||||
| OUTPATH.mkdir(exist_ok=True) | ||||
| 
 | ||||
| 	par = Parser() | ||||
| infiles = list(enumerate([x for x in INPATH.iterdir() if x.is_file()])) | ||||
| logging.info("Generating JSON SRL files from {} tsv files.".format(len(infiles))) | ||||
| 
 | ||||
| 	for infile in [x for x in INPATH.iterdir() if x.is_file()]: | ||||
| 		origfile = get_origfile(infile) | ||||
| 		rd = par.parse_tei(origfile) | ||||
| with Pool(CPU_CORES) as p: | ||||
|     p.map(handle_file, infiles) | ||||
| 
 | ||||
| 		fp = infile.open("rb") | ||||
| 		for sentence_arr in extract_sentences(fp.readlines()): | ||||
| 			sid = match_sentence_id(sentence_arr, rd) | ||||
| 			print(sid) | ||||
| 			# OK, we got the sentence id, now generate the predicate map! | ||||
| 
 | ||||
| 
 | ||||
| 		outfile = (OUTPATH / infile.name).with_suffix(".json") | ||||
| logging.info("Finished generating .json files.") | ||||
|  | ||||
| @ -6,10 +6,8 @@ import re | ||||
| import sys | ||||
| import cProfile | ||||
| import configparser | ||||
| 
 | ||||
| # some defaults | ||||
| INDIR = Path("../data/kres_example") | ||||
| OUTDIR = Path("../data/kres_example_tsv") | ||||
| import logging | ||||
| from multiprocessing import Pool | ||||
| 
 | ||||
| SSJ500K_2_1 = 27829  # number of sentences | ||||
| par = Parser() | ||||
| @ -19,6 +17,13 @@ config = configparser.ConfigParser() | ||||
| config.read("tools.cfg") | ||||
| INDIR = Path(config["tools"]["kres_orig"]) | ||||
| OUTDIR = Path(config["tools"]["kres_tsv"]) | ||||
| CPU_CORES = int(config["tools"]["cpu_cores"]) | ||||
| 
 | ||||
| LOGFILE = Path(config["tools"]["logfile"]).absolute() | ||||
| LOGFILE.touch(exist_ok=True) | ||||
| LOGFILE.resolve() | ||||
| 
 | ||||
| logging.basicConfig(filename=str(LOGFILE), level=logging.INFO) | ||||
| 
 | ||||
| """ | ||||
| print("parsing ssj") | ||||
| @ -28,22 +33,42 @@ ssj_dict = par.parse_tei(ssj_file) | ||||
| print("end parsing ssj") | ||||
| """ | ||||
| 
 | ||||
| print("parsing kres") | ||||
| # kres_file = "../data/kres_example/F0019343.xml.parsed.xml" | ||||
| OUTDIR.mkdir(exist_ok=True) | ||||
| 
 | ||||
| for kres_file in [x for x in INDIR.iterdir() if x.is_file()]: | ||||
| infiles = list(enumerate([x for x in INDIR.iterdir() if x.is_file()])) | ||||
| logging.info("Parsing kres: {} files.".format(len(infiles))) | ||||
| 
 | ||||
|     print("Processing file: " + str(kres_file)) | ||||
|     res_dict = par.parse_tei(kres_file) | ||||
|     longest_sent = max([len(e["tokens"]) for k, e in res_dict.items()]) | ||||
|     print("Longest sentence: ", longest_sent) | ||||
|     kres_out_str = "" | ||||
| def handle_file(infile): | ||||
|     i = infile[0] | ||||
|     kres_file = infile[1] | ||||
|     outfile = (OUTDIR / kres_file.name).with_suffix(".tsv") | ||||
| 
 | ||||
|     for _, sentence in res_dict.items(): | ||||
|         kres_out_str += par.to_conll_2009_SRL(sentence, longest_sent) | ||||
|     if outfile.is_file(): | ||||
|         logging.info("Skipping existing file: {}.".format(str(kres_file))) | ||||
|         return True | ||||
| 
 | ||||
|     with (OUTDIR / kres_file.name).with_suffix(".tsv").open("wb+") as fp: | ||||
|     try: | ||||
|         res_dict = par.parse_tei(kres_file) | ||||
|         kres_out_str = "" | ||||
|         for _, sentence in res_dict.items(): | ||||
|             kres_out_str += par.to_conll_2009_SRL(sentence) | ||||
|     except Exception as exc: | ||||
|         logging.info("Failed processing file: {}".format(str(kres_file))) | ||||
|         logging.error(exc) | ||||
|         return False | ||||
| 
 | ||||
| 
 | ||||
|     with outfile.open("wb+") as fp: | ||||
|         fp.write(kres_out_str.encode("utf-8")) | ||||
|         fp.close() | ||||
| print("end parsing kres") | ||||
|         logging.info("Processed file ({}/{}): {}".format(i+1, len(infiles), str(kres_file))) | ||||
|         return True | ||||
|     return False | ||||
| 
 | ||||
| with Pool(CPU_CORES) as p: | ||||
|     p.map(handle_file, infiles) | ||||
| 
 | ||||
| 
 | ||||
| logging.info("end parsing kres") | ||||
| 
 | ||||
| 
 | ||||
|  | ||||
| @ -35,7 +35,11 @@ class Msdmap: | ||||
|     def slo_msd_to_eng_long(self, slo_msd): | ||||
|         # old, slow | ||||
|         # return (self.msd_table[self.msd_table["slo_msd"] == slo_msd]["eng_long"]).values[0] | ||||
|         return self.msd_table.query("slo_msd == '{}'".format(slo_msd))["eng_long"].values[0] | ||||
|         # return self.msd_table.query("slo_msd == '{}'".format(slo_msd))["eng_long"].values[0] | ||||
|         query = self.msd_table.query("slo_msd == '{}'".format(slo_msd)) | ||||
|         if query.empty: | ||||
|             return "No-matching-msd-found" | ||||
|         return query["eng_long"].values[0] | ||||
| 
 | ||||
|     def slo_msd_to_eng_pos(self, slo_msd): | ||||
|         # first letter in slo_msd == slo_pos  | ||||
|  | ||||
| @ -119,7 +119,7 @@ class Parser: | ||||
|         return res_dict | ||||
| 
 | ||||
| 
 | ||||
|     def to_conll_2009_SRL(self, sentence_entry, napreds=9): | ||||
|     def to_conll_2009_SRL(self, sentence_entry): | ||||
| 
 | ||||
|         def fillpred(tsv_row): | ||||
|             mrow = build_model_row(tsv_row) | ||||
| @ -127,8 +127,6 @@ class Parser: | ||||
|             y = self.fillpred_model.predict([x]) | ||||
|             return y[0]  # bool | ||||
| 
 | ||||
|         apreds_string = '\t'.join(["_" for x in range(napreds)]) | ||||
| 
 | ||||
|         # works with kres, with parsed links | ||||
|         out_str = "" | ||||
|         for token in sentence_entry["tokens"]: | ||||
| @ -141,7 +139,7 @@ class Parser: | ||||
|                     [t_id] + | ||||
|                     [form for x in range(7)] +  | ||||
|                     ["0", "0", "modra", "modra", "_", "_"] + | ||||
|                     [apreds_string, "\n"] | ||||
|                     ["\n"] | ||||
|                 ) | ||||
|                 continue  | ||||
| 
 | ||||
| @ -170,7 +168,6 @@ class Parser: | ||||
|                     sentence_entry["links"][t_id][0],  # pdeprel | ||||
|                     "_",  # fillpred | ||||
|                     "_",  # pred | ||||
|                     apreds_string, | ||||
|                     "\n", | ||||
|             ] | ||||
|             fprd = fillpred(row_list) | ||||
|  | ||||
| @ -1,5 +1,8 @@ | ||||
| [tools] | ||||
| kres_orig = ../data/kres_example | ||||
| kres_tsv = ../data/kres_example_tsv | ||||
| kres_srl = ../data/kres_example_srl | ||||
| kres_json = ../data/kres/example_json | ||||
| kres_orig = /kres_mount/kres_parsed/tei | ||||
| kres_tsv = ../data/kres_out/1_tsv | ||||
| kres_srl = ../data/kres_out/2_srl | ||||
| kres_json = ../data/kres_out/final_json | ||||
| logfile = ../progress.log | ||||
| cpu_cores = 5 | ||||
| debug = False | ||||
|  | ||||
							
								
								
									
										8
									
								
								tools/tools.cfg.local
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										8
									
								
								tools/tools.cfg.local
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,8 @@ | ||||
| [tools] | ||||
| kres_orig = ../data/kres_example | ||||
| kres_tsv = ../data/kres_out/1_tsv | ||||
| kres_srl = ../data/kres_out/2_srl | ||||
| kres_json = ../data/kres_out/final_json | ||||
| logfile = ../progress.log | ||||
| cpu_cores = 1 | ||||
| debug = False | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user