parser.py can read kres and/or ssj500k

This commit is contained in:
2019-02-03 22:54:26 +01:00
parent 648f4e53d2
commit d1ba56be37
14 changed files with 437 additions and 82 deletions

BIN
tools/parser/Parser.pyc Normal file

Binary file not shown.

0
tools/parser/__init__.py Normal file
View File

BIN
tools/parser/__init__.pyc Normal file

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,70 @@
# NOTE(review): recovered from a flattened diff -- the nesting below is
# reconstructed from the statement order; confirm against the original file.
# Splits one parsed KRES TEI example into train/dev/test (80/10/10) portions,
# writing raw sentence text (*.txt here) and collecting (text, tokens) pairs
# that write_list() later dumps as simplified CoNLL-U.
import xml.etree.ElementTree as ET
import random

# Fixed seed so the random split is reproducible across runs.
random.seed(42)
# TODO confirm the double extension '.xml.parsed.xmll' is intentional
# and not a typo for '.xml.parsed.xml'.
tree=ET.parse('../../data/kres_example/F0006347.xml.parsed.xmll')
print(ET.tostring(tree))
root=tree.getroot()

# Accumulated (sentence_text, token_triples) pairs per split.
train=[]
dev=[]
test=[]
# Raw-text output streams per split. Opened in text mode but written with
# pre-encoded bytes below, so this script is Python-2-only as written.
train_text=open('train.txt','w')
dev_text=open('dev.txt','w')
test_text=open('test.txt','w')

for doc in root.iter('{http://www.tei-c.org/ns/1.0}div'):
    # One draw per <div>: a whole document lands in exactly one split.
    rand=random.random()
    if rand<0.8:
        pointer=train
        pointer_text=train_text
    elif rand<0.9:
        pointer=dev
        pointer_text=dev_text
    else:
        pointer=test
        pointer_text=test_text
    for p in doc.iter('{http://www.tei-c.org/ns/1.0}p'):
        for element in p:
            # Namespaced tags end in '}s'; endswith('s') also matches any
            # other tag ending in 's' -- NOTE(review): possibly too loose.
            if element.tag.endswith('s'):
                sentence=element
                text=''
                tokens=[]
                # NOTE(review): this inner loop reuses the name 'element',
                # shadowing the outer loop variable.
                for element in sentence:
                    if element.tag[-3:]=='seg':
                        # Multiword segment: descend one level to its parts.
                        for subelement in element:
                            text+=subelement.text
                            if not subelement.tag.endswith('}c'):
                                if subelement.tag.endswith('w'):
                                    lemma=subelement.attrib['lemma']
                                else:
                                    lemma=subelement.text
                                # 'ana' looks like 'prefix:MSD'; keep the MSD part.
                                tokens.append((subelement.text,lemma,subelement.attrib['ana'].split(':')[1]))
                    # Only <pc>, <w> and <c> children carry text we keep.
                    if element.tag[-2:] not in ('pc','}w','}c'):
                        continue
                    text+=element.text
                    if not element.tag.endswith('}c'):
                        if element.tag.endswith('w'):
                            lemma=element.attrib['lemma']
                        else:
                            lemma=element.text
                        tokens.append((element.text,lemma,element.attrib['ana'].split(':')[1]))
                pointer.append((text,tokens))
                pointer_text.write(text.encode('utf8'))
            else:
                # Non-sentence child: dump its raw text as-is.
                pointer_text.write(element.text.encode('utf8'))
            pointer_text.write('\n')
        #pointer_text.write('\n')
def write_list(lst, fname):
    """Write sentences to *fname* in a simplified 10-column CoNLL-U format.

    Parameters:
        lst: list of (text, tokens) pairs, where tokens is a list of
            (form, lemma, xpos) string triples.
    fname: output file path; the file is (over)written as UTF-8.

    Fixes over the original: the old body concatenated str with the bytes
    returned by .encode('utf8'), which raises TypeError on Python 3, and it
    leaked the file handle on error. Output bytes are identical.
    """
    with open(fname, 'w', encoding='utf8') as f:
        for text, tokens in lst:
            f.write('# text = ' + text + '\n')
            # CoNLL-U rows: ID, FORM, LEMMA, UPOS(_), XPOS, then 5 blanks.
            for idx, (form, lemma, xpos) in enumerate(tokens, start=1):
                f.write('\t'.join((str(idx), form, lemma, '_', xpos,
                                   '_', '_', '_', '_', '_')) + '\n')
            # Blank line terminates each sentence block.
            f.write('\n')
# Dump each split to its CoNLL-U file and close the raw-text streams
# opened at the top of the script.
write_list(train,'train.conllu')
write_list(dev,'dev.conllu')
write_list(test,'test.conllu')
train_text.close()
dev_text.close()
test_text.close()

144
tools/parser/ozbolt.py Executable file
View File

@@ -0,0 +1,144 @@
#!/usr/bin/python3
from __future__ import print_function, unicode_literals, division
import sys
import os
import re
import pickle
from pathlib import Path
try:
from lxml import etree as ElementTree
except ImportError:
import xml.etree.ElementTree as ElementTree
# Attribute names read from TEI elements.
ID_ATTR = "id"
LEMMA_ATTR = "lemma"
ANA_ATTR = "ana"
# TEI tag names; convert_file() strips the XML namespace first, so these
# match the bare local names.
SENTENCE_TAG = 's'
BIBL_TAG = 'bibl'
PARAGRAPH_TAG = 'p'
PC_TAG = 'pc'
WORD_TAG = 'w'
C_TAG = 'c'
S_TAG = 'S'
SEG_TAG = 'seg'
class Sentence:
    """One TEI <s> element flattened into token rows plus its surface text.

    Each row in ``words`` is [id, form, lemma, xpos]; ``text`` accumulates
    the sentence's surface string (spaces come from <S/> markers).
    """

    def __init__(self, sentence, s_id):
        self.id = s_id
        self.words = []
        self.text = ""
        for child in sentence:
            self.handle_word(child)

    def handle_word(self, word):
        """Consume one TEI child element, recursing into <seg> groups."""
        tag = word.tag
        if tag == S_TAG:
            # Whitespace marker: contributes a space, never a token row.
            assert(word.text is None)
            self.text += ' '
            return
        if tag == SEG_TAG:
            # Segment wrapper: process its parts as if they were siblings.
            for part in word:
                self.handle_word(part)
            return
        if tag not in (WORD_TAG, C_TAG):
            # Anything else is silently skipped.
            return
        # 1-based position within the sentence, stored as a string.
        row_id = str(len(self.words) + 1)
        form = word.text
        if tag == WORD_TAG:
            base = word.get(LEMMA_ATTR)
            assert(base is not None)
        else:
            # Punctuation/characters lemmatize to themselves.
            base = form
        pos = word.get('msd')
        if tag == C_TAG:
            pos = "Z"
        elif pos in ("Gp-ppdzn", "Gp-spmzd"):
            pos = "N"
        elif pos is None:
            # Flag sentences with a missing msd so they can be inspected.
            print(self.id)
        self.words.append(['F{}.{}'.format(self.id, row_id), form, base, pos])
        self.text += word.text

    def to_conllu(self):
        """Return the token rows as tab-joined lines; None cells become '_'."""
        return ['\t'.join('_' if cell is None else cell for cell in row)
                for row in self.words]
def convert_file(in_file, out_file):
    """Convert one TEI XML file into TSV-U lines written to *out_file*.

    Raises RuntimeError when the input yields no sentences at all.
    """
    print("Nalaganje xml: {}".format(in_file))
    with open(str(in_file), 'r') as fp:
        raw = fp.read()
    # Strip the default namespace and 'xml:' prefixes so bare tag names match.
    raw = re.sub(' xmlns="[^"]+"', '', raw, count=1)
    raw = raw.replace(' xml:', ' ')
    tree = ElementTree.XML(raw)

    print("Pretvarjanje TEI -> TSV-U ...")
    out_lines = []
    for p_index, paragraph in enumerate(tree.iterfind('.//body/p')):
        s_index = 1
        for node in paragraph:
            if node.tag != SENTENCE_TAG:
                continue
            # Sentence ids are "<paragraph>.<sentence>", both 1-based.
            parsed = Sentence(node, "{}.{}".format(p_index + 1, s_index))
            out_lines.extend(parsed.to_conllu())
            # Blank line separates sentences in the output.
            out_lines.append('')
            s_index += 1
    if not out_lines:
        raise RuntimeError("Nobenih stavkov najdenih")

    print("Zapisovanje izhodne datoteke: {}".format(out_file))
    with open(out_file, 'w') as fp:
        for line in out_lines:
            # Python 2 needs explicit encoding before print-to-file.
            if sys.version_info < (3, 0):
                line = line.encode('utf-8')
            print(line, file=fp)
if __name__ == "__main__":
    """
    Input: folder of TEI files, msds are encoded as msd="Z"
    Output: just a folder
    """
    in_folder = sys.argv[1]
    out_folder = sys.argv[2]
    # num_processes was parsed but never used; accept it optionally so
    # existing three-argument invocations keep working.
    num_processes = int(sys.argv[3]) if len(sys.argv) > 3 else 1

    # Convert every *.xml under in_folder; foo.xml -> out_folder/foo.txt.
    for filename in Path(in_folder).rglob("*.xml"):
        out_file = out_folder + "/" + filename.name[:-4] + ".txt"
        convert_file(filename, out_file)

86
tools/parser/parser.py Normal file
View File

@@ -0,0 +1,86 @@
from lxml import etree
import re
# Tag-name buckets matched by parse_tei (namespaces are stripped first).
W_TAGS = ['w']        # word tokens
C_TAGS = ['c']        # character/punctuation tokens
S_TAGS = ['S', 'pc']  # elements that only contribute a space to the text
# reads a TEI xml file and returns a dictionary:
# { <sentence_id>: {
# sid: <sentence_id>, # serves as index in MongoDB
# text: ,
# tokens: ,
# }}
def parse_tei(filepath):
    """Read a TEI XML file (SSJ or KRES flavour) into a dict keyed by
    sentence id, each value being {"sid", "text", "tokens"}.

    Raises KeyError when the same sentence id appears twice.
    """
    results = {}
    with open(filepath, "r") as handle:
        markup = handle.read()
    # Drop the default namespace and 'xml:' prefixes so bare tag names match.
    markup = re.sub('\\sxmlns="[^"]+"', '', markup, count=1)
    markup = re.sub(' xml:', ' ', markup)
    root = etree.XML(markup.encode("utf-8"))

    if "id" in root.keys():
        # Kres files start with <TEI id=...>; each file is its own division.
        corpus = "KRES"
        divisions = [root]
    else:
        # In ssj a single file contains many <div> elements.
        corpus = "SSJ"
        divisions = root.findall(".//div")

    for division in divisions:
        f_id = division.get("id")
        for par in division.findall(".//p"):
            p_id = par.get("id").split(".")[-1]
            for sent in par.findall(".//s"):
                s_id = sent.get("id").split(".")[-1]
                text_parts = []
                tokens = []
                for el in sent.iter():
                    if el.tag in W_TAGS:
                        tok_id = el.get("id").split(".")[-1]
                        if tok_id[0] == 't':
                            # ssj word ids carry a leading 't'.
                            tok_id = tok_id[1:]
                        text_parts.append(el.text)
                        # KRES stores the tag in msd=; SSJ in ana="...:TAG".
                        msd = (el.get("msd") if corpus == "KRES"
                               else el.get("ana").split(":")[-1])
                        tokens.append(("w", tok_id, el.text, el.get("lemma"), msd))
                    elif el.tag in C_TAGS:
                        # Only Kres' C_TAGS have ids.
                        tok_id = (el.get("id") or "none").split(".")[-1]
                        text_parts.append(el.text)
                        tokens.append(("c", tok_id, el.text,))
                    elif el.tag in S_TAGS:
                        # Kres' <S /> carries no .text; it just means a space.
                        text_parts.append(" ")
                    # links and linkGroups are intentionally ignored.

                sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
                if sentence_id in results:
                    raise KeyError("duplicated id: {}".format(sentence_id))
                results[sentence_id] = {
                    "sid": sentence_id,
                    "text": "".join(text_parts),
                    "tokens": tokens,
                }
    return results

BIN
tools/parser/parser.pyc Normal file

Binary file not shown.