parser.py can read kres and/or ssj500k

This commit is contained in:
2019-02-03 22:54:26 +01:00
parent 648f4e53d2
commit d1ba56be37
14 changed files with 437 additions and 82 deletions
Binary file not shown.
View File
Binary file not shown.
Binary file not shown.
Binary file not shown.
+70
View File
@@ -0,0 +1,70 @@
import io
import random
import xml.etree.ElementTree as ET

# Split a TEI corpus file into train/dev/test portions (roughly 80/10/10,
# decided per <div>). For every portion the tokenised sentences are collected
# in a list (later written as .conllu) and the running text is written to
# train.txt / dev.txt / test.txt, one line per paragraph.

# Fixed seed so the random assignment of <div>s is reproducible.
random.seed(42)

TEI_NS = '{http://www.tei-c.org/ns/1.0}'

tree = ET.parse('../../data/kres_example/F0006347.xml.parsed.xmll')
root = tree.getroot()
# Debug dump of the parsed document. ET.tostring() serialises an Element, so
# it has to be given the root element; the ElementTree wrapper has no .tag
# and cannot be serialised this way.
print(ET.tostring(root))

# Each list holds (sentence_text, tokens) pairs for one split.
train = []
dev = []
test = []

# io.open with an explicit encoding accepts text on both Python 2 and 3, so
# nothing has to be encoded by hand before writing. The handles stay open for
# the whole script and are closed at its very end.
train_text = io.open('train.txt', 'w', encoding='utf8')
dev_text = io.open('dev.txt', 'w', encoding='utf8')
test_text = io.open('test.txt', 'w', encoding='utf8')


def _token_entry(node):
    """Return the (form, lemma, tag) triple for a token element.

    Elements whose tag ends in 'w' carry an explicit ``lemma`` attribute;
    for everything else the surface form doubles as the lemma. The tag is
    the part of the ``ana`` attribute after the first colon.
    """
    form = node.text
    lemma = node.attrib['lemma'] if node.tag.endswith('w') else form
    return (form, lemma, node.attrib['ana'].split(':')[1])


for doc in root.iter(TEI_NS + 'div'):
    # One draw per <div>: all of its paragraphs land in the same split.
    rand = random.random()
    if rand < 0.8:
        pointer, pointer_text = train, train_text
    elif rand < 0.9:
        pointer, pointer_text = dev, dev_text
    else:
        pointer, pointer_text = test, test_text

    for p in doc.iter(TEI_NS + 'p'):
        for element in p:
            if not element.tag.endswith('s'):
                # Non-sentence children of a paragraph only contribute their
                # text. The u'' prefix coerces Python 2 byte strings.
                pointer_text.write(u'' + (element.text or u''))
                continue

            text = u''
            tokens = []
            for node in element:
                if node.tag[-3:] == 'seg':
                    # A <seg> groups several elements; every child counts.
                    members = list(node)
                elif node.tag[-2:] in ('pc', '}w', '}c'):
                    members = [node]
                else:
                    continue
                for member in members:
                    # Empty elements have text None; treat that as "".
                    text += member.text or u''
                    # '}c' elements add to the text but are not tokens.
                    if not member.tag.endswith('}c'):
                        tokens.append(_token_entry(member))
            pointer.append((text, tokens))
            pointer_text.write(text)
        # One output line per paragraph.
        pointer_text.write(u'\n')
def write_list(lst, fname):
    """Write sentences to *fname* in a ten-column, tab-separated layout.

    Args:
        lst: iterable of ``(text, tokens)`` pairs, where ``tokens`` is a
            sequence of ``(form, lemma, tag)`` triples.
        fname: path of the file to create or overwrite (UTF-8).

    Every sentence is written as a ``# text = ...`` comment line, then one
    row per token (1-based index, form, lemma, ``_``, tag, five ``_``
    placeholders), then a blank line. A ``None`` field is written as ``_``.
    """
    import io  # local import keeps this function self-contained

    # io.open with an encoding takes text on Python 2 and 3 alike, so the
    # strings are never mixed with hand-encoded bytes; the with-block closes
    # the file even if a malformed token raises half-way through.
    with io.open(fname, 'w', encoding='utf8') as f:
        for text, tokens in lst:
            f.write(u'# text = ' + text + u'\n')
            for idx, token in enumerate(tokens, 1):
                columns = [
                    str(idx), token[0], token[1], u'_', token[2],
                    u'_', u'_', u'_', u'_', u'_',
                ]
                f.write(u'\t'.join(
                    u'_' if column is None else column for column in columns
                ) + u'\n')
            f.write(u'\n')
# Emit one .conllu file per split, then release the plain-text handles.
for split_sentences, conllu_name in (
        (train, 'train.conllu'),
        (dev, 'dev.conllu'),
        (test, 'test.conllu'),
):
    write_list(split_sentences, conllu_name)
for text_handle in (train_text, dev_text, test_text):
    text_handle.close()
+144
View File
@@ -0,0 +1,144 @@
#!/usr/bin/python3
from __future__ import print_function, unicode_literals, division
import sys
import os
import re
import pickle
from pathlib import Path
try:
from lxml import etree as ElementTree
except ImportError:
import xml.etree.ElementTree as ElementTree
# Attribute names of TEI elements.
# LEMMA_ATTR is read in Sentence.handle_word; ID_ATTR and ANA_ATTR are not
# referenced anywhere in the visible code.
ID_ATTR = "id"
LEMMA_ATTR = "lemma"
ANA_ATTR = "ana"
# Element tag names. convert_file() strips the default namespace before
# parsing, so tags are compared as bare names.
# SENTENCE_TAG is used by convert_file(); WORD_TAG, C_TAG, S_TAG and SEG_TAG
# by Sentence.handle_word. BIBL_TAG, PARAGRAPH_TAG and PC_TAG are not
# referenced in the visible code.
SENTENCE_TAG = 's'
BIBL_TAG = 'bibl'
PARAGRAPH_TAG = 'p'
PC_TAG = 'pc'
WORD_TAG = 'w'
C_TAG = 'c'
S_TAG = 'S'
SEG_TAG = 'seg'
class Sentence:
    """One TEI sentence flattened into token rows plus its surface text.

    Attributes:
        id: the sentence id passed in by the caller.
        words: one ``[token_id, token, lemma, xpos]`` list per token, where
            ``token_id`` is ``'F<sentence id>.<1-based token index>'``.
        text: surface text rebuilt from the token texts, with one space
            appended for every whitespace marker element.
    """

    def __init__(self, sentence, s_id):
        """Collect the tokens of *sentence*, an iterable of child elements."""
        self.id = s_id
        self.words = []
        self.text = ""
        for word in sentence:
            self.handle_word(word)

    def handle_word(self, word):
        """Add one child element of the sentence to ``words`` and ``text``.

        Whitespace markers append a space, ``seg`` elements are expanded
        recursively, word and ``c`` elements become token rows, and every
        other tag is ignored.
        """
        # The whitespace marker is expected to be empty; it only adds a space.
        if word.tag == S_TAG:
            assert(word.text is None)
            self.text += ' '
            return

        # ASK am I handling this correctly?
        # A segment is treated as transparent: its children are processed
        # as if they were direct children of the sentence.
        if word.tag == SEG_TAG:
            for segword in word:
                self.handle_word(segword)
            return

        # ASK handle unknown tags (are there others?)
        if word.tag not in (WORD_TAG, C_TAG):
            return

        # ID: 1-based position among the tokens collected so far.
        idx = str(len(self.words) + 1)

        # TOKEN: None for an empty element.
        token = word.text

        # LEMMA: words must carry one; for 'c' elements the token is reused.
        if word.tag == WORD_TAG:
            lemma = word.get(LEMMA_ATTR)
            assert(lemma is not None)
        else:
            lemma = token

        # XPOS: 'c' elements are always tagged "Z"; two specific msd values
        # are collapsed to "N" (the reason is not visible in this file).
        xpos = word.get('msd')
        if word.tag == C_TAG:
            xpos = "Z"
        elif xpos in ("Gp-ppdzn", "Gp-spmzd"):
            xpos = "N"
        elif xpos is None:
            # A word without msd: print the sentence id (presumably so the
            # sentence can be located); the row keeps xpos None.
            print(self.id)

        # save word entry
        self.words.append(['F{}.{}'.format(self.id, idx), token, lemma, xpos])

        # save for text; an empty element has text None, which to_conllu()
        # renders as '_' but which cannot be concatenated to a string.
        self.text += token or ""

    def to_conllu(self):
        """Return one tab-separated line per token; None fields become '_'."""
        # The '# sent_id' / '# text' header lines are intentionally not
        # emitted; only the token rows are returned.
        return [
            '\t'.join('_' if data is None else data for data in word)
            for word in self.words
        ]
def convert_file(in_file, out_file):
    """Convert one TEI XML file into a tab-separated token file.

    Args:
        in_file: path of the TEI XML input (read as UTF-8).
        out_file: path of the text file to write (UTF-8).

    Every ``s`` element directly under a ``body/p`` paragraph becomes a
    ``Sentence`` whose id is ``'<paragraph no>.<sentence no>'`` (both
    1-based). Its token rows are written one per line, followed by an empty
    line after each sentence.

    Raises:
        RuntimeError: if the document produces no output lines at all.
    """
    import io  # io.open takes an explicit encoding on Python 2 and 3 alike

    print("Nalaganje xml: {}".format(in_file))
    with io.open(str(in_file), 'r', encoding='utf-8') as fp:
        xmlstring = fp.read()
    # Drop the default namespace and the 'xml:' attribute prefix so that
    # tags and attributes can be addressed by their bare names.
    xmlstring = re.sub(' xmlns="[^"]+"', '', xmlstring, count=1)
    xmlstring = xmlstring.replace(' xml:', ' ')
    # Parse from bytes: lxml rejects unicode strings that still carry an
    # XML encoding declaration, and xml.etree accepts bytes just as well.
    xml_tree = ElementTree.XML(xmlstring.encode('utf-8'))

    print("Pretvarjanje TEI -> TSV-U ...")
    lines = []
    for pidx, paragraph in enumerate(xml_tree.iterfind('.//body/p'), 1):
        # Only sentence elements are numbered; other children are skipped.
        sentences = (node for node in paragraph if node.tag == SENTENCE_TAG)
        for sidx, node in enumerate(sentences, 1):
            parsed = Sentence(node, "{}.{}".format(pidx, sidx))
            lines.extend(parsed.to_conllu())
            lines.append('')  # ASK newline between sentences

    if not lines:
        raise RuntimeError("Nobenih stavkov najdenih")

    print("Zapisovanje izhodne datoteke: {}".format(out_file))
    with io.open(out_file, 'w', encoding='utf-8') as fp:
        fp.writelines(line + u'\n' for line in lines)
if __name__ == "__main__":
    # Usage: <script> IN_FOLDER OUT_FOLDER [NUM_PROCESSES]
    #
    # Input: folder of TEI files, msds are encoded as msd="Z"
    # Output: just a folder; one <name>.txt is written per <name>.xml found
    # (recursively) under IN_FOLDER.
    if len(sys.argv) < 3:
        sys.exit("Usage: {} IN_FOLDER OUT_FOLDER [NUM_PROCESSES]".format(sys.argv[0]))

    in_folder = sys.argv[1]
    out_folder = sys.argv[2]
    # Still accepted (and validated) for command-line compatibility, but the
    # files below are converted sequentially and the value is not used.
    num_processes = int(sys.argv[3]) if len(sys.argv) > 3 else 1

    # Create the output folder up front instead of failing on the first write.
    Path(out_folder).mkdir(parents=True, exist_ok=True)

    for filename in Path(in_folder).rglob("*.xml"):
        out_file = str(Path(out_folder) / (filename.stem + ".txt"))
        convert_file(filename, out_file)
+86
View File
@@ -0,0 +1,86 @@
from lxml import etree
import re
# Tag groups used by parse_tei() to classify the elements inside a sentence.
W_TAGS = ['w']  # recorded as "w" tokens (id, text, lemma, msd)
C_TAGS = ['c']  # recorded as "c" tokens (id, text)
S_TAGS = ['S', 'pc']  # each occurrence only adds one space to the sentence text
def parse_tei(filepath):
    """Read a TEI XML file and index its sentences by id.

    Returns a dictionary of the form::

        {<sentence_id>: {
            "sid": <sentence_id>,   # serves as index in MongoDB
            "text": <sentence text>,
            "tokens": [<token tuple>, ...],
        }}

    ``sentence_id`` is ``"<div id>.<paragraph no>.<sentence no>"``, where the
    two numbers are the last dot-separated parts of the ``p`` and ``s`` ids.
    Word tokens are ``("w", id, text, lemma, msd)`` tuples and ``c`` tokens
    are ``("c", id, text)`` tuples.

    The corpus flavour is guessed from the root element: a root with an
    ``id`` attribute is treated as a Kres file (one document per file),
    anything else as SSJ, where the documents are ``div`` elements.

    Raises:
        KeyError: if two sentences end up with the same id.
    """
    # Read as UTF-8 explicitly: the text is re-encoded as UTF-8 below, so
    # decoding with a platform-dependent default could corrupt it.
    with open(filepath, "r", encoding="utf-8") as fp:
        xmlstr = fp.read()

    # remove namespaces
    xmlstr = re.sub('\\sxmlns="[^"]+"', '', xmlstr, count=1)
    xmlstr = xmlstr.replace(' xml:', ' ')
    root = etree.XML(xmlstr.encode("utf-8"))

    # in ssj, there are divs, in Kres, there are separate files
    if "id" in root.keys():
        # Kres files start with <TEI id=...>
        guess_corpus = "KRES"
        divs = [root]
    else:
        guess_corpus = "SSJ"
        divs = root.findall(".//div")

    res_dict = {}
    for div in divs:
        f_id = div.get("id")

        for p in div.findall(".//p"):
            p_id = p.get("id").split(".")[-1]

            for s in p.findall(".//s"):
                s_id = s.get("id").split(".")[-1]
                text_parts = []
                sentence_tokens = []

                for el in s.iter():
                    if el.tag in W_TAGS:
                        el_id = el.get("id").split(".")[-1]
                        if el_id.startswith("t"):
                            el_id = el_id[1:]  # ssj W_TAG ids start with t
                        # Kres stores the tag in "msd"; for SSJ it is the
                        # last colon-separated part of "ana".
                        if guess_corpus == "KRES":
                            msd = el.get("msd")
                        else:
                            msd = el.get("ana").split(":")[-1]
                        # An empty element has text None; it adds nothing to
                        # the sentence text but is still kept as a token.
                        text_parts.append(el.text or "")
                        sentence_tokens.append(
                            ("w", el_id, el.text, el.get("lemma"), msd)
                        )
                    elif el.tag in C_TAGS:
                        # only Kres' C_TAGS have ids
                        el_id = (el.get("id") or "none").split(".")[-1]
                        text_parts.append(el.text or "")
                        sentence_tokens.append(("c", el_id, el.text))
                    elif el.tag in S_TAGS:
                        # Kres' <S /> doesn't contain .text
                        # NOTE(review): S_TAGS also lists 'pc', so a <pc>
                        # element contributes only a space and no token --
                        # confirm this is what is wanted for SSJ.
                        text_parts.append(" ")
                    # Everything else (the <s> itself, links, linkGroups)
                    # is skipped.

                sentence_id = "{}.{}.{}".format(f_id, p_id, s_id)
                if sentence_id in res_dict:
                    raise KeyError("duplicated id: {}".format(sentence_id))
                res_dict[sentence_id] = {
                    "sid": sentence_id,
                    "text": "".join(text_parts),
                    "tokens": sentence_tokens
                }
    return res_dict
Binary file not shown.