# Provenance: recovered from a git patch that creates slim_sskj.py
#   commit:  ed27e549b73b7befc0d3f532ad20fff9fffc6638
#   author:  Ozbolt Menegatti
#   date:    Mon, 3 Jun 2019 09:37:48 +0200
#   subject: [PATCH] Adding slimming script
"""Slim an XML file down to the sentences that contain given lemmas.

Usage:
    python slim_sskj.py FILEIN FILEOUT [LEMMA ...]

The input is parsed after its default namespace declaration and every
`` xml:`` attribute prefix have been stripped, so elements and attributes
can be addressed by plain names (``w``, ``s``, ``p``, ``div``, ``id``).

A ``<w>`` whose ``lemma`` attribute is one of the requested lemmas marks a
sentence as "good"; the sentence id is the word id with its last
dot-separated component dropped.  Every ``<s>`` that is not good is removed,
then every ``<p>`` left without a good ``<s>``, then every ``<div>`` left
without a good ``<p>``.  The remaining tree is written to FILEOUT as UTF-8.
"""
import re
import sys

# lxml (not the stdlib xml.etree.ElementTree) is required: the pruning step
# relies on the lxml-only Element.getparent().
import lxml.etree as ElementTree

# Leading XML declaration, e.g. <?xml version="1.0" encoding="UTF-8"?>.
_XML_DECLARATION = re.compile(r'\A\s*<\?xml[^>]*\?>')


def _load_root(path):
    """Read *path*, strip namespace noise and return the parsed root element.

    NOTE(review): the original code sliced the text with ``[len(''):]``,
    which is a no-op; it presumably used to skip the XML declaration.  The
    declaration is stripped explicitly here because lxml refuses to parse a
    ``str`` that still carries an encoding declaration.
    """
    # NOTE(review): input is assumed to be UTF-8, matching the encoding the
    # script uses for its output -- confirm against the real corpus.
    with open(path, 'r', encoding='utf-8') as fp:
        xmlstring = fp.read()

    xmlstring = _XML_DECLARATION.sub('', xmlstring, count=1)
    # Drop the first default-namespace declaration so tags match bare names.
    xmlstring = re.sub(' xmlns="[^"]+"', '', xmlstring, count=1)
    # Turn attributes such as xml:id into plain id.
    xmlstring = xmlstring.replace(' xml:', ' ')
    return ElementTree.XML(xmlstring)


def _matching_sentence_ids(root, lemmas):
    """Return the ids of sentences containing a ``<w>`` with a wanted lemma.

    The sentence id is derived from the word id by dropping everything from
    the last ``.`` onwards.  Words without a ``lemma`` or ``id`` attribute
    are ignored.
    """
    wanted = set(lemmas)  # O(1) membership test per word
    good_s = set()
    for word in root.iterfind('.//w'):
        if word.get('lemma') not in wanted:
            continue
        word_id = word.get('id')
        if word_id is not None:
            good_s.add(word_id.rpartition('.')[0])
    return good_s


def _prune(root, good_s):
    """Remove, in place, every ``<s>``/``<p>``/``<div>`` without a good sentence.

    Only ``<p>`` elements directly under a ``<div>`` and ``<s>`` elements
    directly under a ``<p>`` are inspected.  Progress is printed to stdout.
    """
    # Each level iterates over a snapshot (list) so that removing elements
    # never disturbs the iteration that is currently in progress.
    for div in list(root.iterfind('.//div')):
        print("div ", div.get('id'))
        good_d = False

        for par in list(div.iterfind('./p')):
            good_p = False
            for s in list(par.iterfind('./s')):
                if s.get('id') in good_s:
                    print("keep", s.attrib)
                    good_p = True
                else:
                    par.remove(s)
                    print("remove", s.attrib)

            if good_p:
                good_d = True
            else:
                div.remove(par)

        if not good_d:
            # './/div' matches at any depth, so the div need not be a direct
            # child of root; detach it from its actual parent instead.
            parent = div.getparent()
            if parent is not None:
                parent.remove(div)


def main(argv):
    """Run the slimming script with *argv* laid out like ``sys.argv``."""
    if len(argv) < 3:
        sys.exit("usage: {} FILEIN FILEOUT [LEMMA ...]".format(argv[0]))

    filein = argv[1]
    fileout = argv[2]
    lemmas = argv[3:]

    root = _load_root(filein)
    _prune(root, _matching_sentence_ids(root, lemmas))
    ElementTree.ElementTree(root).write(fileout, encoding='utf-8')


if __name__ == '__main__':
    main(sys.argv)