# luscenje_struktur/slim_sskj.py
#
# (file-viewer metadata at time of capture: 46 lines, 1.2 KiB, Python)

# import xml.etree.ElementTree as ElementTree
import lxml.etree as ElementTree
import sys
import re
def _prepare_xml_text(raw):
    """Return *raw* XML text simplified so it can be queried with bare tag names.

    Three textual rewrites are applied, in this order:

    1. A leading XML declaration (optionally preceded by a BOM/whitespace) is
       dropped.  lxml refuses ``str`` input that still carries an encoding
       declaration, so it has to go before parsing.  Matching it with a regex
       instead of slicing a fixed number of characters keeps files without
       that exact declaration intact.
    2. The first default-namespace declaration (``xmlns="..."``) is removed,
       so elements can be addressed as ``w``, ``s``, ``p``, ``div``.
    3. Every `` xml:`` is replaced by a single space, turning e.g. ``xml:id``
       into ``id``.  NOTE: this is a plain string replace over the whole
       document, so it also rewrites text content containing `` xml:``.
    """
    text = re.sub(r'^\ufeff?\s*<\?xml[^>]*\?>', '', raw, count=1)
    text = re.sub(' xmlns="[^"]+"', '', text, count=1)
    return text.replace(' xml:', ' ')


def _sentence_ids_with_lemmas(root, lemmas):
    """Collect the ids of sentences containing a word whose lemma is in *lemmas*.

    The sentence id is derived from the word id by dropping its last
    dot-separated component (``a.b.c.4`` -> ``a.b.c``).  Words without a
    ``lemma`` or ``id`` attribute are ignored.
    """
    wanted = set(lemmas)  # O(1) membership test per word
    sentence_ids = set()
    for word in root.iterfind('.//w'):
        if word.get('lemma') not in wanted:
            continue
        word_id = word.get('id')
        if word_id is not None:
            sentence_ids.add(word_id.rpartition('.')[0])
    return sentence_ids


def _prune(root, keep_ids):
    """Remove, in place, every sentence not in *keep_ids* and emptied containers.

    A ``<p>`` is kept only if at least one of its direct ``<s>`` children is
    kept; a ``<div>`` is kept only if at least one of its direct ``<p>``
    children is kept.  Progress is printed for every div and sentence.

    NOTE(review): a div that holds only nested divs (no direct ``<p>``) is
    removed together with everything inside it, as in the original logic.
    Confirm this is intended if the input can contain nested divs.
    """
    # Divs are located at any depth, so each one must be removed from its
    # actual parent rather than from the root.  The map is built up front and
    # only stores divs; it works with both lxml and xml.etree.ElementTree.
    parent_of = {
        child: parent
        for parent in root.iter()
        for child in parent
        if child.tag == 'div'
    }

    # Every level is snapshotted with list() before elements are removed:
    # mutating the tree while a lazy iterfind() is running can skip elements
    # or end the traversal early.
    for div in list(root.iterfind('.//div')):
        print("div ", div.get('id'))
        keep_div = False
        for par in list(div.iterfind('./p')):
            keep_par = False
            for s in list(par.iterfind('./s')):
                if s.get('id') in keep_ids:
                    print("keep", s.attrib)
                    keep_par = True
                else:
                    par.remove(s)
                    print("remove", s.attrib)
            if keep_par:
                keep_div = True
            else:
                div.remove(par)
        if not keep_div:
            parent_of[div].remove(div)


def slim(filein, fileout, lemmas):
    """Write to *fileout* a copy of *filein* reduced to sentences using *lemmas*.

    Parameters:
        filein:  path of the UTF-8 encoded source XML file.
        fileout: path of the XML file to write (UTF-8).
        lemmas:  iterable of lemma strings; a sentence is kept when any of
                 its ``<w>`` elements has one of them as ``lemma`` attribute.

    The output has the default namespace and ``xml:`` attribute prefixes
    stripped (see ``_prepare_xml_text``).
    """
    with open(filein, 'r', encoding='utf-8') as fp:
        xmlstring = _prepare_xml_text(fp.read())

    root = ElementTree.XML(xmlstring)
    _prune(root, _sentence_ids_with_lemmas(root, lemmas))
    ElementTree.ElementTree(root).write(fileout, encoding='utf-8')


def main(argv=None):
    """Command-line entry point: ``FILEIN FILEOUT [LEMMA ...]``."""
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        sys.exit("usage: {} FILEIN FILEOUT [LEMMA ...]".format(sys.argv[0]))
    slim(args[0], args[1], args[2:])


if __name__ == "__main__":
    main()