43 lines
1.0 KiB
Python
43 lines
1.0 KiB
Python
|
# import xml.etree.ElementTree as ElementTree
|
||
|
import lxml.etree as ElementTree
|
||
|
import sys
|
||
|
import re
|
||
|
|
||
|
# Matches an XML declaration (``<?xml ... ?>``) at the very start of the text.
XML_DECLARATION_RE = re.compile(r'\A\s*<\?xml\s[^?]*\?>')


def clean_xml_text(xmlstring):
    """Return *xmlstring* prepared for namespace-free parsing.

    Three textual rewrites are applied, in this order:

    1. A leading XML declaration is dropped if (and only if) one is present.
       NOTE(review): lxml refuses to parse ``str`` input that carries an
       encoding declaration, which is presumably why it is stripped.
    2. The first ``xmlns="..."`` default-namespace attribute is removed so
       that elements can be addressed by their bare tag names.
    3. Every `` xml:`` is replaced by a single space, turning ``xml:id`` into
       ``id``.  This is a plain text substitution: it also rewrites other
       ``xml:`` attributes (e.g. ``xml:lang``) and any character data that
       happens to contain `` xml:``.
    """
    xmlstring = XML_DECLARATION_RE.sub('', xmlstring, count=1)
    xmlstring = re.sub(' xmlns="[^"]+"', '', xmlstring, count=1)
    return xmlstring.replace(' xml:', ' ')


def parse_corpus(xmlstring):
    """Parse raw document text into an element tree root (see clean_xml_text)."""
    return ElementTree.XML(clean_xml_text(xmlstring))


def load_tree(path):
    """Read the XML file at *path* and return its parsed root element.

    The file is decoded as UTF-8; a leading byte-order mark is tolerated.
    """
    with open(path, 'r', encoding='utf-8-sig') as fp:
        return parse_corpus(fp.read())


def collect_sentence_ids(root, lemmas):
    """Return the ids of all sentences containing a word with a wanted lemma.

    A sentence id is derived from a ``<w>`` element's ``id`` by dropping the
    last dot-separated component (``a.b.s1.w3`` -> ``a.b.s1``).  Words that
    have no ``lemma`` or no ``id`` attribute are ignored.
    """
    wanted = set(lemmas)
    good_s = set()
    for word in root.iterfind('.//w'):
        if word.get('lemma') not in wanted:
            continue
        word_id = word.get('id')
        if word_id is not None:
            good_s.add(word_id.rpartition('.')[0])
    return good_s


def prune_tree(root, good_s):
    """Remove, in place, everything that holds no sentence listed in *good_s*.

    For every ``<div>`` below *root*: ``<s>`` children of its direct ``<p>``
    children are removed unless their ``id`` is in *good_s*; a ``<p>`` left
    without a kept sentence is removed; and a ``<div>`` left without a kept
    paragraph is removed too, unless a nested ``<div>`` survived inside it.
    """
    # Remember each div together with its real parent: divs are found at any
    # depth, so they generally cannot be removed from ``root`` directly.
    div_links = [
        (parent, child)
        for parent in root.iter()
        for child in parent
        if child.tag == 'div'
    ]

    # Innermost divs first, so that an enclosing div is judged by what is
    # still left inside it after its nested divs have been pruned.
    for parent, div in reversed(div_links):
        keep_div = False

        # findall() returns lists: never remove children from an element
        # while a live iterator is still walking over them.
        for par in div.findall('./p'):
            keep_par = False
            for s in par.findall('./s'):
                if s.get('id') in good_s:
                    keep_par = True
                else:
                    par.remove(s)

            if keep_par:
                keep_div = True
            else:
                div.remove(par)

        if not keep_div and div.find('.//div') is None:
            parent.remove(div)


def main(argv=None):
    """Filter an XML corpus down to the sentences containing given lemmas.

    Command line: ``FILEIN FILEOUT [LEMMA ...]``.  *argv* defaults to
    ``sys.argv[1:]``.  Exits with a usage message when the two file
    arguments are missing.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        sys.exit('usage: {} FILEIN FILEOUT [LEMMA ...]'.format(sys.argv[0]))

    filein, fileout = args[0], args[1]
    lemmas = args[2:]

    root = load_tree(filein)
    prune_tree(root, collect_sentence_ids(root, lemmas))
    ElementTree.ElementTree(root).write(fileout, encoding='utf-8')


if __name__ == '__main__':
    main()
|