"""Prune an XML corpus down to the sentences that contain given lemmas.

Usage: script FILEIN FILEOUT [LEMMA ...]

Every ``div/p/s`` sentence that does not contain a ``w`` element whose
``lemma`` attribute is one of the requested lemmas is removed; paragraphs and
divs left without any kept content are removed as well.  The pruned tree is
written to FILEOUT as UTF-8.
"""
import re
import sys

try:
    import lxml.etree as ElementTree
except ImportError:  # the code below only uses the API both libraries share
    import xml.etree.ElementTree as ElementTree

# Leading XML declaration, e.g. <?xml version="1.0" encoding="UTF-8"?>.
_XML_DECLARATION = re.compile(r'^\s*<\?xml\s[^>]*?\?>')


def parse_corpus(xmlstring):
    """Parse corpus text into an element tree with namespaces flattened.

    The XML declaration is dropped (lxml refuses ``str`` input that carries an
    encoding declaration), the first default ``xmlns`` declaration is removed
    so elements can be addressed by bare tag name, and the ``xml:`` attribute
    prefix is stripped so ``xml:id`` becomes a plain ``id`` attribute.

    Returns the root element.
    """
    xmlstring = _XML_DECLARATION.sub('', xmlstring, count=1)
    xmlstring = re.sub(' xmlns="[^"]+"', '', xmlstring, count=1)
    # NOTE: plain textual replacement; it also rewrites ' xml:' in text nodes.
    xmlstring = xmlstring.replace(' xml:', ' ')
    return ElementTree.XML(xmlstring)


def _matching_sentence_ids(root, lemmas):
    """Return the ids of sentences containing a word with a wanted lemma.

    A sentence id is derived from a word id by dropping the last
    dot-separated component.  Words without ``lemma`` or ``id`` are ignored.
    """
    wanted = set(lemmas)
    sentence_ids = set()
    for word in root.iterfind('.//w'):
        if word.get('lemma') not in wanted:
            continue
        word_id = word.get('id')
        if word_id:
            sentence_ids.add(word_id.rpartition('.')[0])
    return sentence_ids


def filter_tree(root, lemmas):
    """Remove, in place, all ``div/p/s`` content unrelated to *lemmas*.

    Sentences whose id is not among the matching sentence ids are removed
    from their paragraph, paragraphs without a kept sentence are removed from
    their div, and divs with neither a kept paragraph nor a surviving nested
    div are removed from their actual parent.
    """
    good_s = _matching_sentence_ids(root, lemmas)

    # Divs are found at any depth, so each must be removed from its real
    # parent rather than from the root.
    div_parent = {
        child: parent
        for parent in root.iter()
        for child in parent
        if child.tag == 'div'
    }

    # Reversed document order visits nested divs before their ancestors, so
    # an outer div can tell whether any inner div survived.  Child lists are
    # materialized first because elements are removed while looping.
    for div in reversed(list(root.iterfind('.//div'))):
        keep_div = div.find('.//div') is not None
        for par in list(div.iterfind('./p')):
            keep_par = False
            for sentence in list(par.iterfind('./s')):
                if sentence.get('id') in good_s:
                    keep_par = True
                else:
                    par.remove(sentence)
            if keep_par:
                keep_div = True
            else:
                div.remove(par)
        if not keep_div:
            div_parent[div].remove(div)


def main(argv=None):
    """Run the filter with *argv* (defaults to ``sys.argv[1:]``)."""
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        sys.exit(f'usage: {sys.argv[0]} FILEIN FILEOUT [LEMMA ...]')
    filein, fileout, lemmas = args[0], args[1], args[2:]

    with open(filein, 'r') as fp:
        root = parse_corpus(fp.read())
    filter_tree(root, lemmas)
    ElementTree.ElementTree(root).write(fileout, encoding='utf-8')


if __name__ == '__main__':
    main()