# (File-viewer metadata from the original listing: Python, 43 lines, 1.0 KiB.)
"""Keep only the sentences of an XML document that contain given lemmas.

Usage:
    script INPUT.xml OUTPUT.xml [LEMMA ...]

The input is read as text, its default namespace declaration and ``xml:``
attribute prefixes are stripped, and the tree is then pruned:

* every ``<w>`` whose ``lemma`` attribute is one of the requested lemmas marks
  its sentence as wanted (the sentence id is the word ``id`` without its last
  dot-separated component);
* ``<s>`` children of a ``<p>`` that are not wanted are removed;
* ``<p>`` children of a ``<div>`` left without any wanted sentence are removed;
* ``<div>`` elements left without any kept direct ``<p>`` child are removed.

The pruned tree is written to OUTPUT.xml encoded as UTF-8.
"""
import re
import sys

# NOTE: lxml is required here; the pruning step relies on getparent(), which
# the standard-library xml.etree.ElementTree does not provide.
import lxml.etree as ElementTree

if len(sys.argv) < 3:
    sys.exit("usage: %s INPUT.xml OUTPUT.xml [LEMMA ...]" % sys.argv[0])

filein = sys.argv[1]
fileout = sys.argv[2]

# A set makes the per-word membership test below O(1).
lemmas = set(sys.argv[3:])

# 'utf-8-sig' decodes UTF-8 and transparently drops a leading BOM if present.
with open(filein, 'r', encoding='utf-8-sig') as fp:
    xmlstring = fp.read()

# lxml rejects str input that carries an encoding declaration, so drop the
# XML declaration when there is one (instead of blindly slicing off a fixed
# number of characters, which corrupts files with a different or no prolog).
xmlstring = re.sub(r'^\s*<\?xml\s[^>]*\?>', '', xmlstring, count=1)
# Drop the first default-namespace declaration so plain tag names match.
xmlstring = re.sub(' xmlns="[^"]+"', '', xmlstring, count=1)
# Turn attributes such as xml:id into plain id.
xmlstring = xmlstring.replace(' xml:', ' ')
root = ElementTree.XML(xmlstring)

# Ids of the sentences that contain at least one requested lemma.
good_s = set()
for word in root.iterfind('.//w'):
    word_id = word.get('id')
    if word_id is not None and word.get('lemma') in lemmas:
        # Sentence id = word id minus its last dot-separated component.
        good_s.add(word_id.rpartition('.')[0])

# findall() returns lists, so elements can be removed safely while looping.
for div in root.findall('.//div'):
    good_d = False

    for par in div.findall('./p'):
        good_p = False
        for s in par.findall('./s'):
            if s.get('id') in good_s:
                good_p = True
            else:
                par.remove(s)

        if good_p:
            good_d = True
        else:
            div.remove(par)

    if not good_d:
        # A <div> matched by './/div' need not be a direct child of the root,
        # so detach it from its actual parent.
        parent = div.getparent()
        if parent is not None:
            parent.remove(div)

ElementTree.ElementTree(root).write(fileout, encoding='utf-8')