# Original listing metadata: 43 lines, 1.0 KiB, Python
"""Filter an XML corpus down to the sentences that contain given lemmas.

Usage: <script> FILEIN FILEOUT [LEMMA ...]

The input is expected to be built from ``div`` > ``p`` > ``s`` > ``w``
elements, where every ``w`` carries ``lemma`` and ``id`` attributes and the
id of a word is the id of its sentence plus one final dot-separated
component.  Sentences without a matching word are removed, then paragraphs
left without sentences, then divs left without paragraphs.
"""

import re
import sys

try:
    import lxml.etree as ElementTree
except ImportError:
    # The script only relies on API shared by both libraries, so the
    # standard-library parser is a drop-in fallback when lxml is missing.
    import xml.etree.ElementTree as ElementTree

# Leading XML declaration.  lxml refuses to parse a ``str`` that still
# carries an encoding declaration, so it is stripped after decoding.
_XML_DECLARATION = re.compile(r'^\s*<\?xml[^>]*\?>')


def load_tree(path):
    """Parse *path* and return its root element with namespaces stripped.

    The first default-namespace declaration and every `` xml:`` attribute
    prefix are removed textually, so that elements and attributes can be
    addressed by their bare names (``w``, ``id``, ...).
    """
    # utf-8-sig decodes plain UTF-8 and also swallows an optional BOM.
    with open(path, 'r', encoding='utf-8-sig') as fp:
        xmlstring = fp.read()

    # Only strip the declaration when it is actually there, instead of
    # blindly cutting a fixed number of characters off the document.
    xmlstring = _XML_DECLARATION.sub('', xmlstring, count=1)
    xmlstring = re.sub(' xmlns="[^"]+"', '', xmlstring, count=1)
    xmlstring = xmlstring.replace(' xml:', ' ')
    return ElementTree.XML(xmlstring)


def matching_sentence_ids(root, lemmas):
    """Return the ids of all sentences containing a word whose lemma is in *lemmas*.

    The sentence id is derived from the word id by dropping its last
    dot-separated component.  Words without ``lemma`` or ``id`` are ignored.
    """
    wanted = set(lemmas)
    good_s = set()
    for word in root.iterfind('.//w'):
        if word.get('lemma') not in wanted:
            continue
        word_id = word.get('id')
        if word_id is not None:
            good_s.add(word_id.rpartition('.')[0])
    return good_s


def prune_tree(root, good_s):
    """Remove, in place, every sentence not in *good_s* and emptied containers.

    A paragraph is dropped when none of its direct ``s`` children survive; a
    div is dropped when none of its direct ``p`` children survive.
    """
    # Snapshot (parent, div) pairs before mutating anything: a div must be
    # removed from its real parent (it need not be a child of the root), and
    # removing elements while iterating the live tree can skip siblings.
    div_pairs = [
        (parent, div)
        for parent in root.iter('*')
        for div in parent.findall('div')
    ]

    for parent, div in div_pairs:
        good_d = False

        for par in div.findall('./p'):
            good_p = False
            for s in par.findall('./s'):
                if s.get('id') in good_s:
                    good_p = True
                else:
                    par.remove(s)

            if good_p:
                good_d = True
            else:
                div.remove(par)

        if not good_d:
            parent.remove(div)


def main(argv=None):
    """Run the filter; *argv* defaults to ``sys.argv``.

    ``argv[1]`` is the input file, ``argv[2]`` the output file and every
    further argument is a lemma to keep.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        prog = argv[0] if argv else 'script'
        sys.exit('usage: %s FILEIN FILEOUT [LEMMA ...]' % prog)

    filein = argv[1]
    fileout = argv[2]
    lemmas = argv[3:]

    root = load_tree(filein)
    prune_tree(root, matching_sentence_ids(root, lemmas))
    ElementTree.ElementTree(root).write(fileout, encoding='utf-8')


if __name__ == '__main__':
    main()