Adding slimming script
This commit is contained in:
parent
08c8050f3f
commit
ed27e549b7
45
slim_sskj.py
Normal file
45
slim_sskj.py
Normal file
|
@ -0,0 +1,45 @@
|
|||
# import xml.etree.ElementTree as ElementTree
|
||||
import lxml.etree as ElementTree
|
||||
import sys
|
||||
import re
|
||||
|
||||
# Usage: slim_sskj.py FILEIN FILEOUT [LEMMA ...]
#
# Writes to FILEOUT a slimmed copy of the XML in FILEIN: only <s> elements
# that contain at least one <w> whose "lemma" attribute is one of the given
# lemmas are kept; <p> elements left without a kept <s>, and <div> elements
# left without a kept <p>, are removed as well.
if len(sys.argv) < 3:
    sys.exit("usage: slim_sskj.py FILEIN FILEOUT [LEMMA ...]")

filein = sys.argv[1]
fileout = sys.argv[2]

# A set gives constant-time membership tests in the per-word loop below.
lemmas = set(sys.argv[3:])

# Decode explicitly instead of relying on the platform default encoding.
# NOTE(review): assumes the input is UTF-8, as the declaration the original
# code stripped ('<?xml version="1.0" encoding="UTF-8"?>') suggests.
with open(filein, 'r', encoding='utf-8') as fp:
    xmlstring = fp.read()

# lxml's XML() rejects str input that carries an encoding declaration, so
# strip a leading BOM / XML declaration -- but only if one is really there,
# rather than chopping a fixed number of characters off the front.
xmlstring = re.sub(r'^\ufeff?\s*(?:<\?xml[^>]*\?>)?', '', xmlstring, count=1)

# Drop the first default-namespace declaration and the "xml:" attribute
# prefix so the plain tag names ('w', 'div', ...) and plain attribute names
# ('id') used below match.
xmlstring = re.sub(' xmlns="[^"]+"', '', xmlstring, count=1)
xmlstring = xmlstring.replace(' xml:', ' ')
root = ElementTree.XML(xmlstring)

# Ids of the sentences to keep. A sentence id is derived from a word id by
# dropping the word id's last dot-separated component.
good_s = set()

for word in root.iterfind('.//w'):
    word_id = word.get('id')
    if word_id is not None and word.get('lemma') in lemmas:
        good_s.add(word_id.rpartition('.')[0])

# findall() returns a list, so each loop walks a snapshot and removing
# elements inside the loop body cannot disturb the iteration.
for div in root.findall('.//div'):
    print("div ", div.get('id'))
    good_d = False

    for par in div.findall('./p'):
        good_p = False
        for s in par.findall('./s'):
            if s.get('id') not in good_s:
                par.remove(s)
                print("remove", s.attrib)
            else:
                print("keep", s.attrib)
                good_p = True

        if not good_p:
            div.remove(par)
        else:
            good_d = True

    if not good_d:
        # './/div' also matches divs nested below the root's children, and
        # remove() only accepts direct children -- so detach the div from
        # its actual parent instead of from the root.
        parent = div.getparent()
        if parent is not None:
            parent.remove(div)

ElementTree.ElementTree(root).write(fileout, encoding='utf-8')
|
Loading…
Reference in New Issue
Block a user