IssueID #1104: added helper script for filtering out data samples
This commit is contained in:
parent
b3d30f3dd4
commit
9b8de239a5
20
scripts/get_sample.py
Normal file
20
scripts/get_sample.py
Normal file
|
@ -0,0 +1,20 @@
|
|||
#!/usr/bin/python3
|
||||
|
||||
import lxml.etree as lxml
|
||||
import sys
|
||||
import random
|
||||
|
||||
def choose_sample_indexes(total, sample_size):
    """Return a random set of distinct positions to keep.

    Args:
        total: Number of available positions; candidates are
            ``0 .. total - 1``.
        sample_size: Requested number of positions. Values larger than
            ``total`` are clamped, so every position is kept.

    Returns:
        A set of ``min(sample_size, total)`` distinct integers.

    Raises:
        ValueError: If ``sample_size`` is negative.
    """
    if sample_size < 0:
        raise ValueError('sample size must not be negative')
    # A set gives O(1) membership tests while filtering the entries.
    return set(random.sample(range(total), min(sample_size, total)))


def main(argv):
    """Write a random sample of the input document's entries to a new file.

    Parses the XML file named by ``argv[1]``, keeps ``argv[2]`` randomly
    chosen ``entry`` children of the root element, removes the other
    ``entry`` children, and writes the tree to ``argv[3]`` as UTF-8.

    Args:
        argv: Command-line vector: program name, input file name,
            sample size, output file name. Extra arguments are ignored.
    """
    if len(argv) < 4:
        sys.exit('usage: get_sample.py INPUT_XML SAMPLE_SIZE OUTPUT_XML')

    input_file_name = argv[1]
    output_file_name = argv[3]
    try:
        sample_size = int(argv[2])
    except ValueError:
        sys.exit('sample size must be an integer, got %r' % argv[2])
    if sample_size < 0:
        sys.exit('sample size must not be negative, got %d' % sample_size)

    tree = lxml.parse(input_file_name)
    root = tree.getroot()

    # Evaluate the XPath once; the result is a list, so removing elements
    # from the tree while looping over it is safe.
    entries = root.xpath('entry')
    kept = choose_sample_indexes(len(entries), sample_size)

    for index, entry in enumerate(entries):
        if index not in kept:
            root.remove(entry)

    tree.write(output_file_name, encoding='UTF-8')


if __name__ == '__main__':
    main(sys.argv)
|
Loading…
Reference in New Issue
Block a user