IssueID #1104: added helper script for filtering out data samples
This commit is contained in:
parent
b3d30f3dd4
commit
9b8de239a5
20
scripts/get_sample.py
Normal file
20
scripts/get_sample.py
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
#!/usr/bin/python3
|
||||||
|
|
||||||
|
import lxml.etree as lxml
|
||||||
|
import sys
|
||||||
|
import random
|
||||||
|
|
||||||
|
def choose_sample_indexes(total, sample_size):
    """Return a random set of entry positions to keep.

    Args:
        total: Number of entries available (positions ``0 .. total - 1``).
        sample_size: Maximum number of positions to select.

    Returns:
        A set of ``min(sample_size, total)`` distinct positions chosen
        uniformly at random from ``range(total)``.

    Raises:
        ValueError: If ``sample_size`` is negative.
    """
    if sample_size < 0:
        raise ValueError("sample_size must be non-negative")
    # Clamp so that asking for more entries than exist keeps everything
    # instead of making random.sample() raise.
    return set(random.sample(range(total), min(sample_size, total)))


def main(argv):
    """Write a random sample of the root's ``entry`` children to a new file.

    Args:
        argv: Command-line vector: program name, input XML path, sample
            size, output XML path. Extra trailing arguments are ignored.

    Returns:
        Process exit status: 0 on success, 2 on bad command-line usage.
    """
    if len(argv) < 4:
        print(
            "usage: get_sample.py INPUT_FILE SAMPLE_SIZE OUTPUT_FILE",
            file=sys.stderr,
        )
        return 2

    input_file_name = argv[1]
    output_file_name = argv[3]
    try:
        sample_size = int(argv[2])
    except ValueError:
        sample_size = -1
    if sample_size < 0:
        print("SAMPLE_SIZE must be a non-negative integer", file=sys.stderr)
        return 2

    tree = lxml.parse(input_file_name)
    root = tree.getroot()

    # Evaluate the XPath once; the result is a plain list, so removing
    # elements from the tree while iterating over it is safe.
    entries = root.xpath('entry')
    # A set gives O(1) membership tests instead of re-slicing and
    # scanning a list for every entry.
    keep = choose_sample_indexes(len(entries), sample_size)

    for index, entry in enumerate(entries):
        if index not in keep:
            root.remove(entry)

    tree.write(output_file_name, encoding='UTF-8')
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
|
Loading…
Reference in New Issue
Block a user