IssueID #1104: added helper script for filtering out data samples

This commit is contained in:
Cyprian Laskowski 2020-11-11 21:40:26 +01:00
parent b3d30f3dd4
commit 9b8de239a5

20
scripts/get_sample.py Normal file
View File

@ -0,0 +1,20 @@
#!/usr/bin/python3
import lxml.etree as lxml
import sys
import random
def choose_sample_indexes(entry_count, sample_size):
    """Pick which entry positions survive the sampling.

    Args:
        entry_count: Number of candidate entries; positions are
            ``0 .. entry_count - 1``.
        sample_size: Requested number of entries to keep.

    Returns:
        A set of distinct positions chosen uniformly at random.  Its size is
        ``sample_size``, or ``entry_count`` when fewer entries exist than
        were requested.

    Raises:
        ValueError: If ``sample_size`` is negative.
    """
    if sample_size < 0:
        raise ValueError('sample size must be non-negative, got %d' % sample_size)
    # Clamp so that asking for more entries than exist keeps all of them
    # instead of making random.sample() raise.
    return set(random.sample(range(entry_count), min(sample_size, entry_count)))


def main(argv):
    """Write a copy of an XML file that keeps only a random sample of entries.

    Args:
        argv: Command-line vector: program name, input file name, sample
            size, output file name.  Any further items are ignored.

    The ``entry`` children of the root element that were not selected are
    removed from the parsed tree, and the tree is then written to the output
    file with UTF-8 encoding.
    """
    if len(argv) < 4:
        sys.exit('usage: %s INPUT_FILE SAMPLE_SIZE OUTPUT_FILE' % argv[0])
    input_file_name = argv[1]
    sample_size = int(argv[2])
    output_file_name = argv[3]

    tree = lxml.parse(input_file_name)
    root = tree.getroot()

    # Evaluate the XPath once and reuse the result for counting and iterating.
    entries = root.xpath('entry')
    # A set gives constant-time membership tests inside the loop below.
    keep = choose_sample_indexes(len(entries), sample_size)
    for index, entry in enumerate(entries):
        if index not in keep:
            root.remove(entry)

    tree.write(output_file_name, encoding='UTF-8')


if __name__ == '__main__':
    main(sys.argv)