#!/usr/bin/python3
"""Helper script for filtering out data samples (IssueID #1104).

Usage:
    get_sample.py INPUT_FILE SAMPLE_SIZE OUTPUT_FILE

Parses INPUT_FILE as XML, keeps SAMPLE_SIZE randomly chosen ``entry``
children of the root element (removing all the others), and writes the
pruned tree to OUTPUT_FILE encoded as UTF-8.  The entries that are kept
stay in their original document order.
"""

import random
import sys

from lxml import etree


def pick_sample_indexes(total, sample_size):
    """Return a set of ``sample_size`` distinct random indexes below ``total``.

    ``sample_size`` is clamped to ``total`` so that asking for more entries
    than exist simply keeps every entry instead of raising.
    """
    return set(random.sample(range(total), min(sample_size, total)))


def main(argv):
    """Run the sampler with ``argv`` laid out like ``sys.argv``.

    Exits with an error message when arguments are missing, when
    SAMPLE_SIZE is not an integer, or when it is negative.  Arguments
    after the third one are ignored.
    """
    if len(argv) < 4:
        sys.exit(
            'usage: {} INPUT_FILE SAMPLE_SIZE OUTPUT_FILE'.format(argv[0])
        )
    input_file_name, raw_sample_size, output_file_name = argv[1:4]

    try:
        sample_size = int(raw_sample_size)
    except ValueError:
        sys.exit(
            'SAMPLE_SIZE must be an integer, got {!r}'.format(raw_sample_size)
        )
    if sample_size < 0:
        # A negative size used to slice from the end of the shuffled index
        # list and keep an unintended number of entries; refuse it instead.
        sys.exit(
            'SAMPLE_SIZE must not be negative, got {}'.format(sample_size)
        )

    tree = etree.parse(input_file_name)
    root = tree.getroot()

    # xpath() returns a plain list, so removing children from the root
    # while walking that list is safe.
    entries = root.xpath('entry')
    kept = pick_sample_indexes(len(entries), sample_size)

    for index, entry in enumerate(entries):
        if index not in kept:
            root.remove(entry)

    tree.write(output_file_name, encoding='UTF-8')


if __name__ == '__main__':
    main(sys.argv)