Project copied

This commit is contained in:
2018-06-19 09:15:37 +02:00
commit a18e52a599
94 changed files with 87092 additions and 0 deletions

View File

@@ -0,0 +1,25 @@
package util;
import java.nio.ByteBuffer;
/**
 * Helpers for converting between {@code long} values and their fixed 8-byte
 * big-endian representations (the form stored as RocksDB values elsewhere
 * in this project).
 *
 * Originally taken from <a href="https://stackoverflow.com/a/4485196">StackOverflow</a>.
 */
public class ByteUtils {

    /**
     * Encodes the given value as exactly {@link Long#BYTES} bytes in
     * big-endian order.
     *
     * @param x value to encode
     * @return a new 8-byte array holding the big-endian encoding of {@code x}
     */
    public static byte[] longToBytes(long x) {
        byte[] encoded = new byte[Long.BYTES];
        // wrap() writes straight into the backing array; big-endian is the
        // ByteBuffer default, matching the original allocate/putLong version
        ByteBuffer.wrap(encoded).putLong(x);
        return encoded;
    }

    /**
     * Decodes an 8-byte big-endian array back into a {@code long}.
     *
     * @param bytes exactly {@link Long#BYTES} bytes, as produced by {@link #longToBytes(long)}
     * @return the decoded value
     * @throws IllegalArgumentException if {@code bytes} is null or not exactly 8 bytes
     *         (previously this surfaced as an opaque BufferUnderflow/OverflowException)
     */
    public static long bytesToLong(byte[] bytes) {
        if (bytes == null || bytes.length != Long.BYTES) {
            throw new IllegalArgumentException(
                    "expected exactly " + Long.BYTES + " bytes, got "
                            + (bytes == null ? "null" : bytes.length));
        }
        return ByteBuffer.wrap(bytes).getLong();
    }
}

View File

@@ -0,0 +1,46 @@
package util;
import java.util.Arrays;
import java.util.HashSet;
import java.util.stream.IntStream;
/**
 * Generates all index combinations used to build masked word patterns
 * (e.g. "X...." with no fixed positions is represented by the empty set).
 */
public class Combinations {

    // Accumulator shared by the recursive helper; reset on every
    // generateIndices() call. NOTE(review): not safe for concurrent callers.
    private static HashSet<HashSet<Integer>> result = new HashSet<>();

    /**
     * Recursively fills {@code data} with combinations of {@code combinationLength}
     * elements drawn from {@code arr[start..end]}, adding each completed
     * combination to {@link #result}.
     */
    static void combinationUtil(int arr[], Integer data[], int start, int end, int index, int combinationLength) {
        if (index == combinationLength) {
            // All slots filled: record this combination.
            result.add(new HashSet<>(Arrays.asList(data)));
            return;
        }
        // Try each remaining candidate, but only while enough elements are
        // left to fill the slots still open:
        //   (end - i + 1) remaining elements must cover
        //   (combinationLength - index) unfilled positions.
        int i = start;
        while (i <= end && end - i + 1 >= combinationLength - index) {
            data[index] = arr[i];
            combinationUtil(arr, data, i + 1, end, index + 1, combinationLength);
            i++;
        }
    }

    /**
     * Builds every combination of the indices 1..maxNOfIndices-1 of size
     * 1..maxNOfIndices-2, plus the empty combination.
     *
     * @param maxNOfIndices exclusive upper bound on the index values
     * @return set of index sets (includes the empty set)
     */
    public static HashSet<HashSet<Integer>> generateIndices(int maxNOfIndices) {
        result = new HashSet<>();
        int[] candidates = IntStream.range(1, maxNOfIndices).toArray();
        for (int size = 1; size < maxNOfIndices - 1; size++) {
            // Scratch array reused by the recursion for the current size.
            combinationUtil(candidates, new Integer[size], 0, candidates.length - 1, 0, size);
        }
        // The empty set stands for patterns with no fixed index at all ("X....").
        result.add(new HashSet<>());
        return result;
    }
}

View File

@@ -0,0 +1,267 @@
package util;
import static util.Util.*;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.apache.commons.lang3.tuple.Pair;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import data.Enums.WordLevelType;
@SuppressWarnings("unchecked")
public class Export {

    /** Record separator used by all generated CSV files. */
    private static final String NEW_LINE_SEPARATOR = "\n";

    /** Column header shared by the word-frequency exports. */
    private static final Object[] FILE_HEADER = {"word", "frequency", "percent"};

    /**
     * Writes every (title, word->frequency) pair as a JSON object into
     * "statistics.json" in the working directory. Pairs with empty maps are
     * skipped. IO errors are reported to stderr, not rethrown.
     */
    public static void SetToJSON(Set<Pair<String, Map<String, Long>>> set) {
        JSONArray wrapper = new JSONArray();
        for (Pair<String, Map<String, Long>> p : set) {
            String title = p.getLeft();
            Map<String, Long> map = p.getRight();
            if (map.isEmpty())
                continue;
            long total = Util.mapSumFrequencies(map);
            JSONArray data_wrapper = new JSONArray();
            for (Map.Entry<String, Long> e : map.entrySet()) {
                JSONObject data_entry = new JSONObject();
                data_entry.put("word", e.getKey());
                data_entry.put("frequency", e.getValue());
                data_entry.put("percent", formatNumberAsPercent((double) e.getValue() / total));
                data_wrapper.add(data_entry);
            }
            JSONObject metric = new JSONObject();
            metric.put("Title", title);
            metric.put("data", data_wrapper);
            wrapper.add(metric);
        }
        try (FileWriter file = new FileWriter("statistics.json")) {
            file.write(wrapper.toJSONString());
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Writes one CSV file per (title, word->frequency) pair into resultsPath.
     * Pairs with empty maps are skipped (after the filename is derived, as before).
     *
     * @return the path of the last file name derived ("" if the set was empty)
     */
    public static String SetToCSV(Set<Pair<String, Map<String, Long>>> set, File resultsPath, LinkedHashMap<String, String> headerInfoBlock) {
        String fileName = "";
        for (Pair<String, Map<String, Long>> p : set) {
            fileName = buildFileName(p.getLeft(), resultsPath);
            Map<String, Long> map = p.getRight();
            if (map.isEmpty())
                continue;
            long total = Util.mapSumFrequencies(map);
            // try-with-resources closes printer then writer even on failure
            // (previously a failed construction leaked the stream, and the
            // printer was closed after its underlying writer)
            try (OutputStreamWriter fileWriter = new OutputStreamWriter(new FileOutputStream(fileName), StandardCharsets.UTF_8);
                 CSVPrinter csvFilePrinter = new CSVPrinter(fileWriter, csvFormat())) {
                printHeaderInfo(csvFilePrinter, headerInfoBlock);
                csvFilePrinter.printRecord(FILE_HEADER);
                for (Map.Entry<String, Long> e : map.entrySet()) {
                    csvFilePrinter.printRecord(Arrays.asList(
                            e.getKey(),
                            e.getValue().toString(),
                            formatNumberAsPercent((double) e.getValue() / total)));
                }
            } catch (Exception e) {
                System.out.println("Error in CsvFileWriter!");
                e.printStackTrace();
            }
        }
        return fileName;
    }

    /**
     * Writes a precomputed result table (rows of word/frequency/percent) to a
     * single CSV file named after the title.
     *
     * @return the full path of the written file
     */
    public static String SetToCSV(String title, Object[][] result, File resultsPath, LinkedHashMap<String, String> headerInfoBlock) {
        String fileName = buildFileName(title, resultsPath);
        try (OutputStreamWriter fileWriter = new OutputStreamWriter(new FileOutputStream(fileName), StandardCharsets.UTF_8);
             CSVPrinter csvFilePrinter = new CSVPrinter(fileWriter, csvFormat())) {
            printHeaderInfo(csvFilePrinter, headerInfoBlock);
            csvFilePrinter.printRecord(FILE_HEADER);
            for (Object[] resultEntry : result) {
                csvFilePrinter.printRecord(Arrays.asList(
                        resultEntry[0],
                        resultEntry[1],
                        formatNumberAsPercent(resultEntry[2])));
            }
        } catch (Exception e) {
            System.out.println("Error in CsvFileWriter!");
            e.printStackTrace();
        }
        return fileName;
    }

    /**
     * Flattens a type -> key -> (word -> frequency) nesting into one CSV with
     * columns type/key/word/frequency.
     *
     * @return the full path of the written file
     */
    public static String nestedMapToCSV(String title, Map<WordLevelType, Map<String, Map<String, Long>>> result, File resultsPath, LinkedHashMap<String, String> headerInfoBlock) {
        // This export has its own header layout, distinct from FILE_HEADER.
        Object[] fileHeader = {"type", "key", "word", "frequency"};
        String fileName = buildFileName(title, resultsPath);
        try (OutputStreamWriter fileWriter = new OutputStreamWriter(new FileOutputStream(fileName), StandardCharsets.UTF_8);
             CSVPrinter csvFilePrinter = new CSVPrinter(fileWriter, csvFormat())) {
            printHeaderInfo(csvFilePrinter, headerInfoBlock);
            csvFilePrinter.printRecord(fileHeader);
            for (Map.Entry<WordLevelType, Map<String, Map<String, Long>>> typeEntry : result.entrySet()) {
                for (Map.Entry<String, Map<String, Long>> keyWordEntry : typeEntry.getValue().entrySet()) {
                    for (Map.Entry<String, Long> calculationResults : keyWordEntry.getValue().entrySet()) {
                        csvFilePrinter.printRecord(Arrays.asList(
                                typeEntry.getKey().getName(),
                                keyWordEntry.getKey(),
                                calculationResults.getKey(),
                                calculationResults.getValue()));
                    }
                }
            }
        } catch (Exception e) {
            System.out.println("Error in CsvFileWriter!");
            e.printStackTrace();
        }
        return fileName;
    }

    /** Derives "resultsPath/<title>.csv", replacing ": " with "-" and spaces with "_". */
    private static String buildFileName(String title, File resultsPath) {
        String fileName = title.replace(": ", "-").replace(" ", "_").concat(".csv");
        return resultsPath.toString().concat(File.separator).concat(fileName);
    }

    /** CSV format shared by all exports: default rules, ';' delimiter, '\n' records. */
    private static CSVFormat csvFormat() {
        return CSVFormat.DEFAULT.withRecordSeparator(NEW_LINE_SEPARATOR).withDelimiter(';');
    }

    /**
     * Emits each header key/value pair as its own record, followed by two
     * empty records separating the info block from the data table.
     */
    private static void printHeaderInfo(CSVPrinter csvFilePrinter, LinkedHashMap<String, String> headerInfoBlock) throws IOException {
        for (Map.Entry<String, String> entry : headerInfoBlock.entrySet()) {
            csvFilePrinter.printRecord(Arrays.asList(entry.getKey(), entry.getValue()));
        }
        csvFilePrinter.printRecord(Collections.emptyList());
        csvFilePrinter.printRecord(Collections.emptyList());
    }
}

View File

@@ -0,0 +1,31 @@
package util;
/**
 * Placeholder for a dedicated map-key type; currently unused and empty.
 *
 * NOTE(review): the commented-out draft previously kept here was defective —
 * its equals() called {@code this.equals(o)} (infinite recursion) and its
 * hashCode() returned a constant 0. The dead code has been removed; if this
 * type is ever needed, implement it with {@link java.util.Objects#equals}
 * and {@link java.util.Objects#hash} (or convert it to a record/final value
 * class) and make {@code compareTo} consistent with {@code equals}.
 */
public class Key /*implements Comparable<Key> */ {
}

View File

@@ -0,0 +1,63 @@
package util;
import java.util.concurrent.TimeUnit;
/**
* Adapted from http://memorynotfound.com/calculating-elapsed-time-java/
*/
/**
 * Simple elapsed-time stopwatch based on {@link System#nanoTime()}.
 * Adapted from http://memorynotfound.com/calculating-elapsed-time-java/
 */
public class TimeWatch {

    // nanoTime() timestamp captured when the watch was started/reset
    private long starts;

    private TimeWatch() {
        reset();
    }

    /** Creates a watch that starts measuring immediately. */
    public static TimeWatch start() {
        return new TimeWatch();
    }

    /** Restarts measurement from "now"; returns this for chaining. */
    private TimeWatch reset() {
        starts = System.nanoTime();
        return this;
    }

    /** @return elapsed nanoseconds since start/reset */
    private long time() {
        long ends = System.nanoTime();
        return ends - starts;
    }

    /** @return total elapsed time truncated to the given unit */
    private long time(TimeUnit unit) {
        return unit.convert(time(), TimeUnit.NANOSECONDS);
    }

    /** @return elapsed time as "m min, s sec" with seconds reduced modulo the minutes */
    private String toMinuteSeconds() {
        long minutes = time(TimeUnit.MINUTES);
        // BUGFIX: previously subtracted the raw minute count from the second
        // count (125 s printed as "2 min, 123 sec"); the minutes must be
        // converted to seconds before subtracting.
        long seconds = time(TimeUnit.SECONDS) - TimeUnit.MINUTES.toSeconds(minutes);
        return String.format("%d min, %d sec", minutes, seconds);
    }

    /** @return elapsed time broken down into hours, minutes, seconds and milliseconds */
    public String toFullTime() {
        long hours = time(TimeUnit.HOURS);
        long minutes = time(TimeUnit.MINUTES) - TimeUnit.HOURS.toMinutes(hours);
        long seconds = time(TimeUnit.SECONDS) - TimeUnit.HOURS.toSeconds(hours) - TimeUnit.MINUTES.toSeconds(minutes);
        long milliseconds = time(TimeUnit.MILLISECONDS) - TimeUnit.HOURS.toMillis(hours) - TimeUnit.MINUTES.toMillis(minutes) - TimeUnit.SECONDS.toMillis(seconds);
        return String.format("%d h, %d min, %d s, %d ms", hours, minutes, seconds, milliseconds);
    }

    @Override
    public String toString() {
        // BUGFIX: previously returned only the label without the value.
        return "Elapsed Time in nano seconds: " + time();
    }

    /** Demonstrates the intended usage of this class (not called in production). */
    private void exampleUsage() {
        TimeWatch watch = TimeWatch.start();
        // do something...
        System.out.println("Elapsed Time custom format: " + watch.toMinuteSeconds());
        System.out.println("Elapsed Time in seconds: " + watch.time(TimeUnit.SECONDS));
        System.out.println("Elapsed Time in nano seconds: " + watch.time());
    }
}

View File

@@ -0,0 +1,225 @@
package util;
import java.io.File;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.text.MessageFormat;
import java.util.*;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Stream;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import data.Settings;
import gui.GUIController;
import gui.ValidationUtil;
/**
 * Assorted static helpers: time/number formatting, map utilities and
 * working-directory discovery.
 */
public class Util {

    public final static Logger logger = LogManager.getLogger(Util.class);

    /**
     * Formats a nanosecond duration as "h, min, s, ms, µs, ns", each field
     * reduced modulo the larger units.
     *
     * @param time duration in nanoseconds
     */
    public static String toReadableTime(long time) {
        long hours = time(TimeUnit.HOURS, time);
        long minutes = time(TimeUnit.MINUTES, time) - TimeUnit.HOURS.toMinutes(hours);
        long seconds = time(TimeUnit.SECONDS, time) - TimeUnit.HOURS.toSeconds(hours) - TimeUnit.MINUTES.toSeconds(minutes);
        long milliseconds = time(TimeUnit.MILLISECONDS, time) - TimeUnit.HOURS.toMillis(hours) - TimeUnit.MINUTES.toMillis(minutes) - TimeUnit.SECONDS.toMillis(seconds);
        long microseconds = time(TimeUnit.MICROSECONDS, time) - TimeUnit.HOURS.toMicros(hours) - TimeUnit.MINUTES.toMicros(minutes) - TimeUnit.SECONDS.toMicros(seconds) - TimeUnit.MILLISECONDS.toMicros(milliseconds);
        long nanoseconds = time(TimeUnit.NANOSECONDS, time) - TimeUnit.HOURS.toNanos(hours) - TimeUnit.MINUTES.toNanos(minutes) - TimeUnit.SECONDS.toNanos(seconds) - TimeUnit.MILLISECONDS.toNanos(milliseconds) - TimeUnit.MICROSECONDS.toNanos(microseconds);
        return String.format("%d h, %d min, %d s, %d ms, %d µs, %d ns", hours, minutes, seconds, milliseconds, microseconds, nanoseconds);
    }

    /** Converts a nanosecond value to the given unit (truncating). */
    private static long time(TimeUnit unit, long t) {
        return unit.convert(t, TimeUnit.NANOSECONDS);
    }

    /**
     * Converts a number to a more readable format.
     * 12345 -> 12.345 (locale-dependent thousands separator)
     * 12345,678 -> 12.345,67
     *
     * @param o byte, double, float, int, long or short (boxed)
     * @return number formatted with thousands separator and, for floats,
     *         2 decimal places; a fixed error marker for anything else
     *         (including null, which previously threw a NullPointerException)
     */
    private static String formatNumberReadable(Object o) {
        if (isInstanceOfInteger(o))
            return String.format("%,d", o);
        else if (isInstanceOfFloat(o))
            return String.format("%,.2f", o);
        else
            return "- invalid input format -";
    }

    /** Formats a ratio (e.g. 0.123) as a percentage with up to 3 decimals. */
    public static String formatNumberAsPercent(Object o) {
        return MessageFormat.format("{0,number,#.###%}", o);
    }

    /** @return true if o is a boxed integral type (Byte/Short/Integer/Long); false for null */
    private static boolean isInstanceOfInteger(Object o) {
        return o instanceof Byte || o instanceof Short || o instanceof Integer || o instanceof Long;
    }

    /** @return true if o is a boxed floating-point type (Float/Double); false for null */
    private static boolean isInstanceOfFloat(Object o) {
        return o instanceof Float || o instanceof Double;
    }

    /**
     * Dumps a map to stdout as "key: value" lines.
     * NOTE(review): the "%,8d" conversion assumes the values are integral
     * numbers; other value types will throw at format time — confirm callers.
     */
    public static <K, V> void printMap(Map<K, V> map) {
        System.out.println("\nkey: value");
        map.forEach((k, v) -> System.out.print(String.format("%s:\t %,8d%n", k, v)));
        System.out.println();
    }

    /**
     * Generic map converter — AtomicLongs are not comparable to each other,
     * so convert a Map&lt;K, AtomicLong&gt; into a Map&lt;String, Long&gt;
     * keyed by K.toString().
     */
    public static <K, V> Map<String, Long> atomicInt2StringAndInt(Map<K, V> map) {
        // typed map instead of the former raw Map (unchecked warning)
        Map<String, Long> m = new HashMap<>();
        for (Map.Entry<K, V> e : map.entrySet()) {
            m.put(e.getKey().toString(), ((AtomicLong) e.getValue()).longValue());
        }
        return m;
    }

    /**
     * Sorts a map in descending order by value, keeping at most {@code limit}
     * entries (limit <= 0 means "no limit"). Returns a LinkedHashMap so the
     * sorted order is preserved on iteration.
     *
     * Sorting is delegated to the stream pipeline (TimSort under the hood,
     * O(n log n) worst case, close to O(n) on mostly-sorted data).
     */
    public static <K, V extends Comparable<? super V>> Map<K, V> sortByValue(Map<K, V> map, int limit) {
        // if limit is set to 0 or less, we take that to mean no limit at all
        if (limit <= 0) {
            limit = map.size();
        }
        Map<K, V> result = new LinkedHashMap<>();
        TimeWatch watch = TimeWatch.start();
        Stream<Map.Entry<K, V>> st = map.entrySet().stream();
        st.sorted(Map.Entry.comparingByValue(Comparator.reverseOrder())).limit(limit)
                .forEachOrdered(e -> result.put(e.getKey(), e.getValue()));
        if (Settings.PRINT_LOG) {
            System.out.println(String.format("Elapsed time for sorting %s items: %s",
                    formatNumberReadable(result.size()),
                    watch.toFullTime()));
        }
        return result;
    }

    /**
     * Prints a titled frequency table with each value's share of
     * {@code number_of_words} as a percentage.
     */
    public static <K, V> void printMap(Map<K, Integer> map, String title, int number_of_words) {
        System.out.println(String.format("\n%s\n------------\nkey: value\tpercent", title));
        map.forEach((k, v) ->
                System.out.println(String.format("%s:\t %s\t %s%%",
                        k,
                        Util.formatNumberReadable(v),
                        Util.formatNumberReadable((double) v / number_of_words * 100))));
        System.out.println();
    }

    /** Sums all frequency values of the map. */
    static long mapSumFrequencies(Map<String, Long> map) {
        long sum = 0;
        for (long value : map.values()) {
            sum += value;
        }
        return sum;
    }

    /**
     * Used for passing optional integer values for sorting.
     *
     * @param i optional varargs; only the first element is considered
     * @return the first element if present and positive, otherwise 0
     */
    public static int getValidInt(int... i) {
        if (i == null || i.length < 1 || i[0] <= 0) {
            return 0;
        } else {
            return i[0];
        }
    }

    /**
     * Check whether a map is empty. Also considers the edge case where the
     * map's values are lists, treating a map of only-empty lists as empty.
     */
    public static <K, V> boolean isMapEmpty(Map<K, V> map) {
        if (map.isEmpty()) {
            // default
            return true;
        }
        // otherwise check if keys map to values that are empty
        for (V v : map.values()) {
            // todo: generalize to all collections if/when needed
            // NOTE(review): unchecked cast — assumes every value is a
            // List<String>; a non-list value will throw ClassCastException.
            ArrayList<String> vl = new ArrayList((List<String>) v);
            if (!vl.isEmpty()) {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns the location of the main class if possible, otherwise null.
     * When the class is packaged inside a jar, the jar's parent directory
     * is returned instead of the jar file itself.
     */
    public static File getWorkingDirectory() {
        // get location of the currently executing class
        String path = GUIController.class.getProtectionDomain().getCodeSource().getLocation().getPath();
        // BUGFIX: log4j needs a {} placeholder, otherwise the argument is dropped
        logger.info("working dir path: {}", path);
        String decodedPath = null;
        try {
            decodedPath = URLDecoder.decode(path, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            logger.error("decoding: ", e);
        }
        if (decodedPath != null) {
            File workingDirectory = new File(decodedPath);
            // in case it's a file (class is packaged inside a jar), select its parent folder
            workingDirectory = workingDirectory.isFile() ? workingDirectory.getParentFile() : workingDirectory;
            if (ValidationUtil.isReadableDirectory(workingDirectory)) {
                logger.info("working dir is ok: {}", workingDirectory.getAbsolutePath());
                return workingDirectory;
            }
        }
        logger.info("working dir returning null");
        return null;
    }
}

View File

@@ -0,0 +1,132 @@
package util.db;
import static util.ByteUtils.*;
import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.time.LocalDateTime;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.commons.io.FileUtils;
import org.rocksdb.*;
import util.TimeWatch;
public class RDB {
private RocksDB db;
private String path;
private static final String UTF_8 = "UTF-8";
public RDB() {
// different dbs i ncase of concurrent calculations
this.path = System.getProperty("java.io.tmpdir")
.concat(File.separator)
.concat(String.format("corpusAnalyzer_db%d", LocalDateTime.now().toString().hashCode()));
this.db = createDB();
}
private RocksDB createDB() {
RocksDB.loadLibrary();
// the Options class contains a set of configurable DB options
// that determines the behaviour of the database.
try (final Options options = new Options()) {
options.setCreateIfMissing(true);
// a factory method that returns a RocksDB instance
try (final RocksDB rdb = RocksDB.open(options, path)) {
if (db != null) {
return rdb;
} else {
this.db = rdb;
}
}
} catch (RocksDBException e) {
// do some error handling
}
return null;
}
public void writeBatch(Map<String, AtomicLong> results) throws UnsupportedEncodingException {
RocksDB.loadLibrary();
// a factory method that returns a RocksDB instance
try (final RocksDB rdb = RocksDB.open(new Options(), path)) {
final WriteBatch wb = new WriteBatch();
for (Map.Entry<String, AtomicLong> entry : results.entrySet()) {
byte[] key = entry.getKey().getBytes(UTF_8);
long resultValue = entry.getValue().longValue();
try {
final byte[] dbValue = rdb.get(key);
if (dbValue != null) {
// value == null if key does not exist in db.
wb.put(key, longToBytes(bytesToLong(dbValue) + resultValue));
} else {
wb.put(key, longToBytes(entry.getValue().longValue()));
}
} catch (RocksDBException e) {
// TODO: error handling
}
}
TimeWatch watch = TimeWatch.start();
rdb.write(new WriteOptions(), wb);
System.out.println(String.format("Writing %d entries took: %s", wb.count(), watch.toFullTime()));
} catch (RocksDBException e) {
// do some error handling
}
}
// public byte[] atomicIntToByteArray(final AtomicLong i) {
// BigInteger bigInt = BigInteger.valueOf(i.intValue());
//
// return bigInt.toByteArray();
// }
public RocksDB getDb() {
return db;
}
public Map<String, AtomicLong> getDump() throws UnsupportedEncodingException {
Map<String, AtomicLong> dump = new HashMap<>();
RocksDB.loadLibrary();
// the Options class contains a set of configurable DB options
// that determines the behaviour of the database.
// a factory method that returns a RocksDB instance
try (final RocksDB rdb = RocksDB.open(new Options(), path)) {
try (RocksIterator it = rdb.newIterator()) {
it.seekToFirst();
// it.next();
while (it.isValid()) {
byte[] key = it.key();
byte[] value = it.value();
dump.put(new String(key, UTF_8), new AtomicLong(bytesToLong(value)));
it.next();
}
}
} catch (RocksDBException e) {
e.printStackTrace();
}
return dump;
}
public void delete() {
try {
FileUtils.deleteDirectory(new File(path));
} catch (IOException e) {
e.printStackTrace();
}
}
}