Project copied

This commit is contained in:
2018-06-19 09:15:37 +02:00
commit a18e52a599
94 changed files with 87092 additions and 0 deletions

View File

@@ -0,0 +1,3 @@
Manifest-Version: 1.0
Main-Class: gui.GUIController

View File

@@ -0,0 +1,15 @@
package alg;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;
/**
 * Shared helpers for the counting algorithms.
 */
public class Common {
    /**
     * Increments the occurrence counter stored under {@code o}, inserting a counter with the value 1 when the
     * key has no counter yet.
     *
     * @param map counters keyed by the counted item; modified in place
     * @param o   item whose counter is bumped
     * @param <K> key type of the map
     * @param <V> unused; kept so existing call sites with explicit type arguments keep compiling
     */
    public static <K, V> void updateMap(Map<K, AtomicLong> map, K o) {
        // putIfAbsent returns the counter that was already mapped (or null if ours was inserted)
        AtomicLong existing = map.putIfAbsent(o, new AtomicLong(1));
        if (existing != null) {
            // increment the returned instance directly: no second lookup, and no NPE if the key
            // is removed by another thread between the two map operations
            existing.incrementAndGet();
        }
    }
}

View File

@@ -0,0 +1,794 @@
package alg;
import static data.Enums.solar.SolarFilters.*;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.util.*;
import java.util.concurrent.ForkJoinPool;
import javax.xml.namespace.QName;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.*;
import org.apache.logging.log4j.LogManager;
import data.*;
import gui.ValidationUtil;
public class XML_processing {
public final static org.apache.logging.log4j.Logger logger = LogManager.getLogger(XML_processing.class);
// public static void processCorpus(Statistics stats) {
// // we can preset the list's size, so there won't be a need to resize it
// List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT);
//
// int i = 0;
// for (File f : Settings.corpus) {
// i++;
// readXML(f.toString(), stats);
// }
// }
// public static void readXML(String path, Statistics stats) {
// if (stats.getCorpusType() == CorpusType.GIGAFIDA) {
// readXMLGigafida(path, stats);
// } else if (stats.getCorpusType() == CorpusType.GOS) {
// readXMLGos(path, stats);
// } else if (stats.getCorpusType() == CorpusType.SOLAR) {
// readXMLSolar(path, stats);
// }
// }
/**
 * Hands a corpus file to the reader that matches the corpus type configured in {@code stats}.
 * Gigafida and ccKres share one reader; GOS and Solar have their own. Any other type is ignored.
 *
 * @param path  path of the XML file to read
 * @param stats statistics object that carries the corpus description and receives the results
 */
public static void readXML(String path, StatisticsNew stats) {
    final CorpusType type = stats.getCorpus().getCorpusType();

    if (type == CorpusType.GOS) {
        readXMLGos(path, stats);
        return;
    }
    if (type == CorpusType.SOLAR) {
        readXMLSolar(path, stats);
        return;
    }
    if (type == CorpusType.GIGAFIDA || type == CorpusType.CCKRES) {
        readXMLGigafida(path, stats);
    }
}
/**
 * Reads and returns the text of a header tag, or an empty string.
 * E.g. the title tag, for discerning the corpus' type.
 * Notice: only the first occurrence of the tag name (compared case-insensitively) is considered.
 *
 * @param path path of the XML file to inspect
 * @param tag  local name of the element to look for
 * @return the character data that directly follows the first matching start tag; an empty string when
 *         the tag is missing, has no text content, or the file cannot be read or parsed
 */
public static String readXMLHeaderTag(String path, String tag) {
    XMLInputFactory factory = XMLInputFactory.newInstance();
    // try-with-resources closes the file itself: XMLEventReader.close() does not close the underlying stream
    try (FileInputStream in = new FileInputStream(path)) {
        XMLEventReader eventReader = factory.createXMLEventReader(in);
        try {
            while (eventReader.hasNext()) {
                XMLEvent xmlEvent = eventReader.nextEvent();
                if (!xmlEvent.isStartElement()) {
                    continue;
                }
                String elementName = xmlEvent.asStartElement().getName().getLocalPart();
                if (elementName.equalsIgnoreCase(tag)) {
                    // an empty element (<title/>) is followed by an end tag, not by characters
                    XMLEvent content = eventReader.peek();
                    if (content != null && content.isCharacters()) {
                        return content.asCharacters().getData();
                    }
                    return "";
                }
            }
        } finally {
            try {
                eventReader.close();
            } catch (XMLStreamException e) {
                logger.error("closing stream", e);
            }
        }
    } catch (java.io.IOException | XMLStreamException e) {
        logger.error("reading header tag '" + tag + "' from " + path, e);
    }
    return "";
}
/**
 * Runs the fork/join computation that matches the configured analysis level over a batch of sentences
 * and blocks until it has finished.
 *
 * @param corpus batch of sentences to process
 * @param stats  statistics object; its filter selects the algorithm and it is passed on to the task
 */
private static void fj(List<Sentence> corpus, StatisticsNew stats) {
    ForkJoinPool pool = new ForkJoinPool();
    try {
        if (stats.getFilter().getAl() == AnalysisLevel.STRING_LEVEL) {
            alg.ngram.ForkJoin wc = new alg.ngram.ForkJoin(corpus, stats);
            pool.invoke(wc);
        } else if (stats.getFilter().getAl() == AnalysisLevel.WORD_LEVEL) {
            alg.word.ForkJoin wc = new alg.word.ForkJoin(corpus, stats);
            pool.invoke(wc);
        } else {
            // TODO:
            // alg.inflectedJOS.ForkJoin wc = new alg.inflectedJOS.ForkJoin(corpus, stats);
            // pool.invoke(wc);
        }
    } finally {
        // this method is called once per batch; without a shutdown every call leaves a pool
        // (and its worker threads) behind. invoke() has already returned, so no work is pending.
        pool.shutdown();
    }
}
// public static void readXMLGos(String path, Statistics stats) {
// boolean in_word = false;
// String taksonomija = "";
// String lemma = "";
// String msd = "";
// String type = stats.isGosOrthMode() ? "orth" : "norm"; // orth & norm
//
// List<Word> stavek = new ArrayList<>();
// List<Sentence> corpus = new ArrayList<>();
// String sentenceDelimiter = "seg";
// String taxonomyPrefix = "gos.";
//
// try {
// XMLInputFactory factory = XMLInputFactory.newInstance();
// XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path));
//
// while (eventReader.hasNext()) {
// XMLEvent event = eventReader.nextEvent();
//
// switch (event.getEventType()) {
// case XMLStreamConstants.START_ELEMENT:
//
// StartElement startElement = event.asStartElement();
// String qName = startElement.getName().getLocalPart();
//
// // "word" node
// if (qName.equals("w")) {
// in_word = true;
//
// if (type.equals("norm")) {
// // make sure we're looking at <w lemma...> and not <w type...>
// Iterator var = startElement.getAttributes();
// ArrayList<Object> attributes = new ArrayList<>();
// while (var.hasNext()) {
// attributes.add(var.next());
// }
//
// if (attributes.contains("msd")) {
// msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
// } else {
// msd = null;
// }
//
// if (attributes.contains("lemma")) {
// lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
// }
// }
// }
// // taxonomy node
// else if (qName.equalsIgnoreCase("catRef")) {
// // there are some term nodes at the beginning that are of no interest to us
// // they differ by not having the attribute "ref", so test will equal null
// Attribute test = startElement.getAttributeByName(QName.valueOf("target"));
//
// if (test != null) {
// // keep only taxonomy properties
// taksonomija = String.valueOf(test.getValue()).replace(taxonomyPrefix, "");
// }
// } else if (qName.equalsIgnoreCase("div")) {
// type = String.valueOf(startElement.getAttributeByName(QName.valueOf("type")).getValue());
//
// }
// break;
//
// case XMLStreamConstants.CHARACTERS:
// Characters characters = event.asCharacters();
//
// // "word" node value
// if (in_word) {
// if (type.equals("norm") && msd != null) {
// stavek.add(new Word(characters.getData(), lemma, msd));
// } else {
// stavek.add(new Word(characters.getData()));
// }
//
// in_word = false;
// }
// break;
//
// case XMLStreamConstants.END_ELEMENT:
// EndElement endElement = event.asEndElement();
//
// // parser reached end of the current sentence
// if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
// // add sentence to corpus
// corpus.add(new Sentence(stavek, taksonomija, type));
// // and start a new one
// stavek = new ArrayList<>();
//
// /* Invoke Fork-Join when we reach maximum limit of
// * sentences (because we can't read everything to
// * memory) or we reach the end of the file.
// */
// if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
// fj(corpus, stats);
// // empty the current corpus, since we don't need
// // the data anymore
// corpus.clear();
// }
// }
//
// // backup
// if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
// fj(corpus, stats);
// corpus.clear();
// }
//
// break;
// }
// }
// } catch (FileNotFoundException | XMLStreamException e) {
// e.printStackTrace();
// }
// }
/**
 * Streams a Solar corpus file and feeds its sentences to the fork/join computation in batches.
 *
 * <p>Words are read from {@code w3} elements (with their {@code msd} and {@code lemma} attributes);
 * a {@code c3} element whose text is "." ends the current sentence. The tags inside each {@code head}
 * element are collected and checked against the user's Solar filters when the {@code head} closes;
 * the result stays in force until the closing {@code body} tag resets it.
 *
 * @param path  path of the XML file to read
 * @param stats statistics object providing the filter and receiving the results
 */
public static void readXMLSolar(String path, StatisticsNew stats) {
    boolean inWord = false;
    String lemma = "";
    String msd = "";
    List<Word> stavek = new ArrayList<>();
    List<Sentence> corpus = new ArrayList<>();
    // used for filter
    Set<String> headTags = new HashSet<>(Arrays.asList("sola", "predmet", "razred", "regija", "tip", "leto"));
    Map<String, String> headBlock = null;
    boolean includeThisBlock = false;

    // try-with-resources closes the file; XMLEventReader.close() leaves the underlying stream open
    try (FileInputStream in = new FileInputStream(path)) {
        XMLEventReader eventReader = XMLInputFactory.newInstance().createXMLEventReader(in);
        try {
            while (eventReader.hasNext()) {
                XMLEvent event = eventReader.nextEvent();
                switch (event.getEventType()) {
                    case XMLStreamConstants.START_ELEMENT:
                        StartElement startElement = event.asStartElement();
                        String qName = startElement.getName().getLocalPart();
                        if (qName.equals("w3")) {
                            // "word" node
                            inWord = true;
                            msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
                            lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
                        } else if (qName.equals("c3")) {
                            // only consume the following event when it really is the element's text
                            XMLEvent c3Text = eventReader.peek();
                            if (c3Text != null && c3Text.isCharacters()) {
                                String c3Content = eventReader.nextEvent().asCharacters().getData();
                                if (c3Content.equals(".") && includeThisBlock) {
                                    // add sentence to corpus and start a new one
                                    corpus.add(new Sentence(stavek));
                                    stavek = new ArrayList<>();
                                    /* Invoke Fork-Join when we reach maximum limit of
                                     * sentences (because we can't read everything to
                                     * memory) or we reach the end of the file.
                                     */
                                    if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
                                        fj(corpus, stats);
                                        // empty the current corpus, since we don't need the data anymore
                                        corpus.clear();
                                    }
                                }
                            }
                        } else if (headTags.contains(qName)) {
                            // a filter tag seen before any <head> has nowhere to be stored
                            XMLEvent tagText = eventReader.peek();
                            if (headBlock != null && tagText != null && tagText.isCharacters()) {
                                headBlock.put(qName, eventReader.nextEvent().asCharacters().getData());
                            }
                        } else if (qName.equals("head")) {
                            headBlock = new HashMap<>();
                        }
                        break;

                    case XMLStreamConstants.CHARACTERS:
                        // "word" node value
                        if (inWord) {
                            // words of a filtered-out block must not be collected, otherwise they
                            // end up in the first sentence of the next block that passes the filter
                            if (includeThisBlock) {
                                stavek.add(new Word(event.asCharacters().getData(), lemma, msd));
                            }
                            inWord = false;
                        }
                        break;

                    case XMLStreamConstants.END_ELEMENT:
                        String qNameEnd = event.asEndElement().getName().getLocalPart();
                        if (qNameEnd.equals("head")) {
                            // validate and set boolean
                            if (validateHeadBlock(headBlock, stats.getFilter().getSolarFilters())) {
                                includeThisBlock = true;
                            }
                        } else if (qNameEnd.equals("body")) {
                            // new block, reset filter status
                            includeThisBlock = false;
                        }
                        // backup: flush whatever is left when the corpus root closes
                        if (qNameEnd.equalsIgnoreCase("korpus")) {
                            fj(corpus, stats);
                            corpus.clear();
                        }
                        break;
                }
            }
        } finally {
            try {
                eventReader.close();
            } catch (XMLStreamException e) {
                logger.error("closing stream", e);
            }
        }
    } catch (java.io.IOException | XMLStreamException e) {
        logger.error("reading " + path, e);
    }
}
/**
 * Checks a head block read from the corpus against the filters set by the user.
 *
 * <p>For every filter key that has at least one value, the head block must contain that key with a
 * value equal to one of the user's values. Keys whose value set is {@code null} or empty impose no
 * restriction.
 *
 * @param readHeadBlock block of tags read from the corpus
 * @param userSetFilter tags with values set by the user; {@code null} means "no filtering"
 *
 * @return {@code true} if the block satisfies every restricting filter key, {@code false} otherwise
 */
private static boolean validateHeadBlock(Map<String, String> readHeadBlock, HashMap<String, HashSet<String>> userSetFilter) {
    if (userSetFilter == null) {
        return true;
    }
    for (Map.Entry<String, HashSet<String>> filterEntry : userSetFilter.entrySet()) {
        String key = filterEntry.getKey();
        HashSet<String> allowedValues = filterEntry.getValue();
        if (allowedValues == null || allowedValues.isEmpty()) {
            // nothing selected for this tag -> no restriction
            continue;
        }
        if (readHeadBlock == null) {
            // a restriction exists but there is no head block to satisfy it
            return false;
        }
        // A head block carries one value per tag, so it passes when ANY selected value matches.
        // (Letting each iteration overwrite the result would make the outcome depend on the
        // HashSet's iteration order.)
        boolean matched = false;
        for (String value : allowedValues) {
            if (validateHeadBlockEntry(readHeadBlock, key, value)) {
                matched = true;
                break;
            }
        }
        if (!matched) {
            return false;
        }
    }
    // if it gets to this point, it passed all the filters
    return true;
}
/**
 * Tests a single filter value against a head block.
 *
 * @param readHeadBlock block of tags read from the corpus
 * @param userSetKey    tag name the user filtered on
 * @param userSetValue  value the user selected for that tag
 * @return {@code true} only when the head block contains the tag and its value equals the selected one
 */
private static boolean validateHeadBlockEntry(Map<String, String> readHeadBlock, String userSetKey, String userSetValue) {
    // a missing tag fails the filter (not likely, but an edge case anyway); so does a different value
    return readHeadBlock.containsKey(userSetKey)
            && readHeadBlock.get(userSetKey).equals(userSetValue);
}
/**
 * Parses XML headers for information about its taxonomy (if supported) or filters (solar).
 *
 * <p>For corpus types listed by {@code Tax.getCorpusTypesWithTaxonomy()} the {@code target} attribute of
 * every {@code catRef} element inside a header is collected (with "#" removed). Otherwise, for Solar,
 * the text of each filter tag inside a {@code head} element is collected per tag name.
 *
 * @param filepath      path of the XML file to inspect
 * @param corpusIsSplit is corpus split into multiple xml files, or are all entries grouped into one large xml file;
 *                      when {@code true} reading stops at the end of the first header
 * @param corpusType    type of the corpus the file belongs to
 * @return a {@code HashSet<String>} of taxonomy values when taxonomy is parsed, otherwise a
 *         {@code HashMap<String, HashSet<String>>} of filter values; whatever was collected so far is
 *         returned if the file cannot be read or parsed
 */
public static Object readXmlHeaderTaxonomyAndFilters(String filepath, boolean corpusIsSplit, CorpusType corpusType) {
    boolean parseTaxonomy = Tax.getCorpusTypesWithTaxonomy().contains(corpusType);

    // solar; stays empty for other corpus types so the lookup below can never hit a null set
    Set<String> headTags = Collections.emptySet();
    HashMap<String, HashSet<String>> resultFilters = new HashMap<>();
    // taxonomy corpora
    HashSet<String> resultTaxonomy = new HashSet<>();

    String headTagName;
    if (corpusType == CorpusType.SOLAR) {
        headTagName = "head";
        // used for filter
        headTags = new HashSet<>(Arrays.asList(SOLA, PREDMET, RAZRED, REGIJA, TIP, LETO));
        // init results now to avoid null pointers
        headTags.forEach(f -> resultFilters.put(f, new HashSet<>()));
    } else {
        headTagName = "teiHeader";
    }

    // try-with-resources closes the file; XMLEventReader.close() leaves the underlying stream open
    try (FileInputStream in = new FileInputStream(filepath)) {
        XMLEventReader xmlEventReader = XMLInputFactory.newInstance().createXMLEventReader(in);
        try {
            boolean insideHeader = false;
            while (xmlEventReader.hasNext()) {
                XMLEvent xmlEvent = xmlEventReader.nextEvent();
                if (xmlEvent.isStartElement()) {
                    StartElement startElement = xmlEvent.asStartElement();
                    String elementName = startElement.getName().getLocalPart();
                    if (elementName.equalsIgnoreCase(headTagName)) {
                        // this toggle is true while we're inside a header and false when we're not
                        // (so attributes of body elements are not read unnecessarily)
                        insideHeader = true;
                    }
                    if (!insideHeader) {
                        continue;
                    }
                    if (parseTaxonomy && elementName.equalsIgnoreCase("catRef")) {
                        // catRef nodes without a target attribute carry no taxonomy value
                        Attribute target = startElement.getAttributeByName(QName.valueOf("target"));
                        if (target != null) {
                            resultTaxonomy.add(target.getValue().replace("#", ""));
                        }
                    } else if (!parseTaxonomy && headTags.contains(elementName)) {
                        // an empty filter tag is followed by its end tag, not by characters
                        XMLEvent content = xmlEventReader.peek();
                        if (content != null && content.isCharacters()) {
                            resultFilters.get(elementName).add(xmlEventReader.nextEvent().asCharacters().getData());
                        }
                    }
                } else if (xmlEvent.isEndElement() && isEndElementEndOfHeader(xmlEvent, headTagName)) {
                    if (corpusIsSplit) {
                        // one header block per file, so we can stop once it ends
                        return parseTaxonomy ? resultTaxonomy : resultFilters;
                    }
                    // whole corpus in one file, so we have to continue reading in order to find all header blocks
                    insideHeader = false;
                }
            }
        } finally {
            try {
                xmlEventReader.close();
            } catch (XMLStreamException e) {
                logger.error("closing stream", e);
            }
        }
    } catch (XMLStreamException e) {
        logger.error("Streaming error", e);
    } catch (FileNotFoundException e) {
        logger.error("File not found", e);
        // TODO: keep a list of files that threw this error and a dirty boolean marker -> if true, alert user
    } catch (java.io.IOException e) {
        logger.error("I/O error", e);
    }
    return parseTaxonomy ? resultTaxonomy : resultFilters;
}
/**
 * Tells whether an end-element event closes the header tag.
 *
 * @param event     event that must be an end element
 * @param headerTag local name of the header element; compared case-insensitively
 * @return {@code true} when the closing element's local name matches {@code headerTag}
 */
private static boolean isEndElementEndOfHeader(XMLEvent event, String headerTag) {
    EndElement closing = event.asEndElement();
    String closingName = closing.getName().getLocalPart();
    return closingName.equalsIgnoreCase(headerTag);
}
/**
 * Streams a TEI corpus file whose sentences are {@code s} elements and feeds them to the fork/join
 * computation in batches of at most {@code Settings.CORPUS_SENTENCE_LIMIT} sentences.
 *
 * <p>Words are read from {@code w} elements (with their {@code msd} and {@code lemma} attributes).
 * Taxonomy values are collected from {@code catRef} elements; when the header closes and the user set a
 * taxonomy filter that shares no value with the file, reading stops.
 *
 * @param path  path of the XML file to read
 * @param stats statistics object providing the filter and receiving the results
 * @return {@code false} when the file was skipped because its taxonomy does not match the filter,
 *         {@code true} otherwise (including when reading failed and the error was logged)
 */
@SuppressWarnings("Duplicates")
public static boolean readXMLGigafida(String path, StatisticsNew stats) {
    boolean inWord = false;
    ArrayList<String> currentFiletaxonomy = new ArrayList<>();
    String lemma = "";
    String msd = "";
    List<Word> sentence = new ArrayList<>();
    // preset the list's size, so there won't be a need to resize it
    List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT);
    String sentenceDelimiter = "s";

    // try-with-resources closes the file; XMLEventReader.close() leaves the underlying stream open
    try (FileInputStream in = new FileInputStream(path)) {
        XMLEventReader eventReader = XMLInputFactory.newInstance().createXMLEventReader(in);
        try {
            while (eventReader.hasNext()) {
                XMLEvent event = eventReader.nextEvent();
                switch (event.getEventType()) {
                    case XMLStreamConstants.START_ELEMENT:
                        StartElement startElement = event.asStartElement();
                        String qName = startElement.getName().getLocalPart();
                        if (qName.equals("w")) {
                            // "word" node
                            inWord = true;
                            msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
                            lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
                        } else if (qName.equalsIgnoreCase("catRef")) {
                            // taxonomy node; nodes without a "target" attribute are of no interest to us
                            Attribute tax = startElement.getAttributeByName(QName.valueOf("target"));
                            if (tax != null) {
                                // keep only taxonomy properties
                                currentFiletaxonomy.add(String.valueOf(tax.getValue()).replace("#", ""));
                            }
                        }
                        break;

                    case XMLStreamConstants.CHARACTERS:
                        // "word" node value
                        if (inWord) {
                            sentence.add(new Word(event.asCharacters().getData(), lemma, msd));
                            inWord = false;
                        }
                        break;

                    case XMLStreamConstants.END_ELEMENT:
                        String endName = event.asEndElement().getName().getLocalPart();
                        if (endName.equals(sentenceDelimiter)) {
                            // parser reached end of the current sentence: keep it if it passes filters
                            sentence = runFilters(sentence, stats.getFilter());
                            if (sentence != null && !ValidationUtil.isEmpty(sentence)) {
                                corpus.add(new Sentence(sentence));
                            }
                            // and start a new one
                            sentence = new ArrayList<>();
                            /* Invoke Fork-Join when we reach maximum limit of
                             * sentences (because we can't read everything to
                             * memory) or we reach the end of the file.
                             */
                            if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
                                fj(corpus, stats);
                                // empty the current corpus, since we don't need the data anymore
                                corpus.clear();
                                // TODO: if (stats.isUseDB()) {
                                //     stats.storeTmpResultsToDB();
                                // }
                            }
                        } else if (endName.equals("teiHeader")) {
                            // before proceeding to read this file, make sure that taxonomy filters are a match
                            if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
                                currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection
                                if (currentFiletaxonomy.isEmpty()) {
                                    // taxonomies don't match so stop
                                    return false;
                                }
                            }
                        } else if (endName.equalsIgnoreCase("tei")) {
                            // fallback: flush whatever is left when the document root closes
                            fj(corpus, stats);
                            corpus.clear();
                            // TODO: if (stats.isUseDB()) {
                            //     stats.storeTmpResultsToDB();
                            // }
                        }
                        break;
                }
            }
        } finally {
            try {
                eventReader.close();
            } catch (XMLStreamException e) {
                logger.error("closing stream", e);
            }
        }
    } catch (java.io.IOException | XMLStreamException e) {
        logger.error("reading " + path, e);
    }
    return true;
}
/**
 * Streams a GOS corpus file whose sentences are {@code seg} elements and feeds them to the fork/join
 * computation in batches of at most {@code Settings.CORPUS_SENTENCE_LIMIT} sentences.
 *
 * <p>The {@code type} attribute of the enclosing {@code div} decides whether a sentence belongs to the
 * "orth" or the other transcription; only sentences of the transcription selected by
 * {@code stats.getCorpus().isGosOrthMode()} are kept. {@code w} elements that carry a {@code type}
 * attribute are not treated as words. When the user set a taxonomy filter, sentences following a
 * header whose taxonomy shares no value with the filter are dropped.
 *
 * @param path  path of the XML file to read
 * @param stats statistics object providing corpus settings and the filter, and receiving the results
 * @return always {@code true}; read errors are logged
 */
@SuppressWarnings("Duplicates")
public static boolean readXMLGos(String path, StatisticsNew stats) {
    boolean inWord = false;
    boolean inOrthDiv = false;
    boolean computeForOrth = stats.getCorpus().isGosOrthMode();
    ArrayList<String> currentFiletaxonomy = new ArrayList<>();
    String lemma = "";
    String msd = "";
    List<Word> sentence = new ArrayList<>();
    // preset the list's size, so there won't be a need to resize it
    List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT);
    String sentenceDelimiter = "seg";
    String gosType = stats.getFilter().hasMsd() ? "norm" : "orth"; // orth & norm
    boolean includeFile = true;

    // try-with-resources closes the file; XMLEventReader.close() leaves the underlying stream open
    try (FileInputStream in = new FileInputStream(path)) {
        XMLEventReader eventReader = XMLInputFactory.newInstance().createXMLEventReader(in);
        try {
            while (eventReader.hasNext()) {
                XMLEvent event = eventReader.nextEvent();
                switch (event.getEventType()) {
                    case XMLStreamConstants.START_ELEMENT:
                        StartElement startElement = event.asStartElement();
                        String qName = startElement.getName().getLocalPart();
                        if (qName.equalsIgnoreCase("div")) {
                            // a div without a type attribute leaves the current state untouched
                            // (reading the attribute unconditionally would throw on such a div)
                            String divType = extractAttributes(startElement).get("type");
                            if (divType != null) {
                                gosType = divType;
                                if (qName.equals("div")) {
                                    inOrthDiv = divType.equals("orth");
                                }
                            }
                        } else if (qName.equals("w")) {
                            // "word" node; check that it's not a type
                            HashMap<String, String> atts = extractAttributes(startElement);
                            if (!atts.containsKey("type")) {
                                inWord = true;
                                // reset per word so a word without msd/lemma does not inherit the
                                // values of the previous word; msd == null means "no msd" below
                                msd = atts.get("msd");
                                lemma = atts.containsKey("lemma") ? atts.get("lemma") : "";
                            }
                        } else if (qName.equalsIgnoreCase("catRef")) {
                            // taxonomy node; nodes without a "target" attribute are of no interest to us
                            Attribute tax = startElement.getAttributeByName(QName.valueOf("target"));
                            if (tax != null) {
                                // keep only taxonomy properties
                                currentFiletaxonomy.add(String.valueOf(tax.getValue()));
                            }
                        }
                        break;

                    case XMLStreamConstants.CHARACTERS:
                        // "word" node value
                        if (inWord) {
                            Characters characters = event.asCharacters();
                            if (gosType.equals("norm") && msd != null) {
                                sentence.add(new Word(characters.getData(), lemma, msd));
                            } else {
                                sentence.add(new Word(characters.getData()));
                            }
                            inWord = false;
                        }
                        break;

                    case XMLStreamConstants.END_ELEMENT:
                        String endName = event.asEndElement().getName().getLocalPart();
                        if (endName.equals(sentenceDelimiter)) {
                            // parser reached end of the current sentence
                            boolean saveSentence = computeForOrth == inOrthDiv;
                            if (includeFile && saveSentence) {
                                // filter first, then test: the filters may reject the sentence or
                                // remove all of its words, and such a sentence must not be stored
                                List<Word> filtered = runFilters(sentence, stats.getFilter());
                                if (filtered != null && !ValidationUtil.isEmpty(filtered)) {
                                    corpus.add(new Sentence(filtered));
                                }
                            }
                            // and start a new one
                            sentence = new ArrayList<>();
                            /* Invoke Fork-Join when we reach maximum limit of
                             * sentences (because we can't read everything to
                             * memory) or we reach the end of the file.
                             */
                            if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
                                fj(corpus, stats);
                                // empty the current corpus, since we don't need the data anymore
                                corpus.clear();
                            }
                        } else if (endName.equals("teiHeader")) {
                            // before proceeding to read this file, make sure that taxonomy filters are a match
                            if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
                                currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection
                                // disregard this entry if taxonomies don't match
                                includeFile = !currentFiletaxonomy.isEmpty();
                                currentFiletaxonomy = new ArrayList<>();
                            }
                        } else if (endName.equalsIgnoreCase("tei")) {
                            // backup: flush whatever is left when the document root closes
                            fj(corpus, stats);
                            corpus.clear();
                        }
                        break;
                }
            }
        } finally {
            try {
                eventReader.close();
            } catch (XMLStreamException e) {
                logger.error("closing stream", e);
            }
        }
    } catch (java.io.IOException | XMLStreamException e) {
        logger.error("reading " + path, e);
    }
    return true;
}
/**
 * Runs the sentence through some filters, so we don't do calculations when unnecessary.
 * Filters (applied only at string analysis level):
 * <ol>
 * <li><b>Ngrams:</b> omit sentences that are shorter than the ngram value (e.g. 3 gram of a single word sentence)</li>
 * <li><b>Letter ngrams:</b> omit words that are shorter than the specified string length (e.g. combinations of 3 letters when the word consists of only 2 letters)</li>
 * </ol>
 *
 * @param sentence words of the sentence; words may be removed from this list in place (2.)
 * @param filter   user-set filter providing analysis level, ngram value, string length and target
 * @return a new empty list (if fails 1.) or the passed sentence with some words removed (2.); never {@code null}
 */
private static List<Word> runFilters(List<Word> sentence, Filter filter) {
    if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
        // ngram level: if not 0 must be less than or equal to number of words in this sentence.
        if (filter.getNgramValue() > 0 && filter.getNgramValue() > sentence.size()) {
            // an empty sentence, as documented; returning null here would make callers that wrap
            // the result without a null check store a sentence backed by null
            return new ArrayList<>();
        }
        // if we're calculating values for letters, omit words that are shorter than string length
        if (filter.getNgramValue() == 0) {
            sentence.removeIf(w -> (filter.getCalculateFor() == CalculateFor.WORD && w.getWord().length() < filter.getStringLength())
                    || (filter.getCalculateFor() == CalculateFor.LEMMA && w.getLemma().length() < filter.getStringLength()));
        }
    }
    return sentence;
}
/**
 * Copies the attributes of a start element into a map.
 *
 * @param se start element to read
 * @return attribute values keyed by the attribute's local name
 */
private static HashMap<String, String> extractAttributes(StartElement se) {
    HashMap<String, String> byLocalName = new HashMap<>();
    for (Iterator<?> it = se.getAttributes(); it.hasNext(); ) {
        Attribute attribute = (Attribute) it.next();
        byLocalName.put(attribute.getName().getLocalPart(), attribute.getValue());
    }
    return byLocalName;
}
}

View File

@@ -0,0 +1,67 @@
package alg.inflectedJOS;
import java.util.List;
import java.util.concurrent.RecursiveAction;
import data.Sentence;
import data.Statistics;
/**
 * Fork/join task that splits a list of sentences in halves until a part is smaller than
 * {@link #ACCEPTABLE_SIZE} and then runs {@code InflectedJOSCount.calculateForAll} on that part.
 */
public class ForkJoin extends RecursiveAction {
    private static final long serialVersionUID = -1260951004477299634L;

    /** Parts with fewer sentences than this are processed directly instead of being split further. */
    private static final int ACCEPTABLE_SIZE = 1000;

    private final List<Sentence> corpus;
    private final Statistics stats;
    /** Index of the first sentence of this part (inclusive). */
    private final int start;
    /** Index just past the last sentence of this part (exclusive). */
    private final int end;

    /**
     * Constructor for subproblems.
     */
    private ForkJoin(List<Sentence> corpus, int start, int end, Statistics stats) {
        this.corpus = corpus;
        this.start = start;
        this.end = end;
        this.stats = stats;
    }

    /**
     * Default constructor for the initial problem
     */
    public ForkJoin(List<Sentence> corpus, Statistics stats) {
        this(corpus, 0, corpus.size(), stats);
    }

    /** Processes this task's part of the corpus on the current thread. */
    private void computeDirectly() {
        List<Sentence> subCorpus = corpus.subList(start, end);
        if (stats.isTaxonomySet()) {
            InflectedJOSCount.calculateForAll(subCorpus, stats, stats.getInflectedJosTaxonomy());
        } else {
            InflectedJOSCount.calculateForAll(subCorpus, stats, null);
        }
    }

    @Override
    protected void compute() {
        int subCorpusSize = end - start;
        if (subCorpusSize < ACCEPTABLE_SIZE) {
            computeDirectly();
            return;
        }
        int mid = start + subCorpusSize / 2;
        ForkJoin left = new ForkJoin(corpus, start, mid, stats);
        ForkJoin right = new ForkJoin(corpus, mid, end, stats);
        // invokeAll forks one half, computes the other on this thread and joins both; forking both
        // halves and joining them in fork order keeps the current worker idle and is the pattern
        // the ForkJoinTask documentation advises against
        invokeAll(left, right);
    }
}

View File

@@ -0,0 +1,170 @@
package alg.inflectedJOS;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import alg.Common;
import data.Sentence;
import data.Statistics;
import data.StatisticsNew;
import data.Word;
public class InflectedJOSCount {
public static HashMap<Integer, ArrayList<ArrayList<Integer>>> indices;
// static {
// // calculate all possible combinations of indices we will substitute with a '-' for substring statistics
// indices = new HashMap<>();
// for (int i = 5; i <= 8; i++) {
// indices.put(i, calculateCombinations(i));
// }
// }
//
// private static List<Integer> calculateCombinations(int i) {
// int arr[] = {1, 2, 3, 4, 5};
// int r = 3;
// int n = arr.length;
// ArrayList<ArrayList<Integer>> result = new ArrayList<>();
//
// return printCombination(arr, n, r);
// }
//
// /* arr[] ---> Input Array
// data[] ---> Temporary array to store current combination
// start & end ---> Staring and Ending indexes in arr[]
// index ---> Current index in data[]
// r ---> Size of a combination to be printed */
// static void combinationUtil(int arr[], int data[], int start,
// int end, int index, int r, ArrayList<ArrayList<Integer>> result) {
// // Current combination is ready to be printed, print it
// ArrayList<Integer> tmpResult = new ArrayList<>();
//
// if (index == r) {
// ArrayList<Integer> tmpResult = new ArrayList<>();
// for (int j = 0; j < r; j++)
// System.out.print(data[j] + " ");
// System.out.println("");
// return;
// }
//
// // replace index with all possible elements. The condition
// // "end-i+1 >= r-index" makes sure that including one element
// // at index will make a combination with remaining elements
// // at remaining positions
// for (int i = start; i <= end && end - i + 1 >= r - index; i++) {
// data[index] = arr[i];
// combinationUtil(arr, data, i + 1, end, index + 1, r);
// }
// }
//
// // The main function that prints all combinations of size r
// // in arr[] of size n. This function mainly uses combinationUtil()
// static void printCombination(int arr[], int n, int r) {
// // A temporary array to store all combination one by one
// int data[] = new int[r];
//
// // Print all combination using temprary array 'data[]'
// combinationUtil(arr, data, 0, n - 1, 0, r);
// }
// public static void calculateForAll(List<Sentence> corpus, Statistics stats, String taxonomy) {
// for (Sentence s : corpus) {
// // disregard if wrong taxonomy
// if (!(s.getTaxonomy().startsWith(taxonomy))) {
// continue;
// }
//
// calculateCommon(s, stats.result);
//
// for (Word word : s.getWords()) {
// // skip if current word is not inflected
// if (!(word.getMsd().length() > 0)) {
// continue;
// }
//
// String msd = word.getMsd();
//
// StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
//
// for (int i = 1; i < msd.length(); i++) {
// entry.setCharAt(i, msd.charAt(i));
// Common.updateMap(stats.result, entry.toString());
// entry.setCharAt(i, '-');
// }
// }
// }
// }
// public static void calculateForAll(List<Sentence> corpus, Statistics stats) {
// for (Sentence s : corpus) {
// for (Word word : s.getWords()) {
// if (!(word.getMsd().length() > 0)) {
// continue;
// }
//
// String msd = word.getMsd();
//
// StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
//
// for (int i = 1; i < msd.length(); i++) {
// entry.setCharAt(i, msd.charAt(i));
// Common.updateMap(stats.result, entry.toString());
// entry.setCharAt(i, '-');
// }
// }
// }
// }
/**
 * Counts, for every word with a non-empty msd, one masked pattern per msd position after the first:
 * the first character and the character at that position are kept, every other position is '-'.
 * Each pattern's counter in {@code stats.result} is incremented.
 *
 * @param corpus   sentences to process
 * @param stats    statistics object whose {@code result} map receives the counts
 * @param taxonomy when not {@code null}, sentences whose taxonomy does not start with it are skipped
 */
static void calculateForAll(List<Sentence> corpus, Statistics stats, String taxonomy) {
    for (Sentence sentence : corpus) {
        // disregard if wrong taxonomy
        boolean wrongTaxonomy = taxonomy != null && !sentence.getTaxonomy().startsWith(taxonomy);
        if (wrongTaxonomy) {
            continue;
        }
        for (Word word : sentence.getWords()) {
            String msd = word.getMsd();
            // skip if current word is not inflected
            if (msd.length() <= 0) {
                continue;
            }
            // start from "<first char>---…" and reveal one position at a time
            StringBuilder pattern = new StringBuilder(msd.length());
            pattern.append(msd.charAt(0));
            for (int filler = 1; filler < msd.length(); filler++) {
                pattern.append('-');
            }
            for (int position = 1; position < msd.length(); position++) {
                pattern.setCharAt(position, msd.charAt(position));
                Common.updateMap(stats.result, pattern.toString());
                pattern.setCharAt(position, '-');
            }
        }
    }
}
/**
 * Counts, for every word that has an msd, one masked pattern per msd position after the first:
 * the first character and the character at that position are kept, every other position is '-'.
 * Each pattern is reported through {@code stats.updateResults}.
 *
 * @param corpus   sentences to process
 * @param stats    statistics object that receives the patterns
 * @param taxonomy currently not used by this overload
 */
public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats, String taxonomy) {
    for (Sentence s : corpus) {
        for (Word word : s.getWords()) {
            String msd = word.getMsd();
            // skip words without an msd: charAt(0) below would throw on an empty string (and a
            // null msd would throw even earlier)
            // TODO: if has defined msd and is of correct type (create a set)
            if (msd == null || msd.isEmpty()) {
                continue;
            }
            StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
            for (int i = 1; i < msd.length(); i++) {
                entry.setCharAt(i, msd.charAt(i));
                stats.updateResults(entry.toString());
                entry.setCharAt(i, '-');
            }
        }
    }
}
}

View File

@@ -0,0 +1,131 @@
package alg.inflectedJOS;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Collectors;
import data.Enums.InflectedJosTypes;
import data.StatisticsNew;
import gui.ValidationUtil;
import util.Combinations;
// adapted from http://www.geeksforgeeks.org/print-all-possible-combinations-of-r-elements-in-a-given-array-of-size-n/
public class WordFormation {
private static HashMap<String, Long> josTypeResult;
private static Object[][] tmpResults;
private static HashMap<Integer, HashSet<HashSet<Integer>>> indices;
static {
indices = new HashMap<>();
for (int i = 4; i <= 8; i++) {
indices.put(i, Combinations.generateIndices(i));
}
}
/**
 * Builds the word-formation table from the counts in {@code stat} and stores it with
 * {@code stat.setResultCustom}.
 *
 * <p>Entries whose first character is not one of {@code InflectedJosTypes.inflectedJosTypes} are removed
 * from the statistic's result map. For each remaining type, every entry is masked with every index
 * combination available for its length, the occurrences are summed per masked string, and the rows
 * (masked string, count, count / total of that type) are appended to the table.
 *
 * @param stat statistics object that provides the raw counts and receives the table
 */
public static void calculateStatistics(StatisticsNew stat) {
    // the table is kept in a static field; start clean so rows of an earlier run are not repeated
    tmpResults = null;

    Map<String, AtomicLong> result = stat.getResult();

    // 1. filter - keep only inflected types (an empty key has no type character at all)
    result.keySet().removeIf(x -> x.isEmpty() || !InflectedJosTypes.inflectedJosTypes.contains(x.charAt(0)));

    // 2. for each inflected type get all possible subcombinations
    for (Character josChar : InflectedJosTypes.inflectedJosTypes) {
        josTypeResult = new HashMap<>();

        // filter out results for a single word type
        Map<String, AtomicLong> singleTypeResults = result.entrySet().stream()
                .filter(x -> x.getKey().charAt(0) == josChar)
                .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));

        if (ValidationUtil.isEmpty(singleTypeResults)) {
            continue;
        }

        for (Map.Entry<String, AtomicLong> e : singleTypeResults.entrySet()) {
            // combinations are pre-computed for lengths 4..8 only; generate others on demand
            // instead of iterating over a null lookup result
            HashSet<HashSet<Integer>> combos = indices.computeIfAbsent(
                    e.getKey().length(), len -> Combinations.generateIndices(len));
            for (HashSet<Integer> indicesCombo : combos) {
                updateResults(mask(e.getKey(), indicesCombo), e.getValue().longValue());
            }
        }

        resultsMapToArray(singleTypeResults.values().stream().mapToLong(Number::longValue).sum());
    }

    stat.setResultCustom(tmpResults);
}
private static String mask(String word, HashSet<Integer> indicesCombo) {
StringBuilder sb = new StringBuilder();
sb.append(word.charAt(0));
for (int i = 1; i < word.length(); i++) {
sb.append(indicesCombo.contains(i) ? word.charAt(i) : ".");
}
return sb.toString();
}
private static void updateResults(String s, Long nOfOccurences) {
// if not in map add
Long r = josTypeResult.putIfAbsent(s, nOfOccurences);
// else update
if (r != null) {
josTypeResult.put(s, josTypeResult.get(s) + nOfOccurences);
}
}
private static void resultsMapToArray(Long totalValue) {
Double total = totalValue * 1.0;
Object[][] josTypeResultArray = new Object[josTypeResult.size()][3];
int i = 0;
for (Map.Entry<String, Long> e : josTypeResult.entrySet()) {
josTypeResultArray[i][0] = e.getKey();
josTypeResultArray[i][1] = e.getValue();
josTypeResultArray[i][2] = e.getValue() / total;
if (e.getValue() > total) {
String debug = "";
}
i++;
}
if (tmpResults == null) {
tmpResults = josTypeResultArray;
} else {
int firstLength = tmpResults.length;
int secondLength = josTypeResultArray.length;
Object[][] tmp = new Object[firstLength + secondLength][3];
System.arraycopy(tmpResults, 0, tmp, 0, firstLength);
System.arraycopy(josTypeResultArray, 0, tmp, firstLength, secondLength);
tmpResults = tmp;
// tmpResults = ArrayUtils.addAll(tmpResults, josTypeResultArray);
}
}
private static void printArray() {
for (int i = 0; i < tmpResults.length; i++) {
for (int j = 0; j < tmpResults[i].length; j++) {
System.out.print(tmpResults[i][j] + "\t");
}
System.out.println();
}
}
}

View File

@@ -0,0 +1,62 @@
package alg.ngram;
import java.util.List;
import java.util.concurrent.RecursiveAction;
import data.Sentence;
import data.StatisticsNew;
/**
 * Fork/join task that splits a corpus into halves until a slice is smaller than
 * {@link #ACCEPTABLE_SIZE} sentences and then hands the slice to
 * {@link Ngrams#calculateForAll(List, StatisticsNew)}.
 */
public class ForkJoin extends RecursiveAction {
    private static final long serialVersionUID = 5074814035083362355L;

    /** Slices with fewer sentences than this are processed directly instead of being split. */
    private static final int ACCEPTABLE_SIZE = 1000;

    private final List<Sentence> corpus;
    private final StatisticsNew stats;
    private final int start;
    private final int end;

    /**
     * Constructor for subproblems.
     *
     * @param corpus the whole corpus
     * @param start  index of the first sentence of the slice (inclusive)
     * @param end    index one past the last sentence of the slice (exclusive)
     * @param stats  statistics object receiving the results
     */
    private ForkJoin(List<Sentence> corpus, int start, int end, StatisticsNew stats) {
        this.corpus = corpus;
        this.start = start;
        this.end = end;
        this.stats = stats;
    }

    /**
     * Default constructor for the initial problem, covering the whole corpus.
     */
    public ForkJoin(List<Sentence> corpus, StatisticsNew stats) {
        this.corpus = corpus;
        this.start = 0;
        this.end = corpus.size();
        this.stats = stats;
    }

    /** Processes the slice {@code [start, end)} in the current thread. */
    private void computeDirectly() {
        List<Sentence> subCorpus = corpus.subList(start, end);
        Ngrams.calculateForAll(subCorpus, stats);
    }

    @Override
    protected void compute() {
        int subCorpusSize = end - start;

        if (subCorpusSize < ACCEPTABLE_SIZE) {
            computeDirectly();
        } else {
            int mid = start + subCorpusSize / 2;
            ForkJoin left = new ForkJoin(corpus, start, mid, stats);
            ForkJoin right = new ForkJoin(corpus, mid, end, stats);

            // invokeAll forks and joins both halves in the order the framework
            // recommends, instead of fork/fork followed by joining the task that
            // was forked first.
            invokeAll(left, right);
        }
    }
}

View File

@@ -0,0 +1,204 @@
package alg.ngram;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import data.CalculateFor;
import data.Sentence;
import data.StatisticsNew;
import data.Word;
import gui.ValidationUtil;
/**
 * Generates word n-gram, letter n-gram and skipgram candidates from sentences and
 * reports every accepted candidate through {@code StatisticsNew.updateResults}.
 */
public class Ngrams {
    public final static Logger logger = LogManager.getLogger(Ngrams.class);

    /**
     * Dispatches to the generator that matches the filter of {@code stats}:
     * letter n-grams when the n-gram value is 0, skipgrams when a positive skip
     * value is set, plain word n-grams otherwise.
     */
    public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats) {
        if (stats.getFilter().getNgramValue() == 0) { // letter ngram
            generateNgramLetterCandidates(corpus, stats);
        } else if (!ValidationUtil.isEmpty(stats.getFilter().getSkipValue()) && stats.getFilter().getSkipValue() > 0) {
            generateSkipgramCandidates(corpus, stats);
        } else {
            generateNgramCandidates(corpus, stats);
        }
    }

    /**
     * Counts every run of consecutive words of the configured n-gram length that
     * passes the optional MSD filter.
     */
    public static void generateNgramCandidates(List<Sentence> corpus, StatisticsNew stats) {
        for (Sentence s : corpus) {
            int ngram = stats.getFilter().getNgramValue();
            int sentenceLength = s.getWords().size();

            // skip sentences shorter than specified ngram length
            if (sentenceLength < ngram) {
                continue;
            }

            for (int i = 0; i + ngram <= sentenceLength; i++) {
                List<Word> ngramCandidate = s.getSublist(i, i + ngram);

                // if msd regex is set and this candidate doesn't pass it, skip this iteration
                if (stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd())) {
                    continue;
                }

                stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()));
            }
        }
    }

    /**
     * Checks whether an ngram candidate passes specified regex filter: the MSD of the
     * i-th word has to match the i-th pattern completely.
     *
     * @return {@code false} when the sizes differ or any word fails its pattern
     */
    private static boolean passesRegex(List<Word> ngramCandidate, ArrayList<Pattern> regex) {
        if (ngramCandidate.size() != regex.size()) {
            logger.error("ngramCandidate.size() & msd.size() mismatch"); // should not occur anyway
            return false;
        }

        for (int i = 0; i < regex.size(); i++) {
            // Use the compiled pattern directly: String.matches(pattern.pattern())
            // would recompile the regex on every call and lose the pattern's flags.
            if (!regex.get(i).matcher(ngramCandidate.get(i).getMsd()).matches()) {
                return false;
            }
        }

        return true;
    }

    /**
     * Renders a candidate as a space separated string of the word property selected
     * by {@code calculateFor}. Values of {@code calculateFor} that are not handled
     * below produce an empty string.
     */
    private static String wordToString(List<Word> ngramCandidate, CalculateFor calculateFor) {
        ArrayList<String> candidate = new ArrayList<>(ngramCandidate.size());

        switch (calculateFor) {
            case LEMMA:
                candidate.addAll(ngramCandidate
                        .stream()
                        .map(Word::getLemma)
                        .collect(Collectors.toList()));
                break;
            case WORD:
                candidate.addAll(ngramCandidate
                        .stream()
                        .map(Word::getWord)
                        .collect(Collectors.toList()));
                break;
            case MORPHOSYNTACTIC_SPECS:
            case MORPHOSYNTACTIC_PROPERTY:
                candidate.addAll(ngramCandidate
                        .stream()
                        .map(Word::getMsd)
                        .collect(Collectors.toList()));
                break;
            case WORD_TYPE:
                candidate.addAll(ngramCandidate
                        .stream()
                        .map(w -> Character.toString(w.getMsd().charAt(0)))
                        .collect(Collectors.toList()));
                break;
        }

        return StringUtils.join(candidate, " ");
    }

    /**
     * Generates letter n-gram candidates (all substrings of the configured string
     * length of every word) and updates results.
     *
     * @param corpus sentences to process
     * @param stats  statistics object providing the filter and receiving the results
     */
    private static void generateNgramLetterCandidates(List<Sentence> corpus, StatisticsNew stats) {
        for (Sentence s : corpus) {
            for (Word w : s.getWords()) {
                String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv());

                // skip this iteration if word doesn't contain a proper version (missing lemma for example)
                if (ValidationUtil.isEmpty(word)) {
                    continue;
                }

                // skip this iteration if msd regex is given but this word's msd doesn't match it
                // (matched through the compiled pattern so that its flags are honoured)
                if (stats.getFilter().hasMsd() && !stats.getFilter().getMsd().get(0).matcher(w.getMsd()).matches()) {
                    continue;
                }

                // skip this iteration if given substring length is larger than the word length
                int stringLength = stats.getFilter().getStringLength();
                if (word.length() < stringLength) {
                    continue;
                }

                for (int i = 0; i + stringLength <= word.length(); i++) {
                    // TODO: punctuation?
                    stats.updateResults(word.substring(i, i + stringLength));
                }
            }
        }
    }

    /**
     * Generates skipgram candidates and updates results.
     *
     * <p>A candidate consists of {@code ngram} words in sentence order where at most
     * {@code skip} words are left out between two neighbouring picked words. Nothing
     * is generated when the n-gram value is smaller than 2.
     *
     * <p>The previous hand-unrolled implementation bounds-checked the wrong index for
     * 4- and 5-grams and started the fifth position from the third word instead of
     * the fourth; building the candidate position by position avoids both problems.
     *
     * @param corpus sentences to process
     * @param stats  statistics object providing the filter and receiving the results
     */
    public static void generateSkipgramCandidates(List<Sentence> corpus, StatisticsNew stats) {
        int ngram = stats.getFilter().getNgramValue();
        int skip = stats.getFilter().getSkipValue();

        if (ngram < 2) {
            return;
        }

        for (Sentence s : corpus) {
            List<Word> sentence = s.getWords();

            for (int i = 0; i <= sentence.size() - ngram; i++) {
                ArrayList<Word> candidate = new ArrayList<>(ngram);
                candidate.add(sentence.get(i));
                extendSkipgram(sentence, candidate, i, ngram, skip, stats);
            }
        }
    }

    /**
     * Appends every admissible next word to {@code prefix} and recurses until the
     * candidate holds {@code ngram} words, at which point it is validated and counted.
     *
     * @param lastIndex sentence index of the word most recently added to {@code prefix}
     */
    private static void extendSkipgram(List<Word> sentence, ArrayList<Word> prefix, int lastIndex,
                                       int ngram, int skip, StatisticsNew stats) {
        if (prefix.size() == ngram) {
            validateAndCountSkipgramCandidate(new ArrayList<>(prefix), stats);
            return;
        }

        // the next word may be at most (skip + 1) positions further and must exist
        int lastAllowed = Math.min(lastIndex + skip + 1, sentence.size() - 1);

        for (int next = lastIndex + 1; next <= lastAllowed; next++) {
            prefix.add(sentence.get(next));
            extendSkipgram(sentence, prefix, next, ngram, skip, stats);
            prefix.remove(prefix.size() - 1);
        }
    }

    /** Counts the candidate if no regex is set or if it is set and the candidate passes it. */
    private static void validateAndCountSkipgramCandidate(ArrayList<Word> skipgramCandidate, StatisticsNew stats) {
        if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd())) {
            stats.updateResults(wordToString(skipgramCandidate, stats.getFilter().getCalculateFor()));
        }
    }
}

View File

@@ -0,0 +1,62 @@
package alg.word;
import java.util.List;
import java.util.concurrent.RecursiveAction;
import data.Sentence;
import data.StatisticsNew;
/**
 * Fork/join task that splits a corpus into halves until a slice is smaller than
 * {@link #ACCEPTABLE_SIZE} sentences and then hands the slice to
 * {@link WordLevel#calculateForAll(List, StatisticsNew)}.
 */
public class ForkJoin extends RecursiveAction {
    private static final long serialVersionUID = 7711587510996456040L;

    /** Slices with fewer sentences than this are processed directly instead of being split. */
    private static final int ACCEPTABLE_SIZE = 1000;

    private final List<Sentence> corpus;
    private final StatisticsNew stats;
    private final int start;
    private final int end;

    /**
     * Constructor for subproblems.
     *
     * @param corpus the whole corpus
     * @param start  index of the first sentence of the slice (inclusive)
     * @param end    index one past the last sentence of the slice (exclusive)
     * @param stats  statistics object receiving the results
     */
    private ForkJoin(List<Sentence> corpus, int start, int end, StatisticsNew stats) {
        this.corpus = corpus;
        this.start = start;
        this.end = end;
        this.stats = stats;
    }

    /**
     * Default constructor for the initial problem, covering the whole corpus.
     */
    public ForkJoin(List<Sentence> corpus, StatisticsNew stats) {
        this.corpus = corpus;
        this.start = 0;
        this.end = corpus.size();
        this.stats = stats;
    }

    /** Processes the slice {@code [start, end)} in the current thread. */
    private void computeDirectly() {
        List<Sentence> subCorpus = corpus.subList(start, end);
        WordLevel.calculateForAll(subCorpus, stats);
    }

    @Override
    protected void compute() {
        int subCorpusSize = end - start;

        if (subCorpusSize < ACCEPTABLE_SIZE) {
            computeDirectly();
        } else {
            int mid = start + subCorpusSize / 2;
            ForkJoin left = new ForkJoin(corpus, start, mid, stats);
            ForkJoin right = new ForkJoin(corpus, mid, end, stats);

            // invokeAll forks and joins both halves in the order the framework
            // recommends, instead of fork/fork followed by joining the task that
            // was forked first.
            invokeAll(left, right);
        }
    }
}

View File

@@ -0,0 +1,167 @@
package alg.word;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import alg.Common;
import data.CalculateFor;
import data.Sentence;
import data.Statistics;
import data.Word;
/**
 * Counts words, lemmas or their fixed-length substrings, optionally restricted to a
 * taxonomy and/or a JOS word type, and stores the counts in {@code stats.result}.
 */
class WordCount {
    /**
     * Maps words to the strings selected by {@code stats.getCf()}: lemmas for
     * {@code LEMMA}, word forms for {@code WORD}, nothing for any other value.
     *
     * @param cvv when {@code true} the CVV variants of lemma/word are used
     */
    private static List<String> extractTokens(List<Word> words, Statistics stats, boolean cvv) {
        List<String> tokens = new ArrayList<>(words.size());

        if (stats.getCf() == CalculateFor.LEMMA) {
            for (Word w : words) {
                tokens.add(cvv ? w.getCVVLemma() : w.getLemma());
            }
        } else if (stats.getCf() == CalculateFor.WORD) {
            for (Word w : words) {
                tokens.add(cvv ? w.getCVVWord() : w.getWord());
            }
        }

        return tokens;
    }

    /** Increments the counter of every token in {@code stats.result}. */
    private static void countTokens(List<String> tokens, Statistics stats) {
        for (String token : tokens) {
            Common.updateMap(stats.result, token);
        }
    }

    /**
     * Returns the words whose MSD starts with the JOS word type configured in
     * {@code stats}. Words without an MSD (null or empty) are left out.
     */
    private static List<Word> filterByJosType(List<Word> words, Statistics stats) {
        List<Word> filteredWords = new ArrayList<>();

        for (Word word : words) {
            String msd = word.getMsd();
            if (msd != null && !msd.isEmpty() && msd.charAt(0) == stats.getDistributionJosWordType()) {
                filteredWords.add(word);
            }
        }

        return filteredWords;
    }

    /** Counts every word (or lemma) of every sentence. */
    private static void calculateNoFilter(List<Sentence> corpus, Statistics stats) {
        for (Sentence s : corpus) {
            countTokens(extractTokens(s.getWords(), stats, false), stats);
        }
    }

    /**
     * Counts all substrings of the configured length of the CVV variant of every
     * word (or lemma).
     */
    private static void calculateVCC(List<Sentence> corpus, Statistics stats) {
        for (Sentence s : corpus) {
            int substringLength = stats.getSubstringLength();

            for (String word : extractTokens(s.getWords(), stats, true)) {
                // A word that is exactly as long as the requested substring still
                // contains one such substring, so it must not be skipped (this is
                // also how Ngrams.generateNgramLetterCandidates treats it).
                if (word == null || word.length() < substringLength) {
                    continue;
                }

                for (int i = 0; i <= word.length() - substringLength; i++) {
                    Common.updateMap(stats.result, word.substring(i, i + substringLength));
                }
            }
        }
    }

    /** Counts the words (or lemmas) of the configured JOS word type. */
    private static void calculateForJosType(List<Sentence> corpus, Statistics stats) {
        for (Sentence s : corpus) {
            countTokens(extractTokens(filterByJosType(s.getWords(), stats), stats, false), stats);
        }
    }

    /**
     * Counts the words (or lemmas) of the configured JOS word type in sentences whose
     * taxonomy equals the configured one (ignoring case).
     */
    private static void calculateForTaxonomyAndJosType(List<Sentence> corpus, Statistics stats) {
        for (Sentence s : corpus) {
            if (s.getTaxonomy().equalsIgnoreCase(stats.getDistributionTaxonomy())) {
                countTokens(extractTokens(filterByJosType(s.getWords(), stats), stats, false), stats);
            }
        }
    }

    /**
     * Counts every word (or lemma) in sentences whose taxonomy equals the configured
     * one (ignoring case).
     */
    private static void calculateForTaxonomy(List<Sentence> corpus, Statistics stats) {
        for (Sentence s : corpus) {
            if (s.getTaxonomy().equalsIgnoreCase(stats.getDistributionTaxonomy())) {
                countTokens(extractTokens(s.getWords(), stats, false), stats);
            }
        }
    }

    /**
     * Entry point: picks the counting variant that matches the filters set in
     * {@code stats}. The VCC variant is only used when neither a taxonomy nor a JOS
     * word type is set.
     */
    static void calculateForAll(List<Sentence> corpus, Statistics stats) {
        boolean taxonomyIsSet = stats.isTaxonomySet();
        boolean josTypeIsSet = stats.isJOSTypeSet();

        // one specialised loop per filter combination, so the filter checks are
        // not repeated for every word
        if (taxonomyIsSet && josTypeIsSet) {
            calculateForTaxonomyAndJosType(corpus, stats);
        } else if (taxonomyIsSet) {
            calculateForTaxonomy(corpus, stats);
        } else if (josTypeIsSet) {
            calculateForJosType(corpus, stats);
        } else if (stats.isVcc()) {
            calculateVCC(corpus, stats);
        } else {
            calculateNoFilter(corpus, stats);
        }
    }
}

View File

@@ -0,0 +1,112 @@
package alg.word;
import static data.Enums.WordLevelDefaultValues.*;
import java.util.HashSet;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import data.Enums.WordLevelDefaultValues;
import data.Enums.WordLevelType;
import data.Sentence;
import data.StatisticsNew;
import data.Word;
/**
 * Finds known prefixes and suffixes in words. The affix lists come from
 * {@link WordLevelDefaultValues}; matches are reported through
 * {@code StatisticsNew.updateResultsNested}.
 */
@SuppressWarnings("Duplicates")
public class WordLevel {
    private static HashSet<String> suffixes;
    private static int minSuffixLength;
    private static int maxSuffixLength;

    private static HashSet<String> prefixes;
    private static int minPrefixLength;
    private static int maxPrefixLength;

    static {
        suffixes = WordLevelDefaultValues.getSuffixes();
        calculateSuffixesLengths();

        prefixes = WordLevelDefaultValues.getPrefixes();
        calculatePrefixesLengths();
    }

    /** Looks for a suffix and a prefix in every word of every sentence. */
    public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats) {
        for (Sentence s : corpus) {
            for (Word word : s.getWords()) {
                calculateForSuffixes(word.getWord(), stats);
                calculateForPrefixes(word.getWord(), stats);
            }
        }
    }

    /**
     * Reports the longest known prefix of {@code word} that leaves at least
     * {@code MIN_N_OF_CHARACTERS_LEFT_PREFIX} characters of the word untouched.
     * Nothing is reported when no known prefix fits.
     */
    private static void calculateForPrefixes(String word, StatisticsNew stats) {
        // longest prefix that still leaves enough of the word
        int longest = Math.min(maxPrefixLength, word.length() - MIN_N_OF_CHARACTERS_LEFT_PREFIX);

        // Walk from the longest candidate down to the shortest; the loop has to
        // count down, otherwise shorter prefixes are never tried.
        for (int tmpPrefixLength = longest; tmpPrefixLength >= minPrefixLength && tmpPrefixLength > 0; tmpPrefixLength--) {
            String extractedPrefix = StringUtils.left(word, tmpPrefixLength);

            if (prefixes.contains(extractedPrefix)) {
                // save prefix and full word
                stats.updateResultsNested(WordLevelType.PREFIX, extractedPrefix, word);
                return;
            }
        }
    }

    /**
     * Reports the longest known suffix of {@code word} that leaves at least
     * {@code MIN_N_OF_CHARACTERS_LEFT_SUFFIX} characters of the word untouched.
     * Nothing is reported when no known suffix fits.
     */
    public static void calculateForSuffixes(String word, StatisticsNew stats) {
        // The word minus the cut-off suffix must keep at least the preset number of
        // characters, so candidates longer than that are not worth trying.
        int longest = Math.min(maxSuffixLength, word.length() - MIN_N_OF_CHARACTERS_LEFT_SUFFIX);

        // Walk from the longest candidate down to the shortest; the loop has to
        // count down, otherwise shorter suffixes are never tried.
        for (int tmpSuffixLength = longest; tmpSuffixLength >= minSuffixLength && tmpSuffixLength > 0; tmpSuffixLength--) {
            String extractedSuffix = StringUtils.right(word, tmpSuffixLength);

            if (suffixes.contains(extractedSuffix)) {
                // save suffix and full word
                stats.updateResultsNested(WordLevelType.SUFFIX, extractedSuffix, word);
                return;
            }
        }
    }

    /**
     * Finds the shortest and longest suffix for quicker calculations. Both lengths
     * are -1 when there are no suffixes.
     */
    public static void calculateSuffixesLengths() {
        minSuffixLength = -1;
        maxSuffixLength = -1;

        for (String suffix : suffixes) {
            int length = suffix.length();

            if (maxSuffixLength < 0 || length > maxSuffixLength) {
                maxSuffixLength = length;
            }
            if (minSuffixLength < 0 || length < minSuffixLength) {
                minSuffixLength = length;
            }
        }
    }

    /**
     * Finds the shortest and longest prefix for quicker calculations. Both lengths
     * are -1 when there are no prefixes.
     */
    public static void calculatePrefixesLengths() {
        minPrefixLength = -1;
        maxPrefixLength = -1;

        for (String prefix : prefixes) {
            int length = prefix.length();

            if (maxPrefixLength < 0 || length > maxPrefixLength) {
                maxPrefixLength = length;
            }
            if (minPrefixLength < 0 || length < minPrefixLength) {
                minPrefixLength = length;
            }
        }
    }
}

View File

@@ -0,0 +1,17 @@
package data;
/**
 * Levels of analysis; each constant carries the label returned by {@link #toString()}.
 */
public enum AnalysisLevel {
    STRING_LEVEL("Besedni nizi"),
    WORD_LEVEL("Nivo besed in delov besed"),
    WORD_FORMATION("Besedotvorni procesi");

    /** Label of the level. */
    private final String label;

    AnalysisLevel(String label) {
        this.label = label;
    }

    /** @return the label of this level */
    @Override
    public String toString() {
        return label;
    }
}

View File

@@ -0,0 +1,43 @@
package data;
/**
 * Properties a calculation can be run for; each constant carries the label returned
 * by {@link #toString()}.
 */
public enum CalculateFor {
    WORD("različnica"),
    LEMMA("lema"),
    MORPHOSYNTACTIC_SPECS("oblikoskladenjska oznaka"),
    MORPHOSYNTACTIC_PROPERTY("oblikoskladenjska lastnost"),
    WORD_TYPE("besedna vrsta"),
    DIST_WORDS("različnica"),
    DIST_LEMMAS("lema");

    /** Label of the constant. */
    private final String label;

    CalculateFor(String label) {
        this.label = label;
    }

    /** @return the label of this constant */
    @Override
    public String toString() {
        return label;
    }

    /**
     * Looks a constant up by its label. Only {@code WORD}, {@code LEMMA},
     * {@code MORPHOSYNTACTIC_SPECS}, {@code MORPHOSYNTACTIC_PROPERTY} and
     * {@code WORD_TYPE} are considered, in that order.
     *
     * @param cf label to look up, may be {@code null}
     * @return the matching constant, or {@code null} when {@code cf} is {@code null}
     *         or matches none of the considered constants
     */
    public static CalculateFor factory(String cf) {
        if (cf == null) {
            return null;
        }

        CalculateFor[] candidates = {WORD, LEMMA, MORPHOSYNTACTIC_SPECS, MORPHOSYNTACTIC_PROPERTY, WORD_TYPE};
        for (CalculateFor candidate : candidates) {
            if (candidate.toString().equals(cf)) {
                return candidate;
            }
        }

        return null;
    }
}

View File

@@ -0,0 +1,163 @@
package data;
import static gui.Messages.*;
import java.io.File;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import data.Enums.solar.SolarFilters;
import gui.ValidationUtil;
import javafx.collections.ObservableList;
public class Corpus {
public final static Logger logger = LogManager.getLogger(Corpus.class);
private CorpusType corpusType;
private File chosenResultsLocation;
private File chosenCorpusLocation;
private Collection<File> detectedCorpusFiles;
boolean headerRead;
private ObservableList<String> taxonomy; // if gigafida or gos
private HashMap<String, ObservableList<String>> solarFilters; // if solar
private HashMap<String, HashSet<String>> solarFiltersForXML; // if solar - used while parsing xml
private boolean gosOrthMode;
boolean hasMsdData;
private ArrayList<String> validationErrors;
public Corpus() {
validationErrors = new ArrayList<>();
}
public CorpusType getCorpusType() {
return corpusType;
}
public void setCorpusType(CorpusType corpusType) {
this.corpusType = corpusType;
logger.info("Corpus.set: ", corpusType);
}
public File getChosenResultsLocation() {
return chosenResultsLocation;
}
public void setChosenResultsLocation(File chosenResultsLocation) {
this.chosenResultsLocation = chosenResultsLocation;
logger.info("Corpus.set: ", chosenResultsLocation);
}
public File getChosenCorpusLocation() {
return chosenCorpusLocation;
}
public void setChosenCorpusLocation(File chosenCorpusLocation) {
this.chosenCorpusLocation = chosenCorpusLocation;
logger.info("Corpus.set: ", chosenCorpusLocation);
}
public Collection<File> getDetectedCorpusFiles() {
return detectedCorpusFiles;
}
public void setDetectedCorpusFiles(Collection<File> detectedCorpusFiles) {
this.detectedCorpusFiles = detectedCorpusFiles;
logger.info("Corpus.set: ", detectedCorpusFiles);
}
public boolean isHeaderRead() {
return headerRead;
}
public void setHeaderRead(boolean headerRead) {
this.headerRead = headerRead;
}
public ObservableList<String> getTaxonomy() {
return taxonomy;
}
public void setTaxonomy(ObservableList<String> taxonomy) {
this.taxonomy = taxonomy;
logger.info("Corpus.set: ", taxonomy);
}
public HashMap<String, ObservableList<String>> getSolarFilters() {
return solarFilters;
}
public void setSolarFilters(HashMap<String, ObservableList<String>> solarFilters) {
this.solarFilters = solarFilters;
logger.info("Corpus.set: ", solarFilters);
}
public HashMap<String, HashSet<String>> getSolarFiltersForXML() {
return solarFiltersForXML;
}
public void setSolarFiltersForXML(HashMap<String, HashSet<String>> solarFiltersForXML) {
this.solarFiltersForXML = solarFiltersForXML;
logger.info("Corpus.set: ", solarFiltersForXML);
}
public boolean isGosOrthMode() {
return gosOrthMode;
}
public void setGosOrthMode(boolean gosOrthMode) {
this.gosOrthMode = gosOrthMode;
logger.info("Corpus.set: ", gosOrthMode);
}
public ArrayList<String> getValidationErrors() {
return validationErrors;
}
public String getValidationErrorsToString() {
return StringUtils.join(validationErrors, "\n - ");
}
public void setValidationErrors(ArrayList<String> validationErrors) {
this.validationErrors = validationErrors;
}
public boolean validate() {
if (corpusType == null) {
validationErrors.add(LABEL_RESULTS_CORPUS_TYPE_NOT_SET);
}
if (chosenCorpusLocation == null) {
validationErrors.add(LABEL_CORPUS_LOCATION_NOT_SET);
}
if (chosenResultsLocation == null) {
validationErrors.add(LABEL_RESULTS_LOCATION_NOT_SET);
}
if (!headerRead && corpusType != null) {
// if user didn't opt into reading the headers, set default taxonomy or solar filters
if (Tax.getCorpusTypesWithTaxonomy().contains(corpusType)) {
taxonomy = Tax.getTaxonomyForComboBox(corpusType);
} else if (corpusType == CorpusType.SOLAR && solarFilters == null) {
setSolarFilters(SolarFilters.getFiltersForComboBoxes());
}
}
if (headerRead && ValidationUtil.isEmpty(taxonomy)) {
// mustn't happen, intercept at gui level
}
if (!ValidationUtil.isEmpty(validationErrors)) {
logger.error("Corpus validation error: ", StringUtils.join(validationErrors, "\n - "));
return false;
} else {
return true;
}
}
}

View File

@@ -0,0 +1,25 @@
package data;
/**
 * Supported corpus types; each constant carries a name returned by
 * {@link #toString()} and a lower-case variant of it.
 */
public enum CorpusType {
    GIGAFIDA("Gigafida", "gigafida"),
    CCKRES("ccKres ", "cckres"),
    SOLAR("Šolar", "šolar"),
    GOS("GOS", "gos");

    /** Name returned by {@link #toString()}. */
    private final String displayName;
    /** Lower-case variant of the name. */
    private final String lowerCaseName;

    CorpusType(String displayName, String lowerCaseName) {
        this.displayName = displayName;
        this.lowerCaseName = lowerCaseName;
    }

    /** @return the name of this corpus type */
    @Override
    public String toString() {
        return displayName;
    }

    /** @return the lower-case name of this corpus type */
    public String getNameLowerCase() {
        return lowerCaseName;
    }
}

View File

@@ -0,0 +1,12 @@
package data.Enums;
import java.util.Arrays;
import java.util.HashSet;
/**
 * Holder of the set of JOS word type codes that are treated as inflected.
 */
public class InflectedJosTypes {
    /** Codes of the inflected word types: 'S', 'G' and 'P'. */
    public static final HashSet<Character> inflectedJosTypes = new HashSet<>(Arrays.asList('S', 'G', 'P'));
}

View File

@@ -0,0 +1,68 @@
package data.Enums;
import java.util.HashMap;
/**
 * Word types of the morphosyntactic descriptions: Slovenian and English name and
 * code of each type plus its number of attributes.
 */
public enum Msd {
    NOUN("samostalnik", 'S', "Noun", 'N', 5),
    VERB("glagol", 'G', "Verb", 'V', 7),
    ADJECTIVE("pridevnik", 'P', "Adjective", 'A', 6),
    ADVERB("prislov", 'R', "Adverb", 'R', 2),
    PRONOUN("zaimek", 'Z', "Pronoun", 'P', 8),
    NUMERAL("števnik", 'K', "Numeral", 'M', 6),
    PREPOSITION("predlog", 'D', "Preposition", 'S', 1),
    CONJUNCTION("veznik", 'V', "Conjunction", 'C', 1),
    PARTICLE("členek", 'L', "Particle", 'Q', 0),
    INTERJECTION("medmet", 'M', "Interjection", 'I', 0),
    ABBREVIATION("okrajšava", 'O', "Abbreviation", 'Y', 0),
    RESIDUAL("neuvrščeno", 'N', "Residual", 'X', 1);

    /** Number of attributes of each type, keyed by the Slovenian code. */
    private static HashMap<Character, Integer> siCodeNOfAttributes;

    static {
        siCodeNOfAttributes = new HashMap<>();
        for (Msd type : values()) {
            siCodeNOfAttributes.put(type.siCode, type.nOfAttributes);
        }
    }

    private final String siName;
    private final Character siCode;
    private final String enName;
    private final Character enCode;
    private final Integer nOfAttributes;

    Msd(String slovenianName, Character slovenianCode, String englishName, Character englishCode, int attributeCount) {
        siName = slovenianName;
        siCode = slovenianCode;
        enName = englishName;
        enCode = englishCode;
        nOfAttributes = attributeCount;
    }

    /** @return the Slovenian name of the type */
    public String getSiName() {
        return siName;
    }

    /** @return the Slovenian one-letter code of the type */
    public Character getSiCode() {
        return siCode;
    }

    /** @return the English name of the type */
    public String getEnName() {
        return enName;
    }

    /** @return the English one-letter code of the type */
    public Character getEnCode() {
        return enCode;
    }

    /**
     * Returns the length of a complete MSD of the given type, i.e. the number of
     * attributes of the type plus one for the type code itself.
     *
     * @param msd an MSD whose first character is the Slovenian code of a type
     * @return number of attributes of that type + 1
     */
    public static int getMsdLengthForType(String msd) {
        char typeCode = msd.charAt(0);
        Integer attributes = siCodeNOfAttributes.get(typeCode);
        return attributes + 1;
    }
}

View File

@@ -0,0 +1,55 @@
package data.Enums;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
/**
 * Default prefix and suffix lists for the word-level analysis, loaded once from
 * classpath resources when the class is initialised.
 */
public class WordLevelDefaultValues {
    public final static Logger logger = LogManager.getLogger(WordLevelDefaultValues.class);

    private static HashSet<String> suffixes;
    private static final String SUFFIXES_FILE = "/Lists/suffixes.txt";
    public static final int MIN_N_OF_CHARACTERS_LEFT_SUFFIX = 2;

    private static HashSet<String> prefixes;
    private static final String PREFIXES_FILE = "/Lists/prefixes.txt";
    public static final int MIN_N_OF_CHARACTERS_LEFT_PREFIX = 2;

    static {
        suffixes = readFromFile(SUFFIXES_FILE);
        prefixes = readFromFile(PREFIXES_FILE);
    }

    /**
     * Reads the lines of a classpath resource into a set.
     *
     * @param fileName absolute resource name, e.g. {@code "/Lists/suffixes.txt"}
     * @return the distinct lines of the resource; an empty set when the resource is
     *         missing or closing it fails
     */
    private static HashSet<String> readFromFile(String fileName) {
        HashSet<String> dictionary = new HashSet<>();

        // Resolve the resource against this class. Calling getClass() on the class
        // literal would resolve it against java.lang.Class instead, which does not
        // see application resources on module-aware runtimes.
        try (InputStream is = WordLevelDefaultValues.class.getResourceAsStream(fileName)) {
            if (is == null) {
                logger.warn("Init dictionary {} was not found", fileName);
                return dictionary;
            }

            // NOTE(review): the resource is decoded with the platform default charset -
            // confirm the encoding of the list files and pass it explicitly.
            BufferedReader reader = new BufferedReader(new InputStreamReader(is));
            // collect straight into a HashSet instead of casting whatever
            // Collectors.toSet() happens to return
            dictionary = reader.lines().collect(Collectors.toCollection(HashSet::new));
        } catch (IOException e) {
            logger.error("Problem reading init dictionary", e);
        }

        return dictionary;
    }

    /** @return the suffixes read from {@code /Lists/suffixes.txt} */
    public static HashSet<String> getSuffixes() {
        return suffixes;
    }

    /** @return the prefixes read from {@code /Lists/prefixes.txt} */
    public static HashSet<String> getPrefixes() {
        return prefixes;
    }
}

View File

@@ -0,0 +1,16 @@
package data.Enums;
/**
 * Kinds of word parts handled by the word-level analysis; each constant carries a
 * name available through {@link #getName()}.
 */
public enum WordLevelType {
    SUFFIX("pripona"),
    PREFIX("predpona");

    /** Name of the word part kind. */
    private final String label;

    WordLevelType(String label) {
        this.label = label;
    }

    /** @return the name of this word part kind */
    public String getName() {
        return label;
    }
}

View File

@@ -0,0 +1,57 @@
package data.Enums.solar;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import javafx.collections.FXCollections;
import javafx.collections.ObservableList;
/**
 * Filter names of the Solar corpus together with the default values offered for
 * each of them.
 */
public class SolarFilters {
    /** Default values of every filter, keyed by filter name. */
    private static HashMap<String, ObservableList<String>> SOLAR_FILTERS;

    public static final String SOLA = "sola";
    public static final String PREDMET = "predmet";
    public static final String RAZRED = "razred";
    public static final String REGIJA = "regija";
    public static final String TIP = "tip";
    public static final String LETO = "leto";

    static {
        SOLAR_FILTERS = new HashMap<>();

        SOLAR_FILTERS.put(REGIJA, FXCollections.observableArrayList("Celje", "Gorica", "Koper", "Kranj", "Krško", "Ljubljana", "Maribor", "Murska Sobota", "Novo mesto", "Postojna", "Slovenj Gradec"));
        SOLAR_FILTERS.put(PREDMET, FXCollections.observableArrayList("državljanska vzgoja in etika", "ekonomija", "filozofija", "geografija", "kemija", "podjetništvo", "psihologija", "slovenščina", "sociologija", "umetnostna vzgoja", "zgodovina"));
        SOLAR_FILTERS.put(RAZRED, FXCollections.observableArrayList("6. razred", "7. razred", "8. razred", "9. razred", "1. letnik", "2. letnik", "3. letnik", "4. letnik", "5. letnik", "maturitetni tečaj"));
        SOLAR_FILTERS.put(LETO, FXCollections.observableArrayList("2007", "2008", "2009", "2009/2010", "2010"));
        SOLAR_FILTERS.put(SOLA, FXCollections.observableArrayList("gimnazija", "osnovna šola", "poklicna šola", "strokovna šola"));
        SOLAR_FILTERS.put(TIP, FXCollections.observableArrayList("esej/spis", "pisni izdelek (učna ura)", "test (daljše besedilo)", "test (odgovori na vprašanja)"));
    }

    public static final ObservableList<String> N_GRAM_COMPUTE_FOR_FULL = FXCollections.observableArrayList("različnica", "lema", "oblikoskladenjska oznaka", "oblikoskladenjska lastnost", "besedna vrsta");
    public static final ObservableList<String> N_GRAM_COMPUTE_FOR_LIMITED = FXCollections.observableArrayList("različnica", "lema");

    /**
     * Returns filters with all possible values. The returned map is the shared
     * default map itself, not a copy.
     */
    public static HashMap<String, ObservableList<String>> getFiltersForComboBoxes() {
        return SOLAR_FILTERS;
    }

    /**
     * Returns, for every known filter, the values found in the corpus in sorted
     * order.
     *
     * @param foundFilters values found in the corpus, keyed by filter name
     * @return a new map holding one list per known filter; the list is empty for a
     *         filter that has no entry in {@code foundFilters}
     */
    public static HashMap<String, ObservableList<String>> getFiltersForComboBoxes(HashMap<String, HashSet<String>> foundFilters) {
        HashMap<String, ObservableList<String>> filtersForComboBoxes = new HashMap<>();

        for (String filterKey : SOLAR_FILTERS.keySet()) {
            ObservableList<String> values;

            if (foundFilters.containsKey(filterKey)) {
                values = FXCollections.observableArrayList(foundFilters.get(filterKey)).sorted();
            } else {
                // if, for some reason, a specific filter wasn't in the corpus, use a blank list for it
                values = FXCollections.observableArrayList();
            }

            filtersForComboBoxes.put(filterKey, values);
        }

        return filtersForComboBoxes;
    }
}

View File

@@ -0,0 +1,144 @@
package data;
import static data.Filter.filterName.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.regex.Pattern;
import gui.ValidationUtil;
/**
 * Settings of a single analysis run, kept in a map keyed by {@link filterName}.
 * Getters return {@code null} for settings that were never set unless stated
 * otherwise.
 */
@SuppressWarnings("unchecked")
public class Filter {
    private HashMap<filterName, Object> filter;

    /** Keys of the settings a filter can hold. */
    public enum filterName {
        ANALYSIS_LEVEL,
        CALCULATE_FOR,
        NGRAM_VALUE,
        SKIP_VALUE,
        IS_CVV,
        STRING_LENGTH,
        TAXONOMY,
        MSD,
        HAS_MSD,
        SOLAR_FILTERS
    }

    /** Creates a filter without any settings. */
    public Filter() {
        filter = new HashMap<>();
    }

    /** Creates a filter with the analysis level and the calculate-for setting. */
    public Filter(AnalysisLevel al, CalculateFor cf) {
        this();
        filter.put(ANALYSIS_LEVEL, al);
        filter.put(CALCULATE_FOR, cf);
    }

    public void setAl(AnalysisLevel al) {
        filter.put(ANALYSIS_LEVEL, al);
    }

    public AnalysisLevel getAl() {
        Object value = filter.get(ANALYSIS_LEVEL);
        return (AnalysisLevel) value;
    }

    public void setCalculateFor(CalculateFor cf) {
        filter.put(CALCULATE_FOR, cf);
    }

    public CalculateFor getCalculateFor() {
        Object value = filter.get(CALCULATE_FOR);
        return (CalculateFor) value;
    }

    public void setNgramValue(Integer ngramValue) {
        filter.put(NGRAM_VALUE, ngramValue);
    }

    public Integer getNgramValue() {
        Object value = filter.get(NGRAM_VALUE);
        return (Integer) value;
    }

    public void setSkipValue(Integer skipValue) {
        filter.put(SKIP_VALUE, skipValue);
    }

    public Integer getSkipValue() {
        Object value = filter.get(SKIP_VALUE);
        return (Integer) value;
    }

    public void setIsCvv(boolean isCvv) {
        filter.put(IS_CVV, isCvv);
    }

    /** @return {@code true} only when the CVV flag was set to {@code true} */
    public boolean isCvv() {
        return Boolean.TRUE.equals(filter.get(IS_CVV));
    }

    public void setStringLength(int stringLength) {
        filter.put(STRING_LENGTH, stringLength);
    }

    public Integer getStringLength() {
        Object value = filter.get(STRING_LENGTH);
        return (Integer) value;
    }

    public void setTaxonomy(ArrayList<String> taxonomy) {
        filter.put(TAXONOMY, taxonomy);
    }

    /** @return the taxonomy, or a new empty list when none (or {@code null}) was set */
    public ArrayList<String> getTaxonomy() {
        Object value = filter.get(TAXONOMY);
        if (value == null) {
            return new ArrayList<>();
        }
        return (ArrayList<String>) value;
    }

    /**
     * Stores the MSD patterns and updates the has-MSD flag: the flag is set exactly
     * when {@code msd} is not empty according to {@code ValidationUtil.isEmpty}.
     */
    public void setMsd(ArrayList<Pattern> msd) {
        filter.put(MSD, msd);
        setHasMsd(!ValidationUtil.isEmpty(msd));
    }

    public ArrayList<Pattern> getMsd() {
        Object value = filter.get(MSD);
        return (ArrayList<Pattern>) value;
    }

    public void setHasMsd(boolean hasMsd) {
        filter.put(HAS_MSD, hasMsd);
    }

    /** @return {@code true} only when the has-MSD flag was set to {@code true} */
    public boolean hasMsd() {
        return Boolean.TRUE.equals(filter.get(HAS_MSD));
    }

    /** Lists all stored settings, one per line, as {@code KEY: value}. */
    @Override
    public String toString() {
        String newLine = "\n\t- ";
        StringBuilder sb = new StringBuilder();

        sb.append(newLine).append("Filter:");
        for (Map.Entry<filterName, Object> setting : filter.entrySet()) {
            sb.append(newLine);
            sb.append(setting.getKey());
            sb.append(": ");
            sb.append(String.valueOf(setting.getValue()));
        }

        return sb.toString();
    }

    public void setSolarFilters(HashMap<String, HashSet<String>> filters) {
        filter.put(SOLAR_FILTERS, filters);
    }

    public HashMap<String, HashSet<String>> getSolarFilters() {
        Object value = filter.get(SOLAR_FILTERS);
        return (HashMap<String, HashSet<String>>) value;
    }
}

View File

@@ -0,0 +1,71 @@
package data;
/**
 * Word types, each carrying a display name and a single-character code.
 */
public enum GigafidaJosWordType {
    SAMOSTALNIK("samostalnik", 'S'),
    GLAGOL("glagol", 'G'),
    PRIDEVNIK("pridevnik", 'P'),
    PRISLOV("prislov", 'R'),
    ZAIMEK("zaimek", 'Z'),
    STEVNIK("stevnik", 'K'),
    PREDLOG("predlog", 'D'),
    VEZNIK("veznik", 'V'),
    CLENEK("clenek", 'L'),
    MEDMET("medmet", 'M'),
    OKRAJSAVA("okrajsava", 'O');

    private final String name;
    private final char wordType;

    GigafidaJosWordType(String name, char wordType) {
        this.name = name;
        this.wordType = wordType;
    }

    /** @return the display name given to the constant */
    public String toString() {
        return name;
    }

    /** @return the single-character code given to the constant */
    public char getWordType() {
        return wordType;
    }

    /**
     * Looks a constant up by its display name (exact, case-sensitive match).
     *
     * @param wType display name to look for; may be {@code null}
     * @return the matching constant, or {@code null} when {@code wType} is
     *         {@code null} or matches no constant
     */
    public static GigafidaJosWordType factory(String wType) {
        if (wType == null) {
            return null;
        }
        // values() is in declaration order, the same order the names were checked in before
        for (GigafidaJosWordType candidate : values()) {
            if (candidate.name.equals(wType)) {
                return candidate;
            }
        }
        return null;
    }
}

View File

@@ -0,0 +1,76 @@
package data;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.stream.Collectors;
import javafx.collections.FXCollections;
import javafx.collections.ObservableList;
/**
 * Taxonomy entries, each pairing a display name with a taxonomy code string.
 */
public enum GigafidaTaxonomy {
    TISK("tisk", "T"),
    KNJIZNO("knjižno", "T.K"),
    LEPOSLOVNO("leposlovno", "T.K.L"),
    STROKOVNO("strokovno", "T.K.S"),
    PERIODICNO("periodično", "T.P"),
    CASOPIS("časopis", "T.P.C"),
    REVIJA("revija", "T.P.R"),
    INTERNET("internet", "I");

    /** Display names of all constants, in declaration order; built once at class load. */
    private static final ObservableList<String> FOR_COMBO_BOX;

    static {
        ArrayList<String> displayNames = Arrays.stream(values())
            .map(entry -> entry.name)
            .collect(Collectors.toCollection(ArrayList::new));
        FOR_COMBO_BOX = FXCollections.observableArrayList(displayNames);
    }

    private final String name;
    private final String taxonomy;

    GigafidaTaxonomy(String name, String taxonomy) {
        this.name = name;
        this.taxonomy = taxonomy;
    }

    /** @return the display name given to the constant */
    public String toString() {
        return name;
    }

    /** @return the taxonomy code string given to the constant */
    public String getTaxonomnyString() {
        return taxonomy;
    }

    /**
     * Looks a constant up by its display name (exact, case-sensitive match).
     *
     * @param tax display name to look for; may be {@code null}
     * @return the matching constant, or {@code null} when {@code tax} is
     *         {@code null} or matches no constant
     */
    public static GigafidaTaxonomy factory(String tax) {
        if (tax == null) {
            return null;
        }
        // values() is in declaration order, the same order the names were checked in before
        for (GigafidaTaxonomy candidate : values()) {
            if (candidate.name.equals(tax)) {
                return candidate;
            }
        }
        return null;
    }

    /** @return the shared list of display names (the same instance on every call) */
    public static ObservableList<String> getForComboBox() {
        return FOR_COMBO_BOX;
    }
}

View File

@@ -0,0 +1,85 @@
package data;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.stream.Collectors;
import javafx.collections.FXCollections;
import javafx.collections.ObservableList;
/**
 * Taxonomy entries, each pairing a display name with a taxonomy code string.
 */
public enum GosTaxonomy {
    JAVNI("javni", "gos.T.J"),
    INFORMATIVNO_IZOBRAZEVALNI("informativno-izobraževalni", "gos.T.J.I"),
    RAZVEDRILNI("razvedrilni", "gos.T.J.R"),
    NEJAVNI("nejavni", "gos.T.N"),
    NEZASEBNI("nezasebni", "gos.T.N.N"),
    ZASEBNI("zasebni", "gos.T.N.Z"),
    OSEBNI_STIK("osebni stik", "gos.K.O"),
    TELEFON("telefon", "gos.K.P"),
    RADIO("radio", "gos.K.R"),
    TELEVIZIJA("televizija", "gos.K.T");

    /** Display names of all constants, in declaration order; built once at class load. */
    private static final ObservableList<String> FOR_COMBO_BOX;

    static {
        ArrayList<String> displayNames = Arrays.stream(values())
            .map(entry -> entry.name)
            .collect(Collectors.toCollection(ArrayList::new));
        FOR_COMBO_BOX = FXCollections.observableArrayList(displayNames);
    }

    private final String name;
    private final String taxonomy;

    GosTaxonomy(String name, String taxonomy) {
        this.name = name;
        this.taxonomy = taxonomy;
    }

    /** @return the display name given to the constant */
    public String toString() {
        return name;
    }

    /** @return the taxonomy code string given to the constant */
    public String getTaxonomnyString() {
        return taxonomy;
    }

    /**
     * Looks a constant up by its display name (exact, case-sensitive match).
     *
     * @param tax display name to look for; may be {@code null}
     * @return the matching constant, or {@code null} when {@code tax} is
     *         {@code null} or matches no constant
     */
    public static GosTaxonomy factory(String tax) {
        if (tax == null) {
            return null;
        }
        // values() is in declaration order, the same order the names were checked in before
        for (GosTaxonomy candidate : values()) {
            if (candidate.name.equals(tax)) {
                return candidate;
            }
        }
        return null;
    }

    /** @return the shared list of display names (the same instance on every call) */
    public static ObservableList<String> getForComboBox() {
        return FOR_COMBO_BOX;
    }
}

View File

@@ -0,0 +1,56 @@
package data;
import java.util.List;
import java.util.Map;
/**
 * A list of words together with optional taxonomy, type and property metadata.
 */
public class Sentence {
    private List<Word> words;
    private String taksonomija;

    // GOS
    private String type;

    // stored by one constructor; nothing in this class reads it back
    private Map<String, String> properties;

    /** Creates a sentence holding only words; all metadata stays {@code null}. */
    public Sentence(List<Word> words) {
        this.words = words;
    }

    /** Creates a sentence with words and a taxonomy. */
    public Sentence(List<Word> words, String taksonomija) {
        this(words);
        this.taksonomija = taksonomija;
    }

    /** Creates a sentence with words, a taxonomy and a property map. */
    public Sentence(List<Word> words, String taksonomija, Map<String, String> properties) {
        this(words, taksonomija);
        this.properties = properties;
    }

    /** Creates a sentence with words, a taxonomy and a type. */
    public Sentence(List<Word> words, String taksonomija, String type) {
        this(words, taksonomija);
        this.type = type;
    }

    /** @return the word list exactly as passed to the constructor (not copied) */
    public List<Word> getWords() {
        return words;
    }

    /** @return the taxonomy, or {@code null} if none was given */
    public String getTaxonomy() {
        return taksonomija;
    }

    /**
     * @param indexFrom start index, inclusive
     * @param indexTo   end index, exclusive
     * @return {@code List.subList(indexFrom, indexTo)} of the word list
     */
    public List<Word> getSublist(int indexFrom, int indexTo) {
        return words.subList(indexFrom, indexTo);
    }

    /** @return the type, or {@code null} if none was given */
    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }
}

View File

@@ -0,0 +1,16 @@
package data;
import java.io.File;
import java.util.Collection;
/**
 * Application-wide constants and shared mutable settings.
 */
public class Settings {
    // ---- shared mutable state (null until assigned elsewhere) ----

    /** Corpus files. */
    public static Collection<File> corpus;

    /** Location for result files. */
    public static File resultsFilePath;

    // ---- fixed values ----

    /** Sentence limit for a corpus. */
    public static final int CORPUS_SENTENCE_LIMIT = 50000;

    /** Log-printing switch; off. */
    public static final boolean PRINT_LOG = false;

    /** Style string setting {@code -fx-accent} to forestgreen. */
    public static final String FX_ACCENT_OK = "-fx-accent: forestgreen;";

    /** Style string setting {@code -fx-accent} to red. */
    public static final String FX_ACCENT_NOK = "-fx-accent: red;";
}

View File

@@ -0,0 +1,299 @@
package data;
import java.io.UnsupportedEncodingException;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Pattern;
import util.Util;
import util.db.RDB;
/**
 * Holds the settings and the running result map of one analysis.
 *
 * <p>Each constructor configures a different kind of analysis and derives a
 * result title ending in a creation timestamp.
 */
public class Statistics {
    /**
     * Timestamp layout used in result titles. {@code HH} is the 24-hour clock;
     * {@code hh} (12-hour, no AM/PM marker) would give morning and afternoon runs
     * the same title.
     */
    private static final DateTimeFormatter TITLE_TIMESTAMP = DateTimeFormatter.ofPattern("dd.MM.yyyy_HH.mm");

    private CorpusType corpusType;
    private AnalysisLevel analysisLevel;
    private boolean useDB;
    private RDB db;

    private boolean analysisProducedResults;

    private String taxonomy;
    private boolean taxonomyIsSet;

    private char JOSType;
    private boolean JOSTypeIsSet;

    private String resultTitle;
    public Map<String, AtomicLong> result = new ConcurrentHashMap<>();

    // nGrams
    private int nGramLevel;
    private Integer skip;
    private CalculateFor cf;
    private List<Pattern> morphosyntacticFilter;

    // distributions
    private String distributionTaxonomy;
    private char distributionJosWordType;
    private boolean vcc;
    private Integer substringLength;

    // inflected JOS
    private String inflectedJosTaxonomy;

    // GOS
    boolean gosOrthMode;

    // Solar
    Map<String, Object> solarHeadBlockFilter;

    /**
     * For n-grams.
     *
     * @param al         analysis level
     * @param nGramLevel n-gram size
     * @param skip       skip value; {@code null} and {@code 0} are both stored as {@code null}
     * @param cf         "calculate for" setting; must not be {@code null} (its
     *                   {@code toString()} is used in the title)
     */
    public Statistics(AnalysisLevel al, int nGramLevel, Integer skip, CalculateFor cf) {
        String dateTime = LocalDateTime.now().format(TITLE_TIMESTAMP);

        this.cf = cf;
        this.analysisLevel = al;
        this.nGramLevel = nGramLevel;
        this.skip = skip == null || skip == 0 ? null : skip;

        this.resultTitle = String.format("%s%d-gram_%s_%s",
            this.skip != null ? String.format("%d-%s-", this.skip, "skip") : "",
            nGramLevel,
            cf.toString(),
            dateTime);
    }

    /**
     * For word distributions.
     *
     * @param al                      analysis level
     * @param distributionTaxonomy    taxonomy to restrict to, or {@code null} for none
     * @param distributionJosWordType word type to restrict to, or {@code null} for none
     * @param cf                      "calculate for" setting
     */
    public Statistics(AnalysisLevel al, Taxonomy distributionTaxonomy, GigafidaJosWordType distributionJosWordType, CalculateFor cf) {
        String dateTime = LocalDateTime.now().format(TITLE_TIMESTAMP);

        this.resultTitle = String.format("%s_%s_%s",
            distributionTaxonomy != null ? distributionTaxonomy.toString() : "",
            distributionJosWordType != null ? distributionJosWordType.toString() : "",
            dateTime);

        this.analysisLevel = al;
        this.cf = cf;
        this.distributionTaxonomy = distributionTaxonomy != null ? distributionTaxonomy.getTaxonomnyString() : null;
        this.taxonomyIsSet = distributionTaxonomy != null;

        this.JOSTypeIsSet = distributionJosWordType != null;
        // a blank char stands in for "no word type"
        this.distributionJosWordType = this.JOSTypeIsSet ? distributionJosWordType.getWordType() : ' ';
    }

    /**
     * For the substring-based distribution; also turns the {@code vcc} flag on.
     *
     * @param al              analysis level
     * @param cf              "calculate for" setting
     * @param substringLength substring length, also embedded in the title
     */
    public Statistics(AnalysisLevel al, CalculateFor cf, Integer substringLength) {
        String dateTime = LocalDateTime.now().format(TITLE_TIMESTAMP);

        this.resultTitle = String.format("%s_%d_%s",
            "Distribucija zaporedij samoglasnikov in soglasnikov",
            substringLength,
            dateTime);

        this.analysisLevel = al;
        this.cf = cf;
        this.substringLength = substringLength;
        this.vcc = true;
    }

    /**
     * For inflected JOS.
     *
     * @param al                   analysis level
     * @param inflectedJosTaxonomy taxonomy to restrict to, or {@code null} for none
     */
    public Statistics(AnalysisLevel al, Taxonomy inflectedJosTaxonomy) {
        String dateTime = LocalDateTime.now().format(TITLE_TIMESTAMP);

        // Build the title from the parameter. The distributionTaxonomy field is never
        // assigned by this constructor, so reading it here always produced an empty slot.
        this.resultTitle = String.format("InflectedJOS_%s_%s",
            inflectedJosTaxonomy != null ? inflectedJosTaxonomy.toString() : "",
            dateTime);

        this.analysisLevel = al;
        this.inflectedJosTaxonomy = inflectedJosTaxonomy != null ? inflectedJosTaxonomy.getTaxonomnyString() : null;
        this.taxonomyIsSet = inflectedJosTaxonomy != null;
    }

    /** @return the skip value, or {@code null} when no (or a zero) skip was given */
    public Integer getSkip() {
        return skip;
    }

    public Integer getSubstringLength() {
        return substringLength;
    }

    public String getInflectedJosTaxonomy() {
        return inflectedJosTaxonomy;
    }

    public void setSubstringLength(Integer substringLength) {
        this.substringLength = substringLength;
    }

    public boolean isVcc() {
        return vcc;
    }

    public void setVcc(boolean vcc) {
        this.vcc = vcc;
    }

    public String getDistributionTaxonomy() {
        return distributionTaxonomy;
    }

    public void setDistributionTaxonomy(String distributionTaxonomy) {
        this.distributionTaxonomy = distributionTaxonomy;
    }

    public char getDistributionJosWordType() {
        return distributionJosWordType;
    }

    public void setDistributionJosWordType(char distributionJosWordType) {
        this.distributionJosWordType = distributionJosWordType;
    }

    /**
     * Converts filter strings to regex patterns: every {@code *} becomes {@code .}
     * (match any single character); the rest of each string is compiled as-is.
     *
     * @param morphosyntacticFilter filter strings; must not be {@code null}
     */
    public void setMorphosyntacticFilter(List<String> morphosyntacticFilter) {
        // change filter strings to regex patterns
        this.morphosyntacticFilter = new ArrayList<>();
        for (String s : morphosyntacticFilter) {
            this.morphosyntacticFilter.add(Pattern.compile(s.replace("*", ".")));
        }
    }

    /** @return the compiled filter patterns, or {@code null} if none were set */
    public List<Pattern> getMsd() {
        return morphosyntacticFilter;
    }

    public Map<String, AtomicLong> getResult() {
        return result;
    }

    public void setTaxonomy(String taxonomy) {
        this.taxonomy = taxonomy;
    }

    public void setTaxonomyIsSet(boolean taxonomyIsSet) {
        this.taxonomyIsSet = taxonomyIsSet;
    }

    public char getJOSType() {
        return JOSType;
    }

    public void setJOSType(char JOSType) {
        this.JOSType = JOSType;
    }

    public boolean isJOSTypeSet() {
        return JOSTypeIsSet;
    }

    /** Despite the name, this overload sets the "JOS type is set" flag, not the type itself. */
    public void setJOSType(boolean JOSTypeIsSet) {
        this.JOSTypeIsSet = JOSTypeIsSet;
    }

    /** Currently a no-op: the implementation below is disabled. */
    public void saveResultToDisk(int... limit) throws UnsupportedEncodingException {
        // Set<Pair<String, Map<String, Long>>> stats = new HashSet<>();
        //
        // if (useDB) {
        // result = db.getDump();
        // db.delete();
        // }
        //
        // // if no results and nothing to save, return false
        // if (!(result.size() > 0)) {
        // analysisProducedResults = false;
        // return;
        // } else {
        // analysisProducedResults = true;
        // }
        //
        // stats.add(ImmutablePair.of(resultTitle, getSortedResult(result, Util.getValidInt(limit))));
        // Export.SetToCSV(stats);
    }

    // private Map<String, Integer> getSortedResultInflected(Map map) {
    // // first convert to <String, Integer>
    // Map<String, Integer> m = Util.sortByValue(Util.atomicInt2StringAndInt(map), 0);
    //
    // Map<String, Integer> sortedM = new TreeMap<>();
    //
    // sortedM.putAll(m);
    //
    // return sortedM;
    // }

    private Map<String, Long> getSortedResult(Map<String, AtomicLong> map, int limit) {
        return Util.sortByValue(Util.atomicInt2StringAndInt(map), limit);
    }

    public String getTaxonomy() {
        return taxonomy;
    }

    public boolean isTaxonomySet() {
        return taxonomyIsSet;
    }

    public int getnGramLevel() {
        return nGramLevel;
    }

    public CalculateFor getCf() {
        return cf;
    }

    public AnalysisLevel getAnalysisLevel() {
        return analysisLevel;
    }

    public CorpusType getCorpusType() {
        return corpusType;
    }

    public void setCorpusType(CorpusType corpusType) {
        this.corpusType = corpusType;
    }

    public boolean isGosOrthMode() {
        return gosOrthMode;
    }

    public void setGosOrthMode(boolean gosOrthMode) {
        this.gosOrthMode = gosOrthMode;
    }

    public Map<String, Object> getSolarHeadBlockFilter() {
        return solarHeadBlockFilter;
    }

    public void setSolarHeadBlockFilter(Map<String, Object> solarHeadBlockFilter) {
        this.solarHeadBlockFilter = solarHeadBlockFilter;
    }

    public boolean isUseDB() {
        return useDB;
    }

    /** Sets the flag; the first time it is turned on, the database handle is created. */
    public void setUseDB(boolean useDB) {
        if (useDB && db == null) {
            db = new RDB();
        }
        this.useDB = useDB;
    }

    /**
     * Stores results from this batch to a database and clears results map.
     *
     * <p>The database handle is only created by {@code setUseDB(true)}; calling
     * this before that fails on the missing handle.
     */
    public void storeTmpResultsToDB() {
        try {
            db.writeBatch(result);
            result = new ConcurrentHashMap<>();
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
    }

    public boolean isAnalysisProducedResults() {
        return analysisProducedResults;
    }
}

View File

@@ -0,0 +1,409 @@
package data;
import static gui.ValidationUtil.*;
import java.io.UnsupportedEncodingException;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import alg.inflectedJOS.WordFormation;
import data.Enums.WordLevelType;
import javafx.collections.ObservableList;
import util.Export;
import util.Util;
import util.db.RDB;
@SuppressWarnings("Duplicates")
public class StatisticsNew {
public final static Logger logger = LogManager.getLogger(StatisticsNew.class);
private Corpus corpus;
private Filter filter;
private String resultTitle;
private Map<String, AtomicLong> result;
private Object[][] resultCustom; // for when calculating percentages that don't add up to 100%
private Map<String, ConcurrentHashMap<String, AtomicLong>> resultNestedSuffix;
private Map<String, ConcurrentHashMap<String, AtomicLong>> resultNestedPrefix;
private boolean useDB;
private RDB db;
private boolean analysisProducedResults;
private LocalDateTime time;
public StatisticsNew(Corpus corpus, Filter filter, boolean useDB) {
this.corpus = corpus;
this.filter = filter;
if (useDB) {
this.useDB = true;
db = new RDB();
}
if (filter.getAl() == AnalysisLevel.WORD_LEVEL) {
resultNestedSuffix = new ConcurrentHashMap<>();
resultNestedPrefix = new ConcurrentHashMap<>();
} else {
result = new ConcurrentHashMap<>();
}
resultTitle = generateResultTitle();
logger.debug(toString());
}
/**
* Result's title consists of:
* <ul>
* <li>Corpus type</li>
* <li>Analysis level</li>
* <li>Calculate for</li>
* <li></li>
* <li></li>
* <li></li>
* <li></li>
* </ul>
*
* @return
*/
private String generateResultTitle() {
String separator = "_";
StringBuilder sb = new StringBuilder();
if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
Integer ngramLevel = filter.getNgramValue();
if(ngramLevel == 0) {
sb.append("Crke").
append(separator)
.append(corpus.getCorpusType().toString())
.append(separator);
} else if(ngramLevel == 1) {
sb.append("Besede").append(separator)
.append(corpus.getCorpusType().toString())
.append(separator);
}
else {
sb.append(filter.getAl().toString())
.append(separator)
.append(corpus.getCorpusType().toString())
.append(separator);
sb.append(filter.getCalculateFor().toString())
.append(separator);
// ngram value
sb.append(filter.getNgramValue()).append("-gram")
.append(separator);
sb.append(filter.getSkipValue()).append("-preskok")
.append(separator);
}
// TODO: assure skip is not null but zero
} else {
sb.append(filter.getAl().toString()) // analysis level
.append(separator)
.append(corpus.getCorpusType().toString())
.append(separator);
}
// skip value
// msd ?
// if taxonomy -> taxonomy
// if cvv -> cvv + dolžina
this.time = this.time != null ? this.time : LocalDateTime.now();
sb.append(time.format(DateTimeFormatter.ofPattern("dd.MM.yyyy_hh.mm.ss")));
return sb.toString();
}
public boolean isAnalysisProducedResults() {
return analysisProducedResults;
}
public void setAnalysisProducedResults(boolean analysisProducedResults) {
this.analysisProducedResults = analysisProducedResults;
}
public String toString() {
String newLine = "\n\t- ";
StringBuilder sb = new StringBuilder();
sb.append(newLine).append("Statistic properties:");
sb.append(newLine).append(corpus.getCorpusType().toString()).append(String.format(" (%d files)", corpus.getDetectedCorpusFiles().size()));
sb.append(newLine).append(useDB ? "use DB" : "run in memory");
sb.append(newLine).append(filter.toString());
return sb.toString();
}
public String getResultTitle() {
return resultTitle;
}
// ****************************************
// ***************** util *****************
// ****************************************
/**
* Stores results from this batch to a database and clears results map
*/
public void storeTmpResultsToDB() {
try {
db.writeBatch(result);
result = new ConcurrentHashMap<>();
} catch (UnsupportedEncodingException e) {
logger.error("Store tmp results to DB", e);
// e.printStackTrace();
}
}
public Filter getFilter() {
return filter;
}
public Corpus getCorpus() {
return corpus;
}
public boolean saveResultToDisk(int... limit) throws UnsupportedEncodingException {
Set<Pair<String, Map<String, Long>>> stats = new HashSet<>();
if (useDB) {
result = db.getDump();
db.delete();
}
// if no results and nothing to save, return false
if (!(result.size() > 0)) {
analysisProducedResults = false;
return false;
} else {
analysisProducedResults = true;
}
stats.add(ImmutablePair.of(resultTitle, getSortedResult(result, Util.getValidInt(limit))));
Export.SetToCSV(stats, corpus.getChosenResultsLocation(), headerInfoBlock());
return true;
}
public boolean saveResultNestedToDisk(int... limit) throws UnsupportedEncodingException {
resultTitle = generateResultTitle();
if (useDB) {
result = db.getDump();
db.delete();
}
Map<WordLevelType, Map<String, Map<String, Long>>> results = new HashMap<>();
if (!isEmpty(resultNestedSuffix)) {
results.put(WordLevelType.SUFFIX, sortNestedMap(resultNestedSuffix, Util.getValidInt(limit)));
}
if (!isEmpty(resultNestedPrefix)) {
results.put(WordLevelType.PREFIX, sortNestedMap(resultNestedPrefix, Util.getValidInt(limit)));
}
// if no results and nothing to save, return false
if (!(results.size() > 0)) {
analysisProducedResults = false;
return false;
} else {
analysisProducedResults = true;
}
Export.nestedMapToCSV(resultTitle, results, corpus.getChosenResultsLocation(), headerInfoBlock());
return true;
}
public boolean recalculateAndSaveResultToDisk() throws UnsupportedEncodingException {
filter.setAl(AnalysisLevel.WORD_FORMATION);
resultTitle = generateResultTitle();
if (useDB) {
result = db.getDump();
db.delete();
}
// if no results and nothing to save, return false
if (!(result.size() > 0)) {
analysisProducedResults = false;
return false;
} else {
analysisProducedResults = true;
}
WordFormation.calculateStatistics(this);
Export.SetToCSV(resultTitle, resultCustom, corpus.getChosenResultsLocation(), headerInfoBlock());
return true;
}
private Map<String, Map<String, Long>> sortNestedMap(Map<String, ConcurrentHashMap<String, AtomicLong>> nestedMap, int limit) {
Map<String, Map<String, Long>> sorted = new HashMap<>();
for (String s : nestedMap.keySet()) {
sorted.put(s, getSortedResult(nestedMap.get(s), Util.getValidInt(limit)));
}
return sorted;
}
private Map<String, Long> getSortedResult(Map<String, AtomicLong> map, int limit) {
return Util.sortByValue(Util.atomicInt2StringAndInt(map), limit);
}
public void updateResults(String o) {
// if not in map
AtomicLong r = result.putIfAbsent(o, new AtomicLong(1));
// else
if (r != null)
result.get(o).incrementAndGet();
}
public Map<String, AtomicLong> getResult() {
return result;
}
public Object[][] getResultCustom() {
return resultCustom;
}
public void setResultCustom(Object[][] resultCustom) {
this.resultCustom = resultCustom;
}
public void updateResultsNested(WordLevelType type, String key, String stringValue) {
ConcurrentHashMap<String, ConcurrentHashMap<String, AtomicLong>> resultsMap;
if (type == WordLevelType.SUFFIX) {
updateResultsNestedSuffix(key, stringValue);
} else if (type == WordLevelType.PREFIX) {
updateResultsNestedPrefix(key, stringValue);
}
}
public void updateResultsNestedSuffix(String key, String stringValue) {
if (resultNestedSuffix.containsKey(key)) {
// if not in map
AtomicLong r = resultNestedSuffix.get(key).putIfAbsent(stringValue, new AtomicLong(1));
// else
if (r != null) {
resultNestedSuffix.get(key).get(stringValue).incrementAndGet();
}
} else {
resultNestedSuffix.putIfAbsent(key, new ConcurrentHashMap<>());
AtomicLong r = resultNestedSuffix.get(key).putIfAbsent(stringValue, new AtomicLong(1));
if (r != null) {
resultNestedSuffix.get(key).get(stringValue).incrementAndGet();
}
}
}
public void updateResultsNestedPrefix(String key, String stringValue) {
if (resultNestedPrefix.containsKey(key)) {
// if not in map
AtomicLong r = resultNestedPrefix.get(key).putIfAbsent(stringValue, new AtomicLong(1));
// else
if (r != null) {
resultNestedPrefix.get(key).get(stringValue).incrementAndGet();
}
} else {
resultNestedPrefix.putIfAbsent(key, new ConcurrentHashMap<>());
AtomicLong r = resultNestedPrefix.get(key).putIfAbsent(stringValue, new AtomicLong(1));
if (r != null) {
resultNestedPrefix.get(key).get(stringValue).incrementAndGet();
}
}
}
private LinkedHashMap<String, String> headerInfoBlock() {
LinkedHashMap<String, String> info = new LinkedHashMap<>();
info.put("Korpus:", corpus.getCorpusType().toString());
info.put("Datum:", time.format(DateTimeFormatter.ofPattern("dd.MM.yyyy hh:mm")));
if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
Integer ngramLevel = filter.getNgramValue();
if (ngramLevel == 0)
info.put("Analiza:", "Črke");
else if (ngramLevel == 1)
info.put("Analiza", "Besede");
else
info.put("Analiza:", filter.getAl().toString());
} else {
info.put("Analiza:", filter.getAl().toString());
}
if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
Integer ngramLevel = filter.getNgramValue();
// n.gram nivo
if (ngramLevel > 1) {
info.put("n-gram nivo:", String.valueOf(ngramLevel));
} else if (ngramLevel == 1){
info.put("n-gram nivo:", "nivo besed");
} else {
info.put("n-gram nivo:", "nivo črk");
}
// skip
if (ngramLevel > 1)
info.put("Skip:", isNotEmpty(filter.getSkipValue()) ? filter.getSkipValue().toString() : "0");
// izračunaj za
info.put("Izračunaj za:", filter.getCalculateFor().toString());
// msd
if (!isEmpty(filter.getMsd())) {
StringBuilder msdPattern = new StringBuilder();
for (Pattern pattern : filter.getMsd()) {
msdPattern.append(pattern.toString()).append(" ");
}
info.put("MSD:", msdPattern.toString());
}
// taksonomija
if (!isEmpty(filter.getTaxonomy())) {
info.put("Taksonomija:", StringUtils.join(filter.getTaxonomy(), ", "));
}
}
if (isNotEmpty(filter.getTaxonomy()) && Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
ArrayList<String> tax = Tax.getTaxonomyForInfo(corpus.getCorpusType(), filter.getTaxonomy());
info.put("Taksonomija: ", "");
String sep = "";
for (String s : tax) {
info.put(sep = sep + " ", s);
}
}
if (corpus.getCorpusType() == CorpusType.SOLAR) {
HashMap<String, ObservableList<String>> filters = corpus.getSolarFilters();
if (!isEmpty(filters)) {
info.put("Dodatni filtri: ", "");
for (Map.Entry<String, ObservableList<String>> f : filters.entrySet()) {
info.put(f.getKey(), StringUtils.join(f.getValue(), ", "));
}
}
}
return info;
}
}

175
src/main/java/data/Tax.java Normal file
View File

@@ -0,0 +1,175 @@
package data;
import java.util.*;
import java.util.stream.Collectors;
import gui.ValidationUtil;
import javafx.collections.FXCollections;
import javafx.collections.ObservableList;
public class Tax {
private static LinkedHashMap<String, String> GIGAFIDA_TAXONOMY;
private static LinkedHashMap<String, String> GOS_TAXONOMY;
private static final HashSet<CorpusType> corpusTypesWithTaxonomy = new HashSet<>(Arrays.asList(CorpusType.GIGAFIDA, CorpusType.GOS, CorpusType.CCKRES));
static {
// GIGAFIDA ----------------------------
GIGAFIDA_TAXONOMY = new LinkedHashMap<>();
GIGAFIDA_TAXONOMY.put("SSJ.T", "tisk");
GIGAFIDA_TAXONOMY.put("SSJ.T.K", "tisk-knjižno");
GIGAFIDA_TAXONOMY.put("SSJ.T.K.L", "tisk-knjižno-leposlovno");
GIGAFIDA_TAXONOMY.put("SSJ.T.K.S", "tisk-knjižno-strokovno");
GIGAFIDA_TAXONOMY.put("SSJ.T.P", "tisk-periodično");
GIGAFIDA_TAXONOMY.put("SSJ.T.P.C", "tisk-periodično-časopis");
GIGAFIDA_TAXONOMY.put("SSJ.T.P.R", "tisk-periodično-revija");
GIGAFIDA_TAXONOMY.put("SSJ.T.D", "tisk-drugo");
GIGAFIDA_TAXONOMY.put("SSJ.I", "internet");
GIGAFIDA_TAXONOMY.put("Ft.P", "prenosnik");
GIGAFIDA_TAXONOMY.put("Ft.P.G", "prenosnik-govorni");
GIGAFIDA_TAXONOMY.put("Ft.P.E", "prenosnik-elektronski");
GIGAFIDA_TAXONOMY.put("Ft.P.P", "prenosnik-pisni");
GIGAFIDA_TAXONOMY.put("Ft.P.P.O", "prenosnik-pisni-objavljeno");
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.K", "prenosnik-pisni-objavljeno-knjižno");
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P", "prenosnik-pisni-objavljeno-periodično");
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C", "prenosnik-pisni-objavljeno-periodično-časopisno");
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C.D", "prenosnik-pisni-objavljeno-periodično-časopisno-dnevno");
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C.V", "prenosnik-pisni-objavljeno-periodično-časopisno-večkrat tedensko");
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C.T", "prenosnik-pisni-objavljeno-periodično-časopisno-tedensko");
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R", "prenosnik-pisni-objavljeno-periodično-revialno");
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.T", "prenosnik-pisni-objavljeno-periodično-revialno-tedensko");
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.S", "prenosnik-pisni-objavljeno-periodično-revialno-štirinajstdnevno");
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.M", "prenosnik-pisni-objavljeno-periodično-revialno-mesečno");
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.D", "prenosnik-pisni-objavljeno-periodično-revialno-redkeje kot na mesec");
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.O", "prenosnik-pisni-objavljeno-periodično-revialno-občasno");
GIGAFIDA_TAXONOMY.put("Ft.P.P.N", "prenosnik-pisni-neobjavljeno");
GIGAFIDA_TAXONOMY.put("Ft.P.P.N.J", "prenosnik-pisni-neobjavljeno-javno");
GIGAFIDA_TAXONOMY.put("Ft.P.P.N.I", "prenosnik-pisni-neobjavljeno-interno");
GIGAFIDA_TAXONOMY.put("Ft.P.P.N.Z", "prenosnik-pisni-neobjavljeno-zasebno");
GIGAFIDA_TAXONOMY.put("Ft.Z", "zvrst");
GIGAFIDA_TAXONOMY.put("Ft.Z.U", "zvrst-umetnostna");
GIGAFIDA_TAXONOMY.put("Ft.Z.U.P", "zvrst-umetnostna-pesniška");
GIGAFIDA_TAXONOMY.put("Ft.Z.U.R", "zvrst-umetnostna-prozna");
GIGAFIDA_TAXONOMY.put("Ft.Z.U.D", "zvrst-umetnostna-dramska");
GIGAFIDA_TAXONOMY.put("Ft.Z.N", "zvrst-neumetnostna");
GIGAFIDA_TAXONOMY.put("Ft.Z.N.S", "zvrst-neumetnostna-strokovna");
GIGAFIDA_TAXONOMY.put("Ft.Z.N.S.H", "zvrst-neumetnostna-strokovna-humanistična in družboslovna");
GIGAFIDA_TAXONOMY.put("Ft.Z.N.S.N", "zvrst-neumetnostna-strokovna-naravoslovna in tehnična");
GIGAFIDA_TAXONOMY.put("Ft.Z.N.N", "zvrst-neumetnostna-nestrokovna");
GIGAFIDA_TAXONOMY.put("Ft.Z.N.P", "zvrst-neumetnostna-pravna");
GIGAFIDA_TAXONOMY.put("Ft.L", "zvrst-lektorirano");
GIGAFIDA_TAXONOMY.put("Ft.L.D", "zvrst-lektorirano-da");
GIGAFIDA_TAXONOMY.put("Ft.L.N", "zvrst-lektorirano-ne");
// GOS ----------------------------------
GOS_TAXONOMY = new LinkedHashMap<>();
GOS_TAXONOMY.put("gos.T", "diskurz");
GOS_TAXONOMY.put("gos.T.J", "diskurz-javni");
GOS_TAXONOMY.put("gos.T.J.I", "diskurz-javni-informativno-izobraževalni");
GOS_TAXONOMY.put("gos.T.J.R", "diskurz-javni-razvedrilni");
GOS_TAXONOMY.put("gos.T.N", "diskurz-nejavni");
GOS_TAXONOMY.put("gos.T.N.N", "diskurz-nejavni-nezasebni");
GOS_TAXONOMY.put("gos.T.N.Z", "diskurz-nejavni-zasebni");
GOS_TAXONOMY.put("gos.S", "situacija");
GOS_TAXONOMY.put("gos.S.R", "situacija-radio");
GOS_TAXONOMY.put("gos.S.T", "situacija-televizija");
}
/**
* Returns the whole default taxonomy for the specified corpus type
*/
public static ObservableList<String> getTaxonomyForComboBox(CorpusType corpusType) {
if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES) {
return FXCollections.observableArrayList(GIGAFIDA_TAXONOMY.values());
} else if (corpusType == CorpusType.GOS) {
return FXCollections.observableArrayList(GOS_TAXONOMY.values());
}
return FXCollections.observableArrayList(new ArrayList<>());
}
/**
* Returns taxonomy names only for items found in headers
*/
public static ObservableList<String> getTaxonomyForComboBox(CorpusType corpusType, HashSet<String> foundTax) {
LinkedHashMap<String, String> tax = new LinkedHashMap<>();
if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES) {
tax = GIGAFIDA_TAXONOMY;
} else if (corpusType == CorpusType.GOS) {
tax = GOS_TAXONOMY;
}
ArrayList<String> taxForCombo = new ArrayList<>();
// assures same relative order
for (String t : tax.keySet()) {
if (foundTax.contains(t)) {
taxForCombo.add(tax.get(t));
}
}
return FXCollections.observableArrayList(taxForCombo);
}
public static HashSet<CorpusType> getCorpusTypesWithTaxonomy() {
return corpusTypesWithTaxonomy;
}
public static ArrayList<String> getTaxonomyCodes(ArrayList<String> taxonomyNames, CorpusType corpusType) {
ArrayList<String> result = new ArrayList<>();
if (ValidationUtil.isEmpty(taxonomyNames)) {
return result;
}
LinkedHashMap<String, String> tax = new LinkedHashMap<>();
if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES) {
tax = GIGAFIDA_TAXONOMY;
} else if (corpusType == CorpusType.GOS) {
tax = GOS_TAXONOMY;
}
// for easier lookup
Map<String, String> taxInversed = tax.entrySet()
.stream()
.collect(Collectors.toMap(Map.Entry::getValue, Map.Entry::getKey));
for (String taxonomyName : taxonomyNames) {
result.add(taxInversed.get(taxonomyName));
}
return result;
}
/**
 * Returns the display names that belong to the given taxonomy codes.
 *
 * @param corpusType selects the taxonomy map (GIGAFIDA/CCKRES or GOS); any other type uses an empty map
 * @param taxonomy   taxonomy codes to look up
 *
 * @return one entry per code, in input order; a code missing from the map contributes {@code null}
 */
public static ArrayList<String> getTaxonomyForInfo(CorpusType corpusType, ArrayList<String> taxonomy) {
    final LinkedHashMap<String, String> codeToName;
    if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES) {
        codeToName = GIGAFIDA_TAXONOMY;
    } else if (corpusType == CorpusType.GOS) {
        codeToName = GOS_TAXONOMY;
    } else {
        codeToName = new LinkedHashMap<>();
    }

    return taxonomy.stream()
            .map(codeToName::get)
            .collect(Collectors.toCollection(ArrayList::new));
}
}

View File

@@ -0,0 +1,171 @@
package data;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.stream.Collectors;
import javafx.collections.FXCollections;
import javafx.collections.ObservableList;
/**
 * Taxonomy categories, each tied to the corpus it belongs to.
 * <p>
 * Every constant carries a display name, a taxonomy identifier and a corpus name
 * ("gos" or "gigafida"). Most SSJ_* and all FT_* constants currently hold the placeholder values
 * "opis" / "identifikator".
 */
public enum Taxonomy {
    // GOS
    JAVNI("javni", "T.J", "gos"),
    INFORMATIVNO_IZOBRAZEVALNI("informativno-izobraževalni", "T.J.I", "gos"),
    RAZVEDRILNI("razvedrilni", "T.J.R", "gos"),
    NEJAVNI("nejavni", "T.N", "gos"),
    NEZASEBNI("nezasebni", "T.N.N", "gos"),
    ZASEBNI("zasebni", "T.N.Z", "gos"),
    OSEBNI_STIK("osebni stik", "K.O", "gos"),
    TELEFON("telefon", "K.P", "gos"),
    RADIO("radio", "K.R", "gos"),
    TELEVIZIJA("televizija", "K.T", "gos"),

    // Gigafida
    KNJIZNO("knjižno", "T.K", "gigafida"),
    LEPOSLOVNO("leposlovno", "T.K.L", "gigafida"),
    STROKOVNO("strokovno", "T.K.S", "gigafida"),
    PERIODICNO("periodično", "T.P", "gigafida"),
    CASOPIS("časopis", "T.P.C", "gigafida"),
    REVIJA("revija", "T.P.R", "gigafida"),
    INTERNET("internet", "I", "gigafida"),
    SSJ_TISK("tisk", "SSJ.T", "gigafida"),
    SSJ_KNJIZNO("opis", "identifikator", "gigafida"),
    SSJ_LEPOSLOVNO("opis", "identifikator", "gigafida"),
    SSJ_STROKOVNO("opis", "identifikator", "gigafida"),
    SSJ_PERIODICNO("opis", "identifikator", "gigafida"),
    SSJ_CASOPIS("opis", "identifikator", "gigafida"),
    SSJ_REVIJA("opis", "identifikator", "gigafida"),
    SSJ_DRUGO("opis", "identifikator", "gigafida"),
    SSJ_INTERNET("opis", "identifikator", "gigafida"),
    FT_P_PRENOSNIK("opis", "identifikator", "gigafida"),
    FT_P_GOVORNI("opis", "identifikator", "gigafida"),
    FT_P_ELEKTRONSKI("opis", "identifikator", "gigafida"),
    FT_P_PISNI("opis", "identifikator", "gigafida"),
    FT_P_OBJAVLJENO("opis", "identifikator", "gigafida"),
    FT_P_KNJIZNO("opis", "identifikator", "gigafida"),
    FT_P_PERIODICNO("opis", "identifikator", "gigafida"),
    FT_P_CASOPISNO("opis", "identifikator", "gigafida"),
    FT_P_DNEVNO("opis", "identifikator", "gigafida"),
    FT_P_VECKRAT_TEDENSKO("opis", "identifikator", "gigafida"),
    FT_P_REVIALNO("opis", "identifikator", "gigafida"),
    FT_P_TEDENSKO("opis", "identifikator", "gigafida"),
    FT_P_STIRINAJSTDNEVNO("opis", "identifikator", "gigafida"),
    FT_P_MESECNO("opis", "identifikator", "gigafida"),
    FT_P_REDKEJE_KOT_MESECNO("opis", "identifikator", "gigafida"),
    FT_P_OBCASNO("opis", "identifikator", "gigafida"),
    FT_P_NEOBJAVLJENO("opis", "identifikator", "gigafida"),
    FT_P_JAVNO("opis", "identifikator", "gigafida"),
    FT_P_INTERNO("opis", "identifikator", "gigafida"),
    FT_P_ZASEBNO("opis", "identifikator", "gigafida"),
    FT_ZVRST("opis", "identifikator", "gigafida"),
    FT_UMETNOSTNA("opis", "identifikator", "gigafida"),
    FT_PESNISKA("opis", "identifikator", "gigafida"),
    FT_PROZNA("opis", "identifikator", "gigafida"),
    FT_DRAMSKA("opis", "identifikator", "gigafida"),
    FT_NEUMETNOSTNA("opis", "identifikator", "gigafida"),
    FT_STROKOVNA("opis", "identifikator", "gigafida"),
    FT_HID("opis", "identifikator", "gigafida"),
    FT_NIT("opis", "identifikator", "gigafida"),
    FT_NESTROKOVNA("opis", "identifikator", "gigafida"),
    FT_PRAVNA("opis", "identifikator", "gigafida"),
    FT_LEKTORIRANO("opis", "identifikator", "gigafida"),
    FT_DA("opis", "identifikator", "gigafida"),
    FT_NE("opis", "identifikator", "gigafida");

    /**
     * The only constants {@link #factory(String)} resolves, in lookup order.
     * SSJ_* and FT_* constants are deliberately absent, so names such as "tisk" or "opis" are not resolved.
     */
    private static final Taxonomy[] RESOLVABLE_BY_NAME = {
            // GOS
            JAVNI, INFORMATIVNO_IZOBRAZEVALNI, RAZVEDRILNI, NEJAVNI, NEZASEBNI, ZASEBNI,
            OSEBNI_STIK, TELEFON, RADIO, TELEVIZIJA,
            // Gigafida
            KNJIZNO, LEPOSLOVNO, STROKOVNO, PERIODICNO, CASOPIS, REVIJA, INTERNET
    };

    private final String name;
    private final String taxonomy;
    private final String corpus;

    Taxonomy(String name, String taxonomy, String corpusType) {
        this.name = name;
        this.taxonomy = taxonomy;
        this.corpus = corpusType;
    }

    /**
     * @return the display name of this taxonomy entry
     */
    @Override
    public String toString() {
        return this.name;
    }

    /**
     * @return the taxonomy identifier of this entry (e.g. "T.J")
     */
    public String getTaxonomnyString() {
        return this.taxonomy;
    }

    /**
     * Resolves a display name to its constant.
     *
     * @param tax display name to look up; may be {@code null}
     *
     * @return the matching constant, or {@code null} when the name is {@code null} or not resolvable
     */
    public static Taxonomy factory(String tax) {
        if (tax == null) {
            return null;
        }
        for (Taxonomy candidate : RESOLVABLE_BY_NAME) {
            if (candidate.name.equals(tax)) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Collects the display names of all constants whose corpus name equals {@code corpusType}.
     *
     * @param corpusType corpus name to match (the constants use "gos" and "gigafida")
     *
     * @return observable list of display names in declaration order
     */
    public static ObservableList<String> getDefaultForComboBox(String corpusType) {
        ArrayList<String> names = new ArrayList<>();
        for (Taxonomy entry : Taxonomy.values()) {
            if (entry.corpus.equals(corpusType)) {
                names.add(entry.name);
            }
        }
        return FXCollections.observableArrayList(names);
    }

    /**
     * Same as {@link #getDefaultForComboBox(String)}, using {@code corpusType.toString()} as the corpus name.
     */
    public static ObservableList<String> getDefaultForComboBox(CorpusType corpusType) {
        return getDefaultForComboBox(corpusType.toString());
    }
}

View File

@@ -0,0 +1,53 @@
package data;
import static gui.ValidationUtil.*;
import java.util.ArrayList;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import gui.Messages;
import gui.ValidationUtil;
/**
 * Validation of {@link Filter} settings before an analysis is started.
 */
public class Validation {

    /**
     * Checks a filter that is about to be used for a string-level analysis.
     * <p>
     * Side effect: a missing skip value is replaced by {@code 0} on the passed filter.
     *
     * @param filter filter to validate
     *
     * @return {@code null} when no problem was found, otherwise all error messages joined by {@code ", \n"}
     */
    public static String validateForStringLevel(Filter filter) {
        ArrayList<String> errors = new ArrayList<>();

        // should not be null, error if null, because init failed
        Integer ngramValue = filter.getNgramValue();
        if (ngramValue == null) {
            errors.add(Messages.MISSING_NGRAM_LEVEL);
        }

        // should not be null, error if null, because init failed
        if (filter.getCalculateFor() == null) {
            errors.add(Messages.MISSING_CALCULATE_FOR);
        }

        if (filter.getSkipValue() == null) {
            filter.setSkipValue(0);
        }

        // A missing n-gram value has already been reported above; every check that reads it is
        // guarded so that the caller gets that message instead of a NullPointerException.
        // NOTE(review): an earlier variant of the msd check sat here. It required the msd list to be
        // empty and non-empty at once, so it could never report anything, and it dereferenced the
        // list without a null check. It has been dropped; the check below is the effective one.
        ArrayList<Pattern> msd = filter.getMsd();
        if (ngramValue != null && ngramValue > 0 && !ValidationUtil.isEmpty(msd) && ngramValue != msd.size()) {
            errors.add(String.format(Messages.WARNING_MISMATCHED_NGRAM_AND_TOKENS_VALUES, ngramValue, msd.size()));
        }

        if (ngramValue != null && ngramValue == 0 && isEmpty(filter.getStringLength())) {
            // if count letters, make sure that the length is given
            // TODO: check that words we're adding in xml reader are longer than this value
            errors.add(Messages.MISSING_STRING_LENGTH);
        }

        return isEmpty(errors) ? null : StringUtils.join(errors, ", \n");
    }
}

View File

@@ -0,0 +1,141 @@
package data;
import java.io.Serializable;
import java.util.Arrays;
import java.util.HashSet;
import org.apache.commons.lang3.StringUtils;
import data.Enums.Msd;
import gui.ValidationUtil;
/**
 * A single corpus token: surface form, lemma and morphosyntactic description (msd).
 * <p>
 * Part-of-speech codes noted for the former {@code besedna_vrsta} field:
 * S = noun, G = verb, P = adjective, R = adverb, Z = pronoun, K = numeral, D = preposition,
 * V = conjunction, L = particle, M = interjection, O = abbreviation, N = unclassified.
 */
public class Word implements Serializable {
    public static final char PAD_CHARACTER = '-';

    // Shared and read-only. As a static field it is neither allocated nor serialized per word.
    // NOTE(review): this class declares no serialVersionUID, so turning the former instance field
    // into a static one changes the default UID - confirm no serialized Word data outlives a release.
    private static final HashSet<Character> VOWELS = new HashSet<>(Arrays.asList('a', 'e', 'i', 'o', 'u'));

    private String word;
    private String lemma;
    private String msd;

    /**
     * Creates a word with its lemma and msd. The msd is padded to the length expected for its type.
     * The surface form is lower-cased unless the msd is empty or starts with "Sl"; per the original
     * author's note, capitalisation is kept only for proper nouns.
     *
     * @param word  surface form
     * @param lemma lemma
     * @param msd   morphosyntactic description; may be empty
     */
    public Word(String word, String lemma, String msd) {
        this.lemma = lemma;
        this.msd = normalizeMsd(msd);

        boolean hasMsd = !ValidationUtil.isEmpty(this.msd);
        boolean keepsCapitalisation = hasMsd
                && this.msd.length() >= 2
                && this.msd.charAt(0) == 'S'
                && this.msd.charAt(1) == 'l';

        if (hasMsd && !keepsCapitalisation) {
            // Locale.ROOT keeps the result independent of the machine's default locale
            this.word = word.toLowerCase(java.util.Locale.ROOT);
        } else {
            this.word = word;
        }
    }

    public Word() {
    }

    /**
     * Creates a word that only has a surface form; lemma and msd stay {@code null}.
     */
    public Word(String word) {
        this.word = word;
    }

    /**
     * Appends a number of '-' to msds which are not properly sized.
     * E.g. nouns should have 5 attributes, yet the last one isn't always defined (Somei vs. Sometd)
     *
     * @param msdInput raw msd; may be empty
     *
     * @return the padded msd, or an empty string for empty input
     */
    private String normalizeMsd(String msdInput) {
        if (ValidationUtil.isEmpty(msdInput)) {
            return "";
        }
        return StringUtils.rightPad(msdInput, Msd.getMsdLengthForType(msdInput), PAD_CHARACTER);
    }

    public String getWord() {
        return word;
    }

    /**
     * @return the surface form with every character replaced by 'V' (vowel) or 'C' (anything else)
     */
    public String getCVVWord() {
        return covertToCvv(word);
    }

    /**
     * @return the lemma with every character replaced by 'V' (vowel) or 'C' (anything else);
     * {@code null} when no lemma is set
     */
    public String getCVVLemma() {
        return covertToCvv(lemma);
    }

    /**
     * Maps each character to 'V' when it is one of a, e, i, o, u (in either case) and to 'C' otherwise.
     *
     * @param s text to convert; may be {@code null}
     *
     * @return the converted text, or {@code null} for {@code null} input
     */
    private String covertToCvv(String s) {
        if (s == null) {
            // words built without a lemma must not crash the CVV getters
            return null;
        }
        char[] chars = s.toCharArray();
        for (int i = 0; i < chars.length; i++) {
            // lower-case first: capitalised words (e.g. proper nouns) may start with an upper-case vowel
            chars[i] = VOWELS.contains(Character.toLowerCase(chars[i])) ? 'V' : 'C';
        }
        return new String(chars);
    }

    public void setWord(String word) {
        this.word = word;
    }

    public String getLemma() {
        return lemma;
    }

    public void setLemma(String lemma) {
        this.lemma = lemma;
    }

    public String getMsd() {
        return msd;
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("beseda:\t")
                .append(getWord())
                .append("\n")
                .append("lema:\t")
                .append(getLemma())
                .append("\n")
                .append("msd:\t")
                .append(getMsd())
                .append("\n");
        return sb.toString();
    }

    /**
     * Returns the value an analysis should count for this word.
     *
     * @param calculateFor {@code CalculateFor.WORD} selects the surface form, anything else the lemma
     * @param cvv          when {@code true} the consonant/vowel pattern is returned instead of the text
     *
     * @return the selected value
     */
    public String getForCf(CalculateFor calculateFor, boolean cvv) {
        boolean useWord = calculateFor == CalculateFor.WORD;
        if (cvv) {
            return useWord ? getCVVWord() : getCVVLemma();
        }
        return useWord ? getWord() : getLemma();
    }
}

View File

@@ -0,0 +1,454 @@
package gui;
import data.*;
import javafx.application.HostServices;
import javafx.beans.value.ChangeListener;
import javafx.beans.value.ObservableValue;
import javafx.collections.FXCollections;
import javafx.collections.ListChangeListener;
import javafx.collections.ObservableList;
import javafx.concurrent.Task;
import javafx.fxml.FXML;
import javafx.scene.control.*;
import javafx.scene.layout.Pane;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.controlsfx.control.CheckComboBox;
import java.io.File;
import java.io.UnsupportedEncodingException;
import java.util.*;
import java.util.regex.Pattern;
import static alg.XML_processing.readXML;
import static gui.GUIController.showAlert;
import static gui.Messages.*;
/**
 * Controller of the character (letter) level analysis tab.
 * <p>
 * Holds the filter settings entered by the user (msd, taxonomy, cvv, string length, calculate-for),
 * validates them and runs the analysis over all detected corpus files in a background task.
 */
@SuppressWarnings("Duplicates")
public class CharacterAnalysisTab {
    public final static Logger logger = LogManager.getLogger(CharacterAnalysisTab.class);

    @FXML
    public Label selectedFiltersLabel;
    @FXML
    public Label solarFilters;

    @FXML
    private TextField msdTF;
    private ArrayList<Pattern> msd;
    private ArrayList<String> msdStrings;

    @FXML
    private CheckComboBox<String> taxonomyCCB;
    private ArrayList<String> taxonomy;

    @FXML
    private CheckBox calculatecvvCB;
    private boolean calculateCvv;

    @FXML
    private TextField stringLengthTF;
    private Integer stringLength;

    @FXML
    private ToggleGroup calculateForRB;
    private CalculateFor calculateFor;

    @FXML
    private RadioButton lemmaRB;

    @FXML
    private RadioButton varietyRB;

    @FXML
    private Pane paneLetters;

    @FXML
    private Button computeNgramsB;

    @FXML
    public ProgressBar ngramProgressBar;
    @FXML
    public Label progressLabel;

    @FXML
    private Hyperlink helpH;

    private enum MODE {
        LETTER
    }

    private MODE currentMode;

    private Corpus corpus;
    private HashMap<String, HashSet<String>> solarFiltersMap;
    // NOTE(review): never assigned inside this class, yet populateFields() dereferences it
    private Filter filter;
    private boolean useDb;
    private HostServices hostService;

    private static final ObservableList<String> N_GRAM_COMPUTE_FOR_LETTERS = FXCollections.observableArrayList("različnica", "lema");
    private static final ObservableList<String> N_GRAM_COMPUTE_FOR_WORDS_ORTH = FXCollections.observableArrayList("različnica");

    // TODO: pass observables for taxonomy based on header scan
    // after header scan
    private ObservableList<String> taxonomyCCBValues;
    private CorpusType currentCorpusType;

    // listeners are attached only once, even if init() runs again
    private boolean listenersRegistered;

    /**
     * Prepares the tab for the currently set corpus: switches to letter mode, attaches the UI
     * listeners (first call only) and resets the msd and taxonomy selections.
     * <p>
     * Requires {@link #setCorpus(Corpus)} to have been called.
     */
    public void init() {
        currentMode = MODE.LETTER;
        toggleMode(currentMode);

        // NOTE(review): init() appears to be invoked each time a corpus is chosen (confirm against
        // the caller); re-adding the listeners every time would multiply alerts and log lines.
        if (!listenersRegistered) {
            registerListeners();
            listenersRegistered = true;
        }

        // msd
        msdTF.setText("");
        msd = new ArrayList<>();

        // taxonomy
        if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
            // re-enable in case a previous corpus without taxonomy disabled the control
            taxonomyCCB.setDisable(false);
            taxonomyCCB.getItems().setAll(corpus.getTaxonomy());
            taxonomyCCB.getCheckModel().clearChecks();
        } else {
            taxonomyCCB.setDisable(true);
        }
        // a freshly initialised tab starts without a taxonomy selection
        taxonomy = new ArrayList<>();
    }

    /**
     * Attaches all change/focus listeners and button actions of this tab.
     */
    private void registerListeners() {
        calculateForRB.selectedToggleProperty().addListener((observable, oldValue, newValue) -> {
            if (newValue == null) {
                // selection was cleared, keep the last choice
                return;
            }
            RadioButton chk = (RadioButton) newValue;
            calculateFor = CalculateFor.factory(chk.getText());
            logger.info("calculateForRB: {}", chk.getText());
        });

        // msd
        msdTF.focusedProperty().addListener((observable, oldValue, newValue) -> {
            if (!newValue) {
                // focus lost
                String value = msdTF.getText();
                logger.info("msdTf: {}", value);

                if (!ValidationUtil.isEmpty(value)) {
                    ArrayList<String> msdTmp = new ArrayList<>(Arrays.asList(value.split(" ")));

                    int nOfRequiredMsdTokens = 1;
                    if (msdTmp.size() != nOfRequiredMsdTokens) {
                        String msg = String.format(Messages.WARNING_MISMATCHED_NGRAM_AND_TOKENS_VALUES, nOfRequiredMsdTokens, msdTmp.size());
                        logAlert(msg);
                        showAlert(Alert.AlertType.ERROR, msg);
                    }
                    msd = new ArrayList<>();
                    msdStrings = new ArrayList<>();
                    for (String msdToken : msdTmp) {
                        msd.add(Pattern.compile(msdToken));
                        msdStrings.add(msdToken);
                    }
                    logger.info(String.format("msd accepted (%d)", msd.size()));
                } else if (!ValidationUtil.isEmpty(newValue)) {
                    // NOTE(review): this tests the focus flag rather than the text - confirm the intent
                    msd = new ArrayList<>();
                    msdStrings = new ArrayList<>();
                }
            }
        });

        // taxonomy
        taxonomyCCB.getCheckModel().getCheckedItems().addListener((ListChangeListener<String>) c -> {
            taxonomy = new ArrayList<>();
            ObservableList<String> checkedItems = taxonomyCCB.getCheckModel().getCheckedItems();
            taxonomy.addAll(checkedItems);
            logger.info(String.format("Selected taxonomy: %s", StringUtils.join(checkedItems, ",")));
        });

        // cvv
        calculatecvvCB.selectedProperty().addListener((observable, oldValue, newValue) -> {
            calculateCvv = newValue;
            logger.info("calculate cvv: " + calculateCvv);
        });

        // string length
        stringLengthTF.focusedProperty().addListener((observable, oldValue, newValue) -> {
            if (newValue) {
                return;
            }
            // focus lost
            String value = stringLengthTF.getText();
            if (ValidationUtil.isEmpty(value)) {
                GUIController.showAlert(Alert.AlertType.ERROR, WARNING_MISSING_STRING_LENGTH);
                resetStringLength();
                logAlert(WARNING_MISSING_STRING_LENGTH);
                return;
            }

            Integer parsed = parseIntOrNull(value);
            if (parsed == null) {
                logAlert("stringlengthTf: " + WARNING_ONLY_NUMBERS_ALLOWED);
                GUIController.showAlert(Alert.AlertType.ERROR, WARNING_ONLY_NUMBERS_ALLOWED);
                // do not parse rejected input; fall back to the default instead
                resetStringLength();
            } else {
                stringLength = parsed;
            }
        });

        computeNgramsB.setOnAction(e -> {
            compute();
            logger.info("compute button");
        });

        helpH.setOnAction(e -> openHelpWebsite());
    }

    /**
     * Puts the default string length (1) into both the text field and the stored value.
     */
    private void resetStringLength() {
        stringLengthTF.setText("1");
        stringLength = 1;
    }

    /**
     * @return the parsed integer, or {@code null} when the text is not accepted as a number
     */
    private static Integer parseIntOrNull(String value) {
        if (!ValidationUtil.isNumber(value)) {
            return null;
        }
        try {
            return Integer.parseInt(value);
        } catch (NumberFormatException ignored) {
            // accepted by isNumber but not representable as int - treat as invalid
            return null;
        }
    }

    /**
     * case a: values for combo boxes can change after a corpus change
     * <ul>
     * <li>different corpus type - reset all fields so no old values remain</li>
     * <li>same corpus type, different subset - keep</li>
     * </ul>
     * <p>
     * case b: values for combo boxes can change after a header scan
     * <ul>
     * <li>at first, fields are populated by corpus type defaults</li>
     * <li>after, with gathered data</li>
     * </ul>
     * <p></p>
     * ngrams: 1
     * calculateFor: word
     * msd:
     * taxonomy:
     * skip: 0
     * iscvv: false
     * string length: 1
     */
    public void populateFields() {
        // corpus changed if: current one is null (this is first run of the app)
        // or if currentCorpus != gui's corpus
        CorpusType newCorpusType = corpus.getCorpusType();
        boolean corpusChanged = currentCorpusType == null
                || currentCorpusType != newCorpusType;

        // TODO: check for GOS, GIGAFIDA, SOLAR...
        // refresh and:
        // TODO if current value != null && is in new calculateFor ? keep : otherwise reset
        if (calculateFor == null) {
            calculateForRB.selectToggle(lemmaRB);
            // use the button text; Toggle.toString() is not a label the factory can resolve
            calculateFor = CalculateFor.factory(lemmaRB.getText());
        }

        if (!filter.hasMsd()) {
            // if current corpus doesn't have msd data, disable this field
            msd = new ArrayList<>();
            msdTF.setText("");
            msdTF.setDisable(true);
            logger.info("no msd data");
        } else if (ValidationUtil.isEmpty(msd) || corpusChanged) {
            // msd has not been set previously
            // or msd has been set but the corpus changed -> reset
            msd = new ArrayList<>();
            msdTF.setText("");
            msdTF.setDisable(false);
            logger.info("msd reset");
        } else {
            // if msd has been set, but corpus type remained the same, we can keep any set msd value
            msdTF.setText(StringUtils.join(msdStrings, " "));
            msdTF.setDisable(false);
            logger.info("msd kept");
        }

        // TODO: taxonomy: refresh and keep if in new taxonomy, otherwise empty (no selection)

        // keep calculateCvv
        calculatecvvCB.setSelected(calculateCvv);

        // keep string length if set
        if (stringLength != null) {
            stringLengthTF.setText(String.valueOf(stringLength));
        } else {
            resetStringLength();
        }

        // TODO: trigger on rescan
        // see if we read taxonomy from headers, otherwise use default values for given corpus
        ObservableList<String> tax = corpus.getTaxonomy();
        taxonomyCCBValues = tax != null ? tax : Taxonomy.getDefaultForComboBox(newCorpusType);
        // replace (not append) so repeated calls do not duplicate the entries
        taxonomyCCB.getItems().setAll(taxonomyCCBValues);

        // remember the type so the next call can tell whether the corpus type changed
        currentCorpusType = newCorpusType;
    }

    /**
     * Toggles visibility for panes which hold fields for skipgram value (not applicable when calculating for letters) etc.,
     * sets combobox values to what is applicable ...
     *
     * @param mode mode to switch to; {@code null} keeps the current mode
     */
    public void toggleMode(MODE mode) {
        if (mode == null) {
            mode = currentMode;
        }

        logger.info("mode: {}", mode);

        if (mode == MODE.LETTER) {
            paneLetters.setVisible(true);

            // populate with default cvv length value
            if (stringLength == null) {
                resetStringLength();
            } else {
                stringLengthTF.setText(String.valueOf(stringLength));
            }

            // if calculateFor was selected for something other than a word or a lemma -> reset
            if (!(calculateFor == CalculateFor.WORD || calculateFor == CalculateFor.LEMMA)) {
                // if the user selected something else before selecting ngram for letters, reset that choice
                calculateFor = CalculateFor.LEMMA;
                calculateForRB.selectToggle(lemmaRB);
            }
        }

        // override if orth mode, allow only word
        if (corpus.isGosOrthMode()) {
            // TODO change to
            varietyRB.setDisable(true);
            msdTF.setDisable(true);
        } else {
            msdTF.setDisable(false);
            varietyRB.setDisable(false);
        }
    }

    /**
     * Builds a filter from the current field values, validates it and, when valid, starts the analysis.
     * Validation errors are logged and shown in an alert.
     */
    private void compute() {
        Filter filter = new Filter();
        filter.setNgramValue(0);
        filter.setCalculateFor(calculateFor);
        filter.setMsd(msd);
        filter.setTaxonomy(Tax.getTaxonomyCodes(taxonomy, corpus.getCorpusType()));
        filter.setAl(AnalysisLevel.STRING_LEVEL);
        filter.setSkipValue(0);
        filter.setIsCvv(calculateCvv);
        filter.setSolarFilters(solarFiltersMap);
        filter.setStringLength(stringLength);

        String message = Validation.validateForStringLevel(filter);
        if (message == null) {
            // no errors
            logger.info("Executing: {}", filter);
            StatisticsNew statistic = new StatisticsNew(corpus, filter, useDb);
            execute(statistic);
        } else {
            logAlert(message);
            showAlert(Alert.AlertType.ERROR, "Prosim izpolnite polja:", message);
        }
    }

    private void openHelpWebsite() {
        hostService.showDocument(Messages.HELP_URL);
    }

    private void logAlert(String alert) {
        logger.info("alert: " + alert);
    }

    public Corpus getCorpus() {
        return corpus;
    }

    /**
     * Sets the corpus to analyse. The selected-filters label is shown (as "/") only for SOLAR corpora.
     */
    public void setCorpus(Corpus corpus) {
        this.corpus = corpus;

        if (corpus.getCorpusType() != CorpusType.SOLAR) {
            setSelectedFiltersLabel(null);
        } else {
            setSelectedFiltersLabel("/");
        }
    }

    /**
     * Shows the solar filter labels with the given content, or hides them when {@code content} is {@code null}.
     */
    public void setSelectedFiltersLabel(String content) {
        if (content != null) {
            solarFilters.setVisible(true);
            selectedFiltersLabel.setVisible(true);
            selectedFiltersLabel.setText(content);
        } else {
            solarFilters.setVisible(false);
            selectedFiltersLabel.setVisible(false);
        }
    }

    /**
     * Reads every detected corpus file on a daemon thread, reporting progress to the progress bar
     * and label, then saves the result and notifies the user.
     *
     * @param statistic statistic to fill; provides the corpus and the filter
     */
    private void execute(StatisticsNew statistic) {
        logger.info("Started execution: {}", statistic.getFilter());

        Collection<File> corpusFiles = statistic.getCorpus().getDetectedCorpusFiles();

        final Task<Void> task = new Task<Void>() {
            @SuppressWarnings("Duplicates")
            @Override
            protected Void call() throws Exception {
                long i = 0;
                for (File f : corpusFiles) {
                    readXML(f.toString(), statistic);
                    i++;
                    this.updateProgress(i, corpusFiles.size());
                    this.updateMessage(String.format(ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y, i, corpusFiles.size(), f.getName()));
                }

                return null;
            }
        };

        ngramProgressBar.progressProperty().bind(task.progressProperty());
        progressLabel.textProperty().bind(task.messageProperty());

        task.setOnSucceeded(e -> {
            try {
                boolean successullySaved = statistic.saveResultToDisk();
                if (successullySaved) {
                    showAlert(Alert.AlertType.INFORMATION, Messages.NOTIFICATION_ANALYSIS_COMPLETED);
                } else {
                    showAlert(Alert.AlertType.INFORMATION, Messages.NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS);
                }
            } catch (UnsupportedEncodingException e1) {
                showAlert(Alert.AlertType.ERROR, ERROR_WHILE_SAVING_RESULTS_TO_CSV);
                logger.error("Error while saving", e1);
            }

            ngramProgressBar.progressProperty().unbind();
            ngramProgressBar.setStyle(Settings.FX_ACCENT_OK);
            progressLabel.textProperty().unbind();
            progressLabel.setText("");
        });

        task.setOnFailed(e -> {
            showAlert(Alert.AlertType.ERROR, ERROR_WHILE_EXECUTING);
            // log the cause of the failure, not the state event
            logger.error("Error while executing", task.getException());
            ngramProgressBar.progressProperty().unbind();
            ngramProgressBar.setProgress(0.0);
            ngramProgressBar.setStyle(Settings.FX_ACCENT_NOK);
            progressLabel.textProperty().unbind();
            progressLabel.setText("");
        });

        final Thread thread = new Thread(task, "task");
        thread.setDaemon(true);
        thread.start();
    }

    public void setSolarFiltersMap(HashMap<String, HashSet<String>> solarFiltersMap) {
        this.solarFiltersMap = solarFiltersMap;
    }

    public void setHostServices(HostServices hostServices) {
        this.hostService = hostServices;
    }
}

View File

@@ -0,0 +1,517 @@
package gui;
import static data.CorpusType.*;
import static gui.GUIController.*;
import static gui.Messages.*;
import static util.Util.*;
import java.io.File;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOCase;
import org.apache.commons.io.filefilter.FileFilterUtils;
import org.apache.commons.io.filefilter.TrueFileFilter;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import alg.XML_processing;
import data.Corpus;
import data.CorpusType;
import data.Enums.solar.SolarFilters;
import data.Tax;
import javafx.collections.ObservableList;
import javafx.concurrent.Task;
import javafx.fxml.FXML;
import javafx.scene.control.*;
import javafx.scene.layout.Pane;
import javafx.stage.DirectoryChooser;
import javafx.stage.Stage;
import javafx.application.HostServices;
public class CorpusTab {
public final static Logger logger = LogManager.getLogger(CorpusTab.class);
public Pane setCorpusWrapperP;
private Stage stage;
@FXML
private Button chooseCorpusLocationB;
private File chosenCorpusLocation;
@FXML
private CheckBox readHeaderInfoChB;
private boolean readHeaderInfo;
@FXML
private CheckBox gosUseOrthChB;
private boolean gosUseOrth;
@FXML
private Button chooseResultsLocationB;
@FXML
private Label chooseCorpusL;
private String chooseCorpusLabelContent;
@FXML
private Label chooseResultsL;
private String chooseResultsLabelContent;
@FXML
private ProgressIndicator locationScanPI;
@FXML
private Hyperlink helpH;
// *** shared ***
private Corpus corpus;
private CorpusType corpusType;
// tabs - used to enable/disable
private Tab stringLevelTabNew2;
private Tab oneWordAnalysisTab;
private Tab characterLevelTab;
private Tab wordFormationTab;
private Tab wordLevelTab;
private Tab filterTab;
private TabPane tabPane;
private StringAnalysisTabNew2 satNew2Controller;
private OneWordAnalysisTab oneWordTabController;
private CharacterAnalysisTab catController;
private FiltersForSolar ffsController;
//private WordFormationTab wfController;
private WordLevelTab wlController;
private HostServices hostService;
/**
 * Wires up the controls of the corpus tab: button actions, tooltips, checkbox listeners and the
 * initial label texts. The progress indicator starts hidden.
 */
public void initialize() {
    stage = new Stage();

    // add listeners
    chooseCorpusLocationB.setOnAction(e -> chooseCorpusLocation());
    chooseCorpusLocationB.setTooltip(new Tooltip(TOOLTIP_chooseCorpusLocationB));
    helpH.setOnAction(e -> openHelpWebsite());

    readHeaderInfoChB.selectedProperty().addListener((observable, oldValue, newValue) -> {
        readHeaderInfo = newValue;
        // the "{}" placeholder is required, otherwise log4j drops the argument
        logger.info("read headers: {}", readHeaderInfo);
    });
    readHeaderInfoChB.setTooltip(new Tooltip(TOOLTIP_readHeaderInfoChB));

    gosUseOrthChB.selectedProperty().addListener((observable, oldValue, newValue) -> {
        gosUseOrth = newValue;
        corpus.setGosOrthMode(gosUseOrth);
        wordFormationTab.setDisable(gosUseOrth);
        // passing null makes each controller re-apply its current mode
        satNew2Controller.toggleMode(null);
        oneWordTabController.toggleMode(null);
        catController.toggleMode(null);
        logger.info("gosUseOrth: {}", gosUseOrth);
    });

    chooseResultsLocationB.setOnAction(e -> chooseResultsLocation(null));

    // set labels and toggle visibility
    toggleGosChBVisibility();

    chooseCorpusLabelContent = Messages.LABEL_CORPUS_LOCATION_NOT_SET;
    chooseCorpusL.setText(chooseCorpusLabelContent);

    chooseResultsLabelContent = Messages.LABEL_RESULTS_LOCATION_NOT_SET;
    chooseResultsL.setText(chooseResultsLabelContent);

    togglePiAndSetCorpusWrapper(false);
}
/**
 * Shows or hides the location-scan progress indicator and moves the corpus wrapper pane
 * to x = 100 while the indicator is shown, x = 10 otherwise.
 *
 * @param piIsActive whether the progress indicator should be visible
 */
private void togglePiAndSetCorpusWrapper(boolean piIsActive) {
    final double wrapperX;
    if (piIsActive) {
        wrapperX = 100.0;
    } else {
        wrapperX = 10.0;
    }
    locationScanPI.setVisible(piIsActive);
    setCorpusWrapperP.setLayoutX(wrapperX);
}
/**
 * Opens the help URL ({@code Messages.HELP_URL}) through the stored host services.
 */
private void openHelpWebsite(){
hostService.showDocument(Messages.HELP_URL);
}
/**
 * In order for a directory to pass as a valid corpus location, following criteria has to be met:
 * <ul>
 * <li>it can't be null</li>
 * <li>it has to be readable</li>
 * <li>it has to contain xml files</li>
 * <li>xml files have to contain valid headers from which we can infer the corpus type</li>
 * <li>corpus type must be one of the expected corpus types - as noted in the @see data.CorpusType.class </li>
 * </ul>
 * <p>
 * Additionally, if the user checks to read taxonomy/filters from the corpus files, that read
 * has to produce a non-empty results list.
 */
private void chooseCorpusLocation() {
    File selectedDirectory = directoryChooser();
    if (selectedDirectory == null || !ValidationUtil.isReadableDirectory(selectedDirectory)) {
        return;
    }
    logger.info("selected corpus dir: {}", selectedDirectory.getAbsolutePath());

    // scan for xml files
    Collection<File> corpusFiles = FileUtils.listFiles(selectedDirectory, FileFilterUtils.suffixFileFilter("xml", IOCase.INSENSITIVE), TrueFileFilter.INSTANCE);

    // the corpus type is only detected when there is at least one xml file
    String chooseCorpusLabelContentTmp = corpusFiles.isEmpty()
            ? null
            : detectCorpusType(corpusFiles, selectedDirectory.getAbsolutePath());

    // no files, or no recognisable corpus type: notify the user and stop
    if (chooseCorpusLabelContentTmp == null) {
        logger.info("alert: {}", WARNING_CORPUS_NOT_FOUND);
        showAlert(Alert.AlertType.ERROR, WARNING_CORPUS_NOT_FOUND, null);
        return;
    }

    // initNewCorpus already stores the chosen location and the detected files on the new corpus
    initNewCorpus(selectedDirectory, corpusFiles);

    chooseCorpusLabelContent = chooseCorpusLabelContentTmp;
    logger.info("corpus dir: {}", corpus.getChosenCorpusLocation().getAbsolutePath());

    if (readHeaderInfo) {
        logger.info("reading header info...");
        readHeaderInfo();
    } else {
        setResults();
        setCorpusForAnalysis();
    }
}
/**
 * Replaces the current corpus with a fresh one for the selected location, so that no data of a
 * previously chosen corpus is carried over, and uses the corpus directory as results location.
 *
 * @param selectedDirectory directory the user picked as corpus location
 * @param corpusFiles       xml files found in that directory
 */
private void initNewCorpus(File selectedDirectory, Collection<File> corpusFiles) {
    Corpus freshCorpus = new Corpus();
    freshCorpus.setCorpusType(corpusType);
    freshCorpus.setDetectedCorpusFiles(corpusFiles);
    freshCorpus.setChosenCorpusLocation(selectedDirectory);
    corpus = freshCorpus;

    // must run after the corpus field is assigned, because the results location is stored on it
    chooseResultsLocation(selectedDirectory);
}
/**
 * Sets the results location on the current corpus and updates the results label.
 *
 * @param dir directory to use; when {@code null} the user is asked to pick one
 */
private void chooseResultsLocation(File dir) {
    // results location can be set either to default value (after selecting valid corpus location) - dir attribute
    // or to a dir picked via directoryChooser (when dir == null)
    File selectedDirectory = dir == null ? directoryChooser() : dir;
    if (selectedDirectory == null) {
        // the chooser was dismissed without a selection
        return;
    }

    String resultsLocationPath = selectedDirectory.getAbsolutePath().concat(File.separator);
    File chosenResultsLocationTmp = new File(resultsLocationPath);

    if (!ValidationUtil.isValidDirectory(chosenResultsLocationTmp)) {
        showAlert(Alert.AlertType.ERROR, WARNING_RESULTS_DIR_NOT_VALID);
        // the "{}" placeholder is required, otherwise log4j drops the argument
        logger.info("alert: {}", WARNING_RESULTS_DIR_NOT_VALID);
        return;
    }

    // NOTE(review): corpus is null until a corpus location has been chosen - confirm the
    // results button cannot be used before that.
    corpus.setChosenResultsLocation(chosenResultsLocationTmp);
    chooseResultsLabelContent = corpus.getChosenResultsLocation().getAbsolutePath();
    chooseResultsL.setText(chooseResultsLabelContent);
    logger.info("results dir: " + chooseResultsLabelContent);
}
/**
 * Final UI update after a corpus was accepted: refreshes the visibility of the GOS checkbox and
 * shows the corpus label. The default results path is only logged here, not stored.
 */
private void setResults() {
    // if everything is ok
    // check and enable checkbox if GOS
    toggleGosChBVisibility();

    // set default results location
    String defaultResultsLocationPath = corpus.getChosenCorpusLocation().getAbsolutePath();
    // the "{}" placeholder is required, otherwise log4j drops the argument
    logger.info("setting default results location to: {}", defaultResultsLocationPath);

    chooseCorpusL.setText(chooseCorpusLabelContent);
}
/**
 * Scans the headers of all detected corpus files on a background thread.
 * <ul>
 * <li>GIGAFIDA, GOS, CCKRES: collects the taxonomy values found and stores the matching names on the corpus</li>
 * <li>SOLAR: collects the filter values per key and stores them on the corpus</li>
 * </ul>
 * On success the corpus is handed over for analysis; when nothing was found the user is alerted
 * and the corpus is not handed over. The progress indicator is shown for the duration of the scan.
 */
private void readHeaderInfo() {
    CorpusType corpusType = corpus.getCorpusType();
    Collection<File> corpusFiles = corpus.getDetectedCorpusFiles();
    togglePiAndSetCorpusWrapper(true);
    chooseCorpusL.setText(LABEL_SCANNING_CORPUS);

    logger.info("reading header data for {}", corpusType);

    if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.GOS || corpusType == CorpusType.CCKRES) {
        boolean corpusIsSplit = corpusFiles.size() > 1;

        final Task<HashSet<String>> task = new Task<HashSet<String>>() {
            @Override
            protected HashSet<String> call() throws Exception {
                HashSet<String> values = new HashSet<>();
                long i = 0;

                if (!corpusIsSplit) {
                    // single file: progress cannot be measured, show it as indeterminate
                    updateProgress(-1.0f, -1.0f);
                }

                for (File file : corpusFiles) {
                    values.addAll((Collection<? extends String>) XML_processing.readXmlHeaderTaxonomyAndFilters(file.getAbsolutePath(), corpusIsSplit, corpusType));
                    i++;

                    if (corpusIsSplit) {
                        updateProgress(i, corpusFiles.size());
                    }
                }

                updateProgress(1.0f, 1.0f);
                return values;
            }
        };

        locationScanPI.progressProperty().bind(task.progressProperty());

        task.setOnSucceeded(e -> {
            ObservableList<String> readTaxonomy = Tax.getTaxonomyForComboBox(corpusType, task.getValue());

            if (ValidationUtil.isEmpty(readTaxonomy)) {
                // if no taxonomy found alert the user and keep other tabs disabled
                logger.info("No taxonomy found in headers.");
                GUIController.showAlert(Alert.AlertType.ERROR, WARNING_NO_TAXONOMY_FOUND);
            } else {
                // set taxonomy, update label
                corpus.setTaxonomy(readTaxonomy);
                corpus.setHeaderRead(true);
                chooseCorpusL.setText(chooseCorpusLabelContent);
                setResults();
                setCorpusForAnalysis();
            }

            togglePiAndSetCorpusWrapper(false);
        });

        task.setOnCancelled(e -> togglePiAndSetCorpusWrapper(false));
        task.setOnFailed(e -> onHeaderScanFailed(task.getException()));

        startHeaderScan(task);
    } else if (corpusType == CorpusType.SOLAR) {
        // many many fields
        boolean corpusIsSplit = corpusFiles.size() > 1;

        final Task<HashMap<String, HashSet<String>>> task = new Task<HashMap<String, HashSet<String>>>() {
            @Override
            protected HashMap<String, HashSet<String>> call() throws Exception {
                HashMap<String, HashSet<String>> values = new HashMap<>();
                long i = 0;

                if (!corpusIsSplit) {
                    // single file: progress cannot be measured, show it as indeterminate
                    updateProgress(-1.0f, -1.0f);
                }

                for (File file : corpusFiles) {
                    HashMap<String, HashSet<String>> tmpvalues = (HashMap<String, HashSet<String>>) XML_processing.readXmlHeaderTaxonomyAndFilters(file.getAbsolutePath(), corpusIsSplit, corpusType);

                    // update final results
                    for (Map.Entry<String, HashSet<String>> entry : tmpvalues.entrySet()) {
                        if (values.containsKey(entry.getKey())) {
                            values.get(entry.getKey()).addAll(entry.getValue());
                        } else {
                            values.put(entry.getKey(), entry.getValue());
                        }
                    }

                    i++;

                    if (corpusIsSplit) {
                        updateProgress(i, corpusFiles.size());
                    }
                }

                updateProgress(1.0f, 1.0f);
                return values;
            }
        };

        locationScanPI.progressProperty().bind(task.progressProperty());

        task.setOnSucceeded(e -> {
            HashMap<String, HashSet<String>> values = task.getValue();

            if (ValidationUtil.isEmpty(values)) {
                // if no filters found alert the user and keep other tabs disabled
                logger.info("No solar filters found in headers.");
                GUIController.showAlert(Alert.AlertType.ERROR, WARNING_NO_SOLAR_FILTERS_FOUND);
            } else {
                HashMap<String, ObservableList<String>> filtersForComboBoxes = SolarFilters.getFiltersForComboBoxes(values);
                // set filters, update label
                corpus.setSolarFiltersForXML(values);
                corpus.setSolarFilters(filtersForComboBoxes);
                corpus.setHeaderRead(true);
                chooseCorpusL.setText(chooseCorpusLabelContent);
                setResults();
                setCorpusForAnalysis();
            }

            togglePiAndSetCorpusWrapper(false);
        });

        task.setOnCancelled(e -> togglePiAndSetCorpusWrapper(false));
        task.setOnFailed(e -> onHeaderScanFailed(task.getException()));

        startHeaderScan(task);
    } else {
        // no header scan exists for this corpus type; do not leave the indicator spinning
        logger.warn("no header scan available for {}", corpusType);
        chooseCorpusL.setText(chooseCorpusLabelContent);
        togglePiAndSetCorpusWrapper(false);
    }
}

/**
 * Logs why a header scan failed and hides the progress indicator.
 */
private void onHeaderScanFailed(Throwable cause) {
    logger.error("Error while reading corpus headers", cause);
    togglePiAndSetCorpusWrapper(false);
}

/**
 * Runs the given header-scan task on a daemon thread named "task".
 */
private void startHeaderScan(Task<?> task) {
    final Thread thread = new Thread(task, "task");
    thread.setDaemon(true);
    thread.start();
}
/**
 * Validates the current corpus and, when it is valid, enables the analysis tabs and
 * (re)initialises their controllers with it. For a SOLAR corpus the filter tab is shown and
 * its filters are initialised; for any other corpus type the filter tab is disabled and
 * removed. When validation fails, the validation errors are shown in an error alert and
 * nothing else changes.
 */
private void setCorpusForAnalysis() {
    if (!corpus.validate()) {
        GUIController.showAlert(Alert.AlertType.ERROR, corpus.getValidationErrorsToString());
        return;
    }

    // new statistic, enable tabs...
    stringLevelTabNew2.setDisable(false);
    satNew2Controller.setCorpus(corpus);
    satNew2Controller.init();

    oneWordAnalysisTab.setDisable(false);
    oneWordTabController.setCorpus(corpus);
    oneWordTabController.init();

    characterLevelTab.setDisable(false);
    catController.setCorpus(corpus);
    catController.init();

    // NOTE(review): the word-formation feature looks disabled (its controller calls are
    // commented out), so this tab may never have been handed to this controller - confirm.
    // Guarding keeps the remaining tabs from being skipped by a NullPointerException.
    if (wordFormationTab != null) {
        wordFormationTab.setDisable(false);
    }
    //wfController.setCorpus(corpus);
    //wfController.init();

    wordLevelTab.setDisable(false);
    wlController.setCorpus(corpus);
    wlController.init();

    if (corpus.getCorpusType() == CorpusType.SOLAR) {
        filterTab.setDisable(false);
        // This method runs on every corpus selection; only insert the tab when it is not
        // already present, otherwise the same Tab would be added to the pane twice.
        if (!tabPane.getTabs().contains(filterTab)) {
            tabPane.getTabs().add(1, filterTab);
        }
        ffsController.setCorpus(corpus);
        ffsController.initFilters();
    } else {
        filterTab.setDisable(true);
        tabPane.getTabs().remove(filterTab);
    }
}
/**
 * Shows a directory chooser dialog owned by {@code stage}.
 * The dialog starts in the directory returned by {@code getWorkingDirectory()} when that
 * is not {@code null}.
 *
 * @return whatever {@code DirectoryChooser.showDialog} returns for the user's choice
 */
private File directoryChooser() {
    final DirectoryChooser chooser = new DirectoryChooser();

    // open in the folder where the jar is located if possible
    final File startDir = getWorkingDirectory();
    if (startDir != null) {
        chooser.setInitialDirectory(startDir);
    }

    final File chosen = chooser.showDialog(stage);
    return chosen;
}
/**
 * Shows the GOS-specific checkbox only while a corpus of type GOS is set;
 * hides it in every other case (no corpus, no corpus type, different type).
 */
private void toggleGosChBVisibility() {
    boolean gosSelected = false;
    if (corpus != null) {
        // comparing an enum reference with == is already false for null
        gosSelected = corpus.getCorpusType() == CorpusType.GOS;
    }
    gosUseOrthChB.setVisible(gosSelected);
}
/**
 * Tries to recognise the corpus type from the {@code title} header tag of the first
 * corpus file and, on success, stores it on the corpus.
 *
 * Only the first file is read; maybe later do all, if the toll on resources is acceptable.
 *
 * @param corpusFiles    detected corpus files; the first one is inspected
 * @param corpusLocation location text placed on the first line of the returned summary
 * @return a multi-line summary (location, number of files, corpus type), or {@code null}
 *         when there is no file to inspect, no title could be read, or the title does not
 *         contain any recognised corpus name
 */
private String detectCorpusType(Collection<File> corpusFiles, String corpusLocation) {
    corpusType = null;

    // nothing to inspect -> treat as "not recognised" instead of failing on iterator().next()
    if (corpusFiles == null || corpusFiles.isEmpty()) {
        return null;
    }

    File firstFile = corpusFiles.iterator().next();
    String rawTitle = XML_processing.readXMLHeaderTag(firstFile.getAbsolutePath(), "title");
    // NOTE(review): readXMLHeaderTag is not visible here; guard in case it yields null
    if (rawTitle == null) {
        return null;
    }
    String title = rawTitle.toLowerCase();

    // check if XML file's title contains any of recognized corpus titles
    if (title.contains(SOLAR.getNameLowerCase())) {
        corpusType = SOLAR;
    } else if (title.contains(GIGAFIDA.getNameLowerCase())) {
        corpusType = GIGAFIDA;
    } else if (title.contains(CCKRES.getNameLowerCase())) {
        corpusType = CCKRES;
    } else if (title.contains(GOS.getNameLowerCase())) {
        corpusType = GOS;
    }

    if (corpusType == null) {
        return null;
    }

    corpus.setCorpusType(corpusType);

    String result = corpusLocation
            + "\n"
            + String.format(NOTIFICATION_FOUND_X_FILES, corpusFiles.size())
            + "\n"
            + String.format("Korpus: %s", corpusType.toString());

    logger.debug(result);
    return result;
}
/** @return the corpus currently held by this controller */
public Corpus getCorpus() {
    return this.corpus;
}

/** Replaces the corpus held by this controller. */
public void setCorpus(Corpus corpus) {
    this.corpus = corpus;
}

// ---- tabs handed over by the owning controller ----

public void setStringLevelTabNew2(Tab stringLevelTabNew2) {
    this.stringLevelTabNew2 = stringLevelTabNew2;
}

public void setOneWordAnalysisTab(Tab oneWordAnalysisTab) {
    this.oneWordAnalysisTab = oneWordAnalysisTab;
}

public void setCharacterLevelTab(Tab characterLevelTab) {
    this.characterLevelTab = characterLevelTab;
}

public void setWordLevelTab(Tab wordLevelTab) {
    this.wordLevelTab = wordLevelTab;
}

public void setWordFormationTab(Tab wordFormationTab) {
    this.wordFormationTab = wordFormationTab;
}

public void setFilterTab(Tab filterTab) {
    this.filterTab = filterTab;
}

public void setTabPane(TabPane tabPane) {
    this.tabPane = tabPane;
}

// ---- controllers of the other tabs ----

public void setFfsController(FiltersForSolar ffsController) {
    this.ffsController = ffsController;
}

public void setSatNew2Controller(StringAnalysisTabNew2 satNew2Controller) {
    this.satNew2Controller = satNew2Controller;
}

public void setOneWordTabController(OneWordAnalysisTab oneWordTabController) {
    this.oneWordTabController = oneWordTabController;
}

public void setCatController(CharacterAnalysisTab catController) {
    this.catController = catController;
}

/*public void setWfController(WordFormationTab wfController) {
    this.wfController = wfController;
}*/

public void setWlController(WordLevelTab wlController) {
    this.wlController = wlController;
}

// ---- services ----

/** Stores the host services handle in the {@code hostService} field. */
public void setHostServices(HostServices hostServices) {
    this.hostService = hostServices;
}
}

View File

@@ -0,0 +1,187 @@
package gui;
import static data.Enums.solar.SolarFilters.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import javafx.application.HostServices;
import javafx.scene.control.Hyperlink;
import org.controlsfx.control.CheckComboBox;
import data.Corpus;
import javafx.collections.ListChangeListener;
import javafx.collections.ObservableList;
import javafx.fxml.FXML;
import javafx.scene.control.Label;
import javafx.scene.layout.AnchorPane;
import util.Util;
/**
 * Controller of the filter tab used for the Solar corpus.
 *
 * It keeps track of the items checked in each filter combo box, renders them into a
 * summary label and pushes both the summary text and the selected filter values to the
 * analysis tab controllers.
 */
public class FiltersForSolar {
    @FXML
    public AnchorPane solarFiltersTabPane;
    @FXML
    public CheckComboBox<String> solarRegijaCCB;
    @FXML
    public CheckComboBox<String> solarPredmetCCB;
    @FXML
    public CheckComboBox<String> solarRazredCCB;
    @FXML
    public CheckComboBox<String> solarLetoCCB;
    @FXML
    public CheckComboBox<String> solarSolaCCB;
    @FXML
    public CheckComboBox<String> solarVrstaBesedilaCCB;
    @FXML
    public Label selectedFiltersLabel;
    @FXML
    private Hyperlink helpH;

    /** Filter key -> live list of the items currently checked in the matching combo box. */
    private HashMap<String, ObservableList<String>> selectedFilters;
    private Corpus corpus;
    private StringAnalysisTabNew2 satNew2Controller;
    private OneWordAnalysisTab oneWordTabController;
    private CharacterAnalysisTab catController;
    //private WordFormationTab wfController;
    private WordLevelTab wlController;
    private HostServices hostService;

    /**
     * Registers a checked-items listener on every filter combo box and wires the help link.
     */
    public void initialize() {
        selectedFilters = new HashMap<>();

        trackCheckedItems(REGIJA, solarRegijaCCB);
        trackCheckedItems(PREDMET, solarPredmetCCB);
        trackCheckedItems(RAZRED, solarRazredCCB);
        trackCheckedItems(LETO, solarLetoCCB);
        trackCheckedItems(SOLA, solarSolaCCB);
        trackCheckedItems(TIP, solarVrstaBesedilaCCB);

        helpH.setOnAction(e -> openHelpWebsite());
    }

    /**
     * Whenever the checked items of {@code comboBox} change, records them under
     * {@code filterKey} and refreshes the summary label and the dependent controllers.
     */
    private void trackCheckedItems(String filterKey, CheckComboBox<String> comboBox) {
        comboBox.getCheckModel().getCheckedItems().addListener((ListChangeListener<String>) c -> {
            selectedFilters.put(filterKey, comboBox.getCheckModel().getCheckedItems());
            updateSolarFilterLabel();
        });
    }

    /**
     * Fills every filter combo box with the values the corpus provides for it,
     * sorted in natural string order.
     */
    public void initFilters() {
        populate(solarRegijaCCB, corpus.getSolarFilters().get(REGIJA));
        populate(solarPredmetCCB, corpus.getSolarFilters().get(PREDMET));
        populate(solarRazredCCB, corpus.getSolarFilters().get(RAZRED));
        populate(solarLetoCCB, corpus.getSolarFilters().get(LETO));
        populate(solarSolaCCB, corpus.getSolarFilters().get(SOLA));
        populate(solarVrstaBesedilaCCB, corpus.getSolarFilters().get(TIP));
    }

    /**
     * Replaces the items of {@code comboBox} with a sorted copy of {@code values}.
     * The sort is done on a copy before the items are set: calling {@code sorted()} on the
     * item list only returns a new sorted view and leaves the list itself unordered.
     * A missing value list empties the combo box.
     */
    private static void populate(CheckComboBox<String> comboBox, java.util.Collection<? extends String> values) {
        if (values == null) {
            comboBox.getItems().clear();
            return;
        }
        ArrayList<String> sortedValues = new ArrayList<>(values);
        sortedValues.sort(String::compareTo);
        comboBox.getItems().setAll(sortedValues);
    }

    /**
     * Rebuilds the summary text from the selected filters and forwards the selection,
     * as plain sets, to the analysis tab controllers.
     */
    private void updateSolarFilterLabel() {
        if (Util.isMapEmpty(selectedFilters)) {
            setSolarFilterLabelText("/");
        } else {
            StringBuilder allFilters = new StringBuilder();

            for (Map.Entry<String, ObservableList<String>> entry : selectedFilters.entrySet()) {
                ArrayList<String> values = new ArrayList<>(entry.getValue());

                if (!values.isEmpty()) {
                    // one paragraph per filter: "<key>: v1, v2, ..."
                    allFilters.append(entry.getKey())
                            .append(": ")
                            .append(String.join(", ", values))
                            .append("\n\n");
                }
            }

            setSolarFilterLabelText(allFilters.toString());
        }

        HashMap<String, HashSet<String>> solarFiltersMap = new HashMap<>();
        for (Map.Entry<String, ObservableList<String>> e : selectedFilters.entrySet()) {
            solarFiltersMap.put(e.getKey(), new HashSet<>(e.getValue()));
        }

        satNew2Controller.setSolarFiltersMap(solarFiltersMap);
        oneWordTabController.setSolarFiltersMap(solarFiltersMap);
        catController.setSolarFiltersMap(solarFiltersMap);
        //wfController.setSolarFiltersMap(solarFiltersMap);
        wlController.setSolarFiltersMap(solarFiltersMap);
    }

    private void openHelpWebsite() {
        hostService.showDocument(Messages.HELP_URL);
    }

    /** Shows {@code content} on this tab's label and on the label of every analysis tab. */
    private void setSolarFilterLabelText(String content) {
        selectedFiltersLabel.setText(content);
        satNew2Controller.setSelectedFiltersLabel(content);
        oneWordTabController.setSelectedFiltersLabel(content);
        catController.setSelectedFiltersLabel(content);
        //wfController.setSelectedFiltersLabel(content);
        wlController.setSelectedFiltersLabel(content);
    }

    public void setCorpus(Corpus corpus) {
        this.corpus = corpus;
    }

    public void setSatNew2Controller(StringAnalysisTabNew2 satNew2Controller) {
        this.satNew2Controller = satNew2Controller;
    }

    public void setOneWordTabController(OneWordAnalysisTab oneWordTabController) {
        this.oneWordTabController = oneWordTabController;
    }

    public void setCatController(CharacterAnalysisTab catController) {
        this.catController = catController;
    }

    /*public void setWfController(WordFormationTab wfController) {
        this.wfController = wfController;
    }*/

    public void setWlController(WordLevelTab wlController) {
        this.wlController = wlController;
    }

    public void setHostServices(HostServices hostServices) {
        this.hostService = hostServices;
    }
}

View File

@@ -0,0 +1,150 @@
package gui;
import java.io.IOException;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.kordamp.ikonli.fontawesome.FontAwesome;
import org.kordamp.ikonli.javafx.FontIcon;
import data.Corpus;
import javafx.application.Application;
import javafx.fxml.FXML;
import javafx.fxml.FXMLLoader;
import javafx.scene.Parent;
import javafx.scene.Scene;
import javafx.scene.control.Alert;
import javafx.scene.control.Tab;
import javafx.scene.control.TabPane;
import javafx.stage.Stage;
/**
 * Application entry point and root controller of the main window.
 *
 * {@link #start(Stage)} loads {@code /GUI.fxml} and shows it; {@link #initialize()} creates
 * the shared {@link Corpus} and hands it, together with tabs, sibling controllers and host
 * services, to the individual tab controllers.
 */
public class GUIController extends Application {
    public final static Logger logger = LogManager.getLogger(GUIController.class);

    @FXML
    public Tab StringLevelTabNew2;
    @FXML
    public Tab OneWordAnalysisTab;
    @FXML
    public Tab CharacterLevelTabNew;
    @FXML
    public Tab corpusTab;
    public TabPane tabPane;

    @FXML
    private CharacterAnalysisTab catController;
    @FXML
    private static Parent sat;
    @FXML
    private StringAnalysisTabNew2 satNew2Controller;
    @FXML
    private static Parent satNew2;
    @FXML
    private OneWordAnalysisTab oneWordTabController;
    @FXML
    private static Parent oneWordTab;
    @FXML
    private CorpusTab ctController;
    @FXML
    private Parent ct;
    //@FXML
    //private WordFormationTab wfController;
    @FXML
    private Parent wf;
    @FXML
    private WordLevelTab wlController;
    @FXML
    private Parent wl;
    @FXML
    private FiltersForSolar ffsController;
    @FXML
    private Parent ffs;
    @FXML
    private SelectedFiltersPane sfpController;
    @FXML
    private Parent sfp;

    @FXML
    public Tab stringLevelTab;
    @FXML
    public Tab wordLevelTab;
    /*@FXML
    public Tab wordFormationTab;*/
    @FXML
    public Tab filterTab;

    public Stage stage;
    private Corpus corpus;

    /**
     * Loads the main FXML layout into an 800x600 scene and shows it on the primary stage.
     *
     * @throws IOException if the FXML cannot be loaded
     */
    @Override
    public void start(Stage primaryStage) throws IOException {
        Parent root = FXMLLoader.load(getClass().getResource("/GUI.fxml"));
        primaryStage.setTitle("GUI");
        Scene scene = new Scene(root, 800, 600);
        // https://github.com/dicolar/jbootx
        // scene.getStylesheets().add(GUIController.class.getResource("bootstrap3.css").toExternalForm())
        primaryStage.setScene(scene);
        stage = primaryStage;
        primaryStage.show();
    }

    public static void main(String[] args) {
        launch(args);
    }

    /**
     * Creates the shared corpus and wires the tab controllers to it and to each other,
     * sets the tab icons and hides the filter tab.
     */
    public void initialize() {
        corpus = new Corpus();

        // corpus tab: needs the tabs it enables and the controllers it initialises
        ctController.setCorpus(corpus);
        ctController.setFilterTab(filterTab);
        ctController.setStringLevelTabNew2(StringLevelTabNew2);
        ctController.setOneWordAnalysisTab(OneWordAnalysisTab);
        ctController.setCharacterLevelTab(CharacterLevelTabNew);
        ctController.setSatNew2Controller(satNew2Controller);
        ctController.setOneWordTabController(oneWordTabController);
        ctController.setCatController(catController);
        //ctController.setWfController(wfController);
        ctController.setWlController(wlController);
        ctController.setTabPane(tabPane);
        ctController.setFfsController(ffsController);
        //ctController.setWordFormationTab(wordFormationTab);
        ctController.setWordLevelTab(wordLevelTab);
        ctController.setHostServices(getHostServices());

        // analysis tabs
        satNew2Controller.setCorpus(corpus);
        satNew2Controller.setHostServices(getHostServices());
        oneWordTabController.setCorpus(corpus);
        oneWordTabController.setHostServices(getHostServices());
        catController.setCorpus(corpus);
        catController.setHostServices(getHostServices());
        //wfController.setCorpus(corpus);
        //wfController.setHostServices(getHostServices());
        wlController.setCorpus(corpus);
        wlController.setHostServices(getHostServices());

        // filter tab: pushes the selected filters to the analysis tabs
        ffsController.setSatNew2Controller(satNew2Controller);
        ffsController.setOneWordTabController(oneWordTabController);
        ffsController.setCatController(catController);
        //ffsController.setWfController(wfController);
        ffsController.setWlController(wlController);
        ffsController.setHostServices(getHostServices());

        // set tab icons
        corpusTab.setGraphic(new FontIcon(FontAwesome.COG));
        filterTab.setGraphic(new FontIcon(FontAwesome.FILTER));

        // hide filter tab
        tabPane.getTabs().removeAll(filterTab);
    }

    /**
     * Shows a modal alert and waits until it is closed.
     *
     * @param alertType   alert type; also selects the window title from
     *                    {@code Messages.windowTitles} when a title is registered for it
     * @param headerText  header text, {@code null} is shown as an empty string
     * @param contentText content text, {@code null} is shown as an empty string
     */
    static void showAlert(Alert.AlertType alertType, String headerText, String contentText) {
        Alert alert = new Alert(alertType);

        // Not every alert type has a registered title; in that case keep the title the
        // Alert was created with instead of overwriting it with null.
        String title = Messages.windowTitles.get(alertType);
        if (title != null) {
            alert.setTitle(title);
        }

        alert.setHeaderText(headerText != null ? headerText : "");
        alert.setContentText(contentText != null ? contentText : "");
        alert.showAndWait();
    }

    /** Shows a modal alert with a header and no content text. */
    static void showAlert(Alert.AlertType alertType, String headerText) {
        showAlert(alertType, headerText, null);
    }
}

View File

@@ -0,0 +1,74 @@
package gui;
import static javafx.scene.control.Alert.AlertType.*;
import java.util.HashMap;
import javafx.scene.control.Alert;
/**
 * User-facing texts (warnings, errors, notifications, labels, tooltips) shown by the GUI,
 * plus the window titles used for alerts.
 */
public class Messages {

    // warnings & errors
    public static final String WARNING_CORPUS_NOT_FOUND = "V izbranem direktoriju ni ustreznih korpusnih datotek.";
    public static final String WARNING_RESULTS_DIR_NOT_VALID = "Za dostop do izbranega direktorija nimate potrebnih pravic.";
    public static final String WARNING_DIFFERING_NGRAM_LEVEL_AND_FILTER_TOKENS = "Izbran nivo ngramov in vpisano št. besed v filtru se ne ujemata.";
    public static final String WARNING_DIFFERING_NGRAM_LEVEL_AND_FILTER_TOKENS_INFO = "Izberite drugo število ali popravite filter.";
    public static final String WARNING_WORD_OR_LEMMA = "Izberite, če želite statistiko izračunati za besede ali leme.";
    public static final String WARNING_ONLY_NUMBERS_ALLOWED = "Prosim vnesite veljavno število.";
    public static final String WARNING_MISMATCHED_NGRAM_AND_TOKENS_VALUES = "Število za ngram (%d) in število msd oznak (%d) se morata ujemati.";
    public static final String WARNING_MISSING_STRING_LENGTH = "Dolžina niza mora biti večja od 0. Vstavljena je privzeta vrednost (1).";
    public static final String WARNING_NO_TAXONOMY_FOUND = "Iz korpusnih datotek ni bilo moč razbrati taksonomije. Prosim izberite drugo lokacijo ali korpus.";
    public static final String WARNING_NO_SOLAR_FILTERS_FOUND = "Iz korpusnih datotek ni bilo moč razbrati filtrov. Prosim izberite drugo lokacijo ali korpus.";
    public static final String ERROR_WHILE_EXECUTING = "Prišlo je do napake med izvajanjem.";
    // grammar fixed: "med" takes the instrumental case ("shranjevanjem"), as in the message above
    public static final String ERROR_WHILE_SAVING_RESULTS_TO_CSV = "Prišlo je do napake med shranjevanjem rezultatov.";

    // missing
    public static final String MISSING_NGRAM_LEVEL = "N-gram nivo";
    public static final String MISSING_CALCULATE_FOR = "Izračunaj za";
    public static final String MISSING_SKIP = "";
    public static final String MISSING_STRING_LENGTH = "Dolžina niza";
    public static final String MISMATCHED_STRING_LENGTH_AND_MSD_REGEX = "Neujemajoča dolžina niza in regex filter";

    // general notifications - static content/set only once
    public static final String NOTIFICATION_FOUND_X_FILES = "Št. najdenih datotek: %d";
    public static final String NOTIFICATION_ANALYSIS_COMPLETED = "Analiza je zaključena, rezultati so shranjeni.";
    public static final String NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS = "Analiza je zaključena, vendar ni bilo moč izračunati statistike, ki bi ustrezala vsem navedenim pogojem.";
    public static final String RESULTS_PATH_SET_TO_DEFAULT = "Lokacija za shranjevanje rezultatov je nastavljena na lokacijo korpusa.";

    // ongoing notifications - displayed while processing, dynamically changing
    public static final String ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y = "Analiziram datoteko %d od %d (%s)";

    // Labels
    public static final String LABEL_CORPUS_LOCATION_NOT_SET = "Lokacija korpusa ni nastavljena";
    public static final String LABEL_RESULTS_LOCATION_NOT_SET = "Lokacija za shranjevanje rezultatov ni nastavljena";
    public static final String LABEL_RESULTS_CORPUS_TYPE_NOT_SET = "Vrsta korpusa ni nastavljena";
    public static final String LABEL_SCANNING_CORPUS = "Iskanje in analiza korpusnih datotek...";
    public static final String LABEL_SCANNING_SINGLE_FILE_CORPUS = "Analiza vnosa ";
    public static final String COMPLETED = "končano";

    // Tooltips
    public static final String TOOLTIP_chooseCorpusLocationB = "Izberite mapo v kateri se nahaja korpus. Program izbrano mapo preišče rekurzivno, zato bodite pozorni, da ne izberete mape z več korpusi ali z mnogo datotekami, ki niso del korpusa.";
    public static final String TOOLTIP_readHeaderInfoChB = "Če izberete to opcijo, se bo iz headerjev korpusa prebrala razpoložljiva taksonomija oz. filtri (korpus Šolar). Ta operacija lahko traja dlje časa, sploh če je korpus združen v eni sami datoteki.";

    // Does not really belong here. TODO move somewhere else in future
    public static final String HELP_URL = "http://slovnica.ijs.si/";

    // helper maps
    /**
     * Window titles per alert type:
     * ERROR = "Napaka"
     * WARNING = "Opozorilo"
     * CONFIRMATION = "Potrdilo"
     *
     * Alert types not listed here have no entry, so a lookup for them returns {@code null}.
     */
    static final HashMap<Alert.AlertType, String> windowTitles = new HashMap<>();

    static {
        // automatically set window's title
        windowTitles.put(ERROR, "Napaka");
        windowTitles.put(WARNING, "Opozorilo");
        windowTitles.put(CONFIRMATION, "Potrdilo");
    }
}

View File

@@ -0,0 +1,389 @@
package gui;
import data.*;
import javafx.application.HostServices;
import javafx.collections.FXCollections;
import javafx.collections.ListChangeListener;
import javafx.collections.ObservableList;
import javafx.concurrent.Task;
import javafx.fxml.FXML;
import javafx.scene.control.*;
import javafx.scene.layout.Pane;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.controlsfx.control.CheckComboBox;
import java.io.File;
import java.io.UnsupportedEncodingException;
import java.util.*;
import java.util.regex.Pattern;
import static alg.XML_processing.readXML;
import static gui.GUIController.showAlert;
import static gui.Messages.*;
/**
 * Controller of the one-word analysis tab: collects the "calculate for", msd and taxonomy
 * settings, builds a {@link Filter} with an n-gram value of 1 and runs the analysis over
 * the detected corpus files on a background task.
 */
@SuppressWarnings("Duplicates")
public class OneWordAnalysisTab {
    public final static Logger logger = LogManager.getLogger(OneWordAnalysisTab.class);

    @FXML
    public Label selectedFiltersLabel;
    @FXML
    public Label solarFilters;

    @FXML
    private TextField msdTF;
    private ArrayList<Pattern> msd;
    private ArrayList<String> msdStrings;

    @FXML
    private CheckComboBox<String> taxonomyCCB;
    private ArrayList<String> taxonomy;

    @FXML
    private ComboBox<String> calculateForCB;
    private CalculateFor calculateFor;

    @FXML
    private Button computeNgramsB;
    @FXML
    public ProgressBar ngramProgressBar;
    @FXML
    public Label progressLabel;
    @FXML
    private Hyperlink helpH;

    private enum MODE {
        LETTER,
        WORD
    }

    private MODE currentMode;
    private Corpus corpus;
    private HashMap<String, HashSet<String>> solarFiltersMap;
    private Filter filter;
    private boolean useDb;
    private HostServices hostService;

    /** Set once the calculateFor/msd listeners are attached, so repeated init() calls do not stack them. */
    private boolean inputListenersRegistered;
    /** Set once the taxonomy checked-items listener is attached. */
    private boolean taxonomyListenerRegistered;

    private static final ObservableList<String> N_GRAM_COMPUTE_FOR_WORDS = FXCollections.observableArrayList("lema", "različnica", "oblikoskladenjska oznaka");
    private static final ObservableList<String> N_GRAM_COMPUTE_FOR_LETTERS = FXCollections.observableArrayList("lema", "različnica");
    private static final ObservableList<String> N_GRAM_COMPUTE_FOR_WORDS_ORTH = FXCollections.observableArrayList("različnica");

    // TODO: pass observables for taxonomy based on header scan
    // after header scan
    private ObservableList<String> taxonomyCCBValues;
    private CorpusType currentCorpusType;

    /**
     * Resets the tab for the current corpus: switches to word mode, selects the first
     * "calculate for" entry, clears the msd filter and loads the corpus taxonomy.
     * May be called repeatedly; listeners are attached only on the first call.
     */
    public void init() {
        currentMode = MODE.WORD;
        toggleMode(currentMode);

        if (!inputListenersRegistered) {
            registerInputListeners();
            inputListenersRegistered = true;
        }

        // calculateForCB
        calculateForCB.getSelectionModel().select(0);

        // msd
        msdTF.setText("");
        msd = new ArrayList<>();
        msdStrings = new ArrayList<>();

        // taxonomy
        if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
            // re-enable in case a previously selected corpus had no taxonomy
            taxonomyCCB.setDisable(false);
            taxonomyCCB.getItems().setAll(corpus.getTaxonomy());

            if (!taxonomyListenerRegistered) {
                taxonomyCCB.getCheckModel().getCheckedItems().addListener((ListChangeListener<String>) c -> {
                    taxonomy = new ArrayList<>();
                    ObservableList<String> checkedItems = taxonomyCCB.getCheckModel().getCheckedItems();
                    taxonomy.addAll(checkedItems);
                    logger.info(String.format("Selected taxonomy: %s", StringUtils.join(checkedItems, ",")));
                });
                taxonomyListenerRegistered = true;
            }

            taxonomyCCB.getCheckModel().clearChecks();
        } else {
            taxonomyCCB.setDisable(true);
        }

        computeNgramsB.setOnAction(e -> {
            compute();
            logger.info("compute button");
        });

        helpH.setOnAction(e -> openHelpWebsite());
    }

    /** Attaches the value listener of the "calculate for" combo box and the focus listener of the msd field. */
    private void registerInputListeners() {
        calculateForCB.valueProperty().addListener((observable, oldValue, newValue) -> {
            calculateFor = CalculateFor.factory(newValue);
            logger.info("calculateForCB: {}", calculateFor);
        });

        msdTF.focusedProperty().addListener((observable, oldValue, newValue) -> {
            if (!newValue) {
                // focus lost
                String value = msdTF.getText();
                logger.info("msdTf: {}", value);

                if (!ValidationUtil.isEmpty(value)) {
                    ArrayList<String> msdTmp = new ArrayList<>(Arrays.asList(value.split(" ")));

                    int nOfRequiredMsdTokens = 1;
                    if (msdTmp.size() != nOfRequiredMsdTokens) {
                        String msg = String.format(Messages.WARNING_MISMATCHED_NGRAM_AND_TOKENS_VALUES, nOfRequiredMsdTokens, msdTmp.size());
                        logAlert(msg);
                        showAlert(Alert.AlertType.ERROR, msg);
                    }
                    // NOTE(review): the tokens are stored even after the mismatch alert above - confirm intended
                    msd = new ArrayList<>();
                    msdStrings = new ArrayList<>();
                    for (String msdToken : msdTmp) {
                        msd.add(Pattern.compile(msdToken));
                        msdStrings.add(msdToken);
                    }
                    logger.info(String.format("msd accepted (%d)", msd.size()));
                } else if (!ValidationUtil.isEmpty(newValue)) {
                    // NOTE(review): newValue is the focus flag here, not the text - confirm this condition
                    msd = new ArrayList<>();
                    msdStrings = new ArrayList<>();
                }
            }
        });
    }

    /**
     * case a: values for combo boxes can change after a corpus change
     * <ul>
     * <li>different corpus type - reset all fields so no old values remain</li>
     * <li>same corpus type, different subset - keep</li>
     * </ul>
     * <p>
     * case b: values for combo boxes can change after a header scan
     * <ul>
     * <li>at first, fields are populated by corpus type defaults</li>
     * <li>after, with gathered data</li>
     * </ul>
     * <p></p>
     * ngrams: 1
     * calculateFor: word
     * msd:
     * taxonomy:
     * skip: 0
     * iscvv: false
     * string length: 1
     */
    public void populateFields() {
        // corpus changed if: current one is null (this is first run of the app)
        // or if currentCorpus != gui's corpus
        boolean corpusChanged = currentCorpusType == null
                || currentCorpusType != corpus.getCorpusType();

        // TODO: check for GOS, GIGAFIDA, SOLAR...
        // refresh and:
        // TODO if current value != null && is in new calculateFor ? keep : otherwise reset
        if (calculateFor == null) {
            calculateForCB.getSelectionModel().select(calculateForCB.getItems().get(0));
            calculateFor = CalculateFor.factory(calculateForCB.getItems().get(0));
        }

        // NOTE(review): the `filter` field is never assigned inside this class - confirm it is set before this call
        if (!filter.hasMsd()) {
            // if current corpus doesn't have msd data, disable this field
            msd = new ArrayList<>();
            msdTF.setText("");
            msdTF.setDisable(true);
            logger.info("no msd data");
        } else if (ValidationUtil.isEmpty(msd) || corpusChanged) {
            // msd has not been set previously
            // or msd has been set but the corpus changed -> reset
            msd = new ArrayList<>();
            msdTF.setText("");
            msdTF.setDisable(false);
            logger.info("msd reset");
        } else {
            // msd has been set and the corpus type remained the same, we can keep any set msd value
            msdTF.setText(StringUtils.join(msdStrings, " "));
            msdTF.setDisable(false);
            logger.info("msd kept");
        }

        // TODO: trigger on rescan
        // see if we read taxonomy from headers, otherwise use default values for given corpus
        ObservableList<String> tax = corpus.getTaxonomy();
        taxonomyCCBValues = tax != null ? tax : Taxonomy.getDefaultForComboBox(corpus.getCorpusType());
        // replace rather than append, so repeated calls do not duplicate entries
        taxonomyCCB.getItems().setAll(taxonomyCCBValues);

        // remember the type so the next call can tell whether the corpus changed
        currentCorpusType = corpus.getCorpusType();
    }

    /**
     * Sets the "calculate for" choices to what is applicable for the given mode and
     * enables or disables the msd field depending on the corpus' GOS orth mode.
     *
     * @param mode mode to switch to; {@code null} means the current mode
     */
    public void toggleMode(MODE mode) {
        if (mode == null) {
            mode = currentMode;
        }

        logger.info("mode: {}", mode);

        if (mode == MODE.WORD) {
            calculateForCB.getItems().setAll(N_GRAM_COMPUTE_FOR_WORDS);
        } else if (mode == MODE.LETTER) {
            calculateForCB.getItems().setAll(N_GRAM_COMPUTE_FOR_LETTERS);

            // if calculateFor was selected for something other than a word or a lemma -> reset
            if (!(calculateFor == CalculateFor.WORD || calculateFor == CalculateFor.LEMMA)) {
                // if the user selected something else before selecting ngram for letters, reset that choice
                calculateFor = CalculateFor.WORD;
                calculateForCB.getSelectionModel().select("različnica");
            }
        }

        // override if orth mode, allow only word
        if (corpus.isGosOrthMode()) {
            calculateForCB.getItems().setAll(N_GRAM_COMPUTE_FOR_WORDS_ORTH);
            msdTF.setDisable(true);
        } else {
            msdTF.setDisable(false);
        }
    }

    /**
     * Builds the filter from the current field values, validates it and either starts
     * the analysis or shows the validation message.
     */
    private void compute() {
        Filter computeFilter = new Filter();
        computeFilter.setNgramValue(1);
        computeFilter.setCalculateFor(calculateFor);
        computeFilter.setMsd(msd);
        computeFilter.setTaxonomy(Tax.getTaxonomyCodes(taxonomy, corpus.getCorpusType()));
        computeFilter.setAl(AnalysisLevel.STRING_LEVEL);
        computeFilter.setSkipValue(0);
        computeFilter.setIsCvv(false);
        computeFilter.setSolarFilters(solarFiltersMap);
        computeFilter.setStringLength(1);

        String message = Validation.validateForStringLevel(computeFilter);
        if (message == null) {
            // no errors
            logger.info("Executing: {}", computeFilter);
            StatisticsNew statistic = new StatisticsNew(corpus, computeFilter, useDb);
            execute(statistic);
        } else {
            logAlert(message);
            showAlert(Alert.AlertType.ERROR, "Prosim izpolnite polja:", message);
        }
    }

    private void logAlert(String alert) {
        logger.info("alert: " + alert);
    }

    private void openHelpWebsite() {
        hostService.showDocument(Messages.HELP_URL);
    }

    public Corpus getCorpus() {
        return corpus;
    }

    /**
     * Sets the corpus and shows the selected-filters label only for a SOLAR corpus
     * (initialised to "/"); for other corpus types the label is hidden.
     */
    public void setCorpus(Corpus corpus) {
        this.corpus = corpus;

        if (corpus.getCorpusType() != CorpusType.SOLAR) {
            setSelectedFiltersLabel(null);
        } else {
            setSelectedFiltersLabel("/");
        }
    }

    /**
     * Shows {@code content} in the selected-filters label, or hides the label and its
     * caption when {@code content} is {@code null}.
     */
    public void setSelectedFiltersLabel(String content) {
        if (content != null) {
            solarFilters.setVisible(true);
            selectedFiltersLabel.setVisible(true);
            selectedFiltersLabel.setText(content);
        } else {
            solarFilters.setVisible(false);
            selectedFiltersLabel.setVisible(false);
        }
    }

    /**
     * Processes every detected corpus file on a daemon background thread, reporting
     * progress to the progress bar and label, then saves the results and notifies the user.
     */
    private void execute(StatisticsNew statistic) {
        logger.info("Started execution: {}", statistic.getFilter());

        Collection<File> corpusFiles = statistic.getCorpus().getDetectedCorpusFiles();

        final Task<Void> task = new Task<Void>() {
            @SuppressWarnings("Duplicates")
            @Override
            protected Void call() throws Exception {
                long i = 0;
                for (File f : corpusFiles) {
                    readXML(f.toString(), statistic);
                    i++;
                    this.updateProgress(i, corpusFiles.size());
                    this.updateMessage(String.format(ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y, i, corpusFiles.size(), f.getName()));
                }

                return null;
            }
        };

        ngramProgressBar.progressProperty().bind(task.progressProperty());
        progressLabel.textProperty().bind(task.messageProperty());

        task.setOnSucceeded(e -> {
            try {
                boolean successfullySaved = statistic.saveResultToDisk();
                if (successfullySaved) {
                    showAlert(Alert.AlertType.INFORMATION, Messages.NOTIFICATION_ANALYSIS_COMPLETED);
                } else {
                    showAlert(Alert.AlertType.INFORMATION, Messages.NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS);
                }
            } catch (UnsupportedEncodingException e1) {
                showAlert(Alert.AlertType.ERROR, ERROR_WHILE_SAVING_RESULTS_TO_CSV);
                logger.error("Error while saving", e1);
            }

            ngramProgressBar.progressProperty().unbind();
            ngramProgressBar.setStyle(Settings.FX_ACCENT_OK);
            progressLabel.textProperty().unbind();
            progressLabel.setText("");
        });

        task.setOnFailed(e -> {
            showAlert(Alert.AlertType.ERROR, ERROR_WHILE_EXECUTING);
            // log the exception that made the task fail, not the state event
            logger.error("Error while executing", task.getException());
            ngramProgressBar.progressProperty().unbind();
            ngramProgressBar.setProgress(0.0);
            ngramProgressBar.setStyle(Settings.FX_ACCENT_NOK);
            progressLabel.textProperty().unbind();
            progressLabel.setText("");
        });

        final Thread thread = new Thread(task, "task");
        thread.setDaemon(true);
        thread.start();
    }

    public void setSolarFiltersMap(HashMap<String, HashSet<String>> solarFiltersMap) {
        this.solarFiltersMap = solarFiltersMap;
    }

    public void setHostServices(HostServices hostServices) {
        this.hostService = hostServices;
    }
}

View File

@@ -0,0 +1,18 @@
package gui;
import javafx.scene.control.Label;
/**
 * Holder of the label that displays the currently selected filters.
 */
public class SelectedFiltersPane {

    public Label selectedFiltersLabel;

    /** @return the label showing the selected filters; may be {@code null} if none was set yet */
    public Label getSelectedFiltersLabel() {
        return selectedFiltersLabel;
    }

    /**
     * Shows {@code filters} in the label.
     * The existing label instance is updated in place, so a label that is already part of
     * the displayed layout keeps showing the new text; a label is created only when none
     * exists yet.
     *
     * @param filters text to display
     */
    public void setSelectedFiltersLabel(String filters) {
        if (this.selectedFiltersLabel == null) {
            this.selectedFiltersLabel = new Label();
        }
        this.selectedFiltersLabel.setText(filters);
    }
}

View File

@@ -0,0 +1,511 @@
package gui;
import static alg.XML_processing.*;
import static gui.GUIController.*;
import static gui.Messages.*;
import java.io.File;
import java.io.UnsupportedEncodingException;
import java.util.*;
import java.util.regex.Pattern;
import javafx.application.HostServices;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.controlsfx.control.CheckComboBox;
import data.*;
import javafx.collections.FXCollections;
import javafx.collections.ListChangeListener;
import javafx.collections.ObservableList;
import javafx.concurrent.Task;
import javafx.fxml.FXML;
import javafx.scene.control.*;
import javafx.scene.layout.Pane;
@SuppressWarnings("Duplicates")
public class StringAnalysisTabNew2 {
public final static Logger logger = LogManager.getLogger(StringAnalysisTabNew2.class);
@FXML
public Label selectedFiltersLabel;
@FXML
public Label solarFilters;
@FXML
private TextField msdTF;
private ArrayList<Pattern> msd;
private ArrayList<String> msdStrings;
@FXML
private CheckComboBox<String> taxonomyCCB;
private ArrayList<String> taxonomy;
@FXML
private CheckBox calculatecvvCB;
private boolean calculateCvv;
@FXML
private TextField stringLengthTF;
private Integer stringLength;
@FXML
private ComboBox<String> calculateForCB;
private CalculateFor calculateFor;
@FXML
private ComboBox<String> ngramValueCB;
private Integer ngramValue;
@FXML
private ComboBox<String> skipValueCB;
private Integer skipValue;
@FXML
private Pane paneWords;
@FXML
private Pane paneLetters;
@FXML
private Button computeNgramsB;
@FXML
public ProgressBar ngramProgressBar;
@FXML
public Label progressLabel;
@FXML
private Hyperlink helpH;
private enum MODE {
LETTER,
WORD
}
private MODE currentMode;
private Corpus corpus;
private HashMap<String, HashSet<String>> solarFiltersMap;
private Filter filter;
private boolean useDb;
private HostServices hostService;
private static final ObservableList<String> N_GRAM_COMPUTE_FOR_WORDS = FXCollections.observableArrayList("lema", "različnica", "oblikoskladenjska oznaka");
private static final ObservableList<String> N_GRAM_COMPUTE_FOR_LETTERS = FXCollections.observableArrayList("lema", "različnica");
private static final ObservableList<String> N_GRAM_COMPUTE_FOR_WORDS_ORTH = FXCollections.observableArrayList("različnica");
// TODO: pass observables for taxonomy based on header scan
// after header scan
private ObservableList<String> taxonomyCCBValues;
private CorpusType currentCorpusType;
/**
 * Wires the listeners and default values of every control on this tab:
 * n-gram size, "calculate for", msd filter, taxonomy, skip value, cvv flag,
 * string length, the compute button and the help link.
 * <p>
 * Reads {@code corpus}, so {@link #setCorpus(Corpus)} must have been called first.
 */
public void init() {
    currentMode = MODE.WORD;
    toggleMode(currentMode);

    // ngram value CB
    ngramValueCB.valueProperty().addListener((observable, oldValue, newValue) -> {
        if (newValue == null) {
            // selection was cleared; keep the previous n-gram settings instead of failing
            return;
        }

        if (newValue.equals("nivo črk")) {
            ngramValue = 0;
            toggleMode(MODE.LETTER);
        } else {
            ngramValue = Integer.valueOf(newValue);
            toggleMode(MODE.WORD);
        }

        // skip only on ngrams of more than one word
        if (ngramValue > 1) {
            skipValueCB.setDisable(false);
        } else {
            skipValueCB.getSelectionModel().select(0);
            skipValue = 0;
            skipValueCB.setDisable(true);
        }

        // "{}" is required, otherwise log4j silently drops the argument
        logger.info("ngramValueCB: {}", ngramValue);
    });

    // set first n-gram value to 2 at index 0
    ngramValueCB.getSelectionModel().select(0); // selected index
    ngramValue = 2; // actual value at that index

    // calculateForCB
    calculateForCB.valueProperty().addListener((observable, oldValue, newValue) -> {
        calculateFor = CalculateFor.factory(newValue);
        // pass the object itself: no explicit toString() that could fail on null
        logger.info("calculateForCB: {}", calculateFor);
    });
    calculateForCB.getSelectionModel().select(0);

    // msd
    msdTF.focusedProperty().addListener((observable, oldValue, newValue) -> {
        if (!newValue) {
            // focus lost
            String value = msdTF.getText();
            logger.info("msdTf: {}", value);

            if (!ValidationUtil.isEmpty(value)) {
                ArrayList<String> msdTmp = new ArrayList<>(Arrays.asList(value.split(" ")));

                // letter level (ngramValue == 0) works on single words, so one token is expected
                int nOfRequiredMsdTokens = ngramValue == 0 ? 1 : ngramValue;
                if (msdTmp.size() != nOfRequiredMsdTokens) {
                    String msg = String.format(Messages.WARNING_MISMATCHED_NGRAM_AND_TOKENS_VALUES, nOfRequiredMsdTokens, msdTmp.size());
                    logAlert(msg);
                    showAlert(Alert.AlertType.ERROR, msg);
                }

                msd = new ArrayList<>();
                msdStrings = new ArrayList<>();
                for (String msdToken : msdTmp) {
                    msd.add(Pattern.compile(msdToken));
                    msdStrings.add(msdToken);
                }
                logger.info(String.format("msd accepted (%d)", msd.size()));
            } else {
                // field was emptied -> drop any previously accepted msd filter
                msd = new ArrayList<>();
                msdStrings = new ArrayList<>();
            }
        }
    });
    msdTF.setText("");
    msd = new ArrayList<>();

    // taxonomy
    if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
        // setAll replaces whatever was in the list, no separate clearing needed
        taxonomyCCB.getItems().setAll(corpus.getTaxonomy());
        taxonomyCCB.getCheckModel().getCheckedItems().addListener((ListChangeListener<String>) c -> {
            ObservableList<String> checkedItems = taxonomyCCB.getCheckModel().getCheckedItems();
            taxonomy = new ArrayList<>(checkedItems);
            logger.info(String.format("Selected taxonomy: %s", StringUtils.join(checkedItems, ",")));
        });
        taxonomyCCB.getCheckModel().clearChecks();
    } else {
        taxonomyCCB.setDisable(true);
    }

    // skip
    skipValueCB.valueProperty().addListener((observable, oldValue, newValue) -> {
        if (newValue == null) {
            // selection was cleared; nothing to parse
            return;
        }
        skipValue = Integer.valueOf(newValue);
        logger.info("Skip " + skipValue);
    });
    skipValueCB.getSelectionModel().select(0);
    skipValue = 0;

    // cvv
    calculatecvvCB.selectedProperty().addListener((observable, oldValue, newValue) -> {
        calculateCvv = newValue;
        logger.info("calculate cvv: " + calculateCvv);
    });
    calculatecvvCB.setSelected(false);

    // string length
    stringLengthTF.focusedProperty().addListener((observable, oldValue, newValue) -> {
        if (!newValue) {
            // focus lost
            String value = stringLengthTF.getText();
            if (!ValidationUtil.isEmpty(value)) {
                // Integer.parseInt is the authority here: a generic "is a number" check also
                // accepts values such as "1.5", which would then blow up while parsing.
                try {
                    stringLength = Integer.parseInt(value);
                } catch (NumberFormatException ex) {
                    // keep the previously accepted length and tell the user
                    logAlert("stringlengthTf: " + WARNING_ONLY_NUMBERS_ALLOWED);
                    GUIController.showAlert(Alert.AlertType.ERROR, WARNING_ONLY_NUMBERS_ALLOWED);
                }
            } else {
                GUIController.showAlert(Alert.AlertType.ERROR, WARNING_MISSING_STRING_LENGTH);
                stringLengthTF.setText("1");
                logAlert(WARNING_MISSING_STRING_LENGTH);
            }
        }
    });

    computeNgramsB.setOnAction(e -> {
        compute();
        logger.info("compute button");
    });

    helpH.setOnAction(e -> openHelpWebsite());
}
/**
 * Re-populates the tab's controls after the corpus (or its scanned headers) changed.
 * <p>
 * case a: values for combo boxes can change after a corpus change
 * <ul>
 * <li>different corpus type - reset all fields so no old values remain</li>
 * <li>same corpus type, different subset - keep</li>
 * </ul>
 * <p>
 * case b: values for combo boxes can change after a header scan
 * <ul>
 * <li>at first, fields are populated by corpus type defaults</li>
 * <li>after, with gathered data</li>
 * </ul>
 * <p>
 * Defaults used when nothing was set before:
 * ngrams: 1, calculateFor: first entry of the combo box, msd: empty, taxonomy: none,
 * skip: 0, iscvv: false, string length: 1
 * <p>
 * NOTE(review): this method reads the {@code filter} field, which is not assigned
 * anywhere in this class - confirm it is set before this method is called.
 */
public void populateFields() {
    // corpus changed if: current one is null (this is first run of the app)
    // or if currentCorpus != gui's corpus
    CorpusType newCorpusType = corpus.getCorpusType();
    boolean corpusChanged = currentCorpusType == null || currentCorpusType != newCorpusType;

    // keep ngram value if set
    if (ngramValue == null) {
        ngramValueCB.getSelectionModel().select("1");
        ngramValue = 1;
    }

    // TODO: check for GOS, GIGAFIDA, SOLAR...
    // refresh and:
    // TODO if current value != null && is in new calculateFor ? keep : otherwise reset
    if (calculateFor == null) {
        calculateForCB.getSelectionModel().select(calculateForCB.getItems().get(0));
        calculateFor = CalculateFor.factory(calculateForCB.getItems().get(0));
    }

    if (!filter.hasMsd()) {
        // if current corpus doesn't have msd data, disable this field
        msd = new ArrayList<>();
        msdTF.setText("");
        msdTF.setDisable(true);
        logger.info("no msd data");
    } else if (ValidationUtil.isEmpty(msd) || corpusChanged) {
        // msd has not been set previously
        // or msd has been set but the corpus changed -> reset
        msd = new ArrayList<>();
        msdTF.setText("");
        msdTF.setDisable(false);
        logger.info("msd reset");
    } else {
        // if msd has been set, but corpus type remained the same, we can keep any set msd value
        msdTF.setText(StringUtils.join(msdStrings, " "));
        msdTF.setDisable(false);
        logger.info("msd kept");
    }

    // TODO: taxonomy: refresh and keep if in new taxonomy, otherwise empty (no selection)

    // keep skip value
    if (skipValue == null) {
        skipValueCB.getSelectionModel().select("0");
        skipValue = 0;
    }

    // keep calculateCvv
    calculatecvvCB.setSelected(calculateCvv);

    // keep string length if set
    if (stringLength != null) {
        stringLengthTF.setText(String.valueOf(stringLength));
    } else {
        stringLengthTF.setText("1");
        stringLength = 1;
    }

    // TODO: trigger on rescan
    // see if we read taxonomy from headers, otherwise use default values for given corpus
    ObservableList<String> tax = corpus.getTaxonomy();
    taxonomyCCBValues = tax != null ? tax : Taxonomy.getDefaultForComboBox(corpus.getCorpusType());
    // setAll (not addAll): repeated calls must not pile up duplicate taxonomy entries
    taxonomyCCB.getItems().setAll(taxonomyCCBValues);

    // remember the type we populated for, so the next call can tell whether the corpus
    // really changed; previously this was never recorded on the first run, which made
    // every call look like a corpus change and the "msd kept" branch unreachable
    currentCorpusType = newCorpusType;
}
/**
 * Toggles visibility for panes which hold fields for skipgram value (not applicable when calculating for letters) etc.,
 * sets combobox values to what is applicable ...
 * <p>
 * The mode that ends up being applied is remembered in {@code currentMode}, so a later
 * call with {@code null} re-applies the mode the tab is actually in.
 *
 * @param mode mode to switch to; {@code null} re-applies the current mode
 */
public void toggleMode(MODE mode) {
    if (mode == null) {
        mode = currentMode;
    }
    // remember the active mode; it used to stay at its initial value forever
    currentMode = mode;

    // "{}" is required, otherwise log4j silently drops the argument
    logger.info("mode: {}", mode);

    if (mode == MODE.WORD) {
        paneWords.setVisible(true);
        paneLetters.setVisible(false);
        calculateForCB.getItems().setAll(N_GRAM_COMPUTE_FOR_WORDS);
    } else if (mode == MODE.LETTER) {
        paneWords.setVisible(false);
        paneLetters.setVisible(true);
        calculateForCB.getItems().setAll(N_GRAM_COMPUTE_FOR_LETTERS);

        // populate with default cvv length value
        if (stringLength == null) {
            stringLengthTF.setText("1");
            stringLength = 1;
        } else {
            stringLengthTF.setText(String.valueOf(stringLength));
        }

        // if calculateFor was selected for something other than a word or a lemma -> reset
        if (!(calculateFor == CalculateFor.WORD || calculateFor == CalculateFor.LEMMA)) {
            // if the user selected something else before selecting ngram for letters, reset that choice
            calculateFor = CalculateFor.WORD;
            calculateForCB.getSelectionModel().select("različnica");
        }
    }

    // override if orth mode, allow only word
    if (corpus.isGosOrthMode()) {
        calculateForCB.getItems().setAll(N_GRAM_COMPUTE_FOR_WORDS_ORTH);
        msdTF.setDisable(true);
    } else {
        msdTF.setDisable(false);
    }
}
/**
 * Builds a {@link Filter} from the values currently held by this tab, validates it
 * and, if valid, starts the analysis; otherwise shows the validation message.
 */
private void compute() {
    Filter filter = new Filter();
    filter.setNgramValue(ngramValue);
    filter.setCalculateFor(calculateFor);
    filter.setMsd(msd);
    filter.setTaxonomy(Tax.getTaxonomyCodes(taxonomy, corpus.getCorpusType()));
    filter.setAl(AnalysisLevel.STRING_LEVEL);
    filter.setSkipValue(skipValue);
    filter.setIsCvv(calculateCvv);
    filter.setSolarFilters(solarFiltersMap);

    // string length only applies to letter-level analysis (ngramValue == 0)
    if (ngramValue != null && ngramValue == 0) {
        filter.setStringLength(stringLength);
    }

    String message = Validation.validateForStringLevel(filter);
    if (message == null) {
        // no errors
        // "{}" is required, otherwise log4j silently drops the argument
        logger.info("Executing: {}", filter);
        StatisticsNew statistic = new StatisticsNew(corpus, filter, useDb);
        execute(statistic);
    } else {
        logAlert(message);
        showAlert(Alert.AlertType.ERROR, "Prosim izpolnite polja:", message);
    }
}
/**
 * Writes the given alert text to this tab's logger at INFO level, prefixed with "alert: ".
 *
 * @param alert text of the alert
 */
private void logAlert(String alert) {
logger.info("alert: " + alert);
}
/**
 * Asks the stored host services to show the document at {@code Messages.HELP_URL}.
 * The {@code hostService} field is only assigned through {@code setHostServices}.
 */
private void openHelpWebsite(){
hostService.showDocument(Messages.HELP_URL);
}
/**
 * @return the corpus this tab currently works on, as last passed to {@code setCorpus}
 */
public Corpus getCorpus() {
return corpus;
}
/**
 * Stores the corpus and adjusts the selected-filters row to its type:
 * shown with the placeholder "/" for Solar corpora, hidden for all others.
 *
 * @param corpus corpus this tab should work on
 */
public void setCorpus(Corpus corpus) {
    this.corpus = corpus;

    boolean isSolar = corpus.getCorpusType() == CorpusType.SOLAR;
    // a null label content hides the row, see setSelectedFiltersLabel
    setSelectedFiltersLabel(isSolar ? "/" : null);
}
/**
 * Shows or hides the selected-filters row.
 *
 * @param content text to display; {@code null} hides both the caption and the value label
 *                and leaves the value label's text untouched
 */
public void setSelectedFiltersLabel(String content) {
    boolean visible = content != null;

    solarFilters.setVisible(visible);
    selectedFiltersLabel.setVisible(visible);
    if (visible) {
        selectedFiltersLabel.setText(content);
    }
}
/**
 * Runs the analysis for all detected corpus files on a background daemon thread.
 * Progress bar and progress label are bound to the task while it runs; when it
 * succeeds the result is saved to disk and the user is notified, when it fails
 * the error is shown and logged.
 *
 * @param statistic statistics object carrying the corpus and filter to analyse
 */
private void execute(StatisticsNew statistic) {
    // "{}" is required, otherwise log4j silently drops the argument
    logger.info("Started execution: {}", statistic.getFilter());

    Collection<File> corpusFiles = statistic.getCorpus().getDetectedCorpusFiles();

    final Task<Void> task = new Task<Void>() {
        @SuppressWarnings("Duplicates")
        @Override
        protected Void call() throws Exception {
            long i = 0;
            for (File f : corpusFiles) {
                readXML(f.toString(), statistic);
                i++;
                this.updateProgress(i, corpusFiles.size());
                this.updateMessage(String.format(ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y, i, corpusFiles.size(), f.getName()));
            }

            return null;
        }
    };

    ngramProgressBar.progressProperty().bind(task.progressProperty());
    progressLabel.textProperty().bind(task.messageProperty());

    task.setOnSucceeded(event -> {
        try {
            boolean successfullySaved = statistic.saveResultToDisk();
            if (successfullySaved) {
                showAlert(Alert.AlertType.INFORMATION, Messages.NOTIFICATION_ANALYSIS_COMPLETED);
            } else {
                showAlert(Alert.AlertType.INFORMATION, Messages.NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS);
            }
        } catch (UnsupportedEncodingException ex) {
            showAlert(Alert.AlertType.ERROR, ERROR_WHILE_SAVING_RESULTS_TO_CSV);
            logger.error("Error while saving", ex);
        }

        ngramProgressBar.progressProperty().unbind();
        ngramProgressBar.setStyle(Settings.FX_ACCENT_OK);
        progressLabel.textProperty().unbind();
        progressLabel.setText("");
    });

    task.setOnFailed(event -> {
        showAlert(Alert.AlertType.ERROR, ERROR_WHILE_EXECUTING);
        // log the exception thrown by the task; the event object itself is not a
        // Throwable and was previously swallowed as an unused log parameter
        logger.error("Error while executing", task.getException());
        ngramProgressBar.progressProperty().unbind();
        ngramProgressBar.setProgress(0.0);
        ngramProgressBar.setStyle(Settings.FX_ACCENT_NOK);
        progressLabel.textProperty().unbind();
        progressLabel.setText("");
    });

    final Thread thread = new Thread(task, "task");
    thread.setDaemon(true);
    thread.start();
}
/**
 * Replaces the Solar filter map; {@code compute()} passes it on to the filter it builds.
 *
 * @param solarFiltersMap the map to store; kept by reference, not copied
 */
public void setSolarFiltersMap(HashMap<String, HashSet<String>> solarFiltersMap) {
this.solarFiltersMap = solarFiltersMap;
}
/**
 * Stores the JavaFX host services handle that {@code openHelpWebsite()} uses.
 *
 * @param hostServices the handle to store
 */
public void setHostServices(HostServices hostServices){
this.hostService = hostServices;
}
}

View File

@@ -0,0 +1,77 @@
package gui;
import java.io.File;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang3.math.NumberUtils;
/**
 * Small collection of null-safe validation helpers used by the GUI.
 */
public class ValidationUtil {

    /**
     * Checks whether the value is a number according to {@code NumberUtils.isCreatable}.
     * NOTE(review): that check is broader than "parsable by Integer.parseInt" - callers
     * that need an int should still guard the actual parsing.
     */
    public static boolean isNumber(String value) {
        return NumberUtils.isCreatable(value);
    }

    /**
     * Checks if an object is empty or null. Null part is especially important,
     * since Java's built-in isEmpty() methods don't check for this condition
     * and throw a nullPointerException as a result.
     * <p>
     * Supported structures:
     * <ul>
     * <li>String: empty if null or length is zero</li>
     * <li>List: empty if null or size() == 0</li>
     * <li>Map: empty if null or if it contains no keys, or if all keys map to an empty value </li>
     * </ul>
     * Any other non-null object is considered not empty.
     *
     * @param o object to inspect, may be {@code null}
     * @return {@code true} if the object is empty by the rules above
     */
    public static boolean isEmpty(Object o) {
        if (o == null) {
            return true;
        }

        if (o instanceof String) {
            return ((String) o).isEmpty();
        }

        if (o instanceof List) {
            return ((List<?>) o).isEmpty();
        }

        if (o instanceof Map) {
            for (Object val : ((Map<?, ?>) o).values()) {
                if (!isEmpty(val)) {
                    // if map contains any value that isn't empty, the map isn't considered empty
                    return false;
                }
            }
            // no keys at all, or every key maps to an empty value
            // (the second case used to fall through and report "not empty")
            return true;
        }

        return false;
    }

    /**
     * @return the negation of {@link #isEmpty(Object)}
     */
    public static boolean isNotEmpty(Object o) {
        return !isEmpty(o);
    }

    /**
     * Checks whether a given File is a folder which we can both read and write
     */
    public static boolean isValidDirectory(File f) {
        return f.isDirectory() && f.canRead() && f.canWrite();
    }

    /**
     * Checks whether a given File is a folder which we can read
     */
    public static boolean isReadableDirectory(File f) {
        return f.isDirectory() && f.canRead();
    }
}

View File

@@ -0,0 +1,208 @@
package gui;
import static alg.XML_processing.*;
import static gui.GUIController.*;
import static gui.Messages.*;
import java.io.File;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import javafx.application.HostServices;
import javafx.scene.control.*;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.controlsfx.control.CheckComboBox;
import data.*;
import javafx.collections.ListChangeListener;
import javafx.collections.ObservableList;
import javafx.concurrent.Task;
import javafx.fxml.FXML;
import javafx.scene.layout.AnchorPane;
@SuppressWarnings("Duplicates")
public class WordFormationTab {
public final static Logger logger = LogManager.getLogger(WordFormationTab.class);
public AnchorPane wordAnalysisTabPane;
@FXML
public Label selectedFiltersLabel;
@FXML
public Label solarFilters;
@FXML
private CheckComboBox<String> taxonomyCCB;
private ArrayList<String> taxonomy;
@FXML
private Button computeB;
@FXML
public ProgressBar ngramProgressBar;
@FXML
public Label progressLabel;
@FXML
private Hyperlink helpH;
private Corpus corpus;
private HashMap<String, HashSet<String>> solarFiltersMap;
private HostServices hostService;
// after header scan
private ObservableList<String> taxonomyCCBValues;
private CorpusType currentCorpusType;
private boolean useDb;
/**
 * Initializes the taxonomy selector, the compute button and the help link.
 * <p>
 * Reads {@code corpus}, so {@code setCorpus} must have been called first.
 */
public void init() {
    // taxonomy
    if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
        // setAll replaces whatever was in the list; the former argument-less
        // removeAll() call removed nothing and has been dropped
        taxonomyCCB.getItems().setAll(corpus.getTaxonomy());
        taxonomyCCB.getCheckModel().getCheckedItems().addListener((ListChangeListener<String>) c -> {
            ObservableList<String> checkedItems = taxonomyCCB.getCheckModel().getCheckedItems();
            // keep our own snapshot of the checked entries
            taxonomy = new ArrayList<>(checkedItems);
            logger.info(String.format("Selected taxonomy: %s", StringUtils.join(checkedItems, ",")));
        });
        taxonomyCCB.getCheckModel().clearChecks();
    } else {
        taxonomyCCB.setDisable(true);
    }

    computeB.setOnAction(e -> {
        compute();
        logger.info("compute button");
    });

    helpH.setOnAction(e -> openHelpWebsite());
}
/**
 * Builds the fixed filter for this tab (unigrams, calculated for morphosyntactic
 * property, no msd, no skip, no cvv) plus the selected taxonomy and Solar filters,
 * validates it and starts the analysis or shows the validation message.
 */
private void compute() {
    Filter filter = new Filter();
    filter.setNgramValue(1);
    filter.setCalculateFor(CalculateFor.MORPHOSYNTACTIC_PROPERTY);
    filter.setTaxonomy(Tax.getTaxonomyCodes(taxonomy, corpus.getCorpusType()));
    filter.setAl(AnalysisLevel.STRING_LEVEL);
    filter.setSkipValue(0);
    filter.setMsd(new ArrayList<>());
    filter.setIsCvv(false);
    filter.setSolarFilters(solarFiltersMap);

    String message = Validation.validateForStringLevel(filter);
    if (message == null) {
        // no errors
        // "{}" is required, otherwise log4j silently drops the argument
        logger.info("Executing: {}", filter);
        StatisticsNew statistic = new StatisticsNew(corpus, filter, useDb);
        execute(statistic);
    } else {
        logAlert(message);
        showAlert(Alert.AlertType.ERROR, "Prosim izpolnite polja:", message);
    }
}
/**
 * Asks the stored host services to show the document at {@code Messages.HELP_URL}.
 */
private void openHelpWebsite(){
hostService.showDocument(Messages.HELP_URL);
}
/**
 * Runs the analysis for all detected corpus files on a background daemon thread.
 * Progress bar and progress label are bound to the task while it runs; when it
 * succeeds the result is recalculated and saved to disk, when it fails the error
 * is shown and logged.
 *
 * @param statistic statistics object carrying the corpus and filter to analyse
 */
private void execute(StatisticsNew statistic) {
    // "{}" is required, otherwise log4j silently drops the argument
    logger.info("Started execution: {}", statistic.getFilter());

    Collection<File> corpusFiles = statistic.getCorpus().getDetectedCorpusFiles();

    final Task<Void> task = new Task<Void>() {
        @SuppressWarnings("Duplicates")
        @Override
        protected Void call() throws Exception {
            long i = 0;
            for (File f : corpusFiles) {
                readXML(f.toString(), statistic);
                i++;
                this.updateProgress(i, corpusFiles.size());
                this.updateMessage(String.format(ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y, i, corpusFiles.size(), f.getName()));
            }

            return null;
        }
    };

    ngramProgressBar.progressProperty().bind(task.progressProperty());
    progressLabel.textProperty().bind(task.messageProperty());

    task.setOnSucceeded(event -> {
        try {
            // first, we have to recalculate all occurrences to detailed statistics
            boolean successfullySaved = statistic.recalculateAndSaveResultToDisk();

            if (successfullySaved) {
                showAlert(Alert.AlertType.INFORMATION, Messages.NOTIFICATION_ANALYSIS_COMPLETED);
            } else {
                showAlert(Alert.AlertType.INFORMATION, Messages.NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS);
            }
        } catch (UnsupportedEncodingException ex) {
            showAlert(Alert.AlertType.ERROR, ERROR_WHILE_SAVING_RESULTS_TO_CSV);
            logger.error("Error while saving", ex);
        }

        ngramProgressBar.progressProperty().unbind();
        ngramProgressBar.setStyle(Settings.FX_ACCENT_OK);
        progressLabel.textProperty().unbind();
        progressLabel.setText("");
    });

    task.setOnFailed(event -> {
        showAlert(Alert.AlertType.ERROR, ERROR_WHILE_EXECUTING);
        // log the exception thrown by the task; the event object itself is not a
        // Throwable and was previously swallowed as an unused log parameter
        logger.error("Error while executing", task.getException());
        ngramProgressBar.progressProperty().unbind();
        ngramProgressBar.setProgress(0.0);
        ngramProgressBar.setStyle(Settings.FX_ACCENT_NOK);
        progressLabel.textProperty().unbind();
        progressLabel.setText("");
    });

    final Thread thread = new Thread(task, "task");
    thread.setDaemon(true);
    thread.start();
}
/**
 * Writes the given alert text to this tab's logger at INFO level, prefixed with "alert: ".
 *
 * @param alert text of the alert
 */
private void logAlert(String alert) {
logger.info("alert: " + alert);
}
/**
 * Stores the corpus and adjusts the selected-filters row to its type:
 * shown with the placeholder "/" for Solar corpora, hidden for all others.
 *
 * @param corpus corpus this tab should work on
 */
public void setCorpus(Corpus corpus) {
    this.corpus = corpus;

    boolean isSolar = corpus.getCorpusType() == CorpusType.SOLAR;
    // a null label content hides the row, see setSelectedFiltersLabel
    setSelectedFiltersLabel(isSolar ? "/" : null);
}
/**
 * Shows or hides the selected-filters row.
 *
 * @param content text to display; {@code null} hides both the caption and the value label
 *                and leaves the value label's text untouched
 */
public void setSelectedFiltersLabel(String content) {
    boolean visible = content != null;

    solarFilters.setVisible(visible);
    selectedFiltersLabel.setVisible(visible);
    if (visible) {
        selectedFiltersLabel.setText(content);
    }
}
/**
 * Replaces the Solar filter map; {@code compute()} passes it on to the filter it builds.
 *
 * @param solarFiltersMap the map to store; kept by reference, not copied
 */
public void setSolarFiltersMap(HashMap<String, HashSet<String>> solarFiltersMap) {
this.solarFiltersMap = solarFiltersMap;
}
/**
 * Stores the JavaFX host services handle that {@code openHelpWebsite()} uses.
 *
 * @param hostServices the handle to store
 */
public void setHostServices(HostServices hostServices){
this.hostService = hostServices;
}
}

View File

@@ -0,0 +1,207 @@
package gui;
import static alg.XML_processing.*;
import static gui.GUIController.*;
import static gui.Messages.*;
import java.io.File;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import javafx.application.HostServices;
import javafx.scene.control.*;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.controlsfx.control.CheckComboBox;
import data.*;
import javafx.collections.ListChangeListener;
import javafx.collections.ObservableList;
import javafx.concurrent.Task;
import javafx.fxml.FXML;
import javafx.scene.layout.AnchorPane;
@SuppressWarnings("Duplicates")
public class WordLevelTab {
public final static Logger logger = LogManager.getLogger(WordLevelTab.class);
public AnchorPane wordLevelAnalysisTabPane;
@FXML
public Label selectedFiltersLabel;
@FXML
public Label solarFilters;
@FXML
private CheckComboBox<String> taxonomyCCB;
private ArrayList<String> taxonomy;
@FXML
private Button computeB;
@FXML
public ProgressBar ngramProgressBar;
@FXML
public Label progressLabel;
@FXML
private Hyperlink helpH;
private Corpus corpus;
private HashMap<String, HashSet<String>> solarFiltersMap;
private HostServices hostService;
// after header scan
private ObservableList<String> taxonomyCCBValues;
private CorpusType currentCorpusType;
private boolean useDb;
/**
 * Initializes the taxonomy selector, the compute button and the help link.
 * <p>
 * Reads {@code corpus}, so {@code setCorpus} must have been called first.
 */
public void init() {
    // taxonomy
    if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
        // setAll replaces whatever was in the list; the former argument-less
        // removeAll() call removed nothing and has been dropped
        taxonomyCCB.getItems().setAll(corpus.getTaxonomy());
        taxonomyCCB.getCheckModel().getCheckedItems().addListener((ListChangeListener<String>) c -> {
            ObservableList<String> checkedItems = taxonomyCCB.getCheckModel().getCheckedItems();
            // keep our own snapshot of the checked entries
            taxonomy = new ArrayList<>(checkedItems);
            logger.info(String.format("Selected taxonomy: %s", StringUtils.join(checkedItems, ",")));
        });
        taxonomyCCB.getCheckModel().clearChecks();
    } else {
        taxonomyCCB.setDisable(true);
    }

    computeB.setOnAction(e -> {
        compute();
        logger.info("compute button");
    });

    helpH.setOnAction(e -> openHelpWebsite());
}
/**
 * Asks the stored host services to show the document at {@code Messages.HELP_URL}.
 */
private void openHelpWebsite(){
hostService.showDocument(Messages.HELP_URL);
}
/**
 * Builds the fixed filter for this tab (unigrams, calculated for words, word-level
 * analysis, no msd, no skip, no cvv) plus the selected taxonomy and Solar filters,
 * validates it and starts the analysis or shows the validation message.
 */
private void compute() {
    Filter filter = new Filter();
    filter.setNgramValue(1);
    filter.setCalculateFor(CalculateFor.WORD);
    filter.setTaxonomy(Tax.getTaxonomyCodes(taxonomy, corpus.getCorpusType()));
    filter.setAl(AnalysisLevel.WORD_LEVEL);
    filter.setSkipValue(0);
    filter.setMsd(new ArrayList<>());
    filter.setIsCvv(false);
    filter.setSolarFilters(solarFiltersMap);

    String message = Validation.validateForStringLevel(filter);
    if (message == null) {
        // no errors
        // "{}" is required, otherwise log4j silently drops the argument
        logger.info("Executing: {}", filter);
        StatisticsNew statistic = new StatisticsNew(corpus, filter, useDb);
        execute(statistic);
    } else {
        logAlert(message);
        showAlert(Alert.AlertType.ERROR, "Prosim izpolnite polja:", message);
    }
}
/**
 * Runs the analysis for all detected corpus files on a background daemon thread.
 * Progress bar and progress label are bound to the task while it runs; when it
 * succeeds the nested result is saved to disk, when it fails the error is shown
 * and logged.
 *
 * @param statistic statistics object carrying the corpus and filter to analyse
 */
private void execute(StatisticsNew statistic) {
    // "{}" is required, otherwise log4j silently drops the argument
    logger.info("Started execution: {}", statistic.getFilter());

    Collection<File> corpusFiles = statistic.getCorpus().getDetectedCorpusFiles();

    final Task<Void> task = new Task<Void>() {
        @SuppressWarnings("Duplicates")
        @Override
        protected Void call() throws Exception {
            long i = 0;
            for (File f : corpusFiles) {
                readXML(f.toString(), statistic);
                i++;
                this.updateProgress(i, corpusFiles.size());
                this.updateMessage(String.format(ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y, i, corpusFiles.size(), f.getName()));
            }

            return null;
        }
    };

    ngramProgressBar.progressProperty().bind(task.progressProperty());
    progressLabel.textProperty().bind(task.messageProperty());

    task.setOnSucceeded(event -> {
        try {
            // write the nested (per word-level type) result to disk
            boolean successfullySaved = statistic.saveResultNestedToDisk();

            if (successfullySaved) {
                showAlert(Alert.AlertType.INFORMATION, Messages.NOTIFICATION_ANALYSIS_COMPLETED);
            } else {
                showAlert(Alert.AlertType.INFORMATION, Messages.NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS);
            }
        } catch (UnsupportedEncodingException ex) {
            showAlert(Alert.AlertType.ERROR, ERROR_WHILE_SAVING_RESULTS_TO_CSV);
            logger.error("Error while saving", ex);
        }

        ngramProgressBar.progressProperty().unbind();
        ngramProgressBar.setStyle(Settings.FX_ACCENT_OK);
        progressLabel.textProperty().unbind();
        progressLabel.setText("");
    });

    task.setOnFailed(event -> {
        showAlert(Alert.AlertType.ERROR, ERROR_WHILE_EXECUTING);
        // log the exception thrown by the task; the event object itself is not a
        // Throwable and was previously swallowed as an unused log parameter
        logger.error("Error while executing", task.getException());
        ngramProgressBar.progressProperty().unbind();
        ngramProgressBar.setProgress(0.0);
        ngramProgressBar.setStyle(Settings.FX_ACCENT_NOK);
        progressLabel.textProperty().unbind();
        progressLabel.setText("");
    });

    final Thread thread = new Thread(task, "task");
    thread.setDaemon(true);
    thread.start();
}
/**
 * Writes the given alert text to this tab's logger at INFO level, prefixed with "alert: ".
 *
 * @param alert text of the alert
 */
private void logAlert(String alert) {
logger.info("alert: " + alert);
}
/**
 * Stores the corpus and adjusts the selected-filters row to its type:
 * shown with the placeholder "/" for Solar corpora, hidden for all others.
 *
 * @param corpus corpus this tab should work on
 */
public void setCorpus(Corpus corpus) {
    this.corpus = corpus;

    boolean isSolar = corpus.getCorpusType() == CorpusType.SOLAR;
    // a null label content hides the row, see setSelectedFiltersLabel
    setSelectedFiltersLabel(isSolar ? "/" : null);
}
/**
 * Shows or hides the selected-filters row.
 *
 * @param content text to display; {@code null} hides both the caption and the value label
 *                and leaves the value label's text untouched
 */
public void setSelectedFiltersLabel(String content) {
    boolean visible = content != null;

    solarFilters.setVisible(visible);
    selectedFiltersLabel.setVisible(visible);
    if (visible) {
        selectedFiltersLabel.setText(content);
    }
}
/**
 * Replaces the Solar filter map; {@code compute()} passes it on to the filter it builds.
 *
 * @param solarFiltersMap the map to store; kept by reference, not copied
 */
public void setSolarFiltersMap(HashMap<String, HashSet<String>> solarFiltersMap) {
this.solarFiltersMap = solarFiltersMap;
}
/**
 * Stores the JavaFX host services handle that {@code openHelpWebsite()} uses.
 *
 * @param hostServices the handle to store
 */
public void setHostServices(HostServices hostServices){
this.hostService = hostServices;
}
}

View File

@@ -0,0 +1,3 @@
Manifest-Version: 1.0
Main-Class: gui.GUIController

View File

@@ -0,0 +1,25 @@
package util;
import java.nio.ByteBuffer;
/**
 * Conversions between {@code long} values and their 8-byte representation.
 */
public class ByteUtils {

    /*
     * Taken from <a href="https://stackoverflow.com/a/4485196">StackOverflow</a>
     */
    /**
     * Encodes a {@code long} into a new array of {@code Long.BYTES} bytes,
     * using the default byte order of {@link ByteBuffer} (most significant byte first).
     *
     * @param x value to encode
     * @return a fresh 8-byte array holding {@code x}
     */
    public static byte[] longToBytes(long x) {
        return ByteBuffer.allocate(Long.BYTES).putLong(x).array();
    }

    /*
     * Taken from <a href="https://stackoverflow.com/a/4485196">StackOverflow</a>
     */
    /**
     * Decodes a {@code long} from exactly {@code Long.BYTES} bytes, the inverse of
     * {@link #longToBytes(long)}.
     *
     * @param bytes the 8 bytes to decode
     * @return the decoded value
     */
    public static long bytesToLong(byte[] bytes) {
        ByteBuffer scratch = ByteBuffer.allocate(Long.BYTES);
        scratch.put(bytes);
        // switch the buffer from writing to reading before pulling the value out
        scratch.flip();
        return scratch.getLong();
    }
}

View File

@@ -0,0 +1,46 @@
package util;
import java.util.Arrays;
import java.util.HashSet;
import java.util.stream.IntStream;
/**
 * Generates sets of index combinations.
 */
public class Combinations {
    // Accumulator used only by the package-private combinationUtil entry point,
    // which is kept for backward compatibility. generateIndices no longer touches it.
    private static HashSet<HashSet<Integer>> result = new HashSet<>();

    /**
     * Legacy entry point: collects combinations into the class-level accumulator.
     *
     * @param arr               input array
     * @param data              temporary array holding the combination being built
     * @param start             first index in {@code arr} to consider
     * @param end               last index in {@code arr} to consider (inclusive)
     * @param index             current position in {@code data}
     * @param combinationLength size of the combinations to collect
     */
    static void combinationUtil(int arr[], Integer data[], int start, int end, int index, int combinationLength) {
        collectCombinations(arr, data, start, end, index, combinationLength, result);
    }

    /**
     * Recursively builds every combination of {@code combinationLength} elements
     * from {@code arr[start..end]} and adds each one, as a set, to {@code sink}.
     */
    private static void collectCombinations(int[] arr, Integer[] data, int start, int end, int index,
                                            int combinationLength, HashSet<HashSet<Integer>> sink) {
        // current combination is complete, store a copy of it
        if (index == combinationLength) {
            sink.add(new HashSet<>(Arrays.asList(data)));
            return;
        }

        // replace index with all possible elements. The condition
        // "end - i + 1 >= combinationLength - index" makes sure that including one element
        // at index still leaves enough elements for the remaining positions
        for (int i = start; i <= end && end - i + 1 >= combinationLength - index; i++) {
            data[index] = arr[i];
            collectCombinations(arr, data, i + 1, end, index + 1, combinationLength, sink);
        }
    }

    /**
     * Returns all combinations of the indices {@code 1 .. maxNOfIndices - 1} whose size
     * is between {@code 1} and {@code maxNOfIndices - 2}, plus the empty combination.
     * <p>
     * The result is accumulated in a set local to the call, so calls do not interfere
     * with each other and a returned set is never modified by a later call.
     *
     * @param maxNOfIndices exclusive upper bound of the indices to combine
     * @return a new set of index combinations; always contains the empty set
     */
    public static HashSet<HashSet<Integer>> generateIndices(int maxNOfIndices) {
        HashSet<HashSet<Integer>> combinations = new HashSet<>();
        int[] arr = IntStream.range(1, maxNOfIndices).toArray();

        for (int length = 1; length < maxNOfIndices - 1; length++) {
            // a fresh temporary array holds the combination being built for this length
            collectCombinations(arr, new Integer[length], 0, arr.length - 1, 0, length, combinations);
        }

        // also add an empty one for X.... (all of this type)
        combinations.add(new HashSet<>());

        return combinations;
    }
}

View File

@@ -0,0 +1,267 @@
package util;
import static util.Util.*;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.apache.commons.lang3.tuple.Pair;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import data.Enums.WordLevelType;
@SuppressWarnings("unchecked")
public class Export {
/**
 * Writes the given statistics to {@code statistics.json} in the working directory.
 * Every non-empty map becomes one object with a "Title" and a "data" array of
 * word / frequency / percent entries; empty maps are skipped.
 * <p>
 * The file is written as UTF-8 regardless of the platform default charset, so
 * non-ASCII words are stored consistently. I/O errors are printed, not rethrown.
 *
 * @param set pairs of (title, word -> frequency)
 */
public static void SetToJSON(Set<Pair<String, Map<String, Long>>> set) {
    JSONArray wrapper = new JSONArray();

    for (Pair<String, Map<String, Long>> p : set) {
        String title = p.getLeft();
        Map<String, Long> map = p.getRight();

        if (map.isEmpty())
            continue;

        JSONArray data_wrapper = new JSONArray();
        JSONObject metric = new JSONObject();

        long total = Util.mapSumFrequencies(map);

        for (Map.Entry<String, Long> e : map.entrySet()) {
            JSONObject data_entry = new JSONObject();
            data_entry.put("word", e.getKey());
            data_entry.put("frequency", e.getValue());
            data_entry.put("percent", formatNumberAsPercent((double) e.getValue() / total));

            data_wrapper.add(data_entry);
        }

        metric.put("Title", title);
        metric.put("data", data_wrapper);
        wrapper.add(metric);
    }

    // explicit UTF-8: FileWriter would fall back to the platform default charset
    try (Writer file = new OutputStreamWriter(new FileOutputStream("statistics.json"), StandardCharsets.UTF_8)) {
        file.write(wrapper.toJSONString());
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Writes one UTF-8, semicolon-delimited CSV file per non-empty map in {@code set}.
 * The file name is derived from the pair's title (": " becomes "-", spaces become "_")
 * and placed in {@code resultsPath}. Each file starts with the header info block,
 * followed by the column header and one word / frequency / percent row per entry.
 * <p>
 * Errors while writing a file are printed and the remaining files are still processed.
 *
 * @param set             pairs of (title, word -> frequency)
 * @param resultsPath     directory the files are written to
 * @param headerInfoBlock info block printed at the top of every file
 * @return the file name computed for the last pair that was iterated
 *         (even if its map was empty and nothing was written), or "" for an empty set
 */
public static String SetToCSV(Set<Pair<String, Map<String, Long>>> set, File resultsPath, LinkedHashMap<String, String> headerInfoBlock) {
    //Delimiter used in CSV file
    String NEW_LINE_SEPARATOR = "\n";

    //CSV file header
    Object[] FILE_HEADER = {"word", "frequency", "percent"};

    String fileName = "";

    for (Pair<String, Map<String, Long>> p : set) {
        String title = p.getLeft();
        fileName = title.replace(": ", "-");
        fileName = fileName.replace(" ", "_").concat(".csv");
        fileName = resultsPath.toString().concat(File.separator).concat(fileName);

        Map<String, Long> map = p.getRight();

        if (map.isEmpty())
            continue;

        long total = Util.mapSumFrequencies(map);

        //Create the CSVFormat object with "\n" as a record delimiter
        CSVFormat csvFileFormat = CSVFormat.DEFAULT.withRecordSeparator(NEW_LINE_SEPARATOR).withDelimiter(';');

        // try-with-resources closes the printer first and the writer second,
        // also when writing fails half way
        try (Writer fileWriter = new OutputStreamWriter(new FileOutputStream(fileName), StandardCharsets.UTF_8);
             CSVPrinter csvFilePrinter = new CSVPrinter(fileWriter, csvFileFormat)) {

            // write info block
            printHeaderInfo(csvFilePrinter, headerInfoBlock);

            //Create CSV file header
            csvFilePrinter.printRecord(FILE_HEADER);

            for (Map.Entry<String, Long> e : map.entrySet()) {
                List<Object> dataEntry = new ArrayList<>();
                dataEntry.add(e.getKey());
                dataEntry.add(e.getValue().toString());
                dataEntry.add(formatNumberAsPercent((double) e.getValue() / total));
                csvFilePrinter.printRecord(dataEntry);
            }
        } catch (Exception e) {
            System.out.println("Error in CsvFileWriter!");
            e.printStackTrace();
        }
    }

    return fileName;
}
/**
 * Writes a single semicolon-delimited, UTF-8 encoded CSV file from a pre-computed result table.
 * The file starts with the header info block, followed by a "word;frequency;percent" header
 * and one row per entry of {@code result}.
 * Export is best-effort: failures are reported on stdout/stderr and the file name is still returned.
 *
 * @param title           used to derive the file name (": " becomes "-", spaces become "_")
 * @param result          rows of at least three cells: word, frequency, and a numeric share
 *                        that is formatted as a percentage
 * @param resultsPath     directory the file is written to
 * @param headerInfoBlock key/value lines printed at the top of the file
 * @return the full path of the target file
 */
public static String SetToCSV(String title, Object[][] result, File resultsPath, LinkedHashMap<String, String> headerInfoBlock) {
    // record delimiter used in the CSV file
    String NEW_LINE_SEPARATOR = "\n";

    // CSV column header
    Object[] FILE_HEADER = {"word", "frequency", "percent"};

    String fileName = title.replace(": ", "-");
    fileName = fileName.replace(" ", "_").concat(".csv");
    fileName = resultsPath.toString().concat(File.separator).concat(fileName);

    // CSVFormat with "\n" as record delimiter and ';' as field delimiter
    CSVFormat csvFileFormat = CSVFormat.DEFAULT.withRecordSeparator(NEW_LINE_SEPARATOR).withDelimiter(';');

    // try-with-resources closes the printer first and the writer second, even when writing fails
    try (OutputStreamWriter fileWriter = new OutputStreamWriter(new FileOutputStream(fileName), StandardCharsets.UTF_8);
         CSVPrinter csvFilePrinter = new CSVPrinter(fileWriter, csvFileFormat)) {
        // write info block
        printHeaderInfo(csvFilePrinter, headerInfoBlock);

        // write CSV column header
        csvFilePrinter.printRecord(FILE_HEADER);

        for (Object[] resultEntry : result) {
            List<Object> dataEntry = new ArrayList<>();
            dataEntry.add(resultEntry[0]);
            dataEntry.add(resultEntry[1]);
            dataEntry.add(formatNumberAsPercent(resultEntry[2]));
            csvFilePrinter.printRecord(dataEntry);
        }
    } catch (Exception e) {
        // deliberately broad: export is best-effort and must not crash the caller
        System.out.println("Error in CsvFileWriter!");
        e.printStackTrace();
    }

    return fileName;
}
/**
 * Flattens a three-level result map into a single semicolon-delimited, UTF-8 encoded CSV file.
 * The file starts with the header info block, followed by a "type;key;word;frequency" header
 * and one row per innermost entry.
 * Export is best-effort: failures are reported on stdout/stderr and the file name is still returned.
 *
 * @param title           used to derive the file name (": " becomes "-", spaces become "_")
 * @param result          word level type -> key -> word -> frequency
 * @param resultsPath     directory the file is written to
 * @param headerInfoBlock key/value lines printed at the top of the file
 * @return the full path of the target file
 */
public static String nestedMapToCSV(String title, Map<WordLevelType, Map<String, Map<String, Long>>> result, File resultsPath, LinkedHashMap<String, String> headerInfoBlock) {
    // record delimiter used in the CSV file
    String NEW_LINE_SEPARATOR = "\n";

    // CSV column header
    Object[] FILE_HEADER = {"type", "key", "word", "frequency"};

    String fileName = title.replace(": ", "-");
    fileName = fileName.replace(" ", "_").concat(".csv");
    fileName = resultsPath.toString().concat(File.separator).concat(fileName);

    // CSVFormat with "\n" as record delimiter and ';' as field delimiter
    CSVFormat csvFileFormat = CSVFormat.DEFAULT.withRecordSeparator(NEW_LINE_SEPARATOR).withDelimiter(';');

    // try-with-resources closes the printer first and the writer second, even when writing fails
    try (OutputStreamWriter fileWriter = new OutputStreamWriter(new FileOutputStream(fileName), StandardCharsets.UTF_8);
         CSVPrinter csvFilePrinter = new CSVPrinter(fileWriter, csvFileFormat)) {
        // write info block
        printHeaderInfo(csvFilePrinter, headerInfoBlock);

        // write CSV column header
        csvFilePrinter.printRecord(FILE_HEADER);

        for (Map.Entry<WordLevelType, Map<String, Map<String, Long>>> typeEntry : result.entrySet()) {
            for (Map.Entry<String, Map<String, Long>> keyWordEntry : typeEntry.getValue().entrySet()) {
                for (Map.Entry<String, Long> calculationResults : keyWordEntry.getValue().entrySet()) {
                    List<Object> values = new ArrayList<>();
                    values.add(typeEntry.getKey().getName());
                    values.add(keyWordEntry.getKey());
                    values.add(calculationResults.getKey());
                    values.add(calculationResults.getValue());
                    csvFilePrinter.printRecord(values);
                }
            }
        }
    } catch (Exception e) {
        // deliberately broad: export is best-effort and must not crash the caller
        System.out.println("Error in CsvFileWriter!");
        e.printStackTrace();
    }

    return fileName;
}
/**
 * Prints the info block at the top of a CSV file: one two-column record per map entry
 * (in the map's iteration order), followed by two empty records as a visual separator.
 *
 * @param csvFilePrinter  printer of the file being written
 * @param headerInfoBlock label -> value pairs to print
 * @throws IOException if the underlying writer fails
 */
private static void printHeaderInfo(CSVPrinter csvFilePrinter, LinkedHashMap<String, String> headerInfoBlock) throws IOException {
    for (Map.Entry<String, String> infoLine : headerInfoBlock.entrySet()) {
        csvFilePrinter.printRecord(infoLine.getKey(), infoLine.getValue());
    }

    // 2 empty lines: an empty record is just the record separator
    csvFilePrinter.println();
    csvFilePrinter.println();
}
}

View File

@@ -0,0 +1,31 @@
package util;
public class Key /*implements Comparable<Key> */ {
// private final String value;
//
// Key(String value) {
// this.value = value;
// }
//
// @Override
// public int compareTo(Key o) {
// return Objects.compare(this.value, o.value);
// }
//
// @Override
// public boolean equals(Object o) {
// if (this.equals(o)) {
// return true;
// }
// if (o == null || getClass() != o.getClass()) {
// return false;
// }
// Key key = (Key) o;
// return Objects.equals(value, key.value);
// }
//
// @Override
// public int hashCode() {
// return 0;
// }
}

View File

@@ -0,0 +1,63 @@
package util;
import java.util.concurrent.TimeUnit;
/**
 * Minimal stopwatch built on {@link System#nanoTime()}. The watch starts running as soon
 * as it is created via {@link #start()}.
 *
 * Adapted from http://memorynotfound.com/calculating-elapsed-time-java/
 */
public class TimeWatch {

    /** Value of {@link System#nanoTime()} at the moment the watch was (re)started. */
    private long starts;

    private TimeWatch() {
        reset();
    }

    /**
     * Creates a watch that is already running.
     *
     * @return a new, started watch
     */
    public static TimeWatch start() {
        return new TimeWatch();
    }

    /** Restarts the measurement from now. */
    private TimeWatch reset() {
        starts = System.nanoTime();
        return this;
    }

    /** @return nanoseconds elapsed since the watch was (re)started */
    private long time() {
        long ends = System.nanoTime();
        return ends - starts;
    }

    /** @return elapsed time converted (truncating) to {@code unit} */
    private long time(TimeUnit unit) {
        return unit.convert(time(), TimeUnit.NANOSECONDS);
    }

    /** @return elapsed time as whole minutes plus the remaining whole seconds */
    private String toMinuteSeconds() {
        // sample the clock once so both components describe the same instant
        long elapsedNanos = time();
        long minutes = TimeUnit.NANOSECONDS.toMinutes(elapsedNanos);
        // remaining seconds = total seconds minus the seconds already covered by whole minutes
        long seconds = TimeUnit.NANOSECONDS.toSeconds(elapsedNanos) - TimeUnit.MINUTES.toSeconds(minutes);

        return String.format("%d min, %d sec", minutes, seconds);
    }

    /**
     * Formats the elapsed time split into hours, minutes, seconds and milliseconds.
     *
     * @return a string of the form {@code "H h, M min, S s, MS ms"}
     */
    public String toFullTime() {
        // sample the clock once; re-reading it per component could make the parts disagree
        long elapsedNanos = time();

        long hours = TimeUnit.NANOSECONDS.toHours(elapsedNanos);
        long minutes = TimeUnit.NANOSECONDS.toMinutes(elapsedNanos) - TimeUnit.HOURS.toMinutes(hours);
        long seconds = TimeUnit.NANOSECONDS.toSeconds(elapsedNanos)
                - TimeUnit.HOURS.toSeconds(hours)
                - TimeUnit.MINUTES.toSeconds(minutes);
        long milliseconds = TimeUnit.NANOSECONDS.toMillis(elapsedNanos)
                - TimeUnit.HOURS.toMillis(hours)
                - TimeUnit.MINUTES.toMillis(minutes)
                - TimeUnit.SECONDS.toMillis(seconds);

        return String.format("%d h, %d min, %d s, %d ms", hours, minutes, seconds, milliseconds);
    }

    /** @return the label followed by the elapsed time in nanoseconds */
    @Override
    public String toString() {
        return "Elapsed Time in nano seconds: " + time();
    }

    private void exampleUsage() {
        TimeWatch watch = TimeWatch.start();

        // do something...

        System.out.println("Elapsed Time custom format: " + watch.toMinuteSeconds());
        System.out.println("Elapsed Time in seconds: " + watch.time(TimeUnit.SECONDS));
        System.out.println("Elapsed Time in nano seconds: " + watch.time());
    }
}

View File

@@ -0,0 +1,225 @@
package util;
import java.io.File;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.text.MessageFormat;
import java.util.*;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Stream;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import data.Settings;
import gui.GUIController;
import gui.ValidationUtil;
/**
 * Static helpers for formatting durations and numbers, converting and sorting result maps,
 * and locating the directory the application runs from.
 */
public class Util {
    public final static Logger logger = LogManager.getLogger(Util.class);

    /**
     * Splits a duration into hours, minutes, seconds, milliseconds, microseconds and nanoseconds.
     *
     * @param time duration in nanoseconds
     * @return a string of the form {@code "H h, M min, S s, MS ms, US µs, NS ns"}
     */
    public static String toReadableTime(long time) {
        long hours = time(TimeUnit.HOURS, time);
        long minutes = time(TimeUnit.MINUTES, time) - TimeUnit.HOURS.toMinutes(hours);
        long seconds = time(TimeUnit.SECONDS, time) - TimeUnit.HOURS.toSeconds(hours) - TimeUnit.MINUTES.toSeconds(minutes);
        long milliseconds = time(TimeUnit.MILLISECONDS, time) - TimeUnit.HOURS.toMillis(hours) - TimeUnit.MINUTES.toMillis(minutes) - TimeUnit.SECONDS.toMillis(seconds);
        long microseconds = time(TimeUnit.MICROSECONDS, time) - TimeUnit.HOURS.toMicros(hours) - TimeUnit.MINUTES.toMicros(minutes) - TimeUnit.SECONDS.toMicros(seconds) - TimeUnit.MILLISECONDS.toMicros(milliseconds);
        long nanoseconds = time(TimeUnit.NANOSECONDS, time) - TimeUnit.HOURS.toNanos(hours) - TimeUnit.MINUTES.toNanos(minutes) - TimeUnit.SECONDS.toNanos(seconds) - TimeUnit.MILLISECONDS.toNanos(milliseconds) - TimeUnit.MICROSECONDS.toNanos(microseconds);

        return String.format("%d h, %d min, %d s, %d ms, %d µs, %d ns", hours, minutes, seconds, milliseconds, microseconds, nanoseconds);
    }

    /** Converts {@code t} nanoseconds (truncating) to {@code unit}. */
    private static long time(TimeUnit unit, long t) {
        return unit.convert(t, TimeUnit.NANOSECONDS);
    }

    /**
     * Converts a number to a more readable format.
     * 12345 -> 12.345
     * 12345,678 -> 12.345,67
     *
     * @param o byte, double, float, int,long, short
     *
     * @return number formatted with thousands separator and 2 decimal places (floats);
     *         a fixed error text for any other input, including {@code null}
     */
    private static String formatNumberReadable(Object o) {
        if (isInstanceOfInteger(o))
            return String.format("%,d", o);
        else if (isInstanceOfFloat(o))
            return String.format("%,.2f", o);
        else
            return "- invalid input format -";
    }

    /**
     * Formats a fraction as a percentage with up to three decimal places (0.5 -> 50%).
     *
     * @param o a numeric value
     * @return the formatted percentage
     */
    public static String formatNumberAsPercent(Object o) {
        return MessageFormat.format("{0,number,#.###%}", o);
    }

    /** @return true if {@code o} is a boxed byte, short, int or long; false otherwise (also for null) */
    private static boolean isInstanceOfInteger(Object o) {
        return o instanceof Byte || o instanceof Short || o instanceof Integer || o instanceof Long;
    }

    /** @return true if {@code o} is a boxed float or double; false otherwise (also for null) */
    private static boolean isInstanceOfFloat(Object o) {
        return o instanceof Float || o instanceof Double;
    }

    /**
     * Prints every entry of the map to stdout as "key: value".
     * Values are formatted with {@code %,8d}, so they must be integral numbers;
     * any other value type makes {@link String#format} throw.
     */
    public static <K, V> void printMap(Map<K, V> map) {
        System.out.println("\nkey: value");
        map.forEach((k, v) -> System.out.print(String.format("%s:\t %,8d%n", k, v)));
        System.out.println();
    }

    /**
     * Generic map converter -> since AtomicLongs aren't as comparable.
     * Converts ConcurrentHashMap<K, AtomicLong> to HashMap<K, Long>
     *
     * @param map map whose values are all {@link AtomicLong} instances (anything else fails the cast)
     * @return a new map from {@code key.toString()} to the value's current {@code long}
     */
    public static <K, V> Map<String, Long> atomicInt2StringAndInt(Map<K, V> map) {
        Map<String, Long> converted = new HashMap<>();

        for (Map.Entry<K, V> e : map.entrySet()) {
            converted.put(e.getKey().toString(), ((AtomicLong) e.getValue()).longValue());
        }

        return converted;
    }

    /**
     * Sorts a map in a descending order by value.
     *
     * @param map   map to sort; it is not modified
     * @param limit maximum number of entries to keep; 0 or less means no limit
     * @return a new insertion-ordered map holding the (at most {@code limit}) largest entries
     */
    public static <K, V extends Comparable<? super V>> Map<K, V> sortByValue(Map<K, V> map, int limit) {
        // if limit is set to 0 or less, we take that to mean no limit at all
        if (limit <= 0) {
            limit = map.size();
        }

        Map<K, V> result = new LinkedHashMap<>();
        TimeWatch watch = TimeWatch.start();

        // sorted() is lazy; the actual O(n log n) sort runs when forEachOrdered consumes the stream
        Stream<Map.Entry<K, V>> st = map.entrySet().stream();
        st.sorted(Map.Entry.comparingByValue(Comparator.reverseOrder())).limit(limit)
                .forEachOrdered(e -> result.put(e.getKey(), e.getValue()));

        if (Settings.PRINT_LOG) {
            System.out.println(String.format("Elapsed time for sorting %s items: %s",
                    formatNumberReadable(result.size()),
                    watch.toFullTime()));
        }

        return result;
    }

    /**
     * Prints a titled table of "key: value percent" lines to stdout, where percent is
     * the value's share of {@code number_of_words}.
     */
    public static <K, V> void printMap(Map<K, Integer> map, String title, int number_of_words) {
        System.out.println(String.format("\n%s\n------------\nkey: value\tpercent", title));
        map.forEach((k, v) ->
                System.out.println(String.format("%s:\t %s\t %s%%",
                        k,
                        Util.formatNumberReadable(v),
                        Util.formatNumberReadable((double) v / number_of_words * 100))));
        System.out.println();
    }

    /** @return the sum of all values in the map (0 for an empty map) */
    static long mapSumFrequencies(Map<String, Long> map) {
        long sum = 0;

        for (long value : map.values()) {
            sum += value;
        }

        return sum;
    }

    /**
     * Used for passing optional integer values for sorting.
     *
     * @return the first argument if it is present and positive, otherwise 0
     */
    public static int getValidInt(int... i) {
        if (i == null || i.length < 1 || i[0] <= 0) {
            return 0;
        } else {
            return i[0];
        }
    }

    /**
     * Check whether a map is empty. It also considers an edge case where the map's values are
     * collections: a map whose values are all empty collections (or null) counts as empty.
     * Any other non-null value counts as content.
     */
    public static <K, V> boolean isMapEmpty(Map<K, V> map) {
        if (map.isEmpty()) {
            // default
            return true;
        }

        // otherwise check if keys map to values that are empty
        for (V v : map.values()) {
            if (v == null) {
                continue;
            }

            if (v instanceof Collection) {
                if (!((Collection<?>) v).isEmpty()) {
                    return false;
                }
            } else {
                // a non-collection value is real content
                return false;
            }
        }

        return true;
    }

    /**
     * Returns the location of the main class if possible, otherwise null
     */
    public static File getWorkingDirectory() {
        String decodedPath = null;

        // get location of the currently executing class; the code source may be unavailable
        java.security.CodeSource codeSource = GUIController.class.getProtectionDomain().getCodeSource();

        if (codeSource != null && codeSource.getLocation() != null) {
            String path = codeSource.getLocation().getPath();
            logger.info("working dir path: {}", path);

            try {
                decodedPath = URLDecoder.decode(path, "UTF-8");
            } catch (UnsupportedEncodingException e) {
                logger.error("decoding: ", e);
            }
        }

        if (decodedPath != null) {
            File workingDirectory = new File(decodedPath);

            // in case it's a file (class is packaged inside a jar), select its parent folder
            workingDirectory = workingDirectory.isFile() ? workingDirectory.getParentFile() : workingDirectory;
            if (ValidationUtil.isReadableDirectory(workingDirectory)) {
                logger.info("working dir is ok: {}", workingDirectory.getAbsolutePath());

                return workingDirectory;
            }
        }

        logger.info("working dir returing null");
        return null;
    }
}

View File

@@ -0,0 +1,132 @@
package util.db;
import static util.ByteUtils.*;
import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.time.LocalDateTime;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.commons.io.FileUtils;
import org.rocksdb.*;
import util.TimeWatch;
/**
 * Wrapper around a temporary RocksDB store used to accumulate string -> counter results on disk.
 * The store lives in its own directory under {@code java.io.tmpdir}. Every operation opens the
 * database by path, does its work and closes it again; no handle is kept open between calls.
 */
public class RDB {

    // never holds an open handle: createDB() closes the database before returning
    private RocksDB db;
    private String path;
    private static final String UTF_8 = "UTF-8";

    /** Picks a per-instance directory name and creates the (empty) store on disk. */
    public RDB() {
        // different dbs in case of concurrent calculations
        this.path = System.getProperty("java.io.tmpdir")
                .concat(File.separator)
                .concat(String.format("corpusAnalyzer_db%d", LocalDateTime.now().toString().hashCode()));
        this.db = createDB();
    }

    /**
     * Creates the database directory by opening the store once with
     * {@code createIfMissing} and closing it right away.
     *
     * @return always {@code null}, because the handle is closed before this method returns.
     *         NOTE(review): if a long-lived handle was intended, writeBatch/getDump must stop
     *         re-opening the store by path, since they would conflict with a handle kept open here.
     */
    private RocksDB createDB() {
        RocksDB.loadLibrary();

        // the Options class contains a set of configurable DB options
        // that determines the behaviour of the database.
        try (final Options options = new Options()) {
            options.setCreateIfMissing(true);

            // opening is enough to create the store; close it again immediately
            try (final RocksDB rdb = RocksDB.open(options, path)) {
                // nothing to do
            }
        } catch (RocksDBException e) {
            // report instead of silently swallowing; later calls will fail to open the store
            e.printStackTrace();
        }

        return null;
    }

    /**
     * Adds the given counters to the values already stored under the same keys
     * (missing keys start at the given value) and writes them as one batch.
     * Database errors are reported on stderr; a key whose current value cannot be
     * read is left out of the batch.
     *
     * @param results key -> amount to add
     * @throws UnsupportedEncodingException if UTF-8 is not supported
     */
    public void writeBatch(Map<String, AtomicLong> results) throws UnsupportedEncodingException {
        RocksDB.loadLibrary();

        // all four objects wrap native memory and must be closed; options are closed last
        try (final Options options = new Options();
             final RocksDB rdb = RocksDB.open(options, path);
             final WriteBatch wb = new WriteBatch();
             final WriteOptions writeOptions = new WriteOptions()) {

            for (Map.Entry<String, AtomicLong> entry : results.entrySet()) {
                byte[] key = entry.getKey().getBytes(UTF_8);
                long resultValue = entry.getValue().longValue();

                try {
                    // value == null if key does not exist in db.
                    final byte[] dbValue = rdb.get(key);
                    long merged = dbValue != null ? bytesToLong(dbValue) + resultValue : resultValue;
                    wb.put(key, longToBytes(merged));
                } catch (RocksDBException e) {
                    // report the dropped key instead of losing it silently
                    System.out.println(String.format("Skipping key '%s' in batch", entry.getKey()));
                    e.printStackTrace();
                }
            }

            TimeWatch watch = TimeWatch.start();
            rdb.write(writeOptions, wb);
            System.out.println(String.format("Writing %d entries took: %s", wb.count(), watch.toFullTime()));
        } catch (RocksDBException e) {
            e.printStackTrace();
        }
    }

    /**
     * @return the stored handle, which is always {@code null} (see {@link #createDB()})
     */
    public RocksDB getDb() {
        return db;
    }

    /**
     * Reads the whole store into memory.
     *
     * @return key -> counter for every entry; empty (or partial) if the store cannot be read
     * @throws UnsupportedEncodingException if UTF-8 is not supported
     */
    public Map<String, AtomicLong> getDump() throws UnsupportedEncodingException {
        Map<String, AtomicLong> dump = new HashMap<>();
        RocksDB.loadLibrary();

        // options, database and iterator all wrap native memory and must be closed
        try (final Options options = new Options();
             final RocksDB rdb = RocksDB.open(options, path);
             final RocksIterator it = rdb.newIterator()) {

            for (it.seekToFirst(); it.isValid(); it.next()) {
                byte[] key = it.key();
                byte[] value = it.value();
                dump.put(new String(key, UTF_8), new AtomicLong(bytesToLong(value)));
            }
        } catch (RocksDBException e) {
            e.printStackTrace();
        }

        return dump;
    }

    /** Removes the database directory from disk; failures are reported on stderr. */
    public void delete() {
        try {
            FileUtils.deleteDirectory(new File(path));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}