Project copied

This commit is contained in:
Luka 2018-06-19 09:15:37 +02:00
commit a18e52a599
94 changed files with 87092 additions and 0 deletions

160
.gitignore vendored Normal file
View File

@ -0,0 +1,160 @@
# Created by .ignore support plugin (hsz.mobi)
### Maven template
target/
pom.xml.tag
pom.xml.releaseBackup
pom.xml.versionsBackup
pom.xml.next
release.properties
dependency-reduced-pom.xml
buildNumber.properties
.mvn/timing.properties
# Avoid ignoring Maven wrapper jar file (.jar files are usually ignored)
!/.mvn/wrapper/maven-wrapper.jar
### JetBrains template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
# User-specific stuff:
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/dictionaries
.idea/
# Sensitive or high-churn files:
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.xml
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
# Gradle:
.idea/**/gradle.xml
.idea/**/libraries
# Mongo Explorer plugin:
.idea/**/mongoSettings.xml
## File-based project format:
*.iws
## Plugin-specific files:
# IntelliJ
/out/
# mpeltonen/sbt-idea plugin
.idea_modules/
# JIRA plugin
atlassian-ide-plugin.xml
# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties
### Java template
# Compiled class file
# Log file
*.log
# BlueJ files
*.ctxt
# Mobile Tools for Java (J2ME)
.mtj.tmp/
# Package Files #
*.war
*.ear
*.zip
*.tar.gz
*.rar
# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
hs_err_pid*
### Eclipse template
.metadata
bin/
tmp/
*.tmp
*.bak
*.swp
*~.nib
local.properties
.settings/
.loadpath
.recommenders
# Eclipse Core
.project
# External tool builders
.externalToolBuilders/
# Locally stored "Eclipse launch configurations"
*.launch
# PyDev specific (Python IDE for Eclipse)
*.pydevproject
# CDT-specific (C/C++ Development Tooling)
.cproject
# JDT-specific (Eclipse Java Development Tools)
.classpath
# Java annotation processor (APT)
.factorypath
# PDT-specific (PHP Development Tools)
.buildpath
# sbteclipse plugin
.target
# Tern plugin
.tern-project
# TeXlipse plugin
.texlipse
# STS (Spring Tool Suite)
.springBeans
# Code Recommenders
.recommenders/
# Scala IDE specific (Scala & Java development for Eclipse)
.cache-main
.scala_dependencies
.worksheet
### Windows ###
# Windows thumbnail cache files
Thumbs.db
ehthumbs.db
ehthumbs_vista.db
# Folder config file
Desktop.ini
# Recycle Bin used on file shares
$RECYCLE.BIN/
# Windows Installer files
*.cab
*.msi
*.msm
*.msp
# Windows shortcuts
*.lnk

28
Corpus Analyzer.iml Normal file
View File

@ -0,0 +1,28 @@
<?xml version="1.0" encoding="UTF-8"?>
<module org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule="true" type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_8">
<output url="file://$MODULE_DIR$/target/classes" />
<output-test url="file://$MODULE_DIR$/target/test-classes" />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/main/resources" type="java-resource" />
<sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" />
<excludeFolder url="file://$MODULE_DIR$/target" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" name="Maven: commons-io:commons-io:2.5" level="project" />
<orderEntry type="library" name="Maven: org.apache.commons:commons-lang3:3.6" level="project" />
<orderEntry type="library" name="Maven: com.googlecode.json-simple:json-simple:1.1.1" level="project" />
<orderEntry type="library" name="Maven: junit:junit:4.10" level="project" />
<orderEntry type="library" name="Maven: org.hamcrest:hamcrest-core:1.1" level="project" />
<orderEntry type="library" name="Maven: org.apache.commons:commons-csv:1.4" level="project" />
<orderEntry type="library" name="Maven: org.controlsfx:controlsfx:8.40.13" level="project" />
<orderEntry type="library" name="Maven: org.rocksdb:rocksdbjni:5.7.3" level="project" />
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-api:2.9.0" level="project" />
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-core:2.9.0" level="project" />
<orderEntry type="library" name="Maven: org.kordamp.ikonli:ikonli-fontawesome-pack:1.9.0" level="project" />
<orderEntry type="library" name="Maven: org.kordamp.ikonli:ikonli-core:1.9.0" level="project" />
<orderEntry type="library" name="Maven: org.kordamp.ikonli:ikonli-javafx:1.9.0" level="project" />
</component>
</module>

122
pom.xml Normal file
View File

@ -0,0 +1,122 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>thesis</groupId>
<artifactId>corpus-analyzer</artifactId>
<version>1.2</version>
<dependencies>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.5</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.6</version>
</dependency>
<dependency>
<groupId>com.googlecode.json-simple</groupId>
<artifactId>json-simple</artifactId>
<version>1.1.1</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-csv</artifactId>
<version>1.4</version>
</dependency>
<dependency>
<groupId>org.controlsfx</groupId>
<artifactId>controlsfx</artifactId>
<version>8.40.13</version>
</dependency>
<dependency>
<groupId>org.rocksdb</groupId>
<artifactId>rocksdbjni</artifactId>
<version>5.7.3</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-api</artifactId>
<version>2.9.0</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>2.9.0</version>
</dependency>
<dependency>
<groupId>org.kordamp.ikonli</groupId>
<artifactId>ikonli-fontawesome-pack</artifactId>
<version>1.9.0</version>
</dependency>
<dependency>
<groupId>org.kordamp.ikonli</groupId>
<artifactId>ikonli-javafx</artifactId>
<version>1.9.0</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<!-- packages dependencies into the jar -->
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
<configuration>
<archive>
<manifest>
<mainClass>gui.GUIController</mainClass>
</manifest>
</archive>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
<appendAssemblyId>false</appendAssemblyId>
<outputDirectory>artifact</outputDirectory>
<finalName>Corpus_Analyzer_${version}</finalName>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<!-- JavaFX -->
<groupId>com.zenjava</groupId>
<artifactId>javafx-maven-plugin</artifactId>
<version>8.6.0</version>
<configuration>
<mainClass>gui.GUIController</mainClass>
<verbose>true</verbose>
</configuration>
<executions>
<execution>
<id>create-jfxjar</id>
<phase>package</phase>
<goals>
<goal>build-jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
</plugins>
</build>
</project>

View File

@ -0,0 +1,3 @@
Manifest-Version: 1.0
Main-Class: gui.GUIController

View File

@ -0,0 +1,15 @@
package alg;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;
/**
 * Shared helpers for the counting algorithms.
 */
public class Common {
	/**
	 * Records one occurrence of {@code o} in the given frequency map:
	 * inserts the key with a counter of 1 on first sight, otherwise
	 * increments the existing counter.
	 *
	 * <p>The type parameter {@code V} is unused but kept for source
	 * compatibility with callers that supply explicit type witnesses.
	 *
	 * @param map frequency map from keys to their occurrence counters
	 * @param o   the key whose count should be bumped
	 */
	public static <K, V> void updateMap(Map<K, AtomicLong> map, K o) {
		// First occurrence: insert a fresh counter already set to 1.
		AtomicLong existing = map.putIfAbsent(o, new AtomicLong(1));
		// Key already present: bump the counter putIfAbsent handed back.
		// (The original re-fetched it with map.get(o) — a redundant second
		// lookup, and a lost-update hazard on concurrent maps.)
		if (existing != null) {
			existing.incrementAndGet();
		}
	}
}
}

View File

@ -0,0 +1,794 @@
package alg;
import static data.Enums.solar.SolarFilters.*;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.util.*;
import java.util.concurrent.ForkJoinPool;
import javax.xml.namespace.QName;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.*;
import org.apache.logging.log4j.LogManager;
import data.*;
import gui.ValidationUtil;
public class XML_processing {
public final static org.apache.logging.log4j.Logger logger = LogManager.getLogger(XML_processing.class);
// public static void processCorpus(Statistics stats) {
// // we can preset the list's size, so there won't be a need to resize it
// List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT);
//
// int i = 0;
// for (File f : Settings.corpus) {
// i++;
// readXML(f.toString(), stats);
// }
// }
// public static void readXML(String path, Statistics stats) {
// if (stats.getCorpusType() == CorpusType.GIGAFIDA) {
// readXMLGigafida(path, stats);
// } else if (stats.getCorpusType() == CorpusType.GOS) {
// readXMLGos(path, stats);
// } else if (stats.getCorpusType() == CorpusType.SOLAR) {
// readXMLSolar(path, stats);
// }
// }
/**
 * Dispatches parsing of the XML file at {@code path} to the reader that
 * matches the corpus type configured in {@code stats}. Unrecognized corpus
 * types are silently ignored.
 */
public static void readXML(String path, StatisticsNew stats) {
	CorpusType corpusType = stats.getCorpus().getCorpusType();
	if (corpusType == CorpusType.SOLAR) {
		readXMLSolar(path, stats);
	} else if (corpusType == CorpusType.GOS) {
		readXMLGos(path, stats);
	} else if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES) {
		readXMLGigafida(path, stats);
	}
}
/**
 * Reads and returns the text content of the first occurrence of the given
 * header tag, or an empty string when the tag is missing, empty, or the
 * file cannot be opened/parsed.
 * E.g. the title tag, for discerning the corpus' type.
 * Notice: returns only the value of the first occurrence of a given tag name.
 *
 * @param path file to scan
 * @param tag  local element name to look for (case-insensitive)
 * @return the tag's text content, or {@code ""}
 */
public static String readXMLHeaderTag(String path, String tag) {
	XMLInputFactory factory = XMLInputFactory.newInstance();
	XMLEventReader eventReader = null;
	try {
		eventReader = factory.createXMLEventReader(new FileInputStream(path));
		while (eventReader.hasNext()) {
			XMLEvent xmlEvent = eventReader.nextEvent();
			if (xmlEvent.isStartElement()) {
				StartElement startElement = xmlEvent.asStartElement();
				String elementName = startElement.getName().getLocalPart();
				if (elementName.equalsIgnoreCase(tag)) {
					// Guard against empty elements (<tag/> or <tag></tag>):
					// the event following the start tag is then not a
					// character event, and an unconditional asCharacters()
					// would throw ClassCastException.
					XMLEvent next = eventReader.nextEvent();
					return next.isCharacters() ? next.asCharacters().getData() : "";
				}
			}
		}
	} catch (FileNotFoundException | XMLStreamException e) {
		e.printStackTrace();
	} finally {
		if (eventReader != null) {
			try {
				eventReader.close();
			} catch (XMLStreamException e) {
				logger.error("closing stream", e);
			}
		}
	}
	return "";
}
/**
 * Hands the accumulated chunk of sentences to the Fork-Join task matching
 * the analysis level selected in the user's filter and blocks until the
 * computation finishes. Unsupported levels are ignored for now.
 */
private static void fj(List<Sentence> corpus, StatisticsNew stats) {
	AnalysisLevel level = stats.getFilter().getAl();
	ForkJoinPool pool = new ForkJoinPool();
	if (level == AnalysisLevel.STRING_LEVEL) {
		pool.invoke(new alg.ngram.ForkJoin(corpus, stats));
	} else if (level == AnalysisLevel.WORD_LEVEL) {
		pool.invoke(new alg.word.ForkJoin(corpus, stats));
	}
	// TODO: inflected-JOS analysis level is not implemented yet
	// (alg.inflectedJOS.ForkJoin)
}
// public static void readXMLGos(String path, Statistics stats) {
// boolean in_word = false;
// String taksonomija = "";
// String lemma = "";
// String msd = "";
// String type = stats.isGosOrthMode() ? "orth" : "norm"; // orth & norm
//
// List<Word> stavek = new ArrayList<>();
// List<Sentence> corpus = new ArrayList<>();
// String sentenceDelimiter = "seg";
// String taxonomyPrefix = "gos.";
//
// try {
// XMLInputFactory factory = XMLInputFactory.newInstance();
// XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path));
//
// while (eventReader.hasNext()) {
// XMLEvent event = eventReader.nextEvent();
//
// switch (event.getEventType()) {
// case XMLStreamConstants.START_ELEMENT:
//
// StartElement startElement = event.asStartElement();
// String qName = startElement.getName().getLocalPart();
//
// // "word" node
// if (qName.equals("w")) {
// in_word = true;
//
// if (type.equals("norm")) {
// // make sure we're looking at <w lemma...> and not <w type...>
// Iterator var = startElement.getAttributes();
// ArrayList<Object> attributes = new ArrayList<>();
// while (var.hasNext()) {
// attributes.add(var.next());
// }
//
// if (attributes.contains("msd")) {
// msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
// } else {
// msd = null;
// }
//
// if (attributes.contains("lemma")) {
// lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
// }
// }
// }
// // taxonomy node
// else if (qName.equalsIgnoreCase("catRef")) {
// // there are some term nodes at the beginning that are of no interest to us
// // they differ by not having the attribute "ref", so test will equal null
// Attribute test = startElement.getAttributeByName(QName.valueOf("target"));
//
// if (test != null) {
// // keep only taxonomy properties
// taksonomija = String.valueOf(test.getValue()).replace(taxonomyPrefix, "");
// }
// } else if (qName.equalsIgnoreCase("div")) {
// type = String.valueOf(startElement.getAttributeByName(QName.valueOf("type")).getValue());
//
// }
// break;
//
// case XMLStreamConstants.CHARACTERS:
// Characters characters = event.asCharacters();
//
// // "word" node value
// if (in_word) {
// if (type.equals("norm") && msd != null) {
// stavek.add(new Word(characters.getData(), lemma, msd));
// } else {
// stavek.add(new Word(characters.getData()));
// }
//
// in_word = false;
// }
// break;
//
// case XMLStreamConstants.END_ELEMENT:
// EndElement endElement = event.asEndElement();
//
// // parser reached end of the current sentence
// if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
// // add sentence to corpus
// corpus.add(new Sentence(stavek, taksonomija, type));
// // and start a new one
// stavek = new ArrayList<>();
//
// /* Invoke Fork-Join when we reach maximum limit of
// * sentences (because we can't read everything to
// * memory) or we reach the end of the file.
// */
// if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
// fj(corpus, stats);
// // empty the current corpus, since we don't need
// // the data anymore
// corpus.clear();
// }
// }
//
// // backup
// if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
// fj(corpus, stats);
// corpus.clear();
// }
//
// break;
// }
// }
// } catch (FileNotFoundException | XMLStreamException e) {
// e.printStackTrace();
// }
// }
/**
 * Reads a Solar (school-corpus) XML file. Word tokens (w3) are collected
 * into sentences; a sentence ends at a '.' punctuation token (c3). Each
 * body's preceding head block is matched against the user's solar filters,
 * and only sentences from blocks that pass are collected. Chunks of
 * sentences are handed to the Fork-Join machinery via fj().
 */
@SuppressWarnings("unused")
public static void readXMLSolar(String path, StatisticsNew stats) {
	// parser state: true between a <w3> start tag and its text content
	boolean in_word = false;
	String lemma = "";
	String msd = "";
	// sentence being built, and the chunk of finished sentences
	List<Word> stavek = new ArrayList<>();
	List<Sentence> corpus = new ArrayList<>();
	// used for filter: header tags whose values the user can filter on
	Set<String> headTags = new HashSet<>(Arrays.asList("sola", "predmet", "razred", "regija", "tip", "leto"));
	// tag -> value pairs of the <head> block currently being read.
	// NOTE(review): stays null until the first <head> start tag; a filter
	// tag appearing outside a head block would NPE below — confirm the
	// corpus guarantees that ordering.
	Map<String, String> headBlock = null;
	boolean includeThisBlock = false;
	try {
		XMLInputFactory factory = XMLInputFactory.newInstance();
		XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path));
		while (eventReader.hasNext()) {
			XMLEvent event = eventReader.nextEvent();
			switch (event.getEventType()) {
				case XMLStreamConstants.START_ELEMENT:
					StartElement startElement = event.asStartElement();
					String qName = startElement.getName().getLocalPart();
					// "word" node
					if (qName.equals("w3")) {
						in_word = true;
						msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
						lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
					} else if (qName.equals("c3")) {
						// punctuation node; a '.' ends the current sentence
						String c3Content = eventReader.nextEvent().asCharacters().getData();
						if (c3Content.equals(".") && includeThisBlock) {
							// add sentence to corpus
							corpus.add(new Sentence(stavek));
							// and start a new one
							stavek = new ArrayList<>();
							/* Invoke Fork-Join when we reach maximum limit of
							 * sentences (because we can't read everything to
							 * memory) or we reach the end of the file.
							 */
							if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
								fj(corpus, stats);
								// empty the current corpus, since we don't need
								// the data anymore
								corpus.clear();
							}
						}
					} else if (headTags.contains(qName)) {
						// one of the filterable header tags: remember its value
						String tagContent = eventReader.nextEvent().asCharacters().getData();
						headBlock.put(qName, tagContent);
					} else if (qName.equals("head")) {
						// a new header block begins
						headBlock = new HashMap<>();
					}
					break;
				case XMLStreamConstants.CHARACTERS:
					Characters characters = event.asCharacters();
					// "word" node value
					if (in_word) {
						stavek.add(new Word(characters.getData(), lemma, msd));
						in_word = false;
					}
					break;
				case XMLStreamConstants.END_ELEMENT:
					EndElement endElement = event.asEndElement();
					String qNameEnd = endElement.getName().getLocalPart();
					if (qNameEnd.equals("head")) {
						// validate and set boolean: sentences that follow are
						// collected only when the head block passes the filters
						if (validateHeadBlock(headBlock, stats.getFilter().getSolarFilters())) {
							includeThisBlock = true;
						}
					} else if (qNameEnd.equals("body")) {
						// new block, reset filter status
						includeThisBlock = false;
					}
					// backup: flush whatever is left when the corpus root closes
					if (endElement.getName().getLocalPart().equalsIgnoreCase("korpus")) {
						fj(corpus, stats);
						corpus.clear();
					}
					break;
			}
		}
	} catch (FileNotFoundException | XMLStreamException e) {
		e.printStackTrace();
	}
}
/**
 * Checks a head block read from the corpus against the filters set by the
 * user. For each filter key, the block passes when its value for that key
 * matches <b>any</b> of the user's allowed values.
 *
 * @param readHeadBlock block of tags read from the corpus (tag -> value)
 * @param userSetFilter tags with sets of allowed values set by the user;
 *                      {@code null} means no filtering
 * @return true when the block passes all set filters
 */
private static boolean validateHeadBlock(Map<String, String> readHeadBlock, HashMap<String, HashSet<String>> userSetFilter) {
	// no filters set -> everything passes
	if (userSetFilter == null) {
		return true;
	}
	for (Map.Entry<String, HashSet<String>> filterEntry : userSetFilter.entrySet()) {
		String key = filterEntry.getKey();
		HashSet<String> allowedValues = filterEntry.getValue();
		if (allowedValues == null || allowedValues.isEmpty()) {
			// no restriction for this key
			continue;
		}
		// BUG FIX: the original kept only the result of the LAST iterated
		// value, so with several allowed values the outcome depended on
		// HashSet iteration order. A block's single value for a key passes
		// when it matches any of the allowed values.
		boolean matchesAny = false;
		for (String value : allowedValues) {
			if (validateHeadBlockEntry(readHeadBlock, key, value)) {
				matchesAny = true;
				break;
			}
		}
		if (!matchesAny) {
			// current head block fails one of the set filters
			return false;
		}
	}
	// if it gets to this point, it passed all the filters
	return true;
}
/**
 * Checks a single user-set filter entry against the tags read from a head
 * block: true only when the block contains the key and its value equals the
 * user-set value.
 */
private static boolean validateHeadBlockEntry(Map<String, String> readHeadBlock, String userSetKey, String userSetValue) {
	// missing key, or a differing value, fails this filter entry
	return readHeadBlock.containsKey(userSetKey)
			&& readHeadBlock.get(userSetKey).equals(userSetValue);
}
/**
 * Parses XML headers for information about the corpus taxonomy (if
 * supported) or its filters (solar).
 *
 * @param filepath      corpus file to scan
 * @param corpusIsSplit true when the corpus is split into multiple xml files
 *                      (one header per file), false when all entries are
 *                      grouped into one large xml file
 * @param corpusType    type of corpus being read
 * @return a {@code HashSet<String>} of taxonomy ids for taxonomy corpora,
 *         otherwise a {@code HashMap<String, HashSet<String>>} of solar
 *         filter values (possibly partial if a read error occurred)
 */
public static Object readXmlHeaderTaxonomyAndFilters(String filepath, boolean corpusIsSplit, CorpusType corpusType) {
	boolean parseTaxonomy = Tax.getCorpusTypesWithTaxonomy().contains(corpusType);
	// solar: filter tag name -> set of values seen in the corpus
	Set<String> headTags = null;
	HashMap<String, HashSet<String>> resultFilters = new HashMap<>();
	// taxonomy corpora
	HashSet<String> resultTaxonomy = new HashSet<>();
	String headTagName;
	if (corpusType == CorpusType.SOLAR) {
		headTagName = "head";
		// used for filter
		headTags = new HashSet<>(Arrays.asList(SOLA, PREDMET, RAZRED, REGIJA, TIP, LETO));
		// init results now to avoid null pointers
		headTags.forEach(f -> resultFilters.put(f, new HashSet<>()));
	} else {
		headTagName = "teiHeader";
	}
	XMLInputFactory factory = XMLInputFactory.newInstance();
	XMLEventReader xmlEventReader = null;
	try {
		xmlEventReader = factory.createXMLEventReader(new FileInputStream(filepath));
		boolean insideHeader = false;
		while (xmlEventReader.hasNext()) {
			XMLEvent xmlEvent = xmlEventReader.nextEvent();
			if (xmlEvent.isStartElement()) {
				StartElement startElement = xmlEvent.asStartElement();
				String elementName = startElement.getName().getLocalPart();
				if (elementName.equalsIgnoreCase(headTagName)) {
					// if the corpus is split into files, we skip bodies
					// this toggle is true while we're inside a header (next
					// block of code executes) and false when we're not
					// (skip reading unnecessary attributes)
					insideHeader = true;
				}
				if (insideHeader) {
					if (parseTaxonomy && elementName.equalsIgnoreCase("catRef")) {
						// e.g. target="#tax.1" -> "tax.1"
						String tax = startElement.getAttributeByName(QName.valueOf("target"))
								.getValue()
								.replace("#", "");
						resultTaxonomy.add(tax);
					} else if (!parseTaxonomy && headTags != null && headTags.contains(elementName)) {
						// null guard added: headTags is only initialized for
						// SOLAR; other non-taxonomy corpus types would NPE here
						String tagContent = xmlEventReader.nextEvent().asCharacters().getData();
						resultFilters.get(elementName).add(tagContent);
					}
				}
			} else if (xmlEvent.isEndElement() && corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
				// if the corpus is split into multiple files, each with only one header block per file
				// that means we should stop after we reach the end of the header
				return parseTaxonomy ? resultTaxonomy : resultFilters;
			} else if (xmlEvent.isEndElement() && !corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
				// whole corpus in one file, so we have to continue reading in order to find all header blocks
				insideHeader = false;
			}
		}
	} catch (XMLStreamException e) {
		logger.error("Streaming error", e);
		return parseTaxonomy ? resultTaxonomy : resultFilters;
	} catch (FileNotFoundException e) {
		logger.error("File not found", e);
		// TODO: keep a list of files that threw this error and a dirty boolean marker -> if true, alert user
		return parseTaxonomy ? resultTaxonomy : resultFilters;
	} finally {
		if (xmlEventReader != null) {
			try {
				xmlEventReader.close();
			} catch (XMLStreamException e) {
				logger.error("closing stream", e);
			}
		}
	}
	return parseTaxonomy ? resultTaxonomy : resultFilters;
}
/** True when the given end-element event closes the header tag. */
private static boolean isEndElementEndOfHeader(XMLEvent event, String headerTag) {
	String closedName = event.asEndElement().getName().getLocalPart();
	return closedName.equalsIgnoreCase(headerTag);
}
/**
 * Reads a Gigafida/ccKres XML file: collects word tokens into sentences
 * (sentence boundary = the "s" element), runs each finished sentence through
 * {@link #runFilters}, and hands chunks of sentences to the Fork-Join
 * machinery via {@link #fj}. The file's taxonomy (read from its teiHeader)
 * is first intersected with the user's taxonomy filter; when they don't
 * overlap the file is skipped.
 *
 * @return false when the file was skipped because its taxonomy did not match
 *         the filter, true otherwise
 */
@SuppressWarnings("Duplicates")
public static boolean readXMLGigafida(String path, StatisticsNew stats) {
	boolean inWord = false;
	ArrayList<String> currentFiletaxonomy = new ArrayList<>();
	String lemma = "";
	String msd = "";
	List<Word> sentence = new ArrayList<>();
	List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it
	String sentenceDelimiter = "s";
	XMLEventReader eventReader = null;
	try {
		XMLInputFactory factory = XMLInputFactory.newInstance();
		eventReader = factory.createXMLEventReader(new FileInputStream(path));
		while (eventReader.hasNext()) {
			XMLEvent event = eventReader.nextEvent();
			switch (event.getEventType()) {
				case XMLStreamConstants.START_ELEMENT:
					StartElement startElement = event.asStartElement();
					String qName = startElement.getName().getLocalPart();
					// "word" node
					if (qName.equals("w")) {
						inWord = true;
						msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
						lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
					}
					// taxonomy node
					else if (qName.equalsIgnoreCase("catRef")) {
						// there are some term nodes at the beginning that are of no interest to us
						// they differ by not having the attribute "ref", so test will equal null
						Attribute tax = startElement.getAttributeByName(QName.valueOf("target"));
						if (tax != null) {
							// keep only taxonomy properties
							currentFiletaxonomy.add(String.valueOf(tax.getValue()).replace("#", ""));
						}
					}
					break;
				case XMLStreamConstants.CHARACTERS:
					Characters characters = event.asCharacters();
					// "word" node value
					if (inWord) {
						String word = characters.getData();
						sentence.add(new Word(word, lemma, msd));
						inWord = false;
					}
					break;
				case XMLStreamConstants.END_ELEMENT:
					EndElement endElement = event.asEndElement();
					// (removed unused debugging locals "var" and "debug")
					// parser reached end of the current sentence
					if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
						// add sentence to corpus if it passes filters
						sentence = runFilters(sentence, stats.getFilter());
						if (!ValidationUtil.isEmpty(sentence)) {
							corpus.add(new Sentence(sentence));
						}
						// and start a new one
						sentence = new ArrayList<>();
						/* Invoke Fork-Join when we reach maximum limit of
						 * sentences (because we can't read everything to
						 * memory) or we reach the end of the file.
						 */
						if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
							fj(corpus, stats);
							// empty the current corpus, since we don't need the data anymore
							corpus.clear();
							// TODO: if (stats.isUseDB()) {
							//     stats.storeTmpResultsToDB();
							// }
						}
					} else if (endElement.getName().getLocalPart().equals("teiHeader")) {
						// before proceeding to read this file, make sure that taxonomy filters are a match
						if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
							currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection
							if (currentFiletaxonomy.isEmpty()) {
								// taxonomies don't match so stop
								return false;
							}
						}
					}
					// fallback: flush the remainder when the document root closes
					else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
						fj(corpus, stats);
						corpus.clear();
						// TODO: if (stats.isUseDB()) {
						//     stats.storeTmpResultsToDB();
						// }
					}
					break;
			}
		}
	} catch (FileNotFoundException | XMLStreamException e) {
		e.printStackTrace();
	} finally {
		if (eventReader != null) {
			try {
				eventReader.close();
			} catch (XMLStreamException e) {
				logger.error("closing stream", e);
			}
		}
	}
	return true;
}
/**
 * Reads a GOS (spoken corpus) XML file. GOS carries two transcriptions per
 * utterance — orthographic ("orth") and normalized ("norm") divs — and only
 * sentences from the div matching the mode selected in the corpus settings
 * are collected. The file's taxonomy is intersected with the user's filter;
 * when they don't overlap, the rest of the file is read but not collected.
 *
 * @return always true (the taxonomy mismatch is handled via includeFile,
 *         unlike readXMLGigafida which returns false)
 */
@SuppressWarnings("Duplicates")
public static boolean readXMLGos(String path, StatisticsNew stats) {
	boolean inWord = false;
	// true while inside a <div type="orth"> block
	boolean inOrthDiv = false;
	// which transcription the user asked to analyze
	boolean computeForOrth = stats.getCorpus().isGosOrthMode();
	ArrayList<String> currentFiletaxonomy = new ArrayList<>();
	String lemma = "";
	String msd = "";
	List<Word> sentence = new ArrayList<>();
	List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it
	String sentenceDelimiter = "seg";
	// NOTE(review): initial value derived from the filter, but overwritten
	// below by every <div type="..."> encountered — confirm intent
	String gosType = stats.getFilter().hasMsd() ? "norm" : "orth"; // orth & norm
	XMLEventReader eventReader = null;
	// false when this file's taxonomy does not match the user's filter
	boolean includeFile = true;
	try {
		XMLInputFactory factory = XMLInputFactory.newInstance();
		eventReader = factory.createXMLEventReader(new FileInputStream(path));
		while (eventReader.hasNext()) {
			XMLEvent event = eventReader.nextEvent();
			switch (event.getEventType()) {
				case XMLStreamConstants.START_ELEMENT:
					StartElement startElement = event.asStartElement();
					String qName = startElement.getName().getLocalPart();
					if (qName.equals("div")) {
						// track whether we entered the orth or the norm div
						HashMap<String, String> atts = extractAttributes(startElement);
						if (atts.keySet().contains("type")) {
							inOrthDiv = atts.get("type").equals("orth");
						}
					}
					// "word" node
					if (qName.equals("w")) {
						// check that it's not a type
						HashMap<String, String> atts = extractAttributes(startElement);
						if (!atts.containsKey("type")) {
							inWord = true;
							if (atts.containsKey("msd")) {
								msd = atts.get("msd");
							}
							if (atts.containsKey("lemma")) {
								lemma = atts.get("lemma");
							}
						}
					}
					// taxonomy node
					else if (qName.equalsIgnoreCase("catRef")) {
						// there are some term nodes at the beginning that are of no interest to us
						// they differ by not having the attribute "ref", so test will equal null
						Attribute tax = startElement.getAttributeByName(QName.valueOf("target"));
						if (tax != null) {
							// keep only taxonomy properties
							currentFiletaxonomy.add(String.valueOf(tax.getValue()));
						}
					} else if (qName.equalsIgnoreCase("div")) {
						// NOTE(review): this branch also runs for every <div>
						// (in addition to the inOrthDiv update above) and would
						// NPE on a <div> without a type attribute — confirm
						gosType = String.valueOf(startElement.getAttributeByName(QName.valueOf("type")).getValue());
					}
					break;
				case XMLStreamConstants.CHARACTERS:
					// "word" node value
					if (inWord) {
						Characters characters = event.asCharacters();
						// norm div with msd available -> full annotation,
						// otherwise just the surface form
						if (gosType.equals("norm") && msd != null) {
							sentence.add(new Word(characters.getData(), lemma, msd));
						} else {
							sentence.add(new Word(characters.getData()));
						}
						inWord = false;
					}
					break;
				case XMLStreamConstants.END_ELEMENT:
					EndElement endElement = event.asEndElement();
					// parser reached end of the current sentence
					if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
						// add sentence to corpus if it passes filters; only
						// sentences from the div matching the orth/norm mode
						// are kept
						boolean saveSentence = computeForOrth == inOrthDiv;
						if (includeFile && saveSentence && !ValidationUtil.isEmpty(sentence)) {
							sentence = runFilters(sentence, stats.getFilter());
							corpus.add(new Sentence(sentence));
						}
						// and start a new one
						sentence = new ArrayList<>();
						/* Invoke Fork-Join when we reach maximum limit of
						 * sentences (because we can't read everything to
						 * memory) or we reach the end of the file.
						 */
						if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
							fj(corpus, stats);
							// empty the current corpus, since we don't need
							// the data anymore
							corpus.clear();
						}
					} else if (endElement.getName().getLocalPart().equals("teiHeader")) {
						// before proceeding to read this file, make sure that taxonomy filters are a match
						if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
							currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection
							// disregard this entry if taxonomies don't match
							includeFile = !currentFiletaxonomy.isEmpty();
							currentFiletaxonomy = new ArrayList<>();
						}
					}
					// backup: flush the remainder when the document root closes
					else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
						fj(corpus, stats);
						corpus.clear();
					}
					break;
			}
		}
	} catch (FileNotFoundException | XMLStreamException e) {
		e.printStackTrace();
	} finally {
		if (eventReader != null) {
			try {
				eventReader.close();
			} catch (XMLStreamException e) {
				logger.error("closing stream", e);
			} catch (Exception e) {
				logger.error("general error", e);
			}
		}
	}
	return true;
}
/**
 * Runs the sentence through some filters, so we don't do calculations when unnecessary.
 * Filters:
 * <ol>
 * <li><b>Ngrams:</b> omit sentences that are shorter than the ngram value (e.g. 3-gram of a single word sentence)</li>
 * <li><b>Letter ngrams:</b> omit words that are shorter than the specified string length (e.g. combinations of 3 letters when the word consists of only 2 letters)</li>
 * </ol>
 *
 * <p>Note: the input list is modified in place by filter 2.
 *
 * @return {@code null} when the sentence fails filter 1. (callers are
 *         expected to treat it like an empty sentence), otherwise the same
 *         sentence, possibly with some words removed (2.)
 */
private static List<Word> runFilters(List<Word> sentence, Filter filter) {
	if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
		// ngram level: if not 0 must be less than or equal to number of words in this sentence.
		if (filter.getNgramValue() > 0 && filter.getNgramValue() > sentence.size()) {
			return null;
		}
		// if we're calculating values for letters, omit words that are shorter than string length
		if (filter.getNgramValue() == 0) {
			sentence.removeIf(w -> (filter.getCalculateFor() == CalculateFor.WORD && w.getWord().length() < filter.getStringLength())
					|| (filter.getCalculateFor() == CalculateFor.LEMMA && w.getLemma().length() < filter.getStringLength()));
		}
	}
	return sentence;
}
/**
 * Copies every attribute of the given start element into a
 * local-name -> value map.
 */
private static HashMap<String, String> extractAttributes(StartElement se) {
	HashMap<String, String> attributeMap = new HashMap<>();
	// the StAX API returns a raw Iterator here, hence the cast
	for (Iterator it = se.getAttributes(); it.hasNext(); ) {
		Attribute attribute = (Attribute) it.next();
		attributeMap.put(attribute.getName().getLocalPart(), attribute.getValue());
	}
	return attributeMap;
}
}

View File

@ -0,0 +1,67 @@
package alg.inflectedJOS;
import java.util.List;
import java.util.concurrent.RecursiveAction;
import data.Sentence;
import data.Statistics;
/**
 * Fork-Join task that recursively splits the corpus into slices of fewer
 * than {@link #ACCEPTABLE_SIZE} sentences and runs the inflected-JOS
 * counting ({@code InflectedJOSCount.calculateForAll}) on each slice.
 */
public class ForkJoin extends RecursiveAction {
	private static final long serialVersionUID = -1260951004477299634L;
	private static final int ACCEPTABLE_SIZE = 1000;

	// immutable task state: the shared corpus plus this task's [start, end) slice
	private final List<Sentence> corpus;
	private final Statistics stats;
	private final int start;
	private final int end;

	/**
	 * Constructor for subproblems.
	 *
	 * @param start inclusive start index of this task's slice
	 * @param end   exclusive end index of this task's slice
	 */
	private ForkJoin(List<Sentence> corpus, int start, int end, Statistics stats) {
		this.corpus = corpus;
		this.start = start;
		this.end = end;
		this.stats = stats;
	}

	/**
	 * Default constructor for the initial problem: covers the whole corpus.
	 */
	public ForkJoin(List<Sentence> corpus, Statistics stats) {
		this(corpus, 0, corpus.size(), stats);
	}

	/** Sequential leaf computation on the [start, end) slice. */
	private void computeDirectly() {
		List<Sentence> subCorpus = corpus.subList(start, end);
		// restrict counting to the selected taxonomy when one is set
		InflectedJOSCount.calculateForAll(subCorpus, stats,
				stats.isTaxonomySet() ? stats.getInflectedJosTaxonomy() : null);
	}

	@Override
	protected void compute() {
		int subCorpusSize = end - start;
		if (subCorpusSize < ACCEPTABLE_SIZE) {
			computeDirectly();
		} else {
			int mid = start + subCorpusSize / 2;
			ForkJoin left = new ForkJoin(corpus, start, mid, stats);
			ForkJoin right = new ForkJoin(corpus, mid, end, stats);
			// invokeAll forks one half and computes the other on the current
			// thread — avoids the fork/fork/join/join pattern of the original,
			// which left this worker idle while both halves ran elsewhere
			invokeAll(left, right);
		}
	}
}

View File

@ -0,0 +1,170 @@
package alg.inflectedJOS;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import alg.Common;
import data.Sentence;
import data.Statistics;
import data.StatisticsNew;
import data.Word;
public class InflectedJOSCount {
    // NOTE(review): never populated in this class; kept for API compatibility - verify usages before removing
    public static HashMap<Integer, ArrayList<ArrayList<Integer>>> indices;

    /**
     * Counts partially masked MSD tags for all inflected words in the corpus (legacy Statistics API).
     * Each word's MSD is expanded into entries that keep the leading word-type character plus
     * exactly one attribute, with the remaining positions replaced by '-'.
     *
     * @param corpus   sentences to process
     * @param stats    accumulator whose result map is updated
     * @param taxonomy if non-null, only sentences whose taxonomy starts with this value are counted
     */
    static void calculateForAll(List<Sentence> corpus, Statistics stats, String taxonomy) {
        for (Sentence s : corpus) {
            // disregard sentences from a different taxonomy branch (null = no filtering)
            if (taxonomy != null && !s.getTaxonomy().startsWith(taxonomy)) {
                continue;
            }
            for (Word word : s.getWords()) {
                // skip if the current word is not inflected (no MSD data)
                String msd = word.getMsd();
                if (msd == null || msd.isEmpty()) {
                    continue;
                }
                // start from "X------" (word type + all attributes masked) and unmask one attribute at a time
                StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', msd.length() - 1));
                for (int i = 1; i < msd.length(); i++) {
                    entry.setCharAt(i, msd.charAt(i));
                    Common.updateMap(stats.result, entry.toString());
                    entry.setCharAt(i, '-');
                }
            }
        }
    }

    /**
     * Same MSD expansion as above, for the newer {@link StatisticsNew} accumulator.
     * The taxonomy parameter is currently unused here (no taxonomy filtering is applied).
     */
    public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats, String taxonomy) {
        for (Sentence s : corpus) {
            for (Word word : s.getWords()) {
                String msd = word.getMsd();
                // guard against missing MSD data - charAt(0) below would fail on an empty string
                // TODO: also check that the word is of a relevant type (create a set)
                if (msd == null || msd.isEmpty()) {
                    continue;
                }
                StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', msd.length() - 1));
                for (int i = 1; i < msd.length(); i++) {
                    entry.setCharAt(i, msd.charAt(i));
                    stats.updateResults(entry.toString());
                    entry.setCharAt(i, '-');
                }
            }
        }
    }
}

View File

@ -0,0 +1,131 @@
package alg.inflectedJOS;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Collectors;
import data.Enums.InflectedJosTypes;
import data.StatisticsNew;
import gui.ValidationUtil;
import util.Combinations;
// adapted from http://www.geeksforgeeks.org/print-all-possible-combinations-of-r-elements-in-a-given-array-of-size-n/
public class WordFormation {
    // NOTE(review): static mutable state makes this class single-threaded by design - confirm callers
    private static HashMap<String, Long> josTypeResult;
    // NOTE(review): tmpResults is never reset, so repeated calculateStatistics() calls accumulate - confirm intended
    private static Object[][] tmpResults;
    private static HashMap<Integer, HashSet<HashSet<Integer>>> indices;

    static {
        // precompute all index combinations for MSD lengths 4..8
        indices = new HashMap<>();
        for (int i = 4; i <= 8; i++) {
            indices.put(i, Combinations.generateIndices(i));
        }
    }

    /**
     * Aggregates inflected-JOS counts into masked-pattern statistics: keeps only inflected
     * word types, then for every result key generates all masked variants (each index
     * combination replaced by '.') and stores {pattern, count, share} rows on the stat object.
     */
    public static void calculateStatistics(StatisticsNew stat) {
        Map<String, AtomicLong> result = stat.getResult();
        // 1. filter - keep only inflected types
        result.keySet().removeIf(x -> !InflectedJosTypes.inflectedJosTypes.contains(x.charAt(0)));
        // 2. for each inflected type generate all possible subcombinations
        for (Character josChar : InflectedJosTypes.inflectedJosTypes) {
            josTypeResult = new HashMap<>();
            // keep only the results for a single word type
            Map<String, AtomicLong> singleTypeResults = result.entrySet().stream()
                    .filter(x -> x.getKey().charAt(0) == josChar)
                    .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
            if (ValidationUtil.isEmpty(singleTypeResults)) {
                continue;
            }
            for (Map.Entry<String, AtomicLong> e : singleTypeResults.entrySet()) {
                // every precomputed index combination for an MSD of this length yields one masked entry
                for (HashSet<Integer> indicesCombo : indices.get(e.getKey().length())) {
                    updateResults(mask(e.getKey(), indicesCombo), e.getValue().longValue());
                }
            }
            resultsMapToArray(singleTypeResults.values().stream().mapToLong(Number::longValue).sum());
        }
        stat.setResultCustom(tmpResults);
    }

    /**
     * Keeps the leading character and the characters at the given indices; replaces
     * every other position with '.'.
     */
    private static String mask(String word, HashSet<Integer> indicesCombo) {
        StringBuilder sb = new StringBuilder();
        sb.append(word.charAt(0));
        for (int i = 1; i < word.length(); i++) {
            sb.append(indicesCombo.contains(i) ? word.charAt(i) : ".");
        }
        return sb.toString();
    }

    /** Adds the occurrence count to the running total for the masked pattern. */
    private static void updateResults(String s, Long nOfOccurences) {
        // putIfAbsent returns null when the key was newly inserted
        Long previous = josTypeResult.putIfAbsent(s, nOfOccurences);
        if (previous != null) {
            josTypeResult.put(s, josTypeResult.get(s) + nOfOccurences);
        }
    }

    /**
     * Converts the per-type result map into rows of {pattern, count, share-of-total}
     * and appends them to the accumulated results array.
     */
    private static void resultsMapToArray(Long totalValue) {
        double total = totalValue * 1.0;
        Object[][] josTypeResultArray = new Object[josTypeResult.size()][3];
        int i = 0;
        for (Map.Entry<String, Long> e : josTypeResult.entrySet()) {
            josTypeResultArray[i][0] = e.getKey();
            josTypeResultArray[i][1] = e.getValue();
            josTypeResultArray[i][2] = e.getValue() / total;
            i++;
        }
        if (tmpResults == null) {
            tmpResults = josTypeResultArray;
        } else {
            // append the new rows to the previously accumulated ones
            Object[][] merged = new Object[tmpResults.length + josTypeResultArray.length][3];
            System.arraycopy(tmpResults, 0, merged, 0, tmpResults.length);
            System.arraycopy(josTypeResultArray, 0, merged, tmpResults.length, josTypeResultArray.length);
            tmpResults = merged;
        }
    }

    /** Debugging helper: dumps the accumulated result rows to stdout (unused in production paths). */
    private static void printArray() {
        for (int i = 0; i < tmpResults.length; i++) {
            for (int j = 0; j < tmpResults[i].length; j++) {
                System.out.print(tmpResults[i][j] + "\t");
            }
            System.out.println();
        }
    }
}

View File

@ -0,0 +1,62 @@
package alg.ngram;
import java.util.List;
import java.util.concurrent.RecursiveAction;
import data.Sentence;
import data.StatisticsNew;
/**
 * Fork/join task that splits the corpus into chunks and runs ngram candidate
 * generation ({@link Ngrams}) on each chunk in parallel.
 */
public class ForkJoin extends RecursiveAction {
    private static final long serialVersionUID = 5074814035083362355L;

    /** Largest slice of the corpus a single task processes without splitting further. */
    private static final int ACCEPTABLE_SIZE = 1000;

    private final List<Sentence> corpus;
    private final StatisticsNew stats;
    private final int start;
    private final int end;

    /**
     * Constructor for subproblems.
     */
    private ForkJoin(List<Sentence> corpus, int start, int end, StatisticsNew stats) {
        this.corpus = corpus;
        this.start = start;
        this.end = end;
        this.stats = stats;
    }

    /**
     * Default constructor for the initial problem: covers the whole corpus.
     */
    public ForkJoin(List<Sentence> corpus, StatisticsNew stats) {
        this(corpus, 0, corpus.size(), stats);
    }

    /** Processes this task's slice of the corpus sequentially. */
    private void computeDirectly() {
        List<Sentence> subCorpus = corpus.subList(start, end);
        Ngrams.calculateForAll(subCorpus, stats);
    }

    @Override
    protected void compute() {
        int subCorpusSize = end - start;
        if (subCorpusSize < ACCEPTABLE_SIZE) {
            computeDirectly();
        } else {
            // split in half; invokeAll forks one half and computes the other, then joins both
            int mid = start + subCorpusSize / 2;
            invokeAll(new ForkJoin(corpus, start, mid, stats),
                    new ForkJoin(corpus, mid, end, stats));
        }
    }
}

View File

@ -0,0 +1,204 @@
package alg.ngram;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import data.CalculateFor;
import data.Sentence;
import data.StatisticsNew;
import data.Word;
import gui.ValidationUtil;
public class Ngrams {
    public final static Logger logger = LogManager.getLogger(Ngrams.class);

    /**
     * Dispatches the corpus to the correct candidate generator based on the filter:
     * letter ngrams (ngram value 0), skipgrams (skip value set and positive),
     * otherwise plain word ngrams.
     */
    public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats) {
        if (stats.getFilter().getNgramValue() == 0) { // letter ngram
            generateNgramLetterCandidates(corpus, stats);
        } else if (!ValidationUtil.isEmpty(stats.getFilter().getSkipValue()) && stats.getFilter().getSkipValue() > 0) {
            generateSkipgramCandidates(corpus, stats);
        } else {
            generateNgramCandidates(corpus, stats);
        }
    }

    /**
     * Slides a window of ngram length over every sentence and counts each candidate
     * that passes the (optional) MSD regex filter.
     */
    public static void generateNgramCandidates(List<Sentence> corpus, StatisticsNew stats) {
        int ngram = stats.getFilter().getNgramValue();
        for (Sentence s : corpus) {
            // skip sentences shorter than the specified ngram length
            if (s.getWords().size() < ngram) {
                continue;
            }
            for (int i = 0; i <= s.getWords().size() - ngram; i++) {
                List<Word> ngramCandidate = s.getSublist(i, i + ngram);
                // if an msd regex is set and this candidate doesn't pass it, skip it
                if (stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd())) {
                    continue;
                }
                stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()));
            }
        }
    }

    /**
     * Checks whether an ngram candidate passes the specified regex filter
     * (one pattern per candidate position, matched against each word's MSD).
     */
    private static boolean passesRegex(List<Word> ngramCandidate, ArrayList<Pattern> regex) {
        if (ngramCandidate.size() != regex.size()) {
            logger.error("ngramCandidate.size() & msd.size() mismatch"); // should not occur anyway
            return false;
        }
        for (int i = 0; i < regex.size(); i++) {
            if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern())) {
                return false;
            }
        }
        return true;
    }

    /**
     * Joins the requested representation (lemma, word, MSD or word type) of each
     * candidate word into a single space-separated string.
     */
    private static String wordToString(List<Word> ngramCandidate, CalculateFor calculateFor) {
        ArrayList<String> candidate = new ArrayList<>(ngramCandidate.size());
        switch (calculateFor) {
            case LEMMA:
                candidate.addAll(ngramCandidate
                        .stream()
                        .map(Word::getLemma)
                        .collect(Collectors.toList()));
                break;
            case WORD:
                candidate.addAll(ngramCandidate
                        .stream()
                        .map(Word::getWord)
                        .collect(Collectors.toList()));
                break;
            case MORPHOSYNTACTIC_SPECS:
            case MORPHOSYNTACTIC_PROPERTY:
                candidate.addAll(ngramCandidate
                        .stream()
                        .map(Word::getMsd)
                        .collect(Collectors.toList()));
                break;
            case WORD_TYPE:
                // word type is the first character of the MSD tag
                candidate.addAll(ngramCandidate
                        .stream()
                        .map(w -> Character.toString(w.getMsd().charAt(0)))
                        .collect(Collectors.toList()));
                break;
        }
        return StringUtils.join(candidate, " ");
    }

    /**
     * Generates letter-ngram candidates (fixed-length substrings of each word) and updates results.
     *
     * @param corpus sentences to process
     * @param stats  accumulator receiving the candidates
     */
    private static void generateNgramLetterCandidates(List<Sentence> corpus, StatisticsNew stats) {
        for (Sentence s : corpus) {
            for (Word w : s.getWords()) {
                String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv());
                // skip this word if:
                // - it doesn't contain a proper version (missing lemma for example)
                // - an msd regex is given but this word's msd doesn't match it
                // - the requested substring length is larger than the word length
                if (ValidationUtil.isEmpty(word)
                        || stats.getFilter().hasMsd() && !w.getMsd().matches(stats.getFilter().getMsd().get(0).pattern())
                        || word.length() < stats.getFilter().getStringLength()) {
                    continue;
                }
                for (int i = 0; i < word.length() - stats.getFilter().getStringLength() + 1; i++) {
                    // TODO: locale-aware handling?
                    stats.updateResults(word.substring(i, i + stats.getFilter().getStringLength()));
                }
            }
        }
    }

    /**
     * Extracts and counts skipgram candidates: ngrams whose successive words may be
     * separated by up to {@code skip} skipped words.
     * <p>
     * Bugfix: the previous hand-unrolled implementation checked the wrong index against
     * the sentence length for 4- and 5-grams (and started the 5th index from the wrong
     * base), which could throw IndexOutOfBoundsException; this version bounds every index.
     */
    public static void generateSkipgramCandidates(List<Sentence> corpus, StatisticsNew stats) {
        int ngram = stats.getFilter().getNgramValue();
        int skip = stats.getFilter().getSkipValue();
        // the previous implementation produced candidates for 2- to 5-grams only; keep that lower bound
        if (ngram < 2) {
            return;
        }
        for (Sentence s : corpus) {
            List<Word> sentence = s.getWords();
            for (int i = 0; i <= sentence.size() - ngram; i++) {
                ArrayList<Word> candidate = new ArrayList<>();
                candidate.add(sentence.get(i));
                extendSkipgram(sentence, i, candidate, ngram, skip, stats);
            }
        }
    }

    /**
     * Recursively extends a partial skipgram candidate. The next word index ranges from
     * lastIndex + 1 to lastIndex + 1 + skip (bounded by the sentence length); complete
     * candidates are validated and counted.
     */
    private static void extendSkipgram(List<Word> sentence, int lastIndex, ArrayList<Word> candidate,
                                       int ngram, int skip, StatisticsNew stats) {
        if (candidate.size() == ngram) {
            validateAndCountSkipgramCandidate(candidate, stats);
            return;
        }
        for (int next = lastIndex + 1; next <= lastIndex + 1 + skip && next < sentence.size(); next++) {
            ArrayList<Word> extended = new ArrayList<>(candidate);
            extended.add(sentence.get(next));
            extendSkipgram(sentence, next, extended, ngram, skip, stats);
        }
    }

    /** Counts the candidate if no regex filter is set, or if it passes the filter. */
    private static void validateAndCountSkipgramCandidate(ArrayList<Word> skipgramCandidate, StatisticsNew stats) {
        if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd())) {
            stats.updateResults(wordToString(skipgramCandidate, stats.getFilter().getCalculateFor()));
        }
    }
}

View File

@ -0,0 +1,62 @@
package alg.word;
import java.util.List;
import java.util.concurrent.RecursiveAction;
import data.Sentence;
import data.StatisticsNew;
/**
 * Fork/join task that splits the corpus into chunks and runs word-level analysis
 * ({@link WordLevel}) on each chunk in parallel.
 */
public class ForkJoin extends RecursiveAction {
    private static final long serialVersionUID = 7711587510996456040L;

    /** Largest slice of the corpus a single task processes without splitting further. */
    private static final int ACCEPTABLE_SIZE = 1000;

    private final List<Sentence> corpus;
    private final StatisticsNew stats;
    private final int start;
    private final int end;

    /**
     * Constructor for subproblems.
     */
    private ForkJoin(List<Sentence> corpus, int start, int end, StatisticsNew stats) {
        this.corpus = corpus;
        this.start = start;
        this.end = end;
        this.stats = stats;
    }

    /**
     * Default constructor for the initial problem: covers the whole corpus.
     */
    public ForkJoin(List<Sentence> corpus, StatisticsNew stats) {
        this(corpus, 0, corpus.size(), stats);
    }

    /** Processes this task's slice of the corpus sequentially. */
    private void computeDirectly() {
        List<Sentence> subCorpus = corpus.subList(start, end);
        WordLevel.calculateForAll(subCorpus, stats);
    }

    @Override
    protected void compute() {
        int subCorpusSize = end - start;
        if (subCorpusSize < ACCEPTABLE_SIZE) {
            computeDirectly();
        } else {
            // split in half; invokeAll forks one half and computes the other, then joins both
            int mid = start + subCorpusSize / 2;
            invokeAll(new ForkJoin(corpus, start, mid, stats),
                    new ForkJoin(corpus, mid, end, stats));
        }
    }
}

View File

@ -0,0 +1,167 @@
package alg.word;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import alg.Common;
import data.CalculateFor;
import data.Sentence;
import data.Statistics;
import data.Word;
class WordCount {
    /**
     * Projects words to the string form requested by the statistics (lemma or word).
     * Returns an empty list for any other CalculateFor value.
     */
    private static List<String> extractForCf(List<Word> words, CalculateFor cf) {
        if (cf == CalculateFor.LEMMA) {
            return words.stream().map(Word::getLemma).collect(Collectors.toList());
        }
        if (cf == CalculateFor.WORD) {
            return words.stream().map(Word::getWord).collect(Collectors.toList());
        }
        return new ArrayList<>();
    }

    /** Words of a sentence whose MSD is present and starts with the configured JOS word type. */
    private static List<Word> filterByJosType(Sentence s, Statistics stats) {
        List<Word> filteredWords = new ArrayList<>();
        for (Word word : s.getWords()) {
            if (word.getMsd() != null && word.getMsd().charAt(0) == stats.getDistributionJosWordType()) {
                filteredWords.add(word);
            }
        }
        return filteredWords;
    }

    /** Counts every word/lemma in the corpus with no filtering. */
    private static void calculateNoFilter(List<Sentence> corpus, Statistics stats) {
        for (Sentence s : corpus) {
            for (String word : extractForCf(s.getWords(), stats.getCf())) {
                Common.updateMap(stats.result, word);
            }
        }
    }

    /** Counts fixed-length substrings of the CVV form of each word/lemma. */
    private static void calculateVCC(List<Sentence> corpus, Statistics stats) {
        for (Sentence s : corpus) {
            List<String> sentence = new ArrayList<>(s.getWords().size());
            if (stats.getCf() == CalculateFor.LEMMA) {
                sentence.addAll(s.getWords()
                        .stream()
                        .map(Word::getCVVLemma)
                        .collect(Collectors.toList()));
            } else if (stats.getCf() == CalculateFor.WORD) {
                sentence.addAll(s.getWords()
                        .stream()
                        .map(Word::getCVVWord)
                        .collect(Collectors.toList()));
            }
            for (String word : sentence) {
                // NOTE(review): strict '>' excludes words exactly substringLength long - confirm intended
                if (word.length() > stats.getSubstringLength()) {
                    for (int i = 0; i <= word.length() - stats.getSubstringLength(); i++) {
                        String substring = word.substring(i, i + stats.getSubstringLength());
                        Common.updateMap(stats.result, substring);
                    }
                }
            }
        }
    }

    /** Counts words/lemmas whose JOS word type (first MSD character) matches the filter. */
    private static void calculateForJosType(List<Sentence> corpus, Statistics stats) {
        for (Sentence s : corpus) {
            for (String word : extractForCf(filterByJosType(s, stats), stats.getCf())) {
                Common.updateMap(stats.result, word);
            }
        }
    }

    /** Counts words/lemmas matching both the taxonomy and the JOS word-type filter. */
    private static void calculateForTaxonomyAndJosType(List<Sentence> corpus, Statistics stats) {
        for (Sentence s : corpus) {
            if (!s.getTaxonomy().equalsIgnoreCase(stats.getDistributionTaxonomy())) {
                continue;
            }
            // filterByJosType also null-checks the MSD, matching calculateForJosType
            // (the previous inline filter could throw an NPE on words without MSD data)
            for (String word : extractForCf(filterByJosType(s, stats), stats.getCf())) {
                Common.updateMap(stats.result, word);
            }
        }
    }

    /** Counts words/lemmas of sentences whose taxonomy matches the distribution filter. */
    private static void calculateForTaxonomy(List<Sentence> corpus, Statistics stats) {
        for (Sentence s : corpus) {
            if (s.getTaxonomy().equalsIgnoreCase(stats.getDistributionTaxonomy())) {
                for (String word : extractForCf(s.getWords(), stats.getCf())) {
                    Common.updateMap(stats.result, word);
                }
            }
        }
    }

    /**
     * Entry point: dispatches to the specialized counter matching the configured filters.
     */
    static void calculateForAll(List<Sentence> corpus, Statistics stats) {
        boolean taxonomyIsSet = stats.isTaxonomySet();
        boolean josTypeIsSet = stats.isJOSTypeSet();
        if (taxonomyIsSet && josTypeIsSet) {
            calculateForTaxonomyAndJosType(corpus, stats);
        } else if (taxonomyIsSet) {
            calculateForTaxonomy(corpus, stats);
        } else if (josTypeIsSet) {
            calculateForJosType(corpus, stats);
        } else if (stats.isVcc()) {
            calculateVCC(corpus, stats);
        } else {
            calculateNoFilter(corpus, stats);
        }
    }
}

View File

@ -0,0 +1,112 @@
package alg.word;
import static data.Enums.WordLevelDefaultValues.*;
import java.util.HashSet;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import data.Enums.WordLevelDefaultValues;
import data.Enums.WordLevelType;
import data.Sentence;
import data.StatisticsNew;
import data.Word;
@SuppressWarnings("Duplicates")
public class WordLevel {
    private static HashSet<String> suffixes;
    private static int minSuffixLength;
    private static int maxSuffixLength;

    private static HashSet<String> prefixes;
    private static int minPrefixLength;
    private static int maxPrefixLength;

    static {
        suffixes = WordLevelDefaultValues.getSuffixes();
        calculateSuffixesLengths();
        prefixes = WordLevelDefaultValues.getPrefixes();
        calculatePrefixesLengths();
    }

    /** Runs prefix and suffix extraction for every word in the corpus. */
    public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats) {
        for (Sentence s : corpus) {
            for (Word word : s.getWords()) {
                calculateForSuffixes(word.getWord(), stats);
                calculateForPrefixes(word.getWord(), stats);
            }
        }
    }

    /**
     * Records the longest known prefix of the word, if any. Candidate lengths are tried
     * from longest to shortest; the first (longest) match wins.
     * Bugfix: the loop previously incremented instead of decremented, so prefixes shorter
     * than the maximum length were never tried.
     */
    private static void calculateForPrefixes(String word, StatisticsNew stats) {
        for (int tmpPrefixLength = maxPrefixLength; tmpPrefixLength >= minPrefixLength; tmpPrefixLength--) {
            // skip candidates that would leave too few characters of the word;
            // shorter candidates leave more characters, so keep trying them
            if (word.length() - tmpPrefixLength < MIN_N_OF_CHARACTERS_LEFT_PREFIX) {
                continue;
            }
            String extractedPrefix = StringUtils.left(word, tmpPrefixLength);
            if (prefixes.contains(extractedPrefix)) {
                // save the prefix together with the full word
                stats.updateResultsNested(WordLevelType.PREFIX, extractedPrefix, word);
                return;
            }
        }
    }

    /**
     * Records the longest known suffix of the word, if any. Candidate lengths are tried
     * from longest to shortest; the first (longest) match wins.
     * Bugfix: same loop-direction fix as in {@link #calculateForPrefixes}.
     */
    public static void calculateForSuffixes(String word, StatisticsNew stats) {
        for (int tmpSuffixLength = maxSuffixLength; tmpSuffixLength >= minSuffixLength; tmpSuffixLength--) {
            // check that enough of the word remains after cutting the suffix;
            // shorter candidates leave more characters, so keep trying them
            if (word.length() - tmpSuffixLength < MIN_N_OF_CHARACTERS_LEFT_SUFFIX) {
                continue;
            }
            String extractedSuffix = StringUtils.right(word, tmpSuffixLength);
            if (suffixes.contains(extractedSuffix)) {
                // save the suffix together with the full word
                stats.updateResultsNested(WordLevelType.SUFFIX, extractedSuffix, word);
                return;
            }
        }
    }

    // finds the shortest and longest suffix for quicker calculations
    public static void calculateSuffixesLengths() {
        minSuffixLength = -1;
        maxSuffixLength = -1;
        for (String suffix : suffixes) {
            if (suffix.length() > maxSuffixLength) {
                maxSuffixLength = suffix.length();
                if (minSuffixLength < 0) {
                    minSuffixLength = maxSuffixLength;
                }
            } else if (suffix.length() < minSuffixLength) {
                minSuffixLength = suffix.length();
            }
        }
    }

    // finds the shortest and longest prefix for quicker calculations
    public static void calculatePrefixesLengths() {
        minPrefixLength = -1;
        maxPrefixLength = -1;
        for (String prefix : prefixes) {
            if (prefix.length() > maxPrefixLength) {
                maxPrefixLength = prefix.length();
                if (minPrefixLength < 0) {
                    minPrefixLength = maxPrefixLength;
                }
            } else if (prefix.length() < minPrefixLength) {
                minPrefixLength = prefix.length();
            }
        }
    }
}

View File

@ -0,0 +1,17 @@
package data;
/** Analysis level selectable in the GUI; display names are Slovene. */
public enum AnalysisLevel {
    STRING_LEVEL("Besedni nizi"),
    WORD_LEVEL("Nivo besed in delov besed"),
    WORD_FORMATION("Besedotvorni procesi");

    private final String name;

    AnalysisLevel(String name) {
        this.name = name;
    }

    /** @return the human-readable (Slovene) name of this analysis level */
    @Override
    public String toString() {
        return this.name;
    }
}

View File

@ -0,0 +1,43 @@
package data;
/** What a statistic is calculated for; display names are Slovene and shown in the GUI. */
public enum CalculateFor {
    WORD("različnica"),
    LEMMA("lema"),
    MORPHOSYNTACTIC_SPECS("oblikoskladenjska oznaka"),
    MORPHOSYNTACTIC_PROPERTY("oblikoskladenjska lastnost"),
    WORD_TYPE("besedna vrsta"),
    DIST_WORDS("različnica"),
    DIST_LEMMAS("lema");

    private final String name;

    CalculateFor(String name) {
        this.name = name;
    }

    /** @return the human-readable (Slovene) name of this option */
    @Override
    public String toString() {
        return this.name;
    }

    /**
     * Resolves a display name back to its enum constant.
     * Iterates in declaration order, so the ambiguous display names shared with
     * DIST_WORDS/DIST_LEMMAS resolve to WORD/LEMMA (same result as the previous
     * hand-written if-chain).
     *
     * @param cf display name to resolve
     * @return the matching constant, or {@code null} when {@code cf} is null or unknown
     */
    public static CalculateFor factory(String cf) {
        if (cf == null) {
            return null;
        }
        for (CalculateFor candidate : values()) {
            if (candidate.toString().equals(cf)) {
                return candidate;
            }
        }
        return null;
    }
}

View File

@ -0,0 +1,163 @@
package data;
import static gui.Messages.*;
import java.io.File;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import data.Enums.solar.SolarFilters;
import gui.ValidationUtil;
import javafx.collections.ObservableList;
/**
 * Holds the user's corpus selection (type, locations, detected files) plus the
 * taxonomy / Solar filters read from the corpus headers, and validates the setup.
 */
public class Corpus {
    public final static Logger logger = LogManager.getLogger(Corpus.class);

    private CorpusType corpusType;
    private File chosenResultsLocation;
    private File chosenCorpusLocation;
    private Collection<File> detectedCorpusFiles;
    boolean headerRead;
    private ObservableList<String> taxonomy; // if gigafida or gos
    private HashMap<String, ObservableList<String>> solarFilters; // if solar
    private HashMap<String, HashSet<String>> solarFiltersForXML; // if solar - used while parsing xml
    private boolean gosOrthMode;
    boolean hasMsdData;
    private ArrayList<String> validationErrors;

    public Corpus() {
        validationErrors = new ArrayList<>();
    }

    public CorpusType getCorpusType() {
        return corpusType;
    }

    public void setCorpusType(CorpusType corpusType) {
        this.corpusType = corpusType;
        // "{}" placeholder is required for the argument to appear in the log message
        logger.info("Corpus.set: {}", corpusType);
    }

    public File getChosenResultsLocation() {
        return chosenResultsLocation;
    }

    public void setChosenResultsLocation(File chosenResultsLocation) {
        this.chosenResultsLocation = chosenResultsLocation;
        logger.info("Corpus.set: {}", chosenResultsLocation);
    }

    public File getChosenCorpusLocation() {
        return chosenCorpusLocation;
    }

    public void setChosenCorpusLocation(File chosenCorpusLocation) {
        this.chosenCorpusLocation = chosenCorpusLocation;
        logger.info("Corpus.set: {}", chosenCorpusLocation);
    }

    public Collection<File> getDetectedCorpusFiles() {
        return detectedCorpusFiles;
    }

    public void setDetectedCorpusFiles(Collection<File> detectedCorpusFiles) {
        this.detectedCorpusFiles = detectedCorpusFiles;
        logger.info("Corpus.set: {}", detectedCorpusFiles);
    }

    public boolean isHeaderRead() {
        return headerRead;
    }

    public void setHeaderRead(boolean headerRead) {
        this.headerRead = headerRead;
    }

    public ObservableList<String> getTaxonomy() {
        return taxonomy;
    }

    public void setTaxonomy(ObservableList<String> taxonomy) {
        this.taxonomy = taxonomy;
        logger.info("Corpus.set: {}", taxonomy);
    }

    public HashMap<String, ObservableList<String>> getSolarFilters() {
        return solarFilters;
    }

    public void setSolarFilters(HashMap<String, ObservableList<String>> solarFilters) {
        this.solarFilters = solarFilters;
        logger.info("Corpus.set: {}", solarFilters);
    }

    public HashMap<String, HashSet<String>> getSolarFiltersForXML() {
        return solarFiltersForXML;
    }

    public void setSolarFiltersForXML(HashMap<String, HashSet<String>> solarFiltersForXML) {
        this.solarFiltersForXML = solarFiltersForXML;
        logger.info("Corpus.set: {}", solarFiltersForXML);
    }

    public boolean isGosOrthMode() {
        return gosOrthMode;
    }

    public void setGosOrthMode(boolean gosOrthMode) {
        this.gosOrthMode = gosOrthMode;
        logger.info("Corpus.set: {}", gosOrthMode);
    }

    public ArrayList<String> getValidationErrors() {
        return validationErrors;
    }

    public String getValidationErrorsToString() {
        return StringUtils.join(validationErrors, "\n - ");
    }

    public void setValidationErrors(ArrayList<String> validationErrors) {
        this.validationErrors = validationErrors;
    }

    /**
     * Validates the current setup, collecting problems into validationErrors and
     * supplying default taxonomy / Solar filters when headers were not read.
     *
     * @return true when the corpus configuration is usable, false otherwise
     */
    public boolean validate() {
        if (corpusType == null) {
            validationErrors.add(LABEL_RESULTS_CORPUS_TYPE_NOT_SET);
        }
        if (chosenCorpusLocation == null) {
            validationErrors.add(LABEL_CORPUS_LOCATION_NOT_SET);
        }
        if (chosenResultsLocation == null) {
            validationErrors.add(LABEL_RESULTS_LOCATION_NOT_SET);
        }
        if (!headerRead && corpusType != null) {
            // if the user didn't opt into reading the headers, set default taxonomy or solar filters
            if (Tax.getCorpusTypesWithTaxonomy().contains(corpusType)) {
                taxonomy = Tax.getTaxonomyForComboBox(corpusType);
            } else if (corpusType == CorpusType.SOLAR && solarFilters == null) {
                setSolarFilters(SolarFilters.getFiltersForComboBoxes());
            }
        }
        if (headerRead && ValidationUtil.isEmpty(taxonomy)) {
            // mustn't happen, intercept at gui level
        }
        if (!ValidationUtil.isEmpty(validationErrors)) {
            logger.error("Corpus validation error: {}", getValidationErrorsToString());
            return false;
        }
        return true;
    }
}

View File

@ -0,0 +1,25 @@
package data;
/** Supported corpus types; carries the display name and its lower-case variant. */
public enum CorpusType {
    GIGAFIDA("Gigafida", "gigafida"),
    CCKRES("ccKres ", "cckres"),
    SOLAR("Šolar", "šolar"),
    GOS("GOS", "gos");

    private final String name;
    private final String nameLowerCase;

    CorpusType(String name, String nameLowerCase) {
        this.name = name;
        this.nameLowerCase = nameLowerCase;
    }

    /** @return the display name of this corpus type */
    @Override
    public String toString() {
        return this.name;
    }

    /** @return the lower-case identifier of this corpus type */
    public String getNameLowerCase() {
        return nameLowerCase;
    }
}

View File

@ -0,0 +1,12 @@
package data.Enums;
import java.util.Arrays;
import java.util.HashSet;
public class InflectedJosTypes {
    // JOS word-type codes counted as inflected: 'S' (noun), 'G' (verb), 'P' (adjective)
    public static final HashSet<Character> inflectedJosTypes =
            new HashSet<>(Arrays.asList('S', 'G', 'P'));
}

View File

@ -0,0 +1,68 @@
package data.Enums;
import java.util.HashMap;
/**
 * Morphosyntactic description (MSD) categories, each with its Slovene and
 * English name/code and the number of additional attributes the category
 * carries in an MSD string.
 */
public enum Msd {
    NOUN("samostalnik", 'S', "Noun", 'N', 5),
    VERB("glagol", 'G', "Verb", 'V', 7),
    ADJECTIVE("pridevnik", 'P', "Adjective", 'A', 6),
    ADVERB("prislov", 'R', "Adverb", 'R', 2),
    PRONOUN("zaimek", 'Z', "Pronoun", 'P', 8),
    NUMERAL("števnik", 'K', "Numeral", 'M', 6),
    PREPOSITION("predlog", 'D', "Preposition", 'S', 1),
    CONJUNCTION("veznik", 'V', "Conjunction", 'C', 1),
    PARTICLE("členek", 'L', "Particle", 'Q', 0),
    INTERJECTION("medmet", 'M', "Interjection", 'I', 0),
    ABBREVIATION("okrajšava", 'O', "Abbreviation", 'Y', 0),
    RESIDUAL("neuvrščeno", 'N', "Residual", 'X', 1);

    private final String siName;
    private final Character siCode;
    private final String enName;
    private final Character enCode;
    private final Integer nOfAttributes;

    // Lookup table: Slovene category code -> number of attributes for that category.
    private static final HashMap<Character, Integer> siCodeNOfAttributes = new HashMap<>();

    static {
        for (Msd msd : Msd.values()) {
            siCodeNOfAttributes.put(msd.getSiCode(), msd.nOfAttributes);
        }
    }

    Msd(String siName, Character siCode, String enName, Character enCode, int nOfAttributes) {
        this.siName = siName;
        this.siCode = siCode;
        this.enName = enName;
        this.enCode = enCode;
        this.nOfAttributes = nOfAttributes;
    }

    public String getSiName() {
        return siName;
    }

    public Character getSiCode() {
        return siCode;
    }

    public String getEnName() {
        return enName;
    }

    public Character getEnCode() {
        return enCode;
    }

    /**
     * Returns the total MSD length for the category identified by the first
     * character of {@code msd}: the category character itself plus its attributes.
     *
     * @param msd non-empty MSD string whose first character is a Slovene category code
     * @return number of attributes for the category, plus one for the category character
     * @throws NullPointerException if the first character is not a known Slovene code
     * @throws StringIndexOutOfBoundsException if {@code msd} is empty
     */
    public static int getMsdLengthForType(String msd) {
        return siCodeNOfAttributes.get(msd.charAt(0)) + 1;
    }
}

View File

@ -0,0 +1,55 @@
package data.Enums;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Set;
import java.util.stream.Collectors;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
/**
 * Default word-level analysis resources: suffix and prefix word lists loaded
 * once from newline-separated text files bundled on the classpath.
 */
public class WordLevelDefaultValues {
    public final static Logger logger = LogManager.getLogger(WordLevelDefaultValues.class);

    private static HashSet<String> suffixes;
    private static final String SUFFIXES_FILE = "/Lists/suffixes.txt";
    public static final int MIN_N_OF_CHARACTERS_LEFT_SUFFIX = 2;

    private static HashSet<String> prefixes;
    private static final String PREFIXES_FILE = "/Lists/prefixes.txt";
    public static final int MIN_N_OF_CHARACTERS_LEFT_PREFIX = 2;

    static {
        // readFromFile never returns null, so no placeholder pre-initialization is needed
        suffixes = readFromFile(SUFFIXES_FILE);
        prefixes = readFromFile(PREFIXES_FILE);
    }

    /**
     * Reads a newline-separated word list from the classpath.
     *
     * @param fileName absolute classpath resource path (leading '/')
     * @return set of lines; empty when the resource is missing or unreadable
     */
    private static HashSet<String> readFromFile(String fileName) {
        HashSet<String> dictionary = new HashSet<>();
        // Bug fix: the original used WordLevelDefaultValues.class.getClass(), which
        // resolves resources relative to java.lang.Class instead of this class.
        try (InputStream is = WordLevelDefaultValues.class.getResourceAsStream(fileName)) {
            if (is == null) {
                logger.warn("Resource not found: " + fileName);
            } else {
                // read as UTF-8 explicitly; the platform default charset is not reliable
                BufferedReader reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
                dictionary = reader.lines().collect(Collectors.toCollection(HashSet::new));
            }
        } catch (IOException e) {
            logger.error("Problem reading init dictionary", e);
        }
        return dictionary;
    }

    public static HashSet<String> getSuffixes() {
        return suffixes;
    }

    public static HashSet<String> getPrefixes() {
        return prefixes;
    }
}

View File

@ -0,0 +1,16 @@
package data.Enums;
/**
 * Word-level analysis categories, each carrying its Slovene display name.
 */
public enum WordLevelType {
    SUFFIX("pripona"),
    PREFIX("predpona");

    private final String name;

    WordLevelType(String displayName) {
        this.name = displayName;
    }

    /** @return the Slovene display name of this category */
    public String getName() {
        return this.name;
    }
}

View File

@ -0,0 +1,57 @@
package data.Enums.solar;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import javafx.collections.FXCollections;
import javafx.collections.ObservableList;
/**
 * Filter definitions for the Šolar corpus: the known filter names and the
 * full set of possible values for each.
 */
public class SolarFilters {
    // All known filters mapped to every possible value; populated once below.
    private static final HashMap<String, ObservableList<String>> SOLAR_FILTERS;

    public static final String SOLA = "sola";
    public static final String PREDMET = "predmet";
    public static final String RAZRED = "razred";
    public static final String REGIJA = "regija";
    public static final String TIP = "tip";
    public static final String LETO = "leto";

    static {
        SOLAR_FILTERS = new HashMap<>();
        SOLAR_FILTERS.put(REGIJA, FXCollections.observableArrayList("Celje", "Gorica", "Koper", "Kranj", "Krško", "Ljubljana", "Maribor", "Murska Sobota", "Novo mesto", "Postojna", "Slovenj Gradec"));
        SOLAR_FILTERS.put(PREDMET, FXCollections.observableArrayList("državljanska vzgoja in etika", "ekonomija", "filozofija", "geografija", "kemija", "podjetništvo", "psihologija", "slovenščina", "sociologija", "umetnostna vzgoja", "zgodovina"));
        SOLAR_FILTERS.put(RAZRED, FXCollections.observableArrayList("6. razred", "7. razred", "8. razred", "9. razred", "1. letnik", "2. letnik", "3. letnik", "4. letnik", "5. letnik", "maturitetni tečaj"));
        SOLAR_FILTERS.put(LETO, FXCollections.observableArrayList("2007", "2008", "2009", "2009/2010", "2010"));
        SOLAR_FILTERS.put(SOLA, FXCollections.observableArrayList("gimnazija", "osnovna šola", "poklicna šola", "strokovna šola"));
        SOLAR_FILTERS.put(TIP, FXCollections.observableArrayList("esej/spis", "pisni izdelek (učna ura)", "test (daljše besedilo)", "test (odgovori na vprašanja)"));
    }

    public static final ObservableList<String> N_GRAM_COMPUTE_FOR_FULL = FXCollections.observableArrayList("različnica", "lema", "oblikoskladenjska oznaka", "oblikoskladenjska lastnost", "besedna vrsta");
    public static final ObservableList<String> N_GRAM_COMPUTE_FOR_LIMITED = FXCollections.observableArrayList("različnica", "lema");

    /**
     * Returns filters with all possible values.
     */
    public static HashMap<String, ObservableList<String>> getFiltersForComboBoxes() {
        return SOLAR_FILTERS;
    }

    /**
     * Returns filters restricted to the values actually found in the corpus;
     * any known filter absent from {@code foundFilters} maps to an empty list.
     * (Doc fix: the original javadoc wrongly duplicated the description above.)
     */
    public static HashMap<String, ObservableList<String>> getFiltersForComboBoxes(HashMap<String, HashSet<String>> foundFilters) {
        HashMap<String, ObservableList<String>> filtersForComboBoxes = new HashMap<>();
        for (Map.Entry<String, ObservableList<String>> e : SOLAR_FILTERS.entrySet()) {
            if (!foundFilters.containsKey(e.getKey())) {
                // if, by some reason a specific filter wasn't in the corpus, return a blank list for that filter
                filtersForComboBoxes.put(e.getKey(), FXCollections.observableArrayList());
            } else {
                filtersForComboBoxes.put(e.getKey(), FXCollections.observableArrayList(foundFilters.get(e.getKey())).sorted());
            }
        }
        return filtersForComboBoxes;
    }
}

View File

@ -0,0 +1,144 @@
package data;
import static data.Filter.filterName.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.regex.Pattern;
import gui.ValidationUtil;
@SuppressWarnings("unchecked")
/**
 * Typed accessor facade over a heterogeneous map of analysis settings,
 * keyed by {@link filterName}. Each getter casts the stored value back
 * to its expected type (hence the class-wide unchecked suppression).
 */
public class Filter {
    private HashMap<filterName, Object> filter;

    /** Keys of all settings a Filter can hold. */
    public enum filterName {
        ANALYSIS_LEVEL,
        CALCULATE_FOR,
        NGRAM_VALUE,
        SKIP_VALUE,
        IS_CVV,
        STRING_LENGTH,
        TAXONOMY,
        MSD,
        HAS_MSD,
        SOLAR_FILTERS
    }

    public Filter() {
        filter = new HashMap<>();
    }

    public Filter(AnalysisLevel al, CalculateFor cf) {
        filter = new HashMap<>();
        filter.put(ANALYSIS_LEVEL, al);
        filter.put(CALCULATE_FOR, cf);
    }

    public void setAl(AnalysisLevel al) {
        filter.put(ANALYSIS_LEVEL, al);
    }

    public AnalysisLevel getAl() {
        return (AnalysisLevel) filter.get(ANALYSIS_LEVEL);
    }

    public void setCalculateFor(CalculateFor cf) {
        filter.put(CALCULATE_FOR, cf);
    }

    public CalculateFor getCalculateFor() {
        return (CalculateFor) filter.get(CALCULATE_FOR);
    }

    public void setNgramValue(Integer ngramValue) {
        filter.put(NGRAM_VALUE, ngramValue);
    }

    public Integer getNgramValue() {
        return (Integer) filter.get(NGRAM_VALUE);
    }

    public void setSkipValue(Integer skipValue) {
        filter.put(SKIP_VALUE, skipValue);
    }

    public Integer getSkipValue() {
        return (Integer) filter.get(SKIP_VALUE);
    }

    public void setIsCvv(boolean isCvv) {
        filter.put(IS_CVV, isCvv);
    }

    /** @return true only when the CVV flag was explicitly set to true */
    public boolean isCvv() {
        return filter.containsKey(IS_CVV) && (boolean) filter.get(IS_CVV);
    }

    public void setStringLength(int stringLength) {
        filter.put(STRING_LENGTH, stringLength);
    }

    public Integer getStringLength() {
        return (Integer) filter.get(STRING_LENGTH);
    }

    public void setTaxonomy(ArrayList<String> taxonomy) {
        filter.put(TAXONOMY, taxonomy);
    }

    /** @return the taxonomy list, or an empty list when unset (never null) */
    public ArrayList<String> getTaxonomy() {
        if (filter.containsKey(TAXONOMY) && filter.get(TAXONOMY) != null) {
            return (ArrayList<String>) filter.get(TAXONOMY);
        } else {
            return new ArrayList<>();
        }
    }

    /** Stores MSD patterns and keeps the HAS_MSD flag in sync. */
    public void setMsd(ArrayList<Pattern> msd) {
        filter.put(MSD, msd);
        // simplified from an if/else over the same condition
        setHasMsd(!ValidationUtil.isEmpty(msd));
    }

    public ArrayList<Pattern> getMsd() {
        return (ArrayList<Pattern>) filter.get(MSD);
    }

    public void setHasMsd(boolean hasMsd) {
        filter.put(HAS_MSD, hasMsd);
    }

    public boolean hasMsd() {
        return filter.containsKey(HAS_MSD) && (boolean) filter.get(HAS_MSD);
    }

    @Override
    public String toString() {
        String newLine = "\n\t- ";
        StringBuilder sb = new StringBuilder();
        sb.append(newLine).append("Filter:");
        for (Map.Entry<filterName, Object> entry : filter.entrySet()) {
            sb.append(newLine)
                    .append(entry.getKey().toString())
                    .append(": ")
                    .append(entry.getValue() != null ? entry.getValue().toString() : "null");
        }
        return sb.toString();
    }

    public void setSolarFilters(HashMap<String, HashSet<String>> filters) {
        filter.put(SOLAR_FILTERS, filters);
    }

    public HashMap<String, HashSet<String>> getSolarFilters() {
        return (HashMap<String, HashSet<String>>) filter.get(SOLAR_FILTERS);
    }
}

View File

@ -0,0 +1,71 @@
package data;
/**
 * JOS word types in the Gigafida corpus: Slovene display name plus the
 * single-character MSD word-type code.
 */
public enum GigafidaJosWordType {
    SAMOSTALNIK("samostalnik", 'S'),
    GLAGOL("glagol", 'G'),
    PRIDEVNIK("pridevnik", 'P'),
    PRISLOV("prislov", 'R'),
    ZAIMEK("zaimek", 'Z'),
    STEVNIK("stevnik", 'K'),
    PREDLOG("predlog", 'D'),
    VEZNIK("veznik", 'V'),
    CLENEK("clenek", 'L'),
    MEDMET("medmet", 'M'),
    OKRAJSAVA("okrajsava", 'O');

    private final String name;
    private final char wordType;

    GigafidaJosWordType(String name, char wordType) {
        this.name = name;
        this.wordType = wordType;
    }

    @Override
    public String toString() {
        return this.name;
    }

    public char getWordType() {
        return wordType;
    }

    /**
     * Resolves a display name to its constant.
     * (Replaces the original 11-branch if-chain with a loop over values().)
     *
     * @param wType display name, may be null
     * @return matching constant, or null when wType is null or unknown
     */
    public static GigafidaJosWordType factory(String wType) {
        if (wType != null) {
            for (GigafidaJosWordType t : values()) {
                if (t.toString().equals(wType)) {
                    return t;
                }
            }
        }
        return null;
    }
}

View File

@ -0,0 +1,76 @@
package data;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.stream.Collectors;
import javafx.collections.FXCollections;
import javafx.collections.ObservableList;
/**
 * Gigafida corpus taxonomy branches: display name plus the dotted taxonomy code.
 */
public enum GigafidaTaxonomy {
    TISK("tisk", "T"),
    KNJIZNO("knjižno", "T.K"),
    LEPOSLOVNO("leposlovno", "T.K.L"),
    STROKOVNO("strokovno", "T.K.S"),
    PERIODICNO("periodično", "T.P"),
    CASOPIS("časopis", "T.P.C"),
    REVIJA("revija", "T.P.R"),
    INTERNET("internet", "I");

    private final String name;
    private final String taxonomy;

    // Display names for GUI combo boxes, in declaration order.
    private static final ObservableList<String> FOR_COMBO_BOX;

    static {
        ArrayList<String> values = Arrays.stream(GigafidaTaxonomy.values()).map(x -> x.name).collect(Collectors.toCollection(ArrayList::new));
        FOR_COMBO_BOX = FXCollections.observableArrayList(values);
    }

    GigafidaTaxonomy(String name, String taxonomy) {
        this.name = name;
        this.taxonomy = taxonomy;
    }

    @Override
    public String toString() {
        return this.name;
    }

    public String getTaxonomnyString() {
        return this.taxonomy;
    }

    /**
     * Resolves a display name to its constant.
     * (Replaces the original per-constant if-chain with a loop over values().)
     *
     * @param tax display name, may be null
     * @return matching constant, or null when tax is null or unknown
     */
    public static GigafidaTaxonomy factory(String tax) {
        if (tax != null) {
            for (GigafidaTaxonomy t : values()) {
                if (t.toString().equals(tax)) {
                    return t;
                }
            }
        }
        return null;
    }

    public static ObservableList<String> getForComboBox() {
        return FOR_COMBO_BOX;
    }
}

View File

@ -0,0 +1,85 @@
package data;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.stream.Collectors;
import javafx.collections.FXCollections;
import javafx.collections.ObservableList;
/**
 * GOS corpus taxonomy branches: display name plus the dotted taxonomy code.
 */
public enum GosTaxonomy {
    JAVNI("javni", "gos.T.J"),
    INFORMATIVNO_IZOBRAZEVALNI("informativno-izobraževalni", "gos.T.J.I"),
    RAZVEDRILNI("razvedrilni", "gos.T.J.R"),
    NEJAVNI("nejavni", "gos.T.N"),
    NEZASEBNI("nezasebni", "gos.T.N.N"),
    ZASEBNI("zasebni", "gos.T.N.Z"),
    OSEBNI_STIK("osebni stik", "gos.K.O"),
    TELEFON("telefon", "gos.K.P"),
    RADIO("radio", "gos.K.R"),
    TELEVIZIJA("televizija", "gos.K.T");

    private final String name;
    private final String taxonomy;

    // Display names for GUI combo boxes, in declaration order.
    private static final ObservableList<String> FOR_COMBO_BOX;

    static {
        ArrayList<String> values = Arrays.stream(GosTaxonomy.values()).map(x -> x.name).collect(Collectors.toCollection(ArrayList::new));
        FOR_COMBO_BOX = FXCollections.observableArrayList(values);
    }

    GosTaxonomy(String name, String taxonomy) {
        this.name = name;
        this.taxonomy = taxonomy;
    }

    @Override
    public String toString() {
        return this.name;
    }

    public String getTaxonomnyString() {
        return this.taxonomy;
    }

    /**
     * Resolves a display name to its constant.
     * (Replaces the original per-constant if-chain with a loop over values().)
     *
     * @param tax display name, may be null
     * @return matching constant, or null when tax is null or unknown
     */
    public static GosTaxonomy factory(String tax) {
        if (tax != null) {
            for (GosTaxonomy t : values()) {
                if (t.toString().equals(tax)) {
                    return t;
                }
            }
        }
        return null;
    }

    public static ObservableList<String> getForComboBox() {
        return FOR_COMBO_BOX;
    }
}

View File

@ -0,0 +1,56 @@
package data;
import java.util.List;
import java.util.Map;
/**
 * A corpus sentence: its words, taxonomy label, and — for GOS — a type plus
 * arbitrary key/value properties.
 */
public class Sentence {
    private List<Word> words;
    private String taksonomija;

    // GOS
    private String type;
    private Map<String, String> properties; // NOTE(review): written by one constructor only; no getter in this class

    public Sentence(List<Word> words, String taksonomija) {
        this.words = words;
        this.taksonomija = taksonomija;
    }

    public Sentence(List<Word> words) {
        this(words, null);
    }

    public Sentence(List<Word> words, String taksonomija, Map<String, String> properties) {
        this(words, taksonomija);
        this.properties = properties;
    }

    public Sentence(List<Word> words, String taksonomija, String type) {
        this(words, taksonomija);
        this.type = type;
    }

    public List<Word> getWords() {
        return words;
    }

    public String getTaxonomy() {
        return taksonomija;
    }

    /** Returns a view of the words in [indexFrom, indexTo). */
    public List<Word> getSublist(int indexFrom, int indexTo) {
        return this.words.subList(indexFrom, indexTo);
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }
}

View File

@ -0,0 +1,16 @@
package data;
import java.io.File;
import java.util.Collection;
/**
 * Global application settings and shared run state.
 */
public class Settings {
    // max sentences processed per batch before flushing
    public static final int CORPUS_SENTENCE_LIMIT = 50000;
    public static final boolean PRINT_LOG = false;

    // JavaFX accent styles for valid / invalid input fields
    public static final String FX_ACCENT_OK = "-fx-accent: forestgreen;";
    public static final String FX_ACCENT_NOK = "-fx-accent: red;";

    // NOTE(review): shared mutable state set elsewhere at runtime
    public static Collection<File> corpus;
    public static File resultsFilePath;

    // utility holder — not meant to be instantiated
    private Settings() {
    }
}

View File

@ -0,0 +1,299 @@
package data;
import java.io.UnsupportedEncodingException;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Pattern;
import util.Util;
import util.db.RDB;
/**
 * Legacy statistics holder for one analysis run: configuration chosen via one
 * of several purpose-specific constructors, plus the aggregated result map.
 * Disk export is commented out here (superseded elsewhere — see saveResultToDisk).
 */
public class Statistics {
    private CorpusType corpusType;
    private AnalysisLevel analysisLevel;

    // optional database-backed buffering of results (see setUseDB)
    private boolean useDB;
    private RDB db;
    private boolean analysisProducedResults;

    private String taxonomy;
    private boolean taxonomyIsSet;

    private char JOSType;
    private boolean JOSTypeIsSet;

    private String resultTitle;
    // occurrence counts keyed by the analyzed string; concurrent for multi-threaded updates
    public Map<String, AtomicLong> result = new ConcurrentHashMap<>();

    // nGrams
    private int nGramLevel;
    private Integer skip;
    private CalculateFor cf;
    private List<Pattern> morphosyntacticFilter;

    // distributions
    private String distributionTaxonomy;
    private char distributionJosWordType;
    private boolean vcc;
    private Integer substringLength;

    // inflected JOS
    private String inflectedJosTaxonomy;

    // GOS
    boolean gosOrthMode;

    // šolar
    Map<String, Object> solarHeadBlockFilter;

    // for ngrams
    public Statistics(AnalysisLevel al, int nGramLevel, Integer skip, CalculateFor cf) {
        // NOTE(review): "hh.mm" is 12-hour without am/pm — titles from morning and
        // evening runs can collide; confirm whether "HH.mm" was intended
        String dateTime = LocalDateTime.now().format(DateTimeFormatter.ofPattern("dd.MM.yyyy_hh.mm"));
        this.cf = cf;
        this.analysisLevel = al;
        this.nGramLevel = nGramLevel;
        // a skip of 0 is normalized to null ("no skip")
        this.skip = skip == null || skip == 0 ? null : skip;
        this.resultTitle = String.format("%s%d-gram_%s_%s",
                this.skip != null ? String.format("%d-%s-", skip, "skip") : "",
                nGramLevel,
                cf.toString(),
                dateTime);
    }

    // for words distributions
    public Statistics(AnalysisLevel al, Taxonomy distributionTaxonomy, GigafidaJosWordType distributionJosWordType, CalculateFor cf) {
        String dateTime = LocalDateTime.now().format(DateTimeFormatter.ofPattern("dd.MM.yyyy_hh.mm"));
        this.resultTitle = String.format("%s_%s_%s",
                distributionTaxonomy != null ? distributionTaxonomy.toString() : "",
                distributionJosWordType != null ? distributionJosWordType.toString() : "",
                dateTime);
        this.analysisLevel = al;
        this.cf = cf;
        this.distributionTaxonomy = distributionTaxonomy != null ? distributionTaxonomy.getTaxonomnyString() : null;
        this.taxonomyIsSet = distributionTaxonomy != null;
        this.JOSTypeIsSet = distributionJosWordType != null;
        // ' ' acts as the "unset" sentinel for the word-type character
        this.distributionJosWordType = this.JOSTypeIsSet ? distributionJosWordType.getWordType() : ' ';
    }

    // for vowel/consonant (CVV) sequence distributions
    public Statistics(AnalysisLevel al, CalculateFor cf, Integer substringLength) {
        String dateTime = LocalDateTime.now().format(DateTimeFormatter.ofPattern("dd.MM.yyyy_hh.mm"));
        this.resultTitle = String.format("%s_%d_%s",
                "Distribucija zaporedij samoglasnikov in soglasnikov",
                substringLength,
                dateTime);
        this.analysisLevel = al;
        this.cf = cf;
        this.substringLength = substringLength;
        this.vcc = true;
    }

    // for inflected JOS analyses
    public Statistics(AnalysisLevel al, Taxonomy inflectedJosTaxonomy) {
        String dateTime = LocalDateTime.now().format(DateTimeFormatter.ofPattern("dd.MM.yyyy_hh.mm"));
        // NOTE(review): distributionTaxonomy is never assigned before this point,
        // so the title's taxonomy segment is always "" — presumably
        // inflectedJosTaxonomy was intended; confirm before changing
        this.resultTitle = String.format("InflectedJOS_%s_%s",
                distributionTaxonomy != null ? distributionTaxonomy : "",
                dateTime);
        this.analysisLevel = al;
        this.inflectedJosTaxonomy = inflectedJosTaxonomy != null ? inflectedJosTaxonomy.getTaxonomnyString() : null;
        this.taxonomyIsSet = inflectedJosTaxonomy != null;
    }

    /** @return skip distance, or null when running without skip-grams */
    public Integer getSkip() {
        return skip;
    }

    public Integer getSubstringLength() {
        return substringLength;
    }

    public String getInflectedJosTaxonomy() {
        return inflectedJosTaxonomy;
    }

    public void setSubstringLength(Integer substringLength) {
        this.substringLength = substringLength;
    }

    public boolean isVcc() {
        return vcc;
    }

    public void setVcc(boolean vcc) {
        this.vcc = vcc;
    }

    public String getDistributionTaxonomy() {
        return distributionTaxonomy;
    }

    public void setDistributionTaxonomy(String distributionTaxonomy) {
        this.distributionTaxonomy = distributionTaxonomy;
    }

    public char getDistributionJosWordType() {
        return distributionJosWordType;
    }

    public void setDistributionJosWordType(char distributionJosWordType) {
        this.distributionJosWordType = distributionJosWordType;
    }

    /**
     * Compiles the given filter strings into regex patterns,
     * translating '*' wildcards into '.' (match any single character).
     */
    public void setMorphosyntacticFilter(List<String> morphosyntacticFilter) {
        // change filter strings to regex patterns
        this.morphosyntacticFilter = new ArrayList<>();
        for (String s : morphosyntacticFilter) {
            this.morphosyntacticFilter.add(Pattern.compile(s.replaceAll("\\*", ".")));
        }
    }

    public List<Pattern> getMsd() {
        return morphosyntacticFilter;
    }

    public Map<String, AtomicLong> getResult() {
        return result;
    }

    public void setTaxonomy(String taxonomy) {
        this.taxonomy = taxonomy;
    }

    public void setTaxonomyIsSet(boolean taxonomyIsSet) {
        this.taxonomyIsSet = taxonomyIsSet;
    }

    public char getJOSType() {
        return JOSType;
    }

    public void setJOSType(char JOSType) {
        this.JOSType = JOSType;
    }

    public boolean isJOSTypeSet() {
        return JOSTypeIsSet;
    }

    // NOTE(review): this overload sets the JOSTypeIsSet *flag*, not the type —
    // a less surprising name would be setJOSTypeIsSet (kept for compatibility)
    public void setJOSType(boolean JOSTypeIsSet) {
        this.JOSTypeIsSet = JOSTypeIsSet;
    }

    // Disk export is disabled here; the commented code shows the superseded flow
    // (dump from DB if used, mark whether results exist, export sorted CSV).
    public void saveResultToDisk(int... limit) throws UnsupportedEncodingException {
        // Set<Pair<String, Map<String, Long>>> stats = new HashSet<>();
        //
        // if (useDB) {
        // result = db.getDump();
        // db.delete();
        // }
        //
        // // if no results and nothing to save, return false
        // if (!(result.size() > 0)) {
        // analysisProducedResults = false;
        // return;
        // } else {
        // analysisProducedResults = true;
        // }
        //
        // stats.add(ImmutablePair.of(resultTitle, getSortedResult(result, Util.getValidInt(limit))));
        // Export.SetToCSV(stats);
    }

    // private Map<String, Integer> getSortedResultInflected(Map map) {
    // // first convert to <String, Integer>
    // Map<String, Integer> m = Util.sortByValue(Util.atomicInt2StringAndInt(map), 0);
    //
    // Map<String, Integer> sortedM = new TreeMap<>();
    //
    // sortedM.putAll(m);
    //
    // return sortedM;
    // }

    // Converts AtomicLong counts to plain longs and sorts by value, keeping at most `limit` entries.
    private Map<String, Long> getSortedResult(Map<String, AtomicLong> map, int limit) {
        return Util.sortByValue(Util.atomicInt2StringAndInt(map), limit);
    }

    public String getTaxonomy() {
        return taxonomy;
    }

    public boolean isTaxonomySet() {
        return taxonomyIsSet;
    }

    public int getnGramLevel() {
        return nGramLevel;
    }

    public CalculateFor getCf() {
        return cf;
    }

    public AnalysisLevel getAnalysisLevel() {
        return analysisLevel;
    }

    public CorpusType getCorpusType() {
        return corpusType;
    }

    public void setCorpusType(CorpusType corpusType) {
        this.corpusType = corpusType;
    }

    public boolean isGosOrthMode() {
        return gosOrthMode;
    }

    public void setGosOrthMode(boolean gosOrthMode) {
        this.gosOrthMode = gosOrthMode;
    }

    public Map<String, Object> getSolarHeadBlockFilter() {
        return solarHeadBlockFilter;
    }

    public void setSolarHeadBlockFilter(Map<String, Object> solarHeadBlockFilter) {
        this.solarHeadBlockFilter = solarHeadBlockFilter;
    }

    public boolean isUseDB() {
        return useDB;
    }

    // Lazily opens the DB the first time DB-backed mode is enabled.
    public void setUseDB(boolean useDB) {
        if (useDB && db == null) {
            db = new RDB();
        }
        this.useDB = useDB;
    }

    /**
     * Stores results from this batch to a database and clears results map
     */
    public void storeTmpResultsToDB() {
        try {
            db.writeBatch(result);
            result = new ConcurrentHashMap<>();
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
    }

    public boolean isAnalysisProducedResults() {
        return analysisProducedResults;
    }
}

View File

@ -0,0 +1,409 @@
package data;
import static gui.ValidationUtil.*;
import java.io.UnsupportedEncodingException;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import alg.inflectedJOS.WordFormation;
import data.Enums.WordLevelType;
import javafx.collections.ObservableList;
import util.Export;
import util.Util;
import util.db.RDB;
@SuppressWarnings("Duplicates")
/**
 * Aggregates configuration (corpus + filter) and results for one analysis run
 * and exports them to CSV, optionally buffering intermediate counts in a
 * database for very large runs.
 */
public class StatisticsNew {
    public final static Logger logger = LogManager.getLogger(StatisticsNew.class);

    private Corpus corpus;
    private Filter filter;

    private String resultTitle;
    private Map<String, AtomicLong> result;
    private Object[][] resultCustom; // for when calculating percentages that don't add up to 100%
    private Map<String, ConcurrentHashMap<String, AtomicLong>> resultNestedSuffix;
    private Map<String, ConcurrentHashMap<String, AtomicLong>> resultNestedPrefix;
    private boolean useDB;
    private RDB db;
    private boolean analysisProducedResults;
    private LocalDateTime time;

    public StatisticsNew(Corpus corpus, Filter filter, boolean useDB) {
        this.corpus = corpus;
        this.filter = filter;

        if (useDB) {
            this.useDB = true;
            db = new RDB();
        }

        // word-level analyses collect nested (key -> value -> count) maps;
        // every other analysis level uses the flat result map
        if (filter.getAl() == AnalysisLevel.WORD_LEVEL) {
            resultNestedSuffix = new ConcurrentHashMap<>();
            resultNestedPrefix = new ConcurrentHashMap<>();
        } else {
            result = new ConcurrentHashMap<>();
        }

        resultTitle = generateResultTitle();
        logger.debug(toString());
    }

    /**
     * Builds the title under which results are saved: analysis kind, corpus
     * type, (for string-level) calculate-for / n-gram / skip segments, and a
     * timestamp. Also fixes {@code time} on first call so the header block
     * reports the same timestamp.
     */
    private String generateResultTitle() {
        String separator = "_";
        StringBuilder sb = new StringBuilder();

        if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
            // NOTE(review): getNgramValue() may be null here, which would NPE on
            // unboxing — assumes the GUI always sets it for string-level; confirm
            Integer ngramLevel = filter.getNgramValue();
            if (ngramLevel == 0) {
                sb.append("Crke")
                        .append(separator)
                        .append(corpus.getCorpusType().toString())
                        .append(separator);
            } else if (ngramLevel == 1) {
                sb.append("Besede")
                        .append(separator)
                        .append(corpus.getCorpusType().toString())
                        .append(separator);
            } else {
                sb.append(filter.getAl().toString())
                        .append(separator)
                        .append(corpus.getCorpusType().toString())
                        .append(separator);
                sb.append(filter.getCalculateFor().toString())
                        .append(separator);
                // ngram value
                sb.append(filter.getNgramValue()).append("-gram")
                        .append(separator);
                sb.append(filter.getSkipValue()).append("-preskok")
                        .append(separator);
            }
            // TODO: assure skip is not null but zero
        } else {
            sb.append(filter.getAl().toString()) // analysis level
                    .append(separator)
                    .append(corpus.getCorpusType().toString())
                    .append(separator);
        }
        this.time = this.time != null ? this.time : LocalDateTime.now();
        sb.append(time.format(DateTimeFormatter.ofPattern("dd.MM.yyyy_hh.mm.ss")));
        return sb.toString();
    }

    public boolean isAnalysisProducedResults() {
        return analysisProducedResults;
    }

    public void setAnalysisProducedResults(boolean analysisProducedResults) {
        this.analysisProducedResults = analysisProducedResults;
    }

    @Override
    public String toString() {
        String newLine = "\n\t- ";
        StringBuilder sb = new StringBuilder();
        sb.append(newLine).append("Statistic properties:");
        sb.append(newLine).append(corpus.getCorpusType().toString()).append(String.format(" (%d files)", corpus.getDetectedCorpusFiles().size()));
        sb.append(newLine).append(useDB ? "use DB" : "run in memory");
        sb.append(newLine).append(filter.toString());
        return sb.toString();
    }

    public String getResultTitle() {
        return resultTitle;
    }

    // ****************************************
    // ***************** util *****************
    // ****************************************

    /**
     * Stores results from this batch to a database and clears results map
     */
    public void storeTmpResultsToDB() {
        try {
            db.writeBatch(result);
            result = new ConcurrentHashMap<>();
        } catch (UnsupportedEncodingException e) {
            logger.error("Store tmp results to DB", e);
            // e.printStackTrace();
        }
    }

    public Filter getFilter() {
        return filter;
    }

    public Corpus getCorpus() {
        return corpus;
    }

    /**
     * Exports the flat result map to CSV (dumping from the DB first when used).
     *
     * @param limit optional cap on the number of exported entries
     * @return false when there were no results to save, true otherwise
     */
    public boolean saveResultToDisk(int... limit) throws UnsupportedEncodingException {
        Set<Pair<String, Map<String, Long>>> stats = new HashSet<>();

        if (useDB) {
            result = db.getDump();
            db.delete();
        }

        // if no results and nothing to save, return false
        if (!(result.size() > 0)) {
            analysisProducedResults = false;
            return false;
        } else {
            analysisProducedResults = true;
        }

        stats.add(ImmutablePair.of(resultTitle, getSortedResult(result, Util.getValidInt(limit))));
        Export.SetToCSV(stats, corpus.getChosenResultsLocation(), headerInfoBlock());
        return true;
    }

    /**
     * Exports nested (word-level) suffix/prefix results to CSV.
     *
     * @param limit optional cap per nested map
     * @return false when there were no results to save, true otherwise
     */
    public boolean saveResultNestedToDisk(int... limit) throws UnsupportedEncodingException {
        resultTitle = generateResultTitle();

        if (useDB) {
            result = db.getDump();
            db.delete();
        }

        Map<WordLevelType, Map<String, Map<String, Long>>> results = new HashMap<>();

        if (!isEmpty(resultNestedSuffix)) {
            results.put(WordLevelType.SUFFIX, sortNestedMap(resultNestedSuffix, Util.getValidInt(limit)));
        }

        if (!isEmpty(resultNestedPrefix)) {
            results.put(WordLevelType.PREFIX, sortNestedMap(resultNestedPrefix, Util.getValidInt(limit)));
        }

        // if no results and nothing to save, return false
        if (!(results.size() > 0)) {
            analysisProducedResults = false;
            return false;
        } else {
            analysisProducedResults = true;
        }

        Export.nestedMapToCSV(resultTitle, results, corpus.getChosenResultsLocation(), headerInfoBlock());
        return true;
    }

    /**
     * Switches to word-formation level, recalculates derived statistics and
     * exports the custom result table to CSV.
     *
     * @return false when there were no results to save, true otherwise
     */
    public boolean recalculateAndSaveResultToDisk() throws UnsupportedEncodingException {
        filter.setAl(AnalysisLevel.WORD_FORMATION);
        resultTitle = generateResultTitle();

        if (useDB) {
            result = db.getDump();
            db.delete();
        }

        // if no results and nothing to save, return false
        if (!(result.size() > 0)) {
            analysisProducedResults = false;
            return false;
        } else {
            analysisProducedResults = true;
        }

        WordFormation.calculateStatistics(this);
        Export.SetToCSV(resultTitle, resultCustom, corpus.getChosenResultsLocation(), headerInfoBlock());
        return true;
    }

    // Sorts each inner counts map of a nested result by value, capped at `limit` entries.
    private Map<String, Map<String, Long>> sortNestedMap(Map<String, ConcurrentHashMap<String, AtomicLong>> nestedMap, int limit) {
        Map<String, Map<String, Long>> sorted = new HashMap<>();
        for (String s : nestedMap.keySet()) {
            sorted.put(s, getSortedResult(nestedMap.get(s), Util.getValidInt(limit)));
        }
        return sorted;
    }

    // Converts AtomicLong counts to plain longs and sorts by value, keeping at most `limit` entries.
    private Map<String, Long> getSortedResult(Map<String, AtomicLong> map, int limit) {
        return Util.sortByValue(Util.atomicInt2StringAndInt(map), limit);
    }

    /** Thread-safe increment of the flat result count for {@code o}. */
    public void updateResults(String o) {
        // if not in map
        AtomicLong r = result.putIfAbsent(o, new AtomicLong(1));

        // else
        if (r != null)
            result.get(o).incrementAndGet();
    }

    public Map<String, AtomicLong> getResult() {
        return result;
    }

    public Object[][] getResultCustom() {
        return resultCustom;
    }

    public void setResultCustom(Object[][] resultCustom) {
        this.resultCustom = resultCustom;
    }

    /** Dispatches a nested update to the suffix or prefix result map. */
    public void updateResultsNested(WordLevelType type, String key, String stringValue) {
        // (removed an unused local map declaration from the original)
        if (type == WordLevelType.SUFFIX) {
            updateResultsNestedSuffix(key, stringValue);
        } else if (type == WordLevelType.PREFIX) {
            updateResultsNestedPrefix(key, stringValue);
        }
    }

    public void updateResultsNestedSuffix(String key, String stringValue) {
        updateNested(resultNestedSuffix, key, stringValue);
    }

    public void updateResultsNestedPrefix(String key, String stringValue) {
        updateNested(resultNestedPrefix, key, stringValue);
    }

    /**
     * Thread-safe increment of nestedMap[key][stringValue], creating missing
     * entries as needed. Replaces two near-identical copies of this logic in
     * the suffix/prefix update methods.
     */
    private static void updateNested(Map<String, ConcurrentHashMap<String, AtomicLong>> nestedMap, String key, String stringValue) {
        ConcurrentHashMap<String, AtomicLong> inner = nestedMap.computeIfAbsent(key, k -> new ConcurrentHashMap<>());
        AtomicLong count = inner.putIfAbsent(stringValue, new AtomicLong(1));
        if (count != null) {
            count.incrementAndGet();
        }
    }

    /**
     * Builds the ordered key/value header block written at the top of every
     * exported CSV (corpus, date, analysis kind, filter settings, taxonomy,
     * Šolar filters).
     */
    private LinkedHashMap<String, String> headerInfoBlock() {
        LinkedHashMap<String, String> info = new LinkedHashMap<>();

        info.put("Korpus:", corpus.getCorpusType().toString());
        info.put("Datum:", time.format(DateTimeFormatter.ofPattern("dd.MM.yyyy hh:mm")));

        if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
            Integer ngramLevel = filter.getNgramValue();
            if (ngramLevel == 0) {
                info.put("Analiza:", "Črke");
            } else if (ngramLevel == 1) {
                // bug fix: key was "Analiza" (missing colon), inconsistent with every other branch
                info.put("Analiza:", "Besede");
            } else {
                info.put("Analiza:", filter.getAl().toString());
            }
        } else {
            info.put("Analiza:", filter.getAl().toString());
        }

        if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
            Integer ngramLevel = filter.getNgramValue();

            // n-gram level
            if (ngramLevel > 1) {
                info.put("n-gram nivo:", String.valueOf(ngramLevel));
            } else if (ngramLevel == 1) {
                info.put("n-gram nivo:", "nivo besed");
            } else {
                info.put("n-gram nivo:", "nivo črk");
            }

            // skip
            if (ngramLevel > 1)
                info.put("Skip:", isNotEmpty(filter.getSkipValue()) ? filter.getSkipValue().toString() : "0");

            // calculate for
            info.put("Izračunaj za:", filter.getCalculateFor().toString());

            // msd
            if (!isEmpty(filter.getMsd())) {
                StringBuilder msdPattern = new StringBuilder();
                for (Pattern pattern : filter.getMsd()) {
                    msdPattern.append(pattern.toString()).append(" ");
                }
                info.put("MSD:", msdPattern.toString());
            }

            // taxonomy
            if (!isEmpty(filter.getTaxonomy())) {
                info.put("Taksonomija:", StringUtils.join(filter.getTaxonomy(), ", "));
            }
        }

        if (isNotEmpty(filter.getTaxonomy()) && Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
            ArrayList<String> tax = Tax.getTaxonomyForInfo(corpus.getCorpusType(), filter.getTaxonomy());

            info.put("Taksonomija: ", "");
            String sep = "";
            for (String s : tax) {
                // keys in a map must be unique, so pad each row's key with one more space
                info.put(sep = sep + " ", s);
            }
        }

        if (corpus.getCorpusType() == CorpusType.SOLAR) {
            HashMap<String, ObservableList<String>> filters = corpus.getSolarFilters();

            if (!isEmpty(filters)) {
                info.put("Dodatni filtri: ", "");

                for (Map.Entry<String, ObservableList<String>> f : filters.entrySet()) {
                    info.put(f.getKey(), StringUtils.join(f.getValue(), ", "));
                }
            }
        }

        return info;
    }
}

175
src/main/java/data/Tax.java Normal file
View File

@ -0,0 +1,175 @@
package data;
import java.util.*;
import java.util.stream.Collectors;
import gui.ValidationUtil;
import javafx.collections.FXCollections;
import javafx.collections.ObservableList;
public class Tax {
private static LinkedHashMap<String, String> GIGAFIDA_TAXONOMY;
private static LinkedHashMap<String, String> GOS_TAXONOMY;
private static final HashSet<CorpusType> corpusTypesWithTaxonomy = new HashSet<>(Arrays.asList(CorpusType.GIGAFIDA, CorpusType.GOS, CorpusType.CCKRES));
static {
// GIGAFIDA ----------------------------
GIGAFIDA_TAXONOMY = new LinkedHashMap<>();
GIGAFIDA_TAXONOMY.put("SSJ.T", "tisk");
GIGAFIDA_TAXONOMY.put("SSJ.T.K", "tisk-knjižno");
GIGAFIDA_TAXONOMY.put("SSJ.T.K.L", "tisk-knjižno-leposlovno");
GIGAFIDA_TAXONOMY.put("SSJ.T.K.S", "tisk-knjižno-strokovno");
GIGAFIDA_TAXONOMY.put("SSJ.T.P", "tisk-periodično");
GIGAFIDA_TAXONOMY.put("SSJ.T.P.C", "tisk-periodično-časopis");
GIGAFIDA_TAXONOMY.put("SSJ.T.P.R", "tisk-periodično-revija");
GIGAFIDA_TAXONOMY.put("SSJ.T.D", "tisk-drugo");
GIGAFIDA_TAXONOMY.put("SSJ.I", "internet");
GIGAFIDA_TAXONOMY.put("Ft.P", "prenosnik");
GIGAFIDA_TAXONOMY.put("Ft.P.G", "prenosnik-govorni");
GIGAFIDA_TAXONOMY.put("Ft.P.E", "prenosnik-elektronski");
GIGAFIDA_TAXONOMY.put("Ft.P.P", "prenosnik-pisni");
GIGAFIDA_TAXONOMY.put("Ft.P.P.O", "prenosnik-pisni-objavljeno");
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.K", "prenosnik-pisni-objavljeno-knjižno");
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P", "prenosnik-pisni-objavljeno-periodično");
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C", "prenosnik-pisni-objavljeno-periodično-časopisno");
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C.D", "prenosnik-pisni-objavljeno-periodično-časopisno-dnevno");
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C.V", "prenosnik-pisni-objavljeno-periodično-časopisno-večkrat tedensko");
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C.T", "prenosnik-pisni-objavljeno-periodično-časopisno-tedensko");
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R", "prenosnik-pisni-objavljeno-periodično-revialno");
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.T", "prenosnik-pisni-objavljeno-periodično-revialno-tedensko");
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.S", "prenosnik-pisni-objavljeno-periodično-revialno-štirinajstdnevno");
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.M", "prenosnik-pisni-objavljeno-periodično-revialno-mesečno");
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.D", "prenosnik-pisni-objavljeno-periodično-revialno-redkeje kot na mesec");
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.O", "prenosnik-pisni-objavljeno-periodično-revialno-občasno");
GIGAFIDA_TAXONOMY.put("Ft.P.P.N", "prenosnik-pisni-neobjavljeno");
GIGAFIDA_TAXONOMY.put("Ft.P.P.N.J", "prenosnik-pisni-neobjavljeno-javno");
GIGAFIDA_TAXONOMY.put("Ft.P.P.N.I", "prenosnik-pisni-neobjavljeno-interno");
GIGAFIDA_TAXONOMY.put("Ft.P.P.N.Z", "prenosnik-pisni-neobjavljeno-zasebno");
GIGAFIDA_TAXONOMY.put("Ft.Z", "zvrst");
GIGAFIDA_TAXONOMY.put("Ft.Z.U", "zvrst-umetnostna");
GIGAFIDA_TAXONOMY.put("Ft.Z.U.P", "zvrst-umetnostna-pesniška");
GIGAFIDA_TAXONOMY.put("Ft.Z.U.R", "zvrst-umetnostna-prozna");
GIGAFIDA_TAXONOMY.put("Ft.Z.U.D", "zvrst-umetnostna-dramska");
GIGAFIDA_TAXONOMY.put("Ft.Z.N", "zvrst-neumetnostna");
GIGAFIDA_TAXONOMY.put("Ft.Z.N.S", "zvrst-neumetnostna-strokovna");
GIGAFIDA_TAXONOMY.put("Ft.Z.N.S.H", "zvrst-neumetnostna-strokovna-humanistična in družboslovna");
GIGAFIDA_TAXONOMY.put("Ft.Z.N.S.N", "zvrst-neumetnostna-strokovna-naravoslovna in tehnična");
GIGAFIDA_TAXONOMY.put("Ft.Z.N.N", "zvrst-neumetnostna-nestrokovna");
GIGAFIDA_TAXONOMY.put("Ft.Z.N.P", "zvrst-neumetnostna-pravna");
GIGAFIDA_TAXONOMY.put("Ft.L", "zvrst-lektorirano");
GIGAFIDA_TAXONOMY.put("Ft.L.D", "zvrst-lektorirano-da");
GIGAFIDA_TAXONOMY.put("Ft.L.N", "zvrst-lektorirano-ne");
// GOS ----------------------------------
GOS_TAXONOMY = new LinkedHashMap<>();
GOS_TAXONOMY.put("gos.T", "diskurz");
GOS_TAXONOMY.put("gos.T.J", "diskurz-javni");
GOS_TAXONOMY.put("gos.T.J.I", "diskurz-javni-informativno-izobraževalni");
GOS_TAXONOMY.put("gos.T.J.R", "diskurz-javni-razvedrilni");
GOS_TAXONOMY.put("gos.T.N", "diskurz-nejavni");
GOS_TAXONOMY.put("gos.T.N.N", "diskurz-nejavni-nezasebni");
GOS_TAXONOMY.put("gos.T.N.Z", "diskurz-nejavni-zasebni");
GOS_TAXONOMY.put("gos.S", "situacija");
GOS_TAXONOMY.put("gos.S.R", "situacija-radio");
GOS_TAXONOMY.put("gos.S.T", "situacija-televizija");
}
/**
* Returns the whole default taxonomy for the specified corpus type
*/
public static ObservableList<String> getTaxonomyForComboBox(CorpusType corpusType) {
if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES) {
return FXCollections.observableArrayList(GIGAFIDA_TAXONOMY.values());
} else if (corpusType == CorpusType.GOS) {
return FXCollections.observableArrayList(GOS_TAXONOMY.values());
}
return FXCollections.observableArrayList(new ArrayList<>());
}
/**
* Returns taxonomy names only for items found in headers
*/
public static ObservableList<String> getTaxonomyForComboBox(CorpusType corpusType, HashSet<String> foundTax) {
LinkedHashMap<String, String> tax = new LinkedHashMap<>();
if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES) {
tax = GIGAFIDA_TAXONOMY;
} else if (corpusType == CorpusType.GOS) {
tax = GOS_TAXONOMY;
}
ArrayList<String> taxForCombo = new ArrayList<>();
// assures same relative order
for (String t : tax.keySet()) {
if (foundTax.contains(t)) {
taxForCombo.add(tax.get(t));
}
}
return FXCollections.observableArrayList(taxForCombo);
}
public static HashSet<CorpusType> getCorpusTypesWithTaxonomy() {
return corpusTypesWithTaxonomy;
}
public static ArrayList<String> getTaxonomyCodes(ArrayList<String> taxonomyNames, CorpusType corpusType) {
ArrayList<String> result = new ArrayList<>();
if (ValidationUtil.isEmpty(taxonomyNames)) {
return result;
}
LinkedHashMap<String, String> tax = new LinkedHashMap<>();
if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES) {
tax = GIGAFIDA_TAXONOMY;
} else if (corpusType == CorpusType.GOS) {
tax = GOS_TAXONOMY;
}
// for easier lookup
Map<String, String> taxInversed = tax.entrySet()
.stream()
.collect(Collectors.toMap(Map.Entry::getValue, Map.Entry::getKey));
for (String taxonomyName : taxonomyNames) {
result.add(taxInversed.get(taxonomyName));
}
return result;
}
/**
* Returns a list of proper names for codes
*
* @param corpusType
* @param taxonomy
*
* @return
*/
public static ArrayList<String> getTaxonomyForInfo(CorpusType corpusType, ArrayList<String> taxonomy) {
LinkedHashMap<String, String> tax = new LinkedHashMap<>();
if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES) {
tax = GIGAFIDA_TAXONOMY;
} else if (corpusType == CorpusType.GOS) {
tax = GOS_TAXONOMY;
}
ArrayList<String> result = new ArrayList<>();
for (String t : taxonomy) {
result.add(tax.get(t));
}
return result;
}
}

View File

@ -0,0 +1,171 @@
package data;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.stream.Collectors;
import javafx.collections.FXCollections;
import javafx.collections.ObservableList;
/**
 * Taxonomy entries as (display name, taxonomy code, owning corpus) triples.
 * Many Gigafida entries are still placeholders carrying the literal values
 * "opis"/"identifikator" and are therefore NOT resolvable by name.
 */
public enum Taxonomy {
    // GOS
    JAVNI("javni", "T.J", "gos"),
    INFORMATIVNO_IZOBRAZEVALNI("informativno-izobraževalni", "T.J.I", "gos"),
    RAZVEDRILNI("razvedrilni", "T.J.R", "gos"),
    NEJAVNI("nejavni", "T.N", "gos"),
    NEZASEBNI("nezasebni", "T.N.N", "gos"),
    ZASEBNI("zasebni", "T.N.Z", "gos"),
    OSEBNI_STIK("osebni stik", "K.O", "gos"),
    TELEFON("telefon", "K.P", "gos"),
    RADIO("radio", "K.R", "gos"),
    TELEVIZIJA("televizija", "K.T", "gos"),
    // Gigafida
    KNJIZNO("knjižno", "T.K", "gigafida"),
    LEPOSLOVNO("leposlovno", "T.K.L", "gigafida"),
    STROKOVNO("strokovno", "T.K.S", "gigafida"),
    PERIODICNO("periodično", "T.P", "gigafida"),
    CASOPIS("časopis", "T.P.C", "gigafida"),
    REVIJA("revija", "T.P.R", "gigafida"),
    INTERNET("internet", "I", "gigafida"),
    SSJ_TISK("tisk", "SSJ.T", "gigafida"),
    // placeholder entries below - name/code not yet filled in
    SSJ_KNJIZNO("opis", "identifikator", "gigafida"),
    SSJ_LEPOSLOVNO("opis", "identifikator", "gigafida"),
    SSJ_STROKOVNO("opis", "identifikator", "gigafida"),
    SSJ_PERIODICNO("opis", "identifikator", "gigafida"),
    SSJ_CASOPIS("opis", "identifikator", "gigafida"),
    SSJ_REVIJA("opis", "identifikator", "gigafida"),
    SSJ_DRUGO("opis", "identifikator", "gigafida"),
    SSJ_INTERNET("opis", "identifikator", "gigafida"),
    FT_P_PRENOSNIK("opis", "identifikator", "gigafida"),
    FT_P_GOVORNI("opis", "identifikator", "gigafida"),
    FT_P_ELEKTRONSKI("opis", "identifikator", "gigafida"),
    FT_P_PISNI("opis", "identifikator", "gigafida"),
    FT_P_OBJAVLJENO("opis", "identifikator", "gigafida"),
    FT_P_KNJIZNO("opis", "identifikator", "gigafida"),
    FT_P_PERIODICNO("opis", "identifikator", "gigafida"),
    FT_P_CASOPISNO("opis", "identifikator", "gigafida"),
    FT_P_DNEVNO("opis", "identifikator", "gigafida"),
    FT_P_VECKRAT_TEDENSKO("opis", "identifikator", "gigafida"),
    FT_P_REVIALNO("opis", "identifikator", "gigafida"),
    FT_P_TEDENSKO("opis", "identifikator", "gigafida"),
    FT_P_STIRINAJSTDNEVNO("opis", "identifikator", "gigafida"),
    FT_P_MESECNO("opis", "identifikator", "gigafida"),
    FT_P_REDKEJE_KOT_MESECNO("opis", "identifikator", "gigafida"),
    FT_P_OBCASNO("opis", "identifikator", "gigafida"),
    FT_P_NEOBJAVLJENO("opis", "identifikator", "gigafida"),
    FT_P_JAVNO("opis", "identifikator", "gigafida"),
    FT_P_INTERNO("opis", "identifikator", "gigafida"),
    FT_P_ZASEBNO("opis", "identifikator", "gigafida"),
    FT_ZVRST("opis", "identifikator", "gigafida"),
    FT_UMETNOSTNA("opis", "identifikator", "gigafida"),
    FT_PESNISKA("opis", "identifikator", "gigafida"),
    FT_PROZNA("opis", "identifikator", "gigafida"),
    FT_DRAMSKA("opis", "identifikator", "gigafida"),
    FT_NEUMETNOSTNA("opis", "identifikator", "gigafida"),
    FT_STROKOVNA("opis", "identifikator", "gigafida"),
    FT_HID("opis", "identifikator", "gigafida"),
    FT_NIT("opis", "identifikator", "gigafida"),
    FT_NESTROKOVNA("opis", "identifikator", "gigafida"),
    FT_PRAVNA("opis", "identifikator", "gigafida"),
    FT_LEKTORIRANO("opis", "identifikator", "gigafida"),
    FT_DA("opis", "identifikator", "gigafida"),
    FT_NE("opis", "identifikator", "gigafida");

    private final String name;
    private final String taxonomy;
    private final String corpus;

    /**
     * The constants {@link #factory(String)} resolves by display name.
     * Deliberately NOT {@code values()}: the placeholder constants all share
     * the name "opis" (and SSJ_TISK was never matched by the old if-chain),
     * so iterating everything would change lookup results.
     */
    private static final Taxonomy[] RESOLVABLE = {
            JAVNI, INFORMATIVNO_IZOBRAZEVALNI, RAZVEDRILNI, NEJAVNI, NEZASEBNI, ZASEBNI,
            OSEBNI_STIK, TELEFON, RADIO, TELEVIZIJA,
            KNJIZNO, LEPOSLOVNO, STROKOVNO, PERIODICNO, CASOPIS, REVIJA, INTERNET
    };

    Taxonomy(String name, String taxonomy, String corpusType) {
        this.name = name;
        this.taxonomy = taxonomy;
        this.corpus = corpusType;
    }

    @Override
    public String toString() {
        return this.name;
    }

    // public name kept as-is (including the typo) - callers depend on it
    public String getTaxonomnyString() {
        return this.taxonomy;
    }

    /**
     * Resolves a display name back to its constant.
     *
     * @param tax display name as produced by {@link #toString()}; may be null
     * @return the matching constant, or null if unknown/unresolvable
     */
    public static Taxonomy factory(String tax) {
        if (tax != null) {
            for (Taxonomy candidate : RESOLVABLE) {
                if (candidate.name.equals(tax)) {
                    return candidate;
                }
            }
        }
        return null;
    }

    /**
     * Returns the display names of all entries belonging to the given corpus,
     * for populating a combo box with defaults.
     */
    public static ObservableList<String> getDefaultForComboBox(String corpusType) {
        ArrayList<String> values = Arrays.stream(Taxonomy.values())
                .filter(x -> x.corpus.equals(corpusType))
                .map(x -> x.name)
                .collect(Collectors.toCollection(ArrayList::new));
        return FXCollections.observableArrayList(values);
    }

    public static ObservableList<String> getDefaultForComboBox(CorpusType corpusType) {
        return getDefaultForComboBox(corpusType.toString());
    }
}

View File

@ -0,0 +1,53 @@
package data;
import static gui.ValidationUtil.*;
import java.util.ArrayList;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import gui.Messages;
import gui.ValidationUtil;
/**
 * Filter validation for string-level analyses.
 */
public class Validation {

    /**
     * Validates a filter configured for string-level analysis.
     *
     * <p>Also normalizes the filter as a side effect: a null skip value is
     * defaulted to 0.
     *
     * @param filter filter to validate (mutated: skip value may be set)
     * @return null if valid, otherwise a ", \n"-joined list of error messages
     */
    public static String validateForStringLevel(Filter filter) {
        ArrayList<String> errors = new ArrayList<>();

        // ngram value is set during init; null here means init failed
        if (filter.getNgramValue() == null) {
            errors.add(Messages.MISSING_NGRAM_LEVEL);
        }
        // same: calculateFor should have been initialized
        if (filter.getCalculateFor() == null) {
            errors.add(Messages.MISSING_CALCULATE_FOR);
        }
        // a missing skip value is not an error - default it to 0
        if (filter.getSkipValue() == null) {
            filter.setSkipValue(0);
        }

        // n-gram level and number of MSD tokens must agree.
        // (An earlier duplicate of this check read msd.size() after isEmpty(msd),
        // which could NPE on a null msd list and could never actually add an
        // error; it has been removed in favor of this guarded version.)
        Integer ngramValue = filter.getNgramValue();
        ArrayList<Pattern> msd = filter.getMsd();
        if (ngramValue != null && ngramValue > 0 && !ValidationUtil.isEmpty(msd) && ngramValue != msd.size()) {
            errors.add(String.format(Messages.WARNING_MISMATCHED_NGRAM_AND_TOKENS_VALUES, ngramValue, msd.size()));
        }

        if (filter.getNgramValue() != null && filter.getNgramValue() == 0 && isEmpty(filter.getStringLength())) {
            // when counting letters, a string length is required
            // TODO: check that words we're adding in xml reader are longer than this value
            errors.add(Messages.MISSING_STRING_LENGTH);
        }

        return isEmpty(errors) ? null : StringUtils.join(errors, ", \n");
    }
}

View File

@ -0,0 +1,141 @@
package data;
import java.io.Serializable;
import java.util.Arrays;
import java.util.HashSet;
import org.apache.commons.lang3.StringUtils;
import data.Enums.Msd;
import gui.ValidationUtil;
/**
 * A single corpus token: surface form, lemma and (normalized) MSD tag.
 * Surface forms are lowercased on construction except for proper nouns.
 */
public class Word implements Serializable {
    public static final char PAD_CHARACTER = '-';

    private String word;
    private String lemma;
    private String msd;

    // static: shared lookup table; as an instance field it was serialized
    // with every Word (note: changing it alters the default serialVersionUID)
    private static final HashSet<Character> VOWELS = new HashSet<>(Arrays.asList('a', 'e', 'i', 'o', 'u'));

    /**
     * Possible values:
     * <p>
     * <ul>
     * <li>S = samostalnik</li>
     * <li>G = glagol</li>
     * <li>P = pridevnik</li>
     * <li>R = prislov</li>
     * <li>Z = zaimek</li>
     * <li>K = števnik</li>
     * <li>D = predlog</li>
     * <li>V = veznik</li>
     * <li>L = členek</li>
     * <li>M = medmet</li>
     * <li>O = okrajšava</li>
     * <li>N = neuvrščeno</li>
     * </ul>
     */
    //private char besedna_vrsta;

    public Word(String word, String lemma, String msd) {
        this.lemma = lemma;
        this.msd = normalizeMsd(msd);

        // keep capitalization only for proper nouns (msd starts with "Sl")
        if (!ValidationUtil.isEmpty(this.msd) && !(this.msd.charAt(0) == 'S'
                && this.msd.length() >= 2
                && this.msd.charAt(1) == 'l')) {
            this.word = word.toLowerCase();
        } else {
            this.word = word;
        }
    }

    public Word() {
    }

    public Word(String word) {
        this.word = word;
    }

    /**
     * Appends a number of '-' to msds which are not properly sized.
     * E.g. nouns should have 5 attributes, yet the last one isn't always defined (Somei vs. Sometd)
     *
     * @param msdInput raw msd tag, possibly short or empty
     * @return padded msd, or "" for empty input
     */
    private String normalizeMsd(String msdInput) {
        if (ValidationUtil.isEmpty(msdInput)) {
            return "";
        } else {
            return StringUtils.rightPad(msdInput, Msd.getMsdLengthForType(msdInput), PAD_CHARACTER);
        }
    }

    public String getWord() {
        return word;
    }

    /** Surface form reduced to a consonant/vowel (C/V) skeleton. */
    public String getCVVWord() {
        return convertToCvv(word);
    }

    /** Lemma reduced to a consonant/vowel (C/V) skeleton. */
    public String getCVVLemma() {
        return convertToCvv(lemma);
    }

    // renamed from "covertToCvv" (typo); private, so no callers break.
    // NOTE(review): only lowercase ASCII vowels map to 'V' - uppercase letters
    // in proper nouns become 'C'; confirm that is intended
    private String convertToCvv(String s) {
        char[] chars = s.toCharArray();
        for (int i = 0; i < chars.length; i++) {
            chars[i] = VOWELS.contains(chars[i]) ? 'V' : 'C';
        }
        return new String(chars);
    }

    public void setWord(String word) {
        this.word = word;
    }

    public String getLemma() {
        return lemma;
    }

    public void setLemma(String lemma) {
        this.lemma = lemma;
    }

    public String getMsd() {
        return msd;
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("beseda:\t")
                .append(getWord())
                .append("\n")
                .append("lema:\t")
                .append(getLemma())
                .append("\n")
                .append("msd:\t")
                .append(getMsd())
                .append("\n");
        return sb.toString();
    }

    /**
     * Returns the word or lemma (per {@code calculateFor}), optionally as its
     * C/V skeleton when {@code cvv} is true.
     */
    public String getForCf(CalculateFor calculateFor, boolean cvv) {
        String returnValue = "";
        if (cvv) {
            returnValue = calculateFor == CalculateFor.WORD ? getCVVWord() : getCVVLemma();
        } else {
            returnValue = calculateFor == CalculateFor.WORD ? getWord() : getLemma();
        }
        return returnValue;
    }
}

View File

@ -0,0 +1,454 @@
package gui;
import data.*;
import javafx.application.HostServices;
import javafx.beans.value.ChangeListener;
import javafx.beans.value.ObservableValue;
import javafx.collections.FXCollections;
import javafx.collections.ListChangeListener;
import javafx.collections.ObservableList;
import javafx.concurrent.Task;
import javafx.fxml.FXML;
import javafx.scene.control.*;
import javafx.scene.layout.Pane;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.controlsfx.control.CheckComboBox;
import java.io.File;
import java.io.UnsupportedEncodingException;
import java.util.*;
import java.util.regex.Pattern;
import static alg.XML_processing.readXML;
import static gui.GUIController.showAlert;
import static gui.Messages.*;
@SuppressWarnings("Duplicates")
/**
 * Controller for the character-level ("letters") analysis tab: wires up the
 * form fields, validates the resulting {@link Filter} and runs the analysis
 * on a background task.
 */
@SuppressWarnings("Duplicates")
public class CharacterAnalysisTab {
    public final static Logger logger = LogManager.getLogger(CharacterAnalysisTab.class);

    @FXML
    public Label selectedFiltersLabel;
    @FXML
    public Label solarFilters;
    @FXML
    private TextField msdTF;
    private ArrayList<Pattern> msd;
    private ArrayList<String> msdStrings;
    @FXML
    private CheckComboBox<String> taxonomyCCB;
    private ArrayList<String> taxonomy;
    @FXML
    private CheckBox calculatecvvCB;
    private boolean calculateCvv;
    @FXML
    private TextField stringLengthTF;
    private Integer stringLength;
    @FXML
    private ToggleGroup calculateForRB;
    private CalculateFor calculateFor;
    @FXML
    private RadioButton lemmaRB;
    @FXML
    private RadioButton varietyRB;
    @FXML
    private Pane paneLetters;
    @FXML
    private Button computeNgramsB;
    @FXML
    public ProgressBar ngramProgressBar;
    @FXML
    public Label progressLabel;
    @FXML
    private Hyperlink helpH;

    private enum MODE {
        LETTER
    }

    private MODE currentMode;
    private Corpus corpus;
    private HashMap<String, HashSet<String>> solarFiltersMap;
    private Filter filter;
    private boolean useDb;
    private HostServices hostService;

    private static final ObservableList<String> N_GRAM_COMPUTE_FOR_LETTERS = FXCollections.observableArrayList("različnica", "lema");
    private static final ObservableList<String> N_GRAM_COMPUTE_FOR_WORDS_ORTH = FXCollections.observableArrayList("različnica");

    // TODO: pass observables for taxonomy based on header scan
    // after header scan
    private ObservableList<String> taxonomyCCBValues;
    private CorpusType currentCorpusType;

    public void init() {
        currentMode = MODE.LETTER;
        toggleMode(currentMode);

        // track the selected "calculate for" radio button
        calculateForRB.selectedToggleProperty().addListener((observable, oldValue, newValue) -> {
            RadioButton chk = (RadioButton) newValue.getToggleGroup().getSelectedToggle();
            calculateFor = CalculateFor.factory(chk.getText());
            // log4j parameterized message needs a {} placeholder,
            // otherwise the argument is silently dropped
            logger.info("calculateForRB: {}", chk.getText());
        });

        // msd: parsed when the field loses focus
        msdTF.focusedProperty().addListener((observable, oldValue, newValue) -> {
            if (!newValue) {
                // focus lost
                String value = msdTF.getText();
                logger.info("msdTf: {}", value);

                if (!ValidationUtil.isEmpty(value)) {
                    ArrayList<String> msdTmp = new ArrayList<>(Arrays.asList(value.split(" ")));

                    // letter analysis expects exactly one msd pattern
                    int nOfRequiredMsdTokens = 1;
                    if (msdTmp.size() != nOfRequiredMsdTokens) {
                        String msg = String.format(Messages.WARNING_MISMATCHED_NGRAM_AND_TOKENS_VALUES, nOfRequiredMsdTokens, msdTmp.size());
                        logAlert(msg);
                        showAlert(Alert.AlertType.ERROR, msg);
                    }
                    msd = new ArrayList<>();
                    msdStrings = new ArrayList<>();
                    for (String msdToken : msdTmp) {
                        msd.add(Pattern.compile(msdToken));
                        msdStrings.add(msdToken);
                    }
                    logger.info(String.format("msd accepted (%d)", msd.size()));
                } else if (!ValidationUtil.isEmpty(newValue)) {
                    // NOTE(review): newValue is the focus Boolean, not the text,
                    // so this effectively resets msd whenever the field is left
                    // empty - confirm that is the intent
                    msd = new ArrayList<>();
                    msdStrings = new ArrayList<>();
                }
            }
        });
        msdTF.setText("");
        msd = new ArrayList<>();

        // taxonomy: only enabled for corpora with a known taxonomy
        if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
            taxonomyCCB.getItems().removeAll();
            taxonomyCCB.getItems().setAll(corpus.getTaxonomy());
            taxonomyCCB.getCheckModel().getCheckedItems().addListener((ListChangeListener<String>) c -> {
                taxonomy = new ArrayList<>();
                ObservableList<String> checkedItems = taxonomyCCB.getCheckModel().getCheckedItems();
                taxonomy.addAll(checkedItems);
                logger.info(String.format("Selected taxonomy: %s", StringUtils.join(checkedItems, ",")));
            });
            taxonomyCCB.getCheckModel().clearChecks();
        } else {
            taxonomyCCB.setDisable(true);
        }

        // cvv
        calculatecvvCB.selectedProperty().addListener((observable, oldValue, newValue) -> {
            calculateCvv = newValue;
            logger.info("calculate cvv: " + calculateCvv);
        });

        // string length: validated when the field loses focus
        stringLengthTF.focusedProperty().addListener((observable, oldValue, newValue) -> {
            if (!newValue) {
                // focus lost
                String value = stringLengthTF.getText();
                if (!ValidationUtil.isEmpty(value)) {
                    if (!ValidationUtil.isNumber(value)) {
                        logAlert("stringlengthTf: " + WARNING_ONLY_NUMBERS_ALLOWED);
                        GUIController.showAlert(Alert.AlertType.ERROR, WARNING_ONLY_NUMBERS_ALLOWED);
                    }
                    stringLength = Integer.parseInt(value);
                } else {
                    GUIController.showAlert(Alert.AlertType.ERROR, WARNING_MISSING_STRING_LENGTH);
                    stringLengthTF.setText("1");
                    logAlert(WARNING_MISSING_STRING_LENGTH);
                }
            }
        });

        computeNgramsB.setOnAction(e -> {
            compute();
            logger.info("compute button");
        });

        helpH.setOnAction(e -> openHelpWebsite());
    }

    /**
     * case a: values for combo boxes can change after a corpus change
     * <ul>
     * <li>different corpus type - reset all fields so no old values remain</li>
     * <li>same corpus type, different subset - keep</li>
     * </ul>
     * <p>
     * case b: values for combo boxes can change after a header scan
     * <ul>
     * <li>at first, fields are populated by corpus type defaults</li>
     * <li>after, with gathered data</li>
     * </ul>
     * <p></p>
     * ngrams: 1
     * calculateFor: word
     * msd:
     * taxonomy:
     * skip: 0
     * iscvv: false
     * string length: 1
     */
    public void populateFields() {
        // corpus changed if: current one is null (this is first run of the app)
        // or if currentCorpus != gui's corpus
        boolean corpusChanged = currentCorpusType == null
                || currentCorpusType != corpus.getCorpusType();

        // TODO: check for GOS, GIGAFIDA, SOLAR...
        // TODO if current value != null && is in new calculateFor ? keep : otherwise reset
        if (calculateFor == null) {
            calculateForRB.selectToggle(lemmaRB);
            // use the radio button's text (as the selection listener does);
            // Toggle.toString() does not yield a CalculateFor name
            calculateFor = CalculateFor.factory(((RadioButton) calculateForRB.getSelectedToggle()).getText());
        }

        if (!filter.hasMsd()) {
            // if current corpus doesn't have msd data, disable this field
            msd = new ArrayList<>();
            msdTF.setText("");
            msdTF.setDisable(true);
            logger.info("no msd data");
        } else {
            if (ValidationUtil.isEmpty(msd)
                    || (!ValidationUtil.isEmpty(msd) && corpusChanged)) {
                // msd has not been set previously
                // or msd has been set but the corpus changed -> reset
                msd = new ArrayList<>();
                msdTF.setText("");
                msdTF.setDisable(false);
                logger.info("msd reset");
            } else if (!ValidationUtil.isEmpty(msd) && !corpusChanged) {
                // if msd has been set, but corpus type remained the same, we can keep any set msd value
                msdTF.setText(StringUtils.join(msdStrings, " "));
                msdTF.setDisable(false);
                logger.info("msd kept");
            }
        }

        // TODO: taxonomy: refresh and keep if in new taxonomy, otherwise empty (no selection)

        // keep calculateCvv
        calculatecvvCB.setSelected(calculateCvv);

        // keep string length if set
        if (stringLength != null) {
            stringLengthTF.setText(String.valueOf(stringLength));
        } else {
            stringLengthTF.setText("1");
            stringLength = 1;
        }

        // TODO: trigger on rescan
        if (currentCorpusType != null && currentCorpusType != corpus.getCorpusType()) {
            // user changed corpus (by type) or by selection & triggered a rescan of headers
            currentCorpusType = corpus.getCorpusType();
        }
        // see if we read taxonomy from headers, otherwise use default values for given corpus
        // (previously computed twice - once in the branch above and once here)
        ObservableList<String> tax = corpus.getTaxonomy();
        taxonomyCCBValues = tax != null ? tax : Taxonomy.getDefaultForComboBox(corpus.getCorpusType());
        taxonomyCCB.getItems().addAll(taxonomyCCBValues);
    }

    /**
     * Toggles visibility for panes which hold fields for skipgram value (not applicable when calculating for letters) etc.,
     * sets combobox values to what is applicable ...
     *
     * @param mode mode to apply; null re-applies the current mode
     */
    public void toggleMode(MODE mode) {
        if (mode == null) {
            mode = currentMode;
        }
        logger.info("mode: {}", mode.toString());

        if (mode == MODE.LETTER) {
            paneLetters.setVisible(true);

            // populate with default cvv length value
            if (stringLength == null) {
                stringLengthTF.setText("1");
                stringLength = 1;
            } else {
                stringLengthTF.setText(String.valueOf(stringLength));
            }

            // if calculateFor was selected for something other than a word or a lemma -> reset
            if (!(calculateFor == CalculateFor.WORD || calculateFor == CalculateFor.LEMMA)) {
                // if the user selected something else before selecting ngram for letters, reset that choice
                calculateFor = CalculateFor.LEMMA;
                calculateForRB.selectToggle(lemmaRB);
            }
        }

        // override if orth mode, allow only word
        if (corpus.isGosOrthMode()) {
            // TODO change to
            varietyRB.setDisable(true);
            msdTF.setDisable(true);
        } else {
            msdTF.setDisable(false);
            varietyRB.setDisable(false);
        }
    }

    /** Builds the filter from the form fields, validates it and starts the analysis. */
    private void compute() {
        Filter filter = new Filter();
        filter.setNgramValue(0);
        filter.setCalculateFor(calculateFor);
        filter.setMsd(msd);
        filter.setTaxonomy(Tax.getTaxonomyCodes(taxonomy, corpus.getCorpusType()));
        filter.setAl(AnalysisLevel.STRING_LEVEL);
        filter.setSkipValue(0);
        filter.setIsCvv(calculateCvv);
        filter.setSolarFilters(solarFiltersMap);
        filter.setStringLength(stringLength);

        String message = Validation.validateForStringLevel(filter);
        if (message == null) {
            // no errors
            logger.info("Executing: {}", filter.toString());
            StatisticsNew statistic = new StatisticsNew(corpus, filter, useDb);
            execute(statistic);
        } else {
            logAlert(message);
            showAlert(Alert.AlertType.ERROR, "Prosim izpolnite polja:", message);
        }
    }

    private void openHelpWebsite() {
        hostService.showDocument(Messages.HELP_URL);
    }

    private void logAlert(String alert) {
        logger.info("alert: " + alert);
    }

    public Corpus getCorpus() {
        return corpus;
    }

    public void setCorpus(Corpus corpus) {
        this.corpus = corpus;

        if (corpus.getCorpusType() != CorpusType.SOLAR) {
            setSelectedFiltersLabel(null);
        } else {
            setSelectedFiltersLabel("/");
        }
    }

    public void setSelectedFiltersLabel(String content) {
        if (content != null) {
            solarFilters.setVisible(true);
            selectedFiltersLabel.setVisible(true);
            selectedFiltersLabel.setText(content);
        } else {
            solarFilters.setVisible(false);
            selectedFiltersLabel.setVisible(false);
        }
    }

    /**
     * Runs the analysis over all detected corpus files on a daemon background
     * thread, updating the progress bar/label, and saves results on success.
     */
    private void execute(StatisticsNew statistic) {
        logger.info("Started execution: {}", statistic.getFilter());
        Collection<File> corpusFiles = statistic.getCorpus().getDetectedCorpusFiles();

        final Task<Void> task = new Task<Void>() {
            @SuppressWarnings("Duplicates")
            @Override
            protected Void call() throws Exception {
                long i = 0;
                for (File f : corpusFiles) {
                    readXML(f.toString(), statistic);
                    i++;
                    this.updateProgress(i, corpusFiles.size());
                    this.updateMessage(String.format(ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y, i, corpusFiles.size(), f.getName()));
                }
                return null;
            }
        };

        ngramProgressBar.progressProperty().bind(task.progressProperty());
        progressLabel.textProperty().bind(task.messageProperty());

        task.setOnSucceeded(e -> {
            try {
                boolean successfullySaved = statistic.saveResultToDisk();
                if (successfullySaved) {
                    showAlert(Alert.AlertType.INFORMATION, Messages.NOTIFICATION_ANALYSIS_COMPLETED);
                } else {
                    showAlert(Alert.AlertType.INFORMATION, Messages.NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS);
                }
            } catch (UnsupportedEncodingException e1) {
                showAlert(Alert.AlertType.ERROR, ERROR_WHILE_SAVING_RESULTS_TO_CSV);
                logger.error("Error while saving", e1);
            }

            ngramProgressBar.progressProperty().unbind();
            ngramProgressBar.setStyle(Settings.FX_ACCENT_OK);
            progressLabel.textProperty().unbind();
            progressLabel.setText("");
        });

        task.setOnFailed(e -> {
            showAlert(Alert.AlertType.ERROR, ERROR_WHILE_EXECUTING);
            // log the actual cause, not the WorkerStateEvent
            logger.error("Error while executing", task.getException());
            ngramProgressBar.progressProperty().unbind();
            ngramProgressBar.setProgress(0.0);
            ngramProgressBar.setStyle(Settings.FX_ACCENT_NOK);
            progressLabel.textProperty().unbind();
            progressLabel.setText("");
        });

        final Thread thread = new Thread(task, "task");
        thread.setDaemon(true);
        thread.start();
    }

    public void setSolarFiltersMap(HashMap<String, HashSet<String>> solarFiltersMap) {
        this.solarFiltersMap = solarFiltersMap;
    }

    public void setHostServices(HostServices hostServices) {
        this.hostService = hostServices;
    }
}

View File

@ -0,0 +1,517 @@
package gui;
import static data.CorpusType.*;
import static gui.GUIController.*;
import static gui.Messages.*;
import static util.Util.*;
import java.io.File;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOCase;
import org.apache.commons.io.filefilter.FileFilterUtils;
import org.apache.commons.io.filefilter.TrueFileFilter;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import alg.XML_processing;
import data.Corpus;
import data.CorpusType;
import data.Enums.solar.SolarFilters;
import data.Tax;
import javafx.collections.ObservableList;
import javafx.concurrent.Task;
import javafx.fxml.FXML;
import javafx.scene.control.*;
import javafx.scene.layout.Pane;
import javafx.stage.DirectoryChooser;
import javafx.stage.Stage;
import javafx.application.HostServices;
/**
 * Controller for the corpus-selection tab.
 * <p>
 * Lets the user pick a corpus directory and a results directory, detects the
 * corpus type from the XML headers, optionally scans the headers for
 * taxonomy/filter data on a background thread, and enables the analysis tabs
 * once a valid {@link Corpus} is configured.
 */
public class CorpusTab {
    public final static Logger logger = LogManager.getLogger(CorpusTab.class);

    public Pane setCorpusWrapperP;

    private Stage stage;

    @FXML
    private Button chooseCorpusLocationB;
    // NOTE(review): never assigned anywhere in this class — confirm it is still needed
    private File chosenCorpusLocation;

    @FXML
    private CheckBox readHeaderInfoChB;
    private boolean readHeaderInfo;

    @FXML
    private CheckBox gosUseOrthChB;
    private boolean gosUseOrth;

    @FXML
    private Button chooseResultsLocationB;

    @FXML
    private Label chooseCorpusL;
    private String chooseCorpusLabelContent;

    @FXML
    private Label chooseResultsL;
    private String chooseResultsLabelContent;

    @FXML
    private ProgressIndicator locationScanPI;

    @FXML
    private Hyperlink helpH;

    // *** shared ***
    private Corpus corpus;
    private CorpusType corpusType;

    // tabs - used to enable/disable
    private Tab stringLevelTabNew2;
    private Tab oneWordAnalysisTab;
    private Tab characterLevelTab;
    private Tab wordFormationTab;
    private Tab wordLevelTab;
    private Tab filterTab;
    private TabPane tabPane;
    private StringAnalysisTabNew2 satNew2Controller;
    private OneWordAnalysisTab oneWordTabController;
    private CharacterAnalysisTab catController;
    private FiltersForSolar ffsController;
    //private WordFormationTab wfController;
    private WordLevelTab wlController;
    private HostServices hostService;

    /**
     * Wires up control listeners, tooltips and default label texts.
     * Invoked automatically by the FXML loader.
     */
    public void initialize() {
        stage = new Stage();
        // add listeners
        chooseCorpusLocationB.setOnAction(e -> chooseCorpusLocation());
        chooseCorpusLocationB.setTooltip(new Tooltip(TOOLTIP_chooseCorpusLocationB));
        helpH.setOnAction(e -> openHelpWebsite());
        readHeaderInfoChB.selectedProperty().addListener((observable, oldValue, newValue) -> {
            readHeaderInfo = newValue;
            // "{}" added: the original message had no placeholder, so log4j
            // silently dropped the supplied value
            logger.info("read headers: {}", readHeaderInfo);
        });
        readHeaderInfoChB.setTooltip(new Tooltip(TOOLTIP_readHeaderInfoChB));
        gosUseOrthChB.selectedProperty().addListener((observable, oldValue, newValue) -> {
            gosUseOrth = newValue;
            corpus.setGosOrthMode(gosUseOrth);
            // orthography mode disables word-formation analysis
            wordFormationTab.setDisable(gosUseOrth);
            satNew2Controller.toggleMode(null);
            oneWordTabController.toggleMode(null);
            catController.toggleMode(null);
            logger.info("gosUseOrth: {}", gosUseOrth);
        });
        chooseResultsLocationB.setOnAction(e -> chooseResultsLocation(null));
        // set labels and toggle visibility
        toggleGosChBVisibility();
        chooseCorpusLabelContent = Messages.LABEL_CORPUS_LOCATION_NOT_SET;
        chooseCorpusL.setText(chooseCorpusLabelContent);
        chooseResultsLabelContent = Messages.LABEL_RESULTS_LOCATION_NOT_SET;
        chooseResultsL.setText(chooseResultsLabelContent);
        togglePiAndSetCorpusWrapper(false);
    }

    /**
     * Shows/hides the scan progress indicator and shifts the wrapper pane so the
     * indicator has room while a scan is running.
     */
    private void togglePiAndSetCorpusWrapper(boolean piIsActive) {
        locationScanPI.setVisible(piIsActive);
        setCorpusWrapperP.setLayoutX(piIsActive ? 100.0 : 10.0);
    }

    /** Opens the help website in the default system browser. */
    private void openHelpWebsite(){
        hostService.showDocument(Messages.HELP_URL);
    }

    /**
     * In order for a directory to pass as a valid corpus location, following criteria has to be met:
     * <ul>
     * <li>it can't be null</li>
     * <li>it has to be readable</li>
     * <li>it has to contain xml files</li>
     * <li>xml files have to contain valid headers from which we can infer the corpus type</li>
     * <li>corpus type must be one of the expected corpus types - as noted in the @see data.CorpusType.class </li>
     * </ul>
     * <p>
     * Additionally, if the user checks to read taxonomy/filters from the corpus files, that read
     * has to produce a non-empty list results list
     */
    private void chooseCorpusLocation() {
        File selectedDirectory = directoryChooser();
        if (selectedDirectory != null && ValidationUtil.isReadableDirectory(selectedDirectory)) {
            logger.info("selected corpus dir: {}", selectedDirectory.getAbsolutePath());
            // scan for xml files
            Collection<File> corpusFiles = FileUtils.listFiles(selectedDirectory, FileFilterUtils.suffixFileFilter("xml", IOCase.INSENSITIVE), TrueFileFilter.INSTANCE);
            // make sure there are corpus files in selected directory or notify the user about it
            if (corpusFiles.isEmpty()) {
                logger.info("alert: {}", WARNING_CORPUS_NOT_FOUND);
                showAlert(Alert.AlertType.ERROR, WARNING_CORPUS_NOT_FOUND, null);
            } else {
                String chooseCorpusLabelContentTmp = detectCorpusType(corpusFiles, selectedDirectory.getAbsolutePath());
                if (chooseCorpusLabelContentTmp == null) {
                    logger.info("alert: {}", WARNING_CORPUS_NOT_FOUND);
                    showAlert(Alert.AlertType.ERROR, WARNING_CORPUS_NOT_FOUND, null);
                } else {
                    // initNewCorpus already stores the location and the detected files
                    // on the new Corpus instance
                    initNewCorpus(selectedDirectory, corpusFiles);
                    chooseCorpusLabelContent = chooseCorpusLabelContentTmp;
                    logger.info("corpus dir: {}", corpus.getChosenCorpusLocation().getAbsolutePath());
                    if (readHeaderInfo) {
                        logger.info("reading header info...");
                        readHeaderInfo();
                    } else {
                        setResults();
                        setCorpusForAnalysis();
                    }
                }
            }
        }
    }

    /**
     * If a user selects a valid corpus location, we define a new corpus (so none of the old data gets carried over)
     *
     * @param selectedDirectory the validated corpus directory
     * @param corpusFiles       the xml files found in it
     */
    private void initNewCorpus(File selectedDirectory, Collection<File> corpusFiles) {
        corpus = new Corpus();
        corpus.setCorpusType(corpusType);
        corpus.setDetectedCorpusFiles(corpusFiles);
        corpus.setChosenCorpusLocation(selectedDirectory);
        chooseResultsLocation(selectedDirectory);
    }

    /**
     * Sets the results directory — either to the default (the corpus directory,
     * passed via {@code dir}) or to a user-picked one when {@code dir == null}.
     */
    private void chooseResultsLocation(File dir) {
        // results location can be set either to default value (after selecting valid corpus location) - dir attribute
        // or to a dir picked via directoryChooser (when dir == null)
        File selectedDirectory = dir == null ? directoryChooser() : dir;
        if (selectedDirectory != null) {
            String resultsLocationPath = selectedDirectory.getAbsolutePath().concat(File.separator);
            File chosenResultsLocationTmp = new File(resultsLocationPath);
            if (!ValidationUtil.isValidDirectory(chosenResultsLocationTmp)) {
                showAlert(Alert.AlertType.ERROR, WARNING_RESULTS_DIR_NOT_VALID);
                logger.info("alert: {}", WARNING_RESULTS_DIR_NOT_VALID);
            } else {
                corpus.setChosenResultsLocation(chosenResultsLocationTmp);
                chooseResultsLabelContent = corpus.getChosenResultsLocation().getAbsolutePath();
                chooseResultsL.setText(chooseResultsLabelContent);
                logger.info("results dir: {}", chooseResultsLabelContent);
            }
        }
    }

    /**
     * Refreshes the GOS checkbox visibility and the corpus label after a
     * successful corpus setup.
     */
    private void setResults() {
        // if everything is ok
        // check and enable checkbox if GOS
        toggleGosChBVisibility();
        // set default results location
        // NOTE(review): the path below is only logged, never applied — the actual
        // default is set via initNewCorpus -> chooseResultsLocation; confirm
        String defaultResultsLocationPath = corpus.getChosenCorpusLocation().getAbsolutePath();
        logger.info("setting default results location to: {}", defaultResultsLocationPath);
        chooseCorpusL.setText(chooseCorpusLabelContent);
    }

    /**
     * Scans the corpus XML headers on a daemon background thread and stores the
     * gathered taxonomy (GIGAFIDA/GOS/CCKRES) or filters (SOLAR) on the corpus.
     * Progress is reported per-file when the corpus is split over multiple
     * files, otherwise as indeterminate.
     */
    private void readHeaderInfo() {
        CorpusType corpusType = corpus.getCorpusType();
        Collection<File> corpusFiles = corpus.getDetectedCorpusFiles();
        togglePiAndSetCorpusWrapper(true);
        chooseCorpusL.setText(LABEL_SCANNING_CORPUS);
        logger.info("reading header data for {}", corpusType);
        if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.GOS || corpusType == CorpusType.CCKRES) {
            boolean corpusIsSplit = corpusFiles.size() > 1;
            final Task<HashSet<String>> task = new Task<HashSet<String>>() {
                @Override
                protected HashSet<String> call() throws Exception {
                    HashSet<String> values = new HashSet<>();
                    long i = 0;
                    if (!corpusIsSplit) {
                        // single file: no per-file granularity, show indeterminate progress
                        updateProgress(-1.0f, -1.0f);
                    }
                    for (File file : corpusFiles) {
                        values.addAll((Collection<? extends String>) XML_processing.readXmlHeaderTaxonomyAndFilters(file.getAbsolutePath(), corpusIsSplit, corpusType));
                        i++;
                        if (corpusIsSplit) {
                            updateProgress(i, corpusFiles.size());
                        }
                    }
                    updateProgress(1.0f, 1.0f);
                    return values;
                }
            };
            locationScanPI.progressProperty().bind(task.progressProperty());
            task.setOnSucceeded(e -> {
                ObservableList<String> readTaxonomy = Tax.getTaxonomyForComboBox(corpusType, task.getValue());
                if (ValidationUtil.isEmpty(readTaxonomy)) {
                    // if no taxonomy found alert the user and keep other tabs disabled
                    logger.info("No taxonomy found in headers.");
                    GUIController.showAlert(Alert.AlertType.ERROR, WARNING_NO_TAXONOMY_FOUND);
                } else {
                    // set taxonomy, update label
                    corpus.setTaxonomy(readTaxonomy);
                    corpus.setHeaderRead(true);
                    chooseCorpusL.setText(chooseCorpusLabelContent);
                    setResults();
                    setCorpusForAnalysis();
                }
                togglePiAndSetCorpusWrapper(false);
            });
            task.setOnCancelled(e -> togglePiAndSetCorpusWrapper(false));
            task.setOnFailed(e -> togglePiAndSetCorpusWrapper(false));
            final Thread thread = new Thread(task, "task");
            thread.setDaemon(true);
            thread.start();
        } else if (corpusType == CorpusType.SOLAR) {
            // many many fields
            boolean corpusIsSplit = corpusFiles.size() > 1;
            final Task<HashMap<String, HashSet<String>>> task = new Task<HashMap<String, HashSet<String>>>() {
                @Override
                protected HashMap<String, HashSet<String>> call() throws Exception {
                    HashMap<String, HashSet<String>> values = new HashMap<>();
                    long i = 0;
                    if (!corpusIsSplit) {
                        updateProgress(-1.0f, -1.0f);
                    }
                    for (File file : corpusFiles) {
                        HashMap<String, HashSet<String>> tmpvalues = (HashMap<String, HashSet<String>>) XML_processing.readXmlHeaderTaxonomyAndFilters(file.getAbsolutePath(), corpusIsSplit, corpusType);
                        // update final results: union per-file value sets under each filter key
                        for (Map.Entry<String, HashSet<String>> entry : tmpvalues.entrySet()) {
                            if (values.containsKey(entry.getKey())) {
                                values.get(entry.getKey()).addAll(entry.getValue());
                            } else {
                                values.put(entry.getKey(), entry.getValue());
                            }
                        }
                        i++;
                        if (corpusIsSplit) {
                            updateProgress(i, corpusFiles.size());
                        }
                    }
                    updateProgress(1.0f, 1.0f);
                    return values;
                }
            };
            locationScanPI.progressProperty().bind(task.progressProperty());
            task.setOnSucceeded(e -> {
                HashMap<String, HashSet<String>> values = task.getValue();
                if (ValidationUtil.isEmpty(values)) {
                    // if no taxonomy found alert the user and keep other tabs disabled
                    logger.info("No solar filters found in headers.");
                    GUIController.showAlert(Alert.AlertType.ERROR, WARNING_NO_SOLAR_FILTERS_FOUND);
                } else {
                    HashMap<String, ObservableList<String>> filtersForComboBoxes = SolarFilters.getFiltersForComboBoxes(values);
                    // set taxonomy, update label
                    corpus.setSolarFiltersForXML(values);
                    corpus.setSolarFilters(filtersForComboBoxes);
                    corpus.setHeaderRead(true);
                    chooseCorpusL.setText(chooseCorpusLabelContent);
                    setResults();
                    setCorpusForAnalysis();
                }
                togglePiAndSetCorpusWrapper(false);
            });
            task.setOnCancelled(e -> togglePiAndSetCorpusWrapper(false));
            task.setOnFailed(e -> togglePiAndSetCorpusWrapper(false));
            final Thread thread = new Thread(task, "task");
            thread.setDaemon(true);
            thread.start();
        }
    }

    /**
     * Validates the corpus and, on success, propagates it to the analysis tab
     * controllers and enables their tabs. The Šolar filter tab is only shown
     * for SOLAR corpora. On validation failure the errors are shown to the user.
     */
    private void setCorpusForAnalysis() {
        if (corpus.validate()) {
            // new statistic, enable tabs...
            stringLevelTabNew2.setDisable(false);
            satNew2Controller.setCorpus(corpus);
            satNew2Controller.init();
            oneWordAnalysisTab.setDisable(false);
            oneWordTabController.setCorpus(corpus);
            oneWordTabController.init();
            characterLevelTab.setDisable(false);
            catController.setCorpus(corpus);
            catController.init();
            wordFormationTab.setDisable(false);
            wordLevelTab.setDisable(false);
            //wfController.setCorpus(corpus);
            //wfController.init();
            wlController.setCorpus(corpus);
            wlController.init();
            if (corpus.getCorpusType() == CorpusType.SOLAR) {
                filterTab.setDisable(false);
                tabPane.getTabs().add(1, filterTab);
                ffsController.setCorpus(corpus);
                ffsController.initFilters();
            } else {
                filterTab.setDisable(true);
                tabPane.getTabs().removeAll(filterTab);
            }
        } else {
            GUIController.showAlert(Alert.AlertType.ERROR, corpus.getValidationErrorsToString());
        }
    }

    /**
     * Opens a directory chooser, starting in the application's working
     * directory when it can be determined.
     */
    private File directoryChooser() {
        DirectoryChooser directoryChooser = new DirectoryChooser();
        // open in the folder where the jar is located if possible
        File workingDir = getWorkingDirectory();
        if (workingDir != null) {
            directoryChooser.setInitialDirectory(workingDir);
        }
        return directoryChooser.showDialog(stage);
    }

    /**
     * Hides GOS related checkbox until needed.
     */
    private void toggleGosChBVisibility() {
        gosUseOrthChB.setVisible(corpus != null && corpus.getCorpusType() != null && corpus.getCorpusType() == CorpusType.GOS);
    }

    /**
     * Reads the title tag from the first corpus file and matches it against the
     * known corpus names to determine the corpus type.
     *
     * @return a user-facing summary (location, file count, corpus name), or
     *         {@code null} when the corpus type could not be recognized
     */
    private String detectCorpusType(Collection<File> corpusFiles, String corpusLocation) {
        // check that we recognize this corpus
        // read first file only, maybe later do all, if toll on resources is acceptable
        File f = corpusFiles.iterator().next();
        String title = XML_processing.readXMLHeaderTag(f.getAbsolutePath(), "title").toLowerCase();
        // check if XML file's title contains any of recognized corpus titles
        corpusType = null;
        if (title.contains(SOLAR.getNameLowerCase())) {
            corpusType = SOLAR;
        } else if (title.contains(GIGAFIDA.getNameLowerCase())) {
            corpusType = GIGAFIDA;
        } else if (title.contains(CCKRES.getNameLowerCase())) {
            corpusType = CCKRES;
        } else if (title.contains(GOS.getNameLowerCase())) {
            corpusType = GOS;
        }
        if (corpusType == null) {
            return null;
        } else {
            corpus.setCorpusType(corpusType);
            StringBuilder sb = new StringBuilder();
            sb.append(corpusLocation)
                    .append("\n")
                    .append(String.format(NOTIFICATION_FOUND_X_FILES, corpusFiles.size()))
                    .append("\n")
                    .append(String.format("Korpus: %s", corpusType.toString()));
            String result = sb.toString();
            logger.debug(result);
            return result;
        }
    }

    public Corpus getCorpus() {
        return corpus;
    }

    public void setCorpus(Corpus corpus) {
        this.corpus = corpus;
    }

    public void setStringLevelTabNew2(Tab stringLevelTabNew2) { this.stringLevelTabNew2 = stringLevelTabNew2; }

    public void setOneWordAnalysisTab(Tab oneWordAnalysisTab) { this.oneWordAnalysisTab = oneWordAnalysisTab; }

    public void setCharacterLevelTab(Tab characterLevelTab) { this.characterLevelTab = characterLevelTab; }

    public void setWordLevelTab(Tab wordLevelTab) {
        this.wordLevelTab = wordLevelTab;
    }

    public void setFilterTab(Tab filterTab) {
        this.filterTab = filterTab;
    }

    public void setFfsController(FiltersForSolar ffsController) {
        this.ffsController = ffsController;
    }

    public void setTabPane(TabPane tabPane) {
        this.tabPane = tabPane;
    }

    public void setSatNew2Controller(StringAnalysisTabNew2 satNew2Controller) { this.satNew2Controller = satNew2Controller; }

    public void setOneWordTabController(OneWordAnalysisTab oneWordTabController) { this.oneWordTabController = oneWordTabController; }

    public void setCatController(CharacterAnalysisTab catController) { this.catController = catController; }

    /*public void setWfController(WordFormationTab wfController) {
        this.wfController = wfController;
    }*/

    public void setWlController(WordLevelTab wlController) {
        this.wlController = wlController;
    }

    public void setWordFormationTab(Tab wordFormationTab) {
        this.wordFormationTab = wordFormationTab;
    }

    public void setHostServices(HostServices hostServices){
        this.hostService = hostServices;
    }
}

View File

@ -0,0 +1,187 @@
package gui;
import static data.Enums.solar.SolarFilters.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import javafx.application.HostServices;
import javafx.scene.control.Hyperlink;
import org.controlsfx.control.CheckComboBox;
import data.Corpus;
import javafx.collections.ListChangeListener;
import javafx.collections.ObservableList;
import javafx.fxml.FXML;
import javafx.scene.control.Label;
import javafx.scene.layout.AnchorPane;
import util.Util;
/**
 * Controller for the Šolar filter tab.
 * <p>
 * Maintains the user's filter selections (region, subject, class, year, school,
 * text type), mirrors them into a summary label on every analysis tab, and
 * pushes the selected filter map to the analysis controllers.
 */
public class FiltersForSolar {
    @FXML
    public AnchorPane solarFiltersTabPane;
    @FXML
    public CheckComboBox<String> solarRegijaCCB;
    @FXML
    public CheckComboBox<String> solarPredmetCCB;
    @FXML
    public CheckComboBox<String> solarRazredCCB;
    @FXML
    public CheckComboBox<String> solarLetoCCB;
    @FXML
    public CheckComboBox<String> solarSolaCCB;
    @FXML
    public CheckComboBox<String> solarVrstaBesedilaCCB;
    @FXML
    public Label selectedFiltersLabel;
    @FXML
    private Hyperlink helpH;

    // filter name -> currently checked values (live check-model lists)
    private HashMap<String, ObservableList<String>> selectedFilters;
    private Corpus corpus;
    private StringAnalysisTabNew2 satNew2Controller;
    private OneWordAnalysisTab oneWordTabController;
    private CharacterAnalysisTab catController;
    //private WordFormationTab wfController;
    private WordLevelTab wlController;
    private HostServices hostService;

    /**
     * Registers a change listener on each filter combo box.
     * Invoked automatically by the FXML loader.
     */
    @SuppressWarnings("unchecked")
    public void initialize() {
        selectedFilters = new HashMap<>();
        // the six listener registrations were copy-pasted; deduplicated into a helper
        registerFilterListener(solarRegijaCCB, REGIJA);
        registerFilterListener(solarPredmetCCB, PREDMET);
        registerFilterListener(solarRazredCCB, RAZRED);
        registerFilterListener(solarLetoCCB, LETO);
        registerFilterListener(solarSolaCCB, SOLA);
        registerFilterListener(solarVrstaBesedilaCCB, TIP);
        helpH.setOnAction(e -> openHelpWebsite());
    }

    /**
     * On every check/uncheck, stores the box's checked items under
     * {@code filterKey} and refreshes the summary label.
     */
    @SuppressWarnings("unchecked")
    private void registerFilterListener(CheckComboBox<String> ccb, String filterKey) {
        ccb.getCheckModel().getCheckedItems().addListener((ListChangeListener) c -> {
            selectedFilters.put(filterKey, ccb.getCheckModel().getCheckedItems());
            updateSolarFilterLabel();
        });
    }

    /**
     * (Re)populates every filter combo box from the corpus's scanned filters.
     */
    public void initFilters() {
        populateFilter(solarRegijaCCB, REGIJA);
        populateFilter(solarPredmetCCB, PREDMET);
        populateFilter(solarRazredCCB, RAZRED);
        populateFilter(solarLetoCCB, LETO);
        populateFilter(solarSolaCCB, SOLA);
        populateFilter(solarVrstaBesedilaCCB, TIP);
    }

    /**
     * Replaces the combo box items with the corpus's values for
     * {@code filterKey}, sorted in natural order.
     * <p>
     * Bug fixes vs. the original: {@code getItems().removeAll()} with no
     * arguments was a no-op ({@code setAll} already replaces the contents), and
     * {@code getItems().sorted()} built a sorted copy and discarded it — the
     * items were never actually sorted. {@code sort(null)} sorts in place.
     */
    private void populateFilter(CheckComboBox<String> ccb, String filterKey) {
        ccb.getItems().setAll(corpus.getSolarFilters().get(filterKey));
        ccb.getItems().sort(null);
    }

    /**
     * Rebuilds the human-readable filter summary, pushes it to all analysis
     * tabs, and forwards the selection (as plain sets) to the analysis
     * controllers.
     */
    private void updateSolarFilterLabel() {
        if (Util.isMapEmpty(selectedFilters)) {
            setSolarFilterLabelText("/");
        } else {
            StringBuilder allFilters = new StringBuilder();
            for (Map.Entry<String, ObservableList<String>> entry : selectedFilters.entrySet()) {
                ArrayList<String> values = new ArrayList<>(entry.getValue());
                if (!values.isEmpty()) {
                    allFilters.append(entry.getKey())
                            .append(": ");
                    for (int i = 0; i < values.size(); i++) {
                        allFilters.append(values.get(i));
                        if (i < values.size() - 1) {
                            // so we won't append a comma after the last element
                            allFilters.append(", ");
                        }
                    }
                    allFilters.append("\n\n");
                }
            }
            setSolarFilterLabelText(allFilters.toString());
        }
        // snapshot the live observable lists into plain sets for the analyzers
        HashMap<String, HashSet<String>> solarFiltersMap = new HashMap<>();
        for (Map.Entry<String, ObservableList<String>> e : selectedFilters.entrySet()) {
            solarFiltersMap.put(e.getKey(), new HashSet<>(e.getValue()));
        }
        satNew2Controller.setSolarFiltersMap(solarFiltersMap);
        oneWordTabController.setSolarFiltersMap(solarFiltersMap);
        catController.setSolarFiltersMap(solarFiltersMap);
        //wfController.setSolarFiltersMap(solarFiltersMap);
        wlController.setSolarFiltersMap(solarFiltersMap);
    }

    /** Opens the help website in the default system browser. */
    private void openHelpWebsite(){
        hostService.showDocument(Messages.HELP_URL);
    }

    /**
     * Sets the filter summary text on this tab and mirrors it to every
     * analysis tab. (Renamed from the misspelled {@code setSOlarFIlterLabelText};
     * the method is private, so no external callers are affected.)
     */
    private void setSolarFilterLabelText(String content) {
        selectedFiltersLabel.setText(content);
        satNew2Controller.setSelectedFiltersLabel(content);
        oneWordTabController.setSelectedFiltersLabel(content);
        catController.setSelectedFiltersLabel(content);
        //wfController.setSelectedFiltersLabel(content);
        wlController.setSelectedFiltersLabel(content);
    }

    public void setCorpus(Corpus corpus) {
        this.corpus = corpus;
    }

    public void setSatNew2Controller(StringAnalysisTabNew2 satNew2Controller) { this.satNew2Controller = satNew2Controller; }

    public void setOneWordTabController(OneWordAnalysisTab oneWordTabController) { this.oneWordTabController = oneWordTabController; }

    public void setCatController(CharacterAnalysisTab catController) { this.catController = catController; }

    /*public void setWfController(WordFormationTab wfController) {
        this.wfController = wfController;
    }*/

    public void setWlController(WordLevelTab wlController) {
        this.wlController = wlController;
    }

    public void setHostServices(HostServices hostServices){
        this.hostService = hostServices;
    }
}

View File

@ -0,0 +1,150 @@
package gui;
import java.io.IOException;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.kordamp.ikonli.fontawesome.FontAwesome;
import org.kordamp.ikonli.javafx.FontIcon;
import data.Corpus;
import javafx.application.Application;
import javafx.fxml.FXML;
import javafx.fxml.FXMLLoader;
import javafx.scene.Parent;
import javafx.scene.Scene;
import javafx.scene.control.Alert;
import javafx.scene.control.Tab;
import javafx.scene.control.TabPane;
import javafx.stage.Stage;
/**
 * JavaFX application entry point and root controller.
 * <p>
 * Loads {@code /GUI.fxml}, then wires all sub-tab controllers together in
 * {@link #initialize()}: it hands each controller the shared {@link Corpus},
 * the tabs it must enable/disable, its sibling controllers, and
 * {@code HostServices}. The controller and {@code Parent} fields are injected
 * reflectively by the FXML loader (nested-controller naming convention:
 * {@code fx:id + "Controller"}), so their names must not be changed.
 */
public class GUIController extends Application {
    public final static Logger logger = LogManager.getLogger(GUIController.class);

    // tabs injected from GUI.fxml
    @FXML
    public Tab StringLevelTabNew2;
    @FXML
    public Tab OneWordAnalysisTab;
    @FXML
    public Tab CharacterLevelTabNew;
    @FXML
    public Tab corpusTab;
    public TabPane tabPane;

    // nested controllers + their root nodes, injected by the FXML loader
    @FXML
    private CharacterAnalysisTab catController;
    @FXML
    private static Parent sat;
    @FXML
    private StringAnalysisTabNew2 satNew2Controller;
    @FXML
    private static Parent satNew2;
    @FXML
    private OneWordAnalysisTab oneWordTabController;
    @FXML
    private static Parent oneWordTab;
    @FXML
    private CorpusTab ctController;
    @FXML
    private Parent ct;
    //@FXML
    //private WordFormationTab wfController;
    @FXML
    private Parent wf;
    @FXML
    private WordLevelTab wlController;
    @FXML
    private Parent wl;
    @FXML
    private FiltersForSolar ffsController;
    @FXML
    private Parent ffs;
    @FXML
    private SelectedFiltersPane sfpController;
    @FXML
    private Parent sfp;
    @FXML
    public Tab stringLevelTab;
    @FXML
    public Tab wordLevelTab;
    /*@FXML
    public Tab wordFormationTab;*/
    @FXML
    public Tab filterTab;

    public Stage stage;
    private Corpus corpus;

    /**
     * Loads the FXML scene graph and shows the primary stage.
     *
     * @throws IOException if GUI.fxml cannot be loaded
     */
    @Override
    public void start(Stage primaryStage) throws IOException {
        Parent root = FXMLLoader.load(getClass().getResource("/GUI.fxml"));
        primaryStage.setTitle("GUI");
        Scene scene = new Scene(root, 800, 600);
        // https://github.com/dicolar/jbootx
        // scene.getStylesheets().add(GUIController.class.getResource("bootstrap3.css").toExternalForm())
        primaryStage.setScene(scene);
        stage = primaryStage;
        primaryStage.show();
    }

    public static void main(String[] args) {
        launch(args);
    }

    /**
     * Wires the shared corpus, tabs, sibling controllers and HostServices into
     * every sub-controller, sets tab icons, and hides the Šolar filter tab
     * until a SOLAR corpus is selected. Invoked by the FXML loader after
     * injection; runs before start(), so it must not touch the stage.
     */
    public void initialize() {
        corpus = new Corpus();
        // corpus tab drives everything else: give it all tabs and controllers
        ctController.setCorpus(corpus);
        ctController.setFilterTab(filterTab);
        ctController.setStringLevelTabNew2(StringLevelTabNew2);
        ctController.setOneWordAnalysisTab(OneWordAnalysisTab);
        ctController.setCharacterLevelTab(CharacterLevelTabNew);
        ctController.setSatNew2Controller(satNew2Controller);
        ctController.setOneWordTabController(oneWordTabController);
        ctController.setCatController(catController);
        //ctController.setWfController(wfController);
        ctController.setWlController(wlController);
        ctController.setTabPane(tabPane);
        ctController.setFfsController(ffsController);
        //ctController.setWordFormationTab(wordFormationTab);
        ctController.setWordLevelTab(wordLevelTab);
        ctController.setHostServices(getHostServices());
        // analysis tabs share the same corpus instance
        satNew2Controller.setCorpus(corpus);
        satNew2Controller.setHostServices(getHostServices());
        oneWordTabController.setCorpus(corpus);
        oneWordTabController.setHostServices(getHostServices());
        catController.setCorpus(corpus);
        catController.setHostServices(getHostServices());
        //wfController.setCorpus(corpus);
        //wfController.setHostServices(getHostServices());
        wlController.setCorpus(corpus);
        wlController.setHostServices(getHostServices());
        // filter tab pushes selections into the analysis controllers
        ffsController.setSatNew2Controller(satNew2Controller);
        ffsController.setOneWordTabController(oneWordTabController);
        ffsController.setCatController(catController);
        //ffsController.setWfController(wfController);
        ffsController.setWlController(wlController);
        ffsController.setHostServices(getHostServices());
        // set tab icons
        corpusTab.setGraphic(new FontIcon(FontAwesome.COG));
        filterTab.setGraphic(new FontIcon(FontAwesome.FILTER));
        // hide filter tab
        tabPane.getTabs().removeAll(filterTab);
    }

    /**
     * Shows a modal alert whose window title is derived from the alert type
     * (see Messages.windowTitles); null header/content are rendered empty.
     */
    static void showAlert(Alert.AlertType alertType, String headerText, String contentText) {
        Alert alert = new Alert(alertType);
        alert.setTitle(Messages.windowTitles.get(alertType));
        alert.setHeaderText(headerText != null ? headerText : "");
        alert.setContentText(contentText != null ? contentText : "");
        alert.showAndWait();
    }

    /** Convenience overload: alert with a header line and no content text. */
    static void showAlert(Alert.AlertType alertType, String headerText) {
        showAlert(alertType, headerText, null);
    }
}

View File

@ -0,0 +1,74 @@
package gui;
import static javafx.scene.control.Alert.AlertType.*;
import java.util.HashMap;
import javafx.scene.control.Alert;
/**
 * Centralized user-facing strings (Slovene) for the GUI: warnings, errors,
 * notifications, labels and tooltips, plus the alert-type -> window-title map.
 */
public class Messages {
    // warnings & errors
    public static final String WARNING_CORPUS_NOT_FOUND = "V izbranem direktoriju ni ustreznih korpusnih datotek.";
    public static final String WARNING_RESULTS_DIR_NOT_VALID = "Za dostop do izbranega direktorija nimate potrebnih pravic.";
    public static final String WARNING_DIFFERING_NGRAM_LEVEL_AND_FILTER_TOKENS = "Izbran nivo ngramov in vpisano št. besed v filtru se ne ujemata.";
    public static final String WARNING_DIFFERING_NGRAM_LEVEL_AND_FILTER_TOKENS_INFO = "Izberite drugo število ali popravite filter.";
    public static final String WARNING_WORD_OR_LEMMA = "Izberite, če želite statistiko izračunati za besede ali leme.";
    public static final String WARNING_ONLY_NUMBERS_ALLOWED = "Prosim vnesite veljavno število.";
    // format args: required n-gram token count, actual msd token count
    public static final String WARNING_MISMATCHED_NGRAM_AND_TOKENS_VALUES = "Število za ngram (%d) in število msd oznak (%d) se morata ujemati.";
    public static final String WARNING_MISSING_STRING_LENGTH = "Dolžina niza mora biti večja od 0. Vstavljena je privzeta vrednost (1).";
    public static final String WARNING_NO_TAXONOMY_FOUND = "Iz korpusnih datotek ni bilo moč razbrati taksonomije. Prosim izberite drugo lokacijo ali korpus.";
    public static final String WARNING_NO_SOLAR_FILTERS_FOUND = "Iz korpusnih datotek ni bilo moč razbrati filtrov. Prosim izberite drugo lokacijo ali korpus.";
    public static final String ERROR_WHILE_EXECUTING = "Prišlo je do napake med izvajanjem.";
    public static final String ERROR_WHILE_SAVING_RESULTS_TO_CSV = "Prišlo je do napake med shranjevanje rezultatov.";

    // missing-field labels
    public static final String MISSING_NGRAM_LEVEL = "N-gram nivo";
    public static final String MISSING_CALCULATE_FOR = "Izračunaj za";
    public static final String MISSING_SKIP = "";
    public static final String MISSING_STRING_LENGTH = "Dolžina niza";
    public static final String MISMATCHED_STRING_LENGTH_AND_MSD_REGEX = "Neujemajoča dolžina niza in regex filter";

    // general notifications - static content/set only once
    public static final String NOTIFICATION_FOUND_X_FILES = "Št. najdenih datotek: %d";
    public static final String NOTIFICATION_ANALYSIS_COMPLETED = "Analiza je zaključena, rezultati so shranjeni.";
    public static final String NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS = "Analiza je zaključena, vendar ni bilo moč izračunati statistike, ki bi ustrezala vsem navedenim pogojem.";
    public static final String RESULTS_PATH_SET_TO_DEFAULT = "Lokacija za shranjevanje rezultatov je nastavljena na lokacijo korpusa.";

    // ongoing notifications - displayed while processing, dynamically changing
    // format args: current file index, total files, file name
    public static final String ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y = "Analiziram datoteko %d od %d (%s)";

    // Labels
    public static final String LABEL_CORPUS_LOCATION_NOT_SET = "Lokacija korpusa ni nastavljena";
    public static final String LABEL_RESULTS_LOCATION_NOT_SET = "Lokacija za shranjevanje rezultatov ni nastavljena";
    public static final String LABEL_RESULTS_CORPUS_TYPE_NOT_SET = "Vrsta korpusa ni nastavljena";
    public static final String LABEL_SCANNING_CORPUS = "Iskanje in analiza korpusnih datotek...";
    public static final String LABEL_SCANNING_SINGLE_FILE_CORPUS = "Analiza vnosa ";
    public static final String COMPLETED = "končano";
    public static final String TOOLTIP_chooseCorpusLocationB = "Izberite mapo v kateri se nahaja korpus. Program izbrano mapo preišče rekurzivno, zato bodite pozorni, da ne izberete mape z več korpusi ali z mnogo datotekami, ki niso del korpusa.";
    public static final String TOOLTIP_readHeaderInfoChB = "Če izberete to opcijo, se bo iz headerjev korpusa prebrala razpoložljiva taksonomija oz. filtri (korpus Šolar). Ta operacija lahko traja dlje časa, sploh če je korpus združen v eni sami datoteki.";

    // Not properly to be here. TODO move somewhere else in future
    public static final String HELP_URL = "http://slovnica.ijs.si/";

    // helper maps
    /**
     * Typical window titles
     * ERROR = "Napaka"
     * WARNING = "Opozorilo"
     * CONFIRMATION = "Potrdilo"
     */
    // made final: this is a fixed lookup table populated once in the static
    // initializer; the reference must never be reassigned
    static final HashMap<Alert.AlertType, String> windowTitles = new HashMap<>();

    static {
        // automatically set window's title
        windowTitles.put(ERROR, "Napaka");
        windowTitles.put(WARNING, "Opozorilo");
        windowTitles.put(CONFIRMATION, "Potrdilo");
    }
}

View File

@ -0,0 +1,389 @@
package gui;
import data.*;
import javafx.application.HostServices;
import javafx.collections.FXCollections;
import javafx.collections.ListChangeListener;
import javafx.collections.ObservableList;
import javafx.concurrent.Task;
import javafx.fxml.FXML;
import javafx.scene.control.*;
import javafx.scene.layout.Pane;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.controlsfx.control.CheckComboBox;
import java.io.File;
import java.io.UnsupportedEncodingException;
import java.util.*;
import java.util.regex.Pattern;
import static alg.XML_processing.readXML;
import static gui.GUIController.showAlert;
import static gui.Messages.*;
@SuppressWarnings("Duplicates")
public class OneWordAnalysisTab {
public final static Logger logger = LogManager.getLogger(OneWordAnalysisTab.class);
@FXML
public Label selectedFiltersLabel;
@FXML
public Label solarFilters;
@FXML
private TextField msdTF;
private ArrayList<Pattern> msd;
private ArrayList<String> msdStrings;
@FXML
private CheckComboBox<String> taxonomyCCB;
private ArrayList<String> taxonomy;
@FXML
private ComboBox<String> calculateForCB;
private CalculateFor calculateFor;
@FXML
private Button computeNgramsB;
@FXML
public ProgressBar ngramProgressBar;
@FXML
public Label progressLabel;
@FXML
private Hyperlink helpH;
private enum MODE {
LETTER,
WORD
}
private MODE currentMode;
private Corpus corpus;
private HashMap<String, HashSet<String>> solarFiltersMap;
private Filter filter;
private boolean useDb;
private HostServices hostService;
private static final ObservableList<String> N_GRAM_COMPUTE_FOR_WORDS = FXCollections.observableArrayList("lema", "različnica", "oblikoskladenjska oznaka");
private static final ObservableList<String> N_GRAM_COMPUTE_FOR_LETTERS = FXCollections.observableArrayList("lema", "različnica");
private static final ObservableList<String> N_GRAM_COMPUTE_FOR_WORDS_ORTH = FXCollections.observableArrayList("različnica");
// TODO: pass observables for taxonomy based on header scan
// after header scan
private ObservableList<String> taxonomyCCBValues;
private CorpusType currentCorpusType;
/**
 * (Re)initializes the tab's controls for the current corpus: mode, the
 * calculate-for combo box, the msd filter text field, the taxonomy
 * check-combo box, and the compute/help actions. Called by CorpusTab whenever
 * a corpus is (re)selected.
 */
public void init() {
    currentMode = MODE.WORD;
    toggleMode(currentMode);
    // calculateForCB
    calculateForCB.valueProperty().addListener((observable, oldValue, newValue) -> {
        calculateFor = CalculateFor.factory(newValue);
        // "{}" added: the original message had no placeholder, so log4j
        // silently dropped the selected value
        logger.info("calculateForCB: {}", calculateFor);
    });
    calculateForCB.getSelectionModel().select(0);
    // msd: parsed from the text field when focus is lost
    msdTF.focusedProperty().addListener((observable, oldValue, newValue) -> {
        if (!newValue) {
            // focus lost
            String value = msdTF.getText();
            logger.info("msdTf: {}", value);
            if (!ValidationUtil.isEmpty(value)) {
                ArrayList<String> msdTmp = new ArrayList<>(Arrays.asList(value.split(" ")));
                // single-word tab: exactly one msd token is expected
                int nOfRequiredMsdTokens = 1;
                if (msdTmp.size() != nOfRequiredMsdTokens) {
                    String msg = String.format(Messages.WARNING_MISMATCHED_NGRAM_AND_TOKENS_VALUES, nOfRequiredMsdTokens, msdTmp.size());
                    logAlert(msg);
                    showAlert(Alert.AlertType.ERROR, msg);
                }
                msd = new ArrayList<>();
                msdStrings = new ArrayList<>();
                for (String msdToken : msdTmp) {
                    msd.add(Pattern.compile(msdToken));
                    msdStrings.add(msdToken);
                }
                logger.info(String.format("msd accepted (%d)", msd.size()));
            } else if (!ValidationUtil.isEmpty(newValue)) {
                // NOTE(review): newValue here is the Boolean focus flag (false in
                // this branch), not the field text — confirm this condition is
                // intended; behavior kept as-is
                msd = new ArrayList<>();
                msdStrings = new ArrayList<>();
            }
        }
    });
    msdTF.setText("");
    msd = new ArrayList<>();
    // taxonomy: only corpora with taxonomy data get a populated combo box
    if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
        // removed no-op removeAll() with no arguments; setAll replaces contents
        taxonomyCCB.getItems().setAll(corpus.getTaxonomy());
        taxonomyCCB.getCheckModel().getCheckedItems().addListener((ListChangeListener<String>) c -> {
            taxonomy = new ArrayList<>();
            ObservableList<String> checkedItems = taxonomyCCB.getCheckModel().getCheckedItems();
            taxonomy.addAll(checkedItems);
            logger.info(String.format("Selected taxonomy: %s", StringUtils.join(checkedItems, ",")));
        });
        taxonomyCCB.getCheckModel().clearChecks();
    } else {
        taxonomyCCB.setDisable(true);
    }
    computeNgramsB.setOnAction(e -> {
        compute();
        logger.info("compute button");
    });
    helpH.setOnAction(e -> openHelpWebsite());
}
/**
 * case a: values for combo boxes can change after a corpus change
 * <ul>
 * <li>different corpus type - reset all fields so no old values remain</li>
 * <li>same corpus type, different subset - keep</li>
 * </ul>
 * <p>
 * case b: values for combo boxes can change after a header scan
 * <ul>
 * <li>at first, fields are populated by corpus type defaults</li>
 * <li>after, with gathered data</li>
 * </ul>
 * <p></p>
 * Defaults for this tab: ngrams: 1, calculateFor: word, msd: -, taxonomy: -,
 * skip: 0, iscvv: false, string length: 1
 */
public void populateFields() {
    // corpus changed if: current one is null (this is first run of the app)
    // or if currentCorpus != gui's corpus
    boolean corpusChanged = currentCorpusType == null
            || currentCorpusType != corpus.getCorpusType();

    // TODO: check for GOS, GIGAFIDA, SOLAR...
    // refresh and:
    // TODO if current value != null && is in new calculateFor ? keep : otherwise reset
    if (calculateFor == null) {
        calculateForCB.getSelectionModel().select(calculateForCB.getItems().get(0));
        calculateFor = CalculateFor.factory(calculateForCB.getItems().get(0));
    }

    if (!filter.hasMsd()) {
        // if current corpus doesn't have msd data, disable this field
        msd = new ArrayList<>();
        msdTF.setText("");
        msdTF.setDisable(true);
        logger.info("no msd data");
    } else {
        if (ValidationUtil.isEmpty(msd)
                || (!ValidationUtil.isEmpty(msd) && corpusChanged)) {
            // msd has not been set previously
            // or msd has been set but the corpus changed -> reset
            msd = new ArrayList<>();
            msdTF.setText("");
            msdTF.setDisable(false);
            logger.info("msd reset");
        } else if (!ValidationUtil.isEmpty(msd) && !corpusChanged) {
            // if msd has been set, but corpus type remained the same, we can keep any set msd value
            msdTF.setText(StringUtils.join(msdStrings, " "));
            msdTF.setDisable(false);
            logger.info("msd kept");
        }
    }

    // TODO: trigger on rescan
    if (currentCorpusType != null && currentCorpusType != corpus.getCorpusType()) {
        // user changed corpus (by type) or by selection & triggered a rescan of headers
        currentCorpusType = corpus.getCorpusType();
        // setTaxonomyIsDirty(false);
    }

    // see if we read taxonomy from headers, otherwise use default values for given corpus
    // (FIX: original computed this twice - once inside the branch above, once here -
    // and carried an empty else branch; one computation is sufficient)
    ObservableList<String> tax = corpus.getTaxonomy();
    taxonomyCCBValues = tax != null ? tax : Taxonomy.getDefaultForComboBox(corpus.getCorpusType());
    // NOTE(review): addAll appends on every call - repeated populateFields() calls may
    // duplicate taxonomy entries; setAll would be idempotent. Confirm before changing.
    taxonomyCCB.getItems().addAll(taxonomyCCBValues);
}
/**
 * Adjusts the "calculate for" choices to the given mode (word- vs letter-level)
 * and applies the GOS-orthography override, which restricts the choices to
 * "različnica" and disables the msd field.
 *
 * @param mode target mode; {@code null} keeps the current mode
 */
public void toggleMode(MODE mode) {
    if (mode == null) {
        mode = currentMode;
    }
    logger.info("mode: ", mode.toString());

    switch (mode) {
        case WORD:
            calculateForCB.getItems().setAll(N_GRAM_COMPUTE_FOR_WORDS);
            break;
        case LETTER:
            calculateForCB.getItems().setAll(N_GRAM_COMPUTE_FOR_LETTERS);
            // letter mode supports only word/lemma; reset any other previous choice
            if (calculateFor != CalculateFor.WORD && calculateFor != CalculateFor.LEMMA) {
                calculateFor = CalculateFor.WORD;
                calculateForCB.getSelectionModel().select("različnica");
            }
            break;
    }

    // override if orth mode, allow only word
    boolean orthMode = corpus.isGosOrthMode();
    if (orthMode) {
        calculateForCB.getItems().setAll(N_GRAM_COMPUTE_FOR_WORDS_ORTH);
    }
    msdTF.setDisable(orthMode);
}
/**
 * Builds the fixed single-word, string-level {@link Filter} for this tab,
 * validates it, and on success hands a {@link StatisticsNew} off to
 * {@link #execute(StatisticsNew)}; otherwise shows the validation error.
 */
private void compute() {
    Filter filter = new Filter();
    filter.setNgramValue(1);
    filter.setCalculateFor(calculateFor);
    filter.setMsd(msd);
    filter.setTaxonomy(Tax.getTaxonomyCodes(taxonomy, corpus.getCorpusType()));
    filter.setAl(AnalysisLevel.STRING_LEVEL);
    filter.setSkipValue(0);
    filter.setIsCvv(false);
    filter.setSolarFilters(solarFiltersMap);
    filter.setStringLength(1);

    String validationError = Validation.validateForStringLevel(filter);
    if (validationError != null) {
        logAlert(validationError);
        showAlert(Alert.AlertType.ERROR, "Prosim izpolnite polja:", validationError);
        return;
    }

    // no errors
    logger.info("Executing: ", filter.toString());
    execute(new StatisticsNew(corpus, filter, useDb));
}
/**
 * Records a user-facing alert message in the application log.
 *
 * @param alert the message that is shown (or about to be shown) to the user
 */
private void logAlert(String alert) {
    logger.info("alert: " + alert);
}
/** Opens the help page ({@code Messages.HELP_URL}) in the system browser. */
private void openHelpWebsite(){
    hostService.showDocument(Messages.HELP_URL);
}
/** @return the corpus currently associated with this tab */
public Corpus getCorpus() {
    return corpus;
}
/**
 * Associates a corpus with this tab. SOLAR corpora expose additional filters,
 * so the placeholder filter label ("/") is shown for them; for every other
 * corpus type the filter label is hidden.
 *
 * @param corpus the corpus to analyze
 */
public void setCorpus(Corpus corpus) {
    this.corpus = corpus;
    setSelectedFiltersLabel(corpus.getCorpusType() == CorpusType.SOLAR ? "/" : null);
}
/**
 * Shows the SOLAR filter labels with the given text, or hides both labels
 * when {@code content} is {@code null}.
 *
 * @param content label text, or {@code null} to hide the filter labels
 */
public void setSelectedFiltersLabel(String content) {
    boolean visible = content != null;
    solarFilters.setVisible(visible);
    selectedFiltersLabel.setVisible(visible);
    if (visible) {
        selectedFiltersLabel.setText(content);
    }
}
/**
 * Runs the analysis described by {@code statistic} on a background daemon
 * {@link Task}: every detected corpus file is fed through {@code readXML},
 * with progress bound to the progress bar and label. On success the result is
 * saved to disk and the user notified; on failure the progress UI is reset
 * and an error alert shown.
 *
 * @param statistic prepared statistics holder (corpus + filter)
 */
private void execute(StatisticsNew statistic) {
    logger.info("Started execution: ", statistic.getFilter());

    Collection<File> corpusFiles = statistic.getCorpus().getDetectedCorpusFiles();
    // (removed unused local "corpusIsSplit")

    final Task<Void> task = new Task<Void>() {
        @SuppressWarnings("Duplicates")
        @Override
        protected Void call() throws Exception {
            long i = 0;
            for (File f : corpusFiles) {
                readXML(f.toString(), statistic);
                i++;
                this.updateProgress(i, corpusFiles.size());
                this.updateMessage(String.format(ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y, i, corpusFiles.size(), f.getName()));
            }
            return null;
        }
    };

    ngramProgressBar.progressProperty().bind(task.progressProperty());
    progressLabel.textProperty().bind(task.messageProperty());

    task.setOnSucceeded(e -> {
        try {
            boolean successullySaved = statistic.saveResultToDisk();
            if (successullySaved) {
                showAlert(Alert.AlertType.INFORMATION, Messages.NOTIFICATION_ANALYSIS_COMPLETED);
            } else {
                showAlert(Alert.AlertType.INFORMATION, Messages.NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS);
            }
        } catch (UnsupportedEncodingException e1) {
            showAlert(Alert.AlertType.ERROR, ERROR_WHILE_SAVING_RESULTS_TO_CSV);
            logger.error("Error while saving", e1);
        }

        ngramProgressBar.progressProperty().unbind();
        ngramProgressBar.setStyle(Settings.FX_ACCENT_OK);
        progressLabel.textProperty().unbind();
        progressLabel.setText("");
    });

    task.setOnFailed(e -> {
        showAlert(Alert.AlertType.ERROR, ERROR_WHILE_EXECUTING);
        // BUGFIX: log the task's actual exception; the original passed the event object,
        // which is not a Throwable and was silently dropped by the logger
        logger.error("Error while executing", task.getException());
        ngramProgressBar.progressProperty().unbind();
        ngramProgressBar.setProgress(0.0);
        ngramProgressBar.setStyle(Settings.FX_ACCENT_NOK);
        progressLabel.textProperty().unbind();
        progressLabel.setText("");
    });

    final Thread thread = new Thread(task, "task");
    thread.setDaemon(true);
    thread.start();
}
/**
 * Stores the SOLAR-specific filter selection (filter name -> selected values),
 * later passed to the {@code Filter} built in {@code compute()}.
 */
public void setSolarFiltersMap(HashMap<String, HashSet<String>> solarFiltersMap) {
    this.solarFiltersMap = solarFiltersMap;
}
/** Injects JavaFX {@code HostServices}, used to open the help website in a browser. */
public void setHostServices(HostServices hostServices){
    this.hostService = hostServices;
}
}

View File

@ -0,0 +1,18 @@
package gui;
import javafx.scene.control.Label;
/**
 * Thin holder for the label that displays the currently selected filters.
 */
public class SelectedFiltersPane {

    public Label selectedFiltersLabel;

    /** @return the label showing the current filter selection */
    public Label getSelectedFiltersLabel() {
        return selectedFiltersLabel;
    }

    /**
     * Replaces the label with a fresh one carrying the given filter description.
     * BUGFIX: the original immediately overwrote the supplied text with the
     * debug leftover {@code setText("test?")}, discarding {@code filters}.
     *
     * @param filters human-readable description of the selected filters
     */
    public void setSelectedFiltersLabel(String filters) {
        this.selectedFiltersLabel = new Label(filters);
    }
}

View File

@ -0,0 +1,511 @@
package gui;
import static alg.XML_processing.*;
import static gui.GUIController.*;
import static gui.Messages.*;
import java.io.File;
import java.io.UnsupportedEncodingException;
import java.util.*;
import java.util.regex.Pattern;
import javafx.application.HostServices;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.controlsfx.control.CheckComboBox;
import data.*;
import javafx.collections.FXCollections;
import javafx.collections.ListChangeListener;
import javafx.collections.ObservableList;
import javafx.concurrent.Task;
import javafx.fxml.FXML;
import javafx.scene.control.*;
import javafx.scene.layout.Pane;
@SuppressWarnings("Duplicates")
public class StringAnalysisTabNew2 {

    public final static Logger logger = LogManager.getLogger(StringAnalysisTabNew2.class);

    @FXML
    public Label selectedFiltersLabel;
    @FXML
    public Label solarFilters;

    // msd input, the compiled patterns parsed from it, and their raw string forms
    @FXML
    private TextField msdTF;
    private ArrayList<Pattern> msd;
    private ArrayList<String> msdStrings;

    @FXML
    private CheckComboBox<String> taxonomyCCB;
    private ArrayList<String> taxonomy;

    @FXML
    private CheckBox calculatecvvCB;
    private boolean calculateCvv;

    @FXML
    private TextField stringLengthTF;
    private Integer stringLength;

    @FXML
    private ComboBox<String> calculateForCB;
    private CalculateFor calculateFor;

    // n-gram size; 0 encodes letter-level analysis ("nivo črk")
    @FXML
    private ComboBox<String> ngramValueCB;
    private Integer ngramValue;

    @FXML
    private ComboBox<String> skipValueCB;
    private Integer skipValue;

    @FXML
    private Pane paneWords;
    @FXML
    private Pane paneLetters;

    @FXML
    private Button computeNgramsB;
    @FXML
    public ProgressBar ngramProgressBar;
    @FXML
    public Label progressLabel;
    @FXML
    private Hyperlink helpH;

    // letter-level vs word-level n-gram analysis
    private enum MODE {
        LETTER,
        WORD
    }

    private MODE currentMode;
    private Corpus corpus;
    // SOLAR-specific filters: filter name -> selected values
    private HashMap<String, HashSet<String>> solarFiltersMap;
    // corpus capabilities (e.g. msd availability), consulted in populateFields()
    private Filter filter;
    private boolean useDb;
    // used to open the help website in the system browser
    private HostServices hostService;

    // selectable "calculate for" values per mode (Slovene UI labels)
    private static final ObservableList<String> N_GRAM_COMPUTE_FOR_WORDS = FXCollections.observableArrayList("lema", "različnica", "oblikoskladenjska oznaka");
    private static final ObservableList<String> N_GRAM_COMPUTE_FOR_LETTERS = FXCollections.observableArrayList("lema", "različnica");
    private static final ObservableList<String> N_GRAM_COMPUTE_FOR_WORDS_ORTH = FXCollections.observableArrayList("različnica");

    // TODO: pass observables for taxonomy based on header scan
    // after header scan
    private ObservableList<String> taxonomyCCBValues;
    // corpus type the fields were last populated for; used to detect corpus changes
    private CorpusType currentCorpusType;

    /**
     * Wires up all controls of this tab (n-gram size, "calculate for", msd,
     * taxonomy, skip, cvv, string length, compute button, help link).
     * Must be called after {@link #setCorpus(Corpus)}.
     */
    public void init() {
        currentMode = MODE.WORD;
        toggleMode(currentMode);

        // ngram value CB
        ngramValueCB.valueProperty().addListener((observable, oldValue, newValue) -> {
            if (newValue.equals("nivo črk")) {
                // letter-level analysis is encoded as ngramValue == 0
                ngramValue = 0;
                toggleMode(MODE.LETTER);
            } else {
                ngramValue = Integer.valueOf(newValue);
                toggleMode(MODE.WORD);
            }

            // skip only on ngrams of more than one word
            if (ngramValue > 1) {
                skipValueCB.setDisable(false);
            } else {
                skipValueCB.getSelectionModel().select(0);
                skipValue = 0;
                skipValueCB.setDisable(true);
            }

            // BUGFIX: original parameterized log had no "{}" placeholder, so the value was dropped
            logger.info("ngramValueCB: {}", ngramValue);
        });
        // set first n-gram value to 2 at index 0
        ngramValueCB.getSelectionModel().select(0); // selected index
        ngramValue = 2; // actual value at that index

        // calculateForCB
        calculateForCB.valueProperty().addListener((observable, oldValue, newValue) -> {
            calculateFor = CalculateFor.factory(newValue);
            logger.info("calculateForCB: {}", calculateFor.toString());
        });
        calculateForCB.getSelectionModel().select(0);

        // msd: parse and validate on focus loss
        msdTF.focusedProperty().addListener((observable, oldValue, newValue) -> {
            if (!newValue) {
                // focus lost
                String value = msdTF.getText();
                logger.info("msdTf: {}", value);

                if (!ValidationUtil.isEmpty(value)) {
                    ArrayList<String> msdTmp = new ArrayList<>(Arrays.asList(value.split(" ")));

                    // letter mode (ngramValue == 0) expects one msd token, word mode one per n-gram slot
                    int nOfRequiredMsdTokens = ngramValue == 0 ? 1 : ngramValue;
                    if (msdTmp.size() != nOfRequiredMsdTokens) {
                        String msg = String.format(Messages.WARNING_MISMATCHED_NGRAM_AND_TOKENS_VALUES, nOfRequiredMsdTokens, msdTmp.size());
                        logAlert(msg);
                        showAlert(Alert.AlertType.ERROR, msg);
                    }

                    msd = new ArrayList<>();
                    msdStrings = new ArrayList<>();
                    for (String msdToken : msdTmp) {
                        msd.add(Pattern.compile(msdToken));
                        msdStrings.add(msdToken);
                    }
                    logger.info(String.format("msd accepted (%d)", msd.size()));
                } else {
                    // BUGFIX: original guard was "else if (!ValidationUtil.isEmpty(newValue))",
                    // which tested the Boolean focus flag (never "empty") and therefore always
                    // ran - i.e. it was effectively this plain else: reset msd state on empty input
                    msd = new ArrayList<>();
                    msdStrings = new ArrayList<>();
                }
            }
        });
        msdTF.setText("");
        msd = new ArrayList<>();

        // taxonomy: only corpora of a taxonomy-aware type get the check-combo enabled
        if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
            // NOTE(review): removeAll() with no arguments is a no-op; setAll below already replaces the items
            taxonomyCCB.getItems().removeAll();
            taxonomyCCB.getItems().setAll(corpus.getTaxonomy());
            taxonomyCCB.getCheckModel().getCheckedItems().addListener((ListChangeListener<String>) c -> {
                taxonomy = new ArrayList<>();
                ObservableList<String> checkedItems = taxonomyCCB.getCheckModel().getCheckedItems();
                taxonomy.addAll(checkedItems);
                logger.info(String.format("Selected taxonomy: %s", StringUtils.join(checkedItems, ",")));
            });
            taxonomyCCB.getCheckModel().clearChecks();
        } else {
            taxonomyCCB.setDisable(true);
        }

        // skip
        skipValueCB.valueProperty().addListener((observable, oldValue, newValue) -> {
            skipValue = Integer.valueOf(newValue);
            logger.info("Skip " + skipValue);
        });
        skipValueCB.getSelectionModel().select(0);
        skipValue = 0;

        // cvv
        calculatecvvCB.selectedProperty().addListener((observable, oldValue, newValue) -> {
            calculateCvv = newValue;
            logger.info("calculate cvv: " + calculateCvv);
        });
        calculatecvvCB.setSelected(false);

        // string length: validated on focus loss
        stringLengthTF.focusedProperty().addListener((observable, oldValue, newValue) -> {
            if (!newValue) {
                // focus lost
                String value = stringLengthTF.getText();
                if (!ValidationUtil.isEmpty(value)) {
                    if (!ValidationUtil.isNumber(value)) {
                        logAlert("stringlengthTf: " + WARNING_ONLY_NUMBERS_ALLOWED);
                        GUIController.showAlert(Alert.AlertType.ERROR, WARNING_ONLY_NUMBERS_ALLOWED);
                    } else {
                        // BUGFIX: parse only after validation; the original parsed unconditionally
                        // and threw NumberFormatException for non-numeric input right after
                        // showing the "numbers only" alert
                        stringLength = Integer.parseInt(value);
                    }
                } else {
                    GUIController.showAlert(Alert.AlertType.ERROR, WARNING_MISSING_STRING_LENGTH);
                    stringLengthTF.setText("1");
                    logAlert(WARNING_MISSING_STRING_LENGTH);
                }
            }
        });

        computeNgramsB.setOnAction(e -> {
            compute();
            logger.info("compute button");
        });
        helpH.setOnAction(e -> openHelpWebsite());
    }

    /**
     * case a: values for combo boxes can change after a corpus change
     * <ul>
     * <li>different corpus type - reset all fields so no old values remain</li>
     * <li>same corpus type, different subset - keep</li>
     * </ul>
     * <p>
     * case b: values for combo boxes can change after a header scan
     * <ul>
     * <li>at first, fields are populated by corpus type defaults</li>
     * <li>after, with gathered data</li>
     * </ul>
     * <p></p>
     * Defaults: ngrams: 1, calculateFor: word, msd: -, taxonomy: -, skip: 0,
     * iscvv: false, string length: 1
     */
    public void populateFields() {
        // corpus changed if: current one is null (this is first run of the app)
        // or if currentCorpus != gui's corpus
        boolean corpusChanged = currentCorpusType == null
                || currentCorpusType != corpus.getCorpusType();

        // keep ngram value if set
        if (ngramValue == null) {
            ngramValueCB.getSelectionModel().select("1");
            ngramValue = 1;
        }

        // TODO: check for GOS, GIGAFIDA, SOLAR...
        // refresh and:
        // TODO if current value != null && is in new calculateFor ? keep : otherwise reset
        if (calculateFor == null) {
            calculateForCB.getSelectionModel().select(calculateForCB.getItems().get(0));
            calculateFor = CalculateFor.factory(calculateForCB.getItems().get(0));
        }

        if (!filter.hasMsd()) {
            // if current corpus doesn't have msd data, disable this field
            msd = new ArrayList<>();
            msdTF.setText("");
            msdTF.setDisable(true);
            logger.info("no msd data");
        } else {
            if (ValidationUtil.isEmpty(msd)
                    || (!ValidationUtil.isEmpty(msd) && corpusChanged)) {
                // msd has not been set previously
                // or msd has been set but the corpus changed -> reset
                msd = new ArrayList<>();
                msdTF.setText("");
                msdTF.setDisable(false);
                logger.info("msd reset");
            } else if (!ValidationUtil.isEmpty(msd) && !corpusChanged) {
                // if msd has been set, but corpus type remained the same, we can keep any set msd value
                msdTF.setText(StringUtils.join(msdStrings, " "));
                msdTF.setDisable(false);
                logger.info("msd kept");
            }
        }

        // TODO: taxonomy: refresh and keep if in new taxonomy, otherwise empty (no selection)

        // keep skip value
        if (skipValue == null) {
            skipValueCB.getSelectionModel().select("0");
            skipValue = 0;
        }

        // keep calculateCvv
        calculatecvvCB.setSelected(calculateCvv);

        // keep string length if set
        if (stringLength != null) {
            stringLengthTF.setText(String.valueOf(stringLength));
        } else {
            stringLengthTF.setText("1");
            stringLength = 1;
        }

        // TODO: trigger on rescan
        if (currentCorpusType != null && currentCorpusType != corpus.getCorpusType()) {
            // user changed corpus (by type) or by selection & triggered a rescan of headers
            currentCorpusType = corpus.getCorpusType();
            // setTaxonomyIsDirty(false);
        }

        // see if we read taxonomy from headers, otherwise use default values for given corpus
        // (FIX: original computed this twice - once inside the branch above, once here -
        // and carried an empty else branch; one computation is sufficient)
        ObservableList<String> tax = corpus.getTaxonomy();
        taxonomyCCBValues = tax != null ? tax : Taxonomy.getDefaultForComboBox(corpus.getCorpusType());
        // NOTE(review): addAll appends on every call - repeated populateFields() calls may
        // duplicate taxonomy entries; setAll would be idempotent. Confirm before changing.
        taxonomyCCB.getItems().addAll(taxonomyCCBValues);
    }

    /**
     * Toggles visibility for panes which hold fields for skipgram value (not applicable when calculating for letters) etc.,
     * sets combobox values to what is applicable ...
     * <p>
     * NOTE(review): {@code currentMode} is never updated here - it stays at the value
     * assigned in {@code init()}; confirm whether it should track {@code mode}.
     *
     * @param mode target mode; {@code null} keeps the current mode
     */
    public void toggleMode(MODE mode) {
        if (mode == null) {
            mode = currentMode;
        }
        logger.info("mode: ", mode.toString());

        if (mode == MODE.WORD) {
            paneWords.setVisible(true);
            paneLetters.setVisible(false);
            calculateForCB.getItems().setAll(N_GRAM_COMPUTE_FOR_WORDS);
        } else if (mode == MODE.LETTER) {
            paneWords.setVisible(false);
            paneLetters.setVisible(true);
            calculateForCB.getItems().setAll(N_GRAM_COMPUTE_FOR_LETTERS);

            // populate with default cvv length value
            if (stringLength == null) {
                stringLengthTF.setText("1");
                stringLength = 1;
            } else {
                stringLengthTF.setText(String.valueOf(stringLength));
            }

            // if calculateFor was selected for something other than a word or a lemma -> reset
            if (!(calculateFor == CalculateFor.WORD || calculateFor == CalculateFor.LEMMA)) {
                // if the user selected something else before selecting ngram for letters, reset that choice
                calculateFor = CalculateFor.WORD;
                calculateForCB.getSelectionModel().select("različnica");
            }
        }

        // override if orth mode, allow only word
        if (corpus.isGosOrthMode()) {
            calculateForCB.getItems().setAll(N_GRAM_COMPUTE_FOR_WORDS_ORTH);
            msdTF.setDisable(true);
        } else {
            msdTF.setDisable(false);
        }
    }

    /**
     * Builds a string-level {@link Filter} from the current control values,
     * validates it, and on success hands a {@link StatisticsNew} off to
     * {@link #execute(StatisticsNew)}; otherwise shows the validation error.
     */
    private void compute() {
        Filter filter = new Filter();
        filter.setNgramValue(ngramValue);
        filter.setCalculateFor(calculateFor);
        filter.setMsd(msd);
        filter.setTaxonomy(Tax.getTaxonomyCodes(taxonomy, corpus.getCorpusType()));
        filter.setAl(AnalysisLevel.STRING_LEVEL);
        filter.setSkipValue(skipValue);
        filter.setIsCvv(calculateCvv);
        filter.setSolarFilters(solarFiltersMap);
        // string length applies only to letter-level analysis (ngramValue == 0)
        if (ngramValue != null && ngramValue == 0) {
            filter.setStringLength(stringLength);
        }

        String message = Validation.validateForStringLevel(filter);
        if (message == null) {
            // no errors
            logger.info("Executing: ", filter.toString());
            StatisticsNew statistic = new StatisticsNew(corpus, filter, useDb);
            execute(statistic);
        } else {
            logAlert(message);
            showAlert(Alert.AlertType.ERROR, "Prosim izpolnite polja:", message);
        }
    }

    /**
     * Records a user-facing alert message in the application log.
     *
     * @param alert the message that is shown (or about to be shown) to the user
     */
    private void logAlert(String alert) {
        logger.info("alert: " + alert);
    }

    /** Opens the help page ({@code Messages.HELP_URL}) in the system browser. */
    private void openHelpWebsite(){
        hostService.showDocument(Messages.HELP_URL);
    }

    /** @return the corpus currently associated with this tab */
    public Corpus getCorpus() {
        return corpus;
    }

    /**
     * Associates a corpus with this tab. SOLAR corpora expose additional
     * filters, so the placeholder filter label ("/") is shown for them only.
     *
     * @param corpus the corpus to analyze
     */
    public void setCorpus(Corpus corpus) {
        this.corpus = corpus;

        if (corpus.getCorpusType() != CorpusType.SOLAR) {
            setSelectedFiltersLabel(null);
        } else {
            setSelectedFiltersLabel("/");
        }
    }

    /**
     * Shows the SOLAR filter labels with the given text, or hides both labels
     * when {@code content} is {@code null}.
     *
     * @param content label text, or {@code null} to hide the filter labels
     */
    public void setSelectedFiltersLabel(String content) {
        if (content != null) {
            solarFilters.setVisible(true);
            selectedFiltersLabel.setVisible(true);
            selectedFiltersLabel.setText(content);
        } else {
            solarFilters.setVisible(false);
            selectedFiltersLabel.setVisible(false);
        }
    }

    /**
     * Runs the analysis described by {@code statistic} on a background daemon
     * {@link Task}: every detected corpus file is fed through {@code readXML},
     * with progress bound to the progress bar and label. On success the result
     * is saved to disk; on failure the progress UI is reset and an error shown.
     *
     * @param statistic prepared statistics holder (corpus + filter)
     */
    private void execute(StatisticsNew statistic) {
        logger.info("Started execution: ", statistic.getFilter());

        Collection<File> corpusFiles = statistic.getCorpus().getDetectedCorpusFiles();
        // (removed unused local "corpusIsSplit")

        final Task<Void> task = new Task<Void>() {
            @SuppressWarnings("Duplicates")
            @Override
            protected Void call() throws Exception {
                long i = 0;
                for (File f : corpusFiles) {
                    readXML(f.toString(), statistic);
                    i++;
                    this.updateProgress(i, corpusFiles.size());
                    this.updateMessage(String.format(ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y, i, corpusFiles.size(), f.getName()));
                }
                return null;
            }
        };

        ngramProgressBar.progressProperty().bind(task.progressProperty());
        progressLabel.textProperty().bind(task.messageProperty());

        task.setOnSucceeded(e -> {
            try {
                boolean successullySaved = statistic.saveResultToDisk();
                if (successullySaved) {
                    showAlert(Alert.AlertType.INFORMATION, Messages.NOTIFICATION_ANALYSIS_COMPLETED);
                } else {
                    showAlert(Alert.AlertType.INFORMATION, Messages.NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS);
                }
            } catch (UnsupportedEncodingException e1) {
                showAlert(Alert.AlertType.ERROR, ERROR_WHILE_SAVING_RESULTS_TO_CSV);
                logger.error("Error while saving", e1);
            }

            ngramProgressBar.progressProperty().unbind();
            ngramProgressBar.setStyle(Settings.FX_ACCENT_OK);
            progressLabel.textProperty().unbind();
            progressLabel.setText("");
        });

        task.setOnFailed(e -> {
            showAlert(Alert.AlertType.ERROR, ERROR_WHILE_EXECUTING);
            // BUGFIX: log the task's actual exception; the original passed the event object,
            // which is not a Throwable and was silently dropped by the logger
            logger.error("Error while executing", task.getException());
            ngramProgressBar.progressProperty().unbind();
            ngramProgressBar.setProgress(0.0);
            ngramProgressBar.setStyle(Settings.FX_ACCENT_NOK);
            progressLabel.textProperty().unbind();
            progressLabel.setText("");
        });

        final Thread thread = new Thread(task, "task");
        thread.setDaemon(true);
        thread.start();
    }

    /**
     * Stores the SOLAR-specific filter selection (filter name -> selected values),
     * later passed to the {@code Filter} built in {@code compute()}.
     */
    public void setSolarFiltersMap(HashMap<String, HashSet<String>> solarFiltersMap) {
        this.solarFiltersMap = solarFiltersMap;
    }

    /** Injects JavaFX {@code HostServices}, used to open the help website in a browser. */
    public void setHostServices(HostServices hostServices){
        this.hostService = hostServices;
    }
}

View File

@ -0,0 +1,77 @@
package gui;
import java.io.File;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang3.math.NumberUtils;
public class ValidationUtil {

    /**
     * Returns true when {@code value} parses as a Java number
     * (delegates to Commons Lang {@link NumberUtils#isCreatable}).
     */
    public static boolean isNumber(String value) {
        return NumberUtils.isCreatable(value);
    }

    /**
     * Checks if an object is empty or null. Null part is especially important,
     * since Java's built-in isEmpty() methods don't check for this condition
     * and throw a nullPointerException as a result.
     * <p>
     * Supported structures:
     * <ul>
     * <li>String: empty if null or length is zero</li>
     * <li>List: empty if null or size() == 0</li>
     * <li>Map: empty if null or if it contains no keys, or if all keys map to an empty value </li>
     * </ul>
     * Any other non-null object (including e.g. a Set) is considered non-empty.
     */
    public static boolean isEmpty(Object o) {
        if (o == null) {
            return true;
        }
        if (o instanceof String) {
            return ((String) o).isEmpty();
        }
        if (o instanceof List) {
            return ((List<?>) o).isEmpty();
        }
        if (o instanceof Map) {
            Map<?, ?> map = (Map<?, ?>) o;
            if (map.isEmpty()) {
                return true;
            }
            for (Object val : map.values()) {
                if (!isEmpty(val)) {
                    // if map contains any value that isn't empty, the map isn't considered empty
                    return false;
                }
            }
            // BUGFIX: per the javadoc above, a map whose keys all map to empty values is
            // empty; the original fell through here and returned false instead
            return true;
        }
        return false;
    }

    /** Negation of {@link #isEmpty(Object)}. */
    public static boolean isNotEmpty(Object o) {
        return !isEmpty(o);
    }

    /**
     * Checks whether a given File is a folder for which we have read and write permission
     */
    public static boolean isValidDirectory(File f) {
        return f.isDirectory() && f.canRead() && f.canWrite();
    }

    /**
     * Checks whether a given File is a folder for which we have read permission
     */
    public static boolean isReadableDirectory(File f) {
        return f.isDirectory() && f.canRead();
    }
}

View File

@ -0,0 +1,208 @@
package gui;
import static alg.XML_processing.*;
import static gui.GUIController.*;
import static gui.Messages.*;
import java.io.File;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import javafx.application.HostServices;
import javafx.scene.control.*;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.controlsfx.control.CheckComboBox;
import data.*;
import javafx.collections.ListChangeListener;
import javafx.collections.ObservableList;
import javafx.concurrent.Task;
import javafx.fxml.FXML;
import javafx.scene.layout.AnchorPane;
@SuppressWarnings("Duplicates")
public class WordFormationTab {

    public final static Logger logger = LogManager.getLogger(WordFormationTab.class);

    public AnchorPane wordAnalysisTabPane;

    @FXML
    public Label selectedFiltersLabel;
    @FXML
    public Label solarFilters;

    // taxonomy multi-select control and the user's current selection
    @FXML
    private CheckComboBox<String> taxonomyCCB;
    private ArrayList<String> taxonomy;

    @FXML
    private Button computeB;
    @FXML
    public ProgressBar ngramProgressBar;
    @FXML
    public Label progressLabel;
    @FXML
    private Hyperlink helpH;

    // corpus under analysis
    private Corpus corpus;
    // SOLAR-specific filters: filter name -> selected values
    private HashMap<String, HashSet<String>> solarFiltersMap;
    // used to open the help website in the system browser
    private HostServices hostService;

    // after header scan
    private ObservableList<String> taxonomyCCBValues;
    // corpus type the fields were last populated for
    private CorpusType currentCorpusType;

    // whether results go through the database-backed StatisticsNew path
    private boolean useDb;
/**
 * Wires up this tab's controls: the taxonomy check-combo (enabled only for
 * taxonomy-aware corpus types), the compute button and the help link.
 * Must be called after the corpus has been set.
 */
public void init() {
    // taxonomy
    if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
        taxonomyCCB.getItems().removeAll();
        taxonomyCCB.getItems().setAll(corpus.getTaxonomy());
        taxonomyCCB.getCheckModel().getCheckedItems().addListener((ListChangeListener<String>) change -> {
            ObservableList<String> checkedItems = taxonomyCCB.getCheckModel().getCheckedItems();
            taxonomy = new ArrayList<>(checkedItems);
            logger.info(String.format("Selected taxonomy: %s", StringUtils.join(checkedItems, ",")));
        });
        taxonomyCCB.getCheckModel().clearChecks();
    } else {
        taxonomyCCB.setDisable(true);
    }

    computeB.setOnAction(event -> {
        compute();
        logger.info("compute button");
    });
    helpH.setOnAction(event -> openHelpWebsite());
}
/**
 * Builds the fixed filter for word-formation analysis (unigrams over
 * morphosyntactic properties at string level), validates it, and on success
 * hands a {@link StatisticsNew} off to {@code execute}; otherwise shows the
 * validation error to the user.
 */
private void compute() {
    Filter filter = new Filter();
    filter.setNgramValue(1);
    filter.setCalculateFor(CalculateFor.MORPHOSYNTACTIC_PROPERTY);
    filter.setTaxonomy(Tax.getTaxonomyCodes(taxonomy, corpus.getCorpusType()));
    filter.setAl(AnalysisLevel.STRING_LEVEL);
    filter.setSkipValue(0);
    filter.setMsd(new ArrayList<>());
    filter.setIsCvv(false);
    filter.setSolarFilters(solarFiltersMap);

    String validationError = Validation.validateForStringLevel(filter);
    if (validationError != null) {
        logAlert(validationError);
        showAlert(Alert.AlertType.ERROR, "Prosim izpolnite polja:", validationError);
        return;
    }

    // no errors
    logger.info("Executing: ", filter.toString());
    execute(new StatisticsNew(corpus, filter, useDb));
}
/** Opens the help page ({@code Messages.HELP_URL}) in the system browser. */
private void openHelpWebsite(){
    hostService.showDocument(Messages.HELP_URL);
}
private void execute(StatisticsNew statistic) {
logger.info("Started execution: ", statistic.getFilter());
Collection<File> corpusFiles = statistic.getCorpus().getDetectedCorpusFiles();
final Task<Void> task = new Task<Void>() {
@SuppressWarnings("Duplicates")
@Override
protected Void call() throws Exception {
long i = 0;
for (File f : corpusFiles) {
readXML(f.toString(), statistic);
i++;
this.updateProgress(i, corpusFiles.size());
this.updateMessage(String.format(ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y, i, corpusFiles.size(), f.getName()));
}
return null;
}
};
ngramProgressBar.progressProperty().bind(task.progressProperty());
progressLabel.textProperty().bind(task.messageProperty());
task.setOnSucceeded(e -> {
try {
// first, we have to recalculate all occurrences to detailed statistics
boolean successullySaved = statistic.recalculateAndSaveResultToDisk();
if (successullySaved) {
showAlert(Alert.AlertType.INFORMATION, Messages.NOTIFICATION_ANALYSIS_COMPLETED);
} else {
showAlert(Alert.AlertType.INFORMATION, Messages.NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS);
}
} catch (UnsupportedEncodingException e1) {
showAlert(Alert.AlertType.ERROR, ERROR_WHILE_SAVING_RESULTS_TO_CSV);
logger.error("Error while saving", e1);
}