Project copied
This commit is contained in:
commit
a18e52a599
160
.gitignore
vendored
Normal file
160
.gitignore
vendored
Normal file
@ -0,0 +1,160 @@
|
||||
# Created by .ignore support plugin (hsz.mobi)
|
||||
### Maven template
|
||||
target/
|
||||
pom.xml.tag
|
||||
pom.xml.releaseBackup
|
||||
pom.xml.versionsBackup
|
||||
pom.xml.next
|
||||
release.properties
|
||||
dependency-reduced-pom.xml
|
||||
buildNumber.properties
|
||||
.mvn/timing.properties
|
||||
|
||||
# Avoid ignoring Maven wrapper jar file (.jar files are usually ignored)
|
||||
!/.mvn/wrapper/maven-wrapper.jar
|
||||
### JetBrains template
|
||||
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
|
||||
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
|
||||
|
||||
# User-specific stuff:
|
||||
.idea/**/workspace.xml
|
||||
.idea/**/tasks.xml
|
||||
.idea/dictionaries
|
||||
.idea/
|
||||
|
||||
# Sensitive or high-churn files:
|
||||
.idea/**/dataSources/
|
||||
.idea/**/dataSources.ids
|
||||
.idea/**/dataSources.xml
|
||||
.idea/**/dataSources.local.xml
|
||||
.idea/**/sqlDataSources.xml
|
||||
.idea/**/dynamic.xml
|
||||
.idea/**/uiDesigner.xml
|
||||
|
||||
# Gradle:
|
||||
.idea/**/gradle.xml
|
||||
.idea/**/libraries
|
||||
|
||||
# Mongo Explorer plugin:
|
||||
.idea/**/mongoSettings.xml
|
||||
|
||||
## File-based project format:
|
||||
*.iws
|
||||
|
||||
## Plugin-specific files:
|
||||
|
||||
# IntelliJ
|
||||
/out/
|
||||
|
||||
# mpeltonen/sbt-idea plugin
|
||||
.idea_modules/
|
||||
|
||||
# JIRA plugin
|
||||
atlassian-ide-plugin.xml
|
||||
|
||||
# Crashlytics plugin (for Android Studio and IntelliJ)
|
||||
com_crashlytics_export_strings.xml
|
||||
crashlytics.properties
|
||||
crashlytics-build.properties
|
||||
fabric.properties
|
||||
### Java template
|
||||
# Compiled class file
|
||||
# Log file
|
||||
*.log
|
||||
|
||||
# BlueJ files
|
||||
*.ctxt
|
||||
|
||||
# Mobile Tools for Java (J2ME)
|
||||
.mtj.tmp/
|
||||
|
||||
# Package Files #
|
||||
*.war
|
||||
*.ear
|
||||
*.zip
|
||||
*.tar.gz
|
||||
*.rar
|
||||
|
||||
# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
|
||||
hs_err_pid*
|
||||
### Eclipse template
|
||||
|
||||
.metadata
|
||||
bin/
|
||||
tmp/
|
||||
*.tmp
|
||||
*.bak
|
||||
*.swp
|
||||
*~.nib
|
||||
local.properties
|
||||
.settings/
|
||||
.loadpath
|
||||
.recommenders
|
||||
|
||||
# Eclipse Core
|
||||
.project
|
||||
|
||||
# External tool builders
|
||||
.externalToolBuilders/
|
||||
|
||||
# Locally stored "Eclipse launch configurations"
|
||||
*.launch
|
||||
|
||||
# PyDev specific (Python IDE for Eclipse)
|
||||
*.pydevproject
|
||||
|
||||
# CDT-specific (C/C++ Development Tooling)
|
||||
.cproject
|
||||
|
||||
# JDT-specific (Eclipse Java Development Tools)
|
||||
.classpath
|
||||
|
||||
# Java annotation processor (APT)
|
||||
.factorypath
|
||||
|
||||
# PDT-specific (PHP Development Tools)
|
||||
.buildpath
|
||||
|
||||
# sbteclipse plugin
|
||||
.target
|
||||
|
||||
# Tern plugin
|
||||
.tern-project
|
||||
|
||||
# TeXlipse plugin
|
||||
.texlipse
|
||||
|
||||
# STS (Spring Tool Suite)
|
||||
.springBeans
|
||||
|
||||
# Code Recommenders
|
||||
.recommenders/
|
||||
|
||||
# Scala IDE specific (Scala & Java development for Eclipse)
|
||||
.cache-main
|
||||
.scala_dependencies
|
||||
.worksheet
|
||||
|
||||
|
||||
|
||||
|
||||
### Windows ###
|
||||
# Windows thumbnail cache files
|
||||
Thumbs.db
|
||||
ehthumbs.db
|
||||
ehthumbs_vista.db
|
||||
|
||||
# Folder config file
|
||||
Desktop.ini
|
||||
|
||||
# Recycle Bin used on file shares
|
||||
$RECYCLE.BIN/
|
||||
|
||||
# Windows Installer files
|
||||
*.cab
|
||||
*.msi
|
||||
*.msm
|
||||
*.msp
|
||||
|
||||
# Windows shortcuts
|
||||
*.lnk
|
28
Corpus Analyzer.iml
Normal file
28
Corpus Analyzer.iml
Normal file
@ -0,0 +1,28 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule="true" type="JAVA_MODULE" version="4">
|
||||
<component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_8">
|
||||
<output url="file://$MODULE_DIR$/target/classes" />
|
||||
<output-test url="file://$MODULE_DIR$/target/test-classes" />
|
||||
<content url="file://$MODULE_DIR$">
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/main/resources" type="java-resource" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" />
|
||||
<excludeFolder url="file://$MODULE_DIR$/target" />
|
||||
</content>
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
<orderEntry type="library" name="Maven: commons-io:commons-io:2.5" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.commons:commons-lang3:3.6" level="project" />
|
||||
<orderEntry type="library" name="Maven: com.googlecode.json-simple:json-simple:1.1.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: junit:junit:4.10" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.hamcrest:hamcrest-core:1.1" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.commons:commons-csv:1.4" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.controlsfx:controlsfx:8.40.13" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.rocksdb:rocksdbjni:5.7.3" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-api:2.9.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-core:2.9.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.kordamp.ikonli:ikonli-fontawesome-pack:1.9.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.kordamp.ikonli:ikonli-core:1.9.0" level="project" />
|
||||
<orderEntry type="library" name="Maven: org.kordamp.ikonli:ikonli-javafx:1.9.0" level="project" />
|
||||
</component>
|
||||
</module>
|
122
pom.xml
Normal file
122
pom.xml
Normal file
@ -0,0 +1,122 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<groupId>thesis</groupId>
|
||||
<artifactId>corpus-analyzer</artifactId>
|
||||
<version>1.2</version>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>commons-io</groupId>
|
||||
<artifactId>commons-io</artifactId>
|
||||
<version>2.5</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-lang3</artifactId>
|
||||
<version>3.6</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.googlecode.json-simple</groupId>
|
||||
<artifactId>json-simple</artifactId>
|
||||
<version>1.1.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-csv</artifactId>
|
||||
<version>1.4</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.controlsfx</groupId>
|
||||
<artifactId>controlsfx</artifactId>
|
||||
<version>8.40.13</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.rocksdb</groupId>
|
||||
<artifactId>rocksdbjni</artifactId>
|
||||
<version>5.7.3</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.logging.log4j</groupId>
|
||||
<artifactId>log4j-api</artifactId>
|
||||
<version>2.9.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.logging.log4j</groupId>
|
||||
<artifactId>log4j-core</artifactId>
|
||||
<version>2.9.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.kordamp.ikonli</groupId>
|
||||
<artifactId>ikonli-fontawesome-pack</artifactId>
|
||||
<version>1.9.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.kordamp.ikonli</groupId>
|
||||
<artifactId>ikonli-javafx</artifactId>
|
||||
<version>1.9.0</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<!-- packages dependencies into the jar -->
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-assembly-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>package</phase>
|
||||
<goals>
|
||||
<goal>single</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<archive>
|
||||
<manifest>
|
||||
<mainClass>gui.GUIController</mainClass>
|
||||
</manifest>
|
||||
</archive>
|
||||
<descriptorRefs>
|
||||
<descriptorRef>jar-with-dependencies</descriptorRef>
|
||||
</descriptorRefs>
|
||||
<appendAssemblyId>false</appendAssemblyId>
|
||||
<outputDirectory>artifact</outputDirectory>
|
||||
<finalName>Corpus_Analyzer_${project.version}</finalName>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<!-- JavaFX -->
|
||||
<groupId>com.zenjava</groupId>
|
||||
<artifactId>javafx-maven-plugin</artifactId>
|
||||
<version>8.6.0</version>
|
||||
<configuration>
|
||||
<mainClass>gui.GUIController</mainClass>
|
||||
<verbose>true</verbose>
|
||||
</configuration>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>create-jfxjar</id>
|
||||
<phase>package</phase>
|
||||
<goals>
|
||||
<goal>build-jar</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
<configuration>
|
||||
<source>1.8</source>
|
||||
<target>1.8</target>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
</project>
|
3
src/main/java/META-INF/MANIFEST.MF
Normal file
3
src/main/java/META-INF/MANIFEST.MF
Normal file
@ -0,0 +1,3 @@
|
||||
Manifest-Version: 1.0
|
||||
Main-Class: gui.GUIController
|
||||
|
15
src/main/java/alg/Common.java
Normal file
15
src/main/java/alg/Common.java
Normal file
@ -0,0 +1,15 @@
|
||||
package alg;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
|
||||
/**
 * Small shared helpers for the analysis algorithms.
 */
public class Common {
    /**
     * Increments the occurrence counter stored under key {@code o} in {@code map},
     * inserting a fresh counter initialized to 1 on the first occurrence.
     *
     * <p>Note: the type parameter {@code V} is unused but kept for source
     * compatibility with existing callers that may supply explicit type arguments.
     *
     * @param map counter map; with a {@link java.util.concurrent.ConcurrentMap}
     *            the insert-or-increment is safe under concurrent callers
     * @param o   key whose counter should be bumped
     */
    public static <K, V> void updateMap(Map<K, AtomicLong> map, K o) {
        // putIfAbsent returns null when our new AtomicLong(1) was inserted
        // (first occurrence), otherwise it returns the existing counter.
        AtomicLong r = map.putIfAbsent(o, new AtomicLong(1));

        // Key already present: bump the counter putIfAbsent handed back.
        // (The original re-fetched via map.get(o) — a redundant second lookup
        // that could also NPE if the entry were removed concurrently.)
        if (r != null)
            r.incrementAndGet();
    }
}
|
794
src/main/java/alg/XML_processing.java
Normal file
794
src/main/java/alg/XML_processing.java
Normal file
@ -0,0 +1,794 @@
|
||||
package alg;
|
||||
|
||||
import static data.Enums.solar.SolarFilters.*;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.ForkJoinPool;
|
||||
|
||||
import javax.xml.namespace.QName;
|
||||
import javax.xml.stream.XMLEventReader;
|
||||
import javax.xml.stream.XMLInputFactory;
|
||||
import javax.xml.stream.XMLStreamConstants;
|
||||
import javax.xml.stream.XMLStreamException;
|
||||
import javax.xml.stream.events.*;
|
||||
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
|
||||
import data.*;
|
||||
import gui.ValidationUtil;
|
||||
|
||||
public class XML_processing {
|
||||
public final static org.apache.logging.log4j.Logger logger = LogManager.getLogger(XML_processing.class);
|
||||
|
||||
// public static void processCorpus(Statistics stats) {
|
||||
// // we can preset the list's size, so there won't be a need to resize it
|
||||
// List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT);
|
||||
//
|
||||
// int i = 0;
|
||||
// for (File f : Settings.corpus) {
|
||||
// i++;
|
||||
// readXML(f.toString(), stats);
|
||||
// }
|
||||
// }
|
||||
|
||||
// public static void readXML(String path, Statistics stats) {
|
||||
// if (stats.getCorpusType() == CorpusType.GIGAFIDA) {
|
||||
// readXMLGigafida(path, stats);
|
||||
// } else if (stats.getCorpusType() == CorpusType.GOS) {
|
||||
// readXMLGos(path, stats);
|
||||
// } else if (stats.getCorpusType() == CorpusType.SOLAR) {
|
||||
// readXMLSolar(path, stats);
|
||||
// }
|
||||
// }
|
||||
|
||||
public static void readXML(String path, StatisticsNew stats) {
|
||||
if (stats.getCorpus().getCorpusType() == CorpusType.GIGAFIDA
|
||||
|| stats.getCorpus().getCorpusType() == CorpusType.CCKRES) {
|
||||
readXMLGigafida(path, stats);
|
||||
} else if (stats.getCorpus().getCorpusType() == CorpusType.GOS) {
|
||||
readXMLGos(path, stats);
|
||||
} else if (stats.getCorpus().getCorpusType() == CorpusType.SOLAR) {
|
||||
readXMLSolar(path, stats);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads and returns the value of a passed header tag or an empty string.
|
||||
* E.g. title tag, for discerning the corpus' type.
|
||||
* Notice: returns only the value of the first occurrence of a given tag name.
|
||||
*/
|
||||
public static String readXMLHeaderTag(String path, String tag) {
|
||||
XMLInputFactory factory = XMLInputFactory.newInstance();
|
||||
XMLEventReader eventReader = null;
|
||||
|
||||
try {
|
||||
eventReader = factory.createXMLEventReader(new FileInputStream(path));
|
||||
while (eventReader.hasNext()) {
|
||||
XMLEvent xmlEvent = eventReader.nextEvent();
|
||||
if (xmlEvent.isStartElement()) {
|
||||
StartElement startElement = xmlEvent.asStartElement();
|
||||
String var = startElement.getName().getLocalPart();
|
||||
|
||||
if (var.equalsIgnoreCase(tag)) {
|
||||
return eventReader.nextEvent().asCharacters().getData();
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (FileNotFoundException | XMLStreamException e) {
|
||||
e.printStackTrace();
|
||||
} finally {
|
||||
if (eventReader != null) {
|
||||
try {
|
||||
eventReader.close();
|
||||
} catch (XMLStreamException e) {
|
||||
logger.error("closing stream", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
private static void fj(List<Sentence> corpus, StatisticsNew stats) {
|
||||
ForkJoinPool pool = new ForkJoinPool();
|
||||
|
||||
if (stats.getFilter().getAl() == AnalysisLevel.STRING_LEVEL) {
|
||||
alg.ngram.ForkJoin wc = new alg.ngram.ForkJoin(corpus, stats);
|
||||
pool.invoke(wc);
|
||||
} else if (stats.getFilter().getAl() == AnalysisLevel.WORD_LEVEL) {
|
||||
alg.word.ForkJoin wc = new alg.word.ForkJoin(corpus, stats);
|
||||
pool.invoke(wc);
|
||||
} else {
|
||||
// TODO:
|
||||
// alg.inflectedJOS.ForkJoin wc = new alg.inflectedJOS.ForkJoin(corpus, stats);
|
||||
// pool.invoke(wc);
|
||||
}
|
||||
}
|
||||
|
||||
// public static void readXMLGos(String path, Statistics stats) {
|
||||
// boolean in_word = false;
|
||||
// String taksonomija = "";
|
||||
// String lemma = "";
|
||||
// String msd = "";
|
||||
// String type = stats.isGosOrthMode() ? "orth" : "norm"; // orth & norm
|
||||
//
|
||||
// List<Word> stavek = new ArrayList<>();
|
||||
// List<Sentence> corpus = new ArrayList<>();
|
||||
// String sentenceDelimiter = "seg";
|
||||
// String taxonomyPrefix = "gos.";
|
||||
//
|
||||
// try {
|
||||
// XMLInputFactory factory = XMLInputFactory.newInstance();
|
||||
// XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path));
|
||||
//
|
||||
// while (eventReader.hasNext()) {
|
||||
// XMLEvent event = eventReader.nextEvent();
|
||||
//
|
||||
// switch (event.getEventType()) {
|
||||
// case XMLStreamConstants.START_ELEMENT:
|
||||
//
|
||||
// StartElement startElement = event.asStartElement();
|
||||
// String qName = startElement.getName().getLocalPart();
|
||||
//
|
||||
// // "word" node
|
||||
// if (qName.equals("w")) {
|
||||
// in_word = true;
|
||||
//
|
||||
// if (type.equals("norm")) {
|
||||
// // make sure we're looking at <w lemma...> and not <w type...>
|
||||
// Iterator var = startElement.getAttributes();
|
||||
// ArrayList<Object> attributes = new ArrayList<>();
|
||||
// while (var.hasNext()) {
|
||||
// attributes.add(var.next());
|
||||
// }
|
||||
//
|
||||
// if (attributes.contains("msd")) {
|
||||
// msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
|
||||
// } else {
|
||||
// msd = null;
|
||||
// }
|
||||
//
|
||||
// if (attributes.contains("lemma")) {
|
||||
// lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// // taxonomy node
|
||||
// else if (qName.equalsIgnoreCase("catRef")) {
|
||||
// // there are some term nodes at the beginning that are of no interest to us
|
||||
// // they differ by not having the attribute "ref", so test will equal null
|
||||
// Attribute test = startElement.getAttributeByName(QName.valueOf("target"));
|
||||
//
|
||||
// if (test != null) {
|
||||
// // keep only taxonomy properties
|
||||
// taksonomija = String.valueOf(test.getValue()).replace(taxonomyPrefix, "");
|
||||
// }
|
||||
// } else if (qName.equalsIgnoreCase("div")) {
|
||||
// type = String.valueOf(startElement.getAttributeByName(QName.valueOf("type")).getValue());
|
||||
//
|
||||
// }
|
||||
// break;
|
||||
//
|
||||
// case XMLStreamConstants.CHARACTERS:
|
||||
// Characters characters = event.asCharacters();
|
||||
//
|
||||
// // "word" node value
|
||||
// if (in_word) {
|
||||
// if (type.equals("norm") && msd != null) {
|
||||
// stavek.add(new Word(characters.getData(), lemma, msd));
|
||||
// } else {
|
||||
// stavek.add(new Word(characters.getData()));
|
||||
// }
|
||||
//
|
||||
// in_word = false;
|
||||
// }
|
||||
// break;
|
||||
//
|
||||
// case XMLStreamConstants.END_ELEMENT:
|
||||
// EndElement endElement = event.asEndElement();
|
||||
//
|
||||
// // parser reached end of the current sentence
|
||||
// if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
|
||||
// // add sentence to corpus
|
||||
// corpus.add(new Sentence(stavek, taksonomija, type));
|
||||
// // and start a new one
|
||||
// stavek = new ArrayList<>();
|
||||
//
|
||||
// /* Invoke Fork-Join when we reach maximum limit of
|
||||
// * sentences (because we can't read everything to
|
||||
// * memory) or we reach the end of the file.
|
||||
// */
|
||||
// if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
|
||||
// fj(corpus, stats);
|
||||
// // empty the current corpus, since we don't need
|
||||
// // the data anymore
|
||||
// corpus.clear();
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// // backup
|
||||
// if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
|
||||
// fj(corpus, stats);
|
||||
// corpus.clear();
|
||||
// }
|
||||
//
|
||||
// break;
|
||||
// }
|
||||
// }
|
||||
// } catch (FileNotFoundException | XMLStreamException e) {
|
||||
// e.printStackTrace();
|
||||
// }
|
||||
// }
|
||||
|
||||
@SuppressWarnings("unused")
/**
 * Streams a SOLAR corpus XML file, collecting words ("w3" elements) into
 * sentences (terminated by a "c3" element containing "."), and hands batches
 * of sentences to {@link #fj} for analysis. Sentences are only kept while the
 * enclosing head block passed the user-set filters ({@code includeThisBlock}).
 *
 * State machine per event:
 *  - START "w3":   remember msd/lemma, mark that the next character data is a word
 *  - START "c3":   punctuation; "." ends the current sentence
 *  - START filter tag (sola/predmet/...): record its text into the current head block
 *  - START "head": begin collecting a new head block
 *  - END "head":   validate collected head block against the user filters
 *  - END "body":   reset the filter flag for the next block
 *  - END "korpus": flush any remaining sentences (end of file backup)
 */
public static void readXMLSolar(String path, StatisticsNew stats) {
    boolean in_word = false;
    String lemma = "";
    String msd = "";

    // current sentence under construction / batch of finished sentences
    List<Word> stavek = new ArrayList<>();
    List<Sentence> corpus = new ArrayList<>();

    // used for filter
    Set<String> headTags = new HashSet<>(Arrays.asList("sola", "predmet", "razred", "regija", "tip", "leto"));
    Map<String, String> headBlock = null;
    boolean includeThisBlock = false;

    try {
        XMLInputFactory factory = XMLInputFactory.newInstance();
        XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path));

        while (eventReader.hasNext()) {
            XMLEvent event = eventReader.nextEvent();

            switch (event.getEventType()) {
            case XMLStreamConstants.START_ELEMENT:

                StartElement startElement = event.asStartElement();
                // System.out.println(String.format("%s", startElement.toString()));
                String qName = startElement.getName().getLocalPart();

                // "word" node: stash its attributes; the word text itself
                // arrives with the following CHARACTERS event
                if (qName.equals("w3")) {
                    in_word = true;

                    msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
                    lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
                } else if (qName.equals("c3")) {
                    // punctuation node; consume its text immediately
                    String c3Content = eventReader.nextEvent().asCharacters().getData();

                    // a full stop ends the sentence — keep it only if the
                    // surrounding head block passed the filters
                    if (c3Content.equals(".") && includeThisBlock) {
                        // add sentence to corpus
                        corpus.add(new Sentence(stavek));
                        // and start a new one
                        stavek = new ArrayList<>();

                        /* Invoke Fork-Join when we reach maximum limit of
                         * sentences (because we can't read everything to
                         * memory) or we reach the end of the file.
                         */
                        if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
                            fj(corpus, stats);
                            // empty the current corpus, since we don't need
                            // the data anymore
                            corpus.clear();
                        }
                    }
                } else if (headTags.contains(qName)) {
                    // filter-relevant metadata tag inside a head block
                    // NOTE(review): headBlock is null until a <head> start
                    // element has been seen; a filter tag outside a head block
                    // would NPE here — confirm the SOLAR schema guarantees
                    // these tags only occur nested inside <head>.
                    String tagContent = eventReader.nextEvent().asCharacters().getData();
                    headBlock.put(qName, tagContent);
                } else if (qName.equals("head")) {
                    // start collecting a fresh head (metadata) block
                    headBlock = new HashMap<>();
                }

                break;

            case XMLStreamConstants.CHARACTERS:
                Characters characters = event.asCharacters();

                // "word" node value: complete the word announced by <w3>
                if (in_word) {
                    stavek.add(new Word(characters.getData(), lemma, msd));
                    in_word = false;
                }
                break;

            case XMLStreamConstants.END_ELEMENT:
                EndElement endElement = event.asEndElement();
                String qNameEnd = endElement.getName().getLocalPart();

                if (qNameEnd.equals("head")) {
                    // validate and set boolean: sentences that follow are kept
                    // only when the head block matches all user-set filters
                    if (validateHeadBlock(headBlock, stats.getFilter().getSolarFilters())) {
                        includeThisBlock = true;
                    }
                } else if (qNameEnd.equals("body")) {
                    // new block, reset filter status
                    includeThisBlock = false;
                }

                // backup: end of the whole corpus — flush whatever is left
                if (endElement.getName().getLocalPart().equalsIgnoreCase("korpus")) {
                    fj(corpus, stats);
                    corpus.clear();
                }

                break;
            }
        }
    } catch (FileNotFoundException | XMLStreamException e) {
        e.printStackTrace();
    }
}
|
||||
|
||||
/**
|
||||
* @param readHeadBlock block of tags read from the corpus
|
||||
* @param userSetFilter tags with values set by the user
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
private static boolean validateHeadBlock(Map<String, String> readHeadBlock, HashMap<String, HashSet<String>> userSetFilter) {
|
||||
boolean pass = true;
|
||||
|
||||
if (userSetFilter == null) {
|
||||
return true;
|
||||
}
|
||||
|
||||
for (Map.Entry<String, HashSet<String>> filterEntry : userSetFilter.entrySet()) {
|
||||
String key = filterEntry.getKey();
|
||||
HashSet<String> valueObject = filterEntry.getValue();
|
||||
|
||||
// if (valueObject instanceof String) {
|
||||
// pass = validateHeadBlockEntry(readHeadBlock, key, (String) valueObject);
|
||||
// } else
|
||||
if (valueObject != null) {
|
||||
//noinspection unchecked
|
||||
for (String value : valueObject) {
|
||||
pass = validateHeadBlockEntry(readHeadBlock, key, value);
|
||||
}
|
||||
}
|
||||
|
||||
if (!pass) {
|
||||
// current head block does not include one of the set filters - not likely, but an edge case anyway
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// if it gets to this point, it passed all the filters
|
||||
return true;
|
||||
}
|
||||
|
||||
private static boolean validateHeadBlockEntry(Map<String, String> readHeadBlock, String userSetKey, String userSetValue) {
|
||||
if (!readHeadBlock.keySet().contains(userSetKey)) {
|
||||
// current head block does not include one of the set filters - not likely, but an edge case anyway
|
||||
return false;
|
||||
} else if (!readHeadBlock.get(userSetKey).equals(userSetValue)) {
|
||||
// different values -> doesn't pass the filter
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses XML headers for information about its taxonomy (if supported) or filters (solar)
|
||||
*
|
||||
* @param filepath
|
||||
* @param corpusIsSplit is corpus split into multiple xml files, or are all entries grouped into one large xml file
|
||||
* @param corpusType
|
||||
*/
|
||||
public static Object readXmlHeaderTaxonomyAndFilters(String filepath, boolean corpusIsSplit, CorpusType corpusType) {
|
||||
boolean parseTaxonomy = Tax.getCorpusTypesWithTaxonomy().contains(corpusType);
|
||||
// solar
|
||||
Set<String> headTags = null;
|
||||
HashMap<String, HashSet<String>> resultFilters = new HashMap<>();
|
||||
// taxonomy corpora
|
||||
HashSet<String> resultTaxonomy = new HashSet<>();
|
||||
|
||||
String headTagName;
|
||||
|
||||
if (corpusType == CorpusType.SOLAR) {
|
||||
headTagName = "head";
|
||||
// used for filter
|
||||
headTags = new HashSet<>(Arrays.asList(SOLA, PREDMET, RAZRED, REGIJA, TIP, LETO));
|
||||
|
||||
// init results now to avoid null pointers
|
||||
headTags.forEach(f -> resultFilters.put(f, new HashSet<>()));
|
||||
} else {
|
||||
headTagName = "teiHeader";
|
||||
}
|
||||
|
||||
XMLInputFactory factory = XMLInputFactory.newInstance();
|
||||
XMLEventReader xmlEventReader = null;
|
||||
try {
|
||||
xmlEventReader = factory.createXMLEventReader(new FileInputStream(filepath));
|
||||
boolean insideHeader = false;
|
||||
|
||||
while (xmlEventReader.hasNext()) {
|
||||
XMLEvent xmlEvent = xmlEventReader.nextEvent();
|
||||
|
||||
if (xmlEvent.isStartElement()) {
|
||||
StartElement startElement = xmlEvent.asStartElement();
|
||||
String elementName = startElement.getName().getLocalPart();
|
||||
|
||||
if (elementName.equalsIgnoreCase(headTagName)) {
|
||||
// if the corpus is split into files, we skip bodies
|
||||
// this toggle is true when we're inside a header (next block of code executes)
|
||||
// and false when we're not (skip reading unnecessary attributes)
|
||||
insideHeader = true;
|
||||
}
|
||||
|
||||
if (insideHeader) {
|
||||
if (parseTaxonomy && elementName.equalsIgnoreCase("catRef")) {
|
||||
HashMap<String, String> atts = extractAttributes(startElement);
|
||||
String debug = "";
|
||||
|
||||
String tax = startElement.getAttributeByName(QName.valueOf("target"))
|
||||
.getValue()
|
||||
.replace("#", "");
|
||||
|
||||
resultTaxonomy.add(tax);
|
||||
} else if (!parseTaxonomy && headTags.contains(elementName)) {
|
||||
String tagContent = xmlEventReader.nextEvent().asCharacters().getData();
|
||||
resultFilters.get(elementName).add(tagContent);
|
||||
}
|
||||
}
|
||||
} else if (xmlEvent.isEndElement() && corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
|
||||
// if the corpus is split into multiple files, each with only one header block per file
|
||||
// that means we should stop after we reach the end of the header
|
||||
return parseTaxonomy ? resultTaxonomy : resultFilters;
|
||||
} else if (xmlEvent.isEndElement() && !corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
|
||||
// whole corpus in one file, so we have to continue reading in order to find all header blocks
|
||||
insideHeader = false;
|
||||
}
|
||||
}
|
||||
} catch (XMLStreamException e) {
|
||||
logger.error("Streaming error", e);
|
||||
return parseTaxonomy ? resultTaxonomy : resultFilters;
|
||||
} catch (FileNotFoundException e) {
|
||||
logger.error("File not found", e);
|
||||
return parseTaxonomy ? resultTaxonomy : resultFilters;
|
||||
// TODO: keep a list of files that threw this error and a dirty boolean marker -> if true, alert user
|
||||
} finally {
|
||||
if (xmlEventReader != null) {
|
||||
try {
|
||||
xmlEventReader.close();
|
||||
} catch (XMLStreamException e) {
|
||||
logger.error("closing stream", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
return parseTaxonomy ? resultTaxonomy : resultFilters;
|
||||
}
|
||||
|
||||
private static boolean isEndElementEndOfHeader(XMLEvent event, String headerTag) {
|
||||
return event.asEndElement()
|
||||
.getName()
|
||||
.getLocalPart()
|
||||
.equalsIgnoreCase(headerTag);
|
||||
}
|
||||
|
||||
@SuppressWarnings("Duplicates")
|
||||
public static boolean readXMLGigafida(String path, StatisticsNew stats) {
|
||||
boolean inWord = false;
|
||||
ArrayList<String> currentFiletaxonomy = new ArrayList<>();
|
||||
String lemma = "";
|
||||
String msd = "";
|
||||
|
||||
List<Word> sentence = new ArrayList<>();
|
||||
List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it
|
||||
String sentenceDelimiter = "s";
|
||||
|
||||
XMLEventReader eventReader = null;
|
||||
try {
|
||||
XMLInputFactory factory = XMLInputFactory.newInstance();
|
||||
eventReader = factory.createXMLEventReader(new FileInputStream(path));
|
||||
|
||||
while (eventReader.hasNext()) {
|
||||
XMLEvent event = eventReader.nextEvent();
|
||||
|
||||
switch (event.getEventType()) {
|
||||
case XMLStreamConstants.START_ELEMENT:
|
||||
StartElement startElement = event.asStartElement();
|
||||
String qName = startElement.getName().getLocalPart();
|
||||
|
||||
// "word" node
|
||||
if (qName.equals("w")) {
|
||||
inWord = true;
|
||||
|
||||
msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
|
||||
lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
|
||||
}
|
||||
// taxonomy node
|
||||
else if (qName.equalsIgnoreCase("catRef")) {
|
||||
// there are some term nodes at the beginning that are of no interest to us
|
||||
// they differ by not having the attribute "ref", so test will equal null
|
||||
Attribute tax = startElement.getAttributeByName(QName.valueOf("target"));
|
||||
|
||||
if (tax != null) {
|
||||
// keep only taxonomy properties
|
||||
currentFiletaxonomy.add(String.valueOf(tax.getValue()).replace("#", ""));
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case XMLStreamConstants.CHARACTERS:
|
||||
Characters characters = event.asCharacters();
|
||||
|
||||
// "word" node value
|
||||
if (inWord) {
|
||||
String word = characters.getData();
|
||||
sentence.add(new Word(word, lemma, msd));
|
||||
inWord = false;
|
||||
}
|
||||
break;
|
||||
|
||||
case XMLStreamConstants.END_ELEMENT:
|
||||
EndElement endElement = event.asEndElement();
|
||||
|
||||
String var = endElement.getName().getLocalPart();
|
||||
String debug = "";
|
||||
|
||||
// parser reached end of the current sentence
|
||||
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
|
||||
// add sentence to corpus if it passes filters
|
||||
sentence = runFilters(sentence, stats.getFilter());
|
||||
|
||||
if (!ValidationUtil.isEmpty(sentence)) {
|
||||
corpus.add(new Sentence(sentence));
|
||||
}
|
||||
|
||||
// and start a new one
|
||||
sentence = new ArrayList<>();
|
||||
|
||||
/* Invoke Fork-Join when we reach maximum limit of
|
||||
* sentences (because we can't read everything to
|
||||
* memory) or we reach the end of the file.
|
||||
*/
|
||||
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
|
||||
fj(corpus, stats);
|
||||
// empty the current corpus, since we don't need the data anymore
|
||||
corpus.clear();
|
||||
|
||||
// TODO: if (stats.isUseDB()) {
|
||||
// stats.storeTmpResultsToDB();
|
||||
// }
|
||||
}
|
||||
} else if (endElement.getName().getLocalPart().equals("teiHeader")) {
|
||||
// before proceeding to read this file, make sure that taxonomy filters are a match
|
||||
|
||||
if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
|
||||
currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection
|
||||
|
||||
if (currentFiletaxonomy.isEmpty()) {
|
||||
// taxonomies don't match so stop
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// fallback
|
||||
else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
|
||||
fj(corpus, stats);
|
||||
corpus.clear();
|
||||
|
||||
// TODO: if (stats.isUseDB()) {
|
||||
// stats.storeTmpResultsToDB();
|
||||
// }
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
} catch (FileNotFoundException | XMLStreamException e) {
|
||||
e.printStackTrace();
|
||||
} finally {
|
||||
if (eventReader != null) {
|
||||
try {
|
||||
eventReader.close();
|
||||
} catch (XMLStreamException e) {
|
||||
logger.error("closing stream", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
 * Reads a GOS (spoken-corpus TEI) XML file with a streaming StAX reader and feeds
 * completed sentences into the statistics engine in batches.
 *
 * State machine over START_ELEMENT / CHARACTERS / END_ELEMENT events:
 * - "div" start elements toggle whether we are inside the orthographic transcription;
 * - "w" start elements (without a "type" attribute) arm {@code inWord} and capture
 *   msd/lemma attributes for the next CHARACTERS event;
 * - "catRef" start elements collect the file's taxonomy references;
 * - "seg" end elements close a sentence; "teiHeader" end decides (via taxonomy
 *   intersection) whether the rest of the file is processed at all; "tei" end flushes
 *   whatever is left.
 *
 * Sentences are buffered up to Settings.CORPUS_SENTENCE_LIMIT and then handed to
 * fj(corpus, stats) (fork-join processing), after which the buffer is reused.
 *
 * @param path  path to the GOS TEI XML file
 * @param stats shared statistics accumulator; also supplies the active Filter
 * @return always {@code true}; parse errors are printed/logged, not propagated
 */
@SuppressWarnings("Duplicates")
public static boolean readXMLGos(String path, StatisticsNew stats) {
    boolean inWord = false;          // set between a <w> start tag and its text content
    boolean inOrthDiv = false;       // true while inside the orthographic <div>
    boolean computeForOrth = stats.getCorpus().isGosOrthMode();
    ArrayList<String> currentFiletaxonomy = new ArrayList<>();
    String lemma = "";
    String msd = "";

    List<Word> sentence = new ArrayList<>();
    List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it
    String sentenceDelimiter = "seg";

    // GOS distinguishes normalized ("norm") and orthographic ("orth") transcription;
    // an msd filter only makes sense on the normalized layer
    String gosType = stats.getFilter().hasMsd() ? "norm" : "orth"; // orth & norm

    XMLEventReader eventReader = null;

    // flipped to false at </teiHeader> when the file's taxonomy does not match the filter
    boolean includeFile = true;

    try {
        XMLInputFactory factory = XMLInputFactory.newInstance();
        eventReader = factory.createXMLEventReader(new FileInputStream(path));

        while (eventReader.hasNext()) {
            XMLEvent event = eventReader.nextEvent();
            // System.out.print(String.format("%s", event.toString().replaceAll("\\['http://www.tei-c.org/ns/1.0'\\]::", "")));

            switch (event.getEventType()) {
                case XMLStreamConstants.START_ELEMENT:
                    StartElement startElement = event.asStartElement();
                    String qName = startElement.getName().getLocalPart();

                    if (qName.equals("div")) {
                        HashMap<String, String> atts = extractAttributes(startElement);

                        if (atts.keySet().contains("type")) {
                            inOrthDiv = atts.get("type").equals("orth");
                        }
                    }

                    // "word" node
                    if (qName.equals("w")) {
                        // check that it's not a type
                        HashMap<String, String> atts = extractAttributes(startElement);

                        if (!atts.containsKey("type")) {
                            inWord = true;

                            if (atts.containsKey("msd")) {
                                msd = atts.get("msd");

                            }
                            if (atts.containsKey("lemma")) {
                                lemma = atts.get("lemma");
                            }
                            //
                            // if (!inOrthDiv) {
                            // msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
                            // lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
                            // }
                        }

                        // }
                    }
                    // taxonomy node
                    else if (qName.equalsIgnoreCase("catRef")) {
                        // there are some term nodes at the beginning that are of no interest to us
                        // they differ by not having the attribute "ref", so test will equal null
                        Attribute tax = startElement.getAttributeByName(QName.valueOf("target"));

                        if (tax != null) {
                            // keep only taxonomy properties
                            currentFiletaxonomy.add(String.valueOf(tax.getValue()));
                        }
                    } else if (qName.equalsIgnoreCase("div")) {
                        // NOTE(review): this branch IS reachable — the plain `if (qName.equals("div"))`
                        // above is a separate statement, so "div" falls through the w/catRef chain
                        // to here. getAttributeByName(...) returns null when the "type" attribute is
                        // absent, which would NPE on .getValue() — confirm all GOS <div> elements
                        // carry a type attribute.
                        gosType = String.valueOf(startElement.getAttributeByName(QName.valueOf("type")).getValue());
                    }
                    break;

                case XMLStreamConstants.CHARACTERS:
                    // "word" node value
                    if (inWord) {
                        Characters characters = event.asCharacters();
                        // msd is initialized to "" and only reassigned from attributes,
                        // so the null check is presumably defensive — TODO confirm
                        if (gosType.equals("norm") && msd != null) {
                            sentence.add(new Word(characters.getData(), lemma, msd));
                        } else {
                            sentence.add(new Word(characters.getData()));
                        }

                        inWord = false;
                    }
                    break;

                case XMLStreamConstants.END_ELEMENT:
                    EndElement endElement = event.asEndElement();

                    // parser reached end of the current sentence
                    if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
                        // only keep the sentence if it comes from the layer (orth vs norm)
                        // we are computing for
                        boolean saveSentence = computeForOrth == inOrthDiv;

                        // add sentence to corpus if it passes filters
                        if (includeFile && saveSentence && !ValidationUtil.isEmpty(sentence)) {
                            sentence = runFilters(sentence, stats.getFilter());
                            corpus.add(new Sentence(sentence));
                        }

                        // and start a new one
                        sentence = new ArrayList<>();

                        /* Invoke Fork-Join when we reach maximum limit of
                         * sentences (because we can't read everything to
                         * memory) or we reach the end of the file.
                         */
                        if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
                            fj(corpus, stats);
                            // empty the current corpus, since we don't need
                            // the data anymore
                            corpus.clear();
                        }
                    } else if (endElement.getName().getLocalPart().equals("teiHeader")) {
                        // before proceeding to read this file, make sure that taxonomy filters are a match
                        if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
                            currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection

                            // disregard this entry if taxonomies don't match
                            includeFile = !currentFiletaxonomy.isEmpty();

                            currentFiletaxonomy = new ArrayList<>();
                        }
                    }

                    // backup: flush any remaining buffered sentences at the document root
                    else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
                        fj(corpus, stats);
                        corpus.clear();
                    }

                    break;
            }
        }
    } catch (FileNotFoundException | XMLStreamException e) {
        e.printStackTrace();
    } finally {
        if (eventReader != null) {
            try {
                eventReader.close();
            } catch (XMLStreamException e) {
                logger.error("closing stream", e);
            } catch (Exception e) {
                logger.error("general error", e);
            }
        }
    }

    return true;
}
|
||||
|
||||
/**
|
||||
* Runs the sentence through some filters, so we don't do calculations when unnecessary.
|
||||
* Filters:
|
||||
* <ol>
|
||||
* <li><b>Ngrams:</b> omit sentences that are shorter than the ngram value (e.g. 3 gram of a single word sentence)</li>
|
||||
* <li><b>Letter ngrams:</b> omit words that are shorter than the specified string length (e.g. combinations of 3 letters when the word consists of only 2 letters)</li>
|
||||
* </ol>
|
||||
*
|
||||
* @return Empty sentence (if fails 1.) or a sentence with some words removed (2.)
|
||||
*/
|
||||
private static List<Word> runFilters(List<Word> sentence, Filter filter) {
|
||||
if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
|
||||
// ngram level: if not 0 must be less than or equal to number of words in this sentence.
|
||||
if (filter.getNgramValue() > 0 && filter.getNgramValue() > sentence.size()) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// if we're calculating values for letters, omit words that are shorter than string length
|
||||
if (filter.getNgramValue() == 0) {
|
||||
sentence.removeIf(w -> (filter.getCalculateFor() == CalculateFor.WORD && w.getWord().length() < filter.getStringLength())
|
||||
|| (filter.getCalculateFor() == CalculateFor.LEMMA && w.getLemma().length() < filter.getStringLength()));
|
||||
}
|
||||
}
|
||||
|
||||
return sentence;
|
||||
}
|
||||
|
||||
private static HashMap<String, String> extractAttributes(StartElement se) {
|
||||
Iterator attributesIt = se.getAttributes();
|
||||
HashMap<String, String> atts = new HashMap<>();
|
||||
|
||||
while (attributesIt.hasNext()) {
|
||||
Attribute a = (Attribute) attributesIt.next();
|
||||
atts.put(a.getName().getLocalPart(), a.getValue());
|
||||
}
|
||||
|
||||
return atts;
|
||||
}
|
||||
}
|
67
src/main/java/alg/inflectedJOS/ForkJoin.java
Normal file
67
src/main/java/alg/inflectedJOS/ForkJoin.java
Normal file
@ -0,0 +1,67 @@
|
||||
package alg.inflectedJOS;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.concurrent.RecursiveAction;
|
||||
|
||||
import data.Sentence;
|
||||
import data.Statistics;
|
||||
|
||||
public class ForkJoin extends RecursiveAction {
|
||||
private static final long serialVersionUID = -1260951004477299634L;
|
||||
|
||||
private static final int ACCEPTABLE_SIZE = 1000;
|
||||
private List<Sentence> corpus;
|
||||
private Statistics stats;
|
||||
private int start;
|
||||
private int end;
|
||||
|
||||
|
||||
/**
|
||||
* Constructor for subproblems.
|
||||
*/
|
||||
private ForkJoin(List<Sentence> corpus, int start, int end, Statistics stats) {
|
||||
this.corpus = corpus;
|
||||
this.start = start;
|
||||
this.end = end;
|
||||
this.stats = stats;
|
||||
}
|
||||
|
||||
/**
|
||||
* Default constructor for the initial problem
|
||||
*/
|
||||
public ForkJoin(List<Sentence> corpus, Statistics stats) {
|
||||
this.corpus = corpus;
|
||||
this.start = 0;
|
||||
this.end = corpus.size();
|
||||
this.stats = stats;
|
||||
}
|
||||
|
||||
private void computeDirectly() {
|
||||
List<Sentence> subCorpus = corpus.subList(start, end);
|
||||
|
||||
if (stats.isTaxonomySet()) {
|
||||
InflectedJOSCount.calculateForAll(subCorpus, stats, stats.getInflectedJosTaxonomy());
|
||||
} else {
|
||||
InflectedJOSCount.calculateForAll(subCorpus, stats, null);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void compute() {
|
||||
int subCorpusSize = end - start;
|
||||
|
||||
if (subCorpusSize < ACCEPTABLE_SIZE) {
|
||||
computeDirectly();
|
||||
} else {
|
||||
int mid = start + subCorpusSize / 2;
|
||||
ForkJoin left = new ForkJoin(corpus, start, mid, stats);
|
||||
ForkJoin right = new ForkJoin(corpus, mid, end, stats);
|
||||
|
||||
// fork (push to queue)-> compute -> join
|
||||
left.fork();
|
||||
right.fork();
|
||||
left.join();
|
||||
right.join();
|
||||
}
|
||||
}
|
||||
}
|
170
src/main/java/alg/inflectedJOS/InflectedJOSCount.java
Normal file
170
src/main/java/alg/inflectedJOS/InflectedJOSCount.java
Normal file
@ -0,0 +1,170 @@
|
||||
package alg.inflectedJOS;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import alg.Common;
|
||||
import data.Sentence;
|
||||
import data.Statistics;
|
||||
import data.StatisticsNew;
|
||||
import data.Word;
|
||||
|
||||
public class InflectedJOSCount {
|
||||
|
||||
public static HashMap<Integer, ArrayList<ArrayList<Integer>>> indices;
|
||||
|
||||
// static {
|
||||
// // calculate all possible combinations of indices we will substitute with a '-' for substring statistics
|
||||
// indices = new HashMap<>();
|
||||
// for (int i = 5; i <= 8; i++) {
|
||||
// indices.put(i, calculateCombinations(i));
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// private static List<Integer> calculateCombinations(int i) {
|
||||
// int arr[] = {1, 2, 3, 4, 5};
|
||||
// int r = 3;
|
||||
// int n = arr.length;
|
||||
// ArrayList<ArrayList<Integer>> result = new ArrayList<>();
|
||||
//
|
||||
// return printCombination(arr, n, r);
|
||||
// }
|
||||
//
|
||||
// /* arr[] ---> Input Array
|
||||
// data[] ---> Temporary array to store current combination
|
||||
// start & end ---> Staring and Ending indexes in arr[]
|
||||
// index ---> Current index in data[]
|
||||
// r ---> Size of a combination to be printed */
|
||||
// static void combinationUtil(int arr[], int data[], int start,
|
||||
// int end, int index, int r, ArrayList<ArrayList<Integer>> result) {
|
||||
// // Current combination is ready to be printed, print it
|
||||
// ArrayList<Integer> tmpResult = new ArrayList<>();
|
||||
//
|
||||
// if (index == r) {
|
||||
// ArrayList<Integer> tmpResult = new ArrayList<>();
|
||||
// for (int j = 0; j < r; j++)
|
||||
// System.out.print(data[j] + " ");
|
||||
// System.out.println("");
|
||||
// return;
|
||||
// }
|
||||
//
|
||||
// // replace index with all possible elements. The condition
|
||||
// // "end-i+1 >= r-index" makes sure that including one element
|
||||
// // at index will make a combination with remaining elements
|
||||
// // at remaining positions
|
||||
// for (int i = start; i <= end && end - i + 1 >= r - index; i++) {
|
||||
// data[index] = arr[i];
|
||||
// combinationUtil(arr, data, i + 1, end, index + 1, r);
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// // The main function that prints all combinations of size r
|
||||
// // in arr[] of size n. This function mainly uses combinationUtil()
|
||||
// static void printCombination(int arr[], int n, int r) {
|
||||
// // A temporary array to store all combination one by one
|
||||
// int data[] = new int[r];
|
||||
//
|
||||
// // Print all combination using temprary array 'data[]'
|
||||
// combinationUtil(arr, data, 0, n - 1, 0, r);
|
||||
// }
|
||||
|
||||
// public static void calculateForAll(List<Sentence> corpus, Statistics stats, String taxonomy) {
|
||||
// for (Sentence s : corpus) {
|
||||
// // disregard if wrong taxonomy
|
||||
// if (!(s.getTaxonomy().startsWith(taxonomy))) {
|
||||
// continue;
|
||||
// }
|
||||
//
|
||||
// calculateCommon(s, stats.result);
|
||||
//
|
||||
// for (Word word : s.getWords()) {
|
||||
// // skip if current word is not inflected
|
||||
// if (!(word.getMsd().length() > 0)) {
|
||||
// continue;
|
||||
// }
|
||||
//
|
||||
// String msd = word.getMsd();
|
||||
//
|
||||
// StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
|
||||
//
|
||||
// for (int i = 1; i < msd.length(); i++) {
|
||||
// entry.setCharAt(i, msd.charAt(i));
|
||||
// Common.updateMap(stats.result, entry.toString());
|
||||
// entry.setCharAt(i, '-');
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
// public static void calculateForAll(List<Sentence> corpus, Statistics stats) {
|
||||
// for (Sentence s : corpus) {
|
||||
// for (Word word : s.getWords()) {
|
||||
// if (!(word.getMsd().length() > 0)) {
|
||||
// continue;
|
||||
// }
|
||||
//
|
||||
// String msd = word.getMsd();
|
||||
//
|
||||
// StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
|
||||
//
|
||||
// for (int i = 1; i < msd.length(); i++) {
|
||||
// entry.setCharAt(i, msd.charAt(i));
|
||||
// Common.updateMap(stats.result, entry.toString());
|
||||
// entry.setCharAt(i, '-');
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
static void calculateForAll(List<Sentence> corpus, Statistics stats, String taxonomy) {
|
||||
for (Sentence s : corpus) {
|
||||
// disregard if wrong taxonomy
|
||||
if (taxonomy != null && !(s.getTaxonomy().startsWith(taxonomy))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (Word word : s.getWords()) {
|
||||
// skip if current word is not inflected
|
||||
if (!(word.getMsd().length() > 0)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
String msd = word.getMsd();
|
||||
|
||||
StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
|
||||
|
||||
for (int i = 1; i < msd.length(); i++) {
|
||||
entry.setCharAt(i, msd.charAt(i));
|
||||
Common.updateMap(stats.result, entry.toString());
|
||||
entry.setCharAt(i, '-');
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats, String taxonomy) {
|
||||
for (Sentence s : corpus) {
|
||||
|
||||
for (Word word : s.getWords()) {
|
||||
// skip if current word is not inflected
|
||||
// // TODO: if has defined msd and is of correct type (create a set)
|
||||
// if (!(word.getMsd().length() > 0)) {
|
||||
// continue;
|
||||
// }
|
||||
|
||||
String msd = word.getMsd();
|
||||
|
||||
StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
|
||||
|
||||
for (int i = 1; i < msd.length(); i++) {
|
||||
entry.setCharAt(i, msd.charAt(i));
|
||||
stats.updateResults(entry.toString());
|
||||
entry.setCharAt(i, '-');
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
131
src/main/java/alg/inflectedJOS/WordFormation.java
Normal file
131
src/main/java/alg/inflectedJOS/WordFormation.java
Normal file
@ -0,0 +1,131 @@
|
||||
package alg.inflectedJOS;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import data.Enums.InflectedJosTypes;
|
||||
import data.StatisticsNew;
|
||||
import gui.ValidationUtil;
|
||||
import util.Combinations;
|
||||
|
||||
// adapted from http://www.geeksforgeeks.org/print-all-possible-combinations-of-r-elements-in-a-given-array-of-size-n/
|
||||
public class WordFormation {
|
||||
private static HashMap<String, Long> josTypeResult;
|
||||
private static Object[][] tmpResults;
|
||||
|
||||
private static HashMap<Integer, HashSet<HashSet<Integer>>> indices;
|
||||
|
||||
static {
|
||||
indices = new HashMap<>();
|
||||
|
||||
for (int i = 4; i <= 8; i++) {
|
||||
indices.put(i, Combinations.generateIndices(i));
|
||||
}
|
||||
}
|
||||
|
||||
public static void calculateStatistics(StatisticsNew stat) {
|
||||
Map<String, AtomicLong> result = stat.getResult();
|
||||
|
||||
// 1. filter - keep only inflected types
|
||||
result.keySet().removeIf(x -> !InflectedJosTypes.inflectedJosTypes.contains(x.charAt(0)));
|
||||
|
||||
// 2. for each inflected type get all possible subcombinations
|
||||
for (Character josChar : InflectedJosTypes.inflectedJosTypes) {
|
||||
josTypeResult = new HashMap<>();
|
||||
|
||||
// filter out results for a single word type
|
||||
Map<String, AtomicLong> singleTypeResults = result.entrySet().stream()
|
||||
.filter(x -> x.getKey().charAt(0) == josChar)
|
||||
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
|
||||
|
||||
if (ValidationUtil.isEmpty(singleTypeResults)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// get all possible indices combos for a msd of this length
|
||||
// HashSet<HashSet<Integer>> indicesCombos = indices.get()
|
||||
//Combinations.generateIndices(singleTypeResults.keySet().stream().findFirst().get().length());
|
||||
|
||||
for (Map.Entry<String, AtomicLong> e : singleTypeResults.entrySet()) {
|
||||
int l = e.getKey().length();
|
||||
|
||||
for (HashSet<Integer> indicesCombo : indices.get(e.getKey().length())) {
|
||||
updateResults(mask(e.getKey(), indicesCombo), e.getValue().longValue());
|
||||
}
|
||||
}
|
||||
|
||||
resultsMapToArray(singleTypeResults.values().stream().mapToLong(Number::longValue).sum());
|
||||
}
|
||||
|
||||
stat.setResultCustom(tmpResults);
|
||||
}
|
||||
|
||||
private static String mask(String word, HashSet<Integer> indicesCombo) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
sb.append(word.charAt(0));
|
||||
for (int i = 1; i < word.length(); i++) {
|
||||
sb.append(indicesCombo.contains(i) ? word.charAt(i) : ".");
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
|
||||
private static void updateResults(String s, Long nOfOccurences) {
|
||||
// if not in map add
|
||||
Long r = josTypeResult.putIfAbsent(s, nOfOccurences);
|
||||
|
||||
// else update
|
||||
if (r != null) {
|
||||
josTypeResult.put(s, josTypeResult.get(s) + nOfOccurences);
|
||||
}
|
||||
}
|
||||
|
||||
private static void resultsMapToArray(Long totalValue) {
|
||||
Double total = totalValue * 1.0;
|
||||
Object[][] josTypeResultArray = new Object[josTypeResult.size()][3];
|
||||
|
||||
int i = 0;
|
||||
for (Map.Entry<String, Long> e : josTypeResult.entrySet()) {
|
||||
josTypeResultArray[i][0] = e.getKey();
|
||||
josTypeResultArray[i][1] = e.getValue();
|
||||
josTypeResultArray[i][2] = e.getValue() / total;
|
||||
|
||||
if (e.getValue() > total) {
|
||||
|
||||
String debug = "";
|
||||
|
||||
}
|
||||
|
||||
i++;
|
||||
}
|
||||
|
||||
if (tmpResults == null) {
|
||||
tmpResults = josTypeResultArray;
|
||||
} else {
|
||||
int firstLength = tmpResults.length;
|
||||
int secondLength = josTypeResultArray.length;
|
||||
Object[][] tmp = new Object[firstLength + secondLength][3];
|
||||
|
||||
System.arraycopy(tmpResults, 0, tmp, 0, firstLength);
|
||||
System.arraycopy(josTypeResultArray, 0, tmp, firstLength, secondLength);
|
||||
|
||||
tmpResults = tmp;
|
||||
|
||||
// tmpResults = ArrayUtils.addAll(tmpResults, josTypeResultArray);
|
||||
}
|
||||
}
|
||||
|
||||
private static void printArray() {
|
||||
for (int i = 0; i < tmpResults.length; i++) {
|
||||
for (int j = 0; j < tmpResults[i].length; j++) {
|
||||
System.out.print(tmpResults[i][j] + "\t");
|
||||
}
|
||||
System.out.println();
|
||||
}
|
||||
}
|
||||
}
|
62
src/main/java/alg/ngram/ForkJoin.java
Normal file
62
src/main/java/alg/ngram/ForkJoin.java
Normal file
@ -0,0 +1,62 @@
|
||||
package alg.ngram;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.concurrent.RecursiveAction;
|
||||
|
||||
import data.Sentence;
|
||||
import data.StatisticsNew;
|
||||
|
||||
public class ForkJoin extends RecursiveAction {
|
||||
private static final long serialVersionUID = 5074814035083362355L;
|
||||
|
||||
private static final int ACCEPTABLE_SIZE = 1000;
|
||||
private List<Sentence> corpus;
|
||||
private StatisticsNew stats;
|
||||
private int start;
|
||||
private int end;
|
||||
|
||||
|
||||
/**
|
||||
* Constructor for subproblems.
|
||||
*/
|
||||
private ForkJoin(List<Sentence> corpus, int start, int end, StatisticsNew stats) {
|
||||
this.corpus = corpus;
|
||||
this.start = start;
|
||||
this.end = end;
|
||||
this.stats = stats;
|
||||
}
|
||||
|
||||
/**
|
||||
* Default constructor for the initial problem
|
||||
*/
|
||||
public ForkJoin(List<Sentence> corpus, StatisticsNew stats) {
|
||||
this.corpus = corpus;
|
||||
this.start = 0;
|
||||
this.end = corpus.size();
|
||||
this.stats = stats;
|
||||
}
|
||||
|
||||
private void computeDirectly() {
|
||||
List<Sentence> subCorpus = corpus.subList(start, end);
|
||||
Ngrams.calculateForAll(subCorpus, stats);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void compute() {
|
||||
int subCorpusSize = end - start;
|
||||
|
||||
if (subCorpusSize < ACCEPTABLE_SIZE) {
|
||||
computeDirectly();
|
||||
} else {
|
||||
int mid = start + subCorpusSize / 2;
|
||||
ForkJoin left = new ForkJoin(corpus, start, mid, stats);
|
||||
ForkJoin right = new ForkJoin(corpus, mid, end, stats);
|
||||
|
||||
// fork (push to queue)-> compute -> join
|
||||
left.fork();
|
||||
right.fork();
|
||||
left.join();
|
||||
right.join();
|
||||
}
|
||||
}
|
||||
}
|
204
src/main/java/alg/ngram/Ngrams.java
Normal file
204
src/main/java/alg/ngram/Ngrams.java
Normal file
@ -0,0 +1,204 @@
|
||||
package alg.ngram;
|
||||
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
|
||||
import data.CalculateFor;
|
||||
import data.Sentence;
|
||||
import data.StatisticsNew;
|
||||
import data.Word;
|
||||
import gui.ValidationUtil;
|
||||
|
||||
public class Ngrams {
|
||||
public final static Logger logger = LogManager.getLogger(Ngrams.class);
|
||||
|
||||
|
||||
public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats) {
|
||||
if (stats.getFilter().getNgramValue() == 0) { // letter ngram
|
||||
generateNgramLetterCandidates(corpus, stats);
|
||||
} else if (!ValidationUtil.isEmpty(stats.getFilter().getSkipValue()) && stats.getFilter().getSkipValue() > 0) {
|
||||
generateSkipgramCandidates(corpus, stats);
|
||||
} else {
|
||||
generateNgramCandidates(corpus, stats);
|
||||
}
|
||||
}
|
||||
|
||||
public static void generateNgramCandidates(List<Sentence> corpus, StatisticsNew stats) {
|
||||
for (Sentence s : corpus) {
|
||||
// skip sentences shorter than specified ngram length
|
||||
if (s.getWords().size() < stats.getFilter().getNgramValue()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int i = 0; i < s.getWords().size() - stats.getFilter().getNgramValue() + 1; i++) {
|
||||
List<Word> ngramCandidate = s.getSublist(i, i + stats.getFilter().getNgramValue());
|
||||
|
||||
// if msd regex is set and this candidate doesn't pass it, skip this iteration
|
||||
if (stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd())) {
|
||||
continue;
|
||||
}
|
||||
|
||||
stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks whether an ngram candidate passes specified regex filter.
|
||||
*/
|
||||
private static boolean passesRegex(List<Word> ngramCandidate, ArrayList<Pattern> regex) {
|
||||
if (ngramCandidate.size() != regex.size()) {
|
||||
logger.error("ngramCandidate.size() & msd.size() mismatch"); // should not occur anyway
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = 0; i < regex.size(); i++) {
|
||||
if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern())) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private static String wordToString(List<Word> ngramCandidate, CalculateFor calculateFor) {
|
||||
ArrayList<String> candidate = new ArrayList<>(ngramCandidate.size());
|
||||
|
||||
switch (calculateFor) {
|
||||
case LEMMA:
|
||||
candidate.addAll(ngramCandidate
|
||||
.stream()
|
||||
.map(Word::getLemma)
|
||||
.collect(Collectors.toList()));
|
||||
break;
|
||||
case WORD:
|
||||
candidate.addAll(ngramCandidate
|
||||
.stream()
|
||||
.map(Word::getWord)
|
||||
.collect(Collectors.toList()));
|
||||
break;
|
||||
case MORPHOSYNTACTIC_SPECS:
|
||||
case MORPHOSYNTACTIC_PROPERTY:
|
||||
candidate.addAll(ngramCandidate
|
||||
.stream()
|
||||
.map(Word::getMsd)
|
||||
.collect(Collectors.toList()));
|
||||
break;
|
||||
case WORD_TYPE:
|
||||
candidate.addAll(ngramCandidate
|
||||
.stream()
|
||||
.map(w -> Character.toString(w.getMsd().charAt(0)))
|
||||
.collect(Collectors.toList()));
|
||||
break;
|
||||
}
|
||||
|
||||
return StringUtils.join(candidate, " ");
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates candidates and updates results
|
||||
*
|
||||
* @param corpus
|
||||
* @param stats
|
||||
*/
|
||||
private static void generateNgramLetterCandidates(List<Sentence> corpus, StatisticsNew stats) {
|
||||
for (Sentence s : corpus) {
|
||||
for (Word w : s.getWords()) {
|
||||
String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv());
|
||||
|
||||
// skip this iteration if:
|
||||
// - word doesn't contain a proper version (missing lemma for example)
|
||||
// - msd regex is given but this word's msd doesn't match it, skip this iteration
|
||||
// - given substring length is larger than the word length
|
||||
if (ValidationUtil.isEmpty(word)
|
||||
|| stats.getFilter().hasMsd() && !w.getMsd().matches(stats.getFilter().getMsd().get(0).pattern())
|
||||
|| word.length() < stats.getFilter().getStringLength()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int i = 0; i < word.length() - stats.getFilter().getStringLength() + 1; i++) {
|
||||
// TODO: locila?
|
||||
stats.updateResults(word.substring(i, i + stats.getFilter().getStringLength()));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Extracts skipgram candidates.
|
||||
*
|
||||
* @return List of candidates represented as a list<candidates(String)>
|
||||
*/
|
||||
public static void generateSkipgramCandidates(List<Sentence> corpus, StatisticsNew stats) {
|
||||
ArrayList<Word> currentLoop;
|
||||
int ngram = stats.getFilter().getNgramValue();
|
||||
int skip = stats.getFilter().getSkipValue();
|
||||
|
||||
for (Sentence s : corpus) {
|
||||
List<Word> sentence = s.getWords();
|
||||
|
||||
for (int i = 0; i <= sentence.size() - ngram; i++) { // 1gram
|
||||
for (int j = i + 1; j <= i + skip + 1; j++) { // 2gram
|
||||
if (ngram == 2 && j < sentence.size()) {
|
||||
currentLoop = new ArrayList<>();
|
||||
currentLoop.add(sentence.get(i));
|
||||
currentLoop.add(sentence.get(j));
|
||||
|
||||
validateAndCountSkipgramCandidate(currentLoop, stats);
|
||||
} else {
|
||||
for (int k = j + 1; k <= j + 1 + skip; k++) { // 3gram
|
||||
if (ngram == 3 && k < sentence.size()) {
|
||||
currentLoop = new ArrayList<>();
|
||||
currentLoop.add(sentence.get(i));
|
||||
currentLoop.add(sentence.get(j));
|
||||
currentLoop.add(sentence.get(k));
|
||||
|
||||
validateAndCountSkipgramCandidate(currentLoop, stats);
|
||||
} else {
|
||||
for (int l = k + 1; l <= k + 1 + skip; l++) { // 4gram
|
||||
if (ngram == 4 && k < sentence.size()) {
|
||||
currentLoop = new ArrayList<>();
|
||||
currentLoop.add(sentence.get(i));
|
||||
currentLoop.add(sentence.get(j));
|
||||
currentLoop.add(sentence.get(k));
|
||||
currentLoop.add(sentence.get(l));
|
||||
|
||||
validateAndCountSkipgramCandidate(currentLoop, stats);
|
||||
} else {
|
||||
for (int m = k + 1; m <= k + 1 + skip; m++) { // 5gram
|
||||
if (ngram == 5 && k < sentence.size()) {
|
||||
currentLoop = new ArrayList<>();
|
||||
currentLoop.add(sentence.get(i));
|
||||
currentLoop.add(sentence.get(j));
|
||||
currentLoop.add(sentence.get(k));
|
||||
currentLoop.add(sentence.get(l));
|
||||
currentLoop.add(sentence.get(m));
|
||||
|
||||
validateAndCountSkipgramCandidate(currentLoop, stats);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static void validateAndCountSkipgramCandidate(ArrayList<Word> skipgramCandidate, StatisticsNew stats) {
|
||||
// count if no regex is set or if it is & candidate passes it
|
||||
if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd())) {
|
||||
stats.updateResults(wordToString(skipgramCandidate, stats.getFilter().getCalculateFor()));
|
||||
}
|
||||
}
|
||||
}
|
62
src/main/java/alg/word/ForkJoin.java
Normal file
62
src/main/java/alg/word/ForkJoin.java
Normal file
@ -0,0 +1,62 @@
|
||||
package alg.word;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.concurrent.RecursiveAction;
|
||||
|
||||
import data.Sentence;
|
||||
import data.StatisticsNew;
|
||||
|
||||
public class ForkJoin extends RecursiveAction {
|
||||
private static final long serialVersionUID = 7711587510996456040L;
|
||||
|
||||
private static final int ACCEPTABLE_SIZE = 1000;
|
||||
private List<Sentence> corpus;
|
||||
private StatisticsNew stats;
|
||||
private int start;
|
||||
private int end;
|
||||
|
||||
|
||||
/**
|
||||
* Constructor for subproblems.
|
||||
*/
|
||||
private ForkJoin(List<Sentence> corpus, int start, int end, StatisticsNew stats) {
|
||||
this.corpus = corpus;
|
||||
this.start = start;
|
||||
this.end = end;
|
||||
this.stats = stats;
|
||||
}
|
||||
|
||||
/**
|
||||
* Default constructor for the initial problem
|
||||
*/
|
||||
public ForkJoin(List<Sentence> corpus, StatisticsNew stats) {
|
||||
this.corpus = corpus;
|
||||
this.start = 0;
|
||||
this.end = corpus.size();
|
||||
this.stats = stats;
|
||||
}
|
||||
|
||||
private void computeDirectly() {
|
||||
List<Sentence> subCorpus = corpus.subList(start, end);
|
||||
WordLevel.calculateForAll(subCorpus, stats);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void compute() {
|
||||
int subCorpusSize = end - start;
|
||||
|
||||
if (subCorpusSize < ACCEPTABLE_SIZE) {
|
||||
computeDirectly();
|
||||
} else {
|
||||
int mid = start + subCorpusSize / 2;
|
||||
ForkJoin left = new ForkJoin(corpus, start, mid, stats);
|
||||
ForkJoin right = new ForkJoin(corpus, mid, end, stats);
|
||||
|
||||
// fork (push to queue)-> compute -> join
|
||||
left.fork();
|
||||
right.fork();
|
||||
left.join();
|
||||
right.join();
|
||||
}
|
||||
}
|
||||
}
|
167
src/main/java/alg/word/WordCount.java
Normal file
167
src/main/java/alg/word/WordCount.java
Normal file
@ -0,0 +1,167 @@
|
||||
package alg.word;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import alg.Common;
|
||||
import data.CalculateFor;
|
||||
import data.Sentence;
|
||||
import data.Statistics;
|
||||
import data.Word;
|
||||
|
||||
class WordCount {
|
||||
private static void calculateNoFilter(List<Sentence> corpus, Statistics stats) {
|
||||
for (Sentence s : corpus) {
|
||||
List<String> sentence = new ArrayList<>(s.getWords().size());
|
||||
|
||||
if (stats.getCf() == CalculateFor.LEMMA) {
|
||||
sentence.addAll(s.getWords()
|
||||
.stream()
|
||||
.map(Word::getLemma)
|
||||
.collect(Collectors.toList()));
|
||||
} else if (stats.getCf() == CalculateFor.WORD) {
|
||||
sentence.addAll(s.getWords()
|
||||
.stream()
|
||||
.map(Word::getWord)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
for (String word : sentence) {
|
||||
Common.updateMap(stats.result, word);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static void calculateVCC(List<Sentence> corpus, Statistics stats) {
|
||||
for (Sentence s : corpus) {
|
||||
List<String> sentence = new ArrayList<>(s.getWords().size());
|
||||
|
||||
if (stats.getCf() == CalculateFor.LEMMA) {
|
||||
sentence.addAll(s.getWords()
|
||||
.stream()
|
||||
.map(Word::getCVVLemma)
|
||||
.collect(Collectors.toList()));
|
||||
} else if (stats.getCf() == CalculateFor.WORD) {
|
||||
sentence.addAll(s.getWords()
|
||||
.stream()
|
||||
.map(Word::getCVVWord)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
for (String word : sentence) {
|
||||
if (word.length() > stats.getSubstringLength()) {
|
||||
for (int i = 0; i <= word.length() - stats.getSubstringLength(); i++) {
|
||||
String substring = word.substring(i, i + stats.getSubstringLength());
|
||||
Common.updateMap(stats.result, substring);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static void calculateForJosType(List<Sentence> corpus, Statistics stats) {
|
||||
for (Sentence s : corpus) {
|
||||
List<String> sentence = new ArrayList<>(s.getWords().size());
|
||||
List<Word> filteredWords = new ArrayList<>();
|
||||
|
||||
for (Word word : s.getWords()) {
|
||||
if (word.getMsd() != null && word.getMsd().charAt(0) == stats.getDistributionJosWordType()) {
|
||||
filteredWords.add(word);
|
||||
}
|
||||
}
|
||||
|
||||
if (stats.getCf() == CalculateFor.LEMMA) {
|
||||
sentence.addAll(filteredWords
|
||||
.stream()
|
||||
.map(Word::getLemma)
|
||||
.collect(Collectors.toList()));
|
||||
} else if (stats.getCf() == CalculateFor.WORD) {
|
||||
sentence.addAll(filteredWords
|
||||
.stream()
|
||||
.map(Word::getWord)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
for (String word : sentence) {
|
||||
Common.updateMap(stats.result, word);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static void calculateForTaxonomyAndJosType(List<Sentence> corpus, Statistics stats) {
|
||||
for (Sentence s : corpus) {
|
||||
if (s.getTaxonomy().equalsIgnoreCase(stats.getDistributionTaxonomy())) {
|
||||
List<String> sentence = new ArrayList<>(s.getWords().size());
|
||||
List<Word> filteredWords = new ArrayList<>();
|
||||
|
||||
for (Word word : s.getWords()) {
|
||||
if (word.getMsd().charAt(0) == stats.getDistributionJosWordType()) {
|
||||
filteredWords.add(word);
|
||||
}
|
||||
}
|
||||
|
||||
if (stats.getCf() == CalculateFor.LEMMA) {
|
||||
sentence.addAll(filteredWords
|
||||
.stream()
|
||||
.map(Word::getLemma)
|
||||
.collect(Collectors.toList()));
|
||||
} else if (stats.getCf() == CalculateFor.WORD) {
|
||||
sentence.addAll(filteredWords
|
||||
.stream()
|
||||
.map(Word::getWord)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
for (String word : sentence) {
|
||||
Common.updateMap(stats.result, word);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static void calculateForTaxonomy(List<Sentence> corpus, Statistics stats) {
|
||||
for (Sentence s : corpus) {
|
||||
if (s.getTaxonomy().equalsIgnoreCase(stats.getDistributionTaxonomy())) {
|
||||
List<String> sentence = new ArrayList<>(s.getWords().size());
|
||||
|
||||
if (stats.getCf() == CalculateFor.LEMMA) {
|
||||
sentence.addAll(s.getWords()
|
||||
.stream()
|
||||
.map(Word::getLemma)
|
||||
.collect(Collectors.toList()));
|
||||
} else if (stats.getCf() == CalculateFor.WORD) {
|
||||
sentence.addAll(s.getWords()
|
||||
.stream()
|
||||
.map(Word::getWord)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
for (String word : sentence) {
|
||||
Common.updateMap(stats.result, word);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void calculateForAll(List<Sentence> corpus, Statistics stats) {
|
||||
boolean taxonomyIsSet = stats.isTaxonomySet();
|
||||
boolean JosTypeIsSet = stats.isJOSTypeSet();
|
||||
|
||||
// branching because even though the only difference is an if or two &&
|
||||
// O(if) = 1, the amount of ifs adds up and this saves some time
|
||||
if (taxonomyIsSet && JosTypeIsSet) {
|
||||
calculateForTaxonomyAndJosType(corpus, stats);
|
||||
} else if (taxonomyIsSet && !JosTypeIsSet) {
|
||||
calculateForTaxonomy(corpus, stats);
|
||||
} else if (!taxonomyIsSet && JosTypeIsSet) {
|
||||
calculateForJosType(corpus, stats);
|
||||
} else {
|
||||
if (stats.isVcc()) {
|
||||
calculateVCC(corpus, stats);
|
||||
} else {
|
||||
calculateNoFilter(corpus, stats);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
112
src/main/java/alg/word/WordLevel.java
Normal file
112
src/main/java/alg/word/WordLevel.java
Normal file
@ -0,0 +1,112 @@
|
||||
package alg.word;
|
||||
|
||||
import static data.Enums.WordLevelDefaultValues.*;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import data.Enums.WordLevelDefaultValues;
|
||||
import data.Enums.WordLevelType;
|
||||
import data.Sentence;
|
||||
import data.StatisticsNew;
|
||||
import data.Word;
|
||||
|
||||
@SuppressWarnings("Duplicates")
|
||||
public class WordLevel {
|
||||
private static HashSet<String> suffixes;
|
||||
private static int minSuffixLength;
|
||||
private static int maxSuffixLength;
|
||||
|
||||
private static HashSet<String> prefixes;
|
||||
private static int minPrefixLength;
|
||||
private static int maxPrefixLength;
|
||||
|
||||
static {
|
||||
suffixes = WordLevelDefaultValues.getSuffixes();
|
||||
calculateSuffixesLengths();
|
||||
|
||||
prefixes = WordLevelDefaultValues.getPrefixes();
|
||||
calculatePrefixesLengths();
|
||||
}
|
||||
|
||||
public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats) {
|
||||
for (Sentence s : corpus) {
|
||||
for (Word word : s.getWords()) {
|
||||
calculateForSuffixes(word.getWord(), stats);
|
||||
calculateForPrefixes(word.getWord(), stats);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static void calculateForPrefixes(String word, StatisticsNew stats) {
|
||||
for (int tmpPrefixLength = maxPrefixLength; tmpPrefixLength >= minPrefixLength; tmpPrefixLength++) {
|
||||
if (word.length() - tmpPrefixLength < MIN_N_OF_CHARACTERS_LEFT_PREFIX) {
|
||||
return;
|
||||
}
|
||||
|
||||
String extractedPrefix = StringUtils.left(word, tmpPrefixLength);
|
||||
|
||||
if (prefixes.contains(extractedPrefix)) {
|
||||
// save suffix and full word
|
||||
stats.updateResultsNested(WordLevelType.PREFIX, extractedPrefix, word);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static void calculateForSuffixes(String word, StatisticsNew stats) {
|
||||
for (int tmpSuffixLength = maxSuffixLength; tmpSuffixLength >= minSuffixLength; tmpSuffixLength++) {
|
||||
// preveri, da je beseda - cuttan suffix daljši od prednastavljene vrednosti
|
||||
// ker gremo od najdaljše opcije k najkrajši, se ob dosegu tega pogoja lahko zaključi računanje za trenutno besedo
|
||||
if (word.length() - tmpSuffixLength < MIN_N_OF_CHARACTERS_LEFT_SUFFIX) {
|
||||
return;
|
||||
}
|
||||
|
||||
String extractedSuffix = StringUtils.right(word, tmpSuffixLength);
|
||||
|
||||
if (suffixes.contains(extractedSuffix)) {
|
||||
// save suffix and full word
|
||||
stats.updateResultsNested(WordLevelType.SUFFIX, extractedSuffix, word);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// finds the shortest and longest suffix for quicker calculations
|
||||
public static void calculateSuffixesLengths() {
|
||||
minSuffixLength = -1;
|
||||
maxSuffixLength = -1;
|
||||
|
||||
for (String suffix : suffixes) {
|
||||
if (suffix.length() > maxSuffixLength) {
|
||||
maxSuffixLength = suffix.length();
|
||||
|
||||
if (minSuffixLength < 0) {
|
||||
minSuffixLength = maxSuffixLength;
|
||||
}
|
||||
} else if (suffix.length() < minSuffixLength) {
|
||||
minSuffixLength = suffix.length();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// finds the shortest and longest suffix for quicker calculations
|
||||
public static void calculatePrefixesLengths() {
|
||||
minPrefixLength = -1;
|
||||
maxPrefixLength = -1;
|
||||
|
||||
for (String prefix : prefixes) {
|
||||
if (prefix.length() > maxPrefixLength) {
|
||||
maxPrefixLength = prefix.length();
|
||||
|
||||
if (minPrefixLength < 0) {
|
||||
minPrefixLength = maxPrefixLength;
|
||||
}
|
||||
} else if (prefix.length() < minPrefixLength) {
|
||||
minPrefixLength = prefix.length();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
17
src/main/java/data/AnalysisLevel.java
Normal file
17
src/main/java/data/AnalysisLevel.java
Normal file
@ -0,0 +1,17 @@
|
||||
package data;
|
||||
|
||||
/**
 * The analysis granularity the user can pick in the GUI.
 */
public enum AnalysisLevel {
	STRING_LEVEL("Besedni nizi"),
	WORD_LEVEL("Nivo besed in delov besed"),
	WORD_FORMATION("Besedotvorni procesi");

	// Human-readable (Slovene) label shown in the GUI.
	private final String name;

	AnalysisLevel(String label) {
		name = label;
	}

	@Override
	public String toString() {
		return name;
	}
}
|
43
src/main/java/data/CalculateFor.java
Normal file
43
src/main/java/data/CalculateFor.java
Normal file
@ -0,0 +1,43 @@
|
||||
package data;
|
||||
|
||||
/**
 * What quantity a calculation is performed over (word form, lemma, MSD tag, ...).
 */
public enum CalculateFor {
	WORD("različnica"),
	LEMMA("lema"),
	MORPHOSYNTACTIC_SPECS("oblikoskladenjska oznaka"),
	MORPHOSYNTACTIC_PROPERTY("oblikoskladenjska lastnost"),
	WORD_TYPE("besedna vrsta"),
	DIST_WORDS("različnica"),
	DIST_LEMMAS("lema");

	// Slovene GUI label; note the DIST_* constants reuse the WORD/LEMMA labels.
	private final String name;

	CalculateFor(String label) {
		name = label;
	}

	@Override
	public String toString() {
		return name;
	}

	/**
	 * Maps a GUI label back to its constant. Only the five directly selectable
	 * constants take part in the lookup - DIST_WORDS/DIST_LEMMAS share their
	 * labels with WORD/LEMMA and are deliberately excluded, exactly as the
	 * original if-chain did. Returns null for unknown or null input.
	 */
	public static CalculateFor factory(String cf) {
		if (cf == null) {
			return null;
		}

		CalculateFor[] selectable = {WORD, LEMMA, MORPHOSYNTACTIC_SPECS, MORPHOSYNTACTIC_PROPERTY, WORD_TYPE};

		for (CalculateFor candidate : selectable) {
			if (candidate.name.equals(cf)) {
				return candidate;
			}
		}

		return null;
	}
}
|
163
src/main/java/data/Corpus.java
Normal file
163
src/main/java/data/Corpus.java
Normal file
@ -0,0 +1,163 @@
|
||||
package data;
|
||||
|
||||
import static gui.Messages.*;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
|
||||
import data.Enums.solar.SolarFilters;
|
||||
import gui.ValidationUtil;
|
||||
import javafx.collections.ObservableList;
|
||||
|
||||
public class Corpus {
|
||||
public final static Logger logger = LogManager.getLogger(Corpus.class);
|
||||
|
||||
private CorpusType corpusType;
|
||||
private File chosenResultsLocation;
|
||||
private File chosenCorpusLocation;
|
||||
private Collection<File> detectedCorpusFiles;
|
||||
boolean headerRead;
|
||||
private ObservableList<String> taxonomy; // if gigafida or gos
|
||||
private HashMap<String, ObservableList<String>> solarFilters; // if solar
|
||||
private HashMap<String, HashSet<String>> solarFiltersForXML; // if solar - used while parsing xml
|
||||
private boolean gosOrthMode;
|
||||
boolean hasMsdData;
|
||||
private ArrayList<String> validationErrors;
|
||||
|
||||
public Corpus() {
|
||||
validationErrors = new ArrayList<>();
|
||||
}
|
||||
|
||||
public CorpusType getCorpusType() {
|
||||
return corpusType;
|
||||
}
|
||||
|
||||
public void setCorpusType(CorpusType corpusType) {
|
||||
this.corpusType = corpusType;
|
||||
logger.info("Corpus.set: ", corpusType);
|
||||
}
|
||||
|
||||
public File getChosenResultsLocation() {
|
||||
return chosenResultsLocation;
|
||||
}
|
||||
|
||||
public void setChosenResultsLocation(File chosenResultsLocation) {
|
||||
this.chosenResultsLocation = chosenResultsLocation;
|
||||
logger.info("Corpus.set: ", chosenResultsLocation);
|
||||
}
|
||||
|
||||
public File getChosenCorpusLocation() {
|
||||
return chosenCorpusLocation;
|
||||
}
|
||||
|
||||
public void setChosenCorpusLocation(File chosenCorpusLocation) {
|
||||
this.chosenCorpusLocation = chosenCorpusLocation;
|
||||
logger.info("Corpus.set: ", chosenCorpusLocation);
|
||||
}
|
||||
|
||||
public Collection<File> getDetectedCorpusFiles() {
|
||||
return detectedCorpusFiles;
|
||||
}
|
||||
|
||||
public void setDetectedCorpusFiles(Collection<File> detectedCorpusFiles) {
|
||||
this.detectedCorpusFiles = detectedCorpusFiles;
|
||||
logger.info("Corpus.set: ", detectedCorpusFiles);
|
||||
}
|
||||
|
||||
public boolean isHeaderRead() {
|
||||
return headerRead;
|
||||
}
|
||||
|
||||
public void setHeaderRead(boolean headerRead) {
|
||||
this.headerRead = headerRead;
|
||||
}
|
||||
|
||||
public ObservableList<String> getTaxonomy() {
|
||||
return taxonomy;
|
||||
}
|
||||
|
||||
public void setTaxonomy(ObservableList<String> taxonomy) {
|
||||
this.taxonomy = taxonomy;
|
||||
logger.info("Corpus.set: ", taxonomy);
|
||||
}
|
||||
|
||||
public HashMap<String, ObservableList<String>> getSolarFilters() {
|
||||
return solarFilters;
|
||||
}
|
||||
|
||||
public void setSolarFilters(HashMap<String, ObservableList<String>> solarFilters) {
|
||||
this.solarFilters = solarFilters;
|
||||
logger.info("Corpus.set: ", solarFilters);
|
||||
}
|
||||
|
||||
public HashMap<String, HashSet<String>> getSolarFiltersForXML() {
|
||||
return solarFiltersForXML;
|
||||
}
|
||||
|
||||
public void setSolarFiltersForXML(HashMap<String, HashSet<String>> solarFiltersForXML) {
|
||||
this.solarFiltersForXML = solarFiltersForXML;
|
||||
logger.info("Corpus.set: ", solarFiltersForXML);
|
||||
}
|
||||
|
||||
public boolean isGosOrthMode() {
|
||||
return gosOrthMode;
|
||||
}
|
||||
|
||||
public void setGosOrthMode(boolean gosOrthMode) {
|
||||
this.gosOrthMode = gosOrthMode;
|
||||
logger.info("Corpus.set: ", gosOrthMode);
|
||||
}
|
||||
|
||||
public ArrayList<String> getValidationErrors() {
|
||||
return validationErrors;
|
||||
}
|
||||
|
||||
public String getValidationErrorsToString() {
|
||||
return StringUtils.join(validationErrors, "\n - ");
|
||||
}
|
||||
|
||||
public void setValidationErrors(ArrayList<String> validationErrors) {
|
||||
this.validationErrors = validationErrors;
|
||||
}
|
||||
|
||||
public boolean validate() {
|
||||
if (corpusType == null) {
|
||||
validationErrors.add(LABEL_RESULTS_CORPUS_TYPE_NOT_SET);
|
||||
}
|
||||
|
||||
if (chosenCorpusLocation == null) {
|
||||
validationErrors.add(LABEL_CORPUS_LOCATION_NOT_SET);
|
||||
}
|
||||
|
||||
if (chosenResultsLocation == null) {
|
||||
validationErrors.add(LABEL_RESULTS_LOCATION_NOT_SET);
|
||||
}
|
||||
|
||||
if (!headerRead && corpusType != null) {
|
||||
// if user didn't opt into reading the headers, set default taxonomy or solar filters
|
||||
if (Tax.getCorpusTypesWithTaxonomy().contains(corpusType)) {
|
||||
taxonomy = Tax.getTaxonomyForComboBox(corpusType);
|
||||
} else if (corpusType == CorpusType.SOLAR && solarFilters == null) {
|
||||
setSolarFilters(SolarFilters.getFiltersForComboBoxes());
|
||||
}
|
||||
}
|
||||
|
||||
if (headerRead && ValidationUtil.isEmpty(taxonomy)) {
|
||||
// mustn't happen, intercept at gui level
|
||||
}
|
||||
|
||||
if (!ValidationUtil.isEmpty(validationErrors)) {
|
||||
logger.error("Corpus validation error: ", StringUtils.join(validationErrors, "\n - "));
|
||||
return false;
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
25
src/main/java/data/CorpusType.java
Normal file
25
src/main/java/data/CorpusType.java
Normal file
@ -0,0 +1,25 @@
|
||||
package data;
|
||||
|
||||
/**
 * The corpus formats the application understands.
 */
public enum CorpusType {
	GIGAFIDA("Gigafida", "gigafida"),
	CCKRES("ccKres ", "cckres"),
	SOLAR("Šolar", "šolar"),
	GOS("GOS", "gos");

	// Display name. NOTE(review): "ccKres " contains a trailing space - confirm intended.
	private final String name;
	// Lower-case variant used for matching.
	private final String nameLowerCase;

	CorpusType(String displayName, String lowerCaseName) {
		name = displayName;
		nameLowerCase = lowerCaseName;
	}

	@Override
	public String toString() {
		return name;
	}

	public String getNameLowerCase() {
		return nameLowerCase;
	}
}
|
12
src/main/java/data/Enums/InflectedJosTypes.java
Normal file
12
src/main/java/data/Enums/InflectedJosTypes.java
Normal file
@ -0,0 +1,12 @@
|
||||
package data.Enums;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
|
||||
/**
 * The JOS word-type codes that inflect: 'S', 'G', 'P'
 * (per GigafidaJosWordType: samostalnik, glagol, pridevnik).
 */
public class InflectedJosTypes {
	public static final HashSet<Character> inflectedJosTypes =
			new HashSet<>(Arrays.asList('S', 'G', 'P'));
}
|
68
src/main/java/data/Enums/Msd.java
Normal file
68
src/main/java/data/Enums/Msd.java
Normal file
@ -0,0 +1,68 @@
|
||||
package data.Enums;
|
||||
|
||||
import java.util.HashMap;
|
||||
|
||||
/**
 * Morphosyntactic word types with their Slovene/English names and codes, and
 * the number of MSD attributes that follow the type character in a tag.
 */
public enum Msd {
	NOUN("samostalnik", 'S', "Noun", 'N', 5),
	VERB("glagol", 'G', "Verb", 'V', 7),
	ADJECTIVE("pridevnik", 'P', "Adjective", 'A', 6),
	ADVERB("prislov", 'R', "Adverb", 'R', 2),
	PRONOUN("zaimek", 'Z', "Pronoun", 'P', 8),
	NUMERAL("števnik", 'K', "Numeral", 'M', 6),
	PREPOSITION("predlog", 'D', "Preposition", 'S', 1),
	CONJUNCTION("veznik", 'V', "Conjunction", 'C', 1),
	PARTICLE("členek", 'L', "Particle", 'Q', 0),
	INTERJECTION("medmet", 'M', "Interjection", 'I', 0),
	ABBREVIATION("okrajšava", 'O', "Abbreviation", 'Y', 0),
	RESIDUAL("neuvrščeno", 'N', "Residual", 'X', 1);

	private final String siName;
	private final Character siCode;
	private final String enName;
	private final Character enCode;
	private final Integer nOfAttributes;

	// Attribute counts keyed by the Slovene type code; backs getMsdLengthForType.
	private static HashMap<Character, Integer> siCodeNOfAttributes;

	static {
		siCodeNOfAttributes = new HashMap<>();

		for (Msd entry : values()) {
			siCodeNOfAttributes.put(entry.siCode, entry.nOfAttributes);
		}
	}

	Msd(String siName, Character siCode, String enName, Character enCode, int nOfAttributes) {
		this.siName = siName;
		this.siCode = siCode;
		this.enName = enName;
		this.enCode = enCode;
		this.nOfAttributes = nOfAttributes;
	}

	public String getSiName() {
		return siName;
	}

	public Character getSiCode() {
		return siCode;
	}

	public String getEnName() {
		return enName;
	}

	public Character getEnCode() {
		return enCode;
	}

	/**
	 * Returns the full MSD length (type character + its attributes) for a tag
	 * whose first character is the Slovene type code.
	 *
	 * NOTE(review): an unknown first character makes the map lookup return
	 * null, which throws NullPointerException on unboxing - confirm callers
	 * only pass valid tags.
	 *
	 * @param msd the MSD tag; only charAt(0) is inspected
	 * @return number of attributes for the type, plus one for the type character
	 */
	public static int getMsdLengthForType(String msd) {
		return siCodeNOfAttributes.get(msd.charAt(0)) + 1;
	}
}
|
55
src/main/java/data/Enums/WordLevelDefaultValues.java
Normal file
55
src/main/java/data/Enums/WordLevelDefaultValues.java
Normal file
@ -0,0 +1,55 @@
|
||||
package data.Enums;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
|
||||
public class WordLevelDefaultValues {
|
||||
public final static Logger logger = LogManager.getLogger(WordLevelDefaultValues.class);
|
||||
|
||||
private static HashSet<String> suffixes;
|
||||
private static final String SUFFIXES_FILE = "/Lists/suffixes.txt";
|
||||
public static final int MIN_N_OF_CHARACTERS_LEFT_SUFFIX = 2;
|
||||
|
||||
private static HashSet<String> prefixes;
|
||||
private static final String PREFIXES_FILE = "/Lists/prefixes.txt";
|
||||
public static final int MIN_N_OF_CHARACTERS_LEFT_PREFIX = 2;
|
||||
|
||||
static {
|
||||
suffixes = new HashSet<>();
|
||||
suffixes = readFromFile(SUFFIXES_FILE);
|
||||
prefixes = new HashSet<>();
|
||||
prefixes = readFromFile(PREFIXES_FILE);
|
||||
}
|
||||
|
||||
private static HashSet<String> readFromFile(String fileName) {
|
||||
Set<String> dictionary = new HashSet<>();
|
||||
|
||||
try (InputStream is = WordLevelDefaultValues.class.getClass().getResourceAsStream(fileName)) {
|
||||
if (is != null) {
|
||||
// TODO: warn if !exists
|
||||
BufferedReader reader = new BufferedReader(new InputStreamReader(is));
|
||||
dictionary = reader.lines().collect(Collectors.toSet());
|
||||
}
|
||||
} catch (IOException e) {
|
||||
logger.error("Problem reading init dictionary", e);
|
||||
}
|
||||
|
||||
return (HashSet<String>) dictionary;
|
||||
}
|
||||
|
||||
public static HashSet<String> getSuffixes() {
|
||||
return suffixes;
|
||||
}
|
||||
|
||||
public static HashSet<String> getPrefixes() {
|
||||
return prefixes;
|
||||
}
|
||||
}
|
16
src/main/java/data/Enums/WordLevelType.java
Normal file
16
src/main/java/data/Enums/WordLevelType.java
Normal file
@ -0,0 +1,16 @@
|
||||
package data.Enums;
|
||||
|
||||
/**
 * The two affix kinds produced by word-level analysis.
 */
public enum WordLevelType {
	SUFFIX("pripona"),
	PREFIX("predpona");

	// Slovene display label.
	private final String name;

	WordLevelType(String label) {
		name = label;
	}

	public String getName() {
		return name;
	}
}
|
57
src/main/java/data/Enums/solar/SolarFilters.java
Normal file
57
src/main/java/data/Enums/solar/SolarFilters.java
Normal file
@ -0,0 +1,57 @@
|
||||
package data.Enums.solar;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
|
||||
import javafx.collections.FXCollections;
|
||||
import javafx.collections.ObservableList;
|
||||
|
||||
/**
 * Filter definitions for the Šolar corpus: the known filter keys, the full
 * set of selectable values for each, and helpers that shape them for the
 * GUI's combo boxes.
 */
public class SolarFilters {
	// Filter key -> all selectable values for that key.
	private static HashMap<String, ObservableList<String>> SOLAR_FILTERS;
	// Filter keys as they appear in the corpus headers.
	public static final String SOLA = "sola";
	public static final String PREDMET = "predmet";
	public static final String RAZRED = "razred";
	public static final String REGIJA = "regija";
	public static final String TIP = "tip";
	public static final String LETO = "leto";

	static {
		SOLAR_FILTERS = new HashMap<>();

		SOLAR_FILTERS.put(REGIJA, FXCollections.observableArrayList("Celje", "Gorica", "Koper", "Kranj", "Krško", "Ljubljana", "Maribor", "Murska Sobota", "Novo mesto", "Postojna", "Slovenj Gradec"));
		SOLAR_FILTERS.put(PREDMET, FXCollections.observableArrayList("državljanska vzgoja in etika", "ekonomija", "filozofija", "geografija", "kemija", "podjetništvo", "psihologija", "slovenščina", "sociologija", "umetnostna vzgoja", "zgodovina"));
		SOLAR_FILTERS.put(RAZRED, FXCollections.observableArrayList("6. razred", "7. razred", "8. razred", "9. razred", "1. letnik", "2. letnik", "3. letnik", "4. letnik", "5. letnik", "maturitetni tečaj"));
		SOLAR_FILTERS.put(LETO, FXCollections.observableArrayList("2007", "2008", "2009", "2009/2010", "2010"));
		SOLAR_FILTERS.put(SOLA, FXCollections.observableArrayList("gimnazija", "osnovna šola", "poklicna šola", "strokovna šola"));
		SOLAR_FILTERS.put(TIP, FXCollections.observableArrayList("esej/spis", "pisni izdelek (učna ura)", "test (daljše besedilo)", "test (odgovori na vprašanja)"));
	}

	// "Calculate for" choices offered when full MSD data is / is not available.
	public static final ObservableList<String> N_GRAM_COMPUTE_FOR_FULL = FXCollections.observableArrayList("različnica", "lema", "oblikoskladenjska oznaka", "oblikoskladenjska lastnost", "besedna vrsta");
	public static final ObservableList<String> N_GRAM_COMPUTE_FOR_LIMITED = FXCollections.observableArrayList("različnica", "lema");

	/**
	 * Returns filters with all possible values
	 */
	public static HashMap<String, ObservableList<String>> getFiltersForComboBoxes() {
		return SOLAR_FILTERS;
	}

	/**
	 * Returns filters restricted to the values actually found in the corpus
	 * headers, sorted; a known filter that was absent from the corpus maps to
	 * an empty list. (Previous javadoc - "all possible values" - was a
	 * copy-paste of the other overload and wrong.)
	 */
	public static HashMap<String, ObservableList<String>> getFiltersForComboBoxes(HashMap<String, HashSet<String>> foundFilters) {
		HashMap<String, ObservableList<String>> filtersForComboBoxes = new HashMap<>();

		for (Map.Entry<String, ObservableList<String>> e : SOLAR_FILTERS.entrySet()) {
			if (!foundFilters.containsKey(e.getKey())) {
				// if, by some reason, a specific filter wasn't in the corpus, return a blank list for that filter
				filtersForComboBoxes.put(e.getKey(), FXCollections.observableArrayList());
			} else {
				filtersForComboBoxes.put(e.getKey(), FXCollections.observableArrayList(foundFilters.get(e.getKey())).sorted());
			}
		}

		return filtersForComboBoxes;
	}
}
|
144
src/main/java/data/Filter.java
Normal file
144
src/main/java/data/Filter.java
Normal file
@ -0,0 +1,144 @@
|
||||
package data;
|
||||
|
||||
import static data.Filter.filterName.*;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import gui.ValidationUtil;
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public class Filter {
|
||||
private HashMap<filterName, Object> filter;
|
||||
|
||||
public enum filterName {
|
||||
ANALYSIS_LEVEL,
|
||||
CALCULATE_FOR,
|
||||
NGRAM_VALUE,
|
||||
SKIP_VALUE,
|
||||
IS_CVV,
|
||||
STRING_LENGTH,
|
||||
TAXONOMY,
|
||||
MSD,
|
||||
HAS_MSD,
|
||||
SOLAR_FILTERS
|
||||
}
|
||||
|
||||
public Filter() {
|
||||
filter = new HashMap<>();
|
||||
}
|
||||
|
||||
public Filter(AnalysisLevel al, CalculateFor cf) {
|
||||
filter = new HashMap<>();
|
||||
|
||||
filter.put(ANALYSIS_LEVEL, al);
|
||||
filter.put(CALCULATE_FOR, cf);
|
||||
}
|
||||
|
||||
public void setAl(AnalysisLevel al) {
|
||||
filter.put(ANALYSIS_LEVEL, al);
|
||||
}
|
||||
|
||||
public AnalysisLevel getAl() {
|
||||
return (AnalysisLevel) filter.get(ANALYSIS_LEVEL);
|
||||
}
|
||||
|
||||
public void setCalculateFor(CalculateFor cf) {
|
||||
filter.put(CALCULATE_FOR, cf);
|
||||
}
|
||||
|
||||
public CalculateFor getCalculateFor() {
|
||||
return (CalculateFor) filter.get(CALCULATE_FOR);
|
||||
}
|
||||
|
||||
public void setNgramValue(Integer ngramValue) {
|
||||
filter.put(NGRAM_VALUE, ngramValue);
|
||||
}
|
||||
|
||||
public Integer getNgramValue() {
|
||||
return (Integer) filter.get(NGRAM_VALUE);
|
||||
}
|
||||
|
||||
public void setSkipValue(Integer skipValue) {
|
||||
filter.put(SKIP_VALUE, skipValue);
|
||||
}
|
||||
|
||||
public Integer getSkipValue() {
|
||||
return (Integer) filter.get(SKIP_VALUE);
|
||||
}
|
||||
|
||||
public void setIsCvv(boolean isCvv) {
|
||||
filter.put(IS_CVV, isCvv);
|
||||
}
|
||||
|
||||
public boolean isCvv() {
|
||||
return filter.containsKey(IS_CVV) && (boolean) filter.get(IS_CVV);
|
||||
}
|
||||
|
||||
public void setStringLength(int stringLength) {
|
||||
filter.put(STRING_LENGTH, stringLength);
|
||||
}
|
||||
|
||||
public Integer getStringLength() {
|
||||
return (Integer) filter.get(STRING_LENGTH);
|
||||
}
|
||||
|
||||
public void setTaxonomy(ArrayList<String> taxonomy) {
|
||||
filter.put(TAXONOMY, taxonomy);
|
||||
}
|
||||
|
||||
public ArrayList<String> getTaxonomy() {
|
||||
if (filter.containsKey(TAXONOMY) && filter.get(TAXONOMY) != null) {
|
||||
return (ArrayList<String>) filter.get(TAXONOMY);
|
||||
} else {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
}
|
||||
|
||||
public void setMsd(ArrayList<Pattern> msd) {
|
||||
filter.put(MSD, msd);
|
||||
if (!ValidationUtil.isEmpty(msd)) {
|
||||
setHasMsd(true);
|
||||
} else {
|
||||
setHasMsd(false);
|
||||
}
|
||||
}
|
||||
|
||||
public ArrayList<Pattern> getMsd() {
|
||||
return (ArrayList<Pattern>) filter.get(MSD);
|
||||
}
|
||||
|
||||
public void setHasMsd(boolean hasMsd) {
|
||||
filter.put(HAS_MSD, hasMsd);
|
||||
}
|
||||
|
||||
public boolean hasMsd() {
|
||||
return filter.containsKey(HAS_MSD) && (boolean) filter.get(HAS_MSD);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
String newLine = "\n\t- ";
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
sb.append(newLine).append("Filter:");
|
||||
for (Map.Entry<filterName, Object> entry : filter.entrySet()) {
|
||||
sb.append(newLine)
|
||||
.append(entry.getKey().toString())
|
||||
.append(": ")
|
||||
.append(entry.getValue() != null ? entry.getValue().toString() : "null");
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public void setSolarFilters(HashMap<String, HashSet<String>> filters) {
|
||||
filter.put(SOLAR_FILTERS, filters);
|
||||
}
|
||||
|
||||
public HashMap<String, HashSet<String>> getSolarFilters() {
|
||||
return (HashMap<String, HashSet<String>>) filter.get(SOLAR_FILTERS);
|
||||
}
|
||||
}
|
71
src/main/java/data/GigafidaJosWordType.java
Normal file
71
src/main/java/data/GigafidaJosWordType.java
Normal file
@ -0,0 +1,71 @@
|
||||
package data;
|
||||
|
||||
/**
 * JOS word types for the Gigafida corpus: Slovene (diacritic-free) name plus
 * the single-character JOS code.
 */
public enum GigafidaJosWordType {
	SAMOSTALNIK("samostalnik", 'S'),
	GLAGOL("glagol", 'G'),
	PRIDEVNIK("pridevnik", 'P'),
	PRISLOV("prislov", 'R'),
	ZAIMEK("zaimek", 'Z'),
	STEVNIK("stevnik", 'K'),
	PREDLOG("predlog", 'D'),
	VEZNIK("veznik", 'V'),
	CLENEK("clenek", 'L'),
	MEDMET("medmet", 'M'),
	OKRAJSAVA("okrajsava", 'O');

	// Name doubles as the lookup key in factory().
	private final String name;
	private final char wordType;

	GigafidaJosWordType(String name, char wordType) {
		this.name = name;
		this.wordType = wordType;
	}

	@Override
	public String toString() {
		return name;
	}

	public char getWordType() {
		return wordType;
	}

	/**
	 * Looks up the constant whose name equals the given string; returns null
	 * for unknown or null input. (Names are unique, so a loop over values()
	 * is equivalent to the former if-chain.)
	 */
	public static GigafidaJosWordType factory(String wType) {
		if (wType == null) {
			return null;
		}

		for (GigafidaJosWordType candidate : values()) {
			if (candidate.name.equals(wType)) {
				return candidate;
			}
		}

		return null;
	}
}
|
76
src/main/java/data/GigafidaTaxonomy.java
Normal file
76
src/main/java/data/GigafidaTaxonomy.java
Normal file
@ -0,0 +1,76 @@
|
||||
package data;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import javafx.collections.FXCollections;
|
||||
import javafx.collections.ObservableList;
|
||||
|
||||
public enum GigafidaTaxonomy {
|
||||
TISK("tisk", "T"),
|
||||
KNJIZNO("knjižno", "T.K"),
|
||||
LEPOSLOVNO("leposlovno", "T.K.L"),
|
||||
STROKOVNO("strokovno", "T.K.S"),
|
||||
PERIODICNO("periodično", "T.P"),
|
||||
CASOPIS("časopis", "T.P.C"),
|
||||
REVIJA("revija", "T.P.R"),
|
||||
INTERNET("internet", "I");
|
||||
|
||||
private final String name;
|
||||
private final String taxonomy;
|
||||
|
||||
private static final ObservableList<String> FOR_COMBO_BOX;
|
||||
|
||||
static {
|
||||
ArrayList<String> values = Arrays.stream(GigafidaTaxonomy.values()).map(x -> x.name).collect(Collectors.toCollection(ArrayList::new));
|
||||
FOR_COMBO_BOX = FXCollections.observableArrayList(values);
|
||||
}
|
||||
|
||||
GigafidaTaxonomy(String name, String taxonomy) {
|
||||
this.name = name;
|
||||
this.taxonomy = taxonomy;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return this.name;
|
||||
}
|
||||
|
||||
public String getTaxonomnyString() {
|
||||
return this.taxonomy;
|
||||
}
|
||||
|
||||
public static GigafidaTaxonomy factory(String tax) {
|
||||
if (tax != null) {
|
||||
if (TISK.toString().equals(tax)) {
|
||||
return TISK;
|
||||
}
|
||||
if (KNJIZNO.toString().equals(tax)) {
|
||||
return KNJIZNO;
|
||||
}
|
||||
if (LEPOSLOVNO.toString().equals(tax)) {
|
||||
return LEPOSLOVNO;
|
||||
}
|
||||
if (STROKOVNO.toString().equals(tax)) {
|
||||
return STROKOVNO;
|
||||
}
|
||||
if (PERIODICNO.toString().equals(tax)) {
|
||||
return PERIODICNO;
|
||||
}
|
||||
if (CASOPIS.toString().equals(tax)) {
|
||||
return CASOPIS;
|
||||
}
|
||||
if (REVIJA.toString().equals(tax)) {
|
||||
return REVIJA;
|
||||
}
|
||||
if (INTERNET.toString().equals(tax)) {
|
||||
return INTERNET;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public static ObservableList<String> getForComboBox() {
|
||||
return FOR_COMBO_BOX;
|
||||
}
|
||||
}
|
85
src/main/java/data/GosTaxonomy.java
Normal file
85
src/main/java/data/GosTaxonomy.java
Normal file
@ -0,0 +1,85 @@
|
||||
package data;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import javafx.collections.FXCollections;
|
||||
import javafx.collections.ObservableList;
|
||||
|
||||
public enum GosTaxonomy {
|
||||
JAVNI("javni", "gos.T.J"),
|
||||
INFORMATIVNO_IZOBRAZEVALNI("informativno-izobraževalni", "gos.T.J.I"),
|
||||
RAZVEDRILNI("razvedrilni", "gos.T.J.R"),
|
||||
NEJAVNI("nejavni", "gos.T.N"),
|
||||
NEZASEBNI("nezasebni", "gos.T.N.N"),
|
||||
ZASEBNI("zasebni", "gos.T.N.Z"),
|
||||
OSEBNI_STIK("osebni stik", "gos.K.O"),
|
||||
TELEFON("telefon", "gos.K.P"),
|
||||
RADIO("radio", "gos.K.R"),
|
||||
TELEVIZIJA("televizija", "gos.K.T");
|
||||
|
||||
|
||||
private final String name;
|
||||
private final String taxonomy;
|
||||
|
||||
private static final ObservableList<String> FOR_COMBO_BOX;
|
||||
|
||||
static {
|
||||
ArrayList<String> values = Arrays.stream(GosTaxonomy.values()).map(x -> x.name).collect(Collectors.toCollection(ArrayList::new));
|
||||
FOR_COMBO_BOX = FXCollections.observableArrayList(values);
|
||||
}
|
||||
|
||||
GosTaxonomy(String name, String taxonomy) {
|
||||
this.name = name;
|
||||
this.taxonomy = taxonomy;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return this.name;
|
||||
}
|
||||
|
||||
public String getTaxonomnyString() {
|
||||
return this.taxonomy;
|
||||
}
|
||||
|
||||
public static GosTaxonomy factory(String tax) {
|
||||
if (tax != null) {
|
||||
if (JAVNI.toString().equals(tax)) {
|
||||
return JAVNI;
|
||||
}
|
||||
if (INFORMATIVNO_IZOBRAZEVALNI.toString().equals(tax)) {
|
||||
return INFORMATIVNO_IZOBRAZEVALNI;
|
||||
}
|
||||
if (RAZVEDRILNI.toString().equals(tax)) {
|
||||
return RAZVEDRILNI;
|
||||
}
|
||||
if (NEJAVNI.toString().equals(tax)) {
|
||||
return NEJAVNI;
|
||||
}
|
||||
if (NEZASEBNI.toString().equals(tax)) {
|
||||
return NEZASEBNI;
|
||||
}
|
||||
if (ZASEBNI.toString().equals(tax)) {
|
||||
return ZASEBNI;
|
||||
}
|
||||
if (OSEBNI_STIK.toString().equals(tax)) {
|
||||
return OSEBNI_STIK;
|
||||
}
|
||||
if (TELEFON.toString().equals(tax)) {
|
||||
return TELEFON;
|
||||
}
|
||||
if (RADIO.toString().equals(tax)) {
|
||||
return RADIO;
|
||||
}
|
||||
if (TELEVIZIJA.toString().equals(tax)) {
|
||||
return TELEVIZIJA;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public static ObservableList<String> getForComboBox() {
|
||||
return FOR_COMBO_BOX;
|
||||
}
|
||||
}
|
56
src/main/java/data/Sentence.java
Normal file
56
src/main/java/data/Sentence.java
Normal file
@ -0,0 +1,56 @@
|
||||
package data;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public class Sentence {
|
||||
|
||||
|
||||
private List<Word> words;
|
||||
private String taksonomija;
|
||||
|
||||
// GOS
|
||||
private String type;
|
||||
private Map<String, String> properties;
|
||||
|
||||
public Sentence(List<Word> words, String taksonomija) {
|
||||
this.words = words;
|
||||
this.taksonomija = taksonomija;
|
||||
}
|
||||
|
||||
public Sentence(List<Word> words) {
|
||||
this.words = words;
|
||||
}
|
||||
|
||||
public Sentence(List<Word> words, String taksonomija, Map<String, String> properties) {
|
||||
this.words = words;
|
||||
this.taksonomija = taksonomija;
|
||||
this.properties = properties;
|
||||
}
|
||||
|
||||
public Sentence(List<Word> words, String taksonomija, String type) {
|
||||
this.words = words;
|
||||
this.taksonomija = taksonomija;
|
||||
this.type = type;
|
||||
}
|
||||
|
||||
public List<Word> getWords() {
|
||||
return words;
|
||||
}
|
||||
|
||||
public String getTaxonomy() {
|
||||
return taksonomija;
|
||||
}
|
||||
|
||||
public List<Word> getSublist(int indexFrom, int indexTo) {
|
||||
return this.words.subList(indexFrom, indexTo);
|
||||
}
|
||||
|
||||
public String getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
public void setType(String type) {
|
||||
this.type = type;
|
||||
}
|
||||
}
|
16
src/main/java/data/Settings.java
Normal file
16
src/main/java/data/Settings.java
Normal file
@ -0,0 +1,16 @@
|
||||
package data;
|
||||
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Collection;
|
||||
|
||||
/**
 * Global application settings plus shared run-time state (selected corpus
 * files and the chosen results path).
 */
public class Settings {
    /** Upper bound on sentences kept per corpus batch. */
    public static final int CORPUS_SENTENCE_LIMIT = 50000;
    public static final boolean PRINT_LOG = false;

    // JavaFX accent styles used to signal validation success/failure in the GUI.
    public static final String FX_ACCENT_OK = "-fx-accent: forestgreen;";
    public static final String FX_ACCENT_NOK = "-fx-accent: red;";

    // Shared mutable state set elsewhere in the application.
    public static Collection<File> corpus;
    public static File resultsFilePath;

    private Settings() {
        // static holder — no instances
    }
}
|
299
src/main/java/data/Statistics.java
Normal file
299
src/main/java/data/Statistics.java
Normal file
@ -0,0 +1,299 @@
|
||||
package data;
|
||||
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.time.LocalDateTime;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import util.Util;
|
||||
import util.db.RDB;
|
||||
|
||||
public class Statistics {
|
||||
private CorpusType corpusType;
|
||||
private AnalysisLevel analysisLevel;
|
||||
private boolean useDB;
|
||||
private RDB db;
|
||||
|
||||
private boolean analysisProducedResults;
|
||||
|
||||
private String taxonomy;
|
||||
private boolean taxonomyIsSet;
|
||||
|
||||
private char JOSType;
|
||||
private boolean JOSTypeIsSet;
|
||||
|
||||
private String resultTitle;
|
||||
public Map<String, AtomicLong> result = new ConcurrentHashMap<>();
|
||||
|
||||
// nGrams
|
||||
private int nGramLevel;
|
||||
private Integer skip;
|
||||
private CalculateFor cf;
|
||||
private List<Pattern> morphosyntacticFilter;
|
||||
|
||||
// distributions
|
||||
private String distributionTaxonomy;
|
||||
private char distributionJosWordType;
|
||||
private boolean vcc;
|
||||
private Integer substringLength;
|
||||
|
||||
// inflected JOS
|
||||
private String inflectedJosTaxonomy;
|
||||
|
||||
// GOS
|
||||
boolean gosOrthMode;
|
||||
|
||||
// šolar
|
||||
Map<String, Object> solarHeadBlockFilter;
|
||||
|
||||
|
||||
// for ngrams
|
||||
public Statistics(AnalysisLevel al, int nGramLevel, Integer skip, CalculateFor cf) {
|
||||
String dateTime = LocalDateTime.now().format(DateTimeFormatter.ofPattern("dd.MM.yyyy_hh.mm"));
|
||||
this.cf = cf;
|
||||
this.analysisLevel = al;
|
||||
this.nGramLevel = nGramLevel;
|
||||
this.skip = skip == null || skip == 0 ? null : skip;
|
||||
|
||||
this.resultTitle = String.format("%s%d-gram_%s_%s",
|
||||
this.skip != null ? String.format("%d-%s-", skip, "skip") : "",
|
||||
nGramLevel,
|
||||
cf.toString(),
|
||||
dateTime);
|
||||
}
|
||||
|
||||
// for words distributions
|
||||
public Statistics(AnalysisLevel al, Taxonomy distributionTaxonomy, GigafidaJosWordType distributionJosWordType, CalculateFor cf) {
|
||||
String dateTime = LocalDateTime.now().format(DateTimeFormatter.ofPattern("dd.MM.yyyy_hh.mm"));
|
||||
|
||||
this.resultTitle = String.format("%s_%s_%s",
|
||||
distributionTaxonomy != null ? distributionTaxonomy.toString() : "",
|
||||
distributionJosWordType != null ? distributionJosWordType.toString() : "",
|
||||
dateTime);
|
||||
|
||||
this.analysisLevel = al;
|
||||
this.cf = cf;
|
||||
this.distributionTaxonomy = distributionTaxonomy != null ? distributionTaxonomy.getTaxonomnyString() : null;
|
||||
this.taxonomyIsSet = distributionTaxonomy != null;
|
||||
|
||||
this.JOSTypeIsSet = distributionJosWordType != null;
|
||||
this.distributionJosWordType = this.JOSTypeIsSet ? distributionJosWordType.getWordType() : ' ';
|
||||
}
|
||||
|
||||
public Statistics(AnalysisLevel al, CalculateFor cf, Integer substringLength) {
|
||||
String dateTime = LocalDateTime.now().format(DateTimeFormatter.ofPattern("dd.MM.yyyy_hh.mm"));
|
||||
|
||||
this.resultTitle = String.format("%s_%d_%s",
|
||||
"Distribucija zaporedij samoglasnikov in soglasnikov",
|
||||
substringLength,
|
||||
dateTime);
|
||||
|
||||
this.analysisLevel = al;
|
||||
this.cf = cf;
|
||||
this.substringLength = substringLength;
|
||||
this.vcc = true;
|
||||
}
|
||||
|
||||
public Statistics(AnalysisLevel al, Taxonomy inflectedJosTaxonomy) {
|
||||
String dateTime = LocalDateTime.now().format(DateTimeFormatter.ofPattern("dd.MM.yyyy_hh.mm"));
|
||||
|
||||
this.resultTitle = String.format("InflectedJOS_%s_%s",
|
||||
distributionTaxonomy != null ? distributionTaxonomy : "",
|
||||
dateTime);
|
||||
|
||||
this.analysisLevel = al;
|
||||
this.inflectedJosTaxonomy = inflectedJosTaxonomy != null ? inflectedJosTaxonomy.getTaxonomnyString() : null;
|
||||
this.taxonomyIsSet = inflectedJosTaxonomy != null;
|
||||
}
|
||||
|
||||
public Integer getSkip() {
|
||||
return skip;
|
||||
}
|
||||
|
||||
public Integer getSubstringLength() {
|
||||
return substringLength;
|
||||
}
|
||||
|
||||
public String getInflectedJosTaxonomy() {
|
||||
return inflectedJosTaxonomy;
|
||||
}
|
||||
|
||||
public void setSubstringLength(Integer substringLength) {
|
||||
this.substringLength = substringLength;
|
||||
}
|
||||
|
||||
public boolean isVcc() {
|
||||
return vcc;
|
||||
}
|
||||
|
||||
public void setVcc(boolean vcc) {
|
||||
this.vcc = vcc;
|
||||
}
|
||||
|
||||
public String getDistributionTaxonomy() {
|
||||
return distributionTaxonomy;
|
||||
}
|
||||
|
||||
public void setDistributionTaxonomy(String distributionTaxonomy) {
|
||||
this.distributionTaxonomy = distributionTaxonomy;
|
||||
}
|
||||
|
||||
public char getDistributionJosWordType() {
|
||||
return distributionJosWordType;
|
||||
}
|
||||
|
||||
public void setDistributionJosWordType(char distributionJosWordType) {
|
||||
this.distributionJosWordType = distributionJosWordType;
|
||||
}
|
||||
|
||||
public void setMorphosyntacticFilter(List<String> morphosyntacticFilter) {
|
||||
// change filter strings to regex patterns
|
||||
this.morphosyntacticFilter = new ArrayList<>();
|
||||
for (String s : morphosyntacticFilter) {
|
||||
this.morphosyntacticFilter.add(Pattern.compile(s.replaceAll("\\*", ".")));
|
||||
}
|
||||
}
|
||||
|
||||
public List<Pattern> getMsd() {
|
||||
return morphosyntacticFilter;
|
||||
}
|
||||
|
||||
public Map<String, AtomicLong> getResult() {
|
||||
return result;
|
||||
}
|
||||
|
||||
public void setTaxonomy(String taxonomy) {
|
||||
this.taxonomy = taxonomy;
|
||||
}
|
||||
|
||||
public void setTaxonomyIsSet(boolean taxonomyIsSet) {
|
||||
this.taxonomyIsSet = taxonomyIsSet;
|
||||
}
|
||||
|
||||
public char getJOSType() {
|
||||
return JOSType;
|
||||
}
|
||||
|
||||
public void setJOSType(char JOSType) {
|
||||
this.JOSType = JOSType;
|
||||
}
|
||||
|
||||
public boolean isJOSTypeSet() {
|
||||
return JOSTypeIsSet;
|
||||
}
|
||||
|
||||
public void setJOSType(boolean JOSTypeIsSet) {
|
||||
this.JOSTypeIsSet = JOSTypeIsSet;
|
||||
}
|
||||
|
||||
public void saveResultToDisk(int... limit) throws UnsupportedEncodingException {
|
||||
// Set<Pair<String, Map<String, Long>>> stats = new HashSet<>();
|
||||
//
|
||||
// if (useDB) {
|
||||
// result = db.getDump();
|
||||
// db.delete();
|
||||
// }
|
||||
//
|
||||
// // if no results and nothing to save, return false
|
||||
// if (!(result.size() > 0)) {
|
||||
// analysisProducedResults = false;
|
||||
// return;
|
||||
// } else {
|
||||
// analysisProducedResults = true;
|
||||
// }
|
||||
//
|
||||
// stats.add(ImmutablePair.of(resultTitle, getSortedResult(result, Util.getValidInt(limit))));
|
||||
// Export.SetToCSV(stats);
|
||||
}
|
||||
|
||||
// private Map<String, Integer> getSortedResultInflected(Map map) {
|
||||
// // first convert to <String, Integer>
|
||||
// Map<String, Integer> m = Util.sortByValue(Util.atomicInt2StringAndInt(map), 0);
|
||||
//
|
||||
// Map<String, Integer> sortedM = new TreeMap<>();
|
||||
//
|
||||
// sortedM.putAll(m);
|
||||
//
|
||||
// return sortedM;
|
||||
// }
|
||||
|
||||
private Map<String, Long> getSortedResult(Map<String, AtomicLong> map, int limit) {
|
||||
return Util.sortByValue(Util.atomicInt2StringAndInt(map), limit);
|
||||
}
|
||||
|
||||
public String getTaxonomy() {
|
||||
return taxonomy;
|
||||
}
|
||||
|
||||
public boolean isTaxonomySet() {
|
||||
return taxonomyIsSet;
|
||||
}
|
||||
|
||||
public int getnGramLevel() {
|
||||
return nGramLevel;
|
||||
}
|
||||
|
||||
public CalculateFor getCf() {
|
||||
return cf;
|
||||
}
|
||||
|
||||
public AnalysisLevel getAnalysisLevel() {
|
||||
return analysisLevel;
|
||||
}
|
||||
|
||||
public CorpusType getCorpusType() {
|
||||
return corpusType;
|
||||
}
|
||||
|
||||
public void setCorpusType(CorpusType corpusType) {
|
||||
this.corpusType = corpusType;
|
||||
}
|
||||
|
||||
public boolean isGosOrthMode() {
|
||||
return gosOrthMode;
|
||||
}
|
||||
|
||||
public void setGosOrthMode(boolean gosOrthMode) {
|
||||
this.gosOrthMode = gosOrthMode;
|
||||
}
|
||||
|
||||
public Map<String, Object> getSolarHeadBlockFilter() {
|
||||
return solarHeadBlockFilter;
|
||||
}
|
||||
|
||||
public void setSolarHeadBlockFilter(Map<String, Object> solarHeadBlockFilter) {
|
||||
this.solarHeadBlockFilter = solarHeadBlockFilter;
|
||||
}
|
||||
|
||||
public boolean isUseDB() {
|
||||
return useDB;
|
||||
}
|
||||
|
||||
public void setUseDB(boolean useDB) {
|
||||
if (useDB && db == null) {
|
||||
db = new RDB();
|
||||
}
|
||||
this.useDB = useDB;
|
||||
}
|
||||
|
||||
/**
|
||||
* Stores results from this batch to a database and clears results map
|
||||
*/
|
||||
public void storeTmpResultsToDB() {
|
||||
try {
|
||||
db.writeBatch(result);
|
||||
result = new ConcurrentHashMap<>();
|
||||
} catch (UnsupportedEncodingException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isAnalysisProducedResults() {
|
||||
return analysisProducedResults;
|
||||
}
|
||||
}
|
409
src/main/java/data/StatisticsNew.java
Normal file
409
src/main/java/data/StatisticsNew.java
Normal file
@ -0,0 +1,409 @@
|
||||
package data;
|
||||
|
||||
import static gui.ValidationUtil.*;
|
||||
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.time.LocalDateTime;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.lang3.tuple.ImmutablePair;
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
|
||||
import alg.inflectedJOS.WordFormation;
|
||||
import data.Enums.WordLevelType;
|
||||
import javafx.collections.ObservableList;
|
||||
import util.Export;
|
||||
import util.Util;
|
||||
import util.db.RDB;
|
||||
|
||||
@SuppressWarnings("Duplicates")
|
||||
public class StatisticsNew {
|
||||
public final static Logger logger = LogManager.getLogger(StatisticsNew.class);
|
||||
|
||||
private Corpus corpus;
|
||||
private Filter filter;
|
||||
|
||||
private String resultTitle;
|
||||
private Map<String, AtomicLong> result;
|
||||
private Object[][] resultCustom; // for when calculating percentages that don't add up to 100%
|
||||
private Map<String, ConcurrentHashMap<String, AtomicLong>> resultNestedSuffix;
|
||||
private Map<String, ConcurrentHashMap<String, AtomicLong>> resultNestedPrefix;
|
||||
private boolean useDB;
|
||||
private RDB db;
|
||||
private boolean analysisProducedResults;
|
||||
private LocalDateTime time;
|
||||
|
||||
public StatisticsNew(Corpus corpus, Filter filter, boolean useDB) {
|
||||
this.corpus = corpus;
|
||||
this.filter = filter;
|
||||
|
||||
if (useDB) {
|
||||
this.useDB = true;
|
||||
db = new RDB();
|
||||
}
|
||||
|
||||
if (filter.getAl() == AnalysisLevel.WORD_LEVEL) {
|
||||
resultNestedSuffix = new ConcurrentHashMap<>();
|
||||
resultNestedPrefix = new ConcurrentHashMap<>();
|
||||
} else {
|
||||
result = new ConcurrentHashMap<>();
|
||||
}
|
||||
|
||||
resultTitle = generateResultTitle();
|
||||
|
||||
logger.debug(toString());
|
||||
}
|
||||
|
||||
/**
|
||||
* Result's title consists of:
|
||||
* <ul>
|
||||
* <li>Corpus type</li>
|
||||
* <li>Analysis level</li>
|
||||
* <li>Calculate for</li>
|
||||
* <li></li>
|
||||
* <li></li>
|
||||
* <li></li>
|
||||
* <li></li>
|
||||
* </ul>
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
private String generateResultTitle() {
|
||||
String separator = "_";
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
|
||||
Integer ngramLevel = filter.getNgramValue();
|
||||
if(ngramLevel == 0) {
|
||||
sb.append("Crke").
|
||||
append(separator)
|
||||
.append(corpus.getCorpusType().toString())
|
||||
.append(separator);
|
||||
} else if(ngramLevel == 1) {
|
||||
sb.append("Besede").append(separator)
|
||||
.append(corpus.getCorpusType().toString())
|
||||
.append(separator);
|
||||
}
|
||||
else {
|
||||
sb.append(filter.getAl().toString())
|
||||
.append(separator)
|
||||
.append(corpus.getCorpusType().toString())
|
||||
.append(separator);
|
||||
sb.append(filter.getCalculateFor().toString())
|
||||
.append(separator);
|
||||
// ngram value
|
||||
sb.append(filter.getNgramValue()).append("-gram")
|
||||
.append(separator);
|
||||
sb.append(filter.getSkipValue()).append("-preskok")
|
||||
.append(separator);
|
||||
}
|
||||
// TODO: assure skip is not null but zero
|
||||
|
||||
} else {
|
||||
sb.append(filter.getAl().toString()) // analysis level
|
||||
.append(separator)
|
||||
.append(corpus.getCorpusType().toString())
|
||||
.append(separator);
|
||||
}
|
||||
// skip value
|
||||
// msd ?
|
||||
// if taxonomy -> taxonomy
|
||||
// if cvv -> cvv + dolžina
|
||||
|
||||
this.time = this.time != null ? this.time : LocalDateTime.now();
|
||||
|
||||
sb.append(time.format(DateTimeFormatter.ofPattern("dd.MM.yyyy_hh.mm.ss")));
|
||||
return sb.toString();
|
||||
|
||||
}
|
||||
|
||||
public boolean isAnalysisProducedResults() {
|
||||
return analysisProducedResults;
|
||||
}
|
||||
|
||||
public void setAnalysisProducedResults(boolean analysisProducedResults) {
|
||||
this.analysisProducedResults = analysisProducedResults;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
String newLine = "\n\t- ";
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append(newLine).append("Statistic properties:");
|
||||
sb.append(newLine).append(corpus.getCorpusType().toString()).append(String.format(" (%d files)", corpus.getDetectedCorpusFiles().size()));
|
||||
sb.append(newLine).append(useDB ? "use DB" : "run in memory");
|
||||
sb.append(newLine).append(filter.toString());
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public String getResultTitle() {
|
||||
return resultTitle;
|
||||
}
|
||||
|
||||
// ****************************************
|
||||
// ***************** util *****************
|
||||
// ****************************************
|
||||
|
||||
/**
|
||||
* Stores results from this batch to a database and clears results map
|
||||
*/
|
||||
public void storeTmpResultsToDB() {
|
||||
try {
|
||||
db.writeBatch(result);
|
||||
result = new ConcurrentHashMap<>();
|
||||
} catch (UnsupportedEncodingException e) {
|
||||
logger.error("Store tmp results to DB", e);
|
||||
// e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
public Filter getFilter() {
|
||||
return filter;
|
||||
}
|
||||
|
||||
public Corpus getCorpus() {
|
||||
return corpus;
|
||||
}
|
||||
|
||||
public boolean saveResultToDisk(int... limit) throws UnsupportedEncodingException {
|
||||
Set<Pair<String, Map<String, Long>>> stats = new HashSet<>();
|
||||
|
||||
if (useDB) {
|
||||
result = db.getDump();
|
||||
db.delete();
|
||||
}
|
||||
|
||||
// if no results and nothing to save, return false
|
||||
if (!(result.size() > 0)) {
|
||||
analysisProducedResults = false;
|
||||
return false;
|
||||
} else {
|
||||
analysisProducedResults = true;
|
||||
}
|
||||
|
||||
stats.add(ImmutablePair.of(resultTitle, getSortedResult(result, Util.getValidInt(limit))));
|
||||
Export.SetToCSV(stats, corpus.getChosenResultsLocation(), headerInfoBlock());
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean saveResultNestedToDisk(int... limit) throws UnsupportedEncodingException {
|
||||
resultTitle = generateResultTitle();
|
||||
|
||||
if (useDB) {
|
||||
result = db.getDump();
|
||||
db.delete();
|
||||
}
|
||||
Map<WordLevelType, Map<String, Map<String, Long>>> results = new HashMap<>();
|
||||
|
||||
if (!isEmpty(resultNestedSuffix)) {
|
||||
results.put(WordLevelType.SUFFIX, sortNestedMap(resultNestedSuffix, Util.getValidInt(limit)));
|
||||
}
|
||||
|
||||
if (!isEmpty(resultNestedPrefix)) {
|
||||
results.put(WordLevelType.PREFIX, sortNestedMap(resultNestedPrefix, Util.getValidInt(limit)));
|
||||
}
|
||||
|
||||
// if no results and nothing to save, return false
|
||||
if (!(results.size() > 0)) {
|
||||
analysisProducedResults = false;
|
||||
return false;
|
||||
} else {
|
||||
analysisProducedResults = true;
|
||||
}
|
||||
|
||||
Export.nestedMapToCSV(resultTitle, results, corpus.getChosenResultsLocation(), headerInfoBlock());
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean recalculateAndSaveResultToDisk() throws UnsupportedEncodingException {
|
||||
filter.setAl(AnalysisLevel.WORD_FORMATION);
|
||||
resultTitle = generateResultTitle();
|
||||
|
||||
if (useDB) {
|
||||
result = db.getDump();
|
||||
db.delete();
|
||||
}
|
||||
|
||||
// if no results and nothing to save, return false
|
||||
if (!(result.size() > 0)) {
|
||||
analysisProducedResults = false;
|
||||
return false;
|
||||
} else {
|
||||
analysisProducedResults = true;
|
||||
}
|
||||
|
||||
WordFormation.calculateStatistics(this);
|
||||
|
||||
Export.SetToCSV(resultTitle, resultCustom, corpus.getChosenResultsLocation(), headerInfoBlock());
|
||||
return true;
|
||||
}
|
||||
|
||||
private Map<String, Map<String, Long>> sortNestedMap(Map<String, ConcurrentHashMap<String, AtomicLong>> nestedMap, int limit) {
|
||||
Map<String, Map<String, Long>> sorted = new HashMap<>();
|
||||
|
||||
for (String s : nestedMap.keySet()) {
|
||||
sorted.put(s, getSortedResult(nestedMap.get(s), Util.getValidInt(limit)));
|
||||
}
|
||||
|
||||
return sorted;
|
||||
}
|
||||
|
||||
|
||||
private Map<String, Long> getSortedResult(Map<String, AtomicLong> map, int limit) {
|
||||
return Util.sortByValue(Util.atomicInt2StringAndInt(map), limit);
|
||||
}
|
||||
|
||||
public void updateResults(String o) {
|
||||
// if not in map
|
||||
AtomicLong r = result.putIfAbsent(o, new AtomicLong(1));
|
||||
|
||||
// else
|
||||
if (r != null)
|
||||
result.get(o).incrementAndGet();
|
||||
}
|
||||
|
||||
public Map<String, AtomicLong> getResult() {
|
||||
return result;
|
||||
}
|
||||
|
||||
public Object[][] getResultCustom() {
|
||||
return resultCustom;
|
||||
}
|
||||
|
||||
public void setResultCustom(Object[][] resultCustom) {
|
||||
this.resultCustom = resultCustom;
|
||||
}
|
||||
|
||||
public void updateResultsNested(WordLevelType type, String key, String stringValue) {
|
||||
ConcurrentHashMap<String, ConcurrentHashMap<String, AtomicLong>> resultsMap;
|
||||
|
||||
if (type == WordLevelType.SUFFIX) {
|
||||
updateResultsNestedSuffix(key, stringValue);
|
||||
} else if (type == WordLevelType.PREFIX) {
|
||||
updateResultsNestedPrefix(key, stringValue);
|
||||
}
|
||||
}
|
||||
|
||||
public void updateResultsNestedSuffix(String key, String stringValue) {
|
||||
if (resultNestedSuffix.containsKey(key)) {
|
||||
// if not in map
|
||||
AtomicLong r = resultNestedSuffix.get(key).putIfAbsent(stringValue, new AtomicLong(1));
|
||||
|
||||
// else
|
||||
if (r != null) {
|
||||
resultNestedSuffix.get(key).get(stringValue).incrementAndGet();
|
||||
}
|
||||
} else {
|
||||
resultNestedSuffix.putIfAbsent(key, new ConcurrentHashMap<>());
|
||||
AtomicLong r = resultNestedSuffix.get(key).putIfAbsent(stringValue, new AtomicLong(1));
|
||||
|
||||
if (r != null) {
|
||||
resultNestedSuffix.get(key).get(stringValue).incrementAndGet();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void updateResultsNestedPrefix(String key, String stringValue) {
|
||||
if (resultNestedPrefix.containsKey(key)) {
|
||||
// if not in map
|
||||
AtomicLong r = resultNestedPrefix.get(key).putIfAbsent(stringValue, new AtomicLong(1));
|
||||
|
||||
// else
|
||||
if (r != null) {
|
||||
resultNestedPrefix.get(key).get(stringValue).incrementAndGet();
|
||||
}
|
||||
} else {
|
||||
resultNestedPrefix.putIfAbsent(key, new ConcurrentHashMap<>());
|
||||
AtomicLong r = resultNestedPrefix.get(key).putIfAbsent(stringValue, new AtomicLong(1));
|
||||
|
||||
if (r != null) {
|
||||
resultNestedPrefix.get(key).get(stringValue).incrementAndGet();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private LinkedHashMap<String, String> headerInfoBlock() {
|
||||
LinkedHashMap<String, String> info = new LinkedHashMap<>();
|
||||
|
||||
info.put("Korpus:", corpus.getCorpusType().toString());
|
||||
info.put("Datum:", time.format(DateTimeFormatter.ofPattern("dd.MM.yyyy hh:mm")));
|
||||
if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
|
||||
Integer ngramLevel = filter.getNgramValue();
|
||||
if (ngramLevel == 0)
|
||||
info.put("Analiza:", "Črke");
|
||||
else if (ngramLevel == 1)
|
||||
info.put("Analiza", "Besede");
|
||||
else
|
||||
info.put("Analiza:", filter.getAl().toString());
|
||||
} else {
|
||||
info.put("Analiza:", filter.getAl().toString());
|
||||
}
|
||||
|
||||
if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
|
||||
Integer ngramLevel = filter.getNgramValue();
|
||||
|
||||
// n.gram nivo
|
||||
if (ngramLevel > 1) {
|
||||
info.put("n-gram nivo:", String.valueOf(ngramLevel));
|
||||
} else if (ngramLevel == 1){
|
||||
info.put("n-gram nivo:", "nivo besed");
|
||||
} else {
|
||||
info.put("n-gram nivo:", "nivo črk");
|
||||
}
|
||||
// skip
|
||||
if (ngramLevel > 1)
|
||||
info.put("Skip:", isNotEmpty(filter.getSkipValue()) ? filter.getSkipValue().toString() : "0");
|
||||
|
||||
// izračunaj za
|
||||
info.put("Izračunaj za:", filter.getCalculateFor().toString());
|
||||
|
||||
// msd
|
||||
if (!isEmpty(filter.getMsd())) {
|
||||
StringBuilder msdPattern = new StringBuilder();
|
||||
for (Pattern pattern : filter.getMsd()) {
|
||||
msdPattern.append(pattern.toString()).append(" ");
|
||||
}
|
||||
|
||||
info.put("MSD:", msdPattern.toString());
|
||||
}
|
||||
|
||||
// taksonomija
|
||||
if (!isEmpty(filter.getTaxonomy())) {
|
||||
info.put("Taksonomija:", StringUtils.join(filter.getTaxonomy(), ", "));
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
if (isNotEmpty(filter.getTaxonomy()) && Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
|
||||
ArrayList<String> tax = Tax.getTaxonomyForInfo(corpus.getCorpusType(), filter.getTaxonomy());
|
||||
|
||||
info.put("Taksonomija: ", "");
|
||||
String sep = "";
|
||||
for (String s : tax) {
|
||||
info.put(sep = sep + " ", s);
|
||||
}
|
||||
}
|
||||
|
||||
if (corpus.getCorpusType() == CorpusType.SOLAR) {
|
||||
HashMap<String, ObservableList<String>> filters = corpus.getSolarFilters();
|
||||
|
||||
if (!isEmpty(filters)) {
|
||||
info.put("Dodatni filtri: ", "");
|
||||
|
||||
for (Map.Entry<String, ObservableList<String>> f : filters.entrySet()) {
|
||||
info.put(f.getKey(), StringUtils.join(f.getValue(), ", "));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return info;
|
||||
}
|
||||
}
|
175
src/main/java/data/Tax.java
Normal file
175
src/main/java/data/Tax.java
Normal file
@ -0,0 +1,175 @@
|
||||
package data;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import gui.ValidationUtil;
|
||||
import javafx.collections.FXCollections;
|
||||
import javafx.collections.ObservableList;
|
||||
|
||||
public class Tax {
|
||||
private static LinkedHashMap<String, String> GIGAFIDA_TAXONOMY;
|
||||
private static LinkedHashMap<String, String> GOS_TAXONOMY;
|
||||
private static final HashSet<CorpusType> corpusTypesWithTaxonomy = new HashSet<>(Arrays.asList(CorpusType.GIGAFIDA, CorpusType.GOS, CorpusType.CCKRES));
|
||||
|
||||
static {
|
||||
// GIGAFIDA ----------------------------
|
||||
GIGAFIDA_TAXONOMY = new LinkedHashMap<>();
|
||||
|
||||
GIGAFIDA_TAXONOMY.put("SSJ.T", "tisk");
|
||||
GIGAFIDA_TAXONOMY.put("SSJ.T.K", "tisk-knjižno");
|
||||
GIGAFIDA_TAXONOMY.put("SSJ.T.K.L", "tisk-knjižno-leposlovno");
|
||||
GIGAFIDA_TAXONOMY.put("SSJ.T.K.S", "tisk-knjižno-strokovno");
|
||||
GIGAFIDA_TAXONOMY.put("SSJ.T.P", "tisk-periodično");
|
||||
GIGAFIDA_TAXONOMY.put("SSJ.T.P.C", "tisk-periodično-časopis");
|
||||
GIGAFIDA_TAXONOMY.put("SSJ.T.P.R", "tisk-periodično-revija");
|
||||
GIGAFIDA_TAXONOMY.put("SSJ.T.D", "tisk-drugo");
|
||||
GIGAFIDA_TAXONOMY.put("SSJ.I", "internet");
|
||||
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P", "prenosnik");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.G", "prenosnik-govorni");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.E", "prenosnik-elektronski");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P", "prenosnik-pisni");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P.O", "prenosnik-pisni-objavljeno");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.K", "prenosnik-pisni-objavljeno-knjižno");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P", "prenosnik-pisni-objavljeno-periodično");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C", "prenosnik-pisni-objavljeno-periodično-časopisno");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C.D", "prenosnik-pisni-objavljeno-periodično-časopisno-dnevno");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C.V", "prenosnik-pisni-objavljeno-periodično-časopisno-večkrat tedensko");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C.T", "prenosnik-pisni-objavljeno-periodično-časopisno-tedensko");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R", "prenosnik-pisni-objavljeno-periodično-revialno");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.T", "prenosnik-pisni-objavljeno-periodično-revialno-tedensko");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.S", "prenosnik-pisni-objavljeno-periodično-revialno-štirinajstdnevno");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.M", "prenosnik-pisni-objavljeno-periodično-revialno-mesečno");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.D", "prenosnik-pisni-objavljeno-periodično-revialno-redkeje kot na mesec");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.O", "prenosnik-pisni-objavljeno-periodično-revialno-občasno");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P.N", "prenosnik-pisni-neobjavljeno");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P.N.J", "prenosnik-pisni-neobjavljeno-javno");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P.N.I", "prenosnik-pisni-neobjavljeno-interno");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P.N.Z", "prenosnik-pisni-neobjavljeno-zasebno");
|
||||
|
||||
GIGAFIDA_TAXONOMY.put("Ft.Z", "zvrst");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.Z.U", "zvrst-umetnostna");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.Z.U.P", "zvrst-umetnostna-pesniška");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.Z.U.R", "zvrst-umetnostna-prozna");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.Z.U.D", "zvrst-umetnostna-dramska");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.Z.N", "zvrst-neumetnostna");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.Z.N.S", "zvrst-neumetnostna-strokovna");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.Z.N.S.H", "zvrst-neumetnostna-strokovna-humanistična in družboslovna");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.Z.N.S.N", "zvrst-neumetnostna-strokovna-naravoslovna in tehnična");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.Z.N.N", "zvrst-neumetnostna-nestrokovna");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.Z.N.P", "zvrst-neumetnostna-pravna");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.L", "zvrst-lektorirano");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.L.D", "zvrst-lektorirano-da");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.L.N", "zvrst-lektorirano-ne");
|
||||
|
||||
// GOS ----------------------------------
|
||||
GOS_TAXONOMY = new LinkedHashMap<>();
|
||||
|
||||
GOS_TAXONOMY.put("gos.T", "diskurz");
|
||||
GOS_TAXONOMY.put("gos.T.J", "diskurz-javni");
|
||||
GOS_TAXONOMY.put("gos.T.J.I", "diskurz-javni-informativno-izobraževalni");
|
||||
GOS_TAXONOMY.put("gos.T.J.R", "diskurz-javni-razvedrilni");
|
||||
GOS_TAXONOMY.put("gos.T.N", "diskurz-nejavni");
|
||||
GOS_TAXONOMY.put("gos.T.N.N", "diskurz-nejavni-nezasebni");
|
||||
GOS_TAXONOMY.put("gos.T.N.Z", "diskurz-nejavni-zasebni");
|
||||
|
||||
GOS_TAXONOMY.put("gos.S", "situacija");
|
||||
GOS_TAXONOMY.put("gos.S.R", "situacija-radio");
|
||||
GOS_TAXONOMY.put("gos.S.T", "situacija-televizija");
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the whole default taxonomy for the specified corpus type
|
||||
*/
|
||||
public static ObservableList<String> getTaxonomyForComboBox(CorpusType corpusType) {
|
||||
if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES) {
|
||||
return FXCollections.observableArrayList(GIGAFIDA_TAXONOMY.values());
|
||||
} else if (corpusType == CorpusType.GOS) {
|
||||
return FXCollections.observableArrayList(GOS_TAXONOMY.values());
|
||||
}
|
||||
|
||||
return FXCollections.observableArrayList(new ArrayList<>());
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns taxonomy names only for items found in headers
|
||||
*/
|
||||
public static ObservableList<String> getTaxonomyForComboBox(CorpusType corpusType, HashSet<String> foundTax) {
|
||||
LinkedHashMap<String, String> tax = new LinkedHashMap<>();
|
||||
|
||||
if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES) {
|
||||
tax = GIGAFIDA_TAXONOMY;
|
||||
} else if (corpusType == CorpusType.GOS) {
|
||||
tax = GOS_TAXONOMY;
|
||||
}
|
||||
|
||||
ArrayList<String> taxForCombo = new ArrayList<>();
|
||||
|
||||
// assures same relative order
|
||||
for (String t : tax.keySet()) {
|
||||
if (foundTax.contains(t)) {
|
||||
taxForCombo.add(tax.get(t));
|
||||
}
|
||||
}
|
||||
|
||||
return FXCollections.observableArrayList(taxForCombo);
|
||||
}
|
||||
|
||||
public static HashSet<CorpusType> getCorpusTypesWithTaxonomy() {
|
||||
return corpusTypesWithTaxonomy;
|
||||
}
|
||||
|
||||
public static ArrayList<String> getTaxonomyCodes(ArrayList<String> taxonomyNames, CorpusType corpusType) {
|
||||
ArrayList<String> result = new ArrayList<>();
|
||||
|
||||
if (ValidationUtil.isEmpty(taxonomyNames)) {
|
||||
return result;
|
||||
}
|
||||
|
||||
LinkedHashMap<String, String> tax = new LinkedHashMap<>();
|
||||
|
||||
if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES) {
|
||||
tax = GIGAFIDA_TAXONOMY;
|
||||
} else if (corpusType == CorpusType.GOS) {
|
||||
tax = GOS_TAXONOMY;
|
||||
}
|
||||
|
||||
// for easier lookup
|
||||
Map<String, String> taxInversed = tax.entrySet()
|
||||
.stream()
|
||||
.collect(Collectors.toMap(Map.Entry::getValue, Map.Entry::getKey));
|
||||
|
||||
for (String taxonomyName : taxonomyNames) {
|
||||
result.add(taxInversed.get(taxonomyName));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a list of proper names for codes
|
||||
*
|
||||
* @param corpusType
|
||||
* @param taxonomy
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public static ArrayList<String> getTaxonomyForInfo(CorpusType corpusType, ArrayList<String> taxonomy) {
|
||||
LinkedHashMap<String, String> tax = new LinkedHashMap<>();
|
||||
|
||||
if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES) {
|
||||
tax = GIGAFIDA_TAXONOMY;
|
||||
} else if (corpusType == CorpusType.GOS) {
|
||||
tax = GOS_TAXONOMY;
|
||||
}
|
||||
|
||||
ArrayList<String> result = new ArrayList<>();
|
||||
|
||||
for (String t : taxonomy) {
|
||||
result.add(tax.get(t));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
}
|
171
src/main/java/data/Taxonomy.java
Normal file
171
src/main/java/data/Taxonomy.java
Normal file
@ -0,0 +1,171 @@
|
||||
package data;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import javafx.collections.FXCollections;
|
||||
import javafx.collections.ObservableList;
|
||||
|
||||
public enum Taxonomy {
|
||||
// GOS
|
||||
JAVNI("javni", "T.J", "gos"),
|
||||
INFORMATIVNO_IZOBRAZEVALNI("informativno-izobraževalni", "T.J.I", "gos"),
|
||||
RAZVEDRILNI("razvedrilni", "T.J.R", "gos"),
|
||||
NEJAVNI("nejavni", "T.N", "gos"),
|
||||
NEZASEBNI("nezasebni", "T.N.N", "gos"),
|
||||
ZASEBNI("zasebni", "T.N.Z", "gos"),
|
||||
OSEBNI_STIK("osebni stik", "K.O", "gos"),
|
||||
TELEFON("telefon", "K.P", "gos"),
|
||||
RADIO("radio", "K.R", "gos"),
|
||||
TELEVIZIJA("televizija", "K.T", "gos"),
|
||||
// Gigafida
|
||||
KNJIZNO("knjižno", "T.K", "gigafida"),
|
||||
LEPOSLOVNO("leposlovno", "T.K.L", "gigafida"),
|
||||
STROKOVNO("strokovno", "T.K.S", "gigafida"),
|
||||
PERIODICNO("periodično", "T.P", "gigafida"),
|
||||
CASOPIS("časopis", "T.P.C", "gigafida"),
|
||||
REVIJA("revija", "T.P.R", "gigafida"),
|
||||
INTERNET("internet", "I", "gigafida"),
|
||||
|
||||
SSJ_TISK("tisk", "SSJ.T", "gigafida"),
|
||||
SSJ_KNJIZNO("opis", "identifikator", "gigafida"),
|
||||
SSJ_LEPOSLOVNO("opis", "identifikator", "gigafida"),
|
||||
SSJ_STROKOVNO("opis", "identifikator", "gigafida"),
|
||||
SSJ_PERIODICNO("opis", "identifikator", "gigafida"),
|
||||
SSJ_CASOPIS("opis", "identifikator", "gigafida"),
|
||||
SSJ_REVIJA("opis", "identifikator", "gigafida"),
|
||||
SSJ_DRUGO("opis", "identifikator", "gigafida"),
|
||||
SSJ_INTERNET("opis", "identifikator", "gigafida"),
|
||||
FT_P_PRENOSNIK("opis", "identifikator", "gigafida"),
|
||||
FT_P_GOVORNI("opis", "identifikator", "gigafida"),
|
||||
FT_P_ELEKTRONSKI("opis", "identifikator", "gigafida"),
|
||||
FT_P_PISNI("opis", "identifikator", "gigafida"),
|
||||
FT_P_OBJAVLJENO("opis", "identifikator", "gigafida"),
|
||||
FT_P_KNJIZNO("opis", "identifikator", "gigafida"),
|
||||
FT_P_PERIODICNO("opis", "identifikator", "gigafida"),
|
||||
FT_P_CASOPISNO("opis", "identifikator", "gigafida"),
|
||||
FT_P_DNEVNO("opis", "identifikator", "gigafida"),
|
||||
FT_P_VECKRAT_TEDENSKO("opis", "identifikator", "gigafida"),
|
||||
// FT_P_TEDENSKO("opis", "identifikator", "gigafida"),
|
||||
FT_P_REVIALNO("opis", "identifikator", "gigafida"),
|
||||
FT_P_TEDENSKO("opis", "identifikator", "gigafida"),
|
||||
FT_P_STIRINAJSTDNEVNO("opis", "identifikator", "gigafida"),
|
||||
FT_P_MESECNO("opis", "identifikator", "gigafida"),
|
||||
FT_P_REDKEJE_KOT_MESECNO("opis", "identifikator", "gigafida"),
|
||||
FT_P_OBCASNO("opis", "identifikator", "gigafida"),
|
||||
FT_P_NEOBJAVLJENO("opis", "identifikator", "gigafida"),
|
||||
FT_P_JAVNO("opis", "identifikator", "gigafida"),
|
||||
FT_P_INTERNO("opis", "identifikator", "gigafida"),
|
||||
FT_P_ZASEBNO("opis", "identifikator", "gigafida"),
|
||||
FT_ZVRST("opis", "identifikator", "gigafida"),
|
||||
FT_UMETNOSTNA("opis", "identifikator", "gigafida"),
|
||||
FT_PESNISKA("opis", "identifikator", "gigafida"),
|
||||
FT_PROZNA("opis", "identifikator", "gigafida"),
|
||||
FT_DRAMSKA("opis", "identifikator", "gigafida"),
|
||||
FT_NEUMETNOSTNA("opis", "identifikator", "gigafida"),
|
||||
FT_STROKOVNA("opis", "identifikator", "gigafida"),
|
||||
FT_HID("opis", "identifikator", "gigafida"),
|
||||
FT_NIT("opis", "identifikator", "gigafida"),
|
||||
FT_NESTROKOVNA("opis", "identifikator", "gigafida"),
|
||||
FT_PRAVNA("opis", "identifikator", "gigafida"),
|
||||
FT_LEKTORIRANO("opis", "identifikator", "gigafida"),
|
||||
FT_DA("opis", "identifikator", "gigafida"),
|
||||
FT_NE("opis", "identifikator", "gigafida");
|
||||
|
||||
|
||||
|
||||
private final String name;
|
||||
private final String taxonomy;
|
||||
private final String corpus;
|
||||
|
||||
Taxonomy(String name, String taxonomy, String corpusType) {
|
||||
this.name = name;
|
||||
this.taxonomy = taxonomy;
|
||||
this.corpus = corpusType;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return this.name;
|
||||
}
|
||||
|
||||
public String getTaxonomnyString() {
|
||||
return this.taxonomy;
|
||||
}
|
||||
|
||||
public static Taxonomy factory(String tax) {
|
||||
if (tax != null) {
|
||||
// GOS
|
||||
if (JAVNI.toString().equals(tax)) {
|
||||
return JAVNI;
|
||||
}
|
||||
if (INFORMATIVNO_IZOBRAZEVALNI.toString().equals(tax)) {
|
||||
return INFORMATIVNO_IZOBRAZEVALNI;
|
||||
}
|
||||
if (RAZVEDRILNI.toString().equals(tax)) {
|
||||
return RAZVEDRILNI;
|
||||
}
|
||||
if (NEJAVNI.toString().equals(tax)) {
|
||||
return NEJAVNI;
|
||||
}
|
||||
if (NEZASEBNI.toString().equals(tax)) {
|
||||
return NEZASEBNI;
|
||||
}
|
||||
if (ZASEBNI.toString().equals(tax)) {
|
||||
return ZASEBNI;
|
||||
}
|
||||
if (OSEBNI_STIK.toString().equals(tax)) {
|
||||
return OSEBNI_STIK;
|
||||
}
|
||||
if (TELEFON.toString().equals(tax)) {
|
||||
return TELEFON;
|
||||
}
|
||||
if (RADIO.toString().equals(tax)) {
|
||||
return RADIO;
|
||||
}
|
||||
if (TELEVIZIJA.toString().equals(tax)) {
|
||||
return TELEVIZIJA;
|
||||
}
|
||||
|
||||
// Gigafida
|
||||
// if (TISK.toString().equals(tax)) {
|
||||
// return TISK;
|
||||
// }
|
||||
if (KNJIZNO.toString().equals(tax)) {
|
||||
return KNJIZNO;
|
||||
}
|
||||
if (LEPOSLOVNO.toString().equals(tax)) {
|
||||
return LEPOSLOVNO;
|
||||
}
|
||||
if (STROKOVNO.toString().equals(tax)) {
|
||||
return STROKOVNO;
|
||||
}
|
||||
if (PERIODICNO.toString().equals(tax)) {
|
||||
return PERIODICNO;
|
||||
}
|
||||
if (CASOPIS.toString().equals(tax)) {
|
||||
return CASOPIS;
|
||||
}
|
||||
if (REVIJA.toString().equals(tax)) {
|
||||
return REVIJA;
|
||||
}
|
||||
if (INTERNET.toString().equals(tax)) {
|
||||
return INTERNET;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public static ObservableList<String> getDefaultForComboBox(String corpusType) {
|
||||
ArrayList<String> values = Arrays.stream(Taxonomy.values())
|
||||
.filter(x -> x.corpus.equals(corpusType))
|
||||
.map(x -> x.name)
|
||||
.collect(Collectors.toCollection(ArrayList::new));
|
||||
|
||||
return FXCollections.observableArrayList(values);
|
||||
}
|
||||
|
||||
public static ObservableList<String> getDefaultForComboBox(CorpusType corpusType) {
|
||||
return getDefaultForComboBox(corpusType.toString());
|
||||
}
|
||||
}
|
53
src/main/java/data/Validation.java
Normal file
53
src/main/java/data/Validation.java
Normal file
@ -0,0 +1,53 @@
|
||||
package data;
|
||||
|
||||
import static gui.ValidationUtil.*;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import gui.Messages;
|
||||
import gui.ValidationUtil;
|
||||
|
||||
public class Validation {
|
||||
|
||||
public static String validateForStringLevel(Filter filter) {
|
||||
ArrayList<String> errors = new ArrayList<>();
|
||||
|
||||
// should not be null, error if null, because init failed
|
||||
if (filter.getNgramValue() == null) {
|
||||
errors.add(Messages.MISSING_NGRAM_LEVEL);
|
||||
}
|
||||
|
||||
// should not be null, error if null, because init failed
|
||||
if (filter.getCalculateFor() == null) {
|
||||
errors.add(Messages.MISSING_CALCULATE_FOR);
|
||||
}
|
||||
|
||||
if (filter.getSkipValue() == null) {
|
||||
filter.setSkipValue(0);
|
||||
}
|
||||
|
||||
if (filter.getNgramValue() != null && ValidationUtil.isEmpty(filter.getMsd()) &&
|
||||
(filter.getMsd().size() != filter.getNgramValue())) {
|
||||
if (!(filter.getMsd().size() == 1 && filter.getNgramValue() == 0) && !ValidationUtil.isEmpty(filter.getMsd()))
|
||||
errors.add(Messages.WARNING_MISMATCHED_NGRAM_AND_TOKENS_VALUES);
|
||||
}
|
||||
|
||||
Integer ngramValue = filter.getNgramValue();
|
||||
ArrayList<Pattern> msd = filter.getMsd();
|
||||
|
||||
if (ngramValue > 0 && !ValidationUtil.isEmpty(msd) && ngramValue != msd.size()) {
|
||||
errors.add(String.format(Messages.WARNING_MISMATCHED_NGRAM_AND_TOKENS_VALUES, ngramValue, msd.size()));
|
||||
}
|
||||
|
||||
if (filter.getNgramValue() != null && filter.getNgramValue() == 0 && isEmpty(filter.getStringLength())) {
|
||||
// if count letters, make sure that the length is given
|
||||
// TODO: check that words we're adding in xml reader are longer than this value
|
||||
errors.add(Messages.MISSING_STRING_LENGTH);
|
||||
}
|
||||
|
||||
return isEmpty(errors) ? null : StringUtils.join(errors, ", \n");
|
||||
}
|
||||
}
|
141
src/main/java/data/Word.java
Normal file
141
src/main/java/data/Word.java
Normal file
@ -0,0 +1,141 @@
|
||||
package data;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import data.Enums.Msd;
|
||||
import gui.ValidationUtil;
|
||||
|
||||
public class Word implements Serializable {
|
||||
public static final char PAD_CHARACTER = '-';
|
||||
|
||||
private String word;
|
||||
private String lemma;
|
||||
private String msd;
|
||||
private final HashSet<Character> VOWELS = new HashSet<>(Arrays.asList('a', 'e', 'i', 'o', 'u'));
|
||||
|
||||
/**
|
||||
* Possible values:
|
||||
* <p>
|
||||
* <ul>
|
||||
* <li>S = samostalnik</li>
|
||||
* <li>G = glagol</li>
|
||||
* <li>P = pridevnik</li>
|
||||
* <li>R = prislov</li>
|
||||
* <li>Z = zaimek</li>
|
||||
* <li>K = števnik</li>
|
||||
* <li>D = predlog</li>
|
||||
* <li>V = veznik</li>
|
||||
* <li>L = členek</li>
|
||||
* <li>M = medmet</li>
|
||||
* <li>O = okrajšava</li>
|
||||
* <li>N = neuvrščeno</li>
|
||||
* </ul>
|
||||
*/
|
||||
//private char besedna_vrsta;
|
||||
public Word(String word, String lemma, String msd) {
|
||||
this.lemma = lemma;
|
||||
this.msd = normalizeMsd(msd);
|
||||
|
||||
// veliko zacetnico ohranimo samo za lastna imena
|
||||
if (!ValidationUtil.isEmpty(this.msd) && !(this.msd.charAt(0) == 'S'
|
||||
&& this.msd.length() >= 2
|
||||
&& this.msd.charAt(1) == 'l')) {
|
||||
this.word = word.toLowerCase();
|
||||
} else {
|
||||
this.word = word;
|
||||
}
|
||||
}
|
||||
|
||||
public Word() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Appends a number of '-' to msds which are not properly sized.
|
||||
* E.g. nouns should have 5 attributes, yet the last one isn't always defined (Somei vs. Sometd)
|
||||
*
|
||||
* @param msdInput
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
private String normalizeMsd(String msdInput) {
|
||||
if (ValidationUtil.isEmpty(msdInput)) {
|
||||
return "";
|
||||
} else {
|
||||
return StringUtils.rightPad(msdInput, Msd.getMsdLengthForType(msdInput), PAD_CHARACTER);
|
||||
}
|
||||
}
|
||||
|
||||
public Word(String word) {
|
||||
this.word = word;
|
||||
}
|
||||
|
||||
public String getWord() {
|
||||
return word;
|
||||
}
|
||||
|
||||
public String getCVVWord() {
|
||||
return covertToCvv(word);
|
||||
}
|
||||
|
||||
public String getCVVLemma() {
|
||||
return covertToCvv(lemma);
|
||||
}
|
||||
|
||||
private String covertToCvv(String s) {
|
||||
char[] StringCA = s.toCharArray();
|
||||
|
||||
for (int i = 0; i < StringCA.length; i++) {
|
||||
StringCA[i] = VOWELS.contains(StringCA[i]) ? 'V' : 'C';
|
||||
}
|
||||
|
||||
return new String(StringCA);
|
||||
}
|
||||
|
||||
public void setWord(String word) {
|
||||
this.word = word;
|
||||
}
|
||||
|
||||
public String getLemma() {
|
||||
return lemma;
|
||||
}
|
||||
|
||||
public void setLemma(String lemma) {
|
||||
this.lemma = lemma;
|
||||
}
|
||||
|
||||
public String getMsd() {
|
||||
return msd;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
sb.append("beseda:\t")
|
||||
.append(getWord())
|
||||
.append("\n")
|
||||
.append("lema:\t")
|
||||
.append(getLemma())
|
||||
.append("\n")
|
||||
.append("msd:\t")
|
||||
.append(getMsd())
|
||||
.append("\n");
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public String getForCf(CalculateFor calculateFor, boolean cvv) {
|
||||
String returnValue = "";
|
||||
|
||||
if (cvv) {
|
||||
returnValue = calculateFor == CalculateFor.WORD ? getCVVWord() : getCVVLemma();
|
||||
} else {
|
||||
returnValue = calculateFor == CalculateFor.WORD ? getWord() : getLemma();
|
||||
}
|
||||
|
||||
return returnValue;
|
||||
}
|
||||
}
|
454
src/main/java/gui/CharacterAnalysisTab.java
Normal file
454
src/main/java/gui/CharacterAnalysisTab.java
Normal file
@ -0,0 +1,454 @@
|
||||
package gui;
|
||||
|
||||
import data.*;
|
||||
import javafx.application.HostServices;
|
||||
import javafx.beans.value.ChangeListener;
|
||||
import javafx.beans.value.ObservableValue;
|
||||
import javafx.collections.FXCollections;
|
||||
import javafx.collections.ListChangeListener;
|
||||
import javafx.collections.ObservableList;
|
||||
import javafx.concurrent.Task;
|
||||
import javafx.fxml.FXML;
|
||||
import javafx.scene.control.*;
|
||||
import javafx.scene.layout.Pane;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
import org.controlsfx.control.CheckComboBox;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import static alg.XML_processing.readXML;
|
||||
import static gui.GUIController.showAlert;
|
||||
import static gui.Messages.*;
|
||||
|
||||
@SuppressWarnings("Duplicates")
|
||||
public class CharacterAnalysisTab {
|
||||
public final static Logger logger = LogManager.getLogger(CharacterAnalysisTab.class);
|
||||
|
||||
@FXML
|
||||
public Label selectedFiltersLabel;
|
||||
@FXML
|
||||
public Label solarFilters;
|
||||
|
||||
@FXML
|
||||
private TextField msdTF;
|
||||
private ArrayList<Pattern> msd;
|
||||
private ArrayList<String> msdStrings;
|
||||
|
||||
@FXML
|
||||
private CheckComboBox<String> taxonomyCCB;
|
||||
private ArrayList<String> taxonomy;
|
||||
|
||||
@FXML
|
||||
private CheckBox calculatecvvCB;
|
||||
private boolean calculateCvv;
|
||||
|
||||
@FXML
|
||||
private TextField stringLengthTF;
|
||||
private Integer stringLength;
|
||||
|
||||
@FXML
|
||||
private ToggleGroup calculateForRB;
|
||||
private CalculateFor calculateFor;
|
||||
|
||||
@FXML
|
||||
private RadioButton lemmaRB;
|
||||
|
||||
@FXML
|
||||
private RadioButton varietyRB;
|
||||
|
||||
@FXML
|
||||
private Pane paneLetters;
|
||||
|
||||
@FXML
|
||||
private Button computeNgramsB;
|
||||
|
||||
@FXML
|
||||
public ProgressBar ngramProgressBar;
|
||||
@FXML
|
||||
public Label progressLabel;
|
||||
|
||||
@FXML
|
||||
private Hyperlink helpH;
|
||||
|
||||
// Display mode of this tab; character analysis only ever runs in LETTER mode,
// but the enum mirrors the MODE pattern used by the sibling analysis tabs.
private enum MODE {
    LETTER
}
|
||||
|
||||
private MODE currentMode;
|
||||
|
||||
private Corpus corpus;
|
||||
private HashMap<String, HashSet<String>> solarFiltersMap;
|
||||
private Filter filter;
|
||||
private boolean useDb;
|
||||
private HostServices hostService;
|
||||
|
||||
private static final ObservableList<String> N_GRAM_COMPUTE_FOR_LETTERS = FXCollections.observableArrayList("različnica", "lema");
|
||||
private static final ObservableList<String> N_GRAM_COMPUTE_FOR_WORDS_ORTH = FXCollections.observableArrayList("različnica");
|
||||
|
||||
|
||||
// TODO: pass observables for taxonomy based on header scan
|
||||
// after header scan
|
||||
private ObservableList<String> taxonomyCCBValues;
|
||||
private CorpusType currentCorpusType;
|
||||
|
||||
/**
 * Wires up all controls of the character-analysis tab: radio buttons, msd field,
 * taxonomy combo box, cvv checkbox, string-length field and action buttons.
 * Must be called after the corpus has been set (reads corpus.getCorpusType()).
 */
public void init() {
    currentMode = MODE.LETTER;
    toggleMode(currentMode);

    // calculate-for radio group: map the selected button's label to a CalculateFor value
    calculateForRB.selectedToggleProperty().addListener(new ChangeListener<Toggle>() {
        @Override
        public void changed(ObservableValue<? extends Toggle> observable, Toggle oldValue, Toggle newValue) {
            //logger.info("calculateForRB:", newValue.toString());
            RadioButton chk = (RadioButton)newValue.getToggleGroup().getSelectedToggle(); // Cast object to radio button
            calculateFor = CalculateFor.factory(chk.getText());
            // NOTE(review): log4j info(String, Object...) without a placeholder drops the argument — verify intent
            logger.info("calculateForRB:", chk.getText());
            //System.out.println("Selected Radio Button - "+chk.getText());
        }
    });

    // msd: parse the space-separated pattern list when the text field loses focus
    msdTF.focusedProperty().addListener((observable, oldValue, newValue) -> {
        if (!newValue) {
            // focus lost
            String value = msdTF.getText();
            logger.info("msdTf: ", value);

            if (!ValidationUtil.isEmpty(value)) {
                ArrayList<String> msdTmp = new ArrayList<>(Arrays.asList(value.split(" ")));

                // letter mode analyzes single tokens, so exactly one msd pattern is expected
                int nOfRequiredMsdTokens = 1;
                if (msdTmp.size() != nOfRequiredMsdTokens) {
                    String msg = String.format(Messages.WARNING_MISMATCHED_NGRAM_AND_TOKENS_VALUES, nOfRequiredMsdTokens, msdTmp.size());
                    logAlert(msg);
                    showAlert(Alert.AlertType.ERROR, msg);
                }
                // compile each token into a regex; keep raw strings for re-display
                msd = new ArrayList<>();
                msdStrings = new ArrayList<>();
                for (String msdToken : msdTmp) {
                    msd.add(Pattern.compile(msdToken));
                    msdStrings.add(msdToken);
                }
                logger.info(String.format("msd accepted (%d)", msd.size()));

            // NOTE(review): newValue is the focus flag (Boolean), not text — this branch
            // resets msd whenever the field is emptied; confirm the condition is intended
            } else if (!ValidationUtil.isEmpty(newValue)) {
                msd = new ArrayList<>();
                msdStrings = new ArrayList<>();
            }
        }
    });

    msdTF.setText("");
    msd = new ArrayList<>();

    // taxonomy: only corpora with taxonomy data get a populated, enabled combo box
    if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
        // NOTE(review): removeAll() with no arguments is a no-op — was clear() intended?
        taxonomyCCB.getItems().removeAll();
        taxonomyCCB.getItems().setAll(corpus.getTaxonomy());
        // mirror the checked items into the taxonomy list on every change
        taxonomyCCB.getCheckModel().getCheckedItems().addListener((ListChangeListener<String>) c -> {
            taxonomy = new ArrayList<>();
            ObservableList<String> checkedItems = taxonomyCCB.getCheckModel().getCheckedItems();
            taxonomy.addAll(checkedItems);
            logger.info(String.format("Selected taxonomy: %s", StringUtils.join(checkedItems, ",")));
        });
        taxonomyCCB.getCheckModel().clearChecks();
    } else {
        taxonomyCCB.setDisable(true);
    }

    // cvv: checkbox toggles consonant/vowel-skeleton calculation
    calculatecvvCB.selectedProperty().addListener((observable, oldValue, newValue) -> {
        calculateCvv = newValue;
        logger.info("calculate cvv: " + calculateCvv);
    });


    // string length: validate on focus loss, defaulting to 1 when left empty
    stringLengthTF.focusedProperty().addListener((observable, oldValue, newValue) -> {
        if (!newValue) {
            // focus lost
            String value = stringLengthTF.getText();
            if (!ValidationUtil.isEmpty(value)) {
                if (!ValidationUtil.isNumber(value)) {
                    logAlert("stringlengthTf: " + WARNING_ONLY_NUMBERS_ALLOWED);
                    GUIController.showAlert(Alert.AlertType.ERROR, WARNING_ONLY_NUMBERS_ALLOWED);
                }
                // NOTE(review): parseInt still runs after the not-a-number alert above and
                // will throw NumberFormatException for non-numeric input — confirm/guard
                stringLength = Integer.parseInt(value);
            } else {
                GUIController.showAlert(Alert.AlertType.ERROR, WARNING_MISSING_STRING_LENGTH);
                stringLengthTF.setText("1");
                logAlert(WARNING_MISSING_STRING_LENGTH);
            }
        }
    });

    computeNgramsB.setOnAction(e -> {
        compute();
        logger.info("compute button");
    });

    helpH.setOnAction(e -> openHelpWebsite());
}
|
||||
|
||||
/**
|
||||
* case a: values for combo boxes can change after a corpus change
|
||||
* <ul>
|
||||
* <li>different corpus type - reset all fields so no old values remain</li>
|
||||
* <li>same corpus type, different subset - keep</li>
|
||||
* </ul>
|
||||
* <p>
|
||||
* case b: values for combo boxes can change after a header scan
|
||||
* <ul>
|
||||
* <li>at first, fields are populated by corpus type defaults</li>
|
||||
* <li>after, with gathered data</li>
|
||||
* </ul>
|
||||
* <p></p>
|
||||
* ngrams: 1
|
||||
* calculateFor: word
|
||||
* msd:
|
||||
* taxonomy:
|
||||
* skip: 0
|
||||
* iscvv: false
|
||||
* string length: 1
|
||||
*/
|
||||
public void populateFields() {
    // corpus changed if: current one is null (this is first run of the app)
    // or if currentCorpus != gui's corpus
    boolean corpusChanged = currentCorpusType == null
            || currentCorpusType != corpus.getCorpusType();

    // TODO: check for GOS, GIGAFIDA, SOLAR...
    // refresh and:
    // TODO if current value != null && is in new calculateFor ? keep : otherwise reset
    if (calculateFor == null) {
        calculateForRB.selectToggle(lemmaRB);
        // NOTE(review): Toggle.toString() is not the radio button's label text;
        // CalculateFor.factory likely receives an unmatchable string here — verify
        // against the listener in init(), which uses chk.getText().
        calculateFor = CalculateFor.factory(calculateForRB.getSelectedToggle().toString());
    }

    if (!filter.hasMsd()) {
        // if current corpus doesn't have msd data, disable this field
        msd = new ArrayList<>();
        msdTF.setText("");
        msdTF.setDisable(true);
        logger.info("no msd data");
    } else {
        if (ValidationUtil.isEmpty(msd)
                || (!ValidationUtil.isEmpty(msd) && corpusChanged)) {
            // msd has not been set previously
            // or msd has been set but the corpus changed -> reset
            msd = new ArrayList<>();
            msdTF.setText("");
            msdTF.setDisable(false);
            logger.info("msd reset");
        } else if (!ValidationUtil.isEmpty(msd) && !corpusChanged) {
            // if msd has been set, but corpus type remained the same, we can keep any set msd value
            msdTF.setText(StringUtils.join(msdStrings, " "));
            msdTF.setDisable(false);
            logger.info("msd kept");
        }
    }

    // TODO: taxonomy: refresh and keep if in new taxonomy, otherwise empty (no selection)

    // keep calculateCvv
    calculatecvvCB.setSelected(calculateCvv);

    // keep string length if set, otherwise default to 1
    if (stringLength != null) {
        stringLengthTF.setText(String.valueOf(stringLength));
    } else {
        stringLengthTF.setText("1");
        stringLength = 1;
    }

    // TODO: trigger on rescan
    if ((currentCorpusType != null && currentCorpusType != corpus.getCorpusType())) {
        // user changed corpus (by type) or by selection & triggered a rescan of headers
        // see if we read taxonomy from headers, otherwise use default values for given corpus
        ObservableList<String> tax = corpus.getTaxonomy();
        taxonomyCCBValues = tax != null ? tax : Taxonomy.getDefaultForComboBox(corpus.getCorpusType());

        currentCorpusType = corpus.getCorpusType();
        // setTaxonomyIsDirty(false);
    } else {
        // NOTE(review): intentionally empty? The taxonomy refresh below runs either way,
        // making the branch above partially redundant — candidate for cleanup.
    }

    // see if we read taxonomy from headers, otherwise use default values for given corpus
    ObservableList<String> tax = corpus.getTaxonomy();
    taxonomyCCBValues = tax != null ? tax : Taxonomy.getDefaultForComboBox(corpus.getCorpusType());
    // NOTE(review): addAll on repeated calls appends duplicates — confirm this method
    // is only invoked once per combo-box lifetime.
    taxonomyCCB.getItems().addAll(taxonomyCCBValues);

}
|
||||
|
||||
/**
|
||||
* Toggles visibility for panes which hold fields for skipgram value (not applicable when calculating for letters) etc.,
|
||||
* sets combobox values to what is applicable ...
|
||||
*
|
||||
* @param mode
|
||||
*/
|
||||
public void toggleMode(MODE mode) {
    // null means "re-apply the current mode"
    if (mode == null) {
        mode = currentMode;
    }

    // NOTE(review): log4j info(String, Object...) without a placeholder drops the argument
    logger.info("mode: ", mode.toString());

    if (mode == MODE.LETTER) {
        paneLetters.setVisible(true);

        // populate with default cvv length value
        if (stringLength == null) {
            stringLengthTF.setText("1");
            stringLength = 1;
        } else {
            stringLengthTF.setText(String.valueOf(stringLength));
        }

        // if calculateFor was selected for something other than a word or a lemma -> reset
        if (!(calculateFor == CalculateFor.WORD || calculateFor == CalculateFor.LEMMA)) {
            // if the user selected something else before selecting ngram for letters, reset that choice
            calculateFor = CalculateFor.LEMMA;
            calculateForRB.selectToggle(lemmaRB);
        }
    }

    // override if orth mode: orthographic GOS data has no msd/variety, allow only word
    if (corpus.isGosOrthMode()) {
        // TODO change to
        varietyRB.setDisable(true);
        msdTF.setDisable(true);
    } else {
        msdTF.setDisable(false);
        varietyRB.setDisable(false);
    }
}
|
||||
|
||||
/**
 * Builds a string-level Filter from the tab's current control state
 * (ngram 0 = letter mode), validates it, and either starts the analysis
 * or shows the collected validation errors to the user.
 */
private void compute() {
    Filter filter = new Filter();
    filter.setNgramValue(0);                 // 0 = letter/character analysis
    filter.setCalculateFor(calculateFor);
    filter.setMsd(msd);
    // combo box holds display names; translate them back to header codes
    filter.setTaxonomy(Tax.getTaxonomyCodes(taxonomy, corpus.getCorpusType()));
    filter.setAl(AnalysisLevel.STRING_LEVEL);
    filter.setSkipValue(0);
    filter.setIsCvv(calculateCvv);
    filter.setSolarFilters(solarFiltersMap);
    filter.setStringLength(stringLength);

    // null message means the filter passed validation
    String message = Validation.validateForStringLevel(filter);
    if (message == null) {
        // no errors
        logger.info("Executing: ", filter.toString());
        StatisticsNew statistic = new StatisticsNew(corpus, filter, useDb);
        execute(statistic);
    } else {
        logAlert(message);
        showAlert(Alert.AlertType.ERROR, "Prosim izpolnite polja:", message);
    }
}
|
||||
|
||||
private void openHelpWebsite(){
|
||||
hostService.showDocument(Messages.HELP_URL);
|
||||
}
|
||||
|
||||
private void logAlert(String alert) {
|
||||
logger.info("alert: " + alert);
|
||||
}
|
||||
|
||||
public Corpus getCorpus() {
|
||||
return corpus;
|
||||
}
|
||||
|
||||
public void setCorpus(Corpus corpus) {
|
||||
this.corpus = corpus;
|
||||
|
||||
if (corpus.getCorpusType() != CorpusType.SOLAR) {
|
||||
setSelectedFiltersLabel(null);
|
||||
} else {
|
||||
setSelectedFiltersLabel("/");
|
||||
}
|
||||
}
|
||||
|
||||
public void setSelectedFiltersLabel(String content) {
|
||||
if (content != null) {
|
||||
solarFilters.setVisible(true);
|
||||
selectedFiltersLabel.setVisible(true);
|
||||
selectedFiltersLabel.setText(content);
|
||||
} else {
|
||||
solarFilters.setVisible(false);
|
||||
selectedFiltersLabel.setVisible(false);
|
||||
}
|
||||
}
|
||||
|
||||
private void execute(StatisticsNew statistic) {
|
||||
logger.info("Started execution: ", statistic.getFilter());
|
||||
|
||||
Collection<File> corpusFiles = statistic.getCorpus().getDetectedCorpusFiles();
|
||||
boolean corpusIsSplit = corpusFiles.size() > 1;
|
||||
|
||||
final Task<Void> task = new Task<Void>() {
|
||||
@SuppressWarnings("Duplicates")
|
||||
@Override
|
||||
protected Void call() throws Exception {
|
||||
long i = 0;
|
||||
for (File f : corpusFiles) {
|
||||
readXML(f.toString(), statistic);
|
||||
i++;
|
||||
this.updateProgress(i, corpusFiles.size());
|
||||
this.updateMessage(String.format(ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y, i, corpusFiles.size(), f.getName()));
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
};
|
||||
|
||||
ngramProgressBar.progressProperty().bind(task.progressProperty());
|
||||
progressLabel.textProperty().bind(task.messageProperty());
|
||||
|
||||
task.setOnSucceeded(e -> {
|
||||
try {
|
||||
boolean successullySaved = statistic.saveResultToDisk();
|
||||
if (successullySaved) {
|
||||
showAlert(Alert.AlertType.INFORMATION, Messages.NOTIFICATION_ANALYSIS_COMPLETED);
|
||||
} else {
|
||||
showAlert(Alert.AlertType.INFORMATION, Messages.NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS);
|
||||
}
|
||||
} catch (UnsupportedEncodingException e1) {
|
||||
showAlert(Alert.AlertType.ERROR, ERROR_WHILE_SAVING_RESULTS_TO_CSV);
|
||||
logger.error("Error while saving", e1);
|
||||
}
|
||||
|
||||
ngramProgressBar.progressProperty().unbind();
|
||||
ngramProgressBar.setStyle(Settings.FX_ACCENT_OK);
|
||||
progressLabel.textProperty().unbind();
|
||||
progressLabel.setText("");
|
||||
});
|
||||
|
||||
task.setOnFailed(e -> {
|
||||
showAlert(Alert.AlertType.ERROR, ERROR_WHILE_EXECUTING);
|
||||
logger.error("Error while executing", e);
|
||||
ngramProgressBar.progressProperty().unbind();
|
||||
ngramProgressBar.setProgress(0.0);
|
||||
ngramProgressBar.setStyle(Settings.FX_ACCENT_NOK);
|
||||
progressLabel.textProperty().unbind();
|
||||
progressLabel.setText("");
|
||||
});
|
||||
|
||||
final Thread thread = new Thread(task, "task");
|
||||
thread.setDaemon(true);
|
||||
thread.start();
|
||||
}
|
||||
|
||||
public void setSolarFiltersMap(HashMap<String, HashSet<String>> solarFiltersMap) {
|
||||
this.solarFiltersMap = solarFiltersMap;
|
||||
}
|
||||
|
||||
public void setHostServices(HostServices hostServices){
|
||||
this.hostService = hostServices;
|
||||
}
|
||||
}
|
517
src/main/java/gui/CorpusTab.java
Normal file
517
src/main/java/gui/CorpusTab.java
Normal file
@ -0,0 +1,517 @@
|
||||
package gui;
|
||||
|
||||
import static data.CorpusType.*;
|
||||
import static gui.GUIController.*;
|
||||
import static gui.Messages.*;
|
||||
import static util.Util.*;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.io.IOCase;
|
||||
import org.apache.commons.io.filefilter.FileFilterUtils;
|
||||
import org.apache.commons.io.filefilter.TrueFileFilter;
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
|
||||
import alg.XML_processing;
|
||||
import data.Corpus;
|
||||
import data.CorpusType;
|
||||
import data.Enums.solar.SolarFilters;
|
||||
import data.Tax;
|
||||
import javafx.collections.ObservableList;
|
||||
import javafx.concurrent.Task;
|
||||
import javafx.fxml.FXML;
|
||||
import javafx.scene.control.*;
|
||||
import javafx.scene.layout.Pane;
|
||||
import javafx.stage.DirectoryChooser;
|
||||
import javafx.stage.Stage;
|
||||
import javafx.application.HostServices;
|
||||
|
||||
public class CorpusTab {
|
||||
public final static Logger logger = LogManager.getLogger(CorpusTab.class);
|
||||
public Pane setCorpusWrapperP;
|
||||
|
||||
private Stage stage;
|
||||
|
||||
@FXML
|
||||
private Button chooseCorpusLocationB;
|
||||
private File chosenCorpusLocation;
|
||||
|
||||
@FXML
|
||||
private CheckBox readHeaderInfoChB;
|
||||
private boolean readHeaderInfo;
|
||||
|
||||
@FXML
|
||||
private CheckBox gosUseOrthChB;
|
||||
private boolean gosUseOrth;
|
||||
|
||||
@FXML
|
||||
private Button chooseResultsLocationB;
|
||||
|
||||
@FXML
|
||||
private Label chooseCorpusL;
|
||||
private String chooseCorpusLabelContent;
|
||||
|
||||
@FXML
|
||||
private Label chooseResultsL;
|
||||
private String chooseResultsLabelContent;
|
||||
|
||||
@FXML
|
||||
private ProgressIndicator locationScanPI;
|
||||
|
||||
@FXML
|
||||
private Hyperlink helpH;
|
||||
|
||||
// *** shared ***
|
||||
private Corpus corpus;
|
||||
private CorpusType corpusType;
|
||||
|
||||
// tabs - used to enable/disable
|
||||
private Tab stringLevelTabNew2;
|
||||
private Tab oneWordAnalysisTab;
|
||||
private Tab characterLevelTab;
|
||||
private Tab wordFormationTab;
|
||||
private Tab wordLevelTab;
|
||||
private Tab filterTab;
|
||||
private TabPane tabPane;
|
||||
private StringAnalysisTabNew2 satNew2Controller;
|
||||
private OneWordAnalysisTab oneWordTabController;
|
||||
private CharacterAnalysisTab catController;
|
||||
private FiltersForSolar ffsController;
|
||||
//private WordFormationTab wfController;
|
||||
private WordLevelTab wlController;
|
||||
private HostServices hostService;
|
||||
|
||||
|
||||
public void initialize() {
|
||||
stage = new Stage();
|
||||
|
||||
// add listeners
|
||||
chooseCorpusLocationB.setOnAction(e -> chooseCorpusLocation());
|
||||
chooseCorpusLocationB.setTooltip(new Tooltip(TOOLTIP_chooseCorpusLocationB));
|
||||
helpH.setOnAction(e -> openHelpWebsite());
|
||||
|
||||
readHeaderInfoChB.selectedProperty().addListener((observable, oldValue, newValue) -> {
|
||||
readHeaderInfo = newValue;
|
||||
logger.info("read headers: ", readHeaderInfo);
|
||||
});
|
||||
readHeaderInfoChB.setTooltip(new Tooltip(TOOLTIP_readHeaderInfoChB));
|
||||
|
||||
gosUseOrthChB.selectedProperty().addListener((observable, oldValue, newValue) -> {
|
||||
gosUseOrth = newValue;
|
||||
corpus.setGosOrthMode(gosUseOrth);
|
||||
wordFormationTab.setDisable(gosUseOrth);
|
||||
satNew2Controller.toggleMode(null);
|
||||
oneWordTabController.toggleMode(null);
|
||||
catController.toggleMode(null);
|
||||
|
||||
logger.info("gosUseOrth: ", gosUseOrth);
|
||||
});
|
||||
|
||||
chooseResultsLocationB.setOnAction(e -> chooseResultsLocation(null));
|
||||
|
||||
// set labels and toggle visibility
|
||||
toggleGosChBVisibility();
|
||||
|
||||
chooseCorpusLabelContent = Messages.LABEL_CORPUS_LOCATION_NOT_SET;
|
||||
chooseCorpusL.setText(chooseCorpusLabelContent);
|
||||
|
||||
chooseResultsLabelContent = Messages.LABEL_RESULTS_LOCATION_NOT_SET;
|
||||
chooseResultsL.setText(chooseResultsLabelContent);
|
||||
|
||||
togglePiAndSetCorpusWrapper(false);
|
||||
}
|
||||
|
||||
private void togglePiAndSetCorpusWrapper(boolean piIsActive) {
|
||||
locationScanPI.setVisible(piIsActive);
|
||||
setCorpusWrapperP.setLayoutX(piIsActive ? 100.0 : 10.0);
|
||||
}
|
||||
|
||||
private void openHelpWebsite(){
|
||||
hostService.showDocument(Messages.HELP_URL);
|
||||
}
|
||||
|
||||
/**
|
||||
* In order for a directory to pass as a valid corpus location, following criteria has to be met:
|
||||
* <ul>
|
||||
* <li>it can't be null</li>
|
||||
* <li>it has to be readable</li>
|
||||
* <li>it has to contain xml files</li>
|
||||
* <li>xml files have to contain valid headers from which we can infer the corpus type</li>
|
||||
* <li>corpus type must be one of the expected corpus types - as noted in the @see data.CorpusType.class </li>
|
||||
* </ul>
|
||||
* <p>
|
||||
* Additionally, if the user checks to read taxonomy/filters from the corpus files, that read
|
||||
* has to produce a non-empty list results list
|
||||
*/
|
||||
private void chooseCorpusLocation() {
|
||||
File selectedDirectory = directoryChooser();
|
||||
|
||||
if (selectedDirectory != null && ValidationUtil.isReadableDirectory(selectedDirectory)) {
|
||||
logger.info("selected corpus dir: ", selectedDirectory.getAbsolutePath());
|
||||
|
||||
// scan for xml files
|
||||
Collection<File> corpusFiles = FileUtils.listFiles(selectedDirectory, FileFilterUtils.suffixFileFilter("xml", IOCase.INSENSITIVE), TrueFileFilter.INSTANCE);
|
||||
|
||||
// make sure there are corpus files in selected directory or notify the user about it
|
||||
if (corpusFiles.size() == 0) {
|
||||
logger.info("alert: ", WARNING_CORPUS_NOT_FOUND);
|
||||
showAlert(Alert.AlertType.ERROR, WARNING_CORPUS_NOT_FOUND, null);
|
||||
} else {
|
||||
String chooseCorpusLabelContentTmp = detectCorpusType(corpusFiles, selectedDirectory.getAbsolutePath());
|
||||
|
||||
if (chooseCorpusLabelContentTmp == null) {
|
||||
logger.info("alert: ", WARNING_CORPUS_NOT_FOUND);
|
||||
showAlert(Alert.AlertType.ERROR, WARNING_CORPUS_NOT_FOUND, null);
|
||||
} else {
|
||||
initNewCorpus(selectedDirectory, corpusFiles);
|
||||
|
||||
corpus.setChosenCorpusLocation(selectedDirectory);
|
||||
corpus.setDetectedCorpusFiles(corpusFiles);
|
||||
chooseCorpusLabelContent = chooseCorpusLabelContentTmp;
|
||||
logger.info("corpus dir: ", corpus.getChosenCorpusLocation().getAbsolutePath());
|
||||
|
||||
if (readHeaderInfo) {
|
||||
logger.info("reading header info...");
|
||||
readHeaderInfo();
|
||||
} else {
|
||||
setResults();
|
||||
|
||||
setCorpusForAnalysis();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* If a user selects a valid corpus location, we define a new corpus (so none of the old data gets carried over)
|
||||
*
|
||||
* @param selectedDirectory
|
||||
* @param corpusFiles
|
||||
*/
|
||||
private void initNewCorpus(File selectedDirectory, Collection<File> corpusFiles) {
|
||||
corpus = new Corpus();
|
||||
corpus.setCorpusType(corpusType);
|
||||
corpus.setDetectedCorpusFiles(corpusFiles);
|
||||
corpus.setChosenCorpusLocation(selectedDirectory);
|
||||
chooseResultsLocation(selectedDirectory);
|
||||
}
|
||||
|
||||
private void chooseResultsLocation(File dir) {
|
||||
// results location can be set either to default value (after selecting valid corpus location) - dir attribute
|
||||
// or to a dir picked via directoryChooser (when dir == null
|
||||
File selectedDirectory = dir == null ? directoryChooser() : dir;
|
||||
|
||||
if (selectedDirectory != null) {
|
||||
String resultsLocationPath = selectedDirectory.getAbsolutePath().concat(File.separator);
|
||||
File chosenResultsLocationTmp = new File(resultsLocationPath);
|
||||
|
||||
if (!ValidationUtil.isValidDirectory(chosenResultsLocationTmp)) {
|
||||
showAlert(Alert.AlertType.ERROR, WARNING_RESULTS_DIR_NOT_VALID);
|
||||
logger.info("alert: ", WARNING_RESULTS_DIR_NOT_VALID);
|
||||
} else {
|
||||
corpus.setChosenResultsLocation(chosenResultsLocationTmp);
|
||||
chooseResultsLabelContent = corpus.getChosenResultsLocation().getAbsolutePath();
|
||||
chooseResultsL.setText(chooseResultsLabelContent);
|
||||
logger.info("results dir: " + chooseResultsLabelContent);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void setResults() {
|
||||
// if everything is ok
|
||||
// check and enable checkbox if GOS
|
||||
toggleGosChBVisibility();
|
||||
|
||||
// set default results location
|
||||
String defaultResultsLocationPath = corpus.getChosenCorpusLocation().getAbsolutePath();
|
||||
logger.info("setting default results location to: ", defaultResultsLocationPath);
|
||||
|
||||
chooseCorpusL.setText(chooseCorpusLabelContent);
|
||||
}
|
||||
|
||||
private void readHeaderInfo() {
|
||||
CorpusType corpusType = corpus.getCorpusType();
|
||||
Collection<File> corpusFiles = corpus.getDetectedCorpusFiles();
|
||||
togglePiAndSetCorpusWrapper(true);
|
||||
chooseCorpusL.setText(LABEL_SCANNING_CORPUS);
|
||||
|
||||
logger.info("reading header data for ", corpusType.toString());
|
||||
|
||||
if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.GOS || corpusType == CorpusType.CCKRES) {
|
||||
boolean corpusIsSplit = corpusFiles.size() > 1;
|
||||
|
||||
final Task<HashSet<String>> task = new Task<HashSet<String>>() {
|
||||
@Override
|
||||
protected HashSet<String> call() throws Exception {
|
||||
HashSet<String> values = new HashSet<>();
|
||||
long i = 0;
|
||||
|
||||
if (!corpusIsSplit) {
|
||||
updateProgress(-1.0f, -1.0f);
|
||||
}
|
||||
|
||||
for (File file : corpusFiles) {
|
||||
values.addAll((Collection<? extends String>) XML_processing.readXmlHeaderTaxonomyAndFilters(file.getAbsolutePath(), corpusIsSplit, corpusType));
|
||||
i++;
|
||||
|
||||
if (corpusIsSplit) {
|
||||
updateProgress(i, corpusFiles.size());
|
||||
}
|
||||
}
|
||||
|
||||
updateProgress(1.0f, 1.0f);
|
||||
return values;
|
||||
}
|
||||
};
|
||||
|
||||
locationScanPI.progressProperty().bind(task.progressProperty());
|
||||
|
||||
task.setOnSucceeded(e -> {
|
||||
ObservableList<String> readTaxonomy = Tax.getTaxonomyForComboBox(corpusType, task.getValue());
|
||||
|
||||
if (ValidationUtil.isEmpty(readTaxonomy)) {
|
||||
// if no taxonomy found alert the user and keep other tabs disabled
|
||||
logger.info("No taxonomy found in headers.");
|
||||
GUIController.showAlert(Alert.AlertType.ERROR, WARNING_NO_TAXONOMY_FOUND);
|
||||
} else {
|
||||
// set taxonomy, update label
|
||||
corpus.setTaxonomy(readTaxonomy);
|
||||
corpus.setHeaderRead(true);
|
||||
chooseCorpusL.setText(chooseCorpusLabelContent);
|
||||
setResults();
|
||||
setCorpusForAnalysis();
|
||||
}
|
||||
|
||||
togglePiAndSetCorpusWrapper(false);
|
||||
|
||||
});
|
||||
|
||||
task.setOnCancelled(e -> togglePiAndSetCorpusWrapper(false));
|
||||
task.setOnFailed(e -> togglePiAndSetCorpusWrapper(false));
|
||||
|
||||
final Thread thread = new Thread(task, "task");
|
||||
thread.setDaemon(true);
|
||||
thread.start();
|
||||
} else if (corpusType == CorpusType.SOLAR) {
|
||||
// many many fields
|
||||
boolean corpusIsSplit = corpusFiles.size() > 1;
|
||||
|
||||
final Task<HashMap<String, HashSet<String>>> task = new Task<HashMap<String, HashSet<String>>>() {
|
||||
@Override
|
||||
protected HashMap<String, HashSet<String>> call() throws Exception {
|
||||
HashMap<String, HashSet<String>> values = new HashMap<>();
|
||||
long i = 0;
|
||||
|
||||
if (!corpusIsSplit) {
|
||||
updateProgress(-1.0f, -1.0f);
|
||||
}
|
||||
|
||||
for (File file : corpusFiles) {
|
||||
HashMap<String, HashSet<String>> tmpvalues = (HashMap<String, HashSet<String>>) XML_processing.readXmlHeaderTaxonomyAndFilters(file.getAbsolutePath(), corpusIsSplit, corpusType);
|
||||
|
||||
// update final results
|
||||
for (Map.Entry<String, HashSet<String>> entry : tmpvalues.entrySet()) {
|
||||
if (values.containsKey(entry.getKey())) {
|
||||
values.get(entry.getKey()).addAll(entry.getValue());
|
||||
} else {
|
||||
values.put(entry.getKey(), entry.getValue());
|
||||
}
|
||||
}
|
||||
|
||||
i++;
|
||||
|
||||
if (corpusIsSplit) {
|
||||
updateProgress(i, corpusFiles.size());
|
||||
}
|
||||
}
|
||||
|
||||
updateProgress(1.0f, 1.0f);
|
||||
return values;
|
||||
}
|
||||
};
|
||||
|
||||
locationScanPI.progressProperty().bind(task.progressProperty());
|
||||
|
||||
task.setOnSucceeded(e -> {
|
||||
HashMap<String, HashSet<String>> values = task.getValue();
|
||||
|
||||
if (ValidationUtil.isEmpty(values)) {
|
||||
// if no taxonomy found alert the user and keep other tabs disabled
|
||||
logger.info("No solar filters found in headers.");
|
||||
GUIController.showAlert(Alert.AlertType.ERROR, WARNING_NO_SOLAR_FILTERS_FOUND);
|
||||
} else {
|
||||
HashMap<String, ObservableList<String>> filtersForComboBoxes = SolarFilters.getFiltersForComboBoxes(values);
|
||||
// set taxonomy, update label
|
||||
corpus.setSolarFiltersForXML(values);
|
||||
corpus.setSolarFilters(filtersForComboBoxes);
|
||||
corpus.setHeaderRead(true);
|
||||
chooseCorpusL.setText(chooseCorpusLabelContent);
|
||||
setResults();
|
||||
setCorpusForAnalysis();
|
||||
}
|
||||
|
||||
togglePiAndSetCorpusWrapper(false);
|
||||
|
||||
});
|
||||
|
||||
task.setOnCancelled(e -> togglePiAndSetCorpusWrapper(false));
|
||||
task.setOnFailed(e -> togglePiAndSetCorpusWrapper(false));
|
||||
|
||||
final Thread thread = new Thread(task, "task");
|
||||
thread.setDaemon(true);
|
||||
thread.start();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private void setCorpusForAnalysis() {
|
||||
if (corpus.validate()) {
|
||||
// new statistic, enable tabs...
|
||||
stringLevelTabNew2.setDisable(false);
|
||||
satNew2Controller.setCorpus(corpus);
|
||||
satNew2Controller.init();
|
||||
oneWordAnalysisTab.setDisable(false);
|
||||
oneWordTabController.setCorpus(corpus);
|
||||
oneWordTabController.init();
|
||||
characterLevelTab.setDisable(false);
|
||||
catController.setCorpus(corpus);
|
||||
catController.init();
|
||||
wordFormationTab.setDisable(false);
|
||||
wordLevelTab.setDisable(false);
|
||||
//wfController.setCorpus(corpus);
|
||||
//wfController.init();
|
||||
wlController.setCorpus(corpus);
|
||||
wlController.init();
|
||||
|
||||
if (corpus.getCorpusType() == CorpusType.SOLAR) {
|
||||
filterTab.setDisable(false);
|
||||
tabPane.getTabs().add(1, filterTab);
|
||||
ffsController.setCorpus(corpus);
|
||||
ffsController.initFilters();
|
||||
} else {
|
||||
filterTab.setDisable(true);
|
||||
tabPane.getTabs().removeAll(filterTab);
|
||||
}
|
||||
} else {
|
||||
GUIController.showAlert(Alert.AlertType.ERROR, corpus.getValidationErrorsToString());
|
||||
}
|
||||
}
|
||||
|
||||
private File directoryChooser() {
|
||||
DirectoryChooser directoryChooser = new DirectoryChooser();
|
||||
|
||||
// open in the folder where the jar is located if possible
|
||||
File workingDir = getWorkingDirectory();
|
||||
|
||||
if (workingDir != null) {
|
||||
directoryChooser.setInitialDirectory(workingDir);
|
||||
}
|
||||
|
||||
return directoryChooser.showDialog(stage);
|
||||
}
|
||||
|
||||
/**
|
||||
* Hides GOS related checkbox until needed.
|
||||
*/
|
||||
private void toggleGosChBVisibility() {
|
||||
gosUseOrthChB.setVisible(corpus != null && corpus.getCorpusType() != null && corpus.getCorpusType() == CorpusType.GOS);
|
||||
}
|
||||
|
||||
private String detectCorpusType(Collection<File> corpusFiles, String corpusLocation) {
|
||||
// check that we recognize this corpus
|
||||
// read first file only, maybe later do all, if toll on resources is acceptable
|
||||
File f = corpusFiles.iterator().next();
|
||||
String title = XML_processing.readXMLHeaderTag(f.getAbsolutePath(), "title").toLowerCase();
|
||||
String test = CCKRES.getNameLowerCase();
|
||||
String debug = "";
|
||||
|
||||
// check if XML file's title contains any of recognized corpus titles
|
||||
corpusType = null;
|
||||
if (title.contains(SOLAR.getNameLowerCase())) {
|
||||
corpusType = SOLAR;
|
||||
} else if (title.contains(GIGAFIDA.getNameLowerCase())) {
|
||||
corpusType = GIGAFIDA;
|
||||
} else if (title.contains(CCKRES.getNameLowerCase())) {
|
||||
corpusType = CCKRES;
|
||||
} else if (title.contains(GOS.getNameLowerCase())) {
|
||||
corpusType = GOS;
|
||||
}
|
||||
|
||||
if (corpusType == null) {
|
||||
return null;
|
||||
} else {
|
||||
corpus.setCorpusType(corpusType);
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append(corpusLocation)
|
||||
.append("\n")
|
||||
.append(String.format(NOTIFICATION_FOUND_X_FILES, corpusFiles.size()))
|
||||
.append("\n")
|
||||
.append(String.format("Korpus: %s", corpusType.toString()));
|
||||
|
||||
String result = sb.toString();
|
||||
|
||||
logger.debug(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
public Corpus getCorpus() {
|
||||
return corpus;
|
||||
}
|
||||
|
||||
public void setCorpus(Corpus corpus) {
|
||||
this.corpus = corpus;
|
||||
}
|
||||
|
||||
public void setStringLevelTabNew2(Tab stringLevelTabNew2) { this.stringLevelTabNew2 = stringLevelTabNew2; }
|
||||
|
||||
public void setOneWordAnalysisTab(Tab oneWordAnalysisTab) { this.oneWordAnalysisTab = oneWordAnalysisTab; }
|
||||
|
||||
public void setCharacterLevelTab(Tab characterLevelTab) { this.characterLevelTab = characterLevelTab; }
|
||||
|
||||
public void setWordLevelTab(Tab wordLevelTab) {
|
||||
this.wordLevelTab = wordLevelTab;
|
||||
}
|
||||
|
||||
public void setFilterTab(Tab filterTab) {
|
||||
this.filterTab = filterTab;
|
||||
}
|
||||
|
||||
public void setFfsController(FiltersForSolar ffsController) {
|
||||
this.ffsController = ffsController;
|
||||
}
|
||||
|
||||
public void setTabPane(TabPane tabPane) {
|
||||
this.tabPane = tabPane;
|
||||
}
|
||||
|
||||
public void setSatNew2Controller(StringAnalysisTabNew2 satNew2Controller) { this.satNew2Controller = satNew2Controller; }
|
||||
|
||||
public void setOneWordTabController(OneWordAnalysisTab oneWordTabController) { this.oneWordTabController = oneWordTabController; }
|
||||
|
||||
public void setCatController(CharacterAnalysisTab catController) { this.catController = catController; }
|
||||
|
||||
/*public void setWfController(WordFormationTab wfController) {
|
||||
this.wfController = wfController;
|
||||
}*/
|
||||
|
||||
public void setWlController(WordLevelTab wlController) {
|
||||
this.wlController = wlController;
|
||||
}
|
||||
|
||||
public void setWordFormationTab(Tab wordFormationTab) {
|
||||
this.wordFormationTab = wordFormationTab;
|
||||
}
|
||||
|
||||
public void setHostServices(HostServices hostServices){
|
||||
this.hostService = hostServices;
|
||||
}
|
||||
}
|
187
src/main/java/gui/FiltersForSolar.java
Normal file
187
src/main/java/gui/FiltersForSolar.java
Normal file
@ -0,0 +1,187 @@
|
||||
package gui;
|
||||
|
||||
import static data.Enums.solar.SolarFilters.*;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
|
||||
import javafx.application.HostServices;
|
||||
import javafx.scene.control.Hyperlink;
|
||||
import org.controlsfx.control.CheckComboBox;
|
||||
|
||||
import data.Corpus;
|
||||
import javafx.collections.ListChangeListener;
|
||||
import javafx.collections.ObservableList;
|
||||
import javafx.fxml.FXML;
|
||||
import javafx.scene.control.Label;
|
||||
import javafx.scene.layout.AnchorPane;
|
||||
import util.Util;
|
||||
|
||||
public class FiltersForSolar {
|
||||
|
||||
@FXML
|
||||
public AnchorPane solarFiltersTabPane;
|
||||
@FXML
|
||||
public CheckComboBox<String> solarRegijaCCB;
|
||||
@FXML
|
||||
public CheckComboBox<String> solarPredmetCCB;
|
||||
@FXML
|
||||
public CheckComboBox<String> solarRazredCCB;
|
||||
@FXML
|
||||
public CheckComboBox<String> solarLetoCCB;
|
||||
@FXML
|
||||
public CheckComboBox<String> solarSolaCCB;
|
||||
@FXML
|
||||
public CheckComboBox<String> solarVrstaBesedilaCCB;
|
||||
@FXML
|
||||
public Label selectedFiltersLabel;
|
||||
@FXML
|
||||
private Hyperlink helpH;
|
||||
|
||||
private HashMap<String, ObservableList<String>> selectedFilters;
|
||||
private Corpus corpus;
|
||||
|
||||
private StringAnalysisTabNew2 satNew2Controller;
|
||||
private OneWordAnalysisTab oneWordTabController;
|
||||
private CharacterAnalysisTab catController;
|
||||
//private WordFormationTab wfController;
|
||||
private WordLevelTab wlController;
|
||||
private HostServices hostService;
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public void initialize() {
|
||||
selectedFilters = new HashMap<>();
|
||||
|
||||
solarRegijaCCB.getCheckModel().getCheckedItems().addListener((ListChangeListener) c -> {
|
||||
selectedFilters.put(REGIJA, solarRegijaCCB.getCheckModel().getCheckedItems());
|
||||
updateSolarFilterLabel();
|
||||
});
|
||||
|
||||
solarPredmetCCB.getCheckModel().getCheckedItems().addListener((ListChangeListener) c -> {
|
||||
selectedFilters.put(PREDMET, solarPredmetCCB.getCheckModel().getCheckedItems());
|
||||
updateSolarFilterLabel();
|
||||
});
|
||||
|
||||
solarRazredCCB.getCheckModel().getCheckedItems().addListener((ListChangeListener) c -> {
|
||||
selectedFilters.put(RAZRED, solarRazredCCB.getCheckModel().getCheckedItems());
|
||||
updateSolarFilterLabel();
|
||||
});
|
||||
|
||||
solarLetoCCB.getCheckModel().getCheckedItems().addListener((ListChangeListener) c -> {
|
||||
selectedFilters.put(LETO, solarLetoCCB.getCheckModel().getCheckedItems());
|
||||
updateSolarFilterLabel();
|
||||
});
|
||||
|
||||
solarSolaCCB.getCheckModel().getCheckedItems().addListener((ListChangeListener) c -> {
|
||||
selectedFilters.put(SOLA, solarSolaCCB.getCheckModel().getCheckedItems());
|
||||
updateSolarFilterLabel();
|
||||
});
|
||||
|
||||
solarVrstaBesedilaCCB.getCheckModel().getCheckedItems().addListener((ListChangeListener) c -> {
|
||||
selectedFilters.put(TIP, solarVrstaBesedilaCCB.getCheckModel().getCheckedItems());
|
||||
updateSolarFilterLabel();
|
||||
});
|
||||
|
||||
helpH.setOnAction(e -> openHelpWebsite());
|
||||
}
|
||||
|
||||
public void initFilters() {
|
||||
solarRegijaCCB.getItems().removeAll();
|
||||
solarRegijaCCB.getItems().setAll(corpus.getSolarFilters().get(REGIJA));
|
||||
solarRegijaCCB.getItems().sorted();
|
||||
solarPredmetCCB.getItems().removeAll();
|
||||
solarPredmetCCB.getItems().setAll(corpus.getSolarFilters().get(PREDMET));
|
||||
solarPredmetCCB.getItems().sorted();
|
||||
solarRazredCCB.getItems().removeAll();
|
||||
solarRazredCCB.getItems().setAll(corpus.getSolarFilters().get(RAZRED));
|
||||
solarRazredCCB.getItems().sorted();
|
||||
solarLetoCCB.getItems().removeAll();
|
||||
solarLetoCCB.getItems().setAll(corpus.getSolarFilters().get(LETO));
|
||||
solarLetoCCB.getItems().sorted();
|
||||
solarSolaCCB.getItems().removeAll();
|
||||
solarSolaCCB.getItems().setAll(corpus.getSolarFilters().get(SOLA));
|
||||
solarSolaCCB.getItems().sorted();
|
||||
solarVrstaBesedilaCCB.getItems().removeAll();
|
||||
solarVrstaBesedilaCCB.getItems().setAll(corpus.getSolarFilters().get(TIP));
|
||||
solarVrstaBesedilaCCB.getItems().sorted();
|
||||
}
|
||||
|
||||
private void updateSolarFilterLabel() {
|
||||
if (Util.isMapEmpty(selectedFilters)) {
|
||||
setSOlarFIlterLabelText("/");
|
||||
} else {
|
||||
StringBuilder allFilters = new StringBuilder();
|
||||
for (Map.Entry<String, ObservableList<String>> entry : selectedFilters.entrySet()) {
|
||||
ArrayList<String> values = new ArrayList<>(entry.getValue());
|
||||
|
||||
if (!values.isEmpty()) {
|
||||
allFilters.append(entry.getKey())
|
||||
.append(": ");
|
||||
|
||||
for (int i = 0; i < values.size(); i++) {
|
||||
allFilters.append(values.get(i));
|
||||
|
||||
if (i < values.size() - 1) {
|
||||
// so we won't append a comma after the last element
|
||||
allFilters.append(", ");
|
||||
}
|
||||
}
|
||||
allFilters.append("\n\n");
|
||||
}
|
||||
}
|
||||
|
||||
setSOlarFIlterLabelText(allFilters.toString());
|
||||
}
|
||||
|
||||
HashMap<String, HashSet<String>> solarFiltersMap = new HashMap<>();
|
||||
for (Map.Entry<String, ObservableList<String>> e : selectedFilters.entrySet()) {
|
||||
HashSet<String> values = new HashSet<>();
|
||||
values.addAll(e.getValue());
|
||||
|
||||
solarFiltersMap.put(e.getKey(), values);
|
||||
}
|
||||
|
||||
satNew2Controller.setSolarFiltersMap(solarFiltersMap);
|
||||
oneWordTabController.setSolarFiltersMap(solarFiltersMap);
|
||||
catController.setSolarFiltersMap(solarFiltersMap);
|
||||
//wfController.setSolarFiltersMap(solarFiltersMap);
|
||||
wlController.setSolarFiltersMap(solarFiltersMap);
|
||||
}
|
||||
|
||||
private void openHelpWebsite(){
|
||||
hostService.showDocument(Messages.HELP_URL);
|
||||
}
|
||||
|
||||
private void setSOlarFIlterLabelText(String content) {
|
||||
selectedFiltersLabel.setText(content);
|
||||
satNew2Controller.setSelectedFiltersLabel(content);
|
||||
oneWordTabController.setSelectedFiltersLabel(content);
|
||||
catController.setSelectedFiltersLabel(content);
|
||||
//wfController.setSelectedFiltersLabel(content);
|
||||
wlController.setSelectedFiltersLabel(content);
|
||||
}
|
||||
|
||||
public void setCorpus(Corpus corpus) {
|
||||
this.corpus = corpus;
|
||||
}
|
||||
|
||||
public void setSatNew2Controller(StringAnalysisTabNew2 satNew2Controller) { this.satNew2Controller = satNew2Controller; }
|
||||
|
||||
public void setOneWordTabController(OneWordAnalysisTab oneWordTabController) { this.oneWordTabController = oneWordTabController; }
|
||||
|
||||
public void setCatController(CharacterAnalysisTab catController) { this.catController = catController; }
|
||||
|
||||
/*public void setWfController(WordFormationTab wfController) {
|
||||
this.wfController = wfController;
|
||||
}*/
|
||||
|
||||
public void setWlController(WordLevelTab wlController) {
|
||||
this.wlController = wlController;
|
||||
}
|
||||
|
||||
public void setHostServices(HostServices hostServices){
|
||||
this.hostService = hostServices;
|
||||
}
|
||||
}
|
150
src/main/java/gui/GUIController.java
Normal file
150
src/main/java/gui/GUIController.java
Normal file
@ -0,0 +1,150 @@
|
||||
package gui;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
import org.kordamp.ikonli.fontawesome.FontAwesome;
|
||||
import org.kordamp.ikonli.javafx.FontIcon;
|
||||
|
||||
import data.Corpus;
|
||||
import javafx.application.Application;
|
||||
import javafx.fxml.FXML;
|
||||
import javafx.fxml.FXMLLoader;
|
||||
import javafx.scene.Parent;
|
||||
import javafx.scene.Scene;
|
||||
import javafx.scene.control.Alert;
|
||||
import javafx.scene.control.Tab;
|
||||
import javafx.scene.control.TabPane;
|
||||
import javafx.stage.Stage;
|
||||
|
||||
/**
 * Top-level JavaFX {@link Application} and root controller for the main GUI.
 * <p>
 * {@link #start(Stage)} loads {@code /GUI.fxml} and shows the primary stage;
 * {@link #initialize()} (invoked by the FXMLLoader after injection) creates the
 * shared {@link Corpus} and wires it, together with cross-references and
 * {@code HostServices}, into the per-tab sub-controllers. Sub-controllers and
 * included roots are injected via the {@code fx:include} naming convention
 * (field {@code x} receives the included root, {@code xController} its controller).
 */
public class GUIController extends Application {
    public final static Logger logger = LogManager.getLogger(GUIController.class);

    // Tabs injected from GUI.fxml — each fx:id must match the field name exactly.
    @FXML
    public Tab StringLevelTabNew2;
    @FXML
    public Tab OneWordAnalysisTab;
    @FXML
    public Tab CharacterLevelTabNew;
    @FXML
    public Tab corpusTab;
    public TabPane tabPane;

    // Sub-controllers injected by the loader (fx:include convention, see class doc).
    @FXML
    private CharacterAnalysisTab catController;
    // NOTE(review): @FXML injection does not apply to static fields; "sat",
    // "satNew2" and "oneWordTab" are never assigned or read in this class —
    // they appear to be dead declarations. Confirm before removing.
    @FXML
    private static Parent sat;
    @FXML
    private StringAnalysisTabNew2 satNew2Controller;
    @FXML
    private static Parent satNew2;
    @FXML
    private OneWordAnalysisTab oneWordTabController;
    @FXML
    private static Parent oneWordTab;
    @FXML
    private CorpusTab ctController;
    @FXML
    private Parent ct;
    //@FXML
    //private WordFormationTab wfController;
    @FXML
    private Parent wf;
    @FXML
    private WordLevelTab wlController;
    @FXML
    private Parent wl;
    @FXML
    private FiltersForSolar ffsController;
    @FXML
    private Parent ffs;
    @FXML
    private SelectedFiltersPane sfpController;
    @FXML
    private Parent sfp;
    @FXML
    public Tab stringLevelTab;
    @FXML
    public Tab wordLevelTab;
    /*@FXML
    public Tab wordFormationTab;*/

    @FXML
    public Tab filterTab;
    // Primary stage, captured in start() so other code can reference the window.
    public Stage stage;

    // Single Corpus instance shared by every tab controller (created in initialize()).
    private Corpus corpus;

    /**
     * JavaFX entry point: loads the FXML layout, builds the scene and shows it.
     *
     * @param primaryStage the stage supplied by the JavaFX runtime
     * @throws IOException if {@code /GUI.fxml} cannot be located or parsed
     */
    @Override
    public void start(Stage primaryStage) throws IOException {
        Parent root = FXMLLoader.load(getClass().getResource("/GUI.fxml"));
        primaryStage.setTitle("GUI");
        Scene scene = new Scene(root, 800, 600);
        // https://github.com/dicolar/jbootx
        // scene.getStylesheets().add(GUIController.class.getResource("bootstrap3.css").toExternalForm())
        primaryStage.setScene(scene);
        stage = primaryStage;
        primaryStage.show();
    }

    /** Standard launcher — delegates to the JavaFX runtime. */
    public static void main(String[] args) {
        launch(args);
    }

    /**
     * Called by the FXMLLoader once all @FXML fields are injected.
     * Creates the shared corpus, distributes it (plus tab/controller
     * references and HostServices) to the sub-controllers, sets tab icons
     * and hides the filter tab until a corpus that needs it is loaded.
     */
    public void initialize() {
        corpus = new Corpus();
        // CorpusTab needs references to nearly everything: when the user loads
        // a corpus it re-initializes the other tabs.
        ctController.setCorpus(corpus);
        ctController.setFilterTab(filterTab);
        ctController.setStringLevelTabNew2(StringLevelTabNew2);
        ctController.setOneWordAnalysisTab(OneWordAnalysisTab);
        ctController.setCharacterLevelTab(CharacterLevelTabNew);
        ctController.setSatNew2Controller(satNew2Controller);
        ctController.setOneWordTabController(oneWordTabController);
        ctController.setCatController(catController);
        //ctController.setWfController(wfController);
        ctController.setWlController(wlController);
        ctController.setTabPane(tabPane);
        ctController.setFfsController(ffsController);
        //ctController.setWordFormationTab(wordFormationTab);
        ctController.setWordLevelTab(wordLevelTab);
        ctController.setHostServices(getHostServices());

        // Each analysis tab shares the same Corpus and gets HostServices
        // (used to open the external help website).
        satNew2Controller.setCorpus(corpus);
        satNew2Controller.setHostServices(getHostServices());
        oneWordTabController.setCorpus(corpus);
        oneWordTabController.setHostServices(getHostServices());
        catController.setCorpus(corpus);
        catController.setHostServices(getHostServices());
        //wfController.setCorpus(corpus);
        //wfController.setHostServices(getHostServices());
        wlController.setCorpus(corpus);
        wlController.setHostServices(getHostServices());
        // The Šolar filter pane pushes selected filters into the analysis tabs.
        ffsController.setSatNew2Controller(satNew2Controller);
        ffsController.setOneWordTabController(oneWordTabController);
        ffsController.setCatController(catController);
        //ffsController.setWfController(wfController);
        ffsController.setWlController(wlController);
        ffsController.setHostServices(getHostServices());

        // set tab icons
        corpusTab.setGraphic(new FontIcon(FontAwesome.COG));
        filterTab.setGraphic(new FontIcon(FontAwesome.FILTER));

        // hide filter tab
        tabPane.getTabs().removeAll(filterTab);
    }

    /**
     * Shows a modal alert dialog and blocks until it is dismissed.
     * The window title is derived from the alert type via {@link Messages#windowTitles}.
     *
     * @param alertType   severity (ERROR / WARNING / CONFIRMATION ...)
     * @param headerText  header line, may be null (rendered as empty)
     * @param contentText body text, may be null (rendered as empty)
     */
    static void showAlert(Alert.AlertType alertType, String headerText, String contentText) {
        Alert alert = new Alert(alertType);
        alert.setTitle(Messages.windowTitles.get(alertType));
        alert.setHeaderText(headerText != null ? headerText : "");
        alert.setContentText(contentText != null ? contentText : "");
        alert.showAndWait();
    }

    /** Convenience overload of {@link #showAlert(Alert.AlertType, String, String)} with no body text. */
    static void showAlert(Alert.AlertType alertType, String headerText) {
        showAlert(alertType, headerText, null);
    }
}
|
74
src/main/java/gui/Messages.java
Normal file
74
src/main/java/gui/Messages.java
Normal file
@ -0,0 +1,74 @@
|
||||
package gui;
|
||||
|
||||
import static javafx.scene.control.Alert.AlertType.*;
|
||||
|
||||
import java.util.HashMap;
|
||||
|
||||
import javafx.scene.control.Alert;
|
||||
|
||||
public class Messages {
|
||||
|
||||
// warnings & errors
|
||||
public static final String WARNING_CORPUS_NOT_FOUND = "V izbranem direktoriju ni ustreznih korpusnih datotek.";
|
||||
public static final String WARNING_RESULTS_DIR_NOT_VALID = "Za dostop do izbranega direktorija nimate potrebnih pravic.";
|
||||
public static final String WARNING_DIFFERING_NGRAM_LEVEL_AND_FILTER_TOKENS = "Izbran nivo ngramov in vpisano št. besed v filtru se ne ujemata.";
|
||||
public static final String WARNING_DIFFERING_NGRAM_LEVEL_AND_FILTER_TOKENS_INFO = "Izberite drugo število ali popravite filter.";
|
||||
public static final String WARNING_WORD_OR_LEMMA = "Izberite, če želite statistiko izračunati za besede ali leme.";
|
||||
public static final String WARNING_ONLY_NUMBERS_ALLOWED = "Prosim vnesite veljavno število.";
|
||||
public static final String WARNING_MISMATCHED_NGRAM_AND_TOKENS_VALUES = "Število za ngram (%d) in število msd oznak (%d) se morata ujemati.";
|
||||
public static final String WARNING_MISSING_STRING_LENGTH = "Dolžina niza mora biti večja od 0. Vstavljena je privzeta vrednost (1).";
|
||||
public static final String WARNING_NO_TAXONOMY_FOUND = "Iz korpusnih datotek ni bilo moč razbrati taksonomije. Prosim izberite drugo lokacijo ali korpus.";
|
||||
public static final String WARNING_NO_SOLAR_FILTERS_FOUND = "Iz korpusnih datotek ni bilo moč razbrati filtrov. Prosim izberite drugo lokacijo ali korpus.";
|
||||
public static final String ERROR_WHILE_EXECUTING = "Prišlo je do napake med izvajanjem.";
|
||||
public static final String ERROR_WHILE_SAVING_RESULTS_TO_CSV = "Prišlo je do napake med shranjevanje rezultatov.";
|
||||
|
||||
// missing
|
||||
public static final String MISSING_NGRAM_LEVEL = "N-gram nivo";
|
||||
public static final String MISSING_CALCULATE_FOR = "Izračunaj za";
|
||||
public static final String MISSING_SKIP = "";
|
||||
public static final String MISSING_STRING_LENGTH = "Dolžina niza";
|
||||
public static final String MISMATCHED_STRING_LENGTH_AND_MSD_REGEX = "Neujemajoča dolžina niza in regex filter";
|
||||
|
||||
|
||||
// general notifications - static content/set only once
|
||||
public static final String NOTIFICATION_FOUND_X_FILES = "Št. najdenih datotek: %d";
|
||||
public static final String NOTIFICATION_ANALYSIS_COMPLETED = "Analiza je zaključena, rezultati so shranjeni.";
|
||||
public static final String NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS = "Analiza je zaključena, vendar ni bilo moč izračunati statistike, ki bi ustrezala vsem navedenim pogojem.";
|
||||
public static final String RESULTS_PATH_SET_TO_DEFAULT = "Lokacija za shranjevanje rezultatov je nastavljena na lokacijo korpusa.";
|
||||
|
||||
// ongoing notifications - displayed while processing, dynamically changing
|
||||
public static final String ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y = "Analiziram datoteko %d od %d (%s)";
|
||||
|
||||
// Labels
|
||||
public static final String LABEL_CORPUS_LOCATION_NOT_SET = "Lokacija korpusa ni nastavljena";
|
||||
public static final String LABEL_RESULTS_LOCATION_NOT_SET = "Lokacija za shranjevanje rezultatov ni nastavljena";
|
||||
public static final String LABEL_RESULTS_CORPUS_TYPE_NOT_SET = "Vrsta korpusa ni nastavljena";
|
||||
|
||||
public static final String LABEL_SCANNING_CORPUS = "Iskanje in analiza korpusnih datotek...";
|
||||
public static final String LABEL_SCANNING_SINGLE_FILE_CORPUS = "Analiza vnosa ";
|
||||
public static final String COMPLETED = "končano";
|
||||
|
||||
public static final String TOOLTIP_chooseCorpusLocationB = "Izberite mapo v kateri se nahaja korpus. Program izbrano mapo preišče rekurzivno, zato bodite pozorni, da ne izberete mape z več korpusi ali z mnogo datotekami, ki niso del korpusa.";
|
||||
public static final String TOOLTIP_readHeaderInfoChB = "Če izberete to opcijo, se bo iz headerjev korpusa prebrala razpoložljiva taksonomija oz. filtri (korpus Šolar). Ta operacija lahko traja dlje časa, sploh če je korpus združen v eni sami datoteki.";
|
||||
|
||||
|
||||
|
||||
// Not properly to be here. TODO move somewhere else in future
|
||||
public static final String HELP_URL = "http://slovnica.ijs.si/";
|
||||
|
||||
// helper maps
|
||||
/**
|
||||
* Typical window titles
|
||||
* ERROR = "Napaka"
|
||||
* WARNING = "Opozorilo"
|
||||
* CONFIRMATION = "Potrdilo"
|
||||
*/
|
||||
static HashMap<Alert.AlertType, String> windowTitles = new HashMap<>();
|
||||
|
||||
static {
|
||||
// automatically set window's title
|
||||
windowTitles.put(ERROR, "Napaka");
|
||||
windowTitles.put(WARNING, "Opozorilo");
|
||||
windowTitles.put(CONFIRMATION, "Potrdilo");
|
||||
}
|
||||
}
|
389
src/main/java/gui/OneWordAnalysisTab.java
Executable file
389
src/main/java/gui/OneWordAnalysisTab.java
Executable file
@ -0,0 +1,389 @@
|
||||
package gui;
|
||||
|
||||
import data.*;
|
||||
import javafx.application.HostServices;
|
||||
import javafx.collections.FXCollections;
|
||||
import javafx.collections.ListChangeListener;
|
||||
import javafx.collections.ObservableList;
|
||||
import javafx.concurrent.Task;
|
||||
import javafx.fxml.FXML;
|
||||
import javafx.scene.control.*;
|
||||
import javafx.scene.layout.Pane;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
import org.controlsfx.control.CheckComboBox;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import static alg.XML_processing.readXML;
|
||||
import static gui.GUIController.showAlert;
|
||||
import static gui.Messages.*;
|
||||
|
||||
@SuppressWarnings("Duplicates")
|
||||
public class OneWordAnalysisTab {
|
||||
public final static Logger logger = LogManager.getLogger(OneWordAnalysisTab.class);
|
||||
|
||||
@FXML
|
||||
public Label selectedFiltersLabel;
|
||||
@FXML
|
||||
public Label solarFilters;
|
||||
|
||||
@FXML
|
||||
private TextField msdTF;
|
||||
private ArrayList<Pattern> msd;
|
||||
private ArrayList<String> msdStrings;
|
||||
|
||||
@FXML
|
||||
private CheckComboBox<String> taxonomyCCB;
|
||||
private ArrayList<String> taxonomy;
|
||||
|
||||
@FXML
|
||||
private ComboBox<String> calculateForCB;
|
||||
private CalculateFor calculateFor;
|
||||
|
||||
|
||||
@FXML
|
||||
private Button computeNgramsB;
|
||||
|
||||
@FXML
|
||||
public ProgressBar ngramProgressBar;
|
||||
@FXML
|
||||
public Label progressLabel;
|
||||
|
||||
@FXML
|
||||
private Hyperlink helpH;
|
||||
|
||||
private enum MODE {
|
||||
LETTER,
|
||||
WORD
|
||||
}
|
||||
|
||||
private MODE currentMode;
|
||||
|
||||
private Corpus corpus;
|
||||
private HashMap<String, HashSet<String>> solarFiltersMap;
|
||||
private Filter filter;
|
||||
private boolean useDb;
|
||||
private HostServices hostService;
|
||||
|
||||
private static final ObservableList<String> N_GRAM_COMPUTE_FOR_WORDS = FXCollections.observableArrayList("lema", "različnica", "oblikoskladenjska oznaka");
|
||||
private static final ObservableList<String> N_GRAM_COMPUTE_FOR_LETTERS = FXCollections.observableArrayList("lema", "različnica");
|
||||
private static final ObservableList<String> N_GRAM_COMPUTE_FOR_WORDS_ORTH = FXCollections.observableArrayList("različnica");
|
||||
|
||||
|
||||
// TODO: pass observables for taxonomy based on header scan
|
||||
// after header scan
|
||||
private ObservableList<String> taxonomyCCBValues;
|
||||
private CorpusType currentCorpusType;
|
||||
|
||||
public void init() {
|
||||
currentMode = MODE.WORD;
|
||||
toggleMode(currentMode);
|
||||
|
||||
// calculateForCB
|
||||
calculateForCB.valueProperty().addListener((observable, oldValue, newValue) -> {
|
||||
calculateFor = CalculateFor.factory(newValue);
|
||||
logger.info("calculateForCB:", calculateFor.toString());
|
||||
});
|
||||
|
||||
calculateForCB.getSelectionModel().select(0);
|
||||
|
||||
// msd
|
||||
msdTF.focusedProperty().addListener((observable, oldValue, newValue) -> {
|
||||
if (!newValue) {
|
||||
// focus lost
|
||||
String value = msdTF.getText();
|
||||
logger.info("msdTf: ", value);
|
||||
|
||||
if (!ValidationUtil.isEmpty(value)) {
|
||||
ArrayList<String> msdTmp = new ArrayList<>(Arrays.asList(value.split(" ")));
|
||||
|
||||
int nOfRequiredMsdTokens = 1;
|
||||
if (msdTmp.size() != nOfRequiredMsdTokens) {
|
||||
String msg = String.format(Messages.WARNING_MISMATCHED_NGRAM_AND_TOKENS_VALUES, nOfRequiredMsdTokens, msdTmp.size());
|
||||
logAlert(msg);
|
||||
showAlert(Alert.AlertType.ERROR, msg);
|
||||
}
|
||||
msd = new ArrayList<>();
|
||||
msdStrings = new ArrayList<>();
|
||||
for (String msdToken : msdTmp) {
|
||||
msd.add(Pattern.compile(msdToken));
|
||||
msdStrings.add(msdToken);
|
||||
}
|
||||
logger.info(String.format("msd accepted (%d)", msd.size()));
|
||||
|
||||
} else if (!ValidationUtil.isEmpty(newValue)) {
|
||||
msd = new ArrayList<>();
|
||||
msdStrings = new ArrayList<>();
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
msdTF.setText("");
|
||||
msd = new ArrayList<>();
|
||||
|
||||
// taxonomy
|
||||
if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
|
||||
taxonomyCCB.getItems().removeAll();
|
||||
taxonomyCCB.getItems().setAll(corpus.getTaxonomy());
|
||||
taxonomyCCB.getCheckModel().getCheckedItems().addListener((ListChangeListener<String>) c -> {
|
||||
taxonomy = new ArrayList<>();
|
||||
ObservableList<String> checkedItems = taxonomyCCB.getCheckModel().getCheckedItems();
|
||||
taxonomy.addAll(checkedItems);
|
||||
logger.info(String.format("Selected taxonomy: %s", StringUtils.join(checkedItems, ",")));
|
||||
});
|
||||
taxonomyCCB.getCheckModel().clearChecks();
|
||||
} else {
|
||||
taxonomyCCB.setDisable(true);
|
||||
}
|
||||
|
||||
computeNgramsB.setOnAction(e -> {
|
||||
compute();
|
||||
logger.info("compute button");
|
||||
});
|
||||
helpH.setOnAction(e -> openHelpWebsite());
|
||||
}
|
||||
|
||||
/**
|
||||
* case a: values for combo boxes can change after a corpus change
|
||||
* <ul>
|
||||
* <li>different corpus type - reset all fields so no old values remain</li>
|
||||
* <li>same corpus type, different subset - keep</li>
|
||||
* </ul>
|
||||
* <p>
|
||||
* case b: values for combo boxes can change after a header scan
|
||||
* <ul>
|
||||
* <li>at first, fields are populated by corpus type defaults</li>
|
||||
* <li>after, with gathered data</li>
|
||||
* </ul>
|
||||
* <p></p>
|
||||
* ngrams: 1
|
||||
* calculateFor: word
|
||||
* msd:
|
||||
* taxonomy:
|
||||
* skip: 0
|
||||
* iscvv: false
|
||||
* string length: 1
|
||||
*/
|
||||
public void populateFields() {
|
||||
// corpus changed if: current one is null (this is first run of the app)
|
||||
// or if currentCorpus != gui's corpus
|
||||
boolean corpusChanged = currentCorpusType == null
|
||||
|| currentCorpusType != corpus.getCorpusType();
|
||||
|
||||
|
||||
// TODO: check for GOS, GIGAFIDA, SOLAR...
|
||||
// refresh and:
|
||||
// TODO if current value != null && is in new calculateFor ? keep : otherwise reset
|
||||
if (calculateFor == null) {
|
||||
calculateForCB.getSelectionModel().select(calculateForCB.getItems().get(0));
|
||||
calculateFor = CalculateFor.factory(calculateForCB.getItems().get(0));
|
||||
}
|
||||
|
||||
if (!filter.hasMsd()) {
|
||||
// if current corpus doesn't have msd data, disable this field
|
||||
msd = new ArrayList<>();
|
||||
msdTF.setText("");
|
||||
msdTF.setDisable(true);
|
||||
logger.info("no msd data");
|
||||
} else {
|
||||
if (ValidationUtil.isEmpty(msd)
|
||||
|| (!ValidationUtil.isEmpty(msd) && corpusChanged)) {
|
||||
// msd has not been set previously
|
||||
// or msd has been set but the corpus changed -> reset
|
||||
msd = new ArrayList<>();
|
||||
msdTF.setText("");
|
||||
msdTF.setDisable(false);
|
||||
logger.info("msd reset");
|
||||
} else if (!ValidationUtil.isEmpty(msd) && !corpusChanged) {
|
||||
// if msd has been set, but corpus type remained the same, we can keep any set msd value
|
||||
msdTF.setText(StringUtils.join(msdStrings, " "));
|
||||
msdTF.setDisable(false);
|
||||
logger.info("msd kept");
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: trigger on rescan
|
||||
if ((currentCorpusType != null && currentCorpusType != corpus.getCorpusType())) {
|
||||
// user changed corpus (by type) or by selection & triggered a rescan of headers
|
||||
// see if we read taxonomy from headers, otherwise use default values for given corpus
|
||||
ObservableList<String> tax = corpus.getTaxonomy();
|
||||
taxonomyCCBValues = tax != null ? tax : Taxonomy.getDefaultForComboBox(corpus.getCorpusType());
|
||||
|
||||
currentCorpusType = corpus.getCorpusType();
|
||||
// setTaxonomyIsDirty(false);
|
||||
} else {
|
||||
|
||||
}
|
||||
|
||||
// see if we read taxonomy from headers, otherwise use default values for given corpus
|
||||
ObservableList<String> tax = corpus.getTaxonomy();
|
||||
taxonomyCCBValues = tax != null ? tax : Taxonomy.getDefaultForComboBox(corpus.getCorpusType());
|
||||
taxonomyCCB.getItems().addAll(taxonomyCCBValues);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Toggles visibility for panes which hold fields for skipgram value (not applicable when calculating for letters) etc.,
|
||||
* sets combobox values to what is applicable ...
|
||||
*
|
||||
* @param mode
|
||||
*/
|
||||
public void toggleMode(MODE mode) {
|
||||
if (mode == null) {
|
||||
mode = currentMode;
|
||||
}
|
||||
|
||||
logger.info("mode: ", mode.toString());
|
||||
|
||||
if (mode == MODE.WORD) {
|
||||
calculateForCB.getItems().setAll(N_GRAM_COMPUTE_FOR_WORDS);
|
||||
} else if (mode == MODE.LETTER) {
|
||||
calculateForCB.getItems().setAll(N_GRAM_COMPUTE_FOR_LETTERS);
|
||||
|
||||
|
||||
// if calculateFor was selected for something other than a word or a lemma -> reset
|
||||
if (!(calculateFor == CalculateFor.WORD || calculateFor == CalculateFor.LEMMA)) {
|
||||
// if the user selected something else before selecting ngram for letters, reset that choice
|
||||
calculateFor = CalculateFor.WORD;
|
||||
calculateForCB.getSelectionModel().select("različnica");
|
||||
}
|
||||
}
|
||||
|
||||
// override if orth mode, allow only word
|
||||
if (corpus.isGosOrthMode()) {
|
||||
calculateForCB.getItems().setAll(N_GRAM_COMPUTE_FOR_WORDS_ORTH);
|
||||
msdTF.setDisable(true);
|
||||
} else {
|
||||
msdTF.setDisable(false);
|
||||
}
|
||||
}
|
||||
|
||||
private void compute() {
|
||||
Filter filter = new Filter();
|
||||
filter.setNgramValue(1);
|
||||
filter.setCalculateFor(calculateFor);
|
||||
filter.setMsd(msd);
|
||||
filter.setTaxonomy(Tax.getTaxonomyCodes(taxonomy, corpus.getCorpusType()));
|
||||
filter.setAl(AnalysisLevel.STRING_LEVEL);
|
||||
filter.setSkipValue(0);
|
||||
filter.setIsCvv(false);
|
||||
filter.setSolarFilters(solarFiltersMap);
|
||||
filter.setStringLength(1);
|
||||
|
||||
String message = Validation.validateForStringLevel(filter);
|
||||
if (message == null) {
|
||||
// no errors
|
||||
logger.info("Executing: ", filter.toString());
|
||||
StatisticsNew statistic = new StatisticsNew(corpus, filter, useDb);
|
||||
execute(statistic);
|
||||
} else {
|
||||
logAlert(message);
|
||||
showAlert(Alert.AlertType.ERROR, "Prosim izpolnite polja:", message);
|
||||
}
|
||||
}
|
||||
|
||||
private void logAlert(String alert) {
|
||||
logger.info("alert: " + alert);
|
||||
}
|
||||
|
||||
private void openHelpWebsite(){
|
||||
hostService.showDocument(Messages.HELP_URL);
|
||||
}
|
||||
|
||||
public Corpus getCorpus() {
|
||||
return corpus;
|
||||
}
|
||||
|
||||
public void setCorpus(Corpus corpus) {
|
||||
this.corpus = corpus;
|
||||
|
||||
if (corpus.getCorpusType() != CorpusType.SOLAR) {
|
||||
setSelectedFiltersLabel(null);
|
||||
} else {
|
||||
setSelectedFiltersLabel("/");
|
||||
}
|
||||
}
|
||||
|
||||
public void setSelectedFiltersLabel(String content) {
|
||||
if (content != null) {
|
||||
solarFilters.setVisible(true);
|
||||
selectedFiltersLabel.setVisible(true);
|
||||
selectedFiltersLabel.setText(content);
|
||||
} else {
|
||||
solarFilters.setVisible(false);
|
||||
selectedFiltersLabel.setVisible(false);
|
||||
}
|
||||
}
|
||||
|
||||
private void execute(StatisticsNew statistic) {
|
||||
logger.info("Started execution: ", statistic.getFilter());
|
||||
|
||||
Collection<File> corpusFiles = statistic.getCorpus().getDetectedCorpusFiles();
|
||||
boolean corpusIsSplit = corpusFiles.size() > 1;
|
||||
|
||||
final Task<Void> task = new Task<Void>() {
|
||||
@SuppressWarnings("Duplicates")
|
||||
@Override
|
||||
protected Void call() throws Exception {
|
||||
long i = 0;
|
||||
for (File f : corpusFiles) {
|
||||
readXML(f.toString(), statistic);
|
||||
i++;
|
||||
this.updateProgress(i, corpusFiles.size());
|
||||
this.updateMessage(String.format(ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y, i, corpusFiles.size(), f.getName()));
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
};
|
||||
|
||||
ngramProgressBar.progressProperty().bind(task.progressProperty());
|
||||
progressLabel.textProperty().bind(task.messageProperty());
|
||||
|
||||
task.setOnSucceeded(e -> {
|
||||
try {
|
||||
boolean successullySaved = statistic.saveResultToDisk();
|
||||
if (successullySaved) {
|
||||
showAlert(Alert.AlertType.INFORMATION, Messages.NOTIFICATION_ANALYSIS_COMPLETED);
|
||||
} else {
|
||||
showAlert(Alert.AlertType.INFORMATION, Messages.NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS);
|
||||
}
|
||||
} catch (UnsupportedEncodingException e1) {
|
||||
showAlert(Alert.AlertType.ERROR, ERROR_WHILE_SAVING_RESULTS_TO_CSV);
|
||||
logger.error("Error while saving", e1);
|
||||
}
|
||||
|
||||
ngramProgressBar.progressProperty().unbind();
|
||||
ngramProgressBar.setStyle(Settings.FX_ACCENT_OK);
|
||||
progressLabel.textProperty().unbind();
|
||||
progressLabel.setText("");
|
||||
});
|
||||
|
||||
task.setOnFailed(e -> {
|
||||
showAlert(Alert.AlertType.ERROR, ERROR_WHILE_EXECUTING);
|
||||
logger.error("Error while executing", e);
|
||||
ngramProgressBar.progressProperty().unbind();
|
||||
ngramProgressBar.setProgress(0.0);
|
||||
ngramProgressBar.setStyle(Settings.FX_ACCENT_NOK);
|
||||
progressLabel.textProperty().unbind();
|
||||
progressLabel.setText("");
|
||||
});
|
||||
|
||||
final Thread thread = new Thread(task, "task");
|
||||
thread.setDaemon(true);
|
||||
thread.start();
|
||||
}
|
||||
|
||||
public void setSolarFiltersMap(HashMap<String, HashSet<String>> solarFiltersMap) {
|
||||
this.solarFiltersMap = solarFiltersMap;
|
||||
}
|
||||
public void setHostServices(HostServices hostServices){
|
||||
this.hostService = hostServices;
|
||||
}
|
||||
|
||||
}
|
18
src/main/java/gui/SelectedFiltersPane.java
Normal file
18
src/main/java/gui/SelectedFiltersPane.java
Normal file
@ -0,0 +1,18 @@
|
||||
package gui;
|
||||
|
||||
import javafx.scene.control.Label;
|
||||
|
||||
public class SelectedFiltersPane {
|
||||
|
||||
|
||||
public Label selectedFiltersLabel;
|
||||
|
||||
public Label getSelectedFiltersLabel() {
|
||||
return selectedFiltersLabel;
|
||||
}
|
||||
|
||||
public void setSelectedFiltersLabel(String filters) {
|
||||
this.selectedFiltersLabel = new Label(filters);
|
||||
this.selectedFiltersLabel.setText("test?");
|
||||
}
|
||||
}
|
511
src/main/java/gui/StringAnalysisTabNew2.java
Executable file
511
src/main/java/gui/StringAnalysisTabNew2.java
Executable file
@ -0,0 +1,511 @@
|
||||
package gui;
|
||||
|
||||
import static alg.XML_processing.*;
|
||||
import static gui.GUIController.*;
|
||||
import static gui.Messages.*;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import javafx.application.HostServices;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
import org.controlsfx.control.CheckComboBox;
|
||||
|
||||
import data.*;
|
||||
import javafx.collections.FXCollections;
|
||||
import javafx.collections.ListChangeListener;
|
||||
import javafx.collections.ObservableList;
|
||||
import javafx.concurrent.Task;
|
||||
import javafx.fxml.FXML;
|
||||
import javafx.scene.control.*;
|
||||
import javafx.scene.layout.Pane;
|
||||
|
||||
@SuppressWarnings("Duplicates")
|
||||
public class StringAnalysisTabNew2 {
|
||||
public final static Logger logger = LogManager.getLogger(StringAnalysisTabNew2.class);
|
||||
|
||||
@FXML
|
||||
public Label selectedFiltersLabel;
|
||||
@FXML
|
||||
public Label solarFilters;
|
||||
|
||||
@FXML
|
||||
private TextField msdTF;
|
||||
private ArrayList<Pattern> msd;
|
||||
private ArrayList<String> msdStrings;
|
||||
|
||||
@FXML
|
||||
private CheckComboBox<String> taxonomyCCB;
|
||||
private ArrayList<String> taxonomy;
|
||||
|
||||
@FXML
|
||||
private CheckBox calculatecvvCB;
|
||||
private boolean calculateCvv;
|
||||
|
||||
@FXML
|
||||
private TextField stringLengthTF;
|
||||
private Integer stringLength;
|
||||
|
||||
@FXML
|
||||
private ComboBox<String> calculateForCB;
|
||||
private CalculateFor calculateFor;
|
||||
|
||||
@FXML
|
||||
private ComboBox<String> ngramValueCB;
|
||||
private Integer ngramValue;
|
||||
|
||||
@FXML
|
||||
private ComboBox<String> skipValueCB;
|
||||
private Integer skipValue;
|
||||
|
||||
@FXML
|
||||
private Pane paneWords;
|
||||
|
||||
@FXML
|
||||
private Pane paneLetters;
|
||||
|
||||
@FXML
|
||||
private Button computeNgramsB;
|
||||
|
||||
@FXML
|
||||
public ProgressBar ngramProgressBar;
|
||||
@FXML
|
||||
public Label progressLabel;
|
||||
|
||||
@FXML
|
||||
private Hyperlink helpH;
|
||||
|
||||
private enum MODE {
|
||||
LETTER,
|
||||
WORD
|
||||
}
|
||||
|
||||
private MODE currentMode;
|
||||
|
||||
private Corpus corpus;
|
||||
private HashMap<String, HashSet<String>> solarFiltersMap;
|
||||
private Filter filter;
|
||||
private boolean useDb;
|
||||
private HostServices hostService;
|
||||
|
||||
private static final ObservableList<String> N_GRAM_COMPUTE_FOR_WORDS = FXCollections.observableArrayList("lema", "različnica", "oblikoskladenjska oznaka");
|
||||
private static final ObservableList<String> N_GRAM_COMPUTE_FOR_LETTERS = FXCollections.observableArrayList("lema", "različnica");
|
||||
private static final ObservableList<String> N_GRAM_COMPUTE_FOR_WORDS_ORTH = FXCollections.observableArrayList("različnica");
|
||||
|
||||
|
||||
// TODO: pass observables for taxonomy based on header scan
|
||||
// after header scan
|
||||
private ObservableList<String> taxonomyCCBValues;
|
||||
private CorpusType currentCorpusType;
|
||||
|
||||
public void init() {
|
||||
currentMode = MODE.WORD;
|
||||
toggleMode(currentMode);
|
||||
|
||||
// ngram value CB
|
||||
ngramValueCB.valueProperty().addListener((observable, oldValue, newValue) -> {
|
||||
if (newValue.equals("nivo črk")) {
|
||||
ngramValue = 0;
|
||||
toggleMode(MODE.LETTER);
|
||||
} else {
|
||||
ngramValue = Integer.valueOf(newValue);
|
||||
toggleMode(MODE.WORD);
|
||||
}
|
||||
|
||||
// skip only on ngrams of more than one word
|
||||
if (ngramValue > 1) {
|
||||
skipValueCB.setDisable(false);
|
||||
} else {
|
||||
skipValueCB.getSelectionModel().select(0);
|
||||
skipValue = 0;
|
||||
skipValueCB.setDisable(true);
|
||||
}
|
||||
|
||||
logger.info("ngramValueCB:", ngramValue);
|
||||
});
|
||||
|
||||
// set first n-gram value to 2 at index 0
|
||||
ngramValueCB.getSelectionModel().select(0); // selected index
|
||||
ngramValue = 2; // actual value at that index
|
||||
|
||||
// calculateForCB
|
||||
calculateForCB.valueProperty().addListener((observable, oldValue, newValue) -> {
|
||||
calculateFor = CalculateFor.factory(newValue);
|
||||
logger.info("calculateForCB:", calculateFor.toString());
|
||||
});
|
||||
|
||||
calculateForCB.getSelectionModel().select(0);
|
||||
|
||||
// msd
|
||||
msdTF.focusedProperty().addListener((observable, oldValue, newValue) -> {
|
||||
if (!newValue) {
|
||||
// focus lost
|
||||
String value = msdTF.getText();
|
||||
logger.info("msdTf: ", value);
|
||||
|
||||
if (!ValidationUtil.isEmpty(value)) {
|
||||
ArrayList<String> msdTmp = new ArrayList<>(Arrays.asList(value.split(" ")));
|
||||
|
||||
int nOfRequiredMsdTokens = ngramValue == 0 ? 1 : ngramValue;
|
||||
if (msdTmp.size() != nOfRequiredMsdTokens) {
|
||||
String msg = String.format(Messages.WARNING_MISMATCHED_NGRAM_AND_TOKENS_VALUES, nOfRequiredMsdTokens, msdTmp.size());
|
||||
logAlert(msg);
|
||||
showAlert(Alert.AlertType.ERROR, msg);
|
||||
}
|
||||
msd = new ArrayList<>();
|
||||
msdStrings = new ArrayList<>();
|
||||
for (String msdToken : msdTmp) {
|
||||
msd.add(Pattern.compile(msdToken));
|
||||
msdStrings.add(msdToken);
|
||||
}
|
||||
logger.info(String.format("msd accepted (%d)", msd.size()));
|
||||
|
||||
} else if (!ValidationUtil.isEmpty(newValue)) {
|
||||
msd = new ArrayList<>();
|
||||
msdStrings = new ArrayList<>();
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
msdTF.setText("");
|
||||
msd = new ArrayList<>();
|
||||
|
||||
// taxonomy
|
||||
if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
|
||||
taxonomyCCB.getItems().removeAll();
|
||||
taxonomyCCB.getItems().setAll(corpus.getTaxonomy());
|
||||
taxonomyCCB.getCheckModel().getCheckedItems().addListener((ListChangeListener<String>) c -> {
|
||||
taxonomy = new ArrayList<>();
|
||||
ObservableList<String> checkedItems = taxonomyCCB.getCheckModel().getCheckedItems();
|
||||
taxonomy.addAll(checkedItems);
|
||||
logger.info(String.format("Selected taxonomy: %s", StringUtils.join(checkedItems, ",")));
|
||||
});
|
||||
taxonomyCCB.getCheckModel().clearChecks();
|
||||
} else {
|
||||
taxonomyCCB.setDisable(true);
|
||||
}
|
||||
|
||||
// skip
|
||||
skipValueCB.valueProperty().addListener((observable, oldValue, newValue) -> {
|
||||
skipValue = Integer.valueOf(newValue);
|
||||
logger.info("Skip " + skipValue);
|
||||
});
|
||||
|
||||
skipValueCB.getSelectionModel().select(0);
|
||||
skipValue = 0;
|
||||
|
||||
// cvv
|
||||
calculatecvvCB.selectedProperty().addListener((observable, oldValue, newValue) -> {
|
||||
calculateCvv = newValue;
|
||||
logger.info("calculate cvv: " + calculateCvv);
|
||||
});
|
||||
|
||||
calculatecvvCB.setSelected(false);
|
||||
|
||||
// string length
|
||||
stringLengthTF.focusedProperty().addListener((observable, oldValue, newValue) -> {
|
||||
if (!newValue) {
|
||||
// focus lost
|
||||
String value = stringLengthTF.getText();
|
||||
if (!ValidationUtil.isEmpty(value)) {
|
||||
if (!ValidationUtil.isNumber(value)) {
|
||||
logAlert("stringlengthTf: " + WARNING_ONLY_NUMBERS_ALLOWED);
|
||||
GUIController.showAlert(Alert.AlertType.ERROR, WARNING_ONLY_NUMBERS_ALLOWED);
|
||||
}
|
||||
stringLength = Integer.parseInt(value);
|
||||
} else {
|
||||
GUIController.showAlert(Alert.AlertType.ERROR, WARNING_MISSING_STRING_LENGTH);
|
||||
stringLengthTF.setText("1");
|
||||
logAlert(WARNING_MISSING_STRING_LENGTH);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
computeNgramsB.setOnAction(e -> {
|
||||
compute();
|
||||
logger.info("compute button");
|
||||
});
|
||||
|
||||
helpH.setOnAction(e -> openHelpWebsite());
|
||||
}
|
||||
|
||||
/**
|
||||
* case a: values for combo boxes can change after a corpus change
|
||||
* <ul>
|
||||
* <li>different corpus type - reset all fields so no old values remain</li>
|
||||
* <li>same corpus type, different subset - keep</li>
|
||||
* </ul>
|
||||
* <p>
|
||||
* case b: values for combo boxes can change after a header scan
|
||||
* <ul>
|
||||
* <li>at first, fields are populated by corpus type defaults</li>
|
||||
* <li>after, with gathered data</li>
|
||||
* </ul>
|
||||
* <p></p>
|
||||
* ngrams: 1
|
||||
* calculateFor: word
|
||||
* msd:
|
||||
* taxonomy:
|
||||
* skip: 0
|
||||
* iscvv: false
|
||||
* string length: 1
|
||||
*/
|
||||
public void populateFields() {
|
||||
// corpus changed if: current one is null (this is first run of the app)
|
||||
// or if currentCorpus != gui's corpus
|
||||
boolean corpusChanged = currentCorpusType == null
|
||||
|| currentCorpusType != corpus.getCorpusType();
|
||||
|
||||
// keep ngram value if set
|
||||
if (ngramValue == null) {
|
||||
ngramValueCB.getSelectionModel().select("1");
|
||||
ngramValue = 1;
|
||||
}
|
||||
|
||||
// TODO: check for GOS, GIGAFIDA, SOLAR...
|
||||
// refresh and:
|
||||
// TODO if current value != null && is in new calculateFor ? keep : otherwise reset
|
||||
if (calculateFor == null) {
|
||||
calculateForCB.getSelectionModel().select(calculateForCB.getItems().get(0));
|
||||
calculateFor = CalculateFor.factory(calculateForCB.getItems().get(0));
|
||||
}
|
||||
|
||||
if (!filter.hasMsd()) {
|
||||
// if current corpus doesn't have msd data, disable this field
|
||||
msd = new ArrayList<>();
|
||||
msdTF.setText("");
|
||||
msdTF.setDisable(true);
|
||||
logger.info("no msd data");
|
||||
} else {
|
||||
if (ValidationUtil.isEmpty(msd)
|
||||
|| (!ValidationUtil.isEmpty(msd) && corpusChanged)) {
|
||||
// msd has not been set previously
|
||||
// or msd has been set but the corpus changed -> reset
|
||||
msd = new ArrayList<>();
|
||||
msdTF.setText("");
|
||||
msdTF.setDisable(false);
|
||||
logger.info("msd reset");
|
||||
} else if (!ValidationUtil.isEmpty(msd) && !corpusChanged) {
|
||||
// if msd has been set, but corpus type remained the same, we can keep any set msd value
|
||||
msdTF.setText(StringUtils.join(msdStrings, " "));
|
||||
msdTF.setDisable(false);
|
||||
logger.info("msd kept");
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: taxonomy: refresh and keep if in new taxonomy, otherwise empty (no selection)
|
||||
|
||||
// keep skip value
|
||||
if (skipValue == null) {
|
||||
skipValueCB.getSelectionModel().select("0");
|
||||
skipValue = 0;
|
||||
}
|
||||
|
||||
// keep calculateCvv
|
||||
calculatecvvCB.setSelected(calculateCvv);
|
||||
|
||||
// keep string length if set
|
||||
if (stringLength != null) {
|
||||
stringLengthTF.setText(String.valueOf(stringLength));
|
||||
} else {
|
||||
stringLengthTF.setText("1");
|
||||
stringLength = 1;
|
||||
}
|
||||
|
||||
// TODO: trigger on rescan
|
||||
if ((currentCorpusType != null && currentCorpusType != corpus.getCorpusType())) {
|
||||
// user changed corpus (by type) or by selection & triggered a rescan of headers
|
||||
// see if we read taxonomy from headers, otherwise use default values for given corpus
|
||||
ObservableList<String> tax = corpus.getTaxonomy();
|
||||
taxonomyCCBValues = tax != null ? tax : Taxonomy.getDefaultForComboBox(corpus.getCorpusType());
|
||||
|
||||
currentCorpusType = corpus.getCorpusType();
|
||||
// setTaxonomyIsDirty(false);
|
||||
} else {
|
||||
|
||||
}
|
||||
|
||||
// see if we read taxonomy from headers, otherwise use default values for given corpus
|
||||
ObservableList<String> tax = corpus.getTaxonomy();
|
||||
taxonomyCCBValues = tax != null ? tax : Taxonomy.getDefaultForComboBox(corpus.getCorpusType());
|
||||
taxonomyCCB.getItems().addAll(taxonomyCCBValues);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Toggles visibility for panes which hold fields for skipgram value (not applicable when calculating for letters) etc.,
|
||||
* sets combobox values to what is applicable ...
|
||||
*
|
||||
* @param mode
|
||||
*/
|
||||
public void toggleMode(MODE mode) {
|
||||
if (mode == null) {
|
||||
mode = currentMode;
|
||||
}
|
||||
|
||||
logger.info("mode: ", mode.toString());
|
||||
|
||||
if (mode == MODE.WORD) {
|
||||
paneWords.setVisible(true);
|
||||
paneLetters.setVisible(false);
|
||||
calculateForCB.getItems().setAll(N_GRAM_COMPUTE_FOR_WORDS);
|
||||
} else if (mode == MODE.LETTER) {
|
||||
paneWords.setVisible(false);
|
||||
paneLetters.setVisible(true);
|
||||
calculateForCB.getItems().setAll(N_GRAM_COMPUTE_FOR_LETTERS);
|
||||
|
||||
// populate with default cvv length value
|
||||
if (stringLength == null) {
|
||||
stringLengthTF.setText("1");
|
||||
stringLength = 1;
|
||||
} else {
|
||||
stringLengthTF.setText(String.valueOf(stringLength));
|
||||
}
|
||||
|
||||
// if calculateFor was selected for something other than a word or a lemma -> reset
|
||||
if (!(calculateFor == CalculateFor.WORD || calculateFor == CalculateFor.LEMMA)) {
|
||||
// if the user selected something else before selecting ngram for letters, reset that choice
|
||||
calculateFor = CalculateFor.WORD;
|
||||
calculateForCB.getSelectionModel().select("različnica");
|
||||
}
|
||||
}
|
||||
|
||||
// override if orth mode, allow only word
|
||||
if (corpus.isGosOrthMode()) {
|
||||
calculateForCB.getItems().setAll(N_GRAM_COMPUTE_FOR_WORDS_ORTH);
|
||||
msdTF.setDisable(true);
|
||||
} else {
|
||||
msdTF.setDisable(false);
|
||||
}
|
||||
}
|
||||
|
||||
private void compute() {
|
||||
Filter filter = new Filter();
|
||||
filter.setNgramValue(ngramValue);
|
||||
filter.setCalculateFor(calculateFor);
|
||||
filter.setMsd(msd);
|
||||
filter.setTaxonomy(Tax.getTaxonomyCodes(taxonomy, corpus.getCorpusType()));
|
||||
filter.setAl(AnalysisLevel.STRING_LEVEL);
|
||||
filter.setSkipValue(skipValue);
|
||||
filter.setIsCvv(calculateCvv);
|
||||
filter.setSolarFilters(solarFiltersMap);
|
||||
|
||||
if (ngramValue != null && ngramValue == 0) {
|
||||
filter.setStringLength(stringLength);
|
||||
}
|
||||
|
||||
String message = Validation.validateForStringLevel(filter);
|
||||
if (message == null) {
|
||||
// no errors
|
||||
logger.info("Executing: ", filter.toString());
|
||||
StatisticsNew statistic = new StatisticsNew(corpus, filter, useDb);
|
||||
execute(statistic);
|
||||
} else {
|
||||
logAlert(message);
|
||||
showAlert(Alert.AlertType.ERROR, "Prosim izpolnite polja:", message);
|
||||
}
|
||||
}
|
||||
|
||||
private void logAlert(String alert) {
|
||||
logger.info("alert: " + alert);
|
||||
}
|
||||
|
||||
private void openHelpWebsite(){
|
||||
hostService.showDocument(Messages.HELP_URL);
|
||||
}
|
||||
|
||||
public Corpus getCorpus() {
|
||||
return corpus;
|
||||
}
|
||||
|
||||
public void setCorpus(Corpus corpus) {
|
||||
this.corpus = corpus;
|
||||
|
||||
if (corpus.getCorpusType() != CorpusType.SOLAR) {
|
||||
setSelectedFiltersLabel(null);
|
||||
} else {
|
||||
setSelectedFiltersLabel("/");
|
||||
}
|
||||
}
|
||||
|
||||
public void setSelectedFiltersLabel(String content) {
|
||||
if (content != null) {
|
||||
solarFilters.setVisible(true);
|
||||
selectedFiltersLabel.setVisible(true);
|
||||
selectedFiltersLabel.setText(content);
|
||||
} else {
|
||||
solarFilters.setVisible(false);
|
||||
selectedFiltersLabel.setVisible(false);
|
||||
}
|
||||
}
|
||||
|
||||
private void execute(StatisticsNew statistic) {
|
||||
logger.info("Started execution: ", statistic.getFilter());
|
||||
|
||||
Collection<File> corpusFiles = statistic.getCorpus().getDetectedCorpusFiles();
|
||||
boolean corpusIsSplit = corpusFiles.size() > 1;
|
||||
|
||||
final Task<Void> task = new Task<Void>() {
|
||||
@SuppressWarnings("Duplicates")
|
||||
@Override
|
||||
protected Void call() throws Exception {
|
||||
long i = 0;
|
||||
for (File f : corpusFiles) {
|
||||
readXML(f.toString(), statistic);
|
||||
i++;
|
||||
this.updateProgress(i, corpusFiles.size());
|
||||
this.updateMessage(String.format(ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y, i, corpusFiles.size(), f.getName()));
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
};
|
||||
|
||||
ngramProgressBar.progressProperty().bind(task.progressProperty());
|
||||
progressLabel.textProperty().bind(task.messageProperty());
|
||||
|
||||
task.setOnSucceeded(e -> {
|
||||
try {
|
||||
boolean successullySaved = statistic.saveResultToDisk();
|
||||
if (successullySaved) {
|
||||
showAlert(Alert.AlertType.INFORMATION, Messages.NOTIFICATION_ANALYSIS_COMPLETED);
|
||||
} else {
|
||||
showAlert(Alert.AlertType.INFORMATION, Messages.NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS);
|
||||
}
|
||||
} catch (UnsupportedEncodingException e1) {
|
||||
showAlert(Alert.AlertType.ERROR, ERROR_WHILE_SAVING_RESULTS_TO_CSV);
|
||||
logger.error("Error while saving", e1);
|
||||
}
|
||||
|
||||
ngramProgressBar.progressProperty().unbind();
|
||||
ngramProgressBar.setStyle(Settings.FX_ACCENT_OK);
|
||||
progressLabel.textProperty().unbind();
|
||||
progressLabel.setText("");
|
||||
});
|
||||
|
||||
task.setOnFailed(e -> {
|
||||
showAlert(Alert.AlertType.ERROR, ERROR_WHILE_EXECUTING);
|
||||
logger.error("Error while executing", e);
|
||||
ngramProgressBar.progressProperty().unbind();
|
||||
ngramProgressBar.setProgress(0.0);
|
||||
ngramProgressBar.setStyle(Settings.FX_ACCENT_NOK);
|
||||
progressLabel.textProperty().unbind();
|
||||
progressLabel.setText("");
|
||||
});
|
||||
|
||||
final Thread thread = new Thread(task, "task");
|
||||
thread.setDaemon(true);
|
||||
thread.start();
|
||||
}
|
||||
|
||||
public void setSolarFiltersMap(HashMap<String, HashSet<String>> solarFiltersMap) {
|
||||
this.solarFiltersMap = solarFiltersMap;
|
||||
}
|
||||
public void setHostServices(HostServices hostServices){
|
||||
this.hostService = hostServices;
|
||||
}
|
||||
}
|
77
src/main/java/gui/ValidationUtil.java
Normal file
77
src/main/java/gui/ValidationUtil.java
Normal file
@ -0,0 +1,77 @@
|
||||
package gui;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.lang3.math.NumberUtils;
|
||||
|
||||
public class ValidationUtil {
|
||||
|
||||
public static boolean isNumber(String value) {
|
||||
return NumberUtils.isCreatable(value);
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if an object is empty or null. Null part is especially important,
|
||||
* since Java's built-in isEmpty() methods don't check for this condition
|
||||
* and throw a nullPointerException as a result.
|
||||
* <p>
|
||||
* Supported structures:
|
||||
* <ul>
|
||||
* <li>String: empty if null or length is zero</li>
|
||||
* <li>List: empty if null or size() == 0</li>
|
||||
* <li>Map: empty if null or if it contains no keys, or if all keys map to an empty value </li>
|
||||
* </ul>
|
||||
*/
|
||||
public static boolean isEmpty(Object o) {
|
||||
if (o == null) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (o instanceof String) {
|
||||
if (((String) o).length() == 0) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
if (o instanceof List) {
|
||||
if (((List) o).isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
if (o instanceof Map) {
|
||||
if (((Map) o).keySet().isEmpty()) {
|
||||
return true;
|
||||
} else {
|
||||
for (Object val : ((Map) o).values()) {
|
||||
if (!isEmpty(val)) {
|
||||
// if map contains any value that isn't empty, the map isn't considered empty
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
public static boolean isNotEmpty(Object o) {
|
||||
return !isEmpty(o);
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks whether a given File is a folder for which we have appropriate permission
|
||||
*/
|
||||
public static boolean isValidDirectory(File f) {
|
||||
return f.isDirectory() && f.canRead() && f.canWrite();
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks whether a given File is a folder for which we have appropriate permission
|
||||
*/
|
||||
public static boolean isReadableDirectory(File f) {
|
||||
return f.isDirectory() && f.canRead();
|
||||
}
|
||||
}
|
208
src/main/java/gui/WordFormationTab.java
Normal file
208
src/main/java/gui/WordFormationTab.java
Normal file
@ -0,0 +1,208 @@
|
||||
package gui;
|
||||
|
||||
import static alg.XML_processing.*;
|
||||
import static gui.GUIController.*;
|
||||
import static gui.Messages.*;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
|
||||
import javafx.application.HostServices;
|
||||
import javafx.scene.control.*;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
import org.controlsfx.control.CheckComboBox;
|
||||
|
||||
import data.*;
|
||||
import javafx.collections.ListChangeListener;
|
||||
import javafx.collections.ObservableList;
|
||||
import javafx.concurrent.Task;
|
||||
import javafx.fxml.FXML;
|
||||
import javafx.scene.layout.AnchorPane;
|
||||
|
||||
@SuppressWarnings("Duplicates")
|
||||
public class WordFormationTab {
|
||||
public final static Logger logger = LogManager.getLogger(WordFormationTab.class);
|
||||
|
||||
public AnchorPane wordAnalysisTabPane;
|
||||
|
||||
@FXML
|
||||
public Label selectedFiltersLabel;
|
||||
@FXML
|
||||
public Label solarFilters;
|
||||
|
||||
@FXML
|
||||
private CheckComboBox<String> taxonomyCCB;
|
||||
private ArrayList<String> taxonomy;
|
||||
|
||||
@FXML
|
||||
private Button computeB;
|
||||
|
||||
@FXML
|
||||
public ProgressBar ngramProgressBar;
|
||||
@FXML
|
||||
public Label progressLabel;
|
||||
|
||||
@FXML
|
||||
private Hyperlink helpH;
|
||||
|
||||
private Corpus corpus;
|
||||
private HashMap<String, HashSet<String>> solarFiltersMap;
|
||||
private HostServices hostService;
|
||||
|
||||
// after header scan
|
||||
private ObservableList<String> taxonomyCCBValues;
|
||||
private CorpusType currentCorpusType;
|
||||
private boolean useDb;
|
||||
|
||||
|
||||
public void init() {
|
||||
// taxonomy
|
||||
if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
|
||||
taxonomyCCB.getItems().removeAll();
|
||||
taxonomyCCB.getItems().setAll(corpus.getTaxonomy());
|
||||
taxonomyCCB.getCheckModel().getCheckedItems().addListener((ListChangeListener<String>) c -> {
|
||||
taxonomy = new ArrayList<>();
|
||||
ObservableList<String> checkedItems = taxonomyCCB.getCheckModel().getCheckedItems();
|
||||
taxonomy.addAll(checkedItems);
|
||||
logger.info(String.format("Selected taxonomy: %s", StringUtils.join(checkedItems, ",")));
|
||||
});
|
||||
taxonomyCCB.getCheckModel().clearChecks();
|
||||
} else {
|
||||
taxonomyCCB.setDisable(true);
|
||||
}
|
||||
|
||||
computeB.setOnAction(e -> {
|
||||
compute();
|
||||
logger.info("compute button");
|
||||
});
|
||||
|
||||
helpH.setOnAction(e -> openHelpWebsite());
|
||||
}
|
||||
|
||||
private void compute() {
|
||||
Filter filter = new Filter();
|
||||
filter.setNgramValue(1);
|
||||
filter.setCalculateFor(CalculateFor.MORPHOSYNTACTIC_PROPERTY);
|
||||
filter.setTaxonomy(Tax.getTaxonomyCodes(taxonomy, corpus.getCorpusType()));
|
||||
filter.setAl(AnalysisLevel.STRING_LEVEL);
|
||||
filter.setSkipValue(0);
|
||||
filter.setMsd(new ArrayList<>());
|
||||
filter.setIsCvv(false);
|
||||
filter.setSolarFilters(solarFiltersMap);
|
||||
|
||||
String message = Validation.validateForStringLevel(filter);
|
||||
if (message == null) {
|
||||
// no errors
|
||||
logger.info("Executing: ", filter.toString());
|
||||
StatisticsNew statistic = new StatisticsNew(corpus, filter, useDb);
|
||||
execute(statistic);
|
||||
} else {
|
||||
logAlert(message);
|
||||
showAlert(Alert.AlertType.ERROR, "Prosim izpolnite polja:", message);
|
||||
}
|
||||
}
|
||||
|
||||
private void openHelpWebsite(){
|
||||
hostService.showDocument(Messages.HELP_URL);
|
||||
}
|
||||
|
||||
private void execute(StatisticsNew statistic) {
|
||||
logger.info("Started execution: ", statistic.getFilter());
|
||||
|
||||
Collection<File> corpusFiles = statistic.getCorpus().getDetectedCorpusFiles();
|
||||
|
||||
final Task<Void> task = new Task<Void>() {
|
||||
@SuppressWarnings("Duplicates")
|
||||
@Override
|
||||
protected Void call() throws Exception {
|
||||
long i = 0;
|
||||
for (File f : corpusFiles) {
|
||||
readXML(f.toString(), statistic);
|
||||
i++;
|
||||
this.updateProgress(i, corpusFiles.size());
|
||||
this.updateMessage(String.format(ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y, i, corpusFiles.size(), f.getName()));
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
};
|
||||
|
||||
ngramProgressBar.progressProperty().bind(task.progressProperty());
|
||||
progressLabel.textProperty().bind(task.messageProperty());
|
||||
|
||||
task.setOnSucceeded(e -> {
|
||||
try {
|
||||
// first, we have to recalculate all occurrences to detailed statistics
|
||||
boolean successullySaved = statistic.recalculateAndSaveResultToDisk();
|
||||
|
||||
if (successullySaved) {
|
||||
showAlert(Alert.AlertType.INFORMATION, Messages.NOTIFICATION_ANALYSIS_COMPLETED);
|
||||
} else {
|
||||
showAlert(Alert.AlertType.INFORMATION, Messages.NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS);
|
||||
}
|
||||
} catch (UnsupportedEncodingException e1) {
|
||||
showAlert(Alert.AlertType.ERROR, ERROR_WHILE_SAVING_RESULTS_TO_CSV);
|
||||
logger.error("Error while saving", e1);
|
||||
}
|
||||
|
||||