
Project copied

commit a18e52a599 to master, by Luka, 3 years ago
  1. .gitignore (+160)
  2. Corpus Analyzer.iml (+28)
  3. pom.xml (+122)
  4. src/main/java/META-INF/MANIFEST.MF (+3)
  5. src/main/java/alg/Common.java (+15)
  6. src/main/java/alg/XML_processing.java (+794)
  7. src/main/java/alg/inflectedJOS/ForkJoin.java (+67)
  8. src/main/java/alg/inflectedJOS/InflectedJOSCount.java (+170)
  9. src/main/java/alg/inflectedJOS/WordFormation.java (+131)
  10. src/main/java/alg/ngram/ForkJoin.java (+62)
  11. src/main/java/alg/ngram/Ngrams.java (+204)
  12. src/main/java/alg/word/ForkJoin.java (+62)
  13. src/main/java/alg/word/WordCount.java (+167)
  14. src/main/java/alg/word/WordLevel.java (+112)
  15. src/main/java/data/AnalysisLevel.java (+17)
  16. src/main/java/data/CalculateFor.java (+43)
  17. src/main/java/data/Corpus.java (+163)
  18. src/main/java/data/CorpusType.java (+25)
  19. src/main/java/data/Enums/InflectedJosTypes.java (+12)
  20. src/main/java/data/Enums/Msd.java (+68)
  21. src/main/java/data/Enums/WordLevelDefaultValues.java (+55)
  22. src/main/java/data/Enums/WordLevelType.java (+16)
  23. src/main/java/data/Enums/solar/SolarFilters.java (+57)
  24. src/main/java/data/Filter.java (+144)
  25. src/main/java/data/GigafidaJosWordType.java (+71)
  26. src/main/java/data/GigafidaTaxonomy.java (+76)
  27. src/main/java/data/GosTaxonomy.java (+85)
  28. src/main/java/data/Sentence.java (+56)
  29. src/main/java/data/Settings.java (+16)
  30. src/main/java/data/Statistics.java (+299)
  31. src/main/java/data/StatisticsNew.java (+409)
  32. src/main/java/data/Tax.java (+175)
  33. src/main/java/data/Taxonomy.java (+171)
  34. src/main/java/data/Validation.java (+53)
  35. src/main/java/data/Word.java (+141)
  36. src/main/java/gui/CharacterAnalysisTab.java (+454)
  37. src/main/java/gui/CorpusTab.java (+517)
  38. src/main/java/gui/FiltersForSolar.java (+187)
  39. src/main/java/gui/GUIController.java (+150)
  40. src/main/java/gui/Messages.java (+74)
  41. src/main/java/gui/OneWordAnalysisTab.java (+389)
  42. src/main/java/gui/SelectedFiltersPane.java (+18)
  43. src/main/java/gui/StringAnalysisTabNew2.java (+511)
  44. src/main/java/gui/ValidationUtil.java (+77)
  45. src/main/java/gui/WordFormationTab.java (+208)
  46. src/main/java/gui/WordLevelTab.java (+207)
  47. src/main/java/manifest/META-INF/MANIFEST.MF (+3)
  48. src/main/java/util/ByteUtils.java (+25)
  49. src/main/java/util/Combinations.java (+46)
  50. src/main/java/util/Export.java (+267)
  51. src/main/java/util/Key.java (+31)
  52. src/main/java/util/TimeWatch.java (+63)
  53. src/main/java/util/Util.java (+225)
  54. src/main/java/util/db/RDB.java (+132)
  55. src/main/resources/GOS_small/TEI_GOS_small.xml (+68720)
  56. src/main/resources/GOS_tax_test/GOS_tax_test.xml (+524)
  57. src/main/resources/GUI.fxml (+133)
  58. src/main/resources/Gigafida_minimal/gfmin.xml (+237)
  59. src/main/resources/Gigafida_subset/Besedni_nizi_Gigafida_lema_0-gram_0-skip_14.05.2018_06.34.13.csv (+70)
  60. src/main/resources/Gigafida_subset/Besedni_nizi_Gigafida_lema_0-gram_0-skip_14.05.2018_06.37.50.csv (+390)
  61. src/main/resources/Gigafida_subset/Besedni_nizi_Gigafida_lema_0-gram_0-skip_14.05.2018_06.38.17.csv (+1147)
  62. src/main/resources/Gigafida_subset/Besedni_nizi_Gigafida_lema_1-gram_0-skip_31.01.2018_05.11.26.csv (+455)
  63. src/main/resources/Gigafida_subset/Besedni_nizi_Gigafida_lema_2-gram_1-skip_31.01.2018_05.11.33.csv (+1160)
  64. src/main/resources/Gigafida_subset/Besedni_nizi_Gigafida_različnica_1-gram_0-skip_25.01.2018_06.27.41.csv (+512)
  65. src/main/resources/Gigafida_subset/Besedni_nizi_Gigafida_različnica_2-gram_0-skip_20.01.2018_01.27.csv (+623)
  66. src/main/resources/Gigafida_subset/Besedni_nizi_Gigafida_različnica_3-gram_0-skip_20.01.2018_01.27.csv (+572)
  67. src/main/resources/Gigafida_subset/F0012405.xml (+350)
  68. src/main/resources/Gigafida_subset/F0016316.xml (+367)
  69. src/main/resources/Gigafida_subset/F0018194.xml (+336)
  70. src/main/resources/Gigafida_subset/F0026709.xml (+367)
  71. src/main/resources/Gigafida_subset/F0030361.xml (+365)
  72. src/main/resources/Gigafida_subset/nested/F0036980.xml (+356)
  73. src/main/resources/Gigafida_subset/nested/F0037258.xml (+408)
  74. src/main/resources/Gigafida_subset/nested/F0037544.xml (+391)
  75. src/main/resources/Gigafida_subset/nested/F0038754.xml (+355)
  76. src/main/resources/Gigafida_subset/nested/F0038920.xml (+402)
  77. src/main/resources/Lists/prefixes.txt (+18)
  78. src/main/resources/Lists/suffixes.txt (+7)
  79. src/main/resources/gui/CharacterAnalysisTab.fxml (+54)
  80. src/main/resources/gui/CorpusTab.fxml (+32)
  81. src/main/resources/gui/FiltersForSolar.fxml (+30)
  82. src/main/resources/gui/OneWordAnalysisTab.fxml (+56)
  83. src/main/resources/gui/SelectedFiltersPane.fxml (+13)
  84. src/main/resources/gui/StringAnalysisTabNew2.fxml (+105)
  85. src/main/resources/gui/WordFormationTab.fxml (+25)
  86. src/main/resources/gui/WordLevelTab.fxml (+25)
  87. src/main/resources/log4j2.xml (+22)
  88. src/test/java/Common.java (+85)
  89. src/test/java/CorpusTests.java (+42)
  90. src/test/java/DBTest.java (+66)
  91. src/test/java/NgramTests.java (+334)
  92. src/test/java/WordFormationTest.java (+51)
  93. src/test/java/WordLevelTest.java (+15)
  94. src/test/java/WordTest.java (+39)

.gitignore (+160)

@@ -0,0 +1,160 @@
# Created by .ignore support plugin (hsz.mobi)
### Maven template
target/
pom.xml.tag
pom.xml.releaseBackup
pom.xml.versionsBackup
pom.xml.next
release.properties
dependency-reduced-pom.xml
buildNumber.properties
.mvn/timing.properties
# Avoid ignoring Maven wrapper jar file (.jar files are usually ignored)
!/.mvn/wrapper/maven-wrapper.jar
### JetBrains template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
# User-specific stuff:
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/dictionaries
.idea/
# Sensitive or high-churn files:
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.xml
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
# Gradle:
.idea/**/gradle.xml
.idea/**/libraries
# Mongo Explorer plugin:
.idea/**/mongoSettings.xml
## File-based project format:
*.iws
## Plugin-specific files:
# IntelliJ
/out/
# mpeltonen/sbt-idea plugin
.idea_modules/
# JIRA plugin
atlassian-ide-plugin.xml
# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties
### Java template
# Compiled class file
# Log file
*.log
# BlueJ files
*.ctxt
# Mobile Tools for Java (J2ME)
.mtj.tmp/
# Package Files #
*.war
*.ear
*.zip
*.tar.gz
*.rar
# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
hs_err_pid*
### Eclipse template
.metadata
bin/
tmp/
*.tmp
*.bak
*.swp
*~.nib
local.properties
.settings/
.loadpath
.recommenders
# Eclipse Core
.project
# External tool builders
.externalToolBuilders/
# Locally stored "Eclipse launch configurations"
*.launch
# PyDev specific (Python IDE for Eclipse)
*.pydevproject
# CDT-specific (C/C++ Development Tooling)
.cproject
# JDT-specific (Eclipse Java Development Tools)
.classpath
# Java annotation processor (APT)
.factorypath
# PDT-specific (PHP Development Tools)
.buildpath
# sbteclipse plugin
.target
# Tern plugin
.tern-project
# TeXlipse plugin
.texlipse
# STS (Spring Tool Suite)
.springBeans
# Code Recommenders
.recommenders/
# Scala IDE specific (Scala & Java development for Eclipse)
.cache-main
.scala_dependencies
.worksheet
### Windows ###
# Windows thumbnail cache files
Thumbs.db
ehthumbs.db
ehthumbs_vista.db
# Folder config file
Desktop.ini
# Recycle Bin used on file shares
$RECYCLE.BIN/
# Windows Installer files
*.cab
*.msi
*.msm
*.msp
# Windows shortcuts
*.lnk

Corpus Analyzer.iml (+28)

@@ -0,0 +1,28 @@
<?xml version="1.0" encoding="UTF-8"?>
<module org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule="true" type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_8">
<output url="file://$MODULE_DIR$/target/classes" />
<output-test url="file://$MODULE_DIR$/target/test-classes" />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/main/resources" type="java-resource" />
<sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" />
<excludeFolder url="file://$MODULE_DIR$/target" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" name="Maven: commons-io:commons-io:2.5" level="project" />
<orderEntry type="library" name="Maven: org.apache.commons:commons-lang3:3.6" level="project" />
<orderEntry type="library" name="Maven: com.googlecode.json-simple:json-simple:1.1.1" level="project" />
<orderEntry type="library" name="Maven: junit:junit:4.10" level="project" />
<orderEntry type="library" name="Maven: org.hamcrest:hamcrest-core:1.1" level="project" />
<orderEntry type="library" name="Maven: org.apache.commons:commons-csv:1.4" level="project" />
<orderEntry type="library" name="Maven: org.controlsfx:controlsfx:8.40.13" level="project" />
<orderEntry type="library" name="Maven: org.rocksdb:rocksdbjni:5.7.3" level="project" />
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-api:2.9.0" level="project" />
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-core:2.9.0" level="project" />
<orderEntry type="library" name="Maven: org.kordamp.ikonli:ikonli-fontawesome-pack:1.9.0" level="project" />
<orderEntry type="library" name="Maven: org.kordamp.ikonli:ikonli-core:1.9.0" level="project" />
<orderEntry type="library" name="Maven: org.kordamp.ikonli:ikonli-javafx:1.9.0" level="project" />
</component>
</module>

pom.xml (+122)

@@ -0,0 +1,122 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>thesis</groupId>
<artifactId>corpus-analyzer</artifactId>
<version>1.2</version>
<dependencies>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.5</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.6</version>
</dependency>
<dependency>
<groupId>com.googlecode.json-simple</groupId>
<artifactId>json-simple</artifactId>
<version>1.1.1</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-csv</artifactId>
<version>1.4</version>
</dependency>
<dependency>
<groupId>org.controlsfx</groupId>
<artifactId>controlsfx</artifactId>
<version>8.40.13</version>
</dependency>
<dependency>
<groupId>org.rocksdb</groupId>
<artifactId>rocksdbjni</artifactId>
<version>5.7.3</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-api</artifactId>
<version>2.9.0</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>2.9.0</version>
</dependency>
<dependency>
<groupId>org.kordamp.ikonli</groupId>
<artifactId>ikonli-fontawesome-pack</artifactId>
<version>1.9.0</version>
</dependency>
<dependency>
<groupId>org.kordamp.ikonli</groupId>
<artifactId>ikonli-javafx</artifactId>
<version>1.9.0</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<!-- packages dependencies into the jar -->
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
<configuration>
<archive>
<manifest>
<mainClass>gui.GUIController</mainClass>
</manifest>
</archive>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
<appendAssemblyId>false</appendAssemblyId>
<outputDirectory>artifact</outputDirectory>
<finalName>Corpus_Analyzer_${version}</finalName>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<!-- JavaFX -->
<groupId>com.zenjava</groupId>
<artifactId>javafx-maven-plugin</artifactId>
<version>8.6.0</version>
<configuration>
<mainClass>gui.GUIController</mainClass>
<verbose>true</verbose>
</configuration>
<executions>
<execution>
<id>create-jfxjar</id>
<phase>package</phase>
<goals>
<goal>build-jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
</plugins>
</build>
</project>
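Given this configuration, a plain "mvn clean package" should be enough to produce the self-contained build: the assembly plugin is bound to the package phase with the jar-with-dependencies descriptor, so the output jar would be written to artifact/Corpus_Analyzer_1.2.jar (the exact name follows finalName and the project version above), while the javafx-maven-plugin additionally builds a JavaFX jar with gui.GUIController as the main class.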

src/main/java/META-INF/MANIFEST.MF (+3)

@@ -0,0 +1,3 @@
Manifest-Version: 1.0
Main-Class: gui.GUIController

src/main/java/alg/Common.java (+15)

@@ -0,0 +1,15 @@
package alg;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;
public class Common {
public static <K, V> void updateMap(Map<K, AtomicLong> map, K o) {
// if not in map
AtomicLong r = map.putIfAbsent(o, new AtomicLong(1));
// else
if (r != null)
map.get(o).incrementAndGet();
}
}
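A minimal usage sketch for updateMap (the wrapper class name and the token below are made up for illustration): it pairs Map.putIfAbsent with AtomicLong so concurrent fork-join workers can count occurrences without explicit locking, as long as the underlying map is concurrent, e.g. a ConcurrentHashMap.

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;

import alg.Common;

public class CommonUsageSketch {
    public static void main(String[] args) {
        // a concurrent map shared by the fork-join workers
        Map<String, AtomicLong> counts = new ConcurrentHashMap<>();
        Common.updateMap(counts, "biti"); // absent -> stored with an initial count of 1
        Common.updateMap(counts, "biti"); // present -> incremented to 2
        System.out.println(counts.get("biti").get()); // prints 2
    }
}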

src/main/java/alg/XML_processing.java (+794)

@@ -0,0 +1,794 @@
package alg;
import static data.Enums.solar.SolarFilters.*;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.util.*;
import java.util.concurrent.ForkJoinPool;
import javax.xml.namespace.QName;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.*;
import org.apache.logging.log4j.LogManager;
import data.*;
import gui.ValidationUtil;
public class XML_processing {
public final static org.apache.logging.log4j.Logger logger = LogManager.getLogger(XML_processing.class);
// public static void processCorpus(Statistics stats) {
// // we can preset the list's size, so there won't be a need to resize it
// List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT);
//
// int i = 0;
// for (File f : Settings.corpus) {
// i++;
// readXML(f.toString(), stats);
// }
// }
// public static void readXML(String path, Statistics stats) {
// if (stats.getCorpusType() == CorpusType.GIGAFIDA) {
// readXMLGigafida(path, stats);
// } else if (stats.getCorpusType() == CorpusType.GOS) {
// readXMLGos(path, stats);
// } else if (stats.getCorpusType() == CorpusType.SOLAR) {
// readXMLSolar(path, stats);
// }
// }
public static void readXML(String path, StatisticsNew stats) {
if (stats.getCorpus().getCorpusType() == CorpusType.GIGAFIDA
|| stats.getCorpus().getCorpusType() == CorpusType.CCKRES) {
readXMLGigafida(path, stats);
} else if (stats.getCorpus().getCorpusType() == CorpusType.GOS) {
readXMLGos(path, stats);
} else if (stats.getCorpus().getCorpusType() == CorpusType.SOLAR) {
readXMLSolar(path, stats);
}
}
/**
* Reads and returns the value of the given header tag, or an empty string if the tag is not found.
* E.g. the title tag, used for discerning the corpus type.
* Note: only the value of the first occurrence of the given tag name is returned.
*/
public static String readXMLHeaderTag(String path, String tag) {
XMLInputFactory factory = XMLInputFactory.newInstance();
XMLEventReader eventReader = null;
try {
eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
XMLEvent xmlEvent = eventReader.nextEvent();
if (xmlEvent.isStartElement()) {
StartElement startElement = xmlEvent.asStartElement();
String var = startElement.getName().getLocalPart();
if (var.equalsIgnoreCase(tag)) {
return eventReader.nextEvent().asCharacters().getData();
}
}
}
} catch (FileNotFoundException | XMLStreamException e) {
e.printStackTrace();
} finally {
if (eventReader != null) {
try {
eventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
}
}
}
return "";
}
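// Usage sketch (illustrative; the path is a placeholder): the corpus type can be guessed from the
// TEI header by reading the first title value, e.g.
//   String title = readXMLHeaderTag("/path/to/corpus.xml", "title");
// An empty string comes back when the tag is missing or the file cannot be parsed.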
private static void fj(List<Sentence> corpus, StatisticsNew stats) {
ForkJoinPool pool = new ForkJoinPool();
if (stats.getFilter().getAl() == AnalysisLevel.STRING_LEVEL) {
alg.ngram.ForkJoin wc = new alg.ngram.ForkJoin(corpus, stats);
pool.invoke(wc);
} else if (stats.getFilter().getAl() == AnalysisLevel.WORD_LEVEL) {
alg.word.ForkJoin wc = new alg.word.ForkJoin(corpus, stats);
pool.invoke(wc);
} else {
// TODO:
// alg.inflectedJOS.ForkJoin wc = new alg.inflectedJOS.ForkJoin(corpus, stats);
// pool.invoke(wc);
}
}
// public static void readXMLGos(String path, Statistics stats) {
// boolean in_word = false;
// String taksonomija = "";
// String lemma = "";
// String msd = "";
// String type = stats.isGosOrthMode() ? "orth" : "norm"; // orth & norm
//
// List<Word> stavek = new ArrayList<>();
// List<Sentence> corpus = new ArrayList<>();
// String sentenceDelimiter = "seg";
// String taxonomyPrefix = "gos.";
//
// try {
// XMLInputFactory factory = XMLInputFactory.newInstance();
// XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path));
//
// while (eventReader.hasNext()) {
// XMLEvent event = eventReader.nextEvent();
//
// switch (event.getEventType()) {
// case XMLStreamConstants.START_ELEMENT:
//
// StartElement startElement = event.asStartElement();
// String qName = startElement.getName().getLocalPart();
//
// // "word" node
// if (qName.equals("w")) {
// in_word = true;
//
// if (type.equals("norm")) {
// // make sure we're looking at <w lemma...> and not <w type...>
// Iterator var = startElement.getAttributes();
// ArrayList<Object> attributes = new ArrayList<>();
// while (var.hasNext()) {
// attributes.add(var.next());
// }
//
// if (attributes.contains("msd")) {
// msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
// } else {
// msd = null;
// }
//
// if (attributes.contains("lemma")) {
// lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
// }
// }
// }
// // taxonomy node
// else if (qName.equalsIgnoreCase("catRef")) {
// // there are some term nodes at the beginning that are of no interest to us
// // they differ by not having the attribute "ref", so test will equal null
// Attribute test = startElement.getAttributeByName(QName.valueOf("target"));
//
// if (test != null) {
// // keep only taxonomy properties
// taksonomija = String.valueOf(test.getValue()).replace(taxonomyPrefix, "");
// }
// } else if (qName.equalsIgnoreCase("div")) {
// type = String.valueOf(startElement.getAttributeByName(QName.valueOf("type")).getValue());
//
// }
// break;
//
// case XMLStreamConstants.CHARACTERS:
// Characters characters = event.asCharacters();
//
// // "word" node value
// if (in_word) {
// if (type.equals("norm") && msd != null) {
// stavek.add(new Word(characters.getData(), lemma, msd));
// } else {
// stavek.add(new Word(characters.getData()));
// }
//
// in_word = false;
// }
// break;
//
// case XMLStreamConstants.END_ELEMENT:
// EndElement endElement = event.asEndElement();
//
// // parser reached end of the current sentence
// if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
// // add sentence to corpus
// corpus.add(new Sentence(stavek, taksonomija, type));
// // and start a new one
// stavek = new ArrayList<>();
//
// /* Invoke Fork-Join when we reach maximum limit of
// * sentences (because we can't read everything to
// * memory) or we reach the end of the file.
// */
// if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
// fj(corpus, stats);
// // empty the current corpus, since we don't need
// // the data anymore
// corpus.clear();
// }
// }
//
// // backup
// if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
// fj(corpus, stats);
// corpus.clear();
// }
//
// break;
// }
// }
// } catch (FileNotFoundException | XMLStreamException e) {
// e.printStackTrace();
// }
// }
@SuppressWarnings("unused")
public static void readXMLSolar(String path, StatisticsNew stats) {
boolean in_word = false;
String lemma = "";
String msd = "";
List<Word> stavek = new ArrayList<>();
List<Sentence> corpus = new ArrayList<>();
// used for filter
Set<String> headTags = new HashSet<>(Arrays.asList("sola", "predmet", "razred", "regija", "tip", "leto"));
Map<String, String> headBlock = null;
boolean includeThisBlock = false;
try {
XMLInputFactory factory = XMLInputFactory.newInstance();
XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
XMLEvent event = eventReader.nextEvent();
switch (event.getEventType()) {
case XMLStreamConstants.START_ELEMENT:
StartElement startElement = event.asStartElement();
// System.out.println(String.format("%s", startElement.toString()));
String qName = startElement.getName().getLocalPart();
// "word" node
if (qName.equals("w3")) {
in_word = true;
msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
} else if (qName.equals("c3")) {
String c3Content = eventReader.nextEvent().asCharacters().getData();
if (c3Content.equals(".") && includeThisBlock) {
// add sentence to corpus
corpus.add(new Sentence(stavek));
// and start a new one
stavek = new ArrayList<>();
/* Invoke Fork-Join when we reach maximum limit of
* sentences (because we can't read everything to
* memory) or we reach the end of the file.
*/
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
fj(corpus, stats);
// empty the current corpus, since we don't need
// the data anymore
corpus.clear();
}
}
} else if (headTags.contains(qName)) {
String tagContent = eventReader.nextEvent().asCharacters().getData();
headBlock.put(qName, tagContent);
} else if (qName.equals("head")) {
headBlock = new HashMap<>();
}
break;
case XMLStreamConstants.CHARACTERS:
Characters characters = event.asCharacters();
// "word" node value
if (in_word) {
stavek.add(new Word(characters.getData(), lemma, msd));
in_word = false;
}
break;
case XMLStreamConstants.END_ELEMENT:
EndElement endElement = event.asEndElement();
String qNameEnd = endElement.getName().getLocalPart();
if (qNameEnd.equals("head")) {
// validate and set boolean
if (validateHeadBlock(headBlock, stats.getFilter().getSolarFilters())) {
includeThisBlock = true;
}
} else if (qNameEnd.equals("body")) {
// new block, reset filter status
includeThisBlock = false;
}
// backup
if (endElement.getName().getLocalPart().equalsIgnoreCase("korpus")) {
fj(corpus, stats);
corpus.clear();
}
break;
}
}
} catch (FileNotFoundException | XMLStreamException e) {
e.printStackTrace();
}
}
/**
* @param readHeadBlock block of tags read from the corpus
* @param userSetFilter tags with values set by the user
*
* @return true if the head block satisfies every user-set filter (or if no filter is set), false otherwise
*/
private static boolean validateHeadBlock(Map<String, String> readHeadBlock, HashMap<String, HashSet<String>> userSetFilter) {
boolean pass = true;
if (userSetFilter == null) {
return true;
}
for (Map.Entry<String, HashSet<String>> filterEntry : userSetFilter.entrySet()) {
String key = filterEntry.getKey();
HashSet<String> valueObject = filterEntry.getValue();
// if (valueObject instanceof String) {
// pass = validateHeadBlockEntry(readHeadBlock, key, (String) valueObject);
// } else
if (valueObject != null) {
//noinspection unchecked
for (String value : valueObject) {
pass = validateHeadBlockEntry(readHeadBlock, key, value);
}
}
if (!pass) {
// current head block does not include one of the set filters - not likely, but an edge case anyway
return false;
}
}
// if it gets to this point, it passed all the filters
return true;
}
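// Worked example (values are made up): with a user filter of {"razred" -> {"9"}}, a head block
// containing razred = "9" passes, a block with razred = "8" or without the razred tag does not,
// and a null filter accepts every block.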
private static boolean validateHeadBlockEntry(Map<String, String> readHeadBlock, String userSetKey, String userSetValue) {
if (!readHeadBlock.keySet().contains(userSetKey)) {
// current head block does not include one of the set filters - not likely, but an edge case anyway
return false;
} else if (!readHeadBlock.get(userSetKey).equals(userSetValue)) {
// different values -> doesn't pass the filter
return false;
}
return true;
}
/**
* Parses an XML header for information about the corpus taxonomy (if supported) or about its filters (Solar).
*
* @param filepath      path to the XML file whose header should be parsed
* @param corpusIsSplit whether the corpus is split into multiple XML files, or all entries are grouped into one large XML file
* @param corpusType    type of the corpus being read
*
* @return a HashSet of taxonomy terms for taxonomy corpora, or a HashMap of Solar filter values
*/
public static Object readXmlHeaderTaxonomyAndFilters(String filepath, boolean corpusIsSplit, CorpusType corpusType) {
boolean parseTaxonomy = Tax.getCorpusTypesWithTaxonomy().contains(corpusType);
// solar
Set<String> headTags = null;
HashMap<String, HashSet<String>> resultFilters = new HashMap<>();
// taxonomy corpora
HashSet<String> resultTaxonomy = new HashSet<>();
String headTagName;
if (corpusType == CorpusType.SOLAR) {
headTagName = "head";
// used for filter
headTags = new HashSet<>(Arrays.asList(SOLA, PREDMET, RAZRED, REGIJA, TIP, LETO));
// init results now to avoid null pointers
headTags.forEach(f -> resultFilters.put(f, new HashSet<>()));
} else {
headTagName = "teiHeader";
}
XMLInputFactory factory = XMLInputFactory.newInstance();
XMLEventReader xmlEventReader = null;
try {
xmlEventReader = factory.createXMLEventReader(new FileInputStream(filepath));
boolean insideHeader = false;
while (xmlEventReader.hasNext()) {
XMLEvent xmlEvent = xmlEventReader.nextEvent();
if (xmlEvent.isStartElement()) {
StartElement startElement = xmlEvent.asStartElement();
String elementName = startElement.getName().getLocalPart();
if (elementName.equalsIgnoreCase(headTagName)) {
// if the corpus is split into files, we skip bodies
// this toggle is true when we're inside a header (next block of code executes)
// and false when we're not (skip reading unnecessary attributes)
insideHeader = true;
}
if (insideHeader) {
if (parseTaxonomy && elementName.equalsIgnoreCase("catRef")) {
HashMap<String, String> atts = extractAttributes(startElement);
String debug = "";
String tax = startElement.getAttributeByName(QName.valueOf("target"))
.getValue()
.replace("#", "");
resultTaxonomy.add(tax);
} else if (!parseTaxonomy && headTags.contains(elementName)) {
String tagContent = xmlEventReader.nextEvent().asCharacters().getData();
resultFilters.get(elementName).add(tagContent);
}
}
} else if (xmlEvent.isEndElement() && corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
// if the corpus is split into multiple files, each with only one header block per file
// that means we should stop after we reach the end of the header
return parseTaxonomy ? resultTaxonomy : resultFilters;
} else if (xmlEvent.isEndElement() && !corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
// whole corpus in one file, so we have to continue reading in order to find all header blocks
insideHeader = false;
}
}
} catch (XMLStreamException e) {
logger.error("Streaming error", e);
return parseTaxonomy ? resultTaxonomy : resultFilters;
} catch (FileNotFoundException e) {
logger.error("File not found", e);
return parseTaxonomy ? resultTaxonomy : resultFilters;
// TODO: keep a list of files that threw this error and a dirty boolean marker -> if true, alert user
} finally {
if (xmlEventReader != null) {
try {
xmlEventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
}
}
}
return parseTaxonomy ? resultTaxonomy : resultFilters;
}
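// Usage sketch (paths are placeholders; it is assumed here that GIGAFIDA is among the corpus types
// with taxonomy support, as suggested by readXMLGigafida below):
//   HashSet<String> taxonomy = (HashSet<String>)
//       readXmlHeaderTaxonomyAndFilters("/path/to/gigafida-file.xml", true, CorpusType.GIGAFIDA);
//   HashMap<String, HashSet<String>> solarFilters = (HashMap<String, HashSet<String>>)
//       readXmlHeaderTaxonomyAndFilters("/path/to/solar.xml", false, CorpusType.SOLAR);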
private static boolean isEndElementEndOfHeader(XMLEvent event, String headerTag) {
return event.asEndElement()
.getName()
.getLocalPart()
.equalsIgnoreCase(headerTag);
}
@SuppressWarnings("Duplicates")
public static boolean readXMLGigafida(String path, StatisticsNew stats) {
boolean inWord = false;
ArrayList<String> currentFiletaxonomy = new ArrayList<>();
String lemma = "";
String msd = "";
List<Word> sentence = new ArrayList<>();
List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it
String sentenceDelimiter = "s";
XMLEventReader eventReader = null;
try {
XMLInputFactory factory = XMLInputFactory.newInstance();
eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
XMLEvent event = eventReader.nextEvent();
switch (event.getEventType()) {
case XMLStreamConstants.START_ELEMENT:
StartElement startElement = event.asStartElement();
String qName = startElement.getName().getLocalPart();
// "word" node
if (qName.equals("w")) {
inWord = true;
msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
}
// taxonomy node
else if (qName.equalsIgnoreCase("catRef")) {
// there are some term nodes at the beginning that are of no interest to us
// they differ by not having the attribute "ref", so test will equal null
Attribute tax = startElement.getAttributeByName(QName.valueOf("target"));
if (tax != null) {
// keep only taxonomy properties
currentFiletaxonomy.add(String.valueOf(tax.getValue()).replace("#", ""));
}
}
break;
case XMLStreamConstants.CHARACTERS:
Characters characters = event.asCharacters();
// "word" node value
if (inWord) {
String word = characters.getData();
sentence.add(new Word(word, lemma, msd));
inWord = false;
}
break;
case XMLStreamConstants.END_ELEMENT:
EndElement endElement = event.asEndElement();
String var = endElement.getName().getLocalPart();
String debug = "";
// parser reached end of the current sentence
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
// add sentence to corpus if it passes filters
sentence = runFilters(sentence, stats.getFilter());
if (!ValidationUtil.isEmpty(sentence)) {
corpus.add(new Sentence(sentence));
}
// and start a new one
sentence = new ArrayList<>();
/* Invoke Fork-Join when we reach maximum limit of
* sentences (because we can't read everything to
* memory) or we reach the end of the file.
*/
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
fj(corpus, stats);
// empty the current corpus, since we don't need the data anymore
corpus.clear();
// TODO: if (stats.isUseDB()) {
// stats.storeTmpResultsToDB();
// }
}
} else if (endElement.getName().getLocalPart().equals("teiHeader")) {
// before proceeding to read this file, make sure that taxonomy filters are a match
if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection
if (currentFiletaxonomy.isEmpty()) {
// taxonomies don't match so stop
return false;
}
}
}
// fallback
else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
fj(corpus, stats);
corpus.clear();
// TODO: if (stats.isUseDB()) {
// stats.storeTmpResultsToDB();
// }
}
break;
}
}
} catch (FileNotFoundException | XMLStreamException e) {
e.printStackTrace();
} finally {
if (eventReader != null) {
try {
eventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
}
}
}
return true;
}
@SuppressWarnings("Duplicates")
public static boolean readXMLGos(String path, StatisticsNew stats) {
boolean inWord = false;
boolean inOrthDiv = false;
boolean computeForOrth = stats.getCorpus().isGosOrthMode();
ArrayList<String> currentFiletaxonomy = new ArrayList<>();
String lemma = "";
String msd = "";
List<Word> sentence = new ArrayList<>();
List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it
String sentenceDelimiter = "seg";
String gosType = stats.getFilter().hasMsd() ? "norm" : "orth"; // orth & norm
XMLEventReader eventReader = null;
boolean includeFile = true;
try {
XMLInputFactory factory = XMLInputFactory.newInstance();
eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
XMLEvent event = eventReader.nextEvent();
// System.out.print(String.format("%s", event.toString().replaceAll("\\['http://www.tei-c.org/ns/1.0'\\]::", "")));
switch (event.getEventType()) {
case XMLStreamConstants.START_ELEMENT:
StartElement startElement = event.asStartElement();
String qName = startElement.getName().getLocalPart();
if (qName.equals("div")) {
HashMap<String, String> atts = extractAttributes(startElement);
if (atts.keySet().contains("type")) {
inOrthDiv = atts.get("type").equals("orth");
}
}
// "word" node
if (qName.equals("w")) {
// check that it's not a type
HashMap<String, String> atts = extractAttributes(startElement);
if (!atts.containsKey("type")) {
inWord = true;
if (atts.containsKey("msd")) {
msd = atts.get("msd");
}
if (atts.containsKey("lemma")) {
lemma = atts.get("lemma");
}
//
// if (!inOrthDiv) {
// msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
// lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
// }
}
// }
}
// taxonomy node
else if (qName.equalsIgnoreCase("catRef")) {
// there are some term nodes at the beginning that are of no interest to us
// they differ by not having the attribute "ref", so test will equal null
Attribute tax = startElement.getAttributeByName(QName.valueOf("target"));
if (tax != null) {
// keep only taxonomy properties
currentFiletaxonomy.add(String.valueOf(tax.getValue()));
}
} else if (qName.equalsIgnoreCase("div")) {
gosType = String.valueOf(startElement.getAttributeByName(QName.valueOf("type")).getValue());
}
break;
case XMLStreamConstants.CHARACTERS:
// "word" node value
if (inWord) {
Characters characters = event.asCharacters();
if (gosType.equals("norm") && msd != null) {
sentence.add(new Word(characters.getData(), lemma, msd));
} else {
sentence.add(new Word(characters.getData()));
}
inWord = false;
}
break;
case XMLStreamConstants.END_ELEMENT:
EndElement endElement = event.asEndElement();
// parser reached end of the current sentence
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
// add sentence to corpus if it passes filters
boolean saveSentence = computeForOrth == inOrthDiv;
if (includeFile && saveSentence && !ValidationUtil.isEmpty(sentence)) {
sentence = runFilters(sentence, stats.getFilter());
corpus.add(new Sentence(sentence));
}
// and start a new one
sentence = new ArrayList<>();
/* Invoke Fork-Join when we reach maximum limit of
* sentences (because we can't read everything to
* memory) or we reach the end of the file.
*/
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
fj(corpus, stats);
// empty the current corpus, since we don't need
// the data anymore
corpus.clear();
}
} else if (endElement.getName().getLocalPart().equals("teiHeader")) {
// before proceeding to read this file, make sure that taxonomy filters are a match
if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection
// disregard this entry if taxonomies don't match
includeFile = !currentFiletaxonomy.isEmpty();
currentFiletaxonomy = new ArrayList<>();
}
}
// backup
else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
fj(corpus, stats);
corpus.clear();
}
break;
}
}
} catch (FileNotFoundException | XMLStreamException e) {
e.printStackTrace();
} finally {
if (eventReader != null) {
try {
eventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
} catch (Exception e) {
logger.error("general error", e);
}
}
}
return true;
}
/**
* Runs the sentence through some filters so that unnecessary calculations are skipped.
* Filters:
* <ol>
* <li><b>Ngrams:</b> omit sentences that are shorter than the ngram value (e.g. 3 gram of a single word sentence)</li>
* <li><b>Letter ngrams:</b> omit words that are shorter than the specified string length (e.g. combinations of 3 letters when the word consists of only 2 letters)</li>
* </ol>
*
* @return null (if the sentence fails 1.) or a sentence with some words removed (2.)
*/
private static List<Word> runFilters(List<Word> sentence, Filter filter) {
if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
// ngram level: if not 0 must be less than or equal to number of words in this sentence.
if (filter.getNgramValue() > 0 && filter.getNgramValue() > sentence.size()) {
return null;
}
// if we're calculating values for letters, omit words that are shorter than string length
if (filter.getNgramValue() == 0) {
sentence.removeIf(w -> (filter.getCalculateFor() == CalculateFor.WORD && w.getWord().length() < filter.getStringLength())
|| (filter.getCalculateFor() == CalculateFor.LEMMA && w.getLemma().length() < filter.getStringLength()));
}
}
return sentence;
}
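// Worked example: with ngramValue = 3, a two-word sentence fails filter 1. and null is returned;
// with ngramValue = 0, calculateFor = WORD and stringLength = 3, words shorter than three
// characters are removed from the sentence before any counting takes place.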
private static HashMap<String, String> extractAttributes(StartElement se) {
Iterator attributesIt = se.getAttributes();
HashMap<String, String> atts = new HashMap<>();
while (attributesIt.hasNext()) {
Attribute a = (Attribute) attributesIt.next();
atts.put(a.getName().getLocalPart(), a.getValue());
}
return atts;
}
}

src/main/java/alg/inflectedJOS/ForkJoin.java (+67)

@@ -0,0 +1,67 @@
package alg.inflectedJOS;
import java.util.List;
import java.util.concurrent.RecursiveAction;
import data.Sentence;
import data.Statistics;
public class ForkJoin extends RecursiveAction {
private static final long serialVersionUID = -1260951004477299634L;
private static final int ACCEPTABLE_SIZE = 1000;
private List<Sentence> corpus;
private Statistics stats;
private int start;
private int end;
/**
* Constructor for subproblems.
*/
private ForkJoin(List<Sentence> corpus, int start, int end, Statistics stats) {
this.corpus = corpus;
this.start = start;
this.end = end;
this.stats = stats;
}
/**
* Default constructor for the initial problem
*/
public ForkJoin(List<Sentence> corpus, Statistics stats) {
this.corpus = corpus;
this.start = 0;
this.end = corpus.size();
this.stats = stats;
}
private void computeDirectly() {
List<Sentence> subCorpus = corpus.subList(start, end);
if (stats.isTaxonomySet()) {
InflectedJOSCount.calculateForAll(subCorpus, stats, stats.getInflectedJosTaxonomy());
} else {
InflectedJOSCount.calculateForAll(subCorpus, stats, null);
}
}
@Override
protected void compute() {
int subCorpusSize = end - start;
if (subCorpusSize < ACCEPTABLE_SIZE) {
computeDirectly();
} else {
int mid = start + subCorpusSize / 2;
ForkJoin left = new ForkJoin(corpus, start, mid, stats);
ForkJoin right = new ForkJoin(corpus, mid, end, stats);
// fork (push to queue)-> compute -> join
left.fork();
right.fork();
left.join();
right.join();
}
}
}

src/main/java/alg/inflectedJOS/InflectedJOSCount.java (+170)

@@ -0,0 +1,170 @@
package alg.inflectedJOS;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import alg.Common;
import data.Sentence;
import data.Statistics;
import data.StatisticsNew;
import data.Word;
public class InflectedJOSCount {
public static HashMap<Integer, ArrayList<ArrayList<Integer>>> indices;
// static {
// // calculate all possible combinations of indices we will substitute with a '-' for substring statistics
// indices = new HashMap<>();
// for (int i = 5; i <= 8; i++) {
// indices.put(i, calculateCombinations(i));
// }
// }
//
// private static List<Integer> calculateCombinations(int i) {
// int arr[] = {1, 2, 3, 4, 5};
// int r = 3;
// int n = arr.length;
// ArrayList<ArrayList<Integer>> result = new ArrayList<>();
//
// return printCombination(arr, n, r);
// }
//
// /* arr[] ---> Input Array
// data[] ---> Temporary array to store current combination
// start & end ---> Starting and Ending indexes in arr[]
// index ---> Current index in data[]
// r ---> Size of a combination to be printed */
// static void combinationUtil(int arr[], int data[], int start,
// int end, int index, int r, ArrayList<ArrayList<Integer>> result) {
// // Current combination is ready to be printed, print it
// ArrayList<Integer> tmpResult = new ArrayList<>();
//
// if (index == r) {
// ArrayList<Integer> tmpResult = new ArrayList<>();
// for (int j = 0; j < r; j++)
// System.out.print(data[j] + " ");
// System.out.println("");
// return;
// }
//
// // replace index with all possible elements. The condition
// // "end-i+1 >= r-index" makes sure that including one element
// // at index will make a combination with remaining elements
// // at remaining positions
// for (int i = start; i <= end && end - i + 1 >= r - index; i++) {
// data[index] = arr[i];
// combinationUtil(arr, data, i + 1, end, index + 1, r);
// }
// }
//
// // The main function that prints all combinations of size r
// // in arr[] of size n. This function mainly uses combinationUtil()
// static void printCombination(int arr[], int n, int r) {
// // A temporary array to store all combination one by one
// int data[] = new int[r];
//
// Print all combinations using temporary array 'data[]'
// combinationUtil(arr, data, 0, n - 1, 0, r);
// }
// public static void calculateForAll(List<Sentence> corpus, Statistics stats, String taxonomy) {
// for (Sentence s : corpus) {
// // disregard if wrong taxonomy
// if (!(s.getTaxonomy().startsWith(taxonomy))) {
// continue;
// }
//
// calculateCommon(s, stats.result);
//
// for (Word word : s.getWords()) {
// // skip if current word is not inflected
// if (!(word.getMsd().length() > 0)) {
// continue;
// }
//
// String msd = word.getMsd();
//
// StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
//
// for (int i = 1; i < msd.length(); i++) {
// entry.setCharAt(i, msd.charAt(i));
// Common.updateMap(stats.result, entry.toString());
// entry.setCharAt(i, '-');
// }
// }
// }
// }
// public static void calculateForAll(List<Sentence> corpus, Statistics stats) {
// for (Sentence s : corpus) {
// for (Word word : s.getWords()) {
// if (!(word.getMsd().length() > 0)) {
// continue;
// }
//
// String msd = word.getMsd();
//
// StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
//
// for (int i = 1; i < msd.length(); i++) {
// entry.setCharAt(i, msd.charAt(i));
// Common.updateMap(stats.result, entry.toString());
// entry.setCharAt(i, '-');
// }
// }
// }
// }
static void calculateForAll(List<Sentence> corpus, Statistics stats, String taxonomy) {
for (Sentence s : corpus) {
// disregard if wrong taxonomy
if (taxonomy != null && !(s.getTaxonomy().startsWith(taxonomy))) {
continue;
}
for (Word word : s.getWords()) {
// skip if current word is not inflected
if (!(word.getMsd().length() > 0)) {
continue;
}
String msd = word.getMsd();
StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
for (int i = 1; i < msd.length(); i++) {
entry.setCharAt(i, msd.charAt(i));
Common.updateMap(stats.result, entry.toString());
entry.setCharAt(i, '-');
}
}
}
}
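// Worked example of the masking loop above (the MSD value is a placeholder): for msd = "Abcde",
// the recorded entries are "Ab---", "A-c--", "A--d-" and "A---e", i.e. the word-class letter plus
// exactly one attribute position at a time.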
public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats, String taxonomy) {
for (Sentence s : corpus) {
for (Word word : s.getWords()) {
// skip if current word is not inflected
// // TODO: if has defined msd and is of correct type (create a set)
// if (!(word.getMsd().length() > 0)) {
// continue;
// }
String msd = word.getMsd();
StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
for (int i = 1; i < msd.length(); i++) {
entry.setCharAt(i, msd.charAt(i));
stats.updateResults(entry.toString());
entry.setCharAt(i, '-');
}
}
}
}
}

src/main/java/alg/inflectedJOS/WordFormation.java (+131)

@@ -0,0 +1,131 @@
package alg.inflectedJOS;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Collectors;
import data.Enums.InflectedJosTypes;
import data.StatisticsNew;
import gui.ValidationUtil;
import util.Combinations;
// adapted from http://www.geeksforgeeks.org/print-all-possible-combinations-of-r-elements-in-a-given-array-of-size-n/
public class WordFormation {
private static HashMap<String, Long> josTypeResult;
private static Object[][] tmpResults;
private static HashMap<Integer, HashSet<HashSet<Integer>>> indices;
static {
indices = new HashMap<>();
for (int i = 4; i <= 8; i++) {
indices.put(i, Combinations.generateIndices(i));
}
}
public static void calculateStatistics(StatisticsNew stat) {
Map<String, AtomicLong> result = stat.getResult();
// 1. filter - keep only inflected types
result.keySet().removeIf(x -> !InflectedJosTypes.inflectedJosTypes.contains(x.charAt(0)));
// 2. for each inflected type get all possible subcombinations
for (Character josChar : InflectedJosTypes.inflectedJosTypes) {
josTypeResult = new HashMap<>();
// filter out results for a single word type
Map<String, AtomicLong> singleTypeResults = result.entrySet().stream()
.filter(x -> x.getKey().charAt(0) == josChar)
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
if (ValidationUtil.isEmpty(singleTypeResults)) {
continue;
}
// get all possible indices combos for a msd of this length
// HashSet<HashSet<Integer>> indicesCombos = indices.get()
//Combinations.generateIndices(singleTypeResults.keySet().stream().findFirst().get().length());
for (Map.Entry<String, AtomicLong> e : singleTypeResults.entrySet()) {
int l = e.getKey().length();
for (HashSet<Integer> indicesCombo : indices.get(e.getKey().length())) {
updateResults(mask(e.getKey(), indicesCombo), e.getValue().longValue());
}
}
resultsMapToArray(singleTypeResults.values().stream().mapToLong(Number::longValue).sum());
}
stat.setResultCustom(tmpResults);
}
private static String mask(String word, HashSet<Integer> indicesCombo) {
StringBuilder sb = new StringBuilder();
sb.append(word.charAt(0));
for (int i = 1; i < word.length(); i++) {
sb.append(indicesCombo.contains(i) ? word.charAt(i) : ".");
}
return sb.toString();
}
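// Example (placeholder values): mask("Abcdef", {2, 4}) keeps the leading word-class letter and the
// selected positions only, yielding "A.c.e."; every unselected attribute is replaced with a dot.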
private static void updateResults(String s, Long nOfOccurences) {
// if not in map add
Long r = josTypeResult.putIfAbsent(s, nOfOccurences);
// else update
if (r != null) {
josTypeResult.put(s, josTypeResult.get(s) + nOfOccurences);
}
}
private static void resultsMapToArray(Long totalValue) {
Double total = totalValue * 1.0;
Object[][] josTypeResultArray = new Object[josTypeResult.size()][3];
int i = 0;
for (Map.Entry<String, Long> e : josTypeResult.entrySet()) {
josTypeResultArray[i][0] = e.getKey();
josTypeResultArray[i][1] = e.getValue();
josTypeResultArray[i][2] = e.getValue() / total;
if (e.getValue() > total) {
String debug = "";
}
i++;
}
if (tmpResults == null) {
tmpResults = josTypeResultArray;
} else {
int firstLength = tmpResults.length;
int secondLength = josTypeResultArray.length;
Object[][] tmp = new Object[firstLength + secondLength][3]