commit a18e52a599
94 changed files with 87092 additions and 0 deletions
.gitignore | 160
Corpus Analyzer.iml | 28
pom.xml | 122
src/main/java/META-INF/MANIFEST.MF | 3
src/main/java/alg/Common.java | 15
src/main/java/alg/XML_processing.java | 794
src/main/java/alg/inflectedJOS/ForkJoin.java | 67
src/main/java/alg/inflectedJOS/InflectedJOSCount.java | 170
src/main/java/alg/inflectedJOS/WordFormation.java | 131
src/main/java/alg/ngram/ForkJoin.java | 62
src/main/java/alg/ngram/Ngrams.java | 204
src/main/java/alg/word/ForkJoin.java | 62
src/main/java/alg/word/WordCount.java | 167
src/main/java/alg/word/WordLevel.java | 112
src/main/java/data/AnalysisLevel.java | 17
src/main/java/data/CalculateFor.java | 43
src/main/java/data/Corpus.java | 163
src/main/java/data/CorpusType.java | 25
src/main/java/data/Enums/InflectedJosTypes.java | 12
src/main/java/data/Enums/Msd.java | 68
src/main/java/data/Enums/WordLevelDefaultValues.java | 55
src/main/java/data/Enums/WordLevelType.java | 16
src/main/java/data/Enums/solar/SolarFilters.java | 57
src/main/java/data/Filter.java | 144
src/main/java/data/GigafidaJosWordType.java | 71
src/main/java/data/GigafidaTaxonomy.java | 76
src/main/java/data/GosTaxonomy.java | 85
src/main/java/data/Sentence.java | 56
src/main/java/data/Settings.java | 16
src/main/java/data/Statistics.java | 299
src/main/java/data/StatisticsNew.java | 409
src/main/java/data/Tax.java | 175
src/main/java/data/Taxonomy.java | 171
src/main/java/data/Validation.java | 53
src/main/java/data/Word.java | 141
src/main/java/gui/CharacterAnalysisTab.java | 454
src/main/java/gui/CorpusTab.java | 517
src/main/java/gui/FiltersForSolar.java | 187
src/main/java/gui/GUIController.java | 150
src/main/java/gui/Messages.java | 74
src/main/java/gui/OneWordAnalysisTab.java | 389
src/main/java/gui/SelectedFiltersPane.java | 18
src/main/java/gui/StringAnalysisTabNew2.java | 511
src/main/java/gui/ValidationUtil.java | 77
src/main/java/gui/WordFormationTab.java | 208
src/main/java/gui/WordLevelTab.java | 207
src/main/java/manifest/META-INF/MANIFEST.MF | 3
src/main/java/util/ByteUtils.java | 25
src/main/java/util/Combinations.java | 46
src/main/java/util/Export.java | 267
src/main/java/util/Key.java | 31
src/main/java/util/TimeWatch.java | 63
src/main/java/util/Util.java | 225
src/main/java/util/db/RDB.java | 132
src/main/resources/GOS_small/TEI_GOS_small.xml | 68720
src/main/resources/GOS_tax_test/GOS_tax_test.xml | 524
src/main/resources/GUI.fxml | 133
src/main/resources/Gigafida_minimal/gfmin.xml | 237
src/main/resources/Gigafida_subset/Besedni_nizi_Gigafida_lema_0-gram_0-skip_14.05.2018_06.34.13.csv | 70
src/main/resources/Gigafida_subset/Besedni_nizi_Gigafida_lema_0-gram_0-skip_14.05.2018_06.37.50.csv | 390
src/main/resources/Gigafida_subset/Besedni_nizi_Gigafida_lema_0-gram_0-skip_14.05.2018_06.38.17.csv | 1147
src/main/resources/Gigafida_subset/Besedni_nizi_Gigafida_lema_1-gram_0-skip_31.01.2018_05.11.26.csv | 455
src/main/resources/Gigafida_subset/Besedni_nizi_Gigafida_lema_2-gram_1-skip_31.01.2018_05.11.33.csv | 1160
src/main/resources/Gigafida_subset/Besedni_nizi_Gigafida_različnica_1-gram_0-skip_25.01.2018_06.27.41.csv | 512
src/main/resources/Gigafida_subset/Besedni_nizi_Gigafida_različnica_2-gram_0-skip_20.01.2018_01.27.csv | 623
src/main/resources/Gigafida_subset/Besedni_nizi_Gigafida_različnica_3-gram_0-skip_20.01.2018_01.27.csv | 572
src/main/resources/Gigafida_subset/F0012405.xml | 350
src/main/resources/Gigafida_subset/F0016316.xml | 367
src/main/resources/Gigafida_subset/F0018194.xml | 336
src/main/resources/Gigafida_subset/F0026709.xml | 367
src/main/resources/Gigafida_subset/F0030361.xml | 365
src/main/resources/Gigafida_subset/nested/F0036980.xml | 356
src/main/resources/Gigafida_subset/nested/F0037258.xml | 408
src/main/resources/Gigafida_subset/nested/F0037544.xml | 391
src/main/resources/Gigafida_subset/nested/F0038754.xml | 355
src/main/resources/Gigafida_subset/nested/F0038920.xml | 402
src/main/resources/Lists/prefixes.txt | 18
src/main/resources/Lists/suffixes.txt | 7
src/main/resources/gui/CharacterAnalysisTab.fxml | 54
src/main/resources/gui/CorpusTab.fxml | 32
src/main/resources/gui/FiltersForSolar.fxml | 30
src/main/resources/gui/OneWordAnalysisTab.fxml | 56
src/main/resources/gui/SelectedFiltersPane.fxml | 13
src/main/resources/gui/StringAnalysisTabNew2.fxml | 105
src/main/resources/gui/WordFormationTab.fxml | 25
src/main/resources/gui/WordLevelTab.fxml | 25
src/main/resources/log4j2.xml | 22
src/test/java/Common.java | 85
src/test/java/CorpusTests.java | 42
src/test/java/DBTest.java | 66
src/test/java/NgramTests.java | 334
src/test/java/WordFormationTest.java | 51
src/test/java/WordLevelTest.java | 15
src/test/java/WordTest.java | 39
.gitignore
@@ -0,0 +1,160 @@ |
|||
# Created by .ignore support plugin (hsz.mobi) |
|||
### Maven template |
|||
target/ |
|||
pom.xml.tag |
|||
pom.xml.releaseBackup |
|||
pom.xml.versionsBackup |
|||
pom.xml.next |
|||
release.properties |
|||
dependency-reduced-pom.xml |
|||
buildNumber.properties |
|||
.mvn/timing.properties |
|||
|
|||
# Avoid ignoring Maven wrapper jar file (.jar files are usually ignored) |
|||
!/.mvn/wrapper/maven-wrapper.jar |
|||
### JetBrains template |
|||
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm |
|||
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 |
|||
|
|||
# User-specific stuff: |
|||
.idea/**/workspace.xml |
|||
.idea/**/tasks.xml |
|||
.idea/dictionaries |
|||
.idea/ |
|||
|
|||
# Sensitive or high-churn files: |
|||
.idea/**/dataSources/ |
|||
.idea/**/dataSources.ids |
|||
.idea/**/dataSources.xml |
|||
.idea/**/dataSources.local.xml |
|||
.idea/**/sqlDataSources.xml |
|||
.idea/**/dynamic.xml |
|||
.idea/**/uiDesigner.xml |
|||
|
|||
# Gradle: |
|||
.idea/**/gradle.xml |
|||
.idea/**/libraries |
|||
|
|||
# Mongo Explorer plugin: |
|||
.idea/**/mongoSettings.xml |
|||
|
|||
## File-based project format: |
|||
*.iws |
|||
|
|||
## Plugin-specific files: |
|||
|
|||
# IntelliJ |
|||
/out/ |
|||
|
|||
# mpeltonen/sbt-idea plugin |
|||
.idea_modules/ |
|||
|
|||
# JIRA plugin |
|||
atlassian-ide-plugin.xml |
|||
|
|||
# Crashlytics plugin (for Android Studio and IntelliJ) |
|||
com_crashlytics_export_strings.xml |
|||
crashlytics.properties |
|||
crashlytics-build.properties |
|||
fabric.properties |
|||
### Java template |
|||
# Compiled class file |
|||
# Log file |
|||
*.log |
|||
|
|||
# BlueJ files |
|||
*.ctxt |
|||
|
|||
# Mobile Tools for Java (J2ME) |
|||
.mtj.tmp/ |
|||
|
|||
# Package Files # |
|||
*.war |
|||
*.ear |
|||
*.zip |
|||
*.tar.gz |
|||
*.rar |
|||
|
|||
# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml |
|||
hs_err_pid* |
|||
### Eclipse template |
|||
|
|||
.metadata |
|||
bin/ |
|||
tmp/ |
|||
*.tmp |
|||
*.bak |
|||
*.swp |
|||
*~.nib |
|||
local.properties |
|||
.settings/ |
|||
.loadpath |
|||
.recommenders |
|||
|
|||
# Eclipse Core |
|||
.project |
|||
|
|||
# External tool builders |
|||
.externalToolBuilders/ |
|||
|
|||
# Locally stored "Eclipse launch configurations" |
|||
*.launch |
|||
|
|||
# PyDev specific (Python IDE for Eclipse) |
|||
*.pydevproject |
|||
|
|||
# CDT-specific (C/C++ Development Tooling) |
|||
.cproject |
|||
|
|||
# JDT-specific (Eclipse Java Development Tools) |
|||
.classpath |
|||
|
|||
# Java annotation processor (APT) |
|||
.factorypath |
|||
|
|||
# PDT-specific (PHP Development Tools) |
|||
.buildpath |
|||
|
|||
# sbteclipse plugin |
|||
.target |
|||
|
|||
# Tern plugin |
|||
.tern-project |
|||
|
|||
# TeXlipse plugin |
|||
.texlipse |
|||
|
|||
# STS (Spring Tool Suite) |
|||
.springBeans |
|||
|
|||
# Code Recommenders |
|||
.recommenders/ |
|||
|
|||
# Scala IDE specific (Scala & Java development for Eclipse) |
|||
.cache-main |
|||
.scala_dependencies |
|||
.worksheet |
|||
|
|||
|
|||
|
|||
|
|||
### Windows ### |
|||
# Windows thumbnail cache files |
|||
Thumbs.db |
|||
ehthumbs.db |
|||
ehthumbs_vista.db |
|||
|
|||
# Folder config file |
|||
Desktop.ini |
|||
|
|||
# Recycle Bin used on file shares |
|||
$RECYCLE.BIN/ |
|||
|
|||
# Windows Installer files |
|||
*.cab |
|||
*.msi |
|||
*.msm |
|||
*.msp |
|||
|
|||
# Windows shortcuts |
|||
*.lnk |
Corpus Analyzer.iml
@@ -0,0 +1,28 @@ |
|||
<?xml version="1.0" encoding="UTF-8"?> |
|||
<module org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule="true" type="JAVA_MODULE" version="4"> |
|||
<component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_8"> |
|||
<output url="file://$MODULE_DIR$/target/classes" /> |
|||
<output-test url="file://$MODULE_DIR$/target/test-classes" /> |
|||
<content url="file://$MODULE_DIR$"> |
|||
<sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" /> |
|||
<sourceFolder url="file://$MODULE_DIR$/src/main/resources" type="java-resource" /> |
|||
<sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" /> |
|||
<excludeFolder url="file://$MODULE_DIR$/target" /> |
|||
</content> |
|||
<orderEntry type="inheritedJdk" /> |
|||
<orderEntry type="sourceFolder" forTests="false" /> |
|||
<orderEntry type="library" name="Maven: commons-io:commons-io:2.5" level="project" /> |
|||
<orderEntry type="library" name="Maven: org.apache.commons:commons-lang3:3.6" level="project" /> |
|||
<orderEntry type="library" name="Maven: com.googlecode.json-simple:json-simple:1.1.1" level="project" /> |
|||
<orderEntry type="library" name="Maven: junit:junit:4.10" level="project" /> |
|||
<orderEntry type="library" name="Maven: org.hamcrest:hamcrest-core:1.1" level="project" /> |
|||
<orderEntry type="library" name="Maven: org.apache.commons:commons-csv:1.4" level="project" /> |
|||
<orderEntry type="library" name="Maven: org.controlsfx:controlsfx:8.40.13" level="project" /> |
|||
<orderEntry type="library" name="Maven: org.rocksdb:rocksdbjni:5.7.3" level="project" /> |
|||
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-api:2.9.0" level="project" /> |
|||
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-core:2.9.0" level="project" /> |
|||
<orderEntry type="library" name="Maven: org.kordamp.ikonli:ikonli-fontawesome-pack:1.9.0" level="project" /> |
|||
<orderEntry type="library" name="Maven: org.kordamp.ikonli:ikonli-core:1.9.0" level="project" /> |
|||
<orderEntry type="library" name="Maven: org.kordamp.ikonli:ikonli-javafx:1.9.0" level="project" /> |
|||
</component> |
|||
</module> |
pom.xml
@@ -0,0 +1,122 @@ |
|||
<?xml version="1.0" encoding="UTF-8"?> |
|||
<project xmlns="http://maven.apache.org/POM/4.0.0" |
|||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" |
|||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> |
|||
<modelVersion>4.0.0</modelVersion> |
|||
|
|||
<groupId>thesis</groupId> |
|||
<artifactId>corpus-analyzer</artifactId> |
|||
<version>1.2</version> |
|||
|
|||
<dependencies> |
|||
<dependency> |
|||
<groupId>commons-io</groupId> |
|||
<artifactId>commons-io</artifactId> |
|||
<version>2.5</version> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>org.apache.commons</groupId> |
|||
<artifactId>commons-lang3</artifactId> |
|||
<version>3.6</version> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>com.googlecode.json-simple</groupId> |
|||
<artifactId>json-simple</artifactId> |
|||
<version>1.1.1</version> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>org.apache.commons</groupId> |
|||
<artifactId>commons-csv</artifactId> |
|||
<version>1.4</version> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>org.controlsfx</groupId> |
|||
<artifactId>controlsfx</artifactId> |
|||
<version>8.40.13</version> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>org.rocksdb</groupId> |
|||
<artifactId>rocksdbjni</artifactId> |
|||
<version>5.7.3</version> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>org.apache.logging.log4j</groupId> |
|||
<artifactId>log4j-api</artifactId> |
|||
<version>2.9.0</version> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>org.apache.logging.log4j</groupId> |
|||
<artifactId>log4j-core</artifactId> |
|||
<version>2.9.0</version> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>org.kordamp.ikonli</groupId> |
|||
<artifactId>ikonli-fontawesome-pack</artifactId> |
|||
<version>1.9.0</version> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>org.kordamp.ikonli</groupId> |
|||
<artifactId>ikonli-javafx</artifactId> |
|||
<version>1.9.0</version> |
|||
</dependency> |
|||
</dependencies> |
|||
|
|||
<build> |
|||
<plugins> |
|||
<plugin> |
|||
<!-- packages dependencies into the jar --> |
|||
<groupId>org.apache.maven.plugins</groupId> |
|||
<artifactId>maven-assembly-plugin</artifactId> |
|||
<executions> |
|||
<execution> |
|||
<phase>package</phase> |
|||
<goals> |
|||
<goal>single</goal> |
|||
</goals> |
|||
<configuration> |
|||
<archive> |
|||
<manifest> |
|||
<mainClass>gui.GUIController</mainClass> |
|||
</manifest> |
|||
</archive> |
|||
<descriptorRefs> |
|||
<descriptorRef>jar-with-dependencies</descriptorRef> |
|||
</descriptorRefs> |
|||
<appendAssemblyId>false</appendAssemblyId> |
|||
<outputDirectory>artifact</outputDirectory> |
|||
<finalName>Corpus_Analyzer_${version}</finalName> |
|||
</configuration> |
|||
</execution> |
|||
</executions> |
|||
</plugin> |
|||
<plugin> |
|||
<!-- JavaFX --> |
|||
<groupId>com.zenjava</groupId> |
|||
<artifactId>javafx-maven-plugin</artifactId> |
|||
<version>8.6.0</version> |
|||
<configuration> |
|||
<mainClass>gui.GUIController</mainClass> |
|||
<verbose>true</verbose> |
|||
</configuration> |
|||
<executions> |
|||
<execution> |
|||
<id>create-jfxjar</id> |
|||
<phase>package</phase> |
|||
<goals> |
|||
<goal>build-jar</goal> |
|||
</goals> |
|||
</execution> |
|||
</executions> |
|||
</plugin> |
|||
<plugin> |
|||
<groupId>org.apache.maven.plugins</groupId> |
|||
<artifactId>maven-compiler-plugin</artifactId> |
|||
<configuration> |
|||
<source>1.8</source> |
|||
<target>1.8</target> |
|||
</configuration> |
|||
</plugin> |
|||
</plugins> |
|||
</build> |
|||
|
|||
</project> |
src/main/java/META-INF/MANIFEST.MF
@@ -0,0 +1,3 @@ |
|||
Manifest-Version: 1.0 |
|||
Main-Class: gui.GUIController |
|||
|
src/main/java/alg/Common.java
@@ -0,0 +1,15 @@ |
|||
package alg; |
|||
|
|||
import java.util.Map; |
|||
import java.util.concurrent.atomic.AtomicLong; |
|||
|
|||
public class Common { |
|||
public static <K, V> void updateMap(Map<K, AtomicLong> map, K o) { |
|||
// if not in map |
|||
AtomicLong r = map.putIfAbsent(o, new AtomicLong(1)); |
|||
|
|||
// else |
|||
if (r != null) |
|||
map.get(o).incrementAndGet(); |
|||
} |
|||
} |
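Common.updateMap is the frequency-counting primitive used by the analyzers below: it inserts a counter of 1 for an unseen key and increments the existing counter otherwise. A minimal usage sketch follows (not part of the commit; the CommonDemo class and the ConcurrentHashMap are illustrative assumptions, since the real callers pass in the maps held by Statistics/StatisticsNew):

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;

public class CommonDemo {
    public static void main(String[] args) {
        // a concurrent map is assumed because the counters are updated from ForkJoin worker threads
        Map<String, AtomicLong> counts = new ConcurrentHashMap<>();

        for (String token : new String[] { "je", "in", "je" }) {
            alg.Common.updateMap(counts, token);
        }

        counts.forEach((k, v) -> System.out.println(k + " = " + v.get()));
        // prints: je = 2, in = 1 (iteration order not guaranteed)
    }
}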
src/main/java/alg/XML_processing.java
@@ -0,0 +1,794 @@ |
|||
package alg; |
|||
|
|||
import static data.Enums.solar.SolarFilters.*; |
|||
|
|||
import java.io.FileInputStream; |
|||
import java.io.FileNotFoundException; |
|||
import java.util.*; |
|||
import java.util.concurrent.ForkJoinPool; |
|||
|
|||
import javax.xml.namespace.QName; |
|||
import javax.xml.stream.XMLEventReader; |
|||
import javax.xml.stream.XMLInputFactory; |
|||
import javax.xml.stream.XMLStreamConstants; |
|||
import javax.xml.stream.XMLStreamException; |
|||
import javax.xml.stream.events.*; |
|||
|
|||
import org.apache.logging.log4j.LogManager; |
|||
|
|||
import data.*; |
|||
import gui.ValidationUtil; |
|||
|
|||
public class XML_processing { |
|||
public final static org.apache.logging.log4j.Logger logger = LogManager.getLogger(XML_processing.class); |
|||
|
|||
// public static void processCorpus(Statistics stats) { |
|||
// // we can preset the list's size, so there won't be a need to resize it |
|||
// List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); |
|||
// |
|||
// int i = 0; |
|||
// for (File f : Settings.corpus) { |
|||
// i++; |
|||
// readXML(f.toString(), stats); |
|||
// } |
|||
// } |
|||
|
|||
// public static void readXML(String path, Statistics stats) { |
|||
// if (stats.getCorpusType() == CorpusType.GIGAFIDA) { |
|||
// readXMLGigafida(path, stats); |
|||
// } else if (stats.getCorpusType() == CorpusType.GOS) { |
|||
// readXMLGos(path, stats); |
|||
// } else if (stats.getCorpusType() == CorpusType.SOLAR) { |
|||
// readXMLSolar(path, stats); |
|||
// } |
|||
// } |
|||
|
|||
public static void readXML(String path, StatisticsNew stats) { |
|||
if (stats.getCorpus().getCorpusType() == CorpusType.GIGAFIDA |
|||
|| stats.getCorpus().getCorpusType() == CorpusType.CCKRES) { |
|||
readXMLGigafida(path, stats); |
|||
} else if (stats.getCorpus().getCorpusType() == CorpusType.GOS) { |
|||
readXMLGos(path, stats); |
|||
} else if (stats.getCorpus().getCorpusType() == CorpusType.SOLAR) { |
|||
readXMLSolar(path, stats); |
|||
} |
|||
} |
|||
|
|||
/** |
|||
* Reads and returns the value of the given header tag, or an empty string if the tag is not found. |
|||
* E.g. the title tag, which is used to discern the corpus type. |
|||
* Note: only the value of the first occurrence of the given tag name is returned. |
|||
*/ |
|||
public static String readXMLHeaderTag(String path, String tag) { |
|||
XMLInputFactory factory = XMLInputFactory.newInstance(); |
|||
XMLEventReader eventReader = null; |
|||
|
|||
try { |
|||
eventReader = factory.createXMLEventReader(new FileInputStream(path)); |
|||
while (eventReader.hasNext()) { |
|||
XMLEvent xmlEvent = eventReader.nextEvent(); |
|||
if (xmlEvent.isStartElement()) { |
|||
StartElement startElement = xmlEvent.asStartElement(); |
|||
String var = startElement.getName().getLocalPart(); |
|||
|
|||
if (var.equalsIgnoreCase(tag)) { |
|||
return eventReader.nextEvent().asCharacters().getData(); |
|||
} |
|||
} |
|||
} |
|||
} catch (FileNotFoundException | XMLStreamException e) { |
|||
e.printStackTrace(); |
|||
} finally { |
|||
if (eventReader != null) { |
|||
try { |
|||
eventReader.close(); |
|||
} catch (XMLStreamException e) { |
|||
logger.error("closing stream", e); |
|||
} |
|||
} |
|||
} |
|||
return ""; |
|||
} |
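	// Usage sketch (comment added for illustration, not in the original source):
	// the corpus type can be guessed from the first <title> value in a file's header, e.g.
	//   String title = XML_processing.readXMLHeaderTag(path, "title");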
|||
|
|||
private static void fj(List<Sentence> corpus, StatisticsNew stats) { |
|||
ForkJoinPool pool = new ForkJoinPool(); |
|||
|
|||
if (stats.getFilter().getAl() == AnalysisLevel.STRING_LEVEL) { |
|||
alg.ngram.ForkJoin wc = new alg.ngram.ForkJoin(corpus, stats); |
|||
pool.invoke(wc); |
|||
} else if (stats.getFilter().getAl() == AnalysisLevel.WORD_LEVEL) { |
|||
alg.word.ForkJoin wc = new alg.word.ForkJoin(corpus, stats); |
|||
pool.invoke(wc); |
|||
} else { |
|||
// TODO: |
|||
// alg.inflectedJOS.ForkJoin wc = new alg.inflectedJOS.ForkJoin(corpus, stats); |
|||
// pool.invoke(wc); |
|||
} |
|||
} |
|||
|
|||
// public static void readXMLGos(String path, Statistics stats) { |
|||
// boolean in_word = false; |
|||
// String taksonomija = ""; |
|||
// String lemma = ""; |
|||
// String msd = ""; |
|||
// String type = stats.isGosOrthMode() ? "orth" : "norm"; // orth & norm |
|||
// |
|||
// List<Word> stavek = new ArrayList<>(); |
|||
// List<Sentence> corpus = new ArrayList<>(); |
|||
// String sentenceDelimiter = "seg"; |
|||
// String taxonomyPrefix = "gos."; |
|||
// |
|||
// try { |
|||
// XMLInputFactory factory = XMLInputFactory.newInstance(); |
|||
// XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path)); |
|||
// |
|||
// while (eventReader.hasNext()) { |
|||
// XMLEvent event = eventReader.nextEvent(); |
|||
// |
|||
// switch (event.getEventType()) { |
|||
// case XMLStreamConstants.START_ELEMENT: |
|||
// |
|||
// StartElement startElement = event.asStartElement(); |
|||
// String qName = startElement.getName().getLocalPart(); |
|||
// |
|||
// // "word" node |
|||
// if (qName.equals("w")) { |
|||
// in_word = true; |
|||
// |
|||
// if (type.equals("norm")) { |
|||
// // make sure we're looking at <w lemma...> and not <w type...> |
|||
// Iterator var = startElement.getAttributes(); |
|||
// ArrayList<Object> attributes = new ArrayList<>(); |
|||
// while (var.hasNext()) { |
|||
// attributes.add(var.next()); |
|||
// } |
|||
// |
|||
// if (attributes.contains("msd")) { |
|||
// msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue()); |
|||
// } else { |
|||
// msd = null; |
|||
// } |
|||
// |
|||
// if (attributes.contains("lemma")) { |
|||
// lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue()); |
|||
// } |
|||
// } |
|||
// } |
|||
// // taxonomy node |
|||
// else if (qName.equalsIgnoreCase("catRef")) { |
|||
// // there are some term nodes at the beginning that are of no interest to us |
|||
// // they differ by not having the attribute "ref", so test will equal null |
|||
// Attribute test = startElement.getAttributeByName(QName.valueOf("target")); |
|||
// |
|||
// if (test != null) { |
|||
// // keep only taxonomy properties |
|||
// taksonomija = String.valueOf(test.getValue()).replace(taxonomyPrefix, ""); |
|||
// } |
|||
// } else if (qName.equalsIgnoreCase("div")) { |
|||
// type = String.valueOf(startElement.getAttributeByName(QName.valueOf("type")).getValue()); |
|||
// |
|||
// } |
|||
// break; |
|||
// |
|||
// case XMLStreamConstants.CHARACTERS: |
|||
// Characters characters = event.asCharacters(); |
|||
// |
|||
// // "word" node value |
|||
// if (in_word) { |
|||
// if (type.equals("norm") && msd != null) { |
|||
// stavek.add(new Word(characters.getData(), lemma, msd)); |
|||
// } else { |
|||
// stavek.add(new Word(characters.getData())); |
|||
// } |
|||
// |
|||
// in_word = false; |
|||
// } |
|||
// break; |
|||
// |
|||
// case XMLStreamConstants.END_ELEMENT: |
|||
// EndElement endElement = event.asEndElement(); |
|||
// |
|||
// // parser reached end of the current sentence |
|||
// if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) { |
|||
// // add sentence to corpus |
|||
// corpus.add(new Sentence(stavek, taksonomija, type)); |
|||
// // and start a new one |
|||
// stavek = new ArrayList<>(); |
|||
// |
|||
// /* Invoke Fork-Join when we reach maximum limit of |
|||
// * sentences (because we can't read everything to |
|||
// * memory) or we reach the end of the file. |
|||
// */ |
|||
// if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) { |
|||
// fj(corpus, stats); |
|||
// // empty the current corpus, since we don't need |
|||
// // the data anymore |
|||
// corpus.clear(); |
|||
// } |
|||
// } |
|||
// |
|||
// // backup |
|||
// if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) { |
|||
// fj(corpus, stats); |
|||
// corpus.clear(); |
|||
// } |
|||
// |
|||
// break; |
|||
// } |
|||
// } |
|||
// } catch (FileNotFoundException | XMLStreamException e) { |
|||
// e.printStackTrace(); |
|||
// } |
|||
// } |
|||
|
|||
@SuppressWarnings("unused") |
|||
public static void readXMLSolar(String path, StatisticsNew stats) { |
|||
boolean in_word = false; |
|||
String lemma = ""; |
|||
String msd = ""; |
|||
|
|||
List<Word> stavek = new ArrayList<>(); |
|||
List<Sentence> corpus = new ArrayList<>(); |
|||
|
|||
// used for filter |
|||
Set<String> headTags = new HashSet<>(Arrays.asList("sola", "predmet", "razred", "regija", "tip", "leto")); |
|||
Map<String, String> headBlock = null; |
|||
boolean includeThisBlock = false; |
|||
|
|||
try { |
|||
XMLInputFactory factory = XMLInputFactory.newInstance(); |
|||
XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path)); |
|||
|
|||
while (eventReader.hasNext()) { |
|||
XMLEvent event = eventReader.nextEvent(); |
|||
|
|||
switch (event.getEventType()) { |
|||
case XMLStreamConstants.START_ELEMENT: |
|||
|
|||
StartElement startElement = event.asStartElement(); |
|||
// System.out.println(String.format("%s", startElement.toString())); |
|||
String qName = startElement.getName().getLocalPart(); |
|||
|
|||
// "word" node |
|||
if (qName.equals("w3")) { |
|||
in_word = true; |
|||
|
|||
msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue()); |
|||
lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue()); |
|||
} else if (qName.equals("c3")) { |
|||
String c3Content = eventReader.nextEvent().asCharacters().getData(); |
|||
|
|||
if (c3Content.equals(".") && includeThisBlock) { |
|||
// add sentence to corpus |
|||
corpus.add(new Sentence(stavek)); |
|||
// and start a new one |
|||
stavek = new ArrayList<>(); |
|||
|
|||
/* Invoke Fork-Join when we reach maximum limit of |
|||
* sentences (because we can't read everything to |
|||
* memory) or we reach the end of the file. |
|||
*/ |
|||
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) { |
|||
fj(corpus, stats); |
|||
// empty the current corpus, since we don't need |
|||
// the data anymore |
|||
corpus.clear(); |
|||
} |
|||
} |
|||
} else if (headTags.contains(qName)) { |
|||
String tagContent = eventReader.nextEvent().asCharacters().getData(); |
|||
headBlock.put(qName, tagContent); |
|||
} else if (qName.equals("head")) { |
|||
headBlock = new HashMap<>(); |
|||
} |
|||
|
|||
break; |
|||
|
|||
case XMLStreamConstants.CHARACTERS: |
|||
Characters characters = event.asCharacters(); |
|||
|
|||
// "word" node value |
|||
if (in_word) { |
|||
stavek.add(new Word(characters.getData(), lemma, msd)); |
|||
in_word = false; |
|||
} |
|||
break; |
|||
|
|||
case XMLStreamConstants.END_ELEMENT: |
|||
EndElement endElement = event.asEndElement(); |
|||
String qNameEnd = endElement.getName().getLocalPart(); |
|||
|
|||
if (qNameEnd.equals("head")) { |
|||
// validate and set boolean |
|||
if (validateHeadBlock(headBlock, stats.getFilter().getSolarFilters())) { |
|||
includeThisBlock = true; |
|||
} |
|||
} else if (qNameEnd.equals("body")) { |
|||
// new block, reset filter status |
|||
includeThisBlock = false; |
|||
} |
|||
|
|||
// backup |
|||
if (endElement.getName().getLocalPart().equalsIgnoreCase("korpus")) { |
|||
fj(corpus, stats); |
|||
corpus.clear(); |
|||
} |
|||
|
|||
break; |
|||
} |
|||
} |
|||
} catch (FileNotFoundException | XMLStreamException e) { |
|||
e.printStackTrace(); |
|||
} |
|||
} |
|||
|
|||
/** |
|||
* @param readHeadBlock block of tags read from the corpus |
|||
* @param userSetFilter tags with values set by the user |
|||
* |
|||
* @return true if the head block satisfies every user-set filter (or if no filter is set), false otherwise |
|||
*/ |
|||
private static boolean validateHeadBlock(Map<String, String> readHeadBlock, HashMap<String, HashSet<String>> userSetFilter) { |
|||
boolean pass = true; |
|||
|
|||
if (userSetFilter == null) { |
|||
return true; |
|||
} |
|||
|
|||
for (Map.Entry<String, HashSet<String>> filterEntry : userSetFilter.entrySet()) { |
|||
String key = filterEntry.getKey(); |
|||
HashSet<String> valueObject = filterEntry.getValue(); |
|||
|
|||
// if (valueObject instanceof String) { |
|||
// pass = validateHeadBlockEntry(readHeadBlock, key, (String) valueObject); |
|||
// } else |
|||
if (valueObject != null) { |
|||
//noinspection unchecked |
|||
for (String value : valueObject) { |
|||
pass = validateHeadBlockEntry(readHeadBlock, key, value); |
|||
} |
|||
} |
|||
|
|||
if (!pass) { |
|||
// current head block does not include one of the set filters - not likely, but an edge case anyway |
|||
return false; |
|||
} |
|||
} |
|||
|
|||
// if it gets to this point, it passed all the filters |
|||
return true; |
|||
} |
|||
|
|||
private static boolean validateHeadBlockEntry(Map<String, String> readHeadBlock, String userSetKey, String userSetValue) { |
|||
if (!readHeadBlock.keySet().contains(userSetKey)) { |
|||
// current head block does not include one of the set filters - not likely, but an edge case anyway |
|||
return false; |
|||
} else if (!readHeadBlock.get(userSetKey).equals(userSetValue)) { |
|||
// different values -> doesn't pass the filter |
|||
return false; |
|||
} |
|||
|
|||
return true; |
|||
} |
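	// Example (comment added for illustration; the values are made up): with a user filter
	// {"leto" -> {"2009"}}, a head block containing leto=2009 passes, a block with leto=2010
	// fails validateHeadBlockEntry, and a block with no "leto" tag at all is rejected as well.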
|||
|
|||
/** |
|||
* Parses an XML header for taxonomy information (if the corpus type supports it) or for the Solar filters. |
|||
* |
|||
* @param filepath path to the XML file whose header is read |
|||
* @param corpusIsSplit whether the corpus is split into multiple XML files or grouped into one large XML file |
|||
* @param corpusType type of the corpus being read |
|||
*/ |
|||
public static Object readXmlHeaderTaxonomyAndFilters(String filepath, boolean corpusIsSplit, CorpusType corpusType) { |
|||
boolean parseTaxonomy = Tax.getCorpusTypesWithTaxonomy().contains(corpusType); |
|||
// solar |
|||
Set<String> headTags = null; |
|||
HashMap<String, HashSet<String>> resultFilters = new HashMap<>(); |
|||
// taxonomy corpora |
|||
HashSet<String> resultTaxonomy = new HashSet<>(); |
|||
|
|||
String headTagName; |
|||
|
|||
if (corpusType == CorpusType.SOLAR) { |
|||
headTagName = "head"; |
|||
// used for filter |
|||
headTags = new HashSet<>(Arrays.asList(SOLA, PREDMET, RAZRED, REGIJA, TIP, LETO)); |
|||
|
|||
// init results now to avoid null pointers |
|||
headTags.forEach(f -> resultFilters.put(f, new HashSet<>())); |
|||
} else { |
|||
headTagName = "teiHeader"; |
|||
} |
|||
|
|||
XMLInputFactory factory = XMLInputFactory.newInstance(); |
|||
XMLEventReader xmlEventReader = null; |
|||
try { |
|||
xmlEventReader = factory.createXMLEventReader(new FileInputStream(filepath)); |
|||
boolean insideHeader = false; |
|||
|
|||
while (xmlEventReader.hasNext()) { |
|||
XMLEvent xmlEvent = xmlEventReader.nextEvent(); |
|||
|
|||
if (xmlEvent.isStartElement()) { |
|||
StartElement startElement = xmlEvent.asStartElement(); |
|||
String elementName = startElement.getName().getLocalPart(); |
|||
|
|||
if (elementName.equalsIgnoreCase(headTagName)) { |
|||
// if the corpus is split into files, we skip bodies |
|||
// this toggle is true when we're inside a header (next block of code executes) |
|||
// and false when we're not (skip reading unnecessary attributes) |
|||
insideHeader = true; |
|||
} |
|||
|
|||
if (insideHeader) { |
|||
if (parseTaxonomy && elementName.equalsIgnoreCase("catRef")) { |
|||
HashMap<String, String> atts = extractAttributes(startElement); |
|||
String debug = ""; |
|||
|
|||
String tax = startElement.getAttributeByName(QName.valueOf("target")) |
|||
.getValue() |
|||
.replace("#", ""); |
|||
|
|||
resultTaxonomy.add(tax); |
|||
} else if (!parseTaxonomy && headTags.contains(elementName)) { |
|||
String tagContent = xmlEventReader.nextEvent().asCharacters().getData(); |
|||
resultFilters.get(elementName).add(tagContent); |
|||
} |
|||
} |
|||
} else if (xmlEvent.isEndElement() && corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) { |
|||
// if the corpus is split into multiple files, each with only one header block per file |
|||
// that means we should stop after we reach the end of the header |
|||
return parseTaxonomy ? resultTaxonomy : resultFilters; |
|||
} else if (xmlEvent.isEndElement() && !corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) { |
|||
// whole corpus in one file, so we have to continue reading in order to find all header blocks |
|||
insideHeader = false; |
|||
} |
|||
} |
|||
} catch (XMLStreamException e) { |
|||
logger.error("Streaming error", e); |
|||
return parseTaxonomy ? resultTaxonomy : resultFilters; |
|||
} catch (FileNotFoundException e) { |
|||
logger.error("File not found", e); |
|||
return parseTaxonomy ? resultTaxonomy : resultFilters; |
|||
// TODO: keep a list of files that threw this error and a dirty boolean marker -> if true, alert user |
|||
} finally { |
|||
if (xmlEventReader != null) { |
|||
try { |
|||
xmlEventReader.close(); |
|||
} catch (XMLStreamException e) { |
|||
logger.error("closing stream", e); |
|||
} |
|||
} |
|||
} |
|||
return parseTaxonomy ? resultTaxonomy : resultFilters; |
|||
} |
|||
|
|||
private static boolean isEndElementEndOfHeader(XMLEvent event, String headerTag) { |
|||
return event.asEndElement() |
|||
.getName() |
|||
.getLocalPart() |
|||
.equalsIgnoreCase(headerTag); |
|||
} |
|||
|
|||
@SuppressWarnings("Duplicates") |
|||
public static boolean readXMLGigafida(String path, StatisticsNew stats) { |
|||
boolean inWord = false; |
|||
ArrayList<String> currentFiletaxonomy = new ArrayList<>(); |
|||
String lemma = ""; |
|||
String msd = ""; |
|||
|
|||
List<Word> sentence = new ArrayList<>(); |
|||
List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it |
|||
String sentenceDelimiter = "s"; |
|||
|
|||
XMLEventReader eventReader = null; |
|||
try { |
|||
XMLInputFactory factory = XMLInputFactory.newInstance(); |
|||
eventReader = factory.createXMLEventReader(new FileInputStream(path)); |
|||
|
|||
while (eventReader.hasNext()) { |
|||
XMLEvent event = eventReader.nextEvent(); |
|||
|
|||
switch (event.getEventType()) { |
|||
case XMLStreamConstants.START_ELEMENT: |
|||
StartElement startElement = event.asStartElement(); |
|||
String qName = startElement.getName().getLocalPart(); |
|||
|
|||
// "word" node |
|||
if (qName.equals("w")) { |
|||
inWord = true; |
|||
|
|||
msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue()); |
|||
lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue()); |
|||
} |
|||
// taxonomy node |
|||
else if (qName.equalsIgnoreCase("catRef")) { |
|||
// there are some term nodes at the beginning that are of no interest to us |
|||
// they differ by not having the "target" attribute, so tax will be null for them |
|||
Attribute tax = startElement.getAttributeByName(QName.valueOf("target")); |
|||
|
|||
if (tax != null) { |
|||
// keep only taxonomy properties |
|||
currentFiletaxonomy.add(String.valueOf(tax.getValue()).replace("#", "")); |
|||
} |
|||
} |
|||
break; |
|||
|
|||
case XMLStreamConstants.CHARACTERS: |
|||
Characters characters = event.asCharacters(); |
|||
|
|||
// "word" node value |
|||
if (inWord) { |
|||
String word = characters.getData(); |
|||
sentence.add(new Word(word, lemma, msd)); |
|||
inWord = false; |
|||
} |
|||
break; |
|||
|
|||
case XMLStreamConstants.END_ELEMENT: |
|||
EndElement endElement = event.asEndElement(); |
|||
|
|||
String var = endElement.getName().getLocalPart(); |
|||
String debug = ""; |
|||
|
|||
// parser reached end of the current sentence |
|||
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) { |
|||
// add sentence to corpus if it passes filters |
|||
sentence = runFilters(sentence, stats.getFilter()); |
|||
|
|||
if (!ValidationUtil.isEmpty(sentence)) { |
|||
corpus.add(new Sentence(sentence)); |
|||
} |
|||
|
|||
// and start a new one |
|||
sentence = new ArrayList<>(); |
|||
|
|||
/* Invoke Fork-Join when we reach maximum limit of |
|||
* sentences (because we can't read everything to |
|||
* memory) or we reach the end of the file. |
|||
*/ |
|||
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) { |
|||
fj(corpus, stats); |
|||
// empty the current corpus, since we don't need the data anymore |
|||
corpus.clear(); |
|||
|
|||
// TODO: if (stats.isUseDB()) { |
|||
// stats.storeTmpResultsToDB(); |
|||
// } |
|||
} |
|||
} else if (endElement.getName().getLocalPart().equals("teiHeader")) { |
|||
// before proceeding to read this file, make sure that taxonomy filters are a match |
|||
|
|||
if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) { |
|||
currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection |
|||
|
|||
if (currentFiletaxonomy.isEmpty()) { |
|||
// taxonomies don't match so stop |
|||
return false; |
|||
} |
|||
} |
|||
} |
|||
|
|||
// fallback |
|||
else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) { |
|||
fj(corpus, stats); |
|||
corpus.clear(); |
|||
|
|||
// TODO: if (stats.isUseDB()) { |
|||
// stats.storeTmpResultsToDB(); |
|||
// } |
|||
} |
|||
|
|||
break; |
|||
} |
|||
} |
|||
} catch (FileNotFoundException | XMLStreamException e) { |
|||
e.printStackTrace(); |
|||
} finally { |
|||
if (eventReader != null) { |
|||
try { |
|||
eventReader.close(); |
|||
} catch (XMLStreamException e) { |
|||
logger.error("closing stream", e); |
|||
} |
|||
} |
|||
} |
|||
|
|||
return true; |
|||
} |
|||
|
|||
@SuppressWarnings("Duplicates") |
|||
public static boolean readXMLGos(String path, StatisticsNew stats) { |
|||
boolean inWord = false; |
|||
boolean inOrthDiv = false; |
|||
boolean computeForOrth = stats.getCorpus().isGosOrthMode(); |
|||
ArrayList<String> currentFiletaxonomy = new ArrayList<>(); |
|||
String lemma = ""; |
|||
String msd = ""; |
|||
|
|||
List<Word> sentence = new ArrayList<>(); |
|||
List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it |
|||
String sentenceDelimiter = "seg"; |
|||
|
|||
String gosType = stats.getFilter().hasMsd() ? "norm" : "orth"; // orth & norm |
|||
|
|||
XMLEventReader eventReader = null; |
|||
|
|||
boolean includeFile = true; |
|||
|
|||
try { |
|||
XMLInputFactory factory = XMLInputFactory.newInstance(); |
|||
eventReader = factory.createXMLEventReader(new FileInputStream(path)); |
|||
|
|||
while (eventReader.hasNext()) { |
|||
XMLEvent event = eventReader.nextEvent(); |
|||
// System.out.print(String.format("%s", event.toString().replaceAll("\\['http://www.tei-c.org/ns/1.0'\\]::", ""))); |
|||
|
|||
switch (event.getEventType()) { |
|||
case XMLStreamConstants.START_ELEMENT: |
|||
StartElement startElement = event.asStartElement(); |
|||
String qName = startElement.getName().getLocalPart(); |
|||
|
|||
if (qName.equals("div")) { |
|||
HashMap<String, String> atts = extractAttributes(startElement); |
|||
|
|||
if (atts.keySet().contains("type")) { |
|||
inOrthDiv = atts.get("type").equals("orth"); |
|||
} |
|||
} |
|||
|
|||
// "word" node |
|||
if (qName.equals("w")) { |
|||
// check that it's not a type |
|||
HashMap<String, String> atts = extractAttributes(startElement); |
|||
|
|||
if (!atts.containsKey("type")) { |
|||
inWord = true; |
|||
|
|||
if (atts.containsKey("msd")) { |
|||
msd = atts.get("msd"); |
|||
|
|||
} |
|||
if (atts.containsKey("lemma")) { |
|||
lemma = atts.get("lemma"); |
|||
} |
|||
// |
|||
// if (!inOrthDiv) { |
|||
// msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue()); |
|||
// lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue()); |
|||
// } |
|||
} |
|||
|
|||
// } |
|||
} |
|||
// taxonomy node |
|||
else if (qName.equalsIgnoreCase("catRef")) { |
|||
// there are some term nodes at the beginning that are of no interest to us |
|||
// they differ by not having the "target" attribute, so tax will be null for them |
|||
Attribute tax = startElement.getAttributeByName(QName.valueOf("target")); |
|||
|
|||
if (tax != null) { |
|||
// keep only taxonomy properties |
|||
currentFiletaxonomy.add(String.valueOf(tax.getValue())); |
|||
} |
|||
} else if (qName.equalsIgnoreCase("div")) { |
|||
gosType = String.valueOf(startElement.getAttributeByName(QName.valueOf("type")).getValue()); |
|||
} |
|||
break; |
|||
|
|||
case XMLStreamConstants.CHARACTERS: |
|||
// "word" node value |
|||
if (inWord) { |
|||
Characters characters = event.asCharacters(); |
|||
if (gosType.equals("norm") && msd != null) { |
|||
sentence.add(new Word(characters.getData(), lemma, msd)); |
|||
} else { |
|||
sentence.add(new Word(characters.getData())); |
|||
} |
|||
|
|||
inWord = false; |
|||
} |
|||
break; |
|||
|
|||
case XMLStreamConstants.END_ELEMENT: |
|||
EndElement endElement = event.asEndElement(); |
|||
|
|||
// parser reached end of the current sentence |
|||
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) { |
|||
// add sentence to corpus if it passes filters |
|||
boolean saveSentence = computeForOrth == inOrthDiv; |
|||
|
|||
if (includeFile && saveSentence && !ValidationUtil.isEmpty(sentence)) { |
|||
sentence = runFilters(sentence, stats.getFilter()); |
|||
corpus.add(new Sentence(sentence)); |
|||
} |
|||
|
|||
// and start a new one |
|||
sentence = new ArrayList<>(); |
|||
|
|||
/* Invoke Fork-Join when we reach maximum limit of |
|||
* sentences (because we can't read everything to |
|||
* memory) or we reach the end of the file. |
|||
*/ |
|||
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) { |
|||
fj(corpus, stats); |
|||
// empty the current corpus, since we don't need |
|||
// the data anymore |
|||
corpus.clear(); |
|||
} |
|||
} else if (endElement.getName().getLocalPart().equals("teiHeader")) { |
|||
// before proceeding to read this file, make sure that taxonomy filters are a match |
|||
if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) { |
|||
currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection |
|||
|
|||
// disregard this entry if taxonomies don't match |
|||
includeFile = !currentFiletaxonomy.isEmpty(); |
|||
|
|||
currentFiletaxonomy = new ArrayList<>(); |
|||
} |
|||
} |
|||
|
|||
// backup |
|||
else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) { |
|||
fj(corpus, stats); |
|||
corpus.clear(); |
|||
} |
|||
|
|||
break; |
|||
} |
|||
} |
|||
} catch (FileNotFoundException | XMLStreamException e) { |
|||
e.printStackTrace(); |
|||
} finally { |
|||
if (eventReader != null) { |
|||
try { |
|||
eventReader.close(); |
|||
} catch (XMLStreamException e) { |
|||
logger.error("closing stream", e); |
|||
} catch (Exception e) { |
|||
logger.error("general error", e); |
|||
} |
|||
} |
|||
} |
|||
|
|||
return true; |
|||
} |
|||
|
|||
/** |
|||
* Runs the sentence through some filters, so we don't do calculations when unnecessary. |
|||
* Filters: |
|||
* <ol> |
|||
* <li><b>Ngrams:</b> omit sentences that are shorter than the ngram value (e.g. 3 gram of a single word sentence)</li> |
|||
* <li><b>Letter ngrams:</b> omit words that are shorter than the specified string length (e.g. combinations of 3 letters when the word consists of only 2 letters)</li> |
|||
* </ol> |
|||
* |
|||
* @return null (if the sentence fails 1.), otherwise the sentence, possibly with some words removed (2.) |
|||
*/ |
|||
private static List<Word> runFilters(List<Word> sentence, Filter filter) { |
|||
if (filter.getAl() == AnalysisLevel.STRING_LEVEL) { |
|||
// ngram level: if not 0 must be less than or equal to number of words in this sentence. |
|||
if (filter.getNgramValue() > 0 && filter.getNgramValue() > sentence.size()) { |
|||
return null; |
|||
} |
|||
|
|||
// if we're calculating values for letters, omit words that are shorter than string length |
|||
if (filter.getNgramValue() == 0) { |
|||
sentence.removeIf(w -> (filter.getCalculateFor() == CalculateFor.WORD && w.getWord().length() < filter.getStringLength()) |
|||
|| (filter.getCalculateFor() == CalculateFor.LEMMA && w.getLemma().length() < filter.getStringLength())); |
|||
} |
|||
} |
|||
|
|||
return sentence; |
|||
} |
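	// Example (comment added for illustration): with ngramValue = 3 a two-word sentence is
	// dropped (runFilters returns null and the caller discards it), while with ngramValue = 0
	// and stringLength = 4 any word (or lemma, depending on calculateFor) shorter than four
	// characters is removed before letter n-grams are counted.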
|||
|
|||
private static HashMap<String, String> extractAttributes(StartElement se) { |
|||
Iterator attributesIt = se.getAttributes(); |
|||
HashMap<String, String> atts = new HashMap<>(); |
|||
|
|||
while (attributesIt.hasNext()) { |
|||
Attribute a = (Attribute) attributesIt.next(); |
|||
atts.put(a.getName().getLocalPart(), a.getValue()); |
|||
} |
|||
|
|||
return atts; |
|||
} |
|||
} |
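All three readers above follow the same StAX pattern: walk the event stream, remember the attributes when a start element of interest appears, attach the following character data to the current word, and flush the accumulated sentences to the fork-join step at sentence or document boundaries. A self-contained sketch of that pattern (not part of the commit; the StaxSketch class, the inline XML snippet and the MSD value are illustrative):

import java.io.StringReader;
import javax.xml.namespace.QName;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;

public class StaxSketch {
    public static void main(String[] args) throws Exception {
        String xml = "<s><w lemma=\"biti\" msd=\"Gp-ste-n\">je</w></s>";
        XMLEventReader reader = XMLInputFactory.newInstance()
                .createXMLEventReader(new StringReader(xml));

        boolean inWord = false;
        String lemma = "", msd = "";

        while (reader.hasNext()) {
            XMLEvent event = reader.nextEvent();
            if (event.isStartElement()) {
                StartElement se = event.asStartElement();
                // remember the attributes of the <w> element; its text follows as a CHARACTERS event
                if (se.getName().getLocalPart().equals("w")) {
                    inWord = true;
                    lemma = se.getAttributeByName(QName.valueOf("lemma")).getValue();
                    msd = se.getAttributeByName(QName.valueOf("msd")).getValue();
                }
            } else if (event.isCharacters() && inWord) {
                // prints: je / biti / Gp-ste-n
                System.out.println(event.asCharacters().getData() + " / " + lemma + " / " + msd);
                inWord = false;
            }
        }
        reader.close();
    }
}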
src/main/java/alg/inflectedJOS/ForkJoin.java
@@ -0,0 +1,67 @@ |
|||
package alg.inflectedJOS; |
|||
|
|||
import java.util.List; |
|||
import java.util.concurrent.RecursiveAction; |
|||
|
|||
import data.Sentence; |
|||
import data.Statistics; |
|||
|
|||
public class ForkJoin extends RecursiveAction { |
|||
private static final long serialVersionUID = -1260951004477299634L; |
|||
|
|||
private static final int ACCEPTABLE_SIZE = 1000; |
|||
private List<Sentence> corpus; |
|||
private Statistics stats; |
|||
private int start; |
|||
private int end; |
|||
|
|||
|
|||
/** |
|||
* Constructor for subproblems. |
|||
*/ |
|||
private ForkJoin(List<Sentence> corpus, int start, int end, Statistics stats) { |
|||
this.corpus = corpus; |
|||
this.start = start; |
|||
this.end = end; |
|||
this.stats = stats; |
|||
} |
|||
|
|||
/** |
|||
* Default constructor for the initial problem |
|||
*/ |
|||
public ForkJoin(List<Sentence> corpus, Statistics stats) { |
|||
this.corpus = corpus; |
|||
this.start = 0; |
|||
this.end = corpus.size(); |
|||
this.stats = stats; |
|||
} |
|||
|
|||
private void computeDirectly() { |
|||
List<Sentence> subCorpus = corpus.subList(start, end); |
|||
|
|||
if (stats.isTaxonomySet()) { |
|||
InflectedJOSCount.calculateForAll(subCorpus, stats, stats.getInflectedJosTaxonomy()); |
|||
} else { |
|||
InflectedJOSCount.calculateForAll(subCorpus, stats, null); |
|||
} |
|||
} |
|||
|
|||
@Override |
|||
protected void compute() { |
|||
int subCorpusSize = end - start; |
|||
|
|||
if (subCorpusSize < ACCEPTABLE_SIZE) { |
|||
computeDirectly(); |
|||
} else { |
|||
int mid = start + subCorpusSize / 2; |
|||
ForkJoin left = new ForkJoin(corpus, start, mid, stats); |
|||
ForkJoin right = new ForkJoin(corpus, mid, end, stats); |
|||
|
|||
// fork (push to queue)-> compute -> join |
|||
left.fork(); |
|||
right.fork(); |
|||
left.join(); |
|||
right.join(); |
|||
} |
|||
} |
|||
} |
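This RecursiveAction splits the sentence list in half until a chunk falls under ACCEPTABLE_SIZE and then processes that chunk directly. A generic, self-contained sketch of the same divide-and-conquer pattern (not part of the commit; SplitTask and its integer-summing payload are illustrative, and invokeAll stands in for the explicit fork/fork/join/join sequence above):

import java.util.List;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.RecursiveAction;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

public class SplitTask extends RecursiveAction {
    private static final int ACCEPTABLE_SIZE = 1000;
    private final List<Integer> data;
    private final int start, end;
    private final AtomicLong result;

    SplitTask(List<Integer> data, int start, int end, AtomicLong result) {
        this.data = data; this.start = start; this.end = end; this.result = result;
    }

    @Override
    protected void compute() {
        if (end - start < ACCEPTABLE_SIZE) {
            // "computeDirectly": process the sublist in this worker thread
            data.subList(start, end).forEach(result::addAndGet);
        } else {
            // split in half and let the pool schedule both halves
            int mid = start + (end - start) / 2;
            invokeAll(new SplitTask(data, start, mid, result),
                      new SplitTask(data, mid, end, result));
        }
    }

    public static void main(String[] args) {
        List<Integer> data = IntStream.range(0, 5000).boxed().collect(Collectors.toList());
        AtomicLong sum = new AtomicLong();
        new ForkJoinPool().invoke(new SplitTask(data, 0, data.size(), sum));
        System.out.println(sum.get()); // 12497500
    }
}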
src/main/java/alg/inflectedJOS/InflectedJOSCount.java
@@ -0,0 +1,170 @@ |
|||
package alg.inflectedJOS; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.HashMap; |
|||
import java.util.List; |
|||
|
|||
import org.apache.commons.lang3.StringUtils; |
|||
|
|||
import alg.Common; |
|||
import data.Sentence; |
|||
import data.Statistics; |
|||
import data.StatisticsNew; |
|||
import data.Word; |
|||
|
|||
public class InflectedJOSCount { |
|||
|
|||
public static HashMap<Integer, ArrayList<ArrayList<Integer>>> indices; |
|||
|
|||
// static { |
|||
// // calculate all possible combinations of indices we will substitute with a '-' for substring statistics |
|||
// indices = new HashMap<>(); |
|||
// for (int i = 5; i <= 8; i++) { |
|||
// indices.put(i, calculateCombinations(i)); |
|||
// } |
|||
// } |
|||
// |
|||
// private static List<Integer> calculateCombinations(int i) { |
|||
// int arr[] = {1, 2, 3, 4, 5}; |
|||
// int r = 3; |
|||
// int n = arr.length; |
|||
// ArrayList<ArrayList<Integer>> result = new ArrayList<>(); |
|||
// |
|||
// return printCombination(arr, n, r); |
|||
// } |
|||
// |
|||
// /* arr[] ---> Input Array |
|||
// data[] ---> Temporary array to store current combination |
|||
// start & end ---> Staring and Ending indexes in arr[] |
|||
// index ---> Current index in data[] |
|||
// r ---> Size of a combination to be printed */ |
|||
// static void combinationUtil(int arr[], int data[], int start, |
|||
// int end, int index, int r, ArrayList<ArrayList<Integer>> result) { |
|||
// // Current combination is ready to be printed, print it |
|||
// ArrayList<Integer> tmpResult = new ArrayList<>(); |
|||
// |
|||
// if (index == r) { |
|||
// ArrayList<Integer> tmpResult = new ArrayList<>(); |
|||
// for (int j = 0; j < r; j++) |
|||
// System.out.print(data[j] + " "); |
|||
// System.out.println(""); |
|||
// return; |
|||
// } |
|||
// |
|||
// // replace index with all possible elements. The condition |
|||
// // "end-i+1 >= r-index" makes sure that including one element |
|||
// // at index will make a combination with remaining elements |
|||
// // at remaining positions |
|||
// for (int i = start; i <= end && end - i + 1 >= r - index; i++) { |
|||
// data[index] = arr[i]; |
|||
// combinationUtil(arr, data, i + 1, end, index + 1, r); |
|||
// } |
|||
// } |
|||
// |
|||
// // The main function that prints all combinations of size r |
|||
// // in arr[] of size n. This function mainly uses combinationUtil() |
|||
// static void printCombination(int arr[], int n, int r) { |
|||
// // A temporary array to store all combination one by one |
|||
// int data[] = new int[r]; |
|||
// |
|||
// // Print all combination using temprary array 'data[]' |
|||
// combinationUtil(arr, data, 0, n - 1, 0, r); |
|||
// } |
|||
|
|||
// public static void calculateForAll(List<Sentence> corpus, Statistics stats, String taxonomy) { |
|||
// for (Sentence s : corpus) { |
|||
// // disregard if wrong taxonomy |
|||
// if (!(s.getTaxonomy().startsWith(taxonomy))) { |
|||
// continue; |
|||
// } |
|||
// |
|||
// calculateCommon(s, stats.result); |
|||
// |
|||
// for (Word word : s.getWords()) { |
|||
// // skip if current word is not inflected |
|||
// if (!(word.getMsd().length() > 0)) { |
|||
// continue; |
|||
// } |
|||
// |
|||
// String msd = word.getMsd(); |
|||
// |
|||
// StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1))); |
|||
// |
|||
// for (int i = 1; i < msd.length(); i++) { |
|||
// entry.setCharAt(i, msd.charAt(i)); |
|||
// Common.updateMap(stats.result, entry.toString()); |
|||
// entry.setCharAt(i, '-'); |
|||
// } |
|||
// } |
|||
// } |
|||
// } |
|||
|
|||
// public static void calculateForAll(List<Sentence> corpus, Statistics stats) { |
|||
// for (Sentence s : corpus) { |
|||
// for (Word word : s.getWords()) { |
|||
// if (!(word.getMsd().length() > 0)) { |
|||
// continue; |
|||
// } |
|||
// |
|||
// String msd = word.getMsd(); |
|||
// |
|||
// StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1))); |
|||
// |
|||
// for (int i = 1; i < msd.length(); i++) { |
|||
// entry.setCharAt(i, msd.charAt(i)); |
|||
// Common.updateMap(stats.result, entry.toString()); |
|||
// entry.setCharAt(i, '-'); |
|||
// } |
|||
// } |
|||
// } |
|||
// } |
|||
|
|||
static void calculateForAll(List<Sentence> corpus, Statistics stats, String taxonomy) { |
|||
for (Sentence s : corpus) { |
|||
// disregard if wrong taxonomy |
|||
if (taxonomy != null && !(s.getTaxonomy().startsWith(taxonomy))) { |
|||
continue; |
|||
} |
|||
|
|||
for (Word word : s.getWords()) { |
|||
// skip if current word is not inflected |
|||
if (!(word.getMsd().length() > 0)) { |
|||
continue; |
|||
} |
|||
|
|||
String msd = word.getMsd(); |
|||
|
|||
StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1))); |
|||
|
|||
for (int i = 1; i < msd.length(); i++) { |
|||
entry.setCharAt(i, msd.charAt(i)); |
|||
Common.updateMap(stats.result, entry.toString()); |
|||
entry.setCharAt(i, '-'); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
|
|||
public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats, String taxonomy) { |
|||
for (Sentence s : corpus) { |
|||
|
|||
for (Word word : s.getWords()) { |
|||
// skip if current word is not inflected |
|||
// // TODO: if has defined msd and is of correct type (create a set) |
|||
// if (!(word.getMsd().length() > 0)) { |
|||
// continue; |
|||
// } |
|||
|
|||
String msd = word.getMsd(); |
|||
|
|||
StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1))); |
|||
|
|||
for (int i = 1; i < msd.length(); i++) { |
|||
entry.setCharAt(i, msd.charAt(i)); |
|||
stats.updateResults(entry.toString()); |
|||
entry.setCharAt(i, '-'); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
} |
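Both calculateForAll variants record, for every inflected word, one hit per single-attribute mask of its MSD: the category letter is kept and exactly one further position is revealed at a time. A small sketch of what one MSD contributes (not part of the commit; MsdMaskDemo and the example MSD string are illustrative):

import org.apache.commons.lang3.StringUtils;

public class MsdMaskDemo {
    public static void main(String[] args) {
        String msd = "Ggnste";
        // start from the category letter followed by dashes, exactly as in calculateForAll
        StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', msd.length() - 1));

        for (int i = 1; i < msd.length(); i++) {
            entry.setCharAt(i, msd.charAt(i));
            System.out.println(entry);   // the key passed to stats.updateResults(...)
            entry.setCharAt(i, '-');     // hide the position again before revealing the next one
        }
        // prints: Gg----, G-n---, G--s--, G---t-, G----e
    }
}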
src/main/java/alg/inflectedJOS/WordFormation.java
@@ -0,0 +1,131 @@ |
|||
package alg.inflectedJOS; |
|||
|
|||
import java.util.HashMap; |
|||
import java.util.HashSet; |
|||
import java.util.Map; |
|||
import java.util.concurrent.atomic.AtomicLong; |
|||
import java.util.stream.Collectors; |
|||
|
|||
import data.Enums.InflectedJosTypes; |
|||
import data.StatisticsNew; |
|||
import gui.ValidationUtil; |
|||
import util.Combinations; |
|||
|
|||
// adapted from http://www.geeksforgeeks.org/print-all-possible-combinations-of-r-elements-in-a-given-array-of-size-n/ |
|||
public class WordFormation { |
|||
private static HashMap<String, Long> josTypeResult; |
|||
private static Object[][] tmpResults; |
|||
|
|||
private static HashMap<Integer, HashSet<HashSet<Integer>>> indices; |
|||
|
|||
static { |
|||
indices = new HashMap<>(); |
|||
|
|||
for (int i = 4; i <= 8; i++) { |
|||
indices.put(i, Combinations.generateIndices(i)); |
|||
} |
|||
} |
|||
|
|||
public static void calculateStatistics(StatisticsNew stat) { |
|||
Map<String, AtomicLong> result = stat.getResult(); |
|||
|
|||
// 1. filter - keep only inflected types |
|||
result.keySet().removeIf(x -> !InflectedJosTypes.inflectedJosTypes.contains(x.charAt(0))); |
|||
|
|||
// 2. for each inflected type get all possible subcombinations |
|||
for (Character josChar : InflectedJosTypes.inflectedJosTypes) { |
|||
josTypeResult = new HashMap<>(); |
|||
|
|||
// filter out results for a single word type |
|||
Map<String, AtomicLong> singleTypeResults = result.entrySet().stream() |
|||
.filter(x -> x.getKey().charAt(0) == josChar) |
|||
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); |
|||
|
|||
if (ValidationUtil.isEmpty(singleTypeResults)) { |
|||
continue; |
|||
} |
|||
|
|||
// get all possible indices combos for a msd of this length |
|||
// HashSet<HashSet<Integer>> indicesCombos = indices.get() |
|||
//Combinations.generateIndices(singleTypeResults.keySet().stream().findFirst().get().length()); |
|||
|
|||
for (Map.Entry<String, AtomicLong> e : singleTypeResults.entrySet()) { |
|||
int l = e.getKey().length(); |
|||
|
|||
for (HashSet<Integer> indicesCombo : indices.get(e.getKey().length())) { |
|||
updateResults(mask(e.getKey(), indicesCombo), e.getValue().longValue()); |
|||
} |
|||
} |
|||
|
|||
resultsMapToArray(singleTypeResults.values().stream().mapToLong(Number::longValue).sum()); |
|||
} |
|||
|
|||
stat.setResultCustom(tmpResults); |
|||
} |
|||
|
|||
private static String mask(String word, HashSet<Integer> indicesCombo) { |
|||
StringBuilder sb = new StringBuilder(); |
|||
|
|||
sb.append(word.charAt(0)); |
|||
for (int i = 1; i < word.length(); i++) { |
|||
sb.append(indicesCombo.contains(i) ? word.charAt(i) : "."); |
|||
} |
|||
|
|||
return sb.toString(); |
|||
} |
|||
|
|||
|
|||
private static void updateResults(String s, Long nOfOccurences) { |
|||
// if not in map add |
|||
Long r = josTypeResult.putIfAbsent(s, nOfOccurences); |
|||
|
|||
// else update |
|||
if (r != null) { |
|||
josTypeResult.put(s, josTypeResult.get(s) + nOfOccurences); |
|||
} |
|||
} |
|||
|
|||
private static void resultsMapToArray(Long totalValue) { |
|||
Double total = totalValue * 1.0; |
|||
Object[][] josTypeResultArray = new Object[josTypeResult.size()][3]; |
|||
|
|||
int i = 0; |
|||
for (Map.Entry<String, Long> e : josTypeResult.entrySet()) { |
|||
josTypeResultArray[i][0] = e.getKey(); |
|||
josTypeResultArray[i][1] = e.getValue(); |
|||
josTypeResultArray[i][2] = e.getValue() / total; |
|||
|
|||
if (e.getValue() > total) { |
|||
|
|||
String debug = ""; |
|||
|
|||
} |
|||
|
|||
i++; |
|||
} |
|||
|
|||
if (tmpResults == null) { |
|||
tmpResults = josTypeResultArray; |
|||
} else { |
|||
int firstLength = tmpResults.length; |
|||
int secondLength = josTypeResultArray |