Project copied
This commit is contained in:
17
src/main/java/data/AnalysisLevel.java
Normal file
17
src/main/java/data/AnalysisLevel.java
Normal file
@@ -0,0 +1,17 @@
|
||||
package data;
|
||||
|
||||
/**
 * The level at which a corpus analysis is performed; each constant carries
 * its Slovene display label shown in the GUI.
 */
public enum AnalysisLevel {
    STRING_LEVEL("Besedni nizi"),
    WORD_LEVEL("Nivo besed in delov besed"),
    WORD_FORMATION("Besedotvorni procesi");

    // Slovene label rendered in combo boxes / result headers.
    private final String name;

    AnalysisLevel(String name) {
        this.name = name;
    }

    /** Returns the Slovene display label. */
    @Override
    public String toString() {
        return this.name;
    }
}
|
||||
43
src/main/java/data/CalculateFor.java
Normal file
43
src/main/java/data/CalculateFor.java
Normal file
@@ -0,0 +1,43 @@
|
||||
package data;
|
||||
|
||||
/**
 * The unit a frequency/distribution analysis is computed for; each constant
 * carries its Slovene display label.
 *
 * <p>Note: DIST_WORDS and DIST_LEMMAS intentionally reuse the labels of WORD
 * and LEMMA, so a label lookup resolves to WORD/LEMMA (declaration order) —
 * exactly as the original explicit if-chain did.
 */
public enum CalculateFor {
    WORD("različnica"),
    LEMMA("lema"),
    MORPHOSYNTACTIC_SPECS("oblikoskladenjska oznaka"),
    MORPHOSYNTACTIC_PROPERTY("oblikoskladenjska lastnost"),
    WORD_TYPE("besedna vrsta"),
    DIST_WORDS("različnica"),
    DIST_LEMMAS("lema");

    // Slovene label rendered in the GUI.
    private final String name;

    CalculateFor(String name) {
        this.name = name;
    }

    /** Returns the Slovene display label. */
    @Override
    public String toString() {
        return this.name;
    }

    /**
     * Resolves a display label back to its constant.
     *
     * @param cf a Slovene label as produced by {@link #toString()}, may be null
     * @return the first declared constant with that label, or null if unknown/null
     */
    public static CalculateFor factory(String cf) {
        if (cf != null) {
            // Replaces the original five-branch if-chain; first declaration
            // wins, which preserves the WORD/LEMMA aliasing noted above.
            for (CalculateFor candidate : values()) {
                if (candidate.name.equals(cf)) {
                    return candidate;
                }
            }
        }
        return null;
    }
}
|
||||
163
src/main/java/data/Corpus.java
Normal file
163
src/main/java/data/Corpus.java
Normal file
@@ -0,0 +1,163 @@
|
||||
package data;
|
||||
|
||||
import static gui.Messages.*;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
|
||||
import data.Enums.solar.SolarFilters;
|
||||
import gui.ValidationUtil;
|
||||
import javafx.collections.ObservableList;
|
||||
|
||||
public class Corpus {
|
||||
public final static Logger logger = LogManager.getLogger(Corpus.class);
|
||||
|
||||
private CorpusType corpusType;
|
||||
private File chosenResultsLocation;
|
||||
private File chosenCorpusLocation;
|
||||
private Collection<File> detectedCorpusFiles;
|
||||
boolean headerRead;
|
||||
private ObservableList<String> taxonomy; // if gigafida or gos
|
||||
private HashMap<String, ObservableList<String>> solarFilters; // if solar
|
||||
private HashMap<String, HashSet<String>> solarFiltersForXML; // if solar - used while parsing xml
|
||||
private boolean gosOrthMode;
|
||||
boolean hasMsdData;
|
||||
private ArrayList<String> validationErrors;
|
||||
|
||||
public Corpus() {
|
||||
validationErrors = new ArrayList<>();
|
||||
}
|
||||
|
||||
public CorpusType getCorpusType() {
|
||||
return corpusType;
|
||||
}
|
||||
|
||||
public void setCorpusType(CorpusType corpusType) {
|
||||
this.corpusType = corpusType;
|
||||
logger.info("Corpus.set: ", corpusType);
|
||||
}
|
||||
|
||||
public File getChosenResultsLocation() {
|
||||
return chosenResultsLocation;
|
||||
}
|
||||
|
||||
public void setChosenResultsLocation(File chosenResultsLocation) {
|
||||
this.chosenResultsLocation = chosenResultsLocation;
|
||||
logger.info("Corpus.set: ", chosenResultsLocation);
|
||||
}
|
||||
|
||||
public File getChosenCorpusLocation() {
|
||||
return chosenCorpusLocation;
|
||||
}
|
||||
|
||||
public void setChosenCorpusLocation(File chosenCorpusLocation) {
|
||||
this.chosenCorpusLocation = chosenCorpusLocation;
|
||||
logger.info("Corpus.set: ", chosenCorpusLocation);
|
||||
}
|
||||
|
||||
public Collection<File> getDetectedCorpusFiles() {
|
||||
return detectedCorpusFiles;
|
||||
}
|
||||
|
||||
public void setDetectedCorpusFiles(Collection<File> detectedCorpusFiles) {
|
||||
this.detectedCorpusFiles = detectedCorpusFiles;
|
||||
logger.info("Corpus.set: ", detectedCorpusFiles);
|
||||
}
|
||||
|
||||
public boolean isHeaderRead() {
|
||||
return headerRead;
|
||||
}
|
||||
|
||||
public void setHeaderRead(boolean headerRead) {
|
||||
this.headerRead = headerRead;
|
||||
}
|
||||
|
||||
public ObservableList<String> getTaxonomy() {
|
||||
return taxonomy;
|
||||
}
|
||||
|
||||
public void setTaxonomy(ObservableList<String> taxonomy) {
|
||||
this.taxonomy = taxonomy;
|
||||
logger.info("Corpus.set: ", taxonomy);
|
||||
}
|
||||
|
||||
public HashMap<String, ObservableList<String>> getSolarFilters() {
|
||||
return solarFilters;
|
||||
}
|
||||
|
||||
public void setSolarFilters(HashMap<String, ObservableList<String>> solarFilters) {
|
||||
this.solarFilters = solarFilters;
|
||||
logger.info("Corpus.set: ", solarFilters);
|
||||
}
|
||||
|
||||
public HashMap<String, HashSet<String>> getSolarFiltersForXML() {
|
||||
return solarFiltersForXML;
|
||||
}
|
||||
|
||||
public void setSolarFiltersForXML(HashMap<String, HashSet<String>> solarFiltersForXML) {
|
||||
this.solarFiltersForXML = solarFiltersForXML;
|
||||
logger.info("Corpus.set: ", solarFiltersForXML);
|
||||
}
|
||||
|
||||
public boolean isGosOrthMode() {
|
||||
return gosOrthMode;
|
||||
}
|
||||
|
||||
public void setGosOrthMode(boolean gosOrthMode) {
|
||||
this.gosOrthMode = gosOrthMode;
|
||||
logger.info("Corpus.set: ", gosOrthMode);
|
||||
}
|
||||
|
||||
public ArrayList<String> getValidationErrors() {
|
||||
return validationErrors;
|
||||
}
|
||||
|
||||
public String getValidationErrorsToString() {
|
||||
return StringUtils.join(validationErrors, "\n - ");
|
||||
}
|
||||
|
||||
public void setValidationErrors(ArrayList<String> validationErrors) {
|
||||
this.validationErrors = validationErrors;
|
||||
}
|
||||
|
||||
public boolean validate() {
|
||||
if (corpusType == null) {
|
||||
validationErrors.add(LABEL_RESULTS_CORPUS_TYPE_NOT_SET);
|
||||
}
|
||||
|
||||
if (chosenCorpusLocation == null) {
|
||||
validationErrors.add(LABEL_CORPUS_LOCATION_NOT_SET);
|
||||
}
|
||||
|
||||
if (chosenResultsLocation == null) {
|
||||
validationErrors.add(LABEL_RESULTS_LOCATION_NOT_SET);
|
||||
}
|
||||
|
||||
if (!headerRead && corpusType != null) {
|
||||
// if user didn't opt into reading the headers, set default taxonomy or solar filters
|
||||
if (Tax.getCorpusTypesWithTaxonomy().contains(corpusType)) {
|
||||
taxonomy = Tax.getTaxonomyForComboBox(corpusType);
|
||||
} else if (corpusType == CorpusType.SOLAR && solarFilters == null) {
|
||||
setSolarFilters(SolarFilters.getFiltersForComboBoxes());
|
||||
}
|
||||
}
|
||||
|
||||
if (headerRead && ValidationUtil.isEmpty(taxonomy)) {
|
||||
// mustn't happen, intercept at gui level
|
||||
}
|
||||
|
||||
if (!ValidationUtil.isEmpty(validationErrors)) {
|
||||
logger.error("Corpus validation error: ", StringUtils.join(validationErrors, "\n - "));
|
||||
return false;
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
25
src/main/java/data/CorpusType.java
Normal file
25
src/main/java/data/CorpusType.java
Normal file
@@ -0,0 +1,25 @@
|
||||
package data;
|
||||
|
||||
/**
 * The supported corpora; each constant carries a display name and a
 * lower-case identifier.
 */
public enum CorpusType {
    GIGAFIDA("Gigafida", "gigafida"),
    // NOTE(review): trailing space in "ccKres " reproduced from the original —
    // confirm whether it is intentional before removing it.
    CCKRES("ccKres ", "cckres"),
    SOLAR("Šolar", "šolar"),
    GOS("GOS", "gos");

    // Display name shown in the GUI.
    private final String name;
    // Lower-case identifier (used for matching; see getNameLowerCase callers).
    private final String nameLowerCase;

    CorpusType(String name, String nameLowerCase) {
        this.name = name;
        this.nameLowerCase = nameLowerCase;
    }

    /** Returns the display name. */
    @Override
    public String toString() {
        return this.name;
    }

    /** Returns the lower-case identifier. */
    public String getNameLowerCase() {
        return nameLowerCase;
    }
}
|
||||
12
src/main/java/data/Enums/InflectedJosTypes.java
Normal file
12
src/main/java/data/Enums/InflectedJosTypes.java
Normal file
@@ -0,0 +1,12 @@
|
||||
package data.Enums;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
|
||||
/**
 * JOS word-class codes that inflect: S, G, P.
 */
public class InflectedJosTypes {
    // Initialized inline via the collection constructor instead of a static block.
    public static final HashSet<Character> inflectedJosTypes =
            new HashSet<>(Arrays.asList('S', 'G', 'P'));
}
|
||||
68
src/main/java/data/Enums/Msd.java
Normal file
68
src/main/java/data/Enums/Msd.java
Normal file
@@ -0,0 +1,68 @@
|
||||
package data.Enums;
|
||||
|
||||
import java.util.HashMap;
|
||||
|
||||
public enum Msd {
|
||||
NOUN("samostalnik", 'S', "Noun", 'N', 5),
|
||||
VERB("glagol", 'G', "Verb", 'V', 7),
|
||||
ADJECTIVE("pridevnik", 'P', "Adjective", 'A', 6),
|
||||
ADVERB("prislov", 'R', "Adverb", 'R', 2),
|
||||
PRONOUN("zaimek", 'Z', "Pronoun", 'P', 8),
|
||||
NUMERAL("števnik", 'K', "Numeral", 'M', 6),
|
||||
PREPOSITION("predlog", 'D', "Preposition", 'S', 1),
|
||||
CONJUNCTION("veznik", 'V', "Conjunction", 'C', 1),
|
||||
PARTICLE("členek", 'L', "Particle", 'Q', 0),
|
||||
INTERJECTION("medmet", 'M', "Interjection", 'I', 0),
|
||||
ABBREVIATION("okrajšava", 'O', "Abbreviation", 'Y', 0),
|
||||
RESIDUAL("neuvrščeno", 'N', "Residual", 'X', 1);
|
||||
|
||||
private final String siName;
|
||||
private final Character siCode;
|
||||
private final String enName;
|
||||
private final Character enCode;
|
||||
private final Integer nOfAttributes;
|
||||
|
||||
private static HashMap<Character, Integer> siCodeNOfAttributes;
|
||||
|
||||
static {
|
||||
siCodeNOfAttributes = new HashMap<>();
|
||||
for (Msd msd : Msd.values()) {
|
||||
siCodeNOfAttributes.put(msd.getSiCode(), msd.nOfAttributes);
|
||||
}
|
||||
}
|
||||
|
||||
Msd(String siName, Character siCode, String enName, Character enCode, int nOfAttributes) {
|
||||
this.siName = siName;
|
||||
this.siCode = siCode;
|
||||
this.enName = enName;
|
||||
this.enCode = enCode;
|
||||
this.nOfAttributes = nOfAttributes;
|
||||
}
|
||||
|
||||
public String getSiName() {
|
||||
return siName;
|
||||
}
|
||||
|
||||
public Character getSiCode() {
|
||||
return siCode;
|
||||
}
|
||||
|
||||
public String getEnName() {
|
||||
return enName;
|
||||
}
|
||||
|
||||
public Character getEnCode() {
|
||||
return enCode;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of attributes for the given type.
|
||||
*
|
||||
* @param msd
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public static int getMsdLengthForType(String msd) {
|
||||
return siCodeNOfAttributes.get(msd.charAt(0)) + 1;
|
||||
}
|
||||
}
|
||||
55
src/main/java/data/Enums/WordLevelDefaultValues.java
Normal file
55
src/main/java/data/Enums/WordLevelDefaultValues.java
Normal file
@@ -0,0 +1,55 @@
|
||||
package data.Enums;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
|
||||
public class WordLevelDefaultValues {
|
||||
public final static Logger logger = LogManager.getLogger(WordLevelDefaultValues.class);
|
||||
|
||||
private static HashSet<String> suffixes;
|
||||
private static final String SUFFIXES_FILE = "/Lists/suffixes.txt";
|
||||
public static final int MIN_N_OF_CHARACTERS_LEFT_SUFFIX = 2;
|
||||
|
||||
private static HashSet<String> prefixes;
|
||||
private static final String PREFIXES_FILE = "/Lists/prefixes.txt";
|
||||
public static final int MIN_N_OF_CHARACTERS_LEFT_PREFIX = 2;
|
||||
|
||||
static {
|
||||
suffixes = new HashSet<>();
|
||||
suffixes = readFromFile(SUFFIXES_FILE);
|
||||
prefixes = new HashSet<>();
|
||||
prefixes = readFromFile(PREFIXES_FILE);
|
||||
}
|
||||
|
||||
private static HashSet<String> readFromFile(String fileName) {
|
||||
Set<String> dictionary = new HashSet<>();
|
||||
|
||||
try (InputStream is = WordLevelDefaultValues.class.getClass().getResourceAsStream(fileName)) {
|
||||
if (is != null) {
|
||||
// TODO: warn if !exists
|
||||
BufferedReader reader = new BufferedReader(new InputStreamReader(is));
|
||||
dictionary = reader.lines().collect(Collectors.toSet());
|
||||
}
|
||||
} catch (IOException e) {
|
||||
logger.error("Problem reading init dictionary", e);
|
||||
}
|
||||
|
||||
return (HashSet<String>) dictionary;
|
||||
}
|
||||
|
||||
public static HashSet<String> getSuffixes() {
|
||||
return suffixes;
|
||||
}
|
||||
|
||||
public static HashSet<String> getPrefixes() {
|
||||
return prefixes;
|
||||
}
|
||||
}
|
||||
16
src/main/java/data/Enums/WordLevelType.java
Normal file
16
src/main/java/data/Enums/WordLevelType.java
Normal file
@@ -0,0 +1,16 @@
|
||||
package data.Enums;
|
||||
|
||||
/**
 * Kinds of sub-word units, each with its Slovene label.
 */
public enum WordLevelType {
    SUFFIX("pripona"),
    PREFIX("predpona");

    // Slovene label for this unit kind.
    private final String name;

    WordLevelType(String label) {
        this.name = label;
    }

    /** Returns the Slovene label. */
    public String getName() {
        return name;
    }
}
|
||||
57
src/main/java/data/Enums/solar/SolarFilters.java
Normal file
57
src/main/java/data/Enums/solar/SolarFilters.java
Normal file
@@ -0,0 +1,57 @@
|
||||
package data.Enums.solar;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
|
||||
import javafx.collections.FXCollections;
|
||||
import javafx.collections.ObservableList;
|
||||
|
||||
public class SolarFilters {
|
||||
private static HashMap<String, ObservableList<String>> SOLAR_FILTERS;
|
||||
public static final String SOLA = "sola";
|
||||
public static final String PREDMET = "predmet";
|
||||
public static final String RAZRED = "razred";
|
||||
public static final String REGIJA = "regija";
|
||||
public static final String TIP = "tip";
|
||||
public static final String LETO = "leto";
|
||||
|
||||
static {
|
||||
SOLAR_FILTERS = new HashMap<>();
|
||||
|
||||
SOLAR_FILTERS.put(REGIJA, FXCollections.observableArrayList("Celje", "Gorica", "Koper", "Kranj", "Krško", "Ljubljana", "Maribor", "Murska Sobota", "Novo mesto", "Postojna", "Slovenj Gradec"));
|
||||
SOLAR_FILTERS.put(PREDMET, FXCollections.observableArrayList("državljanska vzgoja in etika", "ekonomija", "filozofija", "geografija", "kemija", "podjetništvo", "psihologija", "slovenščina", "sociologija", "umetnostna vzgoja", "zgodovina"));
|
||||
SOLAR_FILTERS.put(RAZRED, FXCollections.observableArrayList("6. razred", "7. razred", "8. razred", "9. razred", "1. letnik", "2. letnik", "3. letnik", "4. letnik", "5. letnik", "maturitetni tečaj"));
|
||||
SOLAR_FILTERS.put(LETO, FXCollections.observableArrayList("2007", "2008", "2009", "2009/2010", "2010"));
|
||||
SOLAR_FILTERS.put(SOLA, FXCollections.observableArrayList("gimnazija", "osnovna šola", "poklicna šola", "strokovna šola"));
|
||||
SOLAR_FILTERS.put(TIP, FXCollections.observableArrayList("esej/spis", "pisni izdelek (učna ura)", "test (daljše besedilo)", "test (odgovori na vprašanja)"));
|
||||
}
|
||||
|
||||
public static final ObservableList<String> N_GRAM_COMPUTE_FOR_FULL = FXCollections.observableArrayList("različnica", "lema", "oblikoskladenjska oznaka", "oblikoskladenjska lastnost", "besedna vrsta");
|
||||
public static final ObservableList<String> N_GRAM_COMPUTE_FOR_LIMITED = FXCollections.observableArrayList("različnica", "lema");
|
||||
|
||||
/**
|
||||
* Returns filters with all possible values
|
||||
*/
|
||||
public static HashMap<String, ObservableList<String>> getFiltersForComboBoxes() {
|
||||
return SOLAR_FILTERS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns filters with all possible values
|
||||
*/
|
||||
public static HashMap<String, ObservableList<String>> getFiltersForComboBoxes(HashMap<String, HashSet<String>> foundFilters) {
|
||||
HashMap<String, ObservableList<String>> filtersForComboBoxes = new HashMap<>();
|
||||
|
||||
for (Map.Entry<String, ObservableList<String>> e : SOLAR_FILTERS.entrySet()) {
|
||||
if (!foundFilters.containsKey(e.getKey())) {
|
||||
// if, by some reason a specific filter wasn't in the corpus, return a blank list for that filter
|
||||
filtersForComboBoxes.put(e.getKey(), FXCollections.observableArrayList());
|
||||
} else {
|
||||
filtersForComboBoxes.put(e.getKey(), FXCollections.observableArrayList(foundFilters.get(e.getKey())).sorted());
|
||||
}
|
||||
}
|
||||
|
||||
return filtersForComboBoxes;
|
||||
}
|
||||
}
|
||||
144
src/main/java/data/Filter.java
Normal file
144
src/main/java/data/Filter.java
Normal file
@@ -0,0 +1,144 @@
|
||||
package data;
|
||||
|
||||
import static data.Filter.filterName.*;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import gui.ValidationUtil;
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public class Filter {
|
||||
private HashMap<filterName, Object> filter;
|
||||
|
||||
public enum filterName {
|
||||
ANALYSIS_LEVEL,
|
||||
CALCULATE_FOR,
|
||||
NGRAM_VALUE,
|
||||
SKIP_VALUE,
|
||||
IS_CVV,
|
||||
STRING_LENGTH,
|
||||
TAXONOMY,
|
||||
MSD,
|
||||
HAS_MSD,
|
||||
SOLAR_FILTERS
|
||||
}
|
||||
|
||||
public Filter() {
|
||||
filter = new HashMap<>();
|
||||
}
|
||||
|
||||
public Filter(AnalysisLevel al, CalculateFor cf) {
|
||||
filter = new HashMap<>();
|
||||
|
||||
filter.put(ANALYSIS_LEVEL, al);
|
||||
filter.put(CALCULATE_FOR, cf);
|
||||
}
|
||||
|
||||
public void setAl(AnalysisLevel al) {
|
||||
filter.put(ANALYSIS_LEVEL, al);
|
||||
}
|
||||
|
||||
public AnalysisLevel getAl() {
|
||||
return (AnalysisLevel) filter.get(ANALYSIS_LEVEL);
|
||||
}
|
||||
|
||||
public void setCalculateFor(CalculateFor cf) {
|
||||
filter.put(CALCULATE_FOR, cf);
|
||||
}
|
||||
|
||||
public CalculateFor getCalculateFor() {
|
||||
return (CalculateFor) filter.get(CALCULATE_FOR);
|
||||
}
|
||||
|
||||
public void setNgramValue(Integer ngramValue) {
|
||||
filter.put(NGRAM_VALUE, ngramValue);
|
||||
}
|
||||
|
||||
public Integer getNgramValue() {
|
||||
return (Integer) filter.get(NGRAM_VALUE);
|
||||
}
|
||||
|
||||
public void setSkipValue(Integer skipValue) {
|
||||
filter.put(SKIP_VALUE, skipValue);
|
||||
}
|
||||
|
||||
public Integer getSkipValue() {
|
||||
return (Integer) filter.get(SKIP_VALUE);
|
||||
}
|
||||
|
||||
public void setIsCvv(boolean isCvv) {
|
||||
filter.put(IS_CVV, isCvv);
|
||||
}
|
||||
|
||||
public boolean isCvv() {
|
||||
return filter.containsKey(IS_CVV) && (boolean) filter.get(IS_CVV);
|
||||
}
|
||||
|
||||
public void setStringLength(int stringLength) {
|
||||
filter.put(STRING_LENGTH, stringLength);
|
||||
}
|
||||
|
||||
public Integer getStringLength() {
|
||||
return (Integer) filter.get(STRING_LENGTH);
|
||||
}
|
||||
|
||||
public void setTaxonomy(ArrayList<String> taxonomy) {
|
||||
filter.put(TAXONOMY, taxonomy);
|
||||
}
|
||||
|
||||
public ArrayList<String> getTaxonomy() {
|
||||
if (filter.containsKey(TAXONOMY) && filter.get(TAXONOMY) != null) {
|
||||
return (ArrayList<String>) filter.get(TAXONOMY);
|
||||
} else {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
}
|
||||
|
||||
public void setMsd(ArrayList<Pattern> msd) {
|
||||
filter.put(MSD, msd);
|
||||
if (!ValidationUtil.isEmpty(msd)) {
|
||||
setHasMsd(true);
|
||||
} else {
|
||||
setHasMsd(false);
|
||||
}
|
||||
}
|
||||
|
||||
public ArrayList<Pattern> getMsd() {
|
||||
return (ArrayList<Pattern>) filter.get(MSD);
|
||||
}
|
||||
|
||||
public void setHasMsd(boolean hasMsd) {
|
||||
filter.put(HAS_MSD, hasMsd);
|
||||
}
|
||||
|
||||
public boolean hasMsd() {
|
||||
return filter.containsKey(HAS_MSD) && (boolean) filter.get(HAS_MSD);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
String newLine = "\n\t- ";
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
sb.append(newLine).append("Filter:");
|
||||
for (Map.Entry<filterName, Object> entry : filter.entrySet()) {
|
||||
sb.append(newLine)
|
||||
.append(entry.getKey().toString())
|
||||
.append(": ")
|
||||
.append(entry.getValue() != null ? entry.getValue().toString() : "null");
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public void setSolarFilters(HashMap<String, HashSet<String>> filters) {
|
||||
filter.put(SOLAR_FILTERS, filters);
|
||||
}
|
||||
|
||||
public HashMap<String, HashSet<String>> getSolarFilters() {
|
||||
return (HashMap<String, HashSet<String>>) filter.get(SOLAR_FILTERS);
|
||||
}
|
||||
}
|
||||
71
src/main/java/data/GigafidaJosWordType.java
Normal file
71
src/main/java/data/GigafidaJosWordType.java
Normal file
@@ -0,0 +1,71 @@
|
||||
package data;
|
||||
|
||||
/**
 * JOS word classes of the Gigafida corpus: a Slovene (ASCII) name and the
 * single-character class code.
 */
public enum GigafidaJosWordType {
    SAMOSTALNIK("samostalnik", 'S'),
    GLAGOL("glagol", 'G'),
    PRIDEVNIK("pridevnik", 'P'),
    PRISLOV("prislov", 'R'),
    ZAIMEK("zaimek", 'Z'),
    STEVNIK("stevnik", 'K'),
    PREDLOG("predlog", 'D'),
    VEZNIK("veznik", 'V'),
    CLENEK("clenek", 'L'),
    MEDMET("medmet", 'M'),
    OKRAJSAVA("okrajsava", 'O');

    private final String name;
    private final char wordType;

    GigafidaJosWordType(String name, char wordType) {
        this.name = name;
        this.wordType = wordType;
    }

    /** Returns the Slovene name. */
    @Override
    public String toString() {
        return this.name;
    }

    /** Returns the single-character JOS class code. */
    public char getWordType() {
        return wordType;
    }

    /**
     * Resolves a name back to its constant; replaces the original
     * eleven-branch if-chain (names are unique, so behavior is unchanged).
     *
     * @param wType a name as produced by {@link #toString()}, may be null
     * @return the matching constant, or null if unknown/null
     */
    public static GigafidaJosWordType factory(String wType) {
        if (wType != null) {
            for (GigafidaJosWordType candidate : values()) {
                if (candidate.name.equals(wType)) {
                    return candidate;
                }
            }
        }
        return null;
    }
}
|
||||
76
src/main/java/data/GigafidaTaxonomy.java
Normal file
76
src/main/java/data/GigafidaTaxonomy.java
Normal file
@@ -0,0 +1,76 @@
|
||||
package data;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import javafx.collections.FXCollections;
|
||||
import javafx.collections.ObservableList;
|
||||
|
||||
public enum GigafidaTaxonomy {
|
||||
TISK("tisk", "T"),
|
||||
KNJIZNO("knjižno", "T.K"),
|
||||
LEPOSLOVNO("leposlovno", "T.K.L"),
|
||||
STROKOVNO("strokovno", "T.K.S"),
|
||||
PERIODICNO("periodično", "T.P"),
|
||||
CASOPIS("časopis", "T.P.C"),
|
||||
REVIJA("revija", "T.P.R"),
|
||||
INTERNET("internet", "I");
|
||||
|
||||
private final String name;
|
||||
private final String taxonomy;
|
||||
|
||||
private static final ObservableList<String> FOR_COMBO_BOX;
|
||||
|
||||
static {
|
||||
ArrayList<String> values = Arrays.stream(GigafidaTaxonomy.values()).map(x -> x.name).collect(Collectors.toCollection(ArrayList::new));
|
||||
FOR_COMBO_BOX = FXCollections.observableArrayList(values);
|
||||
}
|
||||
|
||||
GigafidaTaxonomy(String name, String taxonomy) {
|
||||
this.name = name;
|
||||
this.taxonomy = taxonomy;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return this.name;
|
||||
}
|
||||
|
||||
public String getTaxonomnyString() {
|
||||
return this.taxonomy;
|
||||
}
|
||||
|
||||
public static GigafidaTaxonomy factory(String tax) {
|
||||
if (tax != null) {
|
||||
if (TISK.toString().equals(tax)) {
|
||||
return TISK;
|
||||
}
|
||||
if (KNJIZNO.toString().equals(tax)) {
|
||||
return KNJIZNO;
|
||||
}
|
||||
if (LEPOSLOVNO.toString().equals(tax)) {
|
||||
return LEPOSLOVNO;
|
||||
}
|
||||
if (STROKOVNO.toString().equals(tax)) {
|
||||
return STROKOVNO;
|
||||
}
|
||||
if (PERIODICNO.toString().equals(tax)) {
|
||||
return PERIODICNO;
|
||||
}
|
||||
if (CASOPIS.toString().equals(tax)) {
|
||||
return CASOPIS;
|
||||
}
|
||||
if (REVIJA.toString().equals(tax)) {
|
||||
return REVIJA;
|
||||
}
|
||||
if (INTERNET.toString().equals(tax)) {
|
||||
return INTERNET;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public static ObservableList<String> getForComboBox() {
|
||||
return FOR_COMBO_BOX;
|
||||
}
|
||||
}
|
||||
85
src/main/java/data/GosTaxonomy.java
Normal file
85
src/main/java/data/GosTaxonomy.java
Normal file
@@ -0,0 +1,85 @@
|
||||
package data;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import javafx.collections.FXCollections;
|
||||
import javafx.collections.ObservableList;
|
||||
|
||||
public enum GosTaxonomy {
|
||||
JAVNI("javni", "gos.T.J"),
|
||||
INFORMATIVNO_IZOBRAZEVALNI("informativno-izobraževalni", "gos.T.J.I"),
|
||||
RAZVEDRILNI("razvedrilni", "gos.T.J.R"),
|
||||
NEJAVNI("nejavni", "gos.T.N"),
|
||||
NEZASEBNI("nezasebni", "gos.T.N.N"),
|
||||
ZASEBNI("zasebni", "gos.T.N.Z"),
|
||||
OSEBNI_STIK("osebni stik", "gos.K.O"),
|
||||
TELEFON("telefon", "gos.K.P"),
|
||||
RADIO("radio", "gos.K.R"),
|
||||
TELEVIZIJA("televizija", "gos.K.T");
|
||||
|
||||
|
||||
private final String name;
|
||||
private final String taxonomy;
|
||||
|
||||
private static final ObservableList<String> FOR_COMBO_BOX;
|
||||
|
||||
static {
|
||||
ArrayList<String> values = Arrays.stream(GosTaxonomy.values()).map(x -> x.name).collect(Collectors.toCollection(ArrayList::new));
|
||||
FOR_COMBO_BOX = FXCollections.observableArrayList(values);
|
||||
}
|
||||
|
||||
GosTaxonomy(String name, String taxonomy) {
|
||||
this.name = name;
|
||||
this.taxonomy = taxonomy;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return this.name;
|
||||
}
|
||||
|
||||
public String getTaxonomnyString() {
|
||||
return this.taxonomy;
|
||||
}
|
||||
|
||||
public static GosTaxonomy factory(String tax) {
|
||||
if (tax != null) {
|
||||
if (JAVNI.toString().equals(tax)) {
|
||||
return JAVNI;
|
||||
}
|
||||
if (INFORMATIVNO_IZOBRAZEVALNI.toString().equals(tax)) {
|
||||
return INFORMATIVNO_IZOBRAZEVALNI;
|
||||
}
|
||||
if (RAZVEDRILNI.toString().equals(tax)) {
|
||||
return RAZVEDRILNI;
|
||||
}
|
||||
if (NEJAVNI.toString().equals(tax)) {
|
||||
return NEJAVNI;
|
||||
}
|
||||
if (NEZASEBNI.toString().equals(tax)) {
|
||||
return NEZASEBNI;
|
||||
}
|
||||
if (ZASEBNI.toString().equals(tax)) {
|
||||
return ZASEBNI;
|
||||
}
|
||||
if (OSEBNI_STIK.toString().equals(tax)) {
|
||||
return OSEBNI_STIK;
|
||||
}
|
||||
if (TELEFON.toString().equals(tax)) {
|
||||
return TELEFON;
|
||||
}
|
||||
if (RADIO.toString().equals(tax)) {
|
||||
return RADIO;
|
||||
}
|
||||
if (TELEVIZIJA.toString().equals(tax)) {
|
||||
return TELEVIZIJA;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public static ObservableList<String> getForComboBox() {
|
||||
return FOR_COMBO_BOX;
|
||||
}
|
||||
}
|
||||
56
src/main/java/data/Sentence.java
Normal file
56
src/main/java/data/Sentence.java
Normal file
@@ -0,0 +1,56 @@
|
||||
package data;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public class Sentence {
|
||||
|
||||
|
||||
private List<Word> words;
|
||||
private String taksonomija;
|
||||
|
||||
// GOS
|
||||
private String type;
|
||||
private Map<String, String> properties;
|
||||
|
||||
public Sentence(List<Word> words, String taksonomija) {
|
||||
this.words = words;
|
||||
this.taksonomija = taksonomija;
|
||||
}
|
||||
|
||||
public Sentence(List<Word> words) {
|
||||
this.words = words;
|
||||
}
|
||||
|
||||
public Sentence(List<Word> words, String taksonomija, Map<String, String> properties) {
|
||||
this.words = words;
|
||||
this.taksonomija = taksonomija;
|
||||
this.properties = properties;
|
||||
}
|
||||
|
||||
public Sentence(List<Word> words, String taksonomija, String type) {
|
||||
this.words = words;
|
||||
this.taksonomija = taksonomija;
|
||||
this.type = type;
|
||||
}
|
||||
|
||||
public List<Word> getWords() {
|
||||
return words;
|
||||
}
|
||||
|
||||
public String getTaxonomy() {
|
||||
return taksonomija;
|
||||
}
|
||||
|
||||
public List<Word> getSublist(int indexFrom, int indexTo) {
|
||||
return this.words.subList(indexFrom, indexTo);
|
||||
}
|
||||
|
||||
public String getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
public void setType(String type) {
|
||||
this.type = type;
|
||||
}
|
||||
}
|
||||
16
src/main/java/data/Settings.java
Normal file
16
src/main/java/data/Settings.java
Normal file
@@ -0,0 +1,16 @@
|
||||
package data;
|
||||
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Collection;
|
||||
|
||||
/**
 * Statically shared application settings and limits.
 */
public class Settings {
    // Presumably a cap on sentences processed per run — confirm at usage sites.
    public static final int CORPUS_SENTENCE_LIMIT = 50000;
    // Presumably toggles extra log output — confirm at usage sites.
    public static final boolean PRINT_LOG = false;

    // JavaFX inline styles (accent colors for ok/not-ok states).
    public static final String FX_ACCENT_OK = "-fx-accent: forestgreen;";
    public static final String FX_ACCENT_NOK = "-fx-accent: red;";

    // NOTE(review): mutable global state, assigned elsewhere — confirm
    // single-threaded access before relying on it.
    public static Collection<File> corpus;
    public static File resultsFilePath;
}
|
||||
299
src/main/java/data/Statistics.java
Normal file
299
src/main/java/data/Statistics.java
Normal file
@@ -0,0 +1,299 @@
|
||||
package data;
|
||||
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.time.LocalDateTime;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import util.Util;
|
||||
import util.db.RDB;
|
||||
|
||||
public class Statistics {
|
||||
private CorpusType corpusType;
|
||||
private AnalysisLevel analysisLevel;
|
||||
private boolean useDB;
|
||||
private RDB db;
|
||||
|
||||
private boolean analysisProducedResults;
|
||||
|
||||
private String taxonomy;
|
||||
private boolean taxonomyIsSet;
|
||||
|
||||
private char JOSType;
|
||||
private boolean JOSTypeIsSet;
|
||||
|
||||
private String resultTitle;
|
||||
public Map<String, AtomicLong> result = new ConcurrentHashMap<>();
|
||||
|
||||
// nGrams
|
||||
private int nGramLevel;
|
||||
private Integer skip;
|
||||
private CalculateFor cf;
|
||||
private List<Pattern> morphosyntacticFilter;
|
||||
|
||||
// distributions
|
||||
private String distributionTaxonomy;
|
||||
private char distributionJosWordType;
|
||||
private boolean vcc;
|
||||
private Integer substringLength;
|
||||
|
||||
// inflected JOS
|
||||
private String inflectedJosTaxonomy;
|
||||
|
||||
// GOS
|
||||
boolean gosOrthMode;
|
||||
|
||||
// šolar
|
||||
Map<String, Object> solarHeadBlockFilter;
|
||||
|
||||
|
||||
// for ngrams
|
||||
public Statistics(AnalysisLevel al, int nGramLevel, Integer skip, CalculateFor cf) {
|
||||
String dateTime = LocalDateTime.now().format(DateTimeFormatter.ofPattern("dd.MM.yyyy_hh.mm"));
|
||||
this.cf = cf;
|
||||
this.analysisLevel = al;
|
||||
this.nGramLevel = nGramLevel;
|
||||
this.skip = skip == null || skip == 0 ? null : skip;
|
||||
|
||||
this.resultTitle = String.format("%s%d-gram_%s_%s",
|
||||
this.skip != null ? String.format("%d-%s-", skip, "skip") : "",
|
||||
nGramLevel,
|
||||
cf.toString(),
|
||||
dateTime);
|
||||
}
|
||||
|
||||
// for words distributions
|
||||
public Statistics(AnalysisLevel al, Taxonomy distributionTaxonomy, GigafidaJosWordType distributionJosWordType, CalculateFor cf) {
|
||||
String dateTime = LocalDateTime.now().format(DateTimeFormatter.ofPattern("dd.MM.yyyy_hh.mm"));
|
||||
|
||||
this.resultTitle = String.format("%s_%s_%s",
|
||||
distributionTaxonomy != null ? distributionTaxonomy.toString() : "",
|
||||
distributionJosWordType != null ? distributionJosWordType.toString() : "",
|
||||
dateTime);
|
||||
|
||||
this.analysisLevel = al;
|
||||
this.cf = cf;
|
||||
this.distributionTaxonomy = distributionTaxonomy != null ? distributionTaxonomy.getTaxonomnyString() : null;
|
||||
this.taxonomyIsSet = distributionTaxonomy != null;
|
||||
|
||||
this.JOSTypeIsSet = distributionJosWordType != null;
|
||||
this.distributionJosWordType = this.JOSTypeIsSet ? distributionJosWordType.getWordType() : ' ';
|
||||
}
|
||||
|
||||
public Statistics(AnalysisLevel al, CalculateFor cf, Integer substringLength) {
|
||||
String dateTime = LocalDateTime.now().format(DateTimeFormatter.ofPattern("dd.MM.yyyy_hh.mm"));
|
||||
|
||||
this.resultTitle = String.format("%s_%d_%s",
|
||||
"Distribucija zaporedij samoglasnikov in soglasnikov",
|
||||
substringLength,
|
||||
dateTime);
|
||||
|
||||
this.analysisLevel = al;
|
||||
this.cf = cf;
|
||||
this.substringLength = substringLength;
|
||||
this.vcc = true;
|
||||
}
|
||||
|
||||
public Statistics(AnalysisLevel al, Taxonomy inflectedJosTaxonomy) {
|
||||
String dateTime = LocalDateTime.now().format(DateTimeFormatter.ofPattern("dd.MM.yyyy_hh.mm"));
|
||||
|
||||
this.resultTitle = String.format("InflectedJOS_%s_%s",
|
||||
distributionTaxonomy != null ? distributionTaxonomy : "",
|
||||
dateTime);
|
||||
|
||||
this.analysisLevel = al;
|
||||
this.inflectedJosTaxonomy = inflectedJosTaxonomy != null ? inflectedJosTaxonomy.getTaxonomnyString() : null;
|
||||
this.taxonomyIsSet = inflectedJosTaxonomy != null;
|
||||
}
|
||||
|
||||
public Integer getSkip() {
|
||||
return skip;
|
||||
}
|
||||
|
||||
public Integer getSubstringLength() {
|
||||
return substringLength;
|
||||
}
|
||||
|
||||
public String getInflectedJosTaxonomy() {
|
||||
return inflectedJosTaxonomy;
|
||||
}
|
||||
|
||||
public void setSubstringLength(Integer substringLength) {
|
||||
this.substringLength = substringLength;
|
||||
}
|
||||
|
||||
public boolean isVcc() {
|
||||
return vcc;
|
||||
}
|
||||
|
||||
public void setVcc(boolean vcc) {
|
||||
this.vcc = vcc;
|
||||
}
|
||||
|
||||
public String getDistributionTaxonomy() {
|
||||
return distributionTaxonomy;
|
||||
}
|
||||
|
||||
public void setDistributionTaxonomy(String distributionTaxonomy) {
|
||||
this.distributionTaxonomy = distributionTaxonomy;
|
||||
}
|
||||
|
||||
public char getDistributionJosWordType() {
|
||||
return distributionJosWordType;
|
||||
}
|
||||
|
||||
public void setDistributionJosWordType(char distributionJosWordType) {
|
||||
this.distributionJosWordType = distributionJosWordType;
|
||||
}
|
||||
|
||||
public void setMorphosyntacticFilter(List<String> morphosyntacticFilter) {
|
||||
// change filter strings to regex patterns
|
||||
this.morphosyntacticFilter = new ArrayList<>();
|
||||
for (String s : morphosyntacticFilter) {
|
||||
this.morphosyntacticFilter.add(Pattern.compile(s.replaceAll("\\*", ".")));
|
||||
}
|
||||
}
|
||||
|
||||
public List<Pattern> getMsd() {
|
||||
return morphosyntacticFilter;
|
||||
}
|
||||
|
||||
public Map<String, AtomicLong> getResult() {
|
||||
return result;
|
||||
}
|
||||
|
||||
public void setTaxonomy(String taxonomy) {
|
||||
this.taxonomy = taxonomy;
|
||||
}
|
||||
|
||||
public void setTaxonomyIsSet(boolean taxonomyIsSet) {
|
||||
this.taxonomyIsSet = taxonomyIsSet;
|
||||
}
|
||||
|
||||
public char getJOSType() {
|
||||
return JOSType;
|
||||
}
|
||||
|
||||
public void setJOSType(char JOSType) {
|
||||
this.JOSType = JOSType;
|
||||
}
|
||||
|
||||
public boolean isJOSTypeSet() {
|
||||
return JOSTypeIsSet;
|
||||
}
|
||||
|
||||
public void setJOSType(boolean JOSTypeIsSet) {
|
||||
this.JOSTypeIsSet = JOSTypeIsSet;
|
||||
}
|
||||
|
||||
public void saveResultToDisk(int... limit) throws UnsupportedEncodingException {
|
||||
// Set<Pair<String, Map<String, Long>>> stats = new HashSet<>();
|
||||
//
|
||||
// if (useDB) {
|
||||
// result = db.getDump();
|
||||
// db.delete();
|
||||
// }
|
||||
//
|
||||
// // if no results and nothing to save, return false
|
||||
// if (!(result.size() > 0)) {
|
||||
// analysisProducedResults = false;
|
||||
// return;
|
||||
// } else {
|
||||
// analysisProducedResults = true;
|
||||
// }
|
||||
//
|
||||
// stats.add(ImmutablePair.of(resultTitle, getSortedResult(result, Util.getValidInt(limit))));
|
||||
// Export.SetToCSV(stats);
|
||||
}
|
||||
|
||||
// private Map<String, Integer> getSortedResultInflected(Map map) {
|
||||
// // first convert to <String, Integer>
|
||||
// Map<String, Integer> m = Util.sortByValue(Util.atomicInt2StringAndInt(map), 0);
|
||||
//
|
||||
// Map<String, Integer> sortedM = new TreeMap<>();
|
||||
//
|
||||
// sortedM.putAll(m);
|
||||
//
|
||||
// return sortedM;
|
||||
// }
|
||||
|
||||
private Map<String, Long> getSortedResult(Map<String, AtomicLong> map, int limit) {
|
||||
return Util.sortByValue(Util.atomicInt2StringAndInt(map), limit);
|
||||
}
|
||||
|
||||
public String getTaxonomy() {
|
||||
return taxonomy;
|
||||
}
|
||||
|
||||
public boolean isTaxonomySet() {
|
||||
return taxonomyIsSet;
|
||||
}
|
||||
|
||||
public int getnGramLevel() {
|
||||
return nGramLevel;
|
||||
}
|
||||
|
||||
public CalculateFor getCf() {
|
||||
return cf;
|
||||
}
|
||||
|
||||
public AnalysisLevel getAnalysisLevel() {
|
||||
return analysisLevel;
|
||||
}
|
||||
|
||||
public CorpusType getCorpusType() {
|
||||
return corpusType;
|
||||
}
|
||||
|
||||
public void setCorpusType(CorpusType corpusType) {
|
||||
this.corpusType = corpusType;
|
||||
}
|
||||
|
||||
public boolean isGosOrthMode() {
|
||||
return gosOrthMode;
|
||||
}
|
||||
|
||||
public void setGosOrthMode(boolean gosOrthMode) {
|
||||
this.gosOrthMode = gosOrthMode;
|
||||
}
|
||||
|
||||
public Map<String, Object> getSolarHeadBlockFilter() {
|
||||
return solarHeadBlockFilter;
|
||||
}
|
||||
|
||||
public void setSolarHeadBlockFilter(Map<String, Object> solarHeadBlockFilter) {
|
||||
this.solarHeadBlockFilter = solarHeadBlockFilter;
|
||||
}
|
||||
|
||||
public boolean isUseDB() {
|
||||
return useDB;
|
||||
}
|
||||
|
||||
public void setUseDB(boolean useDB) {
|
||||
if (useDB && db == null) {
|
||||
db = new RDB();
|
||||
}
|
||||
this.useDB = useDB;
|
||||
}
|
||||
|
||||
/**
|
||||
* Stores results from this batch to a database and clears results map
|
||||
*/
|
||||
public void storeTmpResultsToDB() {
|
||||
try {
|
||||
db.writeBatch(result);
|
||||
result = new ConcurrentHashMap<>();
|
||||
} catch (UnsupportedEncodingException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isAnalysisProducedResults() {
|
||||
return analysisProducedResults;
|
||||
}
|
||||
}
|
||||
409
src/main/java/data/StatisticsNew.java
Normal file
409
src/main/java/data/StatisticsNew.java
Normal file
@@ -0,0 +1,409 @@
|
||||
package data;
|
||||
|
||||
import static gui.ValidationUtil.*;
|
||||
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.time.LocalDateTime;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.lang3.tuple.ImmutablePair;
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
|
||||
import alg.inflectedJOS.WordFormation;
|
||||
import data.Enums.WordLevelType;
|
||||
import javafx.collections.ObservableList;
|
||||
import util.Export;
|
||||
import util.Util;
|
||||
import util.db.RDB;
|
||||
|
||||
@SuppressWarnings("Duplicates")
|
||||
public class StatisticsNew {
|
||||
public final static Logger logger = LogManager.getLogger(StatisticsNew.class);
|
||||
|
||||
private Corpus corpus;
|
||||
private Filter filter;
|
||||
|
||||
private String resultTitle;
|
||||
private Map<String, AtomicLong> result;
|
||||
private Object[][] resultCustom; // for when calculating percentages that don't add up to 100%
|
||||
private Map<String, ConcurrentHashMap<String, AtomicLong>> resultNestedSuffix;
|
||||
private Map<String, ConcurrentHashMap<String, AtomicLong>> resultNestedPrefix;
|
||||
private boolean useDB;
|
||||
private RDB db;
|
||||
private boolean analysisProducedResults;
|
||||
private LocalDateTime time;
|
||||
|
||||
public StatisticsNew(Corpus corpus, Filter filter, boolean useDB) {
|
||||
this.corpus = corpus;
|
||||
this.filter = filter;
|
||||
|
||||
if (useDB) {
|
||||
this.useDB = true;
|
||||
db = new RDB();
|
||||
}
|
||||
|
||||
if (filter.getAl() == AnalysisLevel.WORD_LEVEL) {
|
||||
resultNestedSuffix = new ConcurrentHashMap<>();
|
||||
resultNestedPrefix = new ConcurrentHashMap<>();
|
||||
} else {
|
||||
result = new ConcurrentHashMap<>();
|
||||
}
|
||||
|
||||
resultTitle = generateResultTitle();
|
||||
|
||||
logger.debug(toString());
|
||||
}
|
||||
|
||||
/**
|
||||
* Result's title consists of:
|
||||
* <ul>
|
||||
* <li>Corpus type</li>
|
||||
* <li>Analysis level</li>
|
||||
* <li>Calculate for</li>
|
||||
* <li></li>
|
||||
* <li></li>
|
||||
* <li></li>
|
||||
* <li></li>
|
||||
* </ul>
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
private String generateResultTitle() {
|
||||
String separator = "_";
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
|
||||
Integer ngramLevel = filter.getNgramValue();
|
||||
if(ngramLevel == 0) {
|
||||
sb.append("Crke").
|
||||
append(separator)
|
||||
.append(corpus.getCorpusType().toString())
|
||||
.append(separator);
|
||||
} else if(ngramLevel == 1) {
|
||||
sb.append("Besede").append(separator)
|
||||
.append(corpus.getCorpusType().toString())
|
||||
.append(separator);
|
||||
}
|
||||
else {
|
||||
sb.append(filter.getAl().toString())
|
||||
.append(separator)
|
||||
.append(corpus.getCorpusType().toString())
|
||||
.append(separator);
|
||||
sb.append(filter.getCalculateFor().toString())
|
||||
.append(separator);
|
||||
// ngram value
|
||||
sb.append(filter.getNgramValue()).append("-gram")
|
||||
.append(separator);
|
||||
sb.append(filter.getSkipValue()).append("-preskok")
|
||||
.append(separator);
|
||||
}
|
||||
// TODO: assure skip is not null but zero
|
||||
|
||||
} else {
|
||||
sb.append(filter.getAl().toString()) // analysis level
|
||||
.append(separator)
|
||||
.append(corpus.getCorpusType().toString())
|
||||
.append(separator);
|
||||
}
|
||||
// skip value
|
||||
// msd ?
|
||||
// if taxonomy -> taxonomy
|
||||
// if cvv -> cvv + dolžina
|
||||
|
||||
this.time = this.time != null ? this.time : LocalDateTime.now();
|
||||
|
||||
sb.append(time.format(DateTimeFormatter.ofPattern("dd.MM.yyyy_hh.mm.ss")));
|
||||
return sb.toString();
|
||||
|
||||
}
|
||||
|
||||
public boolean isAnalysisProducedResults() {
|
||||
return analysisProducedResults;
|
||||
}
|
||||
|
||||
public void setAnalysisProducedResults(boolean analysisProducedResults) {
|
||||
this.analysisProducedResults = analysisProducedResults;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
String newLine = "\n\t- ";
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append(newLine).append("Statistic properties:");
|
||||
sb.append(newLine).append(corpus.getCorpusType().toString()).append(String.format(" (%d files)", corpus.getDetectedCorpusFiles().size()));
|
||||
sb.append(newLine).append(useDB ? "use DB" : "run in memory");
|
||||
sb.append(newLine).append(filter.toString());
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public String getResultTitle() {
|
||||
return resultTitle;
|
||||
}
|
||||
|
||||
// ****************************************
|
||||
// ***************** util *****************
|
||||
// ****************************************
|
||||
|
||||
/**
|
||||
* Stores results from this batch to a database and clears results map
|
||||
*/
|
||||
public void storeTmpResultsToDB() {
|
||||
try {
|
||||
db.writeBatch(result);
|
||||
result = new ConcurrentHashMap<>();
|
||||
} catch (UnsupportedEncodingException e) {
|
||||
logger.error("Store tmp results to DB", e);
|
||||
// e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
public Filter getFilter() {
|
||||
return filter;
|
||||
}
|
||||
|
||||
public Corpus getCorpus() {
|
||||
return corpus;
|
||||
}
|
||||
|
||||
public boolean saveResultToDisk(int... limit) throws UnsupportedEncodingException {
|
||||
Set<Pair<String, Map<String, Long>>> stats = new HashSet<>();
|
||||
|
||||
if (useDB) {
|
||||
result = db.getDump();
|
||||
db.delete();
|
||||
}
|
||||
|
||||
// if no results and nothing to save, return false
|
||||
if (!(result.size() > 0)) {
|
||||
analysisProducedResults = false;
|
||||
return false;
|
||||
} else {
|
||||
analysisProducedResults = true;
|
||||
}
|
||||
|
||||
stats.add(ImmutablePair.of(resultTitle, getSortedResult(result, Util.getValidInt(limit))));
|
||||
Export.SetToCSV(stats, corpus.getChosenResultsLocation(), headerInfoBlock());
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean saveResultNestedToDisk(int... limit) throws UnsupportedEncodingException {
|
||||
resultTitle = generateResultTitle();
|
||||
|
||||
if (useDB) {
|
||||
result = db.getDump();
|
||||
db.delete();
|
||||
}
|
||||
Map<WordLevelType, Map<String, Map<String, Long>>> results = new HashMap<>();
|
||||
|
||||
if (!isEmpty(resultNestedSuffix)) {
|
||||
results.put(WordLevelType.SUFFIX, sortNestedMap(resultNestedSuffix, Util.getValidInt(limit)));
|
||||
}
|
||||
|
||||
if (!isEmpty(resultNestedPrefix)) {
|
||||
results.put(WordLevelType.PREFIX, sortNestedMap(resultNestedPrefix, Util.getValidInt(limit)));
|
||||
}
|
||||
|
||||
// if no results and nothing to save, return false
|
||||
if (!(results.size() > 0)) {
|
||||
analysisProducedResults = false;
|
||||
return false;
|
||||
} else {
|
||||
analysisProducedResults = true;
|
||||
}
|
||||
|
||||
Export.nestedMapToCSV(resultTitle, results, corpus.getChosenResultsLocation(), headerInfoBlock());
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean recalculateAndSaveResultToDisk() throws UnsupportedEncodingException {
|
||||
filter.setAl(AnalysisLevel.WORD_FORMATION);
|
||||
resultTitle = generateResultTitle();
|
||||
|
||||
if (useDB) {
|
||||
result = db.getDump();
|
||||
db.delete();
|
||||
}
|
||||
|
||||
// if no results and nothing to save, return false
|
||||
if (!(result.size() > 0)) {
|
||||
analysisProducedResults = false;
|
||||
return false;
|
||||
} else {
|
||||
analysisProducedResults = true;
|
||||
}
|
||||
|
||||
WordFormation.calculateStatistics(this);
|
||||
|
||||
Export.SetToCSV(resultTitle, resultCustom, corpus.getChosenResultsLocation(), headerInfoBlock());
|
||||
return true;
|
||||
}
|
||||
|
||||
private Map<String, Map<String, Long>> sortNestedMap(Map<String, ConcurrentHashMap<String, AtomicLong>> nestedMap, int limit) {
|
||||
Map<String, Map<String, Long>> sorted = new HashMap<>();
|
||||
|
||||
for (String s : nestedMap.keySet()) {
|
||||
sorted.put(s, getSortedResult(nestedMap.get(s), Util.getValidInt(limit)));
|
||||
}
|
||||
|
||||
return sorted;
|
||||
}
|
||||
|
||||
|
||||
private Map<String, Long> getSortedResult(Map<String, AtomicLong> map, int limit) {
|
||||
return Util.sortByValue(Util.atomicInt2StringAndInt(map), limit);
|
||||
}
|
||||
|
||||
public void updateResults(String o) {
|
||||
// if not in map
|
||||
AtomicLong r = result.putIfAbsent(o, new AtomicLong(1));
|
||||
|
||||
// else
|
||||
if (r != null)
|
||||
result.get(o).incrementAndGet();
|
||||
}
|
||||
|
||||
public Map<String, AtomicLong> getResult() {
|
||||
return result;
|
||||
}
|
||||
|
||||
public Object[][] getResultCustom() {
|
||||
return resultCustom;
|
||||
}
|
||||
|
||||
public void setResultCustom(Object[][] resultCustom) {
|
||||
this.resultCustom = resultCustom;
|
||||
}
|
||||
|
||||
public void updateResultsNested(WordLevelType type, String key, String stringValue) {
|
||||
ConcurrentHashMap<String, ConcurrentHashMap<String, AtomicLong>> resultsMap;
|
||||
|
||||
if (type == WordLevelType.SUFFIX) {
|
||||
updateResultsNestedSuffix(key, stringValue);
|
||||
} else if (type == WordLevelType.PREFIX) {
|
||||
updateResultsNestedPrefix(key, stringValue);
|
||||
}
|
||||
}
|
||||
|
||||
public void updateResultsNestedSuffix(String key, String stringValue) {
|
||||
if (resultNestedSuffix.containsKey(key)) {
|
||||
// if not in map
|
||||
AtomicLong r = resultNestedSuffix.get(key).putIfAbsent(stringValue, new AtomicLong(1));
|
||||
|
||||
// else
|
||||
if (r != null) {
|
||||
resultNestedSuffix.get(key).get(stringValue).incrementAndGet();
|
||||
}
|
||||
} else {
|
||||
resultNestedSuffix.putIfAbsent(key, new ConcurrentHashMap<>());
|
||||
AtomicLong r = resultNestedSuffix.get(key).putIfAbsent(stringValue, new AtomicLong(1));
|
||||
|
||||
if (r != null) {
|
||||
resultNestedSuffix.get(key).get(stringValue).incrementAndGet();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void updateResultsNestedPrefix(String key, String stringValue) {
|
||||
if (resultNestedPrefix.containsKey(key)) {
|
||||
// if not in map
|
||||
AtomicLong r = resultNestedPrefix.get(key).putIfAbsent(stringValue, new AtomicLong(1));
|
||||
|
||||
// else
|
||||
if (r != null) {
|
||||
resultNestedPrefix.get(key).get(stringValue).incrementAndGet();
|
||||
}
|
||||
} else {
|
||||
resultNestedPrefix.putIfAbsent(key, new ConcurrentHashMap<>());
|
||||
AtomicLong r = resultNestedPrefix.get(key).putIfAbsent(stringValue, new AtomicLong(1));
|
||||
|
||||
if (r != null) {
|
||||
resultNestedPrefix.get(key).get(stringValue).incrementAndGet();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private LinkedHashMap<String, String> headerInfoBlock() {
|
||||
LinkedHashMap<String, String> info = new LinkedHashMap<>();
|
||||
|
||||
info.put("Korpus:", corpus.getCorpusType().toString());
|
||||
info.put("Datum:", time.format(DateTimeFormatter.ofPattern("dd.MM.yyyy hh:mm")));
|
||||
if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
|
||||
Integer ngramLevel = filter.getNgramValue();
|
||||
if (ngramLevel == 0)
|
||||
info.put("Analiza:", "Črke");
|
||||
else if (ngramLevel == 1)
|
||||
info.put("Analiza", "Besede");
|
||||
else
|
||||
info.put("Analiza:", filter.getAl().toString());
|
||||
} else {
|
||||
info.put("Analiza:", filter.getAl().toString());
|
||||
}
|
||||
|
||||
if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
|
||||
Integer ngramLevel = filter.getNgramValue();
|
||||
|
||||
// n.gram nivo
|
||||
if (ngramLevel > 1) {
|
||||
info.put("n-gram nivo:", String.valueOf(ngramLevel));
|
||||
} else if (ngramLevel == 1){
|
||||
info.put("n-gram nivo:", "nivo besed");
|
||||
} else {
|
||||
info.put("n-gram nivo:", "nivo črk");
|
||||
}
|
||||
// skip
|
||||
if (ngramLevel > 1)
|
||||
info.put("Skip:", isNotEmpty(filter.getSkipValue()) ? filter.getSkipValue().toString() : "0");
|
||||
|
||||
// izračunaj za
|
||||
info.put("Izračunaj za:", filter.getCalculateFor().toString());
|
||||
|
||||
// msd
|
||||
if (!isEmpty(filter.getMsd())) {
|
||||
StringBuilder msdPattern = new StringBuilder();
|
||||
for (Pattern pattern : filter.getMsd()) {
|
||||
msdPattern.append(pattern.toString()).append(" ");
|
||||
}
|
||||
|
||||
info.put("MSD:", msdPattern.toString());
|
||||
}
|
||||
|
||||
// taksonomija
|
||||
if (!isEmpty(filter.getTaxonomy())) {
|
||||
info.put("Taksonomija:", StringUtils.join(filter.getTaxonomy(), ", "));
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
if (isNotEmpty(filter.getTaxonomy()) && Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
|
||||
ArrayList<String> tax = Tax.getTaxonomyForInfo(corpus.getCorpusType(), filter.getTaxonomy());
|
||||
|
||||
info.put("Taksonomija: ", "");
|
||||
String sep = "";
|
||||
for (String s : tax) {
|
||||
info.put(sep = sep + " ", s);
|
||||
}
|
||||
}
|
||||
|
||||
if (corpus.getCorpusType() == CorpusType.SOLAR) {
|
||||
HashMap<String, ObservableList<String>> filters = corpus.getSolarFilters();
|
||||
|
||||
if (!isEmpty(filters)) {
|
||||
info.put("Dodatni filtri: ", "");
|
||||
|
||||
for (Map.Entry<String, ObservableList<String>> f : filters.entrySet()) {
|
||||
info.put(f.getKey(), StringUtils.join(f.getValue(), ", "));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return info;
|
||||
}
|
||||
}
|
||||
175
src/main/java/data/Tax.java
Normal file
175
src/main/java/data/Tax.java
Normal file
@@ -0,0 +1,175 @@
|
||||
package data;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import gui.ValidationUtil;
|
||||
import javafx.collections.FXCollections;
|
||||
import javafx.collections.ObservableList;
|
||||
|
||||
public class Tax {
|
||||
private static LinkedHashMap<String, String> GIGAFIDA_TAXONOMY;
|
||||
private static LinkedHashMap<String, String> GOS_TAXONOMY;
|
||||
private static final HashSet<CorpusType> corpusTypesWithTaxonomy = new HashSet<>(Arrays.asList(CorpusType.GIGAFIDA, CorpusType.GOS, CorpusType.CCKRES));
|
||||
|
||||
static {
|
||||
// GIGAFIDA ----------------------------
|
||||
GIGAFIDA_TAXONOMY = new LinkedHashMap<>();
|
||||
|
||||
GIGAFIDA_TAXONOMY.put("SSJ.T", "tisk");
|
||||
GIGAFIDA_TAXONOMY.put("SSJ.T.K", "tisk-knjižno");
|
||||
GIGAFIDA_TAXONOMY.put("SSJ.T.K.L", "tisk-knjižno-leposlovno");
|
||||
GIGAFIDA_TAXONOMY.put("SSJ.T.K.S", "tisk-knjižno-strokovno");
|
||||
GIGAFIDA_TAXONOMY.put("SSJ.T.P", "tisk-periodično");
|
||||
GIGAFIDA_TAXONOMY.put("SSJ.T.P.C", "tisk-periodično-časopis");
|
||||
GIGAFIDA_TAXONOMY.put("SSJ.T.P.R", "tisk-periodično-revija");
|
||||
GIGAFIDA_TAXONOMY.put("SSJ.T.D", "tisk-drugo");
|
||||
GIGAFIDA_TAXONOMY.put("SSJ.I", "internet");
|
||||
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P", "prenosnik");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.G", "prenosnik-govorni");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.E", "prenosnik-elektronski");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P", "prenosnik-pisni");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P.O", "prenosnik-pisni-objavljeno");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.K", "prenosnik-pisni-objavljeno-knjižno");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P", "prenosnik-pisni-objavljeno-periodično");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C", "prenosnik-pisni-objavljeno-periodično-časopisno");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C.D", "prenosnik-pisni-objavljeno-periodično-časopisno-dnevno");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C.V", "prenosnik-pisni-objavljeno-periodično-časopisno-večkrat tedensko");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C.T", "prenosnik-pisni-objavljeno-periodično-časopisno-tedensko");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R", "prenosnik-pisni-objavljeno-periodično-revialno");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.T", "prenosnik-pisni-objavljeno-periodično-revialno-tedensko");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.S", "prenosnik-pisni-objavljeno-periodično-revialno-štirinajstdnevno");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.M", "prenosnik-pisni-objavljeno-periodično-revialno-mesečno");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.D", "prenosnik-pisni-objavljeno-periodično-revialno-redkeje kot na mesec");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.O", "prenosnik-pisni-objavljeno-periodično-revialno-občasno");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P.N", "prenosnik-pisni-neobjavljeno");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P.N.J", "prenosnik-pisni-neobjavljeno-javno");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P.N.I", "prenosnik-pisni-neobjavljeno-interno");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.P.P.N.Z", "prenosnik-pisni-neobjavljeno-zasebno");
|
||||
|
||||
GIGAFIDA_TAXONOMY.put("Ft.Z", "zvrst");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.Z.U", "zvrst-umetnostna");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.Z.U.P", "zvrst-umetnostna-pesniška");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.Z.U.R", "zvrst-umetnostna-prozna");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.Z.U.D", "zvrst-umetnostna-dramska");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.Z.N", "zvrst-neumetnostna");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.Z.N.S", "zvrst-neumetnostna-strokovna");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.Z.N.S.H", "zvrst-neumetnostna-strokovna-humanistična in družboslovna");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.Z.N.S.N", "zvrst-neumetnostna-strokovna-naravoslovna in tehnična");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.Z.N.N", "zvrst-neumetnostna-nestrokovna");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.Z.N.P", "zvrst-neumetnostna-pravna");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.L", "zvrst-lektorirano");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.L.D", "zvrst-lektorirano-da");
|
||||
GIGAFIDA_TAXONOMY.put("Ft.L.N", "zvrst-lektorirano-ne");
|
||||
|
||||
// GOS ----------------------------------
|
||||
GOS_TAXONOMY = new LinkedHashMap<>();
|
||||
|
||||
GOS_TAXONOMY.put("gos.T", "diskurz");
|
||||
GOS_TAXONOMY.put("gos.T.J", "diskurz-javni");
|
||||
GOS_TAXONOMY.put("gos.T.J.I", "diskurz-javni-informativno-izobraževalni");
|
||||
GOS_TAXONOMY.put("gos.T.J.R", "diskurz-javni-razvedrilni");
|
||||
GOS_TAXONOMY.put("gos.T.N", "diskurz-nejavni");
|
||||
GOS_TAXONOMY.put("gos.T.N.N", "diskurz-nejavni-nezasebni");
|
||||
GOS_TAXONOMY.put("gos.T.N.Z", "diskurz-nejavni-zasebni");
|
||||
|
||||
GOS_TAXONOMY.put("gos.S", "situacija");
|
||||
GOS_TAXONOMY.put("gos.S.R", "situacija-radio");
|
||||
GOS_TAXONOMY.put("gos.S.T", "situacija-televizija");
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the whole default taxonomy for the specified corpus type
|
||||
*/
|
||||
public static ObservableList<String> getTaxonomyForComboBox(CorpusType corpusType) {
|
||||
if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES) {
|
||||
return FXCollections.observableArrayList(GIGAFIDA_TAXONOMY.values());
|
||||
} else if (corpusType == CorpusType.GOS) {
|
||||
return FXCollections.observableArrayList(GOS_TAXONOMY.values());
|
||||
}
|
||||
|
||||
return FXCollections.observableArrayList(new ArrayList<>());
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns taxonomy names only for items found in headers
|
||||
*/
|
||||
public static ObservableList<String> getTaxonomyForComboBox(CorpusType corpusType, HashSet<String> foundTax) {
|
||||
LinkedHashMap<String, String> tax = new LinkedHashMap<>();
|
||||
|
||||
if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES) {
|
||||
tax = GIGAFIDA_TAXONOMY;
|
||||
} else if (corpusType == CorpusType.GOS) {
|
||||
tax = GOS_TAXONOMY;
|
||||
}
|
||||
|
||||
ArrayList<String> taxForCombo = new ArrayList<>();
|
||||
|
||||
// assures same relative order
|
||||
for (String t : tax.keySet()) {
|
||||
if (foundTax.contains(t)) {
|
||||
taxForCombo.add(tax.get(t));
|
||||
}
|
||||
}
|
||||
|
||||
return FXCollections.observableArrayList(taxForCombo);
|
||||
}
|
||||
|
||||
public static HashSet<CorpusType> getCorpusTypesWithTaxonomy() {
|
||||
return corpusTypesWithTaxonomy;
|
||||
}
|
||||
|
||||
public static ArrayList<String> getTaxonomyCodes(ArrayList<String> taxonomyNames, CorpusType corpusType) {
|
||||
ArrayList<String> result = new ArrayList<>();
|
||||
|
||||
if (ValidationUtil.isEmpty(taxonomyNames)) {
|
||||
return result;
|
||||
}
|
||||
|
||||
LinkedHashMap<String, String> tax = new LinkedHashMap<>();
|
||||
|
||||
if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES) {
|
||||
tax = GIGAFIDA_TAXONOMY;
|
||||
} else if (corpusType == CorpusType.GOS) {
|
||||
tax = GOS_TAXONOMY;
|
||||
}
|
||||
|
||||
// for easier lookup
|
||||
Map<String, String> taxInversed = tax.entrySet()
|
||||
.stream()
|
||||
.collect(Collectors.toMap(Map.Entry::getValue, Map.Entry::getKey));
|
||||
|
||||
for (String taxonomyName : taxonomyNames) {
|
||||
result.add(taxInversed.get(taxonomyName));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a list of proper names for codes
|
||||
*
|
||||
* @param corpusType
|
||||
* @param taxonomy
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public static ArrayList<String> getTaxonomyForInfo(CorpusType corpusType, ArrayList<String> taxonomy) {
|
||||
LinkedHashMap<String, String> tax = new LinkedHashMap<>();
|
||||
|
||||
if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES) {
|
||||
tax = GIGAFIDA_TAXONOMY;
|
||||
} else if (corpusType == CorpusType.GOS) {
|
||||
tax = GOS_TAXONOMY;
|
||||
}
|
||||
|
||||
ArrayList<String> result = new ArrayList<>();
|
||||
|
||||
for (String t : taxonomy) {
|
||||
result.add(tax.get(t));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
}
|
||||
171
src/main/java/data/Taxonomy.java
Normal file
171
src/main/java/data/Taxonomy.java
Normal file
@@ -0,0 +1,171 @@
|
||||
package data;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import javafx.collections.FXCollections;
|
||||
import javafx.collections.ObservableList;
|
||||
|
||||
public enum Taxonomy {
|
||||
// GOS
|
||||
JAVNI("javni", "T.J", "gos"),
|
||||
INFORMATIVNO_IZOBRAZEVALNI("informativno-izobraževalni", "T.J.I", "gos"),
|
||||
RAZVEDRILNI("razvedrilni", "T.J.R", "gos"),
|
||||
NEJAVNI("nejavni", "T.N", "gos"),
|
||||
NEZASEBNI("nezasebni", "T.N.N", "gos"),
|
||||
ZASEBNI("zasebni", "T.N.Z", "gos"),
|
||||
OSEBNI_STIK("osebni stik", "K.O", "gos"),
|
||||
TELEFON("telefon", "K.P", "gos"),
|
||||
RADIO("radio", "K.R", "gos"),
|
||||
TELEVIZIJA("televizija", "K.T", "gos"),
|
||||
// Gigafida
|
||||
KNJIZNO("knjižno", "T.K", "gigafida"),
|
||||
LEPOSLOVNO("leposlovno", "T.K.L", "gigafida"),
|
||||
STROKOVNO("strokovno", "T.K.S", "gigafida"),
|
||||
PERIODICNO("periodično", "T.P", "gigafida"),
|
||||
CASOPIS("časopis", "T.P.C", "gigafida"),
|
||||
REVIJA("revija", "T.P.R", "gigafida"),
|
||||
INTERNET("internet", "I", "gigafida"),
|
||||
|
||||
SSJ_TISK("tisk", "SSJ.T", "gigafida"),
|
||||
SSJ_KNJIZNO("opis", "identifikator", "gigafida"),
|
||||
SSJ_LEPOSLOVNO("opis", "identifikator", "gigafida"),
|
||||
SSJ_STROKOVNO("opis", "identifikator", "gigafida"),
|
||||
SSJ_PERIODICNO("opis", "identifikator", "gigafida"),
|
||||
SSJ_CASOPIS("opis", "identifikator", "gigafida"),
|
||||
SSJ_REVIJA("opis", "identifikator", "gigafida"),
|
||||
SSJ_DRUGO("opis", "identifikator", "gigafida"),
|
||||
SSJ_INTERNET("opis", "identifikator", "gigafida"),
|
||||
FT_P_PRENOSNIK("opis", "identifikator", "gigafida"),
|
||||
FT_P_GOVORNI("opis", "identifikator", "gigafida"),
|
||||
FT_P_ELEKTRONSKI("opis", "identifikator", "gigafida"),
|
||||
FT_P_PISNI("opis", "identifikator", "gigafida"),
|
||||
FT_P_OBJAVLJENO("opis", "identifikator", "gigafida"),
|
||||
FT_P_KNJIZNO("opis", "identifikator", "gigafida"),
|
||||
FT_P_PERIODICNO("opis", "identifikator", "gigafida"),
|
||||
FT_P_CASOPISNO("opis", "identifikator", "gigafida"),
|
||||
FT_P_DNEVNO("opis", "identifikator", "gigafida"),
|
||||
FT_P_VECKRAT_TEDENSKO("opis", "identifikator", "gigafida"),
|
||||
// FT_P_TEDENSKO("opis", "identifikator", "gigafida"),
|
||||
FT_P_REVIALNO("opis", "identifikator", "gigafida"),
|
||||
FT_P_TEDENSKO("opis", "identifikator", "gigafida"),
|
||||
FT_P_STIRINAJSTDNEVNO("opis", "identifikator", "gigafida"),
|
||||
FT_P_MESECNO("opis", "identifikator", "gigafida"),
|
||||
FT_P_REDKEJE_KOT_MESECNO("opis", "identifikator", "gigafida"),
|
||||
FT_P_OBCASNO("opis", "identifikator", "gigafida"),
|
||||
FT_P_NEOBJAVLJENO("opis", "identifikator", "gigafida"),
|
||||
FT_P_JAVNO("opis", "identifikator", "gigafida"),
|
||||
FT_P_INTERNO("opis", "identifikator", "gigafida"),
|
||||
FT_P_ZASEBNO("opis", "identifikator", "gigafida"),
|
||||
FT_ZVRST("opis", "identifikator", "gigafida"),
|
||||
FT_UMETNOSTNA("opis", "identifikator", "gigafida"),
|
||||
FT_PESNISKA("opis", "identifikator", "gigafida"),
|
||||
FT_PROZNA("opis", "identifikator", "gigafida"),
|
||||
FT_DRAMSKA("opis", "identifikator", "gigafida"),
|
||||
FT_NEUMETNOSTNA("opis", "identifikator", "gigafida"),
|
||||
FT_STROKOVNA("opis", "identifikator", "gigafida"),
|
||||
FT_HID("opis", "identifikator", "gigafida"),
|
||||
FT_NIT("opis", "identifikator", "gigafida"),
|
||||
FT_NESTROKOVNA("opis", "identifikator", "gigafida"),
|
||||
FT_PRAVNA("opis", "identifikator", "gigafida"),
|
||||
FT_LEKTORIRANO("opis", "identifikator", "gigafida"),
|
||||
FT_DA("opis", "identifikator", "gigafida"),
|
||||
FT_NE("opis", "identifikator", "gigafida");
|
||||
|
||||
|
||||
|
||||
// Human-readable (Slovene) display name, shown in the GUI.
private final String name;
// Identifier string used in the corpus taxonomy markup.
private final String taxonomy;
// Corpus type this entry belongs to (e.g. "gigafida") — used to filter
// entries per corpus in getDefaultForComboBox.
private final String corpus;

Taxonomy(String name, String taxonomy, String corpusType) {
    this.name = name;
    this.taxonomy = taxonomy;
    this.corpus = corpusType;
}
|
||||
|
||||
public String toString() {
|
||||
return this.name;
|
||||
}
|
||||
|
||||
public String getTaxonomnyString() {
|
||||
return this.taxonomy;
|
||||
}
|
||||
|
||||
public static Taxonomy factory(String tax) {
|
||||
if (tax != null) {
|
||||
// GOS
|
||||
if (JAVNI.toString().equals(tax)) {
|
||||
return JAVNI;
|
||||
}
|
||||
if (INFORMATIVNO_IZOBRAZEVALNI.toString().equals(tax)) {
|
||||
return INFORMATIVNO_IZOBRAZEVALNI;
|
||||
}
|
||||
if (RAZVEDRILNI.toString().equals(tax)) {
|
||||
return RAZVEDRILNI;
|
||||
}
|
||||
if (NEJAVNI.toString().equals(tax)) {
|
||||
return NEJAVNI;
|
||||
}
|
||||
if (NEZASEBNI.toString().equals(tax)) {
|
||||
return NEZASEBNI;
|
||||
}
|
||||
if (ZASEBNI.toString().equals(tax)) {
|
||||
return ZASEBNI;
|
||||
}
|
||||
if (OSEBNI_STIK.toString().equals(tax)) {
|
||||
return OSEBNI_STIK;
|
||||
}
|
||||
if (TELEFON.toString().equals(tax)) {
|
||||
return TELEFON;
|
||||
}
|
||||
if (RADIO.toString().equals(tax)) {
|
||||
return RADIO;
|
||||
}
|
||||
if (TELEVIZIJA.toString().equals(tax)) {
|
||||
return TELEVIZIJA;
|
||||
}
|
||||
|
||||
// Gigafida
|
||||
// if (TISK.toString().equals(tax)) {
|
||||
// return TISK;
|
||||
// }
|
||||
if (KNJIZNO.toString().equals(tax)) {
|
||||
return KNJIZNO;
|
||||
}
|
||||
if (LEPOSLOVNO.toString().equals(tax)) {
|
||||
return LEPOSLOVNO;
|
||||
}
|
||||
if (STROKOVNO.toString().equals(tax)) {
|
||||
return STROKOVNO;
|
||||
}
|
||||
if (PERIODICNO.toString().equals(tax)) {
|
||||
return PERIODICNO;
|
||||
}
|
||||
if (CASOPIS.toString().equals(tax)) {
|
||||
return CASOPIS;
|
||||
}
|
||||
if (REVIJA.toString().equals(tax)) {
|
||||
return REVIJA;
|
||||
}
|
||||
if (INTERNET.toString().equals(tax)) {
|
||||
return INTERNET;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public static ObservableList<String> getDefaultForComboBox(String corpusType) {
|
||||
ArrayList<String> values = Arrays.stream(Taxonomy.values())
|
||||
.filter(x -> x.corpus.equals(corpusType))
|
||||
.map(x -> x.name)
|
||||
.collect(Collectors.toCollection(ArrayList::new));
|
||||
|
||||
return FXCollections.observableArrayList(values);
|
||||
}
|
||||
|
||||
/**
 * Convenience overload: resolves the corpus type via its string
 * representation and delegates to the String-based variant.
 */
public static ObservableList<String> getDefaultForComboBox(CorpusType corpusType) {
    return getDefaultForComboBox(corpusType.toString());
}
|
||||
}
|
||||
53
src/main/java/data/Validation.java
Normal file
53
src/main/java/data/Validation.java
Normal file
@@ -0,0 +1,53 @@
|
||||
package data;
|
||||
|
||||
import static gui.ValidationUtil.*;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import gui.Messages;
|
||||
import gui.ValidationUtil;
|
||||
|
||||
public class Validation {
|
||||
|
||||
public static String validateForStringLevel(Filter filter) {
|
||||
ArrayList<String> errors = new ArrayList<>();
|
||||
|
||||
// should not be null, error if null, because init failed
|
||||
if (filter.getNgramValue() == null) {
|
||||
errors.add(Messages.MISSING_NGRAM_LEVEL);
|
||||
}
|
||||
|
||||
// should not be null, error if null, because init failed
|
||||
if (filter.getCalculateFor() == null) {
|
||||
errors.add(Messages.MISSING_CALCULATE_FOR);
|
||||
}
|
||||
|
||||
if (filter.getSkipValue() == null) {
|
||||
filter.setSkipValue(0);
|
||||
}
|
||||
|
||||
if (filter.getNgramValue() != null && ValidationUtil.isEmpty(filter.getMsd()) &&
|
||||
(filter.getMsd().size() != filter.getNgramValue())) {
|
||||
if (!(filter.getMsd().size() == 1 && filter.getNgramValue() == 0) && !ValidationUtil.isEmpty(filter.getMsd()))
|
||||
errors.add(Messages.WARNING_MISMATCHED_NGRAM_AND_TOKENS_VALUES);
|
||||
}
|
||||
|
||||
Integer ngramValue = filter.getNgramValue();
|
||||
ArrayList<Pattern> msd = filter.getMsd();
|
||||
|
||||
if (ngramValue > 0 && !ValidationUtil.isEmpty(msd) && ngramValue != msd.size()) {
|
||||
errors.add(String.format(Messages.WARNING_MISMATCHED_NGRAM_AND_TOKENS_VALUES, ngramValue, msd.size()));
|
||||
}
|
||||
|
||||
if (filter.getNgramValue() != null && filter.getNgramValue() == 0 && isEmpty(filter.getStringLength())) {
|
||||
// if count letters, make sure that the length is given
|
||||
// TODO: check that words we're adding in xml reader are longer than this value
|
||||
errors.add(Messages.MISSING_STRING_LENGTH);
|
||||
}
|
||||
|
||||
return isEmpty(errors) ? null : StringUtils.join(errors, ", \n");
|
||||
}
|
||||
}
|
||||
141
src/main/java/data/Word.java
Normal file
141
src/main/java/data/Word.java
Normal file
@@ -0,0 +1,141 @@
|
||||
package data;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import data.Enums.Msd;
|
||||
import gui.ValidationUtil;
|
||||
|
||||
public class Word implements Serializable {
|
||||
public static final char PAD_CHARACTER = '-';
|
||||
|
||||
private String word;
|
||||
private String lemma;
|
||||
private String msd;
|
||||
private final HashSet<Character> VOWELS = new HashSet<>(Arrays.asList('a', 'e', 'i', 'o', 'u'));
|
||||
|
||||
/**
|
||||
* Possible values:
|
||||
* <p>
|
||||
* <ul>
|
||||
* <li>S = samostalnik</li>
|
||||
* <li>G = glagol</li>
|
||||
* <li>P = pridevnik</li>
|
||||
* <li>R = prislov</li>
|
||||
* <li>Z = zaimek</li>
|
||||
* <li>K = števnik</li>
|
||||
* <li>D = predlog</li>
|
||||
* <li>V = veznik</li>
|
||||
* <li>L = členek</li>
|
||||
* <li>M = medmet</li>
|
||||
* <li>O = okrajšava</li>
|
||||
* <li>N = neuvrščeno</li>
|
||||
* </ul>
|
||||
*/
|
||||
//private char besedna_vrsta;
|
||||
public Word(String word, String lemma, String msd) {
|
||||
this.lemma = lemma;
|
||||
this.msd = normalizeMsd(msd);
|
||||
|
||||
// veliko zacetnico ohranimo samo za lastna imena
|
||||
if (!ValidationUtil.isEmpty(this.msd) && !(this.msd.charAt(0) == 'S'
|
||||
&& this.msd.length() >= 2
|
||||
&& this.msd.charAt(1) == 'l')) {
|
||||
this.word = word.toLowerCase();
|
||||
} else {
|
||||
this.word = word;
|
||||
}
|
||||
}
|
||||
|
||||
public Word() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Appends a number of '-' to msds which are not properly sized.
|
||||
* E.g. nouns should have 5 attributes, yet the last one isn't always defined (Somei vs. Sometd)
|
||||
*
|
||||
* @param msdInput
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
private String normalizeMsd(String msdInput) {
|
||||
if (ValidationUtil.isEmpty(msdInput)) {
|
||||
return "";
|
||||
} else {
|
||||
return StringUtils.rightPad(msdInput, Msd.getMsdLengthForType(msdInput), PAD_CHARACTER);
|
||||
}
|
||||
}
|
||||
|
||||
public Word(String word) {
|
||||
this.word = word;
|
||||
}
|
||||
|
||||
public String getWord() {
|
||||
return word;
|
||||
}
|
||||
|
||||
public String getCVVWord() {
|
||||
return covertToCvv(word);
|
||||
}
|
||||
|
||||
public String getCVVLemma() {
|
||||
return covertToCvv(lemma);
|
||||
}
|
||||
|
||||
private String covertToCvv(String s) {
|
||||
char[] StringCA = s.toCharArray();
|
||||
|
||||
for (int i = 0; i < StringCA.length; i++) {
|
||||
StringCA[i] = VOWELS.contains(StringCA[i]) ? 'V' : 'C';
|
||||
}
|
||||
|
||||
return new String(StringCA);
|
||||
}
|
||||
|
||||
public void setWord(String word) {
|
||||
this.word = word;
|
||||
}
|
||||
|
||||
public String getLemma() {
|
||||
return lemma;
|
||||
}
|
||||
|
||||
public void setLemma(String lemma) {
|
||||
this.lemma = lemma;
|
||||
}
|
||||
|
||||
public String getMsd() {
|
||||
return msd;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
sb.append("beseda:\t")
|
||||
.append(getWord())
|
||||
.append("\n")
|
||||
.append("lema:\t")
|
||||
.append(getLemma())
|
||||
.append("\n")
|
||||
.append("msd:\t")
|
||||
.append(getMsd())
|
||||
.append("\n");
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public String getForCf(CalculateFor calculateFor, boolean cvv) {
|
||||
String returnValue = "";
|
||||
|
||||
if (cvv) {
|
||||
returnValue = calculateFor == CalculateFor.WORD ? getCVVWord() : getCVVLemma();
|
||||
} else {
|
||||
returnValue = calculateFor == CalculateFor.WORD ? getWord() : getLemma();
|
||||
}
|
||||
|
||||
return returnValue;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user