Project copied

This commit is contained in:
2018-06-19 09:15:37 +02:00
commit a18e52a599
94 changed files with 87092 additions and 0 deletions

View File

@@ -0,0 +1,17 @@
package data;
/**
 * Levels at which an analysis can be run; each constant carries the
 * Slovene label shown in the GUI (returned by {@link #toString()}).
 */
public enum AnalysisLevel {
    STRING_LEVEL("Besedni nizi"),
    WORD_LEVEL("Nivo besed in delov besed"),
    WORD_FORMATION("Besedotvorni procesi");

    /** Slovene display label. */
    private final String name;

    AnalysisLevel(String name) {
        this.name = name;
    }

    @Override
    public String toString() {
        return this.name;
    }
}

View File

@@ -0,0 +1,43 @@
package data;
/**
 * Unit the statistics are calculated for; each constant carries the
 * Slovene GUI label, which doubles as the lookup key in {@link #factory(String)}.
 */
public enum CalculateFor {
    WORD("različnica"),
    LEMMA("lema"),
    MORPHOSYNTACTIC_SPECS("oblikoskladenjska oznaka"),
    MORPHOSYNTACTIC_PROPERTY("oblikoskladenjska lastnost"),
    WORD_TYPE("besedna vrsta"),
    DIST_WORDS("različnica"),
    DIST_LEMMAS("lema");

    /** Slovene display label. */
    private final String name;

    CalculateFor(String name) {
        this.name = name;
    }

    @Override
    public String toString() {
        return this.name;
    }

    /**
     * Resolves a GUI label back to its constant.
     * <p>
     * Declaration order matters: DIST_WORDS/DIST_LEMMAS share their labels with
     * WORD/LEMMA and are therefore never returned — this matches the original
     * hand-written if-chain, which only checked the first five constants.
     *
     * @param cf GUI label; may be null
     * @return the matching constant, or null when {@code cf} is null or unknown
     */
    public static CalculateFor factory(String cf) {
        if (cf != null) {
            for (CalculateFor candidate : values()) {
                if (candidate.toString().equals(cf)) {
                    return candidate;
                }
            }
        }
        return null;
    }
}

View File

@@ -0,0 +1,163 @@
package data;
import static gui.Messages.*;
import java.io.File;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import data.Enums.solar.SolarFilters;
import gui.ValidationUtil;
import javafx.collections.ObservableList;
public class Corpus {
public final static Logger logger = LogManager.getLogger(Corpus.class);
private CorpusType corpusType;
private File chosenResultsLocation;
private File chosenCorpusLocation;
private Collection<File> detectedCorpusFiles;
boolean headerRead;
private ObservableList<String> taxonomy; // if gigafida or gos
private HashMap<String, ObservableList<String>> solarFilters; // if solar
private HashMap<String, HashSet<String>> solarFiltersForXML; // if solar - used while parsing xml
private boolean gosOrthMode;
boolean hasMsdData;
private ArrayList<String> validationErrors;
public Corpus() {
validationErrors = new ArrayList<>();
}
public CorpusType getCorpusType() {
return corpusType;
}
public void setCorpusType(CorpusType corpusType) {
this.corpusType = corpusType;
logger.info("Corpus.set: ", corpusType);
}
public File getChosenResultsLocation() {
return chosenResultsLocation;
}
public void setChosenResultsLocation(File chosenResultsLocation) {
this.chosenResultsLocation = chosenResultsLocation;
logger.info("Corpus.set: ", chosenResultsLocation);
}
public File getChosenCorpusLocation() {
return chosenCorpusLocation;
}
public void setChosenCorpusLocation(File chosenCorpusLocation) {
this.chosenCorpusLocation = chosenCorpusLocation;
logger.info("Corpus.set: ", chosenCorpusLocation);
}
public Collection<File> getDetectedCorpusFiles() {
return detectedCorpusFiles;
}
public void setDetectedCorpusFiles(Collection<File> detectedCorpusFiles) {
this.detectedCorpusFiles = detectedCorpusFiles;
logger.info("Corpus.set: ", detectedCorpusFiles);
}
public boolean isHeaderRead() {
return headerRead;
}
public void setHeaderRead(boolean headerRead) {
this.headerRead = headerRead;
}
public ObservableList<String> getTaxonomy() {
return taxonomy;
}
public void setTaxonomy(ObservableList<String> taxonomy) {
this.taxonomy = taxonomy;
logger.info("Corpus.set: ", taxonomy);
}
public HashMap<String, ObservableList<String>> getSolarFilters() {
return solarFilters;
}
public void setSolarFilters(HashMap<String, ObservableList<String>> solarFilters) {
this.solarFilters = solarFilters;
logger.info("Corpus.set: ", solarFilters);
}
public HashMap<String, HashSet<String>> getSolarFiltersForXML() {
return solarFiltersForXML;
}
public void setSolarFiltersForXML(HashMap<String, HashSet<String>> solarFiltersForXML) {
this.solarFiltersForXML = solarFiltersForXML;
logger.info("Corpus.set: ", solarFiltersForXML);
}
public boolean isGosOrthMode() {
return gosOrthMode;
}
public void setGosOrthMode(boolean gosOrthMode) {
this.gosOrthMode = gosOrthMode;
logger.info("Corpus.set: ", gosOrthMode);
}
public ArrayList<String> getValidationErrors() {
return validationErrors;
}
public String getValidationErrorsToString() {
return StringUtils.join(validationErrors, "\n - ");
}
public void setValidationErrors(ArrayList<String> validationErrors) {
this.validationErrors = validationErrors;
}
public boolean validate() {
if (corpusType == null) {
validationErrors.add(LABEL_RESULTS_CORPUS_TYPE_NOT_SET);
}
if (chosenCorpusLocation == null) {
validationErrors.add(LABEL_CORPUS_LOCATION_NOT_SET);
}
if (chosenResultsLocation == null) {
validationErrors.add(LABEL_RESULTS_LOCATION_NOT_SET);
}
if (!headerRead && corpusType != null) {
// if user didn't opt into reading the headers, set default taxonomy or solar filters
if (Tax.getCorpusTypesWithTaxonomy().contains(corpusType)) {
taxonomy = Tax.getTaxonomyForComboBox(corpusType);
} else if (corpusType == CorpusType.SOLAR && solarFilters == null) {
setSolarFilters(SolarFilters.getFiltersForComboBoxes());
}
}
if (headerRead && ValidationUtil.isEmpty(taxonomy)) {
// mustn't happen, intercept at gui level
}
if (!ValidationUtil.isEmpty(validationErrors)) {
logger.error("Corpus validation error: ", StringUtils.join(validationErrors, "\n - "));
return false;
} else {
return true;
}
}
}

View File

@@ -0,0 +1,25 @@
package data;
/**
 * Supported corpus types; each constant carries its display name
 * (note the historical trailing space in "ccKres ") and a lowercase
 * identifier used for matching file/element names.
 */
public enum CorpusType {
    GIGAFIDA("Gigafida", "gigafida"),
    CCKRES("ccKres ", "cckres"),
    SOLAR("Šolar", "šolar"),
    GOS("GOS", "gos");

    /** Display name shown in the GUI. */
    private final String name;
    /** Lowercase identifier used programmatically. */
    private final String nameLowerCase;

    CorpusType(String name, String nameLowerCase) {
        this.name = name;
        this.nameLowerCase = nameLowerCase;
    }

    @Override
    public String toString() {
        return this.name;
    }

    public String getNameLowerCase() {
        return nameLowerCase;
    }
}

View File

@@ -0,0 +1,12 @@
package data.Enums;
import java.util.Arrays;
import java.util.HashSet;
/**
 * Holder for the JOS word-category codes that inflect
 * ('S' = noun, 'G' = verb, 'P' = adjective, per the Slovene JOS tagset).
 */
public class InflectedJosTypes {
    public static final HashSet<Character> inflectedJosTypes =
            new HashSet<>(Arrays.asList('S', 'G', 'P'));

    private InflectedJosTypes() {
        // constant holder — not instantiable
    }
}

View File

@@ -0,0 +1,68 @@
package data.Enums;
import java.util.HashMap;
/**
 * Morphosyntactic categories with their Slovene/English names, single-letter
 * codes, and number of additional MSD attributes each category carries.
 */
public enum Msd {
    NOUN("samostalnik", 'S', "Noun", 'N', 5),
    VERB("glagol", 'G', "Verb", 'V', 7),
    ADJECTIVE("pridevnik", 'P', "Adjective", 'A', 6),
    ADVERB("prislov", 'R', "Adverb", 'R', 2),
    PRONOUN("zaimek", 'Z', "Pronoun", 'P', 8),
    NUMERAL("števnik", 'K', "Numeral", 'M', 6),
    PREPOSITION("predlog", 'D', "Preposition", 'S', 1),
    CONJUNCTION("veznik", 'V', "Conjunction", 'C', 1),
    PARTICLE("členek", 'L', "Particle", 'Q', 0),
    INTERJECTION("medmet", 'M', "Interjection", 'I', 0),
    ABBREVIATION("okrajšava", 'O', "Abbreviation", 'Y', 0),
    RESIDUAL("neuvrščeno", 'N', "Residual", 'X', 1);

    private final String siName;
    private final Character siCode;
    private final String enName;
    private final Character enCode;
    private final Integer nOfAttributes;

    /** Lookup table: Slovene category code -> number of attributes. */
    private static HashMap<Character, Integer> siCodeNOfAttributes;

    static {
        siCodeNOfAttributes = new HashMap<>();
        for (Msd msd : Msd.values()) {
            siCodeNOfAttributes.put(msd.getSiCode(), msd.nOfAttributes);
        }
    }

    Msd(String siName, Character siCode, String enName, Character enCode, int nOfAttributes) {
        this.siName = siName;
        this.siCode = siCode;
        this.enName = enName;
        this.enCode = enCode;
        this.nOfAttributes = nOfAttributes;
    }

    public String getSiName() {
        return siName;
    }

    public Character getSiCode() {
        return siCode;
    }

    public String getEnName() {
        return enName;
    }

    public Character getEnCode() {
        return enCode;
    }

    /**
     * Returns the total MSD tag length for the given tag, i.e. the category
     * character itself plus its number of attributes.
     *
     * @param msd an MSD tag whose first character is a Slovene category code
     * @return number of attributes for the category + 1
     * @throws IllegalArgumentException when {@code msd} is null/empty or its
     *         first character is not a known category code (previously this
     *         surfaced as an uninformative NullPointerException)
     */
    public static int getMsdLengthForType(String msd) {
        if (msd == null || msd.isEmpty()) {
            throw new IllegalArgumentException("msd must be a non-empty string");
        }
        Integer nOfAttributes = siCodeNOfAttributes.get(msd.charAt(0));
        if (nOfAttributes == null) {
            throw new IllegalArgumentException("Unknown MSD category code: " + msd.charAt(0));
        }
        return nOfAttributes + 1;
    }
}

View File

@@ -0,0 +1,55 @@
package data.Enums;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
/**
 * Loads the default suffix/prefix dictionaries from classpath resources
 * at class-initialization time.
 */
public class WordLevelDefaultValues {
    public final static Logger logger = LogManager.getLogger(WordLevelDefaultValues.class);

    private static HashSet<String> suffixes;
    private static final String SUFFIXES_FILE = "/Lists/suffixes.txt";
    public static final int MIN_N_OF_CHARACTERS_LEFT_SUFFIX = 2;

    private static HashSet<String> prefixes;
    private static final String PREFIXES_FILE = "/Lists/prefixes.txt";
    public static final int MIN_N_OF_CHARACTERS_LEFT_PREFIX = 2;

    static {
        // readFromFile never returns null, so no placeholder sets are needed
        suffixes = readFromFile(SUFFIXES_FILE);
        prefixes = readFromFile(PREFIXES_FILE);
    }

    /**
     * Reads one word per line from a classpath resource.
     *
     * @param fileName absolute classpath resource path (leading '/')
     * @return set of lines; empty when the resource is missing or unreadable
     */
    private static HashSet<String> readFromFile(String fileName) {
        HashSet<String> dictionary = new HashSet<>();
        // FIX: was WordLevelDefaultValues.class.getClass(), which resolves to
        // Class.class (bootstrap loader) and therefore could not see resources
        // on the application classpath.
        try (InputStream is = WordLevelDefaultValues.class.getResourceAsStream(fileName)) {
            if (is == null) {
                // resolves the old "TODO: warn if !exists"
                logger.warn("Dictionary resource not found: {}", fileName);
            } else {
                // UnsupportedEncodingException is an IOException, so the
                // explicit charset stays inside the existing catch.
                try (BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"))) {
                    dictionary.addAll(reader.lines().collect(Collectors.toSet()));
                }
            }
        } catch (IOException e) {
            logger.error("Problem reading init dictionary", e);
        }
        return dictionary;
    }

    public static HashSet<String> getSuffixes() {
        return suffixes;
    }

    public static HashSet<String> getPrefixes() {
        return prefixes;
    }
}

View File

@@ -0,0 +1,16 @@
package data.Enums;
/**
 * Kind of word part analysed at the word level, with its Slovene GUI label.
 */
public enum WordLevelType {
    SUFFIX("pripona"),
    PREFIX("predpona");

    /** Slovene display label. */
    private final String name;

    WordLevelType(String displayName) {
        this.name = displayName;
    }

    /** @return the Slovene label for this word-part type */
    public String getName() {
        return this.name;
    }
}

View File

@@ -0,0 +1,57 @@
package data.Enums.solar;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import javafx.collections.FXCollections;
import javafx.collections.ObservableList;
/**
 * Static catalogue of the Šolar corpus filters and their possible values,
 * plus helpers that shape them for the GUI combo boxes.
 */
public class SolarFilters {

    /** All known filter values, keyed by the filter-name constants below. */
    private static HashMap<String, ObservableList<String>> SOLAR_FILTERS;

    public static final String SOLA = "sola";
    public static final String PREDMET = "predmet";
    public static final String RAZRED = "razred";
    public static final String REGIJA = "regija";
    public static final String TIP = "tip";
    public static final String LETO = "leto";

    static {
        SOLAR_FILTERS = new HashMap<>();
        SOLAR_FILTERS.put(REGIJA, FXCollections.observableArrayList("Celje", "Gorica", "Koper", "Kranj", "Krško", "Ljubljana", "Maribor", "Murska Sobota", "Novo mesto", "Postojna", "Slovenj Gradec"));
        SOLAR_FILTERS.put(PREDMET, FXCollections.observableArrayList("državljanska vzgoja in etika", "ekonomija", "filozofija", "geografija", "kemija", "podjetništvo", "psihologija", "slovenščina", "sociologija", "umetnostna vzgoja", "zgodovina"));
        SOLAR_FILTERS.put(RAZRED, FXCollections.observableArrayList("6. razred", "7. razred", "8. razred", "9. razred", "1. letnik", "2. letnik", "3. letnik", "4. letnik", "5. letnik", "maturitetni tečaj"));
        SOLAR_FILTERS.put(LETO, FXCollections.observableArrayList("2007", "2008", "2009", "2009/2010", "2010"));
        SOLAR_FILTERS.put(SOLA, FXCollections.observableArrayList("gimnazija", "osnovna šola", "poklicna šola", "strokovna šola"));
        SOLAR_FILTERS.put(TIP, FXCollections.observableArrayList("esej/spis", "pisni izdelek (učna ura)", "test (daljše besedilo)", "test (odgovori na vprašanja)"));
    }

    public static final ObservableList<String> N_GRAM_COMPUTE_FOR_FULL = FXCollections.observableArrayList("različnica", "lema", "oblikoskladenjska oznaka", "oblikoskladenjska lastnost", "besedna vrsta");
    public static final ObservableList<String> N_GRAM_COMPUTE_FOR_LIMITED = FXCollections.observableArrayList("različnica", "lema");

    /**
     * Returns filters with all possible values
     */
    public static HashMap<String, ObservableList<String>> getFiltersForComboBoxes() {
        return SOLAR_FILTERS;
    }

    /**
     * Returns the known filters restricted to the values actually found in
     * the corpus; a filter absent from {@code foundFilters} maps to an empty
     * list so the combo box still renders.
     */
    public static HashMap<String, ObservableList<String>> getFiltersForComboBoxes(HashMap<String, HashSet<String>> foundFilters) {
        HashMap<String, ObservableList<String>> comboBoxFilters = new HashMap<>();
        for (Map.Entry<String, ObservableList<String>> entry : SOLAR_FILTERS.entrySet()) {
            String filterKey = entry.getKey();
            ObservableList<String> values = foundFilters.containsKey(filterKey)
                    ? FXCollections.observableArrayList(foundFilters.get(filterKey)).sorted()
                    : FXCollections.observableArrayList();
            comboBoxFilters.put(filterKey, values);
        }
        return comboBoxFilters;
    }
}

View File

@@ -0,0 +1,144 @@
package data;
import static data.Filter.filterName.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.regex.Pattern;
import gui.ValidationUtil;
/**
 * Analysis configuration, backed by a single heterogeneous map keyed by
 * {@link filterName}. Typed getters cast the stored values back; the
 * unchecked-cast suppressions are scoped to those getters only (previously a
 * class-level suppression hid every warning in the file).
 */
public class Filter {

    private HashMap<filterName, Object> filter;

    public enum filterName {
        ANALYSIS_LEVEL,
        CALCULATE_FOR,
        NGRAM_VALUE,
        SKIP_VALUE,
        IS_CVV,
        STRING_LENGTH,
        TAXONOMY,
        MSD,
        HAS_MSD,
        SOLAR_FILTERS
    }

    public Filter() {
        filter = new HashMap<>();
    }

    /** Convenience constructor pre-populating the two mandatory settings. */
    public Filter(AnalysisLevel al, CalculateFor cf) {
        filter = new HashMap<>();
        filter.put(filterName.ANALYSIS_LEVEL, al);
        filter.put(filterName.CALCULATE_FOR, cf);
    }

    public void setAl(AnalysisLevel al) {
        filter.put(filterName.ANALYSIS_LEVEL, al);
    }

    public AnalysisLevel getAl() {
        return (AnalysisLevel) filter.get(filterName.ANALYSIS_LEVEL);
    }

    public void setCalculateFor(CalculateFor cf) {
        filter.put(filterName.CALCULATE_FOR, cf);
    }

    public CalculateFor getCalculateFor() {
        return (CalculateFor) filter.get(filterName.CALCULATE_FOR);
    }

    public void setNgramValue(Integer ngramValue) {
        filter.put(filterName.NGRAM_VALUE, ngramValue);
    }

    public Integer getNgramValue() {
        return (Integer) filter.get(filterName.NGRAM_VALUE);
    }

    public void setSkipValue(Integer skipValue) {
        filter.put(filterName.SKIP_VALUE, skipValue);
    }

    public Integer getSkipValue() {
        return (Integer) filter.get(filterName.SKIP_VALUE);
    }

    public void setIsCvv(boolean isCvv) {
        filter.put(filterName.IS_CVV, isCvv);
    }

    /** @return true only when CVV mode was explicitly enabled */
    public boolean isCvv() {
        return filter.containsKey(filterName.IS_CVV) && (boolean) filter.get(filterName.IS_CVV);
    }

    public void setStringLength(int stringLength) {
        filter.put(filterName.STRING_LENGTH, stringLength);
    }

    public Integer getStringLength() {
        return (Integer) filter.get(filterName.STRING_LENGTH);
    }

    public void setTaxonomy(ArrayList<String> taxonomy) {
        filter.put(filterName.TAXONOMY, taxonomy);
    }

    /** @return stored taxonomy names; never null (empty list when unset) */
    @SuppressWarnings("unchecked") // values are stored type-safely by setTaxonomy
    public ArrayList<String> getTaxonomy() {
        if (filter.containsKey(filterName.TAXONOMY) && filter.get(filterName.TAXONOMY) != null) {
            return (ArrayList<String>) filter.get(filterName.TAXONOMY);
        } else {
            return new ArrayList<>();
        }
    }

    /** Stores the MSD regex filter and keeps HAS_MSD in sync with it. */
    public void setMsd(ArrayList<Pattern> msd) {
        filter.put(filterName.MSD, msd);
        setHasMsd(!ValidationUtil.isEmpty(msd));
    }

    @SuppressWarnings("unchecked") // values are stored type-safely by setMsd
    public ArrayList<Pattern> getMsd() {
        return (ArrayList<Pattern>) filter.get(filterName.MSD);
    }

    public void setHasMsd(boolean hasMsd) {
        filter.put(filterName.HAS_MSD, hasMsd);
    }

    public boolean hasMsd() {
        return filter.containsKey(filterName.HAS_MSD) && (boolean) filter.get(filterName.HAS_MSD);
    }

    /** Bullet-style dump of every stored setting, used for debug logging. */
    @Override
    public String toString() {
        String newLine = "\n\t- ";
        StringBuilder sb = new StringBuilder();
        sb.append(newLine).append("Filter:");
        for (Map.Entry<filterName, Object> entry : filter.entrySet()) {
            sb.append(newLine)
                    .append(entry.getKey().toString())
                    .append(": ")
                    .append(entry.getValue() != null ? entry.getValue().toString() : "null");
        }
        return sb.toString();
    }

    public void setSolarFilters(HashMap<String, HashSet<String>> filters) {
        filter.put(filterName.SOLAR_FILTERS, filters);
    }

    @SuppressWarnings("unchecked") // values are stored type-safely by setSolarFilters
    public HashMap<String, HashSet<String>> getSolarFilters() {
        return (HashMap<String, HashSet<String>>) filter.get(filterName.SOLAR_FILTERS);
    }
}

View File

@@ -0,0 +1,71 @@
package data;
/**
 * Gigafida JOS word categories: Slovene label (ASCII-folded) plus the
 * single-character JOS category code.
 */
public enum GigafidaJosWordType {
    SAMOSTALNIK("samostalnik", 'S'),
    GLAGOL("glagol", 'G'),
    PRIDEVNIK("pridevnik", 'P'),
    PRISLOV("prislov", 'R'),
    ZAIMEK("zaimek", 'Z'),
    STEVNIK("stevnik", 'K'),
    PREDLOG("predlog", 'D'),
    VEZNIK("veznik", 'V'),
    CLENEK("clenek", 'L'),
    MEDMET("medmet", 'M'),
    OKRAJSAVA("okrajsava", 'O');

    /** Display label; doubles as the factory lookup key. */
    private final String name;
    /** Single-character JOS category code. */
    private final char wordType;

    GigafidaJosWordType(String name, char wordType) {
        this.name = name;
        this.wordType = wordType;
    }

    @Override
    public String toString() {
        return this.name;
    }

    public char getWordType() {
        return wordType;
    }

    /**
     * Resolves a label back to its constant (replaces the original
     * hand-written if-chain over all eleven constants).
     *
     * @param wType label; may be null
     * @return the matching constant, or null when unknown/null
     */
    public static GigafidaJosWordType factory(String wType) {
        if (wType != null) {
            for (GigafidaJosWordType candidate : values()) {
                if (candidate.toString().equals(wType)) {
                    return candidate;
                }
            }
        }
        return null;
    }
}

View File

@@ -0,0 +1,76 @@
package data;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.stream.Collectors;
import javafx.collections.FXCollections;
import javafx.collections.ObservableList;
/**
 * Gigafida taxonomy branches: Slovene label plus the dotted taxonomy code.
 */
public enum GigafidaTaxonomy {
    TISK("tisk", "T"),
    KNJIZNO("knjižno", "T.K"),
    LEPOSLOVNO("leposlovno", "T.K.L"),
    STROKOVNO("strokovno", "T.K.S"),
    PERIODICNO("periodično", "T.P"),
    CASOPIS("časopis", "T.P.C"),
    REVIJA("revija", "T.P.R"),
    INTERNET("internet", "I");

    /** Display label; doubles as the factory lookup key. */
    private final String name;
    /** Dotted taxonomy code used in the corpus XML. */
    private final String taxonomy;

    /** Cached labels for the GUI combo box, in declaration order. */
    private static final ObservableList<String> FOR_COMBO_BOX;

    static {
        ArrayList<String> values = Arrays.stream(GigafidaTaxonomy.values()).map(x -> x.name).collect(Collectors.toCollection(ArrayList::new));
        FOR_COMBO_BOX = FXCollections.observableArrayList(values);
    }

    GigafidaTaxonomy(String name, String taxonomy) {
        this.name = name;
        this.taxonomy = taxonomy;
    }

    @Override
    public String toString() {
        return this.name;
    }

    public String getTaxonomnyString() {
        return this.taxonomy;
    }

    /**
     * Resolves a label back to its constant (replaces the original
     * hand-written if-chain).
     *
     * @param tax label; may be null
     * @return the matching constant, or null when unknown/null
     */
    public static GigafidaTaxonomy factory(String tax) {
        if (tax != null) {
            for (GigafidaTaxonomy candidate : values()) {
                if (candidate.toString().equals(tax)) {
                    return candidate;
                }
            }
        }
        return null;
    }

    public static ObservableList<String> getForComboBox() {
        return FOR_COMBO_BOX;
    }
}

View File

@@ -0,0 +1,85 @@
package data;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.stream.Collectors;
import javafx.collections.FXCollections;
import javafx.collections.ObservableList;
/**
 * GOS (spoken corpus) taxonomy branches: Slovene label plus the dotted
 * taxonomy code.
 */
public enum GosTaxonomy {
    JAVNI("javni", "gos.T.J"),
    INFORMATIVNO_IZOBRAZEVALNI("informativno-izobraževalni", "gos.T.J.I"),
    RAZVEDRILNI("razvedrilni", "gos.T.J.R"),
    NEJAVNI("nejavni", "gos.T.N"),
    NEZASEBNI("nezasebni", "gos.T.N.N"),
    ZASEBNI("zasebni", "gos.T.N.Z"),
    OSEBNI_STIK("osebni stik", "gos.K.O"),
    TELEFON("telefon", "gos.K.P"),
    RADIO("radio", "gos.K.R"),
    TELEVIZIJA("televizija", "gos.K.T");

    /** Display label; doubles as the factory lookup key. */
    private final String name;
    /** Dotted taxonomy code used in the corpus XML. */
    private final String taxonomy;

    /** Cached labels for the GUI combo box, in declaration order. */
    private static final ObservableList<String> FOR_COMBO_BOX;

    static {
        ArrayList<String> values = Arrays.stream(GosTaxonomy.values()).map(x -> x.name).collect(Collectors.toCollection(ArrayList::new));
        FOR_COMBO_BOX = FXCollections.observableArrayList(values);
    }

    GosTaxonomy(String name, String taxonomy) {
        this.name = name;
        this.taxonomy = taxonomy;
    }

    @Override
    public String toString() {
        return this.name;
    }

    public String getTaxonomnyString() {
        return this.taxonomy;
    }

    /**
     * Resolves a label back to its constant (replaces the original
     * hand-written if-chain over all ten constants).
     *
     * @param tax label; may be null
     * @return the matching constant, or null when unknown/null
     */
    public static GosTaxonomy factory(String tax) {
        if (tax != null) {
            for (GosTaxonomy candidate : values()) {
                if (candidate.toString().equals(tax)) {
                    return candidate;
                }
            }
        }
        return null;
    }

    public static ObservableList<String> getForComboBox() {
        return FOR_COMBO_BOX;
    }
}

View File

@@ -0,0 +1,56 @@
package data;
import java.util.List;
import java.util.Map;
/**
 * One corpus sentence: its words, taxonomy code, and (for GOS) the
 * sentence type and extra header properties.
 */
public class Sentence {
    private List<Word> words;
    private String taksonomija;

    // GOS-specific metadata
    private String type;
    private Map<String, String> properties;

    public Sentence(List<Word> words) {
        this(words, null);
    }

    public Sentence(List<Word> words, String taksonomija) {
        this.words = words;
        this.taksonomija = taksonomija;
    }

    public Sentence(List<Word> words, String taksonomija, Map<String, String> properties) {
        this(words, taksonomija);
        this.properties = properties;
    }

    public Sentence(List<Word> words, String taksonomija, String type) {
        this(words, taksonomija);
        this.type = type;
    }

    public List<Word> getWords() {
        return this.words;
    }

    public String getTaxonomy() {
        return this.taksonomija;
    }

    /** @return a live view of the word range [indexFrom, indexTo) */
    public List<Word> getSublist(int indexFrom, int indexTo) {
        return this.words.subList(indexFrom, indexTo);
    }

    public String getType() {
        return this.type;
    }

    public void setType(String type) {
        this.type = type;
    }
}

View File

@@ -0,0 +1,16 @@
package data;
import java.io.File;
import java.util.Collection;
/**
 * Global application settings: tuning constants, GUI style snippets, and
 * (mutable) shared references to the selected corpus files and results path.
 */
public class Settings {
    /** Max number of sentences processed per batch. */
    public static final int CORPUS_SENTENCE_LIMIT = 50000;
    public static final boolean PRINT_LOG = false;

    // JavaFX inline styles for ok/not-ok accents
    public static final String FX_ACCENT_OK = "-fx-accent: forestgreen;";
    public static final String FX_ACCENT_NOK = "-fx-accent: red;";

    // NOTE(review): mutable global state shared across the app; kept public
    // static for backward compatibility with existing callers.
    public static Collection<File> corpus;
    public static File resultsFilePath;

    private Settings() {
        // constant holder — not instantiable
    }
}

View File

@@ -0,0 +1,299 @@
package data;
import java.io.UnsupportedEncodingException;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Pattern;
import util.Util;
import util.db.RDB;
/**
 * Legacy statistics holder (superseded by StatisticsNew for most analyses):
 * carries the analysis configuration, the concurrent result map, and the
 * generated result title for one run.
 */
public class Statistics {
    private CorpusType corpusType;
    private AnalysisLevel analysisLevel;

    private boolean useDB; // when true, partial results are flushed to the DB
    private RDB db;
    private boolean analysisProducedResults;

    private String taxonomy;
    private boolean taxonomyIsSet;

    private char JOSType;
    private boolean JOSTypeIsSet;

    private String resultTitle; // base name of the exported result file
    public Map<String, AtomicLong> result = new ConcurrentHashMap<>();

    // nGrams
    private int nGramLevel;
    private Integer skip; // null means "no skip"
    private CalculateFor cf;
    private List<Pattern> morphosyntacticFilter;

    // distributions
    private String distributionTaxonomy;
    private char distributionJosWordType;
    private boolean vcc; // vowel/consonant-sequence mode
    private Integer substringLength;

    // inflected JOS
    private String inflectedJosTaxonomy;

    // GOS
    boolean gosOrthMode;

    // šolar
    Map<String, Object> solarHeadBlockFilter;

    /**
     * Constructor for n-gram analyses.
     *
     * @param al analysis level
     * @param nGramLevel the n of the n-gram
     * @param skip skip distance; null and 0 both mean "no skip"
     * @param cf unit the n-grams are calculated for
     */
    public Statistics(AnalysisLevel al, int nGramLevel, Integer skip, CalculateFor cf) {
        String dateTime = LocalDateTime.now().format(DateTimeFormatter.ofPattern("dd.MM.yyyy_hh.mm"));
        this.cf = cf;
        this.analysisLevel = al;
        this.nGramLevel = nGramLevel;
        // normalize: a skip of 0 is stored as null so the title omits it
        this.skip = skip == null || skip == 0 ? null : skip;
        this.resultTitle = String.format("%s%d-gram_%s_%s",
                this.skip != null ? String.format("%d-%s-", skip, "skip") : "",
                nGramLevel,
                cf.toString(),
                dateTime);
    }

    /**
     * Constructor for word distributions, optionally restricted by taxonomy
     * and/or JOS word category.
     */
    public Statistics(AnalysisLevel al, Taxonomy distributionTaxonomy, GigafidaJosWordType distributionJosWordType, CalculateFor cf) {
        String dateTime = LocalDateTime.now().format(DateTimeFormatter.ofPattern("dd.MM.yyyy_hh.mm"));
        this.resultTitle = String.format("%s_%s_%s",
                distributionTaxonomy != null ? distributionTaxonomy.toString() : "",
                distributionJosWordType != null ? distributionJosWordType.toString() : "",
                dateTime);
        this.analysisLevel = al;
        this.cf = cf;
        this.distributionTaxonomy = distributionTaxonomy != null ? distributionTaxonomy.getTaxonomnyString() : null;
        this.taxonomyIsSet = distributionTaxonomy != null;
        this.JOSTypeIsSet = distributionJosWordType != null;
        this.distributionJosWordType = this.JOSTypeIsSet ? distributionJosWordType.getWordType() : ' ';
    }

    /**
     * Constructor for vowel/consonant (CVV) sequence distributions.
     */
    public Statistics(AnalysisLevel al, CalculateFor cf, Integer substringLength) {
        String dateTime = LocalDateTime.now().format(DateTimeFormatter.ofPattern("dd.MM.yyyy_hh.mm"));
        this.resultTitle = String.format("%s_%d_%s",
                "Distribucija zaporedij samoglasnikov in soglasnikov",
                substringLength,
                dateTime);
        this.analysisLevel = al;
        this.cf = cf;
        this.substringLength = substringLength;
        this.vcc = true;
    }

    /**
     * Constructor for inflected-JOS analyses.
     */
    public Statistics(AnalysisLevel al, Taxonomy inflectedJosTaxonomy) {
        String dateTime = LocalDateTime.now().format(DateTimeFormatter.ofPattern("dd.MM.yyyy_hh.mm"));
        // FIX: the title used to interpolate this.distributionTaxonomy, which is
        // necessarily null in this constructor, so the taxonomy part of the
        // title was always empty; use the constructor argument instead.
        this.resultTitle = String.format("InflectedJOS_%s_%s",
                inflectedJosTaxonomy != null ? inflectedJosTaxonomy.toString() : "",
                dateTime);
        this.analysisLevel = al;
        this.inflectedJosTaxonomy = inflectedJosTaxonomy != null ? inflectedJosTaxonomy.getTaxonomnyString() : null;
        this.taxonomyIsSet = inflectedJosTaxonomy != null;
    }

    public Integer getSkip() {
        return skip;
    }

    public Integer getSubstringLength() {
        return substringLength;
    }

    public String getInflectedJosTaxonomy() {
        return inflectedJosTaxonomy;
    }

    public void setSubstringLength(Integer substringLength) {
        this.substringLength = substringLength;
    }

    public boolean isVcc() {
        return vcc;
    }

    public void setVcc(boolean vcc) {
        this.vcc = vcc;
    }

    public String getDistributionTaxonomy() {
        return distributionTaxonomy;
    }

    public void setDistributionTaxonomy(String distributionTaxonomy) {
        this.distributionTaxonomy = distributionTaxonomy;
    }

    public char getDistributionJosWordType() {
        return distributionJosWordType;
    }

    public void setDistributionJosWordType(char distributionJosWordType) {
        this.distributionJosWordType = distributionJosWordType;
    }

    /**
     * Converts the GUI filter strings to regex patterns ('*' acts as a
     * single-character wildcard and is mapped to '.').
     */
    public void setMorphosyntacticFilter(List<String> morphosyntacticFilter) {
        // change filter strings to regex patterns
        this.morphosyntacticFilter = new ArrayList<>();
        for (String s : morphosyntacticFilter) {
            this.morphosyntacticFilter.add(Pattern.compile(s.replaceAll("\\*", ".")));
        }
    }

    public List<Pattern> getMsd() {
        return morphosyntacticFilter;
    }

    public Map<String, AtomicLong> getResult() {
        return result;
    }

    public void setTaxonomy(String taxonomy) {
        this.taxonomy = taxonomy;
    }

    public void setTaxonomyIsSet(boolean taxonomyIsSet) {
        this.taxonomyIsSet = taxonomyIsSet;
    }

    public char getJOSType() {
        return JOSType;
    }

    public void setJOSType(char JOSType) {
        this.JOSType = JOSType;
    }

    public boolean isJOSTypeSet() {
        return JOSTypeIsSet;
    }

    // NOTE(review): overload sets the "is set" flag, not the type itself;
    // name kept for backward compatibility with existing callers.
    public void setJOSType(boolean JOSTypeIsSet) {
        this.JOSTypeIsSet = JOSTypeIsSet;
    }

    // Legacy export path — intentionally disabled; StatisticsNew handles
    // saving results. Kept for reference.
    public void saveResultToDisk(int... limit) throws UnsupportedEncodingException {
        // Set<Pair<String, Map<String, Long>>> stats = new HashSet<>();
        //
        // if (useDB) {
        // result = db.getDump();
        // db.delete();
        // }
        //
        // // if no results and nothing to save, return false
        // if (!(result.size() > 0)) {
        // analysisProducedResults = false;
        // return;
        // } else {
        // analysisProducedResults = true;
        // }
        //
        // stats.add(ImmutablePair.of(resultTitle, getSortedResult(result, Util.getValidInt(limit))));
        // Export.SetToCSV(stats);
    }

    // private Map<String, Integer> getSortedResultInflected(Map map) {
    // // first convert to <String, Integer>
    // Map<String, Integer> m = Util.sortByValue(Util.atomicInt2StringAndInt(map), 0);
    //
    // Map<String, Integer> sortedM = new TreeMap<>();
    //
    // sortedM.putAll(m);
    //
    // return sortedM;
    // }

    /** Sorts the result map by count, optionally truncated to {@code limit} rows. */
    private Map<String, Long> getSortedResult(Map<String, AtomicLong> map, int limit) {
        return Util.sortByValue(Util.atomicInt2StringAndInt(map), limit);
    }

    public String getTaxonomy() {
        return taxonomy;
    }

    public boolean isTaxonomySet() {
        return taxonomyIsSet;
    }

    public int getnGramLevel() {
        return nGramLevel;
    }

    public CalculateFor getCf() {
        return cf;
    }

    public AnalysisLevel getAnalysisLevel() {
        return analysisLevel;
    }

    public CorpusType getCorpusType() {
        return corpusType;
    }

    public void setCorpusType(CorpusType corpusType) {
        this.corpusType = corpusType;
    }

    public boolean isGosOrthMode() {
        return gosOrthMode;
    }

    public void setGosOrthMode(boolean gosOrthMode) {
        this.gosOrthMode = gosOrthMode;
    }

    public Map<String, Object> getSolarHeadBlockFilter() {
        return solarHeadBlockFilter;
    }

    public void setSolarHeadBlockFilter(Map<String, Object> solarHeadBlockFilter) {
        this.solarHeadBlockFilter = solarHeadBlockFilter;
    }

    public boolean isUseDB() {
        return useDB;
    }

    /** Enables DB-backed accumulation, lazily creating the DB connection. */
    public void setUseDB(boolean useDB) {
        if (useDB && db == null) {
            db = new RDB();
        }
        this.useDB = useDB;
    }

    /**
     * Stores results from this batch to a database and clears results map
     */
    public void storeTmpResultsToDB() {
        try {
            db.writeBatch(result);
            result = new ConcurrentHashMap<>();
        } catch (UnsupportedEncodingException e) {
            // NOTE(review): this class has no logger; kept the original
            // printStackTrace rather than silently changing error handling.
            e.printStackTrace();
        }
    }

    public boolean isAnalysisProducedResults() {
        return analysisProducedResults;
    }
}

View File

@@ -0,0 +1,409 @@
package data;
import static gui.ValidationUtil.*;
import java.io.UnsupportedEncodingException;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import alg.inflectedJOS.WordFormation;
import data.Enums.WordLevelType;
import javafx.collections.ObservableList;
import util.Export;
import util.Util;
import util.db.RDB;
@SuppressWarnings("Duplicates")
public class StatisticsNew {
public final static Logger logger = LogManager.getLogger(StatisticsNew.class);
private Corpus corpus;
private Filter filter;
private String resultTitle;
private Map<String, AtomicLong> result;
private Object[][] resultCustom; // for when calculating percentages that don't add up to 100%
private Map<String, ConcurrentHashMap<String, AtomicLong>> resultNestedSuffix;
private Map<String, ConcurrentHashMap<String, AtomicLong>> resultNestedPrefix;
private boolean useDB;
private RDB db;
private boolean analysisProducedResults;
private LocalDateTime time;
/**
 * Builds a statistics run for the given corpus and filter, choosing the
 * result-map layout from the analysis level and precomputing the title.
 *
 * @param corpus corpus configuration
 * @param filter analysis configuration
 * @param useDB when true, partial results are flushed to an RDB instance
 */
public StatisticsNew(Corpus corpus, Filter filter, boolean useDB) {
    this.corpus = corpus;
    this.filter = filter;
    this.useDB = useDB;
    if (useDB) {
        db = new RDB();
    }
    // WORD_LEVEL keeps two nested maps (suffixes/prefixes);
    // every other level accumulates into one flat map.
    if (filter.getAl() == AnalysisLevel.WORD_LEVEL) {
        resultNestedSuffix = new ConcurrentHashMap<>();
        resultNestedPrefix = new ConcurrentHashMap<>();
    } else {
        result = new ConcurrentHashMap<>();
    }
    resultTitle = generateResultTitle();
    logger.debug(toString());
}
/**
 * Builds the title of the exported result file.
 * Components, joined with "_": analysis level (or the special labels
 * "Crke"/"Besede" for 0/1-grams), corpus type, calculate-for, n-gram and
 * skip values, and a timestamp.
 *
 * @return generated result title
 */
private String generateResultTitle() {
    String separator = "_";
    StringBuilder title = new StringBuilder();
    if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
        Integer ngramLevel = filter.getNgramValue();
        if (ngramLevel == 0) {
            // letter-level analysis
            title.append("Crke")
                    .append(separator)
                    .append(corpus.getCorpusType().toString())
                    .append(separator);
        } else if (ngramLevel == 1) {
            // single-word analysis
            title.append("Besede")
                    .append(separator)
                    .append(corpus.getCorpusType().toString())
                    .append(separator);
        } else {
            title.append(filter.getAl().toString())
                    .append(separator)
                    .append(corpus.getCorpusType().toString())
                    .append(separator)
                    .append(filter.getCalculateFor().toString())
                    .append(separator)
                    .append(filter.getNgramValue()).append("-gram")
                    .append(separator)
                    .append(filter.getSkipValue()).append("-preskok")
                    .append(separator);
        }
        // TODO: assure skip is not null but zero
    } else {
        title.append(filter.getAl().toString()) // analysis level
                .append(separator)
                .append(corpus.getCorpusType().toString())
                .append(separator);
    }
    // skip value, msd, taxonomy and cvv+length are not yet encoded here
    this.time = this.time != null ? this.time : LocalDateTime.now();
    title.append(time.format(DateTimeFormatter.ofPattern("dd.MM.yyyy_hh.mm.ss")));
    return title.toString();
}
/** @return true if the last save/analysis produced at least one result row */
public boolean isAnalysisProducedResults() {
	return analysisProducedResults;
}

/** Marks whether the analysis produced any results (set by the save methods). */
public void setAnalysisProducedResults(boolean analysisProducedResults) {
	this.analysisProducedResults = analysisProducedResults;
}
/**
 * Human-readable dump of this statistic's configuration (corpus + file count,
 * storage mode and filter) — used for debug logging.
 */
public String toString() {
	final String bullet = "\n\t- ";
	String corpusLine = corpus.getCorpusType().toString()
			+ String.format(" (%d files)", corpus.getDetectedCorpusFiles().size());
	String storageLine = useDB ? "use DB" : "run in memory";

	return bullet + "Statistic properties:"
			+ bullet + corpusLine
			+ bullet + storageLine
			+ bullet + filter.toString();
}
/** @return the export title generated at construction (or regenerated on save) */
public String getResultTitle() {
	return resultTitle;
}
// ****************************************
// ***************** util *****************
// ****************************************

/**
 * Stores results from this batch to a database and clears the in-memory results map.
 * Best-effort: on failure the error is logged and the current batch stays in memory.
 */
public void storeTmpResultsToDB() {
	try {
		db.writeBatch(result);
		// start a fresh batch; the flushed map is now owned by the DB layer
		result = new ConcurrentHashMap<>();
	} catch (UnsupportedEncodingException e) {
		logger.error("Store tmp results to DB", e);
	}
}
/** @return the analysis configuration this statistic was built with */
public Filter getFilter() {
	return filter;
}

/** @return the corpus this statistic runs over */
public Corpus getCorpus() {
	return corpus;
}
/**
 * Sorts the accumulated results (optionally truncated to {@code limit}) and writes
 * them to a CSV file in the chosen results location.
 *
 * @param limit optional maximum number of exported rows
 * @return true if anything was written, false when there were no results
 * @throws UnsupportedEncodingException propagated from the CSV export
 */
public boolean saveResultToDisk(int... limit) throws UnsupportedEncodingException {
	if (useDB) {
		// pull everything accumulated in the DB back into memory, then drop the DB
		result = db.getDump();
		db.delete();
	}

	// nothing to save -> report and bail out
	analysisProducedResults = result.size() > 0;
	if (!analysisProducedResults) {
		return false;
	}

	Map<String, Long> sorted = getSortedResult(result, Util.getValidInt(limit));
	Set<Pair<String, Map<String, Long>>> stats = new HashSet<>();
	stats.add(ImmutablePair.of(resultTitle, sorted));

	Export.SetToCSV(stats, corpus.getChosenResultsLocation(), headerInfoBlock());
	return true;
}
/**
 * Exports nested (suffix/prefix) WORD_LEVEL results to CSV.
 *
 * @param limit optional maximum number of rows per nested map
 * @return true if anything was written, false when both nested maps were empty
 * @throws UnsupportedEncodingException propagated from the CSV export
 */
public boolean saveResultNestedToDisk(int... limit) throws UnsupportedEncodingException {
	resultTitle = generateResultTitle();

	if (useDB) {
		result = db.getDump();
		db.delete();
	}

	int validLimit = Util.getValidInt(limit);
	Map<WordLevelType, Map<String, Map<String, Long>>> results = new HashMap<>();
	if (!isEmpty(resultNestedSuffix)) {
		results.put(WordLevelType.SUFFIX, sortNestedMap(resultNestedSuffix, validLimit));
	}
	if (!isEmpty(resultNestedPrefix)) {
		results.put(WordLevelType.PREFIX, sortNestedMap(resultNestedPrefix, validLimit));
	}

	// nothing to save -> report and bail out
	analysisProducedResults = results.size() > 0;
	if (!analysisProducedResults) {
		return false;
	}

	Export.nestedMapToCSV(resultTitle, results, corpus.getChosenResultsLocation(), headerInfoBlock());
	return true;
}
/**
 * Switches the analysis level to WORD_FORMATION, recalculates derived statistics
 * and writes the custom result table to CSV.
 *
 * @return true if anything was written, false when there were no results
 * @throws UnsupportedEncodingException propagated from the CSV export
 */
public boolean recalculateAndSaveResultToDisk() throws UnsupportedEncodingException {
	filter.setAl(AnalysisLevel.WORD_FORMATION);
	resultTitle = generateResultTitle();

	if (useDB) {
		result = db.getDump();
		db.delete();
	}

	// nothing to save -> report and bail out
	analysisProducedResults = result.size() > 0;
	if (!analysisProducedResults) {
		return false;
	}

	// fills resultCustom from the raw counts in result
	WordFormation.calculateStatistics(this);

	Export.SetToCSV(resultTitle, resultCustom, corpus.getChosenResultsLocation(), headerInfoBlock());
	return true;
}
/**
 * Sorts every inner counter map of {@code nestedMap} by value, keeping at most
 * {@code limit} entries per outer key.
 */
private Map<String, Map<String, Long>> sortNestedMap(Map<String, ConcurrentHashMap<String, AtomicLong>> nestedMap, int limit) {
	Map<String, Map<String, Long>> sorted = new HashMap<>();
	for (Map.Entry<String, ConcurrentHashMap<String, AtomicLong>> entry : nestedMap.entrySet()) {
		sorted.put(entry.getKey(), getSortedResult(entry.getValue(), Util.getValidInt(limit)));
	}
	return sorted;
}
/**
 * Converts AtomicLong counters to plain Longs and sorts them by value
 * (presumably descending frequency — see Util.sortByValue), keeping at most
 * {@code limit} entries.
 */
private Map<String, Long> getSortedResult(Map<String, AtomicLong> map, int limit) {
	return Util.sortByValue(Util.atomicInt2StringAndInt(map), limit);
}
/**
 * Thread-safe increment of the occurrence counter for {@code o};
 * the counter is created on first sight of the key.
 *
 * @param o observed key (word / lemma / n-gram, depending on the filter)
 */
public void updateResults(String o) {
	// single atomic lookup: create the counter on first occurrence, then increment.
	// Replaces the previous putIfAbsent + result.get(o) double-lookup with the
	// equivalent computeIfAbsent idiom (same counts, one map access).
	result.computeIfAbsent(o, k -> new AtomicLong(0)).incrementAndGet();
}
/** @return the live (mutable) flat counter map */
public Map<String, AtomicLong> getResult() {
	return result;
}

/** @return custom tabular results, filled by WordFormation.calculateStatistics */
public Object[][] getResultCustom() {
	return resultCustom;
}

/** Sets the custom tabular results (called by WordFormation.calculateStatistics). */
public void setResultCustom(Object[][] resultCustom) {
	this.resultCustom = resultCustom;
}
/**
 * Routes a nested (word-level) observation to the suffix or prefix result map.
 *
 * @param type        SUFFIX or PREFIX; any other value is silently ignored
 * @param key         outer grouping key
 * @param stringValue observed value whose counter is incremented
 */
public void updateResultsNested(WordLevelType type, String key, String stringValue) {
	// removed an unused local map declaration that was never read or assigned
	if (type == WordLevelType.SUFFIX) {
		updateResultsNestedSuffix(key, stringValue);
	} else if (type == WordLevelType.PREFIX) {
		updateResultsNestedPrefix(key, stringValue);
	}
}
/**
 * Thread-safe increment of {@code resultNestedSuffix[key][stringValue]};
 * both the inner map and the counter are created on demand.
 */
public void updateResultsNestedSuffix(String key, String stringValue) {
	// computeIfAbsent collapses the previous containsKey / putIfAbsent duplication
	// (two nearly identical branches) into one atomic lookup of the inner map
	ConcurrentHashMap<String, AtomicLong> inner =
			resultNestedSuffix.computeIfAbsent(key, k -> new ConcurrentHashMap<>());

	AtomicLong existing = inner.putIfAbsent(stringValue, new AtomicLong(1));
	if (existing != null) {
		existing.incrementAndGet();
	}
}
/**
 * Thread-safe increment of {@code resultNestedPrefix[key][stringValue]};
 * both the inner map and the counter are created on demand.
 */
public void updateResultsNestedPrefix(String key, String stringValue) {
	// computeIfAbsent collapses the previous containsKey / putIfAbsent duplication
	// (two nearly identical branches) into one atomic lookup of the inner map
	ConcurrentHashMap<String, AtomicLong> inner =
			resultNestedPrefix.computeIfAbsent(key, k -> new ConcurrentHashMap<>());

	AtomicLong existing = inner.putIfAbsent(stringValue, new AtomicLong(1));
	if (existing != null) {
		existing.incrementAndGet();
	}
}
/**
 * Builds the ordered key/value header block written at the top of every CSV export
 * (corpus, date, analysis parameters, taxonomy and optional Solar filters).
 *
 * @return insertion-ordered header entries
 */
private LinkedHashMap<String, String> headerInfoBlock() {
	LinkedHashMap<String, String> info = new LinkedHashMap<>();

	info.put("Korpus:", corpus.getCorpusType().toString());
	// fixed: "HH" (hour-of-day 0-23) instead of "hh" (clock-hour 1-12) —
	// afternoon exports were previously stamped with morning hours
	info.put("Datum:", time.format(DateTimeFormatter.ofPattern("dd.MM.yyyy HH:mm")));

	// the two former `if (al == STRING_LEVEL)` blocks were merged; the entry
	// insertion order is unchanged
	if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
		Integer ngramLevel = filter.getNgramValue();

		if (ngramLevel == 0) {
			info.put("Analiza:", "Črke");
		} else if (ngramLevel == 1) {
			// fixed key: was "Analiza" (missing colon), inconsistent with every other entry
			info.put("Analiza:", "Besede");
		} else {
			info.put("Analiza:", filter.getAl().toString());
		}

		// n-gram level
		if (ngramLevel > 1) {
			info.put("n-gram nivo:", String.valueOf(ngramLevel));
		} else if (ngramLevel == 1) {
			info.put("n-gram nivo:", "nivo besed");
		} else {
			info.put("n-gram nivo:", "nivo črk");
		}

		// skip value is only meaningful for true n-grams
		if (ngramLevel > 1) {
			info.put("Skip:", isNotEmpty(filter.getSkipValue()) ? filter.getSkipValue().toString() : "0");
		}

		// calculate for
		info.put("Izračunaj za:", filter.getCalculateFor().toString());

		// msd patterns, space-separated
		if (!isEmpty(filter.getMsd())) {
			StringBuilder msdPattern = new StringBuilder();
			for (Pattern pattern : filter.getMsd()) {
				msdPattern.append(pattern.toString()).append(" ");
			}
			info.put("MSD:", msdPattern.toString());
		}

		// raw taxonomy codes from the filter
		if (!isEmpty(filter.getTaxonomy())) {
			info.put("Taksonomija:", StringUtils.join(filter.getTaxonomy(), ", "));
		}
	} else {
		info.put("Analiza:", filter.getAl().toString());
	}

	if (isNotEmpty(filter.getTaxonomy()) && Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
		ArrayList<String> tax = Tax.getTaxonomyForInfo(corpus.getCorpusType(), filter.getTaxonomy());

		info.put("Taksonomija: ", "");
		String sep = "";
		for (String s : tax) {
			// each taxonomy name gets its own row; map keys must be unique, so an
			// ever-growing run of spaces is used as an invisible key discriminator
			info.put(sep = sep + " ", s);
		}
	}

	if (corpus.getCorpusType() == CorpusType.SOLAR) {
		HashMap<String, ObservableList<String>> filters = corpus.getSolarFilters();

		if (!isEmpty(filters)) {
			info.put("Dodatni filtri: ", "");
			for (Map.Entry<String, ObservableList<String>> f : filters.entrySet()) {
				info.put(f.getKey(), StringUtils.join(f.getValue(), ", "));
			}
		}
	}

	return info;
}
}

175
src/main/java/data/Tax.java Normal file
View File

@@ -0,0 +1,175 @@
package data;
import java.util.*;
import java.util.stream.Collectors;
import gui.ValidationUtil;
import javafx.collections.FXCollections;
import javafx.collections.ObservableList;
/**
 * Static helper around the built-in GigaFida and GOS taxonomies: taxonomy-code ->
 * human-readable-name mappings, plus converters used by the GUI combo boxes and
 * the CSV header block.
 */
public class Tax {
	// code -> display name; LinkedHashMap preserves the canonical display order
	private static LinkedHashMap<String, String> GIGAFIDA_TAXONOMY;
	private static LinkedHashMap<String, String> GOS_TAXONOMY;
	// corpora for which a built-in taxonomy exists (CCKRES reuses the GigaFida one)
	private static final HashSet<CorpusType> corpusTypesWithTaxonomy = new HashSet<>(Arrays.asList(CorpusType.GIGAFIDA, CorpusType.GOS, CorpusType.CCKRES));

	static {
		// GIGAFIDA ----------------------------
		GIGAFIDA_TAXONOMY = new LinkedHashMap<>();
		GIGAFIDA_TAXONOMY.put("SSJ.T", "tisk");
		GIGAFIDA_TAXONOMY.put("SSJ.T.K", "tisk-knjižno");
		GIGAFIDA_TAXONOMY.put("SSJ.T.K.L", "tisk-knjižno-leposlovno");
		GIGAFIDA_TAXONOMY.put("SSJ.T.K.S", "tisk-knjižno-strokovno");
		GIGAFIDA_TAXONOMY.put("SSJ.T.P", "tisk-periodično");
		GIGAFIDA_TAXONOMY.put("SSJ.T.P.C", "tisk-periodično-časopis");
		GIGAFIDA_TAXONOMY.put("SSJ.T.P.R", "tisk-periodično-revija");
		GIGAFIDA_TAXONOMY.put("SSJ.T.D", "tisk-drugo");
		GIGAFIDA_TAXONOMY.put("SSJ.I", "internet");
		GIGAFIDA_TAXONOMY.put("Ft.P", "prenosnik");
		GIGAFIDA_TAXONOMY.put("Ft.P.G", "prenosnik-govorni");
		GIGAFIDA_TAXONOMY.put("Ft.P.E", "prenosnik-elektronski");
		GIGAFIDA_TAXONOMY.put("Ft.P.P", "prenosnik-pisni");
		GIGAFIDA_TAXONOMY.put("Ft.P.P.O", "prenosnik-pisni-objavljeno");
		GIGAFIDA_TAXONOMY.put("Ft.P.P.O.K", "prenosnik-pisni-objavljeno-knjižno");
		GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P", "prenosnik-pisni-objavljeno-periodično");
		GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C", "prenosnik-pisni-objavljeno-periodično-časopisno");
		GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C.D", "prenosnik-pisni-objavljeno-periodično-časopisno-dnevno");
		GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C.V", "prenosnik-pisni-objavljeno-periodično-časopisno-večkrat tedensko");
		GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.C.T", "prenosnik-pisni-objavljeno-periodično-časopisno-tedensko");
		GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R", "prenosnik-pisni-objavljeno-periodično-revialno");
		GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.T", "prenosnik-pisni-objavljeno-periodično-revialno-tedensko");
		GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.S", "prenosnik-pisni-objavljeno-periodično-revialno-štirinajstdnevno");
		GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.M", "prenosnik-pisni-objavljeno-periodično-revialno-mesečno");
		GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.D", "prenosnik-pisni-objavljeno-periodično-revialno-redkeje kot na mesec");
		GIGAFIDA_TAXONOMY.put("Ft.P.P.O.P.R.O", "prenosnik-pisni-objavljeno-periodično-revialno-občasno");
		GIGAFIDA_TAXONOMY.put("Ft.P.P.N", "prenosnik-pisni-neobjavljeno");
		GIGAFIDA_TAXONOMY.put("Ft.P.P.N.J", "prenosnik-pisni-neobjavljeno-javno");
		GIGAFIDA_TAXONOMY.put("Ft.P.P.N.I", "prenosnik-pisni-neobjavljeno-interno");
		GIGAFIDA_TAXONOMY.put("Ft.P.P.N.Z", "prenosnik-pisni-neobjavljeno-zasebno");
		GIGAFIDA_TAXONOMY.put("Ft.Z", "zvrst");
		GIGAFIDA_TAXONOMY.put("Ft.Z.U", "zvrst-umetnostna");
		GIGAFIDA_TAXONOMY.put("Ft.Z.U.P", "zvrst-umetnostna-pesniška");
		GIGAFIDA_TAXONOMY.put("Ft.Z.U.R", "zvrst-umetnostna-prozna");
		GIGAFIDA_TAXONOMY.put("Ft.Z.U.D", "zvrst-umetnostna-dramska");
		GIGAFIDA_TAXONOMY.put("Ft.Z.N", "zvrst-neumetnostna");
		GIGAFIDA_TAXONOMY.put("Ft.Z.N.S", "zvrst-neumetnostna-strokovna");
		GIGAFIDA_TAXONOMY.put("Ft.Z.N.S.H", "zvrst-neumetnostna-strokovna-humanistična in družboslovna");
		GIGAFIDA_TAXONOMY.put("Ft.Z.N.S.N", "zvrst-neumetnostna-strokovna-naravoslovna in tehnična");
		GIGAFIDA_TAXONOMY.put("Ft.Z.N.N", "zvrst-neumetnostna-nestrokovna");
		GIGAFIDA_TAXONOMY.put("Ft.Z.N.P", "zvrst-neumetnostna-pravna");
		GIGAFIDA_TAXONOMY.put("Ft.L", "zvrst-lektorirano");
		GIGAFIDA_TAXONOMY.put("Ft.L.D", "zvrst-lektorirano-da");
		GIGAFIDA_TAXONOMY.put("Ft.L.N", "zvrst-lektorirano-ne");

		// GOS ----------------------------------
		GOS_TAXONOMY = new LinkedHashMap<>();
		GOS_TAXONOMY.put("gos.T", "diskurz");
		GOS_TAXONOMY.put("gos.T.J", "diskurz-javni");
		GOS_TAXONOMY.put("gos.T.J.I", "diskurz-javni-informativno-izobraževalni");
		GOS_TAXONOMY.put("gos.T.J.R", "diskurz-javni-razvedrilni");
		GOS_TAXONOMY.put("gos.T.N", "diskurz-nejavni");
		GOS_TAXONOMY.put("gos.T.N.N", "diskurz-nejavni-nezasebni");
		GOS_TAXONOMY.put("gos.T.N.Z", "diskurz-nejavni-zasebni");
		GOS_TAXONOMY.put("gos.S", "situacija");
		GOS_TAXONOMY.put("gos.S.R", "situacija-radio");
		GOS_TAXONOMY.put("gos.S.T", "situacija-televizija");
	}

	/**
	 * Returns the whole default taxonomy for the specified corpus type
	 */
	public static ObservableList<String> getTaxonomyForComboBox(CorpusType corpusType) {
		if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES) {
			return FXCollections.observableArrayList(GIGAFIDA_TAXONOMY.values());
		} else if (corpusType == CorpusType.GOS) {
			return FXCollections.observableArrayList(GOS_TAXONOMY.values());
		}
		// corpora without a built-in taxonomy get an empty list
		return FXCollections.observableArrayList(new ArrayList<>());
	}

	/**
	 * Returns taxonomy names only for items found in headers
	 */
	public static ObservableList<String> getTaxonomyForComboBox(CorpusType corpusType, HashSet<String> foundTax) {
		LinkedHashMap<String, String> tax = new LinkedHashMap<>();

		if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES) {
			tax = GIGAFIDA_TAXONOMY;
		} else if (corpusType == CorpusType.GOS) {
			tax = GOS_TAXONOMY;
		}

		ArrayList<String> taxForCombo = new ArrayList<>();

		// assures same relative order
		for (String t : tax.keySet()) {
			if (foundTax.contains(t)) {
				taxForCombo.add(tax.get(t));
			}
		}

		return FXCollections.observableArrayList(taxForCombo);
	}

	/** @return the set of corpus types that ship with a built-in taxonomy */
	public static HashSet<CorpusType> getCorpusTypesWithTaxonomy() {
		return corpusTypesWithTaxonomy;
	}

	/**
	 * Maps human-readable taxonomy names back to their codes.
	 * NOTE(review): a name not present in the taxonomy yields a null entry in the
	 * result (Collectors.toMap inversion, then plain get) — verify callers tolerate this.
	 */
	public static ArrayList<String> getTaxonomyCodes(ArrayList<String> taxonomyNames, CorpusType corpusType) {
		ArrayList<String> result = new ArrayList<>();

		if (ValidationUtil.isEmpty(taxonomyNames)) {
			return result;
		}

		LinkedHashMap<String, String> tax = new LinkedHashMap<>();

		if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES) {
			tax = GIGAFIDA_TAXONOMY;
		} else if (corpusType == CorpusType.GOS) {
			tax = GOS_TAXONOMY;
		}

		// for easier lookup
		Map<String, String> taxInversed = tax.entrySet()
				.stream()
				.collect(Collectors.toMap(Map.Entry::getValue, Map.Entry::getKey));

		for (String taxonomyName : taxonomyNames) {
			result.add(taxInversed.get(taxonomyName));
		}

		return result;
	}

	/**
	 * Returns a list of proper names for codes
	 *
	 * @param corpusType corpus whose taxonomy should be used
	 * @param taxonomy   taxonomy codes to translate
	 *
	 * @return display names in the same order as the input codes
	 */
	public static ArrayList<String> getTaxonomyForInfo(CorpusType corpusType, ArrayList<String> taxonomy) {
		LinkedHashMap<String, String> tax = new LinkedHashMap<>();

		if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES) {
			tax = GIGAFIDA_TAXONOMY;
		} else if (corpusType == CorpusType.GOS) {
			tax = GOS_TAXONOMY;
		}

		ArrayList<String> result = new ArrayList<>();

		for (String t : taxonomy) {
			result.add(tax.get(t));
		}

		return result;
	}
}

View File

@@ -0,0 +1,171 @@
package data;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.stream.Collectors;
import javafx.collections.FXCollections;
import javafx.collections.ObservableList;
/**
 * Taxonomy branches for the supported corpora (GOS and GigaFida), each carrying a
 * display name, a taxonomy code and the corpus it belongs to.
 *
 * NOTE(review): many GigaFida constants still carry the placeholder values
 * ("opis", "identifikator") and are NOT resolvable through {@link #factory(String)} —
 * presumably unfinished; verify before relying on them.
 */
public enum Taxonomy {
	// GOS
	JAVNI("javni", "T.J", "gos"),
	INFORMATIVNO_IZOBRAZEVALNI("informativno-izobraževalni", "T.J.I", "gos"),
	RAZVEDRILNI("razvedrilni", "T.J.R", "gos"),
	NEJAVNI("nejavni", "T.N", "gos"),
	NEZASEBNI("nezasebni", "T.N.N", "gos"),
	ZASEBNI("zasebni", "T.N.Z", "gos"),
	OSEBNI_STIK("osebni stik", "K.O", "gos"),
	TELEFON("telefon", "K.P", "gos"),
	RADIO("radio", "K.R", "gos"),
	TELEVIZIJA("televizija", "K.T", "gos"),

	// Gigafida
	KNJIZNO("knjižno", "T.K", "gigafida"),
	LEPOSLOVNO("leposlovno", "T.K.L", "gigafida"),
	STROKOVNO("strokovno", "T.K.S", "gigafida"),
	PERIODICNO("periodično", "T.P", "gigafida"),
	CASOPIS("časopis", "T.P.C", "gigafida"),
	REVIJA("revija", "T.P.R", "gigafida"),
	INTERNET("internet", "I", "gigafida"),
	SSJ_TISK("tisk", "SSJ.T", "gigafida"),
	// placeholder constants below: name/code not yet filled in
	SSJ_KNJIZNO("opis", "identifikator", "gigafida"),
	SSJ_LEPOSLOVNO("opis", "identifikator", "gigafida"),
	SSJ_STROKOVNO("opis", "identifikator", "gigafida"),
	SSJ_PERIODICNO("opis", "identifikator", "gigafida"),
	SSJ_CASOPIS("opis", "identifikator", "gigafida"),
	SSJ_REVIJA("opis", "identifikator", "gigafida"),
	SSJ_DRUGO("opis", "identifikator", "gigafida"),
	SSJ_INTERNET("opis", "identifikator", "gigafida"),
	FT_P_PRENOSNIK("opis", "identifikator", "gigafida"),
	FT_P_GOVORNI("opis", "identifikator", "gigafida"),
	FT_P_ELEKTRONSKI("opis", "identifikator", "gigafida"),
	FT_P_PISNI("opis", "identifikator", "gigafida"),
	FT_P_OBJAVLJENO("opis", "identifikator", "gigafida"),
	FT_P_KNJIZNO("opis", "identifikator", "gigafida"),
	FT_P_PERIODICNO("opis", "identifikator", "gigafida"),
	FT_P_CASOPISNO("opis", "identifikator", "gigafida"),
	FT_P_DNEVNO("opis", "identifikator", "gigafida"),
	FT_P_VECKRAT_TEDENSKO("opis", "identifikator", "gigafida"),
	// FT_P_TEDENSKO("opis", "identifikator", "gigafida"),
	FT_P_REVIALNO("opis", "identifikator", "gigafida"),
	FT_P_TEDENSKO("opis", "identifikator", "gigafida"),
	FT_P_STIRINAJSTDNEVNO("opis", "identifikator", "gigafida"),
	FT_P_MESECNO("opis", "identifikator", "gigafida"),
	FT_P_REDKEJE_KOT_MESECNO("opis", "identifikator", "gigafida"),
	FT_P_OBCASNO("opis", "identifikator", "gigafida"),
	FT_P_NEOBJAVLJENO("opis", "identifikator", "gigafida"),
	FT_P_JAVNO("opis", "identifikator", "gigafida"),
	FT_P_INTERNO("opis", "identifikator", "gigafida"),
	FT_P_ZASEBNO("opis", "identifikator", "gigafida"),
	FT_ZVRST("opis", "identifikator", "gigafida"),
	FT_UMETNOSTNA("opis", "identifikator", "gigafida"),
	FT_PESNISKA("opis", "identifikator", "gigafida"),
	FT_PROZNA("opis", "identifikator", "gigafida"),
	FT_DRAMSKA("opis", "identifikator", "gigafida"),
	FT_NEUMETNOSTNA("opis", "identifikator", "gigafida"),
	FT_STROKOVNA("opis", "identifikator", "gigafida"),
	FT_HID("opis", "identifikator", "gigafida"),
	FT_NIT("opis", "identifikator", "gigafida"),
	FT_NESTROKOVNA("opis", "identifikator", "gigafida"),
	FT_PRAVNA("opis", "identifikator", "gigafida"),
	FT_LEKTORIRANO("opis", "identifikator", "gigafida"),
	FT_DA("opis", "identifikator", "gigafida"),
	FT_NE("opis", "identifikator", "gigafida");

	private final String name;     // human-readable display name
	private final String taxonomy; // taxonomy code
	private final String corpus;   // owning corpus ("gos" or "gigafida")

	Taxonomy(String name, String taxonomy, String corpusType) {
		this.name = name;
		this.taxonomy = taxonomy;
		this.corpus = corpusType;
	}

	public String toString() {
		return this.name;
	}

	// NOTE(review): method name is misspelled ("Taxonomny") but is part of the public
	// API — renaming would break callers
	public String getTaxonomnyString() {
		return this.taxonomy;
	}

	/**
	 * Resolves a display name back to its constant. Deliberately covers only the GOS
	 * constants and the filled-in Gigafida ones — placeholder constants ("opis") are
	 * not resolvable. Returns null for unknown or null input.
	 */
	public static Taxonomy factory(String tax) {
		if (tax != null) {
			// GOS
			if (JAVNI.toString().equals(tax)) {
				return JAVNI;
			}
			if (INFORMATIVNO_IZOBRAZEVALNI.toString().equals(tax)) {
				return INFORMATIVNO_IZOBRAZEVALNI;
			}
			if (RAZVEDRILNI.toString().equals(tax)) {
				return RAZVEDRILNI;
			}
			if (NEJAVNI.toString().equals(tax)) {
				return NEJAVNI;
			}
			if (NEZASEBNI.toString().equals(tax)) {
				return NEZASEBNI;
			}
			if (ZASEBNI.toString().equals(tax)) {
				return ZASEBNI;
			}
			if (OSEBNI_STIK.toString().equals(tax)) {
				return OSEBNI_STIK;
			}
			if (TELEFON.toString().equals(tax)) {
				return TELEFON;
			}
			if (RADIO.toString().equals(tax)) {
				return RADIO;
			}
			if (TELEVIZIJA.toString().equals(tax)) {
				return TELEVIZIJA;
			}

			// Gigafida
			// if (TISK.toString().equals(tax)) {
			// return TISK;
			// }
			if (KNJIZNO.toString().equals(tax)) {
				return KNJIZNO;
			}
			if (LEPOSLOVNO.toString().equals(tax)) {
				return LEPOSLOVNO;
			}
			if (STROKOVNO.toString().equals(tax)) {
				return STROKOVNO;
			}
			if (PERIODICNO.toString().equals(tax)) {
				return PERIODICNO;
			}
			if (CASOPIS.toString().equals(tax)) {
				return CASOPIS;
			}
			if (REVIJA.toString().equals(tax)) {
				return REVIJA;
			}
			if (INTERNET.toString().equals(tax)) {
				return INTERNET;
			}
		}

		return null;
	}

	/** @return display names of all constants belonging to the given corpus ("gos"/"gigafida") */
	public static ObservableList<String> getDefaultForComboBox(String corpusType) {
		ArrayList<String> values = Arrays.stream(Taxonomy.values())
				.filter(x -> x.corpus.equals(corpusType))
				.map(x -> x.name)
				.collect(Collectors.toCollection(ArrayList::new));

		return FXCollections.observableArrayList(values);
	}

	/** Convenience overload: resolves the corpus name via {@code corpusType.toString()}. */
	public static ObservableList<String> getDefaultForComboBox(CorpusType corpusType) {
		return getDefaultForComboBox(corpusType.toString());
	}
}

View File

@@ -0,0 +1,53 @@
package data;
import static gui.ValidationUtil.*;
import java.util.ArrayList;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import gui.Messages;
import gui.ValidationUtil;
/**
 * Filter validation rules, one entry point per analysis level.
 */
public class Validation {

	/**
	 * Validates a filter for STRING_LEVEL analysis.
	 * Also normalizes the filter in-place: a missing skip value is defaulted to 0.
	 *
	 * @param filter filter to validate
	 * @return "," + newline separated error messages, or null when the filter is valid
	 */
	public static String validateForStringLevel(Filter filter) {
		ArrayList<String> errors = new ArrayList<>();

		// both values should have been initialized by the GUI; null means init failed
		if (filter.getNgramValue() == null) {
			errors.add(Messages.MISSING_NGRAM_LEVEL);
		}

		if (filter.getCalculateFor() == null) {
			errors.add(Messages.MISSING_CALCULATE_FOR);
		}

		// normalize: treat a missing skip value as 0
		if (filter.getSkipValue() == null) {
			filter.setSkipValue(0);
		}

		Integer ngramValue = filter.getNgramValue();
		ArrayList<Pattern> msd = filter.getMsd();

		// removed a dead pre-check: its outer condition required an empty msd list while
		// its inner condition required a non-empty one, so it could never add an error.
		// Null-guard added: this previously unboxed ngramValue and threw an NPE when the
		// n-gram level was missing, instead of just reporting the error collected above.
		if (ngramValue != null && ngramValue > 0 && !ValidationUtil.isEmpty(msd) && ngramValue != msd.size()) {
			errors.add(String.format(Messages.WARNING_MISMATCHED_NGRAM_AND_TOKENS_VALUES, ngramValue, msd.size()));
		}

		// letter analysis (ngram == 0) needs an explicit string length
		// TODO: check that words we're adding in xml reader are longer than this value
		if (ngramValue != null && ngramValue == 0 && isEmpty(filter.getStringLength())) {
			errors.add(Messages.MISSING_STRING_LENGTH);
		}

		return isEmpty(errors) ? null : StringUtils.join(errors, ", \n");
	}
}

View File

@@ -0,0 +1,141 @@
package data;
import java.io.Serializable;
import java.util.Arrays;
import java.util.HashSet;
import org.apache.commons.lang3.StringUtils;
import data.Enums.Msd;
import gui.ValidationUtil;
/**
 * A single corpus token: surface form, lemma and (normalized) MSD tag.
 */
public class Word implements Serializable {
	// added: a Serializable class should pin its serialized form explicitly.
	// NOTE(review): this (and making VOWELS static) changes the computed stream UID —
	// fine for freshly written data, incompatible with any pre-existing serialized Words.
	private static final long serialVersionUID = 1L;

	public static final char PAD_CHARACTER = '-';

	// shared lookup table — previously a per-instance field, which was also serialized
	// with every Word instance. Lowercase vowels only: words are lowercased in the
	// constructor except proper nouns, so an uppercase letter maps to 'C' —
	// TODO confirm that is intended for proper-noun CVV skeletons.
	private static final HashSet<Character> VOWELS = new HashSet<>(Arrays.asList('a', 'e', 'i', 'o', 'u'));

	private String word;
	private String lemma;
	private String msd;

	/**
	 * Possible values:
	 * <p>
	 * <ul>
	 * <li>S = samostalnik</li>
	 * <li>G = glagol</li>
	 * <li>P = pridevnik</li>
	 * <li>R = prislov</li>
	 * <li>Z = zaimek</li>
	 * <li>K = števnik</li>
	 * <li>D = predlog</li>
	 * <li>V = veznik</li>
	 * <li>L = členek</li>
	 * <li>M = medmet</li>
	 * <li>O = okrajšava</li>
	 * <li>N = neuvrščeno</li>
	 * </ul>
	 */
	//private char besedna_vrsta;

	public Word(String word, String lemma, String msd) {
		this.lemma = lemma;
		this.msd = normalizeMsd(msd);

		// veliko zacetnico ohranimo samo za lastna imena
		// (keep capitalization only for proper nouns, msd starting with "Sl")
		if (!ValidationUtil.isEmpty(this.msd) && !(this.msd.charAt(0) == 'S'
				&& this.msd.length() >= 2
				&& this.msd.charAt(1) == 'l')) {
			this.word = word.toLowerCase();
		} else {
			this.word = word;
		}
	}

	public Word() {
	}

	public Word(String word) {
		this.word = word;
	}

	/**
	 * Appends a number of '-' to msds which are not properly sized.
	 * E.g. nouns should have 5 attributes, yet the last one isn't always defined (Somei vs. Sometd)
	 *
	 * @param msdInput raw msd, may be null or empty
	 * @return padded msd, or "" for empty input
	 */
	private String normalizeMsd(String msdInput) {
		if (ValidationUtil.isEmpty(msdInput)) {
			return "";
		}
		return StringUtils.rightPad(msdInput, Msd.getMsdLengthForType(msdInput), PAD_CHARACTER);
	}

	public String getWord() {
		return word;
	}

	public void setWord(String word) {
		this.word = word;
	}

	public String getLemma() {
		return lemma;
	}

	public void setLemma(String lemma) {
		this.lemma = lemma;
	}

	public String getMsd() {
		return msd;
	}

	/** @return the surface form converted to a consonant/vowel (C/V) skeleton */
	public String getCVVWord() {
		return convertToCvv(word);
	}

	/** @return the lemma converted to a consonant/vowel (C/V) skeleton */
	public String getCVVLemma() {
		return convertToCvv(lemma);
	}

	// renamed from the misspelled "covertToCvv" (private, no callers outside this class)
	private String convertToCvv(String s) {
		char[] chars = s.toCharArray();
		for (int i = 0; i < chars.length; i++) {
			chars[i] = VOWELS.contains(chars[i]) ? 'V' : 'C';
		}
		return new String(chars);
	}

	@Override
	public String toString() {
		StringBuilder sb = new StringBuilder();
		sb.append("beseda:\t")
				.append(getWord())
				.append("\n")
				.append("lema:\t")
				.append(getLemma())
				.append("\n")
				.append("msd:\t")
				.append(getMsd())
				.append("\n");
		return sb.toString();
	}

	/**
	 * Returns the value this word contributes for the given "calculate for" setting.
	 *
	 * @param calculateFor WORD -> surface form, anything else -> lemma
	 * @param cvv          when true, return the C/V skeleton instead of the raw string
	 */
	public String getForCf(CalculateFor calculateFor, boolean cvv) {
		if (cvv) {
			return calculateFor == CalculateFor.WORD ? getCVVWord() : getCVVLemma();
		}
		return calculateFor == CalculateFor.WORD ? getWord() : getLemma();
	}
}