You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
113 lines
3.1 KiB
113 lines
3.1 KiB
package alg.word;
|
|
|
|
import static data.Enums.WordLevelDefaultValues.*;
|
|
|
|
import java.util.HashSet;
|
|
import java.util.List;
|
|
|
|
import org.apache.commons.lang3.StringUtils;
|
|
|
|
import data.Enums.WordLevelDefaultValues;
|
|
import data.Enums.WordLevelType;
|
|
import data.Sentence;
|
|
import data.StatisticsNew;
|
|
import data.Word;
|
|
|
|
@SuppressWarnings("Duplicates")
|
|
public class WordLevel {
|
|
private static HashSet<String> suffixes;
|
|
private static int minSuffixLength;
|
|
private static int maxSuffixLength;
|
|
|
|
private static HashSet<String> prefixes;
|
|
private static int minPrefixLength;
|
|
private static int maxPrefixLength;
|
|
|
|
static {
|
|
suffixes = WordLevelDefaultValues.getSuffixes();
|
|
calculateSuffixesLengths();
|
|
|
|
prefixes = WordLevelDefaultValues.getPrefixes();
|
|
calculatePrefixesLengths();
|
|
}
|
|
|
|
public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats) {
|
|
for (Sentence s : corpus) {
|
|
for (Word word : s.getWords()) {
|
|
calculateForSuffixes(word.getWord(), stats);
|
|
calculateForPrefixes(word.getWord(), stats);
|
|
}
|
|
}
|
|
}
|
|
|
|
private static void calculateForPrefixes(String word, StatisticsNew stats) {
|
|
for (int tmpPrefixLength = maxPrefixLength; tmpPrefixLength >= minPrefixLength; tmpPrefixLength++) {
|
|
if (word.length() - tmpPrefixLength < MIN_N_OF_CHARACTERS_LEFT_PREFIX) {
|
|
return;
|
|
}
|
|
|
|
String extractedPrefix = StringUtils.left(word, tmpPrefixLength);
|
|
|
|
if (prefixes.contains(extractedPrefix)) {
|
|
// save suffix and full word
|
|
stats.updateResultsNested(WordLevelType.PREFIX, extractedPrefix, word);
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
public static void calculateForSuffixes(String word, StatisticsNew stats) {
|
|
for (int tmpSuffixLength = maxSuffixLength; tmpSuffixLength >= minSuffixLength; tmpSuffixLength++) {
|
|
// preveri, da je beseda - cuttan suffix daljši od prednastavljene vrednosti
|
|
// ker gremo od najdaljše opcije k najkrajši, se ob dosegu tega pogoja lahko zaključi računanje za trenutno besedo
|
|
if (word.length() - tmpSuffixLength < MIN_N_OF_CHARACTERS_LEFT_SUFFIX) {
|
|
return;
|
|
}
|
|
|
|
String extractedSuffix = StringUtils.right(word, tmpSuffixLength);
|
|
|
|
if (suffixes.contains(extractedSuffix)) {
|
|
// save suffix and full word
|
|
stats.updateResultsNested(WordLevelType.SUFFIX, extractedSuffix, word);
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
// finds the shortest and longest suffix for quicker calculations
|
|
public static void calculateSuffixesLengths() {
|
|
minSuffixLength = -1;
|
|
maxSuffixLength = -1;
|
|
|
|
for (String suffix : suffixes) {
|
|
if (suffix.length() > maxSuffixLength) {
|
|
maxSuffixLength = suffix.length();
|
|
|
|
if (minSuffixLength < 0) {
|
|
minSuffixLength = maxSuffixLength;
|
|
}
|
|
} else if (suffix.length() < minSuffixLength) {
|
|
minSuffixLength = suffix.length();
|
|
}
|
|
}
|
|
}
|
|
|
|
// finds the shortest and longest suffix for quicker calculations
|
|
public static void calculatePrefixesLengths() {
|
|
minPrefixLength = -1;
|
|
maxPrefixLength = -1;
|
|
|
|
for (String prefix : prefixes) {
|
|
if (prefix.length() > maxPrefixLength) {
|
|
maxPrefixLength = prefix.length();
|
|
|
|
if (minPrefixLength < 0) {
|
|
minPrefixLength = maxPrefixLength;
|
|
}
|
|
} else if (prefix.length() < minPrefixLength) {
|
|
minPrefixLength = prefix.length();
|
|
}
|
|
}
|
|
}
|
|
}
|