Added some performance measures
This commit is contained in:
@@ -5,6 +5,7 @@ import static data.Enums.solar.SolarFilters.*;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.ForkJoinPool;
|
||||
|
||||
import javax.xml.namespace.QName;
|
||||
@@ -261,7 +262,7 @@ public class XML_processing {
|
||||
|
||||
if (c3Content.equals(".") && includeThisBlock) {
|
||||
// add sentence to corpus
|
||||
corpus.add(new Sentence(stavek));
|
||||
corpus.add(new Sentence(stavek, null));
|
||||
// and start a new one
|
||||
stavek = new ArrayList<>();
|
||||
|
||||
@@ -293,7 +294,7 @@ public class XML_processing {
|
||||
|
||||
// "word" node value
|
||||
if (in_word) {
|
||||
stavek.add(new Word(characters.getData(), lemma, msd, null));
|
||||
stavek.add(new Word(characters.getData(), lemma, msd));
|
||||
in_word = false;
|
||||
} else if(inPunctuation){
|
||||
String punctuation = ",";
|
||||
@@ -543,7 +544,7 @@ public class XML_processing {
|
||||
// "word" node value
|
||||
if (inWord) {
|
||||
String word = characters.getData();
|
||||
sentence.add(new Word(word, lemma, msd, currentFiletaxonomyLong));
|
||||
sentence.add(new Word(word, lemma, msd));
|
||||
inWord = false;
|
||||
}
|
||||
if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) {
|
||||
@@ -588,7 +589,7 @@ public class XML_processing {
|
||||
sentence = runFilters(sentence, stats.getFilter());
|
||||
|
||||
if (!ValidationUtil.isEmpty(sentence)) {
|
||||
corpus.add(new Sentence(sentence));
|
||||
corpus.add(new Sentence(sentence, currentFiletaxonomyLong));
|
||||
}
|
||||
|
||||
// and start a new one
|
||||
@@ -655,6 +656,7 @@ public class XML_processing {
|
||||
boolean inPunctuation = false;
|
||||
boolean inOrthDiv = false;
|
||||
boolean computeForOrth = stats.getCorpus().isGosOrthMode();
|
||||
boolean inSeparatedWord = false;
|
||||
ArrayList<String> currentFiletaxonomy = new ArrayList<>();
|
||||
ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
|
||||
String lemma = "";
|
||||
@@ -662,7 +664,10 @@ public class XML_processing {
|
||||
|
||||
List<Word> sentence = new ArrayList<>();
|
||||
List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it
|
||||
Map<String, List<Word>> GOSCorpusHM = new ConcurrentHashMap<>();
|
||||
String GOSCorpusHMKey = "";
|
||||
String sentenceDelimiter = "seg";
|
||||
int wordIndex = 0;
|
||||
|
||||
String gosType = stats.getFilter().hasMsd() ? "norm" : "orth"; // orth & norm
|
||||
|
||||
@@ -674,6 +679,8 @@ public class XML_processing {
|
||||
XMLInputFactory factory = XMLInputFactory.newInstance();
|
||||
eventReader = factory.createXMLEventReader(new FileInputStream(path));
|
||||
|
||||
// created hashmap to combine words with normalized words
|
||||
|
||||
while (eventReader.hasNext()) {
|
||||
XMLEvent event = eventReader.nextEvent();
|
||||
// System.out.print(String.format("%s", event.toString().replaceAll("\\['http://www.tei-c.org/ns/1.0'\\]::", "")));
|
||||
@@ -711,7 +718,9 @@ public class XML_processing {
|
||||
// msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
|
||||
// lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
|
||||
// }
|
||||
}
|
||||
} else if (atts.containsKey("type") && atts.get("type").equals("separated")) {
|
||||
inSeparatedWord = true;
|
||||
}
|
||||
|
||||
// }
|
||||
}
|
||||
@@ -730,49 +739,107 @@ public class XML_processing {
|
||||
}
|
||||
} else if (qName.equalsIgnoreCase("div")) {
|
||||
gosType = String.valueOf(startElement.getAttributeByName(QName.valueOf("type")).getValue());
|
||||
} else if (qName.equalsIgnoreCase("seg")) {
|
||||
HashMap<String, String> atts = extractAttributes(startElement);
|
||||
|
||||
if (atts.keySet().contains("id")) {
|
||||
if (inOrthDiv) {
|
||||
GOSCorpusHMKey = atts.get("id") + ".norm";
|
||||
} else {
|
||||
GOSCorpusHMKey = atts.get("id");
|
||||
}
|
||||
} else {
|
||||
System.out.println("No attribute \"id\"");
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case XMLStreamConstants.CHARACTERS:
|
||||
// "word" node value
|
||||
if (inWord) {
|
||||
Characters characters = event.asCharacters();
|
||||
if (gosType.equals("norm") && msd != null) {
|
||||
sentence.add(new Word(characters.getData(), lemma, msd, currentFiletaxonomyLong));
|
||||
// if (GOSCorpusHMKey.equals("gos.028-0108.norm") && wordIndex > 8){
|
||||
// System.out.println(wordIndex);
|
||||
// }
|
||||
// if the parser is in the orthographic ("orth") part, add a new word to the sentence
|
||||
if (inOrthDiv){
|
||||
// GOSCorpusHM.put(GOSCorpusHMKey, sentence);
|
||||
String word = "";
|
||||
Characters characters = event.asCharacters();
|
||||
sentence.add(new Word(characters.getData(), "", ""));
|
||||
// if the parser is in the normalized ("norm") part, find the matching orthographic word and add the remaining info to it
|
||||
} else {
|
||||
sentence.add(new Word(characters.getData(), lemma, msd, currentFiletaxonomyLong));
|
||||
Characters characters = event.asCharacters();
|
||||
// System.out.println(wordIndex);
|
||||
// System.out.println(GOSCorpusHMKey + " " + lemma + " " + wordIndex);
|
||||
if (wordIndex < GOSCorpusHM.get(GOSCorpusHMKey).size()) {
|
||||
Word currentWord = GOSCorpusHM.get(GOSCorpusHMKey).get(wordIndex);
|
||||
currentWord.setLemma(lemma);
|
||||
currentWord.setMsd(msd);
|
||||
currentWord.setNormalizedWord(characters.getData());
|
||||
|
||||
wordIndex += 1;
|
||||
|
||||
// when a word is separated from one to many we have to create these duplicates
|
||||
if (inSeparatedWord){
|
||||
GOSCorpusHM.get(GOSCorpusHMKey).add(wordIndex, new Word(currentWord.getWord(), "", ""));
|
||||
}
|
||||
} //else {
|
||||
// System.out.println("Error");
|
||||
// }
|
||||
|
||||
}
|
||||
|
||||
inWord = false;
|
||||
}
|
||||
break;
|
||||
|
||||
case XMLStreamConstants.END_ELEMENT:
|
||||
EndElement endElement = event.asEndElement();
|
||||
|
||||
if (endElement.getName().getLocalPart().equals("w")) {
|
||||
if (inWord){
|
||||
inWord = false;
|
||||
} else if(inSeparatedWord) {
|
||||
// when there are no separated words left, we have to delete the last additional duplicate
|
||||
GOSCorpusHM.get(GOSCorpusHMKey).remove(wordIndex);
|
||||
|
||||
inSeparatedWord = false;
|
||||
}
|
||||
}
|
||||
|
||||
// parser reached end of the current sentence
|
||||
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
|
||||
// add sentence to corpus if it passes filters
|
||||
boolean saveSentence = computeForOrth == inOrthDiv;
|
||||
if (inOrthDiv){
|
||||
// add sentence to corpus
|
||||
GOSCorpusHM.put(GOSCorpusHMKey, sentence);
|
||||
} else {
|
||||
|
||||
if (includeFile && saveSentence && !ValidationUtil.isEmpty(sentence)) {
|
||||
sentence = runFilters(sentence, stats.getFilter());
|
||||
corpus.add(new Sentence(sentence));
|
||||
|
||||
sentence = GOSCorpusHM.remove(GOSCorpusHMKey);
|
||||
// add sentence to corpus if it passes filters
|
||||
if (includeFile && !ValidationUtil.isEmpty(sentence)) {
|
||||
sentence = runFilters(sentence, stats.getFilter());
|
||||
corpus.add(new Sentence(sentence, currentFiletaxonomyLong));
|
||||
}
|
||||
|
||||
wordIndex = 0;
|
||||
|
||||
|
||||
|
||||
/* Invoke Fork-Join when we reach maximum limit of
|
||||
* sentences (because we can't read everything to
|
||||
* memory) or we reach the end of the file.
|
||||
*/
|
||||
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
|
||||
fj(corpus, stats);
|
||||
// empty the current corpus, since we don't need
|
||||
// the data anymore
|
||||
corpus.clear();
|
||||
}
|
||||
}
|
||||
// start a new sentence
|
||||
sentence = new ArrayList<>();
|
||||
|
||||
// and start a new one
|
||||
sentence = new ArrayList<>();
|
||||
|
||||
/* Invoke Fork-Join when we reach maximum limit of
|
||||
* sentences (because we can't read everything to
|
||||
* memory) or we reach the end of the file.
|
||||
*/
|
||||
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
|
||||
fj(corpus, stats);
|
||||
// empty the current corpus, since we don't need
|
||||
// the data anymore
|
||||
corpus.clear();
|
||||
}
|
||||
} else if (endElement.getName().getLocalPart().equals("teiHeader")) {
|
||||
// before proceeding to read this file, make sure that taxonomy filters are a match
|
||||
if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
|
||||
|
||||
@@ -122,9 +122,9 @@ public class InflectedJOSCount {
|
||||
static void calculateForAll(List<Sentence> corpus, Statistics stats, String taxonomy) {
|
||||
for (Sentence s : corpus) {
|
||||
// disregard if wrong taxonomy
|
||||
if (taxonomy != null && !(s.getTaxonomy().startsWith(taxonomy))) {
|
||||
continue;
|
||||
}
|
||||
// if (taxonomy != null && !(s.getTaxonomy().startsWith(taxonomy))) {
|
||||
// continue;
|
||||
// }
|
||||
|
||||
for (Word word : s.getWords()) {
|
||||
// skip if current word is not inflected
|
||||
|
||||
@@ -3,9 +3,11 @@ package alg.ngram;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.sun.xml.internal.bind.v2.runtime.reflect.Lister;
|
||||
import data.*;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
@@ -28,6 +30,9 @@ public class Ngrams {
|
||||
}
|
||||
|
||||
public static void generateNgramCandidates(List<Sentence> corpus, StatisticsNew stats) {
|
||||
// preprocess CalculateFor for this corpus and prepare data for MultipleHMKeys
|
||||
ArrayList<CalculateFor> otherKeys = stats.getFilter().getMultipleKeys();
|
||||
|
||||
for (Sentence s : corpus) {
|
||||
// skip sentences shorter than specified ngram length
|
||||
if (s.getWords().size() < stats.getFilter().getNgramValue()) {
|
||||
@@ -46,29 +51,62 @@ public class Ngrams {
|
||||
String key = wordToString(ngramCandidate, stats.getFilter().getCalculateFor());
|
||||
|
||||
// if last letter is ',' erase it
|
||||
key = (key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key;
|
||||
// String key = "aaaaaaaaaaaaaaaaaaaaaaa";
|
||||
|
||||
String lemma = "";
|
||||
String wordType = "";
|
||||
String msd = "";
|
||||
for (CalculateFor otherKey : stats.getFilter().getMultipleKeys()){
|
||||
if(otherKey.toString().equals("lema")){
|
||||
// lemma = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
|
||||
lemma = wordToString(ngramCandidate, otherKey);
|
||||
} else if(otherKey.toString().equals("besedna vrsta")){
|
||||
wordType = wordToString(ngramCandidate, otherKey).substring(0, 1);
|
||||
} else if(otherKey.toString().equals("oblikoskladenjska oznaka")){
|
||||
msd = wordToString(ngramCandidate, otherKey);
|
||||
}
|
||||
// if (key.equals("")){
|
||||
// String test = key;
|
||||
// }
|
||||
|
||||
// key = (!key.equals("") && key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key;
|
||||
|
||||
MultipleHMKeys multipleKeys;
|
||||
|
||||
// create MultipleHMKeys for different amount of other keys
|
||||
switch (otherKeys.size()) {
|
||||
case 0:
|
||||
multipleKeys = new MultipleHMKeys1(key);
|
||||
break;
|
||||
case 1:
|
||||
multipleKeys = new MultipleHMKeys2(key, wordToString(ngramCandidate, otherKeys.get(0)));
|
||||
break;
|
||||
case 2:
|
||||
multipleKeys = new MultipleHMKeys3(key, wordToString(ngramCandidate, otherKeys.get(0)),
|
||||
wordToString(ngramCandidate, otherKeys.get(1)));
|
||||
break;
|
||||
case 3:
|
||||
multipleKeys = new MultipleHMKeys4(key, wordToString(ngramCandidate, otherKeys.get(0)),
|
||||
wordToString(ngramCandidate, otherKeys.get(1)),
|
||||
wordToString(ngramCandidate, otherKeys.get(2)));
|
||||
break;
|
||||
case 4:
|
||||
multipleKeys = new MultipleHMKeys5(key, wordToString(ngramCandidate, otherKeys.get(0)),
|
||||
wordToString(ngramCandidate, otherKeys.get(1)),
|
||||
wordToString(ngramCandidate, otherKeys.get(2)),
|
||||
wordToString(ngramCandidate, otherKeys.get(3)));
|
||||
break;
|
||||
default:
|
||||
multipleKeys = null;
|
||||
}
|
||||
|
||||
|
||||
// String lemma = "";
|
||||
// String wordType = "";
|
||||
// String msd = "";
|
||||
// for (CalculateFor otherKey : stats.getFilter().getMultipleKeys()){
|
||||
// if(otherKey.toString().equals("lema")){
|
||||
// lemma = wordToString(ngramCandidate, otherKey);
|
||||
// } else if(otherKey.toString().equals("besedna vrsta")){
|
||||
// wordType = wordToString(ngramCandidate, otherKey).substring(0, 1);
|
||||
// } else if(otherKey.toString().equals("oblikoskladenjska oznaka")){
|
||||
// msd = wordToString(ngramCandidate, otherKey);
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// MultipleHMKeys multipleKeys = new MultipleHMKeys(key, lemma, wordType, msd);
|
||||
|
||||
|
||||
MultipleHMKeys multipleKeys = new MultipleHMKeys(key, lemma, wordType, msd);
|
||||
|
||||
// UPDATE TAXONOMY HERE!!!
|
||||
stats.updateTaxonomyResults(multipleKeys, ngramCandidate.get(0).getTaxonomy());
|
||||
stats.updateTaxonomyResults(multipleKeys, s.getTaxonomy());
|
||||
// stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()));
|
||||
}
|
||||
}
|
||||
@@ -102,26 +140,31 @@ public class Ngrams {
|
||||
.stream()
|
||||
.map(Word::getLemma)
|
||||
.collect(Collectors.toList()));
|
||||
break;
|
||||
return StringUtils.join(candidate, " ");
|
||||
case WORD:
|
||||
candidate.addAll(ngramCandidate
|
||||
.stream()
|
||||
.map(Word::getWord)
|
||||
.collect(Collectors.toList()));
|
||||
break;
|
||||
return StringUtils.join(candidate, " ");
|
||||
case MORPHOSYNTACTIC_SPECS:
|
||||
case MORPHOSYNTACTIC_PROPERTY:
|
||||
candidate.addAll(ngramCandidate
|
||||
.stream()
|
||||
.map(Word::getMsd)
|
||||
.collect(Collectors.toList()));
|
||||
break;
|
||||
return StringUtils.join(candidate, " ");
|
||||
case WORD_TYPE:
|
||||
candidate.addAll(ngramCandidate
|
||||
.stream()
|
||||
.map(w -> Character.toString(w.getMsd().charAt(0)))
|
||||
.collect(Collectors.toList()));
|
||||
break;
|
||||
// candidate.addAll(ngramCandidate
|
||||
// .stream()
|
||||
// .map(w -> Character.toString(w.getMsd().charAt(0)))
|
||||
// .collect(Collectors.toList()));
|
||||
// .substring(0, 1)
|
||||
return StringUtils.join(candidate, " ");
|
||||
}
|
||||
|
||||
return StringUtils.join(candidate, " ");
|
||||
@@ -136,7 +179,7 @@ public class Ngrams {
|
||||
private static void generateNgramLetterCandidates(List<Sentence> corpus, StatisticsNew stats) {
|
||||
for (Sentence s : corpus) {
|
||||
for (Word w : s.getWords()) {
|
||||
List<String> taxonomy = w.getTaxonomy();
|
||||
List<String> taxonomy = s.getTaxonomy();
|
||||
String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv());
|
||||
|
||||
// skip this iteration if:
|
||||
@@ -152,7 +195,7 @@ public class Ngrams {
|
||||
for (int i = 0; i < word.length() - stats.getFilter().getStringLength() + 1; i++) {
|
||||
// TODO: punctuation?
|
||||
|
||||
MultipleHMKeys multipleKeys = new MultipleHMKeys(word.substring(i, i + stats.getFilter().getStringLength()));
|
||||
MultipleHMKeys multipleKeys = new MultipleHMKeys1(word.substring(i, i + stats.getFilter().getStringLength()));
|
||||
stats.updateTaxonomyResults(multipleKeys, taxonomy);
|
||||
// stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()));
|
||||
|
||||
@@ -183,8 +226,7 @@ public class Ngrams {
|
||||
String punctuation = ",";
|
||||
return new Word(sentence.get(i).getWord() + punctuation,
|
||||
sentence.get(i).getLemma() + punctuation,
|
||||
sentence.get(i).getMsd() + punctuation,
|
||||
sentence.get(i).getTaxonomy());
|
||||
sentence.get(i).getMsd() + punctuation);
|
||||
}
|
||||
}
|
||||
return sentence.get(i);
|
||||
@@ -204,6 +246,10 @@ public class Ngrams {
|
||||
for (Sentence s : corpus) {
|
||||
List<Word> sentence = s.getWords();
|
||||
|
||||
if (sentence == null){
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int i = 0; i <= sentence.size() - ngram; i++) { // 1gram
|
||||
for (int j = i + 1; j <= i + skip + 1; j++) { // 2gram
|
||||
if (ngram == 2 && j < sentence.size()) {
|
||||
@@ -260,7 +306,7 @@ public class Ngrams {
|
||||
if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd())) {
|
||||
String key = wordToString(skipgramCandidate, stats.getFilter().getCalculateFor());
|
||||
key = (key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key;
|
||||
stats.updateTaxonomyResults(new MultipleHMKeys(key, "", "", ""),
|
||||
stats.updateTaxonomyResults(new MultipleHMKeys1(key),
|
||||
stats.getCorpus().getTaxonomy());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -89,79 +89,79 @@ class WordCount {
|
||||
}
|
||||
}
|
||||
|
||||
private static void calculateForTaxonomyAndJosType(List<Sentence> corpus, Statistics stats) {
|
||||
for (Sentence s : corpus) {
|
||||
if (s.getTaxonomy().equalsIgnoreCase(stats.getDistributionTaxonomy())) {
|
||||
List<String> sentence = new ArrayList<>(s.getWords().size());
|
||||
List<Word> filteredWords = new ArrayList<>();
|
||||
// private static void calculateForTaxonomyAndJosType(List<Sentence> corpus, Statistics stats) {
|
||||
// for (Sentence s : corpus) {
|
||||
// if (s.getTaxonomy().equalsIgnoreCase(stats.getDistributionTaxonomy())) {
|
||||
// List<String> sentence = new ArrayList<>(s.getWords().size());
|
||||
// List<Word> filteredWords = new ArrayList<>();
|
||||
//
|
||||
// for (Word word : s.getWords()) {
|
||||
// if (word.getMsd().charAt(0) == stats.getDistributionJosWordType()) {
|
||||
// filteredWords.add(word);
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// if (stats.getCf() == CalculateFor.LEMMA) {
|
||||
// sentence.addAll(filteredWords
|
||||
// .stream()
|
||||
// .map(Word::getLemma)
|
||||
// .collect(Collectors.toList()));
|
||||
// } else if (stats.getCf() == CalculateFor.WORD) {
|
||||
// sentence.addAll(filteredWords
|
||||
// .stream()
|
||||
// .map(Word::getWord)
|
||||
// .collect(Collectors.toList()));
|
||||
// }
|
||||
//
|
||||
// for (String word : sentence) {
|
||||
// Common.updateMap(stats.result, word);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
for (Word word : s.getWords()) {
|
||||
if (word.getMsd().charAt(0) == stats.getDistributionJosWordType()) {
|
||||
filteredWords.add(word);
|
||||
}
|
||||
}
|
||||
// private static void calculateForTaxonomy(List<Sentence> corpus, Statistics stats) {
|
||||
// for (Sentence s : corpus) {
|
||||
// if (s.getTaxonomy().equalsIgnoreCase(stats.getDistributionTaxonomy())) {
|
||||
// List<String> sentence = new ArrayList<>(s.getWords().size());
|
||||
//
|
||||
// if (stats.getCf() == CalculateFor.LEMMA) {
|
||||
// sentence.addAll(s.getWords()
|
||||
// .stream()
|
||||
// .map(Word::getLemma)
|
||||
// .collect(Collectors.toList()));
|
||||
// } else if (stats.getCf() == CalculateFor.WORD) {
|
||||
// sentence.addAll(s.getWords()
|
||||
// .stream()
|
||||
// .map(Word::getWord)
|
||||
// .collect(Collectors.toList()));
|
||||
// }
|
||||
//
|
||||
// for (String word : sentence) {
|
||||
// Common.updateMap(stats.result, word);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
if (stats.getCf() == CalculateFor.LEMMA) {
|
||||
sentence.addAll(filteredWords
|
||||
.stream()
|
||||
.map(Word::getLemma)
|
||||
.collect(Collectors.toList()));
|
||||
} else if (stats.getCf() == CalculateFor.WORD) {
|
||||
sentence.addAll(filteredWords
|
||||
.stream()
|
||||
.map(Word::getWord)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
for (String word : sentence) {
|
||||
Common.updateMap(stats.result, word);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static void calculateForTaxonomy(List<Sentence> corpus, Statistics stats) {
|
||||
for (Sentence s : corpus) {
|
||||
if (s.getTaxonomy().equalsIgnoreCase(stats.getDistributionTaxonomy())) {
|
||||
List<String> sentence = new ArrayList<>(s.getWords().size());
|
||||
|
||||
if (stats.getCf() == CalculateFor.LEMMA) {
|
||||
sentence.addAll(s.getWords()
|
||||
.stream()
|
||||
.map(Word::getLemma)
|
||||
.collect(Collectors.toList()));
|
||||
} else if (stats.getCf() == CalculateFor.WORD) {
|
||||
sentence.addAll(s.getWords()
|
||||
.stream()
|
||||
.map(Word::getWord)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
for (String word : sentence) {
|
||||
Common.updateMap(stats.result, word);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void calculateForAll(List<Sentence> corpus, Statistics stats) {
|
||||
boolean taxonomyIsSet = stats.isTaxonomySet();
|
||||
boolean JosTypeIsSet = stats.isJOSTypeSet();
|
||||
|
||||
// branching because even though the only difference is an if or two &&
|
||||
// O(if) = 1, the amount of ifs adds up and this saves some time
|
||||
if (taxonomyIsSet && JosTypeIsSet) {
|
||||
calculateForTaxonomyAndJosType(corpus, stats);
|
||||
} else if (taxonomyIsSet && !JosTypeIsSet) {
|
||||
calculateForTaxonomy(corpus, stats);
|
||||
} else if (!taxonomyIsSet && JosTypeIsSet) {
|
||||
calculateForJosType(corpus, stats);
|
||||
} else {
|
||||
if (stats.isVcc()) {
|
||||
calculateVCC(corpus, stats);
|
||||
} else {
|
||||
calculateNoFilter(corpus, stats);
|
||||
}
|
||||
}
|
||||
}
|
||||
// static void calculateForAll(List<Sentence> corpus, Statistics stats) {
|
||||
// boolean taxonomyIsSet = stats.isTaxonomySet();
|
||||
// boolean JosTypeIsSet = stats.isJOSTypeSet();
|
||||
//
|
||||
// // branching because even though the only difference is an if or two &&
|
||||
// // O(if) = 1, the amount of ifs adds up and this saves some time
|
||||
// if (taxonomyIsSet && JosTypeIsSet) {
|
||||
// calculateForTaxonomyAndJosType(corpus, stats);
|
||||
// } else if (taxonomyIsSet && !JosTypeIsSet) {
|
||||
// calculateForTaxonomy(corpus, stats);
|
||||
// } else if (!taxonomyIsSet && JosTypeIsSet) {
|
||||
// calculateForJosType(corpus, stats);
|
||||
// } else {
|
||||
// if (stats.isVcc()) {
|
||||
// calculateVCC(corpus, stats);
|
||||
// } else {
|
||||
// calculateNoFilter(corpus, stats);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
}
|
||||
Reference in New Issue
Block a user