Added some optimizations and new taxonomy names

This commit is contained in:
2018-08-31 07:57:58 +02:00
parent 1c00f1a283
commit 426a9ccc46
21 changed files with 1345 additions and 1182 deletions

View File

@@ -262,7 +262,7 @@ public class XML_processing {
if(stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() &&
stavek.size() > 0){
stavek.add(new Word(c3Content, c3Content, "/"));
stavek.add(createWord(c3Content, c3Content, "/", "", stats.getFilter()));
}
@@ -297,7 +297,7 @@ public class XML_processing {
// "word" node value
if (in_word) {
stavek.add(new Word(characters.getData(), lemma, msd));
stavek.add(createWord(characters.getData(), lemma, msd, "", stats.getFilter()));
in_word = false;
}
break;
@@ -537,12 +537,12 @@ public class XML_processing {
// "word" node value
if (inWord) {
String word = characters.getData();
sentence.add(new Word(word, lemma, msd));
sentence.add(createWord(word, lemma, msd, word, stats.getFilter()));
inWord = false;
}
if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) {
String punctuation = characters.getData();
sentence.add(new Word(punctuation, punctuation, "/"));
sentence.add(createWord(punctuation, punctuation, "/", punctuation, stats.getFilter()));
inPunctuation = false;
// String punctuation = ",";
@@ -761,7 +761,7 @@ public class XML_processing {
// GOSCorpusHM.put(GOSCorpusHMKey, sentence);
String word = "";
Characters characters = event.asCharacters();
sentence.add(new Word(characters.getData(), "", ""));
sentence.add(createWord(characters.getData(), "", "", "", stats.getFilter()));
// if algorithm is in normalized part find orthodox word and add other info to it
} else {
Characters characters = event.asCharacters();
@@ -769,15 +769,16 @@ public class XML_processing {
// System.out.println(GOSCorpusHMKey + " " + lemma + " " + wordIndex);
if (wordIndex < GOSCorpusHM.get(GOSCorpusHMKey).size()) {
Word currentWord = GOSCorpusHM.get(GOSCorpusHMKey).get(wordIndex);
currentWord.setLemma(lemma);
currentWord.setMsd(msd);
currentWord.setNormalizedWord(characters.getData());
currentWord.setLemma(lemma, stats.getFilter().getWordParts());
currentWord.setMsd(msd, stats.getFilter().getWordParts());
currentWord.setNormalizedWord(characters.getData(), stats.getFilter().getWordParts());
wordIndex += 1;
// when a word is separated from one to many we have to create these duplicates
if (inSeparatedWord){
GOSCorpusHM.get(GOSCorpusHMKey).add(wordIndex, new Word(currentWord.getWord(), "", ""));
GOSCorpusHM.get(GOSCorpusHMKey).add(wordIndex, createWord(currentWord.getWord(stats.getFilter().getWordParts()),
"", "", "", stats.getFilter()));
}
} //else {
// System.out.println("Error");
@@ -893,8 +894,8 @@ public class XML_processing {
// if we're calculating values for letters, omit words that are shorter than string length
if (filter.getNgramValue() == 0) {
sentence.removeIf(w -> (filter.getCalculateFor() == CalculateFor.WORD && w.getWord().length() < filter.getStringLength())
|| (filter.getCalculateFor() == CalculateFor.LEMMA && w.getLemma().length() < filter.getStringLength()));
sentence.removeIf(w -> (filter.getCalculateFor() == CalculateFor.WORD && w.getWord(filter.getWordParts()).length() < filter.getStringLength())
|| (filter.getCalculateFor() == CalculateFor.LEMMA && w.getLemma(filter.getWordParts()).length() < filter.getStringLength()));
}
}
@@ -912,4 +913,38 @@ public class XML_processing {
return atts;
}
/**
 * Builds a {@code Word} that stores only the parts selected in the filter.
 *
 * <p>The candidate strings are considered in a fixed order: word, lemma,
 * morphosyntactic specs, normalized word. A candidate is kept only when
 * {@code f.getWordParts()} contains the matching {@code CalculateFor} value.
 * The concrete class ({@code Word1} .. {@code Word4}) is chosen from
 * {@code f.getWordParts().size()}.
 *
 * <p>NOTE(review): the class is picked from the size of the filter's word parts,
 * while the constructor arguments come from the list of kept strings. If the
 * word parts ever hold a value other than the four checked here, the two counts
 * differ and {@code List.get} would throw - confirm against {@code Filter}.
 *
 * @param word           surface form of the token
 * @param lemma          lemma of the token
 * @param msd            morphosyntactic description of the token
 * @param normalizedWord normalized form of the token
 * @param f              filter whose word parts decide which strings are stored
 * @return the created word, or {@code null} when the filter's word-part count
 *         is not between 1 and 4
 */
private static Word createWord(String word, String lemma, String msd, String normalizedWord, Filter f){
    // Parallel tables: kinds[i] selects values[i]. The order must stay fixed,
    // because it defines the position of each part inside the created Word.
    CalculateFor[] kinds = {
            CalculateFor.WORD,
            CalculateFor.LEMMA,
            CalculateFor.MORPHOSYNTACTIC_SPECS,
            CalculateFor.NORMALIZED_WORD
    };
    String[] values = {word, lemma, msd, normalizedWord};

    List<String> parts = new ArrayList<>();
    for (int i = 0; i < kinds.length; i++) {
        if (f.getWordParts().contains(kinds[i])) {
            parts.add(values[i]);
        }
    }

    // Pick the Word implementation that matches the number of configured parts.
    switch (f.getWordParts().size()) {
        case 1:
            return new Word1(parts.get(0));
        case 2:
            return new Word2(parts.get(0), parts.get(1));
        case 3:
            return new Word3(parts.get(0), parts.get(1), parts.get(2));
        case 4:
            return new Word4(parts.get(0), parts.get(1), parts.get(2), parts.get(3));
        default:
            // No matching implementation for this number of parts.
            return null;
    }
}
}