Added some optimizations and new taxonomy names
This commit is contained in:
@@ -262,7 +262,7 @@ public class XML_processing {
|
||||
|
||||
if(stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() &&
|
||||
stavek.size() > 0){
|
||||
stavek.add(new Word(c3Content, c3Content, "/"));
|
||||
stavek.add(createWord(c3Content, c3Content, "/", "", stats.getFilter()));
|
||||
|
||||
}
|
||||
|
||||
@@ -297,7 +297,7 @@ public class XML_processing {
|
||||
|
||||
// "word" node value
|
||||
if (in_word) {
|
||||
stavek.add(new Word(characters.getData(), lemma, msd));
|
||||
stavek.add(createWord(characters.getData(), lemma, msd, "", stats.getFilter()));
|
||||
in_word = false;
|
||||
}
|
||||
break;
|
||||
@@ -537,12 +537,12 @@ public class XML_processing {
|
||||
// "word" node value
|
||||
if (inWord) {
|
||||
String word = characters.getData();
|
||||
sentence.add(new Word(word, lemma, msd));
|
||||
sentence.add(createWord(word, lemma, msd, word, stats.getFilter()));
|
||||
inWord = false;
|
||||
}
|
||||
if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) {
|
||||
String punctuation = characters.getData();
|
||||
sentence.add(new Word(punctuation, punctuation, "/"));
|
||||
sentence.add(createWord(punctuation, punctuation, "/", punctuation, stats.getFilter()));
|
||||
inPunctuation = false;
|
||||
|
||||
// String punctuation = ",";
|
||||
@@ -761,7 +761,7 @@ public class XML_processing {
|
||||
// GOSCorpusHM.put(GOSCorpusHMKey, sentence);
|
||||
String word = "";
|
||||
Characters characters = event.asCharacters();
|
||||
sentence.add(new Word(characters.getData(), "", ""));
|
||||
sentence.add(createWord(characters.getData(), "", "", "", stats.getFilter()));
|
||||
// if algorithm is in normalized part find orthodox word and add other info to it
|
||||
} else {
|
||||
Characters characters = event.asCharacters();
|
||||
@@ -769,15 +769,16 @@ public class XML_processing {
|
||||
// System.out.println(GOSCorpusHMKey + " " + lemma + " " + wordIndex);
|
||||
if (wordIndex < GOSCorpusHM.get(GOSCorpusHMKey).size()) {
|
||||
Word currentWord = GOSCorpusHM.get(GOSCorpusHMKey).get(wordIndex);
|
||||
currentWord.setLemma(lemma);
|
||||
currentWord.setMsd(msd);
|
||||
currentWord.setNormalizedWord(characters.getData());
|
||||
currentWord.setLemma(lemma, stats.getFilter().getWordParts());
|
||||
currentWord.setMsd(msd, stats.getFilter().getWordParts());
|
||||
currentWord.setNormalizedWord(characters.getData(), stats.getFilter().getWordParts());
|
||||
|
||||
wordIndex += 1;
|
||||
|
||||
// when a word is separated from one to many we have to create these duplicates
|
||||
if (inSeparatedWord){
|
||||
GOSCorpusHM.get(GOSCorpusHMKey).add(wordIndex, new Word(currentWord.getWord(), "", ""));
|
||||
GOSCorpusHM.get(GOSCorpusHMKey).add(wordIndex, createWord(currentWord.getWord(stats.getFilter().getWordParts()),
|
||||
"", "", "", stats.getFilter()));
|
||||
}
|
||||
} //else {
|
||||
// System.out.println("Error");
|
||||
@@ -893,8 +894,8 @@ public class XML_processing {
|
||||
|
||||
// if we're calculating values for letters, omit words that are shorter than string length
|
||||
if (filter.getNgramValue() == 0) {
|
||||
sentence.removeIf(w -> (filter.getCalculateFor() == CalculateFor.WORD && w.getWord().length() < filter.getStringLength())
|
||||
|| (filter.getCalculateFor() == CalculateFor.LEMMA && w.getLemma().length() < filter.getStringLength()));
|
||||
sentence.removeIf(w -> (filter.getCalculateFor() == CalculateFor.WORD && w.getWord(filter.getWordParts()).length() < filter.getStringLength())
|
||||
|| (filter.getCalculateFor() == CalculateFor.LEMMA && w.getLemma(filter.getWordParts()).length() < filter.getStringLength()));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -912,4 +913,38 @@ public class XML_processing {
|
||||
|
||||
return atts;
|
||||
}
|
||||
|
||||
private static Word createWord(String word, String lemma, String msd, String normalizedWord, Filter f){
|
||||
List<String> wString = new ArrayList<>();
|
||||
if (f.getWordParts().contains(CalculateFor.WORD))
|
||||
wString.add(word);
|
||||
if (f.getWordParts().contains(CalculateFor.LEMMA))
|
||||
wString.add(lemma);
|
||||
if (f.getWordParts().contains(CalculateFor.MORPHOSYNTACTIC_SPECS))
|
||||
wString.add(msd);
|
||||
if (f.getWordParts().contains(CalculateFor.NORMALIZED_WORD))
|
||||
wString.add(normalizedWord);
|
||||
|
||||
// find appropriate strings and put them in word
|
||||
Word w;
|
||||
|
||||
switch (f.getWordParts().size()) {
|
||||
case 1:
|
||||
w = new Word1(wString.get(0));
|
||||
break;
|
||||
case 2:
|
||||
w = new Word2(wString.get(0), wString.get(1));
|
||||
break;
|
||||
case 3:
|
||||
w = new Word3(wString.get(0), wString.get(1), wString.get(2));
|
||||
break;
|
||||
case 4:
|
||||
w = new Word4(wString.get(0), wString.get(1), wString.get(2), wString.get(3));
|
||||
break;
|
||||
default:
|
||||
w = null;
|
||||
|
||||
}
|
||||
return w;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user