Added some optimizations and new taxonomy names

2018-08-31 07:57:58 +02:00
parent 1c00f1a283
commit 426a9ccc46
21 changed files with 1345 additions and 1182 deletions
--- a/src/main/java/alg/XML_processing.java
+++ b/src/main/java/alg/XML_processing.java
@@ -262,7 +262,7 @@ public class XML_processing {

                            if(stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() &&
                                    stavek.size() > 0){
-                                stavek.add(new Word(c3Content, c3Content, "/"));
+                                stavek.add(createWord(c3Content, c3Content, "/", "", stats.getFilter()));

                            }

@@ -297,7 +297,7 @@ public class XML_processing {

 						// "word" node value
 						if (in_word) {
-							stavek.add(new Word(characters.getData(), lemma, msd));
+							stavek.add(createWord(characters.getData(), lemma, msd, "", stats.getFilter()));
 							in_word = false;
 						}
 						break;
@@ -537,12 +537,12 @@ public class XML_processing {
 						// "word" node value
 						if (inWord) {
 							String word = characters.getData();
-							sentence.add(new Word(word, lemma, msd));
+							sentence.add(createWord(word, lemma, msd, word, stats.getFilter()));
 							inWord = false;
 						}
 						if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) {
 						    String punctuation = characters.getData();
-							sentence.add(new Word(punctuation, punctuation, "/"));
+							sentence.add(createWord(punctuation, punctuation, "/", punctuation, stats.getFilter()));
 							inPunctuation = false;

 //						    String punctuation = ",";
@@ -761,7 +761,7 @@ public class XML_processing {
 //								GOSCorpusHM.put(GOSCorpusHMKey, sentence);
 								String word = "";
 								Characters characters = event.asCharacters();
-								sentence.add(new Word(characters.getData(), "", ""));
+								sentence.add(createWord(characters.getData(), "", "", "", stats.getFilter()));
 							// if algorithm is in normalized part find orthodox word and add other info to it
 							} else {
 								Characters characters = event.asCharacters();
@@ -769,15 +769,16 @@ public class XML_processing {
 //								System.out.println(GOSCorpusHMKey + " " + lemma + " " + wordIndex);
 								if (wordIndex < GOSCorpusHM.get(GOSCorpusHMKey).size()) {
 									Word currentWord = GOSCorpusHM.get(GOSCorpusHMKey).get(wordIndex);
-									currentWord.setLemma(lemma);
-									currentWord.setMsd(msd);
-									currentWord.setNormalizedWord(characters.getData());
+									currentWord.setLemma(lemma, stats.getFilter().getWordParts());
+									currentWord.setMsd(msd, stats.getFilter().getWordParts());
+									currentWord.setNormalizedWord(characters.getData(), stats.getFilter().getWordParts());

 									wordIndex += 1;

                                    // when a word is separated from one to many we have to create these duplicates
                                    if (inSeparatedWord){
-                                        GOSCorpusHM.get(GOSCorpusHMKey).add(wordIndex, new Word(currentWord.getWord(), "", ""));
+                                        GOSCorpusHM.get(GOSCorpusHMKey).add(wordIndex, createWord(currentWord.getWord(stats.getFilter().getWordParts()),
+                                                "", "", "", stats.getFilter()));
                                    }
 								} //else {
 //								    System.out.println("Error");
@@ -893,8 +894,8 @@ public class XML_processing {

 			// if we're calculating values for letters, omit words that are shorter than string length
 			if (filter.getNgramValue() == 0) {
-				sentence.removeIf(w -> (filter.getCalculateFor() == CalculateFor.WORD && w.getWord().length() < filter.getStringLength())
-						|| (filter.getCalculateFor() == CalculateFor.LEMMA && w.getLemma().length() < filter.getStringLength()));
+				sentence.removeIf(w -> (filter.getCalculateFor() == CalculateFor.WORD && w.getWord(filter.getWordParts()).length() < filter.getStringLength())
+						|| (filter.getCalculateFor() == CalculateFor.LEMMA && w.getLemma(filter.getWordParts()).length() < filter.getStringLength()));
 			}
 		}

@@ -912,4 +913,38 @@ public class XML_processing {

 		return atts;
 	}
+
+	/**
+	 * Builds a {@link Word} holding only the parts selected in the filter.
+	 *
+	 * <p>The candidate strings are collected in a fixed order (word, lemma, msd,
+	 * normalized word), each one only when the matching {@code CalculateFor}
+	 * value is contained in {@code f.getWordParts()}. The concrete class
+	 * ({@code Word1} .. {@code Word4}) is then picked from the size of
+	 * {@code f.getWordParts()}.
+	 *
+	 * <p>NOTE(review): the class is chosen from the size of {@code getWordParts()},
+	 * not from the number of strings actually collected; this presumably relies on
+	 * the filter only ever containing the four values checked below - confirm
+	 * against {@code Filter}.
+	 *
+	 * @param word           surface form of the token
+	 * @param lemma          lemma of the token
+	 * @param msd            morphosyntactic description of the token
+	 * @param normalizedWord normalized form of the token
+	 * @param f              filter whose word parts decide which strings are kept
+	 * @return the created word, or {@code null} when the filter has fewer than one
+	 *         or more than four word parts
+	 */
+	private static Word createWord(String word, String lemma, String msd, String normalizedWord, Filter f){
+		List<String> parts = new ArrayList<>();
+
+		// keep the order word -> lemma -> msd -> normalized word
+		if (f.getWordParts().contains(CalculateFor.WORD)) {
+			parts.add(word);
+		}
+		if (f.getWordParts().contains(CalculateFor.LEMMA)) {
+			parts.add(lemma);
+		}
+		if (f.getWordParts().contains(CalculateFor.MORPHOSYNTACTIC_SPECS)) {
+			parts.add(msd);
+		}
+		if (f.getWordParts().contains(CalculateFor.NORMALIZED_WORD)) {
+			parts.add(normalizedWord);
+		}
+
+		// pick the implementation that stores exactly as many strings as requested
+		switch (f.getWordParts().size()) {
+			case 1:
+				return new Word1(parts.get(0));
+			case 2:
+				return new Word2(parts.get(0), parts.get(1));
+			case 3:
+				return new Word3(parts.get(0), parts.get(1), parts.get(2));
+			case 4:
+				return new Word4(parts.get(0), parts.get(1), parts.get(2), parts.get(3));
+			default:
+				// no matching implementation for this number of parts
+				return null;
+		}
+	}
 }