Added some performance measures

2018-08-09 09:21:06 +02:00
parent 179f09c4bd
commit 9b5fa4616b
24 changed files with 734 additions and 379 deletions


@@ -5,6 +5,7 @@ import static data.Enums.solar.SolarFilters.*;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ForkJoinPool;
import javax.xml.namespace.QName;
@@ -261,7 +262,7 @@ public class XML_processing {
if (c3Content.equals(".") && includeThisBlock) {
// add sentence to corpus
corpus.add(new Sentence(stavek));
corpus.add(new Sentence(stavek, null));
// and start a new one
stavek = new ArrayList<>();
@@ -293,7 +294,7 @@ public class XML_processing {
// "word" node value
if (in_word) {
stavek.add(new Word(characters.getData(), lemma, msd, null));
stavek.add(new Word(characters.getData(), lemma, msd));
in_word = false;
} else if(inPunctuation){
String punctuation = ",";
@@ -543,7 +544,7 @@ public class XML_processing {
// "word" node value
if (inWord) {
String word = characters.getData();
sentence.add(new Word(word, lemma, msd, currentFiletaxonomyLong));
sentence.add(new Word(word, lemma, msd));
inWord = false;
}
if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) {
@@ -588,7 +589,7 @@ public class XML_processing {
sentence = runFilters(sentence, stats.getFilter());
if (!ValidationUtil.isEmpty(sentence)) {
corpus.add(new Sentence(sentence));
corpus.add(new Sentence(sentence, currentFiletaxonomyLong));
}
// and start a new one
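These first hunks move the taxonomy off the individual Word objects and onto the Sentence: Word(form, lemma, msd, taxonomy) becomes Word(form, lemma, msd), and Sentence(words) becomes Sentence(words, taxonomy). A minimal sketch of the new call pattern, using names from the hunks above (the constructor signatures are inferred from these calls, not confirmed elsewhere):

// taxonomy is attached once per sentence instead of once per word
List<Word> stavek = new ArrayList<>();
stavek.add(new Word(characters.getData(), lemma, msd));     // Word no longer carries taxonomy
corpus.add(new Sentence(stavek, currentFiletaxonomyLong));  // the Sentence carries it instead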
@@ -655,6 +656,7 @@ public class XML_processing {
boolean inPunctuation = false;
boolean inOrthDiv = false;
boolean computeForOrth = stats.getCorpus().isGosOrthMode();
boolean inSeparatedWord = false;
ArrayList<String> currentFiletaxonomy = new ArrayList<>();
ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
String lemma = "";
@@ -662,7 +664,10 @@ public class XML_processing {
List<Word> sentence = new ArrayList<>();
List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it
Map<String, List<Word>> GOSCorpusHM = new ConcurrentHashMap<>();
String GOSCorpusHMKey = "";
String sentenceDelimiter = "seg";
int wordIndex = 0;
String gosType = stats.getFilter().hasMsd() ? "norm" : "orth"; // orth & norm
@@ -674,6 +679,8 @@ public class XML_processing {
XMLInputFactory factory = XMLInputFactory.newInstance();
eventReader = factory.createXMLEventReader(new FileInputStream(path));
// create a hashmap that pairs orthographic words with their normalized forms
while (eventReader.hasNext()) {
XMLEvent event = eventReader.nextEvent();
// System.out.print(String.format("%s", event.toString().replaceAll("\\['http://www.tei-c.org/ns/1.0'\\]::", "")));
@@ -711,7 +718,9 @@ public class XML_processing {
// msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
// lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
// }
}
} else if (atts.containsKey("type") && atts.get("type").equals("separated")) {
inSeparatedWord = true;
}
// }
}
@@ -730,49 +739,107 @@ public class XML_processing {
}
} else if (qName.equalsIgnoreCase("div")) {
gosType = String.valueOf(startElement.getAttributeByName(QName.valueOf("type")).getValue());
} else if (qName.equalsIgnoreCase("seg")) {
HashMap<String, String> atts = extractAttributes(startElement);
if (atts.keySet().contains("id")) {
if (inOrthDiv) {
GOSCorpusHMKey = atts.get("id") + ".norm";
} else {
GOSCorpusHMKey = atts.get("id");
}
} else {
System.out.println("No attribute \"id\"");
}
}
break;
case XMLStreamConstants.CHARACTERS:
// "word" node value
if (inWord) {
Characters characters = event.asCharacters();
if (gosType.equals("norm") && msd != null) {
sentence.add(new Word(characters.getData(), lemma, msd, currentFiletaxonomyLong));
// if (GOSCorpusHMKey.equals("gos.028-0108.norm") && wordIndex > 8){
// System.out.println(wordIndex);
// }
// if the parser is in the orthographic (orth) div, add a new word to the sentence
if (inOrthDiv){
// GOSCorpusHM.put(GOSCorpusHMKey, sentence);
String word = "";
Characters characters = event.asCharacters();
sentence.add(new Word(characters.getData(), "", ""));
// if the parser is in the normalized (norm) div, find the matching orthographic word and add the remaining info to it
} else {
sentence.add(new Word(characters.getData(), lemma, msd, currentFiletaxonomyLong));
Characters characters = event.asCharacters();
// System.out.println(wordIndex);
// System.out.println(GOSCorpusHMKey + " " + lemma + " " + wordIndex);
if (wordIndex < GOSCorpusHM.get(GOSCorpusHMKey).size()) {
Word currentWord = GOSCorpusHM.get(GOSCorpusHMKey).get(wordIndex);
currentWord.setLemma(lemma);
currentWord.setMsd(msd);
currentWord.setNormalizedWord(characters.getData());
wordIndex += 1;
// when a word is split from one token into many, we have to create these duplicates
if (inSeparatedWord){
GOSCorpusHM.get(GOSCorpusHMKey).add(wordIndex, new Word(currentWord.getWord(), "", ""));
}
} //else {
// System.out.println("Error");
// }
}
inWord = false;
}
break;
case XMLStreamConstants.END_ELEMENT:
EndElement endElement = event.asEndElement();
if (endElement.getName().getLocalPart().equals("w")) {
if (inWord){
inWord = false;
} else if(inSeparatedWord) {
// when there are no separated words left, we have to delete the last additional duplicate
GOSCorpusHM.get(GOSCorpusHMKey).remove(wordIndex);
inSeparatedWord = false;
}
}
// parser reached end of the current sentence
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
// add sentence to corpus if it passes filters
boolean saveSentence = computeForOrth == inOrthDiv;
if (inOrthDiv){
// add sentence to corpus
GOSCorpusHM.put(GOSCorpusHMKey, sentence);
} else {
if (includeFile && saveSentence && !ValidationUtil.isEmpty(sentence)) {
sentence = runFilters(sentence, stats.getFilter());
corpus.add(new Sentence(sentence));
sentence = GOSCorpusHM.remove(GOSCorpusHMKey);
// add sentence to corpus if it passes filters
if (includeFile && !ValidationUtil.isEmpty(sentence)) {
sentence = runFilters(sentence, stats.getFilter());
corpus.add(new Sentence(sentence, currentFiletaxonomyLong));
}
wordIndex = 0;
/* Invoke Fork-Join when we reach maximum limit of
* sentences (because we can't read everything to
* memory) or we reach the end of the file.
*/
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
fj(corpus, stats);
// empty the current corpus, since we don't need
// the data anymore
corpus.clear();
}
}
// start a new sentence
sentence = new ArrayList<>();
// and start a new one
sentence = new ArrayList<>();
/* Invoke Fork-Join when we reach maximum limit of
* sentences (because we can't read everything to
* memory) or we reach the end of the file.
*/
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
fj(corpus, stats);
// empty the current corpus, since we don't need
// the data anymore
corpus.clear();
}
} else if (endElement.getName().getLocalPart().equals("teiHeader")) {
// before proceeding to read this file, make sure that taxonomy filters are a match
if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
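Taken together, the GOS changes above switch to a two-pass merge: while the parser is inside the orth div, each segment's sentence is stored in GOSCorpusHM under a key derived from its seg id (the orth key gets a ".norm" suffix, presumably so it lines up with the ids used in the norm div); while the parser is inside the norm div, each normalized token is matched by wordIndex to the stored orthographic token and enriched with lemma, msd and the normalized form. A minimal sketch of that merge step, assuming only the Word setters used in the hunk (setLemma, setMsd, setNormalizedWord); the helper name mergeNormToken is hypothetical:

static void mergeNormToken(Map<String, List<Word>> gosCorpus, String segmentId,
                           int wordIndex, String normForm, String lemma, String msd) {
    List<Word> orthSentence = gosCorpus.get(segmentId);
    if (orthSentence == null || wordIndex >= orthSentence.size()) {
        return; // no matching orth token; the hunk above simply skips this case
    }
    Word orthWord = orthSentence.get(wordIndex);
    orthWord.setLemma(lemma);
    orthWord.setMsd(msd);
    orthWord.setNormalizedWord(normForm);
}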


@@ -122,9 +122,9 @@ public class InflectedJOSCount {
static void calculateForAll(List<Sentence> corpus, Statistics stats, String taxonomy) {
for (Sentence s : corpus) {
// disregard if wrong taxonomy
if (taxonomy != null && !(s.getTaxonomy().startsWith(taxonomy))) {
continue;
}
// if (taxonomy != null && !(s.getTaxonomy().startsWith(taxonomy))) {
// continue;
// }
for (Word word : s.getWords()) {
// skip if current word is not inflected
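The per-sentence taxonomy check here is commented out, presumably because a Sentence now carries a list of taxonomy labels (see the new Sentence(sentence, currentFiletaxonomyLong) constructor and List<String> taxonomy = s.getTaxonomy() elsewhere in this commit), so the old single-string startsWith test no longer applies. A hedged sketch of an equivalent list-based check; the helper name matchesTaxonomy is hypothetical:

static boolean matchesTaxonomy(Sentence s, String taxonomy) {
    if (taxonomy == null) {
        return true; // no taxonomy filter set
    }
    List<String> labels = s.getTaxonomy();
    if (labels == null) {
        return false;
    }
    for (String label : labels) {
        if (label != null && label.startsWith(taxonomy)) {
            return true; // any label matching the filter prefix is enough
        }
    }
    return false;
}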


@@ -3,9 +3,11 @@ package alg.ngram;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import com.sun.xml.internal.bind.v2.runtime.reflect.Lister;
import data.*;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
@@ -28,6 +30,9 @@ public class Ngrams {
}
public static void generateNgramCandidates(List<Sentence> corpus, StatisticsNew stats) {
// preprocess CalculateFor for this corpus and prepare data for MultipleHMKeys
ArrayList<CalculateFor> otherKeys = stats.getFilter().getMultipleKeys();
for (Sentence s : corpus) {
// skip sentences shorter than specified ngram length
if (s.getWords().size() < stats.getFilter().getNgramValue()) {
@@ -46,29 +51,62 @@ public class Ngrams {
String key = wordToString(ngramCandidate, stats.getFilter().getCalculateFor());
// if last letter is ',' erase it
key = (key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key;
// String key = "aaaaaaaaaaaaaaaaaaaaaaa";
String lemma = "";
String wordType = "";
String msd = "";
for (CalculateFor otherKey : stats.getFilter().getMultipleKeys()){
if(otherKey.toString().equals("lema")){
// lemma = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
lemma = wordToString(ngramCandidate, otherKey);
} else if(otherKey.toString().equals("besedna vrsta")){
wordType = wordToString(ngramCandidate, otherKey).substring(0, 1);
} else if(otherKey.toString().equals("oblikoskladenjska oznaka")){
msd = wordToString(ngramCandidate, otherKey);
}
// if (key.equals("")){
// String test = key;
// }
// key = (!key.equals("") && key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key;
MultipleHMKeys multipleKeys;
// create the right MultipleHMKeys variant for the number of additional keys
switch (otherKeys.size()) {
case 0:
multipleKeys = new MultipleHMKeys1(key);
break;
case 1:
multipleKeys = new MultipleHMKeys2(key, wordToString(ngramCandidate, otherKeys.get(0)));
break;
case 2:
multipleKeys = new MultipleHMKeys3(key, wordToString(ngramCandidate, otherKeys.get(0)),
wordToString(ngramCandidate, otherKeys.get(1)));
break;
case 3:
multipleKeys = new MultipleHMKeys4(key, wordToString(ngramCandidate, otherKeys.get(0)),
wordToString(ngramCandidate, otherKeys.get(1)),
wordToString(ngramCandidate, otherKeys.get(2)));
break;
case 4:
multipleKeys = new MultipleHMKeys5(key, wordToString(ngramCandidate, otherKeys.get(0)),
wordToString(ngramCandidate, otherKeys.get(1)),
wordToString(ngramCandidate, otherKeys.get(2)),
wordToString(ngramCandidate, otherKeys.get(3)));
break;
default:
multipleKeys = null;
}
// String lemma = "";
// String wordType = "";
// String msd = "";
// for (CalculateFor otherKey : stats.getFilter().getMultipleKeys()){
// if(otherKey.toString().equals("lema")){
// lemma = wordToString(ngramCandidate, otherKey);
// } else if(otherKey.toString().equals("besedna vrsta")){
// wordType = wordToString(ngramCandidate, otherKey).substring(0, 1);
// } else if(otherKey.toString().equals("oblikoskladenjska oznaka")){
// msd = wordToString(ngramCandidate, otherKey);
// }
// }
//
// MultipleHMKeys multipleKeys = new MultipleHMKeys(key, lemma, wordType, msd);
MultipleHMKeys multipleKeys = new MultipleHMKeys(key, lemma, wordType, msd);
// UPDATE TAXONOMY HERE!!!
stats.updateTaxonomyResults(multipleKeys, ngramCandidate.get(0).getTaxonomy());
stats.updateTaxonomyResults(multipleKeys, s.getTaxonomy());
// stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()));
}
}
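The old single MultipleHMKeys(key, lemma, wordType, msd) tuple is replaced here by fixed-arity MultipleHMKeys1–MultipleHMKeys5 variants, chosen by how many additional CalculateFor keys the filter requests. A compact restatement of that dispatch as a sketch; the helper name createKeyTuple is hypothetical, and the extra values are the wordToString results for each additional key, as in the switch above:

static MultipleHMKeys createKeyTuple(String key, List<String> extra) {
    switch (extra.size()) {
        case 0: return new MultipleHMKeys1(key);
        case 1: return new MultipleHMKeys2(key, extra.get(0));
        case 2: return new MultipleHMKeys3(key, extra.get(0), extra.get(1));
        case 3: return new MultipleHMKeys4(key, extra.get(0), extra.get(1), extra.get(2));
        case 4: return new MultipleHMKeys5(key, extra.get(0), extra.get(1), extra.get(2), extra.get(3));
        default: return null; // more than four additional keys is not handled, as in the switch above
    }
}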
@@ -102,26 +140,31 @@ public class Ngrams {
.stream()
.map(Word::getLemma)
.collect(Collectors.toList()));
break;
return StringUtils.join(candidate, " ");
case WORD:
candidate.addAll(ngramCandidate
.stream()
.map(Word::getWord)
.collect(Collectors.toList()));
break;
return StringUtils.join(candidate, " ");
case MORPHOSYNTACTIC_SPECS:
case MORPHOSYNTACTIC_PROPERTY:
candidate.addAll(ngramCandidate
.stream()
.map(Word::getMsd)
.collect(Collectors.toList()));
break;
return StringUtils.join(candidate, " ");
case WORD_TYPE:
candidate.addAll(ngramCandidate
.stream()
.map(w -> Character.toString(w.getMsd().charAt(0)))
.collect(Collectors.toList()));
break;
// candidate.addAll(ngramCandidate
// .stream()
// .map(w -> Character.toString(w.getMsd().charAt(0)))
// .collect(Collectors.toList()));
// .substring(0, 1)
return StringUtils.join(candidate, " ");
}
return StringUtils.join(candidate, " ");
@@ -136,7 +179,7 @@ public class Ngrams {
private static void generateNgramLetterCandidates(List<Sentence> corpus, StatisticsNew stats) {
for (Sentence s : corpus) {
for (Word w : s.getWords()) {
List<String> taxonomy = w.getTaxonomy();
List<String> taxonomy = s.getTaxonomy();
String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv());
// skip this iteration if:
@@ -152,7 +195,7 @@ public class Ngrams {
for (int i = 0; i < word.length() - stats.getFilter().getStringLength() + 1; i++) {
// TODO: punctuation (ločila)?
MultipleHMKeys multipleKeys = new MultipleHMKeys(word.substring(i, i + stats.getFilter().getStringLength()));
MultipleHMKeys multipleKeys = new MultipleHMKeys1(word.substring(i, i + stats.getFilter().getStringLength()));
stats.updateTaxonomyResults(multipleKeys, taxonomy);
// stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()));
@@ -183,8 +226,7 @@ public class Ngrams {
String punctuation = ",";
return new Word(sentence.get(i).getWord() + punctuation,
sentence.get(i).getLemma() + punctuation,
sentence.get(i).getMsd() + punctuation,
sentence.get(i).getTaxonomy());
sentence.get(i).getMsd() + punctuation);
}
}
return sentence.get(i);
@@ -204,6 +246,10 @@ public class Ngrams {
for (Sentence s : corpus) {
List<Word> sentence = s.getWords();
if (sentence == null){
continue;
}
for (int i = 0; i <= sentence.size() - ngram; i++) { // 1gram
for (int j = i + 1; j <= i + skip + 1; j++) { // 2gram
if (ngram == 2 && j < sentence.size()) {
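The nested loops above enumerate skip-gram candidates: for a 2-gram, the second token may sit up to skip positions beyond the direct neighbour of the first. A minimal sketch of that enumeration for the ngram == 2 case, with the null-sentence guard added in this hunk (variable names follow the hunk; the candidate handling is indicated only by a comment):

for (Sentence s : corpus) {
    List<Word> sentence = s.getWords();
    if (sentence == null) {
        continue; // guard added in this commit: skip sentences with no words
    }
    for (int i = 0; i <= sentence.size() - 2; i++) {      // first token of the 2-gram
        for (int j = i + 1; j <= i + skip + 1; j++) {      // second token, up to `skip` positions away
            if (j < sentence.size()) {
                List<Word> skipgramCandidate = Arrays.asList(sentence.get(i), sentence.get(j));
                // ... filter by msd regex and update taxonomy results, as in the next hunk
            }
        }
    }
}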
@@ -260,7 +306,7 @@ public class Ngrams {
if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd())) {
String key = wordToString(skipgramCandidate, stats.getFilter().getCalculateFor());
key = (key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key;
stats.updateTaxonomyResults(new MultipleHMKeys(key, "", "", ""),
stats.updateTaxonomyResults(new MultipleHMKeys1(key),
stats.getCorpus().getTaxonomy());
}
}


@@ -89,79 +89,79 @@ class WordCount {
}
}
private static void calculateForTaxonomyAndJosType(List<Sentence> corpus, Statistics stats) {
for (Sentence s : corpus) {
if (s.getTaxonomy().equalsIgnoreCase(stats.getDistributionTaxonomy())) {
List<String> sentence = new ArrayList<>(s.getWords().size());
List<Word> filteredWords = new ArrayList<>();
// private static void calculateForTaxonomyAndJosType(List<Sentence> corpus, Statistics stats) {
// for (Sentence s : corpus) {
// if (s.getTaxonomy().equalsIgnoreCase(stats.getDistributionTaxonomy())) {
// List<String> sentence = new ArrayList<>(s.getWords().size());
// List<Word> filteredWords = new ArrayList<>();
//
// for (Word word : s.getWords()) {
// if (word.getMsd().charAt(0) == stats.getDistributionJosWordType()) {
// filteredWords.add(word);
// }
// }
//
// if (stats.getCf() == CalculateFor.LEMMA) {
// sentence.addAll(filteredWords
// .stream()
// .map(Word::getLemma)
// .collect(Collectors.toList()));
// } else if (stats.getCf() == CalculateFor.WORD) {
// sentence.addAll(filteredWords
// .stream()
// .map(Word::getWord)
// .collect(Collectors.toList()));
// }
//
// for (String word : sentence) {
// Common.updateMap(stats.result, word);
// }
// }
// }
// }
for (Word word : s.getWords()) {
if (word.getMsd().charAt(0) == stats.getDistributionJosWordType()) {
filteredWords.add(word);
}
}
// private static void calculateForTaxonomy(List<Sentence> corpus, Statistics stats) {
// for (Sentence s : corpus) {
// if (s.getTaxonomy().equalsIgnoreCase(stats.getDistributionTaxonomy())) {
// List<String> sentence = new ArrayList<>(s.getWords().size());
//
// if (stats.getCf() == CalculateFor.LEMMA) {
// sentence.addAll(s.getWords()
// .stream()
// .map(Word::getLemma)
// .collect(Collectors.toList()));
// } else if (stats.getCf() == CalculateFor.WORD) {
// sentence.addAll(s.getWords()
// .stream()
// .map(Word::getWord)
// .collect(Collectors.toList()));
// }
//
// for (String word : sentence) {
// Common.updateMap(stats.result, word);
// }
// }
// }
// }
if (stats.getCf() == CalculateFor.LEMMA) {
sentence.addAll(filteredWords
.stream()
.map(Word::getLemma)
.collect(Collectors.toList()));
} else if (stats.getCf() == CalculateFor.WORD) {
sentence.addAll(filteredWords
.stream()
.map(Word::getWord)
.collect(Collectors.toList()));
}
for (String word : sentence) {
Common.updateMap(stats.result, word);
}
}
}
}
private static void calculateForTaxonomy(List<Sentence> corpus, Statistics stats) {
for (Sentence s : corpus) {
if (s.getTaxonomy().equalsIgnoreCase(stats.getDistributionTaxonomy())) {
List<String> sentence = new ArrayList<>(s.getWords().size());
if (stats.getCf() == CalculateFor.LEMMA) {
sentence.addAll(s.getWords()
.stream()
.map(Word::getLemma)
.collect(Collectors.toList()));
} else if (stats.getCf() == CalculateFor.WORD) {
sentence.addAll(s.getWords()
.stream()
.map(Word::getWord)
.collect(Collectors.toList()));
}
for (String word : sentence) {
Common.updateMap(stats.result, word);
}
}
}
}
static void calculateForAll(List<Sentence> corpus, Statistics stats) {
boolean taxonomyIsSet = stats.isTaxonomySet();
boolean JosTypeIsSet = stats.isJOSTypeSet();
// branching: even though the only difference is an if or two &&s and each if is O(1),
// the number of ifs adds up, so this saves some time
if (taxonomyIsSet && JosTypeIsSet) {
calculateForTaxonomyAndJosType(corpus, stats);
} else if (taxonomyIsSet && !JosTypeIsSet) {
calculateForTaxonomy(corpus, stats);
} else if (!taxonomyIsSet && JosTypeIsSet) {
calculateForJosType(corpus, stats);
} else {
if (stats.isVcc()) {
calculateVCC(corpus, stats);
} else {
calculateNoFilter(corpus, stats);
}
}
}
// static void calculateForAll(List<Sentence> corpus, Statistics stats) {
// boolean taxonomyIsSet = stats.isTaxonomySet();
// boolean JosTypeIsSet = stats.isJOSTypeSet();
//
// // branching: even though the only difference is an if or two &&s and each if is O(1),
// // the number of ifs adds up, so this saves some time
// if (taxonomyIsSet && JosTypeIsSet) {
// calculateForTaxonomyAndJosType(corpus, stats);
// } else if (taxonomyIsSet && !JosTypeIsSet) {
// calculateForTaxonomy(corpus, stats);
// } else if (!taxonomyIsSet && JosTypeIsSet) {
// calculateForJosType(corpus, stats);
// } else {
// if (stats.isVcc()) {
// calculateVCC(corpus, stats);
// } else {
// calculateNoFilter(corpus, stats);
// }
// }
// }
}