Added some performance measures

This commit is contained in:
2018-08-09 09:21:06 +02:00
parent 179f09c4bd
commit 9b5fa4616b
24 changed files with 734 additions and 379 deletions

View File

@@ -5,6 +5,7 @@ import static data.Enums.solar.SolarFilters.*;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ForkJoinPool;
import javax.xml.namespace.QName;
@@ -261,7 +262,7 @@ public class XML_processing {
if (c3Content.equals(".") && includeThisBlock) {
// add sentence to corpus
corpus.add(new Sentence(stavek));
corpus.add(new Sentence(stavek, null));
// and start a new one
stavek = new ArrayList<>();
@@ -293,7 +294,7 @@ public class XML_processing {
// "word" node value
if (in_word) {
stavek.add(new Word(characters.getData(), lemma, msd, null));
stavek.add(new Word(characters.getData(), lemma, msd));
in_word = false;
} else if(inPunctuation){
String punctuation = ",";
@@ -543,7 +544,7 @@ public class XML_processing {
// "word" node value
if (inWord) {
String word = characters.getData();
sentence.add(new Word(word, lemma, msd, currentFiletaxonomyLong));
sentence.add(new Word(word, lemma, msd));
inWord = false;
}
if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) {
@@ -588,7 +589,7 @@ public class XML_processing {
sentence = runFilters(sentence, stats.getFilter());
if (!ValidationUtil.isEmpty(sentence)) {
corpus.add(new Sentence(sentence));
corpus.add(new Sentence(sentence, currentFiletaxonomyLong));
}
// and start a new one
@@ -655,6 +656,7 @@ public class XML_processing {
boolean inPunctuation = false;
boolean inOrthDiv = false;
boolean computeForOrth = stats.getCorpus().isGosOrthMode();
boolean inSeparatedWord = false;
ArrayList<String> currentFiletaxonomy = new ArrayList<>();
ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
String lemma = "";
@@ -662,7 +664,10 @@ public class XML_processing {
List<Word> sentence = new ArrayList<>();
List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it
Map<String, List<Word>> GOSCorpusHM = new ConcurrentHashMap<>();
String GOSCorpusHMKey = "";
String sentenceDelimiter = "seg";
int wordIndex = 0;
String gosType = stats.getFilter().hasMsd() ? "norm" : "orth"; // orth & norm
@@ -674,6 +679,8 @@ public class XML_processing {
XMLInputFactory factory = XMLInputFactory.newInstance();
eventReader = factory.createXMLEventReader(new FileInputStream(path));
// created hashmap to combine words with normalized words
while (eventReader.hasNext()) {
XMLEvent event = eventReader.nextEvent();
// System.out.print(String.format("%s", event.toString().replaceAll("\\['http://www.tei-c.org/ns/1.0'\\]::", "")));
@@ -711,7 +718,9 @@ public class XML_processing {
// msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
// lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
// }
}
} else if (atts.containsKey("type") && atts.get("type").equals("separated")) {
inSeparatedWord = true;
}
// }
}
@@ -730,49 +739,107 @@ public class XML_processing {
}
} else if (qName.equalsIgnoreCase("div")) {
gosType = String.valueOf(startElement.getAttributeByName(QName.valueOf("type")).getValue());
} else if (qName.equalsIgnoreCase("seg")) {
HashMap<String, String> atts = extractAttributes(startElement);
if (atts.keySet().contains("id")) {
if (inOrthDiv) {
GOSCorpusHMKey = atts.get("id") + ".norm";
} else {
GOSCorpusHMKey = atts.get("id");
}
} else {
System.out.println("No attribute \"id\"");
}
}
break;
case XMLStreamConstants.CHARACTERS:
// "word" node value
if (inWord) {
Characters characters = event.asCharacters();
if (gosType.equals("norm") && msd != null) {
sentence.add(new Word(characters.getData(), lemma, msd, currentFiletaxonomyLong));
// if (GOSCorpusHMKey.equals("gos.028-0108.norm") && wordIndex > 8){
// System.out.println(wordIndex);
// }
// if algorithm is in orthodox part add new word to sentence
if (inOrthDiv){
// GOSCorpusHM.put(GOSCorpusHMKey, sentence);
String word = "";
Characters characters = event.asCharacters();
sentence.add(new Word(characters.getData(), "", ""));
// if algorithm is in normalized part find orthodox word and add other info to it
} else {
sentence.add(new Word(characters.getData(), lemma, msd, currentFiletaxonomyLong));
Characters characters = event.asCharacters();
// System.out.println(wordIndex);
// System.out.println(GOSCorpusHMKey + " " + lemma + " " + wordIndex);
if (wordIndex < GOSCorpusHM.get(GOSCorpusHMKey).size()) {
Word currentWord = GOSCorpusHM.get(GOSCorpusHMKey).get(wordIndex);
currentWord.setLemma(lemma);
currentWord.setMsd(msd);
currentWord.setNormalizedWord(characters.getData());
wordIndex += 1;
// when a word is separated from one to many we have to create these duplicates
if (inSeparatedWord){
GOSCorpusHM.get(GOSCorpusHMKey).add(wordIndex, new Word(currentWord.getWord(), "", ""));
}
} //else {
// System.out.println("Error");
// }
}
inWord = false;
}
break;
case XMLStreamConstants.END_ELEMENT:
EndElement endElement = event.asEndElement();
if (endElement.getName().getLocalPart().equals("w")) {
if (inWord){
inWord = false;
} else if(inSeparatedWord) {
// when there are no separated words left we have to delete last additional duplicate
GOSCorpusHM.get(GOSCorpusHMKey).remove(wordIndex);
inSeparatedWord = false;
}
}
// parser reached end of the current sentence
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
// add sentence to corpus if it passes filters
boolean saveSentence = computeForOrth == inOrthDiv;
if (inOrthDiv){
// add sentence to corpus
GOSCorpusHM.put(GOSCorpusHMKey, sentence);
} else {
if (includeFile && saveSentence && !ValidationUtil.isEmpty(sentence)) {
sentence = runFilters(sentence, stats.getFilter());
corpus.add(new Sentence(sentence));
sentence = GOSCorpusHM.remove(GOSCorpusHMKey);
// add sentence to corpus if it passes filters
if (includeFile && !ValidationUtil.isEmpty(sentence)) {
sentence = runFilters(sentence, stats.getFilter());
corpus.add(new Sentence(sentence, currentFiletaxonomyLong));
}
wordIndex = 0;
/* Invoke Fork-Join when we reach maximum limit of
* sentences (because we can't read everything to
* memory) or we reach the end of the file.
*/
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
fj(corpus, stats);
// empty the current corpus, since we don't need
// the data anymore
corpus.clear();
}
}
// start a new sentence
sentence = new ArrayList<>();
// and start a new one
sentence = new ArrayList<>();
/* Invoke Fork-Join when we reach maximum limit of
* sentences (because we can't read everything to
* memory) or we reach the end of the file.
*/
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
fj(corpus, stats);
// empty the current corpus, since we don't need
// the data anymore
corpus.clear();
}
} else if (endElement.getName().getLocalPart().equals("teiHeader")) {
// before proceeding to read this file, make sure that taxonomy filters are a match
if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {