Added filter parameters to CSV + created column names for MSDs + [partly] fixed the number-of-words parameter
This commit is contained in:
@@ -535,6 +535,7 @@ public class XML_processing {
|
||||
public static boolean readXMLGigafida(String path, StatisticsNew stats) {
|
||||
boolean inWord = false;
|
||||
boolean inPunctuation = false;
|
||||
boolean taxonomyMatch = true;
|
||||
ArrayList<String> currentFiletaxonomy = new ArrayList<>();
|
||||
ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
|
||||
String lemma = "";
|
||||
@@ -635,13 +636,19 @@ public class XML_processing {
|
||||
|
||||
// parser reached end of the current sentence
|
||||
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
|
||||
// count all UniGramOccurrences in sentence for statistics
|
||||
stats.updateUniGramOccurrences(sentence.size());
|
||||
|
||||
// add sentence to corpus if it passes filters
|
||||
sentence = runFilters(sentence, stats.getFilter());
|
||||
|
||||
if (!ValidationUtil.isEmpty(sentence)) {
|
||||
|
||||
|
||||
if (!ValidationUtil.isEmpty(sentence) && taxonomyMatch) {
|
||||
corpus.add(new Sentence(sentence, currentFiletaxonomyLong));
|
||||
}
|
||||
|
||||
// taxonomyMatch = true;
|
||||
// and start a new one
|
||||
sentence = new ArrayList<>();
|
||||
|
||||
@@ -666,7 +673,9 @@ public class XML_processing {
|
||||
|
||||
if (currentFiletaxonomy.isEmpty()) {
|
||||
// taxonomies don't match: flag it via taxonomyMatch so sentences are not added to the corpus
|
||||
return false;
|
||||
// return false;
|
||||
taxonomyMatch = false;
|
||||
// System.out.println("TEST");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -36,6 +36,8 @@ public class Ngrams {
|
||||
ArrayList<CalculateFor> otherKeys = stats.getFilter().getMultipleKeys();
|
||||
|
||||
for (Sentence s : corpus) {
|
||||
// stats.updateUniGramOccurrences(s.getWords().size());
|
||||
|
||||
// skip sentences shorter than specified ngram length
|
||||
if (s.getWords().size() < stats.getFilter().getNgramValue()) {
|
||||
continue;
|
||||
@@ -176,6 +178,8 @@ public class Ngrams {
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// UPDATE TAXONOMY HERE!!!
|
||||
stats.updateTaxonomyResults(multipleKeys, s.getTaxonomy());
|
||||
// stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()));
|
||||
@@ -261,16 +265,34 @@ public class Ngrams {
|
||||
*/
|
||||
private static void generateNgramLetterCandidates(List<Sentence> corpus, StatisticsNew stats) {
|
||||
for (Sentence s : corpus) {
|
||||
// stats.updateUniGramOccurrences(s.getWords().size());
|
||||
for (Word w : s.getWords()) {
|
||||
List<String> taxonomy = s.getTaxonomy();
|
||||
|
||||
//// List<Word> ngramCandidate = s.getSublist(i, i + stats.getFilter().getNgramValue());
|
||||
List<Word> ngramCandidate = new ArrayList<>();
|
||||
ngramCandidate.add(w);
|
||||
//
|
||||
// // if msd regex is set and this candidate doesn't pass it, skip this iteration
|
||||
// if (stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())) {
|
||||
// continue;
|
||||
// }
|
||||
|
||||
String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv(), stats.getFilter().getWordParts());
|
||||
|
||||
// skip this iteration if:
|
||||
// - word doesn't contain a proper version (missing lemma for example)
|
||||
// - msd regex is given but this word's msd doesn't match it
|
||||
// - given substring length is larger than the word length
|
||||
|
||||
// boolean t1 = stats.getFilter().hasMsd() && !w.getMsd(stats.getFilter().getWordParts()).matches(stats.getFilter().getMsd().get(0).pattern());
|
||||
// boolean t2 = !w.getMsd(stats.getFilter().getWordParts()).matches(stats.getFilter().getMsd().get(0).pattern());
|
||||
// String t3 = stats.getFilter().getMsd().get(0).pattern();
|
||||
// ArrayList<CalculateFor> t4 = stats.getFilter().getWordParts();
|
||||
// boolean t5 = word.length() < stats.getFilter().getStringLength();
|
||||
|
||||
if (ValidationUtil.isEmpty(word)
|
||||
|| stats.getFilter().hasMsd() && !w.getMsd(stats.getFilter().getWordParts()).matches(stats.getFilter().getMsd().get(0).pattern())
|
||||
|| stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())
|
||||
|| word.length() < stats.getFilter().getStringLength()) {
|
||||
continue;
|
||||
}
|
||||
@@ -330,6 +352,8 @@ public class Ngrams {
|
||||
for (Sentence s : corpus) {
|
||||
List<Word> sentence = s.getWords();
|
||||
|
||||
// stats.updateUniGramOccurrences(s.getWords().size());
|
||||
|
||||
if (sentence == null){
|
||||
continue;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user