Added a filter that deletes words with lower frequency from the output (optimization for large corpora)

This commit is contained in:
2019-02-27 10:14:40 +01:00
parent b8dee86c36
commit 82d111eade
20 changed files with 1670 additions and 561 deletions

View File

@@ -6,6 +6,7 @@ import java.io.*;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.atomic.AtomicLong;
import javax.xml.namespace.QName;
import javax.xml.stream.XMLEventReader;
@@ -178,6 +179,26 @@ public class XML_processing {
// alg.inflectedJOS.ForkJoin wc = new alg.inflectedJOS.ForkJoin(corpus, stats);
// pool.invoke(wc);
}
// if running with the minimalRelFre filter, erase all ngrams whose occurrences are lower than the set value per 1M words
if(stats.getFilter().getIsMinimalRelFreScraper()) {
// long countFor1MWords = stats.getCountWordsForMinimalRelFreNgrams() +
long countFor1MWords = stats.getUniGramOccurrences().get(stats.getCorpus().getTotal()).longValue();
if(countFor1MWords > 1000000L){
double absToRelFactor = (stats.getFilter().getMinimalRelFre() / 1000000.0) * countFor1MWords;
stats.updateMinimalRelFre(stats.getTaxonomyResult().get(stats.getCorpus().getTotal()).entrySet(), absToRelFactor);
// reset all values
for(Taxonomy taxonomy : stats.getTaxonomyResult().keySet()){
stats.getTaxonomyResult().put(taxonomy, new ConcurrentHashMap<>());
}
for(Taxonomy taxonomy : stats.getUniGramOccurrences().keySet()){
stats.getUniGramOccurrences().put(taxonomy, new AtomicLong(0));
}
}
// System.out.println("asd");
}
}
// public static void readXMLGos(String path, Statistics stats) {