Added filter to delete words with lower frequency from the output (optimization for large corpora)
This commit is contained in:
@@ -6,6 +6,7 @@ import java.io.*;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.ForkJoinPool;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
|
||||
import javax.xml.namespace.QName;
|
||||
import javax.xml.stream.XMLEventReader;
|
||||
@@ -178,6 +179,26 @@ public class XML_processing {
|
||||
// alg.inflectedJOS.ForkJoin wc = new alg.inflectedJOS.ForkJoin(corpus, stats);
|
||||
// pool.invoke(wc);
|
||||
}
|
||||
|
||||
// if running with minimalRelFre frequency erase all ngrams with occurrences lower than set value per 1M
|
||||
if(stats.getFilter().getIsMinimalRelFreScraper()) {
|
||||
// long countFor1MWords = stats.getCountWordsForMinimalRelFreNgrams() +
|
||||
long countFor1MWords = stats.getUniGramOccurrences().get(stats.getCorpus().getTotal()).longValue();
|
||||
if(countFor1MWords > 1000000L){
|
||||
double absToRelFactor = (stats.getFilter().getMinimalRelFre() / 1000000.0) * countFor1MWords;
|
||||
|
||||
stats.updateMinimalRelFre(stats.getTaxonomyResult().get(stats.getCorpus().getTotal()).entrySet(), absToRelFactor);
|
||||
|
||||
// reset all values
|
||||
for(Taxonomy taxonomy : stats.getTaxonomyResult().keySet()){
|
||||
stats.getTaxonomyResult().put(taxonomy, new ConcurrentHashMap<>());
|
||||
}
|
||||
for(Taxonomy taxonomy : stats.getUniGramOccurrences().keySet()){
|
||||
stats.getUniGramOccurrences().put(taxonomy, new AtomicLong(0));
|
||||
}
|
||||
}
|
||||
// System.out.println("asd");
|
||||
}
|
||||
}
|
||||
|
||||
// public static void readXMLGos(String path, Statistics stats) {
|
||||
|
||||
Reference in New Issue
Block a user