Added filter to delete words with lower frequency from the output (optimization for large corpora)
This commit is contained in:
@@ -29,6 +29,8 @@ public class Filter implements Cloneable {
|
||||
NOTE_PUNCTUATIONS,
|
||||
MINIMAL_OCCURRENCES,
|
||||
MINIMAL_TAXONOMY,
|
||||
MINIMAL_REL_FRE,
|
||||
IS_MINIMAL_REL_FRE_SCRAPER,
|
||||
TAXONOMY_SET_OPERATION,
|
||||
COLLOCABILITY,
|
||||
PREFIX_LENGTH,
|
||||
@@ -41,6 +43,7 @@ public class Filter implements Cloneable {
|
||||
filter = new HashMap<>();
|
||||
filter.put(WRITE_MSD_AT_THE_END, false);
|
||||
filter.put(WORD_PARTS, new ArrayList<CalculateFor>());
|
||||
filter.put(IS_MINIMAL_REL_FRE_SCRAPER, false);
|
||||
}
|
||||
|
||||
public Filter(AnalysisLevel al, CalculateFor cf) {
|
||||
@@ -258,6 +261,24 @@ public class Filter implements Cloneable {
|
||||
return (Integer) filter.get(MINIMAL_TAXONOMY);
|
||||
}
|
||||
|
||||
|
||||
/**
 * Stores the minimal relative-frequency threshold under MINIMAL_REL_FRE.
 * Presumably entries below this value are filtered out of the output —
 * TODO confirm against the consumer of MINIMAL_REL_FRE.
 *
 * @param minimalRelFre threshold value; may be null to leave it unset
 */
public void setMinimalRelFre(Integer minimalRelFre) {
    filter.put(MINIMAL_REL_FRE, minimalRelFre);
}
|
||||
|
||||
/**
 * Returns the minimal relative-frequency threshold previously stored via
 * {@link #setMinimalRelFre(Integer)}, or null if it was never set.
 */
public Integer getMinimalRelFre() {
    return (Integer) filter.get(MINIMAL_REL_FRE);
}
|
||||
|
||||
|
||||
/**
 * Flags whether this filter instance drives the "minimal relative frequency"
 * pre-scraping pass (key IS_MINIMAL_REL_FRE_SCRAPER). The visible
 * constructor initializes this flag to false.
 *
 * @param isMinimalRelFreScraper true when running the scraper pre-pass
 */
public void setIsMinimalRelFreScraper(boolean isMinimalRelFreScraper) {
    filter.put(IS_MINIMAL_REL_FRE_SCRAPER, isMinimalRelFreScraper);
}
|
||||
|
||||
public boolean getIsMinimalRelFreScraper() {
|
||||
return (boolean) filter.get(IS_MINIMAL_REL_FRE_SCRAPER);
|
||||
}
|
||||
|
||||
// PREFIX_LENGTH,
|
||||
// SUFFIX_LENGTH,
|
||||
// PREFIX_LIST,
|
||||
|
||||
@@ -66,4 +66,6 @@ public interface MultipleHMKeys {
|
||||
.thenComparing(MultipleHMKeys::getK5)
|
||||
.compare(this, othr);
|
||||
}
|
||||
|
||||
MultipleHMKeys[] splitNgramTo1grams();
|
||||
}
|
||||
|
||||
@@ -36,4 +36,13 @@ public final class MultipleHMKeys1 implements MultipleHMKeys {
|
||||
/**
 * Value equality on the single key component k1.
 * NOTE(review): equals must be paired with a consistent hashCode —
 * presumably one is defined elsewhere in this class; verify.
 * Assumes k1 is non-null — TODO confirm class invariant.
 */
public boolean equals(Object obj) {
    return (obj instanceof MultipleHMKeys1) && ((MultipleHMKeys1) obj).k1.equals(k1);
}
|
||||
|
||||
public MultipleHMKeys[] splitNgramTo1grams(){
|
||||
String[] k1 = getK1().split(" ");
|
||||
MultipleHMKeys[] res = new MultipleHMKeys[k1.length];
|
||||
for(int i = 0; i < k1.length; i++){
|
||||
res[i] = new MultipleHMKeys1(k1[i]);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -46,4 +46,14 @@ public final class MultipleHMKeys2 implements MultipleHMKeys {
|
||||
|
||||
// return (obj instanceof MultipleHMKeys) && ((MultipleHMKeys) obj).key.equals(key);
|
||||
}
|
||||
|
||||
public MultipleHMKeys[] splitNgramTo1grams(){
|
||||
String[] k1 = getK1().split(" ");
|
||||
String[] k2 = getK2().split(" ");
|
||||
MultipleHMKeys[] res = new MultipleHMKeys[k1.length];
|
||||
for(int i = 0; i < k1.length; i++){
|
||||
res[i] = new MultipleHMKeys2(k1[i], k2[i]);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -50,4 +50,15 @@ public final class MultipleHMKeys3 implements MultipleHMKeys {
|
||||
&& ((MultipleHMKeys3) obj).k2.equals(k2)
|
||||
&& ((MultipleHMKeys3) obj).k3.equals(k3);
|
||||
}
|
||||
|
||||
public MultipleHMKeys[] splitNgramTo1grams(){
|
||||
String[] k1 = getK1().split(" ");
|
||||
String[] k2 = getK2().split(" ");
|
||||
String[] k3 = getK3().split(" ");
|
||||
MultipleHMKeys[] res = new MultipleHMKeys[k1.length];
|
||||
for(int i = 0; i < k1.length; i++){
|
||||
res[i] = new MultipleHMKeys3(k1[i], k2[i], k3[i]);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -58,4 +58,16 @@ public final class MultipleHMKeys4 implements MultipleHMKeys {
|
||||
&& ((MultipleHMKeys4) obj).k3.equals(k3)
|
||||
&& ((MultipleHMKeys4) obj).k4.equals(k4);
|
||||
}
|
||||
|
||||
public MultipleHMKeys[] splitNgramTo1grams(){
|
||||
String[] k1 = getK1().split(" ");
|
||||
String[] k2 = getK2().split(" ");
|
||||
String[] k3 = getK3().split(" ");
|
||||
String[] k4 = getK4().split(" ");
|
||||
MultipleHMKeys[] res = new MultipleHMKeys[k1.length];
|
||||
for(int i = 0; i < k1.length; i++){
|
||||
res[i] = new MultipleHMKeys4(k1[i], k2[i], k3[i], k4[i]);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -66,4 +66,17 @@ public final class MultipleHMKeys5 implements MultipleHMKeys {
|
||||
&& ((MultipleHMKeys5) obj).k4.equals(k4)
|
||||
&& ((MultipleHMKeys5) obj).k5.equals(k5);
|
||||
}
|
||||
|
||||
public MultipleHMKeys[] splitNgramTo1grams(){
|
||||
String[] k1 = getK1().split(" ");
|
||||
String[] k2 = getK2().split(" ");
|
||||
String[] k3 = getK3().split(" ");
|
||||
String[] k4 = getK4().split(" ");
|
||||
String[] k5 = getK5().split(" ");
|
||||
MultipleHMKeys[] res = new MultipleHMKeys[k1.length];
|
||||
for(int i = 0; i < k1.length; i++){
|
||||
res[i] = new MultipleHMKeys5(k1[i], k2[i], k3[i], k4[i], k5[i]);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -46,6 +46,9 @@ public class StatisticsNew {
|
||||
private Map<Collocability, Map<MultipleHMKeys, Double>> collocability;
|
||||
private Map<Taxonomy, AtomicLong> uniGramTaxonomyOccurrences;
|
||||
|
||||
private HashSet<MultipleHMKeys> minimalRelFreNgrams;
|
||||
private HashSet<MultipleHMKeys> minimalRelFre1grams;
|
||||
|
||||
public StatisticsNew(Corpus corpus, Filter filter, boolean useDB) {
|
||||
this.corpus = corpus;
|
||||
this.filter = filter;
|
||||
@@ -54,6 +57,9 @@ public class StatisticsNew {
|
||||
this.collocability = new ConcurrentHashMap<>();
|
||||
this.uniGramTaxonomyOccurrences = new ConcurrentHashMap<>();
|
||||
this.uniGramTaxonomyOccurrences.put(corpus.getTotal(), new AtomicLong(0L));
|
||||
this.minimalRelFreNgrams = new HashSet<>();
|
||||
this.minimalRelFre1grams = new HashSet<>();
|
||||
|
||||
|
||||
|
||||
// create table for counting word occurrences per taxonomies
|
||||
@@ -373,6 +379,10 @@ public class StatisticsNew {
|
||||
}
|
||||
|
||||
public void updateTaxonomyResults(MultipleHMKeys o, List<Taxonomy> taxonomy) {
|
||||
if(minimalRelFreNgrams.size() > 0 && !filter.getIsMinimalRelFreScraper() && !(minimalRelFreNgrams.contains(o) || minimalRelFre1grams.contains(o))) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (Taxonomy key : taxonomyResult.keySet()) {
|
||||
// first word should have the same taxonomy as others
|
||||
if (key.equals(corpus.getTotal()) || taxonomy.contains(key)) {
|
||||
@@ -472,6 +482,28 @@ public class StatisticsNew {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * N-gram keys whose count met the absolute-to-relative frequency threshold
 * (populated by updateMinimalRelFre).
 * NOTE(review): exposes the internal set directly — callers share mutable state.
 */
public HashSet<MultipleHMKeys> getMinimalRelFreNgrams() {
    return minimalRelFreNgrams;
}
|
||||
|
||||
/**
 * 1-gram keys derived (via splitNgramTo1grams) from the n-grams that met
 * the frequency threshold (populated by updateMinimalRelFre).
 * NOTE(review): exposes the internal set directly — callers share mutable state.
 */
public HashSet<MultipleHMKeys> getMinimalRelFre1grams() {
    return minimalRelFre1grams;
}
|
||||
|
||||
/**
 * Replaces both minimal-relative-frequency key sets wholesale — presumably
 * used to transfer results from a scraper pre-pass into this statistics
 * object; TODO confirm caller.
 *
 * @param hsNgrams n-gram keys that passed the frequency threshold
 * @param hs1grams the corresponding constituent 1-gram keys
 */
public void updateMinimalRelFre(HashSet<MultipleHMKeys> hsNgrams, HashSet<MultipleHMKeys> hs1grams) {
    minimalRelFreNgrams = hsNgrams;
    minimalRelFre1grams = hs1grams;
}
|
||||
|
||||
public void updateMinimalRelFre(Set<Map.Entry<MultipleHMKeys, AtomicLong>> entries, double absToRelFactor) {
|
||||
for(Map.Entry<MultipleHMKeys, AtomicLong> entry : entries){
|
||||
if(entry.getValue().longValue() >= absToRelFactor){
|
||||
minimalRelFreNgrams.add(entry.getKey());
|
||||
minimalRelFre1grams.addAll(Arrays.asList(entry.getKey().splitNgramTo1grams()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private LinkedHashMap<String, String> headerInfoBlock() {
|
||||
LinkedHashMap<String, String> info = new LinkedHashMap<>();
|
||||
|
||||
|
||||
Reference in New Issue
Block a user