Added a filter to delete words with lower frequency from the output (optimization for large corpora)

This commit is contained in:
2019-02-27 10:14:40 +01:00
parent b8dee86c36
commit 82d111eade
20 changed files with 1670 additions and 561 deletions

View File

@@ -29,6 +29,8 @@ public class Filter implements Cloneable {
NOTE_PUNCTUATIONS,
MINIMAL_OCCURRENCES,
MINIMAL_TAXONOMY,
MINIMAL_REL_FRE,
IS_MINIMAL_REL_FRE_SCRAPER,
TAXONOMY_SET_OPERATION,
COLLOCABILITY,
PREFIX_LENGTH,
@@ -41,6 +43,7 @@ public class Filter implements Cloneable {
filter = new HashMap<>();
filter.put(WRITE_MSD_AT_THE_END, false);
filter.put(WORD_PARTS, new ArrayList<CalculateFor>());
filter.put(IS_MINIMAL_REL_FRE_SCRAPER, false);
}
public Filter(AnalysisLevel al, CalculateFor cf) {
@@ -258,6 +261,24 @@ public class Filter implements Cloneable {
return (Integer) filter.get(MINIMAL_TAXONOMY);
}
/**
 * Stores the minimal relative frequency threshold under MINIMAL_REL_FRE.
 *
 * @param minimalRelFre minimum relative frequency a word must reach to be kept
 */
public void setMinimalRelFre(Integer minimalRelFre) {
    filter.put(MINIMAL_REL_FRE, minimalRelFre);
}
/**
 * @return the minimal relative frequency threshold, or null if never set
 */
public Integer getMinimalRelFre() {
    Object threshold = filter.get(MINIMAL_REL_FRE);
    return (Integer) threshold;
}
/**
 * Marks whether this filter belongs to the pre-scan ("scraper") pass that
 * collects minimal-relative-frequency candidates.
 *
 * @param isMinimalRelFreScraper true for the scraper pass, false for the real run
 */
public void setIsMinimalRelFreScraper(boolean isMinimalRelFreScraper) {
    filter.put(IS_MINIMAL_REL_FRE_SCRAPER, isMinimalRelFreScraper);
}
/**
 * @return the scraper-pass flag; defaults to false (set in the constructor)
 */
public boolean getIsMinimalRelFreScraper() {
    Object flag = filter.get(IS_MINIMAL_REL_FRE_SCRAPER);
    return (boolean) flag;
}
// PREFIX_LENGTH,
// SUFFIX_LENGTH,
// PREFIX_LIST,

View File

@@ -66,4 +66,6 @@ public interface MultipleHMKeys {
.thenComparing(MultipleHMKeys::getK5)
.compare(this, othr);
}
MultipleHMKeys[] splitNgramTo1grams();
}

View File

@@ -36,4 +36,13 @@ public final class MultipleHMKeys1 implements MultipleHMKeys {
public boolean equals(Object obj) {
return (obj instanceof MultipleHMKeys1) && ((MultipleHMKeys1) obj).k1.equals(k1);
}
/**
 * Splits this n-gram key into one unigram key per space-separated token of k1.
 * Local renamed to avoid shadowing the k1 field.
 */
public MultipleHMKeys[] splitNgramTo1grams(){
    String[] tokens = getK1().split(" ");
    MultipleHMKeys[] unigrams = new MultipleHMKeys[tokens.length];
    int i = 0;
    for (String token : tokens) {
        unigrams[i++] = new MultipleHMKeys1(token);
    }
    return unigrams;
}
}

View File

@@ -46,4 +46,14 @@ public final class MultipleHMKeys2 implements MultipleHMKeys {
// return (obj instanceof MultipleHMKeys) && ((MultipleHMKeys) obj).key.equals(key);
}
/**
 * Splits this n-gram key into unigram keys, pairing the i-th token of k1
 * with the i-th token of k2.
 * NOTE(review): assumes k1 and k2 split into the same number of tokens — TODO confirm.
 */
public MultipleHMKeys[] splitNgramTo1grams(){
    String[] firstKeys = getK1().split(" ");
    String[] secondKeys = getK2().split(" ");
    MultipleHMKeys[] unigrams = new MultipleHMKeys[firstKeys.length];
    for (int i = 0; i < unigrams.length; i++) {
        unigrams[i] = new MultipleHMKeys2(firstKeys[i], secondKeys[i]);
    }
    return unigrams;
}
}

View File

@@ -50,4 +50,15 @@ public final class MultipleHMKeys3 implements MultipleHMKeys {
&& ((MultipleHMKeys3) obj).k2.equals(k2)
&& ((MultipleHMKeys3) obj).k3.equals(k3);
}
/**
 * Splits this n-gram key into unigram keys, taking the i-th token from each
 * of k1, k2 and k3.
 * NOTE(review): assumes all three keys split into the same number of tokens — TODO confirm.
 */
public MultipleHMKeys[] splitNgramTo1grams(){
    String[] firstKeys = getK1().split(" ");
    String[] secondKeys = getK2().split(" ");
    String[] thirdKeys = getK3().split(" ");
    MultipleHMKeys[] unigrams = new MultipleHMKeys[firstKeys.length];
    for (int i = 0; i < unigrams.length; i++) {
        unigrams[i] = new MultipleHMKeys3(firstKeys[i], secondKeys[i], thirdKeys[i]);
    }
    return unigrams;
}
}

View File

@@ -58,4 +58,16 @@ public final class MultipleHMKeys4 implements MultipleHMKeys {
&& ((MultipleHMKeys4) obj).k3.equals(k3)
&& ((MultipleHMKeys4) obj).k4.equals(k4);
}
/**
 * Splits this n-gram key into unigram keys, taking the i-th token from each
 * of k1 through k4.
 * NOTE(review): assumes all four keys split into the same number of tokens — TODO confirm.
 */
public MultipleHMKeys[] splitNgramTo1grams(){
    String[] firstKeys = getK1().split(" ");
    String[] secondKeys = getK2().split(" ");
    String[] thirdKeys = getK3().split(" ");
    String[] fourthKeys = getK4().split(" ");
    MultipleHMKeys[] unigrams = new MultipleHMKeys[firstKeys.length];
    for (int i = 0; i < unigrams.length; i++) {
        unigrams[i] = new MultipleHMKeys4(firstKeys[i], secondKeys[i], thirdKeys[i], fourthKeys[i]);
    }
    return unigrams;
}
}

View File

@@ -66,4 +66,17 @@ public final class MultipleHMKeys5 implements MultipleHMKeys {
&& ((MultipleHMKeys5) obj).k4.equals(k4)
&& ((MultipleHMKeys5) obj).k5.equals(k5);
}
/**
 * Splits this n-gram key into unigram keys, taking the i-th token from each
 * of k1 through k5.
 * NOTE(review): assumes all five keys split into the same number of tokens — TODO confirm.
 */
public MultipleHMKeys[] splitNgramTo1grams(){
    String[] firstKeys = getK1().split(" ");
    String[] secondKeys = getK2().split(" ");
    String[] thirdKeys = getK3().split(" ");
    String[] fourthKeys = getK4().split(" ");
    String[] fifthKeys = getK5().split(" ");
    MultipleHMKeys[] unigrams = new MultipleHMKeys[firstKeys.length];
    for (int i = 0; i < unigrams.length; i++) {
        unigrams[i] = new MultipleHMKeys5(firstKeys[i], secondKeys[i], thirdKeys[i], fourthKeys[i], fifthKeys[i]);
    }
    return unigrams;
}
}

View File

@@ -46,6 +46,9 @@ public class StatisticsNew {
private Map<Collocability, Map<MultipleHMKeys, Double>> collocability;
private Map<Taxonomy, AtomicLong> uniGramTaxonomyOccurrences;
private HashSet<MultipleHMKeys> minimalRelFreNgrams;
private HashSet<MultipleHMKeys> minimalRelFre1grams;
public StatisticsNew(Corpus corpus, Filter filter, boolean useDB) {
this.corpus = corpus;
this.filter = filter;
@@ -54,6 +57,9 @@ public class StatisticsNew {
this.collocability = new ConcurrentHashMap<>();
this.uniGramTaxonomyOccurrences = new ConcurrentHashMap<>();
this.uniGramTaxonomyOccurrences.put(corpus.getTotal(), new AtomicLong(0L));
this.minimalRelFreNgrams = new HashSet<>();
this.minimalRelFre1grams = new HashSet<>();
// create table for counting word occurrences per taxonomies
@@ -373,6 +379,10 @@ public class StatisticsNew {
}
public void updateTaxonomyResults(MultipleHMKeys o, List<Taxonomy> taxonomy) {
if(minimalRelFreNgrams.size() > 0 && !filter.getIsMinimalRelFreScraper() && !(minimalRelFreNgrams.contains(o) || minimalRelFre1grams.contains(o))) {
return;
}
for (Taxonomy key : taxonomyResult.keySet()) {
// first word should have the same taxonomy as others
if (key.equals(corpus.getTotal()) || taxonomy.contains(key)) {
@@ -472,6 +482,28 @@ public class StatisticsNew {
}
}
/**
 * @return the set of n-gram keys that passed the minimal-relative-frequency pre-scan.
 * NOTE(review): returns the internal mutable set; callers can mutate state through it.
 */
public HashSet<MultipleHMKeys> getMinimalRelFreNgrams() {
    return minimalRelFreNgrams;
}
/**
 * @return the set of unigram keys derived from the n-grams that passed the pre-scan.
 * NOTE(review): returns the internal mutable set; callers can mutate state through it.
 */
public HashSet<MultipleHMKeys> getMinimalRelFre1grams() {
    return minimalRelFre1grams;
}
/**
 * Replaces both minimal-relative-frequency sets wholesale (e.g. to carry results
 * of the scraper pass into the real run).
 *
 * @param hsNgrams replacement set of qualifying n-gram keys
 * @param hs1grams replacement set of qualifying unigram keys
 */
public void updateMinimalRelFre(HashSet<MultipleHMKeys> hsNgrams, HashSet<MultipleHMKeys> hs1grams) {
    this.minimalRelFreNgrams = hsNgrams;
    this.minimalRelFre1grams = hs1grams;
}
/**
 * Records every n-gram whose absolute occurrence count reaches absToRelFactor,
 * along with all of its constituent unigrams, so later passes can skip words
 * below the minimal relative frequency.
 *
 * @param entries        key/count pairs from a counting pass
 * @param absToRelFactor absolute-count threshold equivalent to the minimal relative frequency
 */
public void updateMinimalRelFre(Set<Map.Entry<MultipleHMKeys, AtomicLong>> entries, double absToRelFactor) {
    for (Map.Entry<MultipleHMKeys, AtomicLong> entry : entries) {
        long occurrences = entry.getValue().longValue();
        if (occurrences < absToRelFactor) {
            continue; // below threshold — not kept
        }
        MultipleHMKeys ngram = entry.getKey();
        minimalRelFreNgrams.add(ngram);
        minimalRelFre1grams.addAll(Arrays.asList(ngram.splitNgramTo1grams()));
    }
}
private LinkedHashMap<String, String> headerInfoBlock() {
LinkedHashMap<String, String> info = new LinkedHashMap<>();