package nogui;

import alg.XML_processing;
import data.*;
import gui.I18N;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.json.simple.JSONArray;

import java.io.File;
import java.io.UnsupportedEncodingException;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Pattern;
public class Utils {

    public final static Logger logger = LogManager.getLogger(Utils.class);
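
    /**
     * Converts a JSONArray of taxonomy names into {@link Taxonomy} objects
     * resolved against the given corpus.
     */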
    public static ArrayList<Taxonomy> getTaxonomy(JSONArray taxonomyArray, Corpus corpus) {
        // convert the JSONArray to a list of taxonomy names
        ArrayList<String> checkedItems = new ArrayList<>();
        for (Object o : taxonomyArray) {
            checkedItems.add((String) o);
        }

        return Taxonomy.modifyingTaxonomy(new ArrayList<>(), checkedItems, corpus);
    }
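
    /**
     * Converts a JSONArray of collocability-measure names into
     * {@link Collocability} values via {@code Collocability.factory}.
     */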
    public static ArrayList<Collocability> getCollocability(JSONArray collocabilityArray) {
        // convert the JSONArray to a list of collocability measures
        ArrayList<Collocability> checkedItems = new ArrayList<>();
        for (Object o : collocabilityArray) {
            checkedItems.add(Collocability.factory((String) o));
        }
        return checkedItems;
    }
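
    /**
     * Converts a JSONArray of strings into a plain ArrayList.
     */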
    public static ArrayList<String> getArrayList(JSONArray array) {
        // convert the JSONArray to a list of strings
        ArrayList<String> arrayList = new ArrayList<>();
        for (Object o : array) {
            arrayList.add((String) o);
        }
        return arrayList;
    }
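
    /**
     * Converts a JSONArray of i18n message keys into their localized strings.
     */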
    public static ArrayList<String> getAlsoVisualizeList(JSONArray array) {
        // convert the JSONArray of i18n keys to their localized labels
        ArrayList<String> arrayList = new ArrayList<>();
        for (Object o : array) {
            arrayList.add(I18N.get((String) o));
        }
        return arrayList;
    }
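
    /**
     * Compiles a space-separated list of MSD (morphosyntactic description)
     * expressions into regex patterns; an empty string yields an empty list.
     * For example, {@code getMsd("N.* V.*")} returns two compiled patterns.
     */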
    public static ArrayList<Pattern> getMsd(String stringMsd) {
        ArrayList<Pattern> msd = new ArrayList<>();
        if (stringMsd.isEmpty()) {
            return msd;
        }
        for (String msdToken : stringMsd.split(" ")) {
            msd.add(Pattern.compile(msdToken));
        }
        return msd;
    }
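
    // Intentional no-ops: the nogui build reports no progress. The signatures
    // appear to mirror the GUI counterparts so shared callers keep compiling.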
    public static void updateProgress(int i, int size, String format) {
    }

    public static void updateProgress(double i, int size, String format) {
    }
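
    /**
     * Preliminary pass for the minimal-relative-frequency filter: sweeps the
     * corpus with a cloned filter flagged as a scraper, derives the absolute
     * threshold from the resulting counts, resets the intermediate results,
     * and then hands off to {@link #prepareMainTask(StatisticsNew, Corpus)}.
     */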
    public static void prepareTaskForMinRelFre(StatisticsNew statistic, Corpus corpus) {
        Filter fi = statistic.getFilter();
        logger.info("Started execution: {}", fi);

        try {
            Filter f2 = (Filter) fi.clone();
            f2.setIsMinimalRelFreScraper(true);
            StatisticsNew statisticsMinRelFre = new StatisticsNew(corpus, f2, false);

            Collection<File> corpusFiles = statisticsMinRelFre.getCorpus().getDetectedCorpusFiles();

            final boolean multipleFiles = CorpusType.multipleFilesCorpuses().contains(statisticsMinRelFre.getCorpus().getCorpusType());

            Date startTime = new Date();
            Date previousTime = new Date();
            int remainingSeconds = -1;
            int i = 0;
            // this pre-pass is the first of two sweeps over the corpus, or of
            // three when a collocability pass will follow
            int corpusSize = statistic.getFilter().getCollocability().size() > 0
                    ? corpusFiles.size() * 3
                    : corpusFiles.size() * 2;
            for (File f : corpusFiles) {
                XML_processing xml_processing = new XML_processing();
                i++;
                if (multipleFiles) {
                    // remaining-time estimate, refreshed at most twice a second;
                    // retained from the GUI build, nothing consumes it here
                    if ((new Date()).getTime() - previousTime.getTime() > 500 || remainingSeconds == -1) {
                        remainingSeconds = (int) (((new Date()).getTime() - startTime.getTime()) * (1.0 / i) * (corpusSize - i) / 1000);
                        previousTime = new Date();
                    }
                }
                xml_processing.readXML(f.toString(), statisticsMinRelFre);
            }

            // derive the absolute-frequency threshold from the scraped counts,
            // record it, then reset the intermediate results
            if (statisticsMinRelFre.getFilter().getIsMinimalRelFreScraper()) {
                long countFor1MWords = statisticsMinRelFre.getUniGramOccurrences().get(statisticsMinRelFre.getCorpus().getTotal()).longValue();
                double absToRelFactor = (statisticsMinRelFre.getFilter().getMinimalRelFre() / 1000000.0) * countFor1MWords;

                statisticsMinRelFre.updateMinimalRelFre(statisticsMinRelFre.getTaxonomyResult().get(statisticsMinRelFre.getCorpus().getTotal()).entrySet(), absToRelFactor);

                // reset all values
                for (Taxonomy taxonomy : statisticsMinRelFre.getTaxonomyResult().keySet()) {
                    statisticsMinRelFre.getTaxonomyResult().put(taxonomy, new ConcurrentHashMap<>());
                }
                for (Taxonomy taxonomy : statisticsMinRelFre.getUniGramOccurrences().keySet()) {
                    statisticsMinRelFre.getUniGramOccurrences().put(taxonomy, new AtomicLong(0));
                }
            }

            prepareMainTask(statistic, corpus);
        } catch (CloneNotSupportedException e) {
            logger.error("Filter could not be cloned", e);
        }
    }
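
    /**
     * Main pass: sweeps the corpus, prunes results below the
     * minimal-relative-frequency threshold when one is set, then either
     * chains into the collocability pass or saves the results to disk.
     */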
    public static void prepareMainTask(StatisticsNew statistic, Corpus corpus) {
        Filter f = statistic.getFilter();
        logger.info("Started execution: {}", f);

        Collection<File> corpusFiles = statistic.getCorpus().getDetectedCorpusFiles();

        final boolean multipleFiles = CorpusType.multipleFilesCorpuses().contains(statistic.getCorpus().getCorpusType());
        Date startTime = new Date();
        Date previousTime = new Date();
        int remainingSeconds = -1;
        int corpusSize;
        int i;
        int taskIndex = 0;
        // i and corpusSize place this pass within the whole run: a minRelFre
        // pre-pass (if any) has already swept the corpus once, and a
        // collocability pass (if any) will sweep it once more afterwards
        if (statistic.getFilter().getCollocability().size() > 0 && statistic.getFilter().getMinimalRelFre() > 1) {
            i = corpusFiles.size();
            corpusSize = corpusFiles.size() * 3;
        } else if (statistic.getFilter().getMinimalRelFre() > 1) {
            i = corpusFiles.size();
            corpusSize = corpusFiles.size() * 2;
        } else if (statistic.getFilter().getCollocability().size() > 0) {
            i = 0;
            corpusSize = corpusFiles.size() * 2;
        } else {
            i = 0;
            corpusSize = corpusFiles.size();
        }
        for (File fi : corpusFiles) {
            XML_processing xml_processing = new XML_processing();
            xml_processing.isCancelled = false;
            i++;
            taskIndex++;
            if (multipleFiles) {
                // remaining-time estimate, refreshed at most twice a second;
                // the GUI progress message that consumed it is not shown here
                if ((new Date()).getTime() - previousTime.getTime() > 500 || remainingSeconds == -1) {
                    remainingSeconds = (int) (((new Date()).getTime() - startTime.getTime()) * (1.0 / taskIndex) * (corpusSize - i) / 1000);
                    previousTime = new Date();
                }
            }
            // (the per-file JavaFX progress-bar listener used by the GUI build is omitted here)
            xml_processing.readXML(fi.toString(), statistic);
        }

        // if minimalRelFre > 1, erase all words with fewer occurrences at the end of processing
        if (statistic.getFilter().getMinimalRelFre() > 1) {
            long countFor1MWords = statistic.getUniGramOccurrences().get(statistic.getCorpus().getTotal()).longValue();
            double absToRelFactor = (statistic.getFilter().getMinimalRelFre() / 1000000.0) * countFor1MWords;

            statistic.getTaxonomyResult().get(statistic.getCorpus().getTotal()).entrySet()
                    .removeIf(entry -> entry.getValue().longValue() < absToRelFactor);
            statistic.updateMinimalRelFre(statistic.getTaxonomyResult().get(statistic.getCorpus().getTotal()).entrySet(), absToRelFactor);
        }

        if (f.getCollocability().size() > 0) {
            try {
                Filter f2 = (Filter) f.clone();
                f2.setNgramValue(1);
                StatisticsNew statisticsOneGrams = new StatisticsNew(corpus, f2, false);
                prepareTaskForCollocability(statistic, statisticsOneGrams);
            } catch (CloneNotSupportedException e) {
                logger.error("Filter could not be cloned", e);
            }
        } else {
            try {
                boolean successfullySaved = statistic.saveResultToDisk();
                if (successfullySaved) {
                    logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED"));
                } else {
                    logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS"));
                }
            } catch (UnsupportedEncodingException e1) {
                logger.error(I18N.get("message.ERROR_WHILE_SAVING_RESULTS_TO_CSV"));
                logger.error("Error while saving", e1);
            } catch (OutOfMemoryError e1) {
                logger.error(I18N.get("message.ERROR_NOT_ENOUGH_MEMORY"));
                logger.error("Out of memory error", e1);
            }
        }
    }
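
    /**
     * Final one-gram pass used for collocability: sweeps the corpus once more,
     * merges the collocability statistics into the main results, and saves
     * them to disk.
     */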
    public static void prepareTaskForCollocability(StatisticsNew statistic, StatisticsNew statisticsOneGrams) {
        Collection<File> corpusFiles = statisticsOneGrams.getCorpus().getDetectedCorpusFiles();

        final boolean multipleFiles = CorpusType.multipleFilesCorpuses().contains(statistic.getCorpus().getCorpusType());
        Date startTime = new Date();
        Date previousTime = new Date();
        int remainingSeconds = -1;

        int corpusSize;
        int i;
        int taskIndex = 0;
        // this is always the last pass; one or two full sweeps have already run
        if (statistic.getFilter().getMinimalRelFre() > 1) {
            i = corpusFiles.size() * 2;
            corpusSize = corpusFiles.size() * 3;
        } else {
            i = corpusFiles.size();
            corpusSize = corpusFiles.size() * 2;
        }

        for (File f : corpusFiles) {
            XML_processing xml_processing = new XML_processing();
            i++;
            taskIndex++;
            if (xml_processing.progressBarListener != null) {
                xml_processing.progressProperty().removeListener(xml_processing.progressBarListener);
            }
            if (multipleFiles) {
                // remaining-time estimate, refreshed at most twice a second;
                // the GUI progress message that consumed it is not shown here
                if ((new Date()).getTime() - previousTime.getTime() > 500 || remainingSeconds == -1) {
                    remainingSeconds = (int) (((new Date()).getTime() - startTime.getTime()) * (1.0 / taskIndex) * (corpusSize - i) / 1000);
                    previousTime = new Date();
                }
            }
            // (the per-file JavaFX progress-bar listener used by the GUI build is omitted here)
            xml_processing.isCollocability = true;
            xml_processing.readXML(f.toString(), statisticsOneGrams);
            xml_processing.isCollocability = false;
        }

        try {
            logger.debug("Calculating collocabilities for: {}", statistic);
            statistic.updateCalculateCollocabilities(statisticsOneGrams);
            boolean successfullySaved = statistic.saveResultToDisk();
            if (successfullySaved) {
                logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED"));
            } else {
                logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS"));
            }
        } catch (UnsupportedEncodingException e1) {
            logger.error(I18N.get("message.ERROR_WHILE_SAVING_RESULTS_TO_CSV"));
            logger.error("Error while saving", e1);
        } catch (OutOfMemoryError e1) {
            logger.error(I18N.get("message.ERROR_NOT_ENOUGH_MEMORY"));
            logger.error("Out of memory error", e1);
        }
    }
}