Added read taxonomy for vert

This commit is contained in:
Luka 2018-12-17 12:58:43 +01:00
parent 3889b834e3
commit bb9f3f0fb9
10 changed files with 188 additions and 16 deletions

View File

@ -2,8 +2,7 @@ package alg;
import static data.Enums.solar.SolarFilters.*;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.*;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ForkJoinPool;
@ -15,6 +14,8 @@ import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.*;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.apache.logging.log4j.LogManager;
import data.*;
@ -436,6 +437,62 @@ public class XML_processing {
return true;
}
/**
 * Scans a .vert corpus file line by line and collects the taxonomy values found on its
 * {@code <text ...>} header lines.
 *
 * <p>The values of the {@code medium_id}, {@code type_id} and {@code proofread_id} attributes are
 * gathered into one set. Empty values and the placeholder {@code "-"} are not reported.
 *
 * <p>If an {@link IOException} occurs while reading, the stack trace is printed and whatever was
 * collected up to that point is returned.
 *
 * @param filepath path of the .vert file to scan
 * @param corpusIsSplit is corpus split into multiple files, or are all entries grouped into one large file
 *                      (not used by this method)
 * @param corpusType type of the corpus (not used by this method)
 * @return set of distinct taxonomy values found in the file; never {@code null}
 */
public static HashSet<String> readVertHeaderTaxonomyAndFilters(String filepath, boolean corpusIsSplit, CorpusType corpusType) {
    HashSet<String> resultTaxonomy = new HashSet<>();
    // header attributes whose values are treated as taxonomy entries
    String[] taxonomyAttributes = {"medium_id", "type_id", "proofread_id"};

    LineIterator it = null;
    try {
        it = FileUtils.lineIterator(new File(filepath), "UTF-8");
        while (it.hasNext()) {
            String line = it.nextLine();

            // only lines of the form "<text ..." carry the header attributes
            if (line.length() > 4 && line.startsWith("text", 1)) {
                for (String attributeName : taxonomyAttributes) {
                    String value = extractVertHeaderAttribute(line, attributeName);
                    if (value != null && !value.isEmpty()) {
                        resultTaxonomy.add(value);
                    }
                }
            }
        }
    } catch (IOException e) {
        // best effort: report the problem and return what has been read so far
        e.printStackTrace();
    } finally {
        if (it != null) {
            LineIterator.closeQuietly(it);
        }
    }

    // "-" marks a missing value in the header and is not a taxonomy entry
    resultTaxonomy.remove("-");

    return resultTaxonomy;
}

/**
 * Returns the value of attribute {@code name} on a header line, i.e. the text between
 * {@code name="} and the next double quote.
 *
 * <p>Searching for the attribute directly (instead of splitting the line on {@code "\" "}) makes
 * the result independent of the attribute's position: the first attribute is not glued to the tag
 * name and the last one does not keep the closing {@code ">}.
 *
 * @param line header line to search
 * @param name attribute name
 * @return attribute value (possibly empty), or {@code null} if the attribute is absent or its
 *         value is not terminated by a double quote
 */
private static String extractVertHeaderAttribute(String line, String name) {
    String marker = name + "=\"";
    int searchFrom = 0;

    while (true) {
        int start = line.indexOf(marker, searchFrom);
        if (start < 0) {
            return null;
        }

        // require whitespace in front so that e.g. "type_id" does not match inside "subtype_id"
        if (start > 0 && Character.isWhitespace(line.charAt(start - 1))) {
            int valueStart = start + marker.length();
            int valueEnd = line.indexOf('"', valueStart);
            return valueEnd < 0 ? null : line.substring(valueStart, valueEnd);
        }

        searchFrom = start + 1;
    }
}
/**
* Parses XML headers for information about its taxonomy (if supported) or filters (solar)
*

View File

@ -5,7 +5,8 @@ public enum CorpusType {
CCKRES("ccKres ", "cckres"),
SOLAR("Šolar", "šolar"),
GOS("GOS", "gos"),
SSJ500K("ssj500k", "ssj500k");
SSJ500K("ssj500k", "ssj500k"),
VERT("vert", "vert");
private final String name;

View File

@ -10,7 +10,7 @@ import javafx.collections.ObservableList;
public class Tax {
private static LinkedHashMap<String, String> GIGAFIDA_TAXONOMY;
private static LinkedHashMap<String, String> GOS_TAXONOMY;
private static final HashSet<CorpusType> corpusTypesWithTaxonomy = new HashSet<>(Arrays.asList(CorpusType.GIGAFIDA, CorpusType.GOS, CorpusType.CCKRES, CorpusType.SSJ500K));
private static final HashSet<CorpusType> corpusTypesWithTaxonomy = new HashSet<>(Arrays.asList(CorpusType.GIGAFIDA, CorpusType.GOS, CorpusType.CCKRES, CorpusType.SSJ500K, CorpusType.VERT));
static {
// GIGAFIDA ----------------------------
@ -108,6 +108,11 @@ public class Tax {
tax = GIGAFIDA_TAXONOMY;
} else if (corpusType == CorpusType.GOS) {
tax = GOS_TAXONOMY;
} else if (corpusType == CorpusType.VERT){
// if VERT only order taxonomy by alphabet
ArrayList<String> sortedFoundTaxonomy = new ArrayList<>(foundTax);
Collections.sort(sortedFoundTaxonomy);
return FXCollections.observableArrayList(sortedFoundTaxonomy);
}
ArrayList<String> taxForCombo = new ArrayList<>();

View File

@ -696,6 +696,7 @@ public enum Taxonomy {
}
public static ArrayList<Taxonomy> convertStringListToTaxonomyList(ObservableList<String> stringList){
System.out.println(stringList);
ArrayList<Taxonomy> taxonomyList = new ArrayList<>();
// System.out.println("INTERESTING STUFF");
@ -710,6 +711,9 @@ public enum Taxonomy {
public static void modifyingTaxonomy(ArrayList<Taxonomy> taxonomy, ArrayList<Taxonomy> checkedItemsTaxonomy, Corpus corpus){
// get taxonomies that were selected/deselected by user
// System.out.println(taxonomy);
// System.out.println(checkedItemsTaxonomy);
Set<Taxonomy> disjointTaxonomies = new HashSet<>(checkedItemsTaxonomy);
if (taxonomy != null) {
disjointTaxonomies.addAll(taxonomy);

View File

@ -241,7 +241,7 @@ public class CharacterAnalysisTab {
msd = new ArrayList<>();
// taxonomy
if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType()) && corpus.getTaxonomy().size() > 0) {
if (taxonomyListener != null){
taxonomyCCB.getCheckModel().getCheckedItems().removeListener(taxonomyListener);
}
@ -287,7 +287,7 @@ public class CharacterAnalysisTab {
displayTaxonomy = false;
displayTaxonomyChB.setSelected(false);
// set
if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType()) && corpus.getTaxonomy().size() > 0) {
displayTaxonomyChB.setDisable(false);
displayTaxonomyChB.selectedProperty().addListener((observable, oldValue, newValue) -> {
displayTaxonomy = newValue;

View File

@ -211,8 +211,49 @@ public class CorpusTab {
// make sure there are corpus files in selected directory or notify the user about it
if (corpusFiles.size() == 0) {
logger.info("alert: ", I18N.get("message.WARNING_CORPUS_NOT_FOUND"));
showAlert(Alert.AlertType.ERROR, I18N.get("message.WARNING_CORPUS_NOT_FOUND"), null);
// try .vert
corpusFiles = FileUtils.listFiles(selectedDirectory, FileFilterUtils.suffixFileFilter("vert", IOCase.INSENSITIVE), TrueFileFilter.INSTANCE);
Collection<File> corpusFilesRegi = FileUtils.listFiles(selectedDirectory, FileFilterUtils.suffixFileFilter("regi", IOCase.INSENSITIVE), TrueFileFilter.INSTANCE);
if (corpusFiles.size() == 0){
logger.info("alert: ", I18N.get("message.WARNING_CORPUS_NOT_FOUND"));
showAlert(Alert.AlertType.ERROR, I18N.get("message.WARNING_CORPUS_NOT_FOUND"), null);
} else {
corpusType = VERT;
corpus.setCorpusType(corpusType);
Messages.setChooseCorpusProperties(corpusLocation, corpusFilesSize, corpusType.toString());
StringBuilder sb = new StringBuilder();
sb.append(corpusLocation)
.append("\n")
.append(String.format(I18N.get("message.NOTIFICATION_FOUND_X_FILES"), corpusFiles.size()))
.append("\n")
.append(String.format(I18N.get("message.NOTIFICATION_CORPUS"), corpusType.toString()));
String result = sb.toString();
logger.debug(result);
initNewCorpus(selectedDirectory, corpusFiles);
Messages.setChooseCorpusProperties(corpusLocation, corpusFilesSize, corpusType.toString());
corpus.setChosenCorpusLocation(selectedDirectory);
corpus.setDetectedCorpusFiles(corpusFiles);
chooseCorpusLabelContent = result;
if (readHeaderInfo) {
logger.info("reading header info...");
readHeaderInfo();
} else {
setResults();
setCorpusForAnalysis();
}
}
} else {
String chooseCorpusLabelContentTmp = detectCorpusType(corpusFiles);
@ -420,6 +461,70 @@ public class CorpusTab {
task.setOnCancelled(e -> togglePiAndSetCorpusWrapper(false));
task.setOnFailed(e -> togglePiAndSetCorpusWrapper(false));
final Thread thread = new Thread(task, "task");
thread.setDaemon(true);
thread.start();
} else if (corpusType == CorpusType.VERT) {
// many many fields
boolean corpusIsSplit = corpusFiles.size() > 1;
final Task<HashSet<String>> task = new Task<HashSet<String>>() {
@Override
protected HashSet<String> call() throws Exception {
HashSet<String> values = new HashSet<>();
long i = 0;
if (!corpusIsSplit) {
updateProgress(-1.0f, -1.0f);
}
for (File file : corpusFiles) {
HashSet<String> tmpvalues = XML_processing.readVertHeaderTaxonomyAndFilters(file.getAbsolutePath(), corpusIsSplit, corpusType);
// update final results
for (String entry : tmpvalues) {
if (!entry.equals("-")) {
values.add(entry);
}
}
i++;
if (corpusIsSplit) {
updateProgress(i, corpusFiles.size());
}
}
updateProgress(1.0f, 1.0f);
return values;
}
};
locationScanPI.progressProperty().bind(task.progressProperty());
task.setOnSucceeded(e -> {
ObservableList<String> readTaxonomy = Tax.getTaxonomyForComboBox(corpusType, task.getValue());
if (ValidationUtil.isEmpty(readTaxonomy)) {
// if no taxonomy found alert the user and keep other tabs disabled
logger.info("No vert filters found in headers.");
GUIController.showAlert(Alert.AlertType.ERROR, I18N.get("message.WARNING_NO_SOLAR_FILTERS_FOUND"));
} else {
// set taxonomy, update label
corpus.setTaxonomy(readTaxonomy);
corpus.setHeaderRead(true);
Messages.setChooseCorpusL(chooseCorpusL, chooseCorpusLabelContent);
setResults();
setCorpusForAnalysis();
}
togglePiAndSetCorpusWrapper(false);
});
task.setOnCancelled(e -> togglePiAndSetCorpusWrapper(false));
task.setOnFailed(e -> togglePiAndSetCorpusWrapper(false));
final Thread thread = new Thread(task, "task");
thread.setDaemon(true);
thread.start();

View File

@ -383,7 +383,7 @@ public class OneWordAnalysisTab {
alsoVisualizeCCB.getCheckModel().getCheckedItems().addListener(alsoVisualizeListener);
// taxonomy
if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType()) && corpus.getTaxonomy().size() > 0) {
if (taxonomyListener != null){
taxonomyCCB.getCheckModel().getCheckedItems().removeListener(taxonomyListener);
}
@ -430,7 +430,7 @@ public class OneWordAnalysisTab {
displayTaxonomy = false;
displayTaxonomyChB.setSelected(false);
// set
if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType()) && corpus.getTaxonomy().size() > 0) {
displayTaxonomyChB.setDisable(false);
displayTaxonomyChB.selectedProperty().addListener((observable, oldValue, newValue) -> {
displayTaxonomy = newValue;

View File

@ -312,7 +312,7 @@ public class StringAnalysisTabNew2 {
displayTaxonomyChB.setSelected(false);
// set
if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType()) && corpus.getTaxonomy().size() > 0) {
displayTaxonomyChB.setDisable(false);
displayTaxonomyChB.selectedProperty().addListener((observable, oldValue, newValue) -> {
displayTaxonomy = newValue;
@ -515,7 +515,7 @@ public class StringAnalysisTabNew2 {
alsoVisualizeCCB.getCheckModel().getCheckedItems().addListener(alsoVisualizeListener);
// taxonomy
if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType()) && corpus.getTaxonomy().size() > 0) {
if (taxonomyListener != null){
taxonomyCCB.getCheckModel().getCheckedItems().removeListener(taxonomyListener);
}

View File

@ -509,7 +509,7 @@ public class WordLevelTab {
alsoVisualizeCCB.getCheckModel().getCheckedItems().addListener(alsoVisualizeListener);
// taxonomy
if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType()) && corpus.getTaxonomy().size() > 0) {
if (taxonomyListener != null){
taxonomyCCB.getCheckModel().getCheckedItems().removeListener(taxonomyListener);
}
@ -556,7 +556,7 @@ public class WordLevelTab {
displayTaxonomy = false;
displayTaxonomyChB.setSelected(false);
// set
if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType()) && corpus.getTaxonomy().size() > 0) {
displayTaxonomyChB.setDisable(false);
displayTaxonomyChB.selectedProperty().addListener((observable, oldValue, newValue) -> {
displayTaxonomy = newValue;

View File

@ -61,7 +61,7 @@
<Label fx:id="alsoVisualizeLH" layoutX="10.0" layoutY="90.0" prefHeight="10.0" styleClass="help"/>
<Label fx:id="displayTaxonomyL" layoutX="10.0" layoutY="100.0" prefHeight="25.0" text="Izpiši taksonomije" />
<CheckBox fx:id="displayTaxonomyChB" layoutX="263.0" layoutY="105.0" selected="false" />
<CheckBox fx:id="displayTaxonomyChB" layoutX="283.0" layoutY="105.0" selected="false" />
<Label fx:id="displayTaxonomyLH" layoutX="10.0" layoutY="130.0" prefHeight="10.0" styleClass="help"/>
<Label fx:id="slowSpeedWarning2L" layoutX="10.0" layoutY="140.0" prefHeight="10.0" text="* IZBIRA PREDHODNEGA FILTRA LAHKO MOČNO UPOČASNI DELOVANJE" styleClass="warning"/>
@ -80,7 +80,7 @@
<Label fx:id="ngramValueLH" layoutX="10.0" layoutY="180.0" prefHeight="10.0" styleClass="help"/>
<Label fx:id="notePunctuationsL" layoutX="10.0" layoutY="240.0" prefHeight="25.0" text="Upoštevaj ločila" />
<CheckBox fx:id="notePunctuationsChB" layoutX="263.0" layoutY="240.0" selected="false" />
<CheckBox fx:id="notePunctuationsChB" layoutX="283.0" layoutY="240.0" selected="false" />
<Label fx:id="notePunctuationsLH" layoutX="10.0" layoutY="270.0" prefHeight="10.0" styleClass="help"/>
<Label fx:id="collocabilityL" layoutX="10.0" layoutY="280.0" prefHeight="25.0" text="Kolokabilnost" />