Added reading of taxonomy for vert corpora
This commit is contained in:
parent
3889b834e3
commit
bb9f3f0fb9
|
@ -2,8 +2,7 @@ package alg;
|
|||
|
||||
import static data.Enums.solar.SolarFilters.*;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.ForkJoinPool;
|
||||
|
@ -15,6 +14,8 @@ import javax.xml.stream.XMLStreamConstants;
|
|||
import javax.xml.stream.XMLStreamException;
|
||||
import javax.xml.stream.events.*;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.io.LineIterator;
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
|
||||
import data.*;
|
||||
|
@ -436,6 +437,62 @@ public class XML_processing {
|
|||
return true;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Parses XML headers for information about its taxonomy (if supported) or filters (solar)
|
||||
*
|
||||
* @param filepath
|
||||
* @param corpusIsSplit is corpus split into multiple xml files, or are all entries grouped into one large xml file
|
||||
* @param corpusType
|
||||
*/
|
||||
public static HashSet<String> readVertHeaderTaxonomyAndFilters(String filepath, boolean corpusIsSplit, CorpusType corpusType) {
|
||||
// boolean parseTaxonomy = Tax.getCorpusTypesWithTaxonomy().contains(corpusType);
|
||||
// solar
|
||||
Set<String> headTags = null;
|
||||
HashMap<String, HashSet<String>> resultFilters = new HashMap<>();
|
||||
// taxonomy corpora
|
||||
HashSet<String> resultTaxonomy = new HashSet<>();
|
||||
|
||||
LineIterator it = null;
|
||||
try {
|
||||
it = FileUtils.lineIterator(new File(filepath), "UTF-8");
|
||||
try {
|
||||
boolean insideHeader = false;
|
||||
|
||||
while (it.hasNext()) {
|
||||
String line = it.nextLine();
|
||||
|
||||
if (line.length() > 4 && line.substring(1, 5).equals("text")) {
|
||||
// split over "\" "
|
||||
String[] split = line.split("\" ");
|
||||
// String mediumId = "";
|
||||
// String typeId = "";
|
||||
// String proofreadId = "";
|
||||
for (String el : split) {
|
||||
String[] attribute = el.split("=\"");
|
||||
if (attribute[0].equals("medium_id")) {
|
||||
// mediumId = attribute[1];
|
||||
resultTaxonomy.add(attribute[1]);
|
||||
} else if (attribute[0].equals("type_id")) {
|
||||
// typeId = attribute[1];
|
||||
resultTaxonomy.add(attribute[1]);
|
||||
} else if (attribute[0].equals("proofread_id")) {
|
||||
// proofreadId = attribute[1];
|
||||
resultTaxonomy.add(attribute[1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
LineIterator.closeQuietly(it);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
resultTaxonomy.remove("-");
|
||||
return resultTaxonomy;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses XML headers for information about its taxonomy (if supported) or filters (solar)
|
||||
*
|
||||
|
|
|
@ -5,7 +5,8 @@ public enum CorpusType {
|
|||
CCKRES("ccKres ", "cckres"),
|
||||
SOLAR("Šolar", "šolar"),
|
||||
GOS("GOS", "gos"),
|
||||
SSJ500K("ssj500k", "ssj500k");
|
||||
SSJ500K("ssj500k", "ssj500k"),
|
||||
VERT("vert", "vert");
|
||||
|
||||
|
||||
private final String name;
|
||||
|
|
|
@ -10,7 +10,7 @@ import javafx.collections.ObservableList;
|
|||
public class Tax {
|
||||
private static LinkedHashMap<String, String> GIGAFIDA_TAXONOMY;
|
||||
private static LinkedHashMap<String, String> GOS_TAXONOMY;
|
||||
private static final HashSet<CorpusType> corpusTypesWithTaxonomy = new HashSet<>(Arrays.asList(CorpusType.GIGAFIDA, CorpusType.GOS, CorpusType.CCKRES, CorpusType.SSJ500K));
|
||||
private static final HashSet<CorpusType> corpusTypesWithTaxonomy = new HashSet<>(Arrays.asList(CorpusType.GIGAFIDA, CorpusType.GOS, CorpusType.CCKRES, CorpusType.SSJ500K, CorpusType.VERT));
|
||||
|
||||
static {
|
||||
// GIGAFIDA ----------------------------
|
||||
|
@ -108,6 +108,11 @@ public class Tax {
|
|||
tax = GIGAFIDA_TAXONOMY;
|
||||
} else if (corpusType == CorpusType.GOS) {
|
||||
tax = GOS_TAXONOMY;
|
||||
} else if (corpusType == CorpusType.VERT){
|
||||
// if VERT only order taxonomy by alphabet
|
||||
ArrayList<String> sortedFoundTaxonomy = new ArrayList<>(foundTax);
|
||||
Collections.sort(sortedFoundTaxonomy);
|
||||
return FXCollections.observableArrayList(sortedFoundTaxonomy);
|
||||
}
|
||||
|
||||
ArrayList<String> taxForCombo = new ArrayList<>();
|
||||
|
|
|
@ -696,6 +696,7 @@ public enum Taxonomy {
|
|||
}
|
||||
|
||||
public static ArrayList<Taxonomy> convertStringListToTaxonomyList(ObservableList<String> stringList){
|
||||
System.out.println(stringList);
|
||||
ArrayList<Taxonomy> taxonomyList = new ArrayList<>();
|
||||
|
||||
// System.out.println("INTERESTING STUFF");
|
||||
|
@ -710,6 +711,9 @@ public enum Taxonomy {
|
|||
|
||||
public static void modifyingTaxonomy(ArrayList<Taxonomy> taxonomy, ArrayList<Taxonomy> checkedItemsTaxonomy, Corpus corpus){
|
||||
// get taxonomies that were selected/deselected by user
|
||||
// System.out.println(taxonomy);
|
||||
// System.out.println(checkedItemsTaxonomy);
|
||||
|
||||
Set<Taxonomy> disjointTaxonomies = new HashSet<>(checkedItemsTaxonomy);
|
||||
if (taxonomy != null) {
|
||||
disjointTaxonomies.addAll(taxonomy);
|
||||
|
|
|
@ -241,7 +241,7 @@ public class CharacterAnalysisTab {
|
|||
msd = new ArrayList<>();
|
||||
|
||||
// taxonomy
|
||||
if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
|
||||
if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType()) && corpus.getTaxonomy().size() > 0) {
|
||||
if (taxonomyListener != null){
|
||||
taxonomyCCB.getCheckModel().getCheckedItems().removeListener(taxonomyListener);
|
||||
}
|
||||
|
@ -287,7 +287,7 @@ public class CharacterAnalysisTab {
|
|||
displayTaxonomy = false;
|
||||
displayTaxonomyChB.setSelected(false);
|
||||
// set
|
||||
if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
|
||||
if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType()) && corpus.getTaxonomy().size() > 0) {
|
||||
displayTaxonomyChB.setDisable(false);
|
||||
displayTaxonomyChB.selectedProperty().addListener((observable, oldValue, newValue) -> {
|
||||
displayTaxonomy = newValue;
|
||||
|
|
|
@ -210,9 +210,50 @@ public class CorpusTab {
|
|||
Messages.setChooseCorpusProperties(corpusLocation, corpusFilesSize, corpusType != null ? corpusType.toString() : null);
|
||||
|
||||
// make sure there are corpus files in selected directory or notify the user about it
|
||||
if (corpusFiles.size() == 0) {
|
||||
// try .vert
|
||||
corpusFiles = FileUtils.listFiles(selectedDirectory, FileFilterUtils.suffixFileFilter("vert", IOCase.INSENSITIVE), TrueFileFilter.INSTANCE);
|
||||
Collection<File> corpusFilesRegi = FileUtils.listFiles(selectedDirectory, FileFilterUtils.suffixFileFilter("regi", IOCase.INSENSITIVE), TrueFileFilter.INSTANCE);
|
||||
|
||||
if (corpusFiles.size() == 0){
|
||||
logger.info("alert: ", I18N.get("message.WARNING_CORPUS_NOT_FOUND"));
|
||||
showAlert(Alert.AlertType.ERROR, I18N.get("message.WARNING_CORPUS_NOT_FOUND"), null);
|
||||
} else {
|
||||
corpusType = VERT;
|
||||
|
||||
corpus.setCorpusType(corpusType);
|
||||
|
||||
Messages.setChooseCorpusProperties(corpusLocation, corpusFilesSize, corpusType.toString());
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append(corpusLocation)
|
||||
.append("\n")
|
||||
.append(String.format(I18N.get("message.NOTIFICATION_FOUND_X_FILES"), corpusFiles.size()))
|
||||
.append("\n")
|
||||
.append(String.format(I18N.get("message.NOTIFICATION_CORPUS"), corpusType.toString()));
|
||||
|
||||
String result = sb.toString();
|
||||
|
||||
logger.debug(result);
|
||||
|
||||
initNewCorpus(selectedDirectory, corpusFiles);
|
||||
Messages.setChooseCorpusProperties(corpusLocation, corpusFilesSize, corpusType.toString());
|
||||
|
||||
corpus.setChosenCorpusLocation(selectedDirectory);
|
||||
corpus.setDetectedCorpusFiles(corpusFiles);
|
||||
|
||||
chooseCorpusLabelContent = result;
|
||||
if (readHeaderInfo) {
|
||||
logger.info("reading header info...");
|
||||
readHeaderInfo();
|
||||
} else {
|
||||
setResults();
|
||||
|
||||
setCorpusForAnalysis();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} else {
|
||||
String chooseCorpusLabelContentTmp = detectCorpusType(corpusFiles);
|
||||
|
||||
|
@ -420,6 +461,70 @@ public class CorpusTab {
|
|||
task.setOnCancelled(e -> togglePiAndSetCorpusWrapper(false));
|
||||
task.setOnFailed(e -> togglePiAndSetCorpusWrapper(false));
|
||||
|
||||
final Thread thread = new Thread(task, "task");
|
||||
thread.setDaemon(true);
|
||||
thread.start();
|
||||
} else if (corpusType == CorpusType.VERT) {
|
||||
// many many fields
|
||||
boolean corpusIsSplit = corpusFiles.size() > 1;
|
||||
|
||||
final Task<HashSet<String>> task = new Task<HashSet<String>>() {
|
||||
@Override
|
||||
protected HashSet<String> call() throws Exception {
|
||||
HashSet<String> values = new HashSet<>();
|
||||
long i = 0;
|
||||
|
||||
if (!corpusIsSplit) {
|
||||
updateProgress(-1.0f, -1.0f);
|
||||
}
|
||||
|
||||
for (File file : corpusFiles) {
|
||||
HashSet<String> tmpvalues = XML_processing.readVertHeaderTaxonomyAndFilters(file.getAbsolutePath(), corpusIsSplit, corpusType);
|
||||
|
||||
// update final results
|
||||
for (String entry : tmpvalues) {
|
||||
if (!entry.equals("-")) {
|
||||
values.add(entry);
|
||||
}
|
||||
}
|
||||
|
||||
i++;
|
||||
|
||||
if (corpusIsSplit) {
|
||||
updateProgress(i, corpusFiles.size());
|
||||
}
|
||||
}
|
||||
|
||||
updateProgress(1.0f, 1.0f);
|
||||
return values;
|
||||
}
|
||||
};
|
||||
|
||||
locationScanPI.progressProperty().bind(task.progressProperty());
|
||||
|
||||
task.setOnSucceeded(e -> {
|
||||
ObservableList<String> readTaxonomy = Tax.getTaxonomyForComboBox(corpusType, task.getValue());
|
||||
|
||||
if (ValidationUtil.isEmpty(readTaxonomy)) {
|
||||
// if no taxonomy found alert the user and keep other tabs disabled
|
||||
logger.info("No vert filters found in headers.");
|
||||
GUIController.showAlert(Alert.AlertType.ERROR, I18N.get("message.WARNING_NO_SOLAR_FILTERS_FOUND"));
|
||||
} else {
|
||||
// set taxonomy, update label
|
||||
corpus.setTaxonomy(readTaxonomy);
|
||||
corpus.setHeaderRead(true);
|
||||
Messages.setChooseCorpusL(chooseCorpusL, chooseCorpusLabelContent);
|
||||
setResults();
|
||||
setCorpusForAnalysis();
|
||||
}
|
||||
|
||||
togglePiAndSetCorpusWrapper(false);
|
||||
|
||||
});
|
||||
|
||||
task.setOnCancelled(e -> togglePiAndSetCorpusWrapper(false));
|
||||
task.setOnFailed(e -> togglePiAndSetCorpusWrapper(false));
|
||||
|
||||
final Thread thread = new Thread(task, "task");
|
||||
thread.setDaemon(true);
|
||||
thread.start();
|
||||
|
|
|
@ -383,7 +383,7 @@ public class OneWordAnalysisTab {
|
|||
alsoVisualizeCCB.getCheckModel().getCheckedItems().addListener(alsoVisualizeListener);
|
||||
|
||||
// taxonomy
|
||||
if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
|
||||
if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType()) && corpus.getTaxonomy().size() > 0) {
|
||||
if (taxonomyListener != null){
|
||||
taxonomyCCB.getCheckModel().getCheckedItems().removeListener(taxonomyListener);
|
||||
}
|
||||
|
@ -430,7 +430,7 @@ public class OneWordAnalysisTab {
|
|||
displayTaxonomy = false;
|
||||
displayTaxonomyChB.setSelected(false);
|
||||
// set
|
||||
if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
|
||||
if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType()) && corpus.getTaxonomy().size() > 0) {
|
||||
displayTaxonomyChB.setDisable(false);
|
||||
displayTaxonomyChB.selectedProperty().addListener((observable, oldValue, newValue) -> {
|
||||
displayTaxonomy = newValue;
|
||||
|
|
|
@ -312,7 +312,7 @@ public class StringAnalysisTabNew2 {
|
|||
displayTaxonomyChB.setSelected(false);
|
||||
// set
|
||||
|
||||
if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
|
||||
if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType()) && corpus.getTaxonomy().size() > 0) {
|
||||
displayTaxonomyChB.setDisable(false);
|
||||
displayTaxonomyChB.selectedProperty().addListener((observable, oldValue, newValue) -> {
|
||||
displayTaxonomy = newValue;
|
||||
|
@ -515,7 +515,7 @@ public class StringAnalysisTabNew2 {
|
|||
alsoVisualizeCCB.getCheckModel().getCheckedItems().addListener(alsoVisualizeListener);
|
||||
|
||||
// taxonomy
|
||||
if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
|
||||
if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType()) && corpus.getTaxonomy().size() > 0) {
|
||||
if (taxonomyListener != null){
|
||||
taxonomyCCB.getCheckModel().getCheckedItems().removeListener(taxonomyListener);
|
||||
}
|
||||
|
|
|
@ -509,7 +509,7 @@ public class WordLevelTab {
|
|||
alsoVisualizeCCB.getCheckModel().getCheckedItems().addListener(alsoVisualizeListener);
|
||||
|
||||
// taxonomy
|
||||
if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
|
||||
if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType()) && corpus.getTaxonomy().size() > 0) {
|
||||
if (taxonomyListener != null){
|
||||
taxonomyCCB.getCheckModel().getCheckedItems().removeListener(taxonomyListener);
|
||||
}
|
||||
|
@ -556,7 +556,7 @@ public class WordLevelTab {
|
|||
displayTaxonomy = false;
|
||||
displayTaxonomyChB.setSelected(false);
|
||||
// set
|
||||
if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) {
|
||||
if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType()) && corpus.getTaxonomy().size() > 0) {
|
||||
displayTaxonomyChB.setDisable(false);
|
||||
displayTaxonomyChB.selectedProperty().addListener((observable, oldValue, newValue) -> {
|
||||
displayTaxonomy = newValue;
|
||||
|
|
|
@ -61,7 +61,7 @@
|
|||
<Label fx:id="alsoVisualizeLH" layoutX="10.0" layoutY="90.0" prefHeight="10.0" styleClass="help"/>
|
||||
|
||||
<Label fx:id="displayTaxonomyL" layoutX="10.0" layoutY="100.0" prefHeight="25.0" text="Izpiši taksonomije" />
|
||||
<CheckBox fx:id="displayTaxonomyChB" layoutX="263.0" layoutY="105.0" selected="false" />
|
||||
<CheckBox fx:id="displayTaxonomyChB" layoutX="283.0" layoutY="105.0" selected="false" />
|
||||
<Label fx:id="displayTaxonomyLH" layoutX="10.0" layoutY="130.0" prefHeight="10.0" styleClass="help"/>
|
||||
|
||||
<Label fx:id="slowSpeedWarning2L" layoutX="10.0" layoutY="140.0" prefHeight="10.0" text="* IZBIRA PREDHODNEGA FILTRA LAHKO MOČNO UPOČASNI DELOVANJE" styleClass="warning"/>
|
||||
|
@ -80,7 +80,7 @@
|
|||
<Label fx:id="ngramValueLH" layoutX="10.0" layoutY="180.0" prefHeight="10.0" styleClass="help"/>
|
||||
|
||||
<Label fx:id="notePunctuationsL" layoutX="10.0" layoutY="240.0" prefHeight="25.0" text="Upoštevaj ločila" />
|
||||
<CheckBox fx:id="notePunctuationsChB" layoutX="263.0" layoutY="240.0" selected="false" />
|
||||
<CheckBox fx:id="notePunctuationsChB" layoutX="283.0" layoutY="240.0" selected="false" />
|
||||
<Label fx:id="notePunctuationsLH" layoutX="10.0" layoutY="270.0" prefHeight="10.0" styleClass="help"/>
|
||||
|
||||
<Label fx:id="collocabilityL" layoutX="10.0" layoutY="280.0" prefHeight="25.0" text="Kolokabilnost" />
|
||||
|
|
Loading…
Reference in New Issue
Block a user