From 6f09cf9bedb28762ace760a6bb0aad363a5252c0 Mon Sep 17 00:00:00 2001
From: lkrsnik
Date: Mon, 10 Jun 2024 15:27:15 +0200
Subject: [PATCH] Added KOST taxonomy.

---
 src/main/java/alg/XML_processing.java | 54 +++++++++++++++++++++++++--
 src/main/java/data/CorpusType.java    |  1 +
 src/main/java/data/Tax.java           |  6 +--
 src/main/java/data/Taxonomy.java      | 38 ++++++++++++++++++-
 src/main/java/gui/CorpusTab.java      | 11 ++++--
 5 files changed, 99 insertions(+), 11 deletions(-)

diff --git a/src/main/java/alg/XML_processing.java b/src/main/java/alg/XML_processing.java
index 8634427..26ed8e5 100755
--- a/src/main/java/alg/XML_processing.java
+++ b/src/main/java/alg/XML_processing.java
@@ -50,7 +50,8 @@ public class XML_processing {
         } else if (stats.getCorpus().getCorpusType() == CorpusType.SOLAR) {
             return readXMLSolar(path, stats);
         } else if (stats.getCorpus().getCorpusType() == CorpusType.SSJ500K ||
-                stats.getCorpus().getCorpusType() == CorpusType.GIGAFIDA2) {
+                stats.getCorpus().getCorpusType() == CorpusType.GIGAFIDA2 ||
+                stats.getCorpus().getCorpusType() == CorpusType.KOST) {
             return readXMLSSJ500K(path, stats);
         } else if (stats.getCorpus().getCorpusType() == CorpusType.VERT) {
             return readVERT(path, stats);
@@ -461,6 +462,8 @@ public class XML_processing {
         HashMap<String, HashSet<String>> resultFilters = new HashMap<>();
         // taxonomy corpora
         HashSet<String> resultTaxonomy = new HashSet<>();
+        HashSet<String> taxonomyNames = new HashSet<String>(
+                Arrays.asList("FirstLang", "TaskSetting", "ProficSlv", "ProgramType", "InputType"));

         String headTagName;

@@ -471,7 +474,7 @@
             // init results now to avoid null pointers
             headTags.forEach(f -> resultFilters.put(f, new HashSet<>()));
-        } else if (corpusType == CorpusType.SSJ500K) {
+        } else if (corpusType == CorpusType.KOST || corpusType == CorpusType.SSJ500K) {
             headTagName = "bibl";
         } else {
             headTagName = "teiHeader";
         }
@@ -482,6 +485,9 @@
         try {
             xmlEventReader = factory.createXMLEventReader(new FileInputStream(filepath));
             boolean insideHeader = false;
+            boolean insideNote = false;
+            String filterName = "";
+            String filterValue = "";

             while (xmlEventReader.hasNext()) {
                 XMLEvent xmlEvent = xmlEventReader.nextEvent();
@@ -516,6 +522,11 @@
                             .replace("#", "");

                     resultTaxonomy.add(tax);
+                // kost
+                } else if (parseTaxonomy && (corpusType == CorpusType.KOST) && elementName.equalsIgnoreCase("note")) {
+                    filterName = startElement.getAttributeByName(QName.valueOf("ana"))
+                            .getValue().replace("#", "");
+                    insideNote = true;
                 // solar
                 } else if (!parseTaxonomy) {
                     boolean inHeadTags = false;
@@ -533,13 +544,22 @@
                         }
                     }
                 }
-            } else if (xmlEvent.isEndElement() && corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
+            } else if (xmlEvent.isEndElement() && corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName) && (corpusType != CorpusType.KOST)) {
                 // if the corpus is split into multiple files, each with only one header block per file
                 // that means we should stop after we reach the end of the header
                 return parseTaxonomy ?
                        resultTaxonomy : resultFilters;
             } else if (xmlEvent.isEndElement() && !corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
                 // whole corpus in one file, so we have to continue reading in order to find all header blocks
                 insideHeader = false;
+            } else if (xmlEvent.isEndElement() && insideNote) {
+                if (taxonomyNames.contains(filterName)) {
+                    Collections.addAll(resultTaxonomy, Taxonomy.format_KOST_taxonomy(filterValue, filterName));
+                }
+
+                insideNote = false;
+            } else if (xmlEvent.isCharacters() && insideNote) {
+                Characters characters = xmlEvent.asCharacters();
+                filterValue = characters.getData();
             }
         }
     } catch (XMLStreamException e) {
@@ -726,6 +746,8 @@
         boolean inPunctuation = false;
         boolean taxonomyMatch = true;
         ArrayList<Taxonomy> currentFiletaxonomy = new ArrayList<>();
+        HashSet<String> taxonomyNames = new HashSet<String>(
+                Arrays.asList("FirstLang", "TaskSetting", "ProficSlv", "ProgramType", "InputType"));

         String lemma = "";
         String msd = "";
@@ -760,6 +782,9 @@
         try {
             XMLInputFactory factory = XMLInputFactory.newInstance();
             eventReader = factory.createXMLEventReader(new FileInputStream(path));
+            boolean insideNote = false;
+            String filterName = "";
+            String filterValue = "";

             while (eventReader.hasNext()) {
                 int percentage = (int) (lineNum * 100.0 / numLines);
@@ -803,6 +828,12 @@
                             Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()).replace("#", ""), stats.getCorpus());
                             currentFiletaxonomy.add(currentFiletaxonomyElement);
                         }
+                    // kost
+                    } else if (stats.getCorpus().getCorpusType() == CorpusType.KOST && stats.getCorpus().getTaxonomy().size() > 0 && qName.equalsIgnoreCase("note")) {
+                        filterName = startElement.getAttributeByName(QName.valueOf("ana"))
+                                .getValue().replace("#", "");
+                        insideNote = true;
+
                     } else if (stats.getCorpus().getTaxonomy().size() > 0 && qName.equalsIgnoreCase("catRef")) {
                         // get value from attribute target
                         Attribute tax = startElement.getAttributeByName(QName.valueOf("target"));
@@ -836,6 +867,10 @@
                         sentence.add(createWord(punctuation, punctuation, "/", punctuation, stats.getFilter()));
                         inPunctuation = false;
                     }
+                    // kost
+                    if (insideNote) {
+                        filterValue = characters.getData();
+                    }

                    break;
                 case XMLStreamConstants.END_ELEMENT:
@@ -892,7 +927,7 @@
                             // taxonomies don't match so stop
                             // union (select words that match any of selected taxonomy
                             taxonomyMatch = false;
-//
+
                         } else if(stats.getFilter().getTaxonomySetOperation().equals(I18N.get("taxonomySetOperation.INTERSECTION")) && currentFiletaxonomy.size() != stats.getFilter().getTaxonomy().size()){
                             // intersection (select only words that precisely match selected taxonomy
                             taxonomyMatch = false;
@@ -900,6 +935,17 @@
                         }
                     } else if (endElement.getName().getLocalPart().equals("text")){
                         taxonomyMatch = false;
+                    // kost
+                    }
+                    if (insideNote) {
+                        if (taxonomyNames.contains(filterName)) {
+                            for (String taxonomy : Taxonomy.format_KOST_taxonomy(filterValue, filterName)) {
+                                // keep only taxonomy properties
+                                Taxonomy currentFiletaxonomyElement = Taxonomy.factory(taxonomy, stats.getCorpus());
+                                currentFiletaxonomy.add(currentFiletaxonomyElement);
+                            }
+                        }
+                        insideNote = false;
                     }

                     break;
diff --git a/src/main/java/data/CorpusType.java b/src/main/java/data/CorpusType.java
index 811d024..4b970d5 100755
--- a/src/main/java/data/CorpusType.java
+++ b/src/main/java/data/CorpusType.java
@@ -9,6 +9,7 @@ public enum CorpusType {
     CCKRES("ccKres ",
"cckres"), SOLAR("Šolar", "šolar"), GOS("GOS", "gos"), + KOST("KOST", "kost"), SSJ500K("ssj500k", "ssj500k"), VERT("vert", "vert"); diff --git a/src/main/java/data/Tax.java b/src/main/java/data/Tax.java index 106570f..36cf3ad 100755 --- a/src/main/java/data/Tax.java +++ b/src/main/java/data/Tax.java @@ -10,7 +10,7 @@ import javafx.collections.ObservableList; public class Tax { private static LinkedHashMap GIGAFIDA_TAXONOMY; private static LinkedHashMap GOS_TAXONOMY; - private static final HashSet corpusTypesWithTaxonomy = new HashSet<>(Arrays.asList(CorpusType.GIGAFIDA, CorpusType.GOS, CorpusType.CCKRES, CorpusType.SSJ500K, CorpusType.GIGAFIDA2, CorpusType.VERT)); + private static final HashSet corpusTypesWithTaxonomy = new HashSet<>(Arrays.asList(CorpusType.GIGAFIDA, CorpusType.GOS, CorpusType.CCKRES, CorpusType.KOST, CorpusType.SSJ500K, CorpusType.GIGAFIDA2, CorpusType.VERT)); static { // GIGAFIDA ---------------------------- @@ -108,7 +108,7 @@ public class Tax { tax = GIGAFIDA_TAXONOMY; } else if (corpusType == CorpusType.GOS) { tax = GOS_TAXONOMY; - } else if (corpusType == CorpusType.VERT || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2){ + } else if (corpusType == CorpusType.VERT || corpusType == CorpusType.KOST || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2){ // if VERT only order taxonomy by alphabet ArrayList sortedFoundTaxonomy = new ArrayList<>(foundTax); Collections.sort(sortedFoundTaxonomy); @@ -199,7 +199,7 @@ public class Tax { tax = GIGAFIDA_TAXONOMY; } else if (corpusType == CorpusType.GOS) { tax = GOS_TAXONOMY; - } else if (corpusType == CorpusType.VERT || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2) { + } else if (corpusType == CorpusType.VERT || corpusType == CorpusType.KOST || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2) { for (Taxonomy t : taxonomy) { result.add(t.toLongNameString()); } diff --git a/src/main/java/data/Taxonomy.java b/src/main/java/data/Taxonomy.java index 2755fc5..b2edd8d 100755 --- a/src/main/java/data/Taxonomy.java +++ b/src/main/java/data/Taxonomy.java @@ -763,6 +763,42 @@ public class Taxonomy { } + public static String[] format_KOST_taxonomy(String value, String parameter) { + Map filterMap = new HashMap<>(); + filterMap.put("FirstLang", "Prvi Jezik tvorca"); + filterMap.put("TaskSetting", "Okoliščine nastanka"); + filterMap.put("ProficSlv", "Nivo"); + filterMap.put("ProgramType", "Program"); + filterMap.put("InputType", "Napisano"); + + String[] split_value = new String[] {}; + if (parameter.equals("FirstLang")) { + if (value.contains(", ")) { + split_value = value.split(", "); + } else if (value.contains(" ")) { + for (String v : value.split(" ")) { + if (v.equals("španščina") || v.equals("angleščina")) { + split_value = new String[] {v}; + } + } + } else { + split_value = new String[] {value}; + } + } else if (parameter.equals("ProficSlv")) { + if (value.equals("Izpopolnjevalec")) { + split_value = new String[] {"izpopolnjevalec"}; + } else { + split_value = new String[] {value}; + } + } else { + split_value = new String[] {value}; + } + + return Arrays.stream(split_value) + .map(val -> filterMap.get(parameter) + " - " + val) + .toArray(String[]::new); + } + public String toString() { return this.name; } @@ -834,7 +870,7 @@ public class Taxonomy { public static ArrayList modifyingTaxonomy(ArrayList taxonomy, ObservableList checkedItems, Corpus corpus){ ArrayList checkedItemsTaxonomy = 
                TaxonomyEnum.convertStringListToTaxonomyList(checkedItems, corpus);
-        if (checkedItemsTaxonomy != null && corpus.getCorpusType() != CorpusType.VERT && corpus.getCorpusType() != CorpusType.SSJ500K && corpus.getCorpusType() != CorpusType.GIGAFIDA2) {
+        if (checkedItemsTaxonomy != null && corpus.getCorpusType() != CorpusType.VERT && corpus.getCorpusType() != CorpusType.KOST && corpus.getCorpusType() != CorpusType.SSJ500K && corpus.getCorpusType() != CorpusType.GIGAFIDA2) {
             TaxonomyEnum.modifyingTaxonomy(Taxonomy.taxonomyToTaxonomyEnum(taxonomy), checkedItemsTaxonomy, corpus);
             return taxonomyEnumToTaxonomy(checkedItemsTaxonomy, corpus);
         } else {
diff --git a/src/main/java/gui/CorpusTab.java b/src/main/java/gui/CorpusTab.java
index 76d182d..d0482d5 100755
--- a/src/main/java/gui/CorpusTab.java
+++ b/src/main/java/gui/CorpusTab.java
@@ -5,6 +5,7 @@ import static gui.GUIController.*;
 import static util.Util.*;

 import java.io.File;
+import java.io.FileWriter;
 import java.io.IOException;
 import java.lang.reflect.Constructor;
 import java.lang.reflect.Field;
@@ -149,7 +150,7 @@
     private String corpusLocation;
     private String corpusFilesSize;

-    private static final String [] SELECT_READER_ARRAY = {"VERT + REGI", "XML (Šolar 1.0)", "XML (GOS 1.0)", "XML (ssj500k 2.1)", "XML (Gigafida 2.0)", "XML (Gigafida 1.0, Kres 1.0)"};
+    private static final String [] SELECT_READER_ARRAY = {"VERT + REGI", "XML (Šolar 1.0)", "XML (GOS 1.0)", "XML (KOST 2.0)", "XML (ssj500k 2.1)", "XML (Gigafida 2.0)", "XML (Gigafida 1.0, Kres 1.0)"};
     private static final ArrayList<String> SELECT_READER = new ArrayList<>(Arrays.asList(SELECT_READER_ARRAY));

     private static final String [] PUNCTUATION_ARRAY = {"punctuation.COMMA", "punctuation.POINT"};
@@ -497,7 +498,7 @@

         logger.info("reading header data for ", corpusType.toString());

-        if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.GOS || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2) {
+        if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.GOS || corpusType == CorpusType.KOST || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2) {
             boolean corpusIsSplit = corpusFiles.size() > 1;

             final Task<HashSet<String>> task = new Task<HashSet<String>>() {
@@ -738,7 +739,6 @@

     private void selectReader() {
         switch (selectReader) {
-//            "vert", "Solar", "GOS", "SSJ500K", "Gigafida", "Gigafida (old)", "Kres (old)"
             case "VERT + REGI":
                 corpusType = VERT;
                 break;
@@ -748,6 +748,9 @@
             case "XML (GOS 1.0)":
                 corpusType = GOS;
                 break;
+            case "XML (KOST 2.0)":
+                corpusType = KOST;
+                break;
             case "XML (ssj500k 2.1)":
                 corpusType = SSJ500K;
                 break;
@@ -786,6 +789,8 @@
             corpusType = GOS;
         } else if (attrib.contains(SSJ500K.getNameLowerCase())) {
             corpusType = SSJ500K;
+        } else if (attrib.contains(KOST.getNameLowerCase())) {
+            corpusType = KOST;
         }

         if (corpusType == null) {
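
Note (not part of the patch): a minimal sketch of how the new Taxonomy.format_KOST_taxonomy helper behaves, based only on the mapping added above. The class name KostTaxonomyExample is an illustrative assumption; the sample header values are taken from the strings that appear in the patched method.

    import data.Taxonomy;

    public class KostTaxonomyExample {
        public static void main(String[] args) {
            // A "FirstLang" note may list several first languages separated by ", ";
            // each value is prefixed with the Slovene filter label "Prvi Jezik tvorca".
            String[] firstLang = Taxonomy.format_KOST_taxonomy("španščina, angleščina", "FirstLang");
            // -> "Prvi Jezik tvorca - španščina", "Prvi Jezik tvorca - angleščina"

            // A "ProficSlv" note maps the value "Izpopolnjevalec" to lower case and prefixes "Nivo".
            String[] profic = Taxonomy.format_KOST_taxonomy("Izpopolnjevalec", "ProficSlv");
            // -> "Nivo - izpopolnjevalec"

            for (String s : firstLang) System.out.println(s);
            for (String s : profic) System.out.println(s);
        }
    }

The formatted strings are what the KOST branches above collect into the taxonomy set while reading corpus headers, and what Taxonomy.factory later turns into taxonomy entries during readXMLSSJ500K.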