From 6f09cf9bedb28762ace760a6bb0aad363a5252c0 Mon Sep 17 00:00:00 2001
From: lkrsnik
Date: Mon, 10 Jun 2024 15:27:15 +0200
Subject: [PATCH] Added KOST taxonomy.

---
 src/main/java/alg/XML_processing.java | 54 +++++++++++++++++++++++++--
 src/main/java/data/CorpusType.java    |  1 +
 src/main/java/data/Tax.java           |  6 +--
 src/main/java/data/Taxonomy.java      | 38 ++++++++++++++++++-
 src/main/java/gui/CorpusTab.java      | 11 ++++--
 5 files changed, 99 insertions(+), 11 deletions(-)

diff --git a/src/main/java/alg/XML_processing.java b/src/main/java/alg/XML_processing.java
index 8634427..26ed8e5 100755
--- a/src/main/java/alg/XML_processing.java
+++ b/src/main/java/alg/XML_processing.java
@@ -50,7 +50,8 @@ public class XML_processing {
         } else if (stats.getCorpus().getCorpusType() == CorpusType.SOLAR) {
             return readXMLSolar(path, stats);
         } else if (stats.getCorpus().getCorpusType() == CorpusType.SSJ500K ||
-                stats.getCorpus().getCorpusType() == CorpusType.GIGAFIDA2) {
+                stats.getCorpus().getCorpusType() == CorpusType.GIGAFIDA2 ||
+                stats.getCorpus().getCorpusType() == CorpusType.KOST) {
             return readXMLSSJ500K(path, stats);
         } else if (stats.getCorpus().getCorpusType() == CorpusType.VERT) {
             return readVERT(path, stats);
@@ -461,6 +462,8 @@ public class XML_processing {
         HashMap<String, HashSet<String>> resultFilters = new HashMap<>();
         // taxonomy corpora
         HashSet<String> resultTaxonomy = new HashSet<>();
+        HashSet<String> taxonomyNames = new HashSet<String>(
+                Arrays.asList("FirstLang", "TaskSetting", "ProficSlv", "ProgramType", "InputType"));

         String headTagName;

@@ -471,7 +474,7 @@
             // init results now to avoid null pointers
             headTags.forEach(f -> resultFilters.put(f, new HashSet<>()));
-        } else if (corpusType == CorpusType.SSJ500K) {
+        } else if (corpusType == CorpusType.KOST || corpusType == CorpusType.SSJ500K) {
             headTagName = "bibl";
         } else {
             headTagName = "teiHeader";
         }
@@ -482,6 +485,9 @@
         try {
             xmlEventReader = factory.createXMLEventReader(new FileInputStream(filepath));
             boolean insideHeader = false;
+            boolean insideNote = false;
+            String filterName = "";
+            String filterValue = "";

             while (xmlEventReader.hasNext()) {
                 XMLEvent xmlEvent = xmlEventReader.nextEvent();
@@ -516,6 +522,11 @@
                             .replace("#", "");

                     resultTaxonomy.add(tax);
+                // kost
+                } else if (parseTaxonomy && (corpusType == CorpusType.KOST) && elementName.equalsIgnoreCase("note")) {
+                    filterName = startElement.getAttributeByName(QName.valueOf("ana"))
+                            .getValue().replace("#", "");
+                    insideNote = true;
                 // solar
                 } else if (!parseTaxonomy) {
                     boolean inHeadTags = false;
@@ -533,13 +544,22 @@
                         }
                     }
                 }
-            } else if (xmlEvent.isEndElement() && corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
+            } else if (xmlEvent.isEndElement() && corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName) && (corpusType != CorpusType.KOST)) {
                 // if the corpus is split into multiple files, each with only one header block per file
                 // that means we should stop after we reach the end of the header
                 return parseTaxonomy ?
                        resultTaxonomy : resultFilters;
             } else if (xmlEvent.isEndElement() && !corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
                 // whole corpus in one file, so we have to continue reading in order to find all header blocks
                 insideHeader = false;
+            } else if (xmlEvent.isEndElement() && insideNote) {
+                if (taxonomyNames.contains(filterName)) {
+                    Collections.addAll(resultTaxonomy, Taxonomy.format_KOST_taxonomy(filterValue, filterName));
+                }
+
+                insideNote = false;
+            } else if (xmlEvent.isCharacters() && insideNote) {
+                Characters characters = xmlEvent.asCharacters();
+                filterValue = characters.getData();
             }
         }
     } catch (XMLStreamException e) {
@@ -726,6 +746,8 @@
         boolean inPunctuation = false;
         boolean taxonomyMatch = true;
         ArrayList<Taxonomy> currentFiletaxonomy = new ArrayList<>();
+        HashSet<String> taxonomyNames = new HashSet<String>(
+                Arrays.asList("FirstLang", "TaskSetting", "ProficSlv", "ProgramType", "InputType"));

         String lemma = "";
         String msd = "";
@@ -760,6 +782,9 @@
         try {
             XMLInputFactory factory = XMLInputFactory.newInstance();
             eventReader = factory.createXMLEventReader(new FileInputStream(path));
+            boolean insideNote = false;
+            String filterName = "";
+            String filterValue = "";

             while (eventReader.hasNext()) {
                 int percentage = (int) (lineNum * 100.0 / numLines);
@@ -803,6 +828,12 @@
                             Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()).replace("#", ""), stats.getCorpus());
                             currentFiletaxonomy.add(currentFiletaxonomyElement);
                         }
+                    // kost
+                    } else if (stats.getCorpus().getCorpusType() == CorpusType.KOST && stats.getCorpus().getTaxonomy().size() > 0 && qName.equalsIgnoreCase("note")) {
+                        filterName = startElement.getAttributeByName(QName.valueOf("ana"))
+                                .getValue().replace("#", "");
+                        insideNote = true;
+
                     } else if (stats.getCorpus().getTaxonomy().size() > 0 && qName.equalsIgnoreCase("catRef")) {
                         // get value from attribute target
                         Attribute tax = startElement.getAttributeByName(QName.valueOf("target"));
@@ -836,6 +867,10 @@
                         sentence.add(createWord(punctuation, punctuation, "/", punctuation, stats.getFilter()));
                         inPunctuation = false;
                     }
+                    // kost
+                    if (insideNote) {
+                        filterValue = characters.getData();
+                    }

                    break;
                 case XMLStreamConstants.END_ELEMENT:
@@ -892,7 +927,7 @@
                             // taxonomies don't match so stop
                             // union (select words that match any of selected taxonomy
                             taxonomyMatch = false;
-//
+
                         } else if(stats.getFilter().getTaxonomySetOperation().equals(I18N.get("taxonomySetOperation.INTERSECTION")) && currentFiletaxonomy.size() != stats.getFilter().getTaxonomy().size()){
                             // intersection (select only words that precisely match selected taxonomy
                             taxonomyMatch = false;
@@ -900,6 +935,17 @@
                         }
                     } else if (endElement.getName().getLocalPart().equals("text")){
                         taxonomyMatch = false;
+                    // kost
+                    }
+                    if (insideNote) {
+                        if (taxonomyNames.contains(filterName)) {
+                            for (String taxonomy : Taxonomy.format_KOST_taxonomy(filterValue, filterName)) {
+                                // keep only taxonomy properties
+                                Taxonomy currentFiletaxonomyElement = Taxonomy.factory(taxonomy, stats.getCorpus());
+                                currentFiletaxonomy.add(currentFiletaxonomyElement);
+                            }
+                        }
+                        insideNote = false;
                     }

                     break;
diff --git a/src/main/java/data/CorpusType.java b/src/main/java/data/CorpusType.java
index 811d024..4b970d5 100755
--- a/src/main/java/data/CorpusType.java
+++ b/src/main/java/data/CorpusType.java
@@ -9,6 +9,7 @@ public enum CorpusType {
     CCKRES("ccKres ",
"cckres"), SOLAR("Šolar", "šolar"), GOS("GOS", "gos"), + KOST("KOST", "kost"), SSJ500K("ssj500k", "ssj500k"), VERT("vert", "vert"); diff --git a/src/main/java/data/Tax.java b/src/main/java/data/Tax.java index 106570f..36cf3ad 100755 --- a/src/main/java/data/Tax.java +++ b/src/main/java/data/Tax.java @@ -10,7 +10,7 @@ import javafx.collections.ObservableList; public class Tax { private static LinkedHashMap GIGAFIDA_TAXONOMY; private static LinkedHashMap GOS_TAXONOMY; - private static final HashSet corpusTypesWithTaxonomy = new HashSet<>(Arrays.asList(CorpusType.GIGAFIDA, CorpusType.GOS, CorpusType.CCKRES, CorpusType.SSJ500K, CorpusType.GIGAFIDA2, CorpusType.VERT)); + private static final HashSet corpusTypesWithTaxonomy = new HashSet<>(Arrays.asList(CorpusType.GIGAFIDA, CorpusType.GOS, CorpusType.CCKRES, CorpusType.KOST, CorpusType.SSJ500K, CorpusType.GIGAFIDA2, CorpusType.VERT)); static { // GIGAFIDA ---------------------------- @@ -108,7 +108,7 @@ public class Tax { tax = GIGAFIDA_TAXONOMY; } else if (corpusType == CorpusType.GOS) { tax = GOS_TAXONOMY; - } else if (corpusType == CorpusType.VERT || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2){ + } else if (corpusType == CorpusType.VERT || corpusType == CorpusType.KOST || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2){ // if VERT only order taxonomy by alphabet ArrayList sortedFoundTaxonomy = new ArrayList<>(foundTax); Collections.sort(sortedFoundTaxonomy); @@ -199,7 +199,7 @@ public class Tax { tax = GIGAFIDA_TAXONOMY; } else if (corpusType == CorpusType.GOS) { tax = GOS_TAXONOMY; - } else if (corpusType == CorpusType.VERT || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2) { + } else if (corpusType == CorpusType.VERT || corpusType == CorpusType.KOST || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2) { for (Taxonomy t : taxonomy) { result.add(t.toLongNameString()); } diff --git a/src/main/java/data/Taxonomy.java b/src/main/java/data/Taxonomy.java index 2755fc5..b2edd8d 100755 --- a/src/main/java/data/Taxonomy.java +++ b/src/main/java/data/Taxonomy.java @@ -763,6 +763,42 @@ public class Taxonomy { } + public static String[] format_KOST_taxonomy(String value, String parameter) { + Map filterMap = new HashMap<>(); + filterMap.put("FirstLang", "Prvi Jezik tvorca"); + filterMap.put("TaskSetting", "Okoliščine nastanka"); + filterMap.put("ProficSlv", "Nivo"); + filterMap.put("ProgramType", "Program"); + filterMap.put("InputType", "Napisano"); + + String[] split_value = new String[] {}; + if (parameter.equals("FirstLang")) { + if (value.contains(", ")) { + split_value = value.split(", "); + } else if (value.contains(" ")) { + for (String v : value.split(" ")) { + if (v.equals("španščina") || v.equals("angleščina")) { + split_value = new String[] {v}; + } + } + } else { + split_value = new String[] {value}; + } + } else if (parameter.equals("ProficSlv")) { + if (value.equals("Izpopolnjevalec")) { + split_value = new String[] {"izpopolnjevalec"}; + } else { + split_value = new String[] {value}; + } + } else { + split_value = new String[] {value}; + } + + return Arrays.stream(split_value) + .map(val -> filterMap.get(parameter) + " - " + val) + .toArray(String[]::new); + } + public String toString() { return this.name; } @@ -834,7 +870,7 @@ public class Taxonomy { public static ArrayList modifyingTaxonomy(ArrayList taxonomy, ObservableList checkedItems, Corpus corpus){ ArrayList checkedItemsTaxonomy = 
                TaxonomyEnum.convertStringListToTaxonomyList(checkedItems, corpus);
-        if (checkedItemsTaxonomy != null && corpus.getCorpusType() != CorpusType.VERT && corpus.getCorpusType() != CorpusType.SSJ500K && corpus.getCorpusType() != CorpusType.GIGAFIDA2) {
+        if (checkedItemsTaxonomy != null && corpus.getCorpusType() != CorpusType.VERT && corpus.getCorpusType() != CorpusType.KOST && corpus.getCorpusType() != CorpusType.SSJ500K && corpus.getCorpusType() != CorpusType.GIGAFIDA2) {
             TaxonomyEnum.modifyingTaxonomy(Taxonomy.taxonomyToTaxonomyEnum(taxonomy), checkedItemsTaxonomy, corpus);
             return taxonomyEnumToTaxonomy(checkedItemsTaxonomy, corpus);
         } else {
diff --git a/src/main/java/gui/CorpusTab.java b/src/main/java/gui/CorpusTab.java
index 76d182d..d0482d5 100755
--- a/src/main/java/gui/CorpusTab.java
+++ b/src/main/java/gui/CorpusTab.java
@@ -5,6 +5,7 @@ import static gui.GUIController.*;
 import static util.Util.*;

 import java.io.File;
+import java.io.FileWriter;
 import java.io.IOException;
 import java.lang.reflect.Constructor;
 import java.lang.reflect.Field;
@@ -149,7 +150,7 @@
     private String corpusLocation;
     private String corpusFilesSize;

-    private static final String [] SELECT_READER_ARRAY = {"VERT + REGI", "XML (Šolar 1.0)", "XML (GOS 1.0)", "XML (ssj500k 2.1)", "XML (Gigafida 2.0)", "XML (Gigafida 1.0, Kres 1.0)"};
+    private static final String [] SELECT_READER_ARRAY = {"VERT + REGI", "XML (Šolar 1.0)", "XML (GOS 1.0)", "XML (KOST 2.0)", "XML (ssj500k 2.1)", "XML (Gigafida 2.0)", "XML (Gigafida 1.0, Kres 1.0)"};
     private static final ArrayList<String> SELECT_READER = new ArrayList<>(Arrays.asList(SELECT_READER_ARRAY));

     private static final String [] PUNCTUATION_ARRAY = {"punctuation.COMMA", "punctuation.POINT"};
@@ -497,7 +498,7 @@

         logger.info("reading header data for ", corpusType.toString());

-        if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.GOS || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2) {
+        if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.GOS || corpusType == CorpusType.KOST || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2) {
             boolean corpusIsSplit = corpusFiles.size() > 1;

             final Task<HashSet<String>> task = new Task<HashSet<String>>() {
@@ -738,7 +739,6 @@

     private void selectReader() {
         switch (selectReader) {
-//            "vert", "Solar", "GOS", "SSJ500K", "Gigafida", "Gigafida (old)", "Kres (old)"
             case "VERT + REGI":
                 corpusType = VERT;
                 break;
@@ -748,6 +748,9 @@
             case "XML (GOS 1.0)":
                 corpusType = GOS;
                 break;
+            case "XML (KOST 2.0)":
+                corpusType = KOST;
+                break;
             case "XML (ssj500k 2.1)":
                 corpusType = SSJ500K;
                 break;
@@ -786,6 +789,8 @@
             corpusType = GOS;
         } else if (attrib.contains(SSJ500K.getNameLowerCase())) {
             corpusType = SSJ500K;
+        } else if (attrib.contains(KOST.getNameLowerCase())) {
+            corpusType = KOST;
         }

         if (corpusType == null) {
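
Note (not part of the patch): a minimal sketch of how the new Taxonomy.format_KOST_taxonomy helper behaves, based only on the mapping added above. The class name KostTaxonomyExample is an illustrative assumption; the sample header values are taken from the strings that appear in the patched method.

    import data.Taxonomy;

    public class KostTaxonomyExample {
        public static void main(String[] args) {
            // A "FirstLang" note may list several first languages separated by ", ";
            // each value is prefixed with the Slovene filter label "Prvi Jezik tvorca".
            String[] firstLang = Taxonomy.format_KOST_taxonomy("španščina, angleščina", "FirstLang");
            // -> "Prvi Jezik tvorca - španščina", "Prvi Jezik tvorca - angleščina"

            // A "ProficSlv" note maps the value "Izpopolnjevalec" to lower case and prefixes "Nivo".
            String[] profic = Taxonomy.format_KOST_taxonomy("Izpopolnjevalec", "ProficSlv");
            // -> "Nivo - izpopolnjevalec"

            for (String s : firstLang) System.out.println(s);
            for (String s : profic) System.out.println(s);
        }
    }

The formatted strings are what the KOST branches above collect into the taxonomy set while reading corpus headers, and what Taxonomy.factory later turns into taxonomy entries during readXMLSSJ500K.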