Added KOST taxonomy.
This commit is contained in:
parent
e58faf5604
commit
6f09cf9bed
|
@ -50,7 +50,8 @@ public class XML_processing {
|
|||
} else if (stats.getCorpus().getCorpusType() == CorpusType.SOLAR) {
|
||||
return readXMLSolar(path, stats);
|
||||
} else if (stats.getCorpus().getCorpusType() == CorpusType.SSJ500K ||
|
||||
stats.getCorpus().getCorpusType() == CorpusType.GIGAFIDA2) {
|
||||
stats.getCorpus().getCorpusType() == CorpusType.GIGAFIDA2 ||
|
||||
stats.getCorpus().getCorpusType() == CorpusType.KOST) {
|
||||
return readXMLSSJ500K(path, stats);
|
||||
} else if (stats.getCorpus().getCorpusType() == CorpusType.VERT) {
|
||||
return readVERT(path, stats);
|
||||
|
@ -461,6 +462,8 @@ public class XML_processing {
|
|||
HashMap<String, HashSet<String>> resultFilters = new HashMap<>();
|
||||
// taxonomy corpora
|
||||
HashSet<String> resultTaxonomy = new HashSet<>();
|
||||
HashSet<String> taxonomyNames = new HashSet<String>(
|
||||
Arrays.asList("FirstLang", "TaskSetting", "ProficSlv", "ProgramType", "InputType"));
|
||||
|
||||
String headTagName;
|
||||
|
||||
|
@ -471,7 +474,7 @@ public class XML_processing {
|
|||
|
||||
// init results now to avoid null pointers
|
||||
headTags.forEach(f -> resultFilters.put(f, new HashSet<>()));
|
||||
} else if (corpusType == CorpusType.SSJ500K) {
|
||||
} else if (corpusType == CorpusType.KOST || corpusType == CorpusType.SSJ500K) {
|
||||
headTagName = "bibl";
|
||||
} else {
|
||||
headTagName = "teiHeader";
|
||||
|
@ -482,6 +485,9 @@ public class XML_processing {
|
|||
try {
|
||||
xmlEventReader = factory.createXMLEventReader(new FileInputStream(filepath));
|
||||
boolean insideHeader = false;
|
||||
boolean insideNote = false;
|
||||
String filterName = "";
|
||||
String filterValue = "";
|
||||
|
||||
while (xmlEventReader.hasNext()) {
|
||||
XMLEvent xmlEvent = xmlEventReader.nextEvent();
|
||||
|
@ -516,6 +522,11 @@ public class XML_processing {
|
|||
.replace("#", "");
|
||||
|
||||
resultTaxonomy.add(tax);
|
||||
// kost
|
||||
} else if (parseTaxonomy && (corpusType == CorpusType.KOST) && elementName.equalsIgnoreCase("note")) {
|
||||
filterName = startElement.getAttributeByName(QName.valueOf("ana"))
|
||||
.getValue().replace("#", "");
|
||||
insideNote = true;
|
||||
// solar
|
||||
} else if (!parseTaxonomy) {
|
||||
boolean inHeadTags = false;
|
||||
|
@ -533,13 +544,22 @@ public class XML_processing {
|
|||
}
|
||||
}
|
||||
}
|
||||
} else if (xmlEvent.isEndElement() && corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
|
||||
} else if (xmlEvent.isEndElement() && corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName) && (corpusType != CorpusType.KOST)) {
|
||||
// if the corpus is split into multiple files, each with only one header block per file
|
||||
// that means we should stop after we reach the end of the header
|
||||
return parseTaxonomy ? resultTaxonomy : resultFilters;
|
||||
} else if (xmlEvent.isEndElement() && !corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
|
||||
// whole corpus in one file, so we have to continue reading in order to find all header blocks
|
||||
insideHeader = false;
|
||||
} else if (xmlEvent.isEndElement() && insideNote) {
|
||||
if (taxonomyNames.contains(filterName)) {
|
||||
Collections.addAll(resultTaxonomy, Taxonomy.format_KOST_taxonomy(filterValue, filterName));
|
||||
}
|
||||
|
||||
insideNote = false;
|
||||
} else if (xmlEvent.isCharacters() && insideNote) {
|
||||
Characters characters = xmlEvent.asCharacters();
|
||||
filterValue = characters.getData();
|
||||
}
|
||||
}
|
||||
} catch (XMLStreamException e) {
|
||||
|
@ -726,6 +746,8 @@ public class XML_processing {
|
|||
boolean inPunctuation = false;
|
||||
boolean taxonomyMatch = true;
|
||||
ArrayList<Taxonomy> currentFiletaxonomy = new ArrayList<>();
|
||||
HashSet<String> taxonomyNames = new HashSet<String>(
|
||||
Arrays.asList("FirstLang", "TaskSetting", "ProficSlv", "ProgramType", "InputType"));
|
||||
String lemma = "";
|
||||
String msd = "";
|
||||
|
||||
|
@ -760,6 +782,9 @@ public class XML_processing {
|
|||
try {
|
||||
XMLInputFactory factory = XMLInputFactory.newInstance();
|
||||
eventReader = factory.createXMLEventReader(new FileInputStream(path));
|
||||
boolean insideNote = false;
|
||||
String filterName = "";
|
||||
String filterValue = "";
|
||||
|
||||
while (eventReader.hasNext()) {
|
||||
int percentage = (int) (lineNum * 100.0 / numLines);
|
||||
|
@ -803,6 +828,12 @@ public class XML_processing {
|
|||
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()).replace("#", ""), stats.getCorpus());
|
||||
currentFiletaxonomy.add(currentFiletaxonomyElement);
|
||||
}
|
||||
// kost
|
||||
} else if (stats.getCorpus().getCorpusType() == CorpusType.KOST && stats.getCorpus().getTaxonomy().size() > 0 && qName.equalsIgnoreCase("note")) {
|
||||
filterName = startElement.getAttributeByName(QName.valueOf("ana"))
|
||||
.getValue().replace("#", "");
|
||||
insideNote = true;
|
||||
|
||||
} else if (stats.getCorpus().getTaxonomy().size() > 0 && qName.equalsIgnoreCase("catRef")) {
|
||||
// get value from attribute target
|
||||
Attribute tax = startElement.getAttributeByName(QName.valueOf("target"));
|
||||
|
@ -836,6 +867,10 @@ public class XML_processing {
|
|||
sentence.add(createWord(punctuation, punctuation, "/", punctuation, stats.getFilter()));
|
||||
inPunctuation = false;
|
||||
}
|
||||
// kost
|
||||
if (insideNote) {
|
||||
filterValue = characters.getData();
|
||||
}
|
||||
break;
|
||||
|
||||
case XMLStreamConstants.END_ELEMENT:
|
||||
|
@ -892,7 +927,7 @@ public class XML_processing {
|
|||
// taxonomies don't match so stop
|
||||
// union (select words that match any of selected taxonomy
|
||||
taxonomyMatch = false;
|
||||
//
|
||||
|
||||
} else if(stats.getFilter().getTaxonomySetOperation().equals(I18N.get("taxonomySetOperation.INTERSECTION")) && currentFiletaxonomy.size() != stats.getFilter().getTaxonomy().size()){
|
||||
// intersection (select only words that precisely match selected taxonomy
|
||||
taxonomyMatch = false;
|
||||
|
@ -900,6 +935,17 @@ public class XML_processing {
|
|||
}
|
||||
} else if (endElement.getName().getLocalPart().equals("text")){
|
||||
taxonomyMatch = false;
|
||||
// kost
|
||||
}
|
||||
if (insideNote) {
|
||||
if (taxonomyNames.contains(filterName)) {
|
||||
for (String taxonomy : Taxonomy.format_KOST_taxonomy(filterValue, filterName)) {
|
||||
// keep only taxonomy properties
|
||||
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(taxonomy, stats.getCorpus());
|
||||
currentFiletaxonomy.add(currentFiletaxonomyElement);
|
||||
}
|
||||
}
|
||||
insideNote = false;
|
||||
}
|
||||
|
||||
break;
|
||||
|
|
|
@ -9,6 +9,7 @@ public enum CorpusType {
|
|||
CCKRES("ccKres ", "cckres"),
|
||||
SOLAR("Šolar", "šolar"),
|
||||
GOS("GOS", "gos"),
|
||||
KOST("KOST", "kost"),
|
||||
SSJ500K("ssj500k", "ssj500k"),
|
||||
VERT("vert", "vert");
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@ import javafx.collections.ObservableList;
|
|||
public class Tax {
|
||||
private static LinkedHashMap<String, String> GIGAFIDA_TAXONOMY;
|
||||
private static LinkedHashMap<String, String> GOS_TAXONOMY;
|
||||
private static final HashSet<CorpusType> corpusTypesWithTaxonomy = new HashSet<>(Arrays.asList(CorpusType.GIGAFIDA, CorpusType.GOS, CorpusType.CCKRES, CorpusType.SSJ500K, CorpusType.GIGAFIDA2, CorpusType.VERT));
|
||||
private static final HashSet<CorpusType> corpusTypesWithTaxonomy = new HashSet<>(Arrays.asList(CorpusType.GIGAFIDA, CorpusType.GOS, CorpusType.CCKRES, CorpusType.KOST, CorpusType.SSJ500K, CorpusType.GIGAFIDA2, CorpusType.VERT));
|
||||
|
||||
static {
|
||||
// GIGAFIDA ----------------------------
|
||||
|
@ -108,7 +108,7 @@ public class Tax {
|
|||
tax = GIGAFIDA_TAXONOMY;
|
||||
} else if (corpusType == CorpusType.GOS) {
|
||||
tax = GOS_TAXONOMY;
|
||||
} else if (corpusType == CorpusType.VERT || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2){
|
||||
} else if (corpusType == CorpusType.VERT || corpusType == CorpusType.KOST || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2){
|
||||
// if VERT only order taxonomy by alphabet
|
||||
ArrayList<String> sortedFoundTaxonomy = new ArrayList<>(foundTax);
|
||||
Collections.sort(sortedFoundTaxonomy);
|
||||
|
@ -199,7 +199,7 @@ public class Tax {
|
|||
tax = GIGAFIDA_TAXONOMY;
|
||||
} else if (corpusType == CorpusType.GOS) {
|
||||
tax = GOS_TAXONOMY;
|
||||
} else if (corpusType == CorpusType.VERT || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2) {
|
||||
} else if (corpusType == CorpusType.VERT || corpusType == CorpusType.KOST || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2) {
|
||||
for (Taxonomy t : taxonomy) {
|
||||
result.add(t.toLongNameString());
|
||||
}
|
||||
|
|
|
@ -763,6 +763,42 @@ public class Taxonomy {
|
|||
|
||||
}
|
||||
|
||||
public static String[] format_KOST_taxonomy(String value, String parameter) {
|
||||
Map<String, String> filterMap = new HashMap<>();
|
||||
filterMap.put("FirstLang", "Prvi Jezik tvorca");
|
||||
filterMap.put("TaskSetting", "Okoliščine nastanka");
|
||||
filterMap.put("ProficSlv", "Nivo");
|
||||
filterMap.put("ProgramType", "Program");
|
||||
filterMap.put("InputType", "Napisano");
|
||||
|
||||
String[] split_value = new String[] {};
|
||||
if (parameter.equals("FirstLang")) {
|
||||
if (value.contains(", ")) {
|
||||
split_value = value.split(", ");
|
||||
} else if (value.contains(" ")) {
|
||||
for (String v : value.split(" ")) {
|
||||
if (v.equals("španščina") || v.equals("angleščina")) {
|
||||
split_value = new String[] {v};
|
||||
}
|
||||
}
|
||||
} else {
|
||||
split_value = new String[] {value};
|
||||
}
|
||||
} else if (parameter.equals("ProficSlv")) {
|
||||
if (value.equals("Izpopolnjevalec")) {
|
||||
split_value = new String[] {"izpopolnjevalec"};
|
||||
} else {
|
||||
split_value = new String[] {value};
|
||||
}
|
||||
} else {
|
||||
split_value = new String[] {value};
|
||||
}
|
||||
|
||||
return Arrays.stream(split_value)
|
||||
.map(val -> filterMap.get(parameter) + " - " + val)
|
||||
.toArray(String[]::new);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return this.name;
|
||||
}
|
||||
|
@ -834,7 +870,7 @@ public class Taxonomy {
|
|||
|
||||
public static ArrayList<Taxonomy> modifyingTaxonomy(ArrayList<Taxonomy> taxonomy, ObservableList<String> checkedItems, Corpus corpus){
|
||||
ArrayList<TaxonomyEnum> checkedItemsTaxonomy = TaxonomyEnum.convertStringListToTaxonomyList(checkedItems, corpus);
|
||||
if (checkedItemsTaxonomy != null && corpus.getCorpusType() != CorpusType.VERT && corpus.getCorpusType() != CorpusType.SSJ500K && corpus.getCorpusType() != CorpusType.GIGAFIDA2) {
|
||||
if (checkedItemsTaxonomy != null && corpus.getCorpusType() != CorpusType.VERT && corpus.getCorpusType() != CorpusType.KOST && corpus.getCorpusType() != CorpusType.SSJ500K && corpus.getCorpusType() != CorpusType.GIGAFIDA2) {
|
||||
TaxonomyEnum.modifyingTaxonomy(Taxonomy.taxonomyToTaxonomyEnum(taxonomy), checkedItemsTaxonomy, corpus);
|
||||
return taxonomyEnumToTaxonomy(checkedItemsTaxonomy, corpus);
|
||||
} else {
|
||||
|
|
|
@ -5,6 +5,7 @@ import static gui.GUIController.*;
|
|||
import static util.Util.*;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileWriter;
|
||||
import java.io.IOException;
|
||||
import java.lang.reflect.Constructor;
|
||||
import java.lang.reflect.Field;
|
||||
|
@ -149,7 +150,7 @@ public class CorpusTab {
|
|||
private String corpusLocation;
|
||||
private String corpusFilesSize;
|
||||
|
||||
private static final String [] SELECT_READER_ARRAY = {"VERT + REGI", "XML (Šolar 1.0)", "XML (GOS 1.0)", "XML (ssj500k 2.1)", "XML (Gigafida 2.0)", "XML (Gigafida 1.0, Kres 1.0)"};
|
||||
private static final String [] SELECT_READER_ARRAY = {"VERT + REGI", "XML (Šolar 1.0)", "XML (GOS 1.0)", "XML (KOST 2.0)", "XML (ssj500k 2.1)", "XML (Gigafida 2.0)", "XML (Gigafida 1.0, Kres 1.0)"};
|
||||
private static final ArrayList<String> SELECT_READER = new ArrayList<>(Arrays.asList(SELECT_READER_ARRAY));
|
||||
|
||||
private static final String [] PUNCTUATION_ARRAY = {"punctuation.COMMA", "punctuation.POINT"};
|
||||
|
@ -497,7 +498,7 @@ public class CorpusTab {
|
|||
|
||||
logger.info("reading header data for ", corpusType.toString());
|
||||
|
||||
if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.GOS || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2) {
|
||||
if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.GOS || corpusType == CorpusType.KOST || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2) {
|
||||
boolean corpusIsSplit = corpusFiles.size() > 1;
|
||||
|
||||
final Task<HashSet<String>> task = new Task<HashSet<String>>() {
|
||||
|
@ -738,7 +739,6 @@ public class CorpusTab {
|
|||
|
||||
private void selectReader() {
|
||||
switch (selectReader) {
|
||||
// "vert", "Solar", "GOS", "SSJ500K", "Gigafida", "Gigafida (old)", "Kres (old)"
|
||||
case "VERT + REGI":
|
||||
corpusType = VERT;
|
||||
break;
|
||||
|
@ -748,6 +748,9 @@ public class CorpusTab {
|
|||
case "XML (GOS 1.0)":
|
||||
corpusType = GOS;
|
||||
break;
|
||||
case "XML (KOST 2.0)":
|
||||
corpusType = KOST;
|
||||
break;
|
||||
case "XML (ssj500k 2.1)":
|
||||
corpusType = SSJ500K;
|
||||
break;
|
||||
|
@ -786,6 +789,8 @@ public class CorpusTab {
|
|||
corpusType = GOS;
|
||||
} else if (attrib.contains(SSJ500K.getNameLowerCase())) {
|
||||
corpusType = SSJ500K;
|
||||
} else if (attrib.contains(KOST.getNameLowerCase())) {
|
||||
corpusType = KOST;
|
||||
}
|
||||
|
||||
if (corpusType == null) {
|
||||
|
|
Loading…
Reference in New Issue
Block a user