From eb72b380a5ece6d93ce4b65b206b65dcbd6d27c9 Mon Sep 17 00:00:00 2001 From: lkrsnik Date: Thu, 20 Jun 2024 12:13:27 +0200 Subject: [PATCH] Fixed taxonomy processing for KOST + Added ignoring of certain files in KOST --- build_instructions.md | 1 + src/main/java/alg/XML_processing.java | 11 ++++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/build_instructions.md b/build_instructions.md index 930ec08..405942e 100644 --- a/build_instructions.md +++ b/build_instructions.md @@ -3,6 +3,7 @@ ```shell mvn package ``` +- results are in shade folder # Build executable using Launch4j - Install Java on Windows diff --git a/src/main/java/alg/XML_processing.java b/src/main/java/alg/XML_processing.java index 26ed8e5..a6e9e8e 100755 --- a/src/main/java/alg/XML_processing.java +++ b/src/main/java/alg/XML_processing.java @@ -501,6 +501,10 @@ public class XML_processing { // this toggle is true when we're inside a header (next block of code executes) // and false when we're not (skip reading unnecessary attributes) insideHeader = true; + } else if (corpusType == CorpusType.KOST && elementName.equals("standOff") || + corpusType == CorpusType.KOST && elementName.equals("TEI") + ) { + return resultTaxonomy; } if (insideHeader) { @@ -849,6 +853,10 @@ public class XML_processing { } else if (qName.equals("text")){ taxonomyMatch = true; + } else if (stats.getCorpus().getCorpusType() == CorpusType.KOST && qName.equals("standOff") || + stats.getCorpus().getCorpusType() == CorpusType.KOST && qName.equals("TEI") + ) { + return true; } break; @@ -911,7 +919,8 @@ public class XML_processing { } // fallback else if (endElement.getName().getLocalPart().equalsIgnoreCase("div") && - stats.getCorpus().getCorpusType() == CorpusType.SSJ500K) { + (stats.getCorpus().getCorpusType() == CorpusType.SSJ500K || + stats.getCorpus().getCorpusType() == CorpusType.KOST)) { // join corpus and stats fj(corpus, stats); corpus.clear();