Fixed taxonomy processing for KOST + Added ignoring of certain files in KOST

This commit is contained in:
lkrsnik 2024-06-20 12:13:27 +02:00
parent 6f09cf9bed
commit eb72b380a5
2 changed files with 11 additions and 1 deletion

View File

@@ -3,6 +3,7 @@
```shell
mvn package
```
- Results are in the `shade` folder
# Build executable using Launch4j
- Install Java on Windows

View File

@@ -501,6 +501,10 @@ public class XML_processing {
// this toggle is true when we're inside a header (next block of code executes)
// and false when we're not (skip reading unnecessary attributes)
insideHeader = true;
} else if (corpusType == CorpusType.KOST && elementName.equals("standOff") ||
corpusType == CorpusType.KOST && elementName.equals("TEI")
) {
return resultTaxonomy;
}
if (insideHeader) {
@@ -849,6 +853,10 @@ public class XML_processing {
} else if (qName.equals("text")){
taxonomyMatch = true;
} else if (stats.getCorpus().getCorpusType() == CorpusType.KOST && qName.equals("standOff") ||
stats.getCorpus().getCorpusType() == CorpusType.KOST && qName.equals("TEI")
) {
return true;
}
break;
@@ -911,7 +919,8 @@ public class XML_processing {
}
// fallback
else if (endElement.getName().getLocalPart().equalsIgnoreCase("div") &&
-stats.getCorpus().getCorpusType() == CorpusType.SSJ500K) {
+(stats.getCorpus().getCorpusType() == CorpusType.SSJ500K ||
+stats.getCorpus().getCorpusType() == CorpusType.KOST)) {
// join corpus and stats
fj(corpus, stats);
corpus.clear();