Fixed taxonomy processing for KOST + Added ignoring of certain files in KOST

This commit is contained in:
lkrsnik 2024-06-20 12:13:27 +02:00
parent 6f09cf9bed
commit eb72b380a5
2 changed files with 11 additions and 1 deletions

View File

@@ -3,6 +3,7 @@
```shell ```shell
mvn package mvn package
``` ```
- The build results are in the `shade` folder
# Build executable using Launch4j # Build executable using Launch4j
- Install Java on Windows - Install Java on Windows

View File

@@ -501,6 +501,10 @@ public class XML_processing {
// this toggle is true when we're inside a header (next block of code executes) // this toggle is true when we're inside a header (next block of code executes)
// and false when we're not (skip reading unnecessary attributes) // and false when we're not (skip reading unnecessary attributes)
insideHeader = true; insideHeader = true;
} else if (corpusType == CorpusType.KOST && elementName.equals("standOff") ||
corpusType == CorpusType.KOST && elementName.equals("TEI")
) {
return resultTaxonomy;
} }
if (insideHeader) { if (insideHeader) {
@@ -849,6 +853,10 @@ public class XML_processing {
} else if (qName.equals("text")){ } else if (qName.equals("text")){
taxonomyMatch = true; taxonomyMatch = true;
} else if (stats.getCorpus().getCorpusType() == CorpusType.KOST && qName.equals("standOff") ||
stats.getCorpus().getCorpusType() == CorpusType.KOST && qName.equals("TEI")
) {
return true;
} }
break; break;
@@ -911,7 +919,8 @@ public class XML_processing {
} }
// fallback // fallback
else if (endElement.getName().getLocalPart().equalsIgnoreCase("div") && else if (endElement.getName().getLocalPart().equalsIgnoreCase("div") &&
stats.getCorpus().getCorpusType() == CorpusType.SSJ500K) { (stats.getCorpus().getCorpusType() == CorpusType.SSJ500K ||
stats.getCorpus().getCorpusType() == CorpusType.KOST)) {
// join corpus and stats // join corpus and stats
fj(corpus, stats); fj(corpus, stats);
corpus.clear(); corpus.clear();