Added taxonomy reading for vert files

This commit is contained in:
2018-12-17 12:58:43 +01:00
parent 3889b834e3
commit bb9f3f0fb9
10 changed files with 188 additions and 16 deletions

View File

@@ -2,8 +2,7 @@ package alg;
import static data.Enums.solar.SolarFilters.*;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.*;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ForkJoinPool;
@@ -15,6 +14,8 @@ import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.*;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.apache.logging.log4j.LogManager;
import data.*;
@@ -436,6 +437,62 @@ public class XML_processing {
return true;
}
/**
 * Scans a vert file line by line and collects the taxonomy ids found on its {@code text} tag lines.
 *
 * A line is treated as a text tag when characters 1-4 spell {@code text} (character 0 is not inspected).
 * On such lines the values of the {@code medium_id}, {@code type_id} and {@code proofread_id}
 * attributes are gathered; the placeholder value {@code "-"} and empty values are not reported.
 *
 * If the file cannot be opened, the stack trace is printed and whatever was collected so far
 * (possibly nothing) is returned.
 *
 * @param filepath path of the vert file to scan; it is read as UTF-8
 * @param corpusIsSplit currently unused by this method
 * @param corpusType currently unused by this method
 * @return set of distinct taxonomy ids found in the file, never {@code null}
 */
public static HashSet<String> readVertHeaderTaxonomyAndFilters(String filepath, boolean corpusIsSplit, CorpusType corpusType) {
    HashSet<String> resultTaxonomy = new HashSet<>();

    LineIterator it = null;
    try {
        it = FileUtils.lineIterator(new File(filepath), "UTF-8");
        while (it.hasNext()) {
            String line = it.nextLine();
            // same check as length() > 4 && substring(1, 5).equals("text"), without the temporary string
            if (line.startsWith("text", 1)) {
                addVertTextTaxonomyIds(line, resultTaxonomy);
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // accepts null, so this is safe even when opening the file failed
        LineIterator.closeQuietly(it);
    }

    // "-" marks a missing value and is not a real taxonomy id
    resultTaxonomy.remove("-");
    return resultTaxonomy;
}

/**
 * Extracts the medium_id, type_id and proofread_id attribute values from one text tag line
 * and adds the non-empty ones to {@code taxonomy}.
 */
private static void addVertTextTaxonomyIds(String line, Set<String> taxonomy) {
    // attributes are separated by a closing quote followed by a space
    for (String chunk : line.split("\" ")) {
        // limit 2 keeps a trailing empty value (name="") instead of dropping it
        String[] attribute = chunk.split("=\"", 2);
        if (attribute.length < 2) {
            // no name="value" pair in this chunk (e.g. a bare tag name)
            continue;
        }

        // the first chunk still carries the tag name in front of the attribute name ("<text medium_id")
        String name = attribute[0].substring(attribute[0].lastIndexOf(' ') + 1);

        // the last chunk still carries the closing quote and everything after it ("value\">")
        String value = attribute[1];
        int closingQuote = value.indexOf('"');
        if (closingQuote >= 0) {
            value = value.substring(0, closingQuote);
        }

        boolean isTaxonomyAttribute = name.equals("medium_id")
                || name.equals("type_id")
                || name.equals("proofread_id");
        if (isTaxonomyAttribute && !value.isEmpty()) {
            taxonomy.add(value);
        }
    }
}
/**
* Parses XML headers for information about its taxonomy (if supported) or filters (solar)
*