Added read taxonomy for vert
This commit is contained in:
@@ -2,8 +2,7 @@ package alg;
|
||||
|
||||
import static data.Enums.solar.SolarFilters.*;

import java.io.*;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ForkJoinPool;

import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.*;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.apache.logging.log4j.LogManager;

import data.*;
|
||||
@@ -436,6 +437,62 @@ public class XML_processing {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Parses XML headers for information about its taxonomy (if supported) or filters (solar)
|
||||
*
|
||||
* @param filepath
|
||||
* @param corpusIsSplit is corpus split into multiple xml files, or are all entries grouped into one large xml file
|
||||
* @param corpusType
|
||||
*/
|
||||
public static HashSet<String> readVertHeaderTaxonomyAndFilters(String filepath, boolean corpusIsSplit, CorpusType corpusType) {
|
||||
// boolean parseTaxonomy = Tax.getCorpusTypesWithTaxonomy().contains(corpusType);
|
||||
// solar
|
||||
Set<String> headTags = null;
|
||||
HashMap<String, HashSet<String>> resultFilters = new HashMap<>();
|
||||
// taxonomy corpora
|
||||
HashSet<String> resultTaxonomy = new HashSet<>();
|
||||
|
||||
LineIterator it = null;
|
||||
try {
|
||||
it = FileUtils.lineIterator(new File(filepath), "UTF-8");
|
||||
try {
|
||||
boolean insideHeader = false;
|
||||
|
||||
while (it.hasNext()) {
|
||||
String line = it.nextLine();
|
||||
|
||||
if (line.length() > 4 && line.substring(1, 5).equals("text")) {
|
||||
// split over "\" "
|
||||
String[] split = line.split("\" ");
|
||||
// String mediumId = "";
|
||||
// String typeId = "";
|
||||
// String proofreadId = "";
|
||||
for (String el : split) {
|
||||
String[] attribute = el.split("=\"");
|
||||
if (attribute[0].equals("medium_id")) {
|
||||
// mediumId = attribute[1];
|
||||
resultTaxonomy.add(attribute[1]);
|
||||
} else if (attribute[0].equals("type_id")) {
|
||||
// typeId = attribute[1];
|
||||
resultTaxonomy.add(attribute[1]);
|
||||
} else if (attribute[0].equals("proofread_id")) {
|
||||
// proofreadId = attribute[1];
|
||||
resultTaxonomy.add(attribute[1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
LineIterator.closeQuietly(it);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
resultTaxonomy.remove("-");
|
||||
return resultTaxonomy;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses XML headers for information about its taxonomy (if supported) or filters (solar)
|
||||
*
|
||||
|
||||
Reference in New Issue
Block a user