Added read taxonomy for vert
This commit is contained in:
@@ -2,8 +2,7 @@ package alg;
|
||||
|
||||
import static data.Enums.solar.SolarFilters.*;

import java.io.*;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ForkJoinPool;

import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.*;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.apache.logging.log4j.LogManager;

import data.*;
|
||||
@@ -436,6 +437,62 @@ public class XML_processing {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Parses XML headers for information about its taxonomy (if supported) or filters (solar)
|
||||
*
|
||||
* @param filepath
|
||||
* @param corpusIsSplit is corpus split into multiple xml files, or are all entries grouped into one large xml file
|
||||
* @param corpusType
|
||||
*/
|
||||
public static HashSet<String> readVertHeaderTaxonomyAndFilters(String filepath, boolean corpusIsSplit, CorpusType corpusType) {
|
||||
// boolean parseTaxonomy = Tax.getCorpusTypesWithTaxonomy().contains(corpusType);
|
||||
// solar
|
||||
Set<String> headTags = null;
|
||||
HashMap<String, HashSet<String>> resultFilters = new HashMap<>();
|
||||
// taxonomy corpora
|
||||
HashSet<String> resultTaxonomy = new HashSet<>();
|
||||
|
||||
LineIterator it = null;
|
||||
try {
|
||||
it = FileUtils.lineIterator(new File(filepath), "UTF-8");
|
||||
try {
|
||||
boolean insideHeader = false;
|
||||
|
||||
while (it.hasNext()) {
|
||||
String line = it.nextLine();
|
||||
|
||||
if (line.length() > 4 && line.substring(1, 5).equals("text")) {
|
||||
// split over "\" "
|
||||
String[] split = line.split("\" ");
|
||||
// String mediumId = "";
|
||||
// String typeId = "";
|
||||
// String proofreadId = "";
|
||||
for (String el : split) {
|
||||
String[] attribute = el.split("=\"");
|
||||
if (attribute[0].equals("medium_id")) {
|
||||
// mediumId = attribute[1];
|
||||
resultTaxonomy.add(attribute[1]);
|
||||
} else if (attribute[0].equals("type_id")) {
|
||||
// typeId = attribute[1];
|
||||
resultTaxonomy.add(attribute[1]);
|
||||
} else if (attribute[0].equals("proofread_id")) {
|
||||
// proofreadId = attribute[1];
|
||||
resultTaxonomy.add(attribute[1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
LineIterator.closeQuietly(it);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
resultTaxonomy.remove("-");
|
||||
return resultTaxonomy;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses XML headers for information about its taxonomy (if supported) or filters (solar)
|
||||
*
|
||||
|
||||
Reference in New Issue
Block a user