Added new ssj500k reading option. Fixed GOS taxonomy

2018-09-03 13:31:41 +02:00
parent 426a9ccc46
commit 1d9e9b7ed6
9 changed files with 280 additions and 40 deletions
--- a/src/main/java/alg/XML_processing.java
+++ b/src/main/java/alg/XML_processing.java
@@ -52,7 +52,9 @@ public class XML_processing {
 			readXMLGos(path, stats);
 		} else if (stats.getCorpus().getCorpusType() == CorpusType.SOLAR) {
 			readXMLSolar(path, stats);
-		}
+		} else if (stats.getCorpus().getCorpusType() == CorpusType.SSJ500K) {
+            readXMLSSJ500K(path, stats);
+        }
 	}

 	/**
@@ -91,6 +93,50 @@ public class XML_processing {
 		return "";
 	}

+	/**
+	 * Reads a header value from the XML file at {@code path}.
+	 * <p>
+	 * The file is scanned for the first start element whose local name matches {@code tag}
+	 * (case-insensitively). If that element carries a {@code base} attribute (e.g. the body
+	 * element of ssj500k, used for discerning the corpus' type), the attribute value without its
+	 * trailing 12 characters is returned. Otherwise the character data that immediately follows
+	 * the start tag is returned.
+	 * <p>
+	 * Notice: only the first occurrence of the given tag name is considered.
+	 * <p>
+	 * NOTE(review): {@code attribute} is not consulted by this method; the attribute lookup is
+	 * hard-wired to {@code "base"}. Confirm with callers before changing this.
+	 *
+	 * @param path      path of the XML file to read
+	 * @param tag       local name of the element to look for (compared case-insensitively)
+	 * @param attribute name of the requested attribute (currently unused, see note above)
+	 * @return the extracted value, or an empty string when the tag is not found, the {@code base}
+	 *         value is shorter than the 12 characters to be cut off, the element has no leading
+	 *         character data, or the file cannot be read or parsed
+	 */
+	public static String readXMLHeaderAttribute(String path, String tag, String attribute) {
+		// Number of trailing characters cut from the "base" value.
+		// Presumably the length of a file-name suffix such as "-sl.body.xml" - TODO confirm.
+		final int baseSuffixLength = 12;
+
+		XMLInputFactory factory = XMLInputFactory.newInstance();
+
+		// The stream is managed separately: closing an XMLEventReader does not close its input source.
+		try (FileInputStream input = new FileInputStream(path)) {
+			XMLEventReader eventReader = factory.createXMLEventReader(input);
+			try {
+				while (eventReader.hasNext()) {
+					XMLEvent xmlEvent = eventReader.nextEvent();
+					if (!xmlEvent.isStartElement()) {
+						continue;
+					}
+
+					StartElement startElement = xmlEvent.asStartElement();
+					if (!startElement.getName().getLocalPart().equalsIgnoreCase(tag)) {
+						continue;
+					}
+
+					HashMap<String, String> att = extractAttributes(startElement);
+					if (att.containsKey("base")) {
+						String base = att.get("base");
+						// too short to carry the suffix: nothing meaningful is left after cutting
+						if (base == null || base.length() < baseSuffixLength) {
+							return "";
+						}
+						return base.substring(0, base.length() - baseSuffixLength);
+					}
+
+					// An empty element or one starting with a child element has no leading text.
+					XMLEvent content = eventReader.nextEvent();
+					return content.isCharacters() ? content.asCharacters().getData() : "";
+				}
+			} finally {
+				try {
+					eventReader.close();
+				} catch (XMLStreamException e) {
+					logger.error("closing stream", e);
+				}
+			}
+		} catch (java.io.IOException | XMLStreamException e) {
+			// FileNotFoundException is an IOException; IOException also covers closing the stream
+			logger.error("reading header attribute", e);
+		}
+		return "";
+	}
+
 	private static void fj(List<Sentence> corpus, StatisticsNew stats) {
 		ForkJoinPool pool = new ForkJoinPool();

@@ -403,7 +449,9 @@ public class XML_processing {

 			// init results now to avoid null pointers
 			headTags.forEach(f -> resultFilters.put(f, new HashSet<>()));
-		} else {
+		} else if (corpusType == CorpusType.SSJ500K) {
+            headTagName = "bibl";
+        } else {
 			headTagName = "teiHeader";
 		}

@@ -437,7 +485,13 @@ public class XML_processing {
 									.replace("#", "");

 							resultTaxonomy.add(tax);
-						} else if (!parseTaxonomy && headTags.contains(elementName)) {
+						} else if (parseTaxonomy && elementName.equalsIgnoreCase("term")) {
+                            String tax = startElement.getAttributeByName(QName.valueOf("ref"))
+                                    .getValue()
+                                    .replace("#", "");
+
+                            resultTaxonomy.add(tax);
+                        } else if (!parseTaxonomy && headTags.contains(elementName)) {
 							String tagContent = xmlEventReader.nextEvent().asCharacters().getData();
 							resultFilters.get(elementName).add(tagContent);
 						}
@@ -646,6 +700,138 @@ public class XML_processing {
 		return true;
 	}

+    /**
+     * Reads a corpus file in the ssj500k XML layout and feeds its sentences to {@code fj}.
+     * <p>
+     * The file is streamed event by event:
+     * <ul>
+     * <li>{@code w} elements supply a word; its {@code ana} attribute (expected to start with
+     * {@code "msd:"}) yields the msd and its {@code lemma} attribute the lemma.</li>
+     * <li>{@code pc} elements supply punctuation, which is recorded only when the filter's n-gram
+     * value is above 1, punctuation noting is enabled and the sentence already holds a word.</li>
+     * <li>{@code term} elements carrying a {@code ref} attribute supply the taxonomy that is
+     * attached to every following sentence until the enclosing {@code div} ends.</li>
+     * <li>The end of an {@code s} element closes the sentence, which is run through
+     * {@code runFilters} and collected if non-empty.</li>
+     * </ul>
+     * Collected sentences are passed to {@code fj} whenever
+     * {@code Settings.CORPUS_SENTENCE_LIMIT} is reached, at the end of each {@code div}, and once
+     * more after the last event so that no buffered sentence is dropped.
+     *
+     * @param path  path of the XML file to read
+     * @param stats statistics object that supplies the filter and receives the results
+     * @return always {@code true}; read and parse failures are logged, not reported to the caller
+     */
+    @SuppressWarnings("Duplicates")
+    public static boolean readXMLSSJ500K(String path, StatisticsNew stats) {
+        boolean inWord = false;
+        boolean inPunctuation = false;
+        ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
+        String lemma = "";
+        String msd = "";
+
+        List<Word> sentence = new ArrayList<>();
+        List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it
+        String sentenceDelimiter = "s";
+
+        // The stream is managed separately: closing an XMLEventReader does not close its input source.
+        try (FileInputStream input = new FileInputStream(path)) {
+            XMLInputFactory factory = XMLInputFactory.newInstance();
+            XMLEventReader eventReader = factory.createXMLEventReader(input);
+
+            try {
+                while (eventReader.hasNext()) {
+                    XMLEvent event = eventReader.nextEvent();
+
+                    switch (event.getEventType()) {
+                        case XMLStreamConstants.START_ELEMENT:
+                            StartElement startElement = event.asStartElement();
+                            String qName = startElement.getName().getLocalPart();
+
+                            // "word" node
+                            if (qName.equals("w")) {
+                                inWord = true;
+
+                                // a missing attribute is treated as an empty value instead of failing
+                                Attribute anaAttribute = startElement.getAttributeByName(QName.valueOf("ana"));
+                                String ana = anaAttribute == null ? "" : anaAttribute.getValue();
+                                if (!ana.startsWith("msd:")) {
+                                    System.out.println("MSD written incorrectly");
+                                }
+                                // drop the 4-character prefix; shorter values leave nothing behind
+                                msd = ana.length() >= 4 ? ana.substring(4) : "";
+
+                                Attribute lemmaAttribute = startElement.getAttributeByName(QName.valueOf("lemma"));
+                                lemma = lemmaAttribute == null ? "" : lemmaAttribute.getValue();
+                            }
+
+                            // punctuation node
+                            else if (qName.equals("pc")) {
+                                inPunctuation = true;
+                            }
+
+                            // taxonomy node
+                            else if (qName.equalsIgnoreCase("term")) {
+                                // there are some term nodes at the beginning that are of no interest to us
+                                // they differ by not having the attribute "ref", so tax will equal null
+                                Attribute tax = startElement.getAttributeByName(QName.valueOf("ref"));
+
+                                if (tax != null) {
+                                    // keep only taxonomy properties
+                                    String taxonomyElement = String.valueOf(tax.getValue()).replace("#", "");
+                                    Tax taxonomy = new Tax();
+                                    currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(taxonomyElement));
+                                }
+                            }
+                            break;
+
+                        case XMLStreamConstants.CHARACTERS:
+                            Characters characters = event.asCharacters();
+
+                            if (inWord) {
+                                // "word" node value
+                                String word = characters.getData();
+                                sentence.add(createWord(word, lemma, msd, word, stats.getFilter()));
+                                inWord = false;
+                            } else if (inPunctuation) {
+                                // "punctuation" node value
+                                if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && sentence.size() > 0) {
+                                    String punctuation = characters.getData();
+                                    sentence.add(createWord(punctuation, punctuation, "/", punctuation, stats.getFilter()));
+                                }
+                                // always consume the flag, even for skipped punctuation; otherwise the
+                                // text of the next word would be added a second time as punctuation
+                                inPunctuation = false;
+                            }
+                            break;
+
+                        case XMLStreamConstants.END_ELEMENT:
+                            EndElement endElement = event.asEndElement();
+                            String endName = endElement.getName().getLocalPart();
+
+                            // an element without text must not leak its flag into following text
+                            if (endName.equals("w")) {
+                                inWord = false;
+                            } else if (endName.equals("pc")) {
+                                inPunctuation = false;
+                            }
+
+                            // parser reached end of the current sentence
+                            if (endName.equals(sentenceDelimiter)) {
+                                // add sentence to corpus if it passes filters
+                                sentence = runFilters(sentence, stats.getFilter());
+
+                                if (!ValidationUtil.isEmpty(sentence)) {
+                                    corpus.add(new Sentence(sentence, currentFiletaxonomyLong));
+                                }
+
+                                // and start a new one
+                                sentence = new ArrayList<>();
+
+                                /* Invoke Fork-Join when we reach maximum limit of
+                                 * sentences (because we can't read everything to
+                                 * memory) or we reach the end of the file.
+                                 */
+                                if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
+                                    fj(corpus, stats);
+                                    // empty the current corpus, since we don't need the data anymore
+                                    corpus.clear();
+
+                                    // TODO: if (stats.isUseDB()) {
+                                    // 	stats.storeTmpResultsToDB();
+                                    // }
+                                }
+                            }
+                            // fallback
+                            else if (endName.equalsIgnoreCase("div")) {
+                                // join corpus and stats
+                                fj(corpus, stats);
+                                corpus.clear();
+
+                                // taxonomy is collected per div, so start a fresh list for the next one
+                                currentFiletaxonomyLong = new ArrayList<>();
+                            }
+
+                            break;
+                    }
+                }
+
+                // sentences that were closed outside of any div would otherwise never be processed
+                if (!corpus.isEmpty()) {
+                    fj(corpus, stats);
+                    corpus.clear();
+                }
+            } finally {
+                try {
+                    eventReader.close();
+                } catch (XMLStreamException e) {
+                    logger.error("closing stream", e);
+                }
+            }
+        } catch (java.io.IOException | XMLStreamException e) {
+            // FileNotFoundException is an IOException; IOException also covers closing the stream
+            logger.error("reading ssj500k corpus", e);
+        }
+
+        return true;
+    }
+
 	@SuppressWarnings("Duplicates")
 	public static boolean readXMLGos(String path, StatisticsNew stats) {
 		boolean inWord = false;
@@ -853,6 +1039,9 @@ public class XML_processing {
 						else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
 							fj(corpus, stats);
 							corpus.clear();
+
+                            currentFiletaxonomy = new ArrayList<>();
+                            currentFiletaxonomyLong = new ArrayList<>();
 						}

 						break;
@@ -914,7 +1103,7 @@ public class XML_processing {
 		return atts;
 	}

-	private static Word createWord(String word, String lemma, String msd, String normalizedWord, Filter f){
+	public static Word createWord(String word, String lemma, String msd, String normalizedWord, Filter f){
 		List<String> wString = new ArrayList<>();
 		if (f.getWordParts().contains(CalculateFor.WORD))
 			wString.add(word);