diff --git a/src/main/java/alg/XML_processing.java b/src/main/java/alg/XML_processing.java
index a60fc3a..c393fad 100755
--- a/src/main/java/alg/XML_processing.java
+++ b/src/main/java/alg/XML_processing.java
@@ -14,6 +14,10 @@ import javax.xml.stream.XMLStreamConstants;
 import javax.xml.stream.XMLStreamException;
 import javax.xml.stream.events.*;
+import gui.I18N;
+import javafx.beans.property.ReadOnlyDoubleProperty;
+import javafx.beans.property.ReadOnlyDoubleWrapper;
+import javafx.concurrent.Task;
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.LineIterator;
 import org.apache.logging.log4j.LogManager;
@@ -22,290 +26,341 @@ import data.*;
 import gui.ValidationUtil;

 public class XML_processing {
- public final static org.apache.logging.log4j.Logger logger = LogManager.getLogger(XML_processing.class);
+ public final static org.apache.logging.log4j.Logger logger = LogManager.getLogger(XML_processing.class);
- // public static void processCorpus(Statistics stats) {
- // // we can preset the list's size, so there won't be a need to resize it
- // List corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT);
- //
- // int i = 0;
- // for (File f : Settings.corpus) {
- // i++;
- // readXML(f.toString(), stats);
- // }
- // }
+ // progress tracking functionality
+ private static final ReadOnlyDoubleWrapper progress = new ReadOnlyDoubleWrapper();
- // public static void readXML(String path, Statistics stats) {
- // if (stats.getCorpusType() == CorpusType.GIGAFIDA) {
- // readXMLGigafida(path, stats);
- // } else if (stats.getCorpusType() == CorpusType.GOS) {
- // readXMLGos(path, stats);
- // } else if (stats.getCorpusType() == CorpusType.SOLAR) {
- // readXMLSolar(path, stats);
- // }
- // }
+ public static boolean isCancelled = false;
+ public static Date startTime = new Date();
+ public static boolean isCollocability = false;
- public static void readXML(String path, StatisticsNew stats) {
- if (stats.getCorpus().getCorpusType() == CorpusType.GIGAFIDA
- || stats.getCorpus().getCorpusType() == CorpusType.CCKRES) {
- readXMLGigafida(path, stats);
- } else if (stats.getCorpus().getCorpusType() == CorpusType.GOS) {
- readXMLGos(path, stats);
- } else if (stats.getCorpus().getCorpusType() == CorpusType.SOLAR) {
- readXMLSolar(path, stats);
- } else if (stats.getCorpus().getCorpusType() == CorpusType.SSJ500K) {
- readXMLSSJ500K(path, stats);
+ public double getProgress() {
+ return progressProperty().get();
+ }
+
+ public ReadOnlyDoubleProperty progressProperty() {
+ return progress;
+ }
+
+ // public static void processCorpus(Statistics stats) {
+ // // we can preset the list's size, so there won't be a need to resize it
+ // List corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT);
+ //
+ // int i = 0;
+ // for (File f : Settings.corpus) {
+ // i++;
+ // readXML(f.toString(), stats);
+ // }
+ // }
+
+ // public static void readXML(String path, Statistics stats) {
+ // if (stats.getCorpusType() == CorpusType.GIGAFIDA) {
+ // readXMLGigafida(path, stats);
+ // } else if (stats.getCorpusType() == CorpusType.GOS) {
+ // readXMLGos(path, stats);
+ // } else if (stats.getCorpusType() == CorpusType.SOLAR) {
+ // readXMLSolar(path, stats);
+ // }
+ // }
+
+ public static boolean readXML(String path, StatisticsNew stats) {
+ if (stats.getCorpus().getCorpusType() == CorpusType.GIGAFIDA
+ || stats.getCorpus().getCorpusType() == CorpusType.CCKRES) {
+ return readXMLGigafida(path, stats);
+ } else if (stats.getCorpus().getCorpusType() == CorpusType.GOS) {
+ return readXMLGos(path, stats);
+ } else if (stats.getCorpus().getCorpusType() == CorpusType.SOLAR) {
+ return readXMLSolar(path, stats);
+ } else if (stats.getCorpus().getCorpusType() == CorpusType.SSJ500K ||
+ stats.getCorpus().getCorpusType() == CorpusType.GIGAFIDA2) {
+ return readXMLSSJ500K(path, stats);
+ } else if (stats.getCorpus().getCorpusType() == CorpusType.VERT) {
+ return readVERT(path, stats);
 }
- }
+// task.updateProgress(fileNum, size);
+ return false;
+ }
- /**
- * Reads and returns the value of a passed header tag or an empty string.
- * E.g. title tag, for discerning the corpus' type.
- * Notice: returns only the value of the first occurrence of a given tag name.
- */
- public static String readXMLHeaderTag(String path, String tag) {
- XMLInputFactory factory = XMLInputFactory.newInstance();
- XMLEventReader eventReader = null;
+ /**
+ * Reads and returns the value of a passed header tag or an empty string.
+ * E.g. title tag, for discerning the corpus' type.
+ * Notice: returns only the value of the first occurrence of a given tag name.
+ */
+ public static String readXMLHeaderTag(String path, String tag) {
+ XMLInputFactory factory = XMLInputFactory.newInstance();
+ XMLEventReader eventReader = null;
- try {
- eventReader = factory.createXMLEventReader(new FileInputStream(path));
- while (eventReader.hasNext()) {
- XMLEvent xmlEvent = eventReader.nextEvent();
- if (xmlEvent.isStartElement()) {
- StartElement startElement = xmlEvent.asStartElement();
- String var = startElement.getName().getLocalPart();
+ try {
+ eventReader = factory.createXMLEventReader(new FileInputStream(path));
+ while (eventReader.hasNext()) {
+ XMLEvent xmlEvent = eventReader.nextEvent();
+ if (xmlEvent.isStartElement()) {
+ StartElement startElement = xmlEvent.asStartElement();
+ String var = startElement.getName().getLocalPart();
- if (var.equalsIgnoreCase(tag)) {
- return eventReader.nextEvent().asCharacters().getData();
- }
- }
- }
- } catch (FileNotFoundException | XMLStreamException e) {
- e.printStackTrace();
- } finally {
- if (eventReader != null) {
- try {
- eventReader.close();
- } catch (XMLStreamException e) {
- logger.error("closing stream", e);
- }
- }
- }
- return "";
- }
+ if (var.equalsIgnoreCase(tag)) {
+ return eventReader.nextEvent().asCharacters().getData();
+ }
+ }
+ }
+ } catch (FileNotFoundException | XMLStreamException e) {
+ e.printStackTrace();
+ } finally {
+ if (eventReader != null) {
+ try {
+ eventReader.close();
+ } catch (XMLStreamException e) {
+ logger.error("closing stream", e);
+ }
+ }
+ }
+ return "";
+ }
- /**
- * Reads and returns the value of a passed header attribute or an empty string.
- * E.g. body base attribute, for discerning the corpus' type of ssj500k.
- * Notice: returns only the value of the first occurrence of a given tag name.
+ */ + public static String readXMLHeaderAttribute(String path, String tag, String attribute) { + XMLInputFactory factory = XMLInputFactory.newInstance(); + XMLEventReader eventReader = null; - try { - eventReader = factory.createXMLEventReader(new FileInputStream(path)); - while (eventReader.hasNext()) { - XMLEvent xmlEvent = eventReader.nextEvent(); - if (xmlEvent.isStartElement()) { - StartElement startElement = xmlEvent.asStartElement(); - String var = startElement.getName().getLocalPart(); + try { + eventReader = factory.createXMLEventReader(new FileInputStream(path)); + while (eventReader.hasNext()) { + XMLEvent xmlEvent = eventReader.nextEvent(); + if (xmlEvent.isStartElement()) { + StartElement startElement = xmlEvent.asStartElement(); + String var = startElement.getName().getLocalPart(); - if (var.equalsIgnoreCase(tag)) { + if (var.equalsIgnoreCase(tag)) { HashMap att = extractAttributes(startElement); - if (att.containsKey("base")) { - return att.get("base").substring(0, att.get("base").length() - 12); - } + if (att.containsKey("base")) { + return att.get("base").substring(0, att.get("base").length() - 12); + } - return eventReader.nextEvent().asCharacters().getData(); - } - } - } - } catch (FileNotFoundException | XMLStreamException e) { - e.printStackTrace(); - } finally { - if (eventReader != null) { - try { - eventReader.close(); - } catch (XMLStreamException e) { - logger.error("closing stream", e); - } - } - } - return ""; - } + return eventReader.nextEvent().asCharacters().getData(); + } + } + } + } catch (FileNotFoundException | XMLStreamException e) { + e.printStackTrace(); + } finally { + if (eventReader != null) { + try { + eventReader.close(); + } catch (XMLStreamException e) { + logger.error("closing stream", e); + } + } + } + return ""; + } - private static void fj(List corpus, StatisticsNew stats) { - ForkJoinPool pool = new ForkJoinPool(); + private static void fj(List corpus, StatisticsNew stats) { + ForkJoinPool pool = new ForkJoinPool(); - if (stats.getFilter().getAl() == AnalysisLevel.STRING_LEVEL) { - alg.ngram.ForkJoin wc = new alg.ngram.ForkJoin(corpus, stats); - pool.invoke(wc); - } else if (stats.getFilter().getAl() == AnalysisLevel.WORD_LEVEL) { - alg.word.ForkJoin wc = new alg.word.ForkJoin(corpus, stats); - pool.invoke(wc); - } else { - // TODO: - // alg.inflectedJOS.ForkJoin wc = new alg.inflectedJOS.ForkJoin(corpus, stats); - // pool.invoke(wc); - } - } + if (stats.getFilter().getAl() == AnalysisLevel.STRING_LEVEL) { + alg.ngram.ForkJoin wc = new alg.ngram.ForkJoin(corpus, stats); + pool.invoke(wc); + } else if (stats.getFilter().getAl() == AnalysisLevel.WORD_LEVEL) { + alg.word.ForkJoin wc = new alg.word.ForkJoin(corpus, stats); + pool.invoke(wc); + } else { + // TODO: + // alg.inflectedJOS.ForkJoin wc = new alg.inflectedJOS.ForkJoin(corpus, stats); + // pool.invoke(wc); + } + } - // public static void readXMLGos(String path, Statistics stats) { - // boolean in_word = false; - // String taksonomija = ""; - // String lemma = ""; - // String msd = ""; - // String type = stats.isGosOrthMode() ? 
"orth" : "norm"; // orth & norm - // - // List stavek = new ArrayList<>(); - // List corpus = new ArrayList<>(); - // String sentenceDelimiter = "seg"; - // String taxonomyPrefix = "gos."; - // - // try { - // XMLInputFactory factory = XMLInputFactory.newInstance(); - // XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path)); - // - // while (eventReader.hasNext()) { - // XMLEvent event = eventReader.nextEvent(); - // - // switch (event.getEventType()) { - // case XMLStreamConstants.START_ELEMENT: - // - // StartElement startElement = event.asStartElement(); - // String qName = startElement.getName().getLocalPart(); - // - // // "word" node - // if (qName.equals("w")) { - // in_word = true; - // - // if (type.equals("norm")) { - // // make sure we're looking at and not - // Iterator var = startElement.getAttributes(); - // ArrayList attributes = new ArrayList<>(); - // while (var.hasNext()) { - // attributes.add(var.next()); - // } - // - // if (attributes.contains("msd")) { - // msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue()); - // } else { - // msd = null; - // } - // - // if (attributes.contains("lemma")) { - // lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue()); - // } - // } - // } - // // taxonomy node - // else if (qName.equalsIgnoreCase("catRef")) { - // // there are some term nodes at the beginning that are of no interest to us - // // they differ by not having the attribute "ref", so test will equal null - // Attribute test = startElement.getAttributeByName(QName.valueOf("target")); - // - // if (test != null) { - // // keep only taxonomy properties - // taksonomija = String.valueOf(test.getValue()).replace(taxonomyPrefix, ""); - // } - // } else if (qName.equalsIgnoreCase("div")) { - // type = String.valueOf(startElement.getAttributeByName(QName.valueOf("type")).getValue()); - // - // } - // break; - // - // case XMLStreamConstants.CHARACTERS: - // Characters characters = event.asCharacters(); - // - // // "word" node value - // if (in_word) { - // if (type.equals("norm") && msd != null) { - // stavek.add(new Word(characters.getData(), lemma, msd)); - // } else { - // stavek.add(new Word(characters.getData())); - // } - // - // in_word = false; - // } - // break; - // - // case XMLStreamConstants.END_ELEMENT: - // EndElement endElement = event.asEndElement(); - // - // // parser reached end of the current sentence - // if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) { - // // add sentence to corpus - // corpus.add(new Sentence(stavek, taksonomija, type)); - // // and start a new one - // stavek = new ArrayList<>(); - // - // /* Invoke Fork-Join when we reach maximum limit of - // * sentences (because we can't read everything to - // * memory) or we reach the end of the file. 
- // */ - // if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) { - // fj(corpus, stats); - // // empty the current corpus, since we don't need - // // the data anymore - // corpus.clear(); - // } - // } - // - // // backup - // if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) { - // fj(corpus, stats); - // corpus.clear(); - // } - // - // break; - // } - // } - // } catch (FileNotFoundException | XMLStreamException e) { - // e.printStackTrace(); - // } - // } + // public static void readXMLGos(String path, Statistics stats) { + // boolean in_word = false; + // String taksonomija = ""; + // String lemma = ""; + // String msd = ""; + // String type = stats.isGosOrthMode() ? "orth" : "norm"; // orth & norm + // + // List stavek = new ArrayList<>(); + // List corpus = new ArrayList<>(); + // String sentenceDelimiter = "seg"; + // String taxonomyPrefix = "gos."; + // + // try { + // XMLInputFactory factory = XMLInputFactory.newInstance(); + // XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path)); + // + // while (eventReader.hasNext()) { + // XMLEvent event = eventReader.nextEvent(); + // + // switch (event.getEventType()) { + // case XMLStreamConstants.START_ELEMENT: + // + // StartElement startElement = event.asStartElement(); + // String qName = startElement.getName().getLocalPart(); + // + // // "word" node + // if (qName.equals("w")) { + // in_word = true; + // + // if (type.equals("norm")) { + // // make sure we're looking at and not + // Iterator var = startElement.getAttributes(); + // ArrayList attributes = new ArrayList<>(); + // while (var.hasNext()) { + // attributes.add(var.next()); + // } + // + // if (attributes.contains("msd")) { + // msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue()); + // } else { + // msd = null; + // } + // + // if (attributes.contains("lemma")) { + // lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue()); + // } + // } + // } + // // taxonomy node + // else if (qName.equalsIgnoreCase("catRef")) { + // // there are some term nodes at the beginning that are of no interest to us + // // they differ by not having the attribute "ref", so test will equal null + // Attribute test = startElement.getAttributeByName(QName.valueOf("target")); + // + // if (test != null) { + // // keep only taxonomy properties + // taksonomija = String.valueOf(test.getValue()).replace(taxonomyPrefix, ""); + // } + // } else if (qName.equalsIgnoreCase("div")) { + // type = String.valueOf(startElement.getAttributeByName(QName.valueOf("type")).getValue()); + // + // } + // break; + // + // case XMLStreamConstants.CHARACTERS: + // Characters characters = event.asCharacters(); + // + // // "word" node value + // if (in_word) { + // if (type.equals("norm") && msd != null) { + // stavek.add(new Word(characters.getData(), lemma, msd)); + // } else { + // stavek.add(new Word(characters.getData())); + // } + // + // in_word = false; + // } + // break; + // + // case XMLStreamConstants.END_ELEMENT: + // EndElement endElement = event.asEndElement(); + // + // // parser reached end of the current sentence + // if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) { + // // add sentence to corpus + // corpus.add(new Sentence(stavek, taksonomija, type)); + // // and start a new one + // stavek = new ArrayList<>(); + // + // /* Invoke Fork-Join when we reach maximum limit of + // * sentences (because we can't read everything to + // * 
memory) or we reach the end of the file. + // */ + // if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) { + // fj(corpus, stats); + // // empty the current corpus, since we don't need + // // the data anymore + // corpus.clear(); + // } + // } + // + // // backup + // if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) { + // fj(corpus, stats); + // corpus.clear(); + // } + // + // break; + // } + // } + // } catch (FileNotFoundException | XMLStreamException e) { + // e.printStackTrace(); + // } + // } - @SuppressWarnings("unused") - public static void readXMLSolar(String path, StatisticsNew stats) { - boolean in_word = false; + @SuppressWarnings("unused") + public static boolean readXMLSolar(String path, StatisticsNew stats) { + boolean in_word = false; boolean inPunctuation = false; String lemma = ""; - String msd = ""; + String msd = ""; - List stavek = new ArrayList<>(); - List corpus = new ArrayList<>(); + List stavek = new ArrayList<>(); + List corpus = new ArrayList<>(); - // used for filter - Set headTags = new HashSet<>(Arrays.asList("sola", "predmet", "razred", "regija", "tip", "leto")); - Map headBlock = null; - boolean includeThisBlock = false; + // used for filter + Set headTags = new HashSet<>(Arrays.asList("sola", "predmet", "razred", "regija", "tip", "leto")); + Map headBlock = null; + boolean includeThisBlock = false; - try { - XMLInputFactory factory = XMLInputFactory.newInstance(); - XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path)); + int numLines = 0; + int lineNum = 0; + progress.set(0.0); + if(!isCollocability) { + startTime = new Date(); + } + // get number of lines + try { + XMLInputFactory factory = XMLInputFactory.newInstance(); + XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path)); - while (eventReader.hasNext()) { - XMLEvent event = eventReader.nextEvent(); + while (eventReader.hasNext()) + { + eventReader.next(); + numLines ++; + // Loop just in case the file is > Long.MAX_VALUE or skip() decides to not read the entire file + } + } catch (IOException e) { + e.printStackTrace(); + } catch (XMLStreamException e) { + e.printStackTrace(); + } - switch (event.getEventType()) { - case XMLStreamConstants.START_ELEMENT: + try { + XMLInputFactory factory = XMLInputFactory.newInstance(); + XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path)); - StartElement startElement = event.asStartElement(); - // System.out.println(String.format("%s", startElement.toString())); - String qName = startElement.getName().getLocalPart(); + while (eventReader.hasNext()) { + int percentage = (int) (lineNum * 100.0 / numLines); + if(progress.get() < percentage) { + progress.set(percentage); + } + if(isCancelled) { + return false; + } + lineNum ++; + XMLEvent event = eventReader.nextEvent(); - // "word" node - if (qName.equals("w3")) { - in_word = true; + switch (event.getEventType()) { + case XMLStreamConstants.START_ELEMENT: - msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue()); - lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue()); - } else if (qName.equals("c3")) { - String c3Content = eventReader.nextEvent().asCharacters().getData(); + StartElement startElement = event.asStartElement(); + // System.out.println(String.format("%s", startElement.toString())); + String qName = startElement.getName().getLocalPart(); + + // "word" node + if (qName.equals("w3")) { + in_word = true; 
+ + msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue()); + lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue()); + } else if (qName.equals("c3")) { + String c3Content = eventReader.nextEvent().asCharacters().getData(); if(stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && stavek.size() > 0){ @@ -313,485 +368,318 @@ public class XML_processing { } - if (c3Content.equals(".") && includeThisBlock) { - if (stats.getFilter().getNgramValue() == 0){ - int numSentenceParts = 0; - for(Word w : stavek){ - int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1); - numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts; - } - stats.updateUniGramOccurrences(numSentenceParts, new ArrayList<>()); - } else if(stats.getFilter().getNgramValue() >= 1) { - stats.updateUniGramOccurrences(stavek.size(), new ArrayList<>()); - } + if (c3Content.equals(".") && includeThisBlock) { + if (stats.getFilter().getNgramValue() == 0){ + int numSentenceParts = 0; + for(Word w : stavek){ + int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1); + numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts; + } + stats.updateUniGramOccurrences(numSentenceParts, new ArrayList<>()); + } else if(stats.getFilter().getNgramValue() >= 1) { + stats.updateUniGramOccurrences(stavek.size(), new ArrayList<>()); + } - // add sentence to corpus - corpus.add(new Sentence(stavek, null)); - // and start a new one - stavek = new ArrayList<>(); + // add sentence to corpus + corpus.add(new Sentence(stavek, null)); + // and start a new one + stavek = new ArrayList<>(); - /* Invoke Fork-Join when we reach maximum limit of - * sentences (because we can't read everything to - * memory) or we reach the end of the file. - */ - if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) { - fj(corpus, stats); - // empty the current corpus, since we don't need - // the data anymore - corpus.clear(); - } - } - } else if (headTags.contains(qName)) { - String tagContent = eventReader.nextEvent().asCharacters().getData(); - headBlock.put(qName, tagContent); - } else if (qName.equals("head")) { - headBlock = new HashMap<>(); - } + /* Invoke Fork-Join when we reach maximum limit of + * sentences (because we can't read everything to + * memory) or we reach the end of the file. 
+ */ + if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) { + fj(corpus, stats); + // empty the current corpus, since we don't need + // the data anymore + corpus.clear(); + } + } + } else if (headTags.contains(qName)) { + String tagContent = eventReader.nextEvent().asCharacters().getData(); + headBlock.put(qName, tagContent); + } else if (qName.equals("head")) { + headBlock = new HashMap<>(); + } - break; + break; - case XMLStreamConstants.CHARACTERS: - Characters characters = event.asCharacters(); + case XMLStreamConstants.CHARACTERS: + Characters characters = event.asCharacters(); - // "word" node value - if (in_word) { - stavek.add(createWord(characters.getData(), lemma, msd, "", stats.getFilter())); - in_word = false; - } - break; + // "word" node value + if (in_word) { + stavek.add(createWord(characters.getData(), lemma, msd, "", stats.getFilter())); + in_word = false; + } + break; - case XMLStreamConstants.END_ELEMENT: - EndElement endElement = event.asEndElement(); - String qNameEnd = endElement.getName().getLocalPart(); + case XMLStreamConstants.END_ELEMENT: + EndElement endElement = event.asEndElement(); + String qNameEnd = endElement.getName().getLocalPart(); - if (qNameEnd.equals("head")) { - // validate and set boolean - if (validateHeadBlock(headBlock, stats.getFilter().getSolarFilters())) { - includeThisBlock = true; - } - } else if (qNameEnd.equals("body")) { - // new block, reset filter status - includeThisBlock = false; - } + if (qNameEnd.equals("head")) { + // validate and set boolean + if (validateHeadBlock(headBlock, stats.getFilter().getSolarFilters())) { + includeThisBlock = true; + } + } else if (qNameEnd.equals("body")) { + // new block, reset filter status + includeThisBlock = false; + } - // backup - if (endElement.getName().getLocalPart().equalsIgnoreCase("korpus")) { - fj(corpus, stats); - corpus.clear(); - } + // backup + if (endElement.getName().getLocalPart().equalsIgnoreCase("korpus")) { + fj(corpus, stats); + corpus.clear(); + } - break; - } - } - } catch (FileNotFoundException | XMLStreamException e) { - e.printStackTrace(); - } - } + break; + } + } + } catch (FileNotFoundException | XMLStreamException e) { + e.printStackTrace(); + } + return true; + } - /** - * @param readHeadBlock block of tags read from the corpus - * @param userSetFilter tags with values set by the user - * - * @return - */ - private static boolean validateHeadBlock(Map readHeadBlock, HashMap> userSetFilter) { - boolean pass = true; + /** + * @param readHeadBlock block of tags read from the corpus + * @param userSetFilter tags with values set by the user + * + * @return + */ + private static boolean validateHeadBlock(Map readHeadBlock, HashMap> userSetFilter) { + boolean pass = true; - if (userSetFilter == null) { - return true; - } + if (userSetFilter == null) { + return true; + } - for (Map.Entry> filterEntry : userSetFilter.entrySet()) { - String key = filterEntry.getKey(); - HashSet valueObject = filterEntry.getValue(); + for (Map.Entry> filterEntry : userSetFilter.entrySet()) { + String key = filterEntry.getKey(); + HashSet valueObject = filterEntry.getValue(); - // if (valueObject instanceof String) { - // pass = validateHeadBlockEntry(readHeadBlock, key, (String) valueObject); - // } else - if (valueObject != null) { - //noinspection unchecked - for (String value : valueObject) { - pass = validateHeadBlockEntry(readHeadBlock, key, value); - } - } + // if (valueObject instanceof String) { + // pass = validateHeadBlockEntry(readHeadBlock, key, (String) 
valueObject); + // } else + if (valueObject != null) { + //noinspection unchecked + for (String value : valueObject) { + pass = validateHeadBlockEntry(readHeadBlock, key, value); + } + } - if (!pass) { - // current head block does not include one of the set filters - not likely, but an edge case anyway - return false; - } - } + if (!pass) { + // current head block does not include one of the set filters - not likely, but an edge case anyway + return false; + } + } - // if it gets to this point, it passed all the filters - return true; - } + // if it gets to this point, it passed all the filters + return true; + } - private static boolean validateHeadBlockEntry(Map readHeadBlock, String userSetKey, String userSetValue) { - if (!readHeadBlock.keySet().contains(userSetKey)) { - // current head block does not include one of the set filters - not likely, but an edge case anyway - return false; - } else if (!readHeadBlock.get(userSetKey).equals(userSetValue)) { - // different values -> doesn't pass the filter - return false; - } + private static boolean validateHeadBlockEntry(Map readHeadBlock, String userSetKey, String userSetValue) { + if (!readHeadBlock.keySet().contains(userSetKey)) { + // current head block does not include one of the set filters - not likely, but an edge case anyway + return false; + } else if (!readHeadBlock.get(userSetKey).equals(userSetValue)) { + // different values -> doesn't pass the filter + return false; + } - return true; - } + return true; + } - /** - * Parses XML headers for information about its taxonomy (if supported) or filters (solar) - * - * @param filepath - * @param corpusIsSplit is corpus split into multiple xml files, or are all entries grouped into one large xml file - * @param corpusType - */ - public static HashSet readVertHeaderTaxonomyAndFilters(String filepath, boolean corpusIsSplit, CorpusType corpusType) { -// boolean parseTaxonomy = Tax.getCorpusTypesWithTaxonomy().contains(corpusType); - // solar - Set headTags = null; - HashMap> resultFilters = new HashMap<>(); - // taxonomy corpora - HashSet resultTaxonomy = new HashSet<>(); + /** + * Parses XML headers for information about its taxonomy (if supported) or filters (solar) + * + * @param filepath + * @param corpusIsSplit is corpus split into multiple xml files, or are all entries grouped into one large xml file + * @param corpusType + */ + public static HashSet readVertHeaderTaxonomyAndFilters(String filepath, boolean corpusIsSplit, CorpusType corpusType) { + // taxonomy corpora + HashSet resultTaxonomy = new HashSet<>(); - LineIterator it = null; - try { - it = FileUtils.lineIterator(new File(filepath), "UTF-8"); - try { - boolean insideHeader = false; + LineIterator it = null; + try { + it = FileUtils.lineIterator(new File(filepath), "UTF-8"); + try { + boolean insideHeader = false; - while (it.hasNext()) { - String line = it.nextLine(); + while (it.hasNext()) { + String line = it.nextLine(); - if (line.length() > 4 && line.substring(1, 5).equals("text")) { - // split over "\" " - String[] split = line.split("\" "); + if (line.length() > 4 && line.substring(1, 5).equals("text")) { + // split over "\" " + String[] split = line.split("\" "); // String mediumId = ""; // String typeId = ""; // String proofreadId = ""; - for (String el : split) { - String[] attribute = el.split("=\""); - if (attribute[0].equals("medium_id")) { + boolean idsPresent = false; + for (String el : split) { + String[] attribute = el.split("=\""); + if (attribute[0].equals("medium_id")) { // mediumId = attribute[1]; - 
resultTaxonomy.add(attribute[1]); - } else if (attribute[0].equals("type_id")) { + idsPresent = true; + resultTaxonomy.add(attribute[1]); + } else if (attribute[0].equals("type_id")) { // typeId = attribute[1]; - resultTaxonomy.add(attribute[1]); - } else if (attribute[0].equals("proofread_id")) { + idsPresent = true; + resultTaxonomy.add(attribute[1]); + } else if (attribute[0].equals("proofread_id")) { // proofreadId = attribute[1]; - resultTaxonomy.add(attribute[1]); - } - } - } - } - } finally { - LineIterator.closeQuietly(it); - } - } catch (IOException e) { - e.printStackTrace(); - } - resultTaxonomy.remove("-"); - return resultTaxonomy; - } + idsPresent = true; + resultTaxonomy.add(attribute[1]); + } + } + if (!idsPresent){ + for (String el : split) { + String[] attribute = el.split("=\""); + if (attribute[0].equals("medium")) { +// mediumId = attribute[1]; + resultTaxonomy.add(attribute[1]); + } else if (attribute[0].equals("type")) { +// typeId = attribute[1]; + resultTaxonomy.add(attribute[1]); + } else if (attribute[0].equals("proofread")) { +// proofreadId = attribute[1]; + resultTaxonomy.add(attribute[1]); + } + } + } + } + } + } finally { + LineIterator.closeQuietly(it); + } + } catch (IOException e) { + e.printStackTrace(); + } + resultTaxonomy.remove("-"); + return resultTaxonomy; + } - /** - * Parses XML headers for information about its taxonomy (if supported) or filters (solar) - * - * @param filepath - * @param corpusIsSplit is corpus split into multiple xml files, or are all entries grouped into one large xml file - * @param corpusType - */ - public static Object readXmlHeaderTaxonomyAndFilters(String filepath, boolean corpusIsSplit, CorpusType corpusType) { - boolean parseTaxonomy = Tax.getCorpusTypesWithTaxonomy().contains(corpusType); - // solar - Set headTags = null; - HashMap> resultFilters = new HashMap<>(); - // taxonomy corpora - HashSet resultTaxonomy = new HashSet<>(); + /** + * Parses XML headers for information about its taxonomy (if supported) or filters (solar) + * + * @param filepath + * @param corpusIsSplit is corpus split into multiple xml files, or are all entries grouped into one large xml file + * @param corpusType + */ + public static Object readXmlHeaderTaxonomyAndFilters(String filepath, boolean corpusIsSplit, CorpusType corpusType) { + boolean parseTaxonomy = Tax.getCorpusTypesWithTaxonomy().contains(corpusType); + // solar + Set headTags = null; + HashMap> resultFilters = new HashMap<>(); + // taxonomy corpora + HashSet resultTaxonomy = new HashSet<>(); - String headTagName; + String headTagName; - if (corpusType == CorpusType.SOLAR) { - headTagName = "head"; - // used for filter - headTags = new HashSet<>(Arrays.asList(SOLA, PREDMET, RAZRED, REGIJA, TIP, LETO)); + if (corpusType == CorpusType.SOLAR) { + headTagName = "head"; + // used for filter + headTags = new HashSet<>(Arrays.asList(SOLA, PREDMET, RAZRED, REGIJA, TIP, LETO)); - // init results now to avoid null pointers - headTags.forEach(f -> resultFilters.put(f, new HashSet<>())); - } else if (corpusType == CorpusType.SSJ500K) { + // init results now to avoid null pointers + headTags.forEach(f -> resultFilters.put(f, new HashSet<>())); + } else if (corpusType == CorpusType.SSJ500K) { headTagName = "bibl"; } else { - headTagName = "teiHeader"; - } + headTagName = "teiHeader"; + } - XMLInputFactory factory = XMLInputFactory.newInstance(); - XMLEventReader xmlEventReader = null; - try { - xmlEventReader = factory.createXMLEventReader(new FileInputStream(filepath)); - boolean insideHeader = 
false; + XMLInputFactory factory = XMLInputFactory.newInstance(); + XMLEventReader xmlEventReader = null; + try { + xmlEventReader = factory.createXMLEventReader(new FileInputStream(filepath)); + boolean insideHeader = false; - while (xmlEventReader.hasNext()) { - XMLEvent xmlEvent = xmlEventReader.nextEvent(); + while (xmlEventReader.hasNext()) { + XMLEvent xmlEvent = xmlEventReader.nextEvent(); - if (xmlEvent.isStartElement()) { - StartElement startElement = xmlEvent.asStartElement(); - String elementName = startElement.getName().getLocalPart(); + if (xmlEvent.isStartElement()) { + StartElement startElement = xmlEvent.asStartElement(); + String elementName = startElement.getName().getLocalPart(); - if (elementName.equalsIgnoreCase(headTagName)) { - // if the corpus is split into files, we skip bodies - // this toggle is true when we're inside a header (next block of code executes) - // and false when we're not (skip reading unnecessary attributes) - insideHeader = true; - } + if (elementName.equalsIgnoreCase(headTagName)) { + // if the corpus is split into files, we skip bodies + // this toggle is true when we're inside a header (next block of code executes) + // and false when we're not (skip reading unnecessary attributes) + insideHeader = true; + } - if (insideHeader) { - if (parseTaxonomy && elementName.equalsIgnoreCase("catRef")) { - HashMap atts = extractAttributes(startElement); - String debug = ""; + if (insideHeader) { + if (parseTaxonomy && elementName.equalsIgnoreCase("catRef")) { + HashMap atts = extractAttributes(startElement); + String debug = ""; - String tax = startElement.getAttributeByName(QName.valueOf("target")) - .getValue() - .replace("#", ""); + String tax = startElement.getAttributeByName(QName.valueOf("target")) + .getValue() + .replace("#", ""); - resultTaxonomy.add(tax); - } else if (parseTaxonomy && elementName.equalsIgnoreCase("term")) { + if (tax.indexOf(':') >= 0) { + tax = tax.split(":")[1]; + } + resultTaxonomy.add(tax); + } else if (parseTaxonomy && elementName.equalsIgnoreCase("term")) { String tax = startElement.getAttributeByName(QName.valueOf("ref")) .getValue() .replace("#", ""); resultTaxonomy.add(tax); } else if (!parseTaxonomy && headTags.contains(elementName)) { - String tagContent = xmlEventReader.nextEvent().asCharacters().getData(); - resultFilters.get(elementName).add(tagContent); - } - } - } else if (xmlEvent.isEndElement() && corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) { - // if the corpus is split into multiple files, each with only one header block per file - // that means we should stop after we reach the end of the header - return parseTaxonomy ? resultTaxonomy : resultFilters; - } else if (xmlEvent.isEndElement() && !corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) { - // whole corpus in one file, so we have to continue reading in order to find all header blocks - insideHeader = false; - } - } - } catch (XMLStreamException e) { - logger.error("Streaming error", e); - return parseTaxonomy ? resultTaxonomy : resultFilters; - } catch (FileNotFoundException e) { - logger.error("File not found", e); - return parseTaxonomy ? resultTaxonomy : resultFilters; - // TODO: keep a list of files that threw this error and a dirty boolean marker -> if true, alert user - } finally { - if (xmlEventReader != null) { - try { - xmlEventReader.close(); - } catch (XMLStreamException e) { - logger.error("closing stream", e); - } - } - } - return parseTaxonomy ? 
resultTaxonomy : resultFilters; - } - - private static boolean isEndElementEndOfHeader(XMLEvent event, String headerTag) { - return event.asEndElement() - .getName() - .getLocalPart() - .equalsIgnoreCase(headerTag); - } - - @SuppressWarnings("Duplicates") - public static boolean readXMLGigafida(String path, StatisticsNew stats) { - boolean inWord = false; - boolean inPunctuation = false; - boolean taxonomyMatch = true; - ArrayList currentFiletaxonomy = new ArrayList<>(); -// ArrayList currentFiletaxonomyLong = new ArrayList<>(); - String lemma = ""; - String msd = ""; - - List sentence = new ArrayList<>(); - List corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it - String sentenceDelimiter = "s"; - - XMLEventReader eventReader = null; - try { - XMLInputFactory factory = XMLInputFactory.newInstance(); - eventReader = factory.createXMLEventReader(new FileInputStream(path)); - - while (eventReader.hasNext()) { - XMLEvent event = eventReader.nextEvent(); - - switch (event.getEventType()) { - case XMLStreamConstants.START_ELEMENT: - StartElement startElement = event.asStartElement(); - String qName = startElement.getName().getLocalPart(); - - // "word" node - if (qName.equals("w")) { - inWord = true; - - msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue()); - lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue()); - } - - if (qName.equals("c")){ - inPunctuation = true; - } - - // taxonomy node - else if (qName.equalsIgnoreCase("catRef")) { - // there are some term nodes at the beginning that are of no interest to us - // they differ by not having the attribute "ref", so test will equal null - Attribute tax = startElement.getAttributeByName(QName.valueOf("target")); - - if (tax != null) { - // keep only taxonomy properties - Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()).replace("#", "")); - currentFiletaxonomy.add(currentFiletaxonomyElement); - Tax taxonomy = new Tax(); -// currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement)); - } - } - break; - - case XMLStreamConstants.CHARACTERS: - Characters characters = event.asCharacters(); - - // "word" node value - if (inWord) { - String word = characters.getData(); - sentence.add(createWord(word, lemma, msd, word, stats.getFilter())); - inWord = false; - } - if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) { - String punctuation = characters.getData(); - sentence.add(createWord(punctuation, punctuation, "/", punctuation, stats.getFilter())); - inPunctuation = false; - -// String punctuation = ","; -// -// sentence.get(sentence.size() - 1).setWord(sentence.get(sentence.size() - 1).getWord() + punctuation); -// sentence.get(sentence.size() - 1).setLemma(sentence.get(sentence.size() - 1).getLemma() + punctuation); -// sentence.get(sentence.size() - 1).setMsd(sentence.get(sentence.size() - 1).getMsd() + punctuation); -// inPunctuation = false; + String tagContent = xmlEventReader.nextEvent().asCharacters().getData(); + resultFilters.get(elementName).add(tagContent); } - break; + } + } else if (xmlEvent.isEndElement() && corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) { + // if the corpus is split into multiple files, each with only one header block per file + // that means we should stop after we reach the end of the header + return parseTaxonomy ? 
resultTaxonomy : resultFilters; + } else if (xmlEvent.isEndElement() && !corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) { + // whole corpus in one file, so we have to continue reading in order to find all header blocks + insideHeader = false; + } + } + } catch (XMLStreamException e) { + logger.error("Streaming error", e); + return parseTaxonomy ? resultTaxonomy : resultFilters; + } catch (FileNotFoundException e) { + logger.error("File not found", e); + return parseTaxonomy ? resultTaxonomy : resultFilters; + // TODO: keep a list of files that threw this error and a dirty boolean marker -> if true, alert user + } finally { + if (xmlEventReader != null) { + try { + xmlEventReader.close(); + } catch (XMLStreamException e) { + logger.error("closing stream", e); + } + } + } + return parseTaxonomy ? resultTaxonomy : resultFilters; + } -// if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) { -// String actualPunctuation = characters.getData(); -// if (actualPunctuation.equals(".") || actualPunctuation.equals("!") || actualPunctuation.equals("?") || actualPunctuation.equals("...")) -// break; -// String punctuation = ","; -// int skip_number = 0; -// if (!ValidationUtil.isEmpty(stats.getFilter().getSkipValue())){ -// skip_number = stats.getFilter().getSkipValue(); -// } -// for(int i = 1; i < skip_number + 2; i ++){ -// if (i < sentence.size() && !sentence.get(sentence.size() - i).equals(punctuation)) { -// sentence.get(sentence.size() - i).setWord(sentence.get(sentence.size() - i).getWord() + punctuation); -// sentence.get(sentence.size() - i).setLemma(sentence.get(sentence.size() - i).getLemma() + punctuation); -// sentence.get(sentence.size() - i).setMsd(sentence.get(sentence.size() - i).getMsd() + punctuation); -// } -// } -// inPunctuation = false; -// } - - case XMLStreamConstants.END_ELEMENT: - EndElement endElement = event.asEndElement(); - - String var = endElement.getName().getLocalPart(); - String debug = ""; - - // parser reached end of the current sentence - if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) { - // count all UniGramOccurrences in sentence for statistics - if (stats.getFilter().getNgramValue() == 0){ - int numSentenceParts = 0; - for(Word w : sentence){ - int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1); - numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts; - } - stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy); - } else if(stats.getFilter().getNgramValue() >= 1) { - stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy); - } - // add sentence to corpus if it passes filters - sentence = runFilters(sentence, stats.getFilter()); - - - - if (!ValidationUtil.isEmpty(sentence) && taxonomyMatch) { - corpus.add(new Sentence(sentence, currentFiletaxonomy)); - } - -// taxonomyMatch = true; - // and start a new one - sentence = new ArrayList<>(); - - /* Invoke Fork-Join when we reach maximum limit of - * sentences (because we can't read everything to - * memory) or we reach the end of the file. 
- */ - if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) { - fj(corpus, stats); - // empty the current corpus, since we don't need the data anymore - corpus.clear(); - - // TODO: if (stats.isUseDB()) { - // stats.storeTmpResultsToDB(); - // } - } - } else if (endElement.getName().getLocalPart().equals("teiHeader")) { - // before proceeding to read this file, make sure that taxonomy filters are a match - - if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) { - currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection - - if (currentFiletaxonomy.isEmpty()) { - // taxonomies don't match so stop -// return false; - taxonomyMatch = false; -// System.out.println("TEST"); - } - } - } - - // fallback - else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) { - // join corpus and stats - fj(corpus, stats); - corpus.clear(); - - // TODO: if (stats.isUseDB()) { - // stats.storeTmpResultsToDB(); - // } - } - - break; - } - } - } catch (FileNotFoundException | XMLStreamException e) { - e.printStackTrace(); - } finally { - if (eventReader != null) { - try { - eventReader.close(); - } catch (XMLStreamException e) { - logger.error("closing stream", e); - } - } - } - - return true; - } + private static boolean isEndElementEndOfHeader(XMLEvent event, String headerTag) { + return event.asEndElement() + .getName() + .getLocalPart() + .equalsIgnoreCase(headerTag); + } @SuppressWarnings("Duplicates") - public static boolean readXMLSSJ500K(String path, StatisticsNew stats) { + public static boolean readXMLGigafida(String path, StatisticsNew stats) { boolean inWord = false; boolean inPunctuation = false; boolean taxonomyMatch = true; ArrayList currentFiletaxonomy = new ArrayList<>(); -// ArrayList currentFiletaxonomyLong = new ArrayList<>(); +// ArrayList currentFiletaxonomyLong = new ArrayList<>(); String lemma = ""; String msd = ""; @@ -815,7 +703,225 @@ public class XML_processing { // "word" node if (qName.equals("w")) { inWord = true; - if (!String.valueOf(startElement.getAttributeByName(QName.valueOf("ana")).getValue()).substring(0, 4).equals("msd:")){ + + msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue()); + lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue()); + } + + if (qName.equals("c")){ + inPunctuation = true; + } + + // taxonomy node + else if (qName.equalsIgnoreCase("catRef")) { + // there are some term nodes at the beginning that are of no interest to us + // they differ by not having the attribute "ref", so test will equal null + Attribute tax = startElement.getAttributeByName(QName.valueOf("target")); + + if (tax != null) { + // keep only taxonomy properties + Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()).replace("#", ""), stats.getCorpus()); + currentFiletaxonomy.add(currentFiletaxonomyElement); + Tax taxonomy = new Tax(); +// currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement)); + } + } + break; + + case XMLStreamConstants.CHARACTERS: + Characters characters = event.asCharacters(); + + // "word" node value + if (inWord) { + String word = characters.getData(); + sentence.add(createWord(word, lemma, msd, word, stats.getFilter())); + inWord = false; + } + if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) { + String punctuation = characters.getData(); + sentence.add(createWord(punctuation, punctuation, 
"/", punctuation, stats.getFilter())); + inPunctuation = false; + +// String punctuation = ","; +// +// sentence.get(sentence.size() - 1).setWord(sentence.get(sentence.size() - 1).getWord() + punctuation); +// sentence.get(sentence.size() - 1).setLemma(sentence.get(sentence.size() - 1).getLemma() + punctuation); +// sentence.get(sentence.size() - 1).setMsd(sentence.get(sentence.size() - 1).getMsd() + punctuation); +// inPunctuation = false; + } + break; + +// if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) { +// String actualPunctuation = characters.getData(); +// if (actualPunctuation.equals(".") || actualPunctuation.equals("!") || actualPunctuation.equals("?") || actualPunctuation.equals("...")) +// break; +// String punctuation = ","; +// int skip_number = 0; +// if (!ValidationUtil.isEmpty(stats.getFilter().getSkipValue())){ +// skip_number = stats.getFilter().getSkipValue(); +// } +// for(int i = 1; i < skip_number + 2; i ++){ +// if (i < sentence.size() && !sentence.get(sentence.size() - i).equals(punctuation)) { +// sentence.get(sentence.size() - i).setWord(sentence.get(sentence.size() - i).getWord() + punctuation); +// sentence.get(sentence.size() - i).setLemma(sentence.get(sentence.size() - i).getLemma() + punctuation); +// sentence.get(sentence.size() - i).setMsd(sentence.get(sentence.size() - i).getMsd() + punctuation); +// } +// } +// inPunctuation = false; +// } + + case XMLStreamConstants.END_ELEMENT: + EndElement endElement = event.asEndElement(); + + String var = endElement.getName().getLocalPart(); + String debug = ""; + + // parser reached end of the current sentence + if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) { + // count all UniGramOccurrences in sentence for statistics + if (stats.getFilter().getNgramValue() == 0){ + int numSentenceParts = 0; + for(Word w : sentence){ + int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1); + numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts; + } + stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy); + } else if(stats.getFilter().getNgramValue() >= 1) { + stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy); + } + // add sentence to corpus if it passes filters + sentence = runFilters(sentence, stats.getFilter()); + + + + if (!ValidationUtil.isEmpty(sentence) && taxonomyMatch) { + corpus.add(new Sentence(sentence, currentFiletaxonomy)); + } + +// taxonomyMatch = true; + // and start a new one + sentence = new ArrayList<>(); + + /* Invoke Fork-Join when we reach maximum limit of + * sentences (because we can't read everything to + * memory) or we reach the end of the file. 
+ */ + if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) { + fj(corpus, stats); + // empty the current corpus, since we don't need the data anymore + corpus.clear(); + + // TODO: if (stats.isUseDB()) { + // stats.storeTmpResultsToDB(); + // } + } + } else if (endElement.getName().getLocalPart().equals("teiHeader")) { + // before proceeding to read this file, make sure that taxonomy filters are a match + + if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) { + currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection + + if (currentFiletaxonomy.isEmpty()) { + // taxonomies don't match so stop +// return false; + taxonomyMatch = false; +// System.out.println("TEST"); + } + } + } + + // fallback + else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) { + // join corpus and stats + fj(corpus, stats); + corpus.clear(); + + // TODO: if (stats.isUseDB()) { + // stats.storeTmpResultsToDB(); + // } + } + + break; + } + } + } catch (FileNotFoundException | XMLStreamException e) { + e.printStackTrace(); + } finally { + if (eventReader != null) { + try { + eventReader.close(); + } catch (XMLStreamException e) { + logger.error("closing stream", e); + } + } + } + + return true; + } + + @SuppressWarnings("Duplicates") + public static boolean readXMLSSJ500K(String path, StatisticsNew stats) { + boolean inWord = false; + boolean inPunctuation = false; + boolean taxonomyMatch = true; + ArrayList currentFiletaxonomy = new ArrayList<>(); +// ArrayList currentFiletaxonomyLong = new ArrayList<>(); + String lemma = ""; + String msd = ""; + + List sentence = new ArrayList<>(); + List corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it + String sentenceDelimiter = "s"; + + int numLines = 0; + int lineNum = 0; + progress.set(0.0); + if(!isCollocability) { + startTime = new Date(); + } + // get number of lines + try { + XMLInputFactory factory = XMLInputFactory.newInstance(); + XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path)); + + while (eventReader.hasNext()) + { + eventReader.next(); + numLines ++; + // Loop just in case the file is > Long.MAX_VALUE or skip() decides to not read the entire file + } + } catch (IOException e) { + e.printStackTrace(); + } catch (XMLStreamException e) { + e.printStackTrace(); + } + + XMLEventReader eventReader = null; + try { + XMLInputFactory factory = XMLInputFactory.newInstance(); + eventReader = factory.createXMLEventReader(new FileInputStream(path)); + + while (eventReader.hasNext()) { + int percentage = (int) (lineNum * 100.0 / numLines); + if(progress.get() < percentage) { + progress.set(percentage); + } + if(isCancelled) { + return false; + } + lineNum ++; + XMLEvent event = eventReader.nextEvent(); + + switch (event.getEventType()) { + case XMLStreamConstants.START_ELEMENT: + StartElement startElement = event.asStartElement(); + String qName = startElement.getName().getLocalPart(); + + // "word" node + if (qName.equals("w")) { + inWord = true; + if (!(String.valueOf(startElement.getAttributeByName(QName.valueOf("ana")).getValue()).substring(0, 4).equals("msd:") || + String.valueOf(startElement.getAttributeByName(QName.valueOf("ana")).getValue()).substring(0, 4).equals("mte:"))){ System.out.println("MSD written incorrectly"); } msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("ana")).getValue()).substring(4); @@ -834,28 +940,78 @@ public class XML_processing { if (tax != null) 
{ // keep only taxonomy properties - Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()).replace("#", "")); + Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()).replace("#", ""), stats.getCorpus()); currentFiletaxonomy.add(currentFiletaxonomyElement); // Tax taxonomy = new Tax(); // currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement)); } - } else if (qName.equals("bibl")) { - // before proceeding to read this file, make sure that taxonomy filters are a match - taxonomyMatch = true; + } else if (qName.equalsIgnoreCase("catRef")) { + // get value from attribute target + Attribute tax = startElement.getAttributeByName(QName.valueOf("target")); - } + if (tax != null && !tax.getValue().equals("dedup:nodup")) { + // keep only taxonomy properties + Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()).split(":")[1], stats.getCorpus()); + currentFiletaxonomy.add(currentFiletaxonomyElement); +// Tax taxonomy = new Tax(); +// currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement)); + } + + + + + +// if (parseTaxonomy && elementName.equalsIgnoreCase("catRef")) { +// HashMap atts = extractAttributes(startElement); +// String debug = ""; +// +// String tax = startElement.getAttributeByName(QName.valueOf("target")) +// .getValue() +// .replace("#", ""); +// +// if (tax.indexOf(':') >= 0) { +// tax = tax.split(":")[1]; +// } +// resultTaxonomy.add(tax); +// } else if (parseTaxonomy && elementName.equalsIgnoreCase("term")) { +// String tax = startElement.getAttributeByName(QName.valueOf("ref")) +// .getValue() +// .replace("#", ""); +// +// resultTaxonomy.add(tax); +// } else if (!parseTaxonomy && headTags.contains(elementName)) { +// String tagContent = xmlEventReader.nextEvent().asCharacters().getData(); +// resultFilters.get(elementName).add(tagContent); +// } + + + + + + } else if (qName.equals("bibl")) { + // before proceeding to read this file, make sure that taxonomy filters are a match + taxonomyMatch = true; + + } else if (qName.equals("text")){ + taxonomyMatch = true; + } break; case XMLStreamConstants.CHARACTERS: Characters characters = event.asCharacters(); + // "word" node value if (inWord) { String word = characters.getData(); +// if (word.equals("Banovec")){ +// System.out.println("Test"); +// } sentence.add(createWord(word, lemma, msd, word, stats.getFilter())); inWord = false; } - if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) { + if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation) { +// if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) { String punctuation = characters.getData(); sentence.add(createWord(punctuation, punctuation, "/", punctuation, stats.getFilter())); inPunctuation = false; @@ -870,16 +1026,16 @@ public class XML_processing { // parser reached end of the current sentence if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) { - if (stats.getFilter().getNgramValue() == 0){ - int numSentenceParts = 0; - for(Word w : sentence){ - int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1); - numSentenceParts = (v >= 0) ? 
(numSentenceParts + v) : numSentenceParts; - } - stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy); - } else if(stats.getFilter().getNgramValue() >= 1) { - stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy); - } + if (stats.getFilter().getNgramValue() == 0){ + int numSentenceParts = 0; + for(Word w : sentence){ + int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1); + numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts; + } + stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy); + } else if(stats.getFilter().getNgramValue() >= 1) { + stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy); + } // add sentence to corpus if it passes filters sentence = runFilters(sentence, stats.getFilter()); @@ -906,7 +1062,8 @@ public class XML_processing { } } // fallback - else if (endElement.getName().getLocalPart().equalsIgnoreCase("div")) { + else if (endElement.getName().getLocalPart().equalsIgnoreCase("div") && + stats.getCorpus().getCorpusType() == CorpusType.SSJ500K) { // join corpus and stats fj(corpus, stats); corpus.clear(); @@ -914,23 +1071,34 @@ public class XML_processing { currentFiletaxonomy = new ArrayList<>(); // currentFiletaxonomyLong = new ArrayList<>(); } else if (endElement.getName().getLocalPart().equals("bibl")) { - // before proceeding to read this file, make sure that taxonomy filters are a match + // before proceeding to read this file, make sure that taxonomy filters are a match - if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) { - currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection + if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) { + currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection - if (currentFiletaxonomy.isEmpty()) { - // taxonomies don't match so stop + if (currentFiletaxonomy.isEmpty()) { + // taxonomies don't match so stop // return false; - taxonomyMatch = false; + taxonomyMatch = false; // System.out.println("TEST"); - } - } - } + } + } + } else if (endElement.getName().getLocalPart().equals("text")){ + taxonomyMatch = false; + } break; } } + if (corpus.size() > 0) { + fj(corpus, stats); + // empty the current corpus, since we don't need the data anymore + corpus.clear(); + + // TODO: if (stats.isUseDB()) { + // stats.storeTmpResultsToDB(); + // } + } } catch (FileNotFoundException | XMLStreamException e) { e.printStackTrace(); } finally { @@ -946,150 +1114,183 @@ public class XML_processing { return true; } - @SuppressWarnings("Duplicates") - public static boolean readXMLGos(String path, StatisticsNew stats) { - boolean inWord = false; + @SuppressWarnings("Duplicates") + public static boolean readXMLGos(String path, StatisticsNew stats) { + boolean inWord = false; boolean inPunctuation = false; - boolean inOrthDiv = false; - boolean computeForOrth = stats.getCorpus().isGosOrthMode(); - boolean inSeparatedWord = false; - ArrayList currentFiletaxonomy = new ArrayList<>(); + boolean inOrthDiv = false; + boolean computeForOrth = stats.getCorpus().isGosOrthMode(); + boolean inSeparatedWord = false; + ArrayList currentFiletaxonomy = new ArrayList<>(); // ArrayList currentFiletaxonomyLong = new ArrayList<>(); - String lemma = ""; - String msd = ""; + String lemma = ""; + String msd = ""; - List sentence = new ArrayList<>(); - List corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it - Map> GOSCorpusHM = new 
ConcurrentHashMap<>(); - String GOSCorpusHMKey = ""; - String sentenceDelimiter = "seg"; - int wordIndex = 0; + List sentence = new ArrayList<>(); + List corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it + Map> GOSCorpusHM = new ConcurrentHashMap<>(); + String GOSCorpusHMKey = ""; + String sentenceDelimiter = "seg"; + int wordIndex = 0; - String gosType = stats.getFilter().hasMsd() ? "norm" : "orth"; // orth & norm + String gosType = stats.getFilter().hasMsd() ? "norm" : "orth"; // orth & norm - XMLEventReader eventReader = null; - boolean includeFile = true; - try { - XMLInputFactory factory = XMLInputFactory.newInstance(); - eventReader = factory.createXMLEventReader(new FileInputStream(path)); + int numLines = 0; + int lineNum = 0; + progress.set(0.0); + if(!isCollocability) { + startTime = new Date(); + } + // get number of lines + try { + XMLInputFactory factory = XMLInputFactory.newInstance(); + XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path)); - // created hashmap to combine words with normalized words + while (eventReader.hasNext()) + { + eventReader.next(); + numLines ++; + // Loop just in case the file is > Long.MAX_VALUE or skip() decides to not read the entire file + } + } catch (IOException e) { + e.printStackTrace(); + } catch (XMLStreamException e) { + e.printStackTrace(); + } - while (eventReader.hasNext()) { - XMLEvent event = eventReader.nextEvent(); - // System.out.print(String.format("%s", event.toString().replaceAll("\\['http://www.tei-c.org/ns/1.0'\\]::", ""))); - switch (event.getEventType()) { - case XMLStreamConstants.START_ELEMENT: - StartElement startElement = event.asStartElement(); - String qName = startElement.getName().getLocalPart(); - if (qName.equals("div")) { - HashMap atts = extractAttributes(startElement); + XMLEventReader eventReader = null; + boolean includeFile = true; + try { + XMLInputFactory factory = XMLInputFactory.newInstance(); + eventReader = factory.createXMLEventReader(new FileInputStream(path)); - if (atts.keySet().contains("type")) { - inOrthDiv = atts.get("type").equals("orth"); - } - } + // created hashmap to combine words with normalized words - // "word" node - if (qName.equals("w")) { - // check that it's not a type - HashMap atts = extractAttributes(startElement); + while (eventReader.hasNext()) { + int percentage = (int) (lineNum * 100.0 / numLines); + if(progress.get() < percentage) { + progress.set(percentage); + } + if(isCancelled) { + return false; + } + lineNum ++; + XMLEvent event = eventReader.nextEvent(); + // System.out.print(String.format("%s", event.toString().replaceAll("\\['http://www.tei-c.org/ns/1.0'\\]::", ""))); - if (!atts.containsKey("type")) { - inWord = true; + switch (event.getEventType()) { + case XMLStreamConstants.START_ELEMENT: + StartElement startElement = event.asStartElement(); + String qName = startElement.getName().getLocalPart(); - if (atts.containsKey("msd")) { - msd = atts.get("msd"); + if (qName.equals("div")) { + HashMap atts = extractAttributes(startElement); - } - if (atts.containsKey("lemma")) { - lemma = atts.get("lemma"); - } - // - // if (!inOrthDiv) { - // msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue()); - // lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue()); - // } - } else if (atts.containsKey("type") && atts.get("type").equals("separated")) { + if (atts.keySet().contains("type")) { + inOrthDiv = 
atts.get("type").equals("orth"); + } + } + + // "word" node + if (qName.equals("w")) { + // check that it's not a type + HashMap atts = extractAttributes(startElement); + + if (!atts.containsKey("type")) { + inWord = true; + + if (atts.containsKey("msd")) { + msd = atts.get("msd"); + + } + if (atts.containsKey("lemma")) { + lemma = atts.get("lemma"); + } + // + // if (!inOrthDiv) { + // msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue()); + // lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue()); + // } + } else if (atts.containsKey("type") && atts.get("type").equals("separated")) { inSeparatedWord = true; } - // } - } - // taxonomy node - else if (qName.equalsIgnoreCase("catRef")) { - // there are some term nodes at the beginning that are of no interest to us - // they differ by not having the attribute "ref", so test will equal null - Attribute tax = startElement.getAttributeByName(QName.valueOf("target")); + // } + } + // taxonomy node + else if (qName.equalsIgnoreCase("catRef")) { + // there are some term nodes at the beginning that are of no interest to us + // they differ by not having the attribute "ref", so test will equal null + Attribute tax = startElement.getAttributeByName(QName.valueOf("target")); - if (tax != null) { - // keep only taxonomy properties - Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue())); - currentFiletaxonomy.add(currentFiletaxonomyElement); + if (tax != null) { + // keep only taxonomy properties + Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()), stats.getCorpus()); + currentFiletaxonomy.add(currentFiletaxonomyElement); // Tax taxonomy = new Tax(); // currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement)); - } - } else if (qName.equalsIgnoreCase("div")) { - gosType = String.valueOf(startElement.getAttributeByName(QName.valueOf("type")).getValue()); - } else if (qName.equalsIgnoreCase("seg")) { - HashMap atts = extractAttributes(startElement); + } + } else if (qName.equalsIgnoreCase("div")) { + gosType = String.valueOf(startElement.getAttributeByName(QName.valueOf("type")).getValue()); + } else if (qName.equalsIgnoreCase("seg")) { + HashMap atts = extractAttributes(startElement); - if (atts.keySet().contains("id")) { - if (inOrthDiv) { + if (atts.keySet().contains("id")) { + if (inOrthDiv) { GOSCorpusHMKey = atts.get("id") + ".norm"; } else { GOSCorpusHMKey = atts.get("id"); } - } else { - System.out.println("No attribute \"id\""); - } - } - break; + } else { + System.out.println("No attribute \"id\""); + } + } + break; - case XMLStreamConstants.CHARACTERS: - // "word" node value - if (inWord) { + case XMLStreamConstants.CHARACTERS: + // "word" node value + if (inWord) { // if (GOSCorpusHMKey.equals("gos.028-0108.norm") && wordIndex > 8){ // System.out.println(wordIndex); // } - // if algorithm is in orthodox part add new word to sentence - if (inOrthDiv){ + // if algorithm is in orthodox part add new word to sentence + if (inOrthDiv){ // GOSCorpusHM.put(GOSCorpusHMKey, sentence); - String word = ""; - Characters characters = event.asCharacters(); - sentence.add(createWord(characters.getData(), "", "", "", stats.getFilter())); - // if algorithm is in normalized part find orthodox word and add other info to it - } else { - Characters characters = event.asCharacters(); + String word = ""; + Characters characters = event.asCharacters(); + sentence.add(createWord(characters.getData(), "", 
"", "", stats.getFilter())); + // if algorithm is in normalized part find orthodox word and add other info to it + } else { + Characters characters = event.asCharacters(); // System.out.println(wordIndex); // System.out.println(GOSCorpusHMKey + " " + lemma + " " + wordIndex); - if (wordIndex < GOSCorpusHM.get(GOSCorpusHMKey).size()) { - Word currentWord = GOSCorpusHM.get(GOSCorpusHMKey).get(wordIndex); - currentWord.setLemma(lemma, stats.getFilter().getWordParts()); - currentWord.setMsd(msd, stats.getFilter().getWordParts()); - currentWord.setNormalizedWord(characters.getData(), stats.getFilter().getWordParts()); + if (wordIndex < GOSCorpusHM.get(GOSCorpusHMKey).size()) { + Word currentWord = GOSCorpusHM.get(GOSCorpusHMKey).get(wordIndex); + currentWord.setLemma(lemma, stats.getFilter().getWordParts()); + currentWord.setMsd(msd, stats.getFilter().getWordParts()); + currentWord.setNormalizedWord(characters.getData(), stats.getFilter().getWordParts()); - wordIndex += 1; + wordIndex += 1; // when a word is separated from one to many we have to create these duplicates if (inSeparatedWord){ GOSCorpusHM.get(GOSCorpusHMKey).add(wordIndex, createWord(currentWord.getWord(stats.getFilter().getWordParts()), "", "", "", stats.getFilter())); } - } //else { + } //else { // System.out.println("Error"); // } - } + } - } - break; + } + break; - case XMLStreamConstants.END_ELEMENT: - EndElement endElement = event.asEndElement(); + case XMLStreamConstants.END_ELEMENT: + EndElement endElement = event.asEndElement(); if (endElement.getName().getLocalPart().equals("w")) { if (inWord){ @@ -1102,174 +1303,407 @@ public class XML_processing { } } - // parser reached end of the current sentence - if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) { - if (inOrthDiv){ - // add sentence to corpus - GOSCorpusHM.put(GOSCorpusHMKey, sentence); - } else { + // parser reached end of the current sentence + if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) { + if (inOrthDiv){ + // add sentence to corpus + GOSCorpusHM.put(GOSCorpusHMKey, sentence); + } else { sentence = GOSCorpusHM.remove(GOSCorpusHMKey); - if (stats.getFilter().getNgramValue() == 0){ - int numSentenceParts = 0; - for(Word w : sentence){ - int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1); - numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts; - } - stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy); - } else if(stats.getFilter().getNgramValue() >= 1) { - stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy); - } + if (stats.getFilter().getNgramValue() == 0){ + int numSentenceParts = 0; + for(Word w : sentence){ + int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1); + numSentenceParts = (v >= 0) ? 
(numSentenceParts + v) : numSentenceParts; + } + stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy); + } else if(stats.getFilter().getNgramValue() >= 1) { + stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy); + } - // add sentence to corpus if it passes filters - if (includeFile && !ValidationUtil.isEmpty(sentence)) { + // add sentence to corpus if it passes filters + if (includeFile && !ValidationUtil.isEmpty(sentence)) { // for(Word w : sentence) { // if (w.getW1().equals("")) { // System.out.println("HERE!!!"); // } // } - sentence = runFilters(sentence, stats.getFilter()); + sentence = runFilters(sentence, stats.getFilter()); // for(Word w : sentence) { // if (w.getW1().equals("")) { // System.out.println("HERE!!!"); // } // } - corpus.add(new Sentence(sentence, currentFiletaxonomy)); - } + corpus.add(new Sentence(sentence, currentFiletaxonomy)); + } - wordIndex = 0; + wordIndex = 0; - /* Invoke Fork-Join when we reach maximum limit of - * sentences (because we can't read everything to - * memory) or we reach the end of the file. - */ - if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) { - fj(corpus, stats); - // empty the current corpus, since we don't need - // the data anymore - corpus.clear(); - } - } + /* Invoke Fork-Join when we reach maximum limit of + * sentences (because we can't read everything to + * memory) or we reach the end of the file. + */ + if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) { + fj(corpus, stats); + // empty the current corpus, since we don't need + // the data anymore + corpus.clear(); + } + } // start a new sentence sentence = new ArrayList<>(); - } else if (endElement.getName().getLocalPart().equals("teiHeader")) { - // before proceeding to read this file, make sure that taxonomy filters are a match - if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) { - currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection + } else if (endElement.getName().getLocalPart().equals("teiHeader")) { + // before proceeding to read this file, make sure that taxonomy filters are a match + if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) { + currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection - // disregard this entry if taxonomies don't match - includeFile = !currentFiletaxonomy.isEmpty(); + // disregard this entry if taxonomies don't match + includeFile = !currentFiletaxonomy.isEmpty(); // currentFiletaxonomy = new ArrayList<>(); - } - } + } + } - // backup - else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) { - fj(corpus, stats); - corpus.clear(); + // backup + else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) { + fj(corpus, stats); + corpus.clear(); currentFiletaxonomy = new ArrayList<>(); // currentFiletaxonomyLong = new ArrayList<>(); - } + } - break; - } - } - } catch (FileNotFoundException | XMLStreamException e) { - e.printStackTrace(); - } finally { - if (eventReader != null) { - try { - eventReader.close(); - } catch (XMLStreamException e) { - logger.error("closing stream", e); - } catch (Exception e) { - logger.error("general error", e); - } - } - } + break; + } + } + } catch (FileNotFoundException | XMLStreamException e) { + e.printStackTrace(); + } finally { + if (eventReader != null) { + try { + eventReader.close(); + } catch (XMLStreamException e) { + logger.error("closing stream", e); + } catch (Exception e) { + logger.error("general error", e); + } 
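The unigram-occurrence bookkeeping that readXML, readXMLGos and (below) readVERT all repeat boils down to one rule: in letter mode (ngram value 0) a word of length L contributes max(0, L - (k - 1)) character k-grams, where k is the configured string length, while in word mode (ngram value >= 1) every word counts once. A minimal, self-contained sketch of the letter-mode rule — hypothetical names, not part of this patch:

    import java.util.Arrays;
    import java.util.List;

    class LetterNgramCountSketch {
        // Mirrors the accumulation above: v = length - (k - 1) substrings of
        // length k fit in a word; words shorter than k contribute nothing,
        // exactly like the (v >= 0) ? numSentenceParts + v : numSentenceParts step.
        static int countKgramPositions(List<String> words, int k) {
            int total = 0;
            for (String w : words) {
                int v = w.length() - (k - 1);
                if (v > 0) {
                    total += v;
                }
            }
            return total;
        }

        public static void main(String[] args) {
            // "pes" holds one 3-gram, "je" none, so the sentence total is 1.
            System.out.println(countKgramPositions(Arrays.asList("pes", "je"), 3));
        }
    }

That per-sentence total is what updateUniGramOccurrences receives, keyed by the current file's taxonomy entries.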
+ } + } - return true; - } + return true; + } - /** - * Runs the sentence through some filters, so we don't do calculations when unnecessary. - * Filters: - *
- * <ol>
- * <li>Ngrams: omit sentences that are shorter than the ngram value (e.g. 3 gram of a single word sentence)</li>
- * <li>Letter ngrams: omit words that are shorter than the specified string length (e.g. combinations of 3 letters when the word consists of only 2 letters)</li>
- * </ol>
- * - * @return Empty sentence (if fails 1.) or a sentence with some words removed (2.) - */ - private static List runFilters(List sentence, Filter filter) { - if (filter.getAl() == AnalysisLevel.STRING_LEVEL) { - // ngram level: if not 0 must be less than or equal to number of words in this sentence. - if (filter.getNgramValue() > 0 && filter.getNgramValue() > sentence.size()) { - return new ArrayList<>(); - } + @SuppressWarnings("Duplicates") + public static boolean readVERT(String path, StatisticsNew stats) { + // taxonomy corpora +// HashSet resultTaxonomy = new HashSet<>(); - // if we're calculating values for letters, omit words that are shorter than string length - if (filter.getNgramValue() == 0) { - sentence.removeIf(w -> (filter.getCalculateFor() == CalculateFor.WORD && w.getWord(filter.getWordParts()).length() < filter.getStringLength()) - || (filter.getCalculateFor() == CalculateFor.LEMMA && w.getLemma(filter.getWordParts()).length() < filter.getStringLength())); - } - } - return sentence; - } + // regi path + String regiPath = path.substring(0, path.length()-4) + "regi"; - private static HashMap extractAttributes(StartElement se) { - Iterator attributesIt = se.getAttributes(); - HashMap atts = new HashMap<>(); + LineIterator regiIt; + int wordIndex = -1; + int lemmaIndex = -1; + int msdIndex = -1; + boolean slovene = false; + try { + // read regi file + regiIt = FileUtils.lineIterator(new File(regiPath), "UTF-8"); + try { + boolean insideHeader = false; + int attributeIndex = 0; + while (regiIt.hasNext()) { + String line = regiIt.nextLine(); - while (attributesIt.hasNext()) { - Attribute a = (Attribute) attributesIt.next(); - atts.put(a.getName().getLocalPart(), a.getValue()); - } + if (line.length() >= 9 && line.substring(0, 9).equals("ATTRIBUTE")) { + // split over "\" " + String[] split = line.split(" "); + if (split[1].equals("word") && wordIndex == -1){ + wordIndex = attributeIndex; + } else if (split[1].equals("lempos") && lemmaIndex == -1){ + lemmaIndex = attributeIndex; + } else if (split[1].equals("tag") && msdIndex == -1){ + msdIndex = attributeIndex; + } + attributeIndex ++; + if (wordIndex >= 0 && lemmaIndex >= 0 && msdIndex >= 0){ + break; + } + } else if (line.length() >= 8 && line.substring(0, 8).equals("LANGUAGE")) { + String[] split = line.split(" "); + if (split[1].equals("\"Slovenian\"")){ + slovene = true; + } + } + } + } finally { + LineIterator.closeQuietly(regiIt); + } + } catch (IOException e) { + e.printStackTrace(); + } - return atts; - } + int numLines = 0; + // get number of lines + try (FileReader input = new FileReader(path); + LineNumberReader count = new LineNumberReader(input) + ) + { + while (count.skip(Long.MAX_VALUE) > 0) + { + // Loop just in case the file is > Long.MAX_VALUE or skip() decides to not read the entire file + } - public static Word createWord(String word, String lemma, String msd, String normalizedWord, Filter f){ - List wString = new ArrayList<>(); - if (f.getWordParts().contains(CalculateFor.WORD)) - wString.add(word); - if (f.getWordParts().contains(CalculateFor.LEMMA)) - wString.add(lemma); - if (f.getWordParts().contains(CalculateFor.MORPHOSYNTACTIC_SPECS)) - wString.add(msd); - if (f.getWordParts().contains(CalculateFor.NORMALIZED_WORD)) - wString.add(normalizedWord); + numLines = count.getLineNumber() + 1; // +1 because line index starts at 0 + } catch (IOException e) { + e.printStackTrace(); + } - // find appropriate strings and put them in word - Word w; + LineIterator it; - switch (f.getWordParts().size()) { - case 
1: - w = new Word1(wString.get(0)); - break; - case 2: - w = new Word2(wString.get(0), wString.get(1)); - break; - case 3: - w = new Word3(wString.get(0), wString.get(1), wString.get(2)); - break; - case 4: - w = new Word4(wString.get(0), wString.get(1), wString.get(2), wString.get(3)); - break; - default: - w = null; + ArrayList currentFiletaxonomy = new ArrayList<>(); + boolean inParagraph = false; + boolean inSentence = false; + boolean taxonomyMatch = true; + int lineNum = 0; + int numSentences = 0; + int numSentencesLimit = 1000; + List sentence = new ArrayList<>(); + List corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); - } - return w; - } + progress.set(0.0); + if(!isCollocability) { + startTime = new Date(); + } + try { + it = FileUtils.lineIterator(new File(path), "UTF-8"); + try { + boolean insideHeader = false; + + while (it.hasNext()) { + int percentage = (int) (lineNum * 100.0 / numLines); + if(progress.get() < percentage) { + progress.set(percentage); + } + if(isCancelled) { + return false; + } + lineNum ++; + String line = it.nextLine(); + // beginning tags + + // taxonomy + if (line.length() > 4 && line.substring(1, 5).equals("text")) { + String[] split = line.split("\" "); + currentFiletaxonomy = new ArrayList<>(); + + boolean medium = false; + boolean type = false; + boolean proofread = false; + for (String el : split) { + String[] attribute = el.split("=\""); + boolean idsPresent = false; + if (attribute[0].equals("medium_id") && !attribute[1].equals("-")) { + Taxonomy currentFiletaxonomyElement = Taxonomy.factory(attribute[1], stats.getCorpus()); + currentFiletaxonomy.add(currentFiletaxonomyElement); + medium = true; + } else if (attribute[0].equals("type_id") && !attribute[1].equals("-")) { + Taxonomy currentFiletaxonomyElement = Taxonomy.factory(attribute[1], stats.getCorpus()); + currentFiletaxonomy.add(currentFiletaxonomyElement); + type = true; + } else if (attribute[0].equals("proofread_id") && !attribute[1].equals("-")) { + Taxonomy currentFiletaxonomyElement = Taxonomy.factory(attribute[1], stats.getCorpus()); + currentFiletaxonomy.add(currentFiletaxonomyElement); + proofread = true; + } + if (attribute[0].equals("medium") && !attribute[1].equals("-") && !medium) { + Taxonomy currentFiletaxonomyElement = Taxonomy.factory(attribute[1], stats.getCorpus()); + currentFiletaxonomy.add(currentFiletaxonomyElement); + } else if (attribute[0].equals("type") && !attribute[1].equals("-") && !type) { + Taxonomy currentFiletaxonomyElement = Taxonomy.factory(attribute[1], stats.getCorpus()); + currentFiletaxonomy.add(currentFiletaxonomyElement); + } else if (attribute[0].equals("proofread") && !attribute[1].equals("-") && !attribute[1].equals("-\">") && !proofread) { + Taxonomy currentFiletaxonomyElement = Taxonomy.factory(attribute[1], stats.getCorpus()); + currentFiletaxonomy.add(currentFiletaxonomyElement); + } + + } + taxonomyMatch = true; + if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) { + currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection + + if (currentFiletaxonomy.isEmpty()) { + // taxonomies don't match so don't save + taxonomyMatch = false; + } + } + + } +// else if((line.length() >= 3 && line.substring(0, 2).equals("")) || +// (line.length() >= 3 && line.substring(0, 3).equals(""))){ +// inParagraph = true; +// } else if((line.length() == 4 && line.equals("
")) || (line.length() == 5 && line.equals(""))){ +// inParagraph = false; +// } + else if(line.length() >= 3 && line.substring(0, 2).equals("")){ + inSentence = true; + } else if(line.length() == 4 && line.equals("")){ + inSentence = false; + + if (stats.getFilter().getNgramValue() == 0){ + int numSentenceParts = 0; + for(Word w : sentence){ + int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1); + numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts; + } + stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy); + } else if(stats.getFilter().getNgramValue() >= 1) { + stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy); + } + + sentence = runFilters(sentence, stats.getFilter()); + + if (!ValidationUtil.isEmpty(sentence) && taxonomyMatch) { + corpus.add(new Sentence(sentence, currentFiletaxonomy)); + } + + if (numSentences == numSentencesLimit) { + fj(corpus, stats); + corpus.clear(); + numSentences = 0; + } else { + numSentences ++; + } + + // and start a new one + sentence = new ArrayList<>(); + +// corpus.add(new Sentence(sentence, currentFiletaxonomy)); + } else if(!(line.charAt(0) == '<' && line.charAt(line.length() - 1) == '>') && inSentence){ +// } else if(!(line.charAt(0) == '<' && line.charAt(line.length() - 1) == '>') && inSentence && inParagraph){ + String[] split = line.split("\t"); + if(slovene) { + if (split[lemmaIndex].length() > 2 && split[lemmaIndex].charAt(split[lemmaIndex].length() - 2) == '-' && Character.isAlphabetic(split[lemmaIndex].charAt(split[lemmaIndex].length() - 1)) && + !split[lemmaIndex].substring(split[lemmaIndex].length() - 2, split[lemmaIndex].length()).equals("-u")) { + Word word = createWord(split[wordIndex], split[lemmaIndex].substring(0, split[lemmaIndex].length() - 2), split[msdIndex], split[wordIndex], stats.getFilter()); + sentence.add(word); + } else if (stats.getFilter().getNotePunctuations() && (split[lemmaIndex].length() <= 2 || (split[lemmaIndex].charAt(split[lemmaIndex].length() - 2) != '-' && !Character.isAlphabetic(split[lemmaIndex].charAt(split[lemmaIndex].length() - 1))))) { + Word word = createWord(split[wordIndex], split[lemmaIndex], split[msdIndex], split[wordIndex], stats.getFilter()); + sentence.add(word); + } else if (split[lemmaIndex].length() > 2 && !split[lemmaIndex].substring(split[lemmaIndex].length() - 2, split[lemmaIndex].length()).equals("-u") || + stats.getFilter().getNotePunctuations()) { + Word word = createWord(split[wordIndex], split[lemmaIndex].substring(0, split[lemmaIndex].length() - 2), split[msdIndex], split[wordIndex], stats.getFilter()); + sentence.add(word); + } + } else { + if (split[lemmaIndex].length() > 2 && split[lemmaIndex].charAt(split[lemmaIndex].length() - 2) == '-' && Character.isAlphabetic(split[lemmaIndex].charAt(split[lemmaIndex].length() - 1)) && + !split[lemmaIndex].substring(split[lemmaIndex].length() - 2, split[lemmaIndex].length()).equals("-z")) { + Word word = createWord(split[wordIndex], split[lemmaIndex].substring(0, split[lemmaIndex].length() - 2), split[msdIndex], split[wordIndex], stats.getFilter()); + sentence.add(word); + } else if (stats.getFilter().getNotePunctuations() && (split[lemmaIndex].length() <= 2 || (split[lemmaIndex].charAt(split[lemmaIndex].length() - 2) != '-' && !Character.isAlphabetic(split[lemmaIndex].charAt(split[lemmaIndex].length() - 1))))) { + Word word = createWord(split[wordIndex], split[lemmaIndex], split[msdIndex], split[wordIndex], stats.getFilter()); + sentence.add(word); + } else if 
(split[lemmaIndex].length() > 2 && !split[lemmaIndex].substring(split[lemmaIndex].length() - 2, split[lemmaIndex].length()).equals("-z") ||
+ stats.getFilter().getNotePunctuations()) {
+ Word word = createWord(split[wordIndex], split[lemmaIndex].substring(0, split[lemmaIndex].length() - 2), split[msdIndex], split[wordIndex], stats.getFilter());
+ sentence.add(word);
+ }
+ }
+ }
+ }
+ if (corpus.size() > 0) {
+ fj(corpus, stats);
+ corpus.clear();
+ }
+ } finally {
+ LineIterator.closeQuietly(it);
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+// resultTaxonomy.remove("-");
+ return true;
+ }
+
+ /**
+ * Runs the sentence through a few filters, so we avoid unnecessary calculations.
+ * Filters:
+ *
+ * <ol>
+ * <li>Ngrams: omit sentences that are shorter than the ngram value (e.g. a 3-gram of a single-word sentence)</li>
+ * <li>Letter ngrams: omit words that are shorter than the specified string length (e.g. combinations of 3 letters when the word consists of only 2 letters)</li>
+ * </ol>
+ * + * @return Empty sentence (if fails 1.) or a sentence with some words removed (2.) + */ + private static List runFilters(List sentence, Filter filter) { + if (filter.getAl() == AnalysisLevel.STRING_LEVEL) { + // ngram level: if not 0 must be less than or equal to number of words in this sentence. + if (filter.getNgramValue() > 0 && filter.getNgramValue() > sentence.size()) { + return new ArrayList<>(); + } + + // if we're calculating values for letters, omit words that are shorter than string length + if (filter.getNgramValue() == 0) { + sentence.removeIf(w -> (filter.getCalculateFor() == CalculateFor.WORD && w.getWord(filter.getWordParts()).length() < filter.getStringLength()) + || (filter.getCalculateFor() == CalculateFor.LEMMA && w.getLemma(filter.getWordParts()).length() < filter.getStringLength())); + } + } + + return sentence; + } + + private static HashMap extractAttributes(StartElement se) { + Iterator attributesIt = se.getAttributes(); + HashMap atts = new HashMap<>(); + + while (attributesIt.hasNext()) { + Attribute a = (Attribute) attributesIt.next(); + atts.put(a.getName().getLocalPart(), a.getValue()); + } + + return atts; + } + + public static Word createWord(String word, String lemma, String msd, String normalizedWord, Filter f){ + List wString = new ArrayList<>(); + if (f.getWordParts().contains(CalculateFor.WORD)) + wString.add(word); + if (f.getWordParts().contains(CalculateFor.LEMMA)) + wString.add(lemma); + if (f.getWordParts().contains(CalculateFor.MORPHOSYNTACTIC_SPECS)) + wString.add(msd); + if (f.getWordParts().contains(CalculateFor.NORMALIZED_WORD)) + wString.add(normalizedWord); + + // find appropriate strings and put them in word + Word w; + + switch (f.getWordParts().size()) { + case 1: + w = new Word1(wString.get(0)); + break; + case 2: + w = new Word2(wString.get(0), wString.get(1)); + break; + case 3: + w = new Word3(wString.get(0), wString.get(1), wString.get(2)); + break; + case 4: + w = new Word4(wString.get(0), wString.get(1), wString.get(2), wString.get(3)); + break; + default: + w = null; + + } + return w; + } } diff --git a/src/main/java/alg/inflectedJOS/InflectedJOSCount.java b/src/main/java/alg/inflectedJOS/InflectedJOSCount.java index b4f02bf..3f8c480 100755 --- a/src/main/java/alg/inflectedJOS/InflectedJOSCount.java +++ b/src/main/java/alg/inflectedJOS/InflectedJOSCount.java @@ -74,7 +74,7 @@ // // public static void calculateForAll(List corpus, Statistics stats, String taxonomy) { // // for (Sentence s : corpus) { // // // disregard if wrong taxonomy -// // if (!(s.getTaxonomy().startsWith(taxonomy))) { +// // if (!(s.getObservableListTaxonomy().startsWith(taxonomy))) { // // continue; // // } // // @@ -122,7 +122,7 @@ // static void calculateForAll(List corpus, Statistics stats, String taxonomy) { // for (Sentence s : corpus) { // // disregard if wrong taxonomy -//// if (taxonomy != null && !(s.getTaxonomy().startsWith(taxonomy))) { +//// if (taxonomy != null && !(s.getObservableListTaxonomy().startsWith(taxonomy))) { //// continue; //// } // diff --git a/src/main/java/alg/ngram/Ngrams.java b/src/main/java/alg/ngram/Ngrams.java index f8e42da..e93dcf9 100755 --- a/src/main/java/alg/ngram/Ngrams.java +++ b/src/main/java/alg/ngram/Ngrams.java @@ -432,7 +432,7 @@ public class Ngrams { // String key = wordToString(skipgramCandidate, stats.getFilter().getCalculateFor()); // key = (key.charAt(key.length()-1) == ',') ? 
key.substring(0, key.length() - 1) : key; // stats.updateTaxonomyResults(new MultipleHMKeys1(key), -// stats.getCorpus().getTaxonomy()); +// stats.getCorpus().getObservableListTaxonomy()); ArrayList otherKeys = stats.getFilter().getMultipleKeys(); diff --git a/src/main/java/alg/word/WordCount.java b/src/main/java/alg/word/WordCount.java index 5ee2160..b6f4cbc 100755 --- a/src/main/java/alg/word/WordCount.java +++ b/src/main/java/alg/word/WordCount.java @@ -91,7 +91,7 @@ import data.Word; // private static void calculateForTaxonomyAndJosType(List corpus, Statistics stats) { // for (Sentence s : corpus) { -// if (s.getTaxonomy().equalsIgnoreCase(stats.getDistributionTaxonomy())) { +// if (s.getObservableListTaxonomy().equalsIgnoreCase(stats.getDistributionTaxonomy())) { // List sentence = new ArrayList<>(s.getWords().size()); // List filteredWords = new ArrayList<>(); // @@ -122,7 +122,7 @@ import data.Word; // private static void calculateForTaxonomy(List corpus, Statistics stats) { // for (Sentence s : corpus) { -// if (s.getTaxonomy().equalsIgnoreCase(stats.getDistributionTaxonomy())) { +// if (s.getObservableListTaxonomy().equalsIgnoreCase(stats.getDistributionTaxonomy())) { // List sentence = new ArrayList<>(s.getWords().size()); // // if (stats.getCf() == CalculateFor.LEMMA) { diff --git a/src/main/java/data/Corpus.java b/src/main/java/data/Corpus.java index dfbd710..1899a4f 100755 --- a/src/main/java/data/Corpus.java +++ b/src/main/java/data/Corpus.java @@ -27,7 +27,8 @@ public class Corpus { private File chosenCorpusLocation; private Collection detectedCorpusFiles; boolean headerRead; - private ObservableList taxonomy; // if gigafida or gos + private ArrayList taxonomy; // if gigafida or gos + private Taxonomy taxonomyTotal; private HashMap> solarFilters; // if solar private HashMap> solarFiltersForXML; // if solar - used while parsing xml private boolean gosOrthMode; @@ -36,6 +37,7 @@ public class Corpus { public Corpus() { validationErrors = new ArrayList<>(); + setTotal(); } public CorpusType getCorpusType() { @@ -82,9 +84,25 @@ public class Corpus { this.headerRead = headerRead; } - public ObservableList getTaxonomy() { + public Taxonomy getTotal() { + return taxonomyTotal; + } + + public void setTotal() { + taxonomyTotal = new Taxonomy("Total", false); + } + + public ArrayList getTaxonomy() { return taxonomy; } + + public ObservableList getObservableListTaxonomy() { + ArrayList al = new ArrayList<>(); + for (Taxonomy t : this.taxonomy){ + al.add(t.toLongNameString()); + } + return FXCollections.observableArrayList(al); + } // // public ObservableList getFormattedTaxonomy() { // ArrayList al = Tax.getTaxonomyFormatted(new ArrayList<>(taxonomy), corpusType); @@ -92,7 +110,10 @@ public class Corpus { // } public void setTaxonomy(ObservableList taxonomy) { - this.taxonomy = taxonomy; + this.taxonomy = new ArrayList<>(); + for(String t : taxonomy){ + this.taxonomy.add(new Taxonomy(t, true)); + } logger.info("Corpus.set: ", taxonomy); } @@ -151,7 +172,8 @@ public class Corpus { if (!headerRead && corpusType != null) { // if user didn't opt into reading the headers, set default taxonomy or solar filters if (Tax.getCorpusTypesWithTaxonomy().contains(corpusType)) { - taxonomy = Tax.getTaxonomyForComboBox(corpusType); + Tax.getTaxonomyForComboBox(corpusType); + setTaxonomy(Tax.getTaxonomyForComboBox(corpusType)); } else if (corpusType == CorpusType.SOLAR && solarFilters == null) { setSolarFilters(SolarFilters.getFiltersForComboBoxes()); } diff --git a/src/main/java/data/CorpusType.java 
b/src/main/java/data/CorpusType.java index 7cac659..e14c4df 100755 --- a/src/main/java/data/CorpusType.java +++ b/src/main/java/data/CorpusType.java @@ -2,6 +2,7 @@ package data; public enum CorpusType { GIGAFIDA("Gigafida", "gigafida"), + GIGAFIDA2("Gigafida2.0", "gigafida2.0"), CCKRES("ccKres ", "cckres"), SOLAR("Šolar", "šolar"), GOS("GOS", "gos"), diff --git a/src/main/java/data/StatisticsNew.java b/src/main/java/data/StatisticsNew.java index cc29469..9301c64 100755 --- a/src/main/java/data/StatisticsNew.java +++ b/src/main/java/data/StatisticsNew.java @@ -10,7 +10,6 @@ import java.util.*; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicLong; import java.util.regex.Pattern; -import java.util.stream.Collectors; import gui.I18N; import org.apache.commons.lang3.StringUtils; @@ -51,17 +50,17 @@ public class StatisticsNew { this.corpus = corpus; this.filter = filter; this.taxonomyResult = new ConcurrentHashMap<>(); - this.taxonomyResult.put(Taxonomy.TOTAL, new ConcurrentHashMap<>()); + this.taxonomyResult.put(corpus.getTotal(), new ConcurrentHashMap<>()); this.collocability = new ConcurrentHashMap<>(); this.uniGramTaxonomyOccurrences = new ConcurrentHashMap<>(); - this.uniGramTaxonomyOccurrences.put(Taxonomy.TOTAL, new AtomicLong(0L)); + this.uniGramTaxonomyOccurrences.put(corpus.getTotal(), new AtomicLong(0L)); // create table for counting word occurrences per taxonomies - if (this.corpus.getTaxonomy() != null && filter.getDisplayTaxonomy()) { + if (this.corpus.getObservableListTaxonomy() != null && filter.getDisplayTaxonomy()) { if (this.filter.getTaxonomy().isEmpty()) { - for (int i = 0; i < this.corpus.getTaxonomy().size(); i++) { - this.taxonomyResult.put(Taxonomy.factoryLongName(this.corpus.getTaxonomy().get(i)), new ConcurrentHashMap<>()); + for (int i = 0; i < this.corpus.getObservableListTaxonomy().size(); i++) { + this.taxonomyResult.put(Taxonomy.factoryLongName(this.corpus.getObservableListTaxonomy().get(i), corpus), new ConcurrentHashMap<>()); } } else { for (int i = 0; i < this.filter.getTaxonomy().size(); i++) { @@ -234,14 +233,14 @@ public class StatisticsNew { removeMinimalTaxonomy(taxonomyResult, filter.getMinimalTaxonomy()); // if no results and nothing to save, return false - if (!(taxonomyResult.get(Taxonomy.TOTAL).size() > 0)) { + if (!(taxonomyResult.get(corpus.getTotal()).size() > 0)) { analysisProducedResults = false; return false; } else { analysisProducedResults = true; } - stats.add(ImmutablePair.of(resultTitle, getSortedResult(taxonomyResult.get(Taxonomy.TOTAL), Util.getValidInt(limit)))); + stats.add(ImmutablePair.of(resultTitle, getSortedResult(taxonomyResult.get(corpus.getTotal()), Util.getValidInt(limit)))); Export.SetToCSV(stats, corpus.getChosenResultsLocation(), headerInfoBlock(), this, filter); return true; } @@ -253,14 +252,14 @@ public class StatisticsNew { if (minimalTaxonomy == 1) return; int occurances; - for (MultipleHMKeys key : taxonomyResult.get(Taxonomy.TOTAL).keySet()){ + for (MultipleHMKeys key : taxonomyResult.get(corpus.getTotal()).keySet()){ occurances = 0; for (Taxonomy columnNameKey : taxonomyResult.keySet()){ - if(!columnNameKey.equals(Taxonomy.TOTAL) && taxonomyResult.get(columnNameKey).get(key).intValue() >= 1) + if(!columnNameKey.equals(corpus.getTotal()) && taxonomyResult.get(columnNameKey).get(key).intValue() >= 1) occurances++; } if(occurances < minimalTaxonomy){ - taxonomyResult.get(Taxonomy.TOTAL).remove(key); + taxonomyResult.get(corpus.getTotal()).remove(key); } } } @@ -271,8 +270,8 @@ 
public class StatisticsNew { private void removeMinimalOccurrences(Integer minimalOccurrences) { if (minimalOccurrences == 0) return; - for (MultipleHMKeys key : taxonomyResult.get(Taxonomy.TOTAL).keySet()){ - if(taxonomyResult.get(Taxonomy.TOTAL).get(key).intValue() < minimalOccurrences){ + for (MultipleHMKeys key : taxonomyResult.get(corpus.getTotal()).keySet()){ + if(taxonomyResult.get(corpus.getTotal()).get(key).intValue() < minimalOccurrences){ for (Taxonomy t : taxonomyResult.keySet()){ taxonomyResult.get(t).remove(key); } @@ -349,7 +348,7 @@ public class StatisticsNew { } public void updateUniGramOccurrences(int amount, ArrayList taxonomy){ - uniGramTaxonomyOccurrences.get(Taxonomy.TOTAL).set(uniGramTaxonomyOccurrences.get(Taxonomy.TOTAL).longValue() + amount); + uniGramTaxonomyOccurrences.get(corpus.getTotal()).set(uniGramTaxonomyOccurrences.get(corpus.getTotal()).longValue() + amount); for (Taxonomy t : taxonomy){ if (uniGramTaxonomyOccurrences.get(t) != null){ uniGramTaxonomyOccurrences.get(t).set(uniGramTaxonomyOccurrences.get(t).longValue() + amount); @@ -360,15 +359,15 @@ public class StatisticsNew { } public Map getUniGramOccurrences(){ -// return uniGramTaxonomyOccurrences.get(Taxonomy.TOTAL).longValue(); +// return uniGramTaxonomyOccurrences.get(corpus.getTotal()).longValue(); return uniGramTaxonomyOccurrences; } public void updateTaxonomyResults(MultipleHMKeys o, List taxonomy) { for (Taxonomy key : taxonomyResult.keySet()) { // first word should have the same taxonomy as others - if (key.equals(Taxonomy.TOTAL) || taxonomy.contains(key)) { -// if (key.equals(Taxonomy.TOTAL) || taxonomy != null && taxonomy.contains(key)) { + if (key.equals(corpus.getTotal()) || taxonomy.contains(key)) { +// if (key.equals(corpus.getTotal()) || taxonomy != null && taxonomy.contains(key)) { // if taxonomy not in map and in this word AtomicLong r = taxonomyResult.get(key).putIfAbsent(o, new AtomicLong(1)); @@ -607,7 +606,7 @@ public class StatisticsNew { // sortedTaxonomyString.add(t); // } // getTaxonomyForTaxonomyResult - tax = Tax.getTaxonomyForTaxonomyResult(corpus.getCorpusType(), taxonomyResult.keySet()); + tax = Tax.getTaxonomyForTaxonomyResult(corpus, taxonomyResult.keySet()); } // String sep = ""; @@ -618,11 +617,11 @@ public class StatisticsNew { } // info.put(sep = sep + " ", s); - if (uniGramTaxonomyOccurrences.get(Taxonomy.factoryLongName(s)) == null) { + if (uniGramTaxonomyOccurrences.get(Taxonomy.factoryLongName(s, corpus)) == null) { info.put(s, ""); continue; } - int n = uniGramTaxonomyOccurrences.get(Taxonomy.factoryLongName(s)).intValue(); + int n = uniGramTaxonomyOccurrences.get(Taxonomy.factoryLongName(s, corpus)).intValue(); if (n == 0) { info.put(s, ""); } else { @@ -662,11 +661,11 @@ public class StatisticsNew { // count number of all words long N = 0; - for(AtomicLong a : oneWordTaxonomyResult.get(Taxonomy.TOTAL).values()){ + for(AtomicLong a : oneWordTaxonomyResult.get(corpus.getTotal()).values()){ N += a.longValue(); } - for(MultipleHMKeys hmKey : taxonomyResult.get(Taxonomy.TOTAL).keySet()) { + for(MultipleHMKeys hmKey : taxonomyResult.get(corpus.getTotal()).keySet()) { // String[] splitedString = hmKey.getK1().split("\\s+"); long sum_fwi =0L; @@ -674,15 +673,15 @@ public class StatisticsNew { for(MultipleHMKeys smallHmKey : hmKey.getSplittedMultipleHMKeys()){ // System.out.println(smallHmKey.getK1()); - sum_fwi += oneWordTaxonomyResult.get(Taxonomy.TOTAL).get(smallHmKey).longValue(); - mul_fwi *= 
oneWordTaxonomyResult.get(Taxonomy.TOTAL).get(smallHmKey).longValue(); + sum_fwi += oneWordTaxonomyResult.get(corpus.getTotal()).get(smallHmKey).longValue(); + mul_fwi *= oneWordTaxonomyResult.get(corpus.getTotal()).get(smallHmKey).longValue(); } // String t = hmKey.getK1(); // if(hmKey.getK1().equals("v Slovenija")){ // System.out.println("TEST"); // // } - double O = (double)taxonomyResult.get(Taxonomy.TOTAL).get(hmKey).longValue(); + double O = (double)taxonomyResult.get(corpus.getTotal()).get(hmKey).longValue(); double n = (double)filter.getNgramValue(); double E = (double)mul_fwi / Math.pow(N, n - 1); if (collocabilityMap.keySet().contains(Collocability.DICE)){ diff --git a/src/main/java/data/Tax.java b/src/main/java/data/Tax.java index 305fcb7..3e708e8 100755 --- a/src/main/java/data/Tax.java +++ b/src/main/java/data/Tax.java @@ -10,7 +10,7 @@ import javafx.collections.ObservableList; public class Tax { private static LinkedHashMap GIGAFIDA_TAXONOMY; private static LinkedHashMap GOS_TAXONOMY; - private static final HashSet corpusTypesWithTaxonomy = new HashSet<>(Arrays.asList(CorpusType.GIGAFIDA, CorpusType.GOS, CorpusType.CCKRES, CorpusType.SSJ500K, CorpusType.VERT)); + private static final HashSet corpusTypesWithTaxonomy = new HashSet<>(Arrays.asList(CorpusType.GIGAFIDA, CorpusType.GOS, CorpusType.CCKRES, CorpusType.SSJ500K, CorpusType.GIGAFIDA2, CorpusType.VERT)); static { // GIGAFIDA ---------------------------- @@ -104,7 +104,7 @@ public class Tax { public static ObservableList getTaxonomyForComboBox(CorpusType corpusType, HashSet foundTax) { LinkedHashMap tax = new LinkedHashMap<>(); - if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES || corpusType == CorpusType.SSJ500K) { + if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2) { tax = GIGAFIDA_TAXONOMY; } else if (corpusType == CorpusType.GOS) { tax = GOS_TAXONOMY; @@ -143,13 +143,13 @@ public class Tax { /** * Returns taxonomy names only for items found in headers */ - public static ArrayList getTaxonomyForTaxonomyResult(CorpusType corpusType, Set foundTax) { + public static ArrayList getTaxonomyForTaxonomyResult(Corpus corpus, Set foundTax) { LinkedHashMap tax = new LinkedHashMap<>(); Set foundTaxHS= new HashSet<>(foundTax); - if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES || corpusType == CorpusType.SSJ500K) { + if (corpus.getCorpusType() == CorpusType.GIGAFIDA || corpus.getCorpusType() == CorpusType.CCKRES || corpus.getCorpusType() == CorpusType.SSJ500K || corpus.getCorpusType() == CorpusType.GIGAFIDA2) { tax = GIGAFIDA_TAXONOMY; - } else if (corpusType == CorpusType.GOS) { + } else if (corpus.getCorpusType() == CorpusType.GOS) { tax = GOS_TAXONOMY; } @@ -161,7 +161,7 @@ public class Tax { for(Taxonomy e : foundTaxHS){ String[] elList = e.toString().split("\\."); for(int i = 1; i < elList.length - 1; i++){ - Taxonomy candidate = Taxonomy.factory(String.join(".", Arrays.copyOfRange(elList, 0, elList.length - i))); + Taxonomy candidate = Taxonomy.factory(String.join(".", Arrays.copyOfRange(elList, 0, elList.length - i)), corpus); genFoundTax.add(candidate); } } @@ -186,7 +186,7 @@ public class Tax { // assures same relative order for (String t : tax.keySet()) { - if (foundTaxHS.contains(Taxonomy.factory(t))) { + if (foundTaxHS.contains(Taxonomy.factory(t, corpus))) { taxForCombo.add(tax.get(t)); } } @@ -263,13 +263,19 @@ public class Tax { public static ArrayList 
getTaxonomyForInfo(CorpusType corpusType, ArrayList taxonomy) { LinkedHashMap tax = new LinkedHashMap<>(); - if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES || corpusType == CorpusType.SSJ500K) { + ArrayList result = new ArrayList<>(); + if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2) { tax = GIGAFIDA_TAXONOMY; } else if (corpusType == CorpusType.GOS) { tax = GOS_TAXONOMY; + } else if (corpusType == CorpusType.VERT) { + for (Taxonomy t : taxonomy) { + result.add(t.toLongNameString()); + } + return result; } - ArrayList result = new ArrayList<>(); + for (Taxonomy t : taxonomy) { result.add(tax.get(t.toString())); diff --git a/src/main/java/data/Taxonomy.java b/src/main/java/data/Taxonomy.java index ce4ae7d..a4da582 100755 --- a/src/main/java/data/Taxonomy.java +++ b/src/main/java/data/Taxonomy.java @@ -5,7 +5,7 @@ import java.util.concurrent.ConcurrentHashMap; import javafx.collections.ObservableList; -public enum Taxonomy { +enum TaxonomyEnum { TOTAL("Total", "Total"), // GOS @@ -85,7 +85,7 @@ public enum Taxonomy { private final String name; private final String longName; - Taxonomy(String name, String longName) { + TaxonomyEnum(String name, String longName) { this.name = name; this.longName = longName; } @@ -98,7 +98,7 @@ public enum Taxonomy { return this.longName; } - public static Taxonomy factory(String tax) { + public static TaxonomyEnum factory(String tax) { if (tax != null) { // GOS if (DISKURZ.toString().equals(tax)) { @@ -289,7 +289,7 @@ public enum Taxonomy { return null; } - public static Taxonomy factoryLongName(String tax) { + public static TaxonomyEnum factoryLongName(String tax) { if (tax != null) { // GOS if (DISKURZ.toLongNameString().equals(tax)) { @@ -477,11 +477,15 @@ public enum Taxonomy { } } +// return new Taxonomy(tax, tax); + System.out.println("2."); + System.out.println(tax); + return null; } - public static ArrayList taxonomySelected(Taxonomy disjointTaxonomy) { - ArrayList r = new ArrayList<>(); + public static ArrayList taxonomySelected(TaxonomyEnum disjointTaxonomy) { + ArrayList r = new ArrayList<>(); System.out.println(disjointTaxonomy); if(disjointTaxonomy.equals(DISKURZ)){ @@ -628,9 +632,9 @@ public enum Taxonomy { return r; } - public static ArrayList taxonomyDeselected(Taxonomy disjointTaxonomy){ - ArrayList r = new ArrayList<>(); - Map connections = new ConcurrentHashMap<>(); + public static ArrayList taxonomyDeselected(TaxonomyEnum disjointTaxonomy){ + ArrayList r = new ArrayList<>(); + Map connections = new ConcurrentHashMap<>(); connections.put(DISKURZ_JAVNI, DISKURZ); connections.put(DISKURZ_INFORMATIVNO_IZOBRAZEVALNI, DISKURZ_JAVNI); connections.put(DISKURZ_RAZVEDRILNI, DISKURZ_JAVNI); @@ -685,7 +689,7 @@ public enum Taxonomy { connections.put(FT_DA, FT_LEKTORIRANO); connections.put(FT_NE, FT_LEKTORIRANO); - Taxonomy currentTaxonomy = disjointTaxonomy; + TaxonomyEnum currentTaxonomy = disjointTaxonomy; r.add(currentTaxonomy); while(connections.containsKey(currentTaxonomy)){ currentTaxonomy = connections.get(currentTaxonomy); @@ -695,29 +699,36 @@ public enum Taxonomy { return r; } - public static ArrayList convertStringListToTaxonomyList(ObservableList stringList){ + public static ArrayList convertStringListToTaxonomyList(ObservableList stringList, Corpus corpus){ + System.out.println("1."); System.out.println(stringList); - ArrayList taxonomyList = new ArrayList<>(); + ArrayList taxonomyList = new ArrayList<>(); // 
System.out.println("INTERESTING STUFF"); // System.out.println(stringList); for (String e : stringList) { - taxonomyList.add(factoryLongName(e)); + for (Taxonomy t : corpus.getTaxonomy()){ + if (t.toLongNameString().equals(e)) { + taxonomyList.add(t.getTaxonomyEnum()); + } + } } // System.out.println(taxonomyList); // System.out.println("-----------------"); return taxonomyList; } - public static void modifyingTaxonomy(ArrayList taxonomy, ArrayList checkedItemsTaxonomy, Corpus corpus){ + public static void modifyingTaxonomy(ArrayList taxonomy, ArrayList checkedItemsTaxonomy, Corpus corpus){ // get taxonomies that were selected/deselected by user -// System.out.println(taxonomy); -// System.out.println(checkedItemsTaxonomy); + System.out.println("Print here:"); + System.out.println(taxonomy); + System.out.println(checkedItemsTaxonomy); + System.out.println("-------------"); - Set disjointTaxonomies = new HashSet<>(checkedItemsTaxonomy); + Set disjointTaxonomies = new HashSet<>(checkedItemsTaxonomy); if (taxonomy != null) { disjointTaxonomies.addAll(taxonomy); - for (Taxonomy s : checkedItemsTaxonomy) { + for (TaxonomyEnum s : checkedItemsTaxonomy) { if (taxonomy.contains(s)) { disjointTaxonomies.remove(s); } @@ -725,11 +736,11 @@ public enum Taxonomy { } // remove previously selected items plus remove taxonomies that are not presented in current setup - ArrayList disArr = new ArrayList<>(disjointTaxonomies); + ArrayList disArr = new ArrayList<>(disjointTaxonomies); int i = 0; while(i < disArr.size()){ - Taxonomy s = disArr.get(i); - if(!Taxonomy.convertStringListToTaxonomyList(corpus.getTaxonomy()).contains(s)){ + TaxonomyEnum s = disArr.get(i); + if(!TaxonomyEnum.convertStringListToTaxonomyList(corpus.getObservableListTaxonomy(), corpus).contains(s)){ disjointTaxonomies.remove(s); disArr.remove(s); // taxonomy.remove(s); @@ -740,14 +751,14 @@ public enum Taxonomy { if (disjointTaxonomies.size() > 0) { - Taxonomy disjointTaxonomy = disjointTaxonomies.iterator().next(); + TaxonomyEnum disjointTaxonomy = disjointTaxonomies.iterator().next(); // taxonomy was selected if (checkedItemsTaxonomy.contains(disjointTaxonomy)) { - ArrayList addTaxonomies = Taxonomy.taxonomySelected(disjointTaxonomy); + ArrayList addTaxonomies = TaxonomyEnum.taxonomySelected(disjointTaxonomy); checkedItemsTaxonomy.addAll(addTaxonomies); } else if (taxonomy.contains(disjointTaxonomy)) { - ArrayList removeTaxonomies = Taxonomy.taxonomyDeselected(disjointTaxonomy); + ArrayList removeTaxonomies = TaxonomyEnum.taxonomyDeselected(disjointTaxonomy); checkedItemsTaxonomy.removeAll(removeTaxonomies); } } @@ -755,3 +766,203 @@ public enum Taxonomy { } + +public class Taxonomy { + private String name; + private String longName; + private TaxonomyEnum taxonomyEnum; + + public Taxonomy(String tax, boolean longName) { + if (!longName) { + this.taxonomyEnum = TaxonomyEnum.factory(tax); + } else { + this.taxonomyEnum = TaxonomyEnum.factoryLongName(tax); + } + if (taxonomyEnum != null){ + this.name = this.taxonomyEnum.toString(); + this.longName = this.taxonomyEnum.toLongNameString(); + } else { + this.name = tax; + this.longName = tax; + } + } + + public Taxonomy(TaxonomyEnum taxonomyEnum) { + this.taxonomyEnum = taxonomyEnum; + this.name = this.taxonomyEnum.toString(); + this.longName = this.taxonomyEnum.toLongNameString(); + + } + +// public Taxonomy(String name, String longName) { +// this.name = name; +// this.longName = longName; +// } + + public String toString() { + return this.name; + } + + public String toLongNameString() { + 
return this.longName; + } + + public TaxonomyEnum getTaxonomyEnum() { + return this.taxonomyEnum; + } + + public static Taxonomy factory(String tax, Corpus corpus) { + for (Taxonomy t : corpus.getTaxonomy()){ + if(tax.equals(t.toString())) + return t; + } + return null; +// return new Taxonomy(tax, false); + } + + public static Taxonomy factoryLongName(String tax, Corpus corpus) { + for (Taxonomy t : corpus.getTaxonomy()){ + if(tax.equals(t.toLongNameString())) + return t; + } + return null; +// return new Taxonomy(tax, true); + } + +// public static ArrayList taxonomySelected(Taxonomy disjointTaxonomy) { +// ArrayList rTaxonomyEnum = TaxonomyEnum.taxonomySelected(disjointTaxonomy.getTaxonomyEnum()); +// +// ArrayList r = new ArrayList<>(); +// +// for(TaxonomyEnum t : rTaxonomyEnum){ +// r.add(new Taxonomy(t.toString(), false)); +// } +// +// return r; +// } + + public static ArrayList taxonomyDeselected(Taxonomy disjointTaxonomy){ +// ArrayList r = new ArrayList<>(); +// Map connections = new ConcurrentHashMap<>(); +// connections.put(DISKURZ_JAVNI, DISKURZ); +// connections.put(DISKURZ_INFORMATIVNO_IZOBRAZEVALNI, DISKURZ_JAVNI); +// connections.put(DISKURZ_RAZVEDRILNI, DISKURZ_JAVNI); +// connections.put(DISKURZ_NEJAVNI, DISKURZ); +// connections.put(DISKURZ_NEZASEBNI, DISKURZ_NEJAVNI); +// connections.put(DISKURZ_ZASEBNI, DISKURZ_NEJAVNI); +// connections.put(SITUACIJA_RADIO, SITUACIJA); +// connections.put(SITUACIJA_TELEVIZIJA, SITUACIJA); +// connections.put(KANAL_OSEBNI_STIK, KANAL); +// connections.put(KANAL_TELEFON, KANAL); +// connections.put(KANAL_RADIO, KANAL); +// connections.put(KANAL_TELEVIZIJA, KANAL); +// +// connections.put(SSJ_KNJIZNO, SSJ_TISK); +// connections.put(SSJ_LEPOSLOVNO, SSJ_KNJIZNO); +// connections.put(SSJ_STROKOVNO, SSJ_KNJIZNO); +// connections.put(SSJ_PERIODICNO, SSJ_TISK); +// connections.put(SSJ_CASOPIS, SSJ_PERIODICNO); +// connections.put(SSJ_REVIJA, SSJ_PERIODICNO); +// connections.put(SSJ_DRUGO, SSJ_TISK); +// +// connections.put(FT_P_GOVORNI, FT_P_PRENOSNIK); +// connections.put(FT_P_ELEKTRONSKI, FT_P_PRENOSNIK); +// connections.put(FT_P_PISNI, FT_P_PRENOSNIK); +// connections.put(FT_P_OBJAVLJENO, FT_P_PISNI); +// connections.put(FT_P_KNJIZNO, FT_P_OBJAVLJENO); +// connections.put(FT_P_PERIODICNO, FT_P_OBJAVLJENO); +// connections.put(FT_P_CASOPISNO, FT_P_OBJAVLJENO); +// connections.put(FT_P_DNEVNO, FT_P_CASOPISNO); +// connections.put(FT_P_VECKRAT_TEDENSKO, FT_P_CASOPISNO); +// connections.put(FT_P_CASOPISNO_TEDENSKO, FT_P_CASOPISNO); +// connections.put(FT_P_REVIALNO, FT_P_PERIODICNO); +// connections.put(FT_P_TEDENSKO, FT_P_REVIALNO); +// connections.put(FT_P_STIRINAJSTDNEVNO, FT_P_REVIALNO); +// connections.put(FT_P_MESECNO, FT_P_REVIALNO); +// connections.put(FT_P_REDKEJE_KOT_MESECNO, FT_P_REVIALNO); +// connections.put(FT_P_OBCASNO, FT_P_REVIALNO); +// connections.put(FT_P_NEOBJAVLJENO, FT_P_PISNI); +// connections.put(FT_P_JAVNO, FT_P_NEOBJAVLJENO); +// connections.put(FT_P_INTERNO, FT_P_NEOBJAVLJENO); +// connections.put(FT_P_ZASEBNO, FT_P_NEOBJAVLJENO); +// connections.put(FT_UMETNOSTNA, FT_ZVRST); +// connections.put(FT_PESNISKA, FT_UMETNOSTNA); +// connections.put(FT_PROZNA, FT_UMETNOSTNA); +// connections.put(FT_DRAMSKA, FT_UMETNOSTNA); +// connections.put(FT_NEUMETNOSTNA, FT_ZVRST); +// connections.put(FT_STROKOVNA, FT_NEUMETNOSTNA); +// connections.put(FT_HID, FT_STROKOVNA); +// connections.put(FT_NIT, FT_STROKOVNA); +// connections.put(FT_NESTROKOVNA, FT_NEUMETNOSTNA); +// connections.put(FT_PRAVNA, FT_NEUMETNOSTNA); +// 
connections.put(FT_DA, FT_LEKTORIRANO); +// connections.put(FT_NE, FT_LEKTORIRANO); +// +// TaxonomyEnum currentTaxonomy = disjointTaxonomy; +// r.add(currentTaxonomy); +// while(connections.containsKey(currentTaxonomy)){ +// currentTaxonomy = connections.get(currentTaxonomy); +// r.add(currentTaxonomy); +// } +// Collections.reverse(r); +// return r; + return null; + } + + public static ArrayList convertStringListToTaxonomyList(ObservableList stringList, Corpus corpus){ + ArrayList taxonomyList = new ArrayList<>(); + + for (String e : stringList) { + for (Taxonomy t : corpus.getTaxonomy()){ + if (t.toLongNameString().equals(e)) { + taxonomyList.add(t); + } + } + } + return taxonomyList; + } + + public static ArrayList taxonomyToTaxonomyEnum(ArrayList taxonomy){ + System.out.println(taxonomy); + if (taxonomy == null) { + return null; + } + ArrayList r = new ArrayList<>(); + for (Taxonomy t : taxonomy){ + if (t.taxonomyEnum == null){ + return null; + } + r.add(t.taxonomyEnum); + } + return r; + } + + public static ArrayList taxonomyEnumToTaxonomy(ArrayList taxonomy, Corpus corpus){ +// ArrayList r = new ArrayList<>(); +// for (TaxonomyEnum t : taxonomy){ +// r.add(new Taxonomy(t)); +// } +// return r; + ArrayList r = new ArrayList<>(); + for (TaxonomyEnum te : taxonomy){ + for (Taxonomy t : corpus.getTaxonomy()){ + if (t.taxonomyEnum.equals(te)) { + r.add(t); + break; + } + } + + } + return r; + } + + public static ArrayList modifyingTaxonomy(ArrayList taxonomy, ObservableList checkedItems, Corpus corpus){ + ArrayList checkedItemsTaxonomy = TaxonomyEnum.convertStringListToTaxonomyList(checkedItems, corpus); + if (checkedItemsTaxonomy != null && corpus.getCorpusType() != CorpusType.VERT) { + TaxonomyEnum.modifyingTaxonomy(Taxonomy.taxonomyToTaxonomyEnum(taxonomy), checkedItemsTaxonomy, corpus); + return taxonomyEnumToTaxonomy(checkedItemsTaxonomy, corpus); + } else { + return convertStringListToTaxonomyList(checkedItems, corpus); + } + } +} diff --git a/src/main/java/gui/CharacterAnalysisTab.java b/src/main/java/gui/CharacterAnalysisTab.java index e068884..c286bad 100755 --- a/src/main/java/gui/CharacterAnalysisTab.java +++ b/src/main/java/gui/CharacterAnalysisTab.java @@ -1,8 +1,11 @@ package gui; +import alg.XML_processing; import data.*; import javafx.application.HostServices; -import javafx.beans.binding.Bindings; +import javafx.beans.InvalidationListener; +import javafx.beans.Observable; +import javafx.beans.property.ReadOnlyDoubleWrapper; import javafx.beans.value.ChangeListener; import javafx.beans.value.ObservableValue; import javafx.collections.FXCollections; @@ -25,7 +28,6 @@ import java.util.regex.Pattern; import static alg.XML_processing.readXML; import static gui.GUIController.showAlert; -import static gui.Messages.*; @SuppressWarnings("Duplicates") public class CharacterAnalysisTab { @@ -160,6 +162,7 @@ public class CharacterAnalysisTab { private boolean useDb; private HostServices hostService; private ListChangeListener taxonomyListener; + private InvalidationListener progressBarListener; private static final String [] N_GRAM_COMPUTE_FOR_LETTERS_ARRAY = {"calculateFor.WORD", "calculateFor.LEMMA"}; private static final ArrayList N_GRAM_COMPUTE_FOR_LETTERS = new ArrayList<>(Arrays.asList(N_GRAM_COMPUTE_FOR_LETTERS_ARRAY)); @@ -241,53 +244,56 @@ public class CharacterAnalysisTab { msd = new ArrayList<>(); // taxonomy - if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType()) && corpus.getTaxonomy().size() > 0) { - if (taxonomyListener != null){ - 
taxonomyCCB.getCheckModel().getCheckedItems().removeListener(taxonomyListener); - } - - taxonomyListener = new ListChangeListener() { - boolean changing = true; - - @Override - public void onChanged(ListChangeListener.Change c){ - if(changing) { - ObservableList checkedItems = taxonomyCCB.getCheckModel().getCheckedItems(); - ArrayList checkedItemsTaxonomy = Taxonomy.convertStringListToTaxonomyList(checkedItems); - - Taxonomy.modifyingTaxonomy(taxonomy, checkedItemsTaxonomy, corpus); - - taxonomy = new ArrayList<>(); - taxonomy.addAll(checkedItemsTaxonomy); - - taxonomyCCB.getItems().removeAll(); - taxonomyCCB.getItems().setAll(corpus.getTaxonomy()); - - // taxonomyCCB.getCheckModel().clearChecks(); - changing = false; - taxonomyCCB.getCheckModel().clearChecks(); - for (Taxonomy t : checkedItemsTaxonomy) { - taxonomyCCB.getCheckModel().check(t.toLongNameString()); - } - changing = true; - logger.info(String.format("Selected taxonomy: %s", StringUtils.join(checkedItems, ","))); - } - } - }; - taxonomyCCB.getCheckModel().clearChecks(); + if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType()) && corpus.getObservableListTaxonomy().size() > 0) { taxonomyCCB.setDisable(false); - taxonomyCCB.getItems().removeAll(); - taxonomyCCB.getItems().setAll(corpus.getTaxonomy()); - - taxonomyCCB.getCheckModel().getCheckedItems().addListener(taxonomyListener); } else { taxonomyCCB.setDisable(true); } + if (taxonomyListener != null){ + taxonomyCCB.getCheckModel().getCheckedItems().removeListener(taxonomyListener); + } + + taxonomyListener = new ListChangeListener() { + boolean changing = true; + + @Override + public void onChanged(ListChangeListener.Change c){ + if(changing) { + ObservableList checkedItems = taxonomyCCB.getCheckModel().getCheckedItems(); +// ArrayList checkedItemsTaxonomy = Taxonomy.convertStringListToTaxonomyList(checkedItems); + + ArrayList checkedItemsTaxonomy = Taxonomy.modifyingTaxonomy(taxonomy, checkedItems, corpus); + + taxonomy = new ArrayList<>(); + taxonomy.addAll(checkedItemsTaxonomy); + + taxonomyCCB.getItems().removeAll(); + taxonomyCCB.getItems().setAll(corpus.getObservableListTaxonomy()); + + // taxonomyCCB.getCheckModel().clearChecks(); + changing = false; + taxonomyCCB.getCheckModel().clearChecks(); + for (Taxonomy t : checkedItemsTaxonomy) { + taxonomyCCB.getCheckModel().check(t.toLongNameString()); + } + changing = true; + logger.info(String.format("Selected taxonomy: %s", StringUtils.join(checkedItems, ","))); + } + } + }; + taxonomyCCB.getCheckModel().clearChecks(); + + taxonomyCCB.getItems().removeAll(); + taxonomyCCB.getItems().setAll(corpus.getObservableListTaxonomy()); + + taxonomyCCB.getCheckModel().getCheckedItems().addListener(taxonomyListener); + + displayTaxonomy = false; displayTaxonomyChB.setSelected(false); // set - if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType()) && corpus.getTaxonomy().size() > 0) { + if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType()) && corpus.getObservableListTaxonomy().size() > 0) { displayTaxonomyChB.setDisable(false); displayTaxonomyChB.selectedProperty().addListener((observable, oldValue, newValue) -> { displayTaxonomy = newValue; @@ -475,7 +481,7 @@ public class CharacterAnalysisTab { // if ((currentCorpusType != null && currentCorpusType != corpus.getCorpusType())) { // // user changed corpus (by type) or by selection & triggered a rescan of headers // // see if we read taxonomy from headers, otherwise use default values for given corpus -// ObservableList tax = 
corpus.getTaxonomy(); +// ObservableList tax = corpus.getObservableListTaxonomy(); // taxonomyCCBValues = tax != null ? tax : Taxonomy.getDefaultForComboBox(corpus.getCorpusType()); // // currentCorpusType = corpus.getCorpusType(); @@ -485,7 +491,7 @@ public class CharacterAnalysisTab { // } // // // see if we read taxonomy from headers, otherwise use default values for given corpus -// ObservableList tax = corpus.getTaxonomy(); +// ObservableList tax = corpus.getObservableListTaxonomy(); // taxonomyCCBValues = tax != null ? tax : Taxonomy.getDefaultForComboBox(corpus.getCorpusType()); // taxonomyCCB.getItems().addAll(taxonomyCCBValues); // @@ -548,7 +554,7 @@ public class CharacterAnalysisTab { // if calculateFor was selected for something other than a word or a lemma -> reset if (!(calculateFor == CalculateFor.WORD || calculateFor == CalculateFor.LEMMA)) { // if the user selected something else before selecting ngram for letters, reset that choice - calculateFor = CalculateFor.LEMMA; + calculateFor = CalculateFor.WORD; calculateForCB.getSelectionModel().select(0); } @@ -637,16 +643,66 @@ public class CharacterAnalysisTab { @SuppressWarnings("Duplicates") @Override protected Void call() throws Exception { - long i = 0; + if(corpusFiles.size() > 1){ + cancel.setVisible(true); + } + int i = 0; +// DateFormat df = new SimpleDateFormat("hh:mm:ss"); + Date startTime = new Date(); + Date previousTime = new Date(); + int remainingSeconds = -1; for (File f : corpusFiles) { - readXML(f.toString(), statistic); + final int iFinal = i; + XML_processing xml_processing = new XML_processing(); i++; if (isCancelled()) { updateMessage(I18N.get("message.CANCELING_NOTIFICATION")); break; } - this.updateProgress(i, corpusFiles.size()); - this.updateMessage(String.format(I18N.get("message.ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y"), i, corpusFiles.size(), f.getName())); + if (corpusFiles.size() > 1) { + if ((new Date()).getTime() - previousTime.getTime() > 500 || remainingSeconds == -1){ + remainingSeconds = (int) (((new Date()).getTime() - startTime.getTime()) * (1.0/i) * (corpusFiles.size() - i) / 1000); + previousTime = new Date(); + } + this.updateProgress(i, corpusFiles.size()); + this.updateMessage(String.format(I18N.get("message.ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y"), i, corpusFiles.size(), f.getName(), remainingSeconds)); + } else { + if(progressBarListener != null) { + xml_processing.progressProperty().removeListener(progressBarListener); + } + + progressBarListener = new InvalidationListener() { + int remainingSeconds = -1; + Date previousTime = new Date(); + @Override + public void invalidated(Observable observable) { + cancel.setVisible(true); + if ((new Date()).getTime() - previousTime.getTime() > 500 || remainingSeconds == -1){ + remainingSeconds = (int) (((new Date()).getTime() - xml_processing.startTime.getTime()) * + (1.0/(iFinal * 100 + ((ReadOnlyDoubleWrapper) observable).get() + 1)) * + ((corpusFiles.size() - iFinal - 1) * 100 + 100 - ((ReadOnlyDoubleWrapper) observable).get()) / 1000); + previousTime = new Date(); + } + xml_processing.isCancelled = isCancelled(); + updateProgress((iFinal * 100) + ((ReadOnlyDoubleWrapper) observable).get() + 1, corpusFiles.size() * 100); + updateMessage(String.format(I18N.get("message.ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y"), iFinal + 1, corpusFiles.size(), f.getName(), remainingSeconds)); +// updateProgress((iFinal * 100) + (double) observable, corpusFiles.size() * 100); + } + }; +// 
this.updateMessage(String.format(I18N.get("message.ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y"), i, corpusFiles.size(), f.getName(), remainingSeconds)); + + + xml_processing.progressProperty().addListener(progressBarListener); + +// xml_processing.progressProperty().addListener((obs, oldProgress, newProgress) -> +// updateProgress((iFinal * 100) + newProgress.doubleValue(), corpusFiles.size() * 100)); + } + xml_processing.readXML(f.toString(), statistic); + if (isCancelled()) { + updateMessage(I18N.get("message.CANCELING_NOTIFICATION")); + break; + } +// readXML(f.toString(), statistic, this, corpusFiles.size(), startTime, previousTime, i); } return null; @@ -703,8 +759,6 @@ public class CharacterAnalysisTab { logger.info("cancel button"); }); - cancel.setVisible(true); - final Thread thread = new Thread(task, "task"); thread.setDaemon(true); thread.start(); diff --git a/src/main/java/gui/CorpusTab.java b/src/main/java/gui/CorpusTab.java index 2750f4b..73e4ed7 100755 --- a/src/main/java/gui/CorpusTab.java +++ b/src/main/java/gui/CorpusTab.java @@ -6,11 +6,13 @@ import static gui.Messages.*; import static util.Util.*; import java.io.File; +import java.io.IOException; import java.util.*; import javafx.scene.layout.AnchorPane; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOCase; +import org.apache.commons.io.LineIterator; import org.apache.commons.io.filefilter.FileFilterUtils; import org.apache.commons.io.filefilter.TrueFileFilter; import org.apache.logging.log4j.LogManager; @@ -205,9 +207,6 @@ public class CorpusTab { // scan for xml files Collection corpusFiles = FileUtils.listFiles(selectedDirectory, FileFilterUtils.suffixFileFilter("xml", IOCase.INSENSITIVE), TrueFileFilter.INSTANCE); - corpusLocation = selectedDirectory.getAbsolutePath(); - corpusFilesSize = String.valueOf(corpusFiles.size()); - Messages.setChooseCorpusProperties(corpusLocation, corpusFilesSize, corpusType != null ? corpusType.toString() : null); // make sure there are corpus files in selected directory or notify the user about it if (corpusFiles.size() == 0) { @@ -215,10 +214,20 @@ public class CorpusTab { corpusFiles = FileUtils.listFiles(selectedDirectory, FileFilterUtils.suffixFileFilter("vert", IOCase.INSENSITIVE), TrueFileFilter.INSTANCE); Collection corpusFilesRegi = FileUtils.listFiles(selectedDirectory, FileFilterUtils.suffixFileFilter("regi", IOCase.INSENSITIVE), TrueFileFilter.INSTANCE); + +// if (!checkRegiFile(corpusFilesRegi)){ +// return; +// } + if (corpusFiles.size() == 0){ logger.info("alert: ", I18N.get("message.WARNING_CORPUS_NOT_FOUND")); showAlert(Alert.AlertType.ERROR, I18N.get("message.WARNING_CORPUS_NOT_FOUND"), null); + } else if (corpusFilesRegi.size() == 0){ + GUIController.showAlert(Alert.AlertType.ERROR, String.format(I18N.get("message.ERROR_NO_REGI_FILE_FOUND"), selectedDirectory.getAbsolutePath())); } else { + corpusLocation = selectedDirectory.getAbsolutePath(); + corpusFilesSize = String.valueOf(corpusFiles.size()); + Messages.setChooseCorpusProperties(corpusLocation, corpusFilesSize, corpusType != null ? corpusType.toString() : null); corpusType = VERT; corpus.setCorpusType(corpusType); @@ -255,12 +264,17 @@ public class CorpusTab { } } else { + corpusLocation = selectedDirectory.getAbsolutePath(); + corpusFilesSize = String.valueOf(corpusFiles.size()); + Messages.setChooseCorpusProperties(corpusLocation, corpusFilesSize, corpusType != null ? 
corpusType.toString() : null); + String chooseCorpusLabelContentTmp = detectCorpusType(corpusFiles); if (chooseCorpusLabelContentTmp == null) { logger.info("alert: ", I18N.get("message.WARNING_CORPUS_NOT_FOUND")); showAlert(Alert.AlertType.ERROR, I18N.get("message.WARNING_CORPUS_NOT_FOUND"), null); } else { + initNewCorpus(selectedDirectory, corpusFiles); Messages.setChooseCorpusProperties(corpusLocation, corpusFilesSize, corpusType.toString()); @@ -330,6 +344,28 @@ public class CorpusTab { Messages.setChooseCorpusL(chooseCorpusL, chooseCorpusLabelContent); } + private boolean checkRegiFile(Collection corpusFiles) { +// CorpusType corpusType = corpus.getCorpusType(); +// Collection corpusFiles = corpus.getDetectedCorpusFiles(); + + + for (File file : corpusFiles) { + // try to open .regi file + String regiPath = file.getAbsolutePath().substring(0, file.getAbsolutePath().length() - 4) + "regi"; + LineIterator regiIt; + try { + // read regi file + regiIt = FileUtils.lineIterator(new File(regiPath), "UTF-8"); + LineIterator.closeQuietly(regiIt); + } catch (IOException e) { + GUIController.showAlert(Alert.AlertType.ERROR, String.format(I18N.get("message.ERROR_NO_REGI_FILE_FOUND"), regiPath)); + return false; + } + } + return true; + + } + private void readHeaderInfo() { CorpusType corpusType = corpus.getCorpusType(); Collection corpusFiles = corpus.getDetectedCorpusFiles(); @@ -339,7 +375,7 @@ public class CorpusTab { logger.info("reading header data for ", corpusType.toString()); - if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.GOS || corpusType == CorpusType.CCKRES || corpusType == CorpusType.SSJ500K) { + if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.GOS || corpusType == CorpusType.CCKRES || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2) { boolean corpusIsSplit = corpusFiles.size() > 1; final Task> task = new Task>() { @@ -505,26 +541,27 @@ public class CorpusTab { task.setOnSucceeded(e -> { ObservableList readTaxonomy = Tax.getTaxonomyForComboBox(corpusType, task.getValue()); - if (ValidationUtil.isEmpty(readTaxonomy)) { - // if no taxonomy found alert the user and keep other tabs disabled - logger.info("No vert filters found in headers."); - GUIController.showAlert(Alert.AlertType.ERROR, I18N.get("message.WARNING_NO_SOLAR_FILTERS_FOUND")); - } else { + // if (ValidationUtil.isEmpty(readTaxonomy)) { + // // if no taxonomy found alert the user and keep other tabs disabled + // logger.info("No vert filters found in headers."); + // GUIController.showAlert(Alert.AlertType.ERROR, I18N.get("message.WARNING_NO_SOLAR_FILTERS_FOUND")); + // } else { // set taxonomy, update label corpus.setTaxonomy(readTaxonomy); corpus.setHeaderRead(true); Messages.setChooseCorpusL(chooseCorpusL, chooseCorpusLabelContent); setResults(); setCorpusForAnalysis(); - } + // } - togglePiAndSetCorpusWrapper(false); + togglePiAndSetCorpusWrapper(false); }); task.setOnCancelled(e -> togglePiAndSetCorpusWrapper(false)); task.setOnFailed(e -> togglePiAndSetCorpusWrapper(false)); + final Thread thread = new Thread(task, "task"); thread.setDaemon(true); thread.start(); @@ -599,7 +636,12 @@ public class CorpusTab { if (title.contains(SOLAR.getNameLowerCase())) { corpusType = SOLAR; } else if (title.contains(GIGAFIDA.getNameLowerCase())) { - corpusType = GIGAFIDA; + String edition = XML_processing.readXMLHeaderTag(f.getAbsolutePath(), "edition").toLowerCase(); + if (Double.valueOf(edition) < 2.0) { + corpusType = GIGAFIDA; + } else { + corpusType = GIGAFIDA2; + } } 
else if (title.contains(CCKRES.getNameLowerCase())) { corpusType = CCKRES; } else if (title.contains(GOS.getNameLowerCase())) { diff --git a/src/main/java/gui/Messages.java b/src/main/java/gui/Messages.java index 72db5c1..a910dc7 100755 --- a/src/main/java/gui/Messages.java +++ b/src/main/java/gui/Messages.java @@ -114,8 +114,10 @@ public class Messages { .append(String.format(I18N.get("message.NOTIFICATION_CORPUS"), chooseCorpusLabelProperties[2])); chooseCorpusLabelContent = sb.toString(); - chooseCorpusL.textProperty().unbind(); - chooseCorpusL.setText(chooseCorpusLabelContent); + if (chooseCorpusL != null) { + chooseCorpusL.textProperty().unbind(); + chooseCorpusL.setText(chooseCorpusLabelContent); + } } } } diff --git a/src/main/java/gui/OneWordAnalysisTab.java b/src/main/java/gui/OneWordAnalysisTab.java index 655f176..1665177 100755 --- a/src/main/java/gui/OneWordAnalysisTab.java +++ b/src/main/java/gui/OneWordAnalysisTab.java @@ -1,22 +1,23 @@ package gui; +import alg.XML_processing; import data.*; import javafx.application.HostServices; +import javafx.beans.InvalidationListener; +import javafx.beans.Observable; +import javafx.beans.property.ReadOnlyDoubleWrapper; import javafx.beans.value.ChangeListener; import javafx.beans.value.ObservableValue; -import javafx.collections.FXCollections; import javafx.collections.ListChangeListener; import javafx.collections.ObservableList; import javafx.concurrent.Task; import javafx.fxml.FXML; -import javafx.scene.Scene; import javafx.scene.control.*; import javafx.scene.layout.AnchorPane; import org.apache.commons.lang3.StringUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.controlsfx.control.CheckComboBox; -import org.controlsfx.control.IndexedCheckModel; import java.io.File; import java.io.UnsupportedEncodingException; @@ -26,7 +27,6 @@ import java.util.regex.Pattern; import static alg.XML_processing.readXML; import static gui.GUIController.showAlert; -import static gui.Messages.*; @SuppressWarnings("Duplicates") public class OneWordAnalysisTab { @@ -158,6 +158,7 @@ public class OneWordAnalysisTab { private ListChangeListener taxonomyListener; private ListChangeListener alsoVisualizeListener; private ChangeListener calculateForListener; + private InvalidationListener progressBarListener; // private static final ObservableList N_GRAM_COMPUTE_FOR_WORDS = FXCollections.observableArrayList("lema", "različnica", "oblikoskladenjska oznaka"); // private static final ObservableList N_GRAM_COMPUTE_FOR_LETTERS = FXCollections.observableArrayList("lema", "različnica"); @@ -383,54 +384,57 @@ public class OneWordAnalysisTab { alsoVisualizeCCB.getCheckModel().getCheckedItems().addListener(alsoVisualizeListener); // taxonomy - if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType()) && corpus.getTaxonomy().size() > 0) { - if (taxonomyListener != null){ - taxonomyCCB.getCheckModel().getCheckedItems().removeListener(taxonomyListener); - } - - taxonomyListener = new ListChangeListener() { - public boolean changing = true; - - @Override - public void onChanged(Change c) { - if (changing) { - ObservableList checkedItems = taxonomyCCB.getCheckModel().getCheckedItems(); - ArrayList checkedItemsTaxonomy = Taxonomy.convertStringListToTaxonomyList(checkedItems); - - Taxonomy.modifyingTaxonomy(taxonomy, checkedItemsTaxonomy, corpus); - - taxonomy = new ArrayList<>(); - taxonomy.addAll(checkedItemsTaxonomy); - - taxonomyCCB.getItems().removeAll(); - taxonomyCCB.getItems().setAll(corpus.getTaxonomy()); - - 
// taxonomyCCB.getCheckModel().clearChecks(); - changing = false; - taxonomyCCB.getCheckModel().clearChecks(); - for (Taxonomy t : checkedItemsTaxonomy) { - taxonomyCCB.getCheckModel().check(t.toLongNameString()); - } - changing = true; - logger.info(String.format("Selected taxonomy: %s", StringUtils.join(checkedItems, ","))); - } - } - }; - - taxonomyCCB.getCheckModel().clearChecks(); + if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType()) && corpus.getObservableListTaxonomy().size() > 0) { taxonomyCCB.setDisable(false); - taxonomyCCB.getItems().removeAll(); - taxonomyCCB.getItems().setAll(corpus.getTaxonomy()); - - taxonomyCCB.getCheckModel().getCheckedItems().addListener(taxonomyListener); } else { taxonomyCCB.setDisable(true); } + if (taxonomyListener != null){ + taxonomyCCB.getCheckModel().getCheckedItems().removeListener(taxonomyListener); + } + + taxonomyListener = new ListChangeListener() { + public boolean changing = true; + + @Override + public void onChanged(Change c) { + if (changing) { + ObservableList checkedItems = taxonomyCCB.getCheckModel().getCheckedItems(); +// ArrayList checkedItemsTaxonomy = Taxonomy.convertStringListToTaxonomyList(checkedItems); + ArrayList checkedItemsTaxonomy = Taxonomy.modifyingTaxonomy(taxonomy, checkedItems, corpus); + +// Taxonomy.modifyingTaxonomy(taxonomy, checkedItemsTaxonomy, corpus); + + taxonomy = new ArrayList<>(); + taxonomy.addAll(checkedItemsTaxonomy); + + taxonomyCCB.getItems().removeAll(); + taxonomyCCB.getItems().setAll(corpus.getObservableListTaxonomy()); + + // taxonomyCCB.getCheckModel().clearChecks(); + changing = false; + taxonomyCCB.getCheckModel().clearChecks(); + for (Taxonomy t : checkedItemsTaxonomy) { + taxonomyCCB.getCheckModel().check(t.toLongNameString()); + } + changing = true; + logger.info(String.format("Selected taxonomy: %s", StringUtils.join(checkedItems, ","))); + } + } + }; + + taxonomyCCB.getCheckModel().clearChecks(); + + taxonomyCCB.getItems().removeAll(); + taxonomyCCB.getItems().setAll(corpus.getObservableListTaxonomy()); + + taxonomyCCB.getCheckModel().getCheckedItems().addListener(taxonomyListener); + displayTaxonomy = false; displayTaxonomyChB.setSelected(false); // set - if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType()) && corpus.getTaxonomy().size() > 0) { + if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType()) && corpus.getObservableListTaxonomy().size() > 0) { displayTaxonomyChB.setDisable(false); displayTaxonomyChB.selectedProperty().addListener((observable, oldValue, newValue) -> { displayTaxonomy = newValue; @@ -586,7 +590,7 @@ public class OneWordAnalysisTab { // if ((currentCorpusType != null && currentCorpusType != corpus.getCorpusType())) { // // user changed corpus (by type) or by selection & triggered a rescan of headers // // see if we read taxonomy from headers, otherwise use default values for given corpus -// ObservableList tax = corpus.getTaxonomy(); +// ObservableList tax = corpus.getObservableListTaxonomy(); // taxonomyCCBValues = tax != null ? tax : Taxonomy.getDefaultForComboBox(corpus.getCorpusType()); // // currentCorpusType = corpus.getCorpusType(); @@ -596,7 +600,7 @@ public class OneWordAnalysisTab { // } // // // see if we read taxonomy from headers, otherwise use default values for given corpus -// ObservableList tax = corpus.getTaxonomy(); +// ObservableList tax = corpus.getObservableListTaxonomy(); // taxonomyCCBValues = tax != null ? 
tax : Taxonomy.getDefaultForComboBox(corpus.getCorpusType()); // taxonomyCCB.getItems().addAll(taxonomyCCBValues); // @@ -733,22 +737,63 @@ public class OneWordAnalysisTab { logger.info("Started execution: ", statistic.getFilter()); Collection corpusFiles = statistic.getCorpus().getDetectedCorpusFiles(); - boolean corpusIsSplit = corpusFiles.size() > 1; final Task task = new Task() { @SuppressWarnings("Duplicates") @Override protected Void call() throws Exception { - long i = 0; + if(corpusFiles.size() > 1){ + cancel.setVisible(true); + } + int i = 0; + Date startTime = new Date(); + Date previousTime = new Date(); + int remainingSeconds = -1; for (File f : corpusFiles) { - readXML(f.toString(), statistic); + final int iFinal = i; + XML_processing xml_processing = new XML_processing(); i++; + if (corpusFiles.size() > 1) { + if ((new Date()).getTime() - previousTime.getTime() > 500 || remainingSeconds == -1){ + remainingSeconds = (int) (((new Date()).getTime() - startTime.getTime()) * (1.0/i) * (corpusFiles.size() - i) / 1000); + previousTime = new Date(); + } + this.updateProgress(i, corpusFiles.size()); + this.updateMessage(String.format(I18N.get("message.ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y"), i, corpusFiles.size(), f.getName(), remainingSeconds)); +// if (isCancelled()) { +// updateMessage(I18N.get("message.CANCELING_NOTIFICATION")); +// break; +// } + } else { + if(progressBarListener != null) { + xml_processing.progressProperty().removeListener(progressBarListener); + } + + progressBarListener = new InvalidationListener() { + int remainingSeconds = -1; + Date previousTime = new Date(); + @Override + public void invalidated(Observable observable) { + cancel.setVisible(true); + if ((new Date()).getTime() - previousTime.getTime() > 500 || remainingSeconds == -1){ + remainingSeconds = (int) (((new Date()).getTime() - xml_processing.startTime.getTime()) * + (1.0/(iFinal * 100 + ((ReadOnlyDoubleWrapper) observable).get() + 1)) * + ((corpusFiles.size() - iFinal - 1) * 100 + 100 - ((ReadOnlyDoubleWrapper) observable).get()) / 1000); + previousTime = new Date(); + } + xml_processing.isCancelled = isCancelled(); + updateProgress((iFinal * 100) + ((ReadOnlyDoubleWrapper) observable).get() + 1, corpusFiles.size() * 100); + updateMessage(String.format(I18N.get("message.ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y"), iFinal + 1, corpusFiles.size(), f.getName(), remainingSeconds)); + } + }; + + xml_processing.progressProperty().addListener(progressBarListener); + } + xml_processing.readXML(f.toString(), statistic); if (isCancelled()) { updateMessage(I18N.get("message.CANCELING_NOTIFICATION")); break; } - this.updateProgress(i, corpusFiles.size()); - this.updateMessage(String.format(I18N.get("message.ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y"), i, corpusFiles.size(), f.getName())); } return null; @@ -805,7 +850,6 @@ public class OneWordAnalysisTab { logger.info("cancel button"); }); - cancel.setVisible(true); final Thread thread = new Thread(task, "task"); thread.setDaemon(true); thread.start(); diff --git a/src/main/java/gui/StringAnalysisTabNew2.java b/src/main/java/gui/StringAnalysisTabNew2.java index dec4053..4e7ed97 100755 --- a/src/main/java/gui/StringAnalysisTabNew2.java +++ b/src/main/java/gui/StringAnalysisTabNew2.java @@ -2,21 +2,20 @@ package gui; import static alg.XML_processing.*; import static gui.GUIController.*; -import static gui.Messages.*; import java.io.File; import java.io.UnsupportedEncodingException; import java.util.*; -import java.util.concurrent.*; -import 
java.util.concurrent.atomic.AtomicBoolean; -import java.util.concurrent.atomic.AtomicLong; import java.util.regex.Pattern; +import alg.XML_processing; import javafx.application.HostServices; +import javafx.beans.InvalidationListener; +import javafx.beans.Observable; +import javafx.beans.property.ReadOnlyDoubleWrapper; import javafx.beans.value.ChangeListener; import javafx.beans.value.ObservableValue; import javafx.scene.layout.AnchorPane; -import org.apache.commons.lang3.SerializationUtils; import org.apache.commons.lang3.StringUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -208,6 +207,7 @@ public class StringAnalysisTabNew2 { private ListChangeListener alsoVisualizeListener; private ListChangeListener collocabilityListener; private ChangeListener calculateForListener; + private InvalidationListener progressBarListener; // private static final ObservableList N_GRAM_COMPUTE_FOR_WORDS = FXCollections.observableArrayList("lema", "različnica", "oblikoskladenjska oznaka"); // private static final ObservableList N_GRAM_COMPUTE_FOR_LETTERS = FXCollections.observableArrayList("lema", "različnica"); @@ -306,13 +306,14 @@ public class StringAnalysisTabNew2 { notePunctuations = newValue; logger.info("note punctuations: ", notePunctuations); }); + notePunctuationsChB.setSelected(false); notePunctuationsChB.setTooltip(new Tooltip(I18N.get("message.TOOLTIP_readNotePunctuationsChB"))); displayTaxonomy = false; displayTaxonomyChB.setSelected(false); // set - if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType()) && corpus.getTaxonomy().size() > 0) { + if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType()) && corpus.getObservableListTaxonomy().size() > 0) { displayTaxonomyChB.setDisable(false); displayTaxonomyChB.selectedProperty().addListener((observable, oldValue, newValue) -> { displayTaxonomy = newValue; @@ -515,49 +516,52 @@ public class StringAnalysisTabNew2 { alsoVisualizeCCB.getCheckModel().getCheckedItems().addListener(alsoVisualizeListener); // taxonomy - if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType()) && corpus.getTaxonomy().size() > 0) { - if (taxonomyListener != null){ - taxonomyCCB.getCheckModel().getCheckedItems().removeListener(taxonomyListener); - } - - taxonomyListener = new ListChangeListener() { - boolean changing = true; - - @Override - public void onChanged(ListChangeListener.Change c){ - if(changing) { - ObservableList checkedItems = taxonomyCCB.getCheckModel().getCheckedItems(); - ArrayList checkedItemsTaxonomy = Taxonomy.convertStringListToTaxonomyList(checkedItems); - - Taxonomy.modifyingTaxonomy(taxonomy, checkedItemsTaxonomy, corpus); - - taxonomy = new ArrayList<>(); - taxonomy.addAll(checkedItemsTaxonomy); - - taxonomyCCB.getItems().removeAll(); - taxonomyCCB.getItems().setAll(corpus.getTaxonomy()); - - // taxonomyCCB.getCheckModel().clearChecks(); - changing = false; - taxonomyCCB.getCheckModel().clearChecks(); - for (Taxonomy t : checkedItemsTaxonomy) { - taxonomyCCB.getCheckModel().check(t.toLongNameString()); - } - changing = true; - logger.info(String.format("Selected taxonomy: %s", StringUtils.join(checkedItems, ","))); - } - } - }; - taxonomyCCB.getCheckModel().clearChecks(); + if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType()) && corpus.getObservableListTaxonomy().size() > 0) { taxonomyCCB.setDisable(false); - taxonomyCCB.getItems().removeAll(); - taxonomyCCB.getItems().setAll(corpus.getTaxonomy()); - - 
taxonomyCCB.getCheckModel().getCheckedItems().addListener(taxonomyListener); } else { taxonomyCCB.setDisable(true); } + if (taxonomyListener != null){ + taxonomyCCB.getCheckModel().getCheckedItems().removeListener(taxonomyListener); + } + + taxonomyListener = new ListChangeListener() { + boolean changing = true; + + @Override + public void onChanged(ListChangeListener.Change c){ + if(changing) { + ObservableList checkedItems = taxonomyCCB.getCheckModel().getCheckedItems(); +// ArrayList checkedItemsTaxonomy = Taxonomy.convertStringListToTaxonomyList(checkedItems); +// +// Taxonomy.modifyingTaxonomy(taxonomy, checkedItemsTaxonomy, corpus); + ArrayList checkedItemsTaxonomy = Taxonomy.modifyingTaxonomy(taxonomy, checkedItems, corpus); + + taxonomy = new ArrayList<>(); + taxonomy.addAll(checkedItemsTaxonomy); + + taxonomyCCB.getItems().removeAll(); + taxonomyCCB.getItems().setAll(corpus.getObservableListTaxonomy()); + + // taxonomyCCB.getCheckModel().clearChecks(); + changing = false; + taxonomyCCB.getCheckModel().clearChecks(); + for (Taxonomy t : checkedItemsTaxonomy) { + taxonomyCCB.getCheckModel().check(t.toLongNameString()); + } + changing = true; + logger.info(String.format("Selected taxonomy: %s", StringUtils.join(checkedItems, ","))); + } + } + }; + taxonomyCCB.getCheckModel().clearChecks(); + + taxonomyCCB.getItems().removeAll(); + taxonomyCCB.getItems().setAll(corpus.getObservableListTaxonomy()); + + taxonomyCCB.getCheckModel().getCheckedItems().addListener(taxonomyListener); + // skip skipValueCB.valueProperty().addListener((observable, oldValue, newValue) -> { skipValue = Integer.valueOf(newValue); @@ -738,7 +742,7 @@ public class StringAnalysisTabNew2 { // if ((currentCorpusType != null && currentCorpusType != corpus.getCorpusType())) { // // user changed corpus (by type) or by selection & triggered a rescan of headers // // see if we read taxonomy from headers, otherwise use default values for given corpus -// ObservableList tax = corpus.getTaxonomy(); +// ObservableList tax = corpus.getObservableListTaxonomy(); // taxonomyCCBValues = tax != null ? tax : Taxonomy.getDefaultForComboBox(corpus.getCorpusType()); // // currentCorpusType = corpus.getCorpusType(); @@ -748,7 +752,7 @@ public class StringAnalysisTabNew2 { // } // // // see if we read taxonomy from headers, otherwise use default values for given corpus -// ObservableList tax = corpus.getTaxonomy(); +// ObservableList tax = corpus.getObservableListTaxonomy(); // taxonomyCCBValues = tax != null ? 
tax : Taxonomy.getDefaultForComboBox(corpus.getCorpusType()); // taxonomyCCB.getItems().addAll(taxonomyCCBValues); // @@ -913,16 +917,78 @@ public class StringAnalysisTabNew2 { @SuppressWarnings("Duplicates") @Override protected Void call() throws Exception { - long i = corpusFiles.size(); + if(corpusFiles.size() > 1){ + cancel.setVisible(true); + } + int i = corpusFiles.size(); + Date startTime = new Date(); + Date previousTime = new Date(); + int remainingSeconds = -1; + int corpusSize; + if (statistic.getFilter().getCollocability().size() > 0) { + corpusSize = corpusFiles.size() * 2; + } else { + corpusSize = corpusFiles.size(); + } for (File f : corpusFiles) { - readXML(f.toString(), statisticsOneGrams); + final int iFinal = i; + XML_processing xml_processing = new XML_processing(); i++; - this.updateProgress(i, corpusFiles.size() * 2); - if (statistic.getFilter().getCollocability().size() > 0) { - this.updateMessage(String.format(I18N.get("message.ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y"), i, corpusFiles.size() * 2, f.getName())); + if (corpusFiles.size() > 1) { + if ((new Date()).getTime() - previousTime.getTime() > 500 || remainingSeconds == -1){ + remainingSeconds = (int) (((new Date()).getTime() - startTime.getTime()) * (1.0/i) * (corpusSize - i) / 1000); + previousTime = new Date(); + } + this.updateProgress(i, corpusSize); + this.updateMessage(String.format(I18N.get("message.ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y"), i, corpusSize, f.getName(), remainingSeconds)); +// if (isCancelled()) { +// updateMessage(I18N.get("message.CANCELING_NOTIFICATION")); +// break; +// } } else { - this.updateMessage(String.format(I18N.get("message.ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y"), i, corpusFiles.size(), f.getName())); + if(progressBarListener != null) { + xml_processing.progressProperty().removeListener(progressBarListener); + } + + progressBarListener = new InvalidationListener() { + int remainingSeconds = -1; + Date previousTime = new Date(); + @Override + public void invalidated(Observable observable) { + cancel.setVisible(true); + if ((new Date()).getTime() - previousTime.getTime() > 500 || remainingSeconds == -1){ + remainingSeconds = (int) (((new Date()).getTime() - xml_processing.startTime.getTime()) * + (1.0/(iFinal * 100 + ((ReadOnlyDoubleWrapper) observable).get() + 1)) * + ((corpusSize - iFinal - 1) * 100 + 100 - ((ReadOnlyDoubleWrapper) observable).get()) / 1000); +// System.out.println(((new Date()).getTime() - xml_processing.startTime.getTime())); +// System.out.println((1.0/(iFinal * 100 + ((ReadOnlyDoubleWrapper) observable).get() + 1))); +// System.out.println(((corpusSize - iFinal - 1) * 100 + 100 - ((ReadOnlyDoubleWrapper) observable).get())); +// System.out.println(remainingSeconds); + previousTime = new Date(); + } + xml_processing.isCancelled = isCancelled(); + updateProgress((iFinal * 100) + ((ReadOnlyDoubleWrapper) observable).get() + 1, corpusSize * 100); + updateMessage(String.format(I18N.get("message.ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y"), iFinal + 1, corpusSize, f.getName(), remainingSeconds)); + } + }; + + xml_processing.progressProperty().addListener(progressBarListener); } + xml_processing.isCollocability = true; + xml_processing.readXML(f.toString(), statisticsOneGrams); + xml_processing.isCollocability = false; + if (isCancelled()) { + updateMessage(I18N.get("message.CANCELING_NOTIFICATION")); + break; + } +// readXML(f.toString(), statisticsOneGrams); +// i++; +// this.updateProgress(i, corpusFiles.size() * 2); +// if 
(statistic.getFilter().getCollocability().size() > 0) { +// this.updateMessage(String.format(I18N.get("message.ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y"), i, corpusFiles.size() * 2, f.getName())); +// } else { +// this.updateMessage(String.format(I18N.get("message.ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y"), i, corpusFiles.size(), f.getName())); +// } } return null; @@ -998,8 +1064,6 @@ public class StringAnalysisTabNew2 { task.cancel(); // logger.info("cancel button"); }); - -// cancel.setVisible(true); return task; } @@ -1009,28 +1073,90 @@ public class StringAnalysisTabNew2 { // Task task_collocability = null; Collection corpusFiles = statistic.getCorpus().getDetectedCorpusFiles(); - boolean corpusIsSplit = corpusFiles.size() > 1; final Task task = new Task() { @SuppressWarnings("Duplicates") @Override protected Void call() throws Exception { - long i = 0; + if(corpusFiles.size() > 1){ + cancel.setVisible(true); + } + int i = 0; + Date startTime = new Date(); + Date previousTime = new Date(); + int remainingSeconds = -1; + int corpusSize; + if (statistic.getFilter().getCollocability().size() > 0) { + corpusSize = corpusFiles.size() * 2; + } else { + corpusSize = corpusFiles.size(); + } for (File f : corpusFiles) { - readXML(f.toString(), statistic); + final int iFinal = i; + XML_processing xml_processing = new XML_processing(); i++; + if (corpusFiles.size() > 1) { + if ((new Date()).getTime() - previousTime.getTime() > 500 || remainingSeconds == -1){ + remainingSeconds = (int) (((new Date()).getTime() - startTime.getTime()) * (1.0/i) * (corpusSize - i) / 1000); + previousTime = new Date(); + } + this.updateProgress(i, corpusSize); + this.updateMessage(String.format(I18N.get("message.ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y"), i, corpusSize, f.getName(), remainingSeconds)); +// if (isCancelled()) { +// updateMessage(I18N.get("message.CANCELING_NOTIFICATION")); +// break; +// } + } else { + if(progressBarListener != null) { + xml_processing.progressProperty().removeListener(progressBarListener); + } + + progressBarListener = new InvalidationListener() { + int remainingSeconds = -1; + Date previousTime = new Date(); + @Override + public void invalidated(Observable observable) { + cancel.setVisible(true); + if ((new Date()).getTime() - previousTime.getTime() > 500 || remainingSeconds == -1){ + remainingSeconds = (int) (((new Date()).getTime() - xml_processing.startTime.getTime()) * + (1.0/(iFinal * 100 + ((ReadOnlyDoubleWrapper) observable).get() + 1)) * + ((corpusSize - iFinal - 1) * 100 + 100 - ((ReadOnlyDoubleWrapper) observable).get()) / 1000); +// System.out.println(((new Date()).getTime() - xml_processing.startTime.getTime())); +// System.out.println((1.0/(iFinal * 100 + ((ReadOnlyDoubleWrapper) observable).get())) + 1); +// System.out.println(((corpusSize - iFinal - 1) * 100 + 100 - ((ReadOnlyDoubleWrapper) observable).get())); +// System.out.println(remainingSeconds); + previousTime = new Date(); + } + xml_processing.isCancelled = isCancelled(); + updateProgress((iFinal * 100) + ((ReadOnlyDoubleWrapper) observable).get() + 1, corpusSize * 100); + updateMessage(String.format(I18N.get("message.ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y"), iFinal + 1, corpusSize, f.getName(), remainingSeconds)); + } + }; + + xml_processing.progressProperty().addListener(progressBarListener); + } + xml_processing.readXML(f.toString(), statistic); if (isCancelled()) { updateMessage(I18N.get("message.CANCELING_NOTIFICATION")); break; } - if (statistic.getFilter().getCollocability().size() > 0) { - 
this.updateProgress(i, corpusFiles.size() * 2); - this.updateMessage(String.format(I18N.get("message.ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y"), i, corpusFiles.size() * 2, f.getName())); - } else { - this.updateProgress(i, corpusFiles.size()); - this.updateMessage(String.format(I18N.get("message.ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y"), i, corpusFiles.size(), f.getName())); + if(!(corpusFiles.size() > 1)){ + cancel.setVisible(false); } -// this.updateMessage(String.format(ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y, i, corpusFiles.size() * 2, f.getName())); +// readXML(f.toString(), statistic); +// i++; +// if (isCancelled()) { +// updateMessage(I18N.get("message.CANCELING_NOTIFICATION")); +// break; +// } +// if (statistic.getFilter().getCollocability().size() > 0) { +// this.updateProgress(i, corpusFiles.size() * 2); +// this.updateMessage(String.format(I18N.get("message.ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y"), i, corpusFiles.size() * 2, f.getName())); +// } else { +// this.updateProgress(i, corpusFiles.size()); +// this.updateMessage(String.format(I18N.get("message.ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y"), i, corpusFiles.size(), f.getName())); +// } +//// this.updateMessage(String.format(ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y, i, corpusFiles.size() * 2, f.getName())); } return null; @@ -1106,8 +1232,6 @@ public class StringAnalysisTabNew2 { logger.info("cancel button"); }); - cancel.setVisible(true); - final Thread thread = new Thread(task, "task"); thread.setDaemon(true); thread.start(); diff --git a/src/main/java/gui/WordFormationTab.java b/src/main/java/gui/WordFormationTab.java index 914afb1..4ea87a5 100755 --- a/src/main/java/gui/WordFormationTab.java +++ b/src/main/java/gui/WordFormationTab.java @@ -2,14 +2,10 @@ package gui; import static alg.XML_processing.*; import static gui.GUIController.*; -import static gui.Messages.*; import java.io.File; import java.io.UnsupportedEncodingException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashMap; -import java.util.HashSet; +import java.util.*; import javafx.application.HostServices; import javafx.scene.control.*; @@ -73,11 +69,11 @@ public class WordFormationTab { // taxonomy if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType())) { taxonomyCCB.getItems().removeAll(); - taxonomyCCB.getItems().setAll(corpus.getTaxonomy()); + taxonomyCCB.getItems().setAll(corpus.getObservableListTaxonomy()); taxonomyCCB.getCheckModel().getCheckedItems().addListener((ListChangeListener) c -> { taxonomy = new ArrayList<>(); ObservableList checkedItems = taxonomyCCB.getCheckModel().getCheckedItems(); - ArrayList checkedItemsTaxonomy = Taxonomy.convertStringListToTaxonomyList(checkedItems); + ArrayList checkedItemsTaxonomy = Taxonomy.convertStringListToTaxonomyList(checkedItems, corpus); taxonomy.addAll(checkedItemsTaxonomy); logger.info(String.format("Selected taxonomy: %s", StringUtils.join(checkedItems, ","))); }); @@ -175,7 +171,9 @@ public class WordFormationTab { @SuppressWarnings("Duplicates") @Override protected Void call() throws Exception { - long i = 0; + int i = 0; + Date startTime = new Date(); + Date previousTime = new Date(); for (File f : corpusFiles) { readXML(f.toString(), statistic); i++; diff --git a/src/main/java/gui/WordLevelTab.java b/src/main/java/gui/WordLevelTab.java index 9d83422..d276e03 100755 --- a/src/main/java/gui/WordLevelTab.java +++ b/src/main/java/gui/WordLevelTab.java @@ -1,10 +1,13 @@ package gui; +import alg.XML_processing; import data.*; import 
javafx.application.HostServices; +import javafx.beans.InvalidationListener; +import javafx.beans.Observable; +import javafx.beans.property.ReadOnlyDoubleWrapper; import javafx.beans.value.ChangeListener; import javafx.beans.value.ObservableValue; -import javafx.collections.FXCollections; import javafx.collections.ListChangeListener; import javafx.collections.ObservableList; import javafx.concurrent.Task; @@ -24,7 +27,6 @@ import java.util.regex.Pattern; import static alg.XML_processing.readXML; import static gui.GUIController.showAlert; -import static gui.Messages.*; @SuppressWarnings("Duplicates") public class WordLevelTab { @@ -196,6 +198,7 @@ public class WordLevelTab { private ListChangeListener taxonomyListener; private ListChangeListener alsoVisualizeListener; private ChangeListener calculateForListener; + private InvalidationListener progressBarListener; // private static final ObservableList N_GRAM_COMPUTE_FOR_WORDS = FXCollections.observableArrayList("lema", "različnica"); // private static final ObservableList N_GRAM_COMPUTE_FOR_LETTERS = FXCollections.observableArrayList("lema", "različnica"); @@ -509,54 +512,57 @@ public class WordLevelTab { alsoVisualizeCCB.getCheckModel().getCheckedItems().addListener(alsoVisualizeListener); // taxonomy - if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType()) && corpus.getTaxonomy().size() > 0) { - if (taxonomyListener != null){ - taxonomyCCB.getCheckModel().getCheckedItems().removeListener(taxonomyListener); - } - - taxonomyListener = new ListChangeListener() { - boolean changing = true; - - @Override - public void onChanged(ListChangeListener.Change c){ - if(changing) { - ObservableList checkedItems = taxonomyCCB.getCheckModel().getCheckedItems(); - ArrayList checkedItemsTaxonomy = Taxonomy.convertStringListToTaxonomyList(checkedItems); - - Taxonomy.modifyingTaxonomy(taxonomy, checkedItemsTaxonomy, corpus); - - taxonomy = new ArrayList<>(); - taxonomy.addAll(checkedItemsTaxonomy); - - taxonomyCCB.getItems().removeAll(); - taxonomyCCB.getItems().setAll(corpus.getTaxonomy()); - - // taxonomyCCB.getCheckModel().clearChecks(); - changing = false; - taxonomyCCB.getCheckModel().clearChecks(); - for (Taxonomy t : checkedItemsTaxonomy) { - taxonomyCCB.getCheckModel().check(t.toLongNameString()); - } - changing = true; - logger.info(String.format("Selected taxonomy: %s", StringUtils.join(checkedItems, ","))); - } - } - }; - - taxonomyCCB.getCheckModel().clearChecks(); + if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType()) && corpus.getObservableListTaxonomy().size() > 0) { taxonomyCCB.setDisable(false); - taxonomyCCB.getItems().removeAll(); - taxonomyCCB.getItems().setAll(corpus.getTaxonomy()); - - taxonomyCCB.getCheckModel().getCheckedItems().addListener(taxonomyListener); } else { taxonomyCCB.setDisable(true); } + if (taxonomyListener != null){ + taxonomyCCB.getCheckModel().getCheckedItems().removeListener(taxonomyListener); + } + + taxonomyListener = new ListChangeListener() { + boolean changing = true; + + @Override + public void onChanged(ListChangeListener.Change c){ + if(changing) { + ObservableList checkedItems = taxonomyCCB.getCheckModel().getCheckedItems(); +// ArrayList checkedItemsTaxonomy = Taxonomy.convertStringListToTaxonomyList(checkedItems); + + ArrayList checkedItemsTaxonomy = Taxonomy.modifyingTaxonomy(taxonomy, checkedItems, corpus); +// Taxonomy.modifyingTaxonomy(taxonomy, checkedItemsTaxonomy, corpus); + + taxonomy = new ArrayList<>(); + taxonomy.addAll(checkedItemsTaxonomy); + + 
taxonomyCCB.getItems().removeAll(); + taxonomyCCB.getItems().setAll(corpus.getObservableListTaxonomy()); + + // taxonomyCCB.getCheckModel().clearChecks(); + changing = false; + taxonomyCCB.getCheckModel().clearChecks(); + for (Taxonomy t : checkedItemsTaxonomy) { + taxonomyCCB.getCheckModel().check(t.toLongNameString()); + } + changing = true; + logger.info(String.format("Selected taxonomy: %s", StringUtils.join(checkedItems, ","))); + } + } + }; + + taxonomyCCB.getCheckModel().clearChecks(); + + taxonomyCCB.getItems().removeAll(); + taxonomyCCB.getItems().setAll(corpus.getObservableListTaxonomy()); + + taxonomyCCB.getCheckModel().getCheckedItems().addListener(taxonomyListener); + displayTaxonomy = false; displayTaxonomyChB.setSelected(false); // set - if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType()) && corpus.getTaxonomy().size() > 0) { + if (Tax.getCorpusTypesWithTaxonomy().contains(corpus.getCorpusType()) && corpus.getObservableListTaxonomy().size() > 0) { displayTaxonomyChB.setDisable(false); displayTaxonomyChB.selectedProperty().addListener((observable, oldValue, newValue) -> { displayTaxonomy = newValue; @@ -714,7 +720,7 @@ public class WordLevelTab { // if ((currentCorpusType != null && currentCorpusType != corpus.getCorpusType())) { // // user changed corpus (by type) or by selection & triggered a rescan of headers // // see if we read taxonomy from headers, otherwise use default values for given corpus -// ObservableList tax = corpus.getTaxonomy(); +// ObservableList tax = corpus.getObservableListTaxonomy(); // taxonomyCCBValues = tax != null ? tax : Taxonomy.getDefaultForComboBox(corpus.getCorpusType()); // // currentCorpusType = corpus.getCorpusType(); @@ -724,7 +730,7 @@ public class WordLevelTab { // } // // // see if we read taxonomy from headers, otherwise use default values for given corpus -// ObservableList tax = corpus.getTaxonomy(); +// ObservableList tax = corpus.getObservableListTaxonomy(); // taxonomyCCBValues = tax != null ? 
tax : Taxonomy.getDefaultForComboBox(corpus.getCorpusType()); // taxonomyCCB.getItems().addAll(taxonomyCCBValues); // @@ -879,22 +885,63 @@ public class WordLevelTab { logger.info("Started execution: ", statistic.getFilter()); Collection corpusFiles = statistic.getCorpus().getDetectedCorpusFiles(); - boolean corpusIsSplit = corpusFiles.size() > 1; final Task task = new Task() { @SuppressWarnings("Duplicates") @Override protected Void call() throws Exception { - long i = 0; + if(corpusFiles.size() > 1){ + cancel.setVisible(true); + } + int i = 0; + Date startTime = new Date(); + Date previousTime = new Date(); + int remainingSeconds = -1; for (File f : corpusFiles) { - readXML(f.toString(), statistic); + final int iFinal = i; + XML_processing xml_processing = new XML_processing(); i++; if (isCancelled()) { updateMessage(I18N.get("message.CANCELING_NOTIFICATION")); break; } - this.updateProgress(i, corpusFiles.size()); - this.updateMessage(String.format(I18N.get("message.ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y"), i, corpusFiles.size(), f.getName())); + if (corpusFiles.size() > 1) { + if ((new Date()).getTime() - previousTime.getTime() > 500 || remainingSeconds == -1){ + remainingSeconds = (int) (((new Date()).getTime() - startTime.getTime()) * (1.0/i) * (corpusFiles.size() - i) / 1000); + previousTime = new Date(); + } + this.updateProgress(i, corpusFiles.size()); + this.updateMessage(String.format(I18N.get("message.ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y"), i, corpusFiles.size(), f.getName(), remainingSeconds)); + } else { + if(progressBarListener != null) { + xml_processing.progressProperty().removeListener(progressBarListener); + } + + progressBarListener = new InvalidationListener() { + int remainingSeconds = -1; + Date previousTime = new Date(); + @Override + public void invalidated(Observable observable) { + cancel.setVisible(true); + if ((new Date()).getTime() - previousTime.getTime() > 500 || remainingSeconds == -1){ + remainingSeconds = (int) (((new Date()).getTime() - xml_processing.startTime.getTime()) * + (1.0/(iFinal * 100 + ((ReadOnlyDoubleWrapper) observable).get() + 1)) * + ((corpusFiles.size() - iFinal - 1) * 100 + 100 - ((ReadOnlyDoubleWrapper) observable).get()) / 1000); + previousTime = new Date(); + } + xml_processing.isCancelled = isCancelled(); + updateProgress((iFinal * 100) + ((ReadOnlyDoubleWrapper) observable).get() + 1, corpusFiles.size() * 100); + updateMessage(String.format(I18N.get("message.ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y"), iFinal + 1, corpusFiles.size(), f.getName(), remainingSeconds)); + } + }; + + xml_processing.progressProperty().addListener(progressBarListener); + } + xml_processing.readXML(f.toString(), statistic); + if (isCancelled()) { + updateMessage(I18N.get("message.CANCELING_NOTIFICATION")); + break; + } } return null; @@ -951,7 +998,6 @@ public class WordLevelTab { logger.info("cancel button"); }); - cancel.setVisible(true); final Thread thread = new Thread(task, "task"); thread.setDaemon(true); thread.start(); diff --git a/src/main/java/util/Export.java b/src/main/java/util/Export.java index fafac1b..f65f013 100755 --- a/src/main/java/util/Export.java +++ b/src/main/java/util/Export.java @@ -111,8 +111,8 @@ public class Export { } } - headerInfoBlock.put(filter.getCalculateFor().totalSumString(filter.getNgramValue()), String.valueOf(num_taxonomy_frequencies.get(Taxonomy.TOTAL).longValue())); - headerInfoBlock.put(filter.getCalculateFor().foundSumString(filter.getNgramValue()), 
String.valueOf(num_selected_taxonomy_frequencies.get(Taxonomy.TOTAL).longValue())); + headerInfoBlock.put(filter.getCalculateFor().totalSumString(filter.getNgramValue()), String.valueOf(num_taxonomy_frequencies.get(statistics.getCorpus().getTotal()).longValue())); + headerInfoBlock.put(filter.getCalculateFor().foundSumString(filter.getNgramValue()), String.valueOf(num_selected_taxonomy_frequencies.get(statistics.getCorpus().getTotal()).longValue())); // headerInfoBlock.put(filter.getCalculateFor().toMetadataString(), String.valueOf(num_frequencies)); for (CalculateFor otherKey : filter.getMultipleKeys()) { @@ -134,7 +134,7 @@ public class Export { } for (Taxonomy key : taxonomyResults.keySet()) { - if(!key.equals(Taxonomy.TOTAL) && num_taxonomy_frequencies.containsKey(key) && num_taxonomy_frequencies.get(key).longValue() > 0) { + if(!key.equals(statistics.getCorpus().getTotal()) && num_taxonomy_frequencies.containsKey(key) && num_taxonomy_frequencies.get(key).longValue() > 0) { FILE_HEADER_AL.add(I18N.get("exportTable.absoluteFrequency") + " [" + key.toString() + "]"); FILE_HEADER_AL.add(I18N.get("exportTable.percentage") + " [" + key.toString() + "]"); FILE_HEADER_AL.add(I18N.get("exportTable.relativeFrequency") + " [" + key.toString() + "]"); @@ -280,10 +280,10 @@ public class Export { dataEntry.add(e.getValue().toString()); - dataEntry.add(formatNumberAsPercent((double) e.getValue() / num_selected_taxonomy_frequencies.get(Taxonomy.TOTAL))); - dataEntry.add(String.format("%.2f", ((double) e.getValue() * 1000000)/num_taxonomy_frequencies.get(Taxonomy.TOTAL).longValue())); + dataEntry.add(formatNumberAsPercent((double) e.getValue() / num_selected_taxonomy_frequencies.get(statistics.getCorpus().getTotal()))); + dataEntry.add(String.format("%.2f", ((double) e.getValue() * 1000000)/num_taxonomy_frequencies.get(statistics.getCorpus().getTotal()).longValue())); for (Taxonomy key : taxonomyResults.keySet()){ - if(!key.equals(Taxonomy.TOTAL) && num_taxonomy_frequencies.containsKey(key) && num_taxonomy_frequencies.get(key).longValue() > 0) { + if(!key.equals(statistics.getCorpus().getTotal()) && num_taxonomy_frequencies.containsKey(key) && num_taxonomy_frequencies.get(key).longValue() > 0) { AtomicLong frequency = taxonomyResults.get(key).get(e.getKey()); dataEntry.add(frequency.toString()); dataEntry.add(formatNumberAsPercent((double) frequency.get() / num_selected_taxonomy_frequencies.get(key))); diff --git a/src/main/resources/message_en.properties b/src/main/resources/message_en.properties index aafa86e..5b8f2c8 100644 --- a/src/main/resources/message_en.properties +++ b/src/main/resources/message_en.properties @@ -118,6 +118,7 @@ message.WARNING_NO_SOLAR_FILTERS_FOUND=We weren't able to read filters from corp message.ERROR_WHILE_EXECUTING=Error in program execution. message.ERROR_WHILE_SAVING_RESULTS_TO_CSV=Error while saving results. message.ERROR_NOT_ENOUGH_MEMORY=You do not have sufficient RAM for analyzing such amount of data. You can try changing filters. +message.ERROR_NO_REGI_FILE_FOUND=Missing file \"%s\". message.MISSING_NGRAM_LEVEL=N-gram level message.MISSING_CALCULATE_FOR=Calculate for @@ -132,7 +133,7 @@ message.NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS=Analysis completed, however n message.RESULTS_PATH_SET_TO_DEFAULT=Save location is set on corpus location. message.NOTIFICATION_ANALYSIS_CANCELED=Analysis was canceled.
-message.ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y=Analyzing file %d of %d (%s) +message.ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y=Analyzing file %d of %d (%s) - Estimated time remaining %d s message.CANCELING_NOTIFICATION=Canceled message.LABEL_CORPUS_LOCATION_NOT_SET=Corpus location is not set diff --git a/src/main/resources/message_sl.properties b/src/main/resources/message_sl.properties index 3f3424c..bb6f142 100644 --- a/src/main/resources/message_sl.properties +++ b/src/main/resources/message_sl.properties @@ -118,6 +118,7 @@ message.WARNING_NO_SOLAR_FILTERS_FOUND=Iz korpusnih datotek ni bilo moč razbrat message.ERROR_WHILE_EXECUTING=Prišlo je do napake med izvajanjem. message.ERROR_WHILE_SAVING_RESULTS_TO_CSV=Prišlo je do napake med shranjevanje rezultatov. message.ERROR_NOT_ENOUGH_MEMORY=Na voljo imate premalo pomnilnika (RAM-a) za analizo takšne količine podatkov. +message.ERROR_NO_REGI_FILE_FOUND=Manjka datoteka \"%s\". message.MISSING_NGRAM_LEVEL=N-gram nivo message.MISSING_CALCULATE_FOR=Izračunaj za @@ -132,7 +133,7 @@ message.NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS=Analiza je zaključena, venda message.RESULTS_PATH_SET_TO_DEFAULT=Lokacija za shranjevanje rezultatov je nastavljena na lokacijo korpusa. message.NOTIFICATION_ANALYSIS_CANCELED=Analiziranje je bilo prekinjeno. -message.ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y=Analiziram datoteko %d od %d (%s) +message.ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y=Analiziram datoteko %d od %d (%s) - Preostali čas %d s message.CANCELING_NOTIFICATION=Prekinjeno message.LABEL_CORPUS_LOCATION_NOT_SET=Lokacija korpusa ni nastavljena
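
The sketches below illustrate, in rough file order, the techniques this patch introduces. None of the code is from the repository; all class, method, and file names in the sketches are invented for illustration unless they also appear in the hunks above.

Taxonomy.factory and factoryLongName drop the old enum-backed construction (the commented-out `new Taxonomy(tax, false)`) in favour of a linear scan over the corpus's own taxonomy list, returning null when nothing matches. The same lookup pattern, reduced to plain collections; requires Java 16+ for the record:

    import java.util.Arrays;
    import java.util.List;

    public class TaxonomyLookup {
        // stand-in for the project's Taxonomy class
        record Entry(String name, String longName) {}

        // returns the corpus entry whose short name matches, or null (as in the patch)
        static Entry factory(String name, List<Entry> corpusTaxonomy) {
            for (Entry e : corpusTaxonomy) {
                if (name.equals(e.name())) {
                    return e;
                }
            }
            return null;
        }

        public static void main(String[] args) {
            List<Entry> tax = Arrays.asList(
                    new Entry("tisk", "tisk - vse"),
                    new Entry("internet", "internet - vse"));
            System.out.println(factory("internet", tax)); // Entry[name=internet, longName=internet - vse]
            System.out.println(factory("radio", tax));    // null
        }
    }

Returning null instead of constructing a fallback pushes null-handling onto the callers, which is why taxonomyToTaxonomyEnum above bails out with null as soon as any entry lacks a taxonomyEnum.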
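
Every analysis tab now installs a taxonomy ListChangeListener that rewrites the CheckComboBox's checked items from inside its own onChanged callback, guarded by a `changing` flag so the nested clearChecks/check calls do not re-trigger the handler. A minimal sketch of that guard, with a plain ObservableList standing in for the check model:

    import javafx.collections.FXCollections;
    import javafx.collections.ListChangeListener;
    import javafx.collections.ObservableList;

    public class ReentrancyGuard {
        public static void main(String[] args) {
            ObservableList<String> checked = FXCollections.observableArrayList();
            checked.addListener(new ListChangeListener<String>() {
                boolean changing = true;

                @Override
                public void onChanged(Change<? extends String> c) {
                    if (changing) {
                        changing = false;            // lower the flag before mutating the list ourselves
                        checked.add("derived-item"); // re-enters onChanged, but is ignored
                        changing = true;
                        System.out.println("handled change: " + checked);
                    }
                }
            });
            checked.add("user-item"); // prints: handled change: [user-item, derived-item]
        }
    }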
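
All of the reworked Task bodies estimate remaining time the same way: elapsed time divided by files finished, times files left, recomputed at most every 500 ms so the message label does not flicker. That estimate in isolation; the patch inlines it, the class here is just packaging:

    import java.util.Date;

    public class EtaEstimator {
        private final Date startTime = new Date();
        private Date previousTime = new Date();
        private int remainingSeconds = -1;

        // call after finishing file i of n (i starts at 1, matching the patch's post-increment)
        public int estimate(int i, int n) {
            long now = new Date().getTime();
            if (now - previousTime.getTime() > 500 || remainingSeconds == -1) {
                long elapsedMs = now - startTime.getTime();
                // average ms per file so far, times files left, converted to seconds
                remainingSeconds = (int) (elapsedMs * (1.0 / i) * (n - i) / 1000);
                previousTime = new Date();
            }
            return remainingSeconds;
        }
    }

In the per-file listener variant the same formula runs in percent units: progress `iFinal * 100 + percent` out of `corpusFiles.size() * 100`.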
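
XML_processing now publishes its own 0-100 progress through a ReadOnlyDoubleWrapper, and for single-file corpora the tabs subscribe with an InvalidationListener that rescales it into the Task's overall range (the patch reads the value by casting the Observable back to the wrapper). A stripped-down sketch of that wiring, runnable without launching the JavaFX toolkit:

    import javafx.beans.Observable;
    import javafx.beans.property.ReadOnlyDoubleProperty;
    import javafx.beans.property.ReadOnlyDoubleWrapper;

    public class ProgressWiring {
        private final ReadOnlyDoubleWrapper progress = new ReadOnlyDoubleWrapper();

        public ReadOnlyDoubleProperty progressProperty() {
            return progress.getReadOnlyProperty();
        }

        public static void main(String[] args) {
            int fileIndex = 2, totalFiles = 5; // pretend we are parsing the third of five files
            ProgressWiring worker = new ProgressWiring();
            worker.progressProperty().addListener((Observable o) -> {
                double percent = ((ReadOnlyDoubleProperty) o).get(); // 0..100 within this file
                double overall = (fileIndex * 100 + percent) / (totalFiles * 100);
                System.out.printf("overall: %.2f%n", overall);
            });
            for (int p = 25; p <= 100; p += 25) {
                worker.progress.set(p); // each new value invalidates the property and fires the listener
            }
        }
    }

Note that the patch returns the wrapper itself from progressProperty(); returning getReadOnlyProperty() as here keeps the setter out of callers' reach, which is the usual JavaFX convention.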
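
checkRegiFile pairs every .vert file with a .regi registry file by rewriting the suffix and probing it with commons-io's LineIterator, alerting on the first miss. The same check with the GUI alert reduced to a boolean:

    import java.io.File;
    import java.io.IOException;
    import java.util.Collection;
    import org.apache.commons.io.FileUtils;
    import org.apache.commons.io.LineIterator;

    public class RegiCheck {
        // true only if every .vert file has a readable .regi companion next to it
        static boolean checkRegiFiles(Collection<File> vertFiles) {
            for (File file : vertFiles) {
                // swap the "vert" suffix for "regi", keeping the rest of the path
                String regiPath = file.getAbsolutePath()
                        .substring(0, file.getAbsolutePath().length() - 4) + "regi";
                try {
                    LineIterator it = FileUtils.lineIterator(new File(regiPath), "UTF-8");
                    LineIterator.closeQuietly(it);
                } catch (IOException e) {
                    return false; // the patch shows an ERROR_NO_REGI_FILE_FOUND alert here
                }
            }
            return true;
        }
    }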
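
detectCorpusType can no longer rely on the title alone for Gigafida, since Gigafida 2.0 ships under the same name; the patch reads the `edition` tag from the XML header and branches on its numeric value. The branch in isolation, with readXMLHeaderTag stubbed away since its body is outside these hunks:

    public class EditionBranch {
        enum CorpusType { GIGAFIDA, GIGAFIDA2 }

        // edition comes from XML_processing.readXMLHeaderTag(path, "edition") in the patch
        static CorpusType fromEdition(String edition) {
            return Double.parseDouble(edition) < 2.0 ? CorpusType.GIGAFIDA : CorpusType.GIGAFIDA2;
        }

        public static void main(String[] args) {
            System.out.println(fromEdition("1.1")); // GIGAFIDA
            System.out.println(fromEdition("2.0")); // GIGAFIDA2
        }
    }

The patch's Double.valueOf behaves the same after unboxing; either way, a missing or non-numeric edition tag would throw at this point, and the visible hunk adds no catch for it.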
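
Export.java swaps the Taxonomy.TOTAL constant for the corpus's own total entry (statistics.getCorpus().getTotal()) as the map key, but the exported numbers are computed as before: a share of the selected total and a per-million relative frequency. The arithmetic on its own:

    public class RelativeFrequency {
        // frequency per million tokens, formatted as in the export (two decimals)
        static String perMillion(long occurrences, long corpusTotal) {
            return String.format("%.2f", ((double) occurrences * 1000000) / corpusTotal);
        }

        public static void main(String[] args) {
            // 1234 hits in a 56-million-token corpus
            System.out.println(perMillion(1234, 56000000L)); // 22.04 (decimal separator is locale-dependent)
        }
    }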
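
Both ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y templates gain a fourth %d for the remaining-seconds estimate, which is why every updateMessage call in the tabs now passes four arguments. A quick check with the English template verbatim (the file name is made up):

    public class MessageFormatCheck {
        public static void main(String[] args) {
            String template = "Analyzing file %d of %d (%s) - Estimated time remaining %d s";
            System.out.println(String.format(template, 3, 10, "F0006347.xml", 42));
            // -> Analyzing file 3 of 10 (F0006347.xml) - Estimated time remaining 42 s
        }
    }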