Project copied
This commit is contained in:
15
src/main/java/alg/Common.java
Normal file
15
src/main/java/alg/Common.java
Normal file
@@ -0,0 +1,15 @@
|
||||
package alg;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
|
||||
public class Common {
	/**
	 * Increments the occurrence counter stored in {@code map} under key {@code o},
	 * inserting a fresh counter initialized to 1 when the key is not yet present.
	 * Safe for concurrent maps, since it relies on {@link Map#putIfAbsent} plus
	 * {@link AtomicLong#incrementAndGet} rather than a check-then-act sequence.
	 *
	 * @param map map of keys to occurrence counters
	 * @param o   key whose counter should be incremented
	 */
	public static <K, V> void updateMap(Map<K, AtomicLong> map, K o) {
		// putIfAbsent returns the previously mapped counter (or null when we just
		// inserted a new one), so reuse it directly instead of doing a redundant
		// second lookup via map.get(o)
		AtomicLong existing = map.putIfAbsent(o, new AtomicLong(1));

		if (existing != null) {
			existing.incrementAndGet();
		}
	}
}
|
||||
794
src/main/java/alg/XML_processing.java
Normal file
794
src/main/java/alg/XML_processing.java
Normal file
@@ -0,0 +1,794 @@
|
||||
package alg;
|
||||
|
||||
import static data.Enums.solar.SolarFilters.*;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.ForkJoinPool;
|
||||
|
||||
import javax.xml.namespace.QName;
|
||||
import javax.xml.stream.XMLEventReader;
|
||||
import javax.xml.stream.XMLInputFactory;
|
||||
import javax.xml.stream.XMLStreamConstants;
|
||||
import javax.xml.stream.XMLStreamException;
|
||||
import javax.xml.stream.events.*;
|
||||
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
|
||||
import data.*;
|
||||
import gui.ValidationUtil;
|
||||
|
||||
public class XML_processing {
|
||||
public final static org.apache.logging.log4j.Logger logger = LogManager.getLogger(XML_processing.class);
|
||||
|
||||
// public static void processCorpus(Statistics stats) {
|
||||
// // we can preset the list's size, so there won't be a need to resize it
|
||||
// List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT);
|
||||
//
|
||||
// int i = 0;
|
||||
// for (File f : Settings.corpus) {
|
||||
// i++;
|
||||
// readXML(f.toString(), stats);
|
||||
// }
|
||||
// }
|
||||
|
||||
// public static void readXML(String path, Statistics stats) {
|
||||
// if (stats.getCorpusType() == CorpusType.GIGAFIDA) {
|
||||
// readXMLGigafida(path, stats);
|
||||
// } else if (stats.getCorpusType() == CorpusType.GOS) {
|
||||
// readXMLGos(path, stats);
|
||||
// } else if (stats.getCorpusType() == CorpusType.SOLAR) {
|
||||
// readXMLSolar(path, stats);
|
||||
// }
|
||||
// }
|
||||
|
||||
public static void readXML(String path, StatisticsNew stats) {
|
||||
if (stats.getCorpus().getCorpusType() == CorpusType.GIGAFIDA
|
||||
|| stats.getCorpus().getCorpusType() == CorpusType.CCKRES) {
|
||||
readXMLGigafida(path, stats);
|
||||
} else if (stats.getCorpus().getCorpusType() == CorpusType.GOS) {
|
||||
readXMLGos(path, stats);
|
||||
} else if (stats.getCorpus().getCorpusType() == CorpusType.SOLAR) {
|
||||
readXMLSolar(path, stats);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads and returns the value of a passed header tag or an empty string.
|
||||
* E.g. title tag, for discerning the corpus' type.
|
||||
* Notice: returns only the value of the first occurrence of a given tag name.
|
||||
*/
|
||||
public static String readXMLHeaderTag(String path, String tag) {
|
||||
XMLInputFactory factory = XMLInputFactory.newInstance();
|
||||
XMLEventReader eventReader = null;
|
||||
|
||||
try {
|
||||
eventReader = factory.createXMLEventReader(new FileInputStream(path));
|
||||
while (eventReader.hasNext()) {
|
||||
XMLEvent xmlEvent = eventReader.nextEvent();
|
||||
if (xmlEvent.isStartElement()) {
|
||||
StartElement startElement = xmlEvent.asStartElement();
|
||||
String var = startElement.getName().getLocalPart();
|
||||
|
||||
if (var.equalsIgnoreCase(tag)) {
|
||||
return eventReader.nextEvent().asCharacters().getData();
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (FileNotFoundException | XMLStreamException e) {
|
||||
e.printStackTrace();
|
||||
} finally {
|
||||
if (eventReader != null) {
|
||||
try {
|
||||
eventReader.close();
|
||||
} catch (XMLStreamException e) {
|
||||
logger.error("closing stream", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
private static void fj(List<Sentence> corpus, StatisticsNew stats) {
|
||||
ForkJoinPool pool = new ForkJoinPool();
|
||||
|
||||
if (stats.getFilter().getAl() == AnalysisLevel.STRING_LEVEL) {
|
||||
alg.ngram.ForkJoin wc = new alg.ngram.ForkJoin(corpus, stats);
|
||||
pool.invoke(wc);
|
||||
} else if (stats.getFilter().getAl() == AnalysisLevel.WORD_LEVEL) {
|
||||
alg.word.ForkJoin wc = new alg.word.ForkJoin(corpus, stats);
|
||||
pool.invoke(wc);
|
||||
} else {
|
||||
// TODO:
|
||||
// alg.inflectedJOS.ForkJoin wc = new alg.inflectedJOS.ForkJoin(corpus, stats);
|
||||
// pool.invoke(wc);
|
||||
}
|
||||
}
|
||||
|
||||
// public static void readXMLGos(String path, Statistics stats) {
|
||||
// boolean in_word = false;
|
||||
// String taksonomija = "";
|
||||
// String lemma = "";
|
||||
// String msd = "";
|
||||
// String type = stats.isGosOrthMode() ? "orth" : "norm"; // orth & norm
|
||||
//
|
||||
// List<Word> stavek = new ArrayList<>();
|
||||
// List<Sentence> corpus = new ArrayList<>();
|
||||
// String sentenceDelimiter = "seg";
|
||||
// String taxonomyPrefix = "gos.";
|
||||
//
|
||||
// try {
|
||||
// XMLInputFactory factory = XMLInputFactory.newInstance();
|
||||
// XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path));
|
||||
//
|
||||
// while (eventReader.hasNext()) {
|
||||
// XMLEvent event = eventReader.nextEvent();
|
||||
//
|
||||
// switch (event.getEventType()) {
|
||||
// case XMLStreamConstants.START_ELEMENT:
|
||||
//
|
||||
// StartElement startElement = event.asStartElement();
|
||||
// String qName = startElement.getName().getLocalPart();
|
||||
//
|
||||
// // "word" node
|
||||
// if (qName.equals("w")) {
|
||||
// in_word = true;
|
||||
//
|
||||
// if (type.equals("norm")) {
|
||||
// // make sure we're looking at <w lemma...> and not <w type...>
|
||||
// Iterator var = startElement.getAttributes();
|
||||
// ArrayList<Object> attributes = new ArrayList<>();
|
||||
// while (var.hasNext()) {
|
||||
// attributes.add(var.next());
|
||||
// }
|
||||
//
|
||||
// if (attributes.contains("msd")) {
|
||||
// msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
|
||||
// } else {
|
||||
// msd = null;
|
||||
// }
|
||||
//
|
||||
// if (attributes.contains("lemma")) {
|
||||
// lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// // taxonomy node
|
||||
// else if (qName.equalsIgnoreCase("catRef")) {
|
||||
// // there are some term nodes at the beginning that are of no interest to us
|
||||
// // they differ by not having the attribute "ref", so test will equal null
|
||||
// Attribute test = startElement.getAttributeByName(QName.valueOf("target"));
|
||||
//
|
||||
// if (test != null) {
|
||||
// // keep only taxonomy properties
|
||||
// taksonomija = String.valueOf(test.getValue()).replace(taxonomyPrefix, "");
|
||||
// }
|
||||
// } else if (qName.equalsIgnoreCase("div")) {
|
||||
// type = String.valueOf(startElement.getAttributeByName(QName.valueOf("type")).getValue());
|
||||
//
|
||||
// }
|
||||
// break;
|
||||
//
|
||||
// case XMLStreamConstants.CHARACTERS:
|
||||
// Characters characters = event.asCharacters();
|
||||
//
|
||||
// // "word" node value
|
||||
// if (in_word) {
|
||||
// if (type.equals("norm") && msd != null) {
|
||||
// stavek.add(new Word(characters.getData(), lemma, msd));
|
||||
// } else {
|
||||
// stavek.add(new Word(characters.getData()));
|
||||
// }
|
||||
//
|
||||
// in_word = false;
|
||||
// }
|
||||
// break;
|
||||
//
|
||||
// case XMLStreamConstants.END_ELEMENT:
|
||||
// EndElement endElement = event.asEndElement();
|
||||
//
|
||||
// // parser reached end of the current sentence
|
||||
// if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
|
||||
// // add sentence to corpus
|
||||
// corpus.add(new Sentence(stavek, taksonomija, type));
|
||||
// // and start a new one
|
||||
// stavek = new ArrayList<>();
|
||||
//
|
||||
// /* Invoke Fork-Join when we reach maximum limit of
|
||||
// * sentences (because we can't read everything to
|
||||
// * memory) or we reach the end of the file.
|
||||
// */
|
||||
// if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
|
||||
// fj(corpus, stats);
|
||||
// // empty the current corpus, since we don't need
|
||||
// // the data anymore
|
||||
// corpus.clear();
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// // backup
|
||||
// if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
|
||||
// fj(corpus, stats);
|
||||
// corpus.clear();
|
||||
// }
|
||||
//
|
||||
// break;
|
||||
// }
|
||||
// }
|
||||
// } catch (FileNotFoundException | XMLStreamException e) {
|
||||
// e.printStackTrace();
|
||||
// }
|
||||
// }
|
||||
|
||||
@SuppressWarnings("unused")
|
||||
public static void readXMLSolar(String path, StatisticsNew stats) {
|
||||
boolean in_word = false;
|
||||
String lemma = "";
|
||||
String msd = "";
|
||||
|
||||
List<Word> stavek = new ArrayList<>();
|
||||
List<Sentence> corpus = new ArrayList<>();
|
||||
|
||||
// used for filter
|
||||
Set<String> headTags = new HashSet<>(Arrays.asList("sola", "predmet", "razred", "regija", "tip", "leto"));
|
||||
Map<String, String> headBlock = null;
|
||||
boolean includeThisBlock = false;
|
||||
|
||||
try {
|
||||
XMLInputFactory factory = XMLInputFactory.newInstance();
|
||||
XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path));
|
||||
|
||||
while (eventReader.hasNext()) {
|
||||
XMLEvent event = eventReader.nextEvent();
|
||||
|
||||
switch (event.getEventType()) {
|
||||
case XMLStreamConstants.START_ELEMENT:
|
||||
|
||||
StartElement startElement = event.asStartElement();
|
||||
// System.out.println(String.format("%s", startElement.toString()));
|
||||
String qName = startElement.getName().getLocalPart();
|
||||
|
||||
// "word" node
|
||||
if (qName.equals("w3")) {
|
||||
in_word = true;
|
||||
|
||||
msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
|
||||
lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
|
||||
} else if (qName.equals("c3")) {
|
||||
String c3Content = eventReader.nextEvent().asCharacters().getData();
|
||||
|
||||
if (c3Content.equals(".") && includeThisBlock) {
|
||||
// add sentence to corpus
|
||||
corpus.add(new Sentence(stavek));
|
||||
// and start a new one
|
||||
stavek = new ArrayList<>();
|
||||
|
||||
/* Invoke Fork-Join when we reach maximum limit of
|
||||
* sentences (because we can't read everything to
|
||||
* memory) or we reach the end of the file.
|
||||
*/
|
||||
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
|
||||
fj(corpus, stats);
|
||||
// empty the current corpus, since we don't need
|
||||
// the data anymore
|
||||
corpus.clear();
|
||||
}
|
||||
}
|
||||
} else if (headTags.contains(qName)) {
|
||||
String tagContent = eventReader.nextEvent().asCharacters().getData();
|
||||
headBlock.put(qName, tagContent);
|
||||
} else if (qName.equals("head")) {
|
||||
headBlock = new HashMap<>();
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
case XMLStreamConstants.CHARACTERS:
|
||||
Characters characters = event.asCharacters();
|
||||
|
||||
// "word" node value
|
||||
if (in_word) {
|
||||
stavek.add(new Word(characters.getData(), lemma, msd));
|
||||
in_word = false;
|
||||
}
|
||||
break;
|
||||
|
||||
case XMLStreamConstants.END_ELEMENT:
|
||||
EndElement endElement = event.asEndElement();
|
||||
String qNameEnd = endElement.getName().getLocalPart();
|
||||
|
||||
if (qNameEnd.equals("head")) {
|
||||
// validate and set boolean
|
||||
if (validateHeadBlock(headBlock, stats.getFilter().getSolarFilters())) {
|
||||
includeThisBlock = true;
|
||||
}
|
||||
} else if (qNameEnd.equals("body")) {
|
||||
// new block, reset filter status
|
||||
includeThisBlock = false;
|
||||
}
|
||||
|
||||
// backup
|
||||
if (endElement.getName().getLocalPart().equalsIgnoreCase("korpus")) {
|
||||
fj(corpus, stats);
|
||||
corpus.clear();
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
} catch (FileNotFoundException | XMLStreamException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param readHeadBlock block of tags read from the corpus
|
||||
* @param userSetFilter tags with values set by the user
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
private static boolean validateHeadBlock(Map<String, String> readHeadBlock, HashMap<String, HashSet<String>> userSetFilter) {
|
||||
boolean pass = true;
|
||||
|
||||
if (userSetFilter == null) {
|
||||
return true;
|
||||
}
|
||||
|
||||
for (Map.Entry<String, HashSet<String>> filterEntry : userSetFilter.entrySet()) {
|
||||
String key = filterEntry.getKey();
|
||||
HashSet<String> valueObject = filterEntry.getValue();
|
||||
|
||||
// if (valueObject instanceof String) {
|
||||
// pass = validateHeadBlockEntry(readHeadBlock, key, (String) valueObject);
|
||||
// } else
|
||||
if (valueObject != null) {
|
||||
//noinspection unchecked
|
||||
for (String value : valueObject) {
|
||||
pass = validateHeadBlockEntry(readHeadBlock, key, value);
|
||||
}
|
||||
}
|
||||
|
||||
if (!pass) {
|
||||
// current head block does not include one of the set filters - not likely, but an edge case anyway
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// if it gets to this point, it passed all the filters
|
||||
return true;
|
||||
}
|
||||
|
||||
private static boolean validateHeadBlockEntry(Map<String, String> readHeadBlock, String userSetKey, String userSetValue) {
|
||||
if (!readHeadBlock.keySet().contains(userSetKey)) {
|
||||
// current head block does not include one of the set filters - not likely, but an edge case anyway
|
||||
return false;
|
||||
} else if (!readHeadBlock.get(userSetKey).equals(userSetValue)) {
|
||||
// different values -> doesn't pass the filter
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses XML headers for information about its taxonomy (if supported) or filters (solar)
|
||||
*
|
||||
* @param filepath
|
||||
* @param corpusIsSplit is corpus split into multiple xml files, or are all entries grouped into one large xml file
|
||||
* @param corpusType
|
||||
*/
|
||||
public static Object readXmlHeaderTaxonomyAndFilters(String filepath, boolean corpusIsSplit, CorpusType corpusType) {
|
||||
boolean parseTaxonomy = Tax.getCorpusTypesWithTaxonomy().contains(corpusType);
|
||||
// solar
|
||||
Set<String> headTags = null;
|
||||
HashMap<String, HashSet<String>> resultFilters = new HashMap<>();
|
||||
// taxonomy corpora
|
||||
HashSet<String> resultTaxonomy = new HashSet<>();
|
||||
|
||||
String headTagName;
|
||||
|
||||
if (corpusType == CorpusType.SOLAR) {
|
||||
headTagName = "head";
|
||||
// used for filter
|
||||
headTags = new HashSet<>(Arrays.asList(SOLA, PREDMET, RAZRED, REGIJA, TIP, LETO));
|
||||
|
||||
// init results now to avoid null pointers
|
||||
headTags.forEach(f -> resultFilters.put(f, new HashSet<>()));
|
||||
} else {
|
||||
headTagName = "teiHeader";
|
||||
}
|
||||
|
||||
XMLInputFactory factory = XMLInputFactory.newInstance();
|
||||
XMLEventReader xmlEventReader = null;
|
||||
try {
|
||||
xmlEventReader = factory.createXMLEventReader(new FileInputStream(filepath));
|
||||
boolean insideHeader = false;
|
||||
|
||||
while (xmlEventReader.hasNext()) {
|
||||
XMLEvent xmlEvent = xmlEventReader.nextEvent();
|
||||
|
||||
if (xmlEvent.isStartElement()) {
|
||||
StartElement startElement = xmlEvent.asStartElement();
|
||||
String elementName = startElement.getName().getLocalPart();
|
||||
|
||||
if (elementName.equalsIgnoreCase(headTagName)) {
|
||||
// if the corpus is split into files, we skip bodies
|
||||
// this toggle is true when we're inside a header (next block of code executes)
|
||||
// and false when we're not (skip reading unnecessary attributes)
|
||||
insideHeader = true;
|
||||
}
|
||||
|
||||
if (insideHeader) {
|
||||
if (parseTaxonomy && elementName.equalsIgnoreCase("catRef")) {
|
||||
HashMap<String, String> atts = extractAttributes(startElement);
|
||||
String debug = "";
|
||||
|
||||
String tax = startElement.getAttributeByName(QName.valueOf("target"))
|
||||
.getValue()
|
||||
.replace("#", "");
|
||||
|
||||
resultTaxonomy.add(tax);
|
||||
} else if (!parseTaxonomy && headTags.contains(elementName)) {
|
||||
String tagContent = xmlEventReader.nextEvent().asCharacters().getData();
|
||||
resultFilters.get(elementName).add(tagContent);
|
||||
}
|
||||
}
|
||||
} else if (xmlEvent.isEndElement() && corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
|
||||
// if the corpus is split into multiple files, each with only one header block per file
|
||||
// that means we should stop after we reach the end of the header
|
||||
return parseTaxonomy ? resultTaxonomy : resultFilters;
|
||||
} else if (xmlEvent.isEndElement() && !corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
|
||||
// whole corpus in one file, so we have to continue reading in order to find all header blocks
|
||||
insideHeader = false;
|
||||
}
|
||||
}
|
||||
} catch (XMLStreamException e) {
|
||||
logger.error("Streaming error", e);
|
||||
return parseTaxonomy ? resultTaxonomy : resultFilters;
|
||||
} catch (FileNotFoundException e) {
|
||||
logger.error("File not found", e);
|
||||
return parseTaxonomy ? resultTaxonomy : resultFilters;
|
||||
// TODO: keep a list of files that threw this error and a dirty boolean marker -> if true, alert user
|
||||
} finally {
|
||||
if (xmlEventReader != null) {
|
||||
try {
|
||||
xmlEventReader.close();
|
||||
} catch (XMLStreamException e) {
|
||||
logger.error("closing stream", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
return parseTaxonomy ? resultTaxonomy : resultFilters;
|
||||
}
|
||||
|
||||
private static boolean isEndElementEndOfHeader(XMLEvent event, String headerTag) {
|
||||
return event.asEndElement()
|
||||
.getName()
|
||||
.getLocalPart()
|
||||
.equalsIgnoreCase(headerTag);
|
||||
}
|
||||
|
||||
@SuppressWarnings("Duplicates")
|
||||
public static boolean readXMLGigafida(String path, StatisticsNew stats) {
|
||||
boolean inWord = false;
|
||||
ArrayList<String> currentFiletaxonomy = new ArrayList<>();
|
||||
String lemma = "";
|
||||
String msd = "";
|
||||
|
||||
List<Word> sentence = new ArrayList<>();
|
||||
List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it
|
||||
String sentenceDelimiter = "s";
|
||||
|
||||
XMLEventReader eventReader = null;
|
||||
try {
|
||||
XMLInputFactory factory = XMLInputFactory.newInstance();
|
||||
eventReader = factory.createXMLEventReader(new FileInputStream(path));
|
||||
|
||||
while (eventReader.hasNext()) {
|
||||
XMLEvent event = eventReader.nextEvent();
|
||||
|
||||
switch (event.getEventType()) {
|
||||
case XMLStreamConstants.START_ELEMENT:
|
||||
StartElement startElement = event.asStartElement();
|
||||
String qName = startElement.getName().getLocalPart();
|
||||
|
||||
// "word" node
|
||||
if (qName.equals("w")) {
|
||||
inWord = true;
|
||||
|
||||
msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
|
||||
lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
|
||||
}
|
||||
// taxonomy node
|
||||
else if (qName.equalsIgnoreCase("catRef")) {
|
||||
// there are some term nodes at the beginning that are of no interest to us
|
||||
// they differ by not having the attribute "ref", so test will equal null
|
||||
Attribute tax = startElement.getAttributeByName(QName.valueOf("target"));
|
||||
|
||||
if (tax != null) {
|
||||
// keep only taxonomy properties
|
||||
currentFiletaxonomy.add(String.valueOf(tax.getValue()).replace("#", ""));
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case XMLStreamConstants.CHARACTERS:
|
||||
Characters characters = event.asCharacters();
|
||||
|
||||
// "word" node value
|
||||
if (inWord) {
|
||||
String word = characters.getData();
|
||||
sentence.add(new Word(word, lemma, msd));
|
||||
inWord = false;
|
||||
}
|
||||
break;
|
||||
|
||||
case XMLStreamConstants.END_ELEMENT:
|
||||
EndElement endElement = event.asEndElement();
|
||||
|
||||
String var = endElement.getName().getLocalPart();
|
||||
String debug = "";
|
||||
|
||||
// parser reached end of the current sentence
|
||||
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
|
||||
// add sentence to corpus if it passes filters
|
||||
sentence = runFilters(sentence, stats.getFilter());
|
||||
|
||||
if (!ValidationUtil.isEmpty(sentence)) {
|
||||
corpus.add(new Sentence(sentence));
|
||||
}
|
||||
|
||||
// and start a new one
|
||||
sentence = new ArrayList<>();
|
||||
|
||||
/* Invoke Fork-Join when we reach maximum limit of
|
||||
* sentences (because we can't read everything to
|
||||
* memory) or we reach the end of the file.
|
||||
*/
|
||||
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
|
||||
fj(corpus, stats);
|
||||
// empty the current corpus, since we don't need the data anymore
|
||||
corpus.clear();
|
||||
|
||||
// TODO: if (stats.isUseDB()) {
|
||||
// stats.storeTmpResultsToDB();
|
||||
// }
|
||||
}
|
||||
} else if (endElement.getName().getLocalPart().equals("teiHeader")) {
|
||||
// before proceeding to read this file, make sure that taxonomy filters are a match
|
||||
|
||||
if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
|
||||
currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection
|
||||
|
||||
if (currentFiletaxonomy.isEmpty()) {
|
||||
// taxonomies don't match so stop
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// fallback
|
||||
else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
|
||||
fj(corpus, stats);
|
||||
corpus.clear();
|
||||
|
||||
// TODO: if (stats.isUseDB()) {
|
||||
// stats.storeTmpResultsToDB();
|
||||
// }
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
} catch (FileNotFoundException | XMLStreamException e) {
|
||||
e.printStackTrace();
|
||||
} finally {
|
||||
if (eventReader != null) {
|
||||
try {
|
||||
eventReader.close();
|
||||
} catch (XMLStreamException e) {
|
||||
logger.error("closing stream", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
 * Parses a GOS (spoken-language) corpus XML file, which carries each segment
 * in two variants: "orth" (orthographic transcription) and "norm" (normalized,
 * with msd/lemma annotations). Only sentences from the variant matching the
 * configured mode (stats.getCorpus().isGosOrthMode()) are collected, and only
 * when the file's taxonomy matches the user-set filters. Collected sentences
 * are handed to the fork-join analysis in chunks of Settings.CORPUS_SENTENCE_LIMIT.
 *
 * @param path  path to the XML file
 * @param stats statistics holder carrying the corpus configuration and filters
 * @return always true (the taxonomy mismatch case only suppresses sentences
 *         via includeFile; unlike readXMLGigafida it never returns false)
 */
@SuppressWarnings("Duplicates")
public static boolean readXMLGos(String path, StatisticsNew stats) {
	boolean inWord = false;
	// true while parsing the orthographic <div>, false in the normalized one
	boolean inOrthDiv = false;
	boolean computeForOrth = stats.getCorpus().isGosOrthMode();
	ArrayList<String> currentFiletaxonomy = new ArrayList<>();
	String lemma = "";
	String msd = "";

	List<Word> sentence = new ArrayList<>();
	List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it
	String sentenceDelimiter = "seg";

	String gosType = stats.getFilter().hasMsd() ? "norm" : "orth"; // orth & norm

	XMLEventReader eventReader = null;

	// set to false once the header's taxonomy fails the user filters
	boolean includeFile = true;

	try {
		XMLInputFactory factory = XMLInputFactory.newInstance();
		eventReader = factory.createXMLEventReader(new FileInputStream(path));

		while (eventReader.hasNext()) {
			XMLEvent event = eventReader.nextEvent();
			// System.out.print(String.format("%s", event.toString().replaceAll("\\['http://www.tei-c.org/ns/1.0'\\]::", "")));

			switch (event.getEventType()) {
				case XMLStreamConstants.START_ELEMENT:
					StartElement startElement = event.asStartElement();
					String qName = startElement.getName().getLocalPart();

					// a <div> opens either the orth or the norm variant of the content
					if (qName.equals("div")) {
						HashMap<String, String> atts = extractAttributes(startElement);

						if (atts.keySet().contains("type")) {
							inOrthDiv = atts.get("type").equals("orth");
						}
					}

					// "word" node
					if (qName.equals("w")) {
						// check that it's not a type
						HashMap<String, String> atts = extractAttributes(startElement);

						if (!atts.containsKey("type")) {
							inWord = true;

							// msd/lemma are only present on normalized words; orth words keep
							// the previous values, which CHARACTERS ignores in orth mode
							if (atts.containsKey("msd")) {
								msd = atts.get("msd");

							}
							if (atts.containsKey("lemma")) {
								lemma = atts.get("lemma");
							}
							//
							// if (!inOrthDiv) {
							//	msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
							//	lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
							// }
						}

						// }
					}
					// taxonomy node
					else if (qName.equalsIgnoreCase("catRef")) {
						// there are some term nodes at the beginning that are of no interest to us
						// they differ by not having the attribute "ref", so test will equal null
						Attribute tax = startElement.getAttributeByName(QName.valueOf("target"));

						if (tax != null) {
							// keep only taxonomy properties
							currentFiletaxonomy.add(String.valueOf(tax.getValue()));
						}
					} else if (qName.equalsIgnoreCase("div")) {
						// NOTE(review): this branch also runs for every <div> (it is a separate
						// chain from the first if above) and overwrites gosType from the type
						// attribute - confirm this shadowing of the hasMsd()-based default is intended
						gosType = String.valueOf(startElement.getAttributeByName(QName.valueOf("type")).getValue());
					}
					break;

				case XMLStreamConstants.CHARACTERS:
					// "word" node value
					if (inWord) {
						Characters characters = event.asCharacters();
						// normalized words carry lemma/msd; orth words are stored as plain text
						if (gosType.equals("norm") && msd != null) {
							sentence.add(new Word(characters.getData(), lemma, msd));
						} else {
							sentence.add(new Word(characters.getData()));
						}

						inWord = false;
					}
					break;

				case XMLStreamConstants.END_ELEMENT:
					EndElement endElement = event.asEndElement();

					// parser reached end of the current sentence
					if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
						// add sentence to corpus if it passes filters
						// only keep sentences from the div matching the requested mode
						boolean saveSentence = computeForOrth == inOrthDiv;

						if (includeFile && saveSentence && !ValidationUtil.isEmpty(sentence)) {
							// NOTE(review): runFilters can return null (ngram filter), which
							// would be wrapped in new Sentence(null) here - confirm Sentence
							// tolerates a null word list or guard like readXMLGigafida does
							sentence = runFilters(sentence, stats.getFilter());
							corpus.add(new Sentence(sentence));
						}

						// and start a new one
						sentence = new ArrayList<>();

						/* Invoke Fork-Join when we reach maximum limit of
						 * sentences (because we can't read everything to
						 * memory) or we reach the end of the file.
						 */
						if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
							fj(corpus, stats);
							// empty the current corpus, since we don't need
							// the data anymore
							corpus.clear();
						}
					} else if (endElement.getName().getLocalPart().equals("teiHeader")) {
						// before proceeding to read this file, make sure that taxonomy filters are a match
						if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
							currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection

							// disregard this entry if taxonomies don't match
							includeFile = !currentFiletaxonomy.isEmpty();

							currentFiletaxonomy = new ArrayList<>();
						}
					}

					// backup: flush whatever is left when the root element closes
					else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
						fj(corpus, stats);
						corpus.clear();
					}

					break;
			}
		}
	} catch (FileNotFoundException | XMLStreamException e) {
		e.printStackTrace();
	} finally {
		if (eventReader != null) {
			try {
				eventReader.close();
			} catch (XMLStreamException e) {
				logger.error("closing stream", e);
			} catch (Exception e) {
				logger.error("general error", e);
			}
		}
	}

	return true;
}
|
||||
|
||||
/**
|
||||
* Runs the sentence through some filters, so we don't do calculations when unnecessary.
|
||||
* Filters:
|
||||
* <ol>
|
||||
* <li><b>Ngrams:</b> omit sentences that are shorter than the ngram value (e.g. 3 gram of a single word sentence)</li>
|
||||
* <li><b>Letter ngrams:</b> omit words that are shorter than the specified string length (e.g. combinations of 3 letters when the word consists of only 2 letters)</li>
|
||||
* </ol>
|
||||
*
|
||||
* @return Empty sentence (if fails 1.) or a sentence with some words removed (2.)
|
||||
*/
|
||||
private static List<Word> runFilters(List<Word> sentence, Filter filter) {
|
||||
if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
|
||||
// ngram level: if not 0 must be less than or equal to number of words in this sentence.
|
||||
if (filter.getNgramValue() > 0 && filter.getNgramValue() > sentence.size()) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// if we're calculating values for letters, omit words that are shorter than string length
|
||||
if (filter.getNgramValue() == 0) {
|
||||
sentence.removeIf(w -> (filter.getCalculateFor() == CalculateFor.WORD && w.getWord().length() < filter.getStringLength())
|
||||
|| (filter.getCalculateFor() == CalculateFor.LEMMA && w.getLemma().length() < filter.getStringLength()));
|
||||
}
|
||||
}
|
||||
|
||||
return sentence;
|
||||
}
|
||||
|
||||
private static HashMap<String, String> extractAttributes(StartElement se) {
|
||||
Iterator attributesIt = se.getAttributes();
|
||||
HashMap<String, String> atts = new HashMap<>();
|
||||
|
||||
while (attributesIt.hasNext()) {
|
||||
Attribute a = (Attribute) attributesIt.next();
|
||||
atts.put(a.getName().getLocalPart(), a.getValue());
|
||||
}
|
||||
|
||||
return atts;
|
||||
}
|
||||
}
|
||||
67
src/main/java/alg/inflectedJOS/ForkJoin.java
Normal file
67
src/main/java/alg/inflectedJOS/ForkJoin.java
Normal file
@@ -0,0 +1,67 @@
|
||||
package alg.inflectedJOS;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.concurrent.RecursiveAction;
|
||||
|
||||
import data.Sentence;
|
||||
import data.Statistics;
|
||||
|
||||
public class ForkJoin extends RecursiveAction {
|
||||
private static final long serialVersionUID = -1260951004477299634L;
|
||||
|
||||
private static final int ACCEPTABLE_SIZE = 1000;
|
||||
private List<Sentence> corpus;
|
||||
private Statistics stats;
|
||||
private int start;
|
||||
private int end;
|
||||
|
||||
|
||||
/**
|
||||
* Constructor for subproblems.
|
||||
*/
|
||||
private ForkJoin(List<Sentence> corpus, int start, int end, Statistics stats) {
|
||||
this.corpus = corpus;
|
||||
this.start = start;
|
||||
this.end = end;
|
||||
this.stats = stats;
|
||||
}
|
||||
|
||||
/**
|
||||
* Default constructor for the initial problem
|
||||
*/
|
||||
public ForkJoin(List<Sentence> corpus, Statistics stats) {
|
||||
this.corpus = corpus;
|
||||
this.start = 0;
|
||||
this.end = corpus.size();
|
||||
this.stats = stats;
|
||||
}
|
||||
|
||||
private void computeDirectly() {
|
||||
List<Sentence> subCorpus = corpus.subList(start, end);
|
||||
|
||||
if (stats.isTaxonomySet()) {
|
||||
InflectedJOSCount.calculateForAll(subCorpus, stats, stats.getInflectedJosTaxonomy());
|
||||
} else {
|
||||
InflectedJOSCount.calculateForAll(subCorpus, stats, null);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void compute() {
|
||||
int subCorpusSize = end - start;
|
||||
|
||||
if (subCorpusSize < ACCEPTABLE_SIZE) {
|
||||
computeDirectly();
|
||||
} else {
|
||||
int mid = start + subCorpusSize / 2;
|
||||
ForkJoin left = new ForkJoin(corpus, start, mid, stats);
|
||||
ForkJoin right = new ForkJoin(corpus, mid, end, stats);
|
||||
|
||||
// fork (push to queue)-> compute -> join
|
||||
left.fork();
|
||||
right.fork();
|
||||
left.join();
|
||||
right.join();
|
||||
}
|
||||
}
|
||||
}
|
||||
170
src/main/java/alg/inflectedJOS/InflectedJOSCount.java
Normal file
170
src/main/java/alg/inflectedJOS/InflectedJOSCount.java
Normal file
@@ -0,0 +1,170 @@
|
||||
package alg.inflectedJOS;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import alg.Common;
|
||||
import data.Sentence;
|
||||
import data.Statistics;
|
||||
import data.StatisticsNew;
|
||||
import data.Word;
|
||||
|
||||
public class InflectedJOSCount {
|
||||
|
||||
public static HashMap<Integer, ArrayList<ArrayList<Integer>>> indices;
|
||||
|
||||
// static {
|
||||
// // calculate all possible combinations of indices we will substitute with a '-' for substring statistics
|
||||
// indices = new HashMap<>();
|
||||
// for (int i = 5; i <= 8; i++) {
|
||||
// indices.put(i, calculateCombinations(i));
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// private static List<Integer> calculateCombinations(int i) {
|
||||
// int arr[] = {1, 2, 3, 4, 5};
|
||||
// int r = 3;
|
||||
// int n = arr.length;
|
||||
// ArrayList<ArrayList<Integer>> result = new ArrayList<>();
|
||||
//
|
||||
// return printCombination(arr, n, r);
|
||||
// }
|
||||
//
|
||||
// /* arr[] ---> Input Array
|
||||
// data[] ---> Temporary array to store current combination
|
||||
// start & end ---> Staring and Ending indexes in arr[]
|
||||
// index ---> Current index in data[]
|
||||
// r ---> Size of a combination to be printed */
|
||||
// static void combinationUtil(int arr[], int data[], int start,
|
||||
// int end, int index, int r, ArrayList<ArrayList<Integer>> result) {
|
||||
// // Current combination is ready to be printed, print it
|
||||
// ArrayList<Integer> tmpResult = new ArrayList<>();
|
||||
//
|
||||
// if (index == r) {
|
||||
// ArrayList<Integer> tmpResult = new ArrayList<>();
|
||||
// for (int j = 0; j < r; j++)
|
||||
// System.out.print(data[j] + " ");
|
||||
// System.out.println("");
|
||||
// return;
|
||||
// }
|
||||
//
|
||||
// // replace index with all possible elements. The condition
|
||||
// // "end-i+1 >= r-index" makes sure that including one element
|
||||
// // at index will make a combination with remaining elements
|
||||
// // at remaining positions
|
||||
// for (int i = start; i <= end && end - i + 1 >= r - index; i++) {
|
||||
// data[index] = arr[i];
|
||||
// combinationUtil(arr, data, i + 1, end, index + 1, r);
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// // The main function that prints all combinations of size r
|
||||
// // in arr[] of size n. This function mainly uses combinationUtil()
|
||||
// static void printCombination(int arr[], int n, int r) {
|
||||
// // A temporary array to store all combination one by one
|
||||
// int data[] = new int[r];
|
||||
//
|
||||
// // Print all combination using temprary array 'data[]'
|
||||
// combinationUtil(arr, data, 0, n - 1, 0, r);
|
||||
// }
|
||||
|
||||
// public static void calculateForAll(List<Sentence> corpus, Statistics stats, String taxonomy) {
|
||||
// for (Sentence s : corpus) {
|
||||
// // disregard if wrong taxonomy
|
||||
// if (!(s.getTaxonomy().startsWith(taxonomy))) {
|
||||
// continue;
|
||||
// }
|
||||
//
|
||||
// calculateCommon(s, stats.result);
|
||||
//
|
||||
// for (Word word : s.getWords()) {
|
||||
// // skip if current word is not inflected
|
||||
// if (!(word.getMsd().length() > 0)) {
|
||||
// continue;
|
||||
// }
|
||||
//
|
||||
// String msd = word.getMsd();
|
||||
//
|
||||
// StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
|
||||
//
|
||||
// for (int i = 1; i < msd.length(); i++) {
|
||||
// entry.setCharAt(i, msd.charAt(i));
|
||||
// Common.updateMap(stats.result, entry.toString());
|
||||
// entry.setCharAt(i, '-');
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
// public static void calculateForAll(List<Sentence> corpus, Statistics stats) {
|
||||
// for (Sentence s : corpus) {
|
||||
// for (Word word : s.getWords()) {
|
||||
// if (!(word.getMsd().length() > 0)) {
|
||||
// continue;
|
||||
// }
|
||||
//
|
||||
// String msd = word.getMsd();
|
||||
//
|
||||
// StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
|
||||
//
|
||||
// for (int i = 1; i < msd.length(); i++) {
|
||||
// entry.setCharAt(i, msd.charAt(i));
|
||||
// Common.updateMap(stats.result, entry.toString());
|
||||
// entry.setCharAt(i, '-');
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
static void calculateForAll(List<Sentence> corpus, Statistics stats, String taxonomy) {
|
||||
for (Sentence s : corpus) {
|
||||
// disregard if wrong taxonomy
|
||||
if (taxonomy != null && !(s.getTaxonomy().startsWith(taxonomy))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (Word word : s.getWords()) {
|
||||
// skip if current word is not inflected
|
||||
if (!(word.getMsd().length() > 0)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
String msd = word.getMsd();
|
||||
|
||||
StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
|
||||
|
||||
for (int i = 1; i < msd.length(); i++) {
|
||||
entry.setCharAt(i, msd.charAt(i));
|
||||
Common.updateMap(stats.result, entry.toString());
|
||||
entry.setCharAt(i, '-');
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats, String taxonomy) {
|
||||
for (Sentence s : corpus) {
|
||||
|
||||
for (Word word : s.getWords()) {
|
||||
// skip if current word is not inflected
|
||||
// // TODO: if has defined msd and is of correct type (create a set)
|
||||
// if (!(word.getMsd().length() > 0)) {
|
||||
// continue;
|
||||
// }
|
||||
|
||||
String msd = word.getMsd();
|
||||
|
||||
StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
|
||||
|
||||
for (int i = 1; i < msd.length(); i++) {
|
||||
entry.setCharAt(i, msd.charAt(i));
|
||||
stats.updateResults(entry.toString());
|
||||
entry.setCharAt(i, '-');
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
131
src/main/java/alg/inflectedJOS/WordFormation.java
Normal file
131
src/main/java/alg/inflectedJOS/WordFormation.java
Normal file
@@ -0,0 +1,131 @@
|
||||
package alg.inflectedJOS;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import data.Enums.InflectedJosTypes;
|
||||
import data.StatisticsNew;
|
||||
import gui.ValidationUtil;
|
||||
import util.Combinations;
|
||||
|
||||
// adapted from http://www.geeksforgeeks.org/print-all-possible-combinations-of-r-elements-in-a-given-array-of-size-n/
|
||||
public class WordFormation {
|
||||
private static HashMap<String, Long> josTypeResult;
|
||||
private static Object[][] tmpResults;
|
||||
|
||||
private static HashMap<Integer, HashSet<HashSet<Integer>>> indices;
|
||||
|
||||
static {
|
||||
indices = new HashMap<>();
|
||||
|
||||
for (int i = 4; i <= 8; i++) {
|
||||
indices.put(i, Combinations.generateIndices(i));
|
||||
}
|
||||
}
|
||||
|
||||
public static void calculateStatistics(StatisticsNew stat) {
|
||||
Map<String, AtomicLong> result = stat.getResult();
|
||||
|
||||
// 1. filter - keep only inflected types
|
||||
result.keySet().removeIf(x -> !InflectedJosTypes.inflectedJosTypes.contains(x.charAt(0)));
|
||||
|
||||
// 2. for each inflected type get all possible subcombinations
|
||||
for (Character josChar : InflectedJosTypes.inflectedJosTypes) {
|
||||
josTypeResult = new HashMap<>();
|
||||
|
||||
// filter out results for a single word type
|
||||
Map<String, AtomicLong> singleTypeResults = result.entrySet().stream()
|
||||
.filter(x -> x.getKey().charAt(0) == josChar)
|
||||
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
|
||||
|
||||
if (ValidationUtil.isEmpty(singleTypeResults)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// get all possible indices combos for a msd of this length
|
||||
// HashSet<HashSet<Integer>> indicesCombos = indices.get()
|
||||
//Combinations.generateIndices(singleTypeResults.keySet().stream().findFirst().get().length());
|
||||
|
||||
for (Map.Entry<String, AtomicLong> e : singleTypeResults.entrySet()) {
|
||||
int l = e.getKey().length();
|
||||
|
||||
for (HashSet<Integer> indicesCombo : indices.get(e.getKey().length())) {
|
||||
updateResults(mask(e.getKey(), indicesCombo), e.getValue().longValue());
|
||||
}
|
||||
}
|
||||
|
||||
resultsMapToArray(singleTypeResults.values().stream().mapToLong(Number::longValue).sum());
|
||||
}
|
||||
|
||||
stat.setResultCustom(tmpResults);
|
||||
}
|
||||
|
||||
private static String mask(String word, HashSet<Integer> indicesCombo) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
sb.append(word.charAt(0));
|
||||
for (int i = 1; i < word.length(); i++) {
|
||||
sb.append(indicesCombo.contains(i) ? word.charAt(i) : ".");
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
|
||||
private static void updateResults(String s, Long nOfOccurences) {
|
||||
// if not in map add
|
||||
Long r = josTypeResult.putIfAbsent(s, nOfOccurences);
|
||||
|
||||
// else update
|
||||
if (r != null) {
|
||||
josTypeResult.put(s, josTypeResult.get(s) + nOfOccurences);
|
||||
}
|
||||
}
|
||||
|
||||
private static void resultsMapToArray(Long totalValue) {
|
||||
Double total = totalValue * 1.0;
|
||||
Object[][] josTypeResultArray = new Object[josTypeResult.size()][3];
|
||||
|
||||
int i = 0;
|
||||
for (Map.Entry<String, Long> e : josTypeResult.entrySet()) {
|
||||
josTypeResultArray[i][0] = e.getKey();
|
||||
josTypeResultArray[i][1] = e.getValue();
|
||||
josTypeResultArray[i][2] = e.getValue() / total;
|
||||
|
||||
if (e.getValue() > total) {
|
||||
|
||||
String debug = "";
|
||||
|
||||
}
|
||||
|
||||
i++;
|
||||
}
|
||||
|
||||
if (tmpResults == null) {
|
||||
tmpResults = josTypeResultArray;
|
||||
} else {
|
||||
int firstLength = tmpResults.length;
|
||||
int secondLength = josTypeResultArray.length;
|
||||
Object[][] tmp = new Object[firstLength + secondLength][3];
|
||||
|
||||
System.arraycopy(tmpResults, 0, tmp, 0, firstLength);
|
||||
System.arraycopy(josTypeResultArray, 0, tmp, firstLength, secondLength);
|
||||
|
||||
tmpResults = tmp;
|
||||
|
||||
// tmpResults = ArrayUtils.addAll(tmpResults, josTypeResultArray);
|
||||
}
|
||||
}
|
||||
|
||||
private static void printArray() {
|
||||
for (int i = 0; i < tmpResults.length; i++) {
|
||||
for (int j = 0; j < tmpResults[i].length; j++) {
|
||||
System.out.print(tmpResults[i][j] + "\t");
|
||||
}
|
||||
System.out.println();
|
||||
}
|
||||
}
|
||||
}
|
||||
62
src/main/java/alg/ngram/ForkJoin.java
Normal file
62
src/main/java/alg/ngram/ForkJoin.java
Normal file
@@ -0,0 +1,62 @@
|
||||
package alg.ngram;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.concurrent.RecursiveAction;
|
||||
|
||||
import data.Sentence;
|
||||
import data.StatisticsNew;
|
||||
|
||||
public class ForkJoin extends RecursiveAction {
|
||||
private static final long serialVersionUID = 5074814035083362355L;
|
||||
|
||||
private static final int ACCEPTABLE_SIZE = 1000;
|
||||
private List<Sentence> corpus;
|
||||
private StatisticsNew stats;
|
||||
private int start;
|
||||
private int end;
|
||||
|
||||
|
||||
/**
|
||||
* Constructor for subproblems.
|
||||
*/
|
||||
private ForkJoin(List<Sentence> corpus, int start, int end, StatisticsNew stats) {
|
||||
this.corpus = corpus;
|
||||
this.start = start;
|
||||
this.end = end;
|
||||
this.stats = stats;
|
||||
}
|
||||
|
||||
/**
|
||||
* Default constructor for the initial problem
|
||||
*/
|
||||
public ForkJoin(List<Sentence> corpus, StatisticsNew stats) {
|
||||
this.corpus = corpus;
|
||||
this.start = 0;
|
||||
this.end = corpus.size();
|
||||
this.stats = stats;
|
||||
}
|
||||
|
||||
private void computeDirectly() {
|
||||
List<Sentence> subCorpus = corpus.subList(start, end);
|
||||
Ngrams.calculateForAll(subCorpus, stats);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void compute() {
|
||||
int subCorpusSize = end - start;
|
||||
|
||||
if (subCorpusSize < ACCEPTABLE_SIZE) {
|
||||
computeDirectly();
|
||||
} else {
|
||||
int mid = start + subCorpusSize / 2;
|
||||
ForkJoin left = new ForkJoin(corpus, start, mid, stats);
|
||||
ForkJoin right = new ForkJoin(corpus, mid, end, stats);
|
||||
|
||||
// fork (push to queue)-> compute -> join
|
||||
left.fork();
|
||||
right.fork();
|
||||
left.join();
|
||||
right.join();
|
||||
}
|
||||
}
|
||||
}
|
||||
204
src/main/java/alg/ngram/Ngrams.java
Normal file
204
src/main/java/alg/ngram/Ngrams.java
Normal file
@@ -0,0 +1,204 @@
|
||||
package alg.ngram;
|
||||
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
|
||||
import data.CalculateFor;
|
||||
import data.Sentence;
|
||||
import data.StatisticsNew;
|
||||
import data.Word;
|
||||
import gui.ValidationUtil;
|
||||
|
||||
public class Ngrams {
|
||||
public final static Logger logger = LogManager.getLogger(Ngrams.class);
|
||||
|
||||
|
||||
public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats) {
|
||||
if (stats.getFilter().getNgramValue() == 0) { // letter ngram
|
||||
generateNgramLetterCandidates(corpus, stats);
|
||||
} else if (!ValidationUtil.isEmpty(stats.getFilter().getSkipValue()) && stats.getFilter().getSkipValue() > 0) {
|
||||
generateSkipgramCandidates(corpus, stats);
|
||||
} else {
|
||||
generateNgramCandidates(corpus, stats);
|
||||
}
|
||||
}
|
||||
|
||||
public static void generateNgramCandidates(List<Sentence> corpus, StatisticsNew stats) {
|
||||
for (Sentence s : corpus) {
|
||||
// skip sentences shorter than specified ngram length
|
||||
if (s.getWords().size() < stats.getFilter().getNgramValue()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int i = 0; i < s.getWords().size() - stats.getFilter().getNgramValue() + 1; i++) {
|
||||
List<Word> ngramCandidate = s.getSublist(i, i + stats.getFilter().getNgramValue());
|
||||
|
||||
// if msd regex is set and this candidate doesn't pass it, skip this iteration
|
||||
if (stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd())) {
|
||||
continue;
|
||||
}
|
||||
|
||||
stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks whether an ngram candidate passes specified regex filter.
|
||||
*/
|
||||
private static boolean passesRegex(List<Word> ngramCandidate, ArrayList<Pattern> regex) {
|
||||
if (ngramCandidate.size() != regex.size()) {
|
||||
logger.error("ngramCandidate.size() & msd.size() mismatch"); // should not occur anyway
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = 0; i < regex.size(); i++) {
|
||||
if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern())) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private static String wordToString(List<Word> ngramCandidate, CalculateFor calculateFor) {
|
||||
ArrayList<String> candidate = new ArrayList<>(ngramCandidate.size());
|
||||
|
||||
switch (calculateFor) {
|
||||
case LEMMA:
|
||||
candidate.addAll(ngramCandidate
|
||||
.stream()
|
||||
.map(Word::getLemma)
|
||||
.collect(Collectors.toList()));
|
||||
break;
|
||||
case WORD:
|
||||
candidate.addAll(ngramCandidate
|
||||
.stream()
|
||||
.map(Word::getWord)
|
||||
.collect(Collectors.toList()));
|
||||
break;
|
||||
case MORPHOSYNTACTIC_SPECS:
|
||||
case MORPHOSYNTACTIC_PROPERTY:
|
||||
candidate.addAll(ngramCandidate
|
||||
.stream()
|
||||
.map(Word::getMsd)
|
||||
.collect(Collectors.toList()));
|
||||
break;
|
||||
case WORD_TYPE:
|
||||
candidate.addAll(ngramCandidate
|
||||
.stream()
|
||||
.map(w -> Character.toString(w.getMsd().charAt(0)))
|
||||
.collect(Collectors.toList()));
|
||||
break;
|
||||
}
|
||||
|
||||
return StringUtils.join(candidate, " ");
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates candidates and updates results
|
||||
*
|
||||
* @param corpus
|
||||
* @param stats
|
||||
*/
|
||||
private static void generateNgramLetterCandidates(List<Sentence> corpus, StatisticsNew stats) {
|
||||
for (Sentence s : corpus) {
|
||||
for (Word w : s.getWords()) {
|
||||
String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv());
|
||||
|
||||
// skip this iteration if:
|
||||
// - word doesn't contain a proper version (missing lemma for example)
|
||||
// - msd regex is given but this word's msd doesn't match it, skip this iteration
|
||||
// - given substring length is larger than the word length
|
||||
if (ValidationUtil.isEmpty(word)
|
||||
|| stats.getFilter().hasMsd() && !w.getMsd().matches(stats.getFilter().getMsd().get(0).pattern())
|
||||
|| word.length() < stats.getFilter().getStringLength()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int i = 0; i < word.length() - stats.getFilter().getStringLength() + 1; i++) {
|
||||
// TODO: locila?
|
||||
stats.updateResults(word.substring(i, i + stats.getFilter().getStringLength()));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Extracts skipgram candidates.
|
||||
*
|
||||
* @return List of candidates represented as a list<candidates(String)>
|
||||
*/
|
||||
public static void generateSkipgramCandidates(List<Sentence> corpus, StatisticsNew stats) {
|
||||
ArrayList<Word> currentLoop;
|
||||
int ngram = stats.getFilter().getNgramValue();
|
||||
int skip = stats.getFilter().getSkipValue();
|
||||
|
||||
for (Sentence s : corpus) {
|
||||
List<Word> sentence = s.getWords();
|
||||
|
||||
for (int i = 0; i <= sentence.size() - ngram; i++) { // 1gram
|
||||
for (int j = i + 1; j <= i + skip + 1; j++) { // 2gram
|
||||
if (ngram == 2 && j < sentence.size()) {
|
||||
currentLoop = new ArrayList<>();
|
||||
currentLoop.add(sentence.get(i));
|
||||
currentLoop.add(sentence.get(j));
|
||||
|
||||
validateAndCountSkipgramCandidate(currentLoop, stats);
|
||||
} else {
|
||||
for (int k = j + 1; k <= j + 1 + skip; k++) { // 3gram
|
||||
if (ngram == 3 && k < sentence.size()) {
|
||||
currentLoop = new ArrayList<>();
|
||||
currentLoop.add(sentence.get(i));
|
||||
currentLoop.add(sentence.get(j));
|
||||
currentLoop.add(sentence.get(k));
|
||||
|
||||
validateAndCountSkipgramCandidate(currentLoop, stats);
|
||||
} else {
|
||||
for (int l = k + 1; l <= k + 1 + skip; l++) { // 4gram
|
||||
if (ngram == 4 && k < sentence.size()) {
|
||||
currentLoop = new ArrayList<>();
|
||||
currentLoop.add(sentence.get(i));
|
||||
currentLoop.add(sentence.get(j));
|
||||
currentLoop.add(sentence.get(k));
|
||||
currentLoop.add(sentence.get(l));
|
||||
|
||||
validateAndCountSkipgramCandidate(currentLoop, stats);
|
||||
} else {
|
||||
for (int m = k + 1; m <= k + 1 + skip; m++) { // 5gram
|
||||
if (ngram == 5 && k < sentence.size()) {
|
||||
currentLoop = new ArrayList<>();
|
||||
currentLoop.add(sentence.get(i));
|
||||
currentLoop.add(sentence.get(j));
|
||||
currentLoop.add(sentence.get(k));
|
||||
currentLoop.add(sentence.get(l));
|
||||
currentLoop.add(sentence.get(m));
|
||||
|
||||
validateAndCountSkipgramCandidate(currentLoop, stats);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static void validateAndCountSkipgramCandidate(ArrayList<Word> skipgramCandidate, StatisticsNew stats) {
|
||||
// count if no regex is set or if it is & candidate passes it
|
||||
if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd())) {
|
||||
stats.updateResults(wordToString(skipgramCandidate, stats.getFilter().getCalculateFor()));
|
||||
}
|
||||
}
|
||||
}
|
||||
62
src/main/java/alg/word/ForkJoin.java
Normal file
62
src/main/java/alg/word/ForkJoin.java
Normal file
@@ -0,0 +1,62 @@
|
||||
package alg.word;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.concurrent.RecursiveAction;
|
||||
|
||||
import data.Sentence;
|
||||
import data.StatisticsNew;
|
||||
|
||||
public class ForkJoin extends RecursiveAction {
|
||||
private static final long serialVersionUID = 7711587510996456040L;
|
||||
|
||||
private static final int ACCEPTABLE_SIZE = 1000;
|
||||
private List<Sentence> corpus;
|
||||
private StatisticsNew stats;
|
||||
private int start;
|
||||
private int end;
|
||||
|
||||
|
||||
/**
|
||||
* Constructor for subproblems.
|
||||
*/
|
||||
private ForkJoin(List<Sentence> corpus, int start, int end, StatisticsNew stats) {
|
||||
this.corpus = corpus;
|
||||
this.start = start;
|
||||
this.end = end;
|
||||
this.stats = stats;
|
||||
}
|
||||
|
||||
/**
|
||||
* Default constructor for the initial problem
|
||||
*/
|
||||
public ForkJoin(List<Sentence> corpus, StatisticsNew stats) {
|
||||
this.corpus = corpus;
|
||||
this.start = 0;
|
||||
this.end = corpus.size();
|
||||
this.stats = stats;
|
||||
}
|
||||
|
||||
private void computeDirectly() {
|
||||
List<Sentence> subCorpus = corpus.subList(start, end);
|
||||
WordLevel.calculateForAll(subCorpus, stats);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void compute() {
|
||||
int subCorpusSize = end - start;
|
||||
|
||||
if (subCorpusSize < ACCEPTABLE_SIZE) {
|
||||
computeDirectly();
|
||||
} else {
|
||||
int mid = start + subCorpusSize / 2;
|
||||
ForkJoin left = new ForkJoin(corpus, start, mid, stats);
|
||||
ForkJoin right = new ForkJoin(corpus, mid, end, stats);
|
||||
|
||||
// fork (push to queue)-> compute -> join
|
||||
left.fork();
|
||||
right.fork();
|
||||
left.join();
|
||||
right.join();
|
||||
}
|
||||
}
|
||||
}
|
||||
167
src/main/java/alg/word/WordCount.java
Normal file
167
src/main/java/alg/word/WordCount.java
Normal file
@@ -0,0 +1,167 @@
|
||||
package alg.word;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import alg.Common;
|
||||
import data.CalculateFor;
|
||||
import data.Sentence;
|
||||
import data.Statistics;
|
||||
import data.Word;
|
||||
|
||||
class WordCount {
|
||||
private static void calculateNoFilter(List<Sentence> corpus, Statistics stats) {
|
||||
for (Sentence s : corpus) {
|
||||
List<String> sentence = new ArrayList<>(s.getWords().size());
|
||||
|
||||
if (stats.getCf() == CalculateFor.LEMMA) {
|
||||
sentence.addAll(s.getWords()
|
||||
.stream()
|
||||
.map(Word::getLemma)
|
||||
.collect(Collectors.toList()));
|
||||
} else if (stats.getCf() == CalculateFor.WORD) {
|
||||
sentence.addAll(s.getWords()
|
||||
.stream()
|
||||
.map(Word::getWord)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
for (String word : sentence) {
|
||||
Common.updateMap(stats.result, word);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static void calculateVCC(List<Sentence> corpus, Statistics stats) {
|
||||
for (Sentence s : corpus) {
|
||||
List<String> sentence = new ArrayList<>(s.getWords().size());
|
||||
|
||||
if (stats.getCf() == CalculateFor.LEMMA) {
|
||||
sentence.addAll(s.getWords()
|
||||
.stream()
|
||||
.map(Word::getCVVLemma)
|
||||
.collect(Collectors.toList()));
|
||||
} else if (stats.getCf() == CalculateFor.WORD) {
|
||||
sentence.addAll(s.getWords()
|
||||
.stream()
|
||||
.map(Word::getCVVWord)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
for (String word : sentence) {
|
||||
if (word.length() > stats.getSubstringLength()) {
|
||||
for (int i = 0; i <= word.length() - stats.getSubstringLength(); i++) {
|
||||
String substring = word.substring(i, i + stats.getSubstringLength());
|
||||
Common.updateMap(stats.result, substring);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static void calculateForJosType(List<Sentence> corpus, Statistics stats) {
|
||||
for (Sentence s : corpus) {
|
||||
List<String> sentence = new ArrayList<>(s.getWords().size());
|
||||
List<Word> filteredWords = new ArrayList<>();
|
||||
|
||||
for (Word word : s.getWords()) {
|
||||
if (word.getMsd() != null && word.getMsd().charAt(0) == stats.getDistributionJosWordType()) {
|
||||
filteredWords.add(word);
|
||||
}
|
||||
}
|
||||
|
||||
if (stats.getCf() == CalculateFor.LEMMA) {
|
||||
sentence.addAll(filteredWords
|
||||
.stream()
|
||||
.map(Word::getLemma)
|
||||
.collect(Collectors.toList()));
|
||||
} else if (stats.getCf() == CalculateFor.WORD) {
|
||||
sentence.addAll(filteredWords
|
||||
.stream()
|
||||
.map(Word::getWord)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
for (String word : sentence) {
|
||||
Common.updateMap(stats.result, word);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static void calculateForTaxonomyAndJosType(List<Sentence> corpus, Statistics stats) {
|
||||
for (Sentence s : corpus) {
|
||||
if (s.getTaxonomy().equalsIgnoreCase(stats.getDistributionTaxonomy())) {
|
||||
List<String> sentence = new ArrayList<>(s.getWords().size());
|
||||
List<Word> filteredWords = new ArrayList<>();
|
||||
|
||||
for (Word word : s.getWords()) {
|
||||
if (word.getMsd().charAt(0) == stats.getDistributionJosWordType()) {
|
||||
filteredWords.add(word);
|
||||
}
|
||||
}
|
||||
|
||||
if (stats.getCf() == CalculateFor.LEMMA) {
|
||||
sentence.addAll(filteredWords
|
||||
.stream()
|
||||
.map(Word::getLemma)
|
||||
.collect(Collectors.toList()));
|
||||
} else if (stats.getCf() == CalculateFor.WORD) {
|
||||
sentence.addAll(filteredWords
|
||||
.stream()
|
||||
.map(Word::getWord)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
for (String word : sentence) {
|
||||
Common.updateMap(stats.result, word);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static void calculateForTaxonomy(List<Sentence> corpus, Statistics stats) {
|
||||
for (Sentence s : corpus) {
|
||||
if (s.getTaxonomy().equalsIgnoreCase(stats.getDistributionTaxonomy())) {
|
||||
List<String> sentence = new ArrayList<>(s.getWords().size());
|
||||
|
||||
if (stats.getCf() == CalculateFor.LEMMA) {
|
||||
sentence.addAll(s.getWords()
|
||||
.stream()
|
||||
.map(Word::getLemma)
|
||||
.collect(Collectors.toList()));
|
||||
} else if (stats.getCf() == CalculateFor.WORD) {
|
||||
sentence.addAll(s.getWords()
|
||||
.stream()
|
||||
.map(Word::getWord)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
for (String word : sentence) {
|
||||
Common.updateMap(stats.result, word);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void calculateForAll(List<Sentence> corpus, Statistics stats) {
|
||||
boolean taxonomyIsSet = stats.isTaxonomySet();
|
||||
boolean JosTypeIsSet = stats.isJOSTypeSet();
|
||||
|
||||
// branching because even though the only difference is an if or two &&
|
||||
// O(if) = 1, the amount of ifs adds up and this saves some time
|
||||
if (taxonomyIsSet && JosTypeIsSet) {
|
||||
calculateForTaxonomyAndJosType(corpus, stats);
|
||||
} else if (taxonomyIsSet && !JosTypeIsSet) {
|
||||
calculateForTaxonomy(corpus, stats);
|
||||
} else if (!taxonomyIsSet && JosTypeIsSet) {
|
||||
calculateForJosType(corpus, stats);
|
||||
} else {
|
||||
if (stats.isVcc()) {
|
||||
calculateVCC(corpus, stats);
|
||||
} else {
|
||||
calculateNoFilter(corpus, stats);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
112
src/main/java/alg/word/WordLevel.java
Normal file
112
src/main/java/alg/word/WordLevel.java
Normal file
@@ -0,0 +1,112 @@
|
||||
package alg.word;
|
||||
|
||||
import static data.Enums.WordLevelDefaultValues.*;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import data.Enums.WordLevelDefaultValues;
|
||||
import data.Enums.WordLevelType;
|
||||
import data.Sentence;
|
||||
import data.StatisticsNew;
|
||||
import data.Word;
|
||||
|
||||
@SuppressWarnings("Duplicates")
|
||||
public class WordLevel {
|
||||
private static HashSet<String> suffixes;
|
||||
private static int minSuffixLength;
|
||||
private static int maxSuffixLength;
|
||||
|
||||
private static HashSet<String> prefixes;
|
||||
private static int minPrefixLength;
|
||||
private static int maxPrefixLength;
|
||||
|
||||
static {
|
||||
suffixes = WordLevelDefaultValues.getSuffixes();
|
||||
calculateSuffixesLengths();
|
||||
|
||||
prefixes = WordLevelDefaultValues.getPrefixes();
|
||||
calculatePrefixesLengths();
|
||||
}
|
||||
|
||||
public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats) {
|
||||
for (Sentence s : corpus) {
|
||||
for (Word word : s.getWords()) {
|
||||
calculateForSuffixes(word.getWord(), stats);
|
||||
calculateForPrefixes(word.getWord(), stats);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static void calculateForPrefixes(String word, StatisticsNew stats) {
|
||||
for (int tmpPrefixLength = maxPrefixLength; tmpPrefixLength >= minPrefixLength; tmpPrefixLength++) {
|
||||
if (word.length() - tmpPrefixLength < MIN_N_OF_CHARACTERS_LEFT_PREFIX) {
|
||||
return;
|
||||
}
|
||||
|
||||
String extractedPrefix = StringUtils.left(word, tmpPrefixLength);
|
||||
|
||||
if (prefixes.contains(extractedPrefix)) {
|
||||
// save suffix and full word
|
||||
stats.updateResultsNested(WordLevelType.PREFIX, extractedPrefix, word);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static void calculateForSuffixes(String word, StatisticsNew stats) {
|
||||
for (int tmpSuffixLength = maxSuffixLength; tmpSuffixLength >= minSuffixLength; tmpSuffixLength++) {
|
||||
// preveri, da je beseda - cuttan suffix daljši od prednastavljene vrednosti
|
||||
// ker gremo od najdaljše opcije k najkrajši, se ob dosegu tega pogoja lahko zaključi računanje za trenutno besedo
|
||||
if (word.length() - tmpSuffixLength < MIN_N_OF_CHARACTERS_LEFT_SUFFIX) {
|
||||
return;
|
||||
}
|
||||
|
||||
String extractedSuffix = StringUtils.right(word, tmpSuffixLength);
|
||||
|
||||
if (suffixes.contains(extractedSuffix)) {
|
||||
// save suffix and full word
|
||||
stats.updateResultsNested(WordLevelType.SUFFIX, extractedSuffix, word);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// finds the shortest and longest suffix for quicker calculations
|
||||
public static void calculateSuffixesLengths() {
|
||||
minSuffixLength = -1;
|
||||
maxSuffixLength = -1;
|
||||
|
||||
for (String suffix : suffixes) {
|
||||
if (suffix.length() > maxSuffixLength) {
|
||||
maxSuffixLength = suffix.length();
|
||||
|
||||
if (minSuffixLength < 0) {
|
||||
minSuffixLength = maxSuffixLength;
|
||||
}
|
||||
} else if (suffix.length() < minSuffixLength) {
|
||||
minSuffixLength = suffix.length();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// finds the shortest and longest suffix for quicker calculations
|
||||
public static void calculatePrefixesLengths() {
|
||||
minPrefixLength = -1;
|
||||
maxPrefixLength = -1;
|
||||
|
||||
for (String prefix : prefixes) {
|
||||
if (prefix.length() > maxPrefixLength) {
|
||||
maxPrefixLength = prefix.length();
|
||||
|
||||
if (minPrefixLength < 0) {
|
||||
minPrefixLength = maxPrefixLength;
|
||||
}
|
||||
} else if (prefix.length() < minPrefixLength) {
|
||||
minPrefixLength = prefix.length();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user