Project copied

2018-06-19 09:15:37 +02:00
commit a18e52a599
94 changed files with 87092 additions and 0 deletions

View File

@@ -0,0 +1,15 @@
package alg;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;
public class Common {
public static <K> void updateMap(Map<K, AtomicLong> map, K o) {
// insert a fresh counter if the key is not yet in the map
AtomicLong existing = map.putIfAbsent(o, new AtomicLong(1));
// otherwise increment the counter that is already there
if (existing != null)
existing.incrementAndGet();
}
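// Usage sketch (hypothetical call site; assumes a thread-safe map, since
// updateMap is invoked from parallel fork-join tasks):
//   Map<String, AtomicLong> counts = new ConcurrentHashMap<>();
//   Common.updateMap(counts, "beseda"); // counts.get("beseda").get() == 1
//   Common.updateMap(counts, "beseda"); // counts.get("beseda").get() == 2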
}

View File

@@ -0,0 +1,794 @@
package alg;
import static data.Enums.solar.SolarFilters.*;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.util.*;
import java.util.concurrent.ForkJoinPool;
import javax.xml.namespace.QName;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.*;
import org.apache.logging.log4j.LogManager;
import data.*;
import gui.ValidationUtil;
public class XML_processing {
public final static org.apache.logging.log4j.Logger logger = LogManager.getLogger(XML_processing.class);
public static void readXML(String path, StatisticsNew stats) {
if (stats.getCorpus().getCorpusType() == CorpusType.GIGAFIDA
|| stats.getCorpus().getCorpusType() == CorpusType.CCKRES) {
readXMLGigafida(path, stats);
} else if (stats.getCorpus().getCorpusType() == CorpusType.GOS) {
readXMLGos(path, stats);
} else if (stats.getCorpus().getCorpusType() == CorpusType.SOLAR) {
readXMLSolar(path, stats);
}
}
/**
* Reads and returns the value of the given header tag, or an empty string if the tag is not found.
* E.g. the title tag, for discerning the corpus type.
* Note: only the value of the first occurrence of the given tag name is returned.
*/
public static String readXMLHeaderTag(String path, String tag) {
XMLInputFactory factory = XMLInputFactory.newInstance();
XMLEventReader eventReader = null;
try {
eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
XMLEvent xmlEvent = eventReader.nextEvent();
if (xmlEvent.isStartElement()) {
StartElement startElement = xmlEvent.asStartElement();
String elementName = startElement.getName().getLocalPart();
if (elementName.equalsIgnoreCase(tag)) {
return eventReader.nextEvent().asCharacters().getData();
}
}
}
} catch (FileNotFoundException | XMLStreamException e) {
e.printStackTrace();
} finally {
if (eventReader != null) {
try {
eventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
}
}
}
return "";
}
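// Usage sketch (hypothetical path and tag; per the note above, only the
// first occurrence of the tag is read):
//   String title = readXMLHeaderTag("/corpora/gf.xml", "title");
//   boolean looksLikeGigafida = title.toLowerCase().contains("gigafida");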
private static void fj(List<Sentence> corpus, StatisticsNew stats) {
ForkJoinPool pool = new ForkJoinPool();
if (stats.getFilter().getAl() == AnalysisLevel.STRING_LEVEL) {
alg.ngram.ForkJoin wc = new alg.ngram.ForkJoin(corpus, stats);
pool.invoke(wc);
} else if (stats.getFilter().getAl() == AnalysisLevel.WORD_LEVEL) {
alg.word.ForkJoin wc = new alg.word.ForkJoin(corpus, stats);
pool.invoke(wc);
} else {
// TODO:
// alg.inflectedJOS.ForkJoin wc = new alg.inflectedJOS.ForkJoin(corpus, stats);
// pool.invoke(wc);
}
}
@SuppressWarnings("unused")
public static void readXMLSolar(String path, StatisticsNew stats) {
boolean in_word = false;
String lemma = "";
String msd = "";
List<Word> stavek = new ArrayList<>();
List<Sentence> corpus = new ArrayList<>();
// used for filter
Set<String> headTags = new HashSet<>(Arrays.asList("sola", "predmet", "razred", "regija", "tip", "leto"));
Map<String, String> headBlock = null;
boolean includeThisBlock = false;
try {
XMLInputFactory factory = XMLInputFactory.newInstance();
XMLEventReader eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
XMLEvent event = eventReader.nextEvent();
switch (event.getEventType()) {
case XMLStreamConstants.START_ELEMENT:
StartElement startElement = event.asStartElement();
String qName = startElement.getName().getLocalPart();
// "word" node
if (qName.equals("w3")) {
in_word = true;
msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
} else if (qName.equals("c3")) {
String c3Content = eventReader.nextEvent().asCharacters().getData();
if (c3Content.equals(".") && includeThisBlock) {
// add sentence to corpus
corpus.add(new Sentence(stavek));
// and start a new one
stavek = new ArrayList<>();
/* Invoke Fork-Join when we reach maximum limit of
* sentences (because we can't read everything to
* memory) or we reach the end of the file.
*/
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
fj(corpus, stats);
// empty the current corpus, since we don't need
// the data anymore
corpus.clear();
}
}
} else if (headTags.contains(qName)) {
String tagContent = eventReader.nextEvent().asCharacters().getData();
// guard against head tags that appear before the enclosing <head> element
if (headBlock != null) {
headBlock.put(qName, tagContent);
}
} else if (qName.equals("head")) {
headBlock = new HashMap<>();
}
break;
case XMLStreamConstants.CHARACTERS:
Characters characters = event.asCharacters();
// "word" node value
if (in_word) {
stavek.add(new Word(characters.getData(), lemma, msd));
in_word = false;
}
break;
case XMLStreamConstants.END_ELEMENT:
EndElement endElement = event.asEndElement();
String qNameEnd = endElement.getName().getLocalPart();
if (qNameEnd.equals("head")) {
// validate and set boolean
if (validateHeadBlock(headBlock, stats.getFilter().getSolarFilters())) {
includeThisBlock = true;
}
} else if (qNameEnd.equals("body")) {
// new block, reset filter status
includeThisBlock = false;
}
// backup
if (endElement.getName().getLocalPart().equalsIgnoreCase("korpus")) {
fj(corpus, stats);
corpus.clear();
}
break;
}
}
} catch (FileNotFoundException | XMLStreamException e) {
e.printStackTrace();
}
}
/**
* @param readHeadBlock block of tags read from the corpus
* @param userSetFilter tags with values set by the user
*
* @return true if the head block satisfies all user-set filters
*/
private static boolean validateHeadBlock(Map<String, String> readHeadBlock, HashMap<String, HashSet<String>> userSetFilter) {
boolean pass = true;
if (userSetFilter == null) {
return true;
}
for (Map.Entry<String, HashSet<String>> filterEntry : userSetFilter.entrySet()) {
String key = filterEntry.getKey();
HashSet<String> allowedValues = filterEntry.getValue();
if (allowedValues != null && !allowedValues.isEmpty()) {
// the block's value must match at least one of the user-set values for this tag
pass = false;
for (String value : allowedValues) {
if (validateHeadBlockEntry(readHeadBlock, key, value)) {
pass = true;
break;
}
}
}
if (!pass) {
// current head block does not satisfy one of the set filters
return false;
}
}
// if it gets to this point, it passed all the filters
return true;
}
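// Example (hypothetical tag values): a filter {predmet -> {slovenscina, matematika}}
// passes a head block containing predmet=matematika, while a head block that lacks
// the predmet tag, or carries a different value, is rejected.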
private static boolean validateHeadBlockEntry(Map<String, String> readHeadBlock, String userSetKey, String userSetValue) {
if (!readHeadBlock.containsKey(userSetKey)) {
// current head block does not include one of the set filters - not likely, but an edge case anyway
return false;
} else if (!readHeadBlock.get(userSetKey).equals(userSetValue)) {
// different values -> doesn't pass the filter
return false;
}
return true;
}
/**
* Parses XML headers for taxonomy information (if the corpus type supports it) or filters (Solar).
*
* @param filepath path to the XML file
* @param corpusIsSplit whether the corpus is split into multiple XML files, or all entries are grouped into one large XML file
* @param corpusType type of the corpus being read
*/
public static Object readXmlHeaderTaxonomyAndFilters(String filepath, boolean corpusIsSplit, CorpusType corpusType) {
boolean parseTaxonomy = Tax.getCorpusTypesWithTaxonomy().contains(corpusType);
// solar
Set<String> headTags = null;
HashMap<String, HashSet<String>> resultFilters = new HashMap<>();
// taxonomy corpora
HashSet<String> resultTaxonomy = new HashSet<>();
String headTagName;
if (corpusType == CorpusType.SOLAR) {
headTagName = "head";
// used for filter
headTags = new HashSet<>(Arrays.asList(SOLA, PREDMET, RAZRED, REGIJA, TIP, LETO));
// init results now to avoid null pointers
headTags.forEach(f -> resultFilters.put(f, new HashSet<>()));
} else {
headTagName = "teiHeader";
}
XMLInputFactory factory = XMLInputFactory.newInstance();
XMLEventReader xmlEventReader = null;
try {
xmlEventReader = factory.createXMLEventReader(new FileInputStream(filepath));
boolean insideHeader = false;
while (xmlEventReader.hasNext()) {
XMLEvent xmlEvent = xmlEventReader.nextEvent();
if (xmlEvent.isStartElement()) {
StartElement startElement = xmlEvent.asStartElement();
String elementName = startElement.getName().getLocalPart();
if (elementName.equalsIgnoreCase(headTagName)) {
// if the corpus is split into files, we skip bodies
// this toggle is true when we're inside a header (next block of code executes)
// and false when we're not (skip reading unnecessary attributes)
insideHeader = true;
}
if (insideHeader) {
if (parseTaxonomy && elementName.equalsIgnoreCase("catRef")) {
String tax = startElement.getAttributeByName(QName.valueOf("target"))
.getValue()
.replace("#", "");
resultTaxonomy.add(tax);
} else if (!parseTaxonomy && headTags != null && headTags.contains(elementName)) {
String tagContent = xmlEventReader.nextEvent().asCharacters().getData();
resultFilters.get(elementName).add(tagContent);
}
}
} else if (xmlEvent.isEndElement() && corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
// if the corpus is split into multiple files, each with only one header block per file
// that means we should stop after we reach the end of the header
return parseTaxonomy ? resultTaxonomy : resultFilters;
} else if (xmlEvent.isEndElement() && !corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
// whole corpus in one file, so we have to continue reading in order to find all header blocks
insideHeader = false;
}
}
} catch (XMLStreamException e) {
logger.error("Streaming error", e);
return parseTaxonomy ? resultTaxonomy : resultFilters;
} catch (FileNotFoundException e) {
logger.error("File not found", e);
return parseTaxonomy ? resultTaxonomy : resultFilters;
// TODO: keep a list of files that threw this error and a dirty boolean marker -> if true, alert user
} finally {
if (xmlEventReader != null) {
try {
xmlEventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
}
}
}
return parseTaxonomy ? resultTaxonomy : resultFilters;
}
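// Usage sketch (hypothetical call site): the Object return value must be cast
// according to the corpus type that was passed in:
//   Object res = readXmlHeaderTaxonomyAndFilters(path, false, CorpusType.SOLAR);
//   @SuppressWarnings("unchecked")
//   HashMap<String, HashSet<String>> solarFilters = (HashMap<String, HashSet<String>>) res;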
private static boolean isEndElementEndOfHeader(XMLEvent event, String headerTag) {
return event.asEndElement()
.getName()
.getLocalPart()
.equalsIgnoreCase(headerTag);
}
@SuppressWarnings("Duplicates")
public static boolean readXMLGigafida(String path, StatisticsNew stats) {
boolean inWord = false;
ArrayList<String> currentFiletaxonomy = new ArrayList<>();
String lemma = "";
String msd = "";
List<Word> sentence = new ArrayList<>();
List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it
String sentenceDelimiter = "s";
XMLEventReader eventReader = null;
try {
XMLInputFactory factory = XMLInputFactory.newInstance();
eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
XMLEvent event = eventReader.nextEvent();
switch (event.getEventType()) {
case XMLStreamConstants.START_ELEMENT:
StartElement startElement = event.asStartElement();
String qName = startElement.getName().getLocalPart();
// "word" node
if (qName.equals("w")) {
inWord = true;
msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
}
// taxonomy node
else if (qName.equalsIgnoreCase("catRef")) {
// there are some term nodes at the beginning that are of no interest to us
// they differ by not having the attribute "ref", so test will equal null
Attribute tax = startElement.getAttributeByName(QName.valueOf("target"));
if (tax != null) {
// keep only taxonomy properties
currentFiletaxonomy.add(String.valueOf(tax.getValue()).replace("#", ""));
}
}
break;
case XMLStreamConstants.CHARACTERS:
Characters characters = event.asCharacters();
// "word" node value
if (inWord) {
String word = characters.getData();
sentence.add(new Word(word, lemma, msd));
inWord = false;
}
break;
case XMLStreamConstants.END_ELEMENT:
EndElement endElement = event.asEndElement();
// parser reached end of the current sentence
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
// add sentence to corpus if it passes filters
sentence = runFilters(sentence, stats.getFilter());
if (!ValidationUtil.isEmpty(sentence)) {
corpus.add(new Sentence(sentence));
}
// and start a new one
sentence = new ArrayList<>();
/* Invoke Fork-Join when we reach maximum limit of
* sentences (because we can't read everything to
* memory) or we reach the end of the file.
*/
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
fj(corpus, stats);
// empty the current corpus, since we don't need the data anymore
corpus.clear();
// TODO: if (stats.isUseDB()) {
// stats.storeTmpResultsToDB();
// }
}
} else if (endElement.getName().getLocalPart().equals("teiHeader")) {
// before proceeding to read this file, make sure that taxonomy filters are a match
if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection
if (currentFiletaxonomy.isEmpty()) {
// taxonomies don't match so stop
return false;
}
}
}
// fallback
else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
fj(corpus, stats);
corpus.clear();
// TODO: if (stats.isUseDB()) {
// stats.storeTmpResultsToDB();
// }
}
break;
}
}
} catch (FileNotFoundException | XMLStreamException e) {
e.printStackTrace();
} finally {
if (eventReader != null) {
try {
eventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
}
}
}
return true;
}
@SuppressWarnings("Duplicates")
public static boolean readXMLGos(String path, StatisticsNew stats) {
boolean inWord = false;
boolean inOrthDiv = false;
boolean computeForOrth = stats.getCorpus().isGosOrthMode();
ArrayList<String> currentFiletaxonomy = new ArrayList<>();
String lemma = "";
String msd = "";
List<Word> sentence = new ArrayList<>();
List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it
String sentenceDelimiter = "seg";
String gosType = stats.getFilter().hasMsd() ? "norm" : "orth"; // orth & norm
XMLEventReader eventReader = null;
boolean includeFile = true;
try {
XMLInputFactory factory = XMLInputFactory.newInstance();
eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
XMLEvent event = eventReader.nextEvent();
// System.out.print(String.format("%s", event.toString().replaceAll("\\['http://www.tei-c.org/ns/1.0'\\]::", "")));
switch (event.getEventType()) {
case XMLStreamConstants.START_ELEMENT:
StartElement startElement = event.asStartElement();
String qName = startElement.getName().getLocalPart();
if (qName.equals("div")) {
HashMap<String, String> atts = extractAttributes(startElement);
if (atts.keySet().contains("type")) {
inOrthDiv = atts.get("type").equals("orth");
}
}
// "word" node
if (qName.equals("w")) {
// check that it's not a type
HashMap<String, String> atts = extractAttributes(startElement);
if (!atts.containsKey("type")) {
inWord = true;
if (atts.containsKey("msd")) {
msd = atts.get("msd");
}
if (atts.containsKey("lemma")) {
lemma = atts.get("lemma");
}
}
}
// taxonomy node
else if (qName.equalsIgnoreCase("catRef")) {
// there are some term nodes at the beginning that are of no interest to us
// they differ by not having the attribute "ref", so test will equal null
Attribute tax = startElement.getAttributeByName(QName.valueOf("target"));
if (tax != null) {
// keep only taxonomy properties
currentFiletaxonomy.add(String.valueOf(tax.getValue()));
}
}
break;
case XMLStreamConstants.CHARACTERS:
// "word" node value
if (inWord) {
Characters characters = event.asCharacters();
if (gosType.equals("norm") && msd != null) {
sentence.add(new Word(characters.getData(), lemma, msd));
} else {
sentence.add(new Word(characters.getData()));
}
inWord = false;
}
break;
case XMLStreamConstants.END_ELEMENT:
EndElement endElement = event.asEndElement();
// parser reached end of the current sentence
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
// run the sentence through the filters first; only add it if anything survives
boolean saveSentence = computeForOrth == inOrthDiv;
if (includeFile && saveSentence) {
sentence = runFilters(sentence, stats.getFilter());
if (!ValidationUtil.isEmpty(sentence)) {
corpus.add(new Sentence(sentence));
}
}
// and start a new one
sentence = new ArrayList<>();
/* Invoke Fork-Join when we reach maximum limit of
* sentences (because we can't read everything to
* memory) or we reach the end of the file.
*/
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
fj(corpus, stats);
// empty the current corpus, since we don't need
// the data anymore
corpus.clear();
}
} else if (endElement.getName().getLocalPart().equals("teiHeader")) {
// before proceeding to read this file, make sure that taxonomy filters are a match
if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection
// disregard this entry if taxonomies don't match
includeFile = !currentFiletaxonomy.isEmpty();
currentFiletaxonomy = new ArrayList<>();
}
}
// backup
else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
fj(corpus, stats);
corpus.clear();
}
break;
}
}
} catch (FileNotFoundException | XMLStreamException e) {
e.printStackTrace();
} finally {
if (eventReader != null) {
try {
eventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
} catch (Exception e) {
logger.error("general error", e);
}
}
}
return true;
}
/**
* Runs the sentence through some filters, so we avoid unnecessary calculations.
* Filters:
* <ol>
* <li><b>Ngrams:</b> omit sentences that are shorter than the ngram value (e.g. a 3-gram on a single-word sentence)</li>
* <li><b>Letter ngrams:</b> omit words that are shorter than the specified string length (e.g. combinations of 3 letters when the word consists of only 2 letters)</li>
* </ol>
*
* @return an empty sentence (if it fails 1.) or a sentence with some words removed (2.)
*/
private static List<Word> runFilters(List<Word> sentence, Filter filter) {
if (filter.getAl() == AnalysisLevel.STRING_LEVEL) {
// ngram level: if not 0 must be less than or equal to number of words in this sentence.
if (filter.getNgramValue() > 0 && filter.getNgramValue() > sentence.size()) {
// return an empty sentence rather than null, per the contract documented above
return new ArrayList<>();
}
// if we're calculating values for letters, omit words that are shorter than string length
if (filter.getNgramValue() == 0) {
sentence.removeIf(w -> (filter.getCalculateFor() == CalculateFor.WORD && w.getWord().length() < filter.getStringLength())
|| (filter.getCalculateFor() == CalculateFor.LEMMA && w.getLemma().length() < filter.getStringLength()));
}
}
return sentence;
}
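// Example: with ngramValue = 3, a 2-word sentence comes back empty (filter 1);
// with ngramValue = 0 and stringLength = 3, two-letter words are dropped from
// the sentence while the longer words are kept (filter 2).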
private static HashMap<String, String> extractAttributes(StartElement se) {
Iterator<?> attributesIt = se.getAttributes();
HashMap<String, String> atts = new HashMap<>();
while (attributesIt.hasNext()) {
Attribute a = (Attribute) attributesIt.next();
atts.put(a.getName().getLocalPart(), a.getValue());
}
return atts;
}
}

View File

@@ -0,0 +1,67 @@
package alg.inflectedJOS;
import java.util.List;
import java.util.concurrent.RecursiveAction;
import data.Sentence;
import data.Statistics;
public class ForkJoin extends RecursiveAction {
private static final long serialVersionUID = -1260951004477299634L;
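// subproblems below this sentence count are computed directly instead of being split further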
private static final int ACCEPTABLE_SIZE = 1000;
private List<Sentence> corpus;
private Statistics stats;
private int start;
private int end;
/**
* Constructor for subproblems.
*/
private ForkJoin(List<Sentence> corpus, int start, int end, Statistics stats) {
this.corpus = corpus;
this.start = start;
this.end = end;
this.stats = stats;
}
/**
* Default constructor for the initial problem
*/
public ForkJoin(List<Sentence> corpus, Statistics stats) {
this.corpus = corpus;
this.start = 0;
this.end = corpus.size();
this.stats = stats;
}
private void computeDirectly() {
List<Sentence> subCorpus = corpus.subList(start, end);
if (stats.isTaxonomySet()) {
InflectedJOSCount.calculateForAll(subCorpus, stats, stats.getInflectedJosTaxonomy());
} else {
InflectedJOSCount.calculateForAll(subCorpus, stats, null);
}
}
@Override
protected void compute() {
int subCorpusSize = end - start;
if (subCorpusSize < ACCEPTABLE_SIZE) {
computeDirectly();
} else {
int mid = start + subCorpusSize / 2;
ForkJoin left = new ForkJoin(corpus, start, mid, stats);
ForkJoin right = new ForkJoin(corpus, mid, end, stats);
// fork (push to queue) -> compute -> join
left.fork();
right.fork();
left.join();
right.join();
}
}
}

View File

@@ -0,0 +1,170 @@
package alg.inflectedJOS;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import alg.Common;
import data.Sentence;
import data.Statistics;
import data.StatisticsNew;
import data.Word;
public class InflectedJOSCount {
public static HashMap<Integer, ArrayList<ArrayList<Integer>>> indices;
static void calculateForAll(List<Sentence> corpus, Statistics stats, String taxonomy) {
for (Sentence s : corpus) {
// disregard if wrong taxonomy
if (taxonomy != null && !(s.getTaxonomy().startsWith(taxonomy))) {
continue;
}
for (Word word : s.getWords()) {
// skip if the current word is not inflected (it has no msd)
if (word.getMsd() == null || word.getMsd().length() == 0) {
continue;
}
String msd = word.getMsd();
StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
for (int i = 1; i < msd.length(); i++) {
entry.setCharAt(i, msd.charAt(i));
Common.updateMap(stats.result, entry.toString());
entry.setCharAt(i, '-');
}
}
}
}
public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats, String taxonomy) {
for (Sentence s : corpus) {
for (Word word : s.getWords()) {
// skip words without an msd - charAt(0) below would fail on an empty string
// TODO: also check that the msd is of the correct type (create a set)
String msd = word.getMsd();
if (msd == null || msd.length() == 0) {
continue;
}
StringBuilder entry = new StringBuilder(msd.charAt(0) + StringUtils.repeat('-', (msd.length() - 1)));
for (int i = 1; i < msd.length(); i++) {
entry.setCharAt(i, msd.charAt(i));
stats.updateResults(entry.toString());
entry.setCharAt(i, '-');
}
}
}
}
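// Example of the masking loop above: for the msd "Ncmsn" the counted entries
// are "Nc---", "N-m--", "N--s-" and "N---n", i.e. one entry per inflectional attribute.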
}

View File

@@ -0,0 +1,131 @@
package alg.inflectedJOS;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Collectors;
import data.Enums.InflectedJosTypes;
import data.StatisticsNew;
import gui.ValidationUtil;
import util.Combinations;
// adapted from http://www.geeksforgeeks.org/print-all-possible-combinations-of-r-elements-in-a-given-array-of-size-n/
public class WordFormation {
private static HashMap<String, Long> josTypeResult;
private static Object[][] tmpResults;
private static HashMap<Integer, HashSet<HashSet<Integer>>> indices;
static {
indices = new HashMap<>();
for (int i = 4; i <= 8; i++) {
indices.put(i, Combinations.generateIndices(i));
}
}
public static void calculateStatistics(StatisticsNew stat) {
Map<String, AtomicLong> result = stat.getResult();
// 1. filter - keep only inflected types
result.keySet().removeIf(x -> !InflectedJosTypes.inflectedJosTypes.contains(x.charAt(0)));
// 2. for each inflected type get all possible subcombinations
for (Character josChar : InflectedJosTypes.inflectedJosTypes) {
josTypeResult = new HashMap<>();
// filter out results for a single word type
Map<String, AtomicLong> singleTypeResults = result.entrySet().stream()
.filter(x -> x.getKey().charAt(0) == josChar)
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
if (ValidationUtil.isEmpty(singleTypeResults)) {
continue;
}
for (Map.Entry<String, AtomicLong> e : singleTypeResults.entrySet()) {
// index combinations are precomputed only for msd lengths 4-8; skip anything else
HashSet<HashSet<Integer>> combos = indices.get(e.getKey().length());
if (combos == null) {
continue;
}
for (HashSet<Integer> indicesCombo : combos) {
updateResults(mask(e.getKey(), indicesCombo), e.getValue().longValue());
}
}
resultsMapToArray(singleTypeResults.values().stream().mapToLong(Number::longValue).sum());
}
stat.setResultCustom(tmpResults);
}
private static String mask(String word, HashSet<Integer> indicesCombo) {
StringBuilder sb = new StringBuilder();
sb.append(word.charAt(0));
for (int i = 1; i < word.length(); i++) {
sb.append(indicesCombo.contains(i) ? word.charAt(i) : ".");
}
return sb.toString();
}
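// Example: mask("Ncmsn", {2, 4}) keeps the leading category character plus
// indices 2 and 4, yielding "N.m.n".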
private static void updateResults(String s, Long nOfOccurrences) {
// if not in the map, add
Long r = josTypeResult.putIfAbsent(s, nOfOccurrences);
// else update
if (r != null) {
josTypeResult.put(s, josTypeResult.get(s) + nOfOccurrences);
}
}
private static void resultsMapToArray(Long totalValue) {
double total = totalValue.doubleValue();
Object[][] josTypeResultArray = new Object[josTypeResult.size()][3];
int i = 0;
for (Map.Entry<String, Long> e : josTypeResult.entrySet()) {
josTypeResultArray[i][0] = e.getKey();
josTypeResultArray[i][1] = e.getValue();
josTypeResultArray[i][2] = e.getValue() / total;
i++;
}
if (tmpResults == null) {
tmpResults = josTypeResultArray;
} else {
int firstLength = tmpResults.length;
int secondLength = josTypeResultArray.length;
Object[][] tmp = new Object[firstLength + secondLength][3];
System.arraycopy(tmpResults, 0, tmp, 0, firstLength);
System.arraycopy(josTypeResultArray, 0, tmp, firstLength, secondLength);
tmpResults = tmp;
}
}
private static void printArray() {
for (int i = 0; i < tmpResults.length; i++) {
for (int j = 0; j < tmpResults[i].length; j++) {
System.out.print(tmpResults[i][j] + "\t");
}
System.out.println();
}
}
}

View File

@@ -0,0 +1,62 @@
package alg.ngram;
import java.util.List;
import java.util.concurrent.RecursiveAction;
import data.Sentence;
import data.StatisticsNew;
public class ForkJoin extends RecursiveAction {
private static final long serialVersionUID = 5074814035083362355L;
private static final int ACCEPTABLE_SIZE = 1000;
private List<Sentence> corpus;
private StatisticsNew stats;
private int start;
private int end;
/**
* Constructor for subproblems.
*/
private ForkJoin(List<Sentence> corpus, int start, int end, StatisticsNew stats) {
this.corpus = corpus;
this.start = start;
this.end = end;
this.stats = stats;
}
/**
* Default constructor for the initial problem
*/
public ForkJoin(List<Sentence> corpus, StatisticsNew stats) {
this.corpus = corpus;
this.start = 0;
this.end = corpus.size();
this.stats = stats;
}
private void computeDirectly() {
List<Sentence> subCorpus = corpus.subList(start, end);
Ngrams.calculateForAll(subCorpus, stats);
}
@Override
protected void compute() {
int subCorpusSize = end - start;
if (subCorpusSize < ACCEPTABLE_SIZE) {
computeDirectly();
} else {
int mid = start + subCorpusSize / 2;
ForkJoin left = new ForkJoin(corpus, start, mid, stats);
ForkJoin right = new ForkJoin(corpus, mid, end, stats);
// fork (push to queue) -> compute -> join
left.fork();
right.fork();
left.join();
right.join();
}
}
}

View File

@@ -0,0 +1,204 @@
package alg.ngram;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import data.CalculateFor;
import data.Sentence;
import data.StatisticsNew;
import data.Word;
import gui.ValidationUtil;
public class Ngrams {
public final static Logger logger = LogManager.getLogger(Ngrams.class);
public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats) {
if (stats.getFilter().getNgramValue() == 0) { // letter ngram
generateNgramLetterCandidates(corpus, stats);
} else if (!ValidationUtil.isEmpty(stats.getFilter().getSkipValue()) && stats.getFilter().getSkipValue() > 0) {
generateSkipgramCandidates(corpus, stats);
} else {
generateNgramCandidates(corpus, stats);
}
}
public static void generateNgramCandidates(List<Sentence> corpus, StatisticsNew stats) {
for (Sentence s : corpus) {
// skip sentences shorter than specified ngram length
if (s.getWords().size() < stats.getFilter().getNgramValue()) {
continue;
}
for (int i = 0; i < s.getWords().size() - stats.getFilter().getNgramValue() + 1; i++) {
List<Word> ngramCandidate = s.getSublist(i, i + stats.getFilter().getNgramValue());
// if msd regex is set and this candidate doesn't pass it, skip this iteration
if (stats.getFilter().hasMsd() && !passesRegex(ngramCandidate, stats.getFilter().getMsd())) {
continue;
}
stats.updateResults(wordToString(ngramCandidate, stats.getFilter().getCalculateFor()));
}
}
}
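// Example: for a sentence [w0 w1 w2] with ngramValue = 2, the sliding window
// above produces the candidates w0w1 and w1w2.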
/**
* Checks whether an ngram candidate passes specified regex filter.
*/
private static boolean passesRegex(List<Word> ngramCandidate, ArrayList<Pattern> regex) {
if (ngramCandidate.size() != regex.size()) {
logger.error("ngramCandidate.size() & msd.size() mismatch"); // should not occur anyway
return false;
}
for (int i = 0; i < regex.size(); i++) {
if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern())) {
return false;
}
}
return true;
}
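// Example (hypothetical patterns): for a 2-gram candidate, a regex list
// [N.*, V.*] requires the first word's msd to match N.* and the second's to match V.*.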
private static String wordToString(List<Word> ngramCandidate, CalculateFor calculateFor) {
ArrayList<String> candidate = new ArrayList<>(ngramCandidate.size());
switch (calculateFor) {
case LEMMA:
candidate.addAll(ngramCandidate
.stream()
.map(Word::getLemma)
.collect(Collectors.toList()));
break;
case WORD:
candidate.addAll(ngramCandidate
.stream()
.map(Word::getWord)
.collect(Collectors.toList()));
break;
case MORPHOSYNTACTIC_SPECS:
case MORPHOSYNTACTIC_PROPERTY:
candidate.addAll(ngramCandidate
.stream()
.map(Word::getMsd)
.collect(Collectors.toList()));
break;
case WORD_TYPE:
candidate.addAll(ngramCandidate
.stream()
.map(w -> Character.toString(w.getMsd().charAt(0)))
.collect(Collectors.toList()));
break;
}
return StringUtils.join(candidate, " ");
}
/**
* Generates letter-ngram candidates and updates the results.
*
* @param corpus sentences to process
* @param stats statistics object that accumulates the results
*/
private static void generateNgramLetterCandidates(List<Sentence> corpus, StatisticsNew stats) {
for (Sentence s : corpus) {
for (Word w : s.getWords()) {
String word = w.getForCf(stats.getFilter().getCalculateFor(), stats.getFilter().isCvv());
// skip this iteration if:
// - word doesn't contain a proper version (missing lemma for example)
// - msd regex is given but this word's msd doesn't match it, skip this iteration
// - given substring length is larger than the word length
if (ValidationUtil.isEmpty(word)
|| stats.getFilter().hasMsd() && !w.getMsd().matches(stats.getFilter().getMsd().get(0).pattern())
|| word.length() < stats.getFilter().getStringLength()) {
continue;
}
for (int i = 0; i < word.length() - stats.getFilter().getStringLength() + 1; i++) {
// TODO: punctuation?
stats.updateResults(word.substring(i, i + stats.getFilter().getStringLength()));
}
}
}
}
/**
* Extracts skipgram candidates and updates the results directly.
* For an ngram length of n and a skip value of k, candidates are built from
* words that are at most k positions apart at each step.
*/
public static void generateSkipgramCandidates(List<Sentence> corpus, StatisticsNew stats) {
ArrayList<Word> currentLoop;
int ngram = stats.getFilter().getNgramValue();
int skip = stats.getFilter().getSkipValue();
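// Worked example (assuming ngram = 2, skip = 1): for the sentence
// [w0 w1 w2 w3], the counted candidates are w0w1, w0w2, w1w2, w1w3 and w2w3.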
for (Sentence s : corpus) {
List<Word> sentence = s.getWords();
for (int i = 0; i <= sentence.size() - ngram; i++) { // 1gram
for (int j = i + 1; j <= i + skip + 1; j++) { // 2gram
if (ngram == 2 && j < sentence.size()) {
currentLoop = new ArrayList<>();
currentLoop.add(sentence.get(i));
currentLoop.add(sentence.get(j));
validateAndCountSkipgramCandidate(currentLoop, stats);
} else {
for (int k = j + 1; k <= j + 1 + skip; k++) { // 3gram
if (ngram == 3 && k < sentence.size()) {
currentLoop = new ArrayList<>();
currentLoop.add(sentence.get(i));
currentLoop.add(sentence.get(j));
currentLoop.add(sentence.get(k));
validateAndCountSkipgramCandidate(currentLoop, stats);
} else {
for (int l = k + 1; l <= k + 1 + skip; l++) { // 4gram
if (ngram == 4 && l < sentence.size()) {
currentLoop = new ArrayList<>();
currentLoop.add(sentence.get(i));
currentLoop.add(sentence.get(j));
currentLoop.add(sentence.get(k));
currentLoop.add(sentence.get(l));
validateAndCountSkipgramCandidate(currentLoop, stats);
} else {
for (int m = l + 1; m <= l + 1 + skip; m++) { // 5gram
if (ngram == 5 && m < sentence.size()) {
currentLoop = new ArrayList<>();
currentLoop.add(sentence.get(i));
currentLoop.add(sentence.get(j));
currentLoop.add(sentence.get(k));
currentLoop.add(sentence.get(l));
currentLoop.add(sentence.get(m));
validateAndCountSkipgramCandidate(currentLoop, stats);
}
}
}
}
}
}
}
}
}
}
}
private static void validateAndCountSkipgramCandidate(ArrayList<Word> skipgramCandidate, StatisticsNew stats) {
// count if no regex is set or if it is & candidate passes it
if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd())) {
stats.updateResults(wordToString(skipgramCandidate, stats.getFilter().getCalculateFor()));
}
}
}

View File

@@ -0,0 +1,62 @@
package alg.word;
import java.util.List;
import java.util.concurrent.RecursiveAction;
import data.Sentence;
import data.StatisticsNew;
public class ForkJoin extends RecursiveAction {
private static final long serialVersionUID = 7711587510996456040L;
private static final int ACCEPTABLE_SIZE = 1000;
private List<Sentence> corpus;
private StatisticsNew stats;
private int start;
private int end;
/**
* Constructor for subproblems.
*/
private ForkJoin(List<Sentence> corpus, int start, int end, StatisticsNew stats) {
this.corpus = corpus;
this.start = start;
this.end = end;
this.stats = stats;
}
/**
* Default constructor for the initial problem
*/
public ForkJoin(List<Sentence> corpus, StatisticsNew stats) {
this.corpus = corpus;
this.start = 0;
this.end = corpus.size();
this.stats = stats;
}
private void computeDirectly() {
List<Sentence> subCorpus = corpus.subList(start, end);
WordLevel.calculateForAll(subCorpus, stats);
}
@Override
protected void compute() {
int subCorpusSize = end - start;
if (subCorpusSize < ACCEPTABLE_SIZE) {
computeDirectly();
} else {
int mid = start + subCorpusSize / 2;
ForkJoin left = new ForkJoin(corpus, start, mid, stats);
ForkJoin right = new ForkJoin(corpus, mid, end, stats);
// fork (push to queue) -> compute -> join
left.fork();
right.fork();
left.join();
right.join();
}
}
}

View File

@@ -0,0 +1,167 @@
package alg.word;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import alg.Common;
import data.CalculateFor;
import data.Sentence;
import data.Statistics;
import data.Word;
class WordCount {
private static void calculateNoFilter(List<Sentence> corpus, Statistics stats) {
for (Sentence s : corpus) {
List<String> sentence = new ArrayList<>(s.getWords().size());
if (stats.getCf() == CalculateFor.LEMMA) {
sentence.addAll(s.getWords()
.stream()
.map(Word::getLemma)
.collect(Collectors.toList()));
} else if (stats.getCf() == CalculateFor.WORD) {
sentence.addAll(s.getWords()
.stream()
.map(Word::getWord)
.collect(Collectors.toList()));
}
for (String word : sentence) {
Common.updateMap(stats.result, word);
}
}
}
private static void calculateVCC(List<Sentence> corpus, Statistics stats) {
for (Sentence s : corpus) {
List<String> sentence = new ArrayList<>(s.getWords().size());
if (stats.getCf() == CalculateFor.LEMMA) {
sentence.addAll(s.getWords()
.stream()
.map(Word::getCVVLemma)
.collect(Collectors.toList()));
} else if (stats.getCf() == CalculateFor.WORD) {
sentence.addAll(s.getWords()
.stream()
.map(Word::getCVVWord)
.collect(Collectors.toList()));
}
for (String word : sentence) {
// >= so that words exactly as long as the substring are counted as well
if (word.length() >= stats.getSubstringLength()) {
for (int i = 0; i <= word.length() - stats.getSubstringLength(); i++) {
String substring = word.substring(i, i + stats.getSubstringLength());
Common.updateMap(stats.result, substring);
}
}
}
}
}
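// Example: with a substring length of 3, a 6-character CVV form yields
// 4 substrings (indices 0-2, 1-3, 2-4, 3-5), each counted separately.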
private static void calculateForJosType(List<Sentence> corpus, Statistics stats) {
for (Sentence s : corpus) {
List<String> sentence = new ArrayList<>(s.getWords().size());
List<Word> filteredWords = new ArrayList<>();
for (Word word : s.getWords()) {
if (word.getMsd() != null && word.getMsd().charAt(0) == stats.getDistributionJosWordType()) {
filteredWords.add(word);
}
}
if (stats.getCf() == CalculateFor.LEMMA) {
sentence.addAll(filteredWords
.stream()
.map(Word::getLemma)
.collect(Collectors.toList()));
} else if (stats.getCf() == CalculateFor.WORD) {
sentence.addAll(filteredWords
.stream()
.map(Word::getWord)
.collect(Collectors.toList()));
}
for (String word : sentence) {
Common.updateMap(stats.result, word);
}
}
}
private static void calculateForTaxonomyAndJosType(List<Sentence> corpus, Statistics stats) {
for (Sentence s : corpus) {
if (s.getTaxonomy().equalsIgnoreCase(stats.getDistributionTaxonomy())) {
List<String> sentence = new ArrayList<>(s.getWords().size());
List<Word> filteredWords = new ArrayList<>();
for (Word word : s.getWords()) {
// null-check the msd, as in calculateForJosType above
if (word.getMsd() != null && word.getMsd().charAt(0) == stats.getDistributionJosWordType()) {
filteredWords.add(word);
}
}
if (stats.getCf() == CalculateFor.LEMMA) {
sentence.addAll(filteredWords
.stream()
.map(Word::getLemma)
.collect(Collectors.toList()));
} else if (stats.getCf() == CalculateFor.WORD) {
sentence.addAll(filteredWords
.stream()
.map(Word::getWord)
.collect(Collectors.toList()));
}
for (String word : sentence) {
Common.updateMap(stats.result, word);
}
}
}
}
private static void calculateForTaxonomy(List<Sentence> corpus, Statistics stats) {
for (Sentence s : corpus) {
if (s.getTaxonomy().equalsIgnoreCase(stats.getDistributionTaxonomy())) {
List<String> sentence = new ArrayList<>(s.getWords().size());
if (stats.getCf() == CalculateFor.LEMMA) {
sentence.addAll(s.getWords()
.stream()
.map(Word::getLemma)
.collect(Collectors.toList()));
} else if (stats.getCf() == CalculateFor.WORD) {
sentence.addAll(s.getWords()
.stream()
.map(Word::getWord)
.collect(Collectors.toList()));
}
for (String word : sentence) {
Common.updateMap(stats.result, word);
}
}
}
}
static void calculateForAll(List<Sentence> corpus, Statistics stats) {
boolean taxonomyIsSet = stats.isTaxonomySet();
boolean josTypeIsSet = stats.isJOSTypeSet();
// dispatch to a specialized method: each variant checks its conditions once
// instead of re-evaluating them for every word, which adds up on large corpora
if (taxonomyIsSet && josTypeIsSet) {
calculateForTaxonomyAndJosType(corpus, stats);
} else if (taxonomyIsSet) {
calculateForTaxonomy(corpus, stats);
} else if (josTypeIsSet) {
calculateForJosType(corpus, stats);
} else {
if (stats.isVcc()) {
calculateVCC(corpus, stats);
} else {
calculateNoFilter(corpus, stats);
}
}
}
}

View File

@@ -0,0 +1,112 @@
package alg.word;
import static data.Enums.WordLevelDefaultValues.*;
import java.util.HashSet;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import data.Enums.WordLevelDefaultValues;
import data.Enums.WordLevelType;
import data.Sentence;
import data.StatisticsNew;
import data.Word;
@SuppressWarnings("Duplicates")
public class WordLevel {
private static HashSet<String> suffixes;
private static int minSuffixLength;
private static int maxSuffixLength;
private static HashSet<String> prefixes;
private static int minPrefixLength;
private static int maxPrefixLength;
static {
suffixes = WordLevelDefaultValues.getSuffixes();
calculateSuffixesLengths();
prefixes = WordLevelDefaultValues.getPrefixes();
calculatePrefixesLengths();
}
public static void calculateForAll(List<Sentence> corpus, StatisticsNew stats) {
for (Sentence s : corpus) {
for (Word word : s.getWords()) {
calculateForSuffixes(word.getWord(), stats);
calculateForPrefixes(word.getWord(), stats);
}
}
}
private static void calculateForPrefixes(String word, StatisticsNew stats) {
// go from the longest prefix option to the shortest
for (int tmpPrefixLength = maxPrefixLength; tmpPrefixLength >= minPrefixLength; tmpPrefixLength--) {
// skip lengths that would leave fewer characters than the preset minimum
if (word.length() - tmpPrefixLength < MIN_N_OF_CHARACTERS_LEFT_PREFIX) {
continue;
}
String extractedPrefix = StringUtils.left(word, tmpPrefixLength);
if (prefixes.contains(extractedPrefix)) {
// save prefix and full word
stats.updateResultsNested(WordLevelType.PREFIX, extractedPrefix, word);
return;
}
}
}
public static void calculateForSuffixes(String word, StatisticsNew stats) {
// go from the longest suffix option to the shortest
for (int tmpSuffixLength = maxSuffixLength; tmpSuffixLength >= minSuffixLength; tmpSuffixLength--) {
// skip lengths that would leave fewer characters than the preset minimum
if (word.length() - tmpSuffixLength < MIN_N_OF_CHARACTERS_LEFT_SUFFIX) {
continue;
}
String extractedSuffix = StringUtils.right(word, tmpSuffixLength);
if (suffixes.contains(extractedSuffix)) {
// save suffix and full word
stats.updateResultsNested(WordLevelType.SUFFIX, extractedSuffix, word);
return;
}
}
}
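// Example (hypothetical affix sets): if suffixes contains both "ica" and "ca",
// the word "hisica" is recorded under "ica", since longer suffixes are tried first.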
// finds the shortest and longest suffix for quicker calculations
public static void calculateSuffixesLengths() {
minSuffixLength = -1;
maxSuffixLength = -1;
for (String suffix : suffixes) {
if (suffix.length() > maxSuffixLength) {
maxSuffixLength = suffix.length();
if (minSuffixLength < 0) {
minSuffixLength = maxSuffixLength;
}
} else if (suffix.length() < minSuffixLength) {
minSuffixLength = suffix.length();
}
}
}
// finds the shortest and longest prefix for quicker calculations
public static void calculatePrefixesLengths() {
minPrefixLength = -1;
maxPrefixLength = -1;
for (String prefix : prefixes) {
if (prefix.length() > maxPrefixLength) {
maxPrefixLength = prefix.length();
if (minPrefixLength < 0) {
minPrefixLength = maxPrefixLength;
}
} else if (prefix.length() < minPrefixLength) {
minPrefixLength = prefix.length();
}
}
}
}