Added new ssj500k reading option. Fixed GOS taxonomy

Luka 2018-09-03 13:31:41 +02:00
parent 426a9ccc46
commit 1d9e9b7ed6
9 changed files with 280 additions and 40 deletions

View File

@ -52,6 +52,8 @@ public class XML_processing {
readXMLGos(path, stats);
} else if (stats.getCorpus().getCorpusType() == CorpusType.SOLAR) {
readXMLSolar(path, stats);
} else if (stats.getCorpus().getCorpusType() == CorpusType.SSJ500K) {
readXMLSSJ500K(path, stats);
}
}
@ -91,6 +93,50 @@ public class XML_processing {
return "";
}
/**
 * Reads and returns the value of the given header attribute, e.g. the body
 * element's "base" attribute, used to discern whether a corpus is of the
 * ssj500k type. Falls back to the element's text content when the attribute
 * is absent, and returns an empty string when the tag is not found at all.
 * Notice: only the first occurrence of the given tag name is considered.
 */
public static String readXMLHeaderAttribute(String path, String tag, String attribute) {
XMLInputFactory factory = XMLInputFactory.newInstance();
XMLEventReader eventReader = null;
try {
eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
XMLEvent xmlEvent = eventReader.nextEvent();
if (xmlEvent.isStartElement()) {
StartElement startElement = xmlEvent.asStartElement();
String var = startElement.getName().getLocalPart();
if (var.equalsIgnoreCase(tag)) {
HashMap<String, String> att = extractAttributes(startElement);
if (att.containsKey(attribute)) {
// strip the trailing 12 characters (presumably a fixed file-name suffix) from the attribute value
return att.get(attribute).substring(0, att.get(attribute).length() - 12);
}
// no such attribute: fall back to the element's text content
return eventReader.nextEvent().asCharacters().getData();
}
}
}
} catch (FileNotFoundException | XMLStreamException e) {
e.printStackTrace();
} finally {
if (eventReader != null) {
try {
eventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
}
}
}
return "";
}
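A sketch of the intended use, mirroring the CorpusTab call site further down (f stands for the first corpus file, as it does there):

String attrib = XML_processing.readXMLHeaderAttribute(f.getAbsolutePath(), "body", "base").toLowerCase();
if (attrib.contains(CorpusType.SSJ500K.getNameLowerCase())) {
    corpusType = CorpusType.SSJ500K;
}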
private static void fj(List<Sentence> corpus, StatisticsNew stats) {
ForkJoinPool pool = new ForkJoinPool();
@ -403,6 +449,8 @@ public class XML_processing {
// init results now to avoid null pointers
headTags.forEach(f -> resultFilters.put(f, new HashSet<>()));
} else if (corpusType == CorpusType.SSJ500K) {
headTagName = "bibl";
} else {
headTagName = "teiHeader";
}
@ -436,6 +484,12 @@ public class XML_processing {
.getValue()
.replace("#", "");
resultTaxonomy.add(tax);
} else if (parseTaxonomy && elementName.equalsIgnoreCase("term")) {
String tax = startElement.getAttributeByName(QName.valueOf("ref"))
.getValue()
.replace("#", "");
resultTaxonomy.add(tax);
} else if (!parseTaxonomy && headTags.contains(elementName)) {
String tagContent = xmlEventReader.nextEvent().asCharacters().getData();
@ -646,6 +700,138 @@ public class XML_processing {
return true;
}
@SuppressWarnings("Duplicates")
public static boolean readXMLSSJ500K(String path, StatisticsNew stats) {
boolean inWord = false;
boolean inPunctuation = false;
ArrayList<String> currentFiletaxonomy = new ArrayList<>();
ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
String lemma = "";
String msd = "";
List<Word> sentence = new ArrayList<>();
List<Sentence> corpus = new ArrayList<>(Settings.CORPUS_SENTENCE_LIMIT); // preset the list's size, so there won't be a need to resize it
String sentenceDelimiter = "s";
XMLEventReader eventReader = null;
try {
XMLInputFactory factory = XMLInputFactory.newInstance();
eventReader = factory.createXMLEventReader(new FileInputStream(path));
while (eventReader.hasNext()) {
XMLEvent event = eventReader.nextEvent();
switch (event.getEventType()) {
case XMLStreamConstants.START_ELEMENT:
StartElement startElement = event.asStartElement();
String qName = startElement.getName().getLocalPart();
// "word" node
if (qName.equals("w")) {
inWord = true;
if (!String.valueOf(startElement.getAttributeByName(QName.valueOf("ana")).getValue()).substring(0, 4).equals("msd:")){
System.out.println("MSD written incorrectly");
}
msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("ana")).getValue()).substring(4);
lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
}
else if (qName.equals("pc")){
inPunctuation = true;
}
// taxonomy node
else if (qName.equalsIgnoreCase("term")) {
// some term nodes at the beginning of the file are of no interest to us;
// they lack the "ref" attribute, so the lookup below returns null for them
Attribute tax = startElement.getAttributeByName(QName.valueOf("ref"));
if (tax != null) {
// keep only taxonomy properties
String currentFiletaxonomyElement = String.valueOf(tax.getValue()).replace("#", "");
currentFiletaxonomy.add(currentFiletaxonomyElement);
Tax taxonomy = new Tax();
currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
}
}
break;
case XMLStreamConstants.CHARACTERS:
Characters characters = event.asCharacters();
// "word" node value
if (inWord) {
String word = characters.getData();
sentence.add(createWord(word, lemma, msd, word, stats.getFilter()));
inWord = false;
}
// punctuation is recorded only for n-grams (n > 1) and only when the user opted to note punctuation
if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) {
String punctuation = characters.getData();
sentence.add(createWord(punctuation, punctuation, "/", punctuation, stats.getFilter()));
inPunctuation = false;
}
break;
case XMLStreamConstants.END_ELEMENT:
String elementName = event.asEndElement().getName().getLocalPart();
// parser reached the end of the current sentence
if (elementName.equals(sentenceDelimiter)) {
// add sentence to corpus if it passes filters
sentence = runFilters(sentence, stats.getFilter());
if (!ValidationUtil.isEmpty(sentence)) {
corpus.add(new Sentence(sentence, currentFiletaxonomyLong));
}
// and start a new one
sentence = new ArrayList<>();
/* Invoke fork-join processing when we reach the maximum number of
 * sentences (we can't hold the whole corpus in memory) or the end
 * of the file.
 */
if (corpus.size() == Settings.CORPUS_SENTENCE_LIMIT || !eventReader.hasNext()) {
fj(corpus, stats);
// empty the current corpus, since we don't need the data anymore
corpus.clear();
// TODO: if (stats.isUseDB()) {
// stats.storeTmpResultsToDB();
// }
}
}
// end of a <div> element: flush the remaining sentences and reset the per-file taxonomy
else if (elementName.equalsIgnoreCase("div")) {
fj(corpus, stats);
corpus.clear();
currentFiletaxonomy = new ArrayList<>();
currentFiletaxonomyLong = new ArrayList<>();
}
break;
}
}
} catch (FileNotFoundException | XMLStreamException e) {
e.printStackTrace();
} finally {
if (eventReader != null) {
try {
eventReader.close();
} catch (XMLStreamException e) {
logger.error("closing stream", e);
}
}
}
return true;
}
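For orientation, a sketch of the input shape this reader expects, inferred from the parsing code above (illustrative only; consult the actual ssj500k TEI for the exact markup):

// <bibl> ... <term ref="#someTaxonomyCode"/> ... </bibl>  <- taxonomy; the "#" is stripped
// <s>                                                     <- the sentence delimiter
//   <w ana="msd:..." lemma="...">surface form</w>         <- "ana" must start with "msd:"
//   <pc>,</pc>                                            <- punctuation token
// </s>                                                    <- sentence is filtered and stored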
@SuppressWarnings("Duplicates")
public static boolean readXMLGos(String path, StatisticsNew stats) {
boolean inWord = false;
@ -853,6 +1039,9 @@ public class XML_processing {
else if (endElement.getName().getLocalPart().equalsIgnoreCase("tei")) {
fj(corpus, stats);
corpus.clear();
currentFiletaxonomy = new ArrayList<>();
currentFiletaxonomyLong = new ArrayList<>();
}
break;
@ -914,7 +1103,7 @@ public class XML_processing {
return atts;
}
private static Word createWord(String word, String lemma, String msd, String normalizedWord, Filter f){
public static Word createWord(String word, String lemma, String msd, String normalizedWord, Filter f){
List<String> wString = new ArrayList<>();
if (f.getWordParts().contains(CalculateFor.WORD))
wString.add(word);

View File

@ -15,6 +15,8 @@ import org.apache.logging.log4j.Logger;
import gui.ValidationUtil;
import static alg.XML_processing.createWord;
public class Ngrams {
public final static Logger logger = LogManager.getLogger(Ngrams.class);
@ -138,16 +140,22 @@ public class Ngrams {
* Checks whether an ngram candidate passes specified regex filter.
*/
private static boolean passesRegex(List<Word> ngramCandidate, ArrayList<Pattern> regex, ArrayList<CalculateFor> wordParts) {
if (ngramCandidate.size() != regex.size()) {
logger.error("ngramCandidate.size() & msd.size() mismatch"); // should not occur anyway
return false;
}
// if (ngramCandidate.size() != regex.size()) {
// logger.error("ngramCandidate.size() & msd.size() mismatch"); // should not occur anyway
// return false;
// }
for (int i = 0; i < regex.size(); i++) {
int j = 0;
for (int i = 0; i < ngramCandidate.size(); i++) {
String msd = ngramCandidate.get(i).getMsd(wordParts);
// skipgram placeholders carry "*" as their MSD and are exempt from the regex check
if (msd.equals("*")){
continue;
}
//if (!ngramCandidate.get(i).getMsd().matches(regex.get(i).pattern())) {
if (!ngramCandidate.get(i).getMsd(wordParts).matches(regex.get(i).pattern() + ".*")) {
if (!msd.matches(regex.get(j).pattern() + ".*")) {
return false;
}
// j indexes the regex list; it advances only on real (non-placeholder) words
j++;
}
return true;
@ -270,6 +278,7 @@ public class Ngrams {
ArrayList<Word> currentLoop;
int ngram = stats.getFilter().getNgramValue();
int skip = stats.getFilter().getSkipValue();
Word w = createWord("*", "*", "*", "*", stats.getFilter());
for (Sentence s : corpus) {
List<Word> sentence = s.getWords();
@ -283,7 +292,8 @@ public class Ngrams {
if (ngram == 2 && j < sentence.size()) {
currentLoop = new ArrayList<>();
// currentLoop.add(sentence.get(i));
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats));
currentLoop.add(sentence.get(i));
fillSkipgrams(currentLoop, i, j, w);
currentLoop.add(sentence.get(j));
validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
@ -291,8 +301,10 @@ public class Ngrams {
for (int k = j + 1; k <= j + 1 + skip; k++) { // 3gram
if (ngram == 3 && k < sentence.size()) {
currentLoop = new ArrayList<>();
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats));
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, j, k, stats));
currentLoop.add(sentence.get(i));
fillSkipgrams(currentLoop, i, j, w);
currentLoop.add(sentence.get(j));
fillSkipgrams(currentLoop, j, k, w);
currentLoop.add(sentence.get(k));
validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
@ -300,9 +312,12 @@ public class Ngrams {
for (int l = k + 1; l <= k + 1 + skip; l++) { // 4gram
if (ngram == 4 && l < sentence.size()) {
currentLoop = new ArrayList<>();
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats));
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, j, k, stats));
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, k, l, stats));
currentLoop.add(sentence.get(i));
fillSkipgrams(currentLoop, i, j, w);
currentLoop.add(sentence.get(j));
fillSkipgrams(currentLoop, j, k, w);
currentLoop.add(sentence.get(k));
fillSkipgrams(currentLoop, k, l, w);
currentLoop.add(sentence.get(l));
validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
@ -310,10 +325,14 @@ public class Ngrams {
for (int m = l + 1; m <= l + 1 + skip; m++) { // 5gram
if (ngram == 5 && m < sentence.size()) {
currentLoop = new ArrayList<>();
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, i, j, stats));
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, j, k, stats));
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, k, l, stats));
currentLoop.add(checkAndModifySkipgramPunctuation(sentence, l, m, stats));
currentLoop.add(sentence.get(i));
fillSkipgrams(currentLoop, i, j, w);
currentLoop.add(sentence.get(j));
fillSkipgrams(currentLoop, j, k, w);
currentLoop.add(sentence.get(k));
fillSkipgrams(currentLoop, k, l, w);
currentLoop.add(sentence.get(l));
fillSkipgrams(currentLoop, l, m, w);
currentLoop.add(sentence.get(m));
validateAndCountSkipgramCandidate(currentLoop, stats, s.getTaxonomy());
@ -329,6 +348,12 @@ public class Ngrams {
}
}
// inserts one placeholder word for every position skipped between indices i and j
private static void fillSkipgrams(ArrayList<Word> currentLoop, int i, int j, Word w){
for(int k = i + 1; k < j; k++){
currentLoop.add(w);
}
}
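To illustrate the placeholder mechanics, a sketch (* stands for the all-star word w created above):

// With ngram == 2, skip == 2 and sentence [w0, w1, w2, w3], the loops emit:
//   j = i+1: [w0, w1]           no placeholder
//   j = i+2: [w0, *, w2]        one placeholder
//   j = i+3: [w0, *, *, w3]     two placeholders
// fillSkipgrams(currentLoop, i, j, w) adds exactly j - i - 1 placeholders.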
private static void validateAndCountSkipgramCandidate(ArrayList<Word> skipgramCandidate, StatisticsNew stats, List<String> taxonomy) {
// count if no regex filter is set, or if one is set and the candidate passes it
if (!stats.getFilter().hasMsd() || passesRegex(skipgramCandidate, stats.getFilter().getMsd(), stats.getFilter().getWordParts())) {

View File

@ -4,7 +4,8 @@ public enum CorpusType {
GIGAFIDA("Gigafida", "gigafida"),
CCKRES("ccKres ", "cckres"),
SOLAR("Šolar", "šolar"),
GOS("GOS", "gos");
GOS("GOS", "gos"),
SSJ500K("ssj500k", "ssj500k");
private final String name;

View File

@ -142,7 +142,7 @@ public class Filter {
public void setHasMsd(boolean hasMsd) {
filter.put(HAS_MSD, hasMsd);
if (hasMsd)
if (hasMsd && !((ArrayList<CalculateFor>) filter.get(MULTIPLE_KEYS)).contains(CalculateFor.MORPHOSYNTACTIC_SPECS))
addWordPart(CalculateFor.MORPHOSYNTACTIC_SPECS);
}

View File

@ -10,7 +10,7 @@ import javafx.collections.ObservableList;
public class Tax {
private static LinkedHashMap<String, String> GIGAFIDA_TAXONOMY;
private static LinkedHashMap<String, String> GOS_TAXONOMY;
private static final HashSet<CorpusType> corpusTypesWithTaxonomy = new HashSet<>(Arrays.asList(CorpusType.GIGAFIDA, CorpusType.GOS, CorpusType.CCKRES));
private static final HashSet<CorpusType> corpusTypesWithTaxonomy = new HashSet<>(Arrays.asList(CorpusType.GIGAFIDA, CorpusType.GOS, CorpusType.CCKRES, CorpusType.SSJ500K));
static {
// GIGAFIDA ----------------------------
@ -77,6 +77,12 @@ public class Tax {
GOS_TAXONOMY.put("gos.S", "gos.S - situacija");
GOS_TAXONOMY.put("gos.S.R", "gos.S.R - situacija-radio");
GOS_TAXONOMY.put("gos.S.T", "gos.S.T - situacija-televizija");
GOS_TAXONOMY.put("gos.K", "gos.K - kanal");
GOS_TAXONOMY.put("gos.K.O", "gos.K.O - kanal-osebni stik");
GOS_TAXONOMY.put("gos.K.P", "gos.K.P - kanal-telefon");
GOS_TAXONOMY.put("gos.K.R", "gos.K.R - kanal-radio");
GOS_TAXONOMY.put("gos.K.T", "gos.K.T - kanal-televizija");
}
/**
@ -98,7 +104,7 @@ public class Tax {
public static ObservableList<String> getTaxonomyForComboBox(CorpusType corpusType, HashSet<String> foundTax) {
LinkedHashMap<String, String> tax = new LinkedHashMap<>();
if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES) {
if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.CCKRES || corpusType == CorpusType.SSJ500K) {
tax = GIGAFIDA_TAXONOMY;
} else if (corpusType == CorpusType.GOS) {
tax = GOS_TAXONOMY;

View File

@ -244,7 +244,7 @@ public class CorpusTab {
logger.info("reading header data for {}", corpusType.toString());
if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.GOS || corpusType == CorpusType.CCKRES) {
if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.GOS || corpusType == CorpusType.CCKRES || corpusType == CorpusType.SSJ500K) {
boolean corpusIsSplit = corpusFiles.size() > 1;
final Task<HashSet<String>> task = new Task<HashSet<String>>() {
@ -429,6 +429,7 @@ public class CorpusTab {
// read the first file only; maybe process all of them later, if the toll on resources is acceptable
File f = corpusFiles.iterator().next();
String title = XML_processing.readXMLHeaderTag(f.getAbsolutePath(), "title").toLowerCase();
String attrib = XML_processing.readXMLHeaderAttribute(f.getAbsolutePath(), "body", "base").toLowerCase();
String test = CCKRES.getNameLowerCase();
String debug = "";
@ -442,6 +443,8 @@ public class CorpusTab {
corpusType = CCKRES;
} else if (title.contains(GOS.getNameLowerCase())) {
corpusType = GOS;
} else if (attrib.contains(SSJ500K.getNameLowerCase())) {
corpusType = SSJ500K;
}
if (corpusType == null) {

View File

@ -415,7 +415,6 @@ public class OneWordAnalysisTab {
Filter filter = new Filter();
filter.setNgramValue(1);
filter.setCalculateFor(calculateFor);
filter.setMsd(msd);
filter.setTaxonomy(Tax.getTaxonomyCodes(taxonomy, corpus.getCorpusType()));
filter.setDisplayTaxonomy(displayTaxonomy);
filter.setAl(AnalysisLevel.STRING_LEVEL);
@ -424,6 +423,9 @@ public class OneWordAnalysisTab {
filter.setSolarFilters(solarFiltersMap);
filter.setStringLength(1);
filter.setMultipleKeys(alsoVisualize);
// setMsd must be called after setMultipleKeys, since setHasMsd consults the multiple-keys selection
filter.setMsd(msd);
filter.setMinimalOccurrences(minimalOccurrences);
filter.setMinimalTaxonomy(minimalTaxonomy);
filter.setWriteMsdAtTheEnd(writeMsdAtTheEnd);

View File

@ -522,7 +522,6 @@ public class StringAnalysisTabNew2 {
Filter filter = new Filter();
filter.setNgramValue(ngramValue);
filter.setCalculateFor(calculateFor);
filter.setMsd(msd);
filter.setTaxonomy(Tax.getTaxonomyCodes(taxonomy, corpus.getCorpusType()));
filter.setDisplayTaxonomy(displayTaxonomy);
filter.setAl(AnalysisLevel.STRING_LEVEL);
@ -531,6 +530,9 @@ public class StringAnalysisTabNew2 {
filter.setSolarFilters(solarFiltersMap);
filter.setNotePunctuations(notePunctuations);
filter.setMultipleKeys(alsoVisualize);
// setMsd must be called after setMultipleKeys, since setHasMsd consults the multiple-keys selection
filter.setMsd(msd);
filter.setMinimalOccurrences(minimalOccurrences);
filter.setMinimalTaxonomy(minimalTaxonomy);

View File

@ -12,6 +12,7 @@ import java.util.concurrent.atomic.AtomicLong;
import data.CalculateFor;
import data.Filter;
import data.MultipleHMKeys;
import gui.ValidationUtil;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.apache.commons.csv.QuoteMode;
@ -87,6 +88,9 @@ public class Export {
//CSV file header
if (!ValidationUtil.isEmpty(filter.getSkipValue()) && filter.getSkipValue() > 0) {
FILE_HEADER_AL.add("Izpuščene besede");
}
FILE_HEADER_AL.add(filter.getCalculateFor().toHeaderString());
if (filter.getCalculateFor().equals(CalculateFor.LEMMA))
FILE_HEADER_AL.add("Lema male črke");
@ -125,12 +129,10 @@ public class Export {
// for (Map<MultipleHMKeys, AtomicLong> value : taxonomyResults.values()) {
for (CalculateFor otherKey : filter.getMultipleKeys()) {
if (num_taxonomy_frequencies.get(otherKey) > 0) {
FILE_HEADER_AL.add(otherKey.toHeaderString());
if (otherKey.equals(CalculateFor.LEMMA))
FILE_HEADER_AL.add("Lema male črke");
}
}
// if(otherKey.equals(CalculateFor.LEMMA)){
// FILE_HEADER_AL.add("Lema");
@ -215,9 +217,12 @@ public class Export {
for (Map.Entry<MultipleHMKeys, Long> e : map.entrySet()) {
List dataEntry = new ArrayList<>();
if (!ValidationUtil.isEmpty(filter.getSkipValue()) && filter.getSkipValue() > 0) {
dataEntry.add(e.getKey().getK1());
}
dataEntry.add(eraseSkipgramStars(e.getKey().getK1(), filter));
if (filter.getCalculateFor().equals(CalculateFor.LEMMA)){
dataEntry.add(e.getKey().getK1().toLowerCase());
dataEntry.add(eraseSkipgramStars(e.getKey().getK1().toLowerCase(), filter));
}
int i = 0;
@ -225,20 +230,20 @@ public class Export {
switch(i){
case 0:
if (otherKey.equals(CalculateFor.LEMMA)){
dataEntry.add(e.getKey().getK2());
dataEntry.add(e.getKey().getK2().toLowerCase());
dataEntry.add(eraseSkipgramStars(e.getKey().getK2(), filter));
dataEntry.add(eraseSkipgramStars(e.getKey().getK2().toLowerCase(), filter));
} else {
dataEntry.add(e.getKey().getK2());
dataEntry.add(eraseSkipgramStars(e.getKey().getK2(), filter));
}
break;
case 1:
dataEntry.add(e.getKey().getK3());
dataEntry.add(eraseSkipgramStars(e.getKey().getK3(), filter));
break;
case 2:
dataEntry.add(e.getKey().getK4());
dataEntry.add(eraseSkipgramStars(e.getKey().getK4(), filter));
break;
case 3:
dataEntry.add(e.getKey().getK5());
dataEntry.add(eraseSkipgramStars(e.getKey().getK5(), filter));
break;
}
@ -330,6 +335,13 @@ public class Export {
return fileName;
}
// strips the placeholder stars that fillSkipgrams inserted for skipped words
private static String eraseSkipgramStars(String s, Filter filter){
if (!ValidationUtil.isEmpty(filter.getSkipValue()) && filter.getSkipValue() > 0) {
s = s.replace("* ", "");
}
return s;
}
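For example (a sketch; assumes a filter whose skip value is set and positive):

// eraseSkipgramStars("hiša * * stoji", filter)  ->  "hiša stoji"
// with no skip value set, the input string is returned unchanged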
public static String SetToCSV(String title, Object[][] result, File resultsPath, LinkedHashMap<String, String> headerInfoBlock) {
//Delimiter used in CSV file
String NEW_LINE_SEPARATOR = "\n";