Reimplementation of other signs (,/*() etc.) in ngrams.

This commit is contained in:
2018-08-28 11:41:19 +02:00
parent a8d147de52
commit 1c00f1a283
9 changed files with 203 additions and 91 deletions

View File

@@ -260,6 +260,12 @@ public class XML_processing {
} else if (qName.equals("c3")) {
String c3Content = eventReader.nextEvent().asCharacters().getData();
if(stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() &&
stavek.size() > 0){
stavek.add(new Word(c3Content, c3Content, "/"));
}
if (c3Content.equals(".") && includeThisBlock) {
// add sentence to corpus
corpus.add(new Sentence(stavek, null));
@@ -277,9 +283,6 @@ public class XML_processing {
corpus.clear();
}
}
else if(includeThisBlock){
inPunctuation = true;
}
} else if (headTags.contains(qName)) {
String tagContent = eventReader.nextEvent().asCharacters().getData();
headBlock.put(qName, tagContent);
@@ -296,17 +299,7 @@ public class XML_processing {
if (in_word) {
stavek.add(new Word(characters.getData(), lemma, msd));
in_word = false;
} else if(inPunctuation){
String punctuation = ",";
if (stavek.size() > 0){
stavek.get(stavek.size()-1).setWord(stavek.get(stavek.size()-1).getWord() + punctuation);
stavek.get(stavek.size()-1).setLemma(stavek.get(stavek.size()-1).getLemma() + punctuation);
stavek.get(stavek.size()-1).setMsd(stavek.get(stavek.size()-1).getMsd() + punctuation);
}
inPunctuation = false;
}
}
break;
case XMLStreamConstants.END_ELEMENT:
@@ -548,13 +541,16 @@ public class XML_processing {
inWord = false;
}
if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) {
// String punctuation = characters.getData();
String punctuation = ",";
String punctuation = characters.getData();
sentence.add(new Word(punctuation, punctuation, "/"));
inPunctuation = false;
sentence.get(sentence.size() - 1).setWord(sentence.get(sentence.size() - 1).getWord() + punctuation);
sentence.get(sentence.size() - 1).setLemma(sentence.get(sentence.size() - 1).getLemma() + punctuation);
sentence.get(sentence.size() - 1).setMsd(sentence.get(sentence.size() - 1).getMsd() + punctuation);
inPunctuation = false;
// String punctuation = ",";
//
// sentence.get(sentence.size() - 1).setWord(sentence.get(sentence.size() - 1).getWord() + punctuation);
// sentence.get(sentence.size() - 1).setLemma(sentence.get(sentence.size() - 1).getLemma() + punctuation);
// sentence.get(sentence.size() - 1).setMsd(sentence.get(sentence.size() - 1).getMsd() + punctuation);
// inPunctuation = false;
}
break;

View File

@@ -56,8 +56,8 @@ public class Ngrams {
// String test = key;
// }
if (stats.getFilter().getNotePunctuations())
key = (!key.equals("") && key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key;
// if (stats.getFilter().getNotePunctuations())
// key = (!key.equals("") && key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key;
MultipleHMKeys multipleKeys;
@@ -68,28 +68,28 @@ public class Ngrams {
break;
case 1:
String k1_2 = wordToString(ngramCandidate, otherKeys.get(0));
if (stats.getFilter().getNotePunctuations())
k1_2 = (!k1_2.equals("") && k1_2.charAt(k1_2.length()-1) == ',') ? k1_2.substring(0, k1_2.length() - 1) : k1_2;
// if (stats.getFilter().getNotePunctuations())
// k1_2 = (!k1_2.equals("") && k1_2.charAt(k1_2.length()-1) == ',') ? k1_2.substring(0, k1_2.length() - 1) : k1_2;
multipleKeys = new MultipleHMKeys2(key, k1_2);
break;
case 2:
String k2_2 = wordToString(ngramCandidate, otherKeys.get(0));
String k2_3 = wordToString(ngramCandidate, otherKeys.get(1));
if (stats.getFilter().getNotePunctuations()) {
k2_2 = (!k2_2.equals("") && k2_2.charAt(k2_2.length() - 1) == ',') ? k2_2.substring(0, k2_2.length() - 1) : k2_2;
k2_3 = (!k2_3.equals("") && k2_3.charAt(k2_3.length() - 1) == ',') ? k2_3.substring(0, k2_3.length() - 1) : k2_3;
}
// if (stats.getFilter().getNotePunctuations()) {
// k2_2 = (!k2_2.equals("") && k2_2.charAt(k2_2.length() - 1) == ',') ? k2_2.substring(0, k2_2.length() - 1) : k2_2;
// k2_3 = (!k2_3.equals("") && k2_3.charAt(k2_3.length() - 1) == ',') ? k2_3.substring(0, k2_3.length() - 1) : k2_3;
// }
multipleKeys = new MultipleHMKeys3(key, k2_2, k2_3);
break;
case 3:
String k3_2 = wordToString(ngramCandidate, otherKeys.get(0));
String k3_3 = wordToString(ngramCandidate, otherKeys.get(1));
String k3_4 = wordToString(ngramCandidate, otherKeys.get(2));
if (stats.getFilter().getNotePunctuations()) {
k3_2 = (!k3_2.equals("") && k3_2.charAt(k3_2.length() - 1) == ',') ? k3_2.substring(0, k3_2.length() - 1) : k3_2;
k3_3 = (!k3_3.equals("") && k3_3.charAt(k3_3.length() - 1) == ',') ? k3_3.substring(0, k3_3.length() - 1) : k3_3;
k3_4 = (!k3_4.equals("") && k3_4.charAt(k3_4.length() - 1) == ',') ? k3_4.substring(0, k3_4.length() - 1) : k3_4;
}
// if (stats.getFilter().getNotePunctuations()) {
// k3_2 = (!k3_2.equals("") && k3_2.charAt(k3_2.length() - 1) == ',') ? k3_2.substring(0, k3_2.length() - 1) : k3_2;
// k3_3 = (!k3_3.equals("") && k3_3.charAt(k3_3.length() - 1) == ',') ? k3_3.substring(0, k3_3.length() - 1) : k3_3;
// k3_4 = (!k3_4.equals("") && k3_4.charAt(k3_4.length() - 1) == ',') ? k3_4.substring(0, k3_4.length() - 1) : k3_4;
// }
multipleKeys = new MultipleHMKeys4(key, k3_2, k3_3, k3_4);
break;
case 4:
@@ -97,12 +97,12 @@ public class Ngrams {
String k4_3 = wordToString(ngramCandidate, otherKeys.get(1));
String k4_4 = wordToString(ngramCandidate, otherKeys.get(2));
String k4_5 = wordToString(ngramCandidate, otherKeys.get(3));
if (stats.getFilter().getNotePunctuations()) {
k4_2 = (!k4_2.equals("") && k4_2.charAt(k4_2.length() - 1) == ',') ? k4_2.substring(0, k4_2.length() - 1) : k4_2;
k4_3 = (!k4_3.equals("") && k4_3.charAt(k4_3.length() - 1) == ',') ? k4_3.substring(0, k4_3.length() - 1) : k4_3;
k4_4 = (!k4_4.equals("") && k4_4.charAt(k4_4.length() - 1) == ',') ? k4_4.substring(0, k4_4.length() - 1) : k4_4;
k4_5 = (!k4_5.equals("") && k4_5.charAt(k4_5.length() - 1) == ',') ? k4_5.substring(0, k4_5.length() - 1) : k4_5;
}
// if (stats.getFilter().getNotePunctuations()) {
// k4_2 = (!k4_2.equals("") && k4_2.charAt(k4_2.length() - 1) == ',') ? k4_2.substring(0, k4_2.length() - 1) : k4_2;
// k4_3 = (!k4_3.equals("") && k4_3.charAt(k4_3.length() - 1) == ',') ? k4_3.substring(0, k4_3.length() - 1) : k4_3;
// k4_4 = (!k4_4.equals("") && k4_4.charAt(k4_4.length() - 1) == ',') ? k4_4.substring(0, k4_4.length() - 1) : k4_4;
// k4_5 = (!k4_5.equals("") && k4_5.charAt(k4_5.length() - 1) == ',') ? k4_5.substring(0, k4_5.length() - 1) : k4_5;
// }
multipleKeys = new MultipleHMKeys5(key, k4_2, k4_3, k4_4, k4_5);
break;
default:
@@ -241,22 +241,22 @@ public class Ngrams {
*/
private static Word checkAndModifySkipgramPunctuation(List<Word> sentence, int i, int j, StatisticsNew stats){
// if punctuation checkbox selected and there words at indexes i and j are not next to each other
if(stats.getFilter().getNotePunctuations() && j - i > 1 && sentence.get(i).getWord().charAt(sentence.get(i).getWord().length() - 1) != ','){
boolean middleWordsHavePunctuation = false;
for (int n = i + 1; n < j; n++){
if (sentence.get(n).getWord().charAt(sentence.get(n).getWord().length() - 1) == ','){
middleWordsHavePunctuation = true;
break;
}
}
if (middleWordsHavePunctuation){
String punctuation = ",";
return new Word(sentence.get(i).getWord() + punctuation,
sentence.get(i).getLemma() + punctuation,
sentence.get(i).getMsd() + punctuation);
}
}
// if(stats.getFilter().getNotePunctuations() && j - i > 1 && sentence.get(i).getWord().charAt(sentence.get(i).getWord().length() - 1) != ','){
// boolean middleWordsHavePunctuation = false;
// for (int n = i + 1; n < j; n++){
// if (sentence.get(n).getWord().charAt(sentence.get(n).getWord().length() - 1) == ','){
// middleWordsHavePunctuation = true;
// break;
// }
// }
// if (middleWordsHavePunctuation){
//
// String punctuation = ",";
// return new Word(sentence.get(i).getWord() + punctuation,
// sentence.get(i).getLemma() + punctuation,
// sentence.get(i).getMsd() + punctuation);
// }
// }
return sentence.get(i);
}
@@ -348,8 +348,8 @@ public class Ngrams {
// String test = key;
// }
if (stats.getFilter().getNotePunctuations())
key = (!key.equals("") && key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key;
// if (stats.getFilter().getNotePunctuations())
// key = (!key.equals("") && key.charAt(key.length()-1) == ',') ? key.substring(0, key.length() - 1) : key;
MultipleHMKeys multipleKeys;
@@ -360,28 +360,28 @@ public class Ngrams {
break;
case 1:
String k1_2 = wordToString(skipgramCandidate, otherKeys.get(0));
if (stats.getFilter().getNotePunctuations())
k1_2 = (!k1_2.equals("") && k1_2.charAt(k1_2.length() - 1) == ',') ? k1_2.substring(0, k1_2.length() - 1) : k1_2;
// if (stats.getFilter().getNotePunctuations())
// k1_2 = (!k1_2.equals("") && k1_2.charAt(k1_2.length() - 1) == ',') ? k1_2.substring(0, k1_2.length() - 1) : k1_2;
multipleKeys = new MultipleHMKeys2(key, k1_2);
break;
case 2:
String k2_2 = wordToString(skipgramCandidate, otherKeys.get(0));
String k2_3 = wordToString(skipgramCandidate, otherKeys.get(1));
if (stats.getFilter().getNotePunctuations()) {
k2_2 = (!k2_2.equals("") && k2_2.charAt(k2_2.length() - 1) == ',') ? k2_2.substring(0, k2_2.length() - 1) : k2_2;
k2_3 = (!k2_3.equals("") && k2_3.charAt(k2_3.length() - 1) == ',') ? k2_3.substring(0, k2_3.length() - 1) : k2_3;
}
// if (stats.getFilter().getNotePunctuations()) {
// k2_2 = (!k2_2.equals("") && k2_2.charAt(k2_2.length() - 1) == ',') ? k2_2.substring(0, k2_2.length() - 1) : k2_2;
// k2_3 = (!k2_3.equals("") && k2_3.charAt(k2_3.length() - 1) == ',') ? k2_3.substring(0, k2_3.length() - 1) : k2_3;
// }
multipleKeys = new MultipleHMKeys3(key, k2_2, k2_3);
break;
case 3:
String k3_2 = wordToString(skipgramCandidate, otherKeys.get(0));
String k3_3 = wordToString(skipgramCandidate, otherKeys.get(1));
String k3_4 = wordToString(skipgramCandidate, otherKeys.get(2));
if (stats.getFilter().getNotePunctuations()) {
k3_2 = (!k3_2.equals("") && k3_2.charAt(k3_2.length() - 1) == ',') ? k3_2.substring(0, k3_2.length() - 1) : k3_2;
k3_3 = (!k3_3.equals("") && k3_3.charAt(k3_3.length() - 1) == ',') ? k3_3.substring(0, k3_3.length() - 1) : k3_3;
k3_4 = (!k3_4.equals("") && k3_4.charAt(k3_4.length() - 1) == ',') ? k3_4.substring(0, k3_4.length() - 1) : k3_4;
}
// if (stats.getFilter().getNotePunctuations()) {
// k3_2 = (!k3_2.equals("") && k3_2.charAt(k3_2.length() - 1) == ',') ? k3_2.substring(0, k3_2.length() - 1) : k3_2;
// k3_3 = (!k3_3.equals("") && k3_3.charAt(k3_3.length() - 1) == ',') ? k3_3.substring(0, k3_3.length() - 1) : k3_3;
// k3_4 = (!k3_4.equals("") && k3_4.charAt(k3_4.length() - 1) == ',') ? k3_4.substring(0, k3_4.length() - 1) : k3_4;
// }
multipleKeys = new MultipleHMKeys4(key, k3_2, k3_3, k3_4);
break;
case 4:
@@ -389,12 +389,12 @@ public class Ngrams {
String k4_3 = wordToString(skipgramCandidate, otherKeys.get(1));
String k4_4 = wordToString(skipgramCandidate, otherKeys.get(2));
String k4_5 = wordToString(skipgramCandidate, otherKeys.get(3));
if (stats.getFilter().getNotePunctuations()) {
k4_2 = (!k4_2.equals("") && k4_2.charAt(k4_2.length() - 1) == ',') ? k4_2.substring(0, k4_2.length() - 1) : k4_2;
k4_3 = (!k4_3.equals("") && k4_3.charAt(k4_3.length() - 1) == ',') ? k4_3.substring(0, k4_3.length() - 1) : k4_3;
k4_4 = (!k4_4.equals("") && k4_4.charAt(k4_4.length() - 1) == ',') ? k4_4.substring(0, k4_4.length() - 1) : k4_4;
k4_5 = (!k4_5.equals("") && k4_5.charAt(k4_5.length() - 1) == ',') ? k4_5.substring(0, k4_5.length() - 1) : k4_5;
}
// if (stats.getFilter().getNotePunctuations()) {
// k4_2 = (!k4_2.equals("") && k4_2.charAt(k4_2.length() - 1) == ',') ? k4_2.substring(0, k4_2.length() - 1) : k4_2;
// k4_3 = (!k4_3.equals("") && k4_3.charAt(k4_3.length() - 1) == ',') ? k4_3.substring(0, k4_3.length() - 1) : k4_3;
// k4_4 = (!k4_4.equals("") && k4_4.charAt(k4_4.length() - 1) == ',') ? k4_4.substring(0, k4_4.length() - 1) : k4_4;
// k4_5 = (!k4_5.equals("") && k4_5.charAt(k4_5.length() - 1) == ',') ? k4_5.substring(0, k4_5.length() - 1) : k4_5;
// }
multipleKeys = new MultipleHMKeys5(key, k4_2, k4_3, k4_4, k4_5);
break;
default: