Computer formatted
This commit is contained in:
29
src/main/java/alg/XML_processing.java
Normal file → Executable file
29
src/main/java/alg/XML_processing.java
Normal file → Executable file
@@ -224,7 +224,8 @@ public class XML_processing {
|
||||
@SuppressWarnings("unused")
|
||||
public static void readXMLSolar(String path, StatisticsNew stats) {
|
||||
boolean in_word = false;
|
||||
- String lemma = "";
|
||||
+ boolean inPunctuation = false;
|
||||
+ String lemma = "";
|
||||
String msd = "";
|
||||
|
||||
List<Word> stavek = new ArrayList<>();
|
||||
@@ -275,6 +276,9 @@ public class XML_processing {
|
||||
corpus.clear();
|
||||
}
|
||||
}
|
||||
+ else if(includeThisBlock){
|
||||
+ inPunctuation = true;
|
||||
+ }
|
||||
} else if (headTags.contains(qName)) {
|
||||
String tagContent = eventReader.nextEvent().asCharacters().getData();
|
||||
headBlock.put(qName, tagContent);
|
||||
@@ -291,7 +295,13 @@ public class XML_processing {
|
||||
if (in_word) {
|
||||
stavek.add(new Word(characters.getData(), lemma, msd));
|
||||
in_word = false;
|
||||
- }
|
||||
+ } else if(inPunctuation){
|
||||
+ String punctuation = ",";
|
||||
+ stavek.get(stavek.size()-1).setWord(stavek.get(stavek.size()-1).getWord() + punctuation);
|
||||
+ stavek.get(stavek.size()-1).setLemma(stavek.get(stavek.size()-1).getLemma() + punctuation);
|
||||
+ stavek.get(stavek.size()-1).setMsd(stavek.get(stavek.size()-1).getMsd() + punctuation);
|
||||
+ inPunctuation = false;
|
||||
+ }
|
||||
break;
|
||||
|
||||
case XMLStreamConstants.END_ELEMENT:
|
||||
@@ -472,6 +482,7 @@ public class XML_processing {
|
||||
@SuppressWarnings("Duplicates")
|
||||
public static boolean readXMLGigafida(String path, StatisticsNew stats) {
|
||||
boolean inWord = false;
|
||||
+ boolean inPunctuation = false;
|
||||
ArrayList<String> currentFiletaxonomy = new ArrayList<>();
|
||||
ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
|
||||
String lemma = "";
|
||||
@@ -501,6 +512,11 @@ public class XML_processing {
|
||||
msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
|
||||
lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
|
||||
}
|
||||
|
||||
+ if (qName.equals("c")){
|
||||
+ inPunctuation = true;
|
||||
+ }
|
||||
|
||||
// taxonomy node
|
||||
else if (qName.equalsIgnoreCase("catRef")) {
|
||||
// there are some term nodes at the beginning that are of no interest to us
|
||||
@@ -526,6 +542,14 @@ public class XML_processing {
|
||||
sentence.add(new Word(word, lemma, msd, currentFiletaxonomyLong));
|
||||
inWord = false;
|
||||
}
|
||||
+ // if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) {
|
||||
+ //// String punctuation = characters.getData();
|
||||
+ // String punctuation = ",";
|
||||
+ // sentence.get(sentence.size()-1).setWord(sentence.get(sentence.size()-1).getWord() + punctuation);
|
||||
+ // sentence.get(sentence.size()-1).setLemma(sentence.get(sentence.size()-1).getLemma() + punctuation);
|
||||
+ // sentence.get(sentence.size()-1).setMsd(sentence.get(sentence.size()-1).getMsd() + punctuation);
|
||||
+ // inPunctuation = false;
|
||||
+ // }
|
||||
break;
|
||||
|
||||
case XMLStreamConstants.END_ELEMENT:
|
||||
@@ -604,6 +628,7 @@ public class XML_processing {
|
||||
@SuppressWarnings("Duplicates")
|
||||
public static boolean readXMLGos(String path, StatisticsNew stats) {
|
||||
boolean inWord = false;
|
||||
+ boolean inPunctuation = false;
|
||||
boolean inOrthDiv = false;
|
||||
boolean computeForOrth = stats.getCorpus().isGosOrthMode();
|
||||
ArrayList<String> currentFiletaxonomy = new ArrayList<>();
|
||||
|
||||
Reference in New Issue
Block a user