Computer formatted

This commit is contained in:
2018-07-23 09:14:46 +02:00
parent 84d0086a66
commit bebc0abbb3
92 changed files with 74 additions and 12 deletions

29
src/main/java/alg/XML_processing.java Normal file → Executable file
View File

@@ -224,7 +224,8 @@ public class XML_processing {
@SuppressWarnings("unused")
public static void readXMLSolar(String path, StatisticsNew stats) {
boolean in_word = false;
String lemma = "";
boolean inPunctuation = false;
String lemma = "";
String msd = "";
List<Word> stavek = new ArrayList<>();
@@ -275,6 +276,9 @@ public class XML_processing {
corpus.clear();
}
}
else if(includeThisBlock){
inPunctuation = true;
}
} else if (headTags.contains(qName)) {
String tagContent = eventReader.nextEvent().asCharacters().getData();
headBlock.put(qName, tagContent);
@@ -291,7 +295,13 @@ public class XML_processing {
if (in_word) {
stavek.add(new Word(characters.getData(), lemma, msd));
in_word = false;
}
} else if(inPunctuation){
String punctuation = ",";
stavek.get(stavek.size()-1).setWord(stavek.get(stavek.size()-1).getWord() + punctuation);
stavek.get(stavek.size()-1).setLemma(stavek.get(stavek.size()-1).getLemma() + punctuation);
stavek.get(stavek.size()-1).setMsd(stavek.get(stavek.size()-1).getMsd() + punctuation);
inPunctuation = false;
}
break;
case XMLStreamConstants.END_ELEMENT:
@@ -472,6 +482,7 @@ public class XML_processing {
@SuppressWarnings("Duplicates")
public static boolean readXMLGigafida(String path, StatisticsNew stats) {
boolean inWord = false;
boolean inPunctuation = false;
ArrayList<String> currentFiletaxonomy = new ArrayList<>();
ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
String lemma = "";
@@ -501,6 +512,11 @@ public class XML_processing {
msd = String.valueOf(startElement.getAttributeByName(QName.valueOf("msd")).getValue());
lemma = String.valueOf(startElement.getAttributeByName(QName.valueOf("lemma")).getValue());
}
if (qName.equals("c")){
inPunctuation = true;
}
// taxonomy node
else if (qName.equalsIgnoreCase("catRef")) {
// there are some term nodes at the beginning that are of no interest to us
@@ -526,6 +542,14 @@ public class XML_processing {
sentence.add(new Word(word, lemma, msd, currentFiletaxonomyLong));
inWord = false;
}
// if (stats.getFilter().getNgramValue() > 1 && stats.getFilter().getNotePunctuations() && inPunctuation && sentence.size() > 0) {
//// String punctuation = characters.getData();
// String punctuation = ",";
// sentence.get(sentence.size()-1).setWord(sentence.get(sentence.size()-1).getWord() + punctuation);
// sentence.get(sentence.size()-1).setLemma(sentence.get(sentence.size()-1).getLemma() + punctuation);
// sentence.get(sentence.size()-1).setMsd(sentence.get(sentence.size()-1).getMsd() + punctuation);
// inPunctuation = false;
// }
break;
case XMLStreamConstants.END_ELEMENT:
@@ -604,6 +628,7 @@ public class XML_processing {
@SuppressWarnings("Duplicates")
public static boolean readXMLGos(String path, StatisticsNew stats) {
boolean inWord = false;
boolean inPunctuation = false;
boolean inOrthDiv = false;
boolean computeForOrth = stats.getCorpus().isGosOrthMode();
ArrayList<String> currentFiletaxonomy = new ArrayList<>();