Added fixes to the ssj500k functionality; fixed the prefix/suffix bug and several other bugs.

This commit is contained in:
2018-12-01 10:50:11 +01:00
parent 9efe3d529b
commit ca83cb023b
14 changed files with 530 additions and 162 deletions

View File

@@ -313,6 +313,17 @@ public class XML_processing {
}
if (c3Content.equals(".") && includeThisBlock) {
if (stats.getFilter().getNgramValue() == 0){
int numSentenceParts = 0;
for(Word w : stavek){
int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1);
numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts;
}
stats.updateUniGramOccurrences(numSentenceParts, new ArrayList<>());
} else if(stats.getFilter().getNgramValue() >= 1) {
stats.updateUniGramOccurrences(stavek.size(), new ArrayList<>());
}
// add sentence to corpus
corpus.add(new Sentence(stavek, null));
// and start a new one
@@ -637,8 +648,16 @@ public class XML_processing {
// parser reached end of the current sentence
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
// count all UniGramOccurrences in sentence for statistics
stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy);
if (stats.getFilter().getNgramValue() == 0){
int numSentenceParts = 0;
for(Word w : sentence){
int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1);
numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts;
}
stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy);
} else if(stats.getFilter().getNgramValue() >= 1) {
stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy);
}
// add sentence to corpus if it passes filters
sentence = runFilters(sentence, stats.getFilter());
@@ -713,6 +732,7 @@ public class XML_processing {
public static boolean readXMLSSJ500K(String path, StatisticsNew stats) {
boolean inWord = false;
boolean inPunctuation = false;
boolean taxonomyMatch = true;
ArrayList<Taxonomy> currentFiletaxonomy = new ArrayList<>();
// ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
String lemma = "";
@@ -759,10 +779,14 @@ public class XML_processing {
// keep only taxonomy properties
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()).replace("#", ""));
currentFiletaxonomy.add(currentFiletaxonomyElement);
Tax taxonomy = new Tax();
// Tax taxonomy = new Tax();
// currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
}
}
} else if (qName.equals("bibl")) {
// before proceeding to read this file, make sure that taxonomy filters are a match
taxonomyMatch = true;
}
break;
case XMLStreamConstants.CHARACTERS:
@@ -789,10 +813,21 @@ public class XML_processing {
// parser reached end of the current sentence
if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
if (stats.getFilter().getNgramValue() == 0){
int numSentenceParts = 0;
for(Word w : sentence){
int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1);
numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts;
}
stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy);
} else if(stats.getFilter().getNgramValue() >= 1) {
stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy);
}
// add sentence to corpus if it passes filters
sentence = runFilters(sentence, stats.getFilter());
if (!ValidationUtil.isEmpty(sentence)) {
if (!ValidationUtil.isEmpty(sentence) && taxonomyMatch) {
corpus.add(new Sentence(sentence, currentFiletaxonomy));
}
@@ -821,7 +856,20 @@ public class XML_processing {
currentFiletaxonomy = new ArrayList<>();
// currentFiletaxonomyLong = new ArrayList<>();
}
} else if (endElement.getName().getLocalPart().equals("bibl")) {
// before proceeding to read this file, make sure that taxonomy filters are a match
if (!ValidationUtil.isEmpty(stats.getFilter().getTaxonomy())) {
currentFiletaxonomy.retainAll(stats.getFilter().getTaxonomy()); // intersection
if (currentFiletaxonomy.isEmpty()) {
// taxonomies don't match so stop
// return false;
taxonomyMatch = false;
// System.out.println("TEST");
}
}
}
break;
}
@@ -925,7 +973,7 @@ public class XML_processing {
// keep only taxonomy properties
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()));
currentFiletaxonomy.add(currentFiletaxonomyElement);
Tax taxonomy = new Tax();
// Tax taxonomy = new Tax();
// currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
}
} else if (qName.equalsIgnoreCase("div")) {
@@ -1007,6 +1055,17 @@ public class XML_processing {
sentence = GOSCorpusHM.remove(GOSCorpusHMKey);
if (stats.getFilter().getNgramValue() == 0){
int numSentenceParts = 0;
for(Word w : sentence){
int v = w.getW1().length() - (stats.getFilter().getStringLength() - 1);
numSentenceParts = (v >= 0) ? (numSentenceParts + v) : numSentenceParts;
}
stats.updateUniGramOccurrences(numSentenceParts, currentFiletaxonomy);
} else if(stats.getFilter().getNgramValue() >= 1) {
stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy);
}
// add sentence to corpus if it passes filters
if (includeFile && !ValidationUtil.isEmpty(sentence)) {
sentence = runFilters(sentence, stats.getFilter());
@@ -1040,7 +1099,7 @@ public class XML_processing {
// disregard this entry if taxonomies don't match
includeFile = !currentFiletaxonomy.isEmpty();
currentFiletaxonomy = new ArrayList<>();
// currentFiletaxonomy = new ArrayList<>();
}
}