Taxonomy refactored: taxonomy lists now hold Taxonomy objects instead of Strings

2018-11-26 13:41:35 +01:00
parent a7f3bdb925
commit 9efe3d529b
16 changed files with 1173 additions and 491 deletions
--- a/src/main/java/alg/XML_processing.java
+++ b/src/main/java/alg/XML_processing.java
@@ -536,8 +536,8 @@ public class XML_processing {
 		boolean inWord = false;
 		boolean inPunctuation = false;
 		boolean taxonomyMatch = true;
-		ArrayList<String> currentFiletaxonomy = new ArrayList<>();
-		ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
+		ArrayList<Taxonomy> currentFiletaxonomy = new ArrayList<>();
+//		ArrayList<Taxonomy> currentFiletaxonomyLong = new ArrayList<>();
 		String lemma = "";
 		String msd = "";

@@ -578,10 +578,10 @@ public class XML_processing {

 							if (tax != null) {
 								// keep only taxonomy properties
-								String currentFiletaxonomyElement = String.valueOf(tax.getValue()).replace("#", "");
+								Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()).replace("#", ""));
 								currentFiletaxonomy.add(currentFiletaxonomyElement);
 								Tax taxonomy = new Tax();
-								currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
+//								currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
 							}
 						}
 						break;
@@ -637,7 +637,7 @@ public class XML_processing {
 						// parser reached end of the current sentence
 						if (endElement.getName().getLocalPart().equals(sentenceDelimiter)) {
                            // count all UniGramOccurrences in sentence for statistics
-                            stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomyLong);
+                            stats.updateUniGramOccurrences(sentence.size(), currentFiletaxonomy);

 							// add sentence to corpus if it passes filters
 							sentence = runFilters(sentence, stats.getFilter());
@@ -645,7 +645,7 @@ public class XML_processing {


 							if (!ValidationUtil.isEmpty(sentence) && taxonomyMatch) {
-								corpus.add(new Sentence(sentence, currentFiletaxonomyLong));
+								corpus.add(new Sentence(sentence, currentFiletaxonomy));
 							}

 //							taxonomyMatch = true;
@@ -713,8 +713,8 @@ public class XML_processing {
    public static boolean readXMLSSJ500K(String path, StatisticsNew stats) {
        boolean inWord = false;
        boolean inPunctuation = false;
-        ArrayList<String> currentFiletaxonomy = new ArrayList<>();
-        ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
+        ArrayList<Taxonomy> currentFiletaxonomy = new ArrayList<>();
+//        ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
        String lemma = "";
        String msd = "";

@@ -757,10 +757,10 @@ public class XML_processing {

                            if (tax != null) {
                                // keep only taxonomy properties
-                                String currentFiletaxonomyElement = String.valueOf(tax.getValue()).replace("#", "");
+                                Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()).replace("#", ""));
                                currentFiletaxonomy.add(currentFiletaxonomyElement);
                                Tax taxonomy = new Tax();
-                                currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
+//                                currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
                            }
                        }
                        break;
@@ -793,7 +793,7 @@ public class XML_processing {
                            sentence = runFilters(sentence, stats.getFilter());

                            if (!ValidationUtil.isEmpty(sentence)) {
-                                corpus.add(new Sentence(sentence, currentFiletaxonomyLong));
+                                corpus.add(new Sentence(sentence, currentFiletaxonomy));
                            }

                            // and start a new one
@@ -820,7 +820,7 @@ public class XML_processing {
                            corpus.clear();

                            currentFiletaxonomy = new ArrayList<>();
-                            currentFiletaxonomyLong = new ArrayList<>();
+//                            currentFiletaxonomyLong = new ArrayList<>();
                        }

                        break;
@@ -848,8 +848,8 @@ public class XML_processing {
 		boolean inOrthDiv = false;
 		boolean computeForOrth = stats.getCorpus().isGosOrthMode();
 		boolean inSeparatedWord = false;
-		ArrayList<String> currentFiletaxonomy = new ArrayList<>();
-		ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
+		ArrayList<Taxonomy> currentFiletaxonomy = new ArrayList<>();
+//		ArrayList<String> currentFiletaxonomyLong = new ArrayList<>();
 		String lemma = "";
 		String msd = "";

@@ -923,10 +923,10 @@ public class XML_processing {

 							if (tax != null) {
 								// keep only taxonomy properties
-								String currentFiletaxonomyElement = String.valueOf(tax.getValue());
+								Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()));
 								currentFiletaxonomy.add(currentFiletaxonomyElement);
 								Tax taxonomy = new Tax();
-								currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
+//								currentFiletaxonomyLong.add(taxonomy.getLongTaxonomyName(currentFiletaxonomyElement));
 							}
 						} else if (qName.equalsIgnoreCase("div")) {
 							gosType = String.valueOf(startElement.getAttributeByName(QName.valueOf("type")).getValue());
@@ -1010,7 +1010,7 @@ public class XML_processing {
 								// add sentence to corpus if it passes filters
 								if (includeFile && !ValidationUtil.isEmpty(sentence)) {
 									sentence = runFilters(sentence, stats.getFilter());
-									corpus.add(new Sentence(sentence, currentFiletaxonomyLong));
+									corpus.add(new Sentence(sentence, currentFiletaxonomy));
 								}

 								wordIndex = 0;
@@ -1050,7 +1050,7 @@ public class XML_processing {
 							corpus.clear();

                            currentFiletaxonomy = new ArrayList<>();
-                            currentFiletaxonomyLong = new ArrayList<>();
+//                            currentFiletaxonomyLong = new ArrayList<>();
 						}

 						break;