diff --git a/.gitignore b/.gitignore
index 349db05..127ea9a 100755
--- a/.gitignore
+++ b/.gitignore
@@ -164,3 +164,6 @@
 $RECYCLE.BIN/
 src/main/resources/translation_external/
 src/main/resources/translations_backup/
+
+config.json
+config_instructions.txt
\ No newline at end of file
diff --git a/config_characters.json b/config_characters.json
new file mode 100644
index 0000000..bdbc284
--- /dev/null
+++ b/config_characters.json
@@ -0,0 +1,22 @@
+{
+    "language": "SL",
+
+    "corpusLocation": "target/classes/Gigafida_subset",
+    "readHeaderInfo": false,
+    "resultsLocation": "tmp",
+    "selectReader": "XML (Gigafida 1.0, Kres 1.0)",
+    "outputName": "",
+    "punctuation": "comma",
+
+    "tab": "characters",
+
+    "stringLength": 1,
+    "calculateFor": "calculateFor.WORD",
+    "displayTaxonomy": false,
+
+    "msd": "",
+    "taxonomySetOperation": "taxonomySetOperation.UNION",
+    "taxonomy": ["SSJ.T.K.L - tisk-knjižno-leposlovno"],
+    "minimalOccurrences": 1,
+    "minimalTaxonomy": 1
+}
\ No newline at end of file
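Note: the "characters" tab counts character strings of length "stringLength" within each token. A hypothetical illustration of that setting (my own sketch, not code from this diff):

```java
import java.util.ArrayList;
import java.util.List;

public class CharNgramDemo {
    // Sketch of what "stringLength" controls: a sliding character window over a token.
    static List<String> characterNgrams(String token, int stringLength) {
        List<String> out = new ArrayList<>();
        for (int i = 0; i + stringLength <= token.length(); i++) {
            out.add(token.substring(i, i + stringLength));
        }
        return out;
    }

    public static void main(String[] args) {
        System.out.println(characterNgrams("beseda", 2)); // [be, es, se, ed, da]
    }
}
```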
diff --git a/config_characters_instructions.txt b/config_characters_instructions.txt
new file mode 100644
index 0000000..282ad6d
--- /dev/null
+++ b/config_characters_instructions.txt
@@ -0,0 +1,21 @@
+
+"language": String - options: "SL", "EN"
+
+"corpusLocation": String - path to input location.
+"readHeaderInfo": Boolean - read taxonomy from corpus files
+"resultsLocation": String - path to results location
+"selectReader": String - options: "VERT + REGI", "XML (Šolar 1.0)", "XML (GOS 1.0)", "XML (ssj500k 2.1)", "XML (Gigafida 2.0)", "XML (Gigafida 1.0, Kres 1.0)", corpusType = GIGAFIDA
+"outputName": String - Output file name
+"punctuation": String - options: "comma", "point"
+
+"tab": String - options: "characters", "wordParts", "words", "wordSets"
+
+"stringLength": int - Number of characters
+"calculateFor": String - options: "calculateFor.WORD", "calculateFor.LOWERCASE_WORD", "calculateFor.NORMALIZED_WORD", "calculateFor.LEMMA", "calculateFor.MORPHOSYNTACTIC_SPECS", "calculateFor.MORPHOSYNTACTIC_PROPERTY", "calculateFor.WORD_TYPE", "calculateFor.DIST_WORDS", "calculateFor.DIST_LEMMAS"
+"displayTaxonomy": Boolean - Display taxonomy in output
+
+"msd": String - A valid MSD (or empty)
+"taxonomySetOperation": String - options: "taxonomySetOperation.UNION", "taxonomySetOperation.INTERSECTION"
+"taxonomy": array of Strings - options: "SSJ.T - tisk", " SSJ.T.K - tisk-knjižno", " SSJ.T.K.L - tisk-knjižno-leposlovno", " SSJ.T.K.S - tisk-knjižno-strokovno", " SSJ.T.P - tisk-periodično", " SSJ.T.P.C - tisk-periodično-časopis", " SSJ.T.P.R - tisk-periodično-revija", " SSJ.T.D - tisk-drugo", "SSJ.I - internet", "Ft.P - prenosnik", " Ft.P.G - prenosnik-govorni", " Ft.P.E - prenosnik-elektronski", " Ft.P.P - prenosnik-pisni", " Ft.P.P.O - prenosnik-pisni-objavljeno", " Ft.P.P.O.K - prenosnik-pisni-objavljeno-knjižno", " Ft.P.P.O.P - prenosnik-pisni-objavljeno-periodično", " Ft.P.P.O.P.C - prenosnik-pisni-objavljeno-periodično-časopisno", " Ft.P.P.O.P.C.D - prenosnik-pisni-objavljeno-periodično-časopisno-dnevno", " Ft.P.P.O.P.C.V - prenosnik-pisni-objavljeno-periodično-časopisno-večkrat tedensko", " Ft.P.P.O.P.C.T - prenosnik-pisni-objavljeno-periodično-časopisno-tedensko", " Ft.P.P.O.P.R - prenosnik-pisni-objavljeno-periodično-revialno", " Ft.P.P.O.P.R.T - prenosnik-pisni-objavljeno-periodično-revialno-tedensko", " Ft.P.P.O.P.R.S - prenosnik-pisni-objavljeno-periodično-revialno-štirinajstdnevno", " Ft.P.P.O.P.R.M - prenosnik-pisni-objavljeno-periodično-revialno-mesečno", " Ft.P.P.O.P.R.D - prenosnik-pisni-objavljeno-periodično-revialno-redkeje kot na mesec", " Ft.P.P.O.P.R.O - prenosnik-pisni-objavljeno-periodično-revialno-občasno", " Ft.P.P.N - prenosnik-pisni-neobjavljeno", " Ft.P.P.N.J - prenosnik-pisni-neobjavljeno-javno", " Ft.P.P.N.I - prenosnik-pisni-neobjavljeno-interno", " Ft.P.P.N.Z - prenosnik-pisni-neobjavljeno-zasebno", "Ft.Z - zvrst", " Ft.Z.U - zvrst-umetnostna", " Ft.Z.U.P - zvrst-umetnostna-pesniška", " Ft.Z.U.R - zvrst-umetnostna-prozna", " Ft.Z.U.D - zvrst-umetnostna-dramska", " Ft.Z.N - zvrst-neumetnostna", " Ft.Z.N.S - zvrst-neumetnostna-strokovna", " Ft.Z.N.S.H - zvrst-neumetnostna-strokovna-humanistična in družboslovna", " Ft.Z.N.S.N - zvrst-neumetnostna-strokovna-naravoslovna in tehnična", " Ft.Z.N.N - zvrst-neumetnostna-nestrokovna", " Ft.Z.N.P - zvrst-neumetnostna-pravna", "Ft.L - zvrst-lektorirano", " Ft.L.D - zvrst-lektorirano-da", " Ft.L.N - zvrst-lektorirano-ne", "gos.T - diskurz", " gos.T.J - diskurz-javni", " gos.T.J.I - diskurz-javni-informativno-izobraževalni", " gos.T.J.R - diskurz-javni-razvedrilni", " gos.T.N - diskurz-nejavni", " gos.T.N.N - diskurz-nejavni-nezasebni", " gos.T.N.Z - diskurz-nejavni-zasebni", "gos.S - situacija", " gos.S.R - situacija-radio", " gos.S.T - situacija-televizija", "gos.K - kanal", " gos.K.O - kanal-osebni stik", " gos.K.P - kanal-telefon", " gos.K.R - kanal-radio", " gos.K.T - kanal-televizija"
+"minimalOccurrences": int - Minimal number of occurrences
+"minimalTaxonomy": int - Minimal number of taxonomy branches
diff --git a/config_wordParts.json b/config_wordParts.json
new file mode 100644
index 0000000..93203b4
--- /dev/null
+++ b/config_wordParts.json
@@ -0,0 +1,27 @@
+{
+    "language": "SL",
+
+    "corpusLocation": "target/classes/Gigafida_subset",
+    "readHeaderInfo": false,
+    "resultsLocation": "tmp",
+    "selectReader": "XML (Gigafida 1.0, Kres 1.0)",
+    "outputName": "",
+    "punctuation": "comma",
+
+    "tab": "wordParts",
+
+    "calculateFor": "calculateFor.WORD",
+    "alsoVisualize": ["calculateFor.LEMMA"],
+    "displayTaxonomy": false,
+    "prefixLength": 1,
+    "suffixLength": 0,
+    "prefixList": [],
+    "suffixList": [],
+
+    "msd": "",
+    "taxonomySetOperation": "taxonomySetOperation.UNION",
+    "taxonomy": [],
+    "minimalOccurrences": 1,
+    "minimalTaxonomy": 1,
+    "minimalRelFre": 1
+}
\ No newline at end of file
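Note: these config files are parsed by the new headless controller (see read_config in NoGUIController below) with json-simple, which surfaces JSON numbers as Long — hence the Math.toIntExact((Long) ...) casts throughout the new nogui classes. A minimal sketch of reading a config the same way:

```java
import java.io.FileReader;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;

public class ConfigPeek {
    public static void main(String[] args) throws Exception {
        JSONObject settings = (JSONObject) new JSONParser()
                .parse(new FileReader("config_characters.json"));
        // json-simple deserializes JSON numbers as Long
        int stringLength = Math.toIntExact((Long) settings.get("stringLength"));
        System.out.println(settings.get("tab") + ", stringLength=" + stringLength);
    }
}
```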
+"readHeaderInfo": Boolean - read taxonomy from corpus files +"resultsLocation": String - path to results location +"selectReader": String - options: "VERT + REGI", "XML (Šolar 1.0)", "XML (GOS 1.0)", "XML (ssj500k 2.1)", "XML (Gigafida 2.0)", "XML (Gigafida 1.0, Kres 1.0)", corpusType = GIGAFIDA +"outputName": String - Output file name +"punctuation": String - options: "comma", "point" + +"tab": String - options: "characters", "wordParts", "words", "wordSets" + +"calculateFor": String - options: "calculateFor.WORD", "calculateFor.LOWERCASE_WORD", "calculateFor.NORMALIZED_WORD", "calculateFor.LEMMA", "calculateFor.MORPHOSYNTACTIC_SPECS", "calculateFor.MORPHOSYNTACTIC_PROPERTY", "calculateFor.WORD_TYPE", "calculateFor.DIST_WORDS", "calculateFor.DIST_LEMMAS" +"alsoVisualize": array of Strings - options: "calculateFor.WORD", "calculateFor.LOWERCASE_WORD", "calculateFor.LEMMA", "calculateFor.NORMALIZED_WORD", "calculateFor.WORD_TYPE", "calculateFor.MORPHOSYNTACTIC_SPECS" +"displayTaxonomy": Boolean - Display taxonomy in output +"prefixLength": int - prefix length +"suffixLength": int - suffix length +"prefixList": array of Strings - write different options in array +"suffixList": array of Strings - write different options in array + +"msd": String - A valid MSD (or empty) +"taxonomySetOperation": String - options: "taxonomySetOperation.UNION", "taxonomySetOperation.INTERSECTION" +"taxonomy": array of Strings - options: "SSJ.T - tisk", " SSJ.T.K - tisk-knjižno", " SSJ.T.K.L - tisk-knjižno-leposlovno", " SSJ.T.K.S - tisk-knjižno-strokovno", " SSJ.T.P - tisk-periodično", " SSJ.T.P.C - tisk-periodično-časopis", " SSJ.T.P.R - tisk-periodično-revija", " SSJ.T.D - tisk-drugo", "SSJ.I - internet", "Ft.P - prenosnik", " Ft.P.G - prenosnik-govorni", " Ft.P.E - prenosnik-elektronski", " Ft.P.P - prenosnik-pisni", " Ft.P.P.O - prenosnik-pisni-objavljeno", " Ft.P.P.O.K - prenosnik-pisni-objavljeno-knjižno", " Ft.P.P.O.P - prenosnik-pisni-objavljeno-periodično", " Ft.P.P.O.P.C - prenosnik-pisni-objavljeno-periodično-časopisno", " Ft.P.P.O.P.C.D - prenosnik-pisni-objavljeno-periodično-časopisno-dnevno", " Ft.P.P.O.P.C.V - prenosnik-pisni-objavljeno-periodično-časopisno-večkrat tedensko", " Ft.P.P.O.P.C.T - prenosnik-pisni-objavljeno-periodično-časopisno-tedensko", " Ft.P.P.O.P.R - prenosnik-pisni-objavljeno-periodično-revialno", " Ft.P.P.O.P.R.T - prenosnik-pisni-objavljeno-periodično-revialno-tedensko", " Ft.P.P.O.P.R.S - prenosnik-pisni-objavljeno-periodično-revialno-štirinajstdnevno", " Ft.P.P.O.P.R.M - prenosnik-pisni-objavljeno-periodično-revialno-mesečno", " Ft.P.P.O.P.R.D - prenosnik-pisni-objavljeno-periodično-revialno-redkeje kot na mesec", " Ft.P.P.O.P.R.O - prenosnik-pisni-objavljeno-periodično-revialno-občasno", " Ft.P.P.N - prenosnik-pisni-neobjavljeno", " Ft.P.P.N.J - prenosnik-pisni-neobjavljeno-javno", " Ft.P.P.N.I - prenosnik-pisni-neobjavljeno-interno", " Ft.P.P.N.Z - prenosnik-pisni-neobjavljeno-zasebno", "Ft.Z - zvrst", " Ft.Z.U - zvrst-umetnostna", " Ft.Z.U.P - zvrst-umetnostna-pesniška", " Ft.Z.U.R - zvrst-umetnostna-prozna", " Ft.Z.U.D - zvrst-umetnostna-dramska", " Ft.Z.N - zvrst-neumetnostna", " Ft.Z.N.S - zvrst-neumetnostna-strokovna", " Ft.Z.N.S.H - zvrst-neumetnostna-strokovna-humanistična in družboslovna", " Ft.Z.N.S.N - zvrst-neumetnostna-strokovna-naravoslovna in tehnična", " Ft.Z.N.N - zvrst-neumetnostna-nestrokovna", " Ft.Z.N.P - zvrst-neumetnostna-pravna", "Ft.L - zvrst-lektorirano", " Ft.L.D - zvrst-lektorirano-da", " Ft.L.N - zvrst-lektorirano-ne", "gos.T - diskurz", " 
diff --git a/config_wordSets.json b/config_wordSets.json
new file mode 100644
index 0000000..a836619
--- /dev/null
+++ b/config_wordSets.json
@@ -0,0 +1,27 @@
+{
+    "language": "SL",
+
+    "corpusLocation": "target/classes/Gigafida_subset",
+    "readHeaderInfo": false,
+    "resultsLocation": "tmp",
+    "selectReader": "XML (Gigafida 1.0, Kres 1.0)",
+    "outputName": "",
+    "punctuation": "comma",
+
+    "tab": "wordSets",
+
+    "calculateFor": "calculateFor.WORD",
+    "alsoVisualize": ["calculateFor.MORPHOSYNTACTIC_SPECS"],
+    "displayTaxonomy": false,
+    "ngramValue": 2,
+    "skipValue": 0,
+    "notePunctuations": false,
+    "collocability": ["Dice"],
+
+    "msd": "Sozei Sozei",
+    "taxonomySetOperation": "taxonomySetOperation.UNION",
+    "taxonomy": [],
+    "minimalOccurrences": 1,
+    "minimalTaxonomy": 1,
+    "minimalRelFre": 1
+}
\ No newline at end of file
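Note: the "collocability" option selects association measures for the extracted word sets. The Dice coefficient named in this config is the standard 2·f(xy) / (f(x) + f(y)); a minimal sketch of the formula (my own illustration, not the PR's implementation in StatisticsNew):

```java
public class DiceDemo {
    // Standard Dice coefficient for a word pair:
    // pairFreq = frequency of the bigram, freqX/freqY = unigram frequencies of its parts.
    static double dice(long pairFreq, long freqX, long freqY) {
        return 2.0 * pairFreq / (freqX + freqY);
    }

    public static void main(String[] args) {
        System.out.println(dice(10, 40, 60)); // 0.2
    }
}
```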
tisk-drugo", "SSJ.I - internet", "Ft.P - prenosnik", " Ft.P.G - prenosnik-govorni", " Ft.P.E - prenosnik-elektronski", " Ft.P.P - prenosnik-pisni", " Ft.P.P.O - prenosnik-pisni-objavljeno", " Ft.P.P.O.K - prenosnik-pisni-objavljeno-knjižno", " Ft.P.P.O.P - prenosnik-pisni-objavljeno-periodično", " Ft.P.P.O.P.C - prenosnik-pisni-objavljeno-periodično-časopisno", " Ft.P.P.O.P.C.D - prenosnik-pisni-objavljeno-periodično-časopisno-dnevno", " Ft.P.P.O.P.C.V - prenosnik-pisni-objavljeno-periodično-časopisno-večkrat tedensko", " Ft.P.P.O.P.C.T - prenosnik-pisni-objavljeno-periodično-časopisno-tedensko", " Ft.P.P.O.P.R - prenosnik-pisni-objavljeno-periodično-revialno", " Ft.P.P.O.P.R.T - prenosnik-pisni-objavljeno-periodično-revialno-tedensko", " Ft.P.P.O.P.R.S - prenosnik-pisni-objavljeno-periodično-revialno-štirinajstdnevno", " Ft.P.P.O.P.R.M - prenosnik-pisni-objavljeno-periodično-revialno-mesečno", " Ft.P.P.O.P.R.D - prenosnik-pisni-objavljeno-periodično-revialno-redkeje kot na mesec", " Ft.P.P.O.P.R.O - prenosnik-pisni-objavljeno-periodično-revialno-občasno", " Ft.P.P.N - prenosnik-pisni-neobjavljeno", " Ft.P.P.N.J - prenosnik-pisni-neobjavljeno-javno", " Ft.P.P.N.I - prenosnik-pisni-neobjavljeno-interno", " Ft.P.P.N.Z - prenosnik-pisni-neobjavljeno-zasebno", "Ft.Z - zvrst", " Ft.Z.U - zvrst-umetnostna", " Ft.Z.U.P - zvrst-umetnostna-pesniška", " Ft.Z.U.R - zvrst-umetnostna-prozna", " Ft.Z.U.D - zvrst-umetnostna-dramska", " Ft.Z.N - zvrst-neumetnostna", " Ft.Z.N.S - zvrst-neumetnostna-strokovna", " Ft.Z.N.S.H - zvrst-neumetnostna-strokovna-humanistična in družboslovna", " Ft.Z.N.S.N - zvrst-neumetnostna-strokovna-naravoslovna in tehnična", " Ft.Z.N.N - zvrst-neumetnostna-nestrokovna", " Ft.Z.N.P - zvrst-neumetnostna-pravna", "Ft.L - zvrst-lektorirano", " Ft.L.D - zvrst-lektorirano-da", " Ft.L.N - zvrst-lektorirano-ne", "gos.T - diskurz", " gos.T.J - diskurz-javni", " gos.T.J.I - diskurz-javni-informativno-izobraževalni", " gos.T.J.R - diskurz-javni-razvedrilni", " gos.T.N - diskurz-nejavni", " gos.T.N.N - diskurz-nejavni-nezasebni", " gos.T.N.Z - diskurz-nejavni-zasebni", "gos.S - situacija", " gos.S.R - situacija-radio", " gos.S.T - situacija-televizija", "gos.K - kanal", " gos.K.O - kanal-osebni stik", " gos.K.P - kanal-telefon", " gos.K.R - kanal-radio", " gos.K.T - kanal-televizija" +"minimalOccurrences": int - Minimal number of occurrences +"minimalTaxonomy": int - Minimal number of taxonomy branches +"minimalRelFre": int - Minimal relative frequency diff --git a/config_words.json b/config_words.json new file mode 100644 index 0000000..144a8e0 --- /dev/null +++ b/config_words.json @@ -0,0 +1,25 @@ +{ + "language": "SL", + + "corpusLocation": "target/classes/Gigafida_minimal/gfmin.xml", + "readHeaderInfo": false, + "resultsLocation": "tmp", + "selectReader": "XML (Gigafida 1.0, Kres 1.0)", + "outputName": "", + "punctuation": "comma", + + "tab": "words", + + "calculateFor": "calculateFor.WORD", + "alsoVisualize": ["calculateFor.LEMMA"], + "displayTaxonomy": false, + "notePunctuations": false, + "writeMsdAtTheEnd": false, + + "msd": "", + "taxonomySetOperation": "taxonomySetOperation.UNION", + "taxonomy": [" SSJ.T.K.S - tisk-knjižno-strokovno"], + "minimalOccurrences": 1, + "minimalTaxonomy": 1, + "minimalRelFre": 1 +} \ No newline at end of file diff --git a/config_words_instructions.txt b/config_words_instructions.txt new file mode 100644 index 0000000..9d4b647 --- /dev/null +++ b/config_words_instructions.txt @@ -0,0 +1,24 @@ + +"language": String - options: "SL", "EN" + 
+"corpusLocation": String - path to input location. +"readHeaderInfo": Boolean - read taxonomy from corpus files +"resultsLocation": String - path to results location +"selectReader": String - options: "VERT + REGI", "XML (Šolar 1.0)", "XML (GOS 1.0)", "XML (ssj500k 2.1)", "XML (Gigafida 2.0)", "XML (Gigafida 1.0, Kres 1.0)", corpusType = GIGAFIDA +"outputName": String - Output file name +"punctuation": String - options: "comma", "point" + +"tab": String - options: "characters", "wordParts", "words", "wordSets" + +"calculateFor": String - options: "calculateFor.WORD", "calculateFor.LOWERCASE_WORD", "calculateFor.NORMALIZED_WORD", "calculateFor.LEMMA", "calculateFor.MORPHOSYNTACTIC_SPECS", "calculateFor.MORPHOSYNTACTIC_PROPERTY", "calculateFor.WORD_TYPE", "calculateFor.DIST_WORDS", "calculateFor.DIST_LEMMAS" +"alsoVisualize": array of Strings - options: "calculateFor.WORD", "calculateFor.LOWERCASE_WORD", "calculateFor.LEMMA", "calculateFor.NORMALIZED_WORD", "calculateFor.WORD_TYPE", "calculateFor.MORPHOSYNTACTIC_SPECS" +"displayTaxonomy": Boolean - Display taxonomy in output +"notePunctuations": Boolean - The output will also include parts of morphosyntactic tag +"writeMsdAtTheEnd": Boolean - Word sets will include punctuations + +"msd": String - A valid MSD (or empty) +"taxonomySetOperation": String - options: "taxonomySetOperation.UNION", "taxonomySetOperation.INTERSECTION" +"taxonomy": array of Strings - options: "SSJ.T - tisk", " SSJ.T.K - tisk-knjižno", " SSJ.T.K.L - tisk-knjižno-leposlovno", " SSJ.T.K.S - tisk-knjižno-strokovno", " SSJ.T.P - tisk-periodično", " SSJ.T.P.C - tisk-periodično-časopis", " SSJ.T.P.R - tisk-periodično-revija", " SSJ.T.D - tisk-drugo", "SSJ.I - internet", "Ft.P - prenosnik", " Ft.P.G - prenosnik-govorni", " Ft.P.E - prenosnik-elektronski", " Ft.P.P - prenosnik-pisni", " Ft.P.P.O - prenosnik-pisni-objavljeno", " Ft.P.P.O.K - prenosnik-pisni-objavljeno-knjižno", " Ft.P.P.O.P - prenosnik-pisni-objavljeno-periodično", " Ft.P.P.O.P.C - prenosnik-pisni-objavljeno-periodično-časopisno", " Ft.P.P.O.P.C.D - prenosnik-pisni-objavljeno-periodično-časopisno-dnevno", " Ft.P.P.O.P.C.V - prenosnik-pisni-objavljeno-periodično-časopisno-večkrat tedensko", " Ft.P.P.O.P.C.T - prenosnik-pisni-objavljeno-periodično-časopisno-tedensko", " Ft.P.P.O.P.R - prenosnik-pisni-objavljeno-periodično-revialno", " Ft.P.P.O.P.R.T - prenosnik-pisni-objavljeno-periodično-revialno-tedensko", " Ft.P.P.O.P.R.S - prenosnik-pisni-objavljeno-periodično-revialno-štirinajstdnevno", " Ft.P.P.O.P.R.M - prenosnik-pisni-objavljeno-periodično-revialno-mesečno", " Ft.P.P.O.P.R.D - prenosnik-pisni-objavljeno-periodično-revialno-redkeje kot na mesec", " Ft.P.P.O.P.R.O - prenosnik-pisni-objavljeno-periodično-revialno-občasno", " Ft.P.P.N - prenosnik-pisni-neobjavljeno", " Ft.P.P.N.J - prenosnik-pisni-neobjavljeno-javno", " Ft.P.P.N.I - prenosnik-pisni-neobjavljeno-interno", " Ft.P.P.N.Z - prenosnik-pisni-neobjavljeno-zasebno", "Ft.Z - zvrst", " Ft.Z.U - zvrst-umetnostna", " Ft.Z.U.P - zvrst-umetnostna-pesniška", " Ft.Z.U.R - zvrst-umetnostna-prozna", " Ft.Z.U.D - zvrst-umetnostna-dramska", " Ft.Z.N - zvrst-neumetnostna", " Ft.Z.N.S - zvrst-neumetnostna-strokovna", " Ft.Z.N.S.H - zvrst-neumetnostna-strokovna-humanistična in družboslovna", " Ft.Z.N.S.N - zvrst-neumetnostna-strokovna-naravoslovna in tehnična", " Ft.Z.N.N - zvrst-neumetnostna-nestrokovna", " Ft.Z.N.P - zvrst-neumetnostna-pravna", "Ft.L - zvrst-lektorirano", " Ft.L.D - zvrst-lektorirano-da", " Ft.L.N - zvrst-lektorirano-ne", "gos.T - diskurz", " 
diff --git a/corpus-analyzer.jar b/corpus-analyzer.jar
new file mode 100644
index 0000000..985a018
Binary files /dev/null and b/corpus-analyzer.jar differ
diff --git a/src/main/java/data/Taxonomy.java b/src/main/java/data/Taxonomy.java
index 2755fc5..604aa81 100755
--- a/src/main/java/data/Taxonomy.java
+++ b/src/main/java/data/Taxonomy.java
@@ -680,7 +680,7 @@ enum TaxonomyEnum {
         return r;
     }
 
-    public static ArrayList<TaxonomyEnum> convertStringListToTaxonomyList(ObservableList<String> stringList, Corpus corpus){
+    public static ArrayList<TaxonomyEnum> convertStringListToTaxonomyList(List<String> stringList, Corpus corpus){
         ArrayList<TaxonomyEnum> taxonomyList = new ArrayList<>();
 
         for (String e : stringList) {
@@ -791,7 +791,7 @@ public class Taxonomy {
         return null;
     }
 
-    public static ArrayList<Taxonomy> convertStringListToTaxonomyList(ObservableList<String> stringList, Corpus corpus){
+    public static ArrayList<Taxonomy> convertStringListToTaxonomyList(List<String> stringList, Corpus corpus){
         ArrayList<Taxonomy> taxonomyList = new ArrayList<>();
 
         for (String e : stringList) {
@@ -832,7 +832,7 @@ public class Taxonomy {
         return r;
     }
 
-    public static ArrayList<Taxonomy> modifyingTaxonomy(ArrayList<Taxonomy> taxonomy, ObservableList<String> checkedItems, Corpus corpus){
+    public static ArrayList<Taxonomy> modifyingTaxonomy(ArrayList<Taxonomy> taxonomy, List<String> checkedItems, Corpus corpus){
         ArrayList<TaxonomyEnum> checkedItemsTaxonomy = TaxonomyEnum.convertStringListToTaxonomyList(checkedItems, corpus);
         if (checkedItemsTaxonomy != null && corpus.getCorpusType() != CorpusType.VERT && corpus.getCorpusType() != CorpusType.SSJ500K && corpus.getCorpusType() != CorpusType.GIGAFIDA2) {
             TaxonomyEnum.modifyingTaxonomy(Taxonomy.taxonomyToTaxonomyEnum(taxonomy), checkedItemsTaxonomy, corpus);
diff --git a/src/main/java/gui/GUIController.java b/src/main/java/gui/GUIController.java
index 17fb0fa..a024c08 100755
--- a/src/main/java/gui/GUIController.java
+++ b/src/main/java/gui/GUIController.java
@@ -2,6 +2,7 @@ package gui;
 
 import java.io.IOException;
 
+import data.Filter;
 import javafx.beans.binding.StringBinding;
 import javafx.scene.layout.AnchorPane;
 import org.apache.logging.log4j.LogManager;
@@ -20,6 +21,8 @@ import javafx.scene.control.Tab;
 import javafx.scene.control.TabPane;
 import javafx.stage.Stage;
 
+import static nogui.NoGUIController.launch_no_gui;
+
 public class GUIController extends Application {
 
     public final static Logger logger = LogManager.getLogger(GUIController.class);
@@ -78,7 +81,13 @@ public class GUIController extends Application {
     }
 
     public static void main(String[] args) {
-        launch(args);
+        if (args.length > 0) {
+            launch_no_gui(args);
+            logger.info("Processing finalized!");
+        } else {
+            launch(args);
+        }
+        System.exit(0);
     }
 
     public void initialize() {
diff --git a/src/main/java/nogui/Characters.java b/src/main/java/nogui/Characters.java
new file mode 100644
index 0000000..d49acab
--- /dev/null
+++ b/src/main/java/nogui/Characters.java
@@ -0,0 +1,118 @@
+package nogui;
+
+import alg.XML_processing;
+import data.*;
+import gui.GUIController;
+import gui.I18N;
+import javafx.beans.InvalidationListener;
+import javafx.beans.Observable;
+import javafx.beans.property.ReadOnlyDoubleWrapper;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+
+import java.io.File;
+import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Date;
+import java.util.regex.Pattern;
+
+import static nogui.Utils.*;
+
+public class Characters {
+    public final static Logger logger = LogManager.getLogger(GUIController.class);
+
+    public static void characters(JSONObject settings, Corpus corpus) {
+        Filter filter = new Filter();
+        // fixed values
+        filter.setNgramValue(0);
+        filter.setAl(AnalysisLevel.STRING_LEVEL);
+        filter.setSkipValue(0);
+        filter.setIsCvv(false);
+        filter.setMultipleKeys(new ArrayList<>());
+
+        // tab specific values
+        filter.setStringLength(Math.toIntExact((Long) settings.get("stringLength")));
+        String calculateForString = (String) settings.get("calculateFor");
+        CalculateFor calculateFor = CalculateFor.factory(I18N.get(calculateForString));
+        filter.setCalculateFor(calculateFor);
+        filter.setDisplayTaxonomy((boolean) settings.get("displayTaxonomy"));
+
+        // right part
+        ArrayList<Pattern> msd = getMsd((String) settings.get("msd"));
+        filter.setMsd(msd);
+        filter.setTaxonomySetOperation(I18N.get((String) settings.get("taxonomySetOperation")));
+        ArrayList<Taxonomy> taxonomy = getTaxonomy((JSONArray) settings.get("taxonomy"), corpus);
+        filter.setTaxonomy(taxonomy);
+        filter.setMinimalOccurrences(Math.toIntExact((Long) settings.get("minimalOccurrences")));
+        filter.setMinimalTaxonomy(Math.toIntExact((Long) settings.get("minimalTaxonomy")));
+
+        String message = Validation.validateForStringLevel(filter);
+        if (message == null) {
+            // no errors
+            logger.info("Executing: {}", filter);
+            StatisticsNew statistic = new StatisticsNew(corpus, filter, false);
+            execute(statistic);
+            try {
+                boolean successfullySaved = statistic.saveResultToDisk();
+                if (successfullySaved) {
+                    logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED"));
+                } else {
+                    logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS"));
+                }
+            } catch (UnsupportedEncodingException e1) {
+                logger.error(I18N.get("message.ERROR_WHILE_SAVING_RESULTS_TO_CSV"));
+            }
+        } else {
+            logger.error(message);
+        }
+    }
+
+    private static void execute(StatisticsNew statistic) {
+        logger.info("Started execution: {}", statistic.getFilter());
+
+        Collection<File> corpusFiles = statistic.getCorpus().getDetectedCorpusFiles();
+
+        final boolean multipleFiles = CorpusType.multipleFilesCorpuses().contains(statistic.getCorpus().getCorpusType());
+        int i = 0;
+        Date startTime = new Date();
+        Date previousTime = new Date();
+        int remainingSeconds = -1;
+        for (File f : corpusFiles) {
+            final int iFinal = i;
+            XML_processing xml_processing = new XML_processing();
+            i++;
+            if (multipleFiles) {
+                // estimate the remaining time at most once every 500 ms
+                if ((new Date()).getTime() - previousTime.getTime() > 500 || remainingSeconds == -1) {
+                    remainingSeconds = (int) (((new Date()).getTime() - startTime.getTime()) * (1.0 / i) * (corpusFiles.size() - i) / 1000);
+                    previousTime = new Date();
+                }
+                updateProgress(i, corpusFiles.size(), String.format(I18N.get("message.ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y"), i, corpusFiles.size(), f.getName(), remainingSeconds));
+            } else {
+                xml_processing.progressBarListener = new InvalidationListener() {
+                    int remainingSeconds = -1;
+                    Date previousTime = new Date();
+
+                    @Override
+                    public void invalidated(Observable observable) {
+                        if ((new Date()).getTime() - previousTime.getTime() > 500 || remainingSeconds == -1) {
+                            remainingSeconds = (int) (((new Date()).getTime() - xml_processing.startTime.getTime()) *
+                                    (1.0 / (iFinal * 100 + ((ReadOnlyDoubleWrapper) observable).get() + 1)) *
+                                    ((corpusFiles.size() - iFinal - 1) * 100 + 100 - ((ReadOnlyDoubleWrapper) observable).get()) / 1000);
+                            previousTime = new Date();
+                        }
+                        updateProgress((iFinal * 100) + ((ReadOnlyDoubleWrapper) observable).get() + 1, corpusFiles.size() * 100, String.format(I18N.get("message.ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y"), iFinal + 1, corpusFiles.size(), f.getName(), remainingSeconds));
+                    }
+                };
+
+                xml_processing.progressProperty().addListener(xml_processing.progressBarListener);
+            }
+            xml_processing.readXML(f.toString(), statistic);
+        }
+    }
+}
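Note: the remaining-time estimate in execute is plain proportional extrapolation — elapsed time per processed file times the number of files left. A worked instance of the formula (my own illustration, with made-up numbers):

```java
public class EtaDemo {
    public static void main(String[] args) {
        long elapsedMs = 20_000; // 20 s spent so far
        int processed = 4;       // files already read
        int total = 10;          // files detected in the corpus
        // remainingSeconds = elapsedMs * (1/i) * (filesLeft) / 1000
        int remainingSeconds = (int) (elapsedMs * (1.0 / processed) * (total - processed) / 1000);
        System.out.println(remainingSeconds); // 30
    }
}
```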
diff --git a/src/main/java/nogui/NoGUIController.java b/src/main/java/nogui/NoGUIController.java
new file mode 100644
index 0000000..af0b1cb
--- /dev/null
+++ b/src/main/java/nogui/NoGUIController.java
@@ -0,0 +1,200 @@
+package nogui;
+
+import data.*;
+import gui.GUIController;
+import gui.I18N;
+import gui.ValidationUtil;
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.IOCase;
+import org.apache.commons.io.filefilter.FileFilterUtils;
+import org.apache.commons.io.filefilter.TrueFileFilter;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+import org.json.simple.parser.ParseException;
+
+import java.io.*;
+import java.util.*;
+
+import static data.CorpusType.*;
+import static data.CorpusType.GIGAFIDA;
+import static nogui.Characters.characters;
+import static nogui.WordSets.wordSets;
+import static nogui.Words.words;
+import static nogui.WordParts.wordParts;
+
+public class NoGUIController {
+
+    public final static Logger logger = LogManager.getLogger(GUIController.class);
+
+    public static void launch_no_gui(String[] args) {
+        Filter filter = new Filter();
+
+        String path = null;
+        String corpusLocation = null;
+        String outputName = null;
+        String resultsLocation = null;
+        // read parameters: each flag consumes the following argument
+        int i = 0;
+        for (String s : args) {
+            switch (s) {
+                case "--config": // path to the JSON config file
+                    path = args[i + 1];
+                    break;
+                case "--corpusLocation": // optional override of the config value
+                    corpusLocation = args[i + 1];
+                    break;
+                case "--outputName": // optional override of the config value
+                    outputName = args[i + 1];
+                    break;
+                case "--resultsLocation": // optional override of the config value
+                    resultsLocation = args[i + 1];
+                    break;
+            }
+            i++;
+        }
+
+        // read config file
+        JSONObject settings = read_config(path);
+
+        // read corpus
+        Corpus corpus = read_corpus(settings, corpusLocation, outputName, resultsLocation);
+
+        if (((String) settings.get("language")).equals("SL")) {
+            I18N.setLocale(new Locale.Builder().setLanguage("sl").setRegion("SI").build());
+        } else {
+            I18N.setLocale(Locale.ENGLISH);
+        }
+
+        // dispatch on the selected tab
+        if (((String) settings.get("tab")).equals("characters")) {
+            characters(settings, corpus);
+        } else if (((String) settings.get("tab")).equals("wordParts")) {
+            wordParts(settings, corpus);
+        } else if (((String) settings.get("tab")).equals("words")) {
+            words(settings, corpus);
+        } else if (((String) settings.get("tab")).equals("wordSets")) {
+            wordSets(settings, corpus);
+        }
+    }
+
+    private static Corpus read_corpus(JSONObject settings, String corpusLocationS, String outputNameS, String resultsLocationS) {
+        Corpus corpus = new Corpus();
+
+        if (corpusLocationS == null) {
+            corpusLocationS = (String) settings.get("corpusLocation");
+        }
+
+        File corpusLocation = new File(corpusLocationS);
+        corpus.setChosenCorpusLocation(corpusLocation);
+
+        boolean readHeaderInfo = (Boolean) settings.get("readHeaderInfo");
+        corpus.setHeaderRead(readHeaderInfo);
+
+        if (resultsLocationS == null) {
+            resultsLocationS = (String) settings.get("resultsLocation");
+        }
+
+        File resultsLocation = new File(resultsLocationS);
+        corpus.setChosenResultsLocation(resultsLocation);
+
+        CorpusType corpusType = selectReader((String) settings.get("selectReader"));
+        corpus.setCorpusType(corpusType);
+
+        if (outputNameS == null) {
+            outputNameS = (String) settings.get("outputName");
+        }
+        corpus.setCorpusName(outputNameS);
+
+        String punctuation = ((String) settings.get("punctuation")).equals("comma") ? "punctuation.COMMA" : "punctuation.POINT";
+        corpus.setPunctuation(punctuation);
+
+        Collection<File> corpusFiles = null;
+
+        if (ValidationUtil.isReadableDirectory(corpusLocation)) {
+            logger.info("selected corpus dir: {}", corpusLocation.getAbsolutePath());
+
+            // scan for xml files
+            corpusFiles = FileUtils.listFiles(corpusLocation, FileFilterUtils.suffixFileFilter("xml", IOCase.INSENSITIVE), TrueFileFilter.INSTANCE);
+        } else {
+            // a single corpus file was given instead of a directory
+            corpusFiles = new LinkedList<>();
+            corpusFiles.add(corpusLocation);
+        }
+
+        corpus.setDetectedCorpusFiles(corpusFiles);
+
+        corpus.validate();
+
+        // MISSING: setSolarFiltersForXML
+
+        return corpus;
+    }
+
+    private static CorpusType selectReader(String selectReader) {
+        CorpusType corpusType = null;
+        switch (selectReader) {
+            // "vert", "Solar", "GOS", "SSJ500K", "Gigafida", "Gigafida (old)", "Kres (old)"
+            case "VERT + REGI":
+                corpusType = VERT;
+                break;
+            case "XML (Šolar 1.0)":
+                corpusType = SOLAR;
+                break;
+            case "XML (GOS 1.0)":
+                corpusType = GOS;
+                break;
+            case "XML (ssj500k 2.1)":
+                corpusType = SSJ500K;
+                break;
+            case "XML (Gigafida 2.0)":
+                corpusType = GIGAFIDA2;
+                break;
+            case "XML (Gigafida 1.0, Kres 1.0)":
+                corpusType = GIGAFIDA;
+                break;
+            default:
+                break;
+        }
+        return corpusType;
+    }
+
+    private static JSONObject read_config(String path) {
+        JSONObject settings = null;
+
+        // JSON parser object to parse the config file
+        JSONParser jsonParser = new JSONParser();
+
+        try (FileReader reader = new FileReader(path)) {
+            // read the JSON file
+            Object obj = jsonParser.parse(reader);
+
+            settings = (JSONObject) obj;
+        } catch (FileNotFoundException e) {
+            e.printStackTrace();
+        } catch (IOException e) {
+            e.printStackTrace();
+        } catch (ParseException e) {
+            e.printStackTrace();
+        }
+        return settings;
+    }
+}
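Note: read_corpus only falls back to the JSON value when the corresponding CLI argument is null, so --corpusLocation, --outputName and --resultsLocation override the config; and since config_words.json points at a single XML file, the non-directory branch above treats it as a one-file corpus. A hypothetical run reusing one config across corpora (paths are made up):

```java
import nogui.NoGUIController;

public class HeadlessOverrideRun {
    public static void main(String[] args) {
        // Same config, different corpus and results directory per run.
        NoGUIController.launch_no_gui(new String[]{
                "--config", "config_words.json",
                "--corpusLocation", "target/classes/Gigafida_subset",
                "--resultsLocation", "tmp/run2"
        });
    }
}
```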
diff --git a/src/main/java/nogui/Utils.java b/src/main/java/nogui/Utils.java
new file mode 100644
index 0000000..b5a1f5b
--- /dev/null
+++ b/src/main/java/nogui/Utils.java
@@ -0,0 +1,319 @@
+package nogui;
+
+import alg.XML_processing;
+import data.*;
+import gui.GUIController;
+import gui.I18N;
+import javafx.scene.control.Alert;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.json.simple.JSONArray;
+
+import java.io.File;
+import java.io.UnsupportedEncodingException;
+import java.util.*;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.regex.Pattern;
+
+import static gui.GUIController.showAlert;
+
+public class Utils {
+    public final static Logger logger = LogManager.getLogger(GUIController.class);
+
+    public static ArrayList<Taxonomy> getTaxonomy(JSONArray taxonomyArray, Corpus corpus) {
+        // convert JSONArray to a plain list of taxonomy names
+        ArrayList<String> checkedItems = new ArrayList<>();
+        for (Object o : taxonomyArray) {
+            checkedItems.add((String) o);
+        }
+
+        ArrayList<Taxonomy> taxonomy = new ArrayList<>();
+        ArrayList<Taxonomy> checkedItemsTaxonomy = Taxonomy.modifyingTaxonomy(taxonomy, checkedItems, corpus);
+        return checkedItemsTaxonomy;
+    }
+
+    public static ArrayList<Collocability> getCollocability(JSONArray collocabilityArray) {
+        // convert JSONArray to a list of Collocability values
+        ArrayList<Collocability> checkedItems = new ArrayList<>();
+        for (Object o : collocabilityArray) {
+            checkedItems.add(Collocability.factory((String) o));
+        }
+        return checkedItems;
+    }
+
+    public static ArrayList<String> getArrayList(JSONArray array) {
+        // convert JSONArray to a list of Strings
+        ArrayList<String> arrayList = new ArrayList<>();
+        for (Object o : array) {
+            arrayList.add((String) o);
+        }
+        return arrayList;
+    }
+
+    public static ArrayList<String> getAlsoVisualizeList(JSONArray array) {
+        // convert JSONArray to a list of localized labels
+        ArrayList<String> arrayList = new ArrayList<>();
+        for (Object o : array) {
+            arrayList.add(I18N.get((String) o));
+        }
+        return arrayList;
+    }
+
+    public static ArrayList<Pattern> getMsd(String stringMsd) {
+        ArrayList<Pattern> msd = new ArrayList<>();
+        if (stringMsd.equals("")) {
+            return msd;
+        }
+        ArrayList<String> msdTmp = new ArrayList<>(Arrays.asList(stringMsd.split(" ")));
+        for (String msdToken : msdTmp) {
+            msd.add(Pattern.compile(msdToken));
+        }
+        return msd;
+    }
+
+    public static void updateProgress(int i, int size, String format) {
+        // progress bar updates are a no-op in headless mode
+    }
+
+    public static void updateProgress(double i, int size, String format) {
+        // progress bar updates are a no-op in headless mode
+    }
+
+    public static void prepareTaskForMinRelFre(StatisticsNew statistic, Corpus corpus) {
+        Filter fi = statistic.getFilter();
+        logger.info("Started execution: {}", fi);
+
+        try {
+            Filter f2 = (Filter) fi.clone();
+            f2.setIsMinimalRelFreScraper(true);
+            StatisticsNew statisticsMinRelFre = new StatisticsNew(corpus, f2, false);
+
+            Collection<File> corpusFiles = statisticsMinRelFre.getCorpus().getDetectedCorpusFiles();
+
+            final boolean multipleFiles = CorpusType.multipleFilesCorpuses().contains(statisticsMinRelFre.getCorpus().getCorpusType());
+
+            Date startTime = new Date();
+            Date previousTime = new Date();
+            int remainingSeconds = -1;
+            int corpusSize;
+            int i;
+            if (statistic.getFilter().getCollocability().size() > 0) {
+                i = 0;
+                corpusSize = corpusFiles.size() * 3;
+            } else {
+                i = 0;
+                corpusSize = corpusFiles.size() * 2;
+            }
+            for (File f : corpusFiles) {
+                final int iFinal = i;
+                XML_processing xml_processing = new XML_processing();
+                i++;
+                if (multipleFiles) {
+                    if ((new Date()).getTime() - previousTime.getTime() > 500 || remainingSeconds == -1) {
+                        remainingSeconds = (int) (((new Date()).getTime() - startTime.getTime()) * (1.0 / i) * (corpusSize - i) / 1000);
+                        previousTime = new Date();
+                    }
+                } else {}
+                xml_processing.readXML(f.toString(), statisticsMinRelFre);
+            }
+
+            // add remaining minRelFre results
+            if (statisticsMinRelFre.getFilter().getIsMinimalRelFreScraper()) {
+                long countFor1MWords = statisticsMinRelFre.getUniGramOccurrences().get(statisticsMinRelFre.getCorpus().getTotal()).longValue();
+                double absToRelFactor = (statisticsMinRelFre.getFilter().getMinimalRelFre() / 1000000.0) * countFor1MWords;
+
+                statisticsMinRelFre.updateMinimalRelFre(statisticsMinRelFre.getTaxonomyResult().get(statisticsMinRelFre.getCorpus().getTotal()).entrySet(), absToRelFactor);
+
+                // reset all values
+                for (Taxonomy taxonomy : statisticsMinRelFre.getTaxonomyResult().keySet()) {
+                    statisticsMinRelFre.getTaxonomyResult().put(taxonomy, new ConcurrentHashMap<>());
+                }
+                for (Taxonomy taxonomy : statisticsMinRelFre.getUniGramOccurrences().keySet()) {
+                    statisticsMinRelFre.getUniGramOccurrences().put(taxonomy, new AtomicLong(0));
+                }
+            }
+
+            prepareMainTask(statistic, corpus);
+
+        } catch (CloneNotSupportedException c) {
+            logger.error("Filter does not support clone()", c);
+        }
+    }
+
+    public static void prepareMainTask(StatisticsNew statistic, Corpus corpus) {
+        Filter f = statistic.getFilter();
+        logger.info("Started execution: {}", f);
+
+        Collection<File> corpusFiles = statistic.getCorpus().getDetectedCorpusFiles();
+
+        final boolean multipleFiles = CorpusType.multipleFilesCorpuses().contains(statistic.getCorpus().getCorpusType());
+        Date startTime = new Date();
+        Date previousTime = new Date();
+        int remainingSeconds = -1;
+        int corpusSize;
+        int i;
+        int taskIndex = 0;
+        // the corpus is scanned up to three times (minRelFre pass, main pass, collocability pass),
+        // so the progress denominator is scaled accordingly
+        if (statistic.getFilter().getCollocability().size() > 0 && statistic.getFilter().getMinimalRelFre() > 1) {
+            i = corpusFiles.size();
+            corpusSize = corpusFiles.size() * 3;
+        } else if (statistic.getFilter().getMinimalRelFre() > 1) {
+            i = corpusFiles.size();
+            corpusSize = corpusFiles.size() * 2;
+        } else if (statistic.getFilter().getCollocability().size() > 0) {
+            i = 0;
+            corpusSize = corpusFiles.size() * 2;
+        } else {
+            i = 0;
+            corpusSize = corpusFiles.size();
+        }
+        for (File fi : corpusFiles) {
+            final int iFinal = i;
+            XML_processing xml_processing = new XML_processing();
+            xml_processing.isCancelled = false;
+            i++;
+            taskIndex++;
+//            if(xml_processing.progressBarListener != null) {
+//                xml_processing.progressProperty().removeListener(xml_processing.progressBarListener);
+//            }
+            if (multipleFiles) {
+                if ((new Date()).getTime() - previousTime.getTime() > 500 || remainingSeconds == -1) {
+                    remainingSeconds = (int) (((new Date()).getTime() - startTime.getTime()) * (1.0 / taskIndex) * (corpusSize - i) / 1000);
+                    previousTime = new Date();
+                }
+//                this.updateProgress(i, corpusSize);
+//                this.updateMessage(String.format(I18N.get("message.ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y"), i, corpusSize, f.getName(), remainingSeconds));
+            } else {
+//                xml_processing.progressBarListener = new InvalidationListener() {
+//                    int remainingSeconds = -1;
+//                    Date previousTime = new Date();
+//                    @Override
+//                    public void invalidated(Observable observable) {
+//                        if ((new Date()).getTime() - previousTime.getTime() > 500 || remainingSeconds == -1){
+//                            remainingSeconds = (int) (((new Date()).getTime() - xml_processing.startTime.getTime()) *
+//                                    (1.0/(iFinal * 100 + ((ReadOnlyDoubleWrapper) observable).get() + 1)) *
+//                                    ((corpusSize - iFinal - 1) * 100 + 100 - ((ReadOnlyDoubleWrapper) observable).get()) / 1000);
+//                            previousTime = new Date();
+//                        }
+//                        xml_processing.isCancelled = isCancelled();
+//                        updateProgress((iFinal * 100) + ((ReadOnlyDoubleWrapper) observable).get() + 1, corpusSize * 100);
+//                        updateMessage(String.format(I18N.get("message.ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y"), 1, 1, f.getName(), remainingSeconds));
+//                    }
+//                };
+//
+//                xml_processing.progressProperty().addListener(xml_processing.progressBarListener);
+            }
+            xml_processing.readXML(fi.toString(), statistic);
+        }
+        // if getMinimalRelFre > 1, erase all words with lower occurrences at the end of processing
+        if (statistic.getFilter().getMinimalRelFre() > 1) {
+            long countFor1MWords = statistic.getUniGramOccurrences().get(statistic.getCorpus().getTotal()).longValue();
+            double absToRelFactor = (statistic.getFilter().getMinimalRelFre() / 1000000.0) * countFor1MWords;
+
+            for (Map.Entry<MultipleHMKeys, AtomicLong> entry : statistic.getTaxonomyResult().get(statistic.getCorpus().getTotal()).entrySet()) {
+                if (entry.getValue().longValue() < absToRelFactor) {
+                    statistic.getTaxonomyResult().get(statistic.getCorpus().getTotal()).remove(entry.getKey());
+                }
+            }
+            statistic.updateMinimalRelFre(statistic.getTaxonomyResult().get(statistic.getCorpus().getTotal()).entrySet(), absToRelFactor);
+        }
+
+        if (f.getCollocability().size() > 0) {
+            try {
+                Filter f2 = (Filter) f.clone();
+                f2.setNgramValue(1);
+                StatisticsNew statisticsOneGrams = new StatisticsNew(corpus, f2, false);
+                prepareTaskForCollocability(statistic, statisticsOneGrams);
+            } catch (CloneNotSupportedException c) {
+                logger.error("Filter does not support clone()", c);
+            }
+        } else {
+            try {
+                boolean successfullySaved = statistic.saveResultToDisk();
+                if (successfullySaved) {
+                    logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED"));
+                } else {
+                    logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS"));
+                }
+            } catch (UnsupportedEncodingException e1) {
+                logger.error(I18N.get("message.ERROR_WHILE_SAVING_RESULTS_TO_CSV"));
+                logger.error("Error while saving", e1);
+            } catch (OutOfMemoryError e1) {
+                logger.error(I18N.get("message.ERROR_NOT_ENOUGH_MEMORY"));
+                logger.error("Out of memory error", e1);
+            }
+        }
+    }
+
+    public static void prepareTaskForCollocability(StatisticsNew statistic, StatisticsNew statisticsOneGrams) {
+        Collection<File> corpusFiles = statisticsOneGrams.getCorpus().getDetectedCorpusFiles();
+
+        final boolean multipleFiles = CorpusType.multipleFilesCorpuses().contains(statistic.getCorpus().getCorpusType());
+        Date startTime = new Date();
+        Date previousTime = new Date();
+        int remainingSeconds = -1;
+
+        int corpusSize;
+        int i;
+        int taskIndex = 0;
+        if (statistic.getFilter().getMinimalRelFre() > 1) {
+            i = corpusFiles.size() * 2;
+            corpusSize = corpusFiles.size() * 3;
+        } else {
+            i = corpusFiles.size();
+            corpusSize = corpusFiles.size() * 2;
+        }
+
+        for (File f : corpusFiles) {
+            XML_processing xml_processing = new XML_processing();
+            i++;
+            taskIndex++;
+            if (xml_processing.progressBarListener != null) {
+                xml_processing.progressProperty().removeListener(xml_processing.progressBarListener);
+            }
+            if (multipleFiles) {
+                if ((new Date()).getTime() - previousTime.getTime() > 500 || remainingSeconds == -1) {
+                    remainingSeconds = (int) (((new Date()).getTime() - startTime.getTime()) * (1.0 / taskIndex) * (corpusSize - i) / 1000);
+                    previousTime = new Date();
+                }
+            } else {
+//                xml_processing.progressBarListener = new InvalidationListener() {
+//                    int remainingSeconds = -1;
+//                    Date previousTime = new Date();
+//                    @Override
+//                    public void invalidated(Observable observable) {
+//                        if ((new Date()).getTime() - previousTime.getTime() > 500 || remainingSeconds == -1){
+//                            remainingSeconds = (int) (((new Date()).getTime() - xml_processing.startTime.getTime()) *
+//                                    (1.0/(iFinal * 100 + ((ReadOnlyDoubleWrapper) observable).get() + 1)) *
+//                                    ((corpusSize - iFinal - 1) * 100 + 100 - ((ReadOnlyDoubleWrapper) observable).get()) / 1000);
+//                            previousTime = new Date();
+//                        }
+//                    }
+//                };
+            }
+            xml_processing.isCollocability = true;
+            xml_processing.readXML(f.toString(), statisticsOneGrams);
+            xml_processing.isCollocability = false;
+        }
+
+        try {
+            statistic.updateCalculateCollocabilities(statisticsOneGrams);
+            boolean successfullySaved = statistic.saveResultToDisk();
+            if (successfullySaved) {
+                logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED"));
+            } else {
+                logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS"));
+            }
+        } catch (UnsupportedEncodingException e1) {
+            logger.error(I18N.get("message.ERROR_WHILE_SAVING_RESULTS_TO_CSV"));
+            logger.error("Error while saving", e1);
+        } catch (OutOfMemoryError e1) {
+            logger.error(I18N.get("message.ERROR_NOT_ENOUGH_MEMORY"));
+            logger.error("Out of memory error", e1);
+        }
+    }
+}
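Note: minimalRelFre is specified per million words, so the absolute cutoff used in prepareTaskForMinRelFre/prepareMainTask is absToRelFactor = (minimalRelFre / 1,000,000) × total unigram occurrences. A worked instance with made-up numbers:

```java
public class MinRelFreDemo {
    public static void main(String[] args) {
        long corpusWordCount = 25_000_000L; // total unigram occurrences (hypothetical)
        int minimalRelFre = 3;              // required occurrences per million words
        double absToRelFactor = (minimalRelFre / 1000000.0) * corpusWordCount;
        System.out.println(absToRelFactor); // 75.0 -> entries below 75 absolute occurrences are dropped
    }
}
```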
diff --git a/src/main/java/nogui/WordParts.java b/src/main/java/nogui/WordParts.java
new file mode 100644
index 0000000..d770b11
--- /dev/null
+++ b/src/main/java/nogui/WordParts.java
@@ -0,0 +1,100 @@
+package nogui;
+
+import alg.XML_processing;
+import data.*;
+import gui.GUIController;
+import gui.I18N;
+import javafx.beans.InvalidationListener;
+import javafx.beans.Observable;
+import javafx.beans.property.ReadOnlyDoubleWrapper;
+import javafx.concurrent.Task;
+import javafx.scene.control.Alert;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import util.Tasks;
+
+import java.io.File;
+import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Date;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.regex.Pattern;
+
+import static gui.GUIController.showAlert;
+import static nogui.Utils.*;
+
+public class WordParts {
+    public final static Logger logger = LogManager.getLogger(GUIController.class);
+
+    public static void wordParts(JSONObject settings, Corpus corpus) {
+        Filter filter = new Filter();
+        // fixed values
+        filter.setNgramValue(1);
+        filter.setAl(AnalysisLevel.STRING_LEVEL);
+        filter.setSkipValue(0);
+        filter.setIsCvv(false);
+        filter.setStringLength(1);
+
+        // tab specific values
+        // TODO
+        ArrayList<String> prefixList = getArrayList((JSONArray) settings.get("prefixList"));
+        filter.setPrefixList(prefixList);
+        ArrayList<String> suffixList = getArrayList((JSONArray) settings.get("suffixList"));
+        filter.setSuffixList(suffixList);
+
+        String calculateForString = (String) settings.get("calculateFor");
+        CalculateFor calculateFor = CalculateFor.factory(I18N.get(calculateForString));
+        filter.setCalculateFor(calculateFor);
+        ArrayList<String> alsoVisualize = getAlsoVisualizeList((JSONArray) settings.get("alsoVisualize"));
+        filter.setMultipleKeys(alsoVisualize);
+        filter.setDisplayTaxonomy((boolean) settings.get("displayTaxonomy"));
+        filter.setMinimalRelFre(Math.toIntExact((Long) settings.get("minimalRelFre")));
+        filter.setPrefixLength(Math.toIntExact((Long) settings.get("prefixLength")));
+        filter.setSuffixLength(Math.toIntExact((Long) settings.get("suffixLength")));
+
+        // right part
+        ArrayList<Pattern> msd = getMsd((String) settings.get("msd"));
+        filter.setMsd(msd);
+        filter.setTaxonomySetOperation(I18N.get((String) settings.get("taxonomySetOperation")));
+        ArrayList<Taxonomy> taxonomy = getTaxonomy((JSONArray) settings.get("taxonomy"), corpus);
+        filter.setTaxonomy(taxonomy);
+        filter.setMinimalOccurrences(Math.toIntExact((Long) settings.get("minimalOccurrences")));
+        filter.setMinimalTaxonomy(Math.toIntExact((Long) settings.get("minimalTaxonomy")));
+
+        String message = Validation.validateForStringLevel(filter);
+        if (message == null) {
+            // no errors
+            logger.info("Executing: {}", filter);
+            StatisticsNew statistic = new StatisticsNew(corpus, filter, false);
+            execute(statistic, corpus);
+            try {
+                boolean successfullySaved = statistic.saveResultToDisk();
+                if (successfullySaved) {
+                    logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED"));
+                } else {
+                    logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS"));
+                }
+            } catch (UnsupportedEncodingException e1) {
+                logger.error(I18N.get("message.ERROR_WHILE_SAVING_RESULTS_TO_CSV"));
+            }
+        } else {
+            logger.error(message);
+        }
+    }
+
+    private static void execute(StatisticsNew statistic, Corpus corpus) {
+        logger.info("Started execution: {}", statistic.getFilter());
+
+        if (statistic.getFilter().getMinimalRelFre() > 1) {
+            prepareTaskForMinRelFre(statistic, corpus);
+        } else {
+            prepareMainTask(statistic, corpus);
+        }
+    }
+}
diff --git a/src/main/java/nogui/WordSets.java b/src/main/java/nogui/WordSets.java
new file mode 100644
index 0000000..393494e
--- /dev/null
+++ b/src/main/java/nogui/WordSets.java
@@ -0,0 +1,88 @@
+package nogui;
+
+import data.*;
+import gui.GUIController;
+import gui.I18N;
+import javafx.concurrent.Task;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import util.Tasks;
+
+import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+import java.util.regex.Pattern;
+
+import static nogui.Utils.*;
+
+public class WordSets {
+    public final static Logger logger = LogManager.getLogger(GUIController.class);
+
+    public static void wordSets(JSONObject settings, Corpus corpus) {
+        Filter filter = new Filter();
+        // fixed values
+
+        filter.setAl(AnalysisLevel.STRING_LEVEL);
+
+        filter.setIsCvv(false);
+        filter.setStringLength(1);
+
+        // tab specific values
+        filter.setNgramValue(Math.toIntExact((Long) settings.get("ngramValue")));
+        filter.setSkipValue(Math.toIntExact((Long) settings.get("skipValue")));
+        filter.setNotePunctuations((boolean) settings.get("notePunctuations"));
+        filter.setCollocability(getCollocability((JSONArray) settings.get("collocability")));
+
+        String calculateForString = (String) settings.get("calculateFor");
+        CalculateFor calculateFor = CalculateFor.factory(I18N.get(calculateForString));
+        filter.setCalculateFor(calculateFor);
+        ArrayList<String> alsoVisualize = getAlsoVisualizeList((JSONArray) settings.get("alsoVisualize"));
+        filter.setMultipleKeys(alsoVisualize);
+        filter.setDisplayTaxonomy((boolean) settings.get("displayTaxonomy"));
+        filter.setMinimalRelFre(Math.toIntExact((Long) settings.get("minimalRelFre")));
+
+        // right part
+        ArrayList<Pattern> msd = getMsd((String) settings.get("msd"));
+        filter.setMsd(msd);
+        filter.setTaxonomySetOperation(I18N.get((String) settings.get("taxonomySetOperation")));
+        ArrayList<Taxonomy> taxonomy = getTaxonomy((JSONArray) settings.get("taxonomy"), corpus);
+        filter.setTaxonomy(taxonomy);
+        filter.setMinimalOccurrences(Math.toIntExact((Long) settings.get("minimalOccurrences")));
+        filter.setMinimalTaxonomy(Math.toIntExact((Long) settings.get("minimalTaxonomy")));
+
+        String message = Validation.validateForStringLevel(filter);
+        if (message == null) {
+            // no errors
+            logger.info("Executing: {}", filter);
+            StatisticsNew statistic = new StatisticsNew(corpus, filter, false);
+            execute(statistic, corpus);
+            try {
+                boolean successfullySaved = statistic.saveResultToDisk();
+                if (successfullySaved) {
+                    logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED"));
+                } else {
+                    logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS"));
+                }
+            } catch (UnsupportedEncodingException e1) {
+                logger.error(I18N.get("message.ERROR_WHILE_SAVING_RESULTS_TO_CSV"));
+            }
+        } else {
+            logger.error(message);
+        }
+    }
+
+    private static void execute(StatisticsNew statistic, Corpus corpus) {
+        Filter f = statistic.getFilter();
+        logger.info("Started execution: {}", f);
+
+        if (f.getMinimalRelFre() > 1) {
+            prepareTaskForMinRelFre(statistic, corpus);
+        } else {
+            prepareMainTask(statistic, corpus);
+        }
+    }
+}
diff --git a/src/main/java/nogui/Words.java b/src/main/java/nogui/Words.java
new file mode 100644
index 0000000..27970d6
--- /dev/null
+++ b/src/main/java/nogui/Words.java
@@ -0,0 +1,84 @@
+package nogui;
+
+import data.*;
+import gui.GUIController;
+import gui.I18N;
+import javafx.concurrent.Task;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import util.Tasks;
+
+import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+import java.util.regex.Pattern;
+
+import static nogui.Utils.*;
+import static nogui.Utils.getTaxonomy;
+
+public class Words {
+    public final static Logger logger = LogManager.getLogger(GUIController.class);
+
+    public static void words(JSONObject settings, Corpus corpus) {
+        Filter filter = new Filter();
+        // fixed values
+        filter.setNgramValue(1);
+        filter.setAl(AnalysisLevel.STRING_LEVEL);
+        filter.setSkipValue(0);
+        filter.setIsCvv(false);
+        filter.setStringLength(1);
+
+        // tab specific values
+        filter.setNotePunctuations((boolean) settings.get("notePunctuations"));
+        filter.setWriteMsdAtTheEnd((boolean) settings.get("writeMsdAtTheEnd"));
+
+        String calculateForString = (String) settings.get("calculateFor");
+        CalculateFor calculateFor = CalculateFor.factory(I18N.get(calculateForString));
+        filter.setCalculateFor(calculateFor);
+        ArrayList<String> alsoVisualize = getAlsoVisualizeList((JSONArray) settings.get("alsoVisualize"));
+        filter.setMultipleKeys(alsoVisualize);
+        filter.setDisplayTaxonomy((boolean) settings.get("displayTaxonomy"));
+        filter.setMinimalRelFre(Math.toIntExact((Long) settings.get("minimalRelFre")));
+
+        // right part
+        ArrayList<Pattern> msd = getMsd((String) settings.get("msd"));
+        filter.setMsd(msd);
+        filter.setTaxonomySetOperation(I18N.get((String) settings.get("taxonomySetOperation")));
+        ArrayList<Taxonomy> taxonomy = getTaxonomy((JSONArray) settings.get("taxonomy"), corpus);
+        filter.setTaxonomy(taxonomy);
+        filter.setMinimalOccurrences(Math.toIntExact((Long) settings.get("minimalOccurrences")));
+        filter.setMinimalTaxonomy(Math.toIntExact((Long) settings.get("minimalTaxonomy")));
+
+        String message = Validation.validateForStringLevel(filter);
+        if (message == null) {
+            // no errors
+            logger.info("Executing: {}", filter);
+            StatisticsNew statistic = new StatisticsNew(corpus, filter, false);
+            execute(statistic, corpus);
+            try {
+                boolean successfullySaved = statistic.saveResultToDisk();
+                if (successfullySaved) {
+                    logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED"));
+                } else {
+                    logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS"));
+                }
+            } catch (UnsupportedEncodingException e1) {
+                logger.error(I18N.get("message.ERROR_WHILE_SAVING_RESULTS_TO_CSV"));
+            }
+        } else {
+            logger.error(message);
+        }
+    }
+
+    private static void execute(StatisticsNew statistic, Corpus corpus) {
+        logger.info("Started execution: {}", statistic.getFilter());
+
+        if (statistic.getFilter().getMinimalRelFre() > 1) {
+            prepareTaskForMinRelFre(statistic, corpus);
+        } else {
+            prepareMainTask(statistic, corpus);
+        }
+    }
+}