Added no GUI option
This commit is contained in:
parent 682beabdcb
commit 30b848d853
.gitignore (vendored) | 3 +++

@@ -164,3 +164,6 @@ $RECYCLE.BIN/
 src/main/resources/translation_external/
 src/main/resources/translations_backup/
 
+config.json
+config_instructions.txt
+
config_characters.json (new file, 22 lines)
@@ -0,0 +1,22 @@
{
"language": "SL",

"corpusLocation": "target/classes/Gigafida_subset",
"readHeaderInfo": false,
"resultsLocation": "tmp",
"selectReader": "XML (Gigafida 1.0, Kres 1.0)",
"outputName": "",
"punctuation": "comma",

"tab": "characters",

"stringLength": 1,
"calculateFor": "calculateFor.WORD",
"displayTaxonomy": false,

"msd": "",
"taxonomySetOperation": "taxonomySetOperation.UNION",
"taxonomy": ["SSJ.T.K.L - tisk-knjižno-leposlovno", "SSJ.T.K.L - tisk-knjižno-leposlovno"],
"minimalOccurrences": 1,
"minimalTaxonomy": 1
}
config_characters_instructions.txt (new file, 21 lines)
@@ -0,0 +1,21 @@

"language": String - options: "SL", "EN"

"corpusLocation": String - path to input location.
"readHeaderInfo": Boolean - read taxonomy from corpus files
"resultsLocation": String - path to results location
"selectReader": String - options: "VERT + REGI", "XML (Šolar 1.0)", "XML (GOS 1.0)", "XML (ssj500k 2.1)", "XML (Gigafida 2.0)", "XML (Gigafida 1.0, Kres 1.0)", corpusType = GIGAFIDA
"outputName": String - Output file name
"punctuation": String - options: "comma", "point"

"tab": String - options: "characters", "wordParts", "words", "wordSets"

"stringLength": int - Number of characters
"calculateFor": String - options: "calculateFor.WORD", "calculateFor.LOWERCASE_WORD", "calculateFor.NORMALIZED_WORD", "calculateFor.LEMMA", "calculateFor.MORPHOSYNTACTIC_SPECS", "calculateFor.MORPHOSYNTACTIC_PROPERTY", "calculateFor.WORD_TYPE", "calculateFor.DIST_WORDS", "calculateFor.DIST_LEMMAS"
"displayTaxonomy": Boolean - Display taxonomy in output

"msd": String - A valid MSD (or empty)
"taxonomySetOperation": String - options: "taxonomySetOperation.UNION", "taxonomySetOperation.INTERSECTION"
"taxonomy": array of Strings - options: "SSJ.T - tisk", " SSJ.T.K - tisk-knjižno", " SSJ.T.K.L - tisk-knjižno-leposlovno", " SSJ.T.K.S - tisk-knjižno-strokovno", " SSJ.T.P - tisk-periodično", " SSJ.T.P.C - tisk-periodično-časopis", " SSJ.T.P.R - tisk-periodično-revija", " SSJ.T.D - tisk-drugo", "SSJ.I - internet", "Ft.P - prenosnik", " Ft.P.G - prenosnik-govorni", " Ft.P.E - prenosnik-elektronski", " Ft.P.P - prenosnik-pisni", " Ft.P.P.O - prenosnik-pisni-objavljeno", " Ft.P.P.O.K - prenosnik-pisni-objavljeno-knjižno", " Ft.P.P.O.P - prenosnik-pisni-objavljeno-periodično", " Ft.P.P.O.P.C - prenosnik-pisni-objavljeno-periodično-časopisno", " Ft.P.P.O.P.C.D - prenosnik-pisni-objavljeno-periodično-časopisno-dnevno", " Ft.P.P.O.P.C.V - prenosnik-pisni-objavljeno-periodično-časopisno-večkrat tedensko", " Ft.P.P.O.P.C.T - prenosnik-pisni-objavljeno-periodično-časopisno-tedensko", " Ft.P.P.O.P.R - prenosnik-pisni-objavljeno-periodično-revialno", " Ft.P.P.O.P.R.T - prenosnik-pisni-objavljeno-periodično-revialno-tedensko", " Ft.P.P.O.P.R.S - prenosnik-pisni-objavljeno-periodično-revialno-štirinajstdnevno", " Ft.P.P.O.P.R.M - prenosnik-pisni-objavljeno-periodično-revialno-mesečno", " Ft.P.P.O.P.R.D - prenosnik-pisni-objavljeno-periodično-revialno-redkeje kot na mesec", " Ft.P.P.O.P.R.O - prenosnik-pisni-objavljeno-periodično-revialno-občasno", " Ft.P.P.N - prenosnik-pisni-neobjavljeno", " Ft.P.P.N.J - prenosnik-pisni-neobjavljeno-javno", " Ft.P.P.N.I - prenosnik-pisni-neobjavljeno-interno", " Ft.P.P.N.Z - prenosnik-pisni-neobjavljeno-zasebno", "Ft.Z - zvrst", " Ft.Z.U - zvrst-umetnostna", " Ft.Z.U.P - zvrst-umetnostna-pesniška", " Ft.Z.U.R - zvrst-umetnostna-prozna", " Ft.Z.U.D - zvrst-umetnostna-dramska", " Ft.Z.N - zvrst-neumetnostna", " Ft.Z.N.S - zvrst-neumetnostna-strokovna", " Ft.Z.N.S.H - zvrst-neumetnostna-strokovna-humanistična in družboslovna", " Ft.Z.N.S.N - zvrst-neumetnostna-strokovna-naravoslovna in tehnična", " Ft.Z.N.N - zvrst-neumetnostna-nestrokovna", " Ft.Z.N.P - zvrst-neumetnostna-pravna", "Ft.L - zvrst-lektorirano", " Ft.L.D - zvrst-lektorirano-da", " Ft.L.N - zvrst-lektorirano-ne", "gos.T - diskurz", " gos.T.J - diskurz-javni", " gos.T.J.I - diskurz-javni-informativno-izobraževalni", " gos.T.J.R - diskurz-javni-razvedrilni", " gos.T.N - diskurz-nejavni", " gos.T.N.N - diskurz-nejavni-nezasebni", " gos.T.N.Z - diskurz-nejavni-zasebni", "gos.S - situacija", " gos.S.R - situacija-radio", " gos.S.T - situacija-televizija", "gos.K - kanal", " gos.K.O - kanal-osebni stik", " gos.K.P - kanal-telefon", " gos.K.R - kanal-radio", " gos.K.T - kanal-televizija"
"minimalOccurrences": int - Minimal number of occurrences
"minimalTaxonomy": int - Minimal number of taxonomy branches
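Note on the two files above: with the no-GUI entry point added in this commit, the characters analysis can be run entirely from a script. Any command-line argument makes main() skip the GUI, so an invocation like java -jar corpus-analyzer.jar --config config_characters.json (illustrative; the jar and config are the ones added here) runs the characters tab over the bundled Gigafida subset and writes results to tmp.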
config_wordParts.json (new file, 27 lines)
@@ -0,0 +1,27 @@
{
"language": "SL",

"corpusLocation": "target/classes/Gigafida_subset",
"readHeaderInfo": false,
"resultsLocation": "tmp",
"selectReader": "XML (Gigafida 1.0, Kres 1.0)",
"outputName": "",
"punctuation": "comma",

"tab": "wordParts",

"calculateFor": "calculateFor.WORD",
"alsoVisualize": ["calculateFor.LEMMA"],
"displayTaxonomy": false,
"prefixLength": 1,
"suffixLength": 0,
"prefixList": [],
"suffixList": [],

"msd": "",
"taxonomySetOperation": "taxonomySetOperation.UNION",
"taxonomy": [],
"minimalOccurrences": 1,
"minimalTaxonomy": 1,
"minimalRelFre": 1
}
config_wordParts_instructions.txt (new file, 26 lines)
@@ -0,0 +1,26 @@

"language": String - options: "SL", "EN"

"corpusLocation": String - path to input location.
"readHeaderInfo": Boolean - read taxonomy from corpus files
"resultsLocation": String - path to results location
"selectReader": String - options: "VERT + REGI", "XML (Šolar 1.0)", "XML (GOS 1.0)", "XML (ssj500k 2.1)", "XML (Gigafida 2.0)", "XML (Gigafida 1.0, Kres 1.0)", corpusType = GIGAFIDA
"outputName": String - Output file name
"punctuation": String - options: "comma", "point"

"tab": String - options: "characters", "wordParts", "words", "wordSets"

"calculateFor": String - options: "calculateFor.WORD", "calculateFor.LOWERCASE_WORD", "calculateFor.NORMALIZED_WORD", "calculateFor.LEMMA", "calculateFor.MORPHOSYNTACTIC_SPECS", "calculateFor.MORPHOSYNTACTIC_PROPERTY", "calculateFor.WORD_TYPE", "calculateFor.DIST_WORDS", "calculateFor.DIST_LEMMAS"
"alsoVisualize": array of Strings - options: "calculateFor.WORD", "calculateFor.LOWERCASE_WORD", "calculateFor.LEMMA", "calculateFor.NORMALIZED_WORD", "calculateFor.WORD_TYPE", "calculateFor.MORPHOSYNTACTIC_SPECS"
"displayTaxonomy": Boolean - Display taxonomy in output
"prefixLength": int - prefix length
"suffixLength": int - suffix length
"prefixList": array of Strings - write the different options as array elements
"suffixList": array of Strings - write the different options as array elements

"msd": String - A valid MSD (or empty)
"taxonomySetOperation": String - options: "taxonomySetOperation.UNION", "taxonomySetOperation.INTERSECTION"
"taxonomy": array of Strings - options: (same taxonomy list as in config_characters_instructions.txt)
"minimalOccurrences": int - Minimal number of occurrences
"minimalTaxonomy": int - Minimal number of taxonomy branches
"minimalRelFre": int - Minimal relative frequency
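The flags --corpusLocation, --outputName and --resultsLocation override the matching keys of the chosen config: read_corpus in NoGUIController.java (added below) falls back to the JSON value only when the flag is absent. For example, java -jar corpus-analyzer.jar --config config_wordParts.json --resultsLocation out reuses the shipped config but redirects output to out (illustrative paths).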
config_wordSets.json (new file, 27 lines)
@@ -0,0 +1,27 @@
{
"language": "SL",

"corpusLocation": "target/classes/Gigafida_subset",
"readHeaderInfo": false,
"resultsLocation": "tmp",
"selectReader": "XML (Gigafida 1.0, Kres 1.0)",
"outputName": "",
"punctuation": "comma",

"tab": "wordSets",

"calculateFor": "calculateFor.WORD",
"alsoVisualize": ["calculateFor.MORPHOSYNTACTIC_SPECS"],
"displayTaxonomy": false,
"ngramValue": 2,
"skipValue": 0,
"notePunctuations": false,
"collocability": ["Dice"],

"msd": "Sozei Sozei",
"taxonomySetOperation": "taxonomySetOperation.UNION",
"taxonomy": [],
"minimalOccurrences": 1,
"minimalTaxonomy": 1,
"minimalRelFre": 1
}
config_wordSets_instructions.txt (new file, 26 lines)
@@ -0,0 +1,26 @@

"language": String - options: "SL", "EN"

"corpusLocation": String - path to input location.
"readHeaderInfo": Boolean - read taxonomy from corpus files
"resultsLocation": String - path to results location
"selectReader": String - options: "VERT + REGI", "XML (Šolar 1.0)", "XML (GOS 1.0)", "XML (ssj500k 2.1)", "XML (Gigafida 2.0)", "XML (Gigafida 1.0, Kres 1.0)", corpusType = GIGAFIDA
"outputName": String - Output file name
"punctuation": String - options: "comma", "point"

"tab": String - options: "characters", "wordParts", "words", "wordSets"

"calculateFor": String - options: "calculateFor.WORD", "calculateFor.LOWERCASE_WORD", "calculateFor.NORMALIZED_WORD", "calculateFor.LEMMA", "calculateFor.MORPHOSYNTACTIC_SPECS", "calculateFor.MORPHOSYNTACTIC_PROPERTY", "calculateFor.WORD_TYPE", "calculateFor.DIST_WORDS", "calculateFor.DIST_LEMMAS"
"alsoVisualize": array of Strings - options: "calculateFor.WORD", "calculateFor.LOWERCASE_WORD", "calculateFor.LEMMA", "calculateFor.NORMALIZED_WORD", "calculateFor.WORD_TYPE", "calculateFor.MORPHOSYNTACTIC_SPECS"
"displayTaxonomy": Boolean - Display taxonomy in output
"ngramValue": int - N-gram length
"skipValue": int - Maximum number of words that may appear between two adjacent words of a word set
"notePunctuations": Boolean - Word sets will include punctuations
"collocability": array of Strings - options: "Dice", "t-score", "MI", "MI3", "logDice", "simple LL"

"msd": String - A valid MSD (or empty)
"taxonomySetOperation": String - options: "taxonomySetOperation.UNION", "taxonomySetOperation.INTERSECTION"
"taxonomy": array of Strings - options: (same taxonomy list as in config_characters_instructions.txt)
"minimalOccurrences": int - Minimal number of occurrences
"minimalTaxonomy": int - Minimal number of taxonomy branches
"minimalRelFre": int - Minimal relative frequency
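ngramValue and skipValue are easiest to see on a toy input. Below is a minimal sketch of the usual skip-gram reading, with hypothetical names (SkipGramDemo, pairs); the tool's real extraction lives in XML_processing, which is not part of this diff:

import java.util.ArrayList;
import java.util.List;

public class SkipGramDemo {
    // Collect all ordered pairs (ngramValue = 2) whose members are at most
    // skipValue tokens apart; skipValue = 0 yields plain bigrams.
    static List<String> pairs(String[] tokens, int skipValue) {
        List<String> out = new ArrayList<>();
        for (int i = 0; i < tokens.length; i++) {
            for (int j = i + 1; j <= i + 1 + skipValue && j < tokens.length; j++) {
                out.add(tokens[i] + " " + tokens[j]);
            }
        }
        return out;
    }

    public static void main(String[] args) {
        // "a b c" with skipValue 1 -> [a b, a c, b c]
        System.out.println(pairs(new String[]{"a", "b", "c"}, 1));
    }
}

With skipValue 0 this degenerates to ordinary bigrams, which is what the shipped config_wordSets.json requests.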
config_words.json (new file, 25 lines)
@@ -0,0 +1,25 @@
{
"language": "SL",

"corpusLocation": "target/classes/Gigafida_minimal/gfmin.xml",
"readHeaderInfo": false,
"resultsLocation": "tmp",
"selectReader": "XML (Gigafida 1.0, Kres 1.0)",
"outputName": "",
"punctuation": "comma",

"tab": "words",

"calculateFor": "calculateFor.WORD",
"alsoVisualize": ["calculateFor.LEMMA"],
"displayTaxonomy": false,
"notePunctuations": false,
"writeMsdAtTheEnd": false,

"msd": "",
"taxonomySetOperation": "taxonomySetOperation.UNION",
"taxonomy": [" SSJ.T.K.S - tisk-knjižno-strokovno"],
"minimalOccurrences": 1,
"minimalTaxonomy": 1,
"minimalRelFre": 1
}
config_words_instructions.txt (new file, 24 lines)
@@ -0,0 +1,24 @@

"language": String - options: "SL", "EN"

"corpusLocation": String - path to input location.
"readHeaderInfo": Boolean - read taxonomy from corpus files
"resultsLocation": String - path to results location
"selectReader": String - options: "VERT + REGI", "XML (Šolar 1.0)", "XML (GOS 1.0)", "XML (ssj500k 2.1)", "XML (Gigafida 2.0)", "XML (Gigafida 1.0, Kres 1.0)", corpusType = GIGAFIDA
"outputName": String - Output file name
"punctuation": String - options: "comma", "point"

"tab": String - options: "characters", "wordParts", "words", "wordSets"

"calculateFor": String - options: "calculateFor.WORD", "calculateFor.LOWERCASE_WORD", "calculateFor.NORMALIZED_WORD", "calculateFor.LEMMA", "calculateFor.MORPHOSYNTACTIC_SPECS", "calculateFor.MORPHOSYNTACTIC_PROPERTY", "calculateFor.WORD_TYPE", "calculateFor.DIST_WORDS", "calculateFor.DIST_LEMMAS"
"alsoVisualize": array of Strings - options: "calculateFor.WORD", "calculateFor.LOWERCASE_WORD", "calculateFor.LEMMA", "calculateFor.NORMALIZED_WORD", "calculateFor.WORD_TYPE", "calculateFor.MORPHOSYNTACTIC_SPECS"
"displayTaxonomy": Boolean - Display taxonomy in output
"notePunctuations": Boolean - Word sets will include punctuations
"writeMsdAtTheEnd": Boolean - The output will also include parts of the morphosyntactic tag, written at the end

"msd": String - A valid MSD (or empty)
"taxonomySetOperation": String - options: "taxonomySetOperation.UNION", "taxonomySetOperation.INTERSECTION"
"taxonomy": array of Strings - options: (same taxonomy list as in config_characters_instructions.txt)
"minimalOccurrences": int - Minimal number of occurrences
"minimalTaxonomy": int - Minimal number of taxonomy branches
"minimalRelFre": int - Minimal relative frequency
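The msd value is a space-separated list of regular expressions, one per token position, exactly as Utils.getMsd (added below in this commit) builds it; the "Sozei Sozei" value in config_wordSets.json therefore keeps only 2-grams whose two tokens both carry an MSD matching Sozei. A small sketch mirroring that code (MsdDemo is a hypothetical name):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.regex.Pattern;

public class MsdDemo {
    public static void main(String[] args) {
        // Mirrors Utils.getMsd from this commit: split on spaces, one regex per token.
        ArrayList<Pattern> msd = new ArrayList<>();
        for (String token : Arrays.asList("Sozei Sozei".split(" "))) {
            msd.add(Pattern.compile(token));
        }
        System.out.println(msd.size());                            // 2
        System.out.println(msd.get(0).matcher("Sozei").matches()); // true
    }
}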
corpus-analyzer.jar (new file) | BIN
Binary file not shown.
Taxonomy.java (modified)
@@ -680,7 +680,7 @@ enum TaxonomyEnum {
         return r;
     }
 
-    public static ArrayList<TaxonomyEnum> convertStringListToTaxonomyList(ObservableList<String> stringList, Corpus corpus){
+    public static ArrayList<TaxonomyEnum> convertStringListToTaxonomyList(List<String> stringList, Corpus corpus){
         ArrayList<TaxonomyEnum> taxonomyList = new ArrayList<>();
 
         for (String e : stringList) {
@@ -791,7 +791,7 @@ public class Taxonomy {
         return null;
     }
 
-    public static ArrayList<Taxonomy> convertStringListToTaxonomyList(ObservableList<String> stringList, Corpus corpus){
+    public static ArrayList<Taxonomy> convertStringListToTaxonomyList(List<String> stringList, Corpus corpus){
         ArrayList<Taxonomy> taxonomyList = new ArrayList<>();
 
         for (String e : stringList) {
@@ -832,7 +832,7 @@ public class Taxonomy {
         return r;
     }
 
-    public static ArrayList<Taxonomy> modifyingTaxonomy(ArrayList<Taxonomy> taxonomy, ObservableList<String> checkedItems, Corpus corpus){
+    public static ArrayList<Taxonomy> modifyingTaxonomy(ArrayList<Taxonomy> taxonomy, List<String> checkedItems, Corpus corpus){
         ArrayList<TaxonomyEnum> checkedItemsTaxonomy = TaxonomyEnum.convertStringListToTaxonomyList(checkedItems, corpus);
         if (checkedItemsTaxonomy != null && corpus.getCorpusType() != CorpusType.VERT && corpus.getCorpusType() != CorpusType.SSJ500K && corpus.getCorpusType() != CorpusType.GIGAFIDA2) {
             TaxonomyEnum.modifyingTaxonomy(Taxonomy.taxonomyToTaxonomyEnum(taxonomy), checkedItemsTaxonomy, corpus);
GUIController.java (modified)
@@ -2,6 +2,7 @@ package gui;
 
 import java.io.IOException;
 
+import data.Filter;
 import javafx.beans.binding.StringBinding;
 import javafx.scene.layout.AnchorPane;
 import org.apache.logging.log4j.LogManager;
@@ -20,6 +21,8 @@ import javafx.scene.control.Tab;
 import javafx.scene.control.TabPane;
 import javafx.stage.Stage;
 
+import static nogui.NoGUIController.launch_no_gui;
+
 public class GUIController extends Application {
     public final static Logger logger = LogManager.getLogger(GUIController.class);
 
@@ -78,8 +81,14 @@ public class GUIController extends Application {
     }
 
     public static void main(String[] args) {
+        if (args.length > 0) {
+            launch_no_gui(args);
+            logger.info("Processing finalized!");
+        } else {
             launch(args);
+        }
+        System.exit(0);
     }
 
     public void initialize() {
         // add CSS style
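The effect of the main() change: java -jar corpus-analyzer.jar with no arguments still opens the JavaFX GUI, while any argument routes to launch_no_gui(args) and logs "Processing finalized!" when done. The unconditional System.exit(0) presumably exists to terminate the JVM even if JavaFX or worker threads are still alive after either path.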
src/main/java/nogui/Characters.java (new file, 118 lines)
@@ -0,0 +1,118 @@
package nogui;

import alg.XML_processing;
import data.*;
import gui.GUIController;
import gui.I18N;
import javafx.beans.InvalidationListener;
import javafx.beans.Observable;
import javafx.beans.property.ReadOnlyDoubleWrapper;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;

import java.io.File;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.regex.Pattern;

import static nogui.Utils.*;

public class Characters {
    public final static Logger logger = LogManager.getLogger(GUIController.class);

    public static void characters(JSONObject settings, Corpus corpus) {
        Filter filter = new Filter();
        // fixed values
        filter.setNgramValue(0);
        filter.setAl(AnalysisLevel.STRING_LEVEL);
        filter.setSkipValue(0);
        filter.setIsCvv(false);
        filter.setMultipleKeys(new ArrayList<>());

        // tab-specific values
        filter.setStringLength(Math.toIntExact((Long) settings.get("stringLength")));
        String calculateForString = (String) settings.get("calculateFor");
        CalculateFor calculateFor = CalculateFor.factory(I18N.get(calculateForString));
        filter.setCalculateFor(calculateFor);
        filter.setDisplayTaxonomy((boolean) settings.get("displayTaxonomy"));

        // right part
        ArrayList<Pattern> msd = getMsd((String) settings.get("msd"));
        filter.setMsd(msd);
        filter.setTaxonomySetOperation(I18N.get((String) settings.get("taxonomySetOperation")));
        ArrayList<Taxonomy> taxonomy = getTaxonomy((JSONArray) settings.get("taxonomy"), corpus);
        filter.setTaxonomy(taxonomy);
        filter.setMinimalOccurrences(Math.toIntExact((Long) settings.get("minimalOccurrences")));
        filter.setMinimalTaxonomy(Math.toIntExact((Long) settings.get("minimalTaxonomy")));

        String message = Validation.validateForStringLevel(filter);
        if (message == null) {
            // no errors
            logger.info("Executing: {}", filter.toString());
            StatisticsNew statistic = new StatisticsNew(corpus, filter, false);
            execute(statistic);
            try {
                boolean successfullySaved = statistic.saveResultToDisk();
                if (successfullySaved) {
                    logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED"));
                } else {
                    logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS"));
                }
            } catch (UnsupportedEncodingException e1) {
                logger.error(I18N.get("message.ERROR_WHILE_SAVING_RESULTS_TO_CSV"));
            }
        } else {
            logger.error(message);
        }
    }

    private static void execute(StatisticsNew statistic) {
        logger.info("Started execution: {}", statistic.getFilter());

        Collection<File> corpusFiles = statistic.getCorpus().getDetectedCorpusFiles();

        final boolean multipleFiles = CorpusType.multipleFilesCorpuses().contains(statistic.getCorpus().getCorpusType());
        int i = 0;
        Date startTime = new Date();
        Date previousTime = new Date();
        int remainingSeconds = -1;
        for (File f : corpusFiles) {
            final int iFinal = i;
            XML_processing xml_processing = new XML_processing();
            i++;
            if (multipleFiles) {
                if ((new Date()).getTime() - previousTime.getTime() > 500 || remainingSeconds == -1) {
                    remainingSeconds = (int) (((new Date()).getTime() - startTime.getTime()) * (1.0 / i) * (corpusFiles.size() - i) / 1000);
                    previousTime = new Date();
                }
                updateProgress(i, corpusFiles.size(), String.format(I18N.get("message.ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y"), i, corpusFiles.size(), f.getName(), remainingSeconds));
            } else {
                xml_processing.progressBarListener = new InvalidationListener() {
                    int remainingSeconds = -1;
                    Date previousTime = new Date();

                    @Override
                    public void invalidated(Observable observable) {
                        if ((new Date()).getTime() - previousTime.getTime() > 500 || remainingSeconds == -1) {
                            remainingSeconds = (int) (((new Date()).getTime() - xml_processing.startTime.getTime()) *
                                    (1.0 / (iFinal * 100 + ((ReadOnlyDoubleWrapper) observable).get() + 1)) *
                                    ((corpusFiles.size() - iFinal - 1) * 100 + 100 - ((ReadOnlyDoubleWrapper) observable).get()) / 1000);
                            previousTime = new Date();
                        }
                        updateProgress((iFinal * 100) + ((ReadOnlyDoubleWrapper) observable).get() + 1, corpusFiles.size() * 100, String.format(I18N.get("message.ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y"), iFinal + 1, corpusFiles.size(), f.getName(), remainingSeconds));
                    }
                };

                xml_processing.progressProperty().addListener(xml_processing.progressBarListener);
            }
            xml_processing.readXML(f.toString(), statistic);
        }
    }
}
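The remaining-time estimate in execute() is plain proportional extrapolation: if i files took t milliseconds, the remaining corpusFiles.size() - i files are assumed to take t / i each, giving t * (1 / i) * (N - i) / 1000 seconds. For example, 20 of 100 files in 40,000 ms implies 2 s per file, so a 160 s estimate; the InvalidationListener branch applies the same formula at per-percent granularity (100 ticks per file), refreshed at most every 500 ms.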
src/main/java/nogui/NoGUIController.java (new file, 200 lines)
@@ -0,0 +1,200 @@
package nogui;

import data.*;
import gui.GUIController;
import gui.I18N;
import gui.ValidationUtil;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOCase;
import org.apache.commons.io.filefilter.FileFilterUtils;
import org.apache.commons.io.filefilter.TrueFileFilter;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;

import java.io.*;
import java.util.*;

import static data.CorpusType.*;
import static data.CorpusType.GIGAFIDA;
import static nogui.Characters.characters;
import static nogui.WordSets.wordSets;
import static nogui.Words.words;
import static nogui.WordParts.wordParts;

public class NoGUIController {

    public final static Logger logger = LogManager.getLogger(GUIController.class);

    public static void launch_no_gui(String[] args) {
        Filter filter = new Filter();

        String path = null;
        String corpusLocation = null;
        String outputName = null;
        String resultsLocation = null;
        // read parameters; each flag takes its value from the following argument
        int i = 0;
        for (String s : args) {
            switch (s) {
                case "--config": // path to the JSON config file
                    path = args[i + 1];
                    break;
                case "--corpusLocation": // optional override of the config's corpusLocation
                    corpusLocation = args[i + 1];
                    break;
                case "--outputName": // optional override of the config's outputName
                    outputName = args[i + 1];
                    break;
                case "--resultsLocation": // optional override of the config's resultsLocation
                    resultsLocation = args[i + 1];
                    break;
            }
            i++;
        }

        // read config file
        JSONObject settings = read_config(path);

        // read corpus
        Corpus corpus = read_corpus(settings, corpusLocation, outputName, resultsLocation);

        if (((String) settings.get("language")).equals("SL")) {
            I18N.setLocale(new Locale.Builder().setLanguage("sl").setRegion("SI").build());
        } else {
            I18N.setLocale(Locale.ENGLISH);
        }

        // dispatch to the selected tab
        if (((String) settings.get("tab")).equals("characters")) {
            characters(settings, corpus);
        } else if (((String) settings.get("tab")).equals("wordParts")) {
            wordParts(settings, corpus);
        } else if (((String) settings.get("tab")).equals("words")) {
            words(settings, corpus);
        } else if (((String) settings.get("tab")).equals("wordSets")) {
            wordSets(settings, corpus);
        }
    }

    private static Corpus read_corpus(JSONObject settings, String corpusLocationS, String outputNameS, String resultsLocationS) {
        Corpus corpus = new Corpus();

        if (corpusLocationS == null) {
            corpusLocationS = (String) settings.get("corpusLocation");
        }

        File corpusLocation = new File(corpusLocationS);
        corpus.setChosenCorpusLocation(corpusLocation);

        boolean readHeaderInfo = (Boolean) settings.get("readHeaderInfo");
        corpus.setHeaderRead(readHeaderInfo);

        if (resultsLocationS == null) {
            resultsLocationS = (String) settings.get("resultsLocation");
        }

        File resultsLocation = new File(resultsLocationS);
        corpus.setChosenResultsLocation(resultsLocation);

        CorpusType corpusType = selectReader((String) settings.get("selectReader"));
        corpus.setCorpusType(corpusType);

        if (outputNameS == null) {
            outputNameS = (String) settings.get("outputName");
        }
        corpus.setCorpusName(outputNameS);

        String punctuation = ((String) settings.get("punctuation")).equals("comma") ? "punctuation.COMMA" : "punctuation.POINT";
        corpus.setPunctuation(punctuation);

        Collection<File> corpusFiles = null;

        if (ValidationUtil.isReadableDirectory(corpusLocation)) {
            logger.info("selected corpus dir: {}", corpusLocation.getAbsolutePath());

            // scan for xml files
            corpusFiles = FileUtils.listFiles(corpusLocation, FileFilterUtils.suffixFileFilter("xml", IOCase.INSENSITIVE), TrueFileFilter.INSTANCE);
        } else {
            corpusFiles = new LinkedList<>();
            corpusFiles.add(corpusLocation);
        }

        corpus.setDetectedCorpusFiles(corpusFiles);

        corpus.validate();

        // MISSING: setSolarFiltersForXML

        return corpus;
    }

    private static CorpusType selectReader(String selectReader) {
        CorpusType corpusType = null;
        switch (selectReader) {
            // "vert", "Solar", "GOS", "SSJ500K", "Gigafida", "Gigafida (old)", "Kres (old)"
            case "VERT + REGI":
                corpusType = VERT;
                break;
            case "XML (Šolar 1.0)":
                corpusType = SOLAR;
                break;
            case "XML (GOS 1.0)":
                corpusType = GOS;
                break;
            case "XML (ssj500k 2.1)":
                corpusType = SSJ500K;
                break;
            case "XML (Gigafida 2.0)":
                corpusType = GIGAFIDA2;
                break;
            case "XML (Gigafida 1.0, Kres 1.0)":
                corpusType = GIGAFIDA;
                break;
            default:
                break;
        }
        return corpusType;
    }

    private static JSONObject read_config(String path) {
        JSONObject settings = null;

        // JSON parser object for the config file
        JSONParser jsonParser = new JSONParser();

        try (FileReader reader = new FileReader(path)) {
            // read and parse the JSON file
            Object obj = jsonParser.parse(reader);
            settings = (JSONObject) obj;
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        }
        return settings;
    }
}
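launch_no_gui and read_corpus call settings.get(...) without null checks, so a config must supply at least language, tab, corpusLocation, readHeaderInfo, resultsLocation, selectReader, outputName and punctuation (all shipped config_*.json files do), plus the tab-specific keys; a missing key would typically surface as a NullPointerException rather than a readable error. Likewise, an unrecognized selectReader leaves the corpusType null, and read_config merely prints a stack trace and returns null on an unreadable or malformed config file.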
src/main/java/nogui/Utils.java (new file, 319 lines)
@@ -0,0 +1,319 @@
package nogui;

import alg.XML_processing;
import data.*;
import gui.GUIController;
import gui.I18N;
import javafx.scene.control.Alert;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.json.simple.JSONArray;

import java.io.File;
import java.io.UnsupportedEncodingException;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Pattern;

import static gui.GUIController.showAlert;

public class Utils {
    public final static Logger logger = LogManager.getLogger(GUIController.class);

    public static ArrayList<Taxonomy> getTaxonomy(JSONArray taxonomyArray, Corpus corpus) {
        // convert JSONArray to a plain list of strings
        ArrayList<String> checkedItems = new ArrayList<>();
        for (Object o : taxonomyArray) {
            checkedItems.add((String) o);
        }

        ArrayList<Taxonomy> taxonomy = new ArrayList<>();
        ArrayList<Taxonomy> checkedItemsTaxonomy = Taxonomy.modifyingTaxonomy(taxonomy, checkedItems, corpus);
        return checkedItemsTaxonomy;
    }

    public static ArrayList<Collocability> getCollocability(JSONArray collocabilityArray) {
        // convert JSONArray entries to Collocability values
        ArrayList<Collocability> checkedItems = new ArrayList<>();
        for (Object o : collocabilityArray) {
            checkedItems.add(Collocability.factory((String) o));
        }
        return checkedItems;
    }

    public static ArrayList<String> getArrayList(JSONArray array) {
        // convert JSONArray to a plain list of strings
        ArrayList<String> arrayList = new ArrayList<>();
        for (Object o : array) {
            arrayList.add((String) o);
        }
        return arrayList;
    }

    public static ArrayList<String> getAlsoVisualizeList(JSONArray array) {
        // convert JSONArray to a list of localized labels
        ArrayList<String> arrayList = new ArrayList<>();
        for (Object o : array) {
            arrayList.add(I18N.get((String) o));
        }
        return arrayList;
    }

    public static ArrayList<Pattern> getMsd(String stringMsd) {
        ArrayList<Pattern> msd = new ArrayList<>();
        if (stringMsd.equals("")) {
            return msd;
        }
        ArrayList<String> msdTmp = new ArrayList<>(Arrays.asList(stringMsd.split(" ")));
        for (String msdToken : msdTmp) {
            msd.add(Pattern.compile(msdToken));
        }
        return msd;
    }

    public static void updateProgress(int i, int size, String format) {
    }

    public static void updateProgress(double i, int size, String format) {
    }

    public static void prepareTaskForMinRelFre(StatisticsNew statistic, Corpus corpus) {
        Filter fi = statistic.getFilter();
        logger.info("Started execution: {}", fi);

        try {
            Filter f2 = (Filter) fi.clone();
            f2.setIsMinimalRelFreScraper(true);
            StatisticsNew statisticsMinRelFre = new StatisticsNew(corpus, f2, false);

            Collection<File> corpusFiles = statisticsMinRelFre.getCorpus().getDetectedCorpusFiles();

            final boolean multipleFiles = CorpusType.multipleFilesCorpuses().contains(statisticsMinRelFre.getCorpus().getCorpusType());

            Date startTime = new Date();
            Date previousTime = new Date();
            int remainingSeconds = -1;
            int corpusSize;
            int i;
            if (statistic.getFilter().getCollocability().size() > 0) {
                i = 0;
                corpusSize = corpusFiles.size() * 3;
            } else {
                i = 0;
                corpusSize = corpusFiles.size() * 2;
            }
            for (File f : corpusFiles) {
                XML_processing xml_processing = new XML_processing();
                i++;
                if (multipleFiles) {
                    if ((new Date()).getTime() - previousTime.getTime() > 500 || remainingSeconds == -1) {
                        remainingSeconds = (int) (((new Date()).getTime() - startTime.getTime()) * (1.0 / i) * (corpusSize - i) / 1000);
                        previousTime = new Date();
                    }
                }
                xml_processing.readXML(f.toString(), statisticsMinRelFre);
            }

            // add remaining minRelFre results
            if (statisticsMinRelFre.getFilter().getIsMinimalRelFreScraper()) {
                long countFor1MWords = statisticsMinRelFre.getUniGramOccurrences().get(statisticsMinRelFre.getCorpus().getTotal()).longValue();
                double absToRelFactor = (statisticsMinRelFre.getFilter().getMinimalRelFre() / 1000000.0) * countFor1MWords;

                statisticsMinRelFre.updateMinimalRelFre(statisticsMinRelFre.getTaxonomyResult().get(statisticsMinRelFre.getCorpus().getTotal()).entrySet(), absToRelFactor);

                // reset all values
                for (Taxonomy taxonomy : statisticsMinRelFre.getTaxonomyResult().keySet()) {
                    statisticsMinRelFre.getTaxonomyResult().put(taxonomy, new ConcurrentHashMap<>());
                }
                for (Taxonomy taxonomy : statisticsMinRelFre.getUniGramOccurrences().keySet()) {
                    statisticsMinRelFre.getUniGramOccurrences().put(taxonomy, new AtomicLong(0));
                }
            }

            prepareMainTask(statistic, corpus);

        } catch (CloneNotSupportedException c) {}
    }

    public static void prepareMainTask(StatisticsNew statistic, Corpus corpus) {
        Filter f = statistic.getFilter();
        logger.info("Started execution: {}", f);

        Collection<File> corpusFiles = statistic.getCorpus().getDetectedCorpusFiles();

        final boolean multipleFiles = CorpusType.multipleFilesCorpuses().contains(statistic.getCorpus().getCorpusType());
        Date startTime = new Date();
        Date previousTime = new Date();
        int remainingSeconds = -1;
        int corpusSize;
        int i;
        int taskIndex = 0;
        // offset the progress counter past any passes that already ran
        if (statistic.getFilter().getCollocability().size() > 0 && statistic.getFilter().getMinimalRelFre() > 1) {
            i = corpusFiles.size();
            corpusSize = corpusFiles.size() * 3;
        } else if (statistic.getFilter().getMinimalRelFre() > 1) {
            i = corpusFiles.size();
            corpusSize = corpusFiles.size() * 2;
        } else if (statistic.getFilter().getCollocability().size() > 0) {
            i = 0;
            corpusSize = corpusFiles.size() * 2;
        } else {
            i = 0;
            corpusSize = corpusFiles.size();
        }
        for (File fi : corpusFiles) {
            XML_processing xml_processing = new XML_processing();
            xml_processing.isCancelled = false;
            i++;
            taskIndex++;
            if (multipleFiles) {
                if ((new Date()).getTime() - previousTime.getTime() > 500 || remainingSeconds == -1) {
                    remainingSeconds = (int) (((new Date()).getTime() - startTime.getTime()) * (1.0 / taskIndex) * (corpusSize - i) / 1000);
                    previousTime = new Date();
                }
            }
            xml_processing.readXML(fi.toString(), statistic);
        }
        // if minimalRelFre > 1, erase all words with lower occurrences at the end of processing
        if (statistic.getFilter().getMinimalRelFre() > 1) {
            long countFor1MWords = statistic.getUniGramOccurrences().get(statistic.getCorpus().getTotal()).longValue();
            double absToRelFactor = (statistic.getFilter().getMinimalRelFre() / 1000000.0) * countFor1MWords;

            for (Map.Entry<MultipleHMKeys, AtomicLong> entry : statistic.getTaxonomyResult().get(statistic.getCorpus().getTotal()).entrySet()) {
                if (entry.getValue().longValue() < absToRelFactor) {
                    statistic.getTaxonomyResult().get(statistic.getCorpus().getTotal()).remove(entry.getKey());
                }
            }
            statistic.updateMinimalRelFre(statistic.getTaxonomyResult().get(statistic.getCorpus().getTotal()).entrySet(), absToRelFactor);
        }

        if (f.getCollocability().size() > 0) {
            try {
                Filter f2 = (Filter) f.clone();
                f2.setNgramValue(1);
                StatisticsNew statisticsOneGrams = new StatisticsNew(corpus, f2, false);
                prepareTaskForCollocability(statistic, statisticsOneGrams);
            } catch (CloneNotSupportedException c) {}
        } else {
            try {
                boolean successfullySaved = statistic.saveResultToDisk();
                if (successfullySaved) {
                    logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED"));
                } else {
                    logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS"));
                }
            } catch (UnsupportedEncodingException e1) {
                logger.error(I18N.get("message.ERROR_WHILE_SAVING_RESULTS_TO_CSV"));
                logger.error("Error while saving", e1);
            } catch (OutOfMemoryError e1) {
                logger.error(I18N.get("message.ERROR_WHILE_SAVING_RESULTS_TO_CSV"));
                logger.error("Out of memory error", e1);
            }
        }
    }

    public static void prepareTaskForCollocability(StatisticsNew statistic, StatisticsNew statisticsOneGrams) {
        Collection<File> corpusFiles = statisticsOneGrams.getCorpus().getDetectedCorpusFiles();

        final boolean multipleFiles = CorpusType.multipleFilesCorpuses().contains(statistic.getCorpus().getCorpusType());
        Date startTime = new Date();
        Date previousTime = new Date();
        int remainingSeconds = -1;

        int corpusSize;
        int i;
        int taskIndex = 0;
        if (statistic.getFilter().getMinimalRelFre() > 1) {
            i = corpusFiles.size() * 2;
            corpusSize = corpusFiles.size() * 3;
        } else {
            i = corpusFiles.size();
            corpusSize = corpusFiles.size() * 2;
        }

        for (File f : corpusFiles) {
            XML_processing xml_processing = new XML_processing();
            i++;
            taskIndex++;
            if (xml_processing.progressBarListener != null) {
                xml_processing.progressProperty().removeListener(xml_processing.progressBarListener);
            }
            if (multipleFiles) {
                if ((new Date()).getTime() - previousTime.getTime() > 500 || remainingSeconds == -1) {
                    remainingSeconds = (int) (((new Date()).getTime() - startTime.getTime()) * (1.0 / taskIndex) * (corpusSize - i) / 1000);
                    previousTime = new Date();
                }
            }
            xml_processing.isCollocability = true;
            xml_processing.readXML(f.toString(), statisticsOneGrams);
            xml_processing.isCollocability = false;
        }

        try {
            statistic.updateCalculateCollocabilities(statisticsOneGrams);
            boolean successfullySaved = statistic.saveResultToDisk();
            if (successfullySaved) {
                logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED"));
            } else {
                logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS"));
            }
        } catch (UnsupportedEncodingException e1) {
            logger.error(I18N.get("message.ERROR_WHILE_SAVING_RESULTS_TO_CSV"));
            logger.error("Error while saving", e1);
        } catch (OutOfMemoryError e1) {
            logger.error(I18N.get("message.ERROR_NOT_ENOUGH_MEMORY"));
            logger.error("Out of memory error", e1);
        }
    }
}
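Worked example of the minimalRelFre conversion used twice above: absToRelFactor turns the per-million threshold into an absolute count, minimalRelFre / 1,000,000 times the corpus token total. A self-contained sketch (MinRelFreDemo and the corpus size are hypothetical):

public class MinRelFreDemo {
    public static void main(String[] args) {
        // absToRelFactor as computed in Utils.prepareMainTask:
        long countFor1MWords = 2_000_000L; // hypothetical corpus size in tokens
        int minimalRelFre = 5;             // occurrences per million tokens
        double absToRelFactor = (minimalRelFre / 1000000.0) * countFor1MWords;
        System.out.println(absToRelFactor); // 10.0 -> entries seen fewer than 10 times are removed
    }
}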
100
src/main/java/nogui/WordParts.java
Normal file
100
src/main/java/nogui/WordParts.java
Normal file
|
@@ -0,0 +1,100 @@
package nogui;

import alg.XML_processing;
import data.*;
import gui.GUIController;
import gui.I18N;
import javafx.beans.InvalidationListener;
import javafx.beans.Observable;
import javafx.beans.property.ReadOnlyDoubleWrapper;
import javafx.concurrent.Task;
import javafx.scene.control.Alert;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import util.Tasks;

import java.io.File;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Pattern;

import static gui.GUIController.showAlert;
import static nogui.Utils.*;

public class WordParts {
    public final static Logger logger = LogManager.getLogger(WordParts.class);

    public static void wordParts(JSONObject settings, Corpus corpus) {
        Filter filter = new Filter();

        // fixed values
        filter.setNgramValue(1);
        filter.setAl(AnalysisLevel.STRING_LEVEL);
        filter.setSkipValue(0);
        filter.setIsCvv(false);
        filter.setStringLength(1);

        // tab specific values
        // TODO
        ArrayList<String> prefixList = getArrayList((JSONArray) settings.get("prefixList"));
        filter.setPrefixList(prefixList);
        ArrayList<String> suffixList = getArrayList((JSONArray) settings.get("suffixList"));
        filter.setSuffixList(suffixList);

        String calculateForString = (String) settings.get("calculateFor");
        CalculateFor calculateFor = CalculateFor.factory(I18N.get(calculateForString));
        filter.setCalculateFor(calculateFor);
        ArrayList<String> alsoVisualize = getAlsoVisualizeList((JSONArray) settings.get("alsoVisualize"));
        filter.setMultipleKeys(alsoVisualize);
        filter.setDisplayTaxonomy((boolean) settings.get("displayTaxonomy"));
        filter.setMinimalRelFre(Math.toIntExact((Long) settings.get("minimalRelFre")));
        filter.setPrefixLength(Math.toIntExact((Long) settings.get("prefixLength")));
        filter.setSuffixLength(Math.toIntExact((Long) settings.get("suffixLength")));

        // right part
        ArrayList<Pattern> msd = getMsd((String) settings.get("msd"));
        filter.setMsd(msd);
        filter.setTaxonomySetOperation(I18N.get((String) settings.get("taxonomySetOperation")));
        ArrayList<Taxonomy> taxonomy = getTaxonomy((JSONArray) settings.get("taxonomy"), corpus);
        filter.setTaxonomy(taxonomy);
        filter.setMinimalOccurrences(Math.toIntExact((Long) settings.get("minimalOccurrences")));
        filter.setMinimalTaxonomy(Math.toIntExact((Long) settings.get("minimalTaxonomy")));

        String message = Validation.validateForStringLevel(filter);
        if (message == null) {
            // no errors
            logger.info("Executing: {}", filter);
            StatisticsNew statistic = new StatisticsNew(corpus, filter, false);
            execute(statistic, corpus);
            try {
                boolean successfullySaved = statistic.saveResultToDisk();
                if (successfullySaved) {
                    logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED"));
                } else {
                    logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS"));
                }
            } catch (UnsupportedEncodingException e1) {
                logger.error(I18N.get("message.ERROR_WHILE_SAVING_RESULTS_TO_CSV"));
            }
        } else {
            logger.error(message);
        }
    }

    private static void execute(StatisticsNew statistic, Corpus corpus) {
        logger.info("Started execution: {}", statistic.getFilter());

        // below the relative-frequency threshold of 1 no pre-pass is needed
        if (statistic.getFilter().getMinimalRelFre() > 1) {
            prepareTaskForMinRelFre(statistic, corpus);
        } else {
            prepareMainTask(statistic, corpus);
        }
    }
}
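For illustration, a hypothetical config for this tab, following the shape of config_characters.json; every key below mirrors a settings.get(...) call in WordParts.java above, and all values are placeholders rather than defaults taken from the code:

    {
        "tab": "wordParts",

        "prefixList": [],
        "suffixList": [],
        "calculateFor": "calculateFor.WORD",
        "alsoVisualize": [],
        "displayTaxonomy": false,
        "minimalRelFre": 1,
        "prefixLength": 1,
        "suffixLength": 1,

        "msd": "",
        "taxonomySetOperation": "taxonomySetOperation.UNION",
        "taxonomy": ["SSJ.T - tisk"],
        "minimalOccurrences": 1,
        "minimalTaxonomy": 1
    }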
88
src/main/java/nogui/WordSets.java
Normal file
@@ -0,0 +1,88 @@
package nogui;

import data.*;
import gui.GUIController;
import gui.I18N;
import javafx.concurrent.Task;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import util.Tasks;

import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.regex.Pattern;

import static nogui.Utils.*;

public class WordSets {
    public final static Logger logger = LogManager.getLogger(WordSets.class);

    public static void wordSets(JSONObject settings, Corpus corpus) {
        Filter filter = new Filter();

        // fixed values
        filter.setAl(AnalysisLevel.STRING_LEVEL);
        filter.setIsCvv(false);
        filter.setStringLength(1);

        // tab specific values
        filter.setNgramValue(Math.toIntExact((Long) settings.get("ngramValue")));
        filter.setSkipValue(Math.toIntExact((Long) settings.get("skipValue")));
        filter.setNotePunctuations((boolean) settings.get("notePunctuations"));
        filter.setCollocability(getCollocability((JSONArray) settings.get("collocability")));

        String calculateForString = (String) settings.get("calculateFor");
        CalculateFor calculateFor = CalculateFor.factory(I18N.get(calculateForString));
        filter.setCalculateFor(calculateFor);
        ArrayList<String> alsoVisualize = getAlsoVisualizeList((JSONArray) settings.get("alsoVisualize"));
        filter.setMultipleKeys(alsoVisualize);
        filter.setDisplayTaxonomy((boolean) settings.get("displayTaxonomy"));
        filter.setMinimalRelFre(Math.toIntExact((Long) settings.get("minimalRelFre")));

        // right part
        ArrayList<Pattern> msd = getMsd((String) settings.get("msd"));
        filter.setMsd(msd);
        filter.setTaxonomySetOperation(I18N.get((String) settings.get("taxonomySetOperation")));
        ArrayList<Taxonomy> taxonomy = getTaxonomy((JSONArray) settings.get("taxonomy"), corpus);
        filter.setTaxonomy(taxonomy);
        filter.setMinimalOccurrences(Math.toIntExact((Long) settings.get("minimalOccurrences")));
        filter.setMinimalTaxonomy(Math.toIntExact((Long) settings.get("minimalTaxonomy")));

        String message = Validation.validateForStringLevel(filter);
        if (message == null) {
            // no errors
            logger.info("Executing: {}", filter);
            StatisticsNew statistic = new StatisticsNew(corpus, filter, false);
            execute(statistic, corpus);
            try {
                boolean successfullySaved = statistic.saveResultToDisk();
                if (successfullySaved) {
                    logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED"));
                } else {
                    logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS"));
                }
            } catch (UnsupportedEncodingException e1) {
                logger.error(I18N.get("message.ERROR_WHILE_SAVING_RESULTS_TO_CSV"));
            }
        } else {
            logger.error(message);
        }
    }

    private static void execute(StatisticsNew statistic, Corpus corpus) {
        Filter f = statistic.getFilter();
        logger.info("Started execution: {}", f);

        if (f.getMinimalRelFre() > 1) {
            prepareTaskForMinRelFre(statistic, corpus);
        } else {
            prepareMainTask(statistic, corpus);
        }
    }
}
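Again for illustration, a hypothetical config for the word-sets tab, including the collocability field discussed earlier; the keys correspond one-to-one to the settings.get(...) calls in WordSets.java above, with placeholder values:

    {
        "tab": "wordSets",

        "ngramValue": 2,
        "skipValue": 0,
        "notePunctuations": false,
        "collocability": [],
        "calculateFor": "calculateFor.WORD",
        "alsoVisualize": [],
        "displayTaxonomy": false,
        "minimalRelFre": 1,

        "msd": "",
        "taxonomySetOperation": "taxonomySetOperation.UNION",
        "taxonomy": ["SSJ.T - tisk"],
        "minimalOccurrences": 1,
        "minimalTaxonomy": 1
    }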
84
src/main/java/nogui/Words.java
Normal file
@@ -0,0 +1,84 @@
package nogui;

import data.*;
import gui.GUIController;
import gui.I18N;
import javafx.concurrent.Task;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import util.Tasks;

import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.regex.Pattern;

import static nogui.Utils.*;

public class Words {
    public final static Logger logger = LogManager.getLogger(Words.class);

    public static void words(JSONObject settings, Corpus corpus) {
        Filter filter = new Filter();

        // fixed values
        filter.setNgramValue(1);
        filter.setAl(AnalysisLevel.STRING_LEVEL);
        filter.setSkipValue(0);
        filter.setIsCvv(false);
        filter.setStringLength(1);

        // tab specific values
        filter.setNotePunctuations((boolean) settings.get("notePunctuations"));
        filter.setWriteMsdAtTheEnd((boolean) settings.get("writeMsdAtTheEnd"));

        String calculateForString = (String) settings.get("calculateFor");
        CalculateFor calculateFor = CalculateFor.factory(I18N.get(calculateForString));
        filter.setCalculateFor(calculateFor);
        ArrayList<String> alsoVisualize = getAlsoVisualizeList((JSONArray) settings.get("alsoVisualize"));
        filter.setMultipleKeys(alsoVisualize);
        filter.setDisplayTaxonomy((boolean) settings.get("displayTaxonomy"));
        filter.setMinimalRelFre(Math.toIntExact((Long) settings.get("minimalRelFre")));

        // right part
        ArrayList<Pattern> msd = getMsd((String) settings.get("msd"));
        filter.setMsd(msd);
        filter.setTaxonomySetOperation(I18N.get((String) settings.get("taxonomySetOperation")));
        ArrayList<Taxonomy> taxonomy = getTaxonomy((JSONArray) settings.get("taxonomy"), corpus);
        filter.setTaxonomy(taxonomy);
        filter.setMinimalOccurrences(Math.toIntExact((Long) settings.get("minimalOccurrences")));
        filter.setMinimalTaxonomy(Math.toIntExact((Long) settings.get("minimalTaxonomy")));

        String message = Validation.validateForStringLevel(filter);
        if (message == null) {
            // no errors
            logger.info("Executing: {}", filter);
            StatisticsNew statistic = new StatisticsNew(corpus, filter, false);
            execute(statistic, corpus);
            try {
                boolean successfullySaved = statistic.saveResultToDisk();
                if (successfullySaved) {
                    logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED"));
                } else {
                    logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS"));
                }
            } catch (UnsupportedEncodingException e1) {
                logger.error(I18N.get("message.ERROR_WHILE_SAVING_RESULTS_TO_CSV"));
            }
        } else {
            logger.error(message);
        }
    }

    private static void execute(StatisticsNew statistic, Corpus corpus) {
        logger.info("Started execution: {}", statistic.getFilter());

        if (statistic.getFilter().getMinimalRelFre() > 1) {
            prepareTaskForMinRelFre(statistic, corpus);
        } else {
            prepareMainTask(statistic, corpus);
        }
    }
}
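Finally, a hypothetical config for the words tab in the same spirit; keys mirror the settings.get(...) calls in Words.java above, values are placeholders:

    {
        "tab": "words",

        "notePunctuations": false,
        "writeMsdAtTheEnd": false,
        "calculateFor": "calculateFor.LEMMA",
        "alsoVisualize": [],
        "displayTaxonomy": false,
        "minimalRelFre": 1,

        "msd": "",
        "taxonomySetOperation": "taxonomySetOperation.UNION",
        "taxonomy": ["SSJ.T - tisk"],
        "minimalOccurrences": 1,
        "minimalTaxonomy": 1
    }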