9 Commits
1.2 ... master

Author SHA1 Message Date
07b4161051 Updated documentation and translations. 2024-08-28 12:24:18 +02:00
5fcab958aa Merge branches 2024-08-28 11:08:15 +02:00
490f17d6b2 Added missing updates 2024-08-28 11:00:22 +02:00
eb72b380a5 Fixed taxonomy processing for KOST + Added ignoring of certain files in KOST 2024-06-20 12:13:27 +02:00
6f09cf9bed Added KOST taxonomy. 2024-06-10 15:27:15 +02:00
e58faf5604 Added instructions 2024-03-21 16:00:09 +01:00
9bb9c5669d Updated Java to 21 and pom.xml to shade 2024-03-19 09:40:46 +01:00
30b848d853 Added no GUI option 2022-06-02 09:21:56 +02:00
682beabdcb Updated About 2021-03-25 10:59:40 +01:00
42 changed files with 1374 additions and 156 deletions

3
.gitignore vendored
View File

@@ -164,3 +164,6 @@ $RECYCLE.BIN/
src/main/resources/translation_external/
src/main/resources/translations_backup/
shade
TEMP
data

View File

@@ -1,28 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<module org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule="true" type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_8">
<output url="file://$MODULE_DIR$/target/classes" />
<output-test url="file://$MODULE_DIR$/target/test-classes" />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/main/resources" type="java-resource" />
<sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" />
<excludeFolder url="file://$MODULE_DIR$/target" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" name="Maven: commons-io:commons-io:2.5" level="project" />
<orderEntry type="library" name="Maven: org.apache.commons:commons-lang3:3.6" level="project" />
<orderEntry type="library" name="Maven: com.googlecode.json-simple:json-simple:1.1.1" level="project" />
<orderEntry type="library" name="Maven: junit:junit:4.10" level="project" />
<orderEntry type="library" name="Maven: org.hamcrest:hamcrest-core:1.1" level="project" />
<orderEntry type="library" name="Maven: org.apache.commons:commons-csv:1.4" level="project" />
<orderEntry type="library" name="Maven: org.controlsfx:controlsfx:8.40.13" level="project" />
<orderEntry type="library" name="Maven: org.rocksdb:rocksdbjni:5.7.3" level="project" />
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-api:2.9.0" level="project" />
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-core:2.9.0" level="project" />
<orderEntry type="library" name="Maven: org.kordamp.ikonli:ikonli-fontawesome-pack:1.9.0" level="project" />
<orderEntry type="library" name="Maven: org.kordamp.ikonli:ikonli-core:1.9.0" level="project" />
<orderEntry type="library" name="Maven: org.kordamp.ikonli:ikonli-javafx:1.9.0" level="project" />
</component>
</module>

11
build_instructions.md Normal file
View File

@@ -0,0 +1,11 @@
# Build a jar
```shell
mvn package
```
- results are in shade folder
# Build executable using Launch4j
- Install Java on Windows
- Run Launch4j (download first) and create executable
- Copy the JRE from your computer into a `jre` folder that should be in the same folder as `list.exe`

22
configs/config_characters.json Executable file
View File

@@ -0,0 +1,22 @@
{
"language": "SL",
"corpusLocation": "target/classes/Gigafida_subset",
"readHeaderInfo": false,
"resultsLocation": "tmp",
"selectReader": "XML (Gigafida 1.0, Kres 1.0)",
"outputName": "",
"punctuation": "comma",
"tab": "characters",
"stringLength": 1,
"calculateFor": "calculateFor.WORD",
"displayTaxonomy": false,
"msd": "",
"taxonomySetOperation": "taxonomySetOperation.UNION",
"taxonomy": ["SSJ.T.K.L - tisk-knjižno-leposlovno", "SSJ.T.K.L - tisk-knjižno-leposlovno"],
"minimalOccurrences": 1,
"minimalTaxonomy": 1
}

View File

@@ -0,0 +1,21 @@
"language": String - options: "SL", "EN"
"corpusLocation": String - path to input location.
"readHeaderInfo": Boolean - read taxonomy from corpus files
"resultsLocation": String - path to results location
"selectReader": String - options: "VERT + REGI", "XML (Šolar 1.0)", "XML (GOS 1.0)", "XML (ssj500k 2.1)", "XML (Gigafida 2.0)", "XML (Gigafida 1.0, Kres 1.0)", corpusType = GIGAFIDA
"outputName": String - Output file name
"punctuation": String - options: "comma", "point"
"tab": String - options: "characters", "wordParts", "words", "wordSets"
"stringLength": int - Number of characters
"calculateFor": String - options: "calculateFor.WORD", "calculateFor.LOWERCASE_WORD", "calculateFor.NORMALIZED_WORD", "calculateFor.LEMMA", "calculateFor.MORPHOSYNTACTIC_SPECS", "calculateFor.MORPHOSYNTACTIC_PROPERTY", "calculateFor.WORD_TYPE", "calculateFor.DIST_WORDS", "calculateFor.DIST_LEMMAS"
"displayTaxonomy": Boolean - Display taxonomy in output
"msd": String - A valid MSD (or empty)
"taxonomySetOperation": String - options: "taxonomySetOperation.UNION", "taxonomySetOperation.INTERSECTION"
"taxonomy": array of Strings - options: "SSJ.T - tisk", " SSJ.T.K - tisk-knjižno", " SSJ.T.K.L - tisk-knjižno-leposlovno", " SSJ.T.K.S - tisk-knjižno-strokovno", " SSJ.T.P - tisk-periodično", " SSJ.T.P.C - tisk-periodično-časopis", " SSJ.T.P.R - tisk-periodično-revija", " SSJ.T.D - tisk-drugo", "SSJ.I - internet", "Ft.P - prenosnik", " Ft.P.G - prenosnik-govorni", " Ft.P.E - prenosnik-elektronski", " Ft.P.P - prenosnik-pisni", " Ft.P.P.O - prenosnik-pisni-objavljeno", " Ft.P.P.O.K - prenosnik-pisni-objavljeno-knjižno", " Ft.P.P.O.P - prenosnik-pisni-objavljeno-periodično", " Ft.P.P.O.P.C - prenosnik-pisni-objavljeno-periodično-časopisno", " Ft.P.P.O.P.C.D - prenosnik-pisni-objavljeno-periodično-časopisno-dnevno", " Ft.P.P.O.P.C.V - prenosnik-pisni-objavljeno-periodično-časopisno-večkrat tedensko", " Ft.P.P.O.P.C.T - prenosnik-pisni-objavljeno-periodično-časopisno-tedensko", " Ft.P.P.O.P.R - prenosnik-pisni-objavljeno-periodično-revialno", " Ft.P.P.O.P.R.T - prenosnik-pisni-objavljeno-periodično-revialno-tedensko", " Ft.P.P.O.P.R.S - prenosnik-pisni-objavljeno-periodično-revialno-štirinajstdnevno", " Ft.P.P.O.P.R.M - prenosnik-pisni-objavljeno-periodično-revialno-mesečno", " Ft.P.P.O.P.R.D - prenosnik-pisni-objavljeno-periodično-revialno-redkeje kot na mesec", " Ft.P.P.O.P.R.O - prenosnik-pisni-objavljeno-periodično-revialno-občasno", " Ft.P.P.N - prenosnik-pisni-neobjavljeno", " Ft.P.P.N.J - prenosnik-pisni-neobjavljeno-javno", " Ft.P.P.N.I - prenosnik-pisni-neobjavljeno-interno", " Ft.P.P.N.Z - prenosnik-pisni-neobjavljeno-zasebno", "Ft.Z - zvrst", " Ft.Z.U - zvrst-umetnostna", " Ft.Z.U.P - zvrst-umetnostna-pesniška", " Ft.Z.U.R - zvrst-umetnostna-prozna", " Ft.Z.U.D - zvrst-umetnostna-dramska", " Ft.Z.N - zvrst-neumetnostna", " Ft.Z.N.S - zvrst-neumetnostna-strokovna", " Ft.Z.N.S.H - zvrst-neumetnostna-strokovna-humanistična in družboslovna", " Ft.Z.N.S.N - zvrst-neumetnostna-strokovna-naravoslovna in tehnična", " Ft.Z.N.N - zvrst-neumetnostna-nestrokovna", " 
Ft.Z.N.P - zvrst-neumetnostna-pravna", "Ft.L - zvrst-lektorirano", " Ft.L.D - zvrst-lektorirano-da", " Ft.L.N - zvrst-lektorirano-ne", "gos.T - diskurz", " gos.T.J - diskurz-javni", " gos.T.J.I - diskurz-javni-informativno-izobraževalni", " gos.T.J.R - diskurz-javni-razvedrilni", " gos.T.N - diskurz-nejavni", " gos.T.N.N - diskurz-nejavni-nezasebni", " gos.T.N.Z - diskurz-nejavni-zasebni", "gos.S - situacija", " gos.S.R - situacija-radio", " gos.S.T - situacija-televizija", "gos.K - kanal", " gos.K.O - kanal-osebni stik", " gos.K.P - kanal-telefon", " gos.K.R - kanal-radio", " gos.K.T - kanal-televizija"
"minimalOccurrences": int - Minimal number of occurrences
"minimalTaxonomy": int - Minimal number of taxonomy branches

27
configs/config_wordParts.json Executable file
View File

@@ -0,0 +1,27 @@
{
"language": "SL",
"corpusLocation": "target/classes/Gigafida_subset",
"readHeaderInfo": false,
"resultsLocation": "tmp",
"selectReader": "XML (Gigafida 1.0, Kres 1.0)",
"outputName": "",
"punctuation": "comma",
"tab": "wordParts",
"calculateFor": "calculateFor.WORD",
"alsoVisualize": ["calculateFor.LEMMA"],
"displayTaxonomy": false,
"prefixLength": 1,
"suffixLength": 0,
"prefixList": [],
"suffixList": [],
"msd": "",
"taxonomySetOperation": "taxonomySetOperation.UNION",
"taxonomy": [],
"minimalOccurrences": 1,
"minimalTaxonomy": 1,
"minimalRelFre": 1
}

View File

@@ -0,0 +1,26 @@
"language": String - options: "SL", "EN"
"corpusLocation": String - path to input location.
"readHeaderInfo": Boolean - read taxonomy from corpus files
"resultsLocation": String - path to results location
"selectReader": String - options: "VERT + REGI", "XML (Šolar 1.0)", "XML (GOS 1.0)", "XML (ssj500k 2.1)", "XML (Gigafida 2.0)", "XML (Gigafida 1.0, Kres 1.0)", corpusType = GIGAFIDA
"outputName": String - Output file name
"punctuation": String - options: "comma", "point"
"tab": String - options: "characters", "wordParts", "words", "wordSets"
"calculateFor": String - options: "calculateFor.WORD", "calculateFor.LOWERCASE_WORD", "calculateFor.NORMALIZED_WORD", "calculateFor.LEMMA", "calculateFor.MORPHOSYNTACTIC_SPECS", "calculateFor.MORPHOSYNTACTIC_PROPERTY", "calculateFor.WORD_TYPE", "calculateFor.DIST_WORDS", "calculateFor.DIST_LEMMAS"
"alsoVisualize": array of Strings - options: "calculateFor.WORD", "calculateFor.LOWERCASE_WORD", "calculateFor.LEMMA", "calculateFor.NORMALIZED_WORD", "calculateFor.WORD_TYPE", "calculateFor.MORPHOSYNTACTIC_SPECS"
"displayTaxonomy": Boolean - Display taxonomy in output
"prefixLength": int - prefix length
"suffixLength": int - suffix length
"prefixList": array of Strings - write different options in array
"suffixList": array of Strings - write different options in array
"msd": String - A valid MSD (or empty)
"taxonomySetOperation": String - options: "taxonomySetOperation.UNION", "taxonomySetOperation.INTERSECTION"
"taxonomy": array of Strings - options: "SSJ.T - tisk", " SSJ.T.K - tisk-knjižno", " SSJ.T.K.L - tisk-knjižno-leposlovno", " SSJ.T.K.S - tisk-knjižno-strokovno", " SSJ.T.P - tisk-periodično", " SSJ.T.P.C - tisk-periodično-časopis", " SSJ.T.P.R - tisk-periodično-revija", " SSJ.T.D - tisk-drugo", "SSJ.I - internet", "Ft.P - prenosnik", " Ft.P.G - prenosnik-govorni", " Ft.P.E - prenosnik-elektronski", " Ft.P.P - prenosnik-pisni", " Ft.P.P.O - prenosnik-pisni-objavljeno", " Ft.P.P.O.K - prenosnik-pisni-objavljeno-knjižno", " Ft.P.P.O.P - prenosnik-pisni-objavljeno-periodično", " Ft.P.P.O.P.C - prenosnik-pisni-objavljeno-periodično-časopisno", " Ft.P.P.O.P.C.D - prenosnik-pisni-objavljeno-periodično-časopisno-dnevno", " Ft.P.P.O.P.C.V - prenosnik-pisni-objavljeno-periodično-časopisno-večkrat tedensko", " Ft.P.P.O.P.C.T - prenosnik-pisni-objavljeno-periodično-časopisno-tedensko", " Ft.P.P.O.P.R - prenosnik-pisni-objavljeno-periodično-revialno", " Ft.P.P.O.P.R.T - prenosnik-pisni-objavljeno-periodično-revialno-tedensko", " Ft.P.P.O.P.R.S - prenosnik-pisni-objavljeno-periodično-revialno-štirinajstdnevno", " Ft.P.P.O.P.R.M - prenosnik-pisni-objavljeno-periodično-revialno-mesečno", " Ft.P.P.O.P.R.D - prenosnik-pisni-objavljeno-periodično-revialno-redkeje kot na mesec", " Ft.P.P.O.P.R.O - prenosnik-pisni-objavljeno-periodično-revialno-občasno", " Ft.P.P.N - prenosnik-pisni-neobjavljeno", " Ft.P.P.N.J - prenosnik-pisni-neobjavljeno-javno", " Ft.P.P.N.I - prenosnik-pisni-neobjavljeno-interno", " Ft.P.P.N.Z - prenosnik-pisni-neobjavljeno-zasebno", "Ft.Z - zvrst", " Ft.Z.U - zvrst-umetnostna", " Ft.Z.U.P - zvrst-umetnostna-pesniška", " Ft.Z.U.R - zvrst-umetnostna-prozna", " Ft.Z.U.D - zvrst-umetnostna-dramska", " Ft.Z.N - zvrst-neumetnostna", " Ft.Z.N.S - zvrst-neumetnostna-strokovna", " Ft.Z.N.S.H - zvrst-neumetnostna-strokovna-humanistična in družboslovna", " Ft.Z.N.S.N - zvrst-neumetnostna-strokovna-naravoslovna in tehnična", " Ft.Z.N.N - zvrst-neumetnostna-nestrokovna", " 
Ft.Z.N.P - zvrst-neumetnostna-pravna", "Ft.L - zvrst-lektorirano", " Ft.L.D - zvrst-lektorirano-da", " Ft.L.N - zvrst-lektorirano-ne", "gos.T - diskurz", " gos.T.J - diskurz-javni", " gos.T.J.I - diskurz-javni-informativno-izobraževalni", " gos.T.J.R - diskurz-javni-razvedrilni", " gos.T.N - diskurz-nejavni", " gos.T.N.N - diskurz-nejavni-nezasebni", " gos.T.N.Z - diskurz-nejavni-zasebni", "gos.S - situacija", " gos.S.R - situacija-radio", " gos.S.T - situacija-televizija", "gos.K - kanal", " gos.K.O - kanal-osebni stik", " gos.K.P - kanal-telefon", " gos.K.R - kanal-radio", " gos.K.T - kanal-televizija"
"minimalOccurrences": int - Minimal number of occurrences
"minimalTaxonomy": int - Minimal number of taxonomy branches
"minimalRelFre": int - Minimal relative frequency

27
configs/config_wordSets.json Executable file
View File

@@ -0,0 +1,27 @@
{
"language": "SL",
"corpusLocation": "target/classes/Gigafida_subset",
"readHeaderInfo": false,
"resultsLocation": "tmp",
"selectReader": "XML (Gigafida 1.0, Kres 1.0)",
"outputName": "",
"punctuation": "comma",
"tab": "wordSets",
"calculateFor": "calculateFor.WORD",
"alsoVisualize": ["calculateFor.MORPHOSYNTACTIC_SPECS"],
"displayTaxonomy": false,
"ngramValue": 2,
"skipValue": 0,
"notePunctuations": false,
"collocability": ["Dice"],
"msd": "Sozei Sozei",
"taxonomySetOperation": "taxonomySetOperation.UNION",
"taxonomy": [],
"minimalOccurrences": 1,
"minimalTaxonomy": 1,
"minimalRelFre": 1
}

View File

@@ -0,0 +1,26 @@
"language": String - options: "SL", "EN"
"corpusLocation": String - path to input location.
"readHeaderInfo": Boolean - read taxonomy from corpus files
"resultsLocation": String - path to results location
"selectReader": String - options: "VERT + REGI", "XML (Šolar 1.0)", "XML (GOS 1.0)", "XML (ssj500k 2.1)", "XML (Gigafida 2.0)", "XML (Gigafida 1.0, Kres 1.0)", corpusType = GIGAFIDA
"outputName": String - Output file name
"punctuation": String - options: "comma", "point"
"tab": String - options: "characters", "wordParts", "words", "wordSets"
"calculateFor": String - options: "calculateFor.WORD", "calculateFor.LOWERCASE_WORD", "calculateFor.NORMALIZED_WORD", "calculateFor.LEMMA", "calculateFor.MORPHOSYNTACTIC_SPECS", "calculateFor.MORPHOSYNTACTIC_PROPERTY", "calculateFor.WORD_TYPE", "calculateFor.DIST_WORDS", "calculateFor.DIST_LEMMAS"
"alsoVisualize": array of Strings - options: "calculateFor.WORD", "calculateFor.LOWERCASE_WORD", "calculateFor.LEMMA", "calculateFor.NORMALIZED_WORD", "calculateFor.WORD_TYPE", "calculateFor.MORPHOSYNTACTIC_SPECS"
"displayTaxonomy": Boolean - Display taxonomy in output
"ngramValue": int - N-gram length
"skipValue": int - Maximum number of words that can appear between two words and word set
"notePunctuations": Boolean - The output will also include parts of morphosyntactic tag
"collocability": array of Strings - options: "Dice", "t-score", "MI", "MI3", "logDice", "simple LL"
"msd": String - A valid MSD (or empty)
"taxonomySetOperation": String - options: "taxonomySetOperation.UNION", "taxonomySetOperation.INTERSECTION"
"taxonomy": array of Strings - options: "SSJ.T - tisk", " SSJ.T.K - tisk-knjižno", " SSJ.T.K.L - tisk-knjižno-leposlovno", " SSJ.T.K.S - tisk-knjižno-strokovno", " SSJ.T.P - tisk-periodično", " SSJ.T.P.C - tisk-periodično-časopis", " SSJ.T.P.R - tisk-periodično-revija", " SSJ.T.D - tisk-drugo", "SSJ.I - internet", "Ft.P - prenosnik", " Ft.P.G - prenosnik-govorni", " Ft.P.E - prenosnik-elektronski", " Ft.P.P - prenosnik-pisni", " Ft.P.P.O - prenosnik-pisni-objavljeno", " Ft.P.P.O.K - prenosnik-pisni-objavljeno-knjižno", " Ft.P.P.O.P - prenosnik-pisni-objavljeno-periodično", " Ft.P.P.O.P.C - prenosnik-pisni-objavljeno-periodično-časopisno", " Ft.P.P.O.P.C.D - prenosnik-pisni-objavljeno-periodično-časopisno-dnevno", " Ft.P.P.O.P.C.V - prenosnik-pisni-objavljeno-periodično-časopisno-večkrat tedensko", " Ft.P.P.O.P.C.T - prenosnik-pisni-objavljeno-periodično-časopisno-tedensko", " Ft.P.P.O.P.R - prenosnik-pisni-objavljeno-periodično-revialno", " Ft.P.P.O.P.R.T - prenosnik-pisni-objavljeno-periodično-revialno-tedensko", " Ft.P.P.O.P.R.S - prenosnik-pisni-objavljeno-periodično-revialno-štirinajstdnevno", " Ft.P.P.O.P.R.M - prenosnik-pisni-objavljeno-periodično-revialno-mesečno", " Ft.P.P.O.P.R.D - prenosnik-pisni-objavljeno-periodično-revialno-redkeje kot na mesec", " Ft.P.P.O.P.R.O - prenosnik-pisni-objavljeno-periodično-revialno-občasno", " Ft.P.P.N - prenosnik-pisni-neobjavljeno", " Ft.P.P.N.J - prenosnik-pisni-neobjavljeno-javno", " Ft.P.P.N.I - prenosnik-pisni-neobjavljeno-interno", " Ft.P.P.N.Z - prenosnik-pisni-neobjavljeno-zasebno", "Ft.Z - zvrst", " Ft.Z.U - zvrst-umetnostna", " Ft.Z.U.P - zvrst-umetnostna-pesniška", " Ft.Z.U.R - zvrst-umetnostna-prozna", " Ft.Z.U.D - zvrst-umetnostna-dramska", " Ft.Z.N - zvrst-neumetnostna", " Ft.Z.N.S - zvrst-neumetnostna-strokovna", " Ft.Z.N.S.H - zvrst-neumetnostna-strokovna-humanistična in družboslovna", " Ft.Z.N.S.N - zvrst-neumetnostna-strokovna-naravoslovna in tehnična", " Ft.Z.N.N - zvrst-neumetnostna-nestrokovna", " 
Ft.Z.N.P - zvrst-neumetnostna-pravna", "Ft.L - zvrst-lektorirano", " Ft.L.D - zvrst-lektorirano-da", " Ft.L.N - zvrst-lektorirano-ne", "gos.T - diskurz", " gos.T.J - diskurz-javni", " gos.T.J.I - diskurz-javni-informativno-izobraževalni", " gos.T.J.R - diskurz-javni-razvedrilni", " gos.T.N - diskurz-nejavni", " gos.T.N.N - diskurz-nejavni-nezasebni", " gos.T.N.Z - diskurz-nejavni-zasebni", "gos.S - situacija", " gos.S.R - situacija-radio", " gos.S.T - situacija-televizija", "gos.K - kanal", " gos.K.O - kanal-osebni stik", " gos.K.P - kanal-telefon", " gos.K.R - kanal-radio", " gos.K.T - kanal-televizija"
"minimalOccurrences": int - Minimal number of occurrences
"minimalTaxonomy": int - Minimal number of taxonomy branches
"minimalRelFre": int - Minimal relative frequency

25
configs/config_words.json Executable file
View File

@@ -0,0 +1,25 @@
{
"language": "SL",
"corpusLocation": "target/classes/Gigafida_minimal/gfmin.xml",
"readHeaderInfo": false,
"resultsLocation": "tmp",
"selectReader": "XML (Gigafida 1.0, Kres 1.0)",
"outputName": "",
"punctuation": "comma",
"tab": "words",
"calculateFor": "calculateFor.WORD",
"alsoVisualize": ["calculateFor.LEMMA"],
"displayTaxonomy": false,
"notePunctuations": false,
"writeMsdAtTheEnd": false,
"msd": "",
"taxonomySetOperation": "taxonomySetOperation.UNION",
"taxonomy": [" SSJ.T.K.S - tisk-knjižno-strokovno"],
"minimalOccurrences": 1,
"minimalTaxonomy": 1,
"minimalRelFre": 1
}

View File

@@ -0,0 +1,24 @@
"language": String - options: "SL", "EN"
"corpusLocation": String - path to input location.
"readHeaderInfo": Boolean - read taxonomy from corpus files
"resultsLocation": String - path to results location
"selectReader": String - options: "VERT + REGI", "XML (Šolar 1.0)", "XML (GOS 1.0)", "XML (ssj500k 2.1)", "XML (Gigafida 2.0)", "XML (Gigafida 1.0, Kres 1.0)", corpusType = GIGAFIDA
"outputName": String - Output file name
"punctuation": String - options: "comma", "point"
"tab": String - options: "characters", "wordParts", "words", "wordSets"
"calculateFor": String - options: "calculateFor.WORD", "calculateFor.LOWERCASE_WORD", "calculateFor.NORMALIZED_WORD", "calculateFor.LEMMA", "calculateFor.MORPHOSYNTACTIC_SPECS", "calculateFor.MORPHOSYNTACTIC_PROPERTY", "calculateFor.WORD_TYPE", "calculateFor.DIST_WORDS", "calculateFor.DIST_LEMMAS"
"alsoVisualize": array of Strings - options: "calculateFor.WORD", "calculateFor.LOWERCASE_WORD", "calculateFor.LEMMA", "calculateFor.NORMALIZED_WORD", "calculateFor.WORD_TYPE", "calculateFor.MORPHOSYNTACTIC_SPECS"
"displayTaxonomy": Boolean - Display taxonomy in output
"notePunctuations": Boolean - The output will also include parts of morphosyntactic tag
"writeMsdAtTheEnd": Boolean - Word sets will include punctuations
"msd": String - A valid MSD (or empty)
"taxonomySetOperation": String - options: "taxonomySetOperation.UNION", "taxonomySetOperation.INTERSECTION"
"taxonomy": array of Strings - options: "SSJ.T - tisk", " SSJ.T.K - tisk-knjižno", " SSJ.T.K.L - tisk-knjižno-leposlovno", " SSJ.T.K.S - tisk-knjižno-strokovno", " SSJ.T.P - tisk-periodično", " SSJ.T.P.C - tisk-periodično-časopis", " SSJ.T.P.R - tisk-periodično-revija", " SSJ.T.D - tisk-drugo", "SSJ.I - internet", "Ft.P - prenosnik", " Ft.P.G - prenosnik-govorni", " Ft.P.E - prenosnik-elektronski", " Ft.P.P - prenosnik-pisni", " Ft.P.P.O - prenosnik-pisni-objavljeno", " Ft.P.P.O.K - prenosnik-pisni-objavljeno-knjižno", " Ft.P.P.O.P - prenosnik-pisni-objavljeno-periodično", " Ft.P.P.O.P.C - prenosnik-pisni-objavljeno-periodično-časopisno", " Ft.P.P.O.P.C.D - prenosnik-pisni-objavljeno-periodično-časopisno-dnevno", " Ft.P.P.O.P.C.V - prenosnik-pisni-objavljeno-periodično-časopisno-večkrat tedensko", " Ft.P.P.O.P.C.T - prenosnik-pisni-objavljeno-periodično-časopisno-tedensko", " Ft.P.P.O.P.R - prenosnik-pisni-objavljeno-periodično-revialno", " Ft.P.P.O.P.R.T - prenosnik-pisni-objavljeno-periodično-revialno-tedensko", " Ft.P.P.O.P.R.S - prenosnik-pisni-objavljeno-periodično-revialno-štirinajstdnevno", " Ft.P.P.O.P.R.M - prenosnik-pisni-objavljeno-periodično-revialno-mesečno", " Ft.P.P.O.P.R.D - prenosnik-pisni-objavljeno-periodično-revialno-redkeje kot na mesec", " Ft.P.P.O.P.R.O - prenosnik-pisni-objavljeno-periodično-revialno-občasno", " Ft.P.P.N - prenosnik-pisni-neobjavljeno", " Ft.P.P.N.J - prenosnik-pisni-neobjavljeno-javno", " Ft.P.P.N.I - prenosnik-pisni-neobjavljeno-interno", " Ft.P.P.N.Z - prenosnik-pisni-neobjavljeno-zasebno", "Ft.Z - zvrst", " Ft.Z.U - zvrst-umetnostna", " Ft.Z.U.P - zvrst-umetnostna-pesniška", " Ft.Z.U.R - zvrst-umetnostna-prozna", " Ft.Z.U.D - zvrst-umetnostna-dramska", " Ft.Z.N - zvrst-neumetnostna", " Ft.Z.N.S - zvrst-neumetnostna-strokovna", " Ft.Z.N.S.H - zvrst-neumetnostna-strokovna-humanistična in družboslovna", " Ft.Z.N.S.N - zvrst-neumetnostna-strokovna-naravoslovna in tehnična", " Ft.Z.N.N - zvrst-neumetnostna-nestrokovna", " 
Ft.Z.N.P - zvrst-neumetnostna-pravna", "Ft.L - zvrst-lektorirano", " Ft.L.D - zvrst-lektorirano-da", " Ft.L.N - zvrst-lektorirano-ne", "gos.T - diskurz", " gos.T.J - diskurz-javni", " gos.T.J.I - diskurz-javni-informativno-izobraževalni", " gos.T.J.R - diskurz-javni-razvedrilni", " gos.T.N - diskurz-nejavni", " gos.T.N.N - diskurz-nejavni-nezasebni", " gos.T.N.Z - diskurz-nejavni-zasebni", "gos.S - situacija", " gos.S.R - situacija-radio", " gos.S.T - situacija-televizija", "gos.K - kanal", " gos.K.O - kanal-osebni stik", " gos.K.P - kanal-telefon", " gos.K.R - kanal-radio", " gos.K.T - kanal-televizija"
"minimalOccurrences": int - Minimal number of occurrences
"minimalTaxonomy": int - Minimal number of taxonomy branches
"minimalRelFre": int - Minimal relative frequency

24
instructions.md Normal file
View File

@@ -0,0 +1,24 @@
# Instructions
Instructions on how to run list.
## Windows
There are two options.
### Run list.exe
The easier option is to download list.zip, extract it and run list.exe.
### Run list.jar
To do this you first need to install the correct version of java (JDK). The program was developed and tested on [JDK Development Kit 21.0.2](https://www.oracle.com/java/technologies/downloads/#java21).
If you already have another version of Java installed, you might have to delete the previous version before you install this one.
Secondly, you may run list using `run.bat` which will run `list.jar` for you.
## Linux
### Run list.jar
Similarly to running list.jar on Windows, you first have to make sure that you have the appropriate version of Java installed. The program was developed and tested on [JDK Development Kit 21.0.2](https://www.oracle.com/java/technologies/downloads/#java21).
If you already have another version of Java installed, you might have to delete the previous version before you install this one.
Next, you may run list using `run.sh` which will run `list.jar` for you.

0
license Normal file → Executable file
View File

128
pom.xml
View File

@@ -4,9 +4,30 @@
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>thesis</groupId>
<artifactId>corpus-analyzer</artifactId>
<version>1.2</version>
<groupId>list</groupId>
<artifactId>list</artifactId>
<name>list</name>
<version>1.3</version>
<repositories>
<repository>
<id>central</id>
<name>Central Repository</name>
<url>https://repo.maven.apache.org/maven2/</url>
</repository>
<repository>
<id>central2</id>
<name>Central Repository2</name>
<url>https://repo1.maven.org/maven2/</url>
</repository>
</repositories>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.release>17</maven.compiler.release>
<javafx.version>21</javafx.version>
<ikonli.version>12.3.1</ikonli.version>
<javafx.maven.plugin.version>0.0.8</javafx.maven.plugin.version>
</properties>
<dependencies>
<dependency>
@@ -32,7 +53,7 @@
<dependency>
<groupId>org.controlsfx</groupId>
<artifactId>controlsfx</artifactId>
<version>8.40.13</version>
<version>11.2.0</version>
</dependency>
<dependency>
<groupId>org.rocksdb</groupId>
@@ -52,71 +73,82 @@
<dependency>
<groupId>org.kordamp.ikonli</groupId>
<artifactId>ikonli-fontawesome-pack</artifactId>
<version>1.9.0</version>
<version>${ikonli.version}</version>
</dependency>
<dependency>
<groupId>org.openjfx</groupId>
<artifactId>javafx-controls</artifactId>
<version>${javafx.version}</version>
</dependency>
<dependency>
<groupId>org.openjfx</groupId>
<artifactId>javafx-fxml</artifactId>
<version>${javafx.version}</version>
</dependency>
<dependency>
<groupId>org.openjfx</groupId>
<artifactId>javafx-graphics</artifactId>
<version>${javafx.version}</version>
<classifier>win</classifier>
</dependency>
<dependency>
<groupId>org.openjfx</groupId>
<artifactId>javafx-graphics</artifactId>
<version>${javafx.version}</version>
<classifier>linux</classifier>
</dependency>
<dependency>
<groupId>org.openjfx</groupId>
<artifactId>javafx-graphics</artifactId>
<version>${javafx.version}</version>
<classifier>mac</classifier>
</dependency>
<dependency>
<groupId>org.kordamp.ikonli</groupId>
<artifactId>ikonli-javafx</artifactId>
<version>1.9.0</version>
<version>${ikonli.version}</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<!-- packages dependencies into the jar -->
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.11.0</version>
</plugin>
<plugin>
<groupId>org.openjfx</groupId>
<artifactId>javafx-maven-plugin</artifactId>
<version>${javafx.maven.plugin.version}</version>
<configuration>
<mainClass>gui.GUIController</mainClass>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>3.2.0</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>single</goal>
<goal>shade</goal>
</goals>
<configuration>
<archive>
<manifest>
<mainClass>gui.GUIController</mainClass>
</manifest>
</archive>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
<appendAssemblyId>false</appendAssemblyId>
<outputDirectory>artifact</outputDirectory>
<finalName>Corpus_Analyzer_${version}</finalName>
<shadedArtifactAttached>true</shadedArtifactAttached>
<shadedClassifierName>project-classifier</shadedClassifierName>
<outputFile>shade\${project.artifactId}.jar</outputFile>
<transformers>
<transformer implementation=
"org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>gui.Launcher</mainClass>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<!-- JavaFX -->
<groupId>com.zenjava</groupId>
<artifactId>javafx-maven-plugin</artifactId>
<version>8.8.3</version>
<configuration>
<mainClass>gui.GUIController</mainClass>
<verbose>true</verbose>
</configuration>
<executions>
<execution>
<id>create-jfxjar</id>
<phase>package</phase>
<goals>
<goal>build-jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
</plugins>
</build>
</project>

18
readme.md Normal file → Executable file
View File

@@ -1,10 +1,10 @@
(English version below)
LIST, korpusni luščilnik
Različica: 1.0 (Zadnja posodobitev: 21. marec 2019)
Različica: 1.3 (Zadnja posodobitev: 28. avgust 2024)
Avtorji: Luka Krsnik, Špela Arhar Holdt, Jaka Čibej, Kaja Dobrovoljc, Aleksander Ključevšek, Simon Krek, Marko Robnik Šikonja
Korpusni luščilnik LIST je program za luščenje spiskov iz besedilnih korpusov na nivojih znakov, besednih delov, besed in besednih nizov. Nastal je v okviru projekta Nova slovnica sodobne standardne slovenščine: viri in metode (J6-8256), ki ga je med letoma 2017 in 2020 sofinancirala Javna agencija za raziskovalno dejavnost Republike Slovenije iz državnega proračuna. Raziskovalni program Jezikovni viri in tehnologije za slovenski jezik (št. P6-0411) je sofinancirala Javna agencija za raziskovalno dejavnost Republike Slovenije iz državnega proračuna.
Korpusni luščilnik LIST je program za luščenje spiskov iz besedilnih korpusov na nivojih znakov, besednih delov, besed in besednih nizov. Nastal je v okviru projektov Nova slovnica sodobne standardne slovenščine: viri in metode (J6-8256), Empirična podlaga za digitalno podprt razvoj pisne jezikovne zmožnosti (J7-3159) in raziskovalnega programa Jezikovni viri in tehnologije za slovenski jezik (št. P6-0411), ki jih financira Javna agencija za znanstvenoraziskovalno in inovacijsko dejavnost Republike Slovenije (ARIS) iz državnega proračuna.
Izdajatelj: Center za jezikovne vire in tehnologije Univerze v Ljubljani,
Institut "Jožef Stefan",
@@ -15,19 +15,16 @@ Program je dostopen pod licenco MIT License na repozitorijih CLARIN.SI (http://h
NAVODILA ZA NAMESTITEV IN ZAGON:
1) Pred uporabo programske opreme mora biti na računalniku nameščena 64-bitna java (https://java.com/en/download/manual.jsp).
2) Vse tri programske datoteke (run.sh, run.bat, list1.0.jar) skopiramo v poljubno mapo.
3) Program zaženemo z dvoklikom na datoteko run.bat na operacijskem sistemu Windows ali run.sh na operacijskem sistemu Linux.
4) Ko izbiramo lokacijo korpusa, moramo poskrbeti, da v mapi ni datotek več različnih korpusov.
Datoteko list.zip razširimo in poženemo `list.exe` znotraj razširjene mape. Druge možnosti so opisane v [razširjeni dokumentaciji](instructions.md).
---------
LIST Corpus Extraction Tool
Version: 1.0 (Last update: 21 March 2019)
Version: 1.3 (Last update: 28 August 2024)
Authors: Luka Krsnik, Špela Arhar Holdt, Jaka Čibej, Kaja Dobrovoljc, Aleksander Ključevšek, Simon Krek, Marko Robnik Šikonja
The LIST corpus extraction tool is a program for extracting lists from text corpora on the levels of characters, word parts, words, and word sets. The program was developed within the New Grammar of Modern Standard Slovene: Resource and Methods project (J6-8256), which was financially supported by the Slovenian Research Agency between 2017 and 2020. The authors acknowledge the financial support from the Slovenian Research Agency (research core funding No. P6-0411 Language Resources and Technologies for Slovene).
The LIST corpus extraction tool is a program for extracting lists from text corpora on the levels of characters, word parts, words, and word sets. The program was developed within the New Grammar of Modern Standard Slovene: Resource and Methods project (J6-8256), the Empirical foundations for digitally-supported development of writing skills project (J7-3159) and the Language Resources and Technologies for Slovene programme (P6-0411), all financed by the Slovenian Research and Innovation Agency (ARIS).
Publisher: Centre for Language Resources and Technologies, University of Ljubljana,
Jožef Stefan Institute,
@@ -38,7 +35,4 @@ The program is available under the MIT License at CLARIN.SI (http://hdl.handle.n
INSTRUCTIONS FOR INSTALLATION AND USE:
1) Make sure that 64-bit java is installed on your computer (https://java.com/en/download/manual.jsp).
2) Copy all three program files (run.sh, run.bat, list1.0.jar) in a single folder.
3) Run the program by double-clicking the run.bat file on a Windows operating system or run.sh on Linux.
4) When selecting the location of the corpus, make sure the folder does not include files of multiple different corpora.
Extract list.zip file and run list.exe. For other options please read [detailed instructions](instructions.md).

View File

@@ -50,7 +50,8 @@ public class XML_processing {
} else if (stats.getCorpus().getCorpusType() == CorpusType.SOLAR) {
return readXMLSolar(path, stats);
} else if (stats.getCorpus().getCorpusType() == CorpusType.SSJ500K ||
stats.getCorpus().getCorpusType() == CorpusType.GIGAFIDA2) {
stats.getCorpus().getCorpusType() == CorpusType.GIGAFIDA2 ||
stats.getCorpus().getCorpusType() == CorpusType.KOST) {
return readXMLSSJ500K(path, stats);
} else if (stats.getCorpus().getCorpusType() == CorpusType.VERT) {
return readVERT(path, stats);
@@ -461,6 +462,8 @@ public class XML_processing {
HashMap<String, HashSet<String>> resultFilters = new HashMap<>();
// taxonomy corpora
HashSet<String> resultTaxonomy = new HashSet<>();
HashSet<String> taxonomyNames = new HashSet<String>(
Arrays.asList("FirstLang", "TaskSetting", "ProficSlv", "ProgramType", "InputType"));
String headTagName;
@@ -471,7 +474,7 @@ public class XML_processing {
// init results now to avoid null pointers
headTags.forEach(f -> resultFilters.put(f, new HashSet<>()));
} else if (corpusType == CorpusType.SSJ500K) {
} else if (corpusType == CorpusType.KOST || corpusType == CorpusType.SSJ500K) {
headTagName = "bibl";
} else {
headTagName = "teiHeader";
@@ -482,6 +485,9 @@ public class XML_processing {
try {
xmlEventReader = factory.createXMLEventReader(new FileInputStream(filepath));
boolean insideHeader = false;
boolean insideNote = false;
String filterName = "";
String filterValue = "";
while (xmlEventReader.hasNext()) {
XMLEvent xmlEvent = xmlEventReader.nextEvent();
@@ -495,6 +501,10 @@ public class XML_processing {
// this toggle is true when we're inside a header (next block of code executes)
// and false when we're not (skip reading unnecessary attributes)
insideHeader = true;
} else if (corpusType == CorpusType.KOST && elementName.equals("standOff") ||
corpusType == CorpusType.KOST && elementName.equals("TEI")
) {
return resultTaxonomy;
}
if (insideHeader) {
@@ -516,6 +526,11 @@ public class XML_processing {
.replace("#", "");
resultTaxonomy.add(tax);
// kost
} else if (parseTaxonomy && (corpusType == CorpusType.KOST) && elementName.equalsIgnoreCase("note")) {
filterName = startElement.getAttributeByName(QName.valueOf("ana"))
.getValue().replace("#", "");
insideNote = true;
// solar
} else if (!parseTaxonomy) {
boolean inHeadTags = false;
@@ -533,13 +548,22 @@ public class XML_processing {
}
}
}
} else if (xmlEvent.isEndElement() && corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
} else if (xmlEvent.isEndElement() && corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName) && (corpusType != CorpusType.KOST)) {
// if the corpus is split into multiple files, each with only one header block per file
// that means we should stop after we reach the end of the header
return parseTaxonomy ? resultTaxonomy : resultFilters;
} else if (xmlEvent.isEndElement() && !corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
// whole corpus in one file, so we have to continue reading in order to find all header blocks
insideHeader = false;
} else if (xmlEvent.isEndElement() && insideNote) {
if (taxonomyNames.contains(filterName)) {
Collections.addAll(resultTaxonomy, Taxonomy.format_KOST_taxonomy(filterValue, filterName));
}
insideNote = false;
} else if (xmlEvent.isCharacters() && insideNote) {
Characters characters = xmlEvent.asCharacters();
filterValue = characters.getData();
}
}
} catch (XMLStreamException e) {
@@ -726,6 +750,8 @@ public class XML_processing {
boolean inPunctuation = false;
boolean taxonomyMatch = true;
ArrayList<Taxonomy> currentFiletaxonomy = new ArrayList<>();
HashSet<String> taxonomyNames = new HashSet<String>(
Arrays.asList("FirstLang", "TaskSetting", "ProficSlv", "ProgramType", "InputType"));
String lemma = "";
String msd = "";
@@ -760,6 +786,9 @@ public class XML_processing {
try {
XMLInputFactory factory = XMLInputFactory.newInstance();
eventReader = factory.createXMLEventReader(new FileInputStream(path));
boolean insideNote = false;
String filterName = "";
String filterValue = "";
while (eventReader.hasNext()) {
int percentage = (int) (lineNum * 100.0 / numLines);
@@ -803,6 +832,12 @@ public class XML_processing {
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()).replace("#", ""), stats.getCorpus());
currentFiletaxonomy.add(currentFiletaxonomyElement);
}
// kost
} else if (stats.getCorpus().getCorpusType() == CorpusType.KOST && stats.getCorpus().getTaxonomy().size() > 0 && qName.equalsIgnoreCase("note")) {
filterName = startElement.getAttributeByName(QName.valueOf("ana"))
.getValue().replace("#", "");
insideNote = true;
} else if (stats.getCorpus().getTaxonomy().size() > 0 && qName.equalsIgnoreCase("catRef")) {
// get value from attribute target
Attribute tax = startElement.getAttributeByName(QName.valueOf("target"));
@@ -818,6 +853,10 @@ public class XML_processing {
} else if (qName.equals("text")){
taxonomyMatch = true;
} else if (stats.getCorpus().getCorpusType() == CorpusType.KOST && qName.equals("standOff") ||
stats.getCorpus().getCorpusType() == CorpusType.KOST && qName.equals("TEI")
) {
return true;
}
break;
@@ -836,6 +875,10 @@ public class XML_processing {
sentence.add(createWord(punctuation, punctuation, "/", punctuation, stats.getFilter()));
inPunctuation = false;
}
// kost
if (insideNote) {
filterValue = characters.getData();
}
break;
case XMLStreamConstants.END_ELEMENT:
@@ -876,7 +919,8 @@ public class XML_processing {
}
// fallback
else if (endElement.getName().getLocalPart().equalsIgnoreCase("div") &&
stats.getCorpus().getCorpusType() == CorpusType.SSJ500K) {
(stats.getCorpus().getCorpusType() == CorpusType.SSJ500K ||
stats.getCorpus().getCorpusType() == CorpusType.KOST)) {
// join corpus and stats
fj(corpus, stats);
corpus.clear();
@@ -892,7 +936,7 @@ public class XML_processing {
// taxonomies don't match so stop
// union (select words that match any of selected taxonomy
taxonomyMatch = false;
//
} else if(stats.getFilter().getTaxonomySetOperation().equals(I18N.get("taxonomySetOperation.INTERSECTION")) && currentFiletaxonomy.size() != stats.getFilter().getTaxonomy().size()){
// intersection (select only words that precisely match selected taxonomy
taxonomyMatch = false;
@@ -900,6 +944,17 @@ public class XML_processing {
}
} else if (endElement.getName().getLocalPart().equals("text")){
taxonomyMatch = false;
// kost
}
if (insideNote) {
if (taxonomyNames.contains(filterName)) {
for (String taxonomy : Taxonomy.format_KOST_taxonomy(filterValue, filterName)) {
// keep only taxonomy properties
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(taxonomy, stats.getCorpus());
currentFiletaxonomy.add(currentFiletaxonomyElement);
}
}
insideNote = false;
}
break;

View File

@@ -3,11 +3,8 @@ package alg.ngram;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import com.sun.xml.internal.bind.v2.runtime.reflect.Lister;
import data.*;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;

View File

@@ -9,6 +9,7 @@ public enum CorpusType {
CCKRES("ccKres ", "cckres"),
SOLAR("Šolar", "šolar"),
GOS("GOS", "gos"),
KOST("KOST", "kost"),
SSJ500K("ssj500k", "ssj500k"),
VERT("vert", "vert");

View File

@@ -10,7 +10,7 @@ import javafx.collections.ObservableList;
public class Tax {
private static LinkedHashMap<String, String> GIGAFIDA_TAXONOMY;
private static LinkedHashMap<String, String> GOS_TAXONOMY;
private static final HashSet<CorpusType> corpusTypesWithTaxonomy = new HashSet<>(Arrays.asList(CorpusType.GIGAFIDA, CorpusType.GOS, CorpusType.CCKRES, CorpusType.SSJ500K, CorpusType.GIGAFIDA2, CorpusType.VERT));
private static final HashSet<CorpusType> corpusTypesWithTaxonomy = new HashSet<>(Arrays.asList(CorpusType.GIGAFIDA, CorpusType.GOS, CorpusType.CCKRES, CorpusType.KOST, CorpusType.SSJ500K, CorpusType.GIGAFIDA2, CorpusType.VERT));
static {
// GIGAFIDA ----------------------------
@@ -108,7 +108,7 @@ public class Tax {
tax = GIGAFIDA_TAXONOMY;
} else if (corpusType == CorpusType.GOS) {
tax = GOS_TAXONOMY;
} else if (corpusType == CorpusType.VERT || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2){
} else if (corpusType == CorpusType.VERT || corpusType == CorpusType.KOST || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2){
// if VERT only order taxonomy by alphabet
ArrayList<String> sortedFoundTaxonomy = new ArrayList<>(foundTax);
Collections.sort(sortedFoundTaxonomy);
@@ -199,7 +199,7 @@ public class Tax {
tax = GIGAFIDA_TAXONOMY;
} else if (corpusType == CorpusType.GOS) {
tax = GOS_TAXONOMY;
} else if (corpusType == CorpusType.VERT || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2) {
} else if (corpusType == CorpusType.VERT || corpusType == CorpusType.KOST || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2) {
for (Taxonomy t : taxonomy) {
result.add(t.toLongNameString());
}

View File

@@ -680,7 +680,7 @@ enum TaxonomyEnum {
return r;
}
public static ArrayList<TaxonomyEnum> convertStringListToTaxonomyList(ObservableList<String> stringList, Corpus corpus){
public static ArrayList<TaxonomyEnum> convertStringListToTaxonomyList(List<String> stringList, Corpus corpus){
ArrayList<TaxonomyEnum> taxonomyList = new ArrayList<>();
for (String e : stringList) {
@@ -763,6 +763,42 @@ public class Taxonomy {
}
/**
 * Formats a raw KOST header value into human-readable taxonomy entries of the
 * form {@code "<Slovenian label> - <value>"}.
 *
 * @param value     raw value read from the KOST TEI header {@code <note>} element
 * @param parameter KOST filter name ("FirstLang", "TaskSetting", "ProficSlv",
 *                  "ProgramType" or "InputType"); must be one of the known keys,
 *                  otherwise the label prefix becomes "null - <value>"
 * @return one formatted entry per extracted value (possibly empty for
 *         unrecognized free-text "FirstLang" values)
 */
public static String[] format_KOST_taxonomy(String value, String parameter) {
    Map<String, String> filterMap = new HashMap<>();
    filterMap.put("FirstLang", "Prvi Jezik tvorca");
    filterMap.put("TaskSetting", "Okoliščine nastanka");
    filterMap.put("ProficSlv", "Nivo");
    filterMap.put("ProgramType", "Program");
    filterMap.put("InputType", "Napisano");

    List<String> splitValues = new ArrayList<>();
    if (parameter.equals("FirstLang")) {
        if (value.contains(", ")) {
            // comma-separated list of languages
            splitValues.addAll(Arrays.asList(value.split(", ")));
        } else if (value.contains(" ")) {
            // free text with spaces: keep only recognized language tokens.
            // Fixed: the previous loop overwrote the result, so only the LAST
            // recognized language survived when several were present.
            for (String v : value.split(" ")) {
                if (v.equals("španščina") || v.equals("angleščina")) {
                    splitValues.add(v);
                }
            }
        } else {
            splitValues.add(value);
        }
    } else if (parameter.equals("ProficSlv")) {
        // normalize capitalization of this one known value
        splitValues.add(value.equals("Izpopolnjevalec") ? "izpopolnjevalec" : value);
    } else {
        splitValues.add(value);
    }
    return splitValues.stream()
            .map(val -> filterMap.get(parameter) + " - " + val)
            .toArray(String[]::new);
}
public String toString() {
return this.name;
}
@@ -791,7 +827,7 @@ public class Taxonomy {
return null;
}
public static ArrayList<Taxonomy> convertStringListToTaxonomyList(ObservableList<String> stringList, Corpus corpus){
public static ArrayList<Taxonomy> convertStringListToTaxonomyList(List<String> stringList, Corpus corpus){
ArrayList<Taxonomy> taxonomyList = new ArrayList<>();
for (String e : stringList) {
@@ -832,9 +868,9 @@ public class Taxonomy {
return r;
}
public static ArrayList<Taxonomy> modifyingTaxonomy(ArrayList<Taxonomy> taxonomy, ObservableList<String> checkedItems, Corpus corpus){
public static ArrayList<Taxonomy> modifyingTaxonomy(ArrayList<Taxonomy> taxonomy, List<String> checkedItems, Corpus corpus){
ArrayList<TaxonomyEnum> checkedItemsTaxonomy = TaxonomyEnum.convertStringListToTaxonomyList(checkedItems, corpus);
if (checkedItemsTaxonomy != null && corpus.getCorpusType() != CorpusType.VERT && corpus.getCorpusType() != CorpusType.SSJ500K && corpus.getCorpusType() != CorpusType.GIGAFIDA2) {
if (checkedItemsTaxonomy != null && corpus.getCorpusType() != CorpusType.VERT && corpus.getCorpusType() != CorpusType.KOST && corpus.getCorpusType() != CorpusType.SSJ500K && corpus.getCorpusType() != CorpusType.GIGAFIDA2) {
TaxonomyEnum.modifyingTaxonomy(Taxonomy.taxonomyToTaxonomyEnum(taxonomy), checkedItemsTaxonomy, corpus);
return taxonomyEnumToTaxonomy(checkedItemsTaxonomy, corpus);
} else {

View File

@@ -5,6 +5,7 @@ import static gui.GUIController.*;
import static util.Util.*;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.lang.reflect.Field;
@@ -149,7 +150,7 @@ public class CorpusTab {
private String corpusLocation;
private String corpusFilesSize;
private static final String [] SELECT_READER_ARRAY = {"VERT + REGI", "XML (Šolar 1.0)", "XML (GOS 1.0)", "XML (ssj500k 2.1)", "XML (Gigafida 2.0)", "XML (Gigafida 1.0, Kres 1.0)"};
private static final String [] SELECT_READER_ARRAY = {"VERT + REGI", "XML (Šolar 1.0)", "XML (GOS 1.0)", "XML (KOST 2.0)", "XML (ssj500k 2.1)", "XML (Gigafida 2.0)", "XML (Gigafida 1.0, Kres 1.0)"};
private static final ArrayList<String> SELECT_READER = new ArrayList<>(Arrays.asList(SELECT_READER_ARRAY));
private static final String [] PUNCTUATION_ARRAY = {"punctuation.COMMA", "punctuation.POINT"};
@@ -194,8 +195,6 @@ public class CorpusTab {
}
public void initialize() {
updateTooltipBehavior(0.0, 30000.0,0.0, true);
// add CSS style
corpusTabPane.getStylesheets().add("style.css");
corpusTabPane.getStyleClass().add("root");
@@ -499,7 +498,7 @@ public class CorpusTab {
logger.info("reading header data for ", corpusType.toString());
if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.GOS || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2) {
if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.GOS || corpusType == CorpusType.KOST || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2) {
boolean corpusIsSplit = corpusFiles.size() > 1;
final Task<HashSet<String>> task = new Task<HashSet<String>>() {
@@ -740,7 +739,6 @@ public class CorpusTab {
private void selectReader() {
switch (selectReader) {
// "vert", "Solar", "GOS", "SSJ500K", "Gigafida", "Gigafida (old)", "Kres (old)"
case "VERT + REGI":
corpusType = VERT;
break;
@@ -750,6 +748,9 @@ public class CorpusTab {
case "XML (GOS 1.0)":
corpusType = GOS;
break;
case "XML (KOST 2.0)":
corpusType = KOST;
break;
case "XML (ssj500k 2.1)":
corpusType = SSJ500K;
break;
@@ -788,6 +789,8 @@ public class CorpusTab {
corpusType = GOS;
} else if (attrib.contains(SSJ500K.getNameLowerCase())) {
corpusType = SSJ500K;
} else if (attrib.contains(KOST.getNameLowerCase())) {
corpusType = KOST;
}
if (corpusType == null) {

View File

@@ -2,6 +2,7 @@ package gui;
import java.io.IOException;
import data.Filter;
import javafx.beans.binding.StringBinding;
import javafx.scene.layout.AnchorPane;
import org.apache.logging.log4j.LogManager;
@@ -20,6 +21,8 @@ import javafx.scene.control.Tab;
import javafx.scene.control.TabPane;
import javafx.stage.Stage;
import static nogui.NoGUIController.launch_no_gui;
public class GUIController extends Application {
public final static Logger logger = LogManager.getLogger(GUIController.class);
@@ -78,7 +81,13 @@ public class GUIController extends Application {
}
public static void main(String[] args) {
launch(args);
if (args.length > 0) {
launch_no_gui(args);
logger.info("Processing finalized!");
} else {
launch(args);
}
System.exit(0);
}
public void initialize() {

27
src/main/java/gui/I18N.java Normal file → Executable file
View File

@@ -61,23 +61,13 @@ public final class I18N {
public static String get(final String key, final Object... args) {
ResourceBundle bundle = ResourceBundle.getBundle("message", getLocale());
String val = bundle.getString(key);
try {
return MessageFormat.format(new String(val.getBytes("ISO-8859-1"), "UTF-8"), args);
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
return val;
return MessageFormat.format(val, args);
}
public static String getDefaultLocaleItem(final String key, final Object... args) {
ResourceBundle bundle = ResourceBundle.getBundle("message", getDefaultLocale());
String val = bundle.getString(key);
try {
return MessageFormat.format(new String(val.getBytes("ISO-8859-1"), "UTF-8"), args);
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
return val;
return MessageFormat.format(val, args);
}
public static ObservableList<String> getObject(final ArrayList<String> keys, final Object... args) {
@@ -86,11 +76,7 @@ public final class I18N {
ArrayList<String> results = new ArrayList<>();
for(String key : keys){
String val = bundle.getString(key);
try {
results.add(MessageFormat.format(new String(val.getBytes("ISO-8859-1"), "UTF-8"), args));
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
results.add(val);
}
return FXCollections.observableArrayList(results);
@@ -121,12 +107,7 @@ public final class I18N {
public static String getIndependent(final String key, Locale locale, final Object... args) {
ResourceBundle bundle = ResourceBundle.getBundle("message", locale);
String val = bundle.getString(key);
try {
return MessageFormat.format(new String(val.getBytes("ISO-8859-1"), "UTF-8"), args);
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
return val;
return MessageFormat.format(val, args);
}
public static String getRootValue(String oldValue, ArrayList<String> nGramComputeForLetters) {

View File

@@ -0,0 +1,8 @@
package gui;
public class Launcher {
    /**
     * Thin delegating entry point for the application.
     * NOTE(review): presumably exists because a JavaFX {@code Application}
     * subclass cannot be the Main-Class of a shaded jar without the JavaFX
     * runtime on the module path — confirm against the build configuration.
     */
    public static void main(String[] args) {
        GUIController.main(args);
    }
}

View File

@@ -44,7 +44,7 @@ public class Messages {
// Not properly to be here. TODO move somewhere else in future
public static String HELP_URL = "http://slovnica.ijs.si/";
public static String CJVT_URL = "http://hdl.handle.net/11356/1227";
public static String CJVT_URL = "http://hdl.handle.net/11356/1964";
public static String GITHUB_URL = "https://gitea.cjvt.si/lkrsnik/list";
// helper maps

View File

@@ -0,0 +1,118 @@
package nogui;
import alg.XML_processing;
import data.*;
import gui.GUIController;
import gui.I18N;
import javafx.beans.InvalidationListener;
import javafx.beans.Observable;
import javafx.beans.property.ReadOnlyDoubleWrapper;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import java.io.File;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.regex.Pattern;
import static nogui.Utils.*;
public class Characters {
    /** Logger shared with the GUI (same category as GUIController). */
    public final static Logger logger = LogManager.getLogger(GUIController.class);

    /**
     * Runs the "characters" (string-level) analysis in headless mode: builds a
     * {@link Filter} from the JSON settings, validates it, executes the
     * analysis over the corpus and saves the results to disk.
     *
     * @param settings parsed JSON configuration for this run
     * @param corpus   corpus to analyze
     */
    public static void characters(JSONObject settings, Corpus corpus) {
        Filter filter = new Filter();

        // fixed values for the "characters" tab
        filter.setNgramValue(0);
        filter.setAl(AnalysisLevel.STRING_LEVEL);
        filter.setSkipValue(0);
        filter.setIsCvv(false);
        filter.setMultipleKeys(new ArrayList<>());

        // tab-specific values read from the configuration
        filter.setStringLength(Math.toIntExact((Long) settings.get("stringLength")));
        String calculateForString = (String) settings.get("calculateFor");
        CalculateFor calculateFor = CalculateFor.factory(I18N.get(calculateForString));
        filter.setCalculateFor(calculateFor);
        filter.setDisplayTaxonomy((boolean) settings.get("displayTaxonomy"));

        // right-hand side of the GUI form: msd filter, taxonomy and thresholds
        ArrayList<Pattern> msd = getMsd((String) settings.get("msd"));
        filter.setMsd(msd);
        filter.setTaxonomySetOperation(I18N.get((String) settings.get("taxonomySetOperation")));
        ArrayList<Taxonomy> taxonomy = getTaxonomy((JSONArray) settings.get("taxonomy"), corpus);
        filter.setTaxonomy(taxonomy);
        filter.setMinimalOccurrences(Math.toIntExact((Long) settings.get("minimalOccurrences")));
        filter.setMinimalTaxonomy(Math.toIntExact((Long) settings.get("minimalTaxonomy")));

        String message = Validation.validateForStringLevel(filter);
        if (message == null) {
            // no validation errors
            // fixed: log4j needs a "{}" placeholder, otherwise the argument is dropped
            logger.info("Executing: {}", filter);
            StatisticsNew statistic = new StatisticsNew(corpus, filter, false);
            execute(statistic);
            try {
                // fixed local typo: successullySaved -> successfullySaved
                boolean successfullySaved = statistic.saveResultToDisk();
                if (successfullySaved) {
                    logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED"));
                } else {
                    logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS"));
                }
            } catch (UnsupportedEncodingException e1) {
                logger.error(I18N.get("message.ERROR_WHILE_SAVING_RESULTS_TO_CSV"));
            }
        } else {
            logger.error(message);
        }
    }

    /**
     * Feeds every detected corpus file to {@link XML_processing}, periodically
     * (at most every 500 ms) recomputing a remaining-time estimate. For
     * single-file corpora, progress is reported through a listener attached to
     * the reader instead of per-file counting.
     */
    private static void execute(StatisticsNew statistic) {
        // fixed: log4j placeholder so the filter actually appears in the message
        logger.info("Started execution: {}", statistic.getFilter());
        Collection<File> corpusFiles = statistic.getCorpus().getDetectedCorpusFiles();
        final boolean multipleFiles = CorpusType.multipleFilesCorpuses().contains(statistic.getCorpus().getCorpusType());

        int i = 0;
        Date startTime = new Date();
        Date previousTime = new Date();
        int remainingSeconds = -1;
        for (File f : corpusFiles) {
            final int iFinal = i;
            XML_processing xml_processing = new XML_processing();
            i++;
            if (multipleFiles) {
                // refresh the estimate at most every 500 ms
                if ((new Date()).getTime() - previousTime.getTime() > 500 || remainingSeconds == -1) {
                    remainingSeconds = (int) (((new Date()).getTime() - startTime.getTime()) * (1.0 / i) * (corpusFiles.size() - i) / 1000);
                    previousTime = new Date();
                }
                updateProgress(i, corpusFiles.size(), String.format(I18N.get("message.ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y"), i, corpusFiles.size(), f.getName(), remainingSeconds));
            } else {
                // single-file corpus: progress comes from the reader itself
                xml_processing.progressBarListener = new InvalidationListener() {
                    int remainingSeconds = -1;
                    Date previousTime = new Date();

                    @Override
                    public void invalidated(Observable observable) {
                        if ((new Date()).getTime() - previousTime.getTime() > 500 || remainingSeconds == -1) {
                            remainingSeconds = (int) (((new Date()).getTime() - xml_processing.startTime.getTime()) *
                                    (1.0 / (iFinal * 100 + ((ReadOnlyDoubleWrapper) observable).get() + 1)) *
                                    ((corpusFiles.size() - iFinal - 1) * 100 + 100 - ((ReadOnlyDoubleWrapper) observable).get()) / 1000);
                            previousTime = new Date();
                        }
                        updateProgress((iFinal * 100) + ((ReadOnlyDoubleWrapper) observable).get() + 1, corpusFiles.size() * 100, String.format(I18N.get("message.ONGOING_NOTIFICATION_ANALYZING_FILE_X_OF_Y"), iFinal + 1, corpusFiles.size(), f.getName(), remainingSeconds));
                    }
                };
                xml_processing.progressProperty().addListener(xml_processing.progressBarListener);
            }
            xml_processing.readXML(f.toString(), statistic);
        }
    }
}

View File

@@ -0,0 +1,200 @@
package nogui;
import data.*;
import gui.GUIController;
import gui.I18N;
import gui.ValidationUtil;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOCase;
import org.apache.commons.io.filefilter.FileFilterUtils;
import org.apache.commons.io.filefilter.TrueFileFilter;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
import java.io.*;
import java.util.*;
import static data.CorpusType.*;
import static data.CorpusType.GIGAFIDA;
import static nogui.Characters.characters;
import static nogui.WordSets.wordSets;
import static nogui.Words.words;
import static nogui.WordParts.wordParts;
public class NoGUIController {
    /** Logger shared with the GUI (same category as GUIController). */
    public final static Logger logger = LogManager.getLogger(GUIController.class);

    /**
     * Headless entry point. Parses CLI flags ({@code --config},
     * {@code --corpusLocation}, {@code --outputName}, {@code --resultsLocation};
     * CLI values override the corresponding config entries), loads the JSON
     * config, builds the corpus, sets the locale and dispatches to the analysis
     * selected by the config's {@code "tab"} field.
     *
     * @param args raw command-line arguments as passed to {@code main}
     */
    public static void launch_no_gui(String [] args) {
        String path = null;
        String corpusLocation = null;
        String outputName = null;
        String resultsLocation = null;

        // each recognized flag takes the following argument as its value
        int i = 0;
        for (String s : args) {
            switch (s) {
                case "--config":
                    path = args[i + 1];
                    break;
                case "--corpusLocation":
                    corpusLocation = args[i + 1];
                    break;
                case "--outputName":
                    outputName = args[i + 1];
                    break;
                case "--resultsLocation":
                    resultsLocation = args[i + 1];
                    break;
            }
            i++;
        }

        // read config file
        JSONObject settings = read_config(path);

        // build the corpus description (CLI overrides take precedence)
        Corpus corpus = read_corpus(settings, corpusLocation, outputName, resultsLocation);

        // "SL" selects the Slovenian locale, anything else falls back to English
        if (((String) settings.get("language")).equals("SL")) {
            I18N.setLocale(new Locale.Builder().setLanguage("sl").setRegion("SI").build());
        } else {
            I18N.setLocale(Locale.ENGLISH);
        }

        // dispatch to the analysis matching the selected tab
        switch ((String) settings.get("tab")) {
            case "characters":
                characters(settings, corpus);
                break;
            case "wordParts":
                wordParts(settings, corpus);
                break;
            case "words":
                words(settings, corpus);
                break;
            case "wordSets":
                wordSets(settings, corpus);
                break;
        }
    }

    /**
     * Builds and validates a {@link Corpus} from the config settings.
     * Non-null CLI overrides ({@code corpusLocationS}, {@code outputNameS},
     * {@code resultsLocationS}) take precedence over the config file values.
     */
    private static Corpus read_corpus(JSONObject settings, String corpusLocationS, String outputNameS, String resultsLocationS) {
        Corpus corpus = new Corpus();

        if (corpusLocationS == null) {
            corpusLocationS = (String) settings.get("corpusLocation");
        }
        File corpusLocation = new File(corpusLocationS);
        corpus.setChosenCorpusLocation(corpusLocation);

        boolean readHeaderInfo = (Boolean) settings.get("readHeaderInfo");
        corpus.setHeaderRead(readHeaderInfo);

        if (resultsLocationS == null) {
            resultsLocationS = (String) settings.get("resultsLocation");
        }
        File resultsLocation = new File(resultsLocationS);
        corpus.setChosenResultsLocation(resultsLocation);

        CorpusType corpusType = selectReader((String) settings.get("selectReader"));
        corpus.setCorpusType(corpusType);

        if (outputNameS == null) {
            outputNameS = (String) settings.get("outputName");
        }
        corpus.setCorpusName(outputNameS);

        String punctuation = ((String) settings.get("punctuation")).equals("comma") ? "punctuation.COMMA" : "punctuation.POINT";
        corpus.setPunctuation(punctuation);

        Collection<File> corpusFiles;
        if (ValidationUtil.isReadableDirectory(corpusLocation)) {
            // fixed: log4j placeholder so the path actually appears in the log
            logger.info("selected corpus dir: {}", corpusLocation.getAbsolutePath());
            // scan the directory recursively for xml corpus files
            corpusFiles = FileUtils.listFiles(corpusLocation, FileFilterUtils.suffixFileFilter("xml", IOCase.INSENSITIVE), TrueFileFilter.INSTANCE);
        } else {
            // a single file was given instead of a directory
            corpusFiles = new LinkedList<>();
            corpusFiles.add(corpusLocation);
        }
        corpus.setDetectedCorpusFiles(corpusFiles);
        corpus.validate();

        // MISSING: setSolarFiltersForXML
        return corpus;
    }

    /**
     * Maps a reader label from the config file to its {@link CorpusType}.
     * Mirrors {@code gui.CorpusTab#selectReader}; fixed: the
     * "XML (KOST 2.0)" case was missing here although KOST is supported
     * by the GUI reader list, so headless KOST runs produced a null type.
     *
     * @return the matching corpus type, or {@code null} for unknown labels
     */
    private static CorpusType selectReader(String selectReader) {
        switch (selectReader) {
            case "VERT + REGI":
                return VERT;
            case "XML (Šolar 1.0)":
                return SOLAR;
            case "XML (GOS 1.0)":
                return GOS;
            case "XML (KOST 2.0)":
                return KOST;
            case "XML (ssj500k 2.1)":
                return SSJ500K;
            case "XML (Gigafida 2.0)":
                return GIGAFIDA2;
            case "XML (Gigafida 1.0, Kres 1.0)":
                return GIGAFIDA;
            default:
                return null;
        }
    }

    /**
     * Parses the JSON config file at {@code path}.
     *
     * @return the parsed settings object, or {@code null} if reading/parsing
     *         failed (best-effort: the error is printed, callers currently
     *         assume success)
     */
    private static JSONObject read_config(String path) {
        JSONObject settings = null;
        JSONParser jsonParser = new JSONParser();
        try (FileReader reader = new FileReader(path)) {
            settings = (JSONObject) jsonParser.parse(reader);
        } catch (IOException | ParseException e) {
            e.printStackTrace();
        }
        return settings;
    }
}

319
src/main/java/nogui/Utils.java Executable file
View File

@@ -0,0 +1,319 @@
package nogui;
import alg.XML_processing;
import data.*;
import gui.GUIController;
import gui.I18N;
import javafx.scene.control.Alert;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.json.simple.JSONArray;
import java.io.File;
import java.io.UnsupportedEncodingException;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Pattern;
import static gui.GUIController.showAlert;
public class Utils {
public final static Logger logger = LogManager.getLogger(GUIController.class);
/**
 * Converts a JSON array of taxonomy labels into concrete {@link Taxonomy}
 * entries for the given corpus.
 */
public static ArrayList<Taxonomy> getTaxonomy(JSONArray taxonomyArray, Corpus corpus) {
    // JSON objects -> plain list of checked taxonomy labels
    ArrayList<String> labels = new ArrayList<>();
    taxonomyArray.forEach(label -> labels.add((String) label));
    // let Taxonomy translate the labels into taxonomy entries
    return Taxonomy.modifyingTaxonomy(new ArrayList<>(), labels, corpus);
}
/**
 * Converts a JSON array of collocability labels into {@link Collocability}
 * values via its factory.
 */
public static ArrayList<Collocability> getCollocability(JSONArray collocabilityArray) {
    ArrayList<Collocability> result = new ArrayList<>();
    collocabilityArray.forEach(label -> result.add(Collocability.factory((String) label)));
    return result;
}
/** Copies a JSON array of strings into a plain {@code ArrayList<String>}. */
public static ArrayList<String> getArrayList(JSONArray array) {
    ArrayList<String> strings = new ArrayList<>();
    array.forEach(item -> strings.add((String) item));
    return strings;
}
/**
 * Converts a JSON array of message keys into their localized strings
 * (each element is passed through {@link I18N#get}).
 */
public static ArrayList<String> getAlsoVisualizeList(JSONArray array) {
    ArrayList<String> localized = new ArrayList<>();
    array.forEach(key -> localized.add(I18N.get((String) key)));
    return localized;
}
/**
 * Compiles a space-separated string of MSD regex tokens into patterns.
 * An empty input yields an empty list.
 */
public static ArrayList<Pattern> getMsd(String stringMsd) {
    ArrayList<Pattern> patterns = new ArrayList<>();
    if (!stringMsd.equals("")) {
        for (String token : stringMsd.split(" ")) {
            patterns.add(Pattern.compile(token));
        }
    }
    return patterns;
}
/** No-op: headless runs have no progress bar; kept so shared code can call it. */
public static void updateProgress(int i, int size, String format) {
}
/** No-op (double overload): headless runs have no progress bar to update. */
public static void updateProgress(double i, int size, String format) {
}
/**
 * First pass for the minimal-relative-frequency (minRelFre) feature: clones
 * the filter as a "scraper", runs it over the whole corpus to obtain total
 * counts, converts the relative threshold into an absolute one, resets the
 * accumulated results, and finally hands off to {@code prepareMainTask}.
 *
 * @param statistic statistics object carrying the user's filter
 * @param corpus    corpus to analyze
 */
public static void prepareTaskForMinRelFre(StatisticsNew statistic, Corpus corpus) {
    Filter fi = statistic.getFilter();
    // fixed: log4j placeholder so the filter actually appears in the message
    logger.info("Started execution: {}", fi);
    try {
        Filter f2 = (Filter) fi.clone();
        f2.setIsMinimalRelFreScraper(true);
        StatisticsNew statisticsMinRelFre = new StatisticsNew(corpus, f2, false);

        Collection<File> corpusFiles = statisticsMinRelFre.getCorpus().getDetectedCorpusFiles();
        final boolean multipleFiles = CorpusType.multipleFilesCorpuses().contains(statisticsMinRelFre.getCorpus().getCorpusType());

        Date startTime = new Date();
        Date previousTime = new Date();
        int remainingSeconds = -1;
        // total work units: this pass + main pass (+ collocability pass when requested)
        int corpusSize = statistic.getFilter().getCollocability().size() > 0
                ? corpusFiles.size() * 3
                : corpusFiles.size() * 2;
        int i = 0;

        for (File f : corpusFiles) {
            XML_processing xml_processing = new XML_processing();
            i++;
            if (multipleFiles) {
                // refresh the remaining-time estimate at most every 500 ms
                if ((new Date()).getTime() - previousTime.getTime() > 500 || remainingSeconds == -1) {
                    remainingSeconds = (int) (((new Date()).getTime() - startTime.getTime()) * (1.0 / i) * (corpusSize - i) / 1000);
                    previousTime = new Date();
                }
            }
            xml_processing.readXML(f.toString(), statisticsMinRelFre);
        }

        // convert the relative threshold to an absolute count over the scraped totals
        if (statisticsMinRelFre.getFilter().getIsMinimalRelFreScraper()) {
            long countFor1MWords = statisticsMinRelFre.getUniGramOccurrences().get(statisticsMinRelFre.getCorpus().getTotal()).longValue();
            double absToRelFactor = (statisticsMinRelFre.getFilter().getMinimalRelFre() / 1000000.0) * countFor1MWords;
            statisticsMinRelFre.updateMinimalRelFre(statisticsMinRelFre.getTaxonomyResult().get(statisticsMinRelFre.getCorpus().getTotal()).entrySet(), absToRelFactor);

            // reset all accumulated values so the main pass starts clean
            for (Taxonomy taxonomy : statisticsMinRelFre.getTaxonomyResult().keySet()) {
                statisticsMinRelFre.getTaxonomyResult().put(taxonomy, new ConcurrentHashMap<>());
            }
            for (Taxonomy taxonomy : statisticsMinRelFre.getUniGramOccurrences().keySet()) {
                statisticsMinRelFre.getUniGramOccurrences().put(taxonomy, new AtomicLong(0));
            }
        }
        prepareMainTask(statistic, corpus);
    } catch (CloneNotSupportedException c) {
        // fixed: was silently swallowed; Filter is expected to be cloneable,
        // so a failure here indicates a programming error worth surfacing
        logger.error("Could not clone filter for minRelFre scraping pass", c);
    }
}
/**
 * Main corpus pass: reads every detected corpus file into {@code statistic},
 * applies the minimal-relative-frequency cut-off when one is set, and then
 * either chains into the collocability pass or saves the results to disk.
 *
 * @param statistic accumulator holding the filter and (partial) results
 * @param corpus    corpus whose detected files are processed
 */
public static void prepareMainTask(StatisticsNew statistic, Corpus corpus) {
    Filter f = statistic.getFilter();
    logger.info("Started execution: {}", f);
    Collection<File> corpusFiles = statistic.getCorpus().getDetectedCorpusFiles();
    final boolean multipleFiles = CorpusType.multipleFilesCorpuses().contains(statistic.getCorpus().getCorpusType());
    Date startTime = new Date();
    Date previousTime = new Date();
    int remainingSeconds = -1;
    int corpusSize;
    int i;
    int taskIndex = 0;
    // Progress bookkeeping: the step counter starts after any earlier
    // minRelFre scraper pass, and the total includes the later collocability
    // pass when requested, so the estimate spans all stages.
    if (f.getCollocability().size() > 0 && f.getMinimalRelFre() > 1) {
        i = corpusFiles.size();
        corpusSize = corpusFiles.size() * 3;
    } else if (f.getMinimalRelFre() > 1) {
        i = corpusFiles.size();
        corpusSize = corpusFiles.size() * 2;
    } else if (f.getCollocability().size() > 0) {
        i = 0;
        corpusSize = corpusFiles.size() * 2;
    } else {
        i = 0;
        corpusSize = corpusFiles.size();
    }
    for (File file : corpusFiles) {
        XML_processing xml_processing = new XML_processing();
        xml_processing.isCancelled = false;
        i++;
        taskIndex++;
        if (multipleFiles) {
            // Refresh the remaining-time estimate at most every 500 ms.
            if ((new Date()).getTime() - previousTime.getTime() > 500 || remainingSeconds == -1) {
                remainingSeconds = (int) (((new Date()).getTime() - startTime.getTime()) * (1.0 / taskIndex) * (corpusSize - i) / 1000);
                previousTime = new Date();
            }
        }
        xml_processing.readXML(file.toString(), statistic);
    }
    // If a minimal relative frequency is set, erase all words whose absolute
    // count falls below the equivalent absolute threshold.
    if (f.getMinimalRelFre() > 1) {
        long countFor1MWords = statistic.getUniGramOccurrences().get(statistic.getCorpus().getTotal()).longValue();
        double absToRelFactor = (f.getMinimalRelFre() / 1000000.0) * countFor1MWords;
        // NOTE(review): removing while iterating assumes the per-taxonomy map is a
        // ConcurrentHashMap (as created in the scraper pass) — confirm.
        for (Map.Entry<MultipleHMKeys, AtomicLong> entry : statistic.getTaxonomyResult().get(statistic.getCorpus().getTotal()).entrySet()) {
            if (entry.getValue().longValue() < absToRelFactor) {
                statistic.getTaxonomyResult().get(statistic.getCorpus().getTotal()).remove(entry.getKey());
            }
        }
        statistic.updateMinimalRelFre(statistic.getTaxonomyResult().get(statistic.getCorpus().getTotal()).entrySet(), absToRelFactor);
    }
    if (f.getCollocability().size() > 0) {
        // Collocability requested: run an extra 1-gram pass over the corpus.
        try {
            Filter f2 = (Filter) f.clone();
            f2.setNgramValue(1);
            StatisticsNew statisticsOneGrams = new StatisticsNew(corpus, f2, false);
            prepareTaskForCollocability(statistic, statisticsOneGrams);
        } catch (CloneNotSupportedException c) {
            // Filter is expected to be cloneable; log instead of silently dropping the task.
            logger.error("Filter does not support cloning", c);
        }
    } else {
        try {
            boolean successfullySaved = statistic.saveResultToDisk();
            if (successfullySaved) {
                logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED"));
            } else {
                logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS"));
            }
        } catch (UnsupportedEncodingException e1) {
            logger.error(I18N.get("message.ERROR_WHILE_SAVING_RESULTS_TO_CSV"));
            logger.error("Error while saving", e1);
        } catch (OutOfMemoryError e1) {
            // Use the dedicated out-of-memory message, consistent with
            // prepareTaskForCollocability.
            logger.error(I18N.get("message.ERROR_NOT_ENOUGH_MEMORY"));
            logger.error("Out of memory error", e1);
        }
    }
}
/**
 * Collocability pass: re-reads the corpus accumulating 1-gram statistics into
 * {@code statisticsOneGrams}, then computes collocability measures on
 * {@code statistic} and saves the combined results to disk.
 *
 * @param statistic          n-gram results from the main pass; receives the
 *                           collocability values and is saved at the end
 * @param statisticsOneGrams fresh accumulator configured for 1-grams
 */
public static void prepareTaskForCollocability(StatisticsNew statistic, StatisticsNew statisticsOneGrams) {
    Collection<File> corpusFiles = statisticsOneGrams.getCorpus().getDetectedCorpusFiles();
    final boolean multipleFiles = CorpusType.multipleFilesCorpuses().contains(statistic.getCorpus().getCorpusType());
    Date startTime = new Date();
    Date previousTime = new Date();
    int remainingSeconds = -1;
    int corpusSize;
    int i;
    int taskIndex = 0;
    // Progress bookkeeping: earlier passes already consumed their share of the
    // step counter, so start i past them.
    if (statistic.getFilter().getMinimalRelFre() > 1) {
        i = corpusFiles.size() * 2;
        corpusSize = corpusFiles.size() * 3;
    } else {
        i = corpusFiles.size();
        corpusSize = corpusFiles.size() * 2;
    }
    for (File f : corpusFiles) {
        XML_processing xml_processing = new XML_processing();
        i++;
        taskIndex++;
        // Detach any stale GUI progress listener before processing the file.
        if (xml_processing.progressBarListener != null) {
            xml_processing.progressProperty().removeListener(xml_processing.progressBarListener);
        }
        if (multipleFiles) {
            // Refresh the remaining-time estimate at most every 500 ms.
            if ((new Date()).getTime() - previousTime.getTime() > 500 || remainingSeconds == -1) {
                remainingSeconds = (int) (((new Date()).getTime() - startTime.getTime()) * (1.0 / taskIndex) * (corpusSize - i) / 1000);
                previousTime = new Date();
            }
        }
        // Flag the reader so it accumulates 1-gram collocability counts.
        xml_processing.isCollocability = true;
        xml_processing.readXML(f.toString(), statisticsOneGrams);
        xml_processing.isCollocability = false;
    }
    try {
        // Was a stray System.out.print(statistic) debug leftover; keep the
        // diagnostic but route it through the logger.
        logger.debug("Computing collocabilities for: {}", statistic);
        statistic.updateCalculateCollocabilities(statisticsOneGrams);
        boolean successfullySaved = statistic.saveResultToDisk();
        if (successfullySaved) {
            logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED"));
        } else {
            logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS"));
        }
    } catch (UnsupportedEncodingException e1) {
        logger.error(I18N.get("message.ERROR_WHILE_SAVING_RESULTS_TO_CSV"));
        logger.error("Error while saving", e1);
    } catch (OutOfMemoryError e1) {
        logger.error(I18N.get("message.ERROR_NOT_ENOUGH_MEMORY"));
        logger.error("Out of memory error", e1);
    }
}
}

View File

@@ -0,0 +1,100 @@
package nogui;
import alg.XML_processing;
import data.*;
import gui.GUIController;
import gui.I18N;
import javafx.beans.InvalidationListener;
import javafx.beans.Observable;
import javafx.beans.property.ReadOnlyDoubleWrapper;
import javafx.concurrent.Task;
import javafx.scene.control.Alert;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import util.Tasks;
import java.io.File;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Pattern;
import static gui.GUIController.showAlert;
import static nogui.Utils.*;
/**
 * Headless (no-GUI) entry point for the "word parts" analysis: builds a
 * {@link Filter} from the JSON settings, validates it, runs the analysis over
 * the corpus and saves the results to disk.
 */
public class WordParts {
    // Fixed: logger was created with GUIController.class (copy-paste), which
    // mislabeled this class's log output.
    public final static Logger logger = LogManager.getLogger(WordParts.class);

    /**
     * Runs the word-parts analysis with the given settings.
     *
     * @param settings JSON object with the tab's parameters (prefix/suffix
     *                 lists, calculateFor, taxonomy, thresholds, ...)
     * @param corpus   corpus to analyze
     */
    public static void wordParts(JSONObject settings, Corpus corpus) {
        Filter filter = new Filter();
        // fixed values for this analysis type
        filter.setNgramValue(1);
        filter.setAl(AnalysisLevel.STRING_LEVEL);
        filter.setSkipValue(0);
        filter.setIsCvv(false);
        filter.setStringLength(1);

        // tab specific values
        // TODO
        ArrayList<String> prefixList = getArrayList((JSONArray) settings.get("prefixList"));
        filter.setPrefixList(prefixList);
        ArrayList<String> suffixList = getArrayList((JSONArray) settings.get("suffixList"));
        filter.setSuffixList(suffixList);
        String calculateForString = (String) settings.get("calculateFor");
        CalculateFor calculateFor = CalculateFor.factory(I18N.get(calculateForString));
        filter.setCalculateFor(calculateFor);
        ArrayList<String> alsoVisualize = getAlsoVisualizeList((JSONArray) settings.get("alsoVisualize"));
        filter.setMultipleKeys(alsoVisualize);
        filter.setDisplayTaxonomy((boolean) settings.get("displayTaxonomy"));
        filter.setMinimalRelFre(Math.toIntExact((Long) settings.get("minimalRelFre")));
        filter.setPrefixLength(Math.toIntExact((Long) settings.get("prefixLength")));
        filter.setSuffixLength(Math.toIntExact((Long) settings.get("suffixLength")));

        // right part of the settings panel
        ArrayList<Pattern> msd = getMsd((String) settings.get("msd"));
        filter.setMsd(msd);
        filter.setTaxonomySetOperation(I18N.get((String) settings.get("taxonomySetOperation")));
        ArrayList<Taxonomy> taxonomy = getTaxonomy((JSONArray) settings.get("taxonomy"), corpus);
        filter.setTaxonomy(taxonomy);
        filter.setMinimalOccurrences(Math.toIntExact((Long) settings.get("minimalOccurrences")));
        filter.setMinimalTaxonomy(Math.toIntExact((Long) settings.get("minimalTaxonomy")));

        String message = Validation.validateForStringLevel(filter);
        if (message == null) {
            // no validation errors
            logger.info("Executing: {}", filter);
            StatisticsNew statistic = new StatisticsNew(corpus, filter, false);
            execute(statistic, corpus);
            try {
                boolean successfullySaved = statistic.saveResultToDisk();
                if (successfullySaved) {
                    logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED"));
                } else {
                    logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS"));
                }
            } catch (UnsupportedEncodingException e1) {
                logger.error(I18N.get("message.ERROR_WHILE_SAVING_RESULTS_TO_CSV"));
                // Keep the stack trace instead of dropping the exception.
                logger.error("Error while saving", e1);
            }
        } else {
            logger.error(message);
        }
    }

    // Dispatches to the minRelFre scraper pass when a minimal relative
    // frequency is requested, otherwise straight to the main task.
    private static void execute(StatisticsNew statistic, Corpus corpus) {
        logger.info("Started execution: {}", statistic.getFilter());
        if (statistic.getFilter().getMinimalRelFre() > 1) {
            prepareTaskForMinRelFre(statistic, corpus);
        } else {
            prepareMainTask(statistic, corpus);
        }
    }
}

View File

@@ -0,0 +1,88 @@
package nogui;
import data.*;
import gui.GUIController;
import gui.I18N;
import javafx.concurrent.Task;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import util.Tasks;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.regex.Pattern;
import static nogui.Utils.*;
/**
 * Headless (no-GUI) entry point for the "word sets" (n-gram) analysis: builds
 * a {@link Filter} from the JSON settings, validates it, runs the analysis
 * over the corpus and saves the results to disk.
 */
public class WordSets {
    // Fixed: logger was created with GUIController.class (copy-paste), which
    // mislabeled this class's log output.
    public final static Logger logger = LogManager.getLogger(WordSets.class);

    /**
     * Runs the word-sets analysis with the given settings.
     *
     * @param settings JSON object with the tab's parameters (ngramValue,
     *                 skipValue, collocability, taxonomy, thresholds, ...)
     * @param corpus   corpus to analyze
     */
    public static void wordSets(JSONObject settings, Corpus corpus) {
        Filter filter = new Filter();
        // fixed values for this analysis type
        filter.setAl(AnalysisLevel.STRING_LEVEL);
        filter.setIsCvv(false);
        filter.setStringLength(1);

        // tab specific values
        filter.setNgramValue(Math.toIntExact((Long) settings.get("ngramValue")));
        filter.setSkipValue(Math.toIntExact((Long) settings.get("skipValue")));
        filter.setNotePunctuations((boolean) settings.get("notePunctuations"));
        filter.setCollocability(getCollocability((JSONArray) settings.get("collocability")));
        String calculateForString = (String) settings.get("calculateFor");
        CalculateFor calculateFor = CalculateFor.factory(I18N.get(calculateForString));
        filter.setCalculateFor(calculateFor);
        ArrayList<String> alsoVisualize = getAlsoVisualizeList((JSONArray) settings.get("alsoVisualize"));
        filter.setMultipleKeys(alsoVisualize);
        filter.setDisplayTaxonomy((boolean) settings.get("displayTaxonomy"));
        filter.setMinimalRelFre(Math.toIntExact((Long) settings.get("minimalRelFre")));

        // right part of the settings panel
        ArrayList<Pattern> msd = getMsd((String) settings.get("msd"));
        filter.setMsd(msd);
        filter.setTaxonomySetOperation(I18N.get((String) settings.get("taxonomySetOperation")));
        ArrayList<Taxonomy> taxonomy = getTaxonomy((JSONArray) settings.get("taxonomy"), corpus);
        filter.setTaxonomy(taxonomy);
        filter.setMinimalOccurrences(Math.toIntExact((Long) settings.get("minimalOccurrences")));
        filter.setMinimalTaxonomy(Math.toIntExact((Long) settings.get("minimalTaxonomy")));

        String message = Validation.validateForStringLevel(filter);
        if (message == null) {
            // no validation errors
            logger.info("Executing: {}", filter);
            StatisticsNew statistic = new StatisticsNew(corpus, filter, false);
            execute(statistic, corpus);
            try {
                boolean successfullySaved = statistic.saveResultToDisk();
                if (successfullySaved) {
                    logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED"));
                } else {
                    logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS"));
                }
            } catch (UnsupportedEncodingException e1) {
                logger.error(I18N.get("message.ERROR_WHILE_SAVING_RESULTS_TO_CSV"));
                // Keep the stack trace instead of dropping the exception.
                logger.error("Error while saving", e1);
            }
        } else {
            logger.error(message);
        }
    }

    // Dispatches to the minRelFre scraper pass when a minimal relative
    // frequency is requested, otherwise straight to the main task.
    private static void execute(StatisticsNew statistic, Corpus corpus) {
        Filter f = statistic.getFilter();
        logger.info("Started execution: {}", f);
        if (f.getMinimalRelFre() > 1) {
            prepareTaskForMinRelFre(statistic, corpus);
        } else {
            prepareMainTask(statistic, corpus);
        }
    }
}

84
src/main/java/nogui/Words.java Executable file
View File

@@ -0,0 +1,84 @@
package nogui;
import data.*;
import gui.GUIController;
import gui.I18N;
import javafx.concurrent.Task;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import util.Tasks;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.regex.Pattern;
import static nogui.Utils.*;
import static nogui.Utils.getTaxonomy;
/**
 * Headless (no-GUI) entry point for the single-word analysis: builds a
 * {@link Filter} from the JSON settings, validates it, runs the analysis over
 * the corpus and saves the results to disk.
 */
public class Words {
    // Fixed: logger was created with GUIController.class (copy-paste), which
    // mislabeled this class's log output.
    public final static Logger logger = LogManager.getLogger(Words.class);

    /**
     * Runs the words analysis with the given settings.
     *
     * @param settings JSON object with the tab's parameters (calculateFor,
     *                 taxonomy, thresholds, ...)
     * @param corpus   corpus to analyze
     */
    public static void words(JSONObject settings, Corpus corpus) {
        Filter filter = new Filter();
        // fixed values for this analysis type
        filter.setNgramValue(1);
        filter.setAl(AnalysisLevel.STRING_LEVEL);
        filter.setSkipValue(0);
        filter.setIsCvv(false);
        filter.setStringLength(1);

        // tab specific values
        filter.setNotePunctuations((boolean) settings.get("notePunctuations"));
        filter.setWriteMsdAtTheEnd((boolean) settings.get("writeMsdAtTheEnd"));
        String calculateForString = (String) settings.get("calculateFor");
        CalculateFor calculateFor = CalculateFor.factory(I18N.get(calculateForString));
        filter.setCalculateFor(calculateFor);
        ArrayList<String> alsoVisualize = getAlsoVisualizeList((JSONArray) settings.get("alsoVisualize"));
        filter.setMultipleKeys(alsoVisualize);
        filter.setDisplayTaxonomy((boolean) settings.get("displayTaxonomy"));
        filter.setMinimalRelFre(Math.toIntExact((Long) settings.get("minimalRelFre")));

        // right part of the settings panel
        ArrayList<Pattern> msd = getMsd((String) settings.get("msd"));
        filter.setMsd(msd);
        filter.setTaxonomySetOperation(I18N.get((String) settings.get("taxonomySetOperation")));
        ArrayList<Taxonomy> taxonomy = getTaxonomy((JSONArray) settings.get("taxonomy"), corpus);
        filter.setTaxonomy(taxonomy);
        filter.setMinimalOccurrences(Math.toIntExact((Long) settings.get("minimalOccurrences")));
        filter.setMinimalTaxonomy(Math.toIntExact((Long) settings.get("minimalTaxonomy")));

        String message = Validation.validateForStringLevel(filter);
        if (message == null) {
            // no validation errors
            logger.info("Executing: {}", filter);
            StatisticsNew statistic = new StatisticsNew(corpus, filter, false);
            execute(statistic, corpus);
            try {
                boolean successfullySaved = statistic.saveResultToDisk();
                if (successfullySaved) {
                    logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED"));
                } else {
                    logger.info(I18N.get("message.NOTIFICATION_ANALYSIS_COMPLETED_NO_RESULTS"));
                }
            } catch (UnsupportedEncodingException e1) {
                logger.error(I18N.get("message.ERROR_WHILE_SAVING_RESULTS_TO_CSV"));
                // Keep the stack trace instead of dropping the exception.
                logger.error("Error while saving", e1);
            }
        } else {
            logger.error(message);
        }
    }

    // Dispatches to the minRelFre scraper pass when a minimal relative
    // frequency is requested, otherwise straight to the main task.
    private static void execute(StatisticsNew statistic, Corpus corpus) {
        logger.info("Started execution: {}", statistic.getFilter());
        if (statistic.getFilter().getMinimalRelFre() > 1) {
            prepareTaskForMinRelFre(statistic, corpus);
        } else {
            prepareMainTask(statistic, corpus);
        }
    }
}

0
src/main/java/util/Tasks.java Normal file → Executable file
View File

View File

Before

Width:  |  Height:  |  Size: 26 KiB

After

Width:  |  Height:  |  Size: 26 KiB

View File

Before

Width:  |  Height:  |  Size: 20 KiB

After

Width:  |  Height:  |  Size: 20 KiB

View File

Before

Width:  |  Height:  |  Size: 26 KiB

After

Width:  |  Height:  |  Size: 26 KiB

View File

Before

Width:  |  Height:  |  Size: 20 KiB

After

Width:  |  Height:  |  Size: 20 KiB

0
src/main/resources/gui/questionmark.png Normal file → Executable file
View File

Before

Width:  |  Height:  |  Size: 855 B

After

Width:  |  Height:  |  Size: 855 B

6
src/main/resources/message_en.properties Normal file → Executable file
View File

@@ -1,5 +1,5 @@
# general
window.title=LIST 1.0
window.title=LIST
hyperlink.help=Help
button.language=SL
@@ -282,8 +282,8 @@ exportFileName.wordSets=word-sets
exportFileName.gram=-gram
exportFileName.skip=-skip
about.header=LIST Corpus Extraction Tool\nVersion: 1.0 (Last update: 21 March 2019)\nAuthors: Luka Krsnik, Špela Arhar Holdt, Jaka Čibej, Kaja Dobrovoljc, Aleksander\n Ključevšek, Simon Krek, Marko Robnik Šikonja
about.description=The LIST corpus extraction tool is a program for extracting lists from text corpora on the\n levels of characters, word parts, words, and word sets. The program was developed within\n the New Grammar of Modern Standard Slovene: Resource and Methods project (J6-8256),\n which was financially supported by the Slovenian Research Agency between 2017 and 2020.\n The authors acknowledge the financial support from the Slovenian Research Agency\n (research core funding No. P6-0411 Language Resources and Technologies for Slovene).\n
about.header=LIST Corpus Extraction Tool\nVersion: 1.3 (Last update: 28 August 2024)\nAuthors: Luka Krsnik, Špela Arhar Holdt, Jaka Čibej, Kaja Dobrovoljc, Aleksander\n Ključevšek, Simon Krek, Marko Robnik Šikonja
about.description=The LIST corpus extraction tool is a program for extracting lists from text corpora on the\n levels of characters, word parts, words, and word sets. The program was developed within\n the New Grammar of Modern Standard Slovene: Resource and Methods project (J6-8256),\n the Empirical foundations for digitally-supported development of writing skills project (J7-3159)\n and the Language Resources and Technologies for Slovene programme (P6-0411), all\n financed by the Slovenian Research and Innovation Agency (ARIS).\n
about.signature=Publisher: Centre for Language Resources and Technologies, University of Ljubljana,\nJožef Stefan Institute,\nFaculty of Computer and Information Science, University of Ljubljana
about.footer=Maintenance: Centre for Language Resources and Technologies, University of Ljubljana\nThe program is available under the Apache2 licence at CLARIN.si and GitHub.
about.links=Links:

6
src/main/resources/message_sl.properties Normal file → Executable file
View File

@@ -1,5 +1,5 @@
# general
window.title=LIST 1.0
window.title=LIST
hyperlink.help=Pomoč
button.language=EN
@@ -282,8 +282,8 @@ exportFileName.wordSets=besedni-nizi
exportFileName.gram=-gram
exportFileName.skip=-preskok
about.header=LIST, korpusni luščilnik\nRazličica: 1.0 (Zadnja posodobitev: 21. marec 2019)\nAvtorji: Luka Krsnik, Špela Arhar Holdt, Jaka Čibej, Kaja Dobrovoljc, Aleksander\n Ključevšek, Simon Krek, Marko Robnik Šikonja
about.description=Korpusni luščilnik LIST je program za luščenje spiskov iz besedilnih korpusov na nivojih\n znakov, besednih delov, besed in besednih nizov. Nastal je v okviru projekta Nova slovnica\n sodobne standardne slovenščine: viri in metode (J6-8256), ki ga je med letoma 2017 in 2020\n sofinancirala Javna agencija za raziskovalno dejavnost Republike Slovenije iz državnega\n proračuna. Raziskovalni program Jezikovni viri in tehnologije za slovenski jezik (št. P6-0411)\n je sofinancirala Javna agencija za raziskovalno dejavnost Republike Slovenije iz državnega\n proračuna.
about.header=LIST, korpusni luščilnik\nRazličica: 1.3 (Zadnja posodobitev: 28. avgust 2024)\nAvtorji: Luka Krsnik, Špela Arhar Holdt, Jaka Čibej, Kaja Dobrovoljc, Aleksander\n Ključevšek, Simon Krek, Marko Robnik Šikonja
about.description=Korpusni luščilnik LIST je program za luščenje spiskov iz besedilnih korpusov na nivojih\n znakov, besednih delov, besed in besednih nizov. Nastal je v okviru projektov Nova slovnica\n sodobne standardne slovenščine: viri in metode (J6-8256), Empirična podlaga za digitalno\n podprt razvoj pisne jezikovne zmožnosti (J7-3159) in raziskovalnega programa Jezikovni viri\n in tehnologije za slovenski jezik (št. P6-0411), ki jih financira Javna agencija za\n znanstvenoraziskovalno in inovacijsko dejavnost Republike Slovenije (ARIS).
about.signature=Izdajatelj: Center za jezikovne vire in tehnologije Univerze v Ljubljani,\nInstitut "Jožef Stefan",\nFakulteta za računalništvo in informatiko Univerze v Ljubljani
about.footer=Vzdrževanje programa: Center za jezikovne vire in tehnologije Univerze v Ljubljani\nProgram je dostopen pod licenco Apache2 na repozitorijih CLARIN.si in GitHub.
about.links=Povezave:

0
src/main/resources/questionmark.png Normal file → Executable file
View File

Before

Width:  |  Height:  |  Size: 855 B

After

Width:  |  Height:  |  Size: 855 B

0
src/main/resources/style.css Normal file → Executable file
View File

View File

@@ -13,9 +13,7 @@ public class CorpusTests {
@Test
public void solarTest() {
// File selectedDirectory = new File("/home/luka/Desktop/corpus-analyzer/src/main/resources/Solar");
// File selectedDirectory = new File("/home/andrej/Desktop/corpus-analyzer/src/main/resources/GOS");
File selectedDirectory = new File("/home/luka/Development/corpus-analyzer2/src/main/resources/Gigafida_subset/");
File selectedDirectory = new File("/home/luka/Development/CJVT/list/src/main/resources/Gigafida_subset/");
Settings.resultsFilePath = new File(selectedDirectory.getAbsolutePath().concat(File.separator));
@@ -23,20 +21,7 @@ public class CorpusTests {
File f = Settings.corpus.iterator().next();
// Statistics stats = new Statistics(AnalysisLevel.STRING_LEVEL, 2, 0, CalculateFor.WORD);
// // stats.setCorpusType(CorpusType.GOS);
// stats.setCorpusType(CorpusType.SOLAR);
// XML_processing.readXMLGos(f.toString(), stats);
// XML_processing.readXML(f.toString(), stats);
// XML_processing.readXMLHeaderTag(f.toString(), "stats");
}
// @Test
// public void test() {
// ObservableList<String> var = GosTaxonomy.getForComboBox();
// String debug = "";
//
// }
}