Compare commits

..

6 Commits

19 changed files with 251 additions and 153 deletions

5
.gitignore vendored
View File

@ -164,7 +164,6 @@ $RECYCLE.BIN/
src/main/resources/translation_external/
src/main/resources/translations_backup/
config.json
config_instructions.txt
shade
TEMP
data

View File

@ -1,28 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<module org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule="true" type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_8">
<output url="file://$MODULE_DIR$/target/classes" />
<output-test url="file://$MODULE_DIR$/target/test-classes" />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/main/resources" type="java-resource" />
<sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" />
<excludeFolder url="file://$MODULE_DIR$/target" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" name="Maven: commons-io:commons-io:2.5" level="project" />
<orderEntry type="library" name="Maven: org.apache.commons:commons-lang3:3.6" level="project" />
<orderEntry type="library" name="Maven: com.googlecode.json-simple:json-simple:1.1.1" level="project" />
<orderEntry type="library" name="Maven: junit:junit:4.10" level="project" />
<orderEntry type="library" name="Maven: org.hamcrest:hamcrest-core:1.1" level="project" />
<orderEntry type="library" name="Maven: org.apache.commons:commons-csv:1.4" level="project" />
<orderEntry type="library" name="Maven: org.controlsfx:controlsfx:8.40.13" level="project" />
<orderEntry type="library" name="Maven: org.rocksdb:rocksdbjni:5.7.3" level="project" />
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-api:2.9.0" level="project" />
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-core:2.9.0" level="project" />
<orderEntry type="library" name="Maven: org.kordamp.ikonli:ikonli-fontawesome-pack:1.9.0" level="project" />
<orderEntry type="library" name="Maven: org.kordamp.ikonli:ikonli-core:1.9.0" level="project" />
<orderEntry type="library" name="Maven: org.kordamp.ikonli:ikonli-javafx:1.9.0" level="project" />
</component>
</module>

11
build_instructions.md Normal file
View File

@ -0,0 +1,11 @@
# Build a jar
```shell
mvn package
```
- The resulting jar is written to the `shade` folder.
# Build executable using Launch4j
- Install Java on Windows
- Run Launch4j (download first) and create executable
- Copy the JRE from your Java installation into a `jre` folder located in the same folder as `list.exe`

Binary file not shown.

24
instructions.md Normal file
View File

@ -0,0 +1,24 @@
# Instructions
Instructions on how to run list.
## Windows
There are two options.
### Run list.exe
The easier option is to download list.zip, extract it and run list.exe.
### Run list.jar
To do this you first need to install the correct version of java (JDK). The program was developed and tested on [JDK Development Kit 21.0.2](https://www.oracle.com/java/technologies/downloads/#java21).
If you already have another version of Java installed, you might have to uninstall the previous version before you install this one.
Secondly, you may run list using `run.bat` which will run `list.jar` for you.
## Linux
### Run list.jar
Similarly to running list.jar in Windows, you first have to make sure that you have the appropriate version of Java installed. The program was developed and tested on [JDK Development Kit 21.0.2](https://www.oracle.com/java/technologies/downloads/#java21).
If you already have another version of Java installed, you might have to uninstall the previous version before you install this one.
Next, you may run list using `run.sh` which will run `list.jar` for you.

128
pom.xml
View File

@ -4,9 +4,30 @@
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>thesis</groupId>
<artifactId>corpus-analyzer</artifactId>
<version>1.2</version>
<groupId>list</groupId>
<artifactId>list</artifactId>
<name>list</name>
<version>1.3</version>
<repositories>
<repository>
<id>central</id>
<name>Central Repository</name>
<url>https://repo.maven.apache.org/maven2/</url>
</repository>
<repository>
<id>central2</id>
<name>Central Repository2</name>
<url>https://repo1.maven.org/maven2/</url>
</repository>
</repositories>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.release>17</maven.compiler.release>
<javafx.version>21</javafx.version>
<ikonli.version>12.3.1</ikonli.version>
<javafx.maven.plugin.version>0.0.8</javafx.maven.plugin.version>
</properties>
<dependencies>
<dependency>
@ -32,7 +53,7 @@
<dependency>
<groupId>org.controlsfx</groupId>
<artifactId>controlsfx</artifactId>
<version>8.40.13</version>
<version>11.2.0</version>
</dependency>
<dependency>
<groupId>org.rocksdb</groupId>
@ -52,71 +73,82 @@
<dependency>
<groupId>org.kordamp.ikonli</groupId>
<artifactId>ikonli-fontawesome-pack</artifactId>
<version>1.9.0</version>
<version>${ikonli.version}</version>
</dependency>
<dependency>
<groupId>org.openjfx</groupId>
<artifactId>javafx-controls</artifactId>
<version>${javafx.version}</version>
</dependency>
<dependency>
<groupId>org.openjfx</groupId>
<artifactId>javafx-fxml</artifactId>
<version>${javafx.version}</version>
</dependency>
<dependency>
<groupId>org.openjfx</groupId>
<artifactId>javafx-graphics</artifactId>
<version>${javafx.version}</version>
<classifier>win</classifier>
</dependency>
<dependency>
<groupId>org.openjfx</groupId>
<artifactId>javafx-graphics</artifactId>
<version>${javafx.version}</version>
<classifier>linux</classifier>
</dependency>
<dependency>
<groupId>org.openjfx</groupId>
<artifactId>javafx-graphics</artifactId>
<version>${javafx.version}</version>
<classifier>mac</classifier>
</dependency>
<dependency>
<groupId>org.kordamp.ikonli</groupId>
<artifactId>ikonli-javafx</artifactId>
<version>1.9.0</version>
<version>${ikonli.version}</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<!-- packages dependencies into the jar -->
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.11.0</version>
</plugin>
<plugin>
<groupId>org.openjfx</groupId>
<artifactId>javafx-maven-plugin</artifactId>
<version>${javafx.maven.plugin.version}</version>
<configuration>
<mainClass>gui.GUIController</mainClass>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>3.2.0</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>single</goal>
<goal>shade</goal>
</goals>
<configuration>
<archive>
<manifest>
<mainClass>gui.GUIController</mainClass>
</manifest>
</archive>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
<appendAssemblyId>false</appendAssemblyId>
<outputDirectory>artifact</outputDirectory>
<finalName>Corpus_Analyzer_${version}</finalName>
<shadedArtifactAttached>true</shadedArtifactAttached>
<shadedClassifierName>project-classifier</shadedClassifierName>
<!-- Use "/" as the separator: a backslash is only a path separator on Windows; on Linux/macOS it would become part of the file name. -->
<outputFile>shade/${project.artifactId}.jar</outputFile>
<transformers>
<transformer implementation=
"org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>gui.Launcher</mainClass>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<!-- JavaFX -->
<groupId>com.zenjava</groupId>
<artifactId>javafx-maven-plugin</artifactId>
<version>8.8.3</version>
<configuration>
<mainClass>gui.GUIController</mainClass>
<verbose>true</verbose>
</configuration>
<executions>
<execution>
<id>create-jfxjar</id>
<phase>package</phase>
<goals>
<goal>build-jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
</plugins>
</build>
</project>

View File

@ -1,10 +1,10 @@
(English version below)
LIST, korpusni luščilnik
Različica: 1.0 (Zadnja posodobitev: 21. marec 2019)
Različica: 1.3 (Zadnja posodobitev: 28. avgust 2024)
Avtorji: Luka Krsnik, Špela Arhar Holdt, Jaka Čibej, Kaja Dobrovoljc, Aleksander Ključevšek, Simon Krek, Marko Robnik Šikonja
Korpusni luščilnik LIST je program za luščenje spiskov iz besedilnih korpusov na nivojih znakov, besednih delov, besed in besednih nizov. Nastal je v okviru projekta Nova slovnica sodobne standardne slovenščine: viri in metode (J6-8256), ki ga je med letoma 2017 in 2020 sofinancirala Javna agencija za raziskovalno dejavnost Republike Slovenije iz državnega proračuna. Raziskovalni program Jezikovni viri in tehnologije za slovenski jezik (št. P6-0411) je sofinancirala Javna agencija za raziskovalno dejavnost Republike Slovenije iz državnega proračuna.
Korpusni luščilnik LIST je program za luščenje spiskov iz besedilnih korpusov na nivojih znakov, besednih delov, besed in besednih nizov. Nastal je v okviru projektov Nova slovnica sodobne standardne slovenščine: viri in metode (J6-8256), Empirična podlaga za digitalno podprt razvoj pisne jezikovne zmožnosti (J7-3159) in raziskovalnega programa Jezikovni viri in tehnologije za slovenski jezik (št. P6-0411), ki jih financira Javna agencija za znanstvenoraziskovalno in inovacijsko dejavnost Republike Slovenije (ARIS) iz državnega proračuna.
Izdajatelj: Center za jezikovne vire in tehnologije Univerze v Ljubljani,
Institut "Jožef Stefan",
@ -15,19 +15,16 @@ Program je dostopen pod licenco MIT License na repozitorijih CLARIN.SI (http://h
NAVODILA ZA NAMESTITEV IN ZAGON:
1) Pred uporabo programske opreme mora biti na računalniku nameščena 64-bitna java (https://java.com/en/download/manual.jsp).
2) Vse tri programske datoteke (run.sh, run.bat, list1.0.jar) skopiramo v poljubno mapo.
3) Program zaženemo z dvoklikom na datoteko run.bat na operacijskem sistemu Windows ali run.sh na operacijskem sistemu Linux.
4) Ko izbiramo lokacijo korpusa, moramo poskrbeti, da v mapi ni datotek več različnih korpusov.
Datoteko list.zip razširimo in poženemo `list.exe` znotraj razširjene mape. Druge možnosti so opisane v [razširjeni dokumentaciji](instructions.md).
---------
LIST Corpus Extraction Tool
Version: 1.0 (Last update: 21 March 2019)
Version: 1.3 (Last update: 28 August 2024)
Authors: Luka Krsnik, Špela Arhar Holdt, Jaka Čibej, Kaja Dobrovoljc, Aleksander Ključevšek, Simon Krek, Marko Robnik Šikonja
The LIST corpus extraction tool is a program for extracting lists from text corpora on the levels of characters, word parts, words, and word sets. The program was developed within the New Grammar of Modern Standard Slovene: Resource and Methods project (J6-8256), which was financially supported by the Slovenian Research Agency between 2017 and 2020. The authors acknowledge the financial support from the Slovenian Research Agency (research core funding No. P6-0411 Language Resources and Technologies for Slovene).
The LIST corpus extraction tool is a program for extracting lists from text corpora on the levels of characters, word parts, words, and word sets. The program was developed within the New Grammar of Modern Standard Slovene: Resource and Methods project (J6-8256), the Empirical foundations for digitally-supported development of writing skills project (J7-3159) and the Language Resources and Technologies for Slovene programme (P6-0411), all financed by the Slovenian Research and Innovation Agency (ARIS).
Publisher: Centre for Language Resources and Technologies, University of Ljubljana,
Jožef Stefan Institute,
@ -38,7 +35,4 @@ The program is available under the MIT License at CLARIN.SI (http://hdl.handle.n
INSTRUCTIONS FOR INSTALLATION AND USE:
1) Make sure that 64-bit java is installed on your computer (https://java.com/en/download/manual.jsp).
2) Copy all three program files (run.sh, run.bat, list1.0.jar) in a single folder.
3) Run the program by double-clicking the run.bat file on a Windows operating system or run.sh on Linux.
4) When selecting the location of the corpus, make sure the folder does not include files of multiple different corpora.
Extract list.zip file and run list.exe. For other options please read [detailed instructions](instructions.md).

View File

@ -50,7 +50,8 @@ public class XML_processing {
} else if (stats.getCorpus().getCorpusType() == CorpusType.SOLAR) {
return readXMLSolar(path, stats);
} else if (stats.getCorpus().getCorpusType() == CorpusType.SSJ500K ||
stats.getCorpus().getCorpusType() == CorpusType.GIGAFIDA2) {
stats.getCorpus().getCorpusType() == CorpusType.GIGAFIDA2 ||
stats.getCorpus().getCorpusType() == CorpusType.KOST) {
return readXMLSSJ500K(path, stats);
} else if (stats.getCorpus().getCorpusType() == CorpusType.VERT) {
return readVERT(path, stats);
@ -461,6 +462,8 @@ public class XML_processing {
HashMap<String, HashSet<String>> resultFilters = new HashMap<>();
// taxonomy corpora
HashSet<String> resultTaxonomy = new HashSet<>();
HashSet<String> taxonomyNames = new HashSet<String>(
Arrays.asList("FirstLang", "TaskSetting", "ProficSlv", "ProgramType", "InputType"));
String headTagName;
@ -471,7 +474,7 @@ public class XML_processing {
// init results now to avoid null pointers
headTags.forEach(f -> resultFilters.put(f, new HashSet<>()));
} else if (corpusType == CorpusType.SSJ500K) {
} else if (corpusType == CorpusType.KOST || corpusType == CorpusType.SSJ500K) {
headTagName = "bibl";
} else {
headTagName = "teiHeader";
@ -482,6 +485,9 @@ public class XML_processing {
try {
xmlEventReader = factory.createXMLEventReader(new FileInputStream(filepath));
boolean insideHeader = false;
boolean insideNote = false;
String filterName = "";
String filterValue = "";
while (xmlEventReader.hasNext()) {
XMLEvent xmlEvent = xmlEventReader.nextEvent();
@ -495,6 +501,10 @@ public class XML_processing {
// this toggle is true when we're inside a header (next block of code executes)
// and false when we're not (skip reading unnecessary attributes)
insideHeader = true;
} else if (corpusType == CorpusType.KOST && elementName.equals("standOff") ||
corpusType == CorpusType.KOST && elementName.equals("TEI")
) {
return resultTaxonomy;
}
if (insideHeader) {
@ -516,6 +526,11 @@ public class XML_processing {
.replace("#", "");
resultTaxonomy.add(tax);
// kost
} else if (parseTaxonomy && (corpusType == CorpusType.KOST) && elementName.equalsIgnoreCase("note")) {
filterName = startElement.getAttributeByName(QName.valueOf("ana"))
.getValue().replace("#", "");
insideNote = true;
// solar
} else if (!parseTaxonomy) {
boolean inHeadTags = false;
@ -533,13 +548,22 @@ public class XML_processing {
}
}
}
} else if (xmlEvent.isEndElement() && corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
} else if (xmlEvent.isEndElement() && corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName) && (corpusType != CorpusType.KOST)) {
// if the corpus is split into multiple files, each with only one header block per file
// that means we should stop after we reach the end of the header
return parseTaxonomy ? resultTaxonomy : resultFilters;
} else if (xmlEvent.isEndElement() && !corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
// whole corpus in one file, so we have to continue reading in order to find all header blocks
insideHeader = false;
} else if (xmlEvent.isEndElement() && insideNote) {
if (taxonomyNames.contains(filterName)) {
Collections.addAll(resultTaxonomy, Taxonomy.format_KOST_taxonomy(filterValue, filterName));
}
insideNote = false;
} else if (xmlEvent.isCharacters() && insideNote) {
Characters characters = xmlEvent.asCharacters();
filterValue = characters.getData();
}
}
} catch (XMLStreamException e) {
@ -726,6 +750,8 @@ public class XML_processing {
boolean inPunctuation = false;
boolean taxonomyMatch = true;
ArrayList<Taxonomy> currentFiletaxonomy = new ArrayList<>();
HashSet<String> taxonomyNames = new HashSet<String>(
Arrays.asList("FirstLang", "TaskSetting", "ProficSlv", "ProgramType", "InputType"));
String lemma = "";
String msd = "";
@ -760,6 +786,9 @@ public class XML_processing {
try {
XMLInputFactory factory = XMLInputFactory.newInstance();
eventReader = factory.createXMLEventReader(new FileInputStream(path));
boolean insideNote = false;
String filterName = "";
String filterValue = "";
while (eventReader.hasNext()) {
int percentage = (int) (lineNum * 100.0 / numLines);
@ -803,6 +832,12 @@ public class XML_processing {
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()).replace("#", ""), stats.getCorpus());
currentFiletaxonomy.add(currentFiletaxonomyElement);
}
// kost
} else if (stats.getCorpus().getCorpusType() == CorpusType.KOST && stats.getCorpus().getTaxonomy().size() > 0 && qName.equalsIgnoreCase("note")) {
filterName = startElement.getAttributeByName(QName.valueOf("ana"))
.getValue().replace("#", "");
insideNote = true;
} else if (stats.getCorpus().getTaxonomy().size() > 0 && qName.equalsIgnoreCase("catRef")) {
// get value from attribute target
Attribute tax = startElement.getAttributeByName(QName.valueOf("target"));
@ -818,6 +853,10 @@ public class XML_processing {
} else if (qName.equals("text")){
taxonomyMatch = true;
} else if (stats.getCorpus().getCorpusType() == CorpusType.KOST && qName.equals("standOff") ||
stats.getCorpus().getCorpusType() == CorpusType.KOST && qName.equals("TEI")
) {
return true;
}
break;
@ -836,6 +875,10 @@ public class XML_processing {
sentence.add(createWord(punctuation, punctuation, "/", punctuation, stats.getFilter()));
inPunctuation = false;
}
// kost
if (insideNote) {
filterValue = characters.getData();
}
break;
case XMLStreamConstants.END_ELEMENT:
@ -876,7 +919,8 @@ public class XML_processing {
}
// fallback
else if (endElement.getName().getLocalPart().equalsIgnoreCase("div") &&
stats.getCorpus().getCorpusType() == CorpusType.SSJ500K) {
(stats.getCorpus().getCorpusType() == CorpusType.SSJ500K ||
stats.getCorpus().getCorpusType() == CorpusType.KOST)) {
// join corpus and stats
fj(corpus, stats);
corpus.clear();
@ -892,7 +936,7 @@ public class XML_processing {
// taxonomies don't match so stop
// union (select words that match any of selected taxonomy
taxonomyMatch = false;
//
} else if(stats.getFilter().getTaxonomySetOperation().equals(I18N.get("taxonomySetOperation.INTERSECTION")) && currentFiletaxonomy.size() != stats.getFilter().getTaxonomy().size()){
// intersection (select only words that precisely match selected taxonomy
taxonomyMatch = false;
@ -900,6 +944,17 @@ public class XML_processing {
}
} else if (endElement.getName().getLocalPart().equals("text")){
taxonomyMatch = false;
// kost
}
if (insideNote) {
if (taxonomyNames.contains(filterName)) {
for (String taxonomy : Taxonomy.format_KOST_taxonomy(filterValue, filterName)) {
// keep only taxonomy properties
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(taxonomy, stats.getCorpus());
currentFiletaxonomy.add(currentFiletaxonomyElement);
}
}
insideNote = false;
}
break;

View File

@ -3,11 +3,8 @@ package alg.ngram;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import com.sun.xml.internal.bind.v2.runtime.reflect.Lister;
import data.*;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;

View File

@ -9,6 +9,7 @@ public enum CorpusType {
CCKRES("ccKres ", "cckres"),
SOLAR("Šolar", "šolar"),
GOS("GOS", "gos"),
KOST("KOST", "kost"),
SSJ500K("ssj500k", "ssj500k"),
VERT("vert", "vert");

View File

@ -10,7 +10,7 @@ import javafx.collections.ObservableList;
public class Tax {
private static LinkedHashMap<String, String> GIGAFIDA_TAXONOMY;
private static LinkedHashMap<String, String> GOS_TAXONOMY;
private static final HashSet<CorpusType> corpusTypesWithTaxonomy = new HashSet<>(Arrays.asList(CorpusType.GIGAFIDA, CorpusType.GOS, CorpusType.CCKRES, CorpusType.SSJ500K, CorpusType.GIGAFIDA2, CorpusType.VERT));
private static final HashSet<CorpusType> corpusTypesWithTaxonomy = new HashSet<>(Arrays.asList(CorpusType.GIGAFIDA, CorpusType.GOS, CorpusType.CCKRES, CorpusType.KOST, CorpusType.SSJ500K, CorpusType.GIGAFIDA2, CorpusType.VERT));
static {
// GIGAFIDA ----------------------------
@ -108,7 +108,7 @@ public class Tax {
tax = GIGAFIDA_TAXONOMY;
} else if (corpusType == CorpusType.GOS) {
tax = GOS_TAXONOMY;
} else if (corpusType == CorpusType.VERT || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2){
} else if (corpusType == CorpusType.VERT || corpusType == CorpusType.KOST || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2){
// if VERT only order taxonomy by alphabet
ArrayList<String> sortedFoundTaxonomy = new ArrayList<>(foundTax);
Collections.sort(sortedFoundTaxonomy);
@ -199,7 +199,7 @@ public class Tax {
tax = GIGAFIDA_TAXONOMY;
} else if (corpusType == CorpusType.GOS) {
tax = GOS_TAXONOMY;
} else if (corpusType == CorpusType.VERT || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2) {
} else if (corpusType == CorpusType.VERT || corpusType == CorpusType.KOST || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2) {
for (Taxonomy t : taxonomy) {
result.add(t.toLongNameString());
}

View File

@ -763,6 +763,42 @@ public class Taxonomy {
}
/**
 * Converts one raw KOST metadata value into display labels of the form
 * "&lt;caption&gt; - &lt;value&gt;".
 *
 * The caption is looked up by {@code parameter} in a fixed table (FirstLang, TaskSetting,
 * ProficSlv, ProgramType, InputType); a parameter missing from the table yields the
 * literal caption "null".
 *
 * @param value     raw metadata text
 * @param parameter metadata name that selects both the caption and the splitting rule
 * @return one label per extracted value; empty when a space-separated FirstLang value
 *         contains neither "španščina" nor "angleščina"
 */
public static String[] format_KOST_taxonomy(String value, String parameter) {
    Map<String, String> captions = new HashMap<>();
    captions.put("FirstLang", "Prvi Jezik tvorca");
    captions.put("TaskSetting", "Okoliščine nastanka");
    captions.put("ProficSlv", "Nivo");
    captions.put("ProgramType", "Program");
    captions.put("InputType", "Napisano");

    String[] parts;
    switch (parameter) {
        case "FirstLang":
            if (value.contains(", ")) {
                // comma-separated list: every item becomes its own label
                parts = value.split(", ");
            } else if (!value.contains(" ")) {
                // single token: used as is
                parts = new String[] {value};
            } else {
                // free text: keep only a recognised language word (the last one found wins)
                parts = new String[] {};
                for (String word : value.split(" ")) {
                    if (word.equals("španščina") || word.equals("angleščina")) {
                        parts = new String[] {word};
                    }
                }
            }
            break;
        case "ProficSlv":
            // normalise the one capitalised variant to lower case
            parts = new String[] {value.equals("Izpopolnjevalec") ? "izpopolnjevalec" : value};
            break;
        default:
            parts = new String[] {value};
            break;
    }

    String caption = captions.get(parameter);
    String[] labels = new String[parts.length];
    for (int i = 0; i < parts.length; i++) {
        labels[i] = caption + " - " + parts[i];
    }
    return labels;
}
public String toString() {
return this.name;
}
@ -834,7 +870,7 @@ public class Taxonomy {
public static ArrayList<Taxonomy> modifyingTaxonomy(ArrayList<Taxonomy> taxonomy, List<String> checkedItems, Corpus corpus){
ArrayList<TaxonomyEnum> checkedItemsTaxonomy = TaxonomyEnum.convertStringListToTaxonomyList(checkedItems, corpus);
if (checkedItemsTaxonomy != null && corpus.getCorpusType() != CorpusType.VERT && corpus.getCorpusType() != CorpusType.SSJ500K && corpus.getCorpusType() != CorpusType.GIGAFIDA2) {
if (checkedItemsTaxonomy != null && corpus.getCorpusType() != CorpusType.VERT && corpus.getCorpusType() != CorpusType.KOST && corpus.getCorpusType() != CorpusType.SSJ500K && corpus.getCorpusType() != CorpusType.GIGAFIDA2) {
TaxonomyEnum.modifyingTaxonomy(Taxonomy.taxonomyToTaxonomyEnum(taxonomy), checkedItemsTaxonomy, corpus);
return taxonomyEnumToTaxonomy(checkedItemsTaxonomy, corpus);
} else {

View File

@ -5,6 +5,7 @@ import static gui.GUIController.*;
import static util.Util.*;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.lang.reflect.Field;
@ -149,7 +150,7 @@ public class CorpusTab {
private String corpusLocation;
private String corpusFilesSize;
private static final String [] SELECT_READER_ARRAY = {"VERT + REGI", "XML (Šolar 1.0)", "XML (GOS 1.0)", "XML (ssj500k 2.1)", "XML (Gigafida 2.0)", "XML (Gigafida 1.0, Kres 1.0)"};
private static final String [] SELECT_READER_ARRAY = {"VERT + REGI", "XML (Šolar 1.0)", "XML (GOS 1.0)", "XML (KOST 2.0)", "XML (ssj500k 2.1)", "XML (Gigafida 2.0)", "XML (Gigafida 1.0, Kres 1.0)"};
private static final ArrayList<String> SELECT_READER = new ArrayList<>(Arrays.asList(SELECT_READER_ARRAY));
private static final String [] PUNCTUATION_ARRAY = {"punctuation.COMMA", "punctuation.POINT"};
@ -194,8 +195,6 @@ public class CorpusTab {
}
public void initialize() {
updateTooltipBehavior(0.0, 30000.0,0.0, true);
// add CSS style
corpusTabPane.getStylesheets().add("style.css");
corpusTabPane.getStyleClass().add("root");
@ -499,7 +498,7 @@ public class CorpusTab {
// Parameterized logging only substitutes arguments at "{}" placeholders; without one the corpus type was silently dropped.
logger.info("reading header data for {}", corpusType.toString());
if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.GOS || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2) {
if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.GOS || corpusType == CorpusType.KOST || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2) {
boolean corpusIsSplit = corpusFiles.size() > 1;
final Task<HashSet<String>> task = new Task<HashSet<String>>() {
@ -740,7 +739,6 @@ public class CorpusTab {
private void selectReader() {
switch (selectReader) {
// "vert", "Solar", "GOS", "SSJ500K", "Gigafida", "Gigafida (old)", "Kres (old)"
case "VERT + REGI":
corpusType = VERT;
break;
@ -750,6 +748,9 @@ public class CorpusTab {
case "XML (GOS 1.0)":
corpusType = GOS;
break;
case "XML (KOST 2.0)":
corpusType = KOST;
break;
case "XML (ssj500k 2.1)":
corpusType = SSJ500K;
break;
@ -788,6 +789,8 @@ public class CorpusTab {
corpusType = GOS;
} else if (attrib.contains(SSJ500K.getNameLowerCase())) {
corpusType = SSJ500K;
} else if (attrib.contains(KOST.getNameLowerCase())) {
corpusType = KOST;
}
if (corpusType == null) {

View File

@ -61,23 +61,13 @@ public final class I18N {
public static String get(final String key, final Object... args) {
ResourceBundle bundle = ResourceBundle.getBundle("message", getLocale());
String val = bundle.getString(key);
try {
return MessageFormat.format(new String(val.getBytes("ISO-8859-1"), "UTF-8"), args);
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
return val;
return MessageFormat.format(val, args);
}
public static String getDefaultLocaleItem(final String key, final Object... args) {
ResourceBundle bundle = ResourceBundle.getBundle("message", getDefaultLocale());
String val = bundle.getString(key);
try {
return MessageFormat.format(new String(val.getBytes("ISO-8859-1"), "UTF-8"), args);
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
return val;
return MessageFormat.format(val, args);
}
public static ObservableList<String> getObject(final ArrayList<String> keys, final Object... args) {
@ -86,11 +76,7 @@ public final class I18N {
ArrayList<String> results = new ArrayList<>();
for(String key : keys){
String val = bundle.getString(key);
try {
results.add(MessageFormat.format(new String(val.getBytes("ISO-8859-1"), "UTF-8"), args));
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
// Run each value through MessageFormat like the other I18N getters do, so that args are applied
// and MessageFormat quoting rules (e.g. '' for a single quote) are handled the same way everywhere.
results.add(MessageFormat.format(val, args));
}
return FXCollections.observableArrayList(results);
@ -121,12 +107,7 @@ public final class I18N {
public static String getIndependent(final String key, Locale locale, final Object... args) {
ResourceBundle bundle = ResourceBundle.getBundle("message", locale);
String val = bundle.getString(key);
try {
return MessageFormat.format(new String(val.getBytes("ISO-8859-1"), "UTF-8"), args);
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
return val;
return MessageFormat.format(val, args);
}
public static String getRootValue(String oldValue, ArrayList<String> nGramComputeForLetters) {

View File

@ -0,0 +1,8 @@
package gui;
/**
 * Thin entry point whose only job is to forward the command-line arguments to
 * {@link GUIController#main(String[])}.
 *
 * NOTE(review): presumably exists so the packaged jar can name a main class that is
 * separate from the GUI controller itself; confirm against the build configuration.
 */
public class Launcher {
/**
 * Starts the application by delegating to GUIController.
 *
 * @param args command-line arguments, passed through unchanged
 */
public static void main(String[] args) {
GUIController.main(args);
}
}

View File

@ -44,7 +44,7 @@ public class Messages {
// Not properly to be here. TODO move somewhere else in future
public static String HELP_URL = "http://slovnica.ijs.si/";
public static String CJVT_URL = "http://hdl.handle.net/11356/1227";
public static String CJVT_URL = "http://hdl.handle.net/11356/1964";
public static String GITHUB_URL = "https://gitea.cjvt.si/lkrsnik/list";
// helper maps

View File

@ -282,8 +282,8 @@ exportFileName.wordSets=word-sets
exportFileName.gram=-gram
exportFileName.skip=-skip
about.header=LIST Corpus Extraction Tool\nVersion: 1.2 (Last update: 18 November 2019)\nAuthors: Luka Krsnik, Špela Arhar Holdt, Jaka Čibej, Kaja Dobrovoljc, Aleksander\n Ključevšek, Simon Krek, Marko Robnik Šikonja
about.description=The LIST corpus extraction tool is a program for extracting lists from text corpora on the\n levels of characters, word parts, words, and word sets. The program was developed within\n the New Grammar of Modern Standard Slovene: Resource and Methods project (J6-8256),\n which was financially supported by the Slovenian Research Agency between 2017 and 2020.\n The authors acknowledge the financial support from the Slovenian Research Agency\n (research core funding No. P6-0411 Language Resources and Technologies for Slovene).\n
about.header=LIST Corpus Extraction Tool\nVersion: 1.3 (Last update: 28 August 2024)\nAuthors: Luka Krsnik, Špela Arhar Holdt, Jaka Čibej, Kaja Dobrovoljc, Aleksander\n Ključevšek, Simon Krek, Marko Robnik Šikonja
about.description=The LIST corpus extraction tool is a program for extracting lists from text corpora on the\n levels of characters, word parts, words, and word sets. The program was developed within\n the New Grammar of Modern Standard Slovene: Resource and Methods project (J6-8256),\n the Empirical foundations for digitally-supported development of writing skills project (J7-3159)\n and the Language Resources and Technologies for Slovene programme (P6-0411), all\n financed by the Slovenian Research and Innovation Agency (ARIS).\n
about.signature=Publisher: Centre for Language Resources and Technologies, University of Ljubljana,\nJožef Stefan Institute,\nFaculty of Computer and Information Science, University of Ljubljana
about.footer=Maintenance: Centre for Language Resources and Technologies, University of Ljubljana\nThe program is available under the Apache2 licence at CLARIN.si and GitHub.
about.links=Links:

View File

@ -282,8 +282,8 @@ exportFileName.wordSets=besedni-nizi
exportFileName.gram=-gram
exportFileName.skip=-preskok
about.header=LIST, korpusni luščilnik\nRazličica: 1.2 (Zadnja posodobitev: 18. november 2019)\nAvtorji: Luka Krsnik, Špela Arhar Holdt, Jaka Čibej, Kaja Dobrovoljc, Aleksander\n Ključevšek, Simon Krek, Marko Robnik Šikonja
about.description=Korpusni luščilnik LIST je program za luščenje spiskov iz besedilnih korpusov na nivojih\n znakov, besednih delov, besed in besednih nizov. Nastal je v okviru projekta Nova slovnica\n sodobne standardne slovenščine: viri in metode (J6-8256), ki ga je med letoma 2017 in 2020\n sofinancirala Javna agencija za raziskovalno dejavnost Republike Slovenije iz državnega\n proračuna. Raziskovalni program Jezikovni viri in tehnologije za slovenski jezik (št. P6-0411)\n je sofinancirala Javna agencija za raziskovalno dejavnost Republike Slovenije iz državnega\n proračuna.
about.header=LIST, korpusni luščilnik\nRazličica: 1.3 (Zadnja posodobitev: 28. avgust 2024)\nAvtorji: Luka Krsnik, Špela Arhar Holdt, Jaka Čibej, Kaja Dobrovoljc, Aleksander\n Ključevšek, Simon Krek, Marko Robnik Šikonja
about.description=Korpusni luščilnik LIST je program za luščenje spiskov iz besedilnih korpusov na nivojih\n znakov, besednih delov, besed in besednih nizov. Nastal je v okviru projektov Nova slovnica\n sodobne standardne slovenščine: viri in metode (J6-8256), Empirična podlaga za digitalno\n podprt razvoj pisne jezikovne zmožnosti (J7-3159) in raziskovalnega programa Jezikovni viri\n in tehnologije za slovenski jezik (št. P6-0411), ki jih financira Javna agencija za\n znanstvenoraziskovalno in inovacijsko dejavnost Republike Slovenije (ARIS) iz državnega\n proračuna.
about.signature=Izdajatelj: Center za jezikovne vire in tehnologije Univerze v Ljubljani,\nInstitut "Jožef Stefan",\nFakulteta za računalništvo in informatiko Univerze v Ljubljani
about.footer=Vzdrževanje programa: Center za jezikovne vire in tehnologije Univerze v Ljubljani\nProgram je dostopen pod licenco Apache2 na repozitorijih CLARIN.si in GitHub.
about.links=Povezave:

View File

@ -13,9 +13,7 @@ public class CorpusTests {
@Test
public void solarTest() {
// File selectedDirectory = new File("/home/luka/Desktop/corpus-analyzer/src/main/resources/Solar");
// File selectedDirectory = new File("/home/andrej/Desktop/corpus-analyzer/src/main/resources/GOS");
File selectedDirectory = new File("/home/luka/Development/corpus-analyzer2/src/main/resources/Gigafida_subset/");
File selectedDirectory = new File("/home/luka/Development/CJVT/list/src/main/resources/Gigafida_subset/");
Settings.resultsFilePath = new File(selectedDirectory.getAbsolutePath().concat(File.separator));
@ -23,20 +21,7 @@ public class CorpusTests {
File f = Settings.corpus.iterator().next();
// Statistics stats = new Statistics(AnalysisLevel.STRING_LEVEL, 2, 0, CalculateFor.WORD);
// // stats.setCorpusType(CorpusType.GOS);
// stats.setCorpusType(CorpusType.SOLAR);
// XML_processing.readXMLGos(f.toString(), stats);
// XML_processing.readXML(f.toString(), stats);
// XML_processing.readXMLHeaderTag(f.toString(), "stats");
}
// @Test
// public void test() {
// ObservableList<String> var = GosTaxonomy.getForComboBox();
// String debug = "";
//
// }
}