Compare commits

...

6 Commits

19 changed files with 251 additions and 153 deletions

5
.gitignore vendored
View File

@ -164,7 +164,6 @@ $RECYCLE.BIN/
src/main/resources/translation_external/ src/main/resources/translation_external/
src/main/resources/translations_backup/ src/main/resources/translations_backup/
shade
config.json TEMP
config_instructions.txt
data data

View File

@ -1,28 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<module org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule="true" type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_8">
<output url="file://$MODULE_DIR$/target/classes" />
<output-test url="file://$MODULE_DIR$/target/test-classes" />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/main/resources" type="java-resource" />
<sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" />
<excludeFolder url="file://$MODULE_DIR$/target" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" name="Maven: commons-io:commons-io:2.5" level="project" />
<orderEntry type="library" name="Maven: org.apache.commons:commons-lang3:3.6" level="project" />
<orderEntry type="library" name="Maven: com.googlecode.json-simple:json-simple:1.1.1" level="project" />
<orderEntry type="library" name="Maven: junit:junit:4.10" level="project" />
<orderEntry type="library" name="Maven: org.hamcrest:hamcrest-core:1.1" level="project" />
<orderEntry type="library" name="Maven: org.apache.commons:commons-csv:1.4" level="project" />
<orderEntry type="library" name="Maven: org.controlsfx:controlsfx:8.40.13" level="project" />
<orderEntry type="library" name="Maven: org.rocksdb:rocksdbjni:5.7.3" level="project" />
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-api:2.9.0" level="project" />
<orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-core:2.9.0" level="project" />
<orderEntry type="library" name="Maven: org.kordamp.ikonli:ikonli-fontawesome-pack:1.9.0" level="project" />
<orderEntry type="library" name="Maven: org.kordamp.ikonli:ikonli-core:1.9.0" level="project" />
<orderEntry type="library" name="Maven: org.kordamp.ikonli:ikonli-javafx:1.9.0" level="project" />
</component>
</module>

11
build_instructions.md Normal file
View File

@ -0,0 +1,11 @@
# Build a jar
```shell
mvn package
```
- results are in the `shade` folder
# Build executable using Launch4j
- Install Java on Windows
- Run Launch4j (download first) and create executable
- Copy the JRE from your computer into a `jre` folder located in the same folder as `list.exe`

Binary file not shown.

24
instructions.md Normal file
View File

@ -0,0 +1,24 @@
# Instructions
Instructions on how to run list.
## Windows
There are two options.
### Run list.exe
The easier option is to download list.zip, extract it and run list.exe.
### Run list.jar
To do this you first need to install the correct version of java (JDK). The program was developed and tested on [JDK Development Kit 21.0.2](https://www.oracle.com/java/technologies/downloads/#java21).
If you already have another version of Java installed, you might have to delete the previous version before you install this one.
Secondly, you may run list using `run.bat` which will run `list.jar` for you.
## Linux
### Run list.jar
Similarly to running list.jar in Windows, you first have to make sure that you have the appropriate version of Java installed. The program was developed and tested on [JDK Development Kit 21.0.2](https://www.oracle.com/java/technologies/downloads/#java21).
If you already have another version of Java installed, you might have to delete the previous version before you install this one.
Next, you may run list using `run.sh` which will run `list.jar` for you.

128
pom.xml
View File

@ -4,9 +4,30 @@
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>thesis</groupId> <groupId>list</groupId>
<artifactId>corpus-analyzer</artifactId> <artifactId>list</artifactId>
<version>1.2</version> <name>list</name>
<version>1.3</version>
<repositories>
<repository>
<id>central</id>
<name>Central Repository</name>
<url>https://repo.maven.apache.org/maven2/</url>
</repository>
<repository>
<id>central2</id>
<name>Central Repository2</name>
<url>https://repo1.maven.org/maven2/</url>
</repository>
</repositories>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.release>17</maven.compiler.release>
<javafx.version>21</javafx.version>
<ikonli.version>12.3.1</ikonli.version>
<javafx.maven.plugin.version>0.0.8</javafx.maven.plugin.version>
</properties>
<dependencies> <dependencies>
<dependency> <dependency>
@ -32,7 +53,7 @@
<dependency> <dependency>
<groupId>org.controlsfx</groupId> <groupId>org.controlsfx</groupId>
<artifactId>controlsfx</artifactId> <artifactId>controlsfx</artifactId>
<version>8.40.13</version> <version>11.2.0</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.rocksdb</groupId> <groupId>org.rocksdb</groupId>
@ -52,71 +73,82 @@
<dependency> <dependency>
<groupId>org.kordamp.ikonli</groupId> <groupId>org.kordamp.ikonli</groupId>
<artifactId>ikonli-fontawesome-pack</artifactId> <artifactId>ikonli-fontawesome-pack</artifactId>
<version>1.9.0</version> <version>${ikonli.version}</version>
</dependency>
<dependency>
<groupId>org.openjfx</groupId>
<artifactId>javafx-controls</artifactId>
<version>${javafx.version}</version>
</dependency>
<dependency>
<groupId>org.openjfx</groupId>
<artifactId>javafx-fxml</artifactId>
<version>${javafx.version}</version>
</dependency>
<dependency>
<groupId>org.openjfx</groupId>
<artifactId>javafx-graphics</artifactId>
<version>${javafx.version}</version>
<classifier>win</classifier>
</dependency>
<dependency>
<groupId>org.openjfx</groupId>
<artifactId>javafx-graphics</artifactId>
<version>${javafx.version}</version>
<classifier>linux</classifier>
</dependency>
<dependency>
<groupId>org.openjfx</groupId>
<artifactId>javafx-graphics</artifactId>
<version>${javafx.version}</version>
<classifier>mac</classifier>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.kordamp.ikonli</groupId> <groupId>org.kordamp.ikonli</groupId>
<artifactId>ikonli-javafx</artifactId> <artifactId>ikonli-javafx</artifactId>
<version>1.9.0</version> <version>${ikonli.version}</version>
</dependency> </dependency>
</dependencies> </dependencies>
<build> <build>
<plugins> <plugins>
<plugin> <plugin>
<!-- packages dependencies into the jar -->
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId> <artifactId>maven-compiler-plugin</artifactId>
<version>3.11.0</version>
</plugin>
<plugin>
<groupId>org.openjfx</groupId>
<artifactId>javafx-maven-plugin</artifactId>
<version>${javafx.maven.plugin.version}</version>
<configuration>
<mainClass>gui.GUIController</mainClass>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>3.2.0</version>
<executions> <executions>
<execution> <execution>
<phase>package</phase> <phase>package</phase>
<goals> <goals>
<goal>single</goal> <goal>shade</goal>
</goals> </goals>
<configuration> <configuration>
<archive> <shadedArtifactAttached>true</shadedArtifactAttached>
<manifest> <shadedClassifierName>project-classifier</shadedClassifierName>
<mainClass>gui.GUIController</mainClass> <outputFile>shade\${project.artifactId}.jar</outputFile>
</manifest> <transformers>
</archive> <transformer implementation=
<descriptorRefs> "org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<descriptorRef>jar-with-dependencies</descriptorRef> <mainClass>gui.Launcher</mainClass>
</descriptorRefs> </transformer>
<appendAssemblyId>false</appendAssemblyId> </transformers>
<outputDirectory>artifact</outputDirectory>
<finalName>Corpus_Analyzer_${version}</finalName>
</configuration> </configuration>
</execution> </execution>
</executions> </executions>
</plugin> </plugin>
<plugin>
<!-- JavaFX -->
<groupId>com.zenjava</groupId>
<artifactId>javafx-maven-plugin</artifactId>
<version>8.8.3</version>
<configuration>
<mainClass>gui.GUIController</mainClass>
<verbose>true</verbose>
</configuration>
<executions>
<execution>
<id>create-jfxjar</id>
<phase>package</phase>
<goals>
<goal>build-jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
</plugins> </plugins>
</build> </build>
</project> </project>

View File

@ -1,10 +1,10 @@
(English version below) (English version below)
LIST, korpusni luščilnik LIST, korpusni luščilnik
Različica: 1.0 (Zadnja posodobitev: 21. marec 2019) Različica: 1.3 (Zadnja posodobitev: 28. avgust 2024)
Avtorji: Luka Krsnik, Špela Arhar Holdt, Jaka Čibej, Kaja Dobrovoljc, Aleksander Ključevšek, Simon Krek, Marko Robnik Šikonja Avtorji: Luka Krsnik, Špela Arhar Holdt, Jaka Čibej, Kaja Dobrovoljc, Aleksander Ključevšek, Simon Krek, Marko Robnik Šikonja
Korpusni luščilnik LIST je program za luščenje spiskov iz besedilnih korpusov na nivojih znakov, besednih delov, besed in besednih nizov. Nastal je v okviru projekta Nova slovnica sodobne standardne slovenščine: viri in metode (J6-8256), ki ga je med letoma 2017 in 2020 sofinancirala Javna agencija za raziskovalno dejavnost Republike Slovenije iz državnega proračuna. Raziskovalni program Jezikovni viri in tehnologije za slovenski jezik (št. P6-0411) je sofinancirala Javna agencija za raziskovalno dejavnost Republike Slovenije iz državnega proračuna. Korpusni luščilnik LIST je program za luščenje spiskov iz besedilnih korpusov na nivojih znakov, besednih delov, besed in besednih nizov. Nastal je v okviru projektov Nova slovnica sodobne standardne slovenščine: viri in metode (J6-8256), Empirična podlaga za digitalno podprt razvoj pisne jezikovne zmožnosti (J7-3159) in raziskovalnega programa Jezikovni viri in tehnologije za slovenski jezik (št. P6-0411), ki jih financira Javna agencija za znanstvenoraziskovalno in inovacijsko dejavnost Republike Slovenije (ARIS) iz državnega proračuna.
Izdajatelj: Center za jezikovne vire in tehnologije Univerze v Ljubljani, Izdajatelj: Center za jezikovne vire in tehnologije Univerze v Ljubljani,
Institut "Jožef Stefan", Institut "Jožef Stefan",
@ -15,19 +15,16 @@ Program je dostopen pod licenco MIT License na repozitorijih CLARIN.SI (http://h
NAVODILA ZA NAMESTITEV IN ZAGON: NAVODILA ZA NAMESTITEV IN ZAGON:
1) Pred uporabo programske opreme mora biti na računalniku nameščena 64-bitna java (https://java.com/en/download/manual.jsp). Datoteko list.zip razširimo in poženemo `list.exe` znotraj razširjene mape. Druge možnosti so opisane v [razširjeni dokumentaciji](instructions.md).
2) Vse tri programske datoteke (run.sh, run.bat, list1.0.jar) skopiramo v poljubno mapo.
3) Program zaženemo z dvoklikom na datoteko run.bat na operacijskem sistemu Windows ali run.sh na operacijskem sistemu Linux.
4) Ko izbiramo lokacijo korpusa, moramo poskrbeti, da v mapi ni datotek več različnih korpusov.
--------- ---------
LIST Corpus Extraction Tool LIST Corpus Extraction Tool
Version: 1.0 (Last update: 21 March 2019) Version: 1.3 (Last update: 28 August 2024)
Authors: Luka Krsnik, Špela Arhar Holdt, Jaka Čibej, Kaja Dobrovoljc, Aleksander Ključevšek, Simon Krek, Marko Robnik Šikonja Authors: Luka Krsnik, Špela Arhar Holdt, Jaka Čibej, Kaja Dobrovoljc, Aleksander Ključevšek, Simon Krek, Marko Robnik Šikonja
The LIST corpus extraction tool is a program for extracting lists from text corpora on the levels of characters, word parts, words, and word sets. The program was developed within the New Grammar of Modern Standard Slovene: Resource and Methods project (J6-8256), which was financially supported by the Slovenian Research Agency between 2017 and 2020. The authors acknowledge the financial support from the Slovenian Research Agency (research core funding No. P6-0411 Language Resources and Technologies for Slovene). The LIST corpus extraction tool is a program for extracting lists from text corpora on the levels of characters, word parts, words, and word sets. The program was developed within the New Grammar of Modern Standard Slovene: Resource and Methods project (J6-8256), the Empirical foundations for digitally-supported development of writing skills project (J7-3159) and the Language Resources and Technologies for Slovene programme (P6-0411), all financed by the Slovenian Research and Innovation Agency (ARIS).
Publisher: Centre for Language Resources and Technologies, University of Ljubljana, Publisher: Centre for Language Resources and Technologies, University of Ljubljana,
Jožef Stefan Institute, Jožef Stefan Institute,
@ -38,7 +35,4 @@ The program is available under the MIT License at CLARIN.SI (http://hdl.handle.n
INSTRUCTIONS FOR INSTALLATION AND USE: INSTRUCTIONS FOR INSTALLATION AND USE:
1) Make sure that 64-bit java is installed on your computer (https://java.com/en/download/manual.jsp). Extract list.zip file and run list.exe. For other options please read [detailed instructions](instructions.md).
2) Copy all three program files (run.sh, run.bat, list1.0.jar) in a single folder.
3) Run the program by double-clicking the run.bat file on a Windows operating system or run.sh on Linux.
4) When selecting the location of the corpus, make sure the folder does not include files of multiple different corpora.

View File

@ -50,7 +50,8 @@ public class XML_processing {
} else if (stats.getCorpus().getCorpusType() == CorpusType.SOLAR) { } else if (stats.getCorpus().getCorpusType() == CorpusType.SOLAR) {
return readXMLSolar(path, stats); return readXMLSolar(path, stats);
} else if (stats.getCorpus().getCorpusType() == CorpusType.SSJ500K || } else if (stats.getCorpus().getCorpusType() == CorpusType.SSJ500K ||
stats.getCorpus().getCorpusType() == CorpusType.GIGAFIDA2) { stats.getCorpus().getCorpusType() == CorpusType.GIGAFIDA2 ||
stats.getCorpus().getCorpusType() == CorpusType.KOST) {
return readXMLSSJ500K(path, stats); return readXMLSSJ500K(path, stats);
} else if (stats.getCorpus().getCorpusType() == CorpusType.VERT) { } else if (stats.getCorpus().getCorpusType() == CorpusType.VERT) {
return readVERT(path, stats); return readVERT(path, stats);
@ -461,6 +462,8 @@ public class XML_processing {
HashMap<String, HashSet<String>> resultFilters = new HashMap<>(); HashMap<String, HashSet<String>> resultFilters = new HashMap<>();
// taxonomy corpora // taxonomy corpora
HashSet<String> resultTaxonomy = new HashSet<>(); HashSet<String> resultTaxonomy = new HashSet<>();
HashSet<String> taxonomyNames = new HashSet<String>(
Arrays.asList("FirstLang", "TaskSetting", "ProficSlv", "ProgramType", "InputType"));
String headTagName; String headTagName;
@ -471,7 +474,7 @@ public class XML_processing {
// init results now to avoid null pointers // init results now to avoid null pointers
headTags.forEach(f -> resultFilters.put(f, new HashSet<>())); headTags.forEach(f -> resultFilters.put(f, new HashSet<>()));
} else if (corpusType == CorpusType.SSJ500K) { } else if (corpusType == CorpusType.KOST || corpusType == CorpusType.SSJ500K) {
headTagName = "bibl"; headTagName = "bibl";
} else { } else {
headTagName = "teiHeader"; headTagName = "teiHeader";
@ -482,6 +485,9 @@ public class XML_processing {
try { try {
xmlEventReader = factory.createXMLEventReader(new FileInputStream(filepath)); xmlEventReader = factory.createXMLEventReader(new FileInputStream(filepath));
boolean insideHeader = false; boolean insideHeader = false;
boolean insideNote = false;
String filterName = "";
String filterValue = "";
while (xmlEventReader.hasNext()) { while (xmlEventReader.hasNext()) {
XMLEvent xmlEvent = xmlEventReader.nextEvent(); XMLEvent xmlEvent = xmlEventReader.nextEvent();
@ -495,6 +501,10 @@ public class XML_processing {
// this toggle is true when we're inside a header (next block of code executes) // this toggle is true when we're inside a header (next block of code executes)
// and false when we're not (skip reading unnecessary attributes) // and false when we're not (skip reading unnecessary attributes)
insideHeader = true; insideHeader = true;
} else if (corpusType == CorpusType.KOST && elementName.equals("standOff") ||
corpusType == CorpusType.KOST && elementName.equals("TEI")
) {
return resultTaxonomy;
} }
if (insideHeader) { if (insideHeader) {
@ -516,6 +526,11 @@ public class XML_processing {
.replace("#", ""); .replace("#", "");
resultTaxonomy.add(tax); resultTaxonomy.add(tax);
// kost
} else if (parseTaxonomy && (corpusType == CorpusType.KOST) && elementName.equalsIgnoreCase("note")) {
filterName = startElement.getAttributeByName(QName.valueOf("ana"))
.getValue().replace("#", "");
insideNote = true;
// solar // solar
} else if (!parseTaxonomy) { } else if (!parseTaxonomy) {
boolean inHeadTags = false; boolean inHeadTags = false;
@ -533,13 +548,22 @@ public class XML_processing {
} }
} }
} }
} else if (xmlEvent.isEndElement() && corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) { } else if (xmlEvent.isEndElement() && corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName) && (corpusType != CorpusType.KOST)) {
// if the corpus is split into multiple files, each with only one header block per file // if the corpus is split into multiple files, each with only one header block per file
// that means we should stop after we reach the end of the header // that means we should stop after we reach the end of the header
return parseTaxonomy ? resultTaxonomy : resultFilters; return parseTaxonomy ? resultTaxonomy : resultFilters;
} else if (xmlEvent.isEndElement() && !corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) { } else if (xmlEvent.isEndElement() && !corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
// whole corpus in one file, so we have to continue reading in order to find all header blocks // whole corpus in one file, so we have to continue reading in order to find all header blocks
insideHeader = false; insideHeader = false;
} else if (xmlEvent.isEndElement() && insideNote) {
if (taxonomyNames.contains(filterName)) {
Collections.addAll(resultTaxonomy, Taxonomy.format_KOST_taxonomy(filterValue, filterName));
}
insideNote = false;
} else if (xmlEvent.isCharacters() && insideNote) {
Characters characters = xmlEvent.asCharacters();
filterValue = characters.getData();
} }
} }
} catch (XMLStreamException e) { } catch (XMLStreamException e) {
@ -726,6 +750,8 @@ public class XML_processing {
boolean inPunctuation = false; boolean inPunctuation = false;
boolean taxonomyMatch = true; boolean taxonomyMatch = true;
ArrayList<Taxonomy> currentFiletaxonomy = new ArrayList<>(); ArrayList<Taxonomy> currentFiletaxonomy = new ArrayList<>();
HashSet<String> taxonomyNames = new HashSet<String>(
Arrays.asList("FirstLang", "TaskSetting", "ProficSlv", "ProgramType", "InputType"));
String lemma = ""; String lemma = "";
String msd = ""; String msd = "";
@ -760,6 +786,9 @@ public class XML_processing {
try { try {
XMLInputFactory factory = XMLInputFactory.newInstance(); XMLInputFactory factory = XMLInputFactory.newInstance();
eventReader = factory.createXMLEventReader(new FileInputStream(path)); eventReader = factory.createXMLEventReader(new FileInputStream(path));
boolean insideNote = false;
String filterName = "";
String filterValue = "";
while (eventReader.hasNext()) { while (eventReader.hasNext()) {
int percentage = (int) (lineNum * 100.0 / numLines); int percentage = (int) (lineNum * 100.0 / numLines);
@ -803,6 +832,12 @@ public class XML_processing {
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()).replace("#", ""), stats.getCorpus()); Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()).replace("#", ""), stats.getCorpus());
currentFiletaxonomy.add(currentFiletaxonomyElement); currentFiletaxonomy.add(currentFiletaxonomyElement);
} }
// kost
} else if (stats.getCorpus().getCorpusType() == CorpusType.KOST && stats.getCorpus().getTaxonomy().size() > 0 && qName.equalsIgnoreCase("note")) {
filterName = startElement.getAttributeByName(QName.valueOf("ana"))
.getValue().replace("#", "");
insideNote = true;
} else if (stats.getCorpus().getTaxonomy().size() > 0 && qName.equalsIgnoreCase("catRef")) { } else if (stats.getCorpus().getTaxonomy().size() > 0 && qName.equalsIgnoreCase("catRef")) {
// get value from attribute target // get value from attribute target
Attribute tax = startElement.getAttributeByName(QName.valueOf("target")); Attribute tax = startElement.getAttributeByName(QName.valueOf("target"));
@ -818,6 +853,10 @@ public class XML_processing {
} else if (qName.equals("text")){ } else if (qName.equals("text")){
taxonomyMatch = true; taxonomyMatch = true;
} else if (stats.getCorpus().getCorpusType() == CorpusType.KOST && qName.equals("standOff") ||
stats.getCorpus().getCorpusType() == CorpusType.KOST && qName.equals("TEI")
) {
return true;
} }
break; break;
@ -836,6 +875,10 @@ public class XML_processing {
sentence.add(createWord(punctuation, punctuation, "/", punctuation, stats.getFilter())); sentence.add(createWord(punctuation, punctuation, "/", punctuation, stats.getFilter()));
inPunctuation = false; inPunctuation = false;
} }
// kost
if (insideNote) {
filterValue = characters.getData();
}
break; break;
case XMLStreamConstants.END_ELEMENT: case XMLStreamConstants.END_ELEMENT:
@ -876,7 +919,8 @@ public class XML_processing {
} }
// fallback // fallback
else if (endElement.getName().getLocalPart().equalsIgnoreCase("div") && else if (endElement.getName().getLocalPart().equalsIgnoreCase("div") &&
stats.getCorpus().getCorpusType() == CorpusType.SSJ500K) { (stats.getCorpus().getCorpusType() == CorpusType.SSJ500K ||
stats.getCorpus().getCorpusType() == CorpusType.KOST)) {
// join corpus and stats // join corpus and stats
fj(corpus, stats); fj(corpus, stats);
corpus.clear(); corpus.clear();
@ -892,7 +936,7 @@ public class XML_processing {
// taxonomies don't match so stop // taxonomies don't match so stop
// union (select words that match any of selected taxonomy // union (select words that match any of selected taxonomy
taxonomyMatch = false; taxonomyMatch = false;
//
} else if(stats.getFilter().getTaxonomySetOperation().equals(I18N.get("taxonomySetOperation.INTERSECTION")) && currentFiletaxonomy.size() != stats.getFilter().getTaxonomy().size()){ } else if(stats.getFilter().getTaxonomySetOperation().equals(I18N.get("taxonomySetOperation.INTERSECTION")) && currentFiletaxonomy.size() != stats.getFilter().getTaxonomy().size()){
// intersection (select only words that precisely match selected taxonomy // intersection (select only words that precisely match selected taxonomy
taxonomyMatch = false; taxonomyMatch = false;
@ -900,6 +944,17 @@ public class XML_processing {
} }
} else if (endElement.getName().getLocalPart().equals("text")){ } else if (endElement.getName().getLocalPart().equals("text")){
taxonomyMatch = false; taxonomyMatch = false;
// kost
}
if (insideNote) {
if (taxonomyNames.contains(filterName)) {
for (String taxonomy : Taxonomy.format_KOST_taxonomy(filterValue, filterName)) {
// keep only taxonomy properties
Taxonomy currentFiletaxonomyElement = Taxonomy.factory(taxonomy, stats.getCorpus());
currentFiletaxonomy.add(currentFiletaxonomyElement);
}
}
insideNote = false;
} }
break; break;

View File

@ -3,11 +3,8 @@ package alg.ngram;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Set;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import com.sun.xml.internal.bind.v2.runtime.reflect.Lister;
import data.*; import data.*;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.LogManager;

View File

@ -9,6 +9,7 @@ public enum CorpusType {
CCKRES("ccKres ", "cckres"), CCKRES("ccKres ", "cckres"),
SOLAR("Šolar", "šolar"), SOLAR("Šolar", "šolar"),
GOS("GOS", "gos"), GOS("GOS", "gos"),
KOST("KOST", "kost"),
SSJ500K("ssj500k", "ssj500k"), SSJ500K("ssj500k", "ssj500k"),
VERT("vert", "vert"); VERT("vert", "vert");

View File

@ -10,7 +10,7 @@ import javafx.collections.ObservableList;
public class Tax { public class Tax {
private static LinkedHashMap<String, String> GIGAFIDA_TAXONOMY; private static LinkedHashMap<String, String> GIGAFIDA_TAXONOMY;
private static LinkedHashMap<String, String> GOS_TAXONOMY; private static LinkedHashMap<String, String> GOS_TAXONOMY;
private static final HashSet<CorpusType> corpusTypesWithTaxonomy = new HashSet<>(Arrays.asList(CorpusType.GIGAFIDA, CorpusType.GOS, CorpusType.CCKRES, CorpusType.SSJ500K, CorpusType.GIGAFIDA2, CorpusType.VERT)); private static final HashSet<CorpusType> corpusTypesWithTaxonomy = new HashSet<>(Arrays.asList(CorpusType.GIGAFIDA, CorpusType.GOS, CorpusType.CCKRES, CorpusType.KOST, CorpusType.SSJ500K, CorpusType.GIGAFIDA2, CorpusType.VERT));
static { static {
// GIGAFIDA ---------------------------- // GIGAFIDA ----------------------------
@ -108,7 +108,7 @@ public class Tax {
tax = GIGAFIDA_TAXONOMY; tax = GIGAFIDA_TAXONOMY;
} else if (corpusType == CorpusType.GOS) { } else if (corpusType == CorpusType.GOS) {
tax = GOS_TAXONOMY; tax = GOS_TAXONOMY;
} else if (corpusType == CorpusType.VERT || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2){ } else if (corpusType == CorpusType.VERT || corpusType == CorpusType.KOST || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2){
// if VERT only order taxonomy by alphabet // if VERT only order taxonomy by alphabet
ArrayList<String> sortedFoundTaxonomy = new ArrayList<>(foundTax); ArrayList<String> sortedFoundTaxonomy = new ArrayList<>(foundTax);
Collections.sort(sortedFoundTaxonomy); Collections.sort(sortedFoundTaxonomy);
@ -199,7 +199,7 @@ public class Tax {
tax = GIGAFIDA_TAXONOMY; tax = GIGAFIDA_TAXONOMY;
} else if (corpusType == CorpusType.GOS) { } else if (corpusType == CorpusType.GOS) {
tax = GOS_TAXONOMY; tax = GOS_TAXONOMY;
} else if (corpusType == CorpusType.VERT || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2) { } else if (corpusType == CorpusType.VERT || corpusType == CorpusType.KOST || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2) {
for (Taxonomy t : taxonomy) { for (Taxonomy t : taxonomy) {
result.add(t.toLongNameString()); result.add(t.toLongNameString());
} }

View File

@ -763,6 +763,42 @@ public class Taxonomy {
} }
public static String[] format_KOST_taxonomy(String value, String parameter) {
Map<String, String> filterMap = new HashMap<>();
filterMap.put("FirstLang", "Prvi Jezik tvorca");
filterMap.put("TaskSetting", "Okoliščine nastanka");
filterMap.put("ProficSlv", "Nivo");
filterMap.put("ProgramType", "Program");
filterMap.put("InputType", "Napisano");
String[] split_value = new String[] {};
if (parameter.equals("FirstLang")) {
if (value.contains(", ")) {
split_value = value.split(", ");
} else if (value.contains(" ")) {
for (String v : value.split(" ")) {
if (v.equals("španščina") || v.equals("angleščina")) {
split_value = new String[] {v};
}
}
} else {
split_value = new String[] {value};
}
} else if (parameter.equals("ProficSlv")) {
if (value.equals("Izpopolnjevalec")) {
split_value = new String[] {"izpopolnjevalec"};
} else {
split_value = new String[] {value};
}
} else {
split_value = new String[] {value};
}
return Arrays.stream(split_value)
.map(val -> filterMap.get(parameter) + " - " + val)
.toArray(String[]::new);
}
public String toString() { public String toString() {
return this.name; return this.name;
} }
@ -834,7 +870,7 @@ public class Taxonomy {
public static ArrayList<Taxonomy> modifyingTaxonomy(ArrayList<Taxonomy> taxonomy, List<String> checkedItems, Corpus corpus){ public static ArrayList<Taxonomy> modifyingTaxonomy(ArrayList<Taxonomy> taxonomy, List<String> checkedItems, Corpus corpus){
ArrayList<TaxonomyEnum> checkedItemsTaxonomy = TaxonomyEnum.convertStringListToTaxonomyList(checkedItems, corpus); ArrayList<TaxonomyEnum> checkedItemsTaxonomy = TaxonomyEnum.convertStringListToTaxonomyList(checkedItems, corpus);
if (checkedItemsTaxonomy != null && corpus.getCorpusType() != CorpusType.VERT && corpus.getCorpusType() != CorpusType.SSJ500K && corpus.getCorpusType() != CorpusType.GIGAFIDA2) { if (checkedItemsTaxonomy != null && corpus.getCorpusType() != CorpusType.VERT && corpus.getCorpusType() != CorpusType.KOST && corpus.getCorpusType() != CorpusType.SSJ500K && corpus.getCorpusType() != CorpusType.GIGAFIDA2) {
TaxonomyEnum.modifyingTaxonomy(Taxonomy.taxonomyToTaxonomyEnum(taxonomy), checkedItemsTaxonomy, corpus); TaxonomyEnum.modifyingTaxonomy(Taxonomy.taxonomyToTaxonomyEnum(taxonomy), checkedItemsTaxonomy, corpus);
return taxonomyEnumToTaxonomy(checkedItemsTaxonomy, corpus); return taxonomyEnumToTaxonomy(checkedItemsTaxonomy, corpus);
} else { } else {

View File

@ -5,6 +5,7 @@ import static gui.GUIController.*;
import static util.Util.*; import static util.Util.*;
import java.io.File; import java.io.File;
import java.io.FileWriter;
import java.io.IOException; import java.io.IOException;
import java.lang.reflect.Constructor; import java.lang.reflect.Constructor;
import java.lang.reflect.Field; import java.lang.reflect.Field;
@ -149,7 +150,7 @@ public class CorpusTab {
private String corpusLocation; private String corpusLocation;
private String corpusFilesSize; private String corpusFilesSize;
private static final String [] SELECT_READER_ARRAY = {"VERT + REGI", "XML (Šolar 1.0)", "XML (GOS 1.0)", "XML (ssj500k 2.1)", "XML (Gigafida 2.0)", "XML (Gigafida 1.0, Kres 1.0)"}; private static final String [] SELECT_READER_ARRAY = {"VERT + REGI", "XML (Šolar 1.0)", "XML (GOS 1.0)", "XML (KOST 2.0)", "XML (ssj500k 2.1)", "XML (Gigafida 2.0)", "XML (Gigafida 1.0, Kres 1.0)"};
private static final ArrayList<String> SELECT_READER = new ArrayList<>(Arrays.asList(SELECT_READER_ARRAY)); private static final ArrayList<String> SELECT_READER = new ArrayList<>(Arrays.asList(SELECT_READER_ARRAY));
private static final String [] PUNCTUATION_ARRAY = {"punctuation.COMMA", "punctuation.POINT"}; private static final String [] PUNCTUATION_ARRAY = {"punctuation.COMMA", "punctuation.POINT"};
@ -194,8 +195,6 @@ public class CorpusTab {
} }
public void initialize() { public void initialize() {
updateTooltipBehavior(0.0, 30000.0,0.0, true);
// add CSS style // add CSS style
corpusTabPane.getStylesheets().add("style.css"); corpusTabPane.getStylesheets().add("style.css");
corpusTabPane.getStyleClass().add("root"); corpusTabPane.getStyleClass().add("root");
@ -499,7 +498,7 @@ public class CorpusTab {
logger.info("reading header data for ", corpusType.toString()); logger.info("reading header data for ", corpusType.toString());
if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.GOS || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2) { if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.GOS || corpusType == CorpusType.KOST || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2) {
boolean corpusIsSplit = corpusFiles.size() > 1; boolean corpusIsSplit = corpusFiles.size() > 1;
final Task<HashSet<String>> task = new Task<HashSet<String>>() { final Task<HashSet<String>> task = new Task<HashSet<String>>() {
@ -740,7 +739,6 @@ public class CorpusTab {
private void selectReader() { private void selectReader() {
switch (selectReader) { switch (selectReader) {
// "vert", "Solar", "GOS", "SSJ500K", "Gigafida", "Gigafida (old)", "Kres (old)"
case "VERT + REGI": case "VERT + REGI":
corpusType = VERT; corpusType = VERT;
break; break;
@ -750,6 +748,9 @@ public class CorpusTab {
case "XML (GOS 1.0)": case "XML (GOS 1.0)":
corpusType = GOS; corpusType = GOS;
break; break;
case "XML (KOST 2.0)":
corpusType = KOST;
break;
case "XML (ssj500k 2.1)": case "XML (ssj500k 2.1)":
corpusType = SSJ500K; corpusType = SSJ500K;
break; break;
@ -788,6 +789,8 @@ public class CorpusTab {
corpusType = GOS; corpusType = GOS;
} else if (attrib.contains(SSJ500K.getNameLowerCase())) { } else if (attrib.contains(SSJ500K.getNameLowerCase())) {
corpusType = SSJ500K; corpusType = SSJ500K;
} else if (attrib.contains(KOST.getNameLowerCase())) {
corpusType = KOST;
} }
if (corpusType == null) { if (corpusType == null) {

View File

@ -61,23 +61,13 @@ public final class I18N {
public static String get(final String key, final Object... args) { public static String get(final String key, final Object... args) {
ResourceBundle bundle = ResourceBundle.getBundle("message", getLocale()); ResourceBundle bundle = ResourceBundle.getBundle("message", getLocale());
String val = bundle.getString(key); String val = bundle.getString(key);
try { return MessageFormat.format(val, args);
return MessageFormat.format(new String(val.getBytes("ISO-8859-1"), "UTF-8"), args);
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
return val;
} }
public static String getDefaultLocaleItem(final String key, final Object... args) { public static String getDefaultLocaleItem(final String key, final Object... args) {
ResourceBundle bundle = ResourceBundle.getBundle("message", getDefaultLocale()); ResourceBundle bundle = ResourceBundle.getBundle("message", getDefaultLocale());
String val = bundle.getString(key); String val = bundle.getString(key);
try { return MessageFormat.format(val, args);
return MessageFormat.format(new String(val.getBytes("ISO-8859-1"), "UTF-8"), args);
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
return val;
} }
public static ObservableList<String> getObject(final ArrayList<String> keys, final Object... args) { public static ObservableList<String> getObject(final ArrayList<String> keys, final Object... args) {
@ -86,11 +76,7 @@ public final class I18N {
ArrayList<String> results = new ArrayList<>(); ArrayList<String> results = new ArrayList<>();
for(String key : keys){ for(String key : keys){
String val = bundle.getString(key); String val = bundle.getString(key);
try { results.add(val);
results.add(MessageFormat.format(new String(val.getBytes("ISO-8859-1"), "UTF-8"), args));
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
} }
return FXCollections.observableArrayList(results); return FXCollections.observableArrayList(results);
@ -121,12 +107,7 @@ public final class I18N {
public static String getIndependent(final String key, Locale locale, final Object... args) { public static String getIndependent(final String key, Locale locale, final Object... args) {
ResourceBundle bundle = ResourceBundle.getBundle("message", locale); ResourceBundle bundle = ResourceBundle.getBundle("message", locale);
String val = bundle.getString(key); String val = bundle.getString(key);
try { return MessageFormat.format(val, args);
return MessageFormat.format(new String(val.getBytes("ISO-8859-1"), "UTF-8"), args);
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
return val;
} }
public static String getRootValue(String oldValue, ArrayList<String> nGramComputeForLetters) { public static String getRootValue(String oldValue, ArrayList<String> nGramComputeForLetters) {

View File

@ -0,0 +1,8 @@
package gui;
public class Launcher {
public static void main(String[] args) {
GUIController.main(args);
}
}

View File

@ -44,7 +44,7 @@ public class Messages {
// Not properly to be here. TODO move somewhere else in future // Not properly to be here. TODO move somewhere else in future
public static String HELP_URL = "http://slovnica.ijs.si/"; public static String HELP_URL = "http://slovnica.ijs.si/";
public static String CJVT_URL = "http://hdl.handle.net/11356/1227"; public static String CJVT_URL = "http://hdl.handle.net/11356/1964";
public static String GITHUB_URL = "https://gitea.cjvt.si/lkrsnik/list"; public static String GITHUB_URL = "https://gitea.cjvt.si/lkrsnik/list";
// helper maps // helper maps

View File

@ -282,8 +282,8 @@ exportFileName.wordSets=word-sets
exportFileName.gram=-gram exportFileName.gram=-gram
exportFileName.skip=-skip exportFileName.skip=-skip
about.header=LIST Corpus Extraction Tool\nVersion: 1.2 (Last update: 18 November 2019)\nAuthors: Luka Krsnik, Špela Arhar Holdt, Jaka Čibej, Kaja Dobrovoljc, Aleksander\n Ključevšek, Simon Krek, Marko Robnik Šikonja about.header=LIST Corpus Extraction Tool\nVersion: 1.3 (Last update: 28 August 2024)\nAuthors: Luka Krsnik, Špela Arhar Holdt, Jaka Čibej, Kaja Dobrovoljc, Aleksander\n Ključevšek, Simon Krek, Marko Robnik Šikonja
about.description=The LIST corpus extraction tool is a program for extracting lists from text corpora on the\n levels of characters, word parts, words, and word sets. The program was developed within\n the New Grammar of Modern Standard Slovene: Resource and Methods project (J6-8256),\n which was financially supported by the Slovenian Research Agency between 2017 and 2020.\n The authors acknowledge the financial support from the Slovenian Research Agency\n (research core funding No. P6-0411 Language Resources and Technologies for Slovene).\n about.description=The LIST corpus extraction tool is a program for extracting lists from text corpora on the\n levels of characters, word parts, words, and word sets. The program was developed within\n the New Grammar of Modern Standard Slovene: Resource and Methods project (J6-8256),\n the Empirical foundations for digitally-supported development of writing skills project (J7-3159)\n and the Language Resources and Technologies for Slovene programme (P6-0411), all\n financed by the Slovenian Research and Innovation Agency (ARIS).\n
about.signature=Publisher: Centre for Language Resources and Technologies, University of Ljubljana,\nJožef Stefan Institute,\nFaculty of Computer and Information Science, University of Ljubljana about.signature=Publisher: Centre for Language Resources and Technologies, University of Ljubljana,\nJožef Stefan Institute,\nFaculty of Computer and Information Science, University of Ljubljana
about.footer=Maintenance: Centre for Language Resources and Technologies, University of Ljubljana\nThe program is available under the Apache2 licence at CLARIN.si and GitHub. about.footer=Maintenance: Centre for Language Resources and Technologies, University of Ljubljana\nThe program is available under the Apache2 licence at CLARIN.si and GitHub.
about.links=Links: about.links=Links:

View File

@ -282,8 +282,8 @@ exportFileName.wordSets=besedni-nizi
exportFileName.gram=-gram exportFileName.gram=-gram
exportFileName.skip=-preskok exportFileName.skip=-preskok
about.header=LIST, korpusni luščilnik\nRazličica: 1.2 (Zadnja posodobitev: 18. november 2019)\nAvtorji: Luka Krsnik, Špela Arhar Holdt, Jaka Čibej, Kaja Dobrovoljc, Aleksander\n Ključevšek, Simon Krek, Marko Robnik Šikonja about.header=LIST, korpusni luščilnik\nRazličica: 1.3 (Zadnja posodobitev: 28. avgust 2024)\nAvtorji: Luka Krsnik, Špela Arhar Holdt, Jaka Čibej, Kaja Dobrovoljc, Aleksander\n Ključevšek, Simon Krek, Marko Robnik Šikonja
about.description=Korpusni luščilnik LIST je program za luščenje spiskov iz besedilnih korpusov na nivojih\n znakov, besednih delov, besed in besednih nizov. Nastal je v okviru projekta Nova slovnica\n sodobne standardne slovenščine: viri in metode (J6-8256), ki ga je med letoma 2017 in 2020\n sofinancirala Javna agencija za raziskovalno dejavnost Republike Slovenije iz državnega\n proračuna. Raziskovalni program Jezikovni viri in tehnologije za slovenski jezik (št. P6-0411)\n je sofinancirala Javna agencija za raziskovalno dejavnost Republike Slovenije iz državnega\n proračuna. about.description=Korpusni luščilnik LIST je program za luščenje spiskov iz besedilnih korpusov na nivojih\n znakov, besednih delov, besed in besednih nizov. Nastal je v okviru projektov Nova slovnica\n sodobne standardne slovenščine: viri in metode (J6-8256), Empirična podlaga za digitalno\n podprt razvoj pisne jezikovne zmožnosti (J7-3159) in raziskovalnega programa Jezikovni viri\n in tehnologije za slovenski jezik (št. P6-0411), ki jih financira Javna agencija za\n znanstvenoraziskovalno in inovacijsko dejavnost Republike Slovenije (ARIS).
about.signature=Izdajatelj: Center za jezikovne vire in tehnologije Univerze v Ljubljani,\nInstitut "Jožef Stefan",\nFakulteta za računalništvo in informatiko Univerze v Ljubljani about.signature=Izdajatelj: Center za jezikovne vire in tehnologije Univerze v Ljubljani,\nInstitut "Jožef Stefan",\nFakulteta za računalništvo in informatiko Univerze v Ljubljani
about.footer=Vzdrževanje programa: Center za jezikovne vire in tehnologije Univerze v Ljubljani\nProgram je dostopen pod licenco Apache2 na repozitorijih CLARIN.si in GitHub. about.footer=Vzdrževanje programa: Center za jezikovne vire in tehnologije Univerze v Ljubljani\nProgram je dostopen pod licenco Apache2 na repozitorijih CLARIN.si in GitHub.
about.links=Povezave: about.links=Povezave:

View File

@ -13,9 +13,7 @@ public class CorpusTests {
@Test @Test
public void solarTest() { public void solarTest() {
// File selectedDirectory = new File("/home/luka/Desktop/corpus-analyzer/src/main/resources/Solar"); File selectedDirectory = new File("/home/luka/Development/CJVT/list/src/main/resources/Gigafida_subset/");
// File selectedDirectory = new File("/home/andrej/Desktop/corpus-analyzer/src/main/resources/GOS");
File selectedDirectory = new File("/home/luka/Development/corpus-analyzer2/src/main/resources/Gigafida_subset/");
Settings.resultsFilePath = new File(selectedDirectory.getAbsolutePath().concat(File.separator)); Settings.resultsFilePath = new File(selectedDirectory.getAbsolutePath().concat(File.separator));
@ -23,20 +21,7 @@ public class CorpusTests {
File f = Settings.corpus.iterator().next(); File f = Settings.corpus.iterator().next();
// Statistics stats = new Statistics(AnalysisLevel.STRING_LEVEL, 2, 0, CalculateFor.WORD);
// // stats.setCorpusType(CorpusType.GOS);
// stats.setCorpusType(CorpusType.SOLAR);
// XML_processing.readXMLGos(f.toString(), stats);
// XML_processing.readXML(f.toString(), stats);
// XML_processing.readXMLHeaderTag(f.toString(), "stats");
} }
// @Test
// public void test() {
// ObservableList<String> var = GosTaxonomy.getForComboBox();
// String debug = "";
//
// }
} }