Compare commits

No commits in common. "master" and "console-run" have entirely different histories.

master ... console-run

.gitignore (vendored): 5 lines changed

@@ -164,6 +164,7 @@ $RECYCLE.BIN/
src/main/resources/translation_external/
src/main/resources/translations_backup/
shade
TEMP

config.json
config_instructions.txt
data
Corpus Analyzer.iml (new executable file): 28 lines added

@@ -0,0 +1,28 @@
<?xml version="1.0" encoding="UTF-8"?>
<module org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule="true" type="JAVA_MODULE" version="4">
  <component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_8">
    <output url="file://$MODULE_DIR$/target/classes" />
    <output-test url="file://$MODULE_DIR$/target/test-classes" />
    <content url="file://$MODULE_DIR$">
      <sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/src/main/resources" type="java-resource" />
      <sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" />
      <excludeFolder url="file://$MODULE_DIR$/target" />
    </content>
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
    <orderEntry type="library" name="Maven: commons-io:commons-io:2.5" level="project" />
    <orderEntry type="library" name="Maven: org.apache.commons:commons-lang3:3.6" level="project" />
    <orderEntry type="library" name="Maven: com.googlecode.json-simple:json-simple:1.1.1" level="project" />
    <orderEntry type="library" name="Maven: junit:junit:4.10" level="project" />
    <orderEntry type="library" name="Maven: org.hamcrest:hamcrest-core:1.1" level="project" />
    <orderEntry type="library" name="Maven: org.apache.commons:commons-csv:1.4" level="project" />
    <orderEntry type="library" name="Maven: org.controlsfx:controlsfx:8.40.13" level="project" />
    <orderEntry type="library" name="Maven: org.rocksdb:rocksdbjni:5.7.3" level="project" />
    <orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-api:2.9.0" level="project" />
    <orderEntry type="library" name="Maven: org.apache.logging.log4j:log4j-core:2.9.0" level="project" />
    <orderEntry type="library" name="Maven: org.kordamp.ikonli:ikonli-fontawesome-pack:1.9.0" level="project" />
    <orderEntry type="library" name="Maven: org.kordamp.ikonli:ikonli-core:1.9.0" level="project" />
    <orderEntry type="library" name="Maven: org.kordamp.ikonli:ikonli-javafx:1.9.0" level="project" />
  </component>
</module>
@@ -1,11 +0,0 @@
# Build a jar

```shell
mvn package
```
- results are in shade folder

# Build executable using Launch4j
- Install Java on Windows
- Run Launch4j (download first) and create executable
- Copy jre from computer to jre folder that should be in the same folder as list.exe
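The deleted notes above compress the whole packaging flow into a few bullet points. A minimal sketch of that flow, assuming the shade output path configured in the pom.xml shown further below (shade\list.jar on the side that uses the shade plugin) and an ordinary Launch4j setup; the dist folder name and JRE location are illustrative assumptions, not part of the diff:

```shell
# Build the self-contained jar; the shade plugin writes it into the shade/ folder
mvn package

# Launch4j (run separately, configured via its GUI) wraps the jar into list.exe.
# The deleted notes then ask for a JRE copied next to the executable:
mkdir -p dist
cp shade/list.jar dist/
cp -r "$JAVA_HOME/jre" dist/jre   # assumed path; matches a JDK 8 layout
```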
corpus-analyzer.jar (binary, executable file)
Binary file not shown.
@@ -1,24 +0,0 @@
# Instructions

Instructions on how to run list.

## Windows

There are two options.

### Run list.exe
The easier option is to download list.zip, extract it and run list.exe.

### Run list.jar
To do this you first need to install the correct version of java (JDK). The program was developed and tested on [JDK Development Kit 21.0.2](https://www.oracle.com/java/technologies/downloads/#java21).
If you already have another version of Java installed you might have to delete previous version before you install this.

Secondly, you may run list using `run.bat` which will run `list.jar` for you.

## Linux
### Run list.jar
Similarly to running list.jar in Windows, you have to first make sure, that you have the appropriate version of Java installed. The program was developed and tested on [JDK Development Kit 21.0.2](https://www.oracle.com/java/technologies/downloads/#java21).
If you already have another version of Java installed you might have to delete previous version before you install this.

Next, you may run list using `run.sh` which will run `list.jar` for you.
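The instructions above point at run.bat and run.sh wrappers whose contents are not included in this diff. A plausible minimal sketch of such a launcher (the jar name comes from the text; everything else is an assumption):

```shell
#!/bin/sh
# run.sh: assumed one-line launcher for Linux; run.bat would hold the
# equivalent single command for Windows.
java -jar list.jar
```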
pom.xml: 130 lines changed

@@ -4,30 +4,9 @@
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>list</groupId>
    <artifactId>list</artifactId>
    <name>list</name>
    <version>1.3</version>

    <repositories>
        <repository>
            <id>central</id>
            <name>Central Repository</name>
            <url>https://repo.maven.apache.org/maven2/</url>
        </repository>
        <repository>
            <id>central2</id>
            <name>Central Repository2</name>
            <url>https://repo1.maven.org/maven2/</url>
        </repository>
    </repositories>
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.release>17</maven.compiler.release>
        <javafx.version>21</javafx.version>
        <ikonli.version>12.3.1</ikonli.version>
        <javafx.maven.plugin.version>0.0.8</javafx.maven.plugin.version>
    </properties>
    <groupId>thesis</groupId>
    <artifactId>corpus-analyzer</artifactId>
    <version>1.2</version>

    <dependencies>
        <dependency>

@@ -53,7 +32,7 @@
        <dependency>
            <groupId>org.controlsfx</groupId>
            <artifactId>controlsfx</artifactId>
            <version>11.2.0</version>
            <version>8.40.13</version>
        </dependency>
        <dependency>
            <groupId>org.rocksdb</groupId>

@@ -73,82 +52,71 @@
        <dependency>
            <groupId>org.kordamp.ikonli</groupId>
            <artifactId>ikonli-fontawesome-pack</artifactId>
            <version>${ikonli.version}</version>
        </dependency>
        <dependency>
            <groupId>org.openjfx</groupId>
            <artifactId>javafx-controls</artifactId>
            <version>${javafx.version}</version>
        </dependency>
        <dependency>
            <groupId>org.openjfx</groupId>
            <artifactId>javafx-fxml</artifactId>
            <version>${javafx.version}</version>
        </dependency>
        <dependency>
            <groupId>org.openjfx</groupId>
            <artifactId>javafx-graphics</artifactId>
            <version>${javafx.version}</version>
            <classifier>win</classifier>
        </dependency>
        <dependency>
            <groupId>org.openjfx</groupId>
            <artifactId>javafx-graphics</artifactId>
            <version>${javafx.version}</version>
            <classifier>linux</classifier>
        </dependency>
        <dependency>
            <groupId>org.openjfx</groupId>
            <artifactId>javafx-graphics</artifactId>
            <version>${javafx.version}</version>
            <classifier>mac</classifier>
            <version>1.9.0</version>
        </dependency>
        <dependency>
            <groupId>org.kordamp.ikonli</groupId>
            <artifactId>ikonli-javafx</artifactId>
            <version>${ikonli.version}</version>
            <version>1.9.0</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <!-- packages dependencies into the jar -->
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.11.0</version>
            </plugin>
            <plugin>
                <groupId>org.openjfx</groupId>
                <artifactId>javafx-maven-plugin</artifactId>
                <version>${javafx.maven.plugin.version}</version>
                <configuration>
                    <mainClass>gui.GUIController</mainClass>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>3.2.0</version>
                <artifactId>maven-assembly-plugin</artifactId>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                            <goal>single</goal>
                        </goals>
                        <configuration>
                            <shadedArtifactAttached>true</shadedArtifactAttached>
                            <shadedClassifierName>project-classifier</shadedClassifierName>
                            <outputFile>shade\${project.artifactId}.jar</outputFile>
                            <transformers>
                                <transformer implementation=
                                        "org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass>gui.Launcher</mainClass>
                                </transformer>
                            </transformers>
                            <archive>
                                <manifest>
                                    <mainClass>gui.GUIController</mainClass>
                                </manifest>
                            </archive>
                            <descriptorRefs>
                                <descriptorRef>jar-with-dependencies</descriptorRef>
                            </descriptorRefs>
                            <appendAssemblyId>false</appendAssemblyId>
                            <outputDirectory>artifact</outputDirectory>
                            <finalName>Corpus_Analyzer_${version}</finalName>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <!-- JavaFX -->
                <groupId>com.zenjava</groupId>
                <artifactId>javafx-maven-plugin</artifactId>
                <version>8.8.3</version>
                <configuration>
                    <mainClass>gui.GUIController</mainClass>
                    <verbose>true</verbose>
                </configuration>
                <executions>
                    <execution>
                        <id>create-jfxjar</id>
                        <phase>package</phase>
                        <goals>
                            <goal>build-jar</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>

</project>
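The two build configurations above name their artifacts differently: one side shades everything into shade\list.jar (via outputFile, manifest main class gui.Launcher), the other assembles artifact/Corpus_Analyzer_1.2.jar (finalName with appendAssemblyId set to false, manifest main class gui.GUIController). A sketch of building and running each, assuming a locally installed JDK that matches the respective compiler settings (release 17 vs. source/target 1.8):

```shell
# side with maven-shade-plugin
mvn package
java -jar shade/list.jar

# side with maven-assembly-plugin (jar-with-dependencies descriptor)
mvn package
java -jar artifact/Corpus_Analyzer_1.2.jar
```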
readme.md: 18 lines changed

@@ -1,10 +1,10 @@
(English version below)

LIST, korpusni luščilnik
Različica: 1.3 (Zadnja posodobitev: 28. avgust 2024)
Različica: 1.0 (Zadnja posodobitev: 21. marec 2019)
Avtorji: Luka Krsnik, Špela Arhar Holdt, Jaka Čibej, Kaja Dobrovoljc, Aleksander Ključevšek, Simon Krek, Marko Robnik Šikonja

Korpusni luščilnik LIST je program za luščenje spiskov iz besedilnih korpusov na nivojih znakov, besednih delov, besed in besednih nizov. Nastal je v okviru projektov Nova slovnica sodobne standardne slovenščine: viri in metode (J6-8256), Empirična podlaga za digitalno podprt razvoj pisne jezikovne zmožnosti (J7-3159) in raziskovalnega programa Jezikovni viri in tehnologije za slovenski jezik (št. P6-0411), ki jih financira Javna agencija za znanstvenoraziskovalno in inovacijsko dejavnost Republike Slovenije (ARIS) iz državnega proračuna.
Korpusni luščilnik LIST je program za luščenje spiskov iz besedilnih korpusov na nivojih znakov, besednih delov, besed in besednih nizov. Nastal je v okviru projekta Nova slovnica sodobne standardne slovenščine: viri in metode (J6-8256), ki ga je med letoma 2017 in 2020 sofinancirala Javna agencija za raziskovalno dejavnost Republike Slovenije iz državnega proračuna. Raziskovalni program Jezikovni viri in tehnologije za slovenski jezik (št. P6-0411) je sofinancirala Javna agencija za raziskovalno dejavnost Republike Slovenije iz državnega proračuna.

Izdajatelj: Center za jezikovne vire in tehnologije Univerze v Ljubljani,
Institut "Jožef Stefan",

@@ -15,16 +15,19 @@ Program je dostopen pod licenco MIT License na repozitorijih CLARIN.SI (http://h

NAVODILA ZA NAMESTITEV IN ZAGON:

Datoteko list.zip razširimo in poženemo `list.exe` znotraj razširjene mape. Druge možnosti so opisane v [razširjeni dokumentaciji](instructions.md).
1) Pred uporabo programske opreme mora biti na računalniku nameščena 64-bitna java (https://java.com/en/download/manual.jsp).
2) Vse tri programske datoteke (run.sh, run.bat, list1.0.jar) skopiramo v poljubno mapo.
3) Program zaženemo z dvoklikom na datoteko run.bat na operacijskem sistemu Windows ali run.sh na operacijskem sistemu Linux.
4) Ko izbiramo lokacijo korpusa, moramo poskrbeti, da v mapi ni datotek več različnih korpusov.


---------

LIST – Corpus Extraction Tool
Version: 1.3 (Last update: 28 August 2024)
Version: 1.0 (Last update: 21 March 2019)
Authors: Luka Krsnik, Špela Arhar Holdt, Jaka Čibej, Kaja Dobrovoljc, Aleksander Ključevšek, Simon Krek, Marko Robnik Šikonja

The LIST corpus extraction tool is a program for extracting lists from text corpora on the levels of characters, word parts, words, and word sets. The program was developed within the New Grammar of Modern Standard Slovene: Resource and Methods project (J6-8256), the Empirical foundations for digitally-supported development of writing skills project (J7-3159) and the Language Resources and Technologies for Slovene programme (P6-0411), all financed by the Slovenian Research and Innovation Agency (ARIS).
The LIST corpus extraction tool is a program for extracting lists from text corpora on the levels of characters, word parts, words, and word sets. The program was developed within the New Grammar of Modern Standard Slovene: Resource and Methods project (J6-8256), which was financially supported by the Slovenian Research Agency between 2017 and 2020. The authors acknowledge the financial support from the Slovenian Research Agency (research core funding No. P6-0411 Language Resources and Technologies for Slovene).

Publisher: Centre for Language Resources and Technologies, University of Ljubljana,
Jožef Stefan Institute,

@@ -35,4 +38,7 @@ The program is available under the MIT License at CLARIN.SI (http://hdl.handle.n

INSTRUCTIONS FOR INSTALLATION AND USE:

Extract list.zip file and run list.exe. For other options please read [detailed instructions](instructions.md).
1) Make sure that 64-bit java is installed on your computer (https://java.com/en/download/manual.jsp).
2) Copy all three program files (run.sh, run.bat, list1.0.jar) in a single folder.
3) Run the program by double-clicking the run.bat file on a Windows operating system or run.sh on Linux.
4) When selecting the location of the corpus, make sure the folder does not include files of multiple different corpora.
@@ -50,8 +50,7 @@ public class XML_processing {
        } else if (stats.getCorpus().getCorpusType() == CorpusType.SOLAR) {
            return readXMLSolar(path, stats);
        } else if (stats.getCorpus().getCorpusType() == CorpusType.SSJ500K ||
                stats.getCorpus().getCorpusType() == CorpusType.GIGAFIDA2 ||
                stats.getCorpus().getCorpusType() == CorpusType.KOST) {
                stats.getCorpus().getCorpusType() == CorpusType.GIGAFIDA2) {
            return readXMLSSJ500K(path, stats);
        } else if (stats.getCorpus().getCorpusType() == CorpusType.VERT) {
            return readVERT(path, stats);

@@ -462,8 +461,6 @@ public class XML_processing {
        HashMap<String, HashSet<String>> resultFilters = new HashMap<>();
        // taxonomy corpora
        HashSet<String> resultTaxonomy = new HashSet<>();
        HashSet<String> taxonomyNames = new HashSet<String>(
                Arrays.asList("FirstLang", "TaskSetting", "ProficSlv", "ProgramType", "InputType"));

        String headTagName;

@@ -474,7 +471,7 @@ public class XML_processing {
            // init results now to avoid null pointers
            headTags.forEach(f -> resultFilters.put(f, new HashSet<>()));
        } else if (corpusType == CorpusType.KOST || corpusType == CorpusType.SSJ500K) {
        } else if (corpusType == CorpusType.SSJ500K) {
            headTagName = "bibl";
        } else {
            headTagName = "teiHeader";

@@ -485,9 +482,6 @@ public class XML_processing {
        try {
            xmlEventReader = factory.createXMLEventReader(new FileInputStream(filepath));
            boolean insideHeader = false;
            boolean insideNote = false;
            String filterName = "";
            String filterValue = "";

            while (xmlEventReader.hasNext()) {
                XMLEvent xmlEvent = xmlEventReader.nextEvent();

@@ -501,10 +495,6 @@ public class XML_processing {
                    // this toggle is true when we're inside a header (next block of code executes)
                    // and false when we're not (skip reading unnecessary attributes)
                    insideHeader = true;
                } else if (corpusType == CorpusType.KOST && elementName.equals("standOff") ||
                        corpusType == CorpusType.KOST && elementName.equals("TEI")
                ) {
                    return resultTaxonomy;
                }

                if (insideHeader) {

@@ -526,11 +516,6 @@ public class XML_processing {
                            .replace("#", "");

                    resultTaxonomy.add(tax);
                // kost
                } else if (parseTaxonomy && (corpusType == CorpusType.KOST) && elementName.equalsIgnoreCase("note")) {
                    filterName = startElement.getAttributeByName(QName.valueOf("ana"))
                            .getValue().replace("#", "");
                    insideNote = true;
                // solar
                } else if (!parseTaxonomy) {
                    boolean inHeadTags = false;

@@ -548,22 +533,13 @@ public class XML_processing {
                        }
                    }
                }
            } else if (xmlEvent.isEndElement() && corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName) && (corpusType != CorpusType.KOST)) {
            } else if (xmlEvent.isEndElement() && corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
                // if the corpus is split into multiple files, each with only one header block per file
                // that means we should stop after we reach the end of the header
                return parseTaxonomy ? resultTaxonomy : resultFilters;
            } else if (xmlEvent.isEndElement() && !corpusIsSplit && isEndElementEndOfHeader(xmlEvent, headTagName)) {
                // whole corpus in one file, so we have to continue reading in order to find all header blocks
                insideHeader = false;
            } else if (xmlEvent.isEndElement() && insideNote) {
                if (taxonomyNames.contains(filterName)) {
                    Collections.addAll(resultTaxonomy, Taxonomy.format_KOST_taxonomy(filterValue, filterName));
                }

                insideNote = false;
            } else if (xmlEvent.isCharacters() && insideNote) {
                Characters characters = xmlEvent.asCharacters();
                filterValue = characters.getData();
            }
        }
    } catch (XMLStreamException e) {
@@ -750,8 +726,6 @@ public class XML_processing {
        boolean inPunctuation = false;
        boolean taxonomyMatch = true;
        ArrayList<Taxonomy> currentFiletaxonomy = new ArrayList<>();
        HashSet<String> taxonomyNames = new HashSet<String>(
                Arrays.asList("FirstLang", "TaskSetting", "ProficSlv", "ProgramType", "InputType"));
        String lemma = "";
        String msd = "";

@@ -786,9 +760,6 @@ public class XML_processing {
        try {
            XMLInputFactory factory = XMLInputFactory.newInstance();
            eventReader = factory.createXMLEventReader(new FileInputStream(path));
            boolean insideNote = false;
            String filterName = "";
            String filterValue = "";

            while (eventReader.hasNext()) {
                int percentage = (int) (lineNum * 100.0 / numLines);

@@ -832,12 +803,6 @@ public class XML_processing {
                            Taxonomy currentFiletaxonomyElement = Taxonomy.factory(String.valueOf(tax.getValue()).replace("#", ""), stats.getCorpus());
                            currentFiletaxonomy.add(currentFiletaxonomyElement);
                        }
                    // kost
                    } else if (stats.getCorpus().getCorpusType() == CorpusType.KOST && stats.getCorpus().getTaxonomy().size() > 0 && qName.equalsIgnoreCase("note")) {
                        filterName = startElement.getAttributeByName(QName.valueOf("ana"))
                                .getValue().replace("#", "");
                        insideNote = true;

                    } else if (stats.getCorpus().getTaxonomy().size() > 0 && qName.equalsIgnoreCase("catRef")) {
                        // get value from attribute target
                        Attribute tax = startElement.getAttributeByName(QName.valueOf("target"));

@@ -853,10 +818,6 @@ public class XML_processing {

                    } else if (qName.equals("text")){
                        taxonomyMatch = true;
                    } else if (stats.getCorpus().getCorpusType() == CorpusType.KOST && qName.equals("standOff") ||
                            stats.getCorpus().getCorpusType() == CorpusType.KOST && qName.equals("TEI")
                    ) {
                        return true;
                    }
                    break;

@@ -875,10 +836,6 @@ public class XML_processing {
                        sentence.add(createWord(punctuation, punctuation, "/", punctuation, stats.getFilter()));
                        inPunctuation = false;
                    }
                    // kost
                    if (insideNote) {
                        filterValue = characters.getData();
                    }
                    break;

                case XMLStreamConstants.END_ELEMENT:

@@ -919,8 +876,7 @@ public class XML_processing {
                    }
                    // fallback
                    else if (endElement.getName().getLocalPart().equalsIgnoreCase("div") &&
                            (stats.getCorpus().getCorpusType() == CorpusType.SSJ500K ||
                            stats.getCorpus().getCorpusType() == CorpusType.KOST)) {
                            stats.getCorpus().getCorpusType() == CorpusType.SSJ500K) {
                        // join corpus and stats
                        fj(corpus, stats);
                        corpus.clear();

@@ -936,7 +892,7 @@ public class XML_processing {
                            // taxonomies don't match so stop
                            // union (select words that match any of selected taxonomy
                            taxonomyMatch = false;

                        //
                        } else if(stats.getFilter().getTaxonomySetOperation().equals(I18N.get("taxonomySetOperation.INTERSECTION")) && currentFiletaxonomy.size() != stats.getFilter().getTaxonomy().size()){
                            // intersection (select only words that precisely match selected taxonomy
                            taxonomyMatch = false;

@@ -944,17 +900,6 @@ public class XML_processing {
                        }
                    } else if (endElement.getName().getLocalPart().equals("text")){
                        taxonomyMatch = false;
                    // kost
                    }
                    if (insideNote) {
                        if (taxonomyNames.contains(filterName)) {
                            for (String taxonomy : Taxonomy.format_KOST_taxonomy(filterValue, filterName)) {
                                // keep only taxonomy properties
                                Taxonomy currentFiletaxonomyElement = Taxonomy.factory(taxonomy, stats.getCorpus());
                                currentFiletaxonomy.add(currentFiletaxonomyElement);
                            }
                        }
                        insideNote = false;
                    }

                    break;
@@ -3,8 +3,11 @@ package alg.ngram;

import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import com.sun.xml.internal.bind.v2.runtime.reflect.Lister;
import data.*;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
@@ -9,7 +9,6 @@ public enum CorpusType {
    CCKRES("ccKres ", "cckres"),
    SOLAR("Šolar", "šolar"),
    GOS("GOS", "gos"),
    KOST("KOST", "kost"),
    SSJ500K("ssj500k", "ssj500k"),
    VERT("vert", "vert");

@@ -10,7 +10,7 @@ import javafx.collections.ObservableList;
public class Tax {
    private static LinkedHashMap<String, String> GIGAFIDA_TAXONOMY;
    private static LinkedHashMap<String, String> GOS_TAXONOMY;
    private static final HashSet<CorpusType> corpusTypesWithTaxonomy = new HashSet<>(Arrays.asList(CorpusType.GIGAFIDA, CorpusType.GOS, CorpusType.CCKRES, CorpusType.KOST, CorpusType.SSJ500K, CorpusType.GIGAFIDA2, CorpusType.VERT));
    private static final HashSet<CorpusType> corpusTypesWithTaxonomy = new HashSet<>(Arrays.asList(CorpusType.GIGAFIDA, CorpusType.GOS, CorpusType.CCKRES, CorpusType.SSJ500K, CorpusType.GIGAFIDA2, CorpusType.VERT));

    static {
        // GIGAFIDA ----------------------------

@@ -108,7 +108,7 @@ public class Tax {
            tax = GIGAFIDA_TAXONOMY;
        } else if (corpusType == CorpusType.GOS) {
            tax = GOS_TAXONOMY;
        } else if (corpusType == CorpusType.VERT || corpusType == CorpusType.KOST || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2){
        } else if (corpusType == CorpusType.VERT || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2){
            // if VERT only order taxonomy by alphabet
            ArrayList<String> sortedFoundTaxonomy = new ArrayList<>(foundTax);
            Collections.sort(sortedFoundTaxonomy);

@@ -199,7 +199,7 @@ public class Tax {
            tax = GIGAFIDA_TAXONOMY;
        } else if (corpusType == CorpusType.GOS) {
            tax = GOS_TAXONOMY;
        } else if (corpusType == CorpusType.VERT || corpusType == CorpusType.KOST || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2) {
        } else if (corpusType == CorpusType.VERT || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2) {
            for (Taxonomy t : taxonomy) {
                result.add(t.toLongNameString());
            }
@@ -763,42 +763,6 @@ public class Taxonomy {

    }

    public static String[] format_KOST_taxonomy(String value, String parameter) {
        Map<String, String> filterMap = new HashMap<>();
        filterMap.put("FirstLang", "Prvi Jezik tvorca");
        filterMap.put("TaskSetting", "Okoliščine nastanka");
        filterMap.put("ProficSlv", "Nivo");
        filterMap.put("ProgramType", "Program");
        filterMap.put("InputType", "Napisano");

        String[] split_value = new String[] {};
        if (parameter.equals("FirstLang")) {
            if (value.contains(", ")) {
                split_value = value.split(", ");
            } else if (value.contains(" ")) {
                for (String v : value.split(" ")) {
                    if (v.equals("španščina") || v.equals("angleščina")) {
                        split_value = new String[] {v};
                    }
                }
            } else {
                split_value = new String[] {value};
            }
        } else if (parameter.equals("ProficSlv")) {
            if (value.equals("Izpopolnjevalec")) {
                split_value = new String[] {"izpopolnjevalec"};
            } else {
                split_value = new String[] {value};
            }
        } else {
            split_value = new String[] {value};
        }

        return Arrays.stream(split_value)
                .map(val -> filterMap.get(parameter) + " - " + val)
                .toArray(String[]::new);
    }

    public String toString() {
        return this.name;
    }

@@ -870,7 +834,7 @@ public class Taxonomy {

    public static ArrayList<Taxonomy> modifyingTaxonomy(ArrayList<Taxonomy> taxonomy, List<String> checkedItems, Corpus corpus){
        ArrayList<TaxonomyEnum> checkedItemsTaxonomy = TaxonomyEnum.convertStringListToTaxonomyList(checkedItems, corpus);
        if (checkedItemsTaxonomy != null && corpus.getCorpusType() != CorpusType.VERT && corpus.getCorpusType() != CorpusType.KOST && corpus.getCorpusType() != CorpusType.SSJ500K && corpus.getCorpusType() != CorpusType.GIGAFIDA2) {
        if (checkedItemsTaxonomy != null && corpus.getCorpusType() != CorpusType.VERT && corpus.getCorpusType() != CorpusType.SSJ500K && corpus.getCorpusType() != CorpusType.GIGAFIDA2) {
            TaxonomyEnum.modifyingTaxonomy(Taxonomy.taxonomyToTaxonomyEnum(taxonomy), checkedItemsTaxonomy, corpus);
            return taxonomyEnumToTaxonomy(checkedItemsTaxonomy, corpus);
        } else {
@@ -5,7 +5,6 @@ import static gui.GUIController.*;
import static util.Util.*;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.lang.reflect.Field;

@@ -150,7 +149,7 @@ public class CorpusTab {
    private String corpusLocation;
    private String corpusFilesSize;

    private static final String [] SELECT_READER_ARRAY = {"VERT + REGI", "XML (Šolar 1.0)", "XML (GOS 1.0)", "XML (KOST 2.0)", "XML (ssj500k 2.1)", "XML (Gigafida 2.0)", "XML (Gigafida 1.0, Kres 1.0)"};
    private static final String [] SELECT_READER_ARRAY = {"VERT + REGI", "XML (Šolar 1.0)", "XML (GOS 1.0)", "XML (ssj500k 2.1)", "XML (Gigafida 2.0)", "XML (Gigafida 1.0, Kres 1.0)"};
    private static final ArrayList<String> SELECT_READER = new ArrayList<>(Arrays.asList(SELECT_READER_ARRAY));

    private static final String [] PUNCTUATION_ARRAY = {"punctuation.COMMA", "punctuation.POINT"};

@@ -195,6 +194,8 @@ public class CorpusTab {
    }

    public void initialize() {
        updateTooltipBehavior(0.0, 30000.0,0.0, true);

        // add CSS style
        corpusTabPane.getStylesheets().add("style.css");
        corpusTabPane.getStyleClass().add("root");

@@ -498,7 +499,7 @@ public class CorpusTab {

        logger.info("reading header data for ", corpusType.toString());

        if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.GOS || corpusType == CorpusType.KOST || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2) {
        if (corpusType == CorpusType.GIGAFIDA || corpusType == CorpusType.GOS || corpusType == CorpusType.SSJ500K || corpusType == CorpusType.GIGAFIDA2) {
            boolean corpusIsSplit = corpusFiles.size() > 1;

            final Task<HashSet<String>> task = new Task<HashSet<String>>() {

@@ -739,6 +740,7 @@ public class CorpusTab {

    private void selectReader() {
        switch (selectReader) {
            // "vert", "Solar", "GOS", "SSJ500K", "Gigafida", "Gigafida (old)", "Kres (old)"
            case "VERT + REGI":
                corpusType = VERT;
                break;

@@ -748,9 +750,6 @@ public class CorpusTab {
            case "XML (GOS 1.0)":
                corpusType = GOS;
                break;
            case "XML (KOST 2.0)":
                corpusType = KOST;
                break;
            case "XML (ssj500k 2.1)":
                corpusType = SSJ500K;
                break;

@@ -789,8 +788,6 @@ public class CorpusTab {
            corpusType = GOS;
        } else if (attrib.contains(SSJ500K.getNameLowerCase())) {
            corpusType = SSJ500K;
        } else if (attrib.contains(KOST.getNameLowerCase())) {
            corpusType = KOST;
        }

        if (corpusType == null) {
@@ -61,13 +61,23 @@ public final class I18N {
    public static String get(final String key, final Object... args) {
        ResourceBundle bundle = ResourceBundle.getBundle("message", getLocale());
        String val = bundle.getString(key);
        return MessageFormat.format(val, args);
        try {
            return MessageFormat.format(new String(val.getBytes("ISO-8859-1"), "UTF-8"), args);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        return val;
    }

    public static String getDefaultLocaleItem(final String key, final Object... args) {
        ResourceBundle bundle = ResourceBundle.getBundle("message", getDefaultLocale());
        String val = bundle.getString(key);
        return MessageFormat.format(val, args);
        try {
            return MessageFormat.format(new String(val.getBytes("ISO-8859-1"), "UTF-8"), args);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        return val;
    }

    public static ObservableList<String> getObject(final ArrayList<String> keys, final Object... args) {

@@ -76,7 +86,11 @@ public final class I18N {
        ArrayList<String> results = new ArrayList<>();
        for(String key : keys){
            String val = bundle.getString(key);
            results.add(val);
            try {
                results.add(MessageFormat.format(new String(val.getBytes("ISO-8859-1"), "UTF-8"), args));
            } catch (UnsupportedEncodingException e) {
                e.printStackTrace();
            }
        }

        return FXCollections.observableArrayList(results);

@@ -107,7 +121,12 @@ public final class I18N {
    public static String getIndependent(final String key, Locale locale, final Object... args) {
        ResourceBundle bundle = ResourceBundle.getBundle("message", locale);
        String val = bundle.getString(key);
        return MessageFormat.format(val, args);
        try {
            return MessageFormat.format(new String(val.getBytes("ISO-8859-1"), "UTF-8"), args);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        return val;
    }

    public static String getRootValue(String oldValue, ArrayList<String> nGramComputeForLetters) {
@@ -1,8 +0,0 @@
package gui;

public class Launcher {

    public static void main(String[] args) {
        GUIController.main(args);
    }
}
@@ -44,7 +44,7 @@ public class Messages {

    // Not properly to be here. TODO move somewhere else in future
    public static String HELP_URL = "http://slovnica.ijs.si/";
    public static String CJVT_URL = "http://hdl.handle.net/11356/1964";
    public static String CJVT_URL = "http://hdl.handle.net/11356/1227";
    public static String GITHUB_URL = "https://gitea.cjvt.si/lkrsnik/list";

    // helper maps
@@ -282,8 +282,8 @@ exportFileName.wordSets=word-sets
exportFileName.gram=-gram
exportFileName.skip=-skip

about.header=LIST – Corpus Extraction Tool\nVersion: 1.3 (Last update: 28 August 2024)\nAuthors: Luka Krsnik, Špela Arhar Holdt, Jaka Čibej, Kaja Dobrovoljc, Aleksander\n Ključevšek, Simon Krek, Marko Robnik Šikonja
about.description=The LIST corpus extraction tool is a program for extracting lists from text corpora on the\n levels of characters, word parts, words, and word sets. The program was developed within\n the New Grammar of Modern Standard Slovene: Resource and Methods project (J6-8256),\n the Empirical foundations for digitally-supported development of writing skills project (J7-3159)\n and the Language Resources and Technologies for Slovene programme (P6-0411), all\n financed by the Slovenian Research and Innovation Agency (ARIS).\n
about.header=LIST – Corpus Extraction Tool\nVersion: 1.2 (Last update: 18 November 2019)\nAuthors: Luka Krsnik, Špela Arhar Holdt, Jaka Čibej, Kaja Dobrovoljc, Aleksander\n Ključevšek, Simon Krek, Marko Robnik Šikonja
about.description=The LIST corpus extraction tool is a program for extracting lists from text corpora on the\n levels of characters, word parts, words, and word sets. The program was developed within\n the New Grammar of Modern Standard Slovene: Resource and Methods project (J6-8256),\n which was financially supported by the Slovenian Research Agency between 2017 and 2020.\n The authors acknowledge the financial support from the Slovenian Research Agency\n (research core funding No. P6-0411 Language Resources and Technologies for Slovene).\n
about.signature=Publisher: Centre for Language Resources and Technologies, University of Ljubljana,\nJožef Stefan Institute,\nFaculty of Computer and Information Science, University of Ljubljana
about.footer=Maintenance: Centre for Language Resources and Technologies, University of Ljubljana\nThe program is available under the Apache2 licence at CLARIN.si and GitHub.
about.links=Links:
@@ -282,8 +282,8 @@ exportFileName.wordSets=besedni-nizi
exportFileName.gram=-gram
exportFileName.skip=-preskok

about.header=LIST, korpusni luščilnik\nRazličica: 1.3 (Zadnja posodobitev: 28. november 2024)\nAvtorji: Luka Krsnik, Špela Arhar Holdt, Jaka Čibej, Kaja Dobrovoljc, Aleksander\n Ključevšek, Simon Krek, Marko Robnik Šikonja
about.description=Korpusni luščilnik LIST je program za luščenje spiskov iz besedilnih korpusov na nivojih\n znakov, besednih delov, besed in besednih nizov. Nastal je v okviru projektov Nova slovnica\n sodobne standardne slovenščine: viri in metode (J6-8256), Empirična podlaga za digitalno\n podprt razvoj pisne jezikovne zmožnosti (J7-3159) in raziskovalnega programa Jezikovni viri\n in tehnologije za slovenski jezik (št. P6-0411), ki jih financira Javna agencija za\n znanstvenoraziskovalno je sofinancirala Javna agencija za raziskovalno dejavnost\n Republike Slovenije iz državnega proračuna.
about.header=LIST, korpusni luščilnik\nRazličica: 1.2 (Zadnja posodobitev: 18. november 2019)\nAvtorji: Luka Krsnik, Špela Arhar Holdt, Jaka Čibej, Kaja Dobrovoljc, Aleksander\n Ključevšek, Simon Krek, Marko Robnik Šikonja
about.description=Korpusni luščilnik LIST je program za luščenje spiskov iz besedilnih korpusov na nivojih\n znakov, besednih delov, besed in besednih nizov. Nastal je v okviru projekta Nova slovnica\n sodobne standardne slovenščine: viri in metode (J6-8256), ki ga je med letoma 2017 in 2020\n sofinancirala Javna agencija za raziskovalno dejavnost Republike Slovenije iz državnega\n proračuna. Raziskovalni program Jezikovni viri in tehnologije za slovenski jezik (št. P6-0411)\n je sofinancirala Javna agencija za raziskovalno dejavnost Republike Slovenije iz državnega\n proračuna.
about.signature=Izdajatelj: Center za jezikovne vire in tehnologije Univerze v Ljubljani,\nInstitut "Jožef Stefan",\nFakulteta za računalništvo in informatiko Univerze v Ljubljani
about.footer=Vzdrževanje programa: Center za jezikovne vire in tehnologije Univerze v Ljubljani\nProgram je dostopen pod licenco Apache2 na repozitorijih CLARIN.si in GitHub.
about.links=Povezave:
@@ -13,7 +13,9 @@ public class CorpusTests {

    @Test
    public void solarTest() {
        File selectedDirectory = new File("/home/luka/Development/CJVT/list/src/main/resources/Gigafida_subset/");
        // File selectedDirectory = new File("/home/luka/Desktop/corpus-analyzer/src/main/resources/Solar");
        // File selectedDirectory = new File("/home/andrej/Desktop/corpus-analyzer/src/main/resources/GOS");
        File selectedDirectory = new File("/home/luka/Development/corpus-analyzer2/src/main/resources/Gigafida_subset/");

        Settings.resultsFilePath = new File(selectedDirectory.getAbsolutePath().concat(File.separator));

@@ -21,7 +23,20 @@ public class CorpusTests {

        File f = Settings.corpus.iterator().next();

        // Statistics stats = new Statistics(AnalysisLevel.STRING_LEVEL, 2, 0, CalculateFor.WORD);
        // // stats.setCorpusType(CorpusType.GOS);
        // stats.setCorpusType(CorpusType.SOLAR);

        // XML_processing.readXMLGos(f.toString(), stats);
        // XML_processing.readXML(f.toString(), stats);
        // XML_processing.readXMLHeaderTag(f.toString(), "stats");

    }

    // @Test
    // public void test() {
    //     ObservableList<String> var = GosTaxonomy.getForComboBox();
    //     String debug = "";
    //
    // }
}