forked from kristjan/cjvt-valency
Compare commits
14 Commits
Author | SHA1 | Date |
---|---|---|
Luka | ec083a8d63 | 3 years ago |
Luka | 69c3521e4b | 3 years ago |
Luka | 75b015dcda | 3 years ago |
Luka | c18aaff11f | 3 years ago |
Luka | 34b776be11 | 3 years ago |
lkrsnik | 26bca0b083 | 3 years ago |
Luka | 2551a9c6a8 | 4 years ago |
Luka | 5cdc963c2d | 4 years ago |
Luka | ce1fb46b4e | 4 years ago |
Luka | 220529b777 | 4 years ago |
Luka | ae5f2869bc | 4 years ago |
Luka | 931b3531b3 | 4 years ago |
Luka | 3d91251905 | 4 years ago |
Luka | c803057164 | 5 years ago |
@@ -1,3 +1,6 @@
 [submodule "src/pkg/cjvt-corpusparser"]
 	path = src/pkg/cjvt-corpusparser
 	url = git@gitea.cjvt.si:kristjan/cjvt-corpusparser.git
+[submodule "src/pkg/luscenje_struktur"]
+	path = src/pkg/luscenje_struktur
+	url = https://gitea.cjvt.si/ozbolt/luscenje_struktur.git
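Not part of the diff: existing clones need to fetch the submodule this change adds. A minimal sketch of the equivalent call from Python, assuming `git` is on PATH and the working directory is the repository root:

```python
# Fetch the newly added submodule in an existing clone.
import subprocess

subprocess.run(
    ["git", "submodule", "update", "--init", "src/pkg/luscenje_struktur"],
    check=True,
)
```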
@@ -1 +0,0 @@
-/home/kristjan/workdir/final_json/
@@ -1 +0,0 @@
-/home/kristjan/kres_mount/kres_parsed/tei/
Binary file not shown.
@@ -1 +0,0 @@
-/home/kristjan/git/diploma/data/ssj500k-sl.TEI/ssj500k-sl.body.xml
Binary file not shown.
File diff suppressed because one or more lines are too long
@@ -1,5 +1,5 @@
-FROM mongo:latest
+FROM mongo:4.2.9
 
 WORKDIR /
-COPY init_inside_container.sh /.
-COPY create.js /.
+COPY init_inside_mongo_container.sh /.
+COPY create_mongo.js /.
@@ -0,0 +1,2 @@
+#!/bin/bash
+docker ps | grep postgres | awk '{print $1}'
@@ -1,3 +0,0 @@
-#!/bin/bash
-
-mongo admin < /create.js
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+mongo admin < /create_mongo.js
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+/scripts/create_postgres.js
@@ -1,26 +0,0 @@
-version: '3.1'
-
-services:
-
-  my_mongo:
-    image: my-mongo
-    restart: always
-    ports:
-      - 27017:27017
-    environment:
-      MONGO_INITDB_ROOT_USERNAME: ${DB_ADM_USER}
-      MONGO_INITDB_ROOT_PASSWORD: ${DB_ADM_PASS}
-    volumes:
-      - ${HOME}/mongo_container/data/:/data/db
-
-  mongo_express:
-    image: mongo-express
-    restart: always
-    ports:
-      - 8087:8081
-    environment:
-      ME_CONFIG_BASICAUTH_USERNAME: ${MONGOEXPRESS_USER}
-      ME_CONFIG_BASICAUTH_PASSWORD: ${MONGOEXPRESS_PASS}
-      ME_CONFIG_MONGODB_ADMINUSERNAME: ${DB_ADM_USER}
-      ME_CONFIG_MONGODB_ADMINPASSWORD: ${DB_ADM_PASS}
-      ME_CONFIG_MONGODB_SERVER: my_mongo
@@ -0,0 +1,27 @@
+version: '3.1'
+
+services:
+
+  my_mongo:
+    image: my-mongo
+    restart: always
+    ports:
+      - 127.0.0.1:27017:27017
+    environment:
+      MONGO_INITDB_ROOT_USERNAME: ${DB_ADM_USER}
+      MONGO_INITDB_ROOT_PASSWORD: ${DB_ADM_PASS}
+    volumes:
+      - ${HOME}/valency_data/mongo_container/data/:/data/db
+      - ./:/scripts
+
+  my_postgres:
+    image: postgres
+    restart: always
+    ports:
+      - 127.0.0.1:5432:5432
+    environment:
+      POSTGRES_USER: ${DB_ADM_USER}
+      POSTGRES_PASSWORD: ${DB_ADM_PASS}
+    volumes:
+      - ${HOME}/valency_data/postgres_container/data/:/var/lib/postgresql/data
+      - ./:/scripts
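Not part of the diff: a minimal connectivity sketch for the two services the new compose file defines, assuming the stack is up (`docker-compose up -d`) and DB_ADM_USER/DB_ADM_PASS are exported in the calling shell. The client versions match the requirements list added below.

```python
# Sketch: reach the my_mongo and my_postgres services defined above.
import os

import psycopg2                   # psycopg2==2.8.4 in the requirements below
from pymongo import MongoClient   # pymongo==3.8.0 in the requirements below

user = os.environ["DB_ADM_USER"]
password = os.environ["DB_ADM_PASS"]

# Both ports are bound to 127.0.0.1 only in the new compose file.
mongo = MongoClient("mongodb://%s:%s@127.0.0.1:27017/" % (user, password))
print(mongo.admin.command("ping"))  # {'ok': 1.0} when Mongo is reachable

# The postgres image creates a database named after POSTGRES_USER when
# POSTGRES_DB is not set, so connecting without a dbname works here.
pg = psycopg2.connect(host="127.0.0.1", port=5432, user=user, password=password)
with pg.cursor() as cur:
    cur.execute("SELECT version()")
    print(cur.fetchone()[0])
pg.close()
```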
@@ -0,0 +1,37 @@
+asn1crypto==0.24.0
+beautifulsoup4==4.8.0
+bs4==0.0.1
+cffi==1.12.3
+Click==7.0
+cryptography==2.1.4
+Flask==1.1.1
+Flask-Cors==3.0.8
+Flask-PyMongo==2.3.0
+gunicorn==19.9.0
+idna==2.6
+itsdangerous==1.1.0
+Jinja2==2.10.1
+joblib==0.13.2
+keyring==10.6.0
+keyrings.alt==3.0
+lxml==4.4.0
+MarkupSafe==1.1.1
+numpy==1.17.0
+pandas==0.25.0
+pathlib==1.0.1
+psycopg2==2.8.4
+pycparser==2.19
+pycrypto==2.6.1
+pymongo==3.8.0
+python-dateutil==2.8.0
+pytz==2019.2
+pyxdg==0.25
+PyYAML==5.1.2
+scikit-learn==0.21.3
+scipy==1.3.0
+SecretStorage==2.3.1
+six==1.11.0
+sklearn==0.0
+soupsieve==1.9.3
+SQLAlchemy==1.3.12
+Werkzeug==0.15.5
File diff suppressed because it is too large
@@ -0,0 +1,189 @@
+import copy
+import csv
+from xml.etree import ElementTree
+import re
+import sys
+import logging
+import argparse
+import pickle
+import time
+import gc
+import subprocess
+import concurrent.futures
+import tempfile
+
+
+def read_gigafida(path):
+    words = {}
+    with open(path) as tsvfile:
+        reader = csv.reader(tsvfile, delimiter='\t')
+        for row in reader:
+            words[row[0]] = int(row[2])
+    return words
+
+
+def read_sloleks(path):
+    words = set()
+    with open(path) as tsvfile:
+        reader = csv.reader(tsvfile, delimiter='\t')
+        for row in reader:
+            words.add(row[1])
+    return words
+
+
+def read_zele(path):
+    with open(path) as f:
+        content = f.readlines()
+    # fix content
+    content[0] = content[0][1:]
+    # a = content[2]
+    # a = content[2].split()
+    # a = content[2].split()[0].split('<IZT>')[1]
+    # a = content[2].split()[0].split('<IZT>')[1].split('</IZT>')[0]
+    content = [x.split()[0].split('<IZT>')[1].split('</IZT>')[0] for x in content]
+    # content = [x.split() for x in content]
+    return set(content)
+
+
+def read_wordlist(path):
+    with open(path) as f:
+        content = [line[:-1] for line in f.readlines()]
+    print(content[-1])
+    return set(content)
+
+
+def filter_gigafida(gigafida_raw, min_limit, max_limit):
+    return {word[0]: word[1] for word in gigafida_raw.items() if (word[0][-2:] == 'ti' or word[0][-2:] == 'či') and word[1] > min_limit and word[1] <= max_limit}
+
+
+def set_list_intersection(gigafida_filtered, sloleks):
+    intersection = {}
+    for word, num in gigafida_filtered.items():
+        if word in sloleks:
+            intersection[word] = num
+    return intersection
+
+
+def list_list_union(list1, list2):
+    union = copy.copy(list1)
+    for w, n in list2.items():
+        if w not in list1:
+            union[w] = list2[w]
+    return union
+
+
+def list_list_subtraction(list1, list2):
+    subtraction = {}
+    for w, n in list2.items():
+        # if w == 'dejati':
+        #     print('here')
+        if w not in list1:
+            subtraction[w] = n
+    return subtraction
+
+
+def set_set_subtraction(set1, set2):
+    subtraction = {}
+    for w in set2:
+        if w not in set1:
+            subtraction[w] = -1
+    return subtraction
+
+
+def create_document(list1, path):
+    with open(path, "w") as text_file:
+        for w, n in list1.items():
+            text_file.write("%s\t%d\n" % (w, n))
+
+
+def create_document_set(list1, path):
+    with open(path, "w") as text_file:
+        for w in sorted(list(list1)):
+            text_file.write("%s\n" % w)
+
+
+def gigafida_merge(sloleks, zele, gigafida_raw, giga_min, giga_max):
+    gigafida_filtered = filter_gigafida(gigafida_raw, giga_min, giga_max)
+    sloleks_gf_intersect = set_list_intersection(gigafida_filtered, sloleks)
+    gigafida_filtered1 = filter_gigafida(gigafida_raw, 1, sys.maxsize)
+    zele_gf_intersect = set_list_intersection(gigafida_filtered1, zele)
+    sloleks_zele_union = list_list_union(sloleks_gf_intersect, zele_gf_intersect)
+    sloleks_zele_subtraction = list_list_subtraction(sloleks_zele_union, gigafida_filtered)
+    return sloleks_zele_subtraction
+
+
+def main(args):
+    gigafida_raw = read_gigafida(args.gigafida_verb_list)
+    sloleks = read_sloleks(args.sloleks)
+    zele = read_zele(args.zele)
+    if args.wordlist is not None:
+        sloleks_wordlist = set()
+        # sloleks_wordlist = set()
+        for el in sloleks:
+            if el in gigafida_raw:
+                sloleks_wordlist.add(el)
+        filtered_wordlist = read_wordlist(args.wordlist)
+
+        # sloleks_wordlist = set()
+        for el in sloleks:
+            if el in gigafida_raw:
+                filtered_wordlist.add(el)
+
+        create_document_set(filtered_wordlist, 'wordlist.tsv')
+    # gigafida_merge(sloleks, zele, gigafida_raw, 3, sys.maxsize)
+    gigafida_filtered3 = filter_gigafida(gigafida_raw, 2, sys.maxsize)
+    sloleks_gf_intersect = set_list_intersection(gigafida_filtered3, sloleks)
+
+    nouns_sloleks_gf_intersect = sorted(sloleks_gf_intersect.items(), key=lambda x: x[1], reverse=True)
+    res = [el[0] for el in nouns_sloleks_gf_intersect]
+
+    gigafida_filtered1 = filter_gigafida(gigafida_raw, 0, sys.maxsize)
+    zele_gf_intersect = set_list_intersection(gigafida_filtered1, zele)
+    sloleks_zele_union = list_list_union(sloleks_gf_intersect, zele_gf_intersect)
+    sloleks_zele_subtraction = set_set_subtraction(sloleks, zele)
+    create_document(gigafida_filtered3, 'gigafida_3+.tsv')
+    # create_document(sloleks_gf_intersect, 'gigafida_3+-sloleks-presek.tsv')
+    create_document(sloleks_zele_union, 'gigafida_3+-sloleks_zele-presek.tsv')
+    create_document(sloleks_zele_subtraction, 'sloleks-zele-razlika.tsv')
+
+    # gigafida_filtered = filter_gigafida(gigafida_raw, 10, sys.maxsize)
+    # sloleks_zele_subtraction = list_list_subtraction(sloleks_zele_union, gigafida_filtered)
+    gigafida_10 = gigafida_merge(sloleks, zele, gigafida_raw, 10, sys.maxsize)
+    create_document(gigafida_10, 'gigafida_10+-sloleks_zele-razlika.tsv')
+
+    # gigafida_filtered = filter_gigafida(gigafida_raw, 3, 10)
+    # sloleks_zele_subtraction = list_list_subtraction(sloleks_zele_union, gigafida_filtered)
+    gigafida_3_10 = gigafida_merge(sloleks, zele, gigafida_raw, 2, 10)
+    create_document(gigafida_3_10, 'gigafida_3-10-sloleks_zele-razlika.tsv')
+    # pass
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='Extract keywords from multiple lists.')
+    parser.add_argument('gigafida_verb_list',
+                        help='Path to gigafida list of verbs in tsv format.')
+    parser.add_argument('sloleks',
+                        help='Path to Sloleks in tsv format.')
+    parser.add_argument('--zele',
+                        help='Path to zele valency dictionary.')
+    parser.add_argument('--wordlist', default=None,
+                        help='Path to filtered wordlist.')
+    parser.add_argument('--handchecked_words', default=None,
+                        help='Path to handchecked words.')
+    # parser.add_argument('--min_limit',
+    #                     help='Limit min number of ocurrences',
+    #                     type=int, default=0)
+    # parser.add_argument('--max_limit',
+    #                     help='Limit max number of ocurrences',
+    #                     type=int, default=sys.maxsize)
+    parser.add_argument('--verbose', help='Enable verbose output to stderr',
+                        choices=["warning", "info", "debug"], default="info",
+                        const="info", nargs='?')
+
+    args = parser.parse_args()
+    logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())
+
+    start = time.time()
+    main(args)
+    logging.info("TIME: {}".format(time.time() - start))
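Not part of the commit: a toy run of the filtering rule in `filter_gigafida` above, which keeps only lemmas with the Slovenian infinitive endings -ti/-či whose frequency is strictly above `min_limit` and at most `max_limit`:

```python
# Made-up lemmas and counts, passed through the same condition
# as filter_gigafida in the script above.
gigafida_raw = {'delati': 120, 'reči': 40, 'miza': 500, 'iti': 2}

min_limit, max_limit = 2, 10**6
kept = {w: n for w, n in gigafida_raw.items()
        if (w[-2:] == 'ti' or w[-2:] == 'či') and min_limit < n <= max_limit}
print(kept)
# -> {'delati': 120, 'reči': 40}
# 'miza' fails the ending test; 'iti' is not strictly above min_limit.
```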
@@ -0,0 +1,117 @@
+import argparse
+import csv
+import os
+
+from lxml import etree, objectify, html
+
+
+def write_general_statistics(path, out_list):
+    if len(out_list) == 0:
+        return
+    with open(path, 'w') as csvfile:
+        writer = csv.writer(csvfile, delimiter='\t',
+                            quotechar='"')
+        writer.writerow(['Semantic role', 'Valency pattern ratio', 'Valency sentence ratio'])
+        for line in out_list:
+            writer.writerow(line)
+
+
+def write_statistics(path, out_list):
+    if len(out_list) == 0:
+        return
+    with open(path, 'w') as csvfile:
+        writer = csv.writer(csvfile, delimiter='\t',
+                            quotechar='"')
+        writer.writerow(['Valency pattern id', 'Frequency all GF', 'Semantic role', 'Pattern representation', 'Corpus example'])
+        for line in out_list:
+            writer.writerow(line)
+
+
+def main(args):
+    for file in sorted(os.listdir(args.input)):
+        path = os.path.join(args.input, file)
+        tree = etree.parse(path)
+        gf_output = []
+        ssj_output = []
+        head = next(tree.iter('head'))
+        headword = head.find('headword').find('lemma').text
+        #for div in root.iterfind('.//div'):
+        for elem in tree.iter('statisticsContainer'):
+            # for element in tree.iterfind('statisticsContainer'):
+            # for element in tree.find('statisticsContainer'):
+            semRole = elem.find('semanticRole').text
+            gf_pattern = None
+            gf_sentence = None
+            ssj_pattern = None
+            ssj_sentence = None
+            measure = elem.find('measureList')
+            for el in measure:
+                if el.attrib['type'] == 'valency_pattern_ratio' and el.attrib['source'] == 'Gigafida 2.0':
+                    gf_pattern = el.text
+                if el.attrib['type'] == 'valency_sentence_ratio' and el.attrib['source'] == 'Gigafida 2.0':
+                    gf_sentence = el.text
+                if el.attrib['type'] == 'valency_pattern_ratio' and el.attrib['source'] == 'ssj500k 2.2':
+                    ssj_pattern = el.text
+                if el.attrib['type'] == 'valency_sentence_ratio' and el.attrib['source'] == 'ssj500k 2.2':
+                    ssj_sentence = el.text
+            if gf_pattern is not None and gf_sentence is not None:
+                gf_output.append([semRole, gf_pattern, gf_sentence])
+            if ssj_pattern is not None and ssj_sentence is not None:
+                ssj_output.append([semRole, ssj_pattern, ssj_sentence])
+
+        print(file)
+
+        analyze_output = []
+        for elem in tree.iter('valencyPattern'):
+            valency_pattern_id = elem.attrib['id']
+
+            # get frequency
+            measure = ''
+            for measure_el in elem.find('measureList').findall('measure'):
+                if measure_el.attrib['source'] == 'Gigafida 2.0':
+                    measure = measure_el.text
+
+            # get semantic roles
+            semantic_roles_list = []
+            for semantic_rol_con in elem.find('semanticRoleContainerList').findall('semanticRoleContainer'):
+                semantic_roles_list.append(semantic_rol_con.find('semanticRole').text)
+            semantic_roles = '_'.join(semantic_roles_list)
+
+            # pattern representation
+            pattern_representation = elem.find('patternRepresentation').text
+
+            # corpus example
+            if elem.find('exampleContainerList') is not None and elem.find('exampleContainerList').find('exampleContainer') is not None and elem.find('exampleContainerList').find('exampleContainer').find('corpusExample') is not None:
+                corpus_example_text = html.tostring(elem.find('exampleContainerList').find('exampleContainer').find('corpusExample'), encoding='unicode')
+
+            else:
+                continue
+
+            # ugly postprocessing to remove xmlns:xsi=... duh..
+            root = etree.fromstring(corpus_example_text)
+
+            # Remove namespace prefixes
+            for elem in root.getiterator():
+                elem.tag = etree.QName(elem).localname
+            # Remove unused namespace declarations
+            etree.cleanup_namespaces(root)
+
+            corpus_example = etree.tostring(root, encoding='unicode')
+
+            print(f"Valency pattern {valency_pattern_id}")
+
+
+            analyze_output.append([valency_pattern_id, measure, semantic_roles, pattern_representation, corpus_example])
+
+        write_general_statistics(os.path.join(args.output, headword + '_gf_stats.tsv'), gf_output)
+        write_general_statistics(os.path.join(args.output, headword + '_ssj_stats.tsv'), ssj_output)
+        write_statistics(os.path.join(args.output, headword + '_patterns.tsv'), analyze_output)
+
+
+if __name__ == '__main__':
+    arg_parser = argparse.ArgumentParser(description='Export and validate collocation data from DDD database.')
+    arg_parser.add_argument('--input', type=str, help='Input directory')
+    arg_parser.add_argument('--output', type=str, help='Output directory')
+
+    args = arg_parser.parse_args()
+
+    main(args)
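The "ugly postprocessing" step in the script above can be shown in isolation. A minimal sketch with an invented input fragment standing in for a `corpusExample` element: rewriting each tag to its local name leaves the namespace declarations unused, and `cleanup_namespaces` then removes them.

```python
from lxml import etree

# Invented namespaced fragment, standing in for a <corpusExample>.
xml = '<ns:corpusExample xmlns:ns="http://example.org">Zgled .</ns:corpusExample>'
root = etree.fromstring(xml)

# Same trick as above: rewrite tags to their local names...
for el in root.iter():
    el.tag = etree.QName(el).localname
# ...then drop the now-unused namespace declarations.
etree.cleanup_namespaces(root)

print(etree.tostring(root, encoding='unicode'))
# -> <corpusExample>Zgled .</corpusExample>
```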
@@ -0,0 +1 @@
+../src/pkg/valency/valency
@@ -0,0 +1,8 @@
+from lxml import etree as lxml
+
+with open('../data/inventory.xsd') as f:
+    xmlschema_doc = lxml.parse(f)
+xmlschema = lxml.XMLSchema(xmlschema_doc)
+with open('../data/xmls/output.xml') as op:
+    doc = lxml.parse(op)
+print(xmlschema.validate(doc))
@@ -0,0 +1,106 @@
+import argparse
+import json
+
+from flask import Flask
+from flask_pymongo import PyMongo
+from pathlib import Path
+
+app = Flask(__name__)
+
+app.config.from_object("db_config")
+mongo = PyMongo(app)
+
+app.config["BANNED_HEADWORDS"] = ["biti"]
+
+def _is_banned(hw):
+    banned = True
+    if hw in app.config["BANNED_HEADWORDS"]:
+        banned = True
+    elif hw in sskj_wordlist["wordlist"]:
+        banned = False
+    elif (hw + " se") in sskj_wordlist["wordlist"]:
+        banned = False
+    return banned
+
+
+def prepare_app_index(appindex_json, corporas, previous_json=None):
+    if previous_json:
+        with Path(previous_json).open("r") as fp:
+            tmp_app_index = json.load(fp)
+    else:
+        tmp_app_index = {}
+    # create app_index (used in frontend, left side word index)
+    for c in corporas:
+        tmp_app_index[c] = {}
+
+    for corpus in corporas:
+        res_hws = {}
+        res_fns = {}
+
+        # print('CORPUS...!!...')
+        # print(corpus)
+        # a = mongo.db[corpus]
+        # print('TEST_OK')
+        # print(a)
+        # print(mongo.db)
+        # a = mongo.db.list_collection_names()
+        # print('TEST_OK2')
+        nentries = mongo.db[corpus].count()
+        idx = 0
+        for e in mongo.db[corpus].find({}):
+            if "headwords" not in e:
+                continue
+            for hw in e["headwords"]:
+                if hw in res_hws:
+                    res_hws[hw] += 1
+                else:
+                    res_hws[hw] = 1
+            if "functors" not in e:
+                continue
+            for fn in e["functors"]:
+                if fn in res_fns:
+                    res_fns[fn] += 1
+                else:
+                    res_fns[fn] = 1
+            idx += 1
+            if idx % 10000 == 0:
+                print("indexing {}: {}/{}".format(
+                    corpus, idx, nentries))
+
+        alphabetical = {}
+        for k, e in res_hws.items():
+            fst = k[0].lower()
+            if fst in alphabetical:
+                alphabetical[fst].append((k, e))
+            else:
+                alphabetical[fst] = [(k, e)]
+
+        for letter, words in alphabetical.items():
+            filtered_words = [x for x in words if not _is_banned(x[0])]
+            # filtered_words = [x for x in words]
+            alphabetical[letter] = sorted(filtered_words, key=lambda x: x[0])
+
+        tmp_app_index[corpus]["words"] = alphabetical
+
+
+        functors = [(k, e) for (k, e) in res_fns.items()]
+        functors = sorted(functors, key=lambda x: x[0])
+        tmp_app_index[corpus]["functors"] = functors
+
+    with Path(appindex_json).open("w") as fp:
+        json.dump(tmp_app_index, fp)
+
+if __name__ == "__main__":
+    print("Starting app.py main()")
+    aparser = argparse.ArgumentParser(description="Arguments for app.py")
+    aparser.add_argument("--previous-json", type=str, default=None)
+    aparser.add_argument("--appindex-json", type=str)
+    aparser.add_argument("--sskj-wordlist", type=str)
+    args = aparser.parse_args()
+
+    corporas = ['gigafida']
+
+    with Path(args.sskj_wordlist).open("r") as fp:
+        sskj_wordlist = json.load(fp)
+
+    prepare_app_index(args.appindex_json, corporas, args.previous_json)
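The index that `prepare_app_index` writes has a fixed shape that the frontend word index consumes; a hand-built illustration with made-up counts (not part of the commit):

```python
# Illustrative shape of the appindex JSON written above (values invented).
app_index = {
    "gigafida": {
        # headwords grouped by first letter, as (headword, entry count),
        # sorted alphabetically and filtered through _is_banned
        "words": {
            "d": [("delati", 42)],
            "r": [("reči", 17)],
        },
        # functors as (functor, entry count), sorted by name
        "functors": [("ACT", 120), ("PAT", 95)],
    }
}
```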
@@ -1,2 +1,2 @@
-MONGO_URI = "mongodb://sizif:p5e3r4u8t7@my_mongo:27017/valdb"
+MONGO_URI = "mongodb://user:user@0.0.0.0:27017/valdb"
 MONGO_AUTH_SOURCE = 'admin'
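For context (not part of the diff): Flask-PyMongo, pinned at 2.3.0 in the requirements above, reads `MONGO_URI` from the Flask config, which is how the app script in this compare consumes this module via `app.config.from_object("db_config")`. A minimal sketch:

```python
# Sketch: how MONGO_URI from db_config is consumed (Flask-PyMongo 2.x).
from flask import Flask
from flask_pymongo import PyMongo

app = Flask(__name__)
# Equivalent to app.config.from_object("db_config") in the app script above.
app.config["MONGO_URI"] = "mongodb://user:user@0.0.0.0:27017/valdb"
mongo = PyMongo(app)  # mongo.db is then the `valdb` database
```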
@@ -0,0 +1,18 @@
+import json
+import os
+
+input_dir = "/media/luka/Portable Disk/Datasets/gigafida_jos/final_json"
+output_file = "../../all_sentences.json"
+
+results = {}
+filenames = os.listdir(input_dir)
+len(filenames)
+for i, filename in enumerate(filenames):
+    if filename.endswith(".json"):
+        with open(os.path.join(input_dir, filename)) as json_file:
+            data = json.load(json_file)
+            results[filename.split('-')[0]] = list(data.keys())
+    print('Progress: %.2f %%' % (i/len(filenames)))
+
+with open(output_file, 'w') as f:
+    json.dump(results, f)
@@ -1,3 +1,3 @@
 {
-"api_addr": "http://193.2.76.103:8084"
+"api_addr": "http://0.0.0.0:8084"
 }
@@ -1 +1 @@
-Subproject commit 01adf47b9b63b43f86bff52429792b0de2327ddd
+Subproject commit 92b3ac4ea3a73b93c25b363b5b9cb096d4d011cd
@@ -0,0 +1 @@
+Subproject commit 8c87d07b8a3ca73faac2fac30c39969bc5f97d45