forked from kristjan/cjvt-valency
First commit on scripts branch
This commit is contained in:
1643
scripts/create_xml.py
Normal file
1643
scripts/create_xml.py
Normal file
File diff suppressed because it is too large
Load Diff
189
scripts/extract_keywords.py
Normal file
189
scripts/extract_keywords.py
Normal file
@@ -0,0 +1,189 @@
|
||||
import copy
|
||||
import csv
|
||||
from xml.etree import ElementTree
|
||||
import re
|
||||
import sys
|
||||
import logging
|
||||
import argparse
|
||||
import pickle
|
||||
import time
|
||||
import gc
|
||||
import subprocess
|
||||
import concurrent.futures
|
||||
import tempfile
|
||||
|
||||
|
||||
def read_gigafida(path):
    """Read the GigaFida verb-frequency list into a dict.

    Each TSV row is expected to carry the lemma in column 0 and its
    corpus frequency in column 2 (column 1 is ignored).

    :param path: path to the GigaFida TSV file.
    :return: dict mapping lemma (str) -> frequency (int).
    """
    words = {}
    # newline='' is required by the csv module so that embedded newlines
    # inside quoted fields are parsed correctly (see csv docs).
    with open(path, newline='') as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        for row in reader:
            words[row[0]] = int(row[2])
    return words
|
||||
|
||||
|
||||
def read_sloleks(path):
    """Read a Sloleks TSV dump and return the set of lemmas.

    The lemma is taken from the second column of every row.

    :param path: path to the Sloleks TSV file.
    :return: set of lemma strings.
    """
    with open(path) as tsvfile:
        return {row[1] for row in csv.reader(tsvfile, delimiter='\t')}
|
||||
|
||||
|
||||
def read_zele(path):
    """Parse a Zele valency-dictionary dump and return its headwords.

    Every line is expected to hold an ``<IZT>headword</IZT>`` tag inside
    its first whitespace-separated token.  The very first character of
    the first line is dropped before parsing (presumably a BOM or file
    marker — TODO confirm against the actual dump).

    :param path: path to the Zele dump file.
    :return: set of headword strings.
    """
    with open(path) as f:
        lines = f.readlines()
    # Strip the stray leading character of the first line.
    lines[0] = lines[0][1:]
    headwords = set()
    for line in lines:
        token = line.split()[0]
        headwords.add(token.split('<IZT>')[1].split('</IZT>')[0])
    return headwords
|
||||
|
||||
|
||||
def read_wordlist(path):
    """Read a plain word list (one word per line) into a set.

    :param path: path to the word-list file.
    :return: set of words with trailing newlines removed.
    """
    with open(path) as f:
        # rstrip('\n') instead of line[:-1]: the original chopped one
        # character off the final line even when it had no trailing
        # newline, corrupting that word.  The debug print of the last
        # entry is removed as well.
        return {line.rstrip('\n') for line in f}
|
||||
|
||||
|
||||
def filter_gigafida(gigafida_raw, min_limit, max_limit):
    """Select infinitive-looking lemmas within a frequency band.

    Keeps entries whose lemma ends in 'ti' or 'či' (Slovene infinitive
    endings) and whose count lies in the half-open range
    (min_limit, max_limit].

    :param gigafida_raw: dict lemma -> frequency.
    :param min_limit: exclusive lower frequency bound.
    :param max_limit: inclusive upper frequency bound.
    :return: filtered dict lemma -> frequency.
    """
    filtered = {}
    for lemma, freq in gigafida_raw.items():
        if lemma.endswith(('ti', 'či')) and min_limit < freq <= max_limit:
            filtered[lemma] = freq
    return filtered
|
||||
|
||||
|
||||
def set_list_intersection(gigafida_filtered, sloleks):
    """Return the entries of *gigafida_filtered* whose lemma is in *sloleks*.

    :param gigafida_filtered: dict lemma -> frequency.
    :param sloleks: set (or other container) of lemmas to keep.
    :return: dict restricted to lemmas present in *sloleks*.
    """
    return {lemma: freq
            for lemma, freq in gigafida_filtered.items()
            if lemma in sloleks}
|
||||
|
||||
|
||||
def list_list_union(list1, list2):
    """Merge two lemma->count dicts; on a key clash *list1* wins.

    Order of the result: keys of *list1* first (original order), then
    the keys unique to *list2* in their original order.

    :return: new merged dict; neither input is modified.
    """
    union = dict(list1)
    for lemma, count in list2.items():
        # setdefault only inserts lemmas not already present, so values
        # from list1 take precedence, as in the original implementation.
        union.setdefault(lemma, count)
    return union
|
||||
|
||||
|
||||
def list_list_subtraction(list1, list2):
    """Return the entries of *list2* whose key does NOT appear in *list1*.

    Note the argument order: this computes list2 - list1 (by keys).

    :return: new dict with list2's values for the surviving keys.
    """
    return {lemma: count
            for lemma, count in list2.items()
            if lemma not in list1}
|
||||
|
||||
|
||||
def set_set_subtraction(set1, set2):
    """Map each member of *set2* that is missing from *set1* to -1.

    Note the argument order: this computes set2 - set1.  The -1 value is
    a placeholder so the result can be written by create_document, which
    expects a lemma->count dict.

    :return: dict word -> -1 for every word in set2 but not in set1.
    """
    return {word: -1 for word in set2 if word not in set1}
|
||||
|
||||
|
||||
def create_document(list1, path):
    """Write a lemma->count mapping to *path* as tab-separated lines.

    One "lemma<TAB>count" line per entry, in the dict's iteration order.

    :param list1: dict lemma -> int count.
    :param path: output file path (overwritten).
    """
    with open(path, "w") as out:
        out.writelines("%s\t%d\n" % entry for entry in list1.items())
|
||||
|
||||
|
||||
def create_document_set(list1, path):
    """Write the members of *list1* to *path*, one per line, sorted.

    :param list1: iterable of strings (typically a set).
    :param path: output file path (overwritten).
    """
    with open(path, "w") as out:
        for word in sorted(list1):
            out.write(word + "\n")
|
||||
|
||||
|
||||
def gigafida_merge(sloleks, zele, gigafida_raw, giga_min, giga_max):
    """Find GigaFida verbs in a frequency band that no dictionary knows.

    Builds the set of verbs attested in Sloleks (within the requested
    band) or in Zele (at any frequency > 1), then returns the entries of
    the band that are missing from that combined reference set.

    :param sloleks: set of Sloleks lemmas.
    :param zele: set of Zele headwords.
    :param gigafida_raw: dict lemma -> frequency from GigaFida.
    :param giga_min: exclusive lower frequency bound.
    :param giga_max: inclusive upper frequency bound.
    :return: dict lemma -> frequency of uncovered verbs.
    """
    in_range = filter_gigafida(gigafida_raw, giga_min, giga_max)
    known_sloleks = set_list_intersection(in_range, sloleks)
    attested = filter_gigafida(gigafida_raw, 1, sys.maxsize)
    known_zele = set_list_intersection(attested, zele)
    known = list_list_union(known_sloleks, known_zele)
    return list_list_subtraction(known, in_range)
|
||||
|
||||
|
||||
def main(args):
    """Cross-reference GigaFida verb frequencies with the Sloleks and
    Zele dictionaries and write the resulting TSV reports to the
    current working directory.

    :param args: parsed CLI namespace; reads gigafida_verb_list,
        sloleks, zele and wordlist.
        NOTE(review): --zele is declared optional by the CLI, but
        read_zele(args.zele) is called unconditionally, so the script
        crashes when --zele is omitted — confirm intended usage.
    """
    gigafida_raw = read_gigafida(args.gigafida_verb_list)
    sloleks = read_sloleks(args.sloleks)
    zele = read_zele(args.zele)
    if args.wordlist is not None:
        # NOTE(review): sloleks_wordlist is filled here but never read
        # afterwards — the second, identical loop below adds the same
        # elements to filtered_wordlist instead.  Looks like dead code;
        # confirm before removing.
        sloleks_wordlist = set()
        # sloleks_wordlist = set()
        for el in sloleks:
            if el in gigafida_raw:
                sloleks_wordlist.add(el)
        filtered_wordlist = read_wordlist(args.wordlist)

        # sloleks_wordlist = set()
        # Extend the hand-filtered word list with every Sloleks lemma
        # that is attested in GigaFida.
        for el in sloleks:
            if el in gigafida_raw:
                filtered_wordlist.add(el)

        create_document_set(filtered_wordlist, 'wordlist.tsv')
    # gigafida_merge(sloleks, zele, gigafida_raw, 3, sys.maxsize)
    # Verbs seen at least 3 times (filter keeps count > 2).
    gigafida_filtered3 = filter_gigafida(gigafida_raw, 2, sys.maxsize)
    sloleks_gf_intersect = set_list_intersection(gigafida_filtered3, sloleks)

    # NOTE(review): nouns_sloleks_gf_intersect/res are computed but
    # never used — presumably a debug leftover; confirm before removing.
    nouns_sloleks_gf_intersect = sorted(sloleks_gf_intersect.items(), key=lambda x: x[1], reverse=True)
    res = [el[0] for el in nouns_sloleks_gf_intersect]

    gigafida_filtered1 = filter_gigafida(gigafida_raw, 0, sys.maxsize)
    zele_gf_intersect = set_list_intersection(gigafida_filtered1, zele)
    sloleks_zele_union = list_list_union(sloleks_gf_intersect, zele_gf_intersect)
    sloleks_zele_subtraction = set_set_subtraction(sloleks, zele)
    create_document(gigafida_filtered3, 'gigafida_3+.tsv')
    # create_document(sloleks_gf_intersect, 'gigafida_3+-sloleks-presek.tsv')
    create_document(sloleks_zele_union, 'gigafida_3+-sloleks_zele-presek.tsv')
    create_document(sloleks_zele_subtraction, 'sloleks-zele-razlika.tsv')

    # gigafida_filtered = filter_gigafida(gigafida_raw, 10, sys.maxsize)
    # sloleks_zele_subtraction = list_list_subtraction(sloleks_zele_union, gigafida_filtered)
    # Verbs frequent in GigaFida (count > 10) but covered by neither
    # Sloleks nor Zele.
    gigafida_10 = gigafida_merge(sloleks, zele, gigafida_raw, 10, sys.maxsize)
    create_document(gigafida_10, 'gigafida_10+-sloleks_zele-razlika.tsv')

    # gigafida_filtered = filter_gigafida(gigafida_raw, 3, 10)
    # sloleks_zele_subtraction = list_list_subtraction(sloleks_zele_union, gigafida_filtered)
    # Mid-frequency band (counts 3..10).
    gigafida_3_10 = gigafida_merge(sloleks, zele, gigafida_raw, 2, 10)
    create_document(gigafida_3_10, 'gigafida_3-10-sloleks_zele-razlika.tsv')
    # pass
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # CLI: two positional corpus inputs plus optional auxiliary lists.
    arg_parser = argparse.ArgumentParser(
        description='Extract keywords from multiple lists.')
    arg_parser.add_argument('gigafida_verb_list',
                            help='Path to gigafida list of verbs in tsv format.')
    arg_parser.add_argument('sloleks',
                            help='Path to Sloleks in tsv format.')
    arg_parser.add_argument('--zele',
                            help='Path to zele valency dictionary.')
    arg_parser.add_argument('--wordlist', default=None,
                            help='Path to filtered wordlist.')
    arg_parser.add_argument('--handchecked_words', default=None,
                            help='Path to handchecked words.')
    arg_parser.add_argument('--verbose', help='Enable verbose output to stderr',
                            choices=["warning", "info", "debug"], default="info",
                            const="info", nargs='?')

    args = arg_parser.parse_args()
    # Route log output to stderr at the requested level.
    logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())

    # Time the whole run and report the elapsed seconds via logging.
    start = time.time()
    main(args)
    logging.info("TIME: {}".format(time.time() - start))
|
||||
8
scripts/xsd_checker.py
Normal file
8
scripts/xsd_checker.py
Normal file
@@ -0,0 +1,8 @@
|
||||
"""Validate ../data/xmls/output.xml against the ../data/inventory.xsd schema."""
from lxml import etree as lxml

# Pass filenames to lxml.parse instead of text-mode file objects: lxml
# can refuse already-decoded (unicode) input when the XML carries an
# encoding declaration, and letting lxml open the file avoids that
# pitfall entirely.
xmlschema_doc = lxml.parse('../data/inventory.xsd')
xmlschema = lxml.XMLSchema(xmlschema_doc)

doc = lxml.parse('../data/xmls/output.xml')
# Prints True if the document conforms to the schema, else False.
print(xmlschema.validate(doc))
|
||||
Reference in New Issue
Block a user