cjvt-valency/scripts/extract_keywords.py

189 lines
6.6 KiB
Python

import copy
import csv
from xml.etree import ElementTree
import re
import sys
import logging
import argparse
import pickle
import time
import gc
import subprocess
import concurrent.futures
import tempfile
def read_gigafida(path):
    """Read a tab-separated frequency list into a dictionary.

    Args:
        path: Path to a TSV file. Column 0 is used as the word and
            column 2 is parsed as an integer count.

    Returns:
        dict mapping each word to its integer count. If a word appears on
        several rows, the value from the last such row is kept.

    Raises:
        IndexError: If a non-empty row has fewer than three columns.
        ValueError: If column 2 of a row is not an integer.
    """
    # Decode explicitly as UTF-8 instead of the platform default so that
    # non-ASCII words are read identically on every system. newline='' lets
    # the csv module do its own newline handling.
    with open(path, encoding='utf-8', newline='') as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        # csv.reader yields an empty list for blank lines; skip those
        # instead of failing on row[0].
        return {row[0]: int(row[2]) for row in reader if row}
def read_sloleks(path):
    """Read the second column of a tab-separated file into a set.

    Args:
        path: Path to a TSV file. Column 1 of every row is collected.

    Returns:
        set of the distinct values found in column 1.

    Raises:
        IndexError: If a non-empty row has fewer than two columns.
    """
    # Explicit UTF-8 keeps non-ASCII entries intact regardless of the
    # platform's default encoding; newline='' is what the csv module expects.
    with open(path, encoding='utf-8', newline='') as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        # Blank lines come back as empty rows; ignore them.
        return {row[1] for row in reader if row}
def read_zele(path):
    """Collect the ``<IZT>...</IZT>`` entries from a line-oriented file.

    For every non-blank line, the first whitespace-separated token is taken
    and the text between ``<IZT>`` and ``</IZT>`` inside it is extracted.

    Args:
        path: Path to the file to read.

    Returns:
        set of the extracted strings; an empty set for an empty file.

    Raises:
        IndexError: If the first token of a non-blank line contains no
            ``<IZT>`` tag.
    """
    with open(path, encoding='utf-8') as f:
        content = f.readlines()

    # An empty file has no first line to fix up.
    if not content:
        return set()

    # The very first character of the file is discarded.
    # NOTE(review): presumably a byte-order mark (a UTF-8 BOM decodes to a
    # single character here) -- confirm against the actual input file.
    content[0] = content[0][1:]

    entries = set()
    for line in content:
        tokens = line.split()
        if not tokens:
            # Blank line (e.g. a trailing empty line): nothing to extract.
            continue
        entries.add(tokens[0].split('<IZT>')[1].split('</IZT>')[0])
    return entries
def read_wordlist(path):
    """Read a file with one entry per line into a set.

    Args:
        path: Path to the word list.

    Returns:
        set of the lines with their line terminator removed. Blank lines
        contribute an empty string, exactly as they appear in the file.
    """
    with open(path, encoding='utf-8') as f:
        # Strip only the newline. Slicing off the last character would
        # truncate the final word when the file lacks a trailing newline.
        content = [line.rstrip('\n') for line in f]
    if content:
        # Diagnostic only; guarded so that an empty file does not fail.
        logging.debug("Last wordlist entry: %s", content[-1])
    return set(content)
def filter_gigafida(gigafida_raw, min_limit, max_limit):
    """Keep words ending in 'ti' or 'či' whose count lies in a given range.

    Args:
        gigafida_raw: dict mapping word -> count.
        min_limit: Exclusive lower bound on the count.
        max_limit: Inclusive upper bound on the count.

    Returns:
        New dict with the entries of ``gigafida_raw`` whose word ends in
        'ti' or 'či' and whose count satisfies
        ``min_limit < count <= max_limit``. Input order is preserved.
    """
    return {
        word: count
        for word, count in gigafida_raw.items()
        if word.endswith(('ti', 'či')) and min_limit < count <= max_limit
    }
def set_list_intersection(gigafida_filtered, sloleks):
    """Restrict a word -> count dict to the words contained in a collection.

    Args:
        gigafida_filtered: dict mapping word -> count.
        sloleks: Container supporting ``in`` (a set in this script).

    Returns:
        New dict with the entries of ``gigafida_filtered`` whose key is in
        ``sloleks``, in the original order.
    """
    return {
        word: num
        for word, num in gigafida_filtered.items()
        if word in sloleks
    }
def list_list_union(list1, list2):
    """Merge two word -> count dicts, preferring the values of the first.

    Despite the parameter names, both arguments are dicts.

    Args:
        list1: dict whose entries are all kept unchanged.
        list2: dict whose entries are added only for keys absent from
            ``list1``.

    Returns:
        A shallow copy of ``list1`` extended with the missing entries of
        ``list2``. Neither input is modified.
    """
    union = copy.copy(list1)
    for w, n in list2.items():
        # setdefault leaves existing (list1) values untouched.
        union.setdefault(w, n)
    return union
def list_list_subtraction(list1, list2):
    """Return the entries of ``list2`` whose key does not occur in ``list1``.

    Note the operand order: the result is ``list2`` minus ``list1``.
    Despite the parameter names, both arguments are dicts.

    Args:
        list1: dict (or any container supporting ``in``) of keys to exclude.
        list2: dict mapping word -> count to be filtered.

    Returns:
        New dict with the entries of ``list2`` not present in ``list1``,
        in the order of ``list2``.
    """
    return {w: n for w, n in list2.items() if w not in list1}
def set_set_subtraction(set1, set2):
    """Return the members of ``set2`` missing from ``set1`` as a dict.

    Note the operand order: the result is ``set2`` minus ``set1``.

    Args:
        set1: Container of elements to exclude.
        set2: Iterable of candidate elements.

    Returns:
        dict mapping every element of ``set2`` that is not in ``set1`` to
        the placeholder value ``-1`` (no count is available for these
        entries).
    """
    return {w: -1 for w in set2 if w not in set1}
def create_document(list1, path):
    """Write a word -> count dict as a two-column TSV file.

    Each entry becomes one ``word<TAB>count`` line, in the iteration order
    of the dict. An existing file at ``path`` is overwritten.

    Args:
        list1: dict mapping word -> numeric count (despite the name).
        path: Destination file path.
    """
    # Write UTF-8 explicitly so non-ASCII words do not depend on (or fail
    # under) the platform's default encoding.
    with open(path, "w", encoding='utf-8') as text_file:
        text_file.writelines(
            "%s\t%d\n" % (w, n) for w, n in list1.items()
        )
def create_document_set(list1, path):
    """Write the elements of a collection to a file, one per line, sorted.

    An existing file at ``path`` is overwritten.

    Args:
        list1: Iterable of strings (a set in this script).
        path: Destination file path.
    """
    # Write UTF-8 explicitly so non-ASCII words do not depend on (or fail
    # under) the platform's default encoding.
    with open(path, "w", encoding='utf-8') as text_file:
        text_file.writelines("%s\n" % w for w in sorted(list1))
def gigafida_merge(sloleks, zele, gigafida_raw, giga_min, giga_max):
    """Return in-range frequency-list words not covered by either lexicon.

    Args:
        sloleks: Collection of words; matched against the in-range words.
        zele: Collection of words; matched against all words that pass
            ``filter_gigafida`` with a count greater than 1.
        gigafida_raw: dict mapping word -> count.
        giga_min: Exclusive lower count bound passed to ``filter_gigafida``.
        giga_max: Inclusive upper count bound passed to ``filter_gigafida``.

    Returns:
        dict with the entries of the in-range filtered list whose word is
        neither in ``sloleks`` nor among the ``zele`` matches.
    """
    in_range = filter_gigafida(gigafida_raw, giga_min, giga_max)
    above_one = filter_gigafida(gigafida_raw, 1, sys.maxsize)

    # Words already covered by one of the two lexicons.
    covered = list_list_union(
        set_list_intersection(in_range, sloleks),
        set_list_intersection(above_one, zele),
    )

    # list_list_subtraction(a, b) yields b minus a, i.e. in_range - covered.
    return list_list_subtraction(covered, in_range)
def main(args):
    """Read the input lists and write the derived TSV files.

    All output files are created in the current working directory:

    * ``wordlist.tsv`` (only when ``args.wordlist`` is given): the filtered
      word list extended with every Sloleks word that occurs in the
      frequency list, sorted.
    * ``gigafida_3+.tsv``: filtered frequency list with count > 2.
    * ``gigafida_3+-sloleks_zele-presek.tsv``: Sloleks words from the
      count > 2 list, plus zele words from the count > 0 list.
    * ``sloleks-zele-razlika.tsv``: zele entries missing from Sloleks
      (written with the placeholder count -1).
    * ``gigafida_10+-sloleks_zele-razlika.tsv``: result of
      ``gigafida_merge`` for count > 10.
    * ``gigafida_3-10-sloleks_zele-razlika.tsv``: result of
      ``gigafida_merge`` for 2 < count <= 10.

    Args:
        args: Parsed command-line namespace providing
            ``gigafida_verb_list``, ``sloleks``, ``zele`` and ``wordlist``.
    """
    gigafida_raw = read_gigafida(args.gigafida_verb_list)
    sloleks = read_sloleks(args.sloleks)
    zele = read_zele(args.zele)

    if args.wordlist is not None:
        filtered_wordlist = read_wordlist(args.wordlist)
        # Add every Sloleks word that also occurs in the frequency list.
        filtered_wordlist.update(el for el in sloleks if el in gigafida_raw)
        create_document_set(filtered_wordlist, 'wordlist.tsv')

    # Lower bounds are exclusive, so 2 means "3 or more occurrences".
    gigafida_filtered3 = filter_gigafida(gigafida_raw, 2, sys.maxsize)
    sloleks_gf_intersect = set_list_intersection(gigafida_filtered3, sloleks)

    gigafida_filtered1 = filter_gigafida(gigafida_raw, 0, sys.maxsize)
    zele_gf_intersect = set_list_intersection(gigafida_filtered1, zele)

    sloleks_zele_union = list_list_union(sloleks_gf_intersect, zele_gf_intersect)
    sloleks_zele_subtraction = set_set_subtraction(sloleks, zele)

    create_document(gigafida_filtered3, 'gigafida_3+.tsv')
    create_document(sloleks_zele_union, 'gigafida_3+-sloleks_zele-presek.tsv')
    create_document(sloleks_zele_subtraction, 'sloleks-zele-razlika.tsv')

    gigafida_10 = gigafida_merge(sloleks, zele, gigafida_raw, 10, sys.maxsize)
    create_document(gigafida_10, 'gigafida_10+-sloleks_zele-razlika.tsv')

    gigafida_3_10 = gigafida_merge(sloleks, zele, gigafida_raw, 2, 10)
    create_document(gigafida_3_10, 'gigafida_3-10-sloleks_zele-razlika.tsv')
if __name__ == '__main__':
    # Command-line entry point: parse arguments, configure logging, run
    # main() and log the elapsed wall-clock time.
    parser = argparse.ArgumentParser(
        description='Extract keywords from multiple lists.')
    parser.add_argument('gigafida_verb_list',
                        help='Path to gigafida list of verbs in tsv format.')
    parser.add_argument('sloleks',
                        help='Path to Sloleks in tsv format.')
    # main() always reads this file, so it must be supplied; without
    # required=True a missing option surfaced as a TypeError from open(None).
    parser.add_argument('--zele', required=True,
                        help='Path to zele valency dictionary.')
    parser.add_argument('--wordlist', default=None,
                        help='Path to filtered wordlist.')
    parser.add_argument('--handchecked_words', default=None,
                        help='Path to handchecked words.')
    parser.add_argument('--verbose', help='Enable verbose output to stderr',
                        choices=["warning", "info", "debug"], default="info",
                        const="info", nargs='?')
    args = parser.parse_args()

    logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())

    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments.
    start = time.perf_counter()
    main(args)
    logging.info("TIME: %s", time.perf_counter() - start)