"""Merge and filter Slovene verb lists from multiple lexical resources.

Reads a Gigafida verb-frequency list, the Sloleks lexicon, and the Zele
valency dictionary, then writes several TSV files with intersections,
unions and differences of those lists.
"""
import argparse
import concurrent.futures
import copy
import csv
import gc
import logging
import pickle
import re
import subprocess
import sys
import tempfile
import time
from xml.etree import ElementTree


def read_gigafida(path):
    """Read a Gigafida TSV list and return {lemma: frequency}.

    Expects at least three tab-separated columns per row: the lemma in
    column 0 and an integer frequency in column 2.
    """
    words = {}
    with open(path) as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        for row in reader:
            words[row[0]] = int(row[2])
    return words


def read_sloleks(path):
    """Read a Sloleks TSV export and return the set of lemmas (column 1)."""
    words = set()
    with open(path) as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        for row in reader:
            words.add(row[1])
    return words


def read_zele(path):
    """Read the Zele valency dictionary and return the set of headwords.

    NOTE(review): ``.split('')`` raises ``ValueError`` (empty separator) at
    runtime — the separator characters were almost certainly non-ASCII
    delimiters lost in transcoding (the pattern looks like the classic
    ``split(open_delim)[1].split(close_delim)[0]`` extract-between-markers
    idiom). TODO: restore the original delimiters from the source file.
    """
    with open(path) as f:
        content = f.readlines()
    # Drop the first character of the first line — presumably a BOM; verify.
    content[0] = content[0][1:]
    content = [x.split()[0].split('')[1].split('')[0] for x in content]
    return set(content)


def read_wordlist(path):
    """Read a plain one-word-per-line file and return the set of words.

    Strips only the trailing newline, so the last line is read correctly
    even when the file does not end with a newline (the previous
    ``line[:-1]`` silently truncated its final character in that case).
    """
    with open(path) as f:
        return {line.rstrip('\n') for line in f}


def filter_gigafida(gigafida_raw, min_limit, max_limit):
    """Keep infinitive-looking lemmas within a frequency band.

    Retains entries ending in 'ti' or 'či' (Slovene infinitive endings)
    whose frequency is strictly greater than ``min_limit`` and at most
    ``max_limit``.
    """
    return {word: num for word, num in gigafida_raw.items()
            if word.endswith(('ti', 'či')) and min_limit < num <= max_limit}


def set_list_intersection(gigafida_filtered, sloleks):
    """Return the sub-dict of ``gigafida_filtered`` whose keys are in ``sloleks``."""
    return {word: num for word, num in gigafida_filtered.items()
            if word in sloleks}


def list_list_union(list1, list2):
    """Union of two frequency dicts; on conflict the value from ``list1`` wins."""
    union = copy.copy(list1)
    for w, n in list2.items():
        if w not in list1:
            union[w] = n
    return union


def list_list_subtraction(list1, list2):
    """Return entries of ``list2`` whose keys do not appear in ``list1``."""
    return {w: n for w, n in list2.items() if w not in list1}


def set_set_subtraction(set1, set2):
    """Return ``set2 - set1`` as a dict with the placeholder frequency -1."""
    return {w: -1 for w in set2 if w not in set1}


def create_document(list1, path):
    """Write a {word: frequency} dict to ``path`` as word<TAB>count lines."""
    with open(path, "w") as text_file:
        for w, n in list1.items():
            text_file.write("%s\t%d\n" % (w, n))


def create_document_set(list1, path):
    """Write a set of words to ``path``, one word per line, sorted."""
    with open(path, "w") as text_file:
        for w in sorted(list1):
            text_file.write("%s\n" % w)


def gigafida_merge(sloleks, zele, gigafida_raw, giga_min, giga_max):
    """Return Gigafida entries in (giga_min, giga_max] not covered by Sloleks or Zele.

    Builds the union of (Gigafida ∩ Sloleks) and (Gigafida ∩ Zele), then
    returns the filtered Gigafida entries missing from that union.
    """
    gigafida_filtered = filter_gigafida(gigafida_raw, giga_min, giga_max)
    sloleks_gf_intersect = set_list_intersection(gigafida_filtered, sloleks)
    gigafida_filtered1 = filter_gigafida(gigafida_raw, 1, sys.maxsize)
    zele_gf_intersect = set_list_intersection(gigafida_filtered1, zele)
    sloleks_zele_union = list_list_union(sloleks_gf_intersect, zele_gf_intersect)
    return list_list_subtraction(sloleks_zele_union, gigafida_filtered)


def main(args):
    gigafida_raw = read_gigafida(args.gigafida_verb_list)
    sloleks = read_sloleks(args.sloleks)
    zele = read_zele(args.zele)

    if args.wordlist is not None:
        filtered_wordlist = read_wordlist(args.wordlist)
        # Extend the wordlist with every Sloleks lemma attested in Gigafida.
        for el in sloleks:
            if el in gigafida_raw:
                filtered_wordlist.add(el)
        create_document_set(filtered_wordlist, 'wordlist.tsv')

    gigafida_filtered3 = filter_gigafida(gigafida_raw, 2, sys.maxsize)
    sloleks_gf_intersect = set_list_intersection(gigafida_filtered3, sloleks)

    gigafida_filtered1 = filter_gigafida(gigafida_raw, 0, sys.maxsize)
    zele_gf_intersect = set_list_intersection(gigafida_filtered1, zele)
    sloleks_zele_union = list_list_union(sloleks_gf_intersect, zele_gf_intersect)
    sloleks_zele_subtraction = set_set_subtraction(sloleks, zele)

    create_document(gigafida_filtered3, 'gigafida_3+.tsv')
    create_document(sloleks_zele_union, 'gigafida_3+-sloleks_zele-presek.tsv')
    create_document(sloleks_zele_subtraction, 'sloleks-zele-razlika.tsv')

    gigafida_10 = gigafida_merge(sloleks, zele, gigafida_raw, 10, sys.maxsize)
    create_document(gigafida_10, 'gigafida_10+-sloleks_zele-razlika.tsv')

    gigafida_3_10 = gigafida_merge(sloleks, zele, gigafida_raw, 2, 10)
    create_document(gigafida_3_10, 'gigafida_3-10-sloleks_zele-razlika.tsv')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Extract keywords from multiple lists.')
    parser.add_argument('gigafida_verb_list',
                        help='Path to gigafida list of verbs in tsv format.')
    parser.add_argument('sloleks',
                        help='Path to Sloleks in tsv format.')
    parser.add_argument('--zele',
                        help='Path to zele valency dictionary.')
    parser.add_argument('--wordlist', default=None,
                        help='Path to filtered wordlist.')
    parser.add_argument('--handchecked_words', default=None,
                        help='Path to handchecked words.')
    parser.add_argument('--verbose',
                        help='Enable verbose output to stderr',
                        choices=["warning", "info", "debug"],
                        default="info", const="info", nargs='?')
    args = parser.parse_args()
    logging.basicConfig(stream=sys.stderr, level=args.verbose.upper())

    start = time.time()
    main(args)
    logging.info("TIME: {}".format(time.time() - start))