47 lines
1.4 KiB
Python
47 lines
1.4 KiB
Python
|
from collections import defaultdict
|
||
|
|
||
|
from match import StructureMatch
|
||
|
from representation_assigner import RepresentationAssigner
|
||
|
|
||
|
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable, *args, **kwargs):
        """Fallback used when tqdm is not installed: return ``iterable`` as is.

        Extra positional and keyword arguments (such as ``desc`` or
        ``total``) are accepted and ignored, so call sites written for the
        real ``tqdm`` keep working without the optional dependency.
        """
        return iterable
|
||
|
|
||
|
class MatchStore:
    """Keyed collection of ``StructureMatch`` objects.

    Matches are grouped under a key; the first match seen for a key creates
    a new ``StructureMatch`` and later matches are appended to it.
    """

    def __init__(self, args):
        """Create an empty store.

        Args:
            args: Object exposing a ``min_freq`` attribute (presumably the
                parsed command-line options -- confirm against the caller).
        """
        # Maps a match key to its StructureMatch.
        self.data = {}
        # Kept for users of the store; not read anywhere inside this class.
        self.min_frequency = args.min_freq
        # Populated by determine_colocation_dispersions().
        self.dispersions = {}

    def _add_match(self, key, structure, match):
        """Append ``match`` to the entry for ``key``, creating it if needed.

        A newly created ``StructureMatch`` receives a 1-based string id
        derived from the number of entries already stored.
        """
        if key not in self.data:
            next_id = str(len(self.data) + 1)
            self.data[key] = StructureMatch(next_id, structure)
        self.data[key].append(match)

    def get(self, key, n):
        """Return item ``n`` of the entry stored under ``key``.

        Raises:
            KeyError: If ``key`` is not present in the store.
        """
        return self.data[key][n]

    def add_matches(self, matches):
        """Add every match from ``matches`` to the store.

        Args:
            matches: Mapping from a structure to an iterable of indexable
                items where index ``0`` is the match and index ``1`` is the
                key it is stored under.
        """
        for structure, nms in matches.items():
            for nm in nms:
                self._add_match(nm[1], structure, nm[0])

    def get_matches_for(self, structure):
        """Yield each stored entry whose ``structure`` equals ``structure``."""
        for sm in self.data.values():
            if sm.structure != structure:
                continue

            yield sm

    def set_representations(self, word_renderer):
        """Run ``RepresentationAssigner.set_representations`` on every entry.

        Iteration is wrapped in ``tqdm``.
        """
        for sm in tqdm(self.data.values()):
            RepresentationAssigner.set_representations(sm, word_renderer)

    def determine_colocation_dispersions(self):
        """Count, per component/lemma, how many stored keys contain it.

        Each key is expected to have the shape
        ``(structure_id, (component_id, lemma), ...)``. The result is stored
        in ``self.dispersions`` as a plain dict mapping
        ``(structure_id, component_id, lemma)`` to the number of keys in
        which that pair appears for that structure.
        """
        dispersions = defaultdict(int)
        for structure_id, *word_tups in self.data:
            for component_id, lemma in word_tups:
                dispersions[(structure_id, component_id, lemma)] += 1
        # Convert so later lookups of unknown keys raise instead of
        # silently inserting zero counts.
        self.dispersions = dict(dispersions)
|