diff --git a/issue1000/README.md b/issue1000/README.md
new file mode 100644
index 0000000..56df9d3
--- /dev/null
+++ b/issue1000/README.md
@@ -0,0 +1,5 @@
+# issue 1000
+
+These four scripts were created as part of issue number 1000. This will one day be integrated into the application itself, but for now it's here.
+
+If you have any questions, contact me. Anyway, the scripts are really short and you should be able to understand them fairly quickly.
diff --git a/issue1000/step0.py b/issue1000/step0.py
new file mode 100644
index 0000000..caabd6e
--- /dev/null
+++ b/issue1000/step0.py
@@ -0,0 +1,34 @@
+"""Collect selected cells of each FILE_OUT2D row into a JSON map keyed by cid.
+
+Usage: step0.py FILE_OUT2D C2 FILE_OUT
+"""
+import sys
+import json
+
+FILE_OUT2D = sys.argv[1]
+C2 = sys.argv[2]
+FILE_OUT = sys.argv[3]
+
+# lemma2 is read from cell 1 + C2 * 5; compute the index once, not per row.
+LEMMA2_IDX = 1 + int(C2) * 5
+
+data_out = {}
+
+with open(FILE_OUT2D, 'r') as fp:
+    for line in fp:
+        # Strip the line terminator before splitting; otherwise it stays
+        # attached to the last cell and leaks into the stored fd value.
+        cells = line.rstrip("\n").split(", ")
+
+        lemma1 = cells[1]
+        lemma2 = cells[LEMMA2_IDX]
+        rep = cells[-7]
+        freq = cells[-6]
+        fd = cells[-1]
+        cid = cells[-8]
+
+        data_out[cid] = [lemma1, lemma2, rep, freq, fd, cid]
+
+with open(FILE_OUT, 'w') as fp:
+    json.dump(data_out, fp)
+
diff --git a/issue1000/step1.py b/issue1000/step1.py
new file mode 100644
index 0000000..b9d81a9
--- /dev/null
+++ b/issue1000/step1.py
@@ -0,0 +1,43 @@
+import sqlite3
+import sys
+
+STRUCTURE_ID = '1'
+
+con = sqlite3.connect(sys.argv[1])
+cur = con.cursor()
+
+data_out = {}
+
+cur.execute("""SELECT Matches.match_id, Matches.word_id, Colocations.colocation_id
+FROM Matches, Colocations, ColocationMatches
+WHERE Matches.match_id = ColocationMatches.mid_match_id
+AND Colocations.colocation_id = ColocationMatches.mid_colocation_id
+AND Colocations.structure_id = ?""", (STRUCTURE_ID, ))
+
+prev_mid = None
+idx = 0
+
+while True:
+    row = cur.fetchone()
+    if row is None:
+        break
+
+    mid, wid, cid = row
+    if mid == prev_mid:
+        continue
+    elif cid not in data_out:
+        data_out[cid] = []
+
+    wid_int = int(wid[2:9])
+    data_out[cid].append(wid_int)
+
+    prev_mid = mid
+    idx += 1
+    if(idx % 10000 == 0):
+        print("\r{}".format(idx), end="", flush=True,
file=sys.stderr) + +print("", file=sys.stderr) +for mid, wids in data_out.items(): + print(mid, *wids) + +con.close() diff --git a/issue1000/step2.py b/issue1000/step2.py new file mode 100644 index 0000000..f9cb295 --- /dev/null +++ b/issue1000/step2.py @@ -0,0 +1,60 @@ +import sys +import re +import pathlib +import mmap +from datetime import datetime +import json + +FOLDER_XMLS = sys.argv[1] +FILE_OUT = sys.argv[2] + +TYPES = { + "SSJ.I": "internet", + "SSJ.T.D": "drugo", + "SSJ.T.P.C": "casopisi", + "SSJ.T.P.R": "revije", + "SSJ.T.K.S": "stvarno", + "SSJ.T.K.L": "leposlovje", + "SSJ.T.K.N": "stvarno", +} + +xml_data = {} + +searcher_date = re.compile(b"([^<]+)") +searcher_type = re.compile(b"= 0: + fidx = data.find(b" num_inserts: + for match in inserts: + for component_id, text in match.representations.items(): + self.db.execute(""" + INSERT INTO Representations (colocation_id, component_id, text) + VALUES (?,?,?)""", (match.match_id, component_id, text)) + inserts = [] self.db.step_is_done(step_name) + def has_colocation_id_enough_frequency(self, colocation_id): + matches = self.db.execute("SELECT MIN(MAX(COUNT(*), ?), ?) 
FROM ColocationMatches WHERE mid_colocation_id=?", (self.min_freq - 1, self.min_freq, colocation_id)).fetchone()[0] + return matches >= self.min_freq + def determine_colocation_dispersions(self): step_name = 'dispersions' if self.db.is_step_done(step_name): @@ -116,7 +129,10 @@ class MatchStore: return dispersions = defaultdict(int) - for structure_id, word_tups_str in progress(self.db.execute("SELECT structure_id, key FROM Colocations"), "dispersion"): + for colocation_id, structure_id, word_tups_str in progress(self.db.execute("SELECT colocation_id, structure_id, key FROM Colocations"), "dispersion"): + if not self.has_colocation_id_enough_frequency(colocation_id): + continue + word_tups = literal_eval(word_tups_str) for component_id, lemma in word_tups: dispersions[(str(structure_id), component_id, lemma)] += 1 diff --git a/src/writer.py b/src/writer.py index 9e3e96d..8695de4 100644 --- a/src/writer.py +++ b/src/writer.py @@ -77,7 +77,7 @@ class Writer: rows = [] components = structure.components - for match in colocation_ids.get_matches_for(structure): + for match in progress(colocation_ids.get_matches_for(structure), "Writing matches: {}".format(structure.id)): if len(match) < self.min_frequency: continue