From 6bb3586051764beb5fd3a53cc4f32912404c334f Mon Sep 17 00:00:00 2001 From: Ozbolt Menegatti Date: Tue, 10 Sep 2019 16:22:43 +0200 Subject: [PATCH 1/4] Attempt at speed optimization with sql-join --- src/match.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/match.py b/src/match.py index 294029d..1d65db0 100644 --- a/src/match.py +++ b/src/match.py @@ -11,16 +11,22 @@ class StructureMatch: @staticmethod def from_db(db, colocation_id, structure): result = StructureMatch(colocation_id, structure) - for match_id in db.execute("SELECT mid_match_id FROM ColocationMatches WHERE mid_colocation_id=?", (colocation_id,)): - to_add = {} + prev_match_id = None - for component_id, word_lemma, word_text, word_msd, word_id in db.execute(""" - SELECT component_id, word_lemma, word_text, word_msd, word_id - FROM Matches WHERE match_id=?""", match_id): + stmt = """SELECT match_id, component_id, word_lemma, word_text, word_msd, word_id + FROM ColocationMatches + JOIN Matches ON Matches.match_id=ColocationMatches.mid_match_id + WHERE mid_colocation_id=? 
+ ORDER BY match_id""" - to_add[str(component_id)] = Word(word_lemma, word_msd, word_id, word_text, False) + for row in db.execute(stmt, (colocation_id,)): + match_id, component_id, word_lemma, word_text, word_msd, word_id = row - result.matches.append(to_add) + if match_id != prev_match_id: + result.matches.append({}) + prev_match_id = match_id + + result.matches[-1][str(component_id)] = Word(word_lemma, word_msd, word_id, word_text, False) for component_id, text in db.execute("SELECT component_id, text FROM Representations WHERE colocation_id=?", (colocation_id,)): result.representations[str(component_id)] = text -- 2.45.1 From 8fee3f8a8e6cfac8dd397f7e1eba96bec35602dc Mon Sep 17 00:00:00 2001 From: Ozbolt Menegatti Date: Wed, 11 Sep 2019 08:58:02 +0200 Subject: [PATCH 2/4] Testing delayed insertions of representations --- src/match_store.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/match_store.py b/src/match_store.py index d5736f6..2eb189d 100644 --- a/src/match_store.py +++ b/src/match_store.py @@ -96,16 +96,24 @@ class MatchStore: print("Representation step already done, skipping") return + num_inserts = 1000 + inserts = [] + structures_dict = {s.id: s for s in structures} num_representations = int(self.db.execute("SELECT Count(*) FROM Colocations").fetchone()[0]) for cid, sid in progress(self.db.execute("SELECT colocation_id, structure_id FROM Colocations"), "representations", total=num_representations): structure = structures_dict[sid] match = StructureMatch.from_db(self.db, cid, structure) RepresentationAssigner.set_representations(match, word_renderer) - for component_id, text in match.representations.items(): - self.db.execute(""" - INSERT INTO Representations (colocation_id, component_id, text) - VALUES (?,?,?)""", (match.match_id, component_id, text)) + + inserts.append(match) + if len(inserts) > num_inserts: + for match in inserts: + for component_id, text in match.representations.items(): + 
self.db.execute(""" + INSERT INTO Representations (colocation_id, component_id, text) + VALUES (?,?,?)""", (match.match_id, component_id, text)) + inserts = [] self.db.step_is_done(step_name) -- 2.45.1 From 1d4c0238a66d0d5ef9750cb82e1d0028b9a8814c Mon Sep 17 00:00:00 2001 From: Ozbolt Menegatti Date: Wed, 6 Nov 2019 02:39:26 +0100 Subject: [PATCH 3/4] fixing how min_freq is used and more verbose writer --- src/match_store.py | 10 +++++++++- src/writer.py | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/match_store.py b/src/match_store.py index 2eb189d..378278f 100644 --- a/src/match_store.py +++ b/src/match_store.py @@ -9,6 +9,7 @@ class MatchStore: def __init__(self, args, db): self.db = db self.dispersions = {} + self.min_freq = args.min_freq self.db.init("""CREATE TABLE Colocations ( @@ -117,6 +118,10 @@ class MatchStore: self.db.step_is_done(step_name) + def has_colocation_id_enough_frequency(self, colocation_id): + matches = self.db.execute("SELECT MIN(MAX(COUNT(*), ?), ?) 
FROM ColocationMatches WHERE mid_colocation_id=?", (self.min_freq - 1, self.min_freq, colocation_id)).fetchone()[0] + return matches >= self.min_freq + def determine_colocation_dispersions(self): step_name = 'dispersions' if self.db.is_step_done(step_name): @@ -124,7 +129,10 @@ class MatchStore: return dispersions = defaultdict(int) - for structure_id, word_tups_str in progress(self.db.execute("SELECT structure_id, key FROM Colocations"), "dispersion"): + for colocation_id, structure_id, word_tups_str in progress(self.db.execute("SELECT colocation_id, structure_id, key FROM Colocations"), "dispersion"): + if not self.has_colocation_id_enough_frequency(colocation_id): + continue + word_tups = literal_eval(word_tups_str) for component_id, lemma in word_tups: dispersions[(str(structure_id), component_id, lemma)] += 1 diff --git a/src/writer.py b/src/writer.py index 9e3e96d..8695de4 100644 --- a/src/writer.py +++ b/src/writer.py @@ -77,7 +77,7 @@ class Writer: rows = [] components = structure.components - for match in colocation_ids.get_matches_for(structure): + for match in progress(colocation_ids.get_matches_for(structure), "Writing matches: {}".format(structure.id)): if len(match) < self.min_frequency: continue -- 2.45.1 From 9e8cd2a2ec06dcb152f3e862a672ddbc3a0c597b Mon Sep 17 00:00:00 2001 From: Ozbolt Menegatti Date: Mon, 2 Mar 2020 19:13:19 +0100 Subject: [PATCH 4/4] Issue #1000 --- issue1000/README.md | 5 ++++ issue1000/step0.py | 25 +++++++++++++++++++ issue1000/step1.py | 43 ++++++++++++++++++++++++++++++++ issue1000/step2.py | 60 ++++++++++++++++++++++++++++++++++++++++++++ issue1000/step3.py | 61 +++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 194 insertions(+) create mode 100644 issue1000/README.md create mode 100644 issue1000/step0.py create mode 100644 issue1000/step1.py create mode 100644 issue1000/step2.py create mode 100644 issue1000/step3.py diff --git a/issue1000/README.md b/issue1000/README.md new file mode 100644 index 
0000000..56df9d3 --- /dev/null +++ b/issue1000/README.md @@ -0,0 +1,5 @@ +# issue 1000 + +These four scripts were created as part of issue number 1000. They will one day be integrated into the application itself, but for now they live here. + +If you have any questions, contact me. Anyway, the scripts are really short and you should be able to understand them fairly quickly. diff --git a/issue1000/step0.py b/issue1000/step0.py new file mode 100644 index 0000000..caabd6e --- /dev/null +++ b/issue1000/step0.py @@ -0,0 +1,25 @@ +import sys +import json + +FILE_OUT2D = sys.argv[1] +C2 = sys.argv[2] +FILE_OUT = sys.argv[3] + +data_out = {} + +with open(FILE_OUT2D, 'r') as fp: + for line in fp: + cells = line.split(", ") + + lemma1 = cells[1] + lemma2 = cells[1 + int(C2) * 5] + rep = cells[-7] + freq = cells[-6] + fd = cells[-1] + cid = cells[-8] + + data_out[cid] = [lemma1, lemma2, rep, freq, fd, cid] + +with open(FILE_OUT, 'w') as fp: + json.dump(data_out, fp) + diff --git a/issue1000/step1.py b/issue1000/step1.py new file mode 100644 index 0000000..b9d81a9 --- /dev/null +++ b/issue1000/step1.py @@ -0,0 +1,43 @@ +import sqlite3 +import sys + +STRUCTURE_ID = '1' + +con = sqlite3.connect(sys.argv[1]) +cur = con.cursor() + +data_out = {} + +cur.execute("""SELECT Matches.match_id, Matches.word_id, Colocations.colocation_id +FROM Matches, Colocations, ColocationMatches +WHERE Matches.match_id = ColocationMatches.mid_match_id +AND Colocations.colocation_id = ColocationMatches.mid_colocation_id +AND Colocations.structure_id = ?""", (STRUCTURE_ID, )) + +prev_mid = None +idx = 0 + +while True: + row = cur.fetchone() + if row is None: + break + + mid, wid, cid = row + if mid == prev_mid: + continue + elif cid not in data_out: + data_out[cid] = [] + + wid_int = int(wid[2:9]) + data_out[cid].append(wid_int) + + prev_mid = mid + idx += 1 + if(idx % 10000 == 0): + print("\r{}".format(idx), end="", flush=True, file=sys.stderr) + +print("", file=sys.stderr) +for mid, wids in data_out.items(): +
print(mid, *wids) + +con.close() diff --git a/issue1000/step2.py b/issue1000/step2.py new file mode 100644 index 0000000..f9cb295 --- /dev/null +++ b/issue1000/step2.py @@ -0,0 +1,60 @@ +import sys +import re +import pathlib +import mmap +from datetime import datetime +import json + +FOLDER_XMLS = sys.argv[1] +FILE_OUT = sys.argv[2] + +TYPES = { + "SSJ.I": "internet", + "SSJ.T.D": "drugo", + "SSJ.T.P.C": "casopisi", + "SSJ.T.P.R": "revije", + "SSJ.T.K.S": "stvarno", + "SSJ.T.K.L": "leposlovje", + "SSJ.T.K.N": "stvarno", +} + +xml_data = {} + +searcher_date = re.compile(b"([^<]+)") +searcher_type = re.compile(b"= 0: + fidx = data.find(b"