From 6bb3586051764beb5fd3a53cc4f32912404c334f Mon Sep 17 00:00:00 2001
From: Ozbolt Menegatti <ozbolt.menegatti@gmail.com>
Date: Tue, 10 Sep 2019 16:22:43 +0200
Subject: [PATCH 1/4] Attempt at speed optimization with sql-join

---
 src/match.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/src/match.py b/src/match.py
index 294029d..1d65db0 100644
--- a/src/match.py
+++ b/src/match.py
@@ -11,16 +11,22 @@ class StructureMatch:
     @staticmethod
     def from_db(db, colocation_id, structure):
         result = StructureMatch(colocation_id, structure)
-        for match_id in db.execute("SELECT mid_match_id FROM ColocationMatches WHERE mid_colocation_id=?", (colocation_id,)):
-            to_add = {}
+        prev_match_id = None
 
-            for component_id, word_lemma, word_text, word_msd, word_id in db.execute("""
-                SELECT component_id, word_lemma, word_text, word_msd, word_id 
-                FROM Matches WHERE match_id=?""", match_id):
+        stmt = """SELECT match_id, component_id, word_lemma, word_text, word_msd, word_id
+                  FROM ColocationMatches 
+                  JOIN Matches ON Matches.match_id=ColocationMatches.mid_match_id 
+                  WHERE mid_colocation_id=? 
+                  ORDER BY match_id"""
 
-                to_add[str(component_id)] = Word(word_lemma, word_msd, word_id, word_text, False)
+        for row in db.execute(stmt, (colocation_id,)): 
+            match_id, component_id, word_lemma, word_text, word_msd, word_id = row
 
-            result.matches.append(to_add)
+            if match_id != prev_match_id:
+                result.matches.append({})
+                prev_match_id = match_id
+
+            result.matches[-1][str(component_id)] = Word(word_lemma, word_msd, word_id, word_text, False)
         
         for component_id, text in db.execute("SELECT component_id, text FROM Representations WHERE colocation_id=?", (colocation_id,)):
             result.representations[str(component_id)] = text
-- 
2.45.2


From 8fee3f8a8e6cfac8dd397f7e1eba96bec35602dc Mon Sep 17 00:00:00 2001
From: Ozbolt Menegatti <ozbolt.menegatti@gmail.com>
Date: Wed, 11 Sep 2019 08:58:02 +0200
Subject: [PATCH 2/4] Testing delayed insertions of representations

---
 src/match_store.py | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/src/match_store.py b/src/match_store.py
index d5736f6..2eb189d 100644
--- a/src/match_store.py
+++ b/src/match_store.py
@@ -96,16 +96,32 @@ class MatchStore:
             print("Representation step already done, skipping")
             return
 
+        # Representation rows are buffered and written in batches of num_inserts matches.
+        num_inserts = 1000
+        inserts = []
+
+        def flush_inserts():
+            # Write the rows of every buffered match, then empty the buffer.
+            for pending in inserts:
+                for component_id, text in pending.representations.items():
+                    self.db.execute("""
+                        INSERT INTO Representations (colocation_id, component_id, text)
+                        VALUES (?,?,?)""", (pending.match_id, component_id, text))
+            inserts.clear()
+
         structures_dict = {s.id: s for s in structures}
         num_representations = int(self.db.execute("SELECT Count(*) FROM Colocations").fetchone()[0])
         for cid, sid in progress(self.db.execute("SELECT colocation_id, structure_id FROM Colocations"), "representations", total=num_representations):
             structure = structures_dict[sid]
             match = StructureMatch.from_db(self.db, cid, structure)
             RepresentationAssigner.set_representations(match, word_renderer)
-            for component_id, text in match.representations.items():
-                self.db.execute("""
-                    INSERT INTO Representations (colocation_id, component_id, text) 
-                    VALUES (?,?,?)""", (match.match_id, component_id, text))
+
+            inserts.append(match)
+            if len(inserts) >= num_inserts:
+                flush_inserts()
+
+        # The final batch is usually partial; it must be written before the step is marked done.
+        flush_inserts()
 
         self.db.step_is_done(step_name)
 
-- 
2.45.2


From 1d4c0238a66d0d5ef9750cb82e1d0028b9a8814c Mon Sep 17 00:00:00 2001
From: Ozbolt Menegatti <ozbolt.menegatti@gmail.com>
Date: Wed, 6 Nov 2019 02:39:26 +0100
Subject: [PATCH 3/4] fixing how min_freq is used and more verbose writer

---
 src/match_store.py | 10 +++++++++-
 src/writer.py      |  2 +-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/match_store.py b/src/match_store.py
index 2eb189d..378278f 100644
--- a/src/match_store.py
+++ b/src/match_store.py
@@ -9,6 +9,7 @@ class MatchStore:
     def __init__(self, args, db):
         self.db = db
         self.dispersions = {}
+        self.min_freq = args.min_freq
 
 
         self.db.init("""CREATE TABLE Colocations (
@@ -117,6 +118,10 @@ class MatchStore:
 
         self.db.step_is_done(step_name)
 
+    def has_colocation_id_enough_frequency(self, colocation_id):
+        """Return True when ColocationMatches holds at least ``self.min_freq`` rows for ``colocation_id``."""
+        return self.db.execute("SELECT COUNT(*) FROM ColocationMatches WHERE mid_colocation_id=?", (colocation_id,)).fetchone()[0] >= self.min_freq
+
     def determine_colocation_dispersions(self):
         step_name = 'dispersions'
         if self.db.is_step_done(step_name):
@@ -124,7 +129,10 @@ class MatchStore:
             return
 
         dispersions = defaultdict(int)
-        for structure_id, word_tups_str in progress(self.db.execute("SELECT structure_id, key FROM Colocations"), "dispersion"):
+        for colocation_id, structure_id, word_tups_str in progress(self.db.execute("SELECT colocation_id, structure_id, key FROM Colocations"), "dispersion"):
+            if not self.has_colocation_id_enough_frequency(colocation_id):
+                continue
+
             word_tups = literal_eval(word_tups_str)
             for component_id, lemma in word_tups:
                 dispersions[(str(structure_id), component_id, lemma)] += 1
diff --git a/src/writer.py b/src/writer.py
index 9e3e96d..8695de4 100644
--- a/src/writer.py
+++ b/src/writer.py
@@ -77,7 +77,7 @@ class Writer:
         rows = []
         components = structure.components
 
-        for match in colocation_ids.get_matches_for(structure):
+        for match in progress(colocation_ids.get_matches_for(structure), "Writing matches: {}".format(structure.id)):
             if len(match) < self.min_frequency:
                 continue
 
-- 
2.45.2


From 9e8cd2a2ec06dcb152f3e862a672ddbc3a0c597b Mon Sep 17 00:00:00 2001
From: Ozbolt Menegatti <ozbolt.menegatti@gmail.com>
Date: Mon, 2 Mar 2020 19:13:19 +0100
Subject: [PATCH 4/4] Issue #1000

---
 issue1000/README.md |  5 ++++
 issue1000/step0.py  | 25 +++++++++++++++++++
 issue1000/step1.py  | 43 ++++++++++++++++++++++++++++++++
 issue1000/step2.py  | 60 ++++++++++++++++++++++++++++++++++++++++++++
 issue1000/step3.py  | 61 +++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 194 insertions(+)
 create mode 100644 issue1000/README.md
 create mode 100644 issue1000/step0.py
 create mode 100644 issue1000/step1.py
 create mode 100644 issue1000/step2.py
 create mode 100644 issue1000/step3.py

diff --git a/issue1000/README.md b/issue1000/README.md
new file mode 100644
index 0000000..56df9d3
--- /dev/null
+++ b/issue1000/README.md
@@ -0,0 +1,5 @@
+# issue 1000
+
+These four scripts were created as part of issue number 1000. They will one day be integrated into the application itself, but for now they live here.
+
+If you have any questions, contact me. In any case, the scripts are really short, so you should be able to understand them fairly quickly.
diff --git a/issue1000/step0.py b/issue1000/step0.py
new file mode 100644
index 0000000..caabd6e
--- /dev/null
+++ b/issue1000/step0.py
@@ -0,0 +1,25 @@
+"""Collect selected cells of every line of FILE_OUT2D into a JSON dict keyed by cid.
+
+Usage: step0.py FILE_OUT2D C2 FILE_OUT
+"""
+import sys
+import json
+
+FILE_OUT2D = sys.argv[1]
+C2 = sys.argv[2]
+FILE_OUT = sys.argv[3]
+
+# lemma2 is read C2 * 5 cells after lemma1 (cell 1); converted once, not per line.
+LEMMA2_CELL = 1 + int(C2) * 5
+data_out = {}
+
+with open(FILE_OUT2D, 'r') as fp:
+    for line in fp:
+        # Drop the line terminator so the last cell (fd) is stored without it.
+        cells = line.rstrip("\n").split(", ")
+        # These four cells are addressed from the end of the line.
+        cid, rep, freq, fd = cells[-8], cells[-7], cells[-6], cells[-1]
+        data_out[cid] = [cells[1], cells[LEMMA2_CELL], rep, freq, fd, cid]
+
+with open(FILE_OUT, 'w') as fp:
+    json.dump(data_out, fp)
diff --git a/issue1000/step1.py b/issue1000/step1.py
new file mode 100644
index 0000000..b9d81a9
--- /dev/null
+++ b/issue1000/step1.py
@@ -0,0 +1,43 @@
+"""Print, for each colocation of structure STRUCTURE_ID, the numbers cut from its matches' word ids.
+
+Usage: step1.py DATABASE  (sqlite file; result rows go to stdout, progress to stderr)
+"""
+import sqlite3
+import sys
+
+STRUCTURE_ID = '1'
+
+# ORDER BY keeps all rows of one match adjacent; the duplicate check below
+# relies on that, and SQLite guarantees no row order without it.
+QUERY = """SELECT Matches.match_id, Matches.word_id, Colocations.colocation_id
+FROM Matches, Colocations, ColocationMatches
+WHERE Matches.match_id = ColocationMatches.mid_match_id
+AND Colocations.colocation_id = ColocationMatches.mid_colocation_id
+AND Colocations.structure_id = ?
+ORDER BY Matches.match_id"""
+
+data_out = {}
+prev_mid = None
+idx = 0
+
+con = sqlite3.connect(sys.argv[1])
+try:
+    for mid, wid, cid in con.execute(QUERY, (STRUCTURE_ID, )):
+        # Only the first row of every match contributes.
+        if mid == prev_mid:
+            continue
+        prev_mid = mid
+
+        # Characters 2..8 of the word id are stored as an integer.
+        data_out.setdefault(cid, []).append(int(wid[2:9]))
+
+        idx += 1
+        if idx % 10000 == 0:
+            print("\r{}".format(idx), end="", flush=True, file=sys.stderr)
+finally:
+    con.close()
+
+print("", file=sys.stderr)
+# One output line per colocation: its id followed by the collected numbers.
+for cid, wids in data_out.items():
+    print(cid, *wids)
diff --git a/issue1000/step2.py b/issue1000/step2.py
new file mode 100644
index 0000000..f9cb295
--- /dev/null
+++ b/issue1000/step2.py
@@ -0,0 +1,60 @@
+"""Collect year, text type and word count of every XML file under a folder into JSON.
+
+Usage: step2.py FOLDER_XMLS FILE_OUT
+"""
+import sys
+import re
+import pathlib
+import mmap
+from datetime import datetime
+import json
+
+FOLDER_XMLS = sys.argv[1]
+FILE_OUT = sys.argv[2]
+
+# Maps the catRef target suffix found in a file to the type name written out.
+TYPES = {
+    "SSJ.I": "internet",
+    "SSJ.T.D": "drugo",
+    "SSJ.T.P.C": "casopisi",
+    "SSJ.T.P.R": "revije",
+    "SSJ.T.K.S": "stvarno",
+    "SSJ.T.K.L": "leposlovje",
+    "SSJ.T.K.N": "stvarno",
+}
+
+# Byte patterns, because the files are searched as raw memory maps.
+WORD_TAG = b"<w a"
+searcher_date = re.compile(rb"<date>([^<]+)</date>")
+searcher_type = re.compile(rb'<catRef target="ssj:([^"]+)')
+
+xml_data = {}
+xml_files = list(pathlib.Path(FOLDER_XMLS).glob("**/*.xml"))
+
+for idx, filename in enumerate(xml_files, start=1):
+    print("\r{}/{}: {}".format(idx, len(xml_files), filename.stem), end="")
+
+    # The context managers release the file and the mapping even on errors.
+    with open(str(filename), "rb") as fp, \
+            mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ) as data:
+        # The second <date> element is the one used. Two plain search() calls replace
+        # a finditer() iterator, which can keep the mapping exported when it is closed.
+        first_date = searcher_date.search(data)
+        match1 = searcher_date.search(data, first_date.end())
+        match2 = searcher_type.search(data)
+
+        # Count every occurrence of the word tag; find() returns -1 when none is left.
+        words, fidx = 0, data.find(WORD_TAG)
+        while fidx >= 0:
+            words += 1
+            fidx = data.find(WORD_TAG, fidx + len(WORD_TAG))
+
+        xml_data[int(filename.stem[2:9])] = {
+            "date": int(match1.group(1)[:4]),
+            "type": TYPES[match2.group(1).decode('ascii')],
+            "words": words
+        }
+
+
+with open(FILE_OUT, 'w') as fp:
+    json.dump(xml_data, fp, indent=2)
diff --git a/issue1000/step3.py b/issue1000/step3.py
new file mode 100644
index 0000000..d392439
--- /dev/null
+++ b/issue1000/step3.py
@@ -0,0 +1,61 @@
+"""Write a table of per-year collocation counts, absolute and per million words.
+
+Usage: step3.py FILE_STEP1 XMLS_INFO CIDS_INFO FREQ_MIN FILE_STEP3
+"""
+import sys
+import json
+from collections import Counter, defaultdict
+
+FILE_STEP1 = sys.argv[1]
+XMLS_INFO = sys.argv[2]
+CIDS_INFO = sys.argv[3]
+FREQ_MIN = int(sys.argv[4])
+FILE_STEP3 = sys.argv[5]
+
+
+def per_million(count, total):
+    """Format count / total * 1e6; a total of 0 gives "0.000" instead of failing."""
+    return "{:2.3f}".format(count / total * 1e6 if total else 0.0)
+
+
+with open(XMLS_INFO, "r") as fp:
+    xml_data = json.load(fp)
+
+with open(CIDS_INFO, "r") as fp:
+    cids_data = json.load(fp)
+
+# Total number of words per year, summed over all files.
+year_sums = defaultdict(int)
+for file_data in xml_data.values():
+    year_sums[file_data['date']] += file_data['words']
+
+# One column per year from the earliest to the latest, including years without files.
+all_years = range(min(year_sums), max(year_sums) + 1)
+
+# Maps a colocation id to a Counter of the years of the ids listed for it.
+data_out = {}
+with open(FILE_STEP1, "r") as fp:
+    # NOTE(review): the first line is discarded as a header - confirm that the
+    # step1 output really starts with one, otherwise a data row is lost.
+    next(fp, None)
+    for line in fp:
+        cid, *wids = line.split()
+        data_out[int(cid)] = Counter(xml_data[wid]['date'] for wid in wids)
+
+with open(FILE_STEP3, 'w') as fp:
+    print("collocation_id, lemma1, lemma2, Joint-representative-form, frequency, distinct_forms, ", end="", file=fp)
+    print(", ".join(str(y) for y in all_years), end=", ", file=fp)
+    print(", ".join(str(y) + "pm" for y in all_years), file=fp)
+
+    for cid, ctr_year in sorted(data_out.items()):
+        # frequency < 2 is filtered in cids data!
+        if str(cid) not in cids_data:
+            continue
+
+        lemma1, lemma2, rep, freq, fd, _ = cids_data[str(cid)]
+        freq, fd = int(freq), int(fd)
+        if freq < FREQ_MIN:
+            continue
+        print("{}, {}, {}, {}, {}, {}, ".format(cid, lemma1, lemma2, rep, freq, fd), end="", file=fp)
+        print(", ".join(str(ctr_year[y]) for y in all_years), end=", ", file=fp)
+        print(", ".join(per_million(ctr_year[y], year_sums[y]) for y in all_years), file=fp)
-- 
2.45.2