diff --git a/issue1000/README.md b/issue1000/README.md
new file mode 100644
index 0000000..56df9d3
--- /dev/null
+++ b/issue1000/README.md
@@ -0,0 +1,5 @@
+# issue 1000
+
+These four scripts were created as part of issue number 1000. They will one day be integrated into the application itself, but for now they live here.
+
+If you have any questions, contact me. In any case, the scripts are really short, so you should be able to understand them fairly quickly.
diff --git a/issue1000/step0.py b/issue1000/step0.py
new file mode 100644
index 0000000..caabd6e
--- /dev/null
+++ b/issue1000/step0.py
@@ -0,0 +1,44 @@
+"""Extract selected columns from a comma-space-separated file into a JSON map.
+
+Usage: step0.py FILE_OUT2D C2 FILE_OUT
+
+Every non-empty line of FILE_OUT2D is split on ", ".  Six cells are picked
+(by fixed positions, one of them derived from C2) and stored under the cell
+at index -8; the resulting dict is written to FILE_OUT as JSON.
+"""
+import sys
+import json
+
+FILE_OUT2D = sys.argv[1]
+C2 = sys.argv[2]
+FILE_OUT = sys.argv[3]
+
+# Position of the second lemma; computed once instead of on every line.
+LEMMA2_INDEX = 1 + int(C2) * 5
+
+data_out = {}
+
+with open(FILE_OUT2D, 'r') as fp:
+    for line in fp:
+        # Remove the line terminator, otherwise the last cell (fd) would be
+        # stored with a trailing newline character.
+        line = line.rstrip("\n")
+        if not line:
+            # A blank line has no cells to index; skip it instead of crashing.
+            continue
+
+        cells = line.split(", ")
+
+        lemma1 = cells[1]
+        lemma2 = cells[LEMMA2_INDEX]
+        rep = cells[-7]
+        freq = cells[-6]
+        fd = cells[-1]
+        cid = cells[-8]
+
+        # Later lines with the same cid replace earlier ones.
+        data_out[cid] = [lemma1, lemma2, rep, freq, fd, cid]
+
+with open(FILE_OUT, 'w') as fp:
+    json.dump(data_out, fp)
+
diff --git a/issue1000/step1.py b/issue1000/step1.py
new file mode 100644
index 0000000..b9d81a9
--- /dev/null
+++ b/issue1000/step1.py
@@ -0,0 +1,46 @@
+"""Print, for one structure, the numeric word ids of each colocation.
+
+Usage: step1.py DATABASE
+
+Output (stdout): one line per colocation id, followed by the integer parts
+of the word ids collected for it.  Progress goes to stderr.
+"""
+import sqlite3
+import sys
+
+STRUCTURE_ID = '1'
+
+con = sqlite3.connect(sys.argv[1])
+cur = con.cursor()
+
+data_out = {}
+
+cur.execute("""SELECT Matches.match_id, Matches.word_id, Colocations.colocation_id
+FROM Matches, Colocations, ColocationMatches
+WHERE Matches.match_id = ColocationMatches.mid_match_id
+AND Colocations.colocation_id = ColocationMatches.mid_colocation_id
+AND Colocations.structure_id = ?""", (STRUCTURE_ID, ))
+
+last_match_id = None
+kept_rows = 0
+
+# Iterating the cursor yields rows one at a time until the result is exhausted.
+for match_id, word_id, colocation_id in cur:
+    # Only the first row of each run of identical match ids is used.
+    if match_id == last_match_id:
+        continue
+
+    # Characters 2..8 of the word id hold the number that is kept.
+    word_number = int(word_id[2:9])
+    data_out.setdefault(colocation_id, []).append(word_number)
+
+    last_match_id = match_id
+    kept_rows += 1
+    if kept_rows % 10000 == 0:
+        print("\r{}".format(kept_rows), end="", flush=True, file=sys.stderr)
+
+print("", file=sys.stderr)
+for colocation_id, word_numbers in data_out.items():
+    print(colocation_id, *word_numbers)
+
+con.close()
diff --git a/issue1000/step2.py b/issue1000/step2.py
new file mode 100644
index 0000000..f9cb295
--- /dev/null
+++ b/issue1000/step2.py
@@ -0,0 +1,60 @@
+import sys
+import re
+import pathlib
+import mmap
+from datetime import datetime
+import json
+
+FOLDER_XMLS = sys.argv[1]
+FILE_OUT = sys.argv[2]
+
+TYPES = {
+ "SSJ.I": "internet",
+ "SSJ.T.D": "drugo",
+ "SSJ.T.P.C": "casopisi",
+ "SSJ.T.P.R": "revije",
+ "SSJ.T.K.S": "stvarno",
+ "SSJ.T.K.L": "leposlovje",
+ "SSJ.T.K.N": "stvarno",
+}
+
+xml_data = {}
+
+searcher_date = re.compile(b"([^<]+)")
+searcher_type = re.compile(b"= 0:
+ fidx = data.find(b" num_inserts:
+ for match in inserts:
+ for component_id, text in match.representations.items():
+ self.db.execute("""
+ INSERT INTO Representations (colocation_id, component_id, text)
+ VALUES (?,?,?)""", (match.match_id, component_id, text))
+ inserts = []
self.db.step_is_done(step_name)
+    def has_colocation_id_enough_frequency(self, colocation_id):
+        """Tell whether the colocation has at least ``self.min_freq`` matches.
+
+        The query clamps the number of ColocationMatches rows for this
+        colocation to the range [min_freq - 1, min_freq]; the clamped value
+        reaches min_freq exactly when the real count does.
+        """
+        query = "SELECT MIN(MAX(COUNT(*), ?), ?) FROM ColocationMatches WHERE mid_colocation_id=?"
+        params = (self.min_freq - 1, self.min_freq, colocation_id)
+        clamped_count = self.db.execute(query, params).fetchone()[0]
+        return clamped_count >= self.min_freq
+
def determine_colocation_dispersions(self):
step_name = 'dispersions'
if self.db.is_step_done(step_name):
@@ -116,7 +129,10 @@ class MatchStore:
return
dispersions = defaultdict(int)
- for structure_id, word_tups_str in progress(self.db.execute("SELECT structure_id, key FROM Colocations"), "dispersion"):
+ for colocation_id, structure_id, word_tups_str in progress(self.db.execute("SELECT colocation_id, structure_id, key FROM Colocations"), "dispersion"):
+ if not self.has_colocation_id_enough_frequency(colocation_id):
+ continue
+
word_tups = literal_eval(word_tups_str)
for component_id, lemma in word_tups:
dispersions[(str(structure_id), component_id, lemma)] += 1
diff --git a/src/writer.py b/src/writer.py
index 9e3e96d..8695de4 100644
--- a/src/writer.py
+++ b/src/writer.py
@@ -77,7 +77,7 @@ class Writer:
rows = []
components = structure.components
- for match in colocation_ids.get_matches_for(structure):
+ for match in progress(colocation_ids.get_matches_for(structure), "Writing matches: {}".format(structure.id)):
if len(match) < self.min_frequency:
continue