sql-join-test #1

Manually merged
ozbolt merged 4 commits from sql-join-test into master 2020-03-02 19:12:38 +00:00
8 changed files with 229 additions and 13 deletions

issue1000/README.md Normal file (+5)

@@ -0,0 +1,5 @@
# issue 1000
These four scripts were created as part of issue number 1000. They will one day be integrated into the application itself, but for now they live here.
If you have any questions, contact me. The scripts are really short, so you should be able to understand them fairly quickly.
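For orientation, the scripts might be invoked in this order (a sketch only; every path and the minimum-frequency value below are placeholders):

# Hypothetical invocation order; all paths are placeholders.
import subprocess

subprocess.run(["python3", "step0.py", "colocations_2d.csv", "0", "cids_info.json"], check=True)
with open("step1_out.txt", "w") as out:
    subprocess.run(["python3", "step1.py", "colocations.db"], stdout=out, check=True)
subprocess.run(["python3", "step2.py", "xml_folder/", "xml_info.json"], check=True)
subprocess.run(["python3", "step3.py", "step1_out.txt", "xml_info.json",
                "cids_info.json", "2", "final.csv"], check=True)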

issue1000/step0.py Normal file (+25)

@@ -0,0 +1,25 @@
import sys
import json

FILE_OUT2D = sys.argv[1]   # input: comma-separated colocation dump
C2 = sys.argv[2]           # position of the second component within the row
FILE_OUT = sys.argv[3]     # output: json keyed by colocation id

data_out = {}
with open(FILE_OUT2D, 'r') as fp:
    for line in fp:
        cells = line.split(", ")
        lemma1 = cells[1]
        lemma2 = cells[1 + int(C2) * 5]   # components apparently occupy 5 cells each
        rep = cells[-7]                   # joint representative form
        freq = cells[-6]                  # frequency
        fd = cells[-1]                    # number of distinct forms
        cid = cells[-8]                   # colocation id
        data_out[cid] = [lemma1, lemma2, rep, freq, fd, cid]

with open(FILE_OUT, 'w') as fp:
    json.dump(data_out, fp)
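The column indices above encode a fixed positional layout of the input file. A hypothetical row that satisfies that layout (with C2 = 1), purely for illustration:

# Hypothetical 15-cell input row; the field positions, not the values, matter.
cells = ["s1", "novi", "x", "x", "x", "x", "leto", "17", "novo leto", "412",
         "x", "x", "x", "x", "3"]
line = ", ".join(cells)
# cells[1]          -> lemma1 ("novi")
# cells[1 + 1 * 5]  -> lemma2 ("leto")
# cells[-8]         -> colocation id ("17")
# cells[-7]         -> representative form ("novo leto")
# cells[-6]         -> frequency ("412")
# cells[-1]         -> distinct forms ("3")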

issue1000/step1.py Normal file (+43)

@@ -0,0 +1,43 @@
import sqlite3
import sys

STRUCTURE_ID = '1'

con = sqlite3.connect(sys.argv[1])
cur = con.cursor()
data_out = {}
cur.execute("""SELECT Matches.match_id, Matches.word_id, Colocations.colocation_id
               FROM Matches, Colocations, ColocationMatches
               WHERE Matches.match_id = ColocationMatches.mid_match_id
                 AND Colocations.colocation_id = ColocationMatches.mid_colocation_id
                 AND Colocations.structure_id = ?""", (STRUCTURE_ID, ))

prev_mid = None
idx = 0
while True:
    row = cur.fetchone()
    if row is None:
        break
    mid, wid, cid = row

    # keep only the first word of every match: further rows sharing the
    # same match_id are skipped
    if mid == prev_mid:
        continue
    elif cid not in data_out:
        data_out[cid] = []

    wid_int = int(wid[2:9])   # numeric part of the word id = id of the source file
    data_out[cid].append(wid_int)
    prev_mid = mid

    idx += 1
    if idx % 10000 == 0:
        print("\r{}".format(idx), end="", flush=True, file=sys.stderr)

print("", file=sys.stderr)
for cid, wids in data_out.items():
    print(cid, *wids)
con.close()
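Note the slice wid[2:9]: it strips a two-character prefix and keeps the digits identifying the source file, the same slice step2.py applies to filename.stem. That shared key is what lets step3.py look each word id up in the XML info. A sketch, assuming a hypothetical id format:

# Hypothetical id format: two-letter prefix followed by a seven-digit file number.
word_id = "GF0012345.42"   # a token id pointing into file GF0012345.xml
file_stem = "GF0012345"    # filename.stem as seen in step2.py
assert int(word_id[2:9]) == int(file_stem[2:9]) == 12345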

issue1000/step2.py Normal file (+60)

@@ -0,0 +1,60 @@
import sys
import re
import pathlib
import mmap
import json

FOLDER_XMLS = sys.argv[1]   # folder with the corpus XML files
FILE_OUT = sys.argv[2]      # output json with per-file date/type/word count

# corpus taxonomy codes -> text type labels
TYPES = {
    "SSJ.I": "internet",
    "SSJ.T.D": "drugo",
    "SSJ.T.P.C": "casopisi",
    "SSJ.T.P.R": "revije",
    "SSJ.T.K.S": "stvarno",
    "SSJ.T.K.L": "leposlovje",
    "SSJ.T.K.N": "stvarno",
}

xml_data = {}
searcher_date = re.compile(b"<date>([^<]+)</date>")
searcher_type = re.compile(b'<catRef target="ssj:([^"]+)')

idx = 0
N = 38411   # expected number of files, used only for the progress display
for filename in pathlib.Path(FOLDER_XMLS).glob("**/*.xml"):
    idx += 1
    print("\r{}/{}: {}".format(idx, N, filename.stem), end="")
    with open(str(filename), "rb") as fp:
        data = mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ)
        match1_iter = searcher_date.finditer(data)
        next(match1_iter)            # skip the first <date>, take the second one
        match1 = next(match1_iter)
        match2 = searcher_type.search(data)

        key = int(filename.stem[2:9])     # numeric part of the file name
        date = int(match1.group(1)[:4])   # publication year
        typ = TYPES[match2.group(1).decode('ascii')]

        # count word tokens by scanning for "<w a"; find() returns -1
        # when there are no more occurrences
        words = 0
        fidx = data.find(b"<w a")
        while fidx >= 0:
            words += 1
            fidx = data.find(b"<w a", fidx + 1)

        xml_data[key] = {
            "date": date,
            "type": typ,
            "words": words
        }
        data.close()

with open(FILE_OUT, 'w') as fp:
    json.dump(xml_data, fp, indent=2)
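step2.py never parses the XML; it scans the raw bytes with two regexes, skipping the first <date> element (presumably metadata rather than the publication date) and taking the second. A synthetic fragment showing what the searchers expect, purely illustrative:

import re

# Synthetic header bytes, matching the patterns used in step2.py.
header = (b"<date>2021-03-01</date>"           # first <date>: skipped
          b"<date>2006-11-15</date>"           # second <date>: publication date
          b'<catRef target="ssj:SSJ.T.P.C"/>')
searcher_date = re.compile(b"<date>([^<]+)</date>")
searcher_type = re.compile(b'<catRef target="ssj:([^"]+)')
dates = searcher_date.finditer(header)
next(dates)                                    # skip the first match
year = int(next(dates).group(1)[:4])           # -> 2006
typ = searcher_type.search(header).group(1).decode("ascii")   # -> "SSJ.T.P.C", i.e. "casopisi"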

issue1000/step3.py Normal file (+61)

@@ -0,0 +1,61 @@
import sys
import json
from collections import Counter, defaultdict

FILE_STEP1 = sys.argv[1]   # output of step1.py: "cid wid1 wid2 ..." lines
XMLS_INFO = sys.argv[2]    # output of step2.py
CIDS_INFO = sys.argv[3]    # output of step0.py
FREQ_MIN = int(sys.argv[4])
FILE_STEP3 = sys.argv[5]   # final csv output

with open(XMLS_INFO, "r") as fp:
    xml_data = json.load(fp)
with open(CIDS_INFO, "r") as fp:
    cids_data = json.load(fp)

years = [file_data['date'] for file_data in xml_data.values()]
min_year = min(years)
max_year = max(years)

# total number of corpus words per year, for relative frequencies
year_sums = defaultdict(int)
for file_data in xml_data.values():
    year_sums[file_data['date']] += file_data['words']

all_types = [file_data['type'] for file_data in xml_data.values()]
all_types = list(set(all_types))   # note: currently unused below

data_out = {}
with open(FILE_STEP1, "r") as fp:
    next(fp)   # skip header
    for line in fp:
        cid, *wids = line.split()
        data_out[int(cid)] = (
            Counter(xml_data[wid]['date'] for wid in wids),
            Counter(xml_data[wid]['type'] for wid in wids)
        )

with open(FILE_STEP3, 'w') as fp:
    print("colocation_id, lemma1, lemma2, Joint-representative-form, frequency, distinct_forms, ", end="", file=fp)
    print(", ".join(str(x) for x in range(min_year, max_year + 1)), end=", ", file=fp)
    print(", ".join(str(x) + "pm" for x in range(min_year, max_year + 1)), file=fp)

    for cid in sorted(data_out.keys()):
        ctr_year, ctr_type = data_out[cid]
        # frequency < 2 is already filtered out in the cids data!
        if str(cid) not in cids_data:
            continue
        lemma1, lemma2, rep, freq, fd, _ = cids_data[str(cid)]
        freq, fd = int(freq), int(fd)
        if freq < FREQ_MIN:
            continue
        print("{}, {}, {}, {}, {}, {}, ".format(cid, lemma1, lemma2, rep, freq, fd), end="", file=fp)
        print(", ".join(str(ctr_year[y]) for y in range(min_year, max_year + 1)), end=", ", file=fp)
        print(", ".join("{:2.3f}".format(ctr_year[y] / year_sums[y] * 1e6) for y in range(min_year, max_year + 1)), file=fp)

View File

@@ -11,16 +11,22 @@ class StructureMatch:
     @staticmethod
     def from_db(db, colocation_id, structure):
         result = StructureMatch(colocation_id, structure)
-        for match_id in db.execute("SELECT mid_match_id FROM ColocationMatches WHERE mid_colocation_id=?", (colocation_id,)):
-            to_add = {}
-            for component_id, word_lemma, word_text, word_msd, word_id in db.execute("""
-                SELECT component_id, word_lemma, word_text, word_msd, word_id
-                FROM Matches WHERE match_id=?""", match_id):
-                to_add[str(component_id)] = Word(word_lemma, word_msd, word_id, word_text, False)
-            result.matches.append(to_add)
+        prev_match_id = None
+        stmt = """SELECT match_id, component_id, word_lemma, word_text, word_msd, word_id
+                  FROM ColocationMatches
+                  JOIN Matches ON Matches.match_id=ColocationMatches.mid_match_id
+                  WHERE mid_colocation_id=?
+                  ORDER BY match_id"""
+        for row in db.execute(stmt, (colocation_id,)):
+            match_id, component_id, word_lemma, word_text, word_msd, word_id = row
+            if match_id != prev_match_id:
+                result.matches.append({})
+                prev_match_id = match_id
+            result.matches[-1][str(component_id)] = Word(word_lemma, word_msd, word_id, word_text, False)
         for component_id, text in db.execute("SELECT component_id, text FROM Representations WHERE colocation_id=?", (colocation_id,)):
             result.representations[str(component_id)] = text
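The rewrite above replaces one query per match (an N+1 pattern) with a single JOIN ordered by match_id; match boundaries are then recovered by watching the key change. The grouping idiom in isolation (a generic sketch, not code from this PR):

def group_by_first(rows):
    # rows must be sorted by key; a new group starts whenever the key changes
    groups, prev_key = [], None
    for key, value in rows:
        if key != prev_key:
            groups.append([])
            prev_key = key
        groups[-1].append(value)
    return groups

assert group_by_first([(1, "a"), (1, "b"), (2, "c")]) == [["a", "b"], ["c"]]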

View File

@@ -9,6 +9,7 @@ class MatchStore:
     def __init__(self, args, db):
         self.db = db
         self.dispersions = {}
+        self.min_freq = args.min_freq
         self.db.init("""CREATE TABLE Colocations (
@@ -96,19 +97,31 @@ class MatchStore:
             print("Representation step already done, skipping")
             return
 
+        num_inserts = 1000
+        inserts = []
         structures_dict = {s.id: s for s in structures}
         num_representations = int(self.db.execute("SELECT Count(*) FROM Colocations").fetchone()[0])
         for cid, sid in progress(self.db.execute("SELECT colocation_id, structure_id FROM Colocations"), "representations", total=num_representations):
             structure = structures_dict[sid]
             match = StructureMatch.from_db(self.db, cid, structure)
             RepresentationAssigner.set_representations(match, word_renderer)
-            for component_id, text in match.representations.items():
-                self.db.execute("""
-                    INSERT INTO Representations (colocation_id, component_id, text)
-                    VALUES (?,?,?)""", (match.match_id, component_id, text))
+            inserts.append(match)
+
+            if len(inserts) > num_inserts:
+                for match in inserts:
+                    for component_id, text in match.representations.items():
+                        self.db.execute("""
+                            INSERT INTO Representations (colocation_id, component_id, text)
+                            VALUES (?,?,?)""", (match.match_id, component_id, text))
+                inserts = []
 
         self.db.step_is_done(step_name)
 
+    def has_colocation_id_enough_frequency(self, colocation_id):
+        matches = self.db.execute("SELECT MIN(MAX(COUNT(*), ?), ?) FROM ColocationMatches WHERE mid_colocation_id=?", (self.min_freq - 1, self.min_freq, colocation_id)).fetchone()[0]
+        return matches >= self.min_freq
+
     def determine_colocation_dispersions(self):
         step_name = 'dispersions'
         if self.db.is_step_done(step_name):
@@ -116,7 +129,10 @@ class MatchStore:
             return
 
         dispersions = defaultdict(int)
-        for structure_id, word_tups_str in progress(self.db.execute("SELECT structure_id, key FROM Colocations"), "dispersion"):
+        for colocation_id, structure_id, word_tups_str in progress(self.db.execute("SELECT colocation_id, structure_id, key FROM Colocations"), "dispersion"):
+            if not self.has_colocation_id_enough_frequency(colocation_id):
+                continue
             word_tups = literal_eval(word_tups_str)
             for component_id, lemma in word_tups:
                 dispersions[(str(structure_id), component_id, lemma)] += 1
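The new has_colocation_id_enough_frequency query clamps COUNT(*) into the range [min_freq - 1, min_freq], so the fetched value only distinguishes "below threshold" from "at or above threshold". The same logic in plain Python:

def has_enough_frequency(match_count, min_freq):
    # mirrors SELECT MIN(MAX(COUNT(*), min_freq - 1), min_freq) ... >= min_freq
    clamped = min(max(match_count, min_freq - 1), min_freq)
    return clamped >= min_freq

assert has_enough_frequency(5, 3) is True
assert has_enough_frequency(2, 3) is False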

View File

@@ -77,7 +77,7 @@ class Writer:
         rows = []
         components = structure.components
-        for match in colocation_ids.get_matches_for(structure):
+        for match in progress(colocation_ids.get_matches_for(structure), "Writing matches: {}".format(structure.id)):
             if len(match) < self.min_frequency:
                 continue