Merge branch 'sql-join-test' of ozbolt/luscenje_struktur into master

OK
2020-03-02 19:12:37 +00:00 · 2020-03-02 19:12:37 +00:00 · ec113f9cd2
commit ec113f9cd2
parent 4124036474 9e8cd2a2ec
8 changed files with 229 additions and 13 deletions
--- a/issue1000/README.md
+++ b/issue1000/README.md
@ -0,0 +1,5 @@
 # issue 1000
 These four scripts were created as part of issue number 1000. They will one day be integrated into the application itself, but for now they live here.
 If you have any questions, contact me. In any case, the scripts are really short, so you should be able to understand them fairly quickly.
--- a/issue1000/step0.py
+++ b/issue1000/step0.py
@ -0,0 +1,25 @@
 import sys
 import json
 def parse_out2d_lines(lines, c2):
    """Collect per-collocation fields from the lines of an "out2d" file.

    Each line is split on ", ".  The first lemma is cell 1 and the second
    lemma is cell ``1 + c2 * 5``; the collocation id (-8), representative
    form (-7), frequency (-6) and ``fd`` (last cell) are addressed from the
    end of the line.

    :param lines: iterable of text lines (an open file works).
    :param c2: index of the second component, as an int or numeric string.
    :return: dict mapping the collocation id cell to
        ``[lemma1, lemma2, rep, freq, fd, cid]`` (all strings).
    """
    lemma2_idx = 1 + int(c2) * 5
    data_out = {}
    for line in lines:
        # Strip the line terminator first, otherwise it ends up inside the
        # last cell (fd) and is written to the JSON file.
        line = line.rstrip("\r\n")
        if not line:
            # A blank line has no cells to index.
            continue
        cells = line.split(", ")
        cid = cells[-8]
        data_out[cid] = [
            cells[1],
            cells[lemma2_idx],
            cells[-7],
            cells[-6],
            cells[-1],
            cid,
        ]
    return data_out


 def main():
    """Usage: step0.py FILE_OUT2D C2 FILE_OUT -- write the parsed data as JSON."""
    file_out2d = sys.argv[1]
    c2 = sys.argv[2]
    file_out = sys.argv[3]
    with open(file_out2d, 'r') as fp:
        data_out = parse_out2d_lines(fp, c2)
    with open(file_out, 'w') as fp:
        json.dump(data_out, fp)


 if __name__ == "__main__":
    main()
--- a/issue1000/step1.py
+++ b/issue1000/step1.py
@ -0,0 +1,43 @@
 import sqlite3
 import sys
 STRUCTURE_ID = '1'


 def collect_file_ids(con, structure_id=STRUCTURE_ID):
    """Map every colocation of one structure to the file numbers of its matches.

    One number is recorded per match: characters 2..8 of the word id of the
    first row seen for that match, converted to int.

    :param con: open sqlite3 connection with the Matches, Colocations and
        ColocationMatches tables.
    :param structure_id: value compared against Colocations.structure_id.
    :return: dict ``colocation_id -> [file number, ...]``.
    """
    cur = con.cursor()
    # ORDER BY keeps all rows of one match adjacent; the "same id as the
    # previous row" test below relies on that, and SQL gives no ordering
    # guarantee without it.
    cur.execute("""SELECT Matches.match_id, Matches.word_id, Colocations.colocation_id
 FROM Matches, Colocations, ColocationMatches
 WHERE Matches.match_id = ColocationMatches.mid_match_id
 AND Colocations.colocation_id = ColocationMatches.mid_colocation_id
 AND Colocations.structure_id = ?
 ORDER BY Matches.match_id""", (structure_id, ))

    data_out = {}
    prev_mid = None
    idx = 0
    for mid, wid, cid in cur:
        if mid == prev_mid:
            # Further words of a match already counted.
            continue
        prev_mid = mid
        data_out.setdefault(cid, []).append(int(wid[2:9]))

        idx += 1
        if idx % 10000 == 0:
            # Progress goes to stderr so stdout stays pure data.
            print("\r{}".format(idx), end="", flush=True, file=sys.stderr)
    print("", file=sys.stderr)
    return data_out


 def main():
    """Usage: step1.py DATABASE -- print "<colocation_id> <file numbers...>" lines."""
    con = sqlite3.connect(sys.argv[1])
    try:
        data_out = collect_file_ids(con)
    finally:
        # Release the database even when the query or the parsing fails.
        con.close()
    for cid, wids in data_out.items():
        print(cid, *wids)


 if __name__ == "__main__":
    main()
--- a/issue1000/step2.py
+++ b/issue1000/step2.py
@ -0,0 +1,60 @@
 import sys
 import re
 import pathlib
 import mmap
 from datetime import datetime
 import json
 # Maps the taxonomy code found in <catRef target="ssj:..."> to a text type.
 TYPES = {
    "SSJ.I": "internet",
    "SSJ.T.D": "drugo",
    "SSJ.T.P.C": "casopisi",
    "SSJ.T.P.R": "revije",
    "SSJ.T.K.S": "stvarno",
    "SSJ.T.K.L": "leposlovje",
    "SSJ.T.K.N": "stvarno",
 }

 # Raw bytes literals: the old non-raw pattern contained the invalid escape "\=".
 searcher_date = re.compile(rb"<date>([^<]+)</date>")
 searcher_type = re.compile(rb'<catRef target="ssj:([^"]+)')
 WORD_TAG_PREFIX = b"<w a"


 def count_word_tags(data):
    """Return how many times ``<w a`` occurs in *data* (bytes or mmap).

    The counter is advanced only after a successful find, so a document
    without any word tag yields 0.
    """
    count = 0
    pos = data.find(WORD_TAG_PREFIX)
    while pos >= 0:
        count += 1
        pos = data.find(WORD_TAG_PREFIX, pos + len(WORD_TAG_PREFIX))
    return count


 def extract_file_info(data):
    """Extract year, text type and word count from one XML document.

    :param data: whole document as a bytes-like object (bytes or mmap).
    :return: ``{"date": <int year>, "type": <str>, "words": <int>}``.
    :raises ValueError: if the document has fewer than two <date> elements
        or no ``ssj:`` catRef.
    :raises KeyError: if the catRef code is not listed in TYPES.
    """
    # The year is read from the SECOND <date> element; the first is skipped.
    # Two plain searches are used so that no iterator over `data` is left
    # alive when the caller closes the mmap.
    first_date = searcher_date.search(data)
    second_date = first_date and searcher_date.search(data, first_date.end())
    if second_date is None:
        raise ValueError("expected at least two <date> elements")
    category = searcher_type.search(data)
    if category is None:
        raise ValueError("no <catRef target=\"ssj:...\"> element found")

    return {
        "date": int(second_date.group(1)[:4]),
        "type": TYPES[category.group(1).decode('ascii')],
        "words": count_word_tags(data),
    }


 def main():
    """Usage: step2.py FOLDER_XMLS FILE_OUT -- write per-file metadata as JSON."""
    folder_xmls = sys.argv[1]
    file_out = sys.argv[2]

    # Listing the files first gives a real total for the progress display.
    filenames = list(pathlib.Path(folder_xmls).glob("**/*.xml"))
    total = len(filenames)

    xml_data = {}
    for idx, filename in enumerate(filenames, start=1):
        print("\r{}/{}: {}".format(idx, total, filename.stem), end="")
        with open(str(filename), "rb") as fp:
            # The context manager unmaps the file even when parsing fails.
            with mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ) as data:
                info = extract_file_info(data)
        # Characters 2..8 of the file stem are the numeric file id.
        xml_data[int(filename.stem[2:9])] = info

    with open(file_out, 'w') as fp:
        json.dump(xml_data, fp, indent=2)


 if __name__ == "__main__":
    main()
--- a/issue1000/step3.py
+++ b/issue1000/step3.py
@ -0,0 +1,61 @@
 import sys
 import json
 from collections import Counter, defaultdict
 def per_million(count, total_words):
    """Return *count* scaled to occurrences per million words.

    A year without any words in the corpus yields 0.0 instead of a
    ZeroDivisionError.
    """
    if not total_words:
        return 0.0
    return count / total_words * 1e6


 def format_header(year_range):
    """Return the header line of the output table (without newline)."""
    fixed = "collocation_id, lemma1, lemma2, Joint-representative-form, frequency, distinct_forms, "
    absolute = ", ".join(str(y) for y in year_range)
    relative = ", ".join(str(y) + "pm" for y in year_range)
    return fixed + absolute + ", " + relative


 def format_row(cid, lemma1, lemma2, rep, freq, fd, ctr_year, year_sums, year_range):
    """Return one output line (without newline) for a collocation.

    :param ctr_year: Counter mapping year -> number of matches in that year.
    :param year_sums: mapping year -> total number of words in that year.
    :param year_range: the years to emit, in column order.
    """
    fixed = "{}, {}, {}, {}, {}, {}, ".format(cid, lemma1, lemma2, rep, freq, fd)
    absolute = ", ".join(str(ctr_year[y]) for y in year_range)
    relative = ", ".join(
        "{:2.3f}".format(per_million(ctr_year[y], year_sums.get(y, 0)))
        for y in year_range
    )
    return fixed + absolute + ", " + relative


 def main():
    """Usage: step3.py FILE_STEP1 XMLS_INFO CIDS_INFO FREQ_MIN FILE_STEP3."""
    file_step1 = sys.argv[1]
    xmls_info = sys.argv[2]
    cids_info = sys.argv[3]
    freq_min = int(sys.argv[4])
    file_step3 = sys.argv[5]

    with open(xmls_info, "r") as fp:
        xml_data = json.load(fp)
    with open(cids_info, "r") as fp:
        cids_data = json.load(fp)

    years = [file_data['date'] for file_data in xml_data.values()]
    year_range = range(min(years), max(years) + 1)

    # Total number of words per year, the denominator of the "pm" columns.
    year_sums = defaultdict(int)
    for file_data in xml_data.values():
        year_sums[file_data['date']] += file_data['words']

    year_counts = {}
    with open(file_step1, "r") as fp:
        # The first line is treated as a header and dropped.
        # NOTE(review): confirm the step-1 output really starts with one.
        next(fp, None)
        for line in fp:
            cid, *wids = line.split()
            year_counts[int(cid)] = Counter(xml_data[wid]['date'] for wid in wids)

    with open(file_step3, 'w') as fp:
        print(format_header(year_range), file=fp)
        for cid in sorted(year_counts):
            # frequency < 2 is filtered in cids data!
            if str(cid) not in cids_data:
                continue
            lemma1, lemma2, rep, freq, fd, _ = cids_data[str(cid)]
            freq, fd = int(freq), int(fd)
            if freq < freq_min:
                continue
            print(
                format_row(cid, lemma1, lemma2, rep, freq, fd,
                           year_counts[cid], year_sums, year_range),
                file=fp,
            )


 if __name__ == "__main__":
    main()
--- a/src/match.py
+++ b/src/match.py
@ -11,16 +11,22 @@ class StructureMatch:
    @staticmethod
    def from_db(db, colocation_id, structure):
        result = StructureMatch(colocation_id, structure)
-        for match_id in db.execute("SELECT mid_match_id FROM ColocationMatches WHERE mid_colocation_id=?", (colocation_id,)):
+        prev_match_id = None
            to_add = {}
-            for component_id, word_lemma, word_text, word_msd, word_id in db.execute("""
+        stmt = """SELECT match_id, component_id, word_lemma, word_text, word_msd, word_id
-                SELECT component_id, word_lemma, word_text, word_msd, word_id 
+                  FROM ColocationMatches 
-                FROM Matches WHERE match_id=?""", match_id):
+                  JOIN Matches ON Matches.match_id=ColocationMatches.mid_match_id 
                  WHERE mid_colocation_id=? 
                  ORDER BY match_id"""
-                to_add[str(component_id)] = Word(word_lemma, word_msd, word_id, word_text, False)
+        for row in db.execute(stmt, (colocation_id,)): 
            match_id, component_id, word_lemma, word_text, word_msd, word_id = row
-            result.matches.append(to_add)
+            if match_id != prev_match_id:
                result.matches.append({})
                prev_match_id = match_id
            result.matches[-1][str(component_id)] = Word(word_lemma, word_msd, word_id, word_text, False)
        for component_id, text in db.execute("SELECT component_id, text FROM Representations WHERE colocation_id=?", (colocation_id,)):
            result.representations[str(component_id)] = text
--- a/src/match_store.py
+++ b/src/match_store.py
@ -9,6 +9,7 @@ class MatchStore:
    def __init__(self, args, db):
        self.db = db
        self.dispersions = {}
        self.min_freq = args.min_freq
        self.db.init("""CREATE TABLE Colocations (
@ -96,19 +97,31 @@ class MatchStore:
            print("Representation step already done, skipping")
            return
        num_inserts = 1000
        inserts = []
        structures_dict = {s.id: s for s in structures}
        num_representations = int(self.db.execute("SELECT Count(*) FROM Colocations").fetchone()[0])
        for cid, sid in progress(self.db.execute("SELECT colocation_id, structure_id FROM Colocations"), "representations", total=num_representations):
            structure = structures_dict[sid]
            match = StructureMatch.from_db(self.db, cid, structure)
            RepresentationAssigner.set_representations(match, word_renderer)
-            for component_id, text in match.representations.items():
+
-                self.db.execute("""
+            inserts.append(match)
-                    INSERT INTO Representations (colocation_id, component_id, text) 
+            if len(inserts) > num_inserts:
-                    VALUES (?,?,?)""", (match.match_id, component_id, text))
+                for match in inserts:
                    for component_id, text in match.representations.items():
                        self.db.execute("""
                            INSERT INTO Representations (colocation_id, component_id, text) 
                            VALUES (?,?,?)""", (match.match_id, component_id, text))
                inserts = []
        self.db.step_is_done(step_name)
    def has_colocation_id_enough_frequency(self, colocation_id):
        """Tell whether *colocation_id* has at least ``self.min_freq`` matches.

        The query returns the number of ColocationMatches rows for the
        colocation, clamped into ``[min_freq - 1, min_freq]``; the result is
        then compared against ``self.min_freq``.
        """
        query = "SELECT MIN(MAX(COUNT(*), ?), ?) FROM ColocationMatches WHERE mid_colocation_id=?"
        bounds_and_id = (self.min_freq - 1, self.min_freq, colocation_id)
        row = self.db.execute(query, bounds_and_id).fetchone()
        clamped_count = row[0]
        return clamped_count >= self.min_freq
    def determine_colocation_dispersions(self):
        step_name = 'dispersions'
        if self.db.is_step_done(step_name):
@ -116,7 +129,10 @@ class MatchStore:
            return
        dispersions = defaultdict(int)
-        for structure_id, word_tups_str in progress(self.db.execute("SELECT structure_id, key FROM Colocations"), "dispersion"):
+        for colocation_id, structure_id, word_tups_str in progress(self.db.execute("SELECT colocation_id, structure_id, key FROM Colocations"), "dispersion"):
            if not self.has_colocation_id_enough_frequency(colocation_id):
                continue
            word_tups = literal_eval(word_tups_str)
            for component_id, lemma in word_tups:
                dispersions[(str(structure_id), component_id, lemma)] += 1
--- a/src/writer.py
+++ b/src/writer.py
@ -77,7 +77,7 @@ class Writer:
        rows = []
        components = structure.components
-        for match in colocation_ids.get_matches_for(structure):
+        for match in progress(colocation_ids.get_matches_for(structure), "Writing matches: {}".format(structure.id)):
            if len(match) < self.min_frequency:
                continue