Merge branch 'sql-join-test' of ozbolt/luscenje_struktur into master
OK
This commit is contained in:
commit
ec113f9cd2
5
issue1000/README.md
Normal file
5
issue1000/README.md
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
# issue 1000
|
||||||
|
|
||||||
|
These four scripts were created as part of issue number 1000. They will one day be integrated into the application itself, but for now they live here.
|
||||||
|
|
||||||
|
If you have any questions, contact me. In any case, the scripts are really short, so you should be able to understand them fairly quickly.
|
25
issue1000/step0.py
Normal file
25
issue1000/step0.py
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
|
||||||
|
def collect_collocations(file_in, c2, file_out):
    """Reduce the ", "-separated collocation table to a JSON lookup by id.

    Every non-empty line of ``file_in`` is split on ``", "`` and turned into
    ``[lemma1, lemma2, rep, freq, fd, cid]``; the entries are keyed by ``cid``
    (a later line with the same id replaces an earlier one) and written to
    ``file_out`` as JSON.

    Args:
        file_in: path of the input table, one collocation per line.
        c2: integer (or numeric string) selecting the second lemma, which is
            read from cell ``1 + c2 * 5``.
            NOTE(review): presumably every component occupies five cells --
            confirm against whatever produces the table.
        file_out: path of the JSON file to write.

    Returns:
        The dictionary that was written, keyed by the collocation id string.

    Raises:
        IndexError: if a non-empty line has too few cells.
    """
    lemma2_idx = 1 + int(c2) * 5
    data_out = {}

    with open(file_in, 'r') as fp:
        for line in fp:
            # Drop the line ending first; otherwise the last cell (fd) would
            # be stored with a trailing "\n".
            line = line.rstrip("\r\n")
            if not line:
                continue

            cells = line.split(", ")

            # Trailing columns are addressed from the end of the line because
            # the number of leading component cells varies.
            cid = cells[-8]
            data_out[cid] = [
                cells[1],           # lemma1
                cells[lemma2_idx],  # lemma2
                cells[-7],          # rep
                cells[-6],          # freq
                cells[-1],          # fd
                cid,
            ]

    with open(file_out, 'w') as fp:
        json.dump(data_out, fp)

    return data_out


def main(argv):
    """Command-line entry point: ``step0.py FILE_OUT2D C2 FILE_OUT``."""
    collect_collocations(argv[1], argv[2], argv[3])


if __name__ == "__main__":
    main(sys.argv)
|
||||||
|
|
43
issue1000/step1.py
Normal file
43
issue1000/step1.py
Normal file
|
@ -0,0 +1,43 @@
|
||||||
|
import sqlite3
|
||||||
|
import sys
|
||||||
|
|
||||||
|
STRUCTURE_ID = '1'

# One row per (match, word) pair of every collocation of the chosen structure.
# ORDER BY keeps all rows of one match adjacent: collect_documents() drops
# duplicates by comparing with the previous match id, and SQL gives no
# ordering guarantee without it.
QUERY = """SELECT Matches.match_id, Matches.word_id, Colocations.colocation_id
    FROM Matches, Colocations, ColocationMatches
    WHERE Matches.match_id = ColocationMatches.mid_match_id
    AND Colocations.colocation_id = ColocationMatches.mid_colocation_id
    AND Colocations.structure_id = ?
    ORDER BY Matches.match_id"""


def collect_documents(rows, report_every=10000):
    """Group one document number per match under its collocation id.

    Args:
        rows: iterable of ``(match_id, word_id, colocation_id)`` tuples in
            which rows of the same match are adjacent.
        report_every: print a progress counter to stderr after this many
            distinct matches.

    Returns:
        dict mapping collocation id to a list with one integer per match.
        The integer is ``int(word_id[2:9])``, the same slice step2.py takes
        from the XML file name.
    """
    data_out = {}
    prev_mid = None
    idx = 0

    for mid, wid, cid in rows:
        # Only the first word of each match contributes an entry.
        if mid == prev_mid:
            continue
        prev_mid = mid

        data_out.setdefault(cid, []).append(int(wid[2:9]))

        idx += 1
        if idx % report_every == 0:
            print("\r{}".format(idx), end="", flush=True, file=sys.stderr)

    return data_out


def main(argv):
    """Command-line entry point: ``step1.py DATABASE``.

    Writes one line per collocation to stdout: the collocation id followed by
    the document numbers of its matches, separated by spaces.
    """
    con = sqlite3.connect(argv[1])
    try:
        cur = con.cursor()
        cur.execute(QUERY, (STRUCTURE_ID, ))
        data_out = collect_documents(cur)
    finally:
        # Release the database even if the query or the parsing fails.
        con.close()

    # Terminate the "\r" progress line on stderr.
    print("", file=sys.stderr)
    for cid, wids in data_out.items():
        print(cid, *wids)


if __name__ == "__main__":
    main(sys.argv)
|
60
issue1000/step2.py
Normal file
60
issue1000/step2.py
Normal file
|
@ -0,0 +1,60 @@
|
||||||
|
import sys
|
||||||
|
import re
|
||||||
|
import pathlib
|
||||||
|
import mmap
|
||||||
|
from datetime import datetime
|
||||||
|
import json
|
||||||
|
|
||||||
|
# Maps the value found after ``ssj:`` in a <catRef target="..."> attribute to
# the text-type label stored in the output.
TYPES = {
    "SSJ.I": "internet",
    "SSJ.T.D": "drugo",
    "SSJ.T.P.C": "casopisi",
    "SSJ.T.P.R": "revije",
    "SSJ.T.K.S": "stvarno",
    "SSJ.T.K.L": "leposlovje",
    "SSJ.T.K.N": "stvarno",
}

# Raw bytes patterns: the files are scanned as bytes, never decoded.
searcher_date = re.compile(rb"<date>([^<]+)</date>")
searcher_type = re.compile(rb'<catRef target="ssj:([^"]+)')

# Byte sequence that is counted as the start of one word element.
WORD_TAG = b"<w a"

# Hard-coded total that is only used in the progress display.
N = 38411


def count_words(data):
    """Return how many times ``WORD_TAG`` occurs in ``data``.

    ``data`` is any object with a bytes-like ``find(sub, start)`` method
    (``bytes`` or ``mmap.mmap``).
    """
    words = 0
    pos = data.find(WORD_TAG)
    # Count only successful finds; the failed final search is not a word.
    while pos >= 0:
        words += 1
        pos = data.find(WORD_TAG, pos + len(WORD_TAG))
    return words


def describe_document(data):
    """Extract year, text type and word count from one XML document.

    Args:
        data: the document content as ``bytes`` or ``mmap.mmap``.

    Returns:
        dict with ``"date"`` (first four characters of the second <date>
        element, as int), ``"type"`` (label from ``TYPES``) and ``"words"``.

    Raises:
        ValueError: if there are fewer than two <date> elements or no
            ``ssj:`` catRef target.
        KeyError: if the catRef target is not listed in ``TYPES``.
    """
    # The first <date> is skipped on purpose and the second one is used.
    # NOTE(review): presumably the first one is not the publication date --
    # confirm against the corpus header layout.
    # search() is called twice instead of finditer() so that no iterator
    # keeps referencing the mapped buffer once this function returns.
    first_date = searcher_date.search(data)
    date_match = None
    if first_date is not None:
        date_match = searcher_date.search(data, first_date.end())
    if date_match is None:
        raise ValueError("expected at least two <date> elements")

    type_match = searcher_type.search(data)
    if type_match is None:
        raise ValueError("no <catRef target=\"ssj:...\"> element found")

    return {
        "date": int(date_match.group(1)[:4]),
        "type": TYPES[type_match.group(1).decode('ascii')],
        "words": count_words(data),
    }


def main(argv):
    """Command-line entry point: ``step2.py FOLDER_XMLS FILE_OUT``.

    Scans every ``*.xml`` file below FOLDER_XMLS and writes a JSON object
    keyed by ``int(filename.stem[2:9])`` to FILE_OUT.
    """
    folder_xmls, file_out = argv[1], argv[2]
    xml_data = {}

    files = pathlib.Path(folder_xmls).glob("**/*.xml")
    for idx, filename in enumerate(files, 1):
        print("\r{}/{}: {}".format(idx, N, filename.stem), end="")

        with open(str(filename), "rb") as fp:
            # Context manager closes the mapping even when parsing fails.
            with mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ) as data:
                xml_data[int(filename.stem[2:9])] = describe_document(data)

    with open(file_out, 'w') as fp:
        json.dump(xml_data, fp, indent=2)


if __name__ == "__main__":
    main(sys.argv)
|
61
issue1000/step3.py
Normal file
61
issue1000/step3.py
Normal file
|
@ -0,0 +1,61 @@
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
from collections import Counter, defaultdict
|
||||||
|
|
||||||
|
def per_million(count, total):
    """Return ``count`` relative to ``total``, scaled to one million.

    A ``total`` of zero (a year without any documents) yields ``0.0`` instead
    of raising ZeroDivisionError.
    """
    if not total:
        return 0.0
    return count / total * 1e6


def format_row(cid, info, ctr_year, year_sums, years):
    """Build one output line (without the line ending) for a collocation.

    Args:
        cid: collocation id.
        info: ``(lemma1, lemma2, rep, freq, fd)`` for the collocation.
        ctr_year: ``Counter`` mapping year to the number of matches.
        year_sums: mapping of year to the total word count of that year.
        years: iterable of the years to emit, in column order.

    Returns:
        The six descriptive cells, then the absolute count per year, then the
        per-million count per year, all joined with ``", "``.
    """
    lemma1, lemma2, rep, freq, fd = info
    head = "{}, {}, {}, {}, {}, {}, ".format(cid, lemma1, lemma2, rep, freq, fd)
    counts = ", ".join(str(ctr_year[y]) for y in years)
    # .get() avoids inserting missing years into a defaultdict.
    rates = ", ".join(
        "{:2.3f}".format(per_million(ctr_year[y], year_sums.get(y, 0)))
        for y in years
    )
    return head + counts + ", " + rates


def build_table(file_step1, xmls_info, cids_info, freq_min, file_step3):
    """Combine the outputs of the previous steps into the per-year table.

    Args:
        file_step1: text file with one line per collocation: the id followed
            by whitespace-separated document numbers; its first line is
            skipped as a header.
        xmls_info: JSON file mapping document number to a dict with at least
            ``"date"`` and ``"words"``.
        cids_info: JSON file mapping collocation id to
            ``[lemma1, lemma2, rep, freq, fd, cid]``.
        freq_min: collocations whose ``freq`` is lower are left out.
        file_step3: path of the table to write.

    Raises:
        ValueError: if ``xmls_info`` contains no documents.
        KeyError: if a document number is missing from ``xmls_info``.
    """
    with open(xmls_info, "r") as fp:
        xml_data = json.load(fp)

    with open(cids_info, "r") as fp:
        cids_data = json.load(fp)

    # Total number of words per year, used to normalise the match counts.
    year_sums = defaultdict(int)
    for file_data in xml_data.values():
        year_sums[file_data['date']] += file_data['words']

    years = range(min(year_sums), max(year_sums) + 1)

    data_out = {}
    with open(file_step1, "r") as fp:
        # NOTE(review): step1.py as written prints no header line to stdout;
        # confirm how this file is produced, otherwise the first collocation
        # is dropped here.
        next(fp, None)  # skip header (tolerates an empty file)
        for line in fp:
            fields = line.split()
            if not fields:
                continue
            cid, *wids = fields
            data_out[int(cid)] = Counter(xml_data[wid]['date'] for wid in wids)

    with open(file_step3, 'w') as fp:
        print("collocation_id, lemma1, lemma2, Joint-representative-form, frequency, distinct_forms, ", end="", file=fp)
        print(", ".join(str(x) for x in years), end=", ", file=fp)
        print(", ".join(str(x) + "pm" for x in years), file=fp)

        for cid in sorted(data_out):
            # frequency < 2 is filtered in cids data!
            entry = cids_data.get(str(cid))
            if entry is None:
                continue

            lemma1, lemma2, rep, freq, fd, _ = entry
            freq, fd = int(freq), int(fd)
            if freq < freq_min:
                continue

            row = format_row(cid, (lemma1, lemma2, rep, freq, fd),
                             data_out[cid], year_sums, years)
            print(row, file=fp)


def main(argv):
    """Entry point: ``step3.py FILE_STEP1 XMLS_INFO CIDS_INFO FREQ_MIN FILE_STEP3``."""
    build_table(argv[1], argv[2], argv[3], int(argv[4]), argv[5])


if __name__ == "__main__":
    main(sys.argv)
|
||||||
|
|
20
src/match.py
20
src/match.py
|
@ -11,16 +11,22 @@ class StructureMatch:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def from_db(db, colocation_id, structure):
|
def from_db(db, colocation_id, structure):
|
||||||
result = StructureMatch(colocation_id, structure)
|
result = StructureMatch(colocation_id, structure)
|
||||||
for match_id in db.execute("SELECT mid_match_id FROM ColocationMatches WHERE mid_colocation_id=?", (colocation_id,)):
|
prev_match_id = None
|
||||||
to_add = {}
|
|
||||||
|
|
||||||
for component_id, word_lemma, word_text, word_msd, word_id in db.execute("""
|
stmt = """SELECT match_id, component_id, word_lemma, word_text, word_msd, word_id
|
||||||
SELECT component_id, word_lemma, word_text, word_msd, word_id
|
FROM ColocationMatches
|
||||||
FROM Matches WHERE match_id=?""", match_id):
|
JOIN Matches ON Matches.match_id=ColocationMatches.mid_match_id
|
||||||
|
WHERE mid_colocation_id=?
|
||||||
|
ORDER BY match_id"""
|
||||||
|
|
||||||
to_add[str(component_id)] = Word(word_lemma, word_msd, word_id, word_text, False)
|
for row in db.execute(stmt, (colocation_id,)):
|
||||||
|
match_id, component_id, word_lemma, word_text, word_msd, word_id = row
|
||||||
|
|
||||||
result.matches.append(to_add)
|
if match_id != prev_match_id:
|
||||||
|
result.matches.append({})
|
||||||
|
prev_match_id = match_id
|
||||||
|
|
||||||
|
result.matches[-1][str(component_id)] = Word(word_lemma, word_msd, word_id, word_text, False)
|
||||||
|
|
||||||
for component_id, text in db.execute("SELECT component_id, text FROM Representations WHERE colocation_id=?", (colocation_id,)):
|
for component_id, text in db.execute("SELECT component_id, text FROM Representations WHERE colocation_id=?", (colocation_id,)):
|
||||||
result.representations[str(component_id)] = text
|
result.representations[str(component_id)] = text
|
||||||
|
|
|
@ -9,6 +9,7 @@ class MatchStore:
|
||||||
def __init__(self, args, db):
|
def __init__(self, args, db):
|
||||||
self.db = db
|
self.db = db
|
||||||
self.dispersions = {}
|
self.dispersions = {}
|
||||||
|
self.min_freq = args.min_freq
|
||||||
|
|
||||||
|
|
||||||
self.db.init("""CREATE TABLE Colocations (
|
self.db.init("""CREATE TABLE Colocations (
|
||||||
|
@ -96,19 +97,31 @@ class MatchStore:
|
||||||
print("Representation step already done, skipping")
|
print("Representation step already done, skipping")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
num_inserts = 1000
|
||||||
|
inserts = []
|
||||||
|
|
||||||
structures_dict = {s.id: s for s in structures}
|
structures_dict = {s.id: s for s in structures}
|
||||||
num_representations = int(self.db.execute("SELECT Count(*) FROM Colocations").fetchone()[0])
|
num_representations = int(self.db.execute("SELECT Count(*) FROM Colocations").fetchone()[0])
|
||||||
for cid, sid in progress(self.db.execute("SELECT colocation_id, structure_id FROM Colocations"), "representations", total=num_representations):
|
for cid, sid in progress(self.db.execute("SELECT colocation_id, structure_id FROM Colocations"), "representations", total=num_representations):
|
||||||
structure = structures_dict[sid]
|
structure = structures_dict[sid]
|
||||||
match = StructureMatch.from_db(self.db, cid, structure)
|
match = StructureMatch.from_db(self.db, cid, structure)
|
||||||
RepresentationAssigner.set_representations(match, word_renderer)
|
RepresentationAssigner.set_representations(match, word_renderer)
|
||||||
|
|
||||||
|
inserts.append(match)
|
||||||
|
if len(inserts) > num_inserts:
|
||||||
|
for match in inserts:
|
||||||
for component_id, text in match.representations.items():
|
for component_id, text in match.representations.items():
|
||||||
self.db.execute("""
|
self.db.execute("""
|
||||||
INSERT INTO Representations (colocation_id, component_id, text)
|
INSERT INTO Representations (colocation_id, component_id, text)
|
||||||
VALUES (?,?,?)""", (match.match_id, component_id, text))
|
VALUES (?,?,?)""", (match.match_id, component_id, text))
|
||||||
|
inserts = []
|
||||||
|
|
||||||
self.db.step_is_done(step_name)
|
self.db.step_is_done(step_name)
|
||||||
|
|
||||||
|
def has_colocation_id_enough_frequency(self, colocation_id):
    """Return True when the collocation has at least ``self.min_freq`` matches.

    Args:
        colocation_id: value compared with ``ColocationMatches.mid_colocation_id``.

    Returns:
        bool: whether the number of matching rows reaches ``self.min_freq``.
    """
    # Only "at least min_freq" matters, so the inner query is capped with
    # LIMIT instead of counting every match and clamping the result.
    count = self.db.execute(
        "SELECT COUNT(*) FROM (SELECT 1 FROM ColocationMatches WHERE mid_colocation_id=? LIMIT ?)",
        (colocation_id, self.min_freq)).fetchone()[0]
    return count >= self.min_freq
|
||||||
|
|
||||||
def determine_colocation_dispersions(self):
|
def determine_colocation_dispersions(self):
|
||||||
step_name = 'dispersions'
|
step_name = 'dispersions'
|
||||||
if self.db.is_step_done(step_name):
|
if self.db.is_step_done(step_name):
|
||||||
|
@ -116,7 +129,10 @@ class MatchStore:
|
||||||
return
|
return
|
||||||
|
|
||||||
dispersions = defaultdict(int)
|
dispersions = defaultdict(int)
|
||||||
for structure_id, word_tups_str in progress(self.db.execute("SELECT structure_id, key FROM Colocations"), "dispersion"):
|
for colocation_id, structure_id, word_tups_str in progress(self.db.execute("SELECT colocation_id, structure_id, key FROM Colocations"), "dispersion"):
|
||||||
|
if not self.has_colocation_id_enough_frequency(colocation_id):
|
||||||
|
continue
|
||||||
|
|
||||||
word_tups = literal_eval(word_tups_str)
|
word_tups = literal_eval(word_tups_str)
|
||||||
for component_id, lemma in word_tups:
|
for component_id, lemma in word_tups:
|
||||||
dispersions[(str(structure_id), component_id, lemma)] += 1
|
dispersions[(str(structure_id), component_id, lemma)] += 1
|
||||||
|
|
|
@ -77,7 +77,7 @@ class Writer:
|
||||||
rows = []
|
rows = []
|
||||||
components = structure.components
|
components = structure.components
|
||||||
|
|
||||||
for match in colocation_ids.get_matches_for(structure):
|
for match in progress(colocation_ids.get_matches_for(structure), "Writing matches: {}".format(structure.id)):
|
||||||
if len(match) < self.min_frequency:
|
if len(match) < self.min_frequency:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user