This commit is contained in:
Ozbolt Menegatti 2020-03-02 19:13:19 +01:00
parent 1d4c0238a6
commit 9e8cd2a2ec
5 changed files with 194 additions and 0 deletions

5
issue1000/README.md Normal file
View File

@ -0,0 +1,5 @@
# issue 1000
These four scripts were created as part of issue number 1000. They will one day be integrated into the application itself, but for now they live here.
If you have any questions, contact me. The scripts are really short, so you should be able to understand them fairly quickly.

25
issue1000/step0.py Normal file
View File

@ -0,0 +1,25 @@
import sys
import json
def parse_line(line, c2):
    """Extract the collocation fields from one ", "-separated input line.

    Args:
        line: One raw line of the input file, with or without its line ending.
        c2: Integer used to locate the second lemma at column ``1 + c2 * 5``
            (presumably the index of the second component, with five columns
            per component -- confirm against the producer of the input file).

    Returns:
        ``[lemma1, lemma2, rep, freq, fd, cid]`` as strings, or ``None`` for a
        blank line.  step3.py writes these under the header columns lemma1,
        lemma2, Joint-representative-form, frequency and distinct_forms.

    Raises:
        IndexError: If the line has too few columns.
    """
    # Drop the line ending first; otherwise the last column (fd) would be
    # stored with a trailing "\n".
    line = line.rstrip("\r\n")
    if not line.strip():
        return None
    cells = line.split(", ")
    lemma1 = cells[1]
    lemma2 = cells[1 + c2 * 5]
    # The remaining fields are addressed from the end of the row.
    cid = cells[-8]
    rep = cells[-7]
    freq = cells[-6]
    fd = cells[-1]
    return [lemma1, lemma2, rep, freq, fd, cid]


def main():
    """Convert the input table into a JSON dict keyed by collocation id.

    Command line: ``step0.py FILE_OUT2D C2 FILE_OUT``.
    """
    file_out2d = sys.argv[1]
    c2 = int(sys.argv[2])  # converted once instead of once per row
    file_out = sys.argv[3]

    data_out = {}
    with open(file_out2d, 'r') as fp:
        for line in fp:
            row = parse_line(line, c2)
            if row is None:
                continue
            # The collocation id is the last element of the row.
            data_out[row[-1]] = row

    with open(file_out, 'w') as fp:
        json.dump(data_out, fp)


if __name__ == "__main__":
    main()

43
issue1000/step1.py Normal file
View File

@ -0,0 +1,43 @@
import sqlite3
import sys
# Only collocations with this structure_id are selected; the value is passed to
# the query as a bound parameter.
STRUCTURE_ID = '1'

# The SQLite database path is the only command-line argument.
con = sqlite3.connect(sys.argv[1])
cur = con.cursor()

# colocation_id -> list of integers parsed from word ids, one per match.
data_out = {}

cur.execute("""SELECT Matches.match_id, Matches.word_id, Colocations.colocation_id
FROM Matches, Colocations, ColocationMatches
WHERE Matches.match_id = ColocationMatches.mid_match_id
AND Colocations.colocation_id = ColocationMatches.mid_colocation_id
AND Colocations.structure_id = ?""", (STRUCTURE_ID, ))

last_match_id = None
processed = 0
for match_id, word_id, colocation_id in cur:
    # Only the first row of a run of rows sharing a match id is used.
    # NOTE(review): this relies on rows of one match being adjacent, but the
    # query has no ORDER BY -- confirm the row order is as intended.
    if match_id == last_match_id:
        continue

    # Characters 2..8 of the word id are parsed as an integer (presumably the
    # number of the source file, matching the keys built in step2.py).
    data_out.setdefault(colocation_id, []).append(int(word_id[2:9]))
    last_match_id = match_id

    processed += 1
    if processed % 10000 == 0:
        # Progress goes to stderr so that stdout carries only the data.
        print("\r{}".format(processed), end="", flush=True, file=sys.stderr)

print("", file=sys.stderr)

# One output line per collocation: its id followed by the collected numbers.
for colocation_id, numbers in data_out.items():
    print(colocation_id, *numbers)

con.close()

60
issue1000/step2.py Normal file
View File

@ -0,0 +1,60 @@
import sys
import re
import pathlib
import mmap
from datetime import datetime
import json
# Maps the code found after "ssj:" in a <catRef target="..."> attribute to the
# label stored in the output.
TYPES = {
    "SSJ.I": "internet",
    "SSJ.T.D": "drugo",
    "SSJ.T.P.C": "casopisi",
    "SSJ.T.P.R": "revije",
    "SSJ.T.K.S": "stvarno",
    "SSJ.T.K.L": "leposlovje",
    "SSJ.T.K.N": "stvarno",
}

# Raw bytes literals: the previous "\=" was an invalid escape sequence in a
# non-raw literal.  The regular expressions match exactly the same input.
searcher_date = re.compile(rb"<date>([^<]+)</date>")
searcher_type = re.compile(rb'<catRef target="ssj:([^"]+)')

# Total shown in the progress line only; hard-coded and not used for anything
# else.
N = 38411


def count_words(data):
    """Return the number of occurrences of ``b"<w a"`` in *data*.

    Args:
        data: Any object with a bytes-style ``find(sub, start)`` method
            (``bytes`` or an ``mmap``).

    The previous inline loop incremented the counter once more after ``find``
    had already returned -1, so every file was reported with one word too
    many; it also started searching at offset 3 instead of 0.
    """
    marker = b"<w a"
    words = 0
    pos = data.find(marker)
    while pos >= 0:
        words += 1
        pos = data.find(marker, pos + len(marker))
    return words


def extract_file_info(path):
    """Read one XML file and return ``(key, info)`` for it.

    Args:
        path: ``pathlib.Path`` of the XML file.

    Returns:
        A tuple ``(key, info)`` where ``key`` is ``int(path.stem[2:9])`` and
        ``info`` is ``{"date": <int>, "type": <label>, "words": <int>}``.
        ``date`` is the first four characters of the SECOND ``<date>`` element
        parsed as an integer, ``type`` is the ``TYPES`` label of the first
        ``<catRef target="ssj:...">`` and ``words`` is ``count_words`` of the
        file content.

    Raises:
        ValueError: If the file has fewer than two ``<date>`` elements or no
            matching ``<catRef>``.
        KeyError: If the category code is not listed in ``TYPES``.
    """
    with open(str(path), "rb") as fp:
        # The mmap is closed by the ``with`` block even when parsing fails.
        with mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ) as data:
            # Two plain searches instead of finditer(): a live finditer
            # iterator keeps a buffer export on the mmap, which makes closing
            # the mmap fail with BufferError.
            first_date = searcher_date.search(data)
            second_date = None
            if first_date is not None:
                second_date = searcher_date.search(data, first_date.end())
            if second_date is None:
                raise ValueError(
                    "{}: expected at least two <date> elements".format(path))

            category = searcher_type.search(data)
            if category is None:
                raise ValueError(
                    "{}: no <catRef target=\"ssj:...\"> found".format(path))

            info = {
                "date": int(second_date.group(1)[:4]),
                "type": TYPES[category.group(1).decode('ascii')],
                "words": count_words(data),
            }
    return int(path.stem[2:9]), info


def main():
    """Collect date, type and word count of every XML file under a folder.

    Command line: ``step2.py FOLDER_XMLS FILE_OUT``.  The folder is searched
    recursively for ``*.xml`` and the result is written to FILE_OUT as JSON.
    """
    folder_xmls = sys.argv[1]
    file_out = sys.argv[2]

    xml_data = {}
    files = pathlib.Path(folder_xmls).glob("**/*.xml")
    for idx, filename in enumerate(files, start=1):
        print("\r{}/{}: {}".format(idx, N, filename.stem), end="")
        key, info = extract_file_info(filename)
        xml_data[key] = info

    with open(file_out, 'w') as fp:
        json.dump(xml_data, fp, indent=2)


if __name__ == "__main__":
    main()

61
issue1000/step3.py Normal file
View File

@ -0,0 +1,61 @@
import sys
import json
from collections import Counter, defaultdict
def per_million(count, total_words):
    """Return *count* scaled to occurrences per million words.

    Returns 0.0 when *total_words* is zero, i.e. for a year inside the covered
    range for which no words were recorded; the plain division used before
    raised ZeroDivisionError in that case.
    """
    if not total_words:
        return 0.0
    return count / total_words * 1e6


def read_step1_counts(lines, xml_data):
    """Count the occurrences of every collocation per year and per type.

    Args:
        lines: Iterable of lines of the form ``"<cid> <wid> <wid> ..."``
            (whitespace separated).  Blank lines are ignored.
        xml_data: Dict loaded from the step2 JSON; every ``wid`` is looked up
            in it as a string key and must have ``'date'`` and ``'type'``.

    Returns:
        Dict mapping ``int(cid)`` to ``(Counter of dates, Counter of types)``.

    Raises:
        KeyError: If a ``wid`` is missing from *xml_data*.
    """
    data_out = {}
    for line in lines:
        fields = line.split()
        if not fields:
            continue
        cid, *wids = fields
        data_out[int(cid)] = (
            Counter(xml_data[wid]['date'] for wid in wids),
            # The per-type counts are collected but not written to the output.
            Counter(xml_data[wid]['type'] for wid in wids),
        )
    return data_out


def format_row(cid, lemma1, lemma2, rep, freq, fd, ctr_year, year_sums, years):
    """Build one output line: the fixed fields, then one raw count and one
    per-million value for every year in *years*.

    Args:
        ctr_year: Counter mapping year -> occurrences of this collocation.
        year_sums: Mapping year -> total number of words recorded for it.
        years: Iterable of the years to emit, in output order.
    """
    years = list(years)
    fixed = "{}, {}, {}, {}, {}, {}, ".format(cid, lemma1, lemma2, rep, freq, fd)
    counts = ", ".join(str(ctr_year[y]) for y in years)
    scaled = ", ".join(
        "{:2.3f}".format(per_million(ctr_year[y], year_sums.get(y, 0)))
        for y in years
    )
    return fixed + counts + ", " + scaled


def main():
    """Join the outputs of step0, step1 and step2 into the final table.

    Command line:
    ``step3.py FILE_STEP1 XMLS_INFO CIDS_INFO FREQ_MIN FILE_STEP3``.
    """
    file_step1 = sys.argv[1]
    xmls_info = sys.argv[2]
    cids_info = sys.argv[3]
    freq_min = int(sys.argv[4])
    file_step3 = sys.argv[5]

    with open(xmls_info, "r") as fp:
        xml_data = json.load(fp)

    with open(cids_info, "r") as fp:
        cids_data = json.load(fp)

    # Every year between the earliest and the latest file date gets a column,
    # including years for which there is no file at all.
    file_years = [file_data['date'] for file_data in xml_data.values()]
    years = range(min(file_years), max(file_years) + 1)

    # Total number of words per year, the denominator of the "pm" columns.
    year_sums = defaultdict(int)
    for file_data in xml_data.values():
        year_sums[file_data['date']] += file_data['words']

    with open(file_step1, "r") as fp:
        # The first line is discarded as a header.
        # NOTE(review): step1.py writes no header to stdout; unless its stderr
        # progress output is redirected into the same file, this drops the
        # first collocation -- confirm how the step1 file is produced.
        next(fp, None)
        data_out = read_step1_counts(fp, xml_data)

    with open(file_step3, 'w') as fp:
        print("collocation_id, lemma1, lemma2, Joint-representative-form, frequency, distinct_forms, ", end="", file=fp)
        print(", ".join(str(x) for x in years), end=", ", file=fp)
        print(", ".join(str(x) + "pm" for x in years), file=fp)

        for cid in sorted(data_out):
            ctr_year, _ctr_type = data_out[cid]

            # frequency < 2 is filtered in cids data!
            if str(cid) not in cids_data:
                continue

            lemma1, lemma2, rep, freq, fd, _ = cids_data[str(cid)]
            # int() also tolerates surrounding whitespace in the stored values.
            freq, fd = int(freq), int(fd)
            if freq < freq_min:
                continue

            print(format_row(cid, lemma1, lemma2, rep, freq, fd,
                             ctr_year, year_sums, years), file=fp)


if __name__ == "__main__":
    main()