pull/1/head
Ozbolt Menegatti 4 years ago
parent 1d4c0238a6
commit 9e8cd2a2ec

@ -0,0 +1,5 @@
# issue 1000
These four scripts were created as part of issue number 1000. They will one day be integrated into the application itself, but for now they live here.
If you have any questions, contact me. In any case, the scripts are really short, so you should be able to understand them fairly quickly.

@ -0,0 +1,25 @@
import sys
import json
def parse_row(line, c2):
    """Pick the fields kept for one collocation out of a ", "-separated row.

    Args:
        line: One line of the input file, with or without its line ending.
        c2: Index of the second component; its lemma is read from column
            ``1 + c2 * 5``.

    Returns:
        ``[lemma1, lemma2, rep, freq, fd, cid]``, all as strings.

    Raises:
        IndexError: If the row has too few columns.
    """
    # Strip the line ending first: ``fd`` is the last cell and would
    # otherwise carry the trailing newline into the JSON output.
    cells = line.rstrip("\r\n").split(", ")
    lemma1 = cells[1]
    lemma2 = cells[1 + c2 * 5]
    # The remaining fields are addressed from the end of the row.
    cid = cells[-8]
    rep = cells[-7]
    freq = cells[-6]
    fd = cells[-1]
    return [lemma1, lemma2, rep, freq, fd, cid]


def main(argv):
    """Convert the row-oriented input file into a JSON object keyed by cid.

    Args:
        argv: ``[prog, input_file, c2, output_json]``.
    """
    file_out2d = argv[1]
    c2 = int(argv[2])  # converted once instead of once per row
    file_out = argv[3]

    data_out = {}
    with open(file_out2d, 'r') as fp:
        # NOTE(review): the first line is not treated specially; if the
        # input has a header row it is stored like any other row.
        for line in fp:
            if not line.strip():
                continue  # a blank line has no columns to index
            row = parse_row(line, c2)
            data_out[row[-1]] = row

    with open(file_out, 'w') as fp:
        json.dump(data_out, fp)


if __name__ == "__main__":
    main(sys.argv)

@ -0,0 +1,43 @@
import sqlite3
import sys
# Export, for every collocation of one structure, the numeric ids taken from
# the word ids of its matches. Output: one line per collocation on stdout,
# "<colocation_id> <id> <id> ...". Progress is reported on stderr.

# Value bound to the structure_id placeholder of the query below.
STRUCTURE_ID = '1'

con = sqlite3.connect(sys.argv[1])
cur = con.cursor()

cur.execute("""SELECT Matches.match_id, Matches.word_id, Colocations.colocation_id
FROM Matches, Colocations, ColocationMatches
WHERE Matches.match_id = ColocationMatches.mid_match_id
AND Colocations.colocation_id = ColocationMatches.mid_colocation_id
AND Colocations.structure_id = ?""", (STRUCTURE_ID, ))

data_out = {}
prev_mid = None
idx = 0

# Pull rows one at a time until fetchone() signals exhaustion with None.
for mid, wid, cid in iter(cur.fetchone, None):
    # Only the first row of a run of consecutive rows sharing a match id is
    # used. NOTE(review): this relies on such rows arriving back to back;
    # the query has no ORDER BY -- confirm.
    if mid == prev_mid:
        continue

    # Characters 2..8 of the word id, parsed as an integer.
    wid_int = int(wid[2:9])
    data_out.setdefault(cid, []).append(wid_int)
    prev_mid = mid

    idx += 1
    if idx % 10000 == 0:
        print("\r{}".format(idx), end="", flush=True, file=sys.stderr)

# Terminate the progress line.
print("", file=sys.stderr)

for cid, wids in data_out.items():
    print(cid, *wids)

con.close()

@ -0,0 +1,60 @@
import sys
import re
import pathlib
import mmap
from datetime import datetime
import json
# Maps the taxonomy code found in <catRef target="ssj:..."> to the type label
# written to the output.
# NOTE(review): SSJ.T.K.S and SSJ.T.K.N both map to "stvarno" -- confirm that
# this is intended.
TYPES = {
    "SSJ.I": "internet",
    "SSJ.T.D": "drugo",
    "SSJ.T.P.C": "casopisi",
    "SSJ.T.P.R": "revije",
    "SSJ.T.K.S": "stvarno",
    "SSJ.T.K.L": "leposlovje",
    "SSJ.T.K.N": "stvarno",
}

searcher_date = re.compile(b"<date>([^<]+)</date>")
# Raw bytes literal: the previous non-raw form contained the invalid escape
# "\=", which newer Python versions warn about. The pattern matches the same
# text.
searcher_type = re.compile(rb'<catRef target="ssj:([^"]+)')

# Marker whose occurrences are counted as words.
WORD_MARKER = b"<w a"

# Expected number of XML files; used only in the progress display.
N = 38411


def extract_xml_info(data):
    """Extract year, text type and word count from one XML document.

    Args:
        data: The raw document as a bytes-like object that also offers
            ``find`` (``bytes`` or an ``mmap.mmap``).

    Returns:
        ``{"date": <int year>, "type": <label from TYPES>, "words": <int>}``.
        The year is the first four characters of the SECOND ``<date>``
        element in the document.

    Raises:
        ValueError: If there are fewer than two ``<date>`` elements or no
            ``catRef`` with an ``ssj:`` target.
        KeyError: If the taxonomy code is not listed in ``TYPES``.
    """
    first_date = searcher_date.search(data)
    second_date = None
    if first_date is not None:
        # Continue right after the first hit to reach the second element.
        second_date = searcher_date.search(data, first_date.end())
    if second_date is None:
        raise ValueError("expected at least two <date> elements")

    type_match = searcher_type.search(data)
    if type_match is None:
        raise ValueError('no <catRef target="ssj:..."> element found')

    # Count every occurrence of the marker. The earlier loop also counted
    # the final unsuccessful search, over-reporting each file by one word.
    words = 0
    pos = data.find(WORD_MARKER)
    while pos >= 0:
        words += 1
        pos = data.find(WORD_MARKER, pos + len(WORD_MARKER))

    return {
        "date": int(second_date.group(1)[:4]),
        "type": TYPES[type_match.group(1).decode('ascii')],
        "words": words,
    }


def main(argv):
    """Scan every ``*.xml`` file under a folder and dump the metadata as JSON.

    Args:
        argv: ``[prog, xml_folder, output_json]``.

    The output maps ``int(filename.stem[2:9])`` to the dictionary produced
    by ``extract_xml_info``.
    """
    folder_xmls = argv[1]
    file_out = argv[2]

    xml_data = {}
    xml_files = pathlib.Path(folder_xmls).glob("**/*.xml")
    for idx, filename in enumerate(xml_files, start=1):
        print("\r{}/{}: {}".format(idx, N, filename.stem), end="")
        with open(str(filename), "rb") as fp:
            # The context manager unmaps the file even if parsing fails.
            with mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ) as data:
                xml_data[int(filename.stem[2:9])] = extract_xml_info(data)

    with open(file_out, 'w') as fp:
        json.dump(xml_data, fp, indent=2)


if __name__ == "__main__":
    main(sys.argv)

@ -0,0 +1,61 @@
import sys
import json
from collections import Counter, defaultdict
def per_million(count, total):
    """Format ``count`` relative to ``total`` as occurrences per million.

    Returns "0.000" when ``total`` is zero, which happens for a year inside
    the covered range that has no documents; the plain division would raise
    ZeroDivisionError there.
    """
    if not total:
        return "{:2.3f}".format(0.0)
    return "{:2.3f}".format(count / total * 1e6)


def format_row(cid, cid_info, ctr_year, year_sums, years):
    """Build one output line (without line ending) for a collocation.

    Args:
        cid: Collocation id written in the first column.
        cid_info: ``[lemma1, lemma2, rep, freq, fd, _]``; ``freq`` and ``fd``
            are converted with ``int`` and the last item is ignored.
        ctr_year: Mapping year -> number of occurrences of this collocation.
        year_sums: Mapping year -> total number of words in that year.
        years: Iterable of the years to emit, in column order.

    Returns:
        The fixed columns, then the absolute count per year, then the
        per-million value per year, all joined with ", ".
    """
    lemma1, lemma2, rep, freq, fd, _ = cid_info
    years = list(years)
    head = "{}, {}, {}, {}, {}, {}, ".format(
        cid, lemma1, lemma2, rep, int(freq), int(fd))
    absolute = ", ".join(str(ctr_year.get(y, 0)) for y in years)
    relative = ", ".join(
        per_million(ctr_year.get(y, 0), year_sums.get(y, 0)) for y in years)
    return head + absolute + ", " + relative


def main(argv):
    """Join the three intermediate files into the final per-year table.

    Args:
        argv: ``[prog, step1_file, xmls_info_json, cids_info_json,
            freq_min, output_file]``.
    """
    file_step1 = argv[1]
    xmls_info = argv[2]
    cids_info = argv[3]
    freq_min = int(argv[4])
    file_step3 = argv[5]

    with open(xmls_info, "r") as fp:
        xml_data = json.load(fp)
    with open(cids_info, "r") as fp:
        cids_data = json.load(fp)

    all_years = [file_data['date'] for file_data in xml_data.values()]
    years = range(min(all_years), max(all_years) + 1)

    # Total number of words per year, the denominator of the "pm" columns.
    year_sums = defaultdict(int)
    for file_data in xml_data.values():
        year_sums[file_data['date']] += file_data['words']

    # Per collocation: how many of its listed ids fall into each year.
    data_out = {}
    with open(file_step1, "r") as fp:
        next(fp, None)  # first line is discarded as a header
        for line in fp:
            if not line.strip():
                continue  # a blank line carries no collocation id
            cid, *wids = line.split()
            data_out[int(cid)] = Counter(xml_data[wid]['date'] for wid in wids)

    with open(file_step3, 'w') as fp:
        print("collocation_id, lemma1, lemma2, Joint-representative-form, frequency, distinct_forms, ", end="", file=fp)
        print(", ".join(str(y) for y in years), end=", ", file=fp)
        print(", ".join(str(y) + "pm" for y in years), file=fp)

        for cid in sorted(data_out):
            # frequency < 2 is filtered in cids data!
            cid_info = cids_data.get(str(cid))
            if cid_info is None:
                continue
            if int(cid_info[3]) < freq_min:
                continue
            print(format_row(cid, cid_info, data_out[cid], year_sums, years), file=fp)


if __name__ == "__main__":
    main(sys.argv)
Loading…
Cancel
Save