From 9e8cd2a2ec06dcb152f3e862a672ddbc3a0c597b Mon Sep 17 00:00:00 2001 From: Ozbolt Menegatti Date: Mon, 2 Mar 2020 19:13:19 +0100 Subject: [PATCH] Issue #1000 --- issue1000/README.md | 5 ++++ issue1000/step0.py | 25 +++++++++++++++++++ issue1000/step1.py | 43 ++++++++++++++++++++++++++++++++ issue1000/step2.py | 60 ++++++++++++++++++++++++++++++++++++++++++++ issue1000/step3.py | 61 +++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 194 insertions(+) create mode 100644 issue1000/README.md create mode 100644 issue1000/step0.py create mode 100644 issue1000/step1.py create mode 100644 issue1000/step2.py create mode 100644 issue1000/step3.py diff --git a/issue1000/README.md b/issue1000/README.md new file mode 100644 index 0000000..56df9d3 --- /dev/null +++ b/issue1000/README.md @@ -0,0 +1,5 @@ +# issue 1000 + +These four scripts were created as part of issue number 1000. They will one day be integrated into the application itself, but for now they live here. + +If you have any questions, contact me. In any case, the scripts are really short, so you should be able to understand them fairly quickly. 
diff --git a/issue1000/step0.py b/issue1000/step0.py new file mode 100644 index 0000000..caabd6e --- /dev/null +++ b/issue1000/step0.py @@ -0,0 +1,25 @@ +import sys +import json + +FILE_OUT2D = sys.argv[1] +C2 = sys.argv[2] +FILE_OUT = sys.argv[3] + +data_out = {} + +with open(FILE_OUT2D, 'r') as fp: + for line in fp: + cells = line.split(", ") + + lemma1 = cells[1] + lemma2 = cells[1 + int(C2) * 5] + rep = cells[-7] + freq = cells[-6] + fd = cells[-1] + cid = cells[-8] + + data_out[cid] = [lemma1, lemma2, rep, freq, fd, cid] + +with open(FILE_OUT, 'w') as fp: + json.dump(data_out, fp) + diff --git a/issue1000/step1.py b/issue1000/step1.py new file mode 100644 index 0000000..b9d81a9 --- /dev/null +++ b/issue1000/step1.py @@ -0,0 +1,43 @@ +import sqlite3 +import sys + +STRUCTURE_ID = '1' + +con = sqlite3.connect(sys.argv[1]) +cur = con.cursor() + +data_out = {} + +cur.execute("""SELECT Matches.match_id, Matches.word_id, Colocations.colocation_id +FROM Matches, Colocations, ColocationMatches +WHERE Matches.match_id = ColocationMatches.mid_match_id +AND Colocations.colocation_id = ColocationMatches.mid_colocation_id +AND Colocations.structure_id = ?""", (STRUCTURE_ID, )) + +prev_mid = None +idx = 0 + +while True: + row = cur.fetchone() + if row is None: + break + + mid, wid, cid = row + if mid == prev_mid: + continue + elif cid not in data_out: + data_out[cid] = [] + + wid_int = int(wid[2:9]) + data_out[cid].append(wid_int) + + prev_mid = mid + idx += 1 + if(idx % 10000 == 0): + print("\r{}".format(idx), end="", flush=True, file=sys.stderr) + +print("", file=sys.stderr) +for mid, wids in data_out.items(): + print(mid, *wids) + +con.close() diff --git a/issue1000/step2.py b/issue1000/step2.py new file mode 100644 index 0000000..f9cb295 --- /dev/null +++ b/issue1000/step2.py @@ -0,0 +1,60 @@ +import sys +import re +import pathlib +import mmap +from datetime import datetime +import json + +FOLDER_XMLS = sys.argv[1] +FILE_OUT = sys.argv[2] + +TYPES = { + "SSJ.I": 
"internet", + "SSJ.T.D": "drugo", + "SSJ.T.P.C": "casopisi", + "SSJ.T.P.R": "revije", + "SSJ.T.K.S": "stvarno", + "SSJ.T.K.L": "leposlovje", + "SSJ.T.K.N": "stvarno", +} + +xml_data = {} + +searcher_date = re.compile(b"([^<]+)") +searcher_type = re.compile(b"= 0: + fidx = data.find(b"