# luscenje_struktur/issue1000/step2.py

import sys
import re
import pathlib
import mmap
from datetime import datetime
import json

# Command-line interface: the first argument is the folder that is searched
# recursively for XML files, the second is the path of the JSON file written
# at the end of the run.
FOLDER_XMLS, FILE_OUT = sys.argv[1], sys.argv[2]

# Maps the category code captured from a document's <catRef> target (the part
# after the "ssj:" prefix) to the label stored in the output.
# Note that "SSJ.T.K.S" and "SSJ.T.K.N" both map to "stvarno".
TYPES = {
    "SSJ.I": "internet",
    "SSJ.T.D": "drugo",
    "SSJ.T.P.C": "casopisi",
    "SSJ.T.P.R": "revije",
    "SSJ.T.K.S": "stvarno",
    "SSJ.T.K.L": "leposlovje",
    "SSJ.T.K.N": "stvarno",
}

# Collected per-document metadata; filled in by the processing loop and
# dumped as JSON at the end of the script.
xml_data = {}

# Patterns are bytes because they are applied to the raw file contents.
# Raw literals keep the regex text free of Python-level escapes (the former
# "\=" was an invalid string escape that newer interpreters warn about).
# group(1) of searcher_date is the text between <date> and </date>;
# group(1) of searcher_type is the category code following "ssj:".
searcher_date = re.compile(rb"<date>([^<]+)</date>")
searcher_type = re.compile(rb'<catRef target="ssj:([^"]+)')

# Progress counter and the total shown next to it in the progress output
# (presumably the number of XML files in the corpus -- TODO confirm).
idx = 0
N = 38411

# Scan every XML file below FOLDER_XMLS and record, for each document, the
# year taken from its second <date> element, its mapped category label and
# the number of "<w a" tags it contains.
for idx, filename in enumerate(pathlib.Path(FOLDER_XMLS).glob("**/*.xml"), 1):
    # "\r" rewrites the same console line so the progress stays on one row.
    print("\r{}/{}: {}".format(idx, N, filename.stem), end="")

    # Characters 2..8 of the file stem form the integer document key.
    key = int(filename.stem[2:9])

    # Map the file read-only so the searches run directly over its bytes.
    # Both resources are context-managed and are therefore released even
    # when a document turns out to be malformed.
    with open(str(filename), "rb") as fp, \
            mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ) as data:
        # The first <date> is skipped and the second one is used (presumably
        # the first is not the publication date -- TODO confirm).
        # Two search() calls are used instead of finditer(): a partially
        # consumed finditer() iterator keeps the mapping's buffer exported,
        # and an mmap cannot be closed while such an export is alive.
        date_match = searcher_date.search(data)
        if date_match is not None:
            date_match = searcher_date.search(data, date_match.end())
        type_match = searcher_type.search(data)

        if date_match is None:
            raise ValueError(
                "{}: expected at least two <date> elements".format(filename))
        if type_match is None:
            raise ValueError(
                "{}: no <catRef target=\"ssj:...\"> element".format(filename))

        # Only the first four bytes of the date are kept (the year, assuming
        # dates start with YYYY); int() accepts the ASCII bytes directly.
        date = int(date_match.group(1)[:4])
        # An unknown category code raises KeyError showing the offending code.
        typ = TYPES[type_match.group(1).decode('ascii')]

        # Count the "<w a" tags.  The counter is bumped only for real hits;
        # previously the final failed find() was counted as well, so every
        # document was reported with one word too many.
        words = 0
        pos = data.find(b"<w a")
        while pos >= 0:
            words += 1
            # The needle is 4 bytes long and cannot overlap itself.
            pos = data.find(b"<w a", pos + 4)

    xml_data[key] = {
        "date": date,
        "type": typ,
        "words": words
    }


# Persist the collected metadata as JSON indented by two spaces.
with open(FILE_OUT, 'w') as out_file:
    serialized = json.dumps(xml_data, indent=2)
    out_file.write(serialized)