61 lines
1.3 KiB
Python
61 lines
1.3 KiB
Python
import sys
|
|
import re
|
|
import pathlib
|
|
import mmap
|
|
from datetime import datetime
|
|
import json
|
|
|
|
# Command-line interface:
#   argv[1] -- folder that is searched (recursively) for *.xml files
#   argv[2] -- path of the JSON file the collected data is written to
# Fail early with a usage message instead of a bare IndexError traceback
# when an argument is missing. Extra arguments are still ignored.
if len(sys.argv) < 3:
    sys.exit("usage: {} FOLDER_XMLS FILE_OUT".format(sys.argv[0]))

FOLDER_XMLS = sys.argv[1]
FILE_OUT = sys.argv[2]
|
|
|
|
# Maps the category code that follows the "ssj:" prefix in a file's
# <catRef target="..."> attribute to the label stored in the output.
# Two codes (SSJ.T.K.S and SSJ.T.K.N) share the label "stvarno".
TYPES = {
    # SSJ.I
    "SSJ.I": "internet",
    # SSJ.T.D
    "SSJ.T.D": "drugo",
    # SSJ.T.P.*
    "SSJ.T.P.C": "casopisi",
    "SSJ.T.P.R": "revije",
    # SSJ.T.K.*
    "SSJ.T.K.S": "stvarno",
    "SSJ.T.K.L": "leposlovje",
    "SSJ.T.K.N": "stvarno",
}
|
|
|
|
# Collected results: one entry per processed XML file, keyed by the integer
# parsed from the file name.
xml_data = {}

# Byte-level patterns, applied directly to the memory-mapped file contents.
# Raw bytes literals are used so that no backslash escapes are needed inside
# the Python literal (the former "\=" was an invalid escape sequence).
searcher_date = re.compile(rb"<date>([^<]+)</date>")
searcher_type = re.compile(rb'<catRef target="ssj:([^"]+)')

# Progress counter: number of files handled so far.
idx = 0
# Hard-coded total shown in the progress display only.
# NOTE(review): presumably the number of XML files in the corpus this script
# was written for -- confirm before relying on the displayed ratio.
N = 38411
|
|
|
|
# Walk every *.xml file below FOLDER_XMLS (recursively) and store one record
# per file in xml_data: the year taken from the file's second <date> element,
# the label TYPES assigns to its "ssj:" category code, and the number of
# "<w a" byte sequences it contains.
xml_paths = pathlib.Path(FOLDER_XMLS).glob("**/*.xml")
for idx, filename in enumerate(xml_paths, start=1):
    # Single-line progress display; "\r" moves back to the start of the line.
    print("\r{}/{}: {}".format(idx, N, filename.stem), end="")

    # Record key: characters 2..8 of the file stem, parsed as an integer.
    # NOTE(review): assumes every stem carries digits there -- confirm.
    key = int(filename.stem[2:9])

    with open(str(filename), "rb") as fp:
        # The context manager closes the mapping even if parsing fails.
        with mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ) as data:
            # The first <date> match is skipped and the second one is used.
            # Two plain searches are used instead of a finditer() iterator so
            # that no live iterator is still holding on to the mapping when
            # it gets closed.
            first_date = searcher_date.search(data)
            second_date = None
            if first_date is not None:
                second_date = searcher_date.search(data, first_date.end())
            type_match = searcher_type.search(data)

            if second_date is None or type_match is None:
                raise ValueError(
                    "{}: expected at least two <date> elements and a "
                    "<catRef target=\"ssj:...\"> attribute".format(filename)
                )

            # Only the first four bytes of the date text (the year) are kept.
            date = int(second_date.group(1)[:4])
            # An unknown category code raises KeyError.
            typ = TYPES[type_match.group(1).decode('ascii')]

            # Count occurrences of the word-tag prefix. The counter is bumped
            # only after a successful find(), so the total equals the number
            # of occurrences (no extra increment for the final failed find).
            word_tag = b"<w a"
            words = 0
            fidx = data.find(word_tag)
            while fidx >= 0:
                words += 1
                fidx = data.find(word_tag, fidx + len(word_tag))

    xml_data[key] = {
        "date": date,
        "type": typ,
        "words": words,
    }
|
|
|
|
|
|
# Store the collected records in FILE_OUT as JSON indented by two spaces.
with open(FILE_OUT, 'w') as fp:
    fp.write(json.dumps(xml_data, indent=2))
|