luscenje_struktur/issue1000/step2.py

61 lines
1.3 KiB
Python
Raw Normal View History

2020-03-02 18:13:19 +00:00
import sys
import re
import pathlib
import mmap
from datetime import datetime
import json
# Command-line arguments: the folder that is searched recursively for
# *.xml files, and the path of the JSON file that receives the result.
FOLDER_XMLS, FILE_OUT = sys.argv[1], sys.argv[2]
# Translates the code captured after "ssj:" in a <catRef target="..."> attribute
# into the label stored under "type" in the output.  Two of the codes
# (SSJ.T.K.S and SSJ.T.K.N) share the label "stvarno".
TYPES = {
    "SSJ.I": "internet",
    "SSJ.T.D": "drugo",
    "SSJ.T.P.C": "casopisi",
    "SSJ.T.P.R": "revije",
    "SSJ.T.K.S": "stvarno",
    "SSJ.T.K.L": "leposlovje",
    "SSJ.T.K.N": "stvarno",
}
# Per-file records gathered by the scan, keyed by the integer parsed from
# each file name.
xml_data = {}

# Byte patterns, because the files are searched as raw bytes.  Raw literals
# are used so that no backslash escape is interpreted by Python itself (the
# previous non-raw literal contained the invalid escape "\=").
searcher_date = re.compile(rb"<date>([^<]+)</date>")
searcher_type = re.compile(rb'<catRef target="ssj:([^"]+)')

# Progress display state: the running file counter and the denominator shown
# next to it.  N is hard-coded -- presumably the expected number of XML
# files; it is only used in the progress message.
idx = 0
N = 38411
def _count_word_tags(data):
    """Count the non-overlapping occurrences of ``b"<w a"`` in *data*.

    *data* must offer a bytes-style ``find(sub, start)`` method, as
    ``bytes`` and ``mmap.mmap`` objects do.  Returns 0 when the marker
    never occurs.
    """
    marker = b"<w a"
    total = 0
    pos = data.find(marker)
    # Count only successful finds; the terminating -1 result must not be
    # counted, otherwise every total would be one too high.
    while pos >= 0:
        total += 1
        pos = data.find(marker, pos + len(marker))
    return total


# Scan every *.xml file below FOLDER_XMLS and record, per file, the year of
# its second <date> element, its mapped text type and its word-tag count.
for idx, filename in enumerate(
        pathlib.Path(FOLDER_XMLS).glob("**/*.xml"), start=1):
    print("\r{}/{}: {}".format(idx, N, filename.stem), end="")

    # The file is memory-mapped and searched as bytes.  Both resources are
    # context-managed so they are released even if a file is malformed.
    with open(str(filename), "rb") as fp, \
            mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ) as data:
        # The second <date> element is the one that is used.  Two plain
        # searches are used instead of a finditer() iterator so that nothing
        # still holds a buffer on the map when it is closed.
        first_date = searcher_date.search(data)
        if first_date is None:
            second_date = None
        else:
            second_date = searcher_date.search(data, first_date.end())
        type_match = searcher_type.search(data)

        if second_date is None or type_match is None:
            raise ValueError(
                "{}: expected at least two <date> elements and one "
                "<catRef target=\"ssj:...\"> element".format(filename)
            )

        # Characters 2..8 of the file name (without extension) form the key.
        key = int(filename.stem[2:9])
        # First four bytes of the date text are taken as the year.
        date = int(second_date.group(1)[:4])
        # Raises KeyError for a type code that TYPES does not list.
        typ = TYPES[type_match.group(1).decode('ascii')]
        words = _count_word_tags(data)

    xml_data[key] = {
        "date": date,
        "type": typ,
        "words": words
    }
# Serialise the collected records to FILE_OUT as JSON indented by two spaces.
with open(FILE_OUT, 'w') as out_fp:
    json.dump(obj=xml_data, fp=out_fp, indent=2)