Issue #1000
This commit is contained in:
60
issue1000/step2.py
Normal file
60
issue1000/step2.py
Normal file
@@ -0,0 +1,60 @@
|
||||
import sys
|
||||
import re
|
||||
import pathlib
|
||||
import mmap
|
||||
from datetime import datetime
|
||||
import json
|
||||
|
||||
# Maps the category code captured from a document's
# `<catRef target="ssj:...">` attribute to the label stored under "type".
TYPES = {
    "SSJ.I": "internet",
    "SSJ.T.D": "drugo",
    "SSJ.T.P.C": "casopisi",
    "SSJ.T.P.R": "revije",
    "SSJ.T.K.S": "stvarno",
    "SSJ.T.K.L": "leposlovje",
    "SSJ.T.K.N": "stvarno",
}

# Both patterns are bytes patterns because they are run directly against the
# memory-mapped file contents; raw literals avoid invalid-escape warnings.
searcher_date = re.compile(rb'<date>([^<]+)</date>')
searcher_type = re.compile(rb'<catRef target="ssj:([^"]+)')

# Opening bytes of a word element; every occurrence counts as one word.
WORD_TAG = b"<w a"

# Default total shown in the progress line. It is only used for display.
# NOTE(review): presumably the number of XML files in the corpus - confirm.
N = 38411


def count_word_tags(data):
    """Return how many times ``WORD_TAG`` occurs in *data*.

    *data* may be any object with a bytes-style ``find(sub, start)`` method
    (``bytes``, ``bytearray``, ``mmap.mmap``).
    """
    count = 0
    pos = data.find(WORD_TAG)
    while pos >= 0:
        count += 1
        # Occurrences cannot overlap, so resume right after this one.
        pos = data.find(WORD_TAG, pos + len(WORD_TAG))
    return count


def parse_document(data, source="<data>"):
    """Extract the year, text type and word count from one document.

    Args:
        data: Raw document bytes (``bytes`` or an open ``mmap.mmap``).
        source: Name used in error messages to identify the document.

    Returns:
        A dict with the keys ``"date"`` (int year), ``"type"`` (label from
        ``TYPES``) and ``"words"`` (number of ``WORD_TAG`` occurrences).

    Raises:
        ValueError: If the document has fewer than two ``<date>`` elements or
            no ``<catRef target="ssj:...">`` attribute.
        KeyError: If the category code is not listed in ``TYPES``.
    """
    # The year is taken from the SECOND <date> element.
    # NOTE(review): presumably the first <date> is not the publication date
    # - confirm against the corpus header layout.
    # Two plain searches are used instead of finditer() so that no iterator
    # is left holding the mapped buffer when the mmap is closed.
    first_date = searcher_date.search(data)
    second_date = None
    if first_date is not None:
        second_date = searcher_date.search(data, first_date.end())
    if second_date is None:
        raise ValueError(
            "{}: expected at least two <date> elements".format(source)
        )

    category = searcher_type.search(data)
    if category is None:
        raise ValueError("{}: no <catRef target=\"ssj:...\"> found".format(source))

    return {
        # Only the first four bytes of the date text (the year) are used.
        "date": int(second_date.group(1)[:4]),
        "type": TYPES[category.group(1).decode('ascii')],
        "words": count_word_tags(data),
    }


def collect_xml_data(folder_xmls, total=N):
    """Scan every ``*.xml`` file below *folder_xmls* and gather its metadata.

    Args:
        folder_xmls: Directory that is searched recursively for XML files.
        total: Number printed as the denominator of the progress line.

    Returns:
        A dict mapping the integer parsed from characters 2..8 of each file
        stem to the dict returned by ``parse_document``.

    Raises:
        ValueError: Propagated from ``parse_document``; also raised by
            ``mmap`` when a file is empty, or by ``int`` when the stem slice
            is not numeric.
        KeyError: Propagated from ``parse_document``.
    """
    xml_data = {}
    xml_files = pathlib.Path(folder_xmls).glob("**/*.xml")
    for idx, filename in enumerate(xml_files, 1):
        print("\r{}/{}: {}".format(idx, total, filename.stem), end="")

        key = int(filename.stem[2:9])
        # Context managers guarantee the file and the mapping are released
        # even when parsing fails.
        with open(str(filename), "rb") as fp:
            with mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ) as data:
                xml_data[key] = parse_document(data, str(filename))
    return xml_data


if __name__ == "__main__":
    # Usage: step2.py <folder with XML files> <output JSON file>
    FOLDER_XMLS = sys.argv[1]
    FILE_OUT = sys.argv[2]

    xml_data = collect_xml_data(FOLDER_XMLS)

    with open(FILE_OUT, 'w') as fp:
        json.dump(xml_data, fp, indent=2)
|
||||
Reference in New Issue
Block a user