62 lines
1.9 KiB
Python
62 lines
1.9 KiB
Python
import sys
|
|
import json
|
|
from collections import Counter, defaultdict
|
|
|
|
# Positional command-line arguments; all five must be supplied.
# Inputs:
#   FILE_STEP1 - text file: one header line, then whitespace-separated rows
#                whose first field is a collocation id and whose remaining
#                fields are keys into the XMLS_INFO mapping.
#   XMLS_INFO  - JSON file; each value provides 'date', 'words' and 'type'.
#   CIDS_INFO  - JSON file keyed by the collocation id written as a string.
#   FREQ_MIN   - integer threshold; rows with a lower frequency are skipped.
# Output:
#   FILE_STEP3 - path of the comma-separated table this script writes.
FILE_STEP1 = sys.argv[1]
XMLS_INFO = sys.argv[2]
CIDS_INFO = sys.argv[3]
FREQ_MIN = int(sys.argv[4])
FILE_STEP3 = sys.argv[5]
|
|
|
|
|
|
# Per-file metadata: the rest of the script reads the 'date', 'words' and
# 'type' fields of every entry.
with open(XMLS_INFO) as xml_fp:
    xml_data = json.load(xml_fp)

# Per-collocation records, looked up by str(cid) when the table is written.
with open(CIDS_INFO) as cids_fp:
    cids_data = json.load(cids_fp)
|
|
|
|
# Collect every file's year and, in the same pass, the total number of words
# recorded for each year.
years = []
year_sums = defaultdict(int)
for file_data in xml_data.values():
    years.append(file_data['date'])
    year_sums[file_data['date']] += file_data['words']

# Inclusive bounds of the year columns in the output table.
min_year, max_year = min(years), max(years)
|
|
|
|
# Distinct 'type' values found across all files (order is not significant).
all_types = list({entry['type'] for entry in xml_data.values()})
|
|
|
|
# Maps int(collocation id) -> (Counter of years, Counter of types), counted
# over the files listed on that id's row.
data_out = {}

with open(FILE_STEP1) as step1_fp:
    next(step1_fp)  # the first line is a header, not data
    for row in step1_fp:
        # First field is the collocation id; the rest are keys of xml_data.
        cid, *wids = row.split()
        per_year = Counter(xml_data[wid]['date'] for wid in wids)
        per_type = Counter(xml_data[wid]['type'] for wid in wids)
        data_out[int(cid)] = (per_year, per_type)
|
|
|
|
# Write the result table: one header row, then one row per collocation id
# (ascending) that is present in cids_data and has frequency >= FREQ_MIN.
# Each row holds the collocation fields, the raw count for every year in
# [min_year, max_year], and the same counts as occurrences per million words.
with open(FILE_STEP3, 'w') as fp:
    # The year columns are the same for the header and for every row, so the
    # range is materialised once instead of being rebuilt for each print.
    year_span = list(range(min_year, max_year + 1))

    print("collocation_id, lemma1, lemma2, Joint-representative-form, frequency, distinct_forms, ", end="", file=fp)
    print(", ".join(str(y) for y in year_span), end=", ", file=fp)
    print(", ".join(str(y) + "pm" for y in year_span), file=fp)

    for cid in sorted(data_out):
        # Only the per-year counter is written; the per-type counter is not
        # part of this table.
        ctr_year, _ = data_out[cid]

        # frequency < 2 is filtered in cids data, so such ids are absent here.
        cid_key = str(cid)
        if cid_key not in cids_data:
            continue

        lemma1, lemma2, rep, freq, fd, _ = cids_data[cid_key]
        freq, fd = int(freq), int(fd)
        if freq < FREQ_MIN:
            continue

        per_million = []
        for y in year_span:
            total_words = year_sums[y]
            # A year inside the range may have no words at all (a gap between
            # min_year and max_year, or files with a zero word count).
            # Dividing would raise ZeroDivisionError, so report 0.000 instead.
            rate = ctr_year[y] / total_words * 1e6 if total_words else 0.0
            per_million.append("{:2.3f}".format(rate))

        print("{}, {}, {}, {}, {}, {}, ".format(cid, lemma1, lemma2, rep, freq, fd), end="", file=fp)
        print(", ".join(str(ctr_year[y]) for y in year_span), end=", ", file=fp)
        print(", ".join(per_million), file=fp)
|
|
|