"""Write the per-year frequency table for collocations.

Usage:
    python <this script> FILE_STEP1 XMLS_INFO CIDS_INFO FREQ_MIN FILE_STEP3

Arguments (all positional):
    FILE_STEP1  Text file with one header line (skipped) followed by lines of
                whitespace-separated tokens: a collocation id and then the
                ids of the files it occurs in.
    XMLS_INFO   JSON object mapping a file id to a record with the keys
                'date', 'words' and 'type'.
    CIDS_INFO   JSON object mapping a collocation id (as a string) to a
                six-element record: lemma1, lemma2, representative form,
                frequency, number of distinct forms, and one ignored value.
    FREQ_MIN    Integer; collocations whose frequency is lower are dropped.
    FILE_STEP3  Path of the table to write.

Each output row holds the collocation metadata, the number of occurrences
per year, and the same counts normalised to occurrences per million words
of that year.
"""
import sys
import json
from collections import Counter, defaultdict

FILE_STEP1 = sys.argv[1]
XMLS_INFO = sys.argv[2]
CIDS_INFO = sys.argv[3]
FREQ_MIN = int(sys.argv[4])
FILE_STEP3 = sys.argv[5]

with open(XMLS_INFO, "r") as fp:
    xml_data = json.load(fp)

with open(CIDS_INFO, "r") as fp:
    cids_data = json.load(fp)

# 'date' values are used as bounds of range() below, so they must be ints.
years = [file_data['date'] for file_data in xml_data.values()]
min_year = min(years)
max_year = max(years)
# Every year between the first and the last one gets a column, including
# years for which there is no file at all.
year_range = range(min_year, max_year + 1)

# Total number of words per year; denominator of the per-million columns.
year_sums = defaultdict(int)
for file_data in xml_data.values():
    year_sums[file_data['date']] += file_data['words']

# Distinct file types; not read anywhere else in the code visible here.
all_types = list({file_data['type'] for file_data in xml_data.values()})

# cid -> (occurrences per year, occurrences per file type)
data_out = {}
with open(FILE_STEP1, "r") as fp:
    next(fp)  # skip header
    for line in fp:
        tokens = line.split()
        if not tokens:
            # Blank line (e.g. trailing newline at end of file): nothing to
            # parse, and unpacking it would raise ValueError.
            continue
        cid, *wids = tokens
        data_out[int(cid)] = (
            Counter(xml_data[wid]['date'] for wid in wids),
            Counter(xml_data[wid]['type'] for wid in wids),
        )


def _per_million(count, total_words):
    """Return *count* scaled to one million words, formatted for output.

    A year without any words cannot contain occurrences either, so it is
    reported as 0.000 instead of raising ZeroDivisionError.
    """
    if not total_words:
        return "{:2.3f}".format(0.0)
    return "{:2.3f}".format(count / total_words * 1e6)


with open(FILE_STEP3, 'w') as fp:
    header = [
        "collocation_id",
        "lemma1",
        "lemma2",
        "Joint-representative-form",
        "frequency",
        "distinct_forms",
    ]
    header.extend(str(y) for y in year_range)
    header.extend(str(y) + "pm" for y in year_range)
    print(", ".join(header), file=fp)

    for cid in sorted(data_out):
        # The per-type counter is collected but not written by this script.
        ctr_year, ctr_type = data_out[cid]

        # frequency < 2 is filtered in cids data!
        if str(cid) not in cids_data:
            continue

        lemma1, lemma2, rep, freq, fd, _ = cids_data[str(cid)]
        freq, fd = int(freq), int(fd)
        if freq < FREQ_MIN:
            continue

        row = [str(cid), str(lemma1), str(lemma2), str(rep), str(freq), str(fd)]
        row.extend(str(ctr_year[y]) for y in year_range)
        # .get() avoids inserting missing years into the defaultdict.
        row.extend(_per_million(ctr_year[y], year_sums.get(y, 0))
                   for y in year_range)
        print(", ".join(row), file=fp)