added functors and headwords to db entry
commit f0b0abac1b
parent 86e56767dd

@@ -107,12 +107,12 @@ class Parser():
             yield from self.xml_file_to_generator(self.ssj_file)
 
     def parse_xml_file(self, xml_file):
-        tstart = time.time()
+        # tstart = time.time()
         file_data = []
         for tpl in self.xml_file_to_generator(xml_file):
            file_data += [tpl[1]]
         tend = time.time()
-        self.logger.info("Parsed {} in {:.4f} s".format(xml_file, tend - tstart))
+        # self.logger.info("Parsed {} in {:.4f} s".format(xml_file, tend - tstart))
         return file_data
 
     def xml_file_to_generator(self, xml_file):
@@ -211,7 +211,7 @@ class Parser():
                         "text": sentence_text,
                         "tokens": sentence_tokens,
                         "jos_links": jos_links,
-                        "srl_links": srl_links_parsed
+                        "srl_links": srl_links_parsed,
                     }
                     self.stats["parsed_count"] += 1
                     yield (xml_file, sentence_entry)

@@ -4,20 +4,44 @@ import argparse
 import logging
 import json
 from pymongo import MongoClient
+import pymongo
 import sys
 from multiprocessing import Pool
 import time
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 n_kres_files = -1  # for logging
 
 
+def _helper_tid_to_token(tid, tokens):
+    for t in tokens:
+        if t["tid"] == tid:
+            return t
+    return None
+
+
+def _db_preprocess(e):
+    if e["srl_links"] is None:
+        e["headwords"] = []
+        e["functors"] = []
+    else:
+        hw_tids = list(set([x["from"] for x in e["srl_links"]]))
+        hw_tokens = [_helper_tid_to_token(tid, e["tokens"]) for tid in hw_tids]
+        headwords = [(t["lemma"] if t["msd"][0] == "G" else t["lemma"] + "_") for t in hw_tokens]
+        e["headwords"] = headwords
+
+        functors = list(set([x["afun"] for x in e["srl_links"]]))
+        e["functors"] = functors
+    return e
+
+
 # handler for concurrency
 def _handle_kres_file_tpl(kres_file_tpl):
+    tstart = time.time()
     kres_file_idx = kres_file_tpl[0]
     kres_file = kres_file_tpl[1]
     logging.info("Handling {} ({}/{})".format(
         kres_file, kres_file_idx, n_kres_files))
     kres_data = kres_parser.parse_xml_file(kres_file)
     if args.output == "file":
         kres_outdir = outdir / "kres_json"
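Note: the new _db_preprocess step derives the two searchable fields per sentence that the commit title names. headwords are the lemmas of all SRL-link source tokens, with non-verb lemmas marked by a trailing underscore (msd[0] == "G" appears to be the verb class of the Slovene JOS/MULTEXT-East tagset); functors are the distinct semantic-role labels. A minimal usage sketch, with hypothetical tids, lemmas, and MSD strings:

    # Illustrative sentence entry, shaped like the parser's sentence_entry above.
    entry = {
        "srl_links": [{"from": 1, "to": 2, "afun": "ACT"}],
        "tokens": [
            {"tid": 1, "lemma": "govoriti", "msd": "Ggnste"},  # msd[0] == "G": verb, bare lemma
            {"tid": 2, "lemma": "mama", "msd": "Sozei"},       # a non-verb headword would become "mama_"
        ],
    }
    entry = _db_preprocess(entry)
    print(entry["headwords"])  # ["govoriti"]
    print(entry["functors"])   # ["ACT"]

_helper_tid_to_token is a linear scan per tid; building a lookup once per sentence ({t["tid"]: t for t in e["tokens"]}) would avoid the quadratic behaviour on long sentences.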
@@ -26,9 +50,37 @@ def _handle_kres_file_tpl(kres_file_tpl):
         with kres_outfile.open("w") as fp:
             json.dump(kres_data, fp)
     elif args.output == "db":
-        kres_col = valdb["kres"]
-        for sentence in kres_data:
-            kres_col.update({"sid": sentence["sid"]}, sentence, upsert=True)
+        # mongoclient needs to be created after forking
+        dbclient = MongoClient(
+            "mongodb://{}".format(args.dbaddr),
+            username=args.dbuser,
+            password=args.dbpass,
+            authSource="valdb",
+            authMechanism='SCRAM-SHA-1'
+        )
+        valdb = dbclient.valdb
+        kres_col = valdb["kres"]
+
+        # HUUUUGE BOTTLENECK
+        """
+        for sentence in kres_data:
+            kres_col.update({"sid": sentence["sid"]}, sentence, upsert=True)
+        """
+
+        kres_data_1 = [_db_preprocess(x) for x in kres_data]
+        kres_col.insert_many(kres_data_1)  # much much better (just make sure sid has a unique index)
+        logging.info("Handled {} ({}/{}) in {:.2f} s".format(
+            kres_file, kres_file_idx, n_kres_files, time.time() - tstart))
+
+
+def _get_dbclient(args):
+    dbclient = MongoClient(
+        "mongodb://{}".format(args.dbaddr),
+        username=args.dbuser,
+        password=args.dbpass,
+        authSource="valdb",
+        authMechanism='SCRAM-SHA-1'
+    )
+    return dbclient
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Parsing corpora kres and ssj500k.")
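Two notes on the db branch. First, the inline comment is right: pymongo's MongoClient is not fork-safe, so each Pool worker must open its own connection rather than inherit one from the parent. Second, insert_many removes the per-sentence round trip of the old update loop, but it fails on duplicate sids once a unique index exists, so re-running an import on the same file is no longer idempotent. A sketch of a middle ground, an unordered bulk of ReplaceOne upserts; the function name is hypothetical, the other names mirror the handler above:

    from pymongo import ReplaceOne

    def _bulk_upsert(kres_col, kres_data):
        # One round trip for the whole file instead of one per sentence.
        ops = [ReplaceOne({"sid": s["sid"]}, _db_preprocess(s), upsert=True)
               for s in kres_data]
        if ops:
            # ordered=False lets the server apply writes in parallel and
            # continue past individual failures.
            kres_col.bulk_write(ops, ordered=False)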
@@ -44,19 +96,16 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     outdir = None
-    valdb = None
     if args.output == "file":
         outdir = Path(args.outdir)
         outdir.mkdir(parents=True, exist_ok=True)
     elif args.output == "db":
-        dbclient = MongoClient(
-            "mongodb://{}".format(args.dbaddr),
-            username=args.dbuser,
-            password=args.dbpass,
-            authSource="valdb",
-            authMechanism='SCRAM-SHA-1'
-        )
-        valdb = dbclient.valdb
+        # Force unique sid
+        dbclient = _get_dbclient(args)
+        for corpus in ["kres", "ssj"]:
+            dbclient.valdb[corpus].ensure_index([("sid", pymongo.ASCENDING)])
+            dbclient.valdb[corpus].ensure_index([("headwords", pymongo.ASCENDING)])
+            dbclient.valdb[corpus].ensure_index([("functors", pymongo.ASCENDING)])
 
     # SSJ
     logger.info("Parsing Ssj: {}".format(args.ssj_file))
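Caveat: ensure_index has been deprecated since pymongo 3.0 in favour of create_index, and neither enforces uniqueness unless asked. As written, the "Force unique sid" comment is aspirational: without unique=True, insert_many will happily store duplicate sids. A sketch of the stricter variant (the function name is hypothetical):

    def _ensure_indexes(dbclient):
        for corpus in ["kres", "ssj"]:
            col = dbclient.valdb[corpus]
            # unique=True is what actually rejects a second document
            # with an already-seen sid.
            col.create_index([("sid", pymongo.ASCENDING)], unique=True)
            col.create_index([("headwords", pymongo.ASCENDING)])  # multikey over the array
            col.create_index([("functors", pymongo.ASCENDING)])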
@@ -67,8 +116,11 @@ if __name__ == "__main__":
         with ssj_outfile.open("w") as fp:
             json.dump(ssj_data, fp)
     elif args.output == "db":
+        dbclient = _get_dbclient(args)
+        valdb = dbclient.valdb
         ssj_col = valdb["ssj"]
         for sentence in ssj_data:
+            sentence = _db_preprocess(sentence)
             ssj_col.update({"sid": sentence["sid"]}, sentence, upsert=True)
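Collection.update carries the same deprecation caveat as ensure_index. Since ssj500k is a single file, the per-sentence upsert is acceptable here, but the non-deprecated spelling would be (sketch, same names as the loop above):

    for sentence in ssj_data:
        sentence = _db_preprocess(sentence)
        # replace_one(filter, replacement, upsert=True) is the pymongo >= 3.0
        # equivalent of the deprecated update(..., upsert=True).
        ssj_col.replace_one({"sid": sentence["sid"]}, sentence, upsert=True)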