From c6b8426fb30f11f64bccb7f8bf0f1dd367f1a6f2 Mon Sep 17 00:00:00 2001 From: voje Date: Fri, 19 Apr 2019 07:41:50 +0200 Subject: [PATCH] added adjective handling (appending _ to headwords) --- corpusparser/__init__.py | 3 ++- corpusparser/main.py | 6 +++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/corpusparser/__init__.py b/corpusparser/__init__.py index d993b8c..5463a20 100644 --- a/corpusparser/__init__.py +++ b/corpusparser/__init__.py @@ -1 +1,2 @@ -from corpusparser.Parser import Parser \ No newline at end of file +from corpusparser.Parser import Parser +from corpusparser.main import enriched_lemma \ No newline at end of file diff --git a/corpusparser/main.py b/corpusparser/main.py index 2998a69..38e8156 100644 --- a/corpusparser/main.py +++ b/corpusparser/main.py @@ -15,6 +15,10 @@ logger = logging.getLogger(__name__) n_kres_files = -1 # for logging +def enriched_lemma(token): + return (token["lemma"] if token["msd"][0] == "G" else token["lemma"] + "_") + + def _helper_tid_to_token(tid, tokens): for t in tokens: if t["tid"] == tid: @@ -29,7 +33,7 @@ def _db_preprocess(e): else: hw_tids = list(set([x["from"] for x in e["srl_links"]])) hw_tokens = [_helper_tid_to_token(tid, e["tokens"]) for tid in hw_tids] - headwords = [(t["lemma"] if t["msd"][0] == "G" else t["lemma"] + "_") for t in hw_tokens] + headwords = [enriched_lemma(t) for t in hw_tokens] e["headwords"] = headwords functors = list(set([x["afun"] for x in e["srl_links"]]))