def enriched_lemma(token):
    """Return the token's lemma, with "_" appended unless its MSD tag starts with "G".

    Args:
        token: Mapping that provides the keys ``"lemma"`` (a string) and
            ``"msd"`` (an indexable tag; only its first element is inspected).

    Returns:
        ``token["lemma"]`` unchanged when the first element of
        ``token["msd"]`` equals ``"G"``; otherwise ``token["lemma"] + "_"``.
        An empty (falsy) tag counts as "does not start with G".

    Raises:
        KeyError: If ``"msd"`` or ``"lemma"`` is missing from ``token``.
    """
    # NOTE(review): "G" presumably marks verbs in the MSD tagset used by the
    # corpus -- confirm against the tagset documentation.
    msd = token["msd"]

    # Guard the index: an empty tag has no first element, and indexing it
    # would raise IndexError and abort processing of the whole entry.
    if msd and msd[0] == "G":
        return token["lemma"]
    return token["lemma"] + "_"