From 92fa061dfc09e9068571518e7ad66338460b2f23 Mon Sep 17 00:00:00 2001 From: Cyprian Laskowski Date: Thu, 25 Mar 2021 10:56:37 +0100 Subject: [PATCH] Redmine #1487: updated pipeline for classla/obeliks changes --- package/structure_assignment/constants.py | 2 +- package/structure_assignment/pipeline.py | 13 +++++-------- scripts/setup.sh | 2 +- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/package/structure_assignment/constants.py b/package/structure_assignment/constants.py index e1c54f0..118bfbb 100644 --- a/package/structure_assignment/constants.py +++ b/package/structure_assignment/constants.py @@ -24,6 +24,6 @@ FILE_MAP = {'strings-list': 'strings.txt', NLP_CONFIG_MAP = { 'type': 'standard_jos', 'processors': 'tokenize,pos,lemma,depparse', - 'tokenize_pretokenized': True, + 'tokenize_pretokenized': 'conllu', 'pos_use_lexicon': True, } diff --git a/package/structure_assignment/pipeline.py b/package/structure_assignment/pipeline.py index 7e12460..6b10c33 100644 --- a/package/structure_assignment/pipeline.py +++ b/package/structure_assignment/pipeline.py @@ -5,10 +5,7 @@ from types import SimpleNamespace import lxml.etree as lxml import obeliks - import classla -from classla import Document -from classla.utils.conll import CoNLL from structure_assignment.constants import * from structure_assignment.tweak_conllu import tweak as tweak_conllu @@ -134,11 +131,11 @@ class Pipeline: print('Parsing with classla ...') input_file_name = self.file_map['obeliks-tweaked'] output_file_name = self.file_map['classla-parsed'] - doc = Document(text=None) - conll_file = CoNLLFile(filename=input_file_name) - doc.conll_file = conll_file - result = nlp(doc) - result.conll_file.write_conll(output_file_name) + with open(input_file_name, 'r') as input_file: + input_conllu = input_file.read() + doc = self.nlp(input_conllu) + with open(output_file_name, 'w') as output_file: + output_file.write(doc.to_conll()) def do_translate_jos(self): print('Translating JOS ...') diff --git a/scripts/setup.sh b/scripts/setup.sh index ddca1d5..8315ead 100755 --- a/scripts/setup.sh +++ b/scripts/setup.sh @@ -17,7 +17,7 @@ pip install lxml pip install psycopg2cffi pip install sqlalchemy pip install classla -python -c "import classla; classla.download('sl_ssj_jos')" <<< $'Y\nresources/classla' +python -c "import classla; classla.download('sl', type='standard_jos')" <<< $'Y\nresources/classla' pip install obeliks pip install nova_slovnica/python/package/ pip install luscenje_struktur/