From 40db51adf110a3b62ddd952852a0162f7ba0d499 Mon Sep 17 00:00:00 2001 From: Ozbolt Menegatti Date: Tue, 12 Feb 2019 11:58:04 +0100 Subject: [PATCH] msd translate now optional --- wani.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/wani.py b/wani.py index 8896d2a..0320c95 100644 --- a/wani.py +++ b/wani.py @@ -710,9 +710,9 @@ def get_msd(comp): raise NotImplementedError("MSD?") class Word: - def __init__(self, xml): + def __init__(self, xml, do_msd_translate): self.lemma = xml.get('lemma') - self.msd = MSD_TRANSLATE[get_msd(xml)] + self.msd = MSD_TRANSLATE[get_msd(xml)] if do_msd_translate else get_msd(xml) self.id = xml.get('id') self.text = xml.text self.links = defaultdict(list) @@ -720,9 +720,9 @@ class Word: assert(None not in (self.id, self.lemma, self.msd)) @staticmethod - def pcWord(pc): + def pcWord(pc, do_msd_translate): pc.set('lemma', pc.text) - return Word(pc) + return Word(pc, do_msd_translate) def add_link(self, link, to): self.links[link].append(to) @@ -742,12 +742,13 @@ def is_root_id(id_): def load_files(args): filenames = args.input skip_id_check = args.skip_id_check + do_msd_translate = not args.no_msd_translate for fname in filenames: - yield load_tei_file(fname, skip_id_check) + yield load_tei_file(fname, skip_id_check, do_msd_translate) -def load_tei_file(filename, skip_id_check): +def load_tei_file(filename, skip_id_check, do_msd_translate): logging.info("LOADING FILE: {}".format(filename)) with open(filename, 'r') as fp: @@ -757,9 +758,9 @@ def load_tei_file(filename, skip_id_check): words = {} for w in et.iter("w"): - words[w.get('id')] = Word(w) + words[w.get('id')] = Word(w, do_msd_translate) for pc in et.iter("pc"): - words[pc.get('id')] = Word.pcWord(pc) + words[pc.get('id')] = Word.pcWord(pc, do_msd_translate) for l in et.iter("link"): if 'dep' in l.keys(): @@ -963,6 +964,7 @@ if __name__ == '__main__': parser.add_argument('input', help='input xml file in `ssj500k form`, can list more than 
one', nargs='+') parser.add_argument('--output', help='Output file (if none given, then output to stdout)') + parser.add_argument('--no-msd-translate', help='MSDs are translated from slovene to english by default', action='store_true') parser.add_argument('--skip-id-check', help='Skips checks for ids of <w> and <pc>, if they are in correct format', action='store_true') parser.add_argument('--lemma-only', help='Will not write word ids, forms and msds in output', action='store_true') parser.add_argument('--without-rep', help='Will not write representtaions in output', action='store_true')