Make MSD translation optional (add --no-msd-translate flag)

This commit is contained in:
Ozbolt Menegatti 2019-02-12 11:58:04 +01:00
parent f89212f7c9
commit 40db51adf1

18
wani.py
View File

@ -710,9 +710,9 @@ def get_msd(comp):
raise NotImplementedError("MSD?")
class Word:
def __init__(self, xml):
def __init__(self, xml, do_msd_translate):
self.lemma = xml.get('lemma')
self.msd = MSD_TRANSLATE[get_msd(xml)]
self.msd = MSD_TRANSLATE[get_msd(xml)] if do_msd_translate else get_msd(xml)
self.id = xml.get('id')
self.text = xml.text
self.links = defaultdict(list)
@ -720,9 +720,9 @@ class Word:
assert(None not in (self.id, self.lemma, self.msd))
@staticmethod
def pcWord(pc):
def pcWord(pc, do_msd_translate):
pc.set('lemma', pc.text)
return Word(pc)
return Word(pc, do_msd_translate)
def add_link(self, link, to):
self.links[link].append(to)
@ -742,12 +742,13 @@ def is_root_id(id_):
def load_files(args):
filenames = args.input
skip_id_check = args.skip_id_check
do_msd_translate = not args.no_msd_translate
for fname in filenames:
yield load_tei_file(fname, skip_id_check)
yield load_tei_file(fname, skip_id_check, do_msd_translate)
def load_tei_file(filename, skip_id_check):
def load_tei_file(filename, skip_id_check, do_msd_translate):
logging.info("LOADING FILE: {}".format(filename))
with open(filename, 'r') as fp:
@ -757,9 +758,9 @@ def load_tei_file(filename, skip_id_check):
words = {}
for w in et.iter("w"):
words[w.get('id')] = Word(w)
words[w.get('id')] = Word(w, do_msd_translate)
for pc in et.iter("pc"):
words[pc.get('id')] = Word.pcWord(pc)
words[pc.get('id')] = Word.pcWord(pc, do_msd_translate)
for l in et.iter("link"):
if 'dep' in l.keys():
@ -963,6 +964,7 @@ if __name__ == '__main__':
parser.add_argument('input', help='input xml file in `ssj500k form`, can list more than one', nargs='+')
parser.add_argument('--output', help='Output file (if none given, then output to stdout)')
parser.add_argument('--no-msd-translate', help='MSDs are translated from slovene to english by default', action='store_true')
parser.add_argument('--skip-id-check', help='Skips checks for ids of <w> and <pc>, if they are in correct format', action='store_true')
parser.add_argument('--lemma-only', help='Will not write word ids, forms and msds in output', action='store_true')
parser.add_argument('--without-rep', help='Will not write representtaions in output', action='store_true')