MSD translation is now optional
This commit is contained in:
parent
f89212f7c9
commit
40db51adf1
18
wani.py
18
wani.py
|
@ -710,9 +710,9 @@ def get_msd(comp):
|
||||||
raise NotImplementedError("MSD?")
|
raise NotImplementedError("MSD?")
|
||||||
|
|
||||||
class Word:
|
class Word:
|
||||||
def __init__(self, xml):
|
def __init__(self, xml, do_msd_translate):
|
||||||
self.lemma = xml.get('lemma')
|
self.lemma = xml.get('lemma')
|
||||||
self.msd = MSD_TRANSLATE[get_msd(xml)]
|
self.msd = MSD_TRANSLATE[get_msd(xml)] if do_msd_translate else get_msd(xml)
|
||||||
self.id = xml.get('id')
|
self.id = xml.get('id')
|
||||||
self.text = xml.text
|
self.text = xml.text
|
||||||
self.links = defaultdict(list)
|
self.links = defaultdict(list)
|
||||||
|
@ -720,9 +720,9 @@ class Word:
|
||||||
assert(None not in (self.id, self.lemma, self.msd))
|
assert(None not in (self.id, self.lemma, self.msd))
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def pcWord(pc):
|
def pcWord(pc, do_msd_translate):
|
||||||
pc.set('lemma', pc.text)
|
pc.set('lemma', pc.text)
|
||||||
return Word(pc)
|
return Word(pc, do_msd_translate)
|
||||||
|
|
||||||
def add_link(self, link, to):
|
def add_link(self, link, to):
|
||||||
self.links[link].append(to)
|
self.links[link].append(to)
|
||||||
|
@ -742,12 +742,13 @@ def is_root_id(id_):
|
||||||
def load_files(args):
|
def load_files(args):
|
||||||
filenames = args.input
|
filenames = args.input
|
||||||
skip_id_check = args.skip_id_check
|
skip_id_check = args.skip_id_check
|
||||||
|
do_msd_translate = not args.no_msd_translate
|
||||||
|
|
||||||
for fname in filenames:
|
for fname in filenames:
|
||||||
yield load_tei_file(fname, skip_id_check)
|
yield load_tei_file(fname, skip_id_check, do_msd_translate)
|
||||||
|
|
||||||
|
|
||||||
def load_tei_file(filename, skip_id_check):
|
def load_tei_file(filename, skip_id_check, do_msd_translate):
|
||||||
logging.info("LOADING FILE: {}".format(filename))
|
logging.info("LOADING FILE: {}".format(filename))
|
||||||
|
|
||||||
with open(filename, 'r') as fp:
|
with open(filename, 'r') as fp:
|
||||||
|
@ -757,9 +758,9 @@ def load_tei_file(filename, skip_id_check):
|
||||||
|
|
||||||
words = {}
|
words = {}
|
||||||
for w in et.iter("w"):
|
for w in et.iter("w"):
|
||||||
words[w.get('id')] = Word(w)
|
words[w.get('id')] = Word(w, do_msd_translate)
|
||||||
for pc in et.iter("pc"):
|
for pc in et.iter("pc"):
|
||||||
words[pc.get('id')] = Word.pcWord(pc)
|
words[pc.get('id')] = Word.pcWord(pc, do_msd_translate)
|
||||||
|
|
||||||
for l in et.iter("link"):
|
for l in et.iter("link"):
|
||||||
if 'dep' in l.keys():
|
if 'dep' in l.keys():
|
||||||
|
@ -963,6 +964,7 @@ if __name__ == '__main__':
|
||||||
parser.add_argument('input', help='input xml file in `ssj500k form`, can list more than one', nargs='+')
|
parser.add_argument('input', help='input xml file in `ssj500k form`, can list more than one', nargs='+')
|
||||||
parser.add_argument('--output', help='Output file (if none given, then output to stdout)')
|
parser.add_argument('--output', help='Output file (if none given, then output to stdout)')
|
||||||
|
|
||||||
|
parser.add_argument('--no-msd-translate', help='MSDs are translated from slovene to english by default', action='store_true')
|
||||||
parser.add_argument('--skip-id-check', help='Skips checks for ids of <w> and <pc>, if they are in correct format', action='store_true')
|
parser.add_argument('--skip-id-check', help='Skips checks for ids of <w> and <pc>, if they are in correct format', action='store_true')
|
||||||
parser.add_argument('--lemma-only', help='Will not write word ids, forms and msds in output', action='store_true')
|
parser.add_argument('--lemma-only', help='Will not write word ids, forms and msds in output', action='store_true')
|
||||||
parser.add_argument('--without-rep', help='Will not write representtaions in output', action='store_true')
|
parser.add_argument('--without-rep', help='Will not write representtaions in output', action='store_true')
|
||||||
|
|
Loading…
Reference in New Issue
Block a user